support for Geforce FX5500 based on patch by Pascal Yu <yu_pascal at hotmail.com>
[mplayer/greg.git] / postproc / swscale_template.c
blob6a8e576ffb12dbe99a8d4678a4dd71877b4f6d1f
1 /*
2 Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
/* CPU-capability dispatch for the scaler asm below.  This template is
   compiled several times with different HAVE_* flags (RENAME() variants),
   so every helper macro is #undef'd before being redefined for the
   instruction set of the current instantiation. */
19 #undef REAL_MOVNTQ
20 #undef MOVNTQ
21 #undef PAVGB
22 #undef PREFETCH
23 #undef PREFETCHW
24 #undef EMMS
25 #undef SFENCE
27 #ifdef HAVE_3DNOW
28 /* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
29 #define EMMS "femms"
30 #else
31 #define EMMS "emms"
32 #endif
/* Prefetch hints: 3DNow! has its own opcodes, MMX2 uses the SSE prefetch
   forms; otherwise emit "/nop" so the asm string stays well-formed. */
34 #ifdef HAVE_3DNOW
35 #define PREFETCH "prefetch"
36 #define PREFETCHW "prefetchw"
37 #elif defined ( HAVE_MMX2 )
38 #define PREFETCH "prefetchnta"
39 #define PREFETCHW "prefetcht0"
40 #else
41 #define PREFETCH "/nop"
42 #define PREFETCHW "/nop"
43 #endif
/* sfence exists only with MMX2; needed to order the movntq non-temporal
   stores used below. */
45 #ifdef HAVE_MMX2
46 #define SFENCE "sfence"
47 #else
48 #define SFENCE "/nop"
49 #endif
/* Packed byte average: pavgb on MMX2, pavgusb on 3DNow!; deliberately left
   undefined on plain MMX. */
51 #ifdef HAVE_MMX2
52 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
53 #elif defined (HAVE_3DNOW)
54 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
55 #endif
/* MOVNTQ: non-temporal (cache-bypassing) store with MMX2, plain movq
   otherwise.  The REAL_/wrapper pair forces expansion of macro arguments
   before stringization. */
57 #ifdef HAVE_MMX2
58 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
59 #else
60 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
61 #endif
62 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
64 #ifdef HAVE_ALTIVEC
65 #include "swscale_altivec_template.c"
66 #endif
/* YSCALEYUV2YV12X(x, offset): multi-tap vertical scale of one plane.
   Walks the {src pointer, coefficient} filter list at offset(%0), does a
   pmulhw multiply-accumulate into mm3/mm4 (seeded with the rounder),
   shifts down by 3, packs to bytes and stores 8 pixels per outer
   iteration via MOVNTQ.  %0 = filter context, %1 = dst, %2 = width. */
68 #define YSCALEYUV2YV12X(x, offset) \
69 "xor %%"REG_a", %%"REG_a" \n\t"\
70 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
71 "movq %%mm3, %%mm4 \n\t"\
72 "lea " offset "(%0), %%"REG_d" \n\t"\
73 "mov (%%"REG_d"), %%"REG_S" \n\t"\
74 ".balign 16 \n\t" /* FIXME Unroll? */\
75 "1: \n\t"\
76 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
77 "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\
78 "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm5\n\t" /* srcData */\
79 "add $16, %%"REG_d" \n\t"\
80 "mov (%%"REG_d"), %%"REG_S" \n\t"\
81 "test %%"REG_S", %%"REG_S" \n\t"\
82 "pmulhw %%mm0, %%mm2 \n\t"\
83 "pmulhw %%mm0, %%mm5 \n\t"\
84 "paddw %%mm2, %%mm3 \n\t"\
85 "paddw %%mm5, %%mm4 \n\t"\
86 " jnz 1b \n\t"\
87 "psraw $3, %%mm3 \n\t"\
88 "psraw $3, %%mm4 \n\t"\
89 "packuswb %%mm4, %%mm3 \n\t"\
90 MOVNTQ(%%mm3, (%1, %%REGa))\
91 "add $8, %%"REG_a" \n\t"\
92 "cmp %2, %%"REG_a" \n\t"\
93 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
94 "movq %%mm3, %%mm4 \n\t"\
95 "lea " offset "(%0), %%"REG_d" \n\t"\
96 "mov (%%"REG_d"), %%"REG_S" \n\t"\
97 "jb 1b \n\t"
/* YSCALEYUV2YV121: unfiltered 1:1 vertical pass — just shift the 16-bit
   intermediate down by 7, pack and store.  %0 = src end, %1 = dst end,
   %2 = negative count (loop runs until the index wraps to 0, hence jnc). */
99 #define YSCALEYUV2YV121 \
100 "mov %2, %%"REG_a" \n\t"\
101 ".balign 16 \n\t" /* FIXME Unroll? */\
102 "1: \n\t"\
103 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
104 "movq 8(%0, %%"REG_a", 2), %%mm1\n\t"\
105 "psraw $7, %%mm0 \n\t"\
106 "psraw $7, %%mm1 \n\t"\
107 "packuswb %%mm1, %%mm0 \n\t"\
108 MOVNTQ(%%mm0, (%1, %%REGa))\
109 "add $8, %%"REG_a" \n\t"\
110 "jnc 1b \n\t"
/* NOTE(review): the operand/clobber lines below are a stray fragment — the
   asm statement they belonged to is not visible in this capture of the file
   (lines appear to have been dropped).  Verify against the original
   swscale_template.c before building. */
113 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
114 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
115 "r" (dest), "m" (dstW),
116 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
117 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
/* YSCALEYUV2PACKEDX: multi-tap vertical scale of chroma (loop "2:" first
   pass, U in mm3 / V in mm4, V plane read at +4096 bytes) and luma (second
   "2:" loop, Y1 in mm1 / Y2 in mm7), leaving unshifted 16-bit values for a
   following colorspace-conversion / write macro.  %0 = filter context,
   REG_a = output pixel index. */
119 #define YSCALEYUV2PACKEDX \
120 "xor %%"REG_a", %%"REG_a" \n\t"\
121 ".balign 16 \n\t"\
122 "nop \n\t"\
123 "1: \n\t"\
124 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
125 "mov (%%"REG_d"), %%"REG_S" \n\t"\
126 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
127 "movq %%mm3, %%mm4 \n\t"\
128 ".balign 16 \n\t"\
129 "2: \n\t"\
130 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
131 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
132 "movq 4096(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
133 "add $16, %%"REG_d" \n\t"\
134 "mov (%%"REG_d"), %%"REG_S" \n\t"\
135 "pmulhw %%mm0, %%mm2 \n\t"\
136 "pmulhw %%mm0, %%mm5 \n\t"\
137 "paddw %%mm2, %%mm3 \n\t"\
138 "paddw %%mm5, %%mm4 \n\t"\
139 "test %%"REG_S", %%"REG_S" \n\t"\
140 " jnz 2b \n\t"\
142 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
143 "mov (%%"REG_d"), %%"REG_S" \n\t"\
144 "movq "VROUNDER_OFFSET"(%0), %%mm1\n\t"\
145 "movq %%mm1, %%mm7 \n\t"\
146 ".balign 16 \n\t"\
147 "2: \n\t"\
148 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
149 "movq (%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y1srcData */\
150 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\
151 "add $16, %%"REG_d" \n\t"\
152 "mov (%%"REG_d"), %%"REG_S" \n\t"\
153 "pmulhw %%mm0, %%mm2 \n\t"\
154 "pmulhw %%mm0, %%mm5 \n\t"\
155 "paddw %%mm2, %%mm1 \n\t"\
156 "paddw %%mm5, %%mm7 \n\t"\
157 "test %%"REG_S", %%"REG_S" \n\t"\
158 " jnz 2b \n\t"\
/* YSCALEYUV2RGBX: YSCALEYUV2PACKEDX followed by the fixed-point YUV->RGB
   matrix multiply.  Subtracts the U/V/Y offsets, applies the per-context
   coefficients with pmulhw, interleaves the two 4-pixel halves and packs to
   unsigned bytes: mm2=B, mm4=G, mm5=R (8 pixels), mm7 zeroed for the
   WRITE* macros. */
161 #define YSCALEYUV2RGBX \
162 YSCALEYUV2PACKEDX\
163 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
164 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
165 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
166 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
167 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
168 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
169 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
170 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
171 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
172 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
173 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
174 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
175 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
176 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
177 "paddw %%mm3, %%mm4 \n\t"\
178 "movq %%mm2, %%mm0 \n\t"\
179 "movq %%mm5, %%mm6 \n\t"\
180 "movq %%mm4, %%mm3 \n\t"\
181 "punpcklwd %%mm2, %%mm2 \n\t"\
182 "punpcklwd %%mm5, %%mm5 \n\t"\
183 "punpcklwd %%mm4, %%mm4 \n\t"\
184 "paddw %%mm1, %%mm2 \n\t"\
185 "paddw %%mm1, %%mm5 \n\t"\
186 "paddw %%mm1, %%mm4 \n\t"\
187 "punpckhwd %%mm0, %%mm0 \n\t"\
188 "punpckhwd %%mm6, %%mm6 \n\t"\
189 "punpckhwd %%mm3, %%mm3 \n\t"\
190 "paddw %%mm7, %%mm0 \n\t"\
191 "paddw %%mm7, %%mm6 \n\t"\
192 "paddw %%mm7, %%mm3 \n\t"\
193 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
194 "packuswb %%mm0, %%mm2 \n\t"\
195 "packuswb %%mm6, %%mm5 \n\t"\
196 "packuswb %%mm3, %%mm4 \n\t"\
197 "pxor %%mm7, %%mm7 \n\t"
/* Disabled (#if 0) legacy full-chroma two-line-blend YUV->RGB kernel using
   global MANGLE()d constants instead of the per-context tables; kept for
   reference only — not compiled. */
198 #if 0
199 #define FULL_YSCALEYUV2RGB \
200 "pxor %%mm7, %%mm7 \n\t"\
201 "movd %6, %%mm6 \n\t" /*yalpha1*/\
202 "punpcklwd %%mm6, %%mm6 \n\t"\
203 "punpcklwd %%mm6, %%mm6 \n\t"\
204 "movd %7, %%mm5 \n\t" /*uvalpha1*/\
205 "punpcklwd %%mm5, %%mm5 \n\t"\
206 "punpcklwd %%mm5, %%mm5 \n\t"\
207 "xor %%"REG_a", %%"REG_a" \n\t"\
208 ".balign 16 \n\t"\
209 "1: \n\t"\
210 "movq (%0, %%"REG_a", 2), %%mm0 \n\t" /*buf0[eax]*/\
211 "movq (%1, %%"REG_a", 2), %%mm1 \n\t" /*buf1[eax]*/\
212 "movq (%2, %%"REG_a",2), %%mm2 \n\t" /* uvbuf0[eax]*/\
213 "movq (%3, %%"REG_a",2), %%mm3 \n\t" /* uvbuf1[eax]*/\
214 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
215 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
216 "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
217 "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
218 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
219 "movq 4096(%2, %%"REG_a",2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
220 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
221 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
222 "movq 4096(%3, %%"REG_a",2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
223 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
224 "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
225 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
226 "psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\
227 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
230 "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
231 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
232 "pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
233 "psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
234 "pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
235 "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
236 "psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\
239 "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
240 "pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
241 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
242 "paddw %%mm1, %%mm3 \n\t" /* B*/\
243 "paddw %%mm1, %%mm0 \n\t" /* R*/\
244 "packuswb %%mm3, %%mm3 \n\t"\
246 "packuswb %%mm0, %%mm0 \n\t"\
247 "paddw %%mm4, %%mm2 \n\t"\
248 "paddw %%mm2, %%mm1 \n\t" /* G*/\
250 "packuswb %%mm1, %%mm1 \n\t"
251 #endif
/* REAL_YSCALEYUV2PACKED(index, c): two-line vertical blend producing packed
   YUV intermediates.  Pre-shifts the stored luma/chroma filter coefficients
   right by 3 (done once, before the loop), then per 8 pixels blends
   uvbuf0/uvbuf1 into mm3 (U) / mm4 (V) and buf0/buf1 into mm1/mm7 (Y).
   c = filter context, index = loop register. */
253 #define REAL_YSCALEYUV2PACKED(index, c) \
254 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
255 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1\n\t"\
256 "psraw $3, %%mm0 \n\t"\
257 "psraw $3, %%mm1 \n\t"\
258 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c")\n\t"\
259 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c")\n\t"\
260 "xor "#index", "#index" \n\t"\
261 ".balign 16 \n\t"\
262 "1: \n\t"\
263 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
264 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
265 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
266 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
267 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
268 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
269 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
270 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
271 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
272 "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
273 "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
274 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
275 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
276 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
277 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
278 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
279 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
280 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
281 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
282 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
283 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
284 "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
285 "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
286 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
287 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
289 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
/* REAL_YSCALEYUV2RGB(index, c): two-line vertical blend plus fixed-point
   YUV->RGB conversion.  Blends uvbuf0/uvbuf1 (U in mm3, V in mm4) and
   buf0/buf1 (Y1 in mm1, Y2 in mm7), then applies the per-context offset
   and coefficient tables, ending with mm2=B, mm4=G, mm5=R packed bytes and
   mm7 zeroed for the WRITE* macros. */
291 #define REAL_YSCALEYUV2RGB(index, c) \
292 "xor "#index", "#index" \n\t"\
293 ".balign 16 \n\t"\
294 "1: \n\t"\
295 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
296 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
297 "movq 4096(%2, "#index"), %%mm5\n\t" /* uvbuf0[eax+2048]*/\
298 "movq 4096(%3, "#index"), %%mm4\n\t" /* uvbuf1[eax+2048]*/\
299 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
300 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
301 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
302 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
303 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
304 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
305 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
306 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
307 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
308 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
309 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
310 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
311 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
312 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
313 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
314 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
315 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
316 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
317 "movq 8(%0, "#index", 2), %%mm6\n\t" /*buf0[eax]*/\
318 "movq 8(%1, "#index", 2), %%mm7\n\t" /*buf1[eax]*/\
319 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
320 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
321 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
322 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
323 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
324 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
325 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
326 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
327 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
328 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
329 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
330 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
331 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
332 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
333 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
334 "paddw %%mm3, %%mm4 \n\t"\
335 "movq %%mm2, %%mm0 \n\t"\
336 "movq %%mm5, %%mm6 \n\t"\
337 "movq %%mm4, %%mm3 \n\t"\
338 "punpcklwd %%mm2, %%mm2 \n\t"\
339 "punpcklwd %%mm5, %%mm5 \n\t"\
340 "punpcklwd %%mm4, %%mm4 \n\t"\
341 "paddw %%mm1, %%mm2 \n\t"\
342 "paddw %%mm1, %%mm5 \n\t"\
343 "paddw %%mm1, %%mm4 \n\t"\
344 "punpckhwd %%mm0, %%mm0 \n\t"\
345 "punpckhwd %%mm6, %%mm6 \n\t"\
346 "punpckhwd %%mm3, %%mm3 \n\t"\
347 "paddw %%mm7, %%mm0 \n\t"\
348 "paddw %%mm7, %%mm6 \n\t"\
349 "paddw %%mm7, %%mm3 \n\t"\
350 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
351 "packuswb %%mm0, %%mm2 \n\t"\
352 "packuswb %%mm6, %%mm5 \n\t"\
353 "packuswb %%mm3, %%mm4 \n\t"\
354 "pxor %%mm7, %%mm7 \n\t"
355 #define YSCALEYUV2RGB(index, c) REAL_YSCALEYUV2RGB(index, c)
/* REAL_YSCALEYUV2PACKED1(index, c): single-source-line variant — no
   vertical blend, just >>7 of buf0/uvbuf0 into mm1/mm7 (Y) and mm3/mm4
   (U/V).  NOTE(review): the final line's trailing backslash runs into the
   wrapper #define below; a separating blank line appears to have been
   dropped from this capture — verify against the original file. */
357 #define REAL_YSCALEYUV2PACKED1(index, c) \
358 "xor "#index", "#index" \n\t"\
359 ".balign 16 \n\t"\
360 "1: \n\t"\
361 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
362 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
363 "psraw $7, %%mm3 \n\t" \
364 "psraw $7, %%mm4 \n\t" \
365 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
366 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
367 "psraw $7, %%mm1 \n\t" \
368 "psraw $7, %%mm7 \n\t" \
370 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
/* REAL_YSCALEYUV2RGB1(index, c): single-source-line YUV->RGB — same matrix
   math as REAL_YSCALEYUV2RGB but reads only buf0/uvbuf0 (>>4, no blend).
   Ends with mm2=B, mm4=G, mm5=R packed bytes, mm7 zeroed. */
372 #define REAL_YSCALEYUV2RGB1(index, c) \
373 "xor "#index", "#index" \n\t"\
374 ".balign 16 \n\t"\
375 "1: \n\t"\
376 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
377 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
378 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
379 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
380 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
381 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
382 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
383 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
384 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
385 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
386 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
387 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
388 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
389 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
390 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
391 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
392 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
393 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
394 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
395 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
396 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
397 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
398 "paddw %%mm3, %%mm4 \n\t"\
399 "movq %%mm2, %%mm0 \n\t"\
400 "movq %%mm5, %%mm6 \n\t"\
401 "movq %%mm4, %%mm3 \n\t"\
402 "punpcklwd %%mm2, %%mm2 \n\t"\
403 "punpcklwd %%mm5, %%mm5 \n\t"\
404 "punpcklwd %%mm4, %%mm4 \n\t"\
405 "paddw %%mm1, %%mm2 \n\t"\
406 "paddw %%mm1, %%mm5 \n\t"\
407 "paddw %%mm1, %%mm4 \n\t"\
408 "punpckhwd %%mm0, %%mm0 \n\t"\
409 "punpckhwd %%mm6, %%mm6 \n\t"\
410 "punpckhwd %%mm3, %%mm3 \n\t"\
411 "paddw %%mm7, %%mm0 \n\t"\
412 "paddw %%mm7, %%mm6 \n\t"\
413 "paddw %%mm7, %%mm3 \n\t"\
414 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
415 "packuswb %%mm0, %%mm2 \n\t"\
416 "packuswb %%mm6, %%mm5 \n\t"\
417 "packuswb %%mm3, %%mm4 \n\t"\
418 "pxor %%mm7, %%mm7 \n\t"
419 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
/* REAL_YSCALEYUV2PACKED1b(index, c): like PACKED1 but averages the two
   chroma source lines (uvbuf0+uvbuf1 then >>8) instead of taking one. */
421 #define REAL_YSCALEYUV2PACKED1b(index, c) \
422 "xor "#index", "#index" \n\t"\
423 ".balign 16 \n\t"\
424 "1: \n\t"\
425 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
426 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
427 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
428 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
429 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
430 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
431 "psrlw $8, %%mm3 \n\t" \
432 "psrlw $8, %%mm4 \n\t" \
433 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
434 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
435 "psraw $7, %%mm1 \n\t" \
436 "psraw $7, %%mm7 \n\t"
437 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
439 // do vertical chrominance interpolation
/* REAL_YSCALEYUV2RGB1b(index, c): like RGB1 but averages the two chroma
   lines ((uvbuf0+uvbuf1)>>5 — see the FIXME about possible overflow)
   before the colorspace conversion.  Output layout identical to RGB1. */
440 #define REAL_YSCALEYUV2RGB1b(index, c) \
441 "xor "#index", "#index" \n\t"\
442 ".balign 16 \n\t"\
443 "1: \n\t"\
444 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
445 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
446 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
447 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
448 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
449 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
450 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
451 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
452 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
453 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
454 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
455 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
456 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
457 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
458 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
459 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
460 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
461 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
462 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
463 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
464 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
465 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
466 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
467 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
468 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
469 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
470 "paddw %%mm3, %%mm4 \n\t"\
471 "movq %%mm2, %%mm0 \n\t"\
472 "movq %%mm5, %%mm6 \n\t"\
473 "movq %%mm4, %%mm3 \n\t"\
474 "punpcklwd %%mm2, %%mm2 \n\t"\
475 "punpcklwd %%mm5, %%mm5 \n\t"\
476 "punpcklwd %%mm4, %%mm4 \n\t"\
477 "paddw %%mm1, %%mm2 \n\t"\
478 "paddw %%mm1, %%mm5 \n\t"\
479 "paddw %%mm1, %%mm4 \n\t"\
480 "punpckhwd %%mm0, %%mm0 \n\t"\
481 "punpckhwd %%mm6, %%mm6 \n\t"\
482 "punpckhwd %%mm3, %%mm3 \n\t"\
483 "paddw %%mm7, %%mm0 \n\t"\
484 "paddw %%mm7, %%mm6 \n\t"\
485 "paddw %%mm7, %%mm3 \n\t"\
486 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
487 "packuswb %%mm0, %%mm2 \n\t"\
488 "packuswb %%mm6, %%mm5 \n\t"\
489 "packuswb %%mm3, %%mm4 \n\t"\
490 "pxor %%mm7, %%mm7 \n\t"
491 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
/* REAL_WRITEBGR32: interleave mm2=B, mm4=G, mm5=R (mm7=0) into four
   0RGB dwords and store 8 pixels (32 bytes) per iteration, looping while
   index < dstw. */
493 #define REAL_WRITEBGR32(dst, dstw, index) \
494 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
495 "movq %%mm2, %%mm1 \n\t" /* B */\
496 "movq %%mm5, %%mm6 \n\t" /* R */\
497 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
498 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
499 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
500 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
501 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
502 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
503 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
504 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
505 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
506 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
508 MOVNTQ(%%mm0, (dst, index, 4))\
509 MOVNTQ(%%mm2, 8(dst, index, 4))\
510 MOVNTQ(%%mm1, 16(dst, index, 4))\
511 MOVNTQ(%%mm3, 24(dst, index, 4))\
513 "add $8, "#index" \n\t"\
514 "cmp "#dstw", "#index" \n\t"\
515 " jb 1b \n\t"
516 #define WRITEBGR32(dst, dstw, index) REAL_WRITEBGR32(dst, dstw, index)
/* REAL_WRITEBGR16: pack mm2=B, mm4=G, mm5=R into RGB565 (5-6-5 masks via
   bF8/bFC) and store 8 pixels (16 bytes) per iteration. */
518 #define REAL_WRITEBGR16(dst, dstw, index) \
519 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
520 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
521 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
522 "psrlq $3, %%mm2 \n\t"\
524 "movq %%mm2, %%mm1 \n\t"\
525 "movq %%mm4, %%mm3 \n\t"\
527 "punpcklbw %%mm7, %%mm3 \n\t"\
528 "punpcklbw %%mm5, %%mm2 \n\t"\
529 "punpckhbw %%mm7, %%mm4 \n\t"\
530 "punpckhbw %%mm5, %%mm1 \n\t"\
532 "psllq $3, %%mm3 \n\t"\
533 "psllq $3, %%mm4 \n\t"\
535 "por %%mm3, %%mm2 \n\t"\
536 "por %%mm4, %%mm1 \n\t"\
538 MOVNTQ(%%mm2, (dst, index, 2))\
539 MOVNTQ(%%mm1, 8(dst, index, 2))\
541 "add $8, "#index" \n\t"\
542 "cmp "#dstw", "#index" \n\t"\
543 " jb 1b \n\t"
544 #define WRITEBGR16(dst, dstw, index) REAL_WRITEBGR16(dst, dstw, index)
/* REAL_WRITEBGR15: pack mm2=B, mm4=G, mm5=R into RGB555 (5-5-5, all
   channels masked with bF8) and store 8 pixels (16 bytes) per iteration. */
546 #define REAL_WRITEBGR15(dst, dstw, index) \
547 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
548 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
549 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
550 "psrlq $3, %%mm2 \n\t"\
551 "psrlq $1, %%mm5 \n\t"\
553 "movq %%mm2, %%mm1 \n\t"\
554 "movq %%mm4, %%mm3 \n\t"\
556 "punpcklbw %%mm7, %%mm3 \n\t"\
557 "punpcklbw %%mm5, %%mm2 \n\t"\
558 "punpckhbw %%mm7, %%mm4 \n\t"\
559 "punpckhbw %%mm5, %%mm1 \n\t"\
561 "psllq $2, %%mm3 \n\t"\
562 "psllq $2, %%mm4 \n\t"\
564 "por %%mm3, %%mm2 \n\t"\
565 "por %%mm4, %%mm1 \n\t"\
567 MOVNTQ(%%mm2, (dst, index, 2))\
568 MOVNTQ(%%mm1, 8(dst, index, 2))\
570 "add $8, "#index" \n\t"\
571 "cmp "#dstw", "#index" \n\t"\
572 " jb 1b \n\t"
573 #define WRITEBGR15(dst, dstw, index) REAL_WRITEBGR15(dst, dstw, index)
/* WRITEBGR24OLD: obsolete shift/mask 24-bit packer — builds three 8-byte
   groups of packed RGB triplets from mm2=B, mm4=G, mm5=R via the
   bm* bit-mask constants.  Superseded by WRITEBGR24MMX/MMX2 below. */
575 #define WRITEBGR24OLD(dst, dstw, index) \
576 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
577 "movq %%mm2, %%mm1 \n\t" /* B */\
578 "movq %%mm5, %%mm6 \n\t" /* R */\
579 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
580 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
581 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
582 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
583 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
584 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
585 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
586 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
587 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
588 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
590 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
591 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
592 "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
593 "pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
594 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
595 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
596 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
597 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
599 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
600 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
601 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
602 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
603 "pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
604 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
605 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
606 "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
607 "pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
608 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
609 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
610 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
611 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
613 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
614 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
615 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
616 "pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
617 "pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
618 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
619 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
620 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
622 MOVNTQ(%%mm0, (dst))\
623 MOVNTQ(%%mm2, 8(dst))\
624 MOVNTQ(%%mm3, 16(dst))\
625 "add $24, "#dst" \n\t"\
627 "add $8, "#index" \n\t"\
628 "cmp "#dstw", "#index" \n\t"\
629 " jb 1b \n\t"
/* WRITEBGR24MMX: plain-MMX 24-bit packer — expands mm2=B, mm4=G, mm5=R to
   four 0RGB quads, then shifts/ORs them into three contiguous 8-byte
   stores (24 bytes = 8 pixels); dst is advanced by 24 each iteration. */
631 #define WRITEBGR24MMX(dst, dstw, index) \
632 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
633 "movq %%mm2, %%mm1 \n\t" /* B */\
634 "movq %%mm5, %%mm6 \n\t" /* R */\
635 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
636 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
637 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
638 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
639 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
640 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
641 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
642 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
643 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
644 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
646 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
647 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
648 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
649 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
651 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
652 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
653 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
654 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
656 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
657 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
658 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
659 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
661 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
662 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
663 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
664 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
665 MOVNTQ(%%mm0, (dst))\
667 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
668 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
669 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
670 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
671 MOVNTQ(%%mm6, 8(dst))\
673 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
674 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
675 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
676 MOVNTQ(%%mm5, 16(dst))\
678 "add $24, "#dst" \n\t"\
680 "add $8, "#index" \n\t"\
681 "cmp "#dstw", "#index" \n\t"\
682 " jb 1b \n\t"
/* WRITEBGR24MMX2: MMX2 24-bit packer — uses pshufw plus the M24A/M24B/M24C
   byte-select masks to assemble the three 8-byte RGB groups directly,
   avoiding the long shift/OR chains of the plain-MMX version. */
684 #define WRITEBGR24MMX2(dst, dstw, index) \
685 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
686 "movq "MANGLE(M24A)", %%mm0 \n\t"\
687 "movq "MANGLE(M24C)", %%mm7 \n\t"\
688 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
689 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
690 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
692 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
693 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
694 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
696 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
697 "por %%mm1, %%mm6 \n\t"\
698 "por %%mm3, %%mm6 \n\t"\
699 MOVNTQ(%%mm6, (dst))\
701 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
702 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
703 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
704 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
706 "pand "MANGLE(M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
707 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
708 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
710 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
711 "por %%mm3, %%mm6 \n\t"\
712 MOVNTQ(%%mm6, 8(dst))\
714 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
715 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
716 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
718 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
719 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
720 "pand "MANGLE(M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
722 "por %%mm1, %%mm3 \n\t"\
723 "por %%mm3, %%mm6 \n\t"\
724 MOVNTQ(%%mm6, 16(dst))\
726 "add $24, "#dst" \n\t"\
728 "add $8, "#index" \n\t"\
729 "cmp "#dstw", "#index" \n\t"\
730 " jb 1b \n\t"
/* Select the 24-bit writer for this template instantiation: the pshufw
   variant when MMX2 is available, otherwise the plain-MMX one. */
732 #ifdef HAVE_MMX2
733 #undef WRITEBGR24
734 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
735 #else
736 #undef WRITEBGR24
737 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
738 #endif
/* REAL_WRITEYUY2: pack mm1/mm7 (Y) with mm3 (U) and mm4 (V) into
   interleaved Y U Y V byte order and store 8 pixels (16 bytes) per
   iteration. */
740 #define REAL_WRITEYUY2(dst, dstw, index) \
741 "packuswb %%mm3, %%mm3 \n\t"\
742 "packuswb %%mm4, %%mm4 \n\t"\
743 "packuswb %%mm7, %%mm1 \n\t"\
744 "punpcklbw %%mm4, %%mm3 \n\t"\
745 "movq %%mm1, %%mm7 \n\t"\
746 "punpcklbw %%mm3, %%mm1 \n\t"\
747 "punpckhbw %%mm3, %%mm7 \n\t"\
749 MOVNTQ(%%mm1, (dst, index, 2))\
750 MOVNTQ(%%mm7, 8(dst, index, 2))\
752 "add $8, "#index" \n\t"\
753 "cmp "#dstw", "#index" \n\t"\
754 " jb 1b \n\t"
755 #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
/* yuv2yuvX: vertically scale/filter one output line of planar YV12.
   With MMX, runs YSCALEYUV2YV12X over the U plane (chroma offset 0), the
   V plane (offset 4096) and the Y plane; otherwise falls back to the
   AltiVec or plain-C implementation.
   NOTE(review): several lines of this function (opening/closing braces and
   the ");" terminators of the asm statements — blob lines 761, 764, 770-771,
   777-779, 785, 797) are missing from this capture of the file; the code as
   shown will not compile.  Restore from the original swscale_template.c. */
758 static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
759 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
760 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
762 #ifdef HAVE_MMX
763 if(uDest != NULL)
765 asm volatile(
766 YSCALEYUV2YV12X(0, CHR_MMX_FILTER_OFFSET)
767 :: "r" (&c->redDither),
768 "r" (uDest), "p" (chrDstW)
769 : "%"REG_a, "%"REG_d, "%"REG_S
772 asm volatile(
773 YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET)
774 :: "r" (&c->redDither),
775 "r" (vDest), "p" (chrDstW)
776 : "%"REG_a, "%"REG_d, "%"REG_S
780 asm volatile(
781 YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET)
782 :: "r" (&c->redDither),
783 "r" (dest), "p" (dstW)
784 : "%"REG_a, "%"REG_d, "%"REG_S
786 #else
787 #ifdef HAVE_ALTIVEC
788 yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
789 chrFilter, chrSrc, chrFilterSize,
790 dest, uDest, vDest, dstW, chrDstW);
791 #else //HAVE_ALTIVEC
792 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
793 chrFilter, chrSrc, chrFilterSize,
794 dest, uDest, vDest, dstW, chrDstW);
795 #endif //!HAVE_ALTIVEC
796 #endif
/* yuv2nv12X: NV12/NV21 output — no SIMD path here, simply forwards to the
   C implementation.  NOTE(review): the brace lines appear to have been
   dropped from this capture; verify against the original file. */
799 static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
800 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
801 uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
803 yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
804 chrFilter, chrSrc, chrFilterSize,
805 dest, uDest, dstW, chrDstW, dstFormat);
/* yuv2yuv1: unfiltered 1:1 vertical pass for planar output.  With MMX,
   runs YSCALEYUV2YV121 over U (chrSrc), V (chrSrc+2048) and Y; the C
   fallback shifts the 16-bit intermediates right by 7 and clips to 0..255.
   NOTE(review): brace lines and asm ");" terminators (blob lines 810, 813,
   819-820, 826-828, 834, 838, 840, 844-848, 851, 854, 860-861, 864,
   866-867) are missing from this capture; the code as shown will not
   compile.  Restore from the original swscale_template.c. */
808 static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
809 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
811 #ifdef HAVE_MMX
812 if(uDest != NULL)
814 asm volatile(
815 YSCALEYUV2YV121
816 :: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
817 "g" (-chrDstW)
818 : "%"REG_a
821 asm volatile(
822 YSCALEYUV2YV121
823 :: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW),
824 "g" (-chrDstW)
825 : "%"REG_a
829 asm volatile(
830 YSCALEYUV2YV121
831 :: "r" (lumSrc + dstW), "r" (dest + dstW),
832 "g" (-dstW)
833 : "%"REG_a
835 #else
836 int i;
837 for(i=0; i<dstW; i++)
839 int val= lumSrc[i]>>7;
841 if(val&256){
842 if(val<0) val=0;
843 else val=255;
846 dest[i]= val;
849 if(uDest != NULL)
850 for(i=0; i<chrDstW; i++)
852 int u=chrSrc[i]>>7;
853 int v=chrSrc[i + 2048]>>7;
855 if((u|v)&256){
856 if(u<0) u=0;
857 else if (u>255) u=255;
858 if(v<0) v=0;
859 else if (v>255) v=255;
862 uDest[i]= u;
863 vDest[i]= v;
865 #endif
870 * vertical scale YV12 to RGB
/* Multi-tap vertical filter + packed-pixel output in one pass. The MMX cases
 * run the YSCALEYUV2RGBX / YSCALEYUV2PACKEDX kernel and then a WRITE* store
 * macro per destination format; anything not handled falls through to the
 * AltiVec or plain-C implementation. */
872 static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
873 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
874 uint8_t *dest, int dstW, int dstY)
/* the "m"(dummy) operands only pad the asm operand numbering so that dest and
 * dstW land at %4/%5 as the WRITE* macros expect */
876 int dummy=0;
877 switch(c->dstFormat)
879 #ifdef HAVE_MMX
880 case IMGFMT_BGR32:
882 asm volatile(
883 YSCALEYUV2RGBX
884 WRITEBGR32(%4, %5, %%REGa)
886 :: "r" (&c->redDither),
887 "m" (dummy), "m" (dummy), "m" (dummy),
888 "r" (dest), "m" (dstW)
889 : "%"REG_a, "%"REG_d, "%"REG_S
892 break;
893 case IMGFMT_BGR24:
895 asm volatile(
896 YSCALEYUV2RGBX
/* REG_b = dest + 3*pixel_index (24bpp stride) */
897 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t" //FIXME optimize
898 "add %4, %%"REG_b" \n\t"
899 WRITEBGR24(%%REGb, %5, %%REGa)
901 :: "r" (&c->redDither),
902 "m" (dummy), "m" (dummy), "m" (dummy),
903 "r" (dest), "m" (dstW)
904 : "%"REG_a, "%"REG_b, "%"REG_d, "%"REG_S //FIXME ebx
907 break;
908 case IMGFMT_BGR15:
910 asm volatile(
911 YSCALEYUV2RGBX
912 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
/* optional ordered dither before truncating to 5-bit channels */
913 #ifdef DITHER1XBPP
914 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
915 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
916 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
917 #endif
919 WRITEBGR15(%4, %5, %%REGa)
921 :: "r" (&c->redDither),
922 "m" (dummy), "m" (dummy), "m" (dummy),
923 "r" (dest), "m" (dstW)
924 : "%"REG_a, "%"REG_d, "%"REG_S
927 break;
928 case IMGFMT_BGR16:
930 asm volatile(
931 YSCALEYUV2RGBX
932 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
/* BGR16 uses the 6-bit green dither table (g6Dither) unlike BGR15 */
933 #ifdef DITHER1XBPP
934 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
935 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
936 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
937 #endif
939 WRITEBGR16(%4, %5, %%REGa)
941 :: "r" (&c->redDither),
942 "m" (dummy), "m" (dummy), "m" (dummy),
943 "r" (dest), "m" (dstW)
944 : "%"REG_a, "%"REG_d, "%"REG_S
947 break;
948 case IMGFMT_YUY2:
950 asm volatile(
951 YSCALEYUV2PACKEDX
952 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
/* scale the filtered samples down before packing into YUY2 */
954 "psraw $3, %%mm3 \n\t"
955 "psraw $3, %%mm4 \n\t"
956 "psraw $3, %%mm1 \n\t"
957 "psraw $3, %%mm7 \n\t"
958 WRITEYUY2(%4, %5, %%REGa)
960 :: "r" (&c->redDither),
961 "m" (dummy), "m" (dummy), "m" (dummy),
962 "r" (dest), "m" (dstW)
963 : "%"REG_a, "%"REG_d, "%"REG_S
966 break;
967 #endif
/* non-MMX or unhandled formats: generic implementations */
968 default:
969 #ifdef HAVE_ALTIVEC
970 altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
971 chrFilter, chrSrc, chrFilterSize,
972 dest, dstW, dstY);
973 #else
974 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
975 chrFilter, chrSrc, chrFilterSize,
976 dest, dstW, dstY);
977 #endif
978 break;
983 * vertical bilinear scale YV12 to RGB
/* Two-tap (bilinear) vertical interpolation between buf0/buf1 (luma) and
 * uvbuf0/uvbuf1 (chroma), converted straight to a packed destination format.
 * yalpha/uvalpha are 12-bit blend weights; x^4095 yields the complementary
 * weight. The large #if 0 region below is a dead FULL_CHR_H_INT variant kept
 * for reference. */
985 static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
986 uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
988 int yalpha1=yalpha^4095;
989 int uvalpha1=uvalpha^4095;
990 int i;
992 #if 0 //isn't used
993 if(flags&SWS_FULL_CHR_H_INT)
995 switch(dstFormat)
997 #ifdef HAVE_MMX
998 case IMGFMT_BGR32:
999 asm volatile(
1002 FULL_YSCALEYUV2RGB
1003 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1004 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1006 "movq %%mm3, %%mm1 \n\t"
1007 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1008 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1010 MOVNTQ(%%mm3, (%4, %%REGa, 4))
1011 MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
1013 "add $4, %%"REG_a" \n\t"
1014 "cmp %5, %%"REG_a" \n\t"
1015 " jb 1b \n\t"
1018 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
1019 "m" (yalpha1), "m" (uvalpha1)
1020 : "%"REG_a
1022 break;
1023 case IMGFMT_BGR24:
1024 asm volatile(
1026 FULL_YSCALEYUV2RGB
1028 // lsb ... msb
1029 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1030 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1032 "movq %%mm3, %%mm1 \n\t"
1033 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1034 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1036 "movq %%mm3, %%mm2 \n\t" // BGR0BGR0
1037 "psrlq $8, %%mm3 \n\t" // GR0BGR00
1038 "pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
1039 "pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
1040 "por %%mm2, %%mm3 \n\t" // BGRBGR00
1041 "movq %%mm1, %%mm2 \n\t"
1042 "psllq $48, %%mm1 \n\t" // 000000BG
1043 "por %%mm1, %%mm3 \n\t" // BGRBGRBG
1045 "movq %%mm2, %%mm1 \n\t" // BGR0BGR0
1046 "psrld $16, %%mm2 \n\t" // R000R000
1047 "psrlq $24, %%mm1 \n\t" // 0BGR0000
1048 "por %%mm2, %%mm1 \n\t" // RBGRR000
1050 "mov %4, %%"REG_b" \n\t"
1051 "add %%"REG_a", %%"REG_b" \n\t"
1053 #ifdef HAVE_MMX2
1054 //FIXME Alignment
1055 "movntq %%mm3, (%%"REG_b", %%"REG_a", 2)\n\t"
1056 "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2)\n\t"
1057 #else
1058 "movd %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t"
1059 "psrlq $32, %%mm3 \n\t"
1060 "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2) \n\t"
1061 "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
1062 #endif
1063 "add $4, %%"REG_a" \n\t"
1064 "cmp %5, %%"REG_a" \n\t"
1065 " jb 1b \n\t"
1067 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1068 "m" (yalpha1), "m" (uvalpha1)
1069 : "%"REG_a, "%"REG_b
1071 break;
1072 case IMGFMT_BGR15:
1073 asm volatile(
1075 FULL_YSCALEYUV2RGB
1076 #ifdef DITHER1XBPP
1077 "paddusb "MANGLE(g5Dither)", %%mm1\n\t"
1078 "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1079 "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
1080 #endif
1081 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1082 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1083 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1085 "psrlw $3, %%mm3 \n\t"
1086 "psllw $2, %%mm1 \n\t"
1087 "psllw $7, %%mm0 \n\t"
1088 "pand "MANGLE(g15Mask)", %%mm1 \n\t"
1089 "pand "MANGLE(r15Mask)", %%mm0 \n\t"
1091 "por %%mm3, %%mm1 \n\t"
1092 "por %%mm1, %%mm0 \n\t"
1094 MOVNTQ(%%mm0, (%4, %%REGa, 2))
1096 "add $4, %%"REG_a" \n\t"
1097 "cmp %5, %%"REG_a" \n\t"
1098 " jb 1b \n\t"
1100 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1101 "m" (yalpha1), "m" (uvalpha1)
1102 : "%"REG_a
1104 break;
1105 case IMGFMT_BGR16:
1106 asm volatile(
1108 FULL_YSCALEYUV2RGB
1109 #ifdef DITHER1XBPP
1110 "paddusb "MANGLE(g6Dither)", %%mm1\n\t"
1111 "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1112 "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
1113 #endif
1114 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1115 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1116 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1118 "psrlw $3, %%mm3 \n\t"
1119 "psllw $3, %%mm1 \n\t"
1120 "psllw $8, %%mm0 \n\t"
1121 "pand "MANGLE(g16Mask)", %%mm1 \n\t"
1122 "pand "MANGLE(r16Mask)", %%mm0 \n\t"
1124 "por %%mm3, %%mm1 \n\t"
1125 "por %%mm1, %%mm0 \n\t"
1127 MOVNTQ(%%mm0, (%4, %%REGa, 2))
1129 "add $4, %%"REG_a" \n\t"
1130 "cmp %5, %%"REG_a" \n\t"
1131 " jb 1b \n\t"
1133 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1134 "m" (yalpha1), "m" (uvalpha1)
1135 : "%"REG_a
1137 break;
1138 #endif
1139 case IMGFMT_RGB32:
1140 #ifndef HAVE_MMX
1141 case IMGFMT_BGR32:
1142 #endif
1143 if(dstFormat==IMGFMT_BGR32)
1145 int i;
1146 #ifdef WORDS_BIGENDIAN
1147 dest++;
1148 #endif
1149 for(i=0;i<dstW;i++){
1150 // vertical linear interpolation && yuv2rgb in a single step:
1151 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1152 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1153 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1154 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1155 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1156 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1157 dest+= 4;
1160 else if(dstFormat==IMGFMT_BGR24)
1162 int i;
1163 for(i=0;i<dstW;i++){
1164 // vertical linear interpolation && yuv2rgb in a single step:
1165 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1166 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1167 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1168 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1169 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1170 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1171 dest+= 3;
1174 else if(dstFormat==IMGFMT_BGR16)
1176 int i;
1177 for(i=0;i<dstW;i++){
1178 // vertical linear interpolation && yuv2rgb in a single step:
1179 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1180 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1181 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1183 ((uint16_t*)dest)[i] =
1184 clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1185 clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1186 clip_table16r[(Y + yuvtab_3343[V]) >>13];
1189 else if(dstFormat==IMGFMT_BGR15)
1191 int i;
1192 for(i=0;i<dstW;i++){
1193 // vertical linear interpolation && yuv2rgb in a single step:
1194 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1195 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1196 int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1198 ((uint16_t*)dest)[i] =
1199 clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1200 clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1201 clip_table15r[(Y + yuvtab_3343[V]) >>13];
1204 }//FULL_UV_IPOL
1205 else
1207 #endif // if 0
/* live code starts here: MMX fast paths. Each case saves the stack pointer
 * into the context (ESP_OFFSET), repoints REG_SP at the parameter block so
 * the kernel macros can address everything off one register, and restores it
 * before exiting — hence the explicit returns. */
1208 #ifdef HAVE_MMX
1209 switch(c->dstFormat)
1211 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1212 case IMGFMT_BGR32:
1213 asm volatile(
1214 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1215 "mov %4, %%"REG_SP" \n\t"
1216 YSCALEYUV2RGB(%%REGa, %5)
1217 WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
1218 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1220 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1221 "r" (&c->redDither)
1222 : "%"REG_a
1224 return;
1225 case IMGFMT_BGR24:
1226 asm volatile(
1227 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1228 "mov %4, %%"REG_SP" \n\t"
1229 YSCALEYUV2RGB(%%REGa, %5)
1230 WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
1231 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1232 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1233 "r" (&c->redDither)
1234 : "%"REG_a
1236 return;
1237 case IMGFMT_BGR15:
1238 asm volatile(
1239 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1240 "mov %4, %%"REG_SP" \n\t"
1241 YSCALEYUV2RGB(%%REGa, %5)
1242 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1243 #ifdef DITHER1XBPP
1244 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1245 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1246 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1247 #endif
1249 WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
1250 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1252 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1253 "r" (&c->redDither)
1254 : "%"REG_a
1256 return;
1257 case IMGFMT_BGR16:
1258 asm volatile(
1259 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1260 "mov %4, %%"REG_SP" \n\t"
1261 YSCALEYUV2RGB(%%REGa, %5)
1262 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1263 #ifdef DITHER1XBPP
1264 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1265 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1266 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1267 #endif
1269 WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
1270 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1271 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1272 "r" (&c->redDither)
1273 : "%"REG_a
1275 return;
1276 case IMGFMT_YUY2:
1277 asm volatile(
1278 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1279 "mov %4, %%"REG_SP" \n\t"
1280 YSCALEYUV2PACKED(%%REGa, %5)
1281 WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
1282 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1283 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1284 "r" (&c->redDither)
1285 : "%"REG_a
1287 return;
1288 default: break;
1290 #endif //HAVE_MMX
/* generic C fallback for all remaining formats */
1291 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
1295 * YV12 to RGB without scaling or interpolating
/* Packed output with no vertical scaling/interpolation of luma (single source
 * line). For chroma, uvalpha<2048 selects the nearest chroma line (cheaper
 * but shifts chrominance by half a pixel); otherwise the two chroma lines are
 * averaged (the *1b kernel variants). SWS_FULL_CHR_H_INT falls back to
 * yuv2packed2 with a zero luma blend. */
1297 static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
1298 uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
1300 const int yalpha1=0;
1301 int i;
1303 uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
1304 const int yalpha= 4096; //FIXME ...
1306 if(flags&SWS_FULL_CHR_H_INT)
1308 RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
1309 return;
/* same stack-pointer-swap trick as yuv2packed2: REG_SP is saved in the
 * context, repointed at the parameter block, and restored before returning */
1312 #ifdef HAVE_MMX
1313 if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but its a bit faster
1315 switch(dstFormat)
1317 case IMGFMT_BGR32:
1318 asm volatile(
1319 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1320 "mov %4, %%"REG_SP" \n\t"
1321 YSCALEYUV2RGB1(%%REGa, %5)
1322 WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
1323 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1325 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1326 "r" (&c->redDither)
1327 : "%"REG_a
1329 return;
1330 case IMGFMT_BGR24:
1331 asm volatile(
1332 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1333 "mov %4, %%"REG_SP" \n\t"
1334 YSCALEYUV2RGB1(%%REGa, %5)
1335 WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
1336 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1338 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1339 "r" (&c->redDither)
1340 : "%"REG_a
1342 return;
1343 case IMGFMT_BGR15:
1344 asm volatile(
1345 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1346 "mov %4, %%"REG_SP" \n\t"
1347 YSCALEYUV2RGB1(%%REGa, %5)
1348 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1349 #ifdef DITHER1XBPP
1350 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1351 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1352 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1353 #endif
1354 WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
1355 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1357 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1358 "r" (&c->redDither)
1359 : "%"REG_a
1361 return;
1362 case IMGFMT_BGR16:
1363 asm volatile(
1364 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1365 "mov %4, %%"REG_SP" \n\t"
1366 YSCALEYUV2RGB1(%%REGa, %5)
1367 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1368 #ifdef DITHER1XBPP
1369 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1370 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1371 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1372 #endif
1374 WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
1375 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1377 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1378 "r" (&c->redDither)
1379 : "%"REG_a
1381 return;
1382 case IMGFMT_YUY2:
1383 asm volatile(
1384 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1385 "mov %4, %%"REG_SP" \n\t"
1386 YSCALEYUV2PACKED1(%%REGa, %5)
1387 WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
1388 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1390 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1391 "r" (&c->redDither)
1392 : "%"REG_a
1394 return;
/* uvalpha >= 2048: average both chroma lines (the ...1b kernels) */
1397 else
1399 switch(dstFormat)
1401 case IMGFMT_BGR32:
1402 asm volatile(
1403 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1404 "mov %4, %%"REG_SP" \n\t"
1405 YSCALEYUV2RGB1b(%%REGa, %5)
1406 WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
1407 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1409 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1410 "r" (&c->redDither)
1411 : "%"REG_a
1413 return;
1414 case IMGFMT_BGR24:
1415 asm volatile(
1416 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1417 "mov %4, %%"REG_SP" \n\t"
1418 YSCALEYUV2RGB1b(%%REGa, %5)
1419 WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
1420 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1422 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1423 "r" (&c->redDither)
1424 : "%"REG_a
1426 return;
1427 case IMGFMT_BGR15:
1428 asm volatile(
1429 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1430 "mov %4, %%"REG_SP" \n\t"
1431 YSCALEYUV2RGB1b(%%REGa, %5)
1432 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1433 #ifdef DITHER1XBPP
1434 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1435 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1436 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1437 #endif
1438 WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
1439 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1441 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1442 "r" (&c->redDither)
1443 : "%"REG_a
1445 return;
1446 case IMGFMT_BGR16:
1447 asm volatile(
1448 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1449 "mov %4, %%"REG_SP" \n\t"
1450 YSCALEYUV2RGB1b(%%REGa, %5)
1451 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1452 #ifdef DITHER1XBPP
1453 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1454 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1455 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1456 #endif
1458 WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
1459 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1461 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1462 "r" (&c->redDither)
1463 : "%"REG_a
1465 return;
1466 case IMGFMT_YUY2:
1467 asm volatile(
1468 "mov %%"REG_SP", "ESP_OFFSET"(%5) \n\t"
1469 "mov %4, %%"REG_SP" \n\t"
1470 YSCALEYUV2PACKED1b(%%REGa, %5)
1471 WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
1472 "mov "ESP_OFFSET"(%5), %%"REG_SP" \n\t"
1474 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
1475 "r" (&c->redDither)
1476 : "%"REG_a
1478 return;
1481 #endif
/* C fallback mirrors the asm's nearest-vs-averaged chroma split */
1482 if( uvalpha < 2048 )
1484 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
1485 }else{
1486 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
1490 //FIXME yuy2* can read up to 7 samples too many
/* Extract the luma bytes (even positions) from a YUYV line into dst.
 * The MMX path masks out chroma with bm01010101 and packs 16 pixels per
 * iteration, walking both pointers with a negative index in REG_a. */
1492 static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width)
1494 #ifdef HAVE_MMX
1495 asm volatile(
1496 "movq "MANGLE(bm01010101)", %%mm2\n\t"
1497 "mov %0, %%"REG_a" \n\t"
1498 "1: \n\t"
1499 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1500 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1501 "pand %%mm2, %%mm0 \n\t"
1502 "pand %%mm2, %%mm1 \n\t"
1503 "packuswb %%mm1, %%mm0 \n\t"
1504 "movq %%mm0, (%2, %%"REG_a") \n\t"
1505 "add $8, %%"REG_a" \n\t"
1506 " js 1b \n\t"
1507 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1508 : "%"REG_a
1510 #else
/* scalar fallback: Y is every second byte starting at offset 0 */
1511 int i;
1512 for(i=0; i<width; i++)
1513 dst[i]= src[2*i];
1514 #endif
/* Extract and deinterleave U/V from two YUYV lines, averaging the lines
 * (PAVGB). The psrlw/pand pair afterwards splits the packed UVUV stream into
 * separate U and V nibbles of the register before the final packs. */
1517 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1519 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1520 asm volatile(
1521 "movq "MANGLE(bm01010101)", %%mm4\n\t"
1522 "mov %0, %%"REG_a" \n\t"
1523 "1: \n\t"
1524 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1525 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1526 "movq (%2, %%"REG_a",4), %%mm2 \n\t"
1527 "movq 8(%2, %%"REG_a",4), %%mm3 \n\t"
/* average the two source lines */
1528 PAVGB(%%mm2, %%mm0)
1529 PAVGB(%%mm3, %%mm1)
/* drop the luma bytes, keeping the chroma stream */
1530 "psrlw $8, %%mm0 \n\t"
1531 "psrlw $8, %%mm1 \n\t"
1532 "packuswb %%mm1, %%mm0 \n\t"
1533 "movq %%mm0, %%mm1 \n\t"
1534 "psrlw $8, %%mm0 \n\t"
1535 "pand %%mm4, %%mm1 \n\t"
1536 "packuswb %%mm0, %%mm0 \n\t"
1537 "packuswb %%mm1, %%mm1 \n\t"
1538 "movd %%mm0, (%4, %%"REG_a") \n\t"
1539 "movd %%mm1, (%3, %%"REG_a") \n\t"
1540 "add $4, %%"REG_a" \n\t"
1541 " js 1b \n\t"
1542 : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
1543 : "%"REG_a
1545 #else
/* scalar fallback: U at byte 1, V at byte 3 of each 4-byte YUYV group;
 * >>1 averages the two lines */
1546 int i;
1547 for(i=0; i<width; i++)
1549 dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1;
1550 dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1;
1552 #endif
1555 //this is almost identical to the previous one, and exists only because yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses
1556 static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width)
1558 #ifdef HAVE_MMX
1559 asm volatile(
1560 "mov %0, %%"REG_a" \n\t"
1561 "1: \n\t"
1562 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1563 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1564 "psrlw $8, %%mm0 \n\t"
1565 "psrlw $8, %%mm1 \n\t"
1566 "packuswb %%mm1, %%mm0 \n\t"
1567 "movq %%mm0, (%2, %%"REG_a") \n\t"
1568 "add $8, %%"REG_a" \n\t"
1569 " js 1b \n\t"
1570 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1571 : "%"REG_a
1573 #else
1574 int i;
1575 for(i=0; i<width; i++)
1576 dst[i]= src[2*i+1];
1577 #endif
/* Extract and deinterleave U/V from two UYVY lines, averaging the lines.
 * Differs from yuy2ToUV only in the first reduction: chroma sits in the even
 * bytes here, so pand bm01010101 (instead of psrlw) keeps it. */
1580 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1582 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1583 asm volatile(
1584 "movq "MANGLE(bm01010101)", %%mm4\n\t"
1585 "mov %0, %%"REG_a" \n\t"
1586 "1: \n\t"
1587 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1588 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1589 "movq (%2, %%"REG_a",4), %%mm2 \n\t"
1590 "movq 8(%2, %%"REG_a",4), %%mm3 \n\t"
/* average the two source lines */
1591 PAVGB(%%mm2, %%mm0)
1592 PAVGB(%%mm3, %%mm1)
/* keep the even (chroma) bytes */
1593 "pand %%mm4, %%mm0 \n\t"
1594 "pand %%mm4, %%mm1 \n\t"
1595 "packuswb %%mm1, %%mm0 \n\t"
1596 "movq %%mm0, %%mm1 \n\t"
1597 "psrlw $8, %%mm0 \n\t"
1598 "pand %%mm4, %%mm1 \n\t"
1599 "packuswb %%mm0, %%mm0 \n\t"
1600 "packuswb %%mm1, %%mm1 \n\t"
1601 "movd %%mm0, (%4, %%"REG_a") \n\t"
1602 "movd %%mm1, (%3, %%"REG_a") \n\t"
1603 "add $4, %%"REG_a" \n\t"
1604 " js 1b \n\t"
1605 : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
1606 : "%"REG_a
1608 #else
/* scalar fallback: U at byte 0, V at byte 2 of each 4-byte UYVY group */
1609 int i;
1610 for(i=0; i<width; i++)
1612 dstU[i]= (src1[4*i + 0] + src2[4*i + 0])>>1;
1613 dstV[i]= (src1[4*i + 2] + src2[4*i + 2])>>1;
1615 #endif
1618 static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
1620 int i;
1621 for(i=0; i<width; i++)
1623 int b= ((uint32_t*)src)[i]&0xFF;
1624 int g= (((uint32_t*)src)[i]>>8)&0xFF;
1625 int r= (((uint32_t*)src)[i]>>16)&0xFF;
1627 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
1631 static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1633 int i;
1634 for(i=0; i<width; i++)
1636 const int a= ((uint32_t*)src1)[2*i+0];
1637 const int e= ((uint32_t*)src1)[2*i+1];
1638 const int c= ((uint32_t*)src2)[2*i+0];
1639 const int d= ((uint32_t*)src2)[2*i+1];
1640 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
1641 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
1642 const int b= l&0x3FF;
1643 const int g= h>>8;
1644 const int r= l>>16;
1646 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1647 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
/* BGR24 -> 8-bit luma. The MMX path processes 8 pixels per iteration:
 * four movd loads pick up one 3-byte pixel each, pmaddwd with bgr2YCoeff
 * forms the weighted channel sums, and the pmaddwd-with-w1111 passes fold
 * the partial sums horizontally before packing. */
1651 static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width)
1653 #ifdef HAVE_MMX
1654 asm volatile(
1655 "mov %2, %%"REG_a" \n\t"
1656 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
1657 "movq "MANGLE(w1111)", %%mm5 \n\t"
1658 "pxor %%mm7, %%mm7 \n\t"
/* REG_b walks the 3-bytes-per-pixel source: index*3 */
1659 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
1660 ".balign 16 \n\t"
1661 "1: \n\t"
1662 PREFETCH" 64(%0, %%"REG_b") \n\t"
1663 "movd (%0, %%"REG_b"), %%mm0 \n\t"
1664 "movd 3(%0, %%"REG_b"), %%mm1 \n\t"
1665 "punpcklbw %%mm7, %%mm0 \n\t"
1666 "punpcklbw %%mm7, %%mm1 \n\t"
1667 "movd 6(%0, %%"REG_b"), %%mm2 \n\t"
1668 "movd 9(%0, %%"REG_b"), %%mm3 \n\t"
1669 "punpcklbw %%mm7, %%mm2 \n\t"
1670 "punpcklbw %%mm7, %%mm3 \n\t"
1671 "pmaddwd %%mm6, %%mm0 \n\t"
1672 "pmaddwd %%mm6, %%mm1 \n\t"
1673 "pmaddwd %%mm6, %%mm2 \n\t"
1674 "pmaddwd %%mm6, %%mm3 \n\t"
/* FAST_BGR2YV12 skips this intermediate precision-reduction step */
1675 #ifndef FAST_BGR2YV12
1676 "psrad $8, %%mm0 \n\t"
1677 "psrad $8, %%mm1 \n\t"
1678 "psrad $8, %%mm2 \n\t"
1679 "psrad $8, %%mm3 \n\t"
1680 #endif
1681 "packssdw %%mm1, %%mm0 \n\t"
1682 "packssdw %%mm3, %%mm2 \n\t"
1683 "pmaddwd %%mm5, %%mm0 \n\t"
1684 "pmaddwd %%mm5, %%mm2 \n\t"
1685 "packssdw %%mm2, %%mm0 \n\t"
1686 "psraw $7, %%mm0 \n\t"
/* second group of four pixels (bytes 12..23) */
1688 "movd 12(%0, %%"REG_b"), %%mm4 \n\t"
1689 "movd 15(%0, %%"REG_b"), %%mm1 \n\t"
1690 "punpcklbw %%mm7, %%mm4 \n\t"
1691 "punpcklbw %%mm7, %%mm1 \n\t"
1692 "movd 18(%0, %%"REG_b"), %%mm2 \n\t"
1693 "movd 21(%0, %%"REG_b"), %%mm3 \n\t"
1694 "punpcklbw %%mm7, %%mm2 \n\t"
1695 "punpcklbw %%mm7, %%mm3 \n\t"
1696 "pmaddwd %%mm6, %%mm4 \n\t"
1697 "pmaddwd %%mm6, %%mm1 \n\t"
1698 "pmaddwd %%mm6, %%mm2 \n\t"
1699 "pmaddwd %%mm6, %%mm3 \n\t"
1700 #ifndef FAST_BGR2YV12
1701 "psrad $8, %%mm4 \n\t"
1702 "psrad $8, %%mm1 \n\t"
1703 "psrad $8, %%mm2 \n\t"
1704 "psrad $8, %%mm3 \n\t"
1705 #endif
1706 "packssdw %%mm1, %%mm4 \n\t"
1707 "packssdw %%mm3, %%mm2 \n\t"
1708 "pmaddwd %%mm5, %%mm4 \n\t"
1709 "pmaddwd %%mm5, %%mm2 \n\t"
1710 "add $24, %%"REG_b" \n\t"
1711 "packssdw %%mm2, %%mm4 \n\t"
1712 "psraw $7, %%mm4 \n\t"
1714 "packuswb %%mm4, %%mm0 \n\t"
/* add the luma offset after packing to bytes */
1715 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
1717 "movq %%mm0, (%1, %%"REG_a") \n\t"
1718 "add $8, %%"REG_a" \n\t"
1719 " js 1b \n\t"
1720 : : "r" (src+width*3), "r" (dst+width), "g" (-width)
1721 : "%"REG_a, "%"REG_b
1723 #else
/* scalar fallback — same formula as the other *ToY converters */
1724 int i;
1725 for(i=0; i<width; i++)
1727 int b= src[i*3+0];
1728 int g= src[i*3+1];
1729 int r= src[i*3+2];
1731 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
1733 #endif
/* BGR24 -> chroma pair, averaging a 2x2 block (two adjacent pixels from each
 * of the two source rows). The MMX2/3DNow variant averages with PAVGB; the
 * plain-MMX variant widens to words and averages with padds/psrlw. U and V
 * coefficients (bgr2UCoeff/bgr2VCoeff) are applied via pmaddwd, then the
 * results are folded, packed and offset by bgr2UVOffset. */
1736 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1738 #ifdef HAVE_MMX
1739 asm volatile(
1740 "mov %4, %%"REG_a" \n\t"
1741 "movq "MANGLE(w1111)", %%mm5 \n\t"
1742 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
1743 "pxor %%mm7, %%mm7 \n\t"
/* REG_b = index*6: 3 bytes per pixel, two pixels consumed per output sample */
1744 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b" \n\t"
1745 "add %%"REG_b", %%"REG_b" \n\t"
1746 ".balign 16 \n\t"
1747 "1: \n\t"
1748 PREFETCH" 64(%0, %%"REG_b") \n\t"
1749 PREFETCH" 64(%1, %%"REG_b") \n\t"
1750 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1751 "movq (%0, %%"REG_b"), %%mm0 \n\t"
1752 "movq (%1, %%"REG_b"), %%mm1 \n\t"
1753 "movq 6(%0, %%"REG_b"), %%mm2 \n\t"
1754 "movq 6(%1, %%"REG_b"), %%mm3 \n\t"
/* vertical average, then average with the 3-bytes-shifted copy for the
 * horizontal neighbour */
1755 PAVGB(%%mm1, %%mm0)
1756 PAVGB(%%mm3, %%mm2)
1757 "movq %%mm0, %%mm1 \n\t"
1758 "movq %%mm2, %%mm3 \n\t"
1759 "psrlq $24, %%mm0 \n\t"
1760 "psrlq $24, %%mm2 \n\t"
1761 PAVGB(%%mm1, %%mm0)
1762 PAVGB(%%mm3, %%mm2)
1763 "punpcklbw %%mm7, %%mm0 \n\t"
1764 "punpcklbw %%mm7, %%mm2 \n\t"
1765 #else
1766 "movd (%0, %%"REG_b"), %%mm0 \n\t"
1767 "movd (%1, %%"REG_b"), %%mm1 \n\t"
1768 "movd 3(%0, %%"REG_b"), %%mm2 \n\t"
1769 "movd 3(%1, %%"REG_b"), %%mm3 \n\t"
1770 "punpcklbw %%mm7, %%mm0 \n\t"
1771 "punpcklbw %%mm7, %%mm1 \n\t"
1772 "punpcklbw %%mm7, %%mm2 \n\t"
1773 "punpcklbw %%mm7, %%mm3 \n\t"
1774 "paddw %%mm1, %%mm0 \n\t"
1775 "paddw %%mm3, %%mm2 \n\t"
1776 "paddw %%mm2, %%mm0 \n\t"
1777 "movd 6(%0, %%"REG_b"), %%mm4 \n\t"
1778 "movd 6(%1, %%"REG_b"), %%mm1 \n\t"
1779 "movd 9(%0, %%"REG_b"), %%mm2 \n\t"
1780 "movd 9(%1, %%"REG_b"), %%mm3 \n\t"
1781 "punpcklbw %%mm7, %%mm4 \n\t"
1782 "punpcklbw %%mm7, %%mm1 \n\t"
1783 "punpcklbw %%mm7, %%mm2 \n\t"
1784 "punpcklbw %%mm7, %%mm3 \n\t"
1785 "paddw %%mm1, %%mm4 \n\t"
1786 "paddw %%mm3, %%mm2 \n\t"
1787 "paddw %%mm4, %%mm2 \n\t"
/* /4 to finish the 2x2 average */
1788 "psrlw $2, %%mm0 \n\t"
1789 "psrlw $2, %%mm2 \n\t"
1790 #endif
1791 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
1792 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
/* mm1/mm3 = V sums, mm0/mm2 = U sums (bgr2UCoeff is still in mm6) */
1794 "pmaddwd %%mm0, %%mm1 \n\t"
1795 "pmaddwd %%mm2, %%mm3 \n\t"
1796 "pmaddwd %%mm6, %%mm0 \n\t"
1797 "pmaddwd %%mm6, %%mm2 \n\t"
1798 #ifndef FAST_BGR2YV12
1799 "psrad $8, %%mm0 \n\t"
1800 "psrad $8, %%mm1 \n\t"
1801 "psrad $8, %%mm2 \n\t"
1802 "psrad $8, %%mm3 \n\t"
1803 #endif
1804 "packssdw %%mm2, %%mm0 \n\t"
1805 "packssdw %%mm3, %%mm1 \n\t"
1806 "pmaddwd %%mm5, %%mm0 \n\t"
1807 "pmaddwd %%mm5, %%mm1 \n\t"
1808 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
1809 "psraw $7, %%mm0 \n\t"
/* same again for the next two output samples (source bytes 12..23) */
1811 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1812 "movq 12(%0, %%"REG_b"), %%mm4 \n\t"
1813 "movq 12(%1, %%"REG_b"), %%mm1 \n\t"
1814 "movq 18(%0, %%"REG_b"), %%mm2 \n\t"
1815 "movq 18(%1, %%"REG_b"), %%mm3 \n\t"
1816 PAVGB(%%mm1, %%mm4)
1817 PAVGB(%%mm3, %%mm2)
1818 "movq %%mm4, %%mm1 \n\t"
1819 "movq %%mm2, %%mm3 \n\t"
1820 "psrlq $24, %%mm4 \n\t"
1821 "psrlq $24, %%mm2 \n\t"
1822 PAVGB(%%mm1, %%mm4)
1823 PAVGB(%%mm3, %%mm2)
1824 "punpcklbw %%mm7, %%mm4 \n\t"
1825 "punpcklbw %%mm7, %%mm2 \n\t"
1826 #else
1827 "movd 12(%0, %%"REG_b"), %%mm4 \n\t"
1828 "movd 12(%1, %%"REG_b"), %%mm1 \n\t"
1829 "movd 15(%0, %%"REG_b"), %%mm2 \n\t"
1830 "movd 15(%1, %%"REG_b"), %%mm3 \n\t"
1831 "punpcklbw %%mm7, %%mm4 \n\t"
1832 "punpcklbw %%mm7, %%mm1 \n\t"
1833 "punpcklbw %%mm7, %%mm2 \n\t"
1834 "punpcklbw %%mm7, %%mm3 \n\t"
1835 "paddw %%mm1, %%mm4 \n\t"
1836 "paddw %%mm3, %%mm2 \n\t"
1837 "paddw %%mm2, %%mm4 \n\t"
1838 "movd 18(%0, %%"REG_b"), %%mm5 \n\t"
1839 "movd 18(%1, %%"REG_b"), %%mm1 \n\t"
1840 "movd 21(%0, %%"REG_b"), %%mm2 \n\t"
1841 "movd 21(%1, %%"REG_b"), %%mm3 \n\t"
1842 "punpcklbw %%mm7, %%mm5 \n\t"
1843 "punpcklbw %%mm7, %%mm1 \n\t"
1844 "punpcklbw %%mm7, %%mm2 \n\t"
1845 "punpcklbw %%mm7, %%mm3 \n\t"
1846 "paddw %%mm1, %%mm5 \n\t"
1847 "paddw %%mm3, %%mm2 \n\t"
1848 "paddw %%mm5, %%mm2 \n\t"
/* mm5 was borrowed above; reload the w1111 constant */
1849 "movq "MANGLE(w1111)", %%mm5 \n\t"
1850 "psrlw $2, %%mm4 \n\t"
1851 "psrlw $2, %%mm2 \n\t"
1852 #endif
1853 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
1854 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
1856 "pmaddwd %%mm4, %%mm1 \n\t"
1857 "pmaddwd %%mm2, %%mm3 \n\t"
1858 "pmaddwd %%mm6, %%mm4 \n\t"
1859 "pmaddwd %%mm6, %%mm2 \n\t"
1860 #ifndef FAST_BGR2YV12
1861 "psrad $8, %%mm4 \n\t"
1862 "psrad $8, %%mm1 \n\t"
1863 "psrad $8, %%mm2 \n\t"
1864 "psrad $8, %%mm3 \n\t"
1865 #endif
1866 "packssdw %%mm2, %%mm4 \n\t"
1867 "packssdw %%mm3, %%mm1 \n\t"
1868 "pmaddwd %%mm5, %%mm4 \n\t"
1869 "pmaddwd %%mm5, %%mm1 \n\t"
1870 "add $24, %%"REG_b" \n\t"
1871 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
1872 "psraw $7, %%mm4 \n\t"
/* interleave, pack, bias and split into the U and V planes */
1874 "movq %%mm0, %%mm1 \n\t"
1875 "punpckldq %%mm4, %%mm0 \n\t"
1876 "punpckhdq %%mm4, %%mm1 \n\t"
1877 "packsswb %%mm1, %%mm0 \n\t"
1878 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
1880 "movd %%mm0, (%2, %%"REG_a") \n\t"
1881 "punpckhdq %%mm0, %%mm0 \n\t"
1882 "movd %%mm0, (%3, %%"REG_a") \n\t"
1883 "add $4, %%"REG_a" \n\t"
1884 " js 1b \n\t"
1885 : : "r" (src1+width*6), "r" (src2+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width)
1886 : "%"REG_a, "%"REG_b
1888 #else
/* scalar fallback: sum the 2x2 block per channel, >>+2 divides by 4 */
1889 int i;
1890 for(i=0; i<width; i++)
1892 int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
1893 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
1894 int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
1896 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1897 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1899 #endif
1902 static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
1904 int i;
1905 for(i=0; i<width; i++)
1907 int d= ((uint16_t*)src)[i];
1908 int b= d&0x1F;
1909 int g= (d>>5)&0x3F;
1910 int r= (d>>11)&0x1F;
1912 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
1916 static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1918 int i;
1919 for(i=0; i<width; i++)
1921 int d0= ((uint32_t*)src1)[i];
1922 int d1= ((uint32_t*)src2)[i];
1924 int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F);
1925 int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F);
1927 int dh2= (dh>>11) + (dh<<21);
1928 int d= dh2 + dl;
1930 int b= d&0x7F;
1931 int r= (d>>11)&0x7F;
1932 int g= d>>21;
1933 dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
1934 dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
1938 static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
1940 int i;
1941 for(i=0; i<width; i++)
1943 int d= ((uint16_t*)src)[i];
1944 int b= d&0x1F;
1945 int g= (d>>5)&0x1F;
1946 int r= (d>>10)&0x1F;
1948 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
1952 static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1954 int i;
1955 for(i=0; i<width; i++)
1957 int d0= ((uint32_t*)src1)[i];
1958 int d1= ((uint32_t*)src2)[i];
1960 int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F);
1961 int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F);
1963 int dh2= (dh>>11) + (dh<<21);
1964 int d= dh2 + dl;
1966 int b= d&0x7F;
1967 int r= (d>>10)&0x7F;
1968 int g= d>>21;
1969 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
1970 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
1975 static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
1977 int i;
1978 for(i=0; i<width; i++)
1980 int r= ((uint32_t*)src)[i]&0xFF;
1981 int g= (((uint32_t*)src)[i]>>8)&0xFF;
1982 int b= (((uint32_t*)src)[i]>>16)&0xFF;
1984 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
1988 static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1990 int i;
1991 for(i=0; i<width; i++)
1993 const int a= ((uint32_t*)src1)[2*i+0];
1994 const int e= ((uint32_t*)src1)[2*i+1];
1995 const int c= ((uint32_t*)src2)[2*i+0];
1996 const int d= ((uint32_t*)src2)[2*i+1];
1997 const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
1998 const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
1999 const int r= l&0x3FF;
2000 const int g= h>>8;
2001 const int b= l>>16;
2003 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2004 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
2008 static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
2010 int i;
2011 for(i=0; i<width; i++)
2013 int r= src[i*3+0];
2014 int g= src[i*3+1];
2015 int b= src[i*3+2];
2017 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
2021 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2023 int i;
2024 for(i=0; i<width; i++)
2026 int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
2027 int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
2028 int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
2030 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2031 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
2036 // Bilinear / Bicubic scaling
/*
 * Generic horizontal scaler: for each output pixel i computes the FIR sum
 *   dst[i] = sum_j src[filterPos[i]+j] * filter[i*filterSize+j]
 * scaled down to a 15-bit intermediate and clipped to 0..(1<<15)-1.
 * Specialized MMX paths exist for filterSize 4 and 8 plus a generic MMX
 * loop; otherwise AltiVec or plain C is used.  xInc and srcW are unused
 * here but kept so all hScale variants share one interface.
 */
2037 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
2038 int16_t *filter, int16_t *filterPos, long filterSize)
2040 #ifdef HAVE_MMX
2041 assert(filterSize % 4 == 0 && filterSize>0);
/* ---- MMX path, 4-tap filter.  The loop counter runs from -2*dstW up to
 * 0 ("jnc 1b" loops while no carry), so filter/filterPos/dst are biased
 * by the negative counter up front.  Each iteration loads the positions
 * of two output pixels (eax/ebx), multiply-accumulates 4 taps per pixel
 * with pmaddwd, scales down with psrad, then sums pairs via pmaddwd with
 * the w02 constant and stores two int16 results at once. ---- */
2042 if(filterSize==4) // always true for upscaling, sometimes for downscaling too
2044 long counter= -2*dstW;
2045 filter-= counter*2;
2046 filterPos-= counter/2;
2047 dst-= counter/2;
2048 asm volatile(
2049 "pxor %%mm7, %%mm7 \n\t"
2050 "movq "MANGLE(w02)", %%mm6 \n\t"
2051 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2052 "mov %%"REG_a", %%"REG_BP" \n\t"
2053 ".balign 16 \n\t"
2054 "1: \n\t"
2055 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2056 "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
2057 "movq (%1, %%"REG_BP", 4), %%mm1\n\t"
2058 "movq 8(%1, %%"REG_BP", 4), %%mm3\n\t"
2059 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2060 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2061 "punpcklbw %%mm7, %%mm0 \n\t"
2062 "punpcklbw %%mm7, %%mm2 \n\t"
2063 "pmaddwd %%mm1, %%mm0 \n\t"
2064 "pmaddwd %%mm2, %%mm3 \n\t"
2065 "psrad $8, %%mm0 \n\t"
2066 "psrad $8, %%mm3 \n\t"
2067 "packssdw %%mm3, %%mm0 \n\t"
2068 "pmaddwd %%mm6, %%mm0 \n\t"
2069 "packssdw %%mm0, %%mm0 \n\t"
2070 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2071 "add $4, %%"REG_BP" \n\t"
2072 " jnc 1b \n\t"
2074 "pop %%"REG_BP" \n\t"
2075 : "+a" (counter)
2076 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2077 : "%"REG_b
/* ---- MMX path, 8-tap filter: same scheme as above, but two
 * pmaddwd/paddd rounds per output pixel cover taps 0-3 and 4-7. ---- */
2080 else if(filterSize==8)
2082 long counter= -2*dstW;
2083 filter-= counter*4;
2084 filterPos-= counter/2;
2085 dst-= counter/2;
2086 asm volatile(
2087 "pxor %%mm7, %%mm7 \n\t"
2088 "movq "MANGLE(w02)", %%mm6 \n\t"
2089 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2090 "mov %%"REG_a", %%"REG_BP" \n\t"
2091 ".balign 16 \n\t"
2092 "1: \n\t"
2093 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2094 "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
2095 "movq (%1, %%"REG_BP", 8), %%mm1\n\t"
2096 "movq 16(%1, %%"REG_BP", 8), %%mm3\n\t"
2097 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2098 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2099 "punpcklbw %%mm7, %%mm0 \n\t"
2100 "punpcklbw %%mm7, %%mm2 \n\t"
2101 "pmaddwd %%mm1, %%mm0 \n\t"
2102 "pmaddwd %%mm2, %%mm3 \n\t"
2104 "movq 8(%1, %%"REG_BP", 8), %%mm1\n\t"
2105 "movq 24(%1, %%"REG_BP", 8), %%mm5\n\t"
2106 "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
2107 "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
2108 "punpcklbw %%mm7, %%mm4 \n\t"
2109 "punpcklbw %%mm7, %%mm2 \n\t"
2110 "pmaddwd %%mm1, %%mm4 \n\t"
2111 "pmaddwd %%mm2, %%mm5 \n\t"
2112 "paddd %%mm4, %%mm0 \n\t"
2113 "paddd %%mm5, %%mm3 \n\t"
2115 "psrad $8, %%mm0 \n\t"
2116 "psrad $8, %%mm3 \n\t"
2117 "packssdw %%mm3, %%mm0 \n\t"
2118 "pmaddwd %%mm6, %%mm0 \n\t"
2119 "packssdw %%mm0, %%mm0 \n\t"
2120 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2121 "add $4, %%"REG_BP" \n\t"
2122 " jnc 1b \n\t"
2124 "pop %%"REG_BP" \n\t"
2125 : "+a" (counter)
2126 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2127 : "%"REG_b
/* ---- Generic MMX path for any filterSize that is a multiple of 4:
 * an inner loop (label 2) walks the taps 4 at a time for two output
 * pixels in parallel, accumulating into mm4/mm5; 'offset' marks the
 * inner-loop end address. ---- */
2130 else
2132 uint8_t *offset = src+filterSize;
2133 long counter= -2*dstW;
2134 // filter-= counter*filterSize/2;
2135 filterPos-= counter/2;
2136 dst-= counter/2;
2137 asm volatile(
2138 "pxor %%mm7, %%mm7 \n\t"
2139 "movq "MANGLE(w02)", %%mm6 \n\t"
2140 ".balign 16 \n\t"
2141 "1: \n\t"
2142 "mov %2, %%"REG_c" \n\t"
2143 "movzwl (%%"REG_c", %0), %%eax \n\t"
2144 "movzwl 2(%%"REG_c", %0), %%ebx \n\t"
2145 "mov %5, %%"REG_c" \n\t"
2146 "pxor %%mm4, %%mm4 \n\t"
2147 "pxor %%mm5, %%mm5 \n\t"
2148 "2: \n\t"
2149 "movq (%1), %%mm1 \n\t"
2150 "movq (%1, %6), %%mm3 \n\t"
2151 "movd (%%"REG_c", %%"REG_a"), %%mm0\n\t"
2152 "movd (%%"REG_c", %%"REG_b"), %%mm2\n\t"
2153 "punpcklbw %%mm7, %%mm0 \n\t"
2154 "punpcklbw %%mm7, %%mm2 \n\t"
2155 "pmaddwd %%mm1, %%mm0 \n\t"
2156 "pmaddwd %%mm2, %%mm3 \n\t"
2157 "paddd %%mm3, %%mm5 \n\t"
2158 "paddd %%mm0, %%mm4 \n\t"
2159 "add $8, %1 \n\t"
2160 "add $4, %%"REG_c" \n\t"
2161 "cmp %4, %%"REG_c" \n\t"
2162 " jb 2b \n\t"
2163 "add %6, %1 \n\t"
2164 "psrad $8, %%mm4 \n\t"
2165 "psrad $8, %%mm5 \n\t"
2166 "packssdw %%mm5, %%mm4 \n\t"
2167 "pmaddwd %%mm6, %%mm4 \n\t"
2168 "packssdw %%mm4, %%mm4 \n\t"
2169 "mov %3, %%"REG_a" \n\t"
2170 "movd %%mm4, (%%"REG_a", %0) \n\t"
2171 "add $4, %0 \n\t"
2172 " jnc 1b \n\t"
2174 : "+r" (counter), "+r" (filter)
2175 : "m" (filterPos), "m" (dst), "m"(offset),
2176 "m" (src), "r" (filterSize*2)
2177 : "%"REG_b, "%"REG_a, "%"REG_c
2180 #else
/* ---- AltiVec path ---- */
2181 #ifdef HAVE_ALTIVEC
2182 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
2183 #else
/* ---- Plain C reference implementation: same FIR sum, then >>7 and a
 * clip to the 15-bit range (the filter math can overflow it). ---- */
2184 int i;
2185 for(i=0; i<dstW; i++)
2187 int j;
2188 int srcPos= filterPos[i];
2189 int val=0;
2190 // printf("filterPos: %d\n", filterPos[i]);
2191 for(j=0; j<filterSize; j++)
2193 // printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2194 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2196 // filter += hFilterSize;
2197 dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
2198 // dst[i] = val>>7;
2200 #endif
2201 #endif
2203 // *** horizontal scale Y line to temp buffer
/*
 * Horizontally scale one luma line into the 16-bit temp buffer 'dst'.
 * Packed/RGB input formats are first converted to 8-bit luma into
 * formatConvBuffer.  Then either the generic FIR scaler (hScale) runs,
 * or -- for SWS_FAST_BILINEAR -- a fast bilinear path: runtime-generated
 * MMX2 code ("funnyYCode") when available, otherwise hand-written x86
 * fixed-point asm, otherwise plain C.
 */
2204 static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc,
2205 int flags, int canMMX2BeUsed, int16_t *hLumFilter,
2206 int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
2207 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2208 int32_t *mmx2FilterPos)
/* --- input format conversion to planar 8-bit luma --- */
2210 if(srcFormat==IMGFMT_YUY2)
2212 RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
2213 src= formatConvBuffer;
2215 else if(srcFormat==IMGFMT_UYVY)
2217 RENAME(uyvyToY)(formatConvBuffer, src, srcW);
2218 src= formatConvBuffer;
2220 else if(srcFormat==IMGFMT_BGR32)
2222 RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
2223 src= formatConvBuffer;
2225 else if(srcFormat==IMGFMT_BGR24)
2227 RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
2228 src= formatConvBuffer;
2230 else if(srcFormat==IMGFMT_BGR16)
2232 RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
2233 src= formatConvBuffer;
2235 else if(srcFormat==IMGFMT_BGR15)
2237 RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
2238 src= formatConvBuffer;
2240 else if(srcFormat==IMGFMT_RGB32)
2242 RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
2243 src= formatConvBuffer;
2245 else if(srcFormat==IMGFMT_RGB24)
2247 RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
2248 src= formatConvBuffer;
2251 #ifdef HAVE_MMX
2252 // use the new MMX scaler if MMX2 can't be used (it's faster than the x86 asm one)
2253 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2254 #else
2255 if(!(flags&SWS_FAST_BILINEAR))
2256 #endif
/* --- high-quality path: generic FIR horizontal scaler --- */
2258 RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2260 else // Fast Bilinear upscale / crap downscale
2262 #if defined(ARCH_X86) || defined(ARCH_X86_64)
2263 #ifdef HAVE_MMX2
/* --- MMX2 path: repeatedly call the runtime-generated scaler code in
 * funnyYCode; mmx2FilterPos supplies per-chunk source offsets.  The
 * 32-bit and 64-bit variants differ only in how the source pointer is
 * advanced after each call. --- */
2264 int i;
2265 if(canMMX2BeUsed)
2267 asm volatile(
2268 "pxor %%mm7, %%mm7 \n\t"
2269 "mov %0, %%"REG_c" \n\t"
2270 "mov %1, %%"REG_D" \n\t"
2271 "mov %2, %%"REG_d" \n\t"
2272 "mov %3, %%"REG_b" \n\t"
2273 "xor %%"REG_a", %%"REG_a" \n\t" // i
2274 PREFETCH" (%%"REG_c") \n\t"
2275 PREFETCH" 32(%%"REG_c") \n\t"
2276 PREFETCH" 64(%%"REG_c") \n\t"
2278 #ifdef ARCH_X86_64
2280 #define FUNNY_Y_CODE \
2281 "movl (%%"REG_b"), %%esi \n\t"\
2282 "call *%4 \n\t"\
2283 "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
2284 "add %%"REG_S", %%"REG_c" \n\t"\
2285 "add %%"REG_a", %%"REG_D" \n\t"\
2286 "xor %%"REG_a", %%"REG_a" \n\t"\
2288 #else
2290 #define FUNNY_Y_CODE \
2291 "movl (%%"REG_b"), %%esi \n\t"\
2292 "call *%4 \n\t"\
2293 "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
2294 "add %%"REG_a", %%"REG_D" \n\t"\
2295 "xor %%"REG_a", %%"REG_a" \n\t"\
2297 #endif
2299 FUNNY_Y_CODE
2300 FUNNY_Y_CODE
2301 FUNNY_Y_CODE
2302 FUNNY_Y_CODE
2303 FUNNY_Y_CODE
2304 FUNNY_Y_CODE
2305 FUNNY_Y_CODE
2306 FUNNY_Y_CODE
2308 :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2309 "m" (funnyYCode)
2310 : "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
/* fix the tail: outputs that would read past srcW-1 get the last pixel */
2312 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2314 else
2316 #endif
/* --- non-MMX2 x86 path: fixed-point bilinear, two output pixels per
 * loop iteration; 16.16 position is kept as xx (REG_b) plus the 2*xalpha
 * fraction in ecx, advanced with addw/adc. --- */
2317 int xInc_shr16 = xInc >> 16;
2318 int xInc_mask = xInc & 0xffff;
2319 //NO MMX just normal asm ...
2320 asm volatile(
2321 "xor %%"REG_a", %%"REG_a" \n\t" // i
2322 "xor %%"REG_b", %%"REG_b" \n\t" // xx
2323 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2324 ".balign 16 \n\t"
2325 "1: \n\t"
2326 "movzbl (%0, %%"REG_b"), %%edi \n\t" //src[xx]
2327 "movzbl 1(%0, %%"REG_b"), %%esi \n\t" //src[xx+1]
2328 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2329 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2330 "shll $16, %%edi \n\t"
2331 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2332 "mov %1, %%"REG_D" \n\t"
2333 "shrl $9, %%esi \n\t"
2334 "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
2335 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2336 "adc %3, %%"REG_b" \n\t" //xx+= xInc>>8 + carry
2338 "movzbl (%0, %%"REG_b"), %%edi \n\t" //src[xx]
2339 "movzbl 1(%0, %%"REG_b"), %%esi \n\t" //src[xx+1]
2340 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2341 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2342 "shll $16, %%edi \n\t"
2343 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2344 "mov %1, %%"REG_D" \n\t"
2345 "shrl $9, %%esi \n\t"
2346 "movw %%si, 2(%%"REG_D", %%"REG_a", 2)\n\t"
2347 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2348 "adc %3, %%"REG_b" \n\t" //xx+= xInc>>8 + carry
2351 "add $2, %%"REG_a" \n\t"
2352 "cmp %2, %%"REG_a" \n\t"
2353 " jb 1b \n\t"
2356 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
2357 : "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
2359 #ifdef HAVE_MMX2
2360 } //if MMX2 can't be used
2361 #endif
2362 #else
/* --- portable C fallback: 7-bit fixed-point bilinear interpolation --- */
2363 int i;
2364 unsigned int xpos=0;
2365 for(i=0;i<dstWidth;i++)
2367 register unsigned int xx=xpos>>16;
2368 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2369 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2370 xpos+=xInc;
2372 #endif
/*
 * Horizontally scale one pair of chroma lines (U into dst, V into
 * dst+2048) -- the chroma counterpart of hyscale.  Packed/RGB inputs are
 * first converted to planar 8-bit U/V in formatConvBuffer (U at offset 0,
 * V at offset 2048).  Gray formats have no chroma, so nothing is done.
 * Then either the generic FIR scaler (hScale) runs twice, or the fast
 * bilinear path: runtime-generated MMX2 code ("funnyUVCode"), hand-written
 * x86 asm, or plain C.
 */
2376 inline static void RENAME(hcscale)(uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
2377 int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2378 int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2379 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2380 int32_t *mmx2FilterPos)
/* --- input format conversion to planar 8-bit chroma --- */
2382 if(srcFormat==IMGFMT_YUY2)
2384 RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2385 src1= formatConvBuffer;
2386 src2= formatConvBuffer+2048;
2388 else if(srcFormat==IMGFMT_UYVY)
2390 RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2391 src1= formatConvBuffer;
2392 src2= formatConvBuffer+2048;
2394 else if(srcFormat==IMGFMT_BGR32)
2396 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2397 src1= formatConvBuffer;
2398 src2= formatConvBuffer+2048;
2400 else if(srcFormat==IMGFMT_BGR24)
2402 RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2403 src1= formatConvBuffer;
2404 src2= formatConvBuffer+2048;
2406 else if(srcFormat==IMGFMT_BGR16)
2408 RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2409 src1= formatConvBuffer;
2410 src2= formatConvBuffer+2048;
2412 else if(srcFormat==IMGFMT_BGR15)
2414 RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2415 src1= formatConvBuffer;
2416 src2= formatConvBuffer+2048;
2418 else if(srcFormat==IMGFMT_RGB32)
2420 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2421 src1= formatConvBuffer;
2422 src2= formatConvBuffer+2048;
2424 else if(srcFormat==IMGFMT_RGB24)
2426 RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2427 src1= formatConvBuffer;
2428 src2= formatConvBuffer+2048;
2430 else if(isGray(srcFormat))
2432 return;
2435 #ifdef HAVE_MMX
2436 // use the new MMX scaler if MMX2 can't be used (it's faster than the x86 asm one)
2437 if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2438 #else
2439 if(!(flags&SWS_FAST_BILINEAR))
2440 #endif
/* --- high-quality path: FIR scaler, once per chroma plane --- */
2442 RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2443 RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2445 else // Fast Bilinear upscale / crap downscale
2447 #if defined(ARCH_X86) || defined(ARCH_X86_64)
2448 #ifdef HAVE_MMX2
/* --- MMX2 path: four funnyUVCode calls per plane; after the U plane the
 * source is switched to src2 and the destination advanced by 4096 bytes
 * (2048 uint16 entries) for the V plane. --- */
2449 int i;
2450 if(canMMX2BeUsed)
2452 asm volatile(
2453 "pxor %%mm7, %%mm7 \n\t"
2454 "mov %0, %%"REG_c" \n\t"
2455 "mov %1, %%"REG_D" \n\t"
2456 "mov %2, %%"REG_d" \n\t"
2457 "mov %3, %%"REG_b" \n\t"
2458 "xor %%"REG_a", %%"REG_a" \n\t" // i
2459 PREFETCH" (%%"REG_c") \n\t"
2460 PREFETCH" 32(%%"REG_c") \n\t"
2461 PREFETCH" 64(%%"REG_c") \n\t"
2463 #ifdef ARCH_X86_64
2465 #define FUNNY_UV_CODE \
2466 "movl (%%"REG_b"), %%esi \n\t"\
2467 "call *%4 \n\t"\
2468 "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
2469 "add %%"REG_S", %%"REG_c" \n\t"\
2470 "add %%"REG_a", %%"REG_D" \n\t"\
2471 "xor %%"REG_a", %%"REG_a" \n\t"\
2473 #else
2475 #define FUNNY_UV_CODE \
2476 "movl (%%"REG_b"), %%esi \n\t"\
2477 "call *%4 \n\t"\
2478 "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
2479 "add %%"REG_a", %%"REG_D" \n\t"\
2480 "xor %%"REG_a", %%"REG_a" \n\t"\
2482 #endif
2484 FUNNY_UV_CODE
2485 FUNNY_UV_CODE
2486 FUNNY_UV_CODE
2487 FUNNY_UV_CODE
2488 "xor %%"REG_a", %%"REG_a" \n\t" // i
2489 "mov %5, %%"REG_c" \n\t" // src
2490 "mov %1, %%"REG_D" \n\t" // buf1
2491 "add $4096, %%"REG_D" \n\t"
2492 PREFETCH" (%%"REG_c") \n\t"
2493 PREFETCH" 32(%%"REG_c") \n\t"
2494 PREFETCH" 64(%%"REG_c") \n\t"
2496 FUNNY_UV_CODE
2497 FUNNY_UV_CODE
2498 FUNNY_UV_CODE
2499 FUNNY_UV_CODE
2501 :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2502 "m" (funnyUVCode), "m" (src2)
2503 : "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
/* fix the tail: outputs that would read past srcW-1 get the last pixel */
2505 for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2507 // printf("%d %d %d\n", dstWidth, i, srcW);
2508 dst[i] = src1[srcW-1]*128;
2509 dst[i+2048] = src2[srcW-1]*128;
2512 else
2514 #endif
/* --- non-MMX2 x86 path: fixed-point bilinear, U and V interleaved in
 * one loop iteration; same 16.16 position arithmetic as in hyscale. --- */
2515 long xInc_shr16 = (long) (xInc >> 16);
2516 int xInc_mask = xInc & 0xffff;
2517 asm volatile(
2518 "xor %%"REG_a", %%"REG_a" \n\t" // i
2519 "xor %%"REG_b", %%"REG_b" \n\t" // xx
2520 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2521 ".balign 16 \n\t"
2522 "1: \n\t"
2523 "mov %0, %%"REG_S" \n\t"
2524 "movzbl (%%"REG_S", %%"REG_b"), %%edi \n\t" //src[xx]
2525 "movzbl 1(%%"REG_S", %%"REG_b"), %%esi \n\t" //src[xx+1]
2526 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2527 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2528 "shll $16, %%edi \n\t"
2529 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2530 "mov %1, %%"REG_D" \n\t"
2531 "shrl $9, %%esi \n\t"
2532 "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
2534 "movzbl (%5, %%"REG_b"), %%edi \n\t" //src[xx]
2535 "movzbl 1(%5, %%"REG_b"), %%esi \n\t" //src[xx+1]
2536 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2537 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2538 "shll $16, %%edi \n\t"
2539 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2540 "mov %1, %%"REG_D" \n\t"
2541 "shrl $9, %%esi \n\t"
2542 "movw %%si, 4096(%%"REG_D", %%"REG_a", 2)\n\t"
2544 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2545 "adc %3, %%"REG_b" \n\t" //xx+= xInc>>8 + carry
2546 "add $1, %%"REG_a" \n\t"
2547 "cmp %2, %%"REG_a" \n\t"
2548 " jb 1b \n\t"
2550 /* GCC-3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2551 which is needed to support GCC-4.0 */
2552 #if defined(ARCH_X86_64) && ((__GNUC__ > 3) || ( __GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2553 :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2554 #else
2555 :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2556 #endif
2557 "r" (src2)
2558 : "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
2560 #ifdef HAVE_MMX2
2561 } //if MMX2 can't be used
2562 #endif
2563 #else
/* --- portable C fallback: 7-bit fixed-point bilinear for both planes --- */
2564 int i;
2565 unsigned int xpos=0;
2566 for(i=0;i<dstWidth;i++)
2568 register unsigned int xx=xpos>>16;
2569 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2570 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2571 dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2572 /* slower
2573 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2574 dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2576 xpos+=xInc;
2578 #endif
2582 static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2583 int srcSliceH, uint8_t* dst[], int dstStride[]){
2585 /* load a few things into local vars to make the code more readable? and faster */
2586 const int srcW= c->srcW;
2587 const int dstW= c->dstW;
2588 const int dstH= c->dstH;
2589 const int chrDstW= c->chrDstW;
2590 const int chrSrcW= c->chrSrcW;
2591 const int lumXInc= c->lumXInc;
2592 const int chrXInc= c->chrXInc;
2593 const int dstFormat= c->dstFormat;
2594 const int srcFormat= c->srcFormat;
2595 const int flags= c->flags;
2596 const int canMMX2BeUsed= c->canMMX2BeUsed;
2597 int16_t *vLumFilterPos= c->vLumFilterPos;
2598 int16_t *vChrFilterPos= c->vChrFilterPos;
2599 int16_t *hLumFilterPos= c->hLumFilterPos;
2600 int16_t *hChrFilterPos= c->hChrFilterPos;
2601 int16_t *vLumFilter= c->vLumFilter;
2602 int16_t *vChrFilter= c->vChrFilter;
2603 int16_t *hLumFilter= c->hLumFilter;
2604 int16_t *hChrFilter= c->hChrFilter;
2605 int32_t *lumMmxFilter= c->lumMmxFilter;
2606 int32_t *chrMmxFilter= c->chrMmxFilter;
2607 const int vLumFilterSize= c->vLumFilterSize;
2608 const int vChrFilterSize= c->vChrFilterSize;
2609 const int hLumFilterSize= c->hLumFilterSize;
2610 const int hChrFilterSize= c->hChrFilterSize;
2611 int16_t **lumPixBuf= c->lumPixBuf;
2612 int16_t **chrPixBuf= c->chrPixBuf;
2613 const int vLumBufSize= c->vLumBufSize;
2614 const int vChrBufSize= c->vChrBufSize;
2615 uint8_t *funnyYCode= c->funnyYCode;
2616 uint8_t *funnyUVCode= c->funnyUVCode;
2617 uint8_t *formatConvBuffer= c->formatConvBuffer;
2618 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2619 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2620 int lastDstY;
2622 /* vars which will change and which we need to store back in the context */
2623 int dstY= c->dstY;
2624 int lumBufIndex= c->lumBufIndex;
2625 int chrBufIndex= c->chrBufIndex;
2626 int lastInLumBuf= c->lastInLumBuf;
2627 int lastInChrBuf= c->lastInChrBuf;
2629 if(isPacked(c->srcFormat)){
2630 src[0]=
2631 src[1]=
2632 src[2]= src[0];
2633 srcStride[0]=
2634 srcStride[1]=
2635 srcStride[2]= srcStride[0];
2637 srcStride[1]<<= c->vChrDrop;
2638 srcStride[2]<<= c->vChrDrop;
2640 // printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2641 // (int)dst[0], (int)dst[1], (int)dst[2]);
2643 #if 0 //self test FIXME move to a vfilter or something
2645 static volatile int i=0;
2646 i++;
2647 if(srcFormat==IMGFMT_YV12 && i==1 && srcSliceH>= c->srcH)
2648 selfTest(src, srcStride, c->srcW, c->srcH);
2649 i--;
2651 #endif
2653 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2654 //dstStride[0],dstStride[1],dstStride[2]);
2656 if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
2658 static int firstTime=1; //FIXME move this into the context perhaps
2659 if(flags & SWS_PRINT_INFO && firstTime)
2661 MSG_WARN("SwScaler: Warning: dstStride is not aligned!\n"
2662 "SwScaler: ->cannot do aligned memory acesses anymore\n");
2663 firstTime=0;
2667 /* Note the user might start scaling the picture in the middle so this will not get executed
2668 this is not really intended but works currently, so ppl might do it */
2669 if(srcSliceY ==0){
2670 lumBufIndex=0;
2671 chrBufIndex=0;
2672 dstY=0;
2673 lastInLumBuf= -1;
2674 lastInChrBuf= -1;
2677 lastDstY= dstY;
2679 for(;dstY < dstH; dstY++){
2680 unsigned char *dest =dst[0]+dstStride[0]*dstY;
2681 const int chrDstY= dstY>>c->chrDstVSubSample;
2682 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2683 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
2685 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2686 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2687 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2688 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2690 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
2691 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
2692 //handle holes (FAST_BILINEAR & weird filters)
2693 if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2694 if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2695 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
2696 ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
2697 ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
2699 // Do we have enough lines in this slice to output the dstY line
2700 if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
2702 //Do horizontal scaling
2703 while(lastInLumBuf < lastLumSrcY)
2705 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2706 lumBufIndex++;
2707 // printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
2708 ASSERT(lumBufIndex < 2*vLumBufSize)
2709 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2710 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2711 // printf("%d %d\n", lumBufIndex, vLumBufSize);
2712 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2713 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2714 funnyYCode, c->srcFormat, formatConvBuffer,
2715 c->lumMmx2Filter, c->lumMmx2FilterPos);
2716 lastInLumBuf++;
2718 while(lastInChrBuf < lastChrSrcY)
2720 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2721 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2722 chrBufIndex++;
2723 ASSERT(chrBufIndex < 2*vChrBufSize)
2724 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
2725 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
2726 //FIXME replace parameters through context struct (some at least)
2728 if(!(isGray(srcFormat) || isGray(dstFormat)))
2729 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2730 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2731 funnyUVCode, c->srcFormat, formatConvBuffer,
2732 c->chrMmx2Filter, c->chrMmx2FilterPos);
2733 lastInChrBuf++;
2735 //wrap buf index around to stay inside the ring buffer
2736 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2737 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2739 else // not enough lines left in this slice -> load the rest in the buffer
2741 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
2742 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
2743 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
2744 vChrBufSize, vLumBufSize);*/
2746 //Do horizontal scaling
2747 while(lastInLumBuf+1 < srcSliceY + srcSliceH)
2749 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2750 lumBufIndex++;
2751 ASSERT(lumBufIndex < 2*vLumBufSize)
2752 ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2753 ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2754 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2755 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2756 funnyYCode, c->srcFormat, formatConvBuffer,
2757 c->lumMmx2Filter, c->lumMmx2FilterPos);
2758 lastInLumBuf++;
2760 while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
2762 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2763 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2764 chrBufIndex++;
2765 ASSERT(chrBufIndex < 2*vChrBufSize)
2766 ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH)
2767 ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
2769 if(!(isGray(srcFormat) || isGray(dstFormat)))
2770 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2771 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2772 funnyUVCode, c->srcFormat, formatConvBuffer,
2773 c->chrMmx2Filter, c->chrMmx2FilterPos);
2774 lastInChrBuf++;
2776 //wrap buf index around to stay inside the ring buffer
2777 if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2778 if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2779 break; //we can't output a dstY line so let's try with the next slice
2782 #ifdef HAVE_MMX
2783 b5Dither= dither8[dstY&1];
2784 g6Dither= dither4[dstY&1];
2785 g5Dither= dither8[dstY&1];
2786 r5Dither= dither8[(dstY+1)&1];
2787 #endif
2788 if(dstY < dstH-2)
2790 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2791 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2792 #ifdef HAVE_MMX
2793 int i;
2794 for(i=0; i<vLumFilterSize; i++)
2796 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
2797 lumMmxFilter[4*i+2]=
2798 lumMmxFilter[4*i+3]=
2799 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
2801 for(i=0; i<vChrFilterSize; i++)
2803 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
2804 chrMmxFilter[4*i+2]=
2805 chrMmxFilter[4*i+3]=
2806 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
2808 #endif
2809 if(dstFormat == IMGFMT_NV12 || dstFormat == IMGFMT_NV21){
2810 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2811 if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
2812 RENAME(yuv2nv12X)(c,
2813 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2814 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2815 dest, uDest, dstW, chrDstW, dstFormat);
2817 else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
2819 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2820 if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2821 if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
2823 int16_t *lumBuf = lumPixBuf[0];
2824 int16_t *chrBuf= chrPixBuf[0];
2825 RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
2827 else //General YV12
2829 RENAME(yuv2yuvX)(c,
2830 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2831 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2832 dest, uDest, vDest, dstW, chrDstW);
2835 else
2837 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2838 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2839 if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
2841 int chrAlpha= vChrFilter[2*dstY+1];
2842 RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
2843 dest, dstW, chrAlpha, dstFormat, flags, dstY);
2845 else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
2847 int lumAlpha= vLumFilter[2*dstY+1];
2848 int chrAlpha= vChrFilter[2*dstY+1];
2849 RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
2850 dest, dstW, lumAlpha, chrAlpha, dstY);
2852 else //General RGB
2854 RENAME(yuv2packedX)(c,
2855 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2856 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2857 dest, dstW, dstY);
2861 else // hmm looks like we can't use MMX here without overwriting this array's tail
2863 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2864 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2865 if(dstFormat == IMGFMT_NV12 || dstFormat == IMGFMT_NV21){
2866 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2867 if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
2868 yuv2nv12XinC(
2869 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2870 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2871 dest, uDest, dstW, chrDstW, dstFormat);
2873 else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
2875 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2876 if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2877 yuv2yuvXinC(
2878 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2879 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2880 dest, uDest, vDest, dstW, chrDstW);
2882 else
2884 ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2885 ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2886 yuv2packedXinC(c,
2887 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2888 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2889 dest, dstW, dstY);
2894 #ifdef HAVE_MMX
2895 __asm __volatile(SFENCE:::"memory");
2896 __asm __volatile(EMMS:::"memory");
2897 #endif
2898 /* store changed local vars back in the context */
2899 c->dstY= dstY;
2900 c->lumBufIndex= lumBufIndex;
2901 c->chrBufIndex= chrBufIndex;
2902 c->lastInLumBuf= lastInLumBuf;
2903 c->lastInChrBuf= lastInChrBuf;
2905 return dstY - lastDstY;