2 Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
28 /* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */
35 #define PREFETCH "prefetch"
36 #define PREFETCHW "prefetchw"
37 #elif defined ( HAVE_MMX2 )
38 #define PREFETCH "prefetchnta"
39 #define PREFETCHW "prefetcht0"
41 #define PREFETCH "/nop"
42 #define PREFETCHW "/nop"
46 #define SFENCE "sfence"
52 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
53 #elif defined (HAVE_3DNOW)
54 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
58 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
60 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
62 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
65 #include "swscale_altivec_template.c"
/* Vertical scaler (MMX inline asm): accumulates filter taps into a
 * VROUNDER-biased accumulator pair (mm3/mm4) via pmulhw/paddw, walking the
 * per-line coefficient+pointer list at `offset`(%0) in 16-byte steps, then
 * shifts >>3, packs to unsigned bytes and streams out with MOVNTQ.
 * NOTE(review): this extraction elides interior lines (gaps in the embedded
 * numbering, e.g. the loop label between 74 and 76 and the branch after 85)
 * — code kept byte-identical; consult the full file before modifying. */
68 #define YSCALEYUV2YV12X(x, offset) \
69 "xor %%"REG_a", %%"REG_a" \n\t"\
70 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
71 "movq %%mm3, %%mm4 \n\t"\
72 "lea " offset "(%0), %%"REG_d" \n\t"\
73 "mov (%%"REG_d"), %%"REG_S" \n\t"\
74 ".balign 16 \n\t" /* FIXME Unroll? */\
76 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
77 "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\
78 "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm5\n\t" /* srcData */\
79 "add $16, %%"REG_d" \n\t"\
80 "mov (%%"REG_d"), %%"REG_S" \n\t"\
81 "test %%"REG_S", %%"REG_S" \n\t"\
82 "pmulhw %%mm0, %%mm2 \n\t"\
83 "pmulhw %%mm0, %%mm5 \n\t"\
84 "paddw %%mm2, %%mm3 \n\t"\
85 "paddw %%mm5, %%mm4 \n\t"\
87 "psraw $3, %%mm3 \n\t"\
88 "psraw $3, %%mm4 \n\t"\
89 "packuswb %%mm4, %%mm3 \n\t"\
90 MOVNTQ(%%mm3, (%1, %%REGa))\
91 "add $8, %%"REG_a" \n\t"\
92 "cmp %2, %%"REG_a" \n\t"\
93 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
94 "movq %%mm3, %%mm4 \n\t"\
95 "lea " offset "(%0), %%"REG_d" \n\t"\
96 "mov (%%"REG_d"), %%"REG_S" \n\t"\
/* 1-tap (unfiltered) vertical pass: reads 16-bit samples, shifts >>7 back to
 * 8-bit range, packs and streams via MOVNTQ. NOTE(review): loop label and the
 * closing branch are elided in this view (embedded-number gaps 101->103 and
 * after 109); code kept byte-identical. */
99 #define YSCALEYUV2YV121 \
100 "mov %2, %%"REG_a" \n\t"\
101 ".balign 16 \n\t" /* FIXME Unroll? */\
103 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
104 "movq 8(%0, %%"REG_a", 2), %%mm1\n\t"\
105 "psraw $7, %%mm0 \n\t"\
106 "psraw $7, %%mm1 \n\t"\
107 "packuswb %%mm1, %%mm0 \n\t"\
108 MOVNTQ(%%mm0, (%1, %%REGa))\
109 "add $8, %%"REG_a" \n\t"\
113 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
114 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
115 "r" (dest), "m" (dstW),
116 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
117 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
/* Packed-output vertical pass: first loop accumulates chroma (U at offset 0,
 * V at +4096 words) into mm3/mm4, second loop accumulates luma pairs into
 * mm1/mm7, both driven by coefficient+pointer lists reached through %0.
 * NOTE(review): loop labels and the jnz back-branches between the two phases
 * are elided in this extraction (numbering gaps 127->130, 139->142,
 * 145->148); code kept byte-identical. */
119 #define YSCALEYUV2PACKEDX \
120 "xor %%"REG_a", %%"REG_a" \n\t"\
124 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
125 "mov (%%"REG_d"), %%"REG_S" \n\t"\
126 "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
127 "movq %%mm3, %%mm4 \n\t"\
130 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
131 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
132 "movq 4096(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
133 "add $16, %%"REG_d" \n\t"\
134 "mov (%%"REG_d"), %%"REG_S" \n\t"\
135 "pmulhw %%mm0, %%mm2 \n\t"\
136 "pmulhw %%mm0, %%mm5 \n\t"\
137 "paddw %%mm2, %%mm3 \n\t"\
138 "paddw %%mm5, %%mm4 \n\t"\
139 "test %%"REG_S", %%"REG_S" \n\t"\
142 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
143 "mov (%%"REG_d"), %%"REG_S" \n\t"\
144 "movq "VROUNDER_OFFSET"(%0), %%mm1\n\t"\
145 "movq %%mm1, %%mm7 \n\t"\
148 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
149 "movq (%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y1srcData */\
150 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\
151 "add $16, %%"REG_d" \n\t"\
152 "mov (%%"REG_d"), %%"REG_S" \n\t"\
153 "pmulhw %%mm0, %%mm2 \n\t"\
154 "pmulhw %%mm0, %%mm5 \n\t"\
155 "paddw %%mm2, %%mm1 \n\t"\
156 "paddw %%mm5, %%mm7 \n\t"\
157 "test %%"REG_S", %%"REG_S" \n\t"\
/* YUV->RGB matrix step on the accumulators produced by YSCALEYUV2PACKEDX:
 * subtracts the U/V/Y offsets held in the context at %0, multiplies by the
 * per-context coefficients (pmulhw), interleaves to per-pixel words and
 * packs to bytes. Register roles are documented inline (mm0=B1 ... mm6=R2).
 * NOTE(review): one line elided between 161 and 163 in this extraction;
 * code kept byte-identical. */
161 #define YSCALEYUV2RGBX \
163 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
164 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
165 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
166 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
167 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
168 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
169 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
170 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
171 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
172 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
173 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
174 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
175 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
176 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
177 "paddw %%mm3, %%mm4 \n\t"\
178 "movq %%mm2, %%mm0 \n\t"\
179 "movq %%mm5, %%mm6 \n\t"\
180 "movq %%mm4, %%mm3 \n\t"\
181 "punpcklwd %%mm2, %%mm2 \n\t"\
182 "punpcklwd %%mm5, %%mm5 \n\t"\
183 "punpcklwd %%mm4, %%mm4 \n\t"\
184 "paddw %%mm1, %%mm2 \n\t"\
185 "paddw %%mm1, %%mm5 \n\t"\
186 "paddw %%mm1, %%mm4 \n\t"\
187 "punpckhwd %%mm0, %%mm0 \n\t"\
188 "punpckhwd %%mm6, %%mm6 \n\t"\
189 "punpckhwd %%mm3, %%mm3 \n\t"\
190 "paddw %%mm7, %%mm0 \n\t"\
191 "paddw %%mm7, %%mm6 \n\t"\
192 "paddw %%mm7, %%mm3 \n\t"\
193 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
194 "packuswb %%mm0, %%mm2 \n\t"\
195 "packuswb %%mm6, %%mm5 \n\t"\
196 "packuswb %%mm3, %%mm4 \n\t"\
197 "pxor %%mm7, %%mm7 \n\t"
/* Full-chroma bilinear YUV->RGB: blends buf0/buf1 (luma, weight in %6) and
 * uvbuf0/uvbuf1 (chroma, weight in %7), then applies the global MANGLE()d
 * offset/coefficient constants (w80, w400, yCoeff, ...) rather than the
 * per-context ones. NOTE(review): several lines are elided in this
 * extraction (numbering gaps 207->210, 227->230, 236->239, 244->246,
 * 248->250) — code kept byte-identical. */
199 #define FULL_YSCALEYUV2RGB \
200 "pxor %%mm7, %%mm7 \n\t"\
201 "movd %6, %%mm6 \n\t" /*yalpha1*/\
202 "punpcklwd %%mm6, %%mm6 \n\t"\
203 "punpcklwd %%mm6, %%mm6 \n\t"\
204 "movd %7, %%mm5 \n\t" /*uvalpha1*/\
205 "punpcklwd %%mm5, %%mm5 \n\t"\
206 "punpcklwd %%mm5, %%mm5 \n\t"\
207 "xor %%"REG_a", %%"REG_a" \n\t"\
210 "movq (%0, %%"REG_a", 2), %%mm0 \n\t" /*buf0[eax]*/\
211 "movq (%1, %%"REG_a", 2), %%mm1 \n\t" /*buf1[eax]*/\
212 "movq (%2, %%"REG_a",2), %%mm2 \n\t" /* uvbuf0[eax]*/\
213 "movq (%3, %%"REG_a",2), %%mm3 \n\t" /* uvbuf1[eax]*/\
214 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
215 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
216 "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
217 "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
218 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
219 "movq 4096(%2, %%"REG_a",2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
220 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
221 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
222 "movq 4096(%3, %%"REG_a",2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
223 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
224 "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
225 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
226 "psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\
227 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
230 "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
231 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
232 "pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
233 "psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
234 "pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
235 "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
236 "psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\
239 "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
240 "pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
241 "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
242 "paddw %%mm1, %%mm3 \n\t" /* B*/\
243 "paddw %%mm1, %%mm0 \n\t" /* R*/\
244 "packuswb %%mm3, %%mm3 \n\t"\
246 "packuswb %%mm0, %%mm0 \n\t"\
247 "paddw %%mm4, %%mm2 \n\t"\
248 "paddw %%mm2, %%mm1 \n\t" /* G*/\
250 "packuswb %%mm1, %%mm1 \n\t"
/* Two-line blend for packed (YUY2-style) output: pre-shifts the stored
 * filter coefficients >>3, then bilinearly blends chroma (mm3/mm4) and luma
 * (mm1/mm7) between the two input lines using the coefficients kept in the
 * context block at "#c". NOTE(review): the loop label between 260 and 263
 * is elided in this extraction; code kept byte-identical. */
253 #define REAL_YSCALEYUV2PACKED(index, c) \
254 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
255 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1\n\t"\
256 "psraw $3, %%mm0 \n\t"\
257 "psraw $3, %%mm1 \n\t"\
258 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c")\n\t"\
259 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c")\n\t"\
260 "xor "#index", "#index" \n\t"\
263 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
264 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
265 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
266 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
267 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
268 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
269 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
270 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
271 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
272 "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
273 "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
274 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
275 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
276 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
277 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
278 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
279 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
280 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
281 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
282 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
283 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
284 "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
285 "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
286 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
287 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
289 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
/* Two-line bilinear blend followed by the per-context YUV->RGB matrix (same
 * algebra as YSCALEYUV2RGBX but with blended inputs). NOTE(review): the loop
 * label between 292 and 295 is elided in this extraction; code kept
 * byte-identical. */
291 #define REAL_YSCALEYUV2RGB(index, c) \
292 "xor "#index", "#index" \n\t"\
295 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
296 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
297 "movq 4096(%2, "#index"), %%mm5\n\t" /* uvbuf0[eax+2048]*/\
298 "movq 4096(%3, "#index"), %%mm4\n\t" /* uvbuf1[eax+2048]*/\
299 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
300 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
301 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
302 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
303 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
304 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
305 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
306 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
307 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
308 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
309 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
310 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
311 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
312 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
313 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
314 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
315 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
316 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
317 "movq 8(%0, "#index", 2), %%mm6\n\t" /*buf0[eax]*/\
318 "movq 8(%1, "#index", 2), %%mm7\n\t" /*buf1[eax]*/\
319 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
320 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
321 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
322 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
323 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
324 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
325 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
326 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
327 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
328 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
329 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
330 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
331 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
332 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
333 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
334 "paddw %%mm3, %%mm4 \n\t"\
335 "movq %%mm2, %%mm0 \n\t"\
336 "movq %%mm5, %%mm6 \n\t"\
337 "movq %%mm4, %%mm3 \n\t"\
338 "punpcklwd %%mm2, %%mm2 \n\t"\
339 "punpcklwd %%mm5, %%mm5 \n\t"\
340 "punpcklwd %%mm4, %%mm4 \n\t"\
341 "paddw %%mm1, %%mm2 \n\t"\
342 "paddw %%mm1, %%mm5 \n\t"\
343 "paddw %%mm1, %%mm4 \n\t"\
344 "punpckhwd %%mm0, %%mm0 \n\t"\
345 "punpckhwd %%mm6, %%mm6 \n\t"\
346 "punpckhwd %%mm3, %%mm3 \n\t"\
347 "paddw %%mm7, %%mm0 \n\t"\
348 "paddw %%mm7, %%mm6 \n\t"\
349 "paddw %%mm7, %%mm3 \n\t"\
350 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
351 "packuswb %%mm0, %%mm2 \n\t"\
352 "packuswb %%mm6, %%mm5 \n\t"\
353 "packuswb %%mm3, %%mm4 \n\t"\
354 "pxor %%mm7, %%mm7 \n\t"
355 #define YSCALEYUV2RGB(index, c) REAL_YSCALEYUV2RGB(index, c)
/* Single-line (no blend) packed variant: just loads and >>7 rescales luma
 * and chroma from buf0/uvbuf0. NOTE(review): loop label between 358 and 361
 * elided in this extraction; code kept byte-identical. */
357 #define REAL_YSCALEYUV2PACKED1(index, c) \
358 "xor "#index", "#index" \n\t"\
361 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
362 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
363 "psraw $7, %%mm3 \n\t" \
364 "psraw $7, %%mm4 \n\t" \
365 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
366 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
367 "psraw $7, %%mm1 \n\t" \
368 "psraw $7, %%mm7 \n\t" \
370 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
/* Single-line (no vertical blend) YUV->RGB: loads buf0/uvbuf0 only, >>4
 * rescale, then the same per-context matrix/interleave as REAL_YSCALEYUV2RGB.
 * NOTE(review): loop label between 373 and 376 elided in this extraction;
 * code kept byte-identical. */
372 #define REAL_YSCALEYUV2RGB1(index, c) \
373 "xor "#index", "#index" \n\t"\
376 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
377 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
378 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
379 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
380 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
381 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
382 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
383 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
384 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
385 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
386 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
387 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
388 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
389 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
390 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
391 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
392 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
393 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
394 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
395 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
396 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
397 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
398 "paddw %%mm3, %%mm4 \n\t"\
399 "movq %%mm2, %%mm0 \n\t"\
400 "movq %%mm5, %%mm6 \n\t"\
401 "movq %%mm4, %%mm3 \n\t"\
402 "punpcklwd %%mm2, %%mm2 \n\t"\
403 "punpcklwd %%mm5, %%mm5 \n\t"\
404 "punpcklwd %%mm4, %%mm4 \n\t"\
405 "paddw %%mm1, %%mm2 \n\t"\
406 "paddw %%mm1, %%mm5 \n\t"\
407 "paddw %%mm1, %%mm4 \n\t"\
408 "punpckhwd %%mm0, %%mm0 \n\t"\
409 "punpckhwd %%mm6, %%mm6 \n\t"\
410 "punpckhwd %%mm3, %%mm3 \n\t"\
411 "paddw %%mm7, %%mm0 \n\t"\
412 "paddw %%mm7, %%mm6 \n\t"\
413 "paddw %%mm7, %%mm3 \n\t"\
414 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
415 "packuswb %%mm0, %%mm2 \n\t"\
416 "packuswb %%mm6, %%mm5 \n\t"\
417 "packuswb %%mm3, %%mm4 \n\t"\
418 "pxor %%mm7, %%mm7 \n\t"
419 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
/* "1b" packed variant: averages the two chroma lines (paddw then psrlw $8)
 * instead of filtering, luma from buf0 only. NOTE(review): loop label
 * between 422 and 425 elided in this extraction; code kept byte-identical. */
421 #define REAL_YSCALEYUV2PACKED1b(index, c) \
422 "xor "#index", "#index" \n\t"\
425 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
426 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
427 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
428 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
429 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
430 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
431 "psrlw $8, %%mm3 \n\t" \
432 "psrlw $8, %%mm4 \n\t" \
433 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
434 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
435 "psraw $7, %%mm1 \n\t" \
436 "psraw $7, %%mm7 \n\t"
437 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
439 // do vertical chrominance interpolation
/* "1b" RGB variant: averages the two chroma lines (paddw, psrlw $5 — the
 * inline FIXME notes possible overflow), luma from buf0 only, then the same
 * per-context matrix/interleave as the other RGB macros. NOTE(review): loop
 * label between 441 and 444 elided in this extraction; code kept
 * byte-identical. */
440 #define REAL_YSCALEYUV2RGB1b(index, c) \
441 "xor "#index", "#index" \n\t"\
444 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
445 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
446 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
447 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
448 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
449 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
450 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
451 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
452 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
453 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
454 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
455 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
456 "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
457 "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
458 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
459 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
460 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
461 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
462 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
463 "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
464 "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
465 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
466 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
467 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
468 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
469 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
470 "paddw %%mm3, %%mm4 \n\t"\
471 "movq %%mm2, %%mm0 \n\t"\
472 "movq %%mm5, %%mm6 \n\t"\
473 "movq %%mm4, %%mm3 \n\t"\
474 "punpcklwd %%mm2, %%mm2 \n\t"\
475 "punpcklwd %%mm5, %%mm5 \n\t"\
476 "punpcklwd %%mm4, %%mm4 \n\t"\
477 "paddw %%mm1, %%mm2 \n\t"\
478 "paddw %%mm1, %%mm5 \n\t"\
479 "paddw %%mm1, %%mm4 \n\t"\
480 "punpckhwd %%mm0, %%mm0 \n\t"\
481 "punpckhwd %%mm6, %%mm6 \n\t"\
482 "punpckhwd %%mm3, %%mm3 \n\t"\
483 "paddw %%mm7, %%mm0 \n\t"\
484 "paddw %%mm7, %%mm6 \n\t"\
485 "paddw %%mm7, %%mm3 \n\t"\
486 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
487 "packuswb %%mm0, %%mm2 \n\t"\
488 "packuswb %%mm6, %%mm5 \n\t"\
489 "packuswb %%mm3, %%mm4 \n\t"\
490 "pxor %%mm7, %%mm7 \n\t"
491 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
/* Store packed B/G/R byte registers (mm2/mm4/mm5, mm7=0) as 4-byte 0RGB
 * pixels, 8 pixels (32 bytes) per iteration via four MOVNTQs.
 * NOTE(review): lines elided in this extraction (gaps 506->508, 511->513,
 * 514->516 — presumably the trailing branch); code kept byte-identical. */
493 #define REAL_WRITEBGR32(dst, dstw, index) \
494 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
495 "movq %%mm2, %%mm1 \n\t" /* B */\
496 "movq %%mm5, %%mm6 \n\t" /* R */\
497 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
498 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
499 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
500 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
501 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
502 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
503 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
504 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
505 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
506 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
508 MOVNTQ(%%mm0, (dst, index, 4))\
509 MOVNTQ(%%mm2, 8(dst, index, 4))\
510 MOVNTQ(%%mm1, 16(dst, index, 4))\
511 MOVNTQ(%%mm3, 24(dst, index, 4))\
513 "add $8, "#index" \n\t"\
514 "cmp "#dstw", "#index" \n\t"\
516 #define WRITEBGR32(dst, dstw, index) REAL_WRITEBGR32(dst, dstw, index)
/* Pack B/G/R bytes to RGB565: mask to 5/6/5 significant bits (bF8/bFC),
 * shift into field positions and OR together, two MOVNTQ stores per 8 px.
 * NOTE(review): interleaved lines elided in this extraction (gaps around
 * 522->524, 525->527, 530->532, 533->535, 536->538, 539->541 and the
 * trailing branch); code kept byte-identical. */
518 #define REAL_WRITEBGR16(dst, dstw, index) \
519 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
520 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
521 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
522 "psrlq $3, %%mm2 \n\t"\
524 "movq %%mm2, %%mm1 \n\t"\
525 "movq %%mm4, %%mm3 \n\t"\
527 "punpcklbw %%mm7, %%mm3 \n\t"\
528 "punpcklbw %%mm5, %%mm2 \n\t"\
529 "punpckhbw %%mm7, %%mm4 \n\t"\
530 "punpckhbw %%mm5, %%mm1 \n\t"\
532 "psllq $3, %%mm3 \n\t"\
533 "psllq $3, %%mm4 \n\t"\
535 "por %%mm3, %%mm2 \n\t"\
536 "por %%mm4, %%mm1 \n\t"\
538 MOVNTQ(%%mm2, (dst, index, 2))\
539 MOVNTQ(%%mm1, 8(dst, index, 2))\
541 "add $8, "#index" \n\t"\
542 "cmp "#dstw", "#index" \n\t"\
544 #define WRITEBGR16(dst, dstw, index) REAL_WRITEBGR16(dst, dstw, index)
/* Pack B/G/R bytes to RGB555: like WRITEBGR16 but all three channels masked
 * to 5 bits (bF8) and shifted for the 1-5-5-5 layout. NOTE(review): several
 * interleaved lines elided in this extraction (same gap pattern as the 565
 * writer); code kept byte-identical. */
546 #define REAL_WRITEBGR15(dst, dstw, index) \
547 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
548 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
549 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
550 "psrlq $3, %%mm2 \n\t"\
551 "psrlq $1, %%mm5 \n\t"\
553 "movq %%mm2, %%mm1 \n\t"\
554 "movq %%mm4, %%mm3 \n\t"\
556 "punpcklbw %%mm7, %%mm3 \n\t"\
557 "punpcklbw %%mm5, %%mm2 \n\t"\
558 "punpckhbw %%mm7, %%mm4 \n\t"\
559 "punpckhbw %%mm5, %%mm1 \n\t"\
561 "psllq $2, %%mm3 \n\t"\
562 "psllq $2, %%mm4 \n\t"\
564 "por %%mm3, %%mm2 \n\t"\
565 "por %%mm4, %%mm1 \n\t"\
567 MOVNTQ(%%mm2, (dst, index, 2))\
568 MOVNTQ(%%mm1, 8(dst, index, 2))\
570 "add $8, "#index" \n\t"\
571 "cmp "#dstw", "#index" \n\t"\
573 #define WRITEBGR15(dst, dstw, index) REAL_WRITEBGR15(dst, dstw, index)
/* Legacy 24-bit packer: interleaves to 0RGB dwords, then shifts/masks
 * (bm000xxx constants) to squeeze four 0RGB dwords into three 8-byte RGB24
 * stores. Kept for reference; the MMX/MMX2 variants below supersede it.
 * NOTE(review): interior lines elided in this extraction (gaps 588->590,
 * 597->599, 611->613, 620->622, 625->627 and the trailing branch); code
 * kept byte-identical. */
575 #define WRITEBGR24OLD(dst, dstw, index) \
576 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
577 "movq %%mm2, %%mm1 \n\t" /* B */\
578 "movq %%mm5, %%mm6 \n\t" /* R */\
579 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
580 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
581 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
582 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
583 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
584 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
585 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
586 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
587 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
588 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
590 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
591 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
592 "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
593 "pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
594 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
595 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
596 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
597 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
599 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
600 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
601 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
602 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
603 "pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
604 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
605 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
606 "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
607 "pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
608 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
609 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
610 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
611 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
613 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
614 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
615 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
616 "pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
617 "pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
618 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
619 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
620 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
622 MOVNTQ(%%mm0, (dst))\
623 MOVNTQ(%%mm2, 8(dst))\
624 MOVNTQ(%%mm3, 16(dst))\
625 "add $24, "#dst" \n\t"\
627 "add $8, "#index" \n\t"\
628 "cmp "#dstw", "#index" \n\t"\
/* Plain-MMX 24-bit packer: builds 0RGBRGB0 quads with punpckhdq after a $40
 * left shift, then shifts/ORs three 8-byte RGB24 groups out via MOVNTQ.
 * NOTE(review): interior lines elided in this extraction (gaps 644->646,
 * 649->651, 654->656, 659->661, 665->667, 671->673, 676->678 and the
 * trailing branch); code kept byte-identical. */
631 #define WRITEBGR24MMX(dst, dstw, index) \
632 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
633 "movq %%mm2, %%mm1 \n\t" /* B */\
634 "movq %%mm5, %%mm6 \n\t" /* R */\
635 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
636 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
637 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
638 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
639 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
640 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
641 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
642 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
643 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
644 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
646 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
647 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
648 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
649 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
651 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
652 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
653 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
654 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
656 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
657 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
658 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
659 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
661 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
662 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
663 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
664 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
665 MOVNTQ(%%mm0, (dst))\
667 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
668 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
669 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
670 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
671 MOVNTQ(%%mm6, 8(dst))\
673 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
674 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
675 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
676 MOVNTQ(%%mm5, 16(dst))\
678 "add $24, "#dst" \n\t"\
680 "add $8, "#index" \n\t"\
681 "cmp "#dstw", "#index" \n\t"\
/* MMX2 24-bit packer: uses pshufw plus the M24A/M24B/M24C byte masks to
 * assemble three 8-byte RGB24 groups directly. The WRITEBGR24 selection
 * defines at the bottom pick this or the plain-MMX variant; the #if/#else
 * lines around them are elided in this extraction (as are gaps 690->692,
 * 694->696, 699->701, 704->706, 708->710, 712->714, 716->718, 720->722,
 * 724->726 and the trailing branch). Code kept byte-identical. */
684 #define WRITEBGR24MMX2(dst, dstw, index) \
685 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
686 "movq "MANGLE(M24A)", %%mm0 \n\t"\
687 "movq "MANGLE(M24C)", %%mm7 \n\t"\
688 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
689 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
690 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
692 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
693 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
694 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
696 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
697 "por %%mm1, %%mm6 \n\t"\
698 "por %%mm3, %%mm6 \n\t"\
699 MOVNTQ(%%mm6, (dst))\
701 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
702 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
703 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
704 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
706 "pand "MANGLE(M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
707 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
708 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
710 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
711 "por %%mm3, %%mm6 \n\t"\
712 MOVNTQ(%%mm6, 8(dst))\
714 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
715 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
716 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
718 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
719 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
720 "pand "MANGLE(M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
722 "por %%mm1, %%mm3 \n\t"\
723 "por %%mm3, %%mm6 \n\t"\
724 MOVNTQ(%%mm6, 16(dst))\
726 "add $24, "#dst" \n\t"\
728 "add $8, "#index" \n\t"\
729 "cmp "#dstw", "#index" \n\t"\
734 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
737 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
/* Interleave Y (mm1/mm7) with packed U/V (mm3/mm4) into YUYV order and
 * stream two quadwords per iteration. NOTE(review): lines elided in this
 * extraction (gaps 747->749, 750->752 and the trailing branch); code kept
 * byte-identical. */
740 #define REAL_WRITEYUY2(dst, dstw, index) \
741 "packuswb %%mm3, %%mm3 \n\t"\
742 "packuswb %%mm4, %%mm4 \n\t"\
743 "packuswb %%mm7, %%mm1 \n\t"\
744 "punpcklbw %%mm4, %%mm3 \n\t"\
745 "movq %%mm1, %%mm7 \n\t"\
746 "punpcklbw %%mm3, %%mm1 \n\t"\
747 "punpckhbw %%mm3, %%mm7 \n\t"\
749 MOVNTQ(%%mm1, (dst, index, 2))\
750 MOVNTQ(%%mm7, 8(dst, index, 2))\
752 "add $8, "#index" \n\t"\
753 "cmp "#dstw", "#index" \n\t"\
755 #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
758 static inline void RENAME(yuv2yuvX
)(SwsContext
*c
, int16_t *lumFilter
, int16_t **lumSrc
, int lumFilterSize
,
759 int16_t *chrFilter
, int16_t **chrSrc
, int chrFilterSize
,
760 uint8_t *dest
, uint8_t *uDest
, uint8_t *vDest
, long dstW
, long chrDstW
)
766 YSCALEYUV2YV12X(0, CHR_MMX_FILTER_OFFSET
)
767 :: "r" (&c
->redDither
),
768 "r" (uDest
), "p" (chrDstW
)
769 : "%"REG_a
, "%"REG_d
, "%"REG_S
773 YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET
)
774 :: "r" (&c
->redDither
),
775 "r" (vDest
), "p" (chrDstW
)
776 : "%"REG_a
, "%"REG_d
, "%"REG_S
781 YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET
)
782 :: "r" (&c
->redDither
),
783 "r" (dest
), "p" (dstW
)
784 : "%"REG_a
, "%"REG_d
, "%"REG_S
788 yuv2yuvX_altivec_real(lumFilter
, lumSrc
, lumFilterSize
,
789 chrFilter
, chrSrc
, chrFilterSize
,
790 dest
, uDest
, vDest
, dstW
, chrDstW
);
792 yuv2yuvXinC(lumFilter
, lumSrc
, lumFilterSize
,
793 chrFilter
, chrSrc
, chrFilterSize
,
794 dest
, uDest
, vDest
, dstW
, chrDstW
);
795 #endif //!HAVE_ALTIVEC
799 static inline void RENAME(yuv2nv12X
)(SwsContext
*c
, int16_t *lumFilter
, int16_t **lumSrc
, int lumFilterSize
,
800 int16_t *chrFilter
, int16_t **chrSrc
, int chrFilterSize
,
801 uint8_t *dest
, uint8_t *uDest
, int dstW
, int chrDstW
, int dstFormat
)
803 yuv2nv12XinC(lumFilter
, lumSrc
, lumFilterSize
,
804 chrFilter
, chrSrc
, chrFilterSize
,
805 dest
, uDest
, dstW
, chrDstW
, dstFormat
);
808 static inline void RENAME(yuv2yuv1
)(int16_t *lumSrc
, int16_t *chrSrc
,
809 uint8_t *dest
, uint8_t *uDest
, uint8_t *vDest
, long dstW
, long chrDstW
)
816 :: "r" (chrSrc
+ chrDstW
), "r" (uDest
+ chrDstW
),
823 :: "r" (chrSrc
+ 2048 + chrDstW
), "r" (vDest
+ chrDstW
),
831 :: "r" (lumSrc
+ dstW
), "r" (dest
+ dstW
),
837 for(i
=0; i
<dstW
; i
++)
839 int val
= lumSrc
[i
]>>7;
850 for(i
=0; i
<chrDstW
; i
++)
853 int v
=chrSrc
[i
+ 2048]>>7;
857 else if (u
>255) u
=255;
859 else if (v
>255) v
=255;
870 * vertical scale YV12 to RGB
872 static inline void RENAME(yuv2packedX
)(SwsContext
*c
, int16_t *lumFilter
, int16_t **lumSrc
, int lumFilterSize
,
873 int16_t *chrFilter
, int16_t **chrSrc
, int chrFilterSize
,
874 uint8_t *dest
, int dstW
, int dstY
)
884 WRITEBGR32(%4, %5, %%REGa
)
886 :: "r" (&c
->redDither
),
887 "m" (dummy
), "m" (dummy
), "m" (dummy
),
888 "r" (dest
), "m" (dstW
)
889 : "%"REG_a
, "%"REG_d
, "%"REG_S
897 "lea (%%"REG_a
", %%"REG_a
", 2), %%"REG_b
"\n\t" //FIXME optimize
898 "add %4, %%"REG_b
" \n\t"
899 WRITEBGR24(%%REGb
, %5, %%REGa
)
901 :: "r" (&c
->redDither
),
902 "m" (dummy
), "m" (dummy
), "m" (dummy
),
903 "r" (dest
), "m" (dstW
)
904 : "%"REG_a
, "%"REG_b
, "%"REG_d
, "%"REG_S
//FIXME ebx
912 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
914 "paddusb "MANGLE(b5Dither
)", %%mm2\n\t"
915 "paddusb "MANGLE(g5Dither
)", %%mm4\n\t"
916 "paddusb "MANGLE(r5Dither
)", %%mm5\n\t"
919 WRITEBGR15(%4, %5, %%REGa
)
921 :: "r" (&c
->redDither
),
922 "m" (dummy
), "m" (dummy
), "m" (dummy
),
923 "r" (dest
), "m" (dstW
)
924 : "%"REG_a
, "%"REG_d
, "%"REG_S
932 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
934 "paddusb "MANGLE(b5Dither
)", %%mm2\n\t"
935 "paddusb "MANGLE(g6Dither
)", %%mm4\n\t"
936 "paddusb "MANGLE(r5Dither
)", %%mm5\n\t"
939 WRITEBGR16(%4, %5, %%REGa
)
941 :: "r" (&c
->redDither
),
942 "m" (dummy
), "m" (dummy
), "m" (dummy
),
943 "r" (dest
), "m" (dstW
)
944 : "%"REG_a
, "%"REG_d
, "%"REG_S
952 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
954 "psraw $3, %%mm3 \n\t"
955 "psraw $3, %%mm4 \n\t"
956 "psraw $3, %%mm1 \n\t"
957 "psraw $3, %%mm7 \n\t"
958 WRITEYUY2(%4, %5, %%REGa
)
960 :: "r" (&c
->redDither
),
961 "m" (dummy
), "m" (dummy
), "m" (dummy
),
962 "r" (dest
), "m" (dstW
)
963 : "%"REG_a
, "%"REG_d
, "%"REG_S
970 altivec_yuv2packedX (c
, lumFilter
, lumSrc
, lumFilterSize
,
971 chrFilter
, chrSrc
, chrFilterSize
,
974 yuv2packedXinC(c
, lumFilter
, lumSrc
, lumFilterSize
,
975 chrFilter
, chrSrc
, chrFilterSize
,
983 * vertical bilinear scale YV12 to RGB
985 static inline void RENAME(yuv2packed2
)(SwsContext
*c
, uint16_t *buf0
, uint16_t *buf1
, uint16_t *uvbuf0
, uint16_t *uvbuf1
,
986 uint8_t *dest
, int dstW
, int yalpha
, int uvalpha
, int y
)
988 int yalpha1
=yalpha
^4095;
989 int uvalpha1
=uvalpha
^4095;
993 if(flags
&SWS_FULL_CHR_H_INT
)
1003 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1004 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1006 "movq %%mm3, %%mm1 \n\t"
1007 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1008 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1010 MOVNTQ(%%mm3
, (%4, %%REGa
, 4))
1011 MOVNTQ(%%mm1
, 8(%4, %%REGa
, 4))
1013 "add $4, %%"REG_a
" \n\t"
1014 "cmp %5, %%"REG_a
" \n\t"
1018 :: "r" (buf0
), "r" (buf1
), "r" (uvbuf0
), "r" (uvbuf1
), "r" (dest
), "m" ((long)dstW
),
1019 "m" (yalpha1
), "m" (uvalpha1
)
1029 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1030 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1032 "movq %%mm3, %%mm1 \n\t"
1033 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1034 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1036 "movq %%mm3, %%mm2 \n\t" // BGR0BGR0
1037 "psrlq $8, %%mm3 \n\t" // GR0BGR00
1038 "pand "MANGLE(bm00000111
)", %%mm2\n\t" // BGR00000
1039 "pand "MANGLE(bm11111000
)", %%mm3\n\t" // 000BGR00
1040 "por %%mm2, %%mm3 \n\t" // BGRBGR00
1041 "movq %%mm1, %%mm2 \n\t"
1042 "psllq $48, %%mm1 \n\t" // 000000BG
1043 "por %%mm1, %%mm3 \n\t" // BGRBGRBG
1045 "movq %%mm2, %%mm1 \n\t" // BGR0BGR0
1046 "psrld $16, %%mm2 \n\t" // R000R000
1047 "psrlq $24, %%mm1 \n\t" // 0BGR0000
1048 "por %%mm2, %%mm1 \n\t" // RBGRR000
1050 "mov %4, %%"REG_b
" \n\t"
1051 "add %%"REG_a
", %%"REG_b
" \n\t"
1055 "movntq %%mm3, (%%"REG_b
", %%"REG_a
", 2)\n\t"
1056 "movntq %%mm1, 8(%%"REG_b
", %%"REG_a
", 2)\n\t"
1058 "movd %%mm3, (%%"REG_b
", %%"REG_a
", 2) \n\t"
1059 "psrlq $32, %%mm3 \n\t"
1060 "movd %%mm3, 4(%%"REG_b
", %%"REG_a
", 2) \n\t"
1061 "movd %%mm1, 8(%%"REG_b
", %%"REG_a
", 2) \n\t"
1063 "add $4, %%"REG_a
" \n\t"
1064 "cmp %5, %%"REG_a
" \n\t"
1067 :: "r" (buf0
), "r" (buf1
), "r" (uvbuf0
), "r" (uvbuf1
), "m" (dest
), "m" (dstW
),
1068 "m" (yalpha1
), "m" (uvalpha1
)
1069 : "%"REG_a
, "%"REG_b
1077 "paddusb "MANGLE(g5Dither
)", %%mm1\n\t"
1078 "paddusb "MANGLE(r5Dither
)", %%mm0\n\t"
1079 "paddusb "MANGLE(b5Dither
)", %%mm3\n\t"
1081 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1082 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1083 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1085 "psrlw $3, %%mm3 \n\t"
1086 "psllw $2, %%mm1 \n\t"
1087 "psllw $7, %%mm0 \n\t"
1088 "pand "MANGLE(g15Mask
)", %%mm1 \n\t"
1089 "pand "MANGLE(r15Mask
)", %%mm0 \n\t"
1091 "por %%mm3, %%mm1 \n\t"
1092 "por %%mm1, %%mm0 \n\t"
1094 MOVNTQ(%%mm0
, (%4, %%REGa
, 2))
1096 "add $4, %%"REG_a
" \n\t"
1097 "cmp %5, %%"REG_a
" \n\t"
1100 :: "r" (buf0
), "r" (buf1
), "r" (uvbuf0
), "r" (uvbuf1
), "r" (dest
), "m" (dstW
),
1101 "m" (yalpha1
), "m" (uvalpha1
)
1110 "paddusb "MANGLE(g6Dither
)", %%mm1\n\t"
1111 "paddusb "MANGLE(r5Dither
)", %%mm0\n\t"
1112 "paddusb "MANGLE(b5Dither
)", %%mm3\n\t"
1114 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1115 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1116 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1118 "psrlw $3, %%mm3 \n\t"
1119 "psllw $3, %%mm1 \n\t"
1120 "psllw $8, %%mm0 \n\t"
1121 "pand "MANGLE(g16Mask
)", %%mm1 \n\t"
1122 "pand "MANGLE(r16Mask
)", %%mm0 \n\t"
1124 "por %%mm3, %%mm1 \n\t"
1125 "por %%mm1, %%mm0 \n\t"
1127 MOVNTQ(%%mm0
, (%4, %%REGa
, 2))
1129 "add $4, %%"REG_a
" \n\t"
1130 "cmp %5, %%"REG_a
" \n\t"
1133 :: "r" (buf0
), "r" (buf1
), "r" (uvbuf0
), "r" (uvbuf1
), "r" (dest
), "m" (dstW
),
1134 "m" (yalpha1
), "m" (uvalpha1
)
1143 if(dstFormat
==IMGFMT_BGR32
)
1146 #ifdef WORDS_BIGENDIAN
1149 for(i
=0;i
<dstW
;i
++){
1150 // vertical linear interpolation && yuv2rgb in a single step:
1151 int Y
=yuvtab_2568
[((buf0
[i
]*yalpha1
+buf1
[i
]*yalpha
)>>19)];
1152 int U
=((uvbuf0
[i
]*uvalpha1
+uvbuf1
[i
]*uvalpha
)>>19);
1153 int V
=((uvbuf0
[i
+2048]*uvalpha1
+uvbuf1
[i
+2048]*uvalpha
)>>19);
1154 dest
[0]=clip_table
[((Y
+ yuvtab_40cf
[U
]) >>13)];
1155 dest
[1]=clip_table
[((Y
+ yuvtab_1a1e
[V
] + yuvtab_0c92
[U
]) >>13)];
1156 dest
[2]=clip_table
[((Y
+ yuvtab_3343
[V
]) >>13)];
1160 else if(dstFormat
==IMGFMT_BGR24
)
1163 for(i
=0;i
<dstW
;i
++){
1164 // vertical linear interpolation && yuv2rgb in a single step:
1165 int Y
=yuvtab_2568
[((buf0
[i
]*yalpha1
+buf1
[i
]*yalpha
)>>19)];
1166 int U
=((uvbuf0
[i
]*uvalpha1
+uvbuf1
[i
]*uvalpha
)>>19);
1167 int V
=((uvbuf0
[i
+2048]*uvalpha1
+uvbuf1
[i
+2048]*uvalpha
)>>19);
1168 dest
[0]=clip_table
[((Y
+ yuvtab_40cf
[U
]) >>13)];
1169 dest
[1]=clip_table
[((Y
+ yuvtab_1a1e
[V
] + yuvtab_0c92
[U
]) >>13)];
1170 dest
[2]=clip_table
[((Y
+ yuvtab_3343
[V
]) >>13)];
1174 else if(dstFormat
==IMGFMT_BGR16
)
1177 for(i
=0;i
<dstW
;i
++){
1178 // vertical linear interpolation && yuv2rgb in a single step:
1179 int Y
=yuvtab_2568
[((buf0
[i
]*yalpha1
+buf1
[i
]*yalpha
)>>19)];
1180 int U
=((uvbuf0
[i
]*uvalpha1
+uvbuf1
[i
]*uvalpha
)>>19);
1181 int V
=((uvbuf0
[i
+2048]*uvalpha1
+uvbuf1
[i
+2048]*uvalpha
)>>19);
1183 ((uint16_t*)dest
)[i
] =
1184 clip_table16b
[(Y
+ yuvtab_40cf
[U
]) >>13] |
1185 clip_table16g
[(Y
+ yuvtab_1a1e
[V
] + yuvtab_0c92
[U
]) >>13] |
1186 clip_table16r
[(Y
+ yuvtab_3343
[V
]) >>13];
1189 else if(dstFormat
==IMGFMT_BGR15
)
1192 for(i
=0;i
<dstW
;i
++){
1193 // vertical linear interpolation && yuv2rgb in a single step:
1194 int Y
=yuvtab_2568
[((buf0
[i
]*yalpha1
+buf1
[i
]*yalpha
)>>19)];
1195 int U
=((uvbuf0
[i
]*uvalpha1
+uvbuf1
[i
]*uvalpha
)>>19);
1196 int V
=((uvbuf0
[i
+2048]*uvalpha1
+uvbuf1
[i
+2048]*uvalpha
)>>19);
1198 ((uint16_t*)dest
)[i
] =
1199 clip_table15b
[(Y
+ yuvtab_40cf
[U
]) >>13] |
1200 clip_table15g
[(Y
+ yuvtab_1a1e
[V
] + yuvtab_0c92
[U
]) >>13] |
1201 clip_table15r
[(Y
+ yuvtab_3343
[V
]) >>13];
1209 switch(c
->dstFormat
)
1211 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1214 "mov %%"REG_SP
", "ESP_OFFSET
"(%5) \n\t"
1215 "mov %4, %%"REG_SP
" \n\t"
1216 YSCALEYUV2RGB(%%REGa
, %5)
1217 WRITEBGR32(%%REGSP
, 8280(%5), %%REGa
)
1218 "mov "ESP_OFFSET
"(%5), %%"REG_SP
" \n\t"
1220 :: "r" (buf0
), "r" (buf1
), "r" (uvbuf0
), "r" (uvbuf1
), "m" (dest
),
1227 "mov %%"REG_SP
", "ESP_OFFSET
"(%5) \n\t"
1228 "mov %4, %%"REG_SP
" \n\t"
1229 YSCALEYUV2RGB(%%REGa
, %5)
1230 WRITEBGR24(%%REGSP
, 8280(%5), %%REGa
)
1231 "mov "ESP_OFFSET
"(%5), %%"REG_SP
" \n\t"
1232 :: "r" (buf0
), "r" (buf1
), "r" (uvbuf0
), "r" (uvbuf1
), "m" (dest
),
1239 "mov %%"REG_SP
", "ESP_OFFSET
"(%5) \n\t"
1240 "mov %4, %%"REG_SP
" \n\t"
1241 YSCALEYUV2RGB(%%REGa
, %5)
1242 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1244 "paddusb "MANGLE(b5Dither
)", %%mm2\n\t"
1245 "paddusb "MANGLE(g5Dither
)", %%mm4\n\t"
1246 "paddusb "MANGLE(r5Dither
)", %%mm5\n\t"
1249 WRITEBGR15(%%REGSP
, 8280(%5), %%REGa
)
1250 "mov "ESP_OFFSET
"(%5), %%"REG_SP
" \n\t"
1252 :: "r" (buf0
), "r" (buf1
), "r" (uvbuf0
), "r" (uvbuf1
), "m" (dest
),
1259 "mov %%"REG_SP
", "ESP_OFFSET
"(%5) \n\t"
1260 "mov %4, %%"REG_SP
" \n\t"
1261 YSCALEYUV2RGB(%%REGa
, %5)
1262 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1264 "paddusb "MANGLE(b5Dither
)", %%mm2\n\t"
1265 "paddusb "MANGLE(g6Dither
)", %%mm4\n\t"
1266 "paddusb "MANGLE(r5Dither
)", %%mm5\n\t"
1269 WRITEBGR16(%%REGSP
, 8280(%5), %%REGa
)
1270 "mov "ESP_OFFSET
"(%5), %%"REG_SP
" \n\t"
1271 :: "r" (buf0
), "r" (buf1
), "r" (uvbuf0
), "r" (uvbuf1
), "m" (dest
),
1278 "mov %%"REG_SP
", "ESP_OFFSET
"(%5) \n\t"
1279 "mov %4, %%"REG_SP
" \n\t"
1280 YSCALEYUV2PACKED(%%REGa
, %5)
1281 WRITEYUY2(%%REGSP
, 8280(%5), %%REGa
)
1282 "mov "ESP_OFFSET
"(%5), %%"REG_SP
" \n\t"
1283 :: "r" (buf0
), "r" (buf1
), "r" (uvbuf0
), "r" (uvbuf1
), "m" (dest
),
1291 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C
, YSCALE_YUV_2_PACKED2_C
)
1295 * YV12 to RGB without scaling or interpolating
1297 static inline void RENAME(yuv2packed1
)(SwsContext
*c
, uint16_t *buf0
, uint16_t *uvbuf0
, uint16_t *uvbuf1
,
1298 uint8_t *dest
, int dstW
, int uvalpha
, int dstFormat
, int flags
, int y
)
1300 const int yalpha1
=0;
1303 uint16_t *buf1
= buf0
; //FIXME needed for the rgb1/bgr1
1304 const int yalpha
= 4096; //FIXME ...
1306 if(flags
&SWS_FULL_CHR_H_INT
)
1308 RENAME(yuv2packed2
)(c
, buf0
, buf0
, uvbuf0
, uvbuf1
, dest
, dstW
, 0, uvalpha
, y
);
1313 if( uvalpha
< 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but its a bit faster
1319 "mov %%"REG_SP
", "ESP_OFFSET
"(%5) \n\t"
1320 "mov %4, %%"REG_SP
" \n\t"
1321 YSCALEYUV2RGB1(%%REGa
, %5)
1322 WRITEBGR32(%%REGSP
, 8280(%5), %%REGa
)
1323 "mov "ESP_OFFSET
"(%5), %%"REG_SP
" \n\t"
1325 :: "r" (buf0
), "r" (buf1
), "r" (uvbuf0
), "r" (uvbuf1
), "m" (dest
),
1332 "mov %%"REG_SP
", "ESP_OFFSET
"(%5) \n\t"
1333 "mov %4, %%"REG_SP
" \n\t"
1334 YSCALEYUV2RGB1(%%REGa
, %5)
1335 WRITEBGR24(%%REGSP
, 8280(%5), %%REGa
)
1336 "mov "ESP_OFFSET
"(%5), %%"REG_SP
" \n\t"
1338 :: "r" (buf0
), "r" (buf1
), "r" (uvbuf0
), "r" (uvbuf1
), "m" (dest
),
1345 "mov %%"REG_SP
", "ESP_OFFSET
"(%5) \n\t"
1346 "mov %4, %%"REG_SP
" \n\t"
1347 YSCALEYUV2RGB1(%%REGa
, %5)
1348 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1350 "paddusb "MANGLE(b5Dither
)", %%mm2\n\t"
1351 "paddusb "MANGLE(g5Dither
)", %%mm4\n\t"
1352 "paddusb "MANGLE(r5Dither
)", %%mm5\n\t"
1354 WRITEBGR15(%%REGSP
, 8280(%5), %%REGa
)
1355 "mov "ESP_OFFSET
"(%5), %%"REG_SP
" \n\t"
1357 :: "r" (buf0
), "r" (buf1
), "r" (uvbuf0
), "r" (uvbuf1
), "m" (dest
),
1364 "mov %%"REG_SP
", "ESP_OFFSET
"(%5) \n\t"
1365 "mov %4, %%"REG_SP
" \n\t"
1366 YSCALEYUV2RGB1(%%REGa
, %5)
1367 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1369 "paddusb "MANGLE(b5Dither
)", %%mm2\n\t"
1370 "paddusb "MANGLE(g6Dither
)", %%mm4\n\t"
1371 "paddusb "MANGLE(r5Dither
)", %%mm5\n\t"
1374 WRITEBGR16(%%REGSP
, 8280(%5), %%REGa
)
1375 "mov "ESP_OFFSET
"(%5), %%"REG_SP
" \n\t"
1377 :: "r" (buf0
), "r" (buf1
), "r" (uvbuf0
), "r" (uvbuf1
), "m" (dest
),
1384 "mov %%"REG_SP
", "ESP_OFFSET
"(%5) \n\t"
1385 "mov %4, %%"REG_SP
" \n\t"
1386 YSCALEYUV2PACKED1(%%REGa
, %5)
1387 WRITEYUY2(%%REGSP
, 8280(%5), %%REGa
)
1388 "mov "ESP_OFFSET
"(%5), %%"REG_SP
" \n\t"
1390 :: "r" (buf0
), "r" (buf1
), "r" (uvbuf0
), "r" (uvbuf1
), "m" (dest
),
1403 "mov %%"REG_SP
", "ESP_OFFSET
"(%5) \n\t"
1404 "mov %4, %%"REG_SP
" \n\t"
1405 YSCALEYUV2RGB1b(%%REGa
, %5)
1406 WRITEBGR32(%%REGSP
, 8280(%5), %%REGa
)
1407 "mov "ESP_OFFSET
"(%5), %%"REG_SP
" \n\t"
1409 :: "r" (buf0
), "r" (buf1
), "r" (uvbuf0
), "r" (uvbuf1
), "m" (dest
),
1416 "mov %%"REG_SP
", "ESP_OFFSET
"(%5) \n\t"
1417 "mov %4, %%"REG_SP
" \n\t"
1418 YSCALEYUV2RGB1b(%%REGa
, %5)
1419 WRITEBGR24(%%REGSP
, 8280(%5), %%REGa
)
1420 "mov "ESP_OFFSET
"(%5), %%"REG_SP
" \n\t"
1422 :: "r" (buf0
), "r" (buf1
), "r" (uvbuf0
), "r" (uvbuf1
), "m" (dest
),
1429 "mov %%"REG_SP
", "ESP_OFFSET
"(%5) \n\t"
1430 "mov %4, %%"REG_SP
" \n\t"
1431 YSCALEYUV2RGB1b(%%REGa
, %5)
1432 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1434 "paddusb "MANGLE(b5Dither
)", %%mm2\n\t"
1435 "paddusb "MANGLE(g5Dither
)", %%mm4\n\t"
1436 "paddusb "MANGLE(r5Dither
)", %%mm5\n\t"
1438 WRITEBGR15(%%REGSP
, 8280(%5), %%REGa
)
1439 "mov "ESP_OFFSET
"(%5), %%"REG_SP
" \n\t"
1441 :: "r" (buf0
), "r" (buf1
), "r" (uvbuf0
), "r" (uvbuf1
), "m" (dest
),
1448 "mov %%"REG_SP
", "ESP_OFFSET
"(%5) \n\t"
1449 "mov %4, %%"REG_SP
" \n\t"
1450 YSCALEYUV2RGB1b(%%REGa
, %5)
1451 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1453 "paddusb "MANGLE(b5Dither
)", %%mm2\n\t"
1454 "paddusb "MANGLE(g6Dither
)", %%mm4\n\t"
1455 "paddusb "MANGLE(r5Dither
)", %%mm5\n\t"
1458 WRITEBGR16(%%REGSP
, 8280(%5), %%REGa
)
1459 "mov "ESP_OFFSET
"(%5), %%"REG_SP
" \n\t"
1461 :: "r" (buf0
), "r" (buf1
), "r" (uvbuf0
), "r" (uvbuf1
), "m" (dest
),
1468 "mov %%"REG_SP
", "ESP_OFFSET
"(%5) \n\t"
1469 "mov %4, %%"REG_SP
" \n\t"
1470 YSCALEYUV2PACKED1b(%%REGa
, %5)
1471 WRITEYUY2(%%REGSP
, 8280(%5), %%REGa
)
1472 "mov "ESP_OFFSET
"(%5), %%"REG_SP
" \n\t"
1474 :: "r" (buf0
), "r" (buf1
), "r" (uvbuf0
), "r" (uvbuf1
), "m" (dest
),
1482 if( uvalpha
< 2048 )
1484 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C
, YSCALE_YUV_2_PACKED1_C
)
1486 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C
, YSCALE_YUV_2_PACKED1B_C
)
1490 //FIXME yuy2* can read upto 7 samples to much
1492 static inline void RENAME(yuy2ToY
)(uint8_t *dst
, uint8_t *src
, long width
)
1496 "movq "MANGLE(bm01010101
)", %%mm2\n\t"
1497 "mov %0, %%"REG_a
" \n\t"
1499 "movq (%1, %%"REG_a
",2), %%mm0 \n\t"
1500 "movq 8(%1, %%"REG_a
",2), %%mm1 \n\t"
1501 "pand %%mm2, %%mm0 \n\t"
1502 "pand %%mm2, %%mm1 \n\t"
1503 "packuswb %%mm1, %%mm0 \n\t"
1504 "movq %%mm0, (%2, %%"REG_a
") \n\t"
1505 "add $8, %%"REG_a
" \n\t"
1507 : : "g" (-width
), "r" (src
+width
*2), "r" (dst
+width
)
1512 for(i
=0; i
<width
; i
++)
1517 static inline void RENAME(yuy2ToUV
)(uint8_t *dstU
, uint8_t *dstV
, uint8_t *src1
, uint8_t *src2
, long width
)
1519 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1521 "movq "MANGLE(bm01010101
)", %%mm4\n\t"
1522 "mov %0, %%"REG_a
" \n\t"
1524 "movq (%1, %%"REG_a
",4), %%mm0 \n\t"
1525 "movq 8(%1, %%"REG_a
",4), %%mm1 \n\t"
1526 "movq (%2, %%"REG_a
",4), %%mm2 \n\t"
1527 "movq 8(%2, %%"REG_a
",4), %%mm3 \n\t"
1530 "psrlw $8, %%mm0 \n\t"
1531 "psrlw $8, %%mm1 \n\t"
1532 "packuswb %%mm1, %%mm0 \n\t"
1533 "movq %%mm0, %%mm1 \n\t"
1534 "psrlw $8, %%mm0 \n\t"
1535 "pand %%mm4, %%mm1 \n\t"
1536 "packuswb %%mm0, %%mm0 \n\t"
1537 "packuswb %%mm1, %%mm1 \n\t"
1538 "movd %%mm0, (%4, %%"REG_a
") \n\t"
1539 "movd %%mm1, (%3, %%"REG_a
") \n\t"
1540 "add $4, %%"REG_a
" \n\t"
1542 : : "g" (-width
), "r" (src1
+width
*4), "r" (src2
+width
*4), "r" (dstU
+width
), "r" (dstV
+width
)
1547 for(i
=0; i
<width
; i
++)
1549 dstU
[i
]= (src1
[4*i
+ 1] + src2
[4*i
+ 1])>>1;
1550 dstV
[i
]= (src1
[4*i
+ 3] + src2
[4*i
+ 3])>>1;
1555 //this is allmost identical to the previous, end exists only cuz yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses
1556 static inline void RENAME(uyvyToY
)(uint8_t *dst
, uint8_t *src
, long width
)
1560 "mov %0, %%"REG_a
" \n\t"
1562 "movq (%1, %%"REG_a
",2), %%mm0 \n\t"
1563 "movq 8(%1, %%"REG_a
",2), %%mm1 \n\t"
1564 "psrlw $8, %%mm0 \n\t"
1565 "psrlw $8, %%mm1 \n\t"
1566 "packuswb %%mm1, %%mm0 \n\t"
1567 "movq %%mm0, (%2, %%"REG_a
") \n\t"
1568 "add $8, %%"REG_a
" \n\t"
1570 : : "g" (-width
), "r" (src
+width
*2), "r" (dst
+width
)
1575 for(i
=0; i
<width
; i
++)
1580 static inline void RENAME(uyvyToUV
)(uint8_t *dstU
, uint8_t *dstV
, uint8_t *src1
, uint8_t *src2
, long width
)
1582 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1584 "movq "MANGLE(bm01010101
)", %%mm4\n\t"
1585 "mov %0, %%"REG_a
" \n\t"
1587 "movq (%1, %%"REG_a
",4), %%mm0 \n\t"
1588 "movq 8(%1, %%"REG_a
",4), %%mm1 \n\t"
1589 "movq (%2, %%"REG_a
",4), %%mm2 \n\t"
1590 "movq 8(%2, %%"REG_a
",4), %%mm3 \n\t"
1593 "pand %%mm4, %%mm0 \n\t"
1594 "pand %%mm4, %%mm1 \n\t"
1595 "packuswb %%mm1, %%mm0 \n\t"
1596 "movq %%mm0, %%mm1 \n\t"
1597 "psrlw $8, %%mm0 \n\t"
1598 "pand %%mm4, %%mm1 \n\t"
1599 "packuswb %%mm0, %%mm0 \n\t"
1600 "packuswb %%mm1, %%mm1 \n\t"
1601 "movd %%mm0, (%4, %%"REG_a
") \n\t"
1602 "movd %%mm1, (%3, %%"REG_a
") \n\t"
1603 "add $4, %%"REG_a
" \n\t"
1605 : : "g" (-width
), "r" (src1
+width
*4), "r" (src2
+width
*4), "r" (dstU
+width
), "r" (dstV
+width
)
1610 for(i
=0; i
<width
; i
++)
1612 dstU
[i
]= (src1
[4*i
+ 0] + src2
[4*i
+ 0])>>1;
1613 dstV
[i
]= (src1
[4*i
+ 2] + src2
[4*i
+ 2])>>1;
1618 static inline void RENAME(bgr32ToY
)(uint8_t *dst
, uint8_t *src
, int width
)
1621 for(i
=0; i
<width
; i
++)
1623 int b
= ((uint32_t*)src
)[i
]&0xFF;
1624 int g
= (((uint32_t*)src
)[i
]>>8)&0xFF;
1625 int r
= (((uint32_t*)src
)[i
]>>16)&0xFF;
1627 dst
[i
]= ((RY
*r
+ GY
*g
+ BY
*b
+ (33<<(RGB2YUV_SHIFT
-1)) )>>RGB2YUV_SHIFT
);
1631 static inline void RENAME(bgr32ToUV
)(uint8_t *dstU
, uint8_t *dstV
, uint8_t *src1
, uint8_t *src2
, int width
)
1634 for(i
=0; i
<width
; i
++)
1636 const int a
= ((uint32_t*)src1
)[2*i
+0];
1637 const int e
= ((uint32_t*)src1
)[2*i
+1];
1638 const int c
= ((uint32_t*)src2
)[2*i
+0];
1639 const int d
= ((uint32_t*)src2
)[2*i
+1];
1640 const int l
= (a
&0xFF00FF) + (e
&0xFF00FF) + (c
&0xFF00FF) + (d
&0xFF00FF);
1641 const int h
= (a
&0x00FF00) + (e
&0x00FF00) + (c
&0x00FF00) + (d
&0x00FF00);
1642 const int b
= l
&0x3FF;
1646 dstU
[i
]= ((RU
*r
+ GU
*g
+ BU
*b
)>>(RGB2YUV_SHIFT
+2)) + 128;
1647 dstV
[i
]= ((RV
*r
+ GV
*g
+ BV
*b
)>>(RGB2YUV_SHIFT
+2)) + 128;
1651 static inline void RENAME(bgr24ToY
)(uint8_t *dst
, uint8_t *src
, long width
)
1655 "mov %2, %%"REG_a
" \n\t"
1656 "movq "MANGLE(bgr2YCoeff
)", %%mm6 \n\t"
1657 "movq "MANGLE(w1111
)", %%mm5 \n\t"
1658 "pxor %%mm7, %%mm7 \n\t"
1659 "lea (%%"REG_a
", %%"REG_a
", 2), %%"REG_b
"\n\t"
1662 PREFETCH
" 64(%0, %%"REG_b
") \n\t"
1663 "movd (%0, %%"REG_b
"), %%mm0 \n\t"
1664 "movd 3(%0, %%"REG_b
"), %%mm1 \n\t"
1665 "punpcklbw %%mm7, %%mm0 \n\t"
1666 "punpcklbw %%mm7, %%mm1 \n\t"
1667 "movd 6(%0, %%"REG_b
"), %%mm2 \n\t"
1668 "movd 9(%0, %%"REG_b
"), %%mm3 \n\t"
1669 "punpcklbw %%mm7, %%mm2 \n\t"
1670 "punpcklbw %%mm7, %%mm3 \n\t"
1671 "pmaddwd %%mm6, %%mm0 \n\t"
1672 "pmaddwd %%mm6, %%mm1 \n\t"
1673 "pmaddwd %%mm6, %%mm2 \n\t"
1674 "pmaddwd %%mm6, %%mm3 \n\t"
1675 #ifndef FAST_BGR2YV12
1676 "psrad $8, %%mm0 \n\t"
1677 "psrad $8, %%mm1 \n\t"
1678 "psrad $8, %%mm2 \n\t"
1679 "psrad $8, %%mm3 \n\t"
1681 "packssdw %%mm1, %%mm0 \n\t"
1682 "packssdw %%mm3, %%mm2 \n\t"
1683 "pmaddwd %%mm5, %%mm0 \n\t"
1684 "pmaddwd %%mm5, %%mm2 \n\t"
1685 "packssdw %%mm2, %%mm0 \n\t"
1686 "psraw $7, %%mm0 \n\t"
1688 "movd 12(%0, %%"REG_b
"), %%mm4 \n\t"
1689 "movd 15(%0, %%"REG_b
"), %%mm1 \n\t"
1690 "punpcklbw %%mm7, %%mm4 \n\t"
1691 "punpcklbw %%mm7, %%mm1 \n\t"
1692 "movd 18(%0, %%"REG_b
"), %%mm2 \n\t"
1693 "movd 21(%0, %%"REG_b
"), %%mm3 \n\t"
1694 "punpcklbw %%mm7, %%mm2 \n\t"
1695 "punpcklbw %%mm7, %%mm3 \n\t"
1696 "pmaddwd %%mm6, %%mm4 \n\t"
1697 "pmaddwd %%mm6, %%mm1 \n\t"
1698 "pmaddwd %%mm6, %%mm2 \n\t"
1699 "pmaddwd %%mm6, %%mm3 \n\t"
1700 #ifndef FAST_BGR2YV12
1701 "psrad $8, %%mm4 \n\t"
1702 "psrad $8, %%mm1 \n\t"
1703 "psrad $8, %%mm2 \n\t"
1704 "psrad $8, %%mm3 \n\t"
1706 "packssdw %%mm1, %%mm4 \n\t"
1707 "packssdw %%mm3, %%mm2 \n\t"
1708 "pmaddwd %%mm5, %%mm4 \n\t"
1709 "pmaddwd %%mm5, %%mm2 \n\t"
1710 "add $24, %%"REG_b
" \n\t"
1711 "packssdw %%mm2, %%mm4 \n\t"
1712 "psraw $7, %%mm4 \n\t"
1714 "packuswb %%mm4, %%mm0 \n\t"
1715 "paddusb "MANGLE(bgr2YOffset
)", %%mm0 \n\t"
1717 "movq %%mm0, (%1, %%"REG_a
") \n\t"
1718 "add $8, %%"REG_a
" \n\t"
1720 : : "r" (src
+width
*3), "r" (dst
+width
), "g" (-width
)
1721 : "%"REG_a
, "%"REG_b
1725 for(i
=0; i
<width
; i
++)
1731 dst
[i
]= ((RY
*r
+ GY
*g
+ BY
*b
+ (33<<(RGB2YUV_SHIFT
-1)) )>>RGB2YUV_SHIFT
);
1736 static inline void RENAME(bgr24ToUV
)(uint8_t *dstU
, uint8_t *dstV
, uint8_t *src1
, uint8_t *src2
, long width
)
1740 "mov %4, %%"REG_a
" \n\t"
1741 "movq "MANGLE(w1111
)", %%mm5 \n\t"
1742 "movq "MANGLE(bgr2UCoeff
)", %%mm6 \n\t"
1743 "pxor %%mm7, %%mm7 \n\t"
1744 "lea (%%"REG_a
", %%"REG_a
", 2), %%"REG_b
" \n\t"
1745 "add %%"REG_b
", %%"REG_b
" \n\t"
1748 PREFETCH
" 64(%0, %%"REG_b
") \n\t"
1749 PREFETCH
" 64(%1, %%"REG_b
") \n\t"
1750 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1751 "movq (%0, %%"REG_b
"), %%mm0 \n\t"
1752 "movq (%1, %%"REG_b
"), %%mm1 \n\t"
1753 "movq 6(%0, %%"REG_b
"), %%mm2 \n\t"
1754 "movq 6(%1, %%"REG_b
"), %%mm3 \n\t"
1757 "movq %%mm0, %%mm1 \n\t"
1758 "movq %%mm2, %%mm3 \n\t"
1759 "psrlq $24, %%mm0 \n\t"
1760 "psrlq $24, %%mm2 \n\t"
1763 "punpcklbw %%mm7, %%mm0 \n\t"
1764 "punpcklbw %%mm7, %%mm2 \n\t"
1766 "movd (%0, %%"REG_b
"), %%mm0 \n\t"
1767 "movd (%1, %%"REG_b
"), %%mm1 \n\t"
1768 "movd 3(%0, %%"REG_b
"), %%mm2 \n\t"
1769 "movd 3(%1, %%"REG_b
"), %%mm3 \n\t"
1770 "punpcklbw %%mm7, %%mm0 \n\t"
1771 "punpcklbw %%mm7, %%mm1 \n\t"
1772 "punpcklbw %%mm7, %%mm2 \n\t"
1773 "punpcklbw %%mm7, %%mm3 \n\t"
1774 "paddw %%mm1, %%mm0 \n\t"
1775 "paddw %%mm3, %%mm2 \n\t"
1776 "paddw %%mm2, %%mm0 \n\t"
1777 "movd 6(%0, %%"REG_b
"), %%mm4 \n\t"
1778 "movd 6(%1, %%"REG_b
"), %%mm1 \n\t"
1779 "movd 9(%0, %%"REG_b
"), %%mm2 \n\t"
1780 "movd 9(%1, %%"REG_b
"), %%mm3 \n\t"
1781 "punpcklbw %%mm7, %%mm4 \n\t"
1782 "punpcklbw %%mm7, %%mm1 \n\t"
1783 "punpcklbw %%mm7, %%mm2 \n\t"
1784 "punpcklbw %%mm7, %%mm3 \n\t"
1785 "paddw %%mm1, %%mm4 \n\t"
1786 "paddw %%mm3, %%mm2 \n\t"
1787 "paddw %%mm4, %%mm2 \n\t"
1788 "psrlw $2, %%mm0 \n\t"
1789 "psrlw $2, %%mm2 \n\t"
1791 "movq "MANGLE(bgr2VCoeff
)", %%mm1 \n\t"
1792 "movq "MANGLE(bgr2VCoeff
)", %%mm3 \n\t"
1794 "pmaddwd %%mm0, %%mm1 \n\t"
1795 "pmaddwd %%mm2, %%mm3 \n\t"
1796 "pmaddwd %%mm6, %%mm0 \n\t"
1797 "pmaddwd %%mm6, %%mm2 \n\t"
1798 #ifndef FAST_BGR2YV12
1799 "psrad $8, %%mm0 \n\t"
1800 "psrad $8, %%mm1 \n\t"
1801 "psrad $8, %%mm2 \n\t"
1802 "psrad $8, %%mm3 \n\t"
1804 "packssdw %%mm2, %%mm0 \n\t"
1805 "packssdw %%mm3, %%mm1 \n\t"
1806 "pmaddwd %%mm5, %%mm0 \n\t"
1807 "pmaddwd %%mm5, %%mm1 \n\t"
1808 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
1809 "psraw $7, %%mm0 \n\t"
1811 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1812 "movq 12(%0, %%"REG_b
"), %%mm4 \n\t"
1813 "movq 12(%1, %%"REG_b
"), %%mm1 \n\t"
1814 "movq 18(%0, %%"REG_b
"), %%mm2 \n\t"
1815 "movq 18(%1, %%"REG_b
"), %%mm3 \n\t"
1818 "movq %%mm4, %%mm1 \n\t"
1819 "movq %%mm2, %%mm3 \n\t"
1820 "psrlq $24, %%mm4 \n\t"
1821 "psrlq $24, %%mm2 \n\t"
1824 "punpcklbw %%mm7, %%mm4 \n\t"
1825 "punpcklbw %%mm7, %%mm2 \n\t"
1827 "movd 12(%0, %%"REG_b
"), %%mm4 \n\t"
1828 "movd 12(%1, %%"REG_b
"), %%mm1 \n\t"
1829 "movd 15(%0, %%"REG_b
"), %%mm2 \n\t"
1830 "movd 15(%1, %%"REG_b
"), %%mm3 \n\t"
1831 "punpcklbw %%mm7, %%mm4 \n\t"
1832 "punpcklbw %%mm7, %%mm1 \n\t"
1833 "punpcklbw %%mm7, %%mm2 \n\t"
1834 "punpcklbw %%mm7, %%mm3 \n\t"
1835 "paddw %%mm1, %%mm4 \n\t"
1836 "paddw %%mm3, %%mm2 \n\t"
1837 "paddw %%mm2, %%mm4 \n\t"
1838 "movd 18(%0, %%"REG_b
"), %%mm5 \n\t"
1839 "movd 18(%1, %%"REG_b
"), %%mm1 \n\t"
1840 "movd 21(%0, %%"REG_b
"), %%mm2 \n\t"
1841 "movd 21(%1, %%"REG_b
"), %%mm3 \n\t"
1842 "punpcklbw %%mm7, %%mm5 \n\t"
1843 "punpcklbw %%mm7, %%mm1 \n\t"
1844 "punpcklbw %%mm7, %%mm2 \n\t"
1845 "punpcklbw %%mm7, %%mm3 \n\t"
1846 "paddw %%mm1, %%mm5 \n\t"
1847 "paddw %%mm3, %%mm2 \n\t"
1848 "paddw %%mm5, %%mm2 \n\t"
1849 "movq "MANGLE(w1111
)", %%mm5 \n\t"
1850 "psrlw $2, %%mm4 \n\t"
1851 "psrlw $2, %%mm2 \n\t"
1853 "movq "MANGLE(bgr2VCoeff
)", %%mm1 \n\t"
1854 "movq "MANGLE(bgr2VCoeff
)", %%mm3 \n\t"
1856 "pmaddwd %%mm4, %%mm1 \n\t"
1857 "pmaddwd %%mm2, %%mm3 \n\t"
1858 "pmaddwd %%mm6, %%mm4 \n\t"
1859 "pmaddwd %%mm6, %%mm2 \n\t"
1860 #ifndef FAST_BGR2YV12
1861 "psrad $8, %%mm4 \n\t"
1862 "psrad $8, %%mm1 \n\t"
1863 "psrad $8, %%mm2 \n\t"
1864 "psrad $8, %%mm3 \n\t"
1866 "packssdw %%mm2, %%mm4 \n\t"
1867 "packssdw %%mm3, %%mm1 \n\t"
1868 "pmaddwd %%mm5, %%mm4 \n\t"
1869 "pmaddwd %%mm5, %%mm1 \n\t"
1870 "add $24, %%"REG_b
" \n\t"
1871 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
1872 "psraw $7, %%mm4 \n\t"
1874 "movq %%mm0, %%mm1 \n\t"
1875 "punpckldq %%mm4, %%mm0 \n\t"
1876 "punpckhdq %%mm4, %%mm1 \n\t"
1877 "packsswb %%mm1, %%mm0 \n\t"
1878 "paddb "MANGLE(bgr2UVOffset
)", %%mm0 \n\t"
1880 "movd %%mm0, (%2, %%"REG_a
") \n\t"
1881 "punpckhdq %%mm0, %%mm0 \n\t"
1882 "movd %%mm0, (%3, %%"REG_a
") \n\t"
1883 "add $4, %%"REG_a
" \n\t"
1885 : : "r" (src1
+width
*6), "r" (src2
+width
*6), "r" (dstU
+width
), "r" (dstV
+width
), "g" (-width
)
1886 : "%"REG_a
, "%"REG_b
1890 for(i
=0; i
<width
; i
++)
1892 int b
= src1
[6*i
+ 0] + src1
[6*i
+ 3] + src2
[6*i
+ 0] + src2
[6*i
+ 3];
1893 int g
= src1
[6*i
+ 1] + src1
[6*i
+ 4] + src2
[6*i
+ 1] + src2
[6*i
+ 4];
1894 int r
= src1
[6*i
+ 2] + src1
[6*i
+ 5] + src2
[6*i
+ 2] + src2
[6*i
+ 5];
1896 dstU
[i
]= ((RU
*r
+ GU
*g
+ BU
*b
)>>(RGB2YUV_SHIFT
+2)) + 128;
1897 dstV
[i
]= ((RV
*r
+ GV
*g
+ BV
*b
)>>(RGB2YUV_SHIFT
+2)) + 128;
1902 static inline void RENAME(bgr16ToY
)(uint8_t *dst
, uint8_t *src
, int width
)
1905 for(i
=0; i
<width
; i
++)
1907 int d
= ((uint16_t*)src
)[i
];
1910 int r
= (d
>>11)&0x1F;
1912 dst
[i
]= ((2*RY
*r
+ GY
*g
+ 2*BY
*b
)>>(RGB2YUV_SHIFT
-2)) + 16;
1916 static inline void RENAME(bgr16ToUV
)(uint8_t *dstU
, uint8_t *dstV
, uint8_t *src1
, uint8_t *src2
, int width
)
1919 for(i
=0; i
<width
; i
++)
1921 int d0
= ((uint32_t*)src1
)[i
];
1922 int d1
= ((uint32_t*)src2
)[i
];
1924 int dl
= (d0
&0x07E0F81F) + (d1
&0x07E0F81F);
1925 int dh
= ((d0
>>5)&0x07C0F83F) + ((d1
>>5)&0x07C0F83F);
1927 int dh2
= (dh
>>11) + (dh
<<21);
1931 int r
= (d
>>11)&0x7F;
1933 dstU
[i
]= ((2*RU
*r
+ GU
*g
+ 2*BU
*b
)>>(RGB2YUV_SHIFT
+2-2)) + 128;
1934 dstV
[i
]= ((2*RV
*r
+ GV
*g
+ 2*BV
*b
)>>(RGB2YUV_SHIFT
+2-2)) + 128;
1938 static inline void RENAME(bgr15ToY
)(uint8_t *dst
, uint8_t *src
, int width
)
1941 for(i
=0; i
<width
; i
++)
1943 int d
= ((uint16_t*)src
)[i
];
1946 int r
= (d
>>10)&0x1F;
1948 dst
[i
]= ((RY
*r
+ GY
*g
+ BY
*b
)>>(RGB2YUV_SHIFT
-3)) + 16;
1952 static inline void RENAME(bgr15ToUV
)(uint8_t *dstU
, uint8_t *dstV
, uint8_t *src1
, uint8_t *src2
, int width
)
1955 for(i
=0; i
<width
; i
++)
1957 int d0
= ((uint32_t*)src1
)[i
];
1958 int d1
= ((uint32_t*)src2
)[i
];
1960 int dl
= (d0
&0x03E07C1F) + (d1
&0x03E07C1F);
1961 int dh
= ((d0
>>5)&0x03E0F81F) + ((d1
>>5)&0x03E0F81F);
1963 int dh2
= (dh
>>11) + (dh
<<21);
1967 int r
= (d
>>10)&0x7F;
1969 dstU
[i
]= ((RU
*r
+ GU
*g
+ BU
*b
)>>(RGB2YUV_SHIFT
+2-3)) + 128;
1970 dstV
[i
]= ((RV
*r
+ GV
*g
+ BV
*b
)>>(RGB2YUV_SHIFT
+2-3)) + 128;
1975 static inline void RENAME(rgb32ToY
)(uint8_t *dst
, uint8_t *src
, int width
)
1978 for(i
=0; i
<width
; i
++)
1980 int r
= ((uint32_t*)src
)[i
]&0xFF;
1981 int g
= (((uint32_t*)src
)[i
]>>8)&0xFF;
1982 int b
= (((uint32_t*)src
)[i
]>>16)&0xFF;
1984 dst
[i
]= ((RY
*r
+ GY
*g
+ BY
*b
+ (33<<(RGB2YUV_SHIFT
-1)) )>>RGB2YUV_SHIFT
);
1988 static inline void RENAME(rgb32ToUV
)(uint8_t *dstU
, uint8_t *dstV
, uint8_t *src1
, uint8_t *src2
, int width
)
1991 for(i
=0; i
<width
; i
++)
1993 const int a
= ((uint32_t*)src1
)[2*i
+0];
1994 const int e
= ((uint32_t*)src1
)[2*i
+1];
1995 const int c
= ((uint32_t*)src2
)[2*i
+0];
1996 const int d
= ((uint32_t*)src2
)[2*i
+1];
1997 const int l
= (a
&0xFF00FF) + (e
&0xFF00FF) + (c
&0xFF00FF) + (d
&0xFF00FF);
1998 const int h
= (a
&0x00FF00) + (e
&0x00FF00) + (c
&0x00FF00) + (d
&0x00FF00);
1999 const int r
= l
&0x3FF;
2003 dstU
[i
]= ((RU
*r
+ GU
*g
+ BU
*b
)>>(RGB2YUV_SHIFT
+2)) + 128;
2004 dstV
[i
]= ((RV
*r
+ GV
*g
+ BV
*b
)>>(RGB2YUV_SHIFT
+2)) + 128;
2008 static inline void RENAME(rgb24ToY
)(uint8_t *dst
, uint8_t *src
, int width
)
2011 for(i
=0; i
<width
; i
++)
2017 dst
[i
]= ((RY
*r
+ GY
*g
+ BY
*b
+ (33<<(RGB2YUV_SHIFT
-1)) )>>RGB2YUV_SHIFT
);
2021 static inline void RENAME(rgb24ToUV
)(uint8_t *dstU
, uint8_t *dstV
, uint8_t *src1
, uint8_t *src2
, int width
)
2024 for(i
=0; i
<width
; i
++)
2026 int r
= src1
[6*i
+ 0] + src1
[6*i
+ 3] + src2
[6*i
+ 0] + src2
[6*i
+ 3];
2027 int g
= src1
[6*i
+ 1] + src1
[6*i
+ 4] + src2
[6*i
+ 1] + src2
[6*i
+ 4];
2028 int b
= src1
[6*i
+ 2] + src1
[6*i
+ 5] + src2
[6*i
+ 2] + src2
[6*i
+ 5];
2030 dstU
[i
]= ((RU
*r
+ GU
*g
+ BU
*b
)>>(RGB2YUV_SHIFT
+2)) + 128;
2031 dstV
[i
]= ((RV
*r
+ GV
*g
+ BV
*b
)>>(RGB2YUV_SHIFT
+2)) + 128;
2036 // Bilinear / Bicubic scaling
2037 static inline void RENAME(hScale
)(int16_t *dst
, int dstW
, uint8_t *src
, int srcW
, int xInc
,
2038 int16_t *filter
, int16_t *filterPos
, long filterSize
)
2041 assert(filterSize
% 4 == 0 && filterSize
>0);
2042 if(filterSize
==4) // allways true for upscaling, sometimes for down too
2044 long counter
= -2*dstW
;
2046 filterPos
-= counter
/2;
2049 "pxor %%mm7, %%mm7 \n\t"
2050 "movq "MANGLE(w02
)", %%mm6 \n\t"
2051 "push %%"REG_BP
" \n\t" // we use 7 regs here ...
2052 "mov %%"REG_a
", %%"REG_BP
" \n\t"
2055 "movzwl (%2, %%"REG_BP
"), %%eax \n\t"
2056 "movzwl 2(%2, %%"REG_BP
"), %%ebx\n\t"
2057 "movq (%1, %%"REG_BP
", 4), %%mm1\n\t"
2058 "movq 8(%1, %%"REG_BP
", 4), %%mm3\n\t"
2059 "movd (%3, %%"REG_a
"), %%mm0 \n\t"
2060 "movd (%3, %%"REG_b
"), %%mm2 \n\t"
2061 "punpcklbw %%mm7, %%mm0 \n\t"
2062 "punpcklbw %%mm7, %%mm2 \n\t"
2063 "pmaddwd %%mm1, %%mm0 \n\t"
2064 "pmaddwd %%mm2, %%mm3 \n\t"
2065 "psrad $8, %%mm0 \n\t"
2066 "psrad $8, %%mm3 \n\t"
2067 "packssdw %%mm3, %%mm0 \n\t"
2068 "pmaddwd %%mm6, %%mm0 \n\t"
2069 "packssdw %%mm0, %%mm0 \n\t"
2070 "movd %%mm0, (%4, %%"REG_BP
") \n\t"
2071 "add $4, %%"REG_BP
" \n\t"
2074 "pop %%"REG_BP
" \n\t"
2076 : "c" (filter
), "d" (filterPos
), "S" (src
), "D" (dst
)
2080 else if(filterSize
==8)
2082 long counter
= -2*dstW
;
2084 filterPos
-= counter
/2;
2087 "pxor %%mm7, %%mm7 \n\t"
2088 "movq "MANGLE(w02
)", %%mm6 \n\t"
2089 "push %%"REG_BP
" \n\t" // we use 7 regs here ...
2090 "mov %%"REG_a
", %%"REG_BP
" \n\t"
2093 "movzwl (%2, %%"REG_BP
"), %%eax \n\t"
2094 "movzwl 2(%2, %%"REG_BP
"), %%ebx\n\t"
2095 "movq (%1, %%"REG_BP
", 8), %%mm1\n\t"
2096 "movq 16(%1, %%"REG_BP
", 8), %%mm3\n\t"
2097 "movd (%3, %%"REG_a
"), %%mm0 \n\t"
2098 "movd (%3, %%"REG_b
"), %%mm2 \n\t"
2099 "punpcklbw %%mm7, %%mm0 \n\t"
2100 "punpcklbw %%mm7, %%mm2 \n\t"
2101 "pmaddwd %%mm1, %%mm0 \n\t"
2102 "pmaddwd %%mm2, %%mm3 \n\t"
2104 "movq 8(%1, %%"REG_BP
", 8), %%mm1\n\t"
2105 "movq 24(%1, %%"REG_BP
", 8), %%mm5\n\t"
2106 "movd 4(%3, %%"REG_a
"), %%mm4 \n\t"
2107 "movd 4(%3, %%"REG_b
"), %%mm2 \n\t"
2108 "punpcklbw %%mm7, %%mm4 \n\t"
2109 "punpcklbw %%mm7, %%mm2 \n\t"
2110 "pmaddwd %%mm1, %%mm4 \n\t"
2111 "pmaddwd %%mm2, %%mm5 \n\t"
2112 "paddd %%mm4, %%mm0 \n\t"
2113 "paddd %%mm5, %%mm3 \n\t"
2115 "psrad $8, %%mm0 \n\t"
2116 "psrad $8, %%mm3 \n\t"
2117 "packssdw %%mm3, %%mm0 \n\t"
2118 "pmaddwd %%mm6, %%mm0 \n\t"
2119 "packssdw %%mm0, %%mm0 \n\t"
2120 "movd %%mm0, (%4, %%"REG_BP
") \n\t"
2121 "add $4, %%"REG_BP
" \n\t"
2124 "pop %%"REG_BP
" \n\t"
2126 : "c" (filter
), "d" (filterPos
), "S" (src
), "D" (dst
)
2132 uint8_t *offset
= src
+filterSize
;
2133 long counter
= -2*dstW
;
2134 // filter-= counter*filterSize/2;
2135 filterPos
-= counter
/2;
2138 "pxor %%mm7, %%mm7 \n\t"
2139 "movq "MANGLE(w02
)", %%mm6 \n\t"
2142 "mov %2, %%"REG_c
" \n\t"
2143 "movzwl (%%"REG_c
", %0), %%eax \n\t"
2144 "movzwl 2(%%"REG_c
", %0), %%ebx \n\t"
2145 "mov %5, %%"REG_c
" \n\t"
2146 "pxor %%mm4, %%mm4 \n\t"
2147 "pxor %%mm5, %%mm5 \n\t"
2149 "movq (%1), %%mm1 \n\t"
2150 "movq (%1, %6), %%mm3 \n\t"
2151 "movd (%%"REG_c
", %%"REG_a
"), %%mm0\n\t"
2152 "movd (%%"REG_c
", %%"REG_b
"), %%mm2\n\t"
2153 "punpcklbw %%mm7, %%mm0 \n\t"
2154 "punpcklbw %%mm7, %%mm2 \n\t"
2155 "pmaddwd %%mm1, %%mm0 \n\t"
2156 "pmaddwd %%mm2, %%mm3 \n\t"
2157 "paddd %%mm3, %%mm5 \n\t"
2158 "paddd %%mm0, %%mm4 \n\t"
2160 "add $4, %%"REG_c
" \n\t"
2161 "cmp %4, %%"REG_c
" \n\t"
2164 "psrad $8, %%mm4 \n\t"
2165 "psrad $8, %%mm5 \n\t"
2166 "packssdw %%mm5, %%mm4 \n\t"
2167 "pmaddwd %%mm6, %%mm4 \n\t"
2168 "packssdw %%mm4, %%mm4 \n\t"
2169 "mov %3, %%"REG_a
" \n\t"
2170 "movd %%mm4, (%%"REG_a
", %0) \n\t"
2174 : "+r" (counter
), "+r" (filter
)
2175 : "m" (filterPos
), "m" (dst
), "m"(offset
),
2176 "m" (src
), "r" (filterSize
*2)
2177 : "%"REG_b
, "%"REG_a
, "%"REG_c
2182 hScale_altivec_real(dst
, dstW
, src
, srcW
, xInc
, filter
, filterPos
, filterSize
);
2185 for(i
=0; i
<dstW
; i
++)
2188 int srcPos
= filterPos
[i
];
2190 // printf("filterPos: %d\n", filterPos[i]);
2191 for(j
=0; j
<filterSize
; j
++)
2193 // printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2194 val
+= ((int)src
[srcPos
+ j
])*filter
[filterSize
*i
+ j
];
2196 // filter += hFilterSize;
2197 dst
[i
] = MIN(MAX(0, val
>>7), (1<<15)-1); // the cubic equation does overflow ...
2203 // *** horizontal scale Y line to temp buffer
/*
 * hyscale: horizontally scale one luma (Y) line.
 *
 * Reads srcW 8-bit input pixels from src and writes dstWidth 16-bit
 * intermediate samples to dst, stepping with the 16.16 fixed-point
 * increment xInc.  Three paths are selected below: the generic FIR
 * scaler (RENAME(hScale) with hLumFilter/hLumFilterPos/hLumFilterSize),
 * an MMX2 fast-bilinear path driven by runtime-generated code
 * (funnyYCode / mmx2Filter / mmx2FilterPos), and a plain x86-asm / C
 * fast-bilinear fallback.
 *
 * NOTE(review): this region of the file has been mangled by an
 * extraction step -- statements are split across physical lines and the
 * original source line numbers are fused into the text, and several
 * interior lines (braces, #ifdef/#else/#endif) are missing.  The
 * comments added below describe the structure as visible here; restore
 * the block from a pristine copy before compiling.
 */
2204 static inline void RENAME(hyscale
)(uint16_t *dst
, int dstWidth
, uint8_t *src
, int srcW
, int xInc
,
2205 int flags
, int canMMX2BeUsed
, int16_t *hLumFilter
,
2206 int16_t *hLumFilterPos
, int hLumFilterSize
, void *funnyYCode
,
2207 int srcFormat
, uint8_t *formatConvBuffer
, int16_t *mmx2Filter
,
2208 int32_t *mmx2FilterPos
)
// --- input conversion: packed-YUV / RGB / BGR sources are first turned
// into a plain 8-bit Y line in formatConvBuffer, and src is repointed ---
2210 if(srcFormat
==IMGFMT_YUY2
)
2212 RENAME(yuy2ToY
)(formatConvBuffer
, src
, srcW
);
2213 src
= formatConvBuffer
;
2215 else if(srcFormat
==IMGFMT_UYVY
)
2217 RENAME(uyvyToY
)(formatConvBuffer
, src
, srcW
);
2218 src
= formatConvBuffer
;
2220 else if(srcFormat
==IMGFMT_BGR32
)
2222 RENAME(bgr32ToY
)(formatConvBuffer
, src
, srcW
);
2223 src
= formatConvBuffer
;
2225 else if(srcFormat
==IMGFMT_BGR24
)
2227 RENAME(bgr24ToY
)(formatConvBuffer
, src
, srcW
);
2228 src
= formatConvBuffer
;
2230 else if(srcFormat
==IMGFMT_BGR16
)
2232 RENAME(bgr16ToY
)(formatConvBuffer
, src
, srcW
);
2233 src
= formatConvBuffer
;
2235 else if(srcFormat
==IMGFMT_BGR15
)
2237 RENAME(bgr15ToY
)(formatConvBuffer
, src
, srcW
);
2238 src
= formatConvBuffer
;
2240 else if(srcFormat
==IMGFMT_RGB32
)
2242 RENAME(rgb32ToY
)(formatConvBuffer
, src
, srcW
);
2243 src
= formatConvBuffer
;
2245 else if(srcFormat
==IMGFMT_RGB24
)
2247 RENAME(rgb24ToY
)(formatConvBuffer
, src
, srcW
);
2248 src
= formatConvBuffer
;
2252 // use the new MMX scaler if the mmx2 can't be used (its faster than the x86asm one)
2253 if(!(flags
&SWS_FAST_BILINEAR
) || (!canMMX2BeUsed
))
// no FAST_BILINEAR requested -> exact FIR filtering via the generic scaler
2255 if(!(flags
&SWS_FAST_BILINEAR
))
2258 RENAME(hScale
)(dst
, dstWidth
, src
, srcW
, xInc
, hLumFilter
, hLumFilterPos
, hLumFilterSize
);
2260 else // Fast Bilinear upscale / crap downscale
// --- MMX2 fast-bilinear path: dispatch into runtime-generated scaler
// code ("funny code"); only built on x86/x86_64 ---
2262 #if defined(ARCH_X86) || defined(ARCH_X86_64)
2268 "pxor %%mm7, %%mm7 \n\t"
2269 "mov %0, %%"REG_c
" \n\t"
2270 "mov %1, %%"REG_D
" \n\t"
2271 "mov %2, %%"REG_d
" \n\t"
2272 "mov %3, %%"REG_b
" \n\t"
2273 "xor %%"REG_a
", %%"REG_a
" \n\t" // i
2274 PREFETCH
" (%%"REG_c
") \n\t"
2275 PREFETCH
" 32(%%"REG_c
") \n\t"
2276 PREFETCH
" 64(%%"REG_c
") \n\t"
// FUNNY_Y_CODE chunks chain into the runtime-generated scaler; two
// variants appear below (presumably selected by arch #ifdefs that are
// not visible in this extraction -- TODO confirm against upstream)
2280 #define FUNNY_Y_CODE \
2281 "movl (%%"REG_b"), %%esi \n\t"\
2283 "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
2284 "add %%"REG_S", %%"REG_c" \n\t"\
2285 "add %%"REG_a", %%"REG_D" \n\t"\
2286 "xor %%"REG_a", %%"REG_a" \n\t"\
2290 #define FUNNY_Y_CODE \
2291 "movl (%%"REG_b"), %%esi \n\t"\
2293 "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
2294 "add %%"REG_a", %%"REG_D" \n\t"\
2295 "xor %%"REG_a", %%"REG_a" \n\t"\
// inputs to the dispatcher asm: src, dst, mmx2Filter, mmx2FilterPos;
// the generated code clobbers a/b/c/d/S/D (see clobber list below)
2308 :: "m" (src
), "m" (dst
), "m" (mmx2Filter
), "m" (mmx2FilterPos
),
2310 : "%"REG_a
, "%"REG_b
, "%"REG_c
, "%"REG_d
, "%"REG_S
, "%"REG_D
// pad the tail: where i*xInc would read past srcW-1, replicate the last
// source pixel (<<7 scaling expressed as *128)
2312 for(i
=dstWidth
-1; (i
*xInc
)>>16 >=srcW
-1; i
--) dst
[i
] = src
[srcW
-1]*128;
// --- non-MMX2 x86 path: split the 16.16 step into integer and
// fractional parts for the add/adc walk below ---
2317 int xInc_shr16
= xInc
>> 16;
2318 int xInc_mask
= xInc
& 0xffff;
2319 //NO MMX just normal asm ...
// hand-written bilinear loop, two output pixels per iteration:
// REG_b = integer source index (xx), ecx = fractional position
// (2*xalpha), esi = interpolated result before the >>9 scale
2321 "xor %%"REG_a
", %%"REG_a
" \n\t" // i
2322 "xor %%"REG_b
", %%"REG_b
" \n\t" // xx
2323 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2326 "movzbl (%0, %%"REG_b
"), %%edi \n\t" //src[xx]
2327 "movzbl 1(%0, %%"REG_b
"), %%esi \n\t" //src[xx+1]
2328 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2329 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2330 "shll $16, %%edi \n\t"
2331 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2332 "mov %1, %%"REG_D
" \n\t"
2333 "shrl $9, %%esi \n\t"
2334 "movw %%si, (%%"REG_D
", %%"REG_a
", 2)\n\t"
2335 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2336 "adc %3, %%"REG_b
" \n\t" //xx+= xInc>>8 + carry
2338 "movzbl (%0, %%"REG_b
"), %%edi \n\t" //src[xx]
2339 "movzbl 1(%0, %%"REG_b
"), %%esi \n\t" //src[xx+1]
2340 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2341 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2342 "shll $16, %%edi \n\t"
2343 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2344 "mov %1, %%"REG_D
" \n\t"
2345 "shrl $9, %%esi \n\t"
2346 "movw %%si, 2(%%"REG_D
", %%"REG_a
", 2)\n\t"
2347 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2348 "adc %3, %%"REG_b
" \n\t" //xx+= xInc>>8 + carry
2351 "add $2, %%"REG_a
" \n\t"
2352 "cmp %2, %%"REG_a
" \n\t"
// operands: %0=src, %1=dst, %2=dstWidth, %3=xInc>>16, %4=xInc&0xffff
2356 :: "r" (src
), "m" (dst
), "m" (dstWidth
), "m" (xInc_shr16
), "m" (xInc_mask
)
2357 : "%"REG_a
, "%"REG_b
, "%ecx", "%"REG_D
, "%esi"
2360 } //if MMX2 can't be used
// --- portable C fallback: 16.16 fixed-point bilinear interpolation ---
2364 unsigned int xpos
=0;
2365 for(i
=0;i
<dstWidth
;i
++)
2367 register unsigned int xx
=xpos
>>16;
2368 register unsigned int xalpha
=(xpos
&0xFFFF)>>9;
2369 dst
[i
]= (src
[xx
]<<7) + (src
[xx
+1] - src
[xx
])*xalpha
;
/*
 * hcscale: horizontally scale one pair of chroma (U/V) lines.
 *
 * Reads srcW input pixels from src1 (U) and src2 (V) and writes dstWidth
 * 16-bit samples each: U to dst, V to dst+2048 samples.  Like hyscale,
 * three paths exist: the generic FIR scaler (run once per plane), the
 * MMX2 runtime-generated fast-bilinear path (funnyUVCode), and a plain
 * x86-asm / C fast-bilinear fallback.  Packed/RGB inputs are first
 * converted into formatConvBuffer (U) and formatConvBuffer+2048 (V) by
 * the per-format *ToUV helpers.
 *
 * NOTE(review): this region has been mangled by an extraction step --
 * statements are split across physical lines, original line numbers are
 * fused into the text, and interior lines (braces, #ifdefs) are missing.
 * Restore the block from a pristine copy before compiling.
 */
2376 inline static void RENAME(hcscale
)(uint16_t *dst
, long dstWidth
, uint8_t *src1
, uint8_t *src2
,
2377 int srcW
, int xInc
, int flags
, int canMMX2BeUsed
, int16_t *hChrFilter
,
2378 int16_t *hChrFilterPos
, int hChrFilterSize
, void *funnyUVCode
,
2379 int srcFormat
, uint8_t *formatConvBuffer
, int16_t *mmx2Filter
,
2380 int32_t *mmx2FilterPos
)
// --- input conversion: produce plain U and V lines in formatConvBuffer
// and formatConvBuffer+2048, then repoint src1/src2 ---
2382 if(srcFormat
==IMGFMT_YUY2
)
2384 RENAME(yuy2ToUV
)(formatConvBuffer
, formatConvBuffer
+2048, src1
, src2
, srcW
);
2385 src1
= formatConvBuffer
;
2386 src2
= formatConvBuffer
+2048;
2388 else if(srcFormat
==IMGFMT_UYVY
)
2390 RENAME(uyvyToUV
)(formatConvBuffer
, formatConvBuffer
+2048, src1
, src2
, srcW
);
2391 src1
= formatConvBuffer
;
2392 src2
= formatConvBuffer
+2048;
2394 else if(srcFormat
==IMGFMT_BGR32
)
2396 RENAME(bgr32ToUV
)(formatConvBuffer
, formatConvBuffer
+2048, src1
, src2
, srcW
);
2397 src1
= formatConvBuffer
;
2398 src2
= formatConvBuffer
+2048;
2400 else if(srcFormat
==IMGFMT_BGR24
)
2402 RENAME(bgr24ToUV
)(formatConvBuffer
, formatConvBuffer
+2048, src1
, src2
, srcW
);
2403 src1
= formatConvBuffer
;
2404 src2
= formatConvBuffer
+2048;
2406 else if(srcFormat
==IMGFMT_BGR16
)
2408 RENAME(bgr16ToUV
)(formatConvBuffer
, formatConvBuffer
+2048, src1
, src2
, srcW
);
2409 src1
= formatConvBuffer
;
2410 src2
= formatConvBuffer
+2048;
2412 else if(srcFormat
==IMGFMT_BGR15
)
2414 RENAME(bgr15ToUV
)(formatConvBuffer
, formatConvBuffer
+2048, src1
, src2
, srcW
);
2415 src1
= formatConvBuffer
;
2416 src2
= formatConvBuffer
+2048;
2418 else if(srcFormat
==IMGFMT_RGB32
)
2420 RENAME(rgb32ToUV
)(formatConvBuffer
, formatConvBuffer
+2048, src1
, src2
, srcW
);
2421 src1
= formatConvBuffer
;
2422 src2
= formatConvBuffer
+2048;
2424 else if(srcFormat
==IMGFMT_RGB24
)
2426 RENAME(rgb24ToUV
)(formatConvBuffer
, formatConvBuffer
+2048, src1
, src2
, srcW
);
2427 src1
= formatConvBuffer
;
2428 src2
= formatConvBuffer
+2048;
// gray input: no chroma to produce (branch body not visible in this
// extraction -- TODO confirm against upstream)
2430 else if(isGray(srcFormat
))
2436 // use the new MMX scaler if the mmx2 can't be used (its faster than the x86asm one)
2437 if(!(flags
&SWS_FAST_BILINEAR
) || (!canMMX2BeUsed
))
2439 if(!(flags
&SWS_FAST_BILINEAR
))
// generic FIR scaler, run once for U (dst) and once for V (dst+2048)
2442 RENAME(hScale
)(dst
, dstWidth
, src1
, srcW
, xInc
, hChrFilter
, hChrFilterPos
, hChrFilterSize
);
2443 RENAME(hScale
)(dst
+2048, dstWidth
, src2
, srcW
, xInc
, hChrFilter
, hChrFilterPos
, hChrFilterSize
);
2445 else // Fast Bilinear upscale / crap downscale
// --- MMX2 fast-bilinear path via runtime-generated code (x86/x86_64) ---
2447 #if defined(ARCH_X86) || defined(ARCH_X86_64)
2453 "pxor %%mm7, %%mm7 \n\t"
2454 "mov %0, %%"REG_c
" \n\t"
2455 "mov %1, %%"REG_D
" \n\t"
2456 "mov %2, %%"REG_d
" \n\t"
2457 "mov %3, %%"REG_b
" \n\t"
2458 "xor %%"REG_a
", %%"REG_a
" \n\t" // i
2459 PREFETCH
" (%%"REG_c
") \n\t"
2460 PREFETCH
" 32(%%"REG_c
") \n\t"
2461 PREFETCH
" 64(%%"REG_c
") \n\t"
// FUNNY_UV_CODE chunks chain into the runtime-generated scaler; two
// variants appear below (presumably selected by arch #ifdefs not
// visible in this extraction -- TODO confirm against upstream)
2465 #define FUNNY_UV_CODE \
2466 "movl (%%"REG_b"), %%esi \n\t"\
2468 "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
2469 "add %%"REG_S", %%"REG_c" \n\t"\
2470 "add %%"REG_a", %%"REG_D" \n\t"\
2471 "xor %%"REG_a", %%"REG_a" \n\t"\
2475 #define FUNNY_UV_CODE \
2476 "movl (%%"REG_b"), %%esi \n\t"\
2478 "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
2479 "add %%"REG_a", %%"REG_D" \n\t"\
2480 "xor %%"REG_a", %%"REG_a" \n\t"\
// second pass for the V plane: source is src2 (%5), output starts at
// dst + 4096 bytes = dst + 2048 uint16_t samples
2488 "xor %%"REG_a
", %%"REG_a
" \n\t" // i
2489 "mov %5, %%"REG_c
" \n\t" // src
2490 "mov %1, %%"REG_D
" \n\t" // buf1
2491 "add $4096, %%"REG_D
" \n\t"
2492 PREFETCH
" (%%"REG_c
") \n\t"
2493 PREFETCH
" 32(%%"REG_c
") \n\t"
2494 PREFETCH
" 64(%%"REG_c
") \n\t"
// operands: %0=src1, %1=dst, %2=mmx2Filter, %3=mmx2FilterPos,
// %4=funnyUVCode, %5=src2
2501 :: "m" (src1
), "m" (dst
), "m" (mmx2Filter
), "m" (mmx2FilterPos
),
2502 "m" (funnyUVCode
), "m" (src2
)
2503 : "%"REG_a
, "%"REG_b
, "%"REG_c
, "%"REG_d
, "%"REG_S
, "%"REG_D
// pad the tail of both planes: replicate the last source pixel (*128)
// where i*xInc would read past srcW-1
2505 for(i
=dstWidth
-1; (i
*xInc
)>>16 >=srcW
-1; i
--)
2507 // printf("%d %d %d\n", dstWidth, i, srcW);
2508 dst
[i
] = src1
[srcW
-1]*128;
2509 dst
[i
+2048] = src2
[srcW
-1]*128;
// --- non-MMX2 x86 path: split the 16.16 step into integer and
// fractional parts for the add/adc walk below ---
2515 long xInc_shr16
= (long) (xInc
>> 16);
2516 int xInc_mask
= xInc
& 0xffff;
// hand-written bilinear loop: one U and one V output pixel per
// iteration; REG_b = integer source index, ecx = 2*xalpha
2518 "xor %%"REG_a
", %%"REG_a
" \n\t" // i
2519 "xor %%"REG_b
", %%"REG_b
" \n\t" // xx
2520 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
// U plane: src1 loaded via %0 into REG_S, store to (dst + i*2)
2523 "mov %0, %%"REG_S
" \n\t"
2524 "movzbl (%%"REG_S
", %%"REG_b
"), %%edi \n\t" //src[xx]
2525 "movzbl 1(%%"REG_S
", %%"REG_b
"), %%esi \n\t" //src[xx+1]
2526 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2527 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2528 "shll $16, %%edi \n\t"
2529 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2530 "mov %1, %%"REG_D
" \n\t"
2531 "shrl $9, %%esi \n\t"
2532 "movw %%si, (%%"REG_D
", %%"REG_a
", 2)\n\t"
// V plane: src2 addressed directly as %5, store 4096 bytes further on
2534 "movzbl (%5, %%"REG_b
"), %%edi \n\t" //src[xx]
2535 "movzbl 1(%5, %%"REG_b
"), %%esi \n\t" //src[xx+1]
2536 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2537 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2538 "shll $16, %%edi \n\t"
2539 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2540 "mov %1, %%"REG_D
" \n\t"
2541 "shrl $9, %%esi \n\t"
2542 "movw %%si, 4096(%%"REG_D
", %%"REG_a
", 2)\n\t"
2544 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2545 "adc %3, %%"REG_b
" \n\t" //xx+= xInc>>8 + carry
2546 "add $1, %%"REG_a
" \n\t"
2547 "cmp %2, %%"REG_a
" \n\t"
2550 /* GCC-3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2551 which is needed to support GCC-4.0 */
2552 #if defined(ARCH_X86_64) && ((__GNUC__ > 3) || ( __GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2553 :: "m" (src1
), "m" (dst
), "g" ((long)dstWidth
), "m" (xInc_shr16
), "m" (xInc_mask
),
2555 :: "m" (src1
), "m" (dst
), "m" ((long)dstWidth
), "m" (xInc_shr16
), "m" (xInc_mask
),
2558 : "%"REG_a
, "%"REG_b
, "%ecx", "%"REG_D
, "%esi"
2561 } //if MMX2 can't be used
// --- portable C fallback: 7-bit bilinear blend per plane ---
2565 unsigned int xpos
=0;
2566 for(i
=0;i
<dstWidth
;i
++)
2568 register unsigned int xx
=xpos
>>16;
2569 register unsigned int xalpha
=(xpos
&0xFFFF)>>9;
2570 dst
[i
]=(src1
[xx
]*(xalpha
^127)+src1
[xx
+1]*xalpha
);
2571 dst
[i
+2048]=(src2
[xx
]*(xalpha
^127)+src2
[xx
+1]*xalpha
);
// NOTE(review): the two lines below recompute the same stores in a
// different form; presumably they were commented out in the original
// and lost their comment markers in extraction -- confirm upstream
2573 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2574 dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2582 static int RENAME(swScale
)(SwsContext
*c
, uint8_t* src
[], int srcStride
[], int srcSliceY
,
2583 int srcSliceH
, uint8_t* dst
[], int dstStride
[]){
2585 /* load a few things into local vars to make the code more readable? and faster */
2586 const int srcW
= c
->srcW
;
2587 const int dstW
= c
->dstW
;
2588 const int dstH
= c
->dstH
;
2589 const int chrDstW
= c
->chrDstW
;
2590 const int chrSrcW
= c
->chrSrcW
;
2591 const int lumXInc
= c
->lumXInc
;
2592 const int chrXInc
= c
->chrXInc
;
2593 const int dstFormat
= c
->dstFormat
;
2594 const int srcFormat
= c
->srcFormat
;
2595 const int flags
= c
->flags
;
2596 const int canMMX2BeUsed
= c
->canMMX2BeUsed
;
2597 int16_t *vLumFilterPos
= c
->vLumFilterPos
;
2598 int16_t *vChrFilterPos
= c
->vChrFilterPos
;
2599 int16_t *hLumFilterPos
= c
->hLumFilterPos
;
2600 int16_t *hChrFilterPos
= c
->hChrFilterPos
;
2601 int16_t *vLumFilter
= c
->vLumFilter
;
2602 int16_t *vChrFilter
= c
->vChrFilter
;
2603 int16_t *hLumFilter
= c
->hLumFilter
;
2604 int16_t *hChrFilter
= c
->hChrFilter
;
2605 int32_t *lumMmxFilter
= c
->lumMmxFilter
;
2606 int32_t *chrMmxFilter
= c
->chrMmxFilter
;
2607 const int vLumFilterSize
= c
->vLumFilterSize
;
2608 const int vChrFilterSize
= c
->vChrFilterSize
;
2609 const int hLumFilterSize
= c
->hLumFilterSize
;
2610 const int hChrFilterSize
= c
->hChrFilterSize
;
2611 int16_t **lumPixBuf
= c
->lumPixBuf
;
2612 int16_t **chrPixBuf
= c
->chrPixBuf
;
2613 const int vLumBufSize
= c
->vLumBufSize
;
2614 const int vChrBufSize
= c
->vChrBufSize
;
2615 uint8_t *funnyYCode
= c
->funnyYCode
;
2616 uint8_t *funnyUVCode
= c
->funnyUVCode
;
2617 uint8_t *formatConvBuffer
= c
->formatConvBuffer
;
2618 const int chrSrcSliceY
= srcSliceY
>> c
->chrSrcVSubSample
;
2619 const int chrSrcSliceH
= -((-srcSliceH
) >> c
->chrSrcVSubSample
);
2622 /* vars whch will change and which we need to storw back in the context */
2624 int lumBufIndex
= c
->lumBufIndex
;
2625 int chrBufIndex
= c
->chrBufIndex
;
2626 int lastInLumBuf
= c
->lastInLumBuf
;
2627 int lastInChrBuf
= c
->lastInChrBuf
;
2629 if(isPacked(c
->srcFormat
)){
2635 srcStride
[2]= srcStride
[0];
2637 srcStride
[1]<<= c
->vChrDrop
;
2638 srcStride
[2]<<= c
->vChrDrop
;
2640 // printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2641 // (int)dst[0], (int)dst[1], (int)dst[2]);
2643 #if 0 //self test FIXME move to a vfilter or something
2645 static volatile int i
=0;
2647 if(srcFormat
==IMGFMT_YV12
&& i
==1 && srcSliceH
>= c
->srcH
)
2648 selfTest(src
, srcStride
, c
->srcW
, c
->srcH
);
2653 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2654 //dstStride[0],dstStride[1],dstStride[2]);
2656 if(dstStride
[0]%8 !=0 || dstStride
[1]%8 !=0 || dstStride
[2]%8 !=0)
2658 static int firstTime
=1; //FIXME move this into the context perhaps
2659 if(flags
& SWS_PRINT_INFO
&& firstTime
)
2661 MSG_WARN("SwScaler: Warning: dstStride is not aligned!\n"
2662 "SwScaler: ->cannot do aligned memory acesses anymore\n");
2667 /* Note the user might start scaling the picture in the middle so this will not get executed
2668 this is not really intended but works currently, so ppl might do it */
2679 for(;dstY
< dstH
; dstY
++){
2680 unsigned char *dest
=dst
[0]+dstStride
[0]*dstY
;
2681 const int chrDstY
= dstY
>>c
->chrDstVSubSample
;
2682 unsigned char *uDest
=dst
[1]+dstStride
[1]*chrDstY
;
2683 unsigned char *vDest
=dst
[2]+dstStride
[2]*chrDstY
;
2685 const int firstLumSrcY
= vLumFilterPos
[dstY
]; //First line needed as input
2686 const int firstChrSrcY
= vChrFilterPos
[chrDstY
]; //First line needed as input
2687 const int lastLumSrcY
= firstLumSrcY
+ vLumFilterSize
-1; // Last line needed as input
2688 const int lastChrSrcY
= firstChrSrcY
+ vChrFilterSize
-1; // Last line needed as input
2690 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
2691 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
2692 //handle holes (FAST_BILINEAR & weird filters)
2693 if(firstLumSrcY
> lastInLumBuf
) lastInLumBuf
= firstLumSrcY
-1;
2694 if(firstChrSrcY
> lastInChrBuf
) lastInChrBuf
= firstChrSrcY
-1;
2695 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
2696 ASSERT(firstLumSrcY
>= lastInLumBuf
- vLumBufSize
+ 1)
2697 ASSERT(firstChrSrcY
>= lastInChrBuf
- vChrBufSize
+ 1)
2699 // Do we have enough lines in this slice to output the dstY line
2700 if(lastLumSrcY
< srcSliceY
+ srcSliceH
&& lastChrSrcY
< -((-srcSliceY
- srcSliceH
)>>c
->chrSrcVSubSample
))
2702 //Do horizontal scaling
2703 while(lastInLumBuf
< lastLumSrcY
)
2705 uint8_t *s
= src
[0]+(lastInLumBuf
+ 1 - srcSliceY
)*srcStride
[0];
2707 // printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
2708 ASSERT(lumBufIndex
< 2*vLumBufSize
)
2709 ASSERT(lastInLumBuf
+ 1 - srcSliceY
< srcSliceH
)
2710 ASSERT(lastInLumBuf
+ 1 - srcSliceY
>= 0)
2711 // printf("%d %d\n", lumBufIndex, vLumBufSize);
2712 RENAME(hyscale
)(lumPixBuf
[ lumBufIndex
], dstW
, s
, srcW
, lumXInc
,
2713 flags
, canMMX2BeUsed
, hLumFilter
, hLumFilterPos
, hLumFilterSize
,
2714 funnyYCode
, c
->srcFormat
, formatConvBuffer
,
2715 c
->lumMmx2Filter
, c
->lumMmx2FilterPos
);
2718 while(lastInChrBuf
< lastChrSrcY
)
2720 uint8_t *src1
= src
[1]+(lastInChrBuf
+ 1 - chrSrcSliceY
)*srcStride
[1];
2721 uint8_t *src2
= src
[2]+(lastInChrBuf
+ 1 - chrSrcSliceY
)*srcStride
[2];
2723 ASSERT(chrBufIndex
< 2*vChrBufSize
)
2724 ASSERT(lastInChrBuf
+ 1 - chrSrcSliceY
< (chrSrcSliceH
))
2725 ASSERT(lastInChrBuf
+ 1 - chrSrcSliceY
>= 0)
2726 //FIXME replace parameters through context struct (some at least)
2728 if(!(isGray(srcFormat
) || isGray(dstFormat
)))
2729 RENAME(hcscale
)(chrPixBuf
[ chrBufIndex
], chrDstW
, src1
, src2
, chrSrcW
, chrXInc
,
2730 flags
, canMMX2BeUsed
, hChrFilter
, hChrFilterPos
, hChrFilterSize
,
2731 funnyUVCode
, c
->srcFormat
, formatConvBuffer
,
2732 c
->chrMmx2Filter
, c
->chrMmx2FilterPos
);
2735 //wrap buf index around to stay inside the ring buffer
2736 if(lumBufIndex
>= vLumBufSize
) lumBufIndex
-= vLumBufSize
;
2737 if(chrBufIndex
>= vChrBufSize
) chrBufIndex
-= vChrBufSize
;
2739 else // not enough lines left in this slice -> load the rest in the buffer
2741 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
2742 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
2743 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
2744 vChrBufSize, vLumBufSize);*/
2746 //Do horizontal scaling
2747 while(lastInLumBuf
+1 < srcSliceY
+ srcSliceH
)
2749 uint8_t *s
= src
[0]+(lastInLumBuf
+ 1 - srcSliceY
)*srcStride
[0];
2751 ASSERT(lumBufIndex
< 2*vLumBufSize
)
2752 ASSERT(lastInLumBuf
+ 1 - srcSliceY
< srcSliceH
)
2753 ASSERT(lastInLumBuf
+ 1 - srcSliceY
>= 0)
2754 RENAME(hyscale
)(lumPixBuf
[ lumBufIndex
], dstW
, s
, srcW
, lumXInc
,
2755 flags
, canMMX2BeUsed
, hLumFilter
, hLumFilterPos
, hLumFilterSize
,
2756 funnyYCode
, c
->srcFormat
, formatConvBuffer
,
2757 c
->lumMmx2Filter
, c
->lumMmx2FilterPos
);
2760 while(lastInChrBuf
+1 < (chrSrcSliceY
+ chrSrcSliceH
))
2762 uint8_t *src1
= src
[1]+(lastInChrBuf
+ 1 - chrSrcSliceY
)*srcStride
[1];
2763 uint8_t *src2
= src
[2]+(lastInChrBuf
+ 1 - chrSrcSliceY
)*srcStride
[2];
2765 ASSERT(chrBufIndex
< 2*vChrBufSize
)
2766 ASSERT(lastInChrBuf
+ 1 - chrSrcSliceY
< chrSrcSliceH
)
2767 ASSERT(lastInChrBuf
+ 1 - chrSrcSliceY
>= 0)
2769 if(!(isGray(srcFormat
) || isGray(dstFormat
)))
2770 RENAME(hcscale
)(chrPixBuf
[ chrBufIndex
], chrDstW
, src1
, src2
, chrSrcW
, chrXInc
,
2771 flags
, canMMX2BeUsed
, hChrFilter
, hChrFilterPos
, hChrFilterSize
,
2772 funnyUVCode
, c
->srcFormat
, formatConvBuffer
,
2773 c
->chrMmx2Filter
, c
->chrMmx2FilterPos
);
2776 //wrap buf index around to stay inside the ring buffer
2777 if(lumBufIndex
>= vLumBufSize
) lumBufIndex
-= vLumBufSize
;
2778 if(chrBufIndex
>= vChrBufSize
) chrBufIndex
-= vChrBufSize
;
2779 break; //we can't output a dstY line so let's try with the next slice
2783 b5Dither
= dither8
[dstY
&1];
2784 g6Dither
= dither4
[dstY
&1];
2785 g5Dither
= dither8
[dstY
&1];
2786 r5Dither
= dither8
[(dstY
+1)&1];
2790 int16_t **lumSrcPtr
= lumPixBuf
+ lumBufIndex
+ firstLumSrcY
- lastInLumBuf
+ vLumBufSize
;
2791 int16_t **chrSrcPtr
= chrPixBuf
+ chrBufIndex
+ firstChrSrcY
- lastInChrBuf
+ vChrBufSize
;
2794 for(i
=0; i
<vLumFilterSize
; i
++)
2796 lumMmxFilter
[4*i
+0]= (int32_t)lumSrcPtr
[i
];
2797 lumMmxFilter
[4*i
+2]=
2798 lumMmxFilter
[4*i
+3]=
2799 ((uint16_t)vLumFilter
[dstY
*vLumFilterSize
+ i
])*0x10001;
2801 for(i
=0; i
<vChrFilterSize
; i
++)
2803 chrMmxFilter
[4*i
+0]= (int32_t)chrSrcPtr
[i
];
2804 chrMmxFilter
[4*i
+2]=
2805 chrMmxFilter
[4*i
+3]=
2806 ((uint16_t)vChrFilter
[chrDstY
*vChrFilterSize
+ i
])*0x10001;
2809 if(dstFormat
== IMGFMT_NV12
|| dstFormat
== IMGFMT_NV21
){
2810 const int chrSkipMask
= (1<<c
->chrDstVSubSample
)-1;
2811 if(dstY
&chrSkipMask
) uDest
= NULL
; //FIXME split functions in lumi / chromi
2812 RENAME(yuv2nv12X
)(c
,
2813 vLumFilter
+dstY
*vLumFilterSize
, lumSrcPtr
, vLumFilterSize
,
2814 vChrFilter
+chrDstY
*vChrFilterSize
, chrSrcPtr
, vChrFilterSize
,
2815 dest
, uDest
, dstW
, chrDstW
, dstFormat
);
2817 else if(isPlanarYUV(dstFormat
) || isGray(dstFormat
)) //YV12 like
2819 const int chrSkipMask
= (1<<c
->chrDstVSubSample
)-1;
2820 if((dstY
&chrSkipMask
) || isGray(dstFormat
)) uDest
=vDest
= NULL
; //FIXME split functions in lumi / chromi
2821 if(vLumFilterSize
== 1 && vChrFilterSize
== 1) // Unscaled YV12
2823 int16_t *lumBuf
= lumPixBuf
[0];
2824 int16_t *chrBuf
= chrPixBuf
[0];
2825 RENAME(yuv2yuv1
)(lumBuf
, chrBuf
, dest
, uDest
, vDest
, dstW
, chrDstW
);
2830 vLumFilter
+dstY
*vLumFilterSize
, lumSrcPtr
, vLumFilterSize
,
2831 vChrFilter
+chrDstY
*vChrFilterSize
, chrSrcPtr
, vChrFilterSize
,
2832 dest
, uDest
, vDest
, dstW
, chrDstW
);
2837 ASSERT(lumSrcPtr
+ vLumFilterSize
- 1 < lumPixBuf
+ vLumBufSize
*2);
2838 ASSERT(chrSrcPtr
+ vChrFilterSize
- 1 < chrPixBuf
+ vChrBufSize
*2);
2839 if(vLumFilterSize
== 1 && vChrFilterSize
== 2) //Unscaled RGB
2841 int chrAlpha
= vChrFilter
[2*dstY
+1];
2842 RENAME(yuv2packed1
)(c
, *lumSrcPtr
, *chrSrcPtr
, *(chrSrcPtr
+1),
2843 dest
, dstW
, chrAlpha
, dstFormat
, flags
, dstY
);
2845 else if(vLumFilterSize
== 2 && vChrFilterSize
== 2) //BiLinear Upscale RGB
2847 int lumAlpha
= vLumFilter
[2*dstY
+1];
2848 int chrAlpha
= vChrFilter
[2*dstY
+1];
2849 RENAME(yuv2packed2
)(c
, *lumSrcPtr
, *(lumSrcPtr
+1), *chrSrcPtr
, *(chrSrcPtr
+1),
2850 dest
, dstW
, lumAlpha
, chrAlpha
, dstY
);
2854 RENAME(yuv2packedX
)(c
,
2855 vLumFilter
+dstY
*vLumFilterSize
, lumSrcPtr
, vLumFilterSize
,
2856 vChrFilter
+dstY
*vChrFilterSize
, chrSrcPtr
, vChrFilterSize
,
2861 else // hmm looks like we can't use MMX here without overwriting this array's tail
2863 int16_t **lumSrcPtr
= lumPixBuf
+ lumBufIndex
+ firstLumSrcY
- lastInLumBuf
+ vLumBufSize
;
2864 int16_t **chrSrcPtr
= chrPixBuf
+ chrBufIndex
+ firstChrSrcY
- lastInChrBuf
+ vChrBufSize
;
2865 if(dstFormat
== IMGFMT_NV12
|| dstFormat
== IMGFMT_NV21
){
2866 const int chrSkipMask
= (1<<c
->chrDstVSubSample
)-1;
2867 if(dstY
&chrSkipMask
) uDest
= NULL
; //FIXME split functions in lumi / chromi
2869 vLumFilter
+dstY
*vLumFilterSize
, lumSrcPtr
, vLumFilterSize
,
2870 vChrFilter
+chrDstY
*vChrFilterSize
, chrSrcPtr
, vChrFilterSize
,
2871 dest
, uDest
, dstW
, chrDstW
, dstFormat
);
2873 else if(isPlanarYUV(dstFormat
) || isGray(dstFormat
)) //YV12
2875 const int chrSkipMask
= (1<<c
->chrDstVSubSample
)-1;
2876 if((dstY
&chrSkipMask
) || isGray(dstFormat
)) uDest
=vDest
= NULL
; //FIXME split functions in lumi / chromi
2878 vLumFilter
+dstY
*vLumFilterSize
, lumSrcPtr
, vLumFilterSize
,
2879 vChrFilter
+chrDstY
*vChrFilterSize
, chrSrcPtr
, vChrFilterSize
,
2880 dest
, uDest
, vDest
, dstW
, chrDstW
);
2884 ASSERT(lumSrcPtr
+ vLumFilterSize
- 1 < lumPixBuf
+ vLumBufSize
*2);
2885 ASSERT(chrSrcPtr
+ vChrFilterSize
- 1 < chrPixBuf
+ vChrBufSize
*2);
2887 vLumFilter
+dstY
*vLumFilterSize
, lumSrcPtr
, vLumFilterSize
,
2888 vChrFilter
+dstY
*vChrFilterSize
, chrSrcPtr
, vChrFilterSize
,
2895 __asm
__volatile(SFENCE:::"memory");
2896 __asm
__volatile(EMMS:::"memory");
2898 /* store changed local vars back in the context */
2900 c
->lumBufIndex
= lumBufIndex
;
2901 c
->chrBufIndex
= chrBufIndex
;
2902 c
->lastInLumBuf
= lastInLumBuf
;
2903 c
->lastInChrBuf
= lastInChrBuf
;
2905 return dstY
- lastDstY
;