2 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 * the C code (not assembly, mmx, ...) of this file can be used
21 * under the LGPL license too
33 /* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */
/*
 * CPU-capability instruction-name macros. Each macro expands to the text of
 * an instruction spliced into inline-asm strings below:
 *   PREFETCH/PREFETCHW — cache-prefetch hints (3DNow! vs MMX2 vs no-op),
 *   SFENCE             — store fence (no-op on plain MMX),
 *   PAVGB              — byte average (SSE "pavgb" vs 3DNow! "pavgusb"),
 *   MOVNTQ             — non-temporal store (falls back to plain "movq").
 * NOTE(review): the guarding #ifdef/#else/#endif lines are missing from this
 * extract (embedded numbering has gaps) — the duplicate #define pairs below
 * are alternatives from different preprocessor branches; verify against the
 * pristine file before editing.
 */
40 #define PREFETCH "prefetch"
41 #define PREFETCHW "prefetchw"
42 #elif defined ( HAVE_MMX2 )
43 #define PREFETCH "prefetchnta"
44 #define PREFETCHW "prefetcht0"
46 #define PREFETCH " # nop"
47 #define PREFETCHW " # nop"
51 #define SFENCE "sfence"
53 #define SFENCE " # nop"
57 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
58 #elif defined (HAVE_3DNOW)
59 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
63 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
65 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
/* Indirection so macro arguments are expanded before stringification. */
67 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
/* PowerPC/AltiVec implementations are textually included (template-style
   build, same scheme as this x86 file). Presumably guarded by an #ifdef
   that is missing from this extract — TODO confirm. */
70 #include "swscale_altivec_template.c"
/*
 * YSCALEYUV2YV12X — vertical scaling of one plane to 8-bit output.
 * Walks a {src pointer, coefficient} list at `offset`(%0): for each entry it
 * multiplies 2x4 16-bit source samples by the coefficient (pmulhw),
 * accumulates into mm3/mm4 (pre-seeded with the rounder from
 * VROUNDER_OFFSET), advances the list by 16 bytes, and terminates on a NULL
 * pointer ("test REG_S,REG_S"). Result is >>3, packed unsigned, and 8 bytes
 * are stored per iteration via MOVNTQ to (%1, REG_a).
 * Operands: %0 = &c->redDither (base for context offsets), %1 = dest,
 * %2 = width; clobbers REG_a/REG_d/REG_S.
 * NOTE(review): the "asm volatile(" opening, the numeric loop labels and the
 * conditional jumps are absent from this extract (line-number gaps) —
 * extraction artifact, verify against the pristine source.
 */
73 #define YSCALEYUV2YV12X(x, offset, dest, width) \
75 "xor %%"REG_a", %%"REG_a" \n\t"\
76 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
77 "movq %%mm3, %%mm4 \n\t"\
78 "lea " offset "(%0), %%"REG_d" \n\t"\
79 "mov (%%"REG_d"), %%"REG_S" \n\t"\
80 ASMALIGN(4) /* FIXME Unroll? */\
82 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
83 "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
84 "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
85 "add $16, %%"REG_d" \n\t"\
86 "mov (%%"REG_d"), %%"REG_S" \n\t"\
87 "test %%"REG_S", %%"REG_S" \n\t"\
88 "pmulhw %%mm0, %%mm2 \n\t"\
89 "pmulhw %%mm0, %%mm5 \n\t"\
90 "paddw %%mm2, %%mm3 \n\t"\
91 "paddw %%mm5, %%mm4 \n\t"\
93 "psraw $3, %%mm3 \n\t"\
94 "psraw $3, %%mm4 \n\t"\
95 "packuswb %%mm4, %%mm3 \n\t"\
96 MOVNTQ(%%mm3, (%1, %%REGa))\
97 "add $8, %%"REG_a" \n\t"\
98 "cmp %2, %%"REG_a" \n\t"\
99 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
100 "movq %%mm3, %%mm4 \n\t"\
101 "lea " offset "(%0), %%"REG_d" \n\t"\
102 "mov (%%"REG_d"), %%"REG_S" \n\t"\
104 :: "r" (&c->redDither),\
105 "r" (dest), "g" (width)\
106 : "%"REG_a, "%"REG_d, "%"REG_S\
/*
 * YSCALEYUV2YV12X_ACCURATE — higher-precision variant of YSCALEYUV2YV12X.
 * Processes the filter list two source rows per step: samples from row i and
 * row i+1 are interleaved (punpcklwd/punpckhwd) and multiplied with a shared
 * coefficient pair using pmaddwd, accumulating in 32-bit precision across
 * mm4..mm7. Final step: >>16, pack to words, add the VROUNDER rounder, >>3,
 * pack unsigned and store 8 bytes via MOVNTQ. Same operands/clobbers as the
 * fast variant (%0 = &c->redDither, %1 = dest, %2 = width).
 * NOTE(review): asm opening, loop labels and jumps missing from this extract
 * (line-number gaps) — verify against the pristine source.
 */
109 #define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
111 "lea " offset "(%0), %%"REG_d" \n\t"\
112 "xor %%"REG_a", %%"REG_a" \n\t"\
113 "pxor %%mm4, %%mm4 \n\t"\
114 "pxor %%mm5, %%mm5 \n\t"\
115 "pxor %%mm6, %%mm6 \n\t"\
116 "pxor %%mm7, %%mm7 \n\t"\
117 "mov (%%"REG_d"), %%"REG_S" \n\t"\
120 "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
121 "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
122 "mov 4(%%"REG_d"), %%"REG_S" \n\t"\
123 "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
124 "movq %%mm0, %%mm3 \n\t"\
125 "punpcklwd %%mm1, %%mm0 \n\t"\
126 "punpckhwd %%mm1, %%mm3 \n\t"\
127 "movq 8(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
128 "pmaddwd %%mm1, %%mm0 \n\t"\
129 "pmaddwd %%mm1, %%mm3 \n\t"\
130 "paddd %%mm0, %%mm4 \n\t"\
131 "paddd %%mm3, %%mm5 \n\t"\
132 "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
133 "mov 16(%%"REG_d"), %%"REG_S" \n\t"\
134 "add $16, %%"REG_d" \n\t"\
135 "test %%"REG_S", %%"REG_S" \n\t"\
136 "movq %%mm2, %%mm0 \n\t"\
137 "punpcklwd %%mm3, %%mm2 \n\t"\
138 "punpckhwd %%mm3, %%mm0 \n\t"\
139 "pmaddwd %%mm1, %%mm2 \n\t"\
140 "pmaddwd %%mm1, %%mm0 \n\t"\
141 "paddd %%mm2, %%mm6 \n\t"\
142 "paddd %%mm0, %%mm7 \n\t"\
144 "psrad $16, %%mm4 \n\t"\
145 "psrad $16, %%mm5 \n\t"\
146 "psrad $16, %%mm6 \n\t"\
147 "psrad $16, %%mm7 \n\t"\
148 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
149 "packssdw %%mm5, %%mm4 \n\t"\
150 "packssdw %%mm7, %%mm6 \n\t"\
151 "paddw %%mm0, %%mm4 \n\t"\
152 "paddw %%mm0, %%mm6 \n\t"\
153 "psraw $3, %%mm4 \n\t"\
154 "psraw $3, %%mm6 \n\t"\
155 "packuswb %%mm6, %%mm4 \n\t"\
156 MOVNTQ(%%mm4, (%1, %%REGa))\
157 "add $8, %%"REG_a" \n\t"\
158 "cmp %2, %%"REG_a" \n\t"\
159 "lea " offset "(%0), %%"REG_d" \n\t"\
160 "pxor %%mm4, %%mm4 \n\t"\
161 "pxor %%mm5, %%mm5 \n\t"\
162 "pxor %%mm6, %%mm6 \n\t"\
163 "pxor %%mm7, %%mm7 \n\t"\
164 "mov (%%"REG_d"), %%"REG_S" \n\t"\
166 :: "r" (&c->redDither),\
167 "r" (dest), "g" (width)\
168 : "%"REG_a, "%"REG_d, "%"REG_S\
/*
 * YSCALEYUV2YV121 — trivial 1:1 vertical path: one 16-bit source row is
 * shifted down by 7 to 8 bits, packed unsigned, and 8 bytes stored per
 * iteration via MOVNTQ. %0 = source, %1 = dest, %2 = (negative) count in
 * REG_a counting up toward zero — TODO confirm against pristine source.
 * NOTE(review): the asm opening, loop label and jump are missing here, and
 * the operand/clobber list that follows (lines 185-189, no trailing
 * backslashes) belongs to a DIFFERENT asm statement — presumably the
 * yuv2yuvX C fallback — that was partially dropped by the extraction.
 */
171 #define YSCALEYUV2YV121 \
172 "mov %2, %%"REG_a" \n\t"\
173 ASMALIGN(4) /* FIXME Unroll? */\
175 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
176 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
177 "psraw $7, %%mm0 \n\t"\
178 "psraw $7, %%mm1 \n\t"\
179 "packuswb %%mm1, %%mm0 \n\t"\
180 MOVNTQ(%%mm0, (%1, %%REGa))\
181 "add $8, %%"REG_a" \n\t"\
/* Orphaned constraint list from another asm statement (see NOTE above). */
185 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
186 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
187 "r" (dest), "m" (dstW),
188 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
189 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
/*
 * YSCALEYUV2PACKEDX — vertical scaling for packed-output paths.
 * First walks the chroma filter list (CHR_MMX_FILTER_OFFSET): U is read at
 * (src, REG_a), V at src+4096, pmulhw-accumulated into mm3/mm4 (seeded with
 * the VROUNDER rounder). Then walks the luma list (LUM_MMX_FILTER_OFFSET),
 * accumulating Y1 into mm1 and Y2 into mm7. Each list ends with a NULL
 * pointer ("test REG_S,REG_S"). Register state on exit feeds YSCALEYUV2RGBX.
 * NOTE(review): loop labels/jumps are missing from this extract.
 */
191 #define YSCALEYUV2PACKEDX \
193 "xor %%"REG_a", %%"REG_a" \n\t"\
197 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
198 "mov (%%"REG_d"), %%"REG_S" \n\t"\
199 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
200 "movq %%mm3, %%mm4 \n\t"\
203 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
204 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
205 "movq 4096(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
206 "add $16, %%"REG_d" \n\t"\
207 "mov (%%"REG_d"), %%"REG_S" \n\t"\
208 "pmulhw %%mm0, %%mm2 \n\t"\
209 "pmulhw %%mm0, %%mm5 \n\t"\
210 "paddw %%mm2, %%mm3 \n\t"\
211 "paddw %%mm5, %%mm4 \n\t"\
212 "test %%"REG_S", %%"REG_S" \n\t"\
215 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
216 "mov (%%"REG_d"), %%"REG_S" \n\t"\
217 "movq "VROUNDER_OFFSET"(%0), %%mm1 \n\t"\
218 "movq %%mm1, %%mm7 \n\t"\
221 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
222 "movq (%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y1srcData */\
223 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\
224 "add $16, %%"REG_d" \n\t"\
225 "mov (%%"REG_d"), %%"REG_S" \n\t"\
226 "pmulhw %%mm0, %%mm2 \n\t"\
227 "pmulhw %%mm0, %%mm5 \n\t"\
228 "paddw %%mm2, %%mm1 \n\t"\
229 "paddw %%mm5, %%mm7 \n\t"\
230 "test %%"REG_S", %%"REG_S" \n\t"\
/*
 * YSCALEYUV2PACKEDX_END — common asm tail (input operands + clobbers) for
 * the YSCALEYUV2PACKEDX* bodies: %0 = &c->redDither (context base),
 * %4 = dest, %5 = dstW; the dummies keep operand numbering stable.
 * NOTE(review): the closing ");" of the asm statement is not visible in
 * this extract.
 */
233 #define YSCALEYUV2PACKEDX_END \
234 :: "r" (&c->redDither), \
235 "m" (dummy), "m" (dummy), "m" (dummy),\
236 "r" (dest), "m" (dstW) \
237 : "%"REG_a, "%"REG_d, "%"REG_S \
/*
 * YSCALEYUV2PACKEDX_ACCURATE — high-precision variant of YSCALEYUV2PACKEDX.
 * Chroma: pairs of source rows are word-interleaved and pmaddwd-accumulated
 * in 32-bit (mm4..mm7), then >>16, packed, rounded, and parked in the
 * context at U_TEMP/V_TEMP. Luma: same two-row pmaddwd scheme accumulating
 * Y1 in mm1/mm5 and Y2 in mm7/mm6; finally U/V are reloaded into mm3/mm4 so
 * register state matches what YSCALEYUV2RGBX expects.
 * NOTE(review): loop labels and jumps are missing from this extract.
 */
240 #define YSCALEYUV2PACKEDX_ACCURATE \
242 "xor %%"REG_a", %%"REG_a" \n\t"\
246 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
247 "mov (%%"REG_d"), %%"REG_S" \n\t"\
248 "pxor %%mm4, %%mm4 \n\t"\
249 "pxor %%mm5, %%mm5 \n\t"\
250 "pxor %%mm6, %%mm6 \n\t"\
251 "pxor %%mm7, %%mm7 \n\t"\
254 "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
255 "movq 4096(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
256 "mov 4(%%"REG_d"), %%"REG_S" \n\t"\
257 "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
258 "movq %%mm0, %%mm3 \n\t"\
259 "punpcklwd %%mm1, %%mm0 \n\t"\
260 "punpckhwd %%mm1, %%mm3 \n\t"\
261 "movq 8(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
262 "pmaddwd %%mm1, %%mm0 \n\t"\
263 "pmaddwd %%mm1, %%mm3 \n\t"\
264 "paddd %%mm0, %%mm4 \n\t"\
265 "paddd %%mm3, %%mm5 \n\t"\
266 "movq 4096(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
267 "mov 16(%%"REG_d"), %%"REG_S" \n\t"\
268 "add $16, %%"REG_d" \n\t"\
269 "test %%"REG_S", %%"REG_S" \n\t"\
270 "movq %%mm2, %%mm0 \n\t"\
271 "punpcklwd %%mm3, %%mm2 \n\t"\
272 "punpckhwd %%mm3, %%mm0 \n\t"\
273 "pmaddwd %%mm1, %%mm2 \n\t"\
274 "pmaddwd %%mm1, %%mm0 \n\t"\
275 "paddd %%mm2, %%mm6 \n\t"\
276 "paddd %%mm0, %%mm7 \n\t"\
278 "psrad $16, %%mm4 \n\t"\
279 "psrad $16, %%mm5 \n\t"\
280 "psrad $16, %%mm6 \n\t"\
281 "psrad $16, %%mm7 \n\t"\
282 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
283 "packssdw %%mm5, %%mm4 \n\t"\
284 "packssdw %%mm7, %%mm6 \n\t"\
285 "paddw %%mm0, %%mm4 \n\t"\
286 "paddw %%mm0, %%mm6 \n\t"\
287 "movq %%mm4, "U_TEMP"(%0) \n\t"\
288 "movq %%mm6, "V_TEMP"(%0) \n\t"\
290 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
291 "mov (%%"REG_d"), %%"REG_S" \n\t"\
292 "pxor %%mm1, %%mm1 \n\t"\
293 "pxor %%mm5, %%mm5 \n\t"\
294 "pxor %%mm7, %%mm7 \n\t"\
295 "pxor %%mm6, %%mm6 \n\t"\
298 "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
299 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
300 "mov 4(%%"REG_d"), %%"REG_S" \n\t"\
301 "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
302 "movq %%mm0, %%mm3 \n\t"\
303 "punpcklwd %%mm4, %%mm0 \n\t"\
304 "punpckhwd %%mm4, %%mm3 \n\t"\
305 "movq 8(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
306 "pmaddwd %%mm4, %%mm0 \n\t"\
307 "pmaddwd %%mm4, %%mm3 \n\t"\
308 "paddd %%mm0, %%mm1 \n\t"\
309 "paddd %%mm3, %%mm5 \n\t"\
310 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
311 "mov 16(%%"REG_d"), %%"REG_S" \n\t"\
312 "add $16, %%"REG_d" \n\t"\
313 "test %%"REG_S", %%"REG_S" \n\t"\
314 "movq %%mm2, %%mm0 \n\t"\
315 "punpcklwd %%mm3, %%mm2 \n\t"\
316 "punpckhwd %%mm3, %%mm0 \n\t"\
317 "pmaddwd %%mm4, %%mm2 \n\t"\
318 "pmaddwd %%mm4, %%mm0 \n\t"\
319 "paddd %%mm2, %%mm7 \n\t"\
320 "paddd %%mm0, %%mm6 \n\t"\
322 "psrad $16, %%mm1 \n\t"\
323 "psrad $16, %%mm5 \n\t"\
324 "psrad $16, %%mm7 \n\t"\
325 "psrad $16, %%mm6 \n\t"\
326 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
327 "packssdw %%mm5, %%mm1 \n\t"\
328 "packssdw %%mm6, %%mm7 \n\t"\
329 "paddw %%mm0, %%mm1 \n\t"\
330 "paddw %%mm0, %%mm7 \n\t"\
331 "movq "U_TEMP"(%0), %%mm3 \n\t"\
332 "movq "V_TEMP"(%0), %%mm4 \n\t"\
/*
 * YSCALEYUV2RGBX — YUV→RGB conversion step following YSCALEYUV2PACKEDX*.
 * Input regs: mm1 = Y1, mm7 = Y2, mm3 = U, mm4 = V. Subtracts the U/V/Y
 * offsets and multiplies with the per-context coefficients (UG/VG/UB/VR/Y
 * at fixed offsets from %0), then expands the shared chroma contributions
 * to per-pixel words (punpcklwd/punpckhwd self-interleave) and adds the
 * lumas. Output regs (see comment below): mm2/mm0 = B, mm4/mm3 = G,
 * mm5/mm6 = R packed to bytes, mm7 zeroed for the WRITE* macros.
 */
334 #define YSCALEYUV2RGBX \
335 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
336 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
337 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
338 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
339 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
340 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
341 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
342 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
343 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
344 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
345 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
346 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
347 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
348 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
349 "paddw %%mm3, %%mm4 \n\t"\
350 "movq %%mm2, %%mm0 \n\t"\
351 "movq %%mm5, %%mm6 \n\t"\
352 "movq %%mm4, %%mm3 \n\t"\
353 "punpcklwd %%mm2, %%mm2 \n\t"\
354 "punpcklwd %%mm5, %%mm5 \n\t"\
355 "punpcklwd %%mm4, %%mm4 \n\t"\
356 "paddw %%mm1, %%mm2 \n\t"\
357 "paddw %%mm1, %%mm5 \n\t"\
358 "paddw %%mm1, %%mm4 \n\t"\
359 "punpckhwd %%mm0, %%mm0 \n\t"\
360 "punpckhwd %%mm6, %%mm6 \n\t"\
361 "punpckhwd %%mm3, %%mm3 \n\t"\
362 "paddw %%mm7, %%mm0 \n\t"\
363 "paddw %%mm7, %%mm6 \n\t"\
364 "paddw %%mm7, %%mm3 \n\t"\
365 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
366 "packuswb %%mm0, %%mm2 \n\t"\
367 "packuswb %%mm6, %%mm5 \n\t"\
368 "packuswb %%mm3, %%mm4 \n\t"\
369 "pxor %%mm7, %%mm7 \n\t"
/*
 * FULL_YSCALEYUV2RGB — two-row interpolation + YUV→RGB using global
 * MANGLE()d coefficient constants (w80/w400/yCoeff/ubCoeff/ugCoeff/
 * vrCoeff/vgCoeff) instead of per-context offsets. Blends buf0/buf1
 * (luma, operand %6 = yalpha1) and uvbuf0/uvbuf1 (chroma at +0 and +4096,
 * operand %7 = uvalpha1) via pmulhw of the row difference, then applies
 * the colorspace multiply and packs B/G/R to bytes.
 * NOTE(review): loop label and several lines are missing from this extract
 * (embedded-number gaps at 380-381, 400-401, 409-410, 417, 421).
 */
371 #define FULL_YSCALEYUV2RGB \
372 "pxor %%mm7, %%mm7 \n\t"\
373 "movd %6, %%mm6 \n\t" /*yalpha1*/\
374 "punpcklwd %%mm6, %%mm6 \n\t"\
375 "punpcklwd %%mm6, %%mm6 \n\t"\
376 "movd %7, %%mm5 \n\t" /*uvalpha1*/\
377 "punpcklwd %%mm5, %%mm5 \n\t"\
378 "punpcklwd %%mm5, %%mm5 \n\t"\
379 "xor %%"REG_a", %%"REG_a" \n\t"\
382 "movq (%0, %%"REG_a",2), %%mm0 \n\t" /*buf0[eax]*/\
383 "movq (%1, %%"REG_a",2), %%mm1 \n\t" /*buf1[eax]*/\
384 "movq (%2, %%"REG_a",2), %%mm2 \n\t" /* uvbuf0[eax]*/\
385 "movq (%3, %%"REG_a",2), %%mm3 \n\t" /* uvbuf1[eax]*/\
386 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
387 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
388 "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
389 "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
390 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
391 "movq 4096(%2, %%"REG_a",2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
392 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
393 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
394 "movq 4096(%3, %%"REG_a",2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
395 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
396 "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
397 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
398 "psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\
399 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
402 "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
403 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
404 "pmulhw "MANGLE(ubCoeff)", %%mm3 \n\t"\
405 "psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
406 "pmulhw "MANGLE(ugCoeff)", %%mm2 \n\t"\
407 "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
408 "psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\
411 "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
412 "pmulhw "MANGLE(vrCoeff)", %%mm0 \n\t"\
413 "pmulhw "MANGLE(vgCoeff)", %%mm4 \n\t"\
414 "paddw %%mm1, %%mm3 \n\t" /* B*/\
415 "paddw %%mm1, %%mm0 \n\t" /* R*/\
416 "packuswb %%mm3, %%mm3 \n\t"\
418 "packuswb %%mm0, %%mm0 \n\t"\
419 "paddw %%mm4, %%mm2 \n\t"\
420 "paddw %%mm2, %%mm1 \n\t" /* G*/\
422 "packuswb %%mm1, %%mm1 \n\t"
/*
 * REAL_YSCALEYUV2PACKED — two-row vertical interpolation producing Y/U/V
 * words for packed (YUY2-style) output, no RGB conversion. The stored
 * filter coefficients at CHR/LUM_MMX_FILTER_OFFSET+8 are pre-shifted right
 * by 3 in place, then each pixel is blended as row1>>7 plus
 * (row0-row1)*coeff>>16. Chroma planes are read at +0 (U) and +4096 (V).
 * Leaves mm3=U, mm4=V, mm1=Y1, mm7=Y2 for WRITEYUY2.
 * NOTE(review): loop label missing from this extract (gaps 433-434).
 */
425 #define REAL_YSCALEYUV2PACKED(index, c) \
426 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
427 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
428 "psraw $3, %%mm0 \n\t"\
429 "psraw $3, %%mm1 \n\t"\
430 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
431 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
432 "xor "#index", "#index" \n\t"\
435 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
436 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
437 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
438 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
439 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
440 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
441 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
442 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
443 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
444 "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
445 "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
446 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
447 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
448 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
449 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
450 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
451 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
452 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
453 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
454 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
455 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
456 "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
457 "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
458 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
459 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
/* Expansion indirection, same pattern as MOVNTQ. */
461 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
/*
 * REAL_YSCALEYUV2RGB — two-row vertical interpolation (bilinear blend of
 * buf0/buf1 and uvbuf0/uvbuf1 using the coefficients stored at
 * CHR/LUM_MMX_FILTER_OFFSET+8 in context "c") followed by the same YUV→RGB
 * arithmetic as YSCALEYUV2RGBX, but addressing coefficients via "c".
 * Ends with mm2/mm0 = B, mm4/mm3 = G, mm5/mm6 = R bytes and mm7 = 0,
 * ready for the WRITE* macros.
 * NOTE(review): loop label missing from this extract (gaps 465-466).
 */
463 #define REAL_YSCALEYUV2RGB(index, c) \
464 "xor "#index", "#index" \n\t"\
467 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
468 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
469 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
470 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
471 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
472 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
473 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
474 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
475 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
476 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
477 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
478 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
479 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
480 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
481 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
482 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
483 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
484 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
485 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
486 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
487 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
488 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
489 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
490 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
491 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
492 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
493 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
494 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
495 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
496 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
497 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
498 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
499 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
500 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
501 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
502 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
503 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
504 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
505 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
506 "paddw %%mm3, %%mm4 \n\t"\
507 "movq %%mm2, %%mm0 \n\t"\
508 "movq %%mm5, %%mm6 \n\t"\
509 "movq %%mm4, %%mm3 \n\t"\
510 "punpcklwd %%mm2, %%mm2 \n\t"\
511 "punpcklwd %%mm5, %%mm5 \n\t"\
512 "punpcklwd %%mm4, %%mm4 \n\t"\
513 "paddw %%mm1, %%mm2 \n\t"\
514 "paddw %%mm1, %%mm5 \n\t"\
515 "paddw %%mm1, %%mm4 \n\t"\
516 "punpckhwd %%mm0, %%mm0 \n\t"\
517 "punpckhwd %%mm6, %%mm6 \n\t"\
518 "punpckhwd %%mm3, %%mm3 \n\t"\
519 "paddw %%mm7, %%mm0 \n\t"\
520 "paddw %%mm7, %%mm6 \n\t"\
521 "paddw %%mm7, %%mm3 \n\t"\
522 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
523 "packuswb %%mm0, %%mm2 \n\t"\
524 "packuswb %%mm6, %%mm5 \n\t"\
525 "packuswb %%mm3, %%mm4 \n\t"\
526 "pxor %%mm7, %%mm7 \n\t"
527 #define YSCALEYUV2RGB(index, c) REAL_YSCALEYUV2RGB(index, c)
/*
 * REAL_YSCALEYUV2PACKED1 — single-source-row fast path for packed output:
 * no vertical blend, just >>7 of each 16-bit sample (chroma mm3/mm4, luma
 * mm1/mm7) ready for WRITEYUY2.
 * NOTE(review): loop label missing from this extract (gaps 531-532).
 */
529 #define REAL_YSCALEYUV2PACKED1(index, c) \
530 "xor "#index", "#index" \n\t"\
533 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
534 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
535 "psraw $7, %%mm3 \n\t" \
536 "psraw $7, %%mm4 \n\t" \
537 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
538 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
539 "psraw $7, %%mm1 \n\t" \
540 "psraw $7, %%mm7 \n\t" \
542 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
/*
 * REAL_YSCALEYUV2RGB1 — single-source-row YUV→RGB fast path (no vertical
 * interpolation; samples just >>4). Same coefficient scheme and output
 * register layout as REAL_YSCALEYUV2RGB.
 * NOTE(review): loop label missing from this extract (gaps 546-547).
 */
544 #define REAL_YSCALEYUV2RGB1(index, c) \
545 "xor "#index", "#index" \n\t"\
548 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
549 "movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
550 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
551 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
552 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
553 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
554 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
555 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
556 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
557 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
558 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
559 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
560 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
561 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
562 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
563 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
564 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
565 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
566 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
567 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
568 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
569 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
570 "paddw %%mm3, %%mm4 \n\t"\
571 "movq %%mm2, %%mm0 \n\t"\
572 "movq %%mm5, %%mm6 \n\t"\
573 "movq %%mm4, %%mm3 \n\t"\
574 "punpcklwd %%mm2, %%mm2 \n\t"\
575 "punpcklwd %%mm5, %%mm5 \n\t"\
576 "punpcklwd %%mm4, %%mm4 \n\t"\
577 "paddw %%mm1, %%mm2 \n\t"\
578 "paddw %%mm1, %%mm5 \n\t"\
579 "paddw %%mm1, %%mm4 \n\t"\
580 "punpckhwd %%mm0, %%mm0 \n\t"\
581 "punpckhwd %%mm6, %%mm6 \n\t"\
582 "punpckhwd %%mm3, %%mm3 \n\t"\
583 "paddw %%mm7, %%mm0 \n\t"\
584 "paddw %%mm7, %%mm6 \n\t"\
585 "paddw %%mm7, %%mm3 \n\t"\
586 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
587 "packuswb %%mm0, %%mm2 \n\t"\
588 "packuswb %%mm6, %%mm5 \n\t"\
589 "packuswb %%mm3, %%mm4 \n\t"\
590 "pxor %%mm7, %%mm7 \n\t"
591 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
/*
 * REAL_YSCALEYUV2PACKED1b — packed-output path averaging the two chroma
 * rows ((uvbuf0+uvbuf1)>>8 after the 16-bit samples are summed) while luma
 * comes from a single row (>>7). Used when the two chroma rows should be
 * blended 50/50 — TODO confirm caller condition against pristine source.
 * NOTE(review): loop label missing from this extract (gaps 595-596).
 */
593 #define REAL_YSCALEYUV2PACKED1b(index, c) \
594 "xor "#index", "#index" \n\t"\
597 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
598 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
599 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
600 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
601 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
602 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
603 "psrlw $8, %%mm3 \n\t" \
604 "psrlw $8, %%mm4 \n\t" \
605 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
606 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
607 "psraw $7, %%mm1 \n\t" \
608 "psraw $7, %%mm7 \n\t"
609 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
611 // do vertical chrominance interpolation
/*
 * REAL_YSCALEYUV2RGB1b — RGB path with 50/50 chroma blend: the two chroma
 * rows are summed and >>5 (the FIXME notes possible overflow), luma from a
 * single row (>>4); the rest matches REAL_YSCALEYUV2RGB1.
 * NOTE(review): loop label missing from this extract (gaps 614-615).
 */
612 #define REAL_YSCALEYUV2RGB1b(index, c) \
613 "xor "#index", "#index" \n\t"\
616 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
617 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
618 "movq 4096(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
619 "movq 4096(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
620 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
621 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
622 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
623 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
624 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
625 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
626 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
627 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
628 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
629 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
630 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
631 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
632 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
633 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
634 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
635 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
636 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
637 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
638 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
639 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
640 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
641 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
642 "paddw %%mm3, %%mm4 \n\t"\
643 "movq %%mm2, %%mm0 \n\t"\
644 "movq %%mm5, %%mm6 \n\t"\
645 "movq %%mm4, %%mm3 \n\t"\
646 "punpcklwd %%mm2, %%mm2 \n\t"\
647 "punpcklwd %%mm5, %%mm5 \n\t"\
648 "punpcklwd %%mm4, %%mm4 \n\t"\
649 "paddw %%mm1, %%mm2 \n\t"\
650 "paddw %%mm1, %%mm5 \n\t"\
651 "paddw %%mm1, %%mm4 \n\t"\
652 "punpckhwd %%mm0, %%mm0 \n\t"\
653 "punpckhwd %%mm6, %%mm6 \n\t"\
654 "punpckhwd %%mm3, %%mm3 \n\t"\
655 "paddw %%mm7, %%mm0 \n\t"\
656 "paddw %%mm7, %%mm6 \n\t"\
657 "paddw %%mm7, %%mm3 \n\t"\
658 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
659 "packuswb %%mm0, %%mm2 \n\t"\
660 "packuswb %%mm6, %%mm5 \n\t"\
661 "packuswb %%mm3, %%mm4 \n\t"\
662 "pxor %%mm7, %%mm7 \n\t"
663 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
/*
 * REAL_WRITEBGR32 — interleave the packed B (mm2), G (mm4), R (mm5) bytes
 * produced by the YSCALEYUV2RGB* macros (mm7 must be 0) into four 0RGB
 * dwords per qword and store 8 pixels (32 bytes) via MOVNTQ, then advance
 * the index and compare against dstw (the conditional jump back is missing
 * from this extract — NOTE(review)).
 */
665 #define REAL_WRITEBGR32(dst, dstw, index) \
666 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
667 "movq %%mm2, %%mm1 \n\t" /* B */\
668 "movq %%mm5, %%mm6 \n\t" /* R */\
669 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
670 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
671 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
672 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
673 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
674 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
675 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
676 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
677 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
678 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
680 MOVNTQ(%%mm0, (dst, index, 4))\
681 MOVNTQ(%%mm2, 8(dst, index, 4))\
682 MOVNTQ(%%mm1, 16(dst, index, 4))\
683 MOVNTQ(%%mm3, 24(dst, index, 4))\
685 "add $8, "#index" \n\t"\
686 "cmp "#dstw", "#index" \n\t"\
688 #define WRITEBGR32(dst, dstw, index) REAL_WRITEBGR32(dst, dstw, index)
/*
 * REAL_WRITEBGR16 — pack B/G/R bytes (mm2/mm4/mm5) into RGB565: mask to
 * 5/6/5 significant bits (bF8/bFC), shift into field positions, merge with
 * por, and store 8 pixels (16 bytes) via MOVNTQ. Loop jump missing from
 * this extract — NOTE(review).
 */
690 #define REAL_WRITEBGR16(dst, dstw, index) \
691 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
692 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
693 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
694 "psrlq $3, %%mm2 \n\t"\
696 "movq %%mm2, %%mm1 \n\t"\
697 "movq %%mm4, %%mm3 \n\t"\
699 "punpcklbw %%mm7, %%mm3 \n\t"\
700 "punpcklbw %%mm5, %%mm2 \n\t"\
701 "punpckhbw %%mm7, %%mm4 \n\t"\
702 "punpckhbw %%mm5, %%mm1 \n\t"\
704 "psllq $3, %%mm3 \n\t"\
705 "psllq $3, %%mm4 \n\t"\
707 "por %%mm3, %%mm2 \n\t"\
708 "por %%mm4, %%mm1 \n\t"\
710 MOVNTQ(%%mm2, (dst, index, 2))\
711 MOVNTQ(%%mm1, 8(dst, index, 2))\
713 "add $8, "#index" \n\t"\
714 "cmp "#dstw", "#index" \n\t"\
716 #define WRITEBGR16(dst, dstw, index) REAL_WRITEBGR16(dst, dstw, index)
/*
 * REAL_WRITEBGR15 — RGB555 variant of WRITEBGR16: all three channels
 * masked to 5 bits (bF8), R pre-shifted right 1, green shifted by 2 instead
 * of 3. Stores 8 pixels (16 bytes) via MOVNTQ. Loop jump missing from this
 * extract — NOTE(review).
 */
718 #define REAL_WRITEBGR15(dst, dstw, index) \
719 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
720 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
721 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
722 "psrlq $3, %%mm2 \n\t"\
723 "psrlq $1, %%mm5 \n\t"\
725 "movq %%mm2, %%mm1 \n\t"\
726 "movq %%mm4, %%mm3 \n\t"\
728 "punpcklbw %%mm7, %%mm3 \n\t"\
729 "punpcklbw %%mm5, %%mm2 \n\t"\
730 "punpckhbw %%mm7, %%mm4 \n\t"\
731 "punpckhbw %%mm5, %%mm1 \n\t"\
733 "psllq $2, %%mm3 \n\t"\
734 "psllq $2, %%mm4 \n\t"\
736 "por %%mm3, %%mm2 \n\t"\
737 "por %%mm4, %%mm1 \n\t"\
739 MOVNTQ(%%mm2, (dst, index, 2))\
740 MOVNTQ(%%mm1, 8(dst, index, 2))\
742 "add $8, "#index" \n\t"\
743 "cmp "#dstw", "#index" \n\t"\
745 #define WRITEBGR15(dst, dstw, index) REAL_WRITEBGR15(dst, dstw, index)
/*
 * WRITEBGR24OLD — legacy 24-bit packer: builds four 0RGB dwords as in
 * WRITEBGR32, then squeezes out the zero bytes with shift/mask/por
 * sequences (bm* masks) to emit 3 qwords = 8 pixels of packed 24-bit RGB,
 * advancing dst by 24. Kept for reference; WRITEBGR24MMX/MMX2 below are
 * the used variants — TODO confirm. Loop jump missing from this extract.
 */
747 #define WRITEBGR24OLD(dst, dstw, index) \
748 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
749 "movq %%mm2, %%mm1 \n\t" /* B */\
750 "movq %%mm5, %%mm6 \n\t" /* R */\
751 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
752 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
753 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
754 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
755 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
756 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
757 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
758 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
759 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
760 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
762 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
763 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
764 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
765 "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
766 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
767 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
768 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
769 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
771 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
772 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
773 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
774 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
775 "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
776 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
777 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
778 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
779 "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
780 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
781 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
782 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
783 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
785 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
786 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
787 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
788 "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
789 "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
790 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
791 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
792 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
794 MOVNTQ(%%mm0, (dst))\
795 MOVNTQ(%%mm2, 8(dst))\
796 MOVNTQ(%%mm3, 16(dst))\
797 "add $24, "#dst" \n\t"\
799 "add $8, "#index" \n\t"\
800 "cmp "#dstw", "#index" \n\t"\
/*
 * WRITEBGR24MMX — plain-MMX 24-bit packer: builds four 0RGB dwords, aligns
 * each pixel triple with psllq $40 + punpckhdq into 0RGBRGB0 qwords, then
 * shifts/ors adjacent qwords together so the zero bytes vanish; emits 3
 * qwords = 8 packed pixels and advances dst by 24. Loop jump missing from
 * this extract — NOTE(review).
 */
803 #define WRITEBGR24MMX(dst, dstw, index) \
804 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
805 "movq %%mm2, %%mm1 \n\t" /* B */\
806 "movq %%mm5, %%mm6 \n\t" /* R */\
807 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
808 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
809 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
810 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
811 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
812 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
813 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
814 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
815 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
816 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
818 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
819 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
820 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
821 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
823 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
824 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
825 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
826 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
828 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
829 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
830 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
831 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
833 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
834 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
835 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
836 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
837 MOVNTQ(%%mm0, (dst))\
839 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
840 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
841 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
842 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
843 MOVNTQ(%%mm6, 8(dst))\
845 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
846 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
847 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
848 MOVNTQ(%%mm5, 16(dst))\
850 "add $24, "#dst" \n\t"\
852 "add $8, "#index" \n\t"\
853 "cmp "#dstw", "#index" \n\t"\
/*
 * WRITEBGR24MMX2 — MMX2 (pshufw-based) 24-bit packer: shuffles channel
 * bytes directly into their output positions and masks with the M24A/B/C
 * constants, emitting the same 3 qwords / 8 pixels as WRITEBGR24MMX with
 * fewer instructions. Loop jump missing from this extract — NOTE(review).
 */
856 #define WRITEBGR24MMX2(dst, dstw, index) \
857 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
858 "movq "MANGLE(M24A)", %%mm0 \n\t"\
859 "movq "MANGLE(M24C)", %%mm7 \n\t"\
860 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
861 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
862 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
864 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
865 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
866 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
868 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
869 "por %%mm1, %%mm6 \n\t"\
870 "por %%mm3, %%mm6 \n\t"\
871 MOVNTQ(%%mm6, (dst))\
873 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
874 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
875 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
876 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
878 "pand "MANGLE(M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
879 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
880 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
882 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
883 "por %%mm3, %%mm6 \n\t"\
884 MOVNTQ(%%mm6, 8(dst))\
886 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
887 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
888 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
890 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
891 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
892 "pand "MANGLE(M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
894 "por %%mm1, %%mm3 \n\t"\
895 "por %%mm3, %%mm6 \n\t"\
896 MOVNTQ(%%mm6, 16(dst))\
898 "add $24, "#dst" \n\t"\
900 "add $8, "#index" \n\t"\
901 "cmp "#dstw", "#index" \n\t"\
/* WRITEBGR24 selects the MMX2 or plain-MMX packer; the #ifdef/#else/#endif
   guards are missing from this extract — NOTE(review), verify. */
906 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
909 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
912 #define REAL_WRITEYUY2(dst, dstw, index) \
913 "packuswb %%mm3, %%mm3 \n\t"\
914 "packuswb %%mm4, %%mm4 \n\t"\
915 "packuswb %%mm7, %%mm1 \n\t"\
916 "punpcklbw %%mm4, %%mm3 \n\t"\
917 "movq %%mm1, %%mm7 \n\t"\
918 "punpcklbw %%mm3, %%mm1 \n\t"\
919 "punpckhbw %%mm3, %%mm7 \n\t"\
921 MOVNTQ(%%mm1, (dst, index, 2))\
922 MOVNTQ(%%mm7, 8(dst, index, 2))\
924 "add $8, "#index" \n\t"\
925 "cmp "#dstw", "#index" \n\t"\
927 #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
930 static inline void RENAME(yuv2yuvX
)(SwsContext
*c
, int16_t *lumFilter
, int16_t **lumSrc
, int lumFilterSize
,
931 int16_t *chrFilter
, int16_t **chrSrc
, int chrFilterSize
,
932 uint8_t *dest
, uint8_t *uDest
, uint8_t *vDest
, long dstW
, long chrDstW
)
935 if (c
->flags
& SWS_ACCURATE_RND
){
937 YSCALEYUV2YV12X_ACCURATE( 0, CHR_MMX_FILTER_OFFSET
, uDest
, chrDstW
)
938 YSCALEYUV2YV12X_ACCURATE(4096, CHR_MMX_FILTER_OFFSET
, vDest
, chrDstW
)
941 YSCALEYUV2YV12X_ACCURATE(0, LUM_MMX_FILTER_OFFSET
, dest
, dstW
)
944 YSCALEYUV2YV12X( 0, CHR_MMX_FILTER_OFFSET
, uDest
, chrDstW
)
945 YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET
, vDest
, chrDstW
)
948 YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET
, dest
, dstW
)
952 yuv2yuvX_altivec_real(lumFilter
, lumSrc
, lumFilterSize
,
953 chrFilter
, chrSrc
, chrFilterSize
,
954 dest
, uDest
, vDest
, dstW
, chrDstW
);
956 yuv2yuvXinC(lumFilter
, lumSrc
, lumFilterSize
,
957 chrFilter
, chrSrc
, chrFilterSize
,
958 dest
, uDest
, vDest
, dstW
, chrDstW
);
959 #endif //!HAVE_ALTIVEC
960 #endif /* HAVE_MMX */
963 static inline void RENAME(yuv2nv12X
)(SwsContext
*c
, int16_t *lumFilter
, int16_t **lumSrc
, int lumFilterSize
,
964 int16_t *chrFilter
, int16_t **chrSrc
, int chrFilterSize
,
965 uint8_t *dest
, uint8_t *uDest
, int dstW
, int chrDstW
, int dstFormat
)
967 yuv2nv12XinC(lumFilter
, lumSrc
, lumFilterSize
,
968 chrFilter
, chrSrc
, chrFilterSize
,
969 dest
, uDest
, dstW
, chrDstW
, dstFormat
);
972 static inline void RENAME(yuv2yuv1
)(int16_t *lumSrc
, int16_t *chrSrc
,
973 uint8_t *dest
, uint8_t *uDest
, uint8_t *vDest
, long dstW
, long chrDstW
)
980 :: "r" (chrSrc
+ chrDstW
), "r" (uDest
+ chrDstW
),
987 :: "r" (chrSrc
+ 2048 + chrDstW
), "r" (vDest
+ chrDstW
),
995 :: "r" (lumSrc
+ dstW
), "r" (dest
+ dstW
),
1001 for (i
=0; i
<dstW
; i
++)
1003 int val
= lumSrc
[i
]>>7;
1014 for (i
=0; i
<chrDstW
; i
++)
1017 int v
=chrSrc
[i
+ 2048]>>7;
1021 else if (u
>255) u
=255;
1023 else if (v
>255) v
=255;
1034 * vertical scale YV12 to RGB
1036 static inline void RENAME(yuv2packedX
)(SwsContext
*c
, int16_t *lumFilter
, int16_t **lumSrc
, int lumFilterSize
,
1037 int16_t *chrFilter
, int16_t **chrSrc
, int chrFilterSize
,
1038 uint8_t *dest
, long dstW
, long dstY
)
1042 if (c
->flags
& SWS_ACCURATE_RND
){
1043 switch(c
->dstFormat
){
1045 YSCALEYUV2PACKEDX_ACCURATE
1047 WRITEBGR32(%4, %5, %%REGa
)
1049 YSCALEYUV2PACKEDX_END
1052 YSCALEYUV2PACKEDX_ACCURATE
1054 "lea (%%"REG_a
", %%"REG_a
", 2), %%"REG_c
"\n\t" //FIXME optimize
1055 "add %4, %%"REG_c
" \n\t"
1056 WRITEBGR24(%%REGc
, %5, %%REGa
)
1059 :: "r" (&c
->redDither
),
1060 "m" (dummy
), "m" (dummy
), "m" (dummy
),
1061 "r" (dest
), "m" (dstW
)
1062 : "%"REG_a
, "%"REG_c
, "%"REG_d
, "%"REG_S
1065 case PIX_FMT_BGR555
:
1066 YSCALEYUV2PACKEDX_ACCURATE
1068 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1070 "paddusb "MANGLE(b5Dither
)", %%mm2\n\t"
1071 "paddusb "MANGLE(g5Dither
)", %%mm4\n\t"
1072 "paddusb "MANGLE(r5Dither
)", %%mm5\n\t"
1075 WRITEBGR15(%4, %5, %%REGa
)
1076 YSCALEYUV2PACKEDX_END
1078 case PIX_FMT_BGR565
:
1079 YSCALEYUV2PACKEDX_ACCURATE
1081 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1083 "paddusb "MANGLE(b5Dither
)", %%mm2\n\t"
1084 "paddusb "MANGLE(g6Dither
)", %%mm4\n\t"
1085 "paddusb "MANGLE(r5Dither
)", %%mm5\n\t"
1088 WRITEBGR16(%4, %5, %%REGa
)
1089 YSCALEYUV2PACKEDX_END
1091 case PIX_FMT_YUYV422
:
1092 YSCALEYUV2PACKEDX_ACCURATE
1093 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1095 "psraw $3, %%mm3 \n\t"
1096 "psraw $3, %%mm4 \n\t"
1097 "psraw $3, %%mm1 \n\t"
1098 "psraw $3, %%mm7 \n\t"
1099 WRITEYUY2(%4, %5, %%REGa
)
1100 YSCALEYUV2PACKEDX_END
1104 switch(c
->dstFormat
)
1109 WRITEBGR32(%4, %5, %%REGa
)
1110 YSCALEYUV2PACKEDX_END
1115 "lea (%%"REG_a
", %%"REG_a
", 2), %%"REG_c
" \n\t" //FIXME optimize
1116 "add %4, %%"REG_c
" \n\t"
1117 WRITEBGR24(%%REGc
, %5, %%REGa
)
1119 :: "r" (&c
->redDither
),
1120 "m" (dummy
), "m" (dummy
), "m" (dummy
),
1121 "r" (dest
), "m" (dstW
)
1122 : "%"REG_a
, "%"REG_c
, "%"REG_d
, "%"REG_S
1125 case PIX_FMT_BGR555
:
1128 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1130 "paddusb "MANGLE(b5Dither
)", %%mm2 \n\t"
1131 "paddusb "MANGLE(g5Dither
)", %%mm4 \n\t"
1132 "paddusb "MANGLE(r5Dither
)", %%mm5 \n\t"
1135 WRITEBGR15(%4, %5, %%REGa
)
1136 YSCALEYUV2PACKEDX_END
1138 case PIX_FMT_BGR565
:
1141 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1143 "paddusb "MANGLE(b5Dither
)", %%mm2 \n\t"
1144 "paddusb "MANGLE(g6Dither
)", %%mm4 \n\t"
1145 "paddusb "MANGLE(r5Dither
)", %%mm5 \n\t"
1148 WRITEBGR16(%4, %5, %%REGa
)
1149 YSCALEYUV2PACKEDX_END
1151 case PIX_FMT_YUYV422
:
1153 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1155 "psraw $3, %%mm3 \n\t"
1156 "psraw $3, %%mm4 \n\t"
1157 "psraw $3, %%mm1 \n\t"
1158 "psraw $3, %%mm7 \n\t"
1159 WRITEYUY2(%4, %5, %%REGa
)
1160 YSCALEYUV2PACKEDX_END
1164 #endif /* HAVE_MMX */
1166 /* The following list of supported dstFormat values should
1167 match what's found in the body of altivec_yuv2packedX() */
1168 if (c
->dstFormat
==PIX_FMT_ABGR
|| c
->dstFormat
==PIX_FMT_BGRA
||
1169 c
->dstFormat
==PIX_FMT_BGR24
|| c
->dstFormat
==PIX_FMT_RGB24
||
1170 c
->dstFormat
==PIX_FMT_RGBA
|| c
->dstFormat
==PIX_FMT_ARGB
)
1171 altivec_yuv2packedX (c
, lumFilter
, lumSrc
, lumFilterSize
,
1172 chrFilter
, chrSrc
, chrFilterSize
,
1176 yuv2packedXinC(c
, lumFilter
, lumSrc
, lumFilterSize
,
1177 chrFilter
, chrSrc
, chrFilterSize
,
1182 * vertical bilinear scale YV12 to RGB
1184 static inline void RENAME(yuv2packed2
)(SwsContext
*c
, uint16_t *buf0
, uint16_t *buf1
, uint16_t *uvbuf0
, uint16_t *uvbuf1
,
1185 uint8_t *dest
, int dstW
, int yalpha
, int uvalpha
, int y
)
1187 int yalpha1
=yalpha
^4095;
1188 int uvalpha1
=uvalpha
^4095;
1192 if (flags
&SWS_FULL_CHR_H_INT
)
1202 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1203 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1205 "movq %%mm3, %%mm1 \n\t"
1206 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1207 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1209 MOVNTQ(%%mm3
, (%4, %%REGa
, 4))
1210 MOVNTQ(%%mm1
, 8(%4, %%REGa
, 4))
1212 "add $4, %%"REG_a
" \n\t"
1213 "cmp %5, %%"REG_a
" \n\t"
1216 :: "r" (buf0
), "r" (buf1
), "r" (uvbuf0
), "r" (uvbuf1
), "r" (dest
), "m" ((long)dstW
),
1217 "m" (yalpha1
), "m" (uvalpha1
)
1227 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1228 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1230 "movq %%mm3, %%mm1 \n\t"
1231 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1232 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1234 "movq %%mm3, %%mm2 \n\t" // BGR0BGR0
1235 "psrlq $8, %%mm3 \n\t" // GR0BGR00
1236 "pand "MANGLE(bm00000111
)", %%mm2 \n\t" // BGR00000
1237 "pand "MANGLE(bm11111000
)", %%mm3 \n\t" // 000BGR00
1238 "por %%mm2, %%mm3 \n\t" // BGRBGR00
1239 "movq %%mm1, %%mm2 \n\t"
1240 "psllq $48, %%mm1 \n\t" // 000000BG
1241 "por %%mm1, %%mm3 \n\t" // BGRBGRBG
1243 "movq %%mm2, %%mm1 \n\t" // BGR0BGR0
1244 "psrld $16, %%mm2 \n\t" // R000R000
1245 "psrlq $24, %%mm1 \n\t" // 0BGR0000
1246 "por %%mm2, %%mm1 \n\t" // RBGRR000
1248 "mov %4, %%"REG_b
" \n\t"
1249 "add %%"REG_a
", %%"REG_b
" \n\t"
1253 "movntq %%mm3, (%%"REG_b
", %%"REG_a
", 2) \n\t"
1254 "movntq %%mm1, 8(%%"REG_b
", %%"REG_a
", 2) \n\t"
1256 "movd %%mm3, (%%"REG_b
", %%"REG_a
", 2) \n\t"
1257 "psrlq $32, %%mm3 \n\t"
1258 "movd %%mm3, 4(%%"REG_b
", %%"REG_a
", 2) \n\t"
1259 "movd %%mm1, 8(%%"REG_b
", %%"REG_a
", 2) \n\t"
1261 "add $4, %%"REG_a
" \n\t"
1262 "cmp %5, %%"REG_a
" \n\t"
1265 :: "r" (buf0
), "r" (buf1
), "r" (uvbuf0
), "r" (uvbuf1
), "m" (dest
), "m" (dstW
),
1266 "m" (yalpha1
), "m" (uvalpha1
)
1267 : "%"REG_a
, "%"REG_b
1270 case PIX_FMT_BGR555
:
1275 "paddusb "MANGLE(g5Dither
)", %%mm1 \n\t"
1276 "paddusb "MANGLE(r5Dither
)", %%mm0 \n\t"
1277 "paddusb "MANGLE(b5Dither
)", %%mm3 \n\t"
1279 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1280 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1281 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1283 "psrlw $3, %%mm3 \n\t"
1284 "psllw $2, %%mm1 \n\t"
1285 "psllw $7, %%mm0 \n\t"
1286 "pand "MANGLE(g15Mask
)", %%mm1 \n\t"
1287 "pand "MANGLE(r15Mask
)", %%mm0 \n\t"
1289 "por %%mm3, %%mm1 \n\t"
1290 "por %%mm1, %%mm0 \n\t"
1292 MOVNTQ(%%mm0
, (%4, %%REGa
, 2))
1294 "add $4, %%"REG_a
" \n\t"
1295 "cmp %5, %%"REG_a
" \n\t"
1298 :: "r" (buf0
), "r" (buf1
), "r" (uvbuf0
), "r" (uvbuf1
), "r" (dest
), "m" (dstW
),
1299 "m" (yalpha1
), "m" (uvalpha1
)
1303 case PIX_FMT_BGR565
:
1308 "paddusb "MANGLE(g6Dither
)", %%mm1 \n\t"
1309 "paddusb "MANGLE(r5Dither
)", %%mm0 \n\t"
1310 "paddusb "MANGLE(b5Dither
)", %%mm3 \n\t"
1312 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1313 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1314 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1316 "psrlw $3, %%mm3 \n\t"
1317 "psllw $3, %%mm1 \n\t"
1318 "psllw $8, %%mm0 \n\t"
1319 "pand "MANGLE(g16Mask
)", %%mm1 \n\t"
1320 "pand "MANGLE(r16Mask
)", %%mm0 \n\t"
1322 "por %%mm3, %%mm1 \n\t"
1323 "por %%mm1, %%mm0 \n\t"
1325 MOVNTQ(%%mm0
, (%4, %%REGa
, 2))
1327 "add $4, %%"REG_a
" \n\t"
1328 "cmp %5, %%"REG_a
" \n\t"
1331 :: "r" (buf0
), "r" (buf1
), "r" (uvbuf0
), "r" (uvbuf1
), "r" (dest
), "m" (dstW
),
1332 "m" (yalpha1
), "m" (uvalpha1
)
1336 #endif /* HAVE_MMX */
1341 if (dstFormat
==PIX_FMT_RGB32
)
1344 #ifdef WORDS_BIGENDIAN
1347 for (i
=0;i
<dstW
;i
++){
1348 // vertical linear interpolation && yuv2rgb in a single step:
1349 int Y
=yuvtab_2568
[((buf0
[i
]*yalpha1
+buf1
[i
]*yalpha
)>>19)];
1350 int U
=((uvbuf0
[i
]*uvalpha1
+uvbuf1
[i
]*uvalpha
)>>19);
1351 int V
=((uvbuf0
[i
+2048]*uvalpha1
+uvbuf1
[i
+2048]*uvalpha
)>>19);
1352 dest
[0]=clip_table
[((Y
+ yuvtab_40cf
[U
]) >>13)];
1353 dest
[1]=clip_table
[((Y
+ yuvtab_1a1e
[V
] + yuvtab_0c92
[U
]) >>13)];
1354 dest
[2]=clip_table
[((Y
+ yuvtab_3343
[V
]) >>13)];
1358 else if (dstFormat
==PIX_FMT_BGR24
)
1361 for (i
=0;i
<dstW
;i
++){
1362 // vertical linear interpolation && yuv2rgb in a single step:
1363 int Y
=yuvtab_2568
[((buf0
[i
]*yalpha1
+buf1
[i
]*yalpha
)>>19)];
1364 int U
=((uvbuf0
[i
]*uvalpha1
+uvbuf1
[i
]*uvalpha
)>>19);
1365 int V
=((uvbuf0
[i
+2048]*uvalpha1
+uvbuf1
[i
+2048]*uvalpha
)>>19);
1366 dest
[0]=clip_table
[((Y
+ yuvtab_40cf
[U
]) >>13)];
1367 dest
[1]=clip_table
[((Y
+ yuvtab_1a1e
[V
] + yuvtab_0c92
[U
]) >>13)];
1368 dest
[2]=clip_table
[((Y
+ yuvtab_3343
[V
]) >>13)];
1372 else if (dstFormat
==PIX_FMT_BGR565
)
1375 for (i
=0;i
<dstW
;i
++){
1376 // vertical linear interpolation && yuv2rgb in a single step:
1377 int Y
=yuvtab_2568
[((buf0
[i
]*yalpha1
+buf1
[i
]*yalpha
)>>19)];
1378 int U
=((uvbuf0
[i
]*uvalpha1
+uvbuf1
[i
]*uvalpha
)>>19);
1379 int V
=((uvbuf0
[i
+2048]*uvalpha1
+uvbuf1
[i
+2048]*uvalpha
)>>19);
1381 ((uint16_t*)dest
)[i
] =
1382 clip_table16b
[(Y
+ yuvtab_40cf
[U
]) >>13] |
1383 clip_table16g
[(Y
+ yuvtab_1a1e
[V
] + yuvtab_0c92
[U
]) >>13] |
1384 clip_table16r
[(Y
+ yuvtab_3343
[V
]) >>13];
1387 else if (dstFormat
==PIX_FMT_BGR555
)
1390 for (i
=0;i
<dstW
;i
++){
1391 // vertical linear interpolation && yuv2rgb in a single step:
1392 int Y
=yuvtab_2568
[((buf0
[i
]*yalpha1
+buf1
[i
]*yalpha
)>>19)];
1393 int U
=((uvbuf0
[i
]*uvalpha1
+uvbuf1
[i
]*uvalpha
)>>19);
1394 int V
=((uvbuf0
[i
+2048]*uvalpha1
+uvbuf1
[i
+2048]*uvalpha
)>>19);
1396 ((uint16_t*)dest
)[i
] =
1397 clip_table15b
[(Y
+ yuvtab_40cf
[U
]) >>13] |
1398 clip_table15g
[(Y
+ yuvtab_1a1e
[V
] + yuvtab_0c92
[U
]) >>13] |
1399 clip_table15r
[(Y
+ yuvtab_3343
[V
]) >>13];
1407 switch(c
->dstFormat
)
1409 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1412 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1413 "mov %4, %%"REG_b
" \n\t"
1414 "push %%"REG_BP
" \n\t"
1415 YSCALEYUV2RGB(%%REGBP
, %5)
1416 WRITEBGR32(%%REGb
, 8280(%5), %%REGBP
)
1417 "pop %%"REG_BP
" \n\t"
1418 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1420 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1426 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1427 "mov %4, %%"REG_b
" \n\t"
1428 "push %%"REG_BP
" \n\t"
1429 YSCALEYUV2RGB(%%REGBP
, %5)
1430 WRITEBGR24(%%REGb
, 8280(%5), %%REGBP
)
1431 "pop %%"REG_BP
" \n\t"
1432 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1433 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1437 case PIX_FMT_BGR555
:
1439 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1440 "mov %4, %%"REG_b
" \n\t"
1441 "push %%"REG_BP
" \n\t"
1442 YSCALEYUV2RGB(%%REGBP
, %5)
1443 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1445 "paddusb "MANGLE(b5Dither
)", %%mm2 \n\t"
1446 "paddusb "MANGLE(g5Dither
)", %%mm4 \n\t"
1447 "paddusb "MANGLE(r5Dither
)", %%mm5 \n\t"
1450 WRITEBGR15(%%REGb
, 8280(%5), %%REGBP
)
1451 "pop %%"REG_BP
" \n\t"
1452 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1454 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1458 case PIX_FMT_BGR565
:
1460 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1461 "mov %4, %%"REG_b
" \n\t"
1462 "push %%"REG_BP
" \n\t"
1463 YSCALEYUV2RGB(%%REGBP
, %5)
1464 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1466 "paddusb "MANGLE(b5Dither
)", %%mm2 \n\t"
1467 "paddusb "MANGLE(g6Dither
)", %%mm4 \n\t"
1468 "paddusb "MANGLE(r5Dither
)", %%mm5 \n\t"
1471 WRITEBGR16(%%REGb
, 8280(%5), %%REGBP
)
1472 "pop %%"REG_BP
" \n\t"
1473 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1474 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1478 case PIX_FMT_YUYV422
:
1480 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1481 "mov %4, %%"REG_b
" \n\t"
1482 "push %%"REG_BP
" \n\t"
1483 YSCALEYUV2PACKED(%%REGBP
, %5)
1484 WRITEYUY2(%%REGb
, 8280(%5), %%REGBP
)
1485 "pop %%"REG_BP
" \n\t"
1486 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1487 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1494 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C
, YSCALE_YUV_2_PACKED2_C
)
1498 * YV12 to RGB without scaling or interpolating
1500 static inline void RENAME(yuv2packed1
)(SwsContext
*c
, uint16_t *buf0
, uint16_t *uvbuf0
, uint16_t *uvbuf1
,
1501 uint8_t *dest
, int dstW
, int uvalpha
, int dstFormat
, int flags
, int y
)
1503 const int yalpha1
=0;
1506 uint16_t *buf1
= buf0
; //FIXME needed for the rgb1/bgr1
1507 const int yalpha
= 4096; //FIXME ...
1509 if (flags
&SWS_FULL_CHR_H_INT
)
1511 RENAME(yuv2packed2
)(c
, buf0
, buf0
, uvbuf0
, uvbuf1
, dest
, dstW
, 0, uvalpha
, y
);
1516 if ( uvalpha
< 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1522 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1523 "mov %4, %%"REG_b
" \n\t"
1524 "push %%"REG_BP
" \n\t"
1525 YSCALEYUV2RGB1(%%REGBP
, %5)
1526 WRITEBGR32(%%REGb
, 8280(%5), %%REGBP
)
1527 "pop %%"REG_BP
" \n\t"
1528 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1530 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1536 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1537 "mov %4, %%"REG_b
" \n\t"
1538 "push %%"REG_BP
" \n\t"
1539 YSCALEYUV2RGB1(%%REGBP
, %5)
1540 WRITEBGR24(%%REGb
, 8280(%5), %%REGBP
)
1541 "pop %%"REG_BP
" \n\t"
1542 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1544 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1548 case PIX_FMT_BGR555
:
1550 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1551 "mov %4, %%"REG_b
" \n\t"
1552 "push %%"REG_BP
" \n\t"
1553 YSCALEYUV2RGB1(%%REGBP
, %5)
1554 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1556 "paddusb "MANGLE(b5Dither
)", %%mm2 \n\t"
1557 "paddusb "MANGLE(g5Dither
)", %%mm4 \n\t"
1558 "paddusb "MANGLE(r5Dither
)", %%mm5 \n\t"
1560 WRITEBGR15(%%REGb
, 8280(%5), %%REGBP
)
1561 "pop %%"REG_BP
" \n\t"
1562 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1564 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1568 case PIX_FMT_BGR565
:
1570 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1571 "mov %4, %%"REG_b
" \n\t"
1572 "push %%"REG_BP
" \n\t"
1573 YSCALEYUV2RGB1(%%REGBP
, %5)
1574 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1576 "paddusb "MANGLE(b5Dither
)", %%mm2 \n\t"
1577 "paddusb "MANGLE(g6Dither
)", %%mm4 \n\t"
1578 "paddusb "MANGLE(r5Dither
)", %%mm5 \n\t"
1581 WRITEBGR16(%%REGb
, 8280(%5), %%REGBP
)
1582 "pop %%"REG_BP
" \n\t"
1583 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1585 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1589 case PIX_FMT_YUYV422
:
1591 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1592 "mov %4, %%"REG_b
" \n\t"
1593 "push %%"REG_BP
" \n\t"
1594 YSCALEYUV2PACKED1(%%REGBP
, %5)
1595 WRITEYUY2(%%REGb
, 8280(%5), %%REGBP
)
1596 "pop %%"REG_BP
" \n\t"
1597 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1599 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1611 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1612 "mov %4, %%"REG_b
" \n\t"
1613 "push %%"REG_BP
" \n\t"
1614 YSCALEYUV2RGB1b(%%REGBP
, %5)
1615 WRITEBGR32(%%REGb
, 8280(%5), %%REGBP
)
1616 "pop %%"REG_BP
" \n\t"
1617 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1619 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1625 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1626 "mov %4, %%"REG_b
" \n\t"
1627 "push %%"REG_BP
" \n\t"
1628 YSCALEYUV2RGB1b(%%REGBP
, %5)
1629 WRITEBGR24(%%REGb
, 8280(%5), %%REGBP
)
1630 "pop %%"REG_BP
" \n\t"
1631 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1633 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1637 case PIX_FMT_BGR555
:
1639 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1640 "mov %4, %%"REG_b
" \n\t"
1641 "push %%"REG_BP
" \n\t"
1642 YSCALEYUV2RGB1b(%%REGBP
, %5)
1643 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1645 "paddusb "MANGLE(b5Dither
)", %%mm2 \n\t"
1646 "paddusb "MANGLE(g5Dither
)", %%mm4 \n\t"
1647 "paddusb "MANGLE(r5Dither
)", %%mm5 \n\t"
1649 WRITEBGR15(%%REGb
, 8280(%5), %%REGBP
)
1650 "pop %%"REG_BP
" \n\t"
1651 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1653 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1657 case PIX_FMT_BGR565
:
1659 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1660 "mov %4, %%"REG_b
" \n\t"
1661 "push %%"REG_BP
" \n\t"
1662 YSCALEYUV2RGB1b(%%REGBP
, %5)
1663 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1665 "paddusb "MANGLE(b5Dither
)", %%mm2 \n\t"
1666 "paddusb "MANGLE(g6Dither
)", %%mm4 \n\t"
1667 "paddusb "MANGLE(r5Dither
)", %%mm5 \n\t"
1670 WRITEBGR16(%%REGb
, 8280(%5), %%REGBP
)
1671 "pop %%"REG_BP
" \n\t"
1672 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1674 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1678 case PIX_FMT_YUYV422
:
1680 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1681 "mov %4, %%"REG_b
" \n\t"
1682 "push %%"REG_BP
" \n\t"
1683 YSCALEYUV2PACKED1b(%%REGBP
, %5)
1684 WRITEYUY2(%%REGb
, 8280(%5), %%REGBP
)
1685 "pop %%"REG_BP
" \n\t"
1686 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1688 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1694 #endif /* HAVE_MMX */
1695 if ( uvalpha
< 2048 )
1697 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C
, YSCALE_YUV_2_PACKED1_C
)
1699 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C
, YSCALE_YUV_2_PACKED1B_C
)
1703 //FIXME yuy2* can read upto 7 samples to much
1705 static inline void RENAME(yuy2ToY
)(uint8_t *dst
, uint8_t *src
, long width
)
1709 "movq "MANGLE(bm01010101
)", %%mm2 \n\t"
1710 "mov %0, %%"REG_a
" \n\t"
1712 "movq (%1, %%"REG_a
",2), %%mm0 \n\t"
1713 "movq 8(%1, %%"REG_a
",2), %%mm1 \n\t"
1714 "pand %%mm2, %%mm0 \n\t"
1715 "pand %%mm2, %%mm1 \n\t"
1716 "packuswb %%mm1, %%mm0 \n\t"
1717 "movq %%mm0, (%2, %%"REG_a
") \n\t"
1718 "add $8, %%"REG_a
" \n\t"
1720 : : "g" (-width
), "r" (src
+width
*2), "r" (dst
+width
)
1725 for (i
=0; i
<width
; i
++)
1730 static inline void RENAME(yuy2ToUV
)(uint8_t *dstU
, uint8_t *dstV
, uint8_t *src1
, uint8_t *src2
, long width
)
1734 "movq "MANGLE(bm01010101
)", %%mm4 \n\t"
1735 "mov %0, %%"REG_a
" \n\t"
1737 "movq (%1, %%"REG_a
",4), %%mm0 \n\t"
1738 "movq 8(%1, %%"REG_a
",4), %%mm1 \n\t"
1739 "psrlw $8, %%mm0 \n\t"
1740 "psrlw $8, %%mm1 \n\t"
1741 "packuswb %%mm1, %%mm0 \n\t"
1742 "movq %%mm0, %%mm1 \n\t"
1743 "psrlw $8, %%mm0 \n\t"
1744 "pand %%mm4, %%mm1 \n\t"
1745 "packuswb %%mm0, %%mm0 \n\t"
1746 "packuswb %%mm1, %%mm1 \n\t"
1747 "movd %%mm0, (%3, %%"REG_a
") \n\t"
1748 "movd %%mm1, (%2, %%"REG_a
") \n\t"
1749 "add $4, %%"REG_a
" \n\t"
1751 : : "g" (-width
), "r" (src1
+width
*4), "r" (dstU
+width
), "r" (dstV
+width
)
1756 for (i
=0; i
<width
; i
++)
1758 dstU
[i
]= src1
[4*i
+ 1];
1759 dstV
[i
]= src1
[4*i
+ 3];
1762 assert(src1
== src2
);
1765 //this is allmost identical to the previous, end exists only cuz yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses
1766 static inline void RENAME(uyvyToY
)(uint8_t *dst
, uint8_t *src
, long width
)
1770 "mov %0, %%"REG_a
" \n\t"
1772 "movq (%1, %%"REG_a
",2), %%mm0 \n\t"
1773 "movq 8(%1, %%"REG_a
",2), %%mm1 \n\t"
1774 "psrlw $8, %%mm0 \n\t"
1775 "psrlw $8, %%mm1 \n\t"
1776 "packuswb %%mm1, %%mm0 \n\t"
1777 "movq %%mm0, (%2, %%"REG_a
") \n\t"
1778 "add $8, %%"REG_a
" \n\t"
1780 : : "g" (-width
), "r" (src
+width
*2), "r" (dst
+width
)
1785 for (i
=0; i
<width
; i
++)
1790 static inline void RENAME(uyvyToUV
)(uint8_t *dstU
, uint8_t *dstV
, uint8_t *src1
, uint8_t *src2
, long width
)
1794 "movq "MANGLE(bm01010101
)", %%mm4 \n\t"
1795 "mov %0, %%"REG_a
" \n\t"
1797 "movq (%1, %%"REG_a
",4), %%mm0 \n\t"
1798 "movq 8(%1, %%"REG_a
",4), %%mm1 \n\t"
1799 "pand %%mm4, %%mm0 \n\t"
1800 "pand %%mm4, %%mm1 \n\t"
1801 "packuswb %%mm1, %%mm0 \n\t"
1802 "movq %%mm0, %%mm1 \n\t"
1803 "psrlw $8, %%mm0 \n\t"
1804 "pand %%mm4, %%mm1 \n\t"
1805 "packuswb %%mm0, %%mm0 \n\t"
1806 "packuswb %%mm1, %%mm1 \n\t"
1807 "movd %%mm0, (%3, %%"REG_a
") \n\t"
1808 "movd %%mm1, (%2, %%"REG_a
") \n\t"
1809 "add $4, %%"REG_a
" \n\t"
1811 : : "g" (-width
), "r" (src1
+width
*4), "r" (dstU
+width
), "r" (dstV
+width
)
1816 for (i
=0; i
<width
; i
++)
1818 dstU
[i
]= src1
[4*i
+ 0];
1819 dstV
[i
]= src1
[4*i
+ 2];
1822 assert(src1
== src2
);
1825 static inline void RENAME(bgr32ToY
)(uint8_t *dst
, uint8_t *src
, int width
)
1828 for (i
=0; i
<width
; i
++)
1830 int b
= ((uint32_t*)src
)[i
]&0xFF;
1831 int g
= (((uint32_t*)src
)[i
]>>8)&0xFF;
1832 int r
= (((uint32_t*)src
)[i
]>>16)&0xFF;
1834 dst
[i
]= ((RY
*r
+ GY
*g
+ BY
*b
+ (33<<(RGB2YUV_SHIFT
-1)) )>>RGB2YUV_SHIFT
);
1838 static inline void RENAME(bgr32ToUV
)(uint8_t *dstU
, uint8_t *dstV
, uint8_t *src1
, uint8_t *src2
, int width
)
1841 assert(src1
== src2
);
1842 for (i
=0; i
<width
; i
++)
1844 const int a
= ((uint32_t*)src1
)[2*i
+0];
1845 const int e
= ((uint32_t*)src1
)[2*i
+1];
1846 const int l
= (a
&0xFF00FF) + (e
&0xFF00FF);
1847 const int h
= (a
&0x00FF00) + (e
&0x00FF00);
1848 const int b
= l
&0x3FF;
1852 dstU
[i
]= ((RU
*r
+ GU
*g
+ BU
*b
)>>(RGB2YUV_SHIFT
+1)) + 128;
1853 dstV
[i
]= ((RV
*r
+ GV
*g
+ BV
*b
)>>(RGB2YUV_SHIFT
+1)) + 128;
1857 static inline void RENAME(bgr24ToY
)(uint8_t *dst
, uint8_t *src
, long width
)
1861 "mov %2, %%"REG_a
" \n\t"
1862 "movq "MANGLE(bgr2YCoeff
)", %%mm6 \n\t"
1863 "movq "MANGLE(w1111
)", %%mm5 \n\t"
1864 "pxor %%mm7, %%mm7 \n\t"
1865 "lea (%%"REG_a
", %%"REG_a
", 2), %%"REG_d
" \n\t"
1868 PREFETCH
" 64(%0, %%"REG_d
") \n\t"
1869 "movd (%0, %%"REG_d
"), %%mm0 \n\t"
1870 "movd 3(%0, %%"REG_d
"), %%mm1 \n\t"
1871 "punpcklbw %%mm7, %%mm0 \n\t"
1872 "punpcklbw %%mm7, %%mm1 \n\t"
1873 "movd 6(%0, %%"REG_d
"), %%mm2 \n\t"
1874 "movd 9(%0, %%"REG_d
"), %%mm3 \n\t"
1875 "punpcklbw %%mm7, %%mm2 \n\t"
1876 "punpcklbw %%mm7, %%mm3 \n\t"
1877 "pmaddwd %%mm6, %%mm0 \n\t"
1878 "pmaddwd %%mm6, %%mm1 \n\t"
1879 "pmaddwd %%mm6, %%mm2 \n\t"
1880 "pmaddwd %%mm6, %%mm3 \n\t"
1881 #ifndef FAST_BGR2YV12
1882 "psrad $8, %%mm0 \n\t"
1883 "psrad $8, %%mm1 \n\t"
1884 "psrad $8, %%mm2 \n\t"
1885 "psrad $8, %%mm3 \n\t"
1887 "packssdw %%mm1, %%mm0 \n\t"
1888 "packssdw %%mm3, %%mm2 \n\t"
1889 "pmaddwd %%mm5, %%mm0 \n\t"
1890 "pmaddwd %%mm5, %%mm2 \n\t"
1891 "packssdw %%mm2, %%mm0 \n\t"
1892 "psraw $7, %%mm0 \n\t"
1894 "movd 12(%0, %%"REG_d
"), %%mm4 \n\t"
1895 "movd 15(%0, %%"REG_d
"), %%mm1 \n\t"
1896 "punpcklbw %%mm7, %%mm4 \n\t"
1897 "punpcklbw %%mm7, %%mm1 \n\t"
1898 "movd 18(%0, %%"REG_d
"), %%mm2 \n\t"
1899 "movd 21(%0, %%"REG_d
"), %%mm3 \n\t"
1900 "punpcklbw %%mm7, %%mm2 \n\t"
1901 "punpcklbw %%mm7, %%mm3 \n\t"
1902 "pmaddwd %%mm6, %%mm4 \n\t"
1903 "pmaddwd %%mm6, %%mm1 \n\t"
1904 "pmaddwd %%mm6, %%mm2 \n\t"
1905 "pmaddwd %%mm6, %%mm3 \n\t"
1906 #ifndef FAST_BGR2YV12
1907 "psrad $8, %%mm4 \n\t"
1908 "psrad $8, %%mm1 \n\t"
1909 "psrad $8, %%mm2 \n\t"
1910 "psrad $8, %%mm3 \n\t"
1912 "packssdw %%mm1, %%mm4 \n\t"
1913 "packssdw %%mm3, %%mm2 \n\t"
1914 "pmaddwd %%mm5, %%mm4 \n\t"
1915 "pmaddwd %%mm5, %%mm2 \n\t"
1916 "add $24, %%"REG_d
" \n\t"
1917 "packssdw %%mm2, %%mm4 \n\t"
1918 "psraw $7, %%mm4 \n\t"
1920 "packuswb %%mm4, %%mm0 \n\t"
1921 "paddusb "MANGLE(bgr2YOffset
)", %%mm0 \n\t"
1923 "movq %%mm0, (%1, %%"REG_a
") \n\t"
1924 "add $8, %%"REG_a
" \n\t"
1926 : : "r" (src
+width
*3), "r" (dst
+width
), "g" (-width
)
1927 : "%"REG_a
, "%"REG_d
1931 for (i
=0; i
<width
; i
++)
1937 dst
[i
]= ((RY
*r
+ GY
*g
+ BY
*b
+ (33<<(RGB2YUV_SHIFT
-1)) )>>RGB2YUV_SHIFT
);
1939 #endif /* HAVE_MMX */
1942 static inline void RENAME(bgr24ToUV
)(uint8_t *dstU
, uint8_t *dstV
, uint8_t *src1
, uint8_t *src2
, long width
)
1946 "mov %3, %%"REG_a
" \n\t"
1947 "movq "MANGLE(w1111
)", %%mm5 \n\t"
1948 "movq "MANGLE(bgr2UCoeff
)", %%mm6 \n\t"
1949 "pxor %%mm7, %%mm7 \n\t"
1950 "lea (%%"REG_a
", %%"REG_a
", 2), %%"REG_d
" \n\t"
1951 "add %%"REG_d
", %%"REG_d
" \n\t"
1954 PREFETCH
" 64(%0, %%"REG_d
") \n\t"
1955 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1956 "movq (%0, %%"REG_d
"), %%mm0 \n\t"
1957 "movq 6(%0, %%"REG_d
"), %%mm2 \n\t"
1958 "movq %%mm0, %%mm1 \n\t"
1959 "movq %%mm2, %%mm3 \n\t"
1960 "psrlq $24, %%mm0 \n\t"
1961 "psrlq $24, %%mm2 \n\t"
1964 "punpcklbw %%mm7, %%mm0 \n\t"
1965 "punpcklbw %%mm7, %%mm2 \n\t"
1967 "movd (%0, %%"REG_d
"), %%mm0 \n\t"
1968 "movd 3(%0, %%"REG_d
"), %%mm2 \n\t"
1969 "punpcklbw %%mm7, %%mm0 \n\t"
1970 "punpcklbw %%mm7, %%mm2 \n\t"
1971 "paddw %%mm2, %%mm0 \n\t"
1972 "movd 6(%0, %%"REG_d
"), %%mm4 \n\t"
1973 "movd 9(%0, %%"REG_d
"), %%mm2 \n\t"
1974 "punpcklbw %%mm7, %%mm4 \n\t"
1975 "punpcklbw %%mm7, %%mm2 \n\t"
1976 "paddw %%mm4, %%mm2 \n\t"
1977 "psrlw $1, %%mm0 \n\t"
1978 "psrlw $1, %%mm2 \n\t"
1980 "movq "MANGLE(bgr2VCoeff
)", %%mm1 \n\t"
1981 "movq "MANGLE(bgr2VCoeff
)", %%mm3 \n\t"
1983 "pmaddwd %%mm0, %%mm1 \n\t"
1984 "pmaddwd %%mm2, %%mm3 \n\t"
1985 "pmaddwd %%mm6, %%mm0 \n\t"
1986 "pmaddwd %%mm6, %%mm2 \n\t"
1987 #ifndef FAST_BGR2YV12
1988 "psrad $8, %%mm0 \n\t"
1989 "psrad $8, %%mm1 \n\t"
1990 "psrad $8, %%mm2 \n\t"
1991 "psrad $8, %%mm3 \n\t"
1993 "packssdw %%mm2, %%mm0 \n\t"
1994 "packssdw %%mm3, %%mm1 \n\t"
1995 "pmaddwd %%mm5, %%mm0 \n\t"
1996 "pmaddwd %%mm5, %%mm1 \n\t"
1997 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
1998 "psraw $7, %%mm0 \n\t"
2000 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2001 "movq 12(%0, %%"REG_d
"), %%mm4 \n\t"
2002 "movq 18(%0, %%"REG_d
"), %%mm2 \n\t"
2003 "movq %%mm4, %%mm1 \n\t"
2004 "movq %%mm2, %%mm3 \n\t"
2005 "psrlq $24, %%mm4 \n\t"
2006 "psrlq $24, %%mm2 \n\t"
2009 "punpcklbw %%mm7, %%mm4 \n\t"
2010 "punpcklbw %%mm7, %%mm2 \n\t"
2012 "movd 12(%0, %%"REG_d
"), %%mm4 \n\t"
2013 "movd 15(%0, %%"REG_d
"), %%mm2 \n\t"
2014 "punpcklbw %%mm7, %%mm4 \n\t"
2015 "punpcklbw %%mm7, %%mm2 \n\t"
2016 "paddw %%mm2, %%mm4 \n\t"
2017 "movd 18(%0, %%"REG_d
"), %%mm5 \n\t"
2018 "movd 21(%0, %%"REG_d
"), %%mm2 \n\t"
2019 "punpcklbw %%mm7, %%mm5 \n\t"
2020 "punpcklbw %%mm7, %%mm2 \n\t"
2021 "paddw %%mm5, %%mm2 \n\t"
2022 "movq "MANGLE(w1111
)", %%mm5 \n\t"
2023 "psrlw $2, %%mm4 \n\t"
2024 "psrlw $2, %%mm2 \n\t"
2026 "movq "MANGLE(bgr2VCoeff
)", %%mm1 \n\t"
2027 "movq "MANGLE(bgr2VCoeff
)", %%mm3 \n\t"
2029 "pmaddwd %%mm4, %%mm1 \n\t"
2030 "pmaddwd %%mm2, %%mm3 \n\t"
2031 "pmaddwd %%mm6, %%mm4 \n\t"
2032 "pmaddwd %%mm6, %%mm2 \n\t"
2033 #ifndef FAST_BGR2YV12
2034 "psrad $8, %%mm4 \n\t"
2035 "psrad $8, %%mm1 \n\t"
2036 "psrad $8, %%mm2 \n\t"
2037 "psrad $8, %%mm3 \n\t"
2039 "packssdw %%mm2, %%mm4 \n\t"
2040 "packssdw %%mm3, %%mm1 \n\t"
2041 "pmaddwd %%mm5, %%mm4 \n\t"
2042 "pmaddwd %%mm5, %%mm1 \n\t"
2043 "add $24, %%"REG_d
" \n\t"
2044 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2045 "psraw $7, %%mm4 \n\t"
2047 "movq %%mm0, %%mm1 \n\t"
2048 "punpckldq %%mm4, %%mm0 \n\t"
2049 "punpckhdq %%mm4, %%mm1 \n\t"
2050 "packsswb %%mm1, %%mm0 \n\t"
2051 "paddb "MANGLE(bgr2UVOffset
)", %%mm0 \n\t"
2053 "movd %%mm0, (%1, %%"REG_a
") \n\t"
2054 "punpckhdq %%mm0, %%mm0 \n\t"
2055 "movd %%mm0, (%2, %%"REG_a
") \n\t"
2056 "add $4, %%"REG_a
" \n\t"
2058 : : "r" (src1
+width
*6), "r" (dstU
+width
), "r" (dstV
+width
), "g" (-width
)
2059 : "%"REG_a
, "%"REG_d
2063 for (i
=0; i
<width
; i
++)
2065 int b
= src1
[6*i
+ 0] + src1
[6*i
+ 3];
2066 int g
= src1
[6*i
+ 1] + src1
[6*i
+ 4];
2067 int r
= src1
[6*i
+ 2] + src1
[6*i
+ 5];
2069 dstU
[i
]= ((RU
*r
+ GU
*g
+ BU
*b
)>>(RGB2YUV_SHIFT
+1)) + 128;
2070 dstV
[i
]= ((RV
*r
+ GV
*g
+ BV
*b
)>>(RGB2YUV_SHIFT
+1)) + 128;
2072 #endif /* HAVE_MMX */
2073 assert(src1
== src2
);
2076 static inline void RENAME(rgb16ToY
)(uint8_t *dst
, uint8_t *src
, int width
)
2079 for (i
=0; i
<width
; i
++)
2081 int d
= ((uint16_t*)src
)[i
];
2084 int r
= (d
>>11)&0x1F;
2086 dst
[i
]= ((2*RY
*r
+ GY
*g
+ 2*BY
*b
)>>(RGB2YUV_SHIFT
-2)) + 16;
2090 static inline void RENAME(rgb16ToUV
)(uint8_t *dstU
, uint8_t *dstV
, uint8_t *src1
, uint8_t *src2
, int width
)
2094 for (i
=0; i
<width
; i
++)
2096 int d0
= ((uint32_t*)src1
)[i
];
2098 int dl
= (d0
&0x07E0F81F);
2099 int dh
= ((d0
>>5)&0x07C0F83F);
2101 int dh2
= (dh
>>11) + (dh
<<21);
2105 int r
= (d
>>11)&0x7F;
2107 dstU
[i
]= ((2*RU
*r
+ GU
*g
+ 2*BU
*b
)>>(RGB2YUV_SHIFT
+1-2)) + 128;
2108 dstV
[i
]= ((2*RV
*r
+ GV
*g
+ 2*BV
*b
)>>(RGB2YUV_SHIFT
+1-2)) + 128;
2112 static inline void RENAME(rgb15ToY
)(uint8_t *dst
, uint8_t *src
, int width
)
2115 for (i
=0; i
<width
; i
++)
2117 int d
= ((uint16_t*)src
)[i
];
2120 int r
= (d
>>10)&0x1F;
2122 dst
[i
]= ((RY
*r
+ GY
*g
+ BY
*b
)>>(RGB2YUV_SHIFT
-3)) + 16;
2126 static inline void RENAME(rgb15ToUV
)(uint8_t *dstU
, uint8_t *dstV
, uint8_t *src1
, uint8_t *src2
, int width
)
2130 for (i
=0; i
<width
; i
++)
2132 int d0
= ((uint32_t*)src1
)[i
];
2134 int dl
= (d0
&0x03E07C1F);
2135 int dh
= ((d0
>>5)&0x03E0F81F);
2137 int dh2
= (dh
>>11) + (dh
<<21);
2141 int r
= (d
>>10)&0x7F;
2143 dstU
[i
]= ((RU
*r
+ GU
*g
+ BU
*b
)>>(RGB2YUV_SHIFT
+1-3)) + 128;
2144 dstV
[i
]= ((RV
*r
+ GV
*g
+ BV
*b
)>>(RGB2YUV_SHIFT
+1-3)) + 128;
2149 static inline void RENAME(rgb32ToY
)(uint8_t *dst
, uint8_t *src
, int width
)
2152 for (i
=0; i
<width
; i
++)
2154 int r
= ((uint32_t*)src
)[i
]&0xFF;
2155 int g
= (((uint32_t*)src
)[i
]>>8)&0xFF;
2156 int b
= (((uint32_t*)src
)[i
]>>16)&0xFF;
2158 dst
[i
]= ((RY
*r
+ GY
*g
+ BY
*b
+ (33<<(RGB2YUV_SHIFT
-1)) )>>RGB2YUV_SHIFT
);
2162 static inline void RENAME(rgb32ToUV
)(uint8_t *dstU
, uint8_t *dstV
, uint8_t *src1
, uint8_t *src2
, int width
)
2166 for (i
=0; i
<width
; i
++)
2168 const int a
= ((uint32_t*)src1
)[2*i
+0];
2169 const int e
= ((uint32_t*)src1
)[2*i
+1];
2170 const int l
= (a
&0xFF00FF) + (e
&0xFF00FF);
2171 const int h
= (a
&0x00FF00) + (e
&0x00FF00);
2172 const int r
= l
&0x3FF;
2176 dstU
[i
]= ((RU
*r
+ GU
*g
+ BU
*b
)>>(RGB2YUV_SHIFT
+1)) + 128;
2177 dstV
[i
]= ((RV
*r
+ GV
*g
+ BV
*b
)>>(RGB2YUV_SHIFT
+1)) + 128;
2181 static inline void RENAME(rgb24ToY
)(uint8_t *dst
, uint8_t *src
, int width
)
2184 for (i
=0; i
<width
; i
++)
2190 dst
[i
]= ((RY
*r
+ GY
*g
+ BY
*b
+ (33<<(RGB2YUV_SHIFT
-1)) )>>RGB2YUV_SHIFT
);
2194 static inline void RENAME(rgb24ToUV
)(uint8_t *dstU
, uint8_t *dstV
, uint8_t *src1
, uint8_t *src2
, int width
)
2198 for (i
=0; i
<width
; i
++)
2200 int r
= src1
[6*i
+ 0] + src1
[6*i
+ 3];
2201 int g
= src1
[6*i
+ 1] + src1
[6*i
+ 4];
2202 int b
= src1
[6*i
+ 2] + src1
[6*i
+ 5];
2204 dstU
[i
]= ((RU
*r
+ GU
*g
+ BU
*b
)>>(RGB2YUV_SHIFT
+1)) + 128;
2205 dstV
[i
]= ((RV
*r
+ GV
*g
+ BV
*b
)>>(RGB2YUV_SHIFT
+1)) + 128;
2209 static inline void RENAME(bgr16ToY
)(uint8_t *dst
, uint8_t *src
, int width
)
2212 for (i
=0; i
<width
; i
++)
2214 int d
= ((uint16_t*)src
)[i
];
2217 int b
= (d
>>11)&0x1F;
2219 dst
[i
]= ((2*RY
*r
+ GY
*g
+ 2*BY
*b
)>>(RGB2YUV_SHIFT
-2)) + 16;
2223 static inline void RENAME(bgr16ToUV
)(uint8_t *dstU
, uint8_t *dstV
, uint8_t *src1
, uint8_t *src2
, int width
)
2226 assert(src1
== src2
);
2227 for (i
=0; i
<width
; i
++)
2229 int d0
= ((uint32_t*)src1
)[i
];
2231 int dl
= (d0
&0x07E0F81F);
2232 int d
= dl
+ (((d0
>>16) + (d0
<<16))&0x07E0F81F);
2235 int b
= (d
>>11)&0x3F;
2237 dstU
[i
]= ((2*RU
*r
+ GU
*g
+ 2*BU
*b
)>>(RGB2YUV_SHIFT
+1-2)) + 128;
2238 dstV
[i
]= ((2*RV
*r
+ GV
*g
+ 2*BV
*b
)>>(RGB2YUV_SHIFT
+1-2)) + 128;
2242 static inline void RENAME(bgr15ToY
)(uint8_t *dst
, uint8_t *src
, int width
)
2245 for (i
=0; i
<width
; i
++)
2247 int d
= ((uint16_t*)src
)[i
];
2250 int b
= (d
>>10)&0x1F;
2252 dst
[i
]= ((RY
*r
+ GY
*g
+ BY
*b
)>>(RGB2YUV_SHIFT
-3)) + 16;
2256 static inline void RENAME(bgr15ToUV
)(uint8_t *dstU
, uint8_t *dstV
, uint8_t *src1
, uint8_t *src2
, int width
)
2259 assert(src1
== src2
);
2260 for (i
=0; i
<width
; i
++)
2262 int d0
= ((uint32_t*)src1
)[i
];
2264 int dl
= (d0
&0x03E07C1F);
2265 int d
= dl
+ (((d0
>>16) + (d0
<<16))&0x03E07C1F);
2268 int b
= (d
>>10)&0x3F;
2270 dstU
[i
]= ((RU
*r
+ GU
*g
+ BU
*b
)>>(RGB2YUV_SHIFT
+1-3)) + 128;
2271 dstV
[i
]= ((RV
*r
+ GV
*g
+ BV
*b
)>>(RGB2YUV_SHIFT
+1-3)) + 128;
2275 static inline void RENAME(palToY
)(uint8_t *dst
, uint8_t *src
, int width
, uint32_t *pal
)
2278 for (i
=0; i
<width
; i
++)
2282 dst
[i
]= pal
[d
] & 0xFF;
2286 static inline void RENAME(palToUV
)(uint8_t *dstU
, uint8_t *dstV
, uint8_t *src1
, uint8_t *src2
, int width
, uint32_t *pal
)
2289 assert(src1
== src2
);
2290 for (i
=0; i
<width
; i
++)
2292 int p
= pal
[src1
[i
]];
2299 // Bilinear / Bicubic scaling
2300 static inline void RENAME(hScale
)(int16_t *dst
, int dstW
, uint8_t *src
, int srcW
, int xInc
,
2301 int16_t *filter
, int16_t *filterPos
, long filterSize
)
2304 assert(filterSize
% 4 == 0 && filterSize
>0);
2305 if (filterSize
==4) // Always true for upscaling, sometimes for down, too.
2307 long counter
= -2*dstW
;
2309 filterPos
-= counter
/2;
2313 "push %%"REG_b
" \n\t"
2315 "pxor %%mm7, %%mm7 \n\t"
2316 "movq "MANGLE(w02
)", %%mm6 \n\t"
2317 "push %%"REG_BP
" \n\t" // we use 7 regs here ...
2318 "mov %%"REG_a
", %%"REG_BP
" \n\t"
2321 "movzwl (%2, %%"REG_BP
"), %%eax \n\t"
2322 "movzwl 2(%2, %%"REG_BP
"), %%ebx \n\t"
2323 "movq (%1, %%"REG_BP
", 4), %%mm1 \n\t"
2324 "movq 8(%1, %%"REG_BP
", 4), %%mm3 \n\t"
2325 "movd (%3, %%"REG_a
"), %%mm0 \n\t"
2326 "movd (%3, %%"REG_b
"), %%mm2 \n\t"
2327 "punpcklbw %%mm7, %%mm0 \n\t"
2328 "punpcklbw %%mm7, %%mm2 \n\t"
2329 "pmaddwd %%mm1, %%mm0 \n\t"
2330 "pmaddwd %%mm2, %%mm3 \n\t"
2331 "psrad $8, %%mm0 \n\t"
2332 "psrad $8, %%mm3 \n\t"
2333 "packssdw %%mm3, %%mm0 \n\t"
2334 "pmaddwd %%mm6, %%mm0 \n\t"
2335 "packssdw %%mm0, %%mm0 \n\t"
2336 "movd %%mm0, (%4, %%"REG_BP
") \n\t"
2337 "add $4, %%"REG_BP
" \n\t"
2340 "pop %%"REG_BP
" \n\t"
2342 "pop %%"REG_b
" \n\t"
2345 : "c" (filter
), "d" (filterPos
), "S" (src
), "D" (dst
)
2351 else if (filterSize
==8)
2353 long counter
= -2*dstW
;
2355 filterPos
-= counter
/2;
2359 "push %%"REG_b
" \n\t"
2361 "pxor %%mm7, %%mm7 \n\t"
2362 "movq "MANGLE(w02
)", %%mm6 \n\t"
2363 "push %%"REG_BP
" \n\t" // we use 7 regs here ...
2364 "mov %%"REG_a
", %%"REG_BP
" \n\t"
2367 "movzwl (%2, %%"REG_BP
"), %%eax \n\t"
2368 "movzwl 2(%2, %%"REG_BP
"), %%ebx \n\t"
2369 "movq (%1, %%"REG_BP
", 8), %%mm1 \n\t"
2370 "movq 16(%1, %%"REG_BP
", 8), %%mm3 \n\t"
2371 "movd (%3, %%"REG_a
"), %%mm0 \n\t"
2372 "movd (%3, %%"REG_b
"), %%mm2 \n\t"
2373 "punpcklbw %%mm7, %%mm0 \n\t"
2374 "punpcklbw %%mm7, %%mm2 \n\t"
2375 "pmaddwd %%mm1, %%mm0 \n\t"
2376 "pmaddwd %%mm2, %%mm3 \n\t"
2378 "movq 8(%1, %%"REG_BP
", 8), %%mm1 \n\t"
2379 "movq 24(%1, %%"REG_BP
", 8), %%mm5 \n\t"
2380 "movd 4(%3, %%"REG_a
"), %%mm4 \n\t"
2381 "movd 4(%3, %%"REG_b
"), %%mm2 \n\t"
2382 "punpcklbw %%mm7, %%mm4 \n\t"
2383 "punpcklbw %%mm7, %%mm2 \n\t"
2384 "pmaddwd %%mm1, %%mm4 \n\t"
2385 "pmaddwd %%mm2, %%mm5 \n\t"
2386 "paddd %%mm4, %%mm0 \n\t"
2387 "paddd %%mm5, %%mm3 \n\t"
2389 "psrad $8, %%mm0 \n\t"
2390 "psrad $8, %%mm3 \n\t"
2391 "packssdw %%mm3, %%mm0 \n\t"
2392 "pmaddwd %%mm6, %%mm0 \n\t"
2393 "packssdw %%mm0, %%mm0 \n\t"
2394 "movd %%mm0, (%4, %%"REG_BP
") \n\t"
2395 "add $4, %%"REG_BP
" \n\t"
2398 "pop %%"REG_BP
" \n\t"
2400 "pop %%"REG_b
" \n\t"
2403 : "c" (filter
), "d" (filterPos
), "S" (src
), "D" (dst
)
2411 uint8_t *offset
= src
+filterSize
;
2412 long counter
= -2*dstW
;
2413 //filter-= counter*filterSize/2;
2414 filterPos
-= counter
/2;
2417 "pxor %%mm7, %%mm7 \n\t"
2418 "movq "MANGLE(w02
)", %%mm6 \n\t"
2421 "mov %2, %%"REG_c
" \n\t"
2422 "movzwl (%%"REG_c
", %0), %%eax \n\t"
2423 "movzwl 2(%%"REG_c
", %0), %%edx \n\t"
2424 "mov %5, %%"REG_c
" \n\t"
2425 "pxor %%mm4, %%mm4 \n\t"
2426 "pxor %%mm5, %%mm5 \n\t"
2428 "movq (%1), %%mm1 \n\t"
2429 "movq (%1, %6), %%mm3 \n\t"
2430 "movd (%%"REG_c
", %%"REG_a
"), %%mm0 \n\t"
2431 "movd (%%"REG_c
", %%"REG_d
"), %%mm2 \n\t"
2432 "punpcklbw %%mm7, %%mm0 \n\t"
2433 "punpcklbw %%mm7, %%mm2 \n\t"
2434 "pmaddwd %%mm1, %%mm0 \n\t"
2435 "pmaddwd %%mm2, %%mm3 \n\t"
2436 "paddd %%mm3, %%mm5 \n\t"
2437 "paddd %%mm0, %%mm4 \n\t"
2439 "add $4, %%"REG_c
" \n\t"
2440 "cmp %4, %%"REG_c
" \n\t"
2443 "psrad $8, %%mm4 \n\t"
2444 "psrad $8, %%mm5 \n\t"
2445 "packssdw %%mm5, %%mm4 \n\t"
2446 "pmaddwd %%mm6, %%mm4 \n\t"
2447 "packssdw %%mm4, %%mm4 \n\t"
2448 "mov %3, %%"REG_a
" \n\t"
2449 "movd %%mm4, (%%"REG_a
", %0) \n\t"
2453 : "+r" (counter
), "+r" (filter
)
2454 : "m" (filterPos
), "m" (dst
), "m"(offset
),
2455 "m" (src
), "r" (filterSize
*2)
2456 : "%"REG_a
, "%"REG_c
, "%"REG_d
2461 hScale_altivec_real(dst
, dstW
, src
, srcW
, xInc
, filter
, filterPos
, filterSize
);
2464 for (i
=0; i
<dstW
; i
++)
2467 int srcPos
= filterPos
[i
];
2469 //printf("filterPos: %d\n", filterPos[i]);
2470 for (j
=0; j
<filterSize
; j
++)
2472 //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2473 val
+= ((int)src
[srcPos
+ j
])*filter
[filterSize
*i
+ j
];
2475 //filter += hFilterSize;
2476 dst
[i
] = av_clip(val
>>7, 0, (1<<15)-1); // the cubic equation does overflow ...
2479 #endif /* HAVE_ALTIVEC */
2480 #endif /* HAVE_MMX */
2482 // *** horizontal scale Y line to temp buffer
2483 static inline void RENAME(hyscale
)(uint16_t *dst
, long dstWidth
, uint8_t *src
, int srcW
, int xInc
,
2484 int flags
, int canMMX2BeUsed
, int16_t *hLumFilter
,
2485 int16_t *hLumFilterPos
, int hLumFilterSize
, void *funnyYCode
,
2486 int srcFormat
, uint8_t *formatConvBuffer
, int16_t *mmx2Filter
,
2487 int32_t *mmx2FilterPos
, uint8_t *pal
)
2489 if (srcFormat
==PIX_FMT_YUYV422
|| srcFormat
==PIX_FMT_GRAY16BE
)
2491 RENAME(yuy2ToY
)(formatConvBuffer
, src
, srcW
);
2492 src
= formatConvBuffer
;
2494 else if (srcFormat
==PIX_FMT_UYVY422
|| srcFormat
==PIX_FMT_GRAY16LE
)
2496 RENAME(uyvyToY
)(formatConvBuffer
, src
, srcW
);
2497 src
= formatConvBuffer
;
2499 else if (srcFormat
==PIX_FMT_RGB32
)
2501 RENAME(bgr32ToY
)(formatConvBuffer
, src
, srcW
);
2502 src
= formatConvBuffer
;
2504 else if (srcFormat
==PIX_FMT_BGR24
)
2506 RENAME(bgr24ToY
)(formatConvBuffer
, src
, srcW
);
2507 src
= formatConvBuffer
;
2509 else if (srcFormat
==PIX_FMT_BGR565
)
2511 RENAME(bgr16ToY
)(formatConvBuffer
, src
, srcW
);
2512 src
= formatConvBuffer
;
2514 else if (srcFormat
==PIX_FMT_BGR555
)
2516 RENAME(bgr15ToY
)(formatConvBuffer
, src
, srcW
);
2517 src
= formatConvBuffer
;
2519 else if (srcFormat
==PIX_FMT_BGR32
)
2521 RENAME(rgb32ToY
)(formatConvBuffer
, src
, srcW
);
2522 src
= formatConvBuffer
;
2524 else if (srcFormat
==PIX_FMT_RGB24
)
2526 RENAME(rgb24ToY
)(formatConvBuffer
, src
, srcW
);
2527 src
= formatConvBuffer
;
2529 else if (srcFormat
==PIX_FMT_RGB565
)
2531 RENAME(rgb16ToY
)(formatConvBuffer
, src
, srcW
);
2532 src
= formatConvBuffer
;
2534 else if (srcFormat
==PIX_FMT_RGB555
)
2536 RENAME(rgb15ToY
)(formatConvBuffer
, src
, srcW
);
2537 src
= formatConvBuffer
;
2539 else if (srcFormat
==PIX_FMT_RGB8
|| srcFormat
==PIX_FMT_BGR8
|| srcFormat
==PIX_FMT_PAL8
|| srcFormat
==PIX_FMT_BGR4_BYTE
|| srcFormat
==PIX_FMT_RGB4_BYTE
)
2541 RENAME(palToY
)(formatConvBuffer
, src
, srcW
, pal
);
2542 src
= formatConvBuffer
;
2546 // use the new MMX scaler if the mmx2 can't be used (it is faster than the x86 ASM one)
2547 if (!(flags
&SWS_FAST_BILINEAR
) || (!canMMX2BeUsed
))
2549 if (!(flags
&SWS_FAST_BILINEAR
))
2552 RENAME(hScale
)(dst
, dstWidth
, src
, srcW
, xInc
, hLumFilter
, hLumFilterPos
, hLumFilterSize
);
2554 else // Fast Bilinear upscale / crap downscale
2556 #if defined(ARCH_X86)
2560 uint64_t ebxsave
__attribute__((aligned(8)));
2566 "mov %%"REG_b
", %5 \n\t"
2568 "pxor %%mm7, %%mm7 \n\t"
2569 "mov %0, %%"REG_c
" \n\t"
2570 "mov %1, %%"REG_D
" \n\t"
2571 "mov %2, %%"REG_d
" \n\t"
2572 "mov %3, %%"REG_b
" \n\t"
2573 "xor %%"REG_a
", %%"REG_a
" \n\t" // i
2574 PREFETCH
" (%%"REG_c
") \n\t"
2575 PREFETCH
" 32(%%"REG_c
") \n\t"
2576 PREFETCH
" 64(%%"REG_c
") \n\t"
2580 #define FUNNY_Y_CODE \
2581 "movl (%%"REG_b"), %%esi \n\t"\
2583 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2584 "add %%"REG_S", %%"REG_c" \n\t"\
2585 "add %%"REG_a", %%"REG_D" \n\t"\
2586 "xor %%"REG_a", %%"REG_a" \n\t"\
2590 #define FUNNY_Y_CODE \
2591 "movl (%%"REG_b"), %%esi \n\t"\
2593 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2594 "add %%"REG_a", %%"REG_D" \n\t"\
2595 "xor %%"REG_a", %%"REG_a" \n\t"\
2597 #endif /* ARCH_X86_64 */
2609 "mov %5, %%"REG_b
" \n\t"
2611 :: "m" (src
), "m" (dst
), "m" (mmx2Filter
), "m" (mmx2FilterPos
),
2616 : "%"REG_a
, "%"REG_c
, "%"REG_d
, "%"REG_S
, "%"REG_D
2621 for (i
=dstWidth
-1; (i
*xInc
)>>16 >=srcW
-1; i
--) dst
[i
] = src
[srcW
-1]*128;
2625 #endif /* HAVE_MMX2 */
2626 long xInc_shr16
= xInc
>> 16;
2627 uint16_t xInc_mask
= xInc
& 0xffff;
2628 //NO MMX just normal asm ...
2630 "xor %%"REG_a
", %%"REG_a
" \n\t" // i
2631 "xor %%"REG_d
", %%"REG_d
" \n\t" // xx
2632 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2635 "movzbl (%0, %%"REG_d
"), %%edi \n\t" //src[xx]
2636 "movzbl 1(%0, %%"REG_d
"), %%esi \n\t" //src[xx+1]
2637 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2638 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2639 "shll $16, %%edi \n\t"
2640 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2641 "mov %1, %%"REG_D
" \n\t"
2642 "shrl $9, %%esi \n\t"
2643 "movw %%si, (%%"REG_D
", %%"REG_a
", 2) \n\t"
2644 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2645 "adc %3, %%"REG_d
" \n\t" //xx+= xInc>>8 + carry
2647 "movzbl (%0, %%"REG_d
"), %%edi \n\t" //src[xx]
2648 "movzbl 1(%0, %%"REG_d
"), %%esi \n\t" //src[xx+1]
2649 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2650 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2651 "shll $16, %%edi \n\t"
2652 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2653 "mov %1, %%"REG_D
" \n\t"
2654 "shrl $9, %%esi \n\t"
2655 "movw %%si, 2(%%"REG_D
", %%"REG_a
", 2) \n\t"
2656 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2657 "adc %3, %%"REG_d
" \n\t" //xx+= xInc>>8 + carry
2660 "add $2, %%"REG_a
" \n\t"
2661 "cmp %2, %%"REG_a
" \n\t"
2665 :: "r" (src
), "m" (dst
), "m" (dstWidth
), "m" (xInc_shr16
), "m" (xInc_mask
)
2666 : "%"REG_a
, "%"REG_d
, "%ecx", "%"REG_D
, "%esi"
2669 } //if MMX2 can't be used
2673 unsigned int xpos
=0;
2674 for (i
=0;i
<dstWidth
;i
++)
2676 register unsigned int xx
=xpos
>>16;
2677 register unsigned int xalpha
=(xpos
&0xFFFF)>>9;
2678 dst
[i
]= (src
[xx
]<<7) + (src
[xx
+1] - src
[xx
])*xalpha
;
2681 #endif /* defined(ARCH_X86) */
2685 inline static void RENAME(hcscale
)(uint16_t *dst
, long dstWidth
, uint8_t *src1
, uint8_t *src2
,
2686 int srcW
, int xInc
, int flags
, int canMMX2BeUsed
, int16_t *hChrFilter
,
2687 int16_t *hChrFilterPos
, int hChrFilterSize
, void *funnyUVCode
,
2688 int srcFormat
, uint8_t *formatConvBuffer
, int16_t *mmx2Filter
,
2689 int32_t *mmx2FilterPos
, uint8_t *pal
)
2691 if (srcFormat
==PIX_FMT_YUYV422
)
2693 RENAME(yuy2ToUV
)(formatConvBuffer
, formatConvBuffer
+2048, src1
, src2
, srcW
);
2694 src1
= formatConvBuffer
;
2695 src2
= formatConvBuffer
+2048;
2697 else if (srcFormat
==PIX_FMT_UYVY422
)
2699 RENAME(uyvyToUV
)(formatConvBuffer
, formatConvBuffer
+2048, src1
, src2
, srcW
);
2700 src1
= formatConvBuffer
;
2701 src2
= formatConvBuffer
+2048;
2703 else if (srcFormat
==PIX_FMT_RGB32
)
2705 RENAME(bgr32ToUV
)(formatConvBuffer
, formatConvBuffer
+2048, src1
, src2
, srcW
);
2706 src1
= formatConvBuffer
;
2707 src2
= formatConvBuffer
+2048;
2709 else if (srcFormat
==PIX_FMT_BGR24
)
2711 RENAME(bgr24ToUV
)(formatConvBuffer
, formatConvBuffer
+2048, src1
, src2
, srcW
);
2712 src1
= formatConvBuffer
;
2713 src2
= formatConvBuffer
+2048;
2715 else if (srcFormat
==PIX_FMT_BGR565
)
2717 RENAME(bgr16ToUV
)(formatConvBuffer
, formatConvBuffer
+2048, src1
, src2
, srcW
);
2718 src1
= formatConvBuffer
;
2719 src2
= formatConvBuffer
+2048;
2721 else if (srcFormat
==PIX_FMT_BGR555
)
2723 RENAME(bgr15ToUV
)(formatConvBuffer
, formatConvBuffer
+2048, src1
, src2
, srcW
);
2724 src1
= formatConvBuffer
;
2725 src2
= formatConvBuffer
+2048;
2727 else if (srcFormat
==PIX_FMT_BGR32
)
2729 RENAME(rgb32ToUV
)(formatConvBuffer
, formatConvBuffer
+2048, src1
, src2
, srcW
);
2730 src1
= formatConvBuffer
;
2731 src2
= formatConvBuffer
+2048;
2733 else if (srcFormat
==PIX_FMT_RGB24
)
2735 RENAME(rgb24ToUV
)(formatConvBuffer
, formatConvBuffer
+2048, src1
, src2
, srcW
);
2736 src1
= formatConvBuffer
;
2737 src2
= formatConvBuffer
+2048;
2739 else if (srcFormat
==PIX_FMT_RGB565
)
2741 RENAME(rgb16ToUV
)(formatConvBuffer
, formatConvBuffer
+2048, src1
, src2
, srcW
);
2742 src1
= formatConvBuffer
;
2743 src2
= formatConvBuffer
+2048;
2745 else if (srcFormat
==PIX_FMT_RGB555
)
2747 RENAME(rgb15ToUV
)(formatConvBuffer
, formatConvBuffer
+2048, src1
, src2
, srcW
);
2748 src1
= formatConvBuffer
;
2749 src2
= formatConvBuffer
+2048;
2751 else if (isGray(srcFormat
))
2755 else if (srcFormat
==PIX_FMT_RGB8
|| srcFormat
==PIX_FMT_BGR8
|| srcFormat
==PIX_FMT_PAL8
|| srcFormat
==PIX_FMT_BGR4_BYTE
|| srcFormat
==PIX_FMT_RGB4_BYTE
)
2757 RENAME(palToUV
)(formatConvBuffer
, formatConvBuffer
+2048, src1
, src2
, srcW
, pal
);
2758 src1
= formatConvBuffer
;
2759 src2
= formatConvBuffer
+2048;
2763 // use the new MMX scaler if the mmx2 can't be used (it is faster than the x86 ASM one)
2764 if (!(flags
&SWS_FAST_BILINEAR
) || (!canMMX2BeUsed
))
2766 if (!(flags
&SWS_FAST_BILINEAR
))
2769 RENAME(hScale
)(dst
, dstWidth
, src1
, srcW
, xInc
, hChrFilter
, hChrFilterPos
, hChrFilterSize
);
2770 RENAME(hScale
)(dst
+2048, dstWidth
, src2
, srcW
, xInc
, hChrFilter
, hChrFilterPos
, hChrFilterSize
);
2772 else // Fast Bilinear upscale / crap downscale
2774 #if defined(ARCH_X86)
2778 uint64_t ebxsave
__attribute__((aligned(8)));
2784 "mov %%"REG_b
", %6 \n\t"
2786 "pxor %%mm7, %%mm7 \n\t"
2787 "mov %0, %%"REG_c
" \n\t"
2788 "mov %1, %%"REG_D
" \n\t"
2789 "mov %2, %%"REG_d
" \n\t"
2790 "mov %3, %%"REG_b
" \n\t"
2791 "xor %%"REG_a
", %%"REG_a
" \n\t" // i
2792 PREFETCH
" (%%"REG_c
") \n\t"
2793 PREFETCH
" 32(%%"REG_c
") \n\t"
2794 PREFETCH
" 64(%%"REG_c
") \n\t"
2798 #define FUNNY_UV_CODE \
2799 "movl (%%"REG_b"), %%esi \n\t"\
2801 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2802 "add %%"REG_S", %%"REG_c" \n\t"\
2803 "add %%"REG_a", %%"REG_D" \n\t"\
2804 "xor %%"REG_a", %%"REG_a" \n\t"\
2808 #define FUNNY_UV_CODE \
2809 "movl (%%"REG_b"), %%esi \n\t"\
2811 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2812 "add %%"REG_a", %%"REG_D" \n\t"\
2813 "xor %%"REG_a", %%"REG_a" \n\t"\
2815 #endif /* ARCH_X86_64 */
2821 "xor %%"REG_a
", %%"REG_a
" \n\t" // i
2822 "mov %5, %%"REG_c
" \n\t" // src
2823 "mov %1, %%"REG_D
" \n\t" // buf1
2824 "add $4096, %%"REG_D
" \n\t"
2825 PREFETCH
" (%%"REG_c
") \n\t"
2826 PREFETCH
" 32(%%"REG_c
") \n\t"
2827 PREFETCH
" 64(%%"REG_c
") \n\t"
2835 "mov %6, %%"REG_b
" \n\t"
2837 :: "m" (src1
), "m" (dst
), "m" (mmx2Filter
), "m" (mmx2FilterPos
),
2838 "m" (funnyUVCode
), "m" (src2
)
2842 : "%"REG_a
, "%"REG_c
, "%"REG_d
, "%"REG_S
, "%"REG_D
2847 for (i
=dstWidth
-1; (i
*xInc
)>>16 >=srcW
-1; i
--)
2849 //printf("%d %d %d\n", dstWidth, i, srcW);
2850 dst
[i
] = src1
[srcW
-1]*128;
2851 dst
[i
+2048] = src2
[srcW
-1]*128;
2856 #endif /* HAVE_MMX2 */
2857 long xInc_shr16
= (long) (xInc
>> 16);
2858 uint16_t xInc_mask
= xInc
& 0xffff;
2860 "xor %%"REG_a
", %%"REG_a
" \n\t" // i
2861 "xor %%"REG_d
", %%"REG_d
" \n\t" // xx
2862 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2865 "mov %0, %%"REG_S
" \n\t"
2866 "movzbl (%%"REG_S
", %%"REG_d
"), %%edi \n\t" //src[xx]
2867 "movzbl 1(%%"REG_S
", %%"REG_d
"), %%esi \n\t" //src[xx+1]
2868 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2869 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2870 "shll $16, %%edi \n\t"
2871 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2872 "mov %1, %%"REG_D
" \n\t"
2873 "shrl $9, %%esi \n\t"
2874 "movw %%si, (%%"REG_D
", %%"REG_a
", 2) \n\t"
2876 "movzbl (%5, %%"REG_d
"), %%edi \n\t" //src[xx]
2877 "movzbl 1(%5, %%"REG_d
"), %%esi \n\t" //src[xx+1]
2878 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2879 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2880 "shll $16, %%edi \n\t"
2881 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2882 "mov %1, %%"REG_D
" \n\t"
2883 "shrl $9, %%esi \n\t"
2884 "movw %%si, 4096(%%"REG_D
", %%"REG_a
", 2) \n\t"
2886 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2887 "adc %3, %%"REG_d
" \n\t" //xx+= xInc>>8 + carry
2888 "add $1, %%"REG_a
" \n\t"
2889 "cmp %2, %%"REG_a
" \n\t"
2892 /* GCC-3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2893 which is needed to support GCC-4.0 */
2894 #if defined(ARCH_X86_64) && ((__GNUC__ > 3) || ( __GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2895 :: "m" (src1
), "m" (dst
), "g" ((long)dstWidth
), "m" (xInc_shr16
), "m" (xInc_mask
),
2897 :: "m" (src1
), "m" (dst
), "m" ((long)dstWidth
), "m" (xInc_shr16
), "m" (xInc_mask
),
2900 : "%"REG_a
, "%"REG_d
, "%ecx", "%"REG_D
, "%esi"
2903 } //if MMX2 can't be used
2907 unsigned int xpos
=0;
2908 for (i
=0;i
<dstWidth
;i
++)
2910 register unsigned int xx
=xpos
>>16;
2911 register unsigned int xalpha
=(xpos
&0xFFFF)>>9;
2912 dst
[i
]=(src1
[xx
]*(xalpha
^127)+src1
[xx
+1]*xalpha
);
2913 dst
[i
+2048]=(src2
[xx
]*(xalpha
^127)+src2
[xx
+1]*xalpha
);
2915 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2916 dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2920 #endif /* defined(ARCH_X86) */
2924 static int RENAME(swScale
)(SwsContext
*c
, uint8_t* src
[], int srcStride
[], int srcSliceY
,
2925 int srcSliceH
, uint8_t* dst
[], int dstStride
[]){
2927 /* load a few things into local vars to make the code more readable? and faster */
2928 const int srcW
= c
->srcW
;
2929 const int dstW
= c
->dstW
;
2930 const int dstH
= c
->dstH
;
2931 const int chrDstW
= c
->chrDstW
;
2932 const int chrSrcW
= c
->chrSrcW
;
2933 const int lumXInc
= c
->lumXInc
;
2934 const int chrXInc
= c
->chrXInc
;
2935 const int dstFormat
= c
->dstFormat
;
2936 const int srcFormat
= c
->srcFormat
;
2937 const int flags
= c
->flags
;
2938 const int canMMX2BeUsed
= c
->canMMX2BeUsed
;
2939 int16_t *vLumFilterPos
= c
->vLumFilterPos
;
2940 int16_t *vChrFilterPos
= c
->vChrFilterPos
;
2941 int16_t *hLumFilterPos
= c
->hLumFilterPos
;
2942 int16_t *hChrFilterPos
= c
->hChrFilterPos
;
2943 int16_t *vLumFilter
= c
->vLumFilter
;
2944 int16_t *vChrFilter
= c
->vChrFilter
;
2945 int16_t *hLumFilter
= c
->hLumFilter
;
2946 int16_t *hChrFilter
= c
->hChrFilter
;
2947 int32_t *lumMmxFilter
= c
->lumMmxFilter
;
2948 int32_t *chrMmxFilter
= c
->chrMmxFilter
;
2949 const int vLumFilterSize
= c
->vLumFilterSize
;
2950 const int vChrFilterSize
= c
->vChrFilterSize
;
2951 const int hLumFilterSize
= c
->hLumFilterSize
;
2952 const int hChrFilterSize
= c
->hChrFilterSize
;
2953 int16_t **lumPixBuf
= c
->lumPixBuf
;
2954 int16_t **chrPixBuf
= c
->chrPixBuf
;
2955 const int vLumBufSize
= c
->vLumBufSize
;
2956 const int vChrBufSize
= c
->vChrBufSize
;
2957 uint8_t *funnyYCode
= c
->funnyYCode
;
2958 uint8_t *funnyUVCode
= c
->funnyUVCode
;
2959 uint8_t *formatConvBuffer
= c
->formatConvBuffer
;
2960 const int chrSrcSliceY
= srcSliceY
>> c
->chrSrcVSubSample
;
2961 const int chrSrcSliceH
= -((-srcSliceH
) >> c
->chrSrcVSubSample
);
2965 /* vars whch will change and which we need to storw back in the context */
2967 int lumBufIndex
= c
->lumBufIndex
;
2968 int chrBufIndex
= c
->chrBufIndex
;
2969 int lastInLumBuf
= c
->lastInLumBuf
;
2970 int lastInChrBuf
= c
->lastInChrBuf
;
2972 if (isPacked(c
->srcFormat
)){
2979 srcStride
[2]= srcStride
[0];
2981 srcStride
[1]<<= c
->vChrDrop
;
2982 srcStride
[2]<<= c
->vChrDrop
;
2984 //printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2985 // (int)dst[0], (int)dst[1], (int)dst[2]);
2987 #if 0 //self test FIXME move to a vfilter or something
2989 static volatile int i
=0;
2991 if (srcFormat
==PIX_FMT_YUV420P
&& i
==1 && srcSliceH
>= c
->srcH
)
2992 selfTest(src
, srcStride
, c
->srcW
, c
->srcH
);
2997 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2998 //dstStride[0],dstStride[1],dstStride[2]);
3000 if (dstStride
[0]%8 !=0 || dstStride
[1]%8 !=0 || dstStride
[2]%8 !=0)
3002 static int firstTime
=1; //FIXME move this into the context perhaps
3003 if (flags
& SWS_PRINT_INFO
&& firstTime
)
3005 av_log(c
, AV_LOG_WARNING
, "SwScaler: Warning: dstStride is not aligned!\n"
3006 "SwScaler: ->cannot do aligned memory acesses anymore\n");
3011 /* Note the user might start scaling the picture in the middle so this will not get executed
3012 this is not really intended but works currently, so ppl might do it */
3023 for (;dstY
< dstH
; dstY
++){
3024 unsigned char *dest
=dst
[0]+dstStride
[0]*dstY
;
3025 const int chrDstY
= dstY
>>c
->chrDstVSubSample
;
3026 unsigned char *uDest
=dst
[1]+dstStride
[1]*chrDstY
;
3027 unsigned char *vDest
=dst
[2]+dstStride
[2]*chrDstY
;
3029 const int firstLumSrcY
= vLumFilterPos
[dstY
]; //First line needed as input
3030 const int firstChrSrcY
= vChrFilterPos
[chrDstY
]; //First line needed as input
3031 const int lastLumSrcY
= firstLumSrcY
+ vLumFilterSize
-1; // Last line needed as input
3032 const int lastChrSrcY
= firstChrSrcY
+ vChrFilterSize
-1; // Last line needed as input
3034 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
3035 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
3036 //handle holes (FAST_BILINEAR & weird filters)
3037 if (firstLumSrcY
> lastInLumBuf
) lastInLumBuf
= firstLumSrcY
-1;
3038 if (firstChrSrcY
> lastInChrBuf
) lastInChrBuf
= firstChrSrcY
-1;
3039 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
3040 ASSERT(firstLumSrcY
>= lastInLumBuf
- vLumBufSize
+ 1)
3041 ASSERT(firstChrSrcY
>= lastInChrBuf
- vChrBufSize
+ 1)
3043 // Do we have enough lines in this slice to output the dstY line
3044 if (lastLumSrcY
< srcSliceY
+ srcSliceH
&& lastChrSrcY
< -((-srcSliceY
- srcSliceH
)>>c
->chrSrcVSubSample
))
3046 //Do horizontal scaling
3047 while(lastInLumBuf
< lastLumSrcY
)
3049 uint8_t *s
= src
[0]+(lastInLumBuf
+ 1 - srcSliceY
)*srcStride
[0];
3051 //printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
3052 ASSERT(lumBufIndex
< 2*vLumBufSize
)
3053 ASSERT(lastInLumBuf
+ 1 - srcSliceY
< srcSliceH
)
3054 ASSERT(lastInLumBuf
+ 1 - srcSliceY
>= 0)
3055 //printf("%d %d\n", lumBufIndex, vLumBufSize);
3056 RENAME(hyscale
)(lumPixBuf
[ lumBufIndex
], dstW
, s
, srcW
, lumXInc
,
3057 flags
, canMMX2BeUsed
, hLumFilter
, hLumFilterPos
, hLumFilterSize
,
3058 funnyYCode
, c
->srcFormat
, formatConvBuffer
,
3059 c
->lumMmx2Filter
, c
->lumMmx2FilterPos
, pal
);
3062 while(lastInChrBuf
< lastChrSrcY
)
3064 uint8_t *src1
= src
[1]+(lastInChrBuf
+ 1 - chrSrcSliceY
)*srcStride
[1];
3065 uint8_t *src2
= src
[2]+(lastInChrBuf
+ 1 - chrSrcSliceY
)*srcStride
[2];
3067 ASSERT(chrBufIndex
< 2*vChrBufSize
)
3068 ASSERT(lastInChrBuf
+ 1 - chrSrcSliceY
< (chrSrcSliceH
))
3069 ASSERT(lastInChrBuf
+ 1 - chrSrcSliceY
>= 0)
3070 //FIXME replace parameters through context struct (some at least)
3072 if (!(isGray(srcFormat
) || isGray(dstFormat
)))
3073 RENAME(hcscale
)(chrPixBuf
[ chrBufIndex
], chrDstW
, src1
, src2
, chrSrcW
, chrXInc
,
3074 flags
, canMMX2BeUsed
, hChrFilter
, hChrFilterPos
, hChrFilterSize
,
3075 funnyUVCode
, c
->srcFormat
, formatConvBuffer
,
3076 c
->chrMmx2Filter
, c
->chrMmx2FilterPos
, pal
);
3079 //wrap buf index around to stay inside the ring buffer
3080 if (lumBufIndex
>= vLumBufSize
) lumBufIndex
-= vLumBufSize
;
3081 if (chrBufIndex
>= vChrBufSize
) chrBufIndex
-= vChrBufSize
;
3083 else // not enough lines left in this slice -> load the rest in the buffer
3085 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
3086 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
3087 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
3088 vChrBufSize, vLumBufSize);*/
3090 //Do horizontal scaling
3091 while(lastInLumBuf
+1 < srcSliceY
+ srcSliceH
)
3093 uint8_t *s
= src
[0]+(lastInLumBuf
+ 1 - srcSliceY
)*srcStride
[0];
3095 ASSERT(lumBufIndex
< 2*vLumBufSize
)
3096 ASSERT(lastInLumBuf
+ 1 - srcSliceY
< srcSliceH
)
3097 ASSERT(lastInLumBuf
+ 1 - srcSliceY
>= 0)
3098 RENAME(hyscale
)(lumPixBuf
[ lumBufIndex
], dstW
, s
, srcW
, lumXInc
,
3099 flags
, canMMX2BeUsed
, hLumFilter
, hLumFilterPos
, hLumFilterSize
,
3100 funnyYCode
, c
->srcFormat
, formatConvBuffer
,
3101 c
->lumMmx2Filter
, c
->lumMmx2FilterPos
, pal
);
3104 while(lastInChrBuf
+1 < (chrSrcSliceY
+ chrSrcSliceH
))
3106 uint8_t *src1
= src
[1]+(lastInChrBuf
+ 1 - chrSrcSliceY
)*srcStride
[1];
3107 uint8_t *src2
= src
[2]+(lastInChrBuf
+ 1 - chrSrcSliceY
)*srcStride
[2];
3109 ASSERT(chrBufIndex
< 2*vChrBufSize
)
3110 ASSERT(lastInChrBuf
+ 1 - chrSrcSliceY
< chrSrcSliceH
)
3111 ASSERT(lastInChrBuf
+ 1 - chrSrcSliceY
>= 0)
3113 if (!(isGray(srcFormat
) || isGray(dstFormat
)))
3114 RENAME(hcscale
)(chrPixBuf
[ chrBufIndex
], chrDstW
, src1
, src2
, chrSrcW
, chrXInc
,
3115 flags
, canMMX2BeUsed
, hChrFilter
, hChrFilterPos
, hChrFilterSize
,
3116 funnyUVCode
, c
->srcFormat
, formatConvBuffer
,
3117 c
->chrMmx2Filter
, c
->chrMmx2FilterPos
, pal
);
3120 //wrap buf index around to stay inside the ring buffer
3121 if (lumBufIndex
>= vLumBufSize
) lumBufIndex
-= vLumBufSize
;
3122 if (chrBufIndex
>= vChrBufSize
) chrBufIndex
-= vChrBufSize
;
3123 break; //we can't output a dstY line so let's try with the next slice
3127 b5Dither
= dither8
[dstY
&1];
3128 g6Dither
= dither4
[dstY
&1];
3129 g5Dither
= dither8
[dstY
&1];
3130 r5Dither
= dither8
[(dstY
+1)&1];
3134 int16_t **lumSrcPtr
= lumPixBuf
+ lumBufIndex
+ firstLumSrcY
- lastInLumBuf
+ vLumBufSize
;
3135 int16_t **chrSrcPtr
= chrPixBuf
+ chrBufIndex
+ firstChrSrcY
- lastInChrBuf
+ vChrBufSize
;
3138 if (flags
& SWS_ACCURATE_RND
){
3139 for (i
=0; i
<vLumFilterSize
; i
+=2){
3140 lumMmxFilter
[2*i
+0]= (int32_t)lumSrcPtr
[i
];
3141 lumMmxFilter
[2*i
+1]= (int32_t)lumSrcPtr
[i
+(vLumFilterSize
>1)];
3142 lumMmxFilter
[2*i
+2]=
3143 lumMmxFilter
[2*i
+3]= vLumFilter
[dstY
*vLumFilterSize
+ i
]
3144 + (vLumFilterSize
>1 ? vLumFilter
[dstY
*vLumFilterSize
+ i
+ 1]<<16 : 0);
3146 for (i
=0; i
<vChrFilterSize
; i
+=2){
3147 chrMmxFilter
[2*i
+0]= (int32_t)chrSrcPtr
[i
];
3148 chrMmxFilter
[2*i
+1]= (int32_t)chrSrcPtr
[i
+(vChrFilterSize
>1)];
3149 chrMmxFilter
[2*i
+2]=
3150 chrMmxFilter
[2*i
+3]= vChrFilter
[chrDstY
*vChrFilterSize
+ i
]
3151 + (vChrFilterSize
>1 ? vChrFilter
[chrDstY
*vChrFilterSize
+ i
+ 1]<<16 : 0);
3154 for (i
=0; i
<vLumFilterSize
; i
++)
3156 lumMmxFilter
[4*i
+0]= (int32_t)lumSrcPtr
[i
];
3157 lumMmxFilter
[4*i
+1]= (uint64_t)lumSrcPtr
[i
] >> 32;
3158 lumMmxFilter
[4*i
+2]=
3159 lumMmxFilter
[4*i
+3]=
3160 ((uint16_t)vLumFilter
[dstY
*vLumFilterSize
+ i
])*0x10001;
3162 for (i
=0; i
<vChrFilterSize
; i
++)
3164 chrMmxFilter
[4*i
+0]= (int32_t)chrSrcPtr
[i
];
3165 chrMmxFilter
[4*i
+1]= (uint64_t)chrSrcPtr
[i
] >> 32;
3166 chrMmxFilter
[4*i
+2]=
3167 chrMmxFilter
[4*i
+3]=
3168 ((uint16_t)vChrFilter
[chrDstY
*vChrFilterSize
+ i
])*0x10001;
3172 if (dstFormat
== PIX_FMT_NV12
|| dstFormat
== PIX_FMT_NV21
){
3173 const int chrSkipMask
= (1<<c
->chrDstVSubSample
)-1;
3174 if (dstY
&chrSkipMask
) uDest
= NULL
; //FIXME split functions in lumi / chromi
3175 RENAME(yuv2nv12X
)(c
,
3176 vLumFilter
+dstY
*vLumFilterSize
, lumSrcPtr
, vLumFilterSize
,
3177 vChrFilter
+chrDstY
*vChrFilterSize
, chrSrcPtr
, vChrFilterSize
,
3178 dest
, uDest
, dstW
, chrDstW
, dstFormat
);
3180 else if (isPlanarYUV(dstFormat
) || isGray(dstFormat
)) //YV12 like
3182 const int chrSkipMask
= (1<<c
->chrDstVSubSample
)-1;
3183 if ((dstY
&chrSkipMask
) || isGray(dstFormat
)) uDest
=vDest
= NULL
; //FIXME split functions in lumi / chromi
3184 if (vLumFilterSize
== 1 && vChrFilterSize
== 1) // Unscaled YV12
3186 int16_t *lumBuf
= lumPixBuf
[0];
3187 int16_t *chrBuf
= chrPixBuf
[0];
3188 RENAME(yuv2yuv1
)(lumBuf
, chrBuf
, dest
, uDest
, vDest
, dstW
, chrDstW
);
3193 vLumFilter
+dstY
*vLumFilterSize
, lumSrcPtr
, vLumFilterSize
,
3194 vChrFilter
+chrDstY
*vChrFilterSize
, chrSrcPtr
, vChrFilterSize
,
3195 dest
, uDest
, vDest
, dstW
, chrDstW
);
3200 ASSERT(lumSrcPtr
+ vLumFilterSize
- 1 < lumPixBuf
+ vLumBufSize
*2);
3201 ASSERT(chrSrcPtr
+ vChrFilterSize
- 1 < chrPixBuf
+ vChrBufSize
*2);
3202 if (vLumFilterSize
== 1 && vChrFilterSize
== 2) //Unscaled RGB
3204 int chrAlpha
= vChrFilter
[2*dstY
+1];
3205 RENAME(yuv2packed1
)(c
, *lumSrcPtr
, *chrSrcPtr
, *(chrSrcPtr
+1),
3206 dest
, dstW
, chrAlpha
, dstFormat
, flags
, dstY
);
3208 else if (vLumFilterSize
== 2 && vChrFilterSize
== 2) //BiLinear Upscale RGB
3210 int lumAlpha
= vLumFilter
[2*dstY
+1];
3211 int chrAlpha
= vChrFilter
[2*dstY
+1];
3213 lumMmxFilter
[3]= vLumFilter
[2*dstY
]*0x10001;
3215 chrMmxFilter
[3]= vChrFilter
[2*chrDstY
]*0x10001;
3216 RENAME(yuv2packed2
)(c
, *lumSrcPtr
, *(lumSrcPtr
+1), *chrSrcPtr
, *(chrSrcPtr
+1),
3217 dest
, dstW
, lumAlpha
, chrAlpha
, dstY
);
3221 RENAME(yuv2packedX
)(c
,
3222 vLumFilter
+dstY
*vLumFilterSize
, lumSrcPtr
, vLumFilterSize
,
3223 vChrFilter
+dstY
*vChrFilterSize
, chrSrcPtr
, vChrFilterSize
,
3228 else // hmm looks like we can't use MMX here without overwriting this array's tail
3230 int16_t **lumSrcPtr
= lumPixBuf
+ lumBufIndex
+ firstLumSrcY
- lastInLumBuf
+ vLumBufSize
;
3231 int16_t **chrSrcPtr
= chrPixBuf
+ chrBufIndex
+ firstChrSrcY
- lastInChrBuf
+ vChrBufSize
;
3232 if (dstFormat
== PIX_FMT_NV12
|| dstFormat
== PIX_FMT_NV21
){
3233 const int chrSkipMask
= (1<<c
->chrDstVSubSample
)-1;
3234 if (dstY
&chrSkipMask
) uDest
= NULL
; //FIXME split functions in lumi / chromi
3236 vLumFilter
+dstY
*vLumFilterSize
, lumSrcPtr
, vLumFilterSize
,
3237 vChrFilter
+chrDstY
*vChrFilterSize
, chrSrcPtr
, vChrFilterSize
,
3238 dest
, uDest
, dstW
, chrDstW
, dstFormat
);
3240 else if (isPlanarYUV(dstFormat
) || isGray(dstFormat
)) //YV12
3242 const int chrSkipMask
= (1<<c
->chrDstVSubSample
)-1;
3243 if ((dstY
&chrSkipMask
) || isGray(dstFormat
)) uDest
=vDest
= NULL
; //FIXME split functions in lumi / chromi
3245 vLumFilter
+dstY
*vLumFilterSize
, lumSrcPtr
, vLumFilterSize
,
3246 vChrFilter
+chrDstY
*vChrFilterSize
, chrSrcPtr
, vChrFilterSize
,
3247 dest
, uDest
, vDest
, dstW
, chrDstW
);
3251 ASSERT(lumSrcPtr
+ vLumFilterSize
- 1 < lumPixBuf
+ vLumBufSize
*2);
3252 ASSERT(chrSrcPtr
+ vChrFilterSize
- 1 < chrPixBuf
+ vChrBufSize
*2);
3254 vLumFilter
+dstY
*vLumFilterSize
, lumSrcPtr
, vLumFilterSize
,
3255 vChrFilter
+dstY
*vChrFilterSize
, chrSrcPtr
, vChrFilterSize
,
3262 __asm
__volatile(SFENCE:::"memory");
3263 __asm
__volatile(EMMS:::"memory");
3265 /* store changed local vars back in the context */
3267 c
->lumBufIndex
= lumBufIndex
;
3268 c
->chrBufIndex
= chrBufIndex
;
3269 c
->lastInLumBuf
= lastInLumBuf
;
3270 c
->lastInChrBuf
= lastInChrBuf
;
3272 return dstY
- lastDstY
;