2 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 * The C code (not assembly, MMX, ...) of this file can be used
21 * under the LGPL license.
33 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
/*
 * Instruction-name helpers that are pasted into the asm string literals below.
 * Each name has several alternative definitions.
 * NOTE(review): the preprocessor conditionals that pick exactly one alternative
 * are not visible in this excerpt.
 *
 * PREFETCH / PREFETCHW: "prefetch"/"prefetchw", "prefetchnta"/"prefetcht0",
 * or the placeholder " # nop" (presumably an assembler comment, so nothing is
 * emitted - confirm).
 */
40 #define PREFETCH "prefetch"
41 #define PREFETCHW "prefetchw"
43 #define PREFETCH "prefetchnta"
44 #define PREFETCHW "prefetcht0"
46 #define PREFETCH " # nop"
47 #define PREFETCHW " # nop"
/* SFENCE: the "sfence" instruction, or the same " # nop" placeholder. */
51 #define SFENCE "sfence"
53 #define SFENCE " # nop"
/* PAVGB(a,b): one asm line "pavgb a, b" or, alternatively, "pavgusb a, b". */
57 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
59 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
/*
 * REAL_MOVNTQ(a,b): one asm line storing a to b with "movntq", or with a plain
 * "movq" in the alternative definition.
 * MOVNTQ forwards to REAL_MOVNTQ so that macro arguments are expanded before
 * REAL_MOVNTQ stringifies them with '#'.
 */
63 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
65 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
67 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
70 #include "swscale_altivec_template.c"
/*
 * YSCALEYUV2YV12X(x, offset, dest, width): asm body that sums several source
 * lines, each scaled by a coefficient, and writes 8 output bytes per iteration.
 *  - %0 = &c->redDither (base for VROUNDER_OFFSET and `offset`), %1 = dest,
 *    %2 = width; REG_a, REG_d and REG_S are listed as clobbered.
 *  - REG_a is the output index. REG_d walks a list of 16-byte entries starting
 *    at offset(%0): a source pointer at +0 and the coefficient qword at +8.
 *  - `x` is a byte displacement (a string) added to every source address.
 *  - mm3/mm4 start from the rounder at VROUNDER_OFFSET(%0), accumulate
 *    pmulhw(coefficient, src) for 2x4 words, are shifted right by 3 and packed
 *    with unsigned saturation before the MOVNTQ store.
 * NOTE(review): the asm statement opener, the loop labels and the conditional
 * jumps that consume the `test`/`cmp` results are not visible in this excerpt;
 * presumably a NULL source pointer ends the inner loop - confirm.
 */
73 #define YSCALEYUV2YV12X(x, offset, dest, width) \
75 "xor %%"REG_a", %%"REG_a" \n\t"\
76 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
77 "movq %%mm3, %%mm4 \n\t"\
78 "lea " offset "(%0), %%"REG_d" \n\t"\
79 "mov (%%"REG_d"), %%"REG_S" \n\t"\
80 ASMALIGN(4) /* FIXME Unroll? */\
82 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
83 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
84 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
85 "add $16, %%"REG_d" \n\t"\
86 "mov (%%"REG_d"), %%"REG_S" \n\t"\
87 "test %%"REG_S", %%"REG_S" \n\t"\
88 "pmulhw %%mm0, %%mm2 \n\t"\
89 "pmulhw %%mm0, %%mm5 \n\t"\
90 "paddw %%mm2, %%mm3 \n\t"\
91 "paddw %%mm5, %%mm4 \n\t"\
93 "psraw $3, %%mm3 \n\t"\
94 "psraw $3, %%mm4 \n\t"\
95 "packuswb %%mm4, %%mm3 \n\t"\
96 MOVNTQ(%%mm3, (%1, %%REGa))\
97 "add $8, %%"REG_a" \n\t"\
98 "cmp %2, %%"REG_a" \n\t"\
99 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
100 "movq %%mm3, %%mm4 \n\t"\
101 "lea " offset "(%0), %%"REG_d" \n\t"\
102 "mov (%%"REG_d"), %%"REG_S" \n\t"\
104 :: "r" (&c->redDither),\
105 "r" (dest), "g" (width)\
106 : "%"REG_a, "%"REG_d, "%"REG_S\
/*
 * YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width): same operands and output
 * as YSCALEYUV2YV12X, but the sum is kept in 32-bit lanes.
 *  - Each filter-list entry is APCK_SIZE bytes and supplies two source
 *    pointers (at +0 and +APCK_PTR2) and one coefficient qword (at +APCK_COEF).
 *  - Words of the two source lines are interleaved (punpck[lh]wd) and fed to
 *    pmaddwd, so one multiply-add handles a pair of lines; the dword results
 *    are accumulated in mm4..mm7.
 *  - After the inner loop the sums are shifted right by 16, packed to words,
 *    the rounder at VROUNDER_OFFSET(%0) is added, and the result is shifted
 *    right by 3 and packed to 8 unsigned bytes for the MOVNTQ store.
 * NOTE(review): the asm opener, loop labels and conditional jumps are not
 * visible in this excerpt.
 */
109 #define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
111 "lea " offset "(%0), %%"REG_d" \n\t"\
112 "xor %%"REG_a", %%"REG_a" \n\t"\
113 "pxor %%mm4, %%mm4 \n\t"\
114 "pxor %%mm5, %%mm5 \n\t"\
115 "pxor %%mm6, %%mm6 \n\t"\
116 "pxor %%mm7, %%mm7 \n\t"\
117 "mov (%%"REG_d"), %%"REG_S" \n\t"\
120 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
121 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
122 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
123 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
124 "movq %%mm0, %%mm3 \n\t"\
125 "punpcklwd %%mm1, %%mm0 \n\t"\
126 "punpckhwd %%mm1, %%mm3 \n\t"\
127 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
128 "pmaddwd %%mm1, %%mm0 \n\t"\
129 "pmaddwd %%mm1, %%mm3 \n\t"\
130 "paddd %%mm0, %%mm4 \n\t"\
131 "paddd %%mm3, %%mm5 \n\t"\
132 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
133 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
134 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
135 "test %%"REG_S", %%"REG_S" \n\t"\
136 "movq %%mm2, %%mm0 \n\t"\
137 "punpcklwd %%mm3, %%mm2 \n\t"\
138 "punpckhwd %%mm3, %%mm0 \n\t"\
139 "pmaddwd %%mm1, %%mm2 \n\t"\
140 "pmaddwd %%mm1, %%mm0 \n\t"\
141 "paddd %%mm2, %%mm6 \n\t"\
142 "paddd %%mm0, %%mm7 \n\t"\
144 "psrad $16, %%mm4 \n\t"\
145 "psrad $16, %%mm5 \n\t"\
146 "psrad $16, %%mm6 \n\t"\
147 "psrad $16, %%mm7 \n\t"\
148 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
149 "packssdw %%mm5, %%mm4 \n\t"\
150 "packssdw %%mm7, %%mm6 \n\t"\
151 "paddw %%mm0, %%mm4 \n\t"\
152 "paddw %%mm0, %%mm6 \n\t"\
153 "psraw $3, %%mm4 \n\t"\
154 "psraw $3, %%mm6 \n\t"\
155 "packuswb %%mm6, %%mm4 \n\t"\
156 MOVNTQ(%%mm4, (%1, %%REGa))\
157 "add $8, %%"REG_a" \n\t"\
158 "cmp %2, %%"REG_a" \n\t"\
159 "lea " offset "(%0), %%"REG_d" \n\t"\
160 "pxor %%mm4, %%mm4 \n\t"\
161 "pxor %%mm5, %%mm5 \n\t"\
162 "pxor %%mm6, %%mm6 \n\t"\
163 "pxor %%mm7, %%mm7 \n\t"\
164 "mov (%%"REG_d"), %%"REG_S" \n\t"\
166 :: "r" (&c->redDither),\
167 "r" (dest), "g" (width)\
168 : "%"REG_a, "%"REG_d, "%"REG_S\
/*
 * YSCALEYUV2YV121: asm loop body converting 16-bit samples to bytes without
 * filtering. REG_a is loaded from %2 and used as the index; each iteration
 * reads 8 words at (%0 + 2*REG_a), shifts them right by 7, packs them with
 * unsigned saturation and stores 8 bytes at (%1 + REG_a), then adds 8 to REG_a.
 * NOTE(review): the operand list, loop label and closing jump are not visible
 * in this excerpt; presumably %0 = source, %1 = destination and %2 = a
 * negative start index counting up to zero - confirm against the caller.
 */
171 #define YSCALEYUV2YV121 \
172 "mov %2, %%"REG_a" \n\t"\
173 ASMALIGN(4) /* FIXME Unroll? */\
175 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
176 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
177 "psraw $7, %%mm0 \n\t"\
178 "psraw $7, %%mm1 \n\t"\
179 "packuswb %%mm1, %%mm0 \n\t"\
180 MOVNTQ(%%mm0, (%1, %%REGa))\
181 "add $8, %%"REG_a" \n\t"\
/*
 * YSCALEYUV2YV121_ACCURATE: same loop as YSCALEYUV2YV121 with rounding.
 * mm7 is set to 64 in every word (all ones, logical shift right by 15 gives 1,
 * shift left by 6 gives 64) and added with signed saturation before the
 * arithmetic shift right by 7, i.e. half of the divisor is added first.
 * NOTE(review): the operand list, loop label and closing jump are not visible
 * in this excerpt.
 */
184 #define YSCALEYUV2YV121_ACCURATE \
185 "mov %2, %%"REG_a" \n\t"\
186 "pcmpeqw %%mm7, %%mm7 \n\t"\
187 "psrlw $15, %%mm7 \n\t"\
188 "psllw $6, %%mm7 \n\t"\
189 ASMALIGN(4) /* FIXME Unroll? */\
191 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
192 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
193 "paddsw %%mm7, %%mm0 \n\t"\
194 "paddsw %%mm7, %%mm1 \n\t"\
195 "psraw $7, %%mm0 \n\t"\
196 "psraw $7, %%mm1 \n\t"\
197 "packuswb %%mm1, %%mm0 \n\t"\
198 MOVNTQ(%%mm0, (%1, %%REGa))\
199 "add $8, %%"REG_a" \n\t"\
203 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
204 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
205 "r" (dest), "m" (dstW),
206 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
207 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
/*
 * YSCALEYUV2PACKEDX: opening part of an asm statement that filters chroma and
 * luma for packed output; the operands come from YSCALEYUV2PACKEDX_END.
 *  - Chroma pass: walks the 16-byte filter entries at CHR_MMX_FILTER_OFFSET(%0)
 *    (source pointer at +0, coefficient at +8) and accumulates
 *    pmulhw(coefficient, src) on top of the rounder into mm3 (U) and mm4 (V,
 *    read at byte offset VOF from the same source pointer).
 *  - Luma pass: same scheme with LUM_MMX_FILTER_OFFSET(%0), accumulating two
 *    groups of 4 luma words into mm1 (Y1) and mm7 (Y2).
 * No final shift or pack is done here; the values are left in mm1/mm3/mm4/mm7
 * for the macro that follows it inside the asm statement.
 * NOTE(review): the asm opener, loop labels and the jumps that use the `test`
 * results are not visible in this excerpt.
 */
209 #define YSCALEYUV2PACKEDX \
211 "xor %%"REG_a", %%"REG_a" \n\t"\
215 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
216 "mov (%%"REG_d"), %%"REG_S" \n\t"\
217 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
218 "movq %%mm3, %%mm4 \n\t"\
221 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
222 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
223 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
224 "add $16, %%"REG_d" \n\t"\
225 "mov (%%"REG_d"), %%"REG_S" \n\t"\
226 "pmulhw %%mm0, %%mm2 \n\t"\
227 "pmulhw %%mm0, %%mm5 \n\t"\
228 "paddw %%mm2, %%mm3 \n\t"\
229 "paddw %%mm5, %%mm4 \n\t"\
230 "test %%"REG_S", %%"REG_S" \n\t"\
233 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
234 "mov (%%"REG_d"), %%"REG_S" \n\t"\
235 "movq "VROUNDER_OFFSET"(%0), %%mm1 \n\t"\
236 "movq %%mm1, %%mm7 \n\t"\
239 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
240 "movq (%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y1srcData */\
241 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\
242 "add $16, %%"REG_d" \n\t"\
243 "mov (%%"REG_d"), %%"REG_S" \n\t"\
244 "pmulhw %%mm0, %%mm2 \n\t"\
245 "pmulhw %%mm0, %%mm5 \n\t"\
246 "paddw %%mm2, %%mm1 \n\t"\
247 "paddw %%mm5, %%mm7 \n\t"\
248 "test %%"REG_S", %%"REG_S" \n\t"\
/*
 * YSCALEYUV2PACKEDX_END: operand and clobber lists closing an asm statement
 * opened with YSCALEYUV2PACKEDX / YSCALEYUV2PACKEDX_ACCURATE.
 * Inputs: %0 = &c->redDither (register), %1..%3 = `dummy` memory operands
 * (presumably placeholders that keep dest/dstW at %4/%5 - confirm),
 * %4 = dest (register), %5 = dstW (memory).
 * Clobbers: REG_a, REG_d, REG_S.
 * NOTE(review): the closing of the asm statement is not visible in this excerpt.
 */
251 #define YSCALEYUV2PACKEDX_END \
252 :: "r" (&c->redDither), \
253 "m" (dummy), "m" (dummy), "m" (dummy),\
254 "r" (dest), "m" (dstW) \
255 : "%"REG_a, "%"REG_d, "%"REG_S \
/*
 * YSCALEYUV2PACKEDX_ACCURATE: variant of YSCALEYUV2PACKEDX that accumulates in
 * 32-bit lanes using pmaddwd on pairs of source lines.
 *  - Filter entries are APCK_SIZE bytes: source pointers at +0 and +APCK_PTR2,
 *    coefficient qword at +APCK_COEF.
 *  - Chroma pass: sums go to mm4/mm5 (U) and mm6/mm7 (V); they are shifted
 *    right by 16, packed to words, the rounder at VROUNDER_OFFSET(%0) is added
 *    and the results are spilled to U_TEMP(%0) and V_TEMP(%0).
 *  - Luma pass: sums go to mm1/mm5 (Y1) and mm7/mm6 (Y2) and are reduced the
 *    same way into mm1 and mm7.
 *  - Finally U and V are reloaded from the temporaries into mm3 and mm4, so
 *    the macro ends with the same register layout as YSCALEYUV2PACKEDX
 *    (mm1 = Y1, mm7 = Y2, mm3 = U, mm4 = V).
 * Note that it writes to memory at U_TEMP(%0) and V_TEMP(%0).
 * NOTE(review): the asm opener, loop labels and conditional jumps are not
 * visible in this excerpt.
 */
258 #define YSCALEYUV2PACKEDX_ACCURATE \
260 "xor %%"REG_a", %%"REG_a" \n\t"\
264 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
265 "mov (%%"REG_d"), %%"REG_S" \n\t"\
266 "pxor %%mm4, %%mm4 \n\t"\
267 "pxor %%mm5, %%mm5 \n\t"\
268 "pxor %%mm6, %%mm6 \n\t"\
269 "pxor %%mm7, %%mm7 \n\t"\
272 "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
273 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
274 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
275 "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
276 "movq %%mm0, %%mm3 \n\t"\
277 "punpcklwd %%mm1, %%mm0 \n\t"\
278 "punpckhwd %%mm1, %%mm3 \n\t"\
279 "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
280 "pmaddwd %%mm1, %%mm0 \n\t"\
281 "pmaddwd %%mm1, %%mm3 \n\t"\
282 "paddd %%mm0, %%mm4 \n\t"\
283 "paddd %%mm3, %%mm5 \n\t"\
284 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
285 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
286 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
287 "test %%"REG_S", %%"REG_S" \n\t"\
288 "movq %%mm2, %%mm0 \n\t"\
289 "punpcklwd %%mm3, %%mm2 \n\t"\
290 "punpckhwd %%mm3, %%mm0 \n\t"\
291 "pmaddwd %%mm1, %%mm2 \n\t"\
292 "pmaddwd %%mm1, %%mm0 \n\t"\
293 "paddd %%mm2, %%mm6 \n\t"\
294 "paddd %%mm0, %%mm7 \n\t"\
296 "psrad $16, %%mm4 \n\t"\
297 "psrad $16, %%mm5 \n\t"\
298 "psrad $16, %%mm6 \n\t"\
299 "psrad $16, %%mm7 \n\t"\
300 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
301 "packssdw %%mm5, %%mm4 \n\t"\
302 "packssdw %%mm7, %%mm6 \n\t"\
303 "paddw %%mm0, %%mm4 \n\t"\
304 "paddw %%mm0, %%mm6 \n\t"\
305 "movq %%mm4, "U_TEMP"(%0) \n\t"\
306 "movq %%mm6, "V_TEMP"(%0) \n\t"\
308 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
309 "mov (%%"REG_d"), %%"REG_S" \n\t"\
310 "pxor %%mm1, %%mm1 \n\t"\
311 "pxor %%mm5, %%mm5 \n\t"\
312 "pxor %%mm7, %%mm7 \n\t"\
313 "pxor %%mm6, %%mm6 \n\t"\
316 "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
317 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
318 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
319 "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
320 "movq %%mm0, %%mm3 \n\t"\
321 "punpcklwd %%mm4, %%mm0 \n\t"\
322 "punpckhwd %%mm4, %%mm3 \n\t"\
323 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
324 "pmaddwd %%mm4, %%mm0 \n\t"\
325 "pmaddwd %%mm4, %%mm3 \n\t"\
326 "paddd %%mm0, %%mm1 \n\t"\
327 "paddd %%mm3, %%mm5 \n\t"\
328 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
329 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
330 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
331 "test %%"REG_S", %%"REG_S" \n\t"\
332 "movq %%mm2, %%mm0 \n\t"\
333 "punpcklwd %%mm3, %%mm2 \n\t"\
334 "punpckhwd %%mm3, %%mm0 \n\t"\
335 "pmaddwd %%mm4, %%mm2 \n\t"\
336 "pmaddwd %%mm4, %%mm0 \n\t"\
337 "paddd %%mm2, %%mm7 \n\t"\
338 "paddd %%mm0, %%mm6 \n\t"\
340 "psrad $16, %%mm1 \n\t"\
341 "psrad $16, %%mm5 \n\t"\
342 "psrad $16, %%mm7 \n\t"\
343 "psrad $16, %%mm6 \n\t"\
344 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
345 "packssdw %%mm5, %%mm1 \n\t"\
346 "packssdw %%mm6, %%mm7 \n\t"\
347 "paddw %%mm0, %%mm1 \n\t"\
348 "paddw %%mm0, %%mm7 \n\t"\
349 "movq "U_TEMP"(%0), %%mm3 \n\t"\
350 "movq "V_TEMP"(%0), %%mm4 \n\t"\
/*
 * YSCALEYUV2RGBX: YUV to RGB conversion stage for the filtered values left by
 * YSCALEYUV2PACKEDX(_ACCURATE).
 * In:  mm1 = Y1 (4 words), mm7 = Y2 (4 words), mm3 = U, mm4 = V; %0 is the
 *      base for the U/V/Y offsets and the UG/VG/UB/VR/Y coefficients.
 * Out: mm2 = 8 blue bytes, mm4 = 8 green bytes, mm5 = 8 red bytes (packed
 *      with unsigned saturation), mm7 = 0.
 * Each chroma word is duplicated (punpck[lh]wd with itself) so that one chroma
 * sample is applied to two neighbouring luma samples.
 */
352 #define YSCALEYUV2RGBX \
353 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
354 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
355 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
356 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
357 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
358 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
359 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
360 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
361 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
362 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
363 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
364 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
365 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
366 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
367 "paddw %%mm3, %%mm4 \n\t"\
368 "movq %%mm2, %%mm0 \n\t"\
369 "movq %%mm5, %%mm6 \n\t"\
370 "movq %%mm4, %%mm3 \n\t"\
371 "punpcklwd %%mm2, %%mm2 \n\t"\
372 "punpcklwd %%mm5, %%mm5 \n\t"\
373 "punpcklwd %%mm4, %%mm4 \n\t"\
374 "paddw %%mm1, %%mm2 \n\t"\
375 "paddw %%mm1, %%mm5 \n\t"\
376 "paddw %%mm1, %%mm4 \n\t"\
377 "punpckhwd %%mm0, %%mm0 \n\t"\
378 "punpckhwd %%mm6, %%mm6 \n\t"\
379 "punpckhwd %%mm3, %%mm3 \n\t"\
380 "paddw %%mm7, %%mm0 \n\t"\
381 "paddw %%mm7, %%mm6 \n\t"\
382 "paddw %%mm7, %%mm3 \n\t"\
383 /* mm2=B1, mm0=B2, mm4=G1, mm3=G2, mm5=R1, mm6=R2 (1 = added to Y1, 2 = added to Y2) */\
384 "packuswb %%mm0, %%mm2 \n\t"\
385 "packuswb %%mm6, %%mm5 \n\t"\
386 "packuswb %%mm3, %%mm4 \n\t"\
387 "pxor %%mm7, %%mm7 \n\t"
/*
 * REAL_YSCALEYUV2PACKED(index, c): blends two source lines (bilinear vertical
 * scaling) for packed YUV output.
 *  - Per the inline comments: %0 = buf0, %1 = buf1 (luma), %2 = uvbuf0,
 *    %3 = uvbuf1 (chroma, V part at byte offset VOF); `c` is the context base
 *    register and `index` the loop index register, which is zeroed here.
 *  - Side effect: the qwords at CHR_MMX_FILTER_OFFSET+8(c) and
 *    LUM_MMX_FILTER_OFFSET+8(c) are shifted right by 3 and written back
 *    before the loop, so the stored blend factors are modified in memory.
 *  - Per element: pmulhw(line0 - line1, factor) + (line1 >> 7).
 *  - Out: mm3 = U, mm4 = V, mm1 = Y1, mm7 = Y2 (words).
 * YSCALEYUV2PACKED forwards to it so that the arguments are macro-expanded
 * before being stringified.
 * NOTE(review): the loop label after the `xor` is not visible in this excerpt.
 */
389 #define REAL_YSCALEYUV2PACKED(index, c) \
390 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
391 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
392 "psraw $3, %%mm0 \n\t"\
393 "psraw $3, %%mm1 \n\t"\
394 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
395 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
396 "xor "#index", "#index" \n\t"\
399 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
400 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
401 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
402 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
403 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
404 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
405 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
406 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
407 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
408 "psraw $7, %%mm3 \n\t" /* uvbuf1[eax] >>7*/\
409 "psraw $7, %%mm4 \n\t" /* uvbuf1[eax+2048] >>7*/\
410 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
411 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
412 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
413 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
414 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
415 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
416 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
417 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
418 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
419 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
420 "psraw $7, %%mm1 \n\t" /* buf1[eax] >>7*/\
421 "psraw $7, %%mm7 \n\t" /* buf1[eax] >>7*/\
422 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
423 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
425 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
/*
 * REAL_YSCALEYUV2RGB(index, c): blends two source lines (bilinear vertical
 * scaling) and converts the result to RGB.
 *  - Per the inline comments: %0 = buf0, %1 = buf1, %2 = uvbuf0, %3 = uvbuf1;
 *    `c` is the context base register, `index` the loop index (zeroed here).
 *  - Blend per element: pmulhw(line0 - line1, factor) + (line1 >> 4), with the
 *    factors read from CHR/LUM_MMX_FILTER_OFFSET+8(c).
 *  - The colour conversion is the same sequence as in YSCALEYUV2RGBX, reading
 *    offsets and coefficients relative to `c`.
 *  - Out: mm2 = 8 blue bytes, mm4 = 8 green bytes, mm5 = 8 red bytes, mm7 = 0.
 * YSCALEYUV2RGB forwards to it so that the arguments are macro-expanded before
 * being stringified.
 * NOTE(review): the loop label after the `xor` is not visible in this excerpt.
 */
427 #define REAL_YSCALEYUV2RGB(index, c) \
428 "xor "#index", "#index" \n\t"\
431 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
432 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
433 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
434 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
435 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
436 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
437 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
438 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
439 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
440 "psraw $4, %%mm3 \n\t" /* uvbuf1[eax] >>4*/\
441 "psraw $4, %%mm4 \n\t" /* uvbuf1[eax+2048] >>4*/\
442 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
443 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
444 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
445 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
446 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
447 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
448 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
449 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
450 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
451 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
452 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
453 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
454 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
455 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
456 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
457 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
458 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
459 "psraw $4, %%mm1 \n\t" /* buf1[eax] >>4*/\
460 "psraw $4, %%mm7 \n\t" /* buf1[eax] >>4*/\
461 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
462 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
463 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
464 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
465 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
466 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
467 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
468 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
469 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
470 "paddw %%mm3, %%mm4 \n\t"\
471 "movq %%mm2, %%mm0 \n\t"\
472 "movq %%mm5, %%mm6 \n\t"\
473 "movq %%mm4, %%mm3 \n\t"\
474 "punpcklwd %%mm2, %%mm2 \n\t"\
475 "punpcklwd %%mm5, %%mm5 \n\t"\
476 "punpcklwd %%mm4, %%mm4 \n\t"\
477 "paddw %%mm1, %%mm2 \n\t"\
478 "paddw %%mm1, %%mm5 \n\t"\
479 "paddw %%mm1, %%mm4 \n\t"\
480 "punpckhwd %%mm0, %%mm0 \n\t"\
481 "punpckhwd %%mm6, %%mm6 \n\t"\
482 "punpckhwd %%mm3, %%mm3 \n\t"\
483 "paddw %%mm7, %%mm0 \n\t"\
484 "paddw %%mm7, %%mm6 \n\t"\
485 "paddw %%mm7, %%mm3 \n\t"\
486 /* mm2=B1, mm0=B2, mm4=G1, mm3=G2, mm5=R1, mm6=R2 (1 = added to Y1, 2 = added to Y2) */\
487 "packuswb %%mm0, %%mm2 \n\t"\
488 "packuswb %%mm6, %%mm5 \n\t"\
489 "packuswb %%mm3, %%mm4 \n\t"\
490 "pxor %%mm7, %%mm7 \n\t"
491 #define YSCALEYUV2RGB(index, c) REAL_YSCALEYUV2RGB(index, c)
/*
 * REAL_YSCALEYUV2PACKED1(index, c): single-source-line variant for packed YUV
 * output; no blending and no filter factors (`c` is unused in the body).
 * Zeroes `index`, then loads chroma from %2 (U at +0, V at byte offset VOF)
 * and luma from %0 and shifts every word right by 7.
 * Out: mm3 = U, mm4 = V, mm1 = Y1, mm7 = Y2 (words).
 * YSCALEYUV2PACKED1 forwards to it so that the arguments are macro-expanded
 * before being stringified.
 * NOTE(review): the loop label after the `xor` is not visible in this excerpt.
 */
493 #define REAL_YSCALEYUV2PACKED1(index, c) \
494 "xor "#index", "#index" \n\t"\
497 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
498 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
499 "psraw $7, %%mm3 \n\t" \
500 "psraw $7, %%mm4 \n\t" \
501 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
502 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
503 "psraw $7, %%mm1 \n\t" \
504 "psraw $7, %%mm7 \n\t" \
506 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
/*
 * REAL_YSCALEYUV2RGB1(index, c): single-source-line YUV to RGB conversion.
 * Zeroes `index`, loads chroma from %2 (U at +0, V at byte offset VOF) and
 * luma from %0, shifts every word right by 4 and then runs the same colour
 * conversion sequence as REAL_YSCALEYUV2RGB, reading offsets and coefficients
 * relative to `c`.
 * Out: mm2 = 8 blue bytes, mm4 = 8 green bytes, mm5 = 8 red bytes, mm7 = 0.
 * YSCALEYUV2RGB1 forwards to it so that the arguments are macro-expanded
 * before being stringified.
 * NOTE(review): the loop label after the `xor` is not visible in this excerpt.
 */
508 #define REAL_YSCALEYUV2RGB1(index, c) \
509 "xor "#index", "#index" \n\t"\
512 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
513 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
514 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] >>4*/\
515 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] >>4*/\
516 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
517 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
518 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
519 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
520 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
521 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
522 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
523 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
524 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
525 "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
526 "psraw $4, %%mm7 \n\t" /* buf0[eax] >>4*/\
527 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
528 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
529 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
530 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
531 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
532 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
533 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
534 "paddw %%mm3, %%mm4 \n\t"\
535 "movq %%mm2, %%mm0 \n\t"\
536 "movq %%mm5, %%mm6 \n\t"\
537 "movq %%mm4, %%mm3 \n\t"\
538 "punpcklwd %%mm2, %%mm2 \n\t"\
539 "punpcklwd %%mm5, %%mm5 \n\t"\
540 "punpcklwd %%mm4, %%mm4 \n\t"\
541 "paddw %%mm1, %%mm2 \n\t"\
542 "paddw %%mm1, %%mm5 \n\t"\
543 "paddw %%mm1, %%mm4 \n\t"\
544 "punpckhwd %%mm0, %%mm0 \n\t"\
545 "punpckhwd %%mm6, %%mm6 \n\t"\
546 "punpckhwd %%mm3, %%mm3 \n\t"\
547 "paddw %%mm7, %%mm0 \n\t"\
548 "paddw %%mm7, %%mm6 \n\t"\
549 "paddw %%mm7, %%mm3 \n\t"\
550 /* mm2=B1, mm0=B2, mm4=G1, mm3=G2, mm5=R1, mm6=R2 (1 = added to Y1, 2 = added to Y2) */\
551 "packuswb %%mm0, %%mm2 \n\t"\
552 "packuswb %%mm6, %%mm5 \n\t"\
553 "packuswb %%mm3, %%mm4 \n\t"\
554 "pxor %%mm7, %%mm7 \n\t"
555 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
/*
 * REAL_YSCALEYUV2PACKED1b(index, c): like REAL_YSCALEYUV2PACKED1, but chroma
 * is the sum of two lines (%2 and %3) shifted right logically by 8, i.e. their
 * average shifted right by 7. Luma comes from %0 only and is shifted right by
 * 7. `c` is unused in the body.
 * Out: mm3 = U, mm4 = V, mm1 = Y1, mm7 = Y2 (words).
 * YSCALEYUV2PACKED1b forwards to it so that the arguments are macro-expanded
 * before being stringified.
 * NOTE(review): the loop label after the `xor` is not visible in this excerpt.
 */
557 #define REAL_YSCALEYUV2PACKED1b(index, c) \
558 "xor "#index", "#index" \n\t"\
561 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
562 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
563 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
564 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
565 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
566 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
567 "psrlw $8, %%mm3 \n\t" \
568 "psrlw $8, %%mm4 \n\t" \
569 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
570 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
571 "psraw $7, %%mm1 \n\t" \
572 "psraw $7, %%mm7 \n\t"
573 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
575 // do vertical chrominance interpolation
/*
 * REAL_YSCALEYUV2RGB1b(index, c): like REAL_YSCALEYUV2RGB1, but chroma is the
 * sum of two lines (%2 and %3) shifted right logically by 5, i.e. their
 * average shifted right by 4. Luma comes from %0 only and is shifted right by
 * 4. The colour conversion sequence is the same as in REAL_YSCALEYUV2RGB.
 * Out: mm2 = 8 blue bytes, mm4 = 8 green bytes, mm5 = 8 red bytes, mm7 = 0.
 * YSCALEYUV2RGB1b forwards to it so that the arguments are macro-expanded
 * before being stringified.
 * NOTE(review): the loop label after the `xor` is not visible in this excerpt.
 */
576 #define REAL_YSCALEYUV2RGB1b(index, c) \
577 "xor "#index", "#index" \n\t"\
580 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
581 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
582 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
583 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
584 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
585 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
586 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
587 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
588 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
589 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
590 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
591 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
592 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
593 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
594 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
595 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
596 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
597 "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
598 "psraw $4, %%mm7 \n\t" /* buf0[eax] >>4*/\
599 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
600 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
601 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
602 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
603 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
604 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
605 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
606 "paddw %%mm3, %%mm4 \n\t"\
607 "movq %%mm2, %%mm0 \n\t"\
608 "movq %%mm5, %%mm6 \n\t"\
609 "movq %%mm4, %%mm3 \n\t"\
610 "punpcklwd %%mm2, %%mm2 \n\t"\
611 "punpcklwd %%mm5, %%mm5 \n\t"\
612 "punpcklwd %%mm4, %%mm4 \n\t"\
613 "paddw %%mm1, %%mm2 \n\t"\
614 "paddw %%mm1, %%mm5 \n\t"\
615 "paddw %%mm1, %%mm4 \n\t"\
616 "punpckhwd %%mm0, %%mm0 \n\t"\
617 "punpckhwd %%mm6, %%mm6 \n\t"\
618 "punpckhwd %%mm3, %%mm3 \n\t"\
619 "paddw %%mm7, %%mm0 \n\t"\
620 "paddw %%mm7, %%mm6 \n\t"\
621 "paddw %%mm7, %%mm3 \n\t"\
622 /* mm2=B1, mm0=B2, mm4=G1, mm3=G2, mm5=R1, mm6=R2 (1 = added to Y1, 2 = added to Y2) */\
623 "packuswb %%mm0, %%mm2 \n\t"\
624 "packuswb %%mm6, %%mm5 \n\t"\
625 "packuswb %%mm3, %%mm4 \n\t"\
626 "pxor %%mm7, %%mm7 \n\t"
627 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
/*
 * REAL_WRITEBGR32(dst, dstw, index): store stage for 32-bit pixels.
 * In: mm2 = 8 B bytes, mm4 = 8 G bytes, mm5 = 8 R bytes, mm7 = 0.
 * Bytes are interleaved into 8 dwords laid out in memory as B, G, R, 0 and
 * written as four qwords at dst + index*4 via MOVNTQ. `index` is then advanced
 * by 8 and compared with `dstw`.
 * Clobbers mm0..mm3, mm5 and mm6.
 * WRITEBGR32 forwards to it so that the arguments are macro-expanded before
 * being stringified.
 * NOTE(review): the jump that consumes the `cmp` result is not visible in this
 * excerpt.
 */
629 #define REAL_WRITEBGR32(dst, dstw, index) \
630 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
631 "movq %%mm2, %%mm1 \n\t" /* B */\
632 "movq %%mm5, %%mm6 \n\t" /* R */\
633 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
634 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
635 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
636 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
637 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
638 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
639 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
640 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
641 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
642 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
644 MOVNTQ(%%mm0, (dst, index, 4))\
645 MOVNTQ(%%mm2, 8(dst, index, 4))\
646 MOVNTQ(%%mm1, 16(dst, index, 4))\
647 MOVNTQ(%%mm3, 24(dst, index, 4))\
649 "add $8, "#index" \n\t"\
650 "cmp "#dstw", "#index" \n\t"\
652 #define WRITEBGR32(dst, dstw, index) REAL_WRITEBGR32(dst, dstw, index)
/*
 * REAL_WRITERGB16(dst, dstw, index): store stage for 16-bit pixels.
 * In: mm2 = B bytes, mm4 = G bytes, mm5 = R bytes, mm7 = 0.
 * B and R are masked with bF8 and G with bFC (presumably keeping the top 5 and
 * 6 bits of each byte - the constants are defined elsewhere, confirm). B is
 * shifted right by 3 and interleaved with R as low/high byte of each word; G
 * is widened to words, shifted left by 3 and ORed in. Two qwords (8 pixels)
 * are written at dst + index*2, then `index` is advanced by 8 and compared
 * with `dstw`.
 * WRITERGB16 forwards to it so that the arguments are macro-expanded before
 * being stringified.
 * NOTE(review): the jump that consumes the `cmp` result is not visible in this
 * excerpt.
 */
654 #define REAL_WRITERGB16(dst, dstw, index) \
655 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
656 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
657 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
658 "psrlq $3, %%mm2 \n\t"\
660 "movq %%mm2, %%mm1 \n\t"\
661 "movq %%mm4, %%mm3 \n\t"\
663 "punpcklbw %%mm7, %%mm3 \n\t"\
664 "punpcklbw %%mm5, %%mm2 \n\t"\
665 "punpckhbw %%mm7, %%mm4 \n\t"\
666 "punpckhbw %%mm5, %%mm1 \n\t"\
668 "psllq $3, %%mm3 \n\t"\
669 "psllq $3, %%mm4 \n\t"\
671 "por %%mm3, %%mm2 \n\t"\
672 "por %%mm4, %%mm1 \n\t"\
674 MOVNTQ(%%mm2, (dst, index, 2))\
675 MOVNTQ(%%mm1, 8(dst, index, 2))\
677 "add $8, "#index" \n\t"\
678 "cmp "#dstw", "#index" \n\t"\
680 #define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
/*
 * REAL_WRITERGB15(dst, dstw, index): store stage for 15-bit pixels.
 * Same structure as REAL_WRITERGB16 with three differences: G is masked with
 * bF8 instead of bFC, R is additionally shifted right by 1, and G is shifted
 * left by 2 instead of 3 before being ORed in.
 * In: mm2 = B bytes, mm4 = G bytes, mm5 = R bytes, mm7 = 0.
 * Two qwords (8 pixels) are written at dst + index*2, then `index` is advanced
 * by 8 and compared with `dstw`.
 * WRITERGB15 forwards to it so that the arguments are macro-expanded before
 * being stringified.
 * NOTE(review): the jump that consumes the `cmp` result is not visible in this
 * excerpt.
 */
682 #define REAL_WRITERGB15(dst, dstw, index) \
683 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
684 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
685 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
686 "psrlq $3, %%mm2 \n\t"\
687 "psrlq $1, %%mm5 \n\t"\
689 "movq %%mm2, %%mm1 \n\t"\
690 "movq %%mm4, %%mm3 \n\t"\
692 "punpcklbw %%mm7, %%mm3 \n\t"\
693 "punpcklbw %%mm5, %%mm2 \n\t"\
694 "punpckhbw %%mm7, %%mm4 \n\t"\
695 "punpckhbw %%mm5, %%mm1 \n\t"\
697 "psllq $2, %%mm3 \n\t"\
698 "psllq $2, %%mm4 \n\t"\
700 "por %%mm3, %%mm2 \n\t"\
701 "por %%mm4, %%mm1 \n\t"\
703 MOVNTQ(%%mm2, (dst, index, 2))\
704 MOVNTQ(%%mm1, 8(dst, index, 2))\
706 "add $8, "#index" \n\t"\
707 "cmp "#dstw", "#index" \n\t"\
709 #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
/*
 * WRITEBGR24OLD(dst, dstw, index): store stage for 24-bit pixels using byte
 * masks.
 * In: mm2 = B bytes, mm4 = G bytes, mm5 = R bytes, mm7 = 0.
 * First builds four qwords of 0RGB dwords (as in REAL_WRITEBGR32), then
 * removes the padding byte of each pixel with the bm00000111 / bm11111000 /
 * bm00001111 masks and shifts, producing three qwords (24 bytes, 8 pixels).
 * These are written at dst, dst+8 and dst+16; `dst` is advanced by 24 and
 * `index` by 8, then `index` is compared with `dstw`.
 * Unlike the 16/32-bit writers, `dst` itself is modified instead of being
 * indexed.
 * NOTE(review): the jump that consumes the `cmp` result is not visible in this
 * excerpt.
 */
711 #define WRITEBGR24OLD(dst, dstw, index) \
712 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
713 "movq %%mm2, %%mm1 \n\t" /* B */\
714 "movq %%mm5, %%mm6 \n\t" /* R */\
715 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
716 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
717 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
718 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
719 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
720 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
721 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
722 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
723 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
724 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
726 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
727 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
728 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
729 "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
730 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
731 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
732 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
733 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
735 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
736 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
737 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
738 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
739 "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
740 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
741 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
742 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
743 "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
744 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
745 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
746 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
747 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
749 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
750 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
751 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
752 "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
753 "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
754 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
755 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
756 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
758 MOVNTQ(%%mm0, (dst))\
759 MOVNTQ(%%mm2, 8(dst))\
760 MOVNTQ(%%mm3, 16(dst))\
761 "add $24, "#dst" \n\t"\
763 "add $8, "#index" \n\t"\
764 "cmp "#dstw", "#index" \n\t"\
/*
 * WRITEBGR24MMX(dst, dstw, index): store stage for 24-bit pixels that uses
 * only shifts and unpack instructions (no mask constants).
 * In: mm2 = B bytes, mm4 = G bytes, mm5 = R bytes, mm7 = 0.
 * Builds four qwords of 0RGB dwords, squeezes each qword to 6 payload bytes
 * with psllq $40 + punpckhdq, then stitches the four results into three
 * qwords (24 bytes, 8 pixels) written at dst, dst+8 and dst+16.
 * `dst` is advanced by 24 and `index` by 8, then `index` is compared with
 * `dstw`. mm7 is overwritten, so it is no longer zero afterwards.
 * NOTE(review): the jump that consumes the `cmp` result is not visible in this
 * excerpt.
 */
767 #define WRITEBGR24MMX(dst, dstw, index) \
768 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
769 "movq %%mm2, %%mm1 \n\t" /* B */\
770 "movq %%mm5, %%mm6 \n\t" /* R */\
771 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
772 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
773 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
774 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
775 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
776 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
777 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
778 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
779 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
780 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
782 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
783 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
784 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
785 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
787 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
788 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
789 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
790 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
792 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
793 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
794 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
795 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
797 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
798 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
799 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
800 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
801 MOVNTQ(%%mm0, (dst))\
803 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
804 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
805 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
806 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
807 MOVNTQ(%%mm6, 8(dst))\
809 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
810 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
811 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
812 MOVNTQ(%%mm5, 16(dst))\
814 "add $24, "#dst" \n\t"\
816 "add $8, "#index" \n\t"\
817 "cmp "#dstw", "#index" \n\t"\
/*
 * WRITEBGR24MMX2(dst, dstw, index): store stage for 24-bit pixels built on
 * pshufw and the ff_M24A / ff_M24B / ff_M24C byte masks.
 * In: mm2 = B bytes, mm4 = G bytes, mm5 = R bytes (mm7 is overwritten with
 * ff_M24C, mm0 with ff_M24A).
 * Each of the three output qwords is assembled by shuffling the needed words
 * of B, G and R into place, masking each with its byte pattern and ORing the
 * three together; G is shifted by one byte (psllq $8 for the first qword,
 * psrlq $8 on mm4 for the remaining two) to line up with its slot.
 * The qwords are written at dst, dst+8 and dst+16; `dst` is advanced by 24
 * and `index` by 8, then `index` is compared with `dstw`.
 * NOTE(review): the jump that consumes the `cmp` result is not visible in this
 * excerpt.
 */
820 #define WRITEBGR24MMX2(dst, dstw, index) \
821 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
822 "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
823 "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
824 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
825 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
826 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
828 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
829 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
830 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
832 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
833 "por %%mm1, %%mm6 \n\t"\
834 "por %%mm3, %%mm6 \n\t"\
835 MOVNTQ(%%mm6, (dst))\
837 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
838 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
839 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
840 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
842 "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
843 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
844 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
846 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
847 "por %%mm3, %%mm6 \n\t"\
848 MOVNTQ(%%mm6, 8(dst))\
850 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
851 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
852 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
854 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
855 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
856 "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
858 "por %%mm1, %%mm3 \n\t"\
859 "por %%mm3, %%mm6 \n\t"\
860 MOVNTQ(%%mm6, 16(dst))\
862 "add $24, "#dst" \n\t"\
864 "add $8, "#index" \n\t"\
865 "cmp "#dstw", "#index" \n\t"\
/*
 * WRITEBGR24 is the name used by callers; it maps either to the pshufw-based
 * WRITEBGR24MMX2 or to the shift/unpack-based WRITEBGR24MMX.
 * NOTE(review): the preprocessor conditional choosing between the two
 * definitions is not visible in this excerpt.
 */
870 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
873 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
/*
 * REAL_WRITEYUY2(dst, dstw, index): store stage for packed YUY2 output.
 * In: mm1 = Y1 words, mm7 = Y2 words, mm3 = U words, mm4 = V words (the
 * register layout left by the YSCALEYUV2PACKED* macros).
 * U and V are packed to bytes and interleaved (U V U V ...), the 8 luma words
 * are packed to bytes, and luma is interleaved with the chroma pairs, giving
 * Y U Y V byte order. Two qwords (8 pixels) are written at dst + index*2,
 * then `index` is advanced by 8 and compared with `dstw`.
 * WRITEYUY2 forwards to it so that the arguments are macro-expanded before
 * being stringified.
 * NOTE(review): the jump that consumes the `cmp` result is not visible in this
 * excerpt.
 */
876 #define REAL_WRITEYUY2(dst, dstw, index) \
877 "packuswb %%mm3, %%mm3 \n\t"\
878 "packuswb %%mm4, %%mm4 \n\t"\
879 "packuswb %%mm7, %%mm1 \n\t"\
880 "punpcklbw %%mm4, %%mm3 \n\t"\
881 "movq %%mm1, %%mm7 \n\t"\
882 "punpcklbw %%mm3, %%mm1 \n\t"\
883 "punpckhbw %%mm3, %%mm7 \n\t"\
885 MOVNTQ(%%mm1, (dst, index, 2))\
886 MOVNTQ(%%mm7, 8(dst, index, 2))\
888 "add $8, "#index" \n\t"\
889 "cmp "#dstw", "#index" \n\t"\
891 #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
/* Vertical multi-tap filter writing planar output (dest, uDest, vDest).
 *
 * Visible code paths:
 *  - asm path, taken only when SWS_BITEXACT is NOT set in c->flags: expands
 *    YSCALEYUV2YV12X_ACCURATE (if SWS_ACCURATE_RND is set) or YSCALEYUV2YV12X,
 *    once for uDest and once for vDest over chrDstW samples (the V expansion
 *    uses the VOF source offset), and once for dest over dstW samples;
 *  - yuv2yuvX_altivec_real(), receiving every argument except c;
 *  - yuv2yuvXinC(), the portable fallback, with the same argument list.
 *
 * NOTE(review): the #if/#else guards, braces and any NULL check on uDest that
 * decide which path runs are not visible in this excerpt. */
894 static inline void RENAME(yuv2yuvX
)(SwsContext
*c
, int16_t *lumFilter
, int16_t **lumSrc
, int lumFilterSize
,
895 int16_t *chrFilter
, int16_t **chrSrc
, int chrFilterSize
,
896 uint8_t *dest
, uint8_t *uDest
, uint8_t *vDest
, long dstW
, long chrDstW
)
899 if(!(c
->flags
& SWS_BITEXACT
)){
900 if (c
->flags
& SWS_ACCURATE_RND
){
902 YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET
, uDest
, chrDstW
)
903 YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF
), CHR_MMX_FILTER_OFFSET
, vDest
, chrDstW
)
906 YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET
, dest
, dstW
)
909 YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET
, uDest
, chrDstW
)
910 YSCALEYUV2YV12X(AV_STRINGIFY(VOF
), CHR_MMX_FILTER_OFFSET
, vDest
, chrDstW
)
913 YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET
, dest
, dstW
)
/* AltiVec implementation */
919 yuv2yuvX_altivec_real(lumFilter
, lumSrc
, lumFilterSize
,
920 chrFilter
, chrSrc
, chrFilterSize
,
921 dest
, uDest
, vDest
, dstW
, chrDstW
);
/* plain C implementation */
923 yuv2yuvXinC(lumFilter
, lumSrc
, lumFilterSize
,
924 chrFilter
, chrSrc
, chrFilterSize
,
925 dest
, uDest
, vDest
, dstW
, chrDstW
);
926 #endif //!HAVE_ALTIVEC
/* Vertical multi-tap filter for output with a single chroma destination (uDest).
 * No optimized variant is visible here: every argument except the context c is
 * forwarded unchanged to yuv2nv12XinC(), including dstFormat. */
929 static inline void RENAME(yuv2nv12X
)(SwsContext
*c
, int16_t *lumFilter
, int16_t **lumSrc
, int lumFilterSize
,
930 int16_t *chrFilter
, int16_t **chrSrc
, int chrFilterSize
,
931 uint8_t *dest
, uint8_t *uDest
, int dstW
, int chrDstW
, int dstFormat
)
933 yuv2nv12XinC(lumFilter
, lumSrc
, lumFilterSize
,
934 chrFilter
, chrSrc
, chrFilterSize
,
935 dest
, uDest
, dstW
, chrDstW
, dstFormat
);
/* Writes one already-filtered line to planar output without vertical filtering.
 *
 * asm path (only when SWS_BITEXACT is not set): p is 3 when uDest is non-NULL
 * and 1 otherwise; src[], dst[] and counter[] describe up to three planes
 * (luma, U, V). The source pointers and the "dst[p] + counter[p]" operand are
 * biased to the END of each plane.
 * NOTE(review): presumably the asm runs a negative index up to zero and a loop
 * over p (not visible here) walks the planes -- confirm.
 *
 * C path: each sample is converted with (x + 64) >> 7, i.e. rounded removal of
 * 7 fractional bits; the V samples are read at chrSrc[i + VOFW]. An upper clamp
 * to 255 is visible for u and v; the other clamp branches and the stores are
 * not visible in this excerpt. */
938 static inline void RENAME(yuv2yuv1
)(SwsContext
*c
, int16_t *lumSrc
, int16_t *chrSrc
,
939 uint8_t *dest
, uint8_t *uDest
, uint8_t *vDest
, long dstW
, long chrDstW
)
943 if(!(c
->flags
& SWS_BITEXACT
)){
944 long p
= uDest
? 3 : 1;
/* NOTE(review): the initialisers below are int16_t* expressions (pointer
 * arithmetic is done in int16_t units) stored into uint8_t* elements; this is
 * an incompatible-pointer conversion -- an int16_t* array would be cleaner. */
945 uint8_t *src
[3]= {lumSrc
+ dstW
, chrSrc
+ chrDstW
, chrSrc
+ VOFW
+ chrDstW
};
946 uint8_t *dst
[3]= {dest
, uDest
, vDest
};
947 long counter
[3] = {dstW
, chrDstW
, chrDstW
};
949 if (c
->flags
& SWS_ACCURATE_RND
){
952 YSCALEYUV2YV121_ACCURATE
953 :: "r" (src
[p
]), "r" (dst
[p
] + counter
[p
]),
962 :: "r" (src
[p
]), "r" (dst
[p
] + counter
[p
]),
/* C fallback: luma */
971 for (i
=0; i
<dstW
; i
++)
973 int val
= (lumSrc
[i
]+64)>>7;
/* C fallback: chroma */
984 for (i
=0; i
<chrDstW
; i
++)
986 int u
=(chrSrc
[i
]+64)>>7;
987 int v
=(chrSrc
[i
+ VOFW
]+64)>>7;
991 else if (u
>255) u
=255;
993 else if (v
>255) v
=255;
1003 * vertical scale YV12 to RGB
/* Vertical multi-tap filter producing one line of packed output at dest.
 *
 * MMX path (only when SWS_BITEXACT is not set in c->flags): there are two
 * parallel switch(c->dstFormat) blocks, the first for SWS_ACCURATE_RND (using
 * YSCALEYUV2PACKEDX_ACCURATE) and the second for the normal rounding mode.
 * Each has a 32-bit writer (WRITEBGR32), a 24-bit writer (WRITEBGR24, with the
 * destination pointer computed as dest + 3*index in REG_c), RGB555 and RGB565
 * (dither constants are added with paddusb before WRITERGB15/WRITERGB16) and
 * YUYV422 (psraw $3 on mm1/mm3/mm4/mm7 followed by WRITEYUY2).
 * In these asm statements %0 is &c->redDither, %4 is dest and %5 is dstW;
 * %1..%3 are dummy memory operands.
 *
 * AltiVec path: used when SWS_BITEXACT is not set and dstFormat is one of
 * ABGR, BGRA, BGR24, RGB24, RGBA, ARGB. Otherwise yuv2packedXinC() is called.
 *
 * NOTE(review): several case labels, the asm statement openers and the
 * preprocessor guards are not visible in this excerpt. */
1005 static inline void RENAME(yuv2packedX
)(SwsContext
*c
, int16_t *lumFilter
, int16_t **lumSrc
, int lumFilterSize
,
1006 int16_t *chrFilter
, int16_t **chrSrc
, int chrFilterSize
,
1007 uint8_t *dest
, long dstW
, long dstY
)
1011 if(!(c
->flags
& SWS_BITEXACT
)){
1012 if (c
->flags
& SWS_ACCURATE_RND
){
1013 switch(c
->dstFormat
){
1015 YSCALEYUV2PACKEDX_ACCURATE
1017 WRITEBGR32(%4, %5, %%REGa
)
1019 YSCALEYUV2PACKEDX_END
1022 YSCALEYUV2PACKEDX_ACCURATE
1024 "lea (%%"REG_a
", %%"REG_a
", 2), %%"REG_c
"\n\t" //FIXME optimize
1025 "add %4, %%"REG_c
" \n\t"
1026 WRITEBGR24(%%REGc
, %5, %%REGa
)
1029 :: "r" (&c
->redDither
),
1030 "m" (dummy
), "m" (dummy
), "m" (dummy
),
1031 "r" (dest
), "m" (dstW
)
1032 : "%"REG_a
, "%"REG_c
, "%"REG_d
, "%"REG_S
1035 case PIX_FMT_RGB555
:
1036 YSCALEYUV2PACKEDX_ACCURATE
1038 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1040 "paddusb "BLUE_DITHER
"(%0), %%mm2\n\t"
1041 "paddusb "GREEN_DITHER
"(%0), %%mm4\n\t"
1042 "paddusb "RED_DITHER
"(%0), %%mm5\n\t"
1045 WRITERGB15(%4, %5, %%REGa
)
1046 YSCALEYUV2PACKEDX_END
1048 case PIX_FMT_RGB565
:
1049 YSCALEYUV2PACKEDX_ACCURATE
1051 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1053 "paddusb "BLUE_DITHER
"(%0), %%mm2\n\t"
1054 "paddusb "GREEN_DITHER
"(%0), %%mm4\n\t"
1055 "paddusb "RED_DITHER
"(%0), %%mm5\n\t"
1058 WRITERGB16(%4, %5, %%REGa
)
1059 YSCALEYUV2PACKEDX_END
1061 case PIX_FMT_YUYV422
:
1062 YSCALEYUV2PACKEDX_ACCURATE
1063 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1065 "psraw $3, %%mm3 \n\t"
1066 "psraw $3, %%mm4 \n\t"
1067 "psraw $3, %%mm1 \n\t"
1068 "psraw $3, %%mm7 \n\t"
1069 WRITEYUY2(%4, %5, %%REGa
)
1070 YSCALEYUV2PACKEDX_END
1074 switch(c
->dstFormat
)
1079 WRITEBGR32(%4, %5, %%REGa
)
1080 YSCALEYUV2PACKEDX_END
1085 "lea (%%"REG_a
", %%"REG_a
", 2), %%"REG_c
" \n\t" //FIXME optimize
1086 "add %4, %%"REG_c
" \n\t"
1087 WRITEBGR24(%%REGc
, %5, %%REGa
)
1089 :: "r" (&c
->redDither
),
1090 "m" (dummy
), "m" (dummy
), "m" (dummy
),
1091 "r" (dest
), "m" (dstW
)
1092 : "%"REG_a
, "%"REG_c
, "%"REG_d
, "%"REG_S
1095 case PIX_FMT_RGB555
:
1098 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1100 "paddusb "BLUE_DITHER
"(%0), %%mm2 \n\t"
1101 "paddusb "GREEN_DITHER
"(%0), %%mm4 \n\t"
1102 "paddusb "RED_DITHER
"(%0), %%mm5 \n\t"
1105 WRITERGB15(%4, %5, %%REGa
)
1106 YSCALEYUV2PACKEDX_END
1108 case PIX_FMT_RGB565
:
1111 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1113 "paddusb "BLUE_DITHER
"(%0), %%mm2 \n\t"
1114 "paddusb "GREEN_DITHER
"(%0), %%mm4 \n\t"
1115 "paddusb "RED_DITHER
"(%0), %%mm5 \n\t"
1118 WRITERGB16(%4, %5, %%REGa
)
1119 YSCALEYUV2PACKEDX_END
1121 case PIX_FMT_YUYV422
:
1123 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1125 "psraw $3, %%mm3 \n\t"
1126 "psraw $3, %%mm4 \n\t"
1127 "psraw $3, %%mm1 \n\t"
1128 "psraw $3, %%mm7 \n\t"
1129 WRITEYUY2(%4, %5, %%REGa
)
1130 YSCALEYUV2PACKEDX_END
1135 #endif /* HAVE_MMX */
1137 /* The following list of supported dstFormat values should
1138 match what's found in the body of altivec_yuv2packedX() */
1139 if (!(c
->flags
& SWS_BITEXACT
) &&
1140 (c
->dstFormat
==PIX_FMT_ABGR
|| c
->dstFormat
==PIX_FMT_BGRA
||
1141 c
->dstFormat
==PIX_FMT_BGR24
|| c
->dstFormat
==PIX_FMT_RGB24
||
1142 c
->dstFormat
==PIX_FMT_RGBA
|| c
->dstFormat
==PIX_FMT_ARGB
))
1143 altivec_yuv2packedX (c
, lumFilter
, lumSrc
, lumFilterSize
,
1144 chrFilter
, chrSrc
, chrFilterSize
,
/* generic C implementation */
1148 yuv2packedXinC(c
, lumFilter
, lumSrc
, lumFilterSize
,
1149 chrFilter
, chrSrc
, chrFilterSize
,
1154 * vertical bilinear scale YV12 to RGB
/* Vertical bilinear blend of two source lines (buf0/buf1 for luma,
 * uvbuf0/uvbuf1 for chroma) into one line of packed output at dest.
 *
 * yalpha1 and uvalpha1 are the complementary weights 4095 - yalpha and
 * 4095 - uvalpha.
 *
 * MMX path (only when SWS_BITEXACT is not set): one asm statement per
 * c->dstFormat. All of them follow the same frame:
 *   1. save REG_b into the slot at ESP_OFFSET(%5);
 *   2. load dest (operand %4) into REG_b;
 *   3. push REG_BP so it can serve as the loop index;
 *   4. run YSCALEYUV2RGB (or YSCALEYUV2PACKED for YUYV422) and the matching
 *      WRITE* macro, with 8280(%5) as the width operand (DSTW_OFFSET, see the
 *      note below);
 *   5. pop REG_BP and restore REG_b from ESP_OFFSET(%5).
 * The buffers are bound to fixed registers: buf0 "c", buf1 "d", uvbuf0 "S",
 * uvbuf1 "D".
 * NOTE(review): operand %5 is not visible in this excerpt; from its use with
 * ESP_OFFSET and the *_DITHER offsets it is presumably a pointer into the
 * SwsContext -- confirm.
 *
 * Fallback: YSCALE_YUV_2_ANYRGB_C with the two-line C macros. */
1156 static inline void RENAME(yuv2packed2
)(SwsContext
*c
, uint16_t *buf0
, uint16_t *buf1
, uint16_t *uvbuf0
, uint16_t *uvbuf1
,
1157 uint8_t *dest
, int dstW
, int yalpha
, int uvalpha
, int y
)
1159 int yalpha1
=4095- yalpha
;
1160 int uvalpha1
=4095-uvalpha
;
1164 if(!(c
->flags
& SWS_BITEXACT
)){
1165 switch(c
->dstFormat
)
1167 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1170 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1171 "mov %4, %%"REG_b
" \n\t"
1172 "push %%"REG_BP
" \n\t"
1173 YSCALEYUV2RGB(%%REGBP
, %5)
1174 WRITEBGR32(%%REGb
, 8280(%5), %%REGBP
)
1175 "pop %%"REG_BP
" \n\t"
1176 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1178 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1184 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1185 "mov %4, %%"REG_b
" \n\t"
1186 "push %%"REG_BP
" \n\t"
1187 YSCALEYUV2RGB(%%REGBP
, %5)
1188 WRITEBGR24(%%REGb
, 8280(%5), %%REGBP
)
1189 "pop %%"REG_BP
" \n\t"
1190 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1191 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1195 case PIX_FMT_RGB555
:
1197 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1198 "mov %4, %%"REG_b
" \n\t"
1199 "push %%"REG_BP
" \n\t"
1200 YSCALEYUV2RGB(%%REGBP
, %5)
1201 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1203 "paddusb "BLUE_DITHER
"(%5), %%mm2 \n\t"
1204 "paddusb "GREEN_DITHER
"(%5), %%mm4 \n\t"
1205 "paddusb "RED_DITHER
"(%5), %%mm5 \n\t"
1208 WRITERGB15(%%REGb
, 8280(%5), %%REGBP
)
1209 "pop %%"REG_BP
" \n\t"
1210 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1212 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1216 case PIX_FMT_RGB565
:
1218 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1219 "mov %4, %%"REG_b
" \n\t"
1220 "push %%"REG_BP
" \n\t"
1221 YSCALEYUV2RGB(%%REGBP
, %5)
1222 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1224 "paddusb "BLUE_DITHER
"(%5), %%mm2 \n\t"
1225 "paddusb "GREEN_DITHER
"(%5), %%mm4 \n\t"
1226 "paddusb "RED_DITHER
"(%5), %%mm5 \n\t"
1229 WRITERGB16(%%REGb
, 8280(%5), %%REGBP
)
1230 "pop %%"REG_BP
" \n\t"
1231 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1232 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1236 case PIX_FMT_YUYV422
:
1238 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1239 "mov %4, %%"REG_b
" \n\t"
1240 "push %%"REG_BP
" \n\t"
1241 YSCALEYUV2PACKED(%%REGBP
, %5)
1242 WRITEYUY2(%%REGb
, 8280(%5), %%REGBP
)
1243 "pop %%"REG_BP
" \n\t"
1244 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1245 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
/* generic C implementation for every destination format */
1253 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C
, YSCALE_YUV_2_PACKED2_C
, YSCALE_YUV_2_GRAY16_2_C
, YSCALE_YUV_2_MONO2_C
)
1257 * YV12 to RGB without scaling or interpolating
/* Packed output from a single luma line (buf0), i.e. without vertical luma
 * interpolation.
 *
 * - yalpha1, buf1 (aliased to buf0) and yalpha are fixed locals that exist so
 *   the shared asm/C macros, which reference those names, still compile (see
 *   the FIXME notes on their declarations).
 * - If SWS_FULL_CHR_H_INT is set in flags, the work is delegated to
 *   yuv2packed2() with buf0 passed for both luma lines and a yalpha of 0.
 * - MMX path (only when SWS_BITEXACT is not set in flags), two groups of asm
 *   statements:
 *     uvalpha < 2048 : YSCALEYUV2RGB1 / YSCALEYUV2PACKED1 variants; per the
 *                      inline note this is faster but shifts chrominance by
 *                      0.5 pixels;
 *     otherwise      : YSCALEYUV2RGB1b / YSCALEYUV2PACKED1b variants.
 *   Every statement saves REG_b at ESP_OFFSET(%5), loads dest (%4) into REG_b,
 *   pushes REG_BP for use as the index, runs the scaler and WRITE* macro with
 *   8280(%5) as the width operand, then restores REG_BP and REG_b.
 * - C fallback: YSCALE_YUV_2_ANYRGB_C with the RGB1 or RGB1B macro sets.
 *
 * NOTE(review): case labels for the 32/24-bit formats, the switch statements
 * and operand %5 are not visible in this excerpt. */
1259 static inline void RENAME(yuv2packed1
)(SwsContext
*c
, uint16_t *buf0
, uint16_t *uvbuf0
, uint16_t *uvbuf1
,
1260 uint8_t *dest
, int dstW
, int uvalpha
, int dstFormat
, int flags
, int y
)
1262 const int yalpha1
=0;
1265 uint16_t *buf1
= buf0
; //FIXME needed for RGB1/BGR1
1266 const int yalpha
= 4096; //FIXME ...
1268 if (flags
&SWS_FULL_CHR_H_INT
)
1270 RENAME(yuv2packed2
)(c
, buf0
, buf0
, uvbuf0
, uvbuf1
, dest
, dstW
, 0, uvalpha
, y
);
1275 if(!(flags
& SWS_BITEXACT
)){
1276 if (uvalpha
< 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1282 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1283 "mov %4, %%"REG_b
" \n\t"
1284 "push %%"REG_BP
" \n\t"
1285 YSCALEYUV2RGB1(%%REGBP
, %5)
1286 WRITEBGR32(%%REGb
, 8280(%5), %%REGBP
)
1287 "pop %%"REG_BP
" \n\t"
1288 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1290 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1296 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1297 "mov %4, %%"REG_b
" \n\t"
1298 "push %%"REG_BP
" \n\t"
1299 YSCALEYUV2RGB1(%%REGBP
, %5)
1300 WRITEBGR24(%%REGb
, 8280(%5), %%REGBP
)
1301 "pop %%"REG_BP
" \n\t"
1302 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1304 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1308 case PIX_FMT_RGB555
:
1310 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1311 "mov %4, %%"REG_b
" \n\t"
1312 "push %%"REG_BP
" \n\t"
1313 YSCALEYUV2RGB1(%%REGBP
, %5)
1314 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1316 "paddusb "BLUE_DITHER
"(%5), %%mm2 \n\t"
1317 "paddusb "GREEN_DITHER
"(%5), %%mm4 \n\t"
1318 "paddusb "RED_DITHER
"(%5), %%mm5 \n\t"
1320 WRITERGB15(%%REGb
, 8280(%5), %%REGBP
)
1321 "pop %%"REG_BP
" \n\t"
1322 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1324 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1328 case PIX_FMT_RGB565
:
1330 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1331 "mov %4, %%"REG_b
" \n\t"
1332 "push %%"REG_BP
" \n\t"
1333 YSCALEYUV2RGB1(%%REGBP
, %5)
1334 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1336 "paddusb "BLUE_DITHER
"(%5), %%mm2 \n\t"
1337 "paddusb "GREEN_DITHER
"(%5), %%mm4 \n\t"
1338 "paddusb "RED_DITHER
"(%5), %%mm5 \n\t"
1341 WRITERGB16(%%REGb
, 8280(%5), %%REGBP
)
1342 "pop %%"REG_BP
" \n\t"
1343 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1345 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1349 case PIX_FMT_YUYV422
:
1351 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1352 "mov %4, %%"REG_b
" \n\t"
1353 "push %%"REG_BP
" \n\t"
1354 YSCALEYUV2PACKED1(%%REGBP
, %5)
1355 WRITEYUY2(%%REGb
, 8280(%5), %%REGBP
)
1356 "pop %%"REG_BP
" \n\t"
1357 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1359 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1371 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1372 "mov %4, %%"REG_b
" \n\t"
1373 "push %%"REG_BP
" \n\t"
1374 YSCALEYUV2RGB1b(%%REGBP
, %5)
1375 WRITEBGR32(%%REGb
, 8280(%5), %%REGBP
)
1376 "pop %%"REG_BP
" \n\t"
1377 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1379 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1385 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1386 "mov %4, %%"REG_b
" \n\t"
1387 "push %%"REG_BP
" \n\t"
1388 YSCALEYUV2RGB1b(%%REGBP
, %5)
1389 WRITEBGR24(%%REGb
, 8280(%5), %%REGBP
)
1390 "pop %%"REG_BP
" \n\t"
1391 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1393 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1397 case PIX_FMT_RGB555
:
1399 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1400 "mov %4, %%"REG_b
" \n\t"
1401 "push %%"REG_BP
" \n\t"
1402 YSCALEYUV2RGB1b(%%REGBP
, %5)
1403 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1405 "paddusb "BLUE_DITHER
"(%5), %%mm2 \n\t"
1406 "paddusb "GREEN_DITHER
"(%5), %%mm4 \n\t"
1407 "paddusb "RED_DITHER
"(%5), %%mm5 \n\t"
1409 WRITERGB15(%%REGb
, 8280(%5), %%REGBP
)
1410 "pop %%"REG_BP
" \n\t"
1411 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1413 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1417 case PIX_FMT_RGB565
:
1419 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1420 "mov %4, %%"REG_b
" \n\t"
1421 "push %%"REG_BP
" \n\t"
1422 YSCALEYUV2RGB1b(%%REGBP
, %5)
1423 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1425 "paddusb "BLUE_DITHER
"(%5), %%mm2 \n\t"
1426 "paddusb "GREEN_DITHER
"(%5), %%mm4 \n\t"
1427 "paddusb "RED_DITHER
"(%5), %%mm5 \n\t"
1430 WRITERGB16(%%REGb
, 8280(%5), %%REGBP
)
1431 "pop %%"REG_BP
" \n\t"
1432 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1434 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1438 case PIX_FMT_YUYV422
:
1440 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1441 "mov %4, %%"REG_b
" \n\t"
1442 "push %%"REG_BP
" \n\t"
1443 YSCALEYUV2PACKED1b(%%REGBP
, %5)
1444 WRITEYUY2(%%REGb
, 8280(%5), %%REGBP
)
1445 "pop %%"REG_BP
" \n\t"
1446 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1448 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1455 #endif /* HAVE_MMX */
/* generic C implementations: first the RGB1 macro set, then the RGB1B set */
1458 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C
, YSCALE_YUV_2_PACKED1_C
, YSCALE_YUV_2_GRAY16_1_C
, YSCALE_YUV_2_MONO2_C
)
1460 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C
, YSCALE_YUV_2_PACKED1B_C
, YSCALE_YUV_2_GRAY16_1_C
, YSCALE_YUV_2_MONO2_C
)
1464 //FIXME yuy2* can read up to 7 samples too many
/* Extracts `width` output bytes from a source with 2 bytes per output sample.
 *
 * MMX path: the index in REG_a starts at -width and is advanced by 8 per
 * iteration; src and dst are passed biased by +2*width and +width so that
 * (base, index) addressing reaches the start of the buffers. Each iteration
 * loads 16 source bytes, ANDs every 16-bit word with the bm01010101 mask,
 * packs the words to bytes and stores 8 bytes.
 * NOTE(review): bm01010101 is defined elsewhere; presumably it keeps the low
 * byte of each word (the even source bytes) -- confirm.
 * The loop always handles 8 samples at a time, which matches the FIXME about
 * over-reading just above this function.
 *
 * The C fallback loop iterates i over [0, width); its body is not visible in
 * this excerpt. The last parameter is unused. */
1466 static inline void RENAME(yuy2ToY
)(uint8_t *dst
, uint8_t *src
, long width
, uint32_t *unused
)
1470 "movq "MANGLE(bm01010101
)", %%mm2 \n\t"
1471 "mov %0, %%"REG_a
" \n\t"
1473 "movq (%1, %%"REG_a
",2), %%mm0 \n\t"
1474 "movq 8(%1, %%"REG_a
",2), %%mm1 \n\t"
1475 "pand %%mm2, %%mm0 \n\t"
1476 "pand %%mm2, %%mm1 \n\t"
1477 "packuswb %%mm1, %%mm0 \n\t"
1478 "movq %%mm0, (%2, %%"REG_a
") \n\t"
1479 "add $8, %%"REG_a
" \n\t"
1481 : : "g" (-width
), "r" (src
+width
*2), "r" (dst
+width
)
1486 for (i
=0; i
<width
; i
++)
/* Splits the chroma bytes of a 4-bytes-per-output-sample source into dstU and
 * dstV.
 *
 * C reference (visible below): dstU[i] = src1[4*i + 1], dstV[i] = src1[4*i + 3].
 *
 * MMX path: the index in REG_a runs from -width in steps of 4; src1, dstU and
 * dstV are passed biased by +4*width, +width and +width. Each iteration loads
 * 16 source bytes, shifts every word right by 8 (keeping the odd bytes) and
 * packs them; the packed bytes are then separated again: the words shifted
 * right by 8 are stored to operand %3 (dstV) and the words ANDed with the
 * bm01010101 mask are stored to operand %2 (dstU), 4 bytes each.
 *
 * src2 is only checked by assert(src1 == src2); the last parameter is unused. */
1491 static inline void RENAME(yuy2ToUV
)(uint8_t *dstU
, uint8_t *dstV
, uint8_t *src1
, uint8_t *src2
, long width
, uint32_t *unused
)
1495 "movq "MANGLE(bm01010101
)", %%mm4 \n\t"
1496 "mov %0, %%"REG_a
" \n\t"
1498 "movq (%1, %%"REG_a
",4), %%mm0 \n\t"
1499 "movq 8(%1, %%"REG_a
",4), %%mm1 \n\t"
1500 "psrlw $8, %%mm0 \n\t"
1501 "psrlw $8, %%mm1 \n\t"
1502 "packuswb %%mm1, %%mm0 \n\t"
1503 "movq %%mm0, %%mm1 \n\t"
1504 "psrlw $8, %%mm0 \n\t"
1505 "pand %%mm4, %%mm1 \n\t"
1506 "packuswb %%mm0, %%mm0 \n\t"
1507 "packuswb %%mm1, %%mm1 \n\t"
1508 "movd %%mm0, (%3, %%"REG_a
") \n\t"
1509 "movd %%mm1, (%2, %%"REG_a
") \n\t"
1510 "add $4, %%"REG_a
" \n\t"
1512 : : "g" (-width
), "r" (src1
+width
*4), "r" (dstU
+width
), "r" (dstV
+width
)
/* C fallback */
1517 for (i
=0; i
<width
; i
++)
1519 dstU
[i
]= src1
[4*i
+ 1];
1520 dstV
[i
]= src1
[4*i
+ 3];
1523 assert(src1
== src2
);
1526 /* This is almost identical to the previous, and exists only because
1527 * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
/* Extracts `width` output bytes from a source with 2 bytes per output sample,
 * taking the HIGH byte of each 16-bit word (psrlw $8), i.e. the odd source
 * bytes.
 *
 * MMX path: REG_a runs from -width in steps of 8; src and dst are passed
 * biased by +2*width and +width. Each iteration loads 16 source bytes, shifts
 * the words right by 8, packs them to bytes and stores 8 bytes.
 *
 * The C fallback loop iterates i over [0, width); its body is not visible in
 * this excerpt. The last parameter is unused. */
1528 static inline void RENAME(uyvyToY
)(uint8_t *dst
, uint8_t *src
, long width
, uint32_t *unused
)
1532 "mov %0, %%"REG_a
" \n\t"
1534 "movq (%1, %%"REG_a
",2), %%mm0 \n\t"
1535 "movq 8(%1, %%"REG_a
",2), %%mm1 \n\t"
1536 "psrlw $8, %%mm0 \n\t"
1537 "psrlw $8, %%mm1 \n\t"
1538 "packuswb %%mm1, %%mm0 \n\t"
1539 "movq %%mm0, (%2, %%"REG_a
") \n\t"
1540 "add $8, %%"REG_a
" \n\t"
1542 : : "g" (-width
), "r" (src
+width
*2), "r" (dst
+width
)
1547 for (i
=0; i
<width
; i
++)
/* Splits the chroma bytes of a 4-bytes-per-output-sample source into dstU and
 * dstV.
 *
 * C reference (visible below): dstU[i] = src1[4*i + 0], dstV[i] = src1[4*i + 2].
 *
 * MMX path: REG_a runs from -width in steps of 4; src1, dstU and dstV are
 * passed biased by +4*width, +width and +width. Each iteration loads 16 source
 * bytes, ANDs every word with the bm01010101 mask and packs the words to
 * bytes; the packed bytes are then separated: words shifted right by 8 go to
 * operand %3 (dstV), words ANDed with the mask go to operand %2 (dstU),
 * 4 bytes each.
 *
 * src2 is only checked by assert(src1 == src2); the last parameter is unused. */
1552 static inline void RENAME(uyvyToUV
)(uint8_t *dstU
, uint8_t *dstV
, uint8_t *src1
, uint8_t *src2
, long width
, uint32_t *unused
)
1556 "movq "MANGLE(bm01010101
)", %%mm4 \n\t"
1557 "mov %0, %%"REG_a
" \n\t"
1559 "movq (%1, %%"REG_a
",4), %%mm0 \n\t"
1560 "movq 8(%1, %%"REG_a
",4), %%mm1 \n\t"
1561 "pand %%mm4, %%mm0 \n\t"
1562 "pand %%mm4, %%mm1 \n\t"
1563 "packuswb %%mm1, %%mm0 \n\t"
1564 "movq %%mm0, %%mm1 \n\t"
1565 "psrlw $8, %%mm0 \n\t"
1566 "pand %%mm4, %%mm1 \n\t"
1567 "packuswb %%mm0, %%mm0 \n\t"
1568 "packuswb %%mm1, %%mm1 \n\t"
1569 "movd %%mm0, (%3, %%"REG_a
") \n\t"
1570 "movd %%mm1, (%2, %%"REG_a
") \n\t"
1571 "add $4, %%"REG_a
" \n\t"
1573 : : "g" (-width
), "r" (src1
+width
*4), "r" (dstU
+width
), "r" (dstV
+width
)
/* C fallback */
1578 for (i
=0; i
<width
; i
++)
1580 dstU
[i
]= src1
[4*i
+ 0];
1581 dstV
[i
]= src1
[4*i
+ 2];
1584 assert(src1
== src2
);
/* BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S)
 *
 * Generates RENAME(name)(dst, src, width, unused): a C routine converting
 * `width` packed pixels of integer type `type` to 8-bit luma.
 *
 * For each pixel the components are isolated with (pixel >> sh?) & mask?
 * (shift first, then mask) and are NOT normalised to a common bit position;
 * instead the coefficient arguments are pre-shifted to compensate and the
 * final right shift S is enlarged accordingly.
 * The rounding term 33 << (S-1) equals 16.5 * 2^S, i.e. an offset of 16 plus
 * one half for rounding once shifted right by S.
 *
 * The instantiations that follow the macro create bgr32ToY, rgb32ToY,
 * bgr16ToY, bgr15ToY, rgb16ToY and rgb15ToY. */
1587 #define BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S)\
1588 static inline void RENAME(name)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)\
1591 for (i=0; i<width; i++)\
1593 int b= (((type*)src)[i]>>shb)&maskb;\
1594 int g= (((type*)src)[i]>>shg)&maskg;\
1595 int r= (((type*)src)[i]>>shr)&maskr;\
1597 dst[i]= (((RY)*r + (GY)*g + (BY)*b + (33<<((S)-1)))>>(S));\
1601 BGR2Y(uint32_t, bgr32ToY
,16, 0, 0, 0x00FF, 0xFF00, 0x00FF, RY
<< 8, GY
, BY
<< 8, RGB2YUV_SHIFT
+8)
1602 BGR2Y(uint32_t, rgb32ToY
, 0, 0,16, 0x00FF, 0xFF00, 0x00FF, RY
<< 8, GY
, BY
<< 8, RGB2YUV_SHIFT
+8)
1603 BGR2Y(uint16_t, bgr16ToY
, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RY
<<11, GY
<<5, BY
, RGB2YUV_SHIFT
+8)
1604 BGR2Y(uint16_t, bgr15ToY
, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RY
<<10, GY
<<5, BY
, RGB2YUV_SHIFT
+7)
1605 BGR2Y(uint16_t, rgb16ToY
, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RY
, GY
<<5, BY
<<11, RGB2YUV_SHIFT
+8)
1606 BGR2Y(uint16_t, rgb15ToY
, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RY
, GY
<<5, BY
<<10, RGB2YUV_SHIFT
+7)
/* BGR2UV(type, name, shr, shg, shb, maskr, maskg, maskb,
 *        RU, GU, BU, RV, GV, BV, S)
 *
 * Generates two C routines for packed pixels of integer type `type`:
 *
 *  RENAME(name)       - one chroma pair per source pixel. Components are
 *                       isolated with (pixel & mask?) >> sh? (mask first, then
 *                       shift). The rounding term 257 << (S-1) equals
 *                       128.5 * 2^S, i.e. an offset of 128 plus one half.
 *  RENAME(name_half)  - one chroma pair per TWO adjacent source pixels
 *                       (pix0 = src[2*i], pix1 = src[2*i+1]). g is the sum of
 *                       both masked green fields; b and r are taken from
 *                       (pix0 + pix1 - g) using masks widened by one bit
 *                       (mask | 2*mask) to hold the carry of the sum. The
 *                       result uses 257 << S and a shift of S+1, which divides
 *                       the two-pixel sum by two.
 *                       NOTE(review): any further adjustment of g between its
 *                       computation and its use is not visible in this excerpt.
 *
 * The `dummy` and `unused` parameters are not referenced by the visible code.
 *
 * The instantiations that follow create bgr32ToUV, rgb32ToUV, bgr16ToUV,
 * bgr15ToUV, rgb16ToUV, rgb15ToUV and their _half counterparts. */
1608 #define BGR2UV(type, name, shr, shg, shb, maskr, maskg, maskb, RU, GU, BU, RV, GV, BV, S)\
1609 static inline void RENAME(name)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\
1612 for (i=0; i<width; i++)\
1614 int b= (((type*)src)[i]&maskb)>>shb;\
1615 int g= (((type*)src)[i]&maskg)>>shg;\
1616 int r= (((type*)src)[i]&maskr)>>shr;\
1618 dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<((S)-1)))>>(S);\
1619 dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<((S)-1)))>>(S);\
1622 static inline void RENAME(name ## _half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\
1625 for (i=0; i<width; i++)\
1627 int pix0= ((type*)src)[2*i+0];\
1628 int pix1= ((type*)src)[2*i+1];\
1629 int g= (pix0&maskg)+(pix1&maskg);\
1630 int b= ((pix0+pix1-g)&(maskb|(2*maskb)))>>shb;\
1631 int r= ((pix0+pix1-g)&(maskr|(2*maskr)))>>shr;\
1635 dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<(S)))>>((S)+1);\
1636 dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<(S)))>>((S)+1);\
1640 BGR2UV(uint32_t, bgr32ToUV
,16, 0, 0, 0xFF0000, 0xFF00, 0x00FF, RU
<< 8, GU
, BU
<< 8, RV
<< 8, GV
, BV
<< 8, RGB2YUV_SHIFT
+8)
1641 BGR2UV(uint32_t, rgb32ToUV
, 0, 0,16, 0x00FF, 0xFF00, 0xFF0000, RU
<< 8, GU
, BU
<< 8, RV
<< 8, GV
, BV
<< 8, RGB2YUV_SHIFT
+8)
1642 BGR2UV(uint16_t, bgr16ToUV
, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RU
<<11, GU
<<5, BU
, RV
<<11, GV
<<5, BV
, RGB2YUV_SHIFT
+8)
1643 BGR2UV(uint16_t, bgr15ToUV
, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RU
<<10, GU
<<5, BU
, RV
<<10, GV
<<5, BV
, RGB2YUV_SHIFT
+7)
1644 BGR2UV(uint16_t, rgb16ToUV
, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RU
, GU
<<5, BU
<<11, RV
, GV
<<5, BV
<<11, RGB2YUV_SHIFT
+8)
1645 BGR2UV(uint16_t, rgb15ToUV
, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RU
, GU
<<5, BU
<<10, RV
, GV
<<5, BV
<<10, RGB2YUV_SHIFT
+7)
/* MMX luma conversion for 24-bit packed RGB/BGR input.
 *
 * srcFormat selects the coefficient pair loaded into mm5/mm6: the
 * ff_bgr24toY{1,2}Coeff tables when srcFormat == PIX_FMT_BGR24, otherwise the
 * ff_rgb24toY{1,2}Coeff tables.
 *
 * Main loop: REG_a starts at -width (operand %2) and advances by 4; dst is
 * passed biased by +width (operand %1). Per iteration four 4-byte loads at
 * source offsets 0, 2, 6 and 8 are widened to words, multiplied and pair-wise
 * added with pmaddwd, summed, offset by ff_bgr24toYOffset, shifted right by 15
 * and packed, and 4 output bytes are stored.
 * NOTE(review): operand %0 is presumably the running source pointer; the
 * instruction that advances it and the loop branch are not visible here. */
1648 static inline void RENAME(bgr24ToY_mmx
)(uint8_t *dst
, uint8_t *src
, long width
, int srcFormat
)
1651 if(srcFormat
== PIX_FMT_BGR24
){
1653 "movq "MANGLE(ff_bgr24toY1Coeff
)", %%mm5 \n\t"
1654 "movq "MANGLE(ff_bgr24toY2Coeff
)", %%mm6 \n\t"
1659 "movq "MANGLE(ff_rgb24toY1Coeff
)", %%mm5 \n\t"
1660 "movq "MANGLE(ff_rgb24toY2Coeff
)", %%mm6 \n\t"
1666 "movq "MANGLE(ff_bgr24toYOffset
)", %%mm4 \n\t"
1667 "mov %2, %%"REG_a
" \n\t"
1668 "pxor %%mm7, %%mm7 \n\t"
1670 PREFETCH
" 64(%0) \n\t"
1671 "movd (%0), %%mm0 \n\t"
1672 "movd 2(%0), %%mm1 \n\t"
1673 "movd 6(%0), %%mm2 \n\t"
1674 "movd 8(%0), %%mm3 \n\t"
1676 "punpcklbw %%mm7, %%mm0 \n\t"
1677 "punpcklbw %%mm7, %%mm1 \n\t"
1678 "punpcklbw %%mm7, %%mm2 \n\t"
1679 "punpcklbw %%mm7, %%mm3 \n\t"
1680 "pmaddwd %%mm5, %%mm0 \n\t"
1681 "pmaddwd %%mm6, %%mm1 \n\t"
1682 "pmaddwd %%mm5, %%mm2 \n\t"
1683 "pmaddwd %%mm6, %%mm3 \n\t"
1684 "paddd %%mm1, %%mm0 \n\t"
1685 "paddd %%mm3, %%mm2 \n\t"
1686 "paddd %%mm4, %%mm0 \n\t"
1687 "paddd %%mm4, %%mm2 \n\t"
1688 "psrad $15, %%mm0 \n\t"
1689 "psrad $15, %%mm2 \n\t"
1690 "packssdw %%mm2, %%mm0 \n\t"
1691 "packuswb %%mm0, %%mm0 \n\t"
1692 "movd %%mm0, (%1, %%"REG_a
") \n\t"
1693 "add $4, %%"REG_a
" \n\t"
1696 : "r" (dst
+width
), "g" (-width
)
/* MMX chroma conversion for 24-bit packed RGB/BGR input.
 *
 * Operand %4 is the coefficient table row
 * ff_bgr24toUV[srcFormat == PIX_FMT_RGB24], addressed as %4, 8+%4, 16+%4 and
 * 24+%4 (the last one is kept in mm6). Operands %1 and %2 are dstU and dstV
 * biased by +width; operand %3 is -width, copied to REG_a as the index.
 *
 * Per iteration (index += 4): source bytes at offsets 0/2 and then 6/8 are
 * widened to words; each group is multiplied with the first two coefficient
 * quadwords for one output and with the last two for the other, summed,
 * offset by ff_bgr24toUVOffset, shifted right by 15 and packed. 4 bytes are
 * stored to (%1, index) and 4 bytes to (%2, index).
 * NOTE(review): operand %0 is presumably the running source pointer; the
 * instruction that advances it and the loop branch are not visible here. */
1701 static inline void RENAME(bgr24ToUV_mmx
)(uint8_t *dstU
, uint8_t *dstV
, uint8_t *src
, long width
, int srcFormat
)
1704 "movq 24+%4, %%mm6 \n\t"
1705 "mov %3, %%"REG_a
" \n\t"
1706 "pxor %%mm7, %%mm7 \n\t"
1708 PREFETCH
" 64(%0) \n\t"
1709 "movd (%0), %%mm0 \n\t"
1710 "movd 2(%0), %%mm1 \n\t"
1711 "punpcklbw %%mm7, %%mm0 \n\t"
1712 "punpcklbw %%mm7, %%mm1 \n\t"
1713 "movq %%mm0, %%mm2 \n\t"
1714 "movq %%mm1, %%mm3 \n\t"
1715 "pmaddwd %4, %%mm0 \n\t"
1716 "pmaddwd 8+%4, %%mm1 \n\t"
1717 "pmaddwd 16+%4, %%mm2 \n\t"
1718 "pmaddwd %%mm6, %%mm3 \n\t"
1719 "paddd %%mm1, %%mm0 \n\t"
1720 "paddd %%mm3, %%mm2 \n\t"
1722 "movd 6(%0), %%mm1 \n\t"
1723 "movd 8(%0), %%mm3 \n\t"
1725 "punpcklbw %%mm7, %%mm1 \n\t"
1726 "punpcklbw %%mm7, %%mm3 \n\t"
1727 "movq %%mm1, %%mm4 \n\t"
1728 "movq %%mm3, %%mm5 \n\t"
1729 "pmaddwd %4, %%mm1 \n\t"
1730 "pmaddwd 8+%4, %%mm3 \n\t"
1731 "pmaddwd 16+%4, %%mm4 \n\t"
1732 "pmaddwd %%mm6, %%mm5 \n\t"
1733 "paddd %%mm3, %%mm1 \n\t"
1734 "paddd %%mm5, %%mm4 \n\t"
1736 "movq "MANGLE(ff_bgr24toUVOffset
)", %%mm3 \n\t"
1737 "paddd %%mm3, %%mm0 \n\t"
1738 "paddd %%mm3, %%mm2 \n\t"
1739 "paddd %%mm3, %%mm1 \n\t"
1740 "paddd %%mm3, %%mm4 \n\t"
1741 "psrad $15, %%mm0 \n\t"
1742 "psrad $15, %%mm2 \n\t"
1743 "psrad $15, %%mm1 \n\t"
1744 "psrad $15, %%mm4 \n\t"
1745 "packssdw %%mm1, %%mm0 \n\t"
1746 "packssdw %%mm4, %%mm2 \n\t"
1747 "packuswb %%mm0, %%mm0 \n\t"
1748 "packuswb %%mm2, %%mm2 \n\t"
1749 "movd %%mm0, (%1, %%"REG_a
") \n\t"
1750 "movd %%mm2, (%2, %%"REG_a
") \n\t"
1751 "add $4, %%"REG_a
" \n\t"
1754 : "r" (dstU
+width
), "r" (dstV
+width
), "g" (-width
), "m"(ff_bgr24toUV
[srcFormat
== PIX_FMT_RGB24
][0])
/* Converts `width` 24-bit BGR pixels to 8-bit luma.
 *
 * MMX build: delegates to bgr24ToY_mmx() with PIX_FMT_BGR24.
 * C build: dst[i] = (RY*r + GY*g + BY*b + (33 << (RGB2YUV_SHIFT-1)))
 *                   >> RGB2YUV_SHIFT; the rounding term is 16.5 in output
 * units (offset 16 plus one half). The loads of b, g and r are not visible in
 * this excerpt. The last parameter is unused. */
1760 static inline void RENAME(bgr24ToY
)(uint8_t *dst
, uint8_t *src
, long width
, uint32_t *unused
)
1763 RENAME(bgr24ToY_mmx
)(dst
, src
, width
, PIX_FMT_BGR24
);
/* C fallback */
1766 for (i
=0; i
<width
; i
++)
1772 dst
[i
]= ((RY
*r
+ GY
*g
+ BY
*b
+ (33<<(RGB2YUV_SHIFT
-1)))>>RGB2YUV_SHIFT
);
1774 #endif /* HAVE_MMX */
/* Converts `width` 24-bit BGR pixels to 8-bit U and V, one chroma pair per
 * source pixel.
 *
 * MMX build: delegates to bgr24ToUV_mmx() with PIX_FMT_BGR24.
 * C build: bytes 0, 1, 2 of each pixel are b, g, r; each output is the
 * weighted sum plus 257 << (RGB2YUV_SHIFT-1) (128.5 in output units),
 * shifted right by RGB2YUV_SHIFT.
 *
 * src2 is only checked by assert(src1 == src2); the last parameter is unused. */
1777 static inline void RENAME(bgr24ToUV
)(uint8_t *dstU
, uint8_t *dstV
, uint8_t *src1
, uint8_t *src2
, long width
, uint32_t *unused
)
1780 RENAME(bgr24ToUV_mmx
)(dstU
, dstV
, src1
, width
, PIX_FMT_BGR24
);
/* C fallback */
1783 for (i
=0; i
<width
; i
++)
1785 int b
= src1
[3*i
+ 0];
1786 int g
= src1
[3*i
+ 1];
1787 int r
= src1
[3*i
+ 2];
1789 dstU
[i
]= (RU
*r
+ GU
*g
+ BU
*b
+ (257<<(RGB2YUV_SHIFT
-1)))>>RGB2YUV_SHIFT
;
1790 dstV
[i
]= (RV
*r
+ GV
*g
+ BV
*b
+ (257<<(RGB2YUV_SHIFT
-1)))>>RGB2YUV_SHIFT
;
1792 #endif /* HAVE_MMX */
1793 assert(src1
== src2
);
/* Converts 24-bit BGR pixels to 8-bit U and V with 2:1 horizontal
 * subsampling: each of the `width` outputs is computed from two adjacent
 * source pixels (6 source bytes).
 *
 * b, g and r are the SUMS of the corresponding bytes of both pixels; the
 * averaging is folded into the final shift (RGB2YUV_SHIFT + 1), and the
 * rounding/offset term is doubled accordingly (257 << RGB2YUV_SHIFT).
 *
 * src2 is only checked by assert(src1 == src2); the last parameter is unused. */
1796 static inline void RENAME(bgr24ToUV_half
)(uint8_t *dstU
, uint8_t *dstV
, uint8_t *src1
, uint8_t *src2
, long width
, uint32_t *unused
)
1799 for (i
=0; i
<width
; i
++)
1801 int b
= src1
[6*i
+ 0] + src1
[6*i
+ 3];
1802 int g
= src1
[6*i
+ 1] + src1
[6*i
+ 4];
1803 int r
= src1
[6*i
+ 2] + src1
[6*i
+ 5];
1805 dstU
[i
]= (RU
*r
+ GU
*g
+ BU
*b
+ (257<<RGB2YUV_SHIFT
))>>(RGB2YUV_SHIFT
+1);
1806 dstV
[i
]= (RV
*r
+ GV
*g
+ BV
*b
+ (257<<RGB2YUV_SHIFT
))>>(RGB2YUV_SHIFT
+1);
1808 assert(src1
== src2
);
/* Converts `width` 24-bit RGB pixels to 8-bit luma.
 *
 * MMX build: delegates to bgr24ToY_mmx() with PIX_FMT_RGB24, which makes that
 * routine load the rgb24 coefficient tables.
 * C build: dst[i] = (RY*r + GY*g + BY*b + (33 << (RGB2YUV_SHIFT-1)))
 *                   >> RGB2YUV_SHIFT. The loads of r, g and b are not visible
 * in this excerpt. The last parameter is unused. */
1811 static inline void RENAME(rgb24ToY
)(uint8_t *dst
, uint8_t *src
, long width
, uint32_t *unused
)
1814 RENAME(bgr24ToY_mmx
)(dst
, src
, width
, PIX_FMT_RGB24
);
/* C fallback */
1817 for (i
=0; i
<width
; i
++)
1823 dst
[i
]= ((RY
*r
+ GY
*g
+ BY
*b
+ (33<<(RGB2YUV_SHIFT
-1)))>>RGB2YUV_SHIFT
);
/* Converts `width` 24-bit RGB pixels to 8-bit U and V, one chroma pair per
 * source pixel.
 *
 * MMX build: delegates to bgr24ToUV_mmx() with PIX_FMT_RGB24.
 * C build: bytes 0, 1, 2 of each pixel are r, g, b (the reverse of
 * bgr24ToUV); the arithmetic is otherwise identical: weighted sum plus
 * 257 << (RGB2YUV_SHIFT-1), shifted right by RGB2YUV_SHIFT.
 *
 * src2 and the last parameter are not referenced by the visible code. */
1828 static inline void RENAME(rgb24ToUV
)(uint8_t *dstU
, uint8_t *dstV
, uint8_t *src1
, uint8_t *src2
, long width
, uint32_t *unused
)
1832 RENAME(bgr24ToUV_mmx
)(dstU
, dstV
, src1
, width
, PIX_FMT_RGB24
);
/* C fallback */
1836 for (i
=0; i
<width
; i
++)
1838 int r
= src1
[3*i
+ 0];
1839 int g
= src1
[3*i
+ 1];
1840 int b
= src1
[3*i
+ 2];
1842 dstU
[i
]= (RU
*r
+ GU
*g
+ BU
*b
+ (257<<(RGB2YUV_SHIFT
-1)))>>RGB2YUV_SHIFT
;
1843 dstV
[i
]= (RV
*r
+ GV
*g
+ BV
*b
+ (257<<(RGB2YUV_SHIFT
-1)))>>RGB2YUV_SHIFT
;
/* Converts 24-bit RGB pixels to 8-bit U and V with 2:1 horizontal
 * subsampling: each of the `width` outputs is computed from two adjacent
 * source pixels (6 source bytes).
 *
 * r, g and b are the SUMS of the corresponding bytes of both pixels; the
 * division by two is folded into the final shift (RGB2YUV_SHIFT + 1) and the
 * rounding/offset term is 257 << RGB2YUV_SHIFT.
 *
 * src2 and the last parameter are not referenced by the visible code. */
1848 static inline void RENAME(rgb24ToUV_half
)(uint8_t *dstU
, uint8_t *dstV
, uint8_t *src1
, uint8_t *src2
, long width
, uint32_t *unused
)
1852 for (i
=0; i
<width
; i
++)
1854 int r
= src1
[6*i
+ 0] + src1
[6*i
+ 3];
1855 int g
= src1
[6*i
+ 1] + src1
[6*i
+ 4];
1856 int b
= src1
[6*i
+ 2] + src1
[6*i
+ 5];
1858 dstU
[i
]= (RU
*r
+ GU
*g
+ BU
*b
+ (257<<RGB2YUV_SHIFT
))>>(RGB2YUV_SHIFT
+1);
1859 dstV
[i
]= (RV
*r
+ GV
*g
+ BV
*b
+ (257<<RGB2YUV_SHIFT
))>>(RGB2YUV_SHIFT
+1);
/* Palette lookup for luma: for each of the `width` samples, writes the low
 * 8 bits of the palette entry pal[d] to dst[i].
 * NOTE(review): the definition of d is not visible in this excerpt;
 * presumably it is the source byte src[i] -- confirm. */
1864 static inline void RENAME(palToY
)(uint8_t *dst
, uint8_t *src
, long width
, uint32_t *pal
)
1867 for (i
=0; i
<width
; i
++)
1871 dst
[i
]= pal
[d
] & 0xFF;
/* Palette lookup for chroma: for each of the `width` samples, fetches the
 * 32-bit palette entry p = pal[src1[i]].
 * NOTE(review): the statements that extract U and V from p and store them to
 * dstU/dstV are not visible in this excerpt.
 * src2 is only checked by assert(src1 == src2). */
1875 static inline void RENAME(palToUV
)(uint8_t *dstU
, uint8_t *dstV
, uint8_t *src1
, uint8_t *src2
, long width
, uint32_t *pal
)
1878 assert(src1
== src2
);
1879 for (i
=0; i
<width
; i
++)
1881 int p
= pal
[src1
[i
]];
/* Expands 1-bit-per-pixel input to 8-bit luma: for each group of 8 pixels,
 * bit (7-j) of d becomes dst[8*i + j] as either 0 or 255, so the most
 * significant bit is the left-most pixel.
 * NOTE(review): how d is derived from src[i] and the inner loop over j are
 * not visible in this excerpt.
 * NOTE(review): the visible outer loop runs width/8 times, so when width is
 * not a multiple of 8 the last width%8 pixels are not produced by it --
 * confirm whether callers guarantee the alignment.
 * The last parameter is unused. */
1888 static inline void RENAME(monowhite2Y
)(uint8_t *dst
, uint8_t *src
, long width
, uint32_t *unused
)
1891 for (i
=0; i
<width
/8; i
++){
1894 dst
[8*i
+j
]= ((d
>>(7-j
))&1)*255;
/* Expands 1-bit-per-pixel input to 8-bit luma: for each group of 8 pixels,
 * bit (7-j) of d becomes dst[8*i + j] as either 0 or 255, so the most
 * significant bit is the left-most pixel.
 * NOTE(review): how d is derived from src[i] (which is what distinguishes
 * this routine from monowhite2Y) and the inner loop over j are not visible in
 * this excerpt.
 * NOTE(review): the visible outer loop runs width/8 times, so when width is
 * not a multiple of 8 the last width%8 pixels are not produced by it --
 * confirm whether callers guarantee the alignment.
 * The last parameter is unused. */
1898 static inline void RENAME(monoblack2Y
)(uint8_t *dst
, uint8_t *src
, long width
, uint32_t *unused
)
1901 for (i
=0; i
<width
/8; i
++){
1904 dst
[8*i
+j
]= ((d
>>(7-j
))&1)*255;
1908 // bilinear / bicubic scaling
/* Horizontal FIR scaler: for every output sample i it accumulates
 * src[filterPos[i] + j] * filter[filterSize*i + j] over j in [0, filterSize),
 * shifts the sum right by 7 and stores it as a 16-bit value in dst[i]
 * (see the C fallback at the end for the reference arithmetic, including the
 * clamp to (1<<15)-1).
 *
 * MMX paths (filterSize must be a positive multiple of 4, enforced by the
 * assert):
 *  - filterSize == 4 and filterSize == 8: hand-unrolled loops that produce
 *    two outputs per iteration. counter is -2*dstW and filterPos is advanced
 *    by dstW (filterPos -= counter/2) so the loop index can run upwards from a
 *    negative value. REG_b and REG_BP are saved with push/pop around the loop
 *    because REG_BP is used as the index and REG_a/REG_b hold the two source
 *    positions. Operands: filter "c", filterPos "d", src "S", dst "D".
 *  - any other size: a generic loop with an inner loop over the taps; offset
 *    is src + filterSize and is the bound the inner tap pointer is compared
 *    against, and filterSize*2 (operand %6) is the distance between the
 *    coefficient rows of the two outputs computed together.
 *
 * Other paths: hScale_altivec_real() on AltiVec builds, and the plain C loop.
 *
 * srcW and xInc are only forwarded to the AltiVec implementation in the
 * visible code.
 * NOTE(review): loop labels, branches and the asm statement openers are not
 * visible in this excerpt. */
1909 static inline void RENAME(hScale
)(int16_t *dst
, int dstW
, uint8_t *src
, int srcW
, int xInc
,
1910 int16_t *filter
, int16_t *filterPos
, long filterSize
)
1913 assert(filterSize
% 4 == 0 && filterSize
>0);
1914 if (filterSize
==4) // Always true for upscaling, sometimes for down, too.
1916 long counter
= -2*dstW
;
1918 filterPos
-= counter
/2;
1922 "push %%"REG_b
" \n\t"
1924 "pxor %%mm7, %%mm7 \n\t"
1925 "push %%"REG_BP
" \n\t" // we use 7 regs here ...
1926 "mov %%"REG_a
", %%"REG_BP
" \n\t"
1929 "movzwl (%2, %%"REG_BP
"), %%eax \n\t"
1930 "movzwl 2(%2, %%"REG_BP
"), %%ebx \n\t"
1931 "movq (%1, %%"REG_BP
", 4), %%mm1 \n\t"
1932 "movq 8(%1, %%"REG_BP
", 4), %%mm3 \n\t"
1933 "movd (%3, %%"REG_a
"), %%mm0 \n\t"
1934 "movd (%3, %%"REG_b
"), %%mm2 \n\t"
1935 "punpcklbw %%mm7, %%mm0 \n\t"
1936 "punpcklbw %%mm7, %%mm2 \n\t"
1937 "pmaddwd %%mm1, %%mm0 \n\t"
1938 "pmaddwd %%mm2, %%mm3 \n\t"
1939 "movq %%mm0, %%mm4 \n\t"
1940 "punpckldq %%mm3, %%mm0 \n\t"
1941 "punpckhdq %%mm3, %%mm4 \n\t"
1942 "paddd %%mm4, %%mm0 \n\t"
1943 "psrad $7, %%mm0 \n\t"
1944 "packssdw %%mm0, %%mm0 \n\t"
1945 "movd %%mm0, (%4, %%"REG_BP
") \n\t"
1946 "add $4, %%"REG_BP
" \n\t"
1949 "pop %%"REG_BP
" \n\t"
1951 "pop %%"REG_b
" \n\t"
1954 : "c" (filter
), "d" (filterPos
), "S" (src
), "D" (dst
)
1960 else if (filterSize
==8)
1962 long counter
= -2*dstW
;
1964 filterPos
-= counter
/2;
1968 "push %%"REG_b
" \n\t"
1970 "pxor %%mm7, %%mm7 \n\t"
1971 "push %%"REG_BP
" \n\t" // we use 7 regs here ...
1972 "mov %%"REG_a
", %%"REG_BP
" \n\t"
1975 "movzwl (%2, %%"REG_BP
"), %%eax \n\t"
1976 "movzwl 2(%2, %%"REG_BP
"), %%ebx \n\t"
1977 "movq (%1, %%"REG_BP
", 8), %%mm1 \n\t"
1978 "movq 16(%1, %%"REG_BP
", 8), %%mm3 \n\t"
1979 "movd (%3, %%"REG_a
"), %%mm0 \n\t"
1980 "movd (%3, %%"REG_b
"), %%mm2 \n\t"
1981 "punpcklbw %%mm7, %%mm0 \n\t"
1982 "punpcklbw %%mm7, %%mm2 \n\t"
1983 "pmaddwd %%mm1, %%mm0 \n\t"
1984 "pmaddwd %%mm2, %%mm3 \n\t"
1986 "movq 8(%1, %%"REG_BP
", 8), %%mm1 \n\t"
1987 "movq 24(%1, %%"REG_BP
", 8), %%mm5 \n\t"
1988 "movd 4(%3, %%"REG_a
"), %%mm4 \n\t"
1989 "movd 4(%3, %%"REG_b
"), %%mm2 \n\t"
1990 "punpcklbw %%mm7, %%mm4 \n\t"
1991 "punpcklbw %%mm7, %%mm2 \n\t"
1992 "pmaddwd %%mm1, %%mm4 \n\t"
1993 "pmaddwd %%mm2, %%mm5 \n\t"
1994 "paddd %%mm4, %%mm0 \n\t"
1995 "paddd %%mm5, %%mm3 \n\t"
1996 "movq %%mm0, %%mm4 \n\t"
1997 "punpckldq %%mm3, %%mm0 \n\t"
1998 "punpckhdq %%mm3, %%mm4 \n\t"
1999 "paddd %%mm4, %%mm0 \n\t"
2000 "psrad $7, %%mm0 \n\t"
2001 "packssdw %%mm0, %%mm0 \n\t"
2002 "movd %%mm0, (%4, %%"REG_BP
") \n\t"
2003 "add $4, %%"REG_BP
" \n\t"
2006 "pop %%"REG_BP
" \n\t"
2008 "pop %%"REG_b
" \n\t"
2011 : "c" (filter
), "d" (filterPos
), "S" (src
), "D" (dst
)
2019 uint8_t *offset
= src
+filterSize
;
2020 long counter
= -2*dstW
;
2021 //filter-= counter*filterSize/2;
2022 filterPos
-= counter
/2;
2025 "pxor %%mm7, %%mm7 \n\t"
2028 "mov %2, %%"REG_c
" \n\t"
2029 "movzwl (%%"REG_c
", %0), %%eax \n\t"
2030 "movzwl 2(%%"REG_c
", %0), %%edx \n\t"
2031 "mov %5, %%"REG_c
" \n\t"
2032 "pxor %%mm4, %%mm4 \n\t"
2033 "pxor %%mm5, %%mm5 \n\t"
2035 "movq (%1), %%mm1 \n\t"
2036 "movq (%1, %6), %%mm3 \n\t"
2037 "movd (%%"REG_c
", %%"REG_a
"), %%mm0 \n\t"
2038 "movd (%%"REG_c
", %%"REG_d
"), %%mm2 \n\t"
2039 "punpcklbw %%mm7, %%mm0 \n\t"
2040 "punpcklbw %%mm7, %%mm2 \n\t"
2041 "pmaddwd %%mm1, %%mm0 \n\t"
2042 "pmaddwd %%mm2, %%mm3 \n\t"
2043 "paddd %%mm3, %%mm5 \n\t"
2044 "paddd %%mm0, %%mm4 \n\t"
2046 "add $4, %%"REG_c
" \n\t"
2047 "cmp %4, %%"REG_c
" \n\t"
2050 "movq %%mm4, %%mm0 \n\t"
2051 "punpckldq %%mm5, %%mm4 \n\t"
2052 "punpckhdq %%mm5, %%mm0 \n\t"
2053 "paddd %%mm0, %%mm4 \n\t"
2054 "psrad $7, %%mm4 \n\t"
2055 "packssdw %%mm4, %%mm4 \n\t"
2056 "mov %3, %%"REG_a
" \n\t"
2057 "movd %%mm4, (%%"REG_a
", %0) \n\t"
2061 : "+r" (counter
), "+r" (filter
)
2062 : "m" (filterPos
), "m" (dst
), "m"(offset
),
2063 "m" (src
), "r" (filterSize
*2)
2064 : "%"REG_a
, "%"REG_c
, "%"REG_d
/* AltiVec implementation */
2069 hScale_altivec_real(dst
, dstW
, src
, srcW
, xInc
, filter
, filterPos
, filterSize
);
/* plain C reference implementation */
2072 for (i
=0; i
<dstW
; i
++)
2075 int srcPos
= filterPos
[i
];
2077 //printf("filterPos: %d\n", filterPos[i]);
2078 for (j
=0; j
<filterSize
; j
++)
2080 //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2081 val
+= ((int)src
[srcPos
+ j
])*filter
[filterSize
*i
+ j
];
2083 //filter += hFilterSize;
2084 dst
[i
] = FFMIN(val
>>7, (1<<15)-1); // the cubic equation does overflow ...
2087 #endif /* HAVE_ALTIVEC */
2088 #endif /* HAVE_MMX */
2090 // *** horizontal scale Y line to temp buffer
/*
 * hyscale: horizontally scale one luma input line (src, srcW samples) into
 * the 16-bit temporary line dst (dstWidth samples).
 *
 * Steps visible in this function:
 *  1. For packed YUV, 16-bit gray, RGB/BGR, paletted and monochrome source
 *     formats the line is first converted into formatConvBuffer by the
 *     matching RENAME(...ToY) helper, and src is redirected to that buffer.
 *  2. Unless SWS_FAST_BILINEAR is set in flags, the line is scaled by
 *     RENAME(hScale) with hLumFilter / hLumFilterPos / hLumFilterSize.
 *     Otherwise a bilinear path is used (inline asm or the C loop below).
 *  3. If c->srcRange != c->dstRange and the destination format is neither
 *     RGB nor BGR, dst is remapped in place with a fixed-point linear map.
 *
 * pal is only forwarded to the ...ToY helpers.
 * NOTE(review): funnyYCode, mmx2Filter and mmx2FilterPos are only referenced
 * from the MMX2 asm section, parts of which are not visible here.
 */
2091 static inline void RENAME(hyscale
)(SwsContext
*c
, uint16_t *dst
, long dstWidth
, uint8_t *src
, int srcW
, int xInc
,
2092 int flags
, int canMMX2BeUsed
, int16_t *hLumFilter
,
2093 int16_t *hLumFilterPos
, int hLumFilterSize
, void *funnyYCode
,
2094 int srcFormat
, uint8_t *formatConvBuffer
, int16_t *mmx2Filter
,
2095 int32_t *mmx2FilterPos
, uint32_t *pal
)
// Input format dispatch: every branch below writes the converted line to
// formatConvBuffer and then makes src point at that buffer.
// GRAY16BE shares the YUYV422 helper and GRAY16LE shares the UYVY422 helper
// (presumably because the wanted byte sits at the same offset - confirm
// against yuy2ToY / uyvyToY).
2097 if (srcFormat
==PIX_FMT_YUYV422
|| srcFormat
==PIX_FMT_GRAY16BE
)
2099 RENAME(yuy2ToY
)(formatConvBuffer
, src
, srcW
, pal
);
2100 src
= formatConvBuffer
;
2102 else if (srcFormat
==PIX_FMT_UYVY422
|| srcFormat
==PIX_FMT_GRAY16LE
)
2104 RENAME(uyvyToY
)(formatConvBuffer
, src
, srcW
, pal
);
2105 src
= formatConvBuffer
;
// Note the crossed naming: PIX_FMT_RGB32 is handled by bgr32ToY, and
// PIX_FMT_BGR32 further down by rgb32ToY.
2107 else if (srcFormat
==PIX_FMT_RGB32
)
2109 RENAME(bgr32ToY
)(formatConvBuffer
, src
, srcW
, pal
);
2110 src
= formatConvBuffer
;
// The *_1 variants reuse the 32-bit helper but read from src+ALT32_CORR.
2112 else if (srcFormat
==PIX_FMT_RGB32_1
)
2114 RENAME(bgr32ToY
)(formatConvBuffer
, src
+ALT32_CORR
, srcW
, pal
);
2115 src
= formatConvBuffer
;
2117 else if (srcFormat
==PIX_FMT_BGR24
)
2119 RENAME(bgr24ToY
)(formatConvBuffer
, src
, srcW
, pal
);
2120 src
= formatConvBuffer
;
2122 else if (srcFormat
==PIX_FMT_BGR565
)
2124 RENAME(bgr16ToY
)(formatConvBuffer
, src
, srcW
, pal
);
2125 src
= formatConvBuffer
;
2127 else if (srcFormat
==PIX_FMT_BGR555
)
2129 RENAME(bgr15ToY
)(formatConvBuffer
, src
, srcW
, pal
);
2130 src
= formatConvBuffer
;
2132 else if (srcFormat
==PIX_FMT_BGR32
)
2134 RENAME(rgb32ToY
)(formatConvBuffer
, src
, srcW
, pal
);
2135 src
= formatConvBuffer
;
2137 else if (srcFormat
==PIX_FMT_BGR32_1
)
2139 RENAME(rgb32ToY
)(formatConvBuffer
, src
+ALT32_CORR
, srcW
, pal
);
2140 src
= formatConvBuffer
;
2142 else if (srcFormat
==PIX_FMT_RGB24
)
2144 RENAME(rgb24ToY
)(formatConvBuffer
, src
, srcW
, pal
);
2145 src
= formatConvBuffer
;
2147 else if (srcFormat
==PIX_FMT_RGB565
)
2149 RENAME(rgb16ToY
)(formatConvBuffer
, src
, srcW
, pal
);
2150 src
= formatConvBuffer
;
2152 else if (srcFormat
==PIX_FMT_RGB555
)
2154 RENAME(rgb15ToY
)(formatConvBuffer
, src
, srcW
, pal
);
2155 src
= formatConvBuffer
;
// All byte-per-pixel paletted / low-depth formats go through palToY.
2157 else if (srcFormat
==PIX_FMT_RGB8
|| srcFormat
==PIX_FMT_BGR8
|| srcFormat
==PIX_FMT_PAL8
|| srcFormat
==PIX_FMT_BGR4_BYTE
|| srcFormat
==PIX_FMT_RGB4_BYTE
)
2159 RENAME(palToY
)(formatConvBuffer
, src
, srcW
, pal
);
2160 src
= formatConvBuffer
;
2162 else if (srcFormat
==PIX_FMT_MONOBLACK
)
2164 RENAME(monoblack2Y
)(formatConvBuffer
, src
, srcW
, pal
);
2165 src
= formatConvBuffer
;
2167 else if (srcFormat
==PIX_FMT_MONOWHITE
)
2169 RENAME(monowhite2Y
)(formatConvBuffer
, src
, srcW
, pal
);
2170 src
= formatConvBuffer
;
2174 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2175 if (!(flags
&SWS_FAST_BILINEAR
) || (!canMMX2BeUsed
))
// Generic filter-based scaler: taken whenever SWS_FAST_BILINEAR is not set.
2177 if (!(flags
&SWS_FAST_BILINEAR
))
2180 RENAME(hScale
)(dst
, dstWidth
, src
, srcW
, xInc
, hLumFilter
, hLumFilterPos
, hLumFilterSize
);
2182 else // fast bilinear upscale / crap downscale
// ebxsave is the memory slot (%5) the asm uses to save REG_b on entry and
// restore it on exit, since REG_b is loaded with mmx2FilterPos in between.
2188 uint64_t ebxsave
__attribute__((aligned(8)));
2194 "mov %%"REG_b
", %5 \n\t"
2196 "pxor %%mm7, %%mm7 \n\t"
2197 "mov %0, %%"REG_c
" \n\t"
2198 "mov %1, %%"REG_D
" \n\t"
2199 "mov %2, %%"REG_d
" \n\t"
2200 "mov %3, %%"REG_b
" \n\t"
2201 "xor %%"REG_a
", %%"REG_a
" \n\t" // i
2202 PREFETCH
" (%%"REG_c
") \n\t"
2203 PREFETCH
" 32(%%"REG_c
") \n\t"
2204 PREFETCH
" 64(%%"REG_c
") \n\t"
2208 #define FUNNY_Y_CODE \
2209 "movl (%%"REG_b"), %%esi \n\t"\
2211 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2212 "add %%"REG_S", %%"REG_c" \n\t"\
2213 "add %%"REG_a", %%"REG_D" \n\t"\
2214 "xor %%"REG_a", %%"REG_a" \n\t"\
2218 #define FUNNY_Y_CODE \
2219 "movl (%%"REG_b"), %%esi \n\t"\
2221 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2222 "add %%"REG_a", %%"REG_D" \n\t"\
2223 "xor %%"REG_a", %%"REG_a" \n\t"\
2225 #endif /* ARCH_X86_64 */
2237 "mov %5, %%"REG_b
" \n\t"
2239 :: "m" (src
), "m" (dst
), "m" (mmx2Filter
), "m" (mmx2FilterPos
),
2244 : "%"REG_a
, "%"REG_c
, "%"REG_d
, "%"REG_S
, "%"REG_D
// Tail fix-up: walking backwards from the last output sample, every sample
// whose source position (i*xInc)>>16 is at or beyond the last input pixel
// is overwritten with that last pixel scaled by 128.
2249 for (i
=dstWidth
-1; (i
*xInc
)>>16 >=srcW
-1; i
--) dst
[i
] = src
[srcW
-1]*128;
2253 #endif /* HAVE_MMX2 */
// xInc is a 16.16 fixed-point step: split it into the integer part and the
// 16-bit fraction, which the asm below adds with addw / adc (carry from the
// fraction propagates into the integer position).
2254 long xInc_shr16
= xInc
>> 16;
2255 uint16_t xInc_mask
= xInc
& 0xffff;
2256 //NO MMX just normal asm ...
2258 "xor %%"REG_a
", %%"REG_a
" \n\t" // i
2259 "xor %%"REG_d
", %%"REG_d
" \n\t" // xx
2260 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2263 "movzbl (%0, %%"REG_d
"), %%edi \n\t" //src[xx]
2264 "movzbl 1(%0, %%"REG_d
"), %%esi \n\t" //src[xx+1]
2265 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2266 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2267 "shll $16, %%edi \n\t"
2268 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2269 "mov %1, %%"REG_D
" \n\t"
2270 "shrl $9, %%esi \n\t"
2271 "movw %%si, (%%"REG_D
", %%"REG_a
", 2) \n\t"
2272 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2273 "adc %3, %%"REG_d
" \n\t" //xx+= xInc>>8 + carry
2275 "movzbl (%0, %%"REG_d
"), %%edi \n\t" //src[xx]
2276 "movzbl 1(%0, %%"REG_d
"), %%esi \n\t" //src[xx+1]
2277 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2278 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2279 "shll $16, %%edi \n\t"
2280 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2281 "mov %1, %%"REG_D
" \n\t"
2282 "shrl $9, %%esi \n\t"
2283 "movw %%si, 2(%%"REG_D
", %%"REG_a
", 2) \n\t"
2284 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2285 "adc %3, %%"REG_d
" \n\t" //xx+= xInc>>8 + carry
2288 "add $2, %%"REG_a
" \n\t"
2289 "cmp %2, %%"REG_a
" \n\t"
2293 :: "r" (src
), "m" (dst
), "m" (dstWidth
), "m" (xInc_shr16
), "m" (xInc_mask
)
2294 : "%"REG_a
, "%"REG_d
, "%ecx", "%"REG_D
, "%esi"
2297 } //if MMX2 can't be used
// Portable C bilinear path. xpos is a 16.16 fixed-point source position:
// xx is its integer part and xalpha the top 7 bits of its fraction (0..127),
// so each output is src[xx]*128 plus the weighted difference to src[xx+1].
// NOTE(review): the statement advancing xpos is not visible in this excerpt.
2301 unsigned int xpos
=0;
2302 for (i
=0;i
<dstWidth
;i
++)
2304 register unsigned int xx
=xpos
>>16;
2305 register unsigned int xalpha
=(xpos
&0xFFFF)>>9;
2306 dst
[i
]= (src
[xx
]<<7) + (src
[xx
+1] - src
[xx
])*xalpha
;
2309 #endif /* ARCH_X86 */
// Range conversion: only when source and destination ranges differ and the
// destination is not RGB/BGR. Two fixed-point linear maps follow;
// NOTE(review): the condition selecting between them is not visible here.
2312 if(c
->srcRange
!= c
->dstRange
&& !(isRGB(c
->dstFormat
) || isBGR(c
->dstFormat
))){
2314 //FIXME all pal and rgb srcFormats could do this conversion as well
2315 //FIXME all scalers more complex than bilinear could do half of this transform
2317 for (i
=0; i
<dstWidth
; i
++)
2318 dst
[i
]= (dst
[i
]*14071 + 33561947)>>14;
// The FFMIN clamp bounds the input before the multiply in this variant.
2320 for (i
=0; i
<dstWidth
; i
++)
2321 dst
[i
]= (FFMIN(dst
[i
],30189)*19077 - 39057361)>>14;
/*
 * hcscale: horizontally scale one pair of chroma input lines (src1, src2;
 * srcW samples each) into the 16-bit temporary buffer dst. The first plane
 * is written at dst[0..dstWidth-1], the second at dst[VOFW..].
 *
 * Steps visible in this function:
 *  1. For packed YUV, RGB/BGR and paletted source formats both chroma planes
 *     are first produced in formatConvBuffer and formatConvBuffer+VOFW by the
 *     matching RENAME(...ToUV) helper; for RGB/BGR the ..._half variant is
 *     chosen when c->chrSrcHSubSample is non-zero. src1/src2 are then
 *     redirected to those buffers.
 *  2. Unless SWS_FAST_BILINEAR is set in flags, each plane is scaled by
 *     RENAME(hScale); otherwise a bilinear path is used (asm or C loop).
 *  3. If c->srcRange != c->dstRange and the destination format is neither
 *     RGB nor BGR, both planes are remapped in place with a fixed-point
 *     linear map.
 *
 * NOTE(review): the body of the gray / monochrome branch is not visible in
 * this excerpt.
 */
2326 inline static void RENAME(hcscale
)(SwsContext
*c
, uint16_t *dst
, long dstWidth
, uint8_t *src1
, uint8_t *src2
,
2327 int srcW
, int xInc
, int flags
, int canMMX2BeUsed
, int16_t *hChrFilter
,
2328 int16_t *hChrFilterPos
, int hChrFilterSize
, void *funnyUVCode
,
2329 int srcFormat
, uint8_t *formatConvBuffer
, int16_t *mmx2Filter
,
2330 int32_t *mmx2FilterPos
, uint32_t *pal
)
// Input format dispatch: every converting branch writes the two chroma planes
// to formatConvBuffer and formatConvBuffer+VOFW and re-points src1/src2.
2332 if (srcFormat
==PIX_FMT_YUYV422
)
2334 RENAME(yuy2ToUV
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2335 src1
= formatConvBuffer
;
2336 src2
= formatConvBuffer
+VOFW
;
2338 else if (srcFormat
==PIX_FMT_UYVY422
)
2340 RENAME(uyvyToUV
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2341 src1
= formatConvBuffer
;
2342 src2
= formatConvBuffer
+VOFW
;
// Note the crossed naming: PIX_FMT_RGB32 uses the bgr32 helpers, and
// PIX_FMT_BGR32 further down uses the rgb32 helpers.
2344 else if (srcFormat
==PIX_FMT_RGB32
)
// c->chrSrcHSubSample selects the ..._half helper variant.
2346 if(c
->chrSrcHSubSample
)
2347 RENAME(bgr32ToUV_half
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2349 RENAME(bgr32ToUV
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2350 src1
= formatConvBuffer
;
2351 src2
= formatConvBuffer
+VOFW
;
// The *_1 variants reuse the 32-bit helpers but read from src+ALT32_CORR.
2353 else if (srcFormat
==PIX_FMT_RGB32_1
)
2355 if(c
->chrSrcHSubSample
)
2356 RENAME(bgr32ToUV_half
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
+ALT32_CORR
, src2
+ALT32_CORR
, srcW
, pal
);
2358 RENAME(bgr32ToUV
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
+ALT32_CORR
, src2
+ALT32_CORR
, srcW
, pal
);
2359 src1
= formatConvBuffer
;
2360 src2
= formatConvBuffer
+VOFW
;
2362 else if (srcFormat
==PIX_FMT_BGR24
)
2364 if(c
->chrSrcHSubSample
)
2365 RENAME(bgr24ToUV_half
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2367 RENAME(bgr24ToUV
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2368 src1
= formatConvBuffer
;
2369 src2
= formatConvBuffer
+VOFW
;
2371 else if (srcFormat
==PIX_FMT_BGR565
)
2373 if(c
->chrSrcHSubSample
)
2374 RENAME(bgr16ToUV_half
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2376 RENAME(bgr16ToUV
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2377 src1
= formatConvBuffer
;
2378 src2
= formatConvBuffer
+VOFW
;
2380 else if (srcFormat
==PIX_FMT_BGR555
)
2382 if(c
->chrSrcHSubSample
)
2383 RENAME(bgr15ToUV_half
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2385 RENAME(bgr15ToUV
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2386 src1
= formatConvBuffer
;
2387 src2
= formatConvBuffer
+VOFW
;
2389 else if (srcFormat
==PIX_FMT_BGR32
)
2391 if(c
->chrSrcHSubSample
)
2392 RENAME(rgb32ToUV_half
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2394 RENAME(rgb32ToUV
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2395 src1
= formatConvBuffer
;
2396 src2
= formatConvBuffer
+VOFW
;
2398 else if (srcFormat
==PIX_FMT_BGR32_1
)
2400 if(c
->chrSrcHSubSample
)
2401 RENAME(rgb32ToUV_half
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
+ALT32_CORR
, src2
+ALT32_CORR
, srcW
, pal
);
2403 RENAME(rgb32ToUV
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
+ALT32_CORR
, src2
+ALT32_CORR
, srcW
, pal
);
2404 src1
= formatConvBuffer
;
2405 src2
= formatConvBuffer
+VOFW
;
2407 else if (srcFormat
==PIX_FMT_RGB24
)
2409 if(c
->chrSrcHSubSample
)
2410 RENAME(rgb24ToUV_half
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2412 RENAME(rgb24ToUV
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2413 src1
= formatConvBuffer
;
2414 src2
= formatConvBuffer
+VOFW
;
2416 else if (srcFormat
==PIX_FMT_RGB565
)
2418 if(c
->chrSrcHSubSample
)
2419 RENAME(rgb16ToUV_half
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2421 RENAME(rgb16ToUV
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2422 src1
= formatConvBuffer
;
2423 src2
= formatConvBuffer
+VOFW
;
2425 else if (srcFormat
==PIX_FMT_RGB555
)
2427 if(c
->chrSrcHSubSample
)
2428 RENAME(rgb15ToUV_half
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2430 RENAME(rgb15ToUV
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2431 src1
= formatConvBuffer
;
2432 src2
= formatConvBuffer
+VOFW
;
// Gray and monochrome sources get their own branch.
// NOTE(review): its body is not visible in this excerpt.
2434 else if (isGray(srcFormat
) || srcFormat
==PIX_FMT_MONOBLACK
|| srcFormat
==PIX_FMT_MONOWHITE
)
2438 else if (srcFormat
==PIX_FMT_RGB8
|| srcFormat
==PIX_FMT_BGR8
|| srcFormat
==PIX_FMT_PAL8
|| srcFormat
==PIX_FMT_BGR4_BYTE
|| srcFormat
==PIX_FMT_RGB4_BYTE
)
2440 RENAME(palToUV
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2441 src1
= formatConvBuffer
;
2442 src2
= formatConvBuffer
+VOFW
;
2446 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2447 if (!(flags
&SWS_FAST_BILINEAR
) || (!canMMX2BeUsed
))
// Generic filter-based scaler, run once per chroma plane with the same
// filter; the second plane lands at dst+VOFW.
2449 if (!(flags
&SWS_FAST_BILINEAR
))
2452 RENAME(hScale
)(dst
, dstWidth
, src1
, srcW
, xInc
, hChrFilter
, hChrFilterPos
, hChrFilterSize
);
2453 RENAME(hScale
)(dst
+VOFW
, dstWidth
, src2
, srcW
, xInc
, hChrFilter
, hChrFilterPos
, hChrFilterSize
);
2455 else // fast bilinear upscale / crap downscale
// ebxsave is the memory slot (%6) the asm uses to save REG_b on entry and
// restore it on exit, since REG_b is loaded with mmx2FilterPos in between.
2461 uint64_t ebxsave
__attribute__((aligned(8)));
2467 "mov %%"REG_b
", %6 \n\t"
2469 "pxor %%mm7, %%mm7 \n\t"
2470 "mov %0, %%"REG_c
" \n\t"
2471 "mov %1, %%"REG_D
" \n\t"
2472 "mov %2, %%"REG_d
" \n\t"
2473 "mov %3, %%"REG_b
" \n\t"
2474 "xor %%"REG_a
", %%"REG_a
" \n\t" // i
2475 PREFETCH
" (%%"REG_c
") \n\t"
2476 PREFETCH
" 32(%%"REG_c
") \n\t"
2477 PREFETCH
" 64(%%"REG_c
") \n\t"
2481 #define FUNNY_UV_CODE \
2482 "movl (%%"REG_b"), %%esi \n\t"\
2484 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2485 "add %%"REG_S", %%"REG_c" \n\t"\
2486 "add %%"REG_a", %%"REG_D" \n\t"\
2487 "xor %%"REG_a", %%"REG_a" \n\t"\
2491 #define FUNNY_UV_CODE \
2492 "movl (%%"REG_b"), %%esi \n\t"\
2494 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2495 "add %%"REG_a", %%"REG_D" \n\t"\
2496 "xor %%"REG_a", %%"REG_a" \n\t"\
2498 #endif /* ARCH_X86_64 */
2504 "xor %%"REG_a
", %%"REG_a
" \n\t" // i
2505 "mov %5, %%"REG_c
" \n\t" // src
2506 "mov %1, %%"REG_D
" \n\t" // buf1
2507 "add $"AV_STRINGIFY(VOF
)", %%"REG_D
" \n\t"
2508 PREFETCH
" (%%"REG_c
") \n\t"
2509 PREFETCH
" 32(%%"REG_c
") \n\t"
2510 PREFETCH
" 64(%%"REG_c
") \n\t"
2518 "mov %6, %%"REG_b
" \n\t"
2520 :: "m" (src1
), "m" (dst
), "m" (mmx2Filter
), "m" (mmx2FilterPos
),
2521 "m" (funnyUVCode
), "m" (src2
)
2525 : "%"REG_a
, "%"REG_c
, "%"REG_d
, "%"REG_S
, "%"REG_D
// Tail fix-up for both planes: output samples whose source position
// (i*xInc)>>16 is at or beyond the last input pixel are overwritten with
// that last pixel scaled by 128.
2530 for (i
=dstWidth
-1; (i
*xInc
)>>16 >=srcW
-1; i
--)
2532 //printf("%d %d %d\n", dstWidth, i, srcW);
2533 dst
[i
] = src1
[srcW
-1]*128;
2534 dst
[i
+VOFW
] = src2
[srcW
-1]*128;
2539 #endif /* HAVE_MMX2 */
// xInc is a 16.16 fixed-point step: split it into the integer part and the
// 16-bit fraction, which the asm below adds with addw / adc.
2540 long xInc_shr16
= (long) (xInc
>> 16);
2541 uint16_t xInc_mask
= xInc
& 0xffff;
2543 "xor %%"REG_a
", %%"REG_a
" \n\t" // i
2544 "xor %%"REG_d
", %%"REG_d
" \n\t" // xx
2545 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2548 "mov %0, %%"REG_S
" \n\t"
2549 "movzbl (%%"REG_S
", %%"REG_d
"), %%edi \n\t" //src[xx]
2550 "movzbl 1(%%"REG_S
", %%"REG_d
"), %%esi \n\t" //src[xx+1]
2551 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2552 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2553 "shll $16, %%edi \n\t"
2554 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2555 "mov %1, %%"REG_D
" \n\t"
2556 "shrl $9, %%esi \n\t"
2557 "movw %%si, (%%"REG_D
", %%"REG_a
", 2) \n\t"
2559 "movzbl (%5, %%"REG_d
"), %%edi \n\t" //src[xx]
2560 "movzbl 1(%5, %%"REG_d
"), %%esi \n\t" //src[xx+1]
2561 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2562 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2563 "shll $16, %%edi \n\t"
2564 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2565 "mov %1, %%"REG_D
" \n\t"
2566 "shrl $9, %%esi \n\t"
2567 "movw %%si, "AV_STRINGIFY(VOF
)"(%%"REG_D
", %%"REG_a
", 2) \n\t"
2569 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2570 "adc %3, %%"REG_d
" \n\t" //xx+= xInc>>8 + carry
2571 "add $1, %%"REG_a
" \n\t"
2572 "cmp %2, %%"REG_a
" \n\t"
2575 /* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2576 which is needed to support GCC 4.0. */
2577 #if ARCH_X86_64 && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2578 :: "m" (src1
), "m" (dst
), "g" ((long)dstWidth
), "m" (xInc_shr16
), "m" (xInc_mask
),
2580 :: "m" (src1
), "m" (dst
), "m" ((long)dstWidth
), "m" (xInc_shr16
), "m" (xInc_mask
),
2583 : "%"REG_a
, "%"REG_d
, "%ecx", "%"REG_D
, "%esi"
2586 } //if MMX2 can't be used
// Portable C bilinear path. xpos is a 16.16 fixed-point source position:
// xx is its integer part and xalpha the top 7 bits of its fraction (0..127),
// so (xalpha^127) equals 127-xalpha and the two weights sum to 127.
// NOTE(review): the statement advancing xpos is not visible in this excerpt.
2590 unsigned int xpos
=0;
2591 for (i
=0;i
<dstWidth
;i
++)
2593 register unsigned int xx
=xpos
>>16;
2594 register unsigned int xalpha
=(xpos
&0xFFFF)>>9;
2595 dst
[i
]=(src1
[xx
]*(xalpha
^127)+src1
[xx
+1]*xalpha
);
2596 dst
[i
+VOFW
]=(src2
[xx
]*(xalpha
^127)+src2
[xx
+1]*xalpha
);
// NOTE(review): the next two lines are a shift-based alternative form of the
// two assignments above (weights sum to 128 instead of 127); confirm whether
// they are meant to be active code or belong inside a comment.
2598 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2599 dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2603 #endif /* ARCH_X86 */
// Range conversion: only when source and destination ranges differ and the
// destination is not RGB/BGR. Two fixed-point linear maps follow, each
// applied to both planes; NOTE(review): the condition selecting between them
// is not visible here.
2605 if(c
->srcRange
!= c
->dstRange
&& !(isRGB(c
->dstFormat
) || isBGR(c
->dstFormat
))){
2607 //FIXME all pal and rgb srcFormats could do this conversion as well
2608 //FIXME all scalers more complex than bilinear could do half of this transform
2610 for (i
=0; i
<dstWidth
; i
++){
2611 dst
[i
]= (dst
[i
]*1799 + 4081085)>>11; //1469
2612 dst
[i
+VOFW
]= (dst
[i
+VOFW
]*1799 + 4081085)>>11; //1469
// The FFMIN clamp bounds the input before the multiply in this variant.
2615 for (i
=0; i
<dstWidth
; i
++){
2616 dst
[i
]= (FFMIN(dst
[i
],30775)*4663 - 9289992)>>12; //-264
2617 dst
[i
+VOFW
]= (FFMIN(dst
[i
+VOFW
],30775)*4663 - 9289992)>>12; //-264
/*
 * swScale: scale one input slice (rows srcSliceY .. srcSliceY+srcSliceH-1 of
 * the planes in src[] with strides srcStride[]) into dst[] / dstStride[].
 *
 * Structure visible in this function:
 *  - Context fields are copied into locals; the ring-buffer state
 *    (lumBufIndex, chrBufIndex, lastInLumBuf, lastInChrBuf) is read from the
 *    context at the start and written back at the end.
 *  - For each output line dstY, the input lines needed by the vertical
 *    filter are horizontally scaled into lumPixBuf / chrPixBuf via
 *    RENAME(hyscale) / RENAME(hcscale). If the slice does not contain all of
 *    them, whatever is available is buffered and the loop breaks so that the
 *    next slice can continue.
 *  - The buffered lines are then combined vertically and written out by one
 *    of the yuv2* output functions, selected by dstFormat, the vertical
 *    filter sizes and flags.
 *
 * Note that srcStride[] is modified in place (see below).
 * Returns dstY - lastDstY. NOTE(review): the initialisation of dstY and
 * lastDstY is not visible in this excerpt; presumably this is the number of
 * output lines produced by this call - confirm.
 */
2623 static int RENAME(swScale
)(SwsContext
*c
, uint8_t* src
[], int srcStride
[], int srcSliceY
,
2624 int srcSliceH
, uint8_t* dst
[], int dstStride
[]){
2626 /* load a few things into local vars to make the code more readable? and faster */
2627 const int srcW
= c
->srcW
;
2628 const int dstW
= c
->dstW
;
2629 const int dstH
= c
->dstH
;
2630 const int chrDstW
= c
->chrDstW
;
2631 const int chrSrcW
= c
->chrSrcW
;
2632 const int lumXInc
= c
->lumXInc
;
2633 const int chrXInc
= c
->chrXInc
;
2634 const int dstFormat
= c
->dstFormat
;
2635 const int srcFormat
= c
->srcFormat
;
2636 const int flags
= c
->flags
;
2637 const int canMMX2BeUsed
= c
->canMMX2BeUsed
;
2638 int16_t *vLumFilterPos
= c
->vLumFilterPos
;
2639 int16_t *vChrFilterPos
= c
->vChrFilterPos
;
2640 int16_t *hLumFilterPos
= c
->hLumFilterPos
;
2641 int16_t *hChrFilterPos
= c
->hChrFilterPos
;
2642 int16_t *vLumFilter
= c
->vLumFilter
;
2643 int16_t *vChrFilter
= c
->vChrFilter
;
2644 int16_t *hLumFilter
= c
->hLumFilter
;
2645 int16_t *hChrFilter
= c
->hChrFilter
;
2646 int32_t *lumMmxFilter
= c
->lumMmxFilter
;
2647 int32_t *chrMmxFilter
= c
->chrMmxFilter
;
2648 const int vLumFilterSize
= c
->vLumFilterSize
;
2649 const int vChrFilterSize
= c
->vChrFilterSize
;
2650 const int hLumFilterSize
= c
->hLumFilterSize
;
2651 const int hChrFilterSize
= c
->hChrFilterSize
;
2652 int16_t **lumPixBuf
= c
->lumPixBuf
;
2653 int16_t **chrPixBuf
= c
->chrPixBuf
;
2654 const int vLumBufSize
= c
->vLumBufSize
;
2655 const int vChrBufSize
= c
->vChrBufSize
;
2656 uint8_t *funnyYCode
= c
->funnyYCode
;
2657 uint8_t *funnyUVCode
= c
->funnyUVCode
;
2658 uint8_t *formatConvBuffer
= c
->formatConvBuffer
;
// Chroma slice geometry: the start row is shifted down by the vertical
// chroma subsampling; the height uses -((-h) >> s), i.e. it rounds up
// (relies on arithmetic right shift of negative ints).
2659 const int chrSrcSliceY
= srcSliceY
>> c
->chrSrcVSubSample
;
2660 const int chrSrcSliceH
= -((-srcSliceH
) >> c
->chrSrcVSubSample
);
2662 uint32_t *pal
=c
->pal_yuv
;
2664 /* vars which will change and which we need to store back in the context */
2666 int lumBufIndex
= c
->lumBufIndex
;
2667 int chrBufIndex
= c
->chrBufIndex
;
2668 int lastInLumBuf
= c
->lastInLumBuf
;
2669 int lastInChrBuf
= c
->lastInChrBuf
;
// Packed source formats: the chroma stride is taken from plane 0.
// NOTE(review): only part of this branch is visible in this excerpt.
2671 if (isPacked(c
->srcFormat
)){
2677 srcStride
[2]= srcStride
[0];
// Chroma strides are scaled by 2^vChrDrop; this modifies the caller's
// srcStride array in place.
2679 srcStride
[1]<<= c
->vChrDrop
;
2680 srcStride
[2]<<= c
->vChrDrop
;
2682 //printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2683 // (int)dst[0], (int)dst[1], (int)dst[2]);
2685 #if 0 //self test FIXME move to a vfilter or something
2687 static volatile int i
=0;
2689 if (srcFormat
==PIX_FMT_YUV420P
&& i
==1 && srcSliceH
>= c
->srcH
)
2690 selfTest(src
, srcStride
, c
->srcW
, c
->srcH
);
2695 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2696 //dstStride[0],dstStride[1],dstStride[2]);
// Warn (at most once per process, via the static flag) when any destination
// stride is not a multiple of 8 and SWS_PRINT_INFO is set.
2698 if (dstStride
[0]%8 !=0 || dstStride
[1]%8 !=0 || dstStride
[2]%8 !=0)
2700 static int warnedAlready
=0; //FIXME move this into the context perhaps
2701 if (flags
& SWS_PRINT_INFO
&& !warnedAlready
)
2703 av_log(c
, AV_LOG_WARNING
, "Warning: dstStride is not aligned!\n"
2704 " ->cannot do aligned memory accesses anymore\n");
2709 /* Note the user might start scaling the picture in the middle so this
2710 will not get executed. This is not really intended but works
2711 currently, so people might do it. */
// Main loop: one iteration per output line, until dstH is reached or the
// slice runs out of input (see the break further down).
2722 for (;dstY
< dstH
; dstY
++){
2723 unsigned char *dest
=dst
[0]+dstStride
[0]*dstY
;
2724 const int chrDstY
= dstY
>>c
->chrDstVSubSample
;
2725 unsigned char *uDest
=dst
[1]+dstStride
[1]*chrDstY
;
2726 unsigned char *vDest
=dst
[2]+dstStride
[2]*chrDstY
;
2728 const int firstLumSrcY
= vLumFilterPos
[dstY
]; //First line needed as input
2729 const int firstChrSrcY
= vChrFilterPos
[chrDstY
]; //First line needed as input
2730 const int lastLumSrcY
= firstLumSrcY
+ vLumFilterSize
-1; // Last line needed as input
2731 const int lastChrSrcY
= firstChrSrcY
+ vChrFilterSize
-1; // Last line needed as input
2733 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
2734 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
2735 //handle holes (FAST_BILINEAR & weird filters)
// If the filter window starts after the last buffered line, skip the
// unneeded input lines by moving the "last buffered" marker forward.
2736 if (firstLumSrcY
> lastInLumBuf
) lastInLumBuf
= firstLumSrcY
-1;
2737 if (firstChrSrcY
> lastInChrBuf
) lastInChrBuf
= firstChrSrcY
-1;
2738 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
// The first line needed must not be older than what the buffers can hold.
2739 assert(firstLumSrcY
>= lastInLumBuf
- vLumBufSize
+ 1);
2740 assert(firstChrSrcY
>= lastInChrBuf
- vChrBufSize
+ 1);
2742 // Do we have enough lines in this slice to output the dstY line
2743 if (lastLumSrcY
< srcSliceY
+ srcSliceH
&& lastChrSrcY
< -((-srcSliceY
- srcSliceH
)>>c
->chrSrcVSubSample
))
2745 //Do horizontal scaling
// NOTE(review): the statements advancing lumBufIndex / lastInLumBuf inside
// this loop are not visible in this excerpt.
2746 while(lastInLumBuf
< lastLumSrcY
)
2748 uint8_t *s
= src
[0]+(lastInLumBuf
+ 1 - srcSliceY
)*srcStride
[0];
2750 //printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
2751 assert(lumBufIndex
< 2*vLumBufSize
);
2752 assert(lastInLumBuf
+ 1 - srcSliceY
< srcSliceH
);
2753 assert(lastInLumBuf
+ 1 - srcSliceY
>= 0);
2754 //printf("%d %d\n", lumBufIndex, vLumBufSize);
2755 RENAME(hyscale
)(c
, lumPixBuf
[ lumBufIndex
], dstW
, s
, srcW
, lumXInc
,
2756 flags
, canMMX2BeUsed
, hLumFilter
, hLumFilterPos
, hLumFilterSize
,
2757 funnyYCode
, c
->srcFormat
, formatConvBuffer
,
2758 c
->lumMmx2Filter
, c
->lumMmx2FilterPos
, pal
);
2761 while(lastInChrBuf
< lastChrSrcY
)
2763 uint8_t *src1
= src
[1]+(lastInChrBuf
+ 1 - chrSrcSliceY
)*srcStride
[1];
2764 uint8_t *src2
= src
[2]+(lastInChrBuf
+ 1 - chrSrcSliceY
)*srcStride
[2];
2766 assert(chrBufIndex
< 2*vChrBufSize
);
2767 assert(lastInChrBuf
+ 1 - chrSrcSliceY
< (chrSrcSliceH
));
2768 assert(lastInChrBuf
+ 1 - chrSrcSliceY
>= 0);
2769 //FIXME replace parameters through context struct (some at least)
// Chroma scaling is skipped when either side is a gray format.
2771 if (!(isGray(srcFormat
) || isGray(dstFormat
)))
2772 RENAME(hcscale
)(c
, chrPixBuf
[ chrBufIndex
], chrDstW
, src1
, src2
, chrSrcW
, chrXInc
,
2773 flags
, canMMX2BeUsed
, hChrFilter
, hChrFilterPos
, hChrFilterSize
,
2774 funnyUVCode
, c
->srcFormat
, formatConvBuffer
,
2775 c
->chrMmx2Filter
, c
->chrMmx2FilterPos
, pal
);
2778 //wrap buf index around to stay inside the ring buffer
2779 if (lumBufIndex
>= vLumBufSize
) lumBufIndex
-= vLumBufSize
;
2780 if (chrBufIndex
>= vChrBufSize
) chrBufIndex
-= vChrBufSize
;
2782 else // not enough lines left in this slice -> load the rest in the buffer
2784 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
2785 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
2786 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
2787 vChrBufSize, vLumBufSize);*/
2789 //Do horizontal scaling
// Buffer every remaining luma line of this slice (bounded by the slice end,
// not by the filter window as in the branch above).
2790 while(lastInLumBuf
+1 < srcSliceY
+ srcSliceH
)
2792 uint8_t *s
= src
[0]+(lastInLumBuf
+ 1 - srcSliceY
)*srcStride
[0];
2794 assert(lumBufIndex
< 2*vLumBufSize
);
2795 assert(lastInLumBuf
+ 1 - srcSliceY
< srcSliceH
);
2796 assert(lastInLumBuf
+ 1 - srcSliceY
>= 0);
2797 RENAME(hyscale
)(c
, lumPixBuf
[ lumBufIndex
], dstW
, s
, srcW
, lumXInc
,
2798 flags
, canMMX2BeUsed
, hLumFilter
, hLumFilterPos
, hLumFilterSize
,
2799 funnyYCode
, c
->srcFormat
, formatConvBuffer
,
2800 c
->lumMmx2Filter
, c
->lumMmx2FilterPos
, pal
);
2803 while(lastInChrBuf
+1 < (chrSrcSliceY
+ chrSrcSliceH
))
2805 uint8_t *src1
= src
[1]+(lastInChrBuf
+ 1 - chrSrcSliceY
)*srcStride
[1];
2806 uint8_t *src2
= src
[2]+(lastInChrBuf
+ 1 - chrSrcSliceY
)*srcStride
[2];
2808 assert(chrBufIndex
< 2*vChrBufSize
);
2809 assert(lastInChrBuf
+ 1 - chrSrcSliceY
< chrSrcSliceH
);
2810 assert(lastInChrBuf
+ 1 - chrSrcSliceY
>= 0);
2812 if (!(isGray(srcFormat
) || isGray(dstFormat
)))
2813 RENAME(hcscale
)(c
, chrPixBuf
[ chrBufIndex
], chrDstW
, src1
, src2
, chrSrcW
, chrXInc
,
2814 flags
, canMMX2BeUsed
, hChrFilter
, hChrFilterPos
, hChrFilterSize
,
2815 funnyUVCode
, c
->srcFormat
, formatConvBuffer
,
2816 c
->chrMmx2Filter
, c
->chrMmx2FilterPos
, pal
);
2819 //wrap buf index around to stay inside the ring buffer
2820 if (lumBufIndex
>= vLumBufSize
) lumBufIndex
-= vLumBufSize
;
2821 if (chrBufIndex
>= vChrBufSize
) chrBufIndex
-= vChrBufSize
;
2822 break; //we can't output a dstY line so let's try with the next slice
// Per-line dither tables alternate with the parity of dstY; red uses the
// opposite parity to blue. Green uses ff_dither8 for the 555 formats and
// ff_dither4 in the other visible assignment.
2826 c
->blueDither
= ff_dither8
[dstY
&1];
2827 if (c
->dstFormat
== PIX_FMT_RGB555
|| c
->dstFormat
== PIX_FMT_BGR555
)
2828 c
->greenDither
= ff_dither8
[dstY
&1];
2830 c
->greenDither
= ff_dither4
[dstY
&1];
2831 c
->redDither
= ff_dither8
[(dstY
+1)&1];
// Locate the first buffered line of the vertical filter window. Adding the
// buffer size keeps the offset non-negative; the asserts further down bound
// the window by 2*v*BufSize, so the pointer tables appear to hold two
// copies of the ring (TODO confirm at the allocation site).
2835 int16_t **lumSrcPtr
= lumPixBuf
+ lumBufIndex
+ firstLumSrcY
- lastInLumBuf
+ vLumBufSize
;
2836 int16_t **chrSrcPtr
= chrPixBuf
+ chrBufIndex
+ firstChrSrcY
- lastInChrBuf
+ vChrBufSize
;
// SWS_ACCURATE_RND layout: taps are packed in pairs (two line pointers plus
// two 16-bit coefficients combined into one 32-bit word, stored twice).
// When the filter has a single tap, the second pointer aliases the first
// and the second coefficient is 0.
2839 if (flags
& SWS_ACCURATE_RND
){
2840 int s
= APCK_SIZE
/ 8;
2841 for (i
=0; i
<vLumFilterSize
; i
+=2){
2842 *(void**)&lumMmxFilter
[s
*i
]= lumSrcPtr
[i
];
2843 *(void**)&lumMmxFilter
[s
*i
+APCK_PTR2
/4 ]= lumSrcPtr
[i
+(vLumFilterSize
>1)];
2844 lumMmxFilter
[s
*i
+APCK_COEF
/4 ]=
2845 lumMmxFilter
[s
*i
+APCK_COEF
/4+1]= vLumFilter
[dstY
*vLumFilterSize
+ i
]
2846 + (vLumFilterSize
>1 ? vLumFilter
[dstY
*vLumFilterSize
+ i
+ 1]<<16 : 0);
2848 for (i
=0; i
<vChrFilterSize
; i
+=2){
2849 *(void**)&chrMmxFilter
[s
*i
]= chrSrcPtr
[i
];
2850 *(void**)&chrMmxFilter
[s
*i
+APCK_PTR2
/4 ]= chrSrcPtr
[i
+(vChrFilterSize
>1)];
2851 chrMmxFilter
[s
*i
+APCK_COEF
/4 ]=
2852 chrMmxFilter
[s
*i
+APCK_COEF
/4+1]= vChrFilter
[chrDstY
*vChrFilterSize
+ i
]
2853 + (vChrFilterSize
>1 ? vChrFilter
[chrDstY
*vChrFilterSize
+ i
+ 1]<<16 : 0);
// Default layout: four 32-bit words per tap - the line pointer split into
// low and high halves, then the 16-bit coefficient replicated into both
// halves of two words (multiplication by 0x10001).
2856 for (i
=0; i
<vLumFilterSize
; i
++)
2858 lumMmxFilter
[4*i
+0]= (int32_t)lumSrcPtr
[i
];
2859 lumMmxFilter
[4*i
+1]= (uint64_t)lumSrcPtr
[i
] >> 32;
2860 lumMmxFilter
[4*i
+2]=
2861 lumMmxFilter
[4*i
+3]=
2862 ((uint16_t)vLumFilter
[dstY
*vLumFilterSize
+ i
])*0x10001;
2864 for (i
=0; i
<vChrFilterSize
; i
++)
2866 chrMmxFilter
[4*i
+0]= (int32_t)chrSrcPtr
[i
];
2867 chrMmxFilter
[4*i
+1]= (uint64_t)chrSrcPtr
[i
] >> 32;
2868 chrMmxFilter
[4*i
+2]=
2869 chrMmxFilter
[4*i
+3]=
2870 ((uint16_t)vChrFilter
[chrDstY
*vChrFilterSize
+ i
])*0x10001;
// Output stage. For NV12/NV21 and planar YUV, chroma is only written on
// lines where dstY is a multiple of the vertical chroma subsampling factor;
// on the other lines the chroma destination pointer(s) are set to NULL.
2874 if (dstFormat
== PIX_FMT_NV12
|| dstFormat
== PIX_FMT_NV21
){
2875 const int chrSkipMask
= (1<<c
->chrDstVSubSample
)-1;
2876 if (dstY
&chrSkipMask
) uDest
= NULL
; //FIXME split functions in lumi / chromi
2877 RENAME(yuv2nv12X
)(c
,
2878 vLumFilter
+dstY
*vLumFilterSize
, lumSrcPtr
, vLumFilterSize
,
2879 vChrFilter
+chrDstY
*vChrFilterSize
, chrSrcPtr
, vChrFilterSize
,
2880 dest
, uDest
, dstW
, chrDstW
, dstFormat
);
2882 else if (isPlanarYUV(dstFormat
) || dstFormat
==PIX_FMT_GRAY8
) //YV12 like
2884 const int chrSkipMask
= (1<<c
->chrDstVSubSample
)-1;
2885 if ((dstY
&chrSkipMask
) || isGray(dstFormat
)) uDest
=vDest
= NULL
; //FIXME split functions in lumi / chromi
2886 if (vLumFilterSize
== 1 && vChrFilterSize
== 1) // unscaled YV12
2888 int16_t *lumBuf
= lumPixBuf
[0];
2889 int16_t *chrBuf
= chrPixBuf
[0];
2890 RENAME(yuv2yuv1
)(c
, lumBuf
, chrBuf
, dest
, uDest
, vDest
, dstW
, chrDstW
);
2895 vLumFilter
+dstY
*vLumFilterSize
, lumSrcPtr
, vLumFilterSize
,
2896 vChrFilter
+chrDstY
*vChrFilterSize
, chrSrcPtr
, vChrFilterSize
,
2897 dest
, uDest
, vDest
, dstW
, chrDstW
);
// Packed output formats: the filter window must lie inside the pointer
// tables before any of the packed writers is called.
2902 assert(lumSrcPtr
+ vLumFilterSize
- 1 < lumPixBuf
+ vLumBufSize
*2);
2903 assert(chrSrcPtr
+ vChrFilterSize
- 1 < chrPixBuf
+ vChrBufSize
*2);
2904 if (vLumFilterSize
== 1 && vChrFilterSize
== 2) //unscaled RGB
2906 int chrAlpha
= vChrFilter
[2*dstY
+1];
2907 if(flags
& SWS_FULL_CHR_H_INT
){
2908 yuv2rgbXinC_full(c
, //FIXME write a packed1_full function
2909 vLumFilter
+dstY
*vLumFilterSize
, lumSrcPtr
, vLumFilterSize
,
2910 vChrFilter
+dstY
*vChrFilterSize
, chrSrcPtr
, vChrFilterSize
,
2913 RENAME(yuv2packed1
)(c
, *lumSrcPtr
, *chrSrcPtr
, *(chrSrcPtr
+1),
2914 dest
, dstW
, chrAlpha
, dstFormat
, flags
, dstY
);
2917 else if (vLumFilterSize
== 2 && vChrFilterSize
== 2) //bilinear upscale RGB
2919 int lumAlpha
= vLumFilter
[2*dstY
+1];
2920 int chrAlpha
= vChrFilter
[2*dstY
+1];
2922 lumMmxFilter
[3]= vLumFilter
[2*dstY
]*0x10001;
2924 chrMmxFilter
[3]= vChrFilter
[2*chrDstY
]*0x10001;
2925 if(flags
& SWS_FULL_CHR_H_INT
){
2926 yuv2rgbXinC_full(c
, //FIXME write a packed2_full function
2927 vLumFilter
+dstY
*vLumFilterSize
, lumSrcPtr
, vLumFilterSize
,
2928 vChrFilter
+dstY
*vChrFilterSize
, chrSrcPtr
, vChrFilterSize
,
2931 RENAME(yuv2packed2
)(c
, *lumSrcPtr
, *(lumSrcPtr
+1), *chrSrcPtr
, *(chrSrcPtr
+1),
2932 dest
, dstW
, lumAlpha
, chrAlpha
, dstY
);
2937 if(flags
& SWS_FULL_CHR_H_INT
){
2939 vLumFilter
+dstY
*vLumFilterSize
, lumSrcPtr
, vLumFilterSize
,
2940 vChrFilter
+dstY
*vChrFilterSize
, chrSrcPtr
, vChrFilterSize
,
2943 RENAME(yuv2packedX
)(c
,
2944 vLumFilter
+dstY
*vLumFilterSize
, lumSrcPtr
, vLumFilterSize
,
2945 vChrFilter
+dstY
*vChrFilterSize
, chrSrcPtr
, vChrFilterSize
,
2951 else // hmm looks like we can't use MMX here without overwriting this array's tail
// This branch recomputes the window pointers and repeats the output dispatch.
// NOTE(review): the called function names are not visible for most of the
// calls in this branch.
2953 int16_t **lumSrcPtr
= lumPixBuf
+ lumBufIndex
+ firstLumSrcY
- lastInLumBuf
+ vLumBufSize
;
2954 int16_t **chrSrcPtr
= chrPixBuf
+ chrBufIndex
+ firstChrSrcY
- lastInChrBuf
+ vChrBufSize
;
2955 if (dstFormat
== PIX_FMT_NV12
|| dstFormat
== PIX_FMT_NV21
){
2956 const int chrSkipMask
= (1<<c
->chrDstVSubSample
)-1;
2957 if (dstY
&chrSkipMask
) uDest
= NULL
; //FIXME split functions in lumi / chromi
2959 vLumFilter
+dstY
*vLumFilterSize
, lumSrcPtr
, vLumFilterSize
,
2960 vChrFilter
+chrDstY
*vChrFilterSize
, chrSrcPtr
, vChrFilterSize
,
2961 dest
, uDest
, dstW
, chrDstW
, dstFormat
);
2963 else if (isPlanarYUV(dstFormat
) || dstFormat
==PIX_FMT_GRAY8
) //YV12
2965 const int chrSkipMask
= (1<<c
->chrDstVSubSample
)-1;
2966 if ((dstY
&chrSkipMask
) || isGray(dstFormat
)) uDest
=vDest
= NULL
; //FIXME split functions in lumi / chromi
2968 vLumFilter
+dstY
*vLumFilterSize
, lumSrcPtr
, vLumFilterSize
,
2969 vChrFilter
+chrDstY
*vChrFilterSize
, chrSrcPtr
, vChrFilterSize
,
2970 dest
, uDest
, vDest
, dstW
, chrDstW
);
2974 assert(lumSrcPtr
+ vLumFilterSize
- 1 < lumPixBuf
+ vLumBufSize
*2);
2975 assert(chrSrcPtr
+ vChrFilterSize
- 1 < chrPixBuf
+ vChrBufSize
*2);
2976 if(flags
& SWS_FULL_CHR_H_INT
){
2978 vLumFilter
+dstY
*vLumFilterSize
, lumSrcPtr
, vLumFilterSize
,
2979 vChrFilter
+dstY
*vChrFilterSize
, chrSrcPtr
, vChrFilterSize
,
2983 vLumFilter
+dstY
*vLumFilterSize
, lumSrcPtr
, vLumFilterSize
,
2984 vChrFilter
+dstY
*vChrFilterSize
, chrSrcPtr
, vChrFilterSize
,
// Emit the SFENCE and EMMS macro instructions (either may expand to a no-op
// depending on the build configuration); "memory" is listed as clobbered so
// the compiler does not move memory accesses across them.
2992 __asm__
volatile(SFENCE:::"memory");
2993 __asm__
volatile(EMMS:::"memory");
2995 /* store changed local vars back in the context */
2997 c
->lumBufIndex
= lumBufIndex
;
2998 c
->chrBufIndex
= chrBufIndex
;
2999 c
->lastInLumBuf
= lastInLumBuf
;
3000 c
->lastInChrBuf
= lastInChrBuf
;
3002 return dstY
- lastDstY
;