2 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 * The C code (not assembly, MMX, ...) of this file can be used
21 * under the LGPL license.
33 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
40 #define PREFETCH "prefetch"
41 #define PREFETCHW "prefetchw"
43 #define PREFETCH "prefetchnta"
44 #define PREFETCHW "prefetcht0"
46 #define PREFETCH " # nop"
47 #define PREFETCHW " # nop"
51 #define SFENCE "sfence"
53 #define SFENCE " # nop"
57 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
59 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
63 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
65 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
67 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
70 #include "swscale_altivec_template.c"
73 #define YSCALEYUV2YV12X(x, offset, dest, width) \
75 "xor %%"REG_a", %%"REG_a" \n\t"\
76 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
77 "movq %%mm3, %%mm4 \n\t"\
78 "lea " offset "(%0), %%"REG_d" \n\t"\
79 "mov (%%"REG_d"), %%"REG_S" \n\t"\
80 ASMALIGN(4) /* FIXME Unroll? */\
82 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
83 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
84 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
85 "add $16, %%"REG_d" \n\t"\
86 "mov (%%"REG_d"), %%"REG_S" \n\t"\
87 "test %%"REG_S", %%"REG_S" \n\t"\
88 "pmulhw %%mm0, %%mm2 \n\t"\
89 "pmulhw %%mm0, %%mm5 \n\t"\
90 "paddw %%mm2, %%mm3 \n\t"\
91 "paddw %%mm5, %%mm4 \n\t"\
93 "psraw $3, %%mm3 \n\t"\
94 "psraw $3, %%mm4 \n\t"\
95 "packuswb %%mm4, %%mm3 \n\t"\
96 MOVNTQ(%%mm3, (%1, %%REGa))\
97 "add $8, %%"REG_a" \n\t"\
98 "cmp %2, %%"REG_a" \n\t"\
99 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
100 "movq %%mm3, %%mm4 \n\t"\
101 "lea " offset "(%0), %%"REG_d" \n\t"\
102 "mov (%%"REG_d"), %%"REG_S" \n\t"\
104 :: "r" (&c->redDither),\
105 "r" (dest), "g" (width)\
106 : "%"REG_a, "%"REG_d, "%"REG_S\
109 #define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
111 "lea " offset "(%0), %%"REG_d" \n\t"\
112 "xor %%"REG_a", %%"REG_a" \n\t"\
113 "pxor %%mm4, %%mm4 \n\t"\
114 "pxor %%mm5, %%mm5 \n\t"\
115 "pxor %%mm6, %%mm6 \n\t"\
116 "pxor %%mm7, %%mm7 \n\t"\
117 "mov (%%"REG_d"), %%"REG_S" \n\t"\
120 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
121 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
122 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
123 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
124 "movq %%mm0, %%mm3 \n\t"\
125 "punpcklwd %%mm1, %%mm0 \n\t"\
126 "punpckhwd %%mm1, %%mm3 \n\t"\
127 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
128 "pmaddwd %%mm1, %%mm0 \n\t"\
129 "pmaddwd %%mm1, %%mm3 \n\t"\
130 "paddd %%mm0, %%mm4 \n\t"\
131 "paddd %%mm3, %%mm5 \n\t"\
132 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
133 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
134 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
135 "test %%"REG_S", %%"REG_S" \n\t"\
136 "movq %%mm2, %%mm0 \n\t"\
137 "punpcklwd %%mm3, %%mm2 \n\t"\
138 "punpckhwd %%mm3, %%mm0 \n\t"\
139 "pmaddwd %%mm1, %%mm2 \n\t"\
140 "pmaddwd %%mm1, %%mm0 \n\t"\
141 "paddd %%mm2, %%mm6 \n\t"\
142 "paddd %%mm0, %%mm7 \n\t"\
144 "psrad $16, %%mm4 \n\t"\
145 "psrad $16, %%mm5 \n\t"\
146 "psrad $16, %%mm6 \n\t"\
147 "psrad $16, %%mm7 \n\t"\
148 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
149 "packssdw %%mm5, %%mm4 \n\t"\
150 "packssdw %%mm7, %%mm6 \n\t"\
151 "paddw %%mm0, %%mm4 \n\t"\
152 "paddw %%mm0, %%mm6 \n\t"\
153 "psraw $3, %%mm4 \n\t"\
154 "psraw $3, %%mm6 \n\t"\
155 "packuswb %%mm6, %%mm4 \n\t"\
156 MOVNTQ(%%mm4, (%1, %%REGa))\
157 "add $8, %%"REG_a" \n\t"\
158 "cmp %2, %%"REG_a" \n\t"\
159 "lea " offset "(%0), %%"REG_d" \n\t"\
160 "pxor %%mm4, %%mm4 \n\t"\
161 "pxor %%mm5, %%mm5 \n\t"\
162 "pxor %%mm6, %%mm6 \n\t"\
163 "pxor %%mm7, %%mm7 \n\t"\
164 "mov (%%"REG_d"), %%"REG_S" \n\t"\
166 :: "r" (&c->redDither),\
167 "r" (dest), "g" (width)\
168 : "%"REG_a, "%"REG_d, "%"REG_S\
171 #define YSCALEYUV2YV121 \
172 "mov %2, %%"REG_a" \n\t"\
173 ASMALIGN(4) /* FIXME Unroll? */\
175 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
176 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
177 "psraw $7, %%mm0 \n\t"\
178 "psraw $7, %%mm1 \n\t"\
179 "packuswb %%mm1, %%mm0 \n\t"\
180 MOVNTQ(%%mm0, (%1, %%REGa))\
181 "add $8, %%"REG_a" \n\t"\
184 #define YSCALEYUV2YV121_ACCURATE \
185 "mov %2, %%"REG_a" \n\t"\
186 "pcmpeqw %%mm7, %%mm7 \n\t"\
187 "psrlw $15, %%mm7 \n\t"\
188 "psllw $6, %%mm7 \n\t"\
189 ASMALIGN(4) /* FIXME Unroll? */\
191 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
192 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
193 "paddsw %%mm7, %%mm0 \n\t"\
194 "paddsw %%mm7, %%mm1 \n\t"\
195 "psraw $7, %%mm0 \n\t"\
196 "psraw $7, %%mm1 \n\t"\
197 "packuswb %%mm1, %%mm0 \n\t"\
198 MOVNTQ(%%mm0, (%1, %%REGa))\
199 "add $8, %%"REG_a" \n\t"\
203 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
204 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
205 "r" (dest), "m" (dstW),
206 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
207 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
209 #define YSCALEYUV2PACKEDX_UV \
211 "xor %%"REG_a", %%"REG_a" \n\t"\
215 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
216 "mov (%%"REG_d"), %%"REG_S" \n\t"\
217 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
218 "movq %%mm3, %%mm4 \n\t"\
221 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
222 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
223 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
224 "add $16, %%"REG_d" \n\t"\
225 "mov (%%"REG_d"), %%"REG_S" \n\t"\
226 "pmulhw %%mm0, %%mm2 \n\t"\
227 "pmulhw %%mm0, %%mm5 \n\t"\
228 "paddw %%mm2, %%mm3 \n\t"\
229 "paddw %%mm5, %%mm4 \n\t"\
230 "test %%"REG_S", %%"REG_S" \n\t"\
233 #define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
234 "lea "offset"(%0), %%"REG_d" \n\t"\
235 "mov (%%"REG_d"), %%"REG_S" \n\t"\
236 "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
237 "movq "#dst1", "#dst2" \n\t"\
240 "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
241 "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
242 "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
243 "add $16, %%"REG_d" \n\t"\
244 "mov (%%"REG_d"), %%"REG_S" \n\t"\
245 "pmulhw "#coeff", "#src1" \n\t"\
246 "pmulhw "#coeff", "#src2" \n\t"\
247 "paddw "#src1", "#dst1" \n\t"\
248 "paddw "#src2", "#dst2" \n\t"\
249 "test %%"REG_S", %%"REG_S" \n\t"\
/* Compose the standard packed-output vertical-scaling kernel: chroma (UV)
 * filter loop first, then the luma (YA) filter loop driven by the table at
 * LUM_MMX_FILTER_OFFSET with the register assignment given below
 * (coeff=mm0, src=mm2/mm5, dst=mm1/mm7). */
252 #define YSCALEYUV2PACKEDX \
253 YSCALEYUV2PACKEDX_UV \
254 YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
256 #define YSCALEYUV2PACKEDX_END \
257 :: "r" (&c->redDither), \
258 "m" (dummy), "m" (dummy), "m" (dummy),\
259 "r" (dest), "m" (dstW) \
260 : "%"REG_a, "%"REG_d, "%"REG_S \
263 #define YSCALEYUV2PACKEDX_ACCURATE_UV \
265 "xor %%"REG_a", %%"REG_a" \n\t"\
269 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
270 "mov (%%"REG_d"), %%"REG_S" \n\t"\
271 "pxor %%mm4, %%mm4 \n\t"\
272 "pxor %%mm5, %%mm5 \n\t"\
273 "pxor %%mm6, %%mm6 \n\t"\
274 "pxor %%mm7, %%mm7 \n\t"\
277 "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
278 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
279 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
280 "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
281 "movq %%mm0, %%mm3 \n\t"\
282 "punpcklwd %%mm1, %%mm0 \n\t"\
283 "punpckhwd %%mm1, %%mm3 \n\t"\
284 "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
285 "pmaddwd %%mm1, %%mm0 \n\t"\
286 "pmaddwd %%mm1, %%mm3 \n\t"\
287 "paddd %%mm0, %%mm4 \n\t"\
288 "paddd %%mm3, %%mm5 \n\t"\
289 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
290 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
291 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
292 "test %%"REG_S", %%"REG_S" \n\t"\
293 "movq %%mm2, %%mm0 \n\t"\
294 "punpcklwd %%mm3, %%mm2 \n\t"\
295 "punpckhwd %%mm3, %%mm0 \n\t"\
296 "pmaddwd %%mm1, %%mm2 \n\t"\
297 "pmaddwd %%mm1, %%mm0 \n\t"\
298 "paddd %%mm2, %%mm6 \n\t"\
299 "paddd %%mm0, %%mm7 \n\t"\
301 "psrad $16, %%mm4 \n\t"\
302 "psrad $16, %%mm5 \n\t"\
303 "psrad $16, %%mm6 \n\t"\
304 "psrad $16, %%mm7 \n\t"\
305 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
306 "packssdw %%mm5, %%mm4 \n\t"\
307 "packssdw %%mm7, %%mm6 \n\t"\
308 "paddw %%mm0, %%mm4 \n\t"\
309 "paddw %%mm0, %%mm6 \n\t"\
310 "movq %%mm4, "U_TEMP"(%0) \n\t"\
311 "movq %%mm6, "V_TEMP"(%0) \n\t"\
313 #define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
314 "lea "offset"(%0), %%"REG_d" \n\t"\
315 "mov (%%"REG_d"), %%"REG_S" \n\t"\
316 "pxor %%mm1, %%mm1 \n\t"\
317 "pxor %%mm5, %%mm5 \n\t"\
318 "pxor %%mm7, %%mm7 \n\t"\
319 "pxor %%mm6, %%mm6 \n\t"\
322 "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
323 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
324 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
325 "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
326 "movq %%mm0, %%mm3 \n\t"\
327 "punpcklwd %%mm4, %%mm0 \n\t"\
328 "punpckhwd %%mm4, %%mm3 \n\t"\
329 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
330 "pmaddwd %%mm4, %%mm0 \n\t"\
331 "pmaddwd %%mm4, %%mm3 \n\t"\
332 "paddd %%mm0, %%mm1 \n\t"\
333 "paddd %%mm3, %%mm5 \n\t"\
334 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
335 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
336 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
337 "test %%"REG_S", %%"REG_S" \n\t"\
338 "movq %%mm2, %%mm0 \n\t"\
339 "punpcklwd %%mm3, %%mm2 \n\t"\
340 "punpckhwd %%mm3, %%mm0 \n\t"\
341 "pmaddwd %%mm4, %%mm2 \n\t"\
342 "pmaddwd %%mm4, %%mm0 \n\t"\
343 "paddd %%mm2, %%mm7 \n\t"\
344 "paddd %%mm0, %%mm6 \n\t"\
346 "psrad $16, %%mm1 \n\t"\
347 "psrad $16, %%mm5 \n\t"\
348 "psrad $16, %%mm7 \n\t"\
349 "psrad $16, %%mm6 \n\t"\
350 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
351 "packssdw %%mm5, %%mm1 \n\t"\
352 "packssdw %%mm6, %%mm7 \n\t"\
353 "paddw %%mm0, %%mm1 \n\t"\
354 "paddw %%mm0, %%mm7 \n\t"\
355 "movq "U_TEMP"(%0), %%mm3 \n\t"\
356 "movq "V_TEMP"(%0), %%mm4 \n\t"\
/* Higher-precision variant of YSCALEYUV2PACKEDX: both passes use the
 * pmaddwd-based accumulation (see the _ACCURATE_UV/_ACCURATE_YA bodies),
 * with the luma pass reading its filter table at LUM_MMX_FILTER_OFFSET. */
358 #define YSCALEYUV2PACKEDX_ACCURATE \
359 YSCALEYUV2PACKEDX_ACCURATE_UV \
360 YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
/*
 * YSCALEYUV2RGBX: YUV -> RGB matrix stage.
 * Input (16-bit words): Y1 in mm1, Y2 in mm7, U in mm3, V in mm4; the
 * bias/coefficient table is addressed through asm operand %0
 * (U_OFFSET .. Y_COEFF).  After the final packuswb steps the packed bytes
 * are: B in mm2, R in mm5, G in mm4 — ready for a WRITE* store macro.
 */
362 #define YSCALEYUV2RGBX \
363 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
364 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
365 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
366 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
367 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
368 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
369 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
370 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
371 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
372 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
373 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
374 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
375 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
376 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
377 "paddw %%mm3, %%mm4 \n\t"\
378 "movq %%mm2, %%mm0 \n\t"\
379 "movq %%mm5, %%mm6 \n\t"\
380 "movq %%mm4, %%mm3 \n\t"\
381 "punpcklwd %%mm2, %%mm2 \n\t"\
382 "punpcklwd %%mm5, %%mm5 \n\t"\
383 "punpcklwd %%mm4, %%mm4 \n\t"\
384 "paddw %%mm1, %%mm2 \n\t"\
385 "paddw %%mm1, %%mm5 \n\t"\
386 "paddw %%mm1, %%mm4 \n\t"\
387 "punpckhwd %%mm0, %%mm0 \n\t"\
388 "punpckhwd %%mm6, %%mm6 \n\t"\
389 "punpckhwd %%mm3, %%mm3 \n\t"\
390 "paddw %%mm7, %%mm0 \n\t"\
391 "paddw %%mm7, %%mm6 \n\t"\
392 "paddw %%mm7, %%mm3 \n\t"\
393 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
394 "packuswb %%mm0, %%mm2 \n\t"\
395 "packuswb %%mm6, %%mm5 \n\t"\
396 "packuswb %%mm3, %%mm4 \n\t"\
398 #define REAL_YSCALEYUV2PACKED(index, c) \
399 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
400 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
401 "psraw $3, %%mm0 \n\t"\
402 "psraw $3, %%mm1 \n\t"\
403 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
404 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
405 "xor "#index", "#index" \n\t"\
408 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
409 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
410 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
411 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
412 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
413 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
414 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
415 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
416 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
417 "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
418 "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
419 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
420 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
421 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
422 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
423 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
424 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
425 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
426 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
427 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
428 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
429 "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
430 "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
431 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
432 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
434 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
436 #define REAL_YSCALEYUV2RGB_UV(index, c) \
437 "xor "#index", "#index" \n\t"\
440 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
441 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
442 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
443 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
444 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
445 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
446 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
447 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
448 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
449 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
450 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
451 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
452 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
453 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
454 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
455 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
456 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
457 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
458 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
459 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
/*
 * REAL_YSCALEYUV2RGB_YA: vertical luma interpolation between line buffers
 * b1 and b2 at word offset 2*index, weighted by the 16-bit coefficient at
 * LUM_MMX_FILTER_OFFSET+8 in context c: result = b2 + ((b1-b2)*coeff>>16),
 * with both terms pre-shifted >>4.  Leaves four interpolated luma words in
 * mm1 (pixels 0-3) and four in mm7 (pixels 4-7).  Clobbers mm0/mm6.
 */
461 #define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
462 "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
463 "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
464 "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
465 "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
466 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
467 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
468 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
469 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
470 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
471 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
472 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
473 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
/*
 * REAL_YSCALEYUV2RGB_COEFF: same YUV->RGB matrix stage as YSCALEYUV2RGBX,
 * except the coefficient table base is passed as macro argument c instead
 * of asm operand %0.  Entry: Y1 in mm1, Y2 in mm7, (U-128)<<8 in mm2,
 * ug in mm3, vg in mm4, (V-128)<<8 in mm5 (see REAL_YSCALEYUV2RGB_UV).
 * Exit: packed bytes B in mm2, R in mm5, G in mm4.
 */
475 #define REAL_YSCALEYUV2RGB_COEFF(c) \
476 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
477 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
478 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
479 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
480 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
481 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
482 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
483 "paddw %%mm3, %%mm4 \n\t"\
484 "movq %%mm2, %%mm0 \n\t"\
485 "movq %%mm5, %%mm6 \n\t"\
486 "movq %%mm4, %%mm3 \n\t"\
487 "punpcklwd %%mm2, %%mm2 \n\t"\
488 "punpcklwd %%mm5, %%mm5 \n\t"\
489 "punpcklwd %%mm4, %%mm4 \n\t"\
490 "paddw %%mm1, %%mm2 \n\t"\
491 "paddw %%mm1, %%mm5 \n\t"\
492 "paddw %%mm1, %%mm4 \n\t"\
493 "punpckhwd %%mm0, %%mm0 \n\t"\
494 "punpckhwd %%mm6, %%mm6 \n\t"\
495 "punpckhwd %%mm3, %%mm3 \n\t"\
496 "paddw %%mm7, %%mm0 \n\t"\
497 "paddw %%mm7, %%mm6 \n\t"\
498 "paddw %%mm7, %%mm3 \n\t"\
499 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
500 "packuswb %%mm0, %%mm2 \n\t"\
501 "packuswb %%mm6, %%mm5 \n\t"\
502 "packuswb %%mm3, %%mm4 \n\t"\
504 #define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
/* Full two-buffer vertical-interpolation YUV->RGB kernel: chroma
 * interpolation, luma interpolation from asm operands %0/%1 (buf0/buf1),
 * then the coefficient/matrix stage. */
506 #define YSCALEYUV2RGB(index, c) \
507 REAL_YSCALEYUV2RGB_UV(index, c) \
508 REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
509 REAL_YSCALEYUV2RGB_COEFF(c)
511 #define REAL_YSCALEYUV2PACKED1(index, c) \
512 "xor "#index", "#index" \n\t"\
515 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
516 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
517 "psraw $7, %%mm3 \n\t" \
518 "psraw $7, %%mm4 \n\t" \
519 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
520 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
521 "psraw $7, %%mm1 \n\t" \
522 "psraw $7, %%mm7 \n\t" \
524 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
526 #define REAL_YSCALEYUV2RGB1(index, c) \
527 "xor "#index", "#index" \n\t"\
530 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
531 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
532 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
533 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
534 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
535 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
536 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
537 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
538 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
539 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
540 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
541 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
542 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
543 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
544 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
545 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
546 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
547 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
548 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
549 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
550 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
551 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
552 "paddw %%mm3, %%mm4 \n\t"\
553 "movq %%mm2, %%mm0 \n\t"\
554 "movq %%mm5, %%mm6 \n\t"\
555 "movq %%mm4, %%mm3 \n\t"\
556 "punpcklwd %%mm2, %%mm2 \n\t"\
557 "punpcklwd %%mm5, %%mm5 \n\t"\
558 "punpcklwd %%mm4, %%mm4 \n\t"\
559 "paddw %%mm1, %%mm2 \n\t"\
560 "paddw %%mm1, %%mm5 \n\t"\
561 "paddw %%mm1, %%mm4 \n\t"\
562 "punpckhwd %%mm0, %%mm0 \n\t"\
563 "punpckhwd %%mm6, %%mm6 \n\t"\
564 "punpckhwd %%mm3, %%mm3 \n\t"\
565 "paddw %%mm7, %%mm0 \n\t"\
566 "paddw %%mm7, %%mm6 \n\t"\
567 "paddw %%mm7, %%mm3 \n\t"\
568 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
569 "packuswb %%mm0, %%mm2 \n\t"\
570 "packuswb %%mm6, %%mm5 \n\t"\
571 "packuswb %%mm3, %%mm4 \n\t"\
573 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
575 #define REAL_YSCALEYUV2PACKED1b(index, c) \
576 "xor "#index", "#index" \n\t"\
579 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
580 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
581 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
582 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
583 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
584 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
585 "psrlw $8, %%mm3 \n\t" \
586 "psrlw $8, %%mm4 \n\t" \
587 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
588 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
589 "psraw $7, %%mm1 \n\t" \
590 "psraw $7, %%mm7 \n\t"
591 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
593 // do vertical chrominance interpolation
594 #define REAL_YSCALEYUV2RGB1b(index, c) \
595 "xor "#index", "#index" \n\t"\
598 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
599 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
600 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
601 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
602 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
603 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
604 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
605 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
606 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
607 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
608 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
609 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
610 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
611 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
612 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
613 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
614 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
615 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
616 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
617 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
618 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
619 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
620 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
621 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
622 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
623 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
624 "paddw %%mm3, %%mm4 \n\t"\
625 "movq %%mm2, %%mm0 \n\t"\
626 "movq %%mm5, %%mm6 \n\t"\
627 "movq %%mm4, %%mm3 \n\t"\
628 "punpcklwd %%mm2, %%mm2 \n\t"\
629 "punpcklwd %%mm5, %%mm5 \n\t"\
630 "punpcklwd %%mm4, %%mm4 \n\t"\
631 "paddw %%mm1, %%mm2 \n\t"\
632 "paddw %%mm1, %%mm5 \n\t"\
633 "paddw %%mm1, %%mm4 \n\t"\
634 "punpckhwd %%mm0, %%mm0 \n\t"\
635 "punpckhwd %%mm6, %%mm6 \n\t"\
636 "punpckhwd %%mm3, %%mm3 \n\t"\
637 "paddw %%mm7, %%mm0 \n\t"\
638 "paddw %%mm7, %%mm6 \n\t"\
639 "paddw %%mm7, %%mm3 \n\t"\
640 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
641 "packuswb %%mm0, %%mm2 \n\t"\
642 "packuswb %%mm6, %%mm5 \n\t"\
643 "packuswb %%mm3, %%mm4 \n\t"\
645 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
/*
 * REAL_YSCALEYUV2RGB1_ALPHA: load 8 alpha words from the alpha buffer
 * (asm operand %1) at word offset 2*index, shift >>7 back to 8-bit range
 * and pack them (unsigned saturation) into the 8 bytes of mm7.
 * Clobbers mm1.
 */
647 #define REAL_YSCALEYUV2RGB1_ALPHA(index) \
648 "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
649 "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
650 "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
651 "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
652 "packuswb %%mm1, %%mm7 \n\t"
653 #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
655 #define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
656 "movq "#b", "#q2" \n\t" /* B */\
657 "movq "#r", "#t" \n\t" /* R */\
658 "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
659 "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
660 "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
661 "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
662 "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
663 "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
664 "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
665 "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
666 "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
667 "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
669 MOVNTQ( q0, (dst, index, 4))\
670 MOVNTQ( b, 8(dst, index, 4))\
671 MOVNTQ( q2, 16(dst, index, 4))\
672 MOVNTQ( q3, 24(dst, index, 4))\
674 "add $8, "#index" \n\t"\
675 "cmp "#dstw", "#index" \n\t"\
677 #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
679 #define REAL_WRITERGB16(dst, dstw, index) \
680 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
681 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
682 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
683 "psrlq $3, %%mm2 \n\t"\
685 "movq %%mm2, %%mm1 \n\t"\
686 "movq %%mm4, %%mm3 \n\t"\
688 "punpcklbw %%mm7, %%mm3 \n\t"\
689 "punpcklbw %%mm5, %%mm2 \n\t"\
690 "punpckhbw %%mm7, %%mm4 \n\t"\
691 "punpckhbw %%mm5, %%mm1 \n\t"\
693 "psllq $3, %%mm3 \n\t"\
694 "psllq $3, %%mm4 \n\t"\
696 "por %%mm3, %%mm2 \n\t"\
697 "por %%mm4, %%mm1 \n\t"\
699 MOVNTQ(%%mm2, (dst, index, 2))\
700 MOVNTQ(%%mm1, 8(dst, index, 2))\
702 "add $8, "#index" \n\t"\
703 "cmp "#dstw", "#index" \n\t"\
705 #define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
707 #define REAL_WRITERGB15(dst, dstw, index) \
708 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
709 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
710 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
711 "psrlq $3, %%mm2 \n\t"\
712 "psrlq $1, %%mm5 \n\t"\
714 "movq %%mm2, %%mm1 \n\t"\
715 "movq %%mm4, %%mm3 \n\t"\
717 "punpcklbw %%mm7, %%mm3 \n\t"\
718 "punpcklbw %%mm5, %%mm2 \n\t"\
719 "punpckhbw %%mm7, %%mm4 \n\t"\
720 "punpckhbw %%mm5, %%mm1 \n\t"\
722 "psllq $2, %%mm3 \n\t"\
723 "psllq $2, %%mm4 \n\t"\
725 "por %%mm3, %%mm2 \n\t"\
726 "por %%mm4, %%mm1 \n\t"\
728 MOVNTQ(%%mm2, (dst, index, 2))\
729 MOVNTQ(%%mm1, 8(dst, index, 2))\
731 "add $8, "#index" \n\t"\
732 "cmp "#dstw", "#index" \n\t"\
734 #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
736 #define WRITEBGR24OLD(dst, dstw, index) \
737 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
738 "movq %%mm2, %%mm1 \n\t" /* B */\
739 "movq %%mm5, %%mm6 \n\t" /* R */\
740 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
741 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
742 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
743 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
744 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
745 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
746 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
747 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
748 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
749 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
751 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
752 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
753 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
754 "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
755 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
756 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
757 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
758 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
760 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
761 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
762 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
763 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
764 "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
765 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
766 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
767 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
768 "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
769 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
770 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
771 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
772 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
774 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
775 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
776 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
777 "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
778 "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
779 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
780 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
781 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
783 MOVNTQ(%%mm0, (dst))\
784 MOVNTQ(%%mm2, 8(dst))\
785 MOVNTQ(%%mm3, 16(dst))\
786 "add $24, "#dst" \n\t"\
788 "add $8, "#index" \n\t"\
789 "cmp "#dstw", "#index" \n\t"\
792 #define WRITEBGR24MMX(dst, dstw, index) \
793 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
794 "movq %%mm2, %%mm1 \n\t" /* B */\
795 "movq %%mm5, %%mm6 \n\t" /* R */\
796 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
797 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
798 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
799 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
800 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
801 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
802 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
803 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
804 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
805 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
807 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
808 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
809 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
810 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
812 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
813 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
814 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
815 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
817 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
818 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
819 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
820 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
822 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
823 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
824 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
825 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
826 MOVNTQ(%%mm0, (dst))\
828 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
829 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
830 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
831 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
832 MOVNTQ(%%mm6, 8(dst))\
834 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
835 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
836 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
837 MOVNTQ(%%mm5, 16(dst))\
839 "add $24, "#dst" \n\t"\
841 "add $8, "#index" \n\t"\
842 "cmp "#dstw", "#index" \n\t"\
845 #define WRITEBGR24MMX2(dst, dstw, index) \
846 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
847 "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
848 "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
849 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
850 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
851 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
853 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
854 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
855 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
857 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
858 "por %%mm1, %%mm6 \n\t"\
859 "por %%mm3, %%mm6 \n\t"\
860 MOVNTQ(%%mm6, (dst))\
862 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
863 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
864 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
865 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
867 "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
868 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
869 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
871 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
872 "por %%mm3, %%mm6 \n\t"\
873 MOVNTQ(%%mm6, 8(dst))\
875 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
876 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
877 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
879 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
880 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
881 "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
883 "por %%mm1, %%mm3 \n\t"\
884 "por %%mm3, %%mm6 \n\t"\
885 MOVNTQ(%%mm6, 16(dst))\
887 "add $24, "#dst" \n\t"\
889 "add $8, "#index" \n\t"\
890 "cmp "#dstw", "#index" \n\t"\
895 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
898 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
901 #define REAL_WRITEYUY2(dst, dstw, index) \
902 "packuswb %%mm3, %%mm3 \n\t"\
903 "packuswb %%mm4, %%mm4 \n\t"\
904 "packuswb %%mm7, %%mm1 \n\t"\
905 "punpcklbw %%mm4, %%mm3 \n\t"\
906 "movq %%mm1, %%mm7 \n\t"\
907 "punpcklbw %%mm3, %%mm1 \n\t"\
908 "punpckhbw %%mm3, %%mm7 \n\t"\
910 MOVNTQ(%%mm1, (dst, index, 2))\
911 MOVNTQ(%%mm7, 8(dst, index, 2))\
913 "add $8, "#index" \n\t"\
914 "cmp "#dstw", "#index" \n\t"\
/* Indirection so that macro arguments are fully expanded before being
 * passed to REAL_WRITEYUY2 (same idiom as MOVNTQ above). */
#define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)
919 static inline void RENAME(yuv2yuvX
)(SwsContext
*c
, int16_t *lumFilter
, int16_t **lumSrc
, int lumFilterSize
,
920 int16_t *chrFilter
, int16_t **chrSrc
, int chrFilterSize
, int16_t **alpSrc
,
921 uint8_t *dest
, uint8_t *uDest
, uint8_t *vDest
, uint8_t *aDest
, long dstW
, long chrDstW
)
924 if(!(c
->flags
& SWS_BITEXACT
)){
925 if (c
->flags
& SWS_ACCURATE_RND
){
927 YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET
, uDest
, chrDstW
)
928 YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF
), CHR_MMX_FILTER_OFFSET
, vDest
, chrDstW
)
930 if (CONFIG_SWSCALE_ALPHA
&& aDest
){
931 YSCALEYUV2YV12X_ACCURATE( "0", ALP_MMX_FILTER_OFFSET
, aDest
, dstW
)
934 YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET
, dest
, dstW
)
937 YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET
, uDest
, chrDstW
)
938 YSCALEYUV2YV12X(AV_STRINGIFY(VOF
), CHR_MMX_FILTER_OFFSET
, vDest
, chrDstW
)
940 if (CONFIG_SWSCALE_ALPHA
&& aDest
){
941 YSCALEYUV2YV12X( "0", ALP_MMX_FILTER_OFFSET
, aDest
, dstW
)
944 YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET
, dest
, dstW
)
950 yuv2yuvX_altivec_real(lumFilter
, lumSrc
, lumFilterSize
,
951 chrFilter
, chrSrc
, chrFilterSize
,
952 dest
, uDest
, vDest
, dstW
, chrDstW
);
954 yuv2yuvXinC(lumFilter
, lumSrc
, lumFilterSize
,
955 chrFilter
, chrSrc
, chrFilterSize
,
956 alpSrc
, dest
, uDest
, vDest
, aDest
, dstW
, chrDstW
);
957 #endif //!HAVE_ALTIVEC
960 static inline void RENAME(yuv2nv12X
)(SwsContext
*c
, int16_t *lumFilter
, int16_t **lumSrc
, int lumFilterSize
,
961 int16_t *chrFilter
, int16_t **chrSrc
, int chrFilterSize
,
962 uint8_t *dest
, uint8_t *uDest
, int dstW
, int chrDstW
, int dstFormat
)
964 yuv2nv12XinC(lumFilter
, lumSrc
, lumFilterSize
,
965 chrFilter
, chrSrc
, chrFilterSize
,
966 dest
, uDest
, dstW
, chrDstW
, dstFormat
);
969 static inline void RENAME(yuv2yuv1
)(SwsContext
*c
, int16_t *lumSrc
, int16_t *chrSrc
, int16_t *alpSrc
,
970 uint8_t *dest
, uint8_t *uDest
, uint8_t *vDest
, uint8_t *aDest
, long dstW
, long chrDstW
)
974 if(!(c
->flags
& SWS_BITEXACT
)){
976 uint8_t *src
[4]= {alpSrc
+ dstW
, lumSrc
+ dstW
, chrSrc
+ chrDstW
, chrSrc
+ VOFW
+ chrDstW
};
977 uint8_t *dst
[4]= {aDest
, dest
, uDest
, vDest
};
978 x86_reg counter
[4]= {dstW
, dstW
, chrDstW
, chrDstW
};
980 if (c
->flags
& SWS_ACCURATE_RND
){
984 YSCALEYUV2YV121_ACCURATE
985 :: "r" (src
[p
]), "r" (dst
[p
] + counter
[p
]),
996 :: "r" (src
[p
]), "r" (dst
[p
] + counter
[p
]),
1006 for (i
=0; i
<dstW
; i
++)
1008 int val
= (lumSrc
[i
]+64)>>7;
1019 for (i
=0; i
<chrDstW
; i
++)
1021 int u
=(chrSrc
[i
]+64)>>7;
1022 int v
=(chrSrc
[i
+ VOFW
]+64)>>7;
1026 else if (u
>255) u
=255;
1028 else if (v
>255) v
=255;
1035 if (CONFIG_SWSCALE_ALPHA
&& aDest
)
1036 for (i
=0; i
<dstW
; i
++){
1037 int val
= (alpSrc
[i
]+64)>>7;
1038 aDest
[i
]= av_clip_uint8(val
);
1044 * vertical scale YV12 to RGB
1046 static inline void RENAME(yuv2packedX
)(SwsContext
*c
, int16_t *lumFilter
, int16_t **lumSrc
, int lumFilterSize
,
1047 int16_t *chrFilter
, int16_t **chrSrc
, int chrFilterSize
,
1048 int16_t **alpSrc
, uint8_t *dest
, long dstW
, long dstY
)
1052 if(!(c
->flags
& SWS_BITEXACT
)){
1053 if (c
->flags
& SWS_ACCURATE_RND
){
1054 switch(c
->dstFormat
){
1056 if (CONFIG_SWSCALE_ALPHA
&& c
->alpPixBuf
){
1057 YSCALEYUV2PACKEDX_ACCURATE
1059 "movq %%mm2, "U_TEMP
"(%0) \n\t"
1060 "movq %%mm4, "V_TEMP
"(%0) \n\t"
1061 "movq %%mm5, "Y_TEMP
"(%0) \n\t"
1062 YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET
)
1063 "movq "Y_TEMP
"(%0), %%mm5 \n\t"
1064 "psraw $3, %%mm1 \n\t"
1065 "psraw $3, %%mm7 \n\t"
1066 "packuswb %%mm7, %%mm1 \n\t"
1067 WRITEBGR32(%4, %5, %%REGa
, %%mm3
, %%mm4
, %%mm5
, %%mm1
, %%mm0
, %%mm7
, %%mm2
, %%mm6
)
1069 YSCALEYUV2PACKEDX_END
1071 YSCALEYUV2PACKEDX_ACCURATE
1073 "pcmpeqd %%mm7, %%mm7 \n\t"
1074 WRITEBGR32(%4, %5, %%REGa
, %%mm2
, %%mm4
, %%mm5
, %%mm7
, %%mm0
, %%mm1
, %%mm3
, %%mm6
)
1076 YSCALEYUV2PACKEDX_END
1080 YSCALEYUV2PACKEDX_ACCURATE
1082 "pxor %%mm7, %%mm7 \n\t"
1083 "lea (%%"REG_a
", %%"REG_a
", 2), %%"REG_c
"\n\t" //FIXME optimize
1084 "add %4, %%"REG_c
" \n\t"
1085 WRITEBGR24(%%REGc
, %5, %%REGa
)
1088 :: "r" (&c
->redDither
),
1089 "m" (dummy
), "m" (dummy
), "m" (dummy
),
1090 "r" (dest
), "m" (dstW
)
1091 : "%"REG_a
, "%"REG_c
, "%"REG_d
, "%"REG_S
1094 case PIX_FMT_RGB555
:
1095 YSCALEYUV2PACKEDX_ACCURATE
1097 "pxor %%mm7, %%mm7 \n\t"
1098 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1100 "paddusb "BLUE_DITHER
"(%0), %%mm2\n\t"
1101 "paddusb "GREEN_DITHER
"(%0), %%mm4\n\t"
1102 "paddusb "RED_DITHER
"(%0), %%mm5\n\t"
1105 WRITERGB15(%4, %5, %%REGa
)
1106 YSCALEYUV2PACKEDX_END
1108 case PIX_FMT_RGB565
:
1109 YSCALEYUV2PACKEDX_ACCURATE
1111 "pxor %%mm7, %%mm7 \n\t"
1112 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1114 "paddusb "BLUE_DITHER
"(%0), %%mm2\n\t"
1115 "paddusb "GREEN_DITHER
"(%0), %%mm4\n\t"
1116 "paddusb "RED_DITHER
"(%0), %%mm5\n\t"
1119 WRITERGB16(%4, %5, %%REGa
)
1120 YSCALEYUV2PACKEDX_END
1122 case PIX_FMT_YUYV422
:
1123 YSCALEYUV2PACKEDX_ACCURATE
1124 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1126 "psraw $3, %%mm3 \n\t"
1127 "psraw $3, %%mm4 \n\t"
1128 "psraw $3, %%mm1 \n\t"
1129 "psraw $3, %%mm7 \n\t"
1130 WRITEYUY2(%4, %5, %%REGa
)
1131 YSCALEYUV2PACKEDX_END
1135 switch(c
->dstFormat
)
1138 if (CONFIG_SWSCALE_ALPHA
&& c
->alpPixBuf
){
1141 YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET
, %%mm0
, %%mm3
, %%mm6
, %%mm1
, %%mm7
)
1142 "psraw $3, %%mm1 \n\t"
1143 "psraw $3, %%mm7 \n\t"
1144 "packuswb %%mm7, %%mm1 \n\t"
1145 WRITEBGR32(%4, %5, %%REGa
, %%mm2
, %%mm4
, %%mm5
, %%mm1
, %%mm0
, %%mm7
, %%mm3
, %%mm6
)
1146 YSCALEYUV2PACKEDX_END
1150 "pcmpeqd %%mm7, %%mm7 \n\t"
1151 WRITEBGR32(%4, %5, %%REGa
, %%mm2
, %%mm4
, %%mm5
, %%mm7
, %%mm0
, %%mm1
, %%mm3
, %%mm6
)
1152 YSCALEYUV2PACKEDX_END
1158 "pxor %%mm7, %%mm7 \n\t"
1159 "lea (%%"REG_a
", %%"REG_a
", 2), %%"REG_c
" \n\t" //FIXME optimize
1160 "add %4, %%"REG_c
" \n\t"
1161 WRITEBGR24(%%REGc
, %5, %%REGa
)
1163 :: "r" (&c
->redDither
),
1164 "m" (dummy
), "m" (dummy
), "m" (dummy
),
1165 "r" (dest
), "m" (dstW
)
1166 : "%"REG_a
, "%"REG_c
, "%"REG_d
, "%"REG_S
1169 case PIX_FMT_RGB555
:
1172 "pxor %%mm7, %%mm7 \n\t"
1173 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1175 "paddusb "BLUE_DITHER
"(%0), %%mm2 \n\t"
1176 "paddusb "GREEN_DITHER
"(%0), %%mm4 \n\t"
1177 "paddusb "RED_DITHER
"(%0), %%mm5 \n\t"
1180 WRITERGB15(%4, %5, %%REGa
)
1181 YSCALEYUV2PACKEDX_END
1183 case PIX_FMT_RGB565
:
1186 "pxor %%mm7, %%mm7 \n\t"
1187 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1189 "paddusb "BLUE_DITHER
"(%0), %%mm2 \n\t"
1190 "paddusb "GREEN_DITHER
"(%0), %%mm4 \n\t"
1191 "paddusb "RED_DITHER
"(%0), %%mm5 \n\t"
1194 WRITERGB16(%4, %5, %%REGa
)
1195 YSCALEYUV2PACKEDX_END
1197 case PIX_FMT_YUYV422
:
1199 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1201 "psraw $3, %%mm3 \n\t"
1202 "psraw $3, %%mm4 \n\t"
1203 "psraw $3, %%mm1 \n\t"
1204 "psraw $3, %%mm7 \n\t"
1205 WRITEYUY2(%4, %5, %%REGa
)
1206 YSCALEYUV2PACKEDX_END
1211 #endif /* HAVE_MMX */
1213 /* The following list of supported dstFormat values should
1214 match what's found in the body of ff_yuv2packedX_altivec() */
1215 if (!(c
->flags
& SWS_BITEXACT
) && !c
->alpPixBuf
&&
1216 (c
->dstFormat
==PIX_FMT_ABGR
|| c
->dstFormat
==PIX_FMT_BGRA
||
1217 c
->dstFormat
==PIX_FMT_BGR24
|| c
->dstFormat
==PIX_FMT_RGB24
||
1218 c
->dstFormat
==PIX_FMT_RGBA
|| c
->dstFormat
==PIX_FMT_ARGB
))
1219 ff_yuv2packedX_altivec(c
, lumFilter
, lumSrc
, lumFilterSize
,
1220 chrFilter
, chrSrc
, chrFilterSize
,
1224 yuv2packedXinC(c
, lumFilter
, lumSrc
, lumFilterSize
,
1225 chrFilter
, chrSrc
, chrFilterSize
,
1226 alpSrc
, dest
, dstW
, dstY
);
1230 * vertical bilinear scale YV12 to RGB
1232 static inline void RENAME(yuv2packed2
)(SwsContext
*c
, uint16_t *buf0
, uint16_t *buf1
, uint16_t *uvbuf0
, uint16_t *uvbuf1
,
1233 uint16_t *abuf0
, uint16_t *abuf1
, uint8_t *dest
, int dstW
, int yalpha
, int uvalpha
, int y
)
1235 int yalpha1
=4095- yalpha
;
1236 int uvalpha1
=4095-uvalpha
;
1240 if(!(c
->flags
& SWS_BITEXACT
)){
1241 switch(c
->dstFormat
)
1243 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1245 if (CONFIG_SWSCALE_ALPHA
&& c
->alpPixBuf
){
1248 YSCALEYUV2RGB(%%REGBP
, %5)
1249 YSCALEYUV2RGB_YA(%%REGBP
, %5, %6, %7)
1250 "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1251 "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1252 "packuswb %%mm7, %%mm1 \n\t"
1253 WRITEBGR32(%4, 8280(%5), %%REGBP
, %%mm2
, %%mm4
, %%mm5
, %%mm1
, %%mm0
, %%mm7
, %%mm3
, %%mm6
)
1255 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "r" (dest
),
1257 ,"r" (abuf0
), "r" (abuf1
)
1261 *(uint16_t **)(&c
->u_temp
)=abuf0
;
1262 *(uint16_t **)(&c
->v_temp
)=abuf1
;
1264 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1265 "mov %4, %%"REG_b
" \n\t"
1266 "push %%"REG_BP
" \n\t"
1267 YSCALEYUV2RGB(%%REGBP
, %5)
1270 "mov "U_TEMP
"(%5), %0 \n\t"
1271 "mov "V_TEMP
"(%5), %1 \n\t"
1272 YSCALEYUV2RGB_YA(%%REGBP
, %5, %0, %1)
1273 "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1274 "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1275 "packuswb %%mm7, %%mm1 \n\t"
1278 WRITEBGR32(%%REGb
, 8280(%5), %%REGBP
, %%mm2
, %%mm4
, %%mm5
, %%mm1
, %%mm0
, %%mm7
, %%mm3
, %%mm6
)
1279 "pop %%"REG_BP
" \n\t"
1280 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1282 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1288 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1289 "mov %4, %%"REG_b
" \n\t"
1290 "push %%"REG_BP
" \n\t"
1291 YSCALEYUV2RGB(%%REGBP
, %5)
1292 "pcmpeqd %%mm7, %%mm7 \n\t"
1293 WRITEBGR32(%%REGb
, 8280(%5), %%REGBP
, %%mm2
, %%mm4
, %%mm5
, %%mm7
, %%mm0
, %%mm1
, %%mm3
, %%mm6
)
1294 "pop %%"REG_BP
" \n\t"
1295 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1297 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1304 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1305 "mov %4, %%"REG_b
" \n\t"
1306 "push %%"REG_BP
" \n\t"
1307 YSCALEYUV2RGB(%%REGBP
, %5)
1308 "pxor %%mm7, %%mm7 \n\t"
1309 WRITEBGR24(%%REGb
, 8280(%5), %%REGBP
)
1310 "pop %%"REG_BP
" \n\t"
1311 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1312 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1316 case PIX_FMT_RGB555
:
1318 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1319 "mov %4, %%"REG_b
" \n\t"
1320 "push %%"REG_BP
" \n\t"
1321 YSCALEYUV2RGB(%%REGBP
, %5)
1322 "pxor %%mm7, %%mm7 \n\t"
1323 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1325 "paddusb "BLUE_DITHER
"(%5), %%mm2 \n\t"
1326 "paddusb "GREEN_DITHER
"(%5), %%mm4 \n\t"
1327 "paddusb "RED_DITHER
"(%5), %%mm5 \n\t"
1330 WRITERGB15(%%REGb
, 8280(%5), %%REGBP
)
1331 "pop %%"REG_BP
" \n\t"
1332 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1334 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1338 case PIX_FMT_RGB565
:
1340 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1341 "mov %4, %%"REG_b
" \n\t"
1342 "push %%"REG_BP
" \n\t"
1343 YSCALEYUV2RGB(%%REGBP
, %5)
1344 "pxor %%mm7, %%mm7 \n\t"
1345 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1347 "paddusb "BLUE_DITHER
"(%5), %%mm2 \n\t"
1348 "paddusb "GREEN_DITHER
"(%5), %%mm4 \n\t"
1349 "paddusb "RED_DITHER
"(%5), %%mm5 \n\t"
1352 WRITERGB16(%%REGb
, 8280(%5), %%REGBP
)
1353 "pop %%"REG_BP
" \n\t"
1354 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1355 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1359 case PIX_FMT_YUYV422
:
1361 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1362 "mov %4, %%"REG_b
" \n\t"
1363 "push %%"REG_BP
" \n\t"
1364 YSCALEYUV2PACKED(%%REGBP
, %5)
1365 WRITEYUY2(%%REGb
, 8280(%5), %%REGBP
)
1366 "pop %%"REG_BP
" \n\t"
1367 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1368 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1376 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C
, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C
, YSCALE_YUV_2_MONO2_C
)
1380 * YV12 to RGB without scaling or interpolating
1382 static inline void RENAME(yuv2packed1
)(SwsContext
*c
, uint16_t *buf0
, uint16_t *uvbuf0
, uint16_t *uvbuf1
,
1383 uint16_t *abuf0
, uint8_t *dest
, int dstW
, int uvalpha
, int dstFormat
, int flags
, int y
)
1385 const int yalpha1
=0;
1388 uint16_t *buf1
= buf0
; //FIXME needed for RGB1/BGR1
1389 const int yalpha
= 4096; //FIXME ...
1391 if (flags
&SWS_FULL_CHR_H_INT
)
1393 RENAME(yuv2packed2
)(c
, buf0
, buf0
, uvbuf0
, uvbuf1
, abuf0
, abuf0
, dest
, dstW
, 0, uvalpha
, y
);
1398 if(!(flags
& SWS_BITEXACT
)){
1399 if (uvalpha
< 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1404 if (CONFIG_SWSCALE_ALPHA
&& c
->alpPixBuf
){
1406 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1407 "mov %4, %%"REG_b
" \n\t"
1408 "push %%"REG_BP
" \n\t"
1409 YSCALEYUV2RGB1(%%REGBP
, %5)
1410 YSCALEYUV2RGB1_ALPHA(%%REGBP
)
1411 WRITEBGR32(%%REGb
, 8280(%5), %%REGBP
, %%mm2
, %%mm4
, %%mm5
, %%mm7
, %%mm0
, %%mm1
, %%mm3
, %%mm6
)
1412 "pop %%"REG_BP
" \n\t"
1413 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1415 :: "c" (buf0
), "d" (abuf0
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1420 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1421 "mov %4, %%"REG_b
" \n\t"
1422 "push %%"REG_BP
" \n\t"
1423 YSCALEYUV2RGB1(%%REGBP
, %5)
1424 "pcmpeqd %%mm7, %%mm7 \n\t"
1425 WRITEBGR32(%%REGb
, 8280(%5), %%REGBP
, %%mm2
, %%mm4
, %%mm5
, %%mm7
, %%mm0
, %%mm1
, %%mm3
, %%mm6
)
1426 "pop %%"REG_BP
" \n\t"
1427 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1429 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1436 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1437 "mov %4, %%"REG_b
" \n\t"
1438 "push %%"REG_BP
" \n\t"
1439 YSCALEYUV2RGB1(%%REGBP
, %5)
1440 "pxor %%mm7, %%mm7 \n\t"
1441 WRITEBGR24(%%REGb
, 8280(%5), %%REGBP
)
1442 "pop %%"REG_BP
" \n\t"
1443 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1445 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1449 case PIX_FMT_RGB555
:
1451 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1452 "mov %4, %%"REG_b
" \n\t"
1453 "push %%"REG_BP
" \n\t"
1454 YSCALEYUV2RGB1(%%REGBP
, %5)
1455 "pxor %%mm7, %%mm7 \n\t"
1456 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1458 "paddusb "BLUE_DITHER
"(%5), %%mm2 \n\t"
1459 "paddusb "GREEN_DITHER
"(%5), %%mm4 \n\t"
1460 "paddusb "RED_DITHER
"(%5), %%mm5 \n\t"
1462 WRITERGB15(%%REGb
, 8280(%5), %%REGBP
)
1463 "pop %%"REG_BP
" \n\t"
1464 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1466 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1470 case PIX_FMT_RGB565
:
1472 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1473 "mov %4, %%"REG_b
" \n\t"
1474 "push %%"REG_BP
" \n\t"
1475 YSCALEYUV2RGB1(%%REGBP
, %5)
1476 "pxor %%mm7, %%mm7 \n\t"
1477 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1479 "paddusb "BLUE_DITHER
"(%5), %%mm2 \n\t"
1480 "paddusb "GREEN_DITHER
"(%5), %%mm4 \n\t"
1481 "paddusb "RED_DITHER
"(%5), %%mm5 \n\t"
1484 WRITERGB16(%%REGb
, 8280(%5), %%REGBP
)
1485 "pop %%"REG_BP
" \n\t"
1486 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1488 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1492 case PIX_FMT_YUYV422
:
1494 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1495 "mov %4, %%"REG_b
" \n\t"
1496 "push %%"REG_BP
" \n\t"
1497 YSCALEYUV2PACKED1(%%REGBP
, %5)
1498 WRITEYUY2(%%REGb
, 8280(%5), %%REGBP
)
1499 "pop %%"REG_BP
" \n\t"
1500 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1502 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1513 if (CONFIG_SWSCALE_ALPHA
&& c
->alpPixBuf
){
1515 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1516 "mov %4, %%"REG_b
" \n\t"
1517 "push %%"REG_BP
" \n\t"
1518 YSCALEYUV2RGB1b(%%REGBP
, %5)
1519 YSCALEYUV2RGB1_ALPHA(%%REGBP
)
1520 WRITEBGR32(%%REGb
, 8280(%5), %%REGBP
, %%mm2
, %%mm4
, %%mm5
, %%mm7
, %%mm0
, %%mm1
, %%mm3
, %%mm6
)
1521 "pop %%"REG_BP
" \n\t"
1522 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1524 :: "c" (buf0
), "d" (abuf0
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1529 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1530 "mov %4, %%"REG_b
" \n\t"
1531 "push %%"REG_BP
" \n\t"
1532 YSCALEYUV2RGB1b(%%REGBP
, %5)
1533 "pcmpeqd %%mm7, %%mm7 \n\t"
1534 WRITEBGR32(%%REGb
, 8280(%5), %%REGBP
, %%mm2
, %%mm4
, %%mm5
, %%mm7
, %%mm0
, %%mm1
, %%mm3
, %%mm6
)
1535 "pop %%"REG_BP
" \n\t"
1536 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1538 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1545 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1546 "mov %4, %%"REG_b
" \n\t"
1547 "push %%"REG_BP
" \n\t"
1548 YSCALEYUV2RGB1b(%%REGBP
, %5)
1549 "pxor %%mm7, %%mm7 \n\t"
1550 WRITEBGR24(%%REGb
, 8280(%5), %%REGBP
)
1551 "pop %%"REG_BP
" \n\t"
1552 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1554 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1558 case PIX_FMT_RGB555
:
1560 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1561 "mov %4, %%"REG_b
" \n\t"
1562 "push %%"REG_BP
" \n\t"
1563 YSCALEYUV2RGB1b(%%REGBP
, %5)
1564 "pxor %%mm7, %%mm7 \n\t"
1565 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1567 "paddusb "BLUE_DITHER
"(%5), %%mm2 \n\t"
1568 "paddusb "GREEN_DITHER
"(%5), %%mm4 \n\t"
1569 "paddusb "RED_DITHER
"(%5), %%mm5 \n\t"
1571 WRITERGB15(%%REGb
, 8280(%5), %%REGBP
)
1572 "pop %%"REG_BP
" \n\t"
1573 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1575 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1579 case PIX_FMT_RGB565
:
1581 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1582 "mov %4, %%"REG_b
" \n\t"
1583 "push %%"REG_BP
" \n\t"
1584 YSCALEYUV2RGB1b(%%REGBP
, %5)
1585 "pxor %%mm7, %%mm7 \n\t"
1586 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1588 "paddusb "BLUE_DITHER
"(%5), %%mm2 \n\t"
1589 "paddusb "GREEN_DITHER
"(%5), %%mm4 \n\t"
1590 "paddusb "RED_DITHER
"(%5), %%mm5 \n\t"
1593 WRITERGB16(%%REGb
, 8280(%5), %%REGBP
)
1594 "pop %%"REG_BP
" \n\t"
1595 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1597 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1601 case PIX_FMT_YUYV422
:
1603 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1604 "mov %4, %%"REG_b
" \n\t"
1605 "push %%"REG_BP
" \n\t"
1606 YSCALEYUV2PACKED1b(%%REGBP
, %5)
1607 WRITEYUY2(%%REGb
, 8280(%5), %%REGBP
)
1608 "pop %%"REG_BP
" \n\t"
1609 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1611 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1618 #endif /* HAVE_MMX */
1621 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C
, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C
, YSCALE_YUV_2_MONO2_C
)
1623 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C
, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C
, YSCALE_YUV_2_MONO2_C
)
1627 //FIXME yuy2* can read up to 7 samples too much
1629 static inline void RENAME(yuy2ToY
)(uint8_t *dst
, uint8_t *src
, long width
, uint32_t *unused
)
1633 "movq "MANGLE(bm01010101
)", %%mm2 \n\t"
1634 "mov %0, %%"REG_a
" \n\t"
1636 "movq (%1, %%"REG_a
",2), %%mm0 \n\t"
1637 "movq 8(%1, %%"REG_a
",2), %%mm1 \n\t"
1638 "pand %%mm2, %%mm0 \n\t"
1639 "pand %%mm2, %%mm1 \n\t"
1640 "packuswb %%mm1, %%mm0 \n\t"
1641 "movq %%mm0, (%2, %%"REG_a
") \n\t"
1642 "add $8, %%"REG_a
" \n\t"
1644 : : "g" ((x86_reg
)-width
), "r" (src
+width
*2), "r" (dst
+width
)
1649 for (i
=0; i
<width
; i
++)
1654 static inline void RENAME(yuy2ToUV
)(uint8_t *dstU
, uint8_t *dstV
, uint8_t *src1
, uint8_t *src2
, long width
, uint32_t *unused
)
1658 "movq "MANGLE(bm01010101
)", %%mm4 \n\t"
1659 "mov %0, %%"REG_a
" \n\t"
1661 "movq (%1, %%"REG_a
",4), %%mm0 \n\t"
1662 "movq 8(%1, %%"REG_a
",4), %%mm1 \n\t"
1663 "psrlw $8, %%mm0 \n\t"
1664 "psrlw $8, %%mm1 \n\t"
1665 "packuswb %%mm1, %%mm0 \n\t"
1666 "movq %%mm0, %%mm1 \n\t"
1667 "psrlw $8, %%mm0 \n\t"
1668 "pand %%mm4, %%mm1 \n\t"
1669 "packuswb %%mm0, %%mm0 \n\t"
1670 "packuswb %%mm1, %%mm1 \n\t"
1671 "movd %%mm0, (%3, %%"REG_a
") \n\t"
1672 "movd %%mm1, (%2, %%"REG_a
") \n\t"
1673 "add $4, %%"REG_a
" \n\t"
1675 : : "g" ((x86_reg
)-width
), "r" (src1
+width
*4), "r" (dstU
+width
), "r" (dstV
+width
)
1680 for (i
=0; i
<width
; i
++)
1682 dstU
[i
]= src1
[4*i
+ 1];
1683 dstV
[i
]= src1
[4*i
+ 3];
1686 assert(src1
== src2
);
1689 /* This is almost identical to the previous, end exists only because
1690 * yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses. */
1691 static inline void RENAME(uyvyToY
)(uint8_t *dst
, uint8_t *src
, long width
, uint32_t *unused
)
1695 "mov %0, %%"REG_a
" \n\t"
1697 "movq (%1, %%"REG_a
",2), %%mm0 \n\t"
1698 "movq 8(%1, %%"REG_a
",2), %%mm1 \n\t"
1699 "psrlw $8, %%mm0 \n\t"
1700 "psrlw $8, %%mm1 \n\t"
1701 "packuswb %%mm1, %%mm0 \n\t"
1702 "movq %%mm0, (%2, %%"REG_a
") \n\t"
1703 "add $8, %%"REG_a
" \n\t"
1705 : : "g" ((x86_reg
)-width
), "r" (src
+width
*2), "r" (dst
+width
)
1710 for (i
=0; i
<width
; i
++)
1715 static inline void RENAME(uyvyToUV
)(uint8_t *dstU
, uint8_t *dstV
, uint8_t *src1
, uint8_t *src2
, long width
, uint32_t *unused
)
1719 "movq "MANGLE(bm01010101
)", %%mm4 \n\t"
1720 "mov %0, %%"REG_a
" \n\t"
1722 "movq (%1, %%"REG_a
",4), %%mm0 \n\t"
1723 "movq 8(%1, %%"REG_a
",4), %%mm1 \n\t"
1724 "pand %%mm4, %%mm0 \n\t"
1725 "pand %%mm4, %%mm1 \n\t"
1726 "packuswb %%mm1, %%mm0 \n\t"
1727 "movq %%mm0, %%mm1 \n\t"
1728 "psrlw $8, %%mm0 \n\t"
1729 "pand %%mm4, %%mm1 \n\t"
1730 "packuswb %%mm0, %%mm0 \n\t"
1731 "packuswb %%mm1, %%mm1 \n\t"
1732 "movd %%mm0, (%3, %%"REG_a
") \n\t"
1733 "movd %%mm1, (%2, %%"REG_a
") \n\t"
1734 "add $4, %%"REG_a
" \n\t"
1736 : : "g" ((x86_reg
)-width
), "r" (src1
+width
*4), "r" (dstU
+width
), "r" (dstV
+width
)
1741 for (i
=0; i
<width
; i
++)
1743 dstU
[i
]= src1
[4*i
+ 0];
1744 dstV
[i
]= src1
[4*i
+ 2];
1747 assert(src1
== src2
);
/* Template: convert one packed RGB/BGR pixel format to 8-bit luma.
 * 'type' is the per-pixel storage type; shr/shg/shb plus maskr/maskg/maskb
 * extract the channels; RY/GY/BY are the (possibly pre-shifted) luma weights
 * and S the final right-shift (33<<((S)-1) is the rounding/offset constant
 * used by all instantiations below). */
#define BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S)\
static inline void RENAME(name)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)\
{\
    int i;\
    for (i=0; i<width; i++)\
    {\
        int b= (((type*)src)[i]>>shb)&maskb;\
        int g= (((type*)src)[i]>>shg)&maskg;\
        int r= (((type*)src)[i]>>shr)&maskr;\
\
        dst[i]= (((RY)*r + (GY)*g + (BY)*b + (33<<((S)-1)))>>(S));\
    }\
}
1764 BGR2Y(uint32_t, bgr32ToY
,16, 0, 0, 0x00FF, 0xFF00, 0x00FF, RY
<< 8, GY
, BY
<< 8, RGB2YUV_SHIFT
+8)
1765 BGR2Y(uint32_t, rgb32ToY
, 0, 0,16, 0x00FF, 0xFF00, 0x00FF, RY
<< 8, GY
, BY
<< 8, RGB2YUV_SHIFT
+8)
1766 BGR2Y(uint16_t, bgr16ToY
, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RY
<<11, GY
<<5, BY
, RGB2YUV_SHIFT
+8)
1767 BGR2Y(uint16_t, bgr15ToY
, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RY
<<10, GY
<<5, BY
, RGB2YUV_SHIFT
+7)
1768 BGR2Y(uint16_t, rgb16ToY
, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RY
, GY
<<5, BY
<<11, RGB2YUV_SHIFT
+8)
1769 BGR2Y(uint16_t, rgb15ToY
, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RY
, GY
<<5, BY
<<10, RGB2YUV_SHIFT
+7)
1771 static inline void RENAME(abgrToA
)(uint8_t *dst
, uint8_t *src
, long width
, uint32_t *unused
){
1773 for (i
=0; i
<width
; i
++){
/* Template: convert one packed RGB/BGR pixel format to chroma (U and V).
 * Generates two functions: RENAME(name), one output sample per input pixel,
 * and RENAME(name ## _half), which averages horizontal pixel pairs
 * (hence the doubled masks and the extra +1 on the final shift). */
#define BGR2UV(type, name, shr, shg, shb, maska, maskr, maskg, maskb, RU, GU, BU, RV, GV, BV, S)\
static inline void RENAME(name)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\
{\
    int i;\
    for (i=0; i<width; i++)\
    {\
        int b= (((type*)src)[i]&maskb)>>shb;\
        int g= (((type*)src)[i]&maskg)>>shg;\
        int r= (((type*)src)[i]&maskr)>>shr;\
\
        dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<((S)-1)))>>(S);\
        dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<((S)-1)))>>(S);\
    }\
}\
static inline void RENAME(name ## _half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\
{\
    int i;\
    for (i=0; i<width; i++)\
    {\
        int pix0= ((type*)src)[2*i+0];\
        int pix1= ((type*)src)[2*i+1];\
        int g= (pix0&~(maskr|maskb))+(pix1&~(maskr|maskb));\
        int b= ((pix0+pix1-g)&(maskb|(2*maskb)))>>shb;\
        int r= ((pix0+pix1-g)&(maskr|(2*maskr)))>>shr;\
        g&= maskg|(2*maskg);\
\
        g>>=shg;\
\
        dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<(S)))>>((S)+1);\
        dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<(S)))>>((S)+1);\
    }\
}
1811 BGR2UV(uint32_t, bgr32ToUV
,16, 0, 0, 0xFF000000, 0xFF0000, 0xFF00, 0x00FF, RU
<< 8, GU
, BU
<< 8, RV
<< 8, GV
, BV
<< 8, RGB2YUV_SHIFT
+8)
1812 BGR2UV(uint32_t, rgb32ToUV
, 0, 0,16, 0xFF000000, 0x00FF, 0xFF00, 0xFF0000, RU
<< 8, GU
, BU
<< 8, RV
<< 8, GV
, BV
<< 8, RGB2YUV_SHIFT
+8)
1813 BGR2UV(uint16_t, bgr16ToUV
, 0, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RU
<<11, GU
<<5, BU
, RV
<<11, GV
<<5, BV
, RGB2YUV_SHIFT
+8)
1814 BGR2UV(uint16_t, bgr15ToUV
, 0, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RU
<<10, GU
<<5, BU
, RV
<<10, GV
<<5, BV
, RGB2YUV_SHIFT
+7)
1815 BGR2UV(uint16_t, rgb16ToUV
, 0, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RU
, GU
<<5, BU
<<11, RV
, GV
<<5, BV
<<11, RGB2YUV_SHIFT
+8)
1816 BGR2UV(uint16_t, rgb15ToUV
, 0, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RU
, GU
<<5, BU
<<10, RV
, GV
<<5, BV
<<10, RGB2YUV_SHIFT
+7)
1819 static inline void RENAME(bgr24ToY_mmx
)(uint8_t *dst
, uint8_t *src
, long width
, int srcFormat
)
1822 if(srcFormat
== PIX_FMT_BGR24
){
1824 "movq "MANGLE(ff_bgr24toY1Coeff
)", %%mm5 \n\t"
1825 "movq "MANGLE(ff_bgr24toY2Coeff
)", %%mm6 \n\t"
1830 "movq "MANGLE(ff_rgb24toY1Coeff
)", %%mm5 \n\t"
1831 "movq "MANGLE(ff_rgb24toY2Coeff
)", %%mm6 \n\t"
1837 "movq "MANGLE(ff_bgr24toYOffset
)", %%mm4 \n\t"
1838 "mov %2, %%"REG_a
" \n\t"
1839 "pxor %%mm7, %%mm7 \n\t"
1841 PREFETCH
" 64(%0) \n\t"
1842 "movd (%0), %%mm0 \n\t"
1843 "movd 2(%0), %%mm1 \n\t"
1844 "movd 6(%0), %%mm2 \n\t"
1845 "movd 8(%0), %%mm3 \n\t"
1847 "punpcklbw %%mm7, %%mm0 \n\t"
1848 "punpcklbw %%mm7, %%mm1 \n\t"
1849 "punpcklbw %%mm7, %%mm2 \n\t"
1850 "punpcklbw %%mm7, %%mm3 \n\t"
1851 "pmaddwd %%mm5, %%mm0 \n\t"
1852 "pmaddwd %%mm6, %%mm1 \n\t"
1853 "pmaddwd %%mm5, %%mm2 \n\t"
1854 "pmaddwd %%mm6, %%mm3 \n\t"
1855 "paddd %%mm1, %%mm0 \n\t"
1856 "paddd %%mm3, %%mm2 \n\t"
1857 "paddd %%mm4, %%mm0 \n\t"
1858 "paddd %%mm4, %%mm2 \n\t"
1859 "psrad $15, %%mm0 \n\t"
1860 "psrad $15, %%mm2 \n\t"
1861 "packssdw %%mm2, %%mm0 \n\t"
1862 "packuswb %%mm0, %%mm0 \n\t"
1863 "movd %%mm0, (%1, %%"REG_a
") \n\t"
1864 "add $4, %%"REG_a
" \n\t"
1867 : "r" (dst
+width
), "g" ((x86_reg
)-width
)
1872 static inline void RENAME(bgr24ToUV_mmx
)(uint8_t *dstU
, uint8_t *dstV
, uint8_t *src
, long width
, int srcFormat
)
1875 "movq 24+%4, %%mm6 \n\t"
1876 "mov %3, %%"REG_a
" \n\t"
1877 "pxor %%mm7, %%mm7 \n\t"
1879 PREFETCH
" 64(%0) \n\t"
1880 "movd (%0), %%mm0 \n\t"
1881 "movd 2(%0), %%mm1 \n\t"
1882 "punpcklbw %%mm7, %%mm0 \n\t"
1883 "punpcklbw %%mm7, %%mm1 \n\t"
1884 "movq %%mm0, %%mm2 \n\t"
1885 "movq %%mm1, %%mm3 \n\t"
1886 "pmaddwd %4, %%mm0 \n\t"
1887 "pmaddwd 8+%4, %%mm1 \n\t"
1888 "pmaddwd 16+%4, %%mm2 \n\t"
1889 "pmaddwd %%mm6, %%mm3 \n\t"
1890 "paddd %%mm1, %%mm0 \n\t"
1891 "paddd %%mm3, %%mm2 \n\t"
1893 "movd 6(%0), %%mm1 \n\t"
1894 "movd 8(%0), %%mm3 \n\t"
1896 "punpcklbw %%mm7, %%mm1 \n\t"
1897 "punpcklbw %%mm7, %%mm3 \n\t"
1898 "movq %%mm1, %%mm4 \n\t"
1899 "movq %%mm3, %%mm5 \n\t"
1900 "pmaddwd %4, %%mm1 \n\t"
1901 "pmaddwd 8+%4, %%mm3 \n\t"
1902 "pmaddwd 16+%4, %%mm4 \n\t"
1903 "pmaddwd %%mm6, %%mm5 \n\t"
1904 "paddd %%mm3, %%mm1 \n\t"
1905 "paddd %%mm5, %%mm4 \n\t"
1907 "movq "MANGLE(ff_bgr24toUVOffset
)", %%mm3 \n\t"
1908 "paddd %%mm3, %%mm0 \n\t"
1909 "paddd %%mm3, %%mm2 \n\t"
1910 "paddd %%mm3, %%mm1 \n\t"
1911 "paddd %%mm3, %%mm4 \n\t"
1912 "psrad $15, %%mm0 \n\t"
1913 "psrad $15, %%mm2 \n\t"
1914 "psrad $15, %%mm1 \n\t"
1915 "psrad $15, %%mm4 \n\t"
1916 "packssdw %%mm1, %%mm0 \n\t"
1917 "packssdw %%mm4, %%mm2 \n\t"
1918 "packuswb %%mm0, %%mm0 \n\t"
1919 "packuswb %%mm2, %%mm2 \n\t"
1920 "movd %%mm0, (%1, %%"REG_a
") \n\t"
1921 "movd %%mm2, (%2, %%"REG_a
") \n\t"
1922 "add $4, %%"REG_a
" \n\t"
1925 : "r" (dstU
+width
), "r" (dstV
+width
), "g" ((x86_reg
)-width
), "m"(ff_bgr24toUV
[srcFormat
== PIX_FMT_RGB24
][0])
1931 static inline void RENAME(bgr24ToY
)(uint8_t *dst
, uint8_t *src
, long width
, uint32_t *unused
)
1934 RENAME(bgr24ToY_mmx
)(dst
, src
, width
, PIX_FMT_BGR24
);
1937 for (i
=0; i
<width
; i
++)
1943 dst
[i
]= ((RY
*r
+ GY
*g
+ BY
*b
+ (33<<(RGB2YUV_SHIFT
-1)))>>RGB2YUV_SHIFT
);
1945 #endif /* HAVE_MMX */
1948 static inline void RENAME(bgr24ToUV
)(uint8_t *dstU
, uint8_t *dstV
, uint8_t *src1
, uint8_t *src2
, long width
, uint32_t *unused
)
1951 RENAME(bgr24ToUV_mmx
)(dstU
, dstV
, src1
, width
, PIX_FMT_BGR24
);
1954 for (i
=0; i
<width
; i
++)
1956 int b
= src1
[3*i
+ 0];
1957 int g
= src1
[3*i
+ 1];
1958 int r
= src1
[3*i
+ 2];
1960 dstU
[i
]= (RU
*r
+ GU
*g
+ BU
*b
+ (257<<(RGB2YUV_SHIFT
-1)))>>RGB2YUV_SHIFT
;
1961 dstV
[i
]= (RV
*r
+ GV
*g
+ BV
*b
+ (257<<(RGB2YUV_SHIFT
-1)))>>RGB2YUV_SHIFT
;
1963 #endif /* HAVE_MMX */
1964 assert(src1
== src2
);
1967 static inline void RENAME(bgr24ToUV_half
)(uint8_t *dstU
, uint8_t *dstV
, uint8_t *src1
, uint8_t *src2
, long width
, uint32_t *unused
)
1970 for (i
=0; i
<width
; i
++)
1972 int b
= src1
[6*i
+ 0] + src1
[6*i
+ 3];
1973 int g
= src1
[6*i
+ 1] + src1
[6*i
+ 4];
1974 int r
= src1
[6*i
+ 2] + src1
[6*i
+ 5];
1976 dstU
[i
]= (RU
*r
+ GU
*g
+ BU
*b
+ (257<<RGB2YUV_SHIFT
))>>(RGB2YUV_SHIFT
+1);
1977 dstV
[i
]= (RV
*r
+ GV
*g
+ BV
*b
+ (257<<RGB2YUV_SHIFT
))>>(RGB2YUV_SHIFT
+1);
1979 assert(src1
== src2
);
1982 static inline void RENAME(rgb24ToY
)(uint8_t *dst
, uint8_t *src
, long width
, uint32_t *unused
)
1985 RENAME(bgr24ToY_mmx
)(dst
, src
, width
, PIX_FMT_RGB24
);
1988 for (i
=0; i
<width
; i
++)
1994 dst
[i
]= ((RY
*r
+ GY
*g
+ BY
*b
+ (33<<(RGB2YUV_SHIFT
-1)))>>RGB2YUV_SHIFT
);
1999 static inline void RENAME(rgb24ToUV
)(uint8_t *dstU
, uint8_t *dstV
, uint8_t *src1
, uint8_t *src2
, long width
, uint32_t *unused
)
2003 RENAME(bgr24ToUV_mmx
)(dstU
, dstV
, src1
, width
, PIX_FMT_RGB24
);
2007 for (i
=0; i
<width
; i
++)
2009 int r
= src1
[3*i
+ 0];
2010 int g
= src1
[3*i
+ 1];
2011 int b
= src1
[3*i
+ 2];
2013 dstU
[i
]= (RU
*r
+ GU
*g
+ BU
*b
+ (257<<(RGB2YUV_SHIFT
-1)))>>RGB2YUV_SHIFT
;
2014 dstV
[i
]= (RV
*r
+ GV
*g
+ BV
*b
+ (257<<(RGB2YUV_SHIFT
-1)))>>RGB2YUV_SHIFT
;
2019 static inline void RENAME(rgb24ToUV_half
)(uint8_t *dstU
, uint8_t *dstV
, uint8_t *src1
, uint8_t *src2
, long width
, uint32_t *unused
)
2023 for (i
=0; i
<width
; i
++)
2025 int r
= src1
[6*i
+ 0] + src1
[6*i
+ 3];
2026 int g
= src1
[6*i
+ 1] + src1
[6*i
+ 4];
2027 int b
= src1
[6*i
+ 2] + src1
[6*i
+ 5];
2029 dstU
[i
]= (RU
*r
+ GU
*g
+ BU
*b
+ (257<<RGB2YUV_SHIFT
))>>(RGB2YUV_SHIFT
+1);
2030 dstV
[i
]= (RV
*r
+ GV
*g
+ BV
*b
+ (257<<RGB2YUV_SHIFT
))>>(RGB2YUV_SHIFT
+1);
2035 static inline void RENAME(palToY
)(uint8_t *dst
, uint8_t *src
, long width
, uint32_t *pal
)
2038 for (i
=0; i
<width
; i
++)
2042 dst
[i
]= pal
[d
] & 0xFF;
2046 static inline void RENAME(palToUV
)(uint8_t *dstU
, uint8_t *dstV
, uint8_t *src1
, uint8_t *src2
, long width
, uint32_t *pal
)
2049 assert(src1
== src2
);
2050 for (i
=0; i
<width
; i
++)
2052 int p
= pal
[src1
[i
]];
2059 static inline void RENAME(monowhite2Y
)(uint8_t *dst
, uint8_t *src
, long width
, uint32_t *unused
)
2062 for (i
=0; i
<width
/8; i
++){
2065 dst
[8*i
+j
]= ((d
>>(7-j
))&1)*255;
2069 static inline void RENAME(monoblack2Y
)(uint8_t *dst
, uint8_t *src
, long width
, uint32_t *unused
)
2072 for (i
=0; i
<width
/8; i
++){
2075 dst
[8*i
+j
]= ((d
>>(7-j
))&1)*255;
2079 // bilinear / bicubic scaling
2080 static inline void RENAME(hScale
)(int16_t *dst
, int dstW
, uint8_t *src
, int srcW
, int xInc
,
2081 int16_t *filter
, int16_t *filterPos
, long filterSize
)
2084 assert(filterSize
% 4 == 0 && filterSize
>0);
2085 if (filterSize
==4) // Always true for upscaling, sometimes for down, too.
2087 x86_reg counter
= -2*dstW
;
2089 filterPos
-= counter
/2;
2093 "push %%"REG_b
" \n\t"
2095 "pxor %%mm7, %%mm7 \n\t"
2096 "push %%"REG_BP
" \n\t" // we use 7 regs here ...
2097 "mov %%"REG_a
", %%"REG_BP
" \n\t"
2100 "movzwl (%2, %%"REG_BP
"), %%eax \n\t"
2101 "movzwl 2(%2, %%"REG_BP
"), %%ebx \n\t"
2102 "movq (%1, %%"REG_BP
", 4), %%mm1 \n\t"
2103 "movq 8(%1, %%"REG_BP
", 4), %%mm3 \n\t"
2104 "movd (%3, %%"REG_a
"), %%mm0 \n\t"
2105 "movd (%3, %%"REG_b
"), %%mm2 \n\t"
2106 "punpcklbw %%mm7, %%mm0 \n\t"
2107 "punpcklbw %%mm7, %%mm2 \n\t"
2108 "pmaddwd %%mm1, %%mm0 \n\t"
2109 "pmaddwd %%mm2, %%mm3 \n\t"
2110 "movq %%mm0, %%mm4 \n\t"
2111 "punpckldq %%mm3, %%mm0 \n\t"
2112 "punpckhdq %%mm3, %%mm4 \n\t"
2113 "paddd %%mm4, %%mm0 \n\t"
2114 "psrad $7, %%mm0 \n\t"
2115 "packssdw %%mm0, %%mm0 \n\t"
2116 "movd %%mm0, (%4, %%"REG_BP
") \n\t"
2117 "add $4, %%"REG_BP
" \n\t"
2120 "pop %%"REG_BP
" \n\t"
2122 "pop %%"REG_b
" \n\t"
2125 : "c" (filter
), "d" (filterPos
), "S" (src
), "D" (dst
)
2131 else if (filterSize
==8)
2133 x86_reg counter
= -2*dstW
;
2135 filterPos
-= counter
/2;
2139 "push %%"REG_b
" \n\t"
2141 "pxor %%mm7, %%mm7 \n\t"
2142 "push %%"REG_BP
" \n\t" // we use 7 regs here ...
2143 "mov %%"REG_a
", %%"REG_BP
" \n\t"
2146 "movzwl (%2, %%"REG_BP
"), %%eax \n\t"
2147 "movzwl 2(%2, %%"REG_BP
"), %%ebx \n\t"
2148 "movq (%1, %%"REG_BP
", 8), %%mm1 \n\t"
2149 "movq 16(%1, %%"REG_BP
", 8), %%mm3 \n\t"
2150 "movd (%3, %%"REG_a
"), %%mm0 \n\t"
2151 "movd (%3, %%"REG_b
"), %%mm2 \n\t"
2152 "punpcklbw %%mm7, %%mm0 \n\t"
2153 "punpcklbw %%mm7, %%mm2 \n\t"
2154 "pmaddwd %%mm1, %%mm0 \n\t"
2155 "pmaddwd %%mm2, %%mm3 \n\t"
2157 "movq 8(%1, %%"REG_BP
", 8), %%mm1 \n\t"
2158 "movq 24(%1, %%"REG_BP
", 8), %%mm5 \n\t"
2159 "movd 4(%3, %%"REG_a
"), %%mm4 \n\t"
2160 "movd 4(%3, %%"REG_b
"), %%mm2 \n\t"
2161 "punpcklbw %%mm7, %%mm4 \n\t"
2162 "punpcklbw %%mm7, %%mm2 \n\t"
2163 "pmaddwd %%mm1, %%mm4 \n\t"
2164 "pmaddwd %%mm2, %%mm5 \n\t"
2165 "paddd %%mm4, %%mm0 \n\t"
2166 "paddd %%mm5, %%mm3 \n\t"
2167 "movq %%mm0, %%mm4 \n\t"
2168 "punpckldq %%mm3, %%mm0 \n\t"
2169 "punpckhdq %%mm3, %%mm4 \n\t"
2170 "paddd %%mm4, %%mm0 \n\t"
2171 "psrad $7, %%mm0 \n\t"
2172 "packssdw %%mm0, %%mm0 \n\t"
2173 "movd %%mm0, (%4, %%"REG_BP
") \n\t"
2174 "add $4, %%"REG_BP
" \n\t"
2177 "pop %%"REG_BP
" \n\t"
2179 "pop %%"REG_b
" \n\t"
2182 : "c" (filter
), "d" (filterPos
), "S" (src
), "D" (dst
)
2190 uint8_t *offset
= src
+filterSize
;
2191 x86_reg counter
= -2*dstW
;
2192 //filter-= counter*filterSize/2;
2193 filterPos
-= counter
/2;
2196 "pxor %%mm7, %%mm7 \n\t"
2199 "mov %2, %%"REG_c
" \n\t"
2200 "movzwl (%%"REG_c
", %0), %%eax \n\t"
2201 "movzwl 2(%%"REG_c
", %0), %%edx \n\t"
2202 "mov %5, %%"REG_c
" \n\t"
2203 "pxor %%mm4, %%mm4 \n\t"
2204 "pxor %%mm5, %%mm5 \n\t"
2206 "movq (%1), %%mm1 \n\t"
2207 "movq (%1, %6), %%mm3 \n\t"
2208 "movd (%%"REG_c
", %%"REG_a
"), %%mm0 \n\t"
2209 "movd (%%"REG_c
", %%"REG_d
"), %%mm2 \n\t"
2210 "punpcklbw %%mm7, %%mm0 \n\t"
2211 "punpcklbw %%mm7, %%mm2 \n\t"
2212 "pmaddwd %%mm1, %%mm0 \n\t"
2213 "pmaddwd %%mm2, %%mm3 \n\t"
2214 "paddd %%mm3, %%mm5 \n\t"
2215 "paddd %%mm0, %%mm4 \n\t"
2217 "add $4, %%"REG_c
" \n\t"
2218 "cmp %4, %%"REG_c
" \n\t"
2221 "movq %%mm4, %%mm0 \n\t"
2222 "punpckldq %%mm5, %%mm4 \n\t"
2223 "punpckhdq %%mm5, %%mm0 \n\t"
2224 "paddd %%mm0, %%mm4 \n\t"
2225 "psrad $7, %%mm4 \n\t"
2226 "packssdw %%mm4, %%mm4 \n\t"
2227 "mov %3, %%"REG_a
" \n\t"
2228 "movd %%mm4, (%%"REG_a
", %0) \n\t"
2232 : "+r" (counter
), "+r" (filter
)
2233 : "m" (filterPos
), "m" (dst
), "m"(offset
),
2234 "m" (src
), "r" ((x86_reg
)filterSize
*2)
2235 : "%"REG_a
, "%"REG_c
, "%"REG_d
2240 hScale_altivec_real(dst
, dstW
, src
, srcW
, xInc
, filter
, filterPos
, filterSize
);
2243 for (i
=0; i
<dstW
; i
++)
2246 int srcPos
= filterPos
[i
];
2248 //printf("filterPos: %d\n", filterPos[i]);
2249 for (j
=0; j
<filterSize
; j
++)
2251 //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2252 val
+= ((int)src
[srcPos
+ j
])*filter
[filterSize
*i
+ j
];
2254 //filter += hFilterSize;
2255 dst
[i
] = FFMIN(val
>>7, (1<<15)-1); // the cubic equation does overflow ...
2258 #endif /* HAVE_ALTIVEC */
2259 #endif /* HAVE_MMX */
2261 // *** horizontal scale Y line to temp buffer
2262 static inline void RENAME(hyscale
)(SwsContext
*c
, uint16_t *dst
, long dstWidth
, uint8_t *src
, int srcW
, int xInc
,
2263 int flags
, int canMMX2BeUsed
, int16_t *hLumFilter
,
2264 int16_t *hLumFilterPos
, int hLumFilterSize
, void *funnyYCode
,
2265 int srcFormat
, uint8_t *formatConvBuffer
, int16_t *mmx2Filter
,
2266 int32_t *mmx2FilterPos
, uint32_t *pal
, int isAlpha
)
2268 if (srcFormat
==PIX_FMT_YUYV422
|| srcFormat
==PIX_FMT_GRAY16BE
)
2270 RENAME(yuy2ToY
)(formatConvBuffer
, src
, srcW
, pal
);
2271 src
= formatConvBuffer
;
2273 else if (srcFormat
==PIX_FMT_UYVY422
|| srcFormat
==PIX_FMT_GRAY16LE
)
2275 RENAME(uyvyToY
)(formatConvBuffer
, src
, srcW
, pal
);
2276 src
= formatConvBuffer
;
2278 else if (srcFormat
==PIX_FMT_RGB32
)
2281 RENAME(abgrToA
)(formatConvBuffer
, src
+3, srcW
, pal
);
2283 RENAME(bgr32ToY
)(formatConvBuffer
, src
, srcW
, pal
);
2284 src
= formatConvBuffer
;
2286 else if (srcFormat
==PIX_FMT_RGB32_1
)
2289 RENAME(abgrToA
)(formatConvBuffer
, src
, srcW
, pal
);
2291 RENAME(bgr32ToY
)(formatConvBuffer
, src
+ALT32_CORR
, srcW
, pal
);
2292 src
= formatConvBuffer
;
2294 else if (srcFormat
==PIX_FMT_BGR24
)
2296 RENAME(bgr24ToY
)(formatConvBuffer
, src
, srcW
, pal
);
2297 src
= formatConvBuffer
;
2299 else if (srcFormat
==PIX_FMT_BGR565
)
2301 RENAME(bgr16ToY
)(formatConvBuffer
, src
, srcW
, pal
);
2302 src
= formatConvBuffer
;
2304 else if (srcFormat
==PIX_FMT_BGR555
)
2306 RENAME(bgr15ToY
)(formatConvBuffer
, src
, srcW
, pal
);
2307 src
= formatConvBuffer
;
2309 else if (srcFormat
==PIX_FMT_BGR32
)
2312 RENAME(abgrToA
)(formatConvBuffer
, src
+3, srcW
, pal
);
2314 RENAME(rgb32ToY
)(formatConvBuffer
, src
, srcW
, pal
);
2315 src
= formatConvBuffer
;
2317 else if (srcFormat
==PIX_FMT_BGR32_1
)
2320 RENAME(abgrToA
)(formatConvBuffer
, src
, srcW
, pal
);
2322 RENAME(rgb32ToY
)(formatConvBuffer
, src
+ALT32_CORR
, srcW
, pal
);
2323 src
= formatConvBuffer
;
2325 else if (srcFormat
==PIX_FMT_RGB24
)
2327 RENAME(rgb24ToY
)(formatConvBuffer
, src
, srcW
, pal
);
2328 src
= formatConvBuffer
;
2330 else if (srcFormat
==PIX_FMT_RGB565
)
2332 RENAME(rgb16ToY
)(formatConvBuffer
, src
, srcW
, pal
);
2333 src
= formatConvBuffer
;
2335 else if (srcFormat
==PIX_FMT_RGB555
)
2337 RENAME(rgb15ToY
)(formatConvBuffer
, src
, srcW
, pal
);
2338 src
= formatConvBuffer
;
2340 else if (srcFormat
==PIX_FMT_RGB8
|| srcFormat
==PIX_FMT_BGR8
|| srcFormat
==PIX_FMT_PAL8
|| srcFormat
==PIX_FMT_BGR4_BYTE
|| srcFormat
==PIX_FMT_RGB4_BYTE
)
2342 RENAME(palToY
)(formatConvBuffer
, src
, srcW
, pal
);
2343 src
= formatConvBuffer
;
2345 else if (srcFormat
==PIX_FMT_MONOBLACK
)
2347 RENAME(monoblack2Y
)(formatConvBuffer
, src
, srcW
, pal
);
2348 src
= formatConvBuffer
;
2350 else if (srcFormat
==PIX_FMT_MONOWHITE
)
2352 RENAME(monowhite2Y
)(formatConvBuffer
, src
, srcW
, pal
);
2353 src
= formatConvBuffer
;
2357 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2358 if (!(flags
&SWS_FAST_BILINEAR
) || (!canMMX2BeUsed
))
2360 if (!(flags
&SWS_FAST_BILINEAR
))
2363 RENAME(hScale
)(dst
, dstWidth
, src
, srcW
, xInc
, hLumFilter
, hLumFilterPos
, hLumFilterSize
);
2365 else // fast bilinear upscale / crap downscale
2367 #if ARCH_X86 && CONFIG_GPL
2371 uint64_t ebxsave
__attribute__((aligned(8)));
2377 "mov %%"REG_b
", %5 \n\t"
2379 "pxor %%mm7, %%mm7 \n\t"
2380 "mov %0, %%"REG_c
" \n\t"
2381 "mov %1, %%"REG_D
" \n\t"
2382 "mov %2, %%"REG_d
" \n\t"
2383 "mov %3, %%"REG_b
" \n\t"
2384 "xor %%"REG_a
", %%"REG_a
" \n\t" // i
2385 PREFETCH
" (%%"REG_c
") \n\t"
2386 PREFETCH
" 32(%%"REG_c
") \n\t"
2387 PREFETCH
" 64(%%"REG_c
") \n\t"
2391 #define FUNNY_Y_CODE \
2392 "movl (%%"REG_b"), %%esi \n\t"\
2394 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2395 "add %%"REG_S", %%"REG_c" \n\t"\
2396 "add %%"REG_a", %%"REG_D" \n\t"\
2397 "xor %%"REG_a", %%"REG_a" \n\t"\
2401 #define FUNNY_Y_CODE \
2402 "movl (%%"REG_b"), %%esi \n\t"\
2404 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2405 "add %%"REG_a", %%"REG_D" \n\t"\
2406 "xor %%"REG_a", %%"REG_a" \n\t"\
2408 #endif /* ARCH_X86_64 */
2420 "mov %5, %%"REG_b
" \n\t"
2422 :: "m" (src
), "m" (dst
), "m" (mmx2Filter
), "m" (mmx2FilterPos
),
2427 : "%"REG_a
, "%"REG_c
, "%"REG_d
, "%"REG_S
, "%"REG_D
2432 for (i
=dstWidth
-1; (i
*xInc
)>>16 >=srcW
-1; i
--) dst
[i
] = src
[srcW
-1]*128;
2436 #endif /* HAVE_MMX2 */
2437 x86_reg xInc_shr16
= xInc
>> 16;
2438 uint16_t xInc_mask
= xInc
& 0xffff;
2439 //NO MMX just normal asm ...
2441 "xor %%"REG_a
", %%"REG_a
" \n\t" // i
2442 "xor %%"REG_d
", %%"REG_d
" \n\t" // xx
2443 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2446 "movzbl (%0, %%"REG_d
"), %%edi \n\t" //src[xx]
2447 "movzbl 1(%0, %%"REG_d
"), %%esi \n\t" //src[xx+1]
2448 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2449 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2450 "shll $16, %%edi \n\t"
2451 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2452 "mov %1, %%"REG_D
" \n\t"
2453 "shrl $9, %%esi \n\t"
2454 "movw %%si, (%%"REG_D
", %%"REG_a
", 2) \n\t"
2455 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2456 "adc %3, %%"REG_d
" \n\t" //xx+= xInc>>8 + carry
2458 "movzbl (%0, %%"REG_d
"), %%edi \n\t" //src[xx]
2459 "movzbl 1(%0, %%"REG_d
"), %%esi \n\t" //src[xx+1]
2460 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2461 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2462 "shll $16, %%edi \n\t"
2463 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2464 "mov %1, %%"REG_D
" \n\t"
2465 "shrl $9, %%esi \n\t"
2466 "movw %%si, 2(%%"REG_D
", %%"REG_a
", 2) \n\t"
2467 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2468 "adc %3, %%"REG_d
" \n\t" //xx+= xInc>>8 + carry
2471 "add $2, %%"REG_a
" \n\t"
2472 "cmp %2, %%"REG_a
" \n\t"
2476 :: "r" (src
), "m" (dst
), "m" (dstWidth
), "m" (xInc_shr16
), "m" (xInc_mask
)
2477 : "%"REG_a
, "%"REG_d
, "%ecx", "%"REG_D
, "%esi"
2480 } //if MMX2 can't be used
2484 unsigned int xpos
=0;
2485 for (i
=0;i
<dstWidth
;i
++)
2487 register unsigned int xx
=xpos
>>16;
2488 register unsigned int xalpha
=(xpos
&0xFFFF)>>9;
2489 dst
[i
]= (src
[xx
]<<7) + (src
[xx
+1] - src
[xx
])*xalpha
;
2492 #endif /* ARCH_X86 */
2495 if(!isAlpha
&& c
->srcRange
!= c
->dstRange
&& !(isRGB(c
->dstFormat
) || isBGR(c
->dstFormat
))){
2497 //FIXME all pal and rgb srcFormats could do this convertion as well
2498 //FIXME all scalers more complex than bilinear could do half of this transform
2500 for (i
=0; i
<dstWidth
; i
++)
2501 dst
[i
]= (dst
[i
]*14071 + 33561947)>>14;
2503 for (i
=0; i
<dstWidth
; i
++)
2504 dst
[i
]= (FFMIN(dst
[i
],30189)*19077 - 39057361)>>14;
2509 inline static void RENAME(hcscale
)(SwsContext
*c
, uint16_t *dst
, long dstWidth
, uint8_t *src1
, uint8_t *src2
,
2510 int srcW
, int xInc
, int flags
, int canMMX2BeUsed
, int16_t *hChrFilter
,
2511 int16_t *hChrFilterPos
, int hChrFilterSize
, void *funnyUVCode
,
2512 int srcFormat
, uint8_t *formatConvBuffer
, int16_t *mmx2Filter
,
2513 int32_t *mmx2FilterPos
, uint32_t *pal
)
2515 if (srcFormat
==PIX_FMT_YUYV422
)
2517 RENAME(yuy2ToUV
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2518 src1
= formatConvBuffer
;
2519 src2
= formatConvBuffer
+VOFW
;
2521 else if (srcFormat
==PIX_FMT_UYVY422
)
2523 RENAME(uyvyToUV
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2524 src1
= formatConvBuffer
;
2525 src2
= formatConvBuffer
+VOFW
;
2527 else if (srcFormat
==PIX_FMT_RGB32
)
2529 if(c
->chrSrcHSubSample
)
2530 RENAME(bgr32ToUV_half
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2532 RENAME(bgr32ToUV
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2533 src1
= formatConvBuffer
;
2534 src2
= formatConvBuffer
+VOFW
;
2536 else if (srcFormat
==PIX_FMT_RGB32_1
)
2538 if(c
->chrSrcHSubSample
)
2539 RENAME(bgr32ToUV_half
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
+ALT32_CORR
, src2
+ALT32_CORR
, srcW
, pal
);
2541 RENAME(bgr32ToUV
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
+ALT32_CORR
, src2
+ALT32_CORR
, srcW
, pal
);
2542 src1
= formatConvBuffer
;
2543 src2
= formatConvBuffer
+VOFW
;
2545 else if (srcFormat
==PIX_FMT_BGR24
)
2547 if(c
->chrSrcHSubSample
)
2548 RENAME(bgr24ToUV_half
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2550 RENAME(bgr24ToUV
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2551 src1
= formatConvBuffer
;
2552 src2
= formatConvBuffer
+VOFW
;
2554 else if (srcFormat
==PIX_FMT_BGR565
)
2556 if(c
->chrSrcHSubSample
)
2557 RENAME(bgr16ToUV_half
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2559 RENAME(bgr16ToUV
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2560 src1
= formatConvBuffer
;
2561 src2
= formatConvBuffer
+VOFW
;
2563 else if (srcFormat
==PIX_FMT_BGR555
)
2565 if(c
->chrSrcHSubSample
)
2566 RENAME(bgr15ToUV_half
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2568 RENAME(bgr15ToUV
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2569 src1
= formatConvBuffer
;
2570 src2
= formatConvBuffer
+VOFW
;
2572 else if (srcFormat
==PIX_FMT_BGR32
)
2574 if(c
->chrSrcHSubSample
)
2575 RENAME(rgb32ToUV_half
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2577 RENAME(rgb32ToUV
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2578 src1
= formatConvBuffer
;
2579 src2
= formatConvBuffer
+VOFW
;
2581 else if (srcFormat
==PIX_FMT_BGR32_1
)
2583 if(c
->chrSrcHSubSample
)
2584 RENAME(rgb32ToUV_half
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
+ALT32_CORR
, src2
+ALT32_CORR
, srcW
, pal
);
2586 RENAME(rgb32ToUV
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
+ALT32_CORR
, src2
+ALT32_CORR
, srcW
, pal
);
2587 src1
= formatConvBuffer
;
2588 src2
= formatConvBuffer
+VOFW
;
2590 else if (srcFormat
==PIX_FMT_RGB24
)
2592 if(c
->chrSrcHSubSample
)
2593 RENAME(rgb24ToUV_half
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2595 RENAME(rgb24ToUV
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2596 src1
= formatConvBuffer
;
2597 src2
= formatConvBuffer
+VOFW
;
2599 else if (srcFormat
==PIX_FMT_RGB565
)
2601 if(c
->chrSrcHSubSample
)
2602 RENAME(rgb16ToUV_half
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2604 RENAME(rgb16ToUV
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2605 src1
= formatConvBuffer
;
2606 src2
= formatConvBuffer
+VOFW
;
2608 else if (srcFormat
==PIX_FMT_RGB555
)
2610 if(c
->chrSrcHSubSample
)
2611 RENAME(rgb15ToUV_half
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2613 RENAME(rgb15ToUV
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2614 src1
= formatConvBuffer
;
2615 src2
= formatConvBuffer
+VOFW
;
2617 else if (isGray(srcFormat
) || srcFormat
==PIX_FMT_MONOBLACK
|| srcFormat
==PIX_FMT_MONOWHITE
)
2621 else if (srcFormat
==PIX_FMT_RGB8
|| srcFormat
==PIX_FMT_BGR8
|| srcFormat
==PIX_FMT_PAL8
|| srcFormat
==PIX_FMT_BGR4_BYTE
|| srcFormat
==PIX_FMT_RGB4_BYTE
)
2623 RENAME(palToUV
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2624 src1
= formatConvBuffer
;
2625 src2
= formatConvBuffer
+VOFW
;
2629 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2630 if (!(flags
&SWS_FAST_BILINEAR
) || (!canMMX2BeUsed
))
2632 if (!(flags
&SWS_FAST_BILINEAR
))
2635 RENAME(hScale
)(dst
, dstWidth
, src1
, srcW
, xInc
, hChrFilter
, hChrFilterPos
, hChrFilterSize
);
2636 RENAME(hScale
)(dst
+VOFW
, dstWidth
, src2
, srcW
, xInc
, hChrFilter
, hChrFilterPos
, hChrFilterSize
);
2638 else // fast bilinear upscale / crap downscale
2640 #if ARCH_X86 && CONFIG_GPL
2644 uint64_t ebxsave
__attribute__((aligned(8)));
2650 "mov %%"REG_b
", %6 \n\t"
2652 "pxor %%mm7, %%mm7 \n\t"
2653 "mov %0, %%"REG_c
" \n\t"
2654 "mov %1, %%"REG_D
" \n\t"
2655 "mov %2, %%"REG_d
" \n\t"
2656 "mov %3, %%"REG_b
" \n\t"
2657 "xor %%"REG_a
", %%"REG_a
" \n\t" // i
2658 PREFETCH
" (%%"REG_c
") \n\t"
2659 PREFETCH
" 32(%%"REG_c
") \n\t"
2660 PREFETCH
" 64(%%"REG_c
") \n\t"
2664 #define FUNNY_UV_CODE \
2665 "movl (%%"REG_b"), %%esi \n\t"\
2667 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2668 "add %%"REG_S", %%"REG_c" \n\t"\
2669 "add %%"REG_a", %%"REG_D" \n\t"\
2670 "xor %%"REG_a", %%"REG_a" \n\t"\
2674 #define FUNNY_UV_CODE \
2675 "movl (%%"REG_b"), %%esi \n\t"\
2677 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2678 "add %%"REG_a", %%"REG_D" \n\t"\
2679 "xor %%"REG_a", %%"REG_a" \n\t"\
2681 #endif /* ARCH_X86_64 */
2687 "xor %%"REG_a
", %%"REG_a
" \n\t" // i
2688 "mov %5, %%"REG_c
" \n\t" // src
2689 "mov %1, %%"REG_D
" \n\t" // buf1
2690 "add $"AV_STRINGIFY(VOF
)", %%"REG_D
" \n\t"
2691 PREFETCH
" (%%"REG_c
") \n\t"
2692 PREFETCH
" 32(%%"REG_c
") \n\t"
2693 PREFETCH
" 64(%%"REG_c
") \n\t"
2701 "mov %6, %%"REG_b
" \n\t"
2703 :: "m" (src1
), "m" (dst
), "m" (mmx2Filter
), "m" (mmx2FilterPos
),
2704 "m" (funnyUVCode
), "m" (src2
)
2708 : "%"REG_a
, "%"REG_c
, "%"REG_d
, "%"REG_S
, "%"REG_D
2713 for (i
=dstWidth
-1; (i
*xInc
)>>16 >=srcW
-1; i
--)
2715 //printf("%d %d %d\n", dstWidth, i, srcW);
2716 dst
[i
] = src1
[srcW
-1]*128;
2717 dst
[i
+VOFW
] = src2
[srcW
-1]*128;
2722 #endif /* HAVE_MMX2 */
2723 x86_reg xInc_shr16
= (x86_reg
) (xInc
>> 16);
2724 uint16_t xInc_mask
= xInc
& 0xffff;
2726 "xor %%"REG_a
", %%"REG_a
" \n\t" // i
2727 "xor %%"REG_d
", %%"REG_d
" \n\t" // xx
2728 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2731 "mov %0, %%"REG_S
" \n\t"
2732 "movzbl (%%"REG_S
", %%"REG_d
"), %%edi \n\t" //src[xx]
2733 "movzbl 1(%%"REG_S
", %%"REG_d
"), %%esi \n\t" //src[xx+1]
2734 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2735 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2736 "shll $16, %%edi \n\t"
2737 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2738 "mov %1, %%"REG_D
" \n\t"
2739 "shrl $9, %%esi \n\t"
2740 "movw %%si, (%%"REG_D
", %%"REG_a
", 2) \n\t"
2742 "movzbl (%5, %%"REG_d
"), %%edi \n\t" //src[xx]
2743 "movzbl 1(%5, %%"REG_d
"), %%esi \n\t" //src[xx+1]
2744 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2745 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2746 "shll $16, %%edi \n\t"
2747 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2748 "mov %1, %%"REG_D
" \n\t"
2749 "shrl $9, %%esi \n\t"
2750 "movw %%si, "AV_STRINGIFY(VOF
)"(%%"REG_D
", %%"REG_a
", 2) \n\t"
2752 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2753 "adc %3, %%"REG_d
" \n\t" //xx+= xInc>>8 + carry
2754 "add $1, %%"REG_a
" \n\t"
2755 "cmp %2, %%"REG_a
" \n\t"
2758 /* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2759 which is needed to support GCC 4.0. */
2760 #if ARCH_X86_64 && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2761 :: "m" (src1
), "m" (dst
), "g" (dstWidth
), "m" (xInc_shr16
), "m" (xInc_mask
),
2763 :: "m" (src1
), "m" (dst
), "m" (dstWidth
), "m" (xInc_shr16
), "m" (xInc_mask
),
2766 : "%"REG_a
, "%"REG_d
, "%ecx", "%"REG_D
, "%esi"
2769 } //if MMX2 can't be used
2773 unsigned int xpos
=0;
2774 for (i
=0;i
<dstWidth
;i
++)
2776 register unsigned int xx
=xpos
>>16;
2777 register unsigned int xalpha
=(xpos
&0xFFFF)>>9;
2778 dst
[i
]=(src1
[xx
]*(xalpha
^127)+src1
[xx
+1]*xalpha
);
2779 dst
[i
+VOFW
]=(src2
[xx
]*(xalpha
^127)+src2
[xx
+1]*xalpha
);
2781 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2782 dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2786 #endif /* ARCH_X86 */
2788 if(c
->srcRange
!= c
->dstRange
&& !(isRGB(c
->dstFormat
) || isBGR(c
->dstFormat
))){
2790 //FIXME all pal and rgb srcFormats could do this convertion as well
2791 //FIXME all scalers more complex than bilinear could do half of this transform
2793 for (i
=0; i
<dstWidth
; i
++){
2794 dst
[i
]= (dst
[i
]*1799 + 4081085)>>11; //1469
2795 dst
[i
+VOFW
]= (dst
[i
+VOFW
]*1799 + 4081085)>>11; //1469
2798 for (i
=0; i
<dstWidth
; i
++){
2799 dst
[i
]= (FFMIN(dst
[i
],30775)*4663 - 9289992)>>12; //-264
2800 dst
[i
+VOFW
]= (FFMIN(dst
[i
+VOFW
],30775)*4663 - 9289992)>>12; //-264
2806 static int RENAME(swScale
)(SwsContext
*c
, uint8_t* src
[], int srcStride
[], int srcSliceY
,
2807 int srcSliceH
, uint8_t* dst
[], int dstStride
[]){
2809 /* load a few things into local vars to make the code more readable? and faster */
2810 const int srcW
= c
->srcW
;
2811 const int dstW
= c
->dstW
;
2812 const int dstH
= c
->dstH
;
2813 const int chrDstW
= c
->chrDstW
;
2814 const int chrSrcW
= c
->chrSrcW
;
2815 const int lumXInc
= c
->lumXInc
;
2816 const int chrXInc
= c
->chrXInc
;
2817 const int dstFormat
= c
->dstFormat
;
2818 const int srcFormat
= c
->srcFormat
;
2819 const int flags
= c
->flags
;
2820 const int canMMX2BeUsed
= c
->canMMX2BeUsed
;
2821 int16_t *vLumFilterPos
= c
->vLumFilterPos
;
2822 int16_t *vChrFilterPos
= c
->vChrFilterPos
;
2823 int16_t *hLumFilterPos
= c
->hLumFilterPos
;
2824 int16_t *hChrFilterPos
= c
->hChrFilterPos
;
2825 int16_t *vLumFilter
= c
->vLumFilter
;
2826 int16_t *vChrFilter
= c
->vChrFilter
;
2827 int16_t *hLumFilter
= c
->hLumFilter
;
2828 int16_t *hChrFilter
= c
->hChrFilter
;
2829 int32_t *lumMmxFilter
= c
->lumMmxFilter
;
2830 int32_t *chrMmxFilter
= c
->chrMmxFilter
;
2831 int32_t *alpMmxFilter
= c
->alpMmxFilter
;
2832 const int vLumFilterSize
= c
->vLumFilterSize
;
2833 const int vChrFilterSize
= c
->vChrFilterSize
;
2834 const int hLumFilterSize
= c
->hLumFilterSize
;
2835 const int hChrFilterSize
= c
->hChrFilterSize
;
2836 int16_t **lumPixBuf
= c
->lumPixBuf
;
2837 int16_t **chrPixBuf
= c
->chrPixBuf
;
2838 int16_t **alpPixBuf
= c
->alpPixBuf
;
2839 const int vLumBufSize
= c
->vLumBufSize
;
2840 const int vChrBufSize
= c
->vChrBufSize
;
2841 uint8_t *funnyYCode
= c
->funnyYCode
;
2842 uint8_t *funnyUVCode
= c
->funnyUVCode
;
2843 uint8_t *formatConvBuffer
= c
->formatConvBuffer
;
2844 const int chrSrcSliceY
= srcSliceY
>> c
->chrSrcVSubSample
;
2845 const int chrSrcSliceH
= -((-srcSliceH
) >> c
->chrSrcVSubSample
);
2847 uint32_t *pal
=c
->pal_yuv
;
2849 /* vars which will change and which we need to store back in the context */
2851 int lumBufIndex
= c
->lumBufIndex
;
2852 int chrBufIndex
= c
->chrBufIndex
;
2853 int lastInLumBuf
= c
->lastInLumBuf
;
2854 int lastInChrBuf
= c
->lastInChrBuf
;
2856 if (isPacked(c
->srcFormat
)){
2864 srcStride
[3]= srcStride
[0];
2866 srcStride
[1]<<= c
->vChrDrop
;
2867 srcStride
[2]<<= c
->vChrDrop
;
2869 //printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2870 // (int)dst[0], (int)dst[1], (int)dst[2]);
2872 #if 0 //self test FIXME move to a vfilter or something
2874 static volatile int i
=0;
2876 if (srcFormat
==PIX_FMT_YUV420P
&& i
==1 && srcSliceH
>= c
->srcH
)
2877 selfTest(src
, srcStride
, c
->srcW
, c
->srcH
);
2882 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2883 //dstStride[0],dstStride[1],dstStride[2]);
2885 if (dstStride
[0]%8 !=0 || dstStride
[1]%8 !=0 || dstStride
[2]%8 !=0 || dstStride
[3]%8 != 0)
2887 static int warnedAlready
=0; //FIXME move this into the context perhaps
2888 if (flags
& SWS_PRINT_INFO
&& !warnedAlready
)
2890 av_log(c
, AV_LOG_WARNING
, "Warning: dstStride is not aligned!\n"
2891 " ->cannot do aligned memory accesses anymore\n");
2896 /* Note the user might start scaling the picture in the middle so this
2897 will not get executed. This is not really intended but works
2898 currently, so people might do it. */
2909 for (;dstY
< dstH
; dstY
++){
2910 unsigned char *dest
=dst
[0]+dstStride
[0]*dstY
;
2911 const int chrDstY
= dstY
>>c
->chrDstVSubSample
;
2912 unsigned char *uDest
=dst
[1]+dstStride
[1]*chrDstY
;
2913 unsigned char *vDest
=dst
[2]+dstStride
[2]*chrDstY
;
2914 unsigned char *aDest
=(CONFIG_SWSCALE_ALPHA
&& alpPixBuf
) ? dst
[3]+dstStride
[3]*dstY
: NULL
;
2916 const int firstLumSrcY
= vLumFilterPos
[dstY
]; //First line needed as input
2917 const int firstChrSrcY
= vChrFilterPos
[chrDstY
]; //First line needed as input
2918 const int lastLumSrcY
= firstLumSrcY
+ vLumFilterSize
-1; // Last line needed as input
2919 const int lastChrSrcY
= firstChrSrcY
+ vChrFilterSize
-1; // Last line needed as input
2921 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
2922 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
2923 //handle holes (FAST_BILINEAR & weird filters)
2924 if (firstLumSrcY
> lastInLumBuf
) lastInLumBuf
= firstLumSrcY
-1;
2925 if (firstChrSrcY
> lastInChrBuf
) lastInChrBuf
= firstChrSrcY
-1;
2926 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
2927 assert(firstLumSrcY
>= lastInLumBuf
- vLumBufSize
+ 1);
2928 assert(firstChrSrcY
>= lastInChrBuf
- vChrBufSize
+ 1);
2930 // Do we have enough lines in this slice to output the dstY line
2931 if (lastLumSrcY
< srcSliceY
+ srcSliceH
&& lastChrSrcY
< -((-srcSliceY
- srcSliceH
)>>c
->chrSrcVSubSample
))
2933 //Do horizontal scaling
2934 while(lastInLumBuf
< lastLumSrcY
)
2936 uint8_t *src1
= src
[0]+(lastInLumBuf
+ 1 - srcSliceY
)*srcStride
[0];
2937 uint8_t *src2
= src
[3]+(lastInLumBuf
+ 1 - srcSliceY
)*srcStride
[3];
2939 //printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
2940 assert(lumBufIndex
< 2*vLumBufSize
);
2941 assert(lastInLumBuf
+ 1 - srcSliceY
< srcSliceH
);
2942 assert(lastInLumBuf
+ 1 - srcSliceY
>= 0);
2943 //printf("%d %d\n", lumBufIndex, vLumBufSize);
2944 RENAME(hyscale
)(c
, lumPixBuf
[ lumBufIndex
], dstW
, src1
, srcW
, lumXInc
,
2945 flags
, canMMX2BeUsed
, hLumFilter
, hLumFilterPos
, hLumFilterSize
,
2946 funnyYCode
, c
->srcFormat
, formatConvBuffer
,
2947 c
->lumMmx2Filter
, c
->lumMmx2FilterPos
, pal
, 0);
2948 if (CONFIG_SWSCALE_ALPHA
&& alpPixBuf
)
2949 RENAME(hyscale
)(c
, alpPixBuf
[ lumBufIndex
], dstW
, src2
, srcW
, lumXInc
,
2950 flags
, canMMX2BeUsed
, hLumFilter
, hLumFilterPos
, hLumFilterSize
,
2951 funnyYCode
, c
->srcFormat
, formatConvBuffer
,
2952 c
->lumMmx2Filter
, c
->lumMmx2FilterPos
, pal
, 1);
2955 while(lastInChrBuf
< lastChrSrcY
)
2957 uint8_t *src1
= src
[1]+(lastInChrBuf
+ 1 - chrSrcSliceY
)*srcStride
[1];
2958 uint8_t *src2
= src
[2]+(lastInChrBuf
+ 1 - chrSrcSliceY
)*srcStride
[2];
2960 assert(chrBufIndex
< 2*vChrBufSize
);
2961 assert(lastInChrBuf
+ 1 - chrSrcSliceY
< (chrSrcSliceH
));
2962 assert(lastInChrBuf
+ 1 - chrSrcSliceY
>= 0);
2963 //FIXME replace parameters through context struct (some at least)
2965 if (!(isGray(srcFormat
) || isGray(dstFormat
)))
2966 RENAME(hcscale
)(c
, chrPixBuf
[ chrBufIndex
], chrDstW
, src1
, src2
, chrSrcW
, chrXInc
,
2967 flags
, canMMX2BeUsed
, hChrFilter
, hChrFilterPos
, hChrFilterSize
,
2968 funnyUVCode
, c
->srcFormat
, formatConvBuffer
,
2969 c
->chrMmx2Filter
, c
->chrMmx2FilterPos
, pal
);
2972 //wrap buf index around to stay inside the ring buffer
2973 if (lumBufIndex
>= vLumBufSize
) lumBufIndex
-= vLumBufSize
;
2974 if (chrBufIndex
>= vChrBufSize
) chrBufIndex
-= vChrBufSize
;
2976 else // not enough lines left in this slice -> load the rest in the buffer
2978 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
2979 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
2980 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
2981 vChrBufSize, vLumBufSize);*/
2983 //Do horizontal scaling
2984 while(lastInLumBuf
+1 < srcSliceY
+ srcSliceH
)
2986 uint8_t *src1
= src
[0]+(lastInLumBuf
+ 1 - srcSliceY
)*srcStride
[0];
2987 uint8_t *src2
= src
[3]+(lastInLumBuf
+ 1 - srcSliceY
)*srcStride
[3];
2989 assert(lumBufIndex
< 2*vLumBufSize
);
2990 assert(lastInLumBuf
+ 1 - srcSliceY
< srcSliceH
);
2991 assert(lastInLumBuf
+ 1 - srcSliceY
>= 0);
2992 RENAME(hyscale
)(c
, lumPixBuf
[ lumBufIndex
], dstW
, src1
, srcW
, lumXInc
,
2993 flags
, canMMX2BeUsed
, hLumFilter
, hLumFilterPos
, hLumFilterSize
,
2994 funnyYCode
, c
->srcFormat
, formatConvBuffer
,
2995 c
->lumMmx2Filter
, c
->lumMmx2FilterPos
, pal
, 0);
2996 if (CONFIG_SWSCALE_ALPHA
&& alpPixBuf
)
2997 RENAME(hyscale
)(c
, alpPixBuf
[ lumBufIndex
], dstW
, src2
, srcW
, lumXInc
,
2998 flags
, canMMX2BeUsed
, hLumFilter
, hLumFilterPos
, hLumFilterSize
,
2999 funnyYCode
, c
->srcFormat
, formatConvBuffer
,
3000 c
->lumMmx2Filter
, c
->lumMmx2FilterPos
, pal
, 1);
3003 while(lastInChrBuf
+1 < (chrSrcSliceY
+ chrSrcSliceH
))
3005 uint8_t *src1
= src
[1]+(lastInChrBuf
+ 1 - chrSrcSliceY
)*srcStride
[1];
3006 uint8_t *src2
= src
[2]+(lastInChrBuf
+ 1 - chrSrcSliceY
)*srcStride
[2];
3008 assert(chrBufIndex
< 2*vChrBufSize
);
3009 assert(lastInChrBuf
+ 1 - chrSrcSliceY
< chrSrcSliceH
);
3010 assert(lastInChrBuf
+ 1 - chrSrcSliceY
>= 0);
3012 if (!(isGray(srcFormat
) || isGray(dstFormat
)))
3013 RENAME(hcscale
)(c
, chrPixBuf
[ chrBufIndex
], chrDstW
, src1
, src2
, chrSrcW
, chrXInc
,
3014 flags
, canMMX2BeUsed
, hChrFilter
, hChrFilterPos
, hChrFilterSize
,
3015 funnyUVCode
, c
->srcFormat
, formatConvBuffer
,
3016 c
->chrMmx2Filter
, c
->chrMmx2FilterPos
, pal
);
3019 //wrap buf index around to stay inside the ring buffer
3020 if (lumBufIndex
>= vLumBufSize
) lumBufIndex
-= vLumBufSize
;
3021 if (chrBufIndex
>= vChrBufSize
) chrBufIndex
-= vChrBufSize
;
3022 break; //we can't output a dstY line so let's try with the next slice
3026 c
->blueDither
= ff_dither8
[dstY
&1];
3027 if (c
->dstFormat
== PIX_FMT_RGB555
|| c
->dstFormat
== PIX_FMT_BGR555
)
3028 c
->greenDither
= ff_dither8
[dstY
&1];
3030 c
->greenDither
= ff_dither4
[dstY
&1];
3031 c
->redDither
= ff_dither8
[(dstY
+1)&1];
3035 int16_t **lumSrcPtr
= lumPixBuf
+ lumBufIndex
+ firstLumSrcY
- lastInLumBuf
+ vLumBufSize
;
3036 int16_t **chrSrcPtr
= chrPixBuf
+ chrBufIndex
+ firstChrSrcY
- lastInChrBuf
+ vChrBufSize
;
3037 int16_t **alpSrcPtr
= (CONFIG_SWSCALE_ALPHA
&& alpPixBuf
) ? alpPixBuf
+ lumBufIndex
+ firstLumSrcY
- lastInLumBuf
+ vLumBufSize
: NULL
;
3040 if (flags
& SWS_ACCURATE_RND
){
3041 int s
= APCK_SIZE
/ 8;
3042 for (i
=0; i
<vLumFilterSize
; i
+=2){
3043 *(void**)&lumMmxFilter
[s
*i
]= lumSrcPtr
[i
];
3044 *(void**)&lumMmxFilter
[s
*i
+APCK_PTR2
/4 ]= lumSrcPtr
[i
+(vLumFilterSize
>1)];
3045 lumMmxFilter
[s
*i
+APCK_COEF
/4 ]=
3046 lumMmxFilter
[s
*i
+APCK_COEF
/4+1]= vLumFilter
[dstY
*vLumFilterSize
+ i
]
3047 + (vLumFilterSize
>1 ? vLumFilter
[dstY
*vLumFilterSize
+ i
+ 1]<<16 : 0);
3048 if (CONFIG_SWSCALE_ALPHA
&& alpPixBuf
){
3049 *(void**)&alpMmxFilter
[s
*i
]= alpSrcPtr
[i
];
3050 *(void**)&alpMmxFilter
[s
*i
+APCK_PTR2
/4 ]= alpSrcPtr
[i
+(vLumFilterSize
>1)];
3051 alpMmxFilter
[s
*i
+APCK_COEF
/4 ]=
3052 alpMmxFilter
[s
*i
+APCK_COEF
/4+1]= lumMmxFilter
[s
*i
+APCK_COEF
/4 ];
3055 for (i
=0; i
<vChrFilterSize
; i
+=2){
3056 *(void**)&chrMmxFilter
[s
*i
]= chrSrcPtr
[i
];
3057 *(void**)&chrMmxFilter
[s
*i
+APCK_PTR2
/4 ]= chrSrcPtr
[i
+(vChrFilterSize
>1)];
3058 chrMmxFilter
[s
*i
+APCK_COEF
/4 ]=
3059 chrMmxFilter
[s
*i
+APCK_COEF
/4+1]= vChrFilter
[chrDstY
*vChrFilterSize
+ i
]
3060 + (vChrFilterSize
>1 ? vChrFilter
[chrDstY
*vChrFilterSize
+ i
+ 1]<<16 : 0);
3063 for (i
=0; i
<vLumFilterSize
; i
++)
3065 lumMmxFilter
[4*i
+0]= (int32_t)lumSrcPtr
[i
];
3066 lumMmxFilter
[4*i
+1]= (uint64_t)lumSrcPtr
[i
] >> 32;
3067 lumMmxFilter
[4*i
+2]=
3068 lumMmxFilter
[4*i
+3]=
3069 ((uint16_t)vLumFilter
[dstY
*vLumFilterSize
+ i
])*0x10001;
3070 if (CONFIG_SWSCALE_ALPHA
&& alpPixBuf
){
3071 alpMmxFilter
[4*i
+0]= (int32_t)alpSrcPtr
[i
];
3072 alpMmxFilter
[4*i
+1]= (uint64_t)alpSrcPtr
[i
] >> 32;
3073 alpMmxFilter
[4*i
+2]=
3074 alpMmxFilter
[4*i
+3]= lumMmxFilter
[4*i
+2];
3077 for (i
=0; i
<vChrFilterSize
; i
++)
3079 chrMmxFilter
[4*i
+0]= (int32_t)chrSrcPtr
[i
];
3080 chrMmxFilter
[4*i
+1]= (uint64_t)chrSrcPtr
[i
] >> 32;
3081 chrMmxFilter
[4*i
+2]=
3082 chrMmxFilter
[4*i
+3]=
3083 ((uint16_t)vChrFilter
[chrDstY
*vChrFilterSize
+ i
])*0x10001;
3087 if (dstFormat
== PIX_FMT_NV12
|| dstFormat
== PIX_FMT_NV21
){
3088 const int chrSkipMask
= (1<<c
->chrDstVSubSample
)-1;
3089 if (dstY
&chrSkipMask
) uDest
= NULL
; //FIXME split functions in lumi / chromi
3090 RENAME(yuv2nv12X
)(c
,
3091 vLumFilter
+dstY
*vLumFilterSize
, lumSrcPtr
, vLumFilterSize
,
3092 vChrFilter
+chrDstY
*vChrFilterSize
, chrSrcPtr
, vChrFilterSize
,
3093 dest
, uDest
, dstW
, chrDstW
, dstFormat
);
3095 else if (isPlanarYUV(dstFormat
) || dstFormat
==PIX_FMT_GRAY8
) //YV12 like
3097 const int chrSkipMask
= (1<<c
->chrDstVSubSample
)-1;
3098 if ((dstY
&chrSkipMask
) || isGray(dstFormat
)) uDest
=vDest
= NULL
; //FIXME split functions in lumi / chromi
3099 if (vLumFilterSize
== 1 && vChrFilterSize
== 1) // unscaled YV12
3101 int16_t *lumBuf
= lumPixBuf
[0];
3102 int16_t *chrBuf
= chrPixBuf
[0];
3103 int16_t *alpBuf
= (CONFIG_SWSCALE_ALPHA
&& alpPixBuf
) ? alpPixBuf
[0] : NULL
;
3104 RENAME(yuv2yuv1
)(c
, lumBuf
, chrBuf
, alpBuf
, dest
, uDest
, vDest
, aDest
, dstW
, chrDstW
);
3109 vLumFilter
+dstY
*vLumFilterSize
, lumSrcPtr
, vLumFilterSize
,
3110 vChrFilter
+chrDstY
*vChrFilterSize
, chrSrcPtr
, vChrFilterSize
,
3111 alpSrcPtr
, dest
, uDest
, vDest
, aDest
, dstW
, chrDstW
);
3116 assert(lumSrcPtr
+ vLumFilterSize
- 1 < lumPixBuf
+ vLumBufSize
*2);
3117 assert(chrSrcPtr
+ vChrFilterSize
- 1 < chrPixBuf
+ vChrBufSize
*2);
3118 if (vLumFilterSize
== 1 && vChrFilterSize
== 2) //unscaled RGB
3120 int chrAlpha
= vChrFilter
[2*dstY
+1];
3121 if(flags
& SWS_FULL_CHR_H_INT
){
3122 yuv2rgbXinC_full(c
, //FIXME write a packed1_full function
3123 vLumFilter
+dstY
*vLumFilterSize
, lumSrcPtr
, vLumFilterSize
,
3124 vChrFilter
+dstY
*vChrFilterSize
, chrSrcPtr
, vChrFilterSize
,
3125 alpSrcPtr
, dest
, dstW
, dstY
);
3127 RENAME(yuv2packed1
)(c
, *lumSrcPtr
, *chrSrcPtr
, *(chrSrcPtr
+1),
3128 alpPixBuf
? *alpSrcPtr
: NULL
,
3129 dest
, dstW
, chrAlpha
, dstFormat
, flags
, dstY
);
3132 else if (vLumFilterSize
== 2 && vChrFilterSize
== 2) //bilinear upscale RGB
3134 int lumAlpha
= vLumFilter
[2*dstY
+1];
3135 int chrAlpha
= vChrFilter
[2*dstY
+1];
3137 lumMmxFilter
[3]= vLumFilter
[2*dstY
]*0x10001;
3139 chrMmxFilter
[3]= vChrFilter
[2*chrDstY
]*0x10001;
3140 if(flags
& SWS_FULL_CHR_H_INT
){
3141 yuv2rgbXinC_full(c
, //FIXME write a packed2_full function
3142 vLumFilter
+dstY
*vLumFilterSize
, lumSrcPtr
, vLumFilterSize
,
3143 vChrFilter
+dstY
*vChrFilterSize
, chrSrcPtr
, vChrFilterSize
,
3144 alpSrcPtr
, dest
, dstW
, dstY
);
3146 RENAME(yuv2packed2
)(c
, *lumSrcPtr
, *(lumSrcPtr
+1), *chrSrcPtr
, *(chrSrcPtr
+1),
3147 alpPixBuf
? *alpSrcPtr
: NULL
, alpPixBuf
? *(alpSrcPtr
+1) : NULL
,
3148 dest
, dstW
, lumAlpha
, chrAlpha
, dstY
);
3153 if(flags
& SWS_FULL_CHR_H_INT
){
3155 vLumFilter
+dstY
*vLumFilterSize
, lumSrcPtr
, vLumFilterSize
,
3156 vChrFilter
+dstY
*vChrFilterSize
, chrSrcPtr
, vChrFilterSize
,
3157 alpSrcPtr
, dest
, dstW
, dstY
);
3159 RENAME(yuv2packedX
)(c
,
3160 vLumFilter
+dstY
*vLumFilterSize
, lumSrcPtr
, vLumFilterSize
,
3161 vChrFilter
+dstY
*vChrFilterSize
, chrSrcPtr
, vChrFilterSize
,
3162 alpSrcPtr
, dest
, dstW
, dstY
);
3167 else // hmm looks like we can't use MMX here without overwriting this array's tail
3169 int16_t **lumSrcPtr
= lumPixBuf
+ lumBufIndex
+ firstLumSrcY
- lastInLumBuf
+ vLumBufSize
;
3170 int16_t **chrSrcPtr
= chrPixBuf
+ chrBufIndex
+ firstChrSrcY
- lastInChrBuf
+ vChrBufSize
;
3171 int16_t **alpSrcPtr
= (CONFIG_SWSCALE_ALPHA
&& alpPixBuf
) ? alpPixBuf
+ lumBufIndex
+ firstLumSrcY
- lastInLumBuf
+ vLumBufSize
: NULL
;
3172 if (dstFormat
== PIX_FMT_NV12
|| dstFormat
== PIX_FMT_NV21
){
3173 const int chrSkipMask
= (1<<c
->chrDstVSubSample
)-1;
3174 if (dstY
&chrSkipMask
) uDest
= NULL
; //FIXME split functions in lumi / chromi
3176 vLumFilter
+dstY
*vLumFilterSize
, lumSrcPtr
, vLumFilterSize
,
3177 vChrFilter
+chrDstY
*vChrFilterSize
, chrSrcPtr
, vChrFilterSize
,
3178 dest
, uDest
, dstW
, chrDstW
, dstFormat
);
3180 else if (isPlanarYUV(dstFormat
) || dstFormat
==PIX_FMT_GRAY8
) //YV12
3182 const int chrSkipMask
= (1<<c
->chrDstVSubSample
)-1;
3183 if ((dstY
&chrSkipMask
) || isGray(dstFormat
)) uDest
=vDest
= NULL
; //FIXME split functions in lumi / chromi
3185 vLumFilter
+dstY
*vLumFilterSize
, lumSrcPtr
, vLumFilterSize
,
3186 vChrFilter
+chrDstY
*vChrFilterSize
, chrSrcPtr
, vChrFilterSize
,
3187 alpSrcPtr
, dest
, uDest
, vDest
, aDest
, dstW
, chrDstW
);
3191 assert(lumSrcPtr
+ vLumFilterSize
- 1 < lumPixBuf
+ vLumBufSize
*2);
3192 assert(chrSrcPtr
+ vChrFilterSize
- 1 < chrPixBuf
+ vChrBufSize
*2);
3193 if(flags
& SWS_FULL_CHR_H_INT
){
3195 vLumFilter
+dstY
*vLumFilterSize
, lumSrcPtr
, vLumFilterSize
,
3196 vChrFilter
+dstY
*vChrFilterSize
, chrSrcPtr
, vChrFilterSize
,
3197 alpSrcPtr
, dest
, dstW
, dstY
);
3200 vLumFilter
+dstY
*vLumFilterSize
, lumSrcPtr
, vLumFilterSize
,
3201 vChrFilter
+dstY
*vChrFilterSize
, chrSrcPtr
, vChrFilterSize
,
3202 alpSrcPtr
, dest
, dstW
, dstY
);
3208 if ((dstFormat
== PIX_FMT_YUVA420P
) && !alpPixBuf
)
3209 fillPlane(dst
[3], dstStride
[3], dstW
, dstY
-lastDstY
, lastDstY
, 255);
3212 __asm__
volatile(SFENCE:::"memory");
3213 __asm__
volatile(EMMS:::"memory");
3215 /* store changed local vars back in the context */
3217 c
->lumBufIndex
= lumBufIndex
;
3218 c
->chrBufIndex
= chrBufIndex
;
3219 c
->lastInLumBuf
= lastInLumBuf
;
3220 c
->lastInChrBuf
= lastInChrBuf
;
3222 return dstY
- lastDstY
;