2 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 * The C code (not assembly, MMX, ...) of this file can be used
21 * under the LGPL license.
33 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
/* Instruction mnemonics pasted into the inline-asm strings below.
 * Each name is defined several times with alternative expansions;
 * NOTE(review): the preprocessor conditionals that select exactly one
 * alternative are not visible in this excerpt. */
/* Prefetch mnemonics; the last pair expands to a "# nop" string, presumably
 * so that the assembler sees only a comment and emits no instruction. */
40 #define PREFETCH "prefetch"
41 #define PREFETCHW "prefetchw"
43 #define PREFETCH "prefetchnta"
44 #define PREFETCHW "prefetcht0"
46 #define PREFETCH " # nop"
47 #define PREFETCHW " # nop"
/* Store fence, or the same "# nop" placeholder. */
51 #define SFENCE "sfence"
53 #define SFENCE " # nop"
/* Packed byte average, spelled either "pavgb" or "pavgusb". */
57 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
59 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
/* 64-bit store: either "movntq" or a plain "movq". */
63 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
65 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
/* Extra macro level: the arguments are macro-expanded here before
 * REAL_MOVNTQ stringifies them with '#'. */
67 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
70 #include "swscale_altivec_template.c"
/* Vertical multi-tap scaling to one 8-bit output plane (fast variant).
 *
 * x      - string: byte displacement added to every source row address
 * offset - string: displacement of the filter list relative to asm operand %0
 * dest   - output pointer (asm operand %1)
 * width  - number of output samples (asm operand %2)
 *
 * Operand %0 is &c->redDither; the rounder is read from VROUNDER_OFFSET(%0).
 * REG_a is the sample index, REG_d walks the filter list in 16-byte steps
 * (source row pointer at +0, coefficient at +8), REG_S holds the current row
 * pointer. Each tap does pmulhw(src, coeff) and accumulates into mm3/mm4;
 * the sums are then shifted right by 3, packed to unsigned bytes and 8 bytes
 * are stored at dest + REG_a.
 * NOTE(review): the asm statement wrapper, loop labels and branch
 * instructions are not visible in this excerpt; the "test" on REG_S suggests
 * the filter list ends with a NULL row pointer - confirm. */
73 #define YSCALEYUV2YV12X(x, offset, dest, width) \
75 "xor %%"REG_a", %%"REG_a" \n\t"\
76 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
77 "movq %%mm3, %%mm4 \n\t"\
78 "lea " offset "(%0), %%"REG_d" \n\t"\
79 "mov (%%"REG_d"), %%"REG_S" \n\t"\
80 ASMALIGN(4) /* FIXME Unroll? */\
82 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
83 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
84 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
85 "add $16, %%"REG_d" \n\t"\
86 "mov (%%"REG_d"), %%"REG_S" \n\t"\
87 "test %%"REG_S", %%"REG_S" \n\t"\
88 "pmulhw %%mm0, %%mm2 \n\t"\
89 "pmulhw %%mm0, %%mm5 \n\t"\
90 "paddw %%mm2, %%mm3 \n\t"\
91 "paddw %%mm5, %%mm4 \n\t"\
93 "psraw $3, %%mm3 \n\t"\
94 "psraw $3, %%mm4 \n\t"\
95 "packuswb %%mm4, %%mm3 \n\t"\
96 MOVNTQ(%%mm3, (%1, %%REGa))\
97 "add $8, %%"REG_a" \n\t"\
98 "cmp %2, %%"REG_a" \n\t"\
99 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
100 "movq %%mm3, %%mm4 \n\t"\
101 "lea " offset "(%0), %%"REG_d" \n\t"\
102 "mov (%%"REG_d"), %%"REG_S" \n\t"\
104 :: "r" (&c->redDither),\
105 "r" (dest), "g" (width)\
106 : "%"REG_a, "%"REG_d, "%"REG_S\
/* Vertical multi-tap scaling to one 8-bit output plane (accurate variant).
 *
 * Same parameters and asm operands as YSCALEYUV2YV12X. Differences visible
 * in the code: two source rows are processed per filter entry (row pointers
 * at +0 and APCK_PTR2, coefficient pair at APCK_COEF, entry size APCK_SIZE);
 * samples of both rows are interleaved with punpck{l,h}wd and multiplied
 * with pmaddwd, accumulating 32-bit sums in mm4..mm7. The sums are shifted
 * right by 16, packed to words, the rounder from VROUNDER_OFFSET(%0) is
 * added, and the result is shifted right by 3 and packed to unsigned bytes.
 * NOTE(review): the asm statement wrapper, loop labels and branch
 * instructions are not visible in this excerpt. */
109 #define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
111 "lea " offset "(%0), %%"REG_d" \n\t"\
112 "xor %%"REG_a", %%"REG_a" \n\t"\
113 "pxor %%mm4, %%mm4 \n\t"\
114 "pxor %%mm5, %%mm5 \n\t"\
115 "pxor %%mm6, %%mm6 \n\t"\
116 "pxor %%mm7, %%mm7 \n\t"\
117 "mov (%%"REG_d"), %%"REG_S" \n\t"\
120 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
121 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
122 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
123 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
124 "movq %%mm0, %%mm3 \n\t"\
125 "punpcklwd %%mm1, %%mm0 \n\t"\
126 "punpckhwd %%mm1, %%mm3 \n\t"\
127 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
128 "pmaddwd %%mm1, %%mm0 \n\t"\
129 "pmaddwd %%mm1, %%mm3 \n\t"\
130 "paddd %%mm0, %%mm4 \n\t"\
131 "paddd %%mm3, %%mm5 \n\t"\
132 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
133 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
134 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
135 "test %%"REG_S", %%"REG_S" \n\t"\
136 "movq %%mm2, %%mm0 \n\t"\
137 "punpcklwd %%mm3, %%mm2 \n\t"\
138 "punpckhwd %%mm3, %%mm0 \n\t"\
139 "pmaddwd %%mm1, %%mm2 \n\t"\
140 "pmaddwd %%mm1, %%mm0 \n\t"\
141 "paddd %%mm2, %%mm6 \n\t"\
142 "paddd %%mm0, %%mm7 \n\t"\
144 "psrad $16, %%mm4 \n\t"\
145 "psrad $16, %%mm5 \n\t"\
146 "psrad $16, %%mm6 \n\t"\
147 "psrad $16, %%mm7 \n\t"\
148 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
149 "packssdw %%mm5, %%mm4 \n\t"\
150 "packssdw %%mm7, %%mm6 \n\t"\
151 "paddw %%mm0, %%mm4 \n\t"\
152 "paddw %%mm0, %%mm6 \n\t"\
153 "psraw $3, %%mm4 \n\t"\
154 "psraw $3, %%mm6 \n\t"\
155 "packuswb %%mm6, %%mm4 \n\t"\
156 MOVNTQ(%%mm4, (%1, %%REGa))\
157 "add $8, %%"REG_a" \n\t"\
158 "cmp %2, %%"REG_a" \n\t"\
159 "lea " offset "(%0), %%"REG_d" \n\t"\
160 "pxor %%mm4, %%mm4 \n\t"\
161 "pxor %%mm5, %%mm5 \n\t"\
162 "pxor %%mm6, %%mm6 \n\t"\
163 "pxor %%mm7, %%mm7 \n\t"\
164 "mov (%%"REG_d"), %%"REG_S" \n\t"\
166 :: "r" (&c->redDither),\
167 "r" (dest), "g" (width)\
168 : "%"REG_a, "%"REG_d, "%"REG_S\
/* Unfiltered (single source row) conversion of 16-bit samples to 8 bits.
 * REG_a is initialised from operand %2; each step loads 8 words from
 * %0 + REG_a*2, shifts them right arithmetically by 7 (truncating), packs
 * them to unsigned bytes, stores 8 bytes at %1 + REG_a and adds 8 to REG_a.
 * NOTE(review): the loop label, the branch and the operand list are not
 * visible in this excerpt. */
171 #define YSCALEYUV2YV121 \
172 "mov %2, %%"REG_a" \n\t"\
173 ASMALIGN(4) /* FIXME Unroll? */\
175 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
176 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
177 "psraw $7, %%mm0 \n\t"\
178 "psraw $7, %%mm1 \n\t"\
179 "packuswb %%mm1, %%mm0 \n\t"\
180 MOVNTQ(%%mm0, (%1, %%REGa))\
181 "add $8, %%"REG_a" \n\t"\
/* Same as YSCALEYUV2YV121 but rounds instead of truncating: mm7 is set to
 * 64 in every word (all ones -> >>15 gives 1 -> <<6 gives 64) and added
 * with signed saturation before the arithmetic shift right by 7.
 * NOTE(review): the loop label, the branch and the operand list are not
 * visible in this excerpt. */
184 #define YSCALEYUV2YV121_ACCURATE \
185 "mov %2, %%"REG_a" \n\t"\
186 "pcmpeqw %%mm7, %%mm7 \n\t"\
187 "psrlw $15, %%mm7 \n\t"\
188 "psllw $6, %%mm7 \n\t"\
189 ASMALIGN(4) /* FIXME Unroll? */\
191 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
192 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
193 "paddsw %%mm7, %%mm0 \n\t"\
194 "paddsw %%mm7, %%mm1 \n\t"\
195 "psraw $7, %%mm0 \n\t"\
196 "psraw $7, %%mm1 \n\t"\
197 "packuswb %%mm1, %%mm0 \n\t"\
198 MOVNTQ(%%mm0, (%1, %%REGa))\
199 "add $8, %%"REG_a" \n\t"\
203 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
204 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
205 "r" (dest), "m" (dstW),
206 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
207 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
/* Chroma half of the multi-tap vertical filter for packed output (fast
 * variant). Clears the index REG_a, walks the filter list at
 * CHR_MMX_FILTER_OFFSET(%0) in 16-byte steps (row pointer at +0,
 * coefficient at +8) and accumulates pmulhw(src, coeff) on top of the
 * rounder from VROUNDER_OFFSET(%0): U in mm3, V in mm4. V samples are read
 * at displacement VOF from the same row pointer. Unlike the luma macro the
 * index is not scaled by 2 in the chroma addresses.
 * NOTE(review): loop labels and branches are not visible in this excerpt. */
209 #define YSCALEYUV2PACKEDX_UV \
211 "xor %%"REG_a", %%"REG_a" \n\t"\
215 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
216 "mov (%%"REG_d"), %%"REG_S" \n\t"\
217 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
218 "movq %%mm3, %%mm4 \n\t"\
221 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
222 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
223 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
224 "add $16, %%"REG_d" \n\t"\
225 "mov (%%"REG_d"), %%"REG_S" \n\t"\
226 "pmulhw %%mm0, %%mm2 \n\t"\
227 "pmulhw %%mm0, %%mm5 \n\t"\
228 "paddw %%mm2, %%mm3 \n\t"\
229 "paddw %%mm5, %%mm4 \n\t"\
230 "test %%"REG_S", %%"REG_S" \n\t"\
/* Luma-style half of the multi-tap vertical filter for packed output (fast
 * variant). offset is a string giving the filter list displacement relative
 * to operand %0. Accumulates pmulhw(src, coeff) on top of the rounder from
 * VROUNDER_OFFSET(%0): first 4 samples in mm1, next 4 in mm7. Uses the
 * index already held in REG_a (it is not reset here).
 * NOTE(review): loop labels and branches are not visible in this excerpt. */
233 #define YSCALEYUV2PACKEDX_YA(offset) \
234 "lea "offset"(%0), %%"REG_d" \n\t"\
235 "mov (%%"REG_d"), %%"REG_S" \n\t"\
236 "movq "VROUNDER_OFFSET"(%0), %%mm1 \n\t"\
237 "movq %%mm1, %%mm7 \n\t"\
240 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
241 "movq (%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y1srcData */\
242 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\
243 "add $16, %%"REG_d" \n\t"\
244 "mov (%%"REG_d"), %%"REG_S" \n\t"\
245 "pmulhw %%mm0, %%mm2 \n\t"\
246 "pmulhw %%mm0, %%mm5 \n\t"\
247 "paddw %%mm2, %%mm1 \n\t"\
248 "paddw %%mm5, %%mm7 \n\t"\
249 "test %%"REG_S", %%"REG_S" \n\t"\
/* Fast packed-output vertical filter: the chroma pass followed by the luma
 * pass over the list at LUM_MMX_FILTER_OFFSET. Results are left in
 * mm3 (U), mm4 (V), mm1 (Y1) and mm7 (Y2).
 * NOTE(review): the last line ends with a line continuation, so the
 * definition relies on the following line being blank. */
252 #define YSCALEYUV2PACKEDX \
253 YSCALEYUV2PACKEDX_UV \
254 YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET) \
/* Operand and clobber lists closing an asm statement opened for the
 * YSCALEYUV2PACKEDX* code: %0 = &c->redDither, %1..%3 = "dummy" memory
 * operands, %4 = dest, %5 = dstW; REG_a, REG_d and REG_S are clobbered.
 * NOTE(review): the dummy operands presumably only keep the operand
 * numbering in line with other asm blocks - confirm. */
256 #define YSCALEYUV2PACKEDX_END \
257 :: "r" (&c->redDither), \
258 "m" (dummy), "m" (dummy), "m" (dummy),\
259 "r" (dest), "m" (dstW) \
260 : "%"REG_a, "%"REG_d, "%"REG_S \
/* Chroma half of the multi-tap vertical filter for packed output (accurate
 * variant). Two source rows are handled per filter entry (row pointers at
 * +0 and APCK_PTR2, coefficient pair at APCK_COEF, entry size APCK_SIZE).
 * Samples from both rows are interleaved and multiplied with pmaddwd into
 * 32-bit accumulators: U in mm4/mm5, V in mm6/mm7. After the taps the sums
 * are shifted right by 16, packed to words, the rounder from
 * VROUNDER_OFFSET(%0) is added, and U / V are spilled to U_TEMP(%0) and
 * V_TEMP(%0).
 * NOTE(review): loop labels and branches are not visible in this excerpt. */
263 #define YSCALEYUV2PACKEDX_ACCURATE_UV \
265 "xor %%"REG_a", %%"REG_a" \n\t"\
269 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
270 "mov (%%"REG_d"), %%"REG_S" \n\t"\
271 "pxor %%mm4, %%mm4 \n\t"\
272 "pxor %%mm5, %%mm5 \n\t"\
273 "pxor %%mm6, %%mm6 \n\t"\
274 "pxor %%mm7, %%mm7 \n\t"\
277 "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
278 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
279 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
280 "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
281 "movq %%mm0, %%mm3 \n\t"\
282 "punpcklwd %%mm1, %%mm0 \n\t"\
283 "punpckhwd %%mm1, %%mm3 \n\t"\
284 "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
285 "pmaddwd %%mm1, %%mm0 \n\t"\
286 "pmaddwd %%mm1, %%mm3 \n\t"\
287 "paddd %%mm0, %%mm4 \n\t"\
288 "paddd %%mm3, %%mm5 \n\t"\
289 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
290 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
291 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
292 "test %%"REG_S", %%"REG_S" \n\t"\
293 "movq %%mm2, %%mm0 \n\t"\
294 "punpcklwd %%mm3, %%mm2 \n\t"\
295 "punpckhwd %%mm3, %%mm0 \n\t"\
296 "pmaddwd %%mm1, %%mm2 \n\t"\
297 "pmaddwd %%mm1, %%mm0 \n\t"\
298 "paddd %%mm2, %%mm6 \n\t"\
299 "paddd %%mm0, %%mm7 \n\t"\
301 "psrad $16, %%mm4 \n\t"\
302 "psrad $16, %%mm5 \n\t"\
303 "psrad $16, %%mm6 \n\t"\
304 "psrad $16, %%mm7 \n\t"\
305 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
306 "packssdw %%mm5, %%mm4 \n\t"\
307 "packssdw %%mm7, %%mm6 \n\t"\
308 "paddw %%mm0, %%mm4 \n\t"\
309 "paddw %%mm0, %%mm6 \n\t"\
310 "movq %%mm4, "U_TEMP"(%0) \n\t"\
311 "movq %%mm6, "V_TEMP"(%0) \n\t"\
/* Luma-style half of the multi-tap vertical filter for packed output
 * (accurate variant). offset is a string giving the filter list
 * displacement relative to operand %0. Works like the accurate chroma pass
 * but with the index scaled by 2 and 32-bit accumulators mm1/mm5 (first 4
 * samples) and mm7/mm6 (next 4). After >>16, packing and adding the rounder
 * the results are in mm1 and mm7; finally U and V are reloaded from
 * U_TEMP(%0) / V_TEMP(%0) into mm3 / mm4.
 * NOTE(review): loop labels and branches are not visible in this excerpt. */
313 #define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
314 "lea "offset"(%0), %%"REG_d" \n\t"\
315 "mov (%%"REG_d"), %%"REG_S" \n\t"\
316 "pxor %%mm1, %%mm1 \n\t"\
317 "pxor %%mm5, %%mm5 \n\t"\
318 "pxor %%mm7, %%mm7 \n\t"\
319 "pxor %%mm6, %%mm6 \n\t"\
322 "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
323 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
324 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
325 "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
326 "movq %%mm0, %%mm3 \n\t"\
327 "punpcklwd %%mm4, %%mm0 \n\t"\
328 "punpckhwd %%mm4, %%mm3 \n\t"\
329 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
330 "pmaddwd %%mm4, %%mm0 \n\t"\
331 "pmaddwd %%mm4, %%mm3 \n\t"\
332 "paddd %%mm0, %%mm1 \n\t"\
333 "paddd %%mm3, %%mm5 \n\t"\
334 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
335 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
336 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
337 "test %%"REG_S", %%"REG_S" \n\t"\
338 "movq %%mm2, %%mm0 \n\t"\
339 "punpcklwd %%mm3, %%mm2 \n\t"\
340 "punpckhwd %%mm3, %%mm0 \n\t"\
341 "pmaddwd %%mm4, %%mm2 \n\t"\
342 "pmaddwd %%mm4, %%mm0 \n\t"\
343 "paddd %%mm2, %%mm7 \n\t"\
344 "paddd %%mm0, %%mm6 \n\t"\
346 "psrad $16, %%mm1 \n\t"\
347 "psrad $16, %%mm5 \n\t"\
348 "psrad $16, %%mm7 \n\t"\
349 "psrad $16, %%mm6 \n\t"\
350 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
351 "packssdw %%mm5, %%mm1 \n\t"\
352 "packssdw %%mm6, %%mm7 \n\t"\
353 "paddw %%mm0, %%mm1 \n\t"\
354 "paddw %%mm0, %%mm7 \n\t"\
355 "movq "U_TEMP"(%0), %%mm3 \n\t"\
356 "movq "V_TEMP"(%0), %%mm4 \n\t"\
/* Accurate packed-output vertical filter: the accurate chroma pass followed
 * by the accurate luma pass over the list at LUM_MMX_FILTER_OFFSET.
 * Results are left in mm3 (U), mm4 (V), mm1 (Y1) and mm7 (Y2). */
358 #define YSCALEYUV2PACKEDX_ACCURATE \
359 YSCALEYUV2PACKEDX_ACCURATE_UV \
360 YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
/* YUV -> RGB conversion on the registers produced by YSCALEYUV2PACKEDX*.
 * Input:  mm3 = U, mm4 = V, mm1 = Y1, mm7 = Y2 (16-bit words).
 * Uses the offsets and coefficients stored relative to asm operand %0
 * (U_OFFSET, V_OFFSET, Y_OFFSET, UG/VG/UB/VR_COEFF, Y_COEFF).
 * Each chroma term is duplicated with punpck{l,h}wd so that it is added to
 * two luma samples.
 * Output: mm2 = B, mm4 = G, mm5 = R, each 8 unsigned saturated bytes. */
362 #define YSCALEYUV2RGBX \
363 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
364 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
365 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
366 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
367 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
368 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
369 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
370 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
371 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
372 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
373 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
374 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
375 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
376 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
377 "paddw %%mm3, %%mm4 \n\t"\
378 "movq %%mm2, %%mm0 \n\t"\
379 "movq %%mm5, %%mm6 \n\t"\
380 "movq %%mm4, %%mm3 \n\t"\
381 "punpcklwd %%mm2, %%mm2 \n\t"\
382 "punpcklwd %%mm5, %%mm5 \n\t"\
383 "punpcklwd %%mm4, %%mm4 \n\t"\
384 "paddw %%mm1, %%mm2 \n\t"\
385 "paddw %%mm1, %%mm5 \n\t"\
386 "paddw %%mm1, %%mm4 \n\t"\
387 "punpckhwd %%mm0, %%mm0 \n\t"\
388 "punpckhwd %%mm6, %%mm6 \n\t"\
389 "punpckhwd %%mm3, %%mm3 \n\t"\
390 "paddw %%mm7, %%mm0 \n\t"\
391 "paddw %%mm7, %%mm6 \n\t"\
392 "paddw %%mm7, %%mm3 \n\t"\
393 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
394 "packuswb %%mm0, %%mm2 \n\t"\
395 "packuswb %%mm6, %%mm5 \n\t"\
396 "packuswb %%mm3, %%mm4 \n\t"\
/* Two-row (bilinear) vertical blend producing YUV for packed output.
 *
 * index - register used as sample index (cleared here)
 * c     - register holding the context base address
 * asm operands: %0 / %1 = luma rows (buf0 / buf1 in the comments),
 *               %2 / %3 = chroma rows (uvbuf0 / uvbuf1 in the comments).
 *
 * Side effect: the coefficient words stored at CHR_MMX_FILTER_OFFSET+8(c)
 * and LUM_MMX_FILTER_OFFSET+8(c) are shifted right by 3 and written back
 * to memory before use.
 * Each output is row1>>7 + pmulhw(row0 - row1, coeff).
 * Output: mm3 = U, mm4 = V, mm1 = Y1, mm7 = Y2.
 * NOTE(review): the loop label is not visible in this excerpt. */
398 #define REAL_YSCALEYUV2PACKED(index, c) \
399 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
400 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
401 "psraw $3, %%mm0 \n\t"\
402 "psraw $3, %%mm1 \n\t"\
403 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
404 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
405 "xor "#index", "#index" \n\t"\
408 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
409 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
410 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
411 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
412 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
413 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
414 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
415 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
416 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
417 "psraw $7, %%mm3 \n\t" /* uvbuf1[eax] >>7*/\
418 "psraw $7, %%mm4 \n\t" /* uvbuf1[eax+2048] >>7*/\
419 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
420 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
421 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
422 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
423 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
424 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
425 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
426 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
427 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
428 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
429 "psraw $7, %%mm1 \n\t" /* buf1[eax] >>7*/\
430 "psraw $7, %%mm7 \n\t" /* buf1[eax] >>7*/\
431 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
432 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
/* Wrapper that macro-expands index and c before they are stringified. */
434 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
/* Chroma part of the two-row blend for RGB output.
 * Clears index, blends the chroma rows at operands %2 / %3 as
 * row1>>4 + pmulhw(row0 - row1, coeff at CHR_MMX_FILTER_OFFSET+8(c)),
 * subtracts U_OFFSET / V_OFFSET and multiplies by UG_COEFF / VG_COEFF.
 * Output: mm2 = U - offset, mm5 = V - offset, mm3 = ug, mm4 = vg.
 * NOTE(review): the loop label is not visible in this excerpt. */
436 #define REAL_YSCALEYUV2RGB_UV(index, c) \
437 "xor "#index", "#index" \n\t"\
440 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
441 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
442 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
443 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
444 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
445 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
446 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
447 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
448 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
449 "psraw $4, %%mm3 \n\t" /* uvbuf1[eax] >>4*/\
450 "psraw $4, %%mm4 \n\t" /* uvbuf1[eax+2048] >>4*/\
451 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
452 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
453 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
454 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
455 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
456 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
457 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
458 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
459 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
/* Luma part of the two-row blend for RGB output.
 * Blends the rows at operands %0 / %1 as
 * row1>>4 + pmulhw(row0 - row1, coeff at LUM_MMX_FILTER_OFFSET+8(c)).
 * Output: mm1 = first 4 samples (Y1), mm7 = next 4 samples (Y2). */
461 #define REAL_YSCALEYUV2RGB_YA(index, c) \
462 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
463 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
464 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
465 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
466 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
467 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
468 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
469 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
470 "psraw $4, %%mm1 \n\t" /* buf1[eax] >>4*/\
471 "psraw $4, %%mm7 \n\t" /* buf1[eax] >>4*/\
472 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
473 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
/* Final colour-matrix step for RGB output, addressing the coefficients
 * relative to register c.
 * Input:  mm2 = U - offset, mm5 = V - offset, mm3 = ug, mm4 = vg,
 *         mm1 = Y1, mm7 = Y2.
 * Each chroma term is duplicated with punpck{l,h}wd so that it is added to
 * two luma samples.
 * Output: mm2 = B, mm4 = G, mm5 = R, each 8 unsigned saturated bytes. */
475 #define REAL_YSCALEYUV2RGB_COEFF(c) \
476 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
477 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
478 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
479 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
480 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
481 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
482 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
483 "paddw %%mm3, %%mm4 \n\t"\
484 "movq %%mm2, %%mm0 \n\t"\
485 "movq %%mm5, %%mm6 \n\t"\
486 "movq %%mm4, %%mm3 \n\t"\
487 "punpcklwd %%mm2, %%mm2 \n\t"\
488 "punpcklwd %%mm5, %%mm5 \n\t"\
489 "punpcklwd %%mm4, %%mm4 \n\t"\
490 "paddw %%mm1, %%mm2 \n\t"\
491 "paddw %%mm1, %%mm5 \n\t"\
492 "paddw %%mm1, %%mm4 \n\t"\
493 "punpckhwd %%mm0, %%mm0 \n\t"\
494 "punpckhwd %%mm6, %%mm6 \n\t"\
495 "punpckhwd %%mm3, %%mm3 \n\t"\
496 "paddw %%mm7, %%mm0 \n\t"\
497 "paddw %%mm7, %%mm6 \n\t"\
498 "paddw %%mm7, %%mm3 \n\t"\
499 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
500 "packuswb %%mm0, %%mm2 \n\t"\
501 "packuswb %%mm6, %%mm5 \n\t"\
502 "packuswb %%mm3, %%mm4 \n\t"\
/* Wrapper that macro-expands index and c before they are stringified. */
504 #define YSCALEYUV2RGB_YA(index, c) REAL_YSCALEYUV2RGB_YA(index, c)
/* Complete two-row blend + YUV->RGB conversion: chroma blend, luma blend,
 * then the colour-matrix step. Leaves B in mm2, G in mm4 and R in mm5. */
506 #define YSCALEYUV2RGB(index, c) \
507 REAL_YSCALEYUV2RGB_UV(index, c) \
508 REAL_YSCALEYUV2RGB_YA(index, c) \
509 REAL_YSCALEYUV2RGB_COEFF(c)
/* Single-row variant for packed YUV output (no vertical interpolation).
 * Clears index, then reads chroma from operand %2 (V at displacement VOF)
 * and luma from operand %0, shifting every word right arithmetically by 7.
 * Output: mm3 = U, mm4 = V, mm1 = Y1, mm7 = Y2. Parameter c is unused in
 * the visible lines.
 * NOTE(review): the loop label is not visible in this excerpt. */
511 #define REAL_YSCALEYUV2PACKED1(index, c) \
512 "xor "#index", "#index" \n\t"\
515 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
516 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
517 "psraw $7, %%mm3 \n\t" \
518 "psraw $7, %%mm4 \n\t" \
519 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
520 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
521 "psraw $7, %%mm1 \n\t" \
522 "psraw $7, %%mm7 \n\t" \
/* Wrapper that macro-expands index and c before they are stringified. */
524 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
/* Single-row variant for RGB output (no vertical interpolation).
 * Clears index, reads chroma from operand %2 (V at displacement VOF) and
 * luma from operand %0, shifts every word right arithmetically by 4 and
 * applies the colour matrix stored relative to register c.
 * Output: mm2 = B, mm4 = G, mm5 = R, each 8 unsigned saturated bytes.
 * NOTE(review): the loop label is not visible in this excerpt. */
526 #define REAL_YSCALEYUV2RGB1(index, c) \
527 "xor "#index", "#index" \n\t"\
530 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
531 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
532 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] >>4*/\
533 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] >>4*/\
534 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
535 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
536 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
537 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
538 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
539 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
540 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
541 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
542 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
543 "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
544 "psraw $4, %%mm7 \n\t" /* buf0[eax] >>4*/\
545 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
546 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
547 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
548 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
549 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
550 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
551 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
552 "paddw %%mm3, %%mm4 \n\t"\
553 "movq %%mm2, %%mm0 \n\t"\
554 "movq %%mm5, %%mm6 \n\t"\
555 "movq %%mm4, %%mm3 \n\t"\
556 "punpcklwd %%mm2, %%mm2 \n\t"\
557 "punpcklwd %%mm5, %%mm5 \n\t"\
558 "punpcklwd %%mm4, %%mm4 \n\t"\
559 "paddw %%mm1, %%mm2 \n\t"\
560 "paddw %%mm1, %%mm5 \n\t"\
561 "paddw %%mm1, %%mm4 \n\t"\
562 "punpckhwd %%mm0, %%mm0 \n\t"\
563 "punpckhwd %%mm6, %%mm6 \n\t"\
564 "punpckhwd %%mm3, %%mm3 \n\t"\
565 "paddw %%mm7, %%mm0 \n\t"\
566 "paddw %%mm7, %%mm6 \n\t"\
567 "paddw %%mm7, %%mm3 \n\t"\
568 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
569 "packuswb %%mm0, %%mm2 \n\t"\
570 "packuswb %%mm6, %%mm5 \n\t"\
571 "packuswb %%mm3, %%mm4 \n\t"\
/* Wrapper that macro-expands index and c before they are stringified. */
573 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
/* Packed YUV output with chroma averaged over two rows and luma taken from
 * a single row. Clears index; chroma from operands %2 and %3 is summed and
 * shifted right logically by 8 (i.e. the two-row average shifted by 7);
 * luma from operand %0 is shifted right arithmetically by 7.
 * Output: mm3 = U, mm4 = V, mm1 = Y1, mm7 = Y2. Parameter c is unused in
 * the visible lines.
 * NOTE(review): the loop label is not visible in this excerpt. */
575 #define REAL_YSCALEYUV2PACKED1b(index, c) \
576 "xor "#index", "#index" \n\t"\
579 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
580 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
581 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
582 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
583 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
584 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
585 "psrlw $8, %%mm3 \n\t" \
586 "psrlw $8, %%mm4 \n\t" \
587 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
588 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
589 "psraw $7, %%mm1 \n\t" \
590 "psraw $7, %%mm7 \n\t"
/* Wrapper that macro-expands index and c before they are stringified. */
591 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
593 // do vertical chrominance interpolation
/* RGB output with chroma averaged over two rows and luma taken from a
 * single row. Clears index; chroma from operands %2 and %3 is summed and
 * shifted right logically by 5 (the existing FIXME notes the sum may
 * overflow); luma from operand %0 is shifted right arithmetically by 4.
 * The colour matrix stored relative to register c is then applied.
 * Output: mm2 = B, mm4 = G, mm5 = R, each 8 unsigned saturated bytes.
 * NOTE(review): the loop label is not visible in this excerpt. */
594 #define REAL_YSCALEYUV2RGB1b(index, c) \
595 "xor "#index", "#index" \n\t"\
598 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
599 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
600 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
601 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
602 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
603 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
604 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
605 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
606 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
607 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
608 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
609 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
610 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
611 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
612 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
613 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
614 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
615 "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
616 "psraw $4, %%mm7 \n\t" /* buf0[eax] >>4*/\
617 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
618 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
619 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
620 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
621 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
622 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
623 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
624 "paddw %%mm3, %%mm4 \n\t"\
625 "movq %%mm2, %%mm0 \n\t"\
626 "movq %%mm5, %%mm6 \n\t"\
627 "movq %%mm4, %%mm3 \n\t"\
628 "punpcklwd %%mm2, %%mm2 \n\t"\
629 "punpcklwd %%mm5, %%mm5 \n\t"\
630 "punpcklwd %%mm4, %%mm4 \n\t"\
631 "paddw %%mm1, %%mm2 \n\t"\
632 "paddw %%mm1, %%mm5 \n\t"\
633 "paddw %%mm1, %%mm4 \n\t"\
634 "punpckhwd %%mm0, %%mm0 \n\t"\
635 "punpckhwd %%mm6, %%mm6 \n\t"\
636 "punpckhwd %%mm3, %%mm3 \n\t"\
637 "paddw %%mm7, %%mm0 \n\t"\
638 "paddw %%mm7, %%mm6 \n\t"\
639 "paddw %%mm7, %%mm3 \n\t"\
640 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
641 "packuswb %%mm0, %%mm2 \n\t"\
642 "packuswb %%mm6, %%mm5 \n\t"\
643 "packuswb %%mm3, %%mm4 \n\t"\
/* Wrapper that macro-expands index and c before they are stringified. */
645 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
/* Interleave four 8-byte component registers into 8 four-byte pixels and
 * store them.
 *
 * dst, dstw, index - destination base, pixel count, running pixel index
 * b, g, r, a       - registers holding the component bytes; within each
 *                    pixel b ends up in the lowest byte and a in the highest
 * q0, q2, q3, t    - scratch registers
 *
 * b and r are overwritten. 32 bytes are stored at dst + index*4, index is
 * advanced by 8 and compared with dstw.
 * NOTE(review): the branch following the compare is not visible in this
 * excerpt. */
647 #define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
648 "movq "#b", "#q2" \n\t" /* B */\
649 "movq "#r", "#t" \n\t" /* R */\
650 "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
651 "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
652 "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
653 "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
654 "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
655 "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
656 "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
657 "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
658 "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
659 "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
661 MOVNTQ( q0, (dst, index, 4))\
662 MOVNTQ( b, 8(dst, index, 4))\
663 MOVNTQ( q2, 16(dst, index, 4))\
664 MOVNTQ( q3, 24(dst, index, 4))\
666 "add $8, "#index" \n\t"\
667 "cmp "#dstw", "#index" \n\t"\
/* Wrapper that macro-expands the arguments before they are stringified. */
669 #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
/* Pack 8 pixels from mm2 = B, mm4 = G, mm5 = R into 16-bit pixels and store
 * 16 bytes at dst + index*2; index is advanced by 8 and compared with dstw.
 * Components are masked with the constants bF8 (B, R) and bFC (G); B is
 * shifted right by 3, G is widened against mm7 and shifted left by 3, and
 * B is interleaved with R before the parts are ORed together.
 * NOTE(review): mm7 is used as the zero source for widening G, so it is
 * presumably expected to be 0 on entry - confirm against the callers. The
 * mask values themselves are defined outside this excerpt. */
671 #define REAL_WRITERGB16(dst, dstw, index) \
672 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
673 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
674 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
675 "psrlq $3, %%mm2 \n\t"\
677 "movq %%mm2, %%mm1 \n\t"\
678 "movq %%mm4, %%mm3 \n\t"\
680 "punpcklbw %%mm7, %%mm3 \n\t"\
681 "punpcklbw %%mm5, %%mm2 \n\t"\
682 "punpckhbw %%mm7, %%mm4 \n\t"\
683 "punpckhbw %%mm5, %%mm1 \n\t"\
685 "psllq $3, %%mm3 \n\t"\
686 "psllq $3, %%mm4 \n\t"\
688 "por %%mm3, %%mm2 \n\t"\
689 "por %%mm4, %%mm1 \n\t"\
691 MOVNTQ(%%mm2, (dst, index, 2))\
692 MOVNTQ(%%mm1, 8(dst, index, 2))\
694 "add $8, "#index" \n\t"\
695 "cmp "#dstw", "#index" \n\t"\
/* Wrapper that macro-expands the arguments before they are stringified. */
697 #define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
/* Pack 8 pixels from mm2 = B, mm4 = G, mm5 = R into 16-bit pixels and store
 * 16 bytes at dst + index*2; index is advanced by 8 and compared with dstw.
 * Differs from REAL_WRITERGB16 in that all three components are masked with
 * bF8, R is additionally shifted right by 1 and G is shifted left by 2.
 * NOTE(review): mm7 is used as the zero source for widening G, so it is
 * presumably expected to be 0 on entry - confirm against the callers. */
699 #define REAL_WRITERGB15(dst, dstw, index) \
700 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
701 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
702 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
703 "psrlq $3, %%mm2 \n\t"\
704 "psrlq $1, %%mm5 \n\t"\
706 "movq %%mm2, %%mm1 \n\t"\
707 "movq %%mm4, %%mm3 \n\t"\
709 "punpcklbw %%mm7, %%mm3 \n\t"\
710 "punpcklbw %%mm5, %%mm2 \n\t"\
711 "punpckhbw %%mm7, %%mm4 \n\t"\
712 "punpckhbw %%mm5, %%mm1 \n\t"\
714 "psllq $2, %%mm3 \n\t"\
715 "psllq $2, %%mm4 \n\t"\
717 "por %%mm3, %%mm2 \n\t"\
718 "por %%mm4, %%mm1 \n\t"\
720 MOVNTQ(%%mm2, (dst, index, 2))\
721 MOVNTQ(%%mm1, 8(dst, index, 2))\
723 "add $8, "#index" \n\t"\
724 "cmp "#dstw", "#index" \n\t"\
/* Wrapper that macro-expands the arguments before they are stringified. */
726 #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
/* Pack 8 pixels into 24 bytes of 3-byte pixels (mask-based variant).
 * Input: mm2 = B, mm4 = G, mm5 = R, mm7 = 0. The components are first
 * widened to four 0RGB0RGB registers, then the padding bytes are squeezed
 * out with shifts and the bm00000111 / bm11111000 / bm00001111 masks.
 * Stores 24 bytes at dst, advances dst by 24 and index by 8, and compares
 * index with dstw.
 * NOTE(review): the branch following the compare is not visible in this
 * excerpt. */
728 #define WRITEBGR24OLD(dst, dstw, index) \
729 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
730 "movq %%mm2, %%mm1 \n\t" /* B */\
731 "movq %%mm5, %%mm6 \n\t" /* R */\
732 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
733 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
734 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
735 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
736 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
737 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
738 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
739 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
740 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
741 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
743 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
744 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
745 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
746 "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
747 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
748 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
749 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
750 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
752 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
753 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
754 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
755 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
756 "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
757 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
758 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
759 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
760 "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
761 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
762 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
763 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
764 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
766 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
767 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
768 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
769 "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
770 "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
771 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
772 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
773 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
775 MOVNTQ(%%mm0, (dst))\
776 MOVNTQ(%%mm2, 8(dst))\
777 MOVNTQ(%%mm3, 16(dst))\
778 "add $24, "#dst" \n\t"\
780 "add $8, "#index" \n\t"\
781 "cmp "#dstw", "#index" \n\t"\
/* Pack 8 pixels into 24 bytes of 3-byte pixels (shift/unpack variant,
 * no mask constants).
 * Input: mm2 = B, mm4 = G, mm5 = R, mm7 = 0. The components are widened to
 * four 0RGB0RGB registers; the padding bytes are then removed with
 * psllq $40 + punpckhdq and the three output quadwords are assembled with
 * further shifts and ORs.
 * mm7 is overwritten (it receives a copy of mm3), so it no longer holds
 * zero afterwards.
 * Stores 24 bytes at dst, advances dst by 24 and index by 8, and compares
 * index with dstw.
 * NOTE(review): the branch following the compare is not visible in this
 * excerpt. */
784 #define WRITEBGR24MMX(dst, dstw, index) \
785 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
786 "movq %%mm2, %%mm1 \n\t" /* B */\
787 "movq %%mm5, %%mm6 \n\t" /* R */\
788 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
789 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
790 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
791 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
792 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
793 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
794 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
795 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
796 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
797 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
799 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
800 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
801 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
802 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
804 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
805 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
806 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
807 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
809 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
810 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
811 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
812 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
814 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
815 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
816 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
817 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
818 MOVNTQ(%%mm0, (dst))\
820 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
821 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
822 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
823 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
824 MOVNTQ(%%mm6, 8(dst))\
826 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
827 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
828 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
829 MOVNTQ(%%mm5, 16(dst))\
831 "add $24, "#dst" \n\t"\
833 "add $8, "#index" \n\t"\
834 "cmp "#dstw", "#index" \n\t"\
/* Pack 8 pixels into 24 bytes of 3-byte pixels using pshufw.
 * Input: mm2 = B, mm4 = G, mm5 = R. For each of the three output quadwords
 * the component words are replicated with pshufw, the wanted bytes are
 * selected with the ff_M24A / ff_M24B / ff_M24C masks and the parts are
 * ORed together.
 * mm0 and mm7 are loaded with ff_M24A and ff_M24C here, so despite the
 * inherited "mm7=0" remark below this variant does not read mm7 on entry
 * and leaves it non-zero. mm4 is shifted right by 8 in the middle of the
 * sequence.
 * Stores 24 bytes at dst, advances dst by 24 and index by 8, and compares
 * index with dstw.
 * NOTE(review): the branch following the compare is not visible in this
 * excerpt. */
837 #define WRITEBGR24MMX2(dst, dstw, index) \
838 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
839 "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
840 "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
841 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
842 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
843 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
845 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
846 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
847 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
849 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
850 "por %%mm1, %%mm6 \n\t"\
851 "por %%mm3, %%mm6 \n\t"\
852 MOVNTQ(%%mm6, (dst))\
854 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
855 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
856 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
857 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
859 "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
860 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
861 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
863 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
864 "por %%mm3, %%mm6 \n\t"\
865 MOVNTQ(%%mm6, 8(dst))\
867 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B7 B6 */\
868 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
869 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
871 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
872 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
873 "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
875 "por %%mm1, %%mm3 \n\t"\
876 "por %%mm3, %%mm6 \n\t"\
877 MOVNTQ(%%mm6, 16(dst))\
879 "add $24, "#dst" \n\t"\
881 "add $8, "#index" \n\t"\
882 "cmp "#dstw", "#index" \n\t"\
/* WRITEBGR24 resolves to one of the 24-bit writers defined above.
 * NOTE(review): the preprocessor conditional choosing between the two
 * definitions is not visible in this excerpt. */
887 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
890 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
/*
 * REAL_WRITEYUY2(dst, dstw, index): inline-asm string fragment that packs
 * the word registers mm1/mm7 and mm3/mm4 to bytes, interleaves them and
 * stores 16 bytes with two MOVNTQ stores at (dst, index, 2) and
 * 8(dst, index, 2). index is then advanced by 8 and compared with dstw.
 *
 * Step by step:
 *   - mm3 and mm4 are each packed to bytes, then byte-interleaved into mm3;
 *   - mm1 and mm7 are packed together into mm1, which is copied to mm7;
 *   - mm1/mm7 receive the low/high byte-interleave of that value with mm3.
 * NOTE(review): presumably mm1/mm7 carry luma and mm3/mm4 carry U/V on
 * entry - confirm against the YSCALEYUV2PACKED* macros.
 *
 * WRITEYUY2 is a forwarding wrapper: the extra macro level lets the
 * arguments be macro-expanded before REAL_WRITEYUY2 stringifies them
 * with '#'.
 */
893 #define REAL_WRITEYUY2(dst, dstw, index) \
894 "packuswb %%mm3, %%mm3 \n\t"\
895 "packuswb %%mm4, %%mm4 \n\t"\
896 "packuswb %%mm7, %%mm1 \n\t"\
897 "punpcklbw %%mm4, %%mm3 \n\t"\
898 "movq %%mm1, %%mm7 \n\t"\
899 "punpcklbw %%mm3, %%mm1 \n\t"\
900 "punpckhbw %%mm3, %%mm7 \n\t"\
902 MOVNTQ(%%mm1, (dst, index, 2))\
903 MOVNTQ(%%mm7, 8(dst, index, 2))\
905 "add $8, "#index" \n\t"\
906 "cmp "#dstw", "#index" \n\t"\
908 #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
/*
 * Vertical filtering of luma and chroma lines into separate output planes
 * (dest, uDest, vDest).
 *
 * Visible structure:
 *   - If SWS_BITEXACT is not set in c->flags, the YSCALEYUV2YV12X macros
 *     are used; the _ACCURATE variants are chosen when SWS_ACCURATE_RND
 *     is set.
 *       * uDest: source offset "0",  CHR_MMX_FILTER_OFFSET, width chrDstW
 *       * vDest: source offset VOF,  CHR_MMX_FILTER_OFFSET, width chrDstW
 *       * dest : source offset "0",  LUM_MMX_FILTER_OFFSET, width dstW
 *   - Otherwise the work is forwarded to yuv2yuvX_altivec_real() or to
 *     yuv2yuvXinC() with the filter/source arguments unchanged.
 * NOTE(review): the preprocessor conditionals and the guards around the
 * chroma calls are not visible in this excerpt; the trailing
 * "#endif //!HAVE_ALTIVEC" suggests the last two calls are compile-time
 * alternatives - confirm.
 */
911 static inline void RENAME(yuv2yuvX
)(SwsContext
*c
, int16_t *lumFilter
, int16_t **lumSrc
, int lumFilterSize
,
912 int16_t *chrFilter
, int16_t **chrSrc
, int chrFilterSize
,
913 uint8_t *dest
, uint8_t *uDest
, uint8_t *vDest
, long dstW
, long chrDstW
)
916 if(!(c
->flags
& SWS_BITEXACT
)){
917 if (c
->flags
& SWS_ACCURATE_RND
){
/* accurate-rounding variants: U plane, V plane, then luma */
919 YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET
, uDest
, chrDstW
)
920 YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF
), CHR_MMX_FILTER_OFFSET
, vDest
, chrDstW
)
923 YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET
, dest
, dstW
)
/* default variants: same plane order and arguments */
926 YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET
, uDest
, chrDstW
)
927 YSCALEYUV2YV12X(AV_STRINGIFY(VOF
), CHR_MMX_FILTER_OFFSET
, vDest
, chrDstW
)
930 YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET
, dest
, dstW
)
936 yuv2yuvX_altivec_real(lumFilter
, lumSrc
, lumFilterSize
,
937 chrFilter
, chrSrc
, chrFilterSize
,
938 dest
, uDest
, vDest
, dstW
, chrDstW
);
940 yuv2yuvXinC(lumFilter
, lumSrc
, lumFilterSize
,
941 chrFilter
, chrSrc
, chrFilterSize
,
942 dest
, uDest
, vDest
, dstW
, chrDstW
);
943 #endif //!HAVE_ALTIVEC
/*
 * Thin wrapper: forwards every argument except the SwsContext pointer c
 * to yuv2nv12XinC(), in the same order. c is not referenced in the
 * visible body.
 */
946 static inline void RENAME(yuv2nv12X
)(SwsContext
*c
, int16_t *lumFilter
, int16_t **lumSrc
, int lumFilterSize
,
947 int16_t *chrFilter
, int16_t **chrSrc
, int chrFilterSize
,
948 uint8_t *dest
, uint8_t *uDest
, int dstW
, int chrDstW
, int dstFormat
)
950 yuv2nv12XinC(lumFilter
, lumSrc
, lumFilterSize
,
951 chrFilter
, chrSrc
, chrFilterSize
,
952 dest
, uDest
, dstW
, chrDstW
, dstFormat
);
/*
 * Converts one line of 16-bit intermediate luma/chroma samples to 8-bit
 * output planes without vertical filtering.
 *
 * Visible asm path (SWS_BITEXACT not set): per-plane tables hold the
 * source pointer, destination pointer and width; p is 3 when uDest is
 * non-NULL and 1 otherwise. The asm receives src[p] and
 * dst[p] + counter[p], i.e. pointers already advanced to the end of the
 * plane. The V samples are taken VOFW elements after the U samples in
 * chrSrc.
 *
 * Visible scalar path: each sample is computed as (x + 64) >> 7; for u and
 * v an upper clamp to 255 is visible (the matching lower clamp and the
 * stores are on lines not shown here).
 *
 * NOTE(review): src[] is declared as uint8_t* but is initialised from
 * int16_t* expressions (lumSrc + dstW, chrSrc + chrDstW, ...). The offsets
 * are therefore applied in int16_t elements before the implicit pointer
 * conversion, which compilers diagnose as an incompatible pointer type.
 */
955 static inline void RENAME(yuv2yuv1
)(SwsContext
*c
, int16_t *lumSrc
, int16_t *chrSrc
,
956 uint8_t *dest
, uint8_t *uDest
, uint8_t *vDest
, long dstW
, long chrDstW
)
960 if(!(c
->flags
& SWS_BITEXACT
)){
/* number of planes to process: Y only, or Y + U + V */
961 long p
= uDest
? 3 : 1;
962 uint8_t *src
[3]= {lumSrc
+ dstW
, chrSrc
+ chrDstW
, chrSrc
+ VOFW
+ chrDstW
};
963 uint8_t *dst
[3]= {dest
, uDest
, vDest
};
964 long counter
[3] = {dstW
, chrDstW
, chrDstW
};
966 if (c
->flags
& SWS_ACCURATE_RND
){
969 YSCALEYUV2YV121_ACCURATE
970 :: "r" (src
[p
]), "r" (dst
[p
] + counter
[p
]),
979 :: "r" (src
[p
]), "r" (dst
[p
] + counter
[p
]),
988 for (i
=0; i
<dstW
; i
++)
/* +64 rounds to nearest before dropping the 7 low bits */
990 int val
= (lumSrc
[i
]+64)>>7;
1001 for (i
=0; i
<chrDstW
; i
++)
1003 int u
=(chrSrc
[i
]+64)>>7;
1004 int v
=(chrSrc
[i
+ VOFW
]+64)>>7;
1008 else if (u
>255) u
=255;
1010 else if (v
>255) v
=255;
1020 * vertical scale YV12 to RGB
1022 static inline void RENAME(yuv2packedX
)(SwsContext
*c
, int16_t *lumFilter
, int16_t **lumSrc
, int lumFilterSize
,
1023 int16_t *chrFilter
, int16_t **chrSrc
, int chrFilterSize
,
1024 uint8_t *dest
, long dstW
, long dstY
)
1028 if(!(c
->flags
& SWS_BITEXACT
)){
1029 if (c
->flags
& SWS_ACCURATE_RND
){
1030 switch(c
->dstFormat
){
1032 YSCALEYUV2PACKEDX_ACCURATE
1034 "pxor %%mm7, %%mm7 \n\t"
1035 WRITEBGR32(%4, %5, %%REGa
, %%mm2
, %%mm4
, %%mm5
, %%mm7
, %%mm0
, %%mm1
, %%mm3
, %%mm6
)
1037 YSCALEYUV2PACKEDX_END
1040 YSCALEYUV2PACKEDX_ACCURATE
1042 "pxor %%mm7, %%mm7 \n\t"
1043 "lea (%%"REG_a
", %%"REG_a
", 2), %%"REG_c
"\n\t" //FIXME optimize
1044 "add %4, %%"REG_c
" \n\t"
1045 WRITEBGR24(%%REGc
, %5, %%REGa
)
1048 :: "r" (&c
->redDither
),
1049 "m" (dummy
), "m" (dummy
), "m" (dummy
),
1050 "r" (dest
), "m" (dstW
)
1051 : "%"REG_a
, "%"REG_c
, "%"REG_d
, "%"REG_S
1054 case PIX_FMT_RGB555
:
1055 YSCALEYUV2PACKEDX_ACCURATE
1057 "pxor %%mm7, %%mm7 \n\t"
1058 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1060 "paddusb "BLUE_DITHER
"(%0), %%mm2\n\t"
1061 "paddusb "GREEN_DITHER
"(%0), %%mm4\n\t"
1062 "paddusb "RED_DITHER
"(%0), %%mm5\n\t"
1065 WRITERGB15(%4, %5, %%REGa
)
1066 YSCALEYUV2PACKEDX_END
1068 case PIX_FMT_RGB565
:
1069 YSCALEYUV2PACKEDX_ACCURATE
1071 "pxor %%mm7, %%mm7 \n\t"
1072 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1074 "paddusb "BLUE_DITHER
"(%0), %%mm2\n\t"
1075 "paddusb "GREEN_DITHER
"(%0), %%mm4\n\t"
1076 "paddusb "RED_DITHER
"(%0), %%mm5\n\t"
1079 WRITERGB16(%4, %5, %%REGa
)
1080 YSCALEYUV2PACKEDX_END
1082 case PIX_FMT_YUYV422
:
1083 YSCALEYUV2PACKEDX_ACCURATE
1084 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1086 "psraw $3, %%mm3 \n\t"
1087 "psraw $3, %%mm4 \n\t"
1088 "psraw $3, %%mm1 \n\t"
1089 "psraw $3, %%mm7 \n\t"
1090 WRITEYUY2(%4, %5, %%REGa
)
1091 YSCALEYUV2PACKEDX_END
1095 switch(c
->dstFormat
)
1100 "pxor %%mm7, %%mm7 \n\t"
1101 WRITEBGR32(%4, %5, %%REGa
, %%mm2
, %%mm4
, %%mm5
, %%mm7
, %%mm0
, %%mm1
, %%mm3
, %%mm6
)
1102 YSCALEYUV2PACKEDX_END
1107 "pxor %%mm7, %%mm7 \n\t"
1108 "lea (%%"REG_a
", %%"REG_a
", 2), %%"REG_c
" \n\t" //FIXME optimize
1109 "add %4, %%"REG_c
" \n\t"
1110 WRITEBGR24(%%REGc
, %5, %%REGa
)
1112 :: "r" (&c
->redDither
),
1113 "m" (dummy
), "m" (dummy
), "m" (dummy
),
1114 "r" (dest
), "m" (dstW
)
1115 : "%"REG_a
, "%"REG_c
, "%"REG_d
, "%"REG_S
1118 case PIX_FMT_RGB555
:
1121 "pxor %%mm7, %%mm7 \n\t"
1122 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1124 "paddusb "BLUE_DITHER
"(%0), %%mm2 \n\t"
1125 "paddusb "GREEN_DITHER
"(%0), %%mm4 \n\t"
1126 "paddusb "RED_DITHER
"(%0), %%mm5 \n\t"
1129 WRITERGB15(%4, %5, %%REGa
)
1130 YSCALEYUV2PACKEDX_END
1132 case PIX_FMT_RGB565
:
1135 "pxor %%mm7, %%mm7 \n\t"
1136 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1138 "paddusb "BLUE_DITHER
"(%0), %%mm2 \n\t"
1139 "paddusb "GREEN_DITHER
"(%0), %%mm4 \n\t"
1140 "paddusb "RED_DITHER
"(%0), %%mm5 \n\t"
1143 WRITERGB16(%4, %5, %%REGa
)
1144 YSCALEYUV2PACKEDX_END
1146 case PIX_FMT_YUYV422
:
1148 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1150 "psraw $3, %%mm3 \n\t"
1151 "psraw $3, %%mm4 \n\t"
1152 "psraw $3, %%mm1 \n\t"
1153 "psraw $3, %%mm7 \n\t"
1154 WRITEYUY2(%4, %5, %%REGa
)
1155 YSCALEYUV2PACKEDX_END
1160 #endif /* HAVE_MMX */
1162 /* The following list of supported dstFormat values should
1163 match what's found in the body of altivec_yuv2packedX() */
1164 if (!(c
->flags
& SWS_BITEXACT
) &&
1165 (c
->dstFormat
==PIX_FMT_ABGR
|| c
->dstFormat
==PIX_FMT_BGRA
||
1166 c
->dstFormat
==PIX_FMT_BGR24
|| c
->dstFormat
==PIX_FMT_RGB24
||
1167 c
->dstFormat
==PIX_FMT_RGBA
|| c
->dstFormat
==PIX_FMT_ARGB
))
1168 altivec_yuv2packedX (c
, lumFilter
, lumSrc
, lumFilterSize
,
1169 chrFilter
, chrSrc
, chrFilterSize
,
1173 yuv2packedXinC(c
, lumFilter
, lumSrc
, lumFilterSize
,
1174 chrFilter
, chrSrc
, chrFilterSize
,
1179 * vertical bilinear scale YV12 to RGB
1181 static inline void RENAME(yuv2packed2
)(SwsContext
*c
, uint16_t *buf0
, uint16_t *buf1
, uint16_t *uvbuf0
, uint16_t *uvbuf1
,
1182 uint8_t *dest
, int dstW
, int yalpha
, int uvalpha
, int y
)
1184 int yalpha1
=4095- yalpha
;
1185 int uvalpha1
=4095-uvalpha
;
1189 if(!(c
->flags
& SWS_BITEXACT
)){
1190 switch(c
->dstFormat
)
1192 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1195 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1196 "mov %4, %%"REG_b
" \n\t"
1197 "push %%"REG_BP
" \n\t"
1198 YSCALEYUV2RGB(%%REGBP
, %5)
1199 "pxor %%mm7, %%mm7 \n\t"
1200 WRITEBGR32(%%REGb
, 8280(%5), %%REGBP
, %%mm2
, %%mm4
, %%mm5
, %%mm7
, %%mm0
, %%mm1
, %%mm3
, %%mm6
)
1201 "pop %%"REG_BP
" \n\t"
1202 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1204 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1210 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1211 "mov %4, %%"REG_b
" \n\t"
1212 "push %%"REG_BP
" \n\t"
1213 YSCALEYUV2RGB(%%REGBP
, %5)
1214 "pxor %%mm7, %%mm7 \n\t"
1215 WRITEBGR24(%%REGb
, 8280(%5), %%REGBP
)
1216 "pop %%"REG_BP
" \n\t"
1217 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1218 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1222 case PIX_FMT_RGB555
:
1224 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1225 "mov %4, %%"REG_b
" \n\t"
1226 "push %%"REG_BP
" \n\t"
1227 YSCALEYUV2RGB(%%REGBP
, %5)
1228 "pxor %%mm7, %%mm7 \n\t"
1229 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1231 "paddusb "BLUE_DITHER
"(%5), %%mm2 \n\t"
1232 "paddusb "GREEN_DITHER
"(%5), %%mm4 \n\t"
1233 "paddusb "RED_DITHER
"(%5), %%mm5 \n\t"
1236 WRITERGB15(%%REGb
, 8280(%5), %%REGBP
)
1237 "pop %%"REG_BP
" \n\t"
1238 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1240 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1244 case PIX_FMT_RGB565
:
1246 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1247 "mov %4, %%"REG_b
" \n\t"
1248 "push %%"REG_BP
" \n\t"
1249 YSCALEYUV2RGB(%%REGBP
, %5)
1250 "pxor %%mm7, %%mm7 \n\t"
1251 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1253 "paddusb "BLUE_DITHER
"(%5), %%mm2 \n\t"
1254 "paddusb "GREEN_DITHER
"(%5), %%mm4 \n\t"
1255 "paddusb "RED_DITHER
"(%5), %%mm5 \n\t"
1258 WRITERGB16(%%REGb
, 8280(%5), %%REGBP
)
1259 "pop %%"REG_BP
" \n\t"
1260 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1261 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1265 case PIX_FMT_YUYV422
:
1267 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1268 "mov %4, %%"REG_b
" \n\t"
1269 "push %%"REG_BP
" \n\t"
1270 YSCALEYUV2PACKED(%%REGBP
, %5)
1271 WRITEYUY2(%%REGb
, 8280(%5), %%REGBP
)
1272 "pop %%"REG_BP
" \n\t"
1273 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1274 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1282 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C
, YSCALE_YUV_2_PACKED2_C
, YSCALE_YUV_2_GRAY16_2_C
, YSCALE_YUV_2_MONO2_C
)
1286 * YV12 to RGB without scaling or interpolating
1288 static inline void RENAME(yuv2packed1
)(SwsContext
*c
, uint16_t *buf0
, uint16_t *uvbuf0
, uint16_t *uvbuf1
,
1289 uint8_t *dest
, int dstW
, int uvalpha
, int dstFormat
, int flags
, int y
)
1291 const int yalpha1
=0;
1294 uint16_t *buf1
= buf0
; //FIXME needed for RGB1/BGR1
1295 const int yalpha
= 4096; //FIXME ...
1297 if (flags
&SWS_FULL_CHR_H_INT
)
1299 RENAME(yuv2packed2
)(c
, buf0
, buf0
, uvbuf0
, uvbuf1
, dest
, dstW
, 0, uvalpha
, y
);
1304 if(!(flags
& SWS_BITEXACT
)){
1305 if (uvalpha
< 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1311 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1312 "mov %4, %%"REG_b
" \n\t"
1313 "push %%"REG_BP
" \n\t"
1314 YSCALEYUV2RGB1(%%REGBP
, %5)
1315 "pxor %%mm7, %%mm7 \n\t"
1316 WRITEBGR32(%%REGb
, 8280(%5), %%REGBP
, %%mm2
, %%mm4
, %%mm5
, %%mm7
, %%mm0
, %%mm1
, %%mm3
, %%mm6
)
1317 "pop %%"REG_BP
" \n\t"
1318 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1320 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1326 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1327 "mov %4, %%"REG_b
" \n\t"
1328 "push %%"REG_BP
" \n\t"
1329 YSCALEYUV2RGB1(%%REGBP
, %5)
1330 "pxor %%mm7, %%mm7 \n\t"
1331 WRITEBGR24(%%REGb
, 8280(%5), %%REGBP
)
1332 "pop %%"REG_BP
" \n\t"
1333 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1335 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1339 case PIX_FMT_RGB555
:
1341 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1342 "mov %4, %%"REG_b
" \n\t"
1343 "push %%"REG_BP
" \n\t"
1344 YSCALEYUV2RGB1(%%REGBP
, %5)
1345 "pxor %%mm7, %%mm7 \n\t"
1346 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1348 "paddusb "BLUE_DITHER
"(%5), %%mm2 \n\t"
1349 "paddusb "GREEN_DITHER
"(%5), %%mm4 \n\t"
1350 "paddusb "RED_DITHER
"(%5), %%mm5 \n\t"
1352 WRITERGB15(%%REGb
, 8280(%5), %%REGBP
)
1353 "pop %%"REG_BP
" \n\t"
1354 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1356 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1360 case PIX_FMT_RGB565
:
1362 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1363 "mov %4, %%"REG_b
" \n\t"
1364 "push %%"REG_BP
" \n\t"
1365 YSCALEYUV2RGB1(%%REGBP
, %5)
1366 "pxor %%mm7, %%mm7 \n\t"
1367 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1369 "paddusb "BLUE_DITHER
"(%5), %%mm2 \n\t"
1370 "paddusb "GREEN_DITHER
"(%5), %%mm4 \n\t"
1371 "paddusb "RED_DITHER
"(%5), %%mm5 \n\t"
1374 WRITERGB16(%%REGb
, 8280(%5), %%REGBP
)
1375 "pop %%"REG_BP
" \n\t"
1376 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1378 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1382 case PIX_FMT_YUYV422
:
1384 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1385 "mov %4, %%"REG_b
" \n\t"
1386 "push %%"REG_BP
" \n\t"
1387 YSCALEYUV2PACKED1(%%REGBP
, %5)
1388 WRITEYUY2(%%REGb
, 8280(%5), %%REGBP
)
1389 "pop %%"REG_BP
" \n\t"
1390 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1392 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1404 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1405 "mov %4, %%"REG_b
" \n\t"
1406 "push %%"REG_BP
" \n\t"
1407 YSCALEYUV2RGB1b(%%REGBP
, %5)
1408 "pxor %%mm7, %%mm7 \n\t"
1409 WRITEBGR32(%%REGb
, 8280(%5), %%REGBP
, %%mm2
, %%mm4
, %%mm5
, %%mm7
, %%mm0
, %%mm1
, %%mm3
, %%mm6
)
1410 "pop %%"REG_BP
" \n\t"
1411 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1413 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1419 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1420 "mov %4, %%"REG_b
" \n\t"
1421 "push %%"REG_BP
" \n\t"
1422 YSCALEYUV2RGB1b(%%REGBP
, %5)
1423 "pxor %%mm7, %%mm7 \n\t"
1424 WRITEBGR24(%%REGb
, 8280(%5), %%REGBP
)
1425 "pop %%"REG_BP
" \n\t"
1426 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1428 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1432 case PIX_FMT_RGB555
:
1434 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1435 "mov %4, %%"REG_b
" \n\t"
1436 "push %%"REG_BP
" \n\t"
1437 YSCALEYUV2RGB1b(%%REGBP
, %5)
1438 "pxor %%mm7, %%mm7 \n\t"
1439 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1441 "paddusb "BLUE_DITHER
"(%5), %%mm2 \n\t"
1442 "paddusb "GREEN_DITHER
"(%5), %%mm4 \n\t"
1443 "paddusb "RED_DITHER
"(%5), %%mm5 \n\t"
1445 WRITERGB15(%%REGb
, 8280(%5), %%REGBP
)
1446 "pop %%"REG_BP
" \n\t"
1447 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1449 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1453 case PIX_FMT_RGB565
:
1455 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1456 "mov %4, %%"REG_b
" \n\t"
1457 "push %%"REG_BP
" \n\t"
1458 YSCALEYUV2RGB1b(%%REGBP
, %5)
1459 "pxor %%mm7, %%mm7 \n\t"
1460 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1462 "paddusb "BLUE_DITHER
"(%5), %%mm2 \n\t"
1463 "paddusb "GREEN_DITHER
"(%5), %%mm4 \n\t"
1464 "paddusb "RED_DITHER
"(%5), %%mm5 \n\t"
1467 WRITERGB16(%%REGb
, 8280(%5), %%REGBP
)
1468 "pop %%"REG_BP
" \n\t"
1469 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1471 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1475 case PIX_FMT_YUYV422
:
1477 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1478 "mov %4, %%"REG_b
" \n\t"
1479 "push %%"REG_BP
" \n\t"
1480 YSCALEYUV2PACKED1b(%%REGBP
, %5)
1481 WRITEYUY2(%%REGb
, 8280(%5), %%REGBP
)
1482 "pop %%"REG_BP
" \n\t"
1483 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1485 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1492 #endif /* HAVE_MMX */
1495 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C
, YSCALE_YUV_2_PACKED1_C
, YSCALE_YUV_2_GRAY16_1_C
, YSCALE_YUV_2_MONO2_C
)
1497 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C
, YSCALE_YUV_2_PACKED1B_C
, YSCALE_YUV_2_GRAY16_1_C
, YSCALE_YUV_2_MONO2_C
)
1501 //FIXME yuy2* can read up to 7 samples too many
/*
 * Produces one output byte in dst for every two bytes of the packed source
 * line src.
 *
 * Visible MMX path: the asm has input operands only
 * (%0 = -width, %1 = src+width*2, %2 = dst+width). REG_a is loaded with
 * -width and advanced by 8 per iteration, so both pointers are indexed
 * from their ends. Each iteration loads 16 source bytes, ANDs both
 * registers with bm01010101, packs them to 8 bytes and stores them to dst.
 * NOTE(review): bm01010101 is defined elsewhere; presumably it keeps the
 * low byte of each 16-bit word - confirm.
 *
 * A scalar loop over i in [0, width) is also visible; its body is on a
 * line not shown in this excerpt.
 * The last parameter is not referenced in the visible code.
 */
1503 static inline void RENAME(yuy2ToY
)(uint8_t *dst
, uint8_t *src
, long width
, uint32_t *unused
)
1507 "movq "MANGLE(bm01010101
)", %%mm2 \n\t"
1508 "mov %0, %%"REG_a
" \n\t"
1510 "movq (%1, %%"REG_a
",2), %%mm0 \n\t"
1511 "movq 8(%1, %%"REG_a
",2), %%mm1 \n\t"
1512 "pand %%mm2, %%mm0 \n\t"
1513 "pand %%mm2, %%mm1 \n\t"
1514 "packuswb %%mm1, %%mm0 \n\t"
1515 "movq %%mm0, (%2, %%"REG_a
") \n\t"
1516 "add $8, %%"REG_a
" \n\t"
1518 : : "g" (-width
), "r" (src
+width
*2), "r" (dst
+width
)
1523 for (i
=0; i
<width
; i
++)
/*
 * De-interleaves the chroma bytes of a packed source line: for output
 * index i, dstU[i] = src1[4*i + 1] and dstV[i] = src1[4*i + 3] (as spelled
 * out by the visible scalar loop).
 *
 * Visible MMX path: the asm has input operands only
 * (%0 = -width, %1 = src1+width*4, %2 = dstU+width, %3 = dstV+width).
 * Per iteration 16 source bytes are loaded; psrlw $8 keeps the high byte
 * of every 16-bit word and packuswb packs those 8 bytes together. The
 * packed value is then split again: a second psrlw $8 yields the bytes
 * stored via %3 (dstV), and the AND with bm01010101 yields the bytes
 * stored via %2 (dstU). Four bytes are written to each plane and REG_a is
 * advanced by 4.
 *
 * src2 is only used in the assertion src1 == src2; the last parameter is
 * not referenced in the visible code.
 */
1528 static inline void RENAME(yuy2ToUV
)(uint8_t *dstU
, uint8_t *dstV
, uint8_t *src1
, uint8_t *src2
, long width
, uint32_t *unused
)
1532 "movq "MANGLE(bm01010101
)", %%mm4 \n\t"
1533 "mov %0, %%"REG_a
" \n\t"
1535 "movq (%1, %%"REG_a
",4), %%mm0 \n\t"
1536 "movq 8(%1, %%"REG_a
",4), %%mm1 \n\t"
1537 "psrlw $8, %%mm0 \n\t"
1538 "psrlw $8, %%mm1 \n\t"
1539 "packuswb %%mm1, %%mm0 \n\t"
1540 "movq %%mm0, %%mm1 \n\t"
1541 "psrlw $8, %%mm0 \n\t"
1542 "pand %%mm4, %%mm1 \n\t"
1543 "packuswb %%mm0, %%mm0 \n\t"
1544 "packuswb %%mm1, %%mm1 \n\t"
1545 "movd %%mm0, (%3, %%"REG_a
") \n\t"
1546 "movd %%mm1, (%2, %%"REG_a
") \n\t"
1547 "add $4, %%"REG_a
" \n\t"
1549 : : "g" (-width
), "r" (src1
+width
*4), "r" (dstU
+width
), "r" (dstV
+width
)
1554 for (i
=0; i
<width
; i
++)
1556 dstU
[i
]= src1
[4*i
+ 1];
1557 dstV
[i
]= src1
[4*i
+ 3];
1560 assert(src1
== src2
);
1563 /* This is almost identical to the previous one, and exists only because
1564 * calling yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
/*
 * Produces one output byte in dst for every two bytes of the packed source
 * line src, taking the high byte of each 16-bit word.
 *
 * Visible MMX path: the asm has input operands only
 * (%0 = -width, %1 = src+width*2, %2 = dst+width). REG_a starts at -width
 * and is advanced by 8 per iteration. Each iteration loads 16 source
 * bytes, shifts every word right by 8 (psrlw $8), packs the result to
 * 8 bytes and stores it to dst.
 *
 * A scalar loop over i in [0, width) is also visible; its body is on a
 * line not shown in this excerpt.
 * The last parameter is not referenced in the visible code.
 */
1565 static inline void RENAME(uyvyToY
)(uint8_t *dst
, uint8_t *src
, long width
, uint32_t *unused
)
1569 "mov %0, %%"REG_a
" \n\t"
1571 "movq (%1, %%"REG_a
",2), %%mm0 \n\t"
1572 "movq 8(%1, %%"REG_a
",2), %%mm1 \n\t"
1573 "psrlw $8, %%mm0 \n\t"
1574 "psrlw $8, %%mm1 \n\t"
1575 "packuswb %%mm1, %%mm0 \n\t"
1576 "movq %%mm0, (%2, %%"REG_a
") \n\t"
1577 "add $8, %%"REG_a
" \n\t"
1579 : : "g" (-width
), "r" (src
+width
*2), "r" (dst
+width
)
1584 for (i
=0; i
<width
; i
++)
/*
 * De-interleaves the chroma bytes of a packed source line: for output
 * index i, dstU[i] = src1[4*i + 0] and dstV[i] = src1[4*i + 2] (as spelled
 * out by the visible scalar loop).
 *
 * Visible MMX path: the asm has input operands only
 * (%0 = -width, %1 = src1+width*4, %2 = dstU+width, %3 = dstV+width).
 * Per iteration 16 source bytes are loaded and ANDed with bm01010101, then
 * packed to 8 bytes. The packed value is split: psrlw $8 yields the bytes
 * stored via %3 (dstV), and a second AND with bm01010101 yields the bytes
 * stored via %2 (dstU). Four bytes are written to each plane and REG_a is
 * advanced by 4.
 * NOTE(review): bm01010101 is defined elsewhere; presumably it keeps the
 * low byte of each 16-bit word - confirm.
 *
 * src2 is only used in the assertion src1 == src2; the last parameter is
 * not referenced in the visible code.
 */
1589 static inline void RENAME(uyvyToUV
)(uint8_t *dstU
, uint8_t *dstV
, uint8_t *src1
, uint8_t *src2
, long width
, uint32_t *unused
)
1593 "movq "MANGLE(bm01010101
)", %%mm4 \n\t"
1594 "mov %0, %%"REG_a
" \n\t"
1596 "movq (%1, %%"REG_a
",4), %%mm0 \n\t"
1597 "movq 8(%1, %%"REG_a
",4), %%mm1 \n\t"
1598 "pand %%mm4, %%mm0 \n\t"
1599 "pand %%mm4, %%mm1 \n\t"
1600 "packuswb %%mm1, %%mm0 \n\t"
1601 "movq %%mm0, %%mm1 \n\t"
1602 "psrlw $8, %%mm0 \n\t"
1603 "pand %%mm4, %%mm1 \n\t"
1604 "packuswb %%mm0, %%mm0 \n\t"
1605 "packuswb %%mm1, %%mm1 \n\t"
1606 "movd %%mm0, (%3, %%"REG_a
") \n\t"
1607 "movd %%mm1, (%2, %%"REG_a
") \n\t"
1608 "add $4, %%"REG_a
" \n\t"
1610 : : "g" (-width
), "r" (src1
+width
*4), "r" (dstU
+width
), "r" (dstV
+width
)
1615 for (i
=0; i
<width
; i
++)
1617 dstU
[i
]= src1
[4*i
+ 0];
1618 dstV
[i
]= src1
[4*i
+ 2];
1621 assert(src1
== src2
);
/*
 * BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S)
 * generates RENAME(name)(dst, src, width, unused): a per-pixel conversion
 * of packed RGB pixels of integer type `type` to one luma byte each.
 *
 * For every pixel the channels are extracted by shifting right first
 * (shr/shg/shb) and masking afterwards (maskr/maskg/maskb); the mask is
 * therefore applied to the already shifted value. The result is
 *     dst[i] = (RY*r + GY*g + BY*b + (33 << (S-1))) >> S
 * where 33 << (S-1) equals 16.5 in units of 1 << S, i.e. an offset of 16
 * plus one half for rounding.
 *
 * The instantiations that follow leave some channels at their original bit
 * position (shift 0 with a wide mask) and compensate by pre-shifting the
 * other coefficients (e.g. RY<<8 with an unshifted GY for 32-bit pixels,
 * RY<<11 / GY<<5 for 16-bit pixels) and by raising S by 8 or 7.
 * The `unused` parameter is not referenced in the visible macro body.
 */
1624 #define BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S)\
1625 static inline void RENAME(name)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)\
1628 for (i=0; i<width; i++)\
1630 int b= (((type*)src)[i]>>shb)&maskb;\
1631 int g= (((type*)src)[i]>>shg)&maskg;\
1632 int r= (((type*)src)[i]>>shr)&maskr;\
1634 dst[i]= (((RY)*r + (GY)*g + (BY)*b + (33<<((S)-1)))>>(S));\
1638 BGR2Y(uint32_t, bgr32ToY
,16, 0, 0, 0x00FF, 0xFF00, 0x00FF, RY
<< 8, GY
, BY
<< 8, RGB2YUV_SHIFT
+8)
1639 BGR2Y(uint32_t, rgb32ToY
, 0, 0,16, 0x00FF, 0xFF00, 0x00FF, RY
<< 8, GY
, BY
<< 8, RGB2YUV_SHIFT
+8)
1640 BGR2Y(uint16_t, bgr16ToY
, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RY
<<11, GY
<<5, BY
, RGB2YUV_SHIFT
+8)
1641 BGR2Y(uint16_t, bgr15ToY
, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RY
<<10, GY
<<5, BY
, RGB2YUV_SHIFT
+7)
1642 BGR2Y(uint16_t, rgb16ToY
, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RY
, GY
<<5, BY
<<11, RGB2YUV_SHIFT
+8)
1643 BGR2Y(uint16_t, rgb15ToY
, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RY
, GY
<<5, BY
<<10, RGB2YUV_SHIFT
+7)
/*
 * BGR2UV(type, name, shr, shg, shb, maskr, maskg, maskb,
 *        RU, GU, BU, RV, GV, BV, S)
 * generates two functions operating on packed RGB pixels of integer type
 * `type`:
 *
 *   RENAME(name)       - one U and one V byte per source pixel.
 *     Channels are extracted by masking first and shifting afterwards
 *     (the opposite order from BGR2Y), then
 *         dstU[i] = (RU*r + GU*g + BU*b + (257 << (S-1))) >> S
 *         dstV[i] = (RV*r + GV*g + BV*b + (257 << (S-1))) >> S
 *     where 257 << (S-1) equals 128.5 in units of 1 << S.
 *
 *   RENAME(name_half)  - one U and one V byte per pair of source pixels
 *     (2*i and 2*i+1). g is the sum of the two masked green fields; b and
 *     r are taken from pix0+pix1-g using masks widened by one bit
 *     (mask | 2*mask) so the carry of the addition is kept. The result is
 *     shifted by S+1 with a 257 << S bias.
 *     NOTE(review): lines between the extraction of r and the stores are
 *     not visible in this excerpt, so any further adjustment of g cannot
 *     be confirmed here.
 *
 * The `dummy` and `unused` parameters are not referenced in the visible
 * macro body. The instantiations below pre-shift coefficients and raise S
 * in the same way as the BGR2Y instantiations.
 */
1645 #define BGR2UV(type, name, shr, shg, shb, maskr, maskg, maskb, RU, GU, BU, RV, GV, BV, S)\
1646 static inline void RENAME(name)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\
1649 for (i=0; i<width; i++)\
1651 int b= (((type*)src)[i]&maskb)>>shb;\
1652 int g= (((type*)src)[i]&maskg)>>shg;\
1653 int r= (((type*)src)[i]&maskr)>>shr;\
1655 dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<((S)-1)))>>(S);\
1656 dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<((S)-1)))>>(S);\
1659 static inline void RENAME(name ## _half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\
1662 for (i=0; i<width; i++)\
1664 int pix0= ((type*)src)[2*i+0];\
1665 int pix1= ((type*)src)[2*i+1];\
1666 int g= (pix0&maskg)+(pix1&maskg);\
1667 int b= ((pix0+pix1-g)&(maskb|(2*maskb)))>>shb;\
1668 int r= ((pix0+pix1-g)&(maskr|(2*maskr)))>>shr;\
1672 dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<(S)))>>((S)+1);\
1673 dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<(S)))>>((S)+1);\
1677 BGR2UV(uint32_t, bgr32ToUV
,16, 0, 0, 0xFF0000, 0xFF00, 0x00FF, RU
<< 8, GU
, BU
<< 8, RV
<< 8, GV
, BV
<< 8, RGB2YUV_SHIFT
+8)
1678 BGR2UV(uint32_t, rgb32ToUV
, 0, 0,16, 0x00FF, 0xFF00, 0xFF0000, RU
<< 8, GU
, BU
<< 8, RV
<< 8, GV
, BV
<< 8, RGB2YUV_SHIFT
+8)
1679 BGR2UV(uint16_t, bgr16ToUV
, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RU
<<11, GU
<<5, BU
, RV
<<11, GV
<<5, BV
, RGB2YUV_SHIFT
+8)
1680 BGR2UV(uint16_t, bgr15ToUV
, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RU
<<10, GU
<<5, BU
, RV
<<10, GV
<<5, BV
, RGB2YUV_SHIFT
+7)
1681 BGR2UV(uint16_t, rgb16ToUV
, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RU
, GU
<<5, BU
<<11, RV
, GV
<<5, BV
<<11, RGB2YUV_SHIFT
+8)
1682 BGR2UV(uint16_t, rgb15ToUV
, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RU
, GU
<<5, BU
<<10, RV
, GV
<<5, BV
<<10, RGB2YUV_SHIFT
+7)
/*
 * MMX luma conversion for 24-bit packed pixels, shared by the BGR24 and
 * RGB24 entry points.
 *
 * srcFormat selects the coefficient pair loaded into mm5/mm6:
 * ff_bgr24toY1Coeff/ff_bgr24toY2Coeff when it equals PIX_FMT_BGR24,
 * ff_rgb24toY1Coeff/ff_rgb24toY2Coeff otherwise. ff_bgr24toYOffset is
 * loaded into mm4 in both cases.
 *
 * Visible loop body: four dword loads at byte offsets 0, 2, 6 and 8 from
 * operand %0 are widened from bytes to words, multiplied-and-added with
 * mm5/mm6 (pmaddwd), summed pairwise, biased by mm4, shifted right by 15
 * and packed down to bytes; 4 bytes are stored at (%1, REG_a) and REG_a is
 * advanced by 4. %1 is dst+width and %2 is -width, so REG_a starts at
 * -width.
 * NOTE(review): operand %0, its per-iteration increment and the loop
 * branch are on lines not shown in this excerpt; %0 is presumably the
 * source pointer - confirm.
 */
1685 static inline void RENAME(bgr24ToY_mmx
)(uint8_t *dst
, uint8_t *src
, long width
, int srcFormat
)
1688 if(srcFormat
== PIX_FMT_BGR24
){
1690 "movq "MANGLE(ff_bgr24toY1Coeff
)", %%mm5 \n\t"
1691 "movq "MANGLE(ff_bgr24toY2Coeff
)", %%mm6 \n\t"
1696 "movq "MANGLE(ff_rgb24toY1Coeff
)", %%mm5 \n\t"
1697 "movq "MANGLE(ff_rgb24toY2Coeff
)", %%mm6 \n\t"
1703 "movq "MANGLE(ff_bgr24toYOffset
)", %%mm4 \n\t"
1704 "mov %2, %%"REG_a
" \n\t"
1705 "pxor %%mm7, %%mm7 \n\t"
1707 PREFETCH
" 64(%0) \n\t"
1708 "movd (%0), %%mm0 \n\t"
1709 "movd 2(%0), %%mm1 \n\t"
1710 "movd 6(%0), %%mm2 \n\t"
1711 "movd 8(%0), %%mm3 \n\t"
1713 "punpcklbw %%mm7, %%mm0 \n\t"
1714 "punpcklbw %%mm7, %%mm1 \n\t"
1715 "punpcklbw %%mm7, %%mm2 \n\t"
1716 "punpcklbw %%mm7, %%mm3 \n\t"
1717 "pmaddwd %%mm5, %%mm0 \n\t"
1718 "pmaddwd %%mm6, %%mm1 \n\t"
1719 "pmaddwd %%mm5, %%mm2 \n\t"
1720 "pmaddwd %%mm6, %%mm3 \n\t"
1721 "paddd %%mm1, %%mm0 \n\t"
1722 "paddd %%mm3, %%mm2 \n\t"
1723 "paddd %%mm4, %%mm0 \n\t"
1724 "paddd %%mm4, %%mm2 \n\t"
1725 "psrad $15, %%mm0 \n\t"
1726 "psrad $15, %%mm2 \n\t"
1727 "packssdw %%mm2, %%mm0 \n\t"
1728 "packuswb %%mm0, %%mm0 \n\t"
1729 "movd %%mm0, (%1, %%"REG_a
") \n\t"
1730 "add $4, %%"REG_a
" \n\t"
1733 : "r" (dst
+width
), "g" (-width
)
/*
 * MMX chroma conversion for 24-bit packed pixels, shared by the BGR24 and
 * RGB24 entry points.
 *
 * Operand %4 is the memory operand ff_bgr24toUV[srcFormat ==
 * PIX_FMT_RGB24][0]; the asm reads four 8-byte coefficient rows from it at
 * %4, 8+%4, 16+%4 and 24+%4 (the last one is cached in mm6).
 * %1 = dstU+width, %2 = dstV+width, %3 = -width (loaded into REG_a).
 *
 * Visible loop body: dword loads at byte offsets 0/2 and 6/8 from operand
 * %0 are widened to words; each is multiplied with rows 0/1 (accumulated
 * in mm0 and mm1) and with rows 2/3 (accumulated in mm2 and mm4).
 * ff_bgr24toUVOffset is added, the sums are shifted right by 15 and packed
 * to bytes. mm0 is stored through %1 (dstU) and mm2 through %2 (dstV),
 * 4 bytes each, and REG_a is advanced by 4.
 * NOTE(review): operand %0, its per-iteration increment and the loop
 * branch are on lines not shown in this excerpt; %0 is presumably the
 * source pointer - confirm.
 */
1738 static inline void RENAME(bgr24ToUV_mmx
)(uint8_t *dstU
, uint8_t *dstV
, uint8_t *src
, long width
, int srcFormat
)
1741 "movq 24+%4, %%mm6 \n\t"
1742 "mov %3, %%"REG_a
" \n\t"
1743 "pxor %%mm7, %%mm7 \n\t"
1745 PREFETCH
" 64(%0) \n\t"
1746 "movd (%0), %%mm0 \n\t"
1747 "movd 2(%0), %%mm1 \n\t"
1748 "punpcklbw %%mm7, %%mm0 \n\t"
1749 "punpcklbw %%mm7, %%mm1 \n\t"
1750 "movq %%mm0, %%mm2 \n\t"
1751 "movq %%mm1, %%mm3 \n\t"
1752 "pmaddwd %4, %%mm0 \n\t"
1753 "pmaddwd 8+%4, %%mm1 \n\t"
1754 "pmaddwd 16+%4, %%mm2 \n\t"
1755 "pmaddwd %%mm6, %%mm3 \n\t"
1756 "paddd %%mm1, %%mm0 \n\t"
1757 "paddd %%mm3, %%mm2 \n\t"
1759 "movd 6(%0), %%mm1 \n\t"
1760 "movd 8(%0), %%mm3 \n\t"
1762 "punpcklbw %%mm7, %%mm1 \n\t"
1763 "punpcklbw %%mm7, %%mm3 \n\t"
1764 "movq %%mm1, %%mm4 \n\t"
1765 "movq %%mm3, %%mm5 \n\t"
1766 "pmaddwd %4, %%mm1 \n\t"
1767 "pmaddwd 8+%4, %%mm3 \n\t"
1768 "pmaddwd 16+%4, %%mm4 \n\t"
1769 "pmaddwd %%mm6, %%mm5 \n\t"
1770 "paddd %%mm3, %%mm1 \n\t"
1771 "paddd %%mm5, %%mm4 \n\t"
1773 "movq "MANGLE(ff_bgr24toUVOffset
)", %%mm3 \n\t"
1774 "paddd %%mm3, %%mm0 \n\t"
1775 "paddd %%mm3, %%mm2 \n\t"
1776 "paddd %%mm3, %%mm1 \n\t"
1777 "paddd %%mm3, %%mm4 \n\t"
1778 "psrad $15, %%mm0 \n\t"
1779 "psrad $15, %%mm2 \n\t"
1780 "psrad $15, %%mm1 \n\t"
1781 "psrad $15, %%mm4 \n\t"
1782 "packssdw %%mm1, %%mm0 \n\t"
1783 "packssdw %%mm4, %%mm2 \n\t"
1784 "packuswb %%mm0, %%mm0 \n\t"
1785 "packuswb %%mm2, %%mm2 \n\t"
1786 "movd %%mm0, (%1, %%"REG_a
") \n\t"
1787 "movd %%mm2, (%2, %%"REG_a
") \n\t"
1788 "add $4, %%"REG_a
" \n\t"
1791 : "r" (dstU
+width
), "r" (dstV
+width
), "g" (-width
), "m"(ff_bgr24toUV
[srcFormat
== PIX_FMT_RGB24
][0])
/*
 * Luma conversion for a line of 24-bit BGR pixels.
 *
 * Two implementations are visible: a call to RENAME(bgr24ToY_mmx) with
 * PIX_FMT_BGR24, and a scalar loop computing
 *     dst[i] = (RY*r + GY*g + BY*b + (33 << (RGB2YUV_SHIFT-1)))
 *              >> RGB2YUV_SHIFT
 * NOTE(review): the trailing "#endif HAVE_MMX" suggests these are
 * compile-time alternatives; the opening conditional and the loads of
 * b/g/r are on lines not shown in this excerpt.
 * The last parameter is not referenced in the visible code.
 */
1797 static inline void RENAME(bgr24ToY
)(uint8_t *dst
, uint8_t *src
, long width
, uint32_t *unused
)
1800 RENAME(bgr24ToY_mmx
)(dst
, src
, width
, PIX_FMT_BGR24
);
1803 for (i
=0; i
<width
; i
++)
1809 dst
[i
]= ((RY
*r
+ GY
*g
+ BY
*b
+ (33<<(RGB2YUV_SHIFT
-1)))>>RGB2YUV_SHIFT
);
1811 #endif /* HAVE_MMX */
/*
 * Chroma conversion for a line of 24-bit BGR pixels, one U and one V byte
 * per source pixel.
 *
 * Two implementations are visible: a call to RENAME(bgr24ToUV_mmx) with
 * PIX_FMT_BGR24, and a scalar loop that reads b, g, r from bytes 3*i+0,
 * 3*i+1, 3*i+2 of src1 and computes
 *     dstU[i] = (RU*r + GU*g + BU*b + (257 << (RGB2YUV_SHIFT-1)))
 *               >> RGB2YUV_SHIFT
 * and the same with RV/GV/BV for dstV[i].
 * NOTE(review): the "#endif HAVE_MMX" suggests these are compile-time
 * alternatives; the opening conditional is not shown in this excerpt.
 *
 * src2 is only used in the assertion src1 == src2; the last parameter is
 * not referenced in the visible code.
 */
1814 static inline void RENAME(bgr24ToUV
)(uint8_t *dstU
, uint8_t *dstV
, uint8_t *src1
, uint8_t *src2
, long width
, uint32_t *unused
)
1817 RENAME(bgr24ToUV_mmx
)(dstU
, dstV
, src1
, width
, PIX_FMT_BGR24
);
1820 for (i
=0; i
<width
; i
++)
1822 int b
= src1
[3*i
+ 0];
1823 int g
= src1
[3*i
+ 1];
1824 int r
= src1
[3*i
+ 2];
1826 dstU
[i
]= (RU
*r
+ GU
*g
+ BU
*b
+ (257<<(RGB2YUV_SHIFT
-1)))>>RGB2YUV_SHIFT
;
1827 dstV
[i
]= (RV
*r
+ GV
*g
+ BV
*b
+ (257<<(RGB2YUV_SHIFT
-1)))>>RGB2YUV_SHIFT
;
1829 #endif /* HAVE_MMX */
1830 assert(src1
== src2
);
/*
 * Chroma conversion for 24-bit BGR pixels with 2:1 horizontal reduction:
 * one U and one V byte per pair of adjacent source pixels.
 *
 * For output index i the two pixels start at bytes 6*i and 6*i+3 of src1;
 * b, g and r are the sums of the two corresponding channel bytes. Because
 * the channel values are sums of two samples, the result is shifted by
 * RGB2YUV_SHIFT+1 and the bias is 257 << RGB2YUV_SHIFT.
 *
 * src2 is only used in the assertion src1 == src2; the last parameter is
 * not referenced in the visible code.
 */
1833 static inline void RENAME(bgr24ToUV_half
)(uint8_t *dstU
, uint8_t *dstV
, uint8_t *src1
, uint8_t *src2
, long width
, uint32_t *unused
)
1836 for (i
=0; i
<width
; i
++)
1838 int b
= src1
[6*i
+ 0] + src1
[6*i
+ 3];
1839 int g
= src1
[6*i
+ 1] + src1
[6*i
+ 4];
1840 int r
= src1
[6*i
+ 2] + src1
[6*i
+ 5];
1842 dstU
[i
]= (RU
*r
+ GU
*g
+ BU
*b
+ (257<<RGB2YUV_SHIFT
))>>(RGB2YUV_SHIFT
+1);
1843 dstV
[i
]= (RV
*r
+ GV
*g
+ BV
*b
+ (257<<RGB2YUV_SHIFT
))>>(RGB2YUV_SHIFT
+1);
1845 assert(src1
== src2
);
/*
 * Luma conversion for a line of 24-bit RGB pixels.
 *
 * Two implementations are visible: a call to RENAME(bgr24ToY_mmx) with
 * PIX_FMT_RGB24 (the shared MMX routine picks its coefficients from the
 * format argument), and a scalar loop computing
 *     dst[i] = (RY*r + GY*g + BY*b + (33 << (RGB2YUV_SHIFT-1)))
 *              >> RGB2YUV_SHIFT
 * NOTE(review): the conditional selecting between the two and the loads of
 * r/g/b are on lines not shown in this excerpt.
 * The last parameter is not referenced in the visible code.
 */
1848 static inline void RENAME(rgb24ToY
)(uint8_t *dst
, uint8_t *src
, long width
, uint32_t *unused
)
1851 RENAME(bgr24ToY_mmx
)(dst
, src
, width
, PIX_FMT_RGB24
);
1854 for (i
=0; i
<width
; i
++)
1860 dst
[i
]= ((RY
*r
+ GY
*g
+ BY
*b
+ (33<<(RGB2YUV_SHIFT
-1)))>>RGB2YUV_SHIFT
);
/*
 * Chroma conversion for a line of 24-bit RGB pixels, one U and one V byte
 * per source pixel.
 *
 * Two implementations are visible: a call to RENAME(bgr24ToUV_mmx) with
 * PIX_FMT_RGB24, and a scalar loop that reads r, g, b from bytes 3*i+0,
 * 3*i+1, 3*i+2 of src1 and computes
 *     dstU[i] = (RU*r + GU*g + BU*b + (257 << (RGB2YUV_SHIFT-1)))
 *               >> RGB2YUV_SHIFT
 * and the same with RV/GV/BV for dstV[i].
 * NOTE(review): the conditional selecting between the two paths is on
 * lines not shown in this excerpt.
 * src2 and the last parameter are not referenced in the visible code.
 */
1865 static inline void RENAME(rgb24ToUV
)(uint8_t *dstU
, uint8_t *dstV
, uint8_t *src1
, uint8_t *src2
, long width
, uint32_t *unused
)
1869 RENAME(bgr24ToUV_mmx
)(dstU
, dstV
, src1
, width
, PIX_FMT_RGB24
);
1873 for (i
=0; i
<width
; i
++)
1875 int r
= src1
[3*i
+ 0];
1876 int g
= src1
[3*i
+ 1];
1877 int b
= src1
[3*i
+ 2];
1879 dstU
[i
]= (RU
*r
+ GU
*g
+ BU
*b
+ (257<<(RGB2YUV_SHIFT
-1)))>>RGB2YUV_SHIFT
;
1880 dstV
[i
]= (RV
*r
+ GV
*g
+ BV
*b
+ (257<<(RGB2YUV_SHIFT
-1)))>>RGB2YUV_SHIFT
;
/*
 * Chroma conversion for 24-bit RGB pixels with 2:1 horizontal reduction:
 * one U and one V byte per pair of adjacent source pixels.
 *
 * For output index i the two pixels start at bytes 6*i and 6*i+3 of src1;
 * r, g and b are the sums of the two corresponding channel bytes. Because
 * the channel values are sums of two samples, the result is shifted by
 * RGB2YUV_SHIFT+1 and the bias is 257 << RGB2YUV_SHIFT.
 *
 * src2 and the last parameter are not referenced in the visible code.
 */
1885 static inline void RENAME(rgb24ToUV_half
)(uint8_t *dstU
, uint8_t *dstV
, uint8_t *src1
, uint8_t *src2
, long width
, uint32_t *unused
)
1889 for (i
=0; i
<width
; i
++)
1891 int r
= src1
[6*i
+ 0] + src1
[6*i
+ 3];
1892 int g
= src1
[6*i
+ 1] + src1
[6*i
+ 4];
1893 int b
= src1
[6*i
+ 2] + src1
[6*i
+ 5];
1895 dstU
[i
]= (RU
*r
+ GU
*g
+ BU
*b
+ (257<<RGB2YUV_SHIFT
))>>(RGB2YUV_SHIFT
+1);
1896 dstV
[i
]= (RV
*r
+ GV
*g
+ BV
*b
+ (257<<RGB2YUV_SHIFT
))>>(RGB2YUV_SHIFT
+1);
/*
 * Palette lookup for luma: for each of the `width` output positions the
 * low 8 bits of the palette entry pal[d] are written to dst[i].
 * NOTE(review): d is defined on a line not shown in this excerpt;
 * presumably it is the palette index read from src[i] - confirm.
 */
1901 static inline void RENAME(palToY
)(uint8_t *dst
, uint8_t *src
, long width
, uint32_t *pal
)
1904 for (i
=0; i
<width
; i
++)
1908 dst
[i
]= pal
[d
] & 0xFF;
/*
 * Palette lookup for chroma: asserts that src1 == src2, then for each of
 * the `width` positions reads the palette entry p = pal[src1[i]].
 * NOTE(review): the stores to dstU/dstV that consume p are on lines not
 * shown in this excerpt.
 */
1912 static inline void RENAME(palToUV
)(uint8_t *dstU
, uint8_t *dstV
, uint8_t *src1
, uint8_t *src2
, long width
, uint32_t *pal
)
1915 assert(src1
== src2
);
1916 for (i
=0; i
<width
; i
++)
1918 int p
= pal
[src1
[i
]];
/*
 * Expands 1-bit-per-pixel input to 8-bit luma: bit (7-j) of d becomes
 * dst[8*i+j], written as 0 or 255, so the most significant bit maps to the
 * first of the 8 output bytes.
 * NOTE(review): the definition of d and the loop over j are on lines not
 * shown in this excerpt; the visible expression is identical to the one in
 * monoblack2Y, so the difference between the two must be in how d is
 * derived from src.
 * NOTE(review): the visible outer loop runs width/8 times, so when width
 * is not a multiple of 8 the trailing width%8 pixels are not written by
 * this loop - confirm whether callers guarantee a multiple of 8.
 * The last parameter is not referenced in the visible code.
 */
1925 static inline void RENAME(monowhite2Y
)(uint8_t *dst
, uint8_t *src
, long width
, uint32_t *unused
)
1928 for (i
=0; i
<width
/8; i
++){
1931 dst
[8*i
+j
]= ((d
>>(7-j
))&1)*255;
/*
 * Expands 1-bit-per-pixel input to 8-bit luma: bit (7-j) of d becomes
 * dst[8*i+j], written as 0 or 255, so the most significant bit maps to the
 * first of the 8 output bytes.
 * NOTE(review): the definition of d and the loop over j are on lines not
 * shown in this excerpt; the visible expression is identical to the one in
 * monowhite2Y, so the difference between the two must be in how d is
 * derived from src.
 * NOTE(review): the visible outer loop runs width/8 times, so when width
 * is not a multiple of 8 the trailing width%8 pixels are not written by
 * this loop - confirm whether callers guarantee a multiple of 8.
 * The last parameter is not referenced in the visible code.
 */
1935 static inline void RENAME(monoblack2Y
)(uint8_t *dst
, uint8_t *src
, long width
, uint32_t *unused
)
1938 for (i
=0; i
<width
/8; i
++){
1941 dst
[8*i
+j
]= ((d
>>(7-j
))&1)*255;
1945 // bilinear / bicubic scaling
1946 static inline void RENAME(hScale
)(int16_t *dst
, int dstW
, uint8_t *src
, int srcW
, int xInc
,
1947 int16_t *filter
, int16_t *filterPos
, long filterSize
)
1950 assert(filterSize
% 4 == 0 && filterSize
>0);
1951 if (filterSize
==4) // Always true for upscaling, sometimes for down, too.
1953 long counter
= -2*dstW
;
1955 filterPos
-= counter
/2;
1959 "push %%"REG_b
" \n\t"
1961 "pxor %%mm7, %%mm7 \n\t"
1962 "push %%"REG_BP
" \n\t" // we use 7 regs here ...
1963 "mov %%"REG_a
", %%"REG_BP
" \n\t"
1966 "movzwl (%2, %%"REG_BP
"), %%eax \n\t"
1967 "movzwl 2(%2, %%"REG_BP
"), %%ebx \n\t"
1968 "movq (%1, %%"REG_BP
", 4), %%mm1 \n\t"
1969 "movq 8(%1, %%"REG_BP
", 4), %%mm3 \n\t"
1970 "movd (%3, %%"REG_a
"), %%mm0 \n\t"
1971 "movd (%3, %%"REG_b
"), %%mm2 \n\t"
1972 "punpcklbw %%mm7, %%mm0 \n\t"
1973 "punpcklbw %%mm7, %%mm2 \n\t"
1974 "pmaddwd %%mm1, %%mm0 \n\t"
1975 "pmaddwd %%mm2, %%mm3 \n\t"
1976 "movq %%mm0, %%mm4 \n\t"
1977 "punpckldq %%mm3, %%mm0 \n\t"
1978 "punpckhdq %%mm3, %%mm4 \n\t"
1979 "paddd %%mm4, %%mm0 \n\t"
1980 "psrad $7, %%mm0 \n\t"
1981 "packssdw %%mm0, %%mm0 \n\t"
1982 "movd %%mm0, (%4, %%"REG_BP
") \n\t"
1983 "add $4, %%"REG_BP
" \n\t"
1986 "pop %%"REG_BP
" \n\t"
1988 "pop %%"REG_b
" \n\t"
1991 : "c" (filter
), "d" (filterPos
), "S" (src
), "D" (dst
)
1997 else if (filterSize
==8)
1999 long counter
= -2*dstW
;
2001 filterPos
-= counter
/2;
2005 "push %%"REG_b
" \n\t"
2007 "pxor %%mm7, %%mm7 \n\t"
2008 "push %%"REG_BP
" \n\t" // we use 7 regs here ...
2009 "mov %%"REG_a
", %%"REG_BP
" \n\t"
2012 "movzwl (%2, %%"REG_BP
"), %%eax \n\t"
2013 "movzwl 2(%2, %%"REG_BP
"), %%ebx \n\t"
2014 "movq (%1, %%"REG_BP
", 8), %%mm1 \n\t"
2015 "movq 16(%1, %%"REG_BP
", 8), %%mm3 \n\t"
2016 "movd (%3, %%"REG_a
"), %%mm0 \n\t"
2017 "movd (%3, %%"REG_b
"), %%mm2 \n\t"
2018 "punpcklbw %%mm7, %%mm0 \n\t"
2019 "punpcklbw %%mm7, %%mm2 \n\t"
2020 "pmaddwd %%mm1, %%mm0 \n\t"
2021 "pmaddwd %%mm2, %%mm3 \n\t"
2023 "movq 8(%1, %%"REG_BP
", 8), %%mm1 \n\t"
2024 "movq 24(%1, %%"REG_BP
", 8), %%mm5 \n\t"
2025 "movd 4(%3, %%"REG_a
"), %%mm4 \n\t"
2026 "movd 4(%3, %%"REG_b
"), %%mm2 \n\t"
2027 "punpcklbw %%mm7, %%mm4 \n\t"
2028 "punpcklbw %%mm7, %%mm2 \n\t"
2029 "pmaddwd %%mm1, %%mm4 \n\t"
2030 "pmaddwd %%mm2, %%mm5 \n\t"
2031 "paddd %%mm4, %%mm0 \n\t"
2032 "paddd %%mm5, %%mm3 \n\t"
2033 "movq %%mm0, %%mm4 \n\t"
2034 "punpckldq %%mm3, %%mm0 \n\t"
2035 "punpckhdq %%mm3, %%mm4 \n\t"
2036 "paddd %%mm4, %%mm0 \n\t"
2037 "psrad $7, %%mm0 \n\t"
2038 "packssdw %%mm0, %%mm0 \n\t"
2039 "movd %%mm0, (%4, %%"REG_BP
") \n\t"
2040 "add $4, %%"REG_BP
" \n\t"
2043 "pop %%"REG_BP
" \n\t"
2045 "pop %%"REG_b
" \n\t"
2048 : "c" (filter
), "d" (filterPos
), "S" (src
), "D" (dst
)
2056 uint8_t *offset
= src
+filterSize
;
2057 long counter
= -2*dstW
;
2058 //filter-= counter*filterSize/2;
2059 filterPos
-= counter
/2;
2062 "pxor %%mm7, %%mm7 \n\t"
2065 "mov %2, %%"REG_c
" \n\t"
2066 "movzwl (%%"REG_c
", %0), %%eax \n\t"
2067 "movzwl 2(%%"REG_c
", %0), %%edx \n\t"
2068 "mov %5, %%"REG_c
" \n\t"
2069 "pxor %%mm4, %%mm4 \n\t"
2070 "pxor %%mm5, %%mm5 \n\t"
2072 "movq (%1), %%mm1 \n\t"
2073 "movq (%1, %6), %%mm3 \n\t"
2074 "movd (%%"REG_c
", %%"REG_a
"), %%mm0 \n\t"
2075 "movd (%%"REG_c
", %%"REG_d
"), %%mm2 \n\t"
2076 "punpcklbw %%mm7, %%mm0 \n\t"
2077 "punpcklbw %%mm7, %%mm2 \n\t"
2078 "pmaddwd %%mm1, %%mm0 \n\t"
2079 "pmaddwd %%mm2, %%mm3 \n\t"
2080 "paddd %%mm3, %%mm5 \n\t"
2081 "paddd %%mm0, %%mm4 \n\t"
2083 "add $4, %%"REG_c
" \n\t"
2084 "cmp %4, %%"REG_c
" \n\t"
2087 "movq %%mm4, %%mm0 \n\t"
2088 "punpckldq %%mm5, %%mm4 \n\t"
2089 "punpckhdq %%mm5, %%mm0 \n\t"
2090 "paddd %%mm0, %%mm4 \n\t"
2091 "psrad $7, %%mm4 \n\t"
2092 "packssdw %%mm4, %%mm4 \n\t"
2093 "mov %3, %%"REG_a
" \n\t"
2094 "movd %%mm4, (%%"REG_a
", %0) \n\t"
2098 : "+r" (counter
), "+r" (filter
)
2099 : "m" (filterPos
), "m" (dst
), "m"(offset
),
2100 "m" (src
), "r" (filterSize
*2)
2101 : "%"REG_a
, "%"REG_c
, "%"REG_d
2106 hScale_altivec_real(dst
, dstW
, src
, srcW
, xInc
, filter
, filterPos
, filterSize
);
2109 for (i
=0; i
<dstW
; i
++)
2112 int srcPos
= filterPos
[i
];
2114 //printf("filterPos: %d\n", filterPos[i]);
2115 for (j
=0; j
<filterSize
; j
++)
2117 //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2118 val
+= ((int)src
[srcPos
+ j
])*filter
[filterSize
*i
+ j
];
2120 //filter += hFilterSize;
2121 dst
[i
] = FFMIN(val
>>7, (1<<15)-1); // the cubic equation does overflow ...
2124 #endif /* HAVE_ALTIVEC */
2125 #endif /* HAVE_MMX */
2127 // *** horizontal scale Y line to temp buffer
// Horizontally scales one luma line from src into dst (dstWidth output samples).
//  1. If srcFormat is packed YUV, 16 bit gray, RGB/BGR, paletted or mono, the
//     matching *ToY converter writes a line into formatConvBuffer and src is
//     redirected to that buffer.
//  2. Without SWS_FAST_BILINEAR the line goes through RENAME(hScale) with
//     hLumFilter / hLumFilterPos / hLumFilterSize. Otherwise a fast bilinear
//     path runs: an MMX2 asm block, a plain x86 asm loop, or the C loop below.
//     Which one is compiled in depends on preprocessor conditionals that are
//     only partly visible in this excerpt (HAVE_MMX2, ARCH_X86).
//  3. If c->srcRange != c->dstRange and c->dstFormat is neither RGB nor BGR,
//     dst is remapped in place with fixed point constants.
// The fast bilinear code stores samples scaled by 128 (see src[srcW-1]*128 and
// the <<7 in the C loop).
// pal is only forwarded to the format converters; funnyYCode, mmx2Filter and
// mmx2FilterPos are only referenced by the MMX2 asm path.
2128 static inline void RENAME(hyscale
)(SwsContext
*c
, uint16_t *dst
, long dstWidth
, uint8_t *src
, int srcW
, int xInc
,
2129 int flags
, int canMMX2BeUsed
, int16_t *hLumFilter
,
2130 int16_t *hLumFilterPos
, int hLumFilterSize
, void *funnyYCode
,
2131 int srcFormat
, uint8_t *formatConvBuffer
, int16_t *mmx2Filter
,
2132 int32_t *mmx2FilterPos
, uint32_t *pal
)
// Input format dispatch: every branch converts into formatConvBuffer and then
// points src at it. GRAY16BE shares the YUYV422 converter and GRAY16LE shares
// the UYVY422 converter.
2134 if (srcFormat
==PIX_FMT_YUYV422
|| srcFormat
==PIX_FMT_GRAY16BE
)
2136 RENAME(yuy2ToY
)(formatConvBuffer
, src
, srcW
, pal
);
2137 src
= formatConvBuffer
;
2139 else if (srcFormat
==PIX_FMT_UYVY422
|| srcFormat
==PIX_FMT_GRAY16LE
)
2141 RENAME(uyvyToY
)(formatConvBuffer
, src
, srcW
, pal
);
2142 src
= formatConvBuffer
;
2144 else if (srcFormat
==PIX_FMT_RGB32
)
2146 RENAME(bgr32ToY
)(formatConvBuffer
, src
, srcW
, pal
);
2147 src
= formatConvBuffer
;
// The *_1 variants reuse the 32 bit converter with the source pointer advanced
// by ALT32_CORR.
2149 else if (srcFormat
==PIX_FMT_RGB32_1
)
2151 RENAME(bgr32ToY
)(formatConvBuffer
, src
+ALT32_CORR
, srcW
, pal
);
2152 src
= formatConvBuffer
;
2154 else if (srcFormat
==PIX_FMT_BGR24
)
2156 RENAME(bgr24ToY
)(formatConvBuffer
, src
, srcW
, pal
);
2157 src
= formatConvBuffer
;
2159 else if (srcFormat
==PIX_FMT_BGR565
)
2161 RENAME(bgr16ToY
)(formatConvBuffer
, src
, srcW
, pal
);
2162 src
= formatConvBuffer
;
2164 else if (srcFormat
==PIX_FMT_BGR555
)
2166 RENAME(bgr15ToY
)(formatConvBuffer
, src
, srcW
, pal
);
2167 src
= formatConvBuffer
;
2169 else if (srcFormat
==PIX_FMT_BGR32
)
2171 RENAME(rgb32ToY
)(formatConvBuffer
, src
, srcW
, pal
);
2172 src
= formatConvBuffer
;
2174 else if (srcFormat
==PIX_FMT_BGR32_1
)
2176 RENAME(rgb32ToY
)(formatConvBuffer
, src
+ALT32_CORR
, srcW
, pal
);
2177 src
= formatConvBuffer
;
2179 else if (srcFormat
==PIX_FMT_RGB24
)
2181 RENAME(rgb24ToY
)(formatConvBuffer
, src
, srcW
, pal
);
2182 src
= formatConvBuffer
;
2184 else if (srcFormat
==PIX_FMT_RGB565
)
2186 RENAME(rgb16ToY
)(formatConvBuffer
, src
, srcW
, pal
);
2187 src
= formatConvBuffer
;
2189 else if (srcFormat
==PIX_FMT_RGB555
)
2191 RENAME(rgb15ToY
)(formatConvBuffer
, src
, srcW
, pal
);
2192 src
= formatConvBuffer
;
// All 8 bit indexed style formats go through the palette based converter.
2194 else if (srcFormat
==PIX_FMT_RGB8
|| srcFormat
==PIX_FMT_BGR8
|| srcFormat
==PIX_FMT_PAL8
|| srcFormat
==PIX_FMT_BGR4_BYTE
|| srcFormat
==PIX_FMT_RGB4_BYTE
)
2196 RENAME(palToY
)(formatConvBuffer
, src
, srcW
, pal
);
2197 src
= formatConvBuffer
;
2199 else if (srcFormat
==PIX_FMT_MONOBLACK
)
2201 RENAME(monoblack2Y
)(formatConvBuffer
, src
, srcW
, pal
);
2202 src
= formatConvBuffer
;
2204 else if (srcFormat
==PIX_FMT_MONOWHITE
)
2206 RENAME(monowhite2Y
)(formatConvBuffer
, src
, srcW
, pal
);
2207 src
= formatConvBuffer
;
2211 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2212 if (!(flags
&SWS_FAST_BILINEAR
) || (!canMMX2BeUsed
))
// NOTE(review): the test below repeats the first half of the test above.
// Presumably the two are alternatives selected by a preprocessor conditional
// that is not visible in this excerpt - confirm against the full file.
2214 if (!(flags
&SWS_FAST_BILINEAR
))
// Filter based path: delegates the whole line to the generic horizontal scaler.
2217 RENAME(hScale
)(dst
, dstWidth
, src
, srcW
, xInc
, hLumFilter
, hLumFilterPos
, hLumFilterSize
);
2219 else // fast bilinear upscale / crap downscale
// Aligned stack slot; the asm below stores REG_b to operand %5 on entry and
// reloads it from %5 on exit. NOTE(review): the operand list is truncated in
// this excerpt, so %5 being ebxsave is an assumption - confirm.
2225 uint64_t ebxsave
__attribute__((aligned(8)));
2231 "mov %%"REG_b
", %5 \n\t"
2233 "pxor %%mm7, %%mm7 \n\t"
// Operand loads: %0 = src, %1 = dst, %2 = mmx2Filter, %3 = mmx2FilterPos
// (order taken from the input operand list further down).
2234 "mov %0, %%"REG_c
" \n\t"
2235 "mov %1, %%"REG_D
" \n\t"
2236 "mov %2, %%"REG_d
" \n\t"
2237 "mov %3, %%"REG_b
" \n\t"
2238 "xor %%"REG_a
", %%"REG_a
" \n\t" // i
2239 PREFETCH
" (%%"REG_c
") \n\t"
2240 PREFETCH
" 32(%%"REG_c
") \n\t"
2241 PREFETCH
" 64(%%"REG_c
") \n\t"
2245 #define FUNNY_Y_CODE \
2246 "movl (%%"REG_b"), %%esi \n\t"\
2248 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2249 "add %%"REG_S", %%"REG_c" \n\t"\
2250 "add %%"REG_a", %%"REG_D" \n\t"\
2251 "xor %%"REG_a", %%"REG_a" \n\t"\
2255 #define FUNNY_Y_CODE \
2256 "movl (%%"REG_b"), %%esi \n\t"\
2258 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2259 "add %%"REG_a", %%"REG_D" \n\t"\
2260 "xor %%"REG_a", %%"REG_a" \n\t"\
2262 #endif /* ARCH_X86_64 */
2274 "mov %5, %%"REG_b
" \n\t"
2276 :: "m" (src
), "m" (dst
), "m" (mmx2Filter
), "m" (mmx2FilterPos
),
2281 : "%"REG_a
, "%"REG_c
, "%"REG_d
, "%"REG_S
, "%"REG_D
// Right edge fix up: walking back from the last output sample, every sample
// whose source position (i*xInc)>>16 reaches srcW-1 is overwritten with the
// last input pixel scaled by 128.
2286 for (i
=dstWidth
-1; (i
*xInc
)>>16 >=srcW
-1; i
--) dst
[i
] = src
[srcW
-1]*128;
2290 #endif /* HAVE_MMX2 */
// Split the 16.16 fixed point step into its integer part (operand %3) and its
// 16 bit fraction (operand %4) for the addw/adc pair in the loop below.
2291 long xInc_shr16
= xInc
>> 16;
2292 uint16_t xInc_mask
= xInc
& 0xffff;
2293 //NO MMX just normal asm ...
2295 "xor %%"REG_a
", %%"REG_a
" \n\t" // i
2296 "xor %%"REG_d
", %%"REG_d
" \n\t" // xx
2297 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
// First of two output samples handled per iteration (stored at offset 0).
2300 "movzbl (%0, %%"REG_d
"), %%edi \n\t" //src[xx]
2301 "movzbl 1(%0, %%"REG_d
"), %%esi \n\t" //src[xx+1]
2302 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2303 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2304 "shll $16, %%edi \n\t"
2305 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2306 "mov %1, %%"REG_D
" \n\t"
2307 "shrl $9, %%esi \n\t"
2308 "movw %%si, (%%"REG_D
", %%"REG_a
", 2) \n\t"
2309 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFFFF
2310 "adc %3, %%"REG_d
" \n\t" //xx+= xInc>>16 + carry
// Second output sample of the iteration (stored at byte offset 2).
2312 "movzbl (%0, %%"REG_d
"), %%edi \n\t" //src[xx]
2313 "movzbl 1(%0, %%"REG_d
"), %%esi \n\t" //src[xx+1]
2314 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2315 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2316 "shll $16, %%edi \n\t"
2317 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2318 "mov %1, %%"REG_D
" \n\t"
2319 "shrl $9, %%esi \n\t"
2320 "movw %%si, 2(%%"REG_D
", %%"REG_a
", 2) \n\t"
2321 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFFFF
2322 "adc %3, %%"REG_d
" \n\t" //xx+= xInc>>16 + carry
// Advance i by the two samples just written and compare against dstWidth (%2).
2325 "add $2, %%"REG_a
" \n\t"
2326 "cmp %2, %%"REG_a
" \n\t"
2330 :: "r" (src
), "m" (dst
), "m" (dstWidth
), "m" (xInc_shr16
), "m" (xInc_mask
)
2331 : "%"REG_a
, "%"REG_d
, "%ecx", "%"REG_D
, "%esi"
2334 } //if MMX2 can't be used
// C version of the bilinear scaler: xpos is a 16.16 fixed point source
// position, xx its integer part and xalpha the top 7 bits of its fraction.
// NOTE(review): the statement that advances xpos is not visible in this
// excerpt.
2338 unsigned int xpos
=0;
2339 for (i
=0;i
<dstWidth
;i
++)
2341 register unsigned int xx
=xpos
>>16;
2342 register unsigned int xalpha
=(xpos
&0xFFFF)>>9;
2343 dst
[i
]= (src
[xx
]<<7) + (src
[xx
+1] - src
[xx
])*xalpha
;
2346 #endif /* ARCH_X86 */
// Range conversion, skipped when source and destination ranges match or when
// the destination format is RGB/BGR.
2349 if(c
->srcRange
!= c
->dstRange
&& !(isRGB(c
->dstFormat
) || isBGR(c
->dstFormat
))){
2351 //FIXME all pal and rgb srcFormats could do this conversion as well
2352 //FIXME all scalers more complex than bilinear could do half of this transform
// NOTE(review): two alternative remapping loops follow; the condition that
// picks between them is not visible in this excerpt. The first scales down and
// adds an offset, the second clamps to 30189, scales up and subtracts an offset.
2354 for (i
=0; i
<dstWidth
; i
++)
2355 dst
[i
]= (dst
[i
]*14071 + 33561947)>>14;
2357 for (i
=0; i
<dstWidth
; i
++)
2358 dst
[i
]= (FFMIN(dst
[i
],30189)*19077 - 39057361)>>14;
// Horizontally scales one line of both chroma planes: src1 is written to
// dst[0..dstWidth) and src2 to dst[VOFW..VOFW+dstWidth).
//  1. For packed YUV, RGB/BGR and paletted srcFormat values the matching *ToUV
//     converter fills formatConvBuffer (first plane) and formatConvBuffer+VOFW
//     (second plane); src1 and src2 are then redirected there.
//  2. Without SWS_FAST_BILINEAR each plane goes through RENAME(hScale) with
//     hChrFilter / hChrFilterPos / hChrFilterSize. Otherwise a fast bilinear
//     path runs: an MMX2 asm block, a plain x86 asm loop, or the C loop below.
//     Which one is compiled in depends on preprocessor conditionals that are
//     only partly visible in this excerpt (HAVE_MMX2, ARCH_X86).
//  3. If c->srcRange != c->dstRange and c->dstFormat is neither RGB nor BGR,
//     both planes are remapped in place with fixed point constants.
// pal is only forwarded to the format converters; funnyUVCode, mmx2Filter and
// mmx2FilterPos are only referenced by the MMX2 asm path.
2363 inline static void RENAME(hcscale
)(SwsContext
*c
, uint16_t *dst
, long dstWidth
, uint8_t *src1
, uint8_t *src2
,
2364 int srcW
, int xInc
, int flags
, int canMMX2BeUsed
, int16_t *hChrFilter
,
2365 int16_t *hChrFilterPos
, int hChrFilterSize
, void *funnyUVCode
,
2366 int srcFormat
, uint8_t *formatConvBuffer
, int16_t *mmx2Filter
,
2367 int32_t *mmx2FilterPos
, uint32_t *pal
)
// Input format dispatch: every converting branch ends by pointing src1 at
// formatConvBuffer and src2 at formatConvBuffer+VOFW.
2369 if (srcFormat
==PIX_FMT_YUYV422
)
2371 RENAME(yuy2ToUV
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2372 src1
= formatConvBuffer
;
2373 src2
= formatConvBuffer
+VOFW
;
2375 else if (srcFormat
==PIX_FMT_UYVY422
)
2377 RENAME(uyvyToUV
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2378 src1
= formatConvBuffer
;
2379 src2
= formatConvBuffer
+VOFW
;
2381 else if (srcFormat
==PIX_FMT_RGB32
)
// RGB/BGR branches: when c->chrSrcHSubSample is set the *_half converter is
// called, and a full resolution converter call follows it.
// NOTE(review): the else that should separate the two calls is not visible in
// this excerpt - presumably it sits on a dropped line; confirm. The same
// pattern repeats in every RGB/BGR branch below.
2383 if(c
->chrSrcHSubSample
)
2384 RENAME(bgr32ToUV_half
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2386 RENAME(bgr32ToUV
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2387 src1
= formatConvBuffer
;
2388 src2
= formatConvBuffer
+VOFW
;
// The *_1 variants reuse the 32 bit converters with both source pointers
// advanced by ALT32_CORR.
2390 else if (srcFormat
==PIX_FMT_RGB32_1
)
2392 if(c
->chrSrcHSubSample
)
2393 RENAME(bgr32ToUV_half
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
+ALT32_CORR
, src2
+ALT32_CORR
, srcW
, pal
);
2395 RENAME(bgr32ToUV
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
+ALT32_CORR
, src2
+ALT32_CORR
, srcW
, pal
);
2396 src1
= formatConvBuffer
;
2397 src2
= formatConvBuffer
+VOFW
;
2399 else if (srcFormat
==PIX_FMT_BGR24
)
2401 if(c
->chrSrcHSubSample
)
2402 RENAME(bgr24ToUV_half
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2404 RENAME(bgr24ToUV
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2405 src1
= formatConvBuffer
;
2406 src2
= formatConvBuffer
+VOFW
;
2408 else if (srcFormat
==PIX_FMT_BGR565
)
2410 if(c
->chrSrcHSubSample
)
2411 RENAME(bgr16ToUV_half
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2413 RENAME(bgr16ToUV
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2414 src1
= formatConvBuffer
;
2415 src2
= formatConvBuffer
+VOFW
;
2417 else if (srcFormat
==PIX_FMT_BGR555
)
2419 if(c
->chrSrcHSubSample
)
2420 RENAME(bgr15ToUV_half
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2422 RENAME(bgr15ToUV
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2423 src1
= formatConvBuffer
;
2424 src2
= formatConvBuffer
+VOFW
;
2426 else if (srcFormat
==PIX_FMT_BGR32
)
2428 if(c
->chrSrcHSubSample
)
2429 RENAME(rgb32ToUV_half
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2431 RENAME(rgb32ToUV
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2432 src1
= formatConvBuffer
;
2433 src2
= formatConvBuffer
+VOFW
;
2435 else if (srcFormat
==PIX_FMT_BGR32_1
)
2437 if(c
->chrSrcHSubSample
)
2438 RENAME(rgb32ToUV_half
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
+ALT32_CORR
, src2
+ALT32_CORR
, srcW
, pal
);
2440 RENAME(rgb32ToUV
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
+ALT32_CORR
, src2
+ALT32_CORR
, srcW
, pal
);
2441 src1
= formatConvBuffer
;
2442 src2
= formatConvBuffer
+VOFW
;
2444 else if (srcFormat
==PIX_FMT_RGB24
)
2446 if(c
->chrSrcHSubSample
)
2447 RENAME(rgb24ToUV_half
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2449 RENAME(rgb24ToUV
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2450 src1
= formatConvBuffer
;
2451 src2
= formatConvBuffer
+VOFW
;
2453 else if (srcFormat
==PIX_FMT_RGB565
)
2455 if(c
->chrSrcHSubSample
)
2456 RENAME(rgb16ToUV_half
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2458 RENAME(rgb16ToUV
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2459 src1
= formatConvBuffer
;
2460 src2
= formatConvBuffer
+VOFW
;
2462 else if (srcFormat
==PIX_FMT_RGB555
)
2464 if(c
->chrSrcHSubSample
)
2465 RENAME(rgb15ToUV_half
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2467 RENAME(rgb15ToUV
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2468 src1
= formatConvBuffer
;
2469 src2
= formatConvBuffer
+VOFW
;
// NOTE(review): no statement is visible for the gray/mono branch below in this
// excerpt; presumably it leaves the function early because these formats carry
// no chroma - confirm against the full file.
2471 else if (isGray(srcFormat
) || srcFormat
==PIX_FMT_MONOBLACK
|| srcFormat
==PIX_FMT_MONOWHITE
)
// All 8 bit indexed style formats go through the palette based converter.
2475 else if (srcFormat
==PIX_FMT_RGB8
|| srcFormat
==PIX_FMT_BGR8
|| srcFormat
==PIX_FMT_PAL8
|| srcFormat
==PIX_FMT_BGR4_BYTE
|| srcFormat
==PIX_FMT_RGB4_BYTE
)
2477 RENAME(palToUV
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2478 src1
= formatConvBuffer
;
2479 src2
= formatConvBuffer
+VOFW
;
2483 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2484 if (!(flags
&SWS_FAST_BILINEAR
) || (!canMMX2BeUsed
))
// NOTE(review): the test below repeats the first half of the test above.
// Presumably the two are alternatives selected by a preprocessor conditional
// that is not visible in this excerpt - confirm against the full file.
2486 if (!(flags
&SWS_FAST_BILINEAR
))
// Filter based path: the generic scaler runs once per plane with the same
// filter; the second plane lands VOFW elements further into dst.
2489 RENAME(hScale
)(dst
, dstWidth
, src1
, srcW
, xInc
, hChrFilter
, hChrFilterPos
, hChrFilterSize
);
2490 RENAME(hScale
)(dst
+VOFW
, dstWidth
, src2
, srcW
, xInc
, hChrFilter
, hChrFilterPos
, hChrFilterSize
);
2492 else // fast bilinear upscale / crap downscale
// Aligned stack slot; the asm below stores REG_b to operand %6 on entry and
// reloads it from %6 on exit. NOTE(review): the operand list is truncated in
// this excerpt, so %6 being ebxsave is an assumption - confirm.
2498 uint64_t ebxsave
__attribute__((aligned(8)));
2504 "mov %%"REG_b
", %6 \n\t"
2506 "pxor %%mm7, %%mm7 \n\t"
// Operand loads for the first plane: %0 = src1, %1 = dst, %2 = mmx2Filter,
// %3 = mmx2FilterPos (order taken from the input operand list further down).
2507 "mov %0, %%"REG_c
" \n\t"
2508 "mov %1, %%"REG_D
" \n\t"
2509 "mov %2, %%"REG_d
" \n\t"
2510 "mov %3, %%"REG_b
" \n\t"
2511 "xor %%"REG_a
", %%"REG_a
" \n\t" // i
2512 PREFETCH
" (%%"REG_c
") \n\t"
2513 PREFETCH
" 32(%%"REG_c
") \n\t"
2514 PREFETCH
" 64(%%"REG_c
") \n\t"
2518 #define FUNNY_UV_CODE \
2519 "movl (%%"REG_b"), %%esi \n\t"\
2521 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2522 "add %%"REG_S", %%"REG_c" \n\t"\
2523 "add %%"REG_a", %%"REG_D" \n\t"\
2524 "xor %%"REG_a", %%"REG_a" \n\t"\
2528 #define FUNNY_UV_CODE \
2529 "movl (%%"REG_b"), %%esi \n\t"\
2531 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2532 "add %%"REG_a", %%"REG_D" \n\t"\
2533 "xor %%"REG_a", %%"REG_a" \n\t"\
2535 #endif /* ARCH_X86_64 */
// Second plane setup: reset i, load src2 (operand %5) and move the destination
// pointer forward by VOF.
2541 "xor %%"REG_a
", %%"REG_a
" \n\t" // i
2542 "mov %5, %%"REG_c
" \n\t" // src
2543 "mov %1, %%"REG_D
" \n\t" // buf1
2544 "add $"AV_STRINGIFY(VOF
)", %%"REG_D
" \n\t"
2545 PREFETCH
" (%%"REG_c
") \n\t"
2546 PREFETCH
" 32(%%"REG_c
") \n\t"
2547 PREFETCH
" 64(%%"REG_c
") \n\t"
2555 "mov %6, %%"REG_b
" \n\t"
2557 :: "m" (src1
), "m" (dst
), "m" (mmx2Filter
), "m" (mmx2FilterPos
),
2558 "m" (funnyUVCode
), "m" (src2
)
2562 : "%"REG_a
, "%"REG_c
, "%"REG_d
, "%"REG_S
, "%"REG_D
// Right edge fix up: walking back from the last output sample, every sample
// whose source position (i*xInc)>>16 reaches srcW-1 is overwritten in both
// planes with the last input pixel scaled by 128.
2567 for (i
=dstWidth
-1; (i
*xInc
)>>16 >=srcW
-1; i
--)
2569 //printf("%d %d %d\n", dstWidth, i, srcW);
2570 dst
[i
] = src1
[srcW
-1]*128;
2571 dst
[i
+VOFW
] = src2
[srcW
-1]*128;
2576 #endif /* HAVE_MMX2 */
// Split the 16.16 fixed point step into its integer part (operand %3) and its
// 16 bit fraction (operand %4) for the addw/adc pair in the loop below.
2577 long xInc_shr16
= (long) (xInc
>> 16);
2578 uint16_t xInc_mask
= xInc
& 0xffff;
2580 "xor %%"REG_a
", %%"REG_a
" \n\t" // i
2581 "xor %%"REG_d
", %%"REG_d
" \n\t" // xx
2582 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
// First plane: interpolate from src1 (%0) and store at dst + 2*i.
2585 "mov %0, %%"REG_S
" \n\t"
2586 "movzbl (%%"REG_S
", %%"REG_d
"), %%edi \n\t" //src[xx]
2587 "movzbl 1(%%"REG_S
", %%"REG_d
"), %%esi \n\t" //src[xx+1]
2588 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2589 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2590 "shll $16, %%edi \n\t"
2591 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2592 "mov %1, %%"REG_D
" \n\t"
2593 "shrl $9, %%esi \n\t"
2594 "movw %%si, (%%"REG_D
", %%"REG_a
", 2) \n\t"
// Second plane: same interpolation from operand %5, stored VOF bytes further.
// NOTE(review): the line declaring operand %5 is not visible in this excerpt;
// presumably it is src2 held in a register - confirm.
2596 "movzbl (%5, %%"REG_d
"), %%edi \n\t" //src[xx]
2597 "movzbl 1(%5, %%"REG_d
"), %%esi \n\t" //src[xx+1]
2598 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2599 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2600 "shll $16, %%edi \n\t"
2601 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2602 "mov %1, %%"REG_D
" \n\t"
2603 "shrl $9, %%esi \n\t"
2604 "movw %%si, "AV_STRINGIFY(VOF
)"(%%"REG_D
", %%"REG_a
", 2) \n\t"
2606 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFFFF
2607 "adc %3, %%"REG_d
" \n\t" //xx+= xInc>>16 + carry
2608 "add $1, %%"REG_a
" \n\t"
2609 "cmp %2, %%"REG_a
" \n\t"
2612 /* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2613 which is needed to support GCC 4.0. */
2614 #if ARCH_X86_64 && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2615 :: "m" (src1
), "m" (dst
), "g" ((long)dstWidth
), "m" (xInc_shr16
), "m" (xInc_mask
),
2617 :: "m" (src1
), "m" (dst
), "m" ((long)dstWidth
), "m" (xInc_shr16
), "m" (xInc_mask
),
2620 : "%"REG_a
, "%"REG_d
, "%ecx", "%"REG_D
, "%esi"
2623 } //if MMX2 can't be used
// C version of the bilinear scaler: xpos is a 16.16 fixed point source
// position, xx its integer part and xalpha the top 7 bits of its fraction.
// Since xalpha is at most 127, xalpha^127 equals 127-xalpha, so the two
// weights always sum to 127.
// NOTE(review): the statement that advances xpos is not visible in this
// excerpt.
2627 unsigned int xpos
=0;
2628 for (i
=0;i
<dstWidth
;i
++)
2630 register unsigned int xx
=xpos
>>16;
2631 register unsigned int xalpha
=(xpos
&0xFFFF)>>9;
2632 dst
[i
]=(src1
[xx
]*(xalpha
^127)+src1
[xx
+1]*xalpha
);
2633 dst
[i
+VOFW
]=(src2
[xx
]*(xalpha
^127)+src2
[xx
+1]*xalpha
);
// NOTE(review): the next two lines recompute dst[i] and dst[i+VOFW] with the
// <<7 formula; presumably they sit inside a disabled or commented-out region
// whose delimiters are not visible in this excerpt - confirm.
2635 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2636 dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2640 #endif /* ARCH_X86 */
// Range conversion, skipped when source and destination ranges match or when
// the destination format is RGB/BGR.
2642 if(c
->srcRange
!= c
->dstRange
&& !(isRGB(c
->dstFormat
) || isBGR(c
->dstFormat
))){
2644 //FIXME all pal and rgb srcFormats could do this conversion as well
2645 //FIXME all scalers more complex than bilinear could do half of this transform
// NOTE(review): two alternative remapping loops follow; the condition that
// picks between them is not visible in this excerpt. The first scales down and
// adds an offset, the second clamps to 30775, scales up and subtracts an
// offset. Both planes get the same transform.
2647 for (i
=0; i
<dstWidth
; i
++){
2648 dst
[i
]= (dst
[i
]*1799 + 4081085)>>11; //1469
2649 dst
[i
+VOFW
]= (dst
[i
+VOFW
]*1799 + 4081085)>>11; //1469
2652 for (i
=0; i
<dstWidth
; i
++){
2653 dst
[i
]= (FFMIN(dst
[i
],30775)*4663 - 9289992)>>12; //-264
2654 dst
[i
+VOFW
]= (FFMIN(dst
[i
+VOFW
],30775)*4663 - 9289992)>>12; //-264
2660 static int RENAME(swScale
)(SwsContext
*c
, uint8_t* src
[], int srcStride
[], int srcSliceY
,
2661 int srcSliceH
, uint8_t* dst
[], int dstStride
[]){
2663 /* load a few things into local vars to make the code more readable? and faster */
2664 const int srcW
= c
->srcW
;
2665 const int dstW
= c
->dstW
;
2666 const int dstH
= c
->dstH
;
2667 const int chrDstW
= c
->chrDstW
;
2668 const int chrSrcW
= c
->chrSrcW
;
2669 const int lumXInc
= c
->lumXInc
;
2670 const int chrXInc
= c
->chrXInc
;
2671 const int dstFormat
= c
->dstFormat
;
2672 const int srcFormat
= c
->srcFormat
;
2673 const int flags
= c
->flags
;
2674 const int canMMX2BeUsed
= c
->canMMX2BeUsed
;
2675 int16_t *vLumFilterPos
= c
->vLumFilterPos
;
2676 int16_t *vChrFilterPos
= c
->vChrFilterPos
;
2677 int16_t *hLumFilterPos
= c
->hLumFilterPos
;
2678 int16_t *hChrFilterPos
= c
->hChrFilterPos
;
2679 int16_t *vLumFilter
= c
->vLumFilter
;
2680 int16_t *vChrFilter
= c
->vChrFilter
;
2681 int16_t *hLumFilter
= c
->hLumFilter
;
2682 int16_t *hChrFilter
= c
->hChrFilter
;
2683 int32_t *lumMmxFilter
= c
->lumMmxFilter
;
2684 int32_t *chrMmxFilter
= c
->chrMmxFilter
;
2685 const int vLumFilterSize
= c
->vLumFilterSize
;
2686 const int vChrFilterSize
= c
->vChrFilterSize
;
2687 const int hLumFilterSize
= c
->hLumFilterSize
;
2688 const int hChrFilterSize
= c
->hChrFilterSize
;
2689 int16_t **lumPixBuf
= c
->lumPixBuf
;
2690 int16_t **chrPixBuf
= c
->chrPixBuf
;
2691 const int vLumBufSize
= c
->vLumBufSize
;
2692 const int vChrBufSize
= c
->vChrBufSize
;
2693 uint8_t *funnyYCode
= c
->funnyYCode
;
2694 uint8_t *funnyUVCode
= c
->funnyUVCode
;
2695 uint8_t *formatConvBuffer
= c
->formatConvBuffer
;
2696 const int chrSrcSliceY
= srcSliceY
>> c
->chrSrcVSubSample
;
2697 const int chrSrcSliceH
= -((-srcSliceH
) >> c
->chrSrcVSubSample
);
2699 uint32_t *pal
=c
->pal_yuv
;
2701 /* vars which will change and which we need to store back in the context */
2703 int lumBufIndex
= c
->lumBufIndex
;
2704 int chrBufIndex
= c
->chrBufIndex
;
2705 int lastInLumBuf
= c
->lastInLumBuf
;
2706 int lastInChrBuf
= c
->lastInChrBuf
;
2708 if (isPacked(c
->srcFormat
)){
2714 srcStride
[2]= srcStride
[0];
2716 srcStride
[1]<<= c
->vChrDrop
;
2717 srcStride
[2]<<= c
->vChrDrop
;
2719 //printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2720 // (int)dst[0], (int)dst[1], (int)dst[2]);
2722 #if 0 //self test FIXME move to a vfilter or something
2724 static volatile int i
=0;
2726 if (srcFormat
==PIX_FMT_YUV420P
&& i
==1 && srcSliceH
>= c
->srcH
)
2727 selfTest(src
, srcStride
, c
->srcW
, c
->srcH
);
2732 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2733 //dstStride[0],dstStride[1],dstStride[2]);
2735 if (dstStride
[0]%8 !=0 || dstStride
[1]%8 !=0 || dstStride
[2]%8 !=0)
2737 static int warnedAlready
=0; //FIXME move this into the context perhaps
2738 if (flags
& SWS_PRINT_INFO
&& !warnedAlready
)
2740 av_log(c
, AV_LOG_WARNING
, "Warning: dstStride is not aligned!\n"
2741 " ->cannot do aligned memory accesses anymore\n");
2746 /* Note the user might start scaling the picture in the middle so this
2747 will not get executed. This is not really intended but works
2748 currently, so people might do it. */
2759 for (;dstY
< dstH
; dstY
++){
2760 unsigned char *dest
=dst
[0]+dstStride
[0]*dstY
;
2761 const int chrDstY
= dstY
>>c
->chrDstVSubSample
;
2762 unsigned char *uDest
=dst
[1]+dstStride
[1]*chrDstY
;
2763 unsigned char *vDest
=dst
[2]+dstStride
[2]*chrDstY
;
2765 const int firstLumSrcY
= vLumFilterPos
[dstY
]; //First line needed as input
2766 const int firstChrSrcY
= vChrFilterPos
[chrDstY
]; //First line needed as input
2767 const int lastLumSrcY
= firstLumSrcY
+ vLumFilterSize
-1; // Last line needed as input
2768 const int lastChrSrcY
= firstChrSrcY
+ vChrFilterSize
-1; // Last line needed as input
2770 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
2771 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
2772 //handle holes (FAST_BILINEAR & weird filters)
2773 if (firstLumSrcY
> lastInLumBuf
) lastInLumBuf
= firstLumSrcY
-1;
2774 if (firstChrSrcY
> lastInChrBuf
) lastInChrBuf
= firstChrSrcY
-1;
2775 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
2776 assert(firstLumSrcY
>= lastInLumBuf
- vLumBufSize
+ 1);
2777 assert(firstChrSrcY
>= lastInChrBuf
- vChrBufSize
+ 1);
2779 // Do we have enough lines in this slice to output the dstY line
2780 if (lastLumSrcY
< srcSliceY
+ srcSliceH
&& lastChrSrcY
< -((-srcSliceY
- srcSliceH
)>>c
->chrSrcVSubSample
))
2782 //Do horizontal scaling
2783 while(lastInLumBuf
< lastLumSrcY
)
2785 uint8_t *s
= src
[0]+(lastInLumBuf
+ 1 - srcSliceY
)*srcStride
[0];
2787 //printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
2788 assert(lumBufIndex
< 2*vLumBufSize
);
2789 assert(lastInLumBuf
+ 1 - srcSliceY
< srcSliceH
);
2790 assert(lastInLumBuf
+ 1 - srcSliceY
>= 0);
2791 //printf("%d %d\n", lumBufIndex, vLumBufSize);
2792 RENAME(hyscale
)(c
, lumPixBuf
[ lumBufIndex
], dstW
, s
, srcW
, lumXInc
,
2793 flags
, canMMX2BeUsed
, hLumFilter
, hLumFilterPos
, hLumFilterSize
,
2794 funnyYCode
, c
->srcFormat
, formatConvBuffer
,
2795 c
->lumMmx2Filter
, c
->lumMmx2FilterPos
, pal
);
2798 while(lastInChrBuf
< lastChrSrcY
)
2800 uint8_t *src1
= src
[1]+(lastInChrBuf
+ 1 - chrSrcSliceY
)*srcStride
[1];
2801 uint8_t *src2
= src
[2]+(lastInChrBuf
+ 1 - chrSrcSliceY
)*srcStride
[2];
2803 assert(chrBufIndex
< 2*vChrBufSize
);
2804 assert(lastInChrBuf
+ 1 - chrSrcSliceY
< (chrSrcSliceH
));
2805 assert(lastInChrBuf
+ 1 - chrSrcSliceY
>= 0);
2806 //FIXME replace parameters through context struct (some at least)
2808 if (!(isGray(srcFormat
) || isGray(dstFormat
)))
2809 RENAME(hcscale
)(c
, chrPixBuf
[ chrBufIndex
], chrDstW
, src1
, src2
, chrSrcW
, chrXInc
,
2810 flags
, canMMX2BeUsed
, hChrFilter
, hChrFilterPos
, hChrFilterSize
,
2811 funnyUVCode
, c
->srcFormat
, formatConvBuffer
,
2812 c
->chrMmx2Filter
, c
->chrMmx2FilterPos
, pal
);
2815 //wrap buf index around to stay inside the ring buffer
2816 if (lumBufIndex
>= vLumBufSize
) lumBufIndex
-= vLumBufSize
;
2817 if (chrBufIndex
>= vChrBufSize
) chrBufIndex
-= vChrBufSize
;
2819 else // not enough lines left in this slice -> load the rest in the buffer
2821 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
2822 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
2823 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
2824 vChrBufSize, vLumBufSize);*/
2826 //Do horizontal scaling
2827 while(lastInLumBuf
+1 < srcSliceY
+ srcSliceH
)
2829 uint8_t *s
= src
[0]+(lastInLumBuf
+ 1 - srcSliceY
)*srcStride
[0];
2831 assert(lumBufIndex
< 2*vLumBufSize
);
2832 assert(lastInLumBuf
+ 1 - srcSliceY
< srcSliceH
);
2833 assert(lastInLumBuf
+ 1 - srcSliceY
>= 0);
2834 RENAME(hyscale
)(c
, lumPixBuf
[ lumBufIndex
], dstW
, s
, srcW
, lumXInc
,
2835 flags
, canMMX2BeUsed
, hLumFilter
, hLumFilterPos
, hLumFilterSize
,
2836 funnyYCode
, c
->srcFormat
, formatConvBuffer
,
2837 c
->lumMmx2Filter
, c
->lumMmx2FilterPos
, pal
);
2840 while(lastInChrBuf
+1 < (chrSrcSliceY
+ chrSrcSliceH
))
2842 uint8_t *src1
= src
[1]+(lastInChrBuf
+ 1 - chrSrcSliceY
)*srcStride
[1];
2843 uint8_t *src2
= src
[2]+(lastInChrBuf
+ 1 - chrSrcSliceY
)*srcStride
[2];
2845 assert(chrBufIndex
< 2*vChrBufSize
);
2846 assert(lastInChrBuf
+ 1 - chrSrcSliceY
< chrSrcSliceH
);
2847 assert(lastInChrBuf
+ 1 - chrSrcSliceY
>= 0);
2849 if (!(isGray(srcFormat
) || isGray(dstFormat
)))
2850 RENAME(hcscale
)(c
, chrPixBuf
[ chrBufIndex
], chrDstW
, src1
, src2
, chrSrcW
, chrXInc
,
2851 flags
, canMMX2BeUsed
, hChrFilter
, hChrFilterPos
, hChrFilterSize
,
2852 funnyUVCode
, c
->srcFormat
, formatConvBuffer
,
2853 c
->chrMmx2Filter
, c
->chrMmx2FilterPos
, pal
);
2856 //wrap buf index around to stay inside the ring buffer
2857 if (lumBufIndex
>= vLumBufSize
) lumBufIndex
-= vLumBufSize
;
2858 if (chrBufIndex
>= vChrBufSize
) chrBufIndex
-= vChrBufSize
;
2859 break; //we can't output a dstY line so let's try with the next slice
2863 c
->blueDither
= ff_dither8
[dstY
&1];
2864 if (c
->dstFormat
== PIX_FMT_RGB555
|| c
->dstFormat
== PIX_FMT_BGR555
)
2865 c
->greenDither
= ff_dither8
[dstY
&1];
2867 c
->greenDither
= ff_dither4
[dstY
&1];
2868 c
->redDither
= ff_dither8
[(dstY
+1)&1];
2872 int16_t **lumSrcPtr
= lumPixBuf
+ lumBufIndex
+ firstLumSrcY
- lastInLumBuf
+ vLumBufSize
;
2873 int16_t **chrSrcPtr
= chrPixBuf
+ chrBufIndex
+ firstChrSrcY
- lastInChrBuf
+ vChrBufSize
;
2876 if (flags
& SWS_ACCURATE_RND
){
2877 int s
= APCK_SIZE
/ 8;
2878 for (i
=0; i
<vLumFilterSize
; i
+=2){
2879 *(void**)&lumMmxFilter
[s
*i
]= lumSrcPtr
[i
];
2880 *(void**)&lumMmxFilter
[s
*i
+APCK_PTR2
/4 ]= lumSrcPtr
[i
+(vLumFilterSize
>1)];
2881 lumMmxFilter
[s
*i
+APCK_COEF
/4 ]=
2882 lumMmxFilter
[s
*i
+APCK_COEF
/4+1]= vLumFilter
[dstY
*vLumFilterSize
+ i
]
2883 + (vLumFilterSize
>1 ? vLumFilter
[dstY
*vLumFilterSize
+ i
+ 1]<<16 : 0);
2885 for (i
=0; i
<vChrFilterSize
; i
+=2){
2886 *(void**)&chrMmxFilter
[s
*i
]= chrSrcPtr
[i
];
2887 *(void**)&chrMmxFilter
[s
*i
+APCK_PTR2
/4 ]= chrSrcPtr
[i
+(vChrFilterSize
>1)];
2888 chrMmxFilter
[s
*i
+APCK_COEF
/4 ]=
2889 chrMmxFilter
[s
*i
+APCK_COEF
/4+1]= vChrFilter
[chrDstY
*vChrFilterSize
+ i
]
2890 + (vChrFilterSize
>1 ? vChrFilter
[chrDstY
*vChrFilterSize
+ i
+ 1]<<16 : 0);
2893 for (i
=0; i
<vLumFilterSize
; i
++)
2895 lumMmxFilter
[4*i
+0]= (int32_t)lumSrcPtr
[i
];
2896 lumMmxFilter
[4*i
+1]= (uint64_t)lumSrcPtr
[i
] >> 32;
2897 lumMmxFilter
[4*i
+2]=
2898 lumMmxFilter
[4*i
+3]=
2899 ((uint16_t)vLumFilter
[dstY
*vLumFilterSize
+ i
])*0x10001;
2901 for (i
=0; i
<vChrFilterSize
; i
++)
2903 chrMmxFilter
[4*i
+0]= (int32_t)chrSrcPtr
[i
];
2904 chrMmxFilter
[4*i
+1]= (uint64_t)chrSrcPtr
[i
] >> 32;
2905 chrMmxFilter
[4*i
+2]=
2906 chrMmxFilter
[4*i
+3]=
2907 ((uint16_t)vChrFilter
[chrDstY
*vChrFilterSize
+ i
])*0x10001;
2911 if (dstFormat
== PIX_FMT_NV12
|| dstFormat
== PIX_FMT_NV21
){
2912 const int chrSkipMask
= (1<<c
->chrDstVSubSample
)-1;
2913 if (dstY
&chrSkipMask
) uDest
= NULL
; //FIXME split functions in lumi / chromi
2914 RENAME(yuv2nv12X
)(c
,
2915 vLumFilter
+dstY
*vLumFilterSize
, lumSrcPtr
, vLumFilterSize
,
2916 vChrFilter
+chrDstY
*vChrFilterSize
, chrSrcPtr
, vChrFilterSize
,
2917 dest
, uDest
, dstW
, chrDstW
, dstFormat
);
2919 else if (isPlanarYUV(dstFormat
) || dstFormat
==PIX_FMT_GRAY8
) //YV12 like
2921 const int chrSkipMask
= (1<<c
->chrDstVSubSample
)-1;
2922 if ((dstY
&chrSkipMask
) || isGray(dstFormat
)) uDest
=vDest
= NULL
; //FIXME split functions in lumi / chromi
2923 if (vLumFilterSize
== 1 && vChrFilterSize
== 1) // unscaled YV12
2925 int16_t *lumBuf
= lumPixBuf
[0];
2926 int16_t *chrBuf
= chrPixBuf
[0];
2927 RENAME(yuv2yuv1
)(c
, lumBuf
, chrBuf
, dest
, uDest
, vDest
, dstW
, chrDstW
);
2932 vLumFilter
+dstY
*vLumFilterSize
, lumSrcPtr
, vLumFilterSize
,
2933 vChrFilter
+chrDstY
*vChrFilterSize
, chrSrcPtr
, vChrFilterSize
,
2934 dest
, uDest
, vDest
, dstW
, chrDstW
);
2939 assert(lumSrcPtr
+ vLumFilterSize
- 1 < lumPixBuf
+ vLumBufSize
*2);
2940 assert(chrSrcPtr
+ vChrFilterSize
- 1 < chrPixBuf
+ vChrBufSize
*2);
2941 if (vLumFilterSize
== 1 && vChrFilterSize
== 2) //unscaled RGB
2943 int chrAlpha
= vChrFilter
[2*dstY
+1];
2944 if(flags
& SWS_FULL_CHR_H_INT
){
2945 yuv2rgbXinC_full(c
, //FIXME write a packed1_full function
2946 vLumFilter
+dstY
*vLumFilterSize
, lumSrcPtr
, vLumFilterSize
,
2947 vChrFilter
+dstY
*vChrFilterSize
, chrSrcPtr
, vChrFilterSize
,
2950 RENAME(yuv2packed1
)(c
, *lumSrcPtr
, *chrSrcPtr
, *(chrSrcPtr
+1),
2951 dest
, dstW
, chrAlpha
, dstFormat
, flags
, dstY
);
2954 else if (vLumFilterSize
== 2 && vChrFilterSize
== 2) //bilinear upscale RGB
2956 int lumAlpha
= vLumFilter
[2*dstY
+1];
2957 int chrAlpha
= vChrFilter
[2*dstY
+1];
2959 lumMmxFilter
[3]= vLumFilter
[2*dstY
]*0x10001;
2961 chrMmxFilter
[3]= vChrFilter
[2*chrDstY
]*0x10001;
2962 if(flags
& SWS_FULL_CHR_H_INT
){
2963 yuv2rgbXinC_full(c
, //FIXME write a packed2_full function
2964 vLumFilter
+dstY
*vLumFilterSize
, lumSrcPtr
, vLumFilterSize
,
2965 vChrFilter
+dstY
*vChrFilterSize
, chrSrcPtr
, vChrFilterSize
,
2968 RENAME(yuv2packed2
)(c
, *lumSrcPtr
, *(lumSrcPtr
+1), *chrSrcPtr
, *(chrSrcPtr
+1),
2969 dest
, dstW
, lumAlpha
, chrAlpha
, dstY
);
2974 if(flags
& SWS_FULL_CHR_H_INT
){
2976 vLumFilter
+dstY
*vLumFilterSize
, lumSrcPtr
, vLumFilterSize
,
2977 vChrFilter
+dstY
*vChrFilterSize
, chrSrcPtr
, vChrFilterSize
,
2980 RENAME(yuv2packedX
)(c
,
2981 vLumFilter
+dstY
*vLumFilterSize
, lumSrcPtr
, vLumFilterSize
,
2982 vChrFilter
+dstY
*vChrFilterSize
, chrSrcPtr
, vChrFilterSize
,
2988 else // hmm looks like we can't use MMX here without overwriting this array's tail
2990 int16_t **lumSrcPtr
= lumPixBuf
+ lumBufIndex
+ firstLumSrcY
- lastInLumBuf
+ vLumBufSize
;
2991 int16_t **chrSrcPtr
= chrPixBuf
+ chrBufIndex
+ firstChrSrcY
- lastInChrBuf
+ vChrBufSize
;
2992 if (dstFormat
== PIX_FMT_NV12
|| dstFormat
== PIX_FMT_NV21
){
2993 const int chrSkipMask
= (1<<c
->chrDstVSubSample
)-1;
2994 if (dstY
&chrSkipMask
) uDest
= NULL
; //FIXME split functions in lumi / chromi
2996 vLumFilter
+dstY
*vLumFilterSize
, lumSrcPtr
, vLumFilterSize
,
2997 vChrFilter
+chrDstY
*vChrFilterSize
, chrSrcPtr
, vChrFilterSize
,
2998 dest
, uDest
, dstW
, chrDstW
, dstFormat
);
3000 else if (isPlanarYUV(dstFormat
) || dstFormat
==PIX_FMT_GRAY8
) //YV12
3002 const int chrSkipMask
= (1<<c
->chrDstVSubSample
)-1;
3003 if ((dstY
&chrSkipMask
) || isGray(dstFormat
)) uDest
=vDest
= NULL
; //FIXME split functions in lumi / chromi
3005 vLumFilter
+dstY
*vLumFilterSize
, lumSrcPtr
, vLumFilterSize
,
3006 vChrFilter
+chrDstY
*vChrFilterSize
, chrSrcPtr
, vChrFilterSize
,
3007 dest
, uDest
, vDest
, dstW
, chrDstW
);
3011 assert(lumSrcPtr
+ vLumFilterSize
- 1 < lumPixBuf
+ vLumBufSize
*2);
3012 assert(chrSrcPtr
+ vChrFilterSize
- 1 < chrPixBuf
+ vChrBufSize
*2);
3013 if(flags
& SWS_FULL_CHR_H_INT
){
3015 vLumFilter
+dstY
*vLumFilterSize
, lumSrcPtr
, vLumFilterSize
,
3016 vChrFilter
+dstY
*vChrFilterSize
, chrSrcPtr
, vChrFilterSize
,
3020 vLumFilter
+dstY
*vLumFilterSize
, lumSrcPtr
, vLumFilterSize
,
3021 vChrFilter
+dstY
*vChrFilterSize
, chrSrcPtr
, vChrFilterSize
,
3029 __asm__
volatile(SFENCE:::"memory");
3030 __asm__
volatile(EMMS:::"memory");
3032 /* store changed local vars back in the context */
3034 c
->lumBufIndex
= lumBufIndex
;
3035 c
->chrBufIndex
= chrBufIndex
;
3036 c
->lastInLumBuf
= lastInLumBuf
;
3037 c
->lastInChrBuf
= lastInChrBuf
;
3039 return dstY
- lastDstY
;