2 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 * The C code (not assembly, MMX, ...) of this file can be used
21 * under the LGPL license.
33 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
/*
 * CPU-capability-dependent instruction aliases.
 * NOTE(review): each macro below appears defined two or three times; the
 * selecting #if/#elif/#else preprocessor lines (choosing between 3DNow!,
 * MMX2 and plain-MMX variants) were lost in extraction — gaps in the
 * embedded original line numbers confirm this. Verify against upstream
 * before editing.
 */
/* 3DNow! variant: prefetch/prefetchw. */
40 #define PREFETCH "prefetch"
41 #define PREFETCHW "prefetchw"
/* MMX2/SSE variant: non-temporal / temporal prefetch hints. */
43 #define PREFETCH "prefetchnta"
44 #define PREFETCHW "prefetcht0"
/* Fallback: no prefetch support, emit a comment-only no-op. */
46 #define PREFETCH " # nop"
47 #define PREFETCHW " # nop"
/* Store fence after non-temporal stores (MMX2+); no-op otherwise. */
51 #define SFENCE "sfence"
53 #define SFENCE " # nop"
/* Packed byte average: pavgb (MMX2) vs pavgusb (3DNow!). */
57 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
59 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
/* Non-temporal quadword store when available, plain movq otherwise. */
63 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
65 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
/* Indirection so macro arguments are expanded before stringification. */
67 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
70 #include "swscale_altivec_template.c"
/*
 * Multi-tap vertical scaler: accumulates pmulhw-weighted 16-bit source rows
 * (filter list at `offset` inside the context at %0, terminated by a NULL
 * source pointer) into mm3/mm4, then >>3 and packs to 8-bit `dest`.
 * NOTE(review): the __asm__ volatile( opener, the "1:" loop label and the
 * jnz/jb branch lines are missing here (gaps at original lines 74/81/92/103)
 * — extraction loss, confirm against upstream before modifying.
 */
73 #define YSCALEYUV2YV12X(x, offset, dest, width) \
75 "xor %%"REG_a", %%"REG_a" \n\t"\
76 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
77 "movq %%mm3, %%mm4 \n\t"\
78 "lea " offset "(%0), %%"REG_d" \n\t"\
79 "mov (%%"REG_d"), %%"REG_S" \n\t"\
80 ASMALIGN(4) /* FIXME Unroll? */\
82 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
83 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
84 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
85 "add $16, %%"REG_d" \n\t"\
86 "mov (%%"REG_d"), %%"REG_S" \n\t"\
87 "test %%"REG_S", %%"REG_S" \n\t"\
88 "pmulhw %%mm0, %%mm2 \n\t"\
89 "pmulhw %%mm0, %%mm5 \n\t"\
90 "paddw %%mm2, %%mm3 \n\t"\
91 "paddw %%mm5, %%mm4 \n\t"\
93 "psraw $3, %%mm3 \n\t"\
94 "psraw $3, %%mm4 \n\t"\
95 "packuswb %%mm4, %%mm3 \n\t"\
96 MOVNTQ(%%mm3, (%1, %%REGa))\
97 "add $8, %%"REG_a" \n\t"\
98 "cmp %2, %%"REG_a" \n\t"\
99 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
100 "movq %%mm3, %%mm4 \n\t"\
101 "lea " offset "(%0), %%"REG_d" \n\t"\
102 "mov (%%"REG_d"), %%"REG_S" \n\t"\
104 :: "r" (&c->redDither),\
105 "r" (dest), "g" (width)\
106 : "%"REG_a, "%"REG_d, "%"REG_S\
/*
 * Higher-precision variant of YSCALEYUV2YV12X: uses pmaddwd into 32-bit
 * accumulators (mm4-mm7) with interleaved coefficient pairs (APCK_* layout),
 * then >>16, packs to words, adds the rounder and >>3 to 8-bit `dest`.
 * NOTE(review): asm opener, loop labels and jnz/jb branches stripped in
 * extraction (number gaps at original 110/118-119/143/165) — verify upstream.
 */
109 #define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
111 "lea " offset "(%0), %%"REG_d" \n\t"\
112 "xor %%"REG_a", %%"REG_a" \n\t"\
113 "pxor %%mm4, %%mm4 \n\t"\
114 "pxor %%mm5, %%mm5 \n\t"\
115 "pxor %%mm6, %%mm6 \n\t"\
116 "pxor %%mm7, %%mm7 \n\t"\
117 "mov (%%"REG_d"), %%"REG_S" \n\t"\
120 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
121 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
122 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
123 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
124 "movq %%mm0, %%mm3 \n\t"\
125 "punpcklwd %%mm1, %%mm0 \n\t"\
126 "punpckhwd %%mm1, %%mm3 \n\t"\
127 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
128 "pmaddwd %%mm1, %%mm0 \n\t"\
129 "pmaddwd %%mm1, %%mm3 \n\t"\
130 "paddd %%mm0, %%mm4 \n\t"\
131 "paddd %%mm3, %%mm5 \n\t"\
132 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
133 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
134 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
135 "test %%"REG_S", %%"REG_S" \n\t"\
136 "movq %%mm2, %%mm0 \n\t"\
137 "punpcklwd %%mm3, %%mm2 \n\t"\
138 "punpckhwd %%mm3, %%mm0 \n\t"\
139 "pmaddwd %%mm1, %%mm2 \n\t"\
140 "pmaddwd %%mm1, %%mm0 \n\t"\
141 "paddd %%mm2, %%mm6 \n\t"\
142 "paddd %%mm0, %%mm7 \n\t"\
144 "psrad $16, %%mm4 \n\t"\
145 "psrad $16, %%mm5 \n\t"\
146 "psrad $16, %%mm6 \n\t"\
147 "psrad $16, %%mm7 \n\t"\
148 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
149 "packssdw %%mm5, %%mm4 \n\t"\
150 "packssdw %%mm7, %%mm6 \n\t"\
151 "paddw %%mm0, %%mm4 \n\t"\
152 "paddw %%mm0, %%mm6 \n\t"\
153 "psraw $3, %%mm4 \n\t"\
154 "psraw $3, %%mm6 \n\t"\
155 "packuswb %%mm6, %%mm4 \n\t"\
156 MOVNTQ(%%mm4, (%1, %%REGa))\
157 "add $8, %%"REG_a" \n\t"\
158 "cmp %2, %%"REG_a" \n\t"\
159 "lea " offset "(%0), %%"REG_d" \n\t"\
160 "pxor %%mm4, %%mm4 \n\t"\
161 "pxor %%mm5, %%mm5 \n\t"\
162 "pxor %%mm6, %%mm6 \n\t"\
163 "pxor %%mm7, %%mm7 \n\t"\
164 "mov (%%"REG_d"), %%"REG_S" \n\t"\
166 :: "r" (&c->redDither),\
167 "r" (dest), "m" (dummy), "m" (dummy), "m" (dummy)\
168 : "%"REG_a, "%"REG_d, "%"REG_S\
/*
 * Single-source (1-tap) plane store: reads 16-bit samples from %0, >>7 to
 * 8-bit range, packs and stores 8 pixels per iteration to %1 (truncating —
 * see the _ACCURATE variant below for the rounding version).
 * NOTE(review): the asm opener and loop label/branch lines are missing
 * (gaps at original 174/182) — extraction loss.
 */
171 #define YSCALEYUV2YV121 \
172 "mov %2, %%"REG_a" \n\t"\
173 ASMALIGN(4) /* FIXME Unroll? */\
175 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
176 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
177 "psraw $7, %%mm0 \n\t"\
178 "psraw $7, %%mm1 \n\t"\
179 "packuswb %%mm1, %%mm0 \n\t"\
180 MOVNTQ(%%mm0, (%1, %%REGa))\
181 "add $8, %%"REG_a" \n\t"\
/*
 * Rounding variant of YSCALEYUV2YV121: builds the constant 64 (1<<6) in mm7
 * via pcmpeqw/psrlw/psllw and adds it with saturation before the >>7, so the
 * 16-bit -> 8-bit conversion rounds to nearest instead of truncating.
 * NOTE(review): asm opener and loop label/branch lines stripped (gaps at
 * original 190/200) — extraction loss.
 */
184 #define YSCALEYUV2YV121_ACCURATE \
185 "mov %2, %%"REG_a" \n\t"\
186 "pcmpeqw %%mm7, %%mm7 \n\t"\
187 "psrlw $15, %%mm7 \n\t"\
188 "psllw $6, %%mm7 \n\t"\
189 ASMALIGN(4) /* FIXME Unroll? */\
191 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
192 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
193 "paddsw %%mm7, %%mm0 \n\t"\
194 "paddsw %%mm7, %%mm1 \n\t"\
195 "psraw $7, %%mm0 \n\t"\
196 "psraw $7, %%mm1 \n\t"\
197 "packuswb %%mm1, %%mm0 \n\t"\
198 MOVNTQ(%%mm0, (%1, %%REGa))\
199 "add $8, %%"REG_a" \n\t"\
203 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
204 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
205 "r" (dest), "m" (dstW),
206 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
207 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
/*
 * Chroma half of the packed-output multi-tap scaler: walks the chroma filter
 * list at CHR_MMX_FILTER_OFFSET, accumulating weighted U (mm3) and V (mm4)
 * samples; U data at (src, idx), V data VOF bytes further on. Ends with a
 * test of the next source pointer (loop branch line was stripped).
 * NOTE(review): asm opener, labels and branches missing (gaps at original
 * 210/212-214/219-220) — extraction loss.
 */
209 #define YSCALEYUV2PACKEDX_UV \
211 "xor %%"REG_a", %%"REG_a" \n\t"\
215 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
216 "mov (%%"REG_d"), %%"REG_S" \n\t"\
217 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
218 "movq %%mm3, %%mm4 \n\t"\
221 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
222 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
223 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
224 "add $16, %%"REG_d" \n\t"\
225 "mov (%%"REG_d"), %%"REG_S" \n\t"\
226 "pmulhw %%mm0, %%mm2 \n\t"\
227 "pmulhw %%mm0, %%mm5 \n\t"\
228 "paddw %%mm2, %%mm3 \n\t"\
229 "paddw %%mm5, %%mm4 \n\t"\
230 "test %%"REG_S", %%"REG_S" \n\t"\
/*
 * Luma half of the packed-output scaler: same accumulate-over-filter-taps
 * pattern as the UV half, producing Y1 in mm1 and Y2 in mm7 from the filter
 * list at `offset`. YSCALEYUV2PACKEDX glues UV + luma together;
 * YSCALEYUV2PACKEDX_END supplies the asm operand/clobber lists.
 * NOTE(review): loop labels and branch lines stripped (number gaps) —
 * extraction loss; verify upstream.
 */
233 #define YSCALEYUV2PACKEDX_YA(offset) \
234 "lea "offset"(%0), %%"REG_d" \n\t"\
235 "mov (%%"REG_d"), %%"REG_S" \n\t"\
236 "movq "VROUNDER_OFFSET"(%0), %%mm1 \n\t"\
237 "movq %%mm1, %%mm7 \n\t"\
240 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
241 "movq (%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y1srcData */\
242 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\
243 "add $16, %%"REG_d" \n\t"\
244 "mov (%%"REG_d"), %%"REG_S" \n\t"\
245 "pmulhw %%mm0, %%mm2 \n\t"\
246 "pmulhw %%mm0, %%mm5 \n\t"\
247 "paddw %%mm2, %%mm1 \n\t"\
248 "paddw %%mm5, %%mm7 \n\t"\
249 "test %%"REG_S", %%"REG_S" \n\t"\
252 #define YSCALEYUV2PACKEDX \
253 YSCALEYUV2PACKEDX_UV \
254 YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET) \
/* Shared tail: operands (%0 = context, %4 = dest, %5 = width) and clobbers. */
256 #define YSCALEYUV2PACKEDX_END \
257 :: "r" (&c->redDither), \
258 "m" (dummy), "m" (dummy), "m" (dummy),\
259 "r" (dest), "m" (dstW) \
260 : "%"REG_a, "%"REG_d, "%"REG_S \
/*
 * High-precision chroma accumulation for packed output: pmaddwd over
 * interleaved coefficient pairs (APCK_* layout) into 32-bit accumulators,
 * then >>16, pack to words, add rounder, and park results in the context's
 * U_TEMP/V_TEMP scratch slots for the luma pass to pick up.
 * NOTE(review): asm opener, labels and branches stripped (number gaps at
 * original 264/266-268/275-276/300) — extraction loss.
 */
263 #define YSCALEYUV2PACKEDX_ACCURATE_UV \
265 "xor %%"REG_a", %%"REG_a" \n\t"\
269 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
270 "mov (%%"REG_d"), %%"REG_S" \n\t"\
271 "pxor %%mm4, %%mm4 \n\t"\
272 "pxor %%mm5, %%mm5 \n\t"\
273 "pxor %%mm6, %%mm6 \n\t"\
274 "pxor %%mm7, %%mm7 \n\t"\
277 "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
278 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
279 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
280 "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
281 "movq %%mm0, %%mm3 \n\t"\
282 "punpcklwd %%mm1, %%mm0 \n\t"\
283 "punpckhwd %%mm1, %%mm3 \n\t"\
284 "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
285 "pmaddwd %%mm1, %%mm0 \n\t"\
286 "pmaddwd %%mm1, %%mm3 \n\t"\
287 "paddd %%mm0, %%mm4 \n\t"\
288 "paddd %%mm3, %%mm5 \n\t"\
289 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
290 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
291 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
292 "test %%"REG_S", %%"REG_S" \n\t"\
293 "movq %%mm2, %%mm0 \n\t"\
294 "punpcklwd %%mm3, %%mm2 \n\t"\
295 "punpckhwd %%mm3, %%mm0 \n\t"\
296 "pmaddwd %%mm1, %%mm2 \n\t"\
297 "pmaddwd %%mm1, %%mm0 \n\t"\
298 "paddd %%mm2, %%mm6 \n\t"\
299 "paddd %%mm0, %%mm7 \n\t"\
301 "psrad $16, %%mm4 \n\t"\
302 "psrad $16, %%mm5 \n\t"\
303 "psrad $16, %%mm6 \n\t"\
304 "psrad $16, %%mm7 \n\t"\
305 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
306 "packssdw %%mm5, %%mm4 \n\t"\
307 "packssdw %%mm7, %%mm6 \n\t"\
308 "paddw %%mm0, %%mm4 \n\t"\
309 "paddw %%mm0, %%mm6 \n\t"\
310 "movq %%mm4, "U_TEMP"(%0) \n\t"\
311 "movq %%mm6, "V_TEMP"(%0) \n\t"\
/*
 * High-precision luma accumulation: same pmaddwd scheme, producing Y1 in mm1
 * and Y2 in mm7, then reloading the parked U/V words from U_TEMP/V_TEMP into
 * mm3/mm4 so the register layout matches what YSCALEYUV2RGBX expects.
 * YSCALEYUV2PACKEDX_ACCURATE combines the UV and luma halves.
 * NOTE(review): loop labels/branches stripped (number gaps at original
 * 320-321/345) — extraction loss.
 */
313 #define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
314 "lea "offset"(%0), %%"REG_d" \n\t"\
315 "mov (%%"REG_d"), %%"REG_S" \n\t"\
316 "pxor %%mm1, %%mm1 \n\t"\
317 "pxor %%mm5, %%mm5 \n\t"\
318 "pxor %%mm7, %%mm7 \n\t"\
319 "pxor %%mm6, %%mm6 \n\t"\
322 "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
323 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
324 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
325 "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
326 "movq %%mm0, %%mm3 \n\t"\
327 "punpcklwd %%mm4, %%mm0 \n\t"\
328 "punpckhwd %%mm4, %%mm3 \n\t"\
329 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
330 "pmaddwd %%mm4, %%mm0 \n\t"\
331 "pmaddwd %%mm4, %%mm3 \n\t"\
332 "paddd %%mm0, %%mm1 \n\t"\
333 "paddd %%mm3, %%mm5 \n\t"\
334 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
335 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
336 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
337 "test %%"REG_S", %%"REG_S" \n\t"\
338 "movq %%mm2, %%mm0 \n\t"\
339 "punpcklwd %%mm3, %%mm2 \n\t"\
340 "punpckhwd %%mm3, %%mm0 \n\t"\
341 "pmaddwd %%mm4, %%mm2 \n\t"\
342 "pmaddwd %%mm4, %%mm0 \n\t"\
343 "paddd %%mm2, %%mm7 \n\t"\
344 "paddd %%mm0, %%mm6 \n\t"\
346 "psrad $16, %%mm1 \n\t"\
347 "psrad $16, %%mm5 \n\t"\
348 "psrad $16, %%mm7 \n\t"\
349 "psrad $16, %%mm6 \n\t"\
350 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
351 "packssdw %%mm5, %%mm1 \n\t"\
352 "packssdw %%mm6, %%mm7 \n\t"\
353 "paddw %%mm0, %%mm1 \n\t"\
354 "paddw %%mm0, %%mm7 \n\t"\
355 "movq "U_TEMP"(%0), %%mm3 \n\t"\
356 "movq "V_TEMP"(%0), %%mm4 \n\t"\
358 #define YSCALEYUV2PACKEDX_ACCURATE \
359 YSCALEYUV2PACKEDX_ACCURATE_UV \
360 YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
/*
 * YUV -> RGB color-space conversion stage (context-relative coefficients at
 * %0). Input: Y1 in mm1, Y2 in mm7, U in mm3, V in mm4 (as produced by the
 * PACKEDX macros). Subtracts the U/V/Y offsets, applies the fixed-point
 * coefficient multiplies via pmulhw, interleaves per-pixel values, and packs
 * to unsigned bytes: B in mm2/mm0, G in mm4/mm3, R in mm5/mm6.
 */
362 #define YSCALEYUV2RGBX \
363 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
364 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
365 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
366 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
367 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
368 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
369 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
370 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
371 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
372 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
373 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
374 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
375 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
376 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
377 "paddw %%mm3, %%mm4 \n\t"\
378 "movq %%mm2, %%mm0 \n\t"\
379 "movq %%mm5, %%mm6 \n\t"\
380 "movq %%mm4, %%mm3 \n\t"\
381 "punpcklwd %%mm2, %%mm2 \n\t"\
382 "punpcklwd %%mm5, %%mm5 \n\t"\
383 "punpcklwd %%mm4, %%mm4 \n\t"\
384 "paddw %%mm1, %%mm2 \n\t"\
385 "paddw %%mm1, %%mm5 \n\t"\
386 "paddw %%mm1, %%mm4 \n\t"\
387 "punpckhwd %%mm0, %%mm0 \n\t"\
388 "punpckhwd %%mm6, %%mm6 \n\t"\
389 "punpckhwd %%mm3, %%mm3 \n\t"\
390 "paddw %%mm7, %%mm0 \n\t"\
391 "paddw %%mm7, %%mm6 \n\t"\
392 "paddw %%mm7, %%mm3 \n\t"\
393 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
394 "packuswb %%mm0, %%mm2 \n\t"\
395 "packuswb %%mm6, %%mm5 \n\t"\
396 "packuswb %%mm3, %%mm4 \n\t"\
/*
 * Two-row (bilinear) vertical interpolation for packed YUV output:
 * pre-shifts the stored chroma/luma blend coefficients by 3 in the context,
 * then blends uvbuf0/uvbuf1 (chroma, V at +VOF) and buf0/buf1 (luma) with
 * pmulhw on the differences. Results: U mm3, V mm4, Y1 mm1, Y2 mm7.
 * NOTE(review): asm loop label and trailing lines stripped (number gaps at
 * original 406-407/433) — extraction loss.
 */
398 #define REAL_YSCALEYUV2PACKED(index, c) \
399 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
400 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
401 "psraw $3, %%mm0 \n\t"\
402 "psraw $3, %%mm1 \n\t"\
403 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
404 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
405 "xor "#index", "#index" \n\t"\
408 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
409 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
410 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
411 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
412 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
413 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
414 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
415 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
416 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
417 "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
418 "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
419 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
420 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
421 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
422 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
423 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
424 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
425 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
426 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
427 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
428 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
429 "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
430 "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
431 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
432 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
434 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
/*
 * Chroma part of the two-row YUV->RGB path: blends uvbuf0/uvbuf1, subtracts
 * the 128 offset and computes the green contributions (mm3=ug, mm4=vg),
 * keeping the raw (U-128)/(V-128) copies in mm2/mm5 for the COEFF stage.
 * NOTE(review): loop label lines stripped (gap at original 438-439) —
 * extraction loss.
 */
436 #define REAL_YSCALEYUV2RGB_UV(index, c) \
437 "xor "#index", "#index" \n\t"\
440 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
441 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
442 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
443 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
444 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
445 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
446 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
447 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
448 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
449 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
450 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
451 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
452 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
453 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
454 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
455 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
456 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
457 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
458 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
459 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
/*
 * Luma part of the two-row YUV->RGB path: blends buf0/buf1 with the stored
 * luma coefficient (pmulhw on the difference), yielding Y1 in mm1 and Y2 in
 * mm7, at >>4 precision for the COEFF stage.
 */
461 #define REAL_YSCALEYUV2RGB_YA(index, c) \
462 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
463 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
464 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
465 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
466 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
467 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
468 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
469 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
470 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
471 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
472 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
473 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
/*
 * Coefficient/pack stage shared by the two-row RGB paths: same math as the
 * tail of YSCALEYUV2RGBX, but with offsets/coefficients addressed relative
 * to the context register "#c" instead of %0. Output bytes: B in mm2/mm0,
 * G in mm4/mm3, R in mm5/mm6. YSCALEYUV2RGB chains UV + YA + COEFF.
 */
475 #define REAL_YSCALEYUV2RGB_COEFF(c) \
476 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
477 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
478 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
479 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
480 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
481 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
482 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
483 "paddw %%mm3, %%mm4 \n\t"\
484 "movq %%mm2, %%mm0 \n\t"\
485 "movq %%mm5, %%mm6 \n\t"\
486 "movq %%mm4, %%mm3 \n\t"\
487 "punpcklwd %%mm2, %%mm2 \n\t"\
488 "punpcklwd %%mm5, %%mm5 \n\t"\
489 "punpcklwd %%mm4, %%mm4 \n\t"\
490 "paddw %%mm1, %%mm2 \n\t"\
491 "paddw %%mm1, %%mm5 \n\t"\
492 "paddw %%mm1, %%mm4 \n\t"\
493 "punpckhwd %%mm0, %%mm0 \n\t"\
494 "punpckhwd %%mm6, %%mm6 \n\t"\
495 "punpckhwd %%mm3, %%mm3 \n\t"\
496 "paddw %%mm7, %%mm0 \n\t"\
497 "paddw %%mm7, %%mm6 \n\t"\
498 "paddw %%mm7, %%mm3 \n\t"\
499 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
500 "packuswb %%mm0, %%mm2 \n\t"\
501 "packuswb %%mm6, %%mm5 \n\t"\
502 "packuswb %%mm3, %%mm4 \n\t"\
504 #define YSCALEYUV2RGB_YA(index, c) REAL_YSCALEYUV2RGB_YA(index, c)
506 #define YSCALEYUV2RGB(index, c) \
507 REAL_YSCALEYUV2RGB_UV(index, c) \
508 REAL_YSCALEYUV2RGB_YA(index, c) \
509 REAL_YSCALEYUV2RGB_COEFF(c)
/*
 * Single-row (no vertical blend) packed-YUV variant: reads one chroma row
 * (%2, V at +VOF) and one luma row (%0), shifting each down by 7 to 8-bit
 * range. Results: U mm3, V mm4, Y1 mm1, Y2 mm7.
 * NOTE(review): loop label lines stripped (gap at original 513-514).
 */
511 #define REAL_YSCALEYUV2PACKED1(index, c) \
512 "xor "#index", "#index" \n\t"\
515 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
516 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
517 "psraw $7, %%mm3 \n\t" \
518 "psraw $7, %%mm4 \n\t" \
519 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
520 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
521 "psraw $7, %%mm1 \n\t" \
522 "psraw $7, %%mm7 \n\t" \
524 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
/*
 * Single-row YUV->RGB: one chroma row and one luma row (no vertical blend),
 * >>4 precision, then the same offset/coefficient/interleave/pack sequence
 * as REAL_YSCALEYUV2RGB_COEFF. Output bytes: B mm2/mm0, G mm4/mm3, R mm5/mm6.
 * NOTE(review): loop label lines stripped (gap at original 528-529).
 */
526 #define REAL_YSCALEYUV2RGB1(index, c) \
527 "xor "#index", "#index" \n\t"\
530 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
531 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
532 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
533 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
534 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
535 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
536 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
537 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
538 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
539 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
540 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
541 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
542 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
543 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
544 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
545 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
546 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
547 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
548 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
549 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
550 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
551 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
552 "paddw %%mm3, %%mm4 \n\t"\
553 "movq %%mm2, %%mm0 \n\t"\
554 "movq %%mm5, %%mm6 \n\t"\
555 "movq %%mm4, %%mm3 \n\t"\
556 "punpcklwd %%mm2, %%mm2 \n\t"\
557 "punpcklwd %%mm5, %%mm5 \n\t"\
558 "punpcklwd %%mm4, %%mm4 \n\t"\
559 "paddw %%mm1, %%mm2 \n\t"\
560 "paddw %%mm1, %%mm5 \n\t"\
561 "paddw %%mm1, %%mm4 \n\t"\
562 "punpckhwd %%mm0, %%mm0 \n\t"\
563 "punpckhwd %%mm6, %%mm6 \n\t"\
564 "punpckhwd %%mm3, %%mm3 \n\t"\
565 "paddw %%mm7, %%mm0 \n\t"\
566 "paddw %%mm7, %%mm6 \n\t"\
567 "paddw %%mm7, %%mm3 \n\t"\
568 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
569 "packuswb %%mm0, %%mm2 \n\t"\
570 "packuswb %%mm6, %%mm5 \n\t"\
571 "packuswb %%mm3, %%mm4 \n\t"\
573 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
/*
 * "1b" packed variant: averages the two chroma rows (paddw then >>8) instead
 * of weighted blending — used when both rows contribute equally. Luma still
 * comes from a single row. Results: U mm3, V mm4, Y1 mm1, Y2 mm7.
 * NOTE(review): loop label lines stripped (gap at original 577-578).
 */
575 #define REAL_YSCALEYUV2PACKED1b(index, c) \
576 "xor "#index", "#index" \n\t"\
579 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
580 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
581 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
582 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
583 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
584 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
585 "psrlw $8, %%mm3 \n\t" \
586 "psrlw $8, %%mm4 \n\t" \
587 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
588 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
589 "psraw $7, %%mm1 \n\t" \
590 "psraw $7, %%mm7 \n\t"
591 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
593 // do vertical chrominance interpolation
593 // do vertical chrominance interpolation
/*
 * "1b" RGB variant: chroma is the average of two rows (paddw then >>5, at
 * the >>4 working precision), luma from a single row; then the standard
 * offset/coefficient/interleave/pack sequence. Output: B mm2/mm0, G mm4/mm3,
 * R mm5/mm6. NOTE(review): loop label lines stripped (gap at original
 * 596-597) — extraction loss.
 */
594 #define REAL_YSCALEYUV2RGB1b(index, c) \
595 "xor "#index", "#index" \n\t"\
598 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
599 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
600 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
601 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
602 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
603 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
604 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
605 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
606 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
607 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
608 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
609 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
610 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
611 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
612 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
613 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
614 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
615 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
616 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
617 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
618 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
619 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
620 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
621 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
622 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
623 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
624 "paddw %%mm3, %%mm4 \n\t"\
625 "movq %%mm2, %%mm0 \n\t"\
626 "movq %%mm5, %%mm6 \n\t"\
627 "movq %%mm4, %%mm3 \n\t"\
628 "punpcklwd %%mm2, %%mm2 \n\t"\
629 "punpcklwd %%mm5, %%mm5 \n\t"\
630 "punpcklwd %%mm4, %%mm4 \n\t"\
631 "paddw %%mm1, %%mm2 \n\t"\
632 "paddw %%mm1, %%mm5 \n\t"\
633 "paddw %%mm1, %%mm4 \n\t"\
634 "punpckhwd %%mm0, %%mm0 \n\t"\
635 "punpckhwd %%mm6, %%mm6 \n\t"\
636 "punpckhwd %%mm3, %%mm3 \n\t"\
637 "paddw %%mm7, %%mm0 \n\t"\
638 "paddw %%mm7, %%mm6 \n\t"\
639 "paddw %%mm7, %%mm3 \n\t"\
640 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
641 "packuswb %%mm0, %%mm2 \n\t"\
642 "packuswb %%mm6, %%mm5 \n\t"\
643 "packuswb %%mm3, %%mm4 \n\t"\
645 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
/*
 * Packs separate B/G/R/A byte registers into 8 interleaved 32-bit BGRA
 * pixels via punpcklbw/punpckhbw + punpcklwd/punpckhwd and stores them with
 * four MOVNTQs; advances index by 8 pixels and compares against dstw (the
 * conditional branch line was stripped — gap at original 660/665/668).
 */
647 #define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
648 "movq "#b", "#q2" \n\t" /* B */\
649 "movq "#r", "#t" \n\t" /* R */\
650 "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
651 "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
652 "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
653 "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
654 "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
655 "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
656 "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
657 "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
658 "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
659 "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
661 MOVNTQ( q0, (dst, index, 4))\
662 MOVNTQ( b, 8(dst, index, 4))\
663 MOVNTQ( q2, 16(dst, index, 4))\
664 MOVNTQ( q3, 24(dst, index, 4))\
666 "add $8, "#index" \n\t"\
667 "cmp "#dstw", "#index" \n\t"\
669 #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
/*
 * Converts B/G/R bytes (mm2/mm4/mm5, mm7 = zero) to RGB565: masks to 5/6/5
 * significant bits (bF8/bFC constants), shifts fields into place and ORs,
 * storing 8 pixels (16 bytes) per iteration. Loop-branch line stripped
 * (gaps at original 676/679/684/687/690/693/696).
 */
671 #define REAL_WRITERGB16(dst, dstw, index) \
672 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
673 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
674 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
675 "psrlq $3, %%mm2 \n\t"\
677 "movq %%mm2, %%mm1 \n\t"\
678 "movq %%mm4, %%mm3 \n\t"\
680 "punpcklbw %%mm7, %%mm3 \n\t"\
681 "punpcklbw %%mm5, %%mm2 \n\t"\
682 "punpckhbw %%mm7, %%mm4 \n\t"\
683 "punpckhbw %%mm5, %%mm1 \n\t"\
685 "psllq $3, %%mm3 \n\t"\
686 "psllq $3, %%mm4 \n\t"\
688 "por %%mm3, %%mm2 \n\t"\
689 "por %%mm4, %%mm1 \n\t"\
691 MOVNTQ(%%mm2, (dst, index, 2))\
692 MOVNTQ(%%mm1, 8(dst, index, 2))\
694 "add $8, "#index" \n\t"\
695 "cmp "#dstw", "#index" \n\t"\
697 #define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
/*
 * RGB555 sibling of WRITERGB16: all three channels masked to 5 bits (bF8),
 * R pre-shifted right by 1 so the final field shifts are $2 instead of $3;
 * otherwise the same interleave/OR/store structure. Loop-branch line
 * stripped (number gaps as in WRITERGB16).
 */
699 #define REAL_WRITERGB15(dst, dstw, index) \
700 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
701 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
702 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
703 "psrlq $3, %%mm2 \n\t"\
704 "psrlq $1, %%mm5 \n\t"\
706 "movq %%mm2, %%mm1 \n\t"\
707 "movq %%mm4, %%mm3 \n\t"\
709 "punpcklbw %%mm7, %%mm3 \n\t"\
710 "punpcklbw %%mm5, %%mm2 \n\t"\
711 "punpckhbw %%mm7, %%mm4 \n\t"\
712 "punpckhbw %%mm5, %%mm1 \n\t"\
714 "psllq $2, %%mm3 \n\t"\
715 "psllq $2, %%mm4 \n\t"\
717 "por %%mm3, %%mm2 \n\t"\
718 "por %%mm4, %%mm1 \n\t"\
720 MOVNTQ(%%mm2, (dst, index, 2))\
721 MOVNTQ(%%mm1, 8(dst, index, 2))\
723 "add $8, "#index" \n\t"\
724 "cmp "#dstw", "#index" \n\t"\
726 #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
/*
 * Legacy plain-MMX 24-bit packer: expands B/G/R to 0RGB dwords, then uses a
 * long shift/mask/or sequence (bm* mask constants) to squeeze four 0RGB
 * dwords into three packed RGB quadwords per 8 pixels. Superseded by
 * WRITEBGR24MMX/WRITEBGR24MMX2 below; kept for reference. Loop-branch line
 * stripped (number gaps).
 */
728 #define WRITEBGR24OLD(dst, dstw, index) \
729 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
730 "movq %%mm2, %%mm1 \n\t" /* B */\
731 "movq %%mm5, %%mm6 \n\t" /* R */\
732 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
733 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
734 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
735 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
736 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
737 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
738 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
739 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
740 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
741 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
743 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
744 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
745 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
746 "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
747 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
748 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
749 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
750 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
752 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
753 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
754 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
755 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
756 "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
757 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
758 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
759 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
760 "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
761 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
762 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
763 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
764 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
766 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
767 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
768 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
769 "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
770 "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
771 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
772 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
773 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
775 MOVNTQ(%%mm0, (dst))\
776 MOVNTQ(%%mm2, 8(dst))\
777 MOVNTQ(%%mm3, 16(dst))\
778 "add $24, "#dst" \n\t"\
780 "add $8, "#index" \n\t"\
781 "cmp "#dstw", "#index" \n\t"\
/*
 * Plain-MMX 24-bit packer: expands B/G/R to four 0RGB dwords, rotates each
 * into 0RGBRGB0 form with psllq/punpckhdq, then shifts and ORs neighbours
 * together so 8 pixels emit exactly three quadword stores (24 bytes).
 * Loop-branch line stripped (number gaps at original 798/803/808/813/819/
 * 825/830/832/835).
 */
784 #define WRITEBGR24MMX(dst, dstw, index) \
785 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
786 "movq %%mm2, %%mm1 \n\t" /* B */\
787 "movq %%mm5, %%mm6 \n\t" /* R */\
788 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
789 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
790 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
791 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
792 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
793 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
794 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
795 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
796 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
797 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
799 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
800 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
801 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
802 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
804 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
805 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
806 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
807 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
809 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
810 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
811 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
812 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
814 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
815 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
816 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
817 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
818 MOVNTQ(%%mm0, (dst))\
820 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
821 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
822 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
823 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
824 MOVNTQ(%%mm6, 8(dst))\
826 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
827 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
828 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
829 MOVNTQ(%%mm5, 16(dst))\
831 "add $24, "#dst" \n\t"\
833 "add $8, "#index" \n\t"\
834 "cmp "#dstw", "#index" \n\t"\
/*
 * MMX2 24-bit packer: uses pshufw to replicate channel bytes, masks them
 * with the ff_M24A/B/C byte-lane patterns and ORs three channel fragments
 * per output quadword — three stores per 8 pixels, fewer ops than the
 * plain-MMX path. NOTE(review): the two WRITEBGR24 selector defines at the
 * end appear without their #if HAVE_MMX2 / #else guards — those lines were
 * lost in extraction (number gaps); loop branches likewise stripped.
 */
837 #define WRITEBGR24MMX2(dst, dstw, index) \
838 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
839 "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
840 "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
841 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
842 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
843 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
845 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
846 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
847 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
849 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
850 "por %%mm1, %%mm6 \n\t"\
851 "por %%mm3, %%mm6 \n\t"\
852 MOVNTQ(%%mm6, (dst))\
854 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
855 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
856 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
857 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
859 "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
860 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
861 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
863 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
864 "por %%mm3, %%mm6 \n\t"\
865 MOVNTQ(%%mm6, 8(dst))\
867 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
868 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
869 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
871 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
872 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
873 "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
875 "por %%mm1, %%mm3 \n\t"\
876 "por %%mm3, %%mm6 \n\t"\
877 MOVNTQ(%%mm6, 16(dst))\
879 "add $24, "#dst" \n\t"\
881 "add $8, "#index" \n\t"\
882 "cmp "#dstw", "#index" \n\t"\
887 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
890 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
893 #define REAL_WRITEYUY2(dst, dstw, index) \
894 "packuswb %%mm3, %%mm3 \n\t"\
895 "packuswb %%mm4, %%mm4 \n\t"\
896 "packuswb %%mm7, %%mm1 \n\t"\
897 "punpcklbw %%mm4, %%mm3 \n\t"\
898 "movq %%mm1, %%mm7 \n\t"\
899 "punpcklbw %%mm3, %%mm1 \n\t"\
900 "punpckhbw %%mm3, %%mm7 \n\t"\
902 MOVNTQ(%%mm1, (dst, index, 2))\
903 MOVNTQ(%%mm7, 8(dst, index, 2))\
905 "add $8, "#index" \n\t"\
906 "cmp "#dstw", "#index" \n\t"\
908 #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
911 static inline void RENAME(yuv2yuvX
)(SwsContext
*c
, int16_t *lumFilter
, int16_t **lumSrc
, int lumFilterSize
,
912 int16_t *chrFilter
, int16_t **chrSrc
, int chrFilterSize
,
913 uint8_t *dest
, uint8_t *uDest
, uint8_t *vDest
, long dstW
, long chrDstW
)
916 if(!(c
->flags
& SWS_BITEXACT
)){
917 if (c
->flags
& SWS_ACCURATE_RND
){
919 YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET
, uDest
, chrDstW
)
920 YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF
), CHR_MMX_FILTER_OFFSET
, vDest
, chrDstW
)
923 YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET
, dest
, dstW
)
926 YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET
, uDest
, chrDstW
)
927 YSCALEYUV2YV12X(AV_STRINGIFY(VOF
), CHR_MMX_FILTER_OFFSET
, vDest
, chrDstW
)
930 YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET
, dest
, dstW
)
936 yuv2yuvX_altivec_real(lumFilter
, lumSrc
, lumFilterSize
,
937 chrFilter
, chrSrc
, chrFilterSize
,
938 dest
, uDest
, vDest
, dstW
, chrDstW
);
940 yuv2yuvXinC(lumFilter
, lumSrc
, lumFilterSize
,
941 chrFilter
, chrSrc
, chrFilterSize
,
942 dest
, uDest
, vDest
, dstW
, chrDstW
);
943 #endif //!HAVE_ALTIVEC
946 static inline void RENAME(yuv2nv12X
)(SwsContext
*c
, int16_t *lumFilter
, int16_t **lumSrc
, int lumFilterSize
,
947 int16_t *chrFilter
, int16_t **chrSrc
, int chrFilterSize
,
948 uint8_t *dest
, uint8_t *uDest
, int dstW
, int chrDstW
, int dstFormat
)
950 yuv2nv12XinC(lumFilter
, lumSrc
, lumFilterSize
,
951 chrFilter
, chrSrc
, chrFilterSize
,
952 dest
, uDest
, dstW
, chrDstW
, dstFormat
);
955 static inline void RENAME(yuv2yuv1
)(SwsContext
*c
, int16_t *lumSrc
, int16_t *chrSrc
,
956 uint8_t *dest
, uint8_t *uDest
, uint8_t *vDest
, long dstW
, long chrDstW
)
960 if(!(c
->flags
& SWS_BITEXACT
)){
961 long p
= uDest
? 3 : 1;
962 uint8_t *src
[3]= {lumSrc
+ dstW
, chrSrc
+ chrDstW
, chrSrc
+ VOFW
+ chrDstW
};
963 uint8_t *dst
[3]= {dest
, uDest
, vDest
};
964 long counter
[3] = {dstW
, chrDstW
, chrDstW
};
966 if (c
->flags
& SWS_ACCURATE_RND
){
969 YSCALEYUV2YV121_ACCURATE
970 :: "r" (src
[p
]), "r" (dst
[p
] + counter
[p
]),
979 :: "r" (src
[p
]), "r" (dst
[p
] + counter
[p
]),
988 for (i
=0; i
<dstW
; i
++)
990 int val
= (lumSrc
[i
]+64)>>7;
1001 for (i
=0; i
<chrDstW
; i
++)
1003 int u
=(chrSrc
[i
]+64)>>7;
1004 int v
=(chrSrc
[i
+ VOFW
]+64)>>7;
1008 else if (u
>255) u
=255;
1010 else if (v
>255) v
=255;
1020 * vertical scale YV12 to RGB
1022 static inline void RENAME(yuv2packedX
)(SwsContext
*c
, int16_t *lumFilter
, int16_t **lumSrc
, int lumFilterSize
,
1023 int16_t *chrFilter
, int16_t **chrSrc
, int chrFilterSize
,
1024 uint8_t *dest
, long dstW
, long dstY
)
1028 if(!(c
->flags
& SWS_BITEXACT
)){
1029 if (c
->flags
& SWS_ACCURATE_RND
){
1030 switch(c
->dstFormat
){
1032 YSCALEYUV2PACKEDX_ACCURATE
1034 "pcmpeqd %%mm7, %%mm7 \n\t"
1035 WRITEBGR32(%4, %5, %%REGa
, %%mm2
, %%mm4
, %%mm5
, %%mm7
, %%mm0
, %%mm1
, %%mm3
, %%mm6
)
1037 YSCALEYUV2PACKEDX_END
1040 YSCALEYUV2PACKEDX_ACCURATE
1042 "pxor %%mm7, %%mm7 \n\t"
1043 "lea (%%"REG_a
", %%"REG_a
", 2), %%"REG_c
"\n\t" //FIXME optimize
1044 "add %4, %%"REG_c
" \n\t"
1045 WRITEBGR24(%%REGc
, %5, %%REGa
)
1048 :: "r" (&c
->redDither
),
1049 "m" (dummy
), "m" (dummy
), "m" (dummy
),
1050 "r" (dest
), "m" (dstW
)
1051 : "%"REG_a
, "%"REG_c
, "%"REG_d
, "%"REG_S
1054 case PIX_FMT_RGB555
:
1055 YSCALEYUV2PACKEDX_ACCURATE
1057 "pxor %%mm7, %%mm7 \n\t"
1058 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1060 "paddusb "BLUE_DITHER
"(%0), %%mm2\n\t"
1061 "paddusb "GREEN_DITHER
"(%0), %%mm4\n\t"
1062 "paddusb "RED_DITHER
"(%0), %%mm5\n\t"
1065 WRITERGB15(%4, %5, %%REGa
)
1066 YSCALEYUV2PACKEDX_END
1068 case PIX_FMT_RGB565
:
1069 YSCALEYUV2PACKEDX_ACCURATE
1071 "pxor %%mm7, %%mm7 \n\t"
1072 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1074 "paddusb "BLUE_DITHER
"(%0), %%mm2\n\t"
1075 "paddusb "GREEN_DITHER
"(%0), %%mm4\n\t"
1076 "paddusb "RED_DITHER
"(%0), %%mm5\n\t"
1079 WRITERGB16(%4, %5, %%REGa
)
1080 YSCALEYUV2PACKEDX_END
1082 case PIX_FMT_YUYV422
:
1083 YSCALEYUV2PACKEDX_ACCURATE
1084 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1086 "psraw $3, %%mm3 \n\t"
1087 "psraw $3, %%mm4 \n\t"
1088 "psraw $3, %%mm1 \n\t"
1089 "psraw $3, %%mm7 \n\t"
1090 WRITEYUY2(%4, %5, %%REGa
)
1091 YSCALEYUV2PACKEDX_END
1095 switch(c
->dstFormat
)
1100 "pcmpeqd %%mm7, %%mm7 \n\t"
1101 WRITEBGR32(%4, %5, %%REGa
, %%mm2
, %%mm4
, %%mm5
, %%mm7
, %%mm0
, %%mm1
, %%mm3
, %%mm6
)
1102 YSCALEYUV2PACKEDX_END
1107 "pxor %%mm7, %%mm7 \n\t"
1108 "lea (%%"REG_a
", %%"REG_a
", 2), %%"REG_c
" \n\t" //FIXME optimize
1109 "add %4, %%"REG_c
" \n\t"
1110 WRITEBGR24(%%REGc
, %5, %%REGa
)
1112 :: "r" (&c
->redDither
),
1113 "m" (dummy
), "m" (dummy
), "m" (dummy
),
1114 "r" (dest
), "m" (dstW
)
1115 : "%"REG_a
, "%"REG_c
, "%"REG_d
, "%"REG_S
1118 case PIX_FMT_RGB555
:
1121 "pxor %%mm7, %%mm7 \n\t"
1122 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1124 "paddusb "BLUE_DITHER
"(%0), %%mm2 \n\t"
1125 "paddusb "GREEN_DITHER
"(%0), %%mm4 \n\t"
1126 "paddusb "RED_DITHER
"(%0), %%mm5 \n\t"
1129 WRITERGB15(%4, %5, %%REGa
)
1130 YSCALEYUV2PACKEDX_END
1132 case PIX_FMT_RGB565
:
1135 "pxor %%mm7, %%mm7 \n\t"
1136 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1138 "paddusb "BLUE_DITHER
"(%0), %%mm2 \n\t"
1139 "paddusb "GREEN_DITHER
"(%0), %%mm4 \n\t"
1140 "paddusb "RED_DITHER
"(%0), %%mm5 \n\t"
1143 WRITERGB16(%4, %5, %%REGa
)
1144 YSCALEYUV2PACKEDX_END
1146 case PIX_FMT_YUYV422
:
1148 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1150 "psraw $3, %%mm3 \n\t"
1151 "psraw $3, %%mm4 \n\t"
1152 "psraw $3, %%mm1 \n\t"
1153 "psraw $3, %%mm7 \n\t"
1154 WRITEYUY2(%4, %5, %%REGa
)
1155 YSCALEYUV2PACKEDX_END
1160 #endif /* HAVE_MMX */
1162 /* The following list of supported dstFormat values should
1163 match what's found in the body of altivec_yuv2packedX() */
1164 if (!(c
->flags
& SWS_BITEXACT
) &&
1165 (c
->dstFormat
==PIX_FMT_ABGR
|| c
->dstFormat
==PIX_FMT_BGRA
||
1166 c
->dstFormat
==PIX_FMT_BGR24
|| c
->dstFormat
==PIX_FMT_RGB24
||
1167 c
->dstFormat
==PIX_FMT_RGBA
|| c
->dstFormat
==PIX_FMT_ARGB
))
1168 altivec_yuv2packedX (c
, lumFilter
, lumSrc
, lumFilterSize
,
1169 chrFilter
, chrSrc
, chrFilterSize
,
1173 yuv2packedXinC(c
, lumFilter
, lumSrc
, lumFilterSize
,
1174 chrFilter
, chrSrc
, chrFilterSize
,
1179 * vertical bilinear scale YV12 to RGB
1181 static inline void RENAME(yuv2packed2
)(SwsContext
*c
, uint16_t *buf0
, uint16_t *buf1
, uint16_t *uvbuf0
, uint16_t *uvbuf1
,
1182 uint8_t *dest
, int dstW
, int yalpha
, int uvalpha
, int y
)
1184 int yalpha1
=4095- yalpha
;
1185 int uvalpha1
=4095-uvalpha
;
1189 if(!(c
->flags
& SWS_BITEXACT
)){
1190 switch(c
->dstFormat
)
1192 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1195 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1196 "mov %4, %%"REG_b
" \n\t"
1197 "push %%"REG_BP
" \n\t"
1198 YSCALEYUV2RGB(%%REGBP
, %5)
1199 "pcmpeqd %%mm7, %%mm7 \n\t"
1200 WRITEBGR32(%%REGb
, 8280(%5), %%REGBP
, %%mm2
, %%mm4
, %%mm5
, %%mm7
, %%mm0
, %%mm1
, %%mm3
, %%mm6
)
1201 "pop %%"REG_BP
" \n\t"
1202 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1204 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1210 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1211 "mov %4, %%"REG_b
" \n\t"
1212 "push %%"REG_BP
" \n\t"
1213 YSCALEYUV2RGB(%%REGBP
, %5)
1214 "pxor %%mm7, %%mm7 \n\t"
1215 WRITEBGR24(%%REGb
, 8280(%5), %%REGBP
)
1216 "pop %%"REG_BP
" \n\t"
1217 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1218 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1222 case PIX_FMT_RGB555
:
1224 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1225 "mov %4, %%"REG_b
" \n\t"
1226 "push %%"REG_BP
" \n\t"
1227 YSCALEYUV2RGB(%%REGBP
, %5)
1228 "pxor %%mm7, %%mm7 \n\t"
1229 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1231 "paddusb "BLUE_DITHER
"(%5), %%mm2 \n\t"
1232 "paddusb "GREEN_DITHER
"(%5), %%mm4 \n\t"
1233 "paddusb "RED_DITHER
"(%5), %%mm5 \n\t"
1236 WRITERGB15(%%REGb
, 8280(%5), %%REGBP
)
1237 "pop %%"REG_BP
" \n\t"
1238 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1240 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1244 case PIX_FMT_RGB565
:
1246 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1247 "mov %4, %%"REG_b
" \n\t"
1248 "push %%"REG_BP
" \n\t"
1249 YSCALEYUV2RGB(%%REGBP
, %5)
1250 "pxor %%mm7, %%mm7 \n\t"
1251 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1253 "paddusb "BLUE_DITHER
"(%5), %%mm2 \n\t"
1254 "paddusb "GREEN_DITHER
"(%5), %%mm4 \n\t"
1255 "paddusb "RED_DITHER
"(%5), %%mm5 \n\t"
1258 WRITERGB16(%%REGb
, 8280(%5), %%REGBP
)
1259 "pop %%"REG_BP
" \n\t"
1260 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1261 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1265 case PIX_FMT_YUYV422
:
1267 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1268 "mov %4, %%"REG_b
" \n\t"
1269 "push %%"REG_BP
" \n\t"
1270 YSCALEYUV2PACKED(%%REGBP
, %5)
1271 WRITEYUY2(%%REGb
, 8280(%5), %%REGBP
)
1272 "pop %%"REG_BP
" \n\t"
1273 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1274 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1282 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C
, YSCALE_YUV_2_PACKED2_C
, YSCALE_YUV_2_GRAY16_2_C
, YSCALE_YUV_2_MONO2_C
)
1286 * YV12 to RGB without scaling or interpolating
1288 static inline void RENAME(yuv2packed1
)(SwsContext
*c
, uint16_t *buf0
, uint16_t *uvbuf0
, uint16_t *uvbuf1
,
1289 uint8_t *dest
, int dstW
, int uvalpha
, int dstFormat
, int flags
, int y
)
1291 const int yalpha1
=0;
1294 uint16_t *buf1
= buf0
; //FIXME needed for RGB1/BGR1
1295 const int yalpha
= 4096; //FIXME ...
1297 if (flags
&SWS_FULL_CHR_H_INT
)
1299 RENAME(yuv2packed2
)(c
, buf0
, buf0
, uvbuf0
, uvbuf1
, dest
, dstW
, 0, uvalpha
, y
);
1304 if(!(flags
& SWS_BITEXACT
)){
1305 if (uvalpha
< 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1311 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1312 "mov %4, %%"REG_b
" \n\t"
1313 "push %%"REG_BP
" \n\t"
1314 YSCALEYUV2RGB1(%%REGBP
, %5)
1315 "pcmpeqd %%mm7, %%mm7 \n\t"
1316 WRITEBGR32(%%REGb
, 8280(%5), %%REGBP
, %%mm2
, %%mm4
, %%mm5
, %%mm7
, %%mm0
, %%mm1
, %%mm3
, %%mm6
)
1317 "pop %%"REG_BP
" \n\t"
1318 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1320 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1326 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1327 "mov %4, %%"REG_b
" \n\t"
1328 "push %%"REG_BP
" \n\t"
1329 YSCALEYUV2RGB1(%%REGBP
, %5)
1330 "pxor %%mm7, %%mm7 \n\t"
1331 WRITEBGR24(%%REGb
, 8280(%5), %%REGBP
)
1332 "pop %%"REG_BP
" \n\t"
1333 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1335 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1339 case PIX_FMT_RGB555
:
1341 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1342 "mov %4, %%"REG_b
" \n\t"
1343 "push %%"REG_BP
" \n\t"
1344 YSCALEYUV2RGB1(%%REGBP
, %5)
1345 "pxor %%mm7, %%mm7 \n\t"
1346 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1348 "paddusb "BLUE_DITHER
"(%5), %%mm2 \n\t"
1349 "paddusb "GREEN_DITHER
"(%5), %%mm4 \n\t"
1350 "paddusb "RED_DITHER
"(%5), %%mm5 \n\t"
1352 WRITERGB15(%%REGb
, 8280(%5), %%REGBP
)
1353 "pop %%"REG_BP
" \n\t"
1354 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1356 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1360 case PIX_FMT_RGB565
:
1362 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1363 "mov %4, %%"REG_b
" \n\t"
1364 "push %%"REG_BP
" \n\t"
1365 YSCALEYUV2RGB1(%%REGBP
, %5)
1366 "pxor %%mm7, %%mm7 \n\t"
1367 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1369 "paddusb "BLUE_DITHER
"(%5), %%mm2 \n\t"
1370 "paddusb "GREEN_DITHER
"(%5), %%mm4 \n\t"
1371 "paddusb "RED_DITHER
"(%5), %%mm5 \n\t"
1374 WRITERGB16(%%REGb
, 8280(%5), %%REGBP
)
1375 "pop %%"REG_BP
" \n\t"
1376 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1378 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1382 case PIX_FMT_YUYV422
:
1384 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1385 "mov %4, %%"REG_b
" \n\t"
1386 "push %%"REG_BP
" \n\t"
1387 YSCALEYUV2PACKED1(%%REGBP
, %5)
1388 WRITEYUY2(%%REGb
, 8280(%5), %%REGBP
)
1389 "pop %%"REG_BP
" \n\t"
1390 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1392 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1404 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1405 "mov %4, %%"REG_b
" \n\t"
1406 "push %%"REG_BP
" \n\t"
1407 YSCALEYUV2RGB1b(%%REGBP
, %5)
1408 "pcmpeqd %%mm7, %%mm7 \n\t"
1409 WRITEBGR32(%%REGb
, 8280(%5), %%REGBP
, %%mm2
, %%mm4
, %%mm5
, %%mm7
, %%mm0
, %%mm1
, %%mm3
, %%mm6
)
1410 "pop %%"REG_BP
" \n\t"
1411 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1413 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1419 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1420 "mov %4, %%"REG_b
" \n\t"
1421 "push %%"REG_BP
" \n\t"
1422 YSCALEYUV2RGB1b(%%REGBP
, %5)
1423 "pxor %%mm7, %%mm7 \n\t"
1424 WRITEBGR24(%%REGb
, 8280(%5), %%REGBP
)
1425 "pop %%"REG_BP
" \n\t"
1426 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1428 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1432 case PIX_FMT_RGB555
:
1434 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1435 "mov %4, %%"REG_b
" \n\t"
1436 "push %%"REG_BP
" \n\t"
1437 YSCALEYUV2RGB1b(%%REGBP
, %5)
1438 "pxor %%mm7, %%mm7 \n\t"
1439 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1441 "paddusb "BLUE_DITHER
"(%5), %%mm2 \n\t"
1442 "paddusb "GREEN_DITHER
"(%5), %%mm4 \n\t"
1443 "paddusb "RED_DITHER
"(%5), %%mm5 \n\t"
1445 WRITERGB15(%%REGb
, 8280(%5), %%REGBP
)
1446 "pop %%"REG_BP
" \n\t"
1447 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1449 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1453 case PIX_FMT_RGB565
:
1455 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1456 "mov %4, %%"REG_b
" \n\t"
1457 "push %%"REG_BP
" \n\t"
1458 YSCALEYUV2RGB1b(%%REGBP
, %5)
1459 "pxor %%mm7, %%mm7 \n\t"
1460 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1462 "paddusb "BLUE_DITHER
"(%5), %%mm2 \n\t"
1463 "paddusb "GREEN_DITHER
"(%5), %%mm4 \n\t"
1464 "paddusb "RED_DITHER
"(%5), %%mm5 \n\t"
1467 WRITERGB16(%%REGb
, 8280(%5), %%REGBP
)
1468 "pop %%"REG_BP
" \n\t"
1469 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1471 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1475 case PIX_FMT_YUYV422
:
1477 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1478 "mov %4, %%"REG_b
" \n\t"
1479 "push %%"REG_BP
" \n\t"
1480 YSCALEYUV2PACKED1b(%%REGBP
, %5)
1481 WRITEYUY2(%%REGb
, 8280(%5), %%REGBP
)
1482 "pop %%"REG_BP
" \n\t"
1483 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1485 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1492 #endif /* HAVE_MMX */
1495 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C
, YSCALE_YUV_2_PACKED1_C
, YSCALE_YUV_2_GRAY16_1_C
, YSCALE_YUV_2_MONO2_C
)
1497 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C
, YSCALE_YUV_2_PACKED1B_C
, YSCALE_YUV_2_GRAY16_1_C
, YSCALE_YUV_2_MONO2_C
)
1501 //FIXME yuy2* can read up to 7 samples too much
1503 static inline void RENAME(yuy2ToY
)(uint8_t *dst
, uint8_t *src
, long width
, uint32_t *unused
)
1507 "movq "MANGLE(bm01010101
)", %%mm2 \n\t"
1508 "mov %0, %%"REG_a
" \n\t"
1510 "movq (%1, %%"REG_a
",2), %%mm0 \n\t"
1511 "movq 8(%1, %%"REG_a
",2), %%mm1 \n\t"
1512 "pand %%mm2, %%mm0 \n\t"
1513 "pand %%mm2, %%mm1 \n\t"
1514 "packuswb %%mm1, %%mm0 \n\t"
1515 "movq %%mm0, (%2, %%"REG_a
") \n\t"
1516 "add $8, %%"REG_a
" \n\t"
1518 : : "g" (-width
), "r" (src
+width
*2), "r" (dst
+width
)
1523 for (i
=0; i
<width
; i
++)
1528 static inline void RENAME(yuy2ToUV
)(uint8_t *dstU
, uint8_t *dstV
, uint8_t *src1
, uint8_t *src2
, long width
, uint32_t *unused
)
1532 "movq "MANGLE(bm01010101
)", %%mm4 \n\t"
1533 "mov %0, %%"REG_a
" \n\t"
1535 "movq (%1, %%"REG_a
",4), %%mm0 \n\t"
1536 "movq 8(%1, %%"REG_a
",4), %%mm1 \n\t"
1537 "psrlw $8, %%mm0 \n\t"
1538 "psrlw $8, %%mm1 \n\t"
1539 "packuswb %%mm1, %%mm0 \n\t"
1540 "movq %%mm0, %%mm1 \n\t"
1541 "psrlw $8, %%mm0 \n\t"
1542 "pand %%mm4, %%mm1 \n\t"
1543 "packuswb %%mm0, %%mm0 \n\t"
1544 "packuswb %%mm1, %%mm1 \n\t"
1545 "movd %%mm0, (%3, %%"REG_a
") \n\t"
1546 "movd %%mm1, (%2, %%"REG_a
") \n\t"
1547 "add $4, %%"REG_a
" \n\t"
1549 : : "g" (-width
), "r" (src1
+width
*4), "r" (dstU
+width
), "r" (dstV
+width
)
1554 for (i
=0; i
<width
; i
++)
1556 dstU
[i
]= src1
[4*i
+ 1];
1557 dstV
[i
]= src1
[4*i
+ 3];
1560 assert(src1
== src2
);
1563 /* This is almost identical to the previous, and exists only because
1564  * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
1565 static inline void RENAME(uyvyToY
)(uint8_t *dst
, uint8_t *src
, long width
, uint32_t *unused
)
1569 "mov %0, %%"REG_a
" \n\t"
1571 "movq (%1, %%"REG_a
",2), %%mm0 \n\t"
1572 "movq 8(%1, %%"REG_a
",2), %%mm1 \n\t"
1573 "psrlw $8, %%mm0 \n\t"
1574 "psrlw $8, %%mm1 \n\t"
1575 "packuswb %%mm1, %%mm0 \n\t"
1576 "movq %%mm0, (%2, %%"REG_a
") \n\t"
1577 "add $8, %%"REG_a
" \n\t"
1579 : : "g" (-width
), "r" (src
+width
*2), "r" (dst
+width
)
1584 for (i
=0; i
<width
; i
++)
1589 static inline void RENAME(uyvyToUV
)(uint8_t *dstU
, uint8_t *dstV
, uint8_t *src1
, uint8_t *src2
, long width
, uint32_t *unused
)
1593 "movq "MANGLE(bm01010101
)", %%mm4 \n\t"
1594 "mov %0, %%"REG_a
" \n\t"
1596 "movq (%1, %%"REG_a
",4), %%mm0 \n\t"
1597 "movq 8(%1, %%"REG_a
",4), %%mm1 \n\t"
1598 "pand %%mm4, %%mm0 \n\t"
1599 "pand %%mm4, %%mm1 \n\t"
1600 "packuswb %%mm1, %%mm0 \n\t"
1601 "movq %%mm0, %%mm1 \n\t"
1602 "psrlw $8, %%mm0 \n\t"
1603 "pand %%mm4, %%mm1 \n\t"
1604 "packuswb %%mm0, %%mm0 \n\t"
1605 "packuswb %%mm1, %%mm1 \n\t"
1606 "movd %%mm0, (%3, %%"REG_a
") \n\t"
1607 "movd %%mm1, (%2, %%"REG_a
") \n\t"
1608 "add $4, %%"REG_a
" \n\t"
1610 : : "g" (-width
), "r" (src1
+width
*4), "r" (dstU
+width
), "r" (dstV
+width
)
1615 for (i
=0; i
<width
; i
++)
1617 dstU
[i
]= src1
[4*i
+ 0];
1618 dstV
[i
]= src1
[4*i
+ 2];
1621 assert(src1
== src2
);
1624 #define BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S)\
1625 static inline void RENAME(name)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)\
1628 for (i=0; i<width; i++)\
1630 int b= (((type*)src)[i]>>shb)&maskb;\
1631 int g= (((type*)src)[i]>>shg)&maskg;\
1632 int r= (((type*)src)[i]>>shr)&maskr;\
1634 dst[i]= (((RY)*r + (GY)*g + (BY)*b + (33<<((S)-1)))>>(S));\
1638 BGR2Y(uint32_t, bgr32ToY
,16, 0, 0, 0x00FF, 0xFF00, 0x00FF, RY
<< 8, GY
, BY
<< 8, RGB2YUV_SHIFT
+8)
1639 BGR2Y(uint32_t, rgb32ToY
, 0, 0,16, 0x00FF, 0xFF00, 0x00FF, RY
<< 8, GY
, BY
<< 8, RGB2YUV_SHIFT
+8)
1640 BGR2Y(uint16_t, bgr16ToY
, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RY
<<11, GY
<<5, BY
, RGB2YUV_SHIFT
+8)
1641 BGR2Y(uint16_t, bgr15ToY
, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RY
<<10, GY
<<5, BY
, RGB2YUV_SHIFT
+7)
1642 BGR2Y(uint16_t, rgb16ToY
, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RY
, GY
<<5, BY
<<11, RGB2YUV_SHIFT
+8)
1643 BGR2Y(uint16_t, rgb15ToY
, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RY
, GY
<<5, BY
<<10, RGB2YUV_SHIFT
+7)
1645 #define BGR2UV(type, name, shr, shg, shb, maska, maskr, maskg, maskb, RU, GU, BU, RV, GV, BV, S)\
1646 static inline void RENAME(name)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\
1649 for (i=0; i<width; i++)\
1651 int b= (((type*)src)[i]&maskb)>>shb;\
1652 int g= (((type*)src)[i]&maskg)>>shg;\
1653 int r= (((type*)src)[i]&maskr)>>shr;\
1655 dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<((S)-1)))>>(S);\
1656 dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<((S)-1)))>>(S);\
1659 static inline void RENAME(name ## _half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\
1662 for (i=0; i<width; i++)\
1664 int pix0= ((type*)src)[2*i+0];\
1665 int pix1= ((type*)src)[2*i+1];\
1666 int g= (pix0&(maskg|maska))+(pix1&(maskg|maska));\
1667 int b= ((pix0+pix1-g)&(maskb|(2*maskb)))>>shb;\
1668 int r= ((pix0+pix1-g)&(maskr|(2*maskr)))>>shr;\
1669 g&= maskg|(2*maskg);\
1673 dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<(S)))>>((S)+1);\
1674 dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<(S)))>>((S)+1);\
1678 BGR2UV(uint32_t, bgr32ToUV
,16, 0, 0, 0xFF000000, 0xFF0000, 0xFF00, 0x00FF, RU
<< 8, GU
, BU
<< 8, RV
<< 8, GV
, BV
<< 8, RGB2YUV_SHIFT
+8)
1679 BGR2UV(uint32_t, rgb32ToUV
, 0, 0,16, 0xFF000000, 0x00FF, 0xFF00, 0xFF0000, RU
<< 8, GU
, BU
<< 8, RV
<< 8, GV
, BV
<< 8, RGB2YUV_SHIFT
+8)
1680 BGR2UV(uint16_t, bgr16ToUV
, 0, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RU
<<11, GU
<<5, BU
, RV
<<11, GV
<<5, BV
, RGB2YUV_SHIFT
+8)
1681 BGR2UV(uint16_t, bgr15ToUV
, 0, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RU
<<10, GU
<<5, BU
, RV
<<10, GV
<<5, BV
, RGB2YUV_SHIFT
+7)
1682 BGR2UV(uint16_t, rgb16ToUV
, 0, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RU
, GU
<<5, BU
<<11, RV
, GV
<<5, BV
<<11, RGB2YUV_SHIFT
+8)
1683 BGR2UV(uint16_t, rgb15ToUV
, 0, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RU
, GU
<<5, BU
<<10, RV
, GV
<<5, BV
<<10, RGB2YUV_SHIFT
+7)
1686 static inline void RENAME(bgr24ToY_mmx
)(uint8_t *dst
, uint8_t *src
, long width
, int srcFormat
)
1689 if(srcFormat
== PIX_FMT_BGR24
){
1691 "movq "MANGLE(ff_bgr24toY1Coeff
)", %%mm5 \n\t"
1692 "movq "MANGLE(ff_bgr24toY2Coeff
)", %%mm6 \n\t"
1697 "movq "MANGLE(ff_rgb24toY1Coeff
)", %%mm5 \n\t"
1698 "movq "MANGLE(ff_rgb24toY2Coeff
)", %%mm6 \n\t"
1704 "movq "MANGLE(ff_bgr24toYOffset
)", %%mm4 \n\t"
1705 "mov %2, %%"REG_a
" \n\t"
1706 "pxor %%mm7, %%mm7 \n\t"
1708 PREFETCH
" 64(%0) \n\t"
1709 "movd (%0), %%mm0 \n\t"
1710 "movd 2(%0), %%mm1 \n\t"
1711 "movd 6(%0), %%mm2 \n\t"
1712 "movd 8(%0), %%mm3 \n\t"
1714 "punpcklbw %%mm7, %%mm0 \n\t"
1715 "punpcklbw %%mm7, %%mm1 \n\t"
1716 "punpcklbw %%mm7, %%mm2 \n\t"
1717 "punpcklbw %%mm7, %%mm3 \n\t"
1718 "pmaddwd %%mm5, %%mm0 \n\t"
1719 "pmaddwd %%mm6, %%mm1 \n\t"
1720 "pmaddwd %%mm5, %%mm2 \n\t"
1721 "pmaddwd %%mm6, %%mm3 \n\t"
1722 "paddd %%mm1, %%mm0 \n\t"
1723 "paddd %%mm3, %%mm2 \n\t"
1724 "paddd %%mm4, %%mm0 \n\t"
1725 "paddd %%mm4, %%mm2 \n\t"
1726 "psrad $15, %%mm0 \n\t"
1727 "psrad $15, %%mm2 \n\t"
1728 "packssdw %%mm2, %%mm0 \n\t"
1729 "packuswb %%mm0, %%mm0 \n\t"
1730 "movd %%mm0, (%1, %%"REG_a
") \n\t"
1731 "add $4, %%"REG_a
" \n\t"
1734 : "r" (dst
+width
), "g" (-width
)
1739 static inline void RENAME(bgr24ToUV_mmx
)(uint8_t *dstU
, uint8_t *dstV
, uint8_t *src
, long width
, int srcFormat
)
1742 "movq 24+%4, %%mm6 \n\t"
1743 "mov %3, %%"REG_a
" \n\t"
1744 "pxor %%mm7, %%mm7 \n\t"
1746 PREFETCH
" 64(%0) \n\t"
1747 "movd (%0), %%mm0 \n\t"
1748 "movd 2(%0), %%mm1 \n\t"
1749 "punpcklbw %%mm7, %%mm0 \n\t"
1750 "punpcklbw %%mm7, %%mm1 \n\t"
1751 "movq %%mm0, %%mm2 \n\t"
1752 "movq %%mm1, %%mm3 \n\t"
1753 "pmaddwd %4, %%mm0 \n\t"
1754 "pmaddwd 8+%4, %%mm1 \n\t"
1755 "pmaddwd 16+%4, %%mm2 \n\t"
1756 "pmaddwd %%mm6, %%mm3 \n\t"
1757 "paddd %%mm1, %%mm0 \n\t"
1758 "paddd %%mm3, %%mm2 \n\t"
1760 "movd 6(%0), %%mm1 \n\t"
1761 "movd 8(%0), %%mm3 \n\t"
1763 "punpcklbw %%mm7, %%mm1 \n\t"
1764 "punpcklbw %%mm7, %%mm3 \n\t"
1765 "movq %%mm1, %%mm4 \n\t"
1766 "movq %%mm3, %%mm5 \n\t"
1767 "pmaddwd %4, %%mm1 \n\t"
1768 "pmaddwd 8+%4, %%mm3 \n\t"
1769 "pmaddwd 16+%4, %%mm4 \n\t"
1770 "pmaddwd %%mm6, %%mm5 \n\t"
1771 "paddd %%mm3, %%mm1 \n\t"
1772 "paddd %%mm5, %%mm4 \n\t"
1774 "movq "MANGLE(ff_bgr24toUVOffset
)", %%mm3 \n\t"
1775 "paddd %%mm3, %%mm0 \n\t"
1776 "paddd %%mm3, %%mm2 \n\t"
1777 "paddd %%mm3, %%mm1 \n\t"
1778 "paddd %%mm3, %%mm4 \n\t"
1779 "psrad $15, %%mm0 \n\t"
1780 "psrad $15, %%mm2 \n\t"
1781 "psrad $15, %%mm1 \n\t"
1782 "psrad $15, %%mm4 \n\t"
1783 "packssdw %%mm1, %%mm0 \n\t"
1784 "packssdw %%mm4, %%mm2 \n\t"
1785 "packuswb %%mm0, %%mm0 \n\t"
1786 "packuswb %%mm2, %%mm2 \n\t"
1787 "movd %%mm0, (%1, %%"REG_a
") \n\t"
1788 "movd %%mm2, (%2, %%"REG_a
") \n\t"
1789 "add $4, %%"REG_a
" \n\t"
1792 : "r" (dstU
+width
), "r" (dstV
+width
), "g" (-width
), "m"(ff_bgr24toUV
[srcFormat
== PIX_FMT_RGB24
][0])
1798 static inline void RENAME(bgr24ToY
)(uint8_t *dst
, uint8_t *src
, long width
, uint32_t *unused
)
1801 RENAME(bgr24ToY_mmx
)(dst
, src
, width
, PIX_FMT_BGR24
);
1804 for (i
=0; i
<width
; i
++)
1810 dst
[i
]= ((RY
*r
+ GY
*g
+ BY
*b
+ (33<<(RGB2YUV_SHIFT
-1)))>>RGB2YUV_SHIFT
);
1812 #endif /* HAVE_MMX */
1815 static inline void RENAME(bgr24ToUV
)(uint8_t *dstU
, uint8_t *dstV
, uint8_t *src1
, uint8_t *src2
, long width
, uint32_t *unused
)
1818 RENAME(bgr24ToUV_mmx
)(dstU
, dstV
, src1
, width
, PIX_FMT_BGR24
);
1821 for (i
=0; i
<width
; i
++)
1823 int b
= src1
[3*i
+ 0];
1824 int g
= src1
[3*i
+ 1];
1825 int r
= src1
[3*i
+ 2];
1827 dstU
[i
]= (RU
*r
+ GU
*g
+ BU
*b
+ (257<<(RGB2YUV_SHIFT
-1)))>>RGB2YUV_SHIFT
;
1828 dstV
[i
]= (RV
*r
+ GV
*g
+ BV
*b
+ (257<<(RGB2YUV_SHIFT
-1)))>>RGB2YUV_SHIFT
;
1830 #endif /* HAVE_MMX */
1831 assert(src1
== src2
);
1834 static inline void RENAME(bgr24ToUV_half
)(uint8_t *dstU
, uint8_t *dstV
, uint8_t *src1
, uint8_t *src2
, long width
, uint32_t *unused
)
1837 for (i
=0; i
<width
; i
++)
1839 int b
= src1
[6*i
+ 0] + src1
[6*i
+ 3];
1840 int g
= src1
[6*i
+ 1] + src1
[6*i
+ 4];
1841 int r
= src1
[6*i
+ 2] + src1
[6*i
+ 5];
1843 dstU
[i
]= (RU
*r
+ GU
*g
+ BU
*b
+ (257<<RGB2YUV_SHIFT
))>>(RGB2YUV_SHIFT
+1);
1844 dstV
[i
]= (RV
*r
+ GV
*g
+ BV
*b
+ (257<<RGB2YUV_SHIFT
))>>(RGB2YUV_SHIFT
+1);
1846 assert(src1
== src2
);
1849 static inline void RENAME(rgb24ToY
)(uint8_t *dst
, uint8_t *src
, long width
, uint32_t *unused
)
1852 RENAME(bgr24ToY_mmx
)(dst
, src
, width
, PIX_FMT_RGB24
);
1855 for (i
=0; i
<width
; i
++)
1861 dst
[i
]= ((RY
*r
+ GY
*g
+ BY
*b
+ (33<<(RGB2YUV_SHIFT
-1)))>>RGB2YUV_SHIFT
);
1866 static inline void RENAME(rgb24ToUV
)(uint8_t *dstU
, uint8_t *dstV
, uint8_t *src1
, uint8_t *src2
, long width
, uint32_t *unused
)
1870 RENAME(bgr24ToUV_mmx
)(dstU
, dstV
, src1
, width
, PIX_FMT_RGB24
);
1874 for (i
=0; i
<width
; i
++)
1876 int r
= src1
[3*i
+ 0];
1877 int g
= src1
[3*i
+ 1];
1878 int b
= src1
[3*i
+ 2];
1880 dstU
[i
]= (RU
*r
+ GU
*g
+ BU
*b
+ (257<<(RGB2YUV_SHIFT
-1)))>>RGB2YUV_SHIFT
;
1881 dstV
[i
]= (RV
*r
+ GV
*g
+ BV
*b
+ (257<<(RGB2YUV_SHIFT
-1)))>>RGB2YUV_SHIFT
;
1886 static inline void RENAME(rgb24ToUV_half
)(uint8_t *dstU
, uint8_t *dstV
, uint8_t *src1
, uint8_t *src2
, long width
, uint32_t *unused
)
1890 for (i
=0; i
<width
; i
++)
1892 int r
= src1
[6*i
+ 0] + src1
[6*i
+ 3];
1893 int g
= src1
[6*i
+ 1] + src1
[6*i
+ 4];
1894 int b
= src1
[6*i
+ 2] + src1
[6*i
+ 5];
1896 dstU
[i
]= (RU
*r
+ GU
*g
+ BU
*b
+ (257<<RGB2YUV_SHIFT
))>>(RGB2YUV_SHIFT
+1);
1897 dstV
[i
]= (RV
*r
+ GV
*g
+ BV
*b
+ (257<<RGB2YUV_SHIFT
))>>(RGB2YUV_SHIFT
+1);
1902 static inline void RENAME(palToY
)(uint8_t *dst
, uint8_t *src
, long width
, uint32_t *pal
)
1905 for (i
=0; i
<width
; i
++)
1909 dst
[i
]= pal
[d
] & 0xFF;
1913 static inline void RENAME(palToUV
)(uint8_t *dstU
, uint8_t *dstV
, uint8_t *src1
, uint8_t *src2
, long width
, uint32_t *pal
)
1916 assert(src1
== src2
);
1917 for (i
=0; i
<width
; i
++)
1919 int p
= pal
[src1
[i
]];
1926 static inline void RENAME(monowhite2Y
)(uint8_t *dst
, uint8_t *src
, long width
, uint32_t *unused
)
1929 for (i
=0; i
<width
/8; i
++){
1932 dst
[8*i
+j
]= ((d
>>(7-j
))&1)*255;
1936 static inline void RENAME(monoblack2Y
)(uint8_t *dst
, uint8_t *src
, long width
, uint32_t *unused
)
1939 for (i
=0; i
<width
/8; i
++){
1942 dst
[8*i
+j
]= ((d
>>(7-j
))&1)*255;
1946 // bilinear / bicubic scaling
1947 static inline void RENAME(hScale
)(int16_t *dst
, int dstW
, uint8_t *src
, int srcW
, int xInc
,
1948 int16_t *filter
, int16_t *filterPos
, long filterSize
)
1951 assert(filterSize
% 4 == 0 && filterSize
>0);
1952 if (filterSize
==4) // Always true for upscaling, sometimes for down, too.
1954 long counter
= -2*dstW
;
1956 filterPos
-= counter
/2;
1960 "push %%"REG_b
" \n\t"
1962 "pxor %%mm7, %%mm7 \n\t"
1963 "push %%"REG_BP
" \n\t" // we use 7 regs here ...
1964 "mov %%"REG_a
", %%"REG_BP
" \n\t"
1967 "movzwl (%2, %%"REG_BP
"), %%eax \n\t"
1968 "movzwl 2(%2, %%"REG_BP
"), %%ebx \n\t"
1969 "movq (%1, %%"REG_BP
", 4), %%mm1 \n\t"
1970 "movq 8(%1, %%"REG_BP
", 4), %%mm3 \n\t"
1971 "movd (%3, %%"REG_a
"), %%mm0 \n\t"
1972 "movd (%3, %%"REG_b
"), %%mm2 \n\t"
1973 "punpcklbw %%mm7, %%mm0 \n\t"
1974 "punpcklbw %%mm7, %%mm2 \n\t"
1975 "pmaddwd %%mm1, %%mm0 \n\t"
1976 "pmaddwd %%mm2, %%mm3 \n\t"
1977 "movq %%mm0, %%mm4 \n\t"
1978 "punpckldq %%mm3, %%mm0 \n\t"
1979 "punpckhdq %%mm3, %%mm4 \n\t"
1980 "paddd %%mm4, %%mm0 \n\t"
1981 "psrad $7, %%mm0 \n\t"
1982 "packssdw %%mm0, %%mm0 \n\t"
1983 "movd %%mm0, (%4, %%"REG_BP
") \n\t"
1984 "add $4, %%"REG_BP
" \n\t"
1987 "pop %%"REG_BP
" \n\t"
1989 "pop %%"REG_b
" \n\t"
1992 : "c" (filter
), "d" (filterPos
), "S" (src
), "D" (dst
)
1998 else if (filterSize
==8)
2000 long counter
= -2*dstW
;
2002 filterPos
-= counter
/2;
2006 "push %%"REG_b
" \n\t"
2008 "pxor %%mm7, %%mm7 \n\t"
2009 "push %%"REG_BP
" \n\t" // we use 7 regs here ...
2010 "mov %%"REG_a
", %%"REG_BP
" \n\t"
2013 "movzwl (%2, %%"REG_BP
"), %%eax \n\t"
2014 "movzwl 2(%2, %%"REG_BP
"), %%ebx \n\t"
2015 "movq (%1, %%"REG_BP
", 8), %%mm1 \n\t"
2016 "movq 16(%1, %%"REG_BP
", 8), %%mm3 \n\t"
2017 "movd (%3, %%"REG_a
"), %%mm0 \n\t"
2018 "movd (%3, %%"REG_b
"), %%mm2 \n\t"
2019 "punpcklbw %%mm7, %%mm0 \n\t"
2020 "punpcklbw %%mm7, %%mm2 \n\t"
2021 "pmaddwd %%mm1, %%mm0 \n\t"
2022 "pmaddwd %%mm2, %%mm3 \n\t"
2024 "movq 8(%1, %%"REG_BP
", 8), %%mm1 \n\t"
2025 "movq 24(%1, %%"REG_BP
", 8), %%mm5 \n\t"
2026 "movd 4(%3, %%"REG_a
"), %%mm4 \n\t"
2027 "movd 4(%3, %%"REG_b
"), %%mm2 \n\t"
2028 "punpcklbw %%mm7, %%mm4 \n\t"
2029 "punpcklbw %%mm7, %%mm2 \n\t"
2030 "pmaddwd %%mm1, %%mm4 \n\t"
2031 "pmaddwd %%mm2, %%mm5 \n\t"
2032 "paddd %%mm4, %%mm0 \n\t"
2033 "paddd %%mm5, %%mm3 \n\t"
2034 "movq %%mm0, %%mm4 \n\t"
2035 "punpckldq %%mm3, %%mm0 \n\t"
2036 "punpckhdq %%mm3, %%mm4 \n\t"
2037 "paddd %%mm4, %%mm0 \n\t"
2038 "psrad $7, %%mm0 \n\t"
2039 "packssdw %%mm0, %%mm0 \n\t"
2040 "movd %%mm0, (%4, %%"REG_BP
") \n\t"
2041 "add $4, %%"REG_BP
" \n\t"
2044 "pop %%"REG_BP
" \n\t"
2046 "pop %%"REG_b
" \n\t"
2049 : "c" (filter
), "d" (filterPos
), "S" (src
), "D" (dst
)
2057 uint8_t *offset
= src
+filterSize
;
2058 long counter
= -2*dstW
;
2059 //filter-= counter*filterSize/2;
2060 filterPos
-= counter
/2;
2063 "pxor %%mm7, %%mm7 \n\t"
2066 "mov %2, %%"REG_c
" \n\t"
2067 "movzwl (%%"REG_c
", %0), %%eax \n\t"
2068 "movzwl 2(%%"REG_c
", %0), %%edx \n\t"
2069 "mov %5, %%"REG_c
" \n\t"
2070 "pxor %%mm4, %%mm4 \n\t"
2071 "pxor %%mm5, %%mm5 \n\t"
2073 "movq (%1), %%mm1 \n\t"
2074 "movq (%1, %6), %%mm3 \n\t"
2075 "movd (%%"REG_c
", %%"REG_a
"), %%mm0 \n\t"
2076 "movd (%%"REG_c
", %%"REG_d
"), %%mm2 \n\t"
2077 "punpcklbw %%mm7, %%mm0 \n\t"
2078 "punpcklbw %%mm7, %%mm2 \n\t"
2079 "pmaddwd %%mm1, %%mm0 \n\t"
2080 "pmaddwd %%mm2, %%mm3 \n\t"
2081 "paddd %%mm3, %%mm5 \n\t"
2082 "paddd %%mm0, %%mm4 \n\t"
2084 "add $4, %%"REG_c
" \n\t"
2085 "cmp %4, %%"REG_c
" \n\t"
2088 "movq %%mm4, %%mm0 \n\t"
2089 "punpckldq %%mm5, %%mm4 \n\t"
2090 "punpckhdq %%mm5, %%mm0 \n\t"
2091 "paddd %%mm0, %%mm4 \n\t"
2092 "psrad $7, %%mm4 \n\t"
2093 "packssdw %%mm4, %%mm4 \n\t"
2094 "mov %3, %%"REG_a
" \n\t"
2095 "movd %%mm4, (%%"REG_a
", %0) \n\t"
2099 : "+r" (counter
), "+r" (filter
)
2100 : "m" (filterPos
), "m" (dst
), "m"(offset
),
2101 "m" (src
), "r" (filterSize
*2)
2102 : "%"REG_a
, "%"REG_c
, "%"REG_d
2107 hScale_altivec_real(dst
, dstW
, src
, srcW
, xInc
, filter
, filterPos
, filterSize
);
2110 for (i
=0; i
<dstW
; i
++)
2113 int srcPos
= filterPos
[i
];
2115 //printf("filterPos: %d\n", filterPos[i]);
2116 for (j
=0; j
<filterSize
; j
++)
2118 //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2119 val
+= ((int)src
[srcPos
+ j
])*filter
[filterSize
*i
+ j
];
2121 //filter += hFilterSize;
2122 dst
[i
] = FFMIN(val
>>7, (1<<15)-1); // the cubic equation does overflow ...
2125 #endif /* HAVE_ALTIVEC */
2126 #endif /* HAVE_MMX */
2128 // *** horizontal scale Y line to temp buffer
/* hyscale: horizontally scale one line of luma from src (srcW samples, 8-bit)
 * into dst (dstWidth samples, 15-bit fixed point), step xInc in 16.16 format.
 * Non-planar-YUV inputs are first converted to 8-bit luma in formatConvBuffer.
 * NOTE(review): this extraction is missing brace/label/#if lines; comments
 * below describe only the logic visible here. */
2129 static inline void RENAME(hyscale
)(SwsContext
*c
, uint16_t *dst
, long dstWidth
, uint8_t *src
, int srcW
, int xInc
,
2130 int flags
, int canMMX2BeUsed
, int16_t *hLumFilter
,
2131 int16_t *hLumFilterPos
, int hLumFilterSize
, void *funnyYCode
,
2132 int srcFormat
, uint8_t *formatConvBuffer
, int16_t *mmx2Filter
,
2133 int32_t *mmx2FilterPos
, uint32_t *pal
)
/* Step 1: input format conversion. Each branch converts one packed/RGB/
 * palette format to plain 8-bit luma in formatConvBuffer and redirects
 * src there, so the scalers below only ever see 8-bit luma. */
2135 if (srcFormat
==PIX_FMT_YUYV422
 || srcFormat
==PIX_FMT_GRAY16BE
)
2137 RENAME(yuy2ToY
)(formatConvBuffer
, src
, srcW
, pal
);
2138 src
= formatConvBuffer
;
2140 else if (srcFormat
==PIX_FMT_UYVY422
 || srcFormat
==PIX_FMT_GRAY16LE
)
2142 RENAME(uyvyToY
)(formatConvBuffer
, src
, srcW
, pal
);
2143 src
= formatConvBuffer
;
2145 else if (srcFormat
==PIX_FMT_RGB32
)
2147 RENAME(bgr32ToY
)(formatConvBuffer
, src
, srcW
, pal
);
2148 src
= formatConvBuffer
;
2150 else if (srcFormat
==PIX_FMT_RGB32_1
)
/* ALT32_CORR skips the alpha byte for the "_1" 32-bit layouts */
2152 RENAME(bgr32ToY
)(formatConvBuffer
, src
+ALT32_CORR
, srcW
, pal
);
2153 src
= formatConvBuffer
;
2155 else if (srcFormat
==PIX_FMT_BGR24
)
2157 RENAME(bgr24ToY
)(formatConvBuffer
, src
, srcW
, pal
);
2158 src
= formatConvBuffer
;
2160 else if (srcFormat
==PIX_FMT_BGR565
)
2162 RENAME(bgr16ToY
)(formatConvBuffer
, src
, srcW
, pal
);
2163 src
= formatConvBuffer
;
2165 else if (srcFormat
==PIX_FMT_BGR555
)
2167 RENAME(bgr15ToY
)(formatConvBuffer
, src
, srcW
, pal
);
2168 src
= formatConvBuffer
;
2170 else if (srcFormat
==PIX_FMT_BGR32
)
2172 RENAME(rgb32ToY
)(formatConvBuffer
, src
, srcW
, pal
);
2173 src
= formatConvBuffer
;
2175 else if (srcFormat
==PIX_FMT_BGR32_1
)
2177 RENAME(rgb32ToY
)(formatConvBuffer
, src
+ALT32_CORR
, srcW
, pal
);
2178 src
= formatConvBuffer
;
2180 else if (srcFormat
==PIX_FMT_RGB24
)
2182 RENAME(rgb24ToY
)(formatConvBuffer
, src
, srcW
, pal
);
2183 src
= formatConvBuffer
;
2185 else if (srcFormat
==PIX_FMT_RGB565
)
2187 RENAME(rgb16ToY
)(formatConvBuffer
, src
, srcW
, pal
);
2188 src
= formatConvBuffer
;
2190 else if (srcFormat
==PIX_FMT_RGB555
)
2192 RENAME(rgb15ToY
)(formatConvBuffer
, src
, srcW
, pal
);
2193 src
= formatConvBuffer
;
/* 8-bit / 4-bit / palettized formats all go through the palette lookup */
2195 else if (srcFormat
==PIX_FMT_RGB8
 || srcFormat
==PIX_FMT_BGR8
 || srcFormat
==PIX_FMT_PAL8
 || srcFormat
==PIX_FMT_BGR4_BYTE
 || srcFormat
==PIX_FMT_RGB4_BYTE
)
2197 RENAME(palToY
)(formatConvBuffer
, src
, srcW
, pal
);
2198 src
= formatConvBuffer
;
2200 else if (srcFormat
==PIX_FMT_MONOBLACK
)
2202 RENAME(monoblack2Y
)(formatConvBuffer
, src
, srcW
, pal
);
2203 src
= formatConvBuffer
;
2205 else if (srcFormat
==PIX_FMT_MONOWHITE
)
2207 RENAME(monowhite2Y
)(formatConvBuffer
, src
, srcW
, pal
);
2208 src
= formatConvBuffer
;
/* Step 2: pick a scaler. Generic filter path unless fast-bilinear MMX2
 * scaling is both requested and usable. */
2212 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2213 if (!(flags
&SWS_FAST_BILINEAR
) || (!canMMX2BeUsed
))
2215 if (!(flags
&SWS_FAST_BILINEAR
))
2218 RENAME(hScale
)(dst
, dstWidth
, src
, srcW
, xInc
, hLumFilter
, hLumFilterPos
, hLumFilterSize
);
2220 else // fast bilinear upscale / crap downscale
/* MMX2 "funny code" path: runtime-generated scaler code (funnyYCode) is
 * driven via mmx2Filter/mmx2FilterPos; ebx is saved around the asm block
 * because it is used as a pointer register. */
2222 #if ARCH_X86 && CONFIG_GPL
2226 uint64_t ebxsave
 __attribute__((aligned(8)));
2232 "mov %%"REG_b
", %5 \n\t"
2234 "pxor %%mm7, %%mm7 \n\t"
2235 "mov %0, %%"REG_c
" \n\t"
2236 "mov %1, %%"REG_D
" \n\t"
2237 "mov %2, %%"REG_d
" \n\t"
2238 "mov %3, %%"REG_b
" \n\t"
2239 "xor %%"REG_a
", %%"REG_a
" \n\t" // i
2240 PREFETCH
" (%%"REG_c
") \n\t"
2241 PREFETCH
" 32(%%"REG_c
") \n\t"
2242 PREFETCH
" 64(%%"REG_c
") \n\t"
/* 64-bit variant of the per-chunk glue executed between funny-code calls */
2246 #define FUNNY_Y_CODE \
2247 "movl (%%"REG_b"), %%esi \n\t"\
2249 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2250 "add %%"REG_S", %%"REG_c" \n\t"\
2251 "add %%"REG_a", %%"REG_D" \n\t"\
2252 "xor %%"REG_a", %%"REG_a" \n\t"\
/* 32-bit variant */
2256 #define FUNNY_Y_CODE \
2257 "movl (%%"REG_b"), %%esi \n\t"\
2259 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2260 "add %%"REG_a", %%"REG_D" \n\t"\
2261 "xor %%"REG_a", %%"REG_a" \n\t"\
2263 #endif /* ARCH_X86_64 */
2275 "mov %5, %%"REG_b
" \n\t"
2277 :: "m" (src
), "m" (dst
), "m" (mmx2Filter
), "m" (mmx2FilterPos
),
2282 : "%"REG_a
, "%"REG_c
, "%"REG_d
, "%"REG_S
, "%"REG_D
/* Pad the tail: destination pixels whose source position falls past
 * srcW-1 are filled with the last source sample (in 15-bit scale). */
2287 for (i
=dstWidth
-1; (i
*xInc
)>>16 >=srcW
-1; i
--) dst
[i
] = src
[srcW
-1]*128;
2291 #endif /* HAVE_MMX2 */
/* Plain x86 asm bilinear path (no MMX2): 16.16 increment split into the
 * integer step (%3) and fractional step (%4); loop unrolled by two. */
2292 long xInc_shr16
= xInc
 >> 16;
2293 uint16_t xInc_mask
= xInc
 & 0xffff;
2294 //NO MMX just normal asm ...
2296 "xor %%"REG_a
", %%"REG_a
" \n\t" // i
2297 "xor %%"REG_d
", %%"REG_d
" \n\t" // xx
2298 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2301 "movzbl (%0, %%"REG_d
"), %%edi \n\t" //src[xx]
2302 "movzbl 1(%0, %%"REG_d
"), %%esi \n\t" //src[xx+1]
2303 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2304 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2305 "shll $16, %%edi \n\t"
2306 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2307 "mov %1, %%"REG_D
" \n\t"
2308 "shrl $9, %%esi \n\t"
2309 "movw %%si, (%%"REG_D
", %%"REG_a
", 2) \n\t"
2310 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2311 "adc %3, %%"REG_d
" \n\t" //xx+= xInc>>8 + carry
2313 "movzbl (%0, %%"REG_d
"), %%edi \n\t" //src[xx]
2314 "movzbl 1(%0, %%"REG_d
"), %%esi \n\t" //src[xx+1]
2315 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2316 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2317 "shll $16, %%edi \n\t"
2318 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2319 "mov %1, %%"REG_D
" \n\t"
2320 "shrl $9, %%esi \n\t"
2321 "movw %%si, 2(%%"REG_D
", %%"REG_a
", 2) \n\t"
2322 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2323 "adc %3, %%"REG_d
" \n\t" //xx+= xInc>>8 + carry
2326 "add $2, %%"REG_a
" \n\t"
2327 "cmp %2, %%"REG_a
" \n\t"
2331 :: "r" (src
), "m" (dst
), "m" (dstWidth
), "m" (xInc_shr16
), "m" (xInc_mask
)
2332 : "%"REG_a
, "%"REG_d
, "%ecx", "%"REG_D
, "%esi"
2335 } //if MMX2 can't be used
/* Portable C fallback: 16.16 fixed-point bilinear interpolation; xalpha is
 * the 7-bit fractional weight, result kept in 15-bit precision. */
2339 unsigned int xpos
=0;
2340 for (i
=0;i
<dstWidth
;i
++)
2342 register unsigned int xx
=xpos
>>16;
2343 register unsigned int xalpha
=(xpos
&0xFFFF)>>9;
2344 dst
[i
]= (src
[xx
]<<7) + (src
[xx
+1] - src
[xx
])*xalpha
;
2347 #endif /* ARCH_X86 */
/* Step 3: luma range conversion when source and destination ranges differ
 * and the output is not RGB/BGR. The two fixed-point affine transforms
 * appear to map between full and limited range in 15-bit precision —
 * TODO(review) confirm direction of each branch against the callers. */
2350 if(c
->srcRange
 != c
->dstRange
 && !(isRGB(c
->dstFormat
) || isBGR(c
->dstFormat
))){
2352 //FIXME all pal and rgb srcFormats could do this convertion as well
2353 //FIXME all scalers more complex than bilinear could do half of this transform
2355 for (i
=0; i
<dstWidth
; i
++)
2356 dst
[i
]= (dst
[i
]*14071 + 33561947)>>14;
2358 for (i
=0; i
<dstWidth
; i
++)
2359 dst
[i
]= (FFMIN(dst
[i
],30189)*19077 - 39057361)>>14;
/* hcscale: horizontally scale one pair of chroma lines. U output goes to
 * dst[0..dstWidth-1], V output to dst[VOFW..], both 15-bit fixed point.
 * Non-planar-YUV inputs are first converted to 8-bit U/V planes in
 * formatConvBuffer (U) and formatConvBuffer+VOFW (V).
 * NOTE(review): this extraction is missing brace/label/#if lines; comments
 * below describe only the logic visible here. */
2364 inline static void RENAME(hcscale
)(SwsContext
*c
, uint16_t *dst
, long dstWidth
, uint8_t *src1
, uint8_t *src2
,
2365 int srcW
, int xInc
, int flags
, int canMMX2BeUsed
, int16_t *hChrFilter
,
2366 int16_t *hChrFilterPos
, int hChrFilterSize
, void *funnyUVCode
,
2367 int srcFormat
, uint8_t *formatConvBuffer
, int16_t *mmx2Filter
,
2368 int32_t *mmx2FilterPos
, uint32_t *pal
)
/* Step 1: input format conversion to planar 8-bit U/V. For RGB inputs,
 * the "_half" variants are used when chroma is horizontally subsampled
 * (c->chrSrcHSubSample), averaging pairs of source pixels. */
2370 if (srcFormat
==PIX_FMT_YUYV422
)
2372 RENAME(yuy2ToUV
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2373 src1
= formatConvBuffer
;
2374 src2
= formatConvBuffer
+VOFW
;
2376 else if (srcFormat
==PIX_FMT_UYVY422
)
2378 RENAME(uyvyToUV
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2379 src1
= formatConvBuffer
;
2380 src2
= formatConvBuffer
+VOFW
;
2382 else if (srcFormat
==PIX_FMT_RGB32
)
2384 if(c
->chrSrcHSubSample
)
2385 RENAME(bgr32ToUV_half
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2387 RENAME(bgr32ToUV
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2388 src1
= formatConvBuffer
;
2389 src2
= formatConvBuffer
+VOFW
;
2391 else if (srcFormat
==PIX_FMT_RGB32_1
)
/* ALT32_CORR skips the alpha byte for the "_1" 32-bit layouts */
2393 if(c
->chrSrcHSubSample
)
2394 RENAME(bgr32ToUV_half
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
+ALT32_CORR
, src2
+ALT32_CORR
, srcW
, pal
);
2396 RENAME(bgr32ToUV
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
+ALT32_CORR
, src2
+ALT32_CORR
, srcW
, pal
);
2397 src1
= formatConvBuffer
;
2398 src2
= formatConvBuffer
+VOFW
;
2400 else if (srcFormat
==PIX_FMT_BGR24
)
2402 if(c
->chrSrcHSubSample
)
2403 RENAME(bgr24ToUV_half
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2405 RENAME(bgr24ToUV
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2406 src1
= formatConvBuffer
;
2407 src2
= formatConvBuffer
+VOFW
;
2409 else if (srcFormat
==PIX_FMT_BGR565
)
2411 if(c
->chrSrcHSubSample
)
2412 RENAME(bgr16ToUV_half
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2414 RENAME(bgr16ToUV
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2415 src1
= formatConvBuffer
;
2416 src2
= formatConvBuffer
+VOFW
;
2418 else if (srcFormat
==PIX_FMT_BGR555
)
2420 if(c
->chrSrcHSubSample
)
2421 RENAME(bgr15ToUV_half
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2423 RENAME(bgr15ToUV
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2424 src1
= formatConvBuffer
;
2425 src2
= formatConvBuffer
+VOFW
;
2427 else if (srcFormat
==PIX_FMT_BGR32
)
2429 if(c
->chrSrcHSubSample
)
2430 RENAME(rgb32ToUV_half
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2432 RENAME(rgb32ToUV
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2433 src1
= formatConvBuffer
;
2434 src2
= formatConvBuffer
+VOFW
;
2436 else if (srcFormat
==PIX_FMT_BGR32_1
)
2438 if(c
->chrSrcHSubSample
)
2439 RENAME(rgb32ToUV_half
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
+ALT32_CORR
, src2
+ALT32_CORR
, srcW
, pal
);
2441 RENAME(rgb32ToUV
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
+ALT32_CORR
, src2
+ALT32_CORR
, srcW
, pal
);
2442 src1
= formatConvBuffer
;
2443 src2
= formatConvBuffer
+VOFW
;
2445 else if (srcFormat
==PIX_FMT_RGB24
)
2447 if(c
->chrSrcHSubSample
)
2448 RENAME(rgb24ToUV_half
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2450 RENAME(rgb24ToUV
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2451 src1
= formatConvBuffer
;
2452 src2
= formatConvBuffer
+VOFW
;
2454 else if (srcFormat
==PIX_FMT_RGB565
)
2456 if(c
->chrSrcHSubSample
)
2457 RENAME(rgb16ToUV_half
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2459 RENAME(rgb16ToUV
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2460 src1
= formatConvBuffer
;
2461 src2
= formatConvBuffer
+VOFW
;
2463 else if (srcFormat
==PIX_FMT_RGB555
)
2465 if(c
->chrSrcHSubSample
)
2466 RENAME(rgb15ToUV_half
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2468 RENAME(rgb15ToUV
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2469 src1
= formatConvBuffer
;
2470 src2
= formatConvBuffer
+VOFW
;
/* Grayscale / mono inputs carry no chroma — nothing to convert here */
2472 else if (isGray(srcFormat
) || srcFormat
==PIX_FMT_MONOBLACK
 || srcFormat
==PIX_FMT_MONOWHITE
)
2476 else if (srcFormat
==PIX_FMT_RGB8
 || srcFormat
==PIX_FMT_BGR8
 || srcFormat
==PIX_FMT_PAL8
 || srcFormat
==PIX_FMT_BGR4_BYTE
 || srcFormat
==PIX_FMT_RGB4_BYTE
)
2478 RENAME(palToUV
)(formatConvBuffer
, formatConvBuffer
+VOFW
, src1
, src2
, srcW
, pal
);
2479 src1
= formatConvBuffer
;
2480 src2
= formatConvBuffer
+VOFW
;
/* Step 2: pick a scaler; U and V are each scaled with the chroma filter. */
2484 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2485 if (!(flags
&SWS_FAST_BILINEAR
) || (!canMMX2BeUsed
))
2487 if (!(flags
&SWS_FAST_BILINEAR
))
2490 RENAME(hScale
)(dst
, dstWidth
, src1
, srcW
, xInc
, hChrFilter
, hChrFilterPos
, hChrFilterSize
);
2491 RENAME(hScale
)(dst
+VOFW
, dstWidth
, src2
, srcW
, xInc
, hChrFilter
, hChrFilterPos
, hChrFilterSize
);
2493 else // fast bilinear upscale / crap downscale
/* MMX2 "funny code" path: runtime-generated scaler (funnyUVCode) run once
 * for U and once for V; ebx is saved because it is used as a pointer reg. */
2495 #if ARCH_X86 && CONFIG_GPL
2499 uint64_t ebxsave
 __attribute__((aligned(8)));
2505 "mov %%"REG_b
", %6 \n\t"
2507 "pxor %%mm7, %%mm7 \n\t"
2508 "mov %0, %%"REG_c
" \n\t"
2509 "mov %1, %%"REG_D
" \n\t"
2510 "mov %2, %%"REG_d
" \n\t"
2511 "mov %3, %%"REG_b
" \n\t"
2512 "xor %%"REG_a
", %%"REG_a
" \n\t" // i
2513 PREFETCH
" (%%"REG_c
") \n\t"
2514 PREFETCH
" 32(%%"REG_c
") \n\t"
2515 PREFETCH
" 64(%%"REG_c
") \n\t"
/* 64-bit variant of the per-chunk glue executed between funny-code calls */
2519 #define FUNNY_UV_CODE \
2520 "movl (%%"REG_b"), %%esi \n\t"\
2522 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2523 "add %%"REG_S", %%"REG_c" \n\t"\
2524 "add %%"REG_a", %%"REG_D" \n\t"\
2525 "xor %%"REG_a", %%"REG_a" \n\t"\
/* 32-bit variant */
2529 #define FUNNY_UV_CODE \
2530 "movl (%%"REG_b"), %%esi \n\t"\
2532 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2533 "add %%"REG_a", %%"REG_D" \n\t"\
2534 "xor %%"REG_a", %%"REG_a" \n\t"\
2536 #endif /* ARCH_X86_64 */
/* Second pass: V plane — source switches to src2 (%5), destination is
 * offset by VOF into the second half of the output buffer. */
2542 "xor %%"REG_a
", %%"REG_a
" \n\t" // i
2543 "mov %5, %%"REG_c
" \n\t" // src
2544 "mov %1, %%"REG_D
" \n\t" // buf1
2545 "add $"AV_STRINGIFY(VOF
)", %%"REG_D
" \n\t"
2546 PREFETCH
" (%%"REG_c
") \n\t"
2547 PREFETCH
" 32(%%"REG_c
") \n\t"
2548 PREFETCH
" 64(%%"REG_c
") \n\t"
2556 "mov %6, %%"REG_b
" \n\t"
2558 :: "m" (src1
), "m" (dst
), "m" (mmx2Filter
), "m" (mmx2FilterPos
),
2559 "m" (funnyUVCode
), "m" (src2
)
2563 : "%"REG_a
, "%"REG_c
, "%"REG_d
, "%"REG_S
, "%"REG_D
/* Pad the tail of both planes with the last source sample (15-bit scale) */
2568 for (i
=dstWidth
-1; (i
*xInc
)>>16 >=srcW
-1; i
--)
2570 //printf("%d %d %d\n", dstWidth, i, srcW);
2571 dst
[i
] = src1
[srcW
-1]*128;
2572 dst
[i
+VOFW
] = src2
[srcW
-1]*128;
2577 #endif /* HAVE_MMX2 */
/* Plain x86 asm bilinear path (no MMX2): 16.16 increment split into the
 * integer step (%3) and fractional step (%4); each iteration produces one
 * U sample and one V sample. */
2578 long xInc_shr16
= (long) (xInc
 >> 16);
2579 uint16_t xInc_mask
= xInc
 & 0xffff;
2581 "xor %%"REG_a
", %%"REG_a
" \n\t" // i
2582 "xor %%"REG_d
", %%"REG_d
" \n\t" // xx
2583 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2586 "mov %0, %%"REG_S
" \n\t"
2587 "movzbl (%%"REG_S
", %%"REG_d
"), %%edi \n\t" //src[xx]
2588 "movzbl 1(%%"REG_S
", %%"REG_d
"), %%esi \n\t" //src[xx+1]
2589 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2590 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2591 "shll $16, %%edi \n\t"
2592 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2593 "mov %1, %%"REG_D
" \n\t"
2594 "shrl $9, %%esi \n\t"
2595 "movw %%si, (%%"REG_D
", %%"REG_a
", 2) \n\t"
2597 "movzbl (%5, %%"REG_d
"), %%edi \n\t" //src[xx]
2598 "movzbl 1(%5, %%"REG_d
"), %%esi \n\t" //src[xx+1]
2599 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2600 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2601 "shll $16, %%edi \n\t"
2602 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2603 "mov %1, %%"REG_D
" \n\t"
2604 "shrl $9, %%esi \n\t"
2605 "movw %%si, "AV_STRINGIFY(VOF
)"(%%"REG_D
", %%"REG_a
", 2) \n\t"
2607 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2608 "adc %3, %%"REG_d
" \n\t" //xx+= xInc>>8 + carry
2609 "add $1, %%"REG_a
" \n\t"
2610 "cmp %2, %%"REG_a
" \n\t"
2613 /* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2614 which is needed to support GCC 4.0. */
2615 #if ARCH_X86_64 && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2616 :: "m" (src1
), "m" (dst
), "g" ((long)dstWidth
), "m" (xInc_shr16
), "m" (xInc_mask
),
2618 :: "m" (src1
), "m" (dst
), "m" ((long)dstWidth
), "m" (xInc_shr16
), "m" (xInc_mask
),
2621 : "%"REG_a
, "%"REG_d
, "%ecx", "%"REG_D
, "%esi"
2624 } //if MMX2 can't be used
/* Portable C fallback: 16.16 fixed-point bilinear; xalpha^127 is the
 * complementary 7-bit weight, so each output stays in 15-bit precision. */
2628 unsigned int xpos
=0;
2629 for (i
=0;i
<dstWidth
;i
++)
2631 register unsigned int xx
=xpos
>>16;
2632 register unsigned int xalpha
=(xpos
&0xFFFF)>>9;
2633 dst
[i
]=(src1
[xx
]*(xalpha
^127)+src1
[xx
+1]*xalpha
);
2634 dst
[i
+VOFW
]=(src2
[xx
]*(xalpha
^127)+src2
[xx
+1]*xalpha
);
2636 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2637 dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2641 #endif /* ARCH_X86 */
/* Step 3: chroma range conversion when source and destination ranges
 * differ and the output is not RGB/BGR — applied to both U and V halves.
 * The affine constants appear to rescale around the chroma midpoint —
 * TODO(review) confirm direction of each branch against the callers. */
2643 if(c
->srcRange
 != c
->dstRange
 && !(isRGB(c
->dstFormat
) || isBGR(c
->dstFormat
))){
2645 //FIXME all pal and rgb srcFormats could do this convertion as well
2646 //FIXME all scalers more complex than bilinear could do half of this transform
2648 for (i
=0; i
<dstWidth
; i
++){
2649 dst
[i
]= (dst
[i
]*1799 + 4081085)>>11; //1469
2650 dst
[i
+VOFW
]= (dst
[i
+VOFW
]*1799 + 4081085)>>11; //1469
2653 for (i
=0; i
<dstWidth
; i
++){
2654 dst
[i
]= (FFMIN(dst
[i
],30775)*4663 - 9289992)>>12; //-264
2655 dst
[i
+VOFW
]= (FFMIN(dst
[i
+VOFW
],30775)*4663 - 9289992)>>12; //-264
2661 static int RENAME(swScale
)(SwsContext
*c
, uint8_t* src
[], int srcStride
[], int srcSliceY
,
2662 int srcSliceH
, uint8_t* dst
[], int dstStride
[]){
2664 /* load a few things into local vars to make the code more readable? and faster */
2665 const int srcW
= c
->srcW
;
2666 const int dstW
= c
->dstW
;
2667 const int dstH
= c
->dstH
;
2668 const int chrDstW
= c
->chrDstW
;
2669 const int chrSrcW
= c
->chrSrcW
;
2670 const int lumXInc
= c
->lumXInc
;
2671 const int chrXInc
= c
->chrXInc
;
2672 const int dstFormat
= c
->dstFormat
;
2673 const int srcFormat
= c
->srcFormat
;
2674 const int flags
= c
->flags
;
2675 const int canMMX2BeUsed
= c
->canMMX2BeUsed
;
2676 int16_t *vLumFilterPos
= c
->vLumFilterPos
;
2677 int16_t *vChrFilterPos
= c
->vChrFilterPos
;
2678 int16_t *hLumFilterPos
= c
->hLumFilterPos
;
2679 int16_t *hChrFilterPos
= c
->hChrFilterPos
;
2680 int16_t *vLumFilter
= c
->vLumFilter
;
2681 int16_t *vChrFilter
= c
->vChrFilter
;
2682 int16_t *hLumFilter
= c
->hLumFilter
;
2683 int16_t *hChrFilter
= c
->hChrFilter
;
2684 int32_t *lumMmxFilter
= c
->lumMmxFilter
;
2685 int32_t *chrMmxFilter
= c
->chrMmxFilter
;
2686 const int vLumFilterSize
= c
->vLumFilterSize
;
2687 const int vChrFilterSize
= c
->vChrFilterSize
;
2688 const int hLumFilterSize
= c
->hLumFilterSize
;
2689 const int hChrFilterSize
= c
->hChrFilterSize
;
2690 int16_t **lumPixBuf
= c
->lumPixBuf
;
2691 int16_t **chrPixBuf
= c
->chrPixBuf
;
2692 const int vLumBufSize
= c
->vLumBufSize
;
2693 const int vChrBufSize
= c
->vChrBufSize
;
2694 uint8_t *funnyYCode
= c
->funnyYCode
;
2695 uint8_t *funnyUVCode
= c
->funnyUVCode
;
2696 uint8_t *formatConvBuffer
= c
->formatConvBuffer
;
2697 const int chrSrcSliceY
= srcSliceY
>> c
->chrSrcVSubSample
;
2698 const int chrSrcSliceH
= -((-srcSliceH
) >> c
->chrSrcVSubSample
);
2700 uint32_t *pal
=c
->pal_yuv
;
2702 /* vars which will change and which we need to store back in the context */
2704 int lumBufIndex
= c
->lumBufIndex
;
2705 int chrBufIndex
= c
->chrBufIndex
;
2706 int lastInLumBuf
= c
->lastInLumBuf
;
2707 int lastInChrBuf
= c
->lastInChrBuf
;
2709 if (isPacked(c
->srcFormat
)){
2715 srcStride
[2]= srcStride
[0];
2717 srcStride
[1]<<= c
->vChrDrop
;
2718 srcStride
[2]<<= c
->vChrDrop
;
2720 //printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2721 // (int)dst[0], (int)dst[1], (int)dst[2]);
2723 #if 0 //self test FIXME move to a vfilter or something
2725 static volatile int i
=0;
2727 if (srcFormat
==PIX_FMT_YUV420P
&& i
==1 && srcSliceH
>= c
->srcH
)
2728 selfTest(src
, srcStride
, c
->srcW
, c
->srcH
);
2733 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2734 //dstStride[0],dstStride[1],dstStride[2]);
2736 if (dstStride
[0]%8 !=0 || dstStride
[1]%8 !=0 || dstStride
[2]%8 !=0)
2738 static int warnedAlready
=0; //FIXME move this into the context perhaps
2739 if (flags
& SWS_PRINT_INFO
&& !warnedAlready
)
2741 av_log(c
, AV_LOG_WARNING
, "Warning: dstStride is not aligned!\n"
2742 " ->cannot do aligned memory accesses anymore\n");
2747 /* Note the user might start scaling the picture in the middle so this
2748 will not get executed. This is not really intended but works
2749 currently, so people might do it. */
2760 for (;dstY
< dstH
; dstY
++){
2761 unsigned char *dest
=dst
[0]+dstStride
[0]*dstY
;
2762 const int chrDstY
= dstY
>>c
->chrDstVSubSample
;
2763 unsigned char *uDest
=dst
[1]+dstStride
[1]*chrDstY
;
2764 unsigned char *vDest
=dst
[2]+dstStride
[2]*chrDstY
;
2766 const int firstLumSrcY
= vLumFilterPos
[dstY
]; //First line needed as input
2767 const int firstChrSrcY
= vChrFilterPos
[chrDstY
]; //First line needed as input
2768 const int lastLumSrcY
= firstLumSrcY
+ vLumFilterSize
-1; // Last line needed as input
2769 const int lastChrSrcY
= firstChrSrcY
+ vChrFilterSize
-1; // Last line needed as input
2771 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
2772 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
2773 //handle holes (FAST_BILINEAR & weird filters)
2774 if (firstLumSrcY
> lastInLumBuf
) lastInLumBuf
= firstLumSrcY
-1;
2775 if (firstChrSrcY
> lastInChrBuf
) lastInChrBuf
= firstChrSrcY
-1;
2776 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
2777 assert(firstLumSrcY
>= lastInLumBuf
- vLumBufSize
+ 1);
2778 assert(firstChrSrcY
>= lastInChrBuf
- vChrBufSize
+ 1);
2780 // Do we have enough lines in this slice to output the dstY line
2781 if (lastLumSrcY
< srcSliceY
+ srcSliceH
&& lastChrSrcY
< -((-srcSliceY
- srcSliceH
)>>c
->chrSrcVSubSample
))
2783 //Do horizontal scaling
2784 while(lastInLumBuf
< lastLumSrcY
)
2786 uint8_t *s
= src
[0]+(lastInLumBuf
+ 1 - srcSliceY
)*srcStride
[0];
2788 //printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
2789 assert(lumBufIndex
< 2*vLumBufSize
);
2790 assert(lastInLumBuf
+ 1 - srcSliceY
< srcSliceH
);
2791 assert(lastInLumBuf
+ 1 - srcSliceY
>= 0);
2792 //printf("%d %d\n", lumBufIndex, vLumBufSize);
2793 RENAME(hyscale
)(c
, lumPixBuf
[ lumBufIndex
], dstW
, s
, srcW
, lumXInc
,
2794 flags
, canMMX2BeUsed
, hLumFilter
, hLumFilterPos
, hLumFilterSize
,
2795 funnyYCode
, c
->srcFormat
, formatConvBuffer
,
2796 c
->lumMmx2Filter
, c
->lumMmx2FilterPos
, pal
);
2799 while(lastInChrBuf
< lastChrSrcY
)
2801 uint8_t *src1
= src
[1]+(lastInChrBuf
+ 1 - chrSrcSliceY
)*srcStride
[1];
2802 uint8_t *src2
= src
[2]+(lastInChrBuf
+ 1 - chrSrcSliceY
)*srcStride
[2];
2804 assert(chrBufIndex
< 2*vChrBufSize
);
2805 assert(lastInChrBuf
+ 1 - chrSrcSliceY
< (chrSrcSliceH
));
2806 assert(lastInChrBuf
+ 1 - chrSrcSliceY
>= 0);
2807 //FIXME replace parameters through context struct (some at least)
2809 if (!(isGray(srcFormat
) || isGray(dstFormat
)))
2810 RENAME(hcscale
)(c
, chrPixBuf
[ chrBufIndex
], chrDstW
, src1
, src2
, chrSrcW
, chrXInc
,
2811 flags
, canMMX2BeUsed
, hChrFilter
, hChrFilterPos
, hChrFilterSize
,
2812 funnyUVCode
, c
->srcFormat
, formatConvBuffer
,
2813 c
->chrMmx2Filter
, c
->chrMmx2FilterPos
, pal
);
2816 //wrap buf index around to stay inside the ring buffer
2817 if (lumBufIndex
>= vLumBufSize
) lumBufIndex
-= vLumBufSize
;
2818 if (chrBufIndex
>= vChrBufSize
) chrBufIndex
-= vChrBufSize
;
2820 else // not enough lines left in this slice -> load the rest in the buffer
2822 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
2823 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
2824 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
2825 vChrBufSize, vLumBufSize);*/
2827 //Do horizontal scaling
2828 while(lastInLumBuf
+1 < srcSliceY
+ srcSliceH
)
2830 uint8_t *s
= src
[0]+(lastInLumBuf
+ 1 - srcSliceY
)*srcStride
[0];
2832 assert(lumBufIndex
< 2*vLumBufSize
);
2833 assert(lastInLumBuf
+ 1 - srcSliceY
< srcSliceH
);
2834 assert(lastInLumBuf
+ 1 - srcSliceY
>= 0);
2835 RENAME(hyscale
)(c
, lumPixBuf
[ lumBufIndex
], dstW
, s
, srcW
, lumXInc
,
2836 flags
, canMMX2BeUsed
, hLumFilter
, hLumFilterPos
, hLumFilterSize
,
2837 funnyYCode
, c
->srcFormat
, formatConvBuffer
,
2838 c
->lumMmx2Filter
, c
->lumMmx2FilterPos
, pal
);
2841 while(lastInChrBuf
+1 < (chrSrcSliceY
+ chrSrcSliceH
))
2843 uint8_t *src1
= src
[1]+(lastInChrBuf
+ 1 - chrSrcSliceY
)*srcStride
[1];
2844 uint8_t *src2
= src
[2]+(lastInChrBuf
+ 1 - chrSrcSliceY
)*srcStride
[2];
2846 assert(chrBufIndex
< 2*vChrBufSize
);
2847 assert(lastInChrBuf
+ 1 - chrSrcSliceY
< chrSrcSliceH
);
2848 assert(lastInChrBuf
+ 1 - chrSrcSliceY
>= 0);
2850 if (!(isGray(srcFormat
) || isGray(dstFormat
)))
2851 RENAME(hcscale
)(c
, chrPixBuf
[ chrBufIndex
], chrDstW
, src1
, src2
, chrSrcW
, chrXInc
,
2852 flags
, canMMX2BeUsed
, hChrFilter
, hChrFilterPos
, hChrFilterSize
,
2853 funnyUVCode
, c
->srcFormat
, formatConvBuffer
,
2854 c
->chrMmx2Filter
, c
->chrMmx2FilterPos
, pal
);
2857 //wrap buf index around to stay inside the ring buffer
2858 if (lumBufIndex
>= vLumBufSize
) lumBufIndex
-= vLumBufSize
;
2859 if (chrBufIndex
>= vChrBufSize
) chrBufIndex
-= vChrBufSize
;
2860 break; //we can't output a dstY line so let's try with the next slice
2864 c
->blueDither
= ff_dither8
[dstY
&1];
2865 if (c
->dstFormat
== PIX_FMT_RGB555
|| c
->dstFormat
== PIX_FMT_BGR555
)
2866 c
->greenDither
= ff_dither8
[dstY
&1];
2868 c
->greenDither
= ff_dither4
[dstY
&1];
2869 c
->redDither
= ff_dither8
[(dstY
+1)&1];
2873 int16_t **lumSrcPtr
= lumPixBuf
+ lumBufIndex
+ firstLumSrcY
- lastInLumBuf
+ vLumBufSize
;
2874 int16_t **chrSrcPtr
= chrPixBuf
+ chrBufIndex
+ firstChrSrcY
- lastInChrBuf
+ vChrBufSize
;
2877 if (flags
& SWS_ACCURATE_RND
){
2878 int s
= APCK_SIZE
/ 8;
2879 for (i
=0; i
<vLumFilterSize
; i
+=2){
2880 *(void**)&lumMmxFilter
[s
*i
]= lumSrcPtr
[i
];
2881 *(void**)&lumMmxFilter
[s
*i
+APCK_PTR2
/4 ]= lumSrcPtr
[i
+(vLumFilterSize
>1)];
2882 lumMmxFilter
[s
*i
+APCK_COEF
/4 ]=
2883 lumMmxFilter
[s
*i
+APCK_COEF
/4+1]= vLumFilter
[dstY
*vLumFilterSize
+ i
]
2884 + (vLumFilterSize
>1 ? vLumFilter
[dstY
*vLumFilterSize
+ i
+ 1]<<16 : 0);
2886 for (i
=0; i
<vChrFilterSize
; i
+=2){
2887 *(void**)&chrMmxFilter
[s
*i
]= chrSrcPtr
[i
];
2888 *(void**)&chrMmxFilter
[s
*i
+APCK_PTR2
/4 ]= chrSrcPtr
[i
+(vChrFilterSize
>1)];
2889 chrMmxFilter
[s
*i
+APCK_COEF
/4 ]=
2890 chrMmxFilter
[s
*i
+APCK_COEF
/4+1]= vChrFilter
[chrDstY
*vChrFilterSize
+ i
]
2891 + (vChrFilterSize
>1 ? vChrFilter
[chrDstY
*vChrFilterSize
+ i
+ 1]<<16 : 0);
2894 for (i
=0; i
<vLumFilterSize
; i
++)
2896 lumMmxFilter
[4*i
+0]= (int32_t)lumSrcPtr
[i
];
2897 lumMmxFilter
[4*i
+1]= (uint64_t)lumSrcPtr
[i
] >> 32;
2898 lumMmxFilter
[4*i
+2]=
2899 lumMmxFilter
[4*i
+3]=
2900 ((uint16_t)vLumFilter
[dstY
*vLumFilterSize
+ i
])*0x10001;
2902 for (i
=0; i
<vChrFilterSize
; i
++)
2904 chrMmxFilter
[4*i
+0]= (int32_t)chrSrcPtr
[i
];
2905 chrMmxFilter
[4*i
+1]= (uint64_t)chrSrcPtr
[i
] >> 32;
2906 chrMmxFilter
[4*i
+2]=
2907 chrMmxFilter
[4*i
+3]=
2908 ((uint16_t)vChrFilter
[chrDstY
*vChrFilterSize
+ i
])*0x10001;
2912 if (dstFormat
== PIX_FMT_NV12
|| dstFormat
== PIX_FMT_NV21
){
2913 const int chrSkipMask
= (1<<c
->chrDstVSubSample
)-1;
2914 if (dstY
&chrSkipMask
) uDest
= NULL
; //FIXME split functions in lumi / chromi
2915 RENAME(yuv2nv12X
)(c
,
2916 vLumFilter
+dstY
*vLumFilterSize
, lumSrcPtr
, vLumFilterSize
,
2917 vChrFilter
+chrDstY
*vChrFilterSize
, chrSrcPtr
, vChrFilterSize
,
2918 dest
, uDest
, dstW
, chrDstW
, dstFormat
);
2920 else if (isPlanarYUV(dstFormat
) || dstFormat
==PIX_FMT_GRAY8
) //YV12 like
2922 const int chrSkipMask
= (1<<c
->chrDstVSubSample
)-1;
2923 if ((dstY
&chrSkipMask
) || isGray(dstFormat
)) uDest
=vDest
= NULL
; //FIXME split functions in lumi / chromi
2924 if (vLumFilterSize
== 1 && vChrFilterSize
== 1) // unscaled YV12
2926 int16_t *lumBuf
= lumPixBuf
[0];
2927 int16_t *chrBuf
= chrPixBuf
[0];
2928 RENAME(yuv2yuv1
)(c
, lumBuf
, chrBuf
, dest
, uDest
, vDest
, dstW
, chrDstW
);
2933 vLumFilter
+dstY
*vLumFilterSize
, lumSrcPtr
, vLumFilterSize
,
2934 vChrFilter
+chrDstY
*vChrFilterSize
, chrSrcPtr
, vChrFilterSize
,
2935 dest
, uDest
, vDest
, dstW
, chrDstW
);
2940 assert(lumSrcPtr
+ vLumFilterSize
- 1 < lumPixBuf
+ vLumBufSize
*2);
2941 assert(chrSrcPtr
+ vChrFilterSize
- 1 < chrPixBuf
+ vChrBufSize
*2);
2942 if (vLumFilterSize
== 1 && vChrFilterSize
== 2) //unscaled RGB
2944 int chrAlpha
= vChrFilter
[2*dstY
+1];
2945 if(flags
& SWS_FULL_CHR_H_INT
){
2946 yuv2rgbXinC_full(c
, //FIXME write a packed1_full function
2947 vLumFilter
+dstY
*vLumFilterSize
, lumSrcPtr
, vLumFilterSize
,
2948 vChrFilter
+dstY
*vChrFilterSize
, chrSrcPtr
, vChrFilterSize
,
2951 RENAME(yuv2packed1
)(c
, *lumSrcPtr
, *chrSrcPtr
, *(chrSrcPtr
+1),
2952 dest
, dstW
, chrAlpha
, dstFormat
, flags
, dstY
);
2955 else if (vLumFilterSize
== 2 && vChrFilterSize
== 2) //bilinear upscale RGB
2957 int lumAlpha
= vLumFilter
[2*dstY
+1];
2958 int chrAlpha
= vChrFilter
[2*dstY
+1];
2960 lumMmxFilter
[3]= vLumFilter
[2*dstY
]*0x10001;
2962 chrMmxFilter
[3]= vChrFilter
[2*chrDstY
]*0x10001;
2963 if(flags
& SWS_FULL_CHR_H_INT
){
2964 yuv2rgbXinC_full(c
, //FIXME write a packed2_full function
2965 vLumFilter
+dstY
*vLumFilterSize
, lumSrcPtr
, vLumFilterSize
,
2966 vChrFilter
+dstY
*vChrFilterSize
, chrSrcPtr
, vChrFilterSize
,
2969 RENAME(yuv2packed2
)(c
, *lumSrcPtr
, *(lumSrcPtr
+1), *chrSrcPtr
, *(chrSrcPtr
+1),
2970 dest
, dstW
, lumAlpha
, chrAlpha
, dstY
);
2975 if(flags
& SWS_FULL_CHR_H_INT
){
2977 vLumFilter
+dstY
*vLumFilterSize
, lumSrcPtr
, vLumFilterSize
,
2978 vChrFilter
+dstY
*vChrFilterSize
, chrSrcPtr
, vChrFilterSize
,
2981 RENAME(yuv2packedX
)(c
,
2982 vLumFilter
+dstY
*vLumFilterSize
, lumSrcPtr
, vLumFilterSize
,
2983 vChrFilter
+dstY
*vChrFilterSize
, chrSrcPtr
, vChrFilterSize
,
2989 else // hmm looks like we can't use MMX here without overwriting this array's tail
2991 int16_t **lumSrcPtr
= lumPixBuf
+ lumBufIndex
+ firstLumSrcY
- lastInLumBuf
+ vLumBufSize
;
2992 int16_t **chrSrcPtr
= chrPixBuf
+ chrBufIndex
+ firstChrSrcY
- lastInChrBuf
+ vChrBufSize
;
2993 if (dstFormat
== PIX_FMT_NV12
|| dstFormat
== PIX_FMT_NV21
){
2994 const int chrSkipMask
= (1<<c
->chrDstVSubSample
)-1;
2995 if (dstY
&chrSkipMask
) uDest
= NULL
; //FIXME split functions in lumi / chromi
2997 vLumFilter
+dstY
*vLumFilterSize
, lumSrcPtr
, vLumFilterSize
,
2998 vChrFilter
+chrDstY
*vChrFilterSize
, chrSrcPtr
, vChrFilterSize
,
2999 dest
, uDest
, dstW
, chrDstW
, dstFormat
);
3001 else if (isPlanarYUV(dstFormat
) || dstFormat
==PIX_FMT_GRAY8
) //YV12
3003 const int chrSkipMask
= (1<<c
->chrDstVSubSample
)-1;
3004 if ((dstY
&chrSkipMask
) || isGray(dstFormat
)) uDest
=vDest
= NULL
; //FIXME split functions in lumi / chromi
3006 vLumFilter
+dstY
*vLumFilterSize
, lumSrcPtr
, vLumFilterSize
,
3007 vChrFilter
+chrDstY
*vChrFilterSize
, chrSrcPtr
, vChrFilterSize
,
3008 dest
, uDest
, vDest
, dstW
, chrDstW
);
3012 assert(lumSrcPtr
+ vLumFilterSize
- 1 < lumPixBuf
+ vLumBufSize
*2);
3013 assert(chrSrcPtr
+ vChrFilterSize
- 1 < chrPixBuf
+ vChrBufSize
*2);
3014 if(flags
& SWS_FULL_CHR_H_INT
){
3016 vLumFilter
+dstY
*vLumFilterSize
, lumSrcPtr
, vLumFilterSize
,
3017 vChrFilter
+dstY
*vChrFilterSize
, chrSrcPtr
, vChrFilterSize
,
3021 vLumFilter
+dstY
*vLumFilterSize
, lumSrcPtr
, vLumFilterSize
,
3022 vChrFilter
+dstY
*vChrFilterSize
, chrSrcPtr
, vChrFilterSize
,
3030 __asm__
volatile(SFENCE:::"memory");
3031 __asm__
volatile(EMMS:::"memory");
3033 /* store changed local vars back in the context */
3035 c
->lumBufIndex
= lumBufIndex
;
3036 c
->chrBufIndex
= chrBufIndex
;
3037 c
->lastInLumBuf
= lastInLumBuf
;
3038 c
->lastInChrBuf
= lastInChrBuf
;
3040 return dstY
- lastDstY
;