/*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * The C code (not assembly, MMX, ...) of this file can be used
 * under the LGPL license.
 */
#if COMPILE_TEMPLATE_AMD3DNOW
#define PREFETCH  "prefetch"
#elif COMPILE_TEMPLATE_MMX2
#define PREFETCH "prefetchnta"
#else
#define PREFETCH  " # nop"
#endif
#if COMPILE_TEMPLATE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif COMPILE_TEMPLATE_AMD3DNOW
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif
#if COMPILE_TEMPLATE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)
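/* MOVNTQ resolves to a non-temporal store on MMX2, bypassing the cache for
 * the write-only output buffers, and falls back to a plain movq elsewhere. */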
#if COMPILE_TEMPLATE_ALTIVEC
#include "ppc/swscale_altivec_template.c"
#endif
#define YSCALEYUV2YV12X(x, offset, dest, width) \
    __asm__ volatile(\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1: \n\t"\
    "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
    "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
    "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t"\
    "pmulhw %%mm0, %%mm5 \n\t"\
    "paddw %%mm2, %%mm3 \n\t"\
    "paddw %%mm5, %%mm4 \n\t"\
    " jnz 1b \n\t"\
    "psraw $3, %%mm3 \n\t"\
    "psraw $3, %%mm4 \n\t"\
    "packuswb %%mm4, %%mm3 \n\t"\
    MOVNTQ(%%mm3, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "cmp %2, %%"REG_a" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "jb 1b \n\t"\
    :: "r" (&c->redDither),\
       "r" (dest), "g" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );
#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
    __asm__ volatile(\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
    "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm1, %%mm0 \n\t"\
    "punpckhwd %%mm1, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm3 \n\t"\
    "paddd %%mm0, %%mm4 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm2 \n\t"\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "paddd %%mm2, %%mm6 \n\t"\
    "paddd %%mm0, %%mm7 \n\t"\
    " jnz 1b \n\t"\
    "psrad $16, %%mm4 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm4 \n\t"\
    "packssdw %%mm7, %%mm6 \n\t"\
    "paddw %%mm0, %%mm4 \n\t"\
    "paddw %%mm0, %%mm6 \n\t"\
    "psraw $3, %%mm4 \n\t"\
    "psraw $3, %%mm6 \n\t"\
    "packuswb %%mm6, %%mm4 \n\t"\
    MOVNTQ(%%mm4, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "cmp %2, %%"REG_a" \n\t"\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "jb 1b \n\t"\
    :: "r" (&c->redDither),\
       "r" (dest), "g" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );
#define YSCALEYUV2YV121 \
    "mov %2, %%"REG_a" \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1: \n\t"\
    "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
    "psraw $7, %%mm0 \n\t"\
    "psraw $7, %%mm1 \n\t"\
    "packuswb %%mm1, %%mm0 \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "jnc 1b \n\t"
#define YSCALEYUV2YV121_ACCURATE \
    "mov %2, %%"REG_a" \n\t"\
    "pcmpeqw %%mm7, %%mm7 \n\t"\
    "psrlw $15, %%mm7 \n\t"\
    "psllw $6, %%mm7 \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1: \n\t"\
    "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
    "paddsw %%mm7, %%mm0 \n\t"\
    "paddsw %%mm7, %%mm1 \n\t"\
    "psraw $7, %%mm0 \n\t"\
    "psraw $7, %%mm1 \n\t"\
    "packuswb %%mm1, %%mm0 \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "jnc 1b \n\t"
184 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
185 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
186 "r" (dest), "m" (dstW),
187 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
188 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
#define YSCALEYUV2PACKEDX_UV \
    __asm__ volatile(\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    ASMALIGN(4)\
    "nop \n\t"\
    "1: \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
    "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t"\
    "pmulhw %%mm0, %%mm5 \n\t"\
    "paddw %%mm2, %%mm3 \n\t"\
    "paddw %%mm5, %%mm4 \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    " jnz 2b \n\t"\

#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
    "lea "offset"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
    "movq "#dst1", "#dst2" \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
    "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pmulhw "#coeff", "#src1" \n\t"\
    "pmulhw "#coeff", "#src2" \n\t"\
    "paddw "#src1", "#dst1" \n\t"\
    "paddw "#src2", "#dst2" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    " jnz 2b \n\t"\

#define YSCALEYUV2PACKEDX \
    YSCALEYUV2PACKEDX_UV \
    YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \

#define YSCALEYUV2PACKEDX_END \
    :: "r" (&c->redDither), \
       "m" (dummy), "m" (dummy), "m" (dummy),\
       "r" (dest), "m" (dstW) \
    : "%"REG_a, "%"REG_d, "%"REG_S \
    );
#define YSCALEYUV2PACKEDX_ACCURATE_UV \
    __asm__ volatile(\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    ASMALIGN(4)\
    "nop \n\t"\
    "1: \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm1, %%mm0 \n\t"\
    "punpckhwd %%mm1, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm3 \n\t"\
    "paddd %%mm0, %%mm4 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm2 \n\t"\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "paddd %%mm2, %%mm6 \n\t"\
    "paddd %%mm0, %%mm7 \n\t"\
    " jnz 2b \n\t"\
    "psrad $16, %%mm4 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm4 \n\t"\
    "packssdw %%mm7, %%mm6 \n\t"\
    "paddw %%mm0, %%mm4 \n\t"\
    "paddw %%mm0, %%mm6 \n\t"\
    "movq %%mm4, "U_TEMP"(%0) \n\t"\
    "movq %%mm6, "V_TEMP"(%0) \n\t"\

#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
    "lea "offset"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pxor %%mm1, %%mm1 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm4, %%mm0 \n\t"\
    "punpckhwd %%mm4, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm3 \n\t"\
    "paddd %%mm0, %%mm1 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm2 \n\t"\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "paddd %%mm2, %%mm7 \n\t"\
    "paddd %%mm0, %%mm6 \n\t"\
    " jnz 2b \n\t"\
    "psrad $16, %%mm1 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm1 \n\t"\
    "packssdw %%mm6, %%mm7 \n\t"\
    "paddw %%mm0, %%mm1 \n\t"\
    "paddw %%mm0, %%mm7 \n\t"\
    "movq "U_TEMP"(%0), %%mm3 \n\t"\
    "movq "V_TEMP"(%0), %%mm4 \n\t"\

#define YSCALEYUV2PACKEDX_ACCURATE \
    YSCALEYUV2PACKEDX_ACCURATE_UV \
    YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
#define YSCALEYUV2RGBX \
    "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
    "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
    "psraw $3, %%mm0 \n\t"\
    "psraw $3, %%mm1 \n\t"\
    "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $7, %%mm3 \n\t" /* uvbuf1[eax] >>7*/\
    "psraw $7, %%mm4 \n\t" /* uvbuf1[eax+2048] >>7*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $7, %%mm1 \n\t" /* buf1[eax] >>7*/\
    "psraw $7, %%mm7 \n\t" /* buf1[eax] >>7*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
#define REAL_YSCALEYUV2RGB_UV(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\

#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
    "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

#define REAL_YSCALEYUV2RGB_COEFF(c) \
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
#define YSCALEYUV2RGB(index, c) \
    REAL_YSCALEYUV2RGB_UV(index, c) \
    REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
    REAL_YSCALEYUV2RGB_COEFF(c)
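/* YSCALEYUV2RGB chains the three pieces above: _UV blends the two chroma
 * lines with uvalpha and removes the bias, _YA blends the two luma lines with
 * yalpha (both computing b1 + (b0-b1)*alpha in fixed point), and _COEFF
 * finishes the matrix multiply and packs the B/G/R bytes. */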
#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "psraw $7, %%mm3 \n\t" \
    "psraw $7, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t" \

#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
#define REAL_YSCALEYUV2RGB1(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $8, %%mm3 \n\t" \
    "psrlw $8, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t"
#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
// do vertical chrominance interpolation
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
    "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
#define REAL_YSCALEYUV2RGB1_ALPHA(index) \
    "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index  ] */\
    "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
    "psraw $7, %%mm7 \n\t" /* abuf0[index  ] >>7 */\
    "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
    "packuswb %%mm1, %%mm7 \n\t"
#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
    "movq "#b", "#q2" \n\t" /* B */\
    "movq "#r", "#t" \n\t" /* R */\
    "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
    "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
    "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
    "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
    "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
    "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
    "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
    "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
    "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
    "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
\
    MOVNTQ(   q0,   (dst, index, 4))\
    MOVNTQ(    b,  8(dst, index, 4))\
    MOVNTQ(   q2, 16(dst, index, 4))\
    MOVNTQ(   q3, 24(dst, index, 4))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)  REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
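/* WRITEBGR32 interleaves the four packed byte planes (B, G, R and alpha) with
 * two rounds of punpck*, turning planar bytes into eight 32-bit ARGB-style
 * pixels, and streams them out with MOVNTQ. */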
#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $3, %%mm3 \n\t"\
    "psllq $3, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITERGB16(dst, dstw, index)  REAL_WRITERGB16(dst, dstw, index)
#define REAL_WRITERGB15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
    "psrlq $1, %%mm5 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $2, %%mm3 \n\t"\
    "psllq $2, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITERGB15(dst, dstw, index)  REAL_WRITERGB15(dst, dstw, index)
#define WRITEBGR24OLD(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1 \n\t" /* B */\
    "movq %%mm5, %%mm6 \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
    "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
    "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
    "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
    "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
    "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
    "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
    "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
    "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
\
    "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
    "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
    "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
    "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
    "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
    "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
    "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
    "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
    "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
    "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
    "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
    "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
\
    "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
    "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
    "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
    "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
    "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
    "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
    "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
    "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
\
    MOVNTQ(%%mm0, (dst))\
    MOVNTQ(%%mm2, 8(dst))\
    MOVNTQ(%%mm3, 16(dst))\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1 \n\t" /* B */\
    "movq %%mm5, %%mm6 \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
    "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
    "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
\
    "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
    "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
    "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
    "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
    "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
    "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
    "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
    "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
    "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
    "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEBGR24MMX2(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
\
    "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
    "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
    "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
\
    "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
    "por %%mm1, %%mm6 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
    "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
    "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
\
    "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
\
    "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
    "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
\
    "por %%mm1, %%mm3 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#if COMPILE_TEMPLATE_MMX2
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
#endif
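/* On MMX2 the 24-bit writer can gather the scattered B/G/R bytes directly
 * with pshufw; the plain-MMX fallback has to emulate the shuffles with
 * shift/or sequences, which is why both variants are kept. */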
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb %%mm3, %%mm3 \n\t"\
    "packuswb %%mm4, %%mm4 \n\t"\
    "packuswb %%mm7, %%mm1 \n\t"\
    "punpcklbw %%mm4, %%mm3 \n\t"\
    "movq %%mm1, %%mm7 \n\t"\
    "punpcklbw %%mm3, %%mm1 \n\t"\
    "punpckhbw %%mm3, %%mm7 \n\t"\
\
    MOVNTQ(%%mm1,  (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)
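/* WRITEYUY2 packs the luma into mm1, interleaves U (mm3) with V (mm4), and
 * then interleaves luma with chroma, producing the Y0 U Y1 V byte order of
 * PIX_FMT_YUYV422. */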
static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                    const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
{
#if COMPILE_TEMPLATE_MMX
    if(!(c->flags & SWS_BITEXACT)) {
        if (c->flags & SWS_ACCURATE_RND) {
            if (uDest) {
                YSCALEYUV2YV12X_ACCURATE(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }
            if (CONFIG_SWSCALE_ALPHA && aDest) {
                YSCALEYUV2YV12X_ACCURATE(   "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
            }

            YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        } else {
            if (uDest) {
                YSCALEYUV2YV12X(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }
            if (CONFIG_SWSCALE_ALPHA && aDest) {
                YSCALEYUV2YV12X(   "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
            }

            YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        }
        return;
    }
#endif
#if COMPILE_TEMPLATE_ALTIVEC
    yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
                          chrFilter, chrSrc, chrFilterSize,
                          dest, uDest, vDest, dstW, chrDstW);
#else //COMPILE_TEMPLATE_ALTIVEC
    yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
                chrFilter, chrSrc, chrFilterSize,
                alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW);
#endif //!COMPILE_TEMPLATE_ALTIVEC
}
static inline void RENAME(yuv2nv12X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                     const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
                                     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, enum PixelFormat dstFormat)
{
    yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
                 chrFilter, chrSrc, chrFilterSize,
                 dest, uDest, dstW, chrDstW, dstFormat);
}
static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
{
    int i;
#if COMPILE_TEMPLATE_MMX
    if(!(c->flags & SWS_BITEXACT)) {
        long p= 4;
        uint8_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
        uint8_t *dst[4]= {aDest, dest, uDest, vDest};
        x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW};

        if (c->flags & SWS_ACCURATE_RND) {
            while(p--) {
                if (dst[p]) {
                    __asm__ volatile(
                        YSCALEYUV2YV121_ACCURATE
                        :: "r" (src[p]), "r" (dst[p] + counter[p]),
                           "g" (-counter[p])
                        : "%"REG_a
                    );
                }
            }
        } else {
            while(p--) {
                if (dst[p]) {
                    __asm__ volatile(
                        YSCALEYUV2YV121
                        :: "r" (src[p]), "r" (dst[p] + counter[p]),
                           "g" (-counter[p])
                        : "%"REG_a
                    );
                }
            }
        }
        return;
    }
#endif
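    /* C fallback: (x+64)>>7 maps the fixed-point intermediates to 8 bit; for
     * the value range produced here the (val&256) test catches both overflow
     * and underflow in a single branch, since only out-of-range results set
     * bit 8. */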
    for (i=0; i<dstW; i++) {
        int val= (lumSrc[i]+64)>>7;

        if (val&256) {
            if (val<0) val=0;
            else       val=255;
        }

        dest[i]= val;
    }

    if (uDest)
        for (i=0; i<chrDstW; i++) {
            int u=(chrSrc[i       ]+64)>>7;
            int v=(chrSrc[i + VOFW]+64)>>7;

            if ((u|v)&256) {
                if (u<0)        u=0;
                else if (u>255) u=255;
                if (v<0)        v=0;
                else if (v>255) v=255;
            }

            uDest[i]= u;
            vDest[i]= v;
        }

    if (CONFIG_SWSCALE_ALPHA && aDest)
        for (i=0; i<dstW; i++) {
            int val= (alpSrc[i]+64)>>7;
            aDest[i]= av_clip_uint8(val);
        }
}
/**
 * vertical scale YV12 to RGB
 */
static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                       const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
                                       const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
{
#if COMPILE_TEMPLATE_MMX
    x86_reg dummy=0;
    if(!(c->flags & SWS_BITEXACT)) {
        if (c->flags & SWS_ACCURATE_RND) {
            switch(c->dstFormat) {
            case PIX_FMT_RGB32:
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                    YSCALEYUV2PACKEDX_ACCURATE
                    YSCALEYUV2RGBX
                    "movq %%mm2, "U_TEMP"(%0) \n\t"
                    "movq %%mm4, "V_TEMP"(%0) \n\t"
                    "movq %%mm5, "Y_TEMP"(%0) \n\t"
                    YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
                    "movq "Y_TEMP"(%0), %%mm5 \n\t"
                    "psraw $3, %%mm1 \n\t"
                    "psraw $3, %%mm7 \n\t"
                    "packuswb %%mm7, %%mm1 \n\t"
                    WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
                    YSCALEYUV2PACKEDX_END
                } else {
                    YSCALEYUV2PACKEDX_ACCURATE
                    YSCALEYUV2RGBX
                    "pcmpeqd %%mm7, %%mm7 \n\t"
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                    YSCALEYUV2PACKEDX_END
                }
                return;
            case PIX_FMT_BGR24:
                YSCALEYUV2PACKEDX_ACCURATE
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
                "add %4, %%"REG_c" \n\t"
                WRITEBGR24(%%REGc, %5, %%REGa)

                :: "r" (&c->redDither),
                   "m" (dummy), "m" (dummy), "m" (dummy),
                   "r" (dest), "m" (dstW)
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
                );
                return;
            case PIX_FMT_RGB555:
                YSCALEYUV2PACKEDX_ACCURATE
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
                "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
                "paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif

                WRITERGB15(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_RGB565:
                YSCALEYUV2PACKEDX_ACCURATE
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
                "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
                "paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif

                WRITERGB16(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_YUYV422:
                YSCALEYUV2PACKEDX_ACCURATE
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

                "psraw $3, %%mm3 \n\t"
                "psraw $3, %%mm4 \n\t"
                "psraw $3, %%mm1 \n\t"
                "psraw $3, %%mm7 \n\t"
                WRITEYUY2(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            }
        } else {
            switch(c->dstFormat) {
            case PIX_FMT_RGB32:
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                    YSCALEYUV2PACKEDX
                    YSCALEYUV2RGBX
                    YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
                    "psraw $3, %%mm1 \n\t"
                    "psraw $3, %%mm7 \n\t"
                    "packuswb %%mm7, %%mm1 \n\t"
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
                    YSCALEYUV2PACKEDX_END
                } else {
                    YSCALEYUV2PACKEDX
                    YSCALEYUV2RGBX
                    "pcmpeqd %%mm7, %%mm7 \n\t"
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                    YSCALEYUV2PACKEDX_END
                }
                return;
            case PIX_FMT_BGR24:
                YSCALEYUV2PACKEDX
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
                "add %4, %%"REG_c" \n\t"
                WRITEBGR24(%%REGc, %5, %%REGa)

                :: "r" (&c->redDither),
                   "m" (dummy), "m" (dummy), "m" (dummy),
                   "r" (dest), "m" (dstW)
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
                );
                return;
            case PIX_FMT_RGB555:
                YSCALEYUV2PACKEDX
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
                "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
                "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif

                WRITERGB15(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_RGB565:
                YSCALEYUV2PACKEDX
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
                "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
                "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif

                WRITERGB16(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_YUYV422:
                YSCALEYUV2PACKEDX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

                "psraw $3, %%mm3 \n\t"
                "psraw $3, %%mm4 \n\t"
                "psraw $3, %%mm1 \n\t"
                "psraw $3, %%mm7 \n\t"
                WRITEYUY2(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            }
        }
    }
#endif /* COMPILE_TEMPLATE_MMX */
#if COMPILE_TEMPLATE_ALTIVEC
    /* The following list of supported dstFormat values should
       match what's found in the body of ff_yuv2packedX_altivec() */
    if (!(c->flags & SWS_BITEXACT) && !c->alpPixBuf &&
        (c->dstFormat==PIX_FMT_ABGR  || c->dstFormat==PIX_FMT_BGRA  ||
         c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
         c->dstFormat==PIX_FMT_RGBA  || c->dstFormat==PIX_FMT_ARGB))
            ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize,
                                   chrFilter, chrSrc, chrFilterSize,
                                   dest, dstW, dstY);
    else
#endif
        yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
                       chrFilter, chrSrc, chrFilterSize,
                       alpSrc, dest, dstW, dstY);
}
/**
 * vertical bilinear scale YV12 to RGB
 */
static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
                                       const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
{
    int  yalpha1=4095- yalpha;
    int uvalpha1=4095-uvalpha;
    int i;

#if COMPILE_TEMPLATE_MMX
    if(!(c->flags & SWS_BITEXACT)) {
        switch(c->dstFormat) {
        //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
        case PIX_FMT_RGB32:
            if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
#if ARCH_X86_64
                __asm__ volatile(
                    YSCALEYUV2RGB(%%r8, %5)
                    YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
                    "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
                    "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
                    "packuswb %%mm7, %%mm1 \n\t"
                    WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "r" (dest),
                       "a" (&c->redDither)
                      ,"r" (abuf0), "r" (abuf1)
                    : "%r8"
                );
#else
                *(uint16_t **)(&c->u_temp)=abuf0;
                *(uint16_t **)(&c->v_temp)=abuf1;
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB(%%REGBP, %5)
                    "push %0 \n\t"
                    "push %1 \n\t"
                    "mov "U_TEMP"(%5), %0 \n\t"
                    "mov "V_TEMP"(%5), %1 \n\t"
                    YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
                    "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
                    "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
                    "packuswb %%mm7, %%mm1 \n\t"
                    "pop %1 \n\t"
                    "pop %0 \n\t"
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
#endif
            } else {
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB(%%REGBP, %5)
                    "pcmpeqd %%mm7, %%mm7 \n\t"
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
            }
            return;
        case PIX_FMT_BGR24:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB(%%REGBP, %5)
                "pxor %%mm7, %%mm7 \n\t"
                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
            return;
        case PIX_FMT_RGB555:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB(%%REGBP, %5)
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif

                WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
            return;
        case PIX_FMT_RGB565:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB(%%REGBP, %5)
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif

                WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
            return;
        case PIX_FMT_YUYV422:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2PACKED(%%REGBP, %5)
                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
            return;
        }
    }
#endif //COMPILE_TEMPLATE_MMX
    YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
}
/**
 * YV12 to RGB without scaling or interpolating
 */
static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
                                       const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, enum PixelFormat dstFormat, int flags, int y)
{
    const int yalpha1=0;
    int i;

    const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
    const int yalpha= 4096; //FIXME ...

    if (flags&SWS_FULL_CHR_H_INT) {
        c->yuv2packed2(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
        return;
    }

#if COMPILE_TEMPLATE_MMX
    if(!(flags & SWS_BITEXACT)) {
        if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
            switch(dstFormat) {
            case PIX_FMT_RGB32:
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                    __asm__ volatile(
                        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                        "mov %4, %%"REG_b" \n\t"
                        "push %%"REG_BP" \n\t"
                        YSCALEYUV2RGB1(%%REGBP, %5)
                        YSCALEYUV2RGB1_ALPHA(%%REGBP)
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                        "pop %%"REG_BP" \n\t"
                        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                        :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                           "a" (&c->redDither)
                    );
                } else {
                    __asm__ volatile(
                        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                        "mov %4, %%"REG_b" \n\t"
                        "push %%"REG_BP" \n\t"
                        YSCALEYUV2RGB1(%%REGBP, %5)
                        "pcmpeqd %%mm7, %%mm7 \n\t"
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                        "pop %%"REG_BP" \n\t"
                        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                           "a" (&c->redDither)
                    );
                }
                return;
            case PIX_FMT_BGR24:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1(%%REGBP, %5)
                    "pxor %%mm7, %%mm7 \n\t"
                    WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB555:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1(%%REGBP, %5)
                    "pxor %%mm7, %%mm7 \n\t"
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
                    WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB565:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1(%%REGBP, %5)
                    "pxor %%mm7, %%mm7 \n\t"
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif

                    WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            case PIX_FMT_YUYV422:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2PACKED1(%%REGBP, %5)
                    WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            }
        } else {
            switch(dstFormat) {
            case PIX_FMT_RGB32:
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                    __asm__ volatile(
                        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                        "mov %4, %%"REG_b" \n\t"
                        "push %%"REG_BP" \n\t"
                        YSCALEYUV2RGB1b(%%REGBP, %5)
                        YSCALEYUV2RGB1_ALPHA(%%REGBP)
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                        "pop %%"REG_BP" \n\t"
                        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                        :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                           "a" (&c->redDither)
                    );
                } else {
                    __asm__ volatile(
                        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                        "mov %4, %%"REG_b" \n\t"
                        "push %%"REG_BP" \n\t"
                        YSCALEYUV2RGB1b(%%REGBP, %5)
                        "pcmpeqd %%mm7, %%mm7 \n\t"
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                        "pop %%"REG_BP" \n\t"
                        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                           "a" (&c->redDither)
                    );
                }
                return;
            case PIX_FMT_BGR24:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1b(%%REGBP, %5)
                    "pxor %%mm7, %%mm7 \n\t"
                    WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB555:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1b(%%REGBP, %5)
                    "pxor %%mm7, %%mm7 \n\t"
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
                    WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB565:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1b(%%REGBP, %5)
                    "pxor %%mm7, %%mm7 \n\t"
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif

                    WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            case PIX_FMT_YUYV422:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2PACKED1b(%%REGBP, %5)
                    WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            }
        }
    }
#endif /* COMPILE_TEMPLATE_MMX */
    if (uvalpha < 2048) {
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
    } else {
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
    }
}
//FIXME yuy2* can read up to 7 samples too much

static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm2 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "pand %%mm2, %%mm0 \n\t"
        "pand %%mm2, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
        dst[i]= src[2*i];
#endif
}
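/* The packed-YUV loaders below all follow the same pattern: load 8 samples,
 * isolate the byte of interest with pand (mask bm01010101 = 0x00FF in every
 * word) or psrlw $8, then packuswb the results down to planar bytes. */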
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",4), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "psrlw $8, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        "movd %%mm0, (%3, %%"REG_a") \n\t"
        "movd %%mm1, (%2, %%"REG_a") \n\t"
        "add $4, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++) {
        dstU[i]= src1[4*i + 1];
        dstV[i]= src1[4*i + 3];
    }
#endif
    assert(src1 == src2);
}
static inline void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "movq (%2, %%"REG_a",2), %%mm2 \n\t"
        "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "psrlw $8, %%mm1 \n\t"
        "psrlw $8, %%mm2 \n\t"
        "psrlw $8, %%mm3 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "movq %%mm0, (%3, %%"REG_a") \n\t"
        "movq %%mm2, (%4, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++) {
        dstU[i]= src1[2*i + 1];
        dstV[i]= src2[2*i + 1];
    }
#endif
}
/* This is almost identical to the previous, and exists only because
 * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
static inline void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "psrlw $8, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
        dst[i]= src[2*i+1];
#endif
}
static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",4), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
        "pand %%mm4, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        "movd %%mm0, (%3, %%"REG_a") \n\t"
        "movd %%mm1, (%2, %%"REG_a") \n\t"
        "add $4, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++) {
        dstU[i]= src1[4*i + 0];
        dstV[i]= src1[4*i + 2];
    }
#endif
    assert(src1 == src2);
}
static inline void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "movq (%2, %%"REG_a",2), %%mm2 \n\t"
        "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
        "pand %%mm4, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "pand %%mm4, %%mm2 \n\t"
        "pand %%mm4, %%mm3 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "movq %%mm0, (%3, %%"REG_a") \n\t"
        "movq %%mm2, (%4, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++) {
        dstU[i]= src1[2*i];
        dstV[i]= src2[2*i];
    }
#endif
}
static inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
                                    const uint8_t *src, long width)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm1, %%mm3 \n\t"
        "pand %%mm4, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "psrlw $8, %%mm2 \n\t"
        "psrlw $8, %%mm3 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "movq %%mm2, (%3, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width)
        : "%"REG_a
    );
#else
    int i;
    for (i = 0; i < width; i++) {
        dst1[i] = src[2*i+0];
        dst2[i] = src[2*i+1];
    }
#endif
}
static inline void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
                                    const uint8_t *src1, const uint8_t *src2,
                                    long width, uint32_t *unused)
{
    RENAME(nvXXtoUV)(dstU, dstV, src1, width);
}

static inline void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
                                    const uint8_t *src1, const uint8_t *src2,
                                    long width, uint32_t *unused)
{
    RENAME(nvXXtoUV)(dstV, dstU, src1, width);
}
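/* NV12 and NV21 differ only in the order of the interleaved chroma bytes,
 * so both wrappers share nvXXtoUV and just swap the destination planes. */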
#if COMPILE_TEMPLATE_MMX
static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src, long width, enum PixelFormat srcFormat)
{
    if(srcFormat == PIX_FMT_BGR24) {
        __asm__ volatile(
            "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5   \n\t"
            "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6   \n\t"
            :
        );
    } else {
        __asm__ volatile(
            "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5   \n\t"
            "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6   \n\t"
            :
        );
    }
    __asm__ volatile(
        "movq "MANGLE(ff_bgr24toYOffset)", %%mm4   \n\t"
        "mov %2, %%"REG_a"                  \n\t"
        "pxor %%mm7, %%mm7                  \n\t"
        "1:                                 \n\t"
        PREFETCH" 64(%0)                    \n\t"
        "movd (%0), %%mm0                   \n\t"
        "movd 2(%0), %%mm1                  \n\t"
        "movd 6(%0), %%mm2                  \n\t"
        "movd 8(%0), %%mm3                  \n\t"
        "add $12, %0                        \n\t"
        "punpcklbw %%mm7, %%mm0             \n\t"
        "punpcklbw %%mm7, %%mm1             \n\t"
        "punpcklbw %%mm7, %%mm2             \n\t"
        "punpcklbw %%mm7, %%mm3             \n\t"
        "pmaddwd %%mm5, %%mm0               \n\t"
        "pmaddwd %%mm6, %%mm1               \n\t"
        "pmaddwd %%mm5, %%mm2               \n\t"
        "pmaddwd %%mm6, %%mm3               \n\t"
        "paddd %%mm1, %%mm0                 \n\t"
        "paddd %%mm3, %%mm2                 \n\t"
        "paddd %%mm4, %%mm0                 \n\t"
        "paddd %%mm4, %%mm2                 \n\t"
        "psrad $15, %%mm0                   \n\t"
        "psrad $15, %%mm2                   \n\t"
        "packssdw %%mm2, %%mm0              \n\t"
        "packuswb %%mm0, %%mm0              \n\t"
        "movd %%mm0, (%1, %%"REG_a")        \n\t"
        "add $4, %%"REG_a"                  \n\t"
        " js 1b                             \n\t"
        : "+r" (src)
        : "r" (dst+width), "g" ((x86_reg)-width)
    );
}
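/* Four luma samples are produced per iteration from 12 input bytes: the two
 * overlapping movd loads per pixel pair, multiplied by the split coefficient
 * quadwords in mm5/mm6 with pmaddwd and summed with paddd, yield the full
 * R/G/B dot product per pixel, and "psrad $15" removes the RGB2YUV_SHIFT
 * fixed-point scale after the rounding offset in mm4 is added. */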
static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, long width, enum PixelFormat srcFormat)
{
    __asm__ volatile(
        "movq 24+%4, %%mm6                  \n\t"
        "mov %3, %%"REG_a"                  \n\t"
        "pxor %%mm7, %%mm7                  \n\t"
        "1:                                 \n\t"
        PREFETCH" 64(%0)                    \n\t"
        "movd (%0), %%mm0                   \n\t"
        "movd 2(%0), %%mm1                  \n\t"
        "punpcklbw %%mm7, %%mm0             \n\t"
        "punpcklbw %%mm7, %%mm1             \n\t"
        "movq %%mm0, %%mm2                  \n\t"
        "movq %%mm1, %%mm3                  \n\t"
        "pmaddwd %4, %%mm0                  \n\t"
        "pmaddwd 8+%4, %%mm1                \n\t"
        "pmaddwd 16+%4, %%mm2               \n\t"
        "pmaddwd %%mm6, %%mm3               \n\t"
        "paddd %%mm1, %%mm0                 \n\t"
        "paddd %%mm3, %%mm2                 \n\t"

        "movd 6(%0), %%mm1                  \n\t"
        "movd 8(%0), %%mm3                  \n\t"
        "add $12, %0                        \n\t"
        "punpcklbw %%mm7, %%mm1             \n\t"
        "punpcklbw %%mm7, %%mm3             \n\t"
        "movq %%mm1, %%mm4                  \n\t"
        "movq %%mm3, %%mm5                  \n\t"
        "pmaddwd %4, %%mm1                  \n\t"
        "pmaddwd 8+%4, %%mm3                \n\t"
        "pmaddwd 16+%4, %%mm4               \n\t"
        "pmaddwd %%mm6, %%mm5               \n\t"
        "paddd %%mm3, %%mm1                 \n\t"
        "paddd %%mm5, %%mm4                 \n\t"

        "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3   \n\t"
        "paddd %%mm3, %%mm0                 \n\t"
        "paddd %%mm3, %%mm2                 \n\t"
        "paddd %%mm3, %%mm1                 \n\t"
        "paddd %%mm3, %%mm4                 \n\t"
        "psrad $15, %%mm0                   \n\t"
        "psrad $15, %%mm2                   \n\t"
        "psrad $15, %%mm1                   \n\t"
        "psrad $15, %%mm4                   \n\t"
        "packssdw %%mm1, %%mm0              \n\t"
        "packssdw %%mm4, %%mm2              \n\t"
        "packuswb %%mm0, %%mm0              \n\t"
        "packuswb %%mm2, %%mm2              \n\t"
        "movd %%mm0, (%1, %%"REG_a")        \n\t"
        "movd %%mm2, (%2, %%"REG_a")        \n\t"
        "add $4, %%"REG_a"                  \n\t"
        " js 1b                             \n\t"
        : "+r" (src)
        : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
    );
}
#endif
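/* The same routine handles both BGR24 and RGB24: the %4 memory operand
 * selects a coefficient block from ff_bgr24toUV indexed by
 * (srcFormat == PIX_FMT_RGB24), which just swaps the R and B weights. */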
static inline void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
#else
    int i;
    for (i=0; i<width; i++) {
        int b= src[3*i + 0];
        int g= src[3*i + 1];
        int r= src[3*i + 2];

        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
    }
#endif /* COMPILE_TEMPLATE_MMX */
}
static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
#else
    int i;
    for (i=0; i<width; i++) {
        int b= src1[3*i + 0];
        int g= src1[3*i + 1];
        int r= src1[3*i + 2];

        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
    }
#endif /* COMPILE_TEMPLATE_MMX */
    assert(src1 == src2);
}
static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
    int i;
    for (i=0; i<width; i++) {
        int b= src1[6*i + 0] + src1[6*i + 3];
        int g= src1[6*i + 1] + src1[6*i + 4];
        int r= src1[6*i + 2] + src1[6*i + 5];

        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
    }
    assert(src1 == src2);
}
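/* In the plain UV converters the bias term 257<<(RGB2YUV_SHIFT-1) equals
 * (128 + 0.5)<<RGB2YUV_SHIFT, i.e. the +128 chroma offset with round-to-
 * nearest folded in. The _half variants sum two horizontally adjacent
 * pixels, so the same bias becomes 257<<RGB2YUV_SHIFT and the final shift
 * is one bit larger, averaging the pair in the same step. */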
static inline void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
#else
    int i;
    for (i=0; i<width; i++) {
        int r= src[3*i + 0];
        int g= src[3*i + 1];
        int b= src[3*i + 2];

        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
    }
#endif
}
static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    assert(src1==src2);
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
#else
    int i;
    assert(src1==src2);
    for (i=0; i<width; i++) {
        int r= src1[3*i + 0];
        int g= src1[3*i + 1];
        int b= src1[3*i + 2];

        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
    }
#endif
}
static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
    int i;
    assert(src1==src2);
    for (i=0; i<width; i++) {
        int r= src1[6*i + 0] + src1[6*i + 3];
        int g= src1[6*i + 1] + src1[6*i + 4];
        int b= src1[6*i + 2] + src1[6*i + 5];

        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
    }
}
// bilinear / bicubic scaling
static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc,
                                  const int16_t *filter, const int16_t *filterPos, long filterSize)
{
#if COMPILE_TEMPLATE_MMX
    assert(filterSize % 4 == 0 && filterSize>0);
    if (filterSize==4) { // Always true for upscaling, sometimes for downscaling, too.
        x86_reg counter= -2*dstW;
        filter-= counter*2;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
#if defined(PIC)
            "push %%"REG_b"                 \n\t"
#endif
            "pxor %%mm7, %%mm7              \n\t"
            "push %%"REG_BP"                \n\t" // we use 7 regs here ...
            "mov %%"REG_a", %%"REG_BP"      \n\t"
            ASMALIGN(4)
            "1:                             \n\t"
            "movzwl (%2, %%"REG_BP"), %%eax \n\t"
            "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
            "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
            "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
            "movd (%3, %%"REG_a"), %%mm0    \n\t"
            "movd (%3, %%"REG_b"), %%mm2    \n\t"
            "punpcklbw %%mm7, %%mm0         \n\t"
            "punpcklbw %%mm7, %%mm2         \n\t"
            "pmaddwd %%mm1, %%mm0           \n\t"
            "pmaddwd %%mm2, %%mm3           \n\t"
            "movq %%mm0, %%mm4              \n\t"
            "punpckldq %%mm3, %%mm0         \n\t"
            "punpckhdq %%mm3, %%mm4         \n\t"
            "paddd %%mm4, %%mm0             \n\t"
            "psrad $7, %%mm0                \n\t"
            "packssdw %%mm0, %%mm0          \n\t"
            "movd %%mm0, (%4, %%"REG_BP")   \n\t"
            "add $4, %%"REG_BP"             \n\t"
            " jnc 1b                        \n\t"

            "pop %%"REG_BP"                 \n\t"
#if defined(PIC)
            "pop %%"REG_b"                  \n\t"
#endif
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
            : "%"REG_b
#endif
        );
    } else if (filterSize==8) {
        x86_reg counter= -2*dstW;
        filter-= counter*4;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
#if defined(PIC)
            "push %%"REG_b"                 \n\t"
#endif
            "pxor %%mm7, %%mm7              \n\t"
            "push %%"REG_BP"                \n\t" // we use 7 regs here ...
            "mov %%"REG_a", %%"REG_BP"      \n\t"
            ASMALIGN(4)
            "1:                             \n\t"
            "movzwl (%2, %%"REG_BP"), %%eax \n\t"
            "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
            "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
            "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
            "movd (%3, %%"REG_a"), %%mm0    \n\t"
            "movd (%3, %%"REG_b"), %%mm2    \n\t"
            "punpcklbw %%mm7, %%mm0         \n\t"
            "punpcklbw %%mm7, %%mm2         \n\t"
            "pmaddwd %%mm1, %%mm0           \n\t"
            "pmaddwd %%mm2, %%mm3           \n\t"

            "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
            "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
            "movd 4(%3, %%"REG_a"), %%mm4   \n\t"
            "movd 4(%3, %%"REG_b"), %%mm2   \n\t"
            "punpcklbw %%mm7, %%mm4         \n\t"
            "punpcklbw %%mm7, %%mm2         \n\t"
            "pmaddwd %%mm1, %%mm4           \n\t"
            "pmaddwd %%mm2, %%mm5           \n\t"
            "paddd %%mm4, %%mm0             \n\t"
            "paddd %%mm5, %%mm3             \n\t"
            "movq %%mm0, %%mm4              \n\t"
            "punpckldq %%mm3, %%mm0         \n\t"
            "punpckhdq %%mm3, %%mm4         \n\t"
            "paddd %%mm4, %%mm0             \n\t"
            "psrad $7, %%mm0                \n\t"
            "packssdw %%mm0, %%mm0          \n\t"
            "movd %%mm0, (%4, %%"REG_BP")   \n\t"
            "add $4, %%"REG_BP"             \n\t"
            " jnc 1b                        \n\t"

            "pop %%"REG_BP"                 \n\t"
#if defined(PIC)
            "pop %%"REG_b"                  \n\t"
#endif
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
            : "%"REG_b
#endif
        );
    } else {
        uint8_t *offset = src+filterSize;
        x86_reg counter= -2*dstW;
        //filter-= counter*filterSize/2;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
            "pxor %%mm7, %%mm7              \n\t"
            ASMALIGN(4)
            "1:                             \n\t"
            "mov %2, %%"REG_c"              \n\t"
            "movzwl (%%"REG_c", %0), %%eax  \n\t"
            "movzwl 2(%%"REG_c", %0), %%edx \n\t"
            "mov %5, %%"REG_c"              \n\t"
            "pxor %%mm4, %%mm4              \n\t"
            "pxor %%mm5, %%mm5              \n\t"
            "2:                             \n\t"
            "movq (%1), %%mm1               \n\t"
            "movq (%1, %6), %%mm3           \n\t"
            "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
            "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm0         \n\t"
            "punpcklbw %%mm7, %%mm2         \n\t"
            "pmaddwd %%mm1, %%mm0           \n\t"
            "pmaddwd %%mm2, %%mm3           \n\t"
            "paddd %%mm3, %%mm5             \n\t"
            "paddd %%mm0, %%mm4             \n\t"
            "add $8, %1                     \n\t"
            "add $4, %%"REG_c"              \n\t"
            "cmp %4, %%"REG_c"              \n\t"
            " jb 2b                         \n\t"
            "add %6, %1                     \n\t"
            "movq %%mm4, %%mm0              \n\t"
            "punpckldq %%mm5, %%mm4         \n\t"
            "punpckhdq %%mm5, %%mm0         \n\t"
            "paddd %%mm0, %%mm4             \n\t"
            "psrad $7, %%mm4                \n\t"
            "packssdw %%mm4, %%mm4          \n\t"
            "mov %3, %%"REG_a"              \n\t"
            "movd %%mm4, (%%"REG_a", %0)    \n\t"
            "add $4, %0                     \n\t"
            " jnc 1b                        \n\t"

            : "+r" (counter), "+r" (filter)
            : "m" (filterPos), "m" (dst), "m"(offset),
              "m" (src), "r" ((x86_reg)filterSize*2)
            : "%"REG_a, "%"REG_c, "%"REG_d
        );
    }
#else
#if COMPILE_TEMPLATE_ALTIVEC
    hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
#else
    int i;
    for (i=0; i<dstW; i++) {
        int j;
        int srcPos= filterPos[i];
        int val=0;
        //printf("filterPos: %d\n", filterPos[i]);
        for (j=0; j<filterSize; j++) {
            //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
            val += ((int)src[srcPos + j])*filter[filterSize*i + j];
        }
        //filter += hFilterSize;
        dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
    }
#endif /* COMPILE_TEMPLATE_ALTIVEC */
#endif /* COMPILE_TEMPLATE_MMX */
}
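/* hScale produces the 15-bit intermediate format consumed by the vertical
 * scalers: 8-bit samples times the fixed-point filter taps, shifted right by
 * 7 and clipped to (1<<15)-1, i.e. roughly the source value with 7 extra
 * fractional bits of precision. */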
//FIXME all pal and rgb srcFormats could do this conversion as well
//FIXME all scalers more complex than bilinear could do half of this transform
static void RENAME(chrRangeToJpeg)(uint16_t *dst, int width)
{
    int i;
    for (i = 0; i < width; i++) {
        dst[i     ] = (FFMIN(dst[i     ],30775)*4663 - 9289992)>>12; //-264
        dst[i+VOFW] = (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
    }
}
static void RENAME(chrRangeFromJpeg)(uint16_t *dst, int width)
{
    int i;
    for (i = 0; i < width; i++) {
        dst[i     ] = (dst[i     ]*1799 + 4081085)>>11; //1469
        dst[i+VOFW] = (dst[i+VOFW]*1799 + 4081085)>>11; //1469
    }
}
static void RENAME(lumRangeToJpeg)(uint16_t *dst, int width)
{
    int i;
    for (i = 0; i < width; i++)
        dst[i] = (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
}
static void RENAME(lumRangeFromJpeg)(uint16_t *dst, int width)
{
    int i;
    for (i = 0; i < width; i++)
        dst[i] = (dst[i]*14071 + 33561947)>>14;
}
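/* These operate on the 15-bit intermediate samples (8-bit values with 7
 * fractional bits). lumRangeToJpeg, for instance, is (v - (16<<7))*255/219
 * in that scale: 19077/2^14 ~= 255/219 and 39057361/2^14 ~= (16<<7)*255/219,
 * with the FFMIN clamp keeping the 32-bit product from overflowing. The
 * chroma variants use 255/224 and rebias around 128<<7 the same way. */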
#define FAST_BILINEAR_X86 \
    "subl  %%edi, %%esi      \n\t" /*  src[xx+1] - src[xx] */                  \
    "imull %%ecx, %%esi      \n\t" /* (src[xx+1] - src[xx])*xalpha */          \
    "shll    $16, %%edi      \n\t"                                             \
    "addl  %%edi, %%esi      \n\t" /* src[xx+1]*xalpha + src[xx]*(1-xalpha) */ \
    "mov      %1, %%"REG_D"  \n\t"                                             \
    "shrl     $9, %%esi      \n\t"                                             \

static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
                                        long dstWidth, const uint8_t *src, int srcW,
                                        int xInc)
{
#if ARCH_X86 && CONFIG_GPL
#if COMPILE_TEMPLATE_MMX2
    int32_t *mmx2FilterPos = c->lumMmx2FilterPos;
    int16_t *mmx2Filter    = c->lumMmx2Filter;
    int     canMMX2BeUsed  = c->canMMX2BeUsed;
    void    *mmx2FilterCode= c->lumMmx2FilterCode;
    int i;
#if defined(PIC)
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif

    if (canMMX2BeUsed) {
        __asm__ volatile(
#if defined(PIC)
            "mov %%"REG_b", %5          \n\t"
#endif
            "pxor %%mm7, %%mm7          \n\t"
            "mov %0, %%"REG_c"          \n\t"
            "mov %1, %%"REG_D"          \n\t"
            "mov %2, %%"REG_d"          \n\t"
            "mov %3, %%"REG_b"          \n\t"
            "xor %%"REG_a", %%"REG_a"   \n\t" // i
            PREFETCH" (%%"REG_c")       \n\t"
            PREFETCH" 32(%%"REG_c")     \n\t"
            PREFETCH" 64(%%"REG_c")     \n\t"

#if ARCH_X86_64

#define CALL_MMX2_FILTER_CODE \
            "movl (%%"REG_b"), %%esi                \n\t"\
            "call *%4                               \n\t"\
            "movl (%%"REG_b", %%"REG_a"), %%esi     \n\t"\
            "add %%"REG_S", %%"REG_c"               \n\t"\
            "add %%"REG_a", %%"REG_D"               \n\t"\
            "xor %%"REG_a", %%"REG_a"               \n\t"\

#else

#define CALL_MMX2_FILTER_CODE \
            "movl (%%"REG_b"), %%esi                \n\t"\
            "call *%4                               \n\t"\
            "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
            "add %%"REG_a", %%"REG_D"               \n\t"\
            "xor %%"REG_a", %%"REG_a"               \n\t"\

#endif /* ARCH_X86_64 */

            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE

#if defined(PIC)
            "mov %5, %%"REG_b"          \n\t"
#endif
            :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
               "m" (mmx2FilterCode)
#if defined(PIC)
              ,"m" (ebxsave)
#endif
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
             ,"%"REG_b
#endif
        );
        for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
    } else {
#endif /* COMPILE_TEMPLATE_MMX2 */
    x86_reg xInc_shr16 = xInc >> 16;
    uint16_t xInc_mask = xInc & 0xffff;
    //NO MMX just normal asm ...
    __asm__ volatile(
        "xor %%"REG_a", %%"REG_a"             \n\t" // i
        "xor %%"REG_d", %%"REG_d"             \n\t" // xx
        "xorl %%ecx, %%ecx                    \n\t" // xalpha
        ASMALIGN(4)
        "1:                                   \n\t"
        "movzbl (%0, %%"REG_d"), %%edi        \n\t" //src[xx]
        "movzbl 1(%0, %%"REG_d"), %%esi       \n\t" //src[xx+1]
        FAST_BILINEAR_X86
        "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
        "addw %4, %%cx                        \n\t" //xalpha += xInc&0xFFFF
        "adc %3, %%"REG_d"                    \n\t" //xx+= xInc>>16 + carry

        "movzbl (%0, %%"REG_d"), %%edi        \n\t" //src[xx]
        "movzbl 1(%0, %%"REG_d"), %%esi       \n\t" //src[xx+1]
        FAST_BILINEAR_X86
        "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t"
        "addw %4, %%cx                        \n\t" //xalpha += xInc&0xFFFF
        "adc %3, %%"REG_d"                    \n\t" //xx+= xInc>>16 + carry

        "add $2, %%"REG_a"                    \n\t"
        "cmp %2, %%"REG_a"                    \n\t"
        " jb 1b                               \n\t"

        :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
        : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
    );
#if COMPILE_TEMPLATE_MMX2
    } //if MMX2 can't be used
#endif
#else
    int i;
    unsigned int xpos=0;
    for (i=0;i<dstWidth;i++) {
        register unsigned int xx=xpos>>16;
        register unsigned int xalpha=(xpos&0xFFFF)>>9;
        dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
        xpos+=xInc;
    }
#endif /* ARCH_X86 */
}
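/* The canMMX2BeUsed path does not loop in the usual sense: lumMmx2FilterCode
 * points at scaler code generated at init time for this exact dstW/xInc
 * combination, and each CALL_MMX2_FILTER_CODE "call"s into the next chunk of
 * it, with REG_b walking the per-chunk filterPos table. */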
// *** horizontal scale Y line to temp buffer
static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src, int srcW, int xInc,
                                   const int16_t *hLumFilter,
                                   const int16_t *hLumFilterPos, int hLumFilterSize,
                                   uint8_t *formatConvBuffer,
                                   uint32_t *pal, int isAlpha)
{
    void (*toYV12)(uint8_t *, const uint8_t *, long, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
    void (*convertRange)(uint16_t *, int) = isAlpha ? NULL : c->lumConvertRange;

    src += isAlpha ? c->alpSrcOffset : c->lumSrcOffset;

    if (toYV12) {
        toYV12(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }

    if (!c->hyscale_fast) {
        c->hScale(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
    } else { // fast bilinear upscale / crap downscale
        c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
    }

    if (convertRange)
        convertRange(dst, dstWidth);
}
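/* So each luma input line goes through up to three stages before the
 * vertical scaler ever sees it: packed->planar conversion into
 * formatConvBuffer, horizontal scaling (filtered or fast bilinear), and the
 * optional full/limited range conversion on the scaled samples. */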
static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
                                        long dstWidth, const uint8_t *src1,
                                        const uint8_t *src2, int srcW, int xInc)
{
#if ARCH_X86 && CONFIG_GPL
#if COMPILE_TEMPLATE_MMX2
    int32_t *mmx2FilterPos = c->chrMmx2FilterPos;
    int16_t *mmx2Filter    = c->chrMmx2Filter;
    int     canMMX2BeUsed  = c->canMMX2BeUsed;
    void    *mmx2FilterCode= c->chrMmx2FilterCode;
    int i;
#if defined(PIC)
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif

    if (canMMX2BeUsed) {
        __asm__ volatile(
#if defined(PIC)
            "mov %%"REG_b", %6          \n\t"
#endif
            "pxor %%mm7, %%mm7          \n\t"
            "mov %0, %%"REG_c"          \n\t"
            "mov %1, %%"REG_D"          \n\t"
            "mov %2, %%"REG_d"          \n\t"
            "mov %3, %%"REG_b"          \n\t"
            "xor %%"REG_a", %%"REG_a"   \n\t" // i
            PREFETCH" (%%"REG_c")       \n\t"
            PREFETCH" 32(%%"REG_c")     \n\t"
            PREFETCH" 64(%%"REG_c")     \n\t"

            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            "xor %%"REG_a", %%"REG_a"   \n\t" // i
            "mov %5, %%"REG_c"          \n\t" // src
            "mov %1, %%"REG_D"          \n\t" // buf1
            "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t"
            PREFETCH" (%%"REG_c")       \n\t"
            PREFETCH" 32(%%"REG_c")     \n\t"
            PREFETCH" 64(%%"REG_c")     \n\t"

            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE

#if defined(PIC)
            "mov %6, %%"REG_b"          \n\t"
#endif
            :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
               "m" (mmx2FilterCode), "m" (src2)
#if defined(PIC)
              ,"m" (ebxsave)
#endif
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
             ,"%"REG_b
#endif
        );
        for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
            //printf("%d %d %d\n", dstWidth, i, srcW);
            dst[i] = src1[srcW-1]*128;
            dst[i+VOFW] = src2[srcW-1]*128;
        }
    } else {
#endif /* COMPILE_TEMPLATE_MMX2 */
        x86_reg xInc_shr16 = (x86_reg) (xInc >> 16);
        uint16_t xInc_mask = xInc & 0xffff;
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a"              \n\t" // i
            "xor %%"REG_d", %%"REG_d"              \n\t" // xx
            "xorl %%ecx, %%ecx                     \n\t" // xalpha
            ASMALIGN(4)
            "1:                                    \n\t"
            "mov %0, %%"REG_S"                     \n\t"
            "movzbl (%%"REG_S", %%"REG_d"), %%edi  \n\t" //src[xx]
            "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
            FAST_BILINEAR_X86
            "movw %%si, (%%"REG_D", %%"REG_a", 2)  \n\t"

            "movzbl (%5, %%"REG_d"), %%edi         \n\t" //src[xx]
            "movzbl 1(%5, %%"REG_d"), %%esi        \n\t" //src[xx+1]
            FAST_BILINEAR_X86
            "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t"

            "addw %4, %%cx                         \n\t" //xalpha += xInc&0xFFFF
            "adc %3, %%"REG_d"                     \n\t" //xx+= xInc>>16 + carry
            "add $1, %%"REG_a"                     \n\t"
            "cmp %2, %%"REG_a"                     \n\t"
            " jb 1b                                \n\t"

/* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
   which is needed to support GCC 4.0. */
#if ARCH_X86_64 && AV_GCC_VERSION_AT_LEAST(3,4)
            :: "m" (src1), "m" (dst), "g" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
#else
            :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
#endif /* ARCH_X86_64 && AV_GCC_VERSION_AT_LEAST(3,4) */
               "r" (src2)
            : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
        );
#if COMPILE_TEMPLATE_MMX2
    } //if MMX2 can't be used
#endif
#else
    int i;
    unsigned int xpos=0;
    for (i=0;i<dstWidth;i++) {
        register unsigned int xx=xpos>>16;
        register unsigned int xalpha=(xpos&0xFFFF)>>9;
        dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
        dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
        /* slower
        dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
        dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
        */
        xpos+=xInc;
    }
#endif /* ARCH_X86 */
}
inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src1, const uint8_t *src2,
                                   int srcW, int xInc, const int16_t *hChrFilter,
                                   const int16_t *hChrFilterPos, int hChrFilterSize,
                                   uint8_t *formatConvBuffer,
                                   uint32_t *pal)
{
    src1 += c->chrSrcOffset;
    src2 += c->chrSrcOffset;

    if (c->chrToYV12) {
        c->chrToYV12(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }

    if (!c->hcscale_fast) {
        c->hScale(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
        c->hScale(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
    } else { // fast bilinear upscale / crap downscale
        c->hcscale_fast(c, dst, dstWidth, src1, src2, srcW, xInc);
    }

    if (c->chrConvertRange)
        c->chrConvertRange(dst, dstWidth);
}
#define DEBUG_SWSCALE_BUFFERS 0
#define DEBUG_BUFFERS(...) if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__)
static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                           int srcSliceH, uint8_t* dst[], int dstStride[])
{
    /* load a few things into local vars to make the code more readable and faster */
    const int srcW= c->srcW;
    const int dstW= c->dstW;
    const int dstH= c->dstH;
    const int chrDstW= c->chrDstW;
    const int chrSrcW= c->chrSrcW;
    const int lumXInc= c->lumXInc;
    const int chrXInc= c->chrXInc;
    const enum PixelFormat dstFormat= c->dstFormat;
    const int flags= c->flags;
    int16_t *vLumFilterPos= c->vLumFilterPos;
    int16_t *vChrFilterPos= c->vChrFilterPos;
    int16_t *hLumFilterPos= c->hLumFilterPos;
    int16_t *hChrFilterPos= c->hChrFilterPos;
    int16_t *vLumFilter= c->vLumFilter;
    int16_t *vChrFilter= c->vChrFilter;
    int16_t *hLumFilter= c->hLumFilter;
    int16_t *hChrFilter= c->hChrFilter;
    int32_t *lumMmxFilter= c->lumMmxFilter;
    int32_t *chrMmxFilter= c->chrMmxFilter;
    int32_t *alpMmxFilter= c->alpMmxFilter;
    const int vLumFilterSize= c->vLumFilterSize;
    const int vChrFilterSize= c->vChrFilterSize;
    const int hLumFilterSize= c->hLumFilterSize;
    const int hChrFilterSize= c->hChrFilterSize;
    int16_t **lumPixBuf= c->lumPixBuf;
    int16_t **chrPixBuf= c->chrPixBuf;
    int16_t **alpPixBuf= c->alpPixBuf;
    const int vLumBufSize= c->vLumBufSize;
    const int vChrBufSize= c->vChrBufSize;
    uint8_t *formatConvBuffer= c->formatConvBuffer;
    const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
    const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
    int lastDstY;
    uint32_t *pal=c->pal_yuv;

    /* vars which will change and which we need to store back in the context */
    int dstY= c->dstY;
    int lumBufIndex= c->lumBufIndex;
    int chrBufIndex= c->chrBufIndex;
    int lastInLumBuf= c->lastInLumBuf;
    int lastInChrBuf= c->lastInChrBuf;

    if (isPacked(c->srcFormat)) {
        src[0]=
        src[1]=
        src[2]=
        src[3]= src[0];
        srcStride[0]=
        srcStride[1]=
        srcStride[2]=
        srcStride[3]= srcStride[0];
    }
    srcStride[1]<<= c->vChrDrop;
    srcStride[2]<<= c->vChrDrop;

    DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
                  src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
                  dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
    DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
                  srcSliceY, srcSliceH, dstY, dstH);
    DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
                  vLumFilterSize, vLumBufSize, vChrFilterSize, vChrBufSize);

    if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
        static int warnedAlready=0; //FIXME move this into the context perhaps
        if (flags & SWS_PRINT_INFO && !warnedAlready) {
            av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
                   "         ->cannot do aligned memory accesses anymore\n");
            warnedAlready=1;
        }
    }

    /* Note: the user might start scaling in the middle of the picture, so
       this will not get executed. That is not really intended, but it works
       currently, so people might do it. */
    if (srcSliceY ==0) {
        lumBufIndex=-1;
        chrBufIndex=-1;
        dstY=0;
        lastInLumBuf= -1;
        lastInChrBuf= -1;
    }

    lastDstY= dstY;
    for (;dstY < dstH; dstY++) {
        unsigned char *dest =dst[0]+dstStride[0]*dstY;
        const int chrDstY= dstY>>c->chrDstVSubSample;
        unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
        unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
        unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL;

        const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
        const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
        int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
        int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
        int enough_lines;

        //handle holes (FAST_BILINEAR & weird filters)
        if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
        if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
        assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
        assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);

        // Do we have enough lines in this slice to output the dstY line?
        enough_lines = lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);
        if (!enough_lines) {
            lastLumSrcY = srcSliceY + srcSliceH - 1;
            lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
        }
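        /* Note that chrSrcSliceH and the chroma bound above use -((-x)>>shift)
         * instead of x>>shift: negating before the arithmetic shift turns the
         * floor into a ceiling, so slices covering an odd number of luma
         * lines still account for their last chroma line. */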
2672 DEBUG_BUFFERS("dstY: %d\n", dstY
);
2673 DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
2674 firstLumSrcY
, lastLumSrcY
, lastInLumBuf
);
2675 DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
2676 firstChrSrcY
, lastChrSrcY
, lastInChrBuf
);
2678 //Do horizontal scaling
2679 while(lastInLumBuf
< lastLumSrcY
) {
2680 uint8_t *src1
= src
[0]+(lastInLumBuf
+ 1 - srcSliceY
)*srcStride
[0];
2681 uint8_t *src2
= src
[3]+(lastInLumBuf
+ 1 - srcSliceY
)*srcStride
[3];
2683 DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
2684 lumBufIndex
, lastInLumBuf
);
2685 assert(lumBufIndex
< 2*vLumBufSize
);
2686 assert(lastInLumBuf
+ 1 - srcSliceY
< srcSliceH
);
2687 assert(lastInLumBuf
+ 1 - srcSliceY
>= 0);
2688 RENAME(hyscale
)(c
, lumPixBuf
[ lumBufIndex
], dstW
, src1
, srcW
, lumXInc
,
2689 hLumFilter
, hLumFilterPos
, hLumFilterSize
,
2692 if (CONFIG_SWSCALE_ALPHA
&& alpPixBuf
)
2693 RENAME(hyscale
)(c
, alpPixBuf
[ lumBufIndex
], dstW
, src2
, srcW
, lumXInc
,
2694 hLumFilter
, hLumFilterPos
, hLumFilterSize
,
2699 while(lastInChrBuf
< lastChrSrcY
) {
2700 uint8_t *src1
= src
[1]+(lastInChrBuf
+ 1 - chrSrcSliceY
)*srcStride
[1];
2701 uint8_t *src2
= src
[2]+(lastInChrBuf
+ 1 - chrSrcSliceY
)*srcStride
[2];
2703 DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
2704 chrBufIndex
, lastInChrBuf
);
2705 assert(chrBufIndex
< 2*vChrBufSize
);
2706 assert(lastInChrBuf
+ 1 - chrSrcSliceY
< (chrSrcSliceH
));
2707 assert(lastInChrBuf
+ 1 - chrSrcSliceY
>= 0);
2708 //FIXME replace parameters through context struct (some at least)
2710 if (c
->needs_hcscale
)
2711 RENAME(hcscale
)(c
, chrPixBuf
[ chrBufIndex
], chrDstW
, src1
, src2
, chrSrcW
, chrXInc
,
2712 hChrFilter
, hChrFilterPos
, hChrFilterSize
,
2717 //wrap buf index around to stay inside the ring buffer
2718 if (lumBufIndex
>= vLumBufSize
) lumBufIndex
-= vLumBufSize
;
2719 if (chrBufIndex
>= vChrBufSize
) chrBufIndex
-= vChrBufSize
;
2721 break; //we can't output a dstY line so let's try with the next slice
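        /* lumPixBuf/chrPixBuf are ring buffers of horizontally scaled lines.
         * Their pointer arrays are allocated doubled (entry i+vLumBufSize
         * aliases entry i), so after the wrap above the vertical filter can
         * still read vLumFilterSize consecutive entries without wrapping. */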
#if COMPILE_TEMPLATE_MMX
        c->blueDither= ff_dither8[dstY&1];
        if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
            c->greenDither= ff_dither8[dstY&1];
        else
            c->greenDither= ff_dither4[dstY&1];
        c->redDither= ff_dither8[(dstY+1)&1];
#endif
        if (dstY < dstH-2) {
            const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            const int16_t **chrSrcPtr= (const int16_t **) chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
#if COMPILE_TEMPLATE_MMX
            int i;
            if (flags & SWS_ACCURATE_RND) {
                int s= APCK_SIZE / 8;
                for (i=0; i<vLumFilterSize; i+=2) {
                    *(void**)&lumMmxFilter[s*i              ]= lumSrcPtr[i  ];
                    *(void**)&lumMmxFilter[s*i+APCK_PTR2/4  ]= lumSrcPtr[i+(vLumFilterSize>1)];
                              lumMmxFilter[s*i+APCK_COEF/4  ]=
                              lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i    ]
                                  + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
                    if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
                        *(void**)&alpMmxFilter[s*i              ]= alpSrcPtr[i  ];
                        *(void**)&alpMmxFilter[s*i+APCK_PTR2/4  ]= alpSrcPtr[i+(vLumFilterSize>1)];
                                  alpMmxFilter[s*i+APCK_COEF/4  ]=
                                  alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4  ];
                    }
                }
                for (i=0; i<vChrFilterSize; i+=2) {
                    *(void**)&chrMmxFilter[s*i              ]= chrSrcPtr[i  ];
                    *(void**)&chrMmxFilter[s*i+APCK_PTR2/4  ]= chrSrcPtr[i+(vChrFilterSize>1)];
                              chrMmxFilter[s*i+APCK_COEF/4  ]=
                              chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i    ]
                                  + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
                }
            } else {
                for (i=0; i<vLumFilterSize; i++) {
                    lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
                    lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
                    lumMmxFilter[4*i+2]=
                    lumMmxFilter[4*i+3]=
                        ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
                    if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
                        alpMmxFilter[4*i+0]= (int32_t)alpSrcPtr[i];
                        alpMmxFilter[4*i+1]= (uint64_t)alpSrcPtr[i] >> 32;
                        alpMmxFilter[4*i+2]=
                        alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
                    }
                }
                for (i=0; i<vChrFilterSize; i++) {
                    chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
                    chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
                    chrMmxFilter[4*i+2]=
                    chrMmxFilter[4*i+3]=
                        ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
                }
            }
#endif
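            /* In the plain layout each 4-int32 block of lumMmxFilter holds a
             * split 64-bit line pointer plus the coefficient duplicated into
             * both 16-bit halves of a dword (the *0x10001), ready for pmulhw;
             * the ACCURATE_RND/APCK layout instead packs two line pointers
             * and a coefficient pair per block for the pmaddwd path. */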
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                c->yuv2nv12X(c,
                             vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                             vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                             dest, uDest, dstW, chrDstW, dstFormat);
            } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                if (is16BPS(dstFormat)) {
                    yuv2yuvX16inC(
                        vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                        vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                        alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
                        dstFormat);
                } else if (vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
                    int16_t *lumBuf = lumSrcPtr[0];
                    int16_t *chrBuf= chrSrcPtr[0];
                    int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
                    c->yuv2yuv1(c, lumBuf, chrBuf, alpBuf, dest, uDest, vDest, aDest, dstW, chrDstW);
                } else { //General YV12
                    c->yuv2yuvX(c,
                                vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
                }
            } else {
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if (vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
                    int chrAlpha= vChrFilter[2*dstY+1];
                    if(flags & SWS_FULL_CHR_H_INT) {
                        yuv2rgbXinC_full(c, //FIXME write a packed1_full function
                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                         alpSrcPtr, dest, dstW, dstY);
                    } else {
                        c->yuv2packed1(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
                                       alpPixBuf ? *alpSrcPtr : NULL,
                                       dest, dstW, chrAlpha, dstFormat, flags, dstY);
                    }
                } else if (vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
                    int lumAlpha= vLumFilter[2*dstY+1];
                    int chrAlpha= vChrFilter[2*dstY+1];
                    lumMmxFilter[2]=
                    lumMmxFilter[3]= vLumFilter[2*dstY   ]*0x10001;
                    chrMmxFilter[2]=
                    chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
                    if(flags & SWS_FULL_CHR_H_INT) {
                        yuv2rgbXinC_full(c, //FIXME write a packed2_full function
                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                         alpSrcPtr, dest, dstW, dstY);
                    } else {
                        c->yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
                                       alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL,
                                       dest, dstW, lumAlpha, chrAlpha, dstY);
                    }
                } else { //general RGB
                    if(flags & SWS_FULL_CHR_H_INT) {
                        yuv2rgbXinC_full(c,
                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                         alpSrcPtr, dest, dstW, dstY);
                    } else {
                        c->yuv2packedX(c,
                                       vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                       vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                       alpSrcPtr, dest, dstW, dstY);
                    }
                }
            }
        } else { // hmm looks like we can't use MMX here without overwriting this array's tail
            const int16_t **lumSrcPtr= (const int16_t **)lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            const int16_t **chrSrcPtr= (const int16_t **)chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                yuv2nv12XinC(
                    vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                    vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                    dest, uDest, dstW, chrDstW, dstFormat);
            } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                if (is16BPS(dstFormat)) {
                    yuv2yuvX16inC(
                        vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                        vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                        alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
                        dstFormat);
                } else {
                    yuv2yuvXinC(
                        vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                        vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                        alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
                }
            } else {
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if(flags & SWS_FULL_CHR_H_INT) {
                    yuv2rgbXinC_full(c,
                                     vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                     vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                     alpSrcPtr, dest, dstW, dstY);
                } else {
                    yuv2packedXinC(c,
                                   vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                   vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                   alpSrcPtr, dest, dstW, dstY);
                }
            }
        }
    }
    if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
        fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);

#if COMPILE_TEMPLATE_MMX
    if (flags & SWS_CPU_CAPS_MMX2 ) __asm__ volatile("sfence":::"memory");
    /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
    if (flags & SWS_CPU_CAPS_3DNOW) __asm__ volatile("femms" :::"memory");
    else                            __asm__ volatile("emms"  :::"memory");
#endif
    /* store changed local vars back in the context */
    c->dstY= dstY;
    c->lumBufIndex= lumBufIndex;
    c->chrBufIndex= chrBufIndex;
    c->lastInLumBuf= lastInLumBuf;
    c->lastInChrBuf= lastInChrBuf;

    return dstY - lastDstY;
}
static void RENAME(sws_init_swScale)(SwsContext *c)
{
    enum PixelFormat srcFormat = c->srcFormat;

    c->yuv2nv12X    = RENAME(yuv2nv12X   );
    c->yuv2yuv1     = RENAME(yuv2yuv1    );
    c->yuv2yuvX     = RENAME(yuv2yuvX    );
    c->yuv2packed1  = RENAME(yuv2packed1 );
    c->yuv2packed2  = RENAME(yuv2packed2 );
    c->yuv2packedX  = RENAME(yuv2packedX );

    c->hScale       = RENAME(hScale      );

#if COMPILE_TEMPLATE_MMX
    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
    if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed)
#else
    if (c->flags & SWS_FAST_BILINEAR)
#endif
    {
        c->hyscale_fast = RENAME(hyscale_fast);
        c->hcscale_fast = RENAME(hcscale_fast);
    }

    c->chrToYV12 = NULL;
    switch(srcFormat) {
        case PIX_FMT_YUYV422  : c->chrToYV12 = RENAME(yuy2ToUV); break;
        case PIX_FMT_UYVY422  : c->chrToYV12 = RENAME(uyvyToUV); break;
        case PIX_FMT_NV12     : c->chrToYV12 = RENAME(nv12ToUV); break;
        case PIX_FMT_NV21     : c->chrToYV12 = RENAME(nv21ToUV); break;
        case PIX_FMT_RGB8     :
        case PIX_FMT_BGR8     :
        case PIX_FMT_PAL8     :
        case PIX_FMT_BGR4_BYTE:
        case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV; break;
        case PIX_FMT_YUV420P16BE:
        case PIX_FMT_YUV422P16BE:
        case PIX_FMT_YUV444P16BE: c->chrToYV12 = RENAME(BEToUV); break;
        case PIX_FMT_YUV420P16LE:
        case PIX_FMT_YUV422P16LE:
        case PIX_FMT_YUV444P16LE: c->chrToYV12 = RENAME(LEToUV); break;
    }
    if (c->chrSrcHSubSample) {
        switch(srcFormat) {
        case PIX_FMT_RGB48BE:
        case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48ToUV_half; break;
        case PIX_FMT_RGB32  :
        case PIX_FMT_RGB32_1: c->chrToYV12 = bgr32ToUV_half; break;
        case PIX_FMT_BGR24  : c->chrToYV12 = RENAME(bgr24ToUV_half); break;
        case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV_half; break;
        case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV_half; break;
        case PIX_FMT_BGR32  :
        case PIX_FMT_BGR32_1: c->chrToYV12 = rgb32ToUV_half; break;
        case PIX_FMT_RGB24  : c->chrToYV12 = RENAME(rgb24ToUV_half); break;
        case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV_half; break;
        case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV_half; break;
        }
    } else {
        switch(srcFormat) {
        case PIX_FMT_RGB48BE:
        case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48ToUV; break;
        case PIX_FMT_RGB32  :
        case PIX_FMT_RGB32_1: c->chrToYV12 = bgr32ToUV; break;
        case PIX_FMT_BGR24  : c->chrToYV12 = RENAME(bgr24ToUV); break;
        case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV; break;
        case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV; break;
        case PIX_FMT_BGR32  :
        case PIX_FMT_BGR32_1: c->chrToYV12 = rgb32ToUV; break;
        case PIX_FMT_RGB24  : c->chrToYV12 = RENAME(rgb24ToUV); break;
        case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV; break;
        case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV; break;
        }
    }

    c->lumToYV12 = NULL;
    c->alpToYV12 = NULL;
    switch (srcFormat) {
    case PIX_FMT_YUYV422  :
    case PIX_FMT_YUV420P16BE:
    case PIX_FMT_YUV422P16BE:
    case PIX_FMT_YUV444P16BE:
    case PIX_FMT_GRAY16BE : c->lumToYV12 = RENAME(yuy2ToY); break;
    case PIX_FMT_UYVY422  :
    case PIX_FMT_YUV420P16LE:
    case PIX_FMT_YUV422P16LE:
    case PIX_FMT_YUV444P16LE:
    case PIX_FMT_GRAY16LE : c->lumToYV12 = RENAME(uyvyToY); break;
    case PIX_FMT_BGR24    : c->lumToYV12 = RENAME(bgr24ToY); break;
    case PIX_FMT_BGR565   : c->lumToYV12 = bgr16ToY; break;
    case PIX_FMT_BGR555   : c->lumToYV12 = bgr15ToY; break;
    case PIX_FMT_RGB24    : c->lumToYV12 = RENAME(rgb24ToY); break;
    case PIX_FMT_RGB565   : c->lumToYV12 = rgb16ToY; break;
    case PIX_FMT_RGB555   : c->lumToYV12 = rgb15ToY; break;
    case PIX_FMT_RGB8     :
    case PIX_FMT_BGR8     :
    case PIX_FMT_PAL8     :
    case PIX_FMT_BGR4_BYTE:
    case PIX_FMT_RGB4_BYTE: c->lumToYV12 = palToY; break;
    case PIX_FMT_MONOBLACK: c->lumToYV12 = monoblack2Y; break;
    case PIX_FMT_MONOWHITE: c->lumToYV12 = monowhite2Y; break;
    case PIX_FMT_RGB32  :
    case PIX_FMT_RGB32_1: c->lumToYV12 = bgr32ToY; break;
    case PIX_FMT_BGR32  :
    case PIX_FMT_BGR32_1: c->lumToYV12 = rgb32ToY; break;
    case PIX_FMT_RGB48BE:
    case PIX_FMT_RGB48LE: c->lumToYV12 = rgb48ToY; break;
    }
    if (c->alpPixBuf) {
        switch (srcFormat) {
        case PIX_FMT_RGB32  :
        case PIX_FMT_RGB32_1:
        case PIX_FMT_BGR32  :
        case PIX_FMT_BGR32_1: c->alpToYV12 = abgrToA; break;
        }
    }

    switch (srcFormat) {
    case PIX_FMT_RGB32  :
    case PIX_FMT_BGR32  :
        c->alpSrcOffset = 3;
        break;
    case PIX_FMT_RGB32_1:
    case PIX_FMT_BGR32_1:
        c->lumSrcOffset = ALT32_CORR;
        c->chrSrcOffset = ALT32_CORR;
        break;
    case PIX_FMT_RGB48LE:
        c->lumSrcOffset = 1;
        c->chrSrcOffset = 1;
        c->alpSrcOffset = 1;
        break;
    }

    if (c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))) {
        if (c->srcRange) {
            c->lumConvertRange = RENAME(lumRangeFromJpeg);
            c->chrConvertRange = RENAME(chrRangeFromJpeg);
        } else {
            c->lumConvertRange = RENAME(lumRangeToJpeg);
            c->chrConvertRange = RENAME(chrRangeToJpeg);
        }
    }

    if (!(isGray(srcFormat) || isGray(c->dstFormat) ||
          srcFormat == PIX_FMT_MONOBLACK || srcFormat == PIX_FMT_MONOWHITE))
        c->needs_hcscale = 1;
}