/*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * The C code (not assembly, MMX, ...) of this file can be used
 * under the LGPL license.
 */
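
/* This file is a template: it is #included once per CPU flavor, with RENAME()
 * decorating each function with that flavor's suffix. The
 * COMPILE_TEMPLATE_MMX2 / COMPILE_TEMPLATE_AMD3DNOW / COMPILE_TEMPLATE_ALTIVEC
 * macros select the matching instruction variants below. */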
#if COMPILE_TEMPLATE_AMD3DNOW
#define PREFETCH "prefetch"
#elif COMPILE_TEMPLATE_MMX2
#define PREFETCH "prefetchnta"
#else
#define PREFETCH " # nop"
#endif

#if COMPILE_TEMPLATE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif COMPILE_TEMPLATE_AMD3DNOW
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

#if COMPILE_TEMPLATE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
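
/* The REAL_/wrapper pair forces an extra round of macro expansion, so that
 * arguments which are themselves macros (register names, offsets) get
 * expanded before # stringification. On MMX2 the store is non-temporal
 * (movntq) to avoid polluting the cache with write-only pixel data. */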
#if COMPILE_TEMPLATE_ALTIVEC
#include "ppc/swscale_altivec_template.c"
#endif
#define YSCALEYUV2YV12X(x, offset, dest, width) \
    __asm__ volatile(\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1: \n\t"\
    "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
    "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
    "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t"\
    "pmulhw %%mm0, %%mm5 \n\t"\
    "paddw %%mm2, %%mm3 \n\t"\
    "paddw %%mm5, %%mm4 \n\t"\
    " jnz 1b \n\t"\
    "psraw $3, %%mm3 \n\t"\
    "psraw $3, %%mm4 \n\t"\
    "packuswb %%mm4, %%mm3 \n\t"\
    MOVNTQ(%%mm3, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "cmp %2, %%"REG_a" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "jb 1b \n\t"\
    :: "r" (&c->redDither),\
    "r" (dest), "g" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );
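
/* What the loop above computes, as a C sketch (not part of the build;
 * 'filter'/'src' stand for the packed coefficient/pointer list walked
 * via REG_d, with VROUNDER_OFFSET preloading the rounding constant):
 *
 *     for (i = 0; i < width; i++) {
 *         int val = rounder;                        // movq VROUNDER_OFFSET(%0)
 *         for (j = 0; j < filterSize; j++)
 *             val += (src[j][i] * filter[j]) >> 16; // pmulhw keeps the high word
 *         dest[i] = av_clip_uint8(val >> 3);        // psraw $3 + packuswb
 *     }
 */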
#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
    __asm__ volatile(\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    ASMALIGN(4) \
    "1: \n\t"\
    "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
    "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm1, %%mm0 \n\t"\
    "punpckhwd %%mm1, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm3 \n\t"\
    "paddd %%mm0, %%mm4 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm2 \n\t"\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "paddd %%mm2, %%mm6 \n\t"\
    "paddd %%mm0, %%mm7 \n\t"\
    " jnz 1b \n\t"\
    "psrad $16, %%mm4 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm4 \n\t"\
    "packssdw %%mm7, %%mm6 \n\t"\
    "paddw %%mm0, %%mm4 \n\t"\
    "paddw %%mm0, %%mm6 \n\t"\
    "psraw $3, %%mm4 \n\t"\
    "psraw $3, %%mm6 \n\t"\
    "packuswb %%mm6, %%mm4 \n\t"\
    MOVNTQ(%%mm4, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "cmp %2, %%"REG_a" \n\t"\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "jb 1b \n\t"\
    :: "r" (&c->redDither),\
    "r" (dest), "g" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );
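
/* The _ACCURATE variant trades speed for precision: pmaddwd accumulates
 * coefficient*sample products in 32 bits (two taps per pmaddwd, hence the
 * punpcklwd/punpckhwd interleaving of the APCK_PTR2 pair), and rounding is
 * applied once after psrad $16, instead of truncating every tap with pmulhw. */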
#define YSCALEYUV2YV121 \
    "mov %2, %%"REG_a" \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1: \n\t"\
    "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
    "psraw $7, %%mm0 \n\t"\
    "psraw $7, %%mm1 \n\t"\
    "packuswb %%mm1, %%mm0 \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "jnc 1b \n\t"
#define YSCALEYUV2YV121_ACCURATE \
    "mov %2, %%"REG_a" \n\t"\
    "pcmpeqw %%mm7, %%mm7 \n\t"\
    "psrlw $15, %%mm7 \n\t"\
    "psllw $6, %%mm7 \n\t" /* mm7 = 0x0040 in each word: rounding term for >>7 */\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1: \n\t"\
    "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
    "paddsw %%mm7, %%mm0 \n\t"\
    "paddsw %%mm7, %%mm1 \n\t"\
    "psraw $7, %%mm0 \n\t"\
    "psraw $7, %%mm1 \n\t"\
    "packuswb %%mm1, %%mm0 \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "jnc 1b \n\t"
184 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
185 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
186 "r" (dest), "m" (dstW),
187 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
188 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
#define YSCALEYUV2PACKEDX_UV \
    __asm__ volatile(\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    ASMALIGN(4)\
    "nop \n\t"\
    "1: \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
    "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t"\
    "pmulhw %%mm0, %%mm5 \n\t"\
    "paddw %%mm2, %%mm3 \n\t"\
    "paddw %%mm5, %%mm4 \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    " jnz 2b \n\t"\
#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
    "lea "offset"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
    "movq "#dst1", "#dst2" \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
    "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pmulhw "#coeff", "#src1" \n\t"\
    "pmulhw "#coeff", "#src2" \n\t"\
    "paddw "#src1", "#dst1" \n\t"\
    "paddw "#src2", "#dst2" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    " jnz 2b \n\t"\
#define YSCALEYUV2PACKEDX \
    YSCALEYUV2PACKEDX_UV \
    YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
#define YSCALEYUV2PACKEDX_END \
    :: "r" (&c->redDither), \
    "m" (dummy), "m" (dummy), "m" (dummy),\
    "r" (dest), "m" (dstW) \
    : "%"REG_a, "%"REG_d, "%"REG_S \
    );
#define YSCALEYUV2PACKEDX_ACCURATE_UV \
    __asm__ volatile(\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    ASMALIGN(4)\
    "nop \n\t"\
    "1: \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm1, %%mm0 \n\t"\
    "punpckhwd %%mm1, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm3 \n\t"\
    "paddd %%mm0, %%mm4 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm2 \n\t"\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "paddd %%mm2, %%mm6 \n\t"\
    "paddd %%mm0, %%mm7 \n\t"\
    " jnz 2b \n\t"\
    "psrad $16, %%mm4 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm4 \n\t"\
    "packssdw %%mm7, %%mm6 \n\t"\
    "paddw %%mm0, %%mm4 \n\t"\
    "paddw %%mm0, %%mm6 \n\t"\
    "movq %%mm4, "U_TEMP"(%0) \n\t"\
    "movq %%mm6, "V_TEMP"(%0) \n\t"\
#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
    "lea "offset"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pxor %%mm1, %%mm1 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm4, %%mm0 \n\t"\
    "punpckhwd %%mm4, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm3 \n\t"\
    "paddd %%mm0, %%mm1 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm2 \n\t"\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "paddd %%mm2, %%mm7 \n\t"\
    "paddd %%mm0, %%mm6 \n\t"\
    " jnz 2b \n\t"\
    "psrad $16, %%mm1 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm1 \n\t"\
    "packssdw %%mm6, %%mm7 \n\t"\
    "paddw %%mm0, %%mm1 \n\t"\
    "paddw %%mm0, %%mm7 \n\t"\
    "movq "U_TEMP"(%0), %%mm3 \n\t"\
    "movq "V_TEMP"(%0), %%mm4 \n\t"\
#define YSCALEYUV2PACKEDX_ACCURATE \
    YSCALEYUV2PACKEDX_ACCURATE_UV \
    YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
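
/* YSCALEYUV2RGBX below converts the filtered Y/U/V words to R'G'B' with the
 * usual fixed-point BT.601-style matrix (a sketch of the per-pixel math;
 * the *_COEFF/*_OFFSET constants live in the SwsContext at %0):
 *
 *     Y' = (Y - y_offset) * y_coeff
 *     B  = Y' + (U - 128) * ub_coeff
 *     G  = Y' + (U - 128) * ug_coeff + (V - 128) * vg_coeff
 *     R  = Y' + (V - 128) * vr_coeff
 *
 * pmulhw keeps the high 16 bits of each product, and packuswb finally
 * saturates the interleaved words to unsigned 8 bit. */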
#define YSCALEYUV2RGBX \
    "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
    "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\
#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
    "psraw $3, %%mm0 \n\t"\
    "psraw $3, %%mm1 \n\t"\
    "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $7, %%mm3 \n\t" /* uvbuf1[eax] >>7*/\
    "psraw $7, %%mm4 \n\t" /* uvbuf1[eax+2048] >>7*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
    "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $7, %%mm1 \n\t" /* buf1[eax] >>7*/\
    "psraw $7, %%mm7 \n\t" /* buf1[eax] >>7*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
#define REAL_YSCALEYUV2RGB_UV(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf1[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf1[eax+2048] >>4*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
    "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $4, %%mm1 \n\t" /* buf1[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf1[eax] >>4*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
#define REAL_YSCALEYUV2RGB_COEFF(c) \
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\
#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)

#define YSCALEYUV2RGB(index, c) \
    REAL_YSCALEYUV2RGB_UV(index, c) \
    REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
    REAL_YSCALEYUV2RGB_COEFF(c)
#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "psraw $7, %%mm3 \n\t" \
    "psraw $7, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t" \

#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
#define REAL_YSCALEYUV2RGB1(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] >>4*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $8, %%mm3 \n\t" \
    "psrlw $8, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t"
#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
// do vertical chrominance interpolation
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
    "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
#define REAL_YSCALEYUV2RGB1_ALPHA(index) \
    "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
    "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
    "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
    "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
    "packuswb %%mm1, %%mm7 \n\t"
#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
    "movq "#b", "#q2" \n\t" /* B */\
    "movq "#r", "#t" \n\t" /* R */\
    "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
    "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
    "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
    "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
    "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
    "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
    "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
    "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
    "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
    "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
\
    MOVNTQ( q0, (dst, index, 4))\
    MOVNTQ( b, 8(dst, index, 4))\
    MOVNTQ( q2, 16(dst, index, 4))\
    MOVNTQ( q3, 24(dst, index, 4))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
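
/* WRITEBGR32 interleaves four byte planes into ARGB dwords. Scalar sketch
 * (not part of the build) of the punpcklbw/punpcklwd cascade above:
 *
 *     for (i = 0; i < 8; i++)                 // one MMX iteration = 8 pixels
 *         dst32[index + i] = b[i] | (g[i] << 8) | (r[i] << 16) | (a[i] << 24);
 */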
#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $3, %%mm3 \n\t"\
    "psllq $3, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2, (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
#define REAL_WRITERGB15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
    "psrlq $1, %%mm5 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $2, %%mm3 \n\t"\
    "psllq $2, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2, (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
#define WRITEBGR24OLD(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1 \n\t" /* B */\
    "movq %%mm5, %%mm6 \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
    "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
    "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
    "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
    "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
    "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
    "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
    "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
    "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
\
    "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
    "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
    "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
    "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
    "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
    "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
    "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
    "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
    "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
    "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
    "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
    "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
\
    "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
    "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
    "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
    "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
    "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
    "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
    "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
    "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
\
    MOVNTQ(%%mm0, (dst))\
    MOVNTQ(%%mm2, 8(dst))\
    MOVNTQ(%%mm3, 16(dst))\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1 \n\t" /* B */\
    "movq %%mm5, %%mm6 \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
    "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
    "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
\
    "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
    "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
    "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
    "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
    "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
    "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
    "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
    "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
    "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
    "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEBGR24MMX2(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
\
    "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
    "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
    "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
\
    "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
    "por %%mm1, %%mm6 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
    "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
    "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
\
    "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
\
    "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
    "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
\
    "por %%mm1, %%mm3 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#if COMPILE_TEMPLATE_MMX2
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
#endif
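
/* Packing to 24bpp has no single-instruction store, so two strategies exist:
 * WRITEBGR24MMX2 uses pshufw plus pand/por masks (ff_M24A/B/C) to assemble
 * three output qwords, while the plain-MMX fallback shifts and ORs the
 * 0RGB dwords together. Both write 24 bytes (8 pixels) per loop iteration. */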
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb %%mm3, %%mm3 \n\t"\
    "packuswb %%mm4, %%mm4 \n\t"\
    "packuswb %%mm7, %%mm1 \n\t"\
    "punpcklbw %%mm4, %%mm3 \n\t"\
    "movq %%mm1, %%mm7 \n\t"\
    "punpcklbw %%mm3, %%mm1 \n\t"\
    "punpckhbw %%mm3, %%mm7 \n\t"\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
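
/* YUY2 output interleaves as Y0 U Y1 V: mm3/mm4 hold the packed U/V samples,
 * mm1/mm7 the 8 luma samples, so punpcklbw/punpckhbw produce the final byte
 * order directly. Scalar sketch (not part of the build):
 *
 *     for (i = 0; i < 4; i++) {
 *         dst[4*i+0] = Y[2*i];   dst[4*i+1] = U[i];
 *         dst[4*i+2] = Y[2*i+1]; dst[4*i+3] = V[i];
 *     }
 */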
static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                    const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
{
#if COMPILE_TEMPLATE_MMX
    if(!(c->flags & SWS_BITEXACT)) {
        if (c->flags & SWS_ACCURATE_RND) {
            if (uDest) {
                YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }
            if (CONFIG_SWSCALE_ALPHA && aDest) {
                YSCALEYUV2YV12X_ACCURATE( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
            }

            YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        } else {
            if (uDest) {
                YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }
            if (CONFIG_SWSCALE_ALPHA && aDest) {
                YSCALEYUV2YV12X( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
            }

            YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        }
        return;
    }
#endif
#if COMPILE_TEMPLATE_ALTIVEC
    yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
                          chrFilter, chrSrc, chrFilterSize,
                          dest, uDest, vDest, dstW, chrDstW);
#else //COMPILE_TEMPLATE_ALTIVEC
    yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
                chrFilter, chrSrc, chrFilterSize,
                alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW);
#endif //!COMPILE_TEMPLATE_ALTIVEC
}
static inline void RENAME(yuv2nv12X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                     const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
                                     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, enum PixelFormat dstFormat)
{
    yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
                 chrFilter, chrSrc, chrFilterSize,
                 dest, uDest, dstW, chrDstW, dstFormat);
}
static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
{
    int i;
#if COMPILE_TEMPLATE_MMX
    if(!(c->flags & SWS_BITEXACT)) {
        long p= 4;
        const uint8_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
        uint8_t *dst[4]= {aDest, dest, uDest, vDest};
        x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW};

        if (c->flags & SWS_ACCURATE_RND) {
            while(p--) {
                if (dst[p]) {
                    __asm__ volatile(
                        YSCALEYUV2YV121_ACCURATE
                        :: "r" (src[p]), "r" (dst[p] + counter[p]),
                        "g" (-counter[p])
                        : "%"REG_a
                    );
                }
            }
        } else {
            while(p--) {
                if (dst[p]) {
                    __asm__ volatile(
                        YSCALEYUV2YV121
                        :: "r" (src[p]), "r" (dst[p] + counter[p]),
                        "g" (-counter[p])
                        : "%"REG_a
                    );
                }
            }
        }
        return;
    }
#endif
    for (i=0; i<dstW; i++) {
        int val= (lumSrc[i]+64)>>7;

        if (val&256) {
            if (val<0) val=0;
            else       val=255;
        }

        dest[i]= val;
    }

    if (uDest)
        for (i=0; i<chrDstW; i++) {
            int u=(chrSrc[i       ]+64)>>7;
            int v=(chrSrc[i + VOFW]+64)>>7;

            if ((u|v)&256) {
                if (u<0)        u=0;
                else if (u>255) u=255;
                if (v<0)        v=0;
                else if (v>255) v=255;
            }

            uDest[i]= u;
            vDest[i]= v;
        }

    if (CONFIG_SWSCALE_ALPHA && aDest)
        for (i=0; i<dstW; i++) {
            int val= (alpSrc[i]+64)>>7;
            aDest[i]= av_clip_uint8(val);
        }
}
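
/* Note the clip pattern above: after the +64 rounding and >>7, a value that
 * left the 0..255 range sets bit 8, so a single '(val&256)' / '((u|v)&256)'
 * test skips the per-pixel clamping work on the common in-range path. */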
/**
 * vertical scale YV12 to RGB
 */
static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                       const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
                                       const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
{
#if COMPILE_TEMPLATE_MMX
    x86_reg dummy=0;
    if(!(c->flags & SWS_BITEXACT)) {
        if (c->flags & SWS_ACCURATE_RND) {
            switch(c->dstFormat) {
            case PIX_FMT_RGB32:
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                    YSCALEYUV2PACKEDX_ACCURATE
                    YSCALEYUV2RGBX
                    "movq %%mm2, "U_TEMP"(%0) \n\t"
                    "movq %%mm4, "V_TEMP"(%0) \n\t"
                    "movq %%mm5, "Y_TEMP"(%0) \n\t"
                    YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
                    "movq "Y_TEMP"(%0), %%mm5 \n\t"
                    "psraw $3, %%mm1 \n\t"
                    "psraw $3, %%mm7 \n\t"
                    "packuswb %%mm7, %%mm1 \n\t"
                    WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)

                    YSCALEYUV2PACKEDX_END
                } else {
                    YSCALEYUV2PACKEDX_ACCURATE
                    YSCALEYUV2RGBX
                    "pcmpeqd %%mm7, %%mm7 \n\t"
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)

                    YSCALEYUV2PACKEDX_END
                }
                return;
            case PIX_FMT_BGR24:
                YSCALEYUV2PACKEDX_ACCURATE
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
                "add %4, %%"REG_c" \n\t"
                WRITEBGR24(%%REGc, %5, %%REGa)

                :: "r" (&c->redDither),
                "m" (dummy), "m" (dummy), "m" (dummy),
                "r" (dest), "m" (dstW)
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
                );
                return;
            case PIX_FMT_RGB555:
                YSCALEYUV2PACKEDX_ACCURATE
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
                "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
                "paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif

                WRITERGB15(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_RGB565:
                YSCALEYUV2PACKEDX_ACCURATE
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
                "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
                "paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif

                WRITERGB16(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_YUYV422:
                YSCALEYUV2PACKEDX_ACCURATE
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

                "psraw $3, %%mm3 \n\t"
                "psraw $3, %%mm4 \n\t"
                "psraw $3, %%mm1 \n\t"
                "psraw $3, %%mm7 \n\t"
                WRITEYUY2(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            }
        } else {
            switch(c->dstFormat) {
            case PIX_FMT_RGB32:
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                    YSCALEYUV2PACKEDX
                    YSCALEYUV2RGBX
                    YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
                    "psraw $3, %%mm1 \n\t"
                    "psraw $3, %%mm7 \n\t"
                    "packuswb %%mm7, %%mm1 \n\t"
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
                    YSCALEYUV2PACKEDX_END
                } else {
                    YSCALEYUV2PACKEDX
                    YSCALEYUV2RGBX
                    "pcmpeqd %%mm7, %%mm7 \n\t"
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                    YSCALEYUV2PACKEDX_END
                }
                return;
            case PIX_FMT_BGR24:
                YSCALEYUV2PACKEDX
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
                "add %4, %%"REG_c" \n\t"
                WRITEBGR24(%%REGc, %5, %%REGa)

                :: "r" (&c->redDither),
                "m" (dummy), "m" (dummy), "m" (dummy),
                "r" (dest), "m" (dstW)
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
                );
                return;
            case PIX_FMT_RGB555:
                YSCALEYUV2PACKEDX
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
                "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
                "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif

                WRITERGB15(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_RGB565:
                YSCALEYUV2PACKEDX
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
                "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
                "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif

                WRITERGB16(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_YUYV422:
                YSCALEYUV2PACKEDX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

                "psraw $3, %%mm3 \n\t"
                "psraw $3, %%mm4 \n\t"
                "psraw $3, %%mm1 \n\t"
                "psraw $3, %%mm7 \n\t"
                WRITEYUY2(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            }
        }
    }
#endif /* COMPILE_TEMPLATE_MMX */
#if COMPILE_TEMPLATE_ALTIVEC
    /* The following list of supported dstFormat values should
       match what's found in the body of ff_yuv2packedX_altivec() */
    if (!(c->flags & SWS_BITEXACT) && !c->alpPixBuf &&
        (c->dstFormat==PIX_FMT_ABGR || c->dstFormat==PIX_FMT_BGRA ||
         c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
         c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB))
        ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize,
                               chrFilter, chrSrc, chrFilterSize,
                               dest, dstW, dstY);
    else
#endif
        yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
                       chrFilter, chrSrc, chrFilterSize,
                       alpSrc, dest, dstW, dstY);
}
/**
 * vertical bilinear scale YV12 to RGB
 */
static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
                                       const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
{
    int  yalpha1=4095- yalpha;
    int uvalpha1=4095-uvalpha;
    int i;

#if COMPILE_TEMPLATE_MMX
    if(!(c->flags & SWS_BITEXACT)) {
        switch(c->dstFormat) {
        //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
        case PIX_FMT_RGB32:
            if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
#if ARCH_X86_64
                __asm__ volatile(
                    YSCALEYUV2RGB(%%r8, %5)
                    YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
                    "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
                    "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
                    "packuswb %%mm7, %%mm1 \n\t"
                    WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "r" (dest),
                    "a" (&c->redDither)
                    ,"r" (abuf0), "r" (abuf1)
                    : "%r8"
                );
#else
                *(const uint16_t **)(&c->u_temp)=abuf0;
                *(const uint16_t **)(&c->v_temp)=abuf1;
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB(%%REGBP, %5)
                    "push %0 \n\t"
                    "push %1 \n\t"
                    "mov "U_TEMP"(%5), %0 \n\t"
                    "mov "V_TEMP"(%5), %1 \n\t"
                    YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
                    "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
                    "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
                    "packuswb %%mm7, %%mm1 \n\t"
                    "pop %1 \n\t"
                    "pop %0 \n\t"
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
#endif
            } else {
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB(%%REGBP, %5)
                    "pcmpeqd %%mm7, %%mm7 \n\t"
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
            }
            return;
        case PIX_FMT_BGR24:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB(%%REGBP, %5)
                "pxor %%mm7, %%mm7 \n\t"
                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
            );
            return;
        case PIX_FMT_RGB555:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB(%%REGBP, %5)
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif

                WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
            );
            return;
        case PIX_FMT_RGB565:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB(%%REGBP, %5)
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif

                WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
            );
            return;
        case PIX_FMT_YUYV422:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2PACKED(%%REGBP, %5)
                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
            );
            return;
        default: break;
        }
    }
#endif //COMPILE_TEMPLATE_MMX
    YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
}
/**
 * YV12 to RGB without scaling or interpolating
 */
static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
                                       const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, enum PixelFormat dstFormat, int flags, int y)
{
    const int yalpha1=0;
    int i;

    const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
    const int yalpha= 4096; //FIXME ...

    if (flags&SWS_FULL_CHR_H_INT) {
        c->yuv2packed2(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
        return;
    }

#if COMPILE_TEMPLATE_MMX
    if(!(flags & SWS_BITEXACT)) {
        if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
            switch(dstFormat) {
            case PIX_FMT_RGB32:
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                    __asm__ volatile(
                        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                        "mov %4, %%"REG_b" \n\t"
                        "push %%"REG_BP" \n\t"
                        YSCALEYUV2RGB1(%%REGBP, %5)
                        YSCALEYUV2RGB1_ALPHA(%%REGBP)
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                        "pop %%"REG_BP" \n\t"
                        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                        :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                        "a" (&c->redDither)
                    );
                } else {
                    __asm__ volatile(
                        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                        "mov %4, %%"REG_b" \n\t"
                        "push %%"REG_BP" \n\t"
                        YSCALEYUV2RGB1(%%REGBP, %5)
                        "pcmpeqd %%mm7, %%mm7 \n\t"
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                        "pop %%"REG_BP" \n\t"
                        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                        "a" (&c->redDither)
                    );
                }
                return;
            case PIX_FMT_BGR24:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1(%%REGBP, %5)
                    "pxor %%mm7, %%mm7 \n\t"
                    WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB555:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1(%%REGBP, %5)
                    "pxor %%mm7, %%mm7 \n\t"
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
                    WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB565:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1(%%REGBP, %5)
                    "pxor %%mm7, %%mm7 \n\t"
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif

                    WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            case PIX_FMT_YUYV422:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2PACKED1(%%REGBP, %5)
                    WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            }
        } else {
            switch(dstFormat) {
            case PIX_FMT_RGB32:
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                    __asm__ volatile(
                        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                        "mov %4, %%"REG_b" \n\t"
                        "push %%"REG_BP" \n\t"
                        YSCALEYUV2RGB1b(%%REGBP, %5)
                        YSCALEYUV2RGB1_ALPHA(%%REGBP)
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                        "pop %%"REG_BP" \n\t"
                        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                        :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                        "a" (&c->redDither)
                    );
                } else {
                    __asm__ volatile(
                        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                        "mov %4, %%"REG_b" \n\t"
                        "push %%"REG_BP" \n\t"
                        YSCALEYUV2RGB1b(%%REGBP, %5)
                        "pcmpeqd %%mm7, %%mm7 \n\t"
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                        "pop %%"REG_BP" \n\t"
                        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                        "a" (&c->redDither)
                    );
                }
                return;
            case PIX_FMT_BGR24:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1b(%%REGBP, %5)
                    "pxor %%mm7, %%mm7 \n\t"
                    WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB555:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1b(%%REGBP, %5)
                    "pxor %%mm7, %%mm7 \n\t"
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
                    WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB565:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1b(%%REGBP, %5)
                    "pxor %%mm7, %%mm7 \n\t"
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif

                    WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            case PIX_FMT_YUYV422:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2PACKED1b(%%REGBP, %5)
                    WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            }
        }
    }
#endif /* COMPILE_TEMPLATE_MMX */
    if (uvalpha < 2048) {
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
    } else {
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
    }
}
//FIXME yuy2* can read up to 7 samples too much
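
/* The input "unpackers" below each exist in an MMX and a plain-C flavor.
 * Scalar equivalent of RENAME(yuy2ToY) (a sketch, not part of the build):
 *
 *     for (i = 0; i < width; i++)
 *         dst[i] = src[2*i];        // every even byte of YUYV is luma
 */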
static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm2 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "pand %%mm2, %%mm0 \n\t"
        "pand %%mm2, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
        dst[i]= src[2*i];
#endif
}
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",4), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "psrlw $8, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        "movd %%mm0, (%3, %%"REG_a") \n\t"
        "movd %%mm1, (%2, %%"REG_a") \n\t"
        "add $4, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++) {
        dstU[i]= src1[4*i + 1];
        dstV[i]= src1[4*i + 3];
    }
#endif
    assert(src1 == src2);
}
static inline void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "movq (%2, %%"REG_a",2), %%mm2 \n\t"
        "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "psrlw $8, %%mm1 \n\t"
        "psrlw $8, %%mm2 \n\t"
        "psrlw $8, %%mm3 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "movq %%mm0, (%3, %%"REG_a") \n\t"
        "movq %%mm2, (%4, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++) {
        dstU[i]= src1[2*i + 1];
        dstV[i]= src2[2*i + 1];
    }
#endif
}
/* This is almost identical to the previous, and exists only because
 * yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses. */
static inline void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "psrlw $8, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
        dst[i]= src[2*i+1];
#endif
}
static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",4), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
        "pand %%mm4, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        "movd %%mm0, (%3, %%"REG_a") \n\t"
        "movd %%mm1, (%2, %%"REG_a") \n\t"
        "add $4, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++) {
        dstU[i]= src1[4*i + 0];
        dstV[i]= src1[4*i + 2];
    }
#endif
    assert(src1 == src2);
}
static inline void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "movq (%2, %%"REG_a",2), %%mm2 \n\t"
        "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
        "pand %%mm4, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "pand %%mm4, %%mm2 \n\t"
        "pand %%mm4, %%mm3 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "movq %%mm0, (%3, %%"REG_a") \n\t"
        "movq %%mm2, (%4, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++) {
        dstU[i]= src1[2*i];
        dstV[i]= src2[2*i];
    }
#endif
}
static inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
                                    const uint8_t *src, long width)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm1, %%mm3 \n\t"
        "pand %%mm4, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "psrlw $8, %%mm2 \n\t"
        "psrlw $8, %%mm3 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "movq %%mm2, (%3, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width)
        : "%"REG_a
    );
#else
    int i;
    for (i = 0; i < width; i++) {
        dst1[i] = src[2*i+0];
        dst2[i] = src[2*i+1];
    }
#endif
}

static inline void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
                                    const uint8_t *src1, const uint8_t *src2,
                                    long width, uint32_t *unused)
{
    RENAME(nvXXtoUV)(dstU, dstV, src1, width);
}

static inline void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
                                    const uint8_t *src1, const uint8_t *src2,
                                    long width, uint32_t *unused)
{
    RENAME(nvXXtoUV)(dstV, dstU, src1, width);
}
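
/* Layout note (editor's sketch, not from the original sources): NV12 stores
 * chroma as interleaved U,V byte pairs (U0 V0 U1 V1 ...) while NV21 stores
 * V,U pairs, so the two wrappers above only differ in the order of the
 * destination pointers passed to the common deinterleaving routine. */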

#if COMPILE_TEMPLATE_MMX
static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src, long width, enum PixelFormat srcFormat)
{

    if(srcFormat == PIX_FMT_BGR24) {
        __asm__ volatile(
            "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
            "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
        );
    } else {
        __asm__ volatile(
            "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
            "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
        );
    }

    __asm__ volatile(
        "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
        "mov %2, %%"REG_a"                  \n\t"
        "pxor %%mm7, %%mm7                  \n\t"
        "1:                                 \n\t"
        PREFETCH" 64(%0)                    \n\t"
        "movd  (%0), %%mm0                  \n\t"
        "movd 2(%0), %%mm1                  \n\t"
        "movd 6(%0), %%mm2                  \n\t"
        "movd 8(%0), %%mm3                  \n\t"
        "add $12, %0                        \n\t"
        "punpcklbw %%mm7, %%mm0             \n\t"
        "punpcklbw %%mm7, %%mm1             \n\t"
        "punpcklbw %%mm7, %%mm2             \n\t"
        "punpcklbw %%mm7, %%mm3             \n\t"
        "pmaddwd %%mm5, %%mm0               \n\t"
        "pmaddwd %%mm6, %%mm1               \n\t"
        "pmaddwd %%mm5, %%mm2               \n\t"
        "pmaddwd %%mm6, %%mm3               \n\t"
        "paddd %%mm1, %%mm0                 \n\t"
        "paddd %%mm3, %%mm2                 \n\t"
        "paddd %%mm4, %%mm0                 \n\t"
        "paddd %%mm4, %%mm2                 \n\t"
        "psrad $15, %%mm0                   \n\t"
        "psrad $15, %%mm2                   \n\t"
        "packssdw %%mm2, %%mm0              \n\t"
        "packuswb %%mm0, %%mm0              \n\t"
        "movd %%mm0, (%1, %%"REG_a")        \n\t"
        "add $4, %%"REG_a"                  \n\t"
        " js 1b                             \n\t"
        : "+r" (src)
        : "r" (dst+width), "g" ((x86_reg)-width)
        : "%"REG_a
    );
}

static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, long width, enum PixelFormat srcFormat)
{
    __asm__ volatile(
        "movq 24+%4, %%mm6                  \n\t"
        "mov %3, %%"REG_a"                  \n\t"
        "pxor %%mm7, %%mm7                  \n\t"
        "1:                                 \n\t"
        PREFETCH" 64(%0)                    \n\t"
        "movd  (%0), %%mm0                  \n\t"
        "movd 2(%0), %%mm1                  \n\t"
        "punpcklbw %%mm7, %%mm0             \n\t"
        "punpcklbw %%mm7, %%mm1             \n\t"
        "movq %%mm0, %%mm2                  \n\t"
        "movq %%mm1, %%mm3                  \n\t"
        "pmaddwd    %4, %%mm0               \n\t"
        "pmaddwd  8+%4, %%mm1               \n\t"
        "pmaddwd 16+%4, %%mm2               \n\t"
        "pmaddwd %%mm6, %%mm3               \n\t"
        "paddd %%mm1, %%mm0                 \n\t"
        "paddd %%mm3, %%mm2                 \n\t"

        "movd 6(%0), %%mm1                  \n\t"
        "movd 8(%0), %%mm3                  \n\t"
        "add $12, %0                        \n\t"
        "punpcklbw %%mm7, %%mm1             \n\t"
        "punpcklbw %%mm7, %%mm3             \n\t"
        "movq %%mm1, %%mm4                  \n\t"
        "movq %%mm3, %%mm5                  \n\t"
        "pmaddwd    %4, %%mm1               \n\t"
        "pmaddwd  8+%4, %%mm3               \n\t"
        "pmaddwd 16+%4, %%mm4               \n\t"
        "pmaddwd %%mm6, %%mm5               \n\t"
        "paddd %%mm3, %%mm1                 \n\t"
        "paddd %%mm5, %%mm4                 \n\t"

        "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3    \n\t"
        "paddd %%mm3, %%mm0                 \n\t"
        "paddd %%mm3, %%mm2                 \n\t"
        "paddd %%mm3, %%mm1                 \n\t"
        "paddd %%mm3, %%mm4                 \n\t"
        "psrad $15, %%mm0                   \n\t"
        "psrad $15, %%mm2                   \n\t"
        "psrad $15, %%mm1                   \n\t"
        "psrad $15, %%mm4                   \n\t"
        "packssdw %%mm1, %%mm0              \n\t"
        "packssdw %%mm4, %%mm2              \n\t"
        "packuswb %%mm0, %%mm0              \n\t"
        "packuswb %%mm2, %%mm2              \n\t"
        "movd %%mm0, (%1, %%"REG_a")        \n\t"
        "movd %%mm2, (%2, %%"REG_a")        \n\t"
        "add $4, %%"REG_a"                  \n\t"
        " js 1b                             \n\t"
        : "+r" (src)
        : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
        : "%"REG_a
    );
}

static inline void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
#else
    int i;
    for (i=0; i<width; i++) {
        int b= src[3*i + 0];
        int g= src[3*i + 1];
        int r= src[3*i + 2];

        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
    }
#endif /* COMPILE_TEMPLATE_MMX */
}

static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
#else
    int i;
    for (i=0; i<width; i++) {
        int b= src1[3*i + 0];
        int g= src1[3*i + 1];
        int r= src1[3*i + 2];

        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
    }
#endif /* COMPILE_TEMPLATE_MMX */
    assert(src1 == src2);
}
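
/* Bias check (editor's derivation): with coefficients scaled by
 * 2^RGB2YUV_SHIFT, the chroma rounding term 257<<(RGB2YUV_SHIFT-1) equals
 * (128 + 0.5)<<RGB2YUV_SHIFT, i.e. the +128 chroma offset plus one half for
 * round-to-nearest; the luma term 33<<(RGB2YUV_SHIFT-1) is likewise
 * (16 + 0.5)<<RGB2YUV_SHIFT for the +16 luma offset. */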

static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
    int i;
    for (i=0; i<width; i++) {
        int b= src1[6*i + 0] + src1[6*i + 3];
        int g= src1[6*i + 1] + src1[6*i + 4];
        int r= src1[6*i + 2] + src1[6*i + 5];

        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
    }
    assert(src1 == src2);
}
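
/* The _half variants sum two horizontally adjacent pixels, so r, g and b are
 * twice as large; shifting by RGB2YUV_SHIFT+1 and doubling the bias to
 * 257<<RGB2YUV_SHIFT gives the same result as averaging the pixels first. */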

static inline void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
#else
    int i;
    for (i=0; i<width; i++) {
        int r= src[3*i + 0];
        int g= src[3*i + 1];
        int b= src[3*i + 2];

        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
    }
#endif
}

static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    assert(src1==src2);
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
#else
    int i;
    assert(src1==src2);
    for (i=0; i<width; i++) {
        int r= src1[3*i + 0];
        int g= src1[3*i + 1];
        int b= src1[3*i + 2];

        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
    }
#endif
}

static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
    int i;
    assert(src1==src2);
    for (i=0; i<width; i++) {
        int r= src1[6*i + 0] + src1[6*i + 3];
        int g= src1[6*i + 1] + src1[6*i + 4];
        int b= src1[6*i + 2] + src1[6*i + 5];

        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
    }
}

// bilinear / bicubic scaling
static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc,
                                  const int16_t *filter, const int16_t *filterPos, long filterSize)
{
#if COMPILE_TEMPLATE_MMX
    assert(filterSize % 4 == 0 && filterSize>0);
    if (filterSize==4) { // Always true for upscaling, sometimes for down, too.
        x86_reg counter= -2*dstW;
        filter-= counter*2;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
#if defined(PIC)
            "push %%"REG_b"                 \n\t"
#endif
            "pxor %%mm7, %%mm7              \n\t"
            "push %%"REG_BP"                \n\t" // we use 7 regs here ...
            "mov %%"REG_a", %%"REG_BP"      \n\t"
            ASMALIGN(4)
            "1:                             \n\t"
            "movzwl (%2, %%"REG_BP"), %%eax \n\t"
            "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
            "movq (%1, %%"REG_BP", 4), %%mm1\n\t"
            "movq 8(%1, %%"REG_BP", 4), %%mm3\n\t"
            "movd (%3, %%"REG_a"), %%mm0    \n\t"
            "movd (%3, %%"REG_b"), %%mm2    \n\t"
            "punpcklbw %%mm7, %%mm0         \n\t"
            "punpcklbw %%mm7, %%mm2         \n\t"
            "pmaddwd %%mm1, %%mm0           \n\t"
            "pmaddwd %%mm2, %%mm3           \n\t"
            "movq %%mm0, %%mm4              \n\t"
            "punpckldq %%mm3, %%mm0         \n\t"
            "punpckhdq %%mm3, %%mm4         \n\t"
            "paddd %%mm4, %%mm0             \n\t"
            "psrad $7, %%mm0                \n\t"
            "packssdw %%mm0, %%mm0          \n\t"
            "movd %%mm0, (%4, %%"REG_BP")   \n\t"
            "add $4, %%"REG_BP"             \n\t"
            " jnc 1b                        \n\t"

            "pop %%"REG_BP"                 \n\t"
#if defined(PIC)
            "pop %%"REG_b"                  \n\t"
#endif
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
            : "%"REG_b
#endif
        );
    } else if (filterSize==8) {
        x86_reg counter= -2*dstW;
        filter-= counter*4;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
#if defined(PIC)
            "push %%"REG_b"                 \n\t"
#endif
            "pxor %%mm7, %%mm7              \n\t"
            "push %%"REG_BP"                \n\t" // we use 7 regs here ...
            "mov %%"REG_a", %%"REG_BP"      \n\t"
            ASMALIGN(4)
            "1:                             \n\t"
            "movzwl (%2, %%"REG_BP"), %%eax \n\t"
            "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
            "movq (%1, %%"REG_BP", 8), %%mm1\n\t"
            "movq 16(%1, %%"REG_BP", 8), %%mm3\n\t"
            "movd (%3, %%"REG_a"), %%mm0    \n\t"
            "movd (%3, %%"REG_b"), %%mm2    \n\t"
            "punpcklbw %%mm7, %%mm0         \n\t"
            "punpcklbw %%mm7, %%mm2         \n\t"
            "pmaddwd %%mm1, %%mm0           \n\t"
            "pmaddwd %%mm2, %%mm3           \n\t"

            "movq 8(%1, %%"REG_BP", 8), %%mm1\n\t"
            "movq 24(%1, %%"REG_BP", 8), %%mm5\n\t"
            "movd 4(%3, %%"REG_a"), %%mm4   \n\t"
            "movd 4(%3, %%"REG_b"), %%mm2   \n\t"
            "punpcklbw %%mm7, %%mm4         \n\t"
            "punpcklbw %%mm7, %%mm2         \n\t"
            "pmaddwd %%mm1, %%mm4           \n\t"
            "pmaddwd %%mm2, %%mm5           \n\t"
            "paddd %%mm4, %%mm0             \n\t"
            "paddd %%mm5, %%mm3             \n\t"
            "movq %%mm0, %%mm4              \n\t"
            "punpckldq %%mm3, %%mm0         \n\t"
            "punpckhdq %%mm3, %%mm4         \n\t"
            "paddd %%mm4, %%mm0             \n\t"
            "psrad $7, %%mm0                \n\t"
            "packssdw %%mm0, %%mm0          \n\t"
            "movd %%mm0, (%4, %%"REG_BP")   \n\t"
            "add $4, %%"REG_BP"             \n\t"
            " jnc 1b                        \n\t"

            "pop %%"REG_BP"                 \n\t"
#if defined(PIC)
            "pop %%"REG_b"                  \n\t"
#endif
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
            : "%"REG_b
#endif
        );
    } else {
        const uint8_t *offset = src+filterSize;
        x86_reg counter= -2*dstW;
        //filter-= counter*filterSize/2;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
            "pxor %%mm7, %%mm7              \n\t"
            ASMALIGN(4)
            "1:                             \n\t"
            "mov %2, %%"REG_c"              \n\t"
            "movzwl (%%"REG_c", %0), %%eax  \n\t"
            "movzwl 2(%%"REG_c", %0), %%edx \n\t"
            "mov %5, %%"REG_c"              \n\t"
            "pxor %%mm4, %%mm4              \n\t"
            "pxor %%mm5, %%mm5              \n\t"
            "2:                             \n\t"
            "movq (%1), %%mm1               \n\t"
            "movq (%1, %6), %%mm3           \n\t"
            "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
            "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm0         \n\t"
            "punpcklbw %%mm7, %%mm2         \n\t"
            "pmaddwd %%mm1, %%mm0           \n\t"
            "pmaddwd %%mm2, %%mm3           \n\t"
            "paddd %%mm3, %%mm5             \n\t"
            "paddd %%mm0, %%mm4             \n\t"
            "add $8, %1                     \n\t"
            "add $4, %%"REG_c"              \n\t"
            "cmp %4, %%"REG_c"              \n\t"
            " jb 2b                         \n\t"
            "add %6, %1                     \n\t"
            "movq %%mm4, %%mm0              \n\t"
            "punpckldq %%mm5, %%mm4         \n\t"
            "punpckhdq %%mm5, %%mm0         \n\t"
            "paddd %%mm0, %%mm4             \n\t"
            "psrad $7, %%mm4                \n\t"
            "packssdw %%mm4, %%mm4          \n\t"
            "mov %3, %%"REG_a"              \n\t"
            "movd %%mm4, (%%"REG_a", %0)    \n\t"
            "add $4, %0                     \n\t"
            " jnc 1b                        \n\t"

            : "+r" (counter), "+r" (filter)
            : "m" (filterPos), "m" (dst), "m"(offset),
              "m" (src), "r" ((x86_reg)filterSize*2)
            : "%"REG_a, "%"REG_c, "%"REG_d
        );
    }
#else
#if COMPILE_TEMPLATE_ALTIVEC
    hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
#else
    int i;
    for (i=0; i<dstW; i++) {
        int j;
        int srcPos= filterPos[i];
        int val=0;
        for (j=0; j<filterSize; j++) {
            val += ((int)src[srcPos + j])*filter[filterSize*i + j];
        }
        dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
    }
#endif /* COMPILE_ALTIVEC */
#endif /* COMPILE_MMX */
}
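
/* Output scale (editor's note, assuming the usual swscale convention that
 * horizontal filter coefficients sum to roughly 2^14): an 8-bit sample times
 * a 14-bit coefficient sum is about 22 bits, so val>>7 leaves the 15-bit
 * intermediate samples used throughout this file, and the FFMIN clip catches
 * the overshoot that bicubic (negative-lobe) filters can produce. */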

//FIXME all pal and rgb srcFormats could do this conversion as well
//FIXME all scalers more complex than bilinear could do half of this transform
static void RENAME(chrRangeToJpeg)(uint16_t *dst, int width)
{
    int i;
    for (i = 0; i < width; i++) {
        dst[i     ] = (FFMIN(dst[i     ],30775)*4663 - 9289992)>>12; //-264
        dst[i+VOFW] = (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
    }
}
static void RENAME(chrRangeFromJpeg)(uint16_t *dst, int width)
{
    int i;
    for (i = 0; i < width; i++) {
        dst[i     ] = (dst[i     ]*1799 + 4081085)>>11; //1469
        dst[i+VOFW] = (dst[i+VOFW]*1799 + 4081085)>>11; //1469
    }
}
static void RENAME(lumRangeToJpeg)(uint16_t *dst, int width)
{
    int i;
    for (i = 0; i < width; i++)
        dst[i] = (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
}
static void RENAME(lumRangeFromJpeg)(uint16_t *dst, int width)
{
    int i;
    for (i = 0; i < width; i++)
        dst[i] = (dst[i]*14071 + 33561947)>>14;
}
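
/* Where the magic constants come from (editor's derivation, up to rounding):
 * samples here are 15-bit (8-bit pixel <<7). Limited->full luma is
 *     y' = (y - (16<<7)) * 255/219,   with 255/219 ~= 19077/2^14,
 * which matches (FFMIN(dst[i],30189)*19077 - 39057361)>>14; the FFMIN clamp
 * keeps super-white input from overflowing. Chroma scales by
 * 255/224 ~= 4663/2^12 around its 128<<7 center, and the inverse functions
 * use 14071/2^14 ~= 219/255 and 1799/2^11 ~= 224/255. */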

#define FAST_BILINEAR_X86 \
    "subl    %%edi, %%esi    \n\t" /*  src[xx+1] - src[xx] */                   \
    "imull   %%ecx, %%esi    \n\t" /* (src[xx+1] - src[xx])*xalpha */           \
    "shll      $16, %%edi    \n\t"                                              \
    "addl    %%edi, %%esi    \n\t" /* src[xx+1]*xalpha + src[xx]*(1-xalpha) */  \
    "mov        %1, %%"REG_D"\n\t"                                              \
    "shrl       $9, %%esi    \n\t"                                              \

static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
                                        long dstWidth, const uint8_t *src, int srcW,
                                        int xInc)
{
#if ARCH_X86 && CONFIG_GPL
#if COMPILE_TEMPLATE_MMX2
    int32_t *filterPos = c->hLumFilterPos;
    int16_t *filter    = c->hLumFilter;
    int canMMX2BeUsed  = c->canMMX2BeUsed;
    void *mmx2FilterCode= c->lumMmx2FilterCode;
    int i;
#if defined(PIC)
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif

    if (canMMX2BeUsed) {
        __asm__ volatile(
#if defined(PIC)
            "mov               %%"REG_b", %5        \n\t"
#endif
            "pxor                  %%mm7, %%mm7     \n\t"
            "mov                      %0, %%"REG_c" \n\t"
            "mov                      %1, %%"REG_D" \n\t"
            "mov                      %2, %%"REG_d" \n\t"
            "mov                      %3, %%"REG_b" \n\t"
            "xor               %%"REG_a", %%"REG_a" \n\t" // i
            PREFETCH"        (%%"REG_c")            \n\t"
            PREFETCH"      32(%%"REG_c")            \n\t"
            PREFETCH"      64(%%"REG_c")            \n\t"

#if ARCH_X86_64

#define CALL_MMX2_FILTER_CODE \
            "movl            (%%"REG_b"), %%esi     \n\t"\
            "call                    *%4            \n\t"\
            "movl (%%"REG_b", %%"REG_a"), %%esi     \n\t"\
            "add               %%"REG_S", %%"REG_c" \n\t"\
            "add               %%"REG_a", %%"REG_D" \n\t"\
            "xor               %%"REG_a", %%"REG_a" \n\t"\

#else

#define CALL_MMX2_FILTER_CODE \
            "movl (%%"REG_b"), %%esi                \n\t"\
            "call         *%4                       \n\t"\
            "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
            "add %%"REG_a", %%"REG_D"               \n\t"\
            "xor %%"REG_a", %%"REG_a"               \n\t"\

#endif /* ARCH_X86_64 */

            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE

#if defined(PIC)
            "mov                      %5, %%"REG_b" \n\t"
#endif
            :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
               "m" (mmx2FilterCode)
#if defined(PIC)
              ,"m" (ebxsave)
#endif
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
             ,"%"REG_b
#endif
        );
        for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
    } else {
#endif /* COMPILE_TEMPLATE_MMX2 */
    x86_reg xInc_shr16 = xInc >> 16;
    uint16_t xInc_mask = xInc & 0xffff;
    //NO MMX just normal asm ...
    __asm__ volatile(
        "xor %%"REG_a", %%"REG_a"            \n\t" // i
        "xor %%"REG_d", %%"REG_d"            \n\t" // xx
        "xorl    %%ecx, %%ecx                \n\t" // xalpha
        ASMALIGN(4)
        "1:                                  \n\t"
        "movzbl  (%0, %%"REG_d"), %%edi      \n\t" //src[xx]
        "movzbl 1(%0, %%"REG_d"), %%esi      \n\t" //src[xx+1]
        FAST_BILINEAR_X86
        "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
        "addw %4, %%cx                       \n\t" //xalpha += xInc&0xFFFF
        "adc  %3, %%"REG_d"                  \n\t" //xx+= xInc>>16 + carry

        "movzbl  (%0, %%"REG_d"), %%edi      \n\t" //src[xx]
        "movzbl 1(%0, %%"REG_d"), %%esi      \n\t" //src[xx+1]
        FAST_BILINEAR_X86
        "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t"
        "addw %4, %%cx                       \n\t" //xalpha += xInc&0xFFFF
        "adc  %3, %%"REG_d"                  \n\t" //xx+= xInc>>16 + carry

        "add $2, %%"REG_a"                   \n\t"
        "cmp %2, %%"REG_a"                   \n\t"
        " jb 1b                              \n\t"

        :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
        : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
    );
#if COMPILE_TEMPLATE_MMX2
    } //if MMX2 can't be used
#endif
#else
    int i;
    unsigned int xpos=0;
    for (i=0;i<dstWidth;i++) {
        register unsigned int xx=xpos>>16;
        register unsigned int xalpha=(xpos&0xFFFF)>>9;
        dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
        xpos+=xInc;
    }
#endif /* ARCH_X86 */
}
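
/* The C fallback mirrors the asm (editor's note): xpos is a 16.16 fixed-point
 * source position, xx its integer part and xalpha the top 7 bits of the
 * fraction, so src[xx]*128 + (src[xx+1]-src[xx])*xalpha is a 7-bit weighted
 * blend producing the same 15-bit output as FAST_BILINEAR_X86. */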

// *** horizontal scale Y line to temp buffer
static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src, int srcW, int xInc,
                                   const int16_t *hLumFilter,
                                   const int16_t *hLumFilterPos, int hLumFilterSize,
                                   uint8_t *formatConvBuffer,
                                   uint32_t *pal, int isAlpha)
{
    void (*toYV12)(uint8_t *, const uint8_t *, long, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
    void (*convertRange)(uint16_t *, int) = isAlpha ? NULL : c->lumConvertRange;

    src += isAlpha ? c->alpSrcOffset : c->lumSrcOffset;

    if (toYV12) {
        toYV12(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }

    if (!c->hyscale_fast) {
        c->hScale(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
    } else { // fast bilinear upscale / crap downscale
        c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
    }

    if (convertRange)
        convertRange(dst, dstWidth);
}

static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
                                        long dstWidth, const uint8_t *src1,
                                        const uint8_t *src2, int srcW, int xInc)
{
#if ARCH_X86 && CONFIG_GPL
#if COMPILE_TEMPLATE_MMX2
    int32_t *filterPos = c->hChrFilterPos;
    int16_t *filter    = c->hChrFilter;
    int canMMX2BeUsed  = c->canMMX2BeUsed;
    void *mmx2FilterCode= c->chrMmx2FilterCode;
    int i;
#if defined(PIC)
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif

    if (canMMX2BeUsed) {
        __asm__ volatile(
#if defined(PIC)
            "mov          %%"REG_b", %6         \n\t"
#endif
            "pxor             %%mm7, %%mm7      \n\t"
            "mov                 %0, %%"REG_c"  \n\t"
            "mov                 %1, %%"REG_D"  \n\t"
            "mov                 %2, %%"REG_d"  \n\t"
            "mov                 %3, %%"REG_b"  \n\t"
            "xor          %%"REG_a", %%"REG_a"  \n\t" // i
            PREFETCH"   (%%"REG_c")             \n\t"
            PREFETCH" 32(%%"REG_c")             \n\t"
            PREFETCH" 64(%%"REG_c")             \n\t"

            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            "xor          %%"REG_a", %%"REG_a"  \n\t" // i
            "mov                 %5, %%"REG_c"  \n\t" // src
            "mov                 %1, %%"REG_D"  \n\t" // buf1
            "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t"
            PREFETCH"   (%%"REG_c")             \n\t"
            PREFETCH" 32(%%"REG_c")             \n\t"
            PREFETCH" 64(%%"REG_c")             \n\t"

            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE

#if defined(PIC)
            "mov %6, %%"REG_b"                  \n\t"
#endif
            :: "m" (src1), "m" (dst), "m" (filter), "m" (filterPos),
               "m" (mmx2FilterCode), "m" (src2)
#if defined(PIC)
              ,"m" (ebxsave)
#endif
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
             ,"%"REG_b
#endif
        );
        for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
            dst[i] = src1[srcW-1]*128;
            dst[i+VOFW] = src2[srcW-1]*128;
        }
    } else {
#endif /* COMPILE_TEMPLATE_MMX2 */
    x86_reg xInc_shr16 = (x86_reg) (xInc >> 16);
    uint16_t xInc_mask = xInc & 0xffff;
    __asm__ volatile(
        "xor %%"REG_a", %%"REG_a"            \n\t" // i
        "xor %%"REG_d", %%"REG_d"            \n\t" // xx
        "xorl    %%ecx, %%ecx                \n\t" // xalpha
        ASMALIGN(4)
        "1:                                  \n\t"
        "mov        %0, %%"REG_S"            \n\t"
        "movzbl  (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
        "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
        FAST_BILINEAR_X86
        "movw %%si, (%%"REG_D", %%"REG_a", 2)  \n\t"

        "movzbl  (%5, %%"REG_d"), %%edi      \n\t" //src[xx]
        "movzbl 1(%5, %%"REG_d"), %%esi      \n\t" //src[xx+1]
        FAST_BILINEAR_X86
        "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t"

        "addw %4, %%cx                       \n\t" //xalpha += xInc&0xFFFF
        "adc  %3, %%"REG_d"                  \n\t" //xx+= xInc>>16 + carry
        "add $1, %%"REG_a"                   \n\t"
        "cmp %2, %%"REG_a"                   \n\t"
        " jb 1b                              \n\t"

/* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
   which is needed to support GCC 4.0. */
#if ARCH_X86_64 && AV_GCC_VERSION_AT_LEAST(3,4)
        :: "m" (src1), "m" (dst), "g" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
#else
        :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
#endif /* ARCH_X86_64 */
           "r" (src2)
        : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
    );
#if COMPILE_TEMPLATE_MMX2
    } //if MMX2 can't be used
#endif
#else
    int i;
    unsigned int xpos=0;
    for (i=0;i<dstWidth;i++) {
        register unsigned int xx=xpos>>16;
        register unsigned int xalpha=(xpos&0xFFFF)>>9;
        dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
        dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
        /* slower
        dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
        dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
        */
        xpos+=xInc;
    }
#endif /* ARCH_X86 */
}
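
/* Note on the chroma blend above (editor's note): for a 7-bit xalpha,
 * xalpha^127 == 127-xalpha, so the two weights sum to 127 instead of 128;
 * this saves an instruction at the cost of output scaled by 127/128, which
 * the commented-out "slower" form avoids. */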

inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src1, const uint8_t *src2,
                                   int srcW, int xInc, const int16_t *hChrFilter,
                                   const int16_t *hChrFilterPos, int hChrFilterSize,
                                   uint8_t *formatConvBuffer,
                                   uint32_t *pal)
{
    src1 += c->chrSrcOffset;
    src2 += c->chrSrcOffset;

    if (c->chrToYV12) {
        c->chrToYV12(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }

    if (!c->hcscale_fast) {
        c->hScale(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
        c->hScale(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
    } else { // fast bilinear upscale / crap downscale
        c->hcscale_fast(c, dst, dstWidth, src1, src2, srcW, xInc);
    }

    if (c->chrConvertRange)
        c->chrConvertRange(dst, dstWidth);
}

#define DEBUG_SWSCALE_BUFFERS 0
#define DEBUG_BUFFERS(...) if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__)

static int RENAME(swScale)(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY,
                           int srcSliceH, uint8_t* dst[], int dstStride[])
{
    /* load a few things into local vars to make the code more readable and faster */
    const int srcW= c->srcW;
    const int dstW= c->dstW;
    const int dstH= c->dstH;
    const int chrDstW= c->chrDstW;
    const int chrSrcW= c->chrSrcW;
    const int lumXInc= c->lumXInc;
    const int chrXInc= c->chrXInc;
    const enum PixelFormat dstFormat= c->dstFormat;
    const int flags= c->flags;
    int16_t *vLumFilterPos= c->vLumFilterPos;
    int16_t *vChrFilterPos= c->vChrFilterPos;
    int16_t *hLumFilterPos= c->hLumFilterPos;
    int16_t *hChrFilterPos= c->hChrFilterPos;
    int16_t *vLumFilter= c->vLumFilter;
    int16_t *vChrFilter= c->vChrFilter;
    int16_t *hLumFilter= c->hLumFilter;
    int16_t *hChrFilter= c->hChrFilter;
    int32_t *lumMmxFilter= c->lumMmxFilter;
    int32_t *chrMmxFilter= c->chrMmxFilter;
    int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
    const int vLumFilterSize= c->vLumFilterSize;
    const int vChrFilterSize= c->vChrFilterSize;
    const int hLumFilterSize= c->hLumFilterSize;
    const int hChrFilterSize= c->hChrFilterSize;
    int16_t **lumPixBuf= c->lumPixBuf;
    int16_t **chrPixBuf= c->chrPixBuf;
    int16_t **alpPixBuf= c->alpPixBuf;
    const int vLumBufSize= c->vLumBufSize;
    const int vChrBufSize= c->vChrBufSize;
    uint8_t *formatConvBuffer= c->formatConvBuffer;
    const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
    const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
    int lastDstY;
    uint32_t *pal=c->pal_yuv;

    /* vars which will change and which we need to store back in the context */
    int dstY= c->dstY;
    int lumBufIndex= c->lumBufIndex;
    int chrBufIndex= c->chrBufIndex;
    int lastInLumBuf= c->lastInLumBuf;
    int lastInChrBuf= c->lastInChrBuf;

    if (isPacked(c->srcFormat)) {
        src[0]=
        src[1]=
        src[2]=
        src[3]= src[0];
        srcStride[0]=
        srcStride[1]=
        srcStride[2]=
        srcStride[3]= srcStride[0];
    }
    srcStride[1]<<= c->vChrDrop;
    srcStride[2]<<= c->vChrDrop;
2616 DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
2617 src
[0], srcStride
[0], src
[1], srcStride
[1], src
[2], srcStride
[2], src
[3], srcStride
[3],
2618 dst
[0], dstStride
[0], dst
[1], dstStride
[1], dst
[2], dstStride
[2], dst
[3], dstStride
[3]);
2619 DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
2620 srcSliceY
, srcSliceH
, dstY
, dstH
);
2621 DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
2622 vLumFilterSize
, vLumBufSize
, vChrFilterSize
, vChrBufSize
);

    if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
        static int warnedAlready=0; //FIXME move this into the context perhaps
        if (flags & SWS_PRINT_INFO && !warnedAlready) {
            av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
                   "         ->cannot do aligned memory accesses anymore\n");
            warnedAlready=1;
        }
    }

    /* Note the user might start scaling the picture in the middle so this
       will not get executed. This is not really intended but works
       currently, so people might do it. */
    if (srcSliceY ==0) {
        lumBufIndex=-1;
        chrBufIndex=-1;
        dstY=0;
        lastInLumBuf= -1;
        lastInChrBuf= -1;
    }

    lastDstY= dstY;

    for (;dstY < dstH; dstY++) {
        unsigned char *dest =dst[0]+dstStride[0]*dstY;
        const int chrDstY= dstY>>c->chrDstVSubSample;
        unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
        unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
        unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL;

        const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
        const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
        int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
        int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
        int enough_lines;

        //handle holes (FAST_BILINEAR & weird filters)
        if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
        if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
        assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
        assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);

        DEBUG_BUFFERS("dstY: %d\n", dstY);
        DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
                      firstLumSrcY, lastLumSrcY, lastInLumBuf);
        DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
                      firstChrSrcY, lastChrSrcY, lastInChrBuf);

        // Do we have enough lines in this slice to output the dstY line
        enough_lines = lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);
        if (!enough_lines) {
            lastLumSrcY = srcSliceY + srcSliceH - 1;
            lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
            DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
                          lastLumSrcY, lastChrSrcY);
        }

        //Do horizontal scaling
        while(lastInLumBuf < lastLumSrcY) {
            const uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
            const uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
            lumBufIndex++;
            assert(lumBufIndex < 2*vLumBufSize);
            assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
            assert(lastInLumBuf + 1 - srcSliceY >= 0);
            RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
                            hLumFilter, hLumFilterPos, hLumFilterSize,
                            formatConvBuffer,
                            pal, 0);
            if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
                RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc,
                                hLumFilter, hLumFilterPos, hLumFilterSize,
                                formatConvBuffer,
                                pal, 1);
            lastInLumBuf++;
            DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
                          lumBufIndex, lastInLumBuf);
        }
        while(lastInChrBuf < lastChrSrcY) {
            const uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
            const uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
            chrBufIndex++;
            assert(chrBufIndex < 2*vChrBufSize);
            assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
            assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
            //FIXME replace parameters through context struct (some at least)

            if (c->needs_hcscale)
                RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
                                hChrFilter, hChrFilterPos, hChrFilterSize,
                                formatConvBuffer,
                                pal);
            lastInChrBuf++;
            DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
                          chrBufIndex, lastInChrBuf);
        }
        //wrap buf index around to stay inside the ring buffer
        if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
        if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
        if (!enough_lines)
            break; //we can't output a dstY line so let's try with the next slice
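
        /* Editor's note: the wrap above works without per-line bounds checks
         * later because the pixel-buffer pointer arrays are assumed to be
         * allocated with their entries duplicated (entry i+vLumBufSize
         * aliasing entry i), letting lumSrcPtr/chrSrcPtr below index "past
         * the end" of the ring. */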
#if COMPILE_TEMPLATE_MMX
        c->blueDither= ff_dither8[dstY&1];
        if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
            c->greenDither= ff_dither8[dstY&1];
        else
            c->greenDither= ff_dither4[dstY&1];
        c->redDither= ff_dither8[(dstY+1)&1];
#endif
        if (dstY < dstH-2) {
            const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            const int16_t **chrSrcPtr= (const int16_t **) chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
#if COMPILE_TEMPLATE_MMX
            int i;
            if (flags & SWS_ACCURATE_RND) {
                int s= APCK_SIZE / 8;
                for (i=0; i<vLumFilterSize; i+=2) {
                    *(const void**)&lumMmxFilter[s*i              ]= lumSrcPtr[i  ];
                    *(const void**)&lumMmxFilter[s*i+APCK_PTR2/4  ]= lumSrcPtr[i+(vLumFilterSize>1)];
                    lumMmxFilter[s*i+APCK_COEF/4  ]=
                    lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i    ]
                        + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
                    if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
                        *(const void**)&alpMmxFilter[s*i              ]= alpSrcPtr[i  ];
                        *(const void**)&alpMmxFilter[s*i+APCK_PTR2/4  ]= alpSrcPtr[i+(vLumFilterSize>1)];
                        alpMmxFilter[s*i+APCK_COEF/4  ]=
                        alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4  ];
                    }
                }
                for (i=0; i<vChrFilterSize; i+=2) {
                    *(const void**)&chrMmxFilter[s*i              ]= chrSrcPtr[i  ];
                    *(const void**)&chrMmxFilter[s*i+APCK_PTR2/4  ]= chrSrcPtr[i+(vChrFilterSize>1)];
                    chrMmxFilter[s*i+APCK_COEF/4  ]=
                    chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i    ]
                        + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
                }
            } else {
                for (i=0; i<vLumFilterSize; i++) {
                    lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
                    lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
                    lumMmxFilter[4*i+2]=
                    lumMmxFilter[4*i+3]=
                        ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
                    if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
                        alpMmxFilter[4*i+0]= (int32_t)alpSrcPtr[i];
                        alpMmxFilter[4*i+1]= (uint64_t)alpSrcPtr[i] >> 32;
                        alpMmxFilter[4*i+2]=
                        alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
                    }
                }
                for (i=0; i<vChrFilterSize; i++) {
                    chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
                    chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
                    chrMmxFilter[4*i+2]=
                    chrMmxFilter[4*i+3]=
                        ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
                }
            }
#endif
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                c->yuv2nv12X(c,
                             vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                             vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                             dest, uDest, dstW, chrDstW, dstFormat);
            } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                if (is16BPS(dstFormat)) {
                    yuv2yuvX16inC(
                                  vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                  vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                  alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
                                  dstFormat);
                } else if (vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
                    const int16_t *lumBuf = lumSrcPtr[0];
                    const int16_t *chrBuf= chrSrcPtr[0];
                    const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
                    c->yuv2yuv1(c, lumBuf, chrBuf, alpBuf, dest, uDest, vDest, aDest, dstW, chrDstW);
                } else { //General YV12
                    c->yuv2yuvX(c,
                                vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
                }
            } else {
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if (vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
                    int chrAlpha= vChrFilter[2*dstY+1];
                    if(flags & SWS_FULL_CHR_H_INT) {
                        yuv2rgbXinC_full(c, //FIXME write a packed1_full function
                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                         alpSrcPtr, dest, dstW, dstY);
                    } else {
                        c->yuv2packed1(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
                                       alpPixBuf ? *alpSrcPtr : NULL,
                                       dest, dstW, chrAlpha, dstFormat, flags, dstY);
                    }
                } else if (vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
                    int lumAlpha= vLumFilter[2*dstY+1];
                    int chrAlpha= vChrFilter[2*dstY+1];
                    lumMmxFilter[2]=
                    lumMmxFilter[3]= vLumFilter[2*dstY   ]*0x10001;
                    chrMmxFilter[2]=
                    chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
                    if(flags & SWS_FULL_CHR_H_INT) {
                        yuv2rgbXinC_full(c, //FIXME write a packed2_full function
                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                         alpSrcPtr, dest, dstW, dstY);
                    } else {
                        c->yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
                                       alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL,
                                       dest, dstW, lumAlpha, chrAlpha, dstY);
                    }
                } else { //general RGB
                    if(flags & SWS_FULL_CHR_H_INT) {
                        yuv2rgbXinC_full(c,
                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                         alpSrcPtr, dest, dstW, dstY);
                    } else {
                        c->yuv2packedX(c,
                                       vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                       vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                       alpSrcPtr, dest, dstW, dstY);
                    }
                }
            }
        } else { // hmm looks like we can't use MMX here without overwriting this array's tail
            const int16_t **lumSrcPtr= (const int16_t **)lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            const int16_t **chrSrcPtr= (const int16_t **)chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                yuv2nv12XinC(
                             vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                             vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                             dest, uDest, dstW, chrDstW, dstFormat);
            } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                if (is16BPS(dstFormat)) {
                    yuv2yuvX16inC(
                                  vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                  vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                  alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
                                  dstFormat);
                } else {
                    yuv2yuvXinC(
                                vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
                }
            } else {
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if(flags & SWS_FULL_CHR_H_INT) {
                    yuv2rgbXinC_full(c,
                                     vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                     vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                     alpSrcPtr, dest, dstW, dstY);
                } else {
                    yuv2packedXinC(c,
                                   vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                   vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                   alpSrcPtr, dest, dstW, dstY);
                }
            }
        }
    }

    if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
        fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);

#if COMPILE_TEMPLATE_MMX
    if (flags & SWS_CPU_CAPS_MMX2)  __asm__ volatile("sfence":::"memory");
    /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
    if (flags & SWS_CPU_CAPS_3DNOW) __asm__ volatile("femms" :::"memory");
    else                            __asm__ volatile("emms"  :::"memory");
#endif
    /* store changed local vars back in the context */
    c->dstY= dstY;
    c->lumBufIndex= lumBufIndex;
    c->chrBufIndex= chrBufIndex;
    c->lastInLumBuf= lastInLumBuf;
    c->lastInChrBuf= lastInChrBuf;

    return dstY - lastDstY;
}

static void RENAME(sws_init_swScale)(SwsContext *c)
{
    enum PixelFormat srcFormat = c->srcFormat;

    c->yuv2nv12X    = RENAME(yuv2nv12X   );
    c->yuv2yuv1     = RENAME(yuv2yuv1    );
    c->yuv2yuvX     = RENAME(yuv2yuvX    );
    c->yuv2packed1  = RENAME(yuv2packed1 );
    c->yuv2packed2  = RENAME(yuv2packed2 );
    c->yuv2packedX  = RENAME(yuv2packedX );

    c->hScale       = RENAME(hScale      );

#if COMPILE_TEMPLATE_MMX
    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
    if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed)
#else
    if (c->flags & SWS_FAST_BILINEAR)
#endif
    {
        c->hyscale_fast = RENAME(hyscale_fast);
        c->hcscale_fast = RENAME(hcscale_fast);
    }

    c->chrToYV12 = NULL;
    switch(srcFormat) {
    case PIX_FMT_YUYV422  : c->chrToYV12 = RENAME(yuy2ToUV); break;
    case PIX_FMT_UYVY422  : c->chrToYV12 = RENAME(uyvyToUV); break;
    case PIX_FMT_NV12     : c->chrToYV12 = RENAME(nv12ToUV); break;
    case PIX_FMT_NV21     : c->chrToYV12 = RENAME(nv21ToUV); break;
    case PIX_FMT_RGB8     :
    case PIX_FMT_BGR8     :
    case PIX_FMT_PAL8     :
    case PIX_FMT_BGR4_BYTE:
    case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV; break;
    case PIX_FMT_YUV420P16BE:
    case PIX_FMT_YUV422P16BE:
    case PIX_FMT_YUV444P16BE: c->chrToYV12 = RENAME(BEToUV); break;
    case PIX_FMT_YUV420P16LE:
    case PIX_FMT_YUV422P16LE:
    case PIX_FMT_YUV444P16LE: c->chrToYV12 = RENAME(LEToUV); break;
    }
    if (c->chrSrcHSubSample) {
        switch(srcFormat) {
        case PIX_FMT_RGB48BE:
        case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48ToUV_half; break;
        case PIX_FMT_RGB32  :
        case PIX_FMT_RGB32_1: c->chrToYV12 = bgr32ToUV_half; break;
        case PIX_FMT_BGR24  : c->chrToYV12 = RENAME(bgr24ToUV_half); break;
        case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV_half; break;
        case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV_half; break;
        case PIX_FMT_BGR32  :
        case PIX_FMT_BGR32_1: c->chrToYV12 = rgb32ToUV_half; break;
        case PIX_FMT_RGB24  : c->chrToYV12 = RENAME(rgb24ToUV_half); break;
        case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV_half; break;
        case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV_half; break;
        }
    } else {
        switch(srcFormat) {
        case PIX_FMT_RGB48BE:
        case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48ToUV; break;
        case PIX_FMT_RGB32  :
        case PIX_FMT_RGB32_1: c->chrToYV12 = bgr32ToUV; break;
        case PIX_FMT_BGR24  : c->chrToYV12 = RENAME(bgr24ToUV); break;
        case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV; break;
        case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV; break;
        case PIX_FMT_BGR32  :
        case PIX_FMT_BGR32_1: c->chrToYV12 = rgb32ToUV; break;
        case PIX_FMT_RGB24  : c->chrToYV12 = RENAME(rgb24ToUV); break;
        case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV; break;
        case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV; break;
        }
    }

    c->lumToYV12 = NULL;
    c->alpToYV12 = NULL;
    switch (srcFormat) {
    case PIX_FMT_YUYV422  :
    case PIX_FMT_YUV420P16BE:
    case PIX_FMT_YUV422P16BE:
    case PIX_FMT_YUV444P16BE:
    case PIX_FMT_GRAY16BE : c->lumToYV12 = RENAME(yuy2ToY); break;
    case PIX_FMT_UYVY422  :
    case PIX_FMT_YUV420P16LE:
    case PIX_FMT_YUV422P16LE:
    case PIX_FMT_YUV444P16LE:
    case PIX_FMT_GRAY16LE : c->lumToYV12 = RENAME(uyvyToY); break;
    case PIX_FMT_BGR24    : c->lumToYV12 = RENAME(bgr24ToY); break;
    case PIX_FMT_BGR565   : c->lumToYV12 = bgr16ToY; break;
    case PIX_FMT_BGR555   : c->lumToYV12 = bgr15ToY; break;
    case PIX_FMT_RGB24    : c->lumToYV12 = RENAME(rgb24ToY); break;
    case PIX_FMT_RGB565   : c->lumToYV12 = rgb16ToY; break;
    case PIX_FMT_RGB555   : c->lumToYV12 = rgb15ToY; break;
    case PIX_FMT_RGB8     :
    case PIX_FMT_BGR8     :
    case PIX_FMT_PAL8     :
    case PIX_FMT_BGR4_BYTE:
    case PIX_FMT_RGB4_BYTE: c->lumToYV12 = palToY; break;
    case PIX_FMT_MONOBLACK: c->lumToYV12 = monoblack2Y; break;
    case PIX_FMT_MONOWHITE: c->lumToYV12 = monowhite2Y; break;
    case PIX_FMT_RGB32  :
    case PIX_FMT_RGB32_1: c->lumToYV12 = bgr32ToY; break;
    case PIX_FMT_BGR32  :
    case PIX_FMT_BGR32_1: c->lumToYV12 = rgb32ToY; break;
    case PIX_FMT_RGB48BE:
    case PIX_FMT_RGB48LE: c->lumToYV12 = rgb48ToY; break;
    }

    switch (srcFormat) {
    case PIX_FMT_RGB32  :
    case PIX_FMT_RGB32_1:
    case PIX_FMT_BGR32  :
    case PIX_FMT_BGR32_1: c->alpToYV12 = abgrToA; break;
    }

    switch (srcFormat) {
    case PIX_FMT_RGB32  :
    case PIX_FMT_BGR32  :
        c->alpSrcOffset = 3;
        break;
    case PIX_FMT_RGB32_1:
    case PIX_FMT_BGR32_1:
        c->lumSrcOffset = ALT32_CORR;
        c->chrSrcOffset = ALT32_CORR;
        break;
    case PIX_FMT_RGB48LE:
        c->lumSrcOffset = 1;
        c->chrSrcOffset = 1;
        c->alpSrcOffset = 1;
        break;
    }

    if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
        if (c->srcRange) {
            c->lumConvertRange = RENAME(lumRangeFromJpeg);
            c->chrConvertRange = RENAME(chrRangeFromJpeg);
        } else {
            c->lumConvertRange = RENAME(lumRangeToJpeg);
            c->chrConvertRange = RENAME(chrRangeToJpeg);
        }
    }

    if (!(isGray(srcFormat) || isGray(c->dstFormat) ||
          srcFormat == PIX_FMT_MONOBLACK || srcFormat == PIX_FMT_MONOWHITE))
        c->needs_hcscale = 1;
}