2 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
/* Per-CPU-capability instruction selection for this template instantiation.
 * PREFETCH: cache-prefetch mnemonic (3DNow! "prefetch", MMX2 "prefetchnta",
 * otherwise a no-op comment). PAVGB: byte average (MMX2 pavgb vs 3DNow!
 * pavgusb). MOVNTQ: non-temporal store on MMX2, plain movq elsewhere.
 * NOTE(review): this extract appears garbled — stray line numbers are embedded
 * in every line and the matching #else/#endif lines seem to have been dropped;
 * confirm against the upstream file before building. */
26 #if COMPILE_TEMPLATE_AMD3DNOW
27 #define PREFETCH "prefetch"
28 #elif COMPILE_TEMPLATE_MMX2
29 #define PREFETCH "prefetchnta"
31 #define PREFETCH " # nop"
/* Byte-average used by the bilinear paths; only defined when the target has it. */
34 #if COMPILE_TEMPLATE_MMX2
35 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
36 #elif COMPILE_TEMPLATE_AMD3DNOW
37 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
/* MOVNTQ bypasses the cache for streaming stores when movntq is available. */
40 #if COMPILE_TEMPLATE_MMX2
41 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
43 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
45 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
/* Pull in the AltiVec implementation when this template targets PPC. */
47 #if COMPILE_TEMPLATE_ALTIVEC
48 #include "ppc/swscale_altivec_template.c"
/* Vertical scale to planar YV12: walks the filter list at `offset`(%0)
 * (pairs of src-pointer / 16-bit coefficient), accumulates pmulhw-weighted
 * source rows on top of the VROUNDER bias, shifts >>3, packs to unsigned
 * bytes and streams 8 output pixels per iteration with MOVNTQ.
 * %0 = &c->redDither (base for context offsets), %1 = dest, %2 = width.
 * A NULL next-pointer ("test REG_S,REG_S") terminates the inner filter loop.
 * NOTE(review): the `__asm__ volatile(` opener, the "1:" loop label and the
 * conditional branches back to it appear to have been dropped from this
 * extract — confirm against upstream. */
51 #define YSCALEYUV2YV12X(x, offset, dest, width) \
53 "xor %%"REG_a", %%"REG_a" \n\t"\
54 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
55 "movq %%mm3, %%mm4 \n\t"\
56 "lea " offset "(%0), %%"REG_d" \n\t"\
57 "mov (%%"REG_d"), %%"REG_S" \n\t"\
58 ASMALIGN(4) /* FIXME Unroll? */\
60 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
61 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
62 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
63 "add $16, %%"REG_d" \n\t"\
64 "mov (%%"REG_d"), %%"REG_S" \n\t"\
65 "test %%"REG_S", %%"REG_S" \n\t"\
66 "pmulhw %%mm0, %%mm2 \n\t"\
67 "pmulhw %%mm0, %%mm5 \n\t"\
68 "paddw %%mm2, %%mm3 \n\t"\
69 "paddw %%mm5, %%mm4 \n\t"\
71 "psraw $3, %%mm3 \n\t"\
72 "psraw $3, %%mm4 \n\t"\
73 "packuswb %%mm4, %%mm3 \n\t"\
74 MOVNTQ(%%mm3, (%1, %%REGa))\
75 "add $8, %%"REG_a" \n\t"\
76 "cmp %2, %%"REG_a" \n\t"\
77 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
78 "movq %%mm3, %%mm4 \n\t"\
79 "lea " offset "(%0), %%"REG_d" \n\t"\
80 "mov (%%"REG_d"), %%"REG_S" \n\t"\
82 :: "r" (&c->redDither),\
83 "r" (dest), "g" ((x86_reg)width)\
84 : "%"REG_a, "%"REG_d, "%"REG_S\
/* Higher-precision variant of YSCALEYUV2YV12X: processes coefficient PAIRS
 * (APCK_* packed-filter layout) with pmaddwd into four 32-bit accumulators
 * (mm4-mm7), then >>16, packs to words, adds the VROUNDER bias, >>3 and
 * packs to bytes. Same operands: %0 = &c->redDither, %1 = dest, %2 = width.
 * NOTE(review): asm-statement opener, loop label and back-branches appear
 * missing from this extract — confirm against upstream. */
87 #define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
89 "lea " offset "(%0), %%"REG_d" \n\t"\
90 "xor %%"REG_a", %%"REG_a" \n\t"\
91 "pxor %%mm4, %%mm4 \n\t"\
92 "pxor %%mm5, %%mm5 \n\t"\
93 "pxor %%mm6, %%mm6 \n\t"\
94 "pxor %%mm7, %%mm7 \n\t"\
95 "mov (%%"REG_d"), %%"REG_S" \n\t"\
98 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
99 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
100 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
101 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
102 "movq %%mm0, %%mm3 \n\t"\
103 "punpcklwd %%mm1, %%mm0 \n\t"\
104 "punpckhwd %%mm1, %%mm3 \n\t"\
105 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
106 "pmaddwd %%mm1, %%mm0 \n\t"\
107 "pmaddwd %%mm1, %%mm3 \n\t"\
108 "paddd %%mm0, %%mm4 \n\t"\
109 "paddd %%mm3, %%mm5 \n\t"\
110 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
111 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
112 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
113 "test %%"REG_S", %%"REG_S" \n\t"\
114 "movq %%mm2, %%mm0 \n\t"\
115 "punpcklwd %%mm3, %%mm2 \n\t"\
116 "punpckhwd %%mm3, %%mm0 \n\t"\
117 "pmaddwd %%mm1, %%mm2 \n\t"\
118 "pmaddwd %%mm1, %%mm0 \n\t"\
119 "paddd %%mm2, %%mm6 \n\t"\
120 "paddd %%mm0, %%mm7 \n\t"\
122 "psrad $16, %%mm4 \n\t"\
123 "psrad $16, %%mm5 \n\t"\
124 "psrad $16, %%mm6 \n\t"\
125 "psrad $16, %%mm7 \n\t"\
126 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
127 "packssdw %%mm5, %%mm4 \n\t"\
128 "packssdw %%mm7, %%mm6 \n\t"\
129 "paddw %%mm0, %%mm4 \n\t"\
130 "paddw %%mm0, %%mm6 \n\t"\
131 "psraw $3, %%mm4 \n\t"\
132 "psraw $3, %%mm6 \n\t"\
133 "packuswb %%mm6, %%mm4 \n\t"\
134 MOVNTQ(%%mm4, (%1, %%REGa))\
135 "add $8, %%"REG_a" \n\t"\
136 "cmp %2, %%"REG_a" \n\t"\
137 "lea " offset "(%0), %%"REG_d" \n\t"\
138 "pxor %%mm4, %%mm4 \n\t"\
139 "pxor %%mm5, %%mm5 \n\t"\
140 "pxor %%mm6, %%mm6 \n\t"\
141 "pxor %%mm7, %%mm7 \n\t"\
142 "mov (%%"REG_d"), %%"REG_S" \n\t"\
144 :: "r" (&c->redDither),\
145 "r" (dest), "g" ((x86_reg)width)\
146 : "%"REG_a, "%"REG_d, "%"REG_S\
/* 1-tap (unscaled) vertical pass: copy one 16-bit source row to 8-bit output,
 * >>7 without rounding. Operands: %0 = src (16-bit), %1 = dest end, %2 = -width
 * (index counts up toward zero — presumably; the surrounding asm statement is
 * not visible in this extract, so TODO confirm operand binding upstream). */
149 #define YSCALEYUV2YV121 \
150 "mov %2, %%"REG_a" \n\t"\
151 ASMALIGN(4) /* FIXME Unroll? */\
153 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
154 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
155 "psraw $7, %%mm0 \n\t"\
156 "psraw $7, %%mm1 \n\t"\
157 "packuswb %%mm1, %%mm0 \n\t"\
158 MOVNTQ(%%mm0, (%1, %%REGa))\
159 "add $8, %%"REG_a" \n\t"\
/* Same as above but with proper rounding: mm7 = 0x0040 per word (built via
 * pcmpeqw/psrlw/psllw) is added (saturating) before the >>7. */
162 #define YSCALEYUV2YV121_ACCURATE \
163 "mov %2, %%"REG_a" \n\t"\
164 "pcmpeqw %%mm7, %%mm7 \n\t"\
165 "psrlw $15, %%mm7 \n\t"\
166 "psllw $6, %%mm7 \n\t"\
167 ASMALIGN(4) /* FIXME Unroll? */\
169 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
170 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
171 "paddsw %%mm7, %%mm0 \n\t"\
172 "paddsw %%mm7, %%mm1 \n\t"\
173 "psraw $7, %%mm0 \n\t"\
174 "psraw $7, %%mm1 \n\t"\
175 "packuswb %%mm1, %%mm0 \n\t"\
176 MOVNTQ(%%mm0, (%1, %%REGa))\
177 "add $8, %%"REG_a" \n\t"\
/* NOTE(review): the constraint list below is orphaned in this extract — the
 * macro/asm statement it belongs to (negative filter sizes, filter tables,
 * dest and dstW operands) is not visible here; confirm against upstream. */
181 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
182 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
183 "r" (dest), "m" (dstW_reg),
184 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
185 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
/* Multi-tap vertical chroma filter for the packed-output paths: U samples at
 * (src + index), V samples VOF bytes further; accumulates pmulhw-weighted
 * rows onto the VROUNDER bias into mm3 (U) / mm4 (V); NULL next-pointer ends
 * the filter loop. %0 = &c->redDither. NOTE(review): loop labels/branches
 * appear dropped from this extract. */
187 #define YSCALEYUV2PACKEDX_UV \
189 "xor %%"REG_a", %%"REG_a" \n\t"\
193 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
194 "mov (%%"REG_d"), %%"REG_S" \n\t"\
195 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
196 "movq %%mm3, %%mm4 \n\t"\
199 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
200 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
201 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
202 "add $16, %%"REG_d" \n\t"\
203 "mov (%%"REG_d"), %%"REG_S" \n\t"\
204 "pmulhw %%mm0, %%mm2 \n\t"\
205 "pmulhw %%mm0, %%mm5 \n\t"\
206 "paddw %%mm2, %%mm3 \n\t"\
207 "paddw %%mm5, %%mm4 \n\t"\
208 "test %%"REG_S", %%"REG_S" \n\t"\
/* Same filter loop for two luma (or alpha) halves; destination/coefficient
 * registers are parameterized so the macro serves both luma and alpha. */
211 #define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
212 "lea "offset"(%0), %%"REG_d" \n\t"\
213 "mov (%%"REG_d"), %%"REG_S" \n\t"\
214 "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
215 "movq "#dst1", "#dst2" \n\t"\
218 "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
219 "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
220 "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
221 "add $16, %%"REG_d" \n\t"\
222 "mov (%%"REG_d"), %%"REG_S" \n\t"\
223 "pmulhw "#coeff", "#src1" \n\t"\
224 "pmulhw "#coeff", "#src2" \n\t"\
225 "paddw "#src1", "#dst1" \n\t"\
226 "paddw "#src2", "#dst2" \n\t"\
227 "test %%"REG_S", %%"REG_S" \n\t"\
/* Chroma pass then luma pass: leaves U/V in mm3/mm4, Y1/Y2 in mm1/mm7. */
230 #define YSCALEYUV2PACKEDX \
231 YSCALEYUV2PACKEDX_UV \
232 YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
/* Shared tail: operand and clobber lists closing the packed-output asm blocks. */
234 #define YSCALEYUV2PACKEDX_END \
235 :: "r" (&c->redDither), \
236 "m" (dummy), "m" (dummy), "m" (dummy),\
237 "r" (dest), "m" (dstW_reg) \
238 : "%"REG_a, "%"REG_d, "%"REG_S \
/* Accurate (pmaddwd, 32-bit accumulation) chroma filter for packed outputs.
 * Coefficients come in APCK pairs; results are >>16, packed, biased, and
 * parked in the context's U_TEMP/V_TEMP scratch slots so the luma pass can
 * reuse all mm registers. NOTE(review): loop labels/branches appear dropped
 * from this extract. */
241 #define YSCALEYUV2PACKEDX_ACCURATE_UV \
243 "xor %%"REG_a", %%"REG_a" \n\t"\
247 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
248 "mov (%%"REG_d"), %%"REG_S" \n\t"\
249 "pxor %%mm4, %%mm4 \n\t"\
250 "pxor %%mm5, %%mm5 \n\t"\
251 "pxor %%mm6, %%mm6 \n\t"\
252 "pxor %%mm7, %%mm7 \n\t"\
255 "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
256 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
257 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
258 "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
259 "movq %%mm0, %%mm3 \n\t"\
260 "punpcklwd %%mm1, %%mm0 \n\t"\
261 "punpckhwd %%mm1, %%mm3 \n\t"\
262 "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
263 "pmaddwd %%mm1, %%mm0 \n\t"\
264 "pmaddwd %%mm1, %%mm3 \n\t"\
265 "paddd %%mm0, %%mm4 \n\t"\
266 "paddd %%mm3, %%mm5 \n\t"\
267 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
268 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
269 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
270 "test %%"REG_S", %%"REG_S" \n\t"\
271 "movq %%mm2, %%mm0 \n\t"\
272 "punpcklwd %%mm3, %%mm2 \n\t"\
273 "punpckhwd %%mm3, %%mm0 \n\t"\
274 "pmaddwd %%mm1, %%mm2 \n\t"\
275 "pmaddwd %%mm1, %%mm0 \n\t"\
276 "paddd %%mm2, %%mm6 \n\t"\
277 "paddd %%mm0, %%mm7 \n\t"\
279 "psrad $16, %%mm4 \n\t"\
280 "psrad $16, %%mm5 \n\t"\
281 "psrad $16, %%mm6 \n\t"\
282 "psrad $16, %%mm7 \n\t"\
283 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
284 "packssdw %%mm5, %%mm4 \n\t"\
285 "packssdw %%mm7, %%mm6 \n\t"\
286 "paddw %%mm0, %%mm4 \n\t"\
287 "paddw %%mm0, %%mm6 \n\t"\
288 "movq %%mm4, "U_TEMP"(%0) \n\t"\
289 "movq %%mm6, "V_TEMP"(%0) \n\t"\
/* Accurate luma pass: same pmaddwd scheme into mm1/mm5/mm7/mm6; afterwards
 * reloads the saved U/V from the scratch slots into mm3/mm4. */
291 #define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
292 "lea "offset"(%0), %%"REG_d" \n\t"\
293 "mov (%%"REG_d"), %%"REG_S" \n\t"\
294 "pxor %%mm1, %%mm1 \n\t"\
295 "pxor %%mm5, %%mm5 \n\t"\
296 "pxor %%mm7, %%mm7 \n\t"\
297 "pxor %%mm6, %%mm6 \n\t"\
300 "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
301 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
302 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
303 "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
304 "movq %%mm0, %%mm3 \n\t"\
305 "punpcklwd %%mm4, %%mm0 \n\t"\
306 "punpckhwd %%mm4, %%mm3 \n\t"\
307 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
308 "pmaddwd %%mm4, %%mm0 \n\t"\
309 "pmaddwd %%mm4, %%mm3 \n\t"\
310 "paddd %%mm0, %%mm1 \n\t"\
311 "paddd %%mm3, %%mm5 \n\t"\
312 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
313 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
314 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
315 "test %%"REG_S", %%"REG_S" \n\t"\
316 "movq %%mm2, %%mm0 \n\t"\
317 "punpcklwd %%mm3, %%mm2 \n\t"\
318 "punpckhwd %%mm3, %%mm0 \n\t"\
319 "pmaddwd %%mm4, %%mm2 \n\t"\
320 "pmaddwd %%mm4, %%mm0 \n\t"\
321 "paddd %%mm2, %%mm7 \n\t"\
322 "paddd %%mm0, %%mm6 \n\t"\
324 "psrad $16, %%mm1 \n\t"\
325 "psrad $16, %%mm5 \n\t"\
326 "psrad $16, %%mm7 \n\t"\
327 "psrad $16, %%mm6 \n\t"\
328 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
329 "packssdw %%mm5, %%mm1 \n\t"\
330 "packssdw %%mm6, %%mm7 \n\t"\
331 "paddw %%mm0, %%mm1 \n\t"\
332 "paddw %%mm0, %%mm7 \n\t"\
333 "movq "U_TEMP"(%0), %%mm3 \n\t"\
334 "movq "V_TEMP"(%0), %%mm4 \n\t"\
/* Accurate chroma pass followed by accurate luma pass. */
336 #define YSCALEYUV2PACKEDX_ACCURATE \
337 YSCALEYUV2PACKEDX_ACCURATE_UV \
338 YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
/* YUV -> RGB matrix for the multi-tap path. Input: Y1/Y2 in mm1/mm7, U/V in
 * mm3/mm4 (still biased). Subtracts the U/V/Y offsets, applies the fixed-point
 * coefficients from the context (offsets relative to %0 = &c->redDither),
 * interleaves the per-pixel B/G/R words with the two Y halves, and finally
 * packs to bytes: B in mm2, G in mm4, R in mm5 (8 pixels). */
340 #define YSCALEYUV2RGBX \
341 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
342 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
343 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
344 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
345 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
346 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
347 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
348 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
349 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
350 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
351 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
352 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
353 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
354 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
355 "paddw %%mm3, %%mm4 \n\t"\
356 "movq %%mm2, %%mm0 \n\t"\
357 "movq %%mm5, %%mm6 \n\t"\
358 "movq %%mm4, %%mm3 \n\t"\
359 "punpcklwd %%mm2, %%mm2 \n\t"\
360 "punpcklwd %%mm5, %%mm5 \n\t"\
361 "punpcklwd %%mm4, %%mm4 \n\t"\
362 "paddw %%mm1, %%mm2 \n\t"\
363 "paddw %%mm1, %%mm5 \n\t"\
364 "paddw %%mm1, %%mm4 \n\t"\
365 "punpckhwd %%mm0, %%mm0 \n\t"\
366 "punpckhwd %%mm6, %%mm6 \n\t"\
367 "punpckhwd %%mm3, %%mm3 \n\t"\
368 "paddw %%mm7, %%mm0 \n\t"\
369 "paddw %%mm7, %%mm6 \n\t"\
370 "paddw %%mm7, %%mm3 \n\t"\
371 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
372 "packuswb %%mm0, %%mm2 \n\t"\
373 "packuswb %%mm6, %%mm5 \n\t"\
374 "packuswb %%mm3, %%mm4 \n\t"\
/* 2-tap (bilinear between two source rows) vertical pass producing packed
 * YUV: pre-shifts the stored chroma/luma blend coefficients >>3 in place,
 * then per 8 pixels blends uvbuf0/uvbuf1 (chroma, V at +VOF) and buf0/buf1
 * (luma) as a + (b-a)*alpha via pmulhw. Results: U/V in mm3/mm4, Y1/Y2 in
 * mm1/mm7. NOTE(review): loop label lines appear dropped from this extract;
 * the ">>4" end-of-line comments look stale relative to the $7 shifts. */
376 #define REAL_YSCALEYUV2PACKED(index, c) \
377 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
378 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
379 "psraw $3, %%mm0 \n\t"\
380 "psraw $3, %%mm1 \n\t"\
381 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
382 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
383 "xor "#index", "#index" \n\t"\
386 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
387 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
388 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
389 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
390 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
391 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
392 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
393 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
394 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
395 "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
396 "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
397 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
398 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
399 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
400 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
401 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
402 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
403 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
404 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
405 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
406 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
407 "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
408 "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
409 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
410 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
412 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
/* 2-tap chroma blend + U/V de-bias and green-coefficient multiply for the
 * RGB paths. Leaves (U-128)*8 in mm2, ug in mm3, vg in mm4, (V-128)*8 in mm5.
 * NOTE(review): loop label lines appear dropped from this extract. */
414 #define REAL_YSCALEYUV2RGB_UV(index, c) \
415 "xor "#index", "#index" \n\t"\
418 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
419 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
420 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
421 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
422 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
423 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
424 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
425 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
426 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
427 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
428 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
429 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
430 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
431 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
432 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
433 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
434 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
435 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
436 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
437 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
/* 2-tap luma blend between rows b1 and b2; result Y1/Y2 in mm1/mm7. */
439 #define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
440 "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
441 "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
442 "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
443 "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
444 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
445 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
446 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
447 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
448 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
449 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
450 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
451 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
/* Final matrix step shared by the 2-tap RGB paths: applies the remaining
 * coefficients and interleaves/packs to B=mm2, G=mm4, R=mm5 (8 pixels). */
453 #define REAL_YSCALEYUV2RGB_COEFF(c) \
454 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
455 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
456 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
457 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
458 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
459 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
460 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
461 "paddw %%mm3, %%mm4 \n\t"\
462 "movq %%mm2, %%mm0 \n\t"\
463 "movq %%mm5, %%mm6 \n\t"\
464 "movq %%mm4, %%mm3 \n\t"\
465 "punpcklwd %%mm2, %%mm2 \n\t"\
466 "punpcklwd %%mm5, %%mm5 \n\t"\
467 "punpcklwd %%mm4, %%mm4 \n\t"\
468 "paddw %%mm1, %%mm2 \n\t"\
469 "paddw %%mm1, %%mm5 \n\t"\
470 "paddw %%mm1, %%mm4 \n\t"\
471 "punpckhwd %%mm0, %%mm0 \n\t"\
472 "punpckhwd %%mm6, %%mm6 \n\t"\
473 "punpckhwd %%mm3, %%mm3 \n\t"\
474 "paddw %%mm7, %%mm0 \n\t"\
475 "paddw %%mm7, %%mm6 \n\t"\
476 "paddw %%mm7, %%mm3 \n\t"\
477 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
478 "packuswb %%mm0, %%mm2 \n\t"\
479 "packuswb %%mm6, %%mm5 \n\t"\
480 "packuswb %%mm3, %%mm4 \n\t"\
482 #define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
/* Full 2-tap YUV->RGB: chroma blend, luma blend (rows %0/%1), then matrix. */
484 #define YSCALEYUV2RGB(index, c) \
485 REAL_YSCALEYUV2RGB_UV(index, c) \
486 REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
487 REAL_YSCALEYUV2RGB_COEFF(c)
/* 1-tap packed-YUV pass (single source row, no blending): just >>7 the
 * 16-bit chroma (U at %2, V at %2+VOF) and luma (%0) samples.
 * NOTE(review): loop label lines appear dropped from this extract. */
489 #define REAL_YSCALEYUV2PACKED1(index, c) \
490 "xor "#index", "#index" \n\t"\
493 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
494 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
495 "psraw $7, %%mm3 \n\t" \
496 "psraw $7, %%mm4 \n\t" \
497 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
498 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
499 "psraw $7, %%mm1 \n\t" \
500 "psraw $7, %%mm7 \n\t" \
502 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
/* 1-tap YUV->RGB: single-row samples >>4, then the same de-bias/coefficient/
 * interleave sequence as the 2-tap path; ends with B=mm2, G=mm4, R=mm5. */
504 #define REAL_YSCALEYUV2RGB1(index, c) \
505 "xor "#index", "#index" \n\t"\
508 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
509 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
510 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
511 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
512 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
513 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
514 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
515 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
516 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
517 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
518 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
519 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
520 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
521 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
522 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
523 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
524 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
525 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
526 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
527 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
528 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
529 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
530 "paddw %%mm3, %%mm4 \n\t"\
531 "movq %%mm2, %%mm0 \n\t"\
532 "movq %%mm5, %%mm6 \n\t"\
533 "movq %%mm4, %%mm3 \n\t"\
534 "punpcklwd %%mm2, %%mm2 \n\t"\
535 "punpcklwd %%mm5, %%mm5 \n\t"\
536 "punpcklwd %%mm4, %%mm4 \n\t"\
537 "paddw %%mm1, %%mm2 \n\t"\
538 "paddw %%mm1, %%mm5 \n\t"\
539 "paddw %%mm1, %%mm4 \n\t"\
540 "punpckhwd %%mm0, %%mm0 \n\t"\
541 "punpckhwd %%mm6, %%mm6 \n\t"\
542 "punpckhwd %%mm3, %%mm3 \n\t"\
543 "paddw %%mm7, %%mm0 \n\t"\
544 "paddw %%mm7, %%mm6 \n\t"\
545 "paddw %%mm7, %%mm3 \n\t"\
546 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
547 "packuswb %%mm0, %%mm2 \n\t"\
548 "packuswb %%mm6, %%mm5 \n\t"\
549 "packuswb %%mm3, %%mm4 \n\t"\
551 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
/* "1b" variants: average the two chroma rows (uvbuf0+uvbuf1) instead of
 * weighting them — used when the two chroma lines carry equal weight.
 * Packed variant: chroma (sum)>>8, luma >>7. */
553 #define REAL_YSCALEYUV2PACKED1b(index, c) \
554 "xor "#index", "#index" \n\t"\
557 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
558 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
559 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
560 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
561 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
562 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
563 "psrlw $8, %%mm3 \n\t" \
564 "psrlw $8, %%mm4 \n\t" \
565 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
566 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
567 "psraw $7, %%mm1 \n\t" \
568 "psraw $7, %%mm7 \n\t"
569 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
571 // do vertical chrominance interpolation
/* RGB variant: chroma (sum)>>5, luma >>4, then the standard matrix and
 * interleave; ends with B=mm2, G=mm4, R=mm5 for 8 pixels. */
572 #define REAL_YSCALEYUV2RGB1b(index, c) \
573 "xor "#index", "#index" \n\t"\
576 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
577 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
578 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
579 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
580 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
581 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
582 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
583 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
584 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
585 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
586 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
587 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
588 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
589 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
590 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
591 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
592 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
593 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
594 "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
595 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
596 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
597 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
598 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
599 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
600 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
601 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
602 "paddw %%mm3, %%mm4 \n\t"\
603 "movq %%mm2, %%mm0 \n\t"\
604 "movq %%mm5, %%mm6 \n\t"\
605 "movq %%mm4, %%mm3 \n\t"\
606 "punpcklwd %%mm2, %%mm2 \n\t"\
607 "punpcklwd %%mm5, %%mm5 \n\t"\
608 "punpcklwd %%mm4, %%mm4 \n\t"\
609 "paddw %%mm1, %%mm2 \n\t"\
610 "paddw %%mm1, %%mm5 \n\t"\
611 "paddw %%mm1, %%mm4 \n\t"\
612 "punpckhwd %%mm0, %%mm0 \n\t"\
613 "punpckhwd %%mm6, %%mm6 \n\t"\
614 "punpckhwd %%mm3, %%mm3 \n\t"\
615 "paddw %%mm7, %%mm0 \n\t"\
616 "paddw %%mm7, %%mm6 \n\t"\
617 "paddw %%mm7, %%mm3 \n\t"\
618 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
619 "packuswb %%mm0, %%mm2 \n\t"\
620 "packuswb %%mm6, %%mm5 \n\t"\
621 "packuswb %%mm3, %%mm4 \n\t"\
623 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
/* 1-tap alpha channel: abuf0 (%1) samples >>7, packed to bytes in mm7. */
625 #define REAL_YSCALEYUV2RGB1_ALPHA(index) \
626 "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
627 "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
628 "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
629 "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
630 "packuswb %%mm1, %%mm7 \n\t"
631 #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
/* Store 8 BGRA/ARGB pixels: interleave packed B/G and R/A bytes into four
 * 4-pixel quadwords and stream them with MOVNTQ; then advance the index and
 * compare against dstw (the consuming branch is outside this macro).
 * NOTE(review): the trailing conditional-branch line appears to have been
 * dropped from this extract. */
633 #define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
634 "movq "#b", "#q2" \n\t" /* B */\
635 "movq "#r", "#t" \n\t" /* R */\
636 "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
637 "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
638 "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
639 "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
640 "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
641 "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
642 "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
643 "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
644 "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
645 "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
647 MOVNTQ( q0, (dst, index, 4))\
648 MOVNTQ( b, 8(dst, index, 4))\
649 MOVNTQ( q2, 16(dst, index, 4))\
650 MOVNTQ( q3, 24(dst, index, 4))\
652 "add $8, "#index" \n\t"\
653 "cmp "#dstw", "#index" \n\t"\
655 #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
/* Store 8 RGB565 pixels from B=mm2, G=mm4, R=mm5 (mm7 must be zero):
 * mask to 5/6/5 bits, shift into field position, OR together, stream out. */
657 #define REAL_WRITERGB16(dst, dstw, index) \
658 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
659 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
660 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
661 "psrlq $3, %%mm2 \n\t"\
663 "movq %%mm2, %%mm1 \n\t"\
664 "movq %%mm4, %%mm3 \n\t"\
666 "punpcklbw %%mm7, %%mm3 \n\t"\
667 "punpcklbw %%mm5, %%mm2 \n\t"\
668 "punpckhbw %%mm7, %%mm4 \n\t"\
669 "punpckhbw %%mm5, %%mm1 \n\t"\
671 "psllq $3, %%mm3 \n\t"\
672 "psllq $3, %%mm4 \n\t"\
674 "por %%mm3, %%mm2 \n\t"\
675 "por %%mm4, %%mm1 \n\t"\
677 MOVNTQ(%%mm2, (dst, index, 2))\
678 MOVNTQ(%%mm1, 8(dst, index, 2))\
680 "add $8, "#index" \n\t"\
681 "cmp "#dstw", "#index" \n\t"\
683 #define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
/* Store 8 RGB555 pixels: like RGB565 but all channels masked to 5 bits
 * (R pre-shifted >>1, G shifted <<2 into place). */
685 #define REAL_WRITERGB15(dst, dstw, index) \
686 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
687 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
688 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
689 "psrlq $3, %%mm2 \n\t"\
690 "psrlq $1, %%mm5 \n\t"\
692 "movq %%mm2, %%mm1 \n\t"\
693 "movq %%mm4, %%mm3 \n\t"\
695 "punpcklbw %%mm7, %%mm3 \n\t"\
696 "punpcklbw %%mm5, %%mm2 \n\t"\
697 "punpckhbw %%mm7, %%mm4 \n\t"\
698 "punpckhbw %%mm5, %%mm1 \n\t"\
700 "psllq $2, %%mm3 \n\t"\
701 "psllq $2, %%mm4 \n\t"\
703 "por %%mm3, %%mm2 \n\t"\
704 "por %%mm4, %%mm1 \n\t"\
706 MOVNTQ(%%mm2, (dst, index, 2))\
707 MOVNTQ(%%mm1, 8(dst, index, 2))\
709 "add $8, "#index" \n\t"\
710 "cmp "#dstw", "#index" \n\t"\
712 #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
/* Legacy plain-MMX 24-bit RGB store: expands B/G/R bytes to 0RGB dwords,
 * then squeezes the four 0RGB0RGB quadwords into three contiguous RGBRGB...
 * quadwords with shift/mask/or gymnastics, streaming 24 bytes per 8 pixels.
 * Kept for reference; superseded by WRITEBGR24MMX/MMX2 below. */
714 #define WRITEBGR24OLD(dst, dstw, index) \
715 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
716 "movq %%mm2, %%mm1 \n\t" /* B */\
717 "movq %%mm5, %%mm6 \n\t" /* R */\
718 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
719 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
720 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
721 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
722 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
723 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
724 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
725 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
726 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
727 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
729 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
730 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
731 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
732 "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
733 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
734 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
735 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
736 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
738 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
739 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
740 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
741 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
742 "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
743 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
744 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
745 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
746 "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
747 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
748 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
749 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
750 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
752 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
753 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
754 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
755 "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
756 "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
757 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
758 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
759 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
761 MOVNTQ(%%mm0, (dst))\
762 MOVNTQ(%%mm2, 8(dst))\
763 MOVNTQ(%%mm3, 16(dst))\
764 "add $24, "#dst" \n\t"\
766 "add $8, "#index" \n\t"\
767 "cmp "#dstw", "#index" \n\t"\
/* Plain-MMX 24-bit RGB store: builds four 0RGB0RGB quadwords, aligns each
 * via psllq/punpckhdq into 0RGBRGB0 form, then shifts and ORs neighbouring
 * pixels together to emit three packed quadwords (24 bytes / 8 pixels). */
770 #define WRITEBGR24MMX(dst, dstw, index) \
771 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
772 "movq %%mm2, %%mm1 \n\t" /* B */\
773 "movq %%mm5, %%mm6 \n\t" /* R */\
774 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
775 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
776 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
777 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
778 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
779 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
780 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
781 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
782 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
783 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
785 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
786 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
787 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
788 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
790 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
791 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
792 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
793 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
795 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
796 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
797 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
798 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
800 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
801 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
802 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
803 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
804 MOVNTQ(%%mm0, (dst))\
806 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
807 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
808 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
809 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
810 MOVNTQ(%%mm6, 8(dst))\
812 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
813 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
814 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
815 MOVNTQ(%%mm5, 16(dst))\
817 "add $24, "#dst" \n\t"\
819 "add $8, "#index" \n\t"\
820 "cmp "#dstw", "#index" \n\t"\
/* MMX2 24-bit RGB store: uses pshufw to replicate channel bytes and the
 * ff_M24A/B/C masks to select the bytes belonging to each output quadword,
 * emitting three quadwords (24 bytes / 8 pixels) with far fewer shifts than
 * the plain-MMX version. */
823 #define WRITEBGR24MMX2(dst, dstw, index) \
824 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
825 "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
826 "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
827 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
828 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
829 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
831 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
832 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
833 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
835 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
836 "por %%mm1, %%mm6 \n\t"\
837 "por %%mm3, %%mm6 \n\t"\
838 MOVNTQ(%%mm6, (dst))\
840 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
841 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
842 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
843 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
845 "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
846 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
847 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
849 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
850 "por %%mm3, %%mm6 \n\t"\
851 MOVNTQ(%%mm6, 8(dst))\
853 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
854 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
855 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
857 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
858 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
859 "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
861 "por %%mm1, %%mm3 \n\t"\
862 "por %%mm3, %%mm6 \n\t"\
863 MOVNTQ(%%mm6, 16(dst))\
865 "add $24, "#dst" \n\t"\
867 "add $8, "#index" \n\t"\
868 "cmp "#dstw", "#index" \n\t"\
/* Select the 24-bit writer for this template target (pshufw needs MMX2).
 * NOTE(review): the #else/#endif lines appear dropped from this extract. */
871 #if COMPILE_TEMPLATE_MMX2
873 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
876 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
879 #define REAL_WRITEYUY2(dst, dstw, index) \
880 "packuswb %%mm3, %%mm3 \n\t"\
881 "packuswb %%mm4, %%mm4 \n\t"\
882 "packuswb %%mm7, %%mm1 \n\t"\
883 "punpcklbw %%mm4, %%mm3 \n\t"\
884 "movq %%mm1, %%mm7 \n\t"\
885 "punpcklbw %%mm3, %%mm1 \n\t"\
886 "punpckhbw %%mm3, %%mm7 \n\t"\
888 MOVNTQ(%%mm1, (dst, index, 2))\
889 MOVNTQ(%%mm7, 8(dst, index, 2))\
891 "add $8, "#index" \n\t"\
892 "cmp "#dstw", "#index" \n\t"\
894 #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
897 static inline void RENAME(yuv2yuvX
)(SwsContext
*c
, const int16_t *lumFilter
, const int16_t **lumSrc
, int lumFilterSize
,
898 const int16_t *chrFilter
, const int16_t **chrSrc
, int chrFilterSize
, const int16_t **alpSrc
,
899 uint8_t *dest
, uint8_t *uDest
, uint8_t *vDest
, uint8_t *aDest
, long dstW
, long chrDstW
)
901 #if COMPILE_TEMPLATE_MMX
902 if(!(c
->flags
& SWS_BITEXACT
)) {
903 if (c
->flags
& SWS_ACCURATE_RND
) {
905 YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET
, uDest
, chrDstW
)
906 YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF
), CHR_MMX_FILTER_OFFSET
, vDest
, chrDstW
)
908 if (CONFIG_SWSCALE_ALPHA
&& aDest
) {
909 YSCALEYUV2YV12X_ACCURATE( "0", ALP_MMX_FILTER_OFFSET
, aDest
, dstW
)
912 YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET
, dest
, dstW
)
915 YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET
, uDest
, chrDstW
)
916 YSCALEYUV2YV12X(AV_STRINGIFY(VOF
), CHR_MMX_FILTER_OFFSET
, vDest
, chrDstW
)
918 if (CONFIG_SWSCALE_ALPHA
&& aDest
) {
919 YSCALEYUV2YV12X( "0", ALP_MMX_FILTER_OFFSET
, aDest
, dstW
)
922 YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET
, dest
, dstW
)
927 #if COMPILE_TEMPLATE_ALTIVEC
928 yuv2yuvX_altivec_real(lumFilter
, lumSrc
, lumFilterSize
,
929 chrFilter
, chrSrc
, chrFilterSize
,
930 dest
, uDest
, vDest
, dstW
, chrDstW
);
931 #else //COMPILE_TEMPLATE_ALTIVEC
932 yuv2yuvXinC(lumFilter
, lumSrc
, lumFilterSize
,
933 chrFilter
, chrSrc
, chrFilterSize
,
934 alpSrc
, dest
, uDest
, vDest
, aDest
, dstW
, chrDstW
);
935 #endif //!COMPILE_TEMPLATE_ALTIVEC
938 static inline void RENAME(yuv2nv12X
)(SwsContext
*c
, const int16_t *lumFilter
, const int16_t **lumSrc
, int lumFilterSize
,
939 const int16_t *chrFilter
, const int16_t **chrSrc
, int chrFilterSize
,
940 uint8_t *dest
, uint8_t *uDest
, int dstW
, int chrDstW
, enum PixelFormat dstFormat
)
942 yuv2nv12XinC(lumFilter
, lumSrc
, lumFilterSize
,
943 chrFilter
, chrSrc
, chrFilterSize
,
944 dest
, uDest
, dstW
, chrDstW
, dstFormat
);
947 static inline void RENAME(yuv2yuv1
)(SwsContext
*c
, const int16_t *lumSrc
, const int16_t *chrSrc
, const int16_t *alpSrc
,
948 uint8_t *dest
, uint8_t *uDest
, uint8_t *vDest
, uint8_t *aDest
, long dstW
, long chrDstW
)
951 #if COMPILE_TEMPLATE_MMX
952 if(!(c
->flags
& SWS_BITEXACT
)) {
954 const uint8_t *src
[4]= {alpSrc
+ dstW
, lumSrc
+ dstW
, chrSrc
+ chrDstW
, chrSrc
+ VOFW
+ chrDstW
};
955 uint8_t *dst
[4]= {aDest
, dest
, uDest
, vDest
};
956 x86_reg counter
[4]= {dstW
, dstW
, chrDstW
, chrDstW
};
958 if (c
->flags
& SWS_ACCURATE_RND
) {
962 YSCALEYUV2YV121_ACCURATE
963 :: "r" (src
[p
]), "r" (dst
[p
] + counter
[p
]),
974 :: "r" (src
[p
]), "r" (dst
[p
] + counter
[p
]),
984 for (i
=0; i
<dstW
; i
++) {
985 int val
= (lumSrc
[i
]+64)>>7;
996 for (i
=0; i
<chrDstW
; i
++) {
997 int u
=(chrSrc
[i
]+64)>>7;
998 int v
=(chrSrc
[i
+ VOFW
]+64)>>7;
1002 else if (u
>255) u
=255;
1004 else if (v
>255) v
=255;
1011 if (CONFIG_SWSCALE_ALPHA
&& aDest
)
1012 for (i
=0; i
<dstW
; i
++) {
1013 int val
= (alpSrc
[i
]+64)>>7;
1014 aDest
[i
]= av_clip_uint8(val
);
1020 * vertical scale YV12 to RGB
1022 static inline void RENAME(yuv2packedX
)(SwsContext
*c
, const int16_t *lumFilter
, const int16_t **lumSrc
, int lumFilterSize
,
1023 const int16_t *chrFilter
, const int16_t **chrSrc
, int chrFilterSize
,
1024 const int16_t **alpSrc
, uint8_t *dest
, long dstW
, long dstY
)
1026 #if COMPILE_TEMPLATE_MMX
1028 x86_reg dstW_reg
= dstW
;
1029 if(!(c
->flags
& SWS_BITEXACT
)) {
1030 if (c
->flags
& SWS_ACCURATE_RND
) {
1031 switch(c
->dstFormat
) {
1033 if (CONFIG_SWSCALE_ALPHA
&& c
->alpPixBuf
) {
1034 YSCALEYUV2PACKEDX_ACCURATE
1036 "movq %%mm2, "U_TEMP
"(%0) \n\t"
1037 "movq %%mm4, "V_TEMP
"(%0) \n\t"
1038 "movq %%mm5, "Y_TEMP
"(%0) \n\t"
1039 YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET
)
1040 "movq "Y_TEMP
"(%0), %%mm5 \n\t"
1041 "psraw $3, %%mm1 \n\t"
1042 "psraw $3, %%mm7 \n\t"
1043 "packuswb %%mm7, %%mm1 \n\t"
1044 WRITEBGR32(%4, %5, %%REGa
, %%mm3
, %%mm4
, %%mm5
, %%mm1
, %%mm0
, %%mm7
, %%mm2
, %%mm6
)
1046 YSCALEYUV2PACKEDX_END
1048 YSCALEYUV2PACKEDX_ACCURATE
1050 "pcmpeqd %%mm7, %%mm7 \n\t"
1051 WRITEBGR32(%4, %5, %%REGa
, %%mm2
, %%mm4
, %%mm5
, %%mm7
, %%mm0
, %%mm1
, %%mm3
, %%mm6
)
1053 YSCALEYUV2PACKEDX_END
1057 YSCALEYUV2PACKEDX_ACCURATE
1059 "pxor %%mm7, %%mm7 \n\t"
1060 "lea (%%"REG_a
", %%"REG_a
", 2), %%"REG_c
"\n\t" //FIXME optimize
1061 "add %4, %%"REG_c
" \n\t"
1062 WRITEBGR24(%%REGc
, %5, %%REGa
)
1065 :: "r" (&c
->redDither
),
1066 "m" (dummy
), "m" (dummy
), "m" (dummy
),
1067 "r" (dest
), "m" (dstW_reg
)
1068 : "%"REG_a
, "%"REG_c
, "%"REG_d
, "%"REG_S
1071 case PIX_FMT_RGB555
:
1072 YSCALEYUV2PACKEDX_ACCURATE
1074 "pxor %%mm7, %%mm7 \n\t"
1075 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1077 "paddusb "BLUE_DITHER
"(%0), %%mm2\n\t"
1078 "paddusb "GREEN_DITHER
"(%0), %%mm4\n\t"
1079 "paddusb "RED_DITHER
"(%0), %%mm5\n\t"
1082 WRITERGB15(%4, %5, %%REGa
)
1083 YSCALEYUV2PACKEDX_END
1085 case PIX_FMT_RGB565
:
1086 YSCALEYUV2PACKEDX_ACCURATE
1088 "pxor %%mm7, %%mm7 \n\t"
1089 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1091 "paddusb "BLUE_DITHER
"(%0), %%mm2\n\t"
1092 "paddusb "GREEN_DITHER
"(%0), %%mm4\n\t"
1093 "paddusb "RED_DITHER
"(%0), %%mm5\n\t"
1096 WRITERGB16(%4, %5, %%REGa
)
1097 YSCALEYUV2PACKEDX_END
1099 case PIX_FMT_YUYV422
:
1100 YSCALEYUV2PACKEDX_ACCURATE
1101 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1103 "psraw $3, %%mm3 \n\t"
1104 "psraw $3, %%mm4 \n\t"
1105 "psraw $3, %%mm1 \n\t"
1106 "psraw $3, %%mm7 \n\t"
1107 WRITEYUY2(%4, %5, %%REGa
)
1108 YSCALEYUV2PACKEDX_END
1112 switch(c
->dstFormat
) {
1114 if (CONFIG_SWSCALE_ALPHA
&& c
->alpPixBuf
) {
1117 YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET
, %%mm0
, %%mm3
, %%mm6
, %%mm1
, %%mm7
)
1118 "psraw $3, %%mm1 \n\t"
1119 "psraw $3, %%mm7 \n\t"
1120 "packuswb %%mm7, %%mm1 \n\t"
1121 WRITEBGR32(%4, %5, %%REGa
, %%mm2
, %%mm4
, %%mm5
, %%mm1
, %%mm0
, %%mm7
, %%mm3
, %%mm6
)
1122 YSCALEYUV2PACKEDX_END
1126 "pcmpeqd %%mm7, %%mm7 \n\t"
1127 WRITEBGR32(%4, %5, %%REGa
, %%mm2
, %%mm4
, %%mm5
, %%mm7
, %%mm0
, %%mm1
, %%mm3
, %%mm6
)
1128 YSCALEYUV2PACKEDX_END
1134 "pxor %%mm7, %%mm7 \n\t"
1135 "lea (%%"REG_a
", %%"REG_a
", 2), %%"REG_c
" \n\t" //FIXME optimize
1136 "add %4, %%"REG_c
" \n\t"
1137 WRITEBGR24(%%REGc
, %5, %%REGa
)
1139 :: "r" (&c
->redDither
),
1140 "m" (dummy
), "m" (dummy
), "m" (dummy
),
1141 "r" (dest
), "m" (dstW_reg
)
1142 : "%"REG_a
, "%"REG_c
, "%"REG_d
, "%"REG_S
1145 case PIX_FMT_RGB555
:
1148 "pxor %%mm7, %%mm7 \n\t"
1149 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1151 "paddusb "BLUE_DITHER
"(%0), %%mm2 \n\t"
1152 "paddusb "GREEN_DITHER
"(%0), %%mm4 \n\t"
1153 "paddusb "RED_DITHER
"(%0), %%mm5 \n\t"
1156 WRITERGB15(%4, %5, %%REGa
)
1157 YSCALEYUV2PACKEDX_END
1159 case PIX_FMT_RGB565
:
1162 "pxor %%mm7, %%mm7 \n\t"
1163 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1165 "paddusb "BLUE_DITHER
"(%0), %%mm2 \n\t"
1166 "paddusb "GREEN_DITHER
"(%0), %%mm4 \n\t"
1167 "paddusb "RED_DITHER
"(%0), %%mm5 \n\t"
1170 WRITERGB16(%4, %5, %%REGa
)
1171 YSCALEYUV2PACKEDX_END
1173 case PIX_FMT_YUYV422
:
1175 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1177 "psraw $3, %%mm3 \n\t"
1178 "psraw $3, %%mm4 \n\t"
1179 "psraw $3, %%mm1 \n\t"
1180 "psraw $3, %%mm7 \n\t"
1181 WRITEYUY2(%4, %5, %%REGa
)
1182 YSCALEYUV2PACKEDX_END
1187 #endif /* COMPILE_TEMPLATE_MMX */
1188 #if COMPILE_TEMPLATE_ALTIVEC
1189 /* The following list of supported dstFormat values should
1190 match what's found in the body of ff_yuv2packedX_altivec() */
1191 if (!(c
->flags
& SWS_BITEXACT
) && !c
->alpPixBuf
&&
1192 (c
->dstFormat
==PIX_FMT_ABGR
|| c
->dstFormat
==PIX_FMT_BGRA
||
1193 c
->dstFormat
==PIX_FMT_BGR24
|| c
->dstFormat
==PIX_FMT_RGB24
||
1194 c
->dstFormat
==PIX_FMT_RGBA
|| c
->dstFormat
==PIX_FMT_ARGB
))
1195 ff_yuv2packedX_altivec(c
, lumFilter
, lumSrc
, lumFilterSize
,
1196 chrFilter
, chrSrc
, chrFilterSize
,
1200 yuv2packedXinC(c
, lumFilter
, lumSrc
, lumFilterSize
,
1201 chrFilter
, chrSrc
, chrFilterSize
,
1202 alpSrc
, dest
, dstW
, dstY
);
1206 * vertical bilinear scale YV12 to RGB
1208 static inline void RENAME(yuv2packed2
)(SwsContext
*c
, const uint16_t *buf0
, const uint16_t *buf1
, const uint16_t *uvbuf0
, const uint16_t *uvbuf1
,
1209 const uint16_t *abuf0
, const uint16_t *abuf1
, uint8_t *dest
, int dstW
, int yalpha
, int uvalpha
, int y
)
1211 int yalpha1
=4095- yalpha
;
1212 int uvalpha1
=4095-uvalpha
;
1215 #if COMPILE_TEMPLATE_MMX
1216 if(!(c
->flags
& SWS_BITEXACT
)) {
1217 switch(c
->dstFormat
) {
1218 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1220 if (CONFIG_SWSCALE_ALPHA
&& c
->alpPixBuf
) {
1223 YSCALEYUV2RGB(%%r8
, %5)
1224 YSCALEYUV2RGB_YA(%%r8
, %5, %6, %7)
1225 "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1226 "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1227 "packuswb %%mm7, %%mm1 \n\t"
1228 WRITEBGR32(%4, 8280(%5), %%r8
, %%mm2
, %%mm4
, %%mm5
, %%mm1
, %%mm0
, %%mm7
, %%mm3
, %%mm6
)
1230 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "r" (dest
),
1232 ,"r" (abuf0
), "r" (abuf1
)
1236 *(const uint16_t **)(&c
->u_temp
)=abuf0
;
1237 *(const uint16_t **)(&c
->v_temp
)=abuf1
;
1239 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1240 "mov %4, %%"REG_b
" \n\t"
1241 "push %%"REG_BP
" \n\t"
1242 YSCALEYUV2RGB(%%REGBP
, %5)
1245 "mov "U_TEMP
"(%5), %0 \n\t"
1246 "mov "V_TEMP
"(%5), %1 \n\t"
1247 YSCALEYUV2RGB_YA(%%REGBP
, %5, %0, %1)
1248 "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1249 "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1250 "packuswb %%mm7, %%mm1 \n\t"
1253 WRITEBGR32(%%REGb
, 8280(%5), %%REGBP
, %%mm2
, %%mm4
, %%mm5
, %%mm1
, %%mm0
, %%mm7
, %%mm3
, %%mm6
)
1254 "pop %%"REG_BP
" \n\t"
1255 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1257 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1263 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1264 "mov %4, %%"REG_b
" \n\t"
1265 "push %%"REG_BP
" \n\t"
1266 YSCALEYUV2RGB(%%REGBP
, %5)
1267 "pcmpeqd %%mm7, %%mm7 \n\t"
1268 WRITEBGR32(%%REGb
, 8280(%5), %%REGBP
, %%mm2
, %%mm4
, %%mm5
, %%mm7
, %%mm0
, %%mm1
, %%mm3
, %%mm6
)
1269 "pop %%"REG_BP
" \n\t"
1270 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1272 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1279 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1280 "mov %4, %%"REG_b
" \n\t"
1281 "push %%"REG_BP
" \n\t"
1282 YSCALEYUV2RGB(%%REGBP
, %5)
1283 "pxor %%mm7, %%mm7 \n\t"
1284 WRITEBGR24(%%REGb
, 8280(%5), %%REGBP
)
1285 "pop %%"REG_BP
" \n\t"
1286 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1287 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1291 case PIX_FMT_RGB555
:
1293 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1294 "mov %4, %%"REG_b
" \n\t"
1295 "push %%"REG_BP
" \n\t"
1296 YSCALEYUV2RGB(%%REGBP
, %5)
1297 "pxor %%mm7, %%mm7 \n\t"
1298 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1300 "paddusb "BLUE_DITHER
"(%5), %%mm2 \n\t"
1301 "paddusb "GREEN_DITHER
"(%5), %%mm4 \n\t"
1302 "paddusb "RED_DITHER
"(%5), %%mm5 \n\t"
1305 WRITERGB15(%%REGb
, 8280(%5), %%REGBP
)
1306 "pop %%"REG_BP
" \n\t"
1307 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1309 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1313 case PIX_FMT_RGB565
:
1315 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1316 "mov %4, %%"REG_b
" \n\t"
1317 "push %%"REG_BP
" \n\t"
1318 YSCALEYUV2RGB(%%REGBP
, %5)
1319 "pxor %%mm7, %%mm7 \n\t"
1320 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1322 "paddusb "BLUE_DITHER
"(%5), %%mm2 \n\t"
1323 "paddusb "GREEN_DITHER
"(%5), %%mm4 \n\t"
1324 "paddusb "RED_DITHER
"(%5), %%mm5 \n\t"
1327 WRITERGB16(%%REGb
, 8280(%5), %%REGBP
)
1328 "pop %%"REG_BP
" \n\t"
1329 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1330 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1334 case PIX_FMT_YUYV422
:
1336 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1337 "mov %4, %%"REG_b
" \n\t"
1338 "push %%"REG_BP
" \n\t"
1339 YSCALEYUV2PACKED(%%REGBP
, %5)
1340 WRITEYUY2(%%REGb
, 8280(%5), %%REGBP
)
1341 "pop %%"REG_BP
" \n\t"
1342 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1343 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1350 #endif //COMPILE_TEMPLATE_MMX
1351 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C
, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C
, YSCALE_YUV_2_MONO2_C
)
1355 * YV12 to RGB without scaling or interpolating
1357 static inline void RENAME(yuv2packed1
)(SwsContext
*c
, const uint16_t *buf0
, const uint16_t *uvbuf0
, const uint16_t *uvbuf1
,
1358 const uint16_t *abuf0
, uint8_t *dest
, int dstW
, int uvalpha
, enum PixelFormat dstFormat
, int flags
, int y
)
1360 const int yalpha1
=0;
1363 const uint16_t *buf1
= buf0
; //FIXME needed for RGB1/BGR1
1364 const int yalpha
= 4096; //FIXME ...
1366 if (flags
&SWS_FULL_CHR_H_INT
) {
1367 c
->yuv2packed2(c
, buf0
, buf0
, uvbuf0
, uvbuf1
, abuf0
, abuf0
, dest
, dstW
, 0, uvalpha
, y
);
1371 #if COMPILE_TEMPLATE_MMX
1372 if(!(flags
& SWS_BITEXACT
)) {
1373 if (uvalpha
< 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1376 if (CONFIG_SWSCALE_ALPHA
&& c
->alpPixBuf
) {
1378 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1379 "mov %4, %%"REG_b
" \n\t"
1380 "push %%"REG_BP
" \n\t"
1381 YSCALEYUV2RGB1(%%REGBP
, %5)
1382 YSCALEYUV2RGB1_ALPHA(%%REGBP
)
1383 WRITEBGR32(%%REGb
, 8280(%5), %%REGBP
, %%mm2
, %%mm4
, %%mm5
, %%mm7
, %%mm0
, %%mm1
, %%mm3
, %%mm6
)
1384 "pop %%"REG_BP
" \n\t"
1385 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1387 :: "c" (buf0
), "d" (abuf0
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1392 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1393 "mov %4, %%"REG_b
" \n\t"
1394 "push %%"REG_BP
" \n\t"
1395 YSCALEYUV2RGB1(%%REGBP
, %5)
1396 "pcmpeqd %%mm7, %%mm7 \n\t"
1397 WRITEBGR32(%%REGb
, 8280(%5), %%REGBP
, %%mm2
, %%mm4
, %%mm5
, %%mm7
, %%mm0
, %%mm1
, %%mm3
, %%mm6
)
1398 "pop %%"REG_BP
" \n\t"
1399 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1401 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1408 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1409 "mov %4, %%"REG_b
" \n\t"
1410 "push %%"REG_BP
" \n\t"
1411 YSCALEYUV2RGB1(%%REGBP
, %5)
1412 "pxor %%mm7, %%mm7 \n\t"
1413 WRITEBGR24(%%REGb
, 8280(%5), %%REGBP
)
1414 "pop %%"REG_BP
" \n\t"
1415 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1417 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1421 case PIX_FMT_RGB555
:
1423 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1424 "mov %4, %%"REG_b
" \n\t"
1425 "push %%"REG_BP
" \n\t"
1426 YSCALEYUV2RGB1(%%REGBP
, %5)
1427 "pxor %%mm7, %%mm7 \n\t"
1428 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1430 "paddusb "BLUE_DITHER
"(%5), %%mm2 \n\t"
1431 "paddusb "GREEN_DITHER
"(%5), %%mm4 \n\t"
1432 "paddusb "RED_DITHER
"(%5), %%mm5 \n\t"
1434 WRITERGB15(%%REGb
, 8280(%5), %%REGBP
)
1435 "pop %%"REG_BP
" \n\t"
1436 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1438 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1442 case PIX_FMT_RGB565
:
1444 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1445 "mov %4, %%"REG_b
" \n\t"
1446 "push %%"REG_BP
" \n\t"
1447 YSCALEYUV2RGB1(%%REGBP
, %5)
1448 "pxor %%mm7, %%mm7 \n\t"
1449 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1451 "paddusb "BLUE_DITHER
"(%5), %%mm2 \n\t"
1452 "paddusb "GREEN_DITHER
"(%5), %%mm4 \n\t"
1453 "paddusb "RED_DITHER
"(%5), %%mm5 \n\t"
1456 WRITERGB16(%%REGb
, 8280(%5), %%REGBP
)
1457 "pop %%"REG_BP
" \n\t"
1458 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1460 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1464 case PIX_FMT_YUYV422
:
1466 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1467 "mov %4, %%"REG_b
" \n\t"
1468 "push %%"REG_BP
" \n\t"
1469 YSCALEYUV2PACKED1(%%REGBP
, %5)
1470 WRITEYUY2(%%REGb
, 8280(%5), %%REGBP
)
1471 "pop %%"REG_BP
" \n\t"
1472 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1474 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1482 if (CONFIG_SWSCALE_ALPHA
&& c
->alpPixBuf
) {
1484 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1485 "mov %4, %%"REG_b
" \n\t"
1486 "push %%"REG_BP
" \n\t"
1487 YSCALEYUV2RGB1b(%%REGBP
, %5)
1488 YSCALEYUV2RGB1_ALPHA(%%REGBP
)
1489 WRITEBGR32(%%REGb
, 8280(%5), %%REGBP
, %%mm2
, %%mm4
, %%mm5
, %%mm7
, %%mm0
, %%mm1
, %%mm3
, %%mm6
)
1490 "pop %%"REG_BP
" \n\t"
1491 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1493 :: "c" (buf0
), "d" (abuf0
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1498 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1499 "mov %4, %%"REG_b
" \n\t"
1500 "push %%"REG_BP
" \n\t"
1501 YSCALEYUV2RGB1b(%%REGBP
, %5)
1502 "pcmpeqd %%mm7, %%mm7 \n\t"
1503 WRITEBGR32(%%REGb
, 8280(%5), %%REGBP
, %%mm2
, %%mm4
, %%mm5
, %%mm7
, %%mm0
, %%mm1
, %%mm3
, %%mm6
)
1504 "pop %%"REG_BP
" \n\t"
1505 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1507 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1514 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1515 "mov %4, %%"REG_b
" \n\t"
1516 "push %%"REG_BP
" \n\t"
1517 YSCALEYUV2RGB1b(%%REGBP
, %5)
1518 "pxor %%mm7, %%mm7 \n\t"
1519 WRITEBGR24(%%REGb
, 8280(%5), %%REGBP
)
1520 "pop %%"REG_BP
" \n\t"
1521 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1523 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1527 case PIX_FMT_RGB555
:
1529 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1530 "mov %4, %%"REG_b
" \n\t"
1531 "push %%"REG_BP
" \n\t"
1532 YSCALEYUV2RGB1b(%%REGBP
, %5)
1533 "pxor %%mm7, %%mm7 \n\t"
1534 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1536 "paddusb "BLUE_DITHER
"(%5), %%mm2 \n\t"
1537 "paddusb "GREEN_DITHER
"(%5), %%mm4 \n\t"
1538 "paddusb "RED_DITHER
"(%5), %%mm5 \n\t"
1540 WRITERGB15(%%REGb
, 8280(%5), %%REGBP
)
1541 "pop %%"REG_BP
" \n\t"
1542 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1544 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1548 case PIX_FMT_RGB565
:
1550 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1551 "mov %4, %%"REG_b
" \n\t"
1552 "push %%"REG_BP
" \n\t"
1553 YSCALEYUV2RGB1b(%%REGBP
, %5)
1554 "pxor %%mm7, %%mm7 \n\t"
1555 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1557 "paddusb "BLUE_DITHER
"(%5), %%mm2 \n\t"
1558 "paddusb "GREEN_DITHER
"(%5), %%mm4 \n\t"
1559 "paddusb "RED_DITHER
"(%5), %%mm5 \n\t"
1562 WRITERGB16(%%REGb
, 8280(%5), %%REGBP
)
1563 "pop %%"REG_BP
" \n\t"
1564 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1566 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1570 case PIX_FMT_YUYV422
:
1572 "mov %%"REG_b
", "ESP_OFFSET
"(%5) \n\t"
1573 "mov %4, %%"REG_b
" \n\t"
1574 "push %%"REG_BP
" \n\t"
1575 YSCALEYUV2PACKED1b(%%REGBP
, %5)
1576 WRITEYUY2(%%REGb
, 8280(%5), %%REGBP
)
1577 "pop %%"REG_BP
" \n\t"
1578 "mov "ESP_OFFSET
"(%5), %%"REG_b
" \n\t"
1580 :: "c" (buf0
), "d" (buf1
), "S" (uvbuf0
), "D" (uvbuf1
), "m" (dest
),
1587 #endif /* COMPILE_TEMPLATE_MMX */
1588 if (uvalpha
< 2048) {
1589 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C
, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C
, YSCALE_YUV_2_MONO2_C
)
1591 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C
, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C
, YSCALE_YUV_2_MONO2_C
)
1595 //FIXME yuy2* can read up to 7 samples too much
1597 static inline void RENAME(yuy2ToY
)(uint8_t *dst
, const uint8_t *src
, long width
, uint32_t *unused
)
1599 #if COMPILE_TEMPLATE_MMX
1601 "movq "MANGLE(bm01010101
)", %%mm2 \n\t"
1602 "mov %0, %%"REG_a
" \n\t"
1604 "movq (%1, %%"REG_a
",2), %%mm0 \n\t"
1605 "movq 8(%1, %%"REG_a
",2), %%mm1 \n\t"
1606 "pand %%mm2, %%mm0 \n\t"
1607 "pand %%mm2, %%mm1 \n\t"
1608 "packuswb %%mm1, %%mm0 \n\t"
1609 "movq %%mm0, (%2, %%"REG_a
") \n\t"
1610 "add $8, %%"REG_a
" \n\t"
1612 : : "g" ((x86_reg
)-width
), "r" (src
+width
*2), "r" (dst
+width
)
1617 for (i
=0; i
<width
; i
++)
1622 static inline void RENAME(yuy2ToUV
)(uint8_t *dstU
, uint8_t *dstV
, const uint8_t *src1
, const uint8_t *src2
, long width
, uint32_t *unused
)
1624 #if COMPILE_TEMPLATE_MMX
1626 "movq "MANGLE(bm01010101
)", %%mm4 \n\t"
1627 "mov %0, %%"REG_a
" \n\t"
1629 "movq (%1, %%"REG_a
",4), %%mm0 \n\t"
1630 "movq 8(%1, %%"REG_a
",4), %%mm1 \n\t"
1631 "psrlw $8, %%mm0 \n\t"
1632 "psrlw $8, %%mm1 \n\t"
1633 "packuswb %%mm1, %%mm0 \n\t"
1634 "movq %%mm0, %%mm1 \n\t"
1635 "psrlw $8, %%mm0 \n\t"
1636 "pand %%mm4, %%mm1 \n\t"
1637 "packuswb %%mm0, %%mm0 \n\t"
1638 "packuswb %%mm1, %%mm1 \n\t"
1639 "movd %%mm0, (%3, %%"REG_a
") \n\t"
1640 "movd %%mm1, (%2, %%"REG_a
") \n\t"
1641 "add $4, %%"REG_a
" \n\t"
1643 : : "g" ((x86_reg
)-width
), "r" (src1
+width
*4), "r" (dstU
+width
), "r" (dstV
+width
)
1648 for (i
=0; i
<width
; i
++) {
1649 dstU
[i
]= src1
[4*i
+ 1];
1650 dstV
[i
]= src1
[4*i
+ 3];
1653 assert(src1
== src2
);
1656 static inline void RENAME(LEToUV
)(uint8_t *dstU
, uint8_t *dstV
, const uint8_t *src1
, const uint8_t *src2
, long width
, uint32_t *unused
)
1658 #if COMPILE_TEMPLATE_MMX
1660 "mov %0, %%"REG_a
" \n\t"
1662 "movq (%1, %%"REG_a
",2), %%mm0 \n\t"
1663 "movq 8(%1, %%"REG_a
",2), %%mm1 \n\t"
1664 "movq (%2, %%"REG_a
",2), %%mm2 \n\t"
1665 "movq 8(%2, %%"REG_a
",2), %%mm3 \n\t"
1666 "psrlw $8, %%mm0 \n\t"
1667 "psrlw $8, %%mm1 \n\t"
1668 "psrlw $8, %%mm2 \n\t"
1669 "psrlw $8, %%mm3 \n\t"
1670 "packuswb %%mm1, %%mm0 \n\t"
1671 "packuswb %%mm3, %%mm2 \n\t"
1672 "movq %%mm0, (%3, %%"REG_a
") \n\t"
1673 "movq %%mm2, (%4, %%"REG_a
") \n\t"
1674 "add $8, %%"REG_a
" \n\t"
1676 : : "g" ((x86_reg
)-width
), "r" (src1
+width
*2), "r" (src2
+width
*2), "r" (dstU
+width
), "r" (dstV
+width
)
1681 for (i
=0; i
<width
; i
++) {
1682 dstU
[i
]= src1
[2*i
+ 1];
1683 dstV
[i
]= src2
[2*i
+ 1];
1688 /* This is almost identical to the previous, and exists only because
1689 * yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses. */
1690 static inline void RENAME(uyvyToY
)(uint8_t *dst
, const uint8_t *src
, long width
, uint32_t *unused
)
1692 #if COMPILE_TEMPLATE_MMX
1694 "mov %0, %%"REG_a
" \n\t"
1696 "movq (%1, %%"REG_a
",2), %%mm0 \n\t"
1697 "movq 8(%1, %%"REG_a
",2), %%mm1 \n\t"
1698 "psrlw $8, %%mm0 \n\t"
1699 "psrlw $8, %%mm1 \n\t"
1700 "packuswb %%mm1, %%mm0 \n\t"
1701 "movq %%mm0, (%2, %%"REG_a
") \n\t"
1702 "add $8, %%"REG_a
" \n\t"
1704 : : "g" ((x86_reg
)-width
), "r" (src
+width
*2), "r" (dst
+width
)
1709 for (i
=0; i
<width
; i
++)
1714 static inline void RENAME(uyvyToUV
)(uint8_t *dstU
, uint8_t *dstV
, const uint8_t *src1
, const uint8_t *src2
, long width
, uint32_t *unused
)
1716 #if COMPILE_TEMPLATE_MMX
1718 "movq "MANGLE(bm01010101
)", %%mm4 \n\t"
1719 "mov %0, %%"REG_a
" \n\t"
1721 "movq (%1, %%"REG_a
",4), %%mm0 \n\t"
1722 "movq 8(%1, %%"REG_a
",4), %%mm1 \n\t"
1723 "pand %%mm4, %%mm0 \n\t"
1724 "pand %%mm4, %%mm1 \n\t"
1725 "packuswb %%mm1, %%mm0 \n\t"
1726 "movq %%mm0, %%mm1 \n\t"
1727 "psrlw $8, %%mm0 \n\t"
1728 "pand %%mm4, %%mm1 \n\t"
1729 "packuswb %%mm0, %%mm0 \n\t"
1730 "packuswb %%mm1, %%mm1 \n\t"
1731 "movd %%mm0, (%3, %%"REG_a
") \n\t"
1732 "movd %%mm1, (%2, %%"REG_a
") \n\t"
1733 "add $4, %%"REG_a
" \n\t"
1735 : : "g" ((x86_reg
)-width
), "r" (src1
+width
*4), "r" (dstU
+width
), "r" (dstV
+width
)
1740 for (i
=0; i
<width
; i
++) {
1741 dstU
[i
]= src1
[4*i
+ 0];
1742 dstV
[i
]= src1
[4*i
+ 2];
1745 assert(src1
== src2
);
1748 static inline void RENAME(BEToUV
)(uint8_t *dstU
, uint8_t *dstV
, const uint8_t *src1
, const uint8_t *src2
, long width
, uint32_t *unused
)
1750 #if COMPILE_TEMPLATE_MMX
1752 "movq "MANGLE(bm01010101
)", %%mm4 \n\t"
1753 "mov %0, %%"REG_a
" \n\t"
1755 "movq (%1, %%"REG_a
",2), %%mm0 \n\t"
1756 "movq 8(%1, %%"REG_a
",2), %%mm1 \n\t"
1757 "movq (%2, %%"REG_a
",2), %%mm2 \n\t"
1758 "movq 8(%2, %%"REG_a
",2), %%mm3 \n\t"
1759 "pand %%mm4, %%mm0 \n\t"
1760 "pand %%mm4, %%mm1 \n\t"
1761 "pand %%mm4, %%mm2 \n\t"
1762 "pand %%mm4, %%mm3 \n\t"
1763 "packuswb %%mm1, %%mm0 \n\t"
1764 "packuswb %%mm3, %%mm2 \n\t"
1765 "movq %%mm0, (%3, %%"REG_a
") \n\t"
1766 "movq %%mm2, (%4, %%"REG_a
") \n\t"
1767 "add $8, %%"REG_a
" \n\t"
1769 : : "g" ((x86_reg
)-width
), "r" (src1
+width
*2), "r" (src2
+width
*2), "r" (dstU
+width
), "r" (dstV
+width
)
1774 for (i
=0; i
<width
; i
++) {
1781 static inline void RENAME(nvXXtoUV
)(uint8_t *dst1
, uint8_t *dst2
,
1782 const uint8_t *src
, long width
)
1784 #if COMPILE_TEMPLATE_MMX
1786 "movq "MANGLE(bm01010101
)", %%mm4 \n\t"
1787 "mov %0, %%"REG_a
" \n\t"
1789 "movq (%1, %%"REG_a
",2), %%mm0 \n\t"
1790 "movq 8(%1, %%"REG_a
",2), %%mm1 \n\t"
1791 "movq %%mm0, %%mm2 \n\t"
1792 "movq %%mm1, %%mm3 \n\t"
1793 "pand %%mm4, %%mm0 \n\t"
1794 "pand %%mm4, %%mm1 \n\t"
1795 "psrlw $8, %%mm2 \n\t"
1796 "psrlw $8, %%mm3 \n\t"
1797 "packuswb %%mm1, %%mm0 \n\t"
1798 "packuswb %%mm3, %%mm2 \n\t"
1799 "movq %%mm0, (%2, %%"REG_a
") \n\t"
1800 "movq %%mm2, (%3, %%"REG_a
") \n\t"
1801 "add $8, %%"REG_a
" \n\t"
1803 : : "g" ((x86_reg
)-width
), "r" (src
+width
*2), "r" (dst1
+width
), "r" (dst2
+width
)
1808 for (i
= 0; i
< width
; i
++) {
1809 dst1
[i
] = src
[2*i
+0];
1810 dst2
[i
] = src
[2*i
+1];
1815 static inline void RENAME(nv12ToUV
)(uint8_t *dstU
, uint8_t *dstV
,
1816 const uint8_t *src1
, const uint8_t *src2
,
1817 long width
, uint32_t *unused
)
1819 RENAME(nvXXtoUV
)(dstU
, dstV
, src1
, width
);
1822 static inline void RENAME(nv21ToUV
)(uint8_t *dstU
, uint8_t *dstV
,
1823 const uint8_t *src1
, const uint8_t *src2
,
1824 long width
, uint32_t *unused
)
1826 RENAME(nvXXtoUV
)(dstV
, dstU
, src1
, width
);
1829 #if COMPILE_TEMPLATE_MMX
1830 static inline void RENAME(bgr24ToY_mmx
)(uint8_t *dst
, const uint8_t *src
, long width
, enum PixelFormat srcFormat
)
1833 if(srcFormat
== PIX_FMT_BGR24
) {
1835 "movq "MANGLE(ff_bgr24toY1Coeff
)", %%mm5 \n\t"
1836 "movq "MANGLE(ff_bgr24toY2Coeff
)", %%mm6 \n\t"
1841 "movq "MANGLE(ff_rgb24toY1Coeff
)", %%mm5 \n\t"
1842 "movq "MANGLE(ff_rgb24toY2Coeff
)", %%mm6 \n\t"
1848 "movq "MANGLE(ff_bgr24toYOffset
)", %%mm4 \n\t"
1849 "mov %2, %%"REG_a
" \n\t"
1850 "pxor %%mm7, %%mm7 \n\t"
1852 PREFETCH
" 64(%0) \n\t"
1853 "movd (%0), %%mm0 \n\t"
1854 "movd 2(%0), %%mm1 \n\t"
1855 "movd 6(%0), %%mm2 \n\t"
1856 "movd 8(%0), %%mm3 \n\t"
1858 "punpcklbw %%mm7, %%mm0 \n\t"
1859 "punpcklbw %%mm7, %%mm1 \n\t"
1860 "punpcklbw %%mm7, %%mm2 \n\t"
1861 "punpcklbw %%mm7, %%mm3 \n\t"
1862 "pmaddwd %%mm5, %%mm0 \n\t"
1863 "pmaddwd %%mm6, %%mm1 \n\t"
1864 "pmaddwd %%mm5, %%mm2 \n\t"
1865 "pmaddwd %%mm6, %%mm3 \n\t"
1866 "paddd %%mm1, %%mm0 \n\t"
1867 "paddd %%mm3, %%mm2 \n\t"
1868 "paddd %%mm4, %%mm0 \n\t"
1869 "paddd %%mm4, %%mm2 \n\t"
1870 "psrad $15, %%mm0 \n\t"
1871 "psrad $15, %%mm2 \n\t"
1872 "packssdw %%mm2, %%mm0 \n\t"
1873 "packuswb %%mm0, %%mm0 \n\t"
1874 "movd %%mm0, (%1, %%"REG_a
") \n\t"
1875 "add $4, %%"REG_a
" \n\t"
1878 : "r" (dst
+width
), "g" ((x86_reg
)-width
)
1883 static inline void RENAME(bgr24ToUV_mmx
)(uint8_t *dstU
, uint8_t *dstV
, const uint8_t *src
, long width
, enum PixelFormat srcFormat
)
1886 "movq 24+%4, %%mm6 \n\t"
1887 "mov %3, %%"REG_a
" \n\t"
1888 "pxor %%mm7, %%mm7 \n\t"
1890 PREFETCH
" 64(%0) \n\t"
1891 "movd (%0), %%mm0 \n\t"
1892 "movd 2(%0), %%mm1 \n\t"
1893 "punpcklbw %%mm7, %%mm0 \n\t"
1894 "punpcklbw %%mm7, %%mm1 \n\t"
1895 "movq %%mm0, %%mm2 \n\t"
1896 "movq %%mm1, %%mm3 \n\t"
1897 "pmaddwd %4, %%mm0 \n\t"
1898 "pmaddwd 8+%4, %%mm1 \n\t"
1899 "pmaddwd 16+%4, %%mm2 \n\t"
1900 "pmaddwd %%mm6, %%mm3 \n\t"
1901 "paddd %%mm1, %%mm0 \n\t"
1902 "paddd %%mm3, %%mm2 \n\t"
1904 "movd 6(%0), %%mm1 \n\t"
1905 "movd 8(%0), %%mm3 \n\t"
1907 "punpcklbw %%mm7, %%mm1 \n\t"
1908 "punpcklbw %%mm7, %%mm3 \n\t"
1909 "movq %%mm1, %%mm4 \n\t"
1910 "movq %%mm3, %%mm5 \n\t"
1911 "pmaddwd %4, %%mm1 \n\t"
1912 "pmaddwd 8+%4, %%mm3 \n\t"
1913 "pmaddwd 16+%4, %%mm4 \n\t"
1914 "pmaddwd %%mm6, %%mm5 \n\t"
1915 "paddd %%mm3, %%mm1 \n\t"
1916 "paddd %%mm5, %%mm4 \n\t"
1918 "movq "MANGLE(ff_bgr24toUVOffset
)", %%mm3 \n\t"
1919 "paddd %%mm3, %%mm0 \n\t"
1920 "paddd %%mm3, %%mm2 \n\t"
1921 "paddd %%mm3, %%mm1 \n\t"
1922 "paddd %%mm3, %%mm4 \n\t"
1923 "psrad $15, %%mm0 \n\t"
1924 "psrad $15, %%mm2 \n\t"
1925 "psrad $15, %%mm1 \n\t"
1926 "psrad $15, %%mm4 \n\t"
1927 "packssdw %%mm1, %%mm0 \n\t"
1928 "packssdw %%mm4, %%mm2 \n\t"
1929 "packuswb %%mm0, %%mm0 \n\t"
1930 "packuswb %%mm2, %%mm2 \n\t"
1931 "movd %%mm0, (%1, %%"REG_a
") \n\t"
1932 "movd %%mm2, (%2, %%"REG_a
") \n\t"
1933 "add $4, %%"REG_a
" \n\t"
1936 : "r" (dstU
+width
), "r" (dstV
+width
), "g" ((x86_reg
)-width
), "m"(ff_bgr24toUV
[srcFormat
== PIX_FMT_RGB24
][0])
1942 static inline void RENAME(bgr24ToY
)(uint8_t *dst
, const uint8_t *src
, long width
, uint32_t *unused
)
1944 #if COMPILE_TEMPLATE_MMX
1945 RENAME(bgr24ToY_mmx
)(dst
, src
, width
, PIX_FMT_BGR24
);
1948 for (i
=0; i
<width
; i
++) {
1953 dst
[i
]= ((RY
*r
+ GY
*g
+ BY
*b
+ (33<<(RGB2YUV_SHIFT
-1)))>>RGB2YUV_SHIFT
);
1955 #endif /* COMPILE_TEMPLATE_MMX */
1958 static inline void RENAME(bgr24ToUV
)(uint8_t *dstU
, uint8_t *dstV
, const uint8_t *src1
, const uint8_t *src2
, long width
, uint32_t *unused
)
1960 #if COMPILE_TEMPLATE_MMX
1961 RENAME(bgr24ToUV_mmx
)(dstU
, dstV
, src1
, width
, PIX_FMT_BGR24
);
1964 for (i
=0; i
<width
; i
++) {
1965 int b
= src1
[3*i
+ 0];
1966 int g
= src1
[3*i
+ 1];
1967 int r
= src1
[3*i
+ 2];
1969 dstU
[i
]= (RU
*r
+ GU
*g
+ BU
*b
+ (257<<(RGB2YUV_SHIFT
-1)))>>RGB2YUV_SHIFT
;
1970 dstV
[i
]= (RV
*r
+ GV
*g
+ BV
*b
+ (257<<(RGB2YUV_SHIFT
-1)))>>RGB2YUV_SHIFT
;
1972 #endif /* COMPILE_TEMPLATE_MMX */
1973 assert(src1
== src2
);
1976 static inline void RENAME(bgr24ToUV_half
)(uint8_t *dstU
, uint8_t *dstV
, const uint8_t *src1
, const uint8_t *src2
, long width
, uint32_t *unused
)
1979 for (i
=0; i
<width
; i
++) {
1980 int b
= src1
[6*i
+ 0] + src1
[6*i
+ 3];
1981 int g
= src1
[6*i
+ 1] + src1
[6*i
+ 4];
1982 int r
= src1
[6*i
+ 2] + src1
[6*i
+ 5];
1984 dstU
[i
]= (RU
*r
+ GU
*g
+ BU
*b
+ (257<<RGB2YUV_SHIFT
))>>(RGB2YUV_SHIFT
+1);
1985 dstV
[i
]= (RV
*r
+ GV
*g
+ BV
*b
+ (257<<RGB2YUV_SHIFT
))>>(RGB2YUV_SHIFT
+1);
1987 assert(src1
== src2
);
1990 static inline void RENAME(rgb24ToY
)(uint8_t *dst
, const uint8_t *src
, long width
, uint32_t *unused
)
1992 #if COMPILE_TEMPLATE_MMX
1993 RENAME(bgr24ToY_mmx
)(dst
, src
, width
, PIX_FMT_RGB24
);
1996 for (i
=0; i
<width
; i
++) {
2001 dst
[i
]= ((RY
*r
+ GY
*g
+ BY
*b
+ (33<<(RGB2YUV_SHIFT
-1)))>>RGB2YUV_SHIFT
);
2006 static inline void RENAME(rgb24ToUV
)(uint8_t *dstU
, uint8_t *dstV
, const uint8_t *src1
, const uint8_t *src2
, long width
, uint32_t *unused
)
2008 #if COMPILE_TEMPLATE_MMX
2010 RENAME(bgr24ToUV_mmx
)(dstU
, dstV
, src1
, width
, PIX_FMT_RGB24
);
2014 for (i
=0; i
<width
; i
++) {
2015 int r
= src1
[3*i
+ 0];
2016 int g
= src1
[3*i
+ 1];
2017 int b
= src1
[3*i
+ 2];
2019 dstU
[i
]= (RU
*r
+ GU
*g
+ BU
*b
+ (257<<(RGB2YUV_SHIFT
-1)))>>RGB2YUV_SHIFT
;
2020 dstV
[i
]= (RV
*r
+ GV
*g
+ BV
*b
+ (257<<(RGB2YUV_SHIFT
-1)))>>RGB2YUV_SHIFT
;
2025 static inline void RENAME(rgb24ToUV_half
)(uint8_t *dstU
, uint8_t *dstV
, const uint8_t *src1
, const uint8_t *src2
, long width
, uint32_t *unused
)
2029 for (i
=0; i
<width
; i
++) {
2030 int r
= src1
[6*i
+ 0] + src1
[6*i
+ 3];
2031 int g
= src1
[6*i
+ 1] + src1
[6*i
+ 4];
2032 int b
= src1
[6*i
+ 2] + src1
[6*i
+ 5];
2034 dstU
[i
]= (RU
*r
+ GU
*g
+ BU
*b
+ (257<<RGB2YUV_SHIFT
))>>(RGB2YUV_SHIFT
+1);
2035 dstV
[i
]= (RV
*r
+ GV
*g
+ BV
*b
+ (257<<RGB2YUV_SHIFT
))>>(RGB2YUV_SHIFT
+1);
// bilinear / bicubic scaling
/* Horizontal scale: convolve one 8-bit source line with signed 16-bit FIR
 * coefficients into an int16 destination, descaled by >>7 (15-bit output).
 * Three implementations are selected at compile time: MMX inline asm
 * (specialised unrolled paths for filterSize 4 and 8 plus a generic inner
 * loop), AltiVec, and a portable C fallback.
 * NOTE(review): this chunk is extraction-damaged — the `__asm__ volatile(`
 * openers, asm loop labels and branches, `#elif`/`#else` lines and several
 * braces are missing; the code below is kept byte-identical to what is
 * visible.  Do not assume it is complete. */
static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc,
                                  const int16_t *filter, const int16_t *filterPos, long filterSize)
#if COMPILE_TEMPLATE_MMX
    assert(filterSize % 4 == 0 && filterSize>0);
    if (filterSize==4) { // Always true for upscaling, sometimes for down, too.
        /* counter runs from -2*dstW up to 0; filterPos is pre-biased so it
           can be indexed by the (negative) counter */
        x86_reg counter= -2*dstW;
        filterPos-= counter/2;
        "push %%"REG_b" \n\t"
        "pxor %%mm7, %%mm7 \n\t"          /* mm7 = 0, for byte->word unpack */
        "push %%"REG_BP" \n\t" // we use 7 regs here ...
        "mov %%"REG_a", %%"REG_BP" \n\t"
        "movzwl (%2, %%"REG_BP"), %%eax \n\t"
        "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
        "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
        "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
        "movd (%3, %%"REG_a"), %%mm0 \n\t"
        "movd (%3, %%"REG_b"), %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "pmaddwd %%mm1, %%mm0 \n\t"       /* coeff * sample pairs */
        "pmaddwd %%mm2, %%mm3 \n\t"
        "movq %%mm0, %%mm4 \n\t"
        "punpckldq %%mm3, %%mm0 \n\t"
        "punpckhdq %%mm3, %%mm4 \n\t"
        "paddd %%mm4, %%mm0 \n\t"         /* horizontal add of partial sums */
        "psrad $7, %%mm0 \n\t"            /* descale to 15-bit result */
        "packssdw %%mm0, %%mm0 \n\t"
        "movd %%mm0, (%4, %%"REG_BP") \n\t"
        "add $4, %%"REG_BP" \n\t"
        "pop %%"REG_BP" \n\t"
        "pop %%"REG_b" \n\t"
        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
    } else if (filterSize==8) {
        x86_reg counter= -2*dstW;
        filterPos-= counter/2;
        "push %%"REG_b" \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "push %%"REG_BP" \n\t" // we use 7 regs here ...
        "mov %%"REG_a", %%"REG_BP" \n\t"
        "movzwl (%2, %%"REG_BP"), %%eax \n\t"
        "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
        "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
        "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
        "movd (%3, %%"REG_a"), %%mm0 \n\t"
        "movd (%3, %%"REG_b"), %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "pmaddwd %%mm1, %%mm0 \n\t"
        "pmaddwd %%mm2, %%mm3 \n\t"
        /* second half of the 8-tap filter */
        "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
        "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
        "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
        "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm4 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "pmaddwd %%mm1, %%mm4 \n\t"
        "pmaddwd %%mm2, %%mm5 \n\t"
        "paddd %%mm4, %%mm0 \n\t"
        "paddd %%mm5, %%mm3 \n\t"
        "movq %%mm0, %%mm4 \n\t"
        "punpckldq %%mm3, %%mm0 \n\t"
        "punpckhdq %%mm3, %%mm4 \n\t"
        "paddd %%mm4, %%mm0 \n\t"
        "psrad $7, %%mm0 \n\t"
        "packssdw %%mm0, %%mm0 \n\t"
        "movd %%mm0, (%4, %%"REG_BP") \n\t"
        "add $4, %%"REG_BP" \n\t"
        "pop %%"REG_BP" \n\t"
        "pop %%"REG_b" \n\t"
        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
        /* generic MMX path for arbitrary filterSize (multiple of 4):
           inner loop accumulates into mm4/mm5 until the source pointer
           reaches `offset` = src+filterSize */
        const uint8_t *offset = src+filterSize;
        x86_reg counter= -2*dstW;
        //filter-= counter*filterSize/2;
        filterPos-= counter/2;
        "pxor %%mm7, %%mm7 \n\t"
        "mov %2, %%"REG_c" \n\t"
        "movzwl (%%"REG_c", %0), %%eax \n\t"
        "movzwl 2(%%"REG_c", %0), %%edx \n\t"
        "mov %5, %%"REG_c" \n\t"
        "pxor %%mm4, %%mm4 \n\t"
        "pxor %%mm5, %%mm5 \n\t"
        "movq (%1), %%mm1 \n\t"
        "movq (%1, %6), %%mm3 \n\t"
        "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
        "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "pmaddwd %%mm1, %%mm0 \n\t"
        "pmaddwd %%mm2, %%mm3 \n\t"
        "paddd %%mm3, %%mm5 \n\t"
        "paddd %%mm0, %%mm4 \n\t"
        "add $4, %%"REG_c" \n\t"
        "cmp %4, %%"REG_c" \n\t"
        "movq %%mm4, %%mm0 \n\t"
        "punpckldq %%mm5, %%mm4 \n\t"
        "punpckhdq %%mm5, %%mm0 \n\t"
        "paddd %%mm0, %%mm4 \n\t"
        "psrad $7, %%mm4 \n\t"
        "packssdw %%mm4, %%mm4 \n\t"
        "mov %3, %%"REG_a" \n\t"
        "movd %%mm4, (%%"REG_a", %0) \n\t"
        : "+r" (counter), "+r" (filter)
        : "m" (filterPos), "m" (dst), "m"(offset),
          "m" (src), "r" ((x86_reg)filterSize*2)
        : "%"REG_a, "%"REG_c, "%"REG_d
#if COMPILE_TEMPLATE_ALTIVEC
    hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
    /* portable C fallback: straightforward FIR convolution */
    for (i=0; i<dstW; i++) {
        int srcPos= filterPos[i];
        //printf("filterPos: %d\n", filterPos[i]);
        for (j=0; j<filterSize; j++) {
            //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
            val += ((int)src[srcPos + j])*filter[filterSize*i + j];
        //filter += hFilterSize;
        dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
#endif /* COMPILE_TEMPLATE_ALTIVEC */
#endif /* COMPILE_MMX */
2216 //FIXME all pal and rgb srcFormats could do this convertion as well
2217 //FIXME all scalers more complex than bilinear could do half of this transform
2218 static void RENAME(chrRangeToJpeg
)(uint16_t *dst
, int width
)
2221 for (i
= 0; i
< width
; i
++) {
2222 dst
[i
] = (FFMIN(dst
[i
],30775)*4663 - 9289992)>>12; //-264
2223 dst
[i
+VOFW
] = (FFMIN(dst
[i
+VOFW
],30775)*4663 - 9289992)>>12; //-264
2226 static void RENAME(chrRangeFromJpeg
)(uint16_t *dst
, int width
)
2229 for (i
= 0; i
< width
; i
++) {
2230 dst
[i
] = (dst
[i
]*1799 + 4081085)>>11; //1469
2231 dst
[i
+VOFW
] = (dst
[i
+VOFW
]*1799 + 4081085)>>11; //1469
2234 static void RENAME(lumRangeToJpeg
)(uint16_t *dst
, int width
)
2237 for (i
= 0; i
< width
; i
++)
2238 dst
[i
] = (FFMIN(dst
[i
],30189)*19077 - 39057361)>>14;
2240 static void RENAME(lumRangeFromJpeg
)(uint16_t *dst
, int width
)
2243 for (i
= 0; i
< width
; i
++)
2244 dst
[i
] = (dst
[i
]*14071 + 33561947)>>14;
/* Inline-asm snippet for one fast-bilinear sample: given src[xx] in edi,
 * src[xx+1] in esi and xalpha in ecx, leaves the interpolated value in esi
 * (descaled by >>9) and loads the destination pointer into REG_D.
 * NOTE(review): the macro continues past the last visible line of this
 * extraction (trailing backslash); kept byte-identical. */
#define FAST_BILINEAR_X86 \
    "subl %%edi, %%esi \n\t" /* src[xx+1] - src[xx] */ \
    "imull %%ecx, %%esi \n\t" /* (src[xx+1] - src[xx])*xalpha */ \
    "shll $16, %%edi \n\t" \
    "addl %%edi, %%esi \n\t" /* src[xx+1]*xalpha + src[xx]*(1-xalpha) */ \
    "mov %1, %%"REG_D"\n\t" \
    "shrl $9, %%esi \n\t" \
/* Fast bilinear horizontal scaling of one luma line into 15-bit int16.
 * On MMX2-capable builds it jumps into runtime-generated filter code
 * (c->lumMmx2FilterCode) via CALL_MMX2_FILTER_CODE; otherwise a plain x86
 * asm loop steps through the source with a 16.16 fixed-point position
 * (xx = integer part, xalpha = fractional part).
 * NOTE(review): extraction-damaged — `__asm__ volatile(` openers, asm
 * labels, `#if ARCH_X86_64`/`#else` around the two CALL_MMX2_FILTER_CODE
 * definitions, and several braces are missing; visible code kept
 * byte-identical. */
static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
                                        long dstWidth, const uint8_t *src, int srcW,
#if COMPILE_TEMPLATE_MMX2
    int32_t *filterPos = c->hLumFilterPos;
    int16_t *filter = c->hLumFilter;
    int canMMX2BeUsed = c->canMMX2BeUsed;
    void *mmx2FilterCode= c->lumMmx2FilterCode;
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
    if (canMMX2BeUsed) {
        "mov %%"REG_b", %5 \n\t"       /* save ebx/rbx (PIC / callee-saved) */
        "pxor %%mm7, %%mm7 \n\t"
        "mov %0, %%"REG_c" \n\t"
        "mov %1, %%"REG_D" \n\t"
        "mov %2, %%"REG_d" \n\t"
        "mov %3, %%"REG_b" \n\t"
        "xor %%"REG_a", %%"REG_a" \n\t" // i
        PREFETCH" (%%"REG_c") \n\t"
        PREFETCH" 32(%%"REG_c") \n\t"
        PREFETCH" 64(%%"REG_c") \n\t"
/* 64-bit variant: presumably guarded by an elided #if ARCH_X86_64 — TODO confirm */
#define CALL_MMX2_FILTER_CODE \
    "movl (%%"REG_b"), %%esi \n\t"\
    "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
    "add %%"REG_S", %%"REG_c" \n\t"\
    "add %%"REG_a", %%"REG_D" \n\t"\
    "xor %%"REG_a", %%"REG_a" \n\t"\
/* 32-bit variant of the same macro (elided #else branch) */
#define CALL_MMX2_FILTER_CODE \
    "movl (%%"REG_b"), %%esi \n\t"\
    "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
    "add %%"REG_a", %%"REG_D" \n\t"\
    "xor %%"REG_a", %%"REG_a" \n\t"\
#endif /* ARCH_X86_64 */
    /* eight chained invocations of the generated filter code */
    CALL_MMX2_FILTER_CODE
    CALL_MMX2_FILTER_CODE
    CALL_MMX2_FILTER_CODE
    CALL_MMX2_FILTER_CODE
    CALL_MMX2_FILTER_CODE
    CALL_MMX2_FILTER_CODE
    CALL_MMX2_FILTER_CODE
    CALL_MMX2_FILTER_CODE
    "mov %5, %%"REG_b" \n\t"           /* restore ebx/rbx */
    :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
       "m" (mmx2FilterCode)
    : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
    /* pad the tail where the scaled position would read past srcW-1 */
    for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
#endif /* COMPILE_TEMPLATE_MMX2 */
    x86_reg xInc_shr16 = xInc >> 16;
    uint16_t xInc_mask = xInc & 0xffff;
    x86_reg dstWidth_reg = dstWidth;
    //NO MMX just normal asm ...
    "xor %%"REG_a", %%"REG_a" \n\t" // i
    "xor %%"REG_d", %%"REG_d" \n\t" // xx
    "xorl %%ecx, %%ecx \n\t" // xalpha
    /* loop body processes two output samples per iteration */
    "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
    "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
    "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
    "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
    "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
    "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
    "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
    "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t"
    "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
    "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
    "add $2, %%"REG_a" \n\t"
    "cmp %2, %%"REG_a" \n\t"
    :: "r" (src), "m" (dst), "m" (dstWidth_reg), "m" (xInc_shr16), "m" (xInc_mask)
    : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
#if COMPILE_TEMPLATE_MMX2
    } //if MMX2 can't be used
    /* C reference implementation of the same fast bilinear */
    unsigned int xpos=0;
    for (i=0;i<dstWidth;i++) {
        register unsigned int xx=xpos>>16;
        register unsigned int xalpha=(xpos&0xFFFF)>>9;
        dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
#endif /* ARCH_X86 */
// *** horizontal scale Y line to temp buffer
/* Scale one luma (or alpha, when isAlpha) input line into dst:
 * 1) convert the source pixel format to 8-bit Y via the per-format
 *    toYV12 callback (into formatConvBuffer) when one is installed,
 * 2) horizontally scale with either the generic c->hScale or the fast
 *    bilinear path,
 * 3) apply luma range conversion (skipped for alpha).
 * NOTE(review): extraction artifact — braces and the conditionals guarding
 * the toYV12/convertRange calls are partly missing; visible code kept
 * byte-identical. */
static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src, int srcW, int xInc,
                                   const int16_t *hLumFilter,
                                   const int16_t *hLumFilterPos, int hLumFilterSize,
                                   uint8_t *formatConvBuffer,
                                   uint32_t *pal, int isAlpha)
    void (*toYV12)(uint8_t *, const uint8_t *, long, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
    void (*convertRange)(uint16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
    src += isAlpha ? c->alpSrcOffset : c->lumSrcOffset;
    toYV12(formatConvBuffer, src, srcW, pal);
    src= formatConvBuffer;
    if (!c->hyscale_fast) {
        c->hScale(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
    } else { // fast bilinear upscale / crap downscale
        c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
    convertRange(dst, dstWidth);
/* Fast bilinear horizontal scaling of one chroma line pair: U is written to
 * dst[0..] and V to dst[VOFW..] in one pass.  Mirrors hyscale_fast but runs
 * the generated MMX2 filter code (c->chrMmx2FilterCode) twice — once per
 * plane — and the plain-asm loop interleaves both planes per iteration.
 * NOTE(review): extraction-damaged — `__asm__ volatile(` openers, asm
 * labels, several `#if/#else/#endif` lines and braces are missing; visible
 * code kept byte-identical (the two trailing dst[] lines appear to belong
 * to an alternative rounding variant — TODO confirm against the full file). */
static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
                                        long dstWidth, const uint8_t *src1,
                                        const uint8_t *src2, int srcW, int xInc)
#if COMPILE_TEMPLATE_MMX2
    int32_t *filterPos = c->hChrFilterPos;
    int16_t *filter = c->hChrFilter;
    int canMMX2BeUsed = c->canMMX2BeUsed;
    void *mmx2FilterCode= c->chrMmx2FilterCode;
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
    if (canMMX2BeUsed) {
        "mov %%"REG_b", %6 \n\t"        /* save ebx/rbx */
        "pxor %%mm7, %%mm7 \n\t"
        "mov %0, %%"REG_c" \n\t"
        "mov %1, %%"REG_D" \n\t"
        "mov %2, %%"REG_d" \n\t"
        "mov %3, %%"REG_b" \n\t"
        "xor %%"REG_a", %%"REG_a" \n\t" // i
        PREFETCH" (%%"REG_c") \n\t"
        PREFETCH" 32(%%"REG_c") \n\t"
        PREFETCH" 64(%%"REG_c") \n\t"
        /* first plane (src1 -> dst) */
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        /* second plane (src2 -> dst+VOF) */
        "xor %%"REG_a", %%"REG_a" \n\t" // i
        "mov %5, %%"REG_c" \n\t" // src
        "mov %1, %%"REG_D" \n\t" // buf1
        "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t"
        PREFETCH" (%%"REG_c") \n\t"
        PREFETCH" 32(%%"REG_c") \n\t"
        PREFETCH" 64(%%"REG_c") \n\t"
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        CALL_MMX2_FILTER_CODE
        "mov %6, %%"REG_b" \n\t"        /* restore ebx/rbx */
        :: "m" (src1), "m" (dst), "m" (filter), "m" (filterPos),
           "m" (mmx2FilterCode), "m" (src2)
        : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
    /* pad the tail where the scaled position would read past srcW-1 */
    for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
        //printf("%d %d %d\n", dstWidth, i, srcW);
        dst[i] = src1[srcW-1]*128;
        dst[i+VOFW] = src2[srcW-1]*128;
#endif /* COMPILE_TEMPLATE_MMX2 */
    x86_reg xInc_shr16 = (x86_reg) (xInc >> 16);
    uint16_t xInc_mask = xInc & 0xffff;
    x86_reg dstWidth_reg = dstWidth;
    "xor %%"REG_a", %%"REG_a" \n\t" // i
    "xor %%"REG_d", %%"REG_d" \n\t" // xx
    "xorl %%ecx, %%ecx \n\t" // xalpha
    "mov %0, %%"REG_S" \n\t"
    "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
    "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
    "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
    "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx]
    "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1]
    "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t"
    "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
    "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
    "add $1, %%"REG_a" \n\t"
    "cmp %2, %%"REG_a" \n\t"
    /* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
       which is needed to support GCC 4.0. */
#if ARCH_X86_64 && AV_GCC_VERSION_AT_LEAST(3,4)
    :: "m" (src1), "m" (dst), "g" (dstWidth_reg), "m" (xInc_shr16), "m" (xInc_mask),
    :: "m" (src1), "m" (dst), "m" (dstWidth_reg), "m" (xInc_shr16), "m" (xInc_mask),
    : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
#if COMPILE_TEMPLATE_MMX2
    } //if MMX2 can't be used
    /* C reference implementation */
    unsigned int xpos=0;
    for (i=0;i<dstWidth;i++) {
        register unsigned int xx=xpos>>16;
        register unsigned int xalpha=(xpos&0xFFFF)>>9;
        dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
        dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
        dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
        dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
#endif /* ARCH_X86 */
/* Scale one chroma line pair (U -> dst[0..], V -> dst[VOFW..]):
 * 1) optional per-format chrToYV12 conversion into formatConvBuffer,
 * 2) horizontal scaling via generic c->hScale (called once per plane) or
 *    the combined fast bilinear path,
 * 3) optional chroma range conversion.
 * NOTE(review): extraction artifact — the parameter list is truncated here
 * (a trailing parameter, presumably `uint32_t *pal`, and the guarding
 * conditionals/braces are elided — TODO confirm); visible code kept
 * byte-identical. */
inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src1, const uint8_t *src2,
                                   int srcW, int xInc, const int16_t *hChrFilter,
                                   const int16_t *hChrFilterPos, int hChrFilterSize,
                                   uint8_t *formatConvBuffer,
    src1 += c->chrSrcOffset;
    src2 += c->chrSrcOffset;
    c->chrToYV12(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
    src1= formatConvBuffer;
    src2= formatConvBuffer+VOFW;
    if (!c->hcscale_fast) {
        c->hScale(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
        c->hScale(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
    } else { // fast bilinear upscale / crap downscale
        c->hcscale_fast(c, dst, dstWidth, src1, src2, srcW, xInc);
    if (c->chrConvertRange)
        c->chrConvertRange(dst, dstWidth);
/* Compile-time switch for verbose ring-buffer tracing in swScale();
 * DEBUG_BUFFERS() expands to an av_log() call only when enabled and relies
 * on a variable `c` being in scope at the call site. */
#define DEBUG_SWSCALE_BUFFERS 0
#define DEBUG_BUFFERS(...) if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__)
/* Main template scaling entry point: consumes one horizontal slice of the
 * source picture and emits as many destination lines as the slice allows.
 * Structure: cache context fields in locals; horizontally scale needed
 * source lines into the luma/chroma ring buffers (lumPixBuf/chrPixBuf/
 * alpPixBuf); then vertically scale + output each dstY line through the
 * per-format yuv2* callbacks.  Returns the number of lines output.
 * NOTE(review): this chunk is heavily extraction-damaged — many original
 * lines are elided (declarations of dstY/lastDstY/enough_lines/i,
 * several braces, #else/#endif lines, and the function names on several
 * vertical-scaler call sites, whose argument lists are still visible).
 * Code below is kept byte-identical to what is visible; do not assume it
 * is complete. */
static int RENAME(swScale)(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY,
                           int srcSliceH, uint8_t* dst[], int dstStride[])
    /* load a few things into local vars to make the code more readable? and faster */
    const int srcW= c->srcW;
    const int dstW= c->dstW;
    const int dstH= c->dstH;
    const int chrDstW= c->chrDstW;
    const int chrSrcW= c->chrSrcW;
    const int lumXInc= c->lumXInc;
    const int chrXInc= c->chrXInc;
    const enum PixelFormat dstFormat= c->dstFormat;
    const int flags= c->flags;
    int16_t *vLumFilterPos= c->vLumFilterPos;
    int16_t *vChrFilterPos= c->vChrFilterPos;
    int16_t *hLumFilterPos= c->hLumFilterPos;
    int16_t *hChrFilterPos= c->hChrFilterPos;
    int16_t *vLumFilter= c->vLumFilter;
    int16_t *vChrFilter= c->vChrFilter;
    int16_t *hLumFilter= c->hLumFilter;
    int16_t *hChrFilter= c->hChrFilter;
    int32_t *lumMmxFilter= c->lumMmxFilter;
    int32_t *chrMmxFilter= c->chrMmxFilter;
    int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
    const int vLumFilterSize= c->vLumFilterSize;
    const int vChrFilterSize= c->vChrFilterSize;
    const int hLumFilterSize= c->hLumFilterSize;
    const int hChrFilterSize= c->hChrFilterSize;
    int16_t **lumPixBuf= c->lumPixBuf;
    int16_t **chrPixBuf= c->chrPixBuf;
    int16_t **alpPixBuf= c->alpPixBuf;
    const int vLumBufSize= c->vLumBufSize;
    const int vChrBufSize= c->vChrBufSize;
    uint8_t *formatConvBuffer= c->formatConvBuffer;
    /* chroma slice bounds: H rounded up to cover partially-subsampled rows */
    const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
    const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
    uint32_t *pal=c->pal_yuv;
    /* vars which will change and which we need to store back in the context */
    int lumBufIndex= c->lumBufIndex;
    int chrBufIndex= c->chrBufIndex;
    int lastInLumBuf= c->lastInLumBuf;
    int lastInChrBuf= c->lastInChrBuf;
    if (isPacked(c->srcFormat)) {
    srcStride[3]= srcStride[0];
    srcStride[1]<<= c->vChrDrop;
    srcStride[2]<<= c->vChrDrop;
    DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
                  src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
                  dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
    DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
                  srcSliceY, srcSliceH, dstY, dstH);
    DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
                  vLumFilterSize, vLumBufSize, vChrFilterSize, vChrBufSize);
    if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
        static int warnedAlready=0; //FIXME move this into the context perhaps
        if (flags & SWS_PRINT_INFO && !warnedAlready) {
            av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
                   " ->cannot do aligned memory accesses anymore\n");
    /* Note the user might start scaling the picture in the middle so this
       will not get executed. This is not really intended but works
       currently, so people might do it. */
    if (srcSliceY ==0) {
    for (;dstY < dstH; dstY++) {
        unsigned char *dest =dst[0]+dstStride[0]*dstY;
        const int chrDstY= dstY>>c->chrDstVSubSample;
        unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
        unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
        unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL;
        const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
        const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)];
        const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
        int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
        int lastLumSrcY2=firstLumSrcY2 + vLumFilterSize -1; // Last line needed as input
        int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
        //handle holes (FAST_BILINEAR & weird filters)
        if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
        if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
        assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
        assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
        DEBUG_BUFFERS("dstY: %d\n", dstY);
        DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
                      firstLumSrcY, lastLumSrcY, lastInLumBuf);
        DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
                      firstChrSrcY, lastChrSrcY, lastInChrBuf);
        // Do we have enough lines in this slice to output the dstY line
        enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);
        if (!enough_lines) {
            /* not enough input yet: clamp to what the slice provides and
               just buffer the scaled lines for the next call */
            lastLumSrcY = srcSliceY + srcSliceH - 1;
            lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
            DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
                          lastLumSrcY, lastChrSrcY);
        //Do horizontal scaling
        while(lastInLumBuf < lastLumSrcY) {
            const uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
            const uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
            assert(lumBufIndex < 2*vLumBufSize);
            assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
            assert(lastInLumBuf + 1 - srcSliceY >= 0);
            RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
                            hLumFilter, hLumFilterPos, hLumFilterSize,
            if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
                RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc,
                                hLumFilter, hLumFilterPos, hLumFilterSize,
            DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
                          lumBufIndex, lastInLumBuf);
        while(lastInChrBuf < lastChrSrcY) {
            const uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
            const uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
            assert(chrBufIndex < 2*vChrBufSize);
            assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
            assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
            //FIXME replace parameters through context struct (some at least)
            if (c->needs_hcscale)
                RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
                                hChrFilter, hChrFilterPos, hChrFilterSize,
            DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
                          chrBufIndex, lastInChrBuf);
        //wrap buf index around to stay inside the ring buffer
        if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
        if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
        break; //we can't output a dstY line so let's try with the next slice
#if COMPILE_TEMPLATE_MMX
        /* per-line ordered-dither tables for the packed-RGB output paths */
        c->blueDither= ff_dither8[dstY&1];
        if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
            c->greenDither= ff_dither8[dstY&1];
            c->greenDither= ff_dither4[dstY&1];
        c->redDither= ff_dither8[(dstY+1)&1];
        if (dstY < dstH-2) {
            /* ring-buffer window of input lines feeding the vertical filter */
            const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            const int16_t **chrSrcPtr= (const int16_t **) chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
#if COMPILE_TEMPLATE_MMX
            /* fill the interleaved pointer+coefficient tables consumed by
               the MMX vertical scalers; layout differs for ACCURATE_RND */
            if (flags & SWS_ACCURATE_RND) {
                int s= APCK_SIZE / 8;
                for (i=0; i<vLumFilterSize; i+=2) {
                    *(const void**)&lumMmxFilter[s*i ]= lumSrcPtr[i ];
                    *(const void**)&lumMmxFilter[s*i+APCK_PTR2/4 ]= lumSrcPtr[i+(vLumFilterSize>1)];
                    lumMmxFilter[s*i+APCK_COEF/4 ]=
                    lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i ]
                        + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
                    if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
                        *(const void**)&alpMmxFilter[s*i ]= alpSrcPtr[i ];
                        *(const void**)&alpMmxFilter[s*i+APCK_PTR2/4 ]= alpSrcPtr[i+(vLumFilterSize>1)];
                        alpMmxFilter[s*i+APCK_COEF/4 ]=
                        alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4 ];
                for (i=0; i<vChrFilterSize; i+=2) {
                    *(const void**)&chrMmxFilter[s*i ]= chrSrcPtr[i ];
                    *(const void**)&chrMmxFilter[s*i+APCK_PTR2/4 ]= chrSrcPtr[i+(vChrFilterSize>1)];
                    chrMmxFilter[s*i+APCK_COEF/4 ]=
                    chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i ]
                        + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
                for (i=0; i<vLumFilterSize; i++) {
                    /* split each line pointer into two 32-bit halves */
                    lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
                    lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
                    lumMmxFilter[4*i+2]=
                    lumMmxFilter[4*i+3]=
                        ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
                    if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
                        alpMmxFilter[4*i+0]= (int32_t)alpSrcPtr[i];
                        alpMmxFilter[4*i+1]= (uint64_t)alpSrcPtr[i] >> 32;
                        alpMmxFilter[4*i+2]=
                        alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
                for (i=0; i<vChrFilterSize; i++) {
                    chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
                    chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
                    chrMmxFilter[4*i+2]=
                    chrMmxFilter[4*i+3]=
                        ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                /* NOTE(review): the callee name of this vertical-scale call is
                   elided from this extraction; argument list kept verbatim */
                         vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
                         vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                         dest, uDest, dstW, chrDstW, dstFormat);
            } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                if (is16BPS(dstFormat)) {
                             vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
                             vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                             alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
                } else if (vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
                    const int16_t *lumBuf = lumSrcPtr[0];
                    const int16_t *chrBuf= chrSrcPtr[0];
                    const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
                    c->yuv2yuv1(c, lumBuf, chrBuf, alpBuf, dest, uDest, vDest, aDest, dstW, chrDstW);
                } else { //General YV12
                         vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
                         vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                         alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if (vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
                    int chrAlpha= vChrFilter[2*dstY+1];
                    if(flags & SWS_FULL_CHR_H_INT) {
                        yuv2rgbXinC_full(c, //FIXME write a packed1_full function
                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                         alpSrcPtr, dest, dstW, dstY);
                        c->yuv2packed1(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
                                       alpPixBuf ? *alpSrcPtr : NULL,
                                       dest, dstW, chrAlpha, dstFormat, flags, dstY);
                } else if (vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
                    int lumAlpha= vLumFilter[2*dstY+1];
                    int chrAlpha= vChrFilter[2*dstY+1];
                    lumMmxFilter[3]= vLumFilter[2*dstY ]*0x10001;
                    chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
                    if(flags & SWS_FULL_CHR_H_INT) {
                        yuv2rgbXinC_full(c, //FIXME write a packed2_full function
                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                         alpSrcPtr, dest, dstW, dstY);
                        c->yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
                                       alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL,
                                       dest, dstW, lumAlpha, chrAlpha, dstY);
                } else { //general RGB
                    if(flags & SWS_FULL_CHR_H_INT) {
                            vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                            vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                            alpSrcPtr, dest, dstW, dstY);
                            vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                            vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                            alpSrcPtr, dest, dstW, dstY);
        } else { // hmm looks like we can't use MMX here without overwriting this array's tail
            const int16_t **lumSrcPtr= (const int16_t **)lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            const int16_t **chrSrcPtr= (const int16_t **)chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                            vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
                            vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                            dest, uDest, dstW, chrDstW, dstFormat);
            } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                if (is16BPS(dstFormat)) {
                                vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
                                vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
                            vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
                            vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                            alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if(flags & SWS_FULL_CHR_H_INT) {
                        vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                        vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                        alpSrcPtr, dest, dstW, dstY);
                        vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                        vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                        alpSrcPtr, dest, dstW, dstY);
    /* target requested an alpha plane but the source has none: fill opaque */
    if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
        fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);
#if COMPILE_TEMPLATE_MMX
    if (flags & SWS_CPU_CAPS_MMX2 ) __asm__ volatile("sfence":::"memory");
    /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
    if (flags & SWS_CPU_CAPS_3DNOW) __asm__ volatile("femms" :::"memory");
    else                            __asm__ volatile("emms" :::"memory");
    /* store changed local vars back in the context */
    c->lumBufIndex= lumBufIndex;
    c->chrBufIndex= chrBufIndex;
    c->lastInLumBuf= lastInLumBuf;
    c->lastInChrBuf= lastInChrBuf;
    return dstY - lastDstY;
//
// RENAME(sws_init_swScale): per-CPU-template initializer for the software
// scaler. Fills the SwsContext function-pointer table: output/vertical-scale
// routines, the horizontal scaler, and per-pixel-format input converters
// (packed/RGB -> planar YV12 luma, chroma and alpha readers), plus range
// conversion and component byte offsets. RENAME() resolves each symbol to the
// variant compiled for this template (C / MMX / MMX2 / 3DNow).
//
// NOTE(review): the extraction that produced this chunk dropped brace-only
// lines, several `switch (...)` headers, `#else`/`#endif` lines and some
// `break;`/`else` lines (see the gaps in the original line numbers). The
// remaining tokens below are kept byte-identical; structural remarks are
// hedged accordingly — confirm against upstream FFmpeg swscale_template.c.
//
2923 static void RENAME(sws_init_swScale
)(SwsContext
*c
)
// Cache the source pixel format; every format-dispatch switch below keys on it.
2925 enum PixelFormat srcFormat
= c
->srcFormat
;
// Output stage / vertical scalers: one entry per output flavor
// (NV12, single-source YUV, multi-tap YUV, packed 1-tap/2-tap/N-tap).
2927 c
->yuv2nv12X
= RENAME(yuv2nv12X
);
2928 c
->yuv2yuv1
= RENAME(yuv2yuv1
);
2929 c
->yuv2yuvX
= RENAME(yuv2yuvX
);
2930 c
->yuv2packed1
= RENAME(yuv2packed1
);
2931 c
->yuv2packed2
= RENAME(yuv2packed2
);
2932 c
->yuv2packedX
= RENAME(yuv2packedX
);
// Generic horizontal scaler for this template.
2934 c
->hScale
= RENAME(hScale
);
// Fast-bilinear horizontal path. Under COMPILE_TEMPLATE_MMX it additionally
// requires canMMX2BeUsed; the plain SWS_FAST_BILINEAR test at line 2940 is
// presumably the #else branch of this #if — the #else/#endif lines appear to
// have been lost in extraction (TODO confirm).
2936 #if COMPILE_TEMPLATE_MMX
2937 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2938 if (c
->flags
& SWS_FAST_BILINEAR
&& c
->canMMX2BeUsed
)
2940 if (c
->flags
& SWS_FAST_BILINEAR
)
// Install the fast (approximate) bilinear horizontal scalers for luma/chroma.
2943 c
->hyscale_fast
= RENAME(hyscale_fast
)2944 c
->hcscale_fast
= RENAME(hcscale_fast
);
// Chroma input converter: reads one line of the source format and emits
// planar U/V. NULL means no conversion is installed (presumably the source is
// already planar YUV — confirm against the callers of chrToYV12).
// NOTE(review): the `switch (srcFormat)` header for the cases starting at
// line 2949 appears to have been dropped by the extraction.
2947 c
->chrToYV12
= NULL
;
2949 case PIX_FMT_YUYV422
: c
->chrToYV12
= RENAME(yuy2ToUV
); break;
2950 case PIX_FMT_UYVY422
: c
->chrToYV12
= RENAME(uyvyToUV
); break;
2951 case PIX_FMT_NV12
: c
->chrToYV12
= RENAME(nv12ToUV
); break;
2952 case PIX_FMT_NV21
: c
->chrToYV12
= RENAME(nv21ToUV
); break;
// Palette-based 4-bit formats share the palette->UV reader.
2956 case PIX_FMT_BGR4_BYTE
:
2957 case PIX_FMT_RGB4_BYTE
: c
->chrToYV12
= palToUV
; break;
// 16-bit-per-component planar YUV: byte-order-specific readers.
2958 case PIX_FMT_YUV420P16BE
:
2959 case PIX_FMT_YUV422P16BE
:
2960 case PIX_FMT_YUV444P16BE
: c
->chrToYV12
= RENAME(BEToUV
); break;
2961 case PIX_FMT_YUV420P16LE
:
2962 case PIX_FMT_YUV422P16LE
:
2963 case PIX_FMT_YUV444P16LE
: c
->chrToYV12
= RENAME(LEToUV
); break;
// RGB-family chroma readers. When the destination chroma is horizontally
// subsampled, the *_half variants are used (presumably reading/averaging two
// horizontally adjacent source pixels per chroma sample — TODO confirm);
// otherwise the full-resolution readers at lines 2982-2993 apply.
// NOTE(review): the inner `switch (srcFormat)` header here, and the `else`
// + switch header before line 2982, appear lost in extraction.
2965 if (c
->chrSrcHSubSample
) {
2967 case PIX_FMT_RGB48BE
:
2968 case PIX_FMT_RGB48LE
: c
->chrToYV12
= rgb48ToUV_half
; break;
2969 case PIX_FMT_RGB32
: c
->chrToYV12
= bgr32ToUV_half
; break;
2970 case PIX_FMT_RGB32_1
: c
->chrToYV12
= bgr321ToUV_half
; break;
2971 case PIX_FMT_BGR24
: c
->chrToYV12
= RENAME(bgr24ToUV_half
); break;
2972 case PIX_FMT_BGR565
: c
->chrToYV12
= bgr16ToUV_half
; break;
2973 case PIX_FMT_BGR555
: c
->chrToYV12
= bgr15ToUV_half
; break;
2974 case PIX_FMT_BGR32
: c
->chrToYV12
= rgb32ToUV_half
; break;
2975 case PIX_FMT_BGR32_1
: c
->chrToYV12
= rgb321ToUV_half
; break;
2976 case PIX_FMT_RGB24
: c
->chrToYV12
= RENAME(rgb24ToUV_half
); break;
2977 case PIX_FMT_RGB565
: c
->chrToYV12
= rgb16ToUV_half
; break;
2978 case PIX_FMT_RGB555
: c
->chrToYV12
= rgb15ToUV_half
; break;
// Full-resolution (non-subsampled) RGB-family chroma readers.
2982 case PIX_FMT_RGB48BE
:
2983 case PIX_FMT_RGB48LE
: c
->chrToYV12
= rgb48ToUV
; break;
2984 case PIX_FMT_RGB32
: c
->chrToYV12
= bgr32ToUV
; break;
2985 case PIX_FMT_RGB32_1
: c
->chrToYV12
= bgr321ToUV
; break;
2986 case PIX_FMT_BGR24
: c
->chrToYV12
= RENAME(bgr24ToUV
); break;
2987 case PIX_FMT_BGR565
: c
->chrToYV12
= bgr16ToUV
; break;
2988 case PIX_FMT_BGR555
: c
->chrToYV12
= bgr15ToUV
; break;
2989 case PIX_FMT_BGR32
: c
->chrToYV12
= rgb32ToUV
; break;
2990 case PIX_FMT_BGR32_1
: c
->chrToYV12
= rgb321ToUV
; break;
2991 case PIX_FMT_RGB24
: c
->chrToYV12
= RENAME(rgb24ToUV
); break;
2992 case PIX_FMT_RGB565
: c
->chrToYV12
= rgb16ToUV
; break;
2993 case PIX_FMT_RGB555
: c
->chrToYV12
= rgb15ToUV
; break;
// Luma and alpha input converters; NULL = no conversion installed.
2997 c
->lumToYV12
= NULL
;
2998 c
->alpToYV12
= NULL
;
// Luma reader dispatch. Note yuy2ToY/uyvyToY double as big-/little-endian
// 16-bit readers: they extract every other byte, which matches the Y position
// of YUYV/UYVY and the high/low byte of 16-bit samples respectively.
2999 switch (srcFormat
) {
3000 case PIX_FMT_YUYV422
:
3001 case PIX_FMT_YUV420P16BE
:
3002 case PIX_FMT_YUV422P16BE
:
3003 case PIX_FMT_YUV444P16BE
:
3004 case PIX_FMT_Y400A
:
3005 case PIX_FMT_GRAY16BE
: c
->lumToYV12
= RENAME(yuy2ToY
); break;
3006 case PIX_FMT_UYVY422
:
3007 case PIX_FMT_YUV420P16LE
:
3008 case PIX_FMT_YUV422P16LE
:
3009 case PIX_FMT_YUV444P16LE
:
3010 case PIX_FMT_GRAY16LE
: c
->lumToYV12
= RENAME(uyvyToY
); break;
3011 case PIX_FMT_BGR24
: c
->lumToYV12
= RENAME(bgr24ToY
); break;
3012 case PIX_FMT_BGR565
: c
->lumToYV12
= bgr16ToY
; break;
3013 case PIX_FMT_BGR555
: c
->lumToYV12
= bgr15ToY
; break;
3014 case PIX_FMT_RGB24
: c
->lumToYV12
= RENAME(rgb24ToY
); break;
3015 case PIX_FMT_RGB565
: c
->lumToYV12
= rgb16ToY
; break;
3016 case PIX_FMT_RGB555
: c
->lumToYV12
= rgb15ToY
; break;
// Palette / monochrome / 32-bit RGB / 48-bit RGB luma readers.
3020 case PIX_FMT_BGR4_BYTE
:
3021 case PIX_FMT_RGB4_BYTE
: c
->lumToYV12
= palToY
; break;
3022 case PIX_FMT_MONOBLACK
: c
->lumToYV12
= monoblack2Y
; break;
3023 case PIX_FMT_MONOWHITE
: c
->lumToYV12
= monowhite2Y
; break;
3024 case PIX_FMT_RGB32
: c
->lumToYV12
= bgr32ToY
; break;
3025 case PIX_FMT_RGB32_1
: c
->lumToYV12
= bgr321ToY
; break;
3026 case PIX_FMT_BGR32
: c
->lumToYV12
= rgb32ToY
; break;
3027 case PIX_FMT_BGR32_1
: c
->lumToYV12
= rgb321ToY
; break;
3028 case PIX_FMT_RGB48BE
:
3029 case PIX_FMT_RGB48LE
: c
->lumToYV12
= rgb48ToY
; break;
// Alpha-plane reader for formats that carry alpha. In upstream this switch
// is presumably guarded by an `if (c->alpPixBuf)` (original lines ~3030/3040
// are missing from the extraction) — TODO confirm.
3032 switch (srcFormat
) {
3033 case PIX_FMT_RGB32
:
3034 case PIX_FMT_RGB32_1
:
3035 case PIX_FMT_BGR32
:
3036 case PIX_FMT_BGR32_1
: c
->alpToYV12
= abgrToA
; break;
// Y400A: alpha is read with the every-other-byte extractor (same as yuy2ToY),
// offset onto the A bytes via alpSrcOffset below.
3037 case PIX_FMT_Y400A
: c
->alpToYV12
= RENAME(yuy2ToY
); break;
// Byte offsets of the components within a source pixel, used to point the
// generic readers at the right byte. NOTE(review): the `break;` lines between
// these cases (original 3044, 3048, 3053) are missing from the extraction.
3041 switch (srcFormat
) {
3042 case PIX_FMT_Y400A
:
3043 c
->alpSrcOffset
= 1;
3045 case PIX_FMT_RGB32
:
3046 case PIX_FMT_BGR32
:
3047 c
->alpSrcOffset
= 3;
// RGB48LE: components start on the low byte; shift all reads by one byte.
3049 case PIX_FMT_RGB48LE
:
3050 c
->lumSrcOffset
= 1;
3051 c
->chrSrcOffset
= 1;
3052 c
->alpSrcOffset
= 1;
// Install luma/chroma range converters (JPEG/full-range <-> MPEG/limited)
// when source and destination ranges differ and the destination is not RGB.
// NOTE(review): the `if (c->srcRange)` / `else` that selects FromJpeg vs
// ToJpeg (original lines 3057/3060) was dropped by the extraction.
3056 if (c
->srcRange
!= c
->dstRange
&& !isAnyRGB(c
->dstFormat
)) {
3058 c
->lumConvertRange
= RENAME(lumRangeFromJpeg
);
3059 c
->chrConvertRange
= RENAME(chrRangeFromJpeg
);
3061 c
->lumConvertRange
= RENAME(lumRangeToJpeg
);
3062 c
->chrConvertRange
= RENAME(chrRangeToJpeg
);
// Horizontal chroma scaling is needed unless either side is gray-only or the
// source is 1-bpp monochrome (no chroma to scale in those cases).
3066 if (!(isGray(srcFormat
) || isGray(c
->dstFormat
) ||
3067 srcFormat
== PIX_FMT_MONOBLACK
|| srcFormat
== PIX_FMT_MONOWHITE
))
3068 c
->needs_hcscale
= 1;