/*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * The C code (not assembly, MMX, ...) of this file can be used
 * under the LGPL license.
 */
#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
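/*
 * This file is a template: it is #included several times with different
 * COMPILE_TEMPLATE_* settings, and RENAME() gives each instantiation its
 * own symbol names.  The macros below select the instruction variant that
 * matches the target CPU; for example, with COMPILE_TEMPLATE_MMX2 set,
 *     MOVNTQ(%%mm3, (%1, %%REGa))
 * expands (roughly) to the non-temporal store
 *     "movntq %%mm3, (%1, %%REGa) \n\t"
 * and to a plain "movq" store otherwise.
 */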
#if COMPILE_TEMPLATE_AMD3DNOW
#define PREFETCH "prefetch"
#elif COMPILE_TEMPLATE_MMX2
#define PREFETCH "prefetchnta"
#else
#define PREFETCH " # nop"
#endif

#if COMPILE_TEMPLATE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif COMPILE_TEMPLATE_AMD3DNOW
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

#if COMPILE_TEMPLATE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)

#if COMPILE_TEMPLATE_ALTIVEC
#include "ppc/swscale_altivec_template.c"
#endif
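/*
 * YSCALEYUV2YV12X applies one vertical scaling filter.  The filter is a
 * list of (source pointer, coefficient) pairs terminated by a NULL
 * pointer; per 8-pixel group the loop accumulates the pmulhw products of
 * the 16-bit source samples and the coefficients on top of a rounding
 * constant, shifts the sums right by 3, packs them to unsigned bytes and
 * streams them out with MOVNTQ.
 */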
#define YSCALEYUV2YV12X(x, offset, dest, width) \
    __asm__ volatile(\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1: \n\t"\
    "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
    "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
    "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t"\
    "pmulhw %%mm0, %%mm5 \n\t"\
    "paddw %%mm2, %%mm3 \n\t"\
    "paddw %%mm5, %%mm4 \n\t"\
    " jnz 1b \n\t"\
    "psraw $3, %%mm3 \n\t"\
    "psraw $3, %%mm4 \n\t"\
    "packuswb %%mm4, %%mm3 \n\t"\
    MOVNTQ(%%mm3, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "cmp %2, %%"REG_a" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "jb 1b \n\t"\
    :: "r" (&c->redDither),\
        "r" (dest), "g" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );
#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
    __asm__ volatile(\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    ASMALIGN(4) \
    "1: \n\t"\
    "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
    "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm1, %%mm0 \n\t"\
    "punpckhwd %%mm1, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm3 \n\t"\
    "paddd %%mm0, %%mm4 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm2 \n\t"\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "paddd %%mm2, %%mm6 \n\t"\
    "paddd %%mm0, %%mm7 \n\t"\
    " jnz 1b \n\t"\
    "psrad $16, %%mm4 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm4 \n\t"\
    "packssdw %%mm7, %%mm6 \n\t"\
    "paddw %%mm0, %%mm4 \n\t"\
    "paddw %%mm0, %%mm6 \n\t"\
    "psraw $3, %%mm4 \n\t"\
    "psraw $3, %%mm6 \n\t"\
    "packuswb %%mm6, %%mm4 \n\t"\
    MOVNTQ(%%mm4, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "cmp %2, %%"REG_a" \n\t"\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "jb 1b \n\t"\
    :: "r" (&c->redDither),\
        "r" (dest), "g" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );
#define YSCALEYUV2YV121 \
    "mov %2, %%"REG_a" \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1: \n\t"\
    "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
    "psraw $7, %%mm0 \n\t"\
    "psraw $7, %%mm1 \n\t"\
    "packuswb %%mm1, %%mm0 \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "jnc 1b \n\t"
#define YSCALEYUV2YV121_ACCURATE \
    "mov %2, %%"REG_a" \n\t"\
    "pcmpeqw %%mm7, %%mm7 \n\t"\
    "psrlw $15, %%mm7 \n\t"\
    "psllw $6, %%mm7 \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1: \n\t"\
    "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
    "paddsw %%mm7, %%mm0 \n\t"\
    "paddsw %%mm7, %%mm1 \n\t"\
    "psraw $7, %%mm0 \n\t"\
    "psraw $7, %%mm1 \n\t"\
    "packuswb %%mm1, %%mm0 \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "jnc 1b \n\t"
/*
    :: "m" (-lumFilterSize), "m" (-chrFilterSize),
       "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
       "r" (dest), "m" (dstW),
       "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
    : "%eax", "%ebx", "%ecx", "%edx", "%esi"
*/
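/*
 * The YSCALEYUV2PACKEDX* macros run the same NULL-terminated filter loop,
 * but for packed output: label 1 iterates over destination pixels and
 * label 2 over filter taps.  They leave the filtered chroma in mm3/mm4
 * and the two luma halves in mm1/mm7, the register layout the RGB stage
 * below expects.
 */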
#define YSCALEYUV2PACKEDX_UV \
    __asm__ volatile(\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    ASMALIGN(4)\
    "nop \n\t"\
    "1: \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
    "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t"\
    "pmulhw %%mm0, %%mm5 \n\t"\
    "paddw %%mm2, %%mm3 \n\t"\
    "paddw %%mm5, %%mm4 \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    " jnz 2b \n\t"\
#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
    "lea "offset"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
    "movq "#dst1", "#dst2" \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
    "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pmulhw "#coeff", "#src1" \n\t"\
    "pmulhw "#coeff", "#src2" \n\t"\
    "paddw "#src1", "#dst1" \n\t"\
    "paddw "#src2", "#dst2" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    " jnz 2b \n\t"\
#define YSCALEYUV2PACKEDX \
    YSCALEYUV2PACKEDX_UV \
    YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
#define YSCALEYUV2PACKEDX_END \
    :: "r" (&c->redDither), \
        "m" (dummy), "m" (dummy), "m" (dummy),\
        "r" (dest), "m" (dstW) \
    : "%"REG_a, "%"REG_d, "%"REG_S \
    );
#define YSCALEYUV2PACKEDX_ACCURATE_UV \
    __asm__ volatile(\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    ASMALIGN(4)\
    "nop \n\t"\
    "1: \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm1, %%mm0 \n\t"\
    "punpckhwd %%mm1, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm3 \n\t"\
    "paddd %%mm0, %%mm4 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm2 \n\t"\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "paddd %%mm2, %%mm6 \n\t"\
    "paddd %%mm0, %%mm7 \n\t"\
    " jnz 2b \n\t"\
    "psrad $16, %%mm4 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm4 \n\t"\
    "packssdw %%mm7, %%mm6 \n\t"\
    "paddw %%mm0, %%mm4 \n\t"\
    "paddw %%mm0, %%mm6 \n\t"\
    "movq %%mm4, "U_TEMP"(%0) \n\t"\
    "movq %%mm6, "V_TEMP"(%0) \n\t"\
#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
    "lea "offset"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pxor %%mm1, %%mm1 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm4, %%mm0 \n\t"\
    "punpckhwd %%mm4, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm3 \n\t"\
    "paddd %%mm0, %%mm1 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm2 \n\t"\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "paddd %%mm2, %%mm7 \n\t"\
    "paddd %%mm0, %%mm6 \n\t"\
    " jnz 2b \n\t"\
    "psrad $16, %%mm1 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm1 \n\t"\
    "packssdw %%mm6, %%mm7 \n\t"\
    "paddw %%mm0, %%mm1 \n\t"\
    "paddw %%mm0, %%mm7 \n\t"\
    "movq "U_TEMP"(%0), %%mm3 \n\t"\
    "movq "V_TEMP"(%0), %%mm4 \n\t"\
#define YSCALEYUV2PACKEDX_ACCURATE \
    YSCALEYUV2PACKEDX_ACCURATE_UV \
    YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
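/*
 * YSCALEYUV2RGBX is the fixed-point YUV->RGB stage: it bias-corrects U/V
 * and the two luma halves, scales them with the per-context coefficients
 * via pmulhw, forms green as the sum of the U and V green terms, adds the
 * luma, and packs to unsigned bytes, leaving B in mm2, G in mm4 and R in
 * mm5 for the WRITE* macros.
 */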
#define YSCALEYUV2RGBX \
    "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
    "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\
#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
    "psraw $3, %%mm0 \n\t"\
    "psraw $3, %%mm1 \n\t"\
    "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
#define REAL_YSCALEYUV2RGB_UV(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
    "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
#define REAL_YSCALEYUV2RGB_COEFF(c) \
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)

#define YSCALEYUV2RGB(index, c) \
    REAL_YSCALEYUV2RGB_UV(index, c) \
    REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
    REAL_YSCALEYUV2RGB_COEFF(c)
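/*
 * The *1 variants below handle a single already-filtered source line:
 * they only shift the 16-bit samples down instead of interpolating
 * between two lines.  The *1b variants additionally average two chroma
 * lines; see the chrominance interpolation comment further down.
 */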
#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "psraw $7, %%mm3 \n\t" \
    "psraw $7, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t" \

#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
#define REAL_YSCALEYUV2RGB1(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $8, %%mm3 \n\t" \
    "psrlw $8, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t"
#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
// do vertical chrominance interpolation
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
    "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
#define REAL_YSCALEYUV2RGB1_ALPHA(index) \
    "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index  ] */\
    "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
    "psraw $7, %%mm7 \n\t" /* abuf0[index  ] >>7 */\
    "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
    "packuswb %%mm1, %%mm7 \n\t"
#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
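/*
 * The WRITE* macros consume the byte-packed B/G/R (and A) registers
 * produced above, interleave them with punpckl/punpckhbw and -wd into the
 * destination pixel layout, store 8 pixels per iteration through MOVNTQ,
 * and close the scan-line loop begun at label 1.
 */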
#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
    "movq "#b", "#q2" \n\t" /* B */\
    "movq "#r", "#t" \n\t" /* R */\
    "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
    "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
    "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
    "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
    "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
    "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
    "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
    "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
    "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
    "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
\
    MOVNTQ( q0, (dst, index, 4))\
    MOVNTQ( b, 8(dst, index, 4))\
    MOVNTQ( q2, 16(dst, index, 4))\
    MOVNTQ( q3, 24(dst, index, 4))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $3, %%mm3 \n\t"\
    "psllq $3, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2, (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
#define REAL_WRITERGB15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
    "psrlq $1, %%mm5 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $2, %%mm3 \n\t"\
    "psllq $2, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2, (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
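/*
 * Three generations of the 24-bit writer follow: WRITEBGR24OLD and
 * WRITEBGR24MMX assemble the 3-byte pixels with shift/mask sequences,
 * while WRITEBGR24MMX2 uses pshufw, which is only available with MMX2;
 * the #if after them selects which one WRITEBGR24 expands to.
 */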
#define WRITEBGR24OLD(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1 \n\t" /* B */\
    "movq %%mm5, %%mm6 \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
    "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
    "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
    "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
    "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
    "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
    "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
    "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
    "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
\
    "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
    "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
    "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
    "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
    "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
    "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
    "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
    "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
    "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
    "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
    "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
    "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
\
    "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
    "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
    "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
    "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
    "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
    "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
    "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
    "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
\
    MOVNTQ(%%mm0, (dst))\
    MOVNTQ(%%mm2, 8(dst))\
    MOVNTQ(%%mm3, 16(dst))\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1 \n\t" /* B */\
    "movq %%mm5, %%mm6 \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
    "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
    "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
\
    "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
    "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
    "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
    "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
    "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
    "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
    "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
    "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
    "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
    "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEBGR24MMX2(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
\
    "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
    "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
    "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
\
    "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
    "por %%mm1, %%mm6 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
    "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
    "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
\
    "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
\
    "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
    "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
\
    "por %%mm1, %%mm3 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#if COMPILE_TEMPLATE_MMX2
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
#endif
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb %%mm3, %%mm3 \n\t"\
    "packuswb %%mm4, %%mm4 \n\t"\
    "packuswb %%mm7, %%mm1 \n\t"\
    "punpcklbw %%mm4, %%mm3 \n\t"\
    "movq %%mm1, %%mm7 \n\t"\
    "punpcklbw %%mm3, %%mm1 \n\t"\
    "punpckhbw %%mm3, %%mm7 \n\t"\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
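/*
 * yuv2yuvX(): vertically scale into planar YV12 (plus optional alpha)
 * output.  Unless SWS_BITEXACT is set, the MMX loops above are used;
 * otherwise it falls through to the AltiVec or plain C implementation.
 */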
static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                    const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
{
#if COMPILE_TEMPLATE_MMX
    if(!(c->flags & SWS_BITEXACT)) {
        if (c->flags & SWS_ACCURATE_RND) {
            if (uDest) {
                YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }
            if (CONFIG_SWSCALE_ALPHA && aDest) {
                YSCALEYUV2YV12X_ACCURATE( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
            }

            YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        } else {
            if (uDest) {
                YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }
            if (CONFIG_SWSCALE_ALPHA && aDest) {
                YSCALEYUV2YV12X( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
            }

            YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        }
        return;
    }
#endif
#if COMPILE_TEMPLATE_ALTIVEC
    yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
                          chrFilter, chrSrc, chrFilterSize,
                          dest, uDest, vDest, dstW, chrDstW);
#else //COMPILE_TEMPLATE_ALTIVEC
    yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
                chrFilter, chrSrc, chrFilterSize,
                alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW);
#endif //!COMPILE_TEMPLATE_ALTIVEC
}
static inline void RENAME(yuv2nv12X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                     const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
                                     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, enum PixelFormat dstFormat)
{
    yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
                 chrFilter, chrSrc, chrFilterSize,
                 dest, uDest, dstW, chrDstW, dstFormat);
}
static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
{
    int i;
#if COMPILE_TEMPLATE_MMX
    if(!(c->flags & SWS_BITEXACT)) {
        long p= 4;
        const uint8_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
        uint8_t *dst[4]= {aDest, dest, uDest, vDest};
        x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW};

        if (c->flags & SWS_ACCURATE_RND) {
            while(p--) {
                if (dst[p]) {
                    __asm__ volatile(
                        YSCALEYUV2YV121_ACCURATE
                        :: "r" (src[p]), "r" (dst[p] + counter[p]),
                           "g" (-counter[p])
                        : "%"REG_a
                    );
                }
            }
        } else {
            while(p--) {
                if (dst[p]) {
                    __asm__ volatile(
                        YSCALEYUV2YV121
                        :: "r" (src[p]), "r" (dst[p] + counter[p]),
                           "g" (-counter[p])
                        : "%"REG_a
                    );
                }
            }
        }
        return;
    }
#endif
    for (i=0; i<dstW; i++) {
        int val= (lumSrc[i]+64)>>7;

        if (val&256) { /* bit 8 is set both for negative and for >255 results */
            if (val<0) val=0;
            else       val=255;
        }

        dest[i]= val;
    }

    if (uDest)
        for (i=0; i<chrDstW; i++) {
            int u=(chrSrc[i       ]+64)>>7;
            int v=(chrSrc[i + VOFW]+64)>>7;

            if ((u|v)&256) {
                if (u<0)        u=0;
                else if (u>255) u=255;
                if (v<0)        v=0;
                else if (v>255) v=255;
            }

            uDest[i]= u;
            vDest[i]= v;
        }

    if (CONFIG_SWSCALE_ALPHA && aDest)
        for (i=0; i<dstW; i++) {
            int val= (alpSrc[i]+64)>>7;
            aDest[i]= av_clip_uint8(val);
        }
}
/**
 * vertical scale YV12 to RGB
 */
static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                       const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
                                       const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
{
#if COMPILE_TEMPLATE_MMX
    x86_reg dummy=0;
    if(!(c->flags & SWS_BITEXACT)) {
        if (c->flags & SWS_ACCURATE_RND) {
            switch(c->dstFormat) {
            case PIX_FMT_RGB32:
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                    YSCALEYUV2PACKEDX_ACCURATE
                    YSCALEYUV2RGBX
                    "movq %%mm2, "U_TEMP"(%0) \n\t"
                    "movq %%mm4, "V_TEMP"(%0) \n\t"
                    "movq %%mm5, "Y_TEMP"(%0) \n\t"
                    YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
                    "movq "Y_TEMP"(%0), %%mm5 \n\t"
                    "psraw $3, %%mm1 \n\t"
                    "psraw $3, %%mm7 \n\t"
                    "packuswb %%mm7, %%mm1 \n\t"
                    WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)

                    YSCALEYUV2PACKEDX_END
                } else {
                    YSCALEYUV2PACKEDX_ACCURATE
                    YSCALEYUV2RGBX
                    "pcmpeqd %%mm7, %%mm7 \n\t"
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)

                    YSCALEYUV2PACKEDX_END
                }
                return;
            case PIX_FMT_BGR24:
                YSCALEYUV2PACKEDX_ACCURATE
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
                "add %4, %%"REG_c" \n\t"
                WRITEBGR24(%%REGc, %5, %%REGa)

                :: "r" (&c->redDither),
                   "m" (dummy), "m" (dummy), "m" (dummy),
                   "r" (dest), "m" (dstW)
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
                );
                return;
            case PIX_FMT_RGB555:
                YSCALEYUV2PACKEDX_ACCURATE
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
                "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
                "paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif

                WRITERGB15(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_RGB565:
                YSCALEYUV2PACKEDX_ACCURATE
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
                "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
                "paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif

                WRITERGB16(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_YUYV422:
                YSCALEYUV2PACKEDX_ACCURATE
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

                "psraw $3, %%mm3 \n\t"
                "psraw $3, %%mm4 \n\t"
                "psraw $3, %%mm1 \n\t"
                "psraw $3, %%mm7 \n\t"
                WRITEYUY2(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            }
        } else {
            switch(c->dstFormat) {
            case PIX_FMT_RGB32:
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                    YSCALEYUV2PACKEDX
                    YSCALEYUV2RGBX
                    YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
                    "psraw $3, %%mm1 \n\t"
                    "psraw $3, %%mm7 \n\t"
                    "packuswb %%mm7, %%mm1 \n\t"
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
                    YSCALEYUV2PACKEDX_END
                } else {
                    YSCALEYUV2PACKEDX
                    YSCALEYUV2RGBX
                    "pcmpeqd %%mm7, %%mm7 \n\t"
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                    YSCALEYUV2PACKEDX_END
                }
                return;
            case PIX_FMT_BGR24:
                YSCALEYUV2PACKEDX
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
                "add %4, %%"REG_c" \n\t"
                WRITEBGR24(%%REGc, %5, %%REGa)

                :: "r" (&c->redDither),
                   "m" (dummy), "m" (dummy), "m" (dummy),
                   "r" (dest), "m" (dstW)
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
                );
                return;
            case PIX_FMT_RGB555:
                YSCALEYUV2PACKEDX
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
                "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
                "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif

                WRITERGB15(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_RGB565:
                YSCALEYUV2PACKEDX
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
                "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
                "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif

                WRITERGB16(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_YUYV422:
                YSCALEYUV2PACKEDX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

                "psraw $3, %%mm3 \n\t"
                "psraw $3, %%mm4 \n\t"
                "psraw $3, %%mm1 \n\t"
                "psraw $3, %%mm7 \n\t"
                WRITEYUY2(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            }
        }
    }
#endif /* COMPILE_TEMPLATE_MMX */
#if COMPILE_TEMPLATE_ALTIVEC
    /* The following list of supported dstFormat values should
       match what's found in the body of ff_yuv2packedX_altivec() */
    if (!(c->flags & SWS_BITEXACT) && !c->alpPixBuf &&
        (c->dstFormat==PIX_FMT_ABGR  || c->dstFormat==PIX_FMT_BGRA  ||
         c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
         c->dstFormat==PIX_FMT_RGBA  || c->dstFormat==PIX_FMT_ARGB))
        ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize,
                               chrFilter, chrSrc, chrFilterSize,
                               dest, dstW, dstY);
    else
#endif
        yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
                       chrFilter, chrSrc, chrFilterSize,
                       alpSrc, dest, dstW, dstY);
}
/**
 * vertical bilinear scale YV12 to RGB
 */
static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
                                       const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
{
    int  yalpha1=4095- yalpha;
    int uvalpha1=4095-uvalpha;
    int i;

#if COMPILE_TEMPLATE_MMX
    if(!(c->flags & SWS_BITEXACT)) {
        switch(c->dstFormat) {
        //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
        case PIX_FMT_RGB32:
            if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
#if ARCH_X86_64
                __asm__ volatile(
                    YSCALEYUV2RGB(%%r8, %5)
                    YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
                    "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
                    "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
                    "packuswb %%mm7, %%mm1 \n\t"
                    WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "r" (dest),
                       "a" (&c->redDither)
                       ,"r" (abuf0), "r" (abuf1)
                    : "%r8"
                );
#else
                *(const uint16_t **)(&c->u_temp)=abuf0;
                *(const uint16_t **)(&c->v_temp)=abuf1;
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB(%%REGBP, %5)
                    "push %0 \n\t"
                    "push %1 \n\t"
                    "mov "U_TEMP"(%5), %0 \n\t"
                    "mov "V_TEMP"(%5), %1 \n\t"
                    YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
                    "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
                    "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
                    "packuswb %%mm7, %%mm1 \n\t"
                    "pop %1 \n\t"
                    "pop %0 \n\t"
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
#endif
            } else {
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB(%%REGBP, %5)
                    "pcmpeqd %%mm7, %%mm7 \n\t"
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
            }
            return;
        case PIX_FMT_BGR24:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB(%%REGBP, %5)
                "pxor %%mm7, %%mm7 \n\t"
                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
            return;
        case PIX_FMT_RGB555:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB(%%REGBP, %5)
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif

                WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
            return;
        case PIX_FMT_RGB565:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB(%%REGBP, %5)
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif

                WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
            return;
        case PIX_FMT_YUYV422:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2PACKED(%%REGBP, %5)
                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
            return;
        default: break;
        }
    }
#endif //COMPILE_TEMPLATE_MMX
    YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
}
/**
 * YV12 to RGB without scaling or interpolating
 */
static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
                                       const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, enum PixelFormat dstFormat, int flags, int y)
{
    const int yalpha1=0;
    int i;

    const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
    const int yalpha= 4096; //FIXME ...

    if (flags&SWS_FULL_CHR_H_INT) {
        c->yuv2packed2(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
        return;
    }

#if COMPILE_TEMPLATE_MMX
    if(!(flags & SWS_BITEXACT)) {
        if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
            switch(dstFormat) {
            case PIX_FMT_RGB32:
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                    __asm__ volatile(
                        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                        "mov %4, %%"REG_b" \n\t"
                        "push %%"REG_BP" \n\t"
                        YSCALEYUV2RGB1(%%REGBP, %5)
                        YSCALEYUV2RGB1_ALPHA(%%REGBP)
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                        "pop %%"REG_BP" \n\t"
                        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                        :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                           "a" (&c->redDither)
                    );
                } else {
                    __asm__ volatile(
                        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                        "mov %4, %%"REG_b" \n\t"
                        "push %%"REG_BP" \n\t"
                        YSCALEYUV2RGB1(%%REGBP, %5)
                        "pcmpeqd %%mm7, %%mm7 \n\t"
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                        "pop %%"REG_BP" \n\t"
                        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                           "a" (&c->redDither)
                    );
                }
                return;
            case PIX_FMT_BGR24:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1(%%REGBP, %5)
                    "pxor %%mm7, %%mm7 \n\t"
                    WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB555:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1(%%REGBP, %5)
                    "pxor %%mm7, %%mm7 \n\t"
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
                    WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB565:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1(%%REGBP, %5)
                    "pxor %%mm7, %%mm7 \n\t"
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif

                    WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            case PIX_FMT_YUYV422:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2PACKED1(%%REGBP, %5)
                    WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            }
        } else {
            switch(dstFormat) {
            case PIX_FMT_RGB32:
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                    __asm__ volatile(
                        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                        "mov %4, %%"REG_b" \n\t"
                        "push %%"REG_BP" \n\t"
                        YSCALEYUV2RGB1b(%%REGBP, %5)
                        YSCALEYUV2RGB1_ALPHA(%%REGBP)
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                        "pop %%"REG_BP" \n\t"
                        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                        :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                           "a" (&c->redDither)
                    );
                } else {
                    __asm__ volatile(
                        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                        "mov %4, %%"REG_b" \n\t"
                        "push %%"REG_BP" \n\t"
                        YSCALEYUV2RGB1b(%%REGBP, %5)
                        "pcmpeqd %%mm7, %%mm7 \n\t"
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                        "pop %%"REG_BP" \n\t"
                        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                           "a" (&c->redDither)
                    );
                }
                return;
            case PIX_FMT_BGR24:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1b(%%REGBP, %5)
                    "pxor %%mm7, %%mm7 \n\t"
                    WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB555:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1b(%%REGBP, %5)
                    "pxor %%mm7, %%mm7 \n\t"
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
                    WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB565:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1b(%%REGBP, %5)
                    "pxor %%mm7, %%mm7 \n\t"
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif

                    WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            case PIX_FMT_YUYV422:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2PACKED1b(%%REGBP, %5)
                    WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            }
        }
    }
#endif /* COMPILE_TEMPLATE_MMX */
    if (uvalpha < 2048) {
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
    } else {
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
    }
}
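/*
 * Below follow the input "unpackers": helpers that split packed input
 * (YUY2, UYVY, ...) into the planar Y and U/V runs the scaler works on,
 * each with an MMX path and a plain C fallback.
 */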
//FIXME yuy2* can read up to 7 samples too much

static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm2 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "pand %%mm2, %%mm0 \n\t"
        "pand %%mm2, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
        dst[i]= src[2*i];
#endif
}
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",4), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "psrlw $8, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        "movd %%mm0, (%3, %%"REG_a") \n\t"
        "movd %%mm1, (%2, %%"REG_a") \n\t"
        "add $4, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++) {
        dstU[i]= src1[4*i + 1];
        dstV[i]= src1[4*i + 3];
    }
#endif
    assert(src1 == src2);
}
1658 static inline void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1660 #if COMPILE_TEMPLATE_MMX
1661 __asm__ volatile(
1662 "mov %0, %%"REG_a" \n\t"
1663 "1: \n\t"
1664 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1665 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1666 "movq (%2, %%"REG_a",2), %%mm2 \n\t"
1667 "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
1668 "psrlw $8, %%mm0 \n\t"
1669 "psrlw $8, %%mm1 \n\t"
1670 "psrlw $8, %%mm2 \n\t"
1671 "psrlw $8, %%mm3 \n\t"
1672 "packuswb %%mm1, %%mm0 \n\t"
1673 "packuswb %%mm3, %%mm2 \n\t"
1674 "movq %%mm0, (%3, %%"REG_a") \n\t"
1675 "movq %%mm2, (%4, %%"REG_a") \n\t"
1676 "add $8, %%"REG_a" \n\t"
1677 " js 1b \n\t"
1678 : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
1679 : "%"REG_a
1681 #else
1682 int i;
1683 for (i=0; i<width; i++) {
1684 dstU[i]= src1[2*i + 1];
1685 dstV[i]= src2[2*i + 1];
1687 #endif
1690 /* This is almost identical to the previous function and exists only because
1691 * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
1692 static inline void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1694 #if COMPILE_TEMPLATE_MMX
1695 __asm__ volatile(
1696 "mov %0, %%"REG_a" \n\t"
1697 "1: \n\t"
1698 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1699 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1700 "psrlw $8, %%mm0 \n\t"
1701 "psrlw $8, %%mm1 \n\t"
1702 "packuswb %%mm1, %%mm0 \n\t"
1703 "movq %%mm0, (%2, %%"REG_a") \n\t"
1704 "add $8, %%"REG_a" \n\t"
1705 " js 1b \n\t"
1706 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
1707 : "%"REG_a
1709 #else
1710 int i;
1711 for (i=0; i<width; i++)
1712 dst[i]= src[2*i+1];
1713 #endif
1716 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1718 #if COMPILE_TEMPLATE_MMX
1719 __asm__ volatile(
1720 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1721 "mov %0, %%"REG_a" \n\t"
1722 "1: \n\t"
1723 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1724 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1725 "pand %%mm4, %%mm0 \n\t"
1726 "pand %%mm4, %%mm1 \n\t"
1727 "packuswb %%mm1, %%mm0 \n\t"
1728 "movq %%mm0, %%mm1 \n\t"
1729 "psrlw $8, %%mm0 \n\t"
1730 "pand %%mm4, %%mm1 \n\t"
1731 "packuswb %%mm0, %%mm0 \n\t"
1732 "packuswb %%mm1, %%mm1 \n\t"
1733 "movd %%mm0, (%3, %%"REG_a") \n\t"
1734 "movd %%mm1, (%2, %%"REG_a") \n\t"
1735 "add $4, %%"REG_a" \n\t"
1736 " js 1b \n\t"
1737 : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1738 : "%"REG_a
1740 #else
1741 int i;
1742 for (i=0; i<width; i++) {
1743 dstU[i]= src1[4*i + 0];
1744 dstV[i]= src1[4*i + 2];
1746 #endif
1747 assert(src1 == src2);
1750 static inline void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1752 #if COMPILE_TEMPLATE_MMX
1753 __asm__ volatile(
1754 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1755 "mov %0, %%"REG_a" \n\t"
1756 "1: \n\t"
1757 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1758 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1759 "movq (%2, %%"REG_a",2), %%mm2 \n\t"
1760 "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
1761 "pand %%mm4, %%mm0 \n\t"
1762 "pand %%mm4, %%mm1 \n\t"
1763 "pand %%mm4, %%mm2 \n\t"
1764 "pand %%mm4, %%mm3 \n\t"
1765 "packuswb %%mm1, %%mm0 \n\t"
1766 "packuswb %%mm3, %%mm2 \n\t"
1767 "movq %%mm0, (%3, %%"REG_a") \n\t"
1768 "movq %%mm2, (%4, %%"REG_a") \n\t"
1769 "add $8, %%"REG_a" \n\t"
1770 " js 1b \n\t"
1771 : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
1772 : "%"REG_a
1774 #else
1775 int i;
1776 for (i=0; i<width; i++) {
1777 dstU[i]= src1[2*i];
1778 dstV[i]= src2[2*i];
1780 #endif
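/* LEToUV/BEToUV above reduce 16-bit planar chroma to 8 bits by keeping the
 * most significant byte of each sample: "psrlw $8" selects the high byte of
 * little-endian words, while the bm01010101 mask selects byte 0 of
 * big-endian words, matching the src[2*i+1] / src[2*i] C fallbacks. */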
1783 static inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
1784 const uint8_t *src, long width)
1786 #if COMPILE_TEMPLATE_MMX
1787 __asm__ volatile(
1788 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1789 "mov %0, %%"REG_a" \n\t"
1790 "1: \n\t"
1791 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1792 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1793 "movq %%mm0, %%mm2 \n\t"
1794 "movq %%mm1, %%mm3 \n\t"
1795 "pand %%mm4, %%mm0 \n\t"
1796 "pand %%mm4, %%mm1 \n\t"
1797 "psrlw $8, %%mm2 \n\t"
1798 "psrlw $8, %%mm3 \n\t"
1799 "packuswb %%mm1, %%mm0 \n\t"
1800 "packuswb %%mm3, %%mm2 \n\t"
1801 "movq %%mm0, (%2, %%"REG_a") \n\t"
1802 "movq %%mm2, (%3, %%"REG_a") \n\t"
1803 "add $8, %%"REG_a" \n\t"
1804 " js 1b \n\t"
1805 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width)
1806 : "%"REG_a
1808 #else
1809 int i;
1810 for (i = 0; i < width; i++) {
1811 dst1[i] = src[2*i+0];
1812 dst2[i] = src[2*i+1];
1814 #endif
1817 static inline void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
1818 const uint8_t *src1, const uint8_t *src2,
1819 long width, uint32_t *unused)
1821 RENAME(nvXXtoUV)(dstU, dstV, src1, width);
1824 static inline void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
1825 const uint8_t *src1, const uint8_t *src2,
1826 long width, uint32_t *unused)
1828 RENAME(nvXXtoUV)(dstV, dstU, src1, width);
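/* NV12 stores interleaved U,V pairs and NV21 interleaved V,U pairs; the two
 * wrappers above therefore differ only in the order of the destination
 * pointers they pass to nvXXtoUV. */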
1831 #if COMPILE_TEMPLATE_MMX
1832 static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src, long width, enum PixelFormat srcFormat)
1835 if(srcFormat == PIX_FMT_BGR24) {
1836 __asm__ volatile(
1837 "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
1838 "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
1841 } else {
1842 __asm__ volatile(
1843 "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
1844 "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
1849 __asm__ volatile(
1850 "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
1851 "mov %2, %%"REG_a" \n\t"
1852 "pxor %%mm7, %%mm7 \n\t"
1853 "1: \n\t"
1854 PREFETCH" 64(%0) \n\t"
1855 "movd (%0), %%mm0 \n\t"
1856 "movd 2(%0), %%mm1 \n\t"
1857 "movd 6(%0), %%mm2 \n\t"
1858 "movd 8(%0), %%mm3 \n\t"
1859 "add $12, %0 \n\t"
1860 "punpcklbw %%mm7, %%mm0 \n\t"
1861 "punpcklbw %%mm7, %%mm1 \n\t"
1862 "punpcklbw %%mm7, %%mm2 \n\t"
1863 "punpcklbw %%mm7, %%mm3 \n\t"
1864 "pmaddwd %%mm5, %%mm0 \n\t"
1865 "pmaddwd %%mm6, %%mm1 \n\t"
1866 "pmaddwd %%mm5, %%mm2 \n\t"
1867 "pmaddwd %%mm6, %%mm3 \n\t"
1868 "paddd %%mm1, %%mm0 \n\t"
1869 "paddd %%mm3, %%mm2 \n\t"
1870 "paddd %%mm4, %%mm0 \n\t"
1871 "paddd %%mm4, %%mm2 \n\t"
1872 "psrad $15, %%mm0 \n\t"
1873 "psrad $15, %%mm2 \n\t"
1874 "packssdw %%mm2, %%mm0 \n\t"
1875 "packuswb %%mm0, %%mm0 \n\t"
1876 "movd %%mm0, (%1, %%"REG_a") \n\t"
1877 "add $4, %%"REG_a" \n\t"
1878 " js 1b \n\t"
1879 : "+r" (src)
1880 : "r" (dst+width), "g" ((x86_reg)-width)
1881 : "%"REG_a
1885 static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, long width, enum PixelFormat srcFormat)
1887 __asm__ volatile(
1888 "movq 24+%4, %%mm6 \n\t"
1889 "mov %3, %%"REG_a" \n\t"
1890 "pxor %%mm7, %%mm7 \n\t"
1891 "1: \n\t"
1892 PREFETCH" 64(%0) \n\t"
1893 "movd (%0), %%mm0 \n\t"
1894 "movd 2(%0), %%mm1 \n\t"
1895 "punpcklbw %%mm7, %%mm0 \n\t"
1896 "punpcklbw %%mm7, %%mm1 \n\t"
1897 "movq %%mm0, %%mm2 \n\t"
1898 "movq %%mm1, %%mm3 \n\t"
1899 "pmaddwd %4, %%mm0 \n\t"
1900 "pmaddwd 8+%4, %%mm1 \n\t"
1901 "pmaddwd 16+%4, %%mm2 \n\t"
1902 "pmaddwd %%mm6, %%mm3 \n\t"
1903 "paddd %%mm1, %%mm0 \n\t"
1904 "paddd %%mm3, %%mm2 \n\t"
1906 "movd 6(%0), %%mm1 \n\t"
1907 "movd 8(%0), %%mm3 \n\t"
1908 "add $12, %0 \n\t"
1909 "punpcklbw %%mm7, %%mm1 \n\t"
1910 "punpcklbw %%mm7, %%mm3 \n\t"
1911 "movq %%mm1, %%mm4 \n\t"
1912 "movq %%mm3, %%mm5 \n\t"
1913 "pmaddwd %4, %%mm1 \n\t"
1914 "pmaddwd 8+%4, %%mm3 \n\t"
1915 "pmaddwd 16+%4, %%mm4 \n\t"
1916 "pmaddwd %%mm6, %%mm5 \n\t"
1917 "paddd %%mm3, %%mm1 \n\t"
1918 "paddd %%mm5, %%mm4 \n\t"
1920 "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
1921 "paddd %%mm3, %%mm0 \n\t"
1922 "paddd %%mm3, %%mm2 \n\t"
1923 "paddd %%mm3, %%mm1 \n\t"
1924 "paddd %%mm3, %%mm4 \n\t"
1925 "psrad $15, %%mm0 \n\t"
1926 "psrad $15, %%mm2 \n\t"
1927 "psrad $15, %%mm1 \n\t"
1928 "psrad $15, %%mm4 \n\t"
1929 "packssdw %%mm1, %%mm0 \n\t"
1930 "packssdw %%mm4, %%mm2 \n\t"
1931 "packuswb %%mm0, %%mm0 \n\t"
1932 "packuswb %%mm2, %%mm2 \n\t"
1933 "movd %%mm0, (%1, %%"REG_a") \n\t"
1934 "movd %%mm2, (%2, %%"REG_a") \n\t"
1935 "add $4, %%"REG_a" \n\t"
1936 " js 1b \n\t"
1937 : "+r" (src)
1938 : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
1939 : "%"REG_a
1942 #endif
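/* bgr24ToUV_mmx reaches all of its chroma coefficients through the single
 * "m" operand %4: offsets 0 and 8 hold the U coefficient quadwords, 16 and
 * 24 the V ones, and the RGB24-vs-BGR24 row of ff_bgr24toUV is selected
 * once when the operand is bound. */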
1944 static inline void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1946 #if COMPILE_TEMPLATE_MMX
1947 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
1948 #else
1949 int i;
1950 for (i=0; i<width; i++) {
1951 int b= src[i*3+0];
1952 int g= src[i*3+1];
1953 int r= src[i*3+2];
1955 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1957 #endif /* COMPILE_TEMPLATE_MMX */
1960 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1962 #if COMPILE_TEMPLATE_MMX
1963 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
1964 #else
1965 int i;
1966 for (i=0; i<width; i++) {
1967 int b= src1[3*i + 0];
1968 int g= src1[3*i + 1];
1969 int r= src1[3*i + 2];
1971 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1972 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1974 #endif /* COMPILE_TEMPLATE_MMX */
1975 assert(src1 == src2);
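/* The constant (257<<(RGB2YUV_SHIFT-1)) in the chroma expressions folds two
 * terms together: the 128 chroma bias (128<<RGB2YUV_SHIFT) plus a half-ulp
 * rounding term (1<<(RGB2YUV_SHIFT-1)). Likewise (33<<(RGB2YUV_SHIFT-1)) in
 * the luma path is the +16 black-level offset plus rounding. */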
1978 static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1980 int i;
1981 for (i=0; i<width; i++) {
1982 int b= src1[6*i + 0] + src1[6*i + 3];
1983 int g= src1[6*i + 1] + src1[6*i + 4];
1984 int r= src1[6*i + 2] + src1[6*i + 5];
1986 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1987 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1989 assert(src1 == src2);
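/* The *_half variants sum two horizontally adjacent pixels before the dot
 * product, so the shift grows to RGB2YUV_SHIFT+1 and the bias doubles to
 * (257<<RGB2YUV_SHIFT); this averages chroma at half horizontal resolution
 * in a single pass. */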
1992 static inline void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1994 #if COMPILE_TEMPLATE_MMX
1995 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
1996 #else
1997 int i;
1998 for (i=0; i<width; i++) {
1999 int r= src[i*3+0];
2000 int g= src[i*3+1];
2001 int b= src[i*3+2];
2003 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2005 #endif
2008 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
2010 #if COMPILE_TEMPLATE_MMX
2011 assert(src1==src2);
2012 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
2013 #else
2014 int i;
2015 assert(src1==src2);
2016 for (i=0; i<width; i++) {
2017 int r= src1[3*i + 0];
2018 int g= src1[3*i + 1];
2019 int b= src1[3*i + 2];
2021 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2022 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2024 #endif
2027 static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
2029 int i;
2030 assert(src1==src2);
2031 for (i=0; i<width; i++) {
2032 int r= src1[6*i + 0] + src1[6*i + 3];
2033 int g= src1[6*i + 1] + src1[6*i + 4];
2034 int b= src1[6*i + 2] + src1[6*i + 5];
2036 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2037 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2042 // bilinear / bicubic scaling
2043 static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc,
2044 const int16_t *filter, const int16_t *filterPos, long filterSize)
2046 #if COMPILE_TEMPLATE_MMX
2047 assert(filterSize % 4 == 0 && filterSize>0);
2048 if (filterSize==4) { // Always true for upscaling, sometimes for down, too.
2049 x86_reg counter= -2*dstW;
2050 filter-= counter*2;
2051 filterPos-= counter/2;
2052 dst-= counter/2;
2053 __asm__ volatile(
2054 #if defined(PIC)
2055 "push %%"REG_b" \n\t"
2056 #endif
2057 "pxor %%mm7, %%mm7 \n\t"
2058 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2059 "mov %%"REG_a", %%"REG_BP" \n\t"
2060 ASMALIGN(4)
2061 "1: \n\t"
2062 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2063 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2064 "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
2065 "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
2066 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2067 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2068 "punpcklbw %%mm7, %%mm0 \n\t"
2069 "punpcklbw %%mm7, %%mm2 \n\t"
2070 "pmaddwd %%mm1, %%mm0 \n\t"
2071 "pmaddwd %%mm2, %%mm3 \n\t"
2072 "movq %%mm0, %%mm4 \n\t"
2073 "punpckldq %%mm3, %%mm0 \n\t"
2074 "punpckhdq %%mm3, %%mm4 \n\t"
2075 "paddd %%mm4, %%mm0 \n\t"
2076 "psrad $7, %%mm0 \n\t"
2077 "packssdw %%mm0, %%mm0 \n\t"
2078 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2079 "add $4, %%"REG_BP" \n\t"
2080 " jnc 1b \n\t"
2082 "pop %%"REG_BP" \n\t"
2083 #if defined(PIC)
2084 "pop %%"REG_b" \n\t"
2085 #endif
2086 : "+a" (counter)
2087 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2088 #if !defined(PIC)
2089 : "%"REG_b
2090 #endif
2092 } else if (filterSize==8) {
2093 x86_reg counter= -2*dstW;
2094 filter-= counter*4;
2095 filterPos-= counter/2;
2096 dst-= counter/2;
2097 __asm__ volatile(
2098 #if defined(PIC)
2099 "push %%"REG_b" \n\t"
2100 #endif
2101 "pxor %%mm7, %%mm7 \n\t"
2102 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2103 "mov %%"REG_a", %%"REG_BP" \n\t"
2104 ASMALIGN(4)
2105 "1: \n\t"
2106 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2107 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2108 "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
2109 "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
2110 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2111 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2112 "punpcklbw %%mm7, %%mm0 \n\t"
2113 "punpcklbw %%mm7, %%mm2 \n\t"
2114 "pmaddwd %%mm1, %%mm0 \n\t"
2115 "pmaddwd %%mm2, %%mm3 \n\t"
2117 "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
2118 "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
2119 "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
2120 "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
2121 "punpcklbw %%mm7, %%mm4 \n\t"
2122 "punpcklbw %%mm7, %%mm2 \n\t"
2123 "pmaddwd %%mm1, %%mm4 \n\t"
2124 "pmaddwd %%mm2, %%mm5 \n\t"
2125 "paddd %%mm4, %%mm0 \n\t"
2126 "paddd %%mm5, %%mm3 \n\t"
2127 "movq %%mm0, %%mm4 \n\t"
2128 "punpckldq %%mm3, %%mm0 \n\t"
2129 "punpckhdq %%mm3, %%mm4 \n\t"
2130 "paddd %%mm4, %%mm0 \n\t"
2131 "psrad $7, %%mm0 \n\t"
2132 "packssdw %%mm0, %%mm0 \n\t"
2133 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2134 "add $4, %%"REG_BP" \n\t"
2135 " jnc 1b \n\t"
2137 "pop %%"REG_BP" \n\t"
2138 #if defined(PIC)
2139 "pop %%"REG_b" \n\t"
2140 #endif
2141 : "+a" (counter)
2142 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2143 #if !defined(PIC)
2144 : "%"REG_b
2145 #endif
2147 } else {
2148 const uint8_t *offset = src+filterSize;
2149 x86_reg counter= -2*dstW;
2150 //filter-= counter*filterSize/2;
2151 filterPos-= counter/2;
2152 dst-= counter/2;
2153 __asm__ volatile(
2154 "pxor %%mm7, %%mm7 \n\t"
2155 ASMALIGN(4)
2156 "1: \n\t"
2157 "mov %2, %%"REG_c" \n\t"
2158 "movzwl (%%"REG_c", %0), %%eax \n\t"
2159 "movzwl 2(%%"REG_c", %0), %%edx \n\t"
2160 "mov %5, %%"REG_c" \n\t"
2161 "pxor %%mm4, %%mm4 \n\t"
2162 "pxor %%mm5, %%mm5 \n\t"
2163 "2: \n\t"
2164 "movq (%1), %%mm1 \n\t"
2165 "movq (%1, %6), %%mm3 \n\t"
2166 "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
2167 "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
2168 "punpcklbw %%mm7, %%mm0 \n\t"
2169 "punpcklbw %%mm7, %%mm2 \n\t"
2170 "pmaddwd %%mm1, %%mm0 \n\t"
2171 "pmaddwd %%mm2, %%mm3 \n\t"
2172 "paddd %%mm3, %%mm5 \n\t"
2173 "paddd %%mm0, %%mm4 \n\t"
2174 "add $8, %1 \n\t"
2175 "add $4, %%"REG_c" \n\t"
2176 "cmp %4, %%"REG_c" \n\t"
2177 " jb 2b \n\t"
2178 "add %6, %1 \n\t"
2179 "movq %%mm4, %%mm0 \n\t"
2180 "punpckldq %%mm5, %%mm4 \n\t"
2181 "punpckhdq %%mm5, %%mm0 \n\t"
2182 "paddd %%mm0, %%mm4 \n\t"
2183 "psrad $7, %%mm4 \n\t"
2184 "packssdw %%mm4, %%mm4 \n\t"
2185 "mov %3, %%"REG_a" \n\t"
2186 "movd %%mm4, (%%"REG_a", %0) \n\t"
2187 "add $4, %0 \n\t"
2188 " jnc 1b \n\t"
2190 : "+r" (counter), "+r" (filter)
2191 : "m" (filterPos), "m" (dst), "m"(offset),
2192 "m" (src), "r" ((x86_reg)filterSize*2)
2193 : "%"REG_a, "%"REG_c, "%"REG_d
2196 #else
2197 #if COMPILE_TEMPLATE_ALTIVEC
2198 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
2199 #else
2200 int i;
2201 for (i=0; i<dstW; i++) {
2202 int j;
2203 int srcPos= filterPos[i];
2204 int val=0;
2205 //printf("filterPos: %d\n", filterPos[i]);
2206 for (j=0; j<filterSize; j++) {
2207 //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2208 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2210 //filter += hFilterSize;
2211 dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
2212 //dst[i] = val>>7;
2214 #endif /* COMPILE_ALTIVEC */
2215 #endif /* COMPILE_MMX */
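/* All three MMX paths above and the C fallback compute the same FIR,
 *   dst[i] = min( (sum_j src[filterPos[i]+j] * filter[i*filterSize+j]) >> 7, 32767 ),
 * leaving 7 fraction bits in the 15-bit intermediate that the vertical
 * scaler expects. A compact scalar reference for one output sample (helper
 * name illustrative): */
static inline int16_t hscale_one_sketch(const uint8_t *src, const int16_t *filter,
                                        long filterSize, int srcPos)
{
    long j;
    int val = 0;
    for (j = 0; j < filterSize; j++)
        val += src[srcPos + j] * filter[j];
    return (int16_t)FFMIN(val >> 7, (1 << 15) - 1); /* clip: the cubic filter can overflow */
}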
2218 //FIXME all pal and rgb srcFormats could do this conversion as well
2219 //FIXME all scalers more complex than bilinear could do half of this transform
2220 static void RENAME(chrRangeToJpeg)(uint16_t *dst, int width)
2222 int i;
2223 for (i = 0; i < width; i++) {
2224 dst[i ] = (FFMIN(dst[i ],30775)*4663 - 9289992)>>12; //-264
2225 dst[i+VOFW] = (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
2228 static void RENAME(chrRangeFromJpeg)(uint16_t *dst, int width)
2230 int i;
2231 for (i = 0; i < width; i++) {
2232 dst[i ] = (dst[i ]*1799 + 4081085)>>11; //1469
2233 dst[i+VOFW] = (dst[i+VOFW]*1799 + 4081085)>>11; //1469
2236 static void RENAME(lumRangeToJpeg)(uint16_t *dst, int width)
2238 int i;
2239 for (i = 0; i < width; i++)
2240 dst[i] = (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
2242 static void RENAME(lumRangeFromJpeg)(uint16_t *dst, int width)
2244 int i;
2245 for (i = 0; i < width; i++)
2246 dst[i] = (dst[i]*14071 + 33561947)>>14;
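/* The range-conversion constants are the full<->limited affine maps applied
 * to the 15-bit (value<<7) intermediates; e.g. for luma toJpeg,
 * y' = (y - 16*128) * 255/219, with 255/219 * 2^14 ~ 19077 and an offset of
 * ~ 16*128*19077. The FFMIN clamps keep the scaled result within the 15-bit
 * intermediate range. */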
2249 #define FAST_BILINEAR_X86 \
2250 "subl %%edi, %%esi \n\t" /* src[xx+1] - src[xx] */ \
2251 "imull %%ecx, %%esi \n\t" /* (src[xx+1] - src[xx])*xalpha */ \
2252 "shll $16, %%edi \n\t" \
2253 "addl %%edi, %%esi \n\t" /* src[xx+1]*xalpha + src[xx]*(1-xalpha) */ \
2254 "mov %1, %%"REG_D"\n\t" \
2255 "shrl $9, %%esi \n\t" \
2257 static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
2258 long dstWidth, const uint8_t *src, int srcW,
2259 int xInc)
2261 #if ARCH_X86 && CONFIG_GPL
2262 #if COMPILE_TEMPLATE_MMX2
2263 int32_t *filterPos = c->hLumFilterPos;
2264 int16_t *filter = c->hLumFilter;
2265 int canMMX2BeUsed = c->canMMX2BeUsed;
2266 void *mmx2FilterCode= c->lumMmx2FilterCode;
2267 int i;
2268 #if defined(PIC)
2269 DECLARE_ALIGNED(8, uint64_t, ebxsave);
2270 #endif
2271 if (canMMX2BeUsed) {
2272 __asm__ volatile(
2273 #if defined(PIC)
2274 "mov %%"REG_b", %5 \n\t"
2275 #endif
2276 "pxor %%mm7, %%mm7 \n\t"
2277 "mov %0, %%"REG_c" \n\t"
2278 "mov %1, %%"REG_D" \n\t"
2279 "mov %2, %%"REG_d" \n\t"
2280 "mov %3, %%"REG_b" \n\t"
2281 "xor %%"REG_a", %%"REG_a" \n\t" // i
2282 PREFETCH" (%%"REG_c") \n\t"
2283 PREFETCH" 32(%%"REG_c") \n\t"
2284 PREFETCH" 64(%%"REG_c") \n\t"
2286 #if ARCH_X86_64
2288 #define CALL_MMX2_FILTER_CODE \
2289 "movl (%%"REG_b"), %%esi \n\t"\
2290 "call *%4 \n\t"\
2291 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2292 "add %%"REG_S", %%"REG_c" \n\t"\
2293 "add %%"REG_a", %%"REG_D" \n\t"\
2294 "xor %%"REG_a", %%"REG_a" \n\t"\
2296 #else
2298 #define CALL_MMX2_FILTER_CODE \
2299 "movl (%%"REG_b"), %%esi \n\t"\
2300 "call *%4 \n\t"\
2301 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2302 "add %%"REG_a", %%"REG_D" \n\t"\
2303 "xor %%"REG_a", %%"REG_a" \n\t"\
2305 #endif /* ARCH_X86_64 */
2307 CALL_MMX2_FILTER_CODE
2308 CALL_MMX2_FILTER_CODE
2309 CALL_MMX2_FILTER_CODE
2310 CALL_MMX2_FILTER_CODE
2311 CALL_MMX2_FILTER_CODE
2312 CALL_MMX2_FILTER_CODE
2313 CALL_MMX2_FILTER_CODE
2314 CALL_MMX2_FILTER_CODE
2316 #if defined(PIC)
2317 "mov %5, %%"REG_b" \n\t"
2318 #endif
2319 :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
2320 "m" (mmx2FilterCode)
2321 #if defined(PIC)
2322 ,"m" (ebxsave)
2323 #endif
2324 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2325 #if !defined(PIC)
2326 ,"%"REG_b
2327 #endif
2329 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2330 } else {
2331 #endif /* COMPILE_TEMPLATE_MMX2 */
2332 x86_reg xInc_shr16 = xInc >> 16;
2333 uint16_t xInc_mask = xInc & 0xffff;
2334 // no MMX, just plain asm ...
2335 __asm__ volatile(
2336 "xor %%"REG_a", %%"REG_a" \n\t" // i
2337 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2338 "xorl %%ecx, %%ecx \n\t" // xalpha
2339 ASMALIGN(4)
2340 "1: \n\t"
2341 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2342 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2343 FAST_BILINEAR_X86
2344 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2345 "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
2346 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
2348 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2349 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2350 FAST_BILINEAR_X86
2351 "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t"
2352 "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
2353 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
2356 "add $2, %%"REG_a" \n\t"
2357 "cmp %2, %%"REG_a" \n\t"
2358 " jb 1b \n\t"
2361 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
2362 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2364 #if COMPILE_TEMPLATE_MMX2
2365 } //if MMX2 can't be used
2366 #endif
2367 #else
2368 int i;
2369 unsigned int xpos=0;
2370 for (i=0;i<dstWidth;i++) {
2371 register unsigned int xx=xpos>>16;
2372 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2373 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2374 xpos+=xInc;
2376 #endif /* ARCH_X86 */
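/* Scalar model of the fast bilinear path above, matching the
 * FAST_BILINEAR_X86 asm (the C fallback rounds the fraction down to 7 bits
 * first): xpos is a 16.16 fixed-point source position, and >>9 rescales the
 * 8.16 blend onto the <<7 intermediate. Helper name illustrative: */
static inline uint16_t fast_bilinear_sketch(const uint8_t *src, unsigned xpos)
{
    unsigned xx   = xpos >> 16;      /* integer source index */
    int      frac = xpos & 0xFFFF;   /* 16-bit fraction = xalpha */
    return (uint16_t)((((src[xx+1] - src[xx]) * frac) + (src[xx] << 16)) >> 9);
}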
2379 // *** horizontal scale Y line to temp buffer
2380 static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src, int srcW, int xInc,
2381 const int16_t *hLumFilter,
2382 const int16_t *hLumFilterPos, int hLumFilterSize,
2383 uint8_t *formatConvBuffer,
2384 uint32_t *pal, int isAlpha)
2386 void (*toYV12)(uint8_t *, const uint8_t *, long, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
2387 void (*convertRange)(uint16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
2389 src += isAlpha ? c->alpSrcOffset : c->lumSrcOffset;
2391 if (toYV12) {
2392 toYV12(formatConvBuffer, src, srcW, pal);
2393 src= formatConvBuffer;
2396 if (!c->hyscale_fast) {
2397 c->hScale(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2398 } else { // fast bilinear upscale / crap downscale
2399 c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
2402 if (convertRange)
2403 convertRange(dst, dstWidth);
2406 static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
2407 long dstWidth, const uint8_t *src1,
2408 const uint8_t *src2, int srcW, int xInc)
2410 #if ARCH_X86 && CONFIG_GPL
2411 #if COMPILE_TEMPLATE_MMX2
2412 int32_t *filterPos = c->hChrFilterPos;
2413 int16_t *filter = c->hChrFilter;
2414 int canMMX2BeUsed = c->canMMX2BeUsed;
2415 void *mmx2FilterCode= c->chrMmx2FilterCode;
2416 int i;
2417 #if defined(PIC)
2418 DECLARE_ALIGNED(8, uint64_t, ebxsave);
2419 #endif
2420 if (canMMX2BeUsed) {
2421 __asm__ volatile(
2422 #if defined(PIC)
2423 "mov %%"REG_b", %6 \n\t"
2424 #endif
2425 "pxor %%mm7, %%mm7 \n\t"
2426 "mov %0, %%"REG_c" \n\t"
2427 "mov %1, %%"REG_D" \n\t"
2428 "mov %2, %%"REG_d" \n\t"
2429 "mov %3, %%"REG_b" \n\t"
2430 "xor %%"REG_a", %%"REG_a" \n\t" // i
2431 PREFETCH" (%%"REG_c") \n\t"
2432 PREFETCH" 32(%%"REG_c") \n\t"
2433 PREFETCH" 64(%%"REG_c") \n\t"
2435 CALL_MMX2_FILTER_CODE
2436 CALL_MMX2_FILTER_CODE
2437 CALL_MMX2_FILTER_CODE
2438 CALL_MMX2_FILTER_CODE
2439 "xor %%"REG_a", %%"REG_a" \n\t" // i
2440 "mov %5, %%"REG_c" \n\t" // src
2441 "mov %1, %%"REG_D" \n\t" // buf1
2442 "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t"
2443 PREFETCH" (%%"REG_c") \n\t"
2444 PREFETCH" 32(%%"REG_c") \n\t"
2445 PREFETCH" 64(%%"REG_c") \n\t"
2447 CALL_MMX2_FILTER_CODE
2448 CALL_MMX2_FILTER_CODE
2449 CALL_MMX2_FILTER_CODE
2450 CALL_MMX2_FILTER_CODE
2452 #if defined(PIC)
2453 "mov %6, %%"REG_b" \n\t"
2454 #endif
2455 :: "m" (src1), "m" (dst), "m" (filter), "m" (filterPos),
2456 "m" (mmx2FilterCode), "m" (src2)
2457 #if defined(PIC)
2458 ,"m" (ebxsave)
2459 #endif
2460 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2461 #if !defined(PIC)
2462 ,"%"REG_b
2463 #endif
2465 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
2466 //printf("%d %d %d\n", dstWidth, i, srcW);
2467 dst[i] = src1[srcW-1]*128;
2468 dst[i+VOFW] = src2[srcW-1]*128;
2470 } else {
2471 #endif /* COMPILE_TEMPLATE_MMX2 */
2472 x86_reg xInc_shr16 = (x86_reg) (xInc >> 16);
2473 uint16_t xInc_mask = xInc & 0xffff;
2474 __asm__ volatile(
2475 "xor %%"REG_a", %%"REG_a" \n\t" // i
2476 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2477 "xorl %%ecx, %%ecx \n\t" // xalpha
2478 ASMALIGN(4)
2479 "1: \n\t"
2480 "mov %0, %%"REG_S" \n\t"
2481 "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
2482 "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
2483 FAST_BILINEAR_X86
2484 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2486 "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx]
2487 "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1]
2488 FAST_BILINEAR_X86
2489 "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t"
2491 "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
2492 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
2493 "add $1, %%"REG_a" \n\t"
2494 "cmp %2, %%"REG_a" \n\t"
2495 " jb 1b \n\t"
2497 /* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2498 which is needed to support GCC 4.0. */
2499 #if ARCH_X86_64 && AV_GCC_VERSION_AT_LEAST(3,4)
2500 :: "m" (src1), "m" (dst), "g" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2501 #else
2502 :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2503 #endif
2504 "r" (src2)
2505 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2507 #if COMPILE_TEMPLATE_MMX2
2508 } //if MMX2 can't be used
2509 #endif
2510 #else
2511 int i;
2512 unsigned int xpos=0;
2513 for (i=0;i<dstWidth;i++) {
2514 register unsigned int xx=xpos>>16;
2515 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2516 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2517 dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2518 /* slower
2519 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2520 dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2521 */
2522 xpos+=xInc;
2524 #endif /* ARCH_X86 */
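/* In the C fallback above, xalpha is only 7 bits wide, so (xalpha^127) is a
 * cheap 127-xalpha; the blend src[xx]*(127-a) + src[xx+1]*a lands close to
 * the <<7 intermediate scale (the weights sum to 127), which the
 * commented-out "slower" variant trades for an exact <<7 plus an extra
 * subtract per sample. */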
2527 inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src1, const uint8_t *src2,
2528 int srcW, int xInc, const int16_t *hChrFilter,
2529 const int16_t *hChrFilterPos, int hChrFilterSize,
2530 uint8_t *formatConvBuffer,
2531 uint32_t *pal)
2534 src1 += c->chrSrcOffset;
2535 src2 += c->chrSrcOffset;
2537 if (c->chrToYV12) {
2538 c->chrToYV12(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2539 src1= formatConvBuffer;
2540 src2= formatConvBuffer+VOFW;
2543 if (!c->hcscale_fast) {
2544 c->hScale(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2545 c->hScale(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2546 } else { // fast bilinear upscale / crap downscale
2547 c->hcscale_fast(c, dst, dstWidth, src1, src2, srcW, xInc);
2550 if (c->chrConvertRange)
2551 c->chrConvertRange(dst, dstWidth);
2554 #define DEBUG_SWSCALE_BUFFERS 0
2555 #define DEBUG_BUFFERS(...) if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__)
2557 static int RENAME(swScale)(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY,
2558 int srcSliceH, uint8_t* dst[], int dstStride[])
2560 /* load a few things into local vars to make the code more readable and faster */
2561 const int srcW= c->srcW;
2562 const int dstW= c->dstW;
2563 const int dstH= c->dstH;
2564 const int chrDstW= c->chrDstW;
2565 const int chrSrcW= c->chrSrcW;
2566 const int lumXInc= c->lumXInc;
2567 const int chrXInc= c->chrXInc;
2568 const enum PixelFormat dstFormat= c->dstFormat;
2569 const int flags= c->flags;
2570 int16_t *vLumFilterPos= c->vLumFilterPos;
2571 int16_t *vChrFilterPos= c->vChrFilterPos;
2572 int16_t *hLumFilterPos= c->hLumFilterPos;
2573 int16_t *hChrFilterPos= c->hChrFilterPos;
2574 int16_t *vLumFilter= c->vLumFilter;
2575 int16_t *vChrFilter= c->vChrFilter;
2576 int16_t *hLumFilter= c->hLumFilter;
2577 int16_t *hChrFilter= c->hChrFilter;
2578 int32_t *lumMmxFilter= c->lumMmxFilter;
2579 int32_t *chrMmxFilter= c->chrMmxFilter;
2580 int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
2581 const int vLumFilterSize= c->vLumFilterSize;
2582 const int vChrFilterSize= c->vChrFilterSize;
2583 const int hLumFilterSize= c->hLumFilterSize;
2584 const int hChrFilterSize= c->hChrFilterSize;
2585 int16_t **lumPixBuf= c->lumPixBuf;
2586 int16_t **chrPixBuf= c->chrPixBuf;
2587 int16_t **alpPixBuf= c->alpPixBuf;
2588 const int vLumBufSize= c->vLumBufSize;
2589 const int vChrBufSize= c->vChrBufSize;
2590 uint8_t *formatConvBuffer= c->formatConvBuffer;
2591 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2592 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
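/* -((-srcSliceH) >> c->chrSrcVSubSample) is a ceiling shift: it rounds the
 * chroma slice height up, so slices whose height is not a multiple of the
 * vertical subsampling still cover all of their chroma lines. */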
2593 int lastDstY;
2594 uint32_t *pal=c->pal_yuv;
2596 /* vars which will change and which we need to store back in the context */
2597 int dstY= c->dstY;
2598 int lumBufIndex= c->lumBufIndex;
2599 int chrBufIndex= c->chrBufIndex;
2600 int lastInLumBuf= c->lastInLumBuf;
2601 int lastInChrBuf= c->lastInChrBuf;
2603 if (isPacked(c->srcFormat)) {
2604 src[0]=
2605 src[1]=
2606 src[2]=
2607 src[3]= src[0];
2608 srcStride[0]=
2609 srcStride[1]=
2610 srcStride[2]=
2611 srcStride[3]= srcStride[0];
2613 srcStride[1]<<= c->vChrDrop;
2614 srcStride[2]<<= c->vChrDrop;
2616 DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
2617 src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
2618 dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
2619 DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
2620 srcSliceY, srcSliceH, dstY, dstH);
2621 DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
2622 vLumFilterSize, vLumBufSize, vChrFilterSize, vChrBufSize);
2624 if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
2625 static int warnedAlready=0; //FIXME move this into the context perhaps
2626 if (flags & SWS_PRINT_INFO && !warnedAlready) {
2627 av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
2628 " ->cannot do aligned memory accesses anymore\n");
2629 warnedAlready=1;
2633 /* Note that the user might start scaling in the middle of the picture, in
2634 which case this will not get executed. This is not really intended, but it
2635 currently works, so people might rely on it. */
2636 if (srcSliceY ==0) {
2637 lumBufIndex=-1;
2638 chrBufIndex=-1;
2639 dstY=0;
2640 lastInLumBuf= -1;
2641 lastInChrBuf= -1;
2644 lastDstY= dstY;
2646 for (;dstY < dstH; dstY++) {
2647 unsigned char *dest =dst[0]+dstStride[0]*dstY;
2648 const int chrDstY= dstY>>c->chrDstVSubSample;
2649 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2650 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
2651 unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL;
2653 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2654 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2655 int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2656 int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2657 int enough_lines;
2659 //handle holes (FAST_BILINEAR & weird filters)
2660 if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2661 if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2662 assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
2663 assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
2665 DEBUG_BUFFERS("dstY: %d\n", dstY);
2666 DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
2667 firstLumSrcY, lastLumSrcY, lastInLumBuf);
2668 DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
2669 firstChrSrcY, lastChrSrcY, lastInChrBuf);
2671 // Do we have enough lines in this slice to output the dstY line?
2672 enough_lines = lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);
2673 if (!enough_lines) {
2674 lastLumSrcY = srcSliceY + srcSliceH - 1;
2675 lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
2676 DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
2677 lastLumSrcY, lastChrSrcY);
2680 //Do horizontal scaling
2681 while(lastInLumBuf < lastLumSrcY) {
2682 const uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2683 const uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
2684 lumBufIndex++;
2685 assert(lumBufIndex < 2*vLumBufSize);
2686 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
2687 assert(lastInLumBuf + 1 - srcSliceY >= 0);
2688 RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
2689 hLumFilter, hLumFilterPos, hLumFilterSize,
2690 formatConvBuffer,
2691 pal, 0);
2692 if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
2693 RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc,
2694 hLumFilter, hLumFilterPos, hLumFilterSize,
2695 formatConvBuffer,
2696 pal, 1);
2697 lastInLumBuf++;
2698 DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
2699 lumBufIndex, lastInLumBuf);
2701 while(lastInChrBuf < lastChrSrcY) {
2702 const uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2703 const uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2704 chrBufIndex++;
2705 assert(chrBufIndex < 2*vChrBufSize);
2706 assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
2707 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
2708 //FIXME pass these parameters through the context struct (at least some of them)
2710 if (c->needs_hcscale)
2711 RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2712 hChrFilter, hChrFilterPos, hChrFilterSize,
2713 formatConvBuffer,
2714 pal);
2715 lastInChrBuf++;
2716 DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
2717 chrBufIndex, lastInChrBuf);
2719 //wrap buf index around to stay inside the ring buffer
2720 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
2721 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
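/* lumPixBuf/chrPixBuf are ring buffers whose pointer arrays are allocated
 * at twice vLumBufSize/vChrBufSize, so the filter windows taken below can
 * read vFilterSize consecutive entries without a second wrap check (see the
 * asserts against vLumBufSize*2 / vChrBufSize*2 further down). */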
2722 if (!enough_lines)
2723 break; //we can't output a dstY line so let's try with the next slice
2725 #if COMPILE_TEMPLATE_MMX
2726 c->blueDither= ff_dither8[dstY&1];
2727 if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
2728 c->greenDither= ff_dither8[dstY&1];
2729 else
2730 c->greenDither= ff_dither4[dstY&1];
2731 c->redDither= ff_dither8[(dstY+1)&1];
2732 #endif
2733 if (dstY < dstH-2) {
2734 const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2735 const int16_t **chrSrcPtr= (const int16_t **) chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2736 const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
2737 #if COMPILE_TEMPLATE_MMX
2738 int i;
2739 if (flags & SWS_ACCURATE_RND) {
2740 int s= APCK_SIZE / 8;
2741 for (i=0; i<vLumFilterSize; i+=2) {
2742 *(const void**)&lumMmxFilter[s*i ]= lumSrcPtr[i ];
2743 *(const void**)&lumMmxFilter[s*i+APCK_PTR2/4 ]= lumSrcPtr[i+(vLumFilterSize>1)];
2744 lumMmxFilter[s*i+APCK_COEF/4 ]=
2745 lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i ]
2746 + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
2747 if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
2748 *(const void**)&alpMmxFilter[s*i ]= alpSrcPtr[i ];
2749 *(const void**)&alpMmxFilter[s*i+APCK_PTR2/4 ]= alpSrcPtr[i+(vLumFilterSize>1)];
2750 alpMmxFilter[s*i+APCK_COEF/4 ]=
2751 alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4 ];
2754 for (i=0; i<vChrFilterSize; i+=2) {
2755 *(const void**)&chrMmxFilter[s*i ]= chrSrcPtr[i ];
2756 *(const void**)&chrMmxFilter[s*i+APCK_PTR2/4 ]= chrSrcPtr[i+(vChrFilterSize>1)];
2757 chrMmxFilter[s*i+APCK_COEF/4 ]=
2758 chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i ]
2759 + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
2761 } else {
2762 for (i=0; i<vLumFilterSize; i++) {
2763 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
2764 lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
2765 lumMmxFilter[4*i+2]=
2766 lumMmxFilter[4*i+3]=
2767 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
2768 if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
2769 alpMmxFilter[4*i+0]= (int32_t)alpSrcPtr[i];
2770 alpMmxFilter[4*i+1]= (uint64_t)alpSrcPtr[i] >> 32;
2771 alpMmxFilter[4*i+2]=
2772 alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
2775 for (i=0; i<vChrFilterSize; i++) {
2776 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
2777 chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
2778 chrMmxFilter[4*i+2]=
2779 chrMmxFilter[4*i+3]=
2780 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
2783 #endif
2784 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
2785 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2786 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
2787 c->yuv2nv12X(c,
2788 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2789 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2790 dest, uDest, dstW, chrDstW, dstFormat);
2791 } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
2792 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2793 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2794 if (is16BPS(dstFormat)) {
2795 yuv2yuvX16inC(
2796 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2797 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2798 alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
2799 dstFormat);
2800 } else if (vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
2801 const int16_t *lumBuf = lumSrcPtr[0];
2802 const int16_t *chrBuf= chrSrcPtr[0];
2803 const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
2804 c->yuv2yuv1(c, lumBuf, chrBuf, alpBuf, dest, uDest, vDest, aDest, dstW, chrDstW);
2805 } else { //General YV12
2806 c->yuv2yuvX(c,
2807 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2808 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2809 alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
2811 } else {
2812 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2813 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2814 if (vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
2815 int chrAlpha= vChrFilter[2*dstY+1];
2816 if(flags & SWS_FULL_CHR_H_INT) {
2817 yuv2rgbXinC_full(c, //FIXME write a packed1_full function
2818 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2819 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2820 alpSrcPtr, dest, dstW, dstY);
2821 } else {
2822 c->yuv2packed1(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
2823 alpPixBuf ? *alpSrcPtr : NULL,
2824 dest, dstW, chrAlpha, dstFormat, flags, dstY);
2826 } else if (vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
2827 int lumAlpha= vLumFilter[2*dstY+1];
2828 int chrAlpha= vChrFilter[2*dstY+1];
2829 lumMmxFilter[2]=
2830 lumMmxFilter[3]= vLumFilter[2*dstY ]*0x10001;
2831 chrMmxFilter[2]=
2832 chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
2833 if(flags & SWS_FULL_CHR_H_INT) {
2834 yuv2rgbXinC_full(c, //FIXME write a packed2_full function
2835 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2836 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2837 alpSrcPtr, dest, dstW, dstY);
2838 } else {
2839 c->yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
2840 alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL,
2841 dest, dstW, lumAlpha, chrAlpha, dstY);
2843 } else { //general RGB
2844 if(flags & SWS_FULL_CHR_H_INT) {
2845 yuv2rgbXinC_full(c,
2846 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2847 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2848 alpSrcPtr, dest, dstW, dstY);
2849 } else {
2850 c->yuv2packedX(c,
2851 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2852 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2853 alpSrcPtr, dest, dstW, dstY);
2857 } else { // it seems MMX cannot be used here without overwriting this array's tail
2858 const int16_t **lumSrcPtr= (const int16_t **)lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2859 const int16_t **chrSrcPtr= (const int16_t **)chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2860 const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
2861 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
2862 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2863 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
2864 yuv2nv12XinC(
2865 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2866 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2867 dest, uDest, dstW, chrDstW, dstFormat);
2868 } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12
2869 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2870 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2871 if (is16BPS(dstFormat)) {
2872 yuv2yuvX16inC(
2873 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2874 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2875 alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
2876 dstFormat);
2877 } else {
2878 yuv2yuvXinC(
2879 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2880 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2881 alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
2883 } else {
2884 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2885 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2886 if(flags & SWS_FULL_CHR_H_INT) {
2887 yuv2rgbXinC_full(c,
2888 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2889 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2890 alpSrcPtr, dest, dstW, dstY);
2891 } else {
2892 yuv2packedXinC(c,
2893 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2894 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2895 alpSrcPtr, dest, dstW, dstY);
2901 if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
2902 fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);
2904 #if COMPILE_TEMPLATE_MMX
2905 if (flags & SWS_CPU_CAPS_MMX2 ) __asm__ volatile("sfence":::"memory");
2906 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
2907 if (flags & SWS_CPU_CAPS_3DNOW) __asm__ volatile("femms" :::"memory");
2908 else __asm__ volatile("emms" :::"memory");
2909 #endif
2910 /* store changed local vars back in the context */
2911 c->dstY= dstY;
2912 c->lumBufIndex= lumBufIndex;
2913 c->chrBufIndex= chrBufIndex;
2914 c->lastInLumBuf= lastInLumBuf;
2915 c->lastInChrBuf= lastInChrBuf;
2917 return dstY - lastDstY;
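/* The return value is the number of destination lines produced for this
 * slice; it is 0 when the slice only buffered input lines without
 * completing any output line. */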
2920 static void RENAME(sws_init_swScale)(SwsContext *c)
2922 enum PixelFormat srcFormat = c->srcFormat;
2924 c->yuv2nv12X = RENAME(yuv2nv12X );
2925 c->yuv2yuv1 = RENAME(yuv2yuv1 );
2926 c->yuv2yuvX = RENAME(yuv2yuvX );
2927 c->yuv2packed1 = RENAME(yuv2packed1 );
2928 c->yuv2packed2 = RENAME(yuv2packed2 );
2929 c->yuv2packedX = RENAME(yuv2packedX );
2931 c->hScale = RENAME(hScale );
2933 #if COMPILE_TEMPLATE_MMX
2934 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2935 if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed)
2936 #else
2937 if (c->flags & SWS_FAST_BILINEAR)
2938 #endif
2940 c->hyscale_fast = RENAME(hyscale_fast);
2941 c->hcscale_fast = RENAME(hcscale_fast);
2944 c->chrToYV12 = NULL;
2945 switch(srcFormat) {
2946 case PIX_FMT_YUYV422 : c->chrToYV12 = RENAME(yuy2ToUV); break;
2947 case PIX_FMT_UYVY422 : c->chrToYV12 = RENAME(uyvyToUV); break;
2948 case PIX_FMT_NV12 : c->chrToYV12 = RENAME(nv12ToUV); break;
2949 case PIX_FMT_NV21 : c->chrToYV12 = RENAME(nv21ToUV); break;
2950 case PIX_FMT_RGB8 :
2951 case PIX_FMT_BGR8 :
2952 case PIX_FMT_PAL8 :
2953 case PIX_FMT_BGR4_BYTE:
2954 case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV; break;
2955 case PIX_FMT_YUV420P16BE:
2956 case PIX_FMT_YUV422P16BE:
2957 case PIX_FMT_YUV444P16BE: c->chrToYV12 = RENAME(BEToUV); break;
2958 case PIX_FMT_YUV420P16LE:
2959 case PIX_FMT_YUV422P16LE:
2960 case PIX_FMT_YUV444P16LE: c->chrToYV12 = RENAME(LEToUV); break;
2962 if (c->chrSrcHSubSample) {
2963 switch(srcFormat) {
2964 case PIX_FMT_RGB48BE:
2965 case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48ToUV_half; break;
2966 case PIX_FMT_RGB32 :
2967 case PIX_FMT_RGB32_1: c->chrToYV12 = bgr32ToUV_half; break;
2968 case PIX_FMT_BGR24 : c->chrToYV12 = RENAME(bgr24ToUV_half); break;
2969 case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV_half; break;
2970 case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV_half; break;
2971 case PIX_FMT_BGR32 :
2972 case PIX_FMT_BGR32_1: c->chrToYV12 = rgb32ToUV_half; break;
2973 case PIX_FMT_RGB24 : c->chrToYV12 = RENAME(rgb24ToUV_half); break;
2974 case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV_half; break;
2975 case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV_half; break;
2977 } else {
2978 switch(srcFormat) {
2979 case PIX_FMT_RGB48BE:
2980 case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48ToUV; break;
2981 case PIX_FMT_RGB32 :
2982 case PIX_FMT_RGB32_1: c->chrToYV12 = bgr32ToUV; break;
2983 case PIX_FMT_BGR24 : c->chrToYV12 = RENAME(bgr24ToUV); break;
2984 case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV; break;
2985 case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV; break;
2986 case PIX_FMT_BGR32 :
2987 case PIX_FMT_BGR32_1: c->chrToYV12 = rgb32ToUV; break;
2988 case PIX_FMT_RGB24 : c->chrToYV12 = RENAME(rgb24ToUV); break;
2989 case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV; break;
2990 case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV; break;
2994 c->lumToYV12 = NULL;
2995 c->alpToYV12 = NULL;
2996 switch (srcFormat) {
2997 case PIX_FMT_YUYV422 :
2998 case PIX_FMT_YUV420P16BE:
2999 case PIX_FMT_YUV422P16BE:
3000 case PIX_FMT_YUV444P16BE:
3001 case PIX_FMT_GRAY16BE : c->lumToYV12 = RENAME(yuy2ToY); break;
3002 case PIX_FMT_UYVY422 :
3003 case PIX_FMT_YUV420P16LE:
3004 case PIX_FMT_YUV422P16LE:
3005 case PIX_FMT_YUV444P16LE:
3006 case PIX_FMT_GRAY16LE : c->lumToYV12 = RENAME(uyvyToY); break;
3007 case PIX_FMT_BGR24 : c->lumToYV12 = RENAME(bgr24ToY); break;
3008 case PIX_FMT_BGR565 : c->lumToYV12 = bgr16ToY; break;
3009 case PIX_FMT_BGR555 : c->lumToYV12 = bgr15ToY; break;
3010 case PIX_FMT_RGB24 : c->lumToYV12 = RENAME(rgb24ToY); break;
3011 case PIX_FMT_RGB565 : c->lumToYV12 = rgb16ToY; break;
3012 case PIX_FMT_RGB555 : c->lumToYV12 = rgb15ToY; break;
3013 case PIX_FMT_RGB8 :
3014 case PIX_FMT_BGR8 :
3015 case PIX_FMT_PAL8 :
3016 case PIX_FMT_BGR4_BYTE:
3017 case PIX_FMT_RGB4_BYTE: c->lumToYV12 = palToY; break;
3018 case PIX_FMT_MONOBLACK: c->lumToYV12 = monoblack2Y; break;
3019 case PIX_FMT_MONOWHITE: c->lumToYV12 = monowhite2Y; break;
3020 case PIX_FMT_RGB32 :
3021 case PIX_FMT_RGB32_1: c->lumToYV12 = bgr32ToY; break;
3022 case PIX_FMT_BGR32 :
3023 case PIX_FMT_BGR32_1: c->lumToYV12 = rgb32ToY; break;
3024 case PIX_FMT_RGB48BE:
3025 case PIX_FMT_RGB48LE: c->lumToYV12 = rgb48ToY; break;
3027 if (c->alpPixBuf) {
3028 switch (srcFormat) {
3029 case PIX_FMT_RGB32 :
3030 case PIX_FMT_RGB32_1:
3031 case PIX_FMT_BGR32 :
3032 case PIX_FMT_BGR32_1: c->alpToYV12 = abgrToA; break;
3036 switch (srcFormat) {
3037 case PIX_FMT_RGB32 :
3038 case PIX_FMT_BGR32 :
3039 c->alpSrcOffset = 3;
3040 break;
3041 case PIX_FMT_RGB32_1:
3042 case PIX_FMT_BGR32_1:
3043 c->lumSrcOffset = ALT32_CORR;
3044 c->chrSrcOffset = ALT32_CORR;
3045 break;
3046 case PIX_FMT_RGB48LE:
3047 c->lumSrcOffset = 1;
3048 c->chrSrcOffset = 1;
3049 c->alpSrcOffset = 1;
3050 break;
3053 if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
3054 if (c->srcRange) {
3055 c->lumConvertRange = RENAME(lumRangeFromJpeg);
3056 c->chrConvertRange = RENAME(chrRangeFromJpeg);
3057 } else {
3058 c->lumConvertRange = RENAME(lumRangeToJpeg);
3059 c->chrConvertRange = RENAME(chrRangeToJpeg);
3063 if (!(isGray(srcFormat) || isGray(c->dstFormat) ||
3064 srcFormat == PIX_FMT_MONOBLACK || srcFormat == PIX_FMT_MONOWHITE))
3065 c->needs_hcscale = 1;