/*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH

#if COMPILE_TEMPLATE_AMD3DNOW
#define PREFETCH "prefetch"
#elif COMPILE_TEMPLATE_MMX2
#define PREFETCH "prefetchnta"
#else
#define PREFETCH " # nop"
#endif

#if COMPILE_TEMPLATE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif COMPILE_TEMPLATE_AMD3DNOW
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

#if COMPILE_TEMPLATE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
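
/* The MOVNTQ/REAL_MOVNTQ indirection makes the preprocessor expand the
 * macro arguments before REAL_MOVNTQ stringizes them, so that tokens such
 * as %%REGa (itself a macro for the target's a register) become %%eax or
 * %%rax first. For example, assuming MMX2 on x86_32:
 *   MOVNTQ(%%mm3, (%1, %%REGa))  ->  "movntq %%mm3, (%1, %%eax) \n\t"
 * With a single macro level, #a/#b would stringify "%%REGa" unexpanded. */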

#if COMPILE_TEMPLATE_ALTIVEC
#include "ppc/swscale_altivec_template.c"
#endif

#define YSCALEYUV2YV12X(x, offset, dest, width) \
    __asm__ volatile(\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1: \n\t"\
    "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
    "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
    "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t"\
    "pmulhw %%mm0, %%mm5 \n\t"\
    "paddw %%mm2, %%mm3 \n\t"\
    "paddw %%mm5, %%mm4 \n\t"\
    " jnz 1b \n\t"\
    "psraw $3, %%mm3 \n\t"\
    "psraw $3, %%mm4 \n\t"\
    "packuswb %%mm4, %%mm3 \n\t"\
    MOVNTQ(%%mm3, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "cmp %2, %%"REG_a" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "jb 1b \n\t"\
    :: "r" (&c->redDither),\
       "r" (dest), "g" ((x86_reg)width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );
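
/* In effect, for each output pixel i the loop above computes
 *   dest[i] = av_clip_uint8((rounder + sum_j (filter[j]*src[j][i] >> 16)) >> 3)
 * walking the (source pointer, coefficient) filter list until it reaches
 * the terminating NULL pointer: one vertically filtered 8-bit plane. */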

#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
    __asm__ volatile(\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    ASMALIGN(4) \
    "1: \n\t"\
    "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
    "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm1, %%mm0 \n\t"\
    "punpckhwd %%mm1, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm3 \n\t"\
    "paddd %%mm0, %%mm4 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm2 \n\t"\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "paddd %%mm2, %%mm6 \n\t"\
    "paddd %%mm0, %%mm7 \n\t"\
    " jnz 1b \n\t"\
    "psrad $16, %%mm4 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm4 \n\t"\
    "packssdw %%mm7, %%mm6 \n\t"\
    "paddw %%mm0, %%mm4 \n\t"\
    "paddw %%mm0, %%mm6 \n\t"\
    "psraw $3, %%mm4 \n\t"\
    "psraw $3, %%mm6 \n\t"\
    "packuswb %%mm6, %%mm4 \n\t"\
    MOVNTQ(%%mm4, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "cmp %2, %%"REG_a" \n\t"\
    "lea " offset "(%0), %%"REG_d" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "jb 1b \n\t"\
    :: "r" (&c->redDither),\
       "r" (dest), "g" ((x86_reg)width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );

#define YSCALEYUV2YV121 \
    "mov %2, %%"REG_a" \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1: \n\t"\
    "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
    "psraw $7, %%mm0 \n\t"\
    "psraw $7, %%mm1 \n\t"\
    "packuswb %%mm1, %%mm0 \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "jnc 1b \n\t"

#define YSCALEYUV2YV121_ACCURATE \
    "mov %2, %%"REG_a" \n\t"\
    "pcmpeqw %%mm7, %%mm7 \n\t"\
    "psrlw $15, %%mm7 \n\t"\
    "psllw $6, %%mm7 \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1: \n\t"\
    "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
    "paddsw %%mm7, %%mm0 \n\t"\
    "paddsw %%mm7, %%mm1 \n\t"\
    "psraw $7, %%mm0 \n\t"\
    "psraw $7, %%mm1 \n\t"\
    "packuswb %%mm1, %%mm0 \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add $8, %%"REG_a" \n\t"\
    "jnc 1b \n\t"

/*
    :: "m" (-lumFilterSize), "m" (-chrFilterSize),
       "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
       "r" (dest), "m" (dstW_reg),
       "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
    : "%eax", "%ebx", "%ecx", "%edx", "%esi"
*/

#define YSCALEYUV2PACKEDX_UV \
    __asm__ volatile(\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    ASMALIGN(4)\
    "nop \n\t"\
    "1: \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
    "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t"\
    "pmulhw %%mm0, %%mm5 \n\t"\
    "paddw %%mm2, %%mm3 \n\t"\
    "paddw %%mm5, %%mm4 \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    " jnz 2b \n\t"

#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
    "lea "offset"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
    "movq "#dst1", "#dst2" \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
    "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pmulhw "#coeff", "#src1" \n\t"\
    "pmulhw "#coeff", "#src2" \n\t"\
    "paddw "#src1", "#dst1" \n\t"\
    "paddw "#src2", "#dst2" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    " jnz 2b \n\t"

#define YSCALEYUV2PACKEDX \
    YSCALEYUV2PACKEDX_UV \
    YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7)

#define YSCALEYUV2PACKEDX_END \
    :: "r" (&c->redDither), \
       "m" (dummy), "m" (dummy), "m" (dummy),\
       "r" (dest), "m" (dstW_reg) \
    : "%"REG_a, "%"REG_d, "%"REG_S \
    );

#define YSCALEYUV2PACKEDX_ACCURATE_UV \
    __asm__ volatile(\
    "xor %%"REG_a", %%"REG_a" \n\t"\
    ASMALIGN(4)\
    "nop \n\t"\
    "1: \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm1, %%mm0 \n\t"\
    "punpckhwd %%mm1, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm3 \n\t"\
    "paddd %%mm0, %%mm4 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm2 \n\t"\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "paddd %%mm2, %%mm6 \n\t"\
    "paddd %%mm0, %%mm7 \n\t"\
    " jnz 2b \n\t"\
    "psrad $16, %%mm4 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm4 \n\t"\
    "packssdw %%mm7, %%mm6 \n\t"\
    "paddw %%mm0, %%mm4 \n\t"\
    "paddw %%mm0, %%mm6 \n\t"\
    "movq %%mm4, "U_TEMP"(%0) \n\t"\
    "movq %%mm6, "V_TEMP"(%0) \n\t"

#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
    "lea "offset"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pxor %%mm1, %%mm1 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    ASMALIGN(4)\
    "2: \n\t"\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm4, %%mm0 \n\t"\
    "punpckhwd %%mm4, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm3 \n\t"\
    "paddd %%mm0, %%mm1 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm2 \n\t"\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "paddd %%mm2, %%mm7 \n\t"\
    "paddd %%mm0, %%mm6 \n\t"\
    " jnz 2b \n\t"\
    "psrad $16, %%mm1 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm1 \n\t"\
    "packssdw %%mm6, %%mm7 \n\t"\
    "paddw %%mm0, %%mm1 \n\t"\
    "paddw %%mm0, %%mm7 \n\t"\
    "movq "U_TEMP"(%0), %%mm3 \n\t"\
    "movq "V_TEMP"(%0), %%mm4 \n\t"

#define YSCALEYUV2PACKEDX_ACCURATE \
    YSCALEYUV2PACKEDX_ACCURATE_UV \
    YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)

#define YSCALEYUV2RGBX \
    "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
    "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"

#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
    "psraw $3, %%mm0 \n\t"\
    "psraw $3, %%mm1 \n\t"\
    "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/

#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)

#define REAL_YSCALEYUV2RGB_UV(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */

#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
    "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/

#define REAL_YSCALEYUV2RGB_COEFF(c) \
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"

#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)

#define YSCALEYUV2RGB(index, c) \
    REAL_YSCALEYUV2RGB_UV(index, c) \
    REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
    REAL_YSCALEYUV2RGB_COEFF(c)

#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "psraw $7, %%mm3 \n\t"\
    "psraw $7, %%mm4 \n\t"\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $7, %%mm1 \n\t"\
    "psraw $7, %%mm7 \n\t"

#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)

#define REAL_YSCALEYUV2RGB1(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"

#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)

#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $8, %%mm3 \n\t"\
    "psrlw $8, %%mm4 \n\t"\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $7, %%mm1 \n\t"\
    "psraw $7, %%mm7 \n\t"
#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)

// do vertical chrominance interpolation
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ASMALIGN(4)\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
    "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"

#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)

#define REAL_YSCALEYUV2RGB1_ALPHA(index) \
    "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
    "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
    "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
    "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
    "packuswb %%mm1, %%mm7 \n\t"
#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)

#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
    "movq "#b", "#q2" \n\t" /* B */\
    "movq "#r", "#t" \n\t" /* R */\
    "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
    "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
    "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
    "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
    "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
    "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
    "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
    "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
    "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
    "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
\
    MOVNTQ( q0, (dst, index, 4))\
    MOVNTQ( b, 8(dst, index, 4))\
    MOVNTQ( q2, 16(dst, index, 4))\
    MOVNTQ( q3, 24(dst, index, 4))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
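
/* WRITEBGR32 interleaves the four byte planes with punpck so that each
 * output dword holds one pixel; the scalar equivalent per pixel is simply
 *   dst[4*i+0]=b; dst[4*i+1]=g; dst[4*i+2]=r; dst[4*i+3]=a;
 * for the eight pixels (32 bytes) produced per loop iteration. */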

#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $3, %%mm3 \n\t"\
    "psllq $3, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2, (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
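
/* WRITERGB16 packs to RGB565: the bF8/bFC masks keep the top 5/6/5 bits of
 * B/G/R, and the shifts place them as in this (hypothetical) per-pixel
 * scalar equivalent:
 *   uint16_t px = ((r & 0xF8) << 8) | ((g & 0xFC) << 3) | (b >> 3);
 * The MMX code handles 8 pixels at a time via the byte interleaves above;
 * %%mm7 is expected to be zero on entry. */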

#define REAL_WRITERGB15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
    "psrlq $1, %%mm5 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $2, %%mm3 \n\t"\
    "psllq $2, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2, (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)

#define WRITEBGR24OLD(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1 \n\t" /* B */\
    "movq %%mm5, %%mm6 \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
    "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
    "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
    "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
    "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
    "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
    "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
    "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
    "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
\
    "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
    "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
    "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
    "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
    "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
    "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
    "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
    "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
    "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
    "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
    "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
    "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
\
    "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
    "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
    "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
    "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
    "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
    "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
    "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
    "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
\
    MOVNTQ(%%mm0, (dst))\
    MOVNTQ(%%mm2, 8(dst))\
    MOVNTQ(%%mm3, 16(dst))\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1 \n\t" /* B */\
    "movq %%mm5, %%mm6 \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
    "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
    "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
\
    "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
    "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
    "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
    "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
    "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
    "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
    "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
    "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
    "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
    "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

#define WRITEBGR24MMX2(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
\
    "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
    "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
    "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
\
    "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
    "por %%mm1, %%mm6 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
    "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
    "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
\
    "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
\
    "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
    "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
\
    "por %%mm1, %%mm3 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

#if COMPILE_TEMPLATE_MMX2
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
#endif
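
/* Both 24-bit writers emit eight pixels (24 bytes) per loop iteration; the
 * MMX2 variant splices the B/G/R bytes together with pshufw plus the
 * ff_M24A/B/C masks, while the plain MMX variant shifts and ORs the
 * 0RGB-dword intermediates. */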

#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb %%mm3, %%mm3 \n\t"\
    "packuswb %%mm4, %%mm4 \n\t"\
    "packuswb %%mm7, %%mm1 \n\t"\
    "punpcklbw %%mm4, %%mm3 \n\t"\
    "movq %%mm1, %%mm7 \n\t"\
    "punpcklbw %%mm3, %%mm1 \n\t"\
    "punpckhbw %%mm3, %%mm7 \n\t"\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)

static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                    const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
{
#if COMPILE_TEMPLATE_MMX
    if(!(c->flags & SWS_BITEXACT)) {
        if (c->flags & SWS_ACCURATE_RND) {
            if (uDest) {
                YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }
            if (CONFIG_SWSCALE_ALPHA && aDest) {
                YSCALEYUV2YV12X_ACCURATE( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
            }

            YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        } else {
            if (uDest) {
                YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }
            if (CONFIG_SWSCALE_ALPHA && aDest) {
                YSCALEYUV2YV12X( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
            }

            YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        }
        return;
    }
#endif
#if COMPILE_TEMPLATE_ALTIVEC
    yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
                          chrFilter, chrSrc, chrFilterSize,
                          dest, uDest, vDest, dstW, chrDstW);
#else //COMPILE_TEMPLATE_ALTIVEC
    yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
                chrFilter, chrSrc, chrFilterSize,
                alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW);
#endif //!COMPILE_TEMPLATE_ALTIVEC
}

static inline void RENAME(yuv2nv12X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                     const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
                                     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, enum PixelFormat dstFormat)
{
    yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
                 chrFilter, chrSrc, chrFilterSize,
                 dest, uDest, dstW, chrDstW, dstFormat);
}

static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
{
    int i;
#if COMPILE_TEMPLATE_MMX
    if(!(c->flags & SWS_BITEXACT)) {
        long p= 4;
        const uint8_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
        uint8_t *dst[4]= {aDest, dest, uDest, vDest};
        x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW};

        if (c->flags & SWS_ACCURATE_RND) {
            while(p--) {
                if (dst[p]) {
                    __asm__ volatile(
                        YSCALEYUV2YV121_ACCURATE
                        :: "r" (src[p]), "r" (dst[p] + counter[p]),
                           "g" (-counter[p])
                        : "%"REG_a
                    );
                }
            }
        } else {
            while(p--) {
                if (dst[p]) {
                    __asm__ volatile(
                        YSCALEYUV2YV121
                        :: "r" (src[p]), "r" (dst[p] + counter[p]),
                           "g" (-counter[p])
                        : "%"REG_a
                    );
                }
            }
        }
        return;
    }
#endif
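    /* C fallback: the 16-bit intermediates keep (x+64)>>7 within a 9-bit
     * range here, so testing bit 8 (val&256) catches both negative results
     * and results above 255 before the explicit clip. */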
    for (i=0; i<dstW; i++) {
        int val= (lumSrc[i]+64)>>7;

        if (val&256) {
            if (val<0) val=0;
            else       val=255;
        }

        dest[i]= val;
    }

    if (uDest)
        for (i=0; i<chrDstW; i++) {
            int u=(chrSrc[i ]+64)>>7;
            int v=(chrSrc[i + VOFW]+64)>>7;

            if ((u|v)&256) {
                if (u<0)        u=0;
                else if (u>255) u=255;
                if (v<0)        v=0;
                else if (v>255) v=255;
            }

            uDest[i]= u;
            vDest[i]= v;
        }

    if (CONFIG_SWSCALE_ALPHA && aDest)
        for (i=0; i<dstW; i++) {
            int val= (alpSrc[i]+64)>>7;
            aDest[i]= av_clip_uint8(val);
        }
}

/**
 * vertical scale YV12 to RGB
 */
static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                       const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
                                       const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
{
#if COMPILE_TEMPLATE_MMX
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    if(!(c->flags & SWS_BITEXACT)) {
        if (c->flags & SWS_ACCURATE_RND) {
            switch(c->dstFormat) {
            case PIX_FMT_RGB32:
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                    YSCALEYUV2PACKEDX_ACCURATE
                    YSCALEYUV2RGBX
                    "movq %%mm2, "U_TEMP"(%0) \n\t"
                    "movq %%mm4, "V_TEMP"(%0) \n\t"
                    "movq %%mm5, "Y_TEMP"(%0) \n\t"
                    YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
                    "movq "Y_TEMP"(%0), %%mm5 \n\t"
                    "psraw $3, %%mm1 \n\t"
                    "psraw $3, %%mm7 \n\t"
                    "packuswb %%mm7, %%mm1 \n\t"
                    WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)

                    YSCALEYUV2PACKEDX_END
                } else {
                    YSCALEYUV2PACKEDX_ACCURATE
                    YSCALEYUV2RGBX
                    "pcmpeqd %%mm7, %%mm7 \n\t"
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)

                    YSCALEYUV2PACKEDX_END
                }
                return;
            case PIX_FMT_BGR24:
                YSCALEYUV2PACKEDX_ACCURATE
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
                "add %4, %%"REG_c" \n\t"
                WRITEBGR24(%%REGc, %5, %%REGa)

                :: "r" (&c->redDither),
                   "m" (dummy), "m" (dummy), "m" (dummy),
                   "r" (dest), "m" (dstW_reg)
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
                );
                return;
            case PIX_FMT_RGB555:
                YSCALEYUV2PACKEDX_ACCURATE
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
                "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
                "paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif

                WRITERGB15(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_RGB565:
                YSCALEYUV2PACKEDX_ACCURATE
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
                "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
                "paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif

                WRITERGB16(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_YUYV422:
                YSCALEYUV2PACKEDX_ACCURATE
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

                "psraw $3, %%mm3 \n\t"
                "psraw $3, %%mm4 \n\t"
                "psraw $3, %%mm1 \n\t"
                "psraw $3, %%mm7 \n\t"
                WRITEYUY2(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            }
        } else {
            switch(c->dstFormat) {
            case PIX_FMT_RGB32:
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                    YSCALEYUV2PACKEDX
                    YSCALEYUV2RGBX
                    YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
                    "psraw $3, %%mm1 \n\t"
                    "psraw $3, %%mm7 \n\t"
                    "packuswb %%mm7, %%mm1 \n\t"
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
                    YSCALEYUV2PACKEDX_END
                } else {
                    YSCALEYUV2PACKEDX
                    YSCALEYUV2RGBX
                    "pcmpeqd %%mm7, %%mm7 \n\t"
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                    YSCALEYUV2PACKEDX_END
                }
                return;
            case PIX_FMT_BGR24:
                YSCALEYUV2PACKEDX
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
                "add %4, %%"REG_c" \n\t"
                WRITEBGR24(%%REGc, %5, %%REGa)

                :: "r" (&c->redDither),
                   "m" (dummy), "m" (dummy), "m" (dummy),
                   "r" (dest), "m" (dstW_reg)
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
                );
                return;
            case PIX_FMT_RGB555:
                YSCALEYUV2PACKEDX
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
                "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
                "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif

                WRITERGB15(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_RGB565:
                YSCALEYUV2PACKEDX
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
                "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
                "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif

                WRITERGB16(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_YUYV422:
                YSCALEYUV2PACKEDX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

                "psraw $3, %%mm3 \n\t"
                "psraw $3, %%mm4 \n\t"
                "psraw $3, %%mm1 \n\t"
                "psraw $3, %%mm7 \n\t"
                WRITEYUY2(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            }
        }
    }
#endif /* COMPILE_TEMPLATE_MMX */
#if COMPILE_TEMPLATE_ALTIVEC
    /* The following list of supported dstFormat values should
       match what's found in the body of ff_yuv2packedX_altivec() */
    if (!(c->flags & SWS_BITEXACT) && !c->alpPixBuf &&
        (c->dstFormat==PIX_FMT_ABGR  || c->dstFormat==PIX_FMT_BGRA  ||
         c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
         c->dstFormat==PIX_FMT_RGBA  || c->dstFormat==PIX_FMT_ARGB))
        ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize,
                               chrFilter, chrSrc, chrFilterSize,
                               dest, dstW, dstY);
    else
#endif
        yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
                       chrFilter, chrSrc, chrFilterSize,
                       alpSrc, dest, dstW, dstY);
}

/**
 * vertical bilinear scale YV12 to RGB
 */
static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
                                       const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
{
    int yalpha1=4095- yalpha;
    int uvalpha1=4095-uvalpha;
    int i;
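
    /* yalpha/uvalpha are 12-bit blend weights between the two source lines;
     * each output sample is effectively
     *   (buf0[i]*yalpha1 + buf1[i]*yalpha) >> 19
     * i.e. a vertical linear interpolation of the 16-bit intermediates. */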

#if COMPILE_TEMPLATE_MMX
    if(!(c->flags & SWS_BITEXACT)) {
        switch(c->dstFormat) {
        //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
        case PIX_FMT_RGB32:
            if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
#if ARCH_X86_64
                __asm__ volatile(
                    YSCALEYUV2RGB(%%r8, %5)
                    YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
                    "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
                    "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
                    "packuswb %%mm7, %%mm1 \n\t"
                    WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "r" (dest),
                       "a" (&c->redDither)
                       ,"r" (abuf0), "r" (abuf1)
                    : "%r8"
                );
#else
                *(const uint16_t **)(&c->u_temp)=abuf0;
                *(const uint16_t **)(&c->v_temp)=abuf1;
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB(%%REGBP, %5)
                    "push %0 \n\t"
                    "push %1 \n\t"
                    "mov "U_TEMP"(%5), %0 \n\t"
                    "mov "V_TEMP"(%5), %1 \n\t"
                    YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
                    "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
                    "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
                    "packuswb %%mm7, %%mm1 \n\t"
                    "pop %1 \n\t"
                    "pop %0 \n\t"
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
#endif
            } else {
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB(%%REGBP, %5)
                    "pcmpeqd %%mm7, %%mm7 \n\t"
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
            }
            return;
        case PIX_FMT_BGR24:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB(%%REGBP, %5)
                "pxor %%mm7, %%mm7 \n\t"
                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
            return;
        case PIX_FMT_RGB555:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB(%%REGBP, %5)
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif

                WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
            return;
        case PIX_FMT_RGB565:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB(%%REGBP, %5)
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif

                WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
            return;
        case PIX_FMT_YUYV422:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2PACKED(%%REGBP, %5)
                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                   "a" (&c->redDither)
            );
            return;
        default: break;
        }
    }
#endif //COMPILE_TEMPLATE_MMX
    YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
}

/**
 * YV12 to RGB without scaling or interpolating
 */
static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
                                       const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, enum PixelFormat dstFormat, int flags, int y)
{
    const int yalpha1=0;
    int i;

    const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
    const int yalpha= 4096; //FIXME ...

    if (flags&SWS_FULL_CHR_H_INT) {
        c->yuv2packed2(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
        return;
    }

#if COMPILE_TEMPLATE_MMX
    if(!(flags & SWS_BITEXACT)) {
        if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
            switch(dstFormat) {
            case PIX_FMT_RGB32:
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                    __asm__ volatile(
                        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                        "mov %4, %%"REG_b" \n\t"
                        "push %%"REG_BP" \n\t"
                        YSCALEYUV2RGB1(%%REGBP, %5)
                        YSCALEYUV2RGB1_ALPHA(%%REGBP)
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                        "pop %%"REG_BP" \n\t"
                        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                        :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                           "a" (&c->redDither)
                    );
                } else {
                    __asm__ volatile(
                        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                        "mov %4, %%"REG_b" \n\t"
                        "push %%"REG_BP" \n\t"
                        YSCALEYUV2RGB1(%%REGBP, %5)
                        "pcmpeqd %%mm7, %%mm7 \n\t"
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                        "pop %%"REG_BP" \n\t"
                        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                           "a" (&c->redDither)
                    );
                }
                return;
            case PIX_FMT_BGR24:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1(%%REGBP, %5)
                    "pxor %%mm7, %%mm7 \n\t"
                    WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB555:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1(%%REGBP, %5)
                    "pxor %%mm7, %%mm7 \n\t"
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
                    WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB565:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1(%%REGBP, %5)
                    "pxor %%mm7, %%mm7 \n\t"
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif

                    WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            case PIX_FMT_YUYV422:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2PACKED1(%%REGBP, %5)
                    WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            }
        } else {
            switch(dstFormat) {
            case PIX_FMT_RGB32:
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                    __asm__ volatile(
                        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                        "mov %4, %%"REG_b" \n\t"
                        "push %%"REG_BP" \n\t"
                        YSCALEYUV2RGB1b(%%REGBP, %5)
                        YSCALEYUV2RGB1_ALPHA(%%REGBP)
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                        "pop %%"REG_BP" \n\t"
                        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                        :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                           "a" (&c->redDither)
                    );
                } else {
                    __asm__ volatile(
                        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                        "mov %4, %%"REG_b" \n\t"
                        "push %%"REG_BP" \n\t"
                        YSCALEYUV2RGB1b(%%REGBP, %5)
                        "pcmpeqd %%mm7, %%mm7 \n\t"
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                        "pop %%"REG_BP" \n\t"
                        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                           "a" (&c->redDither)
                    );
                }
                return;
            case PIX_FMT_BGR24:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1b(%%REGBP, %5)
                    "pxor %%mm7, %%mm7 \n\t"
                    WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB555:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1b(%%REGBP, %5)
                    "pxor %%mm7, %%mm7 \n\t"
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
                    WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB565:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1b(%%REGBP, %5)
                    "pxor %%mm7, %%mm7 \n\t"
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif

                    WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            case PIX_FMT_YUYV422:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2PACKED1b(%%REGBP, %5)
                    WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                       "a" (&c->redDither)
                );
                return;
            }
        }
    }
#endif /* COMPILE_TEMPLATE_MMX */
    if (uvalpha < 2048) {
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
    } else {
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
    }
}

//FIXME yuy2* can read up to 7 samples too much

static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm2 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "pand %%mm2, %%mm0 \n\t"
        "pand %%mm2, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
        dst[i]= src[2*i];
#endif
}
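
/* Loop idiom used by these input converters: REG_a counts up from -width
 * toward 0 while the pointers are biased past the end of each buffer
 * (src+width*2, dst+width), so "add $8 ... js 1b" serves as both the
 * increment and the loop-exit test. */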
1622 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1624 #if COMPILE_TEMPLATE_MMX
1625 __asm__ volatile(
1626 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1627 "mov %0, %%"REG_a" \n\t"
1628 "1: \n\t"
1629 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1630 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1631 "psrlw $8, %%mm0 \n\t"
1632 "psrlw $8, %%mm1 \n\t"
1633 "packuswb %%mm1, %%mm0 \n\t"
1634 "movq %%mm0, %%mm1 \n\t"
1635 "psrlw $8, %%mm0 \n\t"
1636 "pand %%mm4, %%mm1 \n\t"
1637 "packuswb %%mm0, %%mm0 \n\t"
1638 "packuswb %%mm1, %%mm1 \n\t"
1639 "movd %%mm0, (%3, %%"REG_a") \n\t"
1640 "movd %%mm1, (%2, %%"REG_a") \n\t"
1641 "add $4, %%"REG_a" \n\t"
1642 " js 1b \n\t"
1643 : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1644 : "%"REG_a
1646 #else
1647 int i;
1648 for (i=0; i<width; i++) {
1649 dstU[i]= src1[4*i + 1];
1650 dstV[i]= src1[4*i + 3];
1651 }
1652 #endif
1653 assert(src1 == src2);
1654 }
1656 static inline void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1657 {
1658 #if COMPILE_TEMPLATE_MMX
1659 __asm__ volatile(
1660 "mov %0, %%"REG_a" \n\t"
1661 "1: \n\t"
1662 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1663 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1664 "movq (%2, %%"REG_a",2), %%mm2 \n\t"
1665 "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
1666 "psrlw $8, %%mm0 \n\t"
1667 "psrlw $8, %%mm1 \n\t"
1668 "psrlw $8, %%mm2 \n\t"
1669 "psrlw $8, %%mm3 \n\t"
1670 "packuswb %%mm1, %%mm0 \n\t"
1671 "packuswb %%mm3, %%mm2 \n\t"
1672 "movq %%mm0, (%3, %%"REG_a") \n\t"
1673 "movq %%mm2, (%4, %%"REG_a") \n\t"
1674 "add $8, %%"REG_a" \n\t"
1675 " js 1b \n\t"
1676 : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
1677 : "%"REG_a
1679 #else
1680 int i;
1681 for (i=0; i<width; i++) {
1682 dstU[i]= src1[2*i + 1];
1683 dstV[i]= src2[2*i + 1];
1684 }
1685 #endif
1686 }
1688 /* This is almost identical to the previous, and exists only because
1689 * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
1690 static inline void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1691 {
1692 #if COMPILE_TEMPLATE_MMX
1693 __asm__ volatile(
1694 "mov %0, %%"REG_a" \n\t"
1695 "1: \n\t"
1696 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1697 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1698 "psrlw $8, %%mm0 \n\t"
1699 "psrlw $8, %%mm1 \n\t"
1700 "packuswb %%mm1, %%mm0 \n\t"
1701 "movq %%mm0, (%2, %%"REG_a") \n\t"
1702 "add $8, %%"REG_a" \n\t"
1703 " js 1b \n\t"
1704 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
1705 : "%"REG_a
1707 #else
1708 int i;
1709 for (i=0; i<width; i++)
1710 dst[i]= src[2*i+1];
1711 #endif
1712 }
1714 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1715 {
1716 #if COMPILE_TEMPLATE_MMX
1717 __asm__ volatile(
1718 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1719 "mov %0, %%"REG_a" \n\t"
1720 "1: \n\t"
1721 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1722 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1723 "pand %%mm4, %%mm0 \n\t"
1724 "pand %%mm4, %%mm1 \n\t"
1725 "packuswb %%mm1, %%mm0 \n\t"
1726 "movq %%mm0, %%mm1 \n\t"
1727 "psrlw $8, %%mm0 \n\t"
1728 "pand %%mm4, %%mm1 \n\t"
1729 "packuswb %%mm0, %%mm0 \n\t"
1730 "packuswb %%mm1, %%mm1 \n\t"
1731 "movd %%mm0, (%3, %%"REG_a") \n\t"
1732 "movd %%mm1, (%2, %%"REG_a") \n\t"
1733 "add $4, %%"REG_a" \n\t"
1734 " js 1b \n\t"
1735 : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1736 : "%"REG_a
1738 #else
1739 int i;
1740 for (i=0; i<width; i++) {
1741 dstU[i]= src1[4*i + 0];
1742 dstV[i]= src1[4*i + 2];
1743 }
1744 #endif
1745 assert(src1 == src2);
1746 }
1748 static inline void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1749 {
1750 #if COMPILE_TEMPLATE_MMX
1751 __asm__ volatile(
1752 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1753 "mov %0, %%"REG_a" \n\t"
1754 "1: \n\t"
1755 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1756 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1757 "movq (%2, %%"REG_a",2), %%mm2 \n\t"
1758 "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
1759 "pand %%mm4, %%mm0 \n\t"
1760 "pand %%mm4, %%mm1 \n\t"
1761 "pand %%mm4, %%mm2 \n\t"
1762 "pand %%mm4, %%mm3 \n\t"
1763 "packuswb %%mm1, %%mm0 \n\t"
1764 "packuswb %%mm3, %%mm2 \n\t"
1765 "movq %%mm0, (%3, %%"REG_a") \n\t"
1766 "movq %%mm2, (%4, %%"REG_a") \n\t"
1767 "add $8, %%"REG_a" \n\t"
1768 " js 1b \n\t"
1769 : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
1770 : "%"REG_a
1772 #else
1773 int i;
1774 for (i=0; i<width; i++) {
1775 dstU[i]= src1[2*i];
1776 dstV[i]= src2[2*i];
1777 }
1778 #endif
1779 }
1781 static inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
1782 const uint8_t *src, long width)
1783 {
1784 #if COMPILE_TEMPLATE_MMX
1785 __asm__ volatile(
1786 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1787 "mov %0, %%"REG_a" \n\t"
1788 "1: \n\t"
1789 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1790 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1791 "movq %%mm0, %%mm2 \n\t"
1792 "movq %%mm1, %%mm3 \n\t"
1793 "pand %%mm4, %%mm0 \n\t"
1794 "pand %%mm4, %%mm1 \n\t"
1795 "psrlw $8, %%mm2 \n\t"
1796 "psrlw $8, %%mm3 \n\t"
1797 "packuswb %%mm1, %%mm0 \n\t"
1798 "packuswb %%mm3, %%mm2 \n\t"
1799 "movq %%mm0, (%2, %%"REG_a") \n\t"
1800 "movq %%mm2, (%3, %%"REG_a") \n\t"
1801 "add $8, %%"REG_a" \n\t"
1802 " js 1b \n\t"
1803 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width)
1804 : "%"REG_a
1806 #else
1807 int i;
1808 for (i = 0; i < width; i++) {
1809 dst1[i] = src[2*i+0];
1810 dst2[i] = src[2*i+1];
1811 }
1812 #endif
1813 }
1815 static inline void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
1816 const uint8_t *src1, const uint8_t *src2,
1817 long width, uint32_t *unused)
1818 {
1819 RENAME(nvXXtoUV)(dstU, dstV, src1, width);
1820 }
1822 static inline void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
1823 const uint8_t *src1, const uint8_t *src2,
1824 long width, uint32_t *unused)
1825 {
1826 RENAME(nvXXtoUV)(dstV, dstU, src1, width);
1827 }
1829 #if COMPILE_TEMPLATE_MMX
1830 static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src, long width, enum PixelFormat srcFormat)
1831 {
1833 if(srcFormat == PIX_FMT_BGR24) {
1834 __asm__ volatile(
1835 "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
1836 "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
1839 } else {
1840 __asm__ volatile(
1841 "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
1842 "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
1847 __asm__ volatile(
1848 "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
1849 "mov %2, %%"REG_a" \n\t"
1850 "pxor %%mm7, %%mm7 \n\t"
1851 "1: \n\t"
1852 PREFETCH" 64(%0) \n\t"
1853 "movd (%0), %%mm0 \n\t"
1854 "movd 2(%0), %%mm1 \n\t"
1855 "movd 6(%0), %%mm2 \n\t"
1856 "movd 8(%0), %%mm3 \n\t"
1857 "add $12, %0 \n\t"
1858 "punpcklbw %%mm7, %%mm0 \n\t"
1859 "punpcklbw %%mm7, %%mm1 \n\t"
1860 "punpcklbw %%mm7, %%mm2 \n\t"
1861 "punpcklbw %%mm7, %%mm3 \n\t"
1862 "pmaddwd %%mm5, %%mm0 \n\t"
1863 "pmaddwd %%mm6, %%mm1 \n\t"
1864 "pmaddwd %%mm5, %%mm2 \n\t"
1865 "pmaddwd %%mm6, %%mm3 \n\t"
1866 "paddd %%mm1, %%mm0 \n\t"
1867 "paddd %%mm3, %%mm2 \n\t"
1868 "paddd %%mm4, %%mm0 \n\t"
1869 "paddd %%mm4, %%mm2 \n\t"
1870 "psrad $15, %%mm0 \n\t"
1871 "psrad $15, %%mm2 \n\t"
1872 "packssdw %%mm2, %%mm0 \n\t"
1873 "packuswb %%mm0, %%mm0 \n\t"
1874 "movd %%mm0, (%1, %%"REG_a") \n\t"
1875 "add $4, %%"REG_a" \n\t"
1876 " js 1b \n\t"
1877 : "+r" (src)
1878 : "r" (dst+width), "g" ((x86_reg)-width)
1879 : "%"REG_a
1883 static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, long width, enum PixelFormat srcFormat)
1884 {
1885 __asm__ volatile(
1886 "movq 24+%4, %%mm6 \n\t"
1887 "mov %3, %%"REG_a" \n\t"
1888 "pxor %%mm7, %%mm7 \n\t"
1889 "1: \n\t"
1890 PREFETCH" 64(%0) \n\t"
1891 "movd (%0), %%mm0 \n\t"
1892 "movd 2(%0), %%mm1 \n\t"
1893 "punpcklbw %%mm7, %%mm0 \n\t"
1894 "punpcklbw %%mm7, %%mm1 \n\t"
1895 "movq %%mm0, %%mm2 \n\t"
1896 "movq %%mm1, %%mm3 \n\t"
1897 "pmaddwd %4, %%mm0 \n\t"
1898 "pmaddwd 8+%4, %%mm1 \n\t"
1899 "pmaddwd 16+%4, %%mm2 \n\t"
1900 "pmaddwd %%mm6, %%mm3 \n\t"
1901 "paddd %%mm1, %%mm0 \n\t"
1902 "paddd %%mm3, %%mm2 \n\t"
1904 "movd 6(%0), %%mm1 \n\t"
1905 "movd 8(%0), %%mm3 \n\t"
1906 "add $12, %0 \n\t"
1907 "punpcklbw %%mm7, %%mm1 \n\t"
1908 "punpcklbw %%mm7, %%mm3 \n\t"
1909 "movq %%mm1, %%mm4 \n\t"
1910 "movq %%mm3, %%mm5 \n\t"
1911 "pmaddwd %4, %%mm1 \n\t"
1912 "pmaddwd 8+%4, %%mm3 \n\t"
1913 "pmaddwd 16+%4, %%mm4 \n\t"
1914 "pmaddwd %%mm6, %%mm5 \n\t"
1915 "paddd %%mm3, %%mm1 \n\t"
1916 "paddd %%mm5, %%mm4 \n\t"
1918 "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
1919 "paddd %%mm3, %%mm0 \n\t"
1920 "paddd %%mm3, %%mm2 \n\t"
1921 "paddd %%mm3, %%mm1 \n\t"
1922 "paddd %%mm3, %%mm4 \n\t"
1923 "psrad $15, %%mm0 \n\t"
1924 "psrad $15, %%mm2 \n\t"
1925 "psrad $15, %%mm1 \n\t"
1926 "psrad $15, %%mm4 \n\t"
1927 "packssdw %%mm1, %%mm0 \n\t"
1928 "packssdw %%mm4, %%mm2 \n\t"
1929 "packuswb %%mm0, %%mm0 \n\t"
1930 "packuswb %%mm2, %%mm2 \n\t"
1931 "movd %%mm0, (%1, %%"REG_a") \n\t"
1932 "movd %%mm2, (%2, %%"REG_a") \n\t"
1933 "add $4, %%"REG_a" \n\t"
1934 " js 1b \n\t"
1935 : "+r" (src)
1936 : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
1937 : "%"REG_a
1940 #endif
1942 static inline void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1943 {
1944 #if COMPILE_TEMPLATE_MMX
1945 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
1946 #else
1947 int i;
1948 for (i=0; i<width; i++) {
1949 int b= src[i*3+0];
1950 int g= src[i*3+1];
1951 int r= src[i*3+2];
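/* bias: 33<<(RGB2YUV_SHIFT-1) = 16.5<<RGB2YUV_SHIFT, i.e. the +16
 * limited-range luma offset with a +0.5 rounding term folded in
 * (the chroma paths use 257<<(RGB2YUV_SHIFT-1) = 128.5<<RGB2YUV_SHIFT). */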
1953 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1954 }
1955 #endif /* COMPILE_TEMPLATE_MMX */
1956 }
1958 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1959 {
1960 #if COMPILE_TEMPLATE_MMX
1961 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
1962 #else
1963 int i;
1964 for (i=0; i<width; i++) {
1965 int b= src1[3*i + 0];
1966 int g= src1[3*i + 1];
1967 int r= src1[3*i + 2];
1969 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1970 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1971 }
1972 #endif /* COMPILE_TEMPLATE_MMX */
1973 assert(src1 == src2);
1974 }
1976 static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1977 {
1978 int i;
1979 for (i=0; i<width; i++) {
1980 int b= src1[6*i + 0] + src1[6*i + 3];
1981 int g= src1[6*i + 1] + src1[6*i + 4];
1982 int r= src1[6*i + 2] + src1[6*i + 5];
1984 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1985 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1986 }
1987 assert(src1 == src2);
1988 }
1990 static inline void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1991 {
1992 #if COMPILE_TEMPLATE_MMX
1993 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
1994 #else
1995 int i;
1996 for (i=0; i<width; i++) {
1997 int r= src[i*3+0];
1998 int g= src[i*3+1];
1999 int b= src[i*3+2];
2001 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2002 }
2003 #endif
2004 }
2006 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
2007 {
2008 #if COMPILE_TEMPLATE_MMX
2009 assert(src1==src2);
2010 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
2011 #else
2012 int i;
2013 assert(src1==src2);
2014 for (i=0; i<width; i++) {
2015 int r= src1[3*i + 0];
2016 int g= src1[3*i + 1];
2017 int b= src1[3*i + 2];
2019 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2020 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2021 }
2022 #endif
2023 }
2025 static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
2026 {
2027 int i;
2028 assert(src1==src2);
2029 for (i=0; i<width; i++) {
2030 int r= src1[6*i + 0] + src1[6*i + 3];
2031 int g= src1[6*i + 1] + src1[6*i + 4];
2032 int b= src1[6*i + 2] + src1[6*i + 5];
2034 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2035 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2036 }
2037 }
2040 // bilinear / bicubic scaling
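// Each output sample is a small convolution over the source line:
//     dst[i] = FFMIN((sum_j src[filterPos[i]+j] * filter[i*filterSize+j]) >> 7, (1<<15)-1)
// as the C reference at the bottom of this function spells out; the MMX paths
// below are unrolled specializations of it for filterSize 4, 8 and 4*n.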
2041 static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc,
2042 const int16_t *filter, const int16_t *filterPos, long filterSize)
2043 {
2044 #if COMPILE_TEMPLATE_MMX
2045 assert(filterSize % 4 == 0 && filterSize>0);
2046 if (filterSize==4) { // Always true for upscaling, sometimes for down, too.
2047 x86_reg counter= -2*dstW;
2048 filter-= counter*2;
2049 filterPos-= counter/2;
2050 dst-= counter/2;
2051 __asm__ volatile(
2052 #if defined(PIC)
2053 "push %%"REG_b" \n\t"
2054 #endif
2055 "pxor %%mm7, %%mm7 \n\t"
2056 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2057 "mov %%"REG_a", %%"REG_BP" \n\t"
2058 ASMALIGN(4)
2059 "1: \n\t"
2060 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2061 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2062 "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
2063 "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
2064 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2065 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2066 "punpcklbw %%mm7, %%mm0 \n\t"
2067 "punpcklbw %%mm7, %%mm2 \n\t"
2068 "pmaddwd %%mm1, %%mm0 \n\t"
2069 "pmaddwd %%mm2, %%mm3 \n\t"
2070 "movq %%mm0, %%mm4 \n\t"
2071 "punpckldq %%mm3, %%mm0 \n\t"
2072 "punpckhdq %%mm3, %%mm4 \n\t"
2073 "paddd %%mm4, %%mm0 \n\t"
2074 "psrad $7, %%mm0 \n\t"
2075 "packssdw %%mm0, %%mm0 \n\t"
2076 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2077 "add $4, %%"REG_BP" \n\t"
2078 " jnc 1b \n\t"
2080 "pop %%"REG_BP" \n\t"
2081 #if defined(PIC)
2082 "pop %%"REG_b" \n\t"
2083 #endif
2084 : "+a" (counter)
2085 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2086 #if !defined(PIC)
2087 : "%"REG_b
2088 #endif
2089 );
2090 } else if (filterSize==8) {
2091 x86_reg counter= -2*dstW;
2092 filter-= counter*4;
2093 filterPos-= counter/2;
2094 dst-= counter/2;
2095 __asm__ volatile(
2096 #if defined(PIC)
2097 "push %%"REG_b" \n\t"
2098 #endif
2099 "pxor %%mm7, %%mm7 \n\t"
2100 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2101 "mov %%"REG_a", %%"REG_BP" \n\t"
2102 ASMALIGN(4)
2103 "1: \n\t"
2104 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2105 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2106 "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
2107 "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
2108 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2109 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2110 "punpcklbw %%mm7, %%mm0 \n\t"
2111 "punpcklbw %%mm7, %%mm2 \n\t"
2112 "pmaddwd %%mm1, %%mm0 \n\t"
2113 "pmaddwd %%mm2, %%mm3 \n\t"
2115 "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
2116 "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
2117 "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
2118 "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
2119 "punpcklbw %%mm7, %%mm4 \n\t"
2120 "punpcklbw %%mm7, %%mm2 \n\t"
2121 "pmaddwd %%mm1, %%mm4 \n\t"
2122 "pmaddwd %%mm2, %%mm5 \n\t"
2123 "paddd %%mm4, %%mm0 \n\t"
2124 "paddd %%mm5, %%mm3 \n\t"
2125 "movq %%mm0, %%mm4 \n\t"
2126 "punpckldq %%mm3, %%mm0 \n\t"
2127 "punpckhdq %%mm3, %%mm4 \n\t"
2128 "paddd %%mm4, %%mm0 \n\t"
2129 "psrad $7, %%mm0 \n\t"
2130 "packssdw %%mm0, %%mm0 \n\t"
2131 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2132 "add $4, %%"REG_BP" \n\t"
2133 " jnc 1b \n\t"
2135 "pop %%"REG_BP" \n\t"
2136 #if defined(PIC)
2137 "pop %%"REG_b" \n\t"
2138 #endif
2139 : "+a" (counter)
2140 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2141 #if !defined(PIC)
2142 : "%"REG_b
2143 #endif
2144 );
2145 } else {
2146 const uint8_t *offset = src+filterSize;
2147 x86_reg counter= -2*dstW;
2148 //filter-= counter*filterSize/2;
2149 filterPos-= counter/2;
2150 dst-= counter/2;
2151 __asm__ volatile(
2152 "pxor %%mm7, %%mm7 \n\t"
2153 ASMALIGN(4)
2154 "1: \n\t"
2155 "mov %2, %%"REG_c" \n\t"
2156 "movzwl (%%"REG_c", %0), %%eax \n\t"
2157 "movzwl 2(%%"REG_c", %0), %%edx \n\t"
2158 "mov %5, %%"REG_c" \n\t"
2159 "pxor %%mm4, %%mm4 \n\t"
2160 "pxor %%mm5, %%mm5 \n\t"
2161 "2: \n\t"
2162 "movq (%1), %%mm1 \n\t"
2163 "movq (%1, %6), %%mm3 \n\t"
2164 "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
2165 "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
2166 "punpcklbw %%mm7, %%mm0 \n\t"
2167 "punpcklbw %%mm7, %%mm2 \n\t"
2168 "pmaddwd %%mm1, %%mm0 \n\t"
2169 "pmaddwd %%mm2, %%mm3 \n\t"
2170 "paddd %%mm3, %%mm5 \n\t"
2171 "paddd %%mm0, %%mm4 \n\t"
2172 "add $8, %1 \n\t"
2173 "add $4, %%"REG_c" \n\t"
2174 "cmp %4, %%"REG_c" \n\t"
2175 " jb 2b \n\t"
2176 "add %6, %1 \n\t"
2177 "movq %%mm4, %%mm0 \n\t"
2178 "punpckldq %%mm5, %%mm4 \n\t"
2179 "punpckhdq %%mm5, %%mm0 \n\t"
2180 "paddd %%mm0, %%mm4 \n\t"
2181 "psrad $7, %%mm4 \n\t"
2182 "packssdw %%mm4, %%mm4 \n\t"
2183 "mov %3, %%"REG_a" \n\t"
2184 "movd %%mm4, (%%"REG_a", %0) \n\t"
2185 "add $4, %0 \n\t"
2186 " jnc 1b \n\t"
2188 : "+r" (counter), "+r" (filter)
2189 : "m" (filterPos), "m" (dst), "m"(offset),
2190 "m" (src), "r" ((x86_reg)filterSize*2)
2191 : "%"REG_a, "%"REG_c, "%"REG_d
2194 #else
2195 #if COMPILE_TEMPLATE_ALTIVEC
2196 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
2197 #else
2198 int i;
2199 for (i=0; i<dstW; i++) {
2200 int j;
2201 int srcPos= filterPos[i];
2202 int val=0;
2203 //printf("filterPos: %d\n", filterPos[i]);
2204 for (j=0; j<filterSize; j++) {
2205 //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2206 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2207 }
2208 //filter += hFilterSize;
2209 dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
2210 //dst[i] = val>>7;
2211 }
2212 #endif /* COMPILE_TEMPLATE_ALTIVEC */
2213 #endif /* COMPILE_TEMPLATE_MMX */
2214 }
2216 //FIXME all pal and rgb srcFormats could do this conversion as well
2217 //FIXME all scalers more complex than bilinear could do half of this transform
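/* The constants below are fixed-point forms of the usual limited<->full
 * range remapping, applied to 15-bit (sample<<7) values: 19077/2^14 ~= 255/219
 * (luma) and 4663/2^12 ~= 255/224 (chroma), with the offsets and rounding
 * folded into one constant. Spot check for lumRangeToJpeg: y = 16<<7 = 2048
 * gives (2048*19077 - 39057361)>>14 = 0, and y = 235<<7 = 30080 gives
 * (30080*19077 - 39057361)>>14 = 32640 = 255<<7. The FFMIN clamps keep the
 * result within the int16_t pipeline (<= 2^15-1). */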
2218 static void RENAME(chrRangeToJpeg)(uint16_t *dst, int width)
2219 {
2220 int i;
2221 for (i = 0; i < width; i++) {
2222 dst[i ] = (FFMIN(dst[i ],30775)*4663 - 9289992)>>12; //-264
2223 dst[i+VOFW] = (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
2224 }
2225 }
2226 static void RENAME(chrRangeFromJpeg)(uint16_t *dst, int width)
2227 {
2228 int i;
2229 for (i = 0; i < width; i++) {
2230 dst[i ] = (dst[i ]*1799 + 4081085)>>11; //1469
2231 dst[i+VOFW] = (dst[i+VOFW]*1799 + 4081085)>>11; //1469
2232 }
2233 }
2234 static void RENAME(lumRangeToJpeg)(uint16_t *dst, int width)
2235 {
2236 int i;
2237 for (i = 0; i < width; i++)
2238 dst[i] = (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
2239 }
2240 static void RENAME(lumRangeFromJpeg)(uint16_t *dst, int width)
2241 {
2242 int i;
2243 for (i = 0; i < width; i++)
2244 dst[i] = (dst[i]*14071 + 33561947)>>14;
2245 }
2247 #define FAST_BILINEAR_X86 \
2248 "subl %%edi, %%esi \n\t" /* src[xx+1] - src[xx] */ \
2249 "imull %%ecx, %%esi \n\t" /* (src[xx+1] - src[xx])*xalpha */ \
2250 "shll $16, %%edi \n\t" \
2251 "addl %%edi, %%esi \n\t" /* src[xx+1]*xalpha + src[xx]*(1-xalpha) */ \
2252 "mov %1, %%"REG_D"\n\t" \
2253 "shrl $9, %%esi \n\t" \
2255 static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
2256 long dstWidth, const uint8_t *src, int srcW,
2257 int xInc)
2258 {
2259 #if ARCH_X86
2260 #if COMPILE_TEMPLATE_MMX2
2261 int32_t *filterPos = c->hLumFilterPos;
2262 int16_t *filter = c->hLumFilter;
2263 int canMMX2BeUsed = c->canMMX2BeUsed;
2264 void *mmx2FilterCode= c->lumMmx2FilterCode;
2265 int i;
2266 #if defined(PIC)
2267 DECLARE_ALIGNED(8, uint64_t, ebxsave);
2268 #endif
2269 if (canMMX2BeUsed) {
2270 __asm__ volatile(
2271 #if defined(PIC)
2272 "mov %%"REG_b", %5 \n\t"
2273 #endif
2274 "pxor %%mm7, %%mm7 \n\t"
2275 "mov %0, %%"REG_c" \n\t"
2276 "mov %1, %%"REG_D" \n\t"
2277 "mov %2, %%"REG_d" \n\t"
2278 "mov %3, %%"REG_b" \n\t"
2279 "xor %%"REG_a", %%"REG_a" \n\t" // i
2280 PREFETCH" (%%"REG_c") \n\t"
2281 PREFETCH" 32(%%"REG_c") \n\t"
2282 PREFETCH" 64(%%"REG_c") \n\t"
2284 #if ARCH_X86_64
2286 #define CALL_MMX2_FILTER_CODE \
2287 "movl (%%"REG_b"), %%esi \n\t"\
2288 "call *%4 \n\t"\
2289 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2290 "add %%"REG_S", %%"REG_c" \n\t"\
2291 "add %%"REG_a", %%"REG_D" \n\t"\
2292 "xor %%"REG_a", %%"REG_a" \n\t"\
2294 #else
2296 #define CALL_MMX2_FILTER_CODE \
2297 "movl (%%"REG_b"), %%esi \n\t"\
2298 "call *%4 \n\t"\
2299 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2300 "add %%"REG_a", %%"REG_D" \n\t"\
2301 "xor %%"REG_a", %%"REG_a" \n\t"\
2303 #endif /* ARCH_X86_64 */
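/* CALL_MMX2_FILTER_CODE calls the scaler code generated at init time and
 * passed in via %4 (c->lumMmx2FilterCode / c->chrMmx2FilterCode here); each
 * call filters one block of output, then the wrapper reloads the next source
 * offset from the filterPos table in %%REG_b and advances the source pointer
 * in %%REG_c before the next call. */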
2305 CALL_MMX2_FILTER_CODE
2306 CALL_MMX2_FILTER_CODE
2307 CALL_MMX2_FILTER_CODE
2308 CALL_MMX2_FILTER_CODE
2309 CALL_MMX2_FILTER_CODE
2310 CALL_MMX2_FILTER_CODE
2311 CALL_MMX2_FILTER_CODE
2312 CALL_MMX2_FILTER_CODE
2314 #if defined(PIC)
2315 "mov %5, %%"REG_b" \n\t"
2316 #endif
2317 :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
2318 "m" (mmx2FilterCode)
2319 #if defined(PIC)
2320 ,"m" (ebxsave)
2321 #endif
2322 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2323 #if !defined(PIC)
2324 ,"%"REG_b
2325 #endif
2326 );
2327 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
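/* The loop above patches the right-hand edge: output positions whose source
 * index would land past srcW-1 (the generated code does not guard for this)
 * are filled with the last source pixel, scaled by 1<<7 (*128). */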
2328 } else {
2329 #endif /* COMPILE_TEMPLATE_MMX2 */
2330 x86_reg xInc_shr16 = xInc >> 16;
2331 uint16_t xInc_mask = xInc & 0xffff;
2332 x86_reg dstWidth_reg = dstWidth;
2333 //NO MMX just normal asm ...
2334 __asm__ volatile(
2335 "xor %%"REG_a", %%"REG_a" \n\t" // i
2336 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2337 "xorl %%ecx, %%ecx \n\t" // xalpha
2338 ASMALIGN(4)
2339 "1: \n\t"
2340 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2341 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2342 FAST_BILINEAR_X86
2343 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2344 "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
2345 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
2347 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2348 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2349 FAST_BILINEAR_X86
2350 "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t"
2351 "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
2352 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
2355 "add $2, %%"REG_a" \n\t"
2356 "cmp %2, %%"REG_a" \n\t"
2357 " jb 1b \n\t"
2360 :: "r" (src), "m" (dst), "m" (dstWidth_reg), "m" (xInc_shr16), "m" (xInc_mask)
2361 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2363 #if COMPILE_TEMPLATE_MMX2
2364 } //if MMX2 can't be used
2365 #endif
2366 #else
2367 int i;
2368 unsigned int xpos=0;
2369 for (i=0;i<dstWidth;i++) {
2370 register unsigned int xx=xpos>>16;
2371 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2372 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2373 xpos+=xInc;
2374 }
2375 #endif /* ARCH_X86 */
2376 }
2378 // *** horizontal scale Y line to temp buffer
2379 static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src, int srcW, int xInc,
2380 const int16_t *hLumFilter,
2381 const int16_t *hLumFilterPos, int hLumFilterSize,
2382 uint8_t *formatConvBuffer,
2383 uint32_t *pal, int isAlpha)
2384 {
2385 void (*toYV12)(uint8_t *, const uint8_t *, long, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
2386 void (*convertRange)(uint16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
2388 src += isAlpha ? c->alpSrcOffset : c->lumSrcOffset;
2390 if (toYV12) {
2391 toYV12(formatConvBuffer, src, srcW, pal);
2392 src= formatConvBuffer;
2393 }
2395 if (!c->hyscale_fast) {
2396 c->hScale(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2397 } else { // fast bilinear upscale / crap downscale
2398 c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
2399 }
2401 if (convertRange)
2402 convertRange(dst, dstWidth);
2403 }
2405 static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
2406 long dstWidth, const uint8_t *src1,
2407 const uint8_t *src2, int srcW, int xInc)
2408 {
2409 #if ARCH_X86
2410 #if COMPILE_TEMPLATE_MMX2
2411 int32_t *filterPos = c->hChrFilterPos;
2412 int16_t *filter = c->hChrFilter;
2413 int canMMX2BeUsed = c->canMMX2BeUsed;
2414 void *mmx2FilterCode= c->chrMmx2FilterCode;
2415 int i;
2416 #if defined(PIC)
2417 DECLARE_ALIGNED(8, uint64_t, ebxsave);
2418 #endif
2419 if (canMMX2BeUsed) {
2420 __asm__ volatile(
2421 #if defined(PIC)
2422 "mov %%"REG_b", %6 \n\t"
2423 #endif
2424 "pxor %%mm7, %%mm7 \n\t"
2425 "mov %0, %%"REG_c" \n\t"
2426 "mov %1, %%"REG_D" \n\t"
2427 "mov %2, %%"REG_d" \n\t"
2428 "mov %3, %%"REG_b" \n\t"
2429 "xor %%"REG_a", %%"REG_a" \n\t" // i
2430 PREFETCH" (%%"REG_c") \n\t"
2431 PREFETCH" 32(%%"REG_c") \n\t"
2432 PREFETCH" 64(%%"REG_c") \n\t"
2434 CALL_MMX2_FILTER_CODE
2435 CALL_MMX2_FILTER_CODE
2436 CALL_MMX2_FILTER_CODE
2437 CALL_MMX2_FILTER_CODE
2438 "xor %%"REG_a", %%"REG_a" \n\t" // i
2439 "mov %5, %%"REG_c" \n\t" // src
2440 "mov %1, %%"REG_D" \n\t" // buf1
2441 "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t"
2442 PREFETCH" (%%"REG_c") \n\t"
2443 PREFETCH" 32(%%"REG_c") \n\t"
2444 PREFETCH" 64(%%"REG_c") \n\t"
2446 CALL_MMX2_FILTER_CODE
2447 CALL_MMX2_FILTER_CODE
2448 CALL_MMX2_FILTER_CODE
2449 CALL_MMX2_FILTER_CODE
2451 #if defined(PIC)
2452 "mov %6, %%"REG_b" \n\t"
2453 #endif
2454 :: "m" (src1), "m" (dst), "m" (filter), "m" (filterPos),
2455 "m" (mmx2FilterCode), "m" (src2)
2456 #if defined(PIC)
2457 ,"m" (ebxsave)
2458 #endif
2459 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2460 #if !defined(PIC)
2461 ,"%"REG_b
2462 #endif
2463 );
2464 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
2465 //printf("%d %d %d\n", dstWidth, i, srcW);
2466 dst[i] = src1[srcW-1]*128;
2467 dst[i+VOFW] = src2[srcW-1]*128;
2468 }
2469 } else {
2470 #endif /* COMPILE_TEMPLATE_MMX2 */
2471 x86_reg xInc_shr16 = (x86_reg) (xInc >> 16);
2472 uint16_t xInc_mask = xInc & 0xffff;
2473 x86_reg dstWidth_reg = dstWidth;
2474 __asm__ volatile(
2475 "xor %%"REG_a", %%"REG_a" \n\t" // i
2476 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2477 "xorl %%ecx, %%ecx \n\t" // xalpha
2478 ASMALIGN(4)
2479 "1: \n\t"
2480 "mov %0, %%"REG_S" \n\t"
2481 "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
2482 "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
2483 FAST_BILINEAR_X86
2484 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2486 "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx]
2487 "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1]
2488 FAST_BILINEAR_X86
2489 "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t"
2491 "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF
2492 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry
2493 "add $1, %%"REG_a" \n\t"
2494 "cmp %2, %%"REG_a" \n\t"
2495 " jb 1b \n\t"
2497 /* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2498 which is needed to support GCC 4.0. */
2499 #if ARCH_X86_64 && AV_GCC_VERSION_AT_LEAST(3,4)
2500 :: "m" (src1), "m" (dst), "g" (dstWidth_reg), "m" (xInc_shr16), "m" (xInc_mask),
2501 #else
2502 :: "m" (src1), "m" (dst), "m" (dstWidth_reg), "m" (xInc_shr16), "m" (xInc_mask),
2503 #endif
2504 "r" (src2)
2505 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2507 #if COMPILE_TEMPLATE_MMX2
2508 } //if MMX2 can't be used
2509 #endif
2510 #else
2511 int i;
2512 unsigned int xpos=0;
2513 for (i=0;i<dstWidth;i++) {
2514 register unsigned int xx=xpos>>16;
2515 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2516 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2517 dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2518 /* slower
2519 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2520 dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2521 */
2522 xpos+=xInc;
2523 }
2524 #endif /* ARCH_X86 */
2525 }
2527 inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src1, const uint8_t *src2,
2528 int srcW, int xInc, const int16_t *hChrFilter,
2529 const int16_t *hChrFilterPos, int hChrFilterSize,
2530 uint8_t *formatConvBuffer,
2531 uint32_t *pal)
2532 {
2534 src1 += c->chrSrcOffset;
2535 src2 += c->chrSrcOffset;
2537 if (c->chrToYV12) {
2538 c->chrToYV12(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2539 src1= formatConvBuffer;
2540 src2= formatConvBuffer+VOFW;
2541 }
2543 if (!c->hcscale_fast) {
2544 c->hScale(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2545 c->hScale(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2546 } else { // fast bilinear upscale / crap downscale
2547 c->hcscale_fast(c, dst, dstWidth, src1, src2, srcW, xInc);
2548 }
2550 if (c->chrConvertRange)
2551 c->chrConvertRange(dst, dstWidth);
2552 }
2554 #define DEBUG_SWSCALE_BUFFERS 0
2555 #define DEBUG_BUFFERS(...) if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__)
2557 static int RENAME(swScale)(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY,
2558 int srcSliceH, uint8_t* dst[], int dstStride[])
2559 {
2560 /* load a few things into local vars to make the code more readable and faster */
2561 const int srcW= c->srcW;
2562 const int dstW= c->dstW;
2563 const int dstH= c->dstH;
2564 const int chrDstW= c->chrDstW;
2565 const int chrSrcW= c->chrSrcW;
2566 const int lumXInc= c->lumXInc;
2567 const int chrXInc= c->chrXInc;
2568 const enum PixelFormat dstFormat= c->dstFormat;
2569 const int flags= c->flags;
2570 int16_t *vLumFilterPos= c->vLumFilterPos;
2571 int16_t *vChrFilterPos= c->vChrFilterPos;
2572 int16_t *hLumFilterPos= c->hLumFilterPos;
2573 int16_t *hChrFilterPos= c->hChrFilterPos;
2574 int16_t *vLumFilter= c->vLumFilter;
2575 int16_t *vChrFilter= c->vChrFilter;
2576 int16_t *hLumFilter= c->hLumFilter;
2577 int16_t *hChrFilter= c->hChrFilter;
2578 int32_t *lumMmxFilter= c->lumMmxFilter;
2579 int32_t *chrMmxFilter= c->chrMmxFilter;
2580 int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
2581 const int vLumFilterSize= c->vLumFilterSize;
2582 const int vChrFilterSize= c->vChrFilterSize;
2583 const int hLumFilterSize= c->hLumFilterSize;
2584 const int hChrFilterSize= c->hChrFilterSize;
2585 int16_t **lumPixBuf= c->lumPixBuf;
2586 int16_t **chrPixBuf= c->chrPixBuf;
2587 int16_t **alpPixBuf= c->alpPixBuf;
2588 const int vLumBufSize= c->vLumBufSize;
2589 const int vChrBufSize= c->vChrBufSize;
2590 uint8_t *formatConvBuffer= c->formatConvBuffer;
2591 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
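/* -((-x)>>n) is x/2^n rounded up, so a slice with an odd number of luma
 * lines still covers its last chroma line: */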
2592 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2593 int lastDstY;
2594 uint32_t *pal=c->pal_yuv;
2596 /* vars which will change and which we need to store back in the context */
2597 int dstY= c->dstY;
2598 int lumBufIndex= c->lumBufIndex;
2599 int chrBufIndex= c->chrBufIndex;
2600 int lastInLumBuf= c->lastInLumBuf;
2601 int lastInChrBuf= c->lastInChrBuf;
2603 if (isPacked(c->srcFormat)) {
2604 src[0]=
2605 src[1]=
2606 src[2]=
2607 src[3]= src[0];
2608 srcStride[0]=
2609 srcStride[1]=
2610 srcStride[2]=
2611 srcStride[3]= srcStride[0];
2612 }
2613 srcStride[1]<<= c->vChrDrop;
2614 srcStride[2]<<= c->vChrDrop;
2616 DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
2617 src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
2618 dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
2619 DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
2620 srcSliceY, srcSliceH, dstY, dstH);
2621 DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
2622 vLumFilterSize, vLumBufSize, vChrFilterSize, vChrBufSize);
2624 if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
2625 static int warnedAlready=0; //FIXME move this into the context perhaps
2626 if (flags & SWS_PRINT_INFO && !warnedAlready) {
2627 av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
2628 " ->cannot do aligned memory accesses anymore\n");
2629 warnedAlready=1;
2630 }
2631 }
2633 /* Note that the user may start scaling in the middle of the picture, in which
2634 case this initialization is skipped. That was never really intended, but it
2635 works currently, so people might rely on it. */
2636 if (srcSliceY ==0) {
2637 lumBufIndex=-1;
2638 chrBufIndex=-1;
2639 dstY=0;
2640 lastInLumBuf= -1;
2641 lastInChrBuf= -1;
2642 }
2644 lastDstY= dstY;
2646 for (;dstY < dstH; dstY++) {
2647 unsigned char *dest =dst[0]+dstStride[0]*dstY;
2648 const int chrDstY= dstY>>c->chrDstVSubSample;
2649 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2650 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
2651 unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL;
2653 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2654 const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)];
2655 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2656 int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2657 int lastLumSrcY2=firstLumSrcY2+ vLumFilterSize -1; // Last line needed as input
2658 int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2659 int enough_lines;
2661 //handle holes (FAST_BILINEAR & weird filters)
2662 if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2663 if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2664 assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
2665 assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
2667 DEBUG_BUFFERS("dstY: %d\n", dstY);
2668 DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
2669 firstLumSrcY, lastLumSrcY, lastInLumBuf);
2670 DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
2671 firstChrSrcY, lastChrSrcY, lastInChrBuf);
2673 // Do we have enough lines in this slice to output the dstY line?
2674 enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);
2676 if (!enough_lines) {
2677 lastLumSrcY = srcSliceY + srcSliceH - 1;
2678 lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
2679 DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
2680 lastLumSrcY, lastChrSrcY);
2681 }
2683 //Do horizontal scaling
2684 while(lastInLumBuf < lastLumSrcY) {
2685 const uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2686 const uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
2687 lumBufIndex++;
2688 assert(lumBufIndex < 2*vLumBufSize);
2689 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
2690 assert(lastInLumBuf + 1 - srcSliceY >= 0);
2691 RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
2692 hLumFilter, hLumFilterPos, hLumFilterSize,
2693 formatConvBuffer,
2694 pal, 0);
2695 if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
2696 RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc,
2697 hLumFilter, hLumFilterPos, hLumFilterSize,
2698 formatConvBuffer,
2699 pal, 1);
2700 lastInLumBuf++;
2701 DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
2702 lumBufIndex, lastInLumBuf);
2703 }
2704 while(lastInChrBuf < lastChrSrcY) {
2705 const uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2706 const uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2707 chrBufIndex++;
2708 assert(chrBufIndex < 2*vChrBufSize);
2709 assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
2710 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
2711 //FIXME replace parameters through context struct (some at least)
2713 if (c->needs_hcscale)
2714 RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2715 hChrFilter, hChrFilterPos, hChrFilterSize,
2716 formatConvBuffer,
2717 pal);
2718 lastInChrBuf++;
2719 DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
2720 chrBufIndex, lastInChrBuf);
2721 }
2722 //wrap buf index around to stay inside the ring buffer
2723 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
2724 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
2725 if (!enough_lines)
2726 break; //we can't output a dstY line so let's try with the next slice
2728 #if COMPILE_TEMPLATE_MMX
2729 c->blueDither= ff_dither8[dstY&1];
2730 if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
2731 c->greenDither= ff_dither8[dstY&1];
2732 else
2733 c->greenDither= ff_dither4[dstY&1];
2734 c->redDither= ff_dither8[(dstY+1)&1];
2735 #endif
2736 if (dstY < dstH-2) {
2737 const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2738 const int16_t **chrSrcPtr= (const int16_t **) chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2739 const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
2740 #if COMPILE_TEMPLATE_MMX
2741 int i;
2742 if (flags & SWS_ACCURATE_RND) {
2743 int s= APCK_SIZE / 8;
2744 for (i=0; i<vLumFilterSize; i+=2) {
2745 *(const void**)&lumMmxFilter[s*i ]= lumSrcPtr[i ];
2746 *(const void**)&lumMmxFilter[s*i+APCK_PTR2/4 ]= lumSrcPtr[i+(vLumFilterSize>1)];
2747 lumMmxFilter[s*i+APCK_COEF/4 ]=
2748 lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i ]
2749 + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
2750 if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
2751 *(const void**)&alpMmxFilter[s*i ]= alpSrcPtr[i ];
2752 *(const void**)&alpMmxFilter[s*i+APCK_PTR2/4 ]= alpSrcPtr[i+(vLumFilterSize>1)];
2753 alpMmxFilter[s*i+APCK_COEF/4 ]=
2754 alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4 ];
2755 }
2756 }
2757 for (i=0; i<vChrFilterSize; i+=2) {
2758 *(const void**)&chrMmxFilter[s*i ]= chrSrcPtr[i ];
2759 *(const void**)&chrMmxFilter[s*i+APCK_PTR2/4 ]= chrSrcPtr[i+(vChrFilterSize>1)];
2760 chrMmxFilter[s*i+APCK_COEF/4 ]=
2761 chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i ]
2762 + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
2763 }
2764 } else {
2765 for (i=0; i<vLumFilterSize; i++) {
2766 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
2767 lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
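/* Each source-line pointer is split into two 32-bit halves (the high half is
 * zero on 32-bit builds) so the vertical scaler can reload it with a single
 * movq; slots 2 and 3 get the 16-bit coefficient replicated into both words
 * for pmulhw. */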
2768 lumMmxFilter[4*i+2]=
2769 lumMmxFilter[4*i+3]=
2770 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
2771 if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
2772 alpMmxFilter[4*i+0]= (int32_t)alpSrcPtr[i];
2773 alpMmxFilter[4*i+1]= (uint64_t)alpSrcPtr[i] >> 32;
2774 alpMmxFilter[4*i+2]=
2775 alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
2776 }
2777 }
2778 for (i=0; i<vChrFilterSize; i++) {
2779 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
2780 chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
2781 chrMmxFilter[4*i+2]=
2782 chrMmxFilter[4*i+3]=
2783 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
2784 }
2785 }
2786 #endif
2787 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
2788 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2789 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
2790 c->yuv2nv12X(c,
2791 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2792 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2793 dest, uDest, dstW, chrDstW, dstFormat);
2794 } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
2795 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2796 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2797 if (is16BPS(dstFormat)) {
2798 yuv2yuvX16inC(
2799 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2800 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2801 alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
2802 dstFormat);
2803 } else if (vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
2804 const int16_t *lumBuf = lumSrcPtr[0];
2805 const int16_t *chrBuf= chrSrcPtr[0];
2806 const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
2807 c->yuv2yuv1(c, lumBuf, chrBuf, alpBuf, dest, uDest, vDest, aDest, dstW, chrDstW);
2808 } else { //General YV12
2809 c->yuv2yuvX(c,
2810 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2811 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2812 alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
2813 }
2814 } else {
2815 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2816 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2817 if (vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
2818 int chrAlpha= vChrFilter[2*dstY+1];
2819 if(flags & SWS_FULL_CHR_H_INT) {
2820 yuv2rgbXinC_full(c, //FIXME write a packed1_full function
2821 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2822 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2823 alpSrcPtr, dest, dstW, dstY);
2824 } else {
2825 c->yuv2packed1(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
2826 alpPixBuf ? *alpSrcPtr : NULL,
2827 dest, dstW, chrAlpha, dstFormat, flags, dstY);
2828 }
2829 } else if (vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
2830 int lumAlpha= vLumFilter[2*dstY+1];
2831 int chrAlpha= vChrFilter[2*dstY+1];
2832 lumMmxFilter[2]=
2833 lumMmxFilter[3]= vLumFilter[2*dstY ]*0x10001;
2834 chrMmxFilter[2]=
2835 chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
2836 if(flags & SWS_FULL_CHR_H_INT) {
2837 yuv2rgbXinC_full(c, //FIXME write a packed2_full function
2838 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2839 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2840 alpSrcPtr, dest, dstW, dstY);
2841 } else {
2842 c->yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
2843 alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL,
2844 dest, dstW, lumAlpha, chrAlpha, dstY);
2845 }
2846 } else { //general RGB
2847 if(flags & SWS_FULL_CHR_H_INT) {
2848 yuv2rgbXinC_full(c,
2849 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2850 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2851 alpSrcPtr, dest, dstW, dstY);
2852 } else {
2853 c->yuv2packedX(c,
2854 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2855 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2856 alpSrcPtr, dest, dstW, dstY);
2857 }
2858 }
2859 }
2860 } else { // last few output lines: MMX can't be used here as it would overwrite this array's tail
2861 const int16_t **lumSrcPtr= (const int16_t **)lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2862 const int16_t **chrSrcPtr= (const int16_t **)chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2863 const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
2864 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
2865 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2866 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
2867 yuv2nv12XinC(
2868 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2869 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2870 dest, uDest, dstW, chrDstW, dstFormat);
2871 } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12
2872 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2873 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2874 if (is16BPS(dstFormat)) {
2875 yuv2yuvX16inC(
2876 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2877 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2878 alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
2879 dstFormat);
2880 } else {
2881 yuv2yuvXinC(
2882 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
2883 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2884 alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
2885 }
2886 } else {
2887 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2888 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2889 if(flags & SWS_FULL_CHR_H_INT) {
2890 yuv2rgbXinC_full(c,
2891 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2892 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2893 alpSrcPtr, dest, dstW, dstY);
2894 } else {
2895 yuv2packedXinC(c,
2896 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2897 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2898 alpSrcPtr, dest, dstW, dstY);
2899 }
2900 }
2901 }
2902 }
2904 if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
2905 fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);
2907 #if COMPILE_TEMPLATE_MMX
2908 if (flags & SWS_CPU_CAPS_MMX2 ) __asm__ volatile("sfence":::"memory");
2909 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
2910 if (flags & SWS_CPU_CAPS_3DNOW) __asm__ volatile("femms" :::"memory");
2911 else __asm__ volatile("emms" :::"memory");
2912 #endif
2913 /* store changed local vars back in the context */
2914 c->dstY= dstY;
2915 c->lumBufIndex= lumBufIndex;
2916 c->chrBufIndex= chrBufIndex;
2917 c->lastInLumBuf= lastInLumBuf;
2918 c->lastInChrBuf= lastInChrBuf;
2920 return dstY - lastDstY;
2921 }
2923 static void RENAME(sws_init_swScale)(SwsContext *c)
2924 {
2925 enum PixelFormat srcFormat = c->srcFormat;
2927 c->yuv2nv12X = RENAME(yuv2nv12X );
2928 c->yuv2yuv1 = RENAME(yuv2yuv1 );
2929 c->yuv2yuvX = RENAME(yuv2yuvX );
2930 c->yuv2packed1 = RENAME(yuv2packed1 );
2931 c->yuv2packed2 = RENAME(yuv2packed2 );
2932 c->yuv2packedX = RENAME(yuv2packedX );
2934 c->hScale = RENAME(hScale );
2936 #if COMPILE_TEMPLATE_MMX
2937 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2938 if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed)
2939 #else
2940 if (c->flags & SWS_FAST_BILINEAR)
2941 #endif
2942 {
2943 c->hyscale_fast = RENAME(hyscale_fast);
2944 c->hcscale_fast = RENAME(hcscale_fast);
2945 }
2947 c->chrToYV12 = NULL;
2948 switch(srcFormat) {
2949 case PIX_FMT_YUYV422 : c->chrToYV12 = RENAME(yuy2ToUV); break;
2950 case PIX_FMT_UYVY422 : c->chrToYV12 = RENAME(uyvyToUV); break;
2951 case PIX_FMT_NV12 : c->chrToYV12 = RENAME(nv12ToUV); break;
2952 case PIX_FMT_NV21 : c->chrToYV12 = RENAME(nv21ToUV); break;
2953 case PIX_FMT_RGB8 :
2954 case PIX_FMT_BGR8 :
2955 case PIX_FMT_PAL8 :
2956 case PIX_FMT_BGR4_BYTE:
2957 case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV; break;
2958 case PIX_FMT_YUV420P16BE:
2959 case PIX_FMT_YUV422P16BE:
2960 case PIX_FMT_YUV444P16BE: c->chrToYV12 = RENAME(BEToUV); break;
2961 case PIX_FMT_YUV420P16LE:
2962 case PIX_FMT_YUV422P16LE:
2963 case PIX_FMT_YUV444P16LE: c->chrToYV12 = RENAME(LEToUV); break;
2964 }
2965 if (c->chrSrcHSubSample) {
2966 switch(srcFormat) {
2967 case PIX_FMT_RGB48BE:
2968 case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48ToUV_half; break;
2969 case PIX_FMT_RGB32 : c->chrToYV12 = bgr32ToUV_half; break;
2970 case PIX_FMT_RGB32_1: c->chrToYV12 = bgr321ToUV_half; break;
2971 case PIX_FMT_BGR24 : c->chrToYV12 = RENAME(bgr24ToUV_half); break;
2972 case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV_half; break;
2973 case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV_half; break;
2974 case PIX_FMT_BGR32 : c->chrToYV12 = rgb32ToUV_half; break;
2975 case PIX_FMT_BGR32_1: c->chrToYV12 = rgb321ToUV_half; break;
2976 case PIX_FMT_RGB24 : c->chrToYV12 = RENAME(rgb24ToUV_half); break;
2977 case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV_half; break;
2978 case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV_half; break;
2979 }
2980 } else {
2981 switch(srcFormat) {
2982 case PIX_FMT_RGB48BE:
2983 case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48ToUV; break;
2984 case PIX_FMT_RGB32 : c->chrToYV12 = bgr32ToUV; break;
2985 case PIX_FMT_RGB32_1: c->chrToYV12 = bgr321ToUV; break;
2986 case PIX_FMT_BGR24 : c->chrToYV12 = RENAME(bgr24ToUV); break;
2987 case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV; break;
2988 case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV; break;
2989 case PIX_FMT_BGR32 : c->chrToYV12 = rgb32ToUV; break;
2990 case PIX_FMT_BGR32_1: c->chrToYV12 = rgb321ToUV; break;
2991 case PIX_FMT_RGB24 : c->chrToYV12 = RENAME(rgb24ToUV); break;
2992 case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV; break;
2993 case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV; break;
2994 }
2995 }
2997 c->lumToYV12 = NULL;
2998 c->alpToYV12 = NULL;
2999 switch (srcFormat) {
3000 case PIX_FMT_YUYV422 :
3001 case PIX_FMT_YUV420P16BE:
3002 case PIX_FMT_YUV422P16BE:
3003 case PIX_FMT_YUV444P16BE:
3004 case PIX_FMT_Y400A :
3005 case PIX_FMT_GRAY16BE : c->lumToYV12 = RENAME(yuy2ToY); break;
3006 case PIX_FMT_UYVY422 :
3007 case PIX_FMT_YUV420P16LE:
3008 case PIX_FMT_YUV422P16LE:
3009 case PIX_FMT_YUV444P16LE:
3010 case PIX_FMT_GRAY16LE : c->lumToYV12 = RENAME(uyvyToY); break;
3011 case PIX_FMT_BGR24 : c->lumToYV12 = RENAME(bgr24ToY); break;
3012 case PIX_FMT_BGR565 : c->lumToYV12 = bgr16ToY; break;
3013 case PIX_FMT_BGR555 : c->lumToYV12 = bgr15ToY; break;
3014 case PIX_FMT_RGB24 : c->lumToYV12 = RENAME(rgb24ToY); break;
3015 case PIX_FMT_RGB565 : c->lumToYV12 = rgb16ToY; break;
3016 case PIX_FMT_RGB555 : c->lumToYV12 = rgb15ToY; break;
3017 case PIX_FMT_RGB8 :
3018 case PIX_FMT_BGR8 :
3019 case PIX_FMT_PAL8 :
3020 case PIX_FMT_BGR4_BYTE:
3021 case PIX_FMT_RGB4_BYTE: c->lumToYV12 = palToY; break;
3022 case PIX_FMT_MONOBLACK: c->lumToYV12 = monoblack2Y; break;
3023 case PIX_FMT_MONOWHITE: c->lumToYV12 = monowhite2Y; break;
3024 case PIX_FMT_RGB32 : c->lumToYV12 = bgr32ToY; break;
3025 case PIX_FMT_RGB32_1: c->lumToYV12 = bgr321ToY; break;
3026 case PIX_FMT_BGR32 : c->lumToYV12 = rgb32ToY; break;
3027 case PIX_FMT_BGR32_1: c->lumToYV12 = rgb321ToY; break;
3028 case PIX_FMT_RGB48BE:
3029 case PIX_FMT_RGB48LE: c->lumToYV12 = rgb48ToY; break;
3030 }
3031 if (c->alpPixBuf) {
3032 switch (srcFormat) {
3033 case PIX_FMT_RGB32 :
3034 case PIX_FMT_RGB32_1:
3035 case PIX_FMT_BGR32 :
3036 case PIX_FMT_BGR32_1: c->alpToYV12 = abgrToA; break;
3037 case PIX_FMT_Y400A : c->alpToYV12 = RENAME(yuy2ToY); break;
3038 }
3039 }
3041 switch (srcFormat) {
3042 case PIX_FMT_Y400A :
3043 c->alpSrcOffset = 1;
3044 break;
3045 case PIX_FMT_RGB32 :
3046 case PIX_FMT_BGR32 :
3047 c->alpSrcOffset = 3;
3048 break;
3049 case PIX_FMT_RGB48LE:
3050 c->lumSrcOffset = 1;
3051 c->chrSrcOffset = 1;
3052 c->alpSrcOffset = 1;
3053 break;
3054 }
3056 if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
3057 if (c->srcRange) {
3058 c->lumConvertRange = RENAME(lumRangeFromJpeg);
3059 c->chrConvertRange = RENAME(chrRangeFromJpeg);
3060 } else {
3061 c->lumConvertRange = RENAME(lumRangeToJpeg);
3062 c->chrConvertRange = RENAME(chrRangeToJpeg);
3063 }
3064 }
3066 if (!(isGray(srcFormat) || isGray(c->dstFormat) ||
3067 srcFormat == PIX_FMT_MONOBLACK || srcFormat == PIX_FMT_MONOWHITE))
3068 c->needs_hcscale = 1;
3069 }