/*****************************************************************************
 * i420_yuy2.h : YUV to YUV conversion module for vlc
 *****************************************************************************
 * Copyright (C) 2000, 2001 the VideoLAN team
 * $Id$
 *
 * Authors: Samuel Hocevar <sam@zoy.org>
 *          Damien Fouilleul <damien@videolan.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
 *****************************************************************************/

#ifdef MODULE_NAME_IS_i420_yuy2_mmx

#if defined(CAN_COMPILE_MMX)

/* MMX assembly */

#define MMX_CALL(MMX_INSTRUCTIONS)          \
    do {                                    \
    __asm__ __volatile__(                   \
        ".p2align 3 \n\t                    \
movd       (%0), %%mm1  # Load 4 Cb       00 00 00 00 u3 u2 u1 u0     \n\
movd       (%1), %%mm2  # Load 4 Cr       00 00 00 00 v3 v2 v1 v0     \n\
movq       (%2), %%mm0  # Load 8 Y        y7 y6 y5 y4 y3 y2 y1 y0     \n\
movq       (%3), %%mm3  # Load 8 Y        Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0     \n\
"                                           \
        :                                   \
        : "r" (p_u), "r" (p_v),             \
          "r" (p_y1), "r" (p_y2) );         \
    __asm__ __volatile__(                   \
        ".p2align 3 \n\t"                   \
        MMX_INSTRUCTIONS                    \
        :                                   \
        : "r" (p_line1), "r" (p_line2) );   \
    p_line1 += 16; p_line2 += 16;           \
    p_y1 += 8; p_y2 += 8;                   \
    p_u += 4; p_v += 4;                     \
    } while(0)

#define MMX_END __asm__ __volatile__ ( "emms" )

#define MMX_YUV420_YUYV "                                             \n\
punpcklbw %%mm2, %%mm1  #                 v3 u3 v2 u2 v1 u1 v0 u0     \n\
movq      %%mm0, %%mm2  #                 y7 y6 y5 y4 y3 y2 y1 y0     \n\
punpcklbw %%mm1, %%mm2  #                 v1 y3 u1 y2 v0 y1 u0 y0     \n\
movq      %%mm2, (%0)   # Store low YUYV                              \n\
punpckhbw %%mm1, %%mm0  #                 v3 y7 u3 y6 v2 y5 u2 y4     \n\
movq      %%mm0, 8(%0)  # Store high YUYV                             \n\
movq      %%mm3, %%mm4  #                 Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0     \n\
punpcklbw %%mm1, %%mm4  #                 v1 Y3 u1 Y2 v0 Y1 u0 Y0     \n\
movq      %%mm4, (%1)   # Store low YUYV                              \n\
punpckhbw %%mm1, %%mm3  #                 v3 Y7 u3 Y6 v2 Y5 u2 Y4     \n\
movq      %%mm3, 8(%1)  # Store high YUYV                             \n\
"
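
/*
 * Usage sketch (illustrative; the function below is hypothetical, only the
 * pointer names are the ones referenced by the macros): each kernel is meant
 * to be expanded through MMX_CALL inside a loop that converts two output
 * lines at a time, 8 pixels per row per pass.
 *
 *     static void DemoTwoRowsMMX( uint8_t *p_line1, uint8_t *p_line2,
 *                                 uint8_t *p_y1, uint8_t *p_y2,
 *                                 uint8_t *p_u, uint8_t *p_v, int i_width )
 *     {
 *         int i_x;
 *         for( i_x = i_width / 8; i_x-- ; )
 *         {
 *             MMX_CALL( MMX_YUV420_YUYV );
 *         }
 *         MMX_END;    // emms before returning to x87 floating-point code
 *     }
 */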

#define MMX_YUV420_YVYU "                                             \n\
punpcklbw %%mm1, %%mm2  #                 u3 v3 u2 v2 u1 v1 u0 v0     \n\
movq      %%mm0, %%mm1  #                 y7 y6 y5 y4 y3 y2 y1 y0     \n\
punpcklbw %%mm2, %%mm1  #                 u1 y3 v1 y2 u0 y1 v0 y0     \n\
movq      %%mm1, (%0)   # Store low YUYV                              \n\
punpckhbw %%mm2, %%mm0  #                 u3 y7 v3 y6 u2 y5 v2 y4     \n\
movq      %%mm0, 8(%0)  # Store high YUYV                             \n\
movq      %%mm3, %%mm4  #                 Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0     \n\
punpcklbw %%mm2, %%mm4  #                 u1 Y3 v1 Y2 u0 Y1 v0 Y0     \n\
movq      %%mm4, (%1)   # Store low YUYV                              \n\
punpckhbw %%mm2, %%mm3  #                 u3 Y7 v3 Y6 u2 Y5 v2 Y4     \n\
movq      %%mm3, 8(%1)  # Store high YUYV                             \n\
"

#define MMX_YUV420_UYVY "                                             \n\
punpcklbw %%mm2, %%mm1  #                 v3 u3 v2 u2 v1 u1 v0 u0     \n\
movq      %%mm1, %%mm2  #                 v3 u3 v2 u2 v1 u1 v0 u0     \n\
punpcklbw %%mm0, %%mm2  #                 y3 v1 y2 u1 y1 v0 y0 u0     \n\
movq      %%mm2, (%0)   # Store low UYVY                              \n\
movq      %%mm1, %%mm2  #                 u3 v3 u2 v2 u1 v1 u0 v0     \n\
punpckhbw %%mm0, %%mm2  #                 y3 v1 y2 u1 y1 v0 y0 u0     \n\
movq      %%mm2, 8(%0)  # Store high UYVY                             \n\
movq      %%mm1, %%mm4  #                 u3 v3 u2 v2 u1 v1 u0 v0     \n\
punpcklbw %%mm3, %%mm4  #                 Y3 v1 Y2 u1 Y1 v0 Y0 u0     \n\
movq      %%mm4, (%1)   # Store low UYVY                              \n\
punpckhbw %%mm3, %%mm1  #                 Y7 v3 Y6 u3 Y5 v2 Y4 u2     \n\
movq      %%mm1, 8(%1)  # Store high UYVY                             \n\
"

/* FIXME: this code does not work! Chroma seems to be wrong. */
#define MMX_YUV420_Y211 "                                             \n\
movd      (%4), %%mm2    # Load 4 Cb      00 00 00 00 u3 u2 u1 u0     \n\
movd      (%5), %%mm3    # Load 4 Cr      00 00 00 00 v3 v2 v1 v0     \n\
pand      i_00ffw, %%mm0 # get Y even     00 Y6 00 Y4 00 Y2 00 Y0     \n\
packuswb  %%mm0, %%mm0   # pack Y         y6 y4 y2 y0 y6 y4 y2 y0     \n\
pand      i_00ffw, %%mm2 # get U even     00 u6 00 u4 00 u2 00 u0     \n\
packuswb  %%mm2, %%mm2   # pack U         00 00 u2 u0 00 00 u2 u0     \n\
pand      i_00ffw, %%mm3 # get V even     00 v6 00 v4 00 v2 00 v0     \n\
packuswb  %%mm3, %%mm3   # pack V         00 00 v2 v0 00 00 v2 v0     \n\
punpcklbw %%mm3, %%mm2   #                00 00 00 00 v2 u2 v0 u0     \n\
psubsw    i_80w, %%mm2   # U,V -= 128                                 \n\
punpcklbw %%mm2, %%mm0   #                v2 y6 u2 y4 v0 y2 u0 y0     \n\
movq      %%mm0, (%0)    # Store YUYV                                 \n\
pand      i_00ffw, %%mm1 # get Y even     00 Y6 00 Y4 00 Y2 00 Y0     \n\
packuswb  %%mm1, %%mm1   # pack Y         Y6 Y4 Y2 Y0 Y6 Y4 Y2 Y0     \n\
punpcklbw %%mm2, %%mm1   #                v2 Y6 u2 Y4 v0 Y2 u0 Y0     \n\
movq      %%mm1, (%1)    # Store YUYV                                 \n\
"

#elif defined(HAVE_MMX_INTRINSICS)

/* MMX intrinsics */

#include <mmintrin.h>

#define MMX_CALL(MMX_INSTRUCTIONS)          \
    do {                                    \
        __m64 mm0, mm1, mm2, mm3, mm4;      \
        MMX_INSTRUCTIONS                    \
        p_line1 += 16; p_line2 += 16;       \
        p_y1 += 8; p_y2 += 8;               \
        p_u += 4; p_v += 4;                 \
    } while(0)

#define MMX_END _mm_empty()

#define MMX_YUV420_YUYV                         \
    mm1 = _mm_cvtsi32_si64(*(int*)p_u);         \
    mm2 = _mm_cvtsi32_si64(*(int*)p_v);         \
    mm0 = (__m64)*(uint64_t*)p_y1;              \
    mm3 = (__m64)*(uint64_t*)p_y2;              \
    mm1 = _mm_unpacklo_pi8(mm1, mm2);           \
    mm2 = mm0;                                  \
    mm2 = _mm_unpacklo_pi8(mm2, mm1);           \
    *(uint64_t*)p_line1 = (uint64_t)mm2;        \
    mm0 = _mm_unpackhi_pi8(mm0, mm1);           \
    *(uint64_t*)(p_line1+8) = (uint64_t)mm0;    \
    mm4 = mm3;                                  \
    mm4 = _mm_unpacklo_pi8(mm4, mm1);           \
    *(uint64_t*)p_line2 = (uint64_t)mm4;        \
    mm3 = _mm_unpackhi_pi8(mm3, mm1);           \
    *(uint64_t*)(p_line2+8) = (uint64_t)mm3;

#define MMX_YUV420_YVYU                         \
    mm2 = _mm_cvtsi32_si64(*(int*)p_u);         \
    mm1 = _mm_cvtsi32_si64(*(int*)p_v);         \
    mm0 = (__m64)*(uint64_t*)p_y1;              \
    mm3 = (__m64)*(uint64_t*)p_y2;              \
    mm1 = _mm_unpacklo_pi8(mm1, mm2);           \
    mm2 = mm0;                                  \
    mm2 = _mm_unpacklo_pi8(mm2, mm1);           \
    *(uint64_t*)p_line1 = (uint64_t)mm2;        \
    mm0 = _mm_unpackhi_pi8(mm0, mm1);           \
    *(uint64_t*)(p_line1+8) = (uint64_t)mm0;    \
    mm4 = mm3;                                  \
    mm4 = _mm_unpacklo_pi8(mm4, mm1);           \
    *(uint64_t*)p_line2 = (uint64_t)mm4;        \
    mm3 = _mm_unpackhi_pi8(mm3, mm1);           \
    *(uint64_t*)(p_line2+8) = (uint64_t)mm3;

#define MMX_YUV420_UYVY                         \
    mm1 = _mm_cvtsi32_si64(*(int*)p_u);         \
    mm2 = _mm_cvtsi32_si64(*(int*)p_v);         \
    mm0 = (__m64)*(uint64_t*)p_y1;              \
    mm3 = (__m64)*(uint64_t*)p_y2;              \
    mm1 = _mm_unpacklo_pi8(mm1, mm2);           \
    mm2 = mm1;                                  \
    mm2 = _mm_unpacklo_pi8(mm2, mm0);           \
    *(uint64_t*)p_line1 = (uint64_t)mm2;        \
    mm2 = mm1;                                  \
    mm2 = _mm_unpackhi_pi8(mm2, mm0);           \
    *(uint64_t*)(p_line1+8) = (uint64_t)mm2;    \
    mm4 = mm1;                                  \
    mm4 = _mm_unpacklo_pi8(mm4, mm3);           \
    *(uint64_t*)p_line2 = (uint64_t)mm4;        \
    mm1 = _mm_unpackhi_pi8(mm1, mm3);           \
    *(uint64_t*)(p_line2+8) = (uint64_t)mm1;
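
/*
 * Note: whichever MMX variant gets compiled, MMX_END must be executed after
 * the last MMX_CALL, because the MMX registers alias the x87 register stack
 * and emms / _mm_empty() releases them. A minimal sketch (loop bounds are
 * illustrative, not part of this header):
 *
 *     for( i_x = i_width / 8; i_x-- ; )
 *         MMX_CALL( MMX_YUV420_UYVY );
 *     MMX_END;
 */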

#endif

#elif defined( MODULE_NAME_IS_i420_yuy2_sse2 )

#if defined(CAN_COMPILE_SSE2)

/* SSE2 assembly */

#define SSE2_CALL(SSE2_INSTRUCTIONS)        \
    do {                                    \
    __asm__ __volatile__(                   \
        ".p2align 3 \n\t                    \
movq        (%0), %%xmm1  # Load 8 Cb     u7 u6 u5 u4 u3 u2 u1 u0     \n\
movq        (%1), %%xmm2  # Load 8 Cr     v7 v6 v5 v4 v3 v2 v1 v0     \n\
"                                           \
        :                                   \
        : "r" (p_u), "r" (p_v) );           \
    __asm__ __volatile__(                   \
        ".p2align 3 \n\t"                   \
        SSE2_INSTRUCTIONS                   \
        :                                   \
        : "r" (p_line1), "r" (p_line2),     \
          "r" (p_y1), "r" (p_y2) );         \
    p_line1 += 32; p_line2 += 32;           \
    p_y1 += 16; p_y2 += 16;                 \
    p_u += 8; p_v += 8;                     \
    } while(0)

#define SSE2_END __asm__ __volatile__ ( "sfence" ::: "memory" )

#define SSE2_YUV420_YUYV_ALIGNED "                                    \n\
movdqa      (%2), %%xmm0  # Load 16 Y     y15 y14 y13 .. y2 y1 y0     \n\
movdqa      (%3), %%xmm3  # Load 16 Y     Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0     \n\
punpcklbw %%xmm2, %%xmm1  #               v7 u7 v6 u6 .. u1 v0 u0     \n\
movdqa    %%xmm0, %%xmm2  #               y15 y14 y13 .. y2 y1 y0     \n\
punpcklbw %%xmm1, %%xmm2  #               v3 y7 u3 .. v0 y1 u0 y0     \n\
movntdq   %%xmm2, (%0)    # Store low YUYV                            \n\
punpckhbw %%xmm1, %%xmm0  #               v3 y7 u3 y6 v2 y5 u2 y4     \n\
movntdq   %%xmm0, 16(%0)  # Store high YUYV                           \n\
movdqa    %%xmm3, %%xmm4  #               Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0     \n\
punpcklbw %%xmm1, %%xmm4  #               v1 Y3 u1 Y2 v0 Y1 u0 Y0     \n\
movntdq   %%xmm4, (%1)    # Store low YUYV                            \n\
punpckhbw %%xmm1, %%xmm3  #               v3 Y7 u3 Y6 v2 Y5 u2 Y4     \n\
movntdq   %%xmm3, 16(%1)  # Store high YUYV                           \n\
"

#define SSE2_YUV420_YUYV_UNALIGNED "                                  \n\
movdqu      (%2), %%xmm0  # Load 16 Y     y7 y6 y5 y4 y3 y2 y1 y0     \n\
movdqu      (%3), %%xmm3  # Load 16 Y     Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0     \n\
prefetchnta (%0)          # Tell CPU not to cache output YUYV data    \n\
prefetchnta (%1)          # Tell CPU not to cache output YUYV data    \n\
punpcklbw %%xmm2, %%xmm1  #               v3 u3 v2 u2 v1 u1 v0 u0     \n\
movdqa    %%xmm0, %%xmm2  #               y7 y6 y5 y4 y3 y2 y1 y0     \n\
punpcklbw %%xmm1, %%xmm2  #               v1 y3 u1 y2 v0 y1 u0 y0     \n\
movdqu    %%xmm2, (%0)    # Store low YUYV                            \n\
punpckhbw %%xmm1, %%xmm0  #               v3 y7 u3 y6 v2 y5 u2 y4     \n\
movdqu    %%xmm0, 16(%0)  # Store high YUYV                           \n\
movdqa    %%xmm3, %%xmm4  #               Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0     \n\
punpcklbw %%xmm1, %%xmm4  #               v1 Y3 u1 Y2 v0 Y1 u0 Y0     \n\
movdqu    %%xmm4, (%1)    # Store low YUYV                            \n\
punpckhbw %%xmm1, %%xmm3  #               v3 Y7 u3 Y6 v2 Y5 u2 Y4     \n\
movdqu    %%xmm3, 16(%1)  # Store high YUYV                           \n\
"
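
/*
 * Selection sketch (illustrative): the _ALIGNED variants use movdqa/movntdq
 * and therefore require the luma and output pointers to be on 16-byte
 * boundaries. A caller would typically test the alignment once per line pair
 * and pick the matching variant, e.g.:
 *
 *     if( !( 0x0f & ( (intptr_t)p_line1 | (intptr_t)p_line2 |
 *                     (intptr_t)p_y1    | (intptr_t)p_y2 ) ) )
 *         SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED );
 *     else
 *         SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED );
 */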

#define SSE2_YUV420_YVYU_ALIGNED "                                    \n\
movdqa      (%2), %%xmm0  # Load 16 Y     y7 y6 y5 y4 y3 y2 y1 y0     \n\
movdqa      (%3), %%xmm3  # Load 16 Y     Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0     \n\
punpcklbw %%xmm1, %%xmm2  #               u3 v3 u2 v2 u1 v1 u0 v0     \n\
movdqa    %%xmm0, %%xmm1  #               y7 y6 y5 y4 y3 y2 y1 y0     \n\
punpcklbw %%xmm2, %%xmm1  #               u1 y3 v1 y2 u0 y1 v0 y0     \n\
movntdq   %%xmm1, (%0)    # Store low YUYV                            \n\
punpckhbw %%xmm2, %%xmm0  #               u3 y7 v3 y6 u2 y5 v2 y4     \n\
movntdq   %%xmm0, 16(%0)  # Store high YUYV                           \n\
movdqa    %%xmm3, %%xmm4  #               Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0     \n\
punpcklbw %%xmm2, %%xmm4  #               u1 Y3 v1 Y2 u0 Y1 v0 Y0     \n\
movntdq   %%xmm4, (%1)    # Store low YUYV                            \n\
punpckhbw %%xmm2, %%xmm3  #               u3 Y7 v3 Y6 u2 Y5 v2 Y4     \n\
movntdq   %%xmm3, 16(%1)  # Store high YUYV                           \n\
"

#define SSE2_YUV420_YVYU_UNALIGNED "                                  \n\
movdqu      (%2), %%xmm0  # Load 16 Y     y7 y6 y5 y4 y3 y2 y1 y0     \n\
movdqu      (%3), %%xmm3  # Load 16 Y     Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0     \n\
prefetchnta (%0)          # Tell CPU not to cache output YVYU data    \n\
prefetchnta (%1)          # Tell CPU not to cache output YVYU data    \n\
punpcklbw %%xmm1, %%xmm2  #               u3 v3 u2 v2 u1 v1 u0 v0     \n\
movdqu    %%xmm0, %%xmm1  #               y7 y6 y5 y4 y3 y2 y1 y0     \n\
punpcklbw %%xmm2, %%xmm1  #               u1 y3 v1 y2 u0 y1 v0 y0     \n\
movdqu    %%xmm1, (%0)    # Store low YUYV                            \n\
punpckhbw %%xmm2, %%xmm0  #               u3 y7 v3 y6 u2 y5 v2 y4     \n\
movdqu    %%xmm0, 16(%0)  # Store high YUYV                           \n\
movdqu    %%xmm3, %%xmm4  #               Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0     \n\
punpcklbw %%xmm2, %%xmm4  #               u1 Y3 v1 Y2 u0 Y1 v0 Y0     \n\
movdqu    %%xmm4, (%1)    # Store low YUYV                            \n\
punpckhbw %%xmm2, %%xmm3  #               u3 Y7 v3 Y6 u2 Y5 v2 Y4     \n\
movdqu    %%xmm3, 16(%1)  # Store high YUYV                           \n\
"

#define SSE2_YUV420_UYVY_ALIGNED "                                    \n\
movdqa      (%2), %%xmm0  # Load 16 Y     y7 y6 y5 y4 y3 y2 y1 y0     \n\
movdqa      (%3), %%xmm3  # Load 16 Y     Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0     \n\
punpcklbw %%xmm2, %%xmm1  #               v3 u3 v2 u2 v1 u1 v0 u0     \n\
movdqa    %%xmm1, %%xmm2  #               v3 u3 v2 u2 v1 u1 v0 u0     \n\
punpcklbw %%xmm0, %%xmm2  #               y3 v1 y2 u1 y1 v0 y0 u0     \n\
movntdq   %%xmm2, (%0)    # Store low UYVY                            \n\
movdqa    %%xmm1, %%xmm2  #               u3 v3 u2 v2 u1 v1 u0 v0     \n\
punpckhbw %%xmm0, %%xmm2  #               y3 v1 y2 u1 y1 v0 y0 u0     \n\
movntdq   %%xmm2, 16(%0)  # Store high UYVY                           \n\
movdqa    %%xmm1, %%xmm4  #               u3 v3 u2 v2 u1 v1 u0 v0     \n\
punpcklbw %%xmm3, %%xmm4  #               Y3 v1 Y2 u1 Y1 v0 Y0 u0     \n\
movntdq   %%xmm4, (%1)    # Store low UYVY                            \n\
punpckhbw %%xmm3, %%xmm1  #               Y7 v3 Y6 u3 Y5 v2 Y4 u2     \n\
movntdq   %%xmm1, 16(%1)  # Store high UYVY                           \n\
"

#define SSE2_YUV420_UYVY_UNALIGNED "                                  \n\
movdqu      (%2), %%xmm0  # Load 16 Y     y7 y6 y5 y4 y3 y2 y1 y0     \n\
movdqu      (%3), %%xmm3  # Load 16 Y     Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0     \n\
prefetchnta (%0)          # Tell CPU not to cache output UYVY data    \n\
prefetchnta (%1)          # Tell CPU not to cache output UYVY data    \n\
punpcklbw %%xmm2, %%xmm1  #               v3 u3 v2 u2 v1 u1 v0 u0     \n\
movdqu    %%xmm1, %%xmm2  #               v3 u3 v2 u2 v1 u1 v0 u0     \n\
punpcklbw %%xmm0, %%xmm2  #               y3 v1 y2 u1 y1 v0 y0 u0     \n\
movdqu    %%xmm2, (%0)    # Store low UYVY                            \n\
movdqu    %%xmm1, %%xmm2  #               u3 v3 u2 v2 u1 v1 u0 v0     \n\
punpckhbw %%xmm0, %%xmm2  #               y3 v1 y2 u1 y1 v0 y0 u0     \n\
movdqu    %%xmm2, 16(%0)  # Store high UYVY                           \n\
movdqu    %%xmm1, %%xmm4  #               u3 v3 u2 v2 u1 v1 u0 v0     \n\
punpcklbw %%xmm3, %%xmm4  #               Y3 v1 Y2 u1 Y1 v0 Y0 u0     \n\
movdqu    %%xmm4, (%1)    # Store low UYVY                            \n\
punpckhbw %%xmm3, %%xmm1  #               Y7 v3 Y6 u3 Y5 v2 Y4 u2     \n\
movdqu    %%xmm1, 16(%1)  # Store high UYVY                           \n\
"

#elif defined(HAVE_SSE2_INTRINSICS)

/* SSE2 intrinsics */

#include <emmintrin.h>

#define SSE2_CALL(SSE2_INSTRUCTIONS)            \
    do {                                        \
        __m128i xmm0, xmm1, xmm2, xmm3, xmm4;   \
        SSE2_INSTRUCTIONS                       \
        p_line1 += 32; p_line2 += 32;           \
        p_y1 += 16; p_y2 += 16;                 \
        p_u += 8; p_v += 8;                     \
    } while(0)

#define SSE2_END _mm_sfence()

#define SSE2_YUV420_YUYV_ALIGNED                        \
    xmm1 = _mm_loadl_epi64((__m128i *)p_u);             \
    xmm2 = _mm_loadl_epi64((__m128i *)p_v);             \
    xmm0 = _mm_load_si128((__m128i *)p_y1);             \
    xmm3 = _mm_load_si128((__m128i *)p_y2);             \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);               \
    xmm2 = xmm0;                                        \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm1);               \
    _mm_stream_si128((__m128i*)(p_line1), xmm2);        \
    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1);               \
    _mm_stream_si128((__m128i*)(p_line1+16), xmm0);     \
    xmm4 = xmm3;                                        \
    xmm4 = _mm_unpacklo_epi8(xmm4, xmm1);               \
    _mm_stream_si128((__m128i*)(p_line2), xmm4);        \
    xmm3 = _mm_unpackhi_epi8(xmm3, xmm1);               \
    _mm_stream_si128((__m128i*)(p_line2+16), xmm3);

#define SSE2_YUV420_YUYV_UNALIGNED                      \
    xmm1 = _mm_loadl_epi64((__m128i *)p_u);             \
    xmm2 = _mm_loadl_epi64((__m128i *)p_v);             \
    xmm0 = _mm_loadu_si128((__m128i *)p_y1);            \
    xmm3 = _mm_loadu_si128((__m128i *)p_y2);            \
    _mm_prefetch(p_line1, _MM_HINT_NTA);                \
    _mm_prefetch(p_line2, _MM_HINT_NTA);                \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);               \
    xmm2 = xmm0;                                        \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm1);               \
    _mm_storeu_si128((__m128i*)(p_line1), xmm2);        \
    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1);               \
    _mm_storeu_si128((__m128i*)(p_line1+16), xmm0);     \
    xmm4 = xmm3;                                        \
    xmm4 = _mm_unpacklo_epi8(xmm4, xmm1);               \
    _mm_storeu_si128((__m128i*)(p_line2), xmm4);        \
    xmm3 = _mm_unpackhi_epi8(xmm3, xmm1);               \
    _mm_storeu_si128((__m128i*)(p_line2+16), xmm3);

#define SSE2_YUV420_YVYU_ALIGNED                        \
    xmm1 = _mm_loadl_epi64((__m128i *)p_v);             \
    xmm2 = _mm_loadl_epi64((__m128i *)p_u);             \
    xmm0 = _mm_load_si128((__m128i *)p_y1);             \
    xmm3 = _mm_load_si128((__m128i *)p_y2);             \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);               \
    xmm2 = xmm0;                                        \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm1);               \
    _mm_stream_si128((__m128i*)(p_line1), xmm2);        \
    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1);               \
    _mm_stream_si128((__m128i*)(p_line1+16), xmm0);     \
    xmm4 = xmm3;                                        \
    xmm4 = _mm_unpacklo_epi8(xmm4, xmm1);               \
    _mm_stream_si128((__m128i*)(p_line2), xmm4);        \
    xmm3 = _mm_unpackhi_epi8(xmm3, xmm1);               \
    _mm_stream_si128((__m128i*)(p_line2+16), xmm3);

#define SSE2_YUV420_YVYU_UNALIGNED                      \
    xmm1 = _mm_loadl_epi64((__m128i *)p_v);             \
    xmm2 = _mm_loadl_epi64((__m128i *)p_u);             \
    xmm0 = _mm_loadu_si128((__m128i *)p_y1);            \
    xmm3 = _mm_loadu_si128((__m128i *)p_y2);            \
    _mm_prefetch(p_line1, _MM_HINT_NTA);                \
    _mm_prefetch(p_line2, _MM_HINT_NTA);                \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);               \
    xmm2 = xmm0;                                        \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm1);               \
    _mm_storeu_si128((__m128i*)(p_line1), xmm2);        \
    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1);               \
    _mm_storeu_si128((__m128i*)(p_line1+16), xmm0);     \
    xmm4 = xmm3;                                        \
    xmm4 = _mm_unpacklo_epi8(xmm4, xmm1);               \
    _mm_storeu_si128((__m128i*)(p_line2), xmm4);        \
    xmm3 = _mm_unpackhi_epi8(xmm3, xmm1);               \
    _mm_storeu_si128((__m128i*)(p_line2+16), xmm3);

#define SSE2_YUV420_UYVY_ALIGNED                        \
    xmm1 = _mm_loadl_epi64((__m128i *)p_u);             \
    xmm2 = _mm_loadl_epi64((__m128i *)p_v);             \
    xmm0 = _mm_load_si128((__m128i *)p_y1);             \
    xmm3 = _mm_load_si128((__m128i *)p_y2);             \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);               \
    xmm2 = xmm1;                                        \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm0);               \
    _mm_stream_si128((__m128i*)(p_line1), xmm2);        \
    xmm2 = xmm1;                                        \
    xmm2 = _mm_unpackhi_epi8(xmm2, xmm0);               \
    _mm_stream_si128((__m128i*)(p_line1+16), xmm2);     \
    xmm4 = xmm1;                                        \
    xmm4 = _mm_unpacklo_epi8(xmm4, xmm3);               \
    _mm_stream_si128((__m128i*)(p_line2), xmm4);        \
    xmm1 = _mm_unpackhi_epi8(xmm1, xmm3);               \
    _mm_stream_si128((__m128i*)(p_line2+16), xmm1);

#define SSE2_YUV420_UYVY_UNALIGNED                      \
    xmm1 = _mm_loadl_epi64((__m128i *)p_u);             \
    xmm2 = _mm_loadl_epi64((__m128i *)p_v);             \
    xmm0 = _mm_loadu_si128((__m128i *)p_y1);            \
    xmm3 = _mm_loadu_si128((__m128i *)p_y2);            \
    _mm_prefetch(p_line1, _MM_HINT_NTA);                \
    _mm_prefetch(p_line2, _MM_HINT_NTA);                \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);               \
    xmm2 = xmm1;                                        \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm0);               \
    _mm_storeu_si128((__m128i*)(p_line1), xmm2);        \
    xmm2 = xmm1;                                        \
    xmm2 = _mm_unpackhi_epi8(xmm2, xmm0);               \
    _mm_storeu_si128((__m128i*)(p_line1+16), xmm2);     \
    xmm4 = xmm1;                                        \
    xmm4 = _mm_unpacklo_epi8(xmm4, xmm3);               \
    _mm_storeu_si128((__m128i*)(p_line2), xmm4);        \
    xmm1 = _mm_unpackhi_epi8(xmm1, xmm3);               \
    _mm_storeu_si128((__m128i*)(p_line2+16), xmm1);
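
/*
 * Usage sketch (illustrative; the function below is hypothetical): the
 * intrinsic variants are invoked exactly like the inline-assembly ones, and
 * SSE2_END (_mm_sfence) should be issued after the last non-temporal store.
 *
 *     static void DemoTwoRowsSSE2( uint8_t *p_line1, uint8_t *p_line2,
 *                                  uint8_t *p_y1, uint8_t *p_y2,
 *                                  uint8_t *p_u, uint8_t *p_v, int i_width )
 *     {
 *         int i_x;
 *         for( i_x = i_width / 16; i_x-- ; )  // 16 pixels per row and pass
 *             SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED );
 *         SSE2_END;
 *     }
 */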

#endif

#endif

/* Used in both accelerated and C modules */

#define C_YUV420_YVYU( )                                                    \
    *(p_line1)++ = *(p_y1)++; *(p_line2)++ = *(p_y2)++;                     \
    *(p_line1)++ = *(p_line2)++ = *(p_v)++;                                 \
    *(p_line1)++ = *(p_y1)++; *(p_line2)++ = *(p_y2)++;                     \
    *(p_line1)++ = *(p_line2)++ = *(p_u)++;                                 \

#define C_YUV420_Y211( )                                                    \
    *(p_line1)++ = *(p_y1); p_y1 += 2;                                      \
    *(p_line2)++ = *(p_y2); p_y2 += 2;                                      \
    *(p_line1)++ = *(p_line2)++ = *(p_u) - 0x80; p_u += 2;                  \
    *(p_line1)++ = *(p_y1); p_y1 += 2;                                      \
    *(p_line2)++ = *(p_y2); p_y2 += 2;                                      \
    *(p_line1)++ = *(p_line2)++ = *(p_v) - 0x80; p_v += 2;                  \

#define C_YUV420_YUYV( )                                                    \
    *(p_line1)++ = *(p_y1)++; *(p_line2)++ = *(p_y2)++;                     \
    *(p_line1)++ = *(p_line2)++ = *(p_u)++;                                 \
    *(p_line1)++ = *(p_y1)++; *(p_line2)++ = *(p_y2)++;                     \
    *(p_line1)++ = *(p_line2)++ = *(p_v)++;                                 \

#define C_YUV420_UYVY( )                                                    \
    *(p_line1)++ = *(p_line2)++ = *(p_u)++;                                 \
    *(p_line1)++ = *(p_y1)++; *(p_line2)++ = *(p_y2)++;                     \
    *(p_line1)++ = *(p_line2)++ = *(p_v)++;                                 \
    *(p_line1)++ = *(p_y1)++; *(p_line2)++ = *(p_y2)++;                     \
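
/*
 * Usage sketch (illustrative): each C_* invocation consumes a 2x2 block of
 * source pixels (two luma samples on each row plus one shared chroma pair),
 * so a scalar row-pair loop handling 8 pixels per iteration would look like:
 *
 *     for( i_x = i_width / 8; i_x-- ; )
 *     {
 *         C_YUV420_YUYV( );
 *         C_YUV420_YUYV( );
 *         C_YUV420_YUYV( );
 *         C_YUV420_YUYV( );
 *     }
 */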