Typos
[vlc/asuraparaju-public.git] / modules / video_chroma / i422_yuy2.h
blob52ad7e4c3fd571c65b319714141afcceb4ad58d6
1 /*****************************************************************************
2 * i422_yuy2.h : YUV to YUV conversion module for vlc
3 *****************************************************************************
4 * Copyright (C) 2002 the VideoLAN team
5 * $Id$
7 * Authors: Samuel Hocevar <sam@zoy.org>
8 * Damien Fouilleul <damienf@videolan.org>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23 *****************************************************************************/
25 #ifdef MODULE_NAME_IS_i422_yuy2_mmx
27 #if defined(CAN_COMPILE_MMX)
29 /* MMX assembly */
/*
 * MMX_CALL: run one MMX asm template and step the plane pointers.
 *
 * MMX_INSTRUCTIONS is a string literal with the instruction sequence. It can
 * refer to the operands %0 = p_line (packed output), %1 = p_y, %2 = p_u and
 * %3 = p_v; all four pointers must be in scope where the macro is used.
 * After the asm statement the pointers advance by one group: 16 output
 * bytes, 8 Y bytes and 4 bytes of each chroma plane.
 *
 * The pointers are passed as plain register inputs, so the compiler cannot
 * tell that the template reads and writes the memory behind them, nor that
 * it overwrites mm0-mm2. The clobber list states both explicitly so that
 * surrounding loads/stores are not cached or reordered across the asm.
 */
#define MMX_CALL(MMX_INSTRUCTIONS)              \
    do {                                        \
        __asm__ __volatile__(                   \
            ".p2align 3 \n\t"                   \
            MMX_INSTRUCTIONS                    \
            :                                   \
            : "r" (p_line), "r" (p_y),          \
              "r" (p_u), "r" (p_v)              \
            : "memory", "mm0", "mm1", "mm2" );  \
        p_line += 16; p_y += 8;                 \
        p_u += 4; p_v += 4;                     \
    } while(0)
/* MMX_END: expands to an asm statement that issues `emms`, the instruction
 * that empties the MMX state so the x87 FPU registers can be used again. */
43 #define MMX_END __asm__ __volatile__ ( "emms" )
/*
 * MMX_YUV422_YUYV: asm template (string literal) for use with MMX_CALL.
 * Operands follow MMX_CALL: %0 = output line, %1 = Y, %2 = Cb, %3 = Cr.
 * Loads 8 Y bytes and 4 bytes of each chroma plane, interleaves them and
 * stores 16 bytes at %0 in the memory order Y0 U0 Y1 V0 Y2 U1 Y3 V1 ...
 * The trailing lane diagrams list bytes from most to least significant.
 * Uses mm0, mm1 and mm2 as scratch registers.
 */
45 #define MMX_YUV422_YUYV " \n\
46 movq (%1), %%mm0 # Load 8 Y y7 y6 y5 y4 y3 y2 y1 y0 \n\
47 movd (%2), %%mm1 # Load 4 Cb 00 00 00 00 u3 u2 u1 u0 \n\
48 movd (%3), %%mm2 # Load 4 Cr 00 00 00 00 v3 v2 v1 v0 \n\
49 punpcklbw %%mm2, %%mm1 # v3 u3 v2 u2 v1 u1 v0 u0 \n\
50 movq %%mm0, %%mm2 # y7 y6 y5 y4 y3 y2 y1 y0 \n\
51 punpcklbw %%mm1, %%mm2 # v1 y3 u1 y2 v0 y1 u0 y0 \n\
52 movq %%mm2, (%0) # Store low YUYV \n\
53 punpckhbw %%mm1, %%mm0 # v3 y7 u3 y6 v2 y5 u2 y4 \n\
54 movq %%mm0, 8(%0) # Store high YUYV \n\
/*
 * MMX_YUV422_YVYU: asm template (string literal) for use with MMX_CALL.
 * Operands follow MMX_CALL: %0 = output line, %1 = Y, %2 = Cb, %3 = Cr.
 * Same sequence as the YUYV template except that Cb is loaded into mm2 and
 * Cr into mm1, so the chroma bytes come out swapped: the 16 stored bytes
 * are in the memory order Y0 V0 Y1 U0 Y2 V1 Y3 U1 ...
 * NOTE: the in-template comments label the two stores "YUYV"; the lane
 * diagrams on the preceding lines show that the data stored is YVYU.
 */
57 #define MMX_YUV422_YVYU " \n\
58 movq (%1), %%mm0 # Load 8 Y y7 y6 y5 y4 y3 y2 y1 y0 \n\
59 movd (%2), %%mm2 # Load 4 Cb 00 00 00 00 u3 u2 u1 u0 \n\
60 movd (%3), %%mm1 # Load 4 Cr 00 00 00 00 v3 v2 v1 v0 \n\
61 punpcklbw %%mm2, %%mm1 # u3 v3 u2 v2 u1 v1 u0 v0 \n\
62 movq %%mm0, %%mm2 # y7 y6 y5 y4 y3 y2 y1 y0 \n\
63 punpcklbw %%mm1, %%mm2 # u1 y3 v1 y2 u0 y1 v0 y0 \n\
64 movq %%mm2, (%0) # Store low YUYV \n\
65 punpckhbw %%mm1, %%mm0 # u3 y7 v3 y6 u2 y5 v2 y4 \n\
66 movq %%mm0, 8(%0) # Store high YUYV \n\
/*
 * MMX_YUV422_UYVY: asm template (string literal) for use with MMX_CALL.
 * Operands follow MMX_CALL: %0 = output line, %1 = Y, %2 = Cb, %3 = Cr.
 * Builds the interleaved chroma vector first and then unpacks it with the
 * luma as the second source, so chroma lands on the even bytes: the 16
 * stored bytes are in the memory order U0 Y0 V0 Y1 U1 Y2 V1 Y3 ...
 * Here mm0 keeps the luma; mm1 and mm2 carry the results.
 */
69 #define MMX_YUV422_UYVY " \n\
70 movq (%1), %%mm0 # Load 8 Y y7 y6 y5 y4 y3 y2 y1 y0 \n\
71 movd (%2), %%mm1 # Load 4 Cb 00 00 00 00 u3 u2 u1 u0 \n\
72 movd (%3), %%mm2 # Load 4 Cr 00 00 00 00 v3 v2 v1 v0 \n\
73 punpcklbw %%mm2, %%mm1 # v3 u3 v2 u2 v1 u1 v0 u0 \n\
74 movq %%mm1, %%mm2 # v3 u3 v2 u2 v1 u1 v0 u0 \n\
75 punpcklbw %%mm0, %%mm2 # y3 v1 y2 u1 y1 v0 y0 u0 \n\
76 movq %%mm2, (%0) # Store low UYVY \n\
77 punpckhbw %%mm0, %%mm1 # y7 v3 y6 u3 y5 v2 y4 u2 \n\
78 movq %%mm1, 8(%0) # Store high UYVY \n\
/*
 * MMX_YUV422_Y211: asm template slot for the Y211 layout.
 * NOTE(review): only the opening line of this definition is visible here and
 * it carries no instructions - confirm against the complete file whether the
 * template is intentionally empty before passing it to MMX_CALL.
 */
81 #define MMX_YUV422_Y211 " \n\
84 #elif defined(HAVE_MMX_INTRINSICS)
86 /* MMX intrinsics */
88 #include <mmintrin.h>
/*
 * MMX_CALL (intrinsics flavour): execute one intrinsic statement sequence
 * and step the plane pointers.
 *
 * Declares the three __m64 temporaries mm0, mm1 and mm2 that the sequence
 * macros assign to, pastes MMX_INSTRUCTIONS, then advances p_line by 16,
 * p_y by 8 and p_u / p_v by 4 each.  The four pointers must be in scope at
 * the point of use.
 */
#define MMX_CALL(MMX_INSTRUCTIONS)  \
    do {                            \
        __m64 mm0;                  \
        __m64 mm1;                  \
        __m64 mm2;                  \
        MMX_INSTRUCTIONS            \
        p_y    += 8;                \
        p_u    += 4;                \
        p_v    += 4;                \
        p_line += 16;               \
    } while(0)
/* MMX_END (intrinsics flavour): expands to a call to _mm_empty(), the
 * intrinsic form of `emms`, which empties the MMX state. */
98 #define MMX_END _mm_empty()
/*
 * MMX_YUV422_YUYV (intrinsics): pack 8 Y, 4 Cb and 4 Cr bytes into 16 bytes
 * at p_line in the order Y0 U0 Y1 V0 ...  Expects mm0, mm1, mm2 (as declared
 * by MMX_CALL) and p_line, p_y, p_u, p_v in scope.  On exit mm2 holds the
 * low half of the result, mm0 the high half and mm1 the interleaved chroma.
 */
#define MMX_YUV422_YUYV                                                     \
    mm0 = (__m64)*(uint64_t*)p_y;                 /* y7 .. y0 */            \
    mm1 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(*(int*)p_u),                    \
                           _mm_cvtsi32_si64(*(int*)p_v)); /* v3 u3 .. v0 u0 */ \
    mm2 = _mm_unpacklo_pi8(mm0, mm1);             /* v1 y3 u1 y2 v0 y1 u0 y0 */ \
    mm0 = _mm_unpackhi_pi8(mm0, mm1);             /* v3 y7 u3 y6 v2 y5 u2 y4 */ \
    *(uint64_t*)p_line       = (uint64_t)mm2;                               \
    *(uint64_t*)(p_line + 8) = (uint64_t)mm0;
/*
 * MMX_YUV422_YVYU (intrinsics): pack 8 Y, 4 Cb and 4 Cr bytes into 16 bytes
 * at p_line in the order Y0 V0 Y1 U0 ...  Expects mm0, mm1, mm2 (as declared
 * by MMX_CALL) and p_line, p_y, p_u, p_v in scope.  On exit mm2 holds the
 * low half of the result, mm0 the high half and mm1 the interleaved chroma.
 */
#define MMX_YUV422_YVYU                                                     \
    mm0 = (__m64)*(uint64_t*)p_y;                 /* y7 .. y0 */            \
    mm1 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(*(int*)p_v),                    \
                           _mm_cvtsi32_si64(*(int*)p_u)); /* u3 v3 .. u0 v0 */ \
    mm2 = _mm_unpacklo_pi8(mm0, mm1);             /* u1 y3 v1 y2 u0 y1 v0 y0 */ \
    mm0 = _mm_unpackhi_pi8(mm0, mm1);             /* u3 y7 v3 y6 u2 y5 v2 y4 */ \
    *(uint64_t*)p_line       = (uint64_t)mm2;                               \
    *(uint64_t*)(p_line + 8) = (uint64_t)mm0;
/*
 * MMX_YUV422_UYVY (intrinsics): pack 8 Y, 4 Cb and 4 Cr bytes into 16 bytes
 * at p_line in the order U0 Y0 V0 Y1 ...  Expects mm0, mm1, mm2 (as declared
 * by MMX_CALL) and p_line, p_y, p_u, p_v in scope.  On exit mm0 still holds
 * the luma, mm2 the low half of the result and mm1 the high half.
 */
#define MMX_YUV422_UYVY                                                     \
    mm0 = (__m64)*(uint64_t*)p_y;                 /* y7 .. y0 */            \
    mm1 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(*(int*)p_u),                    \
                           _mm_cvtsi32_si64(*(int*)p_v)); /* v3 u3 .. v0 u0 */ \
    mm2 = _mm_unpacklo_pi8(mm1, mm0);             /* y3 v1 y2 u1 y1 v0 y0 u0 */ \
    mm1 = _mm_unpackhi_pi8(mm1, mm0);             /* y7 v3 y6 u3 y5 v2 y4 u2 */ \
    *(uint64_t*)p_line       = (uint64_t)mm2;                               \
    *(uint64_t*)(p_line + 8) = (uint64_t)mm1;
133 #endif
135 #elif defined( MODULE_NAME_IS_i422_yuy2_sse2 )
137 #if defined(CAN_COMPILE_SSE2)
139 /* SSE2 assembly */
/*
 * SSE2_CALL: run one SSE2 asm template and step the plane pointers.
 *
 * SSE2_INSTRUCTIONS is a string literal with the instruction sequence. It
 * can refer to the operands %0 = p_line (packed output), %1 = p_y, %2 = p_u
 * and %3 = p_v; all four pointers must be in scope where the macro is used.
 * After the asm statement the pointers advance by one group: 32 output
 * bytes, 16 Y bytes and 8 bytes of each chroma plane.
 *
 * The output-operand list is empty: the pointers are inputs only.  Because
 * they are plain register inputs the compiler cannot tell that the template
 * reads and writes the memory behind them, nor that it overwrites
 * xmm0-xmm2; the clobber list states both explicitly.
 */
#define SSE2_CALL(SSE2_INSTRUCTIONS)                \
    do {                                            \
        __asm__ __volatile__(                       \
            ".p2align 3 \n\t"                       \
            SSE2_INSTRUCTIONS                       \
            :                                       \
            : "r" (p_line), "r" (p_y),              \
              "r" (p_u), "r" (p_v)                  \
            : "memory", "xmm0", "xmm1", "xmm2" );   \
        p_line += 32; p_y += 16;                    \
        p_u += 8; p_v += 8;                         \
    } while(0)
/* SSE2_END: expands to an asm statement that issues `sfence` with a
 * "memory" clobber.  The aligned templates in this file store with movntdq
 * (non-temporal stores); sfence orders those stores before later ones. */
153 #define SSE2_END __asm__ __volatile__ ( "sfence" ::: "memory" )
/*
 * SSE2_YUV422_YUYV_ALIGNED: asm template (string literal) for SSE2_CALL.
 * Operands follow SSE2_CALL: %0 = output line, %1 = Y, %2 = Cb, %3 = Cr.
 * Loads 16 Y bytes (movdqa) and 8 bytes of each chroma plane (movq),
 * interleaves them and writes 32 bytes at %0 with non-temporal stores, in
 * the memory order Y0 U0 Y1 V0 ...  movdqa and movntdq both require their
 * memory operand to be 16-byte aligned, so %1 and %0 must be.
 * NOTE: the in-template lane diagrams show only 8 Y / 4 chroma lanes; the
 * instructions operate on twice that many.
 */
155 #define SSE2_YUV422_YUYV_ALIGNED " \n\
156 movdqa (%1), %%xmm0 # Load 8 Y y7 y6 y5 y4 y3 y2 y1 y0 \n\
157 movq (%2), %%xmm1 # Load 4 Cb 00 00 00 00 u3 u2 u1 u0 \n\
158 movq (%3), %%xmm2 # Load 4 Cr 00 00 00 00 v3 v2 v1 v0 \n\
159 punpcklbw %%xmm2, %%xmm1 # v3 u3 v2 u2 v1 u1 v0 u0 \n\
160 movdqa %%xmm0, %%xmm2 # y7 y6 y5 y4 y3 y2 y1 y0 \n\
161 punpcklbw %%xmm1, %%xmm2 # v1 y3 u1 y2 v0 y1 u0 y0 \n\
162 movntdq %%xmm2, (%0) # Store low YUYV \n\
163 punpckhbw %%xmm1, %%xmm0 # v3 y7 u3 y6 v2 y5 u2 y4 \n\
164 movntdq %%xmm0, 16(%0) # Store high YUYV \n\
/*
 * SSE2_YUV422_YUYV_UNALIGNED: asm template (string literal) for SSE2_CALL.
 * Operands follow SSE2_CALL: %0 = output line, %1 = Y, %2 = Cb, %3 = Cr.
 * Same data flow as the aligned YUYV template but uses movdqu for the luma
 * load and for both stores, so neither %1 nor %0 needs 16-byte alignment.
 * Writes 32 bytes in the memory order Y0 U0 Y1 V0 ...
 * NOTE(review): prefetchnta issues a prefetch of the output line with a
 * non-temporal hint; confirm that matches the intent stated in the
 * in-template comment ("not to cache output").
 * NOTE: the in-template lane diagrams show only 8 Y / 4 chroma lanes; the
 * instructions operate on twice that many.
 */
167 #define SSE2_YUV422_YUYV_UNALIGNED " \n\
168 movdqu (%1), %%xmm0 # Load 8 Y y7 y6 y5 y4 y3 y2 y1 y0 \n\
169 movq (%2), %%xmm1 # Load 4 Cb 00 00 00 00 u3 u2 u1 u0 \n\
170 movq (%3), %%xmm2 # Load 4 Cr 00 00 00 00 v3 v2 v1 v0 \n\
171 prefetchnta (%0) # Tell CPU not to cache output YUYV data \n\
172 punpcklbw %%xmm2, %%xmm1 # v3 u3 v2 u2 v1 u1 v0 u0 \n\
173 movdqa %%xmm0, %%xmm2 # y7 y6 y5 y4 y3 y2 y1 y0 \n\
174 punpcklbw %%xmm1, %%xmm2 # v1 y3 u1 y2 v0 y1 u0 y0 \n\
175 movdqu %%xmm2, (%0) # Store low YUYV \n\
176 punpckhbw %%xmm1, %%xmm0 # v3 y7 u3 y6 v2 y5 u2 y4 \n\
177 movdqu %%xmm0, 16(%0) # Store high YUYV \n\
/*
 * SSE2_YUV422_YVYU_ALIGNED: asm template (string literal) for SSE2_CALL.
 * Operands follow SSE2_CALL: %0 = output line, %1 = Y, %2 = Cb, %3 = Cr.
 * Same sequence as the aligned YUYV template except that Cb is loaded into
 * xmm2 and Cr into xmm1, which swaps the chroma bytes: the 32 bytes written
 * are in the memory order Y0 V0 Y1 U0 ...  movdqa and movntdq require %1
 * and %0 to be 16-byte aligned.
 * NOTE: the in-template comments label the stores "YUYV" although the lane
 * diagrams show YVYU data, and the diagrams show only half of the lanes.
 */
180 #define SSE2_YUV422_YVYU_ALIGNED " \n\
181 movdqa (%1), %%xmm0 # Load 8 Y y7 y6 y5 y4 y3 y2 y1 y0 \n\
182 movq (%2), %%xmm2 # Load 4 Cb 00 00 00 00 u3 u2 u1 u0 \n\
183 movq (%3), %%xmm1 # Load 4 Cr 00 00 00 00 v3 v2 v1 v0 \n\
184 punpcklbw %%xmm2, %%xmm1 # u3 v3 u2 v2 u1 v1 u0 v0 \n\
185 movdqa %%xmm0, %%xmm2 # y7 y6 y5 y4 y3 y2 y1 y0 \n\
186 punpcklbw %%xmm1, %%xmm2 # u1 y3 v1 y2 u0 y1 v0 y0 \n\
187 movntdq %%xmm2, (%0) # Store low YUYV \n\
188 punpckhbw %%xmm1, %%xmm0 # u3 y7 v3 y6 u2 y5 v2 y4 \n\
189 movntdq %%xmm0, 16(%0) # Store high YUYV \n\
/*
 * SSE2_YUV422_YVYU_UNALIGNED: asm template (string literal) for SSE2_CALL.
 * Operands follow SSE2_CALL: %0 = output line, %1 = Y, %2 = Cb, %3 = Cr.
 * Same data flow as the aligned YVYU template but uses movdqu for the luma
 * load and for both stores, so neither %1 nor %0 needs 16-byte alignment.
 * Writes 32 bytes in the memory order Y0 V0 Y1 U0 ...
 * NOTE(review): prefetchnta issues a prefetch of the output line with a
 * non-temporal hint; confirm that matches the intent stated in the
 * in-template comment ("not to cache output").
 * NOTE: the in-template comments say "YUYV" where the data is YVYU, and the
 * lane diagrams show only half of the lanes.
 */
192 #define SSE2_YUV422_YVYU_UNALIGNED " \n\
193 movdqu (%1), %%xmm0 # Load 8 Y y7 y6 y5 y4 y3 y2 y1 y0 \n\
194 movq (%2), %%xmm2 # Load 4 Cb 00 00 00 00 u3 u2 u1 u0 \n\
195 movq (%3), %%xmm1 # Load 4 Cr 00 00 00 00 v3 v2 v1 v0 \n\
196 prefetchnta (%0) # Tell CPU not to cache output YUYV data \n\
197 punpcklbw %%xmm2, %%xmm1 # u3 v3 u2 v2 u1 v1 u0 v0 \n\
198 movdqa %%xmm0, %%xmm2 # y7 y6 y5 y4 y3 y2 y1 y0 \n\
199 punpcklbw %%xmm1, %%xmm2 # u1 y3 v1 y2 u0 y1 v0 y0 \n\
200 movdqu %%xmm2, (%0) # Store low YUYV \n\
201 punpckhbw %%xmm1, %%xmm0 # u3 y7 v3 y6 u2 y5 v2 y4 \n\
202 movdqu %%xmm0, 16(%0) # Store high YUYV \n\
/*
 * SSE2_YUV422_UYVY_ALIGNED: asm template (string literal) for SSE2_CALL.
 * Operands follow SSE2_CALL: %0 = output line, %1 = Y, %2 = Cb, %3 = Cr.
 * Interleaves the chroma first and then unpacks it with the luma as the
 * second source, so chroma lands on the even bytes: the 32 bytes written
 * (with non-temporal stores) are in the memory order U0 Y0 V0 Y1 ...
 * movdqa and movntdq require %1 and %0 to be 16-byte aligned.
 * NOTE: the in-template lane diagrams show only 8 Y / 4 chroma lanes; the
 * instructions operate on twice that many.
 */
205 #define SSE2_YUV422_UYVY_ALIGNED " \n\
206 movdqa (%1), %%xmm0 # Load 8 Y y7 y6 y5 y4 y3 y2 y1 y0 \n\
207 movq (%2), %%xmm1 # Load 4 Cb 00 00 00 00 u3 u2 u1 u0 \n\
208 movq (%3), %%xmm2 # Load 4 Cr 00 00 00 00 v3 v2 v1 v0 \n\
209 punpcklbw %%xmm2, %%xmm1 # v3 u3 v2 u2 v1 u1 v0 u0 \n\
210 movdqa %%xmm1, %%xmm2 # v3 u3 v2 u2 v1 u1 v0 u0 \n\
211 punpcklbw %%xmm0, %%xmm2 # y3 v1 y2 u1 y1 v0 y0 u0 \n\
212 movntdq %%xmm2, (%0) # Store low UYVY \n\
213 punpckhbw %%xmm0, %%xmm1 # y7 v3 y6 u3 y5 v2 y4 u2 \n\
214 movntdq %%xmm1, 16(%0) # Store high UYVY \n\
/*
 * SSE2_YUV422_UYVY_UNALIGNED: asm template (string literal) for SSE2_CALL.
 * Operands follow SSE2_CALL: %0 = output line, %1 = Y, %2 = Cb, %3 = Cr.
 * Same data flow as the aligned UYVY template but uses movdqu for the luma
 * load and for both stores, so neither %1 nor %0 needs 16-byte alignment.
 * Writes 32 bytes in the memory order U0 Y0 V0 Y1 ...
 * NOTE(review): prefetchnta issues a prefetch of the output line with a
 * non-temporal hint; confirm that matches the intent stated in the
 * in-template comment, which also says "YUYV" although this template
 * produces UYVY.
 * NOTE: the lane diagrams show only half of the lanes actually processed.
 */
217 #define SSE2_YUV422_UYVY_UNALIGNED " \n\
218 movdqu (%1), %%xmm0 # Load 8 Y y7 y6 y5 y4 y3 y2 y1 y0 \n\
219 movq (%2), %%xmm1 # Load 4 Cb 00 00 00 00 u3 u2 u1 u0 \n\
220 movq (%3), %%xmm2 # Load 4 Cr 00 00 00 00 v3 v2 v1 v0 \n\
221 prefetchnta (%0) # Tell CPU not to cache output YUYV data \n\
222 punpcklbw %%xmm2, %%xmm1 # v3 u3 v2 u2 v1 u1 v0 u0 \n\
223 movdqa %%xmm1, %%xmm2 # v3 u3 v2 u2 v1 u1 v0 u0 \n\
224 punpcklbw %%xmm0, %%xmm2 # y3 v1 y2 u1 y1 v0 y0 u0 \n\
225 movdqu %%xmm2, (%0) # Store low UYVY \n\
226 punpckhbw %%xmm0, %%xmm1 # y7 v3 y6 u3 y5 v2 y4 u2 \n\
227 movdqu %%xmm1, 16(%0) # Store high UYVY \n\
230 #elif defined(HAVE_SSE2_INTRINSICS)
232 /* SSE2 intrinsics */
234 #include <emmintrin.h>
/*
 * SSE2_CALL (intrinsics flavour): execute one intrinsic statement sequence
 * and step the plane pointers.
 *
 * Declares the three __m128i temporaries xmm0, xmm1 and xmm2 that the
 * sequence macros assign to, pastes SSE2_INSTRUCTIONS, then advances p_line
 * by 32, p_y by 16 and p_u / p_v by 8 each.  The four pointers must be in
 * scope at the point of use.
 */
#define SSE2_CALL(SSE2_INSTRUCTIONS)    \
    do {                                \
        __m128i xmm0;                   \
        __m128i xmm1;                   \
        __m128i xmm2;                   \
        SSE2_INSTRUCTIONS               \
        p_y    += 16;                   \
        p_u    += 8;                    \
        p_v    += 8;                    \
        p_line += 32;                   \
    } while(0)
/* SSE2_END (intrinsics flavour): expands to a call to _mm_sfence().  The
 * aligned sequences in this file write with _mm_stream_si128 (non-temporal
 * stores); the store fence orders those writes before later stores. */
244 #define SSE2_END _mm_sfence()
/*
 * SSE2_YUV422_YUYV_ALIGNED (intrinsics): pack 16 Y, 8 Cb and 8 Cr bytes into
 * 32 bytes at p_line in the order Y0 U0 Y1 V0 ...  Uses an aligned load for
 * p_y and streaming stores for p_line, so both must be 16-byte aligned.
 * Expects xmm0, xmm1, xmm2 (as declared by SSE2_CALL) and the four plane
 * pointers in scope.  On exit xmm2 holds the low half of the result, xmm0
 * the high half and xmm1 the interleaved chroma.
 */
#define SSE2_YUV422_YUYV_ALIGNED                                    \
    xmm0 = _mm_load_si128((__m128i *)p_y);                          \
    xmm1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)p_u),       \
                             _mm_loadl_epi64((__m128i *)p_v));      \
    xmm2 = _mm_unpacklo_epi8(xmm0, xmm1);                           \
    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1);                           \
    _mm_stream_si128((__m128i*)(p_line), xmm2);                     \
    _mm_stream_si128((__m128i*)(p_line + 16), xmm0);
/*
 * SSE2_YUV422_YUYV_UNALIGNED (intrinsics): pack 16 Y, 8 Cb and 8 Cr bytes
 * into 32 bytes at p_line in the order Y0 U0 Y1 V0 ...  Uses unaligned
 * load/store intrinsics, so p_y and p_line need no particular alignment.
 * Expects xmm0, xmm1, xmm2 (as declared by SSE2_CALL) and the four plane
 * pointers in scope.  On exit xmm2 holds the low half of the result, xmm0
 * the high half and xmm1 the interleaved chroma.
 */
#define SSE2_YUV422_YUYV_UNALIGNED                                  \
    xmm0 = _mm_loadu_si128((__m128i *)p_y);                         \
    xmm1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)p_u),       \
                             _mm_loadl_epi64((__m128i *)p_v));      \
    xmm2 = _mm_unpacklo_epi8(xmm0, xmm1);                           \
    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1);                           \
    _mm_storeu_si128((__m128i*)(p_line), xmm2);                     \
    _mm_storeu_si128((__m128i*)(p_line + 16), xmm0);
/*
 * SSE2_YUV422_YVYU_ALIGNED (intrinsics): pack 16 Y, 8 Cb and 8 Cr bytes into
 * 32 bytes at p_line in the order Y0 V0 Y1 U0 ...  Uses an aligned load for
 * p_y and streaming stores for p_line, so both must be 16-byte aligned.
 * Expects xmm0, xmm1, xmm2 (as declared by SSE2_CALL) and the four plane
 * pointers in scope.  On exit xmm2 holds the low half of the result, xmm0
 * the high half and xmm1 the interleaved chroma.
 */
#define SSE2_YUV422_YVYU_ALIGNED                                    \
    xmm0 = _mm_load_si128((__m128i *)p_y);                          \
    xmm1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)p_v),       \
                             _mm_loadl_epi64((__m128i *)p_u));      \
    xmm2 = _mm_unpacklo_epi8(xmm0, xmm1);                           \
    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1);                           \
    _mm_stream_si128((__m128i*)(p_line), xmm2);                     \
    _mm_stream_si128((__m128i*)(p_line + 16), xmm0);
/*
 * SSE2_YUV422_YVYU_UNALIGNED (intrinsics): pack 16 Y, 8 Cb and 8 Cr bytes
 * into 32 bytes at p_line in the order Y0 V0 Y1 U0 ...  Uses unaligned
 * load/store intrinsics, so p_y and p_line need no particular alignment.
 * Expects xmm0, xmm1, xmm2 (as declared by SSE2_CALL) and the four plane
 * pointers in scope.  On exit xmm2 holds the low half of the result, xmm0
 * the high half and xmm1 the interleaved chroma.
 */
#define SSE2_YUV422_YVYU_UNALIGNED                                  \
    xmm0 = _mm_loadu_si128((__m128i *)p_y);                         \
    xmm1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)p_v),       \
                             _mm_loadl_epi64((__m128i *)p_u));      \
    xmm2 = _mm_unpacklo_epi8(xmm0, xmm1);                           \
    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1);                           \
    _mm_storeu_si128((__m128i*)(p_line), xmm2);                     \
    _mm_storeu_si128((__m128i*)(p_line + 16), xmm0);
/*
 * SSE2_YUV422_UYVY_ALIGNED (intrinsics): pack 16 Y, 8 Cb and 8 Cr bytes into
 * 32 bytes at p_line in the order U0 Y0 V0 Y1 ...  Uses an aligned load for
 * p_y and streaming stores for p_line, so both must be 16-byte aligned.
 * Expects xmm0, xmm1, xmm2 (as declared by SSE2_CALL) and the four plane
 * pointers in scope.  On exit xmm0 still holds the luma, xmm2 the low half
 * of the result and xmm1 the high half.
 */
#define SSE2_YUV422_UYVY_ALIGNED                                    \
    xmm0 = _mm_load_si128((__m128i *)p_y);                          \
    xmm1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)p_u),       \
                             _mm_loadl_epi64((__m128i *)p_v));      \
    xmm2 = _mm_unpacklo_epi8(xmm1, xmm0);                           \
    xmm1 = _mm_unpackhi_epi8(xmm1, xmm0);                           \
    _mm_stream_si128((__m128i*)(p_line), xmm2);                     \
    _mm_stream_si128((__m128i*)(p_line + 16), xmm1);
/*
 * SSE2_YUV422_UYVY_UNALIGNED (intrinsics): pack 16 Y, 8 Cb and 8 Cr bytes
 * into 32 bytes at p_line in the order U0 Y0 V0 Y1 ...  Uses unaligned
 * load/store intrinsics, so p_y and p_line need no particular alignment.
 * Expects xmm0, xmm1, xmm2 (as declared by SSE2_CALL) and the four plane
 * pointers in scope.  On exit xmm0 still holds the luma, xmm2 the low half
 * of the result and xmm1 the high half.
 */
#define SSE2_YUV422_UYVY_UNALIGNED                                  \
    xmm0 = _mm_loadu_si128((__m128i *)p_y);                         \
    xmm1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)p_u),       \
                             _mm_loadl_epi64((__m128i *)p_v));      \
    xmm2 = _mm_unpacklo_epi8(xmm1, xmm0);                           \
    xmm1 = _mm_unpackhi_epi8(xmm1, xmm0);                           \
    _mm_storeu_si128((__m128i*)(p_line), xmm2);                     \
    _mm_storeu_si128((__m128i*)(p_line + 16), xmm1);
312 #endif
314 #endif
/*
 * C_YUV422_YUYV: plain-C packing of one pixel pair.
 *
 * Writes four bytes through p_line in the order Y0 U Y1 V and
 * post-increments every pointer it touches: p_line by 4, p_y by 2, p_u and
 * p_v by 1.  All four arguments must be modifiable pointer lvalues and are
 * evaluated more than once.
 *
 * The body is wrapped in do { } while(0) so the macro acts as one statement
 * (e.g. under an unbraced if/for); invoke it with a trailing semicolon.
 */
#define C_YUV422_YUYV( p_line, p_y, p_u, p_v )      \
    do {                                            \
        *(p_line)++ = *(p_y)++;                     \
        *(p_line)++ = *(p_u)++;                     \
        *(p_line)++ = *(p_y)++;                     \
        *(p_line)++ = *(p_v)++;                     \
    } while(0)
/*
 * C_YUV422_YVYU: plain-C packing of one pixel pair.
 *
 * Writes four bytes through p_line in the order Y0 V Y1 U and
 * post-increments every pointer it touches: p_line by 4, p_y by 2, p_u and
 * p_v by 1.  All four arguments must be modifiable pointer lvalues and are
 * evaluated more than once.
 *
 * The body is wrapped in do { } while(0) so the macro acts as one statement
 * (e.g. under an unbraced if/for); invoke it with a trailing semicolon.
 */
#define C_YUV422_YVYU( p_line, p_y, p_u, p_v )      \
    do {                                            \
        *(p_line)++ = *(p_y)++;                     \
        *(p_line)++ = *(p_v)++;                     \
        *(p_line)++ = *(p_y)++;                     \
        *(p_line)++ = *(p_u)++;                     \
    } while(0)
/*
 * C_YUV422_UYVY: plain-C packing of one pixel pair.
 *
 * Writes four bytes through p_line in the order U Y0 V Y1 and
 * post-increments every pointer it touches: p_line by 4, p_y by 2, p_u and
 * p_v by 1.  All four arguments must be modifiable pointer lvalues and are
 * evaluated more than once.
 *
 * The body is wrapped in do { } while(0) so the macro acts as one statement
 * (e.g. under an unbraced if/for); invoke it with a trailing semicolon.
 */
#define C_YUV422_UYVY( p_line, p_y, p_u, p_v )      \
    do {                                            \
        *(p_line)++ = *(p_u)++;                     \
        *(p_line)++ = *(p_y)++;                     \
        *(p_line)++ = *(p_v)++;                     \
        *(p_line)++ = *(p_y)++;                     \
    } while(0)
/*
 * C_YUV422_Y211: plain-C packing of one four-byte Y211 group.
 *
 * Writes Y, (U - 0x80), Y, (V - 0x80) through p_line.  Every other luma
 * sample is skipped: p_y advances by 2 after each luma byte (4 in total),
 * and p_u / p_v each advance by 2; p_line advances by 4.  The chroma
 * subtraction is done in int and narrowed on assignment to *p_line.
 * All four arguments must be modifiable pointer lvalues and are evaluated
 * more than once; they are parenthesised at every use.
 *
 * The body is wrapped in do { } while(0) so the macro acts as one statement
 * (e.g. under an unbraced if/for); invoke it with a trailing semicolon.
 */
#define C_YUV422_Y211( p_line, p_y, p_u, p_v )      \
    do {                                            \
        *(p_line)++ = *(p_y);                       \
        (p_y) += 2;                                 \
        *(p_line)++ = *(p_u) - 0x80;                \
        (p_u) += 2;                                 \
        *(p_line)++ = *(p_y);                       \
        (p_y) += 2;                                 \
        *(p_line)++ = *(p_v) - 0x80;                \
        (p_v) += 2;                                 \
    } while(0)