modules/video_chroma/i422_yuy2.h
/*****************************************************************************
 * i422_yuy2.h : YUV to YUV conversion module for vlc
 *****************************************************************************
 * Copyright (C) 2002 VLC authors and VideoLAN
 * $Id$
 *
 * Authors: Samuel Hocevar <sam@zoy.org>
 *          Damien Fouilleul <damienf@videolan.org>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
 *****************************************************************************/
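
/* The macros below pack one planar 4:2:2 scan line into an interleaved
 * buffer, eight (MMX) or sixteen (SSE2) pixels at a time, writing through
 * p_line and reading from the p_y/p_u/p_v plane pointers the caller keeps
 * in scope.  The resulting byte orders, as also spelled out by the C
 * fallback macros at the end of this file, are:
 *
 *   YUYV : Y0 U0 Y1 V0   Y2 U1 Y3 V1 ...
 *   YVYU : Y0 V0 Y1 U0   Y2 V1 Y3 U1 ...
 *   UYVY : U0 Y0 V0 Y1   U1 Y2 V1 Y3 ...
 *   Y211 : Y0 U0 Y2 V0 ...  (every second luma sample, chroma re-centred
 *                            around zero by subtracting 0x80)
 */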
#ifdef MODULE_NAME_IS_i422_yuy2_mmx

#if defined(CAN_COMPILE_MMX)

/* MMX assembly */

#define MMX_CALL(MMX_INSTRUCTIONS)      \
    do {                                \
    __asm__ __volatile__(               \
        ".p2align 3 \n\t"               \
        MMX_INSTRUCTIONS                \
        :                               \
        : "r" (p_line), "r" (p_y),      \
          "r" (p_u), "r" (p_v)          \
        : "mm0", "mm1", "mm2" );        \
    p_line += 16; p_y += 8;             \
    p_u += 4; p_v += 4;                 \
    } while(0)

#define MMX_END __asm__ __volatile__ ( "emms" )
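
/* A minimal sketch of how this path is meant to be driven (the real loop,
 * including the handling of widths that are not a multiple of 8, lives in
 * i422_yuy2.c; i_width here is only illustrative):
 *
 *     for( int i_x = i_width / 8 ; i_x-- ; )
 *     {
 *         MMX_CALL( MMX_YUV422_YUYV );   // 8 pixels, 16 output bytes
 *     }
 *     MMX_END;                           // emms before touching the FPU
 */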
#define MMX_YUV422_YUYV "                                               \n\
movq       (%1), %%mm0  # Load 8 Y           y7 y6 y5 y4 y3 y2 y1 y0   \n\
movd       (%2), %%mm1  # Load 4 Cb          00 00 00 00 u3 u2 u1 u0   \n\
movd       (%3), %%mm2  # Load 4 Cr          00 00 00 00 v3 v2 v1 v0   \n\
punpcklbw %%mm2, %%mm1  #                    v3 u3 v2 u2 v1 u1 v0 u0   \n\
movq      %%mm0, %%mm2  #                    y7 y6 y5 y4 y3 y2 y1 y0   \n\
punpcklbw %%mm1, %%mm2  #                    v1 y3 u1 y2 v0 y1 u0 y0   \n\
movq      %%mm2, (%0)   # Store low YUYV                               \n\
punpckhbw %%mm1, %%mm0  #                    v3 y7 u3 y6 v2 y5 u2 y4   \n\
movq      %%mm0, 8(%0)  # Store high YUYV                              \n\
"
#define MMX_YUV422_YVYU "                                               \n\
movq       (%1), %%mm0  # Load 8 Y           y7 y6 y5 y4 y3 y2 y1 y0   \n\
movd       (%2), %%mm2  # Load 4 Cb          00 00 00 00 u3 u2 u1 u0   \n\
movd       (%3), %%mm1  # Load 4 Cr          00 00 00 00 v3 v2 v1 v0   \n\
punpcklbw %%mm2, %%mm1  #                    u3 v3 u2 v2 u1 v1 u0 v0   \n\
movq      %%mm0, %%mm2  #                    y7 y6 y5 y4 y3 y2 y1 y0   \n\
punpcklbw %%mm1, %%mm2  #                    u1 y3 v1 y2 u0 y1 v0 y0   \n\
movq      %%mm2, (%0)   # Store low YVYU                               \n\
punpckhbw %%mm1, %%mm0  #                    u3 y7 v3 y6 u2 y5 v2 y4   \n\
movq      %%mm0, 8(%0)  # Store high YVYU                              \n\
"
#define MMX_YUV422_UYVY "                                               \n\
movq       (%1), %%mm0  # Load 8 Y           y7 y6 y5 y4 y3 y2 y1 y0   \n\
movd       (%2), %%mm1  # Load 4 Cb          00 00 00 00 u3 u2 u1 u0   \n\
movd       (%3), %%mm2  # Load 4 Cr          00 00 00 00 v3 v2 v1 v0   \n\
punpcklbw %%mm2, %%mm1  #                    v3 u3 v2 u2 v1 u1 v0 u0   \n\
movq      %%mm1, %%mm2  #                    v3 u3 v2 u2 v1 u1 v0 u0   \n\
punpcklbw %%mm0, %%mm2  #                    y3 v1 y2 u1 y1 v0 y0 u0   \n\
movq      %%mm2, (%0)   # Store low UYVY                               \n\
punpckhbw %%mm0, %%mm1  #                    y7 v3 y6 u3 y5 v2 y4 u2   \n\
movq      %%mm1, 8(%0)  # Store high UYVY                              \n\
"
#define MMX_YUV422_Y211 "                                               \n\
"
#elif defined(HAVE_MMX_INTRINSICS)

/* MMX intrinsics */

#include <mmintrin.h>

#define MMX_CALL(MMX_INSTRUCTIONS)  \
    do {                            \
        __m64 mm0, mm1, mm2;        \
        MMX_INSTRUCTIONS            \
        p_line += 16; p_y += 8;     \
        p_u += 4; p_v += 4;         \
    } while(0)

#define MMX_END _mm_empty()
#define MMX_YUV422_YUYV                             \
    mm0 = (__m64)*(uint64_t*)p_y;                   \
    mm1 = _mm_cvtsi32_si64(*(int*)p_u);             \
    mm2 = _mm_cvtsi32_si64(*(int*)p_v);             \
    mm1 = _mm_unpacklo_pi8(mm1, mm2);               \
    mm2 = mm0;                                      \
    mm2 = _mm_unpacklo_pi8(mm2, mm1);               \
    *(uint64_t*)p_line = (uint64_t)mm2;             \
    mm0 = _mm_unpackhi_pi8(mm0, mm1);               \
    *(uint64_t*)(p_line+8) = (uint64_t)mm0;

#define MMX_YUV422_YVYU                             \
    mm0 = (__m64)*(uint64_t*)p_y;                   \
    mm2 = _mm_cvtsi32_si64(*(int*)p_u);             \
    mm1 = _mm_cvtsi32_si64(*(int*)p_v);             \
    mm1 = _mm_unpacklo_pi8(mm1, mm2);               \
    mm2 = mm0;                                      \
    mm2 = _mm_unpacklo_pi8(mm2, mm1);               \
    *(uint64_t*)p_line = (uint64_t)mm2;             \
    mm0 = _mm_unpackhi_pi8(mm0, mm1);               \
    *(uint64_t*)(p_line+8) = (uint64_t)mm0;

#define MMX_YUV422_UYVY                             \
    mm0 = (__m64)*(uint64_t*)p_y;                   \
    mm1 = _mm_cvtsi32_si64(*(int*)p_u);             \
    mm2 = _mm_cvtsi32_si64(*(int*)p_v);             \
    mm1 = _mm_unpacklo_pi8(mm1, mm2);               \
    mm2 = mm1;                                      \
    mm2 = _mm_unpacklo_pi8(mm2, mm0);               \
    *(uint64_t*)p_line = (uint64_t)mm2;             \
    mm1 = _mm_unpackhi_pi8(mm1, mm0);               \
    *(uint64_t*)(p_line+8) = (uint64_t)mm1;

#endif
#elif defined( MODULE_NAME_IS_i422_yuy2_sse2 )

#if defined(CAN_COMPILE_SSE2)

/* SSE2 assembly */

#define SSE2_CALL(SSE2_INSTRUCTIONS)    \
    do {                                \
    __asm__ __volatile__(               \
        ".p2align 3 \n\t"               \
        SSE2_INSTRUCTIONS               \
        :                               \
        : "r" (p_line), "r" (p_y),      \
          "r" (p_u), "r" (p_v)          \
        : "xmm0", "xmm1", "xmm2" );     \
    p_line += 32; p_y += 16;            \
    p_u += 8; p_v += 8;                 \
    } while(0)

#define SSE2_END __asm__ __volatile__ ( "sfence" ::: "memory" )
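
/* The _ALIGNED variants below use movdqa/movntdq and therefore require the
 * source and destination addresses to be 16-byte aligned; the _UNALIGNED
 * variants fall back to movdqu.  A caller would pick one of the two, for
 * instance (illustrative only, the actual alignment test is in
 * i422_yuy2.c):
 *
 *     if( !( 0x0f & ( (intptr_t)p_line | (intptr_t)p_y ) ) )
 *         SSE2_CALL( SSE2_YUV422_YUYV_ALIGNED );    // 16 pixels per call
 *     else
 *         SSE2_CALL( SSE2_YUV422_YUYV_UNALIGNED );
 *     ...
 *     SSE2_END;   // sfence so the non-temporal stores are visible
 */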
#define SSE2_YUV422_YUYV_ALIGNED "                                       \n\
movdqa     (%1), %%xmm0  # Load 16 Y          y7 y6 y5 y4 y3 y2 y1 y0   \n\
movq       (%2), %%xmm1  # Load 8 Cb          00 00 00 00 u3 u2 u1 u0   \n\
movq       (%3), %%xmm2  # Load 8 Cr          00 00 00 00 v3 v2 v1 v0   \n\
punpcklbw %%xmm2, %%xmm1 #                    v3 u3 v2 u2 v1 u1 v0 u0   \n\
movdqa    %%xmm0, %%xmm2 #                    y7 y6 y5 y4 y3 y2 y1 y0   \n\
punpcklbw %%xmm1, %%xmm2 #                    v1 y3 u1 y2 v0 y1 u0 y0   \n\
movntdq   %%xmm2, (%0)   # Store low YUYV                               \n\
punpckhbw %%xmm1, %%xmm0 #                    v3 y7 u3 y6 v2 y5 u2 y4   \n\
movntdq   %%xmm0, 16(%0) # Store high YUYV                              \n\
"

#define SSE2_YUV422_YUYV_UNALIGNED "                                     \n\
movdqu     (%1), %%xmm0  # Load 16 Y          y7 y6 y5 y4 y3 y2 y1 y0   \n\
movq       (%2), %%xmm1  # Load 8 Cb          00 00 00 00 u3 u2 u1 u0   \n\
movq       (%3), %%xmm2  # Load 8 Cr          00 00 00 00 v3 v2 v1 v0   \n\
prefetchnta (%0)         # Tell CPU not to cache output YUYV data       \n\
punpcklbw %%xmm2, %%xmm1 #                    v3 u3 v2 u2 v1 u1 v0 u0   \n\
movdqa    %%xmm0, %%xmm2 #                    y7 y6 y5 y4 y3 y2 y1 y0   \n\
punpcklbw %%xmm1, %%xmm2 #                    v1 y3 u1 y2 v0 y1 u0 y0   \n\
movdqu    %%xmm2, (%0)   # Store low YUYV                               \n\
punpckhbw %%xmm1, %%xmm0 #                    v3 y7 u3 y6 v2 y5 u2 y4   \n\
movdqu    %%xmm0, 16(%0) # Store high YUYV                              \n\
"
#define SSE2_YUV422_YVYU_ALIGNED "                                       \n\
movdqa     (%1), %%xmm0  # Load 16 Y          y7 y6 y5 y4 y3 y2 y1 y0   \n\
movq       (%2), %%xmm2  # Load 8 Cb          00 00 00 00 u3 u2 u1 u0   \n\
movq       (%3), %%xmm1  # Load 8 Cr          00 00 00 00 v3 v2 v1 v0   \n\
punpcklbw %%xmm2, %%xmm1 #                    u3 v3 u2 v2 u1 v1 u0 v0   \n\
movdqa    %%xmm0, %%xmm2 #                    y7 y6 y5 y4 y3 y2 y1 y0   \n\
punpcklbw %%xmm1, %%xmm2 #                    u1 y3 v1 y2 u0 y1 v0 y0   \n\
movntdq   %%xmm2, (%0)   # Store low YVYU                               \n\
punpckhbw %%xmm1, %%xmm0 #                    u3 y7 v3 y6 u2 y5 v2 y4   \n\
movntdq   %%xmm0, 16(%0) # Store high YVYU                              \n\
"

#define SSE2_YUV422_YVYU_UNALIGNED "                                     \n\
movdqu     (%1), %%xmm0  # Load 16 Y          y7 y6 y5 y4 y3 y2 y1 y0   \n\
movq       (%2), %%xmm2  # Load 8 Cb          00 00 00 00 u3 u2 u1 u0   \n\
movq       (%3), %%xmm1  # Load 8 Cr          00 00 00 00 v3 v2 v1 v0   \n\
prefetchnta (%0)         # Tell CPU not to cache output YVYU data       \n\
punpcklbw %%xmm2, %%xmm1 #                    u3 v3 u2 v2 u1 v1 u0 v0   \n\
movdqa    %%xmm0, %%xmm2 #                    y7 y6 y5 y4 y3 y2 y1 y0   \n\
punpcklbw %%xmm1, %%xmm2 #                    u1 y3 v1 y2 u0 y1 v0 y0   \n\
movdqu    %%xmm2, (%0)   # Store low YVYU                               \n\
punpckhbw %%xmm1, %%xmm0 #                    u3 y7 v3 y6 u2 y5 v2 y4   \n\
movdqu    %%xmm0, 16(%0) # Store high YVYU                              \n\
"
#define SSE2_YUV422_UYVY_ALIGNED "                                       \n\
movdqa     (%1), %%xmm0  # Load 16 Y          y7 y6 y5 y4 y3 y2 y1 y0   \n\
movq       (%2), %%xmm1  # Load 8 Cb          00 00 00 00 u3 u2 u1 u0   \n\
movq       (%3), %%xmm2  # Load 8 Cr          00 00 00 00 v3 v2 v1 v0   \n\
punpcklbw %%xmm2, %%xmm1 #                    v3 u3 v2 u2 v1 u1 v0 u0   \n\
movdqa    %%xmm1, %%xmm2 #                    v3 u3 v2 u2 v1 u1 v0 u0   \n\
punpcklbw %%xmm0, %%xmm2 #                    y3 v1 y2 u1 y1 v0 y0 u0   \n\
movntdq   %%xmm2, (%0)   # Store low UYVY                               \n\
punpckhbw %%xmm0, %%xmm1 #                    y7 v3 y6 u3 y5 v2 y4 u2   \n\
movntdq   %%xmm1, 16(%0) # Store high UYVY                              \n\
"

#define SSE2_YUV422_UYVY_UNALIGNED "                                     \n\
movdqu     (%1), %%xmm0  # Load 16 Y          y7 y6 y5 y4 y3 y2 y1 y0   \n\
movq       (%2), %%xmm1  # Load 8 Cb          00 00 00 00 u3 u2 u1 u0   \n\
movq       (%3), %%xmm2  # Load 8 Cr          00 00 00 00 v3 v2 v1 v0   \n\
prefetchnta (%0)         # Tell CPU not to cache output UYVY data       \n\
punpcklbw %%xmm2, %%xmm1 #                    v3 u3 v2 u2 v1 u1 v0 u0   \n\
movdqa    %%xmm1, %%xmm2 #                    v3 u3 v2 u2 v1 u1 v0 u0   \n\
punpcklbw %%xmm0, %%xmm2 #                    y3 v1 y2 u1 y1 v0 y0 u0   \n\
movdqu    %%xmm2, (%0)   # Store low UYVY                               \n\
punpckhbw %%xmm0, %%xmm1 #                    y7 v3 y6 u3 y5 v2 y4 u2   \n\
movdqu    %%xmm1, 16(%0) # Store high UYVY                              \n\
"
#elif defined(HAVE_SSE2_INTRINSICS)

/* SSE2 intrinsics */

#include <emmintrin.h>

#define SSE2_CALL(SSE2_INSTRUCTIONS)    \
    do {                                \
        __m128i xmm0, xmm1, xmm2;       \
        SSE2_INSTRUCTIONS               \
        p_line += 32; p_y += 16;        \
        p_u += 8; p_v += 8;             \
    } while(0)

#define SSE2_END _mm_sfence()
#define SSE2_YUV422_YUYV_ALIGNED                        \
    xmm0 = _mm_load_si128((__m128i *)p_y);              \
    xmm1 = _mm_loadl_epi64((__m128i *)p_u);             \
    xmm2 = _mm_loadl_epi64((__m128i *)p_v);             \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);               \
    xmm2 = xmm0;                                        \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm1);               \
    _mm_stream_si128((__m128i*)(p_line), xmm2);         \
    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1);               \
    _mm_stream_si128((__m128i*)(p_line+16), xmm0);

#define SSE2_YUV422_YUYV_UNALIGNED                      \
    xmm0 = _mm_loadu_si128((__m128i *)p_y);             \
    xmm1 = _mm_loadl_epi64((__m128i *)p_u);             \
    xmm2 = _mm_loadl_epi64((__m128i *)p_v);             \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);               \
    xmm2 = xmm0;                                        \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm1);               \
    _mm_storeu_si128((__m128i*)(p_line), xmm2);         \
    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1);               \
    _mm_storeu_si128((__m128i*)(p_line+16), xmm0);
#define SSE2_YUV422_YVYU_ALIGNED                        \
    xmm0 = _mm_load_si128((__m128i *)p_y);              \
    xmm2 = _mm_loadl_epi64((__m128i *)p_u);             \
    xmm1 = _mm_loadl_epi64((__m128i *)p_v);             \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);               \
    xmm2 = xmm0;                                        \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm1);               \
    _mm_stream_si128((__m128i*)(p_line), xmm2);         \
    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1);               \
    _mm_stream_si128((__m128i*)(p_line+16), xmm0);

#define SSE2_YUV422_YVYU_UNALIGNED                      \
    xmm0 = _mm_loadu_si128((__m128i *)p_y);             \
    xmm2 = _mm_loadl_epi64((__m128i *)p_u);             \
    xmm1 = _mm_loadl_epi64((__m128i *)p_v);             \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);               \
    xmm2 = xmm0;                                        \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm1);               \
    _mm_storeu_si128((__m128i*)(p_line), xmm2);         \
    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1);               \
    _mm_storeu_si128((__m128i*)(p_line+16), xmm0);
#define SSE2_YUV422_UYVY_ALIGNED                        \
    xmm0 = _mm_load_si128((__m128i *)p_y);              \
    xmm1 = _mm_loadl_epi64((__m128i *)p_u);             \
    xmm2 = _mm_loadl_epi64((__m128i *)p_v);             \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);               \
    xmm2 = xmm1;                                        \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm0);               \
    _mm_stream_si128((__m128i*)(p_line), xmm2);         \
    xmm1 = _mm_unpackhi_epi8(xmm1, xmm0);               \
    _mm_stream_si128((__m128i*)(p_line+16), xmm1);

#define SSE2_YUV422_UYVY_UNALIGNED                      \
    xmm0 = _mm_loadu_si128((__m128i *)p_y);             \
    xmm1 = _mm_loadl_epi64((__m128i *)p_u);             \
    xmm2 = _mm_loadl_epi64((__m128i *)p_v);             \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);               \
    xmm2 = xmm1;                                        \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm0);               \
    _mm_storeu_si128((__m128i*)(p_line), xmm2);         \
    xmm1 = _mm_unpackhi_epi8(xmm1, xmm0);               \
    _mm_storeu_si128((__m128i*)(p_line+16), xmm1);

#endif

#endif
#define C_YUV422_YUYV( p_line, p_y, p_u, p_v )      \
    *(p_line)++ = *(p_y)++;                         \
    *(p_line)++ = *(p_u)++;                         \
    *(p_line)++ = *(p_y)++;                         \
    *(p_line)++ = *(p_v)++;

#define C_YUV422_YVYU( p_line, p_y, p_u, p_v )      \
    *(p_line)++ = *(p_y)++;                         \
    *(p_line)++ = *(p_v)++;                         \
    *(p_line)++ = *(p_y)++;                         \
    *(p_line)++ = *(p_u)++;

#define C_YUV422_UYVY( p_line, p_y, p_u, p_v )      \
    *(p_line)++ = *(p_u)++;                         \
    *(p_line)++ = *(p_y)++;                         \
    *(p_line)++ = *(p_v)++;                         \
    *(p_line)++ = *(p_y)++;

#define C_YUV422_Y211( p_line, p_y, p_u, p_v )      \
    *(p_line)++ = *(p_y); p_y += 2;                 \
    *(p_line)++ = *(p_u) - 0x80; p_u += 2;          \
    *(p_line)++ = *(p_y); p_y += 2;                 \
    *(p_line)++ = *(p_v) - 0x80; p_v += 2;
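
/* A minimal sketch of the portable fallback (and of the tail pixels the
 * SIMD paths leave over), assuming an even width and the same pointer
 * names as above:
 *
 *     for( int i_x = i_width / 2 ; i_x-- ; )
 *     {
 *         C_YUV422_YUYV( p_line, p_y, p_u, p_v );   // 2 pixels per call
 *     }
 */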