/*****************************************************************************
 * i422_yuy2.h : YUV to YUV conversion module for vlc
 *****************************************************************************
 * Copyright (C) 2002 VLC authors and VideoLAN
 * $Id$
 *
 * Authors: Samuel Hocevar <sam@zoy.org>
 *          Damien Fouilleul <damienf@videolan.org>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
 *****************************************************************************/
#ifdef MODULE_NAME_IS_i422_yuy2_mmx

#if defined(CAN_COMPILE_MMX)

/* MMX assembly */

#define MMX_CALL(MMX_INSTRUCTIONS)          \
    do {                                    \
    __asm__ __volatile__(                   \
        ".p2align 3 \n\t"                   \
        MMX_INSTRUCTIONS                    \
        :                                   \
        : "r" (p_line), "r" (p_y),          \
          "r" (p_u), "r" (p_v)              \
        : "mm0", "mm1", "mm2" );            \
        p_line += 16; p_y += 8;             \
        p_u += 4; p_v += 4;                 \
    } while(0)

#define MMX_END __asm__ __volatile__ ( "emms" )
#define MMX_YUV422_YUYV "                                                 \n\
movq       (%1), %%mm0  # Load 8 Y          y7 y6 y5 y4 y3 y2 y1 y0       \n\
movd       (%2), %%mm1  # Load 4 Cb         00 00 00 00 u3 u2 u1 u0       \n\
movd       (%3), %%mm2  # Load 4 Cr         00 00 00 00 v3 v2 v1 v0       \n\
punpcklbw %%mm2, %%mm1  #                   v3 u3 v2 u2 v1 u1 v0 u0       \n\
movq      %%mm0, %%mm2  #                   y7 y6 y5 y4 y3 y2 y1 y0       \n\
punpcklbw %%mm1, %%mm2  #                   v1 y3 u1 y2 v0 y1 u0 y0       \n\
movq      %%mm2, (%0)   # Store low YUYV                                  \n\
punpckhbw %%mm1, %%mm0  #                   v3 y7 u3 y6 v2 y5 u2 y4       \n\
movq      %%mm0, 8(%0)  # Store high YUYV                                 \n\
"

#define MMX_YUV422_YVYU "                                                 \n\
movq       (%1), %%mm0  # Load 8 Y          y7 y6 y5 y4 y3 y2 y1 y0       \n\
movd       (%2), %%mm2  # Load 4 Cb         00 00 00 00 u3 u2 u1 u0       \n\
movd       (%3), %%mm1  # Load 4 Cr         00 00 00 00 v3 v2 v1 v0       \n\
punpcklbw %%mm2, %%mm1  #                   u3 v3 u2 v2 u1 v1 u0 v0       \n\
movq      %%mm0, %%mm2  #                   y7 y6 y5 y4 y3 y2 y1 y0       \n\
punpcklbw %%mm1, %%mm2  #                   u1 y3 v1 y2 u0 y1 v0 y0       \n\
movq      %%mm2, (%0)   # Store low YUYV                                  \n\
punpckhbw %%mm1, %%mm0  #                   u3 y7 v3 y6 u2 y5 v2 y4       \n\
movq      %%mm0, 8(%0)  # Store high YUYV                                 \n\
"

#define MMX_YUV422_UYVY "                                                 \n\
movq       (%1), %%mm0  # Load 8 Y          y7 y6 y5 y4 y3 y2 y1 y0       \n\
movd       (%2), %%mm1  # Load 4 Cb         00 00 00 00 u3 u2 u1 u0       \n\
movd       (%3), %%mm2  # Load 4 Cr         00 00 00 00 v3 v2 v1 v0       \n\
punpcklbw %%mm2, %%mm1  #                   v3 u3 v2 u2 v1 u1 v0 u0       \n\
movq      %%mm1, %%mm2  #                   v3 u3 v2 u2 v1 u1 v0 u0       \n\
punpcklbw %%mm0, %%mm2  #                   y3 v1 y2 u1 y1 v0 y0 u0       \n\
movq      %%mm2, (%0)   # Store low UYVY                                  \n\
punpckhbw %%mm0, %%mm1  #                   y7 v3 y6 u3 y5 v2 y4 u2       \n\
movq      %%mm1, 8(%0)  # Store high UYVY                                 \n\
"
#elif defined(HAVE_MMX_INTRINSICS)

/* MMX intrinsics */

#include <mmintrin.h>

#define MMX_CALL(MMX_INSTRUCTIONS)          \
    do {                                    \
        __m64 mm0, mm1, mm2;                \
        MMX_INSTRUCTIONS                    \
        p_line += 16; p_y += 8;             \
        p_u += 4; p_v += 4;                 \
    } while(0)

#define MMX_END _mm_empty()

#define MMX_YUV422_YUYV                     \
    mm0 = (__m64)*(uint64_t*)p_y;           \
    mm1 = _mm_cvtsi32_si64(*(int*)p_u);     \
    mm2 = _mm_cvtsi32_si64(*(int*)p_v);     \
    mm1 = _mm_unpacklo_pi8(mm1, mm2);       \
    mm2 = mm0;                              \
    mm2 = _mm_unpacklo_pi8(mm2, mm1);       \
    *(uint64_t*)p_line = (uint64_t)mm2;     \
    mm0 = _mm_unpackhi_pi8(mm0, mm1);       \
    *(uint64_t*)(p_line+8) = (uint64_t)mm0;

#define MMX_YUV422_YVYU                     \
    mm0 = (__m64)*(uint64_t*)p_y;           \
    mm2 = _mm_cvtsi32_si64(*(int*)p_u);     \
    mm1 = _mm_cvtsi32_si64(*(int*)p_v);     \
    mm1 = _mm_unpacklo_pi8(mm1, mm2);       \
    mm2 = mm0;                              \
    mm2 = _mm_unpacklo_pi8(mm2, mm1);       \
    *(uint64_t*)p_line = (uint64_t)mm2;     \
    mm0 = _mm_unpackhi_pi8(mm0, mm1);       \
    *(uint64_t*)(p_line+8) = (uint64_t)mm0;

#define MMX_YUV422_UYVY                     \
    mm0 = (__m64)*(uint64_t*)p_y;           \
    mm1 = _mm_cvtsi32_si64(*(int*)p_u);     \
    mm2 = _mm_cvtsi32_si64(*(int*)p_v);     \
    mm1 = _mm_unpacklo_pi8(mm1, mm2);       \
    mm2 = mm1;                              \
    mm2 = _mm_unpacklo_pi8(mm2, mm0);       \
    *(uint64_t*)p_line = (uint64_t)mm2;     \
    mm1 = _mm_unpackhi_pi8(mm1, mm0);       \
    *(uint64_t*)(p_line+8) = (uint64_t)mm1;
#endif
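
/*
 * Usage sketch (illustration only): a conversion routine such as the ones in
 * i422_yuy2.c is expected to drive the macros above roughly as below.  The
 * local names p_line, p_y, p_u and p_v are required, because MMX_CALL()
 * references and advances them by name.  The function name and the
 * "width is a multiple of 8" assumption are ours, for illustration.
 */
#if defined(CAN_COMPILE_MMX) || defined(HAVE_MMX_INTRINSICS)
#include <stdint.h>   /* uint8_t -- normally already pulled in by the .c file */

static inline void i422_yuyv_line_mmx_example( uint8_t *p_line, uint8_t *p_y,
                                               uint8_t *p_u, uint8_t *p_v,
                                               int i_width )
{
    /* Each MMX_CALL() packs 8 Y, 4 Cb and 4 Cr samples into 16 YUYV bytes
     * and advances all four pointers accordingly. */
    for( int i_x = i_width / 8; i_x--; )
    {
        MMX_CALL( MMX_YUV422_YUYV );
    }
    MMX_END;    /* emms: leave MMX state so the x87 FPU can be used again */
}
#endif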
#elif defined( MODULE_NAME_IS_i422_yuy2_sse2 )

#if defined(CAN_COMPILE_SSE2)

/* SSE2 assembly */

#define SSE2_CALL(MMX_INSTRUCTIONS)         \
    do {                                    \
    __asm__ __volatile__(                   \
        ".p2align 3 \n\t"                   \
        MMX_INSTRUCTIONS                    \
        :                                   \
        : "r" (p_line), "r" (p_y),          \
          "r" (p_u), "r" (p_v)              \
        : "xmm0", "xmm1", "xmm2" );         \
        p_line += 32; p_y += 16;            \
        p_u += 8; p_v += 8;                 \
    } while(0)

#define SSE2_END __asm__ __volatile__ ( "sfence" ::: "memory" )
#define SSE2_YUV422_YUYV_ALIGNED "                                        \n\
movdqa     (%1), %%xmm0  # Load 8 Y         y7 y6 y5 y4 y3 y2 y1 y0       \n\
movq       (%2), %%xmm1  # Load 4 Cb        00 00 00 00 u3 u2 u1 u0       \n\
movq       (%3), %%xmm2  # Load 4 Cr        00 00 00 00 v3 v2 v1 v0       \n\
punpcklbw %%xmm2, %%xmm1 #                  v3 u3 v2 u2 v1 u1 v0 u0       \n\
movdqa    %%xmm0, %%xmm2 #                  y7 y6 y5 y4 y3 y2 y1 y0       \n\
punpcklbw %%xmm1, %%xmm2 #                  v1 y3 u1 y2 v0 y1 u0 y0       \n\
movntdq   %%xmm2, (%0)   # Store low YUYV                                 \n\
punpckhbw %%xmm1, %%xmm0 #                  v3 y7 u3 y6 v2 y5 u2 y4       \n\
movntdq   %%xmm0, 16(%0) # Store high YUYV                                \n\
"

#define SSE2_YUV422_YUYV_UNALIGNED "                                      \n\
movdqu     (%1), %%xmm0  # Load 8 Y         y7 y6 y5 y4 y3 y2 y1 y0       \n\
movq       (%2), %%xmm1  # Load 4 Cb        00 00 00 00 u3 u2 u1 u0       \n\
movq       (%3), %%xmm2  # Load 4 Cr        00 00 00 00 v3 v2 v1 v0       \n\
prefetchnta (%0)         # Tell CPU not to cache output YUYV data         \n\
punpcklbw %%xmm2, %%xmm1 #                  v3 u3 v2 u2 v1 u1 v0 u0       \n\
movdqa    %%xmm0, %%xmm2 #                  y7 y6 y5 y4 y3 y2 y1 y0       \n\
punpcklbw %%xmm1, %%xmm2 #                  v1 y3 u1 y2 v0 y1 u0 y0       \n\
movdqu    %%xmm2, (%0)   # Store low YUYV                                 \n\
punpckhbw %%xmm1, %%xmm0 #                  v3 y7 u3 y6 v2 y5 u2 y4       \n\
movdqu    %%xmm0, 16(%0) # Store high YUYV                                \n\
"

#define SSE2_YUV422_YVYU_ALIGNED "                                        \n\
movdqa     (%1), %%xmm0  # Load 8 Y         y7 y6 y5 y4 y3 y2 y1 y0       \n\
movq       (%2), %%xmm2  # Load 4 Cb        00 00 00 00 u3 u2 u1 u0       \n\
movq       (%3), %%xmm1  # Load 4 Cr        00 00 00 00 v3 v2 v1 v0       \n\
punpcklbw %%xmm2, %%xmm1 #                  u3 v3 u2 v2 u1 v1 u0 v0       \n\
movdqa    %%xmm0, %%xmm2 #                  y7 y6 y5 y4 y3 y2 y1 y0       \n\
punpcklbw %%xmm1, %%xmm2 #                  u1 y3 v1 y2 u0 y1 v0 y0       \n\
movntdq   %%xmm2, (%0)   # Store low YUYV                                 \n\
punpckhbw %%xmm1, %%xmm0 #                  u3 y7 v3 y6 u2 y5 v2 y4       \n\
movntdq   %%xmm0, 16(%0) # Store high YUYV                                \n\
"

#define SSE2_YUV422_YVYU_UNALIGNED "                                      \n\
movdqu     (%1), %%xmm0  # Load 8 Y         y7 y6 y5 y4 y3 y2 y1 y0       \n\
movq       (%2), %%xmm2  # Load 4 Cb        00 00 00 00 u3 u2 u1 u0       \n\
movq       (%3), %%xmm1  # Load 4 Cr        00 00 00 00 v3 v2 v1 v0       \n\
prefetchnta (%0)         # Tell CPU not to cache output YUYV data         \n\
punpcklbw %%xmm2, %%xmm1 #                  u3 v3 u2 v2 u1 v1 u0 v0       \n\
movdqa    %%xmm0, %%xmm2 #                  y7 y6 y5 y4 y3 y2 y1 y0       \n\
punpcklbw %%xmm1, %%xmm2 #                  u1 y3 v1 y2 u0 y1 v0 y0       \n\
movdqu    %%xmm2, (%0)   # Store low YUYV                                 \n\
punpckhbw %%xmm1, %%xmm0 #                  u3 y7 v3 y6 u2 y5 v2 y4       \n\
movdqu    %%xmm0, 16(%0) # Store high YUYV                                \n\
"

#define SSE2_YUV422_UYVY_ALIGNED "                                        \n\
movdqa     (%1), %%xmm0  # Load 8 Y         y7 y6 y5 y4 y3 y2 y1 y0       \n\
movq       (%2), %%xmm1  # Load 4 Cb        00 00 00 00 u3 u2 u1 u0       \n\
movq       (%3), %%xmm2  # Load 4 Cr        00 00 00 00 v3 v2 v1 v0       \n\
punpcklbw %%xmm2, %%xmm1 #                  v3 u3 v2 u2 v1 u1 v0 u0       \n\
movdqa    %%xmm1, %%xmm2 #                  v3 u3 v2 u2 v1 u1 v0 u0       \n\
punpcklbw %%xmm0, %%xmm2 #                  y3 v1 y2 u1 y1 v0 y0 u0       \n\
movntdq   %%xmm2, (%0)   # Store low UYVY                                 \n\
punpckhbw %%xmm0, %%xmm1 #                  y7 v3 y6 u3 y5 v2 y4 u2       \n\
movntdq   %%xmm1, 16(%0) # Store high UYVY                                \n\
"

#define SSE2_YUV422_UYVY_UNALIGNED "                                      \n\
movdqu     (%1), %%xmm0  # Load 8 Y         y7 y6 y5 y4 y3 y2 y1 y0       \n\
movq       (%2), %%xmm1  # Load 4 Cb        00 00 00 00 u3 u2 u1 u0       \n\
movq       (%3), %%xmm2  # Load 4 Cr        00 00 00 00 v3 v2 v1 v0       \n\
prefetchnta (%0)         # Tell CPU not to cache output YUYV data         \n\
punpcklbw %%xmm2, %%xmm1 #                  v3 u3 v2 u2 v1 u1 v0 u0       \n\
movdqa    %%xmm1, %%xmm2 #                  v3 u3 v2 u2 v1 u1 v0 u0       \n\
punpcklbw %%xmm0, %%xmm2 #                  y3 v1 y2 u1 y1 v0 y0 u0       \n\
movdqu    %%xmm2, (%0)   # Store low UYVY                                 \n\
punpckhbw %%xmm0, %%xmm1 #                  y7 v3 y6 u3 y5 v2 y4 u2       \n\
movdqu    %%xmm1, 16(%0) # Store high UYVY                                \n\
"
#elif defined(HAVE_SSE2_INTRINSICS)

/* SSE2 intrinsics */

#include <emmintrin.h>

#define SSE2_CALL(SSE2_INSTRUCTIONS)        \
    do {                                    \
        __m128i xmm0, xmm1, xmm2;           \
        SSE2_INSTRUCTIONS                   \
        p_line += 32; p_y += 16;            \
        p_u += 8; p_v += 8;                 \
    } while(0)

#define SSE2_END _mm_sfence()
#define SSE2_YUV422_YUYV_ALIGNED                    \
    xmm0 = _mm_load_si128((__m128i *)p_y);          \
    xmm1 = _mm_loadl_epi64((__m128i *)p_u);         \
    xmm2 = _mm_loadl_epi64((__m128i *)p_v);         \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);           \
    xmm2 = xmm0;                                    \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm1);           \
    _mm_stream_si128((__m128i*)(p_line), xmm2);     \
    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1);           \
    _mm_stream_si128((__m128i*)(p_line+16), xmm0);

#define SSE2_YUV422_YUYV_UNALIGNED                  \
    xmm0 = _mm_loadu_si128((__m128i *)p_y);         \
    xmm1 = _mm_loadl_epi64((__m128i *)p_u);         \
    xmm2 = _mm_loadl_epi64((__m128i *)p_v);         \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);           \
    xmm2 = xmm0;                                    \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm1);           \
    _mm_storeu_si128((__m128i*)(p_line), xmm2);     \
    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1);           \
    _mm_storeu_si128((__m128i*)(p_line+16), xmm0);

#define SSE2_YUV422_YVYU_ALIGNED                    \
    xmm0 = _mm_load_si128((__m128i *)p_y);          \
    xmm2 = _mm_loadl_epi64((__m128i *)p_u);         \
    xmm1 = _mm_loadl_epi64((__m128i *)p_v);         \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);           \
    xmm2 = xmm0;                                    \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm1);           \
    _mm_stream_si128((__m128i*)(p_line), xmm2);     \
    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1);           \
    _mm_stream_si128((__m128i*)(p_line+16), xmm0);

#define SSE2_YUV422_YVYU_UNALIGNED                  \
    xmm0 = _mm_loadu_si128((__m128i *)p_y);         \
    xmm2 = _mm_loadl_epi64((__m128i *)p_u);         \
    xmm1 = _mm_loadl_epi64((__m128i *)p_v);         \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);           \
    xmm2 = xmm0;                                    \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm1);           \
    _mm_storeu_si128((__m128i*)(p_line), xmm2);     \
    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1);           \
    _mm_storeu_si128((__m128i*)(p_line+16), xmm0);

#define SSE2_YUV422_UYVY_ALIGNED                    \
    xmm0 = _mm_load_si128((__m128i *)p_y);          \
    xmm1 = _mm_loadl_epi64((__m128i *)p_u);         \
    xmm2 = _mm_loadl_epi64((__m128i *)p_v);         \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);           \
    xmm2 = xmm1;                                    \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm0);           \
    _mm_stream_si128((__m128i*)(p_line), xmm2);     \
    xmm1 = _mm_unpackhi_epi8(xmm1, xmm0);           \
    _mm_stream_si128((__m128i*)(p_line+16), xmm1);

#define SSE2_YUV422_UYVY_UNALIGNED                  \
    xmm0 = _mm_loadu_si128((__m128i *)p_y);         \
    xmm1 = _mm_loadl_epi64((__m128i *)p_u);         \
    xmm2 = _mm_loadl_epi64((__m128i *)p_v);         \
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2);           \
    xmm2 = xmm1;                                    \
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm0);           \
    _mm_storeu_si128((__m128i*)(p_line), xmm2);     \
    xmm1 = _mm_unpackhi_epi8(xmm1, xmm0);           \
    _mm_storeu_si128((__m128i*)(p_line+16), xmm1);
#endif
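
/*
 * Usage sketch (illustration only): the _ALIGNED variants above use
 * non-temporal movntdq stores and therefore need 16-byte-aligned pointers,
 * while the _UNALIGNED variants do not.  A caller is expected to dispatch on
 * alignment roughly as below; the function name, the alignment test on all
 * four pointers and the "width is a multiple of 16" assumption are ours.
 */
#if defined(CAN_COMPILE_SSE2) || defined(HAVE_SSE2_INTRINSICS)
#include <stdint.h>   /* uint8_t, uintptr_t -- normally already available */

static inline void i422_yuyv_line_sse2_example( uint8_t *p_line, uint8_t *p_y,
                                                uint8_t *p_u, uint8_t *p_v,
                                                int i_width )
{
    /* One SSE2_CALL() consumes 16 Y, 8 Cb and 8 Cr samples, writes 32 bytes
     * of YUYV and advances all four pointers. */
    if( !( ((uintptr_t)p_line | (uintptr_t)p_y |
            (uintptr_t)p_u    | (uintptr_t)p_v) & 15 ) )
    {
        for( int i_x = i_width / 16; i_x--; )
        {
            SSE2_CALL( SSE2_YUV422_YUYV_ALIGNED );
        }
    }
    else
    {
        for( int i_x = i_width / 16; i_x--; )
        {
            SSE2_CALL( SSE2_YUV422_YUYV_UNALIGNED );
        }
    }
    SSE2_END;   /* sfence: make the streaming stores globally visible */
}
#endif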
#endif
#define C_YUV422_YUYV( p_line, p_y, p_u, p_v )      \
    *(p_line)++ = *(p_y)++;                         \
    *(p_line)++ = *(p_u)++;                         \
    *(p_line)++ = *(p_y)++;                         \
    *(p_line)++ = *(p_v)++;                         \

#define C_YUV422_YVYU( p_line, p_y, p_u, p_v )      \
    *(p_line)++ = *(p_y)++;                         \
    *(p_line)++ = *(p_v)++;                         \
    *(p_line)++ = *(p_y)++;                         \
    *(p_line)++ = *(p_u)++;                         \

#define C_YUV422_UYVY( p_line, p_y, p_u, p_v )      \
    *(p_line)++ = *(p_u)++;                         \
    *(p_line)++ = *(p_y)++;                         \
    *(p_line)++ = *(p_v)++;                         \
    *(p_line)++ = *(p_y)++;                         \

#define C_YUV422_Y211( p_line, p_y, p_u, p_v )      \
    *(p_line)++ = *(p_y); p_y += 2;                 \
    *(p_line)++ = *(p_u) - 0x80; p_u += 2;          \
    *(p_line)++ = *(p_y); p_y += 2;                 \
    *(p_line)++ = *(p_v) - 0x80; p_v += 2;
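
/*
 * Usage sketch (illustration only): the plain C macros above emit one
 * Y/U/Y/V group, i.e. two horizontal pixels, per invocation, so a portable
 * line converter loops i_width / 2 times.  Note that C_YUV422_Y211 instead
 * skips every other luma/chroma sample and recentres chroma around zero
 * (the "- 0x80"), hence its pointers stepping by 2.  The function below is
 * our example, not part of the VLC module; braces around the macro call are
 * required because the macro expands to several statements.
 */
#include <stdint.h>   /* uint8_t -- normally already pulled in by the .c file */

static inline void i422_yuyv_line_c_example( uint8_t *p_line, uint8_t *p_y,
                                             uint8_t *p_u, uint8_t *p_v,
                                             int i_width )
{
    for( int i_x = i_width / 2; i_x--; )
    {
        C_YUV422_YUYV( p_line, p_y, p_u, p_v );  /* writes Y U Y V, 4 bytes */
    }
}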