vlc/vlc-skelet.git: modules/codec/avcodec/copy.c
/*****************************************************************************
 * copy.c: Fast YV12/NV12 copy
 *****************************************************************************
 * Copyright (C) 2010 Laurent Aimar
 * $Id$
 *
 * Authors: Laurent Aimar <fenrir _AT_ videolan _DOT_ org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
 *****************************************************************************/
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif

#include <vlc_common.h>
#include <vlc_picture.h>
#include <vlc_cpu.h>
#include <assert.h>

#include "copy.h"
/* Copy 64 bytes from srcp to dstp, loading data with the SSE>=2 instruction
 * load and storing data with the SSE>=2 instruction store.
 */
#define COPY64(dstp, srcp, load, store) \
    asm volatile (                      \
        load "  0(%[src]), %%xmm1\n"    \
        load " 16(%[src]), %%xmm2\n"    \
        load " 32(%[src]), %%xmm3\n"    \
        load " 48(%[src]), %%xmm4\n"    \
        store " %%xmm1,  0(%[dst])\n"   \
        store " %%xmm2, 16(%[dst])\n"   \
        store " %%xmm3, 32(%[dst])\n"   \
        store " %%xmm4, 48(%[dst])\n"   \
        : : [dst]"r"(dstp), [src]"r"(srcp) : "memory")
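/* Note: each COPY64 expansion moves one 64-byte block through %xmm1-%xmm4.
 * The caller picks the load/store mnemonics to match the alignment and
 * cacheability of each side, e.g. "movntdqa"/"movdqa" to stream from USWC
 * memory into a 16-byte aligned buffer (see CopyFromUswc() below). */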
/* Execute the instruction op only if SSE2 is supported. */
#ifdef CAN_COMPILE_SSE2
# define ASM_SSE2(cpu, op) do {         \
    if (cpu & CPU_CAPABILITY_SSE2)      \
        asm volatile (op);              \
} while (0)
#else
# define ASM_SSE2(cpu, op)
#endif
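/* ASM_SSE2 guards the bare "mfence"/"emms" instructions used below: they are
 * only emitted when the compiler can build SSE2 code and the running CPU
 * reports SSE2 support. */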
/* Optimized copy from "Uncacheable Speculative Write Combining" memory
 * as used by some video surfaces.
 * XXX It is really efficient only when SSE4.1 is available.
 */
static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
                         const uint8_t *src, size_t src_pitch,
                         unsigned width, unsigned height,
                         unsigned cpu)
{
    assert(((intptr_t)dst & 0x0f) == 0 && (dst_pitch & 0x0f) == 0);

    ASM_SSE2(cpu, "mfence");
    for (unsigned y = 0; y < height; y++) {
        /* Number of leading bytes to copy one by one so that src is 16-byte
         * aligned for the SSE loads below. */
        const unsigned unaligned = (-(intptr_t)src) & 0x0f;
        unsigned x;

        for (x = 0; x < unaligned; x++)
            dst[x] = src[x];

#ifdef CAN_COMPILE_SSE4_1
        if (cpu & CPU_CAPABILITY_SSE4_1) {
            if (!unaligned) {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movntdqa", "movdqa");
            } else {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movntdqa", "movdqu");
            }
        } else
#endif
#ifdef CAN_COMPILE_SSE2
        if (cpu & CPU_CAPABILITY_SSE2) {
            if (!unaligned) {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movdqa", "movdqa");
            } else {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movdqa", "movdqu");
            }
        }
#endif

        for (; x < width; x++)
            dst[x] = src[x];

        src += src_pitch;
        dst += dst_pitch;
    }
}
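/* Rationale: plain loads from USWC memory bypass the CPU caches and are very
 * slow. The SSE4.1 "movntdqa" streaming load fetches a whole 64-byte line of
 * such memory into a streaming-load buffer, which is far faster; with only
 * SSE2 the copy still works but is slow, hence the XXX note above. */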
static void Copy2d(uint8_t *dst, size_t dst_pitch,
                   const uint8_t *src, size_t src_pitch,
                   unsigned width, unsigned height,
                   unsigned cpu)
{
    assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);

    ASM_SSE2(cpu, "mfence");

    for (unsigned y = 0; y < height; y++) {
        unsigned x = 0;
        bool unaligned = ((intptr_t)dst & 0x0f) != 0;

#ifdef CAN_COMPILE_SSE2
        if (cpu & CPU_CAPABILITY_SSE2) {
            if (!unaligned) {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movdqa", "movntdq");
            } else {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movdqa", "movdqu");
            }
        }
#endif

        for (; x < width; x++)
            dst[x] = src[x];

        src += src_pitch;
        dst += dst_pitch;
    }
}
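/* Copy2d is the second half of the bounce: it reads from the cache (ordinary
 * cacheable memory, hence plain "movdqa" loads) and, when SSE2 is available
 * and the destination is aligned, writes with non-temporal "movntdq" stores
 * so the destination picture does not pollute the CPU caches. */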
static void SplitUV(uint8_t *dstu, size_t dstu_pitch,
                    uint8_t *dstv, size_t dstv_pitch,
                    const uint8_t *src, size_t src_pitch,
                    unsigned width, unsigned height, unsigned cpu)
{
    const uint8_t shuffle[] = { 0, 2, 4, 6, 8, 10, 12, 14,
                                1, 3, 5, 7, 9, 11, 13, 15 };
    const uint8_t mask[] = { 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00,
                             0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00 };

    assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);

    ASM_SSE2(cpu, "mfence");

    for (unsigned y = 0; y < height; y++) {
        unsigned x = 0;

#define LOAD64 \
    "movdqa  0(%[src]), %%xmm0\n" \
    "movdqa 16(%[src]), %%xmm1\n" \
    "movdqa 32(%[src]), %%xmm2\n" \
    "movdqa 48(%[src]), %%xmm3\n"

#define STORE2X32 \
    "movq   %%xmm0,   0(%[dst1])\n" \
    "movq   %%xmm1,   8(%[dst1])\n" \
    "movhpd %%xmm0,   0(%[dst2])\n" \
    "movhpd %%xmm1,   8(%[dst2])\n" \
    "movq   %%xmm2,  16(%[dst1])\n" \
    "movq   %%xmm3,  24(%[dst1])\n" \
    "movhpd %%xmm2,  16(%[dst2])\n" \
    "movhpd %%xmm3,  24(%[dst2])\n"

#ifdef CAN_COMPILE_SSSE3
        if (cpu & CPU_CAPABILITY_SSSE3) {
            for (x = 0; x < (width & ~31); x += 32) {
                asm volatile (
                    "movdqu (%[shuffle]), %%xmm7\n"
                    LOAD64
                    "pshufb  %%xmm7, %%xmm0\n"
                    "pshufb  %%xmm7, %%xmm1\n"
                    "pshufb  %%xmm7, %%xmm2\n"
                    "pshufb  %%xmm7, %%xmm3\n"
                    STORE2X32
                    : : [dst1]"r"(&dstu[x]), [dst2]"r"(&dstv[x]), [src]"r"(&src[2*x]), [shuffle]"r"(shuffle) : "memory");
            }
        } else
#endif
#ifdef CAN_COMPILE_SSE2
        if (cpu & CPU_CAPABILITY_SSE2) {
            for (x = 0; x < (width & ~31); x += 32) {
                asm volatile (
                    "movdqu (%[mask]), %%xmm7\n"
                    LOAD64
                    "movdqa   %%xmm0, %%xmm4\n"
                    "movdqa   %%xmm1, %%xmm5\n"
                    "movdqa   %%xmm2, %%xmm6\n"
                    "psrlw    $8,     %%xmm0\n"
                    "psrlw    $8,     %%xmm1\n"
                    "pand     %%xmm7, %%xmm4\n"
                    "pand     %%xmm7, %%xmm5\n"
                    "pand     %%xmm7, %%xmm6\n"
                    "packuswb %%xmm4, %%xmm0\n"
                    "packuswb %%xmm5, %%xmm1\n"
                    "pand     %%xmm3, %%xmm7\n"
                    "psrlw    $8,     %%xmm2\n"
                    "psrlw    $8,     %%xmm3\n"
                    "packuswb %%xmm6, %%xmm2\n"
                    "packuswb %%xmm7, %%xmm3\n"
                    STORE2X32
                    : : [dst2]"r"(&dstu[x]), [dst1]"r"(&dstv[x]), [src]"r"(&src[2*x]), [mask]"r"(mask) : "memory");
            }
        }
#endif
#undef STORE2X32
#undef LOAD64

        for (; x < width; x++) {
            dstu[x] = src[2*x+0];
            dstv[x] = src[2*x+1];
        }
        src += src_pitch;
        dstu += dstu_pitch;
        dstv += dstv_pitch;
    }
}
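/* SplitUV de-interleaves a packed chroma plane (U0 V0 U1 V1 ...) into two
 * separate planes: with SSSE3 a single "pshufb" sorts the even and odd bytes
 * of each 16-byte block, while the SSE2 fallback uses mask/shift/pack; any
 * remaining pixels are handled by the scalar tail loop. */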
static void CopyPlane(uint8_t *dst, size_t dst_pitch, const uint8_t *src, size_t src_pitch,
                      uint8_t *cache, size_t cache_size,
                      unsigned width, unsigned height,
                      unsigned cpu)
{
    const unsigned w16 = (width+15) & ~15;
    const unsigned hstep = cache_size / w16;
    assert(hstep > 0);

    for (unsigned y = 0; y < height; y += hstep) {
        const unsigned hblock = __MIN(hstep, height - y);

        /* Copy a bunch of lines into our cache */
        CopyFromUswc(cache, w16,
                     src, src_pitch,
                     width, hblock, cpu);

        /* Copy from our cache to the destination */
        Copy2d(dst, dst_pitch,
               cache, w16,
               width, hblock, cpu);

        /* */
        src += src_pitch * hblock;
        dst += dst_pitch * hblock;
    }

    ASM_SSE2(cpu, "mfence");
}
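/* Worked example, assuming the 4096-byte cache allocated by CopyInitCache()
 * below and a 1920-pixel wide luma plane: w16 = 1920 and hstep = 4096 / 1920
 * = 2, so the plane is bounced through the cache two lines at a time. */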
static void SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
                        uint8_t *dstv, size_t dstv_pitch,
                        const uint8_t *src, size_t src_pitch,
                        uint8_t *cache, size_t cache_size,
                        unsigned width, unsigned height,
                        unsigned cpu)
{
    const unsigned w2_16 = (2*width+15) & ~15;
    const unsigned hstep = cache_size / w2_16;
    assert(hstep > 0);

    for (unsigned y = 0; y < height; y += hstep) {
        const unsigned hblock = __MIN(hstep, height - y);

        /* Copy a bunch of lines into our cache */
        CopyFromUswc(cache, w2_16,
                     src, src_pitch,
                     2*width, hblock, cpu);

        /* Split our cache into the two destination planes */
        SplitUV(dstu, dstu_pitch,
                dstv, dstv_pitch,
                cache, w2_16,
                width, hblock, cpu);

        /* */
        src += src_pitch * hblock;
        dstu += dstu_pitch * hblock;
        dstv += dstv_pitch * hblock;
    }

    ASM_SSE2(cpu, "mfence");
}
int CopyInitCache(copy_cache_t *cache, unsigned width)
{
    cache->size = __MAX((width + 0x0f) & ~0x0f, 4096);
    cache->buffer = vlc_memalign(&cache->base, 16, cache->size);
    if (!cache->base)
        return VLC_EGENERIC;
    return VLC_SUCCESS;
}

void CopyCleanCache(copy_cache_t *cache)
{
    free(cache->base);

    cache->base   = NULL;
    cache->buffer = NULL;
    cache->size   = 0;
}
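/* Illustrative sketch only: one plausible lifecycle for the cache together
 * with CopyFromNv12() defined below. ExampleCopyOneFrame and its parameters
 * are hypothetical caller-side names, not part of this file; real callers
 * keep the cache alive across frames instead of rebuilding it each time. */
#if 0
static void ExampleCopyOneFrame(picture_t *pic, uint8_t *plane[2],
                                size_t pitch[2], unsigned width, unsigned height)
{
    copy_cache_t cache;

    if (CopyInitCache(&cache, width))   /* normally done once, at decoder open */
        return;

    /* plane[2]/pitch[2] describe the NV12 surface returned by the decoder */
    CopyFromNv12(pic, plane, pitch, width, height, &cache);

    CopyCleanCache(&cache);             /* normally done once, at decoder close */
}
#endif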
void CopyFromNv12(picture_t *dst, uint8_t *src[2], size_t src_pitch[2],
                  unsigned width, unsigned height,
                  copy_cache_t *cache)
{
    const unsigned cpu = vlc_CPU();

    /* */
    CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
              src[0], src_pitch[0],
              cache->buffer, cache->size,
              width, height, cpu);
    SplitPlanes(dst->p[2].p_pixels, dst->p[2].i_pitch,
                dst->p[1].p_pixels, dst->p[1].i_pitch,
                src[1], src_pitch[1],
                cache->buffer, cache->size,
                width/2, height/2, cpu);

    ASM_SSE2(cpu, "emms");
}
void CopyFromYv12(picture_t *dst, uint8_t *src[3], size_t src_pitch[3],
                  unsigned width, unsigned height,
                  copy_cache_t *cache)
{
    const unsigned cpu = vlc_CPU();

    /* */
    for (unsigned n = 0; n < 3; n++) {
        const unsigned d = n > 0 ? 2 : 1;
        CopyPlane(dst->p[n].p_pixels, dst->p[n].i_pitch,
                  src[n], src_pitch[n],
                  cache->buffer, cache->size,
                  width/d, height/d, cpu);
    }
    ASM_SSE2(cpu, "emms");
}
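/* The two entry points differ only in the source layout: CopyFromNv12() takes
 * a bi-planar NV12 surface (luma plane plus interleaved chroma) and splits the
 * chroma while copying, whereas CopyFromYv12() copies three already-planar
 * planes, halving width and height for the two chroma planes. */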
#undef ASM_SSE2
#undef COPY64