/*****************************************************************************
 * copy.c: Fast YV12/NV12 copy
 *****************************************************************************
 * Copyright (C) 2010 Laurent Aimar
 *
 * Authors: Laurent Aimar <fenrir _AT_ videolan _DOT_ org>
 *          Victorien Le Couviour--Tuffet <victorien.lecouviour.tuffet@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
 *****************************************************************************/

#ifdef HAVE_CONFIG_H
# include "config.h"
#endif

#ifdef COPY_TEST
# undef NDEBUG
#endif

#include <vlc_common.h>
#include <vlc_picture.h>
#include <vlc_cpu.h>
#include <assert.h>

#include "copy.h"

static void CopyPlane(uint8_t *dst, size_t dst_pitch,
                      const uint8_t *src, size_t src_pitch,
                      unsigned height, int bitshift);

#define ASSERT_PLANE(i) assert(src[i]); \
    assert(src_pitch[i])

#define ASSERT_2PLANES \
    assert(dst); \
    ASSERT_PLANE(0); \
    ASSERT_PLANE(1); \
    assert(height)

#define ASSERT_3PLANES ASSERT_2PLANES; \
    ASSERT_PLANE(2)

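/* The copy cache below is a small bounce buffer that is created once and
 * reused for every picture going through the SSE copy paths.
 *
 * Typical usage (illustrative sketch only; the nv12_* pointer and pitch
 * names are hypothetical, not part of this API):
 *
 *     copy_cache_t cache;
 *     if (CopyInitCache(&cache, width * pixel_size) != VLC_SUCCESS)
 *         return VLC_EGENERIC;
 *
 *     const uint8_t *planes[2]  = { nv12_y_ptr, nv12_uv_ptr };
 *     const size_t   pitches[2] = { nv12_y_pitch, nv12_uv_pitch };
 *     Copy420_SP_to_P(dst_picture, planes, pitches, height, &cache);
 *
 *     CopyCleanCache(&cache);
 */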
int CopyInitCache(copy_cache_t *cache, unsigned width)
{
#ifdef CAN_COMPILE_SSE2
    cache->size = __MAX((width + 0x3f) & ~ 0x3f, 16384);
    cache->buffer = aligned_alloc(64, cache->size);
    if (!cache->buffer)
        return VLC_EGENERIC;
#else
    (void) cache; (void) width;
#endif
    return VLC_SUCCESS;
}

void CopyCleanCache(copy_cache_t *cache)
{
#ifdef CAN_COMPILE_SSE2
    aligned_free(cache->buffer);
    cache->buffer = NULL;
    cache->size   = 0;
#else
    (void) cache;
#endif
}

#ifdef CAN_COMPILE_SSE2
/* Copy 16/64 bytes from srcp to dstp loading data with the SSE>=2 instruction
 * load and storing data with the SSE>=2 instruction store.
 */
#define COPY16_SHIFTR(x) \
    "psrlw "x", %%xmm1\n"
#define COPY16_SHIFTL(x) \
    "psllw "x", %%xmm1\n"

#define COPY16_S(dstp, srcp, load, store, shiftstr) \
    asm volatile ( \
        load "  0(%[src]), %%xmm1\n" \
        shiftstr \
        store " %%xmm1, 0(%[dst])\n" \
        : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1")

#define COPY16(dstp, srcp, load, store) COPY16_S(dstp, srcp, load, store, "")

#define COPY64_SHIFTR(x) \
    "psrlw "x", %%xmm1\n" \
    "psrlw "x", %%xmm2\n" \
    "psrlw "x", %%xmm3\n" \
    "psrlw "x", %%xmm4\n"
#define COPY64_SHIFTL(x) \
    "psllw "x", %%xmm1\n" \
    "psllw "x", %%xmm2\n" \
    "psllw "x", %%xmm3\n" \
    "psllw "x", %%xmm4\n"

#define COPY64_S(dstp, srcp, load, store, shiftstr) \
    asm volatile ( \
        load "  0(%[src]), %%xmm1\n" \
        load " 16(%[src]), %%xmm2\n" \
        load " 32(%[src]), %%xmm3\n" \
        load " 48(%[src]), %%xmm4\n" \
        shiftstr \
        store " %%xmm1,  0(%[dst])\n" \
        store " %%xmm2, 16(%[dst])\n" \
        store " %%xmm3, 32(%[dst])\n" \
        store " %%xmm4, 48(%[dst])\n" \
        : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1", "xmm2", "xmm3", "xmm4")

#define COPY64(dstp, srcp, load, store) \
    COPY64_S(dstp, srcp, load, store, "")

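/* When the self-test is built with COPY_TEST_NOOPTIM, pretend that no SIMD
 * level is available so that the plain C fallback paths get exercised too. */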
#ifdef COPY_TEST_NOOPTIM
# undef  vlc_CPU_SSE4_1
# define vlc_CPU_SSE4_1() (0)
# undef  vlc_CPU_SSE3
# define vlc_CPU_SSE3() (0)
# undef  vlc_CPU_SSSE3
# define vlc_CPU_SSSE3() (0)
# undef  vlc_CPU_SSE2
# define vlc_CPU_SSE2() (0)
#endif

/* Optimized copy from "Uncacheable Speculative Write Combining" memory
 * as used by some video surface.
 * XXX It is really efficient only when SSE4.1 is available.
 */
VLC_SSE
static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
                         const uint8_t *src, size_t src_pitch,
                         unsigned width, unsigned height, int bitshift)
{
    assert(((intptr_t)dst & 0x0f) == 0 && (dst_pitch & 0x0f) == 0);

    asm volatile ("mfence");

#define SSE_USWC_COPY(shiftstr16, shiftstr64) \
    for (unsigned y = 0; y < height; y++) { \
        const unsigned unaligned = (-(uintptr_t)src) & 0x0f; \
        unsigned x = unaligned; \
        if (vlc_CPU_SSE4_1()) { \
            if (!unaligned) { \
                for (; x+63 < width; x += 64) \
                    COPY64_S(&dst[x], &src[x], "movntdqa", "movdqa", shiftstr64); \
            } else { \
                COPY16_S(dst, src, "movdqu", "movdqa", shiftstr16); \
                for (; x+63 < width; x += 64) \
                    COPY64_S(&dst[x], &src[x], "movntdqa", "movdqu", shiftstr64); \
            } \
        } else { \
            if (!unaligned) { \
                for (; x+63 < width; x += 64) \
                    COPY64_S(&dst[x], &src[x], "movdqa", "movdqa", shiftstr64); \
            } else { \
                COPY16_S(dst, src, "movdqu", "movdqa", shiftstr16); \
                for (; x+63 < width; x += 64) \
                    COPY64_S(&dst[x], &src[x], "movdqa", "movdqu", shiftstr64); \
            } \
        } \
        /* The following should not happen since buffers are generally well aligned */ \
        if (x < width) \
            CopyPlane(&dst[x], dst_pitch - x, &src[x], src_pitch - x, 1, bitshift); \
        src += src_pitch; \
        dst += dst_pitch; \
    }

    switch (bitshift)
    {
        case 0:
            SSE_USWC_COPY("", "")
            break;
        case -6:
            SSE_USWC_COPY(COPY16_SHIFTL("$6"), COPY64_SHIFTL("$6"))
            break;
        case 6:
            SSE_USWC_COPY(COPY16_SHIFTR("$6"), COPY64_SHIFTR("$6"))
            break;
        case 2:
            SSE_USWC_COPY(COPY16_SHIFTR("$2"), COPY64_SHIFTR("$2"))
            break;
        case -2:
            SSE_USWC_COPY(COPY16_SHIFTL("$2"), COPY64_SHIFTL("$2"))
            break;
        case 4:
            SSE_USWC_COPY(COPY16_SHIFTR("$4"), COPY64_SHIFTR("$4"))
            break;
        case -4:
            SSE_USWC_COPY(COPY16_SHIFTL("$4"), COPY64_SHIFTL("$4"))
            break;
        default:
            vlc_assert_unreachable();
    }
#undef SSE_USWC_COPY

    asm volatile ("mfence");
}

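/* Copy from the (well aligned, cacheable) bounce buffer to the destination,
 * using non-temporal stores when the destination is 16-byte aligned. */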
VLC_SSE
static void Copy2d(uint8_t *dst, size_t dst_pitch,
                   const uint8_t *src, size_t src_pitch,
                   unsigned width, unsigned height)
{
    assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);

    for (unsigned y = 0; y < height; y++) {
        unsigned x = 0;

        bool unaligned = ((intptr_t)dst & 0x0f) != 0;
        if (!unaligned) {
            for (; x+63 < width; x += 64)
                COPY64(&dst[x], &src[x], "movdqa", "movntdq");
        } else {
            for (; x+63 < width; x += 64)
                COPY64(&dst[x], &src[x], "movdqa", "movdqu");
        }

        for (; x < width; x++)
            dst[x] = src[x];

        src += src_pitch;
        dst += dst_pitch;
    }
}

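/* Interleave two separate chroma planes (U then V) into one packed UV plane,
 * for 8-bit or 16-bit samples (planar to semi-planar chroma). */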
VLC_SSE
static void
SSE_InterleaveUV(uint8_t *dst, size_t dst_pitch,
                 uint8_t *srcu, size_t srcu_pitch,
                 uint8_t *srcv, size_t srcv_pitch,
                 unsigned int width, unsigned int height, uint8_t pixel_size)
{
    assert(!((intptr_t)srcu & 0xf) && !(srcu_pitch & 0x0f) &&
           !((intptr_t)srcv & 0xf) && !(srcv_pitch & 0x0f));

    static const uint8_t shuffle_8[] = { 0, 8,
                                         1, 9,
                                         2, 10,
                                         3, 11,
                                         4, 12,
                                         5, 13,
                                         6, 14,
                                         7, 15 };
    static const uint8_t shuffle_16[] = { 0, 1, 8, 9,
                                          2, 3, 10, 11,
                                          4, 5, 12, 13,
                                          6, 7, 14, 15 };
    const uint8_t *shuffle = pixel_size == 1 ? shuffle_8 : shuffle_16;

    for (unsigned int y = 0; y < height; ++y)
    {
        unsigned int x;

#define LOAD2X32 \
    "movhpd 0x00(%[src2]), %%xmm0\n" \
    "movlpd 0x00(%[src1]), %%xmm0\n" \
    "movhpd 0x08(%[src2]), %%xmm1\n" \
    "movlpd 0x08(%[src1]), %%xmm1\n" \
    "movhpd 0x10(%[src2]), %%xmm2\n" \
    "movlpd 0x10(%[src1]), %%xmm2\n" \
    "movhpd 0x18(%[src2]), %%xmm3\n" \
    "movlpd 0x18(%[src1]), %%xmm3\n"

#define STORE64 \
    "movdqu %%xmm0, 0x00(%[dst])\n" \
    "movdqu %%xmm1, 0x10(%[dst])\n" \
    "movdqu %%xmm2, 0x20(%[dst])\n" \
    "movdqu %%xmm3, 0x30(%[dst])\n"

#ifdef CAN_COMPILE_SSSE3
        if (vlc_CPU_SSSE3())
            for (x = 0; x < (width & ~31); x += 32)
                asm volatile
                (
                    "movdqu (%[shuffle]), %%xmm7\n"
                    LOAD2X32
                    "pshufb %%xmm7, %%xmm0\n"
                    "pshufb %%xmm7, %%xmm1\n"
                    "pshufb %%xmm7, %%xmm2\n"
                    "pshufb %%xmm7, %%xmm3\n"
                    STORE64
                    : : [dst]"r"(dst+2*x),
                        [src1]"r"(srcu+x), [src2]"r"(srcv+x),
                        [shuffle]"r"(shuffle)
                    : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7"
                );
        else
#endif
        {
            assert(pixel_size == 1);
            for (x = 0; x < (width & ~31); x += 32)
                asm volatile
                (
                    LOAD2X32
                    "movhlps   %%xmm0, %%xmm4\n"
                    "punpcklbw %%xmm4, %%xmm0\n"
                    "movhlps   %%xmm1, %%xmm4\n"
                    "punpcklbw %%xmm4, %%xmm1\n"
                    "movhlps   %%xmm2, %%xmm4\n"
                    "punpcklbw %%xmm4, %%xmm2\n"
                    "movhlps   %%xmm3, %%xmm4\n"
                    "punpcklbw %%xmm4, %%xmm3\n"
                    STORE64
                    : : [dst]"r"(dst+2*x),
                        [src1]"r"(srcu+x), [src2]"r"(srcv+x)
                    : "memory",
                      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm7"
                );
        }
#undef LOAD2X32
#undef STORE64

        if (pixel_size == 1)
        {
            for (; x < width; x++) {
                dst[2*x+0] = srcu[x];
                dst[2*x+1] = srcv[x];
            }
        }
        else
        {
            for (; x < width; x+= 2) {
                dst[2*x+0] = srcu[x];
                dst[2*x+1] = srcu[x + 1];
                dst[2*x+2] = srcv[x];
                dst[2*x+3] = srcv[x + 1];
            }
        }
        srcu += srcu_pitch;
        srcv += srcv_pitch;
        dst  += dst_pitch;
    }
}

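/* De-interleave a packed UV plane into separate U and V planes
 * (semi-planar to planar chroma), for 8-bit or 16-bit samples. */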
VLC_SSE
static void SSE_SplitUV(uint8_t *dstu, size_t dstu_pitch,
                        uint8_t *dstv, size_t dstv_pitch,
                        const uint8_t *src, size_t src_pitch,
                        unsigned width, unsigned height, uint8_t pixel_size)
{
    assert(pixel_size == 1 || pixel_size == 2);
    assert(((intptr_t)src & 0xf) == 0 && (src_pitch & 0x0f) == 0);

#define LOAD64 \
    "movdqa  0(%[src]), %%xmm0\n" \
    "movdqa 16(%[src]), %%xmm1\n" \
    "movdqa 32(%[src]), %%xmm2\n" \
    "movdqa 48(%[src]), %%xmm3\n"

#define STORE2X32 \
    "movq   %%xmm0,  0(%[dst1])\n" \
    "movq   %%xmm1,  8(%[dst1])\n" \
    "movhpd %%xmm0,  0(%[dst2])\n" \
    "movhpd %%xmm1,  8(%[dst2])\n" \
    "movq   %%xmm2, 16(%[dst1])\n" \
    "movq   %%xmm3, 24(%[dst1])\n" \
    "movhpd %%xmm2, 16(%[dst2])\n" \
    "movhpd %%xmm3, 24(%[dst2])\n"

#ifdef CAN_COMPILE_SSSE3
    if (vlc_CPU_SSSE3())
    {
        static const uint8_t shuffle_8[] = { 0, 2, 4, 6, 8, 10, 12, 14,
                                             1, 3, 5, 7, 9, 11, 13, 15 };
        static const uint8_t shuffle_16[] = { 0, 1, 4, 5, 8, 9, 12, 13,
                                              2, 3, 6, 7, 10, 11, 14, 15 };
        const uint8_t *shuffle = pixel_size == 1 ? shuffle_8 : shuffle_16;
        for (unsigned y = 0; y < height; y++) {
            unsigned x = 0;
            for (; x < (width & ~31); x += 32) {
                asm volatile (
                    "movdqu (%[shuffle]), %%xmm7\n"
                    LOAD64
                    "pshufb %%xmm7, %%xmm0\n"
                    "pshufb %%xmm7, %%xmm1\n"
                    "pshufb %%xmm7, %%xmm2\n"
                    "pshufb %%xmm7, %%xmm3\n"
                    STORE2X32
                    : : [dst1]"r"(&dstu[x]), [dst2]"r"(&dstv[x]), [src]"r"(&src[2*x]), [shuffle]"r"(shuffle) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7");
            }
            if (pixel_size == 1)
            {
                for (; x < width; x++) {
                    dstu[x] = src[2*x+0];
                    dstv[x] = src[2*x+1];
                }
            }
            else
            {
                for (; x < width; x+= 2) {
                    dstu[x]   = src[2*x+0];
                    dstu[x+1] = src[2*x+1];
                    dstv[x]   = src[2*x+2];
                    dstv[x+1] = src[2*x+3];
                }
            }
            src  += src_pitch;
            dstu += dstu_pitch;
            dstv += dstv_pitch;
        }
    } else
#endif
    {
        assert(pixel_size == 1);
        static const uint8_t mask[] = { 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00,
                                        0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00 };

        for (unsigned y = 0; y < height; y++)
        {
            unsigned x = 0;
            for (; x < (width & ~31); x += 32) {
                asm volatile (
                    "movdqu (%[mask]), %%xmm7\n"
                    LOAD64
                    "movdqa   %%xmm0, %%xmm4\n"
                    "movdqa   %%xmm1, %%xmm5\n"
                    "movdqa   %%xmm2, %%xmm6\n"
                    "psrlw    $8,     %%xmm0\n"
                    "psrlw    $8,     %%xmm1\n"
                    "pand     %%xmm7, %%xmm4\n"
                    "pand     %%xmm7, %%xmm5\n"
                    "pand     %%xmm7, %%xmm6\n"
                    "packuswb %%xmm4, %%xmm0\n"
                    "packuswb %%xmm5, %%xmm1\n"
                    "pand     %%xmm3, %%xmm7\n"
                    "psrlw    $8,     %%xmm2\n"
                    "psrlw    $8,     %%xmm3\n"
                    "packuswb %%xmm6, %%xmm2\n"
                    "packuswb %%xmm7, %%xmm3\n"
                    STORE2X32
                    : : [dst2]"r"(&dstu[x]), [dst1]"r"(&dstv[x]), [src]"r"(&src[2*x]), [mask]"r"(mask) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
            }
            for (; x < width; x++) {
                dstu[x] = src[2*x+0];
                dstv[x] = src[2*x+1];
            }
            src  += src_pitch;
            dstu += dstu_pitch;
            dstv += dstv_pitch;
        }
    }
#undef STORE2X32
#undef LOAD64
}

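/* Copy a whole plane: the source is streamed through the bounce cache in
 * blocks of hstep lines (CopyFromUswc), then written out with Copy2d. */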
static void SSE_CopyPlane(uint8_t *dst, size_t dst_pitch,
                          const uint8_t *src, size_t src_pitch,
                          uint8_t *cache, size_t cache_size,
                          unsigned height, int bitshift)
{
    const size_t copy_pitch = __MIN(src_pitch, dst_pitch);
    assert(copy_pitch > 0);
    const unsigned w16 = (copy_pitch+15) & ~15;
    const unsigned hstep = cache_size / w16;
    const unsigned cache_width = __MIN(src_pitch, cache_size);
    assert(hstep > 0);

    /* If SSE4.1: CopyFromUswc is faster than memcpy */
    if (!vlc_CPU_SSE4_1() && bitshift == 0 && src_pitch == dst_pitch)
        memcpy(dst, src, copy_pitch * height);
    else
    for (unsigned y = 0; y < height; y += hstep) {
        const unsigned hblock = __MIN(hstep, height - y);

        /* Copy a bunch of line into our cache */
        CopyFromUswc(cache, w16, src, src_pitch, cache_width, hblock, bitshift);

        /* Copy from our cache to the destination */
        Copy2d(dst, dst_pitch, cache, w16, copy_pitch, hblock);

        /* */
        src += src_pitch * hblock;
        dst += dst_pitch * hblock;
    }
}

static void
SSE_InterleavePlanes(uint8_t *dst, size_t dst_pitch,
                     const uint8_t *srcu, size_t srcu_pitch,
                     const uint8_t *srcv, size_t srcv_pitch,
                     uint8_t *cache, size_t cache_size,
                     unsigned int height, uint8_t pixel_size, int bitshift)
{
    assert(srcu_pitch == srcv_pitch);
    size_t copy_pitch = __MIN(dst_pitch / 2, srcu_pitch);
    unsigned int const w16 = (srcu_pitch+15) & ~15;
    unsigned int const hstep = (cache_size) / (2*w16);
    const unsigned cacheu_width = __MIN(srcu_pitch, cache_size);
    const unsigned cachev_width = __MIN(srcv_pitch, cache_size);
    assert(hstep > 0);

    for (unsigned int y = 0; y < height; y += hstep)
    {
        unsigned int const hblock = __MIN(hstep, height - y);

        /* Copy a bunch of line into our cache */
        CopyFromUswc(cache, w16, srcu, srcu_pitch, cacheu_width, hblock, bitshift);
        CopyFromUswc(cache+w16*hblock, w16, srcv, srcv_pitch,
                     cachev_width, hblock, bitshift);

        /* Copy from our cache to the destination */
        SSE_InterleaveUV(dst, dst_pitch, cache, w16,
                         cache + w16 * hblock, w16,
                         copy_pitch, hblock, pixel_size);

        /* */
        srcu += hblock * srcu_pitch;
        srcv += hblock * srcv_pitch;
        dst  += hblock * dst_pitch;
    }
}

static void SSE_SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
                            uint8_t *dstv, size_t dstv_pitch,
                            const uint8_t *src, size_t src_pitch,
                            uint8_t *cache, size_t cache_size,
                            unsigned height, uint8_t pixel_size, int bitshift)
{
    size_t copy_pitch = __MIN(__MIN(src_pitch / 2, dstu_pitch), dstv_pitch);
    const unsigned w16 = (src_pitch+15) & ~15;
    const unsigned hstep = cache_size / w16;
    const unsigned cache_width = __MIN(src_pitch, cache_size);
    assert(hstep > 0);

    for (unsigned y = 0; y < height; y += hstep) {
        const unsigned hblock = __MIN(hstep, height - y);

        /* Copy a bunch of line into our cache */
        CopyFromUswc(cache, w16, src, src_pitch, cache_width, hblock, bitshift);

        /* Copy from our cache to the destination */
        SSE_SplitUV(dstu, dstu_pitch, dstv, dstv_pitch,
                    cache, w16, copy_pitch, hblock, pixel_size);

        /* */
        src  += src_pitch  * hblock;
        dstu += dstu_pitch * hblock;
        dstv += dstv_pitch * hblock;
    }
}

static void SSE_Copy420_P_to_P(picture_t *dst, const uint8_t *src[static 3],
                               const size_t src_pitch[static 3], unsigned height,
                               const copy_cache_t *cache)
{
    for (unsigned n = 0; n < 3; n++) {
        const unsigned d = n > 0 ? 2 : 1;
        SSE_CopyPlane(dst->p[n].p_pixels, dst->p[n].i_pitch,
                      src[n], src_pitch[n],
                      cache->buffer, cache->size,
                      (height+d-1)/d, 0);
    }
    asm volatile ("emms");
}

static void SSE_Copy420_SP_to_SP(picture_t *dst, const uint8_t *src[static 2],
                                 const size_t src_pitch[static 2], unsigned height,
                                 const copy_cache_t *cache)
{
    SSE_CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch, src[0], src_pitch[0],
                  cache->buffer, cache->size, height, 0);
    SSE_CopyPlane(dst->p[1].p_pixels, dst->p[1].i_pitch, src[1], src_pitch[1],
                  cache->buffer, cache->size, (height+1) / 2, 0);
    asm volatile ("emms");
}

static void
SSE_Copy420_SP_to_P(picture_t *dest, const uint8_t *src[static 2],
                    const size_t src_pitch[static 2], unsigned int height,
                    uint8_t pixel_size, int bitshift, const copy_cache_t *cache)
{
    SSE_CopyPlane(dest->p[0].p_pixels, dest->p[0].i_pitch,
                  src[0], src_pitch[0], cache->buffer, cache->size, height, bitshift);

    SSE_SplitPlanes(dest->p[1].p_pixels, dest->p[1].i_pitch,
                    dest->p[2].p_pixels, dest->p[2].i_pitch,
                    src[1], src_pitch[1], cache->buffer, cache->size,
                    (height+1) / 2, pixel_size, bitshift);
    asm volatile ("emms");
}

static void SSE_Copy420_P_to_SP(picture_t *dst, const uint8_t *src[static 3],
                                const size_t src_pitch[static 3],
                                unsigned height, uint8_t pixel_size,
                                int bitshift, const copy_cache_t *cache)
{
    SSE_CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch, src[0], src_pitch[0],
                  cache->buffer, cache->size, height, bitshift);
    SSE_InterleavePlanes(dst->p[1].p_pixels, dst->p[1].i_pitch,
                         src[U_PLANE], src_pitch[U_PLANE],
                         src[V_PLANE], src_pitch[V_PLANE],
                         cache->buffer, cache->size, (height+1) / 2, pixel_size, bitshift);
    asm volatile ("emms");
}
#undef COPY64
#endif /* CAN_COMPILE_SSE2 */

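/* Plain C fallback: copy a plane line by line, optionally shifting each
 * 16-bit sample right (bitshift > 0) or left (bitshift < 0). */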
static void CopyPlane(uint8_t *dst, size_t dst_pitch,
                      const uint8_t *src, size_t src_pitch,
                      unsigned height, int bitshift)
{
    const size_t copy_pitch = __MIN(src_pitch, dst_pitch);
    if (bitshift != 0)
    {
        for (unsigned y = 0; y < height; y++)
        {
            uint16_t *dst16 = (uint16_t *) dst;
            const uint16_t *src16 = (const uint16_t *) src;

            if (bitshift > 0)
                for (unsigned x = 0; x < (copy_pitch / 2); x++)
                    *dst16++ = (*src16++) >> (bitshift & 0xf);
            else
                for (unsigned x = 0; x < (copy_pitch / 2); x++)
                    *dst16++ = (*src16++) << ((-bitshift) & 0xf);
            src += src_pitch;
            dst += dst_pitch;
        }
    }
    else if (src_pitch == dst_pitch)
        memcpy(dst, src, copy_pitch * height);
    else
        for (unsigned y = 0; y < height; y++) {
            memcpy(dst, src, copy_pitch);
            src += src_pitch;
            dst += dst_pitch;
        }
}

void CopyPacked(picture_t *dst, const uint8_t *src, const size_t src_pitch,
                unsigned height, const copy_cache_t *cache)
{
    assert(dst);
    assert(src); assert(src_pitch);
    assert(height);

#ifdef CAN_COMPILE_SSE2
    if (vlc_CPU_SSE4_1())
        return SSE_CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch, src, src_pitch,
                             cache->buffer, cache->size, height, 0);
#else
    (void) cache;
#endif
    CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch, src, src_pitch,
              height, 0);
}

void Copy420_SP_to_SP(picture_t *dst, const uint8_t *src[static 2],
                      const size_t src_pitch[static 2], unsigned height,
                      const copy_cache_t *cache)
{
    ASSERT_2PLANES;
#ifdef CAN_COMPILE_SSE2
    if (vlc_CPU_SSE2())
        return SSE_Copy420_SP_to_SP(dst, src, src_pitch, height, cache);
#else
    (void) cache;
#endif

    CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
              src[0], src_pitch[0], height, 0);
    CopyPlane(dst->p[1].p_pixels, dst->p[1].i_pitch,
              src[1], src_pitch[1], (height+1)/2, 0);
}

#define SPLIT_PLANES(type, pitch_den) do { \
    size_t copy_pitch = __MIN(__MIN(src_pitch / pitch_den, dstu_pitch), dstv_pitch); \
    for (unsigned y = 0; y < height; y++) { \
        for (unsigned x = 0; x < copy_pitch; x++) { \
            ((type *) dstu)[x] = ((const type *) src)[2*x+0]; \
            ((type *) dstv)[x] = ((const type *) src)[2*x+1]; \
        } \
        src += src_pitch; \
        dstu += dstu_pitch; \
        dstv += dstv_pitch; \
    } \
} while(0)

#define SPLIT_PLANES_SHIFTR(type, pitch_den, bitshift) do { \
    size_t copy_pitch = __MIN(__MIN(src_pitch / pitch_den, dstu_pitch), dstv_pitch); \
    for (unsigned y = 0; y < height; y++) { \
        for (unsigned x = 0; x < copy_pitch; x++) { \
            ((type *) dstu)[x] = (((const type *) src)[2*x+0]) >> (bitshift); \
            ((type *) dstv)[x] = (((const type *) src)[2*x+1]) >> (bitshift); \
        } \
        src += src_pitch; \
        dstu += dstu_pitch; \
        dstv += dstv_pitch; \
    } \
} while(0)

#define SPLIT_PLANES_SHIFTL(type, pitch_den, bitshift) do { \
    size_t copy_pitch = __MIN(__MIN(src_pitch / pitch_den, dstu_pitch), dstv_pitch); \
    for (unsigned y = 0; y < height; y++) { \
        for (unsigned x = 0; x < copy_pitch; x++) { \
            ((type *) dstu)[x] = (((const type *) src)[2*x+0]) << (bitshift); \
            ((type *) dstv)[x] = (((const type *) src)[2*x+1]) << (bitshift); \
        } \
        src += src_pitch; \
        dstu += dstu_pitch; \
        dstv += dstv_pitch; \
    } \
} while(0)

static void SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
                        uint8_t *dstv, size_t dstv_pitch,
                        const uint8_t *src, size_t src_pitch, unsigned height)
{
    SPLIT_PLANES(uint8_t, 2);
}

static void SplitPlanes16(uint8_t *dstu, size_t dstu_pitch,
                          uint8_t *dstv, size_t dstv_pitch,
                          const uint8_t *src, size_t src_pitch, unsigned height,
                          int bitshift)
{
    if (bitshift == 0)
        SPLIT_PLANES(uint16_t, 4);
    else if (bitshift > 0)
        SPLIT_PLANES_SHIFTR(uint16_t, 4, bitshift & 0xf);
    else
        SPLIT_PLANES_SHIFTL(uint16_t, 4, (-bitshift) & 0xf);
}

void Copy420_SP_to_P(picture_t *dst, const uint8_t *src[static 2],
                     const size_t src_pitch[static 2], unsigned height,
                     const copy_cache_t *cache)
{
    ASSERT_2PLANES;
#ifdef CAN_COMPILE_SSE2
    if (vlc_CPU_SSE2())
        return SSE_Copy420_SP_to_P(dst, src, src_pitch, height, 1, 0, cache);
#else
    VLC_UNUSED(cache);
#endif

    CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
              src[0], src_pitch[0], height, 0);
    SplitPlanes(dst->p[1].p_pixels, dst->p[1].i_pitch,
                dst->p[2].p_pixels, dst->p[2].i_pitch,
                src[1], src_pitch[1], (height+1)/2);
}

void Copy420_16_SP_to_P(picture_t *dst, const uint8_t *src[static 2],
                        const size_t src_pitch[static 2], unsigned height,
                        int bitshift, const copy_cache_t *cache)
{
    ASSERT_2PLANES;
    assert(bitshift >= -6 && bitshift <= 6 && (bitshift % 2 == 0));

#ifdef CAN_COMPILE_SSE3
    if (vlc_CPU_SSSE3())
        return SSE_Copy420_SP_to_P(dst, src, src_pitch, height, 2, bitshift, cache);
#else
    VLC_UNUSED(cache);
#endif

    CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
              src[0], src_pitch[0], height, bitshift);
    SplitPlanes16(dst->p[1].p_pixels, dst->p[1].i_pitch,
                  dst->p[2].p_pixels, dst->p[2].i_pitch,
                  src[1], src_pitch[1], (height+1)/2, bitshift);
}

#define INTERLEAVE_UV() do { \
    for ( unsigned int line = 0; line < copy_lines; line++ ) { \
        for ( unsigned int col = 0; col < copy_pitch; col++ ) { \
            *dstUV++ = *srcU++; \
            *dstUV++ = *srcV++; \
        } \
        dstUV += i_extra_pitch_uv; \
        srcU  += i_extra_pitch_u; \
        srcV  += i_extra_pitch_v; \
    } \
} while(0)

#define INTERLEAVE_UV_SHIFTR(bitshift) do { \
    for ( unsigned int line = 0; line < copy_lines; line++ ) { \
        for ( unsigned int col = 0; col < copy_pitch; col++ ) { \
            *dstUV++ = (*srcU++) >> (bitshift); \
            *dstUV++ = (*srcV++) >> (bitshift); \
        } \
        dstUV += i_extra_pitch_uv; \
        srcU  += i_extra_pitch_u; \
        srcV  += i_extra_pitch_v; \
    } \
} while(0)

#define INTERLEAVE_UV_SHIFTL(bitshift) do { \
    for ( unsigned int line = 0; line < copy_lines; line++ ) { \
        for ( unsigned int col = 0; col < copy_pitch; col++ ) { \
            *dstUV++ = (*srcU++) << (bitshift); \
            *dstUV++ = (*srcV++) << (bitshift); \
        } \
        dstUV += i_extra_pitch_uv; \
        srcU  += i_extra_pitch_u; \
        srcV  += i_extra_pitch_v; \
    } \
} while(0)

void Copy420_P_to_SP(picture_t *dst, const uint8_t *src[static 3],
                     const size_t src_pitch[static 3], unsigned height,
                     const copy_cache_t *cache)
{
    ASSERT_3PLANES;
#ifdef CAN_COMPILE_SSE2
    if (vlc_CPU_SSE2())
        return SSE_Copy420_P_to_SP(dst, src, src_pitch, height, 1, 0, cache);
#else
    (void) cache;
#endif

    CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
              src[0], src_pitch[0], height, 0);

    const unsigned copy_lines = (height+1) / 2;
    unsigned copy_pitch = src_pitch[1];
    if (copy_pitch > (size_t)dst->p[1].i_pitch / 2)
        copy_pitch = dst->p[1].i_pitch / 2;

    const int i_extra_pitch_uv = dst->p[1].i_pitch - 2 * copy_pitch;
    const int i_extra_pitch_u  = src_pitch[U_PLANE] - copy_pitch;
    const int i_extra_pitch_v  = src_pitch[V_PLANE] - copy_pitch;

    uint8_t *dstUV = dst->p[1].p_pixels;
    const uint8_t *srcU = src[U_PLANE];
    const uint8_t *srcV = src[V_PLANE];
    INTERLEAVE_UV();
}

void Copy420_16_P_to_SP(picture_t *dst, const uint8_t *src[static 3],
                        const size_t src_pitch[static 3], unsigned height,
                        int bitshift, const copy_cache_t *cache)
{
    ASSERT_3PLANES;
    assert(bitshift >= -6 && bitshift <= 6 && (bitshift % 2 == 0));
#ifdef CAN_COMPILE_SSE2
    if (vlc_CPU_SSSE3())
        return SSE_Copy420_P_to_SP(dst, src, src_pitch, height, 2, bitshift, cache);
#else
    (void) cache;
#endif

    CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
              src[0], src_pitch[0], height, bitshift);

    const unsigned copy_lines = (height+1) / 2;
    const unsigned copy_pitch = src_pitch[1] / 2;

    const int i_extra_pitch_uv = dst->p[1].i_pitch / 2 - 2 * copy_pitch;
    const int i_extra_pitch_u  = src_pitch[U_PLANE] / 2 - copy_pitch;
    const int i_extra_pitch_v  = src_pitch[V_PLANE] / 2 - copy_pitch;

    uint16_t *dstUV = (void*) dst->p[1].p_pixels;
    const uint16_t *srcU = (const uint16_t *) src[U_PLANE];
    const uint16_t *srcV = (const uint16_t *) src[V_PLANE];

    if (bitshift == 0)
        INTERLEAVE_UV();
    else if (bitshift > 0)
        INTERLEAVE_UV_SHIFTR(bitshift & 0xf);
    else
        INTERLEAVE_UV_SHIFTL((-bitshift) & 0xf);
}

void Copy420_P_to_P(picture_t *dst, const uint8_t *src[static 3],
                    const size_t src_pitch[static 3], unsigned height,
                    const copy_cache_t *cache)
{
    ASSERT_3PLANES;
#ifdef CAN_COMPILE_SSE2
    if (vlc_CPU_SSE2())
        return SSE_Copy420_P_to_P(dst, src, src_pitch, height, cache);
#else
    (void) cache;
#endif

    CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
              src[0], src_pitch[0], height, 0);
    CopyPlane(dst->p[1].p_pixels, dst->p[1].i_pitch,
              src[1], src_pitch[1], (height+1) / 2, 0);
    CopyPlane(dst->p[2].p_pixels, dst->p[2].i_pitch,
              src[2], src_pitch[2], (height+1) / 2, 0);
}

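/* Re-point the planes of an already allocated picture at a single contiguous
 * buffer (e.g. a decoder surface), fixing up pitches and line counts for the
 * supported planar/biplanar 4:2:0 chromas. */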
int picture_UpdatePlanes(picture_t *picture, uint8_t *data, unsigned pitch)
{
    /* fill in buffer info in first plane */
    picture->p->p_pixels = data;
    picture->p->i_pitch  = pitch;
    picture->p->i_lines  = picture->format.i_height;
    assert(picture->p->i_visible_pitch <= picture->p->i_pitch);
    assert(picture->p->i_visible_lines <= picture->p->i_lines);

    /* Fill chroma planes for biplanar YUV */
    if (picture->format.i_chroma == VLC_CODEC_NV12 ||
        picture->format.i_chroma == VLC_CODEC_NV21 ||
        picture->format.i_chroma == VLC_CODEC_P010) {

        for (int n = 1; n < picture->i_planes; n++) {
            const plane_t *o = &picture->p[n-1];
            plane_t *p = &picture->p[n];

            p->p_pixels = o->p_pixels + o->i_lines * o->i_pitch;
            p->i_pitch  = pitch;
            p->i_lines  = picture->format.i_height / 2;
            assert(p->i_visible_pitch <= p->i_pitch);
            assert(p->i_visible_lines <= p->i_lines);
        }
        /* The dx/d3d buffer is always allocated as NV12 */
        if (vlc_fourcc_AreUVPlanesSwapped(picture->format.i_chroma, VLC_CODEC_NV12)) {
            /* TODO : Swap NV21 UV planes to match NV12 */
            return VLC_EGENERIC;
        }
    }
    /* Fill chroma planes for planar YUV */
    else
    if (picture->format.i_chroma == VLC_CODEC_I420 ||
        picture->format.i_chroma == VLC_CODEC_J420 ||
        picture->format.i_chroma == VLC_CODEC_YV12) {

        for (int n = 1; n < picture->i_planes; n++) {
            const plane_t *o = &picture->p[n-1];
            plane_t *p = &picture->p[n];

            p->p_pixels = o->p_pixels + o->i_lines * o->i_pitch;
            p->i_pitch  = pitch / 2;
            p->i_lines  = picture->format.i_height / 2;
        }
        /* The dx/d3d buffer is always allocated as YV12 */
        if (vlc_fourcc_AreUVPlanesSwapped(picture->format.i_chroma, VLC_CODEC_YV12))
            picture_SwapUV( picture );
    }
    return VLC_SUCCESS;
}

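/* Standalone self-test, built when COPY_TEST is defined: fill source pictures
 * with known pixel values, run each conversion, and verify every output pixel
 * (see piccheck() and main() below). */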
#ifdef COPY_TEST

#include <vlc_picture.h>

struct test_dst
{
    vlc_fourcc_t chroma;
    int bitshift;
    union
    {
        void (*conv)(picture_t *, const uint8_t *[], const size_t [], unsigned,
                     const copy_cache_t *);
        void (*conv16)(picture_t *, const uint8_t *[], const size_t [], unsigned, int,
                       const copy_cache_t *);
    };
};

struct test_conv
{
    vlc_fourcc_t src_chroma;
    struct test_dst dsts[3];
};

static const struct test_conv convs[] = {
    { .src_chroma = VLC_CODEC_NV12,
      .dsts = { { VLC_CODEC_I420, 0, .conv = Copy420_SP_to_P },
                { VLC_CODEC_NV12, 0, .conv = Copy420_SP_to_SP } },
    },
    { .src_chroma = VLC_CODEC_I420,
      .dsts = { { VLC_CODEC_I420, 0, .conv = Copy420_P_to_P },
                { VLC_CODEC_NV12, 0, .conv = Copy420_P_to_SP } },
    },
    { .src_chroma = VLC_CODEC_P010,
      .dsts = { { VLC_CODEC_I420_10L, 6, .conv16 = Copy420_16_SP_to_P } },
    },
    { .src_chroma = VLC_CODEC_I420_10L,
      .dsts = { { VLC_CODEC_P010, -6, .conv16 = Copy420_16_P_to_SP } },
    },
};
#define NB_CONVS ARRAY_SIZE(convs)

struct test_size
{
    int i_width;
    int i_height;
    int i_visible_width;
    int i_visible_height;
};
static const struct test_size sizes[] = {
    { 1, 1, 1, 1 },
    { 3, 3, 3, 3 },
    { 65, 39, 65, 39 },
    { 560, 369, 540, 350 },
    { 1274, 721, 1200, 720 },
    { 1920, 1088, 1920, 1080 },
    { 3840, 2160, 3840, 2160 },
#if 0 /* too long */
    { 8192, 8192, 8192, 8192 },
#endif
};
#define NB_SIZES ARRAY_SIZE(sizes)

static void piccheck(picture_t *pic, const vlc_chroma_description_t *dsc,
                     bool init)
{
#define ASSERT_COLOR(good) do { \
    fprintf(stderr, "error: pixel doesn't match @ plane: %d: %d x %d: 0x%X vs 0x%X\n", i, x, y, *(--p), good); \
    assert(!"error: pixel doesn't match"); \
} while(0)

#define PICCHECK(type_u, type_uv, colors_P, color_UV, pitch_den) do { \
    for (int i = 0; i < pic->i_planes; ++i) \
    { \
        const struct plane_t *plane = &pic->p[i]; \
        for (int y = 0; y < plane->i_visible_lines; ++y) \
        { \
            if (pic->i_planes == 2 && i == 1) \
            { \
                type_uv *p = (type_uv *)&plane->p_pixels[y * plane->i_pitch]; \
                for (int x = 0; x < plane->i_visible_pitch / 2 / pitch_den; ++x) \
                    if (init) \
                        *(p++) = color_UV; \
                    else if (*(p++) != color_UV) \
                        ASSERT_COLOR(color_UV); \
            } \
            else \
            { \
                type_u *p = (type_u *) &plane->p_pixels[y * plane->i_pitch]; \
                for (int x = 0; x < plane->i_visible_pitch / pitch_den; ++x) \
                    if (init) \
                        *(p++) = colors_P[i]; \
                    else if (*(p++) != colors_P[i]) \
                        ASSERT_COLOR(colors_P[i]); \
            } \
        } \
    } \
} while (0)

    assert(pic->i_planes == 2 || pic->i_planes == 3);
    assert(dsc->pixel_size == 1 || dsc->pixel_size == 2);

    if (dsc->pixel_size == 1)
    {
        const uint8_t colors_8_P[3] = { 0x42, 0xF1, 0x36 };
        const uint16_t color_8_UV = ntoh16(0xF136);
        PICCHECK(uint8_t, uint16_t, colors_8_P, color_8_UV, 1);
    }
    else
    {
        const unsigned mask = (1 << dsc->pixel_bits) - 1;
        uint16_t colors_16_P[3] = { 0x1042 & mask, 0xF114 & mask, 0x3645 & mask };

        switch (pic->format.i_chroma)
        {
            case VLC_CODEC_P010:
                for (size_t i = 0; i < 3; ++i)
                    colors_16_P[i] <<= 6;
                break;
            case VLC_CODEC_I420_10L:
                break;
            default:
                vlc_assert_unreachable();
        }

        uint32_t color_16_UV = GetDWLE( &colors_16_P[1] );

        PICCHECK(uint16_t, uint32_t, colors_16_P, color_16_UV, 2);
    }
}

static void pic_rsc_destroy(picture_t *pic)
{
    for (unsigned i = 0; i < 3; i++)
        free(pic->p[i].p_pixels);
}

static picture_t *pic_new_unaligned(const video_format_t *fmt)
{
    /* Allocate a non-aligned picture in order to ease buffer overflow detection
     * from the source picture */
    const vlc_chroma_description_t *dsc = vlc_fourcc_GetChromaDescription(fmt->i_chroma);
    assert(dsc);
    picture_resource_t rsc = { .pf_destroy = pic_rsc_destroy };
    for (unsigned i = 0; i < dsc->plane_count; i++)
    {
        rsc.p[i].i_lines = ((fmt->i_visible_height + (dsc->p[i].h.den - 1)) / dsc->p[i].h.den) * dsc->p[i].h.num;
        rsc.p[i].i_pitch = ((fmt->i_visible_width + (dsc->p[i].w.den - 1)) / dsc->p[i].w.den) * dsc->p[i].w.num * dsc->pixel_size;
        rsc.p[i].p_pixels = malloc(rsc.p[i].i_lines * rsc.p[i].i_pitch);
        assert(rsc.p[i].p_pixels);
    }
    return picture_NewFromResource(fmt, &rsc);
}

int main(void)
{
    alarm(10);

#ifndef COPY_TEST_NOOPTIM
    if (!vlc_CPU_SSE2())
    {
        fprintf(stderr, "WARNING: could not test SSE\n");
        return 77;
    }
#endif

    for (size_t i = 0; i < NB_CONVS; ++i)
    {
        const struct test_conv *conv = &convs[i];

        for (size_t j = 0; j < NB_SIZES; ++j)
        {
            const struct test_size *size = &sizes[j];

            const vlc_chroma_description_t *src_dsc =
                vlc_fourcc_GetChromaDescription(conv->src_chroma);
            assert(src_dsc);

            video_format_t fmt;
            video_format_Init(&fmt, 0);
            video_format_Setup(&fmt, conv->src_chroma,
                               size->i_width, size->i_height,
                               size->i_visible_width, size->i_visible_height,
                               1, 1);
            picture_t *src = pic_new_unaligned(&fmt);
            assert(src);
            piccheck(src, src_dsc, true);

            copy_cache_t cache;
            int ret = CopyInitCache(&cache, src->format.i_width
                                    * src_dsc->pixel_size);
            assert(ret == VLC_SUCCESS);

            for (size_t f = 0; conv->dsts[f].chroma != 0; ++f)
            {
                const struct test_dst *test_dst = &conv->dsts[f];

                const vlc_chroma_description_t *dst_dsc =
                    vlc_fourcc_GetChromaDescription(test_dst->chroma);
                assert(dst_dsc);
                fmt.i_chroma = test_dst->chroma;
                picture_t *dst = picture_NewFromFormat(&fmt);
                assert(dst);

                const uint8_t *src_planes[3] = { src->p[Y_PLANE].p_pixels,
                                                 src->p[U_PLANE].p_pixels,
                                                 src->p[V_PLANE].p_pixels };
                const size_t src_pitches[3] = { src->p[Y_PLANE].i_pitch,
                                                src->p[U_PLANE].i_pitch,
                                                src->p[V_PLANE].i_pitch };

                fprintf(stderr, "testing: %u x %u (vis: %u x %u) %4.4s -> %4.4s\n",
                        size->i_width, size->i_height,
                        size->i_visible_width, size->i_visible_height,
                        (const char *) &src->format.i_chroma,
                        (const char *) &dst->format.i_chroma);
                if (test_dst->bitshift == 0)
                    test_dst->conv(dst, src_planes, src_pitches,
                                   src->format.i_visible_height, &cache);
                else
                    test_dst->conv16(dst, src_planes, src_pitches,
                                     src->format.i_visible_height, test_dst->bitshift,
                                     &cache);
                piccheck(dst, dst_dsc, false);
                picture_Release(dst);
            }
            picture_Release(src);
            CopyCleanCache(&cache);
        }
    }

    return 0;
}

#endif