chroma: copy: remove old debug code
[vlc.git] / modules / video_chroma / copy.c
blob72e6b03783ac245d9d38a6ea7ce7b4b380e204fc
1 /*****************************************************************************
2 * copy.c: Fast YV12/NV12 copy
3 *****************************************************************************
4 * Copyright (C) 2010 Laurent Aimar
5 * $Id$
7 * Authors: Laurent Aimar <fenrir _AT_ videolan _DOT_ org>
8 * Victorien Le Couviour--Tuffet <victorien.lecouviour.tuffet@gmail.com>
10 * This program is free software; you can redistribute it and/or modify it
11 * under the terms of the GNU Lesser General Public License as published by
12 * the Free Software Foundation; either version 2.1 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public License
21 * along with this program; if not, write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23 *****************************************************************************/
25 #ifdef HAVE_CONFIG_H
26 # include "config.h"
27 #endif
29 #include <vlc_common.h>
30 #include <vlc_picture.h>
31 #include <vlc_cpu.h>
32 #include <assert.h>
34 #include "copy.h"
/* Sanity checks shared by the public copy entry points. They expand in a
 * scope where `dst`, `src`, `src_pitch` and `height` are visible.
 * Each macro is wrapped in do { } while (0) so that it is a single statement
 * and stays correct under an unbraced if/else. */
#define ASSERT_PLANE(i) do { \
    assert(src[i]); \
    assert(src_pitch[i]); \
} while (0)

#define ASSERT_2PLANES do { \
    assert(dst); \
    ASSERT_PLANE(0); \
    ASSERT_PLANE(1); \
    assert(height); \
} while (0)

#define ASSERT_3PLANES do { \
    ASSERT_2PLANES; \
    ASSERT_PLANE(2); \
} while (0)
48 int CopyInitCache(copy_cache_t *cache, unsigned width)
50 #ifdef CAN_COMPILE_SSE2
51 cache->size = __MAX((width + 0x3f) & ~ 0x3f, 16384);
52 cache->buffer = aligned_alloc(64, cache->size);
53 if (!cache->buffer)
54 return VLC_EGENERIC;
55 #else
56 (void) cache; (void) width;
57 #endif
58 return VLC_SUCCESS;
61 void CopyCleanCache(copy_cache_t *cache)
63 #ifdef CAN_COMPILE_SSE2
64 aligned_free(cache->buffer);
65 cache->buffer = NULL;
66 cache->size = 0;
67 #else
68 (void) cache;
69 #endif
72 #ifdef CAN_COMPILE_SSE2
/* Copy 16/64 bytes from srcp to dstp loading data with the SSE>=2 instruction
 * load and storing data with the SSE>=2 instruction store.
 *
 * `load` and `store` are string literals naming the instruction to emit
 * (e.g. "movdqa", "movdqu", "movntdqa", "movntdq"); the caller is responsible
 * for picking variants that match the alignment of srcp and dstp.
 */
#define COPY16(dstp, srcp, load, store)                 \
    asm volatile (                                      \
        load  "  0(%[src]), %%xmm1\n"                   \
        store " %%xmm1,    0(%[dst])\n"                 \
        :                                               \
        : [dst]"r"(dstp), [src]"r"(srcp)                \
        : "memory", "xmm1")

#define COPY64(dstp, srcp, load, store)                 \
    asm volatile (                                      \
        load  "  0(%[src]), %%xmm1\n"                   \
        load  " 16(%[src]), %%xmm2\n"                   \
        load  " 32(%[src]), %%xmm3\n"                   \
        load  " 48(%[src]), %%xmm4\n"                   \
        store " %%xmm1,    0(%[dst])\n"                 \
        store " %%xmm2,   16(%[dst])\n"                 \
        store " %%xmm3,   32(%[dst])\n"                 \
        store " %%xmm4,   48(%[dst])\n"                 \
        :                                               \
        : [dst]"r"(dstp), [src]"r"(srcp)                \
        : "memory", "xmm1", "xmm2", "xmm3", "xmm4")
/* When an instruction set is not guaranteed by the compiler flags, replace
 * the capability test with one that reads a local variable named `cpu`,
 * which must therefore be in scope wherever these macros are used below. */
#if !defined(__SSE4_1__)
# undef vlc_CPU_SSE4_1
# define vlc_CPU_SSE4_1() ((cpu & VLC_CPU_SSE4_1) != 0)
#endif

#if !defined(__SSSE3__)
# undef vlc_CPU_SSSE3
# define vlc_CPU_SSSE3() ((cpu & VLC_CPU_SSSE3) != 0)
#endif

#if !defined(__SSE2__)
# undef vlc_CPU_SSE2
# define vlc_CPU_SSE2() ((cpu & VLC_CPU_SSE2) != 0)
#endif

/* Test builds can define COPY_TEST_NOOTPIM to make these capability tests
 * report "unsupported" unconditionally. */
#if defined(COPY_TEST_NOOTPIM)
# undef vlc_CPU_SSE4_1
# define vlc_CPU_SSE4_1() (0)
# undef vlc_CPU_SSE3
# define vlc_CPU_SSE3() (0)
# undef vlc_CPU_SSE2
# define vlc_CPU_SSE2() (0)
#endif
118 /* Optimized copy from "Uncacheable Speculative Write Combining" memory
119 * as used by some video surface.
120 * XXX It is really efficient only when SSE4.1 is available.
122 VLC_SSE
123 static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
124 const uint8_t *src, size_t src_pitch,
125 unsigned width, unsigned height,
126 unsigned cpu)
128 #if defined (__SSE4_1__) || !defined(CAN_COMPILE_SSSE3)
129 VLC_UNUSED(cpu);
130 #endif
131 assert(((intptr_t)dst & 0x0f) == 0 && (dst_pitch & 0x0f) == 0);
133 asm volatile ("mfence");
135 for (unsigned y = 0; y < height; y++) {
136 const unsigned unaligned = (-(uintptr_t)src) & 0x0f;
137 unsigned x = unaligned;
139 #ifdef CAN_COMPILE_SSE4_1
140 if (vlc_CPU_SSE4_1()) {
141 if (!unaligned) {
142 for (; x+63 < width; x += 64)
143 COPY64(&dst[x], &src[x], "movntdqa", "movdqa");
144 } else {
145 COPY16(dst, src, "movdqu", "movdqa");
146 for (; x+63 < width; x += 64)
147 COPY64(&dst[x], &src[x], "movntdqa", "movdqu");
149 } else
150 #endif
152 if (!unaligned) {
153 for (; x+63 < width; x += 64)
154 COPY64(&dst[x], &src[x], "movdqa", "movdqa");
155 } else {
156 COPY16(dst, src, "movdqu", "movdqa");
157 for (; x+63 < width; x += 64)
158 COPY64(&dst[x], &src[x], "movdqa", "movdqu");
162 for (; x < width; x++)
163 dst[x] = src[x];
165 src += src_pitch;
166 dst += dst_pitch;
168 asm volatile ("mfence");
171 VLC_SSE
172 static void Copy2d(uint8_t *dst, size_t dst_pitch,
173 const uint8_t *src, size_t src_pitch,
174 unsigned width, unsigned height)
176 assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);
178 for (unsigned y = 0; y < height; y++) {
179 unsigned x = 0;
181 bool unaligned = ((intptr_t)dst & 0x0f) != 0;
182 if (!unaligned) {
183 for (; x+63 < width; x += 64)
184 COPY64(&dst[x], &src[x], "movdqa", "movntdq");
185 } else {
186 for (; x+63 < width; x += 64)
187 COPY64(&dst[x], &src[x], "movdqa", "movdqu");
190 for (; x < width; x++)
191 dst[x] = src[x];
193 src += src_pitch;
194 dst += dst_pitch;
198 VLC_SSE
199 static void
200 SSE_InterleaveUV(uint8_t *dst, size_t dst_pitch,
201 uint8_t *srcu, size_t srcu_pitch,
202 uint8_t *srcv, size_t srcv_pitch,
203 unsigned int width, unsigned int height, uint8_t pixel_size,
204 unsigned int cpu)
206 assert(!((intptr_t)srcu & 0xf) && !(srcu_pitch & 0x0f) &&
207 !((intptr_t)srcv & 0xf) && !(srcv_pitch & 0x0f));
209 #if defined(__SSSE3__) || !defined (CAN_COMPILE_SSSE3)
210 VLC_UNUSED(cpu);
211 #endif
213 static const uint8_t shuffle_8[] = { 0, 8,
214 1, 9,
215 2, 10,
216 3, 11,
217 4, 12,
218 5, 13,
219 6, 14,
220 7, 15 };
221 static const uint8_t shuffle_16[] = { 0, 1, 8, 9,
222 2, 3, 10, 11,
223 4, 5, 12, 13,
224 6, 7, 14, 15 };
225 const uint8_t *shuffle = pixel_size == 1 ? shuffle_8 : shuffle_16;
227 for (unsigned int y = 0; y < height; ++y)
229 unsigned int x;
231 #define LOAD2X32 \
232 "movhpd 0x00(%[src2]), %%xmm0\n" \
233 "movlpd 0x00(%[src1]), %%xmm0\n" \
235 "movhpd 0x08(%[src2]), %%xmm1\n" \
236 "movlpd 0x08(%[src1]), %%xmm1\n" \
238 "movhpd 0x10(%[src2]), %%xmm2\n" \
239 "movlpd 0x10(%[src1]), %%xmm2\n" \
241 "movhpd 0x18(%[src2]), %%xmm3\n" \
242 "movlpd 0x18(%[src1]), %%xmm3\n"
244 #define STORE64 \
245 "movdqu %%xmm0, 0x00(%[dst])\n" \
246 "movdqu %%xmm1, 0x10(%[dst])\n" \
247 "movdqu %%xmm2, 0x20(%[dst])\n" \
248 "movdqu %%xmm3, 0x30(%[dst])\n"
250 #ifdef CAN_COMPILE_SSSE3
251 if (vlc_CPU_SSSE3())
252 for (x = 0; x < (width & ~31); x += 32)
253 asm volatile
255 "movdqu (%[shuffle]), %%xmm7\n"
256 LOAD2X32
257 "pshufb %%xmm7, %%xmm0\n"
258 "pshufb %%xmm7, %%xmm1\n"
259 "pshufb %%xmm7, %%xmm2\n"
260 "pshufb %%xmm7, %%xmm3\n"
261 STORE64
262 : : [dst]"r"(dst+2*x),
263 [src1]"r"(srcu+x), [src2]"r"(srcv+x),
264 [shuffle]"r"(shuffle)
265 : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7"
267 else
268 #endif
271 assert(pixel_size == 1);
272 for (x = 0; x < (width & ~31); x += 32)
273 asm volatile
275 LOAD2X32
276 "movhlps %%xmm0, %%xmm4\n"
277 "punpcklbw %%xmm4, %%xmm0\n"
279 "movhlps %%xmm1, %%xmm4\n"
280 "punpcklbw %%xmm4, %%xmm1\n"
282 "movhlps %%xmm2, %%xmm4\n"
283 "punpcklbw %%xmm4, %%xmm2\n"
285 "movhlps %%xmm3, %%xmm4\n"
286 "punpcklbw %%xmm4, %%xmm3\n"
287 STORE64
288 : : [dst]"r"(dst+2*x),
289 [src1]"r"(srcu+x), [src2]"r"(srcv+x)
290 : "memory",
291 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm7"
294 #undef LOAD2X32
295 #undef STORE64
297 if (pixel_size == 1)
299 for (; x < width; x++) {
300 dst[2*x+0] = srcu[x];
301 dst[2*x+1] = srcv[x];
304 else
306 for (; x < width; x+= 2) {
307 dst[2*x+0] = srcu[x];
308 dst[2*x+1] = srcu[x + 1];
309 dst[2*x+2] = srcv[x];
310 dst[2*x+3] = srcv[x + 1];
313 srcu += srcu_pitch;
314 srcv += srcv_pitch;
315 dst += dst_pitch;
319 VLC_SSE
320 static void SSE_SplitUV(uint8_t *dstu, size_t dstu_pitch,
321 uint8_t *dstv, size_t dstv_pitch,
322 const uint8_t *src, size_t src_pitch,
323 unsigned width, unsigned height, uint8_t pixel_size,
324 unsigned cpu)
326 #if defined(__SSSE3__) || !defined (CAN_COMPILE_SSSE3)
327 VLC_UNUSED(cpu);
328 #endif
329 assert(pixel_size == 1 || pixel_size == 2);
330 assert(((intptr_t)src & 0xf) == 0 && (src_pitch & 0x0f) == 0);
332 #define LOAD64 \
333 "movdqa 0(%[src]), %%xmm0\n" \
334 "movdqa 16(%[src]), %%xmm1\n" \
335 "movdqa 32(%[src]), %%xmm2\n" \
336 "movdqa 48(%[src]), %%xmm3\n"
338 #define STORE2X32 \
339 "movq %%xmm0, 0(%[dst1])\n" \
340 "movq %%xmm1, 8(%[dst1])\n" \
341 "movhpd %%xmm0, 0(%[dst2])\n" \
342 "movhpd %%xmm1, 8(%[dst2])\n" \
343 "movq %%xmm2, 16(%[dst1])\n" \
344 "movq %%xmm3, 24(%[dst1])\n" \
345 "movhpd %%xmm2, 16(%[dst2])\n" \
346 "movhpd %%xmm3, 24(%[dst2])\n"
348 #ifdef CAN_COMPILE_SSSE3
349 if (vlc_CPU_SSSE3())
351 static const uint8_t shuffle_8[] = { 0, 2, 4, 6, 8, 10, 12, 14,
352 1, 3, 5, 7, 9, 11, 13, 15 };
353 static const uint8_t shuffle_16[] = { 0, 1, 4, 5, 8, 9, 12, 13,
354 2, 3, 6, 7, 10, 11, 14, 15 };
355 const uint8_t *shuffle = pixel_size == 1 ? shuffle_8 : shuffle_16;
356 for (unsigned y = 0; y < height; y++) {
357 unsigned x = 0;
358 for (; x < (width & ~31); x += 32) {
359 asm volatile (
360 "movdqu (%[shuffle]), %%xmm7\n"
361 LOAD64
362 "pshufb %%xmm7, %%xmm0\n"
363 "pshufb %%xmm7, %%xmm1\n"
364 "pshufb %%xmm7, %%xmm2\n"
365 "pshufb %%xmm7, %%xmm3\n"
366 STORE2X32
367 : : [dst1]"r"(&dstu[x]), [dst2]"r"(&dstv[x]), [src]"r"(&src[2*x]), [shuffle]"r"(shuffle) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7");
369 if (pixel_size == 1)
371 for (; x < width; x++) {
372 dstu[x] = src[2*x+0];
373 dstv[x] = src[2*x+1];
376 else
378 for (; x < width; x+= 2) {
379 dstu[x] = src[2*x+0];
380 dstu[x+1] = src[2*x+1];
381 dstv[x] = src[2*x+2];
382 dstv[x+1] = src[2*x+3];
385 src += src_pitch;
386 dstu += dstu_pitch;
387 dstv += dstv_pitch;
389 } else
390 #endif
392 assert(pixel_size == 1);
393 static const uint8_t mask[] = { 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00,
394 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00 };
396 for (unsigned y = 0; y < height; y++)
398 unsigned x = 0;
399 for (; x < (width & ~31); x += 32) {
400 asm volatile (
401 "movdqu (%[mask]), %%xmm7\n"
402 LOAD64
403 "movdqa %%xmm0, %%xmm4\n"
404 "movdqa %%xmm1, %%xmm5\n"
405 "movdqa %%xmm2, %%xmm6\n"
406 "psrlw $8, %%xmm0\n"
407 "psrlw $8, %%xmm1\n"
408 "pand %%xmm7, %%xmm4\n"
409 "pand %%xmm7, %%xmm5\n"
410 "pand %%xmm7, %%xmm6\n"
411 "packuswb %%xmm4, %%xmm0\n"
412 "packuswb %%xmm5, %%xmm1\n"
413 "pand %%xmm3, %%xmm7\n"
414 "psrlw $8, %%xmm2\n"
415 "psrlw $8, %%xmm3\n"
416 "packuswb %%xmm6, %%xmm2\n"
417 "packuswb %%xmm7, %%xmm3\n"
418 STORE2X32
419 : : [dst2]"r"(&dstu[x]), [dst1]"r"(&dstv[x]), [src]"r"(&src[2*x]), [mask]"r"(mask) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
421 for (; x < width; x++) {
422 dstu[x] = src[2*x+0];
423 dstv[x] = src[2*x+1];
425 src += src_pitch;
426 dstu += dstu_pitch;
427 dstv += dstv_pitch;
430 #undef STORE2X32
431 #undef LOAD64
/* Copy one plane of `height` rows, going through `cache` when the source and
 * destination pitches differ.
 *
 * Only min(src_pitch, dst_pitch) bytes are copied per row, so a destination
 * narrower than the source is never overrun.  `cache` must be able to hold at
 * least one 16-byte-rounded row (asserted).  `cpu` is forwarded to
 * CopyFromUswc(). */
static void SSE_CopyPlane(uint8_t *dst, size_t dst_pitch,
                          const uint8_t *src, size_t src_pitch,
                          uint8_t *cache, size_t cache_size,
                          unsigned height, unsigned cpu)
{
    const size_t copy_pitch = __MIN(src_pitch, dst_pitch);
    if (copy_pitch == 0)
        return; /* nothing to copy; also avoids dividing by a zero width */

    const unsigned w16 = (copy_pitch+15) & ~15;
    const unsigned hstep = cache_size / w16;
    assert(hstep > 0);

    if (src_pitch == dst_pitch) {
        /* Same layout on both sides: one contiguous copy. */
        memcpy(dst, src, copy_pitch * height);
        return;
    }

    for (unsigned y = 0; y < height; y += hstep) {
        const unsigned hblock = __MIN(hstep, height - y);

        /* Copy a bunch of line into our cache */
        CopyFromUswc(cache, w16,
                     src, src_pitch,
                     copy_pitch, hblock, cpu);

        /* Copy from our cache to the destination */
        Copy2d(dst, dst_pitch,
               cache, w16,
               copy_pitch, hblock);

        /* Advance to the next group of lines */
        src += src_pitch * hblock;
        dst += dst_pitch * hblock;
    }
}
/* Interleave the srcu and srcv planes into dst, `height` rows at a time
 * through `cache`.
 *
 * Both source planes must share the same pitch (asserted).  The cache is
 * split in two halves, one per source plane, so it must hold at least two
 * 16-byte-rounded rows (asserted). */
static void
SSE_InterleavePlanes(uint8_t *dst, size_t dst_pitch,
                     const uint8_t *srcu, size_t srcu_pitch,
                     const uint8_t *srcv, size_t srcv_pitch,
                     uint8_t *cache, size_t cache_size,
                     unsigned int height, uint8_t pixel_size, unsigned int cpu)
{
    assert(srcu_pitch == srcv_pitch);
    unsigned int const w16 = (srcu_pitch+15) & ~15;
    unsigned int const hstep = (cache_size) / (2*w16);
    assert(hstep > 0);

    unsigned int remaining = height;
    while (remaining > 0)
    {
        unsigned int const hblock = __MIN(hstep, remaining);
        uint8_t *const cache_u = cache;
        uint8_t *const cache_v = cache + w16 * hblock;

        /* Copy a bunch of line into our cache */
        CopyFromUswc(cache_u, w16, srcu, srcu_pitch,
                     srcu_pitch, hblock, cpu);
        CopyFromUswc(cache_v, w16, srcv, srcv_pitch,
                     srcv_pitch, hblock, cpu);

        /* Copy from our cache to the destination */
        SSE_InterleaveUV(dst, dst_pitch, cache_u, w16,
                         cache_v, w16, srcu_pitch, hblock, pixel_size,
                         cpu);

        /* Advance to the next group of lines */
        srcu += hblock * srcu_pitch;
        srcv += hblock * srcv_pitch;
        dst += hblock * dst_pitch;
        remaining -= hblock;
    }
}
/* Split the interleaved plane src into dstu and dstv, `height` rows at a
 * time through `cache`.
 *
 * Each source row of src_pitch bytes yields src_pitch / 2 bytes in each
 * destination plane.  The cache must hold at least one 16-byte-rounded
 * source row (asserted). */
static void SSE_SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
                            uint8_t *dstv, size_t dstv_pitch,
                            const uint8_t *src, size_t src_pitch,
                            uint8_t *cache, size_t cache_size,
                            unsigned height, uint8_t pixel_size, unsigned cpu)
{
    const unsigned w16 = (src_pitch+15) & ~15;
    const unsigned hstep = cache_size / w16;
    assert(hstep > 0);

    unsigned remaining = height;
    while (remaining > 0) {
        const unsigned hblock = __MIN(hstep, remaining);

        /* Copy a bunch of line into our cache */
        CopyFromUswc(cache, w16, src, src_pitch,
                     src_pitch, hblock, cpu);

        /* Copy from our cache to the destination */
        SSE_SplitUV(dstu, dstu_pitch, dstv, dstv_pitch,
                    cache, w16, src_pitch / 2, hblock, pixel_size, cpu);

        /* Advance to the next group of lines */
        src  += src_pitch  * hblock;
        dstu += dstu_pitch * hblock;
        dstv += dstv_pitch * hblock;
        remaining -= hblock;
    }
}
527 static void SSE_Copy420_P_to_P(picture_t *dst, const uint8_t *src[static 3],
528 const size_t src_pitch[static 3], unsigned height,
529 const copy_cache_t *cache, unsigned cpu)
531 for (unsigned n = 0; n < 3; n++) {
532 const unsigned d = n > 0 ? 2 : 1;
533 SSE_CopyPlane(dst->p[n].p_pixels, dst->p[n].i_pitch,
534 src[n], src_pitch[n],
535 cache->buffer, cache->size,
536 (height+d-1)/d, cpu);
538 asm volatile ("emms");
542 static void SSE_Copy420_SP_to_SP(picture_t *dst, const uint8_t *src[static 2],
543 const size_t src_pitch[static 2], unsigned height,
544 const copy_cache_t *cache, unsigned cpu)
546 SSE_CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
547 src[0], src_pitch[0],
548 cache->buffer, cache->size,
549 height, cpu);
550 SSE_CopyPlane(dst->p[1].p_pixels, dst->p[1].i_pitch,
551 src[1], src_pitch[1],
552 cache->buffer, cache->size,
553 height/2, cpu);
554 asm volatile ("emms");
557 static void
558 SSE_Copy420_SP_to_P(picture_t *dest, const uint8_t *src[static 2],
559 const size_t src_pitch[static 2], unsigned int height,
560 const copy_cache_t *cache, uint8_t pixel_size,
561 unsigned int cpu)
563 SSE_CopyPlane(dest->p[0].p_pixels, dest->p[0].i_pitch,
564 src[0], src_pitch[0], cache->buffer, cache->size,
565 height, cpu);
566 SSE_SplitPlanes(dest->p[1].p_pixels, dest->p[1].i_pitch,
567 dest->p[2].p_pixels, dest->p[2].i_pitch,
568 src[1], src_pitch[1], cache->buffer, cache->size,
569 height / 2, pixel_size, cpu);
570 asm volatile ("emms");
573 static void SSE_Copy420_P_to_SP(picture_t *dst, const uint8_t *src[static 3],
574 const size_t src_pitch[static 3],
575 unsigned height, const copy_cache_t *cache,
576 uint8_t pixel_size, unsigned cpu)
578 SSE_CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
579 src[0], src_pitch[0],
580 cache->buffer, cache->size,
581 height, cpu);
582 SSE_InterleavePlanes(dst->p[1].p_pixels, dst->p[1].i_pitch,
583 src[U_PLANE], src_pitch[U_PLANE],
584 src[V_PLANE], src_pitch[V_PLANE],
585 cache->buffer, cache->size, height / 2, pixel_size, cpu);
586 asm volatile ("emms");
588 #undef COPY64
589 #endif /* CAN_COMPILE_SSE2 */
/* Plain C copy of one plane of `height` rows.
 *
 * Only min(src_pitch, dst_pitch) bytes are copied per row, so a destination
 * whose rows are narrower than the source rows is never overrun.  When both
 * pitches are equal the whole plane is copied in one go. */
static void CopyPlane(uint8_t *dst, size_t dst_pitch,
                      const uint8_t *src, size_t src_pitch,
                      unsigned height)
{
    const size_t copy_pitch = src_pitch < dst_pitch ? src_pitch : dst_pitch;

    if (src_pitch == dst_pitch) {
        memcpy(dst, src, copy_pitch * height);
        return;
    }

    for (unsigned y = 0; y < height; y++) {
        memcpy(dst, src, copy_pitch);
        src += src_pitch;
        dst += dst_pitch;
    }
}
605 void Copy420_SP_to_SP(picture_t *dst, const uint8_t *src[static 2],
606 const size_t src_pitch[static 2], unsigned height,
607 const copy_cache_t *cache)
609 ASSERT_2PLANES;
610 #ifdef CAN_COMPILE_SSE2
611 unsigned cpu = vlc_CPU();
612 if (vlc_CPU_SSE2())
613 return SSE_Copy420_SP_to_SP(dst, src, src_pitch, height,
614 cache, cpu);
615 #else
616 (void) cache;
617 #endif
619 CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
620 src[0], src_pitch[0], height);
621 CopyPlane(dst->p[1].p_pixels, dst->p[1].i_pitch,
622 src[1], src_pitch[1], height/2);
/* Body shared by SplitPlanes() and SplitPlanes16().
 *
 * Expands in a scope providing dstu, dstu_pitch, dstv, dstv_pitch, src,
 * src_pitch and height.  `type` is the sample type and `pitch_den` the number
 * of source bytes consumed per output sample pair.  For each row, even
 * samples of src go to dstu and odd samples to dstv; the three pointers are
 * advanced by their pitch after every row. */
#define SPLIT_PLANES(type, pitch_den) do { \
    for (unsigned row_ = 0; row_ < height; row_++) { \
        const type *packed_ = (const type *) src; \
        type *first_ = (type *) dstu; \
        type *second_ = (type *) dstv; \
        for (unsigned n_ = 0; n_ < src_pitch / pitch_den; n_++) { \
            first_[n_] = packed_[2*n_+0]; \
            second_[n_] = packed_[2*n_+1]; \
        } \
        src += src_pitch; \
        dstu += dstu_pitch; \
        dstv += dstv_pitch; \
    } \
} while(0)

/* Plain C split of an interleaved plane of 8-bit samples. */
static void SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
                        uint8_t *dstv, size_t dstv_pitch,
                        const uint8_t *src, size_t src_pitch, unsigned height)
{
    SPLIT_PLANES(uint8_t, 2);
}

/* Plain C split of an interleaved plane of 16-bit samples (pitches are
 * still expressed in bytes). */
static void SplitPlanes16(uint8_t *dstu, size_t dstu_pitch,
                          uint8_t *dstv, size_t dstv_pitch,
                          const uint8_t *src, size_t src_pitch, unsigned height)
{
    SPLIT_PLANES(uint16_t, 4);
}
651 void Copy420_SP_to_P(picture_t *dst, const uint8_t *src[static 2],
652 const size_t src_pitch[static 2], unsigned height,
653 const copy_cache_t *cache)
655 ASSERT_2PLANES;
656 #ifdef CAN_COMPILE_SSE2
657 unsigned cpu = vlc_CPU();
659 if (vlc_CPU_SSE2())
660 return SSE_Copy420_SP_to_P(dst, src, src_pitch, height, cache, 1, cpu);
661 #else
662 VLC_UNUSED(cache);
663 #endif
665 CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
666 src[0], src_pitch[0], height);
667 SplitPlanes(dst->p[1].p_pixels, dst->p[1].i_pitch,
668 dst->p[2].p_pixels, dst->p[2].i_pitch,
669 src[1], src_pitch[1], height/2);
672 void Copy420_16_SP_to_P(picture_t *dst, const uint8_t *src[static 2],
673 const size_t src_pitch[static 2], unsigned height,
674 const copy_cache_t *cache)
676 ASSERT_2PLANES;
677 #ifdef CAN_COMPILE_SSE3
678 unsigned cpu = vlc_CPU();
680 if (vlc_CPU_SSE3())
681 return SSE_Copy420_SP_to_P(dst, src, src_pitch, height, cache, 2, cpu);
682 #else
683 VLC_UNUSED(cache);
684 #endif
686 CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
687 src[0], src_pitch[0], height);
688 SplitPlanes16(dst->p[1].p_pixels, dst->p[1].i_pitch,
689 dst->p[2].p_pixels, dst->p[2].i_pitch,
690 src[1], src_pitch[1], height/2);
/* Interleave two sample streams into one.
 *
 * Expands in a scope providing: copy_lines (rows), copy_pitch (samples taken
 * from each source per row), the cursors dstUV, srcU and srcV, and the
 * per-row pointer adjustments i_extra_pitch_uv, i_extra_pitch_u and
 * i_extra_pitch_v (in samples).  The three cursors are advanced in place. */
#define INTERLEAVE_UV() do { \
    for ( unsigned int rows_left_ = copy_lines; rows_left_ > 0; rows_left_-- ) { \
        for ( unsigned int cols_left_ = copy_pitch; cols_left_ > 0; cols_left_-- ) { \
            *dstUV++ = *srcU++; \
            *dstUV++ = *srcV++; \
        } \
        dstUV += i_extra_pitch_uv; \
        srcU  += i_extra_pitch_u; \
        srcV  += i_extra_pitch_v; \
    } \
}while(0)
705 void Copy420_P_to_SP(picture_t *dst, const uint8_t *src[static 3],
706 const size_t src_pitch[static 3], unsigned height,
707 const copy_cache_t *cache)
709 ASSERT_3PLANES;
710 #ifdef CAN_COMPILE_SSE2
711 unsigned cpu = vlc_CPU();
712 if (vlc_CPU_SSE2())
713 return SSE_Copy420_P_to_SP(dst, src, src_pitch, height, cache, 1, cpu);
714 #else
715 (void) cache;
716 #endif
718 CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
719 src[0], src_pitch[0], height);
721 const unsigned copy_lines = height / 2;
722 const unsigned copy_pitch = src_pitch[1];
724 const int i_extra_pitch_uv = dst->p[1].i_pitch - 2 * copy_pitch;
725 const int i_extra_pitch_u = src_pitch[U_PLANE] - copy_pitch;
726 const int i_extra_pitch_v = src_pitch[V_PLANE] - copy_pitch;
728 uint8_t *dstUV = dst->p[1].p_pixels;
729 const uint8_t *srcU = src[U_PLANE];
730 const uint8_t *srcV = src[V_PLANE];
731 INTERLEAVE_UV();
734 void Copy420_16_P_to_SP(picture_t *dst, const uint8_t *src[static 3],
735 const size_t src_pitch[static 3], unsigned height,
736 const copy_cache_t *cache)
738 ASSERT_3PLANES;
739 #ifdef CAN_COMPILE_SSE2
740 unsigned cpu = vlc_CPU();
741 if (vlc_CPU_SSE3())
742 return SSE_Copy420_P_to_SP(dst, src, src_pitch, height, cache, 2, cpu);
743 #else
744 (void) cache;
745 #endif
747 CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
748 src[0], src_pitch[0], height);
750 const unsigned copy_lines = height / 2;
751 const unsigned copy_pitch = src_pitch[1] / 2;
753 const int i_extra_pitch_uv = dst->p[1].i_pitch / 2 - 2 * copy_pitch;
754 const int i_extra_pitch_u = src_pitch[U_PLANE] / 2 - copy_pitch;
755 const int i_extra_pitch_v = src_pitch[V_PLANE] / 2 - copy_pitch;
757 uint16_t *dstUV = (void*) dst->p[1].p_pixels;
758 const uint16_t *srcU = (const uint16_t *) src[U_PLANE];
759 const uint16_t *srcV = (const uint16_t *) src[V_PLANE];
760 INTERLEAVE_UV();
763 void CopyFromI420_10ToP010(picture_t *dst, const uint8_t *src[static 3],
764 const size_t src_pitch[static 3],
765 unsigned height, const copy_cache_t *cache)
767 (void) cache;
769 const int i_extra_pitch_dst_y = (dst->p[0].i_pitch - src_pitch[0]) / 2;
770 const int i_extra_pitch_src_y = (src_pitch[Y_PLANE] - src_pitch[0]) / 2;
771 uint16_t *dstY = (uint16_t *) dst->p[0].p_pixels;
772 const uint16_t *srcY = (const uint16_t *) src[Y_PLANE];
773 for (unsigned y = 0; y < height; y++) {
774 for (unsigned x = 0; x < (src_pitch[0] / 2); x++) {
775 *dstY++ = *srcY++ << 6;
777 dstY += i_extra_pitch_dst_y;
778 srcY += i_extra_pitch_src_y;
781 const unsigned copy_lines = height / 2;
782 const unsigned copy_pitch = src_pitch[1] / 2;
784 const int i_extra_pitch_uv = dst->p[1].i_pitch / 2 - 2 * copy_pitch;
785 const int i_extra_pitch_u = src_pitch[U_PLANE] / 2 - copy_pitch;
786 const int i_extra_pitch_v = src_pitch[V_PLANE] / 2 - copy_pitch;
788 uint16_t *dstUV = (uint16_t *) dst->p[1].p_pixels;
789 const uint16_t *srcU = (const uint16_t *) src[U_PLANE];
790 const uint16_t *srcV = (const uint16_t *) src[V_PLANE];
791 for ( unsigned int line = 0; line < copy_lines; line++ )
793 for ( unsigned int col = 0; col < copy_pitch; col++ )
795 *dstUV++ = *srcU++ << 6;
796 *dstUV++ = *srcV++ << 6;
798 dstUV += i_extra_pitch_uv;
799 srcU += i_extra_pitch_u;
800 srcV += i_extra_pitch_v;
804 void Copy420_P_to_P(picture_t *dst, const uint8_t *src[static 3],
805 const size_t src_pitch[static 3], unsigned height,
806 const copy_cache_t *cache)
808 ASSERT_3PLANES;
809 #ifdef CAN_COMPILE_SSE2
810 unsigned cpu = vlc_CPU();
811 if (vlc_CPU_SSE2())
812 return SSE_Copy420_P_to_P(dst, src, src_pitch, height, cache, cpu);
813 #else
814 (void) cache;
815 #endif
817 CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
818 src[0], src_pitch[0], height);
819 CopyPlane(dst->p[1].p_pixels, dst->p[1].i_pitch,
820 src[1], src_pitch[1], height / 2);
821 CopyPlane(dst->p[2].p_pixels, dst->p[2].i_pitch,
822 src[2], src_pitch[2], height / 2);
825 void picture_SwapUV(picture_t *picture)
827 assert(picture->i_planes == 3);
829 plane_t tmp_plane = picture->p[1];
830 picture->p[1] = picture->p[2];
831 picture->p[2] = tmp_plane;
834 int picture_UpdatePlanes(picture_t *picture, uint8_t *data, unsigned pitch)
836 /* fill in buffer info in first plane */
837 picture->p->p_pixels = data;
838 picture->p->i_pitch = pitch;
839 picture->p->i_lines = picture->format.i_height;
840 assert(picture->p->i_visible_pitch <= picture->p->i_pitch);
841 assert(picture->p->i_visible_lines <= picture->p->i_lines);
843 /* Fill chroma planes for biplanar YUV */
844 if (picture->format.i_chroma == VLC_CODEC_NV12 ||
845 picture->format.i_chroma == VLC_CODEC_NV21 ||
846 picture->format.i_chroma == VLC_CODEC_P010) {
848 for (int n = 1; n < picture->i_planes; n++) {
849 const plane_t *o = &picture->p[n-1];
850 plane_t *p = &picture->p[n];
852 p->p_pixels = o->p_pixels + o->i_lines * o->i_pitch;
853 p->i_pitch = pitch;
854 p->i_lines = picture->format.i_height;
855 assert(p->i_visible_pitch <= p->i_pitch);
856 assert(p->i_visible_lines <= p->i_lines);
858 /* The dx/d3d buffer is always allocated as NV12 */
859 if (vlc_fourcc_AreUVPlanesSwapped(picture->format.i_chroma, VLC_CODEC_NV12)) {
860 /* TODO : Swap NV21 UV planes to match NV12 */
861 return VLC_EGENERIC;
865 /* Fill chroma planes for planar YUV */
866 else
867 if (picture->format.i_chroma == VLC_CODEC_I420 ||
868 picture->format.i_chroma == VLC_CODEC_J420 ||
869 picture->format.i_chroma == VLC_CODEC_YV12) {
871 for (int n = 1; n < picture->i_planes; n++) {
872 const plane_t *o = &picture->p[n-1];
873 plane_t *p = &picture->p[n];
875 p->p_pixels = o->p_pixels + o->i_lines * o->i_pitch;
876 p->i_pitch = pitch / 2;
877 p->i_lines = picture->format.i_height / 2;
879 /* The dx/d3d buffer is always allocated as YV12 */
880 if (vlc_fourcc_AreUVPlanesSwapped(picture->format.i_chroma, VLC_CODEC_YV12)) {
881 uint8_t *p_tmp = picture->p[1].p_pixels;
882 picture->p[1].p_pixels = picture->p[2].p_pixels;
883 picture->p[2].p_pixels = p_tmp;
886 return VLC_SUCCESS;
889 #ifdef COPY_TEST
890 # undef NDEBUG
892 #include <vlc_picture.h>
894 struct test_dst
896 vlc_fourcc_t chroma;
897 void (*conv)(picture_t *, const uint8_t *[], const size_t [], unsigned,
898 const copy_cache_t *);
901 struct test_conv
903 vlc_fourcc_t src_chroma;
904 struct test_dst dsts[3];
907 static const struct test_conv convs[] = {
908 { .src_chroma = VLC_CODEC_NV12,
909 .dsts = { { VLC_CODEC_I420, Copy420_SP_to_P },
910 { VLC_CODEC_NV12, Copy420_SP_to_SP } },
912 { .src_chroma = VLC_CODEC_I420,
913 .dsts = { { VLC_CODEC_I420, Copy420_P_to_P },
914 { VLC_CODEC_NV12, Copy420_P_to_SP } },
916 { .src_chroma = VLC_CODEC_P010,
917 .dsts = { { VLC_CODEC_I420_10B, Copy420_16_SP_to_P } },
919 { .src_chroma = VLC_CODEC_I420_10B,
920 .dsts = { { VLC_CODEC_P010, Copy420_16_P_to_SP } },
923 #define NB_CONVS ARRAY_SIZE(convs)
/* Picture dimensions exercised by the test: full and visible sizes. */
struct test_size
{
    int i_width;
    int i_height;
    int i_visible_width;
    int i_visible_height;
};

static const struct test_size sizes[] = {
    { .i_width = 1,    .i_height = 1,
      .i_visible_width = 1,    .i_visible_height = 1 },
    { .i_width = 3,    .i_height = 3,
      .i_visible_width = 3,    .i_visible_height = 3 },
    { .i_width = 65,   .i_height = 39,
      .i_visible_width = 65,   .i_visible_height = 39 },
    { .i_width = 560,  .i_height = 369,
      .i_visible_width = 540,  .i_visible_height = 350 },
    { .i_width = 1274, .i_height = 721,
      .i_visible_width = 1200, .i_visible_height = 720 },
    { .i_width = 1920, .i_height = 1088,
      .i_visible_width = 1920, .i_visible_height = 1080 },
    { .i_width = 3840, .i_height = 2160,
      .i_visible_width = 3840, .i_visible_height = 2160 },
#if 0 /* too long */
    { .i_width = 8192, .i_height = 8192,
      .i_visible_width = 8192, .i_visible_height = 8192 },
#endif
};
#define NB_SIZES ARRAY_SIZE(sizes)
946 static void piccheck(picture_t *pic, const vlc_chroma_description_t *dsc,
947 bool init)
949 #define ASSERT_COLOR() do { \
950 fprintf(stderr, "error: pixel doesn't match @ plane: %d: %d x %d: %X\n", i, x, y, *(--p)); \
951 assert(!"error: pixel doesn't match"); \
952 } while(0)
954 #define PICCHECK(type_u, type_uv, colors_P, color_UV, pitch_den) do { \
955 for (int i = 0; i < pic->i_planes; ++i) \
957 const struct plane_t *plane = &pic->p[i]; \
958 for (int y = 0; y < plane->i_visible_lines; ++y) \
960 if (pic->i_planes == 2 && i == 1) \
962 type_uv *p = (type_uv *)&plane->p_pixels[y * plane->i_pitch]; \
963 for (int x = 0; x < plane->i_visible_pitch / 2 / pitch_den; ++x) \
964 if (init) \
965 *(p++) = color_UV; \
966 else if (*(p++) != color_UV) \
967 ASSERT_COLOR(); \
969 else \
971 type_u *p = (type_u *) &plane->p_pixels[y * plane->i_pitch]; \
972 for (int x = 0; x < plane->i_visible_pitch / pitch_den; ++x) \
973 if (init) \
974 *(p++) = colors_P[i]; \
975 else if (*(p++) != colors_P[i]) \
976 ASSERT_COLOR(); \
980 } while (0)
982 assert(pic->i_planes == 2 || pic->i_planes == 3);
983 const uint8_t colors_8_P[3] = { 0x42, 0xF1, 0x36 };
984 const uint16_t color_8_UV = 0x36F1;
986 const uint16_t colors_16_P[3] = { 0x4210, 0x14F1, 0x4536 };
987 const uint32_t color_16_UV = 0x453614F1;
989 assert(dsc->pixel_size == 1 || dsc->pixel_size == 2);
990 if (dsc->pixel_size == 1)
991 PICCHECK(uint8_t, uint16_t, colors_8_P, color_8_UV, 1);
992 else
993 PICCHECK(uint16_t, uint32_t, colors_16_P, color_16_UV, 2);
996 static void pic_rsc_destroy(picture_t *pic)
998 for (unsigned i = 0; i < 3; i++)
999 free(pic->p[i].p_pixels);
1000 free(pic);
1003 static picture_t *pic_new_unaligned(const video_format_t *fmt)
1005 /* Allocate a no-aligned picture in order to ease buffer overflow detection
1006 * from the source picture */
1007 const vlc_chroma_description_t *dsc = vlc_fourcc_GetChromaDescription(fmt->i_chroma);
1008 assert(dsc);
1009 picture_resource_t rsc = { .pf_destroy = pic_rsc_destroy };
1010 for (unsigned i = 0; i < dsc->plane_count; i++)
1012 rsc.p[i].i_lines = ((fmt->i_visible_height + 1) & ~ 1) * dsc->p[i].h.num / dsc->p[i].h.den;
1013 rsc.p[i].i_pitch = ((fmt->i_visible_width + 1) & ~ 1) * dsc->pixel_size * dsc->p[i].w.num / dsc->p[i].w.den;
1014 rsc.p[i].p_pixels = malloc(rsc.p[i].i_lines * rsc.p[i].i_pitch);
1015 assert(rsc.p[i].p_pixels);
1017 return picture_NewFromResource(fmt, &rsc);
1020 int main(void)
1022 alarm(10);
1024 unsigned cpu = vlc_CPU();
1025 #ifndef COPY_TEST_NOOTPIM
1026 if (!vlc_CPU_SSE2())
1028 fprintf(stderr, "WARNING: could not test SSE\n");
1029 return 0;
1031 #endif
1033 for (size_t i = 0; i < NB_CONVS; ++i)
1035 const struct test_conv *conv = &convs[i];
1037 for (size_t j = 0; j < NB_SIZES; ++j)
1039 const struct test_size *size = &sizes[j];
1041 const vlc_chroma_description_t *src_dsc =
1042 vlc_fourcc_GetChromaDescription(conv->src_chroma);
1043 assert(src_dsc);
1045 video_format_t fmt;
1046 video_format_Init(&fmt, 0);
1047 video_format_Setup(&fmt, conv->src_chroma,
1048 size->i_width, size->i_height,
1049 size->i_visible_width, size->i_visible_height,
1050 1, 1);
1051 picture_t *src = pic_new_unaligned(&fmt);
1052 assert(src);
1053 piccheck(src, src_dsc, true);
1055 copy_cache_t cache;
1056 int ret = CopyInitCache(&cache, src->format.i_width
1057 * src_dsc->pixel_size);
1058 assert(ret == VLC_SUCCESS);
1060 for (size_t f = 0; conv->dsts[f].chroma != 0; ++f)
1062 const struct test_dst *test_dst= &conv->dsts[f];
1064 const vlc_chroma_description_t *dst_dsc =
1065 vlc_fourcc_GetChromaDescription(test_dst->chroma);
1066 assert(dst_dsc);
1067 fmt.i_chroma = test_dst->chroma;
1068 picture_t *dst = picture_NewFromFormat(&fmt);
1069 assert(dst);
1071 const uint8_t * src_planes[3] = { src->p[Y_PLANE].p_pixels,
1072 src->p[U_PLANE].p_pixels,
1073 src->p[V_PLANE].p_pixels };
1074 const size_t src_pitches[3] = { src->p[Y_PLANE].i_pitch,
1075 src->p[U_PLANE].i_pitch,
1076 src->p[V_PLANE].i_pitch };
1078 fprintf(stderr, "testing: %u x %u (vis: %u x %u) %4.4s -> %4.4s\n",
1079 size->i_width, size->i_height,
1080 size->i_visible_width, size->i_visible_height,
1081 (const char *) &src->format.i_chroma,
1082 (const char *) &dst->format.i_chroma);
1083 test_dst->conv(dst, src_planes, src_pitches,
1084 src->format.i_visible_height, &cache);
1085 piccheck(dst, dst_dsc, false);
1086 picture_Release(dst);
1088 picture_Release(src);
1089 CopyCleanCache(&cache);
1092 return 0;
1095 #endif