1 /*****************************************************************************
2 * copy.c: Fast YV12/NV12 copy
3 *****************************************************************************
4 * Copyright (C) 2010 Laurent Aimar
7 * Authors: Laurent Aimar <fenrir _AT_ videolan _DOT_ org>
8 * Victorien Le Couviour--Tuffet <victorien.lecouviour.tuffet@gmail.com>
10 * This program is free software; you can redistribute it and/or modify it
11 * under the terms of the GNU Lesser General Public License as published by
12 * the Free Software Foundation; either version 2.1 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public License
21 * along with this program; if not, write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23 *****************************************************************************/
29 #include <vlc_common.h>
30 #include <vlc_picture.h>
36 #define ASSERT_PLANE(i) assert(src[i]); \
39 #define ASSERT_2PLANES \
45 #define ASSERT_3PLANES ASSERT_2PLANES; \
/*
 * Prepare the intermediate copy buffer described by *cache.
 *
 * cache: descriptor whose size and buffer members are filled in.
 * width: line width in bytes used to size the buffer.
 *
 * With CAN_COMPILE_SSE2 the size is width rounded up to a multiple of 64,
 * but never less than 16384 bytes, and the buffer is obtained from
 * aligned_alloc() with 64-byte alignment.
 *
 * NOTE(review): the allocation-failure check and the return statements are
 * not visible in this excerpt; the test code at the end of the file compares
 * the result against VLC_SUCCESS.
 */
48 int CopyInitCache(copy_cache_t
*cache
, unsigned width
)
50 #ifdef CAN_COMPILE_SSE2
/* (width + 0x3f) & ~0x3f rounds up to the next multiple of 64 */
51 cache
->size
= __MAX((width
+ 0x3f) & ~ 0x3f, 16384);
52 cache
->buffer
= aligned_alloc(64, cache
->size
);
/* presumably the non-SSE2 branch: both parameters are deliberately unused */
56 (void) cache
; (void) width
;
/*
 * Release the buffer held by *cache.
 *
 * With CAN_COMPILE_SSE2 the buffer is handed to aligned_free().
 * NOTE(review): whatever follows the aligned_free() call (field reset,
 * non-SSE2 branch) is not visible in this excerpt.
 */
61 void CopyCleanCache(copy_cache_t
*cache
)
63 #ifdef CAN_COMPILE_SSE2
64 aligned_free(cache
->buffer
);
72 #ifdef CAN_COMPILE_SSE2
73 /* Copy 16/64 bytes from srcp to dstp loading data with the SSE>=2 instruction
74 * load and storing data with the SSE>=2 instruction store.
76 #define COPY16(dstp, srcp, load, store) \
78 load " 0(%[src]), %%xmm1\n" \
79 store " %%xmm1, 0(%[dst])\n" \
80 : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1")
82 #define COPY64(dstp, srcp, load, store) \
84 load " 0(%[src]), %%xmm1\n" \
85 load " 16(%[src]), %%xmm2\n" \
86 load " 32(%[src]), %%xmm3\n" \
87 load " 48(%[src]), %%xmm4\n" \
88 store " %%xmm1, 0(%[dst])\n" \
89 store " %%xmm2, 16(%[dst])\n" \
90 store " %%xmm3, 32(%[dst])\n" \
91 store " %%xmm4, 48(%[dst])\n" \
92 : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1", "xmm2", "xmm3", "xmm4")
95 # undef vlc_CPU_SSE4_1
96 # define vlc_CPU_SSE4_1() ((cpu & VLC_CPU_SSE4_1) != 0)
100 # undef vlc_CPU_SSSE3
101 # define vlc_CPU_SSSE3() ((cpu & VLC_CPU_SSSE3) != 0)
106 # define vlc_CPU_SSE2() ((cpu & VLC_CPU_SSE2) != 0)
109 #ifdef COPY_TEST_NOOTPIM
110 # undef vlc_CPU_SSE4_1
111 # define vlc_CPU_SSE4_1() (0)
113 # define vlc_CPU_SSE3() (0)
115 # define vlc_CPU_SSE2() (0)
118 /* Optimized copy from "Uncacheable Speculative Write Combining" memory
119 * as used by some video surface.
120 * XXX It is really efficient only when SSE4.1 is available.
123 static void CopyFromUswc(uint8_t *dst
, size_t dst_pitch
,
124 const uint8_t *src
, size_t src_pitch
,
125 unsigned width
, unsigned height
,
128 #if defined (__SSE4_1__) || !defined(CAN_COMPILE_SSSE3)
131 assert(((intptr_t)dst
& 0x0f) == 0 && (dst_pitch
& 0x0f) == 0);
133 asm volatile ("mfence");
135 for (unsigned y
= 0; y
< height
; y
++) {
136 const unsigned unaligned
= (-(uintptr_t)src
) & 0x0f;
137 unsigned x
= unaligned
;
139 #ifdef CAN_COMPILE_SSE4_1
140 if (vlc_CPU_SSE4_1()) {
142 for (; x
+63 < width
; x
+= 64)
143 COPY64(&dst
[x
], &src
[x
], "movntdqa", "movdqa");
145 COPY16(dst
, src
, "movdqu", "movdqa");
146 for (; x
+63 < width
; x
+= 64)
147 COPY64(&dst
[x
], &src
[x
], "movntdqa", "movdqu");
153 for (; x
+63 < width
; x
+= 64)
154 COPY64(&dst
[x
], &src
[x
], "movdqa", "movdqa");
156 COPY16(dst
, src
, "movdqu", "movdqa");
157 for (; x
+63 < width
; x
+= 64)
158 COPY64(&dst
[x
], &src
[x
], "movdqa", "movdqu");
162 for (; x
< width
; x
++)
168 asm volatile ("mfence");
172 static void Copy2d(uint8_t *dst
, size_t dst_pitch
,
173 const uint8_t *src
, size_t src_pitch
,
174 unsigned width
, unsigned height
)
176 assert(((intptr_t)src
& 0x0f) == 0 && (src_pitch
& 0x0f) == 0);
178 for (unsigned y
= 0; y
< height
; y
++) {
181 bool unaligned
= ((intptr_t)dst
& 0x0f) != 0;
183 for (; x
+63 < width
; x
+= 64)
184 COPY64(&dst
[x
], &src
[x
], "movdqa", "movntdq");
186 for (; x
+63 < width
; x
+= 64)
187 COPY64(&dst
[x
], &src
[x
], "movdqa", "movdqu");
190 for (; x
< width
; x
++)
200 SSE_InterleaveUV(uint8_t *dst
, size_t dst_pitch
,
201 uint8_t *srcu
, size_t srcu_pitch
,
202 uint8_t *srcv
, size_t srcv_pitch
,
203 unsigned int width
, unsigned int height
, uint8_t pixel_size
,
206 assert(!((intptr_t)srcu
& 0xf) && !(srcu_pitch
& 0x0f) &&
207 !((intptr_t)srcv
& 0xf) && !(srcv_pitch
& 0x0f));
209 #if defined(__SSSE3__) || !defined (CAN_COMPILE_SSSE3)
213 static const uint8_t shuffle_8
[] = { 0, 8,
221 static const uint8_t shuffle_16
[] = { 0, 1, 8, 9,
225 const uint8_t *shuffle
= pixel_size
== 1 ? shuffle_8
: shuffle_16
;
227 for (unsigned int y
= 0; y
< height
; ++y
)
232 "movhpd 0x00(%[src2]), %%xmm0\n" \
233 "movlpd 0x00(%[src1]), %%xmm0\n" \
235 "movhpd 0x08(%[src2]), %%xmm1\n" \
236 "movlpd 0x08(%[src1]), %%xmm1\n" \
238 "movhpd 0x10(%[src2]), %%xmm2\n" \
239 "movlpd 0x10(%[src1]), %%xmm2\n" \
241 "movhpd 0x18(%[src2]), %%xmm3\n" \
242 "movlpd 0x18(%[src1]), %%xmm3\n"
245 "movdqu %%xmm0, 0x00(%[dst])\n" \
246 "movdqu %%xmm1, 0x10(%[dst])\n" \
247 "movdqu %%xmm2, 0x20(%[dst])\n" \
248 "movdqu %%xmm3, 0x30(%[dst])\n"
250 #ifdef CAN_COMPILE_SSSE3
252 for (x
= 0; x
< (width
& ~31); x
+= 32)
255 "movdqu (%[shuffle]), %%xmm7\n"
257 "pshufb %%xmm7, %%xmm0\n"
258 "pshufb %%xmm7, %%xmm1\n"
259 "pshufb %%xmm7, %%xmm2\n"
260 "pshufb %%xmm7, %%xmm3\n"
262 : : [dst
]"r"(dst
+2*x
),
263 [src1
]"r"(srcu
+x
), [src2
]"r"(srcv
+x
),
264 [shuffle
]"r"(shuffle
)
265 : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7"
271 assert(pixel_size
== 1);
272 for (x
= 0; x
< (width
& ~31); x
+= 32)
276 "movhlps %%xmm0, %%xmm4\n"
277 "punpcklbw %%xmm4, %%xmm0\n"
279 "movhlps %%xmm1, %%xmm4\n"
280 "punpcklbw %%xmm4, %%xmm1\n"
282 "movhlps %%xmm2, %%xmm4\n"
283 "punpcklbw %%xmm4, %%xmm2\n"
285 "movhlps %%xmm3, %%xmm4\n"
286 "punpcklbw %%xmm4, %%xmm3\n"
288 : : [dst
]"r"(dst
+2*x
),
289 [src1
]"r"(srcu
+x
), [src2
]"r"(srcv
+x
)
291 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm7"
299 for (; x
< width
; x
++) {
300 dst
[2*x
+0] = srcu
[x
];
301 dst
[2*x
+1] = srcv
[x
];
306 for (; x
< width
; x
+= 2) {
307 dst
[2*x
+0] = srcu
[x
];
308 dst
[2*x
+1] = srcu
[x
+ 1];
309 dst
[2*x
+2] = srcv
[x
];
310 dst
[2*x
+3] = srcv
[x
+ 1];
320 static void SSE_SplitUV(uint8_t *dstu
, size_t dstu_pitch
,
321 uint8_t *dstv
, size_t dstv_pitch
,
322 const uint8_t *src
, size_t src_pitch
,
323 unsigned width
, unsigned height
, uint8_t pixel_size
,
326 #if defined(__SSSE3__) || !defined (CAN_COMPILE_SSSE3)
329 assert(pixel_size
== 1 || pixel_size
== 2);
330 assert(((intptr_t)src
& 0xf) == 0 && (src_pitch
& 0x0f) == 0);
333 "movdqa 0(%[src]), %%xmm0\n" \
334 "movdqa 16(%[src]), %%xmm1\n" \
335 "movdqa 32(%[src]), %%xmm2\n" \
336 "movdqa 48(%[src]), %%xmm3\n"
339 "movq %%xmm0, 0(%[dst1])\n" \
340 "movq %%xmm1, 8(%[dst1])\n" \
341 "movhpd %%xmm0, 0(%[dst2])\n" \
342 "movhpd %%xmm1, 8(%[dst2])\n" \
343 "movq %%xmm2, 16(%[dst1])\n" \
344 "movq %%xmm3, 24(%[dst1])\n" \
345 "movhpd %%xmm2, 16(%[dst2])\n" \
346 "movhpd %%xmm3, 24(%[dst2])\n"
348 #ifdef CAN_COMPILE_SSSE3
351 static const uint8_t shuffle_8
[] = { 0, 2, 4, 6, 8, 10, 12, 14,
352 1, 3, 5, 7, 9, 11, 13, 15 };
353 static const uint8_t shuffle_16
[] = { 0, 1, 4, 5, 8, 9, 12, 13,
354 2, 3, 6, 7, 10, 11, 14, 15 };
355 const uint8_t *shuffle
= pixel_size
== 1 ? shuffle_8
: shuffle_16
;
356 for (unsigned y
= 0; y
< height
; y
++) {
358 for (; x
< (width
& ~31); x
+= 32) {
360 "movdqu (%[shuffle]), %%xmm7\n"
362 "pshufb %%xmm7, %%xmm0\n"
363 "pshufb %%xmm7, %%xmm1\n"
364 "pshufb %%xmm7, %%xmm2\n"
365 "pshufb %%xmm7, %%xmm3\n"
367 : : [dst1
]"r"(&dstu
[x
]), [dst2
]"r"(&dstv
[x
]), [src
]"r"(&src
[2*x
]), [shuffle
]"r"(shuffle
) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7");
371 for (; x
< width
; x
++) {
372 dstu
[x
] = src
[2*x
+0];
373 dstv
[x
] = src
[2*x
+1];
378 for (; x
< width
; x
+= 2) {
379 dstu
[x
] = src
[2*x
+0];
380 dstu
[x
+1] = src
[2*x
+1];
381 dstv
[x
] = src
[2*x
+2];
382 dstv
[x
+1] = src
[2*x
+3];
392 assert(pixel_size
== 1);
393 static const uint8_t mask
[] = { 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00,
394 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00 };
396 for (unsigned y
= 0; y
< height
; y
++)
399 for (; x
< (width
& ~31); x
+= 32) {
401 "movdqu (%[mask]), %%xmm7\n"
403 "movdqa %%xmm0, %%xmm4\n"
404 "movdqa %%xmm1, %%xmm5\n"
405 "movdqa %%xmm2, %%xmm6\n"
408 "pand %%xmm7, %%xmm4\n"
409 "pand %%xmm7, %%xmm5\n"
410 "pand %%xmm7, %%xmm6\n"
411 "packuswb %%xmm4, %%xmm0\n"
412 "packuswb %%xmm5, %%xmm1\n"
413 "pand %%xmm3, %%xmm7\n"
416 "packuswb %%xmm6, %%xmm2\n"
417 "packuswb %%xmm7, %%xmm3\n"
419 : : [dst2
]"r"(&dstu
[x
]), [dst1
]"r"(&dstv
[x
]), [src
]"r"(&src
[2*x
]), [mask
]"r"(mask
) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
421 for (; x
< width
; x
++) {
422 dstu
[x
] = src
[2*x
+0];
423 dstv
[x
] = src
[2*x
+1];
434 static void SSE_CopyPlane(uint8_t *dst
, size_t dst_pitch
,
435 const uint8_t *src
, size_t src_pitch
,
436 uint8_t *cache
, size_t cache_size
,
437 unsigned height
, unsigned cpu
)
439 const unsigned w16
= (src_pitch
+15) & ~15;
440 const unsigned hstep
= cache_size
/ w16
;
443 if (src_pitch
== dst_pitch
)
444 memcpy(dst
, src
, src_pitch
* height
);
446 for (unsigned y
= 0; y
< height
; y
+= hstep
) {
447 const unsigned hblock
= __MIN(hstep
, height
- y
);
449 /* Copy a bunch of lines into our cache */
450 CopyFromUswc(cache
, w16
,
452 src_pitch
, hblock
, cpu
);
454 /* Copy from our cache to the destination */
455 Copy2d(dst
, dst_pitch
,
460 src
+= src_pitch
* hblock
;
461 dst
+= dst_pitch
* hblock
;
466 SSE_InterleavePlanes(uint8_t *dst
, size_t dst_pitch
,
467 const uint8_t *srcu
, size_t srcu_pitch
,
468 const uint8_t *srcv
, size_t srcv_pitch
,
469 uint8_t *cache
, size_t cache_size
,
470 unsigned int height
, uint8_t pixel_size
, unsigned int cpu
)
472 assert(srcu_pitch
== srcv_pitch
);
473 unsigned int const w16
= (srcu_pitch
+15) & ~15;
474 unsigned int const hstep
= (cache_size
) / (2*w16
);
477 for (unsigned int y
= 0; y
< height
; y
+= hstep
)
479 unsigned int const hblock
= __MIN(hstep
, height
- y
);
481 /* Copy a bunch of lines into our cache */
482 CopyFromUswc(cache
, w16
, srcu
, srcu_pitch
,
483 srcu_pitch
, hblock
, cpu
);
484 CopyFromUswc(cache
+w16
*hblock
, w16
, srcv
, srcv_pitch
,
485 srcv_pitch
, hblock
, cpu
);
487 /* Copy from our cache to the destination */
488 SSE_InterleaveUV(dst
, dst_pitch
, cache
, w16
,
489 cache
+w16
*hblock
, w16
, srcu_pitch
, hblock
, pixel_size
,
493 srcu
+= hblock
* srcu_pitch
;
494 srcv
+= hblock
* srcv_pitch
;
495 dst
+= hblock
* dst_pitch
;
499 static void SSE_SplitPlanes(uint8_t *dstu
, size_t dstu_pitch
,
500 uint8_t *dstv
, size_t dstv_pitch
,
501 const uint8_t *src
, size_t src_pitch
,
502 uint8_t *cache
, size_t cache_size
,
503 unsigned height
, uint8_t pixel_size
, unsigned cpu
)
505 const unsigned w16
= (src_pitch
+15) & ~15;
506 const unsigned hstep
= cache_size
/ w16
;
509 for (unsigned y
= 0; y
< height
; y
+= hstep
) {
510 const unsigned hblock
= __MIN(hstep
, height
- y
);
512 /* Copy a bunch of lines into our cache */
513 CopyFromUswc(cache
, w16
, src
, src_pitch
,
514 src_pitch
, hblock
, cpu
);
516 /* Copy from our cache to the destination */
517 SSE_SplitUV(dstu
, dstu_pitch
, dstv
, dstv_pitch
,
518 cache
, w16
, src_pitch
/ 2, hblock
, pixel_size
, cpu
);
521 src
+= src_pitch
* hblock
;
522 dstu
+= dstu_pitch
* hblock
;
523 dstv
+= dstv_pitch
* hblock
;
527 static void SSE_Copy420_P_to_P(picture_t
*dst
, const uint8_t *src
[static 3],
528 const size_t src_pitch
[static 3], unsigned height
,
529 const copy_cache_t
*cache
, unsigned cpu
)
531 for (unsigned n
= 0; n
< 3; n
++) {
532 const unsigned d
= n
> 0 ? 2 : 1;
533 SSE_CopyPlane(dst
->p
[n
].p_pixels
, dst
->p
[n
].i_pitch
,
534 src
[n
], src_pitch
[n
],
535 cache
->buffer
, cache
->size
,
536 (height
+d
-1)/d
, cpu
);
538 asm volatile ("emms");
542 static void SSE_Copy420_SP_to_SP(picture_t
*dst
, const uint8_t *src
[static 2],
543 const size_t src_pitch
[static 2], unsigned height
,
544 const copy_cache_t
*cache
, unsigned cpu
)
546 SSE_CopyPlane(dst
->p
[0].p_pixels
, dst
->p
[0].i_pitch
,
547 src
[0], src_pitch
[0],
548 cache
->buffer
, cache
->size
,
550 SSE_CopyPlane(dst
->p
[1].p_pixels
, dst
->p
[1].i_pitch
,
551 src
[1], src_pitch
[1],
552 cache
->buffer
, cache
->size
,
554 asm volatile ("emms");
558 SSE_Copy420_SP_to_P(picture_t
*dest
, const uint8_t *src
[static 2],
559 const size_t src_pitch
[static 2], unsigned int height
,
560 const copy_cache_t
*cache
, uint8_t pixel_size
,
563 SSE_CopyPlane(dest
->p
[0].p_pixels
, dest
->p
[0].i_pitch
,
564 src
[0], src_pitch
[0], cache
->buffer
, cache
->size
,
566 SSE_SplitPlanes(dest
->p
[1].p_pixels
, dest
->p
[1].i_pitch
,
567 dest
->p
[2].p_pixels
, dest
->p
[2].i_pitch
,
568 src
[1], src_pitch
[1], cache
->buffer
, cache
->size
,
569 height
/ 2, pixel_size
, cpu
);
570 asm volatile ("emms");
573 static void SSE_Copy420_P_to_SP(picture_t
*dst
, const uint8_t *src
[static 3],
574 const size_t src_pitch
[static 3],
575 unsigned height
, const copy_cache_t
*cache
,
576 uint8_t pixel_size
, unsigned cpu
)
578 SSE_CopyPlane(dst
->p
[0].p_pixels
, dst
->p
[0].i_pitch
,
579 src
[0], src_pitch
[0],
580 cache
->buffer
, cache
->size
,
582 SSE_InterleavePlanes(dst
->p
[1].p_pixels
, dst
->p
[1].i_pitch
,
583 src
[U_PLANE
], src_pitch
[U_PLANE
],
584 src
[V_PLANE
], src_pitch
[V_PLANE
],
585 cache
->buffer
, cache
->size
, height
/ 2, pixel_size
, cpu
);
586 asm volatile ("emms");
589 #endif /* CAN_COMPILE_SSE2 */
/**
 * Plain C copy of one plane.
 *
 * @param dst        first byte of the destination plane
 * @param dst_pitch  distance in bytes between two destination rows
 * @param src        first byte of the source plane
 * @param src_pitch  distance in bytes between two source rows
 * @param height     number of rows to copy
 *
 * When both pitches are equal the plane is contiguous on both sides and is
 * copied with a single memcpy(). Otherwise rows are copied one by one; only
 * the bytes that exist in BOTH rows are copied, so a destination with a
 * smaller pitch than the source is never written past the end of its row
 * (and therefore never past the end of its last row).
 */
static void CopyPlane(uint8_t *dst, size_t dst_pitch,
                      const uint8_t *src, size_t src_pitch,
                      unsigned height)
{
    if (src_pitch == dst_pitch) {
        memcpy(dst, src, src_pitch * height);
        return;
    }

    /* Never write more than a destination row can hold. */
    const size_t row_bytes = src_pitch < dst_pitch ? src_pitch : dst_pitch;

    for (unsigned y = 0; y < height; y++) {
        memcpy(dst, src, row_bytes);
        src += src_pitch;
        dst += dst_pitch;
    }
}
605 void Copy420_SP_to_SP(picture_t
*dst
, const uint8_t *src
[static 2],
606 const size_t src_pitch
[static 2], unsigned height
,
607 const copy_cache_t
*cache
)
610 #ifdef CAN_COMPILE_SSE2
611 unsigned cpu
= vlc_CPU();
613 return SSE_Copy420_SP_to_SP(dst
, src
, src_pitch
, height
,
619 CopyPlane(dst
->p
[0].p_pixels
, dst
->p
[0].i_pitch
,
620 src
[0], src_pitch
[0], height
);
621 CopyPlane(dst
->p
[1].p_pixels
, dst
->p
[1].i_pitch
,
622 src
[1], src_pitch
[1], height
/2);
/*
 * De-interleave a packed UV plane into separate U and V planes.
 *
 * Expands in a scope that provides: dstu, dstv (uint8_t *), src
 * (const uint8_t *), dstu_pitch, dstv_pitch, src_pitch (bytes) and height
 * (rows). All three pointers are advanced by one row per iteration.
 *
 * type:      sample type (uint8_t or uint16_t)
 * pitch_den: number of source bytes per U/V pair, i.e. 2 * sizeof(type);
 *            src_pitch / pitch_den is the number of pairs per row.
 */
#define SPLIT_PLANES(type, pitch_den) do { \
    for (unsigned y = 0; y < height; y++) { \
        for (unsigned x = 0; x < src_pitch / (pitch_den); x++) { \
            ((type *) dstu)[x] = ((const type *) src)[2*x+0]; \
            ((type *) dstv)[x] = ((const type *) src)[2*x+1]; \
        } \
        src  += src_pitch; \
        dstu += dstu_pitch; \
        dstv += dstv_pitch; \
    } \
} while(0)
637 static void SplitPlanes(uint8_t *dstu
, size_t dstu_pitch
,
638 uint8_t *dstv
, size_t dstv_pitch
,
639 const uint8_t *src
, size_t src_pitch
, unsigned height
)
641 SPLIT_PLANES(uint8_t, 2);
644 static void SplitPlanes16(uint8_t *dstu
, size_t dstu_pitch
,
645 uint8_t *dstv
, size_t dstv_pitch
,
646 const uint8_t *src
, size_t src_pitch
, unsigned height
)
648 SPLIT_PLANES(uint16_t, 4);
651 void Copy420_SP_to_P(picture_t
*dst
, const uint8_t *src
[static 2],
652 const size_t src_pitch
[static 2], unsigned height
,
653 const copy_cache_t
*cache
)
656 #ifdef CAN_COMPILE_SSE2
657 unsigned cpu
= vlc_CPU();
660 return SSE_Copy420_SP_to_P(dst
, src
, src_pitch
, height
, cache
, 1, cpu
);
665 CopyPlane(dst
->p
[0].p_pixels
, dst
->p
[0].i_pitch
,
666 src
[0], src_pitch
[0], height
);
667 SplitPlanes(dst
->p
[1].p_pixels
, dst
->p
[1].i_pitch
,
668 dst
->p
[2].p_pixels
, dst
->p
[2].i_pitch
,
669 src
[1], src_pitch
[1], height
/2);
672 void Copy420_16_SP_to_P(picture_t
*dst
, const uint8_t *src
[static 2],
673 const size_t src_pitch
[static 2], unsigned height
,
674 const copy_cache_t
*cache
)
677 #ifdef CAN_COMPILE_SSE3
678 unsigned cpu
= vlc_CPU();
681 return SSE_Copy420_SP_to_P(dst
, src
, src_pitch
, height
, cache
, 2, cpu
);
686 CopyPlane(dst
->p
[0].p_pixels
, dst
->p
[0].i_pitch
,
687 src
[0], src_pitch
[0], height
);
688 SplitPlanes16(dst
->p
[1].p_pixels
, dst
->p
[1].i_pitch
,
689 dst
->p
[2].p_pixels
, dst
->p
[2].i_pitch
,
690 src
[1], src_pitch
[1], height
/2);
/*
 * Interleave separate U and V planes into one packed UV plane.
 *
 * Expands in a scope that provides:
 *   copy_lines        number of rows to process
 *   copy_pitch        number of U (and V) samples per row
 *   dstUV, srcU, srcV running sample pointers (advanced by the macro)
 *   i_extra_pitch_uv / _u / _v
 *                     per-row padding, in samples, skipped after each row
 */
#define INTERLEAVE_UV() do { \
    for ( unsigned int line = 0; line < copy_lines; line++ ) { \
        for ( unsigned int col = 0; col < copy_pitch; col++ ) { \
            *dstUV++ = *srcU++; \
            *dstUV++ = *srcV++; \
        } \
        dstUV += i_extra_pitch_uv; \
        srcU  += i_extra_pitch_u; \
        srcV  += i_extra_pitch_v; \
    } \
} while(0)
705 void Copy420_P_to_SP(picture_t
*dst
, const uint8_t *src
[static 3],
706 const size_t src_pitch
[static 3], unsigned height
,
707 const copy_cache_t
*cache
)
710 #ifdef CAN_COMPILE_SSE2
711 unsigned cpu
= vlc_CPU();
713 return SSE_Copy420_P_to_SP(dst
, src
, src_pitch
, height
, cache
, 1, cpu
);
718 CopyPlane(dst
->p
[0].p_pixels
, dst
->p
[0].i_pitch
,
719 src
[0], src_pitch
[0], height
);
721 const unsigned copy_lines
= height
/ 2;
722 const unsigned copy_pitch
= src_pitch
[1];
724 const int i_extra_pitch_uv
= dst
->p
[1].i_pitch
- 2 * copy_pitch
;
725 const int i_extra_pitch_u
= src_pitch
[U_PLANE
] - copy_pitch
;
726 const int i_extra_pitch_v
= src_pitch
[V_PLANE
] - copy_pitch
;
728 uint8_t *dstUV
= dst
->p
[1].p_pixels
;
729 const uint8_t *srcU
= src
[U_PLANE
];
730 const uint8_t *srcV
= src
[V_PLANE
];
734 void Copy420_16_P_to_SP(picture_t
*dst
, const uint8_t *src
[static 3],
735 const size_t src_pitch
[static 3], unsigned height
,
736 const copy_cache_t
*cache
)
739 #ifdef CAN_COMPILE_SSE2
740 unsigned cpu
= vlc_CPU();
742 return SSE_Copy420_P_to_SP(dst
, src
, src_pitch
, height
, cache
, 2, cpu
);
747 CopyPlane(dst
->p
[0].p_pixels
, dst
->p
[0].i_pitch
,
748 src
[0], src_pitch
[0], height
);
750 const unsigned copy_lines
= height
/ 2;
751 const unsigned copy_pitch
= src_pitch
[1] / 2;
753 const int i_extra_pitch_uv
= dst
->p
[1].i_pitch
/ 2 - 2 * copy_pitch
;
754 const int i_extra_pitch_u
= src_pitch
[U_PLANE
] / 2 - copy_pitch
;
755 const int i_extra_pitch_v
= src_pitch
[V_PLANE
] / 2 - copy_pitch
;
757 uint16_t *dstUV
= (void*) dst
->p
[1].p_pixels
;
758 const uint16_t *srcU
= (const uint16_t *) src
[U_PLANE
];
759 const uint16_t *srcV
= (const uint16_t *) src
[V_PLANE
];
763 void CopyFromI420_10ToP010(picture_t
*dst
, const uint8_t *src
[static 3],
764 const size_t src_pitch
[static 3],
765 unsigned height
, const copy_cache_t
*cache
)
769 const int i_extra_pitch_dst_y
= (dst
->p
[0].i_pitch
- src_pitch
[0]) / 2;
770 const int i_extra_pitch_src_y
= (src_pitch
[Y_PLANE
] - src_pitch
[0]) / 2;
771 uint16_t *dstY
= (uint16_t *) dst
->p
[0].p_pixels
;
772 const uint16_t *srcY
= (const uint16_t *) src
[Y_PLANE
];
773 for (unsigned y
= 0; y
< height
; y
++) {
774 for (unsigned x
= 0; x
< (src_pitch
[0] / 2); x
++) {
775 *dstY
++ = *srcY
++ << 6;
777 dstY
+= i_extra_pitch_dst_y
;
778 srcY
+= i_extra_pitch_src_y
;
781 const unsigned copy_lines
= height
/ 2;
782 const unsigned copy_pitch
= src_pitch
[1] / 2;
784 const int i_extra_pitch_uv
= dst
->p
[1].i_pitch
/ 2 - 2 * copy_pitch
;
785 const int i_extra_pitch_u
= src_pitch
[U_PLANE
] / 2 - copy_pitch
;
786 const int i_extra_pitch_v
= src_pitch
[V_PLANE
] / 2 - copy_pitch
;
788 uint16_t *dstUV
= (uint16_t *) dst
->p
[1].p_pixels
;
789 const uint16_t *srcU
= (const uint16_t *) src
[U_PLANE
];
790 const uint16_t *srcV
= (const uint16_t *) src
[V_PLANE
];
791 for ( unsigned int line
= 0; line
< copy_lines
; line
++ )
793 for ( unsigned int col
= 0; col
< copy_pitch
; col
++ )
795 *dstUV
++ = *srcU
++ << 6;
796 *dstUV
++ = *srcV
++ << 6;
798 dstUV
+= i_extra_pitch_uv
;
799 srcU
+= i_extra_pitch_u
;
800 srcV
+= i_extra_pitch_v
;
/*
 * Copy a planar 4:2:0 source (three planes) into a planar destination
 * picture.
 *
 * dst:       destination picture, planes 0..2 are written
 * src:       the three source plane pointers
 * src_pitch: pitch in bytes of each source plane
 * height:    number of luma rows
 * cache:     intermediate buffer handed to the SSE implementation
 *
 * NOTE(review): the condition guarding the SSE call and the #else/#endif
 * lines are not visible in this excerpt.
 */
804 void Copy420_P_to_P(picture_t
*dst
, const uint8_t *src
[static 3],
805 const size_t src_pitch
[static 3], unsigned height
,
806 const copy_cache_t
*cache
)
809 #ifdef CAN_COMPILE_SSE2
/* CPU capability flags are passed down to the SSE implementation */
810 unsigned cpu
= vlc_CPU();
812 return SSE_Copy420_P_to_P(dst
, src
, src_pitch
, height
, cache
, cpu
);
/* plain C path: luma plane at full height */
817 CopyPlane(dst
->p
[0].p_pixels
, dst
->p
[0].i_pitch
,
818 src
[0], src_pitch
[0], height
);
/*
 * Both chroma planes are copied with height / 2 rows.
 * NOTE(review): SSE_Copy420_P_to_P in this file uses (height + d - 1) / d,
 * so for an odd height the two paths differ by one chroma row - confirm
 * which rounding is intended.
 */
819 CopyPlane(dst
->p
[1].p_pixels
, dst
->p
[1].i_pitch
,
820 src
[1], src_pitch
[1], height
/ 2);
821 CopyPlane(dst
->p
[2].p_pixels
, dst
->p
[2].i_pitch
,
822 src
[2], src_pitch
[2], height
/ 2);
825 void picture_SwapUV(picture_t
*picture
)
827 assert(picture
->i_planes
== 3);
829 plane_t tmp_plane
= picture
->p
[1];
830 picture
->p
[1] = picture
->p
[2];
831 picture
->p
[2] = tmp_plane
;
/*
 * Point the planes of a picture at an externally provided buffer.
 *
 * picture: picture whose plane descriptors are updated
 * data:    start of the buffer; becomes the pixel pointer of plane 0
 * pitch:   pitch in bytes of plane 0
 *
 * Plane 0 takes data/pitch and the full format height. Every following
 * plane is placed right after the previous one in memory
 * (previous p_pixels + previous i_lines * previous i_pitch).
 *
 * NOTE(review): several statements of this function (return statements,
 * closing braces, at least one pitch assignment) are not visible in this
 * excerpt.
 */
834 int picture_UpdatePlanes(picture_t
*picture
, uint8_t *data
, unsigned pitch
)
836 /* fill in buffer info in first plane */
837 picture
->p
->p_pixels
= data
;
838 picture
->p
->i_pitch
= pitch
;
839 picture
->p
->i_lines
= picture
->format
.i_height
;
840 assert(picture
->p
->i_visible_pitch
<= picture
->p
->i_pitch
);
841 assert(picture
->p
->i_visible_lines
<= picture
->p
->i_lines
);
843 /* Fill chroma planes for biplanar YUV */
844 if (picture
->format
.i_chroma
== VLC_CODEC_NV12
||
845 picture
->format
.i_chroma
== VLC_CODEC_NV21
||
846 picture
->format
.i_chroma
== VLC_CODEC_P010
) {
848 for (int n
= 1; n
< picture
->i_planes
; n
++) {
849 const plane_t
*o
= &picture
->p
[n
-1];
850 plane_t
*p
= &picture
->p
[n
];
/* the plane starts where the previous plane ends */
852 p
->p_pixels
= o
->p_pixels
+ o
->i_lines
* o
->i_pitch
;
/*
 * NOTE(review): the line count is set to the full format height here, while
 * the planar branch below uses i_height / 2 - confirm this is intended.
 * The assignment of p->i_pitch for this branch is not visible in this
 * excerpt.
 */
854 p
->i_lines
= picture
->format
.i_height
;
855 assert(p
->i_visible_pitch
<= p
->i_pitch
);
856 assert(p
->i_visible_lines
<= p
->i_lines
);
858 /* The dx/d3d buffer is always allocated as NV12 */
859 if (vlc_fourcc_AreUVPlanesSwapped(picture
->format
.i_chroma
, VLC_CODEC_NV12
)) {
860 /* TODO : Swap NV21 UV planes to match NV12 */
865 /* Fill chroma planes for planar YUV */
867 if (picture
->format
.i_chroma
== VLC_CODEC_I420
||
868 picture
->format
.i_chroma
== VLC_CODEC_J420
||
869 picture
->format
.i_chroma
== VLC_CODEC_YV12
) {
871 for (int n
= 1; n
< picture
->i_planes
; n
++) {
872 const plane_t
*o
= &picture
->p
[n
-1];
873 plane_t
*p
= &picture
->p
[n
];
/* chroma planes follow each other in memory at half pitch and half height */
875 p
->p_pixels
= o
->p_pixels
+ o
->i_lines
* o
->i_pitch
;
876 p
->i_pitch
= pitch
/ 2;
877 p
->i_lines
= picture
->format
.i_height
/ 2;
879 /* The dx/d3d buffer is always allocated as YV12 */
880 if (vlc_fourcc_AreUVPlanesSwapped(picture
->format
.i_chroma
, VLC_CODEC_YV12
)) {
/* only the pixel pointers of planes 1 and 2 are exchanged */
881 uint8_t *p_tmp
= picture
->p
[1].p_pixels
;
882 picture
->p
[1].p_pixels
= picture
->p
[2].p_pixels
;
883 picture
->p
[2].p_pixels
= p_tmp
;
892 #include <vlc_picture.h>
897 void (*conv
)(picture_t
*, const uint8_t *[], const size_t [], unsigned,
898 const copy_cache_t
*);
903 vlc_fourcc_t src_chroma
;
904 struct test_dst dsts
[3];
907 static const struct test_conv convs
[] = {
908 { .src_chroma
= VLC_CODEC_NV12
,
909 .dsts
= { { VLC_CODEC_I420
, Copy420_SP_to_P
},
910 { VLC_CODEC_NV12
, Copy420_SP_to_SP
} },
912 { .src_chroma
= VLC_CODEC_I420
,
913 .dsts
= { { VLC_CODEC_I420
, Copy420_P_to_P
},
914 { VLC_CODEC_NV12
, Copy420_P_to_SP
} },
916 { .src_chroma
= VLC_CODEC_P010
,
917 .dsts
= { { VLC_CODEC_I420_10B
, Copy420_16_SP_to_P
} },
919 { .src_chroma
= VLC_CODEC_I420_10B
,
920 .dsts
= { { VLC_CODEC_P010
, Copy420_16_P_to_SP
} },
923 #define NB_CONVS ARRAY_SIZE(convs)
930 int i_visible_height
;
932 static const struct test_size sizes
[] = {
936 { 560, 369, 540, 350 },
937 { 1274, 721, 1200, 720 },
938 { 1920, 1088, 1920, 1080 },
939 { 3840, 2160, 3840, 2160 },
941 { 8192, 8192, 8192, 8192 },
944 #define NB_SIZES ARRAY_SIZE(sizes)
946 static void piccheck(picture_t
*pic
, const vlc_chroma_description_t
*dsc
,
949 #define ASSERT_COLOR() do { \
950 fprintf(stderr, "error: pixel doesn't match @ plane: %d: %d x %d: %X\n", i, x, y, *(--p)); \
951 assert(!"error: pixel doesn't match"); \
954 #define PICCHECK(type_u, type_uv, colors_P, color_UV, pitch_den) do { \
955 for (int i = 0; i < pic->i_planes; ++i) \
957 const struct plane_t *plane = &pic->p[i]; \
958 for (int y = 0; y < plane->i_visible_lines; ++y) \
960 if (pic->i_planes == 2 && i == 1) \
962 type_uv *p = (type_uv *)&plane->p_pixels[y * plane->i_pitch]; \
963 for (int x = 0; x < plane->i_visible_pitch / 2 / pitch_den; ++x) \
966 else if (*(p++) != color_UV) \
971 type_u *p = (type_u *) &plane->p_pixels[y * plane->i_pitch]; \
972 for (int x = 0; x < plane->i_visible_pitch / pitch_den; ++x) \
974 *(p++) = colors_P[i]; \
975 else if (*(p++) != colors_P[i]) \
982 assert(pic
->i_planes
== 2 || pic
->i_planes
== 3);
983 const uint8_t colors_8_P
[3] = { 0x42, 0xF1, 0x36 };
984 const uint16_t color_8_UV
= 0x36F1;
986 const uint16_t colors_16_P
[3] = { 0x4210, 0x14F1, 0x4536 };
987 const uint32_t color_16_UV
= 0x453614F1;
989 assert(dsc
->pixel_size
== 1 || dsc
->pixel_size
== 2);
990 if (dsc
->pixel_size
== 1)
991 PICCHECK(uint8_t, uint16_t, colors_8_P
, color_8_UV
, 1);
993 PICCHECK(uint16_t, uint32_t, colors_16_P
, color_16_UV
, 2);
996 static void pic_rsc_destroy(picture_t
*pic
)
998 for (unsigned i
= 0; i
< 3; i
++)
999 free(pic
->p
[i
].p_pixels
);
1003 static picture_t
*pic_new_unaligned(const video_format_t
*fmt
)
1005 /* Allocate a non-aligned picture in order to ease buffer overflow detection
1006 * from the source picture */
1007 const vlc_chroma_description_t
*dsc
= vlc_fourcc_GetChromaDescription(fmt
->i_chroma
);
1009 picture_resource_t rsc
= { .pf_destroy
= pic_rsc_destroy
};
1010 for (unsigned i
= 0; i
< dsc
->plane_count
; i
++)
1012 rsc
.p
[i
].i_lines
= ((fmt
->i_visible_height
+ 1) & ~ 1) * dsc
->p
[i
].h
.num
/ dsc
->p
[i
].h
.den
;
1013 rsc
.p
[i
].i_pitch
= ((fmt
->i_visible_width
+ 1) & ~ 1) * dsc
->pixel_size
* dsc
->p
[i
].w
.num
/ dsc
->p
[i
].w
.den
;
1014 rsc
.p
[i
].p_pixels
= malloc(rsc
.p
[i
].i_lines
* rsc
.p
[i
].i_pitch
);
1015 assert(rsc
.p
[i
].p_pixels
);
1017 return picture_NewFromResource(fmt
, &rsc
);
1024 unsigned cpu
= vlc_CPU();
1025 #ifndef COPY_TEST_NOOTPIM
1026 if (!vlc_CPU_SSE2())
1028 fprintf(stderr
, "WARNING: could not test SSE\n");
1033 for (size_t i
= 0; i
< NB_CONVS
; ++i
)
1035 const struct test_conv
*conv
= &convs
[i
];
1037 for (size_t j
= 0; j
< NB_SIZES
; ++j
)
1039 const struct test_size
*size
= &sizes
[j
];
1041 const vlc_chroma_description_t
*src_dsc
=
1042 vlc_fourcc_GetChromaDescription(conv
->src_chroma
);
1046 video_format_Init(&fmt
, 0);
1047 video_format_Setup(&fmt
, conv
->src_chroma
,
1048 size
->i_width
, size
->i_height
,
1049 size
->i_visible_width
, size
->i_visible_height
,
1051 picture_t
*src
= pic_new_unaligned(&fmt
);
1053 piccheck(src
, src_dsc
, true);
1056 int ret
= CopyInitCache(&cache
, src
->format
.i_width
1057 * src_dsc
->pixel_size
);
1058 assert(ret
== VLC_SUCCESS
);
1060 for (size_t f
= 0; conv
->dsts
[f
].chroma
!= 0; ++f
)
1062 const struct test_dst
*test_dst
= &conv
->dsts
[f
];
1064 const vlc_chroma_description_t
*dst_dsc
=
1065 vlc_fourcc_GetChromaDescription(test_dst
->chroma
);
1067 fmt
.i_chroma
= test_dst
->chroma
;
1068 picture_t
*dst
= picture_NewFromFormat(&fmt
);
1071 const uint8_t * src_planes
[3] = { src
->p
[Y_PLANE
].p_pixels
,
1072 src
->p
[U_PLANE
].p_pixels
,
1073 src
->p
[V_PLANE
].p_pixels
};
1074 const size_t src_pitches
[3] = { src
->p
[Y_PLANE
].i_pitch
,
1075 src
->p
[U_PLANE
].i_pitch
,
1076 src
->p
[V_PLANE
].i_pitch
};
1078 fprintf(stderr
, "testing: %u x %u (vis: %u x %u) %4.4s -> %4.4s\n",
1079 size
->i_width
, size
->i_height
,
1080 size
->i_visible_width
, size
->i_visible_height
,
1081 (const char *) &src
->format
.i_chroma
,
1082 (const char *) &dst
->format
.i_chroma
);
1083 test_dst
->conv(dst
, src_planes
, src_pitches
,
1084 src
->format
.i_visible_height
, &cache
);
1085 piccheck(dst
, dst_dsc
, false);
1086 picture_Release(dst
);
1088 picture_Release(src
);
1089 CopyCleanCache(&cache
);