1 /*****************************************************************************
2 * copy.c: Fast YV12/NV12 copy
3 *****************************************************************************
4 * Copyright (C) 2010 Laurent Aimar
7 * Authors: Laurent Aimar <fenrir _AT_ videolan _DOT_ org>
8 * Victorien Le Couviour--Tuffet <victorien.lecouviour.tuffet@gmail.com>
10 * This program is free software; you can redistribute it and/or modify it
11 * under the terms of the GNU Lesser General Public License as published by
12 * the Free Software Foundation; either version 2.1 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public License
21 * along with this program; if not, write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23 *****************************************************************************/
33 #include <vlc_common.h>
34 #include <vlc_picture.h>
/* Forward declaration of the scalar (non-SIMD) plane copy, defined near the
 * end of this file: copies 'height' rows from src to dst, optionally applying
 * a per-sample 16-bit shift (bitshift > 0 shifts right, bitshift < 0 shifts
 * left — see the definition). Declared early because the SSE USWC path falls
 * back to it for trailing unaligned bytes. */
39 static void CopyPlane(uint8_t *dst
, size_t dst_pitch
,
40 const uint8_t *src
, size_t src_pitch
,
41 unsigned height
, int bitshift
);
43 #define ASSERT_PLANE(i) assert(src[i]); \
46 #define ASSERT_2PLANES \
52 #define ASSERT_3PLANES ASSERT_2PLANES; \
/* Initialize the copy cache used as a bounce buffer by the SSE2 copy paths.
 * NOTE(review): several original lines are missing from this excerpt (the
 * function braces, the aligned_alloc failure check and return values, and the
 * non-SSE2 #else branch) — consult the original file before editing. */
55 int CopyInitCache(copy_cache_t
*cache
, unsigned width
)
57 #ifdef CAN_COMPILE_SSE2
/* Cache size: width rounded up to a 64-byte multiple, at least 16 KiB. */
58 cache
->size
= __MAX((width
+ 0x3f) & ~ 0x3f, 16384);
/* 64-byte alignment matches the COPY64 kernels' cache-line-sized transfers. */
59 cache
->buffer
= aligned_alloc(64, cache
->size
);
/* Non-SSE2 build: parameters intentionally unused. */
63 (void) cache
; (void) width
;
/* Release the bounce buffer allocated by CopyInitCache().
 * aligned_free pairs with the aligned_alloc above; on non-SSE2 builds there
 * is nothing to free (the #else branch is not visible in this excerpt). */
68 void CopyCleanCache(copy_cache_t
*cache
)
70 #ifdef CAN_COMPILE_SSE2
71 aligned_free(cache
->buffer
);
79 #ifdef CAN_COMPILE_SSE2
80 /* Copy 16/64 bytes from srcp to dstp loading data with the SSE>=2 instruction
81 * load and storing data with the SSE>=2 instruction store.
84 #define COPY16_SHIFTR(x) \
86 #define COPY16_SHIFTL(x) \
89 #define COPY16_S(dstp, srcp, load, store, shiftstr) \
91 load " 0(%[src]), %%xmm1\n" \
93 store " %%xmm1, 0(%[dst])\n" \
94 : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1")
96 #define COPY16(dstp, srcp, load, store) COPY16_S(dstp, srcp, load, store, "")
98 #define COPY64_SHIFTR(x) \
99 "psrlw "x", %%xmm1\n" \
100 "psrlw "x", %%xmm2\n" \
101 "psrlw "x", %%xmm3\n" \
102 "psrlw "x", %%xmm4\n"
103 #define COPY64_SHIFTL(x) \
104 "psllw "x", %%xmm1\n" \
105 "psllw "x", %%xmm2\n" \
106 "psllw "x", %%xmm3\n" \
107 "psllw "x", %%xmm4\n"
109 #define COPY64_S(dstp, srcp, load, store, shiftstr) \
111 load " 0(%[src]), %%xmm1\n" \
112 load " 16(%[src]), %%xmm2\n" \
113 load " 32(%[src]), %%xmm3\n" \
114 load " 48(%[src]), %%xmm4\n" \
116 store " %%xmm1, 0(%[dst])\n" \
117 store " %%xmm2, 16(%[dst])\n" \
118 store " %%xmm3, 32(%[dst])\n" \
119 store " %%xmm4, 48(%[dst])\n" \
120 : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1", "xmm2", "xmm3", "xmm4")
122 #define COPY64(dstp, srcp, load, store) \
123 COPY64_S(dstp, srcp, load, store, "")
125 #ifdef COPY_TEST_NOOPTIM
126 # undef vlc_CPU_SSE4_1
127 # define vlc_CPU_SSE4_1() (0)
129 # define vlc_CPU_SSE3() (0)
130 # undef vlc_CPU_SSSE3
131 # define vlc_CPU_SSSE3() (0)
133 # define vlc_CPU_SSE2() (0)
136 /* Optimized copy from "Uncacheable Speculative Write Combining" memory
137 * as used by some video surface.
138 * XXX It is really efficient only when SSE4.1 is available.
141 static void CopyFromUswc(uint8_t *dst
, size_t dst_pitch
,
142 const uint8_t *src
, size_t src_pitch
,
143 unsigned width
, unsigned height
, int bitshift
)
145 assert(((intptr_t)dst
& 0x0f) == 0 && (dst_pitch
& 0x0f) == 0);
147 asm volatile ("mfence");
149 #define SSE_USWC_COPY(shiftstr16, shiftstr64) \
150 for (unsigned y = 0; y < height; y++) { \
151 const unsigned unaligned = (-(uintptr_t)src) & 0x0f; \
152 unsigned x = unaligned; \
153 if (vlc_CPU_SSE4_1()) { \
155 for (; x+63 < width; x += 64) \
156 COPY64_S(&dst[x], &src[x], "movntdqa", "movdqa", shiftstr64); \
158 COPY16_S(dst, src, "movdqu", "movdqa", shiftstr16); \
159 for (; x+63 < width; x += 64) \
160 COPY64_S(&dst[x], &src[x], "movntdqa", "movdqu", shiftstr64); \
164 for (; x+63 < width; x += 64) \
165 COPY64_S(&dst[x], &src[x], "movdqa", "movdqa", shiftstr64); \
167 COPY16_S(dst, src, "movdqu", "movdqa", shiftstr16); \
168 for (; x+63 < width; x += 64) \
169 COPY64_S(&dst[x], &src[x], "movdqa", "movdqu", shiftstr64); \
172 /* The following should not happen since buffers are generally well aligned */ \
174 CopyPlane(&dst[x], dst_pitch - x, &src[x], src_pitch - x, 1, bitshift); \
182 SSE_USWC_COPY("", "")
185 SSE_USWC_COPY(COPY16_SHIFTL("$6"), COPY64_SHIFTL("$6"))
188 SSE_USWC_COPY(COPY16_SHIFTR("$6"), COPY64_SHIFTR("$6"))
191 SSE_USWC_COPY(COPY16_SHIFTR("$2"), COPY64_SHIFTR("$2"))
194 SSE_USWC_COPY(COPY16_SHIFTL("$2"), COPY64_SHIFTL("$2"))
197 SSE_USWC_COPY(COPY16_SHIFTR("$4"), COPY64_SHIFTR("$4"))
200 SSE_USWC_COPY(COPY16_SHIFTL("$2"), COPY64_SHIFTL("$2"))
203 vlc_assert_unreachable();
207 asm volatile ("mfence");
/* Copy a 2D region from a 16-byte-aligned source (the cache buffer) to an
 * arbitrarily aligned destination, 64 bytes at a time with SSE2, using
 * non-temporal stores (movntdq) when dst is aligned to avoid polluting the
 * data cache. NOTE(review): the declaration of 'x', the branch structure
 * around 'unaligned' and the per-byte tail-loop body are missing from this
 * excerpt. */
211 static void Copy2d(uint8_t *dst
, size_t dst_pitch
,
212 const uint8_t *src
, size_t src_pitch
,
213 unsigned width
, unsigned height
)
/* The source side must be 16-byte aligned: it is always the cache buffer. */
215 assert(((intptr_t)src
& 0x0f) == 0 && (src_pitch
& 0x0f) == 0);
217 for (unsigned y
= 0; y
< height
; y
++) {
/* Re-check dst alignment each row — dst_pitch need not be 16-aligned. */
220 bool unaligned
= ((intptr_t)dst
& 0x0f) != 0;
/* Aligned dst: stream 64-byte chunks with non-temporal stores. */
222 for (; x
+63 < width
; x
+= 64)
223 COPY64(&dst
[x
], &src
[x
], "movdqa", "movntdq");
/* Unaligned dst: fall back to unaligned regular stores. */
225 for (; x
+63 < width
; x
+= 64)
226 COPY64(&dst
[x
], &src
[x
], "movdqa", "movdqu");
/* Scalar tail for the remaining (width % 64) bytes. */
229 for (; x
< width
; x
++)
239 SSE_InterleaveUV(uint8_t *dst
, size_t dst_pitch
,
240 uint8_t *srcu
, size_t srcu_pitch
,
241 uint8_t *srcv
, size_t srcv_pitch
,
242 unsigned int width
, unsigned int height
, uint8_t pixel_size
)
244 assert(!((intptr_t)srcu
& 0xf) && !(srcu_pitch
& 0x0f) &&
245 !((intptr_t)srcv
& 0xf) && !(srcv_pitch
& 0x0f));
247 static const uint8_t shuffle_8
[] = { 0, 8,
255 static const uint8_t shuffle_16
[] = { 0, 1, 8, 9,
259 const uint8_t *shuffle
= pixel_size
== 1 ? shuffle_8
: shuffle_16
;
261 for (unsigned int y
= 0; y
< height
; ++y
)
266 "movhpd 0x00(%[src2]), %%xmm0\n" \
267 "movlpd 0x00(%[src1]), %%xmm0\n" \
269 "movhpd 0x08(%[src2]), %%xmm1\n" \
270 "movlpd 0x08(%[src1]), %%xmm1\n" \
272 "movhpd 0x10(%[src2]), %%xmm2\n" \
273 "movlpd 0x10(%[src1]), %%xmm2\n" \
275 "movhpd 0x18(%[src2]), %%xmm3\n" \
276 "movlpd 0x18(%[src1]), %%xmm3\n"
279 "movdqu %%xmm0, 0x00(%[dst])\n" \
280 "movdqu %%xmm1, 0x10(%[dst])\n" \
281 "movdqu %%xmm2, 0x20(%[dst])\n" \
282 "movdqu %%xmm3, 0x30(%[dst])\n"
284 #ifdef CAN_COMPILE_SSSE3
286 for (x
= 0; x
< (width
& ~31); x
+= 32)
289 "movdqu (%[shuffle]), %%xmm7\n"
291 "pshufb %%xmm7, %%xmm0\n"
292 "pshufb %%xmm7, %%xmm1\n"
293 "pshufb %%xmm7, %%xmm2\n"
294 "pshufb %%xmm7, %%xmm3\n"
296 : : [dst
]"r"(dst
+2*x
),
297 [src1
]"r"(srcu
+x
), [src2
]"r"(srcv
+x
),
298 [shuffle
]"r"(shuffle
)
299 : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7"
305 assert(pixel_size
== 1);
306 for (x
= 0; x
< (width
& ~31); x
+= 32)
310 "movhlps %%xmm0, %%xmm4\n"
311 "punpcklbw %%xmm4, %%xmm0\n"
313 "movhlps %%xmm1, %%xmm4\n"
314 "punpcklbw %%xmm4, %%xmm1\n"
316 "movhlps %%xmm2, %%xmm4\n"
317 "punpcklbw %%xmm4, %%xmm2\n"
319 "movhlps %%xmm3, %%xmm4\n"
320 "punpcklbw %%xmm4, %%xmm3\n"
322 : : [dst
]"r"(dst
+2*x
),
323 [src1
]"r"(srcu
+x
), [src2
]"r"(srcv
+x
)
325 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm7"
333 for (; x
< width
; x
++) {
334 dst
[2*x
+0] = srcu
[x
];
335 dst
[2*x
+1] = srcv
[x
];
340 for (; x
< width
; x
+= 2) {
341 dst
[2*x
+0] = srcu
[x
];
342 dst
[2*x
+1] = srcu
[x
+ 1];
343 dst
[2*x
+2] = srcv
[x
];
344 dst
[2*x
+3] = srcv
[x
+ 1];
354 static void SSE_SplitUV(uint8_t *dstu
, size_t dstu_pitch
,
355 uint8_t *dstv
, size_t dstv_pitch
,
356 const uint8_t *src
, size_t src_pitch
,
357 unsigned width
, unsigned height
, uint8_t pixel_size
)
359 assert(pixel_size
== 1 || pixel_size
== 2);
360 assert(((intptr_t)src
& 0xf) == 0 && (src_pitch
& 0x0f) == 0);
363 "movdqa 0(%[src]), %%xmm0\n" \
364 "movdqa 16(%[src]), %%xmm1\n" \
365 "movdqa 32(%[src]), %%xmm2\n" \
366 "movdqa 48(%[src]), %%xmm3\n"
369 "movq %%xmm0, 0(%[dst1])\n" \
370 "movq %%xmm1, 8(%[dst1])\n" \
371 "movhpd %%xmm0, 0(%[dst2])\n" \
372 "movhpd %%xmm1, 8(%[dst2])\n" \
373 "movq %%xmm2, 16(%[dst1])\n" \
374 "movq %%xmm3, 24(%[dst1])\n" \
375 "movhpd %%xmm2, 16(%[dst2])\n" \
376 "movhpd %%xmm3, 24(%[dst2])\n"
378 #ifdef CAN_COMPILE_SSSE3
381 static const uint8_t shuffle_8
[] = { 0, 2, 4, 6, 8, 10, 12, 14,
382 1, 3, 5, 7, 9, 11, 13, 15 };
383 static const uint8_t shuffle_16
[] = { 0, 1, 4, 5, 8, 9, 12, 13,
384 2, 3, 6, 7, 10, 11, 14, 15 };
385 const uint8_t *shuffle
= pixel_size
== 1 ? shuffle_8
: shuffle_16
;
386 for (unsigned y
= 0; y
< height
; y
++) {
388 for (; x
< (width
& ~31); x
+= 32) {
390 "movdqu (%[shuffle]), %%xmm7\n"
392 "pshufb %%xmm7, %%xmm0\n"
393 "pshufb %%xmm7, %%xmm1\n"
394 "pshufb %%xmm7, %%xmm2\n"
395 "pshufb %%xmm7, %%xmm3\n"
397 : : [dst1
]"r"(&dstu
[x
]), [dst2
]"r"(&dstv
[x
]), [src
]"r"(&src
[2*x
]), [shuffle
]"r"(shuffle
) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7");
401 for (; x
< width
; x
++) {
402 dstu
[x
] = src
[2*x
+0];
403 dstv
[x
] = src
[2*x
+1];
408 for (; x
< width
; x
+= 2) {
409 dstu
[x
] = src
[2*x
+0];
410 dstu
[x
+1] = src
[2*x
+1];
411 dstv
[x
] = src
[2*x
+2];
412 dstv
[x
+1] = src
[2*x
+3];
422 assert(pixel_size
== 1);
423 static const uint8_t mask
[] = { 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00,
424 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00 };
426 for (unsigned y
= 0; y
< height
; y
++)
429 for (; x
< (width
& ~31); x
+= 32) {
431 "movdqu (%[mask]), %%xmm7\n"
433 "movdqa %%xmm0, %%xmm4\n"
434 "movdqa %%xmm1, %%xmm5\n"
435 "movdqa %%xmm2, %%xmm6\n"
438 "pand %%xmm7, %%xmm4\n"
439 "pand %%xmm7, %%xmm5\n"
440 "pand %%xmm7, %%xmm6\n"
441 "packuswb %%xmm4, %%xmm0\n"
442 "packuswb %%xmm5, %%xmm1\n"
443 "pand %%xmm3, %%xmm7\n"
446 "packuswb %%xmm6, %%xmm2\n"
447 "packuswb %%xmm7, %%xmm3\n"
449 : : [dst2
]"r"(&dstu
[x
]), [dst1
]"r"(&dstv
[x
]), [src
]"r"(&src
[2*x
]), [mask
]"r"(mask
) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
451 for (; x
< width
; x
++) {
452 dstu
[x
] = src
[2*x
+0];
453 dstv
[x
] = src
[2*x
+1];
/* Copy one plane from (possibly USWC) video memory to system memory:
 * the plane is copied in horizontal bands of 'hstep' lines through the
 * cache-line-aligned bounce buffer (CopyFromUswc reads, Copy2d writes),
 * which is much faster than reading uncacheable memory directly.
 * NOTE(review): the function braces and the early 'return' after the memcpy
 * fast path are missing from this excerpt. */
464 static void SSE_CopyPlane(uint8_t *dst
, size_t dst_pitch
,
465 const uint8_t *src
, size_t src_pitch
,
466 uint8_t *cache
, size_t cache_size
,
467 unsigned height
, int bitshift
)
/* Band geometry: pitch rounded up to 16, and how many lines fit in cache. */
469 const unsigned w16
= (src_pitch
+15) & ~15;
470 const unsigned hstep
= cache_size
/ w16
;
473 /* If SSE4.1: CopyFromUswc is faster than memcpy */
474 if (!vlc_CPU_SSE4_1() && bitshift
== 0 && src_pitch
== dst_pitch
)
475 memcpy(dst
, src
, src_pitch
* height
);
477 for (unsigned y
= 0; y
< height
; y
+= hstep
) {
478 const unsigned hblock
= __MIN(hstep
, height
- y
);
480 /* Copy a bunch of line into our cache */
481 CopyFromUswc(cache
, w16
, src
, src_pitch
, src_pitch
, hblock
, bitshift
);
483 /* Copy from our cache to the destination */
484 Copy2d(dst
, dst_pitch
, cache
, w16
, src_pitch
, hblock
);
/* Advance both sides by the band just processed. */
487 src
+= src_pitch
* hblock
;
488 dst
+= dst_pitch
* hblock
;
493 SSE_InterleavePlanes(uint8_t *dst
, size_t dst_pitch
,
494 const uint8_t *srcu
, size_t srcu_pitch
,
495 const uint8_t *srcv
, size_t srcv_pitch
,
496 uint8_t *cache
, size_t cache_size
,
497 unsigned int height
, uint8_t pixel_size
, int bitshift
)
499 assert(srcu_pitch
== srcv_pitch
);
500 unsigned int const w16
= (srcu_pitch
+15) & ~15;
501 unsigned int const hstep
= (cache_size
) / (2*w16
);
504 for (unsigned int y
= 0; y
< height
; y
+= hstep
)
506 unsigned int const hblock
= __MIN(hstep
, height
- y
);
508 /* Copy a bunch of line into our cache */
509 CopyFromUswc(cache
, w16
, srcu
, srcu_pitch
, srcu_pitch
, hblock
, bitshift
);
510 CopyFromUswc(cache
+w16
*hblock
, w16
, srcv
, srcv_pitch
,
511 srcv_pitch
, hblock
, bitshift
);
513 /* Copy from our cache to the destination */
514 SSE_InterleaveUV(dst
, dst_pitch
, cache
, w16
,
515 cache
+ w16
* hblock
, w16
,
516 srcu_pitch
, hblock
, pixel_size
);
519 srcu
+= hblock
* srcu_pitch
;
520 srcv
+= hblock
* srcv_pitch
;
521 dst
+= hblock
* dst_pitch
;
525 static void SSE_SplitPlanes(uint8_t *dstu
, size_t dstu_pitch
,
526 uint8_t *dstv
, size_t dstv_pitch
,
527 const uint8_t *src
, size_t src_pitch
,
528 uint8_t *cache
, size_t cache_size
,
529 unsigned height
, uint8_t pixel_size
, int bitshift
)
531 const unsigned w16
= (src_pitch
+15) & ~15;
532 const unsigned hstep
= cache_size
/ w16
;
535 for (unsigned y
= 0; y
< height
; y
+= hstep
) {
536 const unsigned hblock
= __MIN(hstep
, height
- y
);
538 /* Copy a bunch of line into our cache */
539 CopyFromUswc(cache
, w16
, src
, src_pitch
, src_pitch
, hblock
, bitshift
);
541 /* Copy from our cache to the destination */
542 SSE_SplitUV(dstu
, dstu_pitch
, dstv
, dstv_pitch
,
543 cache
, w16
, src_pitch
/ 2, hblock
, pixel_size
);
546 src
+= src_pitch
* hblock
;
547 dstu
+= dstu_pitch
* hblock
;
548 dstv
+= dstv_pitch
* hblock
;
552 static void SSE_Copy420_P_to_P(picture_t
*dst
, const uint8_t *src
[static 3],
553 const size_t src_pitch
[static 3], unsigned height
,
554 const copy_cache_t
*cache
)
556 for (unsigned n
= 0; n
< 3; n
++) {
557 const unsigned d
= n
> 0 ? 2 : 1;
558 SSE_CopyPlane(dst
->p
[n
].p_pixels
, dst
->p
[n
].i_pitch
,
559 src
[n
], src_pitch
[n
],
560 cache
->buffer
, cache
->size
,
563 asm volatile ("emms");
567 static void SSE_Copy420_SP_to_SP(picture_t
*dst
, const uint8_t *src
[static 2],
568 const size_t src_pitch
[static 2], unsigned height
,
569 const copy_cache_t
*cache
)
571 SSE_CopyPlane(dst
->p
[0].p_pixels
, dst
->p
[0].i_pitch
, src
[0], src_pitch
[0],
572 cache
->buffer
, cache
->size
, height
, 0);
573 SSE_CopyPlane(dst
->p
[1].p_pixels
, dst
->p
[1].i_pitch
, src
[1], src_pitch
[1],
574 cache
->buffer
, cache
->size
, height
/ 2, 0);
575 asm volatile ("emms");
579 SSE_Copy420_SP_to_P(picture_t
*dest
, const uint8_t *src
[static 2],
580 const size_t src_pitch
[static 2], unsigned int height
,
581 uint8_t pixel_size
, int bitshift
, const copy_cache_t
*cache
)
583 SSE_CopyPlane(dest
->p
[0].p_pixels
, dest
->p
[0].i_pitch
,
584 src
[0], src_pitch
[0], cache
->buffer
, cache
->size
, height
, bitshift
);
586 SSE_SplitPlanes(dest
->p
[1].p_pixels
, dest
->p
[1].i_pitch
,
587 dest
->p
[2].p_pixels
, dest
->p
[2].i_pitch
,
588 src
[1], src_pitch
[1], cache
->buffer
, cache
->size
,
589 height
/ 2, pixel_size
, bitshift
);
590 asm volatile ("emms");
593 static void SSE_Copy420_P_to_SP(picture_t
*dst
, const uint8_t *src
[static 3],
594 const size_t src_pitch
[static 3],
595 unsigned height
, uint8_t pixel_size
,
596 int bitshift
, const copy_cache_t
*cache
)
598 SSE_CopyPlane(dst
->p
[0].p_pixels
, dst
->p
[0].i_pitch
, src
[0], src_pitch
[0],
599 cache
->buffer
, cache
->size
, height
, bitshift
);
600 SSE_InterleavePlanes(dst
->p
[1].p_pixels
, dst
->p
[1].i_pitch
,
601 src
[U_PLANE
], src_pitch
[U_PLANE
],
602 src
[V_PLANE
], src_pitch
[V_PLANE
],
603 cache
->buffer
, cache
->size
, height
/ 2, pixel_size
, bitshift
);
604 asm volatile ("emms");
607 #endif /* CAN_COMPILE_SSE2 */
/* Scalar plane copy (fallback when SSE is unavailable). Three cases, from the
 * visible branches: bitshift != 0 copies 16-bit samples with a right shift
 * (bitshift > 0) or left shift (bitshift < 0, masked to 0..15); bitshift == 0
 * with equal pitches is a single memcpy; otherwise one memcpy per row.
 * NOTE(review): the outer 'if (bitshift)' line, the braces and the per-row
 * dst/src pointer advancement lines are missing from this excerpt. */
609 static void CopyPlane(uint8_t *dst
, size_t dst_pitch
,
610 const uint8_t *src
, size_t src_pitch
,
611 unsigned height
, int bitshift
)
615 for (unsigned y
= 0; y
< height
; y
++)
617 uint16_t *dst16
= (uint16_t *) dst
;
618 const uint16_t *src16
= (const uint16_t *) src
;
/* Positive shift: scale samples down (e.g. 16-bit container -> 10-bit). */
621 for (unsigned x
= 0; x
< (src_pitch
/ 2); x
++)
622 *dst16
++ = (*src16
++) >> (bitshift
& 0xf);
/* Negative shift: scale samples up; (-bitshift) & 0xf keeps it in range. */
624 for (unsigned x
= 0; x
< (src_pitch
/ 2); x
++)
625 *dst16
++ = (*src16
++) << ((-bitshift
) & 0xf);
/* No shift and identical pitches: the whole plane is one contiguous copy. */
630 else if (src_pitch
== dst_pitch
)
631 memcpy(dst
, src
, src_pitch
* height
);
/* Differing pitches: copy row by row. */
633 for (unsigned y
= 0; y
< height
; y
++) {
634 memcpy(dst
, src
, src_pitch
);
/* Copy a single packed plane into dst->p[0], going through the USWC-aware
 * SSE path when SSE4.1 is available, else the scalar CopyPlane.
 * NOTE(review): the function braces and the trailing arguments of the
 * CopyPlane fallback call are missing from this excerpt. */
640 void CopyPacked(picture_t
*dst
, const uint8_t *src
, const size_t src_pitch
,
641 unsigned height
, const copy_cache_t
*cache
)
644 assert(src
); assert(src_pitch
);
647 if (vlc_CPU_SSE4_1())
648 SSE_CopyPlane(dst
->p
[0].p_pixels
, dst
->p
[0].i_pitch
, src
, src_pitch
,
649 cache
->buffer
, cache
->size
, height
, 0);
651 CopyPlane(dst
->p
[0].p_pixels
, dst
->p
[0].i_pitch
, src
, src_pitch
,
/* Copy a 4:2:0 semi-planar picture (e.g. NV12) to a semi-planar destination:
 * luma plane at full height, interleaved chroma plane at height/2. Dispatches
 * to the SSE2 implementation when available, else scalar CopyPlane per plane.
 * NOTE(review): the runtime vlc_CPU_SSE2() check guarding the SSE call and
 * the function braces are missing from this excerpt. */
655 void Copy420_SP_to_SP(picture_t
*dst
, const uint8_t *src
[static 2],
656 const size_t src_pitch
[static 2], unsigned height
,
657 const copy_cache_t
*cache
)
660 #ifdef CAN_COMPILE_SSE2
662 return SSE_Copy420_SP_to_SP(dst
, src
, src_pitch
, height
, cache
);
/* Scalar fallback: luma first, then the half-height interleaved chroma. */
667 CopyPlane(dst
->p
[0].p_pixels
, dst
->p
[0].i_pitch
,
668 src
[0], src_pitch
[0], height
, 0);
669 CopyPlane(dst
->p
[1].p_pixels
, dst
->p
[1].i_pitch
,
670 src
[1], src_pitch
[1], height
/2, 0);
673 #define SPLIT_PLANES(type, pitch_den) do { \
674 for (unsigned y = 0; y < height; y++) { \
675 for (unsigned x = 0; x < src_pitch / pitch_den; x++) { \
676 ((type *) dstu)[x] = ((const type *) src)[2*x+0]; \
677 ((type *) dstv)[x] = ((const type *) src)[2*x+1]; \
680 dstu += dstu_pitch; \
681 dstv += dstv_pitch; \
685 #define SPLIT_PLANES_SHIFTR(type, pitch_den, bitshift) do { \
686 for (unsigned y = 0; y < height; y++) { \
687 for (unsigned x = 0; x < src_pitch / pitch_den; x++) { \
688 ((type *) dstu)[x] = (((const type *) src)[2*x+0]) >> (bitshift); \
689 ((type *) dstv)[x] = (((const type *) src)[2*x+1]) >> (bitshift); \
692 dstu += dstu_pitch; \
693 dstv += dstv_pitch; \
697 #define SPLIT_PLANES_SHIFTL(type, pitch_den, bitshift) do { \
698 for (unsigned y = 0; y < height; y++) { \
699 for (unsigned x = 0; x < src_pitch / pitch_den; x++) { \
700 ((type *) dstu)[x] = (((const type *) src)[2*x+0]) << (bitshift); \
701 ((type *) dstv)[x] = (((const type *) src)[2*x+1]) << (bitshift); \
704 dstu += dstu_pitch; \
705 dstv += dstv_pitch; \
709 static void SplitPlanes(uint8_t *dstu
, size_t dstu_pitch
,
710 uint8_t *dstv
, size_t dstv_pitch
,
711 const uint8_t *src
, size_t src_pitch
, unsigned height
)
713 SPLIT_PLANES(uint8_t, 2);
716 static void SplitPlanes16(uint8_t *dstu
, size_t dstu_pitch
,
717 uint8_t *dstv
, size_t dstv_pitch
,
718 const uint8_t *src
, size_t src_pitch
, unsigned height
,
722 SPLIT_PLANES(uint16_t, 4);
723 else if (bitshift
> 0)
724 SPLIT_PLANES_SHIFTR(uint16_t, 4, bitshift
& 0xf);
726 SPLIT_PLANES_SHIFTL(uint16_t, 4, (-bitshift
) & 0xf);
729 void Copy420_SP_to_P(picture_t
*dst
, const uint8_t *src
[static 2],
730 const size_t src_pitch
[static 2], unsigned height
,
731 const copy_cache_t
*cache
)
734 #ifdef CAN_COMPILE_SSE2
736 return SSE_Copy420_SP_to_P(dst
, src
, src_pitch
, height
, 1, 0, cache
);
741 CopyPlane(dst
->p
[0].p_pixels
, dst
->p
[0].i_pitch
,
742 src
[0], src_pitch
[0], height
, 0);
743 SplitPlanes(dst
->p
[1].p_pixels
, dst
->p
[1].i_pitch
,
744 dst
->p
[2].p_pixels
, dst
->p
[2].i_pitch
,
745 src
[1], src_pitch
[1], height
/2);
748 void Copy420_16_SP_to_P(picture_t
*dst
, const uint8_t *src
[static 2],
749 const size_t src_pitch
[static 2], unsigned height
,
750 int bitshift
, const copy_cache_t
*cache
)
753 assert(bitshift
>= -6 && bitshift
<= 6 && (bitshift
% 2 == 0));
755 #ifdef CAN_COMPILE_SSE3
757 return SSE_Copy420_SP_to_P(dst
, src
, src_pitch
, height
, 2, bitshift
, cache
);
762 CopyPlane(dst
->p
[0].p_pixels
, dst
->p
[0].i_pitch
,
763 src
[0], src_pitch
[0], height
, bitshift
);
764 SplitPlanes16(dst
->p
[1].p_pixels
, dst
->p
[1].i_pitch
,
765 dst
->p
[2].p_pixels
, dst
->p
[2].i_pitch
,
766 src
[1], src_pitch
[1], height
/2, bitshift
);
769 #define INTERLEAVE_UV() do { \
770 for ( unsigned int line = 0; line < copy_lines; line++ ) { \
771 for ( unsigned int col = 0; col < copy_pitch; col++ ) { \
772 *dstUV++ = *srcU++; \
773 *dstUV++ = *srcV++; \
775 dstUV += i_extra_pitch_uv; \
776 srcU += i_extra_pitch_u; \
777 srcV += i_extra_pitch_v; \
781 #define INTERLEAVE_UV_SHIFTR(bitshitf) do { \
782 for ( unsigned int line = 0; line < copy_lines; line++ ) { \
783 for ( unsigned int col = 0; col < copy_pitch; col++ ) { \
784 *dstUV++ = (*srcU++) >> (bitshitf); \
785 *dstUV++ = (*srcV++) >> (bitshitf); \
787 dstUV += i_extra_pitch_uv; \
788 srcU += i_extra_pitch_u; \
789 srcV += i_extra_pitch_v; \
793 #define INTERLEAVE_UV_SHIFTL(bitshitf) do { \
794 for ( unsigned int line = 0; line < copy_lines; line++ ) { \
795 for ( unsigned int col = 0; col < copy_pitch; col++ ) { \
796 *dstUV++ = (*srcU++) << (bitshitf); \
797 *dstUV++ = (*srcV++) << (bitshitf); \
799 dstUV += i_extra_pitch_uv; \
800 srcU += i_extra_pitch_u; \
801 srcV += i_extra_pitch_v; \
805 void Copy420_P_to_SP(picture_t
*dst
, const uint8_t *src
[static 3],
806 const size_t src_pitch
[static 3], unsigned height
,
807 const copy_cache_t
*cache
)
810 #ifdef CAN_COMPILE_SSE2
812 return SSE_Copy420_P_to_SP(dst
, src
, src_pitch
, height
, 1, 0, cache
);
817 CopyPlane(dst
->p
[0].p_pixels
, dst
->p
[0].i_pitch
,
818 src
[0], src_pitch
[0], height
, 0);
820 const unsigned copy_lines
= height
/ 2;
821 const unsigned copy_pitch
= src_pitch
[1];
823 const int i_extra_pitch_uv
= dst
->p
[1].i_pitch
- 2 * copy_pitch
;
824 const int i_extra_pitch_u
= src_pitch
[U_PLANE
] - copy_pitch
;
825 const int i_extra_pitch_v
= src_pitch
[V_PLANE
] - copy_pitch
;
827 uint8_t *dstUV
= dst
->p
[1].p_pixels
;
828 const uint8_t *srcU
= src
[U_PLANE
];
829 const uint8_t *srcV
= src
[V_PLANE
];
833 void Copy420_16_P_to_SP(picture_t
*dst
, const uint8_t *src
[static 3],
834 const size_t src_pitch
[static 3], unsigned height
,
835 int bitshift
, const copy_cache_t
*cache
)
838 assert(bitshift
>= -6 && bitshift
<= 6 && (bitshift
% 2 == 0));
839 #ifdef CAN_COMPILE_SSE2
841 return SSE_Copy420_P_to_SP(dst
, src
, src_pitch
, height
, 2, bitshift
, cache
);
846 CopyPlane(dst
->p
[0].p_pixels
, dst
->p
[0].i_pitch
,
847 src
[0], src_pitch
[0], height
, bitshift
);
849 const unsigned copy_lines
= height
/ 2;
850 const unsigned copy_pitch
= src_pitch
[1] / 2;
852 const int i_extra_pitch_uv
= dst
->p
[1].i_pitch
/ 2 - 2 * copy_pitch
;
853 const int i_extra_pitch_u
= src_pitch
[U_PLANE
] / 2 - copy_pitch
;
854 const int i_extra_pitch_v
= src_pitch
[V_PLANE
] / 2 - copy_pitch
;
856 uint16_t *dstUV
= (void*) dst
->p
[1].p_pixels
;
857 const uint16_t *srcU
= (const uint16_t *) src
[U_PLANE
];
858 const uint16_t *srcV
= (const uint16_t *) src
[V_PLANE
];
862 else if (bitshift
> 0)
863 INTERLEAVE_UV_SHIFTR(bitshift
& 0xf);
865 INTERLEAVE_UV_SHIFTL((-bitshift
) & 0xf);
868 void CopyFromI420_10ToP010(picture_t
*dst
, const uint8_t *src
[static 3],
869 const size_t src_pitch
[static 3],
870 unsigned height
, const copy_cache_t
*cache
)
874 const int i_extra_pitch_dst_y
= (dst
->p
[0].i_pitch
- src_pitch
[0]) / 2;
875 const int i_extra_pitch_src_y
= (src_pitch
[Y_PLANE
] - src_pitch
[0]) / 2;
876 uint16_t *dstY
= (uint16_t *) dst
->p
[0].p_pixels
;
877 const uint16_t *srcY
= (const uint16_t *) src
[Y_PLANE
];
878 for (unsigned y
= 0; y
< height
; y
++) {
879 for (unsigned x
= 0; x
< (src_pitch
[0] / 2); x
++) {
880 *dstY
++ = *srcY
++ << 6;
882 dstY
+= i_extra_pitch_dst_y
;
883 srcY
+= i_extra_pitch_src_y
;
886 const unsigned copy_lines
= height
/ 2;
887 const unsigned copy_pitch
= src_pitch
[1] / 2;
889 const int i_extra_pitch_uv
= dst
->p
[1].i_pitch
/ 2 - 2 * copy_pitch
;
890 const int i_extra_pitch_u
= src_pitch
[U_PLANE
] / 2 - copy_pitch
;
891 const int i_extra_pitch_v
= src_pitch
[V_PLANE
] / 2 - copy_pitch
;
893 uint16_t *dstUV
= (uint16_t *) dst
->p
[1].p_pixels
;
894 const uint16_t *srcU
= (const uint16_t *) src
[U_PLANE
];
895 const uint16_t *srcV
= (const uint16_t *) src
[V_PLANE
];
896 for ( unsigned int line
= 0; line
< copy_lines
; line
++ )
898 for ( unsigned int col
= 0; col
< copy_pitch
; col
++ )
900 *dstUV
++ = *srcU
++ << 6;
901 *dstUV
++ = *srcV
++ << 6;
903 dstUV
+= i_extra_pitch_uv
;
904 srcU
+= i_extra_pitch_u
;
905 srcV
+= i_extra_pitch_v
;
909 void Copy420_P_to_P(picture_t
*dst
, const uint8_t *src
[static 3],
910 const size_t src_pitch
[static 3], unsigned height
,
911 const copy_cache_t
*cache
)
914 #ifdef CAN_COMPILE_SSE2
916 return SSE_Copy420_P_to_P(dst
, src
, src_pitch
, height
, cache
);
921 CopyPlane(dst
->p
[0].p_pixels
, dst
->p
[0].i_pitch
,
922 src
[0], src_pitch
[0], height
, 0);
923 CopyPlane(dst
->p
[1].p_pixels
, dst
->p
[1].i_pitch
,
924 src
[1], src_pitch
[1], height
/ 2, 0);
925 CopyPlane(dst
->p
[2].p_pixels
, dst
->p
[2].i_pitch
,
926 src
[2], src_pitch
[2], height
/ 2, 0);
929 int picture_UpdatePlanes(picture_t
*picture
, uint8_t *data
, unsigned pitch
)
931 /* fill in buffer info in first plane */
932 picture
->p
->p_pixels
= data
;
933 picture
->p
->i_pitch
= pitch
;
934 picture
->p
->i_lines
= picture
->format
.i_height
;
935 assert(picture
->p
->i_visible_pitch
<= picture
->p
->i_pitch
);
936 assert(picture
->p
->i_visible_lines
<= picture
->p
->i_lines
);
938 /* Fill chroma planes for biplanar YUV */
939 if (picture
->format
.i_chroma
== VLC_CODEC_NV12
||
940 picture
->format
.i_chroma
== VLC_CODEC_NV21
||
941 picture
->format
.i_chroma
== VLC_CODEC_P010
) {
943 for (int n
= 1; n
< picture
->i_planes
; n
++) {
944 const plane_t
*o
= &picture
->p
[n
-1];
945 plane_t
*p
= &picture
->p
[n
];
947 p
->p_pixels
= o
->p_pixels
+ o
->i_lines
* o
->i_pitch
;
949 p
->i_lines
= picture
->format
.i_height
;
950 assert(p
->i_visible_pitch
<= p
->i_pitch
);
951 assert(p
->i_visible_lines
<= p
->i_lines
);
953 /* The dx/d3d buffer is always allocated as NV12 */
954 if (vlc_fourcc_AreUVPlanesSwapped(picture
->format
.i_chroma
, VLC_CODEC_NV12
)) {
955 /* TODO : Swap NV21 UV planes to match NV12 */
960 /* Fill chroma planes for planar YUV */
962 if (picture
->format
.i_chroma
== VLC_CODEC_I420
||
963 picture
->format
.i_chroma
== VLC_CODEC_J420
||
964 picture
->format
.i_chroma
== VLC_CODEC_YV12
) {
966 for (int n
= 1; n
< picture
->i_planes
; n
++) {
967 const plane_t
*o
= &picture
->p
[n
-1];
968 plane_t
*p
= &picture
->p
[n
];
970 p
->p_pixels
= o
->p_pixels
+ o
->i_lines
* o
->i_pitch
;
971 p
->i_pitch
= pitch
/ 2;
972 p
->i_lines
= picture
->format
.i_height
/ 2;
974 /* The dx/d3d buffer is always allocated as YV12 */
975 if (vlc_fourcc_AreUVPlanesSwapped(picture
->format
.i_chroma
, VLC_CODEC_YV12
))
976 picture_SwapUV( picture
);
983 #include <vlc_picture.h>
991 void (*conv
)(picture_t
*, const uint8_t *[], const size_t [], unsigned,
992 const copy_cache_t
*);
993 void (*conv16
)(picture_t
*, const uint8_t *[], const size_t [], unsigned, int,
994 const copy_cache_t
*);
1000 vlc_fourcc_t src_chroma
;
1001 struct test_dst dsts
[3];
1004 static const struct test_conv convs
[] = {
1005 { .src_chroma
= VLC_CODEC_NV12
,
1006 .dsts
= { { VLC_CODEC_I420
, 0, .conv
= Copy420_SP_to_P
},
1007 { VLC_CODEC_NV12
, 0, .conv
= Copy420_SP_to_SP
} },
1009 { .src_chroma
= VLC_CODEC_I420
,
1010 .dsts
= { { VLC_CODEC_I420
, 0, .conv
= Copy420_P_to_P
},
1011 { VLC_CODEC_NV12
, 0, .conv
= Copy420_P_to_SP
} },
1013 { .src_chroma
= VLC_CODEC_P010
,
1014 .dsts
= { { VLC_CODEC_I420_10L
, 6, .conv16
= Copy420_16_SP_to_P
} },
1016 { .src_chroma
= VLC_CODEC_I420_10L
,
1017 .dsts
= { { VLC_CODEC_P010
, -6, .conv16
= Copy420_16_P_to_SP
} },
1020 #define NB_CONVS ARRAY_SIZE(convs)
1026 int i_visible_width
;
1027 int i_visible_height
;
1029 static const struct test_size sizes
[] = {
1033 { 560, 369, 540, 350 },
1034 { 1274, 721, 1200, 720 },
1035 { 1920, 1088, 1920, 1080 },
1036 { 3840, 2160, 3840, 2160 },
1037 #if 0 /* too long */
1038 { 8192, 8192, 8192, 8192 },
1041 #define NB_SIZES ARRAY_SIZE(sizes)
1043 static void piccheck(picture_t
*pic
, const vlc_chroma_description_t
*dsc
,
1046 #define ASSERT_COLOR(good) do { \
1047 fprintf(stderr, "error: pixel doesn't match @ plane: %d: %d x %d: 0x%X vs 0x%X\n", i, x, y, *(--p), good); \
1048 assert(!"error: pixel doesn't match"); \
1051 #define PICCHECK(type_u, type_uv, colors_P, color_UV, pitch_den) do { \
1052 for (int i = 0; i < pic->i_planes; ++i) \
1054 const struct plane_t *plane = &pic->p[i]; \
1055 for (int y = 0; y < plane->i_visible_lines; ++y) \
1057 if (pic->i_planes == 2 && i == 1) \
1059 type_uv *p = (type_uv *)&plane->p_pixels[y * plane->i_pitch]; \
1060 for (int x = 0; x < plane->i_visible_pitch / 2 / pitch_den; ++x) \
1062 *(p++) = color_UV; \
1063 else if (*(p++) != color_UV) \
1064 ASSERT_COLOR(color_UV); \
1068 type_u *p = (type_u *) &plane->p_pixels[y * plane->i_pitch]; \
1069 for (int x = 0; x < plane->i_visible_pitch / pitch_den; ++x) \
1071 *(p++) = colors_P[i]; \
1072 else if (*(p++) != colors_P[i]) \
1073 ASSERT_COLOR(colors_P[i]); \
1079 assert(pic
->i_planes
== 2 || pic
->i_planes
== 3);
1080 assert(dsc
->pixel_size
== 1 || dsc
->pixel_size
== 2);
1082 if (dsc
->pixel_size
== 1)
1084 const uint8_t colors_8_P
[3] = { 0x42, 0xF1, 0x36 };
1085 const uint16_t color_8_UV
= ntoh16(0xF136);
1086 PICCHECK(uint8_t, uint16_t, colors_8_P
, color_8_UV
, 1);
1090 const unsigned mask
= (1 << dsc
->pixel_bits
) - 1;
1091 uint16_t colors_16_P
[3] = { 0x1042 &mask
, 0xF114 &mask
, 0x3645 &mask
};
1093 switch (pic
->format
.i_chroma
)
1095 case VLC_CODEC_P010
:
1096 for (size_t i
= 0; i
< 3; ++i
)
1097 colors_16_P
[i
] <<= 6;
1099 case VLC_CODEC_I420_10L
:
1102 vlc_assert_unreachable();
1105 uint32_t color_16_UV
= (colors_16_P
[2] << 16) | colors_16_P
[1];
1107 PICCHECK(uint16_t, uint32_t, colors_16_P
, color_16_UV
, 2);
1111 static void pic_rsc_destroy(picture_t
*pic
)
1113 for (unsigned i
= 0; i
< 3; i
++)
1114 free(pic
->p
[i
].p_pixels
);
1118 static picture_t
*pic_new_unaligned(const video_format_t
*fmt
)
1120 /* Allocate a no-aligned picture in order to ease buffer overflow detection
1121 * from the source picture */
1122 const vlc_chroma_description_t
*dsc
= vlc_fourcc_GetChromaDescription(fmt
->i_chroma
);
1124 picture_resource_t rsc
= { .pf_destroy
= pic_rsc_destroy
};
1125 for (unsigned i
= 0; i
< dsc
->plane_count
; i
++)
1127 rsc
.p
[i
].i_lines
= ((fmt
->i_visible_height
+ 1) & ~ 1) * dsc
->p
[i
].h
.num
/ dsc
->p
[i
].h
.den
;
1128 rsc
.p
[i
].i_pitch
= ((fmt
->i_visible_width
+ 1) & ~ 1) * dsc
->pixel_size
* dsc
->p
[i
].w
.num
/ dsc
->p
[i
].w
.den
;
1129 rsc
.p
[i
].p_pixels
= malloc(rsc
.p
[i
].i_lines
* rsc
.p
[i
].i_pitch
);
1130 assert(rsc
.p
[i
].p_pixels
);
1132 return picture_NewFromResource(fmt
, &rsc
);
1139 #ifndef COPY_TEST_NOOPTIM
1140 if (!vlc_CPU_SSE2())
1142 fprintf(stderr
, "WARNING: could not test SSE\n");
1147 for (size_t i
= 0; i
< NB_CONVS
; ++i
)
1149 const struct test_conv
*conv
= &convs
[i
];
1151 for (size_t j
= 0; j
< NB_SIZES
; ++j
)
1153 const struct test_size
*size
= &sizes
[j
];
1155 const vlc_chroma_description_t
*src_dsc
=
1156 vlc_fourcc_GetChromaDescription(conv
->src_chroma
);
1160 video_format_Init(&fmt
, 0);
1161 video_format_Setup(&fmt
, conv
->src_chroma
,
1162 size
->i_width
, size
->i_height
,
1163 size
->i_visible_width
, size
->i_visible_height
,
1165 picture_t
*src
= pic_new_unaligned(&fmt
);
1167 piccheck(src
, src_dsc
, true);
1170 int ret
= CopyInitCache(&cache
, src
->format
.i_width
1171 * src_dsc
->pixel_size
);
1172 assert(ret
== VLC_SUCCESS
);
1174 for (size_t f
= 0; conv
->dsts
[f
].chroma
!= 0; ++f
)
1176 const struct test_dst
*test_dst
= &conv
->dsts
[f
];
1178 const vlc_chroma_description_t
*dst_dsc
=
1179 vlc_fourcc_GetChromaDescription(test_dst
->chroma
);
1181 fmt
.i_chroma
= test_dst
->chroma
;
1182 picture_t
*dst
= picture_NewFromFormat(&fmt
);
1185 const uint8_t * src_planes
[3] = { src
->p
[Y_PLANE
].p_pixels
,
1186 src
->p
[U_PLANE
].p_pixels
,
1187 src
->p
[V_PLANE
].p_pixels
};
1188 const size_t src_pitches
[3] = { src
->p
[Y_PLANE
].i_pitch
,
1189 src
->p
[U_PLANE
].i_pitch
,
1190 src
->p
[V_PLANE
].i_pitch
};
1192 fprintf(stderr
, "testing: %u x %u (vis: %u x %u) %4.4s -> %4.4s\n",
1193 size
->i_width
, size
->i_height
,
1194 size
->i_visible_width
, size
->i_visible_height
,
1195 (const char *) &src
->format
.i_chroma
,
1196 (const char *) &dst
->format
.i_chroma
);
1197 if (test_dst
->bitshift
== 0)
1198 test_dst
->conv(dst
, src_planes
, src_pitches
,
1199 src
->format
.i_visible_height
, &cache
);
1201 test_dst
->conv16(dst
, src_planes
, src_pitches
,
1202 src
->format
.i_visible_height
, test_dst
->bitshift
,
1204 piccheck(dst
, dst_dsc
, false);
1205 picture_Release(dst
);
1207 picture_Release(src
);
1208 CopyCleanCache(&cache
);