modules/video_chroma/copy.c

   1 /*****************************************************************************
   2  * copy.c: Fast YV12/NV12 copy
   3  *****************************************************************************
   4  * Copyright (C) 2010 Laurent Aimar
   5  * $Id$
   6  *
   7  * Authors: Laurent Aimar <fenrir _AT_ videolan _DOT_ org>
   8  *          Victorien Le Couviour--Tuffet <victorien.lecouviour.tuffet@gmail.com>
   9  *
  10  * This program is free software; you can redistribute it and/or modify it
  11  * under the terms of the GNU Lesser General Public License as published by
  12  * the Free Software Foundation; either version 2.1 of the License, or
  13  * (at your option) any later version.
  14  *
  15  * This program is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  18  * GNU Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public License
  21  * along with this program; if not, write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
  23  *****************************************************************************/
  24
  25 #ifdef HAVE_CONFIG_H
  26 # include "config.h"
  27 #endif
  28
  29 #ifdef COPY_TEST
  30 # undef NDEBUG
  31 #endif
  32
  33 #include <vlc_common.h>
  34 #include <vlc_picture.h>
  35 #include <vlc_cpu.h>
  36 #include <assert.h>
  37
  38 #include "copy.h"
/* Forward declaration of the plain C plane copy defined further down in this
 * file. It is needed early because CopyFromUswc() calls it for the bytes at
 * the end of a line that its 64-byte SSE loop does not cover. */
static void CopyPlane(uint8_t *dst, size_t dst_pitch,
                      const uint8_t *src, size_t src_pitch,
                      unsigned height, int bitshift);
  42
  43 #define ASSERT_PLANE(i) assert(src[i]); \
  44     assert(src_pitch[i])
  45
  46 #define ASSERT_2PLANES \
  47     assert(dst); \
  48     ASSERT_PLANE(0); \
  49     ASSERT_PLANE(1); \
  50     assert(height)
  51
  52 #define ASSERT_3PLANES ASSERT_2PLANES; \
  53     ASSERT_PLANE(2)
  54
  55 int CopyInitCache(copy_cache_t *cache, unsigned width)
  56 {
  57 #ifdef CAN_COMPILE_SSE2
  58     cache->size = __MAX((width + 0x3f) & ~ 0x3f, 16384);
  59     cache->buffer = aligned_alloc(64, cache->size);
  60     if (!cache->buffer)
  61         return VLC_EGENERIC;
  62 #else
  63     (void) cache; (void) width;
  64 #endif
  65     return VLC_SUCCESS;
  66 }
  67
  68 void CopyCleanCache(copy_cache_t *cache)
  69 {
  70 #ifdef CAN_COMPILE_SSE2
  71     aligned_free(cache->buffer);
  72     cache->buffer = NULL;
  73     cache->size   = 0;
  74 #else
  75     (void) cache;
  76 #endif
  77 }
  78
  79 #ifdef CAN_COMPILE_SSE2
/* Copy 16/64 bytes from srcp to dstp, loading the data with the SSE>=2
 * instruction named by `load` and storing it with the SSE>=2 instruction
 * named by `store`. Both are string literals that get pasted into the asm
 * template.
 *
 * The *_S variants additionally splice the instruction string `shiftstr`
 * between the loads and the stores; the COPY16_SHIFT* and COPY64_SHIFT*
 * helpers below build such a string. COPY16/COPY64 pass an empty string,
 * i.e. they perform a plain copy.
 */

/* Shift every 16-bit word of xmm1 right (SHIFTR) or left (SHIFTL) by the
 * immediate x, given as a string literal such as "$6". xmm1 is the register
 * COPY16_S moves the data through. */
#define COPY16_SHIFTR(x) \
    "psrlw "x", %%xmm1\n"
#define COPY16_SHIFTL(x) \
    "psllw "x", %%xmm1\n"

/* 16-byte copy through xmm1 with an optional in-register transformation. */
#define COPY16_S(dstp, srcp, load, store, shiftstr) \
    asm volatile (                      \
        load "  0(%[src]), %%xmm1\n"    \
        shiftstr                        \
        store " %%xmm1,    0(%[dst])\n" \
        : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1")

#define COPY16(dstp, srcp, load, store) COPY16_S(dstp, srcp, load, store, "")

/* Same shifts as above, applied to the four registers (xmm1..xmm4) that
 * COPY64_S moves the data through. */
#define COPY64_SHIFTR(x) \
    "psrlw "x", %%xmm1\n" \
    "psrlw "x", %%xmm2\n" \
    "psrlw "x", %%xmm3\n" \
    "psrlw "x", %%xmm4\n"
#define COPY64_SHIFTL(x) \
    "psllw "x", %%xmm1\n" \
    "psllw "x", %%xmm2\n" \
    "psllw "x", %%xmm3\n" \
    "psllw "x", %%xmm4\n"

/* 64-byte copy: four 16-byte loads, the optional transformation, then four
 * 16-byte stores. */
#define COPY64_S(dstp, srcp, load, store, shiftstr) \
    asm volatile (                      \
        load "  0(%[src]), %%xmm1\n"    \
        load " 16(%[src]), %%xmm2\n"    \
        load " 32(%[src]), %%xmm3\n"    \
        load " 48(%[src]), %%xmm4\n"    \
        shiftstr                        \
        store " %%xmm1,    0(%[dst])\n" \
        store " %%xmm2,   16(%[dst])\n" \
        store " %%xmm3,   32(%[dst])\n" \
        store " %%xmm4,   48(%[dst])\n" \
        : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1", "xmm2", "xmm3", "xmm4")

#define COPY64(dstp, srcp, load, store) \
    COPY64_S(dstp, srcp, load, store, "")
 124
/* When COPY_TEST_NOOPTIM is defined, force every run-time CPU capability
 * check used in this file to report "not available", so that the SSE4.1,
 * SSSE3, SSE3 and SSE2 code paths guarded by these checks are not taken. */
#ifdef COPY_TEST_NOOPTIM
# undef vlc_CPU_SSE4_1
# define vlc_CPU_SSE4_1() (0)
# undef vlc_CPU_SSE3
# define vlc_CPU_SSE3() (0)
# undef vlc_CPU_SSSE3
# define vlc_CPU_SSSE3() (0)
# undef vlc_CPU_SSE2
# define vlc_CPU_SSE2() (0)
#endif
 135
 136 /* Optimized copy from "Uncacheable Speculative Write Combining" memory
 137  * as used by some video surface.
 138  * XXX It is really efficient only when SSE4.1 is available.
 139  */
 140 VLC_SSE
 141 static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
 142                          const uint8_t *src, size_t src_pitch,
 143                          unsigned width, unsigned height, int bitshift)
 144 {
 145     assert(((intptr_t)dst & 0x0f) == 0 && (dst_pitch & 0x0f) == 0);
 146
 147     asm volatile ("mfence");
 148
 149 #define SSE_USWC_COPY(shiftstr16, shiftstr64) \
 150     for (unsigned y = 0; y < height; y++) { \
 151         const unsigned unaligned = (-(uintptr_t)src) & 0x0f; \
 152         unsigned x = unaligned; \
 153         if (vlc_CPU_SSE4_1()) { \
 154             if (!unaligned) { \
 155                 for (; x+63 < width; x += 64) \
 156                     COPY64_S(&dst[x], &src[x], "movntdqa", "movdqa", shiftstr64); \
 157             } else { \
 158                 COPY16_S(dst, src, "movdqu", "movdqa", shiftstr16); \
 159                 for (; x+63 < width; x += 64) \
 160                     COPY64_S(&dst[x], &src[x], "movntdqa", "movdqu", shiftstr64); \
 161             } \
 162         } else { \
 163             if (!unaligned) { \
 164                 for (; x+63 < width; x += 64) \
 165                     COPY64_S(&dst[x], &src[x], "movdqa", "movdqa", shiftstr64); \
 166             } else { \
 167                 COPY16_S(dst, src, "movdqu", "movdqa", shiftstr16); \
 168                 for (; x+63 < width; x += 64) \
 169                     COPY64_S(&dst[x], &src[x], "movdqa", "movdqu", shiftstr64); \
 170             } \
 171         } \
 172         /* The following should not happen since buffers are generally well aligned */ \
 173         if (x < width) \
 174             CopyPlane(&dst[x], dst_pitch - x, &src[x], src_pitch - x, 1, bitshift); \
 175         src += src_pitch; \
 176         dst += dst_pitch; \
 177     }
 178
 179     switch (bitshift)
 180     {
 181         case 0:
 182             SSE_USWC_COPY("", "")
 183             break;
 184         case -6:
 185             SSE_USWC_COPY(COPY16_SHIFTL("$6"), COPY64_SHIFTL("$6"))
 186             break;
 187         case 6:
 188             SSE_USWC_COPY(COPY16_SHIFTR("$6"), COPY64_SHIFTR("$6"))
 189             break;
 190         case 2:
 191             SSE_USWC_COPY(COPY16_SHIFTR("$2"), COPY64_SHIFTR("$2"))
 192             break;
 193         case -2:
 194             SSE_USWC_COPY(COPY16_SHIFTL("$2"), COPY64_SHIFTL("$2"))
 195             break;
 196         case 4:
 197             SSE_USWC_COPY(COPY16_SHIFTR("$4"), COPY64_SHIFTR("$4"))
 198             break;
 199         case -4:
 200             SSE_USWC_COPY(COPY16_SHIFTL("$2"), COPY64_SHIFTL("$2"))
 201             break;
 202         default:
 203             vlc_assert_unreachable();
 204     }
 205 #undef SSE_USWC_COPY
 206
 207     asm volatile ("mfence");
 208 }
 209
 210 VLC_SSE
 211 static void Copy2d(uint8_t *dst, size_t dst_pitch,
 212                    const uint8_t *src, size_t src_pitch,
 213                    unsigned width, unsigned height)
 214 {
 215     assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);
 216
 217     for (unsigned y = 0; y < height; y++) {
 218         unsigned x = 0;
 219
 220         bool unaligned = ((intptr_t)dst & 0x0f) != 0;
 221         if (!unaligned) {
 222             for (; x+63 < width; x += 64)
 223                 COPY64(&dst[x], &src[x], "movdqa", "movntdq");
 224         } else {
 225             for (; x+63 < width; x += 64)
 226                 COPY64(&dst[x], &src[x], "movdqa", "movdqu");
 227         }
 228
 229         for (; x < width; x++)
 230             dst[x] = src[x];
 231
 232         src += src_pitch;
 233         dst += dst_pitch;
 234     }
 235 }
 236
/* Interleave two separate chroma planes into one packed UV plane.
 *
 * For each of the `height` lines, `width` bytes are read from srcu and from
 * srcv and 2 * width bytes are written to dst. With pixel_size == 1 the
 * output alternates single bytes (U V U V ...); otherwise it alternates
 * 2-byte samples (U0 U1 V0 V1 ...).
 *
 * Both source pointers and both source pitches must be 16-byte aligned
 * (asserted). The destination is written with unaligned stores.
 * Samples wider than one byte are only handled by the SSSE3 path; the SSE2
 * fallback asserts pixel_size == 1.
 */
VLC_SSE
static void
SSE_InterleaveUV(uint8_t *dst, size_t dst_pitch,
                 uint8_t *srcu, size_t srcu_pitch,
                 uint8_t *srcv, size_t srcv_pitch,
                 unsigned int width, unsigned int height, uint8_t pixel_size)
{
    assert(!((intptr_t)srcu & 0xf) && !(srcu_pitch & 0x0f) &&
           !((intptr_t)srcv & 0xf) && !(srcv_pitch & 0x0f));

    /* pshufb control masks. After LOAD2X32 a register holds 8 bytes of U in
     * its low half (bytes 0-7) and 8 bytes of V in its high half (bytes
     * 8-15); these masks pick them alternately, byte by byte or two bytes
     * at a time. */
    static const uint8_t shuffle_8[] = { 0, 8,
                                         1, 9,
                                         2, 10,
                                         3, 11,
                                         4, 12,
                                         5, 13,
                                         6, 14,
                                         7, 15 };
    static const uint8_t shuffle_16[] = { 0, 1, 8, 9,
                                          2, 3, 10, 11,
                                          4, 5, 12, 13,
                                          6, 7, 14, 15 };
    const uint8_t *shuffle = pixel_size == 1 ? shuffle_8 : shuffle_16;

    for (unsigned int y = 0; y < height; ++y)
    {
        unsigned int    x;

/* Load 32 bytes from each source: every xmm register receives 8 bytes of
 * src1 in its low half and 8 bytes of src2 in its high half. */
#define LOAD2X32                        \
    "movhpd 0x00(%[src2]), %%xmm0\n"    \
    "movlpd 0x00(%[src1]), %%xmm0\n"    \
                                        \
    "movhpd 0x08(%[src2]), %%xmm1\n"    \
    "movlpd 0x08(%[src1]), %%xmm1\n"    \
                                        \
    "movhpd 0x10(%[src2]), %%xmm2\n"    \
    "movlpd 0x10(%[src1]), %%xmm2\n"    \
                                        \
    "movhpd 0x18(%[src2]), %%xmm3\n"    \
    "movlpd 0x18(%[src1]), %%xmm3\n"

/* Store the four registers (64 bytes) to dst with unaligned stores. */
#define STORE64                         \
    "movdqu %%xmm0, 0x00(%[dst])\n"     \
    "movdqu %%xmm1, 0x10(%[dst])\n"     \
    "movdqu %%xmm2, 0x20(%[dst])\n"     \
    "movdqu %%xmm3, 0x30(%[dst])\n"

#ifdef CAN_COMPILE_SSSE3
        /* SSSE3: interleave with one pshufb per register; 32 source bytes
         * per plane are consumed per iteration. */
        if (vlc_CPU_SSSE3())
            for (x = 0; x < (width & ~31); x += 32)
                asm volatile
                    (
                        "movdqu (%[shuffle]), %%xmm7\n"
                        LOAD2X32
                        "pshufb %%xmm7, %%xmm0\n"
                        "pshufb %%xmm7, %%xmm1\n"
                        "pshufb %%xmm7, %%xmm2\n"
                        "pshufb %%xmm7, %%xmm3\n"
                        STORE64
                        : : [dst]"r"(dst+2*x),
                            [src1]"r"(srcu+x), [src2]"r"(srcv+x),
                            [shuffle]"r"(shuffle)
                        : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7"
                    );
        else
#endif

        {
            /* SSE2 fallback: byte interleave only. The high half of each
             * register is moved into xmm4 and interleaved with the low half
             * using punpcklbw. */
            assert(pixel_size == 1);
            for (x = 0; x < (width & ~31); x += 32)
                asm volatile
                    (
                        LOAD2X32
                        "movhlps   %%xmm0, %%xmm4\n"
                        "punpcklbw %%xmm4, %%xmm0\n"

                        "movhlps   %%xmm1, %%xmm4\n"
                        "punpcklbw %%xmm4, %%xmm1\n"

                        "movhlps   %%xmm2, %%xmm4\n"
                        "punpcklbw %%xmm4, %%xmm2\n"

                        "movhlps   %%xmm3, %%xmm4\n"
                        "punpcklbw %%xmm4, %%xmm3\n"
                        STORE64
                        : : [dst]"r"(dst+2*x),
                            [src1]"r"(srcu+x), [src2]"r"(srcv+x)
                        : "memory",
                          "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm7"
                    );
        }
#undef LOAD2X32
#undef STORE64

        /* Scalar tail: x continues where the vector loop stopped and covers
         * the bytes beyond the last multiple of 32. */
        if (pixel_size == 1)
        {
            for (; x < width; x++) {
                dst[2*x+0] = srcu[x];
                dst[2*x+1] = srcv[x];
            }
        }
        else
        {
            /* One 2-byte sample per plane per iteration */
            for (; x < width; x+= 2) {
                dst[2*x+0] = srcu[x];
                dst[2*x+1] = srcu[x + 1];
                dst[2*x+2] = srcv[x];
                dst[2*x+3] = srcv[x + 1];
            }
        }
        srcu += srcu_pitch;
        srcv += srcv_pitch;
        dst += dst_pitch;
    }
}
 352
/* Split one packed UV plane into separate U and V planes.
 *
 * For each of the `height` lines, 2 * width bytes are read from src and
 * `width` bytes are written to each of dstu and dstv. With pixel_size == 1
 * even source bytes go to dstu and odd source bytes to dstv; with
 * pixel_size == 2 the same is done on 2-byte samples.
 *
 * src and src_pitch must be 16-byte aligned (asserted). Samples wider than
 * one byte are only handled by the SSSE3 path; the SSE2 fallback asserts
 * pixel_size == 1.
 */
VLC_SSE
static void SSE_SplitUV(uint8_t *dstu, size_t dstu_pitch,
                        uint8_t *dstv, size_t dstv_pitch,
                        const uint8_t *src, size_t src_pitch,
                        unsigned width, unsigned height, uint8_t pixel_size)
{
    assert(pixel_size == 1 || pixel_size == 2);
    assert(((intptr_t)src & 0xf) == 0 && (src_pitch & 0x0f) == 0);

/* Load 64 packed source bytes into xmm0..xmm3 with aligned loads. */
#define LOAD64 \
    "movdqa  0(%[src]), %%xmm0\n" \
    "movdqa 16(%[src]), %%xmm1\n" \
    "movdqa 32(%[src]), %%xmm2\n" \
    "movdqa 48(%[src]), %%xmm3\n"

/* Store the low 8 bytes of each register to dst1 and the high 8 bytes to
 * dst2 (32 bytes to each destination). */
#define STORE2X32 \
    "movq   %%xmm0,   0(%[dst1])\n" \
    "movq   %%xmm1,   8(%[dst1])\n" \
    "movhpd %%xmm0,   0(%[dst2])\n" \
    "movhpd %%xmm1,   8(%[dst2])\n" \
    "movq   %%xmm2,  16(%[dst1])\n" \
    "movq   %%xmm3,  24(%[dst1])\n" \
    "movhpd %%xmm2,  16(%[dst2])\n" \
    "movhpd %%xmm3,  24(%[dst2])\n"

#ifdef CAN_COMPILE_SSSE3
    if (vlc_CPU_SSSE3())
    {
        /* pshufb control masks: gather the U samples into the low 8 bytes
         * and the V samples into the high 8 bytes of a register. */
        static const uint8_t shuffle_8[] = { 0, 2, 4, 6, 8, 10, 12, 14,
                                             1, 3, 5, 7, 9, 11, 13, 15 };
        static const uint8_t shuffle_16[] = {  0,  1,  4,  5,  8,  9, 12, 13,
                                               2,  3,  6,  7, 10, 11, 14, 15 };
        const uint8_t *shuffle = pixel_size == 1 ? shuffle_8 : shuffle_16;
        for (unsigned y = 0; y < height; y++) {
            unsigned x = 0;
            /* 64 source bytes -> 32 bytes of U (dst1) + 32 bytes of V (dst2) */
            for (; x < (width & ~31); x += 32) {
                asm volatile (
                    "movdqu (%[shuffle]), %%xmm7\n"
                    LOAD64
                    "pshufb  %%xmm7, %%xmm0\n"
                    "pshufb  %%xmm7, %%xmm1\n"
                    "pshufb  %%xmm7, %%xmm2\n"
                    "pshufb  %%xmm7, %%xmm3\n"
                    STORE2X32
                    : : [dst1]"r"(&dstu[x]), [dst2]"r"(&dstv[x]), [src]"r"(&src[2*x]), [shuffle]"r"(shuffle) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7");
            }
            /* Scalar tail for the bytes beyond the last multiple of 32 */
            if (pixel_size == 1)
            {
                for (; x < width; x++) {
                    dstu[x] = src[2*x+0];
                    dstv[x] = src[2*x+1];
                }
            }
            else
            {
                /* One 2-byte sample per plane per iteration */
                for (; x < width; x+= 2) {
                    dstu[x] = src[2*x+0];
                    dstu[x+1] = src[2*x+1];
                    dstv[x] = src[2*x+2];
                    dstv[x+1] = src[2*x+3];
                }
            }
            src  += src_pitch;
            dstu += dstu_pitch;
            dstv += dstv_pitch;
        }
    } else
#endif
    {
        /* SSE2 fallback, single-byte samples only. The mask keeps the even
         * (U) byte of every 16-bit word; psrlw $8 keeps the odd (V) byte. */
        assert(pixel_size == 1);
        static const uint8_t mask[] = { 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00,
                                        0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00 };

        for (unsigned y = 0; y < height; y++)
        {
            unsigned x = 0;
            /* After packuswb the V bytes sit in the low half of each
             * register and the U bytes in the high half. This is why the
             * operands are bound the other way round than in the SSSE3
             * path: dst1 (low half) is dstv and dst2 (high half) is dstu. */
            for (; x < (width & ~31); x += 32) {
                asm volatile (
                    "movdqu (%[mask]), %%xmm7\n"
                    LOAD64
                    "movdqa   %%xmm0, %%xmm4\n"
                    "movdqa   %%xmm1, %%xmm5\n"
                    "movdqa   %%xmm2, %%xmm6\n"
                    "psrlw    $8,     %%xmm0\n"
                    "psrlw    $8,     %%xmm1\n"
                    "pand     %%xmm7, %%xmm4\n"
                    "pand     %%xmm7, %%xmm5\n"
                    "pand     %%xmm7, %%xmm6\n"
                    "packuswb %%xmm4, %%xmm0\n"
                    "packuswb %%xmm5, %%xmm1\n"
                    "pand     %%xmm3, %%xmm7\n"
                    "psrlw    $8,     %%xmm2\n"
                    "psrlw    $8,     %%xmm3\n"
                    "packuswb %%xmm6, %%xmm2\n"
                    "packuswb %%xmm7, %%xmm3\n"
                    STORE2X32
                    : : [dst2]"r"(&dstu[x]), [dst1]"r"(&dstv[x]), [src]"r"(&src[2*x]), [mask]"r"(mask) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
            }
            /* Scalar tail for the bytes beyond the last multiple of 32 */
            for (; x < width; x++) {
                dstu[x] = src[2*x+0];
                dstv[x] = src[2*x+1];
            }
            src  += src_pitch;
            dstu += dstu_pitch;
            dstv += dstv_pitch;
        }
    }
#undef STORE2X32
#undef LOAD64
}
 463
 464 static void SSE_CopyPlane(uint8_t *dst, size_t dst_pitch,
 465                           const uint8_t *src, size_t src_pitch,
 466                           uint8_t *cache, size_t cache_size,
 467                           unsigned height, int bitshift)
 468 {
 469     const unsigned w16 = (src_pitch+15) & ~15;
 470     const unsigned hstep = cache_size / w16;
 471     assert(hstep > 0);
 472
 473     /* If SSE4.1: CopyFromUswc is faster than memcpy */
 474     if (!vlc_CPU_SSE4_1() && bitshift == 0 && src_pitch == dst_pitch)
 475         memcpy(dst, src, src_pitch * height);
 476     else
 477     for (unsigned y = 0; y < height; y += hstep) {
 478         const unsigned hblock =  __MIN(hstep, height - y);
 479
 480         /* Copy a bunch of line into our cache */
 481         CopyFromUswc(cache, w16, src, src_pitch, src_pitch, hblock, bitshift);
 482
 483         /* Copy from our cache to the destination */
 484         Copy2d(dst, dst_pitch, cache, w16, src_pitch, hblock);
 485
 486         /* */
 487         src += src_pitch * hblock;
 488         dst += dst_pitch * hblock;
 489     }
 490 }
 491
 492 static void
 493 SSE_InterleavePlanes(uint8_t *dst, size_t dst_pitch,
 494                      const uint8_t *srcu, size_t srcu_pitch,
 495                      const uint8_t *srcv, size_t srcv_pitch,
 496                      uint8_t *cache, size_t cache_size,
 497                      unsigned int height, uint8_t pixel_size, int bitshift)
 498 {
 499     assert(srcu_pitch == srcv_pitch);
 500     unsigned int const  w16 = (srcu_pitch+15) & ~15;
 501     unsigned int const  hstep = (cache_size) / (2*w16);
 502     assert(hstep > 0);
 503
 504     for (unsigned int y = 0; y < height; y += hstep)
 505     {
 506         unsigned int const      hblock = __MIN(hstep, height - y);
 507
 508         /* Copy a bunch of line into our cache */
 509         CopyFromUswc(cache, w16, srcu, srcu_pitch, srcu_pitch, hblock, bitshift);
 510         CopyFromUswc(cache+w16*hblock, w16, srcv, srcv_pitch,
 511                      srcv_pitch, hblock, bitshift);
 512
 513         /* Copy from our cache to the destination */
 514         SSE_InterleaveUV(dst, dst_pitch, cache, w16,
 515                          cache + w16 * hblock, w16,
 516                          srcu_pitch, hblock, pixel_size);
 517
 518         /* */
 519         srcu += hblock * srcu_pitch;
 520         srcv += hblock * srcv_pitch;
 521         dst += hblock * dst_pitch;
 522     }
 523 }
 524
 525 static void SSE_SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
 526                             uint8_t *dstv, size_t dstv_pitch,
 527                             const uint8_t *src, size_t src_pitch,
 528                             uint8_t *cache, size_t cache_size,
 529                             unsigned height, uint8_t pixel_size, int bitshift)
 530 {
 531     const unsigned w16 = (src_pitch+15) & ~15;
 532     const unsigned hstep = cache_size / w16;
 533     assert(hstep > 0);
 534
 535     for (unsigned y = 0; y < height; y += hstep) {
 536         const unsigned hblock =  __MIN(hstep, height - y);
 537
 538         /* Copy a bunch of line into our cache */
 539         CopyFromUswc(cache, w16, src, src_pitch, src_pitch, hblock, bitshift);
 540
 541         /* Copy from our cache to the destination */
 542         SSE_SplitUV(dstu, dstu_pitch, dstv, dstv_pitch,
 543                     cache, w16, src_pitch / 2, hblock, pixel_size);
 544
 545         /* */
 546         src  += src_pitch  * hblock;
 547         dstu += dstu_pitch * hblock;
 548         dstv += dstv_pitch * hblock;
 549     }
 550 }
 551
 552 static void SSE_Copy420_P_to_P(picture_t *dst, const uint8_t *src[static 3],
 553                                const size_t src_pitch[static 3], unsigned height,
 554                                const copy_cache_t *cache)
 555 {
 556     for (unsigned n = 0; n < 3; n++) {
 557         const unsigned d = n > 0 ? 2 : 1;
 558         SSE_CopyPlane(dst->p[n].p_pixels, dst->p[n].i_pitch,
 559                       src[n], src_pitch[n],
 560                       cache->buffer, cache->size,
 561                       (height+d-1)/d, 0);
 562     }
 563     asm volatile ("emms");
 564 }
 565
 566
 567 static void SSE_Copy420_SP_to_SP(picture_t *dst, const uint8_t *src[static 2],
 568                                  const size_t src_pitch[static 2], unsigned height,
 569                                  const copy_cache_t *cache)
 570 {
 571     SSE_CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch, src[0], src_pitch[0],
 572                   cache->buffer, cache->size, height, 0);
 573     SSE_CopyPlane(dst->p[1].p_pixels, dst->p[1].i_pitch, src[1], src_pitch[1],
 574                   cache->buffer, cache->size, height / 2, 0);
 575     asm volatile ("emms");
 576 }
 577
 578 static void
 579 SSE_Copy420_SP_to_P(picture_t *dest, const uint8_t *src[static 2],
 580                     const size_t src_pitch[static 2], unsigned int height,
 581                     uint8_t pixel_size, int bitshift, const copy_cache_t *cache)
 582 {
 583     SSE_CopyPlane(dest->p[0].p_pixels, dest->p[0].i_pitch,
 584                   src[0], src_pitch[0], cache->buffer, cache->size, height, bitshift);
 585
 586     SSE_SplitPlanes(dest->p[1].p_pixels, dest->p[1].i_pitch,
 587                     dest->p[2].p_pixels, dest->p[2].i_pitch,
 588                     src[1], src_pitch[1], cache->buffer, cache->size,
 589                     height / 2, pixel_size, bitshift);
 590     asm volatile ("emms");
 591 }
 592
 593 static void SSE_Copy420_P_to_SP(picture_t *dst, const uint8_t *src[static 3],
 594                                 const size_t src_pitch[static 3],
 595                                 unsigned height, uint8_t pixel_size,
 596                                 int bitshift, const copy_cache_t *cache)
 597 {
 598     SSE_CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch, src[0], src_pitch[0],
 599                   cache->buffer, cache->size, height, bitshift);
 600     SSE_InterleavePlanes(dst->p[1].p_pixels, dst->p[1].i_pitch,
 601                          src[U_PLANE], src_pitch[U_PLANE],
 602                          src[V_PLANE], src_pitch[V_PLANE],
 603                          cache->buffer, cache->size, height / 2, pixel_size, bitshift);
 604     asm volatile ("emms");
 605 }
 606 #undef COPY64
 607 #endif /* CAN_COMPILE_SSE2 */
 608
 609 static void CopyPlane(uint8_t *dst, size_t dst_pitch,
 610                       const uint8_t *src, size_t src_pitch,
 611                       unsigned height, int bitshift)
 612 {
 613     if (bitshift != 0)
 614     {
 615         for (unsigned y = 0; y < height; y++)
 616         {
 617             uint16_t *dst16 = (uint16_t *) dst;
 618             const uint16_t *src16 = (const uint16_t *) src;
 619
 620             if (bitshift > 0)
 621                 for (unsigned x = 0; x < (src_pitch / 2); x++)
 622                     *dst16++ = (*src16++) >> (bitshift & 0xf);
 623             else
 624                 for (unsigned x = 0; x < (src_pitch / 2); x++)
 625                     *dst16++ = (*src16++) << ((-bitshift) & 0xf);
 626             src += src_pitch;
 627             dst += dst_pitch;
 628         }
 629     }
 630     else if (src_pitch == dst_pitch)
 631         memcpy(dst, src, src_pitch * height);
 632     else
 633     for (unsigned y = 0; y < height; y++) {
 634         memcpy(dst, src, src_pitch);
 635         src += src_pitch;
 636         dst += dst_pitch;
 637     }
 638 }
 639
 640 void Copy420_SP_to_SP(picture_t *dst, const uint8_t *src[static 2],
 641                       const size_t src_pitch[static 2], unsigned height,
 642                       const copy_cache_t *cache)
 643 {
 644     ASSERT_2PLANES;
 645 #ifdef CAN_COMPILE_SSE2
 646     if (vlc_CPU_SSE2())
 647         return SSE_Copy420_SP_to_SP(dst, src, src_pitch, height, cache);
 648 #else
 649     (void) cache;
 650 #endif
 651
 652     CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
 653               src[0], src_pitch[0], height, 0);
 654     CopyPlane(dst->p[1].p_pixels, dst->p[1].i_pitch,
 655               src[1], src_pitch[1], height/2, 0);
 656 }
 657
 658 #define SPLIT_PLANES(type, pitch_den) do { \
 659     for (unsigned y = 0; y < height; y++) { \
 660         for (unsigned x = 0; x < src_pitch / pitch_den; x++) { \
 661             ((type *) dstu)[x] = ((const type *) src)[2*x+0]; \
 662             ((type *) dstv)[x] = ((const type *) src)[2*x+1]; \
 663         } \
 664         src  += src_pitch; \
 665         dstu += dstu_pitch; \
 666         dstv += dstv_pitch; \
 667     } \
 668 } while(0)
 669
 670 #define SPLIT_PLANES_SHIFTR(type, pitch_den, bitshift) do { \
 671     for (unsigned y = 0; y < height; y++) { \
 672         for (unsigned x = 0; x < src_pitch / pitch_den; x++) { \
 673             ((type *) dstu)[x] = (((const type *) src)[2*x+0]) >> (bitshift); \
 674             ((type *) dstv)[x] = (((const type *) src)[2*x+1]) >> (bitshift); \
 675         } \
 676         src  += src_pitch; \
 677         dstu += dstu_pitch; \
 678         dstv += dstv_pitch; \
 679     } \
 680 } while(0)
 681
 682 #define SPLIT_PLANES_SHIFTL(type, pitch_den, bitshift) do { \
 683     for (unsigned y = 0; y < height; y++) { \
 684         for (unsigned x = 0; x < src_pitch / pitch_den; x++) { \
 685             ((type *) dstu)[x] = (((const type *) src)[2*x+0]) << (bitshift); \
 686             ((type *) dstv)[x] = (((const type *) src)[2*x+1]) << (bitshift); \
 687         } \
 688         src  += src_pitch; \
 689         dstu += dstu_pitch; \
 690         dstv += dstv_pitch; \
 691     } \
 692 } while(0)
 693
 694 static void SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
 695                         uint8_t *dstv, size_t dstv_pitch,
 696                         const uint8_t *src, size_t src_pitch, unsigned height)
 697 {
 698     SPLIT_PLANES(uint8_t, 2);
 699 }
 700
 701 static void SplitPlanes16(uint8_t *dstu, size_t dstu_pitch,
 702                           uint8_t *dstv, size_t dstv_pitch,
 703                           const uint8_t *src, size_t src_pitch, unsigned height,
 704                           int bitshift)
 705 {
 706     if (bitshift == 0)
 707         SPLIT_PLANES(uint16_t, 4);
 708     else if (bitshift > 0)
 709         SPLIT_PLANES_SHIFTR(uint16_t, 4, bitshift & 0xf);
 710     else
 711         SPLIT_PLANES_SHIFTL(uint16_t, 4, (-bitshift) & 0xf);
 712 }
 713
 714 void Copy420_SP_to_P(picture_t *dst, const uint8_t *src[static 2],
 715                      const size_t src_pitch[static 2], unsigned height,
 716                      const copy_cache_t *cache)
 717 {
 718     ASSERT_2PLANES;
 719 #ifdef CAN_COMPILE_SSE2
 720     if (vlc_CPU_SSE2())
 721         return SSE_Copy420_SP_to_P(dst, src, src_pitch, height, 1, 0, cache);
 722 #else
 723     VLC_UNUSED(cache);
 724 #endif
 725
 726     CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
 727               src[0], src_pitch[0], height, 0);
 728     SplitPlanes(dst->p[1].p_pixels, dst->p[1].i_pitch,
 729                 dst->p[2].p_pixels, dst->p[2].i_pitch,
 730                 src[1], src_pitch[1], height/2);
 731 }
 732
 733 void Copy420_16_SP_to_P(picture_t *dst, const uint8_t *src[static 2],
 734                         const size_t src_pitch[static 2], unsigned height,
 735                         int bitshift, const copy_cache_t *cache)
 736 {
 737     ASSERT_2PLANES;
 738     assert(bitshift >= -6 && bitshift <= 6 && (bitshift % 2 == 0));
 739
 740 #ifdef CAN_COMPILE_SSE3
 741     if (vlc_CPU_SSSE3())
 742         return SSE_Copy420_SP_to_P(dst, src, src_pitch, height, 2, bitshift, cache);
 743 #else
 744     VLC_UNUSED(cache);
 745 #endif
 746
 747     CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
 748               src[0], src_pitch[0], height, bitshift);
 749     SplitPlanes16(dst->p[1].p_pixels, dst->p[1].i_pitch,
 750                   dst->p[2].p_pixels, dst->p[2].i_pitch,
 751                   src[1], src_pitch[1], height/2, bitshift);
 752 }
 753
 754 #define INTERLEAVE_UV() do { \
 755     for ( unsigned int line = 0; line < copy_lines; line++ ) { \
 756         for ( unsigned int col = 0; col < copy_pitch; col++ ) { \
 757             *dstUV++ = *srcU++; \
 758             *dstUV++ = *srcV++; \
 759         } \
 760         dstUV += i_extra_pitch_uv; \
 761         srcU  += i_extra_pitch_u; \
 762         srcV  += i_extra_pitch_v; \
 763     } \
 764 }while(0)
 765
 766 #define INTERLEAVE_UV_SHIFTR(bitshitf) do { \
 767     for ( unsigned int line = 0; line < copy_lines; line++ ) { \
 768         for ( unsigned int col = 0; col < copy_pitch; col++ ) { \
 769             *dstUV++ = (*srcU++) >> (bitshitf); \
 770             *dstUV++ = (*srcV++) >> (bitshitf); \
 771         } \
 772         dstUV += i_extra_pitch_uv; \
 773         srcU  += i_extra_pitch_u; \
 774         srcV  += i_extra_pitch_v; \
 775     } \
 776 }while(0)
 777
 778 #define INTERLEAVE_UV_SHIFTL(bitshitf) do { \
 779     for ( unsigned int line = 0; line < copy_lines; line++ ) { \
 780         for ( unsigned int col = 0; col < copy_pitch; col++ ) { \
 781             *dstUV++ = (*srcU++) << (bitshitf); \
 782             *dstUV++ = (*srcV++) << (bitshitf); \
 783         } \
 784         dstUV += i_extra_pitch_uv; \
 785         srcU  += i_extra_pitch_u; \
 786         srcV  += i_extra_pitch_v; \
 787     } \
 788 }while(0)
 789
 790 void Copy420_P_to_SP(picture_t *dst, const uint8_t *src[static 3],
 791                      const size_t src_pitch[static 3], unsigned height,
 792                      const copy_cache_t *cache)
 793 {
 794     ASSERT_3PLANES;
 795 #ifdef CAN_COMPILE_SSE2
 796     if (vlc_CPU_SSE2())
 797         return SSE_Copy420_P_to_SP(dst, src, src_pitch, height, 1, 0, cache);
 798 #else
 799     (void) cache;
 800 #endif
 801
 802     CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
 803               src[0], src_pitch[0], height, 0);
 804
 805     const unsigned copy_lines = height / 2;
 806     const unsigned copy_pitch = src_pitch[1];
 807
 808     const int i_extra_pitch_uv = dst->p[1].i_pitch - 2 * copy_pitch;
 809     const int i_extra_pitch_u  = src_pitch[U_PLANE] - copy_pitch;
 810     const int i_extra_pitch_v  = src_pitch[V_PLANE] - copy_pitch;
 811
 812     uint8_t *dstUV = dst->p[1].p_pixels;
 813     const uint8_t *srcU  = src[U_PLANE];
 814     const uint8_t *srcV  = src[V_PLANE];
 815     INTERLEAVE_UV();
 816 }
 817
 818 void Copy420_16_P_to_SP(picture_t *dst, const uint8_t *src[static 3],
 819                         const size_t src_pitch[static 3], unsigned height,
 820                         int bitshift, const copy_cache_t *cache)
 821 {
 822     ASSERT_3PLANES;
 823     assert(bitshift >= -6 && bitshift <= 6 && (bitshift % 2 == 0));
 824 #ifdef CAN_COMPILE_SSE2
 825     if (vlc_CPU_SSSE3())
 826         return SSE_Copy420_P_to_SP(dst, src, src_pitch, height, 2, bitshift, cache);
 827 #else
 828     (void) cache;
 829 #endif
 830
 831     CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
 832               src[0], src_pitch[0], height, bitshift);
 833
 834     const unsigned copy_lines = height / 2;
 835     const unsigned copy_pitch = src_pitch[1] / 2;
 836
 837     const int i_extra_pitch_uv = dst->p[1].i_pitch / 2 - 2 * copy_pitch;
 838     const int i_extra_pitch_u  = src_pitch[U_PLANE] / 2 - copy_pitch;
 839     const int i_extra_pitch_v  = src_pitch[V_PLANE] / 2 - copy_pitch;
 840
 841     uint16_t *dstUV = (void*) dst->p[1].p_pixels;
 842     const uint16_t *srcU  = (const uint16_t *) src[U_PLANE];
 843     const uint16_t *srcV  = (const uint16_t *) src[V_PLANE];
 844
 845     if (bitshift == 0)
 846         INTERLEAVE_UV();
 847     else if (bitshift > 0)
 848         INTERLEAVE_UV_SHIFTR(bitshift & 0xf);
 849     else
 850         INTERLEAVE_UV_SHIFTL((-bitshift) & 0xf);
 851 }
 852
 853 void CopyFromI420_10ToP010(picture_t *dst, const uint8_t *src[static 3],
 854                            const size_t src_pitch[static 3],
 855                            unsigned height, const copy_cache_t *cache)
 856 {
 857     (void) cache;
 858
 859     const int i_extra_pitch_dst_y = (dst->p[0].i_pitch  - src_pitch[0]) / 2;
 860     const int i_extra_pitch_src_y = (src_pitch[Y_PLANE] - src_pitch[0]) / 2;
 861     uint16_t *dstY = (uint16_t *) dst->p[0].p_pixels;
 862     const uint16_t *srcY = (const uint16_t *) src[Y_PLANE];
 863     for (unsigned y = 0; y < height; y++) {
 864         for (unsigned x = 0; x < (src_pitch[0] / 2); x++) {
 865             *dstY++ = *srcY++ << 6;
 866         }
 867         dstY += i_extra_pitch_dst_y;
 868         srcY += i_extra_pitch_src_y;
 869     }
 870
 871     const unsigned copy_lines = height / 2;
 872     const unsigned copy_pitch = src_pitch[1] / 2;
 873
 874     const int i_extra_pitch_uv = dst->p[1].i_pitch / 2 - 2 * copy_pitch;
 875     const int i_extra_pitch_u  = src_pitch[U_PLANE] / 2 - copy_pitch;
 876     const int i_extra_pitch_v  = src_pitch[V_PLANE] / 2 - copy_pitch;
 877
 878     uint16_t *dstUV = (uint16_t *) dst->p[1].p_pixels;
 879     const uint16_t *srcU  = (const uint16_t *) src[U_PLANE];
 880     const uint16_t *srcV  = (const uint16_t *) src[V_PLANE];
 881     for ( unsigned int line = 0; line < copy_lines; line++ )
 882     {
 883         for ( unsigned int col = 0; col < copy_pitch; col++ )
 884         {
 885             *dstUV++ = *srcU++ << 6;
 886             *dstUV++ = *srcV++ << 6;
 887         }
 888         dstUV += i_extra_pitch_uv;
 889         srcU  += i_extra_pitch_u;
 890         srcV  += i_extra_pitch_v;
 891     }
 892 }
 893
 894 void Copy420_P_to_P(picture_t *dst, const uint8_t *src[static 3],
 895                     const size_t src_pitch[static 3], unsigned height,
 896                     const copy_cache_t *cache)
 897 {
 898     ASSERT_3PLANES;
 899 #ifdef CAN_COMPILE_SSE2
 900     if (vlc_CPU_SSE2())
 901         return SSE_Copy420_P_to_P(dst, src, src_pitch, height, cache);
 902 #else
 903     (void) cache;
 904 #endif
 905
 906      CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
 907                src[0], src_pitch[0], height, 0);
 908      CopyPlane(dst->p[1].p_pixels, dst->p[1].i_pitch,
 909                src[1], src_pitch[1], height / 2, 0);
 910      CopyPlane(dst->p[2].p_pixels, dst->p[2].i_pitch,
 911                src[2], src_pitch[2], height / 2, 0);
 912 }
 913
 914 int picture_UpdatePlanes(picture_t *picture, uint8_t *data, unsigned pitch)
 915 {
 916     /* fill in buffer info in first plane */
 917     picture->p->p_pixels = data;
 918     picture->p->i_pitch  = pitch;
 919     picture->p->i_lines  = picture->format.i_height;
 920     assert(picture->p->i_visible_pitch <= picture->p->i_pitch);
 921     assert(picture->p->i_visible_lines <= picture->p->i_lines);
 922
 923     /*  Fill chroma planes for biplanar YUV */
 924     if (picture->format.i_chroma == VLC_CODEC_NV12 ||
 925         picture->format.i_chroma == VLC_CODEC_NV21 ||
 926         picture->format.i_chroma == VLC_CODEC_P010) {
 927
 928         for (int n = 1; n < picture->i_planes; n++) {
 929             const plane_t *o = &picture->p[n-1];
 930             plane_t *p = &picture->p[n];
 931
 932             p->p_pixels = o->p_pixels + o->i_lines * o->i_pitch;
 933             p->i_pitch  = pitch;
 934             p->i_lines  = picture->format.i_height;
 935             assert(p->i_visible_pitch <= p->i_pitch);
 936             assert(p->i_visible_lines <= p->i_lines);
 937         }
 938         /* The dx/d3d buffer is always allocated as NV12 */
 939         if (vlc_fourcc_AreUVPlanesSwapped(picture->format.i_chroma, VLC_CODEC_NV12)) {
 940             /* TODO : Swap NV21 UV planes to match NV12 */
 941             return VLC_EGENERIC;
 942         }
 943     }
 944
 945     /*  Fill chroma planes for planar YUV */
 946     else
 947     if (picture->format.i_chroma == VLC_CODEC_I420 ||
 948         picture->format.i_chroma == VLC_CODEC_J420 ||
 949         picture->format.i_chroma == VLC_CODEC_YV12) {
 950
 951         for (int n = 1; n < picture->i_planes; n++) {
 952             const plane_t *o = &picture->p[n-1];
 953             plane_t *p = &picture->p[n];
 954
 955             p->p_pixels = o->p_pixels + o->i_lines * o->i_pitch;
 956             p->i_pitch  = pitch / 2;
 957             p->i_lines  = picture->format.i_height / 2;
 958         }
 959         /* The dx/d3d buffer is always allocated as YV12 */
 960         if (vlc_fourcc_AreUVPlanesSwapped(picture->format.i_chroma, VLC_CODEC_YV12))
 961             picture_SwapUV( picture );
 962     }
 963     return VLC_SUCCESS;
 964 }
 965
 966 #ifdef COPY_TEST
 967
 968 #include <vlc_picture.h>
 969
 970 struct test_dst
 971 {
 972     vlc_fourcc_t chroma;
 973     int bitshift;
 974     union
 975     {
 976         void (*conv)(picture_t *, const uint8_t *[], const size_t [], unsigned,
 977                      const copy_cache_t *);
 978         void (*conv16)(picture_t *, const uint8_t *[], const size_t [], unsigned, int,
 979                      const copy_cache_t *);
 980     };
 981 };
 982
 983 struct test_conv
 984 {
 985     vlc_fourcc_t src_chroma;
 986     struct test_dst dsts[3];
 987 };
 988
 989 static const struct test_conv convs[] = {
 990     { .src_chroma = VLC_CODEC_NV12,
 991       .dsts = { { VLC_CODEC_I420, 0, .conv = Copy420_SP_to_P },
 992                 { VLC_CODEC_NV12, 0, .conv = Copy420_SP_to_SP } },
 993     },
 994     { .src_chroma = VLC_CODEC_I420,
 995       .dsts = { { VLC_CODEC_I420, 0, .conv = Copy420_P_to_P },
 996                 { VLC_CODEC_NV12, 0, .conv = Copy420_P_to_SP } },
 997     },
 998     { .src_chroma = VLC_CODEC_P010,
 999       .dsts = { { VLC_CODEC_I420_10L, 6, .conv16 = Copy420_16_SP_to_P } },
1000     },
1001     { .src_chroma = VLC_CODEC_I420_10L,
1002       .dsts = { { VLC_CODEC_P010, -6, .conv16 = Copy420_16_P_to_SP } },
1003     },
1004 };
1005 #define NB_CONVS ARRAY_SIZE(convs)
1006
/* Picture dimensions for one test run: full size and visible size. */
struct test_size
{
    int i_width;
    int i_height;
    int i_visible_width;
    int i_visible_height;
};

/* Dimensions every conversion is run with, including odd and tiny sizes. */
static const struct test_size sizes[] = {
    { .i_width = 1,    .i_height = 1,
      .i_visible_width = 1,    .i_visible_height = 1 },
    { .i_width = 3,    .i_height = 3,
      .i_visible_width = 3,    .i_visible_height = 3 },
    { .i_width = 65,   .i_height = 39,
      .i_visible_width = 65,   .i_visible_height = 39 },
    { .i_width = 560,  .i_height = 369,
      .i_visible_width = 540,  .i_visible_height = 350 },
    { .i_width = 1274, .i_height = 721,
      .i_visible_width = 1200, .i_visible_height = 720 },
    { .i_width = 1920, .i_height = 1088,
      .i_visible_width = 1920, .i_visible_height = 1080 },
    { .i_width = 3840, .i_height = 2160,
      .i_visible_width = 3840, .i_visible_height = 2160 },
#if 0 /* too long */
    { .i_width = 8192, .i_height = 8192,
      .i_visible_width = 8192, .i_visible_height = 8192 },
#endif
};
#define NB_SIZES ARRAY_SIZE(sizes)
1027
1028 static void piccheck(picture_t *pic, const vlc_chroma_description_t *dsc,
1029                      bool init)
1030 {
1031 #define ASSERT_COLOR(good) do { \
1032     fprintf(stderr, "error: pixel doesn't match @ plane: %d: %d x %d: 0x%X vs 0x%X\n", i, x, y, *(--p), good); \
1033     assert(!"error: pixel doesn't match"); \
1034 } while(0)
1035
1036 #define PICCHECK(type_u, type_uv, colors_P, color_UV, pitch_den) do { \
1037     for (int i = 0; i < pic->i_planes; ++i) \
1038     { \
1039         const struct plane_t *plane = &pic->p[i]; \
1040         for (int y = 0; y < plane->i_visible_lines; ++y) \
1041         { \
1042             if (pic->i_planes == 2 && i == 1) \
1043             { \
1044                 type_uv *p = (type_uv *)&plane->p_pixels[y * plane->i_pitch]; \
1045                 for (int x = 0; x < plane->i_visible_pitch / 2 / pitch_den; ++x) \
1046                     if (init) \
1047                         *(p++) = color_UV; \
1048                     else if (*(p++) != color_UV) \
1049                         ASSERT_COLOR(color_UV); \
1050             } \
1051             else \
1052             { \
1053                 type_u *p = (type_u *) &plane->p_pixels[y * plane->i_pitch]; \
1054                 for (int x = 0; x < plane->i_visible_pitch / pitch_den; ++x) \
1055                     if (init) \
1056                         *(p++) = colors_P[i]; \
1057                     else if (*(p++) != colors_P[i]) \
1058                         ASSERT_COLOR(colors_P[i]); \
1059             } \
1060         } \
1061     } \
1062 } while (0)
1063
1064     assert(pic->i_planes == 2 || pic->i_planes == 3);
1065     assert(dsc->pixel_size == 1 || dsc->pixel_size == 2);
1066
1067     if (dsc->pixel_size == 1)
1068     {
1069         const uint8_t colors_8_P[3] = { 0x42, 0xF1, 0x36 };
1070         const uint16_t color_8_UV = ntoh16(0xF136);
1071         PICCHECK(uint8_t, uint16_t, colors_8_P, color_8_UV, 1);
1072     }
1073     else
1074     {
1075         const unsigned mask = (1 << dsc->pixel_bits) - 1;
1076         uint16_t colors_16_P[3] = { 0x1042 &mask, 0xF114 &mask, 0x3645 &mask};
1077
1078         switch (pic->format.i_chroma)
1079         {
1080             case VLC_CODEC_P010:
1081                 for (size_t i = 0; i < 3; ++i)
1082                     colors_16_P[i] <<= 6;
1083                 break;
1084             case VLC_CODEC_I420_10L:
1085                 break;
1086             default:
1087                 vlc_assert_unreachable();
1088         }
1089
1090         uint32_t color_16_UV = (colors_16_P[2] << 16) | colors_16_P[1];
1091
1092         PICCHECK(uint16_t, uint32_t, colors_16_P, color_16_UV, 2);
1093     }
1094 }
1095
1096 static void pic_rsc_destroy(picture_t *pic)
1097 {
1098     for (unsigned i = 0; i < 3; i++)
1099         free(pic->p[i].p_pixels);
1100     free(pic);
1101 }
1102
1103 static picture_t *pic_new_unaligned(const video_format_t *fmt)
1104 {
1105     /* Allocate a no-aligned picture in order to ease buffer overflow detection
1106      * from the source picture */
1107     const vlc_chroma_description_t *dsc = vlc_fourcc_GetChromaDescription(fmt->i_chroma);
1108     assert(dsc);
1109     picture_resource_t rsc = { .pf_destroy = pic_rsc_destroy };
1110     for (unsigned i = 0; i < dsc->plane_count; i++)
1111     {
1112         rsc.p[i].i_lines = ((fmt->i_visible_height + 1) & ~ 1) * dsc->p[i].h.num / dsc->p[i].h.den;
1113         rsc.p[i].i_pitch = ((fmt->i_visible_width + 1) & ~ 1) * dsc->pixel_size * dsc->p[i].w.num / dsc->p[i].w.den;
1114         rsc.p[i].p_pixels = malloc(rsc.p[i].i_lines * rsc.p[i].i_pitch);
1115         assert(rsc.p[i].p_pixels);
1116     }
1117     return picture_NewFromResource(fmt, &rsc);
1118 }
1119
1120 int main(void)
1121 {
1122     alarm(10);
1123
1124 #ifndef COPY_TEST_NOOPTIM
1125     if (!vlc_CPU_SSE2())
1126     {
1127         fprintf(stderr, "WARNING: could not test SSE\n");
1128         return 77;
1129     }
1130 #endif
1131
1132     for (size_t i = 0; i < NB_CONVS; ++i)
1133     {
1134         const struct test_conv *conv = &convs[i];
1135
1136         for (size_t j = 0; j < NB_SIZES; ++j)
1137         {
1138             const struct test_size *size = &sizes[j];
1139
1140             const vlc_chroma_description_t *src_dsc =
1141                 vlc_fourcc_GetChromaDescription(conv->src_chroma);
1142             assert(src_dsc);
1143
1144             video_format_t fmt;
1145             video_format_Init(&fmt, 0);
1146             video_format_Setup(&fmt, conv->src_chroma,
1147                                size->i_width, size->i_height,
1148                                size->i_visible_width, size->i_visible_height,
1149                                1, 1);
1150             picture_t *src = pic_new_unaligned(&fmt);
1151             assert(src);
1152             piccheck(src, src_dsc, true);
1153
1154             copy_cache_t cache;
1155             int ret = CopyInitCache(&cache, src->format.i_width
1156                                     * src_dsc->pixel_size);
1157             assert(ret == VLC_SUCCESS);
1158
1159             for (size_t f = 0; conv->dsts[f].chroma != 0; ++f)
1160             {
1161                 const struct test_dst *test_dst= &conv->dsts[f];
1162
1163                 const vlc_chroma_description_t *dst_dsc =
1164                     vlc_fourcc_GetChromaDescription(test_dst->chroma);
1165                 assert(dst_dsc);
1166                 fmt.i_chroma = test_dst->chroma;
1167                 picture_t *dst = picture_NewFromFormat(&fmt);
1168                 assert(dst);
1169
1170                 const uint8_t * src_planes[3] = { src->p[Y_PLANE].p_pixels,
1171                                                   src->p[U_PLANE].p_pixels,
1172                                                   src->p[V_PLANE].p_pixels };
1173                 const size_t    src_pitches[3] = { src->p[Y_PLANE].i_pitch,
1174                                                    src->p[U_PLANE].i_pitch,
1175                                                    src->p[V_PLANE].i_pitch };
1176
1177                 fprintf(stderr, "testing: %u x %u (vis: %u x %u) %4.4s -> %4.4s\n",
1178                         size->i_width, size->i_height,
1179                         size->i_visible_width, size->i_visible_height,
1180                         (const char *) &src->format.i_chroma,
1181                         (const char *) &dst->format.i_chroma);
1182                 if (test_dst->bitshift == 0)
1183                     test_dst->conv(dst, src_planes, src_pitches,
1184                                    src->format.i_visible_height, &cache);
1185                 else
1186                     test_dst->conv16(dst, src_planes, src_pitches,
1187                                    src->format.i_visible_height, test_dst->bitshift,
1188                                    &cache);
1189                 piccheck(dst, dst_dsc, false);
1190                 picture_Release(dst);
1191             }
1192             picture_Release(src);
1193             CopyCleanCache(&cache);
1194         }
1195     }
1196     return 0;
1197 }
1198
1199 #endif