modules/video_chroma/copy.c

   1 /*****************************************************************************
   2  * copy.c: Fast YV12/NV12 copy
   3  *****************************************************************************
   4  * Copyright (C) 2010 Laurent Aimar
   5  *
   6  * Authors: Laurent Aimar <fenrir _AT_ videolan _DOT_ org>
   7  *          Victorien Le Couviour--Tuffet <victorien.lecouviour.tuffet@gmail.com>
   8  *
   9  * This program is free software; you can redistribute it and/or modify it
  10  * under the terms of the GNU Lesser General Public License as published by
  11  * the Free Software Foundation; either version 2.1 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  17  * GNU Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public License
  20  * along with this program; if not, write to the Free Software Foundation,
  21  * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
  22  *****************************************************************************/
  23
  24 #ifdef HAVE_CONFIG_H
  25 # include "config.h"
  26 #endif
  27
  28 #ifdef COPY_TEST
  29 # undef NDEBUG
  30 #endif
  31
  32 #include <vlc_common.h>
  33 #include <vlc_picture.h>
  34 #include <vlc_cpu.h>
  35 #include <assert.h>
  36
  37 #include "copy.h"
  38 static void CopyPlane(uint8_t *dst, size_t dst_pitch,
  39                       const uint8_t *src, size_t src_pitch,
  40                       unsigned height, int bitshift);
  41
  42 #define ASSERT_PLANE(i) assert(src[i]); \
  43     assert(src_pitch[i])
  44
  45 #define ASSERT_2PLANES \
  46     assert(dst); \
  47     ASSERT_PLANE(0); \
  48     ASSERT_PLANE(1); \
  49     assert(height)
  50
  51 #define ASSERT_3PLANES ASSERT_2PLANES; \
  52     ASSERT_PLANE(2)
  53
  54 int CopyInitCache(copy_cache_t *cache, unsigned width)
  55 {
  56 #ifdef CAN_COMPILE_SSE2
  57     cache->size = __MAX((width + 0x3f) & ~ 0x3f, 16384);
  58     cache->buffer = aligned_alloc(64, cache->size);
  59     if (!cache->buffer)
  60         return VLC_EGENERIC;
  61 #else
  62     (void) cache; (void) width;
  63 #endif
  64     return VLC_SUCCESS;
  65 }
  66
  67 void CopyCleanCache(copy_cache_t *cache)
  68 {
  69 #ifdef CAN_COMPILE_SSE2
  70     aligned_free(cache->buffer);
  71     cache->buffer = NULL;
  72     cache->size   = 0;
  73 #else
  74     (void) cache;
  75 #endif
  76 }
  77
  78 #ifdef CAN_COMPILE_SSE2
  79 /* Copy 16/64 bytes from srcp to dstp loading data with the SSE>=2 instruction
  80  * load and storing data with the SSE>=2 instruction store.
  81  */
  82
  83 #define COPY16_SHIFTR(x) \
  84     "psrlw "x", %%xmm1\n"
  85 #define COPY16_SHIFTL(x) \
  86     "psllw "x", %%xmm1\n"
  87
  88 #define COPY16_S(dstp, srcp, load, store, shiftstr) \
  89     asm volatile (                      \
  90         load "  0(%[src]), %%xmm1\n"    \
  91         shiftstr                        \
  92         store " %%xmm1,    0(%[dst])\n" \
  93         : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1")
  94
  95 #define COPY16(dstp, srcp, load, store) COPY16_S(dstp, srcp, load, store, "")
  96
  97 #define COPY64_SHIFTR(x) \
  98     "psrlw "x", %%xmm1\n" \
  99     "psrlw "x", %%xmm2\n" \
 100     "psrlw "x", %%xmm3\n" \
 101     "psrlw "x", %%xmm4\n"
 102 #define COPY64_SHIFTL(x) \
 103     "psllw "x", %%xmm1\n" \
 104     "psllw "x", %%xmm2\n" \
 105     "psllw "x", %%xmm3\n" \
 106     "psllw "x", %%xmm4\n"
 107
 108 #define COPY64_S(dstp, srcp, load, store, shiftstr) \
 109     asm volatile (                      \
 110         load "  0(%[src]), %%xmm1\n"    \
 111         load " 16(%[src]), %%xmm2\n"    \
 112         load " 32(%[src]), %%xmm3\n"    \
 113         load " 48(%[src]), %%xmm4\n"    \
 114         shiftstr                        \
 115         store " %%xmm1,    0(%[dst])\n" \
 116         store " %%xmm2,   16(%[dst])\n" \
 117         store " %%xmm3,   32(%[dst])\n" \
 118         store " %%xmm4,   48(%[dst])\n" \
 119         : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1", "xmm2", "xmm3", "xmm4")
 120
 121 #define COPY64(dstp, srcp, load, store) \
 122     COPY64_S(dstp, srcp, load, store, "")
 123
 124 #ifdef COPY_TEST_NOOPTIM
 125 # undef vlc_CPU_SSE4_1
 126 # define vlc_CPU_SSE4_1() (0)
 127 # undef vlc_CPU_SSE3
 128 # define vlc_CPU_SSE3() (0)
 129 # undef vlc_CPU_SSSE3
 130 # define vlc_CPU_SSSE3() (0)
 131 # undef vlc_CPU_SSE2
 132 # define vlc_CPU_SSE2() (0)
 133 #endif
 134
 135 /* Optimized copy from "Uncacheable Speculative Write Combining" memory
 136  * as used by some video surface.
 137  * XXX It is really efficient only when SSE4.1 is available.
 138  */
 139 VLC_SSE
 140 static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
 141                          const uint8_t *src, size_t src_pitch,
 142                          unsigned width, unsigned height, int bitshift)
 143 {
 144     assert(((intptr_t)dst & 0x0f) == 0 && (dst_pitch & 0x0f) == 0);
 145
 146     asm volatile ("mfence");
 147
 148 #define SSE_USWC_COPY(shiftstr16, shiftstr64) \
 149     for (unsigned y = 0; y < height; y++) { \
 150         const unsigned unaligned = (-(uintptr_t)src) & 0x0f; \
 151         unsigned x = unaligned; \
 152         if (vlc_CPU_SSE4_1()) { \
 153             if (!unaligned) { \
 154                 for (; x+63 < width; x += 64) \
 155                     COPY64_S(&dst[x], &src[x], "movntdqa", "movdqa", shiftstr64); \
 156             } else { \
 157                 COPY16_S(dst, src, "movdqu", "movdqa", shiftstr16); \
 158                 for (; x+63 < width; x += 64) \
 159                     COPY64_S(&dst[x], &src[x], "movntdqa", "movdqu", shiftstr64); \
 160             } \
 161         } else { \
 162             if (!unaligned) { \
 163                 for (; x+63 < width; x += 64) \
 164                     COPY64_S(&dst[x], &src[x], "movdqa", "movdqa", shiftstr64); \
 165             } else { \
 166                 COPY16_S(dst, src, "movdqu", "movdqa", shiftstr16); \
 167                 for (; x+63 < width; x += 64) \
 168                     COPY64_S(&dst[x], &src[x], "movdqa", "movdqu", shiftstr64); \
 169             } \
 170         } \
 171         /* The following should not happen since buffers are generally well aligned */ \
 172         if (x < width) \
 173             CopyPlane(&dst[x], dst_pitch - x, &src[x], src_pitch - x, 1, bitshift); \
 174         src += src_pitch; \
 175         dst += dst_pitch; \
 176     }
 177
 178     switch (bitshift)
 179     {
 180         case 0:
 181             SSE_USWC_COPY("", "")
 182             break;
 183         case -6:
 184             SSE_USWC_COPY(COPY16_SHIFTL("$6"), COPY64_SHIFTL("$6"))
 185             break;
 186         case 6:
 187             SSE_USWC_COPY(COPY16_SHIFTR("$6"), COPY64_SHIFTR("$6"))
 188             break;
 189         case 2:
 190             SSE_USWC_COPY(COPY16_SHIFTR("$2"), COPY64_SHIFTR("$2"))
 191             break;
 192         case -2:
 193             SSE_USWC_COPY(COPY16_SHIFTL("$2"), COPY64_SHIFTL("$2"))
 194             break;
 195         case 4:
 196             SSE_USWC_COPY(COPY16_SHIFTR("$4"), COPY64_SHIFTR("$4"))
 197             break;
 198         case -4:
 199             SSE_USWC_COPY(COPY16_SHIFTL("$2"), COPY64_SHIFTL("$2"))
 200             break;
 201         default:
 202             vlc_assert_unreachable();
 203     }
 204 #undef SSE_USWC_COPY
 205
 206     asm volatile ("mfence");
 207 }
 208
 209 VLC_SSE
 210 static void Copy2d(uint8_t *dst, size_t dst_pitch,
 211                    const uint8_t *src, size_t src_pitch,
 212                    unsigned width, unsigned height)
 213 {
 214     assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);
 215
 216     for (unsigned y = 0; y < height; y++) {
 217         unsigned x = 0;
 218
 219         bool unaligned = ((intptr_t)dst & 0x0f) != 0;
 220         if (!unaligned) {
 221             for (; x+63 < width; x += 64)
 222                 COPY64(&dst[x], &src[x], "movdqa", "movntdq");
 223         } else {
 224             for (; x+63 < width; x += 64)
 225                 COPY64(&dst[x], &src[x], "movdqa", "movdqu");
 226         }
 227
 228         for (; x < width; x++)
 229             dst[x] = src[x];
 230
 231         src += src_pitch;
 232         dst += dst_pitch;
 233     }
 234 }
 235
 236 VLC_SSE
 237 static void
 238 SSE_InterleaveUV(uint8_t *dst, size_t dst_pitch,
 239                  uint8_t *srcu, size_t srcu_pitch,
 240                  uint8_t *srcv, size_t srcv_pitch,
 241                  unsigned int width, unsigned int height, uint8_t pixel_size)
 242 {
 243     assert(!((intptr_t)srcu & 0xf) && !(srcu_pitch & 0x0f) &&
 244            !((intptr_t)srcv & 0xf) && !(srcv_pitch & 0x0f));
 245
 246     static const uint8_t shuffle_8[] = { 0, 8,
 247                                          1, 9,
 248                                          2, 10,
 249                                          3, 11,
 250                                          4, 12,
 251                                          5, 13,
 252                                          6, 14,
 253                                          7, 15 };
 254     static const uint8_t shuffle_16[] = { 0, 1, 8, 9,
 255                                           2, 3, 10, 11,
 256                                           4, 5, 12, 13,
 257                                           6, 7, 14, 15 };
 258     const uint8_t *shuffle = pixel_size == 1 ? shuffle_8 : shuffle_16;
 259
 260     for (unsigned int y = 0; y < height; ++y)
 261     {
 262         unsigned int    x;
 263
 264 #define LOAD2X32                        \
 265     "movhpd 0x00(%[src2]), %%xmm0\n"    \
 266     "movlpd 0x00(%[src1]), %%xmm0\n"    \
 267                                         \
 268     "movhpd 0x08(%[src2]), %%xmm1\n"    \
 269     "movlpd 0x08(%[src1]), %%xmm1\n"    \
 270                                         \
 271     "movhpd 0x10(%[src2]), %%xmm2\n"    \
 272     "movlpd 0x10(%[src1]), %%xmm2\n"    \
 273                                         \
 274     "movhpd 0x18(%[src2]), %%xmm3\n"    \
 275     "movlpd 0x18(%[src1]), %%xmm3\n"
 276
 277 #define STORE64                         \
 278     "movdqu %%xmm0, 0x00(%[dst])\n"     \
 279     "movdqu %%xmm1, 0x10(%[dst])\n"     \
 280     "movdqu %%xmm2, 0x20(%[dst])\n"     \
 281     "movdqu %%xmm3, 0x30(%[dst])\n"
 282
 283 #ifdef CAN_COMPILE_SSSE3
 284         if (vlc_CPU_SSSE3())
 285             for (x = 0; x < (width & ~31); x += 32)
 286                 asm volatile
 287                     (
 288                         "movdqu (%[shuffle]), %%xmm7\n"
 289                         LOAD2X32
 290                         "pshufb %%xmm7, %%xmm0\n"
 291                         "pshufb %%xmm7, %%xmm1\n"
 292                         "pshufb %%xmm7, %%xmm2\n"
 293                         "pshufb %%xmm7, %%xmm3\n"
 294                         STORE64
 295                         : : [dst]"r"(dst+2*x),
 296                             [src1]"r"(srcu+x), [src2]"r"(srcv+x),
 297                             [shuffle]"r"(shuffle)
 298                         : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7"
 299                     );
 300         else
 301 #endif
 302
 303         {
 304             assert(pixel_size == 1);
 305             for (x = 0; x < (width & ~31); x += 32)
 306                 asm volatile
 307                     (
 308                         LOAD2X32
 309                         "movhlps   %%xmm0, %%xmm4\n"
 310                         "punpcklbw %%xmm4, %%xmm0\n"
 311
 312                         "movhlps   %%xmm1, %%xmm4\n"
 313                         "punpcklbw %%xmm4, %%xmm1\n"
 314
 315                         "movhlps   %%xmm2, %%xmm4\n"
 316                         "punpcklbw %%xmm4, %%xmm2\n"
 317
 318                         "movhlps   %%xmm3, %%xmm4\n"
 319                         "punpcklbw %%xmm4, %%xmm3\n"
 320                         STORE64
 321                         : : [dst]"r"(dst+2*x),
 322                             [src1]"r"(srcu+x), [src2]"r"(srcv+x)
 323                         : "memory",
 324                           "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm7"
 325                     );
 326         }
 327 #undef LOAD2X32
 328 #undef STORE64
 329
 330         if (pixel_size == 1)
 331         {
 332             for (; x < width; x++) {
 333                 dst[2*x+0] = srcu[x];
 334                 dst[2*x+1] = srcv[x];
 335             }
 336         }
 337         else
 338         {
 339             for (; x < width; x+= 2) {
 340                 dst[2*x+0] = srcu[x];
 341                 dst[2*x+1] = srcu[x + 1];
 342                 dst[2*x+2] = srcv[x];
 343                 dst[2*x+3] = srcv[x + 1];
 344             }
 345         }
 346         srcu += srcu_pitch;
 347         srcv += srcv_pitch;
 348         dst += dst_pitch;
 349     }
 350 }
 351
 352 VLC_SSE
 353 static void SSE_SplitUV(uint8_t *dstu, size_t dstu_pitch,
 354                         uint8_t *dstv, size_t dstv_pitch,
 355                         const uint8_t *src, size_t src_pitch,
 356                         unsigned width, unsigned height, uint8_t pixel_size)
 357 {
 358     assert(pixel_size == 1 || pixel_size == 2);
 359     assert(((intptr_t)src & 0xf) == 0 && (src_pitch & 0x0f) == 0);
 360
 361 #define LOAD64 \
 362     "movdqa  0(%[src]), %%xmm0\n" \
 363     "movdqa 16(%[src]), %%xmm1\n" \
 364     "movdqa 32(%[src]), %%xmm2\n" \
 365     "movdqa 48(%[src]), %%xmm3\n"
 366
 367 #define STORE2X32 \
 368     "movq   %%xmm0,   0(%[dst1])\n" \
 369     "movq   %%xmm1,   8(%[dst1])\n" \
 370     "movhpd %%xmm0,   0(%[dst2])\n" \
 371     "movhpd %%xmm1,   8(%[dst2])\n" \
 372     "movq   %%xmm2,  16(%[dst1])\n" \
 373     "movq   %%xmm3,  24(%[dst1])\n" \
 374     "movhpd %%xmm2,  16(%[dst2])\n" \
 375     "movhpd %%xmm3,  24(%[dst2])\n"
 376
 377 #ifdef CAN_COMPILE_SSSE3
 378     if (vlc_CPU_SSSE3())
 379     {
 380         static const uint8_t shuffle_8[] = { 0, 2, 4, 6, 8, 10, 12, 14,
 381                                              1, 3, 5, 7, 9, 11, 13, 15 };
 382         static const uint8_t shuffle_16[] = {  0,  1,  4,  5,  8,  9, 12, 13,
 383                                                2,  3,  6,  7, 10, 11, 14, 15 };
 384         const uint8_t *shuffle = pixel_size == 1 ? shuffle_8 : shuffle_16;
 385         for (unsigned y = 0; y < height; y++) {
 386             unsigned x = 0;
 387             for (; x < (width & ~31); x += 32) {
 388                 asm volatile (
 389                     "movdqu (%[shuffle]), %%xmm7\n"
 390                     LOAD64
 391                     "pshufb  %%xmm7, %%xmm0\n"
 392                     "pshufb  %%xmm7, %%xmm1\n"
 393                     "pshufb  %%xmm7, %%xmm2\n"
 394                     "pshufb  %%xmm7, %%xmm3\n"
 395                     STORE2X32
 396                     : : [dst1]"r"(&dstu[x]), [dst2]"r"(&dstv[x]), [src]"r"(&src[2*x]), [shuffle]"r"(shuffle) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7");
 397             }
 398             if (pixel_size == 1)
 399             {
 400                 for (; x < width; x++) {
 401                     dstu[x] = src[2*x+0];
 402                     dstv[x] = src[2*x+1];
 403                 }
 404             }
 405             else
 406             {
 407                 for (; x < width; x+= 2) {
 408                     dstu[x] = src[2*x+0];
 409                     dstu[x+1] = src[2*x+1];
 410                     dstv[x] = src[2*x+2];
 411                     dstv[x+1] = src[2*x+3];
 412                 }
 413             }
 414             src  += src_pitch;
 415             dstu += dstu_pitch;
 416             dstv += dstv_pitch;
 417         }
 418     } else
 419 #endif
 420     {
 421         assert(pixel_size == 1);
 422         static const uint8_t mask[] = { 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00,
 423                                         0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00 };
 424
 425         for (unsigned y = 0; y < height; y++)
 426         {
 427             unsigned x = 0;
 428             for (; x < (width & ~31); x += 32) {
 429                 asm volatile (
 430                     "movdqu (%[mask]), %%xmm7\n"
 431                     LOAD64
 432                     "movdqa   %%xmm0, %%xmm4\n"
 433                     "movdqa   %%xmm1, %%xmm5\n"
 434                     "movdqa   %%xmm2, %%xmm6\n"
 435                     "psrlw    $8,     %%xmm0\n"
 436                     "psrlw    $8,     %%xmm1\n"
 437                     "pand     %%xmm7, %%xmm4\n"
 438                     "pand     %%xmm7, %%xmm5\n"
 439                     "pand     %%xmm7, %%xmm6\n"
 440                     "packuswb %%xmm4, %%xmm0\n"
 441                     "packuswb %%xmm5, %%xmm1\n"
 442                     "pand     %%xmm3, %%xmm7\n"
 443                     "psrlw    $8,     %%xmm2\n"
 444                     "psrlw    $8,     %%xmm3\n"
 445                     "packuswb %%xmm6, %%xmm2\n"
 446                     "packuswb %%xmm7, %%xmm3\n"
 447                     STORE2X32
 448                     : : [dst2]"r"(&dstu[x]), [dst1]"r"(&dstv[x]), [src]"r"(&src[2*x]), [mask]"r"(mask) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
 449             }
 450             for (; x < width; x++) {
 451                 dstu[x] = src[2*x+0];
 452                 dstv[x] = src[2*x+1];
 453             }
 454             src  += src_pitch;
 455             dstu += dstu_pitch;
 456             dstv += dstv_pitch;
 457         }
 458     }
 459 #undef STORE2X32
 460 #undef LOAD64
 461 }
 462
 463 static void SSE_CopyPlane(uint8_t *dst, size_t dst_pitch,
 464                           const uint8_t *src, size_t src_pitch,
 465                           uint8_t *cache, size_t cache_size,
 466                           unsigned height, int bitshift)
 467 {
 468     const size_t copy_pitch = __MIN(src_pitch, dst_pitch);
 469     assert(copy_pitch > 0);
 470     const unsigned w16 = (copy_pitch+15) & ~15;
 471     const unsigned hstep = cache_size / w16;
 472     const unsigned cache_width = __MIN(src_pitch, cache_size);
 473     assert(hstep > 0);
 474
 475     /* If SSE4.1: CopyFromUswc is faster than memcpy */
 476     if (!vlc_CPU_SSE4_1() && bitshift == 0 && src_pitch == dst_pitch)
 477         memcpy(dst, src, copy_pitch * height);
 478     else
 479     for (unsigned y = 0; y < height; y += hstep) {
 480         const unsigned hblock =  __MIN(hstep, height - y);
 481
 482         /* Copy a bunch of line into our cache */
 483         CopyFromUswc(cache, w16, src, src_pitch, cache_width, hblock, bitshift);
 484
 485         /* Copy from our cache to the destination */
 486         Copy2d(dst, dst_pitch, cache, w16, copy_pitch, hblock);
 487
 488         /* */
 489         src += src_pitch * hblock;
 490         dst += dst_pitch * hblock;
 491     }
 492 }
 493
 494 static void
 495 SSE_InterleavePlanes(uint8_t *dst, size_t dst_pitch,
 496                      const uint8_t *srcu, size_t srcu_pitch,
 497                      const uint8_t *srcv, size_t srcv_pitch,
 498                      uint8_t *cache, size_t cache_size,
 499                      unsigned int height, uint8_t pixel_size, int bitshift)
 500 {
 501     assert(srcu_pitch == srcv_pitch);
 502     size_t copy_pitch = __MIN(dst_pitch / 2, srcu_pitch);
 503     unsigned int const  w16 = (srcu_pitch+15) & ~15;
 504     unsigned int const  hstep = (cache_size) / (2*w16);
 505     const unsigned cacheu_width = __MIN(srcu_pitch, cache_size);
 506     const unsigned cachev_width = __MIN(srcv_pitch, cache_size);
 507     assert(hstep > 0);
 508
 509     for (unsigned int y = 0; y < height; y += hstep)
 510     {
 511         unsigned int const      hblock = __MIN(hstep, height - y);
 512
 513         /* Copy a bunch of line into our cache */
 514         CopyFromUswc(cache, w16, srcu, srcu_pitch, cacheu_width, hblock, bitshift);
 515         CopyFromUswc(cache+w16*hblock, w16, srcv, srcv_pitch,
 516                      cachev_width, hblock, bitshift);
 517
 518         /* Copy from our cache to the destination */
 519         SSE_InterleaveUV(dst, dst_pitch, cache, w16,
 520                          cache + w16 * hblock, w16,
 521                          copy_pitch, hblock, pixel_size);
 522
 523         /* */
 524         srcu += hblock * srcu_pitch;
 525         srcv += hblock * srcv_pitch;
 526         dst += hblock * dst_pitch;
 527     }
 528 }
 529
 530 static void SSE_SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
 531                             uint8_t *dstv, size_t dstv_pitch,
 532                             const uint8_t *src, size_t src_pitch,
 533                             uint8_t *cache, size_t cache_size,
 534                             unsigned height, uint8_t pixel_size, int bitshift)
 535 {
 536     size_t copy_pitch = __MIN(__MIN(src_pitch / 2, dstu_pitch), dstv_pitch);
 537     const unsigned w16 = (src_pitch+15) & ~15;
 538     const unsigned hstep = cache_size / w16;
 539     const unsigned cache_width = __MIN(src_pitch, cache_size);
 540     assert(hstep > 0);
 541
 542     for (unsigned y = 0; y < height; y += hstep) {
 543         const unsigned hblock =  __MIN(hstep, height - y);
 544
 545         /* Copy a bunch of line into our cache */
 546         CopyFromUswc(cache, w16, src, src_pitch, cache_width, hblock, bitshift);
 547
 548         /* Copy from our cache to the destination */
 549         SSE_SplitUV(dstu, dstu_pitch, dstv, dstv_pitch,
 550                     cache, w16, copy_pitch, hblock, pixel_size);
 551
 552         /* */
 553         src  += src_pitch  * hblock;
 554         dstu += dstu_pitch * hblock;
 555         dstv += dstv_pitch * hblock;
 556     }
 557 }
 558
 559 static void SSE_Copy420_P_to_P(picture_t *dst, const uint8_t *src[static 3],
 560                                const size_t src_pitch[static 3], unsigned height,
 561                                const copy_cache_t *cache)
 562 {
 563     for (unsigned n = 0; n < 3; n++) {
 564         const unsigned d = n > 0 ? 2 : 1;
 565         SSE_CopyPlane(dst->p[n].p_pixels, dst->p[n].i_pitch,
 566                       src[n], src_pitch[n],
 567                       cache->buffer, cache->size,
 568                       (height+d-1)/d, 0);
 569     }
 570     asm volatile ("emms");
 571 }
 572
 573
 574 static void SSE_Copy420_SP_to_SP(picture_t *dst, const uint8_t *src[static 2],
 575                                  const size_t src_pitch[static 2], unsigned height,
 576                                  const copy_cache_t *cache)
 577 {
 578     SSE_CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch, src[0], src_pitch[0],
 579                   cache->buffer, cache->size, height, 0);
 580     SSE_CopyPlane(dst->p[1].p_pixels, dst->p[1].i_pitch, src[1], src_pitch[1],
 581                   cache->buffer, cache->size, (height+1) / 2, 0);
 582     asm volatile ("emms");
 583 }
 584
 585 static void
 586 SSE_Copy420_SP_to_P(picture_t *dest, const uint8_t *src[static 2],
 587                     const size_t src_pitch[static 2], unsigned int height,
 588                     uint8_t pixel_size, int bitshift, const copy_cache_t *cache)
 589 {
 590     SSE_CopyPlane(dest->p[0].p_pixels, dest->p[0].i_pitch,
 591                   src[0], src_pitch[0], cache->buffer, cache->size, height, bitshift);
 592
 593     SSE_SplitPlanes(dest->p[1].p_pixels, dest->p[1].i_pitch,
 594                     dest->p[2].p_pixels, dest->p[2].i_pitch,
 595                     src[1], src_pitch[1], cache->buffer, cache->size,
 596                     (height+1) / 2, pixel_size, bitshift);
 597     asm volatile ("emms");
 598 }
 599
 600 static void SSE_Copy420_P_to_SP(picture_t *dst, const uint8_t *src[static 3],
 601                                 const size_t src_pitch[static 3],
 602                                 unsigned height, uint8_t pixel_size,
 603                                 int bitshift, const copy_cache_t *cache)
 604 {
 605     SSE_CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch, src[0], src_pitch[0],
 606                   cache->buffer, cache->size, height, bitshift);
 607     SSE_InterleavePlanes(dst->p[1].p_pixels, dst->p[1].i_pitch,
 608                          src[U_PLANE], src_pitch[U_PLANE],
 609                          src[V_PLANE], src_pitch[V_PLANE],
 610                          cache->buffer, cache->size, (height+1) / 2, pixel_size, bitshift);
 611     asm volatile ("emms");
 612 }
 613 #undef COPY64
 614 #endif /* CAN_COMPILE_SSE2 */
 615
 616 static void CopyPlane(uint8_t *dst, size_t dst_pitch,
 617                       const uint8_t *src, size_t src_pitch,
 618                       unsigned height, int bitshift)
 619 {
 620     const size_t copy_pitch = __MIN(src_pitch, dst_pitch);
 621     if (bitshift != 0)
 622     {
 623         for (unsigned y = 0; y < height; y++)
 624         {
 625             uint16_t *dst16 = (uint16_t *) dst;
 626             const uint16_t *src16 = (const uint16_t *) src;
 627
 628             if (bitshift > 0)
 629                 for (unsigned x = 0; x < (copy_pitch / 2); x++)
 630                     *dst16++ = (*src16++) >> (bitshift & 0xf);
 631             else
 632                 for (unsigned x = 0; x < (copy_pitch / 2); x++)
 633                     *dst16++ = (*src16++) << ((-bitshift) & 0xf);
 634             src += src_pitch;
 635             dst += dst_pitch;
 636         }
 637     }
 638     else if (src_pitch == dst_pitch)
 639         memcpy(dst, src, copy_pitch * height);
 640     else
 641     for (unsigned y = 0; y < height; y++) {
 642         memcpy(dst, src, copy_pitch);
 643         src += src_pitch;
 644         dst += dst_pitch;
 645     }
 646 }
 647
 648 void CopyPacked(picture_t *dst, const uint8_t *src, const size_t src_pitch,
 649                 unsigned height, const copy_cache_t *cache)
 650 {
 651     assert(dst);
 652     assert(src); assert(src_pitch);
 653     assert(height);
 654
 655 #ifdef CAN_COMPILE_SSE2
 656     if (vlc_CPU_SSE4_1())
 657         return SSE_CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch, src, src_pitch,
 658                              cache->buffer, cache->size, height, 0);
 659 #else
 660     (void) cache;
 661 #endif
 662         CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch, src, src_pitch,
 663                   height, 0);
 664 }
 665
 666 void Copy420_SP_to_SP(picture_t *dst, const uint8_t *src[static 2],
 667                       const size_t src_pitch[static 2], unsigned height,
 668                       const copy_cache_t *cache)
 669 {
 670     ASSERT_2PLANES;
 671 #ifdef CAN_COMPILE_SSE2
 672     if (vlc_CPU_SSE2())
 673         return SSE_Copy420_SP_to_SP(dst, src, src_pitch, height, cache);
 674 #else
 675     (void) cache;
 676 #endif
 677
 678     CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
 679               src[0], src_pitch[0], height, 0);
 680     CopyPlane(dst->p[1].p_pixels, dst->p[1].i_pitch,
 681               src[1], src_pitch[1], (height+1)/2, 0);
 682 }
 683
 684 #define SPLIT_PLANES(type, pitch_den) do { \
 685     size_t copy_pitch = __MIN(__MIN(src_pitch / pitch_den, dstu_pitch), dstv_pitch); \
 686     for (unsigned y = 0; y < height; y++) { \
 687         for (unsigned x = 0; x < copy_pitch; x++) { \
 688             ((type *) dstu)[x] = ((const type *) src)[2*x+0]; \
 689             ((type *) dstv)[x] = ((const type *) src)[2*x+1]; \
 690         } \
 691         src  += src_pitch; \
 692         dstu += dstu_pitch; \
 693         dstv += dstv_pitch; \
 694     } \
 695 } while(0)
 696
 697 #define SPLIT_PLANES_SHIFTR(type, pitch_den, bitshift) do { \
 698     size_t copy_pitch = __MIN(__MIN(src_pitch / pitch_den, dstu_pitch), dstv_pitch); \
 699     for (unsigned y = 0; y < height; y++) { \
 700         for (unsigned x = 0; x < copy_pitch; x++) { \
 701             ((type *) dstu)[x] = (((const type *) src)[2*x+0]) >> (bitshift); \
 702             ((type *) dstv)[x] = (((const type *) src)[2*x+1]) >> (bitshift); \
 703         } \
 704         src  += src_pitch; \
 705         dstu += dstu_pitch; \
 706         dstv += dstv_pitch; \
 707     } \
 708 } while(0)
 709
 710 #define SPLIT_PLANES_SHIFTL(type, pitch_den, bitshift) do { \
 711     size_t copy_pitch = __MIN(__MIN(src_pitch / pitch_den, dstu_pitch), dstv_pitch); \
 712     for (unsigned y = 0; y < height; y++) { \
 713         for (unsigned x = 0; x < copy_pitch; x++) { \
 714             ((type *) dstu)[x] = (((const type *) src)[2*x+0]) << (bitshift); \
 715             ((type *) dstv)[x] = (((const type *) src)[2*x+1]) << (bitshift); \
 716         } \
 717         src  += src_pitch; \
 718         dstu += dstu_pitch; \
 719         dstv += dstv_pitch; \
 720     } \
 721 } while(0)
 722
 723 static void SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
 724                         uint8_t *dstv, size_t dstv_pitch,
 725                         const uint8_t *src, size_t src_pitch, unsigned height)
 726 {
 727     SPLIT_PLANES(uint8_t, 2);
 728 }
 729
 730 static void SplitPlanes16(uint8_t *dstu, size_t dstu_pitch,
 731                           uint8_t *dstv, size_t dstv_pitch,
 732                           const uint8_t *src, size_t src_pitch, unsigned height,
 733                           int bitshift)
 734 {
 735     if (bitshift == 0)
 736         SPLIT_PLANES(uint16_t, 4);
 737     else if (bitshift > 0)
 738         SPLIT_PLANES_SHIFTR(uint16_t, 4, bitshift & 0xf);
 739     else
 740         SPLIT_PLANES_SHIFTL(uint16_t, 4, (-bitshift) & 0xf);
 741 }
 742
 743 void Copy420_SP_to_P(picture_t *dst, const uint8_t *src[static 2],
 744                      const size_t src_pitch[static 2], unsigned height,
 745                      const copy_cache_t *cache)
 746 {
 747     ASSERT_2PLANES;
 748 #ifdef CAN_COMPILE_SSE2
 749     if (vlc_CPU_SSE2())
 750         return SSE_Copy420_SP_to_P(dst, src, src_pitch, height, 1, 0, cache);
 751 #else
 752     VLC_UNUSED(cache);
 753 #endif
 754
 755     CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
 756               src[0], src_pitch[0], height, 0);
 757     SplitPlanes(dst->p[1].p_pixels, dst->p[1].i_pitch,
 758                 dst->p[2].p_pixels, dst->p[2].i_pitch,
 759                 src[1], src_pitch[1], (height+1)/2);
 760 }
 761
 762 void Copy420_16_SP_to_P(picture_t *dst, const uint8_t *src[static 2],
 763                         const size_t src_pitch[static 2], unsigned height,
 764                         int bitshift, const copy_cache_t *cache)
 765 {
 766     ASSERT_2PLANES;
 767     assert(bitshift >= -6 && bitshift <= 6 && (bitshift % 2 == 0));
 768
 769 #ifdef CAN_COMPILE_SSE3
 770     if (vlc_CPU_SSSE3())
 771         return SSE_Copy420_SP_to_P(dst, src, src_pitch, height, 2, bitshift, cache);
 772 #else
 773     VLC_UNUSED(cache);
 774 #endif
 775
 776     CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
 777               src[0], src_pitch[0], height, bitshift);
 778     SplitPlanes16(dst->p[1].p_pixels, dst->p[1].i_pitch,
 779                   dst->p[2].p_pixels, dst->p[2].i_pitch,
 780                   src[1], src_pitch[1], (height+1)/2, bitshift);
 781 }
 782
 783 #define INTERLEAVE_UV() do { \
 784     for ( unsigned int line = 0; line < copy_lines; line++ ) { \
 785         for ( unsigned int col = 0; col < copy_pitch; col++ ) { \
 786             *dstUV++ = *srcU++; \
 787             *dstUV++ = *srcV++; \
 788         } \
 789         dstUV += i_extra_pitch_uv; \
 790         srcU  += i_extra_pitch_u; \
 791         srcV  += i_extra_pitch_v; \
 792     } \
 793 }while(0)
 794
 795 #define INTERLEAVE_UV_SHIFTR(bitshitf) do { \
 796     for ( unsigned int line = 0; line < copy_lines; line++ ) { \
 797         for ( unsigned int col = 0; col < copy_pitch; col++ ) { \
 798             *dstUV++ = (*srcU++) >> (bitshitf); \
 799             *dstUV++ = (*srcV++) >> (bitshitf); \
 800         } \
 801         dstUV += i_extra_pitch_uv; \
 802         srcU  += i_extra_pitch_u; \
 803         srcV  += i_extra_pitch_v; \
 804     } \
 805 }while(0)
 806
 807 #define INTERLEAVE_UV_SHIFTL(bitshitf) do { \
 808     for ( unsigned int line = 0; line < copy_lines; line++ ) { \
 809         for ( unsigned int col = 0; col < copy_pitch; col++ ) { \
 810             *dstUV++ = (*srcU++) << (bitshitf); \
 811             *dstUV++ = (*srcV++) << (bitshitf); \
 812         } \
 813         dstUV += i_extra_pitch_uv; \
 814         srcU  += i_extra_pitch_u; \
 815         srcV  += i_extra_pitch_v; \
 816     } \
 817 }while(0)
 818
 819 void Copy420_P_to_SP(picture_t *dst, const uint8_t *src[static 3],
 820                      const size_t src_pitch[static 3], unsigned height,
 821                      const copy_cache_t *cache)
 822 {
 823     ASSERT_3PLANES;
 824 #ifdef CAN_COMPILE_SSE2
 825     if (vlc_CPU_SSE2())
 826         return SSE_Copy420_P_to_SP(dst, src, src_pitch, height, 1, 0, cache);
 827 #else
 828     (void) cache;
 829 #endif
 830
 831     CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
 832               src[0], src_pitch[0], height, 0);
 833
 834     const unsigned copy_lines = (height+1) / 2;
 835     unsigned copy_pitch = src_pitch[1];
 836     if (copy_pitch > (size_t)dst->p[1].i_pitch / 2)
 837         copy_pitch = dst->p[1].i_pitch / 2;
 838
 839     const int i_extra_pitch_uv = dst->p[1].i_pitch - 2 * copy_pitch;
 840     const int i_extra_pitch_u  = src_pitch[U_PLANE] - copy_pitch;
 841     const int i_extra_pitch_v  = src_pitch[V_PLANE] - copy_pitch;
 842
 843     uint8_t *dstUV = dst->p[1].p_pixels;
 844     const uint8_t *srcU  = src[U_PLANE];
 845     const uint8_t *srcV  = src[V_PLANE];
 846     INTERLEAVE_UV();
 847 }
 848
 849 void Copy420_16_P_to_SP(picture_t *dst, const uint8_t *src[static 3],
 850                         const size_t src_pitch[static 3], unsigned height,
 851                         int bitshift, const copy_cache_t *cache)
 852 {
 853     ASSERT_3PLANES;
 854     assert(bitshift >= -6 && bitshift <= 6 && (bitshift % 2 == 0));
 855 #ifdef CAN_COMPILE_SSE2
 856     if (vlc_CPU_SSSE3())
 857         return SSE_Copy420_P_to_SP(dst, src, src_pitch, height, 2, bitshift, cache);
 858 #else
 859     (void) cache;
 860 #endif
 861
 862     CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
 863               src[0], src_pitch[0], height, bitshift);
 864
 865     const unsigned copy_lines = (height+1) / 2;
 866     const unsigned copy_pitch = src_pitch[1] / 2;
 867
 868     const int i_extra_pitch_uv = dst->p[1].i_pitch / 2 - 2 * copy_pitch;
 869     const int i_extra_pitch_u  = src_pitch[U_PLANE] / 2 - copy_pitch;
 870     const int i_extra_pitch_v  = src_pitch[V_PLANE] / 2 - copy_pitch;
 871
 872     uint16_t *dstUV = (void*) dst->p[1].p_pixels;
 873     const uint16_t *srcU  = (const uint16_t *) src[U_PLANE];
 874     const uint16_t *srcV  = (const uint16_t *) src[V_PLANE];
 875
 876     if (bitshift == 0)
 877         INTERLEAVE_UV();
 878     else if (bitshift > 0)
 879         INTERLEAVE_UV_SHIFTR(bitshift & 0xf);
 880     else
 881         INTERLEAVE_UV_SHIFTL((-bitshift) & 0xf);
 882 }
 883
 884 void Copy420_P_to_P(picture_t *dst, const uint8_t *src[static 3],
 885                     const size_t src_pitch[static 3], unsigned height,
 886                     const copy_cache_t *cache)
 887 {
 888     ASSERT_3PLANES;
 889 #ifdef CAN_COMPILE_SSE2
 890     if (vlc_CPU_SSE2())
 891         return SSE_Copy420_P_to_P(dst, src, src_pitch, height, cache);
 892 #else
 893     (void) cache;
 894 #endif
 895
 896      CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
 897                src[0], src_pitch[0], height, 0);
 898      CopyPlane(dst->p[1].p_pixels, dst->p[1].i_pitch,
 899                src[1], src_pitch[1], (height+1) / 2, 0);
 900      CopyPlane(dst->p[2].p_pixels, dst->p[2].i_pitch,
 901                src[2], src_pitch[2], (height+1) / 2, 0);
 902 }
 903
 904 int picture_UpdatePlanes(picture_t *picture, uint8_t *data, unsigned pitch)
 905 {
 906     /* fill in buffer info in first plane */
 907     picture->p->p_pixels = data;
 908     picture->p->i_pitch  = pitch;
 909     picture->p->i_lines  = picture->format.i_height;
 910     assert(picture->p->i_visible_pitch <= picture->p->i_pitch);
 911     assert(picture->p->i_visible_lines <= picture->p->i_lines);
 912
 913     /*  Fill chroma planes for biplanar YUV */
 914     if (picture->format.i_chroma == VLC_CODEC_NV12 ||
 915         picture->format.i_chroma == VLC_CODEC_NV21 ||
 916         picture->format.i_chroma == VLC_CODEC_P010) {
 917
 918         for (int n = 1; n < picture->i_planes; n++) {
 919             const plane_t *o = &picture->p[n-1];
 920             plane_t *p = &picture->p[n];
 921
 922             p->p_pixels = o->p_pixels + o->i_lines * o->i_pitch;
 923             p->i_pitch  = pitch;
 924             p->i_lines  = picture->format.i_height / 2;
 925             assert(p->i_visible_pitch <= p->i_pitch);
 926             assert(p->i_visible_lines <= p->i_lines);
 927         }
 928         /* The dx/d3d buffer is always allocated as NV12 */
 929         if (vlc_fourcc_AreUVPlanesSwapped(picture->format.i_chroma, VLC_CODEC_NV12)) {
 930             /* TODO : Swap NV21 UV planes to match NV12 */
 931             return VLC_EGENERIC;
 932         }
 933     }
 934
 935     /*  Fill chroma planes for planar YUV */
 936     else
 937     if (picture->format.i_chroma == VLC_CODEC_I420 ||
 938         picture->format.i_chroma == VLC_CODEC_J420 ||
 939         picture->format.i_chroma == VLC_CODEC_YV12) {
 940
 941         for (int n = 1; n < picture->i_planes; n++) {
 942             const plane_t *o = &picture->p[n-1];
 943             plane_t *p = &picture->p[n];
 944
 945             p->p_pixels = o->p_pixels + o->i_lines * o->i_pitch;
 946             p->i_pitch  = pitch / 2;
 947             p->i_lines  = picture->format.i_height / 2;
 948         }
 949         /* The dx/d3d buffer is always allocated as YV12 */
 950         if (vlc_fourcc_AreUVPlanesSwapped(picture->format.i_chroma, VLC_CODEC_YV12))
 951             picture_SwapUV( picture );
 952     }
 953     return VLC_SUCCESS;
 954 }
 955
 956 #ifdef COPY_TEST
 957
 958 #include <vlc_picture.h>
 959
 960 struct test_dst
 961 {
 962     vlc_fourcc_t chroma;
 963     int bitshift;
 964     union
 965     {
 966         void (*conv)(picture_t *, const uint8_t *[], const size_t [], unsigned,
 967                      const copy_cache_t *);
 968         void (*conv16)(picture_t *, const uint8_t *[], const size_t [], unsigned, int,
 969                      const copy_cache_t *);
 970     };
 971 };
 972
 973 struct test_conv
 974 {
 975     vlc_fourcc_t src_chroma;
 976     struct test_dst dsts[3];
 977 };
 978
 979 static const struct test_conv convs[] = {
 980     { .src_chroma = VLC_CODEC_NV12,
 981       .dsts = { { VLC_CODEC_I420, 0, .conv = Copy420_SP_to_P },
 982                 { VLC_CODEC_NV12, 0, .conv = Copy420_SP_to_SP } },
 983     },
 984     { .src_chroma = VLC_CODEC_I420,
 985       .dsts = { { VLC_CODEC_I420, 0, .conv = Copy420_P_to_P },
 986                 { VLC_CODEC_NV12, 0, .conv = Copy420_P_to_SP } },
 987     },
 988     { .src_chroma = VLC_CODEC_P010,
 989       .dsts = { { VLC_CODEC_I420_10L, 6, .conv16 = Copy420_16_SP_to_P } },
 990     },
 991     { .src_chroma = VLC_CODEC_I420_10L,
 992       .dsts = { { VLC_CODEC_P010, -6, .conv16 = Copy420_16_P_to_SP } },
 993     },
 994 };
 995 #define NB_CONVS ARRAY_SIZE(convs)
 996
 997 struct test_size
 998 {
 999     int i_width;
1000     int i_height;
1001     int i_visible_width;
1002     int i_visible_height;
1003 };
1004 static const struct test_size sizes[] = {
1005     { 1, 1, 1, 1 },
1006     { 3, 3, 3, 3 },
1007     { 65, 39, 65, 39 },
1008     { 560, 369, 540, 350 },
1009     { 1274, 721, 1200, 720 },
1010     { 1920, 1088, 1920, 1080 },
1011     { 3840, 2160, 3840, 2160 },
1012 #if 0 /* too long */
1013     { 8192, 8192, 8192, 8192 },
1014 #endif
1015 };
1016 #define NB_SIZES ARRAY_SIZE(sizes)
1017
1018 static void piccheck(picture_t *pic, const vlc_chroma_description_t *dsc,
1019                      bool init)
1020 {
1021 #define ASSERT_COLOR(good) do { \
1022     fprintf(stderr, "error: pixel doesn't match @ plane: %d: %d x %d: 0x%X vs 0x%X\n", i, x, y, *(--p), good); \
1023     assert(!"error: pixel doesn't match"); \
1024 } while(0)
1025
1026 #define PICCHECK(type_u, type_uv, colors_P, color_UV, pitch_den) do { \
1027     for (int i = 0; i < pic->i_planes; ++i) \
1028     { \
1029         const struct plane_t *plane = &pic->p[i]; \
1030         for (int y = 0; y < plane->i_visible_lines; ++y) \
1031         { \
1032             if (pic->i_planes == 2 && i == 1) \
1033             { \
1034                 type_uv *p = (type_uv *)&plane->p_pixels[y * plane->i_pitch]; \
1035                 for (int x = 0; x < plane->i_visible_pitch / 2 / pitch_den; ++x) \
1036                     if (init) \
1037                         *(p++) = color_UV; \
1038                     else if (*(p++) != color_UV) \
1039                         ASSERT_COLOR(color_UV); \
1040             } \
1041             else \
1042             { \
1043                 type_u *p = (type_u *) &plane->p_pixels[y * plane->i_pitch]; \
1044                 for (int x = 0; x < plane->i_visible_pitch / pitch_den; ++x) \
1045                     if (init) \
1046                         *(p++) = colors_P[i]; \
1047                     else if (*(p++) != colors_P[i]) \
1048                         ASSERT_COLOR(colors_P[i]); \
1049             } \
1050         } \
1051     } \
1052 } while (0)
1053
1054     assert(pic->i_planes == 2 || pic->i_planes == 3);
1055     assert(dsc->pixel_size == 1 || dsc->pixel_size == 2);
1056
1057     if (dsc->pixel_size == 1)
1058     {
1059         const uint8_t colors_8_P[3] = { 0x42, 0xF1, 0x36 };
1060         const uint16_t color_8_UV = ntoh16(0xF136);
1061         PICCHECK(uint8_t, uint16_t, colors_8_P, color_8_UV, 1);
1062     }
1063     else
1064     {
1065         const unsigned mask = (1 << dsc->pixel_bits) - 1;
1066         uint16_t colors_16_P[3] = { 0x1042 &mask, 0xF114 &mask, 0x3645 &mask};
1067
1068         switch (pic->format.i_chroma)
1069         {
1070             case VLC_CODEC_P010:
1071                 for (size_t i = 0; i < 3; ++i)
1072                     colors_16_P[i] <<= 6;
1073                 break;
1074             case VLC_CODEC_I420_10L:
1075                 break;
1076             default:
1077                 vlc_assert_unreachable();
1078         }
1079
1080         uint32_t color_16_UV = GetDWLE( &colors_16_P[1] );
1081
1082         PICCHECK(uint16_t, uint32_t, colors_16_P, color_16_UV, 2);
1083     }
1084 }
1085
1086 static void pic_rsc_destroy(picture_t *pic)
1087 {
1088     for (unsigned i = 0; i < 3; i++)
1089         free(pic->p[i].p_pixels);
1090 }
1091
1092 static picture_t *pic_new_unaligned(const video_format_t *fmt)
1093 {
1094     /* Allocate a no-aligned picture in order to ease buffer overflow detection
1095      * from the source picture */
1096     const vlc_chroma_description_t *dsc = vlc_fourcc_GetChromaDescription(fmt->i_chroma);
1097     assert(dsc);
1098     picture_resource_t rsc = { .pf_destroy = pic_rsc_destroy };
1099     for (unsigned i = 0; i < dsc->plane_count; i++)
1100     {
1101         rsc.p[i].i_lines = ((fmt->i_visible_height + (dsc->p[i].h.den - 1)) / dsc->p[i].h.den) * dsc->p[i].h.num;
1102         rsc.p[i].i_pitch = ((fmt->i_visible_width + (dsc->p[i].w.den - 1)) / dsc->p[i].w.den) * dsc->p[i].w.num * dsc->pixel_size;
1103         rsc.p[i].p_pixels = malloc(rsc.p[i].i_lines * rsc.p[i].i_pitch);
1104         assert(rsc.p[i].p_pixels);
1105     }
1106     return picture_NewFromResource(fmt, &rsc);
1107 }
1108
1109 int main(void)
1110 {
1111     alarm(10);
1112
1113 #ifndef COPY_TEST_NOOPTIM
1114     if (!vlc_CPU_SSE2())
1115     {
1116         fprintf(stderr, "WARNING: could not test SSE\n");
1117         return 77;
1118     }
1119 #endif
1120
1121     for (size_t i = 0; i < NB_CONVS; ++i)
1122     {
1123         const struct test_conv *conv = &convs[i];
1124
1125         for (size_t j = 0; j < NB_SIZES; ++j)
1126         {
1127             const struct test_size *size = &sizes[j];
1128
1129             const vlc_chroma_description_t *src_dsc =
1130                 vlc_fourcc_GetChromaDescription(conv->src_chroma);
1131             assert(src_dsc);
1132
1133             video_format_t fmt;
1134             video_format_Init(&fmt, 0);
1135             video_format_Setup(&fmt, conv->src_chroma,
1136                                size->i_width, size->i_height,
1137                                size->i_visible_width, size->i_visible_height,
1138                                1, 1);
1139             picture_t *src = pic_new_unaligned(&fmt);
1140             assert(src);
1141             piccheck(src, src_dsc, true);
1142
1143             copy_cache_t cache;
1144             int ret = CopyInitCache(&cache, src->format.i_width
1145                                     * src_dsc->pixel_size);
1146             assert(ret == VLC_SUCCESS);
1147
1148             for (size_t f = 0; conv->dsts[f].chroma != 0; ++f)
1149             {
1150                 const struct test_dst *test_dst= &conv->dsts[f];
1151
1152                 const vlc_chroma_description_t *dst_dsc =
1153                     vlc_fourcc_GetChromaDescription(test_dst->chroma);
1154                 assert(dst_dsc);
1155                 fmt.i_chroma = test_dst->chroma;
1156                 picture_t *dst = picture_NewFromFormat(&fmt);
1157                 assert(dst);
1158
1159                 const uint8_t * src_planes[3] = { src->p[Y_PLANE].p_pixels,
1160                                                   src->p[U_PLANE].p_pixels,
1161                                                   src->p[V_PLANE].p_pixels };
1162                 const size_t    src_pitches[3] = { src->p[Y_PLANE].i_pitch,
1163                                                    src->p[U_PLANE].i_pitch,
1164                                                    src->p[V_PLANE].i_pitch };
1165
1166                 fprintf(stderr, "testing: %u x %u (vis: %u x %u) %4.4s -> %4.4s\n",
1167                         size->i_width, size->i_height,
1168                         size->i_visible_width, size->i_visible_height,
1169                         (const char *) &src->format.i_chroma,
1170                         (const char *) &dst->format.i_chroma);
1171                 if (test_dst->bitshift == 0)
1172                     test_dst->conv(dst, src_planes, src_pitches,
1173                                    src->format.i_visible_height, &cache);
1174                 else
1175                     test_dst->conv16(dst, src_planes, src_pitches,
1176                                    src->format.i_visible_height, test_dst->bitshift,
1177                                    &cache);
1178                 piccheck(dst, dst_dsc, false);
1179                 picture_Release(dst);
1180             }
1181             picture_Release(src);
1182             CopyCleanCache(&cache);
1183         }
1184     }
1185     return 0;
1186 }
1187
1188 #endif