1 /*****************************************************************************
2 * copy.c: Fast YV12/NV12 copy
3 *****************************************************************************
4 * Copyright (C) 2010 Laurent Aimar
7 * Authors: Laurent Aimar <fenrir _AT_ videolan _DOT_ org>
8 * Victorien Le Couviour--Tuffet <victorien.lecouviour.tuffet@gmail.com>
10 * This program is free software; you can redistribute it and/or modify it
11 * under the terms of the GNU Lesser General Public License as published by
12 * the Free Software Foundation; either version 2.1 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public License
21 * along with this program; if not, write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23 *****************************************************************************/
33 #include <vlc_common.h>
34 #include <vlc_picture.h>
/* Forward declaration of the scalar (non-SIMD) plane copy, defined near the
 * end of this file: copies 'height' rows from src to dst, optionally applying
 * a per-sample 16-bit shift (bitshift > 0 shifts right, bitshift < 0 shifts
 * left — see the definition). Declared early because the SSE USWC path falls
 * back to it for trailing unaligned bytes. */
39 static void CopyPlane(uint8_t *dst
, size_t dst_pitch
,
40 const uint8_t *src
, size_t src_pitch
,
41 unsigned height
, int bitshift
);
43 #define ASSERT_PLANE(i) assert(src[i]); \
46 #define ASSERT_2PLANES \
52 #define ASSERT_3PLANES ASSERT_2PLANES; \
/* Initialize the copy cache used as a bounce buffer by the SSE2 copy paths.
 * NOTE(review): several original lines are missing from this excerpt (the
 * function braces, the aligned_alloc failure check and return values, and the
 * non-SSE2 #else branch) — consult the original file before editing. */
55 int CopyInitCache(copy_cache_t
*cache
, unsigned width
)
57 #ifdef CAN_COMPILE_SSE2
/* Cache size: width rounded up to a 64-byte multiple, at least 16 KiB. */
58 cache
->size
= __MAX((width
+ 0x3f) & ~ 0x3f, 16384);
/* 64-byte alignment matches the COPY64 kernels' cache-line-sized transfers. */
59 cache
->buffer
= aligned_alloc(64, cache
->size
);
/* Non-SSE2 build: parameters intentionally unused. */
63 (void) cache
; (void) width
;
/* Release the bounce buffer allocated by CopyInitCache().
 * aligned_free pairs with the aligned_alloc above; on non-SSE2 builds there
 * is nothing to free (the #else branch is not visible in this excerpt). */
68 void CopyCleanCache(copy_cache_t
*cache
)
70 #ifdef CAN_COMPILE_SSE2
71 aligned_free(cache
->buffer
);
79 #ifdef CAN_COMPILE_SSE2
80 /* Copy 16/64 bytes from srcp to dstp loading data with the SSE>=2 instruction
81 * load and storing data with the SSE>=2 instruction store.
84 #define COPY16_SHIFTR(x) \
86 #define COPY16_SHIFTL(x) \
89 #define COPY16_S(dstp, srcp, load, store, shiftstr) \
91 load " 0(%[src]), %%xmm1\n" \
93 store " %%xmm1, 0(%[dst])\n" \
94 : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1")
96 #define COPY16(dstp, srcp, load, store) COPY16_S(dstp, srcp, load, store, "")
98 #define COPY64_SHIFTR(x) \
99 "psrlw "x", %%xmm1\n" \
100 "psrlw "x", %%xmm2\n" \
101 "psrlw "x", %%xmm3\n" \
102 "psrlw "x", %%xmm4\n"
103 #define COPY64_SHIFTL(x) \
104 "psllw "x", %%xmm1\n" \
105 "psllw "x", %%xmm2\n" \
106 "psllw "x", %%xmm3\n" \
107 "psllw "x", %%xmm4\n"
109 #define COPY64_S(dstp, srcp, load, store, shiftstr) \
111 load " 0(%[src]), %%xmm1\n" \
112 load " 16(%[src]), %%xmm2\n" \
113 load " 32(%[src]), %%xmm3\n" \
114 load " 48(%[src]), %%xmm4\n" \
116 store " %%xmm1, 0(%[dst])\n" \
117 store " %%xmm2, 16(%[dst])\n" \
118 store " %%xmm3, 32(%[dst])\n" \
119 store " %%xmm4, 48(%[dst])\n" \
120 : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1", "xmm2", "xmm3", "xmm4")
122 #define COPY64(dstp, srcp, load, store) \
123 COPY64_S(dstp, srcp, load, store, "")
125 #ifdef COPY_TEST_NOOPTIM
126 # undef vlc_CPU_SSE4_1
127 # define vlc_CPU_SSE4_1() (0)
129 # define vlc_CPU_SSE3() (0)
130 # undef vlc_CPU_SSSE3
131 # define vlc_CPU_SSSE3() (0)
133 # define vlc_CPU_SSE2() (0)
136 /* Optimized copy from "Uncacheable Speculative Write Combining" memory
137 * as used by some video surface.
138 * XXX It is really efficient only when SSE4.1 is available.
141 static void CopyFromUswc(uint8_t *dst
, size_t dst_pitch
,
142 const uint8_t *src
, size_t src_pitch
,
143 unsigned width
, unsigned height
, int bitshift
)
145 assert(((intptr_t)dst
& 0x0f) == 0 && (dst_pitch
& 0x0f) == 0);
147 asm volatile ("mfence");
149 #define SSE_USWC_COPY(shiftstr16, shiftstr64) \
150 for (unsigned y = 0; y < height; y++) { \
151 const unsigned unaligned = (-(uintptr_t)src) & 0x0f; \
152 unsigned x = unaligned; \
153 if (vlc_CPU_SSE4_1()) { \
155 for (; x+63 < width; x += 64) \
156 COPY64_S(&dst[x], &src[x], "movntdqa", "movdqa", shiftstr64); \
158 COPY16_S(dst, src, "movdqu", "movdqa", shiftstr16); \
159 for (; x+63 < width; x += 64) \
160 COPY64_S(&dst[x], &src[x], "movntdqa", "movdqu", shiftstr64); \
164 for (; x+63 < width; x += 64) \
165 COPY64_S(&dst[x], &src[x], "movdqa", "movdqa", shiftstr64); \
167 COPY16_S(dst, src, "movdqu", "movdqa", shiftstr16); \
168 for (; x+63 < width; x += 64) \
169 COPY64_S(&dst[x], &src[x], "movdqa", "movdqu", shiftstr64); \
172 /* The following should not happen since buffers are generally well aligned */ \
174 CopyPlane(&dst[x], dst_pitch - x, &src[x], src_pitch - x, 1, bitshift); \
182 SSE_USWC_COPY("", "")
185 SSE_USWC_COPY(COPY16_SHIFTL("$6"), COPY64_SHIFTL("$6"))
188 SSE_USWC_COPY(COPY16_SHIFTR("$6"), COPY64_SHIFTR("$6"))
191 SSE_USWC_COPY(COPY16_SHIFTR("$2"), COPY64_SHIFTR("$2"))
194 SSE_USWC_COPY(COPY16_SHIFTL("$2"), COPY64_SHIFTL("$2"))
197 SSE_USWC_COPY(COPY16_SHIFTR("$4"), COPY64_SHIFTR("$4"))
200 SSE_USWC_COPY(COPY16_SHIFTL("$2"), COPY64_SHIFTL("$2"))
203 vlc_assert_unreachable();
207 asm volatile ("mfence");
/* Copy a 2D region from a 16-byte-aligned source (the cache buffer) to an
 * arbitrarily aligned destination, 64 bytes at a time with SSE2, using
 * non-temporal stores (movntdq) when dst is aligned to avoid polluting the
 * data cache. NOTE(review): the declaration of 'x', the branch structure
 * around 'unaligned' and the per-byte tail-loop body are missing from this
 * excerpt. */
211 static void Copy2d(uint8_t *dst
, size_t dst_pitch
,
212 const uint8_t *src
, size_t src_pitch
,
213 unsigned width
, unsigned height
)
/* The source side must be 16-byte aligned: it is always the cache buffer. */
215 assert(((intptr_t)src
& 0x0f) == 0 && (src_pitch
& 0x0f) == 0);
217 for (unsigned y
= 0; y
< height
; y
++) {
/* Re-check dst alignment each row — dst_pitch need not be 16-aligned. */
220 bool unaligned
= ((intptr_t)dst
& 0x0f) != 0;
/* Aligned dst: stream 64-byte chunks with non-temporal stores. */
222 for (; x
+63 < width
; x
+= 64)
223 COPY64(&dst
[x
], &src
[x
], "movdqa", "movntdq");
/* Unaligned dst: fall back to unaligned regular stores. */
225 for (; x
+63 < width
; x
+= 64)
226 COPY64(&dst
[x
], &src
[x
], "movdqa", "movdqu");
/* Scalar tail for the remaining (width % 64) bytes. */
229 for (; x
< width
; x
++)
239 SSE_InterleaveUV(uint8_t *dst
, size_t dst_pitch
,
240 uint8_t *srcu
, size_t srcu_pitch
,
241 uint8_t *srcv
, size_t srcv_pitch
,
242 unsigned int width
, unsigned int height
, uint8_t pixel_size
)
244 assert(!((intptr_t)srcu
& 0xf) && !(srcu_pitch
& 0x0f) &&
245 !((intptr_t)srcv
& 0xf) && !(srcv_pitch
& 0x0f));
247 static const uint8_t shuffle_8
[] = { 0, 8,
255 static const uint8_t shuffle_16
[] = { 0, 1, 8, 9,
259 const uint8_t *shuffle
= pixel_size
== 1 ? shuffle_8
: shuffle_16
;
261 for (unsigned int y
= 0; y
< height
; ++y
)
266 "movhpd 0x00(%[src2]), %%xmm0\n" \
267 "movlpd 0x00(%[src1]), %%xmm0\n" \
269 "movhpd 0x08(%[src2]), %%xmm1\n" \
270 "movlpd 0x08(%[src1]), %%xmm1\n" \
272 "movhpd 0x10(%[src2]), %%xmm2\n" \
273 "movlpd 0x10(%[src1]), %%xmm2\n" \
275 "movhpd 0x18(%[src2]), %%xmm3\n" \
276 "movlpd 0x18(%[src1]), %%xmm3\n"
279 "movdqu %%xmm0, 0x00(%[dst])\n" \
280 "movdqu %%xmm1, 0x10(%[dst])\n" \
281 "movdqu %%xmm2, 0x20(%[dst])\n" \
282 "movdqu %%xmm3, 0x30(%[dst])\n"
284 #ifdef CAN_COMPILE_SSSE3
286 for (x
= 0; x
< (width
& ~31); x
+= 32)
289 "movdqu (%[shuffle]), %%xmm7\n"
291 "pshufb %%xmm7, %%xmm0\n"
292 "pshufb %%xmm7, %%xmm1\n"
293 "pshufb %%xmm7, %%xmm2\n"
294 "pshufb %%xmm7, %%xmm3\n"
296 : : [dst
]"r"(dst
+2*x
),
297 [src1
]"r"(srcu
+x
), [src2
]"r"(srcv
+x
),
298 [shuffle
]"r"(shuffle
)
299 : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7"
305 assert(pixel_size
== 1);
306 for (x
= 0; x
< (width
& ~31); x
+= 32)
310 "movhlps %%xmm0, %%xmm4\n"
311 "punpcklbw %%xmm4, %%xmm0\n"
313 "movhlps %%xmm1, %%xmm4\n"
314 "punpcklbw %%xmm4, %%xmm1\n"
316 "movhlps %%xmm2, %%xmm4\n"
317 "punpcklbw %%xmm4, %%xmm2\n"
319 "movhlps %%xmm3, %%xmm4\n"
320 "punpcklbw %%xmm4, %%xmm3\n"
322 : : [dst
]"r"(dst
+2*x
),
323 [src1
]"r"(srcu
+x
), [src2
]"r"(srcv
+x
)
325 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm7"
333 for (; x
< width
; x
++) {
334 dst
[2*x
+0] = srcu
[x
];
335 dst
[2*x
+1] = srcv
[x
];
340 for (; x
< width
; x
+= 2) {
341 dst
[2*x
+0] = srcu
[x
];
342 dst
[2*x
+1] = srcu
[x
+ 1];
343 dst
[2*x
+2] = srcv
[x
];
344 dst
[2*x
+3] = srcv
[x
+ 1];
354 static void SSE_SplitUV(uint8_t *dstu
, size_t dstu_pitch
,
355 uint8_t *dstv
, size_t dstv_pitch
,
356 const uint8_t *src
, size_t src_pitch
,
357 unsigned width
, unsigned height
, uint8_t pixel_size
)
359 assert(pixel_size
== 1 || pixel_size
== 2);
360 assert(((intptr_t)src
& 0xf) == 0 && (src_pitch
& 0x0f) == 0);
363 "movdqa 0(%[src]), %%xmm0\n" \
364 "movdqa 16(%[src]), %%xmm1\n" \
365 "movdqa 32(%[src]), %%xmm2\n" \
366 "movdqa 48(%[src]), %%xmm3\n"
369 "movq %%xmm0, 0(%[dst1])\n" \
370 "movq %%xmm1, 8(%[dst1])\n" \
371 "movhpd %%xmm0, 0(%[dst2])\n" \
372 "movhpd %%xmm1, 8(%[dst2])\n" \
373 "movq %%xmm2, 16(%[dst1])\n" \
374 "movq %%xmm3, 24(%[dst1])\n" \
375 "movhpd %%xmm2, 16(%[dst2])\n" \
376 "movhpd %%xmm3, 24(%[dst2])\n"
378 #ifdef CAN_COMPILE_SSSE3
381 static const uint8_t shuffle_8
[] = { 0, 2, 4, 6, 8, 10, 12, 14,
382 1, 3, 5, 7, 9, 11, 13, 15 };
383 static const uint8_t shuffle_16
[] = { 0, 1, 4, 5, 8, 9, 12, 13,
384 2, 3, 6, 7, 10, 11, 14, 15 };
385 const uint8_t *shuffle
= pixel_size
== 1 ? shuffle_8
: shuffle_16
;
386 for (unsigned y
= 0; y
< height
; y
++) {
388 for (; x
< (width
& ~31); x
+= 32) {
390 "movdqu (%[shuffle]), %%xmm7\n"
392 "pshufb %%xmm7, %%xmm0\n"
393 "pshufb %%xmm7, %%xmm1\n"
394 "pshufb %%xmm7, %%xmm2\n"
395 "pshufb %%xmm7, %%xmm3\n"
397 : : [dst1
]"r"(&dstu
[x
]), [dst2
]"r"(&dstv
[x
]), [src
]"r"(&src
[2*x
]), [shuffle
]"r"(shuffle
) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7");
401 for (; x
< width
; x
++) {
402 dstu
[x
] = src
[2*x
+0];
403 dstv
[x
] = src
[2*x
+1];
408 for (; x
< width
; x
+= 2) {
409 dstu
[x
] = src
[2*x
+0];
410 dstu
[x
+1] = src
[2*x
+1];
411 dstv
[x
] = src
[2*x
+2];
412 dstv
[x
+1] = src
[2*x
+3];
422 assert(pixel_size
== 1);
423 static const uint8_t mask
[] = { 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00,
424 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00 };
426 for (unsigned y
= 0; y
< height
; y
++)
429 for (; x
< (width
& ~31); x
+= 32) {
431 "movdqu (%[mask]), %%xmm7\n"
433 "movdqa %%xmm0, %%xmm4\n"
434 "movdqa %%xmm1, %%xmm5\n"
435 "movdqa %%xmm2, %%xmm6\n"
438 "pand %%xmm7, %%xmm4\n"
439 "pand %%xmm7, %%xmm5\n"
440 "pand %%xmm7, %%xmm6\n"
441 "packuswb %%xmm4, %%xmm0\n"
442 "packuswb %%xmm5, %%xmm1\n"
443 "pand %%xmm3, %%xmm7\n"
446 "packuswb %%xmm6, %%xmm2\n"
447 "packuswb %%xmm7, %%xmm3\n"
449 : : [dst2
]"r"(&dstu
[x
]), [dst1
]"r"(&dstv
[x
]), [src
]"r"(&src
[2*x
]), [mask
]"r"(mask
) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
451 for (; x
< width
; x
++) {
452 dstu
[x
] = src
[2*x
+0];
453 dstv
[x
] = src
[2*x
+1];
/* Copy one plane from (possibly USWC) video memory to system memory:
 * the plane is copied in horizontal bands of 'hstep' lines through the
 * cache-line-aligned bounce buffer (CopyFromUswc reads, Copy2d writes),
 * which is much faster than reading uncacheable memory directly.
 * NOTE(review): the function braces and the early 'return' after the memcpy
 * fast path are missing from this excerpt. */
464 static void SSE_CopyPlane(uint8_t *dst
, size_t dst_pitch
,
465 const uint8_t *src
, size_t src_pitch
,
466 uint8_t *cache
, size_t cache_size
,
467 unsigned height
, int bitshift
)
/* Band geometry: pitch rounded up to 16, and how many lines fit in cache. */
469 const unsigned w16
= (src_pitch
+15) & ~15;
470 const unsigned hstep
= cache_size
/ w16
;
473 /* If SSE4.1: CopyFromUswc is faster than memcpy */
474 if (!vlc_CPU_SSE4_1() && bitshift
== 0 && src_pitch
== dst_pitch
)
475 memcpy(dst
, src
, src_pitch
* height
);
477 for (unsigned y
= 0; y
< height
; y
+= hstep
) {
478 const unsigned hblock
= __MIN(hstep
, height
- y
);
480 /* Copy a bunch of line into our cache */
481 CopyFromUswc(cache
, w16
, src
, src_pitch
, src_pitch
, hblock
, bitshift
);
483 /* Copy from our cache to the destination */
484 Copy2d(dst
, dst_pitch
, cache
, w16
, src_pitch
, hblock
);
/* Advance both sides by the band just processed. */
487 src
+= src_pitch
* hblock
;
488 dst
+= dst_pitch
* hblock
;
493 SSE_InterleavePlanes(uint8_t *dst
, size_t dst_pitch
,
494 const uint8_t *srcu
, size_t srcu_pitch
,
495 const uint8_t *srcv
, size_t srcv_pitch
,
496 uint8_t *cache
, size_t cache_size
,
497 unsigned int height
, uint8_t pixel_size
, int bitshift
)
499 assert(srcu_pitch
== srcv_pitch
);
500 unsigned int const w16
= (srcu_pitch
+15) & ~15;
501 unsigned int const hstep
= (cache_size
) / (2*w16
);
504 for (unsigned int y
= 0; y
< height
; y
+= hstep
)
506 unsigned int const hblock
= __MIN(hstep
, height
- y
);
508 /* Copy a bunch of line into our cache */
509 CopyFromUswc(cache
, w16
, srcu
, srcu_pitch
, srcu_pitch
, hblock
, bitshift
);
510 CopyFromUswc(cache
+w16
*hblock
, w16
, srcv
, srcv_pitch
,
511 srcv_pitch
, hblock
, bitshift
);
513 /* Copy from our cache to the destination */
514 SSE_InterleaveUV(dst
, dst_pitch
, cache
, w16
,
515 cache
+ w16
* hblock
, w16
,
516 srcu_pitch
, hblock
, pixel_size
);
519 srcu
+= hblock
* srcu_pitch
;
520 srcv
+= hblock
* srcv_pitch
;
521 dst
+= hblock
* dst_pitch
;
525 static void SSE_SplitPlanes(uint8_t *dstu
, size_t dstu_pitch
,
526 uint8_t *dstv
, size_t dstv_pitch
,
527 const uint8_t *src
, size_t src_pitch
,
528 uint8_t *cache
, size_t cache_size
,
529 unsigned height
, uint8_t pixel_size
, int bitshift
)
531 const unsigned w16
= (src_pitch
+15) & ~15;
532 const unsigned hstep
= cache_size
/ w16
;
535 for (unsigned y
= 0; y
< height
; y
+= hstep
) {
536 const unsigned hblock
= __MIN(hstep
, height
- y
);
538 /* Copy a bunch of line into our cache */
539 CopyFromUswc(cache
, w16
, src
, src_pitch
, src_pitch
, hblock
, bitshift
);
541 /* Copy from our cache to the destination */
542 SSE_SplitUV(dstu
, dstu_pitch
, dstv
, dstv_pitch
,
543 cache
, w16
, src_pitch
/ 2, hblock
, pixel_size
);
546 src
+= src_pitch
* hblock
;
547 dstu
+= dstu_pitch
* hblock
;
548 dstv
+= dstv_pitch
* hblock
;
552 static void SSE_Copy420_P_to_P(picture_t
*dst
, const uint8_t *src
[static 3],
553 const size_t src_pitch
[static 3], unsigned height
,
554 const copy_cache_t
*cache
)
556 for (unsigned n
= 0; n
< 3; n
++) {
557 const unsigned d
= n
> 0 ? 2 : 1;
558 SSE_CopyPlane(dst
->p
[n
].p_pixels
, dst
->p
[n
].i_pitch
,
559 src
[n
], src_pitch
[n
],
560 cache
->buffer
, cache
->size
,
563 asm volatile ("emms");
567 static void SSE_Copy420_SP_to_SP(picture_t
*dst
, const uint8_t *src
[static 2],
568 const size_t src_pitch
[static 2], unsigned height
,
569 const copy_cache_t
*cache
)
571 SSE_CopyPlane(dst
->p
[0].p_pixels
, dst
->p
[0].i_pitch
, src
[0], src_pitch
[0],
572 cache
->buffer
, cache
->size
, height
, 0);
573 SSE_CopyPlane(dst
->p
[1].p_pixels
, dst
->p
[1].i_pitch
, src
[1], src_pitch
[1],
574 cache
->buffer
, cache
->size
, height
/ 2, 0);
575 asm volatile ("emms");
579 SSE_Copy420_SP_to_P(picture_t
*dest
, const uint8_t *src
[static 2],
580 const size_t src_pitch
[static 2], unsigned int height
,
581 uint8_t pixel_size
, int bitshift
, const copy_cache_t
*cache
)
583 SSE_CopyPlane(dest
->p
[0].p_pixels
, dest
->p
[0].i_pitch
,
584 src
[0], src_pitch
[0], cache
->buffer
, cache
->size
, height
, bitshift
);
586 SSE_SplitPlanes(dest
->p
[1].p_pixels
, dest
->p
[1].i_pitch
,
587 dest
->p
[2].p_pixels
, dest
->p
[2].i_pitch
,
588 src
[1], src_pitch
[1], cache
->buffer
, cache
->size
,
589 height
/ 2, pixel_size
, bitshift
);
590 asm volatile ("emms");
593 static void SSE_Copy420_P_to_SP(picture_t
*dst
, const uint8_t *src
[static 3],
594 const size_t src_pitch
[static 3],
595 unsigned height
, uint8_t pixel_size
,
596 int bitshift
, const copy_cache_t
*cache
)
598 SSE_CopyPlane(dst
->p
[0].p_pixels
, dst
->p
[0].i_pitch
, src
[0], src_pitch
[0],
599 cache
->buffer
, cache
->size
, height
, bitshift
);
600 SSE_InterleavePlanes(dst
->p
[1].p_pixels
, dst
->p
[1].i_pitch
,
601 src
[U_PLANE
], src_pitch
[U_PLANE
],
602 src
[V_PLANE
], src_pitch
[V_PLANE
],
603 cache
->buffer
, cache
->size
, height
/ 2, pixel_size
, bitshift
);
604 asm volatile ("emms");
607 #endif /* CAN_COMPILE_SSE2 */
/* Scalar plane copy (fallback when SSE is unavailable). Three cases, from the
 * visible branches: bitshift != 0 copies 16-bit samples with a right shift
 * (bitshift > 0) or left shift (bitshift < 0, masked to 0..15); bitshift == 0
 * with equal pitches is a single memcpy; otherwise one memcpy per row.
 * NOTE(review): the outer 'if (bitshift)' line, the braces and the per-row
 * dst/src pointer advancement lines are missing from this excerpt. */
609 static void CopyPlane(uint8_t *dst
, size_t dst_pitch
,
610 const uint8_t *src
, size_t src_pitch
,
611 unsigned height
, int bitshift
)
615 for (unsigned y
= 0; y
< height
; y
++)
617 uint16_t *dst16
= (uint16_t *) dst
;
618 const uint16_t *src16
= (const uint16_t *) src
;
/* Positive shift: scale samples down (e.g. 16-bit container -> 10-bit). */
621 for (unsigned x
= 0; x
< (src_pitch
/ 2); x
++)
622 *dst16
++ = (*src16
++) >> (bitshift
& 0xf);
/* Negative shift: scale samples up; (-bitshift) & 0xf keeps it in range. */
624 for (unsigned x
= 0; x
< (src_pitch
/ 2); x
++)
625 *dst16
++ = (*src16
++) << ((-bitshift
) & 0xf);
/* No shift and identical pitches: the whole plane is one contiguous copy. */
630 else if (src_pitch
== dst_pitch
)
631 memcpy(dst
, src
, src_pitch
* height
);
/* Differing pitches: copy row by row. */
633 for (unsigned y
= 0; y
< height
; y
++) {
634 memcpy(dst
, src
, src_pitch
);
/* Copy a single packed plane into dst->p[0], going through the USWC-aware
 * SSE path when SSE4.1 is available, else the scalar CopyPlane.
 * NOTE(review): the function braces and the trailing arguments of the
 * CopyPlane fallback call are missing from this excerpt. */
640 void CopyPacked(picture_t
*dst
, const uint8_t *src
, const size_t src_pitch
,
641 unsigned height
, const copy_cache_t
*cache
)
644 assert(src
); assert(src_pitch
);
647 if (vlc_CPU_SSE4_1())
648 SSE_CopyPlane(dst
->p
[0].p_pixels
, dst
->p
[0].i_pitch
, src
, src_pitch
,
649 cache
->buffer
, cache
->size
, height
, 0);
651 CopyPlane(dst
->p
[0].p_pixels
, dst
->p
[0].i_pitch
, src
, src_pitch
,
/* Copy a 4:2:0 semi-planar picture (e.g. NV12) to a semi-planar destination:
 * luma plane at full height, interleaved chroma plane at height/2. Dispatches
 * to the SSE2 implementation when available, else scalar CopyPlane per plane.
 * NOTE(review): the runtime vlc_CPU_SSE2() check guarding the SSE call and
 * the function braces are missing from this excerpt. */
655 void Copy420_SP_to_SP(picture_t
*dst
, const uint8_t *src
[static 2],
656 const size_t src_pitch
[static 2], unsigned height
,
657 const copy_cache_t
*cache
)
660 #ifdef CAN_COMPILE_SSE2
662 return SSE_Copy420_SP_to_SP(dst
, src
, src_pitch
, height
, cache
);
/* Scalar fallback: luma first, then the half-height interleaved chroma. */
667 CopyPlane(dst
->p
[0].p_pixels
, dst
->p
[0].i_pitch
,
668 src
[0], src_pitch
[0], height
, 0);
669 CopyPlane(dst
->p
[1].p_pixels
, dst
->p
[1].i_pitch
,
670 src
[1], src_pitch
[1], height
/2, 0);
673 #define SPLIT_PLANES(type, pitch_den) do { \
674 for (unsigned y = 0; y < height; y++) { \
675 for (unsigned x = 0; x < src_pitch / pitch_den; x++) { \
676 ((type *) dstu)[x] = ((const type *) src)[2*x+0]; \
677 ((type *) dstv)[x] = ((const type *) src)[2*x+1]; \
680 dstu += dstu_pitch; \
681 dstv += dstv_pitch; \
685 #define SPLIT_PLANES_SHIFTR(type, pitch_den, bitshift) do { \
686 for (unsigned y = 0; y < height; y++) { \
687 for (unsigned x = 0; x < src_pitch / pitch_den; x++) { \
688 ((type *) dstu)[x] = (((const type *) src)[2*x+0]) >> (bitshift); \
689 ((type *) dstv)[x] = (((const type *) src)[2*x+1]) >> (bitshift); \
692 dstu += dstu_pitch; \
693 dstv += dstv_pitch; \
697 #define SPLIT_PLANES_SHIFTL(type, pitch_den, bitshift) do { \
698 for (unsigned y = 0; y < height; y++) { \
699 for (unsigned x = 0; x < src_pitch / pitch_den; x++) { \
700 ((type *) dstu)[x] = (((const type *) src)[2*x+0]) << (bitshift); \
701 ((type *) dstv)[x] = (((const type *) src)[2*x+1]) << (bitshift); \
704 dstu += dstu_pitch; \
705 dstv += dstv_pitch; \
709 static void SplitPlanes(uint8_t *dstu
, size_t dstu_pitch
,
710 uint8_t *dstv
, size_t dstv_pitch
,
711 const uint8_t *src
, size_t src_pitch
, unsigned height
)
713 SPLIT_PLANES(uint8_t, 2);
716 static void SplitPlanes16(uint8_t *dstu
, size_t dstu_pitch
,
717 uint8_t *dstv
, size_t dstv_pitch
,
718 const uint8_t *src
, size_t src_pitch
, unsigned height
,
722 SPLIT_PLANES(uint16_t, 4);
723 else if (bitshift
> 0)
724 SPLIT_PLANES_SHIFTR(uint16_t, 4, bitshift
& 0xf);
726 SPLIT_PLANES_SHIFTL(uint16_t, 4, (-bitshift
) & 0xf);
729 void Copy420_SP_to_P(picture_t
*dst
, const uint8_t *src
[static 2],
730 const size_t src_pitch
[static 2], unsigned height
,
731 const copy_cache_t
*cache
)
734 #ifdef CAN_COMPILE_SSE2
736 return SSE_Copy420_SP_to_P(dst
, src
, src_pitch
, height
, 1, 0, cache
);
741 CopyPlane(dst
->p
[0].p_pixels
, dst
->p
[0].i_pitch
,
742 src
[0], src_pitch
[0], height
, 0);
743 SplitPlanes(dst
->p
[1].p_pixels
, dst
->p
[1].i_pitch
,
744 dst
->p
[2].p_pixels
, dst
->p
[2].i_pitch
,
745 src
[1], src_pitch
[1], height
/2);
748 void Copy420_16_SP_to_P(picture_t
*dst
, const uint8_t *src
[static 2],
749 const size_t src_pitch
[static 2], unsigned height
,
750 int bitshift
, const copy_cache_t
*cache
)
753 assert(bitshift
>= -6 && bitshift
<= 6 && (bitshift
% 2 == 0));
755 #ifdef CAN_COMPILE_SSE3
757 return SSE_Copy420_SP_to_P(dst
, src
, src_pitch
, height
, 2, bitshift
, cache
);
762 CopyPlane(dst
->p
[0].p_pixels
, dst
->p
[0].i_pitch
,
763 src
[0], src_pitch
[0], height
, bitshift
);
764 SplitPlanes16(dst
->p
[1].p_pixels
, dst
->p
[1].i_pitch
,
765 dst
->p
[2].p_pixels
, dst
->p
[2].i_pitch
,
766 src
[1], src_pitch
[1], height
/2, bitshift
);
769 #define INTERLEAVE_UV() do { \
770 for ( unsigned int line = 0; line < copy_lines; line++ ) { \
771 for ( unsigned int col = 0; col < copy_pitch; col++ ) { \
772 *dstUV++ = *srcU++; \
773 *dstUV++ = *srcV++; \
775 dstUV += i_extra_pitch_uv; \
776 srcU += i_extra_pitch_u; \
777 srcV += i_extra_pitch_v; \
781 #define INTERLEAVE_UV_SHIFTR(bitshitf) do { \
782 for ( unsigned int line = 0; line < copy_lines; line++ ) { \
783 for ( unsigned int col = 0; col < copy_pitch; col++ ) { \
784 *dstUV++ = (*srcU++) >> (bitshitf); \
785 *dstUV++ = (*srcV++) >> (bitshitf); \
787 dstUV += i_extra_pitch_uv; \
788 srcU += i_extra_pitch_u; \
789 srcV += i_extra_pitch_v; \
793 #define INTERLEAVE_UV_SHIFTL(bitshitf) do { \
794 for ( unsigned int line = 0; line < copy_lines; line++ ) { \
795 for ( unsigned int col = 0; col < copy_pitch; col++ ) { \
796 *dstUV++ = (*srcU++) << (bitshitf); \
797 *dstUV++ = (*srcV++) << (bitshitf); \
799 dstUV += i_extra_pitch_uv; \
800 srcU += i_extra_pitch_u; \
801 srcV += i_extra_pitch_v; \
805 void Copy420_P_to_SP(picture_t
*dst
, const uint8_t *src
[static 3],
806 const size_t src_pitch
[static 3], unsigned height
,
807 const copy_cache_t
*cache
)
810 #ifdef CAN_COMPILE_SSE2
812 return SSE_Copy420_P_to_SP(dst
, src
, src_pitch
, height
, 1, 0, cache
);
817 CopyPlane(dst
->p
[0].p_pixels
, dst
->p
[0].i_pitch
,
818 src
[0], src_pitch
[0], height
, 0);
820 const unsigned copy_lines
= height
/ 2;
821 const unsigned copy_pitch
= src_pitch
[1];
823 const int i_extra_pitch_uv
= dst
->p
[1].i_pitch
- 2 * copy_pitch
;
824 const int i_extra_pitch_u
= src_pitch
[U_PLANE
] - copy_pitch
;
825 const int i_extra_pitch_v
= src_pitch
[V_PLANE
] - copy_pitch
;
827 uint8_t *dstUV
= dst
->p
[1].p_pixels
;
828 const uint8_t *srcU
= src
[U_PLANE
];
829 const uint8_t *srcV
= src
[V_PLANE
];
833 void Copy420_16_P_to_SP(picture_t
*dst
, const uint8_t *src
[static 3],
834 const size_t src_pitch
[static 3], unsigned height
,
835 int bitshift
, const copy_cache_t
*cache
)
838 assert(bitshift
>= -6 && bitshift
<= 6 && (bitshift
% 2 == 0));
839 #ifdef CAN_COMPILE_SSE2
841 return SSE_Copy420_P_to_SP(dst
, src
, src_pitch
, height
, 2, bitshift
, cache
);
846 CopyPlane(dst
->p
[0].p_pixels
, dst
->p
[0].i_pitch
,
847 src
[0], src_pitch
[0], height
, bitshift
);
849 const unsigned copy_lines
= height
/ 2;
850 const unsigned copy_pitch
= src_pitch
[1] / 2;
852 const int i_extra_pitch_uv
= dst
->p
[1].i_pitch
/ 2 - 2 * copy_pitch
;
853 const int i_extra_pitch_u
= src_pitch
[U_PLANE
] / 2 - copy_pitch
;
854 const int i_extra_pitch_v
= src_pitch
[V_PLANE
] / 2 - copy_pitch
;
856 uint16_t *dstUV
= (void*) dst
->p
[1].p_pixels
;
857 const uint16_t *srcU
= (const uint16_t *) src
[U_PLANE
];
858 const uint16_t *srcV
= (const uint16_t *) src
[V_PLANE
];
862 else if (bitshift
> 0)
863 INTERLEAVE_UV_SHIFTR(bitshift
& 0xf);
865 INTERLEAVE_UV_SHIFTL((-bitshift
) & 0xf);
868 void CopyFromI420_10ToP010(picture_t
*dst
, const uint8_t *src
[static 3],
869 const size_t src_pitch
[static 3],
870 unsigned height
, const copy_cache_t
*cache
)
874 const int i_extra_pitch_dst_y
= (dst
->p
[0].i_pitch
- src_pitch
[0]) / 2;
875 const int i_extra_pitch_src_y
= (src_pitch
[Y_PLANE
] - src_pitch
[0]) / 2;
876 uint16_t *dstY
= (uint16_t *) dst
->p
[0].p_pixels
;
877 const uint16_t *srcY
= (const uint16_t *) src
[Y_PLANE
];
878 for (unsigned y
= 0; y
< height
; y
++) {
879 for (unsigned x
= 0; x
< (src_pitch
[0] / 2); x
++) {
880 *dstY
++ = *srcY
++ << 6;
882 dstY
+= i_extra_pitch_dst_y
;
883 srcY
+= i_extra_pitch_src_y
;
886 const unsigned copy_lines
= height
/ 2;
887 const unsigned copy_pitch
= src_pitch
[1] / 2;
889 const int i_extra_pitch_uv
= dst
->p
[1].i_pitch
/ 2 - 2 * copy_pitch
;
890 const int i_extra_pitch_u
= src_pitch
[U_PLANE
] / 2 - copy_pitch
;
891 const int i_extra_pitch_v
= src_pitch
[V_PLANE
] / 2 - copy_pitch
;
893 uint16_t *dstUV
= (uint16_t *) dst
->p
[1].p_pixels
;
894 const uint16_t *srcU
= (const uint16_t *) src
[U_PLANE
];
895 const uint16_t *srcV
= (const uint16_t *) src
[V_PLANE
];
896 for ( unsigned int line
= 0; line
< copy_lines
; line
++ )
898 for ( unsigned int col
= 0; col
< copy_pitch
; col
++ )
900 *dstUV
++ = *srcU
++ << 6;
901 *dstUV
++ = *srcV
++ << 6;
903 dstUV
+= i_extra_pitch_uv
;
904 srcU
+= i_extra_pitch_u
;
905 srcV
+= i_extra_pitch_v
;
909 void Copy420_P_to_P(picture_t
*dst
, const uint8_t *src
[static 3],
910 const size_t src_pitch
[static 3], unsigned height
,
911 const copy_cache_t
*cache
)
914 #ifdef CAN_COMPILE_SSE2
916 return SSE_Copy420_P_to_P(dst
, src
, src_pitch
, height
, cache
);
921 CopyPlane(dst
->p
[0].p_pixels
, dst
->p
[0].i_pitch
,
922 src
[0], src_pitch
[0], height
, 0);
923 CopyPlane(dst
->p
[1].p_pixels
, dst
->p
[1].i_pitch
,
924 src
[1], src_pitch
[1], height
/ 2, 0);
925 CopyPlane(dst
->p
[2].p_pixels
, dst
->p
[2].i_pitch
,
926 src
[2], src_pitch
[2], height
/ 2, 0);
929 int picture_UpdatePlanes(picture_t
*picture
, uint8_t *data
, unsigned pitch
)
931 /* fill in buffer info in first plane */
932 picture
->p
->p_pixels
= data
;
933 picture
->p
->i_pitch
= pitch
;
934 picture
->p
->i_lines
= picture
->format
.i_height
;
935 assert(picture
->p
->i_visible_pitch
<= picture
->p
->i_pitch
);
936 assert(picture
->p
->i_visible_lines
<= picture
->p
->i_lines
);
938 /* Fill chroma planes for biplanar YUV */
939 if (picture
->format
.i_chroma
== VLC_CODEC_NV12
||
940 picture
->format
.i_chroma
== VLC_CODEC_NV21
||
941 picture
->format
.i_chroma
== VLC_CODEC_P010
) {
943 for (int n
= 1; n
< picture
->i_planes
; n
++) {
944 const plane_t
*o
= &picture
->p
[n
-1];
945 plane_t
*p
= &picture
->p
[n
];
947 p
->p_pixels
= o
->p_pixels
+ o
->i_lines
* o
->i_pitch
;
949 p
->i_lines
= picture
->format
.i_height
;
950 assert(p
->i_visible_pitch
<= p
->i_pitch
);
951 assert(p
->i_visible_lines
<= p
->i_lines
);
953 /* The dx/d3d buffer is always allocated as NV12 */
954 if (vlc_fourcc_AreUVPlanesSwapped(picture
->format
.i_chroma
, VLC_CODEC_NV12
)) {
955 /* TODO : Swap NV21 UV planes to match NV12 */
960 /* Fill chroma planes for planar YUV */
962 if (picture
->format
.i_chroma
== VLC_CODEC_I420
||
963 picture
->format
.i_chroma
== VLC_CODEC_J420
||
964 picture
->format
.i_chroma
== VLC_CODEC_YV12
) {
966 for (int n
= 1; n
< picture
->i_planes
; n
++) {
967 const plane_t
*o
= &picture
->p
[n
-1];
968 plane_t
*p
= &picture
->p
[n
];
970 p
->p_pixels
= o
->p_pixels
+ o
->i_lines
* o
->i_pitch
;
971 p
->i_pitch
= pitch
/ 2;
972 p
->i_lines
= picture
->format
.i_height
/ 2;
974 /* The dx/d3d buffer is always allocated as YV12 */
975 if (vlc_fourcc_AreUVPlanesSwapped(picture
->format
.i_chroma
, VLC_CODEC_YV12
))
976 picture_SwapUV( picture
);
983 #include <vlc_picture.h>
991 void (*conv
)(picture_t
*, const uint8_t *[], const size_t [], unsigned,
992 const copy_cache_t
*);
993 void (*conv16
)(picture_t
*, const uint8_t *[], const size_t [], unsigned, int,
994 const copy_cache_t
*);
1000 vlc_fourcc_t src_chroma
;
1001 struct test_dst dsts
[3];
1004 static const struct test_conv convs
[] = {
1005 { .src_chroma
= VLC_CODEC_NV12
,
1006 .dsts
= { { VLC_CODEC_I420
, 0, .conv
= Copy420_SP_to_P
},
1007 { VLC_CODEC_NV12
, 0, .conv
= Copy420_SP_to_SP
} },
1009 { .src_chroma
= VLC_CODEC_I420
,
1010 .dsts
= { { VLC_CODEC_I420
, 0, .conv
= Copy420_P_to_P
},
1011 { VLC_CODEC_NV12
, 0, .conv
= Copy420_P_to_SP
} },
1013 { .src_chroma
= VLC_CODEC_P010
,
1014 .dsts
= { { VLC_CODEC_I420_10L
, 6, .conv16
= Copy420_16_SP_to_P
} },
1016 { .src_chroma
= VLC_CODEC_I420_10L
,
1017 .dsts
= { { VLC_CODEC_P010
, -6, .conv16
= Copy420_16_P_to_SP
} },
1020 #define NB_CONVS ARRAY_SIZE(convs)
1026 int i_visible_width
;
1027 int i_visible_height
;
1029 static const struct test_size sizes
[] = {
1033 { 560, 369, 540, 350 },
1034 { 1274, 721, 1200, 720 },
1035 { 1920, 1088, 1920, 1080 },
1036 { 3840, 2160, 3840, 2160 },
1037 #if 0 /* too long */
1038 { 8192, 8192, 8192, 8192 },
1041 #define NB_SIZES ARRAY_SIZE(sizes)
1043 static void piccheck(picture_t
*pic
, const vlc_chroma_description_t
*dsc
,
1046 #define ASSERT_COLOR(good) do { \
1047 fprintf(stderr, "error: pixel doesn't match @ plane: %d: %d x %d: 0x%X vs 0x%X\n", i, x, y, *(--p), good); \
1048 assert(!"error: pixel doesn't match"); \
1051 #define PICCHECK(type_u, type_uv, colors_P, color_UV, pitch_den) do { \
1052 for (int i = 0; i < pic->i_planes; ++i) \
1054 const struct plane_t *plane = &pic->p[i]; \
1055 for (int y = 0; y < plane->i_visible_lines; ++y) \
1057 if (pic->i_planes == 2 && i == 1) \
1059 type_uv *p = (type_uv *)&plane->p_pixels[y * plane->i_pitch]; \
1060 for (int x = 0; x < plane->i_visible_pitch / 2 / pitch_den; ++x) \
1062 *(p++) = color_UV; \
1063 else if (*(p++) != color_UV) \
1064 ASSERT_COLOR(color_UV); \
1068 type_u *p = (type_u *) &plane->p_pixels[y * plane->i_pitch]; \
1069 for (int x = 0; x < plane->i_visible_pitch / pitch_den; ++x) \
1071 *(p++) = colors_P[i]; \
1072 else if (*(p++) != colors_P[i]) \
1073 ASSERT_COLOR(colors_P[i]); \
1079 assert(pic
->i_planes
== 2 || pic
->i_planes
== 3);
1080 assert(dsc
->pixel_size
== 1 || dsc
->pixel_size
== 2);
1082 if (dsc
->pixel_size
== 1)
1084 const uint8_t colors_8_P
[3] = { 0x42, 0xF1, 0x36 };
1085 const uint16_t color_8_UV
= ntoh16(0xF136);
1086 PICCHECK(uint8_t, uint16_t, colors_8_P
, color_8_UV
, 1);
1090 const unsigned mask
= (1 << dsc
->pixel_bits
) - 1;
1091 uint16_t colors_16_P
[3] = { 0x1042 &mask
, 0xF114 &mask
, 0x3645 &mask
};
1093 switch (pic
->format
.i_chroma
)
1095 case VLC_CODEC_P010
:
1096 for (size_t i
= 0; i
< 3; ++i
)
1097 colors_16_P
[i
] <<= 6;
1099 case VLC_CODEC_I420_10L
:
1102 vlc_assert_unreachable();
1105 uint32_t color_16_UV
= (colors_16_P
[2] << 16) | colors_16_P
[1];
1107 PICCHECK(uint16_t, uint32_t, colors_16_P
, color_16_UV
, 2);
1111 static void pic_rsc_destroy(picture_t
*pic
)
1113 for (unsigned i
= 0; i
< 3; i
++)
1114 free(pic
->p
[i
].p_pixels
);
1118 static picture_t
*pic_new_unaligned(const video_format_t
*fmt
)
1120 /* Allocate a no-aligned picture in order to ease buffer overflow detection
1121 * from the source picture */
1122 const vlc_chroma_description_t
*dsc
= vlc_fourcc_GetChromaDescription(fmt
->i_chroma
);
1124 picture_resource_t rsc
= { .pf_destroy
= pic_rsc_destroy
};
1125 for (unsigned i
= 0; i
< dsc
->plane_count
; i
++)
1127 rsc
.p
[i
].i_lines
= ((fmt
->i_visible_height
+ 1) & ~ 1) * dsc
->p
[i
].h
.num
/ dsc
->p
[i
].h
.den
;
1128 rsc
.p
[i
].i_pitch
= ((fmt
->i_visible_width
+ 1) & ~ 1) * dsc
->pixel_size
* dsc
->p
[i
].w
.num
/ dsc
->p
[i
].w
.den
;
1129 rsc
.p
[i
].p_pixels
= malloc(rsc
.p
[i
].i_lines
* rsc
.p
[i
].i_pitch
);
1130 assert(rsc
.p
[i
].p_pixels
);
1132 return picture_NewFromResource(fmt
, &rsc
);
1139 #ifndef COPY_TEST_NOOPTIM
1140 if (!vlc_CPU_SSE2())
1142 fprintf(stderr
, "WARNING: could not test SSE\n");
1147 for (size_t i
= 0; i
< NB_CONVS
; ++i
)
1149 const struct test_conv
*conv
= &convs
[i
];
1151 for (size_t j
= 0; j
< NB_SIZES
; ++j
)
1153 const struct test_size
*size
= &sizes
[j
];
1155 const vlc_chroma_description_t
*src_dsc
=
1156 vlc_fourcc_GetChromaDescription(conv
->src_chroma
);
1160 video_format_Init(&fmt
, 0);
1161 video_format_Setup(&fmt
, conv
->src_chroma
,
1162 size
->i_width
, size
->i_height
,
1163 size
->i_visible_width
, size
->i_visible_height
,
1165 picture_t
*src
= pic_new_unaligned(&fmt
);
1167 piccheck(src
, src_dsc
, true);
1170 int ret
= CopyInitCache(&cache
, src
->format
.i_width
1171 * src_dsc
->pixel_size
);
1172 assert(ret
== VLC_SUCCESS
);
1174 for (size_t f
= 0; conv
->dsts
[f
].chroma
!= 0; ++f
)
1176 const struct test_dst
*test_dst
= &conv
->dsts
[f
];
1178 const vlc_chroma_description_t
*dst_dsc
=
1179 vlc_fourcc_GetChromaDescription(test_dst
->chroma
);
1181 fmt
.i_chroma
= test_dst
->chroma
;
1182 picture_t
*dst
= picture_NewFromFormat(&fmt
);
1185 const uint8_t * src_planes
[3] = { src
->p
[Y_PLANE
].p_pixels
,
1186 src
->p
[U_PLANE
].p_pixels
,
1187 src
->p
[V_PLANE
].p_pixels
};
1188 const size_t src_pitches
[3] = { src
->p
[Y_PLANE
].i_pitch
,
1189 src
->p
[U_PLANE
].i_pitch
,
1190 src
->p
[V_PLANE
].i_pitch
};
1192 fprintf(stderr
, "testing: %u x %u (vis: %u x %u) %4.4s -> %4.4s\n",
1193 size
->i_width
, size
->i_height
,
1194 size
->i_visible_width
, size
->i_visible_height
,
1195 (const char *) &src
->format
.i_chroma
,
1196 (const char *) &dst
->format
.i_chroma
);
1197 if (test_dst
->bitshift
== 0)
1198 test_dst
->conv(dst
, src_planes
, src_pitches
,
1199 src
->format
.i_visible_height
, &cache
);
1201 test_dst
->conv16(dst
, src_planes
, src_pitches
,
1202 src
->format
.i_visible_height
, test_dst
->bitshift
,
1204 piccheck(dst
, dst_dsc
, false);
1205 picture_Release(dst
);
1207 picture_Release(src
);
1208 CopyCleanCache(&cache
);