modules/video_chroma/copy.c

   1 /*****************************************************************************
   2  * copy.c: Fast YV12/NV12 copy
   3  *****************************************************************************
   4  * Copyright (C) 2010 Laurent Aimar
   5  * $Id$
   6  *
   7  * Authors: Laurent Aimar <fenrir _AT_ videolan _DOT_ org>
   8  *          Victorien Le Couviour--Tuffet <victorien.lecouviour.tuffet@gmail.com>
   9  *
  10  * This program is free software; you can redistribute it and/or modify it
  11  * under the terms of the GNU Lesser General Public License as published by
  12  * the Free Software Foundation; either version 2.1 of the License, or
  13  * (at your option) any later version.
  14  *
  15  * This program is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  18  * GNU Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public License
  21  * along with this program; if not, write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
  23  *****************************************************************************/
  24
  25 #ifdef HAVE_CONFIG_H
  26 # include "config.h"
  27 #endif
  28
  29 #ifdef COPY_TEST
  30 # undef NDEBUG
  31 #endif
  32
  33 #include <vlc_common.h>
  34 #include <vlc_picture.h>
  35 #include <vlc_cpu.h>
  36 #include <assert.h>
  37
  38 #include "copy.h"
  39 static void CopyPlane(uint8_t *dst, size_t dst_pitch,
  40                       const uint8_t *src, size_t src_pitch,
  41                       unsigned height, int bitshift);
  42
  43 #define ASSERT_PLANE(i) assert(src[i]); \
  44     assert(src_pitch[i])
  45
  46 #define ASSERT_2PLANES \
  47     assert(dst); \
  48     ASSERT_PLANE(0); \
  49     ASSERT_PLANE(1); \
  50     assert(height)
  51
  52 #define ASSERT_3PLANES ASSERT_2PLANES; \
  53     ASSERT_PLANE(2)
  54
  55 int CopyInitCache(copy_cache_t *cache, unsigned width)
  56 {
  57 #ifdef CAN_COMPILE_SSE2
  58     cache->size = __MAX((width + 0x3f) & ~ 0x3f, 16384);
  59     cache->buffer = aligned_alloc(64, cache->size);
  60     if (!cache->buffer)
  61         return VLC_EGENERIC;
  62 #else
  63     (void) cache; (void) width;
  64 #endif
  65     return VLC_SUCCESS;
  66 }
  67
  68 void CopyCleanCache(copy_cache_t *cache)
  69 {
  70 #ifdef CAN_COMPILE_SSE2
  71     aligned_free(cache->buffer);
  72     cache->buffer = NULL;
  73     cache->size   = 0;
  74 #else
  75     (void) cache;
  76 #endif
  77 }
  78
  79 #ifdef CAN_COMPILE_SSE2
/* Copy 16/64 bytes from srcp to dstp, loading the data with the SSE>=2
 * instruction passed as `load` and storing it with the SSE>=2 instruction
 * passed as `store`.
 */
  83
  84 #define COPY16_SHIFTR(x) \
  85     "psrlw "x", %%xmm1\n"
  86 #define COPY16_SHIFTL(x) \
  87     "psllw "x", %%xmm1\n"
  88
  89 #define COPY16_S(dstp, srcp, load, store, shiftstr) \
  90     asm volatile (                      \
  91         load "  0(%[src]), %%xmm1\n"    \
  92         shiftstr                        \
  93         store " %%xmm1,    0(%[dst])\n" \
  94         : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1")
  95
  96 #define COPY16(dstp, srcp, load, store) COPY16_S(dstp, srcp, load, store, "")
  97
  98 #define COPY64_SHIFTR(x) \
  99     "psrlw "x", %%xmm1\n" \
 100     "psrlw "x", %%xmm2\n" \
 101     "psrlw "x", %%xmm3\n" \
 102     "psrlw "x", %%xmm4\n"
 103 #define COPY64_SHIFTL(x) \
 104     "psllw "x", %%xmm1\n" \
 105     "psllw "x", %%xmm2\n" \
 106     "psllw "x", %%xmm3\n" \
 107     "psllw "x", %%xmm4\n"
 108
 109 #define COPY64_S(dstp, srcp, load, store, shiftstr) \
 110     asm volatile (                      \
 111         load "  0(%[src]), %%xmm1\n"    \
 112         load " 16(%[src]), %%xmm2\n"    \
 113         load " 32(%[src]), %%xmm3\n"    \
 114         load " 48(%[src]), %%xmm4\n"    \
 115         shiftstr                        \
 116         store " %%xmm1,    0(%[dst])\n" \
 117         store " %%xmm2,   16(%[dst])\n" \
 118         store " %%xmm3,   32(%[dst])\n" \
 119         store " %%xmm4,   48(%[dst])\n" \
 120         : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1", "xmm2", "xmm3", "xmm4")
 121
 122 #define COPY64(dstp, srcp, load, store) \
 123     COPY64_S(dstp, srcp, load, store, "")
 124
 125 #ifdef COPY_TEST_NOOPTIM
 126 # undef vlc_CPU_SSE4_1
 127 # define vlc_CPU_SSE4_1() (0)
 128 # undef vlc_CPU_SSE3
 129 # define vlc_CPU_SSE3() (0)
 130 # undef vlc_CPU_SSSE3
 131 # define vlc_CPU_SSSE3() (0)
 132 # undef vlc_CPU_SSE2
 133 # define vlc_CPU_SSE2() (0)
 134 #endif
 135
 136 /* Optimized copy from "Uncacheable Speculative Write Combining" memory
 137  * as used by some video surface.
 138  * XXX It is really efficient only when SSE4.1 is available.
 139  */
 140 VLC_SSE
 141 static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
 142                          const uint8_t *src, size_t src_pitch,
 143                          unsigned width, unsigned height, int bitshift)
 144 {
 145     assert(((intptr_t)dst & 0x0f) == 0 && (dst_pitch & 0x0f) == 0);
 146
 147     asm volatile ("mfence");
 148
 149 #define SSE_USWC_COPY(shiftstr16, shiftstr64) \
 150     for (unsigned y = 0; y < height; y++) { \
 151         const unsigned unaligned = (-(uintptr_t)src) & 0x0f; \
 152         unsigned x = unaligned; \
 153         if (vlc_CPU_SSE4_1()) { \
 154             if (!unaligned) { \
 155                 for (; x+63 < width; x += 64) \
 156                     COPY64_S(&dst[x], &src[x], "movntdqa", "movdqa", shiftstr64); \
 157             } else { \
 158                 COPY16_S(dst, src, "movdqu", "movdqa", shiftstr16); \
 159                 for (; x+63 < width; x += 64) \
 160                     COPY64_S(&dst[x], &src[x], "movntdqa", "movdqu", shiftstr64); \
 161             } \
 162         } else { \
 163             if (!unaligned) { \
 164                 for (; x+63 < width; x += 64) \
 165                     COPY64_S(&dst[x], &src[x], "movdqa", "movdqa", shiftstr64); \
 166             } else { \
 167                 COPY16_S(dst, src, "movdqu", "movdqa", shiftstr16); \
 168                 for (; x+63 < width; x += 64) \
 169                     COPY64_S(&dst[x], &src[x], "movdqa", "movdqu", shiftstr64); \
 170             } \
 171         } \
 172         /* The following should not happen since buffers are generally well aligned */ \
 173         if (x < width) \
 174             CopyPlane(&dst[x], dst_pitch - x, &src[x], src_pitch - x, 1, bitshift); \
 175         src += src_pitch; \
 176         dst += dst_pitch; \
 177     }
 178
 179     switch (bitshift)
 180     {
 181         case 0:
 182             SSE_USWC_COPY("", "")
 183             break;
 184         case -6:
 185             SSE_USWC_COPY(COPY16_SHIFTL("$6"), COPY64_SHIFTL("$6"))
 186             break;
 187         case 6:
 188             SSE_USWC_COPY(COPY16_SHIFTR("$6"), COPY64_SHIFTR("$6"))
 189             break;
 190         case 2:
 191             SSE_USWC_COPY(COPY16_SHIFTR("$2"), COPY64_SHIFTR("$2"))
 192             break;
 193         case -2:
 194             SSE_USWC_COPY(COPY16_SHIFTL("$2"), COPY64_SHIFTL("$2"))
 195             break;
 196         case 4:
 197             SSE_USWC_COPY(COPY16_SHIFTR("$4"), COPY64_SHIFTR("$4"))
 198             break;
 199         case -4:
 200             SSE_USWC_COPY(COPY16_SHIFTL("$2"), COPY64_SHIFTL("$2"))
 201             break;
 202         default:
 203             vlc_assert_unreachable();
 204     }
 205 #undef SSE_USWC_COPY
 206
 207     asm volatile ("mfence");
 208 }
 209
 210 VLC_SSE
 211 static void Copy2d(uint8_t *dst, size_t dst_pitch,
 212                    const uint8_t *src, size_t src_pitch,
 213                    unsigned width, unsigned height)
 214 {
 215     assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);
 216
 217     for (unsigned y = 0; y < height; y++) {
 218         unsigned x = 0;
 219
 220         bool unaligned = ((intptr_t)dst & 0x0f) != 0;
 221         if (!unaligned) {
 222             for (; x+63 < width; x += 64)
 223                 COPY64(&dst[x], &src[x], "movdqa", "movntdq");
 224         } else {
 225             for (; x+63 < width; x += 64)
 226                 COPY64(&dst[x], &src[x], "movdqa", "movdqu");
 227         }
 228
 229         for (; x < width; x++)
 230             dst[x] = src[x];
 231
 232         src += src_pitch;
 233         dst += dst_pitch;
 234     }
 235 }
 236
 237 VLC_SSE
 238 static void
 239 SSE_InterleaveUV(uint8_t *dst, size_t dst_pitch,
 240                  uint8_t *srcu, size_t srcu_pitch,
 241                  uint8_t *srcv, size_t srcv_pitch,
 242                  unsigned int width, unsigned int height, uint8_t pixel_size)
 243 {
 244     assert(!((intptr_t)srcu & 0xf) && !(srcu_pitch & 0x0f) &&
 245            !((intptr_t)srcv & 0xf) && !(srcv_pitch & 0x0f));
 246
 247     static const uint8_t shuffle_8[] = { 0, 8,
 248                                          1, 9,
 249                                          2, 10,
 250                                          3, 11,
 251                                          4, 12,
 252                                          5, 13,
 253                                          6, 14,
 254                                          7, 15 };
 255     static const uint8_t shuffle_16[] = { 0, 1, 8, 9,
 256                                           2, 3, 10, 11,
 257                                           4, 5, 12, 13,
 258                                           6, 7, 14, 15 };
 259     const uint8_t *shuffle = pixel_size == 1 ? shuffle_8 : shuffle_16;
 260
 261     for (unsigned int y = 0; y < height; ++y)
 262     {
 263         unsigned int    x;
 264
 265 #define LOAD2X32                        \
 266     "movhpd 0x00(%[src2]), %%xmm0\n"    \
 267     "movlpd 0x00(%[src1]), %%xmm0\n"    \
 268                                         \
 269     "movhpd 0x08(%[src2]), %%xmm1\n"    \
 270     "movlpd 0x08(%[src1]), %%xmm1\n"    \
 271                                         \
 272     "movhpd 0x10(%[src2]), %%xmm2\n"    \
 273     "movlpd 0x10(%[src1]), %%xmm2\n"    \
 274                                         \
 275     "movhpd 0x18(%[src2]), %%xmm3\n"    \
 276     "movlpd 0x18(%[src1]), %%xmm3\n"
 277
 278 #define STORE64                         \
 279     "movdqu %%xmm0, 0x00(%[dst])\n"     \
 280     "movdqu %%xmm1, 0x10(%[dst])\n"     \
 281     "movdqu %%xmm2, 0x20(%[dst])\n"     \
 282     "movdqu %%xmm3, 0x30(%[dst])\n"
 283
 284 #ifdef CAN_COMPILE_SSSE3
 285         if (vlc_CPU_SSSE3())
 286             for (x = 0; x < (width & ~31); x += 32)
 287                 asm volatile
 288                     (
 289                         "movdqu (%[shuffle]), %%xmm7\n"
 290                         LOAD2X32
 291                         "pshufb %%xmm7, %%xmm0\n"
 292                         "pshufb %%xmm7, %%xmm1\n"
 293                         "pshufb %%xmm7, %%xmm2\n"
 294                         "pshufb %%xmm7, %%xmm3\n"
 295                         STORE64
 296                         : : [dst]"r"(dst+2*x),
 297                             [src1]"r"(srcu+x), [src2]"r"(srcv+x),
 298                             [shuffle]"r"(shuffle)
 299                         : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7"
 300                     );
 301         else
 302 #endif
 303
 304         {
 305             assert(pixel_size == 1);
 306             for (x = 0; x < (width & ~31); x += 32)
 307                 asm volatile
 308                     (
 309                         LOAD2X32
 310                         "movhlps   %%xmm0, %%xmm4\n"
 311                         "punpcklbw %%xmm4, %%xmm0\n"
 312
 313                         "movhlps   %%xmm1, %%xmm4\n"
 314                         "punpcklbw %%xmm4, %%xmm1\n"
 315
 316                         "movhlps   %%xmm2, %%xmm4\n"
 317                         "punpcklbw %%xmm4, %%xmm2\n"
 318
 319                         "movhlps   %%xmm3, %%xmm4\n"
 320                         "punpcklbw %%xmm4, %%xmm3\n"
 321                         STORE64
 322                         : : [dst]"r"(dst+2*x),
 323                             [src1]"r"(srcu+x), [src2]"r"(srcv+x)
 324                         : "memory",
 325                           "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm7"
 326                     );
 327         }
 328 #undef LOAD2X32
 329 #undef STORE64
 330
 331         if (pixel_size == 1)
 332         {
 333             for (; x < width; x++) {
 334                 dst[2*x+0] = srcu[x];
 335                 dst[2*x+1] = srcv[x];
 336             }
 337         }
 338         else
 339         {
 340             for (; x < width; x+= 2) {
 341                 dst[2*x+0] = srcu[x];
 342                 dst[2*x+1] = srcu[x + 1];
 343                 dst[2*x+2] = srcv[x];
 344                 dst[2*x+3] = srcv[x + 1];
 345             }
 346         }
 347         srcu += srcu_pitch;
 348         srcv += srcv_pitch;
 349         dst += dst_pitch;
 350     }
 351 }
 352
 353 VLC_SSE
 354 static void SSE_SplitUV(uint8_t *dstu, size_t dstu_pitch,
 355                         uint8_t *dstv, size_t dstv_pitch,
 356                         const uint8_t *src, size_t src_pitch,
 357                         unsigned width, unsigned height, uint8_t pixel_size)
 358 {
 359     assert(pixel_size == 1 || pixel_size == 2);
 360     assert(((intptr_t)src & 0xf) == 0 && (src_pitch & 0x0f) == 0);
 361
 362 #define LOAD64 \
 363     "movdqa  0(%[src]), %%xmm0\n" \
 364     "movdqa 16(%[src]), %%xmm1\n" \
 365     "movdqa 32(%[src]), %%xmm2\n" \
 366     "movdqa 48(%[src]), %%xmm3\n"
 367
 368 #define STORE2X32 \
 369     "movq   %%xmm0,   0(%[dst1])\n" \
 370     "movq   %%xmm1,   8(%[dst1])\n" \
 371     "movhpd %%xmm0,   0(%[dst2])\n" \
 372     "movhpd %%xmm1,   8(%[dst2])\n" \
 373     "movq   %%xmm2,  16(%[dst1])\n" \
 374     "movq   %%xmm3,  24(%[dst1])\n" \
 375     "movhpd %%xmm2,  16(%[dst2])\n" \
 376     "movhpd %%xmm3,  24(%[dst2])\n"
 377
 378 #ifdef CAN_COMPILE_SSSE3
 379     if (vlc_CPU_SSSE3())
 380     {
 381         static const uint8_t shuffle_8[] = { 0, 2, 4, 6, 8, 10, 12, 14,
 382                                              1, 3, 5, 7, 9, 11, 13, 15 };
 383         static const uint8_t shuffle_16[] = {  0,  1,  4,  5,  8,  9, 12, 13,
 384                                                2,  3,  6,  7, 10, 11, 14, 15 };
 385         const uint8_t *shuffle = pixel_size == 1 ? shuffle_8 : shuffle_16;
 386         for (unsigned y = 0; y < height; y++) {
 387             unsigned x = 0;
 388             for (; x < (width & ~31); x += 32) {
 389                 asm volatile (
 390                     "movdqu (%[shuffle]), %%xmm7\n"
 391                     LOAD64
 392                     "pshufb  %%xmm7, %%xmm0\n"
 393                     "pshufb  %%xmm7, %%xmm1\n"
 394                     "pshufb  %%xmm7, %%xmm2\n"
 395                     "pshufb  %%xmm7, %%xmm3\n"
 396                     STORE2X32
 397                     : : [dst1]"r"(&dstu[x]), [dst2]"r"(&dstv[x]), [src]"r"(&src[2*x]), [shuffle]"r"(shuffle) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7");
 398             }
 399             if (pixel_size == 1)
 400             {
 401                 for (; x < width; x++) {
 402                     dstu[x] = src[2*x+0];
 403                     dstv[x] = src[2*x+1];
 404                 }
 405             }
 406             else
 407             {
 408                 for (; x < width; x+= 2) {
 409                     dstu[x] = src[2*x+0];
 410                     dstu[x+1] = src[2*x+1];
 411                     dstv[x] = src[2*x+2];
 412                     dstv[x+1] = src[2*x+3];
 413                 }
 414             }
 415             src  += src_pitch;
 416             dstu += dstu_pitch;
 417             dstv += dstv_pitch;
 418         }
 419     } else
 420 #endif
 421     {
 422         assert(pixel_size == 1);
 423         static const uint8_t mask[] = { 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00,
 424                                         0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00 };
 425
 426         for (unsigned y = 0; y < height; y++)
 427         {
 428             unsigned x = 0;
 429             for (; x < (width & ~31); x += 32) {
 430                 asm volatile (
 431                     "movdqu (%[mask]), %%xmm7\n"
 432                     LOAD64
 433                     "movdqa   %%xmm0, %%xmm4\n"
 434                     "movdqa   %%xmm1, %%xmm5\n"
 435                     "movdqa   %%xmm2, %%xmm6\n"
 436                     "psrlw    $8,     %%xmm0\n"
 437                     "psrlw    $8,     %%xmm1\n"
 438                     "pand     %%xmm7, %%xmm4\n"
 439                     "pand     %%xmm7, %%xmm5\n"
 440                     "pand     %%xmm7, %%xmm6\n"
 441                     "packuswb %%xmm4, %%xmm0\n"
 442                     "packuswb %%xmm5, %%xmm1\n"
 443                     "pand     %%xmm3, %%xmm7\n"
 444                     "psrlw    $8,     %%xmm2\n"
 445                     "psrlw    $8,     %%xmm3\n"
 446                     "packuswb %%xmm6, %%xmm2\n"
 447                     "packuswb %%xmm7, %%xmm3\n"
 448                     STORE2X32
 449                     : : [dst2]"r"(&dstu[x]), [dst1]"r"(&dstv[x]), [src]"r"(&src[2*x]), [mask]"r"(mask) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
 450             }
 451             for (; x < width; x++) {
 452                 dstu[x] = src[2*x+0];
 453                 dstv[x] = src[2*x+1];
 454             }
 455             src  += src_pitch;
 456             dstu += dstu_pitch;
 457             dstv += dstv_pitch;
 458         }
 459     }
 460 #undef STORE2X32
 461 #undef LOAD64
 462 }
 463
 464 static void SSE_CopyPlane(uint8_t *dst, size_t dst_pitch,
 465                           const uint8_t *src, size_t src_pitch,
 466                           uint8_t *cache, size_t cache_size,
 467                           unsigned height, int bitshift)
 468 {
 469     const size_t copy_pitch = __MIN(src_pitch, dst_pitch);
 470     const unsigned w16 = (copy_pitch+15) & ~15;
 471     const unsigned hstep = cache_size / w16;
 472     const unsigned cache_width = __MIN(src_pitch, hstep);
 473     assert(hstep > 0);
 474
 475     /* If SSE4.1: CopyFromUswc is faster than memcpy */
 476     if (!vlc_CPU_SSE4_1() && bitshift == 0 && src_pitch == dst_pitch)
 477         memcpy(dst, src, copy_pitch * height);
 478     else
 479     for (unsigned y = 0; y < height; y += hstep) {
 480         const unsigned hblock =  __MIN(hstep, height - y);
 481
 482         /* Copy a bunch of line into our cache */
 483         CopyFromUswc(cache, w16, src, src_pitch, cache_width, hblock, bitshift);
 484
 485         /* Copy from our cache to the destination */
 486         Copy2d(dst, dst_pitch, cache, w16, copy_pitch, hblock);
 487
 488         /* */
 489         src += src_pitch * hblock;
 490         dst += dst_pitch * hblock;
 491     }
 492 }
 493
 494 static void
 495 SSE_InterleavePlanes(uint8_t *dst, size_t dst_pitch,
 496                      const uint8_t *srcu, size_t srcu_pitch,
 497                      const uint8_t *srcv, size_t srcv_pitch,
 498                      uint8_t *cache, size_t cache_size,
 499                      unsigned int height, uint8_t pixel_size, int bitshift)
 500 {
 501     assert(srcu_pitch == srcv_pitch);
 502     size_t copy_pitch = __MIN(dst_pitch / 2, srcu_pitch);
 503     unsigned int const  w16 = (srcu_pitch+15) & ~15;
 504     unsigned int const  hstep = (cache_size) / (2*w16);
 505     const unsigned cacheu_width = __MIN(srcu_pitch, hstep);
 506     const unsigned cachev_width = __MIN(srcv_pitch, hstep);
 507     assert(hstep > 0);
 508
 509     for (unsigned int y = 0; y < height; y += hstep)
 510     {
 511         unsigned int const      hblock = __MIN(hstep, height - y);
 512
 513         /* Copy a bunch of line into our cache */
 514         CopyFromUswc(cache, w16, srcu, srcu_pitch, cacheu_width, hblock, bitshift);
 515         CopyFromUswc(cache+w16*hblock, w16, srcv, srcv_pitch,
 516                      cachev_width, hblock, bitshift);
 517
 518         /* Copy from our cache to the destination */
 519         SSE_InterleaveUV(dst, dst_pitch, cache, w16,
 520                          cache + w16 * hblock, w16,
 521                          copy_pitch, hblock, pixel_size);
 522
 523         /* */
 524         srcu += hblock * srcu_pitch;
 525         srcv += hblock * srcv_pitch;
 526         dst += hblock * dst_pitch;
 527     }
 528 }
 529
 530 static void SSE_SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
 531                             uint8_t *dstv, size_t dstv_pitch,
 532                             const uint8_t *src, size_t src_pitch,
 533                             uint8_t *cache, size_t cache_size,
 534                             unsigned height, uint8_t pixel_size, int bitshift)
 535 {
 536     size_t copy_pitch = __MIN(__MIN(src_pitch / 2, dstu_pitch), dstv_pitch);
 537     const unsigned w16 = (src_pitch+15) & ~15;
 538     const unsigned hstep = cache_size / w16;
 539     const unsigned cache_width = __MIN(src_pitch, hstep);
 540     assert(hstep > 0);
 541
 542     for (unsigned y = 0; y < height; y += hstep) {
 543         const unsigned hblock =  __MIN(hstep, height - y);
 544
 545         /* Copy a bunch of line into our cache */
 546         CopyFromUswc(cache, w16, src, src_pitch, cache_width, hblock, bitshift);
 547
 548         /* Copy from our cache to the destination */
 549         SSE_SplitUV(dstu, dstu_pitch, dstv, dstv_pitch,
 550                     cache, w16, copy_pitch, hblock, pixel_size);
 551
 552         /* */
 553         src  += src_pitch  * hblock;
 554         dstu += dstu_pitch * hblock;
 555         dstv += dstv_pitch * hblock;
 556     }
 557 }
 558
 559 static void SSE_Copy420_P_to_P(picture_t *dst, const uint8_t *src[static 3],
 560                                const size_t src_pitch[static 3], unsigned height,
 561                                const copy_cache_t *cache)
 562 {
 563     for (unsigned n = 0; n < 3; n++) {
 564         const unsigned d = n > 0 ? 2 : 1;
 565         SSE_CopyPlane(dst->p[n].p_pixels, dst->p[n].i_pitch,
 566                       src[n], src_pitch[n],
 567                       cache->buffer, cache->size,
 568                       (height+d-1)/d, 0);
 569     }
 570     asm volatile ("emms");
 571 }
 572
 573
 574 static void SSE_Copy420_SP_to_SP(picture_t *dst, const uint8_t *src[static 2],
 575                                  const size_t src_pitch[static 2], unsigned height,
 576                                  const copy_cache_t *cache)
 577 {
 578     SSE_CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch, src[0], src_pitch[0],
 579                   cache->buffer, cache->size, height, 0);
 580     SSE_CopyPlane(dst->p[1].p_pixels, dst->p[1].i_pitch, src[1], src_pitch[1],
 581                   cache->buffer, cache->size, (height+1) / 2, 0);
 582     asm volatile ("emms");
 583 }
 584
 585 static void
 586 SSE_Copy420_SP_to_P(picture_t *dest, const uint8_t *src[static 2],
 587                     const size_t src_pitch[static 2], unsigned int height,
 588                     uint8_t pixel_size, int bitshift, const copy_cache_t *cache)
 589 {
 590     SSE_CopyPlane(dest->p[0].p_pixels, dest->p[0].i_pitch,
 591                   src[0], src_pitch[0], cache->buffer, cache->size, height, bitshift);
 592
 593     SSE_SplitPlanes(dest->p[1].p_pixels, dest->p[1].i_pitch,
 594                     dest->p[2].p_pixels, dest->p[2].i_pitch,
 595                     src[1], src_pitch[1], cache->buffer, cache->size,
 596                     (height+1) / 2, pixel_size, bitshift);
 597     asm volatile ("emms");
 598 }
 599
 600 static void SSE_Copy420_P_to_SP(picture_t *dst, const uint8_t *src[static 3],
 601                                 const size_t src_pitch[static 3],
 602                                 unsigned height, uint8_t pixel_size,
 603                                 int bitshift, const copy_cache_t *cache)
 604 {
 605     SSE_CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch, src[0], src_pitch[0],
 606                   cache->buffer, cache->size, height, bitshift);
 607     SSE_InterleavePlanes(dst->p[1].p_pixels, dst->p[1].i_pitch,
 608                          src[U_PLANE], src_pitch[U_PLANE],
 609                          src[V_PLANE], src_pitch[V_PLANE],
 610                          cache->buffer, cache->size, (height+1) / 2, pixel_size, bitshift);
 611     asm volatile ("emms");
 612 }
 613 #undef COPY64
 614 #endif /* CAN_COMPILE_SSE2 */
 615
 616 static void CopyPlane(uint8_t *dst, size_t dst_pitch,
 617                       const uint8_t *src, size_t src_pitch,
 618                       unsigned height, int bitshift)
 619 {
 620     const size_t copy_pitch = __MIN(src_pitch, dst_pitch);
 621     if (bitshift != 0)
 622     {
 623         for (unsigned y = 0; y < height; y++)
 624         {
 625             uint16_t *dst16 = (uint16_t *) dst;
 626             const uint16_t *src16 = (const uint16_t *) src;
 627
 628             if (bitshift > 0)
 629                 for (unsigned x = 0; x < (copy_pitch / 2); x++)
 630                     *dst16++ = (*src16++) >> (bitshift & 0xf);
 631             else
 632                 for (unsigned x = 0; x < (copy_pitch / 2); x++)
 633                     *dst16++ = (*src16++) << ((-bitshift) & 0xf);
 634             src += src_pitch;
 635             dst += dst_pitch;
 636         }
 637     }
 638     else if (src_pitch == dst_pitch)
 639         memcpy(dst, src, copy_pitch * height);
 640     else
 641     for (unsigned y = 0; y < height; y++) {
 642         memcpy(dst, src, copy_pitch);
 643         src += src_pitch;
 644         dst += dst_pitch;
 645     }
 646 }
 647
 648 void CopyPacked(picture_t *dst, const uint8_t *src, const size_t src_pitch,
 649                 unsigned height, const copy_cache_t *cache)
 650 {
 651     assert(dst);
 652     assert(src); assert(src_pitch);
 653     assert(height);
 654
 655 #ifdef CAN_COMPILE_SSE2
 656     if (vlc_CPU_SSE4_1())
 657         return SSE_CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch, src, src_pitch,
 658                              cache->buffer, cache->size, height, 0);
 659 #else
 660     (void) cache;
 661 #endif
 662         CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch, src, src_pitch,
 663                   height, 0);
 664 }
 665
 666 void Copy420_SP_to_SP(picture_t *dst, const uint8_t *src[static 2],
 667                       const size_t src_pitch[static 2], unsigned height,
 668                       const copy_cache_t *cache)
 669 {
 670     ASSERT_2PLANES;
 671 #ifdef CAN_COMPILE_SSE2
 672     if (vlc_CPU_SSE2())
 673         return SSE_Copy420_SP_to_SP(dst, src, src_pitch, height, cache);
 674 #else
 675     (void) cache;
 676 #endif
 677
 678     CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
 679               src[0], src_pitch[0], height, 0);
 680     CopyPlane(dst->p[1].p_pixels, dst->p[1].i_pitch,
 681               src[1], src_pitch[1], (height+1)/2, 0);
 682 }
 683
 684 #define SPLIT_PLANES(type, pitch_den) do { \
 685     size_t copy_pitch = __MIN(__MIN(src_pitch / pitch_den, dstu_pitch), dstv_pitch); \
 686     for (unsigned y = 0; y < height; y++) { \
 687         for (unsigned x = 0; x < copy_pitch; x++) { \
 688             ((type *) dstu)[x] = ((const type *) src)[2*x+0]; \
 689             ((type *) dstv)[x] = ((const type *) src)[2*x+1]; \
 690         } \
 691         src  += src_pitch; \
 692         dstu += dstu_pitch; \
 693         dstv += dstv_pitch; \
 694     } \
 695 } while(0)
 696
 697 #define SPLIT_PLANES_SHIFTR(type, pitch_den, bitshift) do { \
 698     size_t copy_pitch = __MIN(__MIN(src_pitch / pitch_den, dstu_pitch), dstv_pitch); \
 699     for (unsigned y = 0; y < height; y++) { \
 700         for (unsigned x = 0; x < copy_pitch; x++) { \
 701             ((type *) dstu)[x] = (((const type *) src)[2*x+0]) >> (bitshift); \
 702             ((type *) dstv)[x] = (((const type *) src)[2*x+1]) >> (bitshift); \
 703         } \
 704         src  += src_pitch; \
 705         dstu += dstu_pitch; \
 706         dstv += dstv_pitch; \
 707     } \
 708 } while(0)
 709
 710 #define SPLIT_PLANES_SHIFTL(type, pitch_den, bitshift) do { \
 711     size_t copy_pitch = __MIN(__MIN(src_pitch / pitch_den, dstu_pitch), dstv_pitch); \
 712     for (unsigned y = 0; y < height; y++) { \
 713         for (unsigned x = 0; x < copy_pitch; x++) { \
 714             ((type *) dstu)[x] = (((const type *) src)[2*x+0]) << (bitshift); \
 715             ((type *) dstv)[x] = (((const type *) src)[2*x+1]) << (bitshift); \
 716         } \
 717         src  += src_pitch; \
 718         dstu += dstu_pitch; \
 719         dstv += dstv_pitch; \
 720     } \
 721 } while(0)
 722
 723 static void SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
 724                         uint8_t *dstv, size_t dstv_pitch,
 725                         const uint8_t *src, size_t src_pitch, unsigned height)
 726 {
 727     SPLIT_PLANES(uint8_t, 2);
 728 }
 729
 730 static void SplitPlanes16(uint8_t *dstu, size_t dstu_pitch,
 731                           uint8_t *dstv, size_t dstv_pitch,
 732                           const uint8_t *src, size_t src_pitch, unsigned height,
 733                           int bitshift)
 734 {
 735     if (bitshift == 0)
 736         SPLIT_PLANES(uint16_t, 4);
 737     else if (bitshift > 0)
 738         SPLIT_PLANES_SHIFTR(uint16_t, 4, bitshift & 0xf);
 739     else
 740         SPLIT_PLANES_SHIFTL(uint16_t, 4, (-bitshift) & 0xf);
 741 }
 742
 743 void Copy420_SP_to_P(picture_t *dst, const uint8_t *src[static 2],
 744                      const size_t src_pitch[static 2], unsigned height,
 745                      const copy_cache_t *cache)
 746 {
 747     ASSERT_2PLANES;
 748 #ifdef CAN_COMPILE_SSE2
 749     if (vlc_CPU_SSE2())
 750         return SSE_Copy420_SP_to_P(dst, src, src_pitch, height, 1, 0, cache);
 751 #else
 752     VLC_UNUSED(cache);
 753 #endif
 754
 755     CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
 756               src[0], src_pitch[0], height, 0);
 757     SplitPlanes(dst->p[1].p_pixels, dst->p[1].i_pitch,
 758                 dst->p[2].p_pixels, dst->p[2].i_pitch,
 759                 src[1], src_pitch[1], (height+1)/2);
 760 }
 761
 762 void Copy420_16_SP_to_P(picture_t *dst, const uint8_t *src[static 2],
 763                         const size_t src_pitch[static 2], unsigned height,
 764                         int bitshift, const copy_cache_t *cache)
 765 {
 766     ASSERT_2PLANES;
 767     assert(bitshift >= -6 && bitshift <= 6 && (bitshift % 2 == 0));
 768
 769 #ifdef CAN_COMPILE_SSE3
 770     if (vlc_CPU_SSSE3())
 771         return SSE_Copy420_SP_to_P(dst, src, src_pitch, height, 2, bitshift, cache);
 772 #else
 773     VLC_UNUSED(cache);
 774 #endif
 775
 776     CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
 777               src[0], src_pitch[0], height, bitshift);
 778     SplitPlanes16(dst->p[1].p_pixels, dst->p[1].i_pitch,
 779                   dst->p[2].p_pixels, dst->p[2].i_pitch,
 780                   src[1], src_pitch[1], (height+1)/2, bitshift);
 781 }
 782
 783 #define INTERLEAVE_UV() do { \
 784     for ( unsigned int line = 0; line < copy_lines; line++ ) { \
 785         for ( unsigned int col = 0; col < copy_pitch; col++ ) { \
 786             *dstUV++ = *srcU++; \
 787             *dstUV++ = *srcV++; \
 788         } \
 789         dstUV += i_extra_pitch_uv; \
 790         srcU  += i_extra_pitch_u; \
 791         srcV  += i_extra_pitch_v; \
 792     } \
 793 }while(0)
 794
 795 #define INTERLEAVE_UV_SHIFTR(bitshitf) do { \
 796     for ( unsigned int line = 0; line < copy_lines; line++ ) { \
 797         for ( unsigned int col = 0; col < copy_pitch; col++ ) { \
 798             *dstUV++ = (*srcU++) >> (bitshitf); \
 799             *dstUV++ = (*srcV++) >> (bitshitf); \
 800         } \
 801         dstUV += i_extra_pitch_uv; \
 802         srcU  += i_extra_pitch_u; \
 803         srcV  += i_extra_pitch_v; \
 804     } \
 805 }while(0)
 806
 807 #define INTERLEAVE_UV_SHIFTL(bitshitf) do { \
 808     for ( unsigned int line = 0; line < copy_lines; line++ ) { \
 809         for ( unsigned int col = 0; col < copy_pitch; col++ ) { \
 810             *dstUV++ = (*srcU++) << (bitshitf); \
 811             *dstUV++ = (*srcV++) << (bitshitf); \
 812         } \
 813         dstUV += i_extra_pitch_uv; \
 814         srcU  += i_extra_pitch_u; \
 815         srcV  += i_extra_pitch_v; \
 816     } \
 817 }while(0)
 818
 819 void Copy420_P_to_SP(picture_t *dst, const uint8_t *src[static 3],
 820                      const size_t src_pitch[static 3], unsigned height,
 821                      const copy_cache_t *cache)
 822 {
 823     ASSERT_3PLANES;
 824 #ifdef CAN_COMPILE_SSE2
 825     if (vlc_CPU_SSE2())
 826         return SSE_Copy420_P_to_SP(dst, src, src_pitch, height, 1, 0, cache);
 827 #else
 828     (void) cache;
 829 #endif
 830
 831     CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
 832               src[0], src_pitch[0], height, 0);
 833
 834     const unsigned copy_lines = (height+1) / 2;
 835     const unsigned copy_pitch = __MIN(src_pitch[1], dst->p[1].i_pitch / 2);
 836
 837     const int i_extra_pitch_uv = dst->p[1].i_pitch - 2 * copy_pitch;
 838     const int i_extra_pitch_u  = src_pitch[U_PLANE] - copy_pitch;
 839     const int i_extra_pitch_v  = src_pitch[V_PLANE] - copy_pitch;
 840
 841     uint8_t *dstUV = dst->p[1].p_pixels;
 842     const uint8_t *srcU  = src[U_PLANE];
 843     const uint8_t *srcV  = src[V_PLANE];
 844     INTERLEAVE_UV();
 845 }
 846
 847 void Copy420_16_P_to_SP(picture_t *dst, const uint8_t *src[static 3],
 848                         const size_t src_pitch[static 3], unsigned height,
 849                         int bitshift, const copy_cache_t *cache)
 850 {
 851     ASSERT_3PLANES;
 852     assert(bitshift >= -6 && bitshift <= 6 && (bitshift % 2 == 0));
 853 #ifdef CAN_COMPILE_SSE2
 854     if (vlc_CPU_SSSE3())
 855         return SSE_Copy420_P_to_SP(dst, src, src_pitch, height, 2, bitshift, cache);
 856 #else
 857     (void) cache;
 858 #endif
 859
 860     CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
 861               src[0], src_pitch[0], height, bitshift);
 862
 863     const unsigned copy_lines = (height+1) / 2;
 864     const unsigned copy_pitch = src_pitch[1] / 2;
 865
 866     const int i_extra_pitch_uv = dst->p[1].i_pitch / 2 - 2 * copy_pitch;
 867     const int i_extra_pitch_u  = src_pitch[U_PLANE] / 2 - copy_pitch;
 868     const int i_extra_pitch_v  = src_pitch[V_PLANE] / 2 - copy_pitch;
 869
 870     uint16_t *dstUV = (void*) dst->p[1].p_pixels;
 871     const uint16_t *srcU  = (const uint16_t *) src[U_PLANE];
 872     const uint16_t *srcV  = (const uint16_t *) src[V_PLANE];
 873
 874     if (bitshift == 0)
 875         INTERLEAVE_UV();
 876     else if (bitshift > 0)
 877         INTERLEAVE_UV_SHIFTR(bitshift & 0xf);
 878     else
 879         INTERLEAVE_UV_SHIFTL((-bitshift) & 0xf);
 880 }
 881
 882 void Copy420_P_to_P(picture_t *dst, const uint8_t *src[static 3],
 883                     const size_t src_pitch[static 3], unsigned height,
 884                     const copy_cache_t *cache)
 885 {
 886     ASSERT_3PLANES;
 887 #ifdef CAN_COMPILE_SSE2
 888     if (vlc_CPU_SSE2())
 889         return SSE_Copy420_P_to_P(dst, src, src_pitch, height, cache);
 890 #else
 891     (void) cache;
 892 #endif
 893
 894      CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
 895                src[0], src_pitch[0], height, 0);
 896      CopyPlane(dst->p[1].p_pixels, dst->p[1].i_pitch,
 897                src[1], src_pitch[1], (height+1) / 2, 0);
 898      CopyPlane(dst->p[2].p_pixels, dst->p[2].i_pitch,
 899                src[2], src_pitch[2], (height+1) / 2, 0);
 900 }
 901
 902 int picture_UpdatePlanes(picture_t *picture, uint8_t *data, unsigned pitch)
 903 {
 904     /* fill in buffer info in first plane */
 905     picture->p->p_pixels = data;
 906     picture->p->i_pitch  = pitch;
 907     picture->p->i_lines  = picture->format.i_height;
 908     assert(picture->p->i_visible_pitch <= picture->p->i_pitch);
 909     assert(picture->p->i_visible_lines <= picture->p->i_lines);
 910
 911     /*  Fill chroma planes for biplanar YUV */
 912     if (picture->format.i_chroma == VLC_CODEC_NV12 ||
 913         picture->format.i_chroma == VLC_CODEC_NV21 ||
 914         picture->format.i_chroma == VLC_CODEC_P010) {
 915
 916         for (int n = 1; n < picture->i_planes; n++) {
 917             const plane_t *o = &picture->p[n-1];
 918             plane_t *p = &picture->p[n];
 919
 920             p->p_pixels = o->p_pixels + o->i_lines * o->i_pitch;
 921             p->i_pitch  = pitch;
 922             p->i_lines  = picture->format.i_height;
 923             assert(p->i_visible_pitch <= p->i_pitch);
 924             assert(p->i_visible_lines <= p->i_lines);
 925         }
 926         /* The dx/d3d buffer is always allocated as NV12 */
 927         if (vlc_fourcc_AreUVPlanesSwapped(picture->format.i_chroma, VLC_CODEC_NV12)) {
 928             /* TODO : Swap NV21 UV planes to match NV12 */
 929             return VLC_EGENERIC;
 930         }
 931     }
 932
 933     /*  Fill chroma planes for planar YUV */
 934     else
 935     if (picture->format.i_chroma == VLC_CODEC_I420 ||
 936         picture->format.i_chroma == VLC_CODEC_J420 ||
 937         picture->format.i_chroma == VLC_CODEC_YV12) {
 938
 939         for (int n = 1; n < picture->i_planes; n++) {
 940             const plane_t *o = &picture->p[n-1];
 941             plane_t *p = &picture->p[n];
 942
 943             p->p_pixels = o->p_pixels + o->i_lines * o->i_pitch;
 944             p->i_pitch  = pitch / 2;
 945             p->i_lines  = picture->format.i_height / 2;
 946         }
 947         /* The dx/d3d buffer is always allocated as YV12 */
 948         if (vlc_fourcc_AreUVPlanesSwapped(picture->format.i_chroma, VLC_CODEC_YV12))
 949             picture_SwapUV( picture );
 950     }
 951     return VLC_SUCCESS;
 952 }
 953
 954 #ifdef COPY_TEST
 955
 956 #include <vlc_picture.h>
 957
 958 struct test_dst
 959 {
 960     vlc_fourcc_t chroma;
 961     int bitshift;
 962     union
 963     {
 964         void (*conv)(picture_t *, const uint8_t *[], const size_t [], unsigned,
 965                      const copy_cache_t *);
 966         void (*conv16)(picture_t *, const uint8_t *[], const size_t [], unsigned, int,
 967                      const copy_cache_t *);
 968     };
 969 };
 970
 971 struct test_conv
 972 {
 973     vlc_fourcc_t src_chroma;
 974     struct test_dst dsts[3];
 975 };
 976
 977 static const struct test_conv convs[] = {
 978     { .src_chroma = VLC_CODEC_NV12,
 979       .dsts = { { VLC_CODEC_I420, 0, .conv = Copy420_SP_to_P },
 980                 { VLC_CODEC_NV12, 0, .conv = Copy420_SP_to_SP } },
 981     },
 982     { .src_chroma = VLC_CODEC_I420,
 983       .dsts = { { VLC_CODEC_I420, 0, .conv = Copy420_P_to_P },
 984                 { VLC_CODEC_NV12, 0, .conv = Copy420_P_to_SP } },
 985     },
 986     { .src_chroma = VLC_CODEC_P010,
 987       .dsts = { { VLC_CODEC_I420_10L, 6, .conv16 = Copy420_16_SP_to_P } },
 988     },
 989     { .src_chroma = VLC_CODEC_I420_10L,
 990       .dsts = { { VLC_CODEC_P010, -6, .conv16 = Copy420_16_P_to_SP } },
 991     },
 992 };
 993 #define NB_CONVS ARRAY_SIZE(convs)
 994
 995 struct test_size
 996 {
 997     int i_width;
 998     int i_height;
 999     int i_visible_width;
1000     int i_visible_height;
1001 };
1002 static const struct test_size sizes[] = {
1003     { 1, 1, 1, 1 },
1004     { 3, 3, 3, 3 },
1005     { 65, 39, 65, 39 },
1006     { 560, 369, 540, 350 },
1007     { 1274, 721, 1200, 720 },
1008     { 1920, 1088, 1920, 1080 },
1009     { 3840, 2160, 3840, 2160 },
1010 #if 0 /* too long */
1011     { 8192, 8192, 8192, 8192 },
1012 #endif
1013 };
1014 #define NB_SIZES ARRAY_SIZE(sizes)
1015
1016 static void piccheck(picture_t *pic, const vlc_chroma_description_t *dsc,
1017                      bool init)
1018 {
1019 #define ASSERT_COLOR(good) do { \
1020     fprintf(stderr, "error: pixel doesn't match @ plane: %d: %d x %d: 0x%X vs 0x%X\n", i, x, y, *(--p), good); \
1021     assert(!"error: pixel doesn't match"); \
1022 } while(0)
1023
1024 #define PICCHECK(type_u, type_uv, colors_P, color_UV, pitch_den) do { \
1025     for (int i = 0; i < pic->i_planes; ++i) \
1026     { \
1027         const struct plane_t *plane = &pic->p[i]; \
1028         for (int y = 0; y < plane->i_visible_lines; ++y) \
1029         { \
1030             if (pic->i_planes == 2 && i == 1) \
1031             { \
1032                 type_uv *p = (type_uv *)&plane->p_pixels[y * plane->i_pitch]; \
1033                 for (int x = 0; x < plane->i_visible_pitch / 2 / pitch_den; ++x) \
1034                     if (init) \
1035                         *(p++) = color_UV; \
1036                     else if (*(p++) != color_UV) \
1037                         ASSERT_COLOR(color_UV); \
1038             } \
1039             else \
1040             { \
1041                 type_u *p = (type_u *) &plane->p_pixels[y * plane->i_pitch]; \
1042                 for (int x = 0; x < plane->i_visible_pitch / pitch_den; ++x) \
1043                     if (init) \
1044                         *(p++) = colors_P[i]; \
1045                     else if (*(p++) != colors_P[i]) \
1046                         ASSERT_COLOR(colors_P[i]); \
1047             } \
1048         } \
1049     } \
1050 } while (0)
1051
1052     assert(pic->i_planes == 2 || pic->i_planes == 3);
1053     assert(dsc->pixel_size == 1 || dsc->pixel_size == 2);
1054
1055     if (dsc->pixel_size == 1)
1056     {
1057         const uint8_t colors_8_P[3] = { 0x42, 0xF1, 0x36 };
1058         const uint16_t color_8_UV = ntoh16(0xF136);
1059         PICCHECK(uint8_t, uint16_t, colors_8_P, color_8_UV, 1);
1060     }
1061     else
1062     {
1063         const unsigned mask = (1 << dsc->pixel_bits) - 1;
1064         uint16_t colors_16_P[3] = { 0x1042 &mask, 0xF114 &mask, 0x3645 &mask};
1065
1066         switch (pic->format.i_chroma)
1067         {
1068             case VLC_CODEC_P010:
1069                 for (size_t i = 0; i < 3; ++i)
1070                     colors_16_P[i] <<= 6;
1071                 break;
1072             case VLC_CODEC_I420_10L:
1073                 break;
1074             default:
1075                 vlc_assert_unreachable();
1076         }
1077
1078         uint32_t color_16_UV = GetDWLE( &colors_16_P[1] );
1079
1080         PICCHECK(uint16_t, uint32_t, colors_16_P, color_16_UV, 2);
1081     }
1082 }
1083
1084 static void pic_rsc_destroy(picture_t *pic)
1085 {
1086     for (unsigned i = 0; i < 3; i++)
1087         free(pic->p[i].p_pixels);
1088     free(pic);
1089 }
1090
1091 static picture_t *pic_new_unaligned(const video_format_t *fmt)
1092 {
1093     /* Allocate a no-aligned picture in order to ease buffer overflow detection
1094      * from the source picture */
1095     const vlc_chroma_description_t *dsc = vlc_fourcc_GetChromaDescription(fmt->i_chroma);
1096     assert(dsc);
1097     picture_resource_t rsc = { .pf_destroy = pic_rsc_destroy };
1098     for (unsigned i = 0; i < dsc->plane_count; i++)
1099     {
1100         rsc.p[i].i_lines = ((fmt->i_visible_height + (dsc->p[i].h.den - 1)) / dsc->p[i].h.den) * dsc->p[i].h.num;
1101         rsc.p[i].i_pitch = ((fmt->i_visible_width + (dsc->p[i].w.den - 1)) / dsc->p[i].w.den) * dsc->p[i].w.num * dsc->pixel_size;
1102         rsc.p[i].p_pixels = malloc(rsc.p[i].i_lines * rsc.p[i].i_pitch);
1103         assert(rsc.p[i].p_pixels);
1104     }
1105     return picture_NewFromResource(fmt, &rsc);
1106 }
1107
1108 int main(void)
1109 {
1110     alarm(10);
1111
1112 #ifndef COPY_TEST_NOOPTIM
1113     if (!vlc_CPU_SSE2())
1114     {
1115         fprintf(stderr, "WARNING: could not test SSE\n");
1116         return 77;
1117     }
1118 #endif
1119
1120     for (size_t i = 0; i < NB_CONVS; ++i)
1121     {
1122         const struct test_conv *conv = &convs[i];
1123
1124         for (size_t j = 0; j < NB_SIZES; ++j)
1125         {
1126             const struct test_size *size = &sizes[j];
1127
1128             const vlc_chroma_description_t *src_dsc =
1129                 vlc_fourcc_GetChromaDescription(conv->src_chroma);
1130             assert(src_dsc);
1131
1132             video_format_t fmt;
1133             video_format_Init(&fmt, 0);
1134             video_format_Setup(&fmt, conv->src_chroma,
1135                                size->i_width, size->i_height,
1136                                size->i_visible_width, size->i_visible_height,
1137                                1, 1);
1138             picture_t *src = pic_new_unaligned(&fmt);
1139             assert(src);
1140             piccheck(src, src_dsc, true);
1141
1142             copy_cache_t cache;
1143             int ret = CopyInitCache(&cache, src->format.i_width
1144                                     * src_dsc->pixel_size);
1145             assert(ret == VLC_SUCCESS);
1146
1147             for (size_t f = 0; conv->dsts[f].chroma != 0; ++f)
1148             {
1149                 const struct test_dst *test_dst= &conv->dsts[f];
1150
1151                 const vlc_chroma_description_t *dst_dsc =
1152                     vlc_fourcc_GetChromaDescription(test_dst->chroma);
1153                 assert(dst_dsc);
1154                 fmt.i_chroma = test_dst->chroma;
1155                 picture_t *dst = picture_NewFromFormat(&fmt);
1156                 assert(dst);
1157
1158                 const uint8_t * src_planes[3] = { src->p[Y_PLANE].p_pixels,
1159                                                   src->p[U_PLANE].p_pixels,
1160                                                   src->p[V_PLANE].p_pixels };
1161                 const size_t    src_pitches[3] = { src->p[Y_PLANE].i_pitch,
1162                                                    src->p[U_PLANE].i_pitch,
1163                                                    src->p[V_PLANE].i_pitch };
1164
1165                 fprintf(stderr, "testing: %u x %u (vis: %u x %u) %4.4s -> %4.4s\n",
1166                         size->i_width, size->i_height,
1167                         size->i_visible_width, size->i_visible_height,
1168                         (const char *) &src->format.i_chroma,
1169                         (const char *) &dst->format.i_chroma);
1170                 if (test_dst->bitshift == 0)
1171                     test_dst->conv(dst, src_planes, src_pitches,
1172                                    src->format.i_visible_height, &cache);
1173                 else
1174                     test_dst->conv16(dst, src_planes, src_pitches,
1175                                    src->format.i_visible_height, test_dst->bitshift,
1176                                    &cache);
1177                 piccheck(dst, dst_dsc, false);
1178                 picture_Release(dst);
1179             }
1180             picture_Release(src);
1181             CopyCleanCache(&cache);
1182         }
1183     }
1184     return 0;
1185 }
1186
1187 #endif