modules/video_chroma/copy.c

   1 /*****************************************************************************
   2  * copy.c: Fast YV12/NV12 copy
   3  *****************************************************************************
   4  * Copyright (C) 2010 Laurent Aimar
   5  * $Id$
   6  *
   7  * Authors: Laurent Aimar <fenrir _AT_ videolan _DOT_ org>
   8  *          Victorien Le Couviour--Tuffet <victorien.lecouviour.tuffet@gmail.com>
   9  *
  10  * This program is free software; you can redistribute it and/or modify it
  11  * under the terms of the GNU Lesser General Public License as published by
  12  * the Free Software Foundation; either version 2.1 of the License, or
  13  * (at your option) any later version.
  14  *
  15  * This program is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  18  * GNU Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public License
  21  * along with this program; if not, write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
  23  *****************************************************************************/
  24
  25 #ifdef HAVE_CONFIG_H
  26 # include "config.h"
  27 #endif
  28
  29 #include <vlc_common.h>
  30 #include <vlc_picture.h>
  31 #include <vlc_cpu.h>
  32 #include <assert.h>
  33
  34 #include "copy.h"
  35
/* Debug check for source plane i: both the plane pointer src[i] and its
 * pitch src_pitch[i] must be non-zero.
 * Relies on variables named src and src_pitch being in scope where the macro
 * is expanded. It expands to two statements, so it must not be used as the
 * unbraced body of an if/else/loop. */
#define ASSERT_PLANE(i) assert(src[i]); \
    assert(src_pitch[i])

/* Debug checks shared by the two-plane copy entry points: dst must be
 * non-NULL, source planes 0 and 1 must be valid (see ASSERT_PLANE) and
 * height must be non-zero. Expects dst, src, src_pitch and height in scope. */
#define ASSERT_2PLANES \
    assert(dst); \
    ASSERT_PLANE(0); \
    ASSERT_PLANE(1); \
    assert(height)

/* Same as ASSERT_2PLANES, additionally validating source plane 2. */
#define ASSERT_3PLANES ASSERT_2PLANES; \
    ASSERT_PLANE(2)
  47
  48 int CopyInitCache(copy_cache_t *cache, unsigned width)
  49 {
  50 #ifdef CAN_COMPILE_SSE2
  51     cache->size = __MAX((width + 0x3f) & ~ 0x3f, 16384);
  52     cache->buffer = aligned_alloc(64, cache->size);
  53     if (!cache->buffer)
  54         return VLC_EGENERIC;
  55 #else
  56     (void) cache; (void) width;
  57 #endif
  58     return VLC_SUCCESS;
  59 }
  60
  61 void CopyCleanCache(copy_cache_t *cache)
  62 {
  63 #ifdef CAN_COMPILE_SSE2
  64     aligned_free(cache->buffer);
  65     cache->buffer = NULL;
  66     cache->size   = 0;
  67 #else
  68     (void) cache;
  69 #endif
  70 }
  71
  72 #ifdef CAN_COMPILE_SSE2
/* Copy 16/64 bytes from srcp to dstp loading data with the SSE>=2 instruction
 * load and storing data with the SSE>=2 instruction store.
 *
 * load and store are string literals holding the instruction mnemonic
 * (e.g. "movdqa", "movdqu", "movntdqa", "movntdq"); they are concatenated
 * with the operand strings to form the asm template. The caller is
 * responsible for picking mnemonics whose alignment requirements match
 * srcp and dstp.
 *
 * COPY16 goes through xmm1; COPY64 performs all four loads (xmm1..xmm4)
 * before the four stores. Both declare a "memory" clobber.
 */
#define COPY16(dstp, srcp, load, store) \
    asm volatile (                      \
        load "  0(%[src]), %%xmm1\n"    \
        store " %%xmm1,    0(%[dst])\n" \
        : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1")

#define COPY64(dstp, srcp, load, store) \
    asm volatile (                      \
        load "  0(%[src]), %%xmm1\n"    \
        load " 16(%[src]), %%xmm2\n"    \
        load " 32(%[src]), %%xmm3\n"    \
        load " 48(%[src]), %%xmm4\n"    \
        store " %%xmm1,    0(%[dst])\n" \
        store " %%xmm2,   16(%[dst])\n" \
        store " %%xmm3,   32(%[dst])\n" \
        store " %%xmm4,   48(%[dst])\n" \
        : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1", "xmm2", "xmm3", "xmm4")
  93
/* Local replacements for the CPU capability queries.
 *
 * When the compiler does not predefine the matching __SSEx__ macro, the
 * query is redefined to test the capability bit in a variable named cpu,
 * which therefore has to be in scope wherever the macro is expanded. */
#ifndef __SSE4_1__
# undef vlc_CPU_SSE4_1
# define vlc_CPU_SSE4_1() ((cpu & VLC_CPU_SSE4_1) != 0)
#endif

#ifndef __SSSE3__
# undef vlc_CPU_SSSE3
# define vlc_CPU_SSSE3() ((cpu & VLC_CPU_SSSE3) != 0)
#endif

#ifndef __SSE2__
# undef vlc_CPU_SSE2
# define vlc_CPU_SSE2() ((cpu & VLC_CPU_SSE2) != 0)
#endif

/* With COPY_TEST_NOOTPIM defined, the SSE4.1, SSE3 and SSE2 queries are
 * forced to false.
 * NOTE(review): vlc_CPU_SSSE3 is not overridden here while vlc_CPU_SSE3 is;
 * confirm whether that is intentional. */
#ifdef COPY_TEST_NOOTPIM
# undef vlc_CPU_SSE4_1
# define vlc_CPU_SSE4_1() (0)
# undef vlc_CPU_SSE3
# define vlc_CPU_SSE3() (0)
# undef vlc_CPU_SSE2
# define vlc_CPU_SSE2() (0)
#endif
 117
/* Optimized copy from "Uncacheable Speculative Write Combining" memory
 * as used by some video surface.
 * XXX It is really efficient only when SSE4.1 is available.
 *
 * Copies height rows of width bytes from src (row stride src_pitch) to dst
 * (row stride dst_pitch). dst and dst_pitch must be 16-byte aligned (checked
 * by assert); src may have any alignment. cpu is the capability mask read by
 * the locally redefined vlc_CPU_SSE4_1() macro.
 * The whole copy is bracketed by mfence instructions.
 */
VLC_SSE
static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
                         const uint8_t *src, size_t src_pitch,
                         unsigned width, unsigned height,
                         unsigned cpu)
{
#if defined (__SSE4_1__) || !defined(CAN_COMPILE_SSSE3)
    VLC_UNUSED(cpu);
#endif
    assert(((intptr_t)dst & 0x0f) == 0 && (dst_pitch & 0x0f) == 0);

    asm volatile ("mfence");

    for (unsigned y = 0; y < height; y++) {
        /* Number of bytes from src up to the next 16-byte boundary
         * (0 when this row of src is already aligned). */
        const unsigned unaligned = (-(uintptr_t)src) & 0x0f;
        /* The 64-byte loops start at the first aligned source byte. */
        unsigned x = unaligned;

#ifdef CAN_COMPILE_SSE4_1
        if (vlc_CPU_SSE4_1()) {
            if (!unaligned) {
                /* Source and destination both aligned: streaming loads,
                 * aligned stores. */
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movntdqa", "movdqa");
            } else {
                /* Copy the first 16 bytes with an unaligned load; this covers
                 * the head of the row (and is issued regardless of width).
                 * From x on the source is aligned but dst[x] no longer is,
                 * hence the unaligned stores. */
                COPY16(dst, src, "movdqu", "movdqa");
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movntdqa", "movdqu");
            }
        } else
#endif
        {
            /* Same scheme without streaming loads. */
            if (!unaligned) {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movdqa", "movdqa");
            } else {
                COPY16(dst, src, "movdqu", "movdqa");
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movdqa", "movdqu");
            }
        }

        /* Remaining bytes of the row, one at a time. */
        for (; x < width; x++)
            dst[x] = src[x];

        src += src_pitch;
        dst += dst_pitch;
    }
    asm volatile ("mfence");
}
 170
 171 VLC_SSE
 172 static void Copy2d(uint8_t *dst, size_t dst_pitch,
 173                    const uint8_t *src, size_t src_pitch,
 174                    unsigned width, unsigned height)
 175 {
 176     assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);
 177
 178     for (unsigned y = 0; y < height; y++) {
 179         unsigned x = 0;
 180
 181         bool unaligned = ((intptr_t)dst & 0x0f) != 0;
 182         if (!unaligned) {
 183             for (; x+63 < width; x += 64)
 184                 COPY64(&dst[x], &src[x], "movdqa", "movntdq");
 185         } else {
 186             for (; x+63 < width; x += 64)
 187                 COPY64(&dst[x], &src[x], "movdqa", "movdqu");
 188         }
 189
 190         for (; x < width; x++)
 191             dst[x] = src[x];
 192
 193         src += src_pitch;
 194         dst += dst_pitch;
 195     }
 196 }
 197
/* Interleave two separate chroma planes into one packed UV plane.
 *
 * For each of the height rows, width bytes are read from srcu and from srcv
 * and 2*width bytes are written to dst. With pixel_size == 1 the output is
 * U0 V0 U1 V1 ...; with pixel_size == 2 samples are moved as 2-byte units.
 * srcu, srcv and their pitches must be 16-byte aligned (asserted).
 * pixel_size == 2 is only handled by the SSSE3 (pshufb) path; the SSE2
 * fallback asserts pixel_size == 1.
 * cpu is the capability mask read by the locally redefined vlc_CPU_SSSE3().
 */
VLC_SSE
static void
SSE_InterleaveUV(uint8_t *dst, size_t dst_pitch,
                 uint8_t *srcu, size_t srcu_pitch,
                 uint8_t *srcv, size_t srcv_pitch,
                 unsigned int width, unsigned int height, uint8_t pixel_size,
                 unsigned int cpu)
{
    assert(!((intptr_t)srcu & 0xf) && !(srcu_pitch & 0x0f) &&
           !((intptr_t)srcv & 0xf) && !(srcv_pitch & 0x0f));

#if defined(__SSSE3__) || !defined (CAN_COMPILE_SSSE3)
    VLC_UNUSED(cpu);
#endif

    /* pshufb masks. After LOAD2X32 a register holds 8 bytes of U in its low
     * half and 8 bytes of V in its high half; the mask picks them alternately,
     * one byte at a time (shuffle_8) or two bytes at a time (shuffle_16). */
    static const uint8_t shuffle_8[] = { 0, 8,
                                         1, 9,
                                         2, 10,
                                         3, 11,
                                         4, 12,
                                         5, 13,
                                         6, 14,
                                         7, 15 };
    static const uint8_t shuffle_16[] = { 0, 1, 8, 9,
                                          2, 3, 10, 11,
                                          4, 5, 12, 13,
                                          6, 7, 14, 15 };
    const uint8_t *shuffle = pixel_size == 1 ? shuffle_8 : shuffle_16;

    for (unsigned int y = 0; y < height; ++y)
    {
        unsigned int    x;

/* Load 32 bytes from each source: xmm0..xmm3 each receive 8 bytes of src1 in
 * the low half and 8 bytes of src2 in the high half. */
#define LOAD2X32                        \
    "movhpd 0x00(%[src2]), %%xmm0\n"    \
    "movlpd 0x00(%[src1]), %%xmm0\n"    \
                                        \
    "movhpd 0x08(%[src2]), %%xmm1\n"    \
    "movlpd 0x08(%[src1]), %%xmm1\n"    \
                                        \
    "movhpd 0x10(%[src2]), %%xmm2\n"    \
    "movlpd 0x10(%[src1]), %%xmm2\n"    \
                                        \
    "movhpd 0x18(%[src2]), %%xmm3\n"    \
    "movlpd 0x18(%[src1]), %%xmm3\n"

/* Store xmm0..xmm3 (64 bytes) to dst with unaligned stores. */
#define STORE64                         \
    "movdqu %%xmm0, 0x00(%[dst])\n"     \
    "movdqu %%xmm1, 0x10(%[dst])\n"     \
    "movdqu %%xmm2, 0x20(%[dst])\n"     \
    "movdqu %%xmm3, 0x30(%[dst])\n"

#ifdef CAN_COMPILE_SSSE3
        /* SSSE3: 32 bytes of each source per iteration, interleaved with
         * pshufb using the mask selected above. */
        if (vlc_CPU_SSSE3())
            for (x = 0; x < (width & ~31); x += 32)
                asm volatile
                    (
                        "movdqu (%[shuffle]), %%xmm7\n"
                        LOAD2X32
                        "pshufb %%xmm7, %%xmm0\n"
                        "pshufb %%xmm7, %%xmm1\n"
                        "pshufb %%xmm7, %%xmm2\n"
                        "pshufb %%xmm7, %%xmm3\n"
                        STORE64
                        : : [dst]"r"(dst+2*x),
                            [src1]"r"(srcu+x), [src2]"r"(srcv+x),
                            [shuffle]"r"(shuffle)
                        : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7"
                    );
        else
#endif

        {
            /* SSE2 fallback: byte interleave only. movhlps moves the V half
             * into the low half of xmm4, punpcklbw then interleaves the low
             * halves byte by byte. */
            assert(pixel_size == 1);
            for (x = 0; x < (width & ~31); x += 32)
                asm volatile
                    (
                        LOAD2X32
                        "movhlps   %%xmm0, %%xmm4\n"
                        "punpcklbw %%xmm4, %%xmm0\n"

                        "movhlps   %%xmm1, %%xmm4\n"
                        "punpcklbw %%xmm4, %%xmm1\n"

                        "movhlps   %%xmm2, %%xmm4\n"
                        "punpcklbw %%xmm4, %%xmm2\n"

                        "movhlps   %%xmm3, %%xmm4\n"
                        "punpcklbw %%xmm4, %%xmm3\n"
                        STORE64
                        : : [dst]"r"(dst+2*x),
                            [src1]"r"(srcu+x), [src2]"r"(srcv+x)
                        : "memory",
                          "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm7"
                    );
        }
#undef LOAD2X32
#undef STORE64

        /* Scalar tail for the bytes the 32-byte loops did not cover. */
        if (pixel_size == 1)
        {
            for (; x < width; x++) {
                dst[2*x+0] = srcu[x];
                dst[2*x+1] = srcv[x];
            }
        }
        else
        {
            /* Two bytes of U followed by two bytes of V per step. */
            for (; x < width; x+= 2) {
                dst[2*x+0] = srcu[x];
                dst[2*x+1] = srcu[x + 1];
                dst[2*x+2] = srcv[x];
                dst[2*x+3] = srcv[x + 1];
            }
        }
        srcu += srcu_pitch;
        srcv += srcv_pitch;
        dst += dst_pitch;
    }
}
 318
/* Split one packed UV plane into separate U and V planes.
 *
 * For each of the height rows, 2*width bytes are read from src and width
 * bytes are written to each of dstu and dstv. pixel_size is the sample size
 * in bytes (1 or 2, asserted). src and src_pitch must be 16-byte aligned
 * (asserted). pixel_size == 2 is only handled by the SSSE3 (pshufb) path;
 * the SSE2 fallback asserts pixel_size == 1.
 * cpu is the capability mask read by the locally redefined vlc_CPU_SSSE3().
 */
VLC_SSE
static void SSE_SplitUV(uint8_t *dstu, size_t dstu_pitch,
                        uint8_t *dstv, size_t dstv_pitch,
                        const uint8_t *src, size_t src_pitch,
                        unsigned width, unsigned height, uint8_t pixel_size,
                        unsigned cpu)
{
#if defined(__SSSE3__) || !defined (CAN_COMPILE_SSSE3)
    VLC_UNUSED(cpu);
#endif
    assert(pixel_size == 1 || pixel_size == 2);
    assert(((intptr_t)src & 0xf) == 0 && (src_pitch & 0x0f) == 0);

/* Aligned load of 64 source bytes into xmm0..xmm3. */
#define LOAD64 \
    "movdqa  0(%[src]), %%xmm0\n" \
    "movdqa 16(%[src]), %%xmm1\n" \
    "movdqa 32(%[src]), %%xmm2\n" \
    "movdqa 48(%[src]), %%xmm3\n"

/* Store the low 8 bytes of xmm0..xmm3 to dst1 and the high 8 bytes to dst2
 * (32 bytes to each destination). */
#define STORE2X32 \
    "movq   %%xmm0,   0(%[dst1])\n" \
    "movq   %%xmm1,   8(%[dst1])\n" \
    "movhpd %%xmm0,   0(%[dst2])\n" \
    "movhpd %%xmm1,   8(%[dst2])\n" \
    "movq   %%xmm2,  16(%[dst1])\n" \
    "movq   %%xmm3,  24(%[dst1])\n" \
    "movhpd %%xmm2,  16(%[dst2])\n" \
    "movhpd %%xmm3,  24(%[dst2])\n"

#ifdef CAN_COMPILE_SSSE3
    if (vlc_CPU_SSSE3())
    {
        /* pshufb masks gathering the U samples in the low half of a register
         * and the V samples in the high half, for 1-byte and 2-byte samples
         * respectively. */
        static const uint8_t shuffle_8[] = { 0, 2, 4, 6, 8, 10, 12, 14,
                                             1, 3, 5, 7, 9, 11, 13, 15 };
        static const uint8_t shuffle_16[] = {  0,  1,  4,  5,  8,  9, 12, 13,
                                               2,  3,  6,  7, 10, 11, 14, 15 };
        const uint8_t *shuffle = pixel_size == 1 ? shuffle_8 : shuffle_16;
        for (unsigned y = 0; y < height; y++) {
            unsigned x = 0;
            for (; x < (width & ~31); x += 32) {
                asm volatile (
                    "movdqu (%[shuffle]), %%xmm7\n"
                    LOAD64
                    "pshufb  %%xmm7, %%xmm0\n"
                    "pshufb  %%xmm7, %%xmm1\n"
                    "pshufb  %%xmm7, %%xmm2\n"
                    "pshufb  %%xmm7, %%xmm3\n"
                    STORE2X32
                    : : [dst1]"r"(&dstu[x]), [dst2]"r"(&dstv[x]), [src]"r"(&src[2*x]), [shuffle]"r"(shuffle) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7");
            }
            /* Scalar tail for what the 32-byte loop did not cover. */
            if (pixel_size == 1)
            {
                for (; x < width; x++) {
                    dstu[x] = src[2*x+0];
                    dstv[x] = src[2*x+1];
                }
            }
            else
            {
                /* Two bytes to U, then two bytes to V per step. */
                for (; x < width; x+= 2) {
                    dstu[x] = src[2*x+0];
                    dstu[x+1] = src[2*x+1];
                    dstv[x] = src[2*x+2];
                    dstv[x+1] = src[2*x+3];
                }
            }
            src  += src_pitch;
            dstu += dstu_pitch;
            dstv += dstv_pitch;
        }
    } else
#endif
    {
        /* SSE2 fallback, 1-byte samples only. */
        assert(pixel_size == 1);
        /* Keeps the even (U) byte of every 16-bit word. */
        static const uint8_t mask[] = { 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00,
                                        0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00 };

        for (unsigned y = 0; y < height; y++)
        {
            unsigned x = 0;
            for (; x < (width & ~31); x += 32) {
                /* psrlw $8 isolates the odd (V) bytes, pand with the mask
                 * isolates the even (U) bytes; packuswb then puts the V bytes
                 * in the low half and the U bytes in the high half. That is
                 * the reverse of the SSSE3 path, which is why dst1 is bound
                 * to dstv and dst2 to dstu here.
                 * xmm7 (the mask) is overwritten by the last pand, so it is
                 * reloaded on every iteration. */
                asm volatile (
                    "movdqu (%[mask]), %%xmm7\n"
                    LOAD64
                    "movdqa   %%xmm0, %%xmm4\n"
                    "movdqa   %%xmm1, %%xmm5\n"
                    "movdqa   %%xmm2, %%xmm6\n"
                    "psrlw    $8,     %%xmm0\n"
                    "psrlw    $8,     %%xmm1\n"
                    "pand     %%xmm7, %%xmm4\n"
                    "pand     %%xmm7, %%xmm5\n"
                    "pand     %%xmm7, %%xmm6\n"
                    "packuswb %%xmm4, %%xmm0\n"
                    "packuswb %%xmm5, %%xmm1\n"
                    "pand     %%xmm3, %%xmm7\n"
                    "psrlw    $8,     %%xmm2\n"
                    "psrlw    $8,     %%xmm3\n"
                    "packuswb %%xmm6, %%xmm2\n"
                    "packuswb %%xmm7, %%xmm3\n"
                    STORE2X32
                    : : [dst2]"r"(&dstu[x]), [dst1]"r"(&dstv[x]), [src]"r"(&src[2*x]), [mask]"r"(mask) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
            }
            /* Scalar tail. */
            for (; x < width; x++) {
                dstu[x] = src[2*x+0];
                dstv[x] = src[2*x+1];
            }
            src  += src_pitch;
            dstu += dstu_pitch;
            dstv += dstv_pitch;
        }
    }
#undef STORE2X32
#undef LOAD64
}
 433
 434 static void SSE_CopyPlane(uint8_t *dst, size_t dst_pitch,
 435                           const uint8_t *src, size_t src_pitch,
 436                           uint8_t *cache, size_t cache_size,
 437                           unsigned height, unsigned cpu)
 438 {
 439     const unsigned w16 = (src_pitch+15) & ~15;
 440     const unsigned hstep = cache_size / w16;
 441     assert(hstep > 0);
 442
 443     if (src_pitch == dst_pitch)
 444         memcpy(dst, src, src_pitch * height);
 445     else
 446     for (unsigned y = 0; y < height; y += hstep) {
 447         const unsigned hblock =  __MIN(hstep, height - y);
 448
 449         /* Copy a bunch of line into our cache */
 450         CopyFromUswc(cache, w16,
 451                      src, src_pitch,
 452                      src_pitch, hblock, cpu);
 453
 454         /* Copy from our cache to the destination */
 455         Copy2d(dst, dst_pitch,
 456                cache, w16,
 457                src_pitch, hblock);
 458
 459         /* */
 460         src += src_pitch * hblock;
 461         dst += dst_pitch * hblock;
 462     }
 463 }
 464
 465 static void
 466 SSE_InterleavePlanes(uint8_t *dst, size_t dst_pitch,
 467                      const uint8_t *srcu, size_t srcu_pitch,
 468                      const uint8_t *srcv, size_t srcv_pitch,
 469                      uint8_t *cache, size_t cache_size,
 470                      unsigned int height, uint8_t pixel_size, unsigned int cpu)
 471 {
 472     assert(srcu_pitch == srcv_pitch);
 473     unsigned int const  w16 = (srcu_pitch+15) & ~15;
 474     unsigned int const  hstep = (cache_size) / (2*w16);
 475     assert(hstep > 0);
 476
 477     for (unsigned int y = 0; y < height; y += hstep)
 478     {
 479         unsigned int const      hblock = __MIN(hstep, height - y);
 480
 481         /* Copy a bunch of line into our cache */
 482         CopyFromUswc(cache, w16, srcu, srcu_pitch,
 483                      srcu_pitch, hblock, cpu);
 484         CopyFromUswc(cache+w16*hblock, w16, srcv, srcv_pitch,
 485                      srcv_pitch, hblock, cpu);
 486
 487         /* Copy from our cache to the destination */
 488         SSE_InterleaveUV(dst, dst_pitch, cache, w16,
 489                          cache+w16*hblock, w16, srcu_pitch, hblock, pixel_size,
 490                          cpu);
 491
 492         /* */
 493         srcu += hblock * srcu_pitch;
 494         srcv += hblock * srcv_pitch;
 495         dst += hblock * dst_pitch;
 496     }
 497 }
 498
 499 static void SSE_SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
 500                             uint8_t *dstv, size_t dstv_pitch,
 501                             const uint8_t *src, size_t src_pitch,
 502                             uint8_t *cache, size_t cache_size,
 503                             unsigned height, uint8_t pixel_size, unsigned cpu)
 504 {
 505     const unsigned w16 = (src_pitch+15) & ~15;
 506     const unsigned hstep = cache_size / w16;
 507     assert(hstep > 0);
 508
 509     for (unsigned y = 0; y < height; y += hstep) {
 510         const unsigned hblock =  __MIN(hstep, height - y);
 511
 512         /* Copy a bunch of line into our cache */
 513         CopyFromUswc(cache, w16, src, src_pitch,
 514                      src_pitch, hblock, cpu);
 515
 516         /* Copy from our cache to the destination */
 517         SSE_SplitUV(dstu, dstu_pitch, dstv, dstv_pitch,
 518                     cache, w16, src_pitch / 2, hblock, pixel_size, cpu);
 519
 520         /* */
 521         src  += src_pitch  * hblock;
 522         dstu += dstu_pitch * hblock;
 523         dstv += dstv_pitch * hblock;
 524     }
 525 }
 526
 527 static void SSE_Copy420_P_to_P(picture_t *dst, const uint8_t *src[static 3],
 528                                const size_t src_pitch[static 3], unsigned height,
 529                                const copy_cache_t *cache, unsigned cpu)
 530 {
 531     for (unsigned n = 0; n < 3; n++) {
 532         const unsigned d = n > 0 ? 2 : 1;
 533         SSE_CopyPlane(dst->p[n].p_pixels, dst->p[n].i_pitch,
 534                       src[n], src_pitch[n],
 535                       cache->buffer, cache->size,
 536                       (height+d-1)/d, cpu);
 537     }
 538     asm volatile ("emms");
 539 }
 540
 541
 542 static void SSE_Copy420_SP_to_SP(picture_t *dst, const uint8_t *src[static 2],
 543                                  const size_t src_pitch[static 2], unsigned height,
 544                                  const copy_cache_t *cache, unsigned cpu)
 545 {
 546     SSE_CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
 547                   src[0], src_pitch[0],
 548                   cache->buffer, cache->size,
 549                   height, cpu);
 550     SSE_CopyPlane(dst->p[1].p_pixels, dst->p[1].i_pitch,
 551                   src[1], src_pitch[1],
 552                   cache->buffer, cache->size,
 553                   height/2, cpu);
 554     asm volatile ("emms");
 555 }
 556
 557 static void
 558 SSE_Copy420_SP_to_P(picture_t *dest, const uint8_t *src[static 2],
 559                     const size_t src_pitch[static 2], unsigned int height,
 560                     const copy_cache_t *cache, uint8_t pixel_size,
 561                     unsigned int cpu)
 562 {
 563     SSE_CopyPlane(dest->p[0].p_pixels, dest->p[0].i_pitch,
 564                   src[0], src_pitch[0], cache->buffer, cache->size,
 565                   height, cpu);
 566     SSE_SplitPlanes(dest->p[1].p_pixels, dest->p[1].i_pitch,
 567                     dest->p[2].p_pixels, dest->p[2].i_pitch,
 568                     src[1], src_pitch[1], cache->buffer, cache->size,
 569                     height / 2, pixel_size, cpu);
 570     asm volatile ("emms");
 571 }
 572
 573 static void SSE_Copy420_P_to_SP(picture_t *dst, const uint8_t *src[static 3],
 574                                 const size_t src_pitch[static 3],
 575                                 unsigned height, const copy_cache_t *cache,
 576                                 uint8_t pixel_size, unsigned cpu)
 577 {
 578     SSE_CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
 579                   src[0], src_pitch[0],
 580                   cache->buffer, cache->size,
 581                   height, cpu);
 582     SSE_InterleavePlanes(dst->p[1].p_pixels, dst->p[1].i_pitch,
 583                          src[U_PLANE], src_pitch[U_PLANE],
 584                          src[V_PLANE], src_pitch[V_PLANE],
 585                          cache->buffer, cache->size, height / 2, pixel_size, cpu);
 586     asm volatile ("emms");
 587 }
 588 #undef COPY64
 589 #endif /* CAN_COMPILE_SSE2 */
 590
 591 static void CopyPlane(uint8_t *dst, size_t dst_pitch,
 592                       const uint8_t *src, size_t src_pitch,
 593                       unsigned height)
 594 {
 595     if (src_pitch == dst_pitch)
 596         memcpy(dst, src, src_pitch * height);
 597     else
 598     for (unsigned y = 0; y < height; y++) {
 599         memcpy(dst, src, src_pitch);
 600         src += src_pitch;
 601         dst += dst_pitch;
 602     }
 603 }
 604
 605 void Copy420_SP_to_SP(picture_t *dst, const uint8_t *src[static 2],
 606                       const size_t src_pitch[static 2], unsigned height,
 607                       const copy_cache_t *cache)
 608 {
 609     ASSERT_2PLANES;
 610 #ifdef CAN_COMPILE_SSE2
 611     unsigned cpu = vlc_CPU();
 612     if (vlc_CPU_SSE2())
 613         return SSE_Copy420_SP_to_SP(dst, src, src_pitch, height,
 614                                     cache, cpu);
 615 #else
 616     (void) cache;
 617 #endif
 618
 619     CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
 620               src[0], src_pitch[0], height);
 621     CopyPlane(dst->p[1].p_pixels, dst->p[1].i_pitch,
 622               src[1], src_pitch[1], height/2);
 623 }
 624
 625 #define SPLIT_PLANES(type, pitch_den) do { \
 626     for (unsigned y = 0; y < height; y++) { \
 627         for (unsigned x = 0; x < src_pitch / pitch_den; x++) { \
 628             ((type *) dstu)[x] = ((const type *) src)[2*x+0]; \
 629             ((type *) dstv)[x] = ((const type *) src)[2*x+1]; \
 630         } \
 631         src  += src_pitch; \
 632         dstu += dstu_pitch; \
 633         dstv += dstv_pitch; \
 634     } \
 635 } while(0)
 636
 637 static void SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
 638                         uint8_t *dstv, size_t dstv_pitch,
 639                         const uint8_t *src, size_t src_pitch, unsigned height)
 640 {
 641     SPLIT_PLANES(uint8_t, 2);
 642 }
 643
 644 static void SplitPlanes16(uint8_t *dstu, size_t dstu_pitch,
 645                           uint8_t *dstv, size_t dstv_pitch,
 646                           const uint8_t *src, size_t src_pitch, unsigned height)
 647 {
 648     SPLIT_PLANES(uint16_t, 4);
 649 }
 650
 651 void Copy420_SP_to_P(picture_t *dst, const uint8_t *src[static 2],
 652                      const size_t src_pitch[static 2], unsigned height,
 653                      const copy_cache_t *cache)
 654 {
 655     ASSERT_2PLANES;
 656 #ifdef CAN_COMPILE_SSE2
 657     unsigned    cpu = vlc_CPU();
 658
 659     if (vlc_CPU_SSE2())
 660         return SSE_Copy420_SP_to_P(dst, src, src_pitch, height, cache, 1, cpu);
 661 #else
 662     VLC_UNUSED(cache);
 663 #endif
 664
 665     CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
 666               src[0], src_pitch[0], height);
 667     SplitPlanes(dst->p[1].p_pixels, dst->p[1].i_pitch,
 668                 dst->p[2].p_pixels, dst->p[2].i_pitch,
 669                 src[1], src_pitch[1], height/2);
 670 }
 671
 672 void Copy420_16_SP_to_P(picture_t *dst, const uint8_t *src[static 2],
 673                         const size_t src_pitch[static 2], unsigned height,
 674                         const copy_cache_t *cache)
 675 {
 676     ASSERT_2PLANES;
 677 #ifdef CAN_COMPILE_SSE3
 678     unsigned    cpu = vlc_CPU();
 679
 680     if (vlc_CPU_SSE3())
 681         return SSE_Copy420_SP_to_P(dst, src, src_pitch, height, cache, 2, cpu);
 682 #else
 683     VLC_UNUSED(cache);
 684 #endif
 685
 686     CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
 687               src[0], src_pitch[0], height);
 688     SplitPlanes16(dst->p[1].p_pixels, dst->p[1].i_pitch,
 689                   dst->p[2].p_pixels, dst->p[2].i_pitch,
 690                   src[1], src_pitch[1], height/2);
 691 }
 692
/* Interleave two chroma planes into one packed plane, sample by sample.
 *
 * Works on whatever element type the pointers have. Expects the following
 * names in scope at the expansion site:
 *   copy_lines        number of rows to process
 *   copy_pitch        number of samples taken from each source per row
 *   dstUV, srcU, srcV running pointers (modified by the macro)
 *   i_extra_pitch_uv, i_extra_pitch_u, i_extra_pitch_v
 *                     number of elements to skip at the end of each row to
 *                     reach the start of the next one
 */
#define INTERLEAVE_UV() do { \
    for ( unsigned int line = 0; line < copy_lines; line++ ) { \
        for ( unsigned int col = 0; col < copy_pitch; col++ ) { \
            *dstUV++ = *srcU++; \
            *dstUV++ = *srcV++; \
        } \
        dstUV += i_extra_pitch_uv; \
        srcU  += i_extra_pitch_u; \
        srcV  += i_extra_pitch_v; \
    } \
}while(0)
 704
 705 void Copy420_P_to_SP(picture_t *dst, const uint8_t *src[static 3],
 706                      const size_t src_pitch[static 3], unsigned height,
 707                      const copy_cache_t *cache)
 708 {
 709     ASSERT_3PLANES;
 710 #ifdef CAN_COMPILE_SSE2
 711     unsigned cpu = vlc_CPU();
 712     if (vlc_CPU_SSE2())
 713         return SSE_Copy420_P_to_SP(dst, src, src_pitch, height, cache, 1, cpu);
 714 #else
 715     (void) cache;
 716 #endif
 717
 718     CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
 719               src[0], src_pitch[0], height);
 720
 721     const unsigned copy_lines = height / 2;
 722     const unsigned copy_pitch = src_pitch[1];
 723
 724     const int i_extra_pitch_uv = dst->p[1].i_pitch - 2 * copy_pitch;
 725     const int i_extra_pitch_u  = src_pitch[U_PLANE] - copy_pitch;
 726     const int i_extra_pitch_v  = src_pitch[V_PLANE] - copy_pitch;
 727
 728     uint8_t *dstUV = dst->p[1].p_pixels;
 729     const uint8_t *srcU  = src[U_PLANE];
 730     const uint8_t *srcV  = src[V_PLANE];
 731     INTERLEAVE_UV();
 732 }
 733
 734 void Copy420_16_P_to_SP(picture_t *dst, const uint8_t *src[static 3],
 735                         const size_t src_pitch[static 3], unsigned height,
 736                         const copy_cache_t *cache)
 737 {
 738     ASSERT_3PLANES;
 739 #ifdef CAN_COMPILE_SSE2
 740     unsigned cpu = vlc_CPU();
 741     if (vlc_CPU_SSE3())
 742         return SSE_Copy420_P_to_SP(dst, src, src_pitch, height, cache, 2, cpu);
 743 #else
 744     (void) cache;
 745 #endif
 746
 747     CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
 748               src[0], src_pitch[0], height);
 749
 750     const unsigned copy_lines = height / 2;
 751     const unsigned copy_pitch = src_pitch[1] / 2;
 752
 753     const int i_extra_pitch_uv = dst->p[1].i_pitch / 2 - 2 * copy_pitch;
 754     const int i_extra_pitch_u  = src_pitch[U_PLANE] / 2 - copy_pitch;
 755     const int i_extra_pitch_v  = src_pitch[V_PLANE] / 2 - copy_pitch;
 756
 757     uint16_t *dstUV = (void*) dst->p[1].p_pixels;
 758     const uint16_t *srcU  = (const uint16_t *) src[U_PLANE];
 759     const uint16_t *srcV  = (const uint16_t *) src[V_PLANE];
 760     INTERLEAVE_UV();
 761 }
 762
 763 void CopyFromI420_10ToP010(picture_t *dst, const uint8_t *src[static 3],
 764                            const size_t src_pitch[static 3],
 765                            unsigned height, const copy_cache_t *cache)
 766 {
 767     (void) cache;
 768
 769     const int i_extra_pitch_dst_y = (dst->p[0].i_pitch  - src_pitch[0]) / 2;
 770     const int i_extra_pitch_src_y = (src_pitch[Y_PLANE] - src_pitch[0]) / 2;
 771     uint16_t *dstY = (uint16_t *) dst->p[0].p_pixels;
 772     const uint16_t *srcY = (const uint16_t *) src[Y_PLANE];
 773     for (unsigned y = 0; y < height; y++) {
 774         for (unsigned x = 0; x < (src_pitch[0] / 2); x++) {
 775             *dstY++ = *srcY++ << 6;
 776         }
 777         dstY += i_extra_pitch_dst_y;
 778         srcY += i_extra_pitch_src_y;
 779     }
 780
 781     const unsigned copy_lines = height / 2;
 782     const unsigned copy_pitch = src_pitch[1] / 2;
 783
 784     const int i_extra_pitch_uv = dst->p[1].i_pitch / 2 - 2 * copy_pitch;
 785     const int i_extra_pitch_u  = src_pitch[U_PLANE] / 2 - copy_pitch;
 786     const int i_extra_pitch_v  = src_pitch[V_PLANE] / 2 - copy_pitch;
 787
 788     uint16_t *dstUV = (uint16_t *) dst->p[1].p_pixels;
 789     const uint16_t *srcU  = (const uint16_t *) src[U_PLANE];
 790     const uint16_t *srcV  = (const uint16_t *) src[V_PLANE];
 791     for ( unsigned int line = 0; line < copy_lines; line++ )
 792     {
 793         for ( unsigned int col = 0; col < copy_pitch; col++ )
 794         {
 795             *dstUV++ = *srcU++ << 6;
 796             *dstUV++ = *srcV++ << 6;
 797         }
 798         dstUV += i_extra_pitch_uv;
 799         srcU  += i_extra_pitch_u;
 800         srcV  += i_extra_pitch_v;
 801     }
 802 }
 803
 804 void Copy420_P_to_P(picture_t *dst, const uint8_t *src[static 3],
 805                     const size_t src_pitch[static 3], unsigned height,
 806                     const copy_cache_t *cache)
 807 {
 808     ASSERT_3PLANES;
 809 #ifdef CAN_COMPILE_SSE2
 810     unsigned cpu = vlc_CPU();
 811     if (vlc_CPU_SSE2())
 812         return SSE_Copy420_P_to_P(dst, src, src_pitch, height, cache, cpu);
 813 #else
 814     (void) cache;
 815 #endif
 816
 817      CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
 818                src[0], src_pitch[0], height);
 819      CopyPlane(dst->p[1].p_pixels, dst->p[1].i_pitch,
 820                src[1], src_pitch[1], height / 2);
 821      CopyPlane(dst->p[2].p_pixels, dst->p[2].i_pitch,
 822                src[2], src_pitch[2], height / 2);
 823 }
 824
 825 void picture_SwapUV(picture_t *picture)
 826 {
 827     assert(picture->i_planes == 3);
 828
 829     plane_t tmp_plane = picture->p[1];
 830     picture->p[1] = picture->p[2];
 831     picture->p[2] = tmp_plane;
 832 }
 833
 834 int picture_UpdatePlanes(picture_t *picture, uint8_t *data, unsigned pitch)
 835 {
 836     /* fill in buffer info in first plane */
 837     picture->p->p_pixels = data;
 838     picture->p->i_pitch  = pitch;
 839     picture->p->i_lines  = picture->format.i_height;
 840     assert(picture->p->i_visible_pitch <= picture->p->i_pitch);
 841     assert(picture->p->i_visible_lines <= picture->p->i_lines);
 842
 843     /*  Fill chroma planes for biplanar YUV */
 844     if (picture->format.i_chroma == VLC_CODEC_NV12 ||
 845         picture->format.i_chroma == VLC_CODEC_NV21 ||
 846         picture->format.i_chroma == VLC_CODEC_P010) {
 847
 848         for (int n = 1; n < picture->i_planes; n++) {
 849             const plane_t *o = &picture->p[n-1];
 850             plane_t *p = &picture->p[n];
 851
 852             p->p_pixels = o->p_pixels + o->i_lines * o->i_pitch;
 853             p->i_pitch  = pitch;
 854             p->i_lines  = picture->format.i_height;
 855             assert(p->i_visible_pitch <= p->i_pitch);
 856             assert(p->i_visible_lines <= p->i_lines);
 857         }
 858         /* The dx/d3d buffer is always allocated as NV12 */
 859         if (vlc_fourcc_AreUVPlanesSwapped(picture->format.i_chroma, VLC_CODEC_NV12)) {
 860             /* TODO : Swap NV21 UV planes to match NV12 */
 861             return VLC_EGENERIC;
 862         }
 863     }
 864
 865     /*  Fill chroma planes for planar YUV */
 866     else
 867     if (picture->format.i_chroma == VLC_CODEC_I420 ||
 868         picture->format.i_chroma == VLC_CODEC_J420 ||
 869         picture->format.i_chroma == VLC_CODEC_YV12) {
 870
 871         for (int n = 1; n < picture->i_planes; n++) {
 872             const plane_t *o = &picture->p[n-1];
 873             plane_t *p = &picture->p[n];
 874
 875             p->p_pixels = o->p_pixels + o->i_lines * o->i_pitch;
 876             p->i_pitch  = pitch / 2;
 877             p->i_lines  = picture->format.i_height / 2;
 878         }
 879         /* The dx/d3d buffer is always allocated as YV12 */
 880         if (vlc_fourcc_AreUVPlanesSwapped(picture->format.i_chroma, VLC_CODEC_YV12)) {
 881             uint8_t *p_tmp = picture->p[1].p_pixels;
 882             picture->p[1].p_pixels = picture->p[2].p_pixels;
 883             picture->p[2].p_pixels = p_tmp;
 884         }
 885     }
 886     return VLC_SUCCESS;
 887 }
 888
 889 #ifdef COPY_TEST
 890 # undef NDEBUG
 891
 892 #include <vlc_picture.h>
 893
/* One destination of a conversion test: the output chroma and the copy
 * function that main() calls to produce it. */
struct test_dst
{
    /* Destination chroma; main() stops walking a dsts[] list at the first
     * entry whose chroma is 0. */
    vlc_fourcc_t chroma;
    /* Copy routine, invoked by main() as
     * conv(dst, src_planes, src_pitches, visible_height, &cache). */
    void (*conv)(picture_t *, const uint8_t *[], const size_t [], unsigned,
                 const copy_cache_t *);
};
 900
/* A source chroma together with the destinations it is converted to. */
struct test_conv
{
    /* Chroma of the source picture that main() builds for this entry. */
    vlc_fourcc_t src_chroma;
    /* Destinations to test. main() iterates until it meets a chroma of 0,
     * so at least one slot has to stay unset to end the list. */
    struct test_dst dsts[3];
};
 906
 907 static const struct test_conv convs[] = {
 908     { .src_chroma = VLC_CODEC_NV12,
 909       .dsts = { { VLC_CODEC_I420, Copy420_SP_to_P },
 910                 { VLC_CODEC_NV12, Copy420_SP_to_SP } },
 911     },
 912     { .src_chroma = VLC_CODEC_I420,
 913       .dsts = { { VLC_CODEC_I420, Copy420_P_to_P },
 914                 { VLC_CODEC_NV12, Copy420_P_to_SP } },
 915     },
 916     { .src_chroma = VLC_CODEC_P010,
 917       .dsts = { { VLC_CODEC_I420_10B, Copy420_16_SP_to_P } },
 918     },
 919     { .src_chroma = VLC_CODEC_I420_10B,
 920       .dsts = { { VLC_CODEC_P010, Copy420_16_P_to_SP } },
 921     },
 922 };
 923 #define NB_CONVS ARRAY_SIZE(convs)
 924
/* Picture dimensions for one test run; main() passes the four values, in
 * this order, to video_format_Setup(). */
struct test_size
{
    int i_width;          /* full width */
    int i_height;         /* full height */
    int i_visible_width;  /* visible width */
    int i_visible_height; /* visible height */
};
 932 static const struct test_size sizes[] = {
 933     { 1, 1, 1, 1 },
 934     { 3, 3, 3, 3 },
 935     { 65, 39, 65, 39 },
 936     { 560, 369, 540, 350 },
 937     { 1274, 721, 1200, 720 },
 938     { 1920, 1088, 1920, 1080 },
 939     { 3840, 2160, 3840, 2160 },
 940 #if 0 /* too long */
 941     { 8192, 8192, 8192, 8192 },
 942 #endif
 943 };
 944 #define NB_SIZES ARRAY_SIZE(sizes)
 945
/*
 * Fill or verify the visible area of every plane of a picture with fixed
 * per-plane test values.
 *
 * pic:  picture to process; must have 2 or 3 planes.
 * dsc:  chroma description of pic; only pixel_size is read and it must be
 *       1 or 2.
 * init: true to write the test values, false to compare against them. A
 *       mismatch is reported on stderr and then trips an assert.
 */
static void piccheck(picture_t *pic, const vlc_chroma_description_t *dsc,
                     bool init)
{
/* Report a mismatching sample and abort. Expanded inside PICCHECK, where
 * i, x, y and p are in scope; p has already been post-incremented by the
 * comparison, hence the *(--p) to print the sample that failed. */
#define ASSERT_COLOR() do { \
    fprintf(stderr, "error: pixel doesn't match @ plane: %d: %d x %d: %X\n", i, x, y, *(--p)); \
    assert(!"error: pixel doesn't match"); \
} while(0)

/* Walk the visible lines of every plane. Plane 1 of a two-plane picture is
 * processed as type_uv elements compared against color_UV (half as many
 * elements per line); every other plane is processed as type_u elements
 * compared against colors_P[plane index]. pitch_den converts the visible
 * pitch into a number of type_u elements. */
#define PICCHECK(type_u, type_uv, colors_P, color_UV, pitch_den) do { \
    for (int i = 0; i < pic->i_planes; ++i) \
    { \
        const struct plane_t *plane = &pic->p[i]; \
        for (int y = 0; y < plane->i_visible_lines; ++y) \
        { \
            if (pic->i_planes == 2 && i == 1) \
            { \
                type_uv *p = (type_uv *)&plane->p_pixels[y * plane->i_pitch]; \
                for (int x = 0; x < plane->i_visible_pitch / 2 / pitch_den; ++x) \
                    if (init) \
                        *(p++) = color_UV; \
                    else if (*(p++) != color_UV) \
                        ASSERT_COLOR(); \
            } \
            else \
            { \
                type_u *p = (type_u *) &plane->p_pixels[y * plane->i_pitch]; \
                for (int x = 0; x < plane->i_visible_pitch / pitch_den; ++x) \
                    if (init) \
                        *(p++) = colors_P[i]; \
                    else if (*(p++) != colors_P[i]) \
                        ASSERT_COLOR(); \
            } \
        } \
    } \
} while (0)

    assert(pic->i_planes == 2 || pic->i_planes == 3);
    /* 8-bit test values, one per plane. */
    const uint8_t colors_8_P[3] = { 0x42, 0xF1, 0x36 };
    /* colors_8_P[2] in the high byte, colors_8_P[1] in the low byte. */
    const uint16_t color_8_UV = 0x36F1;

    /* 16-bit test values, one per plane. */
    const uint16_t colors_16_P[3] = { 0x4210, 0x14F1, 0x4536 };
    /* colors_16_P[2] in the high half, colors_16_P[1] in the low half. */
    const uint32_t color_16_UV = 0x453614F1;

    assert(dsc->pixel_size == 1 || dsc->pixel_size == 2);
    /* Select the sample types and test values matching the pixel size. */
    if (dsc->pixel_size == 1)
        PICCHECK(uint8_t, uint16_t, colors_8_P, color_8_UV, 1);
    else
        PICCHECK(uint16_t, uint32_t, colors_16_P, color_16_UV, 2);
}
 995
 996 static void pic_rsc_destroy(picture_t *pic)
 997 {
 998     for (unsigned i = 0; i < 3; i++)
 999         free(pic->p[i].p_pixels);
1000     free(pic);
1001 }
1002
1003 static picture_t *pic_new_unaligned(const video_format_t *fmt)
1004 {
1005     /* Allocate a no-aligned picture in order to ease buffer overflow detection
1006      * from the source picture */
1007     const vlc_chroma_description_t *dsc = vlc_fourcc_GetChromaDescription(fmt->i_chroma);
1008     assert(dsc);
1009     picture_resource_t rsc = { .pf_destroy = pic_rsc_destroy };
1010     for (unsigned i = 0; i < dsc->plane_count; i++)
1011     {
1012         rsc.p[i].i_lines = ((fmt->i_visible_height + 1) & ~ 1) * dsc->p[i].h.num / dsc->p[i].h.den;
1013         rsc.p[i].i_pitch = ((fmt->i_visible_width + 1) & ~ 1) * dsc->pixel_size * dsc->p[i].w.num / dsc->p[i].w.den;
1014         rsc.p[i].p_pixels = malloc(rsc.p[i].i_lines * rsc.p[i].i_pitch);
1015         assert(rsc.p[i].p_pixels);
1016     }
1017     return picture_NewFromResource(fmt, &rsc);
1018 }
1019
1020 int main(void)
1021 {
1022     alarm(10);
1023
1024     unsigned cpu = vlc_CPU();
1025 #ifndef COPY_TEST_NOOTPIM
1026     if (!vlc_CPU_SSE2())
1027     {
1028         fprintf(stderr, "WARNING: could not test SSE\n");
1029         return 0;
1030     }
1031 #endif
1032
1033     for (size_t i = 0; i < NB_CONVS; ++i)
1034     {
1035         const struct test_conv *conv = &convs[i];
1036
1037         for (size_t j = 0; j < NB_SIZES; ++j)
1038         {
1039             const struct test_size *size = &sizes[j];
1040
1041             const vlc_chroma_description_t *src_dsc =
1042                 vlc_fourcc_GetChromaDescription(conv->src_chroma);
1043             assert(src_dsc);
1044
1045             video_format_t fmt;
1046             video_format_Init(&fmt, 0);
1047             video_format_Setup(&fmt, conv->src_chroma,
1048                                size->i_width, size->i_height,
1049                                size->i_visible_width, size->i_visible_height,
1050                                1, 1);
1051             picture_t *src = pic_new_unaligned(&fmt);
1052             assert(src);
1053             piccheck(src, src_dsc, true);
1054
1055             copy_cache_t cache;
1056             int ret = CopyInitCache(&cache, src->format.i_width
1057                                     * src_dsc->pixel_size);
1058             assert(ret == VLC_SUCCESS);
1059
1060             for (size_t f = 0; conv->dsts[f].chroma != 0; ++f)
1061             {
1062                 const struct test_dst *test_dst= &conv->dsts[f];
1063
1064                 const vlc_chroma_description_t *dst_dsc =
1065                     vlc_fourcc_GetChromaDescription(test_dst->chroma);
1066                 assert(dst_dsc);
1067                 fmt.i_chroma = test_dst->chroma;
1068                 picture_t *dst = picture_NewFromFormat(&fmt);
1069                 assert(dst);
1070
1071                 const uint8_t * src_planes[3] = { src->p[Y_PLANE].p_pixels,
1072                                                   src->p[U_PLANE].p_pixels,
1073                                                   src->p[V_PLANE].p_pixels };
1074                 const size_t    src_pitches[3] = { src->p[Y_PLANE].i_pitch,
1075                                                    src->p[U_PLANE].i_pitch,
1076                                                    src->p[V_PLANE].i_pitch };
1077
1078                 fprintf(stderr, "testing: %u x %u (vis: %u x %u) %4.4s -> %4.4s\n",
1079                         size->i_width, size->i_height,
1080                         size->i_visible_width, size->i_visible_height,
1081                         (const char *) &src->format.i_chroma,
1082                         (const char *) &dst->format.i_chroma);
1083                 test_dst->conv(dst, src_planes, src_pitches,
1084                                 src->format.i_visible_height, &cache);
1085                 piccheck(dst, dst_dsc, false);
1086                 picture_Release(dst);
1087             }
1088             picture_Release(src);
1089             CopyCleanCache(&cache);
1090         }
1091     }
1092     return 0;
1093 }
1094
1095 #endif