vlc/vlc-skelet.git: modules/codec/avcodec/copy.c
/*****************************************************************************
 * copy.c: Fast YV12/NV12 copy
 *****************************************************************************
 * Copyright (C) 2010 Laurent Aimar
 * $Id$
 *
 * Authors: Laurent Aimar <fenrir _AT_ videolan _DOT_ org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
 *****************************************************************************/
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif

#include <vlc_common.h>
#include <vlc_picture.h>
#include <vlc_cpu.h>
#include <assert.h>

#include "copy.h"
/* Copy 64 bytes from srcp to dstp, loading data with the SSE>=2 instruction
 * load and storing data with the SSE>=2 instruction store.
 */
#define COPY64(dstp, srcp, load, store) \
    asm volatile (                      \
        load "  0(%[src]), %%xmm1\n"    \
        load " 16(%[src]), %%xmm2\n"    \
        load " 32(%[src]), %%xmm3\n"    \
        load " 48(%[src]), %%xmm4\n"    \
        store " %%xmm1,  0(%[dst])\n"   \
        store " %%xmm2, 16(%[dst])\n"   \
        store " %%xmm3, 32(%[dst])\n"   \
        store " %%xmm4, 48(%[dst])\n"   \
        : : [dst]"r"(dstp), [src]"r"(srcp) : "memory")
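/* Note: each COPY64 expansion moves one 64-byte block through %xmm1-%xmm4.
 * The caller picks the load/store mnemonics to match the alignment and
 * cacheability of each side, e.g. "movntdqa"/"movdqa" to stream from USWC
 * memory into a 16-byte aligned buffer (see CopyFromUswc() below). */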
/* Execute the instruction op only if SSE2 is supported. */
#ifdef CAN_COMPILE_SSE2
# define ASM_SSE2(cpu, op) do {         \
    if (cpu & CPU_CAPABILITY_SSE2)      \
        asm volatile (op);              \
} while (0)
#else
# define ASM_SSE2(cpu, op)
#endif
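/* ASM_SSE2 guards the bare "mfence"/"emms" instructions used below: they are
 * only emitted when the compiler can build SSE2 code and the running CPU
 * reports SSE2 support. */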
/* Optimized copy from "Uncacheable Speculative Write Combining" memory
 * as used by some video surfaces.
 * XXX It is really efficient only when SSE4.1 is available.
 */
static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
                         const uint8_t *src, size_t src_pitch,
                         unsigned width, unsigned height,
                         unsigned cpu)
{
    assert(((intptr_t)dst & 0x0f) == 0 && (dst_pitch & 0x0f) == 0);

    ASM_SSE2(cpu, "mfence");
    for (unsigned y = 0; y < height; y++) {
        /* Number of leading bytes to copy one by one so that src is 16-byte
         * aligned for the SSE loads below. */
        const unsigned unaligned = (-(intptr_t)src) & 0x0f;
        unsigned x;

        for (x = 0; x < unaligned; x++)
            dst[x] = src[x];

#ifdef CAN_COMPILE_SSE4_1
        if (cpu & CPU_CAPABILITY_SSE4_1) {
            if (!unaligned) {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movntdqa", "movdqa");
            } else {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movntdqa", "movdqu");
            }
        } else
#endif
#ifdef CAN_COMPILE_SSE2
        if (cpu & CPU_CAPABILITY_SSE2) {
            if (!unaligned) {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movdqa", "movdqa");
            } else {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movdqa", "movdqu");
            }
        }
#endif

        for (; x < width; x++)
            dst[x] = src[x];

        src += src_pitch;
        dst += dst_pitch;
    }
}
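/* Rationale: plain loads from USWC memory bypass the CPU caches and are very
 * slow. The SSE4.1 "movntdqa" streaming load fetches a whole 64-byte line of
 * such memory into a streaming-load buffer, which is far faster; with only
 * SSE2 the copy still works but is slow, hence the XXX note above. */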
static void Copy2d(uint8_t *dst, size_t dst_pitch,
                   const uint8_t *src, size_t src_pitch,
                   unsigned width, unsigned height,
                   unsigned cpu)
{
    assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);

    ASM_SSE2(cpu, "mfence");

    for (unsigned y = 0; y < height; y++) {
        unsigned x = 0;
        bool unaligned = ((intptr_t)dst & 0x0f) != 0;

#ifdef CAN_COMPILE_SSE2
        if (cpu & CPU_CAPABILITY_SSE2) {
            if (!unaligned) {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movdqa", "movntdq");
            } else {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movdqa", "movdqu");
            }
        }
#endif

        for (; x < width; x++)
            dst[x] = src[x];

        src += src_pitch;
        dst += dst_pitch;
    }
}
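/* Copy2d is the second half of the bounce: it reads from the cache (ordinary
 * cacheable memory, hence plain "movdqa" loads) and, when SSE2 is available
 * and the destination is aligned, writes with non-temporal "movntdq" stores
 * so the destination picture does not pollute the CPU caches. */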
static void SplitUV(uint8_t *dstu, size_t dstu_pitch,
                    uint8_t *dstv, size_t dstv_pitch,
                    const uint8_t *src, size_t src_pitch,
                    unsigned width, unsigned height, unsigned cpu)
{
    const uint8_t shuffle[] = { 0, 2, 4, 6, 8, 10, 12, 14,
                                1, 3, 5, 7, 9, 11, 13, 15 };
    const uint8_t mask[] = { 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00,
                             0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00 };

    assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);

    ASM_SSE2(cpu, "mfence");

    for (unsigned y = 0; y < height; y++) {
        unsigned x = 0;

#define LOAD64 \
    "movdqa  0(%[src]), %%xmm0\n" \
    "movdqa 16(%[src]), %%xmm1\n" \
    "movdqa 32(%[src]), %%xmm2\n" \
    "movdqa 48(%[src]), %%xmm3\n"

#define STORE2X32 \
    "movq   %%xmm0,   0(%[dst1])\n" \
    "movq   %%xmm1,   8(%[dst1])\n" \
    "movhpd %%xmm0,   0(%[dst2])\n" \
    "movhpd %%xmm1,   8(%[dst2])\n" \
    "movq   %%xmm2,  16(%[dst1])\n" \
    "movq   %%xmm3,  24(%[dst1])\n" \
    "movhpd %%xmm2,  16(%[dst2])\n" \
    "movhpd %%xmm3,  24(%[dst2])\n"

#ifdef CAN_COMPILE_SSSE3
        if (cpu & CPU_CAPABILITY_SSSE3) {
            for (x = 0; x < (width & ~31); x += 32) {
                asm volatile (
                    "movdqu (%[shuffle]), %%xmm7\n"
                    LOAD64
                    "pshufb  %%xmm7, %%xmm0\n"
                    "pshufb  %%xmm7, %%xmm1\n"
                    "pshufb  %%xmm7, %%xmm2\n"
                    "pshufb  %%xmm7, %%xmm3\n"
                    STORE2X32
                    : : [dst1]"r"(&dstu[x]), [dst2]"r"(&dstv[x]), [src]"r"(&src[2*x]), [shuffle]"r"(shuffle) : "memory");
            }
        } else
#endif
#ifdef CAN_COMPILE_SSE2
        if (cpu & CPU_CAPABILITY_SSE2) {
            for (x = 0; x < (width & ~31); x += 32) {
                asm volatile (
                    "movdqu (%[mask]), %%xmm7\n"
                    LOAD64
                    "movdqa   %%xmm0, %%xmm4\n"
                    "movdqa   %%xmm1, %%xmm5\n"
                    "movdqa   %%xmm2, %%xmm6\n"
                    "psrlw    $8,     %%xmm0\n"
                    "psrlw    $8,     %%xmm1\n"
                    "pand     %%xmm7, %%xmm4\n"
                    "pand     %%xmm7, %%xmm5\n"
                    "pand     %%xmm7, %%xmm6\n"
                    "packuswb %%xmm4, %%xmm0\n"
                    "packuswb %%xmm5, %%xmm1\n"
                    "pand     %%xmm3, %%xmm7\n"
                    "psrlw    $8,     %%xmm2\n"
                    "psrlw    $8,     %%xmm3\n"
                    "packuswb %%xmm6, %%xmm2\n"
                    "packuswb %%xmm7, %%xmm3\n"
                    STORE2X32
                    : : [dst2]"r"(&dstu[x]), [dst1]"r"(&dstv[x]), [src]"r"(&src[2*x]), [mask]"r"(mask) : "memory");
            }
        }
#endif
#undef STORE2X32
#undef LOAD64

        for (; x < width; x++) {
            dstu[x] = src[2*x+0];
            dstv[x] = src[2*x+1];
        }
        src += src_pitch;
        dstu += dstu_pitch;
        dstv += dstv_pitch;
    }
}
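/* SplitUV de-interleaves a packed chroma plane (U0 V0 U1 V1 ...) into two
 * separate planes: with SSSE3 a single "pshufb" sorts the even and odd bytes
 * of each 16-byte block, while the SSE2 fallback uses mask/shift/pack; any
 * remaining pixels are handled by the scalar tail loop. */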
static void CopyPlane(uint8_t *dst, size_t dst_pitch, const uint8_t *src, size_t src_pitch,
                      uint8_t *cache, size_t cache_size,
                      unsigned width, unsigned height,
                      unsigned cpu)
{
    const unsigned w16 = (width+15) & ~15;
    const unsigned hstep = cache_size / w16;
    assert(hstep > 0);

    for (unsigned y = 0; y < height; y += hstep) {
        const unsigned hblock = __MIN(hstep, height - y);

        /* Copy a bunch of lines into our cache */
        CopyFromUswc(cache, w16,
                     src, src_pitch,
                     width, hblock, cpu);

        /* Copy from our cache to the destination */
        Copy2d(dst, dst_pitch,
               cache, w16,
               width, hblock, cpu);

        /* */
        src += src_pitch * hblock;
        dst += dst_pitch * hblock;
    }

    ASM_SSE2(cpu, "mfence");
}
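/* Worked example, assuming the 4096-byte cache allocated by CopyInitCache()
 * below and a 1920-pixel wide luma plane: w16 = 1920 and hstep = 4096 / 1920
 * = 2, so the plane is bounced through the cache two lines at a time. */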
static void SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
                        uint8_t *dstv, size_t dstv_pitch,
                        const uint8_t *src, size_t src_pitch,
                        uint8_t *cache, size_t cache_size,
                        unsigned width, unsigned height,
                        unsigned cpu)
{
    const unsigned w2_16 = (2*width+15) & ~15;
    const unsigned hstep = cache_size / w2_16;
    assert(hstep > 0);

    for (unsigned y = 0; y < height; y += hstep) {
        const unsigned hblock = __MIN(hstep, height - y);

        /* Copy a bunch of lines into our cache */
        CopyFromUswc(cache, w2_16,
                     src, src_pitch,
                     2*width, hblock, cpu);

        /* Split our cache into the two destination planes */
        SplitUV(dstu, dstu_pitch,
                dstv, dstv_pitch,
                cache, w2_16,
                width, hblock, cpu);

        /* */
        src += src_pitch * hblock;
        dstu += dstu_pitch * hblock;
        dstv += dstv_pitch * hblock;
    }

    ASM_SSE2(cpu, "mfence");
}
int CopyInitCache(copy_cache_t *cache, unsigned width)
{
    cache->size = __MAX((width + 0x0f) & ~0x0f, 4096);
    cache->buffer = vlc_memalign(&cache->base, 16, cache->size);
    if (!cache->base)
        return VLC_EGENERIC;
    return VLC_SUCCESS;
}

void CopyCleanCache(copy_cache_t *cache)
{
    free(cache->base);

    cache->base   = NULL;
    cache->buffer = NULL;
    cache->size   = 0;
}
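/* Illustrative sketch only: one plausible lifecycle for the cache together
 * with CopyFromNv12() defined below. ExampleCopyOneFrame and its parameters
 * are hypothetical caller-side names, not part of this file; real callers
 * keep the cache alive across frames instead of rebuilding it each time. */
#if 0
static void ExampleCopyOneFrame(picture_t *pic, uint8_t *plane[2],
                                size_t pitch[2], unsigned width, unsigned height)
{
    copy_cache_t cache;

    if (CopyInitCache(&cache, width))   /* normally done once, at decoder open */
        return;

    /* plane[2]/pitch[2] describe the NV12 surface returned by the decoder */
    CopyFromNv12(pic, plane, pitch, width, height, &cache);

    CopyCleanCache(&cache);             /* normally done once, at decoder close */
}
#endif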
void CopyFromNv12(picture_t *dst, uint8_t *src[2], size_t src_pitch[2],
                  unsigned width, unsigned height,
                  copy_cache_t *cache)
{
    const unsigned cpu = vlc_CPU();

    /* */
    CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
              src[0], src_pitch[0],
              cache->buffer, cache->size,
              width, height, cpu);
    SplitPlanes(dst->p[2].p_pixels, dst->p[2].i_pitch,
                dst->p[1].p_pixels, dst->p[1].i_pitch,
                src[1], src_pitch[1],
                cache->buffer, cache->size,
                width/2, height/2, cpu);

    ASM_SSE2(cpu, "emms");
}
void CopyFromYv12(picture_t *dst, uint8_t *src[3], size_t src_pitch[3],
                  unsigned width, unsigned height,
                  copy_cache_t *cache)
{
    const unsigned cpu = vlc_CPU();

    /* */
    for (unsigned n = 0; n < 3; n++) {
        const unsigned d = n > 0 ? 2 : 1;
        CopyPlane(dst->p[n].p_pixels, dst->p[n].i_pitch,
                  src[n], src_pitch[n],
                  cache->buffer, cache->size,
                  width/d, height/d, cpu);
    }
    ASM_SSE2(cpu, "emms");
}
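/* The two entry points differ only in the source layout: CopyFromNv12() takes
 * a bi-planar NV12 surface (luma plane plus interleaved chroma) and splits the
 * chroma while copying, whereas CopyFromYv12() copies three already-planar
 * planes, halving width and height for the two chroma planes. */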
#undef ASM_SSE2
#undef COPY64