libvo/aclib_template.c

   1 /*
   2  * aclib - advanced C library ;)
   3  * functions which improve and expand the standard C library
   4  *
   5  * This file is part of MPlayer.
   6  *
   7  * MPlayer is free software; you can redistribute it and/or modify
   8  * it under the terms of the GNU General Public License as published by
   9  * the Free Software Foundation; either version 2 of the License, or
  10  * (at your option) any later version.
  11  *
  12  * MPlayer is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15  * GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License along
  18  * with MPlayer; if not, write to the Free Software Foundation, Inc.,
  19  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  20  */
  21
  22 #if !HAVE_SSE2
  23 /*
  24    P3 processor has only one SSE decoder so can execute only 1 sse insn per
  25    cpu clock, but it has 3 mmx decoders (include load/store unit)
  26    and executes 3 mmx insns per cpu clock.
  27    P4 processor has some chances, but after reading:
  28    http://www.emulators.com/pentium4.htm
  29    I have doubts. Anyway SSE2 version of this code can be written better.
  30 */
  31 #undef HAVE_SSE
  32 #define HAVE_SSE 0
  33 #endif
  34
  35
  36 /*
  37  This part of code was taken by me from Linux-2.4.3 and slightly modified
  38 for MMX, MMX2, SSE instruction set. I have done it since linux uses page aligned
  39 blocks but mplayer uses weakly ordered data and original sources can not
  40 speedup them. Only using PREFETCHNTA and MOVNTQ together have effect!
  41
  42 >From IA-32 Intel Architecture Software Developer's Manual Volume 1,
  43
  44 Order Number 245470:
  45 "10.4.6. Cacheability Control, Prefetch, and Memory Ordering Instructions"
  46
  47 Data referenced by a program can be temporal (data will be used again) or
  48 non-temporal (data will be referenced once and not reused in the immediate
  49 future). To make efficient use of the processor's caches, it is generally
  50 desirable to cache temporal data and not cache non-temporal data. Overloading
  51 the processor's caches with non-temporal data is sometimes referred to as
  52 "polluting the caches".
  53 The non-temporal data is written to memory with Write-Combining semantics.
  54
  55 The PREFETCHh instructions permits a program to load data into the processor
  56 at a suggested cache level, so that it is closer to the processors load and
  57 store unit when it is needed. If the data is already present in a level of
  58 the cache hierarchy that is closer to the processor, the PREFETCHh instruction
  59 will not result in any data movement.
  60 But we should you PREFETCHNTA: Non-temporal data fetch data into location
  61 close to the processor, minimizing cache pollution.
  62
  63 The MOVNTQ (store quadword using non-temporal hint) instruction stores
  64 packed integer data from an MMX register to memory, using a non-temporal hint.
  65 The MOVNTPS (store packed single-precision floating-point values using
  66 non-temporal hint) instruction stores packed floating-point data from an
  67 XMM register to memory, using a non-temporal hint.
  68
  69 The SFENCE (Store Fence) instruction controls write ordering by creating a
  70 fence for memory store operations. This instruction guarantees that the results
  71 of every store instruction that precedes the store fence in program order is
  72 globally visible before any store instruction that follows the fence. The
  73 SFENCE instruction provides an efficient way of ensuring ordering between
  74 procedures that produce weakly-ordered data and procedures that consume that
  75 data.
  76
  77 If you have questions please contact with me: Nick Kurshev: nickols_k@mail.ru.
  78 */
  79
  80 // 3dnow memcpy support from kernel 2.4.2
  81 //  by Pontscho/fresh!mindworkz
  82
  83
  84 #undef HAVE_ONLY_MMX1
  85 #if HAVE_MMX && !HAVE_MMX2 && !HAVE_AMD3DNOW && !HAVE_SSE
  86 /*  means: mmx v.1. Note: Since we added alignment of destinition it speedups
  87     of memory copying on PentMMX, Celeron-1 and P2 upto 12% versus
  88     standard (non MMX-optimized) version.
  89     Note: on K6-2+ it speedups memory copying upto 25% and
  90           on K7 and P3 about 500% (5 times). */
  91 #define HAVE_ONLY_MMX1
  92 #endif
  93
  94
  95 #undef HAVE_K6_2PLUS
  96 #if !HAVE_MMX2 && HAVE_AMD3DNOW
  97 #define HAVE_K6_2PLUS
  98 #endif
  99
 100 /* for small memory blocks (<256 bytes) this version is faster */
 101 #define small_memcpy(to,from,n)\
 102 {\
 103 register x86_reg dummy;\
 104 __asm__ volatile(\
 105         "rep; movsb"\
 106         :"=&D"(to), "=&S"(from), "=&c"(dummy)\
 107 /* It's most portable way to notify compiler */\
 108 /* that edi, esi and ecx are clobbered in asm block. */\
 109 /* Thanks to A'rpi for hint!!! */\
 110         :"0" (to), "1" (from),"2" (n)\
 111         : "memory");\
 112 }
 113
 114 #undef MMREG_SIZE
 115 #if HAVE_SSE
 116 #define MMREG_SIZE 16
 117 #else
 118 #define MMREG_SIZE 64 //8
 119 #endif
 120
 121 #undef PREFETCH
 122 #undef EMMS
 123
 124 #if HAVE_MMX2
 125 #define PREFETCH "prefetchnta"
 126 #elif HAVE_AMD3DNOW
 127 #define PREFETCH  "prefetch"
 128 #else
 129 #define PREFETCH " # nop"
 130 #endif
 131
 132 /* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */
 133 #if HAVE_AMD3DNOW
 134 #define EMMS     "femms"
 135 #else
 136 #define EMMS     "emms"
 137 #endif
 138
 139 #undef MOVNTQ
 140 #if HAVE_MMX2
 141 #define MOVNTQ "movntq"
 142 #else
 143 #define MOVNTQ "movq"
 144 #endif
 145
 146 #undef MIN_LEN
 147 #ifdef HAVE_ONLY_MMX1
 148 #define MIN_LEN 0x800  /* 2K blocks */
 149 #else
 150 #define MIN_LEN 0x40  /* 64-byte blocks */
 151 #endif
 152
 153 static void * RENAME(fast_memcpy)(void * to, const void * from, size_t len)
 154 {
 155         void *retval;
 156         size_t i;
 157         retval = to;
 158 #ifdef STATISTICS
 159         {
 160                 static int freq[33];
 161                 static int t=0;
 162                 int i;
 163                 for(i=0; len>(1<<i); i++);
 164                 freq[i]++;
 165                 t++;
 166                 if(1024*1024*1024 % t == 0)
 167                         for(i=0; i<32; i++)
 168                                 printf("freq < %8d %4d\n", 1<<i, freq[i]);
 169         }
 170 #endif
 171 #ifndef HAVE_ONLY_MMX1
 172         /* PREFETCH has effect even for MOVSB instruction ;) */
 173         __asm__ volatile (
 174                 PREFETCH" (%0)\n"
 175                 PREFETCH" 64(%0)\n"
 176                 PREFETCH" 128(%0)\n"
 177                 PREFETCH" 192(%0)\n"
 178                 PREFETCH" 256(%0)\n"
 179                 : : "r" (from) );
 180 #endif
 181         if(len >= MIN_LEN)
 182         {
 183           register x86_reg delta;
 184           /* Align destinition to MMREG_SIZE -boundary */
 185           delta = ((intptr_t)to)&(MMREG_SIZE-1);
 186           if(delta)
 187           {
 188             delta=MMREG_SIZE-delta;
 189             len -= delta;
 190             small_memcpy(to, from, delta);
 191           }
 192           i = len >> 6; /* len/64 */
 193           len&=63;
 194         /*
 195            This algorithm is top effective when the code consequently
 196            reads and writes blocks which have size of cache line.
 197            Size of cache line is processor-dependent.
 198            It will, however, be a minimum of 32 bytes on any processors.
 199            It would be better to have a number of instructions which
 200            perform reading and writing to be multiple to a number of
 201            processor's decoders, but it's not always possible.
 202         */
 203 #if HAVE_SSE /* Only P3 (may be Cyrix3) */
 204         if(((intptr_t)from) & 15)
 205         /* if SRC is misaligned */
 206         for(; i>0; i--)
 207         {
 208                 __asm__ volatile (
 209                 PREFETCH" 320(%0)\n"
 210                 "movups (%0), %%xmm0\n"
 211                 "movups 16(%0), %%xmm1\n"
 212                 "movups 32(%0), %%xmm2\n"
 213                 "movups 48(%0), %%xmm3\n"
 214                 "movntps %%xmm0, (%1)\n"
 215                 "movntps %%xmm1, 16(%1)\n"
 216                 "movntps %%xmm2, 32(%1)\n"
 217                 "movntps %%xmm3, 48(%1)\n"
 218                 :: "r" (from), "r" (to) : "memory");
 219                 from=((const unsigned char *) from)+64;
 220                 to=((unsigned char *)to)+64;
 221         }
 222         else
 223         /*
 224            Only if SRC is aligned on 16-byte boundary.
 225            It allows to use movaps instead of movups, which required data
 226            to be aligned or a general-protection exception (#GP) is generated.
 227         */
 228         for(; i>0; i--)
 229         {
 230                 __asm__ volatile (
 231                 PREFETCH" 320(%0)\n"
 232                 "movaps (%0), %%xmm0\n"
 233                 "movaps 16(%0), %%xmm1\n"
 234                 "movaps 32(%0), %%xmm2\n"
 235                 "movaps 48(%0), %%xmm3\n"
 236                 "movntps %%xmm0, (%1)\n"
 237                 "movntps %%xmm1, 16(%1)\n"
 238                 "movntps %%xmm2, 32(%1)\n"
 239                 "movntps %%xmm3, 48(%1)\n"
 240                 :: "r" (from), "r" (to) : "memory");
 241                 from=((const unsigned char *)from)+64;
 242                 to=((unsigned char *)to)+64;
 243         }
 244 #else
 245         // Align destination at BLOCK_SIZE boundary
 246         for(; ((intptr_t)to & (BLOCK_SIZE-1)) && i>0; i--)
 247         {
 248                 __asm__ volatile (
 249 #ifndef HAVE_ONLY_MMX1
 250                 PREFETCH" 320(%0)\n"
 251 #endif
 252                 "movq (%0), %%mm0\n"
 253                 "movq 8(%0), %%mm1\n"
 254                 "movq 16(%0), %%mm2\n"
 255                 "movq 24(%0), %%mm3\n"
 256                 "movq 32(%0), %%mm4\n"
 257                 "movq 40(%0), %%mm5\n"
 258                 "movq 48(%0), %%mm6\n"
 259                 "movq 56(%0), %%mm7\n"
 260                 MOVNTQ" %%mm0, (%1)\n"
 261                 MOVNTQ" %%mm1, 8(%1)\n"
 262                 MOVNTQ" %%mm2, 16(%1)\n"
 263                 MOVNTQ" %%mm3, 24(%1)\n"
 264                 MOVNTQ" %%mm4, 32(%1)\n"
 265                 MOVNTQ" %%mm5, 40(%1)\n"
 266                 MOVNTQ" %%mm6, 48(%1)\n"
 267                 MOVNTQ" %%mm7, 56(%1)\n"
 268                 :: "r" (from), "r" (to) : "memory");
 269                 from=((const unsigned char *)from)+64;
 270                 to=((unsigned char *)to)+64;
 271         }
 272
 273 //      printf(" %d %d\n", (int)from&1023, (int)to&1023);
 274         // Pure Assembly cuz gcc is a bit unpredictable ;)
 275         if(i>=BLOCK_SIZE/64)
 276                 __asm__ volatile(
 277                         "xor %%"REG_a", %%"REG_a"       \n\t"
 278                         ASMALIGN(4)
 279                         "1:                     \n\t"
 280                                 "movl (%0, %%"REG_a"), %%ecx    \n\t"
 281                                 "movl 32(%0, %%"REG_a"), %%ecx  \n\t"
 282                                 "movl 64(%0, %%"REG_a"), %%ecx  \n\t"
 283                                 "movl 96(%0, %%"REG_a"), %%ecx  \n\t"
 284                                 "add $128, %%"REG_a"            \n\t"
 285                                 "cmp %3, %%"REG_a"              \n\t"
 286                                 " jb 1b                         \n\t"
 287
 288                         "xor %%"REG_a", %%"REG_a"       \n\t"
 289
 290                                 ASMALIGN(4)
 291                                 "2:                     \n\t"
 292                                 "movq (%0, %%"REG_a"), %%mm0\n"
 293                                 "movq 8(%0, %%"REG_a"), %%mm1\n"
 294                                 "movq 16(%0, %%"REG_a"), %%mm2\n"
 295                                 "movq 24(%0, %%"REG_a"), %%mm3\n"
 296                                 "movq 32(%0, %%"REG_a"), %%mm4\n"
 297                                 "movq 40(%0, %%"REG_a"), %%mm5\n"
 298                                 "movq 48(%0, %%"REG_a"), %%mm6\n"
 299                                 "movq 56(%0, %%"REG_a"), %%mm7\n"
 300                                 MOVNTQ" %%mm0, (%1, %%"REG_a")\n"
 301                                 MOVNTQ" %%mm1, 8(%1, %%"REG_a")\n"
 302                                 MOVNTQ" %%mm2, 16(%1, %%"REG_a")\n"
 303                                 MOVNTQ" %%mm3, 24(%1, %%"REG_a")\n"
 304                                 MOVNTQ" %%mm4, 32(%1, %%"REG_a")\n"
 305                                 MOVNTQ" %%mm5, 40(%1, %%"REG_a")\n"
 306                                 MOVNTQ" %%mm6, 48(%1, %%"REG_a")\n"
 307                                 MOVNTQ" %%mm7, 56(%1, %%"REG_a")\n"
 308                                 "add $64, %%"REG_a"             \n\t"
 309                                 "cmp %3, %%"REG_a"              \n\t"
 310                                 "jb 2b                          \n\t"
 311
 312 #if CONFUSION_FACTOR > 0
 313         // a few percent speedup on out of order executing CPUs
 314                         "mov %5, %%"REG_a"              \n\t"
 315                                 "2:                     \n\t"
 316                                 "movl (%0), %%ecx       \n\t"
 317                                 "movl (%0), %%ecx       \n\t"
 318                                 "movl (%0), %%ecx       \n\t"
 319                                 "movl (%0), %%ecx       \n\t"
 320                                 "dec %%"REG_a"          \n\t"
 321                                 " jnz 2b                \n\t"
 322 #endif
 323
 324                         "xor %%"REG_a", %%"REG_a"       \n\t"
 325                         "add %3, %0             \n\t"
 326                         "add %3, %1             \n\t"
 327                         "sub %4, %2             \n\t"
 328                         "cmp %4, %2             \n\t"
 329                         " jae 1b                \n\t"
 330                                 : "+r" (from), "+r" (to), "+r" (i)
 331                                 : "r" ((x86_reg)BLOCK_SIZE), "i" (BLOCK_SIZE/64), "i" ((x86_reg)CONFUSION_FACTOR)
 332                                 : "%"REG_a, "%ecx"
 333                 );
 334
 335         for(; i>0; i--)
 336         {
 337                 __asm__ volatile (
 338 #ifndef HAVE_ONLY_MMX1
 339                 PREFETCH" 320(%0)\n"
 340 #endif
 341                 "movq (%0), %%mm0\n"
 342                 "movq 8(%0), %%mm1\n"
 343                 "movq 16(%0), %%mm2\n"
 344                 "movq 24(%0), %%mm3\n"
 345                 "movq 32(%0), %%mm4\n"
 346                 "movq 40(%0), %%mm5\n"
 347                 "movq 48(%0), %%mm6\n"
 348                 "movq 56(%0), %%mm7\n"
 349                 MOVNTQ" %%mm0, (%1)\n"
 350                 MOVNTQ" %%mm1, 8(%1)\n"
 351                 MOVNTQ" %%mm2, 16(%1)\n"
 352                 MOVNTQ" %%mm3, 24(%1)\n"
 353                 MOVNTQ" %%mm4, 32(%1)\n"
 354                 MOVNTQ" %%mm5, 40(%1)\n"
 355                 MOVNTQ" %%mm6, 48(%1)\n"
 356                 MOVNTQ" %%mm7, 56(%1)\n"
 357                 :: "r" (from), "r" (to) : "memory");
 358                 from=((const unsigned char *)from)+64;
 359                 to=((unsigned char *)to)+64;
 360         }
 361
 362 #endif /* Have SSE */
 363 #if HAVE_MMX2
 364                 /* since movntq is weakly-ordered, a "sfence"
 365                  * is needed to become ordered again. */
 366                 __asm__ volatile ("sfence":::"memory");
 367 #endif
 368 #if !HAVE_SSE
 369                 /* enables to use FPU */
 370                 __asm__ volatile (EMMS:::"memory");
 371 #endif
 372         }
 373         /*
 374          *      Now do the tail of the block
 375          */
 376         if(len) small_memcpy(to, from, len);
 377         return retval;
 378 }
 379
 380 /**
 381  * special copy routine for mem -> agp/pci copy (based upon fast_memcpy)
 382  */
 383 static void * RENAME(mem2agpcpy)(void * to, const void * from, size_t len)
 384 {
 385         void *retval;
 386         size_t i;
 387         retval = to;
 388 #ifdef STATISTICS
 389         {
 390                 static int freq[33];
 391                 static int t=0;
 392                 int i;
 393                 for(i=0; len>(1<<i); i++);
 394                 freq[i]++;
 395                 t++;
 396                 if(1024*1024*1024 % t == 0)
 397                         for(i=0; i<32; i++)
 398                                 printf("mem2agp freq < %8d %4d\n", 1<<i, freq[i]);
 399         }
 400 #endif
 401         if(len >= MIN_LEN)
 402         {
 403           register x86_reg delta;
 404           /* Align destinition to MMREG_SIZE -boundary */
 405           delta = ((intptr_t)to)&7;
 406           if(delta)
 407           {
 408             delta=8-delta;
 409             len -= delta;
 410             small_memcpy(to, from, delta);
 411           }
 412           i = len >> 6; /* len/64 */
 413           len &= 63;
 414         /*
 415            This algorithm is top effective when the code consequently
 416            reads and writes blocks which have size of cache line.
 417            Size of cache line is processor-dependent.
 418            It will, however, be a minimum of 32 bytes on any processors.
 419            It would be better to have a number of instructions which
 420            perform reading and writing to be multiple to a number of
 421            processor's decoders, but it's not always possible.
 422         */
 423         for(; i>0; i--)
 424         {
 425                 __asm__ volatile (
 426                 PREFETCH" 320(%0)\n"
 427                 "movq (%0), %%mm0\n"
 428                 "movq 8(%0), %%mm1\n"
 429                 "movq 16(%0), %%mm2\n"
 430                 "movq 24(%0), %%mm3\n"
 431                 "movq 32(%0), %%mm4\n"
 432                 "movq 40(%0), %%mm5\n"
 433                 "movq 48(%0), %%mm6\n"
 434                 "movq 56(%0), %%mm7\n"
 435                 MOVNTQ" %%mm0, (%1)\n"
 436                 MOVNTQ" %%mm1, 8(%1)\n"
 437                 MOVNTQ" %%mm2, 16(%1)\n"
 438                 MOVNTQ" %%mm3, 24(%1)\n"
 439                 MOVNTQ" %%mm4, 32(%1)\n"
 440                 MOVNTQ" %%mm5, 40(%1)\n"
 441                 MOVNTQ" %%mm6, 48(%1)\n"
 442                 MOVNTQ" %%mm7, 56(%1)\n"
 443                 :: "r" (from), "r" (to) : "memory");
 444                 from=((const unsigned char *)from)+64;
 445                 to=((unsigned char *)to)+64;
 446         }
 447 #if HAVE_MMX2
 448                 /* since movntq is weakly-ordered, a "sfence"
 449                  * is needed to become ordered again. */
 450                 __asm__ volatile ("sfence":::"memory");
 451 #endif
 452                 /* enables to use FPU */
 453                 __asm__ volatile (EMMS:::"memory");
 454         }
 455         /*
 456          *      Now do the tail of the block
 457          */
 458         if(len) small_memcpy(to, from, len);
 459         return retval;
 460 }