2 aclib - advanced C library ;)
3 This file contains functions which improve and expand standard C-library
8 P3 processor has only one SSE decoder so can execute only 1 sse insn per
9 cpu clock, but it has 3 mmx decoders (include load/store unit)
10 and executes 3 mmx insns per cpu clock.
11 P4 processor has some chances, but after reading:
12 http://www.emulators.com/pentium4.htm
13 I have doubts. Anyway SSE2 version of this code can be written better.
21 This part of code was taken by me from Linux-2.4.3 and slightly modified
22 for MMX, MMX2, SSE instruction set. I have done it since linux uses page aligned
23 blocks but mplayer uses weakly ordered data and the original sources cannot
24 speed them up. Only using PREFETCHNTA and MOVNTQ together has an effect!
26 From IA-32 Intel Architecture Software Developer's Manual Volume 1,
29 "10.4.6. Cacheability Control, Prefetch, and Memory Ordering Instructions"
31 Data referenced by a program can be temporal (data will be used again) or
32 non-temporal (data will be referenced once and not reused in the immediate
33 future). To make efficient use of the processor's caches, it is generally
34 desirable to cache temporal data and not cache non-temporal data. Overloading
35 the processor's caches with non-temporal data is sometimes referred to as
36 "polluting the caches".
37 The non-temporal data is written to memory with Write-Combining semantics.
39 The PREFETCHh instruction permits a program to load data into the processor
40 at a suggested cache level, so that it is closer to the processor's load and
41 store unit when it is needed. If the data is already present in a level of
42 the cache hierarchy that is closer to the processor, the PREFETCHh instruction
43 will not result in any data movement.
44 But we should use PREFETCHNTA: it fetches non-temporal data into a location
45 close to the processor, minimizing cache pollution.
47 The MOVNTQ (store quadword using non-temporal hint) instruction stores
48 packed integer data from an MMX register to memory, using a non-temporal hint.
49 The MOVNTPS (store packed single-precision floating-point values using
50 non-temporal hint) instruction stores packed floating-point data from an
51 XMM register to memory, using a non-temporal hint.
53 The SFENCE (Store Fence) instruction controls write ordering by creating a
54 fence for memory store operations. This instruction guarantees that the results
55 of every store instruction that precedes the store fence in program order are
56 globally visible before any store instruction that follows the fence. The
57 SFENCE instruction provides an efficient way of ensuring ordering between
58 procedures that produce weakly-ordered data and procedures that consume that
61 If you have questions please contact me: Nick Kurshev: nickols_k@mail.ru.
64 // 3dnow memcpy support from kernel 2.4.2
65 // by Pontscho/fresh!mindworkz
69 #if HAVE_MMX && !HAVE_MMX2 && !HAVE_AMD3DNOW && !HAVE_SSE
70 /* means: mmx v.1. Note: Since we added alignment of the destination, this
71 speeds up memory copying on PentMMX, Celeron-1 and P2 by up to 12% versus
72 the standard (non-MMX-optimized) version.
73 Note: on K6-2+ it speeds up memory copying by up to 25% and
74 on K7 and P3 by about 500% (5 times). */
75 #define HAVE_ONLY_MMX1
80 #if !HAVE_MMX2 && HAVE_AMD3DNOW
84 /* for small memory blocks (<256 bytes) this version is faster */
/* NOTE(review): the asm instruction itself (a "rep; movsb" in the original
   source) is missing from this excerpt -- only the constraint lists remain.
   The constraints bind to -> %edi ("=&D"/"0"), from -> %esi ("=&S"/"1") and
   n -> %ecx ("=&c"/"2"), the classic movsb setup.  Because `to` and `from`
   are also outputs, the macro writes the advanced pointers back into its
   arguments -- callers must not assume they are left untouched. */
85 #define small_memcpy(to,from,n)\
87 register unsigned long int dummy;\
90 :"=&D"(to), "=&S"(from), "=&c"(dummy)\
91 /* It's most portable way to notify compiler */\
92 /* that edi, esi and ecx are clobbered in asm block. */\
93 /* Thanks to A'rpi for hint!!! */\
94 :"0" (to), "1" (from),"2" (n)\
100 #define MMREG_SIZE 16
102 #define MMREG_SIZE 64 //8
109 #define PREFETCH "prefetchnta"
111 #define PREFETCH "prefetch"
113 #define PREFETCH " # nop"
116 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
125 #define MOVNTQ "movntq"
127 #define MOVNTQ "movq"
131 #ifdef HAVE_ONLY_MMX1
132 #define MIN_LEN 0x800 /* 2K blocks */
134 #define MIN_LEN 0x40 /* 64-byte blocks */
/*
 * RENAME(fast_memcpy) -- CPU-flavour-specific memcpy replacement
 * (flavour selected by the HAVE_MMX/MMX2/AMD3DNOW/SSE template macros).
 * Copies `len` bytes from `from` to `to`; presumably returns `to` like
 * standard memcpy (the return statement is not visible in this excerpt).
 *
 * NOTE(review): this excerpt is missing interior lines -- the original
 * file's line numbers fused onto each line are discontinuous, so asm
 * statement heads, braces and some control flow are absent.  Code below
 * is preserved byte-for-byte; only comments were added.
 */
137 static void * RENAME(fast_memcpy
)(void * to
, const void * from
, size_t len
)
/* NOTE(review): fragment of optional benchmarking code -- buckets `len`
 * into a power-of-two histogram `freq` and periodically prints it
 * (presumably guarded by a statistics #ifdef not visible here). */
147 for(i
=0; len
>(1<<i
); i
++);
150 if(1024*1024*1024 % t
== 0)
152 printf("freq < %8d %4d\n", 1<<i
, freq
[i
]);
155 #ifndef HAVE_ONLY_MMX1
156 /* PREFETCH has effect even for MOVSB instruction ;) */
167 register unsigned long int delta
;
168 /* Align destination to MMREG_SIZE -boundary */
169 delta
= ((unsigned long int)to
)&(MMREG_SIZE
-1);
172 delta
=MMREG_SIZE
-delta
;
/* Copy the first `delta` bytes bytewise so the main loop stores aligned. */
174 small_memcpy(to
, from
, delta
);
176 i
= len
>> 6; /* len/64 */
179 This algorithm is top effective when the code consequently
180 reads and writes blocks which have size of cache line.
181 Size of cache line is processor-dependent.
182 It will, however, be a minimum of 32 bytes on any processors.
183 It would be better to have a number of instructions which
184 perform reading and writing to be multiple to a number of
185 processor's decoders, but it's not always possible.
/* Main copy loops: 64 bytes per iteration.  The SSE path uses
 * movups/movaps loads with movntps non-temporal stores; the MMX path
 * uses movq loads with MOVNTQ stores (movntq where available, a plain
 * movq otherwise -- see the MOVNTQ #define above). */
187 #if HAVE_SSE /* Only P3 (may be Cyrix3) */
188 if(((unsigned long)from
) & 15)
189 /* if SRC is misaligned */
/* Unaligned source: movups tolerates any alignment. */
194 "movups (%0), %%xmm0\n"
195 "movups 16(%0), %%xmm1\n"
196 "movups 32(%0), %%xmm2\n"
197 "movups 48(%0), %%xmm3\n"
198 "movntps %%xmm0, (%1)\n"
199 "movntps %%xmm1, 16(%1)\n"
200 "movntps %%xmm2, 32(%1)\n"
201 "movntps %%xmm3, 48(%1)\n"
202 :: "r" (from
), "r" (to
) : "memory");
203 from
=((const unsigned char *) from
)+64;
204 to
=((unsigned char *)to
)+64;
208 Only if SRC is aligned on 16-byte boundary.
209 It allows using movaps instead of movups; movaps requires the data
210 to be aligned, otherwise a general-protection exception (#GP) is generated.
216 "movaps (%0), %%xmm0\n"
217 "movaps 16(%0), %%xmm1\n"
218 "movaps 32(%0), %%xmm2\n"
219 "movaps 48(%0), %%xmm3\n"
220 "movntps %%xmm0, (%1)\n"
221 "movntps %%xmm1, 16(%1)\n"
222 "movntps %%xmm2, 32(%1)\n"
223 "movntps %%xmm3, 48(%1)\n"
224 :: "r" (from
), "r" (to
) : "memory");
225 from
=((const unsigned char *)from
)+64;
226 to
=((unsigned char *)to
)+64;
229 // Align destination at BLOCK_SIZE boundary
230 for(; ((int)to
& (BLOCK_SIZE
-1)) && i
>0; i
--)
233 #ifndef HAVE_ONLY_MMX1
/* One 64-byte chunk through mm0-mm7 until `to` hits a BLOCK_SIZE boundary. */
237 "movq 8(%0), %%mm1\n"
238 "movq 16(%0), %%mm2\n"
239 "movq 24(%0), %%mm3\n"
240 "movq 32(%0), %%mm4\n"
241 "movq 40(%0), %%mm5\n"
242 "movq 48(%0), %%mm6\n"
243 "movq 56(%0), %%mm7\n"
244 MOVNTQ
" %%mm0, (%1)\n"
245 MOVNTQ
" %%mm1, 8(%1)\n"
246 MOVNTQ
" %%mm2, 16(%1)\n"
247 MOVNTQ
" %%mm3, 24(%1)\n"
248 MOVNTQ
" %%mm4, 32(%1)\n"
249 MOVNTQ
" %%mm5, 40(%1)\n"
250 MOVNTQ
" %%mm6, 48(%1)\n"
251 MOVNTQ
" %%mm7, 56(%1)\n"
252 :: "r" (from
), "r" (to
) : "memory");
253 from
=((const unsigned char *)from
)+64;
254 to
=((unsigned char *)to
)+64;
257 // printf(" %d %d\n", (int)from&1023, (int)to&1023);
258 // Pure Assembly cuz gcc is a bit unpredictable ;)
/* Zero the index register REG_a (eax/rax -- TODO confirm REG_a definition)
 * used by both passes of the block loop below. */
261 "xor %%"REG_a
", %%"REG_a
" \n\t"
/* Pass 1: dummy loads into %ebx, one dword per 32-byte cache line,
 * stepping 128 bytes per iteration until %3 (BLOCK_SIZE) is reached --
 * this pre-reads the whole block into the cache before copying. */
264 "movl (%0, %%"REG_a
"), %%ebx \n\t"
265 "movl 32(%0, %%"REG_a
"), %%ebx \n\t"
266 "movl 64(%0, %%"REG_a
"), %%ebx \n\t"
267 "movl 96(%0, %%"REG_a
"), %%ebx \n\t"
268 "add $128, %%"REG_a
" \n\t"
269 "cmp %3, %%"REG_a
" \n\t"
272 "xor %%"REG_a
", %%"REG_a
" \n\t"
/* Pass 2: copy the now-cached block, 64 bytes per iteration through
 * mm0-mm7, storing with MOVNTQ (non-temporal where supported) so the
 * destination bypasses the cache. */
276 "movq (%0, %%"REG_a
"), %%mm0\n"
277 "movq 8(%0, %%"REG_a
"), %%mm1\n"
278 "movq 16(%0, %%"REG_a
"), %%mm2\n"
279 "movq 24(%0, %%"REG_a
"), %%mm3\n"
280 "movq 32(%0, %%"REG_a
"), %%mm4\n"
281 "movq 40(%0, %%"REG_a
"), %%mm5\n"
282 "movq 48(%0, %%"REG_a
"), %%mm6\n"
283 "movq 56(%0, %%"REG_a
"), %%mm7\n"
284 MOVNTQ
" %%mm0, (%1, %%"REG_a
")\n"
285 MOVNTQ
" %%mm1, 8(%1, %%"REG_a
")\n"
286 MOVNTQ
" %%mm2, 16(%1, %%"REG_a
")\n"
287 MOVNTQ
" %%mm3, 24(%1, %%"REG_a
")\n"
288 MOVNTQ
" %%mm4, 32(%1, %%"REG_a
")\n"
289 MOVNTQ
" %%mm5, 40(%1, %%"REG_a
")\n"
290 MOVNTQ
" %%mm6, 48(%1, %%"REG_a
")\n"
291 MOVNTQ
" %%mm7, 56(%1, %%"REG_a
")\n"
292 "add $64, %%"REG_a
" \n\t"
293 "cmp %3, %%"REG_a
" \n\t"
296 #if CONFUSION_FACTOR > 0
297 // a few percent speedup on out of order executing CPUs
298 "mov %5, %%"REG_a
" \n\t"
300 "movl (%0), %%ebx \n\t"
301 "movl (%0), %%ebx \n\t"
302 "movl (%0), %%ebx \n\t"
303 "movl (%0), %%ebx \n\t"
308 "xor %%"REG_a
", %%"REG_a
" \n\t"
/* from/to/i are read-write operands ("+r"): the asm advances them
 * in place.  %3 = BLOCK_SIZE, %4 = BLOCK_SIZE/64, %5 = CONFUSION_FACTOR. */
314 : "+r" (from
), "+r" (to
), "+r" (i
)
315 : "r" ((long)BLOCK_SIZE
), "i" (BLOCK_SIZE
/64), "i" ((long)CONFUSION_FACTOR
)
/* Copy any remaining 64-byte chunks that did not fill a whole
 * BLOCK_SIZE block (same 64-byte MMX copy body as above). */
322 #ifndef HAVE_ONLY_MMX1
326 "movq 8(%0), %%mm1\n"
327 "movq 16(%0), %%mm2\n"
328 "movq 24(%0), %%mm3\n"
329 "movq 32(%0), %%mm4\n"
330 "movq 40(%0), %%mm5\n"
331 "movq 48(%0), %%mm6\n"
332 "movq 56(%0), %%mm7\n"
333 MOVNTQ
" %%mm0, (%1)\n"
334 MOVNTQ
" %%mm1, 8(%1)\n"
335 MOVNTQ
" %%mm2, 16(%1)\n"
336 MOVNTQ
" %%mm3, 24(%1)\n"
337 MOVNTQ
" %%mm4, 32(%1)\n"
338 MOVNTQ
" %%mm5, 40(%1)\n"
339 MOVNTQ
" %%mm6, 48(%1)\n"
340 MOVNTQ
" %%mm7, 56(%1)\n"
341 :: "r" (from
), "r" (to
) : "memory");
342 from
=((const unsigned char *)from
)+64;
343 to
=((unsigned char *)to
)+64;
346 #endif /* Have SSE */
348 /* since movntq is weakly-ordered, a "sfence"
349 * is needed to become ordered again. */
350 __asm__
volatile ("sfence":::"memory");
353 /* EMMS/FEMMS: empty the MMX state so FPU code can run again */
354 __asm__
volatile (EMMS:::"memory");
358 * Now do the tail of the block
/* Copy the remaining len%64 tail bytewise with small_memcpy. */
360 if(len
) small_memcpy(to
, from
, len
);
365 * special copy routine for mem -> agp/pci copy (based upon fast_memcpy)
367 static void * RENAME(mem2agpcpy
)(void * to
, const void * from
, size_t len
)
377 for(i
=0; len
>(1<<i
); i
++);
380 if(1024*1024*1024 % t
== 0)
382 printf("mem2agp freq < %8d %4d\n", 1<<i
, freq
[i
]);
387 register unsigned long int delta
;
388 /* Align destination to MMREG_SIZE -boundary */
389 delta
= ((unsigned long int)to
)&7;
394 small_memcpy(to
, from
, delta
);
396 i
= len
>> 6; /* len/64 */
399 This algorithm is top effective when the code consequently
400 reads and writes blocks which have size of cache line.
401 Size of cache line is processor-dependent.
402 It will, however, be a minimum of 32 bytes on any processors.
403 It would be better to have a number of instructions which
404 perform reading and writing to be multiple to a number of
405 processor's decoders, but it's not always possible.
412 "movq 8(%0), %%mm1\n"
413 "movq 16(%0), %%mm2\n"
414 "movq 24(%0), %%mm3\n"
415 "movq 32(%0), %%mm4\n"
416 "movq 40(%0), %%mm5\n"
417 "movq 48(%0), %%mm6\n"
418 "movq 56(%0), %%mm7\n"
419 MOVNTQ
" %%mm0, (%1)\n"
420 MOVNTQ
" %%mm1, 8(%1)\n"
421 MOVNTQ
" %%mm2, 16(%1)\n"
422 MOVNTQ
" %%mm3, 24(%1)\n"
423 MOVNTQ
" %%mm4, 32(%1)\n"
424 MOVNTQ
" %%mm5, 40(%1)\n"
425 MOVNTQ
" %%mm6, 48(%1)\n"
426 MOVNTQ
" %%mm7, 56(%1)\n"
427 :: "r" (from
), "r" (to
) : "memory");
428 from
=((const unsigned char *)from
)+64;
429 to
=((unsigned char *)to
)+64;
432 /* since movntq is weakly-ordered, a "sfence"
433 * is needed to become ordered again. */
434 __asm__
volatile ("sfence":::"memory");
436 /* enables to use FPU */
437 __asm__
volatile (EMMS:::"memory");
440 * Now do the tail of the block
442 if(len
) small_memcpy(to
, from
, len
);