/*
  aclib - advanced C library ;)
  This file contains functions which improve and expand the standard C library.
*/
#ifndef HAVE_SSE2
/*
   The P3 processor has only one SSE decoder, so it can execute only 1 SSE
   instruction per CPU clock, but it has 3 MMX decoders (including the
   load/store unit) and can execute 3 MMX instructions per CPU clock.
   The P4 has some chances, but after reading
   http://www.emulators.com/pentium4.htm
   I have doubts. In any case, an SSE2 version of this code could be written
   better.
*/
#undef HAVE_SSE
#endif
/*
 This part of the code was taken by me from Linux-2.4.3 and slightly modified
 for the MMX, MMX2 and SSE instruction sets. I have done it since Linux uses
 page-aligned blocks, whereas MPlayer uses weakly ordered data, so the original
 sources could not speed it up. Only using PREFETCHNTA and MOVNTQ together has
 an effect!

 From the IA-32 Intel Architecture Software Developer's Manual Volume 1,
 Order Number 245470:
 "10.4.6. Cacheability Control, Prefetch, and Memory Ordering Instructions"

 Data referenced by a program can be temporal (data will be used again) or
 non-temporal (data will be referenced once and not reused in the immediate
 future). To make efficient use of the processor's caches, it is generally
 desirable to cache temporal data and not cache non-temporal data. Overloading
 the processor's caches with non-temporal data is sometimes referred to as
 "polluting the caches".
 The non-temporal data is written to memory with Write-Combining semantics.

 The PREFETCHh instructions permit a program to load data into the processor
 at a suggested cache level, so that it is closer to the processor's load and
 store unit when it is needed. If the data is already present in a level of
 the cache hierarchy that is closer to the processor, the PREFETCHh instruction
 will not result in any data movement.
 But we should use PREFETCHNTA: it fetches non-temporal data into a location
 close to the processor, minimizing cache pollution.

 The MOVNTQ (store quadword using non-temporal hint) instruction stores
 packed integer data from an MMX register to memory, using a non-temporal hint.
 The MOVNTPS (store packed single-precision floating-point values using
 non-temporal hint) instruction stores packed floating-point data from an
 XMM register to memory, using a non-temporal hint.

 The SFENCE (Store Fence) instruction controls write ordering by creating a
 fence for memory store operations. This instruction guarantees that the results
 of every store instruction that precedes the store fence in program order are
 globally visible before any store instruction that follows the fence. The
 SFENCE instruction provides an efficient way of ensuring ordering between
 procedures that produce weakly ordered data and procedures that consume that
 data.

 If you have questions, please contact me: Nick Kurshev: nickols_k@mail.ru.
*/
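/*
 Added illustrative sketch (not from the original source, and excluded from
 the build): the same "prefetch + non-temporal store + store fence" pattern
 expressed with SSE2 compiler intrinsics instead of inline assembly. It
 assumes SSE2 is available, that the destination is 16-byte aligned, and that
 the length is a multiple of 64; the function name is hypothetical.
*/
#if 0
#include <stddef.h>     /* size_t */
#include <xmmintrin.h>  /* _mm_prefetch, _MM_HINT_NTA, _mm_sfence */
#include <emmintrin.h>  /* _mm_loadu_si128, _mm_stream_si128 */

static void nt_copy_sketch(void *dst, const void *src, size_t n)
{
    unsigned char *d = dst;
    const unsigned char *s = src;
    size_t i;
    for (i = 0; i < n; i += 64) {
        /* hint: pull data ahead of the loop without polluting the caches */
        _mm_prefetch((const char *)(s + i) + 320, _MM_HINT_NTA);
        /* unaligned loads, cache-bypassing (non-temporal) stores */
        _mm_stream_si128((__m128i *)(d + i),      _mm_loadu_si128((const __m128i *)(s + i)));
        _mm_stream_si128((__m128i *)(d + i + 16), _mm_loadu_si128((const __m128i *)(s + i + 16)));
        _mm_stream_si128((__m128i *)(d + i + 32), _mm_loadu_si128((const __m128i *)(s + i + 32)));
        _mm_stream_si128((__m128i *)(d + i + 48), _mm_loadu_si128((const __m128i *)(s + i + 48)));
    }
    /* make the weakly ordered stores globally visible before returning */
    _mm_sfence();
}
#endif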
//  3dnow memcpy support from kernel 2.4.2
//  by Pontscho/fresh!mindworkz
#undef HAVE_ONLY_MMX1
#if defined(HAVE_MMX) && !defined(HAVE_MMX2) && !defined(HAVE_3DNOW) && !defined(HAVE_SSE)
/*  means: MMX v.1. Note: since we added alignment of the destination, it speeds
    up memory copying on PentMMX, Celeron-1 and P2 by up to 12% versus the
    standard (non-MMX-optimized) version.
    Note: on K6-2+ it speeds up memory copying by up to 25% and
          on K7 and P3 by about 500% (5 times). */
#define HAVE_ONLY_MMX1
#endif
#undef HAVE_K6_2PLUS
#if !defined( HAVE_MMX2) && defined( HAVE_3DNOW)
#define HAVE_K6_2PLUS
#endif
/* for small memory blocks (<256 bytes) this version is faster */
#define small_memcpy(to,from,n)\
{\
register unsigned long int dummy;\
__asm__ __volatile__(\
    "rep; movsb"\
    :"=&D"(to), "=&S"(from), "=&c"(dummy)\
/* It's the most portable way to notify the compiler */\
/* that edi, esi and ecx are clobbered in the asm block. */\
/* Thanks to A'rpi for the hint!!! */\
    :"0" (to), "1" (from),"2" (n)\
    : "memory");\
}
#undef MMREG_SIZE
#ifdef HAVE_SSE
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 64 //8
#endif
#undef PREFETCH
#undef EMMS

#ifdef HAVE_MMX2
#define PREFETCH "prefetchnta"
#elif defined ( HAVE_3DNOW )
#define PREFETCH  "prefetch"
#else
#define PREFETCH " # nop"
#endif

/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#ifdef HAVE_3DNOW
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

#undef MOVNTQ
#ifdef HAVE_MMX2
#define MOVNTQ "movntq"
#else
#define MOVNTQ "movq"
#endif

#undef MIN_LEN
#ifdef HAVE_ONLY_MMX1
#define MIN_LEN 0x800  /* 2K blocks */
#else
#define MIN_LEN 0x40  /* 64-byte blocks */
#endif
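/*
 Added note: with, for example, HAVE_MMX2 defined and HAVE_SSE undefined, the
 selections above resolve to PREFETCH = "prefetchnta", MOVNTQ = "movntq",
 EMMS = "emms" ("femms" if HAVE_3DNOW is also defined), MMREG_SIZE = 64 and
 MIN_LEN = 0x40, i.e. the loops below prefetch with a non-temporal hint and
 store with cache-bypassing quadword writes.
*/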
static void * RENAME(fast_memcpy)(void * to, const void * from, size_t len)
{
        void *retval;
        size_t i;
        retval = to;
#ifdef STATISTICS
        {
                static int freq[33];
                static int t=0;
                int i;
                for(i=0; len>(1<<i); i++);
                freq[i]++;
                t++;
                if(1024*1024*1024 % t == 0)
                        for(i=0; i<32; i++)
                                printf("freq < %8d %4d\n", 1<<i, freq[i]);
        }
#endif
#ifndef HAVE_ONLY_MMX1
        /* PREFETCH has an effect even for the MOVSB instruction ;) */
        __asm__ __volatile__ (
                PREFETCH" (%0)\n"
                PREFETCH" 64(%0)\n"
                PREFETCH" 128(%0)\n"
                PREFETCH" 192(%0)\n"
                PREFETCH" 256(%0)\n"
                : : "r" (from) );
#endif
        if(len >= MIN_LEN)
        {
          register unsigned long int delta;
          /* Align the destination to an MMREG_SIZE boundary */
          delta = ((unsigned long int)to)&(MMREG_SIZE-1);
          if(delta)
          {
            delta=MMREG_SIZE-delta;
            len -= delta;
            small_memcpy(to, from, delta);
          }
          i = len >> 6; /* len/64 */
          len&=63;
        /*
           This algorithm is most effective when the code sequentially
           reads and writes blocks which have the size of a cache line.
           The size of a cache line is processor-dependent.
           It will, however, be a minimum of 32 bytes on any processor.
           It would be better if the number of read and write instructions
           were a multiple of the number of the processor's decoders,
           but that is not always possible.
        */
#ifdef HAVE_SSE /* Only P3 (maybe Cyrix3) */
        if(((unsigned long)from) & 15)
        /* if SRC is misaligned */
        for(; i>0; i--)
        {
                __asm__ __volatile__ (
                PREFETCH" 320(%0)\n"
                "movups (%0), %%xmm0\n"
                "movups 16(%0), %%xmm1\n"
                "movups 32(%0), %%xmm2\n"
                "movups 48(%0), %%xmm3\n"
                "movntps %%xmm0, (%1)\n"
                "movntps %%xmm1, 16(%1)\n"
                "movntps %%xmm2, 32(%1)\n"
                "movntps %%xmm3, 48(%1)\n"
                :: "r" (from), "r" (to) : "memory");
                from=((const unsigned char *) from)+64;
                to=((unsigned char *)to)+64;
        }
        else
        /*
           Only if SRC is aligned on a 16-byte boundary.
           This allows using movaps instead of movups; movaps requires the data
           to be aligned, otherwise a general-protection exception (#GP) is
           generated.
        */
        for(; i>0; i--)
        {
                __asm__ __volatile__ (
                PREFETCH" 320(%0)\n"
                "movaps (%0), %%xmm0\n"
                "movaps 16(%0), %%xmm1\n"
                "movaps 32(%0), %%xmm2\n"
                "movaps 48(%0), %%xmm3\n"
                "movntps %%xmm0, (%1)\n"
                "movntps %%xmm1, 16(%1)\n"
                "movntps %%xmm2, 32(%1)\n"
                "movntps %%xmm3, 48(%1)\n"
                :: "r" (from), "r" (to) : "memory");
                from=((const unsigned char *)from)+64;
                to=((unsigned char *)to)+64;
        }
#else
        // Align destination at BLOCK_SIZE boundary
        for(; ((int)to & (BLOCK_SIZE-1)) && i>0; i--)
        {
                __asm__ __volatile__ (
#ifndef HAVE_ONLY_MMX1
                PREFETCH" 320(%0)\n"
#endif
                "movq (%0), %%mm0\n"
                "movq 8(%0), %%mm1\n"
                "movq 16(%0), %%mm2\n"
                "movq 24(%0), %%mm3\n"
                "movq 32(%0), %%mm4\n"
                "movq 40(%0), %%mm5\n"
                "movq 48(%0), %%mm6\n"
                "movq 56(%0), %%mm7\n"
                MOVNTQ" %%mm0, (%1)\n"
                MOVNTQ" %%mm1, 8(%1)\n"
                MOVNTQ" %%mm2, 16(%1)\n"
                MOVNTQ" %%mm3, 24(%1)\n"
                MOVNTQ" %%mm4, 32(%1)\n"
                MOVNTQ" %%mm5, 40(%1)\n"
                MOVNTQ" %%mm6, 48(%1)\n"
                MOVNTQ" %%mm7, 56(%1)\n"
                :: "r" (from), "r" (to) : "memory");
                from=((const unsigned char *)from)+64;
                to=((unsigned char *)to)+64;
        }
//      printf(" %d %d\n", (int)from&1023, (int)to&1023);
        // Pure Assembly cuz gcc is a bit unpredictable ;)
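        /*
         Added explanatory note: the assembly block below processes the
         remaining data in BLOCK_SIZE chunks using two phases per chunk.
         Loop 1 reads one 32-bit word from every 32 bytes of the next
         BLOCK_SIZE bytes into %ebx (the values are discarded); these normal
         cached loads pull the whole chunk into the cache ahead of the copy.
         Loop 2 then copies the chunk 64 bytes at a time with MOVNTQ, i.e.
         with cache-bypassing stores. Operands: %0 = from, %1 = to,
         %2 = i (all updated), %3 = BLOCK_SIZE, %4 = BLOCK_SIZE/64,
         %5 = CONFUSION_FACTOR.
        */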
        if(i>=BLOCK_SIZE/64)
                asm volatile(
                        "xor %%"REG_a", %%"REG_a"       \n\t"
                        ASMALIGN(4)
                        "1:                     \n\t"
                                "movl (%0, %%"REG_a"), %%ebx    \n\t"
                                "movl 32(%0, %%"REG_a"), %%ebx  \n\t"
                                "movl 64(%0, %%"REG_a"), %%ebx  \n\t"
                                "movl 96(%0, %%"REG_a"), %%ebx  \n\t"
                                "add $128, %%"REG_a"            \n\t"
                                "cmp %3, %%"REG_a"              \n\t"
                                " jb 1b                         \n\t"

                        "xor %%"REG_a", %%"REG_a"       \n\t"

                        ASMALIGN(4)
                        "2:                     \n\t"
                        "movq (%0, %%"REG_a"), %%mm0\n"
                        "movq 8(%0, %%"REG_a"), %%mm1\n"
                        "movq 16(%0, %%"REG_a"), %%mm2\n"
                        "movq 24(%0, %%"REG_a"), %%mm3\n"
                        "movq 32(%0, %%"REG_a"), %%mm4\n"
                        "movq 40(%0, %%"REG_a"), %%mm5\n"
                        "movq 48(%0, %%"REG_a"), %%mm6\n"
                        "movq 56(%0, %%"REG_a"), %%mm7\n"
                        MOVNTQ" %%mm0, (%1, %%"REG_a")\n"
                        MOVNTQ" %%mm1, 8(%1, %%"REG_a")\n"
                        MOVNTQ" %%mm2, 16(%1, %%"REG_a")\n"
                        MOVNTQ" %%mm3, 24(%1, %%"REG_a")\n"
                        MOVNTQ" %%mm4, 32(%1, %%"REG_a")\n"
                        MOVNTQ" %%mm5, 40(%1, %%"REG_a")\n"
                        MOVNTQ" %%mm6, 48(%1, %%"REG_a")\n"
                        MOVNTQ" %%mm7, 56(%1, %%"REG_a")\n"
                        "add $64, %%"REG_a"             \n\t"
                        "cmp %3, %%"REG_a"              \n\t"
                        "jb 2b                          \n\t"

#if CONFUSION_FACTOR > 0
                        // a few percent speedup on out-of-order executing CPUs
                        "mov %5, %%"REG_a"              \n\t"
                        "2:                     \n\t"
                        "movl (%0), %%ebx       \n\t"
                        "movl (%0), %%ebx       \n\t"
                        "movl (%0), %%ebx       \n\t"
                        "movl (%0), %%ebx       \n\t"
                        "dec %%"REG_a"          \n\t"
                        " jnz 2b                \n\t"
#endif

                        "xor %%"REG_a", %%"REG_a"       \n\t"
                        "add %3, %0             \n\t"
                        "add %3, %1             \n\t"
                        "sub %4, %2             \n\t"
                        "cmp %4, %2             \n\t"
                        " jae 1b                \n\t"
                        : "+r" (from), "+r" (to), "+r" (i)
                        : "r" ((long)BLOCK_SIZE), "i" (BLOCK_SIZE/64), "i" ((long)CONFUSION_FACTOR)
                        : "%"REG_a, "%ebx"
                );
        for(; i>0; i--)
        {
                __asm__ __volatile__ (
#ifndef HAVE_ONLY_MMX1
                PREFETCH" 320(%0)\n"
#endif
                "movq (%0), %%mm0\n"
                "movq 8(%0), %%mm1\n"
                "movq 16(%0), %%mm2\n"
                "movq 24(%0), %%mm3\n"
                "movq 32(%0), %%mm4\n"
                "movq 40(%0), %%mm5\n"
                "movq 48(%0), %%mm6\n"
                "movq 56(%0), %%mm7\n"
                MOVNTQ" %%mm0, (%1)\n"
                MOVNTQ" %%mm1, 8(%1)\n"
                MOVNTQ" %%mm2, 16(%1)\n"
                MOVNTQ" %%mm3, 24(%1)\n"
                MOVNTQ" %%mm4, 32(%1)\n"
                MOVNTQ" %%mm5, 40(%1)\n"
                MOVNTQ" %%mm6, 48(%1)\n"
                MOVNTQ" %%mm7, 56(%1)\n"
                :: "r" (from), "r" (to) : "memory");
                from=((const unsigned char *)from)+64;
                to=((unsigned char *)to)+64;
        }

#endif /* Have SSE */
#ifdef HAVE_MMX2
                /* since movntq is weakly ordered, an "sfence"
                 * is needed to become ordered again. */
                __asm__ __volatile__ ("sfence":::"memory");
#endif
#ifndef HAVE_SSE
                /* enable the use of the FPU again */
                __asm__ __volatile__ (EMMS:::"memory");
#endif
        }
        /*
         *      Now do the tail of the block
         */
        if(len) small_memcpy(to, from, len);
        return retval;
}
/**
 * Special copy routine for mem -> agp/pci copies (based upon fast_memcpy).
 */
static void * RENAME(mem2agpcpy)(void * to, const void * from, size_t len)
{
        void *retval;
        size_t i;
        retval = to;
#ifdef STATISTICS
        {
                static int freq[33];
                static int t=0;
                int i;
                for(i=0; len>(1<<i); i++);
                freq[i]++;
                t++;
                if(1024*1024*1024 % t == 0)
                        for(i=0; i<32; i++)
                                printf("mem2agp freq < %8d %4d\n", 1<<i, freq[i]);
        }
#endif
        if(len >= MIN_LEN)
        {
          register unsigned long int delta;
          /* Align the destination to an 8-byte boundary */
          delta = ((unsigned long int)to)&7;
          if(delta)
          {
            delta=8-delta;
            len -= delta;
            small_memcpy(to, from, delta);
          }
          i = len >> 6; /* len/64 */
          len &= 63;
        /*
           This algorithm is most effective when the code sequentially
           reads and writes blocks which have the size of a cache line.
           The size of a cache line is processor-dependent.
           It will, however, be a minimum of 32 bytes on any processor.
           It would be better if the number of read and write instructions
           were a multiple of the number of the processor's decoders,
           but that is not always possible.
        */
        for(; i>0; i--)
        {
                __asm__ __volatile__ (
                PREFETCH" 320(%0)\n"
                "movq (%0), %%mm0\n"
                "movq 8(%0), %%mm1\n"
                "movq 16(%0), %%mm2\n"
                "movq 24(%0), %%mm3\n"
                "movq 32(%0), %%mm4\n"
                "movq 40(%0), %%mm5\n"
                "movq 48(%0), %%mm6\n"
                "movq 56(%0), %%mm7\n"
                MOVNTQ" %%mm0, (%1)\n"
                MOVNTQ" %%mm1, 8(%1)\n"
                MOVNTQ" %%mm2, 16(%1)\n"
                MOVNTQ" %%mm3, 24(%1)\n"
                MOVNTQ" %%mm4, 32(%1)\n"
                MOVNTQ" %%mm5, 40(%1)\n"
                MOVNTQ" %%mm6, 48(%1)\n"
                MOVNTQ" %%mm7, 56(%1)\n"
                :: "r" (from), "r" (to) : "memory");
                from=((const unsigned char *)from)+64;
                to=((unsigned char *)to)+64;
        }
#ifdef HAVE_MMX2
                /* since movntq is weakly ordered, an "sfence"
                 * is needed to become ordered again. */
                __asm__ __volatile__ ("sfence":::"memory");
#endif
                /* enable the use of the FPU again */
                __asm__ __volatile__ (EMMS:::"memory");
        }
        /*
         *      Now do the tail of the block
         */
        if(len) small_memcpy(to, from, len);
        return retval;
}
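/*
 Added usage sketch (not part of the original file): this template is meant to
 be #included with RENAME() and the HAVE_* feature macros defined by the
 including file; the exact macro values and the variant name below are
 assumptions for illustration only.
*/
#if 0
#define HAVE_MMX
#define HAVE_MMX2
#define RENAME(a) a ## _MMX2
#include "aclib_template.c"

/* ...later, pick the generated variant at runtime, e.g.: */
void *(*mem_cpy)(void *, const void *, size_t) = fast_memcpy_MMX2;
#endif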