nptl / allocatestack.c
/* Copyright (C) 2002, 2003 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307 USA.  */

#include <assert.h>
#include <errno.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/param.h>
#include <dl-sysdep.h>
#include <tls.h>


#ifndef NEED_SEPARATE_REGISTER_STACK

/* Most architectures have exactly one stack pointer.  Some have more.  */
#define STACK_VARIABLES void *stackaddr

/* How to pass the values to the 'create_thread' function.  */
#define STACK_VARIABLES_ARGS stackaddr

/* How to declare the function that receives these parameters.  */
#define STACK_VARIABLES_PARMS void *stackaddr

/* How to declare allocate_stack.  */
#define ALLOCATE_STACK_PARMS void **stack

/* This is how the function is called.  We do it this way to allow
   other variants of the function to have more parameters.  */
#define ALLOCATE_STACK(attr, pd) allocate_stack (attr, pd, &stackaddr)

#else

#define STACK_VARIABLES void *stackaddr; size_t stacksize
#define STACK_VARIABLES_ARGS stackaddr, stacksize
#define STACK_VARIABLES_PARMS void *stackaddr, size_t stacksize
#define ALLOCATE_STACK_PARMS void **stack, size_t *stacksize
#define ALLOCATE_STACK(attr, pd) \
  allocate_stack (attr, pd, &stackaddr, &stacksize)

#endif
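
/* Note: on targets that define NEED_SEPARATE_REGISTER_STACK (IA-64 is
   the usual example) a thread needs two growth areas, the ordinary
   memory stack and the register backing store, which is why the stack
   size is handed back to the caller along with the address.  */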


/* Default alignment of stack.  */
#ifndef STACK_ALIGN
# define STACK_ALIGN __alignof__ (long double)
#endif

/* Default value for minimal stack size after allocating thread
   descriptor and guard.  */
#ifndef MINIMAL_REST_STACK
# define MINIMAL_REST_STACK 4096
#endif

/* Let the architecture add some flags to the mmap() call used to
   allocate stacks.  */
#ifndef ARCH_MAP_FLAGS
# define ARCH_MAP_FLAGS 0
#endif

/* This yields the pointer that TLS support code calls the thread pointer.  */
#if TLS_TCB_AT_TP
# define TLS_TPADJ(pd) (pd)
#elif TLS_DTV_AT_TP
# define TLS_TPADJ(pd) ((struct pthread *)((char *) (pd) + TLS_PRE_TCB_SIZE))
#endif
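
/* Note: TLS_TPADJ converts a thread descriptor pointer into the pointer
   the TLS machinery (e.g. _dl_allocate_tls) expects.  With TLS_TCB_AT_TP
   the descriptor and the TCB coincide; with TLS_DTV_AT_TP the TCB sits
   TLS_PRE_TCB_SIZE bytes past the start of the descriptor.  */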


/* Cache handling for not-yet freed stacks.  */

/* Maximum size in bytes of the stack cache.  */
static size_t stack_cache_maxsize = 40 * 1024 * 1024; /* 40 MiB by default.  */
static size_t stack_cache_actsize;

/* Mutex protecting the stack cache and the lists below.  */
static lll_lock_t stack_cache_lock = LLL_LOCK_INITIALIZER;

/* List of queued stack frames.  */
static LIST_HEAD (stack_cache);

/* List of the stacks in use.  */
static LIST_HEAD (stack_used);

/* List of the threads with user provided stacks in use.  No need to
   initialize this, since it's done in __pthread_initialize_minimal.  */
list_t __stack_user __attribute__ ((nocommon));
hidden_data_def (__stack_user)

#if COLORING_INCREMENT != 0
/* Number of threads created.  */
static unsigned int nptl_ncreated;
#endif


/* Check whether the stack is still used or not.  */
#define FREE_P(descr) ((descr)->tid <= 0)
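
/* The kernel clears the TID field when the thread exits (via the
   CLONE_CHILD_CLEARTID mechanism), so a non-positive 'tid' means the
   owner of the stack is gone and the block can be reused.  */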


/* We create a doubly linked list of all cache entries.  Doubly linked
   because this allows removing entries from the end.  */


/* Get a stack frame from the cache.  We have to match by size since
   some blocks might be too small or far too large.  */
static struct pthread *
get_cached_stack (size_t *sizep, void **memp)
{
  size_t size = *sizep;
  struct pthread *result = NULL;
  list_t *entry;

  lll_lock (stack_cache_lock);

  /* Search the cache for a matching entry.  We search for the
     smallest stack which has at least the required size.  Note that
     in normal situations the size of all allocated stacks is the
     same.  At the very least there are only a few different sizes.
     Therefore this loop will exit early most of the time with an
     exact match.  */
  list_for_each (entry, &stack_cache)
    {
      struct pthread *curr;

      curr = list_entry (entry, struct pthread, list);
      if (FREE_P (curr) && curr->stackblock_size >= size)
        {
          if (curr->stackblock_size == size)
            {
              result = curr;
              break;
            }

          if (result == NULL
              || result->stackblock_size > curr->stackblock_size)
            result = curr;
        }
    }

  if (__builtin_expect (result == NULL, 0)
      /* Make sure the size difference is not too excessive.  In that
         case we do not use the block.  */
      || __builtin_expect (result->stackblock_size > 4 * size, 0))
    {
      /* Release the lock.  */
      lll_unlock (stack_cache_lock);

      return NULL;
    }

  /* Dequeue the entry.  */
  list_del (&result->list);

  /* And add to the list of stacks in use.  */
  list_add (&result->list, &stack_used);

  /* And decrease the cache size.  */
  stack_cache_actsize -= result->stackblock_size;

  /* Release the lock early.  */
  lll_unlock (stack_cache_lock);

  /* Report size and location of the stack to the caller.  */
  *sizep = result->stackblock_size;
  *memp = result->stackblock;

  /* Cancellation handling is back to the default.  */
  result->cancelhandling = 0;
  result->cleanup = NULL;

  /* No pending event.  */
  result->nextevent = NULL;
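
  /* The descriptor is being recycled from an earlier thread, so any
     state the previous owner left behind must be reset before the
     descriptor is handed out again.  */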

  /* Clear the DTV.  */
  dtv_t *dtv = GET_DTV (TLS_TPADJ (result));
  memset (dtv, '\0', (dtv[-1].counter + 1) * sizeof (dtv_t));

  /* Re-initialize the TLS.  */
  _dl_allocate_tls_init (TLS_TPADJ (result));

  return result;
}


/* Add a stack frame which is not used anymore to the stack cache.
   Must be called with the cache lock held.  */
static inline void
queue_stack (struct pthread *stack)
{
  /* We unconditionally add the stack to the list.  The memory may
     still be in use but it will not be reused until the kernel marks
     the stack as not used anymore.  */
  list_add (&stack->list, &stack_cache);

  stack_cache_actsize += stack->stackblock_size;
  if (__builtin_expect (stack_cache_actsize > stack_cache_maxsize, 0))
    {
      /* We reduce the size of the cache.  Remove the last entries
         until the size is below the limit.  */
      list_t *entry;
      list_t *prev;

      /* Search from the end of the list.  */
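      /* Entries are pushed at the head of the list, so the tail holds
         the blocks that have been sitting in the cache the longest;
         those are evicted first.  */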
      list_for_each_prev_safe (entry, prev, &stack_cache)
        {
          struct pthread *curr;

          curr = list_entry (entry, struct pthread, list);
          if (FREE_P (curr))
            {
              /* Unlink the block.  */
              list_del (entry);

              /* Account for the freed memory.  */
              stack_cache_actsize -= curr->stackblock_size;

              /* Free the memory associated with the ELF TLS.  */
              _dl_deallocate_tls (TLS_TPADJ (curr), false);

              /* Remove this block.  This should never fail.  If it
                 does something is really wrong.  */
              if (munmap (curr->stackblock, curr->stackblock_size) != 0)
                abort ();

              /* Maybe we have freed enough.  */
              if (stack_cache_actsize <= stack_cache_maxsize)
                break;
            }
        }
    }
}


static int
allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
                ALLOCATE_STACK_PARMS)
{
  struct pthread *pd;
  size_t size;
  size_t pagesize_m1 = __getpagesize () - 1;
  void *stacktop;

  assert (attr != NULL);
  assert (powerof2 (pagesize_m1 + 1));
  assert (TCB_ALIGNMENT >= STACK_ALIGN);

  /* Get the stack size from the attribute if it is set.  Otherwise we
     use the default we determined at start time.  */
  size = attr->stacksize ?: __default_stacksize;

  /* Get memory for the stack.  */
  if (__builtin_expect (attr->flags & ATTR_FLAG_STACKADDR, 0))
    {
      uintptr_t adj;

      /* If the user also specified the size of the stack make sure it
         is large enough.  */
      if (attr->stacksize != 0
          && attr->stacksize < (__static_tls_size + MINIMAL_REST_STACK))
        return EINVAL;

      /* Adjust stack size for alignment of the TLS block.  */
#if TLS_TCB_AT_TP
      adj = ((uintptr_t) attr->stackaddr - TLS_TCB_SIZE)
            & __static_tls_align_m1;
      assert (size > adj + TLS_TCB_SIZE);
#elif TLS_DTV_AT_TP
      adj = ((uintptr_t) attr->stackaddr - __static_tls_size)
            & __static_tls_align_m1;
      assert (size > adj);
#endif

      /* The user provided some memory.  Let's hope it matches the
         size...  We do not allocate guard pages if the user provided
         the stack.  It is the user's responsibility to do this if it
         is wanted.  */
#if TLS_TCB_AT_TP
      pd = (struct pthread *) ((uintptr_t) attr->stackaddr
                               - TLS_TCB_SIZE - adj);
#elif TLS_DTV_AT_TP
      pd = (struct pthread *) (((uintptr_t) attr->stackaddr
                                - __static_tls_size - adj)
                               - TLS_PRE_TCB_SIZE);
#endif

      /* The user-provided stack memory needs to be cleared.  */
      memset (pd, '\0', sizeof (struct pthread));

      /* The first TSD block is included in the TCB.  */
      pd->specific[0] = pd->specific_1stblock;

#if LLL_LOCK_INITIALIZER != 0
      /* Initialize the lock.  */
      pd->lock = LLL_LOCK_INITIALIZER;
#endif

      /* Remember the stack-related values.  */
      pd->stackblock = (char *) attr->stackaddr - size;
      pd->stackblock_size = size;

      /* This is a user-provided stack.  It will not be queued in the
         stack cache nor will the memory (except the TLS memory) be freed.  */
      pd->user_stack = true;

#ifdef TLS_MULTIPLE_THREADS_IN_TCB
      /* This is at least the second thread.  */
      pd->header.multiple_threads = 1;
#else
      __pthread_multiple_threads = *__libc_multiple_threads_ptr = 1;
#endif
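
      /* Recording that more than one thread exists lets the library
         stop taking the single-threaded fast paths from here on.  */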

#ifdef NEED_DL_SYSINFO
      /* Copy the sysinfo value from the parent.  */
      pd->header.sysinfo = THREAD_GETMEM (THREAD_SELF, header.sysinfo);
#endif

      /* Allocate the DTV for this thread.  */
      if (_dl_allocate_tls (TLS_TPADJ (pd)) == NULL)
        /* Something went wrong.  */
        return errno;


      /* Prepare to modify global data.  */
      lll_lock (stack_cache_lock);

      /* And add to the list of stacks in use.  */
      list_add (&pd->list, &__stack_user);

      lll_unlock (stack_cache_lock);
    }
  else
    {
      /* Allocate some anonymous memory.  If possible use the cache.  */
      size_t guardsize;
      size_t reqsize;
      void *mem;

#if COLORING_INCREMENT != 0
      /* Add one more page for stack coloring.  Don't do it for stacks
         with 16 times pagesize or larger.  This might just cause
         unnecessary misalignment.  */
      if (size <= 16 * pagesize_m1)
        size += pagesize_m1 + 1;
#endif

      /* Adjust the stack size for alignment.  */
      size &= ~__static_tls_align_m1;
      assert (size != 0);

      /* Make sure the size of the stack is enough for the guard and
         possibly the thread descriptor.  */
      guardsize = (attr->guardsize + pagesize_m1) & ~pagesize_m1;
      if (__builtin_expect (size < (guardsize + __static_tls_size
                                    + MINIMAL_REST_STACK + pagesize_m1 + 1),
                            0))
        /* The stack is too small (or the guard too large).  */
        return EINVAL;

      /* Try to get a stack from the cache.  */
      reqsize = size;
      pd = get_cached_stack (&size, &mem);
      if (pd == NULL)
        {
          /* To avoid aliasing effects on a larger scale than pages we
             adjust the allocated stack size if necessary.  This way
             allocations directly following each other will not have
             aliasing problems.  */
#if MULTI_PAGE_ALIASING != 0
          if ((size % MULTI_PAGE_ALIASING) == 0)
            size += pagesize_m1 + 1;
#endif

          mem = mmap (NULL, size, PROT_READ | PROT_WRITE | PROT_EXEC,
                      MAP_PRIVATE | MAP_ANONYMOUS | ARCH_MAP_FLAGS, -1, 0);

          if (__builtin_expect (mem == MAP_FAILED, 0))
            {
#ifdef ARCH_RETRY_MMAP
              mem = ARCH_RETRY_MMAP (size);
              if (__builtin_expect (mem == MAP_FAILED, 0))
#endif
                return errno;
            }

          /* SIZE is guaranteed to be greater than zero.
             So we can never get a null pointer back from mmap.  */
          assert (mem != NULL);

#if COLORING_INCREMENT != 0
          /* Atomically increment NCREATED.  */
          unsigned int ncreated = (atomic_exchange_and_add (&nptl_ncreated, 1)
                                   + 1);

          /* We chose the offset for coloring by incrementing it for
             every new thread by a fixed amount.  The offset is used
             modulo the page size.  Even if coloring would be better
             relative to higher alignment values it makes no sense to
             do it since the mmap() interface does not allow us to
             specify any alignment for the returned memory block.  */
          size_t coloring = (ncreated * COLORING_INCREMENT) & pagesize_m1;

          /* Make sure the coloring offset does not disturb the alignment
             of the TCB and static TLS block.  */
          if (__builtin_expect ((coloring & __static_tls_align_m1) != 0, 0))
            coloring = (((coloring + __static_tls_align_m1)
                         & ~(__static_tls_align_m1))
                        & ~pagesize_m1);
#else
          /* Unless specified we do not make any adjustments.  */
# define coloring 0
#endif
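
          /* As an illustration (assuming 4096-byte pages and a
             COLORING_INCREMENT of 128): the 10th thread would get a
             coloring offset of (10 * 128) & 4095 = 1280 bytes, shifting
             its descriptor and stack top by that amount within the
             mapping.  */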

          /* Place the thread descriptor at the end of the stack.  */
#if TLS_TCB_AT_TP
          pd = (struct pthread *) ((char *) mem + size - coloring) - 1;
#elif TLS_DTV_AT_TP
          pd = (struct pthread *) ((((uintptr_t) mem + size - coloring
                                    - __static_tls_size)
                                    & ~__static_tls_align_m1)
                                   - TLS_PRE_TCB_SIZE);
#endif
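
          /* The descriptor thus sits at the highest usable addresses of
             the new block; the stack grows downward from just below it,
             away from the descriptor and toward the guard area.  */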

          /* Remember the stack-related values.  */
          pd->stackblock = mem;
          pd->stackblock_size = size;

          /* We allocated the first block of the thread-specific data
             array.  This address will not change for the lifetime of
             this descriptor.  */
          pd->specific[0] = pd->specific_1stblock;

#if LLL_LOCK_INITIALIZER != 0
          /* Initialize the lock.  */
          pd->lock = LLL_LOCK_INITIALIZER;
#endif

#ifdef TLS_MULTIPLE_THREADS_IN_TCB
          /* This is at least the second thread.  */
          pd->header.multiple_threads = 1;
#else
          __pthread_multiple_threads = *__libc_multiple_threads_ptr = 1;
#endif

#ifdef NEED_DL_SYSINFO
          /* Copy the sysinfo value from the parent.  */
          pd->header.sysinfo = THREAD_GETMEM (THREAD_SELF, header.sysinfo);
#endif

          /* Allocate the DTV for this thread.  */
          if (_dl_allocate_tls (TLS_TPADJ (pd)) == NULL)
            {
              /* Something went wrong.  */
              int err = errno;

              /* Free the stack memory we just allocated.  */
              (void) munmap (mem, size);

              return err;
            }


          /* Prepare to modify global data.  */
          lll_lock (stack_cache_lock);

          /* And add to the list of stacks in use.  */
          list_add (&pd->list, &stack_used);

          lll_unlock (stack_cache_lock);
        }

      /* Note that all of the stack and the thread descriptor are
         zeroed.  This means we do not have to initialize fields
         with initial value zero.  This is specifically true for
         the 'tid' field which is always set back to zero once the
         stack is not used anymore and for the 'guardsize' field
         which will be read next.  */

      /* Create or resize the guard area if necessary.  */
      if (__builtin_expect (guardsize > pd->guardsize, 0))
        {
#ifdef NEED_SEPARATE_REGISTER_STACK
          char *guard = mem + (((size - guardsize) / 2) & ~pagesize_m1);
#else
          char *guard = mem;
#endif
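
          /* With a separate register stack (as on IA-64) the guard is
             placed in the middle of the mapping: the memory stack grows
             down from the top while the register backing store grows up
             from the bottom, so a single guard region catches overflows
             of both.  Otherwise the guard sits at the lowest addresses,
             below the downward-growing stack.  */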
          if (mprotect (guard, guardsize, PROT_NONE) != 0)
            {
              int err;
            mprot_error:
              err = errno;

              lll_lock (stack_cache_lock);

              /* Remove the thread from the list.  */
              list_del (&pd->list);

              lll_unlock (stack_cache_lock);

              /* Get rid of the TLS block we allocated.  */
              _dl_deallocate_tls (TLS_TPADJ (pd), false);

              /* Free the stack memory regardless of whether the size
                 of the cache is over the limit or not.  If this piece
                 of memory caused problems we better not use it
                 anymore.  Uh, and we ignore possible errors.  There
                 is nothing we could do.  */
              (void) munmap (mem, size);

              return err;
            }

          pd->guardsize = guardsize;
        }
      else if (__builtin_expect (pd->guardsize - guardsize > size - reqsize,
                                 0))
        {
          /* The old guard area is too large.  */

#ifdef NEED_SEPARATE_REGISTER_STACK
          char *guard = mem + (((size - guardsize) / 2) & ~pagesize_m1);
          char *oldguard = mem + (((size - pd->guardsize) / 2) & ~pagesize_m1);

          if (oldguard < guard
              && mprotect (oldguard, guard - oldguard,
                           PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
            goto mprot_error;

          if (mprotect (guard + guardsize,
                        oldguard + pd->guardsize - guard - guardsize,
                        PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
            goto mprot_error;
#else
          if (mprotect ((char *) mem + guardsize, pd->guardsize - guardsize,
                        PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
            goto mprot_error;
#endif

          pd->guardsize = guardsize;
        }
    }

  /* We place the thread descriptor at the end of the stack.  */
  *pdp = pd;

#if TLS_TCB_AT_TP
  /* The stack begins before the TCB and the static TLS block.  */
  stacktop = ((char *) (pd + 1) - __static_tls_size);
#elif TLS_DTV_AT_TP
  stacktop = (char *) (pd - 1);
#endif

#ifdef NEED_SEPARATE_REGISTER_STACK
  *stack = pd->stackblock;
  *stacksize = stacktop - *stack;
#else
  *stack = stacktop;
#endif
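
  /* Report the result: either just the initial stack pointer, or, when
     a separate register stack is needed, the base of the block plus the
     usable size so that the caller can set up both stacks.  */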

  return 0;
}


void
internal_function
__deallocate_stack (struct pthread *pd)
{
  lll_lock (stack_cache_lock);

  /* Remove the thread from the list of threads with user defined
     stacks.  */
  list_del (&pd->list);

  /* Not much to do.  Just free the mmap()ed memory.  Note that we do
     not reset the 'used' flag in the 'tid' field.  This is done by
     the kernel.  If no thread has been created yet this field is
     still zero.  */
  if (__builtin_expect (! pd->user_stack, 1))
    (void) queue_stack (pd);
  else
    /* Free the memory associated with the ELF TLS.  */
    _dl_deallocate_tls (TLS_TPADJ (pd), false);

  lll_unlock (stack_cache_lock);
}


/* In case of a fork() call the memory allocation in the child will be
   the same but only one thread is running.  All stacks except that of
   the one running thread are not used anymore.  We have to recycle
   them.  */
void
__reclaim_stacks (void)
{
  struct pthread *self = (struct pthread *) THREAD_SELF;

  /* No locking necessary.  The calling thread is the only one running.  */

  /* Mark all stacks except the still running one as free.  */
  list_t *runp;
  list_for_each (runp, &stack_used)
    {
      struct pthread *curp;

      curp = list_entry (runp, struct pthread, list);
      if (curp != self)
        {
          /* This marks the stack as free.  */
          curp->tid = 0;

          /* Account for the size of the stack.  */
          stack_cache_actsize += curp->stackblock_size;
        }
    }

  /* Add the stacks of all threads that were running to the cache.
     The entry for the current thread is taken back out below.  */
  list_splice (&stack_used, &stack_cache);

  /* Remove the entry for the current thread from the cache list and
     add it to the list of running threads.  Which of the two lists it
     goes on is decided by the user_stack flag.  */
  list_del (&self->list);

  /* Re-initialize the lists for all the threads.  */
  INIT_LIST_HEAD (&stack_used);
  INIT_LIST_HEAD (&__stack_user);

  if (__builtin_expect (THREAD_GETMEM (self, user_stack), 0))
    list_add (&self->list, &__stack_user);
  else
    list_add (&self->list, &stack_used);

  /* There is one thread running.  */
  __nptl_nthreads = 1;

  /* Initialize the lock.  */
  stack_cache_lock = LLL_LOCK_INITIALIZER;
}