/* Copyright (C) 2002-2014 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
#include <assert.h>
#include <errno.h>
#include <signal.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/param.h>
#include <dl-sysdep.h>
#include <dl-tls.h>
#include <tls.h>
#include <list.h>
#include <lowlevellock.h>
#include <kernel-features.h>

#ifndef NEED_SEPARATE_REGISTER_STACK

/* Most architectures have exactly one stack pointer.  Some have more.  */
# define STACK_VARIABLES void *stackaddr = NULL

/* How to pass the values to the 'create_thread' function.  */
# define STACK_VARIABLES_ARGS stackaddr

/* How to declare the function that gets these parameters.  */
# define STACK_VARIABLES_PARMS void *stackaddr

/* How to declare allocate_stack.  */
# define ALLOCATE_STACK_PARMS void **stack

/* This is how the function is called.  We do it this way to allow
   other variants of the function to have more parameters.  */
# define ALLOCATE_STACK(attr, pd) allocate_stack (attr, pd, &stackaddr)

#else

/* We need two stacks.  The kernel will place them but we have to tell
   the kernel about the size of the reserved address space.  */
# define STACK_VARIABLES void *stackaddr = NULL; size_t stacksize = 0

/* How to pass the values to the 'create_thread' function.  */
# define STACK_VARIABLES_ARGS stackaddr, stacksize

/* How to declare the function that gets these parameters.  */
# define STACK_VARIABLES_PARMS void *stackaddr, size_t stacksize

/* How to declare allocate_stack.  */
# define ALLOCATE_STACK_PARMS void **stack, size_t *stacksize

/* This is how the function is called.  We do it this way to allow
   other variants of the function to have more parameters.  */
# define ALLOCATE_STACK(attr, pd) \
  allocate_stack (attr, pd, &stackaddr, &stacksize)

#endif

/* Default alignment of stack.  */
#ifndef STACK_ALIGN
# define STACK_ALIGN __alignof__ (long double)
#endif

/* Default value for minimal stack size after allocating thread
   descriptor and guard.  */
#ifndef MINIMAL_REST_STACK
# define MINIMAL_REST_STACK 4096
#endif


/* Newer kernels have the MAP_STACK flag to indicate a mapping is used for
   a stack.  Use it when possible.  */
#ifndef MAP_STACK
# define MAP_STACK 0
#endif

/* This yields the pointer that TLS support code calls the thread pointer.  */
#if TLS_TCB_AT_TP
# define TLS_TPADJ(pd) (pd)
#elif TLS_DTV_AT_TP
# define TLS_TPADJ(pd) ((struct pthread *)((char *) (pd) + TLS_PRE_TCB_SIZE))
#endif
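
/* Note: TLS_TPADJ converts a 'struct pthread *' into the pointer that the
   TLS run-time support (_dl_allocate_tls, _dl_deallocate_tls, GET_DTV)
   expects.  With TLS_TCB_AT_TP the thread descriptor doubles as the TCB;
   with TLS_DTV_AT_TP the TCB sits TLS_PRE_TCB_SIZE bytes after the
   descriptor, hence the adjustment.  */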

/* Cache handling for not-yet-freed stacks.  */

/* Maximum size of the cache, in bytes.  */
static size_t stack_cache_maxsize = 40 * 1024 * 1024; /* 40MiB by default.  */
static size_t stack_cache_actsize;

/* Lock protecting the cache size accounting and the lists below.  */
static int stack_cache_lock = LLL_LOCK_INITIALIZER;

/* List of cached, currently unused stacks.  */
static LIST_HEAD (stack_cache);

/* List of the stacks in use.  */
static LIST_HEAD (stack_used);

/* We need to record what list operations we are going to do so that,
   in case of an asynchronous interruption due to a fork() call, we
   can correct for the work.  */
static uintptr_t in_flight_stack;

/* List of the threads with user provided stacks in use.  No need to
   initialize this, since it's done in __pthread_initialize_minimal.  */
list_t __stack_user __attribute__ ((nocommon));
hidden_data_def (__stack_user)

#if COLORING_INCREMENT != 0
/* Number of threads created.  */
static unsigned int nptl_ncreated;
#endif

/* Check whether the stack is still used or not.  */
#define FREE_P(descr) ((descr)->tid <= 0)


static void
stack_list_del (list_t *elem)
{
  in_flight_stack = (uintptr_t) elem;

  atomic_write_barrier ();

  list_del (elem);

  atomic_write_barrier ();

  in_flight_stack = 0;
}


static void
stack_list_add (list_t *elem, list_t *list)
{
  in_flight_stack = (uintptr_t) elem | 1;

  atomic_write_barrier ();

  list_add (elem, list);

  atomic_write_barrier ();

  in_flight_stack = 0;
}
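
/* Note: the low bit of IN_FLIGHT_STACK records which operation was in
   progress: it is set for an add and clear for a delete.  If a fork()
   interrupts one of these functions, __reclaim_stacks (below) uses this
   bit to complete the pending add or replay the delete.  */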

/* We create a doubly linked list of all cache entries.  Doubly linked
   because this allows removing entries from the end.  */


/* Get a stack from the cache.  We have to match by size since
   some blocks might be too small or far too large.  */
static struct pthread *
get_cached_stack (size_t *sizep, void **memp)
{
  size_t size = *sizep;
  struct pthread *result = NULL;
  list_t *entry;

  lll_lock (stack_cache_lock, LLL_PRIVATE);

  /* Search the cache for a matching entry.  We search for the
     smallest stack which has at least the required size.  Note that
     in normal situations the size of all allocated stacks is the
     same.  At the very least there are only a few different sizes.
     Therefore this loop will exit early most of the time with an
     exact match.  */
  list_for_each (entry, &stack_cache)
    {
      struct pthread *curr;

      curr = list_entry (entry, struct pthread, list);
      if (FREE_P (curr) && curr->stackblock_size >= size)
        {
          if (curr->stackblock_size == size)
            {
              result = curr;
              break;
            }

          if (result == NULL
              || result->stackblock_size > curr->stackblock_size)
            result = curr;
        }
    }

  if (__builtin_expect (result == NULL, 0)
      /* Make sure the size difference is not too excessive.  In that
         case we do not use the block.  */
      || __builtin_expect (result->stackblock_size > 4 * size, 0))
    {
      /* Release the lock.  */
      lll_unlock (stack_cache_lock, LLL_PRIVATE);

      return NULL;
    }

  /* Don't allow setxid until cloned.  */
  result->setxid_futex = -1;

  /* Dequeue the entry.  */
  stack_list_del (&result->list);

  /* And add to the list of stacks in use.  */
  stack_list_add (&result->list, &stack_used);

  /* And decrease the cache size.  */
  stack_cache_actsize -= result->stackblock_size;

  /* Release the lock early.  */
  lll_unlock (stack_cache_lock, LLL_PRIVATE);

  /* Report size and location of the stack to the caller.  */
  *sizep = result->stackblock_size;
  *memp = result->stackblock;

  /* Cancellation handling is back to the default.  */
  result->cancelhandling = 0;
  result->cleanup = NULL;

  /* No pending event.  */
  result->nextevent = NULL;

  /* Clear the DTV.  */
  dtv_t *dtv = GET_DTV (TLS_TPADJ (result));
  for (size_t cnt = 0; cnt < dtv[-1].counter; ++cnt)
    if (! dtv[1 + cnt].pointer.is_static
        && dtv[1 + cnt].pointer.val != TLS_DTV_UNALLOCATED)
      free (dtv[1 + cnt].pointer.val);
  memset (dtv, '\0', (dtv[-1].counter + 1) * sizeof (dtv_t));

  /* Re-initialize the TLS.  */
  _dl_allocate_tls_init (TLS_TPADJ (result));

  return result;
}
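
/* Note: get_cached_stack only hands out a block whose size lies between
   the requested size and four times that size, and it reports the actual
   block size back through *SIZEP; other cached blocks stay in the cache.  */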

/* Free stacks until cache size is lower than LIMIT.  */
void
__free_stacks (size_t limit)
{
  /* We reduce the size of the cache.  Remove the last entries until
     the size is below the limit.  */
  list_t *entry;
  list_t *prev;

  /* Search from the end of the list.  */
  list_for_each_prev_safe (entry, prev, &stack_cache)
    {
      struct pthread *curr;

      curr = list_entry (entry, struct pthread, list);
      if (FREE_P (curr))
        {
          /* Unlink the block.  */
          stack_list_del (entry);

          /* Account for the freed memory.  */
          stack_cache_actsize -= curr->stackblock_size;

          /* Free the memory associated with the ELF TLS.  */
          _dl_deallocate_tls (TLS_TPADJ (curr), false);

          /* Remove this block.  This should never fail.  If it does
             something is really wrong.  */
          if (munmap (curr->stackblock, curr->stackblock_size) != 0)
            abort ();

          /* Maybe we have freed enough.  */
          if (stack_cache_actsize <= limit)
            break;
        }
    }
}


/* Add a stack which is not used anymore to the stack cache.  Must be
   called with the cache lock held.  */
static inline void
__attribute ((always_inline))
queue_stack (struct pthread *stack)
{
  /* We unconditionally add the stack to the list.  The memory may
     still be in use but it will not be reused until the kernel marks
     the stack as not used anymore.  */
  stack_list_add (&stack->list, &stack_cache);

  stack_cache_actsize += stack->stackblock_size;
  if (__glibc_unlikely (stack_cache_actsize > stack_cache_maxsize))
    __free_stacks (stack_cache_maxsize);
}

static int
internal_function
change_stack_perm (struct pthread *pd
#ifdef NEED_SEPARATE_REGISTER_STACK
                   , size_t pagemask
#endif
                   )
{
#ifdef NEED_SEPARATE_REGISTER_STACK
  void *stack = (pd->stackblock
                 + (((((pd->stackblock_size - pd->guardsize) / 2)
                      & pagemask) + pd->guardsize) & pagemask));
  size_t len = pd->stackblock + pd->stackblock_size - stack;
#elif _STACK_GROWS_DOWN
  void *stack = pd->stackblock + pd->guardsize;
  size_t len = pd->stackblock_size - pd->guardsize;
#elif _STACK_GROWS_UP
  void *stack = pd->stackblock;
  size_t len = (uintptr_t) pd - pd->guardsize - (uintptr_t) pd->stackblock;
#else
# error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
#endif
  if (mprotect (stack, len, PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
    return errno;

  return 0;
}


/* Returns a usable stack for a new thread either by allocating a
   new stack or reusing a cached stack of sufficient size.
   ATTR must be non-NULL and point to a valid pthread_attr.
   PDP must be non-NULL.  */
static int
allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
                ALLOCATE_STACK_PARMS)
{
  struct pthread *pd;
  size_t size;
  size_t pagesize_m1 = __getpagesize () - 1;
  void *stacktop;

  assert (powerof2 (pagesize_m1 + 1));
  assert (TCB_ALIGNMENT >= STACK_ALIGN);

  /* Get the stack size from the attribute if it is set.  Otherwise we
     use the default we determined at start time.  */
  if (attr->stacksize != 0)
    size = attr->stacksize;
  else
    {
      lll_lock (__default_pthread_attr_lock, LLL_PRIVATE);
      size = __default_pthread_attr.stacksize;
      lll_unlock (__default_pthread_attr_lock, LLL_PRIVATE);
    }

  /* Get memory for the stack.  */
  if (__glibc_unlikely (attr->flags & ATTR_FLAG_STACKADDR))
    {
      uintptr_t adj;

      /* If the user also specified the size of the stack make sure it
         is large enough.  */
      if (attr->stacksize != 0
          && attr->stacksize < (__static_tls_size + MINIMAL_REST_STACK))
        return EINVAL;

      /* Adjust stack size for alignment of the TLS block.  */
#if TLS_TCB_AT_TP
      adj = ((uintptr_t) attr->stackaddr - TLS_TCB_SIZE)
            & __static_tls_align_m1;
      assert (size > adj + TLS_TCB_SIZE);
#elif TLS_DTV_AT_TP
      adj = ((uintptr_t) attr->stackaddr - __static_tls_size)
            & __static_tls_align_m1;
      assert (size > adj);
#endif

      /* The user provided some memory.  Let's hope it matches the
         size...  We do not allocate guard pages if the user provided
         the stack.  It is the user's responsibility to do this if it
         is wanted.  */
#if TLS_TCB_AT_TP
      pd = (struct pthread *) ((uintptr_t) attr->stackaddr
                               - TLS_TCB_SIZE - adj);
#elif TLS_DTV_AT_TP
      pd = (struct pthread *) (((uintptr_t) attr->stackaddr
                                - __static_tls_size - adj)
                               - TLS_PRE_TCB_SIZE);
#endif

      /* The user provided stack memory needs to be cleared.  */
      memset (pd, '\0', sizeof (struct pthread));

      /* The first TSD block is included in the TCB.  */
      pd->specific[0] = pd->specific_1stblock;

      /* Remember the stack-related values.  */
      pd->stackblock = (char *) attr->stackaddr - size;
      pd->stackblock_size = size;

      /* This is a user-provided stack.  It will not be queued in the
         stack cache nor will the memory (except the TLS memory) be freed.  */
      pd->user_stack = true;

      /* This is at least the second thread.  */
      pd->header.multiple_threads = 1;
#ifndef TLS_MULTIPLE_THREADS_IN_TCB
      __pthread_multiple_threads = *__libc_multiple_threads_ptr = 1;
#endif

#ifndef __ASSUME_PRIVATE_FUTEX
      /* The thread must know when private futexes are supported.  */
      pd->header.private_futex = THREAD_GETMEM (THREAD_SELF,
                                                header.private_futex);
#endif

#ifdef NEED_DL_SYSINFO
      /* Copy the sysinfo value from the parent.  */
      THREAD_SYSINFO(pd) = THREAD_SELF_SYSINFO;
#endif

      /* The process ID is also the same as that of the caller.  */
      pd->pid = THREAD_GETMEM (THREAD_SELF, pid);

      /* Don't allow setxid until cloned.  */
      pd->setxid_futex = -1;

      /* Allocate the DTV for this thread.  */
      if (_dl_allocate_tls (TLS_TPADJ (pd)) == NULL)
        {
          /* Something went wrong.  */
          assert (errno == ENOMEM);
          return errno;
        }


      /* Prepare to modify global data.  */
      lll_lock (stack_cache_lock, LLL_PRIVATE);

      /* And add to the list of stacks in use.  */
      list_add (&pd->list, &__stack_user);

      lll_unlock (stack_cache_lock, LLL_PRIVATE);
    }
  else
    {
      /* Allocate some anonymous memory.  If possible use the cache.  */
      size_t guardsize;
      size_t reqsize;
      void *mem;
      const int prot = (PROT_READ | PROT_WRITE
                        | ((GL(dl_stack_flags) & PF_X) ? PROT_EXEC : 0));

#if COLORING_INCREMENT != 0
      /* Add one more page for stack coloring.  Don't do it for stacks
         with 16 times pagesize or larger.  This might just cause
         unnecessary misalignment.  */
      if (size <= 16 * pagesize_m1)
        size += pagesize_m1 + 1;
#endif

      /* Adjust the stack size for alignment.  */
      size &= ~__static_tls_align_m1;
      assert (size != 0);

      /* Make sure the size of the stack is enough for the guard and
         eventually the thread descriptor.  */
      guardsize = (attr->guardsize + pagesize_m1) & ~pagesize_m1;
      if (__builtin_expect (size < ((guardsize + __static_tls_size
                                     + MINIMAL_REST_STACK + pagesize_m1)
                                    & ~pagesize_m1),
                            0))
        /* The stack is too small (or the guard too large).  */
        return EINVAL;

      /* Try to get a stack from the cache.  */
      reqsize = size;
      pd = get_cached_stack (&size, &mem);
      if (pd == NULL)
        {
          /* To avoid aliasing effects on a larger scale than pages we
             adjust the allocated stack size if necessary.  This way
             allocations directly following each other will not have
             aliasing problems.  */
#if MULTI_PAGE_ALIASING != 0
          if ((size % MULTI_PAGE_ALIASING) == 0)
            size += pagesize_m1 + 1;
#endif

          mem = mmap (NULL, size, prot,
                      MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);

          if (__glibc_unlikely (mem == MAP_FAILED))
            return errno;

          /* SIZE is guaranteed to be greater than zero.
             So we can never get a null pointer back from mmap.  */
          assert (mem != NULL);

#if COLORING_INCREMENT != 0
          /* Atomically increment NCREATED.  */
          unsigned int ncreated = atomic_increment_val (&nptl_ncreated);

          /* We chose the offset for coloring by incrementing it for
             every new thread by a fixed amount.  The offset is used
             modulo the page size.  Even if coloring would be better
             relative to higher alignment values it makes no sense to
             do it since the mmap() interface does not allow us to
             specify any alignment for the returned memory block.  */
          size_t coloring = (ncreated * COLORING_INCREMENT) & pagesize_m1;

          /* Make sure the coloring offset does not disturb the alignment
             of the TCB and static TLS block.  */
          if (__glibc_unlikely ((coloring & __static_tls_align_m1) != 0))
            coloring = (((coloring + __static_tls_align_m1)
                         & ~(__static_tls_align_m1))
                        & ~pagesize_m1);
#else
          /* Unless specified we do not make any adjustments.  */
# define coloring 0
#endif

          /* Place the thread descriptor at the end of the stack.  */
#if TLS_TCB_AT_TP
          pd = (struct pthread *) ((char *) mem + size - coloring) - 1;
#elif TLS_DTV_AT_TP
          pd = (struct pthread *) ((((uintptr_t) mem + size - coloring
                                    - __static_tls_size)
                                    & ~__static_tls_align_m1)
                                   - TLS_PRE_TCB_SIZE);
#endif

          /* Remember the stack-related values.  */
          pd->stackblock = mem;
          pd->stackblock_size = size;

          /* We allocated the first block of the thread-specific data
             array within the TCB.  This address will not change for
             the lifetime of this descriptor.  */
          pd->specific[0] = pd->specific_1stblock;

          /* This is at least the second thread.  */
          pd->header.multiple_threads = 1;
#ifndef TLS_MULTIPLE_THREADS_IN_TCB
          __pthread_multiple_threads = *__libc_multiple_threads_ptr = 1;
#endif

#ifndef __ASSUME_PRIVATE_FUTEX
          /* The thread must know when private futexes are supported.  */
          pd->header.private_futex = THREAD_GETMEM (THREAD_SELF,
                                                    header.private_futex);
#endif

#ifdef NEED_DL_SYSINFO
          /* Copy the sysinfo value from the parent.  */
          THREAD_SYSINFO(pd) = THREAD_SELF_SYSINFO;
#endif

          /* Don't allow setxid until cloned.  */
          pd->setxid_futex = -1;

          /* The process ID is also the same as that of the caller.  */
          pd->pid = THREAD_GETMEM (THREAD_SELF, pid);

          /* Allocate the DTV for this thread.  */
          if (_dl_allocate_tls (TLS_TPADJ (pd)) == NULL)
            {
              /* Something went wrong.  */
              assert (errno == ENOMEM);

              /* Free the stack memory we just allocated.  */
              (void) munmap (mem, size);

              return errno;
            }


          /* Prepare to modify global data.  */
          lll_lock (stack_cache_lock, LLL_PRIVATE);

          /* And add to the list of stacks in use.  */
          stack_list_add (&pd->list, &stack_used);

          lll_unlock (stack_cache_lock, LLL_PRIVATE);


          /* There might have been a race.  Another thread might have
             caused the stacks to get exec permission while this new
             stack was prepared.  Detect if this was possible and
             change the permission if necessary.  */
          if (__builtin_expect ((GL(dl_stack_flags) & PF_X) != 0
                                && (prot & PROT_EXEC) == 0, 0))
            {
              int err = change_stack_perm (pd
#ifdef NEED_SEPARATE_REGISTER_STACK
                                           , ~pagesize_m1
#endif
                                           );
              if (err != 0)
                {
                  /* Free the stack memory we just allocated.  */
                  (void) munmap (mem, size);

                  return err;
                }
            }


          /* Note that all of the stack and the thread descriptor is
             zeroed.  This means we do not have to initialize fields
             with initial value zero.  This is specifically true for
             the 'tid' field which is always set back to zero once the
             stack is not used anymore and for the 'guardsize' field
             which will be read next.  */
        }

      /* Create or resize the guard area if necessary.  */
      if (__glibc_unlikely (guardsize > pd->guardsize))
        {
#ifdef NEED_SEPARATE_REGISTER_STACK
          char *guard = mem + (((size - guardsize) / 2) & ~pagesize_m1);
#elif _STACK_GROWS_DOWN
          char *guard = mem;
#elif _STACK_GROWS_UP
          char *guard = (char *) (((uintptr_t) pd - guardsize) & ~pagesize_m1);
#endif
          if (mprotect (guard, guardsize, PROT_NONE) != 0)
            {
            mprot_error:
              lll_lock (stack_cache_lock, LLL_PRIVATE);

              /* Remove the thread from the list.  */
              stack_list_del (&pd->list);

              lll_unlock (stack_cache_lock, LLL_PRIVATE);

              /* Get rid of the TLS block we allocated.  */
              _dl_deallocate_tls (TLS_TPADJ (pd), false);

              /* Free the stack memory regardless of whether the size
                 of the cache is over the limit or not.  If this piece
                 of memory caused problems we better do not use it
                 anymore.  Uh, and we ignore possible errors.  There
                 is nothing we could do.  */
              (void) munmap (mem, size);

              return errno;
            }

          pd->guardsize = guardsize;
        }
      else if (__builtin_expect (pd->guardsize - guardsize > size - reqsize,
                                 0))
        {
          /* The old guard area is too large.  */

#ifdef NEED_SEPARATE_REGISTER_STACK
          char *guard = mem + (((size - guardsize) / 2) & ~pagesize_m1);
          char *oldguard = mem + (((size - pd->guardsize) / 2) & ~pagesize_m1);

          if (oldguard < guard
              && mprotect (oldguard, guard - oldguard, prot) != 0)
            goto mprot_error;

          if (mprotect (guard + guardsize,
                        oldguard + pd->guardsize - guard - guardsize,
                        prot) != 0)
            goto mprot_error;
#elif _STACK_GROWS_DOWN
          if (mprotect ((char *) mem + guardsize, pd->guardsize - guardsize,
                        prot) != 0)
            goto mprot_error;
#elif _STACK_GROWS_UP
          if (mprotect ((char *) pd - pd->guardsize,
                        pd->guardsize - guardsize, prot) != 0)
            goto mprot_error;
#endif

          pd->guardsize = guardsize;
        }
      /* The pthread_getattr_np() calls need to get passed the size
         requested in the attribute, regardless of how large the
         actually used guardsize is.  */
      pd->reported_guardsize = guardsize;
    }

  /* Initialize the lock.  We have to do this unconditionally since the
     stillborn thread could be canceled while the lock is taken.  */
  pd->lock = LLL_LOCK_INITIALIZER;

  /* The robust mutex lists also need to be initialized
     unconditionally because the cleanup for the previous stack owner
     might have happened in the kernel.  */
  pd->robust_head.futex_offset = (offsetof (pthread_mutex_t, __data.__lock)
                                  - offsetof (pthread_mutex_t,
                                              __data.__list.__next));
  pd->robust_head.list_op_pending = NULL;
#ifdef __PTHREAD_MUTEX_HAVE_PREV
  pd->robust_prev = &pd->robust_head;
#endif
  pd->robust_head.list = &pd->robust_head;

  /* We place the thread descriptor at the end of the stack.  */
  *pdp = pd;

#if TLS_TCB_AT_TP
  /* The stack begins before the TCB and the static TLS block.  */
  stacktop = ((char *) (pd + 1) - __static_tls_size);
#elif TLS_DTV_AT_TP
  stacktop = (char *) (pd - 1);
#endif

#ifdef NEED_SEPARATE_REGISTER_STACK
  *stack = pd->stackblock;
  *stacksize = stacktop - *stack;
#elif _STACK_GROWS_DOWN
  *stack = stacktop;
#elif _STACK_GROWS_UP
  *stack = pd->stackblock;
  assert (*stack > 0);
#endif

  return 0;
}


void
internal_function
__deallocate_stack (struct pthread *pd)
{
  lll_lock (stack_cache_lock, LLL_PRIVATE);

  /* Remove the thread from the list of threads with user defined
     stacks.  */
  stack_list_del (&pd->list);

  /* Not much to do.  Just free the mmap()ed memory.  Note that we do
     not reset the 'used' flag in the 'tid' field.  This is done by
     the kernel.  If no thread has been created yet this field is
     still zero.  */
  if (__glibc_likely (! pd->user_stack))
    (void) queue_stack (pd);
  else
    /* Free the memory associated with the ELF TLS.  */
    _dl_deallocate_tls (TLS_TPADJ (pd), false);

  lll_unlock (stack_cache_lock, LLL_PRIVATE);
}


int
internal_function
__make_stacks_executable (void **stack_endp)
{
  /* First the main thread's stack.  */
  int err = _dl_make_stack_executable (stack_endp);
  if (err != 0)
    return err;

#ifdef NEED_SEPARATE_REGISTER_STACK
  const size_t pagemask = ~(__getpagesize () - 1);
#endif

  lll_lock (stack_cache_lock, LLL_PRIVATE);

  list_t *runp;
  list_for_each (runp, &stack_used)
    {
      err = change_stack_perm (list_entry (runp, struct pthread, list)
#ifdef NEED_SEPARATE_REGISTER_STACK
                               , pagemask
#endif
                               );
      if (err != 0)
        break;
    }

  /* Also change the permission for the currently unused stacks.  This
     might be wasted time but better spend it here than adding a check
     in the fast path.  */
  if (err == 0)
    list_for_each (runp, &stack_cache)
      {
        err = change_stack_perm (list_entry (runp, struct pthread, list)
#ifdef NEED_SEPARATE_REGISTER_STACK
                                 , pagemask
#endif
                                 );
        if (err != 0)
          break;
      }

  lll_unlock (stack_cache_lock, LLL_PRIVATE);

  return err;
}


/* In case of a fork() call the memory allocation in the child will be
   the same but only one thread is running.  All stacks except that of
   the one running thread are not used anymore.  We have to recycle
   them.  */
void
__reclaim_stacks (void)
{
  struct pthread *self = (struct pthread *) THREAD_SELF;

  /* No locking necessary.  The caller is the only stack in use.  But
     we have to be aware that we might have interrupted a list
     operation.  */

  if (in_flight_stack != 0)
    {
      bool add_p = in_flight_stack & 1;
      list_t *elem = (list_t *) (in_flight_stack & ~(uintptr_t) 1);

      if (add_p)
        {
          /* We always add at the beginning of the list.  So in this case we
             only need to check the beginning of these lists to see if the
             pointers at the head of the list are inconsistent.  */
          list_t *l = NULL;

          if (stack_used.next->prev != &stack_used)
            l = &stack_used;
          else if (stack_cache.next->prev != &stack_cache)
            l = &stack_cache;

          if (l != NULL)
            {
              assert (l->next->prev == elem);
              elem->next = l->next;
              elem->prev = l;
              l->next = elem;
            }
        }
      else
        {
          /* We can simply always replay the delete operation.  */
          elem->next->prev = elem->prev;
          elem->prev->next = elem->next;
        }
    }

  /* Mark all stacks except the still running one as free.  */
  list_t *runp;
  list_for_each (runp, &stack_used)
    {
      struct pthread *curp = list_entry (runp, struct pthread, list);
      if (curp != self)
        {
          /* This marks the stack as free.  */
          curp->tid = 0;

          /* The PID field must be initialized for the new process.  */
          curp->pid = self->pid;

          /* Account for the size of the stack.  */
          stack_cache_actsize += curp->stackblock_size;

          if (curp->specific_used)
            {
              /* Clear the thread-specific data.  */
              memset (curp->specific_1stblock, '\0',
                      sizeof (curp->specific_1stblock));

              curp->specific_used = false;

              for (size_t cnt = 1; cnt < PTHREAD_KEY_1STLEVEL_SIZE; ++cnt)
                if (curp->specific[cnt] != NULL)
                  {
                    memset (curp->specific[cnt], '\0',
                            sizeof (curp->specific_1stblock));

                    /* We have allocated the block which we do not
                       free here so re-set the bit.  */
                    curp->specific_used = true;
                  }
            }
        }
    }

  /* Reset the PIDs in any cached stacks.  */
  list_for_each (runp, &stack_cache)
    {
      struct pthread *curp = list_entry (runp, struct pthread, list);
      curp->pid = self->pid;
    }

  /* Add the stack of all running threads to the cache.  */
  list_splice (&stack_used, &stack_cache);

  /* Remove the entry for the current thread from the cache list
     and add it to the list of running threads.  Which of the two
     lists is decided by the user_stack flag.  */
  stack_list_del (&self->list);

  /* Re-initialize the lists for all the threads.  */
  INIT_LIST_HEAD (&stack_used);
  INIT_LIST_HEAD (&__stack_user);

  if (__glibc_unlikely (THREAD_GETMEM (self, user_stack)))
    list_add (&self->list, &__stack_user);
  else
    list_add (&self->list, &stack_used);

  /* There is one thread running.  */
  __nptl_nthreads = 1;

  in_flight_stack = 0;

  /* Initialize locks.  */
  stack_cache_lock = LLL_LOCK_INITIALIZER;
  __default_pthread_attr_lock = LLL_LOCK_INITIALIZER;
}


#if HP_TIMING_AVAIL
# undef __find_thread_by_id
/* Find a thread given the thread ID.  */
attribute_hidden
struct pthread *
__find_thread_by_id (pid_t tid)
{
  struct pthread *result = NULL;

  lll_lock (stack_cache_lock, LLL_PRIVATE);

  /* Iterate over the list with system-allocated threads first.  */
  list_t *runp;
  list_for_each (runp, &stack_used)
    {
      struct pthread *curp;

      curp = list_entry (runp, struct pthread, list);

      if (curp->tid == tid)
        {
          result = curp;
          goto out;
        }
    }

  /* Now the list with threads using user-allocated stacks.  */
  list_for_each (runp, &__stack_user)
    {
      struct pthread *curp;

      curp = list_entry (runp, struct pthread, list);

      if (curp->tid == tid)
        {
          result = curp;
          goto out;
        }
    }

 out:
  lll_unlock (stack_cache_lock, LLL_PRIVATE);

  return result;
}
#endif


static void
internal_function
setxid_mark_thread (struct xid_command *cmdp, struct pthread *t)
{
  int ch;

  /* Wait until this thread is cloned.  */
  if (t->setxid_futex == -1
      && ! atomic_compare_and_exchange_bool_acq (&t->setxid_futex, -2, -1))
    do
      lll_futex_wait (&t->setxid_futex, -2, LLL_PRIVATE);
    while (t->setxid_futex == -2);

  /* Don't let the thread exit before the setxid handler runs.  */
  t->setxid_futex = 0;

  do
    {
      ch = t->cancelhandling;

      /* If the thread is exiting right now, ignore it.  */
      if ((ch & EXITING_BITMASK) != 0)
        {
          /* Release the futex if there is no other setxid in
             progress.  */
          if ((ch & SETXID_BITMASK) == 0)
            {
              t->setxid_futex = 1;
              lll_futex_wake (&t->setxid_futex, 1, LLL_PRIVATE);
            }
          return;
        }
    }
  while (atomic_compare_and_exchange_bool_acq (&t->cancelhandling,
                                               ch | SETXID_BITMASK, ch));
}


static void
internal_function
setxid_unmark_thread (struct xid_command *cmdp, struct pthread *t)
{
  int ch;

  do
    {
      ch = t->cancelhandling;
      if ((ch & SETXID_BITMASK) == 0)
        return;
    }
  while (atomic_compare_and_exchange_bool_acq (&t->cancelhandling,
                                               ch & ~SETXID_BITMASK, ch));

  /* Release the futex just in case.  */
  t->setxid_futex = 1;
  lll_futex_wake (&t->setxid_futex, 1, LLL_PRIVATE);
}


static int
internal_function
setxid_signal_thread (struct xid_command *cmdp, struct pthread *t)
{
  if ((t->cancelhandling & SETXID_BITMASK) == 0)
    return 0;

  int val;
  INTERNAL_SYSCALL_DECL (err);
  val = INTERNAL_SYSCALL (tgkill, err, 3, THREAD_GETMEM (THREAD_SELF, pid),
                          t->tid, SIGSETXID);

  /* If this failed, the thread must not have started yet or has
     already exited.  */
  if (!INTERNAL_SYSCALL_ERROR_P (val, err))
    {
      atomic_increment (&cmdp->cntr);
      return 1;
    }
  else
    return 0;
}


int
attribute_hidden
__nptl_setxid (struct xid_command *cmdp)
{
  int signalled;
  int result;
  lll_lock (stack_cache_lock, LLL_PRIVATE);

  __xidcmd = cmdp;
  cmdp->cntr = 0;

  struct pthread *self = THREAD_SELF;

  /* Iterate over the list with system-allocated threads first.  */
  list_t *runp;
  list_for_each (runp, &stack_used)
    {
      struct pthread *t = list_entry (runp, struct pthread, list);
      if (t == self)
        continue;

      setxid_mark_thread (cmdp, t);
    }

  /* Now the list with threads using user-allocated stacks.  */
  list_for_each (runp, &__stack_user)
    {
      struct pthread *t = list_entry (runp, struct pthread, list);
      if (t == self)
        continue;

      setxid_mark_thread (cmdp, t);
    }

  /* Iterate until we don't succeed in signalling anyone.  That means
     we have gotten all running threads, and their children will be
     automatically correct once started.  */
  do
    {
      signalled = 0;

      list_for_each (runp, &stack_used)
        {
          struct pthread *t = list_entry (runp, struct pthread, list);
          if (t == self)
            continue;

          signalled += setxid_signal_thread (cmdp, t);
        }

      list_for_each (runp, &__stack_user)
        {
          struct pthread *t = list_entry (runp, struct pthread, list);
          if (t == self)
            continue;

          signalled += setxid_signal_thread (cmdp, t);
        }

      int cur = cmdp->cntr;
      while (cur != 0)
        {
          lll_futex_wait (&cmdp->cntr, cur, LLL_PRIVATE);
          cur = cmdp->cntr;
        }
    }
  while (signalled != 0);

  /* Clean up flags, so that no thread blocks during exit waiting
     for a signal which will never come.  */
  list_for_each (runp, &stack_used)
    {
      struct pthread *t = list_entry (runp, struct pthread, list);
      if (t == self)
        continue;

      setxid_unmark_thread (cmdp, t);
    }

  list_for_each (runp, &__stack_user)
    {
      struct pthread *t = list_entry (runp, struct pthread, list);
      if (t == self)
        continue;

      setxid_unmark_thread (cmdp, t);
    }

  /* This must be last, otherwise the current thread might not have
     permission to send the SIGSETXID signal to the other threads.  */
  INTERNAL_SYSCALL_DECL (err);
  result = INTERNAL_SYSCALL_NCS (cmdp->syscall_no, err, 3,
                                 cmdp->id[0], cmdp->id[1], cmdp->id[2]);
  if (INTERNAL_SYSCALL_ERROR_P (result, err))
    {
      __set_errno (INTERNAL_SYSCALL_ERRNO (result, err));
      result = -1;
    }

  lll_unlock (stack_cache_lock, LLL_PRIVATE);
  return result;
}


static inline void __attribute__((always_inline))
init_one_static_tls (struct pthread *curp, struct link_map *map)
{
  dtv_t *dtv = GET_DTV (TLS_TPADJ (curp));
# if TLS_TCB_AT_TP
  void *dest = (char *) curp - map->l_tls_offset;
# elif TLS_DTV_AT_TP
  void *dest = (char *) curp + map->l_tls_offset + TLS_PRE_TCB_SIZE;
# else
#  error "Either TLS_TCB_AT_TP or TLS_DTV_AT_TP must be defined"
# endif

  /* Fill in the DTV slot so that a later LD/GD access will find it.  */
  dtv[map->l_tls_modid].pointer.val = dest;
  dtv[map->l_tls_modid].pointer.is_static = true;

  /* Initialize the memory.  */
  memset (__mempcpy (dest, map->l_tls_initimage, map->l_tls_initimage_size),
          '\0', map->l_tls_blocksize - map->l_tls_initimage_size);
}

void
attribute_hidden
__pthread_init_static_tls (struct link_map *map)
{
  lll_lock (stack_cache_lock, LLL_PRIVATE);

  /* Iterate over the list with system-allocated threads first.  */
  list_t *runp;
  list_for_each (runp, &stack_used)
    init_one_static_tls (list_entry (runp, struct pthread, list), map);

  /* Now the list with threads using user-allocated stacks.  */
  list_for_each (runp, &__stack_user)
    init_one_static_tls (list_entry (runp, struct pthread, list), map);

  lll_unlock (stack_cache_lock, LLL_PRIVATE);
}


void
attribute_hidden
__wait_lookup_done (void)
{
  lll_lock (stack_cache_lock, LLL_PRIVATE);

  struct pthread *self = THREAD_SELF;

  /* Iterate over the list with system-allocated threads first.  */
  list_t *runp;
  list_for_each (runp, &stack_used)
    {
      struct pthread *t = list_entry (runp, struct pthread, list);
      if (t == self || t->header.gscope_flag == THREAD_GSCOPE_FLAG_UNUSED)
        continue;

      int *const gscope_flagp = &t->header.gscope_flag;

      /* We have to wait until this thread is done with the global
         scope.  First tell the thread that we are waiting and
         possibly have to be woken.  */
      if (atomic_compare_and_exchange_bool_acq (gscope_flagp,
                                                THREAD_GSCOPE_FLAG_WAIT,
                                                THREAD_GSCOPE_FLAG_USED))
        continue;

      do
        lll_futex_wait (gscope_flagp, THREAD_GSCOPE_FLAG_WAIT, LLL_PRIVATE);
      while (*gscope_flagp == THREAD_GSCOPE_FLAG_WAIT);
    }

  /* Now the list with threads using user-allocated stacks.  */
  list_for_each (runp, &__stack_user)
    {
      struct pthread *t = list_entry (runp, struct pthread, list);
      if (t == self || t->header.gscope_flag == THREAD_GSCOPE_FLAG_UNUSED)
        continue;

      int *const gscope_flagp = &t->header.gscope_flag;

      /* We have to wait until this thread is done with the global
         scope.  First tell the thread that we are waiting and
         possibly have to be woken.  */
      if (atomic_compare_and_exchange_bool_acq (gscope_flagp,
                                                THREAD_GSCOPE_FLAG_WAIT,
                                                THREAD_GSCOPE_FLAG_USED))
        continue;

      do
        lll_futex_wait (gscope_flagp, THREAD_GSCOPE_FLAG_WAIT, LLL_PRIVATE);
      while (*gscope_flagp == THREAD_GSCOPE_FLAG_WAIT);
    }

  lll_unlock (stack_cache_lock, LLL_PRIVATE);
}