/* glibc: nptl/allocatestack.c.  */
/* Copyright (C) 2002-2015 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
#include <assert.h>
#include <errno.h>
#include <signal.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/param.h>
#include <dl-sysdep.h>
#include <dl-tls.h>
#include <tls.h>
#include <list.h>
#include <lowlevellock.h>
#include <futex-internal.h>
#include <kernel-features.h>
#include <stack-aliasing.h>
#ifndef NEED_SEPARATE_REGISTER_STACK

/* Most architectures have exactly one stack pointer.  Some have more.  */
# define STACK_VARIABLES void *stackaddr = NULL

/* How to pass the values to the 'create_thread' function.  */
# define STACK_VARIABLES_ARGS stackaddr

/* How to declare the function which takes these parameters.  */
# define STACK_VARIABLES_PARMS void *stackaddr

/* How to declare allocate_stack.  */
# define ALLOCATE_STACK_PARMS void **stack

/* This is how the function is called.  We do it this way to allow
   other variants of the function to have more parameters.  */
# define ALLOCATE_STACK(attr, pd) allocate_stack (attr, pd, &stackaddr)

#else

/* We need two stacks.  The kernel will place them but we have to tell
   the kernel about the size of the reserved address space.  */
# define STACK_VARIABLES void *stackaddr = NULL; size_t stacksize = 0

/* How to pass the values to the 'create_thread' function.  */
# define STACK_VARIABLES_ARGS stackaddr, stacksize

/* How to declare the function which takes these parameters.  */
# define STACK_VARIABLES_PARMS void *stackaddr, size_t stacksize

/* How to declare allocate_stack.  */
# define ALLOCATE_STACK_PARMS void **stack, size_t *stacksize

/* This is how the function is called.  We do it this way to allow
   other variants of the function to have more parameters.  */
# define ALLOCATE_STACK(attr, pd) \
  allocate_stack (attr, pd, &stackaddr, &stacksize)

#endif
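
/* Illustrative sketch (added comment, not part of the upstream file):
   on a target without NEED_SEPARATE_REGISTER_STACK the macros above
   are intended to be used by the caller in pthread_create.c roughly
   like this; the exact create_thread signature varies between glibc
   versions, so the call below is only a stand-in:

     STACK_VARIABLES;                       // void *stackaddr = NULL
     struct pthread *pd;
     int err = ALLOCATE_STACK (attr, &pd);  // allocate_stack (attr, &pd, &stackaddr)
     if (err == 0)
       err = create_thread (pd, attr, STACK_VARIABLES_ARGS);  // ..., stackaddr

   On a two-stack target (IA-64 style) the same caller code transparently
   carries the additional 'stacksize' value as well.  */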
/* Default alignment of stack.  */
#ifndef STACK_ALIGN
# define STACK_ALIGN __alignof__ (long double)
#endif

/* Default value for minimal stack size after allocating thread
   descriptor and guard.  */
#ifndef MINIMAL_REST_STACK
# define MINIMAL_REST_STACK 4096
#endif


/* Newer kernels have the MAP_STACK flag to indicate a mapping is used for
   a stack.  Use it when possible.  */
#ifndef MAP_STACK
# define MAP_STACK 0
#endif

/* This yields the pointer that TLS support code calls the thread pointer.  */
#if TLS_TCB_AT_TP
# define TLS_TPADJ(pd) (pd)
#elif TLS_DTV_AT_TP
# define TLS_TPADJ(pd) ((struct pthread *)((char *) (pd) + TLS_PRE_TCB_SIZE))
#endif
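
/* Illustrative note (added comment, not part of the upstream file),
   assuming the usual meaning of the two TLS variants:

     - TLS_TCB_AT_TP: the thread pointer points at the TCB, which is the
       start of 'struct pthread', so no adjustment is needed:
           thread pointer == pd
     - TLS_DTV_AT_TP: 'struct pthread' sits TLS_PRE_TCB_SIZE bytes below
       the thread pointer, so the descriptor address has to be shifted up
       to recover the thread-pointer value:
           thread pointer == (struct pthread *) ((char *) pd + TLS_PRE_TCB_SIZE)

   TLS_TPADJ therefore converts a thread descriptor into the value the
   TLS support code (e.g. _dl_allocate_tls) expects as the thread
   pointer.  */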
/* Cache handling for not-yet free stacks.  */

/* Maximum size in bytes of the stack cache.  */
static size_t stack_cache_maxsize = 40 * 1024 * 1024; /* 40 MiB by default.  */
static size_t stack_cache_actsize;

/* Mutex protecting the stack cache and the thread descriptor lists.  */
static int stack_cache_lock = LLL_LOCK_INITIALIZER;

/* List of queued stack frames.  */
static LIST_HEAD (stack_cache);

/* List of the stacks in use.  */
static LIST_HEAD (stack_used);

/* We need to record what list operations we are going to do so that,
   in case of an asynchronous interruption due to a fork() call, we
   can correct for the work.  */
static uintptr_t in_flight_stack;

/* List of the threads with user provided stacks in use.  No need to
   initialize this, since it's done in __pthread_initialize_minimal.  */
list_t __stack_user __attribute__ ((nocommon));
hidden_data_def (__stack_user)

#if COLORING_INCREMENT != 0
/* Number of threads created.  */
static unsigned int nptl_ncreated;
#endif
/* Check whether the stack is still used or not.  */
#define FREE_P(descr) ((descr)->tid <= 0)


static void
stack_list_del (list_t *elem)
{
  in_flight_stack = (uintptr_t) elem;

  atomic_write_barrier ();

  list_del (elem);

  atomic_write_barrier ();

  in_flight_stack = 0;
}


static void
stack_list_add (list_t *elem, list_t *list)
{
  in_flight_stack = (uintptr_t) elem | 1;

  atomic_write_barrier ();

  list_add (elem, list);

  atomic_write_barrier ();

  in_flight_stack = 0;
}
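
/* Illustrative note (added comment, not part of the upstream file): the
   two helpers above publish the pending operation in IN_FLIGHT_STACK so
   that __reclaim_stacks (below) can repair the lists if fork()
   interrupts them.  Bit 0 encodes which operation was in flight:

     in_flight_stack = (uintptr_t) elem;      // delete of ELEM in progress
     in_flight_stack = (uintptr_t) elem | 1;  // add of ELEM in progress

   __reclaim_stacks masks off bit 0 to recover ELEM and then either
   replays the delete or finishes the interrupted add.  */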
/* We create a doubly-linked list of all cache entries.  Doubly linked
   because this allows removing entries from the end.  */


/* Get a stack frame from the cache.  We have to match by size since
   some blocks might be too small or far too large.  */
static struct pthread *
get_cached_stack (size_t *sizep, void **memp)
{
  size_t size = *sizep;
  struct pthread *result = NULL;
  list_t *entry;

  lll_lock (stack_cache_lock, LLL_PRIVATE);

  /* Search the cache for a matching entry.  We search for the
     smallest stack which has at least the required size.  Note that
     in normal situations the size of all allocated stacks is the
     same.  At the very least there are only a few different sizes.
     Therefore this loop will exit early most of the time with an
     exact match.  */
  list_for_each (entry, &stack_cache)
    {
      struct pthread *curr;

      curr = list_entry (entry, struct pthread, list);
      if (FREE_P (curr) && curr->stackblock_size >= size)
	{
	  if (curr->stackblock_size == size)
	    {
	      result = curr;
	      break;
	    }

	  if (result == NULL
	      || result->stackblock_size > curr->stackblock_size)
	    result = curr;
	}
    }

  if (__builtin_expect (result == NULL, 0)
      /* Make sure the size difference is not too excessive.  In that
	 case we do not use the block.  */
      || __builtin_expect (result->stackblock_size > 4 * size, 0))
    {
      /* Release the lock.  */
      lll_unlock (stack_cache_lock, LLL_PRIVATE);

      return NULL;
    }

  /* Don't allow setxid until cloned.  */
  result->setxid_futex = -1;

  /* Dequeue the entry.  */
  stack_list_del (&result->list);

  /* And add to the list of stacks in use.  */
  stack_list_add (&result->list, &stack_used);

  /* And decrease the cache size.  */
  stack_cache_actsize -= result->stackblock_size;

  /* Release the lock early.  */
  lll_unlock (stack_cache_lock, LLL_PRIVATE);

  /* Report size and location of the stack to the caller.  */
  *sizep = result->stackblock_size;
  *memp = result->stackblock;

  /* Cancellation handling is back to the default.  */
  result->cancelhandling = 0;
  result->cleanup = NULL;

  /* No pending event.  */
  result->nextevent = NULL;

  /* Clear the DTV.  */
  dtv_t *dtv = GET_DTV (TLS_TPADJ (result));
  for (size_t cnt = 0; cnt < dtv[-1].counter; ++cnt)
    if (! dtv[1 + cnt].pointer.is_static
	&& dtv[1 + cnt].pointer.val != TLS_DTV_UNALLOCATED)
      free (dtv[1 + cnt].pointer.val);
  memset (dtv, '\0', (dtv[-1].counter + 1) * sizeof (dtv_t));

  /* Re-initialize the TLS.  */
  _dl_allocate_tls_init (TLS_TPADJ (result));

  return result;
}
/* Free stacks until cache size is lower than LIMIT.  */
void
__free_stacks (size_t limit)
{
  /* We reduce the size of the cache.  Remove the last entries until
     the size is below the limit.  */
  list_t *entry;
  list_t *prev;

  /* Search from the end of the list.  */
  list_for_each_prev_safe (entry, prev, &stack_cache)
    {
      struct pthread *curr;

      curr = list_entry (entry, struct pthread, list);
      if (FREE_P (curr))
	{
	  /* Unlink the block.  */
	  stack_list_del (entry);

	  /* Account for the freed memory.  */
	  stack_cache_actsize -= curr->stackblock_size;

	  /* Free the memory associated with the ELF TLS.  */
	  _dl_deallocate_tls (TLS_TPADJ (curr), false);

	  /* Remove this block.  This should never fail.  If it does
	     something is really wrong.  */
	  if (munmap (curr->stackblock, curr->stackblock_size) != 0)
	    abort ();

	  /* Maybe we have freed enough.  */
	  if (stack_cache_actsize <= limit)
	    break;
	}
    }
}


/* Add a stack frame which is not used anymore to the cache.  Must be
   called with the cache lock held.  */
static inline void
__attribute ((always_inline))
queue_stack (struct pthread *stack)
{
  /* We unconditionally add the stack to the list.  The memory may
     still be in use but it will not be reused until the kernel marks
     the stack as not used anymore.  */
  stack_list_add (&stack->list, &stack_cache);

  stack_cache_actsize += stack->stackblock_size;
  if (__glibc_unlikely (stack_cache_actsize > stack_cache_maxsize))
    __free_stacks (stack_cache_maxsize);
}
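
/* Worked example (added comment, not part of the upstream file): with
   the default stack_cache_maxsize of 40 MiB and the common 8 MiB
   default thread stack size, the cache holds roughly

     40 MiB / 8 MiB = 5

   freed stacks for reuse before queue_stack triggers __free_stacks and
   the oldest entries at the tail of the list are unmapped.  The 8 MiB
   figure is only the usual RLIMIT_STACK default, not something this
   file guarantees.  */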
static int
internal_function
change_stack_perm (struct pthread *pd
#ifdef NEED_SEPARATE_REGISTER_STACK
		   , size_t pagemask
#endif
		   )
{
#ifdef NEED_SEPARATE_REGISTER_STACK
  void *stack = (pd->stackblock
		 + (((((pd->stackblock_size - pd->guardsize) / 2)
		      & pagemask) + pd->guardsize) & pagemask));
  size_t len = pd->stackblock + pd->stackblock_size - stack;
#elif _STACK_GROWS_DOWN
  void *stack = pd->stackblock + pd->guardsize;
  size_t len = pd->stackblock_size - pd->guardsize;
#elif _STACK_GROWS_UP
  void *stack = pd->stackblock;
  size_t len = (uintptr_t) pd - pd->guardsize - (uintptr_t) pd->stackblock;
#else
# error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
#endif
  if (mprotect (stack, len, PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
    return errno;

  return 0;
}
/* Returns a usable stack for a new thread either by allocating a
   new stack or reusing a cached stack of sufficient size.
   ATTR must be non-NULL and point to a valid pthread_attr.
   PDP must be non-NULL.  */
static int
allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
		ALLOCATE_STACK_PARMS)
{
  struct pthread *pd;
  size_t size;
  size_t pagesize_m1 = __getpagesize () - 1;

  assert (powerof2 (pagesize_m1 + 1));
  assert (TCB_ALIGNMENT >= STACK_ALIGN);

  /* Get the stack size from the attribute if it is set.  Otherwise we
     use the default we determined at start time.  */
  if (attr->stacksize != 0)
    size = attr->stacksize;
  else
    {
      lll_lock (__default_pthread_attr_lock, LLL_PRIVATE);
      size = __default_pthread_attr.stacksize;
      lll_unlock (__default_pthread_attr_lock, LLL_PRIVATE);
    }
  /* Get memory for the stack.  */
  if (__glibc_unlikely (attr->flags & ATTR_FLAG_STACKADDR))
    {
      uintptr_t adj;

      /* If the user also specified the size of the stack make sure it
	 is large enough.  */
      if (attr->stacksize != 0
	  && attr->stacksize < (__static_tls_size + MINIMAL_REST_STACK))
	return EINVAL;

      /* Adjust stack size for alignment of the TLS block.  */
#if TLS_TCB_AT_TP
      adj = ((uintptr_t) attr->stackaddr - TLS_TCB_SIZE)
	    & __static_tls_align_m1;
      assert (size > adj + TLS_TCB_SIZE);
#elif TLS_DTV_AT_TP
      adj = ((uintptr_t) attr->stackaddr - __static_tls_size)
	    & __static_tls_align_m1;
      assert (size > adj);
#endif

      /* The user provided some memory.  Let's hope it matches the
	 size...  We do not allocate guard pages if the user provided
	 the stack.  It is the user's responsibility to do this if it
	 is wanted.  */
#if TLS_TCB_AT_TP
      pd = (struct pthread *) ((uintptr_t) attr->stackaddr
			       - TLS_TCB_SIZE - adj);
#elif TLS_DTV_AT_TP
      pd = (struct pthread *) (((uintptr_t) attr->stackaddr
				- __static_tls_size - adj)
			       - TLS_PRE_TCB_SIZE);
#endif

      /* The user provided stack memory needs to be cleared.  */
      memset (pd, '\0', sizeof (struct pthread));

      /* The first TSD block is included in the TCB.  */
      pd->specific[0] = pd->specific_1stblock;

      /* Remember the stack-related values.  */
      pd->stackblock = (char *) attr->stackaddr - size;
      pd->stackblock_size = size;

      /* This is a user-provided stack.  It will not be queued in the
	 stack cache nor will the memory (except the TLS memory) be freed.  */
      pd->user_stack = true;

      /* This is at least the second thread.  */
      pd->header.multiple_threads = 1;
#ifndef TLS_MULTIPLE_THREADS_IN_TCB
      __pthread_multiple_threads = *__libc_multiple_threads_ptr = 1;
#endif

#ifndef __ASSUME_PRIVATE_FUTEX
      /* The thread must know when private futexes are supported.  */
      pd->header.private_futex = THREAD_GETMEM (THREAD_SELF,
						header.private_futex);
#endif

#ifdef NEED_DL_SYSINFO
      SETUP_THREAD_SYSINFO (pd);
#endif

      /* The process ID is also the same as that of the caller.  */
      pd->pid = THREAD_GETMEM (THREAD_SELF, pid);

      /* Don't allow setxid until cloned.  */
      pd->setxid_futex = -1;

      /* Allocate the DTV for this thread.  */
      if (_dl_allocate_tls (TLS_TPADJ (pd)) == NULL)
	{
	  /* Something went wrong.  */
	  assert (errno == ENOMEM);
	  return errno;
	}


      /* Prepare to modify global data.  */
      lll_lock (stack_cache_lock, LLL_PRIVATE);

      /* And add to the list of stacks in use.  */
      list_add (&pd->list, &__stack_user);

      lll_unlock (stack_cache_lock, LLL_PRIVATE);
    }
  else
    {
      /* Allocate some anonymous memory.  If possible use the cache.  */
      size_t guardsize;
      size_t reqsize;
      void *mem;
      const int prot = (PROT_READ | PROT_WRITE
			| ((GL(dl_stack_flags) & PF_X) ? PROT_EXEC : 0));

#if COLORING_INCREMENT != 0
      /* Add one more page for stack coloring.  Don't do it for stacks
	 with 16 times pagesize or larger.  This might just cause
	 unnecessary misalignment.  */
      if (size <= 16 * pagesize_m1)
	size += pagesize_m1 + 1;
#endif

      /* Adjust the stack size for alignment.  */
      size &= ~__static_tls_align_m1;
      assert (size != 0);

      /* Make sure the size of the stack is enough for the guard and
	 possibly the thread descriptor.  */
      guardsize = (attr->guardsize + pagesize_m1) & ~pagesize_m1;
      if (__builtin_expect (size < ((guardsize + __static_tls_size
				     + MINIMAL_REST_STACK + pagesize_m1)
				    & ~pagesize_m1),
			    0))
	/* The stack is too small (or the guard too large).  */
	return EINVAL;

      /* Try to get a stack from the cache.  */
      reqsize = size;
      pd = get_cached_stack (&size, &mem);
      if (pd == NULL)
	{
	  /* To avoid aliasing effects on a larger scale than pages we
	     adjust the allocated stack size if necessary.  This way
	     allocations directly following each other will not have
	     aliasing problems.  */
#if MULTI_PAGE_ALIASING != 0
	  if ((size % MULTI_PAGE_ALIASING) == 0)
	    size += pagesize_m1 + 1;
#endif

	  mem = mmap (NULL, size, prot,
		      MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);

	  if (__glibc_unlikely (mem == MAP_FAILED))
	    return errno;

	  /* SIZE is guaranteed to be greater than zero.
	     So we can never get a null pointer back from mmap.  */
	  assert (mem != NULL);

#if COLORING_INCREMENT != 0
	  /* Atomically increment NCREATED.  */
	  unsigned int ncreated = atomic_increment_val (&nptl_ncreated);

	  /* We choose the offset for coloring by incrementing it for
	     every new thread by a fixed amount.  The offset is used
	     modulo the page size.  Even if coloring would be better
	     relative to higher alignment values it makes no sense to
	     do it since the mmap() interface does not allow us to
	     specify any alignment for the returned memory block.  */
	  size_t coloring = (ncreated * COLORING_INCREMENT) & pagesize_m1;

	  /* Make sure the coloring offset does not disturb the alignment
	     of the TCB and static TLS block.  */
	  if (__glibc_unlikely ((coloring & __static_tls_align_m1) != 0))
	    coloring = (((coloring + __static_tls_align_m1)
			 & ~(__static_tls_align_m1))
			& ~pagesize_m1);
#else
	  /* Unless specified we do not make any adjustments.  */
# define coloring 0
#endif

	  /* Place the thread descriptor at the end of the stack.  */
#if TLS_TCB_AT_TP
	  pd = (struct pthread *) ((char *) mem + size - coloring) - 1;
#elif TLS_DTV_AT_TP
	  pd = (struct pthread *) ((((uintptr_t) mem + size - coloring
				     - __static_tls_size)
				    & ~__static_tls_align_m1)
				   - TLS_PRE_TCB_SIZE);
#endif
	  /* Remember the stack-related values.  */
	  pd->stackblock = mem;
	  pd->stackblock_size = size;

	  /* We allocated the first block of the thread-specific data
	     array.  This address will not change for the lifetime of
	     this descriptor.  */
	  pd->specific[0] = pd->specific_1stblock;

	  /* This is at least the second thread.  */
	  pd->header.multiple_threads = 1;
#ifndef TLS_MULTIPLE_THREADS_IN_TCB
	  __pthread_multiple_threads = *__libc_multiple_threads_ptr = 1;
#endif

#ifndef __ASSUME_PRIVATE_FUTEX
	  /* The thread must know when private futexes are supported.  */
	  pd->header.private_futex = THREAD_GETMEM (THREAD_SELF,
						    header.private_futex);
#endif

#ifdef NEED_DL_SYSINFO
	  SETUP_THREAD_SYSINFO (pd);
#endif

	  /* Don't allow setxid until cloned.  */
	  pd->setxid_futex = -1;

	  /* The process ID is also the same as that of the caller.  */
	  pd->pid = THREAD_GETMEM (THREAD_SELF, pid);

	  /* Allocate the DTV for this thread.  */
	  if (_dl_allocate_tls (TLS_TPADJ (pd)) == NULL)
	    {
	      /* Something went wrong.  */
	      assert (errno == ENOMEM);

	      /* Free the stack memory we just allocated.  */
	      (void) munmap (mem, size);

	      return errno;
	    }


	  /* Prepare to modify global data.  */
	  lll_lock (stack_cache_lock, LLL_PRIVATE);

	  /* And add to the list of stacks in use.  */
	  stack_list_add (&pd->list, &stack_used);

	  lll_unlock (stack_cache_lock, LLL_PRIVATE);


	  /* There might have been a race.  Another thread might have
	     caused the stacks to get exec permission while this new
	     stack was prepared.  Detect if this was possible and
	     change the permission if necessary.  */
	  if (__builtin_expect ((GL(dl_stack_flags) & PF_X) != 0
				&& (prot & PROT_EXEC) == 0, 0))
	    {
	      int err = change_stack_perm (pd
#ifdef NEED_SEPARATE_REGISTER_STACK
					   , ~pagesize_m1
#endif
					   );
	      if (err != 0)
		{
		  /* Free the stack memory we just allocated.  */
		  (void) munmap (mem, size);

		  return err;
		}
	    }
	}

      /* Note that all of the stack and the thread descriptor is
	 zeroed.  This means we do not have to initialize fields
	 with initial value zero.  This is specifically true for
	 the 'tid' field which is always set back to zero once the
	 stack is not used anymore and for the 'guardsize' field
	 which will be read next.  */
      /* Create or resize the guard area if necessary.  */
      if (__glibc_unlikely (guardsize > pd->guardsize))
	{
#ifdef NEED_SEPARATE_REGISTER_STACK
	  char *guard = mem + (((size - guardsize) / 2) & ~pagesize_m1);
#elif _STACK_GROWS_DOWN
	  char *guard = mem;
#elif _STACK_GROWS_UP
	  char *guard = (char *) (((uintptr_t) pd - guardsize) & ~pagesize_m1);
#endif
	  if (mprotect (guard, guardsize, PROT_NONE) != 0)
	    {
	    mprot_error:
	      lll_lock (stack_cache_lock, LLL_PRIVATE);

	      /* Remove the thread from the list.  */
	      stack_list_del (&pd->list);

	      lll_unlock (stack_cache_lock, LLL_PRIVATE);

	      /* Get rid of the TLS block we allocated.  */
	      _dl_deallocate_tls (TLS_TPADJ (pd), false);

	      /* Free the stack memory regardless of whether the size
		 of the cache is over the limit or not.  If this piece
		 of memory caused problems we better do not use it
		 anymore.  Uh, and we ignore possible errors.  There
		 is nothing we could do.  */
	      (void) munmap (mem, size);

	      return errno;
	    }

	  pd->guardsize = guardsize;
	}
      else if (__builtin_expect (pd->guardsize - guardsize > size - reqsize,
				 0))
	{
	  /* The old guard area is too large.  */

#ifdef NEED_SEPARATE_REGISTER_STACK
	  char *guard = mem + (((size - guardsize) / 2) & ~pagesize_m1);
	  char *oldguard = mem + (((size - pd->guardsize) / 2) & ~pagesize_m1);

	  if (oldguard < guard
	      && mprotect (oldguard, guard - oldguard, prot) != 0)
	    goto mprot_error;

	  if (mprotect (guard + guardsize,
			oldguard + pd->guardsize - guard - guardsize,
			prot) != 0)
	    goto mprot_error;
#elif _STACK_GROWS_DOWN
	  if (mprotect ((char *) mem + guardsize, pd->guardsize - guardsize,
			prot) != 0)
	    goto mprot_error;
#elif _STACK_GROWS_UP
	  if (mprotect ((char *) pd - pd->guardsize,
			pd->guardsize - guardsize, prot) != 0)
	    goto mprot_error;
#endif

	  pd->guardsize = guardsize;
	}
      /* The pthread_getattr_np() calls need to get passed the size
	 requested in the attribute, regardless of how large the
	 actually used guardsize is.  */
      pd->reported_guardsize = guardsize;
    }
  /* Initialize the lock.  We have to do this unconditionally since the
     stillborn thread could be canceled while the lock is taken.  */
  pd->lock = LLL_LOCK_INITIALIZER;

  /* The robust mutex lists also need to be initialized
     unconditionally because the cleanup for the previous stack owner
     might have happened in the kernel.  */
  pd->robust_head.futex_offset = (offsetof (pthread_mutex_t, __data.__lock)
				  - offsetof (pthread_mutex_t,
					      __data.__list.__next));
  pd->robust_head.list_op_pending = NULL;
#ifdef __PTHREAD_MUTEX_HAVE_PREV
  pd->robust_prev = &pd->robust_head;
#endif
  pd->robust_head.list = &pd->robust_head;

  /* We place the thread descriptor at the end of the stack.  */
  *pdp = pd;

#if _STACK_GROWS_DOWN
  void *stacktop;

# if TLS_TCB_AT_TP
  /* The stack begins before the TCB and the static TLS block.  */
  stacktop = ((char *) (pd + 1) - __static_tls_size);
# elif TLS_DTV_AT_TP
  stacktop = (char *) (pd - 1);
# endif

# ifdef NEED_SEPARATE_REGISTER_STACK
  *stack = pd->stackblock;
  *stacksize = stacktop - *stack;
# else
  *stack = stacktop;
# endif
#else
  *stack = pd->stackblock;
  assert (*stack > 0);
#endif

  return 0;
}
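
/* Usage sketch (added comment, not part of the upstream file): from the
   application side the two branches of allocate_stack correspond to the
   standard pthread attribute calls.  Setting only a size ends up in the
   mmap/cache path above, while pthread_attr_setstack hands in caller
   memory and takes the ATTR_FLAG_STACKADDR branch.  The names 'tid',
   'start_routine', 'arg' and 'buf' are placeholders:

     pthread_attr_t a;
     pthread_attr_init (&a);

     // glibc allocates (or reuses a cached) stack of this size,
     // including the guard area.
     pthread_attr_setstacksize (&a, 0x100000);

     // Alternative: caller-provided stack; no guard page is added.
     //   void *buf = mmap (NULL, 0x100000, PROT_READ | PROT_WRITE,
     //                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
     //   pthread_attr_setstack (&a, buf, 0x100000);

     pthread_create (&tid, &a, start_routine, arg);
*/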
void
internal_function
__deallocate_stack (struct pthread *pd)
{
  lll_lock (stack_cache_lock, LLL_PRIVATE);

  /* Remove the thread from the list of threads with user defined
     stacks.  */
  stack_list_del (&pd->list);

  /* Not much to do.  Just free the mmap()ed memory.  Note that we do
     not reset the 'used' flag in the 'tid' field.  This is done by
     the kernel.  If no thread has been created yet this field is
     still zero.  */
  if (__glibc_likely (! pd->user_stack))
    (void) queue_stack (pd);
  else
    /* Free the memory associated with the ELF TLS.  */
    _dl_deallocate_tls (TLS_TPADJ (pd), false);

  lll_unlock (stack_cache_lock, LLL_PRIVATE);
}
int
internal_function
__make_stacks_executable (void **stack_endp)
{
  /* First the main thread's stack.  */
  int err = _dl_make_stack_executable (stack_endp);
  if (err != 0)
    return err;

#ifdef NEED_SEPARATE_REGISTER_STACK
  const size_t pagemask = ~(__getpagesize () - 1);
#endif

  lll_lock (stack_cache_lock, LLL_PRIVATE);

  list_t *runp;
  list_for_each (runp, &stack_used)
    {
      err = change_stack_perm (list_entry (runp, struct pthread, list)
#ifdef NEED_SEPARATE_REGISTER_STACK
			       , pagemask
#endif
			       );
      if (err != 0)
	break;
    }

  /* Also change the permission for the currently unused stacks.  This
     might be wasted time but better spend it here than adding a check
     in the fast path.  */
  if (err == 0)
    list_for_each (runp, &stack_cache)
      {
	err = change_stack_perm (list_entry (runp, struct pthread, list)
#ifdef NEED_SEPARATE_REGISTER_STACK
				 , pagemask
#endif
				 );
	if (err != 0)
	  break;
      }

  lll_unlock (stack_cache_lock, LLL_PRIVATE);

  return err;
}
/* In case of a fork() call the memory allocation in the child will be
   the same but only one thread is running.  All stacks except that of
   the one running thread are not used anymore.  We have to recycle
   them.  */
void
__reclaim_stacks (void)
{
  struct pthread *self = (struct pthread *) THREAD_SELF;

  /* No locking necessary.  The calling thread is the only one running.
     But we have to be aware that we might have interrupted a list
     operation.  */

  if (in_flight_stack != 0)
    {
      bool add_p = in_flight_stack & 1;
      list_t *elem = (list_t *) (in_flight_stack & ~(uintptr_t) 1);

      if (add_p)
	{
	  /* We always add at the beginning of the list.  So in this case we
	     only need to check the beginning of these lists to see if the
	     pointers at the head of the list are inconsistent.  */
	  list_t *l = NULL;

	  if (stack_used.next->prev != &stack_used)
	    l = &stack_used;
	  else if (stack_cache.next->prev != &stack_cache)
	    l = &stack_cache;

	  if (l != NULL)
	    {
	      assert (l->next->prev == elem);
	      elem->next = l->next;
	      elem->prev = l;
	      l->next = elem;
	    }
	}
      else
	{
	  /* We can simply always replay the delete operation.  */
	  elem->next->prev = elem->prev;
	  elem->prev->next = elem->next;
	}
    }

  /* Mark all stacks except the still running one as free.  */
  list_t *runp;
  list_for_each (runp, &stack_used)
    {
      struct pthread *curp = list_entry (runp, struct pthread, list);
      if (curp != self)
	{
	  /* This marks the stack as free.  */
	  curp->tid = 0;

	  /* The PID field must be initialized for the new process.  */
	  curp->pid = self->pid;

	  /* Account for the size of the stack.  */
	  stack_cache_actsize += curp->stackblock_size;

	  if (curp->specific_used)
	    {
	      /* Clear the thread-specific data.  */
	      memset (curp->specific_1stblock, '\0',
		      sizeof (curp->specific_1stblock));

	      curp->specific_used = false;

	      for (size_t cnt = 1; cnt < PTHREAD_KEY_1STLEVEL_SIZE; ++cnt)
		if (curp->specific[cnt] != NULL)
		  {
		    memset (curp->specific[cnt], '\0',
			    sizeof (curp->specific_1stblock));

		    /* We have allocated the block which we do not
		       free here so re-set the bit.  */
		    curp->specific_used = true;
		  }
	    }
	}
    }

  /* Reset the PIDs in any cached stacks.  */
  list_for_each (runp, &stack_cache)
    {
      struct pthread *curp = list_entry (runp, struct pthread, list);
      curp->pid = self->pid;
    }

  /* Add the stack of all running threads to the cache.  */
  list_splice (&stack_used, &stack_cache);

  /* Remove the entry for the current thread from the cache list and
     add it to the list of running threads.  Which of the two lists is
     used is decided by the user_stack flag.  */
  stack_list_del (&self->list);

  /* Re-initialize the lists for all the threads.  */
  INIT_LIST_HEAD (&stack_used);
  INIT_LIST_HEAD (&__stack_user);

  if (__glibc_unlikely (THREAD_GETMEM (self, user_stack)))
    list_add (&self->list, &__stack_user);
  else
    list_add (&self->list, &stack_used);

  /* There is one thread running.  */
  __nptl_nthreads = 1;

  in_flight_stack = 0;

  /* Initialize locks.  */
  stack_cache_lock = LLL_LOCK_INITIALIZER;
  __default_pthread_attr_lock = LLL_LOCK_INITIALIZER;
}
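
/* Illustrative note (added comment, not part of the upstream file):
   this function runs in the child of a fork() from a multithreaded
   process, where only the forking thread survives.  Conceptually:

     // parent: several threads, each with an entry on stack_used
     pid_t child = fork ();
     if (child == 0)
       {
         // child: only one thread exists, but the other threads'
         // stacks are still mapped; __reclaim_stacks moves them into
         // the cache so later pthread_create calls can reuse them.
       }

   It is invoked from glibc's fork implementation, not by user code.  */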
#if HP_TIMING_AVAIL
# undef __find_thread_by_id
/* Find a thread given the thread ID.  */
attribute_hidden
struct pthread *
__find_thread_by_id (pid_t tid)
{
  struct pthread *result = NULL;

  lll_lock (stack_cache_lock, LLL_PRIVATE);

  /* Iterate over the list with system-allocated threads first.  */
  list_t *runp;
  list_for_each (runp, &stack_used)
    {
      struct pthread *curp;

      curp = list_entry (runp, struct pthread, list);

      if (curp->tid == tid)
	{
	  result = curp;
	  goto out;
	}
    }

  /* Now the list with threads using user-allocated stacks.  */
  list_for_each (runp, &__stack_user)
    {
      struct pthread *curp;

      curp = list_entry (runp, struct pthread, list);

      if (curp->tid == tid)
	{
	  result = curp;
	  goto out;
	}
    }

 out:
  lll_unlock (stack_cache_lock, LLL_PRIVATE);

  return result;
}
#endif
#ifdef SIGSETXID
static void
internal_function
setxid_mark_thread (struct xid_command *cmdp, struct pthread *t)
{
  int ch;

  /* Wait until this thread is cloned.  */
  if (t->setxid_futex == -1
      && ! atomic_compare_and_exchange_bool_acq (&t->setxid_futex, -2, -1))
    do
      futex_wait_simple (&t->setxid_futex, -2, FUTEX_PRIVATE);
    while (t->setxid_futex == -2);

  /* Don't let the thread exit before the setxid handler runs.  */
  t->setxid_futex = 0;

  do
    {
      ch = t->cancelhandling;

      /* If the thread is exiting right now, ignore it.  */
      if ((ch & EXITING_BITMASK) != 0)
	{
	  /* Release the futex if there is no other setxid in
	     progress.  */
	  if ((ch & SETXID_BITMASK) == 0)
	    {
	      t->setxid_futex = 1;
	      futex_wake (&t->setxid_futex, 1, FUTEX_PRIVATE);
	    }
	  return;
	}
    }
  while (atomic_compare_and_exchange_bool_acq (&t->cancelhandling,
					       ch | SETXID_BITMASK, ch));
}


static void
internal_function
setxid_unmark_thread (struct xid_command *cmdp, struct pthread *t)
{
  int ch;

  do
    {
      ch = t->cancelhandling;
      if ((ch & SETXID_BITMASK) == 0)
	return;
    }
  while (atomic_compare_and_exchange_bool_acq (&t->cancelhandling,
					       ch & ~SETXID_BITMASK, ch));

  /* Release the futex just in case.  */
  t->setxid_futex = 1;
  futex_wake (&t->setxid_futex, 1, FUTEX_PRIVATE);
}
static int
internal_function
setxid_signal_thread (struct xid_command *cmdp, struct pthread *t)
{
  if ((t->cancelhandling & SETXID_BITMASK) == 0)
    return 0;

  int val;
  INTERNAL_SYSCALL_DECL (err);
  val = INTERNAL_SYSCALL (tgkill, err, 3, THREAD_GETMEM (THREAD_SELF, pid),
			  t->tid, SIGSETXID);

  /* If this failed, the thread must not have started yet or must have
     already exited.  */
  if (!INTERNAL_SYSCALL_ERROR_P (val, err))
    {
      atomic_increment (&cmdp->cntr);
      return 1;
    }
  else
    return 0;
}
/* Check for consistency across set*id system call results.  The abort
   should not happen as long as all privilege changes happen through
   the glibc wrappers.  ERROR must be 0 (no error) or an errno
   code.  */
void
attribute_hidden
__nptl_setxid_error (struct xid_command *cmdp, int error)
{
  do
    {
      int olderror = cmdp->error;
      if (olderror == error)
	break;
      if (olderror != -1)
	/* Mismatch between current and previous results.  */
	abort ();
    }
  while (atomic_compare_and_exchange_bool_acq (&cmdp->error, error, -1));
}
int
attribute_hidden
__nptl_setxid (struct xid_command *cmdp)
{
  int signalled;
  int result;
  lll_lock (stack_cache_lock, LLL_PRIVATE);

  __xidcmd = cmdp;
  cmdp->cntr = 0;
  cmdp->error = -1;

  struct pthread *self = THREAD_SELF;

  /* Iterate over the list with system-allocated threads first.  */
  list_t *runp;
  list_for_each (runp, &stack_used)
    {
      struct pthread *t = list_entry (runp, struct pthread, list);
      if (t == self)
	continue;

      setxid_mark_thread (cmdp, t);
    }

  /* Now the list with threads using user-allocated stacks.  */
  list_for_each (runp, &__stack_user)
    {
      struct pthread *t = list_entry (runp, struct pthread, list);
      if (t == self)
	continue;

      setxid_mark_thread (cmdp, t);
    }

  /* Iterate until we don't succeed in signalling anyone.  That means
     we have gotten all running threads, and their children will be
     automatically correct once started.  */
  do
    {
      signalled = 0;

      list_for_each (runp, &stack_used)
	{
	  struct pthread *t = list_entry (runp, struct pthread, list);
	  if (t == self)
	    continue;

	  signalled += setxid_signal_thread (cmdp, t);
	}

      list_for_each (runp, &__stack_user)
	{
	  struct pthread *t = list_entry (runp, struct pthread, list);
	  if (t == self)
	    continue;

	  signalled += setxid_signal_thread (cmdp, t);
	}

      int cur = cmdp->cntr;
      while (cur != 0)
	{
	  futex_wait_simple ((unsigned int *) &cmdp->cntr, cur,
			     FUTEX_PRIVATE);
	  cur = cmdp->cntr;
	}
    }
  while (signalled != 0);

  /* Clean up flags, so that no thread blocks during exit waiting
     for a signal which will never come.  */
  list_for_each (runp, &stack_used)
    {
      struct pthread *t = list_entry (runp, struct pthread, list);
      if (t == self)
	continue;

      setxid_unmark_thread (cmdp, t);
    }

  list_for_each (runp, &__stack_user)
    {
      struct pthread *t = list_entry (runp, struct pthread, list);
      if (t == self)
	continue;

      setxid_unmark_thread (cmdp, t);
    }

  /* This must be last, otherwise the current thread might not have
     permission to send the SIGSETXID signal to the other threads.  */
  INTERNAL_SYSCALL_DECL (err);
  result = INTERNAL_SYSCALL_NCS (cmdp->syscall_no, err, 3,
				 cmdp->id[0], cmdp->id[1], cmdp->id[2]);
  int error = 0;
  if (__glibc_unlikely (INTERNAL_SYSCALL_ERROR_P (result, err)))
    {
      error = INTERNAL_SYSCALL_ERRNO (result, err);
      __set_errno (error);
      result = -1;
    }
  __nptl_setxid_error (cmdp, error);

  lll_unlock (stack_cache_lock, LLL_PRIVATE);
  return result;
}
#endif  /* SIGSETXID.  */
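
/* Illustrative note (added comment, not part of the upstream file):
   POSIX requires credential changes to apply to the whole process, but
   the underlying Linux syscalls only affect the calling task.  The
   machinery above is what makes a call such as

     if (setuid (uid) != 0)     // glibc wrapper, not the raw syscall
       perror ("setuid");

   take effect in every thread: the wrapper builds an xid_command,
   __nptl_setxid marks and signals each other thread with SIGSETXID so
   that it performs the same syscall on itself, and only then does the
   calling thread issue the syscall for itself.  */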
static inline void __attribute__((always_inline))
init_one_static_tls (struct pthread *curp, struct link_map *map)
{
# if TLS_TCB_AT_TP
  void *dest = (char *) curp - map->l_tls_offset;
# elif TLS_DTV_AT_TP
  void *dest = (char *) curp + map->l_tls_offset + TLS_PRE_TCB_SIZE;
# else
#  error "Either TLS_TCB_AT_TP or TLS_DTV_AT_TP must be defined"
# endif

  /* We cannot delay the initialization of the Static TLS area, since
     it can be accessed with LE or IE, but since the DTV is only used
     by GD and LD, we can delay its update to avoid a race.  */
  memset (__mempcpy (dest, map->l_tls_initimage, map->l_tls_initimage_size),
	  '\0', map->l_tls_blocksize - map->l_tls_initimage_size);
}
void
attribute_hidden
__pthread_init_static_tls (struct link_map *map)
{
  lll_lock (stack_cache_lock, LLL_PRIVATE);

  /* Iterate over the list with system-allocated threads first.  */
  list_t *runp;
  list_for_each (runp, &stack_used)
    init_one_static_tls (list_entry (runp, struct pthread, list), map);

  /* Now the list with threads using user-allocated stacks.  */
  list_for_each (runp, &__stack_user)
    init_one_static_tls (list_entry (runp, struct pthread, list), map);

  lll_unlock (stack_cache_lock, LLL_PRIVATE);
}
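
/* Illustrative note (added comment, not part of the upstream file):
   this hook is reached when the dynamic linker places a newly loaded
   object's TLS segment in the static TLS area, for example an
   initial-exec model DSO brought in via dlopen.  The library name and
   variable below are made up for the example:

     // libtls.so built with: gcc -shared -fPIC -ftls-model=initial-exec ...
     //   __thread int counter = 42;
     void *h = dlopen ("libtls.so", RTLD_NOW);

   Every already-running thread then needs its copy of that TLS image
   initialized, which is what the loop above does for both thread
   lists.  */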
void
attribute_hidden
__wait_lookup_done (void)
{
  lll_lock (stack_cache_lock, LLL_PRIVATE);

  struct pthread *self = THREAD_SELF;

  /* Iterate over the list with system-allocated threads first.  */
  list_t *runp;
  list_for_each (runp, &stack_used)
    {
      struct pthread *t = list_entry (runp, struct pthread, list);
      if (t == self || t->header.gscope_flag == THREAD_GSCOPE_FLAG_UNUSED)
	continue;

      int *const gscope_flagp = &t->header.gscope_flag;

      /* We have to wait until this thread is done with the global
	 scope.  First tell the thread that we are waiting and
	 possibly have to be woken.  */
      if (atomic_compare_and_exchange_bool_acq (gscope_flagp,
						THREAD_GSCOPE_FLAG_WAIT,
						THREAD_GSCOPE_FLAG_USED))
	continue;

      do
	futex_wait_simple ((unsigned int *) gscope_flagp,
			   THREAD_GSCOPE_FLAG_WAIT, FUTEX_PRIVATE);
      while (*gscope_flagp == THREAD_GSCOPE_FLAG_WAIT);
    }

  /* Now the list with threads using user-allocated stacks.  */
  list_for_each (runp, &__stack_user)
    {
      struct pthread *t = list_entry (runp, struct pthread, list);
      if (t == self || t->header.gscope_flag == THREAD_GSCOPE_FLAG_UNUSED)
	continue;

      int *const gscope_flagp = &t->header.gscope_flag;

      /* We have to wait until this thread is done with the global
	 scope.  First tell the thread that we are waiting and
	 possibly have to be woken.  */
      if (atomic_compare_and_exchange_bool_acq (gscope_flagp,
						THREAD_GSCOPE_FLAG_WAIT,
						THREAD_GSCOPE_FLAG_USED))
	continue;

      do
	futex_wait_simple ((unsigned int *) gscope_flagp,
			   THREAD_GSCOPE_FLAG_WAIT, FUTEX_PRIVATE);
      while (*gscope_flagp == THREAD_GSCOPE_FLAG_WAIT);
    }

  lll_unlock (stack_cache_lock, LLL_PRIVATE);
}