[glibc.git] / nptl / allocatestack.c
/* Copyright (C) 2002-2013 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
#include <assert.h>
#include <errno.h>
#include <signal.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/param.h>
#include <dl-sysdep.h>
#include <dl-tls.h>
#include <tls.h>
#include <list.h>
#include <lowlevellock.h>
#include <kernel-features.h>
#ifndef NEED_SEPARATE_REGISTER_STACK

/* Most architectures have exactly one stack pointer.  Some have more.  */
# define STACK_VARIABLES void *stackaddr = NULL

/* How to pass the values to the 'create_thread' function.  */
# define STACK_VARIABLES_ARGS stackaddr

/* How to declare the function which gets these parameters.  */
# define STACK_VARIABLES_PARMS void *stackaddr

/* How to declare allocate_stack.  */
# define ALLOCATE_STACK_PARMS void **stack

/* This is how the function is called.  We do it this way to allow
   other variants of the function to have more parameters.  */
# define ALLOCATE_STACK(attr, pd) allocate_stack (attr, pd, &stackaddr)

#else

/* We need two stacks.  The kernel will place them but we have to tell
   the kernel about the size of the reserved address space.  */
# define STACK_VARIABLES void *stackaddr = NULL; size_t stacksize = 0

/* How to pass the values to the 'create_thread' function.  */
# define STACK_VARIABLES_ARGS stackaddr, stacksize

/* How to declare the function which gets these parameters.  */
# define STACK_VARIABLES_PARMS void *stackaddr, size_t stacksize

/* How to declare allocate_stack.  */
# define ALLOCATE_STACK_PARMS void **stack, size_t *stacksize

/* This is how the function is called.  We do it this way to allow
   other variants of the function to have more parameters.  */
# define ALLOCATE_STACK(attr, pd) \
  allocate_stack (attr, pd, &stackaddr, &stacksize)

#endif
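
/* For context: these macros are consumed by the thread creation code
   in pthread_create.c, roughly along these lines (a simplified,
   illustrative sketch, not part of this file):

     STACK_VARIABLES;
     struct pthread *pd;

     int err = ALLOCATE_STACK (iattr, &pd);
     if (err != 0)
       return err;
     ...
     err = create_thread (pd, iattr, STACK_VARIABLES_ARGS);
*/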
/* Default alignment of stack.  */
#ifndef STACK_ALIGN
# define STACK_ALIGN __alignof__ (long double)
#endif

/* Default value for minimal stack size after allocating thread
   descriptor and guard.  */
#ifndef MINIMAL_REST_STACK
# define MINIMAL_REST_STACK 4096
#endif

/* Newer kernels have the MAP_STACK flag to indicate a mapping is used for
   a stack.  Use it when possible.  */
#ifndef MAP_STACK
# define MAP_STACK 0
#endif

/* This yields the pointer that TLS support code calls the thread pointer.  */
#if TLS_TCB_AT_TP
# define TLS_TPADJ(pd) (pd)
#elif TLS_DTV_AT_TP
# define TLS_TPADJ(pd) ((struct pthread *)((char *) (pd) + TLS_PRE_TCB_SIZE))
#endif
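
/* In other words: with TLS_TCB_AT_TP the thread descriptor itself is
   the TCB, so it already is the thread pointer.  With TLS_DTV_AT_TP
   the TCB sits TLS_PRE_TCB_SIZE bytes above the descriptor, so the
   address has to be adjusted before it is handed to the TLS support
   code (GET_DTV, _dl_allocate_tls, ...).  */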
/* Cache handling for not-yet free stacks.  */

/* Maximum size in bytes of the stack cache.  */
static size_t stack_cache_maxsize = 40 * 1024 * 1024; /* 40 MiB by default.  */
static size_t stack_cache_actsize;

/* Mutex protecting the stack cache and the lists below.  */
static int stack_cache_lock = LLL_LOCK_INITIALIZER;

/* List of queued stack frames.  */
static LIST_HEAD (stack_cache);

/* List of the stacks in use.  */
static LIST_HEAD (stack_used);

/* We need to record what list operations we are going to do so that,
   in case of an asynchronous interruption due to a fork() call, we
   can correct for the work.  */
static uintptr_t in_flight_stack;

/* List of the threads with user provided stacks in use.  No need to
   initialize this, since it's done in __pthread_initialize_minimal.  */
list_t __stack_user __attribute__ ((nocommon));
hidden_data_def (__stack_user)

#if COLORING_INCREMENT != 0
/* Number of threads created.  */
static unsigned int nptl_ncreated;
#endif
/* Check whether the stack is still used or not.  */
#define FREE_P(descr) ((descr)->tid <= 0)


static void
stack_list_del (list_t *elem)
{
  in_flight_stack = (uintptr_t) elem;

  atomic_write_barrier ();

  list_del (elem);

  atomic_write_barrier ();

  in_flight_stack = 0;
}


static void
stack_list_add (list_t *elem, list_t *list)
{
  in_flight_stack = (uintptr_t) elem | 1;

  atomic_write_barrier ();

  list_add (elem, list);

  atomic_write_barrier ();

  in_flight_stack = 0;
}
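
/* The low bit of in_flight_stack records which kind of operation was
   in progress: set for an add (stack_list_add), clear for a delete
   (stack_list_del).  __reclaim_stacks below uses this after a fork()
   to detect and repair a half-completed list operation in the child.  */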
/* We create a doubly linked list of all cache entries.  Doubly linked
   because this allows removing entries from the end.  */


/* Get a stack frame from the cache.  We have to match by size since
   some blocks might be too small or far too large.  */
static struct pthread *
get_cached_stack (size_t *sizep, void **memp)
{
  size_t size = *sizep;
  struct pthread *result = NULL;
  list_t *entry;

  lll_lock (stack_cache_lock, LLL_PRIVATE);

  /* Search the cache for a matching entry.  We search for the
     smallest stack which has at least the required size.  Note that
     in normal situations the size of all allocated stacks is the
     same.  At the very least there are only a few different sizes.
     Therefore this loop will exit early most of the time with an
     exact match.  */
  list_for_each (entry, &stack_cache)
    {
      struct pthread *curr;

      curr = list_entry (entry, struct pthread, list);
      if (FREE_P (curr) && curr->stackblock_size >= size)
        {
          if (curr->stackblock_size == size)
            {
              result = curr;
              break;
            }

          if (result == NULL
              || result->stackblock_size > curr->stackblock_size)
            result = curr;
        }
    }

  if (__builtin_expect (result == NULL, 0)
      /* Make sure the size difference is not excessive.  In that
         case we do not use the block.  */
      || __builtin_expect (result->stackblock_size > 4 * size, 0))
    {
      /* Release the lock.  */
      lll_unlock (stack_cache_lock, LLL_PRIVATE);

      return NULL;
    }

  /* Don't allow setxid until cloned.  */
  result->setxid_futex = -1;

  /* Dequeue the entry.  */
  stack_list_del (&result->list);

  /* And add to the list of stacks in use.  */
  stack_list_add (&result->list, &stack_used);

  /* And decrease the cache size.  */
  stack_cache_actsize -= result->stackblock_size;

  /* Release the lock early.  */
  lll_unlock (stack_cache_lock, LLL_PRIVATE);

  /* Report size and location of the stack to the caller.  */
  *sizep = result->stackblock_size;
  *memp = result->stackblock;

  /* Cancellation handling is back to the default.  */
  result->cancelhandling = 0;
  result->cleanup = NULL;

  /* No pending event.  */
  result->nextevent = NULL;

  /* Clear the DTV.  */
  dtv_t *dtv = GET_DTV (TLS_TPADJ (result));
  for (size_t cnt = 0; cnt < dtv[-1].counter; ++cnt)
    if (! dtv[1 + cnt].pointer.is_static
        && dtv[1 + cnt].pointer.val != TLS_DTV_UNALLOCATED)
      free (dtv[1 + cnt].pointer.val);
  memset (dtv, '\0', (dtv[-1].counter + 1) * sizeof (dtv_t));

  /* Re-initialize the TLS.  */
  _dl_allocate_tls_init (TLS_TPADJ (result));

  return result;
}
/* Free stacks until cache size is lower than LIMIT.  */
void
__free_stacks (size_t limit)
{
  /* We reduce the size of the cache.  Remove the last entries until
     the size is below the limit.  */
  list_t *entry;
  list_t *prev;

  /* Search from the end of the list.  */
  list_for_each_prev_safe (entry, prev, &stack_cache)
    {
      struct pthread *curr;

      curr = list_entry (entry, struct pthread, list);
      if (FREE_P (curr))
        {
          /* Unlink the block.  */
          stack_list_del (entry);

          /* Account for the freed memory.  */
          stack_cache_actsize -= curr->stackblock_size;

          /* Free the memory associated with the ELF TLS.  */
          _dl_deallocate_tls (TLS_TPADJ (curr), false);

          /* Remove this block.  This should never fail.  If it does
             something is really wrong.  */
          if (munmap (curr->stackblock, curr->stackblock_size) != 0)
            abort ();

          /* Maybe we have freed enough.  */
          if (stack_cache_actsize <= limit)
            break;
        }
    }
}


/* Add a stack frame which is not used anymore to the stack cache.  Must be
   called with the cache lock held.  */
static inline void
__attribute ((always_inline))
queue_stack (struct pthread *stack)
{
  /* We unconditionally add the stack to the list.  The memory may
     still be in use but it will not be reused until the kernel marks
     the stack as not used anymore.  */
  stack_list_add (&stack->list, &stack_cache);

  stack_cache_actsize += stack->stackblock_size;
  if (__builtin_expect (stack_cache_actsize > stack_cache_maxsize, 0))
    __free_stacks (stack_cache_maxsize);
}
static int
internal_function
change_stack_perm (struct pthread *pd
#ifdef NEED_SEPARATE_REGISTER_STACK
                   , size_t pagemask
#endif
                   )
{
#ifdef NEED_SEPARATE_REGISTER_STACK
  void *stack = (pd->stackblock
                 + (((((pd->stackblock_size - pd->guardsize) / 2)
                      & pagemask) + pd->guardsize) & pagemask));
  size_t len = pd->stackblock + pd->stackblock_size - stack;
#elif _STACK_GROWS_DOWN
  void *stack = pd->stackblock + pd->guardsize;
  size_t len = pd->stackblock_size - pd->guardsize;
#elif _STACK_GROWS_UP
  void *stack = pd->stackblock;
  size_t len = (uintptr_t) pd - pd->guardsize - (uintptr_t) pd->stackblock;
#else
# error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
#endif
  if (mprotect (stack, len, PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
    return errno;

  return 0;
}
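
/* change_stack_perm is called from two places in this file: from
   allocate_stack below, when another thread made all stacks executable
   while this new stack was being prepared, and from
   __make_stacks_executable, which adds PROT_EXEC to every in-use and
   cached stack.  Only the usable part of the stack is remapped; the
   guard area keeps its protection.  */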
/* Returns a usable stack for a new thread either by allocating a
   new stack or reusing a cached stack of sufficient size.
   ATTR must be non-NULL and point to a valid pthread_attr.
   PDP must be non-NULL.  */
static int
allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
                ALLOCATE_STACK_PARMS)
{
  struct pthread *pd;
  size_t size;
  size_t pagesize_m1 = __getpagesize () - 1;
  void *stacktop;

  assert (powerof2 (pagesize_m1 + 1));
  assert (TCB_ALIGNMENT >= STACK_ALIGN);

  /* Get the stack size from the attribute if it is set.  Otherwise we
     use the default we determined at start time.  */
  if (attr->stacksize != 0)
    size = attr->stacksize;
  else
    {
      lll_lock (__default_pthread_attr_lock, LLL_PRIVATE);
      size = __default_pthread_attr.stacksize;
      lll_unlock (__default_pthread_attr_lock, LLL_PRIVATE);
    }
  /* Get memory for the stack.  */
  if (__builtin_expect (attr->flags & ATTR_FLAG_STACKADDR, 0))
    {
      uintptr_t adj;

      /* If the user also specified the size of the stack make sure it
         is large enough.  */
      if (attr->stacksize != 0
          && attr->stacksize < (__static_tls_size + MINIMAL_REST_STACK))
        return EINVAL;

      /* Adjust stack size for alignment of the TLS block.  */
#if TLS_TCB_AT_TP
      adj = ((uintptr_t) attr->stackaddr - TLS_TCB_SIZE)
            & __static_tls_align_m1;
      assert (size > adj + TLS_TCB_SIZE);
#elif TLS_DTV_AT_TP
      adj = ((uintptr_t) attr->stackaddr - __static_tls_size)
            & __static_tls_align_m1;
      assert (size > adj);
#endif

      /* The user provided some memory.  Let's hope it matches the
         size...  We do not allocate guard pages if the user provided
         the stack.  It is the user's responsibility to do this if it
         is wanted.  */
#if TLS_TCB_AT_TP
      pd = (struct pthread *) ((uintptr_t) attr->stackaddr
                               - TLS_TCB_SIZE - adj);
#elif TLS_DTV_AT_TP
      pd = (struct pthread *) (((uintptr_t) attr->stackaddr
                                - __static_tls_size - adj)
                               - TLS_PRE_TCB_SIZE);
#endif

      /* The user provided stack memory needs to be cleared.  */
      memset (pd, '\0', sizeof (struct pthread));

      /* The first TSD block is included in the TCB.  */
      pd->specific[0] = pd->specific_1stblock;

      /* Remember the stack-related values.  */
      pd->stackblock = (char *) attr->stackaddr - size;
      pd->stackblock_size = size;

      /* This is a user-provided stack.  It will not be queued in the
         stack cache nor will the memory (except the TLS memory) be freed.  */
      pd->user_stack = true;

      /* This is at least the second thread.  */
      pd->header.multiple_threads = 1;
#ifndef TLS_MULTIPLE_THREADS_IN_TCB
      __pthread_multiple_threads = *__libc_multiple_threads_ptr = 1;
#endif

#ifndef __ASSUME_PRIVATE_FUTEX
      /* The thread must know when private futexes are supported.  */
      pd->header.private_futex = THREAD_GETMEM (THREAD_SELF,
                                                header.private_futex);
#endif

#ifdef NEED_DL_SYSINFO
      /* Copy the sysinfo value from the parent.  */
      THREAD_SYSINFO(pd) = THREAD_SELF_SYSINFO;
#endif

      /* The process ID is also the same as that of the caller.  */
      pd->pid = THREAD_GETMEM (THREAD_SELF, pid);

      /* Don't allow setxid until cloned.  */
      pd->setxid_futex = -1;

      /* Allocate the DTV for this thread.  */
      if (_dl_allocate_tls (TLS_TPADJ (pd)) == NULL)
        {
          /* Something went wrong.  */
          assert (errno == ENOMEM);
          return errno;
        }


      /* Prepare to modify global data.  */
      lll_lock (stack_cache_lock, LLL_PRIVATE);

      /* And add to the list of stacks in use.  */
      list_add (&pd->list, &__stack_user);

      lll_unlock (stack_cache_lock, LLL_PRIVATE);
    }
  else
    {
      /* Allocate some anonymous memory.  If possible use the cache.  */
      size_t guardsize;
      size_t reqsize;
      void *mem;
      const int prot = (PROT_READ | PROT_WRITE
                        | ((GL(dl_stack_flags) & PF_X) ? PROT_EXEC : 0));

#if COLORING_INCREMENT != 0
      /* Add one more page for stack coloring.  Don't do it for stacks
         with 16 times pagesize or larger.  This might just cause
         unnecessary misalignment.  */
      if (size <= 16 * pagesize_m1)
        size += pagesize_m1 + 1;
#endif

      /* Adjust the stack size for alignment.  */
      size &= ~__static_tls_align_m1;
      assert (size != 0);

      /* Make sure the size of the stack is enough for the guard and
         possibly the thread descriptor.  */
      guardsize = (attr->guardsize + pagesize_m1) & ~pagesize_m1;
      if (__builtin_expect (size < ((guardsize + __static_tls_size
                                     + MINIMAL_REST_STACK + pagesize_m1)
                                    & ~pagesize_m1),
                            0))
        /* The stack is too small (or the guard too large).  */
        return EINVAL;

      /* Try to get a stack from the cache.  */
      reqsize = size;
      pd = get_cached_stack (&size, &mem);
      if (pd == NULL)
        {
          /* To avoid aliasing effects on a larger scale than pages we
             adjust the allocated stack size if necessary.  This way
             allocations directly following each other will not have
             aliasing problems.  */
#if MULTI_PAGE_ALIASING != 0
          if ((size % MULTI_PAGE_ALIASING) == 0)
            size += pagesize_m1 + 1;
#endif

          mem = mmap (NULL, size, prot,
                      MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);

          if (__builtin_expect (mem == MAP_FAILED, 0))
            return errno;

          /* SIZE is guaranteed to be greater than zero.
             So we can never get a null pointer back from mmap.  */
          assert (mem != NULL);

#if COLORING_INCREMENT != 0
          /* Atomically increment NCREATED.  */
          unsigned int ncreated = atomic_increment_val (&nptl_ncreated);

          /* We choose the offset for coloring by incrementing it for
             every new thread by a fixed amount.  The offset is used
             modulo the page size.  Even if coloring would be better
             relative to higher alignment values it makes no sense to
             do it since the mmap() interface does not allow us to
             specify any alignment for the returned memory block.  */
          size_t coloring = (ncreated * COLORING_INCREMENT) & pagesize_m1;

          /* Make sure the coloring offset does not disturb the alignment
             of the TCB and static TLS block.  */
          if (__builtin_expect ((coloring & __static_tls_align_m1) != 0, 0))
            coloring = (((coloring + __static_tls_align_m1)
                         & ~(__static_tls_align_m1))
                        & ~pagesize_m1);
#else
          /* Unless specified we do not make any adjustments.  */
# define coloring 0
#endif

          /* Place the thread descriptor at the end of the stack.  */
#if TLS_TCB_AT_TP
          pd = (struct pthread *) ((char *) mem + size - coloring) - 1;
#elif TLS_DTV_AT_TP
          pd = (struct pthread *) ((((uintptr_t) mem + size - coloring
                                     - __static_tls_size)
                                    & ~__static_tls_align_m1)
                                   - TLS_PRE_TCB_SIZE);
#endif

          /* Remember the stack-related values.  */
          pd->stackblock = mem;
          pd->stackblock_size = size;

          /* We allocated the first block of the thread-specific data
             array.  This address will not change for the lifetime of
             this descriptor.  */
          pd->specific[0] = pd->specific_1stblock;

          /* This is at least the second thread.  */
          pd->header.multiple_threads = 1;
#ifndef TLS_MULTIPLE_THREADS_IN_TCB
          __pthread_multiple_threads = *__libc_multiple_threads_ptr = 1;
#endif

#ifndef __ASSUME_PRIVATE_FUTEX
          /* The thread must know when private futexes are supported.  */
          pd->header.private_futex = THREAD_GETMEM (THREAD_SELF,
                                                    header.private_futex);
#endif

#ifdef NEED_DL_SYSINFO
          /* Copy the sysinfo value from the parent.  */
          THREAD_SYSINFO(pd) = THREAD_SELF_SYSINFO;
#endif

          /* Don't allow setxid until cloned.  */
          pd->setxid_futex = -1;

          /* The process ID is also the same as that of the caller.  */
          pd->pid = THREAD_GETMEM (THREAD_SELF, pid);

          /* Allocate the DTV for this thread.  */
          if (_dl_allocate_tls (TLS_TPADJ (pd)) == NULL)
            {
              /* Something went wrong.  */
              assert (errno == ENOMEM);

              /* Free the stack memory we just allocated.  */
              (void) munmap (mem, size);

              return errno;
            }


          /* Prepare to modify global data.  */
          lll_lock (stack_cache_lock, LLL_PRIVATE);

          /* And add to the list of stacks in use.  */
          stack_list_add (&pd->list, &stack_used);

          lll_unlock (stack_cache_lock, LLL_PRIVATE);


          /* There might have been a race.  Another thread might have
             caused the stacks to get exec permission while this new
             stack was prepared.  Detect if this was possible and
             change the permission if necessary.  */
          if (__builtin_expect ((GL(dl_stack_flags) & PF_X) != 0
                                && (prot & PROT_EXEC) == 0, 0))
            {
              int err = change_stack_perm (pd
#ifdef NEED_SEPARATE_REGISTER_STACK
                                           , ~pagesize_m1
#endif
                                           );
              if (err != 0)
                {
                  /* Free the stack memory we just allocated.  */
                  (void) munmap (mem, size);

                  return err;
                }
            }


          /* Note that all of the stack and the thread descriptor is
             zeroed.  This means we do not have to initialize fields
             with initial value zero.  This is specifically true for
             the 'tid' field which is always set back to zero once the
             stack is not used anymore and for the 'guardsize' field
             which will be read next.  */
        }
      /* Create or resize the guard area if necessary.  */
      if (__builtin_expect (guardsize > pd->guardsize, 0))
        {
#ifdef NEED_SEPARATE_REGISTER_STACK
          char *guard = mem + (((size - guardsize) / 2) & ~pagesize_m1);
#elif _STACK_GROWS_DOWN
          char *guard = mem;
#elif _STACK_GROWS_UP
          char *guard = (char *) (((uintptr_t) pd - guardsize) & ~pagesize_m1);
#endif
          if (mprotect (guard, guardsize, PROT_NONE) != 0)
            {
            mprot_error:
              lll_lock (stack_cache_lock, LLL_PRIVATE);

              /* Remove the thread from the list.  */
              stack_list_del (&pd->list);

              lll_unlock (stack_cache_lock, LLL_PRIVATE);

              /* Get rid of the TLS block we allocated.  */
              _dl_deallocate_tls (TLS_TPADJ (pd), false);

              /* Free the stack memory regardless of whether the size
                 of the cache is over the limit or not.  If this piece
                 of memory caused problems we better do not use it
                 anymore.  Uh, and we ignore possible errors.  There
                 is nothing we could do.  */
              (void) munmap (mem, size);

              return errno;
            }

          pd->guardsize = guardsize;
        }
      else if (__builtin_expect (pd->guardsize - guardsize > size - reqsize,
                                 0))
        {
          /* The old guard area is too large.  */

#ifdef NEED_SEPARATE_REGISTER_STACK
          char *guard = mem + (((size - guardsize) / 2) & ~pagesize_m1);
          char *oldguard = mem + (((size - pd->guardsize) / 2) & ~pagesize_m1);

          if (oldguard < guard
              && mprotect (oldguard, guard - oldguard, prot) != 0)
            goto mprot_error;

          if (mprotect (guard + guardsize,
                        oldguard + pd->guardsize - guard - guardsize,
                        prot) != 0)
            goto mprot_error;
#elif _STACK_GROWS_DOWN
          if (mprotect ((char *) mem + guardsize, pd->guardsize - guardsize,
                        prot) != 0)
            goto mprot_error;
#elif _STACK_GROWS_UP
          if (mprotect ((char *) pd - pd->guardsize,
                        pd->guardsize - guardsize, prot) != 0)
            goto mprot_error;
#endif

          pd->guardsize = guardsize;
        }
      /* The pthread_getattr_np() calls need to get passed the size
         requested in the attribute, regardless of how large the
         actually used guardsize is.  */
      pd->reported_guardsize = guardsize;
    }
  /* Initialize the lock.  We have to do this unconditionally since the
     stillborn thread could be canceled while the lock is taken.  */
  pd->lock = LLL_LOCK_INITIALIZER;

  /* The robust mutex lists also need to be initialized
     unconditionally because the cleanup for the previous stack owner
     might have happened in the kernel.  */
  pd->robust_head.futex_offset = (offsetof (pthread_mutex_t, __data.__lock)
                                  - offsetof (pthread_mutex_t,
                                              __data.__list.__next));
  pd->robust_head.list_op_pending = NULL;
#ifdef __PTHREAD_MUTEX_HAVE_PREV
  pd->robust_prev = &pd->robust_head;
#endif
  pd->robust_head.list = &pd->robust_head;

  /* We place the thread descriptor at the end of the stack.  */
  *pdp = pd;

#if TLS_TCB_AT_TP
  /* The stack begins before the TCB and the static TLS block.  */
  stacktop = ((char *) (pd + 1) - __static_tls_size);
#elif TLS_DTV_AT_TP
  stacktop = (char *) (pd - 1);
#endif

#ifdef NEED_SEPARATE_REGISTER_STACK
  *stack = pd->stackblock;
  *stacksize = stacktop - *stack;
#elif _STACK_GROWS_DOWN
  *stack = stacktop;
#elif _STACK_GROWS_UP
  *stack = pd->stackblock;
  assert (*stack > 0);
#endif

  return 0;
}
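
/* For the common TLS_TCB_AT_TP, downward-growing configuration the
   mapping set up above roughly looks like this (addresses increase to
   the right):

     mem                                                  mem + size
      | guard (PROT_NONE) | usable stack ... | static TLS | struct pthread (TCB) |

   The reported stack top *stack is the lower end of the region that is
   reserved at the top of the mapping for the static TLS blocks and the
   thread descriptor; the TLS_DTV_AT_TP and upward-growing cases follow
   the corresponding branches above.  */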
void
internal_function
__deallocate_stack (struct pthread *pd)
{
  lll_lock (stack_cache_lock, LLL_PRIVATE);

  /* Remove the thread from the list of threads with user defined
     stacks.  */
  stack_list_del (&pd->list);

  /* Not much to do.  Just free the mmap()ed memory.  Note that we do
     not reset the 'used' flag in the 'tid' field.  This is done by
     the kernel.  If no thread has been created yet this field is
     still zero.  */
  if (__builtin_expect (! pd->user_stack, 1))
    (void) queue_stack (pd);
  else
    /* Free the memory associated with the ELF TLS.  */
    _dl_deallocate_tls (TLS_TPADJ (pd), false);

  lll_unlock (stack_cache_lock, LLL_PRIVATE);
}
int
internal_function
__make_stacks_executable (void **stack_endp)
{
  /* First the main thread's stack.  */
  int err = _dl_make_stack_executable (stack_endp);
  if (err != 0)
    return err;

#ifdef NEED_SEPARATE_REGISTER_STACK
  const size_t pagemask = ~(__getpagesize () - 1);
#endif

  lll_lock (stack_cache_lock, LLL_PRIVATE);

  list_t *runp;
  list_for_each (runp, &stack_used)
    {
      err = change_stack_perm (list_entry (runp, struct pthread, list)
#ifdef NEED_SEPARATE_REGISTER_STACK
                               , pagemask
#endif
                               );
      if (err != 0)
        break;
    }

  /* Also change the permission for the currently unused stacks.  This
     might be wasted time but better spend it here than adding a check
     in the fast path.  */
  if (err == 0)
    list_for_each (runp, &stack_cache)
      {
        err = change_stack_perm (list_entry (runp, struct pthread, list)
#ifdef NEED_SEPARATE_REGISTER_STACK
                                 , pagemask
#endif
                                 );
        if (err != 0)
          break;
      }

  lll_unlock (stack_cache_lock, LLL_PRIVATE);

  return err;
}
/* In case of a fork() call the memory allocation in the child will be
   the same but only one thread is running.  All stacks except that of
   the one running thread are not used anymore.  We have to recycle
   them.  */
void
__reclaim_stacks (void)
{
  struct pthread *self = (struct pthread *) THREAD_SELF;

  /* No locking necessary.  The caller is the only stack in use.  But
     we have to be aware that we might have interrupted a list
     operation.  */

  if (in_flight_stack != 0)
    {
      bool add_p = in_flight_stack & 1;
      list_t *elem = (list_t *) (in_flight_stack & ~(uintptr_t) 1);

      if (add_p)
        {
          /* We always add at the beginning of the list.  So in this
             case we only need to check the beginning of these lists.  */
          int check_list (list_t *l)
          {
            if (l->next->prev != l)
              {
                assert (l->next->prev == elem);

                elem->next = l->next;
                elem->prev = l;
                l->next = elem;

                return 1;
              }

            return 0;
          }

          if (check_list (&stack_used) == 0)
            (void) check_list (&stack_cache);
        }
      else
        {
          /* We can simply always replay the delete operation.  */
          elem->next->prev = elem->prev;
          elem->prev->next = elem->next;
        }
    }

  /* Mark all stacks except the still running one as free.  */
  list_t *runp;
  list_for_each (runp, &stack_used)
    {
      struct pthread *curp = list_entry (runp, struct pthread, list);
      if (curp != self)
        {
          /* This marks the stack as free.  */
          curp->tid = 0;

          /* The PID field must be initialized for the new process.  */
          curp->pid = self->pid;

          /* Account for the size of the stack.  */
          stack_cache_actsize += curp->stackblock_size;

          if (curp->specific_used)
            {
              /* Clear the thread-specific data.  */
              memset (curp->specific_1stblock, '\0',
                      sizeof (curp->specific_1stblock));

              curp->specific_used = false;

              for (size_t cnt = 1; cnt < PTHREAD_KEY_1STLEVEL_SIZE; ++cnt)
                if (curp->specific[cnt] != NULL)
                  {
                    memset (curp->specific[cnt], '\0',
                            sizeof (curp->specific_1stblock));

                    /* We have allocated the block which we do not
                       free here so re-set the bit.  */
                    curp->specific_used = true;
                  }
            }
        }
    }

  /* Reset the PIDs in any cached stacks.  */
  list_for_each (runp, &stack_cache)
    {
      struct pthread *curp = list_entry (runp, struct pthread, list);
      curp->pid = self->pid;
    }

  /* Add the stack of all running threads to the cache.  */
  list_splice (&stack_used, &stack_cache);

  /* Remove the entry for the current thread from the cache list
     and add it to the list of running threads.  Which of the two
     lists is decided by the user_stack flag.  */
  stack_list_del (&self->list);

  /* Re-initialize the lists for all the threads.  */
  INIT_LIST_HEAD (&stack_used);
  INIT_LIST_HEAD (&__stack_user);

  if (__builtin_expect (THREAD_GETMEM (self, user_stack), 0))
    list_add (&self->list, &__stack_user);
  else
    list_add (&self->list, &stack_used);

  /* There is one thread running.  */
  __nptl_nthreads = 1;

  in_flight_stack = 0;

  /* Initialize locks.  */
  stack_cache_lock = LLL_LOCK_INITIALIZER;
  __default_pthread_attr_lock = LLL_LOCK_INITIALIZER;
}
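
/* Note that the loop above only adds the reclaimed stacks to the
   cache accounting; it never trims the cache, so right after a fork()
   stack_cache_actsize may temporarily exceed stack_cache_maxsize until
   the next queue_stack call runs __free_stacks.  */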
#if HP_TIMING_AVAIL
# undef __find_thread_by_id
/* Find a thread given the thread ID.  */
attribute_hidden
struct pthread *
__find_thread_by_id (pid_t tid)
{
  struct pthread *result = NULL;

  lll_lock (stack_cache_lock, LLL_PRIVATE);

  /* Iterate over the list with system-allocated threads first.  */
  list_t *runp;
  list_for_each (runp, &stack_used)
    {
      struct pthread *curp;

      curp = list_entry (runp, struct pthread, list);

      if (curp->tid == tid)
        {
          result = curp;
          goto out;
        }
    }

  /* Now the list with threads using user-allocated stacks.  */
  list_for_each (runp, &__stack_user)
    {
      struct pthread *curp;

      curp = list_entry (runp, struct pthread, list);

      if (curp->tid == tid)
        {
          result = curp;
          goto out;
        }
    }

 out:
  lll_unlock (stack_cache_lock, LLL_PRIVATE);

  return result;
}
#endif
static void
internal_function
setxid_mark_thread (struct xid_command *cmdp, struct pthread *t)
{
  int ch;

  /* Wait until this thread is cloned.  */
  if (t->setxid_futex == -1
      && ! atomic_compare_and_exchange_bool_acq (&t->setxid_futex, -2, -1))
    do
      lll_futex_wait (&t->setxid_futex, -2, LLL_PRIVATE);
    while (t->setxid_futex == -2);

  /* Don't let the thread exit before the setxid handler runs.  */
  t->setxid_futex = 0;

  do
    {
      ch = t->cancelhandling;

      /* If the thread is exiting right now, ignore it.  */
      if ((ch & EXITING_BITMASK) != 0)
        {
          /* Release the futex if there is no other setxid in
             progress.  */
          if ((ch & SETXID_BITMASK) == 0)
            {
              t->setxid_futex = 1;
              lll_futex_wake (&t->setxid_futex, 1, LLL_PRIVATE);
            }
          return;
        }
    }
  while (atomic_compare_and_exchange_bool_acq (&t->cancelhandling,
                                               ch | SETXID_BITMASK, ch));
}
static void
internal_function
setxid_unmark_thread (struct xid_command *cmdp, struct pthread *t)
{
  int ch;

  do
    {
      ch = t->cancelhandling;
      if ((ch & SETXID_BITMASK) == 0)
        return;
    }
  while (atomic_compare_and_exchange_bool_acq (&t->cancelhandling,
                                               ch & ~SETXID_BITMASK, ch));

  /* Release the futex just in case.  */
  t->setxid_futex = 1;
  lll_futex_wake (&t->setxid_futex, 1, LLL_PRIVATE);
}
static int
internal_function
setxid_signal_thread (struct xid_command *cmdp, struct pthread *t)
{
  if ((t->cancelhandling & SETXID_BITMASK) == 0)
    return 0;

  int val;
  INTERNAL_SYSCALL_DECL (err);
  val = INTERNAL_SYSCALL (tgkill, err, 3, THREAD_GETMEM (THREAD_SELF, pid),
                          t->tid, SIGSETXID);

  /* If this failed, the thread must not have started yet or must have
     already exited.  */
  if (!INTERNAL_SYSCALL_ERROR_P (val, err))
    {
      atomic_increment (&cmdp->cntr);
      return 1;
    }
  else
    return 0;
}
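
/* The three helpers above implement the per-thread side of the setxid
   protocol driven by __nptl_setxid below: each live thread is first
   marked with SETXID_BITMASK, then signalled with SIGSETXID until no
   thread accepts the signal anymore, and finally unmarked so that no
   thread blocks waiting for a signal that will never arrive.  The
   calling thread performs the actual id-changing syscall only after
   all other threads have been handled.  */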
int
attribute_hidden
__nptl_setxid (struct xid_command *cmdp)
{
  int signalled;
  int result;
  lll_lock (stack_cache_lock, LLL_PRIVATE);

  __xidcmd = cmdp;
  cmdp->cntr = 0;

  struct pthread *self = THREAD_SELF;

  /* Iterate over the list with system-allocated threads first.  */
  list_t *runp;
  list_for_each (runp, &stack_used)
    {
      struct pthread *t = list_entry (runp, struct pthread, list);
      if (t == self)
        continue;

      setxid_mark_thread (cmdp, t);
    }

  /* Now the list with threads using user-allocated stacks.  */
  list_for_each (runp, &__stack_user)
    {
      struct pthread *t = list_entry (runp, struct pthread, list);
      if (t == self)
        continue;

      setxid_mark_thread (cmdp, t);
    }

  /* Iterate until we don't succeed in signalling anyone.  That means
     we have gotten all running threads, and their children will be
     automatically correct once started.  */
  do
    {
      signalled = 0;

      list_for_each (runp, &stack_used)
        {
          struct pthread *t = list_entry (runp, struct pthread, list);
          if (t == self)
            continue;

          signalled += setxid_signal_thread (cmdp, t);
        }

      list_for_each (runp, &__stack_user)
        {
          struct pthread *t = list_entry (runp, struct pthread, list);
          if (t == self)
            continue;

          signalled += setxid_signal_thread (cmdp, t);
        }

      int cur = cmdp->cntr;
      while (cur != 0)
        {
          lll_futex_wait (&cmdp->cntr, cur, LLL_PRIVATE);
          cur = cmdp->cntr;
        }
    }
  while (signalled != 0);

  /* Clean up flags, so that no thread blocks during exit waiting
     for a signal which will never come.  */
  list_for_each (runp, &stack_used)
    {
      struct pthread *t = list_entry (runp, struct pthread, list);
      if (t == self)
        continue;

      setxid_unmark_thread (cmdp, t);
    }

  list_for_each (runp, &__stack_user)
    {
      struct pthread *t = list_entry (runp, struct pthread, list);
      if (t == self)
        continue;

      setxid_unmark_thread (cmdp, t);
    }

  /* This must be last, otherwise the current thread might not have
     permission to send the SIGSETXID signal to the other threads.  */
  INTERNAL_SYSCALL_DECL (err);
  result = INTERNAL_SYSCALL_NCS (cmdp->syscall_no, err, 3,
                                 cmdp->id[0], cmdp->id[1], cmdp->id[2]);
  if (INTERNAL_SYSCALL_ERROR_P (result, err))
    {
      __set_errno (INTERNAL_SYSCALL_ERRNO (result, err));
      result = -1;
    }

  lll_unlock (stack_cache_lock, LLL_PRIVATE);
  return result;
}
static inline void __attribute__((always_inline))
init_one_static_tls (struct pthread *curp, struct link_map *map)
{
  dtv_t *dtv = GET_DTV (TLS_TPADJ (curp));
# if TLS_TCB_AT_TP
  void *dest = (char *) curp - map->l_tls_offset;
# elif TLS_DTV_AT_TP
  void *dest = (char *) curp + map->l_tls_offset + TLS_PRE_TCB_SIZE;
# else
#  error "Either TLS_TCB_AT_TP or TLS_DTV_AT_TP must be defined"
# endif

  /* Fill in the DTV slot so that a later LD/GD access will find it.  */
  dtv[map->l_tls_modid].pointer.val = dest;
  dtv[map->l_tls_modid].pointer.is_static = true;

  /* Initialize the memory.  */
  memset (__mempcpy (dest, map->l_tls_initimage, map->l_tls_initimage_size),
          '\0', map->l_tls_blocksize - map->l_tls_initimage_size);
}
void
attribute_hidden
__pthread_init_static_tls (struct link_map *map)
{
  lll_lock (stack_cache_lock, LLL_PRIVATE);

  /* Iterate over the list with system-allocated threads first.  */
  list_t *runp;
  list_for_each (runp, &stack_used)
    init_one_static_tls (list_entry (runp, struct pthread, list), map);

  /* Now the list with threads using user-allocated stacks.  */
  list_for_each (runp, &__stack_user)
    init_one_static_tls (list_entry (runp, struct pthread, list), map);

  lll_unlock (stack_cache_lock, LLL_PRIVATE);
}
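
/* __wait_lookup_done below synchronizes with the gscope_flag that each
   thread publishes in its descriptor: a thread inside a global-scope
   lookup has the flag set to THREAD_GSCOPE_FLAG_USED; the waiter
   switches it to THREAD_GSCOPE_FLAG_WAIT with a compare-and-exchange
   and then sleeps on the flag's futex until the owning thread leaves
   the global scope and resets it.  */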
void
attribute_hidden
__wait_lookup_done (void)
{
  lll_lock (stack_cache_lock, LLL_PRIVATE);

  struct pthread *self = THREAD_SELF;

  /* Iterate over the list with system-allocated threads first.  */
  list_t *runp;
  list_for_each (runp, &stack_used)
    {
      struct pthread *t = list_entry (runp, struct pthread, list);
      if (t == self || t->header.gscope_flag == THREAD_GSCOPE_FLAG_UNUSED)
        continue;

      int *const gscope_flagp = &t->header.gscope_flag;

      /* We have to wait until this thread is done with the global
         scope.  First tell the thread that we are waiting and
         possibly have to be woken.  */
      if (atomic_compare_and_exchange_bool_acq (gscope_flagp,
                                                THREAD_GSCOPE_FLAG_WAIT,
                                                THREAD_GSCOPE_FLAG_USED))
        continue;

      do
        lll_futex_wait (gscope_flagp, THREAD_GSCOPE_FLAG_WAIT, LLL_PRIVATE);
      while (*gscope_flagp == THREAD_GSCOPE_FLAG_WAIT);
    }

  /* Now the list with threads using user-allocated stacks.  */
  list_for_each (runp, &__stack_user)
    {
      struct pthread *t = list_entry (runp, struct pthread, list);
      if (t == self || t->header.gscope_flag == THREAD_GSCOPE_FLAG_UNUSED)
        continue;

      int *const gscope_flagp = &t->header.gscope_flag;

      /* We have to wait until this thread is done with the global
         scope.  First tell the thread that we are waiting and
         possibly have to be woken.  */
      if (atomic_compare_and_exchange_bool_acq (gscope_flagp,
                                                THREAD_GSCOPE_FLAG_WAIT,
                                                THREAD_GSCOPE_FLAG_USED))
        continue;

      do
        lll_futex_wait (gscope_flagp, THREAD_GSCOPE_FLAG_WAIT, LLL_PRIVATE);
      while (*gscope_flagp == THREAD_GSCOPE_FLAG_WAIT);
    }

  lll_unlock (stack_cache_lock, LLL_PRIVATE);
}