nptl/allocatestack.c

   1 /* Copyright (C) 2002-2019 Free Software Foundation, Inc.
   2    This file is part of the GNU C Library.
   3    Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <http://www.gnu.org/licenses/>.  */
  18
  19 #include <assert.h>
  20 #include <errno.h>
  21 #include <signal.h>
  22 #include <stdint.h>
  23 #include <string.h>
  24 #include <unistd.h>
  25 #include <sys/mman.h>
  26 #include <sys/param.h>
  27 #include <dl-sysdep.h>
  28 #include <dl-tls.h>
  29 #include <tls.h>
  30 #include <list.h>
  31 #include <lowlevellock.h>
  32 #include <futex-internal.h>
  33 #include <kernel-features.h>
  34 #include <stack-aliasing.h>
  35
  36
#ifndef NEED_SEPARATE_REGISTER_STACK

/* Most architectures have exactly one stack pointer.  Some have more.
   This declares the local variable(s) describing the new thread's stack.  */
# define STACK_VARIABLES void *stackaddr = NULL

/* How to pass the values to the 'create_thread' function.  */
# define STACK_VARIABLES_ARGS stackaddr

/* How to declare a function which gets these parameters.  */
# define STACK_VARIABLES_PARMS void *stackaddr

/* How to declare the output parameters of allocate_stack.  */
# define ALLOCATE_STACK_PARMS void **stack

/* This is how the function is called.  We do it this way to allow
   other variants of the function to have more parameters.  */
# define ALLOCATE_STACK(attr, pd) allocate_stack (attr, pd, &stackaddr)

#else

/* We need two stacks.  The kernel will place them but we have to tell
   the kernel about the size of the reserved address space.  */
# define STACK_VARIABLES void *stackaddr = NULL; size_t stacksize = 0

/* How to pass the values to the 'create_thread' function.  */
# define STACK_VARIABLES_ARGS stackaddr, stacksize

/* How to declare a function which gets these parameters.  */
# define STACK_VARIABLES_PARMS void *stackaddr, size_t stacksize

/* How to declare the output parameters of allocate_stack.  */
# define ALLOCATE_STACK_PARMS void **stack, size_t *stacksize

/* This is how the function is called.  We do it this way to allow
   other variants of the function to have more parameters.  */
# define ALLOCATE_STACK(attr, pd) \
  allocate_stack (attr, pd, &stackaddr, &stacksize)

#endif
  76
  77
/* Default alignment of stack.  Architectures may override it by
   defining STACK_ALIGN before this point.  */
#ifndef STACK_ALIGN
# define STACK_ALIGN __alignof__ (long double)
#endif

/* Default value (in bytes) for minimal stack size after allocating
   thread descriptor and guard.  */
#ifndef MINIMAL_REST_STACK
# define MINIMAL_REST_STACK     4096
#endif


/* Newer kernels have the MAP_STACK flag to indicate a mapping is used for
   a stack.  Use it when possible; when the headers do not provide it the
   flag degrades to zero so it can be or-ed in unconditionally.  */
#ifndef MAP_STACK
# define MAP_STACK 0
#endif

/* This yields the pointer that TLS support code calls the thread pointer.
   With TLS_TCB_AT_TP it is the descriptor itself; with TLS_DTV_AT_TP it
   lies TLS_PRE_TCB_SIZE bytes past the descriptor.  */
#if TLS_TCB_AT_TP
# define TLS_TPADJ(pd) (pd)
#elif TLS_DTV_AT_TP
# define TLS_TPADJ(pd) ((struct pthread *)((char *) (pd) + TLS_PRE_TCB_SIZE))
#endif
 102
/* Cache handling for not-yet free stacks.  */

/* Maximum size of the cache in bytes (it is compared against the sum of
   the stackblock_size values of the cached stacks).  */
static size_t stack_cache_maxsize = 40 * 1024 * 1024; /* 40 MiB by default.  */
/* Sum of the stackblock_size values of all stacks currently cached.  */
static size_t stack_cache_actsize;

/* Lock taken around updates of the stack lists and the cache size
   accounting in this file.  */
static int stack_cache_lock = LLL_LOCK_INITIALIZER;

/* List of queued stack frames.  */
static LIST_HEAD (stack_cache);

/* List of the stacks in use.  */
static LIST_HEAD (stack_used);

/* We need to record what list operations we are going to do so that,
   in case of an asynchronous interruption due to a fork() call, we
   can correct for the work.  The value is the address of the list
   element being operated on; the low bit is set for an add and clear
   for a delete.  Zero means no operation is in flight.  */
static uintptr_t in_flight_stack;

/* List of the threads with user provided stacks in use.  No need to
   initialize this, since it's done in __pthread_initialize_minimal.  */
list_t __stack_user __attribute__ ((nocommon));
hidden_data_def (__stack_user)


/* Check whether the stack is still used or not.  A descriptor whose
   'tid' field is zero or negative is considered free.  */
#define FREE_P(descr) ((descr)->tid <= 0)
 131
 132
/* Unlink ELEM from the list it is on.  The address of ELEM is recorded
   in in_flight_stack (low bit clear, denoting a delete) before the list
   is modified and reset to zero afterwards, with a write barrier on each
   side of the list operation.  __reclaim_stacks inspects in_flight_stack
   to replay a delete that was interrupted by fork.  */
static void
stack_list_del (list_t *elem)
{
  /* Announce the pending delete.  */
  in_flight_stack = (uintptr_t) elem;

  atomic_write_barrier ();

  list_del (elem);

  atomic_write_barrier ();

  /* The operation is complete; nothing is in flight anymore.  */
  in_flight_stack = 0;
}
 146
 147
/* Insert ELEM into LIST.  The address of ELEM with the low bit set
   (denoting an add) is recorded in in_flight_stack before the list is
   modified and reset to zero afterwards, with a write barrier on each
   side of the list operation.  __reclaim_stacks inspects in_flight_stack
   to complete an add that was interrupted by fork.  */
static void
stack_list_add (list_t *elem, list_t *list)
{
  /* Announce the pending add; bit 0 distinguishes it from a delete.  */
  in_flight_stack = (uintptr_t) elem | 1;

  atomic_write_barrier ();

  list_add (elem, list);

  atomic_write_barrier ();

  /* The operation is complete; nothing is in flight anymore.  */
  in_flight_stack = 0;
}
 161
 162
 163 /* We create a double linked list of all cache entries.  Double linked
 164    because this allows removing entries from the end.  */
 165
 166
 167 /* Get a stack frame from the cache.  We have to match by size since
 168    some blocks might be too small or far too large.  */
 169 static struct pthread *
 170 get_cached_stack (size_t *sizep, void **memp)
 171 {
 172   size_t size = *sizep;
 173   struct pthread *result = NULL;
 174   list_t *entry;
 175
 176   lll_lock (stack_cache_lock, LLL_PRIVATE);
 177
 178   /* Search the cache for a matching entry.  We search for the
 179      smallest stack which has at least the required size.  Note that
 180      in normal situations the size of all allocated stacks is the
 181      same.  As the very least there are only a few different sizes.
 182      Therefore this loop will exit early most of the time with an
 183      exact match.  */
 184   list_for_each (entry, &stack_cache)
 185     {
 186       struct pthread *curr;
 187
 188       curr = list_entry (entry, struct pthread, list);
 189       if (FREE_P (curr) && curr->stackblock_size >= size)
 190         {
 191           if (curr->stackblock_size == size)
 192             {
 193               result = curr;
 194               break;
 195             }
 196
 197           if (result == NULL
 198               || result->stackblock_size > curr->stackblock_size)
 199             result = curr;
 200         }
 201     }
 202
 203   if (__builtin_expect (result == NULL, 0)
 204       /* Make sure the size difference is not too excessive.  In that
 205          case we do not use the block.  */
 206       || __builtin_expect (result->stackblock_size > 4 * size, 0))
 207     {
 208       /* Release the lock.  */
 209       lll_unlock (stack_cache_lock, LLL_PRIVATE);
 210
 211       return NULL;
 212     }
 213
 214   /* Don't allow setxid until cloned.  */
 215   result->setxid_futex = -1;
 216
 217   /* Dequeue the entry.  */
 218   stack_list_del (&result->list);
 219
 220   /* And add to the list of stacks in use.  */
 221   stack_list_add (&result->list, &stack_used);
 222
 223   /* And decrease the cache size.  */
 224   stack_cache_actsize -= result->stackblock_size;
 225
 226   /* Release the lock early.  */
 227   lll_unlock (stack_cache_lock, LLL_PRIVATE);
 228
 229   /* Report size and location of the stack to the caller.  */
 230   *sizep = result->stackblock_size;
 231   *memp = result->stackblock;
 232
 233   /* Cancellation handling is back to the default.  */
 234   result->cancelhandling = 0;
 235   result->cleanup = NULL;
 236
 237   /* No pending event.  */
 238   result->nextevent = NULL;
 239
 240   /* Clear the DTV.  */
 241   dtv_t *dtv = GET_DTV (TLS_TPADJ (result));
 242   for (size_t cnt = 0; cnt < dtv[-1].counter; ++cnt)
 243     free (dtv[1 + cnt].pointer.to_free);
 244   memset (dtv, '\0', (dtv[-1].counter + 1) * sizeof (dtv_t));
 245
 246   /* Re-initialize the TLS.  */
 247   _dl_allocate_tls_init (TLS_TPADJ (result));
 248
 249   return result;
 250 }
 251
 252
 253 /* Free stacks until cache size is lower than LIMIT.  */
 254 static void
 255 free_stacks (size_t limit)
 256 {
 257   /* We reduce the size of the cache.  Remove the last entries until
 258      the size is below the limit.  */
 259   list_t *entry;
 260   list_t *prev;
 261
 262   /* Search from the end of the list.  */
 263   list_for_each_prev_safe (entry, prev, &stack_cache)
 264     {
 265       struct pthread *curr;
 266
 267       curr = list_entry (entry, struct pthread, list);
 268       if (FREE_P (curr))
 269         {
 270           /* Unlink the block.  */
 271           stack_list_del (entry);
 272
 273           /* Account for the freed memory.  */
 274           stack_cache_actsize -= curr->stackblock_size;
 275
 276           /* Free the memory associated with the ELF TLS.  */
 277           _dl_deallocate_tls (TLS_TPADJ (curr), false);
 278
 279           /* Remove this block.  This should never fail.  If it does
 280              something is really wrong.  */
 281           if (__munmap (curr->stackblock, curr->stackblock_size) != 0)
 282             abort ();
 283
 284           /* Maybe we have freed enough.  */
 285           if (stack_cache_actsize <= limit)
 286             break;
 287         }
 288     }
 289 }
 290
/* Free all the stacks on cleanup.  Calls free_stacks with a limit of
   zero, so every cached stack that is marked free gets unmapped.  */
void
__nptl_stacks_freeres (void)
{
  free_stacks (0);
}
 297
 298 /* Add a stack frame which is not used anymore to the stack.  Must be
 299    called with the cache lock held.  */
 300 static inline void
 301 __attribute ((always_inline))
 302 queue_stack (struct pthread *stack)
 303 {
 304   /* We unconditionally add the stack to the list.  The memory may
 305      still be in use but it will not be reused until the kernel marks
 306      the stack as not used anymore.  */
 307   stack_list_add (&stack->list, &stack_cache);
 308
 309   stack_cache_actsize += stack->stackblock_size;
 310   if (__glibc_unlikely (stack_cache_actsize > stack_cache_maxsize))
 311     free_stacks (stack_cache_maxsize);
 312 }
 313
 314
 315 static int
 316 change_stack_perm (struct pthread *pd
 317 #ifdef NEED_SEPARATE_REGISTER_STACK
 318                    , size_t pagemask
 319 #endif
 320                    )
 321 {
 322 #ifdef NEED_SEPARATE_REGISTER_STACK
 323   void *stack = (pd->stackblock
 324                  + (((((pd->stackblock_size - pd->guardsize) / 2)
 325                       & pagemask) + pd->guardsize) & pagemask));
 326   size_t len = pd->stackblock + pd->stackblock_size - stack;
 327 #elif _STACK_GROWS_DOWN
 328   void *stack = pd->stackblock + pd->guardsize;
 329   size_t len = pd->stackblock_size - pd->guardsize;
 330 #elif _STACK_GROWS_UP
 331   void *stack = pd->stackblock;
 332   size_t len = (uintptr_t) pd - pd->guardsize - (uintptr_t) pd->stackblock;
 333 #else
 334 # error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
 335 #endif
 336   if (__mprotect (stack, len, PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
 337     return errno;
 338
 339   return 0;
 340 }
 341
 342 /* Return the guard page position on allocated stack.  */
 343 static inline char *
 344 __attribute ((always_inline))
 345 guard_position (void *mem, size_t size, size_t guardsize, struct pthread *pd,
 346                 size_t pagesize_m1)
 347 {
 348 #ifdef NEED_SEPARATE_REGISTER_STACK
 349   return mem + (((size - guardsize) / 2) & ~pagesize_m1);
 350 #elif _STACK_GROWS_DOWN
 351   return mem;
 352 #elif _STACK_GROWS_UP
 353   return (char *) (((uintptr_t) pd - guardsize) & ~pagesize_m1);
 354 #endif
 355 }
 356
 357 /* Based on stack allocated with PROT_NONE, setup the required portions with
 358    'prot' flags based on the guard page position.  */
 359 static inline int
 360 setup_stack_prot (char *mem, size_t size, char *guard, size_t guardsize,
 361                   const int prot)
 362 {
 363   char *guardend = guard + guardsize;
 364 #if _STACK_GROWS_DOWN && !defined(NEED_SEPARATE_REGISTER_STACK)
 365   /* As defined at guard_position, for architectures with downward stack
 366      the guard page is always at start of the allocated area.  */
 367   if (__mprotect (guardend, size - guardsize, prot) != 0)
 368     return errno;
 369 #else
 370   size_t mprots1 = (uintptr_t) guard - (uintptr_t) mem;
 371   if (__mprotect (mem, mprots1, prot) != 0)
 372     return errno;
 373   size_t mprots2 = ((uintptr_t) mem + size) - (uintptr_t) guardend;
 374   if (__mprotect (guardend, mprots2, prot) != 0)
 375     return errno;
 376 #endif
 377   return 0;
 378 }
 379
 380 /* Mark the memory of the stack as usable to the kernel.  It frees everything
 381    except for the space used for the TCB itself.  */
 382 static __always_inline void
 383 advise_stack_range (void *mem, size_t size, uintptr_t pd, size_t guardsize)
 384 {
 385   uintptr_t sp = (uintptr_t) CURRENT_STACK_FRAME;
 386   size_t pagesize_m1 = __getpagesize () - 1;
 387 #if _STACK_GROWS_DOWN && !defined(NEED_SEPARATE_REGISTER_STACK)
 388   size_t freesize = (sp - (uintptr_t) mem) & ~pagesize_m1;
 389   assert (freesize < size);
 390   if (freesize > PTHREAD_STACK_MIN)
 391     __madvise (mem, freesize - PTHREAD_STACK_MIN, MADV_DONTNEED);
 392 #else
 393   /* Page aligned start of memory to free (higher than or equal
 394      to current sp plus the minimum stack size).  */
 395   uintptr_t freeblock = (sp + PTHREAD_STACK_MIN + pagesize_m1) & ~pagesize_m1;
 396   uintptr_t free_end = (pd - guardsize) & ~pagesize_m1;
 397   if (free_end > freeblock)
 398     {
 399       size_t freesize = free_end - freeblock;
 400       assert (freesize < size);
 401       __madvise ((void*) freeblock, freesize, MADV_DONTNEED);
 402     }
 403 #endif
 404 }
 405
 406 /* Returns a usable stack for a new thread either by allocating a
 407    new stack or reusing a cached stack of sufficient size.
 408    ATTR must be non-NULL and point to a valid pthread_attr.
 409    PDP must be non-NULL.  */
 410 static int
 411 allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
 412                 ALLOCATE_STACK_PARMS)
 413 {
 414   struct pthread *pd;
 415   size_t size;
 416   size_t pagesize_m1 = __getpagesize () - 1;
 417
 418   assert (powerof2 (pagesize_m1 + 1));
 419   assert (TCB_ALIGNMENT >= STACK_ALIGN);
 420
 421   /* Get the stack size from the attribute if it is set.  Otherwise we
 422      use the default we determined at start time.  */
 423   if (attr->stacksize != 0)
 424     size = attr->stacksize;
 425   else
 426     {
 427       lll_lock (__default_pthread_attr_lock, LLL_PRIVATE);
 428       size = __default_pthread_attr.stacksize;
 429       lll_unlock (__default_pthread_attr_lock, LLL_PRIVATE);
 430     }
 431
 432   /* Get memory for the stack.  */
 433   if (__glibc_unlikely (attr->flags & ATTR_FLAG_STACKADDR))
 434     {
 435       uintptr_t adj;
 436       char *stackaddr = (char *) attr->stackaddr;
 437
 438       /* Assume the same layout as the _STACK_GROWS_DOWN case, with struct
 439          pthread at the top of the stack block.  Later we adjust the guard
 440          location and stack address to match the _STACK_GROWS_UP case.  */
 441       if (_STACK_GROWS_UP)
 442         stackaddr += attr->stacksize;
 443
 444       /* If the user also specified the size of the stack make sure it
 445          is large enough.  */
 446       if (attr->stacksize != 0
 447           && attr->stacksize < (__static_tls_size + MINIMAL_REST_STACK))
 448         return EINVAL;
 449
 450       /* Adjust stack size for alignment of the TLS block.  */
 451 #if TLS_TCB_AT_TP
 452       adj = ((uintptr_t) stackaddr - TLS_TCB_SIZE)
 453             & __static_tls_align_m1;
 454       assert (size > adj + TLS_TCB_SIZE);
 455 #elif TLS_DTV_AT_TP
 456       adj = ((uintptr_t) stackaddr - __static_tls_size)
 457             & __static_tls_align_m1;
 458       assert (size > adj);
 459 #endif
 460
 461       /* The user provided some memory.  Let's hope it matches the
 462          size...  We do not allocate guard pages if the user provided
 463          the stack.  It is the user's responsibility to do this if it
 464          is wanted.  */
 465 #if TLS_TCB_AT_TP
 466       pd = (struct pthread *) ((uintptr_t) stackaddr
 467                                - TLS_TCB_SIZE - adj);
 468 #elif TLS_DTV_AT_TP
 469       pd = (struct pthread *) (((uintptr_t) stackaddr
 470                                 - __static_tls_size - adj)
 471                                - TLS_PRE_TCB_SIZE);
 472 #endif
 473
 474       /* The user provided stack memory needs to be cleared.  */
 475       memset (pd, '\0', sizeof (struct pthread));
 476
 477       /* The first TSD block is included in the TCB.  */
 478       pd->specific[0] = pd->specific_1stblock;
 479
 480       /* Remember the stack-related values.  */
 481       pd->stackblock = (char *) stackaddr - size;
 482       pd->stackblock_size = size;
 483
 484       /* This is a user-provided stack.  It will not be queued in the
 485          stack cache nor will the memory (except the TLS memory) be freed.  */
 486       pd->user_stack = true;
 487
 488       /* This is at least the second thread.  */
 489       pd->header.multiple_threads = 1;
 490 #ifndef TLS_MULTIPLE_THREADS_IN_TCB
 491       __pthread_multiple_threads = *__libc_multiple_threads_ptr = 1;
 492 #endif
 493
 494 #ifdef NEED_DL_SYSINFO
 495       SETUP_THREAD_SYSINFO (pd);
 496 #endif
 497
 498       /* Don't allow setxid until cloned.  */
 499       pd->setxid_futex = -1;
 500
 501       /* Allocate the DTV for this thread.  */
 502       if (_dl_allocate_tls (TLS_TPADJ (pd)) == NULL)
 503         {
 504           /* Something went wrong.  */
 505           assert (errno == ENOMEM);
 506           return errno;
 507         }
 508
 509
 510       /* Prepare to modify global data.  */
 511       lll_lock (stack_cache_lock, LLL_PRIVATE);
 512
 513       /* And add to the list of stacks in use.  */
 514       list_add (&pd->list, &__stack_user);
 515
 516       lll_unlock (stack_cache_lock, LLL_PRIVATE);
 517     }
 518   else
 519     {
 520       /* Allocate some anonymous memory.  If possible use the cache.  */
 521       size_t guardsize;
 522       size_t reqsize;
 523       void *mem;
 524       const int prot = (PROT_READ | PROT_WRITE
 525                         | ((GL(dl_stack_flags) & PF_X) ? PROT_EXEC : 0));
 526
 527       /* Adjust the stack size for alignment.  */
 528       size &= ~__static_tls_align_m1;
 529       assert (size != 0);
 530
 531       /* Make sure the size of the stack is enough for the guard and
 532          eventually the thread descriptor.  */
 533       guardsize = (attr->guardsize + pagesize_m1) & ~pagesize_m1;
 534       if (guardsize < attr->guardsize || size + guardsize < guardsize)
 535         /* Arithmetic overflow.  */
 536         return EINVAL;
 537       size += guardsize;
 538       if (__builtin_expect (size < ((guardsize + __static_tls_size
 539                                      + MINIMAL_REST_STACK + pagesize_m1)
 540                                     & ~pagesize_m1),
 541                             0))
 542         /* The stack is too small (or the guard too large).  */
 543         return EINVAL;
 544
 545       /* Try to get a stack from the cache.  */
 546       reqsize = size;
 547       pd = get_cached_stack (&size, &mem);
 548       if (pd == NULL)
 549         {
 550           /* To avoid aliasing effects on a larger scale than pages we
 551              adjust the allocated stack size if necessary.  This way
 552              allocations directly following each other will not have
 553              aliasing problems.  */
 554 #if MULTI_PAGE_ALIASING != 0
 555           if ((size % MULTI_PAGE_ALIASING) == 0)
 556             size += pagesize_m1 + 1;
 557 #endif
 558
 559           /* If a guard page is required, avoid committing memory by first
 560              allocate with PROT_NONE and then reserve with required permission
 561              excluding the guard page.  */
 562           mem = __mmap (NULL, size, (guardsize == 0) ? prot : PROT_NONE,
 563                         MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
 564
 565           if (__glibc_unlikely (mem == MAP_FAILED))
 566             return errno;
 567
 568           /* SIZE is guaranteed to be greater than zero.
 569              So we can never get a null pointer back from mmap.  */
 570           assert (mem != NULL);
 571
 572           /* Place the thread descriptor at the end of the stack.  */
 573 #if TLS_TCB_AT_TP
 574           pd = (struct pthread *) ((((uintptr_t) mem + size)
 575                                     - TLS_TCB_SIZE)
 576                                    & ~__static_tls_align_m1);
 577 #elif TLS_DTV_AT_TP
 578           pd = (struct pthread *) ((((uintptr_t) mem + size
 579                                     - __static_tls_size)
 580                                     & ~__static_tls_align_m1)
 581                                    - TLS_PRE_TCB_SIZE);
 582 #endif
 583
 584           /* Now mprotect the required region excluding the guard area.  */
 585           if (__glibc_likely (guardsize > 0))
 586             {
 587               char *guard = guard_position (mem, size, guardsize, pd,
 588                                             pagesize_m1);
 589               if (setup_stack_prot (mem, size, guard, guardsize, prot) != 0)
 590                 {
 591                   __munmap (mem, size);
 592                   return errno;
 593                 }
 594             }
 595
 596           /* Remember the stack-related values.  */
 597           pd->stackblock = mem;
 598           pd->stackblock_size = size;
 599           /* Update guardsize for newly allocated guardsize to avoid
 600              an mprotect in guard resize below.  */
 601           pd->guardsize = guardsize;
 602
 603           /* We allocated the first block thread-specific data array.
 604              This address will not change for the lifetime of this
 605              descriptor.  */
 606           pd->specific[0] = pd->specific_1stblock;
 607
 608           /* This is at least the second thread.  */
 609           pd->header.multiple_threads = 1;
 610 #ifndef TLS_MULTIPLE_THREADS_IN_TCB
 611           __pthread_multiple_threads = *__libc_multiple_threads_ptr = 1;
 612 #endif
 613
 614 #ifdef NEED_DL_SYSINFO
 615           SETUP_THREAD_SYSINFO (pd);
 616 #endif
 617
 618           /* Don't allow setxid until cloned.  */
 619           pd->setxid_futex = -1;
 620
 621           /* Allocate the DTV for this thread.  */
 622           if (_dl_allocate_tls (TLS_TPADJ (pd)) == NULL)
 623             {
 624               /* Something went wrong.  */
 625               assert (errno == ENOMEM);
 626
 627               /* Free the stack memory we just allocated.  */
 628               (void) __munmap (mem, size);
 629
 630               return errno;
 631             }
 632
 633
 634           /* Prepare to modify global data.  */
 635           lll_lock (stack_cache_lock, LLL_PRIVATE);
 636
 637           /* And add to the list of stacks in use.  */
 638           stack_list_add (&pd->list, &stack_used);
 639
 640           lll_unlock (stack_cache_lock, LLL_PRIVATE);
 641
 642
 643           /* There might have been a race.  Another thread might have
 644              caused the stacks to get exec permission while this new
 645              stack was prepared.  Detect if this was possible and
 646              change the permission if necessary.  */
 647           if (__builtin_expect ((GL(dl_stack_flags) & PF_X) != 0
 648                                 && (prot & PROT_EXEC) == 0, 0))
 649             {
 650               int err = change_stack_perm (pd
 651 #ifdef NEED_SEPARATE_REGISTER_STACK
 652                                            , ~pagesize_m1
 653 #endif
 654                                            );
 655               if (err != 0)
 656                 {
 657                   /* Free the stack memory we just allocated.  */
 658                   (void) __munmap (mem, size);
 659
 660                   return err;
 661                 }
 662             }
 663
 664
 665           /* Note that all of the stack and the thread descriptor is
 666              zeroed.  This means we do not have to initialize fields
 667              with initial value zero.  This is specifically true for
 668              the 'tid' field which is always set back to zero once the
 669              stack is not used anymore and for the 'guardsize' field
 670              which will be read next.  */
 671         }
 672
 673       /* Create or resize the guard area if necessary.  */
 674       if (__glibc_unlikely (guardsize > pd->guardsize))
 675         {
 676           char *guard = guard_position (mem, size, guardsize, pd,
 677                                         pagesize_m1);
 678           if (__mprotect (guard, guardsize, PROT_NONE) != 0)
 679             {
 680             mprot_error:
 681               lll_lock (stack_cache_lock, LLL_PRIVATE);
 682
 683               /* Remove the thread from the list.  */
 684               stack_list_del (&pd->list);
 685
 686               lll_unlock (stack_cache_lock, LLL_PRIVATE);
 687
 688               /* Get rid of the TLS block we allocated.  */
 689               _dl_deallocate_tls (TLS_TPADJ (pd), false);
 690
 691               /* Free the stack memory regardless of whether the size
 692                  of the cache is over the limit or not.  If this piece
 693                  of memory caused problems we better do not use it
 694                  anymore.  Uh, and we ignore possible errors.  There
 695                  is nothing we could do.  */
 696               (void) __munmap (mem, size);
 697
 698               return errno;
 699             }
 700
 701           pd->guardsize = guardsize;
 702         }
 703       else if (__builtin_expect (pd->guardsize - guardsize > size - reqsize,
 704                                  0))
 705         {
 706           /* The old guard area is too large.  */
 707
 708 #ifdef NEED_SEPARATE_REGISTER_STACK
 709           char *guard = mem + (((size - guardsize) / 2) & ~pagesize_m1);
 710           char *oldguard = mem + (((size - pd->guardsize) / 2) & ~pagesize_m1);
 711
 712           if (oldguard < guard
 713               && __mprotect (oldguard, guard - oldguard, prot) != 0)
 714             goto mprot_error;
 715
 716           if (__mprotect (guard + guardsize,
 717                         oldguard + pd->guardsize - guard - guardsize,
 718                         prot) != 0)
 719             goto mprot_error;
 720 #elif _STACK_GROWS_DOWN
 721           if (__mprotect ((char *) mem + guardsize, pd->guardsize - guardsize,
 722                         prot) != 0)
 723             goto mprot_error;
 724 #elif _STACK_GROWS_UP
 725          char *new_guard = (char *)(((uintptr_t) pd - guardsize)
 726                                     & ~pagesize_m1);
 727          char *old_guard = (char *)(((uintptr_t) pd - pd->guardsize)
 728                                     & ~pagesize_m1);
 729          /* The guard size difference might be > 0, but once rounded
 730             to the nearest page the size difference might be zero.  */
 731          if (new_guard > old_guard
 732              && __mprotect (old_guard, new_guard - old_guard, prot) != 0)
 733             goto mprot_error;
 734 #endif
 735
 736           pd->guardsize = guardsize;
 737         }
 738       /* The pthread_getattr_np() calls need to get passed the size
 739          requested in the attribute, regardless of how large the
 740          actually used guardsize is.  */
 741       pd->reported_guardsize = guardsize;
 742     }
 743
 744   /* Initialize the lock.  We have to do this unconditionally since the
 745      stillborn thread could be canceled while the lock is taken.  */
 746   pd->lock = LLL_LOCK_INITIALIZER;
 747
 748   /* The robust mutex lists also need to be initialized
 749      unconditionally because the cleanup for the previous stack owner
 750      might have happened in the kernel.  */
 751   pd->robust_head.futex_offset = (offsetof (pthread_mutex_t, __data.__lock)
 752                                   - offsetof (pthread_mutex_t,
 753                                               __data.__list.__next));
 754   pd->robust_head.list_op_pending = NULL;
 755 #if __PTHREAD_MUTEX_HAVE_PREV
 756   pd->robust_prev = &pd->robust_head;
 757 #endif
 758   pd->robust_head.list = &pd->robust_head;
 759
 760   /* We place the thread descriptor at the end of the stack.  */
 761   *pdp = pd;
 762
 763 #if _STACK_GROWS_DOWN
 764   void *stacktop;
 765
 766 # if TLS_TCB_AT_TP
 767   /* The stack begins before the TCB and the static TLS block.  */
 768   stacktop = ((char *) (pd + 1) - __static_tls_size);
 769 # elif TLS_DTV_AT_TP
 770   stacktop = (char *) (pd - 1);
 771 # endif
 772
 773 # ifdef NEED_SEPARATE_REGISTER_STACK
 774   *stack = pd->stackblock;
 775   *stacksize = stacktop - *stack;
 776 # else
 777   *stack = stacktop;
 778 # endif
 779 #else
 780   *stack = pd->stackblock;
 781 #endif
 782
 783   return 0;
 784 }
 785
 786
 787 void
 788 __deallocate_stack (struct pthread *pd)
 789 {
 790   lll_lock (stack_cache_lock, LLL_PRIVATE);
 791
 792   /* Remove the thread from the list of threads with user defined
 793      stacks.  */
 794   stack_list_del (&pd->list);
 795
 796   /* Not much to do.  Just free the mmap()ed memory.  Note that we do
 797      not reset the 'used' flag in the 'tid' field.  This is done by
 798      the kernel.  If no thread has been created yet this field is
 799      still zero.  */
 800   if (__glibc_likely (! pd->user_stack))
 801     (void) queue_stack (pd);
 802   else
 803     /* Free the memory associated with the ELF TLS.  */
 804     _dl_deallocate_tls (TLS_TPADJ (pd), false);
 805
 806   lll_unlock (stack_cache_lock, LLL_PRIVATE);
 807 }
 808
 809
 810 int
 811 __make_stacks_executable (void **stack_endp)
 812 {
 813   /* First the main thread's stack.  */
 814   int err = _dl_make_stack_executable (stack_endp);
 815   if (err != 0)
 816     return err;
 817
 818 #ifdef NEED_SEPARATE_REGISTER_STACK
 819   const size_t pagemask = ~(__getpagesize () - 1);
 820 #endif
 821
 822   lll_lock (stack_cache_lock, LLL_PRIVATE);
 823
 824   list_t *runp;
 825   list_for_each (runp, &stack_used)
 826     {
 827       err = change_stack_perm (list_entry (runp, struct pthread, list)
 828 #ifdef NEED_SEPARATE_REGISTER_STACK
 829                                , pagemask
 830 #endif
 831                                );
 832       if (err != 0)
 833         break;
 834     }
 835
 836   /* Also change the permission for the currently unused stacks.  This
 837      might be wasted time but better spend it here than adding a check
 838      in the fast path.  */
 839   if (err == 0)
 840     list_for_each (runp, &stack_cache)
 841       {
 842         err = change_stack_perm (list_entry (runp, struct pthread, list)
 843 #ifdef NEED_SEPARATE_REGISTER_STACK
 844                                  , pagemask
 845 #endif
 846                                  );
 847         if (err != 0)
 848           break;
 849       }
 850
 851   lll_unlock (stack_cache_lock, LLL_PRIVATE);
 852
 853   return err;
 854 }
 855
 856
/* In case of a fork() call the memory allocation in the child will be
   the same but only one thread is running.  All stacks except that of
   the one running thread are not used anymore.  We have to recycle
   them.

   The function works in these steps:
   1. finish a stack list operation which was possibly interrupted
      (recorded in in_flight_stack);
   2. mark every stack on stack_used except the caller's as free and
      wipe its thread-specific data;
   3. move those stacks to stack_cache and re-create the lists so that
      they contain only the calling thread;
   4. reset the thread count, in_flight_stack and the locks.  */
void
__reclaim_stacks (void)
{
  struct pthread *self = (struct pthread *) THREAD_SELF;

  /* No locking necessary.  The caller is the only stack in use.  But
     we have to be aware that we might have interrupted a list
     operation.  */

  if (in_flight_stack != 0)
    {
      /* The lowest bit of in_flight_stack tells whether the pending
         operation was an add (1) or a delete (0); the remaining bits
         are the address of the list element involved.  */
      bool add_p = in_flight_stack & 1;
      list_t *elem = (list_t *) (in_flight_stack & ~(uintptr_t) 1);

      if (add_p)
        {
          /* We always add at the beginning of the list.  So in this case we
             only need to check the beginning of these lists to see if the
             pointers at the head of the list are inconsistent.  */
          list_t *l = NULL;

          if (stack_used.next->prev != &stack_used)
            l = &stack_used;
          else if (stack_cache.next->prev != &stack_cache)
            l = &stack_cache;

          /* If one of the heads is inconsistent, complete the insertion
             of ELEM right behind that head.  If both heads look fine
             nothing is repaired.  */
          if (l != NULL)
            {
              assert (l->next->prev == elem);
              elem->next = l->next;
              elem->prev = l;
              l->next = elem;
            }
        }
      else
        {
          /* We can simply always replay the delete operation.  */
          elem->next->prev = elem->prev;
          elem->prev->next = elem->next;
        }
    }

  /* Mark all stacks except the still running one as free.  */
  list_t *runp;
  list_for_each (runp, &stack_used)
    {
      struct pthread *curp = list_entry (runp, struct pthread, list);
      if (curp != self)
        {
          /* This marks the stack as free.  */
          curp->tid = 0;

          /* Account for the size of the stack.  */
          stack_cache_actsize += curp->stackblock_size;

          if (curp->specific_used)
            {
              /* Clear the thread-specific data.  */
              memset (curp->specific_1stblock, '\0',
                      sizeof (curp->specific_1stblock));

              curp->specific_used = false;

              /* Index 0 is skipped here.  NOTE(review): presumably
                 specific[0] refers to specific_1stblock, which has just
                 been cleared, and every further block has the same
                 size as specific_1stblock -- confirm against the
                 definition of struct pthread.  */
              for (size_t cnt = 1; cnt < PTHREAD_KEY_1STLEVEL_SIZE; ++cnt)
                if (curp->specific[cnt] != NULL)
                  {
                    memset (curp->specific[cnt], '\0',
                            sizeof (curp->specific_1stblock));

                    /* We have allocated the block which we do not
                       free here so re-set the bit.  */
                    curp->specific_used = true;
                  }
            }
        }
    }

  /* Add the stack of all running threads to the cache.  */
  list_splice (&stack_used, &stack_cache);

  /* Remove the entry for the current thread from the list it is on
     now.  It is added to one of the lists of running threads again
     below; which of the two lists is decided by the user_stack
     flag.  */
  stack_list_del (&self->list);

  /* Re-initialize the lists for all the threads.  */
  INIT_LIST_HEAD (&stack_used);
  INIT_LIST_HEAD (&__stack_user);

  if (__glibc_unlikely (THREAD_GETMEM (self, user_stack)))
    list_add (&self->list, &__stack_user);
  else
    list_add (&self->list, &stack_used);

  /* There is one thread running.  */
  __nptl_nthreads = 1;

  /* Whatever list operation was pending has been dealt with above.  */
  in_flight_stack = 0;

  /* Initialize locks.  */
  stack_cache_lock = LLL_LOCK_INITIALIZER;
  __default_pthread_attr_lock = LLL_LOCK_INITIALIZER;
}
 964
 965
 966 #ifdef SIGSETXID
/* Prepare thread T for a set*id request: wait until its setxid futex
   no longer indicates a pending clone, then set SETXID_BITMASK in
   T's cancelhandling word unless the thread is already exiting.
   CMDP is not used in this function.  */
static void
setxid_mark_thread (struct xid_command *cmdp, struct pthread *t)
{
  int ch;

  /* Wait until this thread is cloned.  If the futex still holds -1,
     try to change it to -2 and sleep on it for as long as it reads
     -2.  NOTE(review): presumably -1 is the value stored before the
     clone has completed and the creating code wakes us afterwards --
     the code doing that is not part of this section, confirm
     there.  */
  if (t->setxid_futex == -1
      && ! atomic_compare_and_exchange_bool_acq (&t->setxid_futex, -2, -1))
    do
      futex_wait_simple (&t->setxid_futex, -2, FUTEX_PRIVATE);
    while (t->setxid_futex == -2);

  /* Don't let the thread exit before the setxid handler runs.  */
  t->setxid_futex = 0;

  /* Set SETXID_BITMASK atomically.  The loop starts over with a
     freshly read value whenever the exchange macro returns non-zero
     (NOTE(review): presumably meaning cancelhandling changed in the
     meantime -- confirm against the definition of the macro).  */
  do
    {
      ch = t->cancelhandling;

      /* If the thread is exiting right now, ignore it.  */
      if ((ch & EXITING_BITMASK) != 0)
        {
          /* Release the futex if there is no other setxid in
             progress.  */
          if ((ch & SETXID_BITMASK) == 0)
            {
              t->setxid_futex = 1;
              futex_wake (&t->setxid_futex, 1, FUTEX_PRIVATE);
            }
          return;
        }
    }
  while (atomic_compare_and_exchange_bool_acq (&t->cancelhandling,
                                               ch | SETXID_BITMASK, ch));
}
1002
1003
1004 static void
1005 setxid_unmark_thread (struct xid_command *cmdp, struct pthread *t)
1006 {
1007   int ch;
1008
1009   do
1010     {
1011       ch = t->cancelhandling;
1012       if ((ch & SETXID_BITMASK) == 0)
1013         return;
1014     }
1015   while (atomic_compare_and_exchange_bool_acq (&t->cancelhandling,
1016                                                ch & ~SETXID_BITMASK, ch));
1017
1018   /* Release the futex just in case.  */
1019   t->setxid_futex = 1;
1020   futex_wake (&t->setxid_futex, 1, FUTEX_PRIVATE);
1021 }
1022
1023
1024 static int
1025 setxid_signal_thread (struct xid_command *cmdp, struct pthread *t)
1026 {
1027   if ((t->cancelhandling & SETXID_BITMASK) == 0)
1028     return 0;
1029
1030   int val;
1031   pid_t pid = __getpid ();
1032   INTERNAL_SYSCALL_DECL (err);
1033   val = INTERNAL_SYSCALL_CALL (tgkill, err, pid, t->tid, SIGSETXID);
1034
1035   /* If this failed, it must have had not started yet or else exited.  */
1036   if (!INTERNAL_SYSCALL_ERROR_P (val, err))
1037     {
1038       atomic_increment (&cmdp->cntr);
1039       return 1;
1040     }
1041   else
1042     return 0;
1043 }
1044
1045 /* Check for consistency across set*id system call results.  The abort
1046    should not happen as long as all privileges changes happen through
1047    the glibc wrappers.  ERROR must be 0 (no error) or an errno
1048    code.  */
1049 void
1050 attribute_hidden
1051 __nptl_setxid_error (struct xid_command *cmdp, int error)
1052 {
1053   do
1054     {
1055       int olderror = cmdp->error;
1056       if (olderror == error)
1057         break;
1058       if (olderror != -1)
1059         {
1060           /* Mismatch between current and previous results.  Save the
1061              error value to memory so that is not clobbered by the
1062              abort function and preserved in coredumps.  */
1063           volatile int xid_err __attribute__((unused)) = error;
1064           abort ();
1065         }
1066     }
1067   while (atomic_compare_and_exchange_bool_acq (&cmdp->error, error, -1));
1068 }
1069
1070 int
1071 attribute_hidden
1072 __nptl_setxid (struct xid_command *cmdp)
1073 {
1074   int signalled;
1075   int result;
1076   lll_lock (stack_cache_lock, LLL_PRIVATE);
1077
1078   __xidcmd = cmdp;
1079   cmdp->cntr = 0;
1080   cmdp->error = -1;
1081
1082   struct pthread *self = THREAD_SELF;
1083
1084   /* Iterate over the list with system-allocated threads first.  */
1085   list_t *runp;
1086   list_for_each (runp, &stack_used)
1087     {
1088       struct pthread *t = list_entry (runp, struct pthread, list);
1089       if (t == self)
1090         continue;
1091
1092       setxid_mark_thread (cmdp, t);
1093     }
1094
1095   /* Now the list with threads using user-allocated stacks.  */
1096   list_for_each (runp, &__stack_user)
1097     {
1098       struct pthread *t = list_entry (runp, struct pthread, list);
1099       if (t == self)
1100         continue;
1101
1102       setxid_mark_thread (cmdp, t);
1103     }
1104
1105   /* Iterate until we don't succeed in signalling anyone.  That means
1106      we have gotten all running threads, and their children will be
1107      automatically correct once started.  */
1108   do
1109     {
1110       signalled = 0;
1111
1112       list_for_each (runp, &stack_used)
1113         {
1114           struct pthread *t = list_entry (runp, struct pthread, list);
1115           if (t == self)
1116             continue;
1117
1118           signalled += setxid_signal_thread (cmdp, t);
1119         }
1120
1121       list_for_each (runp, &__stack_user)
1122         {
1123           struct pthread *t = list_entry (runp, struct pthread, list);
1124           if (t == self)
1125             continue;
1126
1127           signalled += setxid_signal_thread (cmdp, t);
1128         }
1129
1130       int cur = cmdp->cntr;
1131       while (cur != 0)
1132         {
1133           futex_wait_simple ((unsigned int *) &cmdp->cntr, cur,
1134                              FUTEX_PRIVATE);
1135           cur = cmdp->cntr;
1136         }
1137     }
1138   while (signalled != 0);
1139
1140   /* Clean up flags, so that no thread blocks during exit waiting
1141      for a signal which will never come.  */
1142   list_for_each (runp, &stack_used)
1143     {
1144       struct pthread *t = list_entry (runp, struct pthread, list);
1145       if (t == self)
1146         continue;
1147
1148       setxid_unmark_thread (cmdp, t);
1149     }
1150
1151   list_for_each (runp, &__stack_user)
1152     {
1153       struct pthread *t = list_entry (runp, struct pthread, list);
1154       if (t == self)
1155         continue;
1156
1157       setxid_unmark_thread (cmdp, t);
1158     }
1159
1160   /* This must be last, otherwise the current thread might not have
1161      permissions to send SIGSETXID syscall to the other threads.  */
1162   INTERNAL_SYSCALL_DECL (err);
1163   result = INTERNAL_SYSCALL_NCS (cmdp->syscall_no, err, 3,
1164                                  cmdp->id[0], cmdp->id[1], cmdp->id[2]);
1165   int error = 0;
1166   if (__glibc_unlikely (INTERNAL_SYSCALL_ERROR_P (result, err)))
1167     {
1168       error = INTERNAL_SYSCALL_ERRNO (result, err);
1169       __set_errno (error);
1170       result = -1;
1171     }
1172   __nptl_setxid_error (cmdp, error);
1173
1174   lll_unlock (stack_cache_lock, LLL_PRIVATE);
1175   return result;
1176 }
1177 #endif  /* SIGSETXID.  */
1178
1179
1180 static inline void __attribute__((always_inline))
1181 init_one_static_tls (struct pthread *curp, struct link_map *map)
1182 {
1183 # if TLS_TCB_AT_TP
1184   void *dest = (char *) curp - map->l_tls_offset;
1185 # elif TLS_DTV_AT_TP
1186   void *dest = (char *) curp + map->l_tls_offset + TLS_PRE_TCB_SIZE;
1187 # else
1188 #  error "Either TLS_TCB_AT_TP or TLS_DTV_AT_TP must be defined"
1189 # endif
1190
1191   /* Initialize the memory.  */
1192   memset (__mempcpy (dest, map->l_tls_initimage, map->l_tls_initimage_size),
1193           '\0', map->l_tls_blocksize - map->l_tls_initimage_size);
1194 }
1195
1196 void
1197 attribute_hidden
1198 __pthread_init_static_tls (struct link_map *map)
1199 {
1200   lll_lock (stack_cache_lock, LLL_PRIVATE);
1201
1202   /* Iterate over the list with system-allocated threads first.  */
1203   list_t *runp;
1204   list_for_each (runp, &stack_used)
1205     init_one_static_tls (list_entry (runp, struct pthread, list), map);
1206
1207   /* Now the list with threads using user-allocated stacks.  */
1208   list_for_each (runp, &__stack_user)
1209     init_one_static_tls (list_entry (runp, struct pthread, list), map);
1210
1211   lll_unlock (stack_cache_lock, LLL_PRIVATE);
1212 }
1213
1214
1215 void
1216 attribute_hidden
1217 __wait_lookup_done (void)
1218 {
1219   lll_lock (stack_cache_lock, LLL_PRIVATE);
1220
1221   struct pthread *self = THREAD_SELF;
1222
1223   /* Iterate over the list with system-allocated threads first.  */
1224   list_t *runp;
1225   list_for_each (runp, &stack_used)
1226     {
1227       struct pthread *t = list_entry (runp, struct pthread, list);
1228       if (t == self || t->header.gscope_flag == THREAD_GSCOPE_FLAG_UNUSED)
1229         continue;
1230
1231       int *const gscope_flagp = &t->header.gscope_flag;
1232
1233       /* We have to wait until this thread is done with the global
1234          scope.  First tell the thread that we are waiting and
1235          possibly have to be woken.  */
1236       if (atomic_compare_and_exchange_bool_acq (gscope_flagp,
1237                                                 THREAD_GSCOPE_FLAG_WAIT,
1238                                                 THREAD_GSCOPE_FLAG_USED))
1239         continue;
1240
1241       do
1242         futex_wait_simple ((unsigned int *) gscope_flagp,
1243                            THREAD_GSCOPE_FLAG_WAIT, FUTEX_PRIVATE);
1244       while (*gscope_flagp == THREAD_GSCOPE_FLAG_WAIT);
1245     }
1246
1247   /* Now the list with threads using user-allocated stacks.  */
1248   list_for_each (runp, &__stack_user)
1249     {
1250       struct pthread *t = list_entry (runp, struct pthread, list);
1251       if (t == self || t->header.gscope_flag == THREAD_GSCOPE_FLAG_UNUSED)
1252         continue;
1253
1254       int *const gscope_flagp = &t->header.gscope_flag;
1255
1256       /* We have to wait until this thread is done with the global
1257          scope.  First tell the thread that we are waiting and
1258          possibly have to be woken.  */
1259       if (atomic_compare_and_exchange_bool_acq (gscope_flagp,
1260                                                 THREAD_GSCOPE_FLAG_WAIT,
1261                                                 THREAD_GSCOPE_FLAG_USED))
1262         continue;
1263
1264       do
1265         futex_wait_simple ((unsigned int *) gscope_flagp,
1266                            THREAD_GSCOPE_FLAG_WAIT, FUTEX_PRIVATE);
1267       while (*gscope_flagp == THREAD_GSCOPE_FLAG_WAIT);
1268     }
1269
1270   lll_unlock (stack_cache_lock, LLL_PRIVATE);
1271 }