libpthread/nptl/allocatestack.c

   1 /* Copyright (C) 2002-2007, 2009 Free Software Foundation, Inc.
   2    This file is part of the GNU C Library.
   3    Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <http://www.gnu.org/licenses/>.  */
  18
  19 #include <assert.h>
  20 #include <errno.h>
  21 #include <signal.h>
  22 #include <stdint.h>
  23 #include <string.h>
  24 #include <unistd.h>
  25 #include <sys/mman.h>
  26 #include <sys/param.h>
  27 #include <tls.h>
  28 #include <lowlevellock.h>
  29 #include <link.h>
  30 #include <bits/kernel-features.h>
  31
  32
   33 #ifndef NEED_SEPARATE_REGISTER_STACK
   34
   35 /* Most architectures have exactly one stack pointer.  Some have more.
         With a single stack only its address has to be tracked.  */
   36 # define STACK_VARIABLES void *stackaddr = NULL
   37
   38 /* How to pass the values to the 'create_thread' function.  */
   39 # define STACK_VARIABLES_ARGS stackaddr
   40
   41 /* How to declare a function which receives these parameters.  */
   42 # define STACK_VARIABLES_PARMS void *stackaddr
   43
   44 /* How to declare the trailing parameters of allocate_stack.  */
   45 # define ALLOCATE_STACK_PARMS void **stack
   46
   47 /* This is how the function is called.  We do it this way to allow
   48    other variants of the function to have more parameters.  Note that
         the expansion refers to a variable named 'stackaddr' in the
         caller's scope, i.e. one declared with STACK_VARIABLES.  */
   49 # define ALLOCATE_STACK(attr, pd) allocate_stack (attr, pd, &stackaddr)
   50
   51 #else
   52
   53 /* We need two stacks.  The kernel will place them but we have to tell
   54    the kernel about the size of the reserved address space.  */
   55 # define STACK_VARIABLES void *stackaddr = NULL; size_t stacksize = 0
   56
   57 /* How to pass the values to the 'create_thread' function.  */
   58 # define STACK_VARIABLES_ARGS stackaddr, stacksize
   59
   60 /* How to declare a function which receives these parameters.  */
   61 # define STACK_VARIABLES_PARMS void *stackaddr, size_t stacksize
   62
   63 /* How to declare the trailing parameters of allocate_stack.  */
   64 # define ALLOCATE_STACK_PARMS void **stack, size_t *stacksize
   65
   66 /* This is how the function is called.  We do it this way to allow
   67    other variants of the function to have more parameters.  The
         expansion refers to variables named 'stackaddr' and 'stacksize' in
         the caller's scope, i.e. ones declared with STACK_VARIABLES.  */
   68 # define ALLOCATE_STACK(attr, pd) \
   69   allocate_stack (attr, pd, &stackaddr, &stacksize)
   70
   71 #endif
  72
  73
   74 /* Default alignment of stack.  */
   75 #ifndef STACK_ALIGN
   76 # define STACK_ALIGN __alignof__ (long double)
   77 #endif
   78
   79 /* Default value for minimal stack size after allocating thread
   80    descriptor and guard.  */
   81 #ifndef MINIMAL_REST_STACK
   82 # define MINIMAL_REST_STACK     4096
   83 #endif
   84
   85
   86 /* Newer kernels have the MAP_STACK flag to indicate a mapping is used for
   87    a stack.  Use it when possible.  Where the headers do not provide it
         the fallback value 0 makes OR-ing it into the mmap flags a no-op.  */
   88 #ifndef MAP_STACK
   89 # define MAP_STACK 0
   90 #endif
   91
   92 /* This yields the pointer that TLS support code calls the thread pointer.
         With TLS_TCB_AT_TP that is the thread descriptor itself; with
         TLS_DTV_AT_TP it lies TLS_PRE_TCB_SIZE bytes past the descriptor.
         No definition is provided when neither macro is defined.  */
   93 #if defined(TLS_TCB_AT_TP)
   94 # define TLS_TPADJ(pd) (pd)
   95 #elif defined(TLS_DTV_AT_TP)
   96 # define TLS_TPADJ(pd) ((struct pthread *)((char *) (pd) + TLS_PRE_TCB_SIZE))
   97 #endif
  98
   99 /* Cache handling for not-yet free stacks.  */
  100
  101 /*
  102    Maximum size in bytes of the cache (it is compared against the sum
  103    of the cached blocks' stackblock_size).  The GNU libc default is
  104    40MiB; embedded systems don't have enough RAM for big dirty stack
  105    caches, so it is reduced to 16MiB.  A value of 4 (presumably MiB)
         does not work, e.g. tst-kill4 segfaults.
      */
  106 static size_t stack_cache_maxsize = 16 * 1024 * 1024;
      /* Sum of the stackblock_size of all blocks currently accounted to the
         cache.  */
  107 static size_t stack_cache_actsize;
  108
  109 /* Lock protecting the cache size accounting and the stack lists
         below.  */
  110 static int stack_cache_lock = LLL_LOCK_INITIALIZER;
  111
  112 /* List of queued stack frames.  */
  113 static LIST_HEAD (stack_cache);
  114
  115 /* List of the stacks in use.  */
  116 static LIST_HEAD (stack_used);
  117
  118 /* We need to record what list operations we are going to do so that,
  119    in case of an asynchronous interruption due to a fork() call, we
  120    can correct for the work.  Zero means no operation is pending;
         otherwise this holds the address of the element being moved, with
         bit 0 set for an insertion and left clear for a removal.  */
  121 static uintptr_t in_flight_stack;
  122
  123 /* List of the threads with user provided stacks in use.  No need to
  124    initialize this, since it's done in __pthread_initialize_minimal.  */
  125 list_t __stack_user __attribute__ ((nocommon));
  126 hidden_data_def (__stack_user)
  127
  128 #if defined COLORING_INCREMENT && COLORING_INCREMENT != 0
  129 /* Counter incremented each time allocate_stack maps a new stack; the
         coloring offset of the new stack is derived from it.  */
  130 static unsigned int nptl_ncreated;
  131 #endif
  132
  133
  134 /* Check whether the stack is still used or not.  A descriptor whose
         'tid' field is zero or negative counts as free.  */
  135 #define FREE_P(descr) ((descr)->tid <= 0)
 136
  137 /* Unlink ELEM from the list it is currently on.  While the list is
         being modified in_flight_stack holds ELEM's address (stored as is;
         __reclaim_stacks treats a clear bit 0 as a pending removal) so that
         the operation can be completed if fork() captured the list
         half-updated.  The write barriers keep the marker stores ordered
         around the list update, so the statement order here must not be
         changed.  */
  138 static void
  139 stack_list_del (list_t *elem)
  140 {
      /* Announce the pending removal before touching the list.  */
  141   in_flight_stack = (uintptr_t) elem;
  142
  143   atomic_write_barrier ();
  144
  145   list_del (elem);
  146
  147   atomic_write_barrier ();
  148
      /* The list is consistent again; nothing is in flight.  */
  149   in_flight_stack = 0;
  150 }
 151
  152 /* Insert ELEM into LIST.  While the list is being modified
         in_flight_stack holds ELEM's address with bit 0 set, which is how
         __reclaim_stacks recognises a pending insertion and completes it if
         fork() captured the list half-updated.  The write barriers keep the
         marker stores ordered around the list update, so the statement
         order here must not be changed.  */
  153 static void
  154 stack_list_add (list_t *elem, list_t *list)
  155 {
      /* Announce the pending insertion (tagged with bit 0) before touching
         the list.  */
  156   in_flight_stack = (uintptr_t) elem | 1;
  157
  158   atomic_write_barrier ();
  159
  160   list_add (elem, list);
  161
  162   atomic_write_barrier ();
  163
      /* The list is consistent again; nothing is in flight.  */
  164   in_flight_stack = 0;
  165 }
 166
 167
  168 /* We keep all cache entries on a doubly linked list.  Doubly linked
  169    because this allows removing entries from the end.  */
 170
 171
 172 /* Get a stack frame from the cache.  We have to match by size since
 173    some blocks might be too small or far too large.  */
 174 static struct pthread *
 175 get_cached_stack (size_t *sizep, void **memp)
 176 {
 177   size_t size = *sizep;
 178   struct pthread *result = NULL;
 179   list_t *entry;
 180
 181   lll_lock (stack_cache_lock, LLL_PRIVATE);
 182
 183   /* Search the cache for a matching entry.  We search for the
 184      smallest stack which has at least the required size.  Note that
 185      in normal situations the size of all allocated stacks is the
 186      same.  As the very least there are only a few different sizes.
 187      Therefore this loop will exit early most of the time with an
 188      exact match.  */
 189   list_for_each (entry, &stack_cache)
 190     {
 191       struct pthread *curr;
 192
 193       curr = list_entry (entry, struct pthread, list);
 194       if (FREE_P (curr) && curr->stackblock_size >= size)
 195         {
 196           if (curr->stackblock_size == size)
 197             {
 198               result = curr;
 199               break;
 200             }
 201
 202           if (result == NULL
 203               || result->stackblock_size > curr->stackblock_size)
 204             result = curr;
 205         }
 206     }
 207
 208   if (__builtin_expect (result == NULL, 0)
 209       /* Make sure the size difference is not too excessive.  In that
 210          case we do not use the block.  */
 211       || __builtin_expect (result->stackblock_size > 4 * size, 0))
 212     {
 213       /* Release the lock.  */
 214       lll_unlock (stack_cache_lock, LLL_PRIVATE);
 215
 216       return NULL;
 217     }
 218
 219   /* Dequeue the entry.  */
 220   stack_list_del (&result->list);
 221
 222   /* And add to the list of stacks in use.  */
 223   stack_list_add (&result->list, &stack_used);
 224
 225   /* And decrease the cache size.  */
 226   stack_cache_actsize -= result->stackblock_size;
 227
 228   /* Release the lock early.  */
 229   lll_unlock (stack_cache_lock, LLL_PRIVATE);
 230
 231   /* Report size and location of the stack to the caller.  */
 232   *sizep = result->stackblock_size;
 233   *memp = result->stackblock;
 234
 235   /* Cancellation handling is back to the default.  */
 236   result->cancelhandling = 0;
 237   result->cleanup = NULL;
 238
 239   /* No pending event.  */
 240   result->nextevent = NULL;
 241
 242   /* Clear the DTV.  */
 243   dtv_t *dtv = GET_DTV (TLS_TPADJ (result));
 244   memset (dtv, '\0', (dtv[-1].counter + 1) * sizeof (dtv_t));
 245
 246   /* Re-initialize the TLS.  */
 247   _dl_allocate_tls_init (TLS_TPADJ (result));
 248
 249   return result;
 250 }
 251
 252
 253 /* Free stacks until cache size is lower than LIMIT.  */
 254 void
 255 __free_stacks (size_t limit)
 256 {
 257   /* We reduce the size of the cache.  Remove the last entries until
 258      the size is below the limit.  */
 259   list_t *entry;
 260   list_t *prev;
 261
 262   /* Search from the end of the list.  */
 263   list_for_each_prev_safe (entry, prev, &stack_cache)
 264     {
 265       struct pthread *curr;
 266
 267       curr = list_entry (entry, struct pthread, list);
 268       if (FREE_P (curr))
 269         {
 270           /* Unlink the block.  */
 271           stack_list_del (entry);
 272
 273           /* Account for the freed memory.  */
 274           stack_cache_actsize -= curr->stackblock_size;
 275
 276           /* Free the memory associated with the ELF TLS.  */
 277           _dl_deallocate_tls (TLS_TPADJ (curr), false);
 278
 279           /* Remove this block.  This should never fail.  If it does
 280              something is really wrong.  */
 281           if (munmap (curr->stackblock, curr->stackblock_size) != 0)
 282             abort ();
 283
 284           /* Maybe we have freed enough.  */
 285           if (stack_cache_actsize <= limit)
 286             break;
 287         }
 288     }
 289 }
 290
 291
 292 /* Add a stack frame which is not used anymore to the stack.  Must be
 293    called with the cache lock held.  */
 294 static inline void
 295 __attribute ((always_inline))
 296 queue_stack (struct pthread *stack)
 297 {
 298   /* We unconditionally add the stack to the list.  The memory may
 299      still be in use but it will not be reused until the kernel marks
 300      the stack as not used anymore.  */
 301   stack_list_add (&stack->list, &stack_cache);
 302
 303   stack_cache_actsize += stack->stackblock_size;
 304   if (__builtin_expect (stack_cache_actsize > stack_cache_maxsize, 0))
 305     __free_stacks (stack_cache_maxsize);
 306 }
 307
 308
 309 static int
 310 internal_function
 311 change_stack_perm (struct pthread *pd
 312 #ifdef NEED_SEPARATE_REGISTER_STACK
 313                    , size_t pagemask
 314 #endif
 315                    )
 316 {
 317 #ifdef NEED_SEPARATE_REGISTER_STACK
 318   void *stack = (pd->stackblock
 319                  + (((((pd->stackblock_size - pd->guardsize) / 2)
 320                       & pagemask) + pd->guardsize) & pagemask));
 321   size_t len = pd->stackblock + pd->stackblock_size - stack;
 322 #elif defined _STACK_GROWS_DOWN
 323   void *stack = pd->stackblock + pd->guardsize;
 324   size_t len = pd->stackblock_size - pd->guardsize;
 325 #elif defined _STACK_GROWS_UP
 326   void *stack = pd->stackblock;
 327   size_t len = (uintptr_t) pd - pd->guardsize - (uintptr_t) pd->stackblock;
 328 #else
 329 # error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
 330 #endif
 331 #ifdef __ARCH_USE_MMU__
 332   if (mprotect (stack, len, PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
 333     return errno;
 334 #endif
 335
 336   return 0;
 337 }
 338
 339
 340 static int
 341 allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
 342                 ALLOCATE_STACK_PARMS)
 343 {
 344   struct pthread *pd;
 345   size_t size;
 346   size_t pagesize_m1 = __getpagesize () - 1;
 347   void *stacktop;
 348
 349   assert (attr != NULL);
 350   assert (powerof2 (pagesize_m1 + 1));
 351   assert (TCB_ALIGNMENT >= STACK_ALIGN);
 352
 353   /* Get the stack size from the attribute if it is set.  Otherwise we
 354      use the default we determined at start time.  */
 355   size = attr->stacksize ?: __default_stacksize;
 356
 357   /* Get memory for the stack.  */
 358   if (__builtin_expect (attr->flags & ATTR_FLAG_STACKADDR, 0))
 359     {
 360       uintptr_t adj;
 361
 362       /* If the user also specified the size of the stack make sure it
 363          is large enough.  */
 364       if (attr->stacksize != 0
 365           && attr->stacksize < (__static_tls_size + MINIMAL_REST_STACK))
 366         return EINVAL;
 367
 368       /* Adjust stack size for alignment of the TLS block.  */
 369 #if defined(TLS_TCB_AT_TP)
 370       adj = ((uintptr_t) attr->stackaddr - TLS_TCB_SIZE)
 371             & __static_tls_align_m1;
 372       assert (size > adj + TLS_TCB_SIZE);
 373 #elif defined(TLS_DTV_AT_TP)
 374       adj = ((uintptr_t) attr->stackaddr - __static_tls_size)
 375             & __static_tls_align_m1;
 376       assert (size > adj);
 377 #endif
 378
 379       /* The user provided some memory.  Let's hope it matches the
 380          size...  We do not allocate guard pages if the user provided
 381          the stack.  It is the user's responsibility to do this if it
 382          is wanted.  */
 383 #if defined(TLS_TCB_AT_TP)
 384       pd = (struct pthread *) ((uintptr_t) attr->stackaddr
 385                                - TLS_TCB_SIZE - adj);
 386 #elif defined(TLS_DTV_AT_TP)
 387       pd = (struct pthread *) (((uintptr_t) attr->stackaddr
 388                                 - __static_tls_size - adj)
 389                                - TLS_PRE_TCB_SIZE);
 390 #endif
 391
 392       /* The user provided stack memory needs to be cleared.  */
 393       memset (pd, '\0', sizeof (struct pthread));
 394
 395       /* The first TSD block is included in the TCB.  */
 396       pd->specific[0] = pd->specific_1stblock;
 397
 398       /* Remember the stack-related values.  */
 399       pd->stackblock = (char *) attr->stackaddr - size;
 400       pd->stackblock_size = size;
 401
 402       /* This is a user-provided stack.  It will not be queued in the
 403          stack cache nor will the memory (except the TLS memory) be freed.  */
 404       pd->user_stack = true;
 405
 406       /* This is at least the second thread.  */
 407       pd->header.multiple_threads = 1;
 408 #ifndef TLS_MULTIPLE_THREADS_IN_TCB
 409       __pthread_multiple_threads = *__libc_multiple_threads_ptr = 1;
 410 #endif
 411
 412 #ifndef __ASSUME_PRIVATE_FUTEX
 413       /* The thread must know when private futexes are supported.  */
 414       pd->header.private_futex = THREAD_GETMEM (THREAD_SELF,
 415                                                 header.private_futex);
 416 #endif
 417
 418 #ifdef NEED_DL_SYSINFO
 419       /* Copy the sysinfo value from the parent.  */
 420       THREAD_SYSINFO(pd) = THREAD_SELF_SYSINFO;
 421 #endif
 422
 423       /* Allocate the DTV for this thread.  */
 424       if (_dl_allocate_tls (TLS_TPADJ (pd)) == NULL)
 425         {
 426           /* Something went wrong.  */
 427           assert (errno == ENOMEM);
 428           return EAGAIN;
 429         }
 430
 431
 432       /* Prepare to modify global data.  */
 433       lll_lock (stack_cache_lock, LLL_PRIVATE);
 434
 435       /* And add to the list of stacks in use.  */
 436       list_add (&pd->list, &__stack_user);
 437
 438       lll_unlock (stack_cache_lock, LLL_PRIVATE);
 439     }
 440   else
 441     {
 442       /* Allocate some anonymous memory.  If possible use the cache.  */
 443       size_t guardsize;
 444       size_t reqsize;
 445       void *mem = 0;
 446       const int prot = (PROT_READ | PROT_WRITE);
 447
 448 #if defined COLORING_INCREMENT && COLORING_INCREMENT != 0
 449       /* Add one more page for stack coloring.  Don't do it for stacks
 450          with 16 times pagesize or larger.  This might just cause
 451          unnecessary misalignment.  */
 452       if (size <= 16 * pagesize_m1)
 453         size += pagesize_m1 + 1;
 454 #endif
 455
 456       /* Adjust the stack size for alignment.  */
 457       size &= ~__static_tls_align_m1;
 458       assert (size != 0);
 459
 460       /* Make sure the size of the stack is enough for the guard and
 461          eventually the thread descriptor.  */
 462       guardsize = (attr->guardsize + pagesize_m1) & ~pagesize_m1;
 463       if (__builtin_expect (size < ((guardsize + __static_tls_size
 464                                      + MINIMAL_REST_STACK + pagesize_m1)
 465                                     & ~pagesize_m1),
 466                             0))
 467         /* The stack is too small (or the guard too large).  */
 468         return EINVAL;
 469
 470       /* Try to get a stack from the cache.  */
 471       reqsize = size;
 472       pd = get_cached_stack (&size, &mem);
 473       if (pd == NULL)
 474         {
 475           /* To avoid aliasing effects on a larger scale than pages we
 476              adjust the allocated stack size if necessary.  This way
 477              allocations directly following each other will not have
 478              aliasing problems.  */
 479 #if defined MULTI_PAGE_ALIASING && MULTI_PAGE_ALIASING != 0
 480           if ((size % MULTI_PAGE_ALIASING) == 0)
 481             size += pagesize_m1 + 1;
 482 #endif
 483
 484           mem = mmap (NULL, size, prot,
 485                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
 486
 487           if (__builtin_expect (mem == MAP_FAILED, 0))
 488             {
 489               if (errno == ENOMEM)
 490                 __set_errno (EAGAIN);
 491
 492                return errno;
 493             }
 494
 495           /* SIZE is guaranteed to be greater than zero.
 496              So we can never get a null pointer back from mmap.  */
 497           assert (mem != NULL);
 498
 499 #if defined COLORING_INCREMENT && COLORING_INCREMENT != 0
 500           /* Atomically increment NCREATED.  */
 501           unsigned int ncreated = atomic_increment_val (&nptl_ncreated);
 502
 503           /* We chose the offset for coloring by incrementing it for
 504              every new thread by a fixed amount.  The offset used
 505              module the page size.  Even if coloring would be better
 506              relative to higher alignment values it makes no sense to
 507              do it since the mmap() interface does not allow us to
 508              specify any alignment for the returned memory block.  */
 509           size_t coloring = (ncreated * COLORING_INCREMENT) & pagesize_m1;
 510
 511           /* Make sure the coloring offsets does not disturb the alignment
 512              of the TCB and static TLS block.  */
 513           if (__builtin_expect ((coloring & __static_tls_align_m1) != 0, 0))
 514             coloring = (((coloring + __static_tls_align_m1)
 515                          & ~(__static_tls_align_m1))
 516                         & ~pagesize_m1);
 517 #else
 518           /* Unless specified we do not make any adjustments.  */
 519 # define coloring 0
 520 #endif
 521
 522           /* Place the thread descriptor at the end of the stack.  */
 523 #if defined(TLS_TCB_AT_TP)
 524           pd = (struct pthread *) ((char *) mem + size - coloring) - 1;
 525 #elif defined(TLS_DTV_AT_TP)
 526           pd = (struct pthread *) ((((uintptr_t) mem + size - coloring
 527                                     - __static_tls_size)
 528                                     & ~__static_tls_align_m1)
 529                                    - TLS_PRE_TCB_SIZE);
 530 #endif
 531
 532           /* Remember the stack-related values.  */
 533           pd->stackblock = mem;
 534           pd->stackblock_size = size;
 535
 536           /* We allocated the first block thread-specific data array.
 537              This address will not change for the lifetime of this
 538              descriptor.  */
 539           pd->specific[0] = pd->specific_1stblock;
 540
 541           /* This is at least the second thread.  */
 542           pd->header.multiple_threads = 1;
 543 #ifndef TLS_MULTIPLE_THREADS_IN_TCB
 544           __pthread_multiple_threads = *__libc_multiple_threads_ptr = 1;
 545 #endif
 546
 547 #ifndef __ASSUME_PRIVATE_FUTEX
 548           /* The thread must know when private futexes are supported.  */
 549           pd->header.private_futex = THREAD_GETMEM (THREAD_SELF,
 550                                                     header.private_futex);
 551 #endif
 552
 553 #ifdef NEED_DL_SYSINFO
 554           /* Copy the sysinfo value from the parent.  */
 555           THREAD_SYSINFO(pd) = THREAD_SELF_SYSINFO;
 556 #endif
 557
 558           /* Allocate the DTV for this thread.  */
 559           if (_dl_allocate_tls (TLS_TPADJ (pd)) == NULL)
 560             {
 561               /* Something went wrong.  */
 562               assert (errno == ENOMEM);
 563
 564               /* Free the stack memory we just allocated.  */
 565               (void) munmap (mem, size);
 566
 567               return EAGAIN;
 568             }
 569
 570
 571           /* Prepare to modify global data.  */
 572           lll_lock (stack_cache_lock, LLL_PRIVATE);
 573
 574           /* And add to the list of stacks in use.  */
 575           stack_list_add (&pd->list, &stack_used);
 576
 577           lll_unlock (stack_cache_lock, LLL_PRIVATE);
 578
 579
 580           /* Note that all of the stack and the thread descriptor is
 581              zeroed.  This means we do not have to initialize fields
 582              with initial value zero.  This is specifically true for
 583              the 'tid' field which is always set back to zero once the
 584              stack is not used anymore and for the 'guardsize' field
 585              which will be read next.  */
 586         }
 587
 588       /* Create or resize the guard area if necessary.  */
 589       if (__builtin_expect (guardsize > pd->guardsize, 0))
 590         {
 591 #ifdef NEED_SEPARATE_REGISTER_STACK
 592           char *guard = mem + (((size - guardsize) / 2) & ~pagesize_m1);
 593 #elif defined _STACK_GROWS_DOWN
 594           char *guard = mem;
 595 #elif defined _STACK_GROWS_UP
 596           char *guard = (char *) (((uintptr_t) pd - guardsize) & ~pagesize_m1);
 597 #endif
 598 #ifdef __ARCH_USE_MMU__
 599           if (mprotect (guard, guardsize, PROT_NONE) != 0)
 600             {
 601               int err;
 602 #ifdef NEED_SEPARATE_REGISTER_STACK
 603             mprot_error:
 604 #endif
 605               err = errno;
 606
 607               lll_lock (stack_cache_lock, LLL_PRIVATE);
 608
 609               /* Remove the thread from the list.  */
 610               stack_list_del (&pd->list);
 611
 612               lll_unlock (stack_cache_lock, LLL_PRIVATE);
 613
 614               /* Get rid of the TLS block we allocated.  */
 615               _dl_deallocate_tls (TLS_TPADJ (pd), false);
 616
 617               /* Free the stack memory regardless of whether the size
 618                  of the cache is over the limit or not.  If this piece
 619                  of memory caused problems we better do not use it
 620                  anymore.  Uh, and we ignore possible errors.  There
 621                  is nothing we could do.  */
 622               (void) munmap (mem, size);
 623
 624               return err;
 625             }
 626 #endif
 627
 628           pd->guardsize = guardsize;
 629         }
 630       else if (__builtin_expect (pd->guardsize - guardsize > size - reqsize,
 631                                  0))
 632         {
 633           /* The old guard area is too large.  */
 634
 635 #ifdef NEED_SEPARATE_REGISTER_STACK
 636           char *guard = mem + (((size - guardsize) / 2) & ~pagesize_m1);
 637           char *oldguard = mem + (((size - pd->guardsize) / 2) & ~pagesize_m1);
 638
 639 #ifdef __ARCH_USE_MMU__
 640           if (oldguard < guard
 641               && mprotect (oldguard, guard - oldguard, prot) != 0)
 642             goto mprot_error;
 643
 644           if (mprotect (guard + guardsize,
 645                         oldguard + pd->guardsize - guard - guardsize,
 646                         prot) != 0)
 647             goto mprot_error;
 648 #elif defined _STACK_GROWS_DOWN
 649           if (mprotect ((char *) mem + guardsize, pd->guardsize - guardsize,
 650                         prot) != 0)
 651             goto mprot_error;
 652 #elif defined _STACK_GROWS_UP
 653           if (mprotect ((char *) (((uintptr_t) pd - pd->guardsize) & ~pagesize_m1),
 654                         pd->guardsize - guardsize, prot) != 0)
 655             goto mprot_error;
 656 #endif
 657 #endif
 658
 659           pd->guardsize = guardsize;
 660         }
 661       /* The pthread_getattr_np() calls need to get passed the size
 662          requested in the attribute, regardless of how large the
 663          actually used guardsize is.  */
 664       pd->reported_guardsize = guardsize;
 665     }
 666
 667   /* Initialize the lock.  We have to do this unconditionally since the
 668      stillborn thread could be canceled while the lock is taken.  */
 669   pd->lock = LLL_LOCK_INITIALIZER;
 670
 671   /* The robust mutex lists also need to be initialized
 672      unconditionally because the cleanup for the previous stack owner
 673      might have happened in the kernel.  */
 674   pd->robust_head.futex_offset = (offsetof (pthread_mutex_t, __data.__lock)
 675                                   - offsetof (pthread_mutex_t,
 676                                               __data.__list.__next));
 677   pd->robust_head.list_op_pending = NULL;
 678 #ifdef __PTHREAD_MUTEX_HAVE_PREV
 679   pd->robust_prev = &pd->robust_head;
 680 #endif
 681   pd->robust_head.list = &pd->robust_head;
 682
 683   /* We place the thread descriptor at the end of the stack.  */
 684   *pdp = pd;
 685
 686 #if defined(TLS_TCB_AT_TP)
 687   /* The stack begins before the TCB and the static TLS block.  */
 688   stacktop = ((char *) (pd + 1) - __static_tls_size);
 689 #elif defined(TLS_DTV_AT_TP)
 690   stacktop = (char *) (pd - 1);
 691 #endif
 692
 693 #ifdef NEED_SEPARATE_REGISTER_STACK
 694   *stack = pd->stackblock;
 695   *stacksize = stacktop - *stack;
 696 #elif defined _STACK_GROWS_DOWN
 697   *stack = stacktop;
 698 #elif defined _STACK_GROWS_UP
 699   *stack = pd->stackblock;
 700   assert (*stack > 0);
 701 #endif
 702
 703   return 0;
 704 }
 705
 706
 707 void
 708 internal_function
 709 __deallocate_stack (struct pthread *pd)
 710 {
 711   lll_lock (stack_cache_lock, LLL_PRIVATE);
 712
 713   /* Remove the thread from the list of threads with user defined
 714      stacks.  */
 715   stack_list_del (&pd->list);
 716
 717   /* Not much to do.  Just free the mmap()ed memory.  Note that we do
 718      not reset the 'used' flag in the 'tid' field.  This is done by
 719      the kernel.  If no thread has been created yet this field is
 720      still zero.  */
 721   if (__builtin_expect (! pd->user_stack, 1))
 722     (void) queue_stack (pd);
 723   else
 724     /* Free the memory associated with the ELF TLS.  */
 725     _dl_deallocate_tls (TLS_TPADJ (pd), false);
 726
 727   lll_unlock (stack_cache_lock, LLL_PRIVATE);
 728 }
 729
 730
 731 int
 732 internal_function
 733 __make_stacks_executable (void **stack_endp)
 734 {
 735   /* First the main thread's stack.  */
 736   int err = EPERM;
 737   if (err != 0)
 738     return err;
 739
 740 #ifdef NEED_SEPARATE_REGISTER_STACK
 741   const size_t pagemask = ~(__getpagesize () - 1);
 742 #endif
 743
 744   lll_lock (stack_cache_lock, LLL_PRIVATE);
 745
 746   list_t *runp;
 747   list_for_each (runp, &stack_used)
 748     {
 749       err = change_stack_perm (list_entry (runp, struct pthread, list)
 750 #ifdef NEED_SEPARATE_REGISTER_STACK
 751                                , pagemask
 752 #endif
 753                                );
 754       if (err != 0)
 755         break;
 756     }
 757
 758   /* Also change the permission for the currently unused stacks.  This
 759      might be wasted time but better spend it here than adding a check
 760      in the fast path.  */
 761   if (err == 0)
 762     list_for_each (runp, &stack_cache)
 763       {
 764         err = change_stack_perm (list_entry (runp, struct pthread, list)
 765 #ifdef NEED_SEPARATE_REGISTER_STACK
 766                                  , pagemask
 767 #endif
 768                                  );
 769         if (err != 0)
 770           break;
 771       }
 772
 773   lll_unlock (stack_cache_lock, LLL_PRIVATE);
 774
 775   return err;
 776 }
 777
 778
 779 /* In case of a fork() call the memory allocation in the child will be
 780    the same but only one thread is running.  All stacks except that of
 781    the one running thread are not used anymore.  We have to recycle
 782    them.  */
 783 void
 784 __reclaim_stacks (void)
 785 {
 786   struct pthread *self = (struct pthread *) THREAD_SELF;
 787
 788   /* No locking necessary.  The caller is the only stack in use.  But
 789      we have to be aware that we might have interrupted a list
 790      operation.  */
 791
 792   if (in_flight_stack != 0)
 793     {
 794       bool add_p = in_flight_stack & 1;
 795       list_t *elem = (list_t *)(uintptr_t)(in_flight_stack & ~UINTMAX_C (1));
 796
 797       if (add_p)
 798         {
 799           /* We always add at the beginning of the list.  So in this
 800              case we only need to check the beginning of these lists.  */
 801           int check_list (list_t *l)
 802           {
 803             if (l->next->prev != l)
 804               {
 805                 assert (l->next->prev == elem);
 806
 807                 elem->next = l->next;
 808                 elem->prev = l;
 809                 l->next = elem;
 810
 811                 return 1;
 812               }
 813
 814             return 0;
 815           }
 816
 817           if (check_list (&stack_used) == 0)
 818             (void) check_list (&stack_cache);
 819         }
 820       else
 821         {
 822           /* We can simply always replay the delete operation.  */
 823           elem->next->prev = elem->prev;
 824           elem->prev->next = elem->next;
 825         }
 826     }
 827
 828   /* Mark all stacks except the still running one as free.  */
 829   list_t *runp;
 830   list_for_each (runp, &stack_used)
 831     {
 832       struct pthread *curp = list_entry (runp, struct pthread, list);
 833       if (curp != self)
 834         {
 835           /* This marks the stack as free.  */
 836           curp->tid = 0;
 837
 838           /* Account for the size of the stack.  */
 839           stack_cache_actsize += curp->stackblock_size;
 840
 841           if (curp->specific_used)
 842             {
 843               /* Clear the thread-specific data.  */
 844               memset (curp->specific_1stblock, '\0',
 845                       sizeof (curp->specific_1stblock));
 846
 847               curp->specific_used = false;
 848
 849               size_t cnt;
 850               for (cnt = 1; cnt < PTHREAD_KEY_1STLEVEL_SIZE; ++cnt)
 851                 if (curp->specific[cnt] != NULL)
 852                   {
 853                     memset (curp->specific[cnt], '\0',
 854                             sizeof (curp->specific_1stblock));
 855
 856                     /* We have allocated the block which we do not
 857                        free here so re-set the bit.  */
 858                     curp->specific_used = true;
 859                   }
 860             }
 861         }
 862     }
 863
 864   /* Add the stack of all running threads to the cache.  */
 865   list_splice (&stack_used, &stack_cache);
 866
 867   /* Remove the entry for the current thread to from the cache list
 868      and add it to the list of running threads.  Which of the two
 869      lists is decided by the user_stack flag.  */
 870   stack_list_del (&self->list);
 871
 872   /* Re-initialize the lists for all the threads.  */
 873   INIT_LIST_HEAD (&stack_used);
 874   INIT_LIST_HEAD (&__stack_user);
 875
 876   if (__builtin_expect (THREAD_GETMEM (self, user_stack), 0))
 877     list_add (&self->list, &__stack_user);
 878   else
 879     list_add (&self->list, &stack_used);
 880
 881   /* There is one thread running.  */
 882   __nptl_nthreads = 1;
 883
 884   in_flight_stack = 0;
 885
 886   /* Initialize the lock.  */
 887   stack_cache_lock = LLL_LOCK_INITIALIZER;
 888 }
 889
 890
 891 static void
 892 internal_function
 893 setxid_mark_thread (struct xid_command *cmdp, struct pthread *t)
 894 {
 895   int ch;
 896
 897   /* Don't let the thread exit before the setxid handler runs.  */
 898   t->setxid_futex = 0;
 899
 900   do
 901     {
 902       ch = t->cancelhandling;
 903
 904       /* If the thread is exiting right now, ignore it.  */
 905       if ((ch & EXITING_BITMASK) != 0)
 906         return;
 907     }
 908   while (atomic_compare_and_exchange_bool_acq (&t->cancelhandling,
 909                                                ch | SETXID_BITMASK, ch));
 910 }
 911
 912
 913 static void
 914 internal_function
 915 setxid_unmark_thread (struct xid_command *cmdp, struct pthread *t)
 916 {
 917   int ch;
 918
 919   do
 920     {
 921       ch = t->cancelhandling;
 922       if ((ch & SETXID_BITMASK) == 0)
 923         return;
 924     }
 925   while (atomic_compare_and_exchange_bool_acq (&t->cancelhandling,
 926                                                ch & ~SETXID_BITMASK, ch));
 927
 928   /* Release the futex just in case.  */
 929   t->setxid_futex = 1;
 930   lll_futex_wake (&t->setxid_futex, 1, LLL_PRIVATE);
 931 }
 932
 933
 934 static int
 935 internal_function
 936 setxid_signal_thread (struct xid_command *cmdp, struct pthread *t)
 937 {
 938   if ((t->cancelhandling & SETXID_BITMASK) == 0)
 939     return 0;
 940
 941   int val;
 942   pid_t pid = getpid ();
 943   INTERNAL_SYSCALL_DECL (err);
 944   val = INTERNAL_SYSCALL (tgkill, err, 3, pid, t->tid, SIGSETXID);
 945
 946   /* If this failed, it must have had not started yet or else exited.  */
 947   if (!INTERNAL_SYSCALL_ERROR_P (val, err))
 948     {
 949       atomic_increment (&cmdp->cntr);
 950       return 1;
 951     }
 952   else
 953     return 0;
 954 }
 955
 956
 957 int
 958 attribute_hidden
 959 __nptl_setxid (struct xid_command *cmdp)
 960 {
 961   int signalled;
 962   int result;
 963   lll_lock (stack_cache_lock, LLL_PRIVATE);
 964
 965   __xidcmd = cmdp;
 966   cmdp->cntr = 0;
 967
 968   struct pthread *self = THREAD_SELF;
 969
 970   /* Iterate over the list with system-allocated threads first.  */
 971   list_t *runp;
 972   list_for_each (runp, &stack_used)
 973     {
 974       struct pthread *t = list_entry (runp, struct pthread, list);
 975       if (t == self)
 976         continue;
 977
 978       setxid_mark_thread (cmdp, t);
 979     }
 980
 981   /* Now the list with threads using user-allocated stacks.  */
 982   list_for_each (runp, &__stack_user)
 983     {
 984       struct pthread *t = list_entry (runp, struct pthread, list);
 985       if (t == self)
 986         continue;
 987
 988       setxid_mark_thread (cmdp, t);
 989     }
 990
 991   /* Iterate until we don't succeed in signalling anyone.  That means
 992      we have gotten all running threads, and their children will be
 993      automatically correct once started.  */
 994   do
 995     {
 996       signalled = 0;
 997
 998       list_for_each (runp, &stack_used)
 999         {
1000           struct pthread *t = list_entry (runp, struct pthread, list);
1001           if (t == self)
1002             continue;
1003
1004           signalled += setxid_signal_thread (cmdp, t);
1005         }
1006
1007       list_for_each (runp, &__stack_user)
1008         {
1009           struct pthread *t = list_entry (runp, struct pthread, list);
1010           if (t == self)
1011             continue;
1012
1013           signalled += setxid_signal_thread (cmdp, t);
1014         }
1015
1016       int cur = cmdp->cntr;
1017       while (cur != 0)
1018         {
1019           lll_futex_wait (&cmdp->cntr, cur, LLL_PRIVATE);
1020           cur = cmdp->cntr;
1021         }
1022     }
1023   while (signalled != 0);
1024
1025   /* Clean up flags, so that no thread blocks during exit waiting
1026      for a signal which will never come.  */
1027   list_for_each (runp, &stack_used)
1028     {
1029       struct pthread *t = list_entry (runp, struct pthread, list);
1030       if (t == self)
1031         continue;
1032
1033       setxid_unmark_thread (cmdp, t);
1034     }
1035
1036   list_for_each (runp, &__stack_user)
1037     {
1038       struct pthread *t = list_entry (runp, struct pthread, list);
1039       if (t == self)
1040         continue;
1041
1042       setxid_unmark_thread (cmdp, t);
1043     }
1044
1045   /* This must be last, otherwise the current thread might not have
1046      permissions to send SIGSETXID syscall to the other threads.  */
1047   INTERNAL_SYSCALL_DECL (err);
1048   result = INTERNAL_SYSCALL_NCS (cmdp->syscall_no, err, 3,
1049                                  cmdp->id[0], cmdp->id[1], cmdp->id[2]);
1050   if (INTERNAL_SYSCALL_ERROR_P (result, err))
1051     {
1052       __set_errno (INTERNAL_SYSCALL_ERRNO (result, err));
1053       result = -1;
1054     }
1055
1056   lll_unlock (stack_cache_lock, LLL_PRIVATE);
1057   return result;
1058 }
1059
1060 static inline void __attribute__((always_inline))
1061 init_one_static_tls (struct pthread *curp, struct link_map *map)
1062 {
1063   dtv_t *dtv = GET_DTV (TLS_TPADJ (curp));
1064 # if defined(TLS_TCB_AT_TP)
1065   void *dest = (char *) curp - map->l_tls_offset;
1066 # elif defined(TLS_DTV_AT_TP)
1067   void *dest = (char *) curp + map->l_tls_offset + TLS_PRE_TCB_SIZE;
1068 # else
1069 #  error "Either TLS_TCB_AT_TP or TLS_DTV_AT_TP must be defined"
1070 # endif
1071
1072   /* Fill in the DTV slot so that a later LD/GD access will find it.  */
1073   dtv[map->l_tls_modid].pointer.val = dest;
1074   dtv[map->l_tls_modid].pointer.is_static = true;
1075
1076   /* Initialize the memory.  */
1077   memset (mempcpy (dest, map->l_tls_initimage, map->l_tls_initimage_size),
1078           '\0', map->l_tls_blocksize - map->l_tls_initimage_size);
1079 }
1080
1081 void
1082 attribute_hidden
1083 __pthread_init_static_tls (struct link_map *map)
1084 {
1085   lll_lock (stack_cache_lock, LLL_PRIVATE);
1086
1087   /* Iterate over the list with system-allocated threads first.  */
1088   list_t *runp;
1089   list_for_each (runp, &stack_used)
1090     init_one_static_tls (list_entry (runp, struct pthread, list), map);
1091
1092   /* Now the list with threads using user-allocated stacks.  */
1093   list_for_each (runp, &__stack_user)
1094     init_one_static_tls (list_entry (runp, struct pthread, list), map);
1095
1096   lll_unlock (stack_cache_lock, LLL_PRIVATE);
1097 }
1098
1099
1100 void
1101 attribute_hidden
1102 __wait_lookup_done (void)
1103 {
1104   lll_lock (stack_cache_lock, LLL_PRIVATE);
1105
1106   struct pthread *self = THREAD_SELF;
1107
1108   /* Iterate over the list with system-allocated threads first.  */
1109   list_t *runp;
1110   list_for_each (runp, &stack_used)
1111     {
1112       struct pthread *t = list_entry (runp, struct pthread, list);
1113       if (t == self || t->header.gscope_flag == THREAD_GSCOPE_FLAG_UNUSED)
1114         continue;
1115
1116       int *const gscope_flagp = &t->header.gscope_flag;
1117
1118       /* We have to wait until this thread is done with the global
1119          scope.  First tell the thread that we are waiting and
1120          possibly have to be woken.  */
1121       if (atomic_compare_and_exchange_bool_acq (gscope_flagp,
1122                                                 THREAD_GSCOPE_FLAG_WAIT,
1123                                                 THREAD_GSCOPE_FLAG_USED))
1124         continue;
1125
1126       do
1127         lll_futex_wait (gscope_flagp, THREAD_GSCOPE_FLAG_WAIT, LLL_PRIVATE);
1128       while (*gscope_flagp == THREAD_GSCOPE_FLAG_WAIT);
1129     }
1130
1131   /* Now the list with threads using user-allocated stacks.  */
1132   list_for_each (runp, &__stack_user)
1133     {
1134       struct pthread *t = list_entry (runp, struct pthread, list);
1135       if (t == self || t->header.gscope_flag == THREAD_GSCOPE_FLAG_UNUSED)
1136         continue;
1137
1138       int *const gscope_flagp = &t->header.gscope_flag;
1139
1140       /* We have to wait until this thread is done with the global
1141          scope.  First tell the thread that we are waiting and
1142          possibly have to be woken.  */
1143       if (atomic_compare_and_exchange_bool_acq (gscope_flagp,
1144                                                 THREAD_GSCOPE_FLAG_WAIT,
1145                                                 THREAD_GSCOPE_FLAG_USED))
1146         continue;
1147
1148       do
1149         lll_futex_wait (gscope_flagp, THREAD_GSCOPE_FLAG_WAIT, LLL_PRIVATE);
1150       while (*gscope_flagp == THREAD_GSCOPE_FLAG_WAIT);
1151     }
1152
1153   lll_unlock (stack_cache_lock, LLL_PRIVATE);
1154 }