libpthread/nptl/allocatestack.c

   1 /* Copyright (C) 2002-2007, 2009 Free Software Foundation, Inc.
   2    This file is part of the GNU C Library.
   3    Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <http://www.gnu.org/licenses/>.  */
  18
  19 #include <assert.h>
  20 #include <errno.h>
  21 #include <signal.h>
  22 #include <stdint.h>
  23 #include <string.h>
  24 #include <unistd.h>
  25 #include <sys/mman.h>
  26 #include <sys/param.h>
  27 #include <tls.h>
  28 #include <lowlevellock.h>
  29 #include <link.h>
  30 #include <bits/kernel-features.h>
  31
  32
     /* Helper macros that hide whether a new thread needs one stack value
        (the stack address) or two (address and size).  The variant is
        selected by NEED_SEPARATE_REGISTER_STACK.  */
  33 #ifndef NEED_SEPARATE_REGISTER_STACK
  34
  35 /* Most architectures have exactly one stack pointer.  Some have more.
        STACK_VARIABLES declares the local 'stackaddr' whose address
        ALLOCATE_STACK passes on to allocate_stack.  */
  36 # define STACK_VARIABLES void *stackaddr = NULL
  37
  38 /* How to pass the values to the 'create_thread' function.  */
  39 # define STACK_VARIABLES_ARGS stackaddr
  40
  41 /* How to declare a function which gets these parameters.  */
  42 # define STACK_VARIABLES_PARMS void *stackaddr
  43
  44 /* How to declare the trailing parameters of allocate_stack.  */
  45 # define ALLOCATE_STACK_PARMS void **stack
  46
  47 /* This is how the function is called.  We do it this way to allow
  48    other variants of the function to have more parameters.  */
  49 # define ALLOCATE_STACK(attr, pd) allocate_stack (attr, pd, &stackaddr)
  50
  51 #else
  52
  53 /* We need two stacks.  The kernel will place them but we have to tell
  54    the kernel about the size of the reserved address space.
        Note that this expansion contains two declarations ('stackaddr'
        and 'stacksize') separated by a semicolon.  */
  55 # define STACK_VARIABLES void *stackaddr = NULL; size_t stacksize = 0
  56
  57 /* How to pass the values to the 'create_thread' function.  */
  58 # define STACK_VARIABLES_ARGS stackaddr, stacksize
  59
  60 /* How to declare a function which gets these parameters.  */
  61 # define STACK_VARIABLES_PARMS void *stackaddr, size_t stacksize
  62
  63 /* How to declare the trailing parameters of allocate_stack.  */
  64 # define ALLOCATE_STACK_PARMS void **stack, size_t *stacksize
  65
  66 /* This is how the function is called.  We do it this way to allow
  67    other variants of the function to have more parameters.  */
  68 # define ALLOCATE_STACK(attr, pd) \
  69   allocate_stack (attr, pd, &stackaddr, &stacksize)
  70
  71 #endif
  72
  73
  74 /* Default alignment of stack.  May be overridden by defining
        STACK_ALIGN before this point.  */
  75 #ifndef STACK_ALIGN
  76 # define STACK_ALIGN __alignof__ (long double)
  77 #endif
  78
  79 /* Default value for minimal stack size after allocating thread
  80    descriptor and guard.  allocate_stack rejects requests that
        cannot leave at least this many bytes.  */
  81 #ifndef MINIMAL_REST_STACK
  82 # define MINIMAL_REST_STACK     4096
  83 #endif
  84
  85
  86 /* Newer kernels have the MAP_STACK flag to indicate a mapping is used for
  87    a stack.  Use it when possible.  When the headers do not provide
        it, it is defined as 0 so that OR-ing it into the mmap flags has
        no effect.  */
  88 #ifndef MAP_STACK
  89 # define MAP_STACK 0
  90 #endif
  91
  92 /* This yields the pointer that TLS support code calls the thread pointer.
        With TLS_TCB_AT_TP it is the descriptor address itself; with
        TLS_DTV_AT_TP it lies TLS_PRE_TCB_SIZE bytes past the descriptor.
        If neither macro is defined TLS_TPADJ is left undefined.  */
  93 #if defined(TLS_TCB_AT_TP)
  94 # define TLS_TPADJ(pd) (pd)
  95 #elif defined(TLS_DTV_AT_TP)
  96 # define TLS_TPADJ(pd) ((struct pthread *)((char *) (pd) + TLS_PRE_TCB_SIZE))
  97 #endif
  98
  99 /* Cache handling for not-yet free stacks.  */
 100
 101 /*
 102    Maximum size of the cache in bytes.  GNU libc's default is 40MiB;
 103    embedded systems don't have enough RAM for big dirty stack caches,
 104    so reduce it to 16MiB.  4 does not work, e.g. tst-kill4 segfaults.
 105 */
 106 static size_t stack_cache_maxsize = 16 * 1024 * 1024;
     /* Sum of the stackblock_size values of the stacks currently accounted
        to the cache.  */
 107 static size_t stack_cache_actsize;
 108
 109 /* Lock taken around updates of the stack lists and of
        stack_cache_actsize.  */
 110 static int stack_cache_lock = LLL_LOCK_INITIALIZER;
 111
 112 /* List of queued stack frames.  */
 113 static LIST_HEAD (stack_cache);
 114
 115 /* List of the stacks in use.  */
 116 static LIST_HEAD (stack_used);
 117
 118 /* We need to record what list operations we are going to do so that,
 119    in case of an asynchronous interruption due to a fork() call, we
 120    can correct for the work.  The value is the address of the list
        element being worked on; the lowest bit is set for an insertion and
        clear for a removal.  Zero means no operation is in progress.  */
 121 static uintptr_t in_flight_stack;
 122
 123 /* List of the threads with user provided stacks in use.  No need to
 124    initialize this, since it's done in __pthread_initialize_minimal.  */
 125 list_t __stack_user __attribute__ ((nocommon));
 126 hidden_data_def (__stack_user)
 127
 128 #if defined COLORING_INCREMENT && COLORING_INCREMENT != 0
 129 /* Number of threads created.  Used to derive the coloring offset of
        freshly mapped stacks.  */
 130 static unsigned int nptl_ncreated;
 131 #endif
 132
 133
 134 /* Check whether the stack is still used or not.  A descriptor whose
        'tid' field is zero or negative is considered free.  */
 135 #define FREE_P(descr) ((descr)->tid <= 0)
 136
 137
     /* Unlink ELEM from the list it is on.  Before the list is touched the
        address of ELEM is recorded in in_flight_stack (lowest bit clear,
        which __reclaim_stacks reads as "removal"), and the record is
        cleared again afterwards.  This lets __reclaim_stacks replay an
        unlink that was interrupted by fork().  */
 138 static void
 139 stack_list_del (list_t *elem)
 140 {
 141   in_flight_stack = (uintptr_t) elem;
 142
       /* The record must be written before the list is modified.  */
 143   atomic_write_barrier ();
 144
 145   list_del (elem);
 146
       /* The list update must be written before the record is cleared.  */
 147   atomic_write_barrier ();
 148
 149   in_flight_stack = 0;
 150 }
 151
 152
     /* Insert ELEM into LIST via list_add.  Before the list is touched the
        address of ELEM, with the lowest bit set to mark an insertion, is
        recorded in in_flight_stack; the record is cleared again
        afterwards.  This lets __reclaim_stacks complete an insertion that
        was interrupted by fork().  */
 153 static void
 154 stack_list_add (list_t *elem, list_t *list)
 155 {
 156   in_flight_stack = (uintptr_t) elem | 1;
 157
       /* The record must be written before the list is modified.  */
 158   atomic_write_barrier ();
 159
 160   list_add (elem, list);
 161
       /* The list update must be written before the record is cleared.  */
 162   atomic_write_barrier ();
 163
 164   in_flight_stack = 0;
 165 }
 166
 167
 168 /* We create a doubly linked list of all cache entries.  Doubly linked
 169    because this allows removing entries from the end.  */
 170
 171
 172 /* Get a stack frame from the cache.  We have to match by size since
 173    some blocks might be too small or far too large.  */
 174 static struct pthread *
 175 get_cached_stack (size_t *sizep, void **memp)
 176 {
 177   size_t size = *sizep;
 178   struct pthread *result = NULL;
 179   list_t *entry;
 180
 181   lll_lock (stack_cache_lock, LLL_PRIVATE);
 182
 183   /* Search the cache for a matching entry.  We search for the
 184      smallest stack which has at least the required size.  Note that
 185      in normal situations the size of all allocated stacks is the
 186      same.  As the very least there are only a few different sizes.
 187      Therefore this loop will exit early most of the time with an
 188      exact match.  */
 189   list_for_each (entry, &stack_cache)
 190     {
 191       struct pthread *curr;
 192
 193       curr = list_entry (entry, struct pthread, list);
 194       if (FREE_P (curr) && curr->stackblock_size >= size)
 195         {
 196           if (curr->stackblock_size == size)
 197             {
 198               result = curr;
 199               break;
 200             }
 201
 202           if (result == NULL
 203               || result->stackblock_size > curr->stackblock_size)
 204             result = curr;
 205         }
 206     }
 207
 208   if (__builtin_expect (result == NULL, 0)
 209       /* Make sure the size difference is not too excessive.  In that
 210          case we do not use the block.  */
 211       || __builtin_expect (result->stackblock_size > 4 * size, 0))
 212     {
 213       /* Release the lock.  */
 214       lll_unlock (stack_cache_lock, LLL_PRIVATE);
 215
 216       return NULL;
 217     }
 218
 219   /* Dequeue the entry.  */
 220   stack_list_del (&result->list);
 221
 222   /* And add to the list of stacks in use.  */
 223   stack_list_add (&result->list, &stack_used);
 224
 225   /* And decrease the cache size.  */
 226   stack_cache_actsize -= result->stackblock_size;
 227
 228   /* Release the lock early.  */
 229   lll_unlock (stack_cache_lock, LLL_PRIVATE);
 230
 231   /* Report size and location of the stack to the caller.  */
 232   *sizep = result->stackblock_size;
 233   *memp = result->stackblock;
 234
 235   /* Cancellation handling is back to the default.  */
 236   result->cancelhandling = 0;
 237   result->cleanup = NULL;
 238
 239   /* No pending event.  */
 240   result->nextevent = NULL;
 241
 242   /* Clear the DTV.  */
 243   dtv_t *dtv = GET_DTV (TLS_TPADJ (result));
 244   memset (dtv, '\0', (dtv[-1].counter + 1) * sizeof (dtv_t));
 245
 246   /* Re-initialize the TLS.  */
 247   _dl_allocate_tls_init (TLS_TPADJ (result));
 248
 249   return result;
 250 }
 251
 252
 253 /* Free stacks until cache size is lower than LIMIT.  */
 254 void
 255 __free_stacks (size_t limit)
 256 {
 257   /* We reduce the size of the cache.  Remove the last entries until
 258      the size is below the limit.  */
 259   list_t *entry;
 260   list_t *prev;
 261
 262   /* Search from the end of the list.  */
 263   list_for_each_prev_safe (entry, prev, &stack_cache)
 264     {
 265       struct pthread *curr;
 266
 267       curr = list_entry (entry, struct pthread, list);
 268       if (FREE_P (curr))
 269         {
 270           /* Unlink the block.  */
 271           stack_list_del (entry);
 272
 273           /* Account for the freed memory.  */
 274           stack_cache_actsize -= curr->stackblock_size;
 275
 276           /* Free the memory associated with the ELF TLS.  */
 277           _dl_deallocate_tls (TLS_TPADJ (curr), false);
 278
 279           /* Remove this block.  This should never fail.  If it does
 280              something is really wrong.  */
 281           if (munmap (curr->stackblock, curr->stackblock_size) != 0)
 282             abort ();
 283
 284           /* Maybe we have freed enough.  */
 285           if (stack_cache_actsize <= limit)
 286             break;
 287         }
 288     }
 289 }
 290
 291
 292 /* Add a stack frame which is not used anymore to the stack.  Must be
 293    called with the cache lock held.  */
 294 static inline void
 295 __attribute ((always_inline))
 296 queue_stack (struct pthread *stack)
 297 {
 298   /* We unconditionally add the stack to the list.  The memory may
 299      still be in use but it will not be reused until the kernel marks
 300      the stack as not used anymore.  */
 301   stack_list_add (&stack->list, &stack_cache);
 302
 303   stack_cache_actsize += stack->stackblock_size;
 304   if (__builtin_expect (stack_cache_actsize > stack_cache_maxsize, 0))
 305     __free_stacks (stack_cache_maxsize);
 306 }
 307
 308
 309 static int
 310 internal_function
 311 change_stack_perm (struct pthread *pd
 312 #ifdef NEED_SEPARATE_REGISTER_STACK
 313                    , size_t pagemask
 314 #endif
 315                    )
 316 {
 317 #ifdef NEED_SEPARATE_REGISTER_STACK
 318   void *stack = (pd->stackblock
 319                  + (((((pd->stackblock_size - pd->guardsize) / 2)
 320                       & pagemask) + pd->guardsize) & pagemask));
 321   size_t len = pd->stackblock + pd->stackblock_size - stack;
 322 #elif defined _STACK_GROWS_DOWN
 323   void *stack = pd->stackblock + pd->guardsize;
 324   size_t len = pd->stackblock_size - pd->guardsize;
 325 #elif defined _STACK_GROWS_UP
 326   void *stack = pd->stackblock;
 327   size_t len = (uintptr_t) pd - pd->guardsize - (uintptr_t) pd->stackblock;
 328 #else
 329 # error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
 330 #endif
 331   if (mprotect (stack, len, PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
 332     return errno;
 333
 334   return 0;
 335 }
 336
 337
     /* Set up the stack and the thread descriptor for a new thread
        described by ATTR.

        If ATTR has ATTR_FLAG_STACKADDR set, the memory supplied by the user
        is used and the descriptor is put on __stack_user.  Otherwise a
        block is taken from the stack cache or, failing that, freshly
        mmap'ed; the descriptor is put on stack_used and the guard area is
        created or resized to match ATTR.

        On success the descriptor is stored in *PDP, the stack address (and
        with NEED_SEPARATE_REGISTER_STACK also its size) is passed back
        through the ALLOCATE_STACK_PARMS pointers, and 0 is returned.  On
        failure an error number is returned: EINVAL if the requested size
        is too small, EAGAIN if the TLS allocation fails or mmap reports
        ENOMEM, or the errno value of the failing mmap or mprotect call.  */
 338 static int
 339 allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
 340                 ALLOCATE_STACK_PARMS)
 341 {
 342   struct pthread *pd;
 343   size_t size;
 344   size_t pagesize_m1 = __getpagesize () - 1;
 345   void *stacktop;
 346
 347   assert (attr != NULL);
 348   assert (powerof2 (pagesize_m1 + 1));
 349   assert (TCB_ALIGNMENT >= STACK_ALIGN);
 350
 351   /* Get the stack size from the attribute if it is set.  Otherwise we
 352      use the default we determined at start time.  */
 353   size = attr->stacksize ?: __default_stacksize;
 354
 355   /* Get memory for the stack.  */
 356   if (__builtin_expect (attr->flags & ATTR_FLAG_STACKADDR, 0))
 357     {
 358       uintptr_t adj;
 359
 360       /* If the user also specified the size of the stack make sure it
 361          is large enough.  */
 362       if (attr->stacksize != 0
 363           && attr->stacksize < (__static_tls_size + MINIMAL_REST_STACK))
 364         return EINVAL;
 365
 366       /* Adjust stack size for alignment of the TLS block.  */
 367 #if defined(TLS_TCB_AT_TP)
 368       adj = ((uintptr_t) attr->stackaddr - TLS_TCB_SIZE)
 369             & __static_tls_align_m1;
 370       assert (size > adj + TLS_TCB_SIZE);
 371 #elif defined(TLS_DTV_AT_TP)
 372       adj = ((uintptr_t) attr->stackaddr - __static_tls_size)
 373             & __static_tls_align_m1;
 374       assert (size > adj);
 375 #endif
 376
 377       /* The user provided some memory.  Let's hope it matches the
 378          size...  We do not allocate guard pages if the user provided
 379          the stack.  It is the user's responsibility to do this if it
 380          is wanted.  */
 381 #if defined(TLS_TCB_AT_TP)
 382       pd = (struct pthread *) ((uintptr_t) attr->stackaddr
 383                                - TLS_TCB_SIZE - adj);
 384 #elif defined(TLS_DTV_AT_TP)
 385       pd = (struct pthread *) (((uintptr_t) attr->stackaddr
 386                                 - __static_tls_size - adj)
 387                                - TLS_PRE_TCB_SIZE);
 388 #endif
 389
 390       /* The user provided stack memory needs to be cleared.  */
 391       memset (pd, '\0', sizeof (struct pthread));
 392
 393       /* The first TSD block is included in the TCB.  */
 394       pd->specific[0] = pd->specific_1stblock;
 395
 396       /* Remember the stack-related values.  attr->stackaddr is treated
                as the high end of the user's memory, so the block starts
                SIZE bytes below it.  */
 397       pd->stackblock = (char *) attr->stackaddr - size;
 398       pd->stackblock_size = size;
 399
 400       /* This is a user-provided stack.  It will not be queued in the
 401          stack cache nor will the memory (except the TLS memory) be freed.  */
 402       pd->user_stack = true;
 403
 404       /* This is at least the second thread.  */
 405       pd->header.multiple_threads = 1;
 406 #ifndef TLS_MULTIPLE_THREADS_IN_TCB
 407       __pthread_multiple_threads = *__libc_multiple_threads_ptr = 1;
 408 #endif
 409
 410 #ifndef __ASSUME_PRIVATE_FUTEX
 411       /* The thread must know when private futexes are supported.  */
 412       pd->header.private_futex = THREAD_GETMEM (THREAD_SELF,
 413                                                 header.private_futex);
 414 #endif
 415
 416 #ifdef NEED_DL_SYSINFO
 417       /* Copy the sysinfo value from the parent.  */
 418       THREAD_SYSINFO(pd) = THREAD_SELF_SYSINFO;
 419 #endif
 420
 421       /* Allocate the DTV for this thread.  */
 422       if (_dl_allocate_tls (TLS_TPADJ (pd)) == NULL)
 423         {
 424           /* Something went wrong.  */
 425           assert (errno == ENOMEM);
 426           return EAGAIN;
 427         }
 428
 429
 430       /* Prepare to modify global data.  */
 431       lll_lock (stack_cache_lock, LLL_PRIVATE);
 432
 433       /* And add to the list of stacks in use.  */
 434       list_add (&pd->list, &__stack_user);
 435
 436       lll_unlock (stack_cache_lock, LLL_PRIVATE);
 437     }
 438   else
 439     {
 440       /* Allocate some anonymous memory.  If possible use the cache.  */
 441       size_t guardsize;
 442       size_t reqsize;
 443       void *mem = 0;
 444       const int prot = (PROT_READ | PROT_WRITE);
 445
 446 #if defined COLORING_INCREMENT && COLORING_INCREMENT != 0
 447       /* Add one more page for stack coloring.  Don't do it for stacks
 448          with 16 times pagesize or larger.  This might just cause
 449          unnecessary misalignment.  */
 450       if (size <= 16 * pagesize_m1)
 451         size += pagesize_m1 + 1;
 452 #endif
 453
 454       /* Adjust the stack size for alignment.  */
 455       size &= ~__static_tls_align_m1;
 456       assert (size != 0);
 457
 458       /* Make sure the size of the stack is enough for the guard and
 459          eventually the thread descriptor.  The guard size from the
                attribute is rounded up to a whole number of pages first.  */
 460       guardsize = (attr->guardsize + pagesize_m1) & ~pagesize_m1;
 461       if (__builtin_expect (size < ((guardsize + __static_tls_size
 462                                      + MINIMAL_REST_STACK + pagesize_m1)
 463                                     & ~pagesize_m1),
 464                             0))
 465         /* The stack is too small (or the guard too large).  */
 466         return EINVAL;
 467
 468       /* Try to get a stack from the cache.  REQSIZE keeps the size
                that was asked for; get_cached_stack may replace SIZE with
                the size of a larger cached block.  */
 469       reqsize = size;
 470       pd = get_cached_stack (&size, &mem);
 471       if (pd == NULL)
 472         {
 473           /* To avoid aliasing effects on a larger scale than pages we
 474              adjust the allocated stack size if necessary.  This way
 475              allocations directly following each other will not have
 476              aliasing problems.  */
 477 #if defined MULTI_PAGE_ALIASING && MULTI_PAGE_ALIASING != 0
 478           if ((size % MULTI_PAGE_ALIASING) == 0)
 479             size += pagesize_m1 + 1;
 480 #endif
 481
 482           mem = mmap (NULL, size, prot,
 483                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
 484
 485           if (__builtin_expect (mem == MAP_FAILED, 0))
 486             {
                    /* Memory exhaustion is reported to the caller as
                       EAGAIN; any other errno value is passed through.  */
 487               if (errno == ENOMEM)
 488                 __set_errno (EAGAIN);
 489
 490                return errno;
 491             }
 492
 493           /* SIZE is guaranteed to be greater than zero.
 494              So we can never get a null pointer back from mmap.  */
 495           assert (mem != NULL);
 496
 497 #if defined COLORING_INCREMENT && COLORING_INCREMENT != 0
 498           /* Atomically increment NCREATED.  */
 499           unsigned int ncreated = atomic_increment_val (&nptl_ncreated);
 500
 501           /* We chose the offset for coloring by incrementing it for
 502              every new thread by a fixed amount.  The offset is used
 503              modulo the page size.  Even if coloring would be better
 504              relative to higher alignment values it makes no sense to
 505              do it since the mmap() interface does not allow us to
 506              specify any alignment for the returned memory block.  */
 507           size_t coloring = (ncreated * COLORING_INCREMENT) & pagesize_m1;
 508
 509           /* Make sure the coloring offset does not disturb the alignment
 510              of the TCB and static TLS block.  */
 511           if (__builtin_expect ((coloring & __static_tls_align_m1) != 0, 0))
 512             coloring = (((coloring + __static_tls_align_m1)
 513                          & ~(__static_tls_align_m1))
 514                         & ~pagesize_m1);
 515 #else
 516           /* Unless specified we do not make any adjustments.  Note that
                    this macro stays defined for the rest of the file.  */
 517 # define coloring 0
 518 #endif
 519
 520           /* Place the thread descriptor at the end of the stack.  */
 521 #if defined(TLS_TCB_AT_TP)
 522           pd = (struct pthread *) ((char *) mem + size - coloring) - 1;
 523 #elif defined(TLS_DTV_AT_TP)
 524           pd = (struct pthread *) ((((uintptr_t) mem + size - coloring
 525                                     - __static_tls_size)
 526                                     & ~__static_tls_align_m1)
 527                                    - TLS_PRE_TCB_SIZE);
 528 #endif
 529
 530           /* Remember the stack-related values.  */
 531           pd->stackblock = mem;
 532           pd->stackblock_size = size;
 533
 534           /* We allocated the first block thread-specific data array.
 535              This address will not change for the lifetime of this
 536              descriptor.  */
 537           pd->specific[0] = pd->specific_1stblock;
 538
 539           /* This is at least the second thread.  */
 540           pd->header.multiple_threads = 1;
 541 #ifndef TLS_MULTIPLE_THREADS_IN_TCB
 542           __pthread_multiple_threads = *__libc_multiple_threads_ptr = 1;
 543 #endif
 544
 545 #ifndef __ASSUME_PRIVATE_FUTEX
 546           /* The thread must know when private futexes are supported.  */
 547           pd->header.private_futex = THREAD_GETMEM (THREAD_SELF,
 548                                                     header.private_futex);
 549 #endif
 550
 551 #ifdef NEED_DL_SYSINFO
 552           /* Copy the sysinfo value from the parent.  */
 553           THREAD_SYSINFO(pd) = THREAD_SELF_SYSINFO;
 554 #endif
 555
 556           /* Allocate the DTV for this thread.  */
 557           if (_dl_allocate_tls (TLS_TPADJ (pd)) == NULL)
 558             {
 559               /* Something went wrong.  */
 560               assert (errno == ENOMEM);
 561
 562               /* Free the stack memory we just allocated.  */
 563               (void) munmap (mem, size);
 564
 565               return EAGAIN;
 566             }
 567
 568
 569           /* Prepare to modify global data.  */
 570           lll_lock (stack_cache_lock, LLL_PRIVATE);
 571
 572           /* And add to the list of stacks in use.  */
 573           stack_list_add (&pd->list, &stack_used);
 574
 575           lll_unlock (stack_cache_lock, LLL_PRIVATE);
 576
 577
 578           /* Note that all of the stack and the thread descriptor is
 579              zeroed.  This means we do not have to initialize fields
 580              with initial value zero.  This is specifically true for
 581              the 'tid' field which is always set back to zero once the
 582              stack is not used anymore and for the 'guardsize' field
 583              which will be read next.  */
 584         }
 585
 586       /* Create or resize the guard area if necessary.  */
 587       if (__builtin_expect (guardsize > pd->guardsize, 0))
 588         {
 589 #ifdef NEED_SEPARATE_REGISTER_STACK
 590           char *guard = mem + (((size - guardsize) / 2) & ~pagesize_m1);
 591 #elif defined _STACK_GROWS_DOWN
 592           char *guard = mem;
 593 #elif defined _STACK_GROWS_UP
 594           char *guard = (char *) (((uintptr_t) pd - guardsize) & ~pagesize_m1);
 595 #endif
 596           if (mprotect (guard, guardsize, PROT_NONE) != 0)
 597             {
 598               int err;
                    /* Shared failure path.  It is also entered by goto from
                       the branch below that shrinks an oversized guard.  */
 599             mprot_error:
 600               err = errno;
 601
 602               lll_lock (stack_cache_lock, LLL_PRIVATE);
 603
 604               /* Remove the thread from the list.  */
 605               stack_list_del (&pd->list);
 606
 607               lll_unlock (stack_cache_lock, LLL_PRIVATE);
 608
 609               /* Get rid of the TLS block we allocated.  */
 610               _dl_deallocate_tls (TLS_TPADJ (pd), false);
 611
 612               /* Free the stack memory regardless of whether the size
 613                  of the cache is over the limit or not.  If this piece
 614                  of memory caused problems we better do not use it
 615                  anymore.  Uh, and we ignore possible errors.  There
 616                  is nothing we could do.  */
 617               (void) munmap (mem, size);
 618
 619               return err;
 620             }
 621
 622           pd->guardsize = guardsize;
 623         }
 624       else if (__builtin_expect (pd->guardsize - guardsize > size - reqsize,
 625                                  0))
 626         {
 627           /* The old guard area is too large.  Make the excess part
                    accessible again.  */
 628
 629 #ifdef NEED_SEPARATE_REGISTER_STACK
 630           char *guard = mem + (((size - guardsize) / 2) & ~pagesize_m1);
 631           char *oldguard = mem + (((size - pd->guardsize) / 2) & ~pagesize_m1);
 632
 633           if (oldguard < guard
 634               && mprotect (oldguard, guard - oldguard, prot) != 0)
 635             goto mprot_error;
 636
 637           if (mprotect (guard + guardsize,
 638                         oldguard + pd->guardsize - guard - guardsize,
 639                         prot) != 0)
 640             goto mprot_error;
 641 #elif defined _STACK_GROWS_DOWN
 642           if (mprotect ((char *) mem + guardsize, pd->guardsize - guardsize,
 643                         prot) != 0)
 644             goto mprot_error;
 645 #elif defined _STACK_GROWS_UP
 646           if (mprotect ((char *) (((uintptr_t) pd - pd->guardsize) & ~pagesize_m1),
 647                         pd->guardsize - guardsize, prot) != 0)
 648             goto mprot_error;
 649 #endif
 650
 651           pd->guardsize = guardsize;
 652         }
 653       /* The pthread_getattr_np() calls need to get passed the size
 654          requested in the attribute, regardless of how large the
 655          actually used guardsize is.  */
 656       pd->reported_guardsize = guardsize;
 657     }
 658
 659   /* Initialize the lock.  We have to do this unconditionally since the
 660      stillborn thread could be canceled while the lock is taken.  */
 661   pd->lock = LLL_LOCK_INITIALIZER;
 662
 663   /* The robust mutex lists also need to be initialized
 664      unconditionally because the cleanup for the previous stack owner
 665      might have happened in the kernel.  */
 666   pd->robust_head.futex_offset = (offsetof (pthread_mutex_t, __data.__lock)
 667                                   - offsetof (pthread_mutex_t,
 668                                               __data.__list.__next));
 669   pd->robust_head.list_op_pending = NULL;
 670 #ifdef __PTHREAD_MUTEX_HAVE_PREV
 671   pd->robust_prev = &pd->robust_head;
 672 #endif
 673   pd->robust_head.list = &pd->robust_head;
 674
 675   /* We place the thread descriptor at the end of the stack.  */
 676   *pdp = pd;
 677
 678 #if defined(TLS_TCB_AT_TP)
 679   /* The stack begins before the TCB and the static TLS block.  */
 680   stacktop = ((char *) (pd + 1) - __static_tls_size);
 681 #elif defined(TLS_DTV_AT_TP)
 682   stacktop = (char *) (pd - 1);
 683 #endif
 684
       /* Report the stack back to the caller in the form the selected
          stack layout needs.  */
 685 #ifdef NEED_SEPARATE_REGISTER_STACK
 686   *stack = pd->stackblock;
 687   *stacksize = stacktop - *stack;
 688 #elif defined _STACK_GROWS_DOWN
 689   *stack = stacktop;
 690 #elif defined _STACK_GROWS_UP
 691   *stack = pd->stackblock;
 692   assert (*stack > 0);
 693 #endif
 694
 695   return 0;
 696 }
 697
 698
 699 void
 700 internal_function
 701 __deallocate_stack (struct pthread *pd)
 702 {
 703   lll_lock (stack_cache_lock, LLL_PRIVATE);
 704
 705   /* Remove the thread from the list of threads with user defined
 706      stacks.  */
 707   stack_list_del (&pd->list);
 708
 709   /* Not much to do.  Just free the mmap()ed memory.  Note that we do
 710      not reset the 'used' flag in the 'tid' field.  This is done by
 711      the kernel.  If no thread has been created yet this field is
 712      still zero.  */
 713   if (__builtin_expect (! pd->user_stack, 1))
 714     (void) queue_stack (pd);
 715   else
 716     /* Free the memory associated with the ELF TLS.  */
 717     _dl_deallocate_tls (TLS_TPADJ (pd), false);
 718
 719   lll_unlock (stack_cache_lock, LLL_PRIVATE);
 720 }
 721
 722
 723 int
 724 internal_function
 725 __make_stacks_executable (void **stack_endp)
 726 {
 727   /* First the main thread's stack.  */
 728   int err = EPERM;
 729   if (err != 0)
 730     return err;
 731
 732 #ifdef NEED_SEPARATE_REGISTER_STACK
 733   const size_t pagemask = ~(__getpagesize () - 1);
 734 #endif
 735
 736   lll_lock (stack_cache_lock, LLL_PRIVATE);
 737
 738   list_t *runp;
 739   list_for_each (runp, &stack_used)
 740     {
 741       err = change_stack_perm (list_entry (runp, struct pthread, list)
 742 #ifdef NEED_SEPARATE_REGISTER_STACK
 743                                , pagemask
 744 #endif
 745                                );
 746       if (err != 0)
 747         break;
 748     }
 749
 750   /* Also change the permission for the currently unused stacks.  This
 751      might be wasted time but better spend it here than adding a check
 752      in the fast path.  */
 753   if (err == 0)
 754     list_for_each (runp, &stack_cache)
 755       {
 756         err = change_stack_perm (list_entry (runp, struct pthread, list)
 757 #ifdef NEED_SEPARATE_REGISTER_STACK
 758                                  , pagemask
 759 #endif
 760                                  );
 761         if (err != 0)
 762           break;
 763       }
 764
 765   lll_unlock (stack_cache_lock, LLL_PRIVATE);
 766
 767   return err;
 768 }
 769
 770
 771 /* In case of a fork() call the memory allocation in the child will be
 772    the same but only one thread is running.  All stacks except that of
 773    the one running thread are not used anymore.  We have to recycle
 774    them.  */
 775 void
 776 __reclaim_stacks (void)
 777 {
 778   struct pthread *self = (struct pthread *) THREAD_SELF;
 779
 780   /* No locking necessary.  The caller is the only stack in use.  But
 781      we have to be aware that we might have interrupted a list
 782      operation.  */
 783
 784   if (in_flight_stack != 0)
 785     {
 786       bool add_p = in_flight_stack & 1;
 787       list_t *elem = (list_t *)(uintptr_t)(in_flight_stack & ~UINTMAX_C (1));
 788
 789       if (add_p)
 790         {
 791           /* We always add at the beginning of the list.  So in this
 792              case we only need to check the beginning of these lists.  */
 793           int check_list (list_t *l)
 794           {
 795             if (l->next->prev != l)
 796               {
 797                 assert (l->next->prev == elem);
 798
 799                 elem->next = l->next;
 800                 elem->prev = l;
 801                 l->next = elem;
 802
 803                 return 1;
 804               }
 805
 806             return 0;
 807           }
 808
 809           if (check_list (&stack_used) == 0)
 810             (void) check_list (&stack_cache);
 811         }
 812       else
 813         {
 814           /* We can simply always replay the delete operation.  */
 815           elem->next->prev = elem->prev;
 816           elem->prev->next = elem->next;
 817         }
 818     }
 819
 820   /* Mark all stacks except the still running one as free.  */
 821   list_t *runp;
 822   list_for_each (runp, &stack_used)
 823     {
 824       struct pthread *curp = list_entry (runp, struct pthread, list);
 825       if (curp != self)
 826         {
 827           /* This marks the stack as free.  */
 828           curp->tid = 0;
 829
 830           /* Account for the size of the stack.  */
 831           stack_cache_actsize += curp->stackblock_size;
 832
 833           if (curp->specific_used)
 834             {
 835               /* Clear the thread-specific data.  */
 836               memset (curp->specific_1stblock, '\0',
 837                       sizeof (curp->specific_1stblock));
 838
 839               curp->specific_used = false;
 840
 841               size_t cnt;
 842               for (cnt = 1; cnt < PTHREAD_KEY_1STLEVEL_SIZE; ++cnt)
 843                 if (curp->specific[cnt] != NULL)
 844                   {
 845                     memset (curp->specific[cnt], '\0',
 846                             sizeof (curp->specific_1stblock));
 847
 848                     /* We have allocated the block which we do not
 849                        free here so re-set the bit.  */
 850                     curp->specific_used = true;
 851                   }
 852             }
 853         }
 854     }
 855
 856   /* Add the stack of all running threads to the cache.  */
 857   list_splice (&stack_used, &stack_cache);
 858
 859   /* Remove the entry for the current thread to from the cache list
 860      and add it to the list of running threads.  Which of the two
 861      lists is decided by the user_stack flag.  */
 862   stack_list_del (&self->list);
 863
 864   /* Re-initialize the lists for all the threads.  */
 865   INIT_LIST_HEAD (&stack_used);
 866   INIT_LIST_HEAD (&__stack_user);
 867
 868   if (__builtin_expect (THREAD_GETMEM (self, user_stack), 0))
 869     list_add (&self->list, &__stack_user);
 870   else
 871     list_add (&self->list, &stack_used);
 872
 873   /* There is one thread running.  */
 874   __nptl_nthreads = 1;
 875
 876   in_flight_stack = 0;
 877
 878   /* Initialize the lock.  */
 879   stack_cache_lock = LLL_LOCK_INITIALIZER;
 880 }
 881
 882
 883 static void
 884 internal_function
 885 setxid_mark_thread (struct xid_command *cmdp, struct pthread *t)
 886 {
 887   int ch;
 888
 889   /* Don't let the thread exit before the setxid handler runs.  */
 890   t->setxid_futex = 0;
 891
 892   do
 893     {
 894       ch = t->cancelhandling;
 895
 896       /* If the thread is exiting right now, ignore it.  */
 897       if ((ch & EXITING_BITMASK) != 0)
 898         return;
 899     }
 900   while (atomic_compare_and_exchange_bool_acq (&t->cancelhandling,
 901                                                ch | SETXID_BITMASK, ch));
 902 }
 903
 904
 905 static void
 906 internal_function
 907 setxid_unmark_thread (struct xid_command *cmdp, struct pthread *t)
 908 {
 909   int ch;
 910
 911   do
 912     {
 913       ch = t->cancelhandling;
 914       if ((ch & SETXID_BITMASK) == 0)
 915         return;
 916     }
 917   while (atomic_compare_and_exchange_bool_acq (&t->cancelhandling,
 918                                                ch & ~SETXID_BITMASK, ch));
 919
 920   /* Release the futex just in case.  */
 921   t->setxid_futex = 1;
 922   lll_futex_wake (&t->setxid_futex, 1, LLL_PRIVATE);
 923 }
 924
 925
 926 static int
 927 internal_function
 928 setxid_signal_thread (struct xid_command *cmdp, struct pthread *t)
 929 {
 930   if ((t->cancelhandling & SETXID_BITMASK) == 0)
 931     return 0;
 932
 933   int val;
 934   pid_t pid = getpid ();
 935   INTERNAL_SYSCALL_DECL (err);
 936   val = INTERNAL_SYSCALL (tgkill, err, 3, pid, t->tid, SIGSETXID);
 937
 938   /* If this failed, it must have had not started yet or else exited.  */
 939   if (!INTERNAL_SYSCALL_ERROR_P (val, err))
 940     {
 941       atomic_increment (&cmdp->cntr);
 942       return 1;
 943     }
 944   else
 945     return 0;
 946 }
 947
 948
 949 int
 950 attribute_hidden
 951 __nptl_setxid (struct xid_command *cmdp)
 952 {
 953   int signalled;
 954   int result;
 955   lll_lock (stack_cache_lock, LLL_PRIVATE);
 956
 957   __xidcmd = cmdp;
 958   cmdp->cntr = 0;
 959
 960   struct pthread *self = THREAD_SELF;
 961
 962   /* Iterate over the list with system-allocated threads first.  */
 963   list_t *runp;
 964   list_for_each (runp, &stack_used)
 965     {
 966       struct pthread *t = list_entry (runp, struct pthread, list);
 967       if (t == self)
 968         continue;
 969
 970       setxid_mark_thread (cmdp, t);
 971     }
 972
 973   /* Now the list with threads using user-allocated stacks.  */
 974   list_for_each (runp, &__stack_user)
 975     {
 976       struct pthread *t = list_entry (runp, struct pthread, list);
 977       if (t == self)
 978         continue;
 979
 980       setxid_mark_thread (cmdp, t);
 981     }
 982
 983   /* Iterate until we don't succeed in signalling anyone.  That means
 984      we have gotten all running threads, and their children will be
 985      automatically correct once started.  */
 986   do
 987     {
 988       signalled = 0;
 989
 990       list_for_each (runp, &stack_used)
 991         {
 992           struct pthread *t = list_entry (runp, struct pthread, list);
 993           if (t == self)
 994             continue;
 995
 996           signalled += setxid_signal_thread (cmdp, t);
 997         }
 998
 999       list_for_each (runp, &__stack_user)
1000         {
1001           struct pthread *t = list_entry (runp, struct pthread, list);
1002           if (t == self)
1003             continue;
1004
1005           signalled += setxid_signal_thread (cmdp, t);
1006         }
1007
1008       int cur = cmdp->cntr;
1009       while (cur != 0)
1010         {
1011           lll_futex_wait (&cmdp->cntr, cur, LLL_PRIVATE);
1012           cur = cmdp->cntr;
1013         }
1014     }
1015   while (signalled != 0);
1016
1017   /* Clean up flags, so that no thread blocks during exit waiting
1018      for a signal which will never come.  */
1019   list_for_each (runp, &stack_used)
1020     {
1021       struct pthread *t = list_entry (runp, struct pthread, list);
1022       if (t == self)
1023         continue;
1024
1025       setxid_unmark_thread (cmdp, t);
1026     }
1027
1028   list_for_each (runp, &__stack_user)
1029     {
1030       struct pthread *t = list_entry (runp, struct pthread, list);
1031       if (t == self)
1032         continue;
1033
1034       setxid_unmark_thread (cmdp, t);
1035     }
1036
1037   /* This must be last, otherwise the current thread might not have
1038      permissions to send SIGSETXID syscall to the other threads.  */
1039   INTERNAL_SYSCALL_DECL (err);
1040   result = INTERNAL_SYSCALL_NCS (cmdp->syscall_no, err, 3,
1041                                  cmdp->id[0], cmdp->id[1], cmdp->id[2]);
1042   if (INTERNAL_SYSCALL_ERROR_P (result, err))
1043     {
1044       __set_errno (INTERNAL_SYSCALL_ERRNO (result, err));
1045       result = -1;
1046     }
1047
1048   lll_unlock (stack_cache_lock, LLL_PRIVATE);
1049   return result;
1050 }
1051
1052 static inline void __attribute__((always_inline))
1053 init_one_static_tls (struct pthread *curp, struct link_map *map)
1054 {
1055   dtv_t *dtv = GET_DTV (TLS_TPADJ (curp));
1056 # if defined(TLS_TCB_AT_TP)
1057   void *dest = (char *) curp - map->l_tls_offset;
1058 # elif defined(TLS_DTV_AT_TP)
1059   void *dest = (char *) curp + map->l_tls_offset + TLS_PRE_TCB_SIZE;
1060 # else
1061 #  error "Either TLS_TCB_AT_TP or TLS_DTV_AT_TP must be defined"
1062 # endif
1063
1064   /* Fill in the DTV slot so that a later LD/GD access will find it.  */
1065   dtv[map->l_tls_modid].pointer.val = dest;
1066   dtv[map->l_tls_modid].pointer.is_static = true;
1067
1068   /* Initialize the memory.  */
1069   memset (mempcpy (dest, map->l_tls_initimage, map->l_tls_initimage_size),
1070           '\0', map->l_tls_blocksize - map->l_tls_initimage_size);
1071 }
1072
1073 void
1074 attribute_hidden
1075 __pthread_init_static_tls (struct link_map *map)
1076 {
1077   lll_lock (stack_cache_lock, LLL_PRIVATE);
1078
1079   /* Iterate over the list with system-allocated threads first.  */
1080   list_t *runp;
1081   list_for_each (runp, &stack_used)
1082     init_one_static_tls (list_entry (runp, struct pthread, list), map);
1083
1084   /* Now the list with threads using user-allocated stacks.  */
1085   list_for_each (runp, &__stack_user)
1086     init_one_static_tls (list_entry (runp, struct pthread, list), map);
1087
1088   lll_unlock (stack_cache_lock, LLL_PRIVATE);
1089 }
1090
1091
1092 void
1093 attribute_hidden
1094 __wait_lookup_done (void)
1095 {
1096   lll_lock (stack_cache_lock, LLL_PRIVATE);
1097
1098   struct pthread *self = THREAD_SELF;
1099
1100   /* Iterate over the list with system-allocated threads first.  */
1101   list_t *runp;
1102   list_for_each (runp, &stack_used)
1103     {
1104       struct pthread *t = list_entry (runp, struct pthread, list);
1105       if (t == self || t->header.gscope_flag == THREAD_GSCOPE_FLAG_UNUSED)
1106         continue;
1107
1108       int *const gscope_flagp = &t->header.gscope_flag;
1109
1110       /* We have to wait until this thread is done with the global
1111          scope.  First tell the thread that we are waiting and
1112          possibly have to be woken.  */
1113       if (atomic_compare_and_exchange_bool_acq (gscope_flagp,
1114                                                 THREAD_GSCOPE_FLAG_WAIT,
1115                                                 THREAD_GSCOPE_FLAG_USED))
1116         continue;
1117
1118       do
1119         lll_futex_wait (gscope_flagp, THREAD_GSCOPE_FLAG_WAIT, LLL_PRIVATE);
1120       while (*gscope_flagp == THREAD_GSCOPE_FLAG_WAIT);
1121     }
1122
1123   /* Now the list with threads using user-allocated stacks.  */
1124   list_for_each (runp, &__stack_user)
1125     {
1126       struct pthread *t = list_entry (runp, struct pthread, list);
1127       if (t == self || t->header.gscope_flag == THREAD_GSCOPE_FLAG_UNUSED)
1128         continue;
1129
1130       int *const gscope_flagp = &t->header.gscope_flag;
1131
1132       /* We have to wait until this thread is done with the global
1133          scope.  First tell the thread that we are waiting and
1134          possibly have to be woken.  */
1135       if (atomic_compare_and_exchange_bool_acq (gscope_flagp,
1136                                                 THREAD_GSCOPE_FLAG_WAIT,
1137                                                 THREAD_GSCOPE_FLAG_USED))
1138         continue;
1139
1140       do
1141         lll_futex_wait (gscope_flagp, THREAD_GSCOPE_FLAG_WAIT, LLL_PRIVATE);
1142       while (*gscope_flagp == THREAD_GSCOPE_FLAG_WAIT);
1143     }
1144
1145   lll_unlock (stack_cache_lock, LLL_PRIVATE);
1146 }