nptl/allocatestack.c

   1 /* Copyright (C) 2002-2013 Free Software Foundation, Inc.
   2    This file is part of the GNU C Library.
   3    Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <http://www.gnu.org/licenses/>.  */
  18
  19 #include <assert.h>
  20 #include <errno.h>
  21 #include <signal.h>
  22 #include <stdint.h>
  23 #include <string.h>
  24 #include <unistd.h>
  25 #include <sys/mman.h>
  26 #include <sys/param.h>
  27 #include <dl-sysdep.h>
  28 #include <dl-tls.h>
  29 #include <tls.h>
  30 #include <list.h>
  31 #include <lowlevellock.h>
  32 #include <kernel-features.h>
  33
  34
   35 #ifndef NEED_SEPARATE_REGISTER_STACK
   36
   37 /* Most architectures have exactly one stack pointer.  Some have more.  */
   38 # define STACK_VARIABLES void *stackaddr = NULL
   39
   40 /* How to pass the values to the 'create_thread' function.  */
   41 # define STACK_VARIABLES_ARGS stackaddr
   42
   43 /* How to declare a function which receives these parameters.  */
   44 # define STACK_VARIABLES_PARMS void *stackaddr
   45
   46 /* How to declare the output parameters of allocate_stack.  */
   47 # define ALLOCATE_STACK_PARMS void **stack
   48
   49 /* This is how the function is called.  We do it this way to allow
   50    other variants of the function to have more parameters.  */
   51 # define ALLOCATE_STACK(attr, pd) allocate_stack (attr, pd, &stackaddr)
   52
   53 #else
   54
   55 /* We need two stacks.  The kernel will place them but we have to tell
   56    the kernel about the size of the reserved address space.  */
   57 # define STACK_VARIABLES void *stackaddr = NULL; size_t stacksize = 0
   58
   59 /* How to pass the values to the 'create_thread' function.  */
   60 # define STACK_VARIABLES_ARGS stackaddr, stacksize
   61
   62 /* How to declare a function which receives these parameters.  */
   63 # define STACK_VARIABLES_PARMS void *stackaddr, size_t stacksize
   64
   65 /* How to declare the output parameters of allocate_stack.  */
   66 # define ALLOCATE_STACK_PARMS void **stack, size_t *stacksize
   67
   68 /* This is how the function is called.  We do it this way to allow
   69    other variants of the function to have more parameters.  */
   70 # define ALLOCATE_STACK(attr, pd) \
   71   allocate_stack (attr, pd, &stackaddr, &stacksize)
   72
   73 #endif
  74
  75
   76 /* Default alignment of the stack, used unless it was defined earlier.  */
   77 #ifndef STACK_ALIGN
   78 # define STACK_ALIGN __alignof__ (long double)
   79 #endif
   80
   81 /* Default minimal number of bytes of stack which must remain after
   82    allocating the thread descriptor and guard.  */
   83 #ifndef MINIMAL_REST_STACK
   84 # define MINIMAL_REST_STACK     4096
   85 #endif
   86
   87
   88 /* Newer kernels have the MAP_STACK flag to indicate a mapping is used for
   89    a stack.  Use it when possible; otherwise it expands to no flag.  */
   90 #ifndef MAP_STACK
   91 # define MAP_STACK 0
   92 #endif
   93
   94 /* Converts the thread descriptor PD into what TLS code calls the thread pointer.  */
   95 #if TLS_TCB_AT_TP
   96 # define TLS_TPADJ(pd) (pd)
   97 #elif TLS_DTV_AT_TP
   98 # define TLS_TPADJ(pd) ((struct pthread *)((char *) (pd) + TLS_PRE_TCB_SIZE))
   99 #endif
 100
  101 /* Cache handling for not-yet free stacks.  */
  102
  103 /* Maximum and current size of the cache, both in bytes.  */
  104 static size_t stack_cache_maxsize = 40 * 1024 * 1024; /* 40 MiB by default.  */
  105 static size_t stack_cache_actsize;
  106
  107 /* Lock protecting the cache size accounting and the stack lists below.  */
  108 static int stack_cache_lock = LLL_LOCK_INITIALIZER;
  109
  110 /* List of queued (cached) stacks available for reuse.  */
  111 static LIST_HEAD (stack_cache);
  112
  113 /* List of the stacks in use which were allocated by this file.  */
  114 static LIST_HEAD (stack_used);
  115
  116 /* We need to record what list operation we are about to perform so that,
  117    in case of an asynchronous interruption due to a fork() call, we can
  118    repair the lists: element address, low bit set for an insertion.  */
  119 static uintptr_t in_flight_stack;
  120
  121 /* List of the threads with user provided stacks in use.  No need to
  122    initialize this, since it's done in __pthread_initialize_minimal.  */
  123 list_t __stack_user __attribute__ ((nocommon));
  124 hidden_data_def (__stack_user)
  125
  126 #if COLORING_INCREMENT != 0
  127 /* Number of threads created; used to vary the stack coloring offset.  */
  128 static unsigned int nptl_ncreated;
  129 #endif
  130
  131
  132 /* True if the stack is no longer used, i.e. the TID field is not positive.  */
  133 #define FREE_P(descr) ((descr)->tid <= 0)
 134
 135
 136 static void
 137 stack_list_del (list_t *elem)
 138 {
 139   in_flight_stack = (uintptr_t) elem;
 140
 141   atomic_write_barrier ();
 142
 143   list_del (elem);
 144
 145   atomic_write_barrier ();
 146
 147   in_flight_stack = 0;
 148 }
 149
 150
 151 static void
 152 stack_list_add (list_t *elem, list_t *list)
 153 {
 154   in_flight_stack = (uintptr_t) elem | 1;
 155
 156   atomic_write_barrier ();
 157
 158   list_add (elem, list);
 159
 160   atomic_write_barrier ();
 161
 162   in_flight_stack = 0;
 163 }
 164
 165
  166 /* We create a doubly linked list of all cache entries.  Doubly linked
  167    because this allows removing entries from the end.  */
 168
 169
 170 /* Get a stack frame from the cache.  We have to match by size since
 171    some blocks might be too small or far too large.  */
 172 static struct pthread *
 173 get_cached_stack (size_t *sizep, void **memp)
 174 {
 175   size_t size = *sizep;
 176   struct pthread *result = NULL;
 177   list_t *entry;
 178
 179   lll_lock (stack_cache_lock, LLL_PRIVATE);
 180
 181   /* Search the cache for a matching entry.  We search for the
 182      smallest stack which has at least the required size.  Note that
 183      in normal situations the size of all allocated stacks is the
 184      same.  As the very least there are only a few different sizes.
 185      Therefore this loop will exit early most of the time with an
 186      exact match.  */
 187   list_for_each (entry, &stack_cache)
 188     {
 189       struct pthread *curr;
 190
 191       curr = list_entry (entry, struct pthread, list);
 192       if (FREE_P (curr) && curr->stackblock_size >= size)
 193         {
 194           if (curr->stackblock_size == size)
 195             {
 196               result = curr;
 197               break;
 198             }
 199
 200           if (result == NULL
 201               || result->stackblock_size > curr->stackblock_size)
 202             result = curr;
 203         }
 204     }
 205
 206   if (__builtin_expect (result == NULL, 0)
 207       /* Make sure the size difference is not too excessive.  In that
 208          case we do not use the block.  */
 209       || __builtin_expect (result->stackblock_size > 4 * size, 0))
 210     {
 211       /* Release the lock.  */
 212       lll_unlock (stack_cache_lock, LLL_PRIVATE);
 213
 214       return NULL;
 215     }
 216
 217   /* Don't allow setxid until cloned.  */
 218   result->setxid_futex = -1;
 219
 220   /* Dequeue the entry.  */
 221   stack_list_del (&result->list);
 222
 223   /* And add to the list of stacks in use.  */
 224   stack_list_add (&result->list, &stack_used);
 225
 226   /* And decrease the cache size.  */
 227   stack_cache_actsize -= result->stackblock_size;
 228
 229   /* Release the lock early.  */
 230   lll_unlock (stack_cache_lock, LLL_PRIVATE);
 231
 232   /* Report size and location of the stack to the caller.  */
 233   *sizep = result->stackblock_size;
 234   *memp = result->stackblock;
 235
 236   /* Cancellation handling is back to the default.  */
 237   result->cancelhandling = 0;
 238   result->cleanup = NULL;
 239
 240   /* No pending event.  */
 241   result->nextevent = NULL;
 242
 243   /* Clear the DTV.  */
 244   dtv_t *dtv = GET_DTV (TLS_TPADJ (result));
 245   for (size_t cnt = 0; cnt < dtv[-1].counter; ++cnt)
 246     if (! dtv[1 + cnt].pointer.is_static
 247         && dtv[1 + cnt].pointer.val != TLS_DTV_UNALLOCATED)
 248       free (dtv[1 + cnt].pointer.val);
 249   memset (dtv, '\0', (dtv[-1].counter + 1) * sizeof (dtv_t));
 250
 251   /* Re-initialize the TLS.  */
 252   _dl_allocate_tls_init (TLS_TPADJ (result));
 253
 254   return result;
 255 }
 256
 257
 258 /* Free stacks until cache size is lower than LIMIT.  */
 259 void
 260 __free_stacks (size_t limit)
 261 {
 262   /* We reduce the size of the cache.  Remove the last entries until
 263      the size is below the limit.  */
 264   list_t *entry;
 265   list_t *prev;
 266
 267   /* Search from the end of the list.  */
 268   list_for_each_prev_safe (entry, prev, &stack_cache)
 269     {
 270       struct pthread *curr;
 271
 272       curr = list_entry (entry, struct pthread, list);
 273       if (FREE_P (curr))
 274         {
 275           /* Unlink the block.  */
 276           stack_list_del (entry);
 277
 278           /* Account for the freed memory.  */
 279           stack_cache_actsize -= curr->stackblock_size;
 280
 281           /* Free the memory associated with the ELF TLS.  */
 282           _dl_deallocate_tls (TLS_TPADJ (curr), false);
 283
 284           /* Remove this block.  This should never fail.  If it does
 285              something is really wrong.  */
 286           if (munmap (curr->stackblock, curr->stackblock_size) != 0)
 287             abort ();
 288
 289           /* Maybe we have freed enough.  */
 290           if (stack_cache_actsize <= limit)
 291             break;
 292         }
 293     }
 294 }
 295
 296
 297 /* Add a stack frame which is not used anymore to the stack.  Must be
 298    called with the cache lock held.  */
 299 static inline void
 300 __attribute ((always_inline))
 301 queue_stack (struct pthread *stack)
 302 {
 303   /* We unconditionally add the stack to the list.  The memory may
 304      still be in use but it will not be reused until the kernel marks
 305      the stack as not used anymore.  */
 306   stack_list_add (&stack->list, &stack_cache);
 307
 308   stack_cache_actsize += stack->stackblock_size;
 309   if (__builtin_expect (stack_cache_actsize > stack_cache_maxsize, 0))
 310     __free_stacks (stack_cache_maxsize);
 311 }
 312
 313
 314 static int
 315 internal_function
 316 change_stack_perm (struct pthread *pd
 317 #ifdef NEED_SEPARATE_REGISTER_STACK
 318                    , size_t pagemask
 319 #endif
 320                    )
 321 {
 322 #ifdef NEED_SEPARATE_REGISTER_STACK
 323   void *stack = (pd->stackblock
 324                  + (((((pd->stackblock_size - pd->guardsize) / 2)
 325                       & pagemask) + pd->guardsize) & pagemask));
 326   size_t len = pd->stackblock + pd->stackblock_size - stack;
 327 #elif _STACK_GROWS_DOWN
 328   void *stack = pd->stackblock + pd->guardsize;
 329   size_t len = pd->stackblock_size - pd->guardsize;
 330 #elif _STACK_GROWS_UP
 331   void *stack = pd->stackblock;
 332   size_t len = (uintptr_t) pd - pd->guardsize - (uintptr_t) pd->stackblock;
 333 #else
 334 # error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
 335 #endif
 336   if (mprotect (stack, len, PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
 337     return errno;
 338
 339   return 0;
 340 }
 341
 342
 343 /* Returns a usable stack for a new thread either by allocating a
 344    new stack or reusing a cached stack of sufficient size.
 345    ATTR must be non-NULL and point to a valid pthread_attr.
 346    PDP must be non-NULL.  */
 347 static int
 348 allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
 349                 ALLOCATE_STACK_PARMS)
 350 {
 351   struct pthread *pd;
 352   size_t size;
 353   size_t pagesize_m1 = __getpagesize () - 1;
 354   void *stacktop;
 355
 356   assert (powerof2 (pagesize_m1 + 1));
 357   assert (TCB_ALIGNMENT >= STACK_ALIGN);
 358
 359   /* Get the stack size from the attribute if it is set.  Otherwise we
 360      use the default we determined at start time.  */
 361   size = attr->stacksize ?: __default_pthread_attr.stacksize;
 362
 363   /* Get memory for the stack.  */
 364   if (__builtin_expect (attr->flags & ATTR_FLAG_STACKADDR, 0))
 365     {
 366       uintptr_t adj;
 367
 368       /* If the user also specified the size of the stack make sure it
 369          is large enough.  */
 370       if (attr->stacksize != 0
 371           && attr->stacksize < (__static_tls_size + MINIMAL_REST_STACK))
 372         return EINVAL;
 373
 374       /* Adjust stack size for alignment of the TLS block.  */
 375 #if TLS_TCB_AT_TP
 376       adj = ((uintptr_t) attr->stackaddr - TLS_TCB_SIZE)
 377             & __static_tls_align_m1;
 378       assert (size > adj + TLS_TCB_SIZE);
 379 #elif TLS_DTV_AT_TP
 380       adj = ((uintptr_t) attr->stackaddr - __static_tls_size)
 381             & __static_tls_align_m1;
 382       assert (size > adj);
 383 #endif
 384
 385       /* The user provided some memory.  Let's hope it matches the
 386          size...  We do not allocate guard pages if the user provided
 387          the stack.  It is the user's responsibility to do this if it
 388          is wanted.  */
 389 #if TLS_TCB_AT_TP
 390       pd = (struct pthread *) ((uintptr_t) attr->stackaddr
 391                                - TLS_TCB_SIZE - adj);
 392 #elif TLS_DTV_AT_TP
 393       pd = (struct pthread *) (((uintptr_t) attr->stackaddr
 394                                 - __static_tls_size - adj)
 395                                - TLS_PRE_TCB_SIZE);
 396 #endif
 397
 398       /* The user provided stack memory needs to be cleared.  */
 399       memset (pd, '\0', sizeof (struct pthread));
 400
 401       /* The first TSD block is included in the TCB.  */
 402       pd->specific[0] = pd->specific_1stblock;
 403
 404       /* Remember the stack-related values.  */
 405       pd->stackblock = (char *) attr->stackaddr - size;
 406       pd->stackblock_size = size;
 407
 408       /* This is a user-provided stack.  It will not be queued in the
 409          stack cache nor will the memory (except the TLS memory) be freed.  */
 410       pd->user_stack = true;
 411
 412       /* This is at least the second thread.  */
 413       pd->header.multiple_threads = 1;
 414 #ifndef TLS_MULTIPLE_THREADS_IN_TCB
 415       __pthread_multiple_threads = *__libc_multiple_threads_ptr = 1;
 416 #endif
 417
 418 #ifndef __ASSUME_PRIVATE_FUTEX
 419       /* The thread must know when private futexes are supported.  */
 420       pd->header.private_futex = THREAD_GETMEM (THREAD_SELF,
 421                                                 header.private_futex);
 422 #endif
 423
 424 #ifdef NEED_DL_SYSINFO
 425       /* Copy the sysinfo value from the parent.  */
 426       THREAD_SYSINFO(pd) = THREAD_SELF_SYSINFO;
 427 #endif
 428
 429       /* The process ID is also the same as that of the caller.  */
 430       pd->pid = THREAD_GETMEM (THREAD_SELF, pid);
 431
 432       /* Don't allow setxid until cloned.  */
 433       pd->setxid_futex = -1;
 434
 435       /* Allocate the DTV for this thread.  */
 436       if (_dl_allocate_tls (TLS_TPADJ (pd)) == NULL)
 437         {
 438           /* Something went wrong.  */
 439           assert (errno == ENOMEM);
 440           return errno;
 441         }
 442
 443
 444       /* Prepare to modify global data.  */
 445       lll_lock (stack_cache_lock, LLL_PRIVATE);
 446
 447       /* And add to the list of stacks in use.  */
 448       list_add (&pd->list, &__stack_user);
 449
 450       lll_unlock (stack_cache_lock, LLL_PRIVATE);
 451     }
 452   else
 453     {
 454       /* Allocate some anonymous memory.  If possible use the cache.  */
 455       size_t guardsize;
 456       size_t reqsize;
 457       void *mem;
 458       const int prot = (PROT_READ | PROT_WRITE
 459                         | ((GL(dl_stack_flags) & PF_X) ? PROT_EXEC : 0));
 460
 461 #if COLORING_INCREMENT != 0
 462       /* Add one more page for stack coloring.  Don't do it for stacks
 463          with 16 times pagesize or larger.  This might just cause
 464          unnecessary misalignment.  */
 465       if (size <= 16 * pagesize_m1)
 466         size += pagesize_m1 + 1;
 467 #endif
 468
 469       /* Adjust the stack size for alignment.  */
 470       size &= ~__static_tls_align_m1;
 471       assert (size != 0);
 472
 473       /* Make sure the size of the stack is enough for the guard and
 474          eventually the thread descriptor.  */
 475       guardsize = (attr->guardsize + pagesize_m1) & ~pagesize_m1;
 476       if (__builtin_expect (size < ((guardsize + __static_tls_size
 477                                      + MINIMAL_REST_STACK + pagesize_m1)
 478                                     & ~pagesize_m1),
 479                             0))
 480         /* The stack is too small (or the guard too large).  */
 481         return EINVAL;
 482
 483       /* Try to get a stack from the cache.  */
 484       reqsize = size;
 485       pd = get_cached_stack (&size, &mem);
 486       if (pd == NULL)
 487         {
 488           /* To avoid aliasing effects on a larger scale than pages we
 489              adjust the allocated stack size if necessary.  This way
 490              allocations directly following each other will not have
 491              aliasing problems.  */
 492 #if MULTI_PAGE_ALIASING != 0
 493           if ((size % MULTI_PAGE_ALIASING) == 0)
 494             size += pagesize_m1 + 1;
 495 #endif
 496
 497           mem = mmap (NULL, size, prot,
 498                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
 499
 500           if (__builtin_expect (mem == MAP_FAILED, 0))
 501             return errno;
 502
 503           /* SIZE is guaranteed to be greater than zero.
 504              So we can never get a null pointer back from mmap.  */
 505           assert (mem != NULL);
 506
 507 #if COLORING_INCREMENT != 0
 508           /* Atomically increment NCREATED.  */
 509           unsigned int ncreated = atomic_increment_val (&nptl_ncreated);
 510
 511           /* We chose the offset for coloring by incrementing it for
 512              every new thread by a fixed amount.  The offset used
 513              module the page size.  Even if coloring would be better
 514              relative to higher alignment values it makes no sense to
 515              do it since the mmap() interface does not allow us to
 516              specify any alignment for the returned memory block.  */
 517           size_t coloring = (ncreated * COLORING_INCREMENT) & pagesize_m1;
 518
 519           /* Make sure the coloring offsets does not disturb the alignment
 520              of the TCB and static TLS block.  */
 521           if (__builtin_expect ((coloring & __static_tls_align_m1) != 0, 0))
 522             coloring = (((coloring + __static_tls_align_m1)
 523                          & ~(__static_tls_align_m1))
 524                         & ~pagesize_m1);
 525 #else
 526           /* Unless specified we do not make any adjustments.  */
 527 # define coloring 0
 528 #endif
 529
 530           /* Place the thread descriptor at the end of the stack.  */
 531 #if TLS_TCB_AT_TP
 532           pd = (struct pthread *) ((char *) mem + size - coloring) - 1;
 533 #elif TLS_DTV_AT_TP
 534           pd = (struct pthread *) ((((uintptr_t) mem + size - coloring
 535                                     - __static_tls_size)
 536                                     & ~__static_tls_align_m1)
 537                                    - TLS_PRE_TCB_SIZE);
 538 #endif
 539
 540           /* Remember the stack-related values.  */
 541           pd->stackblock = mem;
 542           pd->stackblock_size = size;
 543
 544           /* We allocated the first block thread-specific data array.
 545              This address will not change for the lifetime of this
 546              descriptor.  */
 547           pd->specific[0] = pd->specific_1stblock;
 548
 549           /* This is at least the second thread.  */
 550           pd->header.multiple_threads = 1;
 551 #ifndef TLS_MULTIPLE_THREADS_IN_TCB
 552           __pthread_multiple_threads = *__libc_multiple_threads_ptr = 1;
 553 #endif
 554
 555 #ifndef __ASSUME_PRIVATE_FUTEX
 556           /* The thread must know when private futexes are supported.  */
 557           pd->header.private_futex = THREAD_GETMEM (THREAD_SELF,
 558                                                     header.private_futex);
 559 #endif
 560
 561 #ifdef NEED_DL_SYSINFO
 562           /* Copy the sysinfo value from the parent.  */
 563           THREAD_SYSINFO(pd) = THREAD_SELF_SYSINFO;
 564 #endif
 565
 566           /* Don't allow setxid until cloned.  */
 567           pd->setxid_futex = -1;
 568
 569           /* The process ID is also the same as that of the caller.  */
 570           pd->pid = THREAD_GETMEM (THREAD_SELF, pid);
 571
 572           /* Allocate the DTV for this thread.  */
 573           if (_dl_allocate_tls (TLS_TPADJ (pd)) == NULL)
 574             {
 575               /* Something went wrong.  */
 576               assert (errno == ENOMEM);
 577
 578               /* Free the stack memory we just allocated.  */
 579               (void) munmap (mem, size);
 580
 581               return errno;
 582             }
 583
 584
 585           /* Prepare to modify global data.  */
 586           lll_lock (stack_cache_lock, LLL_PRIVATE);
 587
 588           /* And add to the list of stacks in use.  */
 589           stack_list_add (&pd->list, &stack_used);
 590
 591           lll_unlock (stack_cache_lock, LLL_PRIVATE);
 592
 593
 594           /* There might have been a race.  Another thread might have
 595              caused the stacks to get exec permission while this new
 596              stack was prepared.  Detect if this was possible and
 597              change the permission if necessary.  */
 598           if (__builtin_expect ((GL(dl_stack_flags) & PF_X) != 0
 599                                 && (prot & PROT_EXEC) == 0, 0))
 600             {
 601               int err = change_stack_perm (pd
 602 #ifdef NEED_SEPARATE_REGISTER_STACK
 603                                            , ~pagesize_m1
 604 #endif
 605                                            );
 606               if (err != 0)
 607                 {
 608                   /* Free the stack memory we just allocated.  */
 609                   (void) munmap (mem, size);
 610
 611                   return err;
 612                 }
 613             }
 614
 615
 616           /* Note that all of the stack and the thread descriptor is
 617              zeroed.  This means we do not have to initialize fields
 618              with initial value zero.  This is specifically true for
 619              the 'tid' field which is always set back to zero once the
 620              stack is not used anymore and for the 'guardsize' field
 621              which will be read next.  */
 622         }
 623
 624       /* Create or resize the guard area if necessary.  */
 625       if (__builtin_expect (guardsize > pd->guardsize, 0))
 626         {
 627 #ifdef NEED_SEPARATE_REGISTER_STACK
 628           char *guard = mem + (((size - guardsize) / 2) & ~pagesize_m1);
 629 #elif _STACK_GROWS_DOWN
 630           char *guard = mem;
 631 # elif _STACK_GROWS_UP
 632           char *guard = (char *) (((uintptr_t) pd - guardsize) & ~pagesize_m1);
 633 #endif
 634           if (mprotect (guard, guardsize, PROT_NONE) != 0)
 635             {
 636             mprot_error:
 637               lll_lock (stack_cache_lock, LLL_PRIVATE);
 638
 639               /* Remove the thread from the list.  */
 640               stack_list_del (&pd->list);
 641
 642               lll_unlock (stack_cache_lock, LLL_PRIVATE);
 643
 644               /* Get rid of the TLS block we allocated.  */
 645               _dl_deallocate_tls (TLS_TPADJ (pd), false);
 646
 647               /* Free the stack memory regardless of whether the size
 648                  of the cache is over the limit or not.  If this piece
 649                  of memory caused problems we better do not use it
 650                  anymore.  Uh, and we ignore possible errors.  There
 651                  is nothing we could do.  */
 652               (void) munmap (mem, size);
 653
 654               return errno;
 655             }
 656
 657           pd->guardsize = guardsize;
 658         }
 659       else if (__builtin_expect (pd->guardsize - guardsize > size - reqsize,
 660                                  0))
 661         {
 662           /* The old guard area is too large.  */
 663
 664 #ifdef NEED_SEPARATE_REGISTER_STACK
 665           char *guard = mem + (((size - guardsize) / 2) & ~pagesize_m1);
 666           char *oldguard = mem + (((size - pd->guardsize) / 2) & ~pagesize_m1);
 667
 668           if (oldguard < guard
 669               && mprotect (oldguard, guard - oldguard, prot) != 0)
 670             goto mprot_error;
 671
 672           if (mprotect (guard + guardsize,
 673                         oldguard + pd->guardsize - guard - guardsize,
 674                         prot) != 0)
 675             goto mprot_error;
 676 #elif _STACK_GROWS_DOWN
 677           if (mprotect ((char *) mem + guardsize, pd->guardsize - guardsize,
 678                         prot) != 0)
 679             goto mprot_error;
 680 #elif _STACK_GROWS_UP
 681           if (mprotect ((char *) pd - pd->guardsize,
 682                         pd->guardsize - guardsize, prot) != 0)
 683             goto mprot_error;
 684 #endif
 685
 686           pd->guardsize = guardsize;
 687         }
 688       /* The pthread_getattr_np() calls need to get passed the size
 689          requested in the attribute, regardless of how large the
 690          actually used guardsize is.  */
 691       pd->reported_guardsize = guardsize;
 692     }
 693
 694   /* Initialize the lock.  We have to do this unconditionally since the
 695      stillborn thread could be canceled while the lock is taken.  */
 696   pd->lock = LLL_LOCK_INITIALIZER;
 697
 698   /* The robust mutex lists also need to be initialized
 699      unconditionally because the cleanup for the previous stack owner
 700      might have happened in the kernel.  */
 701   pd->robust_head.futex_offset = (offsetof (pthread_mutex_t, __data.__lock)
 702                                   - offsetof (pthread_mutex_t,
 703                                               __data.__list.__next));
 704   pd->robust_head.list_op_pending = NULL;
 705 #ifdef __PTHREAD_MUTEX_HAVE_PREV
 706   pd->robust_prev = &pd->robust_head;
 707 #endif
 708   pd->robust_head.list = &pd->robust_head;
 709
 710   /* We place the thread descriptor at the end of the stack.  */
 711   *pdp = pd;
 712
 713 #if TLS_TCB_AT_TP
 714   /* The stack begins before the TCB and the static TLS block.  */
 715   stacktop = ((char *) (pd + 1) - __static_tls_size);
 716 #elif TLS_DTV_AT_TP
 717   stacktop = (char *) (pd - 1);
 718 #endif
 719
 720 #ifdef NEED_SEPARATE_REGISTER_STACK
 721   *stack = pd->stackblock;
 722   *stacksize = stacktop - *stack;
 723 #elif _STACK_GROWS_DOWN
 724   *stack = stacktop;
 725 #elif _STACK_GROWS_UP
 726   *stack = pd->stackblock;
 727   assert (*stack > 0);
 728 #endif
 729
 730   return 0;
 731 }
 732
 733
 734 void
 735 internal_function
 736 __deallocate_stack (struct pthread *pd)
 737 {
 738   lll_lock (stack_cache_lock, LLL_PRIVATE);
 739
 740   /* Remove the thread from the list of threads with user defined
 741      stacks.  */
 742   stack_list_del (&pd->list);
 743
 744   /* Not much to do.  Just free the mmap()ed memory.  Note that we do
 745      not reset the 'used' flag in the 'tid' field.  This is done by
 746      the kernel.  If no thread has been created yet this field is
 747      still zero.  */
 748   if (__builtin_expect (! pd->user_stack, 1))
 749     (void) queue_stack (pd);
 750   else
 751     /* Free the memory associated with the ELF TLS.  */
 752     _dl_deallocate_tls (TLS_TPADJ (pd), false);
 753
 754   lll_unlock (stack_cache_lock, LLL_PRIVATE);
 755 }
 756
 757
 758 int
 759 internal_function
 760 __make_stacks_executable (void **stack_endp)
 761 {
 762   /* First the main thread's stack.  */
 763   int err = _dl_make_stack_executable (stack_endp);
 764   if (err != 0)
 765     return err;
 766
 767 #ifdef NEED_SEPARATE_REGISTER_STACK
 768   const size_t pagemask = ~(__getpagesize () - 1);
 769 #endif
 770
 771   lll_lock (stack_cache_lock, LLL_PRIVATE);
 772
 773   list_t *runp;
 774   list_for_each (runp, &stack_used)
 775     {
 776       err = change_stack_perm (list_entry (runp, struct pthread, list)
 777 #ifdef NEED_SEPARATE_REGISTER_STACK
 778                                , pagemask
 779 #endif
 780                                );
 781       if (err != 0)
 782         break;
 783     }
 784
 785   /* Also change the permission for the currently unused stacks.  This
 786      might be wasted time but better spend it here than adding a check
 787      in the fast path.  */
 788   if (err == 0)
 789     list_for_each (runp, &stack_cache)
 790       {
 791         err = change_stack_perm (list_entry (runp, struct pthread, list)
 792 #ifdef NEED_SEPARATE_REGISTER_STACK
 793                                  , pagemask
 794 #endif
 795                                  );
 796         if (err != 0)
 797           break;
 798       }
 799
 800   lll_unlock (stack_cache_lock, LLL_PRIVATE);
 801
 802   return err;
 803 }
 804
 805
 806 /* In case of a fork() call the memory allocation in the child will be
 807    the same but only one thread is running.  All stacks except that of
 808    the one running thread are not used anymore.  We have to recycle
 809    them.  */
 810 void
 811 __reclaim_stacks (void)
 812 {
 813   struct pthread *self = (struct pthread *) THREAD_SELF;
 814
 815   /* No locking necessary.  The caller is the only stack in use.  But
 816      we have to be aware that we might have interrupted a list
 817      operation.  */
 818
 819   if (in_flight_stack != 0)
 820     {
 821       bool add_p = in_flight_stack & 1;
 822       list_t *elem = (list_t *) (in_flight_stack & ~(uintptr_t) 1);
 823
 824       if (add_p)
 825         {
 826           /* We always add at the beginning of the list.  So in this
 827              case we only need to check the beginning of these lists.  */
 828           int check_list (list_t *l)
 829           {
 830             if (l->next->prev != l)
 831               {
 832                 assert (l->next->prev == elem);
 833
 834                 elem->next = l->next;
 835                 elem->prev = l;
 836                 l->next = elem;
 837
 838                 return 1;
 839               }
 840
 841             return 0;
 842           }
 843
 844           if (check_list (&stack_used) == 0)
 845             (void) check_list (&stack_cache);
 846         }
 847       else
 848         {
 849           /* We can simply always replay the delete operation.  */
 850           elem->next->prev = elem->prev;
 851           elem->prev->next = elem->next;
 852         }
 853     }
 854
 855   /* Mark all stacks except the still running one as free.  */
 856   list_t *runp;
 857   list_for_each (runp, &stack_used)
 858     {
 859       struct pthread *curp = list_entry (runp, struct pthread, list);
 860       if (curp != self)
 861         {
 862           /* This marks the stack as free.  */
 863           curp->tid = 0;
 864
 865           /* The PID field must be initialized for the new process.  */
 866           curp->pid = self->pid;
 867
 868           /* Account for the size of the stack.  */
 869           stack_cache_actsize += curp->stackblock_size;
 870
 871           if (curp->specific_used)
 872             {
 873               /* Clear the thread-specific data.  */
 874               memset (curp->specific_1stblock, '\0',
 875                       sizeof (curp->specific_1stblock));
 876
 877               curp->specific_used = false;
 878
 879               for (size_t cnt = 1; cnt < PTHREAD_KEY_1STLEVEL_SIZE; ++cnt)
 880                 if (curp->specific[cnt] != NULL)
 881                   {
 882                     memset (curp->specific[cnt], '\0',
 883                             sizeof (curp->specific_1stblock));
 884
 885                     /* We have allocated the block which we do not
 886                        free here so re-set the bit.  */
 887                     curp->specific_used = true;
 888                   }
 889             }
 890         }
 891     }
 892
 893   /* Reset the PIDs in any cached stacks.  */
 894   list_for_each (runp, &stack_cache)
 895     {
 896       struct pthread *curp = list_entry (runp, struct pthread, list);
 897       curp->pid = self->pid;
 898     }
 899
 900   /* Add the stack of all running threads to the cache.  */
 901   list_splice (&stack_used, &stack_cache);
 902
 903   /* Remove the entry for the current thread to from the cache list
 904      and add it to the list of running threads.  Which of the two
 905      lists is decided by the user_stack flag.  */
 906   stack_list_del (&self->list);
 907
 908   /* Re-initialize the lists for all the threads.  */
 909   INIT_LIST_HEAD (&stack_used);
 910   INIT_LIST_HEAD (&__stack_user);
 911
 912   if (__builtin_expect (THREAD_GETMEM (self, user_stack), 0))
 913     list_add (&self->list, &__stack_user);
 914   else
 915     list_add (&self->list, &stack_used);
 916
 917   /* There is one thread running.  */
 918   __nptl_nthreads = 1;
 919
 920   in_flight_stack = 0;
 921
 922   /* Initialize the lock.  */
 923   stack_cache_lock = LLL_LOCK_INITIALIZER;
 924 }
 925
 926
 927 #if HP_TIMING_AVAIL
 928 # undef __find_thread_by_id
 929 /* Find a thread given the thread ID.  */
 930 attribute_hidden
 931 struct pthread *
 932 __find_thread_by_id (pid_t tid)
 933 {
 934   struct pthread *result = NULL;
 935
 936   lll_lock (stack_cache_lock, LLL_PRIVATE);
 937
 938   /* Iterate over the list with system-allocated threads first.  */
 939   list_t *runp;
 940   list_for_each (runp, &stack_used)
 941     {
 942       struct pthread *curp;
 943
 944       curp = list_entry (runp, struct pthread, list);
 945
 946       if (curp->tid == tid)
 947         {
 948           result = curp;
 949           goto out;
 950         }
 951     }
 952
 953   /* Now the list with threads using user-allocated stacks.  */
 954   list_for_each (runp, &__stack_user)
 955     {
 956       struct pthread *curp;
 957
 958       curp = list_entry (runp, struct pthread, list);
 959
 960       if (curp->tid == tid)
 961         {
 962           result = curp;
 963           goto out;
 964         }
 965     }
 966
 967  out:
 968   lll_unlock (stack_cache_lock, LLL_PRIVATE);
 969
 970   return result;
 971 }
 972 #endif
 973
 974
 975 static void
 976 internal_function
 977 setxid_mark_thread (struct xid_command *cmdp, struct pthread *t)
 978 {
 979   int ch;
 980
 981   /* Wait until this thread is cloned.  */
 982   if (t->setxid_futex == -1
 983       && ! atomic_compare_and_exchange_bool_acq (&t->setxid_futex, -2, -1))
 984     do
 985       lll_futex_wait (&t->setxid_futex, -2, LLL_PRIVATE);
 986     while (t->setxid_futex == -2);
 987
 988   /* Don't let the thread exit before the setxid handler runs.  */
 989   t->setxid_futex = 0;
 990
 991   do
 992     {
 993       ch = t->cancelhandling;
 994
 995       /* If the thread is exiting right now, ignore it.  */
 996       if ((ch & EXITING_BITMASK) != 0)
 997         {
 998           /* Release the futex if there is no other setxid in
 999              progress.  */
1000           if ((ch & SETXID_BITMASK) == 0)
1001             {
1002               t->setxid_futex = 1;
1003               lll_futex_wake (&t->setxid_futex, 1, LLL_PRIVATE);
1004             }
1005           return;
1006         }
1007     }
1008   while (atomic_compare_and_exchange_bool_acq (&t->cancelhandling,
1009                                                ch | SETXID_BITMASK, ch));
1010 }
1011
1012
1013 static void
1014 internal_function
1015 setxid_unmark_thread (struct xid_command *cmdp, struct pthread *t)
1016 {
1017   int ch;
1018
1019   do
1020     {
1021       ch = t->cancelhandling;
1022       if ((ch & SETXID_BITMASK) == 0)
1023         return;
1024     }
1025   while (atomic_compare_and_exchange_bool_acq (&t->cancelhandling,
1026                                                ch & ~SETXID_BITMASK, ch));
1027
1028   /* Release the futex just in case.  */
1029   t->setxid_futex = 1;
1030   lll_futex_wake (&t->setxid_futex, 1, LLL_PRIVATE);
1031 }
1032
1033
1034 static int
1035 internal_function
1036 setxid_signal_thread (struct xid_command *cmdp, struct pthread *t)
1037 {
1038   if ((t->cancelhandling & SETXID_BITMASK) == 0)
1039     return 0;
1040
1041   int val;
1042   INTERNAL_SYSCALL_DECL (err);
1043   val = INTERNAL_SYSCALL (tgkill, err, 3, THREAD_GETMEM (THREAD_SELF, pid),
1044                           t->tid, SIGSETXID);
1045
1046   /* If this failed, it must have had not started yet or else exited.  */
1047   if (!INTERNAL_SYSCALL_ERROR_P (val, err))
1048     {
1049       atomic_increment (&cmdp->cntr);
1050       return 1;
1051     }
1052   else
1053     return 0;
1054 }
1055
1056
1057 int
1058 attribute_hidden
1059 __nptl_setxid (struct xid_command *cmdp)
1060 {
1061   int signalled;
1062   int result;
1063   lll_lock (stack_cache_lock, LLL_PRIVATE);
1064
1065   __xidcmd = cmdp;
1066   cmdp->cntr = 0;
1067
1068   struct pthread *self = THREAD_SELF;
1069
1070   /* Iterate over the list with system-allocated threads first.  */
1071   list_t *runp;
1072   list_for_each (runp, &stack_used)
1073     {
1074       struct pthread *t = list_entry (runp, struct pthread, list);
1075       if (t == self)
1076         continue;
1077
1078       setxid_mark_thread (cmdp, t);
1079     }
1080
1081   /* Now the list with threads using user-allocated stacks.  */
1082   list_for_each (runp, &__stack_user)
1083     {
1084       struct pthread *t = list_entry (runp, struct pthread, list);
1085       if (t == self)
1086         continue;
1087
1088       setxid_mark_thread (cmdp, t);
1089     }
1090
1091   /* Iterate until we don't succeed in signalling anyone.  That means
1092      we have gotten all running threads, and their children will be
1093      automatically correct once started.  */
1094   do
1095     {
1096       signalled = 0;
1097
1098       list_for_each (runp, &stack_used)
1099         {
1100           struct pthread *t = list_entry (runp, struct pthread, list);
1101           if (t == self)
1102             continue;
1103
1104           signalled += setxid_signal_thread (cmdp, t);
1105         }
1106
1107       list_for_each (runp, &__stack_user)
1108         {
1109           struct pthread *t = list_entry (runp, struct pthread, list);
1110           if (t == self)
1111             continue;
1112
1113           signalled += setxid_signal_thread (cmdp, t);
1114         }
1115
1116       int cur = cmdp->cntr;
1117       while (cur != 0)
1118         {
1119           lll_futex_wait (&cmdp->cntr, cur, LLL_PRIVATE);
1120           cur = cmdp->cntr;
1121         }
1122     }
1123   while (signalled != 0);
1124
1125   /* Clean up flags, so that no thread blocks during exit waiting
1126      for a signal which will never come.  */
1127   list_for_each (runp, &stack_used)
1128     {
1129       struct pthread *t = list_entry (runp, struct pthread, list);
1130       if (t == self)
1131         continue;
1132
1133       setxid_unmark_thread (cmdp, t);
1134     }
1135
1136   list_for_each (runp, &__stack_user)
1137     {
1138       struct pthread *t = list_entry (runp, struct pthread, list);
1139       if (t == self)
1140         continue;
1141
1142       setxid_unmark_thread (cmdp, t);
1143     }
1144
1145   /* This must be last, otherwise the current thread might not have
1146      permissions to send SIGSETXID syscall to the other threads.  */
1147   INTERNAL_SYSCALL_DECL (err);
1148   result = INTERNAL_SYSCALL_NCS (cmdp->syscall_no, err, 3,
1149                                  cmdp->id[0], cmdp->id[1], cmdp->id[2]);
1150   if (INTERNAL_SYSCALL_ERROR_P (result, err))
1151     {
1152       __set_errno (INTERNAL_SYSCALL_ERRNO (result, err));
1153       result = -1;
1154     }
1155
1156   lll_unlock (stack_cache_lock, LLL_PRIVATE);
1157   return result;
1158 }
1159
1160 static inline void __attribute__((always_inline))
1161 init_one_static_tls (struct pthread *curp, struct link_map *map)
1162 {
1163   dtv_t *dtv = GET_DTV (TLS_TPADJ (curp));
1164 # if TLS_TCB_AT_TP
1165   void *dest = (char *) curp - map->l_tls_offset;
1166 # elif TLS_DTV_AT_TP
1167   void *dest = (char *) curp + map->l_tls_offset + TLS_PRE_TCB_SIZE;
1168 # else
1169 #  error "Either TLS_TCB_AT_TP or TLS_DTV_AT_TP must be defined"
1170 # endif
1171
1172   /* Fill in the DTV slot so that a later LD/GD access will find it.  */
1173   dtv[map->l_tls_modid].pointer.val = dest;
1174   dtv[map->l_tls_modid].pointer.is_static = true;
1175
1176   /* Initialize the memory.  */
1177   memset (__mempcpy (dest, map->l_tls_initimage, map->l_tls_initimage_size),
1178           '\0', map->l_tls_blocksize - map->l_tls_initimage_size);
1179 }
1180
1181 void
1182 attribute_hidden
1183 __pthread_init_static_tls (struct link_map *map)
1184 {
1185   lll_lock (stack_cache_lock, LLL_PRIVATE);
1186
1187   /* Iterate over the list with system-allocated threads first.  */
1188   list_t *runp;
1189   list_for_each (runp, &stack_used)
1190     init_one_static_tls (list_entry (runp, struct pthread, list), map);
1191
1192   /* Now the list with threads using user-allocated stacks.  */
1193   list_for_each (runp, &__stack_user)
1194     init_one_static_tls (list_entry (runp, struct pthread, list), map);
1195
1196   lll_unlock (stack_cache_lock, LLL_PRIVATE);
1197 }
1198
1199
1200 void
1201 attribute_hidden
1202 __wait_lookup_done (void)
1203 {
1204   lll_lock (stack_cache_lock, LLL_PRIVATE);
1205
1206   struct pthread *self = THREAD_SELF;
1207
1208   /* Iterate over the list with system-allocated threads first.  */
1209   list_t *runp;
1210   list_for_each (runp, &stack_used)
1211     {
1212       struct pthread *t = list_entry (runp, struct pthread, list);
1213       if (t == self || t->header.gscope_flag == THREAD_GSCOPE_FLAG_UNUSED)
1214         continue;
1215
1216       int *const gscope_flagp = &t->header.gscope_flag;
1217
1218       /* We have to wait until this thread is done with the global
1219          scope.  First tell the thread that we are waiting and
1220          possibly have to be woken.  */
1221       if (atomic_compare_and_exchange_bool_acq (gscope_flagp,
1222                                                 THREAD_GSCOPE_FLAG_WAIT,
1223                                                 THREAD_GSCOPE_FLAG_USED))
1224         continue;
1225
1226       do
1227         lll_futex_wait (gscope_flagp, THREAD_GSCOPE_FLAG_WAIT, LLL_PRIVATE);
1228       while (*gscope_flagp == THREAD_GSCOPE_FLAG_WAIT);
1229     }
1230
1231   /* Now the list with threads using user-allocated stacks.  */
1232   list_for_each (runp, &__stack_user)
1233     {
1234       struct pthread *t = list_entry (runp, struct pthread, list);
1235       if (t == self || t->header.gscope_flag == THREAD_GSCOPE_FLAG_UNUSED)
1236         continue;
1237
1238       int *const gscope_flagp = &t->header.gscope_flag;
1239
1240       /* We have to wait until this thread is done with the global
1241          scope.  First tell the thread that we are waiting and
1242          possibly have to be woken.  */
1243       if (atomic_compare_and_exchange_bool_acq (gscope_flagp,
1244                                                 THREAD_GSCOPE_FLAG_WAIT,
1245                                                 THREAD_GSCOPE_FLAG_USED))
1246         continue;
1247
1248       do
1249         lll_futex_wait (gscope_flagp, THREAD_GSCOPE_FLAG_WAIT, LLL_PRIVATE);
1250       while (*gscope_flagp == THREAD_GSCOPE_FLAG_WAIT);
1251     }
1252
1253   lll_unlock (stack_cache_lock, LLL_PRIVATE);
1254 }