nptl/allocatestack.c

   1 /* Copyright (C) 2002-2007, 2009 Free Software Foundation, Inc.
   2    This file is part of the GNU C Library.
   3    Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, write to the Free
  17    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  18    02111-1307 USA.  */
  19
  20 #include <assert.h>
  21 #include <errno.h>
  22 #include <signal.h>
  23 #include <stdint.h>
  24 #include <string.h>
  25 #include <unistd.h>
  26 #include <sys/mman.h>
  27 #include <sys/param.h>
  28 #include <dl-sysdep.h>
  29 #include <tls.h>
  30 #include <lowlevellock.h>
  31 #include <kernel-features.h>
  32
  33
#ifndef NEED_SEPARATE_REGISTER_STACK

/* Most architectures have exactly one stack pointer.  Some have more.
   This is the single-stack variant: only the stack address is tracked.  */
# define STACK_VARIABLES void *stackaddr = NULL

/* How to pass the values to the 'create_thread' function.  */
# define STACK_VARIABLES_ARGS stackaddr

/* How to declare a function which receives these parameters.  */
# define STACK_VARIABLES_PARMS void *stackaddr

/* How to declare allocate_stack: the trailing output parameter(s) through
   which it reports the stack to the caller.  */
# define ALLOCATE_STACK_PARMS void **stack

/* This is how the function is called.  We do it this way to allow
   other variants of the function to have more parameters.  The macro
   relies on a variable named 'stackaddr' (see STACK_VARIABLES) being in
   scope at the point of use.  */
# define ALLOCATE_STACK(attr, pd) allocate_stack (attr, pd, &stackaddr)

#else

/* We need two stacks.  The kernel will place them but we have to tell
   the kernel about the size of the reserved address space.  Hence the
   size is tracked in addition to the address.  */
# define STACK_VARIABLES void *stackaddr = NULL; size_t stacksize = 0

/* How to pass the values to the 'create_thread' function.  */
# define STACK_VARIABLES_ARGS stackaddr, stacksize

/* How to declare a function which receives these parameters.  */
# define STACK_VARIABLES_PARMS void *stackaddr, size_t stacksize

/* How to declare allocate_stack: the trailing output parameter(s) through
   which it reports the stack to the caller.  */
# define ALLOCATE_STACK_PARMS void **stack, size_t *stacksize

/* This is how the function is called.  We do it this way to allow
   other variants of the function to have more parameters.  The macro
   relies on variables named 'stackaddr' and 'stacksize' (see
   STACK_VARIABLES) being in scope at the point of use.  */
# define ALLOCATE_STACK(attr, pd) \
  allocate_stack (attr, pd, &stackaddr, &stacksize)

#endif
  73
  74
/* Default alignment of stack.  Can be overridden by defining
   STACK_ALIGN before this point.  */
#ifndef STACK_ALIGN
# define STACK_ALIGN __alignof__ (long double)
#endif

/* Default value for minimal stack size after allocating thread
   descriptor and guard.  It is added to the static TLS size when
   allocate_stack validates a requested stack size.  */
#ifndef MINIMAL_REST_STACK
# define MINIMAL_REST_STACK     4096
#endif


/* Newer kernels have the MAP_STACK flag to indicate a mapping is used for
   a stack.  Use it when possible.  When the headers do not provide it,
   it is defined as 0 so that OR-ing it into the mmap flags has no
   effect.  */
#ifndef MAP_STACK
# define MAP_STACK 0
#endif

/* This yields the pointer that TLS support code calls the thread pointer.
   With TLS_TCB_AT_TP it is the descriptor itself; with TLS_DTV_AT_TP it
   lies TLS_PRE_TCB_SIZE bytes past the start of the descriptor.  */
#if TLS_TCB_AT_TP
# define TLS_TPADJ(pd) (pd)
#elif TLS_DTV_AT_TP
# define TLS_TPADJ(pd) ((struct pthread *)((char *) (pd) + TLS_PRE_TCB_SIZE))
#endif
  99
/* Cache handling for not-yet free stacks.  */

/* Maximum size of the cache in bytes, and the number of bytes currently
   accounted to it.  The two are compared directly with the sum of the
   'stackblock_size' values of the cached stacks.  */
static size_t stack_cache_maxsize = 40 * 1024 * 1024; /* 40 MiB by default.  */
static size_t stack_cache_actsize;

/* Lock taken around modifications of the cache accounting and of the
   stack lists below.  */
static int stack_cache_lock = LLL_LOCK_INITIALIZER;

/* List of queued stack frames.  */
static LIST_HEAD (stack_cache);

/* List of the stacks in use.  */
static LIST_HEAD (stack_used);

/* We need to record what list operations we are going to do so that,
   in case of an asynchronous interruption due to a fork() call, we
   can correct for the work.  The value is the address of the list
   element being operated on; the lowest bit is set for an insertion
   and clear for a removal.  Zero means no operation is in progress.  */
static uintptr_t in_flight_stack;

/* List of the threads with user provided stacks in use.  No need to
   initialize this, since it's done in __pthread_initialize_minimal.  */
list_t __stack_user __attribute__ ((nocommon));
hidden_data_def (__stack_user)

#if COLORING_INCREMENT != 0
/* Number of threads created.  Used to derive the coloring offset of
   newly mapped stacks.  */
static unsigned int nptl_ncreated;
#endif


/* Check whether the stack is still used or not.  A descriptor whose
   'tid' field is zero or negative is considered free.  */
#define FREE_P(descr) ((descr)->tid <= 0)
 133
 134
 135 static void
 136 stack_list_del (list_t *elem)
 137 {
 138   in_flight_stack = (uintptr_t) elem;
 139
 140   atomic_write_barrier ();
 141
 142   list_del (elem);
 143
 144   atomic_write_barrier ();
 145
 146   in_flight_stack = 0;
 147 }
 148
 149
 150 static void
 151 stack_list_add (list_t *elem, list_t *list)
 152 {
 153   in_flight_stack = (uintptr_t) elem | 1;
 154
 155   atomic_write_barrier ();
 156
 157   list_add (elem, list);
 158
 159   atomic_write_barrier ();
 160
 161   in_flight_stack = 0;
 162 }
 163
 164
/* We create a doubly linked list of all cache entries.  Doubly linked
   because this allows removing entries from the end.  */
 167
 168
 169 /* Get a stack frame from the cache.  We have to match by size since
 170    some blocks might be too small or far too large.  */
 171 static struct pthread *
 172 get_cached_stack (size_t *sizep, void **memp)
 173 {
 174   size_t size = *sizep;
 175   struct pthread *result = NULL;
 176   list_t *entry;
 177
 178   lll_lock (stack_cache_lock, LLL_PRIVATE);
 179
 180   /* Search the cache for a matching entry.  We search for the
 181      smallest stack which has at least the required size.  Note that
 182      in normal situations the size of all allocated stacks is the
 183      same.  As the very least there are only a few different sizes.
 184      Therefore this loop will exit early most of the time with an
 185      exact match.  */
 186   list_for_each (entry, &stack_cache)
 187     {
 188       struct pthread *curr;
 189
 190       curr = list_entry (entry, struct pthread, list);
 191       if (FREE_P (curr) && curr->stackblock_size >= size)
 192         {
 193           if (curr->stackblock_size == size)
 194             {
 195               result = curr;
 196               break;
 197             }
 198
 199           if (result == NULL
 200               || result->stackblock_size > curr->stackblock_size)
 201             result = curr;
 202         }
 203     }
 204
 205   if (__builtin_expect (result == NULL, 0)
 206       /* Make sure the size difference is not too excessive.  In that
 207          case we do not use the block.  */
 208       || __builtin_expect (result->stackblock_size > 4 * size, 0))
 209     {
 210       /* Release the lock.  */
 211       lll_unlock (stack_cache_lock, LLL_PRIVATE);
 212
 213       return NULL;
 214     }
 215
 216   /* Dequeue the entry.  */
 217   stack_list_del (&result->list);
 218
 219   /* And add to the list of stacks in use.  */
 220   stack_list_add (&result->list, &stack_used);
 221
 222   /* And decrease the cache size.  */
 223   stack_cache_actsize -= result->stackblock_size;
 224
 225   /* Release the lock early.  */
 226   lll_unlock (stack_cache_lock, LLL_PRIVATE);
 227
 228   /* Report size and location of the stack to the caller.  */
 229   *sizep = result->stackblock_size;
 230   *memp = result->stackblock;
 231
 232   /* Cancellation handling is back to the default.  */
 233   result->cancelhandling = 0;
 234   result->cleanup = NULL;
 235
 236   /* No pending event.  */
 237   result->nextevent = NULL;
 238
 239   /* Clear the DTV.  */
 240   dtv_t *dtv = GET_DTV (TLS_TPADJ (result));
 241   memset (dtv, '\0', (dtv[-1].counter + 1) * sizeof (dtv_t));
 242
 243   /* Re-initialize the TLS.  */
 244   _dl_allocate_tls_init (TLS_TPADJ (result));
 245
 246   return result;
 247 }
 248
 249
 250 /* Free stacks until cache size is lower than LIMIT.  */
 251 static void
 252 free_stacks (size_t limit)
 253 {
 254   /* We reduce the size of the cache.  Remove the last entries until
 255      the size is below the limit.  */
 256   list_t *entry;
 257   list_t *prev;
 258
 259   /* Search from the end of the list.  */
 260   list_for_each_prev_safe (entry, prev, &stack_cache)
 261     {
 262       struct pthread *curr;
 263
 264       curr = list_entry (entry, struct pthread, list);
 265       if (FREE_P (curr))
 266         {
 267           /* Unlink the block.  */
 268           stack_list_del (entry);
 269
 270           /* Account for the freed memory.  */
 271           stack_cache_actsize -= curr->stackblock_size;
 272
 273           /* Free the memory associated with the ELF TLS.  */
 274           _dl_deallocate_tls (TLS_TPADJ (curr), false);
 275
 276           /* Remove this block.  This should never fail.  If it does
 277              something is really wrong.  */
 278           if (munmap (curr->stackblock, curr->stackblock_size) != 0)
 279             abort ();
 280
 281           /* Maybe we have freed enough.  */
 282           if (stack_cache_actsize <= limit)
 283             break;
 284         }
 285     }
 286 }
 287
 288
 289 /* Add a stack frame which is not used anymore to the stack.  Must be
 290    called with the cache lock held.  */
 291 static inline void
 292 __attribute ((always_inline))
 293 queue_stack (struct pthread *stack)
 294 {
 295   /* We unconditionally add the stack to the list.  The memory may
 296      still be in use but it will not be reused until the kernel marks
 297      the stack as not used anymore.  */
 298   stack_list_add (&stack->list, &stack_cache);
 299
 300   stack_cache_actsize += stack->stackblock_size;
 301   if (__builtin_expect (stack_cache_actsize > stack_cache_maxsize, 0))
 302     free_stacks (stack_cache_maxsize);
 303 }
 304
 305
/* This function is called indirectly from the freeres code in libc.
   It asks free_stacks to shrink the cache down to a size of zero, i.e.
   every cached stack which FREE_P reports as unused gets unmapped.
   Note that stack_cache_lock is not taken here.  */
void
__free_stack_cache (void)
{
  free_stacks (0);
}
 312
 313
 314 static int
 315 internal_function
 316 change_stack_perm (struct pthread *pd
 317 #ifdef NEED_SEPARATE_REGISTER_STACK
 318                    , size_t pagemask
 319 #endif
 320                    )
 321 {
 322 #ifdef NEED_SEPARATE_REGISTER_STACK
 323   void *stack = (pd->stackblock
 324                  + (((((pd->stackblock_size - pd->guardsize) / 2)
 325                       & pagemask) + pd->guardsize) & pagemask));
 326   size_t len = pd->stackblock + pd->stackblock_size - stack;
 327 #elif _STACK_GROWS_DOWN
 328   void *stack = pd->stackblock + pd->guardsize;
 329   size_t len = pd->stackblock_size - pd->guardsize;
 330 #elif _STACK_GROWS_UP
 331   void *stack = pd->stackblock;
 332   size_t len = (uintptr_t) pd - pd->guardsize - (uintptr_t) pd->stackblock;
 333 #else
 334 # error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
 335 #endif
 336   if (mprotect (stack, len, PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
 337     return errno;
 338
 339   return 0;
 340 }
 341
 342
 343 static int
 344 allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
 345                 ALLOCATE_STACK_PARMS)
 346 {
 347   struct pthread *pd;
 348   size_t size;
 349   size_t pagesize_m1 = __getpagesize () - 1;
 350   void *stacktop;
 351
 352   assert (attr != NULL);
 353   assert (powerof2 (pagesize_m1 + 1));
 354   assert (TCB_ALIGNMENT >= STACK_ALIGN);
 355
 356   /* Get the stack size from the attribute if it is set.  Otherwise we
 357      use the default we determined at start time.  */
 358   size = attr->stacksize ?: __default_stacksize;
 359
 360   /* Get memory for the stack.  */
 361   if (__builtin_expect (attr->flags & ATTR_FLAG_STACKADDR, 0))
 362     {
 363       uintptr_t adj;
 364
 365       /* If the user also specified the size of the stack make sure it
 366          is large enough.  */
 367       if (attr->stacksize != 0
 368           && attr->stacksize < (__static_tls_size + MINIMAL_REST_STACK))
 369         return EINVAL;
 370
 371       /* Adjust stack size for alignment of the TLS block.  */
 372 #if TLS_TCB_AT_TP
 373       adj = ((uintptr_t) attr->stackaddr - TLS_TCB_SIZE)
 374             & __static_tls_align_m1;
 375       assert (size > adj + TLS_TCB_SIZE);
 376 #elif TLS_DTV_AT_TP
 377       adj = ((uintptr_t) attr->stackaddr - __static_tls_size)
 378             & __static_tls_align_m1;
 379       assert (size > adj);
 380 #endif
 381
 382       /* The user provided some memory.  Let's hope it matches the
 383          size...  We do not allocate guard pages if the user provided
 384          the stack.  It is the user's responsibility to do this if it
 385          is wanted.  */
 386 #if TLS_TCB_AT_TP
 387       pd = (struct pthread *) ((uintptr_t) attr->stackaddr
 388                                - TLS_TCB_SIZE - adj);
 389 #elif TLS_DTV_AT_TP
 390       pd = (struct pthread *) (((uintptr_t) attr->stackaddr
 391                                 - __static_tls_size - adj)
 392                                - TLS_PRE_TCB_SIZE);
 393 #endif
 394
 395       /* The user provided stack memory needs to be cleared.  */
 396       memset (pd, '\0', sizeof (struct pthread));
 397
 398       /* The first TSD block is included in the TCB.  */
 399       pd->specific[0] = pd->specific_1stblock;
 400
 401       /* Remember the stack-related values.  */
 402       pd->stackblock = (char *) attr->stackaddr - size;
 403       pd->stackblock_size = size;
 404
 405       /* This is a user-provided stack.  It will not be queued in the
 406          stack cache nor will the memory (except the TLS memory) be freed.  */
 407       pd->user_stack = true;
 408
 409       /* This is at least the second thread.  */
 410       pd->header.multiple_threads = 1;
 411 #ifndef TLS_MULTIPLE_THREADS_IN_TCB
 412       __pthread_multiple_threads = *__libc_multiple_threads_ptr = 1;
 413 #endif
 414
 415 #ifndef __ASSUME_PRIVATE_FUTEX
 416       /* The thread must know when private futexes are supported.  */
 417       pd->header.private_futex = THREAD_GETMEM (THREAD_SELF,
 418                                                 header.private_futex);
 419 #endif
 420
 421 #ifdef NEED_DL_SYSINFO
 422       /* Copy the sysinfo value from the parent.  */
 423       THREAD_SYSINFO(pd) = THREAD_SELF_SYSINFO;
 424 #endif
 425
 426       /* The process ID is also the same as that of the caller.  */
 427       pd->pid = THREAD_GETMEM (THREAD_SELF, pid);
 428
 429       /* Allocate the DTV for this thread.  */
 430       if (_dl_allocate_tls (TLS_TPADJ (pd)) == NULL)
 431         {
 432           /* Something went wrong.  */
 433           assert (errno == ENOMEM);
 434           return EAGAIN;
 435         }
 436
 437
 438       /* Prepare to modify global data.  */
 439       lll_lock (stack_cache_lock, LLL_PRIVATE);
 440
 441       /* And add to the list of stacks in use.  */
 442       list_add (&pd->list, &__stack_user);
 443
 444       lll_unlock (stack_cache_lock, LLL_PRIVATE);
 445     }
 446   else
 447     {
 448       /* Allocate some anonymous memory.  If possible use the cache.  */
 449       size_t guardsize;
 450       size_t reqsize;
 451       void *mem;
 452       const int prot = (PROT_READ | PROT_WRITE
 453                         | ((GL(dl_stack_flags) & PF_X) ? PROT_EXEC : 0));
 454
 455 #if COLORING_INCREMENT != 0
 456       /* Add one more page for stack coloring.  Don't do it for stacks
 457          with 16 times pagesize or larger.  This might just cause
 458          unnecessary misalignment.  */
 459       if (size <= 16 * pagesize_m1)
 460         size += pagesize_m1 + 1;
 461 #endif
 462
 463       /* Adjust the stack size for alignment.  */
 464       size &= ~__static_tls_align_m1;
 465       assert (size != 0);
 466
 467       /* Make sure the size of the stack is enough for the guard and
 468          eventually the thread descriptor.  */
 469       guardsize = (attr->guardsize + pagesize_m1) & ~pagesize_m1;
 470       if (__builtin_expect (size < ((guardsize + __static_tls_size
 471                                      + MINIMAL_REST_STACK + pagesize_m1)
 472                                     & ~pagesize_m1),
 473                             0))
 474         /* The stack is too small (or the guard too large).  */
 475         return EINVAL;
 476
 477       /* Try to get a stack from the cache.  */
 478       reqsize = size;
 479       pd = get_cached_stack (&size, &mem);
 480       if (pd == NULL)
 481         {
 482           /* To avoid aliasing effects on a larger scale than pages we
 483              adjust the allocated stack size if necessary.  This way
 484              allocations directly following each other will not have
 485              aliasing problems.  */
 486 #if MULTI_PAGE_ALIASING != 0
 487           if ((size % MULTI_PAGE_ALIASING) == 0)
 488             size += pagesize_m1 + 1;
 489 #endif
 490
 491           mem = mmap (NULL, size, prot,
 492                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
 493
 494           if (__builtin_expect (mem == MAP_FAILED, 0))
 495             {
 496               if (errno == ENOMEM)
 497                 __set_errno (EAGAIN);
 498
 499                return errno;
 500             }
 501
 502           /* SIZE is guaranteed to be greater than zero.
 503              So we can never get a null pointer back from mmap.  */
 504           assert (mem != NULL);
 505
 506 #if COLORING_INCREMENT != 0
 507           /* Atomically increment NCREATED.  */
 508           unsigned int ncreated = atomic_increment_val (&nptl_ncreated);
 509
 510           /* We chose the offset for coloring by incrementing it for
 511              every new thread by a fixed amount.  The offset used
 512              module the page size.  Even if coloring would be better
 513              relative to higher alignment values it makes no sense to
 514              do it since the mmap() interface does not allow us to
 515              specify any alignment for the returned memory block.  */
 516           size_t coloring = (ncreated * COLORING_INCREMENT) & pagesize_m1;
 517
 518           /* Make sure the coloring offsets does not disturb the alignment
 519              of the TCB and static TLS block.  */
 520           if (__builtin_expect ((coloring & __static_tls_align_m1) != 0, 0))
 521             coloring = (((coloring + __static_tls_align_m1)
 522                          & ~(__static_tls_align_m1))
 523                         & ~pagesize_m1);
 524 #else
 525           /* Unless specified we do not make any adjustments.  */
 526 # define coloring 0
 527 #endif
 528
 529           /* Place the thread descriptor at the end of the stack.  */
 530 #if TLS_TCB_AT_TP
 531           pd = (struct pthread *) ((char *) mem + size - coloring) - 1;
 532 #elif TLS_DTV_AT_TP
 533           pd = (struct pthread *) ((((uintptr_t) mem + size - coloring
 534                                     - __static_tls_size)
 535                                     & ~__static_tls_align_m1)
 536                                    - TLS_PRE_TCB_SIZE);
 537 #endif
 538
 539           /* Remember the stack-related values.  */
 540           pd->stackblock = mem;
 541           pd->stackblock_size = size;
 542
 543           /* We allocated the first block thread-specific data array.
 544              This address will not change for the lifetime of this
 545              descriptor.  */
 546           pd->specific[0] = pd->specific_1stblock;
 547
 548           /* This is at least the second thread.  */
 549           pd->header.multiple_threads = 1;
 550 #ifndef TLS_MULTIPLE_THREADS_IN_TCB
 551           __pthread_multiple_threads = *__libc_multiple_threads_ptr = 1;
 552 #endif
 553
 554 #ifndef __ASSUME_PRIVATE_FUTEX
 555           /* The thread must know when private futexes are supported.  */
 556           pd->header.private_futex = THREAD_GETMEM (THREAD_SELF,
 557                                                     header.private_futex);
 558 #endif
 559
 560 #ifdef NEED_DL_SYSINFO
 561           /* Copy the sysinfo value from the parent.  */
 562           THREAD_SYSINFO(pd) = THREAD_SELF_SYSINFO;
 563 #endif
 564
 565           /* The process ID is also the same as that of the caller.  */
 566           pd->pid = THREAD_GETMEM (THREAD_SELF, pid);
 567
 568           /* Allocate the DTV for this thread.  */
 569           if (_dl_allocate_tls (TLS_TPADJ (pd)) == NULL)
 570             {
 571               /* Something went wrong.  */
 572               assert (errno == ENOMEM);
 573
 574               /* Free the stack memory we just allocated.  */
 575               (void) munmap (mem, size);
 576
 577               return EAGAIN;
 578             }
 579
 580
 581           /* Prepare to modify global data.  */
 582           lll_lock (stack_cache_lock, LLL_PRIVATE);
 583
 584           /* And add to the list of stacks in use.  */
 585           stack_list_add (&pd->list, &stack_used);
 586
 587           lll_unlock (stack_cache_lock, LLL_PRIVATE);
 588
 589
 590           /* There might have been a race.  Another thread might have
 591              caused the stacks to get exec permission while this new
 592              stack was prepared.  Detect if this was possible and
 593              change the permission if necessary.  */
 594           if (__builtin_expect ((GL(dl_stack_flags) & PF_X) != 0
 595                                 && (prot & PROT_EXEC) == 0, 0))
 596             {
 597               int err = change_stack_perm (pd
 598 #ifdef NEED_SEPARATE_REGISTER_STACK
 599                                            , ~pagesize_m1
 600 #endif
 601                                            );
 602               if (err != 0)
 603                 {
 604                   /* Free the stack memory we just allocated.  */
 605                   (void) munmap (mem, size);
 606
 607                   return err;
 608                 }
 609             }
 610
 611
 612           /* Note that all of the stack and the thread descriptor is
 613              zeroed.  This means we do not have to initialize fields
 614              with initial value zero.  This is specifically true for
 615              the 'tid' field which is always set back to zero once the
 616              stack is not used anymore and for the 'guardsize' field
 617              which will be read next.  */
 618         }
 619
 620       /* Create or resize the guard area if necessary.  */
 621       if (__builtin_expect (guardsize > pd->guardsize, 0))
 622         {
 623 #ifdef NEED_SEPARATE_REGISTER_STACK
 624           char *guard = mem + (((size - guardsize) / 2) & ~pagesize_m1);
 625 #elif _STACK_GROWS_DOWN
 626           char *guard = mem;
 627 # elif _STACK_GROWS_UP
 628           char *guard = (char *) (((uintptr_t) pd - guardsize) & ~pagesize_m1);
 629 #endif
 630           if (mprotect (guard, guardsize, PROT_NONE) != 0)
 631             {
 632               int err;
 633             mprot_error:
 634               err = errno;
 635
 636               lll_lock (stack_cache_lock, LLL_PRIVATE);
 637
 638               /* Remove the thread from the list.  */
 639               stack_list_del (&pd->list);
 640
 641               lll_unlock (stack_cache_lock, LLL_PRIVATE);
 642
 643               /* Get rid of the TLS block we allocated.  */
 644               _dl_deallocate_tls (TLS_TPADJ (pd), false);
 645
 646               /* Free the stack memory regardless of whether the size
 647                  of the cache is over the limit or not.  If this piece
 648                  of memory caused problems we better do not use it
 649                  anymore.  Uh, and we ignore possible errors.  There
 650                  is nothing we could do.  */
 651               (void) munmap (mem, size);
 652
 653               return err;
 654             }
 655
 656           pd->guardsize = guardsize;
 657         }
 658       else if (__builtin_expect (pd->guardsize - guardsize > size - reqsize,
 659                                  0))
 660         {
 661           /* The old guard area is too large.  */
 662
 663 #ifdef NEED_SEPARATE_REGISTER_STACK
 664           char *guard = mem + (((size - guardsize) / 2) & ~pagesize_m1);
 665           char *oldguard = mem + (((size - pd->guardsize) / 2) & ~pagesize_m1);
 666
 667           if (oldguard < guard
 668               && mprotect (oldguard, guard - oldguard, prot) != 0)
 669             goto mprot_error;
 670
 671           if (mprotect (guard + guardsize,
 672                         oldguard + pd->guardsize - guard - guardsize,
 673                         prot) != 0)
 674             goto mprot_error;
 675 #elif _STACK_GROWS_DOWN
 676           if (mprotect ((char *) mem + guardsize, pd->guardsize - guardsize,
 677                         prot) != 0)
 678             goto mprot_error;
 679 #elif _STACK_GROWS_UP
 680           if (mprotect ((char *) pd - pd->guardsize,
 681                         pd->guardsize - guardsize, prot) != 0)
 682             goto mprot_error;
 683 #endif
 684
 685           pd->guardsize = guardsize;
 686         }
 687       /* The pthread_getattr_np() calls need to get passed the size
 688          requested in the attribute, regardless of how large the
 689          actually used guardsize is.  */
 690       pd->reported_guardsize = guardsize;
 691     }
 692
 693   /* Initialize the lock.  We have to do this unconditionally since the
 694      stillborn thread could be canceled while the lock is taken.  */
 695   pd->lock = LLL_LOCK_INITIALIZER;
 696
 697   /* The robust mutex lists also need to be initialized
 698      unconditionally because the cleanup for the previous stack owner
 699      might have happened in the kernel.  */
 700   pd->robust_head.futex_offset = (offsetof (pthread_mutex_t, __data.__lock)
 701                                   - offsetof (pthread_mutex_t,
 702                                               __data.__list.__next));
 703   pd->robust_head.list_op_pending = NULL;
 704 #ifdef __PTHREAD_MUTEX_HAVE_PREV
 705   pd->robust_prev = &pd->robust_head;
 706 #endif
 707   pd->robust_head.list = &pd->robust_head;
 708
 709   /* We place the thread descriptor at the end of the stack.  */
 710   *pdp = pd;
 711
 712 #if TLS_TCB_AT_TP
 713   /* The stack begins before the TCB and the static TLS block.  */
 714   stacktop = ((char *) (pd + 1) - __static_tls_size);
 715 #elif TLS_DTV_AT_TP
 716   stacktop = (char *) (pd - 1);
 717 #endif
 718
 719 #ifdef NEED_SEPARATE_REGISTER_STACK
 720   *stack = pd->stackblock;
 721   *stacksize = stacktop - *stack;
 722 #elif _STACK_GROWS_DOWN
 723   *stack = stacktop;
 724 #elif _STACK_GROWS_UP
 725   *stack = pd->stackblock;
 726   assert (*stack > 0);
 727 #endif
 728
 729   return 0;
 730 }
 731
 732
 733 void
 734 internal_function
 735 __deallocate_stack (struct pthread *pd)
 736 {
 737   lll_lock (stack_cache_lock, LLL_PRIVATE);
 738
 739   /* Remove the thread from the list of threads with user defined
 740      stacks.  */
 741   stack_list_del (&pd->list);
 742
 743   /* Not much to do.  Just free the mmap()ed memory.  Note that we do
 744      not reset the 'used' flag in the 'tid' field.  This is done by
 745      the kernel.  If no thread has been created yet this field is
 746      still zero.  */
 747   if (__builtin_expect (! pd->user_stack, 1))
 748     (void) queue_stack (pd);
 749   else
 750     /* Free the memory associated with the ELF TLS.  */
 751     _dl_deallocate_tls (TLS_TPADJ (pd), false);
 752
 753   lll_unlock (stack_cache_lock, LLL_PRIVATE);
 754 }
 755
 756
 757 int
 758 internal_function
 759 __make_stacks_executable (void **stack_endp)
 760 {
 761   /* First the main thread's stack.  */
 762   int err = _dl_make_stack_executable (stack_endp);
 763   if (err != 0)
 764     return err;
 765
 766 #ifdef NEED_SEPARATE_REGISTER_STACK
 767   const size_t pagemask = ~(__getpagesize () - 1);
 768 #endif
 769
 770   lll_lock (stack_cache_lock, LLL_PRIVATE);
 771
 772   list_t *runp;
 773   list_for_each (runp, &stack_used)
 774     {
 775       err = change_stack_perm (list_entry (runp, struct pthread, list)
 776 #ifdef NEED_SEPARATE_REGISTER_STACK
 777                                , pagemask
 778 #endif
 779                                );
 780       if (err != 0)
 781         break;
 782     }
 783
 784   /* Also change the permission for the currently unused stacks.  This
 785      might be wasted time but better spend it here than adding a check
 786      in the fast path.  */
 787   if (err == 0)
 788     list_for_each (runp, &stack_cache)
 789       {
 790         err = change_stack_perm (list_entry (runp, struct pthread, list)
 791 #ifdef NEED_SEPARATE_REGISTER_STACK
 792                                  , pagemask
 793 #endif
 794                                  );
 795         if (err != 0)
 796           break;
 797       }
 798
 799   lll_unlock (stack_cache_lock, LLL_PRIVATE);
 800
 801   return err;
 802 }
 803
 804
 805 /* In case of a fork() call the memory allocation in the child will be
 806    the same but only one thread is running.  All stacks except that of
 807    the one running thread are not used anymore.  We have to recycle
 808    them.  */
 809 void
 810 __reclaim_stacks (void)
 811 {
 812   struct pthread *self = (struct pthread *) THREAD_SELF;
 813
 814   /* No locking necessary.  The caller is the only stack in use.  But
 815      we have to be aware that we might have interrupted a list
 816      operation.  */
 817
 818   if (in_flight_stack != 0)
 819     {
 820       bool add_p = in_flight_stack & 1;
 821       list_t *elem = (list_t *) (in_flight_stack & ~UINTMAX_C (1));
 822
 823       if (add_p)
 824         {
 825           /* We always add at the beginning of the list.  So in this
 826              case we only need to check the beginning of these lists.  */
 827           int check_list (list_t *l)
 828           {
 829             if (l->next->prev != l)
 830               {
 831                 assert (l->next->prev == elem);
 832
 833                 elem->next = l->next;
 834                 elem->prev = l;
 835                 l->next = elem;
 836
 837                 return 1;
 838               }
 839
 840             return 0;
 841           }
 842
 843           if (check_list (&stack_used) == 0)
 844             (void) check_list (&stack_cache);
 845         }
 846       else
 847         {
 848           /* We can simply always replay the delete operation.  */
 849           elem->next->prev = elem->prev;
 850           elem->prev->next = elem->next;
 851         }
 852
 853       in_flight_stack = 0;
 854     }
 855
 856   /* Mark all stacks except the still running one as free.  */
 857   list_t *runp;
 858   list_for_each (runp, &stack_used)
 859     {
 860       struct pthread *curp = list_entry (runp, struct pthread, list);
 861       if (curp != self)
 862         {
 863           /* This marks the stack as free.  */
 864           curp->tid = 0;
 865
 866           /* The PID field must be initialized for the new process.  */
 867           curp->pid = self->pid;
 868
 869           /* Account for the size of the stack.  */
 870           stack_cache_actsize += curp->stackblock_size;
 871
 872           if (curp->specific_used)
 873             {
 874               /* Clear the thread-specific data.  */
 875               memset (curp->specific_1stblock, '\0',
 876                       sizeof (curp->specific_1stblock));
 877
 878               curp->specific_used = false;
 879
 880               for (size_t cnt = 1; cnt < PTHREAD_KEY_1STLEVEL_SIZE; ++cnt)
 881                 if (curp->specific[cnt] != NULL)
 882                   {
 883                     memset (curp->specific[cnt], '\0',
 884                             sizeof (curp->specific_1stblock));
 885
 886                     /* We have allocated the block which we do not
 887                        free here so re-set the bit.  */
 888                     curp->specific_used = true;
 889                   }
 890             }
 891         }
 892     }
 893
 894   /* Reset the PIDs in any cached stacks.  */
 895   list_for_each (runp, &stack_cache)
 896     {
 897       struct pthread *curp = list_entry (runp, struct pthread, list);
 898       curp->pid = self->pid;
 899     }
 900
 901   /* Add the stack of all running threads to the cache.  */
 902   list_splice (&stack_used, &stack_cache);
 903
 904   /* Remove the entry for the current thread to from the cache list
 905      and add it to the list of running threads.  Which of the two
 906      lists is decided by the user_stack flag.  */
 907   stack_list_del (&self->list);
 908
 909   /* Re-initialize the lists for all the threads.  */
 910   INIT_LIST_HEAD (&stack_used);
 911   INIT_LIST_HEAD (&__stack_user);
 912
 913   if (__builtin_expect (THREAD_GETMEM (self, user_stack), 0))
 914     list_add (&self->list, &__stack_user);
 915   else
 916     stack_list_add (&self->list, &stack_used);
 917
 918   /* There is one thread running.  */
 919   __nptl_nthreads = 1;
 920
 921   /* Initialize the lock.  */
 922   stack_cache_lock = LLL_LOCK_INITIALIZER;
 923 }
 924
 925
 926 #if HP_TIMING_AVAIL
 927 # undef __find_thread_by_id
 928 /* Find a thread given the thread ID.  */
 929 attribute_hidden
 930 struct pthread *
 931 __find_thread_by_id (pid_t tid)
 932 {
 933   struct pthread *result = NULL;
 934
 935   lll_lock (stack_cache_lock, LLL_PRIVATE);
 936
 937   /* Iterate over the list with system-allocated threads first.  */
 938   list_t *runp;
 939   list_for_each (runp, &stack_used)
 940     {
 941       struct pthread *curp;
 942
 943       curp = list_entry (runp, struct pthread, list);
 944
 945       if (curp->tid == tid)
 946         {
 947           result = curp;
 948           goto out;
 949         }
 950     }
 951
 952   /* Now the list with threads using user-allocated stacks.  */
 953   list_for_each (runp, &__stack_user)
 954     {
 955       struct pthread *curp;
 956
 957       curp = list_entry (runp, struct pthread, list);
 958
 959       if (curp->tid == tid)
 960         {
 961           result = curp;
 962           goto out;
 963         }
 964     }
 965
 966  out:
 967   lll_unlock (stack_cache_lock, LLL_PRIVATE);
 968
 969   return result;
 970 }
 971 #endif
 972
 973
 974 static void
 975 internal_function
 976 setxid_signal_thread (struct xid_command *cmdp, struct pthread *t)
 977 {
 978   if (! IS_DETACHED (t))
 979     {
 980       int ch;
 981       do
 982         {
 983           ch = t->cancelhandling;
 984
 985           /* If the thread is exiting right now, ignore it.  */
 986           if ((ch & EXITING_BITMASK) != 0)
 987             return;
 988         }
 989       while (atomic_compare_and_exchange_bool_acq (&t->cancelhandling,
 990                                                    ch | SETXID_BITMASK, ch));
 991     }
 992
 993   int val;
 994   INTERNAL_SYSCALL_DECL (err);
 995 #if __ASSUME_TGKILL
 996   val = INTERNAL_SYSCALL (tgkill, err, 3, THREAD_GETMEM (THREAD_SELF, pid),
 997                           t->tid, SIGSETXID);
 998 #else
 999 # ifdef __NR_tgkill
1000   val = INTERNAL_SYSCALL (tgkill, err, 3, THREAD_GETMEM (THREAD_SELF, pid),
1001                           t->tid, SIGSETXID);
1002   if (INTERNAL_SYSCALL_ERROR_P (val, err)
1003       && INTERNAL_SYSCALL_ERRNO (val, err) == ENOSYS)
1004 # endif
1005     val = INTERNAL_SYSCALL (tkill, err, 2, t->tid, SIGSETXID);
1006 #endif
1007
1008   if (!INTERNAL_SYSCALL_ERROR_P (val, err))
1009     atomic_increment (&cmdp->cntr);
1010 }
1011
1012
1013 int
1014 attribute_hidden
1015 __nptl_setxid (struct xid_command *cmdp)
1016 {
1017   int result;
1018   lll_lock (stack_cache_lock, LLL_PRIVATE);
1019
1020   __xidcmd = cmdp;
1021   cmdp->cntr = 0;
1022
1023   struct pthread *self = THREAD_SELF;
1024
1025   /* Iterate over the list with system-allocated threads first.  */
1026   list_t *runp;
1027   list_for_each (runp, &stack_used)
1028     {
1029       struct pthread *t = list_entry (runp, struct pthread, list);
1030       if (t == self)
1031         continue;
1032
1033       setxid_signal_thread (cmdp, t);
1034     }
1035
1036   /* Now the list with threads using user-allocated stacks.  */
1037   list_for_each (runp, &__stack_user)
1038     {
1039       struct pthread *t = list_entry (runp, struct pthread, list);
1040       if (t == self)
1041         continue;
1042
1043       setxid_signal_thread (cmdp, t);
1044     }
1045
1046   int cur = cmdp->cntr;
1047   while (cur != 0)
1048     {
1049       lll_futex_wait (&cmdp->cntr, cur, LLL_PRIVATE);
1050       cur = cmdp->cntr;
1051     }
1052
1053   /* This must be last, otherwise the current thread might not have
1054      permissions to send SIGSETXID syscall to the other threads.  */
1055   INTERNAL_SYSCALL_DECL (err);
1056   result = INTERNAL_SYSCALL_NCS (cmdp->syscall_no, err, 3,
1057                                  cmdp->id[0], cmdp->id[1], cmdp->id[2]);
1058   if (INTERNAL_SYSCALL_ERROR_P (result, err))
1059     {
1060       __set_errno (INTERNAL_SYSCALL_ERRNO (result, err));
1061       result = -1;
1062     }
1063
1064   lll_unlock (stack_cache_lock, LLL_PRIVATE);
1065   return result;
1066 }
1067
1068 static inline void __attribute__((always_inline))
1069 init_one_static_tls (struct pthread *curp, struct link_map *map)
1070 {
1071   dtv_t *dtv = GET_DTV (TLS_TPADJ (curp));
1072 # if TLS_TCB_AT_TP
1073   void *dest = (char *) curp - map->l_tls_offset;
1074 # elif TLS_DTV_AT_TP
1075   void *dest = (char *) curp + map->l_tls_offset + TLS_PRE_TCB_SIZE;
1076 # else
1077 #  error "Either TLS_TCB_AT_TP or TLS_DTV_AT_TP must be defined"
1078 # endif
1079
1080   /* Fill in the DTV slot so that a later LD/GD access will find it.  */
1081   dtv[map->l_tls_modid].pointer.val = dest;
1082   dtv[map->l_tls_modid].pointer.is_static = true;
1083
1084   /* Initialize the memory.  */
1085   memset (__mempcpy (dest, map->l_tls_initimage, map->l_tls_initimage_size),
1086           '\0', map->l_tls_blocksize - map->l_tls_initimage_size);
1087 }
1088
1089 void
1090 attribute_hidden
1091 __pthread_init_static_tls (struct link_map *map)
1092 {
1093   lll_lock (stack_cache_lock, LLL_PRIVATE);
1094
1095   /* Iterate over the list with system-allocated threads first.  */
1096   list_t *runp;
1097   list_for_each (runp, &stack_used)
1098     init_one_static_tls (list_entry (runp, struct pthread, list), map);
1099
1100   /* Now the list with threads using user-allocated stacks.  */
1101   list_for_each (runp, &__stack_user)
1102     init_one_static_tls (list_entry (runp, struct pthread, list), map);
1103
1104   lll_unlock (stack_cache_lock, LLL_PRIVATE);
1105 }
1106
1107
1108 void
1109 attribute_hidden
1110 __wait_lookup_done (void)
1111 {
1112   lll_lock (stack_cache_lock, LLL_PRIVATE);
1113
1114   struct pthread *self = THREAD_SELF;
1115
1116   /* Iterate over the list with system-allocated threads first.  */
1117   list_t *runp;
1118   list_for_each (runp, &stack_used)
1119     {
1120       struct pthread *t = list_entry (runp, struct pthread, list);
1121       if (t == self || t->header.gscope_flag == THREAD_GSCOPE_FLAG_UNUSED)
1122         continue;
1123
1124       int *const gscope_flagp = &t->header.gscope_flag;
1125
1126       /* We have to wait until this thread is done with the global
1127          scope.  First tell the thread that we are waiting and
1128          possibly have to be woken.  */
1129       if (atomic_compare_and_exchange_bool_acq (gscope_flagp,
1130                                                 THREAD_GSCOPE_FLAG_WAIT,
1131                                                 THREAD_GSCOPE_FLAG_USED))
1132         continue;
1133
1134       do
1135         lll_futex_wait (gscope_flagp, THREAD_GSCOPE_FLAG_WAIT, LLL_PRIVATE);
1136       while (*gscope_flagp == THREAD_GSCOPE_FLAG_WAIT);
1137     }
1138
1139   /* Now the list with threads using user-allocated stacks.  */
1140   list_for_each (runp, &__stack_user)
1141     {
1142       struct pthread *t = list_entry (runp, struct pthread, list);
1143       if (t == self || t->header.gscope_flag == THREAD_GSCOPE_FLAG_UNUSED)
1144         continue;
1145
1146       int *const gscope_flagp = &t->header.gscope_flag;
1147
1148       /* We have to wait until this thread is done with the global
1149          scope.  First tell the thread that we are waiting and
1150          possibly have to be woken.  */
1151       if (atomic_compare_and_exchange_bool_acq (gscope_flagp,
1152                                                 THREAD_GSCOPE_FLAG_WAIT,
1153                                                 THREAD_GSCOPE_FLAG_USED))
1154         continue;
1155
1156       do
1157         lll_futex_wait (gscope_flagp, THREAD_GSCOPE_FLAG_WAIT, LLL_PRIVATE);
1158       while (*gscope_flagp == THREAD_GSCOPE_FLAG_WAIT);
1159     }
1160
1161   lll_unlock (stack_cache_lock, LLL_PRIVATE);
1162 }