nptl/allocatestack.c

   1 /* Copyright (C) 2002-2007, 2009 Free Software Foundation, Inc.
   2    This file is part of the GNU C Library.
   3    Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, write to the Free
  17    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  18    02111-1307 USA.  */
  19
  20 #include <assert.h>
  21 #include <errno.h>
  22 #include <signal.h>
  23 #include <stdint.h>
  24 #include <string.h>
  25 #include <unistd.h>
  26 #include <sys/mman.h>
  27 #include <sys/param.h>
  28 #include <dl-sysdep.h>
  29 #include <tls.h>
  30 #include <lowlevellock.h>
  31 #include <kernel-features.h>
  32
  33
#ifndef NEED_SEPARATE_REGISTER_STACK

/* Most architectures have exactly one stack pointer.  Some have more.
   This branch is the single-stack case: only the stack address needs
   to be carried around.  */
# define STACK_VARIABLES void *stackaddr = NULL

/* How to pass the values to the 'create_thread' function.  */
# define STACK_VARIABLES_ARGS stackaddr

/* How to declare a function which gets these parameters.  */
# define STACK_VARIABLES_PARMS void *stackaddr

/* How to declare allocate_stack.  */
# define ALLOCATE_STACK_PARMS void **stack

/* This is how the function is called.  We do it this way to allow
   other variants of the function to have more parameters.  The
   expansion refers to a variable named 'stackaddr' in the calling
   scope, i.e. the one declared by STACK_VARIABLES.  */
# define ALLOCATE_STACK(attr, pd) allocate_stack (attr, pd, &stackaddr)

#else

/* We need two stacks.  The kernel will place them but we have to tell
   the kernel about the size of the reserved address space.  Hence the
   size is carried around in addition to the address.  */
# define STACK_VARIABLES void *stackaddr = NULL; size_t stacksize = 0

/* How to pass the values to the 'create_thread' function.  */
# define STACK_VARIABLES_ARGS stackaddr, stacksize

/* How to declare a function which gets these parameters.  */
# define STACK_VARIABLES_PARMS void *stackaddr, size_t stacksize

/* How to declare allocate_stack.  */
# define ALLOCATE_STACK_PARMS void **stack, size_t *stacksize

/* This is how the function is called.  We do it this way to allow
   other variants of the function to have more parameters.  The
   expansion refers to variables named 'stackaddr' and 'stacksize' in
   the calling scope, i.e. the ones declared by STACK_VARIABLES.  */
# define ALLOCATE_STACK(attr, pd) \
  allocate_stack (attr, pd, &stackaddr, &stacksize)

#endif
  73
  74
/* Default alignment of stack.  Can be overridden by defining
   STACK_ALIGN before this point.  */
#ifndef STACK_ALIGN
# define STACK_ALIGN __alignof__ (long double)
#endif

/* Default value, in bytes, for the minimal stack size which must
   remain after allocating the thread descriptor and guard.  */
#ifndef MINIMAL_REST_STACK
# define MINIMAL_REST_STACK     4096
#endif


/* Newer kernels have the MAP_STACK flag to indicate a mapping is used for
   a stack.  Use it when possible.  Where the headers do not provide the
   flag it is defined as 0 so that OR-ing it into the mmap flags has no
   effect.  */
#ifndef MAP_STACK
# define MAP_STACK 0
#endif

/* This yields the pointer that TLS support code calls the thread pointer.
   With TLS_TCB_AT_TP that is the descriptor itself; with TLS_DTV_AT_TP
   it lies TLS_PRE_TCB_SIZE bytes past the start of the descriptor.  */
#if TLS_TCB_AT_TP
# define TLS_TPADJ(pd) (pd)
#elif TLS_DTV_AT_TP
# define TLS_TPADJ(pd) ((struct pthread *)((char *) (pd) + TLS_PRE_TCB_SIZE))
#endif
  99
/* Cache handling for not-yet free stacks.  */

/* Maximum size of the cache in bytes.  */
static size_t stack_cache_maxsize = 40 * 1024 * 1024; /* 40 MiB by default.  */
/* Number of bytes currently accounted to the cache: the sum of the
   stackblock_size values of the queued stacks.  */
static size_t stack_cache_actsize;

/* Mutex protecting the cache accounting above and the stack lists
   below.  */
static int stack_cache_lock = LLL_LOCK_INITIALIZER;

/* List of queued stack frames.  */
static LIST_HEAD (stack_cache);

/* List of the stacks in use.  */
static LIST_HEAD (stack_used);

/* We need to record what list operations we are going to do so that,
   in case of an asynchronous interruption due to a fork() call, we
   can correct for the work.  The value is the address of the list
   element being operated on; the lowest bit is set for an insertion
   and clear for a removal.  Zero means no operation is in progress.  */
static uintptr_t in_flight_stack;

/* List of the threads with user provided stacks in use.  No need to
   initialize this, since it's done in __pthread_initialize_minimal.  */
list_t __stack_user __attribute__ ((nocommon));
hidden_data_def (__stack_user)

#if COLORING_INCREMENT != 0
/* Number of threads created.  Only needed to derive the coloring
   offset for newly mapped stacks.  */
static unsigned int nptl_ncreated;
#endif


/* Check whether the stack is still used or not.  Evaluates to true
   when the descriptor's 'tid' field is zero or negative, which marks
   the stack as free.  */
#define FREE_P(descr) ((descr)->tid <= 0)
 133
 134
 135 static void
 136 stack_list_del (list_t *elem)
 137 {
 138   in_flight_stack = (uintptr_t) elem;
 139
 140   atomic_write_barrier ();
 141
 142   list_del (elem);
 143
 144   atomic_write_barrier ();
 145
 146   in_flight_stack = 0;
 147 }
 148
 149
 150 static void
 151 stack_list_add (list_t *elem, list_t *list)
 152 {
 153   in_flight_stack = (uintptr_t) elem | 1;
 154
 155   atomic_write_barrier ();
 156
 157   list_add (elem, list);
 158
 159   atomic_write_barrier ();
 160
 161   in_flight_stack = 0;
 162 }
 163
 164
 165 /* We create a double linked list of all cache entries.  Double linked
 166    because this allows removing entries from the end.  */
 167
 168
 169 /* Get a stack frame from the cache.  We have to match by size since
 170    some blocks might be too small or far too large.  */
 171 static struct pthread *
 172 get_cached_stack (size_t *sizep, void **memp)
 173 {
 174   size_t size = *sizep;
 175   struct pthread *result = NULL;
 176   list_t *entry;
 177
 178   lll_lock (stack_cache_lock, LLL_PRIVATE);
 179
 180   /* Search the cache for a matching entry.  We search for the
 181      smallest stack which has at least the required size.  Note that
 182      in normal situations the size of all allocated stacks is the
 183      same.  As the very least there are only a few different sizes.
 184      Therefore this loop will exit early most of the time with an
 185      exact match.  */
 186   list_for_each (entry, &stack_cache)
 187     {
 188       struct pthread *curr;
 189
 190       curr = list_entry (entry, struct pthread, list);
 191       if (FREE_P (curr) && curr->stackblock_size >= size)
 192         {
 193           if (curr->stackblock_size == size)
 194             {
 195               result = curr;
 196               break;
 197             }
 198
 199           if (result == NULL
 200               || result->stackblock_size > curr->stackblock_size)
 201             result = curr;
 202         }
 203     }
 204
 205   if (__builtin_expect (result == NULL, 0)
 206       /* Make sure the size difference is not too excessive.  In that
 207          case we do not use the block.  */
 208       || __builtin_expect (result->stackblock_size > 4 * size, 0))
 209     {
 210       /* Release the lock.  */
 211       lll_unlock (stack_cache_lock, LLL_PRIVATE);
 212
 213       return NULL;
 214     }
 215
 216   /* Dequeue the entry.  */
 217   stack_list_del (&result->list);
 218
 219   /* And add to the list of stacks in use.  */
 220   stack_list_add (&result->list, &stack_used);
 221
 222   /* And decrease the cache size.  */
 223   stack_cache_actsize -= result->stackblock_size;
 224
 225   /* Release the lock early.  */
 226   lll_unlock (stack_cache_lock, LLL_PRIVATE);
 227
 228   /* Report size and location of the stack to the caller.  */
 229   *sizep = result->stackblock_size;
 230   *memp = result->stackblock;
 231
 232   /* Cancellation handling is back to the default.  */
 233   result->cancelhandling = 0;
 234   result->cleanup = NULL;
 235
 236   /* No pending event.  */
 237   result->nextevent = NULL;
 238
 239   /* Clear the DTV.  */
 240   dtv_t *dtv = GET_DTV (TLS_TPADJ (result));
 241   memset (dtv, '\0', (dtv[-1].counter + 1) * sizeof (dtv_t));
 242
 243   /* Re-initialize the TLS.  */
 244   _dl_allocate_tls_init (TLS_TPADJ (result));
 245
 246   return result;
 247 }
 248
 249
 250 /* Free stacks until cache size is lower than LIMIT.  */
 251 void
 252 __free_stacks (size_t limit)
 253 {
 254   /* We reduce the size of the cache.  Remove the last entries until
 255      the size is below the limit.  */
 256   list_t *entry;
 257   list_t *prev;
 258
 259   /* Search from the end of the list.  */
 260   list_for_each_prev_safe (entry, prev, &stack_cache)
 261     {
 262       struct pthread *curr;
 263
 264       curr = list_entry (entry, struct pthread, list);
 265       if (FREE_P (curr))
 266         {
 267           /* Unlink the block.  */
 268           stack_list_del (entry);
 269
 270           /* Account for the freed memory.  */
 271           stack_cache_actsize -= curr->stackblock_size;
 272
 273           /* Free the memory associated with the ELF TLS.  */
 274           _dl_deallocate_tls (TLS_TPADJ (curr), false);
 275
 276           /* Remove this block.  This should never fail.  If it does
 277              something is really wrong.  */
 278           if (munmap (curr->stackblock, curr->stackblock_size) != 0)
 279             abort ();
 280
 281           /* Maybe we have freed enough.  */
 282           if (stack_cache_actsize <= limit)
 283             break;
 284         }
 285     }
 286 }
 287
 288
 289 /* Add a stack frame which is not used anymore to the stack.  Must be
 290    called with the cache lock held.  */
 291 static inline void
 292 __attribute ((always_inline))
 293 queue_stack (struct pthread *stack)
 294 {
 295   /* We unconditionally add the stack to the list.  The memory may
 296      still be in use but it will not be reused until the kernel marks
 297      the stack as not used anymore.  */
 298   stack_list_add (&stack->list, &stack_cache);
 299
 300   stack_cache_actsize += stack->stackblock_size;
 301   if (__builtin_expect (stack_cache_actsize > stack_cache_maxsize, 0))
 302     __free_stacks (stack_cache_maxsize);
 303 }
 304
 305
 306 static int
 307 internal_function
 308 change_stack_perm (struct pthread *pd
 309 #ifdef NEED_SEPARATE_REGISTER_STACK
 310                    , size_t pagemask
 311 #endif
 312                    )
 313 {
 314 #ifdef NEED_SEPARATE_REGISTER_STACK
 315   void *stack = (pd->stackblock
 316                  + (((((pd->stackblock_size - pd->guardsize) / 2)
 317                       & pagemask) + pd->guardsize) & pagemask));
 318   size_t len = pd->stackblock + pd->stackblock_size - stack;
 319 #elif _STACK_GROWS_DOWN
 320   void *stack = pd->stackblock + pd->guardsize;
 321   size_t len = pd->stackblock_size - pd->guardsize;
 322 #elif _STACK_GROWS_UP
 323   void *stack = pd->stackblock;
 324   size_t len = (uintptr_t) pd - pd->guardsize - (uintptr_t) pd->stackblock;
 325 #else
 326 # error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
 327 #endif
 328   if (mprotect (stack, len, PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
 329     return errno;
 330
 331   return 0;
 332 }
 333
 334
 335 static int
 336 allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
 337                 ALLOCATE_STACK_PARMS)
 338 {
 339   struct pthread *pd;
 340   size_t size;
 341   size_t pagesize_m1 = __getpagesize () - 1;
 342   void *stacktop;
 343
 344   assert (attr != NULL);
 345   assert (powerof2 (pagesize_m1 + 1));
 346   assert (TCB_ALIGNMENT >= STACK_ALIGN);
 347
 348   /* Get the stack size from the attribute if it is set.  Otherwise we
 349      use the default we determined at start time.  */
 350   size = attr->stacksize ?: __default_stacksize;
 351
 352   /* Get memory for the stack.  */
 353   if (__builtin_expect (attr->flags & ATTR_FLAG_STACKADDR, 0))
 354     {
 355       uintptr_t adj;
 356
 357       /* If the user also specified the size of the stack make sure it
 358          is large enough.  */
 359       if (attr->stacksize != 0
 360           && attr->stacksize < (__static_tls_size + MINIMAL_REST_STACK))
 361         return EINVAL;
 362
 363       /* Adjust stack size for alignment of the TLS block.  */
 364 #if TLS_TCB_AT_TP
 365       adj = ((uintptr_t) attr->stackaddr - TLS_TCB_SIZE)
 366             & __static_tls_align_m1;
 367       assert (size > adj + TLS_TCB_SIZE);
 368 #elif TLS_DTV_AT_TP
 369       adj = ((uintptr_t) attr->stackaddr - __static_tls_size)
 370             & __static_tls_align_m1;
 371       assert (size > adj);
 372 #endif
 373
 374       /* The user provided some memory.  Let's hope it matches the
 375          size...  We do not allocate guard pages if the user provided
 376          the stack.  It is the user's responsibility to do this if it
 377          is wanted.  */
 378 #if TLS_TCB_AT_TP
 379       pd = (struct pthread *) ((uintptr_t) attr->stackaddr
 380                                - TLS_TCB_SIZE - adj);
 381 #elif TLS_DTV_AT_TP
 382       pd = (struct pthread *) (((uintptr_t) attr->stackaddr
 383                                 - __static_tls_size - adj)
 384                                - TLS_PRE_TCB_SIZE);
 385 #endif
 386
 387       /* The user provided stack memory needs to be cleared.  */
 388       memset (pd, '\0', sizeof (struct pthread));
 389
 390       /* The first TSD block is included in the TCB.  */
 391       pd->specific[0] = pd->specific_1stblock;
 392
 393       /* Remember the stack-related values.  */
 394       pd->stackblock = (char *) attr->stackaddr - size;
 395       pd->stackblock_size = size;
 396
 397       /* This is a user-provided stack.  It will not be queued in the
 398          stack cache nor will the memory (except the TLS memory) be freed.  */
 399       pd->user_stack = true;
 400
 401       /* This is at least the second thread.  */
 402       pd->header.multiple_threads = 1;
 403 #ifndef TLS_MULTIPLE_THREADS_IN_TCB
 404       __pthread_multiple_threads = *__libc_multiple_threads_ptr = 1;
 405 #endif
 406
 407 #ifndef __ASSUME_PRIVATE_FUTEX
 408       /* The thread must know when private futexes are supported.  */
 409       pd->header.private_futex = THREAD_GETMEM (THREAD_SELF,
 410                                                 header.private_futex);
 411 #endif
 412
 413 #ifdef NEED_DL_SYSINFO
 414       /* Copy the sysinfo value from the parent.  */
 415       THREAD_SYSINFO(pd) = THREAD_SELF_SYSINFO;
 416 #endif
 417
 418       /* The process ID is also the same as that of the caller.  */
 419       pd->pid = THREAD_GETMEM (THREAD_SELF, pid);
 420
 421       /* Allocate the DTV for this thread.  */
 422       if (_dl_allocate_tls (TLS_TPADJ (pd)) == NULL)
 423         {
 424           /* Something went wrong.  */
 425           assert (errno == ENOMEM);
 426           return EAGAIN;
 427         }
 428
 429
 430       /* Prepare to modify global data.  */
 431       lll_lock (stack_cache_lock, LLL_PRIVATE);
 432
 433       /* And add to the list of stacks in use.  */
 434       list_add (&pd->list, &__stack_user);
 435
 436       lll_unlock (stack_cache_lock, LLL_PRIVATE);
 437     }
 438   else
 439     {
 440       /* Allocate some anonymous memory.  If possible use the cache.  */
 441       size_t guardsize;
 442       size_t reqsize;
 443       void *mem;
 444       const int prot = (PROT_READ | PROT_WRITE
 445                         | ((GL(dl_stack_flags) & PF_X) ? PROT_EXEC : 0));
 446
 447 #if COLORING_INCREMENT != 0
 448       /* Add one more page for stack coloring.  Don't do it for stacks
 449          with 16 times pagesize or larger.  This might just cause
 450          unnecessary misalignment.  */
 451       if (size <= 16 * pagesize_m1)
 452         size += pagesize_m1 + 1;
 453 #endif
 454
 455       /* Adjust the stack size for alignment.  */
 456       size &= ~__static_tls_align_m1;
 457       assert (size != 0);
 458
 459       /* Make sure the size of the stack is enough for the guard and
 460          eventually the thread descriptor.  */
 461       guardsize = (attr->guardsize + pagesize_m1) & ~pagesize_m1;
 462       if (__builtin_expect (size < ((guardsize + __static_tls_size
 463                                      + MINIMAL_REST_STACK + pagesize_m1)
 464                                     & ~pagesize_m1),
 465                             0))
 466         /* The stack is too small (or the guard too large).  */
 467         return EINVAL;
 468
 469       /* Try to get a stack from the cache.  */
 470       reqsize = size;
 471       pd = get_cached_stack (&size, &mem);
 472       if (pd == NULL)
 473         {
 474           /* To avoid aliasing effects on a larger scale than pages we
 475              adjust the allocated stack size if necessary.  This way
 476              allocations directly following each other will not have
 477              aliasing problems.  */
 478 #if MULTI_PAGE_ALIASING != 0
 479           if ((size % MULTI_PAGE_ALIASING) == 0)
 480             size += pagesize_m1 + 1;
 481 #endif
 482
 483           mem = mmap (NULL, size, prot,
 484                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
 485
 486           if (__builtin_expect (mem == MAP_FAILED, 0))
 487             {
 488               if (errno == ENOMEM)
 489                 __set_errno (EAGAIN);
 490
 491                return errno;
 492             }
 493
 494           /* SIZE is guaranteed to be greater than zero.
 495              So we can never get a null pointer back from mmap.  */
 496           assert (mem != NULL);
 497
 498 #if COLORING_INCREMENT != 0
 499           /* Atomically increment NCREATED.  */
 500           unsigned int ncreated = atomic_increment_val (&nptl_ncreated);
 501
 502           /* We chose the offset for coloring by incrementing it for
 503              every new thread by a fixed amount.  The offset used
 504              module the page size.  Even if coloring would be better
 505              relative to higher alignment values it makes no sense to
 506              do it since the mmap() interface does not allow us to
 507              specify any alignment for the returned memory block.  */
 508           size_t coloring = (ncreated * COLORING_INCREMENT) & pagesize_m1;
 509
 510           /* Make sure the coloring offsets does not disturb the alignment
 511              of the TCB and static TLS block.  */
 512           if (__builtin_expect ((coloring & __static_tls_align_m1) != 0, 0))
 513             coloring = (((coloring + __static_tls_align_m1)
 514                          & ~(__static_tls_align_m1))
 515                         & ~pagesize_m1);
 516 #else
 517           /* Unless specified we do not make any adjustments.  */
 518 # define coloring 0
 519 #endif
 520
 521           /* Place the thread descriptor at the end of the stack.  */
 522 #if TLS_TCB_AT_TP
 523           pd = (struct pthread *) ((char *) mem + size - coloring) - 1;
 524 #elif TLS_DTV_AT_TP
 525           pd = (struct pthread *) ((((uintptr_t) mem + size - coloring
 526                                     - __static_tls_size)
 527                                     & ~__static_tls_align_m1)
 528                                    - TLS_PRE_TCB_SIZE);
 529 #endif
 530
 531           /* Remember the stack-related values.  */
 532           pd->stackblock = mem;
 533           pd->stackblock_size = size;
 534
 535           /* We allocated the first block thread-specific data array.
 536              This address will not change for the lifetime of this
 537              descriptor.  */
 538           pd->specific[0] = pd->specific_1stblock;
 539
 540           /* This is at least the second thread.  */
 541           pd->header.multiple_threads = 1;
 542 #ifndef TLS_MULTIPLE_THREADS_IN_TCB
 543           __pthread_multiple_threads = *__libc_multiple_threads_ptr = 1;
 544 #endif
 545
 546 #ifndef __ASSUME_PRIVATE_FUTEX
 547           /* The thread must know when private futexes are supported.  */
 548           pd->header.private_futex = THREAD_GETMEM (THREAD_SELF,
 549                                                     header.private_futex);
 550 #endif
 551
 552 #ifdef NEED_DL_SYSINFO
 553           /* Copy the sysinfo value from the parent.  */
 554           THREAD_SYSINFO(pd) = THREAD_SELF_SYSINFO;
 555 #endif
 556
 557           /* The process ID is also the same as that of the caller.  */
 558           pd->pid = THREAD_GETMEM (THREAD_SELF, pid);
 559
 560           /* Allocate the DTV for this thread.  */
 561           if (_dl_allocate_tls (TLS_TPADJ (pd)) == NULL)
 562             {
 563               /* Something went wrong.  */
 564               assert (errno == ENOMEM);
 565
 566               /* Free the stack memory we just allocated.  */
 567               (void) munmap (mem, size);
 568
 569               return EAGAIN;
 570             }
 571
 572
 573           /* Prepare to modify global data.  */
 574           lll_lock (stack_cache_lock, LLL_PRIVATE);
 575
 576           /* And add to the list of stacks in use.  */
 577           stack_list_add (&pd->list, &stack_used);
 578
 579           lll_unlock (stack_cache_lock, LLL_PRIVATE);
 580
 581
 582           /* There might have been a race.  Another thread might have
 583              caused the stacks to get exec permission while this new
 584              stack was prepared.  Detect if this was possible and
 585              change the permission if necessary.  */
 586           if (__builtin_expect ((GL(dl_stack_flags) & PF_X) != 0
 587                                 && (prot & PROT_EXEC) == 0, 0))
 588             {
 589               int err = change_stack_perm (pd
 590 #ifdef NEED_SEPARATE_REGISTER_STACK
 591                                            , ~pagesize_m1
 592 #endif
 593                                            );
 594               if (err != 0)
 595                 {
 596                   /* Free the stack memory we just allocated.  */
 597                   (void) munmap (mem, size);
 598
 599                   return err;
 600                 }
 601             }
 602
 603
 604           /* Note that all of the stack and the thread descriptor is
 605              zeroed.  This means we do not have to initialize fields
 606              with initial value zero.  This is specifically true for
 607              the 'tid' field which is always set back to zero once the
 608              stack is not used anymore and for the 'guardsize' field
 609              which will be read next.  */
 610         }
 611
 612       /* Create or resize the guard area if necessary.  */
 613       if (__builtin_expect (guardsize > pd->guardsize, 0))
 614         {
 615 #ifdef NEED_SEPARATE_REGISTER_STACK
 616           char *guard = mem + (((size - guardsize) / 2) & ~pagesize_m1);
 617 #elif _STACK_GROWS_DOWN
 618           char *guard = mem;
 619 # elif _STACK_GROWS_UP
 620           char *guard = (char *) (((uintptr_t) pd - guardsize) & ~pagesize_m1);
 621 #endif
 622           if (mprotect (guard, guardsize, PROT_NONE) != 0)
 623             {
 624               int err;
 625             mprot_error:
 626               err = errno;
 627
 628               lll_lock (stack_cache_lock, LLL_PRIVATE);
 629
 630               /* Remove the thread from the list.  */
 631               stack_list_del (&pd->list);
 632
 633               lll_unlock (stack_cache_lock, LLL_PRIVATE);
 634
 635               /* Get rid of the TLS block we allocated.  */
 636               _dl_deallocate_tls (TLS_TPADJ (pd), false);
 637
 638               /* Free the stack memory regardless of whether the size
 639                  of the cache is over the limit or not.  If this piece
 640                  of memory caused problems we better do not use it
 641                  anymore.  Uh, and we ignore possible errors.  There
 642                  is nothing we could do.  */
 643               (void) munmap (mem, size);
 644
 645               return err;
 646             }
 647
 648           pd->guardsize = guardsize;
 649         }
 650       else if (__builtin_expect (pd->guardsize - guardsize > size - reqsize,
 651                                  0))
 652         {
 653           /* The old guard area is too large.  */
 654
 655 #ifdef NEED_SEPARATE_REGISTER_STACK
 656           char *guard = mem + (((size - guardsize) / 2) & ~pagesize_m1);
 657           char *oldguard = mem + (((size - pd->guardsize) / 2) & ~pagesize_m1);
 658
 659           if (oldguard < guard
 660               && mprotect (oldguard, guard - oldguard, prot) != 0)
 661             goto mprot_error;
 662
 663           if (mprotect (guard + guardsize,
 664                         oldguard + pd->guardsize - guard - guardsize,
 665                         prot) != 0)
 666             goto mprot_error;
 667 #elif _STACK_GROWS_DOWN
 668           if (mprotect ((char *) mem + guardsize, pd->guardsize - guardsize,
 669                         prot) != 0)
 670             goto mprot_error;
 671 #elif _STACK_GROWS_UP
 672           if (mprotect ((char *) pd - pd->guardsize,
 673                         pd->guardsize - guardsize, prot) != 0)
 674             goto mprot_error;
 675 #endif
 676
 677           pd->guardsize = guardsize;
 678         }
 679       /* The pthread_getattr_np() calls need to get passed the size
 680          requested in the attribute, regardless of how large the
 681          actually used guardsize is.  */
 682       pd->reported_guardsize = guardsize;
 683     }
 684
 685   /* Initialize the lock.  We have to do this unconditionally since the
 686      stillborn thread could be canceled while the lock is taken.  */
 687   pd->lock = LLL_LOCK_INITIALIZER;
 688
 689   /* The robust mutex lists also need to be initialized
 690      unconditionally because the cleanup for the previous stack owner
 691      might have happened in the kernel.  */
 692   pd->robust_head.futex_offset = (offsetof (pthread_mutex_t, __data.__lock)
 693                                   - offsetof (pthread_mutex_t,
 694                                               __data.__list.__next));
 695   pd->robust_head.list_op_pending = NULL;
 696 #ifdef __PTHREAD_MUTEX_HAVE_PREV
 697   pd->robust_prev = &pd->robust_head;
 698 #endif
 699   pd->robust_head.list = &pd->robust_head;
 700
 701   /* We place the thread descriptor at the end of the stack.  */
 702   *pdp = pd;
 703
 704 #if TLS_TCB_AT_TP
 705   /* The stack begins before the TCB and the static TLS block.  */
 706   stacktop = ((char *) (pd + 1) - __static_tls_size);
 707 #elif TLS_DTV_AT_TP
 708   stacktop = (char *) (pd - 1);
 709 #endif
 710
 711 #ifdef NEED_SEPARATE_REGISTER_STACK
 712   *stack = pd->stackblock;
 713   *stacksize = stacktop - *stack;
 714 #elif _STACK_GROWS_DOWN
 715   *stack = stacktop;
 716 #elif _STACK_GROWS_UP
 717   *stack = pd->stackblock;
 718   assert (*stack > 0);
 719 #endif
 720
 721   return 0;
 722 }
 723
 724
 725 void
 726 internal_function
 727 __deallocate_stack (struct pthread *pd)
 728 {
 729   lll_lock (stack_cache_lock, LLL_PRIVATE);
 730
 731   /* Remove the thread from the list of threads with user defined
 732      stacks.  */
 733   stack_list_del (&pd->list);
 734
 735   /* Not much to do.  Just free the mmap()ed memory.  Note that we do
 736      not reset the 'used' flag in the 'tid' field.  This is done by
 737      the kernel.  If no thread has been created yet this field is
 738      still zero.  */
 739   if (__builtin_expect (! pd->user_stack, 1))
 740     (void) queue_stack (pd);
 741   else
 742     /* Free the memory associated with the ELF TLS.  */
 743     _dl_deallocate_tls (TLS_TPADJ (pd), false);
 744
 745   lll_unlock (stack_cache_lock, LLL_PRIVATE);
 746 }
 747
 748
 749 int
 750 internal_function
 751 __make_stacks_executable (void **stack_endp)
 752 {
 753   /* First the main thread's stack.  */
 754   int err = _dl_make_stack_executable (stack_endp);
 755   if (err != 0)
 756     return err;
 757
 758 #ifdef NEED_SEPARATE_REGISTER_STACK
 759   const size_t pagemask = ~(__getpagesize () - 1);
 760 #endif
 761
 762   lll_lock (stack_cache_lock, LLL_PRIVATE);
 763
 764   list_t *runp;
 765   list_for_each (runp, &stack_used)
 766     {
 767       err = change_stack_perm (list_entry (runp, struct pthread, list)
 768 #ifdef NEED_SEPARATE_REGISTER_STACK
 769                                , pagemask
 770 #endif
 771                                );
 772       if (err != 0)
 773         break;
 774     }
 775
 776   /* Also change the permission for the currently unused stacks.  This
 777      might be wasted time but better spend it here than adding a check
 778      in the fast path.  */
 779   if (err == 0)
 780     list_for_each (runp, &stack_cache)
 781       {
 782         err = change_stack_perm (list_entry (runp, struct pthread, list)
 783 #ifdef NEED_SEPARATE_REGISTER_STACK
 784                                  , pagemask
 785 #endif
 786                                  );
 787         if (err != 0)
 788           break;
 789       }
 790
 791   lll_unlock (stack_cache_lock, LLL_PRIVATE);
 792
 793   return err;
 794 }
 795
 796
 797 /* In case of a fork() call the memory allocation in the child will be
 798    the same but only one thread is running.  All stacks except that of
 799    the one running thread are not used anymore.  We have to recycle
 800    them.  */
 801 void
 802 __reclaim_stacks (void)
 803 {
 804   struct pthread *self = (struct pthread *) THREAD_SELF;
 805
 806   /* No locking necessary.  The caller is the only stack in use.  But
 807      we have to be aware that we might have interrupted a list
 808      operation.  */
 809
 810   if (in_flight_stack != 0)
 811     {
 812       bool add_p = in_flight_stack & 1;
 813       list_t *elem = (list_t *) (in_flight_stack & ~UINTMAX_C (1));
 814
 815       if (add_p)
 816         {
 817           /* We always add at the beginning of the list.  So in this
 818              case we only need to check the beginning of these lists.  */
 819           int check_list (list_t *l)
 820           {
 821             if (l->next->prev != l)
 822               {
 823                 assert (l->next->prev == elem);
 824
 825                 elem->next = l->next;
 826                 elem->prev = l;
 827                 l->next = elem;
 828
 829                 return 1;
 830               }
 831
 832             return 0;
 833           }
 834
 835           if (check_list (&stack_used) == 0)
 836             (void) check_list (&stack_cache);
 837         }
 838       else
 839         {
 840           /* We can simply always replay the delete operation.  */
 841           elem->next->prev = elem->prev;
 842           elem->prev->next = elem->next;
 843         }
 844     }
 845
 846   /* Mark all stacks except the still running one as free.  */
 847   list_t *runp;
 848   list_for_each (runp, &stack_used)
 849     {
 850       struct pthread *curp = list_entry (runp, struct pthread, list);
 851       if (curp != self)
 852         {
 853           /* This marks the stack as free.  */
 854           curp->tid = 0;
 855
 856           /* The PID field must be initialized for the new process.  */
 857           curp->pid = self->pid;
 858
 859           /* Account for the size of the stack.  */
 860           stack_cache_actsize += curp->stackblock_size;
 861
 862           if (curp->specific_used)
 863             {
 864               /* Clear the thread-specific data.  */
 865               memset (curp->specific_1stblock, '\0',
 866                       sizeof (curp->specific_1stblock));
 867
 868               curp->specific_used = false;
 869
 870               for (size_t cnt = 1; cnt < PTHREAD_KEY_1STLEVEL_SIZE; ++cnt)
 871                 if (curp->specific[cnt] != NULL)
 872                   {
 873                     memset (curp->specific[cnt], '\0',
 874                             sizeof (curp->specific_1stblock));
 875
 876                     /* We have allocated the block which we do not
 877                        free here so re-set the bit.  */
 878                     curp->specific_used = true;
 879                   }
 880             }
 881         }
 882     }
 883
 884   /* Reset the PIDs in any cached stacks.  */
 885   list_for_each (runp, &stack_cache)
 886     {
 887       struct pthread *curp = list_entry (runp, struct pthread, list);
 888       curp->pid = self->pid;
 889     }
 890
 891   /* Add the stack of all running threads to the cache.  */
 892   list_splice (&stack_used, &stack_cache);
 893
  /* Remove the entry for the current thread from the cache list
 895      and add it to the list of running threads.  Which of the two
 896      lists is decided by the user_stack flag.  */
 897   stack_list_del (&self->list);
 898
 899   /* Re-initialize the lists for all the threads.  */
 900   INIT_LIST_HEAD (&stack_used);
 901   INIT_LIST_HEAD (&__stack_user);
 902
 903   if (__builtin_expect (THREAD_GETMEM (self, user_stack), 0))
 904     list_add (&self->list, &__stack_user);
 905   else
 906     list_add (&self->list, &stack_used);
 907
 908   /* There is one thread running.  */
 909   __nptl_nthreads = 1;
 910
 911   in_flight_stack = 0;
 912
 913   /* Initialize the lock.  */
 914   stack_cache_lock = LLL_LOCK_INITIALIZER;
 915 }
 916
 917
 918 #if HP_TIMING_AVAIL
 919 # undef __find_thread_by_id
 920 /* Find a thread given the thread ID.  */
 921 attribute_hidden
 922 struct pthread *
 923 __find_thread_by_id (pid_t tid)
 924 {
 925   struct pthread *result = NULL;
 926
 927   lll_lock (stack_cache_lock, LLL_PRIVATE);
 928
 929   /* Iterate over the list with system-allocated threads first.  */
 930   list_t *runp;
 931   list_for_each (runp, &stack_used)
 932     {
 933       struct pthread *curp;
 934
 935       curp = list_entry (runp, struct pthread, list);
 936
 937       if (curp->tid == tid)
 938         {
 939           result = curp;
 940           goto out;
 941         }
 942     }
 943
 944   /* Now the list with threads using user-allocated stacks.  */
 945   list_for_each (runp, &__stack_user)
 946     {
 947       struct pthread *curp;
 948
 949       curp = list_entry (runp, struct pthread, list);
 950
 951       if (curp->tid == tid)
 952         {
 953           result = curp;
 954           goto out;
 955         }
 956     }
 957
 958  out:
 959   lll_unlock (stack_cache_lock, LLL_PRIVATE);
 960
 961   return result;
 962 }
 963 #endif
 964
 965
 966 static void
 967 internal_function
 968 setxid_signal_thread (struct xid_command *cmdp, struct pthread *t)
 969 {
 970   if (! IS_DETACHED (t))
 971     {
 972       int ch;
 973       do
 974         {
 975           ch = t->cancelhandling;
 976
 977           /* If the thread is exiting right now, ignore it.  */
 978           if ((ch & EXITING_BITMASK) != 0)
 979             return;
 980         }
 981       while (atomic_compare_and_exchange_bool_acq (&t->cancelhandling,
 982                                                    ch | SETXID_BITMASK, ch));
 983     }
 984
 985   int val;
 986   INTERNAL_SYSCALL_DECL (err);
 987 #if __ASSUME_TGKILL
 988   val = INTERNAL_SYSCALL (tgkill, err, 3, THREAD_GETMEM (THREAD_SELF, pid),
 989                           t->tid, SIGSETXID);
 990 #else
 991 # ifdef __NR_tgkill
 992   val = INTERNAL_SYSCALL (tgkill, err, 3, THREAD_GETMEM (THREAD_SELF, pid),
 993                           t->tid, SIGSETXID);
 994   if (INTERNAL_SYSCALL_ERROR_P (val, err)
 995       && INTERNAL_SYSCALL_ERRNO (val, err) == ENOSYS)
 996 # endif
 997     val = INTERNAL_SYSCALL (tkill, err, 2, t->tid, SIGSETXID);
 998 #endif
 999
1000   if (!INTERNAL_SYSCALL_ERROR_P (val, err))
1001     atomic_increment (&cmdp->cntr);
1002 }
1003
1004
1005 int
1006 attribute_hidden
1007 __nptl_setxid (struct xid_command *cmdp)
1008 {
1009   int result;
1010   lll_lock (stack_cache_lock, LLL_PRIVATE);
1011
1012   __xidcmd = cmdp;
1013   cmdp->cntr = 0;
1014
1015   struct pthread *self = THREAD_SELF;
1016
1017   /* Iterate over the list with system-allocated threads first.  */
1018   list_t *runp;
1019   list_for_each (runp, &stack_used)
1020     {
1021       struct pthread *t = list_entry (runp, struct pthread, list);
1022       if (t == self)
1023         continue;
1024
1025       setxid_signal_thread (cmdp, t);
1026     }
1027
1028   /* Now the list with threads using user-allocated stacks.  */
1029   list_for_each (runp, &__stack_user)
1030     {
1031       struct pthread *t = list_entry (runp, struct pthread, list);
1032       if (t == self)
1033         continue;
1034
1035       setxid_signal_thread (cmdp, t);
1036     }
1037
1038   int cur = cmdp->cntr;
1039   while (cur != 0)
1040     {
1041       lll_futex_wait (&cmdp->cntr, cur, LLL_PRIVATE);
1042       cur = cmdp->cntr;
1043     }
1044
1045   /* This must be last, otherwise the current thread might not have
1046      permissions to send SIGSETXID syscall to the other threads.  */
1047   INTERNAL_SYSCALL_DECL (err);
1048   result = INTERNAL_SYSCALL_NCS (cmdp->syscall_no, err, 3,
1049                                  cmdp->id[0], cmdp->id[1], cmdp->id[2]);
1050   if (INTERNAL_SYSCALL_ERROR_P (result, err))
1051     {
1052       __set_errno (INTERNAL_SYSCALL_ERRNO (result, err));
1053       result = -1;
1054     }
1055
1056   lll_unlock (stack_cache_lock, LLL_PRIVATE);
1057   return result;
1058 }
1059
1060 static inline void __attribute__((always_inline))
1061 init_one_static_tls (struct pthread *curp, struct link_map *map)
1062 {
1063   dtv_t *dtv = GET_DTV (TLS_TPADJ (curp));
1064 # if TLS_TCB_AT_TP
1065   void *dest = (char *) curp - map->l_tls_offset;
1066 # elif TLS_DTV_AT_TP
1067   void *dest = (char *) curp + map->l_tls_offset + TLS_PRE_TCB_SIZE;
1068 # else
1069 #  error "Either TLS_TCB_AT_TP or TLS_DTV_AT_TP must be defined"
1070 # endif
1071
1072   /* Fill in the DTV slot so that a later LD/GD access will find it.  */
1073   dtv[map->l_tls_modid].pointer.val = dest;
1074   dtv[map->l_tls_modid].pointer.is_static = true;
1075
1076   /* Initialize the memory.  */
1077   memset (__mempcpy (dest, map->l_tls_initimage, map->l_tls_initimage_size),
1078           '\0', map->l_tls_blocksize - map->l_tls_initimage_size);
1079 }
1080
1081 void
1082 attribute_hidden
1083 __pthread_init_static_tls (struct link_map *map)
1084 {
1085   lll_lock (stack_cache_lock, LLL_PRIVATE);
1086
1087   /* Iterate over the list with system-allocated threads first.  */
1088   list_t *runp;
1089   list_for_each (runp, &stack_used)
1090     init_one_static_tls (list_entry (runp, struct pthread, list), map);
1091
1092   /* Now the list with threads using user-allocated stacks.  */
1093   list_for_each (runp, &__stack_user)
1094     init_one_static_tls (list_entry (runp, struct pthread, list), map);
1095
1096   lll_unlock (stack_cache_lock, LLL_PRIVATE);
1097 }
1098
1099
1100 void
1101 attribute_hidden
1102 __wait_lookup_done (void)
1103 {
1104   lll_lock (stack_cache_lock, LLL_PRIVATE);
1105
1106   struct pthread *self = THREAD_SELF;
1107
1108   /* Iterate over the list with system-allocated threads first.  */
1109   list_t *runp;
1110   list_for_each (runp, &stack_used)
1111     {
1112       struct pthread *t = list_entry (runp, struct pthread, list);
1113       if (t == self || t->header.gscope_flag == THREAD_GSCOPE_FLAG_UNUSED)
1114         continue;
1115
1116       int *const gscope_flagp = &t->header.gscope_flag;
1117
1118       /* We have to wait until this thread is done with the global
1119          scope.  First tell the thread that we are waiting and
1120          possibly have to be woken.  */
1121       if (atomic_compare_and_exchange_bool_acq (gscope_flagp,
1122                                                 THREAD_GSCOPE_FLAG_WAIT,
1123                                                 THREAD_GSCOPE_FLAG_USED))
1124         continue;
1125
1126       do
1127         lll_futex_wait (gscope_flagp, THREAD_GSCOPE_FLAG_WAIT, LLL_PRIVATE);
1128       while (*gscope_flagp == THREAD_GSCOPE_FLAG_WAIT);
1129     }
1130
1131   /* Now the list with threads using user-allocated stacks.  */
1132   list_for_each (runp, &__stack_user)
1133     {
1134       struct pthread *t = list_entry (runp, struct pthread, list);
1135       if (t == self || t->header.gscope_flag == THREAD_GSCOPE_FLAG_UNUSED)
1136         continue;
1137
1138       int *const gscope_flagp = &t->header.gscope_flag;
1139
1140       /* We have to wait until this thread is done with the global
1141          scope.  First tell the thread that we are waiting and
1142          possibly have to be woken.  */
1143       if (atomic_compare_and_exchange_bool_acq (gscope_flagp,
1144                                                 THREAD_GSCOPE_FLAG_WAIT,
1145                                                 THREAD_GSCOPE_FLAG_USED))
1146         continue;
1147
1148       do
1149         lll_futex_wait (gscope_flagp, THREAD_GSCOPE_FLAG_WAIT, LLL_PRIVATE);
1150       while (*gscope_flagp == THREAD_GSCOPE_FLAG_WAIT);
1151     }
1152
1153   lll_unlock (stack_cache_lock, LLL_PRIVATE);
1154 }