nptl/allocatestack.c

   1 /* Copyright (C) 2002-2012 Free Software Foundation, Inc.
   2    This file is part of the GNU C Library.
   3    Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <http://www.gnu.org/licenses/>.  */
  18
  19 #include <assert.h>
  20 #include <errno.h>
  21 #include <signal.h>
  22 #include <stdint.h>
  23 #include <string.h>
  24 #include <unistd.h>
  25 #include <sys/mman.h>
  26 #include <sys/param.h>
  27 #include <dl-sysdep.h>
  28 #include <dl-tls.h>
  29 #include <tls.h>
  30 #include <list.h>
  31 #include <lowlevellock.h>
  32 #include <kernel-features.h>
  33
  34
  35 #ifndef NEED_SEPARATE_REGISTER_STACK
  36
  37 /* Most architectures have exactly one stack pointer.  Some have more.  */
  38 # define STACK_VARIABLES void *stackaddr = NULL
  39
  40 /* How to pass the values to the 'create_thread' function.  */
  41 # define STACK_VARIABLES_ARGS stackaddr
  42
  43 /* How to declare function which gets there parameters.  */
  44 # define STACK_VARIABLES_PARMS void *stackaddr
  45
  46 /* How to declare allocate_stack.  */
  47 # define ALLOCATE_STACK_PARMS void **stack
  48
  49 /* This is how the function is called.  We do it this way to allow
  50    other variants of the function to have more parameters.  */
  51 # define ALLOCATE_STACK(attr, pd) allocate_stack (attr, pd, &stackaddr)
  52
  53 #else
  54
  55 /* We need two stacks.  The kernel will place them but we have to tell
  56    the kernel about the size of the reserved address space.  */
  57 # define STACK_VARIABLES void *stackaddr = NULL; size_t stacksize = 0
  58
  59 /* How to pass the values to the 'create_thread' function.  */
  60 # define STACK_VARIABLES_ARGS stackaddr, stacksize
  61
  62 /* How to declare function which gets there parameters.  */
  63 # define STACK_VARIABLES_PARMS void *stackaddr, size_t stacksize
  64
  65 /* How to declare allocate_stack.  */
  66 # define ALLOCATE_STACK_PARMS void **stack, size_t *stacksize
  67
  68 /* This is how the function is called.  We do it this way to allow
  69    other variants of the function to have more parameters.  */
  70 # define ALLOCATE_STACK(attr, pd) \
  71   allocate_stack (attr, pd, &stackaddr, &stacksize)
  72
  73 #endif
  74
  75
  76 /* Default alignment of stack.  */
  77 #ifndef STACK_ALIGN
  78 # define STACK_ALIGN __alignof__ (long double)
  79 #endif
  80
  81 /* Default value for minimal stack size after allocating thread
  82    descriptor and guard.  */
  83 #ifndef MINIMAL_REST_STACK
  84 # define MINIMAL_REST_STACK     4096
  85 #endif
  86
  87
  88 /* Newer kernels have the MAP_STACK flag to indicate a mapping is used for
  89    a stack.  Use it when possible.  */
  90 #ifndef MAP_STACK
  91 # define MAP_STACK 0
  92 #endif
  93
  94 /* This yields the pointer that TLS support code calls the thread pointer.  */
  95 #if TLS_TCB_AT_TP
  96 # define TLS_TPADJ(pd) (pd)
  97 #elif TLS_DTV_AT_TP
  98 # define TLS_TPADJ(pd) ((struct pthread *)((char *) (pd) + TLS_PRE_TCB_SIZE))
  99 #endif
 100
/* Cache handling for not-yet free stacks.  */

/* Maximum total size of the cache, in bytes.  It is compared against
   stack_cache_actsize, which is the sum of the stackblock_size values
   of the cached stacks.  */
static size_t stack_cache_maxsize = 40 * 1024 * 1024; /* 40 MiB by default.  */
/* Current total size of the cache, in bytes.  */
static size_t stack_cache_actsize;

/* Lock taken by the functions in this file while they modify
   stack_cache, stack_used, __stack_user or stack_cache_actsize.  */
static int stack_cache_lock = LLL_LOCK_INITIALIZER;

/* List of queued (cached) stacks.  An entry may only be reused or
   unmapped once FREE_P is true for it.  */
static LIST_HEAD (stack_cache);

/* List of the stacks in use.  */
static LIST_HEAD (stack_used);

/* We need to record what list operations we are going to do so that,
   in case of an asynchronous interruption due to a fork() call, we
   can correct for the work.  While stack_list_add or stack_list_del
   is running this holds the address of the element being handled,
   with the lowest bit set for an addition; otherwise it is zero.  */
static uintptr_t in_flight_stack;

/* List of the threads with user provided stacks in use.  No need to
   initialize this, since it's done in __pthread_initialize_minimal.  */
list_t __stack_user __attribute__ ((nocommon));
hidden_data_def (__stack_user)

#if COLORING_INCREMENT != 0
/* Number of threads created.  Used to derive the coloring offset of
   newly mapped stacks.  */
static unsigned int nptl_ncreated;
#endif


/* Check whether the stack is still used or not.  A descriptor whose
   'tid' field is zero or negative counts as free.  */
#define FREE_P(descr) ((descr)->tid <= 0)
 134
 135
/* Unlink ELEM from the list it is currently on.

   Before the list is touched the address of ELEM is stored in
   in_flight_stack (lowest bit clear, which marks a removal), and the
   variable is reset to zero afterwards.  __reclaim_stacks inspects it
   in the child after a fork() so that a removal which was interrupted
   half-way can be replayed.  The atomic_write_barrier calls separate
   the three steps: publish, modify the list, clear.  */
static void
stack_list_del (list_t *elem)
{
  in_flight_stack = (uintptr_t) elem;

  atomic_write_barrier ();

  list_del (elem);

  atomic_write_barrier ();

  in_flight_stack = 0;
}
 149
 150
/* Insert ELEM into LIST.

   Before the list is touched the address of ELEM, with the lowest bit
   set to mark an addition, is stored in in_flight_stack, and the
   variable is reset to zero afterwards.  __reclaim_stacks inspects it
   in the child after a fork() so that an insertion which was
   interrupted half-way can be completed.  The atomic_write_barrier
   calls separate the three steps: publish, modify the list, clear.  */
static void
stack_list_add (list_t *elem, list_t *list)
{
  in_flight_stack = (uintptr_t) elem | 1;

  atomic_write_barrier ();

  list_add (elem, list);

  atomic_write_barrier ();

  in_flight_stack = 0;
}
 164
 165
/* All cache entries are kept on a doubly linked list.  It is doubly
   linked because this allows removing entries from the end.  */
 168
 169
 170 /* Get a stack frame from the cache.  We have to match by size since
 171    some blocks might be too small or far too large.  */
 172 static struct pthread *
 173 get_cached_stack (size_t *sizep, void **memp)
 174 {
 175   size_t size = *sizep;
 176   struct pthread *result = NULL;
 177   list_t *entry;
 178
 179   lll_lock (stack_cache_lock, LLL_PRIVATE);
 180
 181   /* Search the cache for a matching entry.  We search for the
 182      smallest stack which has at least the required size.  Note that
 183      in normal situations the size of all allocated stacks is the
 184      same.  As the very least there are only a few different sizes.
 185      Therefore this loop will exit early most of the time with an
 186      exact match.  */
 187   list_for_each (entry, &stack_cache)
 188     {
 189       struct pthread *curr;
 190
 191       curr = list_entry (entry, struct pthread, list);
 192       if (FREE_P (curr) && curr->stackblock_size >= size)
 193         {
 194           if (curr->stackblock_size == size)
 195             {
 196               result = curr;
 197               break;
 198             }
 199
 200           if (result == NULL
 201               || result->stackblock_size > curr->stackblock_size)
 202             result = curr;
 203         }
 204     }
 205
 206   if (__builtin_expect (result == NULL, 0)
 207       /* Make sure the size difference is not too excessive.  In that
 208          case we do not use the block.  */
 209       || __builtin_expect (result->stackblock_size > 4 * size, 0))
 210     {
 211       /* Release the lock.  */
 212       lll_unlock (stack_cache_lock, LLL_PRIVATE);
 213
 214       return NULL;
 215     }
 216
 217   /* Don't allow setxid until cloned.  */
 218   result->setxid_futex = -1;
 219
 220   /* Dequeue the entry.  */
 221   stack_list_del (&result->list);
 222
 223   /* And add to the list of stacks in use.  */
 224   stack_list_add (&result->list, &stack_used);
 225
 226   /* And decrease the cache size.  */
 227   stack_cache_actsize -= result->stackblock_size;
 228
 229   /* Release the lock early.  */
 230   lll_unlock (stack_cache_lock, LLL_PRIVATE);
 231
 232   /* Report size and location of the stack to the caller.  */
 233   *sizep = result->stackblock_size;
 234   *memp = result->stackblock;
 235
 236   /* Cancellation handling is back to the default.  */
 237   result->cancelhandling = 0;
 238   result->cleanup = NULL;
 239
 240   /* No pending event.  */
 241   result->nextevent = NULL;
 242
 243   /* Clear the DTV.  */
 244   dtv_t *dtv = GET_DTV (TLS_TPADJ (result));
 245   for (size_t cnt = 0; cnt < dtv[-1].counter; ++cnt)
 246     if (! dtv[1 + cnt].pointer.is_static
 247         && dtv[1 + cnt].pointer.val != TLS_DTV_UNALLOCATED)
 248       free (dtv[1 + cnt].pointer.val);
 249   memset (dtv, '\0', (dtv[-1].counter + 1) * sizeof (dtv_t));
 250
 251   /* Re-initialize the TLS.  */
 252   _dl_allocate_tls_init (TLS_TPADJ (result));
 253
 254   return result;
 255 }
 256
 257
 258 /* Free stacks until cache size is lower than LIMIT.  */
 259 void
 260 __free_stacks (size_t limit)
 261 {
 262   /* We reduce the size of the cache.  Remove the last entries until
 263      the size is below the limit.  */
 264   list_t *entry;
 265   list_t *prev;
 266
 267   /* Search from the end of the list.  */
 268   list_for_each_prev_safe (entry, prev, &stack_cache)
 269     {
 270       struct pthread *curr;
 271
 272       curr = list_entry (entry, struct pthread, list);
 273       if (FREE_P (curr))
 274         {
 275           /* Unlink the block.  */
 276           stack_list_del (entry);
 277
 278           /* Account for the freed memory.  */
 279           stack_cache_actsize -= curr->stackblock_size;
 280
 281           /* Free the memory associated with the ELF TLS.  */
 282           _dl_deallocate_tls (TLS_TPADJ (curr), false);
 283
 284           /* Remove this block.  This should never fail.  If it does
 285              something is really wrong.  */
 286           if (munmap (curr->stackblock, curr->stackblock_size) != 0)
 287             abort ();
 288
 289           /* Maybe we have freed enough.  */
 290           if (stack_cache_actsize <= limit)
 291             break;
 292         }
 293     }
 294 }
 295
 296
 297 /* Add a stack frame which is not used anymore to the stack.  Must be
 298    called with the cache lock held.  */
 299 static inline void
 300 __attribute ((always_inline))
 301 queue_stack (struct pthread *stack)
 302 {
 303   /* We unconditionally add the stack to the list.  The memory may
 304      still be in use but it will not be reused until the kernel marks
 305      the stack as not used anymore.  */
 306   stack_list_add (&stack->list, &stack_cache);
 307
 308   stack_cache_actsize += stack->stackblock_size;
 309   if (__builtin_expect (stack_cache_actsize > stack_cache_maxsize, 0))
 310     __free_stacks (stack_cache_maxsize);
 311 }
 312
 313
 314 static int
 315 internal_function
 316 change_stack_perm (struct pthread *pd
 317 #ifdef NEED_SEPARATE_REGISTER_STACK
 318                    , size_t pagemask
 319 #endif
 320                    )
 321 {
 322 #ifdef NEED_SEPARATE_REGISTER_STACK
 323   void *stack = (pd->stackblock
 324                  + (((((pd->stackblock_size - pd->guardsize) / 2)
 325                       & pagemask) + pd->guardsize) & pagemask));
 326   size_t len = pd->stackblock + pd->stackblock_size - stack;
 327 #elif _STACK_GROWS_DOWN
 328   void *stack = pd->stackblock + pd->guardsize;
 329   size_t len = pd->stackblock_size - pd->guardsize;
 330 #elif _STACK_GROWS_UP
 331   void *stack = pd->stackblock;
 332   size_t len = (uintptr_t) pd - pd->guardsize - (uintptr_t) pd->stackblock;
 333 #else
 334 # error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
 335 #endif
 336   if (mprotect (stack, len, PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
 337     return errno;
 338
 339   return 0;
 340 }
 341
 342
 343 static int
 344 allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
 345                 ALLOCATE_STACK_PARMS)
 346 {
 347   struct pthread *pd;
 348   size_t size;
 349   size_t pagesize_m1 = __getpagesize () - 1;
 350   void *stacktop;
 351
 352   assert (attr != NULL);
 353   assert (powerof2 (pagesize_m1 + 1));
 354   assert (TCB_ALIGNMENT >= STACK_ALIGN);
 355
 356   /* Get the stack size from the attribute if it is set.  Otherwise we
 357      use the default we determined at start time.  */
 358   size = attr->stacksize ?: __default_stacksize;
 359
 360   /* Get memory for the stack.  */
 361   if (__builtin_expect (attr->flags & ATTR_FLAG_STACKADDR, 0))
 362     {
 363       uintptr_t adj;
 364
 365       /* If the user also specified the size of the stack make sure it
 366          is large enough.  */
 367       if (attr->stacksize != 0
 368           && attr->stacksize < (__static_tls_size + MINIMAL_REST_STACK))
 369         return EINVAL;
 370
 371       /* Adjust stack size for alignment of the TLS block.  */
 372 #if TLS_TCB_AT_TP
 373       adj = ((uintptr_t) attr->stackaddr - TLS_TCB_SIZE)
 374             & __static_tls_align_m1;
 375       assert (size > adj + TLS_TCB_SIZE);
 376 #elif TLS_DTV_AT_TP
 377       adj = ((uintptr_t) attr->stackaddr - __static_tls_size)
 378             & __static_tls_align_m1;
 379       assert (size > adj);
 380 #endif
 381
 382       /* The user provided some memory.  Let's hope it matches the
 383          size...  We do not allocate guard pages if the user provided
 384          the stack.  It is the user's responsibility to do this if it
 385          is wanted.  */
 386 #if TLS_TCB_AT_TP
 387       pd = (struct pthread *) ((uintptr_t) attr->stackaddr
 388                                - TLS_TCB_SIZE - adj);
 389 #elif TLS_DTV_AT_TP
 390       pd = (struct pthread *) (((uintptr_t) attr->stackaddr
 391                                 - __static_tls_size - adj)
 392                                - TLS_PRE_TCB_SIZE);
 393 #endif
 394
 395       /* The user provided stack memory needs to be cleared.  */
 396       memset (pd, '\0', sizeof (struct pthread));
 397
 398       /* The first TSD block is included in the TCB.  */
 399       pd->specific[0] = pd->specific_1stblock;
 400
 401       /* Remember the stack-related values.  */
 402       pd->stackblock = (char *) attr->stackaddr - size;
 403       pd->stackblock_size = size;
 404
 405       /* This is a user-provided stack.  It will not be queued in the
 406          stack cache nor will the memory (except the TLS memory) be freed.  */
 407       pd->user_stack = true;
 408
 409       /* This is at least the second thread.  */
 410       pd->header.multiple_threads = 1;
 411 #ifndef TLS_MULTIPLE_THREADS_IN_TCB
 412       __pthread_multiple_threads = *__libc_multiple_threads_ptr = 1;
 413 #endif
 414
 415 #ifndef __ASSUME_PRIVATE_FUTEX
 416       /* The thread must know when private futexes are supported.  */
 417       pd->header.private_futex = THREAD_GETMEM (THREAD_SELF,
 418                                                 header.private_futex);
 419 #endif
 420
 421 #ifdef NEED_DL_SYSINFO
 422       /* Copy the sysinfo value from the parent.  */
 423       THREAD_SYSINFO(pd) = THREAD_SELF_SYSINFO;
 424 #endif
 425
 426       /* The process ID is also the same as that of the caller.  */
 427       pd->pid = THREAD_GETMEM (THREAD_SELF, pid);
 428
 429       /* Don't allow setxid until cloned.  */
 430       pd->setxid_futex = -1;
 431
 432       /* Allocate the DTV for this thread.  */
 433       if (_dl_allocate_tls (TLS_TPADJ (pd)) == NULL)
 434         {
 435           /* Something went wrong.  */
 436           assert (errno == ENOMEM);
 437           return errno;
 438         }
 439
 440
 441       /* Prepare to modify global data.  */
 442       lll_lock (stack_cache_lock, LLL_PRIVATE);
 443
 444       /* And add to the list of stacks in use.  */
 445       list_add (&pd->list, &__stack_user);
 446
 447       lll_unlock (stack_cache_lock, LLL_PRIVATE);
 448     }
 449   else
 450     {
 451       /* Allocate some anonymous memory.  If possible use the cache.  */
 452       size_t guardsize;
 453       size_t reqsize;
 454       void *mem;
 455       const int prot = (PROT_READ | PROT_WRITE
 456                         | ((GL(dl_stack_flags) & PF_X) ? PROT_EXEC : 0));
 457
 458 #if COLORING_INCREMENT != 0
 459       /* Add one more page for stack coloring.  Don't do it for stacks
 460          with 16 times pagesize or larger.  This might just cause
 461          unnecessary misalignment.  */
 462       if (size <= 16 * pagesize_m1)
 463         size += pagesize_m1 + 1;
 464 #endif
 465
 466       /* Adjust the stack size for alignment.  */
 467       size &= ~__static_tls_align_m1;
 468       assert (size != 0);
 469
 470       /* Make sure the size of the stack is enough for the guard and
 471          eventually the thread descriptor.  */
 472       guardsize = (attr->guardsize + pagesize_m1) & ~pagesize_m1;
 473       if (__builtin_expect (size < ((guardsize + __static_tls_size
 474                                      + MINIMAL_REST_STACK + pagesize_m1)
 475                                     & ~pagesize_m1),
 476                             0))
 477         /* The stack is too small (or the guard too large).  */
 478         return EINVAL;
 479
 480       /* Try to get a stack from the cache.  */
 481       reqsize = size;
 482       pd = get_cached_stack (&size, &mem);
 483       if (pd == NULL)
 484         {
 485           /* To avoid aliasing effects on a larger scale than pages we
 486              adjust the allocated stack size if necessary.  This way
 487              allocations directly following each other will not have
 488              aliasing problems.  */
 489 #if MULTI_PAGE_ALIASING != 0
 490           if ((size % MULTI_PAGE_ALIASING) == 0)
 491             size += pagesize_m1 + 1;
 492 #endif
 493
 494           mem = mmap (NULL, size, prot,
 495                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
 496
 497           if (__builtin_expect (mem == MAP_FAILED, 0))
 498             return errno;
 499
 500           /* SIZE is guaranteed to be greater than zero.
 501              So we can never get a null pointer back from mmap.  */
 502           assert (mem != NULL);
 503
 504 #if COLORING_INCREMENT != 0
 505           /* Atomically increment NCREATED.  */
 506           unsigned int ncreated = atomic_increment_val (&nptl_ncreated);
 507
 508           /* We chose the offset for coloring by incrementing it for
 509              every new thread by a fixed amount.  The offset used
 510              module the page size.  Even if coloring would be better
 511              relative to higher alignment values it makes no sense to
 512              do it since the mmap() interface does not allow us to
 513              specify any alignment for the returned memory block.  */
 514           size_t coloring = (ncreated * COLORING_INCREMENT) & pagesize_m1;
 515
 516           /* Make sure the coloring offsets does not disturb the alignment
 517              of the TCB and static TLS block.  */
 518           if (__builtin_expect ((coloring & __static_tls_align_m1) != 0, 0))
 519             coloring = (((coloring + __static_tls_align_m1)
 520                          & ~(__static_tls_align_m1))
 521                         & ~pagesize_m1);
 522 #else
 523           /* Unless specified we do not make any adjustments.  */
 524 # define coloring 0
 525 #endif
 526
 527           /* Place the thread descriptor at the end of the stack.  */
 528 #if TLS_TCB_AT_TP
 529           pd = (struct pthread *) ((char *) mem + size - coloring) - 1;
 530 #elif TLS_DTV_AT_TP
 531           pd = (struct pthread *) ((((uintptr_t) mem + size - coloring
 532                                     - __static_tls_size)
 533                                     & ~__static_tls_align_m1)
 534                                    - TLS_PRE_TCB_SIZE);
 535 #endif
 536
 537           /* Remember the stack-related values.  */
 538           pd->stackblock = mem;
 539           pd->stackblock_size = size;
 540
 541           /* We allocated the first block thread-specific data array.
 542              This address will not change for the lifetime of this
 543              descriptor.  */
 544           pd->specific[0] = pd->specific_1stblock;
 545
 546           /* This is at least the second thread.  */
 547           pd->header.multiple_threads = 1;
 548 #ifndef TLS_MULTIPLE_THREADS_IN_TCB
 549           __pthread_multiple_threads = *__libc_multiple_threads_ptr = 1;
 550 #endif
 551
 552 #ifndef __ASSUME_PRIVATE_FUTEX
 553           /* The thread must know when private futexes are supported.  */
 554           pd->header.private_futex = THREAD_GETMEM (THREAD_SELF,
 555                                                     header.private_futex);
 556 #endif
 557
 558 #ifdef NEED_DL_SYSINFO
 559           /* Copy the sysinfo value from the parent.  */
 560           THREAD_SYSINFO(pd) = THREAD_SELF_SYSINFO;
 561 #endif
 562
 563           /* Don't allow setxid until cloned.  */
 564           pd->setxid_futex = -1;
 565
 566           /* The process ID is also the same as that of the caller.  */
 567           pd->pid = THREAD_GETMEM (THREAD_SELF, pid);
 568
 569           /* Allocate the DTV for this thread.  */
 570           if (_dl_allocate_tls (TLS_TPADJ (pd)) == NULL)
 571             {
 572               /* Something went wrong.  */
 573               assert (errno == ENOMEM);
 574
 575               /* Free the stack memory we just allocated.  */
 576               (void) munmap (mem, size);
 577
 578               return errno;
 579             }
 580
 581
 582           /* Prepare to modify global data.  */
 583           lll_lock (stack_cache_lock, LLL_PRIVATE);
 584
 585           /* And add to the list of stacks in use.  */
 586           stack_list_add (&pd->list, &stack_used);
 587
 588           lll_unlock (stack_cache_lock, LLL_PRIVATE);
 589
 590
 591           /* There might have been a race.  Another thread might have
 592              caused the stacks to get exec permission while this new
 593              stack was prepared.  Detect if this was possible and
 594              change the permission if necessary.  */
 595           if (__builtin_expect ((GL(dl_stack_flags) & PF_X) != 0
 596                                 && (prot & PROT_EXEC) == 0, 0))
 597             {
 598               int err = change_stack_perm (pd
 599 #ifdef NEED_SEPARATE_REGISTER_STACK
 600                                            , ~pagesize_m1
 601 #endif
 602                                            );
 603               if (err != 0)
 604                 {
 605                   /* Free the stack memory we just allocated.  */
 606                   (void) munmap (mem, size);
 607
 608                   return err;
 609                 }
 610             }
 611
 612
 613           /* Note that all of the stack and the thread descriptor is
 614              zeroed.  This means we do not have to initialize fields
 615              with initial value zero.  This is specifically true for
 616              the 'tid' field which is always set back to zero once the
 617              stack is not used anymore and for the 'guardsize' field
 618              which will be read next.  */
 619         }
 620
 621       /* Create or resize the guard area if necessary.  */
 622       if (__builtin_expect (guardsize > pd->guardsize, 0))
 623         {
 624 #ifdef NEED_SEPARATE_REGISTER_STACK
 625           char *guard = mem + (((size - guardsize) / 2) & ~pagesize_m1);
 626 #elif _STACK_GROWS_DOWN
 627           char *guard = mem;
 628 # elif _STACK_GROWS_UP
 629           char *guard = (char *) (((uintptr_t) pd - guardsize) & ~pagesize_m1);
 630 #endif
 631           if (mprotect (guard, guardsize, PROT_NONE) != 0)
 632             {
 633             mprot_error:
 634               lll_lock (stack_cache_lock, LLL_PRIVATE);
 635
 636               /* Remove the thread from the list.  */
 637               stack_list_del (&pd->list);
 638
 639               lll_unlock (stack_cache_lock, LLL_PRIVATE);
 640
 641               /* Get rid of the TLS block we allocated.  */
 642               _dl_deallocate_tls (TLS_TPADJ (pd), false);
 643
 644               /* Free the stack memory regardless of whether the size
 645                  of the cache is over the limit or not.  If this piece
 646                  of memory caused problems we better do not use it
 647                  anymore.  Uh, and we ignore possible errors.  There
 648                  is nothing we could do.  */
 649               (void) munmap (mem, size);
 650
 651               return errno;
 652             }
 653
 654           pd->guardsize = guardsize;
 655         }
 656       else if (__builtin_expect (pd->guardsize - guardsize > size - reqsize,
 657                                  0))
 658         {
 659           /* The old guard area is too large.  */
 660
 661 #ifdef NEED_SEPARATE_REGISTER_STACK
 662           char *guard = mem + (((size - guardsize) / 2) & ~pagesize_m1);
 663           char *oldguard = mem + (((size - pd->guardsize) / 2) & ~pagesize_m1);
 664
 665           if (oldguard < guard
 666               && mprotect (oldguard, guard - oldguard, prot) != 0)
 667             goto mprot_error;
 668
 669           if (mprotect (guard + guardsize,
 670                         oldguard + pd->guardsize - guard - guardsize,
 671                         prot) != 0)
 672             goto mprot_error;
 673 #elif _STACK_GROWS_DOWN
 674           if (mprotect ((char *) mem + guardsize, pd->guardsize - guardsize,
 675                         prot) != 0)
 676             goto mprot_error;
 677 #elif _STACK_GROWS_UP
 678           if (mprotect ((char *) pd - pd->guardsize,
 679                         pd->guardsize - guardsize, prot) != 0)
 680             goto mprot_error;
 681 #endif
 682
 683           pd->guardsize = guardsize;
 684         }
 685       /* The pthread_getattr_np() calls need to get passed the size
 686          requested in the attribute, regardless of how large the
 687          actually used guardsize is.  */
 688       pd->reported_guardsize = guardsize;
 689     }
 690
 691   /* Initialize the lock.  We have to do this unconditionally since the
 692      stillborn thread could be canceled while the lock is taken.  */
 693   pd->lock = LLL_LOCK_INITIALIZER;
 694
 695   /* The robust mutex lists also need to be initialized
 696      unconditionally because the cleanup for the previous stack owner
 697      might have happened in the kernel.  */
 698   pd->robust_head.futex_offset = (offsetof (pthread_mutex_t, __data.__lock)
 699                                   - offsetof (pthread_mutex_t,
 700                                               __data.__list.__next));
 701   pd->robust_head.list_op_pending = NULL;
 702 #ifdef __PTHREAD_MUTEX_HAVE_PREV
 703   pd->robust_prev = &pd->robust_head;
 704 #endif
 705   pd->robust_head.list = &pd->robust_head;
 706
 707   /* We place the thread descriptor at the end of the stack.  */
 708   *pdp = pd;
 709
 710 #if TLS_TCB_AT_TP
 711   /* The stack begins before the TCB and the static TLS block.  */
 712   stacktop = ((char *) (pd + 1) - __static_tls_size);
 713 #elif TLS_DTV_AT_TP
 714   stacktop = (char *) (pd - 1);
 715 #endif
 716
 717 #ifdef NEED_SEPARATE_REGISTER_STACK
 718   *stack = pd->stackblock;
 719   *stacksize = stacktop - *stack;
 720 #elif _STACK_GROWS_DOWN
 721   *stack = stacktop;
 722 #elif _STACK_GROWS_UP
 723   *stack = pd->stackblock;
 724   assert (*stack > 0);
 725 #endif
 726
 727   return 0;
 728 }
 729
 730
 731 void
 732 internal_function
 733 __deallocate_stack (struct pthread *pd)
 734 {
 735   lll_lock (stack_cache_lock, LLL_PRIVATE);
 736
 737   /* Remove the thread from the list of threads with user defined
 738      stacks.  */
 739   stack_list_del (&pd->list);
 740
 741   /* Not much to do.  Just free the mmap()ed memory.  Note that we do
 742      not reset the 'used' flag in the 'tid' field.  This is done by
 743      the kernel.  If no thread has been created yet this field is
 744      still zero.  */
 745   if (__builtin_expect (! pd->user_stack, 1))
 746     (void) queue_stack (pd);
 747   else
 748     /* Free the memory associated with the ELF TLS.  */
 749     _dl_deallocate_tls (TLS_TPADJ (pd), false);
 750
 751   lll_unlock (stack_cache_lock, LLL_PRIVATE);
 752 }
 753
 754
 755 int
 756 internal_function
 757 __make_stacks_executable (void **stack_endp)
 758 {
 759   /* First the main thread's stack.  */
 760   int err = _dl_make_stack_executable (stack_endp);
 761   if (err != 0)
 762     return err;
 763
 764 #ifdef NEED_SEPARATE_REGISTER_STACK
 765   const size_t pagemask = ~(__getpagesize () - 1);
 766 #endif
 767
 768   lll_lock (stack_cache_lock, LLL_PRIVATE);
 769
 770   list_t *runp;
 771   list_for_each (runp, &stack_used)
 772     {
 773       err = change_stack_perm (list_entry (runp, struct pthread, list)
 774 #ifdef NEED_SEPARATE_REGISTER_STACK
 775                                , pagemask
 776 #endif
 777                                );
 778       if (err != 0)
 779         break;
 780     }
 781
 782   /* Also change the permission for the currently unused stacks.  This
 783      might be wasted time but better spend it here than adding a check
 784      in the fast path.  */
 785   if (err == 0)
 786     list_for_each (runp, &stack_cache)
 787       {
 788         err = change_stack_perm (list_entry (runp, struct pthread, list)
 789 #ifdef NEED_SEPARATE_REGISTER_STACK
 790                                  , pagemask
 791 #endif
 792                                  );
 793         if (err != 0)
 794           break;
 795       }
 796
 797   lll_unlock (stack_cache_lock, LLL_PRIVATE);
 798
 799   return err;
 800 }
 801
 802
 803 /* In case of a fork() call the memory allocation in the child will be
 804    the same but only one thread is running.  All stacks except that of
 805    the one running thread are not used anymore.  We have to recycle
 806    them.  */
 807 void
 808 __reclaim_stacks (void)
 809 {
 810   struct pthread *self = (struct pthread *) THREAD_SELF;
 811
 812   /* No locking necessary.  The caller is the only stack in use.  But
 813      we have to be aware that we might have interrupted a list
 814      operation.  */
 815
 816   if (in_flight_stack != 0)
 817     {
 818       bool add_p = in_flight_stack & 1;
 819       list_t *elem = (list_t *) (in_flight_stack & ~(uintptr_t) 1);
 820
 821       if (add_p)
 822         {
 823           /* We always add at the beginning of the list.  So in this
 824              case we only need to check the beginning of these lists.  */
 825           int check_list (list_t *l)
 826           {
 827             if (l->next->prev != l)
 828               {
 829                 assert (l->next->prev == elem);
 830
 831                 elem->next = l->next;
 832                 elem->prev = l;
 833                 l->next = elem;
 834
 835                 return 1;
 836               }
 837
 838             return 0;
 839           }
 840
 841           if (check_list (&stack_used) == 0)
 842             (void) check_list (&stack_cache);
 843         }
 844       else
 845         {
 846           /* We can simply always replay the delete operation.  */
 847           elem->next->prev = elem->prev;
 848           elem->prev->next = elem->next;
 849         }
 850     }
 851
 852   /* Mark all stacks except the still running one as free.  */
 853   list_t *runp;
 854   list_for_each (runp, &stack_used)
 855     {
 856       struct pthread *curp = list_entry (runp, struct pthread, list);
 857       if (curp != self)
 858         {
 859           /* This marks the stack as free.  */
 860           curp->tid = 0;
 861
 862           /* The PID field must be initialized for the new process.  */
 863           curp->pid = self->pid;
 864
 865           /* Account for the size of the stack.  */
 866           stack_cache_actsize += curp->stackblock_size;
 867
 868           if (curp->specific_used)
 869             {
 870               /* Clear the thread-specific data.  */
 871               memset (curp->specific_1stblock, '\0',
 872                       sizeof (curp->specific_1stblock));
 873
 874               curp->specific_used = false;
 875
 876               for (size_t cnt = 1; cnt < PTHREAD_KEY_1STLEVEL_SIZE; ++cnt)
 877                 if (curp->specific[cnt] != NULL)
 878                   {
 879                     memset (curp->specific[cnt], '\0',
 880                             sizeof (curp->specific_1stblock));
 881
 882                     /* We have allocated the block which we do not
 883                        free here so re-set the bit.  */
 884                     curp->specific_used = true;
 885                   }
 886             }
 887         }
 888     }
 889
 890   /* Reset the PIDs in any cached stacks.  */
 891   list_for_each (runp, &stack_cache)
 892     {
 893       struct pthread *curp = list_entry (runp, struct pthread, list);
 894       curp->pid = self->pid;
 895     }
 896
 897   /* Add the stack of all running threads to the cache.  */
 898   list_splice (&stack_used, &stack_cache);
 899
 900   /* Remove the entry for the current thread from the cache list
 901      and add it to the list of running threads.  Which of the two
 902      lists is decided by the user_stack flag.  */
 903   stack_list_del (&self->list);
 904
 905   /* Re-initialize the lists for all the threads.  */
 906   INIT_LIST_HEAD (&stack_used);
 907   INIT_LIST_HEAD (&__stack_user);
 908
 909   if (__builtin_expect (THREAD_GETMEM (self, user_stack), 0))
 910     list_add (&self->list, &__stack_user);
 911   else
 912     list_add (&self->list, &stack_used);
 913
 914   /* There is one thread running.  */
 915   __nptl_nthreads = 1;
 916
 917   in_flight_stack = 0;
 918
 919   /* Initialize the lock.  */
 920   stack_cache_lock = LLL_LOCK_INITIALIZER;
 921 }
 922
 923
 924 #if HP_TIMING_AVAIL
 925 # undef __find_thread_by_id
 926 /* Find a thread given the thread ID.  */
 927 attribute_hidden
 928 struct pthread *
 929 __find_thread_by_id (pid_t tid)
 930 {
 931   struct pthread *result = NULL;
 932
 933   lll_lock (stack_cache_lock, LLL_PRIVATE);
 934
 935   /* Iterate over the list with system-allocated threads first.  */
 936   list_t *runp;
 937   list_for_each (runp, &stack_used)
 938     {
 939       struct pthread *curp;
 940
 941       curp = list_entry (runp, struct pthread, list);
 942
 943       if (curp->tid == tid)
 944         {
 945           result = curp;
 946           goto out;
 947         }
 948     }
 949
 950   /* Now the list with threads using user-allocated stacks.  */
 951   list_for_each (runp, &__stack_user)
 952     {
 953       struct pthread *curp;
 954
 955       curp = list_entry (runp, struct pthread, list);
 956
 957       if (curp->tid == tid)
 958         {
 959           result = curp;
 960           goto out;
 961         }
 962     }
 963
 964  out:
 965   lll_unlock (stack_cache_lock, LLL_PRIVATE);
 966
 967   return result;
 968 }
 969 #endif
 970
 971
 972 static void
 973 internal_function
 974 setxid_mark_thread (struct xid_command *cmdp, struct pthread *t)
 975 {
 976   int ch;
 977
 978   /* Wait until this thread is cloned.  */
 979   if (t->setxid_futex == -1
 980       && ! atomic_compare_and_exchange_bool_acq (&t->setxid_futex, -2, -1))
 981     do
 982       lll_futex_wait (&t->setxid_futex, -2, LLL_PRIVATE);
 983     while (t->setxid_futex == -2);
 984
 985   /* Don't let the thread exit before the setxid handler runs.  */
 986   t->setxid_futex = 0;
 987
 988   do
 989     {
 990       ch = t->cancelhandling;
 991
 992       /* If the thread is exiting right now, ignore it.  */
 993       if ((ch & EXITING_BITMASK) != 0)
 994         {
 995           /* Release the futex if there is no other setxid in
 996              progress.  */
 997           if ((ch & SETXID_BITMASK) == 0)
 998             {
 999               t->setxid_futex = 1;
1000               lll_futex_wake (&t->setxid_futex, 1, LLL_PRIVATE);
1001             }
1002           return;
1003         }
1004     }
1005   while (atomic_compare_and_exchange_bool_acq (&t->cancelhandling,
1006                                                ch | SETXID_BITMASK, ch));
1007 }
1008
1009
1010 static void
1011 internal_function
1012 setxid_unmark_thread (struct xid_command *cmdp, struct pthread *t)
1013 {
1014   int ch;
1015
1016   do
1017     {
1018       ch = t->cancelhandling;
1019       if ((ch & SETXID_BITMASK) == 0)
1020         return;
1021     }
1022   while (atomic_compare_and_exchange_bool_acq (&t->cancelhandling,
1023                                                ch & ~SETXID_BITMASK, ch));
1024
1025   /* Release the futex just in case.  */
1026   t->setxid_futex = 1;
1027   lll_futex_wake (&t->setxid_futex, 1, LLL_PRIVATE);
1028 }
1029
1030
1031 static int
1032 internal_function
1033 setxid_signal_thread (struct xid_command *cmdp, struct pthread *t)
1034 {
1035   if ((t->cancelhandling & SETXID_BITMASK) == 0)
1036     return 0;
1037
1038   int val;
1039   INTERNAL_SYSCALL_DECL (err);
1040   val = INTERNAL_SYSCALL (tgkill, err, 3, THREAD_GETMEM (THREAD_SELF, pid),
1041                           t->tid, SIGSETXID);
1042
1043   /* If this failed, it must have had not started yet or else exited.  */
1044   if (!INTERNAL_SYSCALL_ERROR_P (val, err))
1045     {
1046       atomic_increment (&cmdp->cntr);
1047       return 1;
1048     }
1049   else
1050     return 0;
1051 }
1052
1053
1054 int
1055 attribute_hidden
1056 __nptl_setxid (struct xid_command *cmdp)
1057 {
1058   int signalled;
1059   int result;
1060   lll_lock (stack_cache_lock, LLL_PRIVATE);
1061
1062   __xidcmd = cmdp;
1063   cmdp->cntr = 0;
1064
1065   struct pthread *self = THREAD_SELF;
1066
1067   /* Iterate over the list with system-allocated threads first.  */
1068   list_t *runp;
1069   list_for_each (runp, &stack_used)
1070     {
1071       struct pthread *t = list_entry (runp, struct pthread, list);
1072       if (t == self)
1073         continue;
1074
1075       setxid_mark_thread (cmdp, t);
1076     }
1077
1078   /* Now the list with threads using user-allocated stacks.  */
1079   list_for_each (runp, &__stack_user)
1080     {
1081       struct pthread *t = list_entry (runp, struct pthread, list);
1082       if (t == self)
1083         continue;
1084
1085       setxid_mark_thread (cmdp, t);
1086     }
1087
1088   /* Iterate until we don't succeed in signalling anyone.  That means
1089      we have gotten all running threads, and their children will be
1090      automatically correct once started.  */
1091   do
1092     {
1093       signalled = 0;
1094
1095       list_for_each (runp, &stack_used)
1096         {
1097           struct pthread *t = list_entry (runp, struct pthread, list);
1098           if (t == self)
1099             continue;
1100
1101           signalled += setxid_signal_thread (cmdp, t);
1102         }
1103
1104       list_for_each (runp, &__stack_user)
1105         {
1106           struct pthread *t = list_entry (runp, struct pthread, list);
1107           if (t == self)
1108             continue;
1109
1110           signalled += setxid_signal_thread (cmdp, t);
1111         }
1112
1113       int cur = cmdp->cntr;
1114       while (cur != 0)
1115         {
1116           lll_futex_wait (&cmdp->cntr, cur, LLL_PRIVATE);
1117           cur = cmdp->cntr;
1118         }
1119     }
1120   while (signalled != 0);
1121
1122   /* Clean up flags, so that no thread blocks during exit waiting
1123      for a signal which will never come.  */
1124   list_for_each (runp, &stack_used)
1125     {
1126       struct pthread *t = list_entry (runp, struct pthread, list);
1127       if (t == self)
1128         continue;
1129
1130       setxid_unmark_thread (cmdp, t);
1131     }
1132
1133   list_for_each (runp, &__stack_user)
1134     {
1135       struct pthread *t = list_entry (runp, struct pthread, list);
1136       if (t == self)
1137         continue;
1138
1139       setxid_unmark_thread (cmdp, t);
1140     }
1141
1142   /* This must be last, otherwise the current thread might not have
1143      permissions to send SIGSETXID syscall to the other threads.  */
1144   INTERNAL_SYSCALL_DECL (err);
1145   result = INTERNAL_SYSCALL_NCS (cmdp->syscall_no, err, 3,
1146                                  cmdp->id[0], cmdp->id[1], cmdp->id[2]);
1147   if (INTERNAL_SYSCALL_ERROR_P (result, err))
1148     {
1149       __set_errno (INTERNAL_SYSCALL_ERRNO (result, err));
1150       result = -1;
1151     }
1152
1153   lll_unlock (stack_cache_lock, LLL_PRIVATE);
1154   return result;
1155 }
1156
1157 static inline void __attribute__((always_inline))
1158 init_one_static_tls (struct pthread *curp, struct link_map *map)
1159 {
1160   dtv_t *dtv = GET_DTV (TLS_TPADJ (curp));
1161 # if TLS_TCB_AT_TP
1162   void *dest = (char *) curp - map->l_tls_offset;
1163 # elif TLS_DTV_AT_TP
1164   void *dest = (char *) curp + map->l_tls_offset + TLS_PRE_TCB_SIZE;
1165 # else
1166 #  error "Either TLS_TCB_AT_TP or TLS_DTV_AT_TP must be defined"
1167 # endif
1168
1169   /* Fill in the DTV slot so that a later LD/GD access will find it.  */
1170   dtv[map->l_tls_modid].pointer.val = dest;
1171   dtv[map->l_tls_modid].pointer.is_static = true;
1172
1173   /* Initialize the memory.  */
1174   memset (__mempcpy (dest, map->l_tls_initimage, map->l_tls_initimage_size),
1175           '\0', map->l_tls_blocksize - map->l_tls_initimage_size);
1176 }
1177
1178 void
1179 attribute_hidden
1180 __pthread_init_static_tls (struct link_map *map)
1181 {
1182   lll_lock (stack_cache_lock, LLL_PRIVATE);
1183
1184   /* Iterate over the list with system-allocated threads first.  */
1185   list_t *runp;
1186   list_for_each (runp, &stack_used)
1187     init_one_static_tls (list_entry (runp, struct pthread, list), map);
1188
1189   /* Now the list with threads using user-allocated stacks.  */
1190   list_for_each (runp, &__stack_user)
1191     init_one_static_tls (list_entry (runp, struct pthread, list), map);
1192
1193   lll_unlock (stack_cache_lock, LLL_PRIVATE);
1194 }
1195
1196
1197 void
1198 attribute_hidden
1199 __wait_lookup_done (void)
1200 {
1201   lll_lock (stack_cache_lock, LLL_PRIVATE);
1202
1203   struct pthread *self = THREAD_SELF;
1204
1205   /* Iterate over the list with system-allocated threads first.  */
1206   list_t *runp;
1207   list_for_each (runp, &stack_used)
1208     {
1209       struct pthread *t = list_entry (runp, struct pthread, list);
1210       if (t == self || t->header.gscope_flag == THREAD_GSCOPE_FLAG_UNUSED)
1211         continue;
1212
1213       int *const gscope_flagp = &t->header.gscope_flag;
1214
1215       /* We have to wait until this thread is done with the global
1216          scope.  First tell the thread that we are waiting and
1217          possibly have to be woken.  */
1218       if (atomic_compare_and_exchange_bool_acq (gscope_flagp,
1219                                                 THREAD_GSCOPE_FLAG_WAIT,
1220                                                 THREAD_GSCOPE_FLAG_USED))
1221         continue;
1222
1223       do
1224         lll_futex_wait (gscope_flagp, THREAD_GSCOPE_FLAG_WAIT, LLL_PRIVATE);
1225       while (*gscope_flagp == THREAD_GSCOPE_FLAG_WAIT);
1226     }
1227
1228   /* Now the list with threads using user-allocated stacks.  */
1229   list_for_each (runp, &__stack_user)
1230     {
1231       struct pthread *t = list_entry (runp, struct pthread, list);
1232       if (t == self || t->header.gscope_flag == THREAD_GSCOPE_FLAG_UNUSED)
1233         continue;
1234
1235       int *const gscope_flagp = &t->header.gscope_flag;
1236
1237       /* We have to wait until this thread is done with the global
1238          scope.  First tell the thread that we are waiting and
1239          possibly have to be woken.  */
1240       if (atomic_compare_and_exchange_bool_acq (gscope_flagp,
1241                                                 THREAD_GSCOPE_FLAG_WAIT,
1242                                                 THREAD_GSCOPE_FLAG_USED))
1243         continue;
1244
1245       do
1246         lll_futex_wait (gscope_flagp, THREAD_GSCOPE_FLAG_WAIT, LLL_PRIVATE);
1247       while (*gscope_flagp == THREAD_GSCOPE_FLAG_WAIT);
1248     }
1249
1250   lll_unlock (stack_cache_lock, LLL_PRIVATE);
1251 }