nptl/allocatestack.c

   1 /* Copyright (C) 2002-2021 Free Software Foundation, Inc.
   2    This file is part of the GNU C Library.
   3
   4    The GNU C Library is free software; you can redistribute it and/or
   5    modify it under the terms of the GNU Lesser General Public
   6    License as published by the Free Software Foundation; either
   7    version 2.1 of the License, or (at your option) any later version.
   8
   9    The GNU C Library is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12    Lesser General Public License for more details.
  13
  14    You should have received a copy of the GNU Lesser General Public
  15    License along with the GNU C Library; if not, see
  16    <https://www.gnu.org/licenses/>.  */
  17
  18 #include <assert.h>
  19 #include <errno.h>
  20 #include <signal.h>
  21 #include <stdint.h>
  22 #include <string.h>
  23 #include <unistd.h>
  24 #include <sys/mman.h>
  25 #include <sys/param.h>
  26 #include <dl-sysdep.h>
  27 #include <dl-tls.h>
  28 #include <tls.h>
  29 #include <list.h>
  30 #include <lowlevellock.h>
  31 #include <futex-internal.h>
  32 #include <kernel-features.h>
  33 #include <nptl-stack.h>
  34
/* Default alignment of stack.  Only used when nothing included above
   has already defined STACK_ALIGN; allocate_stack asserts that
   TCB_ALIGNMENT is at least this large.  */
#ifndef STACK_ALIGN
# define STACK_ALIGN __alignof__ (long double)
#endif

/* Default value for minimal stack size after allocating thread
   descriptor and guard.  Stack sizes that leave less than this many
   bytes are rejected with EINVAL by allocate_stack.  */
#ifndef MINIMAL_REST_STACK
# define MINIMAL_REST_STACK     4096
#endif


/* Newer kernels have the MAP_STACK flag to indicate a mapping is used for
   a stack.  Use it when possible.  Where the flag is not available it
   is defined as 0 so that OR-ing it into the mmap flags is a no-op.  */
#ifndef MAP_STACK
# define MAP_STACK 0
#endif
  52
  53 /* Get a stack frame from the cache.  We have to match by size since
  54    some blocks might be too small or far too large.  */
  55 static struct pthread *
  56 get_cached_stack (size_t *sizep, void **memp)
  57 {
  58   size_t size = *sizep;
  59   struct pthread *result = NULL;
  60   list_t *entry;
  61
  62   lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
  63
  64   /* Search the cache for a matching entry.  We search for the
  65      smallest stack which has at least the required size.  Note that
  66      in normal situations the size of all allocated stacks is the
  67      same.  As the very least there are only a few different sizes.
  68      Therefore this loop will exit early most of the time with an
  69      exact match.  */
  70   list_for_each (entry, &GL (dl_stack_cache))
  71     {
  72       struct pthread *curr;
  73
  74       curr = list_entry (entry, struct pthread, list);
  75       if (__nptl_stack_in_use (curr) && curr->stackblock_size >= size)
  76         {
  77           if (curr->stackblock_size == size)
  78             {
  79               result = curr;
  80               break;
  81             }
  82
  83           if (result == NULL
  84               || result->stackblock_size > curr->stackblock_size)
  85             result = curr;
  86         }
  87     }
  88
  89   if (__builtin_expect (result == NULL, 0)
  90       /* Make sure the size difference is not too excessive.  In that
  91          case we do not use the block.  */
  92       || __builtin_expect (result->stackblock_size > 4 * size, 0))
  93     {
  94       /* Release the lock.  */
  95       lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
  96
  97       return NULL;
  98     }
  99
 100   /* Don't allow setxid until cloned.  */
 101   result->setxid_futex = -1;
 102
 103   /* Dequeue the entry.  */
 104   __nptl_stack_list_del (&result->list);
 105
 106   /* And add to the list of stacks in use.  */
 107   __nptl_stack_list_add (&result->list, &GL (dl_stack_used));
 108
 109   /* And decrease the cache size.  */
 110   GL (dl_stack_cache_actsize) -= result->stackblock_size;
 111
 112   /* Release the lock early.  */
 113   lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
 114
 115   /* Report size and location of the stack to the caller.  */
 116   *sizep = result->stackblock_size;
 117   *memp = result->stackblock;
 118
 119   /* Cancellation handling is back to the default.  */
 120   result->cancelhandling = 0;
 121   result->cancelstate = PTHREAD_CANCEL_ENABLE;
 122   result->canceltype = PTHREAD_CANCEL_DEFERRED;
 123   result->cleanup = NULL;
 124   result->setup_failed = 0;
 125
 126   /* No pending event.  */
 127   result->nextevent = NULL;
 128
 129   result->tls_state = (struct tls_internal_t) { 0 };
 130
 131   /* Clear the DTV.  */
 132   dtv_t *dtv = GET_DTV (TLS_TPADJ (result));
 133   for (size_t cnt = 0; cnt < dtv[-1].counter; ++cnt)
 134     free (dtv[1 + cnt].pointer.to_free);
 135   memset (dtv, '\0', (dtv[-1].counter + 1) * sizeof (dtv_t));
 136
 137   /* Re-initialize the TLS.  */
 138   _dl_allocate_tls_init (TLS_TPADJ (result));
 139
 140   return result;
 141 }
 142
 143 /* Return the guard page position on allocated stack.  */
 144 static inline char *
 145 __attribute ((always_inline))
 146 guard_position (void *mem, size_t size, size_t guardsize, struct pthread *pd,
 147                 size_t pagesize_m1)
 148 {
 149 #ifdef NEED_SEPARATE_REGISTER_STACK
 150   return mem + (((size - guardsize) / 2) & ~pagesize_m1);
 151 #elif _STACK_GROWS_DOWN
 152   return mem;
 153 #elif _STACK_GROWS_UP
 154   return (char *) (((uintptr_t) pd - guardsize) & ~pagesize_m1);
 155 #endif
 156 }
 157
 158 /* Based on stack allocated with PROT_NONE, setup the required portions with
 159    'prot' flags based on the guard page position.  */
 160 static inline int
 161 setup_stack_prot (char *mem, size_t size, char *guard, size_t guardsize,
 162                   const int prot)
 163 {
 164   char *guardend = guard + guardsize;
 165 #if _STACK_GROWS_DOWN && !defined(NEED_SEPARATE_REGISTER_STACK)
 166   /* As defined at guard_position, for architectures with downward stack
 167      the guard page is always at start of the allocated area.  */
 168   if (__mprotect (guardend, size - guardsize, prot) != 0)
 169     return errno;
 170 #else
 171   size_t mprots1 = (uintptr_t) guard - (uintptr_t) mem;
 172   if (__mprotect (mem, mprots1, prot) != 0)
 173     return errno;
 174   size_t mprots2 = ((uintptr_t) mem + size) - (uintptr_t) guardend;
 175   if (__mprotect (guardend, mprots2, prot) != 0)
 176     return errno;
 177 #endif
 178   return 0;
 179 }
 180
 181 /* Mark the memory of the stack as usable to the kernel.  It frees everything
 182    except for the space used for the TCB itself.  */
 183 static __always_inline void
 184 advise_stack_range (void *mem, size_t size, uintptr_t pd, size_t guardsize)
 185 {
 186   uintptr_t sp = (uintptr_t) CURRENT_STACK_FRAME;
 187   size_t pagesize_m1 = __getpagesize () - 1;
 188 #if _STACK_GROWS_DOWN && !defined(NEED_SEPARATE_REGISTER_STACK)
 189   size_t freesize = (sp - (uintptr_t) mem) & ~pagesize_m1;
 190   assert (freesize < size);
 191   if (freesize > PTHREAD_STACK_MIN)
 192     __madvise (mem, freesize - PTHREAD_STACK_MIN, MADV_DONTNEED);
 193 #else
 194   /* Page aligned start of memory to free (higher than or equal
 195      to current sp plus the minimum stack size).  */
 196   uintptr_t freeblock = (sp + PTHREAD_STACK_MIN + pagesize_m1) & ~pagesize_m1;
 197   uintptr_t free_end = (pd - guardsize) & ~pagesize_m1;
 198   if (free_end > freeblock)
 199     {
 200       size_t freesize = free_end - freeblock;
 201       assert (freesize < size);
 202       __madvise ((void*) freeblock, freesize, MADV_DONTNEED);
 203     }
 204 #endif
 205 }
 206
 207 /* Returns a usable stack for a new thread either by allocating a
 208    new stack or reusing a cached stack of sufficient size.
 209    ATTR must be non-NULL and point to a valid pthread_attr.
 210    PDP must be non-NULL.  */
 211 static int
 212 allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
 213                 void **stack, size_t *stacksize)
 214 {
 215   struct pthread *pd;
 216   size_t size;
 217   size_t pagesize_m1 = __getpagesize () - 1;
 218   size_t tls_static_size_for_stack = __nptl_tls_static_size_for_stack ();
 219   size_t tls_static_align_m1 = GLRO (dl_tls_static_align) - 1;
 220
 221   assert (powerof2 (pagesize_m1 + 1));
 222   assert (TCB_ALIGNMENT >= STACK_ALIGN);
 223
 224   /* Get the stack size from the attribute if it is set.  Otherwise we
 225      use the default we determined at start time.  */
 226   if (attr->stacksize != 0)
 227     size = attr->stacksize;
 228   else
 229     {
 230       lll_lock (__default_pthread_attr_lock, LLL_PRIVATE);
 231       size = __default_pthread_attr.internal.stacksize;
 232       lll_unlock (__default_pthread_attr_lock, LLL_PRIVATE);
 233     }
 234
 235   /* Get memory for the stack.  */
 236   if (__glibc_unlikely (attr->flags & ATTR_FLAG_STACKADDR))
 237     {
 238       uintptr_t adj;
 239       char *stackaddr = (char *) attr->stackaddr;
 240
 241       /* Assume the same layout as the _STACK_GROWS_DOWN case, with struct
 242          pthread at the top of the stack block.  Later we adjust the guard
 243          location and stack address to match the _STACK_GROWS_UP case.  */
 244       if (_STACK_GROWS_UP)
 245         stackaddr += attr->stacksize;
 246
 247       /* If the user also specified the size of the stack make sure it
 248          is large enough.  */
 249       if (attr->stacksize != 0
 250           && attr->stacksize < (tls_static_size_for_stack
 251                                 + MINIMAL_REST_STACK))
 252         return EINVAL;
 253
 254       /* Adjust stack size for alignment of the TLS block.  */
 255 #if TLS_TCB_AT_TP
 256       adj = ((uintptr_t) stackaddr - TLS_TCB_SIZE)
 257             & tls_static_align_m1;
 258       assert (size > adj + TLS_TCB_SIZE);
 259 #elif TLS_DTV_AT_TP
 260       adj = ((uintptr_t) stackaddr - tls_static_size_for_stack)
 261             & tls_static_align_m1;
 262       assert (size > adj);
 263 #endif
 264
 265       /* The user provided some memory.  Let's hope it matches the
 266          size...  We do not allocate guard pages if the user provided
 267          the stack.  It is the user's responsibility to do this if it
 268          is wanted.  */
 269 #if TLS_TCB_AT_TP
 270       pd = (struct pthread *) ((uintptr_t) stackaddr
 271                                - TLS_TCB_SIZE - adj);
 272 #elif TLS_DTV_AT_TP
 273       pd = (struct pthread *) (((uintptr_t) stackaddr
 274                                 - tls_static_size_for_stack - adj)
 275                                - TLS_PRE_TCB_SIZE);
 276 #endif
 277
 278       /* The user provided stack memory needs to be cleared.  */
 279       memset (pd, '\0', sizeof (struct pthread));
 280
 281       /* The first TSD block is included in the TCB.  */
 282       pd->specific[0] = pd->specific_1stblock;
 283
 284       /* Remember the stack-related values.  */
 285       pd->stackblock = (char *) stackaddr - size;
 286       pd->stackblock_size = size;
 287
 288       /* This is a user-provided stack.  It will not be queued in the
 289          stack cache nor will the memory (except the TLS memory) be freed.  */
 290       pd->user_stack = true;
 291
 292       /* This is at least the second thread.  */
 293       pd->header.multiple_threads = 1;
 294 #ifndef TLS_MULTIPLE_THREADS_IN_TCB
 295       __libc_multiple_threads = 1;
 296 #endif
 297
 298 #ifdef NEED_DL_SYSINFO
 299       SETUP_THREAD_SYSINFO (pd);
 300 #endif
 301
 302       /* Don't allow setxid until cloned.  */
 303       pd->setxid_futex = -1;
 304
 305       /* Allocate the DTV for this thread.  */
 306       if (_dl_allocate_tls (TLS_TPADJ (pd)) == NULL)
 307         {
 308           /* Something went wrong.  */
 309           assert (errno == ENOMEM);
 310           return errno;
 311         }
 312
 313
 314       /* Prepare to modify global data.  */
 315       lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
 316
 317       /* And add to the list of stacks in use.  */
 318       list_add (&pd->list, &GL (dl_stack_user));
 319
 320       lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
 321     }
 322   else
 323     {
 324       /* Allocate some anonymous memory.  If possible use the cache.  */
 325       size_t guardsize;
 326       size_t reported_guardsize;
 327       size_t reqsize;
 328       void *mem;
 329       const int prot = (PROT_READ | PROT_WRITE
 330                         | ((GL(dl_stack_flags) & PF_X) ? PROT_EXEC : 0));
 331
 332       /* Adjust the stack size for alignment.  */
 333       size &= ~tls_static_align_m1;
 334       assert (size != 0);
 335
 336       /* Make sure the size of the stack is enough for the guard and
 337          eventually the thread descriptor.  On some targets there is
 338          a minimum guard size requirement, ARCH_MIN_GUARD_SIZE, so
 339          internally enforce it (unless the guard was disabled), but
 340          report the original guard size for backward compatibility:
 341          before POSIX 2008 the guardsize was specified to be one page
 342          by default which is observable via pthread_attr_getguardsize
 343          and pthread_getattr_np.  */
 344       guardsize = (attr->guardsize + pagesize_m1) & ~pagesize_m1;
 345       reported_guardsize = guardsize;
 346       if (guardsize > 0 && guardsize < ARCH_MIN_GUARD_SIZE)
 347         guardsize = ARCH_MIN_GUARD_SIZE;
 348       if (guardsize < attr->guardsize || size + guardsize < guardsize)
 349         /* Arithmetic overflow.  */
 350         return EINVAL;
 351       size += guardsize;
 352       if (__builtin_expect (size < ((guardsize + tls_static_size_for_stack
 353                                      + MINIMAL_REST_STACK + pagesize_m1)
 354                                     & ~pagesize_m1),
 355                             0))
 356         /* The stack is too small (or the guard too large).  */
 357         return EINVAL;
 358
 359       /* Try to get a stack from the cache.  */
 360       reqsize = size;
 361       pd = get_cached_stack (&size, &mem);
 362       if (pd == NULL)
 363         {
 364           /* If a guard page is required, avoid committing memory by first
 365              allocate with PROT_NONE and then reserve with required permission
 366              excluding the guard page.  */
 367           mem = __mmap (NULL, size, (guardsize == 0) ? prot : PROT_NONE,
 368                         MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
 369
 370           if (__glibc_unlikely (mem == MAP_FAILED))
 371             return errno;
 372
 373           /* SIZE is guaranteed to be greater than zero.
 374              So we can never get a null pointer back from mmap.  */
 375           assert (mem != NULL);
 376
 377           /* Place the thread descriptor at the end of the stack.  */
 378 #if TLS_TCB_AT_TP
 379           pd = (struct pthread *) ((((uintptr_t) mem + size)
 380                                     - TLS_TCB_SIZE)
 381                                    & ~tls_static_align_m1);
 382 #elif TLS_DTV_AT_TP
 383           pd = (struct pthread *) ((((uintptr_t) mem + size
 384                                     - tls_static_size_for_stack)
 385                                     & ~tls_static_align_m1)
 386                                    - TLS_PRE_TCB_SIZE);
 387 #endif
 388
 389           /* Now mprotect the required region excluding the guard area.  */
 390           if (__glibc_likely (guardsize > 0))
 391             {
 392               char *guard = guard_position (mem, size, guardsize, pd,
 393                                             pagesize_m1);
 394               if (setup_stack_prot (mem, size, guard, guardsize, prot) != 0)
 395                 {
 396                   __munmap (mem, size);
 397                   return errno;
 398                 }
 399             }
 400
 401           /* Remember the stack-related values.  */
 402           pd->stackblock = mem;
 403           pd->stackblock_size = size;
 404           /* Update guardsize for newly allocated guardsize to avoid
 405              an mprotect in guard resize below.  */
 406           pd->guardsize = guardsize;
 407
 408           /* We allocated the first block thread-specific data array.
 409              This address will not change for the lifetime of this
 410              descriptor.  */
 411           pd->specific[0] = pd->specific_1stblock;
 412
 413           /* This is at least the second thread.  */
 414           pd->header.multiple_threads = 1;
 415 #ifndef TLS_MULTIPLE_THREADS_IN_TCB
 416           __libc_multiple_threads = 1;
 417 #endif
 418
 419 #ifdef NEED_DL_SYSINFO
 420           SETUP_THREAD_SYSINFO (pd);
 421 #endif
 422
 423           /* Don't allow setxid until cloned.  */
 424           pd->setxid_futex = -1;
 425
 426           /* Allocate the DTV for this thread.  */
 427           if (_dl_allocate_tls (TLS_TPADJ (pd)) == NULL)
 428             {
 429               /* Something went wrong.  */
 430               assert (errno == ENOMEM);
 431
 432               /* Free the stack memory we just allocated.  */
 433               (void) __munmap (mem, size);
 434
 435               return errno;
 436             }
 437
 438
 439           /* Prepare to modify global data.  */
 440           lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
 441
 442           /* And add to the list of stacks in use.  */
 443           __nptl_stack_list_add (&pd->list, &GL (dl_stack_used));
 444
 445           lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
 446
 447
 448           /* There might have been a race.  Another thread might have
 449              caused the stacks to get exec permission while this new
 450              stack was prepared.  Detect if this was possible and
 451              change the permission if necessary.  */
 452           if (__builtin_expect ((GL(dl_stack_flags) & PF_X) != 0
 453                                 && (prot & PROT_EXEC) == 0, 0))
 454             {
 455               int err = __nptl_change_stack_perm (pd);
 456               if (err != 0)
 457                 {
 458                   /* Free the stack memory we just allocated.  */
 459                   (void) __munmap (mem, size);
 460
 461                   return err;
 462                 }
 463             }
 464
 465
 466           /* Note that all of the stack and the thread descriptor is
 467              zeroed.  This means we do not have to initialize fields
 468              with initial value zero.  This is specifically true for
 469              the 'tid' field which is always set back to zero once the
 470              stack is not used anymore and for the 'guardsize' field
 471              which will be read next.  */
 472         }
 473
 474       /* Create or resize the guard area if necessary.  */
 475       if (__glibc_unlikely (guardsize > pd->guardsize))
 476         {
 477           char *guard = guard_position (mem, size, guardsize, pd,
 478                                         pagesize_m1);
 479           if (__mprotect (guard, guardsize, PROT_NONE) != 0)
 480             {
 481             mprot_error:
 482               lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
 483
 484               /* Remove the thread from the list.  */
 485               __nptl_stack_list_del (&pd->list);
 486
 487               lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
 488
 489               /* Get rid of the TLS block we allocated.  */
 490               _dl_deallocate_tls (TLS_TPADJ (pd), false);
 491
 492               /* Free the stack memory regardless of whether the size
 493                  of the cache is over the limit or not.  If this piece
 494                  of memory caused problems we better do not use it
 495                  anymore.  Uh, and we ignore possible errors.  There
 496                  is nothing we could do.  */
 497               (void) __munmap (mem, size);
 498
 499               return errno;
 500             }
 501
 502           pd->guardsize = guardsize;
 503         }
 504       else if (__builtin_expect (pd->guardsize - guardsize > size - reqsize,
 505                                  0))
 506         {
 507           /* The old guard area is too large.  */
 508
 509 #ifdef NEED_SEPARATE_REGISTER_STACK
 510           char *guard = mem + (((size - guardsize) / 2) & ~pagesize_m1);
 511           char *oldguard = mem + (((size - pd->guardsize) / 2) & ~pagesize_m1);
 512
 513           if (oldguard < guard
 514               && __mprotect (oldguard, guard - oldguard, prot) != 0)
 515             goto mprot_error;
 516
 517           if (__mprotect (guard + guardsize,
 518                         oldguard + pd->guardsize - guard - guardsize,
 519                         prot) != 0)
 520             goto mprot_error;
 521 #elif _STACK_GROWS_DOWN
 522           if (__mprotect ((char *) mem + guardsize, pd->guardsize - guardsize,
 523                         prot) != 0)
 524             goto mprot_error;
 525 #elif _STACK_GROWS_UP
 526          char *new_guard = (char *)(((uintptr_t) pd - guardsize)
 527                                     & ~pagesize_m1);
 528          char *old_guard = (char *)(((uintptr_t) pd - pd->guardsize)
 529                                     & ~pagesize_m1);
 530          /* The guard size difference might be > 0, but once rounded
 531             to the nearest page the size difference might be zero.  */
 532          if (new_guard > old_guard
 533              && __mprotect (old_guard, new_guard - old_guard, prot) != 0)
 534             goto mprot_error;
 535 #endif
 536
 537           pd->guardsize = guardsize;
 538         }
 539       /* The pthread_getattr_np() calls need to get passed the size
 540          requested in the attribute, regardless of how large the
 541          actually used guardsize is.  */
 542       pd->reported_guardsize = reported_guardsize;
 543     }
 544
 545   /* Initialize the lock.  We have to do this unconditionally since the
 546      stillborn thread could be canceled while the lock is taken.  */
 547   pd->lock = LLL_LOCK_INITIALIZER;
 548
 549   /* The robust mutex lists also need to be initialized
 550      unconditionally because the cleanup for the previous stack owner
 551      might have happened in the kernel.  */
 552   pd->robust_head.futex_offset = (offsetof (pthread_mutex_t, __data.__lock)
 553                                   - offsetof (pthread_mutex_t,
 554                                               __data.__list.__next));
 555   pd->robust_head.list_op_pending = NULL;
 556 #if __PTHREAD_MUTEX_HAVE_PREV
 557   pd->robust_prev = &pd->robust_head;
 558 #endif
 559   pd->robust_head.list = &pd->robust_head;
 560
 561   /* We place the thread descriptor at the end of the stack.  */
 562   *pdp = pd;
 563
 564   void *stacktop;
 565
 566 #if TLS_TCB_AT_TP
 567   /* The stack begins before the TCB and the static TLS block.  */
 568   stacktop = ((char *) (pd + 1) - tls_static_size_for_stack);
 569 #elif TLS_DTV_AT_TP
 570   stacktop = (char *) (pd - 1);
 571 #endif
 572
 573   *stacksize = stacktop - pd->stackblock;
 574   *stack = pd->stackblock;
 575
 576   return 0;
 577 }