thread_pthread.c

   1 /* -*-c-*- */
   2 /**********************************************************************
   3
   4   thread_pthread.c -
   5
   6   $Author$
   7
   8   Copyright (C) 2004-2007 Koichi Sasada
   9
  10 **********************************************************************/
  11
  12 #ifdef THREAD_SYSTEM_DEPENDENT_IMPLEMENTATION
  13
  14 #include "gc.h"
  15 #include "mjit.h"
  16
  17 #ifdef HAVE_SYS_RESOURCE_H
  18 #include <sys/resource.h>
  19 #endif
  20 #ifdef HAVE_THR_STKSEGMENT
  21 #include <thread.h>
  22 #endif
  23 #if defined(HAVE_FCNTL_H)
  24 #include <fcntl.h>
  25 #elif defined(HAVE_SYS_FCNTL_H)
  26 #include <sys/fcntl.h>
  27 #endif
  28 #ifdef HAVE_SYS_PRCTL_H
  29 #include <sys/prctl.h>
  30 #endif
  31 #if defined(HAVE_SYS_TIME_H)
  32 #include <sys/time.h>
  33 #endif
  34 #if defined(__HAIKU__)
  35 #include <kernel/OS.h>
  36 #endif
  37 #ifdef __linux__
  38 #include <sys/syscall.h> /* for SYS_gettid */
  39 #endif
  40 #include <time.h>
  41 #include <signal.h>
  42
  43 #if defined(HAVE_SYS_EVENTFD_H) && defined(HAVE_EVENTFD)
  44 #  define USE_EVENTFD (1)
  45 #  include <sys/eventfd.h>
  46 #else
  47 #  define USE_EVENTFD (0)
  48 #endif
  49
  50 #if defined(SIGVTALRM) && !defined(__CYGWIN__) && !defined(__EMSCRIPTEN__)
  51 #  define USE_UBF_LIST 1
  52 #endif
  53
  54 /*
  55  * UBF_TIMER and ubf_list both use SIGVTALRM.
  56  *
  57  * UBF_TIMER has NOTHING to do with thread timeslices (TIMER_INTERRUPT_MASK)
  58  *
  59  * UBF_TIMER is to close TOCTTOU signal race on programs where we
  60  * cannot rely on GVL contention (vm->gvl.timer) to perform wakeups
  61  * while a thread is doing blocking I/O on sockets or pipes.  With
  62  * rb_thread_call_without_gvl and similar functions:
  63  *
  64  * (1) Check interrupts.
  65  * (2) release GVL.
  66  * (2a) signal received
  67  * (3) call func with data1 (blocks for a long time without ubf_timer)
  68  * (4) acquire GVL.
  69  *     Other Ruby threads can not run in parallel any more.
  70  * (5) Check interrupts.
  71  *
  72  * We need UBF_TIMER to break out of (3) if (2a) happens.
  73  *
  74  * ubf_list wakeups may be triggered on gvl_yield.
  75  *
  76  * If we have vm->gvl.timer (on GVL contention), we don't need UBF_TIMER
  77  * as it can perform the same tasks while doing timeslices.
  78  */
  79 #define UBF_TIMER_NONE 0
  80 #define UBF_TIMER_POSIX 1
  81 #define UBF_TIMER_PTHREAD 2
  82
  83 #ifndef UBF_TIMER
  84 #  if defined(HAVE_TIMER_SETTIME) && defined(HAVE_TIMER_CREATE) && \
  85       defined(CLOCK_MONOTONIC) && defined(USE_UBF_LIST)
  86      /* preferred */
  87 #    define UBF_TIMER UBF_TIMER_POSIX
  88 #  elif defined(USE_UBF_LIST)
  89      /* safe, but inefficient */
  90 #    define UBF_TIMER UBF_TIMER_PTHREAD
  91 #  else
  92      /* we'll be racy without SIGVTALRM for ubf_list */
  93 #    define UBF_TIMER UBF_TIMER_NONE
  94 #  endif
  95 #endif
  96
  97 enum rtimer_state {
  98     /* alive, after timer_create: */
  99     RTIMER_DISARM,
 100     RTIMER_ARMING,
 101     RTIMER_ARMED,
 102
 103     RTIMER_DEAD
 104 };
 105
 106 #if UBF_TIMER == UBF_TIMER_POSIX
 107 static const struct itimerspec zero;
 108 static struct {
 109     rb_atomic_t state_; /* rtimer_state */
 110     rb_pid_t owner;
 111     timer_t timerid;
 112 } timer_posix = {
 113     /* .state = */ RTIMER_DEAD,
 114 };
 115
 116 #define TIMER_STATE_DEBUG 0
 117
 118 static const char *
 119 rtimer_state_name(enum rtimer_state state)
 120 {
 121     switch (state) {
 122       case RTIMER_DISARM: return "disarm";
 123       case RTIMER_ARMING: return "arming";
 124       case RTIMER_ARMED:  return "armed";
 125       case RTIMER_DEAD:   return "dead";
 126       default: rb_bug("unreachable");
 127     }
 128 }
 129
 130 static enum rtimer_state
 131 timer_state_exchange(enum rtimer_state state)
 132 {
 133     enum rtimer_state prev = ATOMIC_EXCHANGE(timer_posix.state_, state);
 134     if (TIMER_STATE_DEBUG) fprintf(stderr, "state (exc): %s->%s\n", rtimer_state_name(prev), rtimer_state_name(state));
 135     return prev;
 136 }
 137
 138 static enum rtimer_state
 139 timer_state_cas(enum rtimer_state expected_prev, enum rtimer_state state)
 140 {
 141     enum rtimer_state prev = ATOMIC_CAS(timer_posix.state_, expected_prev, state);
 142
 143     if (TIMER_STATE_DEBUG) {
 144         if (prev == expected_prev) {
 145             fprintf(stderr, "state (cas): %s->%s\n", rtimer_state_name(prev), rtimer_state_name(state));
 146         }
 147         else {
 148             fprintf(stderr, "state (cas): %s (expected:%s)\n", rtimer_state_name(prev), rtimer_state_name(expected_prev));
 149         }
 150     }
 151
 152     return prev;
 153 }
 154
 155 #elif UBF_TIMER == UBF_TIMER_PTHREAD
 156 static void *timer_pthread_fn(void *);
 157 static struct {
 158     int low[2];
 159     rb_atomic_t armed; /* boolean */
 160     rb_pid_t owner;
 161     pthread_t thid;
 162 } timer_pthread = {
 163     { -1, -1 },
 164 };
 165 #endif
 166
 167 static const rb_hrtime_t *sigwait_timeout(rb_thread_t *, int sigwait_fd,
 168                                               const rb_hrtime_t *,
 169                                               int *drained_p);
 170 static void ubf_timer_disarm(void);
 171 static void threadptr_trap_interrupt(rb_thread_t *);
 172 static void clear_thread_cache_altstack(void);
 173 static void ubf_wakeup_all_threads(void);
 174 static int ubf_threads_empty(void);
 175
 176 #define TIMER_THREAD_CREATED_P() (signal_self_pipe.owner_process == getpid())
 177
 178 /* for testing, and in case we come across a platform w/o pipes: */
 179 #define BUSY_WAIT_SIGNALS (0)
 180
 181 /*
 182  * sigwait_th is the thread which owns sigwait_fd and sleeps on it
 183  * (using ppoll).  MJIT worker can be sigwait_th==0, so we initialize
 184  * it to THREAD_INVALID at startup and fork time.  It is the ONLY thread
 185  * allowed to read from sigwait_fd, otherwise starvation can occur.
 186  */
 187 #define THREAD_INVALID ((const rb_thread_t *)-1)
 188 static const rb_thread_t *sigwait_th;
 189
 190 #ifdef HAVE_SCHED_YIELD
 191 #define native_thread_yield() (void)sched_yield()
 192 #else
 193 #define native_thread_yield() ((void)0)
 194 #endif
 195
 196 #if defined(HAVE_PTHREAD_CONDATTR_SETCLOCK) && \
 197     defined(CLOCK_REALTIME) && defined(CLOCK_MONOTONIC) && \
 198     defined(HAVE_CLOCK_GETTIME)
 199 static pthread_condattr_t condattr_mono;
 200 static pthread_condattr_t *condattr_monotonic = &condattr_mono;
 201 #else
 202 static const void *const condattr_monotonic = NULL;
 203 #endif
 204
 205 /* 100ms.  10ms is too small for user level thread scheduling
 206  * on recent Linux (tested on 2.6.35)
 207  */
 208 #define TIME_QUANTUM_MSEC (100)
 209 #define TIME_QUANTUM_USEC (TIME_QUANTUM_MSEC * 1000)
 210 #define TIME_QUANTUM_NSEC (TIME_QUANTUM_USEC * 1000)
 211
 212 static rb_hrtime_t native_cond_timeout(rb_nativethread_cond_t *, rb_hrtime_t);
 213 static int native_cond_timedwait(rb_nativethread_cond_t *cond, pthread_mutex_t *mutex, const rb_hrtime_t *abs);
 214
 215 /*
 216  * Designate the next gvl.timer thread, favor the last thread in
 217  * the waitq since it will be in waitq longest
 218  */
 219 static int
 220 designate_timer_thread(rb_global_vm_lock_t *gvl)
 221 {
 222     native_thread_data_t *last;
 223
 224     last = list_tail(&gvl->waitq, native_thread_data_t, node.ubf);
 225     if (last) {
 226         rb_native_cond_signal(&last->cond.gvlq);
 227         return TRUE;
 228     }
 229     return FALSE;
 230 }
 231
 232 /*
 233  * We become designated timer thread to kick vm->gvl.owner
 234  * periodically.  Continue on old timeout if it expired.
 235  */
 236 static void
 237 do_gvl_timer(rb_global_vm_lock_t *gvl, rb_thread_t *th)
 238 {
 239     rb_vm_t *vm = GET_VM();
 240     static rb_hrtime_t abs;
 241     native_thread_data_t *nd = &th->native_thread_data;
 242
 243     gvl->timer = th;
 244
 245     /* take over wakeups from UBF_TIMER */
 246     ubf_timer_disarm();
 247
 248     if (gvl->timer_err == ETIMEDOUT) {
 249         abs = native_cond_timeout(&nd->cond.gvlq, TIME_QUANTUM_NSEC);
 250     }
 251     gvl->timer_err = native_cond_timedwait(&nd->cond.gvlq, &gvl->lock, &abs);
 252
 253     ubf_wakeup_all_threads();
 254     ruby_sigchld_handler(vm);
 255
 256     if (UNLIKELY(rb_signal_buff_size())) {
 257         if (th == vm->ractor.main_thread) {
 258             RUBY_VM_SET_TRAP_INTERRUPT(th->ec);
 259         }
 260         else {
 261             threadptr_trap_interrupt(vm->ractor.main_thread);
 262         }
 263     }
 264
 265     /*
 266      * Timeslice.  Warning: the process may fork while this
 267      * thread is contending for GVL:
 268      */
 269     if (gvl->owner) {
 270         // strictly speaking, accessing "gvl->owner" is not thread-safe
 271         RUBY_VM_SET_TIMER_INTERRUPT(gvl->owner->ec);
 272     }
 273     gvl->timer = 0;
 274 }
 275
 276 static void
 277 gvl_acquire_common(rb_global_vm_lock_t *gvl, rb_thread_t *th)
 278 {
 279     if (gvl->owner) {
 280         native_thread_data_t *nd = &th->native_thread_data;
 281
 282         VM_ASSERT(th->unblock.func == 0 &&
 283                   "we must not be in ubf_list and GVL waitq at the same time");
 284
 285         list_add_tail(&gvl->waitq, &nd->node.gvl);
 286
 287         do {
 288             if (!gvl->timer) {
 289                 do_gvl_timer(gvl, th);
 290             }
 291             else {
 292                 rb_native_cond_wait(&nd->cond.gvlq, &gvl->lock);
 293             }
 294         } while (gvl->owner);
 295
 296         list_del_init(&nd->node.gvl);
 297
 298         if (gvl->need_yield) {
 299             gvl->need_yield = 0;
 300             rb_native_cond_signal(&gvl->switch_cond);
 301         }
 302     }
 303     else { /* reset timer if uncontended */
 304         gvl->timer_err = ETIMEDOUT;
 305     }
 306     gvl->owner = th;
 307     if (!gvl->timer) {
 308         if (!designate_timer_thread(gvl) && !ubf_threads_empty()) {
 309             rb_thread_wakeup_timer_thread(-1);
 310         }
 311     }
 312 }
 313
 314 static void
 315 gvl_acquire(rb_global_vm_lock_t *gvl, rb_thread_t *th)
 316 {
 317     rb_native_mutex_lock(&gvl->lock);
 318     gvl_acquire_common(gvl, th);
 319     rb_native_mutex_unlock(&gvl->lock);
 320 }
 321
 322 static const native_thread_data_t *
 323 gvl_release_common(rb_global_vm_lock_t *gvl)
 324 {
 325     native_thread_data_t *next;
 326     gvl->owner = 0;
 327     next = list_top(&gvl->waitq, native_thread_data_t, node.ubf);
 328     if (next) rb_native_cond_signal(&next->cond.gvlq);
 329
 330     return next;
 331 }
 332
 333 static void
 334 gvl_release(rb_global_vm_lock_t *gvl)
 335 {
 336     rb_native_mutex_lock(&gvl->lock);
 337     gvl_release_common(gvl);
 338     rb_native_mutex_unlock(&gvl->lock);
 339 }
 340
 341 static void
 342 gvl_yield(rb_global_vm_lock_t *gvl, rb_thread_t *th)
 343 {
 344     const native_thread_data_t *next;
 345
 346     /*
 347      * Perhaps other threads are stuck in blocking region w/o GVL, too,
 348      * (perhaps looping in io_close_fptr) so we kick them:
 349      */
 350     ubf_wakeup_all_threads();
 351     rb_native_mutex_lock(&gvl->lock);
 352     next = gvl_release_common(gvl);
 353
 354     /* An another thread is processing GVL yield. */
 355     if (UNLIKELY(gvl->wait_yield)) {
 356         while (gvl->wait_yield)
 357             rb_native_cond_wait(&gvl->switch_wait_cond, &gvl->lock);
 358     }
 359     else if (next) {
 360         /* Wait until another thread task takes GVL. */
 361         gvl->need_yield = 1;
 362         gvl->wait_yield = 1;
 363         while (gvl->need_yield)
 364             rb_native_cond_wait(&gvl->switch_cond, &gvl->lock);
 365         gvl->wait_yield = 0;
 366         rb_native_cond_broadcast(&gvl->switch_wait_cond);
 367     }
 368     else {
 369         rb_native_mutex_unlock(&gvl->lock);
 370         native_thread_yield();
 371         rb_native_mutex_lock(&gvl->lock);
 372         rb_native_cond_broadcast(&gvl->switch_wait_cond);
 373     }
 374     gvl_acquire_common(gvl, th);
 375     rb_native_mutex_unlock(&gvl->lock);
 376 }
 377
 378 void
 379 rb_gvl_init(rb_global_vm_lock_t *gvl)
 380 {
 381     rb_native_mutex_initialize(&gvl->lock);
 382     rb_native_cond_initialize(&gvl->switch_cond);
 383     rb_native_cond_initialize(&gvl->switch_wait_cond);
 384     list_head_init(&gvl->waitq);
 385     gvl->owner = 0;
 386     gvl->timer = 0;
 387     gvl->timer_err = ETIMEDOUT;
 388     gvl->need_yield = 0;
 389     gvl->wait_yield = 0;
 390 }
 391
 392 static void
 393 gvl_destroy(rb_global_vm_lock_t *gvl)
 394 {
 395     /*
 396      * only called once at VM shutdown (not atfork), another thread
 397      * may still grab vm->gvl.lock when calling gvl_release at
 398      * the end of thread_start_func_2
 399      */
 400     if (0) {
 401         rb_native_cond_destroy(&gvl->switch_wait_cond);
 402         rb_native_cond_destroy(&gvl->switch_cond);
 403         rb_native_mutex_destroy(&gvl->lock);
 404     }
 405     clear_thread_cache_altstack();
 406 }
 407
 408 #if defined(HAVE_WORKING_FORK)
 409 static void thread_cache_reset(void);
 410 static void
 411 gvl_atfork(rb_global_vm_lock_t *gvl)
 412 {
 413     thread_cache_reset();
 414     rb_gvl_init(gvl);
 415     gvl_acquire(gvl, GET_THREAD());
 416 }
 417 #endif
 418
 419 #define NATIVE_MUTEX_LOCK_DEBUG 0
 420
 421 static void
 422 mutex_debug(const char *msg, void *lock)
 423 {
 424     if (NATIVE_MUTEX_LOCK_DEBUG) {
 425         int r;
 426         static pthread_mutex_t dbglock = PTHREAD_MUTEX_INITIALIZER;
 427
 428         if ((r = pthread_mutex_lock(&dbglock)) != 0) {exit(EXIT_FAILURE);}
 429         fprintf(stdout, "%s: %p\n", msg, lock);
 430         if ((r = pthread_mutex_unlock(&dbglock)) != 0) {exit(EXIT_FAILURE);}
 431     }
 432 }
 433
 434 void
 435 rb_native_mutex_lock(pthread_mutex_t *lock)
 436 {
 437     int r;
 438     mutex_debug("lock", lock);
 439     if ((r = pthread_mutex_lock(lock)) != 0) {
 440         rb_bug_errno("pthread_mutex_lock", r);
 441     }
 442 }
 443
 444 void
 445 rb_native_mutex_unlock(pthread_mutex_t *lock)
 446 {
 447     int r;
 448     mutex_debug("unlock", lock);
 449     if ((r = pthread_mutex_unlock(lock)) != 0) {
 450         rb_bug_errno("pthread_mutex_unlock", r);
 451     }
 452 }
 453
 454 int
 455 rb_native_mutex_trylock(pthread_mutex_t *lock)
 456 {
 457     int r;
 458     mutex_debug("trylock", lock);
 459     if ((r = pthread_mutex_trylock(lock)) != 0) {
 460         if (r == EBUSY) {
 461             return EBUSY;
 462         }
 463         else {
 464             rb_bug_errno("pthread_mutex_trylock", r);
 465         }
 466     }
 467     return 0;
 468 }
 469
 470 void
 471 rb_native_mutex_initialize(pthread_mutex_t *lock)
 472 {
 473     int r = pthread_mutex_init(lock, 0);
 474     mutex_debug("init", lock);
 475     if (r != 0) {
 476         rb_bug_errno("pthread_mutex_init", r);
 477     }
 478 }
 479
 480 void
 481 rb_native_mutex_destroy(pthread_mutex_t *lock)
 482 {
 483     int r = pthread_mutex_destroy(lock);
 484     mutex_debug("destroy", lock);
 485     if (r != 0) {
 486         rb_bug_errno("pthread_mutex_destroy", r);
 487     }
 488 }
 489
 490 void
 491 rb_native_cond_initialize(rb_nativethread_cond_t *cond)
 492 {
 493     int r = pthread_cond_init(cond, condattr_monotonic);
 494     if (r != 0) {
 495         rb_bug_errno("pthread_cond_init", r);
 496     }
 497 }
 498
 499 void
 500 rb_native_cond_destroy(rb_nativethread_cond_t *cond)
 501 {
 502     int r = pthread_cond_destroy(cond);
 503     if (r != 0) {
 504         rb_bug_errno("pthread_cond_destroy", r);
 505     }
 506 }
 507
 508 /*
 509  * In OS X 10.7 (Lion), pthread_cond_signal and pthread_cond_broadcast return
 510  * EAGAIN after retrying 8192 times.  You can see them in the following page:
 511  *
 512  * http://www.opensource.apple.com/source/Libc/Libc-763.11/pthreads/pthread_cond.c
 513  *
 514  * The following rb_native_cond_signal and rb_native_cond_broadcast functions
 515  * need to retrying until pthread functions don't return EAGAIN.
 516  */
 517
 518 void
 519 rb_native_cond_signal(rb_nativethread_cond_t *cond)
 520 {
 521     int r;
 522     do {
 523         r = pthread_cond_signal(cond);
 524     } while (r == EAGAIN);
 525     if (r != 0) {
 526         rb_bug_errno("pthread_cond_signal", r);
 527     }
 528 }
 529
 530 void
 531 rb_native_cond_broadcast(rb_nativethread_cond_t *cond)
 532 {
 533     int r;
 534     do {
 535         r = pthread_cond_broadcast(cond);
 536     } while (r == EAGAIN);
 537     if (r != 0) {
 538         rb_bug_errno("rb_native_cond_broadcast", r);
 539     }
 540 }
 541
 542 void
 543 rb_native_cond_wait(rb_nativethread_cond_t *cond, pthread_mutex_t *mutex)
 544 {
 545     int r = pthread_cond_wait(cond, mutex);
 546     if (r != 0) {
 547         rb_bug_errno("pthread_cond_wait", r);
 548     }
 549 }
 550
 551 static int
 552 native_cond_timedwait(rb_nativethread_cond_t *cond, pthread_mutex_t *mutex, const rb_hrtime_t *abs)
 553 {
 554     int r;
 555     struct timespec ts;
 556
 557     /*
 558      * An old Linux may return EINTR. Even though POSIX says
 559      *   "These functions shall not return an error code of [EINTR]".
 560      *   http://pubs.opengroup.org/onlinepubs/009695399/functions/pthread_cond_timedwait.html
 561      * Let's hide it from arch generic code.
 562      */
 563     do {
 564         rb_hrtime2timespec(&ts, abs);
 565         r = pthread_cond_timedwait(cond, mutex, &ts);
 566     } while (r == EINTR);
 567
 568     if (r != 0 && r != ETIMEDOUT) {
 569         rb_bug_errno("pthread_cond_timedwait", r);
 570     }
 571
 572     return r;
 573 }
 574
 575 void
 576 rb_native_cond_timedwait(rb_nativethread_cond_t *cond, pthread_mutex_t *mutex, unsigned long msec)
 577 {
 578     rb_hrtime_t hrmsec = native_cond_timeout(cond, RB_HRTIME_PER_MSEC * msec);
 579     native_cond_timedwait(cond, mutex, &hrmsec);
 580 }
 581
 582 static rb_hrtime_t
 583 native_cond_timeout(rb_nativethread_cond_t *cond, const rb_hrtime_t rel)
 584 {
 585     if (condattr_monotonic) {
 586         return rb_hrtime_add(rb_hrtime_now(), rel);
 587     }
 588     else {
 589         struct timespec ts;
 590
 591         rb_timespec_now(&ts);
 592         return rb_hrtime_add(rb_timespec2hrtime(&ts), rel);
 593     }
 594 }
 595
 596 #define native_cleanup_push pthread_cleanup_push
 597 #define native_cleanup_pop  pthread_cleanup_pop
 598
 599 #ifdef RB_THREAD_LOCAL_SPECIFIER
 600 static RB_THREAD_LOCAL_SPECIFIER rb_thread_t *ruby_native_thread;
 601 #else
 602 static pthread_key_t ruby_native_thread_key;
 603 #endif
 604
 605 static void
 606 null_func(int i)
 607 {
 608     /* null */
 609 }
 610
 611 rb_thread_t *
 612 ruby_thread_from_native(void)
 613 {
 614 #ifdef RB_THREAD_LOCAL_SPECIFIER
 615     return ruby_native_thread;
 616 #else
 617     return pthread_getspecific(ruby_native_thread_key);
 618 #endif
 619 }
 620
 621 int
 622 ruby_thread_set_native(rb_thread_t *th)
 623 {
 624     if (th && th->ec) {
 625         rb_ractor_set_current_ec(th->ractor, th->ec);
 626     }
 627 #ifdef RB_THREAD_LOCAL_SPECIFIER
 628     ruby_native_thread = th;
 629     return 1;
 630 #else
 631     return pthread_setspecific(ruby_native_thread_key, th) == 0;
 632 #endif
 633 }
 634
 635 static void native_thread_init(rb_thread_t *th);
 636
 637 void
 638 Init_native_thread(rb_thread_t *th)
 639 {
 640 #if defined(HAVE_PTHREAD_CONDATTR_SETCLOCK)
 641     if (condattr_monotonic) {
 642         int r = pthread_condattr_init(condattr_monotonic);
 643         if (r == 0) {
 644             r = pthread_condattr_setclock(condattr_monotonic, CLOCK_MONOTONIC);
 645         }
 646         if (r) condattr_monotonic = NULL;
 647     }
 648 #endif
 649
 650 #ifndef RB_THREAD_LOCAL_SPECIFIER
 651     if (pthread_key_create(&ruby_native_thread_key, 0) == EAGAIN) {
 652         rb_bug("pthread_key_create failed (ruby_native_thread_key)");
 653     }
 654     if (pthread_key_create(&ruby_current_ec_key, 0) == EAGAIN) {
 655         rb_bug("pthread_key_create failed (ruby_current_ec_key)");
 656     }
 657 #endif
 658     th->thread_id = pthread_self();
 659     ruby_thread_set_native(th);
 660     fill_thread_id_str(th);
 661     native_thread_init(th);
 662     posix_signal(SIGVTALRM, null_func);
 663 }
 664
 665 #ifdef RB_THREAD_T_HAS_NATIVE_ID
 666 static int
 667 get_native_thread_id(void)
 668 {
 669 #ifdef __linux__
 670     return (int)syscall(SYS_gettid);
 671 #elif defined(__FreeBSD__)
 672     return pthread_getthreadid_np();
 673 #endif
 674 }
 675 #endif
 676
 677 static void
 678 native_thread_init(rb_thread_t *th)
 679 {
 680     native_thread_data_t *nd = &th->native_thread_data;
 681
 682 #ifdef RB_THREAD_T_HAS_NATIVE_ID
 683     th->tid = get_native_thread_id();
 684 #endif
 685 #ifdef USE_UBF_LIST
 686     list_node_init(&nd->node.ubf);
 687 #endif
 688     rb_native_cond_initialize(&nd->cond.gvlq);
 689     if (&nd->cond.gvlq != &nd->cond.intr)
 690         rb_native_cond_initialize(&nd->cond.intr);
 691 }
 692
 693 #ifndef USE_THREAD_CACHE
 694 #define USE_THREAD_CACHE 1
 695 #endif
 696
 697 static void
 698 native_thread_destroy(rb_thread_t *th)
 699 {
 700     native_thread_data_t *nd = &th->native_thread_data;
 701
 702     rb_native_cond_destroy(&nd->cond.gvlq);
 703     if (&nd->cond.gvlq != &nd->cond.intr)
 704         rb_native_cond_destroy(&nd->cond.intr);
 705
 706     /*
 707      * prevent false positive from ruby_thread_has_gvl_p if that
 708      * gets called from an interposing function wrapper
 709      */
 710     if (USE_THREAD_CACHE)
 711         ruby_thread_set_native(0);
 712 }
 713
 714 #if USE_THREAD_CACHE
 715 static rb_thread_t *register_cached_thread_and_wait(void *);
 716 #endif
 717
 718 #if defined HAVE_PTHREAD_GETATTR_NP || defined HAVE_PTHREAD_ATTR_GET_NP
 719 #define STACKADDR_AVAILABLE 1
 720 #elif defined HAVE_PTHREAD_GET_STACKADDR_NP && defined HAVE_PTHREAD_GET_STACKSIZE_NP
 721 #define STACKADDR_AVAILABLE 1
 722 #undef MAINSTACKADDR_AVAILABLE
 723 #define MAINSTACKADDR_AVAILABLE 1
 724 void *pthread_get_stackaddr_np(pthread_t);
 725 size_t pthread_get_stacksize_np(pthread_t);
 726 #elif defined HAVE_THR_STKSEGMENT || defined HAVE_PTHREAD_STACKSEG_NP
 727 #define STACKADDR_AVAILABLE 1
 728 #elif defined HAVE_PTHREAD_GETTHRDS_NP
 729 #define STACKADDR_AVAILABLE 1
 730 #elif defined __HAIKU__
 731 #define STACKADDR_AVAILABLE 1
 732 #endif
 733
 734 #ifndef MAINSTACKADDR_AVAILABLE
 735 # ifdef STACKADDR_AVAILABLE
 736 #   define MAINSTACKADDR_AVAILABLE 1
 737 # else
 738 #   define MAINSTACKADDR_AVAILABLE 0
 739 # endif
 740 #endif
 741 #if MAINSTACKADDR_AVAILABLE && !defined(get_main_stack)
 742 # define get_main_stack(addr, size) get_stack(addr, size)
 743 #endif
 744
 745 #ifdef STACKADDR_AVAILABLE
 746 /*
 747  * Get the initial address and size of current thread's stack
 748  */
 749 static int
 750 get_stack(void **addr, size_t *size)
 751 {
 752 #define CHECK_ERR(expr)                         \
 753     {int err = (expr); if (err) return err;}
 754 #ifdef HAVE_PTHREAD_GETATTR_NP /* Linux */
 755     pthread_attr_t attr;
 756     size_t guard = 0;
 757     STACK_GROW_DIR_DETECTION;
 758     CHECK_ERR(pthread_getattr_np(pthread_self(), &attr));
 759 # ifdef HAVE_PTHREAD_ATTR_GETSTACK
 760     CHECK_ERR(pthread_attr_getstack(&attr, addr, size));
 761     STACK_DIR_UPPER((void)0, (void)(*addr = (char *)*addr + *size));
 762 # else
 763     CHECK_ERR(pthread_attr_getstackaddr(&attr, addr));
 764     CHECK_ERR(pthread_attr_getstacksize(&attr, size));
 765 # endif
 766 # ifdef HAVE_PTHREAD_ATTR_GETGUARDSIZE
 767     CHECK_ERR(pthread_attr_getguardsize(&attr, &guard));
 768 # else
 769     guard = getpagesize();
 770 # endif
 771     *size -= guard;
 772     pthread_attr_destroy(&attr);
 773 #elif defined HAVE_PTHREAD_ATTR_GET_NP /* FreeBSD, DragonFly BSD, NetBSD */
 774     pthread_attr_t attr;
 775     CHECK_ERR(pthread_attr_init(&attr));
 776     CHECK_ERR(pthread_attr_get_np(pthread_self(), &attr));
 777 # ifdef HAVE_PTHREAD_ATTR_GETSTACK
 778     CHECK_ERR(pthread_attr_getstack(&attr, addr, size));
 779 # else
 780     CHECK_ERR(pthread_attr_getstackaddr(&attr, addr));
 781     CHECK_ERR(pthread_attr_getstacksize(&attr, size));
 782 # endif
 783     STACK_DIR_UPPER((void)0, (void)(*addr = (char *)*addr + *size));
 784     pthread_attr_destroy(&attr);
 785 #elif (defined HAVE_PTHREAD_GET_STACKADDR_NP && defined HAVE_PTHREAD_GET_STACKSIZE_NP) /* MacOS X */
 786     pthread_t th = pthread_self();
 787     *addr = pthread_get_stackaddr_np(th);
 788     *size = pthread_get_stacksize_np(th);
 789 #elif defined HAVE_THR_STKSEGMENT || defined HAVE_PTHREAD_STACKSEG_NP
 790     stack_t stk;
 791 # if defined HAVE_THR_STKSEGMENT /* Solaris */
 792     CHECK_ERR(thr_stksegment(&stk));
 793 # else /* OpenBSD */
 794     CHECK_ERR(pthread_stackseg_np(pthread_self(), &stk));
 795 # endif
 796     *addr = stk.ss_sp;
 797     *size = stk.ss_size;
 798 #elif defined HAVE_PTHREAD_GETTHRDS_NP /* AIX */
 799     pthread_t th = pthread_self();
 800     struct __pthrdsinfo thinfo;
 801     char reg[256];
 802     int regsiz=sizeof(reg);
 803     CHECK_ERR(pthread_getthrds_np(&th, PTHRDSINFO_QUERY_ALL,
 804                                    &thinfo, sizeof(thinfo),
 805                                    &reg, &regsiz));
 806     *addr = thinfo.__pi_stackaddr;
 807     /* Must not use thinfo.__pi_stacksize for size.
 808        It is around 3KB smaller than the correct size
 809        calculated by thinfo.__pi_stackend - thinfo.__pi_stackaddr. */
 810     *size = thinfo.__pi_stackend - thinfo.__pi_stackaddr;
 811     STACK_DIR_UPPER((void)0, (void)(*addr = (char *)*addr + *size));
 812 #elif defined __HAIKU__
 813     thread_info info;
 814     STACK_GROW_DIR_DETECTION;
 815     CHECK_ERR(get_thread_info(find_thread(NULL), &info));
 816     *addr = info.stack_base;
 817     *size = (uintptr_t)info.stack_end - (uintptr_t)info.stack_base;
 818     STACK_DIR_UPPER((void)0, (void)(*addr = (char *)*addr + *size));
 819 #else
 820 #error STACKADDR_AVAILABLE is defined but not implemented.
 821 #endif
 822     return 0;
 823 #undef CHECK_ERR
 824 }
 825 #endif
 826
 827 static struct {
 828     rb_nativethread_id_t id;
 829     size_t stack_maxsize;
 830     VALUE *stack_start;
 831 } native_main_thread;
 832
 833 #ifdef STACK_END_ADDRESS
 834 extern void *STACK_END_ADDRESS;
 835 #endif
 836
 837 enum {
 838     RUBY_STACK_SPACE_LIMIT = 1024 * 1024, /* 1024KB */
 839     RUBY_STACK_SPACE_RATIO = 5
 840 };
 841
 842 static size_t
 843 space_size(size_t stack_size)
 844 {
 845     size_t space_size = stack_size / RUBY_STACK_SPACE_RATIO;
 846     if (space_size > RUBY_STACK_SPACE_LIMIT) {
 847         return RUBY_STACK_SPACE_LIMIT;
 848     }
 849     else {
 850         return space_size;
 851     }
 852 }
 853
 854 #ifdef __linux__
 855 static __attribute__((noinline)) void
 856 reserve_stack(volatile char *limit, size_t size)
 857 {
 858 # ifdef C_ALLOCA
 859 #   error needs alloca()
 860 # endif
 861     struct rlimit rl;
 862     volatile char buf[0x100];
 863     enum {stack_check_margin = 0x1000}; /* for -fstack-check */
 864
 865     STACK_GROW_DIR_DETECTION;
 866
 867     if (!getrlimit(RLIMIT_STACK, &rl) && rl.rlim_cur == RLIM_INFINITY)
 868         return;
 869
 870     if (size < stack_check_margin) return;
 871     size -= stack_check_margin;
 872
 873     size -= sizeof(buf); /* margin */
 874     if (IS_STACK_DIR_UPPER()) {
 875         const volatile char *end = buf + sizeof(buf);
 876         limit += size;
 877         if (limit > end) {
 878             /* |<-bottom (=limit(a))                                     top->|
 879              * | .. |<-buf 256B |<-end                          | stack check |
 880              * |  256B  |              =size=                   | margin (4KB)|
 881              * |              =size=         limit(b)->|  256B  |             |
 882              * |                |       alloca(sz)     |        |             |
 883              * | .. |<-buf      |<-limit(c)    [sz-1]->0>       |             |
 884              */
 885             size_t sz = limit - end;
 886             limit = alloca(sz);
 887             limit[sz-1] = 0;
 888         }
 889     }
 890     else {
 891         limit -= size;
 892         if (buf > limit) {
 893             /* |<-top (=limit(a))                                     bottom->|
 894              * | .. | 256B buf->|                               | stack check |
 895              * |  256B  |              =size=                   | margin (4KB)|
 896              * |              =size=         limit(b)->|  256B  |             |
 897              * |                |       alloca(sz)     |        |             |
 898              * | .. |      buf->|           limit(c)-><0>       |             |
 899              */
 900             size_t sz = buf - limit;
 901             limit = alloca(sz);
 902             limit[0] = 0;
 903         }
 904     }
 905 }
 906 #else
 907 # define reserve_stack(limit, size) ((void)(limit), (void)(size))
 908 #endif
 909
 910 #undef ruby_init_stack
 911 void
 912 ruby_init_stack(volatile VALUE *addr)
 913 {
 914     native_main_thread.id = pthread_self();
 915
 916 #if MAINSTACKADDR_AVAILABLE
 917     if (native_main_thread.stack_maxsize) return;
 918     {
 919         void* stackaddr;
 920         size_t size;
 921         if (get_main_stack(&stackaddr, &size) == 0) {
 922             native_main_thread.stack_maxsize = size;
 923             native_main_thread.stack_start = stackaddr;
 924             reserve_stack(stackaddr, size);
 925             goto bound_check;
 926         }
 927     }
 928 #endif
 929 #ifdef STACK_END_ADDRESS
 930     native_main_thread.stack_start = STACK_END_ADDRESS;
 931 #else
 932     if (!native_main_thread.stack_start ||
 933         STACK_UPPER((VALUE *)(void *)&addr,
 934                     native_main_thread.stack_start > addr,
 935                     native_main_thread.stack_start < addr)) {
 936         native_main_thread.stack_start = (VALUE *)addr;
 937     }
 938 #endif
 939     {
 940 #if defined(HAVE_GETRLIMIT)
 941 #if defined(PTHREAD_STACK_DEFAULT)
 942 # if PTHREAD_STACK_DEFAULT < RUBY_STACK_SPACE*5
 943 #  error "PTHREAD_STACK_DEFAULT is too small"
 944 # endif
 945         size_t size = PTHREAD_STACK_DEFAULT;
 946 #else
 947         size_t size = RUBY_VM_THREAD_VM_STACK_SIZE;
 948 #endif
 949         size_t space;
 950         int pagesize = getpagesize();
 951         struct rlimit rlim;
 952         STACK_GROW_DIR_DETECTION;
 953         if (getrlimit(RLIMIT_STACK, &rlim) == 0) {
 954             size = (size_t)rlim.rlim_cur;
 955         }
 956         addr = native_main_thread.stack_start;
 957         if (IS_STACK_DIR_UPPER()) {
 958             space = ((size_t)((char *)addr + size) / pagesize) * pagesize - (size_t)addr;
 959         }
 960         else {
 961             space = (size_t)addr - ((size_t)((char *)addr - size) / pagesize + 1) * pagesize;
 962         }
 963         native_main_thread.stack_maxsize = space;
 964 #endif
 965     }
 966
 967 #if MAINSTACKADDR_AVAILABLE
 968   bound_check:
 969 #endif
 970     /* If addr is out of range of main-thread stack range estimation,  */
 971     /* it should be on co-routine (alternative stack). [Feature #2294] */
 972     {
 973         void *start, *end;
 974         STACK_GROW_DIR_DETECTION;
 975
 976         if (IS_STACK_DIR_UPPER()) {
 977             start = native_main_thread.stack_start;
 978             end = (char *)native_main_thread.stack_start + native_main_thread.stack_maxsize;
 979         }
 980         else {
 981             start = (char *)native_main_thread.stack_start - native_main_thread.stack_maxsize;
 982             end = native_main_thread.stack_start;
 983         }
 984
 985         if ((void *)addr < start || (void *)addr > end) {
 986             /* out of range */
 987             native_main_thread.stack_start = (VALUE *)addr;
 988             native_main_thread.stack_maxsize = 0; /* unknown */
 989         }
 990     }
 991 }
 992
 993 #define CHECK_ERR(expr) \
 994     {int err = (expr); if (err) {rb_bug_errno(#expr, err);}}
 995
 996 static int
 997 native_thread_init_stack(rb_thread_t *th)
 998 {
 999     rb_nativethread_id_t curr = pthread_self();
1000
1001     if (pthread_equal(curr, native_main_thread.id)) {
1002         th->ec->machine.stack_start = native_main_thread.stack_start;
1003         th->ec->machine.stack_maxsize = native_main_thread.stack_maxsize;
1004     }
1005     else {
1006 #ifdef STACKADDR_AVAILABLE
1007         void *start;
1008         size_t size;
1009
1010         if (get_stack(&start, &size) == 0) {
1011             uintptr_t diff = (uintptr_t)start - (uintptr_t)&curr;
1012             th->ec->machine.stack_start = (VALUE *)&curr;
1013             th->ec->machine.stack_maxsize = size - diff;
1014         }
1015 #else
1016         rb_raise(rb_eNotImpError, "ruby engine can initialize only in the main thread");
1017 #endif
1018     }
1019
1020     return 0;
1021 }
1022
1023 #ifndef __CYGWIN__
1024 #define USE_NATIVE_THREAD_INIT 1
1025 #endif
1026
1027 static void *
1028 thread_start_func_1(void *th_ptr)
1029 {
1030     rb_thread_t *th = th_ptr;
1031     RB_ALTSTACK_INIT(void *altstack, th->altstack);
1032 #if USE_THREAD_CACHE
1033   thread_start:
1034 #endif
1035     {
1036 #if !defined USE_NATIVE_THREAD_INIT
1037         VALUE stack_start;
1038 #endif
1039
1040         fill_thread_id_str(th);
1041 #if defined USE_NATIVE_THREAD_INIT
1042         native_thread_init_stack(th);
1043 #endif
1044         native_thread_init(th);
1045         /* run */
1046 #if defined USE_NATIVE_THREAD_INIT
1047         thread_start_func_2(th, th->ec->machine.stack_start);
1048 #else
1049         thread_start_func_2(th, &stack_start);
1050 #endif
1051     }
1052 #if USE_THREAD_CACHE
1053     /* cache thread */
1054     if ((th = register_cached_thread_and_wait(RB_ALTSTACK(altstack))) != 0) {
1055         goto thread_start;
1056     }
1057 #else
1058     RB_ALTSTACK_FREE(altstack);
1059 #endif
1060     return 0;
1061 }
1062
1063 struct cached_thread_entry {
1064     rb_nativethread_cond_t cond;
1065     rb_nativethread_id_t thread_id;
1066     rb_thread_t *th;
1067     void *altstack;
1068     struct list_node node;
1069 };
1070
1071 #if USE_THREAD_CACHE
1072 static rb_nativethread_lock_t thread_cache_lock = RB_NATIVETHREAD_LOCK_INIT;
1073 static LIST_HEAD(cached_thread_head);
1074
1075 #  if defined(HAVE_WORKING_FORK)
1076 static void
1077 thread_cache_reset(void)
1078 {
1079     rb_native_mutex_initialize(&thread_cache_lock);
1080     list_head_init(&cached_thread_head);
1081 }
1082 #  endif
1083
1084 /*
1085  * number of seconds to cache for, I think 1-5s is sufficient to obviate
1086  * the need for thread pool in many network programs (taking into account
1087  * worst case network latency across the globe) without wasting memory
1088  */
1089 #ifndef THREAD_CACHE_TIME
1090 #  define THREAD_CACHE_TIME ((rb_hrtime_t)3 * RB_HRTIME_PER_SEC)
1091 #endif
1092
1093 static rb_thread_t *
1094 register_cached_thread_and_wait(void *altstack)
1095 {
1096     rb_hrtime_t end = THREAD_CACHE_TIME;
1097     struct cached_thread_entry entry;
1098
1099     rb_native_cond_initialize(&entry.cond);
1100     entry.altstack = altstack;
1101     entry.th = NULL;
1102     entry.thread_id = pthread_self();
1103     end = native_cond_timeout(&entry.cond, end);
1104
1105     rb_native_mutex_lock(&thread_cache_lock);
1106     {
1107         list_add(&cached_thread_head, &entry.node);
1108
1109         native_cond_timedwait(&entry.cond, &thread_cache_lock, &end);
1110
1111         if (entry.th == NULL) { /* unused */
1112             list_del(&entry.node);
1113         }
1114     }
1115     rb_native_mutex_unlock(&thread_cache_lock);
1116
1117     rb_native_cond_destroy(&entry.cond);
1118     if (!entry.th) {
1119         RB_ALTSTACK_FREE(entry.altstack);
1120     }
1121
1122     return entry.th;
1123 }
1124 #else
1125 #  if defined(HAVE_WORKING_FORK)
1126 static void thread_cache_reset(void) { }
1127 #  endif
1128 #endif
1129
1130 static int
1131 use_cached_thread(rb_thread_t *th)
1132 {
1133 #if USE_THREAD_CACHE
1134     struct cached_thread_entry *entry;
1135
1136     rb_native_mutex_lock(&thread_cache_lock);
1137     entry = list_pop(&cached_thread_head, struct cached_thread_entry, node);
1138     if (entry) {
1139         entry->th = th;
1140         /* th->thread_id must be set before signal for Thread#name= */
1141         th->thread_id = entry->thread_id;
1142         fill_thread_id_str(th);
1143         rb_native_cond_signal(&entry->cond);
1144     }
1145     rb_native_mutex_unlock(&thread_cache_lock);
1146     return !!entry;
1147 #endif
1148     return 0;
1149 }
1150
1151 static void
1152 clear_thread_cache_altstack(void)
1153 {
1154 #if USE_THREAD_CACHE
1155     struct cached_thread_entry *entry;
1156
1157     rb_native_mutex_lock(&thread_cache_lock);
1158     list_for_each(&cached_thread_head, entry, node) {
1159         void MAYBE_UNUSED(*altstack) = entry->altstack;
1160         entry->altstack = 0;
1161         RB_ALTSTACK_FREE(altstack);
1162     }
1163     rb_native_mutex_unlock(&thread_cache_lock);
1164 #endif
1165 }
1166
1167 static int
1168 native_thread_create(rb_thread_t *th)
1169 {
1170     int err = 0;
1171
1172     if (use_cached_thread(th)) {
1173         thread_debug("create (use cached thread): %p\n", (void *)th);
1174     }
1175     else {
1176         pthread_attr_t attr;
1177         const size_t stack_size = th->vm->default_params.thread_machine_stack_size + th->vm->default_params.thread_vm_stack_size;
1178         const size_t space = space_size(stack_size);
1179
1180 #ifdef USE_SIGALTSTACK
1181         th->altstack = rb_allocate_sigaltstack();
1182 #endif
1183         th->ec->machine.stack_maxsize = stack_size - space;
1184
1185         CHECK_ERR(pthread_attr_init(&attr));
1186
1187 # ifdef PTHREAD_STACK_MIN
1188         thread_debug("create - stack size: %lu\n", (unsigned long)stack_size);
1189         CHECK_ERR(pthread_attr_setstacksize(&attr, stack_size));
1190 # endif
1191
1192 # ifdef HAVE_PTHREAD_ATTR_SETINHERITSCHED
1193         CHECK_ERR(pthread_attr_setinheritsched(&attr, PTHREAD_INHERIT_SCHED));
1194 # endif
1195         CHECK_ERR(pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED));
1196
1197         err = pthread_create(&th->thread_id, &attr, thread_start_func_1, th);
1198         thread_debug("create: %p (%d)\n", (void *)th, err);
1199         /* should be done in the created thread */
1200         fill_thread_id_str(th);
1201         CHECK_ERR(pthread_attr_destroy(&attr));
1202     }
1203     return err;
1204 }
1205
1206 #if USE_NATIVE_THREAD_PRIORITY
1207
1208 static void
1209 native_thread_apply_priority(rb_thread_t *th)
1210 {
1211 #if defined(_POSIX_PRIORITY_SCHEDULING) && (_POSIX_PRIORITY_SCHEDULING > 0)
1212     struct sched_param sp;
1213     int policy;
1214     int priority = 0 - th->priority;
1215     int max, min;
1216     pthread_getschedparam(th->thread_id, &policy, &sp);
1217     max = sched_get_priority_max(policy);
1218     min = sched_get_priority_min(policy);
1219
1220     if (min > priority) {
1221         priority = min;
1222     }
1223     else if (max < priority) {
1224         priority = max;
1225     }
1226
1227     sp.sched_priority = priority;
1228     pthread_setschedparam(th->thread_id, policy, &sp);
1229 #else
1230     /* not touched */
1231 #endif
1232 }
1233
1234 #endif /* USE_NATIVE_THREAD_PRIORITY */
1235
1236 static int
1237 native_fd_select(int n, rb_fdset_t *readfds, rb_fdset_t *writefds, rb_fdset_t *exceptfds, struct timeval *timeout, rb_thread_t *th)
1238 {
1239     return rb_fd_select(n, readfds, writefds, exceptfds, timeout);
1240 }
1241
1242 static void
1243 ubf_pthread_cond_signal(void *ptr)
1244 {
1245     rb_thread_t *th = (rb_thread_t *)ptr;
1246     thread_debug("ubf_pthread_cond_signal (%p)\n", (void *)th);
1247     rb_native_cond_signal(&th->native_thread_data.cond.intr);
1248 }
1249
1250 static void
1251 native_cond_sleep(rb_thread_t *th, rb_hrtime_t *rel)
1252 {
1253     rb_nativethread_lock_t *lock = &th->interrupt_lock;
1254     rb_nativethread_cond_t *cond = &th->native_thread_data.cond.intr;
1255
1256     /* Solaris cond_timedwait() return EINVAL if an argument is greater than
1257      * current_time + 100,000,000.  So cut up to 100,000,000.  This is
1258      * considered as a kind of spurious wakeup.  The caller to native_sleep
1259      * should care about spurious wakeup.
1260      *
1261      * See also [Bug #1341] [ruby-core:29702]
1262      * http://download.oracle.com/docs/cd/E19683-01/816-0216/6m6ngupgv/index.html
1263      */
1264     const rb_hrtime_t max = (rb_hrtime_t)100000000 * RB_HRTIME_PER_SEC;
1265
1266     GVL_UNLOCK_BEGIN(th);
1267     {
1268         rb_native_mutex_lock(lock);
1269         th->unblock.func = ubf_pthread_cond_signal;
1270         th->unblock.arg = th;
1271
1272         if (RUBY_VM_INTERRUPTED(th->ec)) {
1273             /* interrupted.  return immediate */
1274             thread_debug("native_sleep: interrupted before sleep\n");
1275         }
1276         else {
1277             if (!rel) {
1278                 rb_native_cond_wait(cond, lock);
1279             }
1280             else {
1281                 rb_hrtime_t end;
1282
1283                 if (*rel > max) {
1284                     *rel = max;
1285                 }
1286
1287                 end = native_cond_timeout(cond, *rel);
1288                 native_cond_timedwait(cond, lock, &end);
1289             }
1290         }
1291         th->unblock.func = 0;
1292
1293         rb_native_mutex_unlock(lock);
1294     }
1295     GVL_UNLOCK_END(th);
1296
1297     thread_debug("native_sleep done\n");
1298 }
1299
1300 #ifdef USE_UBF_LIST
1301 static LIST_HEAD(ubf_list_head);
1302 static rb_nativethread_lock_t ubf_list_lock = RB_NATIVETHREAD_LOCK_INIT;
1303
1304 static void
1305 ubf_list_atfork(void)
1306 {
1307     list_head_init(&ubf_list_head);
1308     rb_native_mutex_initialize(&ubf_list_lock);
1309 }
1310
1311 /* The thread 'th' is registered to be trying unblock. */
1312 static void
1313 register_ubf_list(rb_thread_t *th)
1314 {
1315     struct list_node *node = &th->native_thread_data.node.ubf;
1316
1317     if (list_empty((struct list_head*)node)) {
1318         rb_native_mutex_lock(&ubf_list_lock);
1319         list_add(&ubf_list_head, node);
1320         rb_native_mutex_unlock(&ubf_list_lock);
1321     }
1322 }
1323
1324 /* The thread 'th' is unblocked. It no longer need to be registered. */
1325 static void
1326 unregister_ubf_list(rb_thread_t *th)
1327 {
1328     struct list_node *node = &th->native_thread_data.node.ubf;
1329
1330     /* we can't allow re-entry into ubf_list_head */
1331     VM_ASSERT(th->unblock.func == 0);
1332
1333     if (!list_empty((struct list_head*)node)) {
1334         rb_native_mutex_lock(&ubf_list_lock);
1335         list_del_init(node);
1336         if (list_empty(&ubf_list_head) && !rb_signal_buff_size()) {
1337             ubf_timer_disarm();
1338         }
1339         rb_native_mutex_unlock(&ubf_list_lock);
1340     }
1341 }
1342
1343 /*
1344  * send a signal to intent that a target thread return from blocking syscall.
1345  * Maybe any signal is ok, but we chose SIGVTALRM.
1346  */
1347 static void
1348 ubf_wakeup_thread(rb_thread_t *th)
1349 {
1350     thread_debug("thread_wait_queue_wakeup (%"PRI_THREAD_ID")\n", thread_id_str(th));
1351     pthread_kill(th->thread_id, SIGVTALRM);
1352 }
1353
1354 static void
1355 ubf_select(void *ptr)
1356 {
1357     rb_thread_t *th = (rb_thread_t *)ptr;
1358     rb_global_vm_lock_t *gvl = rb_ractor_gvl(th->ractor);
1359     const rb_thread_t *cur = ruby_thread_from_native(); /* may be 0 */
1360
1361     register_ubf_list(th);
1362
1363     /*
1364      * ubf_wakeup_thread() doesn't guarantee to wake up a target thread.
1365      * Therefore, we repeatedly call ubf_wakeup_thread() until a target thread
1366      * exit from ubf function.  We must have a timer to perform this operation.
1367      * We use double-checked locking here because this function may be called
1368      * while vm->gvl.lock is held in do_gvl_timer.
1369      * There is also no need to start a timer if we're the designated
1370      * sigwait_th thread, otherwise we can deadlock with a thread
1371      * in unblock_function_clear.
1372      */
1373     if (cur != gvl->timer && cur != sigwait_th) {
1374         /*
1375          * Double-checked locking above was to prevent nested locking
1376          * by the SAME thread.  We use trylock here to prevent deadlocks
1377          * between DIFFERENT threads
1378          */
1379         if (rb_native_mutex_trylock(&gvl->lock) == 0) {
1380             if (!gvl->timer) {
1381                 rb_thread_wakeup_timer_thread(-1);
1382             }
1383             rb_native_mutex_unlock(&gvl->lock);
1384         }
1385     }
1386
1387     ubf_wakeup_thread(th);
1388 }
1389
1390 static int
1391 ubf_threads_empty(void)
1392 {
1393     return list_empty(&ubf_list_head);
1394 }
1395
1396 static void
1397 ubf_wakeup_all_threads(void)
1398 {
1399     rb_thread_t *th;
1400     native_thread_data_t *dat;
1401
1402     if (!ubf_threads_empty()) {
1403         rb_native_mutex_lock(&ubf_list_lock);
1404         list_for_each(&ubf_list_head, dat, node.ubf) {
1405             th = container_of(dat, rb_thread_t, native_thread_data);
1406             ubf_wakeup_thread(th);
1407         }
1408         rb_native_mutex_unlock(&ubf_list_lock);
1409     }
1410 }
1411
1412 #else /* USE_UBF_LIST */
1413 #define register_ubf_list(th) (void)(th)
1414 #define unregister_ubf_list(th) (void)(th)
1415 #define ubf_select 0
1416 static void ubf_wakeup_all_threads(void) { return; }
1417 static int ubf_threads_empty(void) { return 1; }
1418 #define ubf_list_atfork() do {} while (0)
1419 #endif /* USE_UBF_LIST */
1420
1421 #define TT_DEBUG 0
1422 #define WRITE_CONST(fd, str) (void)(write((fd),(str),sizeof(str)-1)<0)
1423
1424 static struct {
1425     /* pipes are closed in forked children when owner_process does not match */
1426     int normal[2]; /* [0] == sigwait_fd */
1427     int ub_main[2]; /* unblock main thread from native_ppoll_sleep */
1428
1429     /* volatile for signal handler use: */
1430     volatile rb_pid_t owner_process;
1431 } signal_self_pipe = {
1432     {-1, -1},
1433     {-1, -1},
1434 };
1435
1436 /* only use signal-safe system calls here */
1437 static void
1438 rb_thread_wakeup_timer_thread_fd(int fd)
1439 {
1440 #if USE_EVENTFD
1441     const uint64_t buff = 1;
1442 #else
1443     const char buff = '!';
1444 #endif
1445     ssize_t result;
1446
1447     /* already opened */
1448     if (fd >= 0) {
1449       retry:
1450         if ((result = write(fd, &buff, sizeof(buff))) <= 0) {
1451             int e = errno;
1452             switch (e) {
1453               case EINTR: goto retry;
1454               case EAGAIN:
1455 #if defined(EWOULDBLOCK) && EWOULDBLOCK != EAGAIN
1456               case EWOULDBLOCK:
1457 #endif
1458                 break;
1459               default:
1460                 async_bug_fd("rb_thread_wakeup_timer_thread: write", e, fd);
1461             }
1462         }
1463         if (TT_DEBUG) WRITE_CONST(2, "rb_thread_wakeup_timer_thread: write\n");
1464     }
1465     else {
1466         /* ignore wakeup */
1467     }
1468 }
1469
1470 /*
1471  * This ensures we get a SIGVTALRM in TIME_QUANTUM_MSEC if our
1472  * process could not react to the original signal in time.
1473  */
1474 static void
1475 ubf_timer_arm(rb_pid_t current) /* async signal safe */
1476 {
1477 #if UBF_TIMER == UBF_TIMER_POSIX
1478     if ((!current || timer_posix.owner == current) &&
1479         timer_state_cas(RTIMER_DISARM, RTIMER_ARMING) == RTIMER_DISARM) {
1480         struct itimerspec it;
1481
1482         it.it_interval.tv_sec = it.it_value.tv_sec = 0;
1483         it.it_interval.tv_nsec = it.it_value.tv_nsec = TIME_QUANTUM_NSEC;
1484
1485         if (timer_settime(timer_posix.timerid, 0, &it, 0))
1486             rb_async_bug_errno("timer_settime (arm)", errno);
1487
1488         switch (timer_state_cas(RTIMER_ARMING, RTIMER_ARMED)) {
1489           case RTIMER_DISARM:
1490             /* somebody requested a disarm while we were arming */
1491             /* may race harmlessly with ubf_timer_destroy */
1492             (void)timer_settime(timer_posix.timerid, 0, &zero, 0);
1493
1494           case RTIMER_ARMING: return; /* success */
1495           case RTIMER_ARMED:
1496             /*
1497              * it is possible to have another thread disarm, and
1498              * a third thread arm finish re-arming before we get
1499              * here, so we wasted a syscall with timer_settime but
1500              * probably unavoidable in a signal handler.
1501              */
1502             return;
1503           case RTIMER_DEAD:
1504             /* may race harmlessly with ubf_timer_destroy */
1505             (void)timer_settime(timer_posix.timerid, 0, &zero, 0);
1506             return;
1507           default:
1508             rb_async_bug_errno("UBF_TIMER_POSIX unknown state", ERANGE);
1509         }
1510     }
1511 #elif UBF_TIMER == UBF_TIMER_PTHREAD
1512     if (!current || current == timer_pthread.owner) {
1513         if (ATOMIC_EXCHANGE(timer_pthread.armed, 1) == 0)
1514             rb_thread_wakeup_timer_thread_fd(timer_pthread.low[1]);
1515     }
1516 #endif
1517 }
1518
1519 void
1520 rb_thread_wakeup_timer_thread(int sig)
1521 {
1522     rb_pid_t current;
1523
1524     /* non-sighandler path */
1525     if (sig <= 0) {
1526         rb_thread_wakeup_timer_thread_fd(signal_self_pipe.normal[1]);
1527         if (sig < 0) {
1528             ubf_timer_arm(0);
1529         }
1530         return;
1531     }
1532
1533     /* must be safe inside sighandler, so no mutex */
1534     current = getpid();
1535     if (signal_self_pipe.owner_process == current) {
1536         rb_thread_wakeup_timer_thread_fd(signal_self_pipe.normal[1]);
1537
1538         /*
1539          * system_working check is required because vm and main_thread are
1540          * freed during shutdown
1541          */
1542         if (system_working > 0) {
1543             volatile rb_execution_context_t *ec;
1544             rb_vm_t *vm = GET_VM();
1545             rb_thread_t *mth;
1546
1547             /*
1548              * FIXME: root VM and main_thread should be static and not
1549              * on heap for maximum safety (and startup/shutdown speed)
1550              */
1551             if (!vm) return;
1552             mth = vm->ractor.main_thread;
1553             if (!mth || system_working <= 0) return;
1554
1555             /* this relies on GC for grace period before cont_free */
1556             ec = ACCESS_ONCE(rb_execution_context_t *, mth->ec);
1557
1558             if (ec) {
1559                 RUBY_VM_SET_TRAP_INTERRUPT(ec);
1560                 ubf_timer_arm(current);
1561
1562                 /* some ubfs can interrupt single-threaded process directly */
1563                 if (vm->ubf_async_safe && mth->unblock.func) {
1564                     (mth->unblock.func)(mth->unblock.arg);
1565                 }
1566             }
1567         }
1568     }
1569 }
1570
1571 #define CLOSE_INVALIDATE_PAIR(expr) \
1572     close_invalidate_pair(expr,"close_invalidate: "#expr)
1573 static void
1574 close_invalidate(int *fdp, const char *msg)
1575 {
1576     int fd = *fdp;
1577
1578     *fdp = -1;
1579     if (close(fd) < 0) {
1580         async_bug_fd(msg, errno, fd);
1581     }
1582 }
1583
1584 static void
1585 close_invalidate_pair(int fds[2], const char *msg)
1586 {
1587     if (USE_EVENTFD && fds[0] == fds[1]) {
1588         close_invalidate(&fds[0], msg);
1589         fds[1] = -1;
1590     }
1591     else {
1592         close_invalidate(&fds[0], msg);
1593         close_invalidate(&fds[1], msg);
1594     }
1595 }
1596
1597 static void
1598 set_nonblock(int fd)
1599 {
1600     int oflags;
1601     int err;
1602
1603     oflags = fcntl(fd, F_GETFL);
1604     if (oflags == -1)
1605         rb_sys_fail(0);
1606     oflags |= O_NONBLOCK;
1607     err = fcntl(fd, F_SETFL, oflags);
1608     if (err == -1)
1609         rb_sys_fail(0);
1610 }
1611
1612 /* communication pipe with timer thread and signal handler */
1613 static int
1614 setup_communication_pipe_internal(int pipes[2])
1615 {
1616     int err;
1617
1618     if (pipes[0] >= 0 || pipes[1] >= 0) {
1619         VM_ASSERT(pipes[0] >= 0);
1620         VM_ASSERT(pipes[1] >= 0);
1621         return 0;
1622     }
1623
1624     /*
1625      * Don't bother with eventfd on ancient Linux 2.6.22..2.6.26 which were
1626      * missing EFD_* flags, they can fall back to pipe
1627      */
1628 #if USE_EVENTFD && defined(EFD_NONBLOCK) && defined(EFD_CLOEXEC)
1629     pipes[0] = pipes[1] = eventfd(0, EFD_NONBLOCK|EFD_CLOEXEC);
1630     if (pipes[0] >= 0) {
1631         rb_update_max_fd(pipes[0]);
1632         return 0;
1633     }
1634 #endif
1635
1636     err = rb_cloexec_pipe(pipes);
1637     if (err != 0) {
1638         rb_warn("pipe creation failed for timer: %s, scheduling broken",
1639                 strerror(errno));
1640         return -1;
1641     }
1642     rb_update_max_fd(pipes[0]);
1643     rb_update_max_fd(pipes[1]);
1644     set_nonblock(pipes[0]);
1645     set_nonblock(pipes[1]);
1646     return 0;
1647 }
1648
1649 #if !defined(SET_CURRENT_THREAD_NAME) && defined(__linux__) && defined(PR_SET_NAME)
1650 # define SET_CURRENT_THREAD_NAME(name) prctl(PR_SET_NAME, name)
1651 #endif
1652
1653 enum {
1654     THREAD_NAME_MAX =
1655 #if defined(__linux__)
1656     16
1657 #elif defined(__APPLE__)
1658 /* Undocumented, and main thread seems unlimited */
1659     64
1660 #else
1661     16
1662 #endif
1663 };
1664
1665 static VALUE threadptr_invoke_proc_location(rb_thread_t *th);
1666
1667 static void
1668 native_set_thread_name(rb_thread_t *th)
1669 {
1670 #ifdef SET_CURRENT_THREAD_NAME
1671     VALUE loc;
1672     if (!NIL_P(loc = th->name)) {
1673         SET_CURRENT_THREAD_NAME(RSTRING_PTR(loc));
1674     }
1675     else if ((loc = threadptr_invoke_proc_location(th)) != Qnil) {
1676         char *name, *p;
1677         char buf[THREAD_NAME_MAX];
1678         size_t len;
1679         int n;
1680
1681         name = RSTRING_PTR(RARRAY_AREF(loc, 0));
1682         p = strrchr(name, '/'); /* show only the basename of the path. */
1683         if (p && p[1])
1684             name = p + 1;
1685
1686         n = snprintf(buf, sizeof(buf), "%s:%d", name, NUM2INT(RARRAY_AREF(loc, 1)));
1687         RB_GC_GUARD(loc);
1688
1689         len = (size_t)n;
1690         if (len >= sizeof(buf)) {
1691             buf[sizeof(buf)-2] = '*';
1692             buf[sizeof(buf)-1] = '\0';
1693         }
1694         SET_CURRENT_THREAD_NAME(buf);
1695     }
1696 #endif
1697 }
1698
1699 static void
1700 native_set_another_thread_name(rb_nativethread_id_t thread_id, VALUE name)
1701 {
1702 #if defined SET_ANOTHER_THREAD_NAME || defined SET_CURRENT_THREAD_NAME
1703     char buf[THREAD_NAME_MAX];
1704     const char *s = "";
1705 # if !defined SET_ANOTHER_THREAD_NAME
1706     if (!pthread_equal(pthread_self(), thread_id)) return;
1707 # endif
1708     if (!NIL_P(name)) {
1709         long n;
1710         RSTRING_GETMEM(name, s, n);
1711         if (n >= (int)sizeof(buf)) {
1712             memcpy(buf, s, sizeof(buf)-1);
1713             buf[sizeof(buf)-1] = '\0';
1714             s = buf;
1715         }
1716     }
1717 # if defined SET_ANOTHER_THREAD_NAME
1718     SET_ANOTHER_THREAD_NAME(thread_id, s);
1719 # elif defined SET_CURRENT_THREAD_NAME
1720     SET_CURRENT_THREAD_NAME(s);
1721 # endif
1722 #endif
1723 }
1724
1725 #if defined(RB_THREAD_T_HAS_NATIVE_ID) || defined(__APPLE__)
1726 static VALUE
1727 native_thread_native_thread_id(rb_thread_t *target_th)
1728 {
1729 #ifdef RB_THREAD_T_HAS_NATIVE_ID
1730     int tid = target_th->tid;
1731     if (tid == 0) return Qnil;
1732     return INT2FIX(tid);
1733 #elif defined(__APPLE__)
1734     uint64_t tid;
1735     int e = pthread_threadid_np(target_th->thread_id, &tid);
1736     if (e != 0) rb_syserr_fail(e, "pthread_threadid_np");
1737     return ULL2NUM((unsigned long long)tid);
1738 #endif
1739 }
1740 # define USE_NATIVE_THREAD_NATIVE_THREAD_ID 1
1741 #else
1742 # define USE_NATIVE_THREAD_NATIVE_THREAD_ID 0
1743 #endif
1744
1745 static void
1746 ubf_timer_invalidate(void)
1747 {
1748 #if UBF_TIMER == UBF_TIMER_PTHREAD
1749     CLOSE_INVALIDATE_PAIR(timer_pthread.low);
1750 #endif
1751 }
1752
1753 static void
1754 ubf_timer_pthread_create(rb_pid_t current)
1755 {
1756 #if UBF_TIMER == UBF_TIMER_PTHREAD
1757     int err;
1758     if (timer_pthread.owner == current)
1759         return;
1760
1761     if (setup_communication_pipe_internal(timer_pthread.low) < 0)
1762         return;
1763
1764     err = pthread_create(&timer_pthread.thid, 0, timer_pthread_fn, GET_VM());
1765     if (!err)
1766         timer_pthread.owner = current;
1767     else
1768         rb_warn("pthread_create failed for timer: %s, signals racy",
1769                 strerror(err));
1770 #endif
1771 }
1772
1773 static void
1774 ubf_timer_create(rb_pid_t current)
1775 {
1776 #if UBF_TIMER == UBF_TIMER_POSIX
1777 #  if defined(__sun)
1778 #    define UBF_TIMER_CLOCK CLOCK_REALTIME
1779 #  else /* Tested Linux and FreeBSD: */
1780 #    define UBF_TIMER_CLOCK CLOCK_MONOTONIC
1781 #  endif
1782
1783     struct sigevent sev;
1784
1785     sev.sigev_notify = SIGEV_SIGNAL;
1786     sev.sigev_signo = SIGVTALRM;
1787     sev.sigev_value.sival_ptr = &timer_posix;
1788
1789     if (!timer_create(UBF_TIMER_CLOCK, &sev, &timer_posix.timerid)) {
1790         rb_atomic_t prev = timer_state_exchange(RTIMER_DISARM);
1791
1792         if (prev != RTIMER_DEAD) {
1793             rb_bug("timer_posix was not dead: %u\n", (unsigned)prev);
1794         }
1795         timer_posix.owner = current;
1796     }
1797     else {
1798         rb_warn("timer_create failed: %s, signals racy", strerror(errno));
1799     }
1800 #endif
1801     if (UBF_TIMER == UBF_TIMER_PTHREAD)
1802         ubf_timer_pthread_create(current);
1803 }
1804
1805 static void
1806 rb_thread_create_timer_thread(void)
1807 {
1808     /* we only create the pipe, and lazy-spawn */
1809     rb_pid_t current = getpid();
1810     rb_pid_t owner = signal_self_pipe.owner_process;
1811
1812     if (owner && owner != current) {
1813         CLOSE_INVALIDATE_PAIR(signal_self_pipe.normal);
1814         CLOSE_INVALIDATE_PAIR(signal_self_pipe.ub_main);
1815         ubf_timer_invalidate();
1816     }
1817
1818     if (setup_communication_pipe_internal(signal_self_pipe.normal) < 0) return;
1819     if (setup_communication_pipe_internal(signal_self_pipe.ub_main) < 0) return;
1820
1821     ubf_timer_create(current);
1822     if (owner != current) {
1823         /* validate pipe on this process */
1824         sigwait_th = THREAD_INVALID;
1825         signal_self_pipe.owner_process = current;
1826     }
1827 }
1828
1829 static void
1830 ubf_timer_disarm(void)
1831 {
1832 #if UBF_TIMER == UBF_TIMER_POSIX
1833     rb_atomic_t prev;
1834
1835     if (timer_posix.owner && timer_posix.owner != getpid()) return;
1836     prev = timer_state_cas(RTIMER_ARMED, RTIMER_DISARM);
1837     switch (prev) {
1838       case RTIMER_DISARM: return; /* likely */
1839       case RTIMER_ARMING: return; /* ubf_timer_arm will disarm itself */
1840       case RTIMER_ARMED:
1841         if (timer_settime(timer_posix.timerid, 0, &zero, 0)) {
1842             int err = errno;
1843
1844             if (err == EINVAL) {
1845                 prev = timer_state_cas(RTIMER_DISARM, RTIMER_DISARM);
1846
1847                 /* main thread may have killed the timer */
1848                 if (prev == RTIMER_DEAD) return;
1849
1850                 rb_bug_errno("timer_settime (disarm)", err);
1851             }
1852         }
1853         return;
1854       case RTIMER_DEAD: return; /* stay dead */
1855       default:
1856         rb_bug("UBF_TIMER_POSIX bad state: %u\n", (unsigned)prev);
1857     }
1858
1859 #elif UBF_TIMER == UBF_TIMER_PTHREAD
1860     ATOMIC_SET(timer_pthread.armed, 0);
1861 #endif
1862 }
1863
1864 static void
1865 ubf_timer_destroy(void)
1866 {
1867 #if UBF_TIMER == UBF_TIMER_POSIX
1868     if (timer_posix.owner == getpid()) {
1869         rb_atomic_t expect = RTIMER_DISARM;
1870         size_t i, max = 10000000;
1871
1872         /* prevent signal handler from arming: */
1873         for (i = 0; i < max; i++) {
1874             switch (timer_state_cas(expect, RTIMER_DEAD)) {
1875               case RTIMER_DISARM:
1876                 if (expect == RTIMER_DISARM) goto done;
1877                 expect = RTIMER_DISARM;
1878                 break;
1879               case RTIMER_ARMING:
1880                 native_thread_yield(); /* let another thread finish arming */
1881                 expect = RTIMER_ARMED;
1882                 break;
1883               case RTIMER_ARMED:
1884                 if (expect == RTIMER_ARMED) {
1885                     if (timer_settime(timer_posix.timerid, 0, &zero, 0))
1886                         rb_bug_errno("timer_settime (destroy)", errno);
1887                     goto done;
1888                 }
1889                 expect = RTIMER_ARMED;
1890                 break;
1891               case RTIMER_DEAD:
1892                 rb_bug("RTIMER_DEAD unexpected");
1893             }
1894         }
1895         rb_bug("timed out waiting for timer to arm");
1896 done:
1897         if (timer_delete(timer_posix.timerid) < 0)
1898             rb_sys_fail("timer_delete");
1899
1900         VM_ASSERT(timer_state_exchange(RTIMER_DEAD) == RTIMER_DEAD);
1901     }
1902 #elif UBF_TIMER == UBF_TIMER_PTHREAD
1903     int err;
1904
1905     timer_pthread.owner = 0;
1906     ubf_timer_disarm();
1907     rb_thread_wakeup_timer_thread_fd(timer_pthread.low[1]);
1908     err = pthread_join(timer_pthread.thid, 0);
1909     if (err) {
1910         rb_raise(rb_eThreadError, "native_thread_join() failed (%d)", err);
1911     }
1912 #endif
1913 }
1914
1915 static int
1916 native_stop_timer_thread(void)
1917 {
1918     int stopped;
1919     stopped = --system_working <= 0;
1920     if (stopped)
1921         ubf_timer_destroy();
1922
1923     if (TT_DEBUG) fprintf(stderr, "stop timer thread\n");
1924     return stopped;
1925 }
1926
1927 static void
1928 native_reset_timer_thread(void)
1929 {
1930     if (TT_DEBUG)  fprintf(stderr, "reset timer thread\n");
1931 }
1932
1933 #ifdef HAVE_SIGALTSTACK
1934 int
1935 ruby_stack_overflowed_p(const rb_thread_t *th, const void *addr)
1936 {
1937     void *base;
1938     size_t size;
1939     const size_t water_mark = 1024 * 1024;
1940     STACK_GROW_DIR_DETECTION;
1941
1942 #ifdef STACKADDR_AVAILABLE
1943     if (get_stack(&base, &size) == 0) {
1944 # ifdef __APPLE__
1945         if (pthread_equal(th->thread_id, native_main_thread.id)) {
1946             struct rlimit rlim;
1947             if (getrlimit(RLIMIT_STACK, &rlim) == 0 && rlim.rlim_cur > size) {
1948                 size = (size_t)rlim.rlim_cur;
1949             }
1950         }
1951 # endif
1952         base = (char *)base + STACK_DIR_UPPER(+size, -size);
1953     }
1954     else
1955 #endif
1956     if (th) {
1957         size = th->ec->machine.stack_maxsize;
1958         base = (char *)th->ec->machine.stack_start - STACK_DIR_UPPER(0, size);
1959     }
1960     else {
1961         return 0;
1962     }
1963     size /= RUBY_STACK_SPACE_RATIO;
1964     if (size > water_mark) size = water_mark;
1965     if (IS_STACK_DIR_UPPER()) {
1966         if (size > ~(size_t)base+1) size = ~(size_t)base+1;
1967         if (addr > base && addr <= (void *)((char *)base + size)) return 1;
1968     }
1969     else {
1970         if (size > (size_t)base) size = (size_t)base;
1971         if (addr > (void *)((char *)base - size) && addr <= base) return 1;
1972     }
1973     return 0;
1974 }
1975 #endif
1976
1977 int
1978 rb_reserved_fd_p(int fd)
1979 {
1980     /* no false-positive if out-of-FD at startup */
1981     if (fd < 0)
1982         return 0;
1983
1984 #if UBF_TIMER == UBF_TIMER_PTHREAD
1985     if (fd == timer_pthread.low[0] || fd == timer_pthread.low[1])
1986         goto check_pid;
1987 #endif
1988     if (fd == signal_self_pipe.normal[0] || fd == signal_self_pipe.normal[1])
1989         goto check_pid;
1990     if (fd == signal_self_pipe.ub_main[0] || fd == signal_self_pipe.ub_main[1])
1991         goto check_pid;
1992     return 0;
1993 check_pid:
1994     if (signal_self_pipe.owner_process == getpid()) /* async-signal-safe */
1995         return 1;
1996     return 0;
1997 }
1998
1999 rb_nativethread_id_t
2000 rb_nativethread_self(void)
2001 {
2002     return pthread_self();
2003 }
2004
2005 #if USE_MJIT
2006 /* A function that wraps actual worker function, for pthread abstraction. */
2007 static void *
2008 mjit_worker(void *arg)
2009 {
2010     void (*worker_func)(void) = (void(*)(void))arg;
2011
2012 #ifdef SET_CURRENT_THREAD_NAME
2013     SET_CURRENT_THREAD_NAME("ruby-mjitworker"); /* 16 byte including NUL */
2014 #endif
2015     worker_func();
2016     return NULL;
2017 }
2018
2019 /* Launch MJIT thread. Returns FALSE if it fails to create thread. */
2020 int
2021 rb_thread_create_mjit_thread(void (*worker_func)(void))
2022 {
2023     pthread_attr_t attr;
2024     pthread_t worker_pid;
2025     int ret = FALSE;
2026
2027     if (pthread_attr_init(&attr) != 0) return ret;
2028
2029     /* jit_worker thread is not to be joined */
2030     if (pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED) == 0
2031         && pthread_create(&worker_pid, &attr, mjit_worker, (void *)worker_func) == 0) {
2032         ret = TRUE;
2033     }
2034     pthread_attr_destroy(&attr);
2035     return ret;
2036 }
2037 #endif
2038
2039 int
2040 rb_sigwait_fd_get(const rb_thread_t *th)
2041 {
2042     if (signal_self_pipe.normal[0] >= 0) {
2043         VM_ASSERT(signal_self_pipe.owner_process == getpid());
2044         /*
2045          * no need to keep firing the timer if any thread is sleeping
2046          * on the signal self-pipe
2047          */
2048         ubf_timer_disarm();
2049
2050         if (ATOMIC_PTR_CAS(sigwait_th, THREAD_INVALID, th) == THREAD_INVALID) {
2051             return signal_self_pipe.normal[0];
2052         }
2053     }
2054     return -1; /* avoid thundering herd and work stealing/starvation */
2055 }
2056
2057 void
2058 rb_sigwait_fd_put(const rb_thread_t *th, int fd)
2059 {
2060     const rb_thread_t *old;
2061
2062     VM_ASSERT(signal_self_pipe.normal[0] == fd);
2063     old = ATOMIC_PTR_EXCHANGE(sigwait_th, THREAD_INVALID);
2064     if (old != th) assert(old == th);
2065 }
2066
2067 #ifndef HAVE_PPOLL
2068 /* TODO: don't ignore sigmask */
2069 static int
2070 ruby_ppoll(struct pollfd *fds, nfds_t nfds,
2071       const struct timespec *ts, const sigset_t *sigmask)
2072 {
2073     int timeout_ms;
2074
2075     if (ts) {
2076         int tmp, tmp2;
2077
2078         if (ts->tv_sec > INT_MAX/1000)
2079             timeout_ms = INT_MAX;
2080         else {
2081             tmp = (int)(ts->tv_sec * 1000);
2082             /* round up 1ns to 1ms to avoid excessive wakeups for <1ms sleep */
2083             tmp2 = (int)((ts->tv_nsec + 999999L) / (1000L * 1000L));
2084             if (INT_MAX - tmp < tmp2)
2085                 timeout_ms = INT_MAX;
2086             else
2087                 timeout_ms = (int)(tmp + tmp2);
2088         }
2089     }
2090     else
2091         timeout_ms = -1;
2092
2093     return poll(fds, nfds, timeout_ms);
2094 }
2095 #  define ppoll(fds,nfds,ts,sigmask) ruby_ppoll((fds),(nfds),(ts),(sigmask))
2096 #endif
2097
2098 void
2099 rb_sigwait_sleep(rb_thread_t *th, int sigwait_fd, const rb_hrtime_t *rel)
2100 {
2101     struct pollfd pfd;
2102     struct timespec ts;
2103
2104     pfd.fd = sigwait_fd;
2105     pfd.events = POLLIN;
2106
2107     if (!BUSY_WAIT_SIGNALS && ubf_threads_empty()) {
2108         (void)ppoll(&pfd, 1, rb_hrtime2timespec(&ts, rel), 0);
2109         check_signals_nogvl(th, sigwait_fd);
2110     }
2111     else {
2112         rb_hrtime_t to = RB_HRTIME_MAX, end;
2113         int n = 0;
2114
2115         if (rel) {
2116             to = *rel;
2117             end = rb_hrtime_add(rb_hrtime_now(), to);
2118         }
2119         /*
2120          * tricky: this needs to return on spurious wakeup (no auto-retry).
2121          * But we also need to distinguish between periodic quantum
2122          * wakeups, so we care about the result of consume_communication_pipe
2123          *
2124          * We want to avoid spurious wakeup for Mutex#sleep compatibility
2125          * [ruby-core:88102]
2126          */
2127         for (;;) {
2128             const rb_hrtime_t *sto = sigwait_timeout(th, sigwait_fd, &to, &n);
2129
2130             if (n) return;
2131             n = ppoll(&pfd, 1, rb_hrtime2timespec(&ts, sto), 0);
2132             if (check_signals_nogvl(th, sigwait_fd))
2133                 return;
2134             if (n || (th && RUBY_VM_INTERRUPTED(th->ec)))
2135                 return;
2136             if (rel && hrtime_update_expire(&to, end))
2137                 return;
2138         }
2139     }
2140 }
2141
2142 /*
2143  * we need to guarantee wakeups from native_ppoll_sleep because
2144  * ubf_select may not be going through ubf_list if other threads
2145  * are all sleeping.
2146  */
2147 static void
2148 ubf_ppoll_sleep(void *ignore)
2149 {
2150     rb_thread_wakeup_timer_thread_fd(signal_self_pipe.ub_main[1]);
2151 }
2152
2153 /*
2154  * Single CPU setups benefit from explicit sched_yield() before ppoll(),
2155  * since threads may be too starved to enter the GVL waitqueue for
2156  * us to detect contention.  Instead, we want to kick other threads
2157  * so they can run and possibly prevent us from entering slow paths
2158  * in ppoll() or similar syscalls.
2159  *
2160  * Confirmed on FreeBSD 11.2 and Linux 4.19.
2161  * [ruby-core:90417] [Bug #15398]
2162  */
2163 #define GVL_UNLOCK_BEGIN_YIELD(th) do { \
2164     const native_thread_data_t *next; \
2165     rb_global_vm_lock_t *gvl = rb_ractor_gvl(th->ractor); \
2166     RB_GC_SAVE_MACHINE_CONTEXT(th); \
2167     rb_native_mutex_lock(&gvl->lock); \
2168     next = gvl_release_common(gvl); \
2169     rb_native_mutex_unlock(&gvl->lock); \
2170     if (!next && rb_ractor_living_thread_num(th->ractor) > 1) { \
2171         native_thread_yield(); \
2172     }
2173
2174 /*
2175  * This function does not exclusively acquire sigwait_fd, so it
2176  * cannot safely read from it.  However, it can be woken up in
2177  * 4 ways:
2178  *
2179  * 1) ubf_ppoll_sleep (from another thread)
2180  * 2) rb_thread_wakeup_timer_thread (from signal handler)
2181  * 3) any unmasked signal hitting the process
2182  * 4) periodic ubf timer wakeups (after 3)
2183  */
2184 static void
2185 native_ppoll_sleep(rb_thread_t *th, rb_hrtime_t *rel)
2186 {
2187     rb_native_mutex_lock(&th->interrupt_lock);
2188     th->unblock.func = ubf_ppoll_sleep;
2189     rb_native_mutex_unlock(&th->interrupt_lock);
2190
2191     GVL_UNLOCK_BEGIN_YIELD(th);
2192
2193     if (!RUBY_VM_INTERRUPTED(th->ec)) {
2194         struct pollfd pfd[2];
2195         struct timespec ts;
2196
2197         pfd[0].fd = signal_self_pipe.normal[0]; /* sigwait_fd */
2198         pfd[1].fd = signal_self_pipe.ub_main[0];
2199         pfd[0].events = pfd[1].events = POLLIN;
2200         if (ppoll(pfd, 2, rb_hrtime2timespec(&ts, rel), 0) > 0) {
2201             if (pfd[1].revents & POLLIN) {
2202                 (void)consume_communication_pipe(pfd[1].fd);
2203             }
2204         }
2205         /*
2206          * do not read the sigwait_fd, here, let uplevel callers
2207          * or other threads that, otherwise we may steal and starve
2208          * other threads
2209          */
2210     }
2211     unblock_function_clear(th);
2212     GVL_UNLOCK_END(th);
2213 }
2214
2215 static void
2216 native_sleep(rb_thread_t *th, rb_hrtime_t *rel)
2217 {
2218     int sigwait_fd = rb_sigwait_fd_get(th);
2219     rb_ractor_blocking_threads_inc(th->ractor, __FILE__, __LINE__);
2220
2221     if (sigwait_fd >= 0) {
2222         rb_native_mutex_lock(&th->interrupt_lock);
2223         th->unblock.func = ubf_sigwait;
2224         rb_native_mutex_unlock(&th->interrupt_lock);
2225
2226         GVL_UNLOCK_BEGIN_YIELD(th);
2227
2228         if (!RUBY_VM_INTERRUPTED(th->ec)) {
2229             rb_sigwait_sleep(th, sigwait_fd, rel);
2230         }
2231         else {
2232             check_signals_nogvl(th, sigwait_fd);
2233         }
2234         unblock_function_clear(th);
2235         GVL_UNLOCK_END(th);
2236         rb_sigwait_fd_put(th, sigwait_fd);
2237         rb_sigwait_fd_migrate(th->vm);
2238     }
2239     else if (th == th->vm->ractor.main_thread) { /* always able to handle signals */
2240         native_ppoll_sleep(th, rel);
2241     }
2242     else {
2243         native_cond_sleep(th, rel);
2244     }
2245
2246     rb_ractor_blocking_threads_dec(th->ractor, __FILE__, __LINE__);
2247 }
2248
2249 #if UBF_TIMER == UBF_TIMER_PTHREAD
2250 static void *
2251 timer_pthread_fn(void *p)
2252 {
2253     rb_vm_t *vm = p;
2254     pthread_t main_thread_id = vm->ractor.main_thread->thread_id;
2255     struct pollfd pfd;
2256     int timeout = -1;
2257     int ccp;
2258
2259     pfd.fd = timer_pthread.low[0];
2260     pfd.events = POLLIN;
2261
2262     while (system_working > 0) {
2263         (void)poll(&pfd, 1, timeout);
2264         ccp = consume_communication_pipe(pfd.fd);
2265
2266         if (system_working > 0) {
2267             if (ATOMIC_CAS(timer_pthread.armed, 1, 1)) {
2268                 pthread_kill(main_thread_id, SIGVTALRM);
2269
2270                 if (rb_signal_buff_size() || !ubf_threads_empty()) {
2271                     timeout = TIME_QUANTUM_MSEC;
2272                 }
2273                 else {
2274                     ATOMIC_SET(timer_pthread.armed, 0);
2275                     timeout = -1;
2276                 }
2277             }
2278             else if (ccp) {
2279                 pthread_kill(main_thread_id, SIGVTALRM);
2280                 ATOMIC_SET(timer_pthread.armed, 0);
2281                 timeout = -1;
2282             }
2283         }
2284     }
2285
2286     return 0;
2287 }
2288 #endif /* UBF_TIMER_PTHREAD */
2289
2290 static VALUE
2291 ubf_caller(void *ignore)
2292 {
2293     rb_thread_sleep_forever();
2294
2295     return Qfalse;
2296 }
2297
2298 /*
2299  * Called if and only if one thread is running, and
2300  * the unblock function is NOT async-signal-safe
2301  * This assumes USE_THREAD_CACHE is true for performance reasons
2302  */
2303 static VALUE
2304 rb_thread_start_unblock_thread(void)
2305 {
2306     return rb_thread_create(ubf_caller, 0);
2307 }
2308 #endif /* THREAD_SYSTEM_DEPENDENT_IMPLEMENTATION */