sys/kern/kern_time.c

   1 /*
   2  * Copyright (c) 1982, 1986, 1989, 1993
   3  *      The Regents of the University of California.  All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  * 3. Neither the name of the University nor the names of its contributors
  14  *    may be used to endorse or promote products derived from this software
  15  *    without specific prior written permission.
  16  *
  17  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  27  * SUCH DAMAGE.
  28  *
  29  *      @(#)kern_time.c 8.1 (Berkeley) 6/10/93
  30  * $FreeBSD: src/sys/kern/kern_time.c,v 1.68.2.1 2002/10/01 08:00:41 bde Exp $
  31  */
  32
  33 #include <sys/param.h>
  34 #include <sys/systm.h>
  35 #include <sys/buf.h>
  36 #include <sys/sysmsg.h>
  37 #include <sys/resourcevar.h>
  38 #include <sys/signalvar.h>
  39 #include <sys/kernel.h>
  40 #include <sys/sysent.h>
  41 #include <sys/proc.h>
  42 #include <sys/priv.h>
  43 #include <sys/time.h>
  44 #include <sys/vnode.h>
  45 #include <sys/sysctl.h>
  46 #include <sys/kern_syscall.h>
  47 #include <sys/upmap.h>
  48 #include <vm/vm.h>
  49 #include <vm/vm_extern.h>
  50
  51 #include <sys/msgport2.h>
  52 #include <sys/spinlock2.h>
  53 #include <sys/thread2.h>
  54
  55 extern struct spinlock ntp_spin;
  56
  57 #define CPUCLOCK_BIT                    0x80000000
  58 #define CPUCLOCK_ID_MASK                ~CPUCLOCK_BIT
  59 #define CPUCLOCK2LWPID(clock_id)        (((clockid_t)(clock_id) >> 32) & CPUCLOCK_ID_MASK)
  60 #define CPUCLOCK2PID(clock_id)          ((clock_id) & CPUCLOCK_ID_MASK)
  61 #define MAKE_CPUCLOCK(pid, lwp_id)      ((clockid_t)(lwp_id) << 32 | (pid) | CPUCLOCK_BIT)
  62
   /*
    * System-wide timezone record: copied out to the caller by
    * sys_gettimeofday() and replaced by sys_settimeofday().
    */
   63 struct timezone tz;
  64
  65 /*
  66  * Time of day and interval timer support.
  67  *
  68  * These routines provide the kernel entry points to get and set
  69  * the time-of-day and per-process interval timers.  Subroutines
  70  * here provide support for adding and subtracting timeval structures
  71  * and decrementing interval timers, optionally reloading the interval
  72  * timers when they expire.
  73  */
  74
  75 static int      settime(struct timeval *);
  76 static void     timevalfix(struct timeval *);
  77 static void     realitexpire(void *arg);
  78
  79 static int sysctl_gettimeofday_quick(SYSCTL_HANDLER_ARGS);
  80
  81
  82 /*
  83  * Nanosleep tries very hard to sleep for a precisely requested time
  84  * interval, down to 1uS.  The administrator can impose a minimum delay
   85  * and a delay below which we hard-loop instead of initiating a timer
   86  * interrupt and sleeping.
  87  *
  88  * For machines under high loads it might be beneficial to increase min_us
   89  * to e.g. 1000uS (1ms) so spinning processes sleep meaningfully.
  90  */
  91 static int     nanosleep_min_us = 10;
  92 static int     nanosleep_hard_us = 100;
  93 static int     gettimeofday_quick = 0;
  94 SYSCTL_INT(_kern, OID_AUTO, nanosleep_min_us, CTLFLAG_RW,
  95            &nanosleep_min_us, 0, "");
  96 SYSCTL_INT(_kern, OID_AUTO, nanosleep_hard_us, CTLFLAG_RW,
  97            &nanosleep_hard_us, 0, "");
  98 SYSCTL_PROC(_kern, OID_AUTO, gettimeofday_quick, CTLTYPE_INT | CTLFLAG_RW,
  99            0, 0, sysctl_gettimeofday_quick, "I", "Quick mode gettimeofday");
 100
 101 static struct lock masterclock_lock = LOCK_INITIALIZER("mstrclk", 0, 0);
 102
 103 static int
 104 settime(struct timeval *tv)
 105 {
 106         struct timeval delta, tv1, tv2;
 107         static struct timeval maxtime, laststep;
 108         struct timespec ts;
 109         int origcpu;
 110
 111         if ((origcpu = mycpu->gd_cpuid) != 0)
 112                 lwkt_setcpu_self(globaldata_find(0));
 113
 114         crit_enter();
 115         microtime(&tv1);
 116         delta = *tv;
 117         timevalsub(&delta, &tv1);
 118
 119         /*
 120          * If the system is secure, we do not allow the time to be
 121          * set to a value earlier than 1 second less than the highest
 122          * time we have yet seen. The worst a miscreant can do in
 123          * this circumstance is "freeze" time. He couldn't go
 124          * back to the past.
 125          *
 126          * We similarly do not allow the clock to be stepped more
 127          * than one second, nor more than once per second. This allows
 128          * a miscreant to make the clock march double-time, but no worse.
 129          */
 130         if (securelevel > 1) {
 131                 if (delta.tv_sec < 0 || delta.tv_usec < 0) {
 132                         /*
 133                          * Update maxtime to latest time we've seen.
 134                          */
 135                         if (tv1.tv_sec > maxtime.tv_sec)
 136                                 maxtime = tv1;
 137                         tv2 = *tv;
 138                         timevalsub(&tv2, &maxtime);
 139                         if (tv2.tv_sec < -1) {
 140                                 tv->tv_sec = maxtime.tv_sec - 1;
 141                                 kprintf("Time adjustment clamped to -1 second\n");
 142                         }
 143                 } else {
 144                         if (tv1.tv_sec == laststep.tv_sec) {
 145                                 crit_exit();
 146                                 return (EPERM);
 147                         }
 148                         if (delta.tv_sec > 1) {
 149                                 tv->tv_sec = tv1.tv_sec + 1;
 150                                 kprintf("Time adjustment clamped to +1 second\n");
 151                         }
 152                         laststep = *tv;
 153                 }
 154         }
 155
 156         ts.tv_sec = tv->tv_sec;
 157         ts.tv_nsec = tv->tv_usec * 1000;
 158         set_timeofday(&ts);
 159         crit_exit();
 160
 161         if (origcpu != 0)
 162                 lwkt_setcpu_self(globaldata_find(origcpu));
 163
 164         resettodr();
 165         return (0);
 166 }
 167
 168 static void
 169 get_process_cputime(struct proc *p, struct timespec *ats)
 170 {
 171         struct rusage ru;
 172
 173         lwkt_gettoken(&p->p_token);
 174         calcru_proc(p, &ru);
 175         lwkt_reltoken(&p->p_token);
 176         timevaladd(&ru.ru_utime, &ru.ru_stime);
 177         TIMEVAL_TO_TIMESPEC(&ru.ru_utime, ats);
 178 }
 179
 180 static void
 181 get_process_usertime(struct proc *p, struct timespec *ats)
 182 {
 183         struct rusage ru;
 184
 185         lwkt_gettoken(&p->p_token);
 186         calcru_proc(p, &ru);
 187         lwkt_reltoken(&p->p_token);
 188         TIMEVAL_TO_TIMESPEC(&ru.ru_utime, ats);
 189 }
 190
 191 static void
 192 get_thread_cputime(struct thread *td, struct timespec *ats)
 193 {
 194         struct timeval sys, user;
 195
 196         calcru(td->td_lwp, &user, &sys);
 197         timevaladd(&user, &sys);
 198         TIMEVAL_TO_TIMESPEC(&user, ats);
 199 }
 200
 201 /*
 202  * MPSAFE
 203  */
 204 int
 205 kern_clock_gettime(clockid_t clock_id, struct timespec *ats)
 206 {
 207         struct proc *p;
 208         struct lwp *lp;
 209         lwpid_t lwp_id;
 210
 211         p = curproc;
 212         switch(clock_id) {
 213         case CLOCK_REALTIME:
 214         case CLOCK_REALTIME_PRECISE:
 215                 nanotime(ats);
 216                 break;
 217         case CLOCK_REALTIME_FAST:
 218                 getnanotime(ats);
 219                 break;
 220         case CLOCK_MONOTONIC:
 221         case CLOCK_MONOTONIC_PRECISE:
 222         case CLOCK_UPTIME:
 223         case CLOCK_UPTIME_PRECISE:
 224                 nanouptime(ats);
 225                 break;
 226         case CLOCK_MONOTONIC_FAST:
 227         case CLOCK_UPTIME_FAST:
 228                 getnanouptime(ats);
 229                 break;
 230         case CLOCK_VIRTUAL:
 231                 get_process_usertime(p, ats);
 232                 break;
 233         case CLOCK_PROF:
 234         case CLOCK_PROCESS_CPUTIME_ID:
 235                 get_process_cputime(p, ats);
 236                 break;
 237         case CLOCK_SECOND:
 238                 ats->tv_sec = time_second;
 239                 ats->tv_nsec = 0;
 240                 break;
 241         case CLOCK_THREAD_CPUTIME_ID:
 242                 get_thread_cputime(curthread, ats);
 243                 break;
 244         default:
 245                 if ((clock_id & CPUCLOCK_BIT) == 0)
 246                         return (EINVAL);
 247                 if ((p = pfind(CPUCLOCK2PID(clock_id))) == NULL)
 248                         return (EINVAL);
 249                 lwp_id = CPUCLOCK2LWPID(clock_id);
 250                 if (lwp_id == 0) {
 251                         get_process_cputime(p, ats);
 252                 } else {
 253                         lwkt_gettoken(&p->p_token);
 254                         lp = lwp_rb_tree_RB_LOOKUP(&p->p_lwp_tree, lwp_id);
 255                         if (lp == NULL) {
 256                                 lwkt_reltoken(&p->p_token);
 257                                 PRELE(p);
 258                                 return (EINVAL);
 259                         }
 260                         get_thread_cputime(lp->lwp_thread, ats);
 261                         lwkt_reltoken(&p->p_token);
 262                 }
 263                 PRELE(p);
 264         }
 265         return (0);
 266 }
 267
 268 /*
 269  * MPSAFE
 270  */
 271 int
 272 sys_clock_gettime(struct sysmsg *sysmsg, const struct clock_gettime_args *uap)
 273 {
 274         struct timespec ats;
 275         int error;
 276
 277         error = kern_clock_gettime(uap->clock_id, &ats);
 278         if (error == 0)
 279                 error = copyout(&ats, uap->tp, sizeof(ats));
 280
 281         return (error);
 282 }
 283
 284 int
 285 kern_clock_settime(clockid_t clock_id, struct timespec *ats)
 286 {
 287         struct thread *td = curthread;
 288         struct timeval atv;
 289         int error;
 290
 291         if ((error = priv_check(td, PRIV_CLOCK_SETTIME)) != 0)
 292                 return (error);
 293         if (clock_id != CLOCK_REALTIME)
 294                 return (EINVAL);
 295         if (ats->tv_sec < 0 || ats->tv_nsec < 0 || ats->tv_nsec >= 1000000000)
 296                 return (EINVAL);
 297
 298         lockmgr(&masterclock_lock, LK_EXCLUSIVE);
 299         TIMESPEC_TO_TIMEVAL(&atv, ats);
 300         error = settime(&atv);
 301         lockmgr(&masterclock_lock, LK_RELEASE);
 302
 303         return (error);
 304 }
 305
 306 /*
 307  * MPALMOSTSAFE
 308  */
 309 int
 310 sys_clock_settime(struct sysmsg *sysmsg, const struct clock_settime_args *uap)
 311 {
 312         struct timespec ats;
 313         int error;
 314
 315         if ((error = copyin(uap->tp, &ats, sizeof(ats))) != 0)
 316                 return (error);
 317
 318         error = kern_clock_settime(uap->clock_id, &ats);
 319
 320         return (error);
 321 }
 322
 323 /*
 324  * MPSAFE
 325  */
 326 int
 327 kern_clock_getres(clockid_t clock_id, struct timespec *ts)
 328 {
 329         ts->tv_sec = 0;
 330
 331         switch (clock_id) {
 332         case CLOCK_REALTIME:
 333         case CLOCK_REALTIME_FAST:
 334         case CLOCK_REALTIME_PRECISE:
 335         case CLOCK_MONOTONIC:
 336         case CLOCK_MONOTONIC_FAST:
 337         case CLOCK_MONOTONIC_PRECISE:
 338         case CLOCK_UPTIME:
 339         case CLOCK_UPTIME_FAST:
 340         case CLOCK_UPTIME_PRECISE:
 341                 /*
 342                  * Minimum reportable resolution is 1ns.  Rounding is
 343                  * otherwise unimportant.
 344                  */
 345                 ts->tv_nsec = 999999999 / sys_cputimer->freq + 1;
 346                 break;
 347         case CLOCK_VIRTUAL:
 348         case CLOCK_PROF:
 349                 /* Accurately round up here because we can do so cheaply. */
 350                 ts->tv_nsec = howmany(1000000000, hz);
 351                 break;
 352         case CLOCK_SECOND:
 353                 ts->tv_sec = 1;
 354                 ts->tv_nsec = 0;
 355                 break;
 356         case CLOCK_THREAD_CPUTIME_ID:
 357         case CLOCK_PROCESS_CPUTIME_ID:
 358                 ts->tv_nsec = 1000;
 359                 break;
 360         default:
 361                 if ((clock_id & CPUCLOCK_BIT) == CPUCLOCK_BIT) {
 362                         pid_t pid = CPUCLOCK2PID(clock_id);
 363                         if (pid < 2 || pid > PID_MAX)
 364                                 return (EINVAL);
 365                         ts->tv_nsec = 1000;
 366                 } else {
 367                         return (EINVAL);
 368                 }
 369         }
 370
 371         return (0);
 372 }
 373
 374 /*
 375  * MPSAFE
 376  */
 377 int
 378 sys_clock_getres(struct sysmsg *sysmsg, const struct clock_getres_args *uap)
 379 {
 380         int error;
 381         struct timespec ts;
 382
 383         error = kern_clock_getres(uap->clock_id, &ts);
 384         if (error == 0)
 385                 error = copyout(&ts, uap->tp, sizeof(ts));
 386
 387         return (error);
 388 }
 389
 390 static int
 391 kern_getcpuclockid(pid_t pid, lwpid_t lwp_id, clockid_t *clock_id)
 392 {
 393         struct proc *p;
 394         int error = 0;
 395
 396         if (pid == 0) {
 397                 p = curproc;
 398                 pid = p->p_pid;
 399                 PHOLD(p);
 400         } else {
 401                 p = pfind(pid);
 402                 if (p == NULL)
 403                         return (ESRCH);
 404         }
 405         /* lwp_id can be 0 when called by clock_getcpuclockid() */
 406         if (lwp_id < 0) {
 407                 error = EINVAL;
 408                 goto out;
 409         }
 410         lwkt_gettoken(&p->p_token);
 411         if (lwp_id > 0 &&
 412             lwp_rb_tree_RB_LOOKUP(&p->p_lwp_tree, lwp_id) == NULL) {
 413                 lwkt_reltoken(&p->p_token);
 414                 error = ESRCH;
 415                 goto out;
 416         }
 417         *clock_id = MAKE_CPUCLOCK(pid, lwp_id);
 418         lwkt_reltoken(&p->p_token);
 419 out:
 420         PRELE(p);
 421         return (error);
 422 }
 423
 424 int
 425 sys_getcpuclockid(struct sysmsg *sysmsg, const struct getcpuclockid_args *uap)
 426 {
 427         clockid_t clk_id;
 428         int error;
 429
 430         error = kern_getcpuclockid(uap->pid, uap->lwp_id, &clk_id);
 431         if (error == 0)
 432                 error = copyout(&clk_id, uap->clock_id, sizeof(clockid_t));
 433
 434         return (error);
 435 }
 436
 437 /*
 438  * clock_nanosleep1()
 439  *
 440  *      This is a general helper function for clock_nanosleep() and
 441  *      nanosleep() (aka sleep(), aka usleep()).
 442  *
 443  *      If there is less than one tick's worth of time left and
 444  *      we haven't done a yield, or the remaining microseconds is
 445  *      ridiculously low, do a yield.  This avoids having
 446  *      to deal with systimer overheads when the system is under
 447  *      heavy loads.  If we have done a yield already then use
  448  *      a systimer and an uninterruptible thread wait.
 449  *
 450  *      If there is more than a tick's worth of time left,
  451  *      calculate the baseline ticks and use an interruptible
  452  *      tsleep, then handle the fine-grained delay on the next
  453  *      loop.  This usually results in two sleeps occurring, a long one
 454  *      and a short one.
 455  *
 456  * MPSAFE
 457  */
 458 static void
 459 ns1_systimer(systimer_t info, int in_ipi __unused,
 460     struct intrframe *frame __unused)
 461 {
 462         lwkt_schedule(info->data);
 463 }
 464
 465 int
 466 clock_nanosleep1(clockid_t clock_id, int flags,
 467     struct timespec *rqt, struct timespec *rmt)
 468 {
 469         static int nanowait;
 470         struct timespec ts_cur, ts_tgt, ts_int;
 471         struct timeval tv;
 472         bool is_abs;
 473         int error, error2;
 474
 475         if ((flags & ~(TIMER_RELTIME | TIMER_ABSTIME)) != 0)
 476                 return (EINVAL);
 477         if (rqt->tv_sec < 0 || rqt->tv_nsec < 0 || rqt->tv_nsec >= 1000000000)
 478                 return (EINVAL);
 479         if (rqt->tv_sec == 0 && rqt->tv_nsec == 0)
 480                 return (0);
 481
 482         switch (clock_id) {
 483         case CLOCK_REALTIME:
 484         case CLOCK_REALTIME_FAST:
 485         case CLOCK_REALTIME_PRECISE:
 486         case CLOCK_SECOND:
 487         case CLOCK_MONOTONIC:
 488         case CLOCK_MONOTONIC_FAST:
 489         case CLOCK_MONOTONIC_PRECISE:
 490         case CLOCK_UPTIME:
 491         case CLOCK_UPTIME_FAST:
 492         case CLOCK_UPTIME_PRECISE:
 493                 is_abs = (flags & TIMER_ABSTIME) != 0;
 494                 break;
 495         case CLOCK_VIRTUAL:
 496         case CLOCK_PROF:
 497         case CLOCK_PROCESS_CPUTIME_ID:
 498                 return (ENOTSUP);
 499         case CLOCK_THREAD_CPUTIME_ID:
 500         default:
 501                 return (EINVAL);
 502         }
 503
 504         error = kern_clock_gettime(clock_id, &ts_cur);
 505         if (error)
 506                 return (error);
 507
 508         if (is_abs) {
 509                 if (timespeccmp(&ts_cur, rqt, >=))
 510                         return (0);
 511
 512                 ts_tgt = *rqt; /* target timestamp */
 513                 timespecsub(&ts_tgt, &ts_cur, &ts_int); /* sleep interval */
 514         } else {
 515                 ts_int = *rqt; /* sleep interval */
 516                 timespecadd(&ts_cur, &ts_int, &ts_tgt); /* target timestamp */
 517         }
 518
 519         for (;;) {
 520                 int ticks;
 521                 struct systimer info;
 522                 thread_t td;
 523
 524                 timespecsub(&ts_tgt, &ts_cur, &ts_int);
 525                 TIMESPEC_TO_TIMEVAL(&tv, &ts_int);
 526                 ticks = tv.tv_usec / ustick; /* approximate */
 527
 528                 if (tv.tv_sec == 0 && ticks == 0) {
 529                         td = curthread;
 530                         if (tv.tv_usec > 0 && tv.tv_usec < nanosleep_min_us)
 531                                 tv.tv_usec = nanosleep_min_us;
 532                         if (tv.tv_usec < nanosleep_hard_us) {
 533                                 lwkt_user_yield();
 534                                 cpu_pause();
 535                         } else {
 536                                 crit_enter_quick(td);
 537                                 systimer_init_oneshot(&info, ns1_systimer,
 538                                                 td, tv.tv_usec);
 539                                 lwkt_deschedule_self(td);
 540                                 crit_exit_quick(td);
 541                                 lwkt_switch();
 542                                 systimer_del(&info); /* make sure it's gone */
 543                         }
 544                         error = iscaught(td->td_lwp);
 545                 } else if (tv.tv_sec == 0) {
 546                         error = tsleep(&nanowait, PCATCH, "nanslp", ticks);
 547                 } else {
 548                         ticks = tvtohz_low(&tv); /* also handles overflow */
 549                         error = tsleep(&nanowait, PCATCH, "nanslp", ticks);
 550                 }
 551
 552                 error2 = kern_clock_gettime(clock_id, &ts_cur);
 553                 if (error2)
 554                         return (error2);
 555
 556                 if (error && error != EWOULDBLOCK) {
 557                         if (error == ERESTART)
 558                                 error = EINTR;
 559                         if (rmt != NULL && !is_abs) {
 560                                 timespecsub(&ts_tgt, &ts_cur, &ts_int);
 561                                 if (ts_int.tv_sec < 0)
 562                                         timespecclear(&ts_int);
 563                                 *rmt = ts_int;
 564                         }
 565                         return (error);
 566                 }
 567                 if (timespeccmp(&ts_cur, &ts_tgt, >=))
 568                         return (0);
 569         }
 570 }
 571
 572 int
 573 nanosleep1(struct timespec *rqt, struct timespec *rmt)
 574 {
 575         return clock_nanosleep1(CLOCK_REALTIME, TIMER_RELTIME, rqt, rmt);
 576 }
 577
 578 /*
 579  * MPSAFE
 580  */
 581 int
 582 sys_clock_nanosleep(struct sysmsg *sysmsg,
 583     const struct clock_nanosleep_args *uap)
 584 {
 585         int error;
 586         bool is_abs;
 587         struct timespec rqt;
 588         struct timespec rmt;
 589
 590         is_abs = (uap->flags & TIMER_ABSTIME) != 0;
 591
 592         error = copyin(uap->rqtp, &rqt, sizeof(rqt));
 593         if (error) {
 594                 sysmsg->sysmsg_result = error;
 595                 return (0);
 596         }
 597
 598         bzero(&rmt, sizeof(rmt));
 599         error = clock_nanosleep1(uap->clock_id, uap->flags, &rqt, &rmt);
 600
 601         /*
 602          * copyout the residual if nanosleep was interrupted.
 603          */
 604         if (error == EINTR && uap->rmtp != NULL && !is_abs) {
 605                 int error2;
 606
 607                 error2 = copyout(&rmt, uap->rmtp, sizeof(rmt));
 608                 if (error2)
 609                         error = error2;
 610         }
 611
 612         sysmsg->sysmsg_result = error;
 613         return (0);
 614 }
 615
 616 /*
 617  * MPSAFE
 618  */
 619 int
 620 sys_nanosleep(struct sysmsg *sysmsg, const struct nanosleep_args *uap)
 621 {
 622         int error;
 623         struct timespec rqt;
 624         struct timespec rmt;
 625
 626         error = copyin(uap->rqtp, &rqt, sizeof(rqt));
 627         if (error)
 628                 return (error);
 629
 630         bzero(&rmt, sizeof(rmt));
 631         error = nanosleep1(&rqt, &rmt);
 632
 633         /*
 634          * copyout the residual if nanosleep was interrupted.
 635          */
 636         if (error == EINTR && uap->rmtp != NULL) {
 637                 int error2;
 638
 639                 error2 = copyout(&rmt, uap->rmtp, sizeof(rmt));
 640                 if (error2)
 641                         error = error2;
 642         }
 643         return (error);
 644 }
 645
 646 /*
 647  * The gettimeofday() system call is supposed to return a fine-grained
 648  * realtime stamp.  However, acquiring a fine-grained stamp can create a
 649  * bottleneck when multiple cpu cores are trying to accessing e.g. the
 650  * HPET hardware timer all at the same time, so we have a sysctl that
 651  * allows its behavior to be changed to a more coarse-grained timestamp
 652  * which does not have to access a hardware timer.
 653  */
 654 int
 655 sys_gettimeofday(struct sysmsg *sysmsg, const struct gettimeofday_args *uap)
 656 {
 657         struct timeval atv;
 658         int error = 0;
 659
 660         if (uap->tp) {
 661                 if (gettimeofday_quick)
 662                         getmicrotime(&atv);
 663                 else
 664                         microtime(&atv);
 665                 if ((error = copyout((caddr_t)&atv, (caddr_t)uap->tp,
 666                     sizeof (atv))))
 667                         return (error);
 668         }
 669         if (uap->tzp)
 670                 error = copyout((caddr_t)&tz, (caddr_t)uap->tzp,
 671                     sizeof (tz));
 672         return (error);
 673 }
 674
 675 /*
 676  * MPALMOSTSAFE
 677  */
 678 int
 679 sys_settimeofday(struct sysmsg *sysmsg, const struct settimeofday_args *uap)
 680 {
 681         struct thread *td = curthread;
 682         struct timeval atv;
 683         struct timezone atz;
 684         int error;
 685
 686         if ((error = priv_check(td, PRIV_SETTIMEOFDAY)))
 687                 return (error);
 688         /*
 689          * Verify all parameters before changing time.
 690          *
 691          * XXX: We do not allow the time to be set to 0.0, which also by
 692          *      happy coincidence works around a pkgsrc bulk build bug.
 693          */
 694         if (uap->tv) {
 695                 if ((error = copyin((caddr_t)uap->tv, (caddr_t)&atv,
 696                     sizeof(atv))))
 697                         return (error);
 698                 if (atv.tv_usec < 0 || atv.tv_usec >= 1000000)
 699                         return (EINVAL);
 700                 if (atv.tv_sec == 0 && atv.tv_usec == 0)
 701                         return (EINVAL);
 702         }
 703         if (uap->tzp &&
 704             (error = copyin((caddr_t)uap->tzp, (caddr_t)&atz, sizeof(atz))))
 705                 return (error);
 706
 707         lockmgr(&masterclock_lock, LK_EXCLUSIVE);
 708         if (uap->tv && (error = settime(&atv))) {
 709                 lockmgr(&masterclock_lock, LK_RELEASE);
 710                 return (error);
 711         }
 712         lockmgr(&masterclock_lock, LK_RELEASE);
 713
 714         if (uap->tzp)
 715                 tz = atz;
 716         return (0);
 717 }
 718
 719 /*
 720  * WARNING! Run with ntp_spin held
 721  */
 722 static void
 723 kern_adjtime_common(void)
 724 {
 725         if ((ntp_delta >= 0 && ntp_delta < ntp_default_tick_delta) ||
 726             (ntp_delta < 0 && ntp_delta > -ntp_default_tick_delta))
 727                 ntp_tick_delta = ntp_delta;
 728         else if (ntp_delta > ntp_big_delta)
 729                 ntp_tick_delta = 10 * ntp_default_tick_delta;
 730         else if (ntp_delta < -ntp_big_delta)
 731                 ntp_tick_delta = -10 * ntp_default_tick_delta;
 732         else if (ntp_delta > 0)
 733                 ntp_tick_delta = ntp_default_tick_delta;
 734         else
 735                 ntp_tick_delta = -ntp_default_tick_delta;
 736 }
 737
 738 void
 739 kern_adjtime(int64_t delta, int64_t *odelta)
 740 {
 741         spin_lock(&ntp_spin);
 742         *odelta = ntp_delta;
 743         ntp_delta = delta;
 744         kern_adjtime_common();
 745         spin_unlock(&ntp_spin);
 746 }
 747
 748 static void
 749 kern_get_ntp_delta(int64_t *delta)
 750 {
 751         *delta = ntp_delta;
 752 }
 753
 754 void
 755 kern_reladjtime(int64_t delta)
 756 {
 757         spin_lock(&ntp_spin);
 758         ntp_delta += delta;
 759         kern_adjtime_common();
 760         spin_unlock(&ntp_spin);
 761 }
 762
 763 static void
 764 kern_adjfreq(int64_t rate)
 765 {
 766         spin_lock(&ntp_spin);
 767         ntp_tick_permanent = rate;
 768         spin_unlock(&ntp_spin);
 769 }
 770
 771 /*
 772  * MPALMOSTSAFE
 773  */
 774 int
 775 sys_adjtime(struct sysmsg *sysmsg, const struct adjtime_args *uap)
 776 {
 777         struct thread *td = curthread;
 778         struct timeval atv;
 779         int64_t ndelta, odelta;
 780         int error;
 781
 782         if ((error = priv_check(td, PRIV_ADJTIME)))
 783                 return (error);
 784         error = copyin(uap->delta, &atv, sizeof(struct timeval));
 785         if (error)
 786                 return (error);
 787
 788         /*
 789          * Compute the total correction and the rate at which to apply it.
 790          * Round the adjustment down to a whole multiple of the per-tick
 791          * delta, so that after some number of incremental changes in
 792          * hardclock(), tickdelta will become zero, lest the correction
 793          * overshoot and start taking us away from the desired final time.
 794          */
 795         ndelta = (int64_t)atv.tv_sec * 1000000000 + atv.tv_usec * 1000;
 796         kern_adjtime(ndelta, &odelta);
 797
 798         if (uap->olddelta) {
 799                 atv.tv_sec = odelta / 1000000000;
 800                 atv.tv_usec = odelta % 1000000000 / 1000;
 801                 copyout(&atv, uap->olddelta, sizeof(struct timeval));
 802         }
 803         return (0);
 804 }
 805
 806 static int
 807 sysctl_adjtime(SYSCTL_HANDLER_ARGS)
 808 {
 809         int64_t delta;
 810         int error;
 811
 812         if (req->newptr != NULL) {
 813                 if (priv_check(curthread, PRIV_ROOT))
 814                         return (EPERM);
 815                 error = SYSCTL_IN(req, &delta, sizeof(delta));
 816                 if (error)
 817                         return (error);
 818                 kern_reladjtime(delta);
 819         }
 820
 821         if (req->oldptr)
 822                 kern_get_ntp_delta(&delta);
 823         error = SYSCTL_OUT(req, &delta, sizeof(delta));
 824         return (error);
 825 }
 826
 827 /*
 828  * delta is in nanoseconds.
 829  */
 830 static int
 831 sysctl_delta(SYSCTL_HANDLER_ARGS)
 832 {
 833         int64_t delta, old_delta;
 834         int error;
 835
 836         if (req->newptr != NULL) {
 837                 if (priv_check(curthread, PRIV_ROOT))
 838                         return (EPERM);
 839                 error = SYSCTL_IN(req, &delta, sizeof(delta));
 840                 if (error)
 841                         return (error);
 842                 kern_adjtime(delta, &old_delta);
 843         }
 844
 845         if (req->oldptr != NULL)
 846                 kern_get_ntp_delta(&old_delta);
 847         error = SYSCTL_OUT(req, &old_delta, sizeof(old_delta));
 848         return (error);
 849 }
 850
 851 /*
 852  * frequency is in nanoseconds per second shifted left 32.
 853  * kern_adjfreq() needs it in nanoseconds per tick shifted left 32.
 854  */
 855 static int
 856 sysctl_adjfreq(SYSCTL_HANDLER_ARGS)
 857 {
 858         int64_t freqdelta;
 859         int error;
 860
 861         if (req->newptr != NULL) {
 862                 if (priv_check(curthread, PRIV_ROOT))
 863                         return (EPERM);
 864                 error = SYSCTL_IN(req, &freqdelta, sizeof(freqdelta));
 865                 if (error)
 866                         return (error);
 867
 868                 freqdelta /= hz;
 869                 kern_adjfreq(freqdelta);
 870         }
 871
 872         if (req->oldptr != NULL)
 873                 freqdelta = ntp_tick_permanent * hz;
 874         error = SYSCTL_OUT(req, &freqdelta, sizeof(freqdelta));
 875         if (error)
 876                 return (error);
 877
 878         return (0);
 879 }
 880
 881 SYSCTL_NODE(_kern, OID_AUTO, ntp, CTLFLAG_RW, 0, "NTP related controls");
 882 SYSCTL_PROC(_kern_ntp, OID_AUTO, permanent,
 883     CTLTYPE_QUAD|CTLFLAG_RW, 0, 0,
 884     sysctl_adjfreq, "Q", "permanent correction per second");
 885 SYSCTL_PROC(_kern_ntp, OID_AUTO, delta,
 886     CTLTYPE_QUAD|CTLFLAG_RW, 0, 0,
 887     sysctl_delta, "Q", "one-time delta");
 888 SYSCTL_OPAQUE(_kern_ntp, OID_AUTO, big_delta, CTLFLAG_RD,
 889     &ntp_big_delta, sizeof(ntp_big_delta), "Q",
 890     "threshold for fast adjustment");
 891 SYSCTL_OPAQUE(_kern_ntp, OID_AUTO, tick_delta, CTLFLAG_RD,
 892     &ntp_tick_delta, sizeof(ntp_tick_delta), "LU",
 893     "per-tick adjustment");
 894 SYSCTL_OPAQUE(_kern_ntp, OID_AUTO, default_tick_delta, CTLFLAG_RD,
 895     &ntp_default_tick_delta, sizeof(ntp_default_tick_delta), "LU",
 896     "default per-tick adjustment");
 897 SYSCTL_OPAQUE(_kern_ntp, OID_AUTO, next_leap_second, CTLFLAG_RW,
 898     &ntp_leap_second, sizeof(ntp_leap_second), "LU",
 899     "next leap second");
 900 SYSCTL_INT(_kern_ntp, OID_AUTO, insert_leap_second, CTLFLAG_RW,
 901     &ntp_leap_insert, 0, "insert or remove leap second");
 902 SYSCTL_PROC(_kern_ntp, OID_AUTO, adjust,
 903     CTLTYPE_QUAD|CTLFLAG_RW, 0, 0,
 904     sysctl_adjtime, "Q", "relative adjust for delta");
 905
 906 /*
 907  * Get value of an interval timer.  The process virtual and
 908  * profiling virtual time timers are kept in the p_stats area, since
 909  * they can be swapped out.  These are kept internally in the
 910  * way they are specified externally: in time until they expire.
 911  *
 912  * The real time interval timer is kept in the process table slot
 913  * for the process, and its value (it_value) is kept as an
 914  * absolute time rather than as a delta, so that it is easy to keep
 915  * periodic real-time signals from drifting.
 916  *
 917  * Virtual time timers are processed in the hardclock() routine of
 918  * kern_clock.c.  The real time timer is processed by a timeout
 919  * routine, called from the softclock() routine.  Since a callout
 920  * may be delayed in real time due to interrupt processing in the system,
 921  * it is possible for the real time timeout routine (realitexpire, given below),
 922  * to be delayed in real time past when it is supposed to occur.  It
 923  * does not suffice, therefore, to reload the real timer .it_value from the
 924  * real time timers .it_interval.  Rather, we compute the next time in
 925  * absolute time the timer should go off.
 926  *
 927  * MPALMOSTSAFE
 928  */
 929 int
 930 sys_getitimer(struct sysmsg *sysmsg, const struct getitimer_args *uap)
 931 {
 932         struct proc *p = curproc;
 933         struct timeval ctv;
 934         struct itimerval aitv;
 935
 936         if (uap->which > ITIMER_PROF)
 937                 return (EINVAL);
 938         lwkt_gettoken(&p->p_token);
 939         if (uap->which == ITIMER_REAL) {
 940                 /*
 941                  * Convert from absolute to relative time in .it_value
 942                  * part of real time timer.  If time for real time timer
 943                  * has passed return 0, else return difference between
 944                  * current time and time for the timer to go off.
 945                  */
 946                 aitv = p->p_realtimer;
 947                 if (timevalisset(&aitv.it_value)) {
 948                         getmicrouptime(&ctv);
 949                         if (timevalcmp(&aitv.it_value, &ctv, <))
 950                                 timevalclear(&aitv.it_value);
 951                         else
 952                                 timevalsub(&aitv.it_value, &ctv);
 953                 }
 954         } else {
 955                 aitv = p->p_timer[uap->which];
 956         }
 957         lwkt_reltoken(&p->p_token);
 958         return (copyout(&aitv, uap->itv, sizeof (struct itimerval)));
 959 }
 960
 961 /*
 962  * MPALMOSTSAFE
 963  */
 964 int
 965 sys_setitimer(struct sysmsg *sysmsg, const struct setitimer_args *uap)
 966 {
 967         struct itimerval aitv;
 968         struct timeval ctv;
 969         struct itimerval *itvp;
 970         struct proc *p = curproc;
 971         struct getitimer_args gitargs;
 972         int error;
 973
 974         if (uap->which > ITIMER_PROF)
 975                 return (EINVAL);
 976         itvp = uap->itv;
 977         if (itvp && (error = copyin((caddr_t)itvp, (caddr_t)&aitv,
 978             sizeof(struct itimerval))))
 979                 return (error);
 980
 981         if (uap->oitv) {
 982                 gitargs.which = uap->which;
 983                 gitargs.itv = uap->oitv;
 984                 error = sys_getitimer(sysmsg, &gitargs);
 985                 if (error)
 986                         return error;
 987         }
 988         if (itvp == NULL)
 989                 return (0);
 990         if (itimerfix(&aitv.it_value))
 991                 return (EINVAL);
 992         if (!timevalisset(&aitv.it_value))
 993                 timevalclear(&aitv.it_interval);
 994         else if (itimerfix(&aitv.it_interval))
 995                 return (EINVAL);
 996         lwkt_gettoken(&p->p_token);
 997         if (uap->which == ITIMER_REAL) {
 998                 if (timevalisset(&p->p_realtimer.it_value))
 999                         callout_cancel(&p->p_ithandle);
1000                 if (timevalisset(&aitv.it_value))
1001                         callout_reset(&p->p_ithandle,
1002                             tvtohz_high(&aitv.it_value), realitexpire, p);
1003                 getmicrouptime(&ctv);
1004                 timevaladd(&aitv.it_value, &ctv);
1005                 p->p_realtimer = aitv;
1006         } else {
1007                 p->p_timer[uap->which] = aitv;
1008                 switch(uap->which) {
1009                 case ITIMER_VIRTUAL:
1010                         p->p_flags &= ~P_SIGVTALRM;
1011                         break;
1012                 case ITIMER_PROF:
1013                         p->p_flags &= ~P_SIGPROF;
1014                         break;
1015                 }
1016         }
1017         lwkt_reltoken(&p->p_token);
1018         return (0);
1019 }
1020
1021 /*
1022  * Real interval timer expired:
1023  * send process whose timer expired an alarm signal.
1024  * If time is not set up to reload, then just return.
1025  * Else compute next time timer should go off which is > current time.
1026  * This is where delay in processing this timeout causes multiple
1027  * SIGALRM calls to be compressed into one.
1028  * tvtohz_high() always adds 1 to allow for the time until the next clock
1029  * interrupt being strictly less than 1 clock tick, but we don't want
1030  * that here since we want to appear to be in sync with the clock
1031  * interrupt even when we're delayed.
1032  */
1033 static
1034 void
1035 realitexpire(void *arg)
1036 {
1037         struct proc *p;
1038         struct timeval ctv, ntv;
1039
1040         p = (struct proc *)arg;
1041         PHOLD(p);
1042         lwkt_gettoken(&p->p_token);
1043         ksignal(p, SIGALRM);
1044         if (!timevalisset(&p->p_realtimer.it_interval)) {
1045                 timevalclear(&p->p_realtimer.it_value);
1046                 goto done;
1047         }
1048         for (;;) {
1049                 timevaladd(&p->p_realtimer.it_value,
1050                            &p->p_realtimer.it_interval);
1051                 getmicrouptime(&ctv);
1052                 if (timevalcmp(&p->p_realtimer.it_value, &ctv, >)) {
1053                         ntv = p->p_realtimer.it_value;
1054                         timevalsub(&ntv, &ctv);
1055                         callout_reset(&p->p_ithandle, tvtohz_low(&ntv),
1056                                       realitexpire, p);
1057                         goto done;
1058                 }
1059         }
1060 done:
1061         lwkt_reltoken(&p->p_token);
1062         PRELE(p);
1063 }
1064
1065 /*
1066  * Used to validate itimer timeouts and utimes*() timespecs.
1067  */
1068 int
1069 itimerfix(struct timeval *tv)
1070 {
1071         if (tv->tv_sec < 0 || tv->tv_usec < 0 || tv->tv_usec >= 1000000)
1072                 return (EINVAL);
1073         if (tv->tv_sec == 0 && tv->tv_usec != 0 && tv->tv_usec < ustick)
1074                 tv->tv_usec = ustick;
1075         return (0);
1076 }
1077
1078 /*
1079  * Used to validate timeouts and utimes*() timespecs.
1080  */
1081 int
1082 itimespecfix(struct timespec *ts)
1083 {
1084         if (ts->tv_sec < 0 || ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000ULL)
1085                 return (EINVAL);
1086         if (ts->tv_sec == 0 && ts->tv_nsec != 0 && ts->tv_nsec < nstick)
1087                 ts->tv_nsec = nstick;
1088         return (0);
1089 }
1090
/*
 * Decrement an interval timer by usec microseconds, where usec must be
 * less than a second (i.e. < 1000000).
 *
 * Returns 1 while the timer still has time remaining.  Returns 0 once
 * it has expired; in that case the timer is reloaded from it_interval
 * if one is set, and the amount by which the decrement overshot the old
 * value is subtracted from the reloaded value so that the timer does
 * not drift.  Without an interval the value is left at zero.
 *
 * This routine assumes that it is called in a context where the timers
 * on which it is operating cannot change in value.
 */
int
itimerdecr(struct itimerval *itp, int usec)
{
        struct timeval *val = &itp->it_value;
        int overshoot;

        if (val->tv_usec < usec && val->tv_sec == 0) {
                /* expired, and already in next interval */
                overshoot = usec - val->tv_usec;
        } else {
                if (val->tv_usec < usec) {
                        /* borrow one second's worth of microseconds */
                        val->tv_usec += 1000000;
                        val->tv_sec--;
                }
                val->tv_usec -= usec;
                if (timevalisset(val))
                        return (1);
                /* expired, exactly at end of interval */
                overshoot = 0;
        }

        if (!timevalisset(&itp->it_interval)) {
                val->tv_usec = 0;               /* sec is already 0 */
                return (0);
        }

        /* Reload, charging the overshoot against the new period. */
        *val = itp->it_interval;
        val->tv_usec -= overshoot;
        if (val->tv_usec < 0) {
                val->tv_usec += 1000000;
                val->tv_sec--;
        }
        return (0);
}
1131
/*
 * Add and subtract routines for timevals.
 *
 * timevaladd() computes t1 += t2 field by field and then lets
 * timevalfix() carry tv_usec back into range.
 *
 * N.B.: the subtract routine doesn't deal with results which are before
 * the beginning, it just gets very confused in this case.
 * Caveat emptor.
 */
void
timevaladd(struct timeval *t1, const struct timeval *t2)
{
        t1->tv_usec = t1->tv_usec + t2->tv_usec;
        t1->tv_sec = t1->tv_sec + t2->tv_sec;
        timevalfix(t1);
}
1147
/*
 * Compute t1 -= t2 field by field, then let timevalfix() borrow or
 * carry so that tv_usec is back in range.
 */
void
timevalsub(struct timeval *t1, const struct timeval *t2)
{
        t1->tv_usec = t1->tv_usec - t2->tv_usec;
        t1->tv_sec = t1->tv_sec - t2->tv_sec;
        timevalfix(t1);
}
1156
/*
 * Normalize a timeval whose tv_usec is at most one second out of range
 * in either direction, moving the excess into tv_sec.
 */
static void
timevalfix(struct timeval *t1)
{
        if (t1->tv_usec < 0) {
                /* borrow a second */
                t1->tv_usec += 1000000;
                t1->tv_sec--;
        } else if (t1->tv_usec >= 1000000) {
                /* carry a second */
                t1->tv_usec -= 1000000;
                t1->tv_sec++;
        }
}
1170
1171 /*
1172  * ratecheck(): simple time-based rate-limit checking.
1173  */
1174 int
1175 ratecheck(struct timeval *lasttime, const struct timeval *mininterval)
1176 {
1177         struct timeval tv, delta;
1178         int rv = 0;
1179
1180         getmicrouptime(&tv);            /* NB: 10ms precision */
1181         delta = tv;
1182         timevalsub(&delta, lasttime);
1183
1184         /*
1185          * check for 0,0 is so that the message will be seen at least once,
1186          * even if interval is huge.
1187          */
1188         if (timevalcmp(&delta, mininterval, >=) ||
1189             (lasttime->tv_sec == 0 && lasttime->tv_usec == 0)) {
1190                 *lasttime = tv;
1191                 rv = 1;
1192         }
1193
1194         return (rv);
1195 }
1196
1197 /*
1198  * ppsratecheck(): packets (or events) per second limitation.
1199  *
1200  * Return 0 if the limit is to be enforced (e.g. the caller
1201  * should drop a packet because of the rate limitation).
1202  *
1203  * maxpps of 0 always causes zero to be returned.  maxpps of -1
1204  * always causes 1 to be returned; this effectively defeats rate
1205  * limiting.
1206  *
1207  * Note that we maintain the struct timeval for compatibility
1208  * with other bsd systems.  We reuse the storage and just monitor
1209  * clock ticks for minimal overhead.
1210  */
1211 int
1212 ppsratecheck(struct timeval *lasttime, int *curpps, int maxpps)
1213 {
1214         int now;
1215
1216         /*
1217          * Reset the last time and counter if this is the first call
1218          * or more than a second has passed since the last update of
1219          * lasttime.
1220          */
1221         now = ticks;
1222         if (lasttime->tv_sec == 0 || (u_int)(now - lasttime->tv_sec) >= hz) {
1223                 lasttime->tv_sec = now;
1224                 *curpps = 1;
1225                 return (maxpps != 0);
1226         } else {
1227                 (*curpps)++;            /* NB: ignore potential overflow */
1228                 return (maxpps < 0 || *curpps < maxpps);
1229         }
1230 }
1231
1232 static int
1233 sysctl_gettimeofday_quick(SYSCTL_HANDLER_ARGS)
1234 {
1235         int error;
1236         int gtod;
1237
1238         gtod = gettimeofday_quick;
1239         error = sysctl_handle_int(oidp, &gtod, 0, req);
1240         if (error || req->newptr == NULL)
1241                 return error;
1242         gettimeofday_quick = gtod;
1243         if (kpmap)
1244                 kpmap->fast_gtod = gtod;
1245         return 0;
1246 }