sys/kern/kern_time.c

   1 /*
   2  * Copyright (c) 1982, 1986, 1989, 1993
   3  *      The Regents of the University of California.  All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  * 3. Neither the name of the University nor the names of its contributors
  14  *    may be used to endorse or promote products derived from this software
  15  *    without specific prior written permission.
  16  *
  17  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  27  * SUCH DAMAGE.
  28  *
  29  *      @(#)kern_time.c 8.1 (Berkeley) 6/10/93
  30  * $FreeBSD: src/sys/kern/kern_time.c,v 1.68.2.1 2002/10/01 08:00:41 bde Exp $
  31  */
  32
  33 #include <sys/param.h>
  34 #include <sys/systm.h>
  35 #include <sys/buf.h>
  36 #include <sys/sysproto.h>
  37 #include <sys/resourcevar.h>
  38 #include <sys/signalvar.h>
  39 #include <sys/kernel.h>
  40 #include <sys/sysent.h>
  41 #include <sys/sysunion.h>
  42 #include <sys/proc.h>
  43 #include <sys/priv.h>
  44 #include <sys/time.h>
  45 #include <sys/vnode.h>
  46 #include <sys/sysctl.h>
  47 #include <sys/kern_syscall.h>
  48 #include <vm/vm.h>
  49 #include <vm/vm_extern.h>
  50
  51 #include <sys/msgport2.h>
  52 #include <sys/spinlock2.h>
  53 #include <sys/thread2.h>
  54
  55 extern struct spinlock ntp_spin;
  56
  57 #define CPUCLOCK_BIT                    0x80000000
  58 #define CPUCLOCK_ID_MASK                ~CPUCLOCK_BIT
  59 #define CPUCLOCK2LWPID(clock_id)        (((clockid_t)(clock_id) >> 32) & CPUCLOCK_ID_MASK)
  60 #define CPUCLOCK2PID(clock_id)          ((clock_id) & CPUCLOCK_ID_MASK)
  61 #define MAKE_CPUCLOCK(pid, lwp_id)      ((clockid_t)(lwp_id) << 32 | (pid) | CPUCLOCK_BIT)
  62
  63 struct timezone tz;
  64
  65 /*
  66  * Time of day and interval timer support.
  67  *
  68  * These routines provide the kernel entry points to get and set
  69  * the time-of-day and per-process interval timers.  Subroutines
  70  * here provide support for adding and subtracting timeval structures
  71  * and decrementing interval timers, optionally reloading the interval
  72  * timers when they expire.
  73  */
  74
  75 static int      settime(struct timeval *);
  76 static void     timevalfix(struct timeval *);
  77 static void     realitexpire(void *arg);
  78
  79 /*
  80  * Nanosleep tries very hard to sleep for a precisely requested time
  81  * interval, down to 1uS.  The administrator can impose a minimum delay
  82  * and a delay below which we hard-loop instead of initiate a timer
  83  * interrupt and sleep.
  84  *
  85  * For machines under high loads it might be beneficial to increase min_us
  86  * to e.g. 1000uS (1ms) so spining processes sleep meaningfully.
  87  */
  88 static int     nanosleep_min_us = 10;
  89 static int     nanosleep_hard_us = 100;
  90 static int     gettimeofday_quick = 0;
  91 SYSCTL_INT(_kern, OID_AUTO, nanosleep_min_us, CTLFLAG_RW,
  92            &nanosleep_min_us, 0, "");
  93 SYSCTL_INT(_kern, OID_AUTO, nanosleep_hard_us, CTLFLAG_RW,
  94            &nanosleep_hard_us, 0, "");
  95 SYSCTL_INT(_kern, OID_AUTO, gettimeofday_quick, CTLFLAG_RW,
  96            &gettimeofday_quick, 0, "");
  97
  98 static struct lock masterclock_lock = LOCK_INITIALIZER("mstrclk", 0, 0);
  99
 100 static int
 101 settime(struct timeval *tv)
 102 {
 103         struct timeval delta, tv1, tv2;
 104         static struct timeval maxtime, laststep;
 105         struct timespec ts;
 106         int origcpu;
 107
 108         if ((origcpu = mycpu->gd_cpuid) != 0)
 109                 lwkt_setcpu_self(globaldata_find(0));
 110
 111         crit_enter();
 112         microtime(&tv1);
 113         delta = *tv;
 114         timevalsub(&delta, &tv1);
 115
 116         /*
 117          * If the system is secure, we do not allow the time to be
 118          * set to a value earlier than 1 second less than the highest
 119          * time we have yet seen. The worst a miscreant can do in
 120          * this circumstance is "freeze" time. He couldn't go
 121          * back to the past.
 122          *
 123          * We similarly do not allow the clock to be stepped more
 124          * than one second, nor more than once per second. This allows
 125          * a miscreant to make the clock march double-time, but no worse.
 126          */
 127         if (securelevel > 1) {
 128                 if (delta.tv_sec < 0 || delta.tv_usec < 0) {
 129                         /*
 130                          * Update maxtime to latest time we've seen.
 131                          */
 132                         if (tv1.tv_sec > maxtime.tv_sec)
 133                                 maxtime = tv1;
 134                         tv2 = *tv;
 135                         timevalsub(&tv2, &maxtime);
 136                         if (tv2.tv_sec < -1) {
 137                                 tv->tv_sec = maxtime.tv_sec - 1;
 138                                 kprintf("Time adjustment clamped to -1 second\n");
 139                         }
 140                 } else {
 141                         if (tv1.tv_sec == laststep.tv_sec) {
 142                                 crit_exit();
 143                                 return (EPERM);
 144                         }
 145                         if (delta.tv_sec > 1) {
 146                                 tv->tv_sec = tv1.tv_sec + 1;
 147                                 kprintf("Time adjustment clamped to +1 second\n");
 148                         }
 149                         laststep = *tv;
 150                 }
 151         }
 152
 153         ts.tv_sec = tv->tv_sec;
 154         ts.tv_nsec = tv->tv_usec * 1000;
 155         set_timeofday(&ts);
 156         crit_exit();
 157
 158         if (origcpu != 0)
 159                 lwkt_setcpu_self(globaldata_find(origcpu));
 160
 161         resettodr();
 162         return (0);
 163 }
 164
 165 static void
 166 get_process_cputime(struct proc *p, struct timespec *ats)
 167 {
 168         struct rusage ru;
 169
 170         lwkt_gettoken(&p->p_token);
 171         calcru_proc(p, &ru);
 172         lwkt_reltoken(&p->p_token);
 173         timevaladd(&ru.ru_utime, &ru.ru_stime);
 174         TIMEVAL_TO_TIMESPEC(&ru.ru_utime, ats);
 175 }
 176
 177 static void
 178 get_process_usertime(struct proc *p, struct timespec *ats)
 179 {
 180         struct rusage ru;
 181
 182         lwkt_gettoken(&p->p_token);
 183         calcru_proc(p, &ru);
 184         lwkt_reltoken(&p->p_token);
 185         TIMEVAL_TO_TIMESPEC(&ru.ru_utime, ats);
 186 }
 187
 188 static void
 189 get_thread_cputime(struct thread *td, struct timespec *ats)
 190 {
 191         struct timeval sys, user;
 192
 193         calcru(td->td_lwp, &user, &sys);
 194         timevaladd(&user, &sys);
 195         TIMEVAL_TO_TIMESPEC(&user, ats);
 196 }
 197
 198 /*
 199  * MPSAFE
 200  */
 201 int
 202 kern_clock_gettime(clockid_t clock_id, struct timespec *ats)
 203 {
 204         struct proc *p;
 205         struct lwp *lp;
 206         lwpid_t lwp_id;
 207
 208         p = curproc;
 209         switch(clock_id) {
 210         case CLOCK_REALTIME:
 211         case CLOCK_REALTIME_PRECISE:
 212                 nanotime(ats);
 213                 break;
 214         case CLOCK_REALTIME_FAST:
 215                 getnanotime(ats);
 216                 break;
 217         case CLOCK_MONOTONIC:
 218         case CLOCK_MONOTONIC_PRECISE:
 219         case CLOCK_UPTIME:
 220         case CLOCK_UPTIME_PRECISE:
 221                 nanouptime(ats);
 222                 break;
 223         case CLOCK_MONOTONIC_FAST:
 224         case CLOCK_UPTIME_FAST:
 225                 getnanouptime(ats);
 226                 break;
 227         case CLOCK_VIRTUAL:
 228                 get_process_usertime(p, ats);
 229                 break;
 230         case CLOCK_PROF:
 231         case CLOCK_PROCESS_CPUTIME_ID:
 232                 get_process_cputime(p, ats);
 233                 break;
 234         case CLOCK_SECOND:
 235                 ats->tv_sec = time_second;
 236                 ats->tv_nsec = 0;
 237                 break;
 238         case CLOCK_THREAD_CPUTIME_ID:
 239                 get_thread_cputime(curthread, ats);
 240                 break;
 241         default:
 242                 if ((clock_id & CPUCLOCK_BIT) == 0)
 243                         return (EINVAL);
 244                 if ((p = pfind(CPUCLOCK2PID(clock_id))) == NULL)
 245                         return (EINVAL);
 246                 lwp_id = CPUCLOCK2LWPID(clock_id);
 247                 if (lwp_id == 0) {
 248                         get_process_cputime(p, ats);
 249                 } else {
 250                         lwkt_gettoken(&p->p_token);
 251                         lp = lwp_rb_tree_RB_LOOKUP(&p->p_lwp_tree, lwp_id);
 252                         if (lp == NULL) {
 253                                 lwkt_reltoken(&p->p_token);
 254                                 PRELE(p);
 255                                 return (EINVAL);
 256                         }
 257                         get_thread_cputime(lp->lwp_thread, ats);
 258                         lwkt_reltoken(&p->p_token);
 259                 }
 260                 PRELE(p);
 261         }
 262         return (0);
 263 }
 264
 265 /*
 266  * MPSAFE
 267  */
 268 int
 269 sys_clock_gettime(struct clock_gettime_args *uap)
 270 {
 271         struct timespec ats;
 272         int error;
 273
 274         error = kern_clock_gettime(uap->clock_id, &ats);
 275         if (error == 0)
 276                 error = copyout(&ats, uap->tp, sizeof(ats));
 277
 278         return (error);
 279 }
 280
 281 int
 282 kern_clock_settime(clockid_t clock_id, struct timespec *ats)
 283 {
 284         struct thread *td = curthread;
 285         struct timeval atv;
 286         int error;
 287
 288         if ((error = priv_check(td, PRIV_CLOCK_SETTIME)) != 0)
 289                 return (error);
 290         if (clock_id != CLOCK_REALTIME)
 291                 return (EINVAL);
 292         if (ats->tv_nsec < 0 || ats->tv_nsec >= 1000000000)
 293                 return (EINVAL);
 294
 295         lockmgr(&masterclock_lock, LK_EXCLUSIVE);
 296         TIMESPEC_TO_TIMEVAL(&atv, ats);
 297         error = settime(&atv);
 298         lockmgr(&masterclock_lock, LK_RELEASE);
 299
 300         return (error);
 301 }
 302
 303 /*
 304  * MPALMOSTSAFE
 305  */
 306 int
 307 sys_clock_settime(struct clock_settime_args *uap)
 308 {
 309         struct timespec ats;
 310         int error;
 311
 312         if ((error = copyin(uap->tp, &ats, sizeof(ats))) != 0)
 313                 return (error);
 314
 315         error = kern_clock_settime(uap->clock_id, &ats);
 316
 317         return (error);
 318 }
 319
 320 /*
 321  * MPSAFE
 322  */
 323 int
 324 kern_clock_getres(clockid_t clock_id, struct timespec *ts)
 325 {
 326         ts->tv_sec = 0;
 327         switch(clock_id) {
 328         case CLOCK_REALTIME:
 329         case CLOCK_REALTIME_FAST:
 330         case CLOCK_REALTIME_PRECISE:
 331         case CLOCK_MONOTONIC:
 332         case CLOCK_MONOTONIC_FAST:
 333         case CLOCK_MONOTONIC_PRECISE:
 334         case CLOCK_UPTIME:
 335         case CLOCK_UPTIME_FAST:
 336         case CLOCK_UPTIME_PRECISE:
 337                 /*
 338                  * Round up the result of the division cheaply
 339                  * by adding 1.  Rounding up is especially important
 340                  * if rounding down would give 0.  Perfect rounding
 341                  * is unimportant.
 342                  */
 343                 ts->tv_nsec = 1000000000 / sys_cputimer->freq + 1;
 344                 break;
 345         case CLOCK_VIRTUAL:
 346         case CLOCK_PROF:
 347                 /* Accurately round up here because we can do so cheaply. */
 348                 ts->tv_nsec = (1000000000 + hz - 1) / hz;
 349                 break;
 350         case CLOCK_SECOND:
 351                 ts->tv_sec = 1;
 352                 ts->tv_nsec = 0;
 353                 break;
 354         case CLOCK_THREAD_CPUTIME_ID:
 355         case CLOCK_PROCESS_CPUTIME_ID:
 356                 ts->tv_nsec = 1000;
 357                 break;
 358         default:
 359                 if ((clock_id & CPUCLOCK_BIT) != 0)
 360                         ts->tv_nsec = 1000;
 361                 else
 362                         return (EINVAL);
 363         }
 364
 365         return (0);
 366 }
 367
 368 /*
 369  * MPSAFE
 370  */
 371 int
 372 sys_clock_getres(struct clock_getres_args *uap)
 373 {
 374         int error;
 375         struct timespec ts;
 376
 377         error = kern_clock_getres(uap->clock_id, &ts);
 378         if (error == 0)
 379                 error = copyout(&ts, uap->tp, sizeof(ts));
 380
 381         return (error);
 382 }
 383
 384 static int
 385 kern_getcpuclockid(pid_t pid, lwpid_t lwp_id, clockid_t *clock_id)
 386 {
 387         struct proc *p;
 388         int error = 0;
 389
 390         if (pid == 0) {
 391                 p = curproc;
 392                 pid = p->p_pid;
 393                 PHOLD(p);
 394         } else {
 395                 p = pfind(pid);
 396                 if (p == NULL)
 397                         return (ESRCH);
 398         }
 399         /* lwp_id can be 0 when called by clock_getcpuclockid() */
 400         if (lwp_id < 0) {
 401                 error = EINVAL;
 402                 goto out;
 403         }
 404         lwkt_gettoken(&p->p_token);
 405         if (lwp_id > 0 &&
 406             lwp_rb_tree_RB_LOOKUP(&p->p_lwp_tree, lwp_id) == NULL) {
 407                 lwkt_reltoken(&p->p_token);
 408                 error = ESRCH;
 409                 goto out;
 410         }
 411         *clock_id = MAKE_CPUCLOCK(pid, lwp_id);
 412         lwkt_reltoken(&p->p_token);
 413 out:
 414         PRELE(p);
 415         return (error);
 416 }
 417
 418 int
 419 sys_getcpuclockid(struct getcpuclockid_args *uap)
 420 {
 421         clockid_t clk_id;
 422         int error;
 423
 424         error = kern_getcpuclockid(uap->pid, uap->lwp_id, &clk_id);
 425         if (error == 0)
 426                 error = copyout(&clk_id, uap->clock_id, sizeof(clockid_t));
 427
 428         return (error);
 429 }
 430
/*
 * nanosleep1()
 *
 *	This is a general helper function for nanosleep() (aka sleep() aka
 *	usleep()).
 *
 *	If there is less than one tick's worth of time left and
 *	we haven't done a yield, or the remaining microseconds is
 *	ridiculously low, do a yield.  This avoids having
 *	to deal with systimer overheads when the system is under
 *	heavy loads.  If we have done a yield already then use
 *	a systimer and an uninterruptible thread wait.
 *
 *	If there is more than a tick's worth of time left,
 *	calculate the baseline ticks and use an interruptible
 *	tsleep, then handle the fine-grained delay on the next
 *	loop.  This usually results in two sleeps occurring, a long one
 *	and a short one.
 *
 * MPSAFE
 */
 452 static void
 453 ns1_systimer(systimer_t info, int in_ipi __unused,
 454     struct intrframe *frame __unused)
 455 {
 456         lwkt_schedule(info->data);
 457 }
 458
 459 int
 460 nanosleep1(struct timespec *rqt, struct timespec *rmt)
 461 {
 462         static int nanowait;
 463         struct timespec ts, ts2, ts3;
 464         struct timeval tv;
 465         int error;
 466
 467         if (rqt->tv_nsec < 0 || rqt->tv_nsec >= 1000000000)
 468                 return (EINVAL);
 469         /* XXX: imho this should return EINVAL at least for tv_sec < 0 */
 470         if (rqt->tv_sec < 0 || (rqt->tv_sec == 0 && rqt->tv_nsec == 0))
 471                 return (0);
 472         nanouptime(&ts);
 473         timespecadd(&ts, rqt);          /* ts = target timestamp compare */
 474         TIMESPEC_TO_TIMEVAL(&tv, rqt);  /* tv = sleep interval */
 475
 476         for (;;) {
 477                 int ticks;
 478                 struct systimer info;
 479
 480                 ticks = tv.tv_usec / ustick;    /* approximate */
 481
 482                 if (tv.tv_sec == 0 && ticks == 0) {
 483                         thread_t td = curthread;
 484                         if (tv.tv_usec > 0 && tv.tv_usec < nanosleep_min_us)
 485                                 tv.tv_usec = nanosleep_min_us;
 486                         if (tv.tv_usec < nanosleep_hard_us) {
 487                                 lwkt_user_yield();
 488                                 cpu_pause();
 489                         } else {
 490                                 crit_enter_quick(td);
 491                                 systimer_init_oneshot(&info, ns1_systimer,
 492                                                 td, tv.tv_usec);
 493                                 lwkt_deschedule_self(td);
 494                                 crit_exit_quick(td);
 495                                 lwkt_switch();
 496                                 systimer_del(&info); /* make sure it's gone */
 497                         }
 498                         error = iscaught(td->td_lwp);
 499                 } else if (tv.tv_sec == 0) {
 500                         error = tsleep(&nanowait, PCATCH, "nanslp", ticks);
 501                 } else {
 502                         ticks = tvtohz_low(&tv); /* also handles overflow */
 503                         error = tsleep(&nanowait, PCATCH, "nanslp", ticks);
 504                 }
 505                 nanouptime(&ts2);
 506                 if (error && error != EWOULDBLOCK) {
 507                         if (error == ERESTART)
 508                                 error = EINTR;
 509                         if (rmt != NULL) {
 510                                 timespecsub(&ts, &ts2);
 511                                 if (ts.tv_sec < 0)
 512                                         timespecclear(&ts);
 513                                 *rmt = ts;
 514                         }
 515                         return (error);
 516                 }
 517                 if (timespeccmp(&ts2, &ts, >=))
 518                         return (0);
 519                 ts3 = ts;
 520                 timespecsub(&ts3, &ts2);
 521                 TIMESPEC_TO_TIMEVAL(&tv, &ts3);
 522         }
 523 }
 524
 525 /*
 526  * MPSAFE
 527  */
 528 int
 529 sys_nanosleep(struct nanosleep_args *uap)
 530 {
 531         int error;
 532         struct timespec rqt;
 533         struct timespec rmt;
 534
 535         error = copyin(uap->rqtp, &rqt, sizeof(rqt));
 536         if (error)
 537                 return (error);
 538
 539         error = nanosleep1(&rqt, &rmt);
 540
 541         /*
 542          * copyout the residual if nanosleep was interrupted.
 543          */
 544         if (error && uap->rmtp) {
 545                 int error2;
 546
 547                 error2 = copyout(&rmt, uap->rmtp, sizeof(rmt));
 548                 if (error2)
 549                         error = error2;
 550         }
 551         return (error);
 552 }
 553
 554 /*
 555  * The gettimeofday() system call is supposed to return a fine-grained
 556  * realtime stamp.  However, acquiring a fine-grained stamp can create a
 557  * bottleneck when multiple cpu cores are trying to accessing e.g. the
 558  * HPET hardware timer all at the same time, so we have a sysctl that
 559  * allows its behavior to be changed to a more coarse-grained timestamp
 560  * which does not have to access a hardware timer.
 561  */
 562 int
 563 sys_gettimeofday(struct gettimeofday_args *uap)
 564 {
 565         struct timeval atv;
 566         int error = 0;
 567
 568         if (uap->tp) {
 569                 if (gettimeofday_quick)
 570                         getmicrotime(&atv);
 571                 else
 572                         microtime(&atv);
 573                 if ((error = copyout((caddr_t)&atv, (caddr_t)uap->tp,
 574                     sizeof (atv))))
 575                         return (error);
 576         }
 577         if (uap->tzp)
 578                 error = copyout((caddr_t)&tz, (caddr_t)uap->tzp,
 579                     sizeof (tz));
 580         return (error);
 581 }
 582
 583 /*
 584  * MPALMOSTSAFE
 585  */
 586 int
 587 sys_settimeofday(struct settimeofday_args *uap)
 588 {
 589         struct thread *td = curthread;
 590         struct timeval atv;
 591         struct timezone atz;
 592         int error;
 593
 594         if ((error = priv_check(td, PRIV_SETTIMEOFDAY)))
 595                 return (error);
 596         /*
 597          * Verify all parameters before changing time.
 598          *
 599          * XXX: We do not allow the time to be set to 0.0, which also by
 600          *      happy coincidence works around a pkgsrc bulk build bug.
 601          */
 602         if (uap->tv) {
 603                 if ((error = copyin((caddr_t)uap->tv, (caddr_t)&atv,
 604                     sizeof(atv))))
 605                         return (error);
 606                 if (atv.tv_usec < 0 || atv.tv_usec >= 1000000)
 607                         return (EINVAL);
 608                 if (atv.tv_sec == 0 && atv.tv_usec == 0)
 609                         return (EINVAL);
 610         }
 611         if (uap->tzp &&
 612             (error = copyin((caddr_t)uap->tzp, (caddr_t)&atz, sizeof(atz))))
 613                 return (error);
 614
 615         lockmgr(&masterclock_lock, LK_EXCLUSIVE);
 616         if (uap->tv && (error = settime(&atv))) {
 617                 lockmgr(&masterclock_lock, LK_RELEASE);
 618                 return (error);
 619         }
 620         lockmgr(&masterclock_lock, LK_RELEASE);
 621
 622         if (uap->tzp)
 623                 tz = atz;
 624         return (0);
 625 }
 626
 627 /*
 628  * WARNING! Run with ntp_spin held
 629  */
 630 static void
 631 kern_adjtime_common(void)
 632 {
 633         if ((ntp_delta >= 0 && ntp_delta < ntp_default_tick_delta) ||
 634             (ntp_delta < 0 && ntp_delta > -ntp_default_tick_delta))
 635                 ntp_tick_delta = ntp_delta;
 636         else if (ntp_delta > ntp_big_delta)
 637                 ntp_tick_delta = 10 * ntp_default_tick_delta;
 638         else if (ntp_delta < -ntp_big_delta)
 639                 ntp_tick_delta = -10 * ntp_default_tick_delta;
 640         else if (ntp_delta > 0)
 641                 ntp_tick_delta = ntp_default_tick_delta;
 642         else
 643                 ntp_tick_delta = -ntp_default_tick_delta;
 644 }
 645
 646 void
 647 kern_adjtime(int64_t delta, int64_t *odelta)
 648 {
 649         spin_lock(&ntp_spin);
 650         *odelta = ntp_delta;
 651         ntp_delta = delta;
 652         kern_adjtime_common();
 653         spin_unlock(&ntp_spin);
 654 }
 655
 656 static void
 657 kern_get_ntp_delta(int64_t *delta)
 658 {
 659         *delta = ntp_delta;
 660 }
 661
 662 void
 663 kern_reladjtime(int64_t delta)
 664 {
 665         spin_lock(&ntp_spin);
 666         ntp_delta += delta;
 667         kern_adjtime_common();
 668         spin_unlock(&ntp_spin);
 669 }
 670
 671 static void
 672 kern_adjfreq(int64_t rate)
 673 {
 674         spin_lock(&ntp_spin);
 675         ntp_tick_permanent = rate;
 676         spin_unlock(&ntp_spin);
 677 }
 678
 679 /*
 680  * MPALMOSTSAFE
 681  */
 682 int
 683 sys_adjtime(struct adjtime_args *uap)
 684 {
 685         struct thread *td = curthread;
 686         struct timeval atv;
 687         int64_t ndelta, odelta;
 688         int error;
 689
 690         if ((error = priv_check(td, PRIV_ADJTIME)))
 691                 return (error);
 692         error = copyin(uap->delta, &atv, sizeof(struct timeval));
 693         if (error)
 694                 return (error);
 695
 696         /*
 697          * Compute the total correction and the rate at which to apply it.
 698          * Round the adjustment down to a whole multiple of the per-tick
 699          * delta, so that after some number of incremental changes in
 700          * hardclock(), tickdelta will become zero, lest the correction
 701          * overshoot and start taking us away from the desired final time.
 702          */
 703         ndelta = (int64_t)atv.tv_sec * 1000000000 + atv.tv_usec * 1000;
 704         kern_adjtime(ndelta, &odelta);
 705
 706         if (uap->olddelta) {
 707                 atv.tv_sec = odelta / 1000000000;
 708                 atv.tv_usec = odelta % 1000000000 / 1000;
 709                 copyout(&atv, uap->olddelta, sizeof(struct timeval));
 710         }
 711         return (0);
 712 }
 713
 714 static int
 715 sysctl_adjtime(SYSCTL_HANDLER_ARGS)
 716 {
 717         int64_t delta;
 718         int error;
 719
 720         if (req->newptr != NULL) {
 721                 if (priv_check(curthread, PRIV_ROOT))
 722                         return (EPERM);
 723                 error = SYSCTL_IN(req, &delta, sizeof(delta));
 724                 if (error)
 725                         return (error);
 726                 kern_reladjtime(delta);
 727         }
 728
 729         if (req->oldptr)
 730                 kern_get_ntp_delta(&delta);
 731         error = SYSCTL_OUT(req, &delta, sizeof(delta));
 732         return (error);
 733 }
 734
 735 /*
 736  * delta is in nanoseconds.
 737  */
 738 static int
 739 sysctl_delta(SYSCTL_HANDLER_ARGS)
 740 {
 741         int64_t delta, old_delta;
 742         int error;
 743
 744         if (req->newptr != NULL) {
 745                 if (priv_check(curthread, PRIV_ROOT))
 746                         return (EPERM);
 747                 error = SYSCTL_IN(req, &delta, sizeof(delta));
 748                 if (error)
 749                         return (error);
 750                 kern_adjtime(delta, &old_delta);
 751         }
 752
 753         if (req->oldptr != NULL)
 754                 kern_get_ntp_delta(&old_delta);
 755         error = SYSCTL_OUT(req, &old_delta, sizeof(old_delta));
 756         return (error);
 757 }
 758
 759 /*
 760  * frequency is in nanoseconds per second shifted left 32.
 761  * kern_adjfreq() needs it in nanoseconds per tick shifted left 32.
 762  */
 763 static int
 764 sysctl_adjfreq(SYSCTL_HANDLER_ARGS)
 765 {
 766         int64_t freqdelta;
 767         int error;
 768
 769         if (req->newptr != NULL) {
 770                 if (priv_check(curthread, PRIV_ROOT))
 771                         return (EPERM);
 772                 error = SYSCTL_IN(req, &freqdelta, sizeof(freqdelta));
 773                 if (error)
 774                         return (error);
 775
 776                 freqdelta /= hz;
 777                 kern_adjfreq(freqdelta);
 778         }
 779
 780         if (req->oldptr != NULL)
 781                 freqdelta = ntp_tick_permanent * hz;
 782         error = SYSCTL_OUT(req, &freqdelta, sizeof(freqdelta));
 783         if (error)
 784                 return (error);
 785
 786         return (0);
 787 }
 788
 789 SYSCTL_NODE(_kern, OID_AUTO, ntp, CTLFLAG_RW, 0, "NTP related controls");
 790 SYSCTL_PROC(_kern_ntp, OID_AUTO, permanent,
 791     CTLTYPE_QUAD|CTLFLAG_RW, 0, 0,
 792     sysctl_adjfreq, "Q", "permanent correction per second");
 793 SYSCTL_PROC(_kern_ntp, OID_AUTO, delta,
 794     CTLTYPE_QUAD|CTLFLAG_RW, 0, 0,
 795     sysctl_delta, "Q", "one-time delta");
 796 SYSCTL_OPAQUE(_kern_ntp, OID_AUTO, big_delta, CTLFLAG_RD,
 797     &ntp_big_delta, sizeof(ntp_big_delta), "Q",
 798     "threshold for fast adjustment");
 799 SYSCTL_OPAQUE(_kern_ntp, OID_AUTO, tick_delta, CTLFLAG_RD,
 800     &ntp_tick_delta, sizeof(ntp_tick_delta), "LU",
 801     "per-tick adjustment");
 802 SYSCTL_OPAQUE(_kern_ntp, OID_AUTO, default_tick_delta, CTLFLAG_RD,
 803     &ntp_default_tick_delta, sizeof(ntp_default_tick_delta), "LU",
 804     "default per-tick adjustment");
 805 SYSCTL_OPAQUE(_kern_ntp, OID_AUTO, next_leap_second, CTLFLAG_RW,
 806     &ntp_leap_second, sizeof(ntp_leap_second), "LU",
 807     "next leap second");
 808 SYSCTL_INT(_kern_ntp, OID_AUTO, insert_leap_second, CTLFLAG_RW,
 809     &ntp_leap_insert, 0, "insert or remove leap second");
 810 SYSCTL_PROC(_kern_ntp, OID_AUTO, adjust,
 811     CTLTYPE_QUAD|CTLFLAG_RW, 0, 0,
 812     sysctl_adjtime, "Q", "relative adjust for delta");
 813
 814 /*
 815  * Get value of an interval timer.  The process virtual and
 816  * profiling virtual time timers are kept in the p_stats area, since
 817  * they can be swapped out.  These are kept internally in the
 818  * way they are specified externally: in time until they expire.
 819  *
 820  * The real time interval timer is kept in the process table slot
 821  * for the process, and its value (it_value) is kept as an
 822  * absolute time rather than as a delta, so that it is easy to keep
 823  * periodic real-time signals from drifting.
 824  *
 825  * Virtual time timers are processed in the hardclock() routine of
 826  * kern_clock.c.  The real time timer is processed by a timeout
 827  * routine, called from the softclock() routine.  Since a callout
 828  * may be delayed in real time due to interrupt processing in the system,
 829  * it is possible for the real time timeout routine (realitexpire, given below),
 830  * to be delayed in real time past when it is supposed to occur.  It
 831  * does not suffice, therefore, to reload the real timer .it_value from the
 832  * real time timers .it_interval.  Rather, we compute the next time in
 833  * absolute time the timer should go off.
 834  *
 835  * MPALMOSTSAFE
 836  */
 837 int
 838 sys_getitimer(struct getitimer_args *uap)
 839 {
 840         struct proc *p = curproc;
 841         struct timeval ctv;
 842         struct itimerval aitv;
 843
 844         if (uap->which > ITIMER_PROF)
 845                 return (EINVAL);
 846         lwkt_gettoken(&p->p_token);
 847         if (uap->which == ITIMER_REAL) {
 848                 /*
 849                  * Convert from absolute to relative time in .it_value
 850                  * part of real time timer.  If time for real time timer
 851                  * has passed return 0, else return difference between
 852                  * current time and time for the timer to go off.
 853                  */
 854                 aitv = p->p_realtimer;
 855                 if (timevalisset(&aitv.it_value)) {
 856                         getmicrouptime(&ctv);
 857                         if (timevalcmp(&aitv.it_value, &ctv, <))
 858                                 timevalclear(&aitv.it_value);
 859                         else
 860                                 timevalsub(&aitv.it_value, &ctv);
 861                 }
 862         } else {
 863                 aitv = p->p_timer[uap->which];
 864         }
 865         lwkt_reltoken(&p->p_token);
 866         return (copyout(&aitv, uap->itv, sizeof (struct itimerval)));
 867 }
 868
 869 /*
 870  * MPALMOSTSAFE
 871  */
 872 int
 873 sys_setitimer(struct setitimer_args *uap)
 874 {
 875         struct itimerval aitv;
 876         struct timeval ctv;
 877         struct itimerval *itvp;
 878         struct proc *p = curproc;
 879         int error;
 880
 881         if (uap->which > ITIMER_PROF)
 882                 return (EINVAL);
 883         itvp = uap->itv;
 884         if (itvp && (error = copyin((caddr_t)itvp, (caddr_t)&aitv,
 885             sizeof(struct itimerval))))
 886                 return (error);
 887         if ((uap->itv = uap->oitv) &&
 888             (error = sys_getitimer((struct getitimer_args *)uap)))
 889                 return (error);
 890         if (itvp == NULL)
 891                 return (0);
 892         if (itimerfix(&aitv.it_value))
 893                 return (EINVAL);
 894         if (!timevalisset(&aitv.it_value))
 895                 timevalclear(&aitv.it_interval);
 896         else if (itimerfix(&aitv.it_interval))
 897                 return (EINVAL);
 898         lwkt_gettoken(&p->p_token);
 899         if (uap->which == ITIMER_REAL) {
 900                 if (timevalisset(&p->p_realtimer.it_value))
 901                         callout_stop_sync(&p->p_ithandle);
 902                 if (timevalisset(&aitv.it_value))
 903                         callout_reset(&p->p_ithandle,
 904                             tvtohz_high(&aitv.it_value), realitexpire, p);
 905                 getmicrouptime(&ctv);
 906                 timevaladd(&aitv.it_value, &ctv);
 907                 p->p_realtimer = aitv;
 908         } else {
 909                 p->p_timer[uap->which] = aitv;
 910                 switch(uap->which) {
 911                 case ITIMER_VIRTUAL:
 912                         p->p_flags &= ~P_SIGVTALRM;
 913                         break;
 914                 case ITIMER_PROF:
 915                         p->p_flags &= ~P_SIGPROF;
 916                         break;
 917                 }
 918         }
 919         lwkt_reltoken(&p->p_token);
 920         return (0);
 921 }
 922
 923 /*
 924  * Real interval timer expired:
 925  * send process whose timer expired an alarm signal.
 926  * If time is not set up to reload, then just return.
 927  * Else compute next time timer should go off which is > current time.
 928  * This is where delay in processing this timeout causes multiple
 929  * SIGALRM calls to be compressed into one.
 930  * tvtohz_high() always adds 1 to allow for the time until the next clock
 931  * interrupt being strictly less than 1 clock tick, but we don't want
 932  * that here since we want to appear to be in sync with the clock
 933  * interrupt even when we're delayed.
 934  */
 935 static
 936 void
 937 realitexpire(void *arg)
 938 {
 939         struct proc *p;
 940         struct timeval ctv, ntv;
 941
 942         p = (struct proc *)arg;
 943         PHOLD(p);
 944         lwkt_gettoken(&p->p_token);
 945         ksignal(p, SIGALRM);
 946         if (!timevalisset(&p->p_realtimer.it_interval)) {
 947                 timevalclear(&p->p_realtimer.it_value);
 948                 goto done;
 949         }
 950         for (;;) {
 951                 timevaladd(&p->p_realtimer.it_value,
 952                            &p->p_realtimer.it_interval);
 953                 getmicrouptime(&ctv);
 954                 if (timevalcmp(&p->p_realtimer.it_value, &ctv, >)) {
 955                         ntv = p->p_realtimer.it_value;
 956                         timevalsub(&ntv, &ctv);
 957                         callout_reset(&p->p_ithandle, tvtohz_low(&ntv),
 958                                       realitexpire, p);
 959                         goto done;
 960                 }
 961         }
 962 done:
 963         lwkt_reltoken(&p->p_token);
 964         PRELE(p);
 965 }
 966
 967 /*
 968  * Used to validate itimer timeouts and utimes*() timespecs.
 969  */
 970 int
 971 itimerfix(struct timeval *tv)
 972 {
 973         if (tv->tv_sec < 0 || tv->tv_usec < 0 || tv->tv_usec >= 1000000)
 974                 return (EINVAL);
 975         if (tv->tv_sec == 0 && tv->tv_usec != 0 && tv->tv_usec < ustick)
 976                 tv->tv_usec = ustick;
 977         return (0);
 978 }
 979
 980 /*
 981  * Used to validate timeouts and utimes*() timespecs.
 982  */
 983 int
 984 itimespecfix(struct timespec *ts)
 985 {
 986         if (ts->tv_sec < 0 || ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000ULL)
 987                 return (EINVAL);
 988         if (ts->tv_sec == 0 && ts->tv_nsec != 0 && ts->tv_nsec < nstick)
 989                 ts->tv_nsec = nstick;
 990         return (0);
 991 }
 992
 993 /*
 994  * Decrement an interval timer by a specified number
 995  * of microseconds, which must be less than a second,
 996  * i.e. < 1000000.  If the timer expires, then reload
 997  * it.  In this case, carry over (usec - old value) to
 998  * reduce the value reloaded into the timer so that
 999  * the timer does not drift.  This routine assumes
1000  * that it is called in a context where the timers
1001  * on which it is operating cannot change in value.
1002  */
1003 int
1004 itimerdecr(struct itimerval *itp, int usec)
1005 {
1006
1007         if (itp->it_value.tv_usec < usec) {
1008                 if (itp->it_value.tv_sec == 0) {
1009                         /* expired, and already in next interval */
1010                         usec -= itp->it_value.tv_usec;
1011                         goto expire;
1012                 }
1013                 itp->it_value.tv_usec += 1000000;
1014                 itp->it_value.tv_sec--;
1015         }
1016         itp->it_value.tv_usec -= usec;
1017         usec = 0;
1018         if (timevalisset(&itp->it_value))
1019                 return (1);
1020         /* expired, exactly at end of interval */
1021 expire:
1022         if (timevalisset(&itp->it_interval)) {
1023                 itp->it_value = itp->it_interval;
1024                 itp->it_value.tv_usec -= usec;
1025                 if (itp->it_value.tv_usec < 0) {
1026                         itp->it_value.tv_usec += 1000000;
1027                         itp->it_value.tv_sec--;
1028                 }
1029         } else
1030                 itp->it_value.tv_usec = 0;              /* sec is already 0 */
1031         return (0);
1032 }
1033
1034 /*
1035  * Add and subtract routines for timevals.
1036  * N.B.: subtract routine doesn't deal with
1037  * results which are before the beginning,
1038  * it just gets very confused in this case.
1039  * Caveat emptor.
1040  */
void
timevaladd(struct timeval *t1, const struct timeval *t2)
{
	/*
	 * t1 += t2, field by field.  The microsecond sum may spill past
	 * one second; timevalfix() carries it back into tv_sec.
	 */
	t1->tv_usec += t2->tv_usec;
	t1->tv_sec += t2->tv_sec;
	timevalfix(t1);
}
1049
void
timevalsub(struct timeval *t1, const struct timeval *t2)
{
	/*
	 * t1 -= t2, field by field.  The microsecond difference may go
	 * negative; timevalfix() borrows from tv_sec to repair it.
	 */
	t1->tv_usec -= t2->tv_usec;
	t1->tv_sec -= t2->tv_sec;
	timevalfix(t1);
}
1058
/*
 * Bring tv_usec back into range after a single add or subtract by
 * moving at most one second between tv_usec and tv_sec.  Values that
 * are off by more than one second are only partially corrected.
 */
static void
timevalfix(struct timeval *t1)
{
	if (t1->tv_usec < 0) {
		/* borrow one second */
		t1->tv_usec += 1000000;
		t1->tv_sec--;
	} else if (t1->tv_usec >= 1000000) {
		/* carry one second */
		t1->tv_usec -= 1000000;
		t1->tv_sec++;
	}
}
1072
1073 /*
1074  * ratecheck(): simple time-based rate-limit checking.
1075  */
1076 int
1077 ratecheck(struct timeval *lasttime, const struct timeval *mininterval)
1078 {
1079         struct timeval tv, delta;
1080         int rv = 0;
1081
1082         getmicrouptime(&tv);            /* NB: 10ms precision */
1083         delta = tv;
1084         timevalsub(&delta, lasttime);
1085
1086         /*
1087          * check for 0,0 is so that the message will be seen at least once,
1088          * even if interval is huge.
1089          */
1090         if (timevalcmp(&delta, mininterval, >=) ||
1091             (lasttime->tv_sec == 0 && lasttime->tv_usec == 0)) {
1092                 *lasttime = tv;
1093                 rv = 1;
1094         }
1095
1096         return (rv);
1097 }
1098
1099 /*
1100  * ppsratecheck(): packets (or events) per second limitation.
1101  *
1102  * Return 0 if the limit is to be enforced (e.g. the caller
1103  * should drop a packet because of the rate limitation).
1104  *
1105  * maxpps of 0 always causes zero to be returned.  maxpps of -1
1106  * always causes 1 to be returned; this effectively defeats rate
1107  * limiting.
1108  *
1109  * Note that we maintain the struct timeval for compatibility
1110  * with other bsd systems.  We reuse the storage and just monitor
1111  * clock ticks for minimal overhead.
1112  */
1113 int
1114 ppsratecheck(struct timeval *lasttime, int *curpps, int maxpps)
1115 {
1116         int now;
1117
1118         /*
1119          * Reset the last time and counter if this is the first call
1120          * or more than a second has passed since the last update of
1121          * lasttime.
1122          */
1123         now = ticks;
1124         if (lasttime->tv_sec == 0 || (u_int)(now - lasttime->tv_sec) >= hz) {
1125                 lasttime->tv_sec = now;
1126                 *curpps = 1;
1127                 return (maxpps != 0);
1128         } else {
1129                 (*curpps)++;            /* NB: ignore potential overflow */
1130                 return (maxpps < 0 || *curpps < maxpps);
1131         }
1132 }