sys/kern/kern_subr.c

   1 /*      $NetBSD: kern_subr.c,v 1.198 2009/01/11 02:45:52 christos Exp $ */
   2
   3 /*-
   4  * Copyright (c) 1997, 1998, 1999, 2002, 2007, 2008 The NetBSD Foundation, Inc.
   5  * All rights reserved.
   6  *
   7  * This code is derived from software contributed to The NetBSD Foundation
   8  * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
   9  * NASA Ames Research Center, and by Luke Mewburn.
  10  *
  11  * Redistribution and use in source and binary forms, with or without
  12  * modification, are permitted provided that the following conditions
  13  * are met:
  14  * 1. Redistributions of source code must retain the above copyright
  15  *    notice, this list of conditions and the following disclaimer.
  16  * 2. Redistributions in binary form must reproduce the above copyright
  17  *    notice, this list of conditions and the following disclaimer in the
  18  *    documentation and/or other materials provided with the distribution.
  19  *
  20  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  22  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  23  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  24  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  25  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  26  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  27  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  28  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  29  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  30  * POSSIBILITY OF SUCH DAMAGE.
  31  */
  32
  33 /*
  34  * Copyright (c) 1982, 1986, 1991, 1993
  35  *      The Regents of the University of California.  All rights reserved.
  36  * (c) UNIX System Laboratories, Inc.
  37  * All or some portions of this file are derived from material licensed
  38  * to the University of California by American Telephone and Telegraph
  39  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  40  * the permission of UNIX System Laboratories, Inc.
  41  *
  42  * Copyright (c) 1992, 1993
  43  *      The Regents of the University of California.  All rights reserved.
  44  *
  45  * This software was developed by the Computer Systems Engineering group
  46  * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
  47  * contributed to Berkeley.
  48  *
  49  * All advertising materials mentioning features or use of this software
  50  * must display the following acknowledgement:
  51  *      This product includes software developed by the University of
  52  *      California, Lawrence Berkeley Laboratory.
  53  *
  54  * Redistribution and use in source and binary forms, with or without
  55  * modification, are permitted provided that the following conditions
  56  * are met:
  57  * 1. Redistributions of source code must retain the above copyright
  58  *    notice, this list of conditions and the following disclaimer.
  59  * 2. Redistributions in binary form must reproduce the above copyright
  60  *    notice, this list of conditions and the following disclaimer in the
  61  *    documentation and/or other materials provided with the distribution.
  62  * 3. Neither the name of the University nor the names of its contributors
  63  *    may be used to endorse or promote products derived from this software
  64  *    without specific prior written permission.
  65  *
  66  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  67  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  68  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  69  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  70  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  71  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  72  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  73  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  74  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  75  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  76  * SUCH DAMAGE.
  77  *
  78  *      @(#)kern_subr.c 8.4 (Berkeley) 2/14/95
  79  */
  80
  81 #include <sys/cdefs.h>
  82 __KERNEL_RCSID(0, "$NetBSD: kern_subr.c,v 1.198 2009/01/11 02:45:52 christos Exp $");
  83
  84 #include "opt_ddb.h"
  85 #include "opt_md.h"
  86 #include "opt_syscall_debug.h"
  87 #include "opt_ktrace.h"
  88 #include "opt_ptrace.h"
  89 #include "opt_powerhook.h"
  90 #include "opt_tftproot.h"
  91
  92 #include <sys/param.h>
  93 #include <sys/systm.h>
  94 #include <sys/proc.h>
  95 #include <sys/malloc.h>
  96 #include <sys/mount.h>
  97 #include <sys/device.h>
  98 #include <sys/reboot.h>
  99 #include <sys/conf.h>
 100 #include <sys/disk.h>
 101 #include <sys/disklabel.h>
 102 #include <sys/queue.h>
 103 #include <sys/ktrace.h>
 104 #include <sys/ptrace.h>
 105 #include <sys/fcntl.h>
 106 #include <sys/kauth.h>
 107 #include <sys/vnode.h>
 108 #include <sys/syscallvar.h>
 109 #include <sys/xcall.h>
 110 #include <sys/module.h>
 111
 112 #include <uvm/uvm_extern.h>
 113
 114 #include <dev/cons.h>
 115
 116 #include <net/if.h>
 117
 118 /* XXX these should eventually move to subr_autoconf.c */
 119 static struct device *finddevice(const char *);
 120 static struct device *getdisk(char *, int, int, dev_t *, int);
 121 static struct device *parsedisk(char *, int, int, dev_t *);
 122 static const char *getwedgename(const char *, int);
 123
 124 /*
 125  * A generic linear hook.
 126  */
 127 struct hook_desc {
 128         LIST_ENTRY(hook_desc) hk_list;
 129         void    (*hk_fn)(void *);
 130         void    *hk_arg;
 131 };
 132 typedef LIST_HEAD(, hook_desc) hook_list_t;
 133
 134 #ifdef TFTPROOT
 135 int tftproot_dhcpboot(struct device *);
 136 #endif
 137
 138 dev_t   dumpcdev;       /* for savecore */
 139
 140 void
 141 uio_setup_sysspace(struct uio *uio)
 142 {
 143
 144         uio->uio_vmspace = vmspace_kernel();
 145 }
 146
 147 int
 148 uiomove(void *buf, size_t n, struct uio *uio)
 149 {
 150         struct vmspace *vm = uio->uio_vmspace;
 151         struct iovec *iov;
 152         size_t cnt;
 153         int error = 0;
 154         char *cp = buf;
 155
 156         ASSERT_SLEEPABLE();
 157
 158 #ifdef DIAGNOSTIC
 159         if (uio->uio_rw != UIO_READ && uio->uio_rw != UIO_WRITE)
 160                 panic("uiomove: mode");
 161 #endif
 162         while (n > 0 && uio->uio_resid) {
 163                 iov = uio->uio_iov;
 164                 cnt = iov->iov_len;
 165                 if (cnt == 0) {
 166                         KASSERT(uio->uio_iovcnt > 0);
 167                         uio->uio_iov++;
 168                         uio->uio_iovcnt--;
 169                         continue;
 170                 }
 171                 if (cnt > n)
 172                         cnt = n;
 173                 if (!VMSPACE_IS_KERNEL_P(vm)) {
 174                         if (curcpu()->ci_schedstate.spc_flags &
 175                             SPCF_SHOULDYIELD)
 176                                 preempt();
 177                 }
 178
 179                 if (uio->uio_rw == UIO_READ) {
 180                         error = copyout_vmspace(vm, cp, iov->iov_base,
 181                             cnt);
 182                 } else {
 183                         error = copyin_vmspace(vm, iov->iov_base, cp,
 184                             cnt);
 185                 }
 186                 if (error) {
 187                         break;
 188                 }
 189                 iov->iov_base = (char *)iov->iov_base + cnt;
 190                 iov->iov_len -= cnt;
 191                 uio->uio_resid -= cnt;
 192                 uio->uio_offset += cnt;
 193                 cp += cnt;
 194                 KDASSERT(cnt <= n);
 195                 n -= cnt;
 196         }
 197
 198         return (error);
 199 }
 200
 201 /*
 202  * Wrapper for uiomove() that validates the arguments against a known-good
 203  * kernel buffer.
 204  */
 205 int
 206 uiomove_frombuf(void *buf, size_t buflen, struct uio *uio)
 207 {
 208         size_t offset;
 209
 210         if (uio->uio_offset < 0 || /* uio->uio_resid < 0 || */
 211             (offset = uio->uio_offset) != uio->uio_offset)
 212                 return (EINVAL);
 213         if (offset >= buflen)
 214                 return (0);
 215         return (uiomove((char *)buf + offset, buflen - offset, uio));
 216 }
 217
 218 /*
 219  * Give next character to user as result of read.
 220  */
 221 int
 222 ureadc(int c, struct uio *uio)
 223 {
 224         struct iovec *iov;
 225
 226         if (uio->uio_resid <= 0)
 227                 panic("ureadc: non-positive resid");
 228 again:
 229         if (uio->uio_iovcnt <= 0)
 230                 panic("ureadc: non-positive iovcnt");
 231         iov = uio->uio_iov;
 232         if (iov->iov_len <= 0) {
 233                 uio->uio_iovcnt--;
 234                 uio->uio_iov++;
 235                 goto again;
 236         }
 237         if (!VMSPACE_IS_KERNEL_P(uio->uio_vmspace)) {
 238                 if (subyte(iov->iov_base, c) < 0)
 239                         return (EFAULT);
 240         } else {
 241                 *(char *)iov->iov_base = c;
 242         }
 243         iov->iov_base = (char *)iov->iov_base + 1;
 244         iov->iov_len--;
 245         uio->uio_resid--;
 246         uio->uio_offset++;
 247         return (0);
 248 }
 249
 250 /*
 251  * Like copyin(), but operates on an arbitrary vmspace.
 252  */
 253 int
 254 copyin_vmspace(struct vmspace *vm, const void *uaddr, void *kaddr, size_t len)
 255 {
 256         struct iovec iov;
 257         struct uio uio;
 258         int error;
 259
 260         if (len == 0)
 261                 return (0);
 262
 263         if (VMSPACE_IS_KERNEL_P(vm)) {
 264                 return kcopy(uaddr, kaddr, len);
 265         }
 266         if (__predict_true(vm == curproc->p_vmspace)) {
 267                 return copyin(uaddr, kaddr, len);
 268         }
 269
 270         iov.iov_base = kaddr;
 271         iov.iov_len = len;
 272         uio.uio_iov = &iov;
 273         uio.uio_iovcnt = 1;
 274         uio.uio_offset = (off_t)(uintptr_t)uaddr;
 275         uio.uio_resid = len;
 276         uio.uio_rw = UIO_READ;
 277         UIO_SETUP_SYSSPACE(&uio);
 278         error = uvm_io(&vm->vm_map, &uio);
 279
 280         return (error);
 281 }
 282
 283 /*
 284  * Like copyout(), but operates on an arbitrary vmspace.
 285  */
 286 int
 287 copyout_vmspace(struct vmspace *vm, const void *kaddr, void *uaddr, size_t len)
 288 {
 289         struct iovec iov;
 290         struct uio uio;
 291         int error;
 292
 293         if (len == 0)
 294                 return (0);
 295
 296         if (VMSPACE_IS_KERNEL_P(vm)) {
 297                 return kcopy(kaddr, uaddr, len);
 298         }
 299         if (__predict_true(vm == curproc->p_vmspace)) {
 300                 return copyout(kaddr, uaddr, len);
 301         }
 302
 303         iov.iov_base = __UNCONST(kaddr); /* XXXUNCONST cast away const */
 304         iov.iov_len = len;
 305         uio.uio_iov = &iov;
 306         uio.uio_iovcnt = 1;
 307         uio.uio_offset = (off_t)(uintptr_t)uaddr;
 308         uio.uio_resid = len;
 309         uio.uio_rw = UIO_WRITE;
 310         UIO_SETUP_SYSSPACE(&uio);
 311         error = uvm_io(&vm->vm_map, &uio);
 312
 313         return (error);
 314 }
 315
 316 /*
 317  * Like copyin(), but operates on an arbitrary process.
 318  */
 319 int
 320 copyin_proc(struct proc *p, const void *uaddr, void *kaddr, size_t len)
 321 {
 322         struct vmspace *vm;
 323         int error;
 324
 325         error = proc_vmspace_getref(p, &vm);
 326         if (error) {
 327                 return error;
 328         }
 329         error = copyin_vmspace(vm, uaddr, kaddr, len);
 330         uvmspace_free(vm);
 331
 332         return error;
 333 }
 334
 335 /*
 336  * Like copyout(), but operates on an arbitrary process.
 337  */
 338 int
 339 copyout_proc(struct proc *p, const void *kaddr, void *uaddr, size_t len)
 340 {
 341         struct vmspace *vm;
 342         int error;
 343
 344         error = proc_vmspace_getref(p, &vm);
 345         if (error) {
 346                 return error;
 347         }
 348         error = copyout_vmspace(vm, kaddr, uaddr, len);
 349         uvmspace_free(vm);
 350
 351         return error;
 352 }
 353
 354 /*
 355  * Like copyin(), except it operates on kernel addresses when the FKIOCTL
 356  * flag is passed in `ioctlflags' from the ioctl call.
 357  */
 358 int
 359 ioctl_copyin(int ioctlflags, const void *src, void *dst, size_t len)
 360 {
 361         if (ioctlflags & FKIOCTL)
 362                 return kcopy(src, dst, len);
 363         return copyin(src, dst, len);
 364 }
 365
 366 /*
 367  * Like copyout(), except it operates on kernel addresses when the FKIOCTL
 368  * flag is passed in `ioctlflags' from the ioctl call.
 369  */
 370 int
 371 ioctl_copyout(int ioctlflags, const void *src, void *dst, size_t len)
 372 {
 373         if (ioctlflags & FKIOCTL)
 374                 return kcopy(src, dst, len);
 375         return copyout(src, dst, len);
 376 }
 377
 378 static void *
 379 hook_establish(hook_list_t *list, void (*fn)(void *), void *arg)
 380 {
 381         struct hook_desc *hd;
 382
 383         hd = malloc(sizeof(*hd), M_DEVBUF, M_NOWAIT);
 384         if (hd == NULL)
 385                 return (NULL);
 386
 387         hd->hk_fn = fn;
 388         hd->hk_arg = arg;
 389         LIST_INSERT_HEAD(list, hd, hk_list);
 390
 391         return (hd);
 392 }
 393
 394 static void
 395 hook_disestablish(hook_list_t *list, void *vhook)
 396 {
 397 #ifdef DIAGNOSTIC
 398         struct hook_desc *hd;
 399
 400         LIST_FOREACH(hd, list, hk_list) {
 401                 if (hd == vhook)
 402                         break;
 403         }
 404
 405         if (hd == NULL)
 406                 panic("hook_disestablish: hook %p not established", vhook);
 407 #endif
 408         LIST_REMOVE((struct hook_desc *)vhook, hk_list);
 409         free(vhook, M_DEVBUF);
 410 }
 411
 412 static void
 413 hook_destroy(hook_list_t *list)
 414 {
 415         struct hook_desc *hd;
 416
 417         while ((hd = LIST_FIRST(list)) != NULL) {
 418                 LIST_REMOVE(hd, hk_list);
 419                 free(hd, M_DEVBUF);
 420         }
 421 }
 422
 423 static void
 424 hook_proc_run(hook_list_t *list, struct proc *p)
 425 {
 426         struct hook_desc *hd;
 427
 428         LIST_FOREACH(hd, list, hk_list)
 429                 ((void (*)(struct proc *, void *))*hd->hk_fn)(p, hd->hk_arg);
 430 }
 431
 432 /*
 433  * "Shutdown hook" types, functions, and variables.
 434  *
 435  * Should be invoked immediately before the
 436  * system is halted or rebooted, i.e. after file systems unmounted,
 437  * after crash dump done, etc.
 438  *
 439  * Each shutdown hook is removed from the list before it's run, so that
 440  * it won't be run again.
 441  */
 442
 443 static hook_list_t shutdownhook_list;
 444
 445 void *
 446 shutdownhook_establish(void (*fn)(void *), void *arg)
 447 {
 448         return hook_establish(&shutdownhook_list, fn, arg);
 449 }
 450
 451 void
 452 shutdownhook_disestablish(void *vhook)
 453 {
 454         hook_disestablish(&shutdownhook_list, vhook);
 455 }
 456
 457 /*
 458  * Run shutdown hooks.  Should be invoked immediately before the
 459  * system is halted or rebooted, i.e. after file systems unmounted,
 460  * after crash dump done, etc.
 461  *
 462  * Each shutdown hook is removed from the list before it's run, so that
 463  * it won't be run again.
 464  */
 465 void
 466 doshutdownhooks(void)
 467 {
 468         struct hook_desc *dp;
 469
 470         while ((dp = LIST_FIRST(&shutdownhook_list)) != NULL) {
 471                 LIST_REMOVE(dp, hk_list);
 472                 (*dp->hk_fn)(dp->hk_arg);
 473 #if 0
 474                 /*
 475                  * Don't bother freeing the hook structure,, since we may
 476                  * be rebooting because of a memory corruption problem,
 477                  * and this might only make things worse.  It doesn't
 478                  * matter, anyway, since the system is just about to
 479                  * reboot.
 480                  */
 481                 free(dp, M_DEVBUF);
 482 #endif
 483         }
 484 }
 485
 486 /*
 487  * "Mountroot hook" types, functions, and variables.
 488  */
 489
 490 static hook_list_t mountroothook_list;
 491
 492 void *
 493 mountroothook_establish(void (*fn)(struct device *), struct device *dev)
 494 {
 495         return hook_establish(&mountroothook_list, (void (*)(void *))fn, dev);
 496 }
 497
 498 void
 499 mountroothook_disestablish(void *vhook)
 500 {
 501         hook_disestablish(&mountroothook_list, vhook);
 502 }
 503
 504 void
 505 mountroothook_destroy(void)
 506 {
 507         hook_destroy(&mountroothook_list);
 508 }
 509
 510 void
 511 domountroothook(void)
 512 {
 513         struct hook_desc *hd;
 514
 515         LIST_FOREACH(hd, &mountroothook_list, hk_list) {
 516                 if (hd->hk_arg == (void *)root_device) {
 517                         (*hd->hk_fn)(hd->hk_arg);
 518                         return;
 519                 }
 520         }
 521 }
 522
 523 static hook_list_t exechook_list;
 524
 525 void *
 526 exechook_establish(void (*fn)(struct proc *, void *), void *arg)
 527 {
 528         return hook_establish(&exechook_list, (void (*)(void *))fn, arg);
 529 }
 530
 531 void
 532 exechook_disestablish(void *vhook)
 533 {
 534         hook_disestablish(&exechook_list, vhook);
 535 }
 536
 537 /*
 538  * Run exec hooks.
 539  */
 540 void
 541 doexechooks(struct proc *p)
 542 {
 543         hook_proc_run(&exechook_list, p);
 544 }
 545
 546 static hook_list_t exithook_list;
 547 extern krwlock_t exec_lock;
 548
 549 void *
 550 exithook_establish(void (*fn)(struct proc *, void *), void *arg)
 551 {
 552         void *rv;
 553
 554         rw_enter(&exec_lock, RW_WRITER);
 555         rv = hook_establish(&exithook_list, (void (*)(void *))fn, arg);
 556         rw_exit(&exec_lock);
 557         return rv;
 558 }
 559
 560 void
 561 exithook_disestablish(void *vhook)
 562 {
 563
 564         rw_enter(&exec_lock, RW_WRITER);
 565         hook_disestablish(&exithook_list, vhook);
 566         rw_exit(&exec_lock);
 567 }
 568
 569 /*
 570  * Run exit hooks.
 571  */
 572 void
 573 doexithooks(struct proc *p)
 574 {
 575         hook_proc_run(&exithook_list, p);
 576 }
 577
 578 static hook_list_t forkhook_list;
 579
 580 void *
 581 forkhook_establish(void (*fn)(struct proc *, struct proc *))
 582 {
 583         return hook_establish(&forkhook_list, (void (*)(void *))fn, NULL);
 584 }
 585
 586 void
 587 forkhook_disestablish(void *vhook)
 588 {
 589         hook_disestablish(&forkhook_list, vhook);
 590 }
 591
 592 /*
 593  * Run fork hooks.
 594  */
 595 void
 596 doforkhooks(struct proc *p2, struct proc *p1)
 597 {
 598         struct hook_desc *hd;
 599
 600         LIST_FOREACH(hd, &forkhook_list, hk_list) {
 601                 ((void (*)(struct proc *, struct proc *))*hd->hk_fn)
 602                     (p2, p1);
 603         }
 604 }
 605
 606 /*
 607  * "Power hook" types, functions, and variables.
 608  * The list of power hooks is kept ordered with the last registered hook
 609  * first.
 610  * When running the hooks on power down the hooks are called in reverse
 611  * registration order, when powering up in registration order.
 612  */
 613 struct powerhook_desc {
 614         CIRCLEQ_ENTRY(powerhook_desc) sfd_list;
 615         void    (*sfd_fn)(int, void *);
 616         void    *sfd_arg;
 617         char    sfd_name[16];
 618 };
 619
 620 static CIRCLEQ_HEAD(, powerhook_desc) powerhook_list =
 621     CIRCLEQ_HEAD_INITIALIZER(powerhook_list);
 622
 623 void *
 624 powerhook_establish(const char *name, void (*fn)(int, void *), void *arg)
 625 {
 626         struct powerhook_desc *ndp;
 627
 628         ndp = (struct powerhook_desc *)
 629             malloc(sizeof(*ndp), M_DEVBUF, M_NOWAIT);
 630         if (ndp == NULL)
 631                 return (NULL);
 632
 633         ndp->sfd_fn = fn;
 634         ndp->sfd_arg = arg;
 635         strlcpy(ndp->sfd_name, name, sizeof(ndp->sfd_name));
 636         CIRCLEQ_INSERT_HEAD(&powerhook_list, ndp, sfd_list);
 637
 638         aprint_error("%s: WARNING: powerhook_establish is deprecated\n", name);
 639         return (ndp);
 640 }
 641
 642 void
 643 powerhook_disestablish(void *vhook)
 644 {
 645 #ifdef DIAGNOSTIC
 646         struct powerhook_desc *dp;
 647
 648         CIRCLEQ_FOREACH(dp, &powerhook_list, sfd_list)
 649                 if (dp == vhook)
 650                         goto found;
 651         panic("powerhook_disestablish: hook %p not established", vhook);
 652  found:
 653 #endif
 654
 655         CIRCLEQ_REMOVE(&powerhook_list, (struct powerhook_desc *)vhook,
 656             sfd_list);
 657         free(vhook, M_DEVBUF);
 658 }
 659
 660 /*
 661  * Run power hooks.
 662  */
 663 void
 664 dopowerhooks(int why)
 665 {
 666         struct powerhook_desc *dp;
 667
 668 #ifdef POWERHOOK_DEBUG
 669         const char *why_name;
 670         static const char * pwr_names[] = {PWR_NAMES};
 671         why_name = why < __arraycount(pwr_names) ? pwr_names[why] : "???";
 672 #endif
 673
 674         if (why == PWR_RESUME || why == PWR_SOFTRESUME) {
 675                 CIRCLEQ_FOREACH_REVERSE(dp, &powerhook_list, sfd_list) {
 676 #ifdef POWERHOOK_DEBUG
 677                         printf("dopowerhooks %s: %s (%p)\n", why_name, dp->sfd_name, dp);
 678 #endif
 679                         (*dp->sfd_fn)(why, dp->sfd_arg);
 680                 }
 681         } else {
 682                 CIRCLEQ_FOREACH(dp, &powerhook_list, sfd_list) {
 683 #ifdef POWERHOOK_DEBUG
 684                         printf("dopowerhooks %s: %s (%p)\n", why_name, dp->sfd_name, dp);
 685 #endif
 686                         (*dp->sfd_fn)(why, dp->sfd_arg);
 687                 }
 688         }
 689
 690 #ifdef POWERHOOK_DEBUG
 691         printf("dopowerhooks: %s done\n", why_name);
 692 #endif
 693 }
 694
 695 static int
 696 isswap(struct device *dv)
 697 {
 698         struct dkwedge_info wi;
 699         struct vnode *vn;
 700         int error;
 701
 702         if (device_class(dv) != DV_DISK || !device_is_a(dv, "dk"))
 703                 return 0;
 704
 705         if ((vn = opendisk(dv)) == NULL)
 706                 return 0;
 707
 708         error = VOP_IOCTL(vn, DIOCGWEDGEINFO, &wi, FREAD, NOCRED);
 709         VOP_CLOSE(vn, FREAD, NOCRED);
 710         vput(vn);
 711         if (error) {
 712 #ifdef DEBUG_WEDGE
 713                 printf("%s: Get wedge info returned %d\n", device_xname(dv), error);
 714 #endif
 715                 return 0;
 716         }
 717         return strcmp(wi.dkw_ptype, DKW_PTYPE_SWAP) == 0;
 718 }
 719
 720 /*
 721  * Determine the root device and, if instructed to, the root file system.
 722  */
 723
 724 #include "md.h"
 725
 726 #if NMD > 0
 727 extern struct cfdriver md_cd;
 728 #ifdef MEMORY_DISK_IS_ROOT
 729 int md_is_root = 1;
 730 #else
 731 int md_is_root = 0;
 732 #endif
 733 #endif
 734
 735 /*
 736  * The device and wedge that we booted from.  If booted_wedge is NULL,
 737  * the we might consult booted_partition.
 738  */
 739 struct device *booted_device;
 740 struct device *booted_wedge;
 741 int booted_partition;
 742
 743 /*
 744  * Use partition letters if it's a disk class but not a wedge.
 745  * XXX Check for wedge is kinda gross.
 746  */
 747 #define DEV_USES_PARTITIONS(dv)                                         \
 748         (device_class((dv)) == DV_DISK &&                               \
 749          !device_is_a((dv), "dk"))
 750
 751 void
 752 setroot(struct device *bootdv, int bootpartition)
 753 {
 754         struct device *dv;
 755         int len, majdev;
 756         dev_t nrootdev;
 757         dev_t ndumpdev = NODEV;
 758         char buf[128];
 759         const char *rootdevname;
 760         const char *dumpdevname;
 761         struct device *rootdv = NULL;           /* XXX gcc -Wuninitialized */
 762         struct device *dumpdv = NULL;
 763         struct ifnet *ifp;
 764         const char *deffsname;
 765         struct vfsops *vops;
 766
 767 #ifdef TFTPROOT
 768         if (tftproot_dhcpboot(bootdv) != 0)
 769                 boothowto |= RB_ASKNAME;
 770 #endif
 771
 772 #if NMD > 0
 773         if (md_is_root) {
 774                 /*
 775                  * XXX there should be "root on md0" in the config file,
 776                  * but it isn't always
 777                  */
 778                 bootdv = md_cd.cd_devs[0];
 779                 bootpartition = 0;
 780         }
 781 #endif
 782
 783         /*
 784          * If NFS is specified as the file system, and we found
 785          * a DV_DISK boot device (or no boot device at all), then
 786          * find a reasonable network interface for "rootspec".
 787          */
 788         vops = vfs_getopsbyname(MOUNT_NFS);
 789         if (vops != NULL && strcmp(rootfstype, MOUNT_NFS) == 0 &&
 790             rootspec == NULL &&
 791             (bootdv == NULL || device_class(bootdv) != DV_IFNET)) {
 792                 IFNET_FOREACH(ifp) {
 793                         if ((ifp->if_flags &
 794                              (IFF_LOOPBACK|IFF_POINTOPOINT)) == 0)
 795                                 break;
 796                 }
 797                 if (ifp == NULL) {
 798                         /*
 799                          * Can't find a suitable interface; ask the
 800                          * user.
 801                          */
 802                         boothowto |= RB_ASKNAME;
 803                 } else {
 804                         /*
 805                          * Have a suitable interface; behave as if
 806                          * the user specified this interface.
 807                          */
 808                         rootspec = (const char *)ifp->if_xname;
 809                 }
 810         }
 811         if (vops != NULL)
 812                 vfs_delref(vops);
 813
 814         /*
 815          * If wildcarded root and we the boot device wasn't determined,
 816          * ask the user.
 817          */
 818         if (rootspec == NULL && bootdv == NULL)
 819                 boothowto |= RB_ASKNAME;
 820
 821  top:
 822         if (boothowto & RB_ASKNAME) {
 823                 struct device *defdumpdv;
 824
 825                 for (;;) {
 826                         printf("root device");
 827                         if (bootdv != NULL) {
 828                                 printf(" (default %s", device_xname(bootdv));
 829                                 if (DEV_USES_PARTITIONS(bootdv))
 830                                         printf("%c", bootpartition + 'a');
 831                                 printf(")");
 832                         }
 833                         printf(": ");
 834                         len = cngetsn(buf, sizeof(buf));
 835                         if (len == 0 && bootdv != NULL) {
 836                                 strlcpy(buf, device_xname(bootdv), sizeof(buf));
 837                                 len = strlen(buf);
 838                         }
 839                         if (len > 0 && buf[len - 1] == '*') {
 840                                 buf[--len] = '\0';
 841                                 dv = getdisk(buf, len, 1, &nrootdev, 0);
 842                                 if (dv != NULL) {
 843                                         rootdv = dv;
 844                                         break;
 845                                 }
 846                         }
 847                         dv = getdisk(buf, len, bootpartition, &nrootdev, 0);
 848                         if (dv != NULL) {
 849                                 rootdv = dv;
 850                                 break;
 851                         }
 852                 }
 853
 854                 /*
 855                  * Set up the default dump device.  If root is on
 856                  * a network device, there is no default dump
 857                  * device, since we don't support dumps to the
 858                  * network.
 859                  */
 860                 if (DEV_USES_PARTITIONS(rootdv) == 0)
 861                         defdumpdv = NULL;
 862                 else
 863                         defdumpdv = rootdv;
 864
 865                 for (;;) {
 866                         printf("dump device");
 867                         if (defdumpdv != NULL) {
 868                                 /*
 869                                  * Note, we know it's a disk if we get here.
 870                                  */
 871                                 printf(" (default %sb)", device_xname(defdumpdv));
 872                         }
 873                         printf(": ");
 874                         len = cngetsn(buf, sizeof(buf));
 875                         if (len == 0) {
 876                                 if (defdumpdv != NULL) {
 877                                         ndumpdev = MAKEDISKDEV(major(nrootdev),
 878                                             DISKUNIT(nrootdev), 1);
 879                                 }
 880                                 dumpdv = defdumpdv;
 881                                 break;
 882                         }
 883                         if (len == 4 && strcmp(buf, "none") == 0) {
 884                                 dumpdv = NULL;
 885                                 break;
 886                         }
 887                         dv = getdisk(buf, len, 1, &ndumpdev, 1);
 888                         if (dv != NULL) {
 889                                 dumpdv = dv;
 890                                 break;
 891                         }
 892                 }
 893
 894                 rootdev = nrootdev;
 895                 dumpdev = ndumpdev;
 896
 897                 for (vops = LIST_FIRST(&vfs_list); vops != NULL;
 898                      vops = LIST_NEXT(vops, vfs_list)) {
 899                         if (vops->vfs_mountroot != NULL &&
 900                             strcmp(rootfstype, vops->vfs_name) == 0)
 901                         break;
 902                 }
 903
 904                 if (vops == NULL) {
 905                         deffsname = "generic";
 906                 } else
 907                         deffsname = vops->vfs_name;
 908
 909                 for (;;) {
 910                         printf("file system (default %s): ", deffsname);
 911                         len = cngetsn(buf, sizeof(buf));
 912                         if (len == 0) {
 913                                 if (strcmp(deffsname, "generic") == 0)
 914                                         rootfstype = ROOT_FSTYPE_ANY;
 915                                 break;
 916                         }
 917                         if (len == 4 && strcmp(buf, "halt") == 0)
 918                                 cpu_reboot(RB_HALT, NULL);
 919                         else if (len == 6 && strcmp(buf, "reboot") == 0)
 920                                 cpu_reboot(0, NULL);
 921 #if defined(DDB)
 922                         else if (len == 3 && strcmp(buf, "ddb") == 0) {
 923                                 console_debugger();
 924                         }
 925 #endif
 926                         else if (len == 7 && strcmp(buf, "generic") == 0) {
 927                                 rootfstype = ROOT_FSTYPE_ANY;
 928                                 break;
 929                         }
 930                         vops = vfs_getopsbyname(buf);
 931                         if (vops == NULL || vops->vfs_mountroot == NULL) {
 932                                 printf("use one of: generic");
 933                                 for (vops = LIST_FIRST(&vfs_list);
 934                                      vops != NULL;
 935                                      vops = LIST_NEXT(vops, vfs_list)) {
 936                                         if (vops->vfs_mountroot != NULL)
 937                                                 printf(" %s", vops->vfs_name);
 938                                 }
 939                                 if (vops != NULL)
 940                                         vfs_delref(vops);
 941 #if defined(DDB)
 942                                 printf(" ddb");
 943 #endif
 944                                 printf(" halt reboot\n");
 945                         } else {
 946                                 /*
 947                                  * XXX If *vops gets freed between here and
 948                                  * the call to mountroot(), rootfstype will
 949                                  * point to something unexpected.  But in
 950                                  * this case the system will fail anyway.
 951                                  */
 952                                 rootfstype = vops->vfs_name;
 953                                 vfs_delref(vops);
 954                                 break;
 955                         }
 956                 }
 957
 958         } else if (rootspec == NULL) {
 959                 /*
 960                  * Wildcarded root; use the boot device.
 961                  */
 962                 rootdv = bootdv;
 963
 964                 if (bootdv)
 965                         majdev = devsw_name2blk(device_xname(bootdv), NULL, 0);
 966                 else
 967                         majdev = -1;
 968                 if (majdev >= 0) {
 969                         /*
 970                          * Root is on a disk.  `bootpartition' is root,
 971                          * unless the device does not use partitions.
 972                          */
 973                         if (DEV_USES_PARTITIONS(bootdv))
 974                                 rootdev = MAKEDISKDEV(majdev,
 975                                                       device_unit(bootdv),
 976                                                       bootpartition);
 977                         else
 978                                 rootdev = makedev(majdev, device_unit(bootdv));
 979                 }
 980         } else {
 981
 982                 /*
 983                  * `root on <dev> ...'
 984                  */
 985
 986                 /*
 987                  * If it's a network interface, we can bail out
 988                  * early.
 989                  */
 990                 dv = finddevice(rootspec);
 991                 if (dv != NULL && device_class(dv) == DV_IFNET) {
 992                         rootdv = dv;
 993                         goto haveroot;
 994                 }
 995
 996                 if (rootdev == NODEV &&
 997                     device_class(dv) == DV_DISK && device_is_a(dv, "dk") &&
 998                     (majdev = devsw_name2blk(device_xname(dv), NULL, 0)) >= 0)
 999                         rootdev = makedev(majdev, device_unit(dv));
1000
1001                 rootdevname = devsw_blk2name(major(rootdev));
1002                 if (rootdevname == NULL) {
1003                         printf("unknown device major 0x%llx\n",
1004                             (unsigned long long)rootdev);
1005                         boothowto |= RB_ASKNAME;
1006                         goto top;
1007                 }
1008                 memset(buf, 0, sizeof(buf));
1009                 snprintf(buf, sizeof(buf), "%s%llu", rootdevname,
1010                     (unsigned long long)DISKUNIT(rootdev));
1011
1012                 rootdv = finddevice(buf);
1013                 if (rootdv == NULL) {
1014                         printf("device %s (0x%llx) not configured\n",
1015                             buf, (unsigned long long)rootdev);
1016                         boothowto |= RB_ASKNAME;
1017                         goto top;
1018                 }
1019         }
1020
1021  haveroot:
1022
1023         root_device = rootdv;
1024
1025         switch (device_class(rootdv)) {
1026         case DV_IFNET:
1027         case DV_DISK:
1028                 aprint_normal("root on %s", device_xname(rootdv));
1029                 if (DEV_USES_PARTITIONS(rootdv))
1030                         aprint_normal("%c", (int)DISKPART(rootdev) + 'a');
1031                 break;
1032
1033         default:
1034                 printf("can't determine root device\n");
1035                 boothowto |= RB_ASKNAME;
1036                 goto top;
1037         }
1038
1039         /*
1040          * Now configure the dump device.
1041          *
1042          * If we haven't figured out the dump device, do so, with
1043          * the following rules:
1044          *
1045          *      (a) We already know dumpdv in the RB_ASKNAME case.
1046          *
1047          *      (b) If dumpspec is set, try to use it.  If the device
1048          *          is not available, punt.
1049          *
1050          *      (c) If dumpspec is not set, the dump device is
1051          *          wildcarded or unspecified.  If the root device
1052          *          is DV_IFNET, punt.  Otherwise, use partition b
1053          *          of the root device.
1054          */
1055
1056         if (boothowto & RB_ASKNAME) {           /* (a) */
1057                 if (dumpdv == NULL)
1058                         goto nodumpdev;
1059         } else if (dumpspec != NULL) {          /* (b) */
1060                 if (strcmp(dumpspec, "none") == 0 || dumpdev == NODEV) {
1061                         /*
1062                          * Operator doesn't want a dump device.
1063                          * Or looks like they tried to pick a network
1064                          * device.  Oops.
1065                          */
1066                         goto nodumpdev;
1067                 }
1068
1069                 dumpdevname = devsw_blk2name(major(dumpdev));
1070                 if (dumpdevname == NULL)
1071                         goto nodumpdev;
1072                 memset(buf, 0, sizeof(buf));
1073                 snprintf(buf, sizeof(buf), "%s%llu", dumpdevname,
1074                     (unsigned long long)DISKUNIT(dumpdev));
1075
1076                 dumpdv = finddevice(buf);
1077                 if (dumpdv == NULL) {
1078                         /*
1079                          * Device not configured.
1080                          */
1081                         goto nodumpdev;
1082                 }
1083         } else {                                /* (c) */
1084                 if (DEV_USES_PARTITIONS(rootdv) == 0) {
1085                         for (dv = TAILQ_FIRST(&alldevs); dv != NULL;
1086                             dv = TAILQ_NEXT(dv, dv_list))
1087                                 if (isswap(dv))
1088                                         break;
1089                         if (dv == NULL)
1090                                 goto nodumpdev;
1091
1092                         majdev = devsw_name2blk(device_xname(dv), NULL, 0);
1093                         if (majdev < 0)
1094                                 goto nodumpdev;
1095                         dumpdv = dv;
1096                         dumpdev = makedev(majdev, device_unit(dumpdv));
1097                 } else {
1098                         dumpdv = rootdv;
1099                         dumpdev = MAKEDISKDEV(major(rootdev),
1100                             device_unit(dumpdv), 1);
1101                 }
1102         }
1103
1104         dumpcdev = devsw_blk2chr(dumpdev);
1105         aprint_normal(" dumps on %s", device_xname(dumpdv));
1106         if (DEV_USES_PARTITIONS(dumpdv))
1107                 aprint_normal("%c", (int)DISKPART(dumpdev) + 'a');
1108         aprint_normal("\n");
1109         return;
1110
1111  nodumpdev:
1112         dumpdev = NODEV;
1113         dumpcdev = NODEV;
1114         aprint_normal("\n");
1115 }
1116
/*
 * Look up a device by the name the operator (or config) supplied.
 * A name that getwedgename() recognizes as a wedge reference is resolved
 * through dkwedge_find_by_wname(); anything else is looked up by its
 * external name with device_find_by_xname().
 */
static struct device *
finddevice(const char *name)
{
	const char *wedge = getwedgename(name, strlen(name));

	if (wedge == NULL)
		return device_find_by_xname(name);

	return dkwedge_find_by_wname(wedge);
}
1127
1128 static struct device *
1129 getdisk(char *str, int len, int defpart, dev_t *devp, int isdump)
1130 {
1131         struct device   *dv;
1132
1133         if ((dv = parsedisk(str, len, defpart, devp)) == NULL) {
1134                 printf("use one of:");
1135                 TAILQ_FOREACH(dv, &alldevs, dv_list) {
1136                         if (DEV_USES_PARTITIONS(dv))
1137                                 printf(" %s[a-%c]", device_xname(dv),
1138                                     'a' + MAXPARTITIONS - 1);
1139                         else if (device_class(dv) == DV_DISK)
1140                                 printf(" %s", device_xname(dv));
1141                         if (isdump == 0 && device_class(dv) == DV_IFNET)
1142                                 printf(" %s", device_xname(dv));
1143                 }
1144                 dkwedge_print_wnames();
1145                 if (isdump)
1146                         printf(" none");
1147 #if defined(DDB)
1148                 printf(" ddb");
1149 #endif
1150                 printf(" halt reboot\n");
1151         }
1152         return dv;
1153 }
1154
/*
 * If `name' (of which `namelen' bytes are significant) starts with the
 * "wedge:" prefix, return a pointer to the text just past the prefix;
 * otherwise return NULL.  The returned pointer aliases `name'.
 */
static const char *
getwedgename(const char *name, int namelen)
{
	static const char prefix[] = "wedge:";
	const int prefixlen = (int)(sizeof(prefix) - 1);

	if (namelen >= prefixlen && strncmp(name, prefix, prefixlen) == 0)
		return name + prefixlen;

	return NULL;
}
1166
1167 static struct device *
1168 parsedisk(char *str, int len, int defpart, dev_t *devp)
1169 {
1170         struct device *dv;
1171         const char *wname;
1172         char *cp, c;
1173         int majdev, part;
1174         if (len == 0)
1175                 return (NULL);
1176
1177         if (len == 4 && strcmp(str, "halt") == 0)
1178                 cpu_reboot(RB_HALT, NULL);
1179         else if (len == 6 && strcmp(str, "reboot") == 0)
1180                 cpu_reboot(0, NULL);
1181 #if defined(DDB)
1182         else if (len == 3 && strcmp(str, "ddb") == 0)
1183                 console_debugger();
1184 #endif
1185
1186         cp = str + len - 1;
1187         c = *cp;
1188
1189         if ((wname = getwedgename(str, len)) != NULL) {
1190                 if ((dv = dkwedge_find_by_wname(wname)) == NULL)
1191                         return NULL;
1192                 part = defpart;
1193                 goto gotdisk;
1194         } else if (c >= 'a' && c <= ('a' + MAXPARTITIONS - 1)) {
1195                 part = c - 'a';
1196                 *cp = '\0';
1197         } else
1198                 part = defpart;
1199
1200         dv = finddevice(str);
1201         if (dv != NULL) {
1202                 if (device_class(dv) == DV_DISK) {
1203  gotdisk:
1204                         majdev = devsw_name2blk(device_xname(dv), NULL, 0);
1205                         if (majdev < 0)
1206                                 panic("parsedisk");
1207                         if (DEV_USES_PARTITIONS(dv))
1208                                 *devp = MAKEDISKDEV(majdev, device_unit(dv),
1209                                                     part);
1210                         else
1211                                 *devp = makedev(majdev, device_unit(dv));
1212                 }
1213
1214                 if (device_class(dv) == DV_IFNET)
1215                         *devp = NODEV;
1216         }
1217
1218         *cp = c;
1219         return (dv);
1220 }
1221
/*
 * snprintf() `bytes' into `buf', reformatting it so that the number
 * (plus a possible ` x' prefix and the suffix) fits into `len' bytes,
 * including the terminating NUL.  The value is divided by `divisor'
 * as many times as needed, and the matching prefix letter is emitted.
 *
 * A divisor of 1024 selects the prefixes K, M, G, T, P, E; any other
 * divisor selects the SI set k, M, G, T, P, E.
 *
 * Returns the value returned by snprintf() (the number of characters
 * formatted), or -1 if `buf' or `suffix' is NULL, `divisor' is not
 * positive, or `len' is too small to hold `x y' + suffix + NUL.
 *
 * E.g, given a len of 9, a suffix of `B' and a divisor of 1024:
 *	bytes		result
 *	-----		------
 *	99999		`99999 B'
 *	100000		`97 KB'
 *	66715648	`65152 KB'
 *	252215296	`240 MB'
 */
int
humanize_number(char *buf, size_t len, uint64_t bytes, const char *suffix,
    int divisor)
{
	/* prefixes are: (none), kilo, Mega, Giga, Tera, Peta, Exa */
	const char *prefixes;
	int		r;
	uint64_t	umax;
	size_t		i, suffixlen;
	bool		unbounded;

	if (buf == NULL || suffix == NULL)
		return (-1);
	if (len > 0)
		buf[0] = '\0';
	/* a zero divisor would trap in the scaling loop below */
	if (divisor <= 0)
		return (-1);
	suffixlen = strlen(suffix);
	/* check if enough room for `x y' + suffix + `\0' */
	if (len < 4 + suffixlen)
		return (-1);

	if (divisor == 1024) {
		/*
		 * binary multiplies
		 * XXX IEC 60027-2 recommends Ki, Mi, Gi...
		 */
		prefixes = " KMGTPE";
	} else
		prefixes = " kMGTPE"; /* SI for decimal multiplies */

	/*
	 * Compute umax, the smallest power of ten that no longer fits
	 * in the space left for digits.  If that power of ten cannot be
	 * represented in 64 bits, every uint64_t value fits unscaled;
	 * note this rather than letting umax wrap around.
	 */
	umax = 1;
	unbounded = false;
	for (i = 0; i < len - suffixlen - 3; i++) {
		if (umax > ~(uint64_t)0 / 10) {
			unbounded = true;
			break;
		}
		umax *= 10;
		if (umax > bytes)
			break;
	}

	/* scale down until the number fits or we run out of prefixes */
	for (i = 0; !unbounded && bytes >= umax && prefixes[i + 1]; i++)
		bytes /= divisor;

	r = snprintf(buf, len, "%llu%s%c%s", (unsigned long long)bytes,
	    i == 0 ? "" : " ", prefixes[i], suffix);

	return (r);
}
1277
/*
 * Format `bytes' into `buf' using humanize_number() with a `B' suffix and
 * binary (1024) scaling.  When the value was small enough to be printed
 * without a prefix, the trailing ` B' is removed so only the number
 * remains.  Returns whatever humanize_number() returned.
 */
int
format_bytes(char *buf, size_t len, uint64_t bytes)
{
	size_t	tail;
	int	result;

	result = humanize_number(buf, len, bytes, "B", 1024);
	if (result == -1)
		return (result);

	/* nuke the trailing ` B' if it exists */
	tail = strlen(buf) - 2;
	if (strcmp(buf + tail, " B") == 0)
		buf[tail] = '\0';

	return (result);
}
1293
/*
 * Report whether system call tracing applies to process `p'.
 *
 * Always true when the kernel is built with SYSCALL_DEBUG.  Otherwise
 * true if ktrace has syscall or sysret tracing selected for the process
 * (KTRACE kernels), or if the process has PSL_SYSCALL set (PTRACE
 * kernels).
 */
bool
trace_is_enabled(struct proc *p)
{
#if defined(SYSCALL_DEBUG)
	return (true);
#endif
#if defined(KTRACE)
	if ((p->p_traceflag & (KTRFAC_SYSCALL | KTRFAC_SYSRET)) != 0) {
		return (true);
	}
#endif
#if defined(PTRACE)
	if ((p->p_slflag & PSL_SYSCALL) != 0) {
		return (true);
	}
#endif

	return (false);
}
1314
1315 /*
1316  * Start trace of particular system call. If process is being traced,
1317  * this routine is called by MD syscall dispatch code just before
1318  * a system call is actually executed.
1319  */
1320 int
1321 trace_enter(register_t code, const register_t *args, int narg)
1322 {
1323 #ifdef SYSCALL_DEBUG
1324         scdebug_call(code, args);
1325 #endif /* SYSCALL_DEBUG */
1326
1327         ktrsyscall(code, args, narg);
1328
1329 #ifdef PTRACE
1330         if ((curlwp->l_proc->p_slflag & (PSL_SYSCALL|PSL_TRACED)) ==
1331             (PSL_SYSCALL|PSL_TRACED))
1332                 process_stoptrace();
1333 #endif
1334         return 0;
1335 }
1336
1337 /*
1338  * End trace of particular system call. If process is being traced,
1339  * this routine is called by MD syscall dispatch code just after
1340  * a system call finishes.
1341  * MD caller guarantees the passed 'code' is within the supported
1342  * system call number range for emulation the process runs under.
1343  */
1344 void
1345 trace_exit(register_t code, register_t rval[], int error)
1346 {
1347 #ifdef SYSCALL_DEBUG
1348         scdebug_ret(code, error, rval);
1349 #endif /* SYSCALL_DEBUG */
1350
1351         ktrsysret(code, error, rval);
1352
1353 #ifdef PTRACE
1354         if ((curlwp->l_proc->p_slflag & (PSL_SYSCALL|PSL_TRACED)) ==
1355             (PSL_SYSCALL|PSL_TRACED))
1356                 process_stoptrace();
1357 #endif
1358 }
1359
1360 int
1361 syscall_establish(const struct emul *em, const struct syscall_package *sp)
1362 {
1363         struct sysent *sy;
1364         int i;
1365
1366         KASSERT(mutex_owned(&module_lock));
1367
1368         if (em == NULL) {
1369                 em = &emul_netbsd;
1370         }
1371         sy = em->e_sysent;
1372
1373         /*
1374          * Ensure that all preconditions are valid, since this is
1375          * an all or nothing deal.  Once a system call is entered,
1376          * it can become busy and we could be unable to remove it
1377          * on error.
1378          */
1379         for (i = 0; sp[i].sp_call != NULL; i++) {
1380                 if (sy[sp[i].sp_code].sy_call != sys_nomodule) {
1381 #ifdef DIAGNOSTIC
1382                         printf("syscall %d is busy\n", sp[i].sp_code);
1383 #endif
1384                         return EBUSY;
1385                 }
1386         }
1387         /* Everything looks good, patch them in. */
1388         for (i = 0; sp[i].sp_call != NULL; i++) {
1389                 sy[sp[i].sp_code].sy_call = sp[i].sp_call;
1390         }
1391
1392         return 0;
1393 }
1394
1395 int
1396 syscall_disestablish(const struct emul *em, const struct syscall_package *sp)
1397 {
1398         struct sysent *sy;
1399         uint64_t where;
1400         lwp_t *l;
1401         int i;
1402
1403         KASSERT(mutex_owned(&module_lock));
1404
1405         if (em == NULL) {
1406                 em = &emul_netbsd;
1407         }
1408         sy = em->e_sysent;
1409
1410         /*
1411          * First, patch the system calls to sys_nomodule to gate further
1412          * activity.
1413          */
1414         for (i = 0; sp[i].sp_call != NULL; i++) {
1415                 KASSERT(sy[sp[i].sp_code].sy_call == sp[i].sp_call);
1416                 sy[sp[i].sp_code].sy_call = sys_nomodule;
1417         }
1418
1419         /*
1420          * Run a cross call to cycle through all CPUs.  This does two
1421          * things: lock activity provides a barrier and makes our update
1422          * of sy_call visible to all CPUs, and upon return we can be sure
1423          * that we see pertinent values of l_sysent posted by remote CPUs.
1424          */
1425         where = xc_broadcast(0, (xcfunc_t)nullop, NULL, NULL);
1426         xc_wait(where);
1427
1428         /*
1429          * Now it's safe to check l_sysent.  Run through all LWPs and see
1430          * if anyone is still using the system call.
1431          */
1432         for (i = 0; sp[i].sp_call != NULL; i++) {
1433                 mutex_enter(proc_lock);
1434                 LIST_FOREACH(l, &alllwp, l_list) {
1435                         if (l->l_sysent == &sy[sp[i].sp_code]) {
1436                                 break;
1437                         }
1438                 }
1439                 mutex_exit(proc_lock);
1440                 if (l == NULL) {
1441                         continue;
1442                 }
1443                 /*
1444                  * We lose: one or more calls are still in use.  Put back
1445                  * the old entrypoints and act like nothing happened.
1446                  * When we drop module_lock, any system calls held in
1447                  * sys_nomodule() will be restarted.
1448                  */
1449                 for (i = 0; sp[i].sp_call != NULL; i++) {
1450                         sy[sp[i].sp_code].sy_call = sp[i].sp_call;
1451                 }
1452                 return EBUSY;
1453         }
1454
1455         return 0;
1456 }