usr/src/uts/common/inet/ip/ip_mroute.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 /* Copyright (c) 1990 Mentat Inc. */
  25
  26 /*
  27  * Copyright (c) 2018, Joyent, Inc.
  28  * Copyright 2024 Oxide Computer Company
  29  */
  30
  31 /*
  32  * Procedures for the kernel part of DVMRP,
  33  * a Distance-Vector Multicast Routing Protocol.
  34  * (See RFC-1075)
  35  * Written by David Waitzman, BBN Labs, August 1988.
  36  * Modified by Steve Deering, Stanford, February 1989.
  37  * Modified by Mark J. Steiglitz, Stanford, May, 1991
  38  * Modified by Van Jacobson, LBL, January 1993
  39  * Modified by Ajit Thyagarajan, PARC, August 1993
  40  * Modified by Bill Fenner, PARC, April 1995
  41  *
  42  * MROUTING 3.5
  43  */
  44
  45 /*
  46  * TODO
  47  * - function pointer field in vif, void *vif_sendit()
  48  */
  49
  50 #include <sys/types.h>
  51 #include <sys/stream.h>
  52 #include <sys/stropts.h>
  53 #include <sys/strlog.h>
  54 #include <sys/systm.h>
  55 #include <sys/ddi.h>
  56 #include <sys/cmn_err.h>
  57 #include <sys/zone.h>
  58
  59 #include <sys/param.h>
  60 #include <sys/socket.h>
  61 #include <sys/vtrace.h>
  62 #include <sys/debug.h>
  63 #include <net/if.h>
  64 #include <sys/sockio.h>
  65 #include <netinet/in.h>
  66 #include <net/if_dl.h>
  67
  68 #include <inet/ipsec_impl.h>
  69 #include <inet/common.h>
  70 #include <inet/mi.h>
  71 #include <inet/nd.h>
  72 #include <inet/tunables.h>
  73 #include <inet/mib2.h>
  74 #include <netinet/ip6.h>
  75 #include <inet/ip.h>
  76 #include <inet/snmpcom.h>
  77
  78 #include <netinet/igmp.h>
  79 #include <netinet/igmp_var.h>
  80 #include <netinet/udp.h>
  81 #include <netinet/ip_mroute.h>
  82 #include <inet/ip_multi.h>
  83 #include <inet/ip_ire.h>
  84 #include <inet/ip_ndp.h>
  85 #include <inet/ip_if.h>
  86 #include <inet/ipclassifier.h>
  87
  88 #include <netinet/pim.h>
  89
  90
  91 /*
  92  * MT Design:
  93  *
  94  * There are three main data structures viftable, mfctable and tbftable that
  95  * need to be protected against MT races.
  96  *
  97  * viftable is a fixed length array of vif structs. There is no lock to protect
  98  * the whole array, instead each struct is protected by its own individual lock.
  99  * The value of v_marks in conjunction with the value of v_refcnt determines the
 100  * current state of a vif structure. One special state that needs mention
 101  * is when the vif is marked VIF_MARK_NOTINUSE but refcnt != 0. This indicates
 102  * that vif is being initialized.
 103  * Each structure is freed when the refcnt goes down to zero. If a delete comes
 104  * in when the refcnt is > 1, the vif structure is marked VIF_MARK_CONDEMNED
 105  * which prevents the struct from further use.  When the refcnt goes to zero
 106  * the struct is freed and is marked VIF_MARK_NOTINUSE.
 107  * vif struct stores a pointer to the ipif in v_ipif, to prevent ipif/ill
 108  * from going away a refhold is put on the ipif before using it. see
 109  * lock_good_vif() and unlock_good_vif().
 110  *
 111  * VIF_REFHOLD and VIF_REFRELE macros have been provided to manipulate refcnts
 112  * of the vif struct.
 113  *
 114  * tbftable is also a fixed length array of tbf structs and is only accessed
 115  * via v_tbf.  It is protected by its own lock tbf_lock.
 116  *
 117  * Lock Ordering is
 118  * v_lock --> tbf_lock
 119  * v_lock --> ill_lock
 120  *
 121  * mfctable is a fixed size hash table of mfc bucket structs (struct mfcb).
 122  * Each mfc bucket struct (struct mfcb) maintains a refcnt for each walker,
 123  * it also maintains a state. These fields are protected by a lock (mfcb_lock).
 124  * mfc structs only maintain a state and have no refcnt. mfc_mutex is used to
 125  * protect the struct elements.
 126  *
 127  * mfc structs are dynamically allocated and are singly linked
 128  * at the head of the chain. When an mfc structure is to be deleted
 129  * it is marked condemned and so is the state in the bucket struct.
 130  * When the last walker of the hash bucket exits all the mfc structs
 131  * marked condemned are freed.
 132  *
 133  * Locking Hierarchy:
 134  * The bucket lock should be acquired before the mfc struct lock.
 135  * MFCB_REFHOLD and MFCB_REFRELE macros are provided for locking
 136  * operations on the bucket struct.
 137  *
 138  * last_encap_lock and numvifs_mutex should be acquired after
 139  * acquiring vif or mfc locks. These locks protect some global variables.
 140  *
 141  * The statistics are not currently protected by a lock
 142  * causing the stats to be approximate, not exact.
 143  */
 144
 145 #define NO_VIF  MAXVIFS         /* from mrouted, no route for src */
 146
 147 /*
 148  * Timeouts:
 149  *      Upcall timeouts - BSD uses boolean_t mfc->expire and
 150  *      nexpire[MFCTBLSIZE], the number of times expire has been called.
 151  *      SunOS 5.x uses mfc->timeout for each mfc.
 152  *      Some Unixes are limited in the number of simultaneous timeouts
 153  *      that can be run, SunOS 5.x does not have this restriction.
 154  */
 155
 156 /*
 157  * In BSD, EXPIRE_TIMEOUT is how often expire_upcalls() is called and
 158  * UPCALL_EXPIRE is the nmber of timeouts before a particular upcall
 159  * expires. Thus the time till expiration is EXPIRE_TIMEOUT * UPCALL_EXPIRE
 160  */
 161 #define         EXPIRE_TIMEOUT  (hz/4)  /* 4x / second  */
 162 #define         UPCALL_EXPIRE   6       /* number of timeouts   */
 163
 164 /*
 165  * Hash function for a source, group entry
 166  */
 167 #define MFCHASH(a, g) MFCHASHMOD(((a) >> 20) ^ ((a) >> 10) ^ (a) ^ \
 168         ((g) >> 20) ^ ((g) >> 10) ^ (g))
 169
 170 #define                 TBF_REPROCESS   (hz / 100)      /* 100x /second */
 171
 172 /* Identify PIM packet that came on a Register interface */
 173 #define PIM_REGISTER_MARKER     0xffffffff
 174
 175 /* Function declarations */
 176 static int      add_mfc(struct mfcctl *, ip_stack_t *);
 177 static int      add_vif(struct vifctl *, conn_t *, ip_stack_t *);
 178 static int      del_mfc(struct mfcctl *, ip_stack_t *);
 179 static int      del_vif(vifi_t *, ip_stack_t *);
 180 static void     del_vifp(struct vif *);
 181 static void     encap_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
 182 static void     expire_upcalls(void *);
 183 static void     fill_route(struct mfc *, struct mfcctl *, ip_stack_t *);
 184 static void     free_queue(struct mfc *);
 185 static int      get_assert(uchar_t *, ip_stack_t *);
 186 static int      get_lsg_cnt(struct sioc_lsg_req *, ip_stack_t *);
 187 static int      get_sg_cnt(struct sioc_sg_req *, ip_stack_t *);
 188 static int      get_version(uchar_t *);
 189 static int      get_vif_cnt(struct sioc_vif_req *, ip_stack_t *);
 190 static int      ip_mdq(mblk_t *, ipha_t *, ill_t *,
 191                     ipaddr_t, struct mfc *);
 192 static int      ip_mrouter_init(conn_t *, uchar_t *, int, ip_stack_t *);
 193 static void     phyint_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
 194 static int      register_mforward(mblk_t *, ip_recv_attr_t *);
 195 static void     register_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
 196 static int      set_assert(int *, ip_stack_t *);
 197
 198 /*
 199  * Token Bucket Filter functions
 200  */
 201 static int  priority(struct vif *, ipha_t *);
 202 static void tbf_control(struct vif *, mblk_t *, ipha_t *);
 203 static int  tbf_dq_sel(struct vif *, ipha_t *);
 204 static void tbf_process_q(struct vif *);
 205 static void tbf_queue(struct vif *, mblk_t *);
 206 static void tbf_reprocess_q(void *);
 207 static void tbf_send_packet(struct vif *, mblk_t *);
 208 static void tbf_update_tokens(struct vif *);
 209 static void release_mfc(struct mfcb *);
 210
 211 static boolean_t is_mrouter_off(ip_stack_t *);
 212 /*
 213  * Encapsulation packets
 214  */
 215
 216 #define ENCAP_TTL       64
 217
 218 /* prototype IP hdr for encapsulated packets */
 219 static ipha_t multicast_encap_iphdr = {
 220         IP_SIMPLE_HDR_VERSION,
 221         0,                              /* tos */
 222         sizeof (ipha_t),                /* total length */
 223         0,                              /* id */
 224         0,                              /* frag offset */
 225         ENCAP_TTL, IPPROTO_ENCAP,
 226         0,                              /* checksum */
 227 };
 228
 229 /*
 230  * Rate limit for assert notification messages, in nsec.
 231  */
 232 #define ASSERT_MSG_TIME         3000000000
 233
 234
 235 #define VIF_REFHOLD(vifp) {                     \
 236         mutex_enter(&(vifp)->v_lock);           \
 237         (vifp)->v_refcnt++;                     \
 238         mutex_exit(&(vifp)->v_lock);            \
 239 }
 240
 241 #define VIF_REFRELE_LOCKED(vifp) {                              \
 242         (vifp)->v_refcnt--;                                     \
 243         if ((vifp)->v_refcnt == 0 &&                            \
 244                 ((vifp)->v_marks & VIF_MARK_CONDEMNED)) {       \
 245                         del_vifp(vifp);                         \
 246         } else {                                                \
 247                 mutex_exit(&(vifp)->v_lock);                    \
 248         }                                                       \
 249 }
 250
 251 #define VIF_REFRELE(vifp) {                                     \
 252         mutex_enter(&(vifp)->v_lock);                           \
 253         (vifp)->v_refcnt--;                                     \
 254         if ((vifp)->v_refcnt == 0 &&                            \
 255                 ((vifp)->v_marks & VIF_MARK_CONDEMNED)) {       \
 256                         del_vifp(vifp);                         \
 257         } else {                                                \
 258                 mutex_exit(&(vifp)->v_lock);                    \
 259         }                                                       \
 260 }
 261
 262 #define MFCB_REFHOLD(mfcb) {                            \
 263         mutex_enter(&(mfcb)->mfcb_lock);                \
 264         (mfcb)->mfcb_refcnt++;                          \
 265         ASSERT((mfcb)->mfcb_refcnt != 0);               \
 266         mutex_exit(&(mfcb)->mfcb_lock);                 \
 267 }
 268
 269 #define MFCB_REFRELE(mfcb) {                                    \
 270         mutex_enter(&(mfcb)->mfcb_lock);                        \
 271         ASSERT((mfcb)->mfcb_refcnt != 0);                       \
 272         if (--(mfcb)->mfcb_refcnt == 0 &&                       \
 273                 ((mfcb)->mfcb_marks & MFCB_MARK_CONDEMNED)) {   \
 274                         release_mfc(mfcb);                      \
 275         }                                                       \
 276         mutex_exit(&(mfcb)->mfcb_lock);                         \
 277 }
 278
 279 /*
 280  * MFCFIND:
 281  * Find a route for a given origin IP address and multicast group address.
 282  * Skip entries with pending upcalls.
 283  * Type of service parameter to be added in the future!
 284  */
 285 #define MFCFIND(mfcbp, o, g, rt) { \
 286         struct mfc *_mb_rt = NULL; \
 287         rt = NULL; \
 288         _mb_rt = mfcbp->mfcb_mfc; \
 289         while (_mb_rt) { \
 290                 if ((_mb_rt->mfc_origin.s_addr == o) && \
 291                     (_mb_rt->mfc_mcastgrp.s_addr == g) && \
 292                     (_mb_rt->mfc_rte == NULL) && \
 293                     (!(_mb_rt->mfc_marks & MFCB_MARK_CONDEMNED))) {        \
 294                     rt = _mb_rt; \
 295                     break; \
 296                 } \
 297         _mb_rt = _mb_rt->mfc_next; \
 298         } \
 299 }
 300
 301 /*
 302  * BSD uses timeval with sec and usec. In SunOS 5.x uniqtime() and gethrtime()
 303  * are inefficient. We use gethrestime() which returns a timespec_t with
 304  * sec and nsec, the resolution is machine dependent.
 305  * The following 2 macros have been changed to use nsec instead of usec.
 306  */
 307 /*
 308  * Macros to compute elapsed time efficiently.
 309  * Borrowed from Van Jacobson's scheduling code.
 310  * Delta should be a hrtime_t.
 311  */
 312 #define TV_DELTA(a, b, delta) { \
 313         int xxs; \
 314  \
 315         delta = (a).tv_nsec - (b).tv_nsec; \
 316         if ((xxs = (a).tv_sec - (b).tv_sec) != 0) { \
 317                 switch (xxs) { \
 318                 case 2: \
 319                     delta += 1000000000; \
 320                     /*FALLTHROUGH*/ \
 321                 case 1: \
 322                     delta += 1000000000; \
 323                     break; \
 324                 default: \
 325                     delta += (1000000000 * xxs); \
 326                 } \
 327         } \
 328 }
 329
 330 #define TV_LT(a, b) (((a).tv_nsec < (b).tv_nsec && \
 331         (a).tv_sec <= (b).tv_sec) || (a).tv_sec < (b).tv_sec)
 332
 333 /*
 334  * Handle MRT setsockopt commands to modify the multicast routing tables.
 335  */
 336 int
 337 ip_mrouter_set(int cmd, conn_t *connp, int checkonly, uchar_t *data,
 338     int datalen)
 339 {
 340         ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
 341
 342         mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
 343         if (cmd != MRT_INIT && connp != ipst->ips_ip_g_mrouter) {
 344                 mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
 345                 return (EACCES);
 346         }
 347         mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
 348
 349         if (checkonly) {
 350                 /*
 351                  * do not do operation, just pretend to - new T_CHECK
 352                  * Note: Even routines further on can probably fail but
 353                  * this T_CHECK stuff is only to please XTI so it not
 354                  * necessary to be perfect.
 355                  */
 356                 switch (cmd) {
 357                 case MRT_INIT:
 358                 case MRT_DONE:
 359                 case MRT_ADD_VIF:
 360                 case MRT_DEL_VIF:
 361                 case MRT_ADD_MFC:
 362                 case MRT_DEL_MFC:
 363                 case MRT_ASSERT:
 364                         return (0);
 365                 default:
 366                         return (EOPNOTSUPP);
 367                 }
 368         }
 369
 370         /*
 371          * make sure no command is issued after multicast routing has been
 372          * turned off.
 373          */
 374         if (cmd != MRT_INIT && cmd != MRT_DONE) {
 375                 if (is_mrouter_off(ipst))
 376                         return (EINVAL);
 377         }
 378
 379         switch (cmd) {
 380         case MRT_INIT:  return (ip_mrouter_init(connp, data, datalen, ipst));
 381         case MRT_DONE:  return (ip_mrouter_done(ipst));
 382         case MRT_ADD_VIF:  return (add_vif((struct vifctl *)data, connp, ipst));
 383         case MRT_DEL_VIF:  return (del_vif((vifi_t *)data, ipst));
 384         case MRT_ADD_MFC:  return (add_mfc((struct mfcctl *)data, ipst));
 385         case MRT_DEL_MFC:  return (del_mfc((struct mfcctl *)data, ipst));
 386         case MRT_ASSERT:   return (set_assert((int *)data, ipst));
 387         default:           return (EOPNOTSUPP);
 388         }
 389 }
 390
 391 /*
 392  * Handle MRT getsockopt commands
 393  */
 394 int
 395 ip_mrouter_get(int cmd, conn_t *connp, uchar_t *data)
 396 {
 397         ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
 398
 399         if (connp != ipst->ips_ip_g_mrouter)
 400                 return (EACCES);
 401
 402         switch (cmd) {
 403         case MRT_VERSION:       return (get_version((uchar_t *)data));
 404         case MRT_ASSERT:        return (get_assert((uchar_t *)data, ipst));
 405         default:                return (EOPNOTSUPP);
 406         }
 407 }
 408
 409 /*
 410  * Handle ioctl commands to obtain information from the cache.
 411  * Called with shared access to IP. These are read_only ioctls.
 412  */
 413 /* ARGSUSED */
 414 int
 415 mrt_ioctl(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
 416     ip_ioctl_cmd_t *ipip, void *if_req)
 417 {
 418         mblk_t  *mp1;
 419         struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
 420         conn_t          *connp = Q_TO_CONN(q);
 421         ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
 422
 423         /* Existence verified in ip_wput_nondata */
 424         mp1 = mp->b_cont->b_cont;
 425
 426         switch (iocp->ioc_cmd) {
 427         case (SIOCGETVIFCNT):
 428                 return (get_vif_cnt((struct sioc_vif_req *)mp1->b_rptr, ipst));
 429         case (SIOCGETSGCNT):
 430                 return (get_sg_cnt((struct sioc_sg_req *)mp1->b_rptr, ipst));
 431         case (SIOCGETLSGCNT):
 432                 return (get_lsg_cnt((struct sioc_lsg_req *)mp1->b_rptr, ipst));
 433         default:
 434                 return (EINVAL);
 435         }
 436 }
 437
 438 /*
 439  * Returns the packet, byte, rpf-failure count for the source, group provided.
 440  */
 441 static int
 442 get_sg_cnt(struct sioc_sg_req *req, ip_stack_t *ipst)
 443 {
 444         struct mfc *rt;
 445         struct mfcb *mfcbp;
 446
 447         mfcbp = &ipst->ips_mfcs[MFCHASH(req->src.s_addr, req->grp.s_addr)];
 448         MFCB_REFHOLD(mfcbp);
 449         MFCFIND(mfcbp, req->src.s_addr, req->grp.s_addr, rt);
 450
 451         if (rt != NULL) {
 452                 mutex_enter(&rt->mfc_mutex);
 453                 req->pktcnt   = rt->mfc_pkt_cnt;
 454                 req->bytecnt  = rt->mfc_byte_cnt;
 455                 req->wrong_if = rt->mfc_wrong_if;
 456                 mutex_exit(&rt->mfc_mutex);
 457         } else
 458                 req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffffU;
 459
 460         MFCB_REFRELE(mfcbp);
 461         return (0);
 462 }
 463
 464 /*
 465  * Returns the packet, byte, rpf-failure count for the source, group provided.
 466  * Uses larger counters and IPv6 addresses.
 467  */
 468 /* ARGSUSED XXX until implemented */
 469 static int
 470 get_lsg_cnt(struct sioc_lsg_req *req, ip_stack_t *ipst)
 471 {
 472         /* XXX TODO SIOCGETLSGCNT */
 473         return (ENXIO);
 474 }
 475
 476 /*
 477  * Returns the input and output packet and byte counts on the vif provided.
 478  */
 479 static int
 480 get_vif_cnt(struct sioc_vif_req *req, ip_stack_t *ipst)
 481 {
 482         vifi_t vifi = req->vifi;
 483
 484         if (vifi >= ipst->ips_numvifs)
 485                 return (EINVAL);
 486
 487         /*
 488          * No locks here, an approximation is fine.
 489          */
 490         req->icount = ipst->ips_vifs[vifi].v_pkt_in;
 491         req->ocount = ipst->ips_vifs[vifi].v_pkt_out;
 492         req->ibytes = ipst->ips_vifs[vifi].v_bytes_in;
 493         req->obytes = ipst->ips_vifs[vifi].v_bytes_out;
 494
 495         return (0);
 496 }
 497
 498 static int
 499 get_version(uchar_t *data)
 500 {
 501         int *v = (int *)data;
 502
 503         *v = 0x0305;    /* XXX !!!! */
 504
 505         return (0);
 506 }
 507
 508 /*
 509  * Set PIM assert processing global.
 510  */
 511 static int
 512 set_assert(int *i, ip_stack_t *ipst)
 513 {
 514         if ((*i != 1) && (*i != 0))
 515                 return (EINVAL);
 516
 517         ipst->ips_pim_assert = *i;
 518
 519         return (0);
 520 }
 521
 522 /*
 523  * Get PIM assert processing global.
 524  */
 525 static int
 526 get_assert(uchar_t *data, ip_stack_t *ipst)
 527 {
 528         int *i = (int *)data;
 529
 530         *i = ipst->ips_pim_assert;
 531
 532         return (0);
 533 }
 534
 535 /*
 536  * Enable multicast routing.
 537  */
 538 static int
 539 ip_mrouter_init(conn_t *connp, uchar_t *data, int datalen, ip_stack_t *ipst)
 540 {
 541         int     *v;
 542
 543         if (data == NULL || (datalen != sizeof (int)))
 544                 return (ENOPROTOOPT);
 545
 546         v = (int *)data;
 547         if (*v != 1)
 548                 return (ENOPROTOOPT);
 549
 550         mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
 551         if (ipst->ips_ip_g_mrouter != NULL) {
 552                 mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
 553                 return (EADDRINUSE);
 554         }
 555
 556         /*
 557          * MRT_INIT should only be allowed for RAW sockets, but we double
 558          * check.
 559          */
 560         if (!IPCL_IS_RAWIP(connp)) {
 561                 mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
 562                 return (EINVAL);
 563         }
 564
 565         ipst->ips_ip_g_mrouter = connp;
 566         connp->conn_multi_router = 1;
 567         /* In order for tunnels to work we have to turn ip_g_forward on */
 568         if (!WE_ARE_FORWARDING(ipst)) {
 569                 if (ipst->ips_ip_mrtdebug > 1) {
 570                         (void) mi_strlog(connp->conn_rq, 1, SL_TRACE,
 571                             "ip_mrouter_init: turning on forwarding");
 572                 }
 573                 ipst->ips_saved_ip_forwarding = ipst->ips_ip_forwarding;
 574                 ipst->ips_ip_forwarding = IP_FORWARD_ALWAYS;
 575         }
 576
 577         mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
 578         return (0);
 579 }
 580
 581 void
 582 ip_mrouter_stack_init(ip_stack_t *ipst)
 583 {
 584         mutex_init(&ipst->ips_ip_g_mrouter_mutex, NULL, MUTEX_DEFAULT, NULL);
 585
 586         ipst->ips_vifs = kmem_zalloc(sizeof (struct vif) * (MAXVIFS+1),
 587             KM_SLEEP);
 588         ipst->ips_mrtstat = kmem_zalloc(sizeof (struct mrtstat), KM_SLEEP);
 589         /*
 590          * mfctable:
 591          * Includes all mfcs, including waiting upcalls.
 592          * Multiple mfcs per bucket.
 593          */
 594         ipst->ips_mfcs = kmem_zalloc(sizeof (struct mfcb) * MFCTBLSIZ,
 595             KM_SLEEP);
 596         /*
 597          * Define the token bucket filter structures.
 598          * tbftable -> each vif has one of these for storing info.
 599          */
 600         ipst->ips_tbfs = kmem_zalloc(sizeof (struct tbf) * MAXVIFS, KM_SLEEP);
 601
 602         mutex_init(&ipst->ips_last_encap_lock, NULL, MUTEX_DEFAULT, NULL);
 603
 604         ipst->ips_mrtstat->mrts_vifctlSize = sizeof (struct vifctl);
 605         ipst->ips_mrtstat->mrts_mfcctlSize = sizeof (struct mfcctl);
 606 }
 607
 608 /*
 609  * Disable multicast routing.
 610  * Didn't use global timeout_val (BSD version), instead check the mfctable.
 611  */
 612 int
 613 ip_mrouter_done(ip_stack_t *ipst)
 614 {
 615         conn_t          *mrouter;
 616         vifi_t          vifi;
 617         struct mfc      *mfc_rt;
 618         int             i;
 619
 620         mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
 621         if (ipst->ips_ip_g_mrouter == NULL) {
 622                 mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
 623                 return (EINVAL);
 624         }
 625
 626         mrouter = ipst->ips_ip_g_mrouter;
 627
 628         if (ipst->ips_saved_ip_forwarding != -1) {
 629                 if (ipst->ips_ip_mrtdebug > 1) {
 630                         (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
 631                             "ip_mrouter_done: turning off forwarding");
 632                 }
 633                 ipst->ips_ip_forwarding = ipst->ips_saved_ip_forwarding;
 634                 ipst->ips_saved_ip_forwarding = -1;
 635         }
 636
 637         /*
 638          * Always clear cache when vifs change.
 639          * No need to get ipst->ips_last_encap_lock since we are running as
 640          * a writer.
 641          */
 642         mutex_enter(&ipst->ips_last_encap_lock);
 643         ipst->ips_last_encap_src = 0;
 644         ipst->ips_last_encap_vif = NULL;
 645         mutex_exit(&ipst->ips_last_encap_lock);
 646         mrouter->conn_multi_router = 0;
 647
 648         mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
 649
 650         /*
 651          * For each phyint in use,
 652          * disable promiscuous reception of all IP multicasts.
 653          */
 654         for (vifi = 0; vifi < MAXVIFS; vifi++) {
 655                 struct vif *vifp = ipst->ips_vifs + vifi;
 656
 657                 mutex_enter(&vifp->v_lock);
 658                 /*
 659                  * if the vif is active mark it condemned.
 660                  */
 661                 if (vifp->v_marks & VIF_MARK_GOOD) {
 662                         ASSERT(vifp->v_ipif != NULL);
 663                         ipif_refhold(vifp->v_ipif);
 664                         /* Phyint only */
 665                         if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
 666                                 ipif_t *ipif = vifp->v_ipif;
 667                                 ilm_t *ilm = vifp->v_ilm;
 668
 669                                 vifp->v_ilm = NULL;
 670                                 vifp->v_marks &= ~VIF_MARK_GOOD;
 671                                 vifp->v_marks |= VIF_MARK_CONDEMNED;
 672
 673                                 mutex_exit(&(vifp)->v_lock);
 674                                 if (ilm != NULL) {
 675                                         ill_t *ill = ipif->ipif_ill;
 676
 677                                         (void) ip_delmulti(ilm);
 678                                         ASSERT(ill->ill_mrouter_cnt > 0);
 679                                         atomic_dec_32(&ill->ill_mrouter_cnt);
 680                                 }
 681                                 mutex_enter(&vifp->v_lock);
 682                         }
 683                         ipif_refrele(vifp->v_ipif);
 684                         /*
 685                          * decreases the refcnt added in add_vif.
 686                          * and release v_lock.
 687                          */
 688                         VIF_REFRELE_LOCKED(vifp);
 689                 } else {
 690                         mutex_exit(&vifp->v_lock);
 691                         continue;
 692                 }
 693         }
 694
 695         mutex_enter(&ipst->ips_numvifs_mutex);
 696         ipst->ips_numvifs = 0;
 697         ipst->ips_pim_assert = 0;
 698         ipst->ips_reg_vif_num = ALL_VIFS;
 699         mutex_exit(&ipst->ips_numvifs_mutex);
 700
 701         /*
 702          * Free upcall msgs.
 703          * Go through mfctable and stop any outstanding upcall
 704          * timeouts remaining on mfcs.
 705          */
 706         for (i = 0; i < MFCTBLSIZ; i++) {
 707                 mutex_enter(&ipst->ips_mfcs[i].mfcb_lock);
 708                 ipst->ips_mfcs[i].mfcb_refcnt++;
 709                 ipst->ips_mfcs[i].mfcb_marks |= MFCB_MARK_CONDEMNED;
 710                 mutex_exit(&ipst->ips_mfcs[i].mfcb_lock);
 711                 mfc_rt = ipst->ips_mfcs[i].mfcb_mfc;
 712                 while (mfc_rt) {
 713                         /* Free upcalls */
 714                         mutex_enter(&mfc_rt->mfc_mutex);
 715                         if (mfc_rt->mfc_rte != NULL) {
 716                                 if (mfc_rt->mfc_timeout_id != 0) {
 717                                         /*
 718                                          * OK to drop the lock as we have
 719                                          * a refcnt on the bucket. timeout
 720                                          * can fire but it will see that
 721                                          * mfc_timeout_id == 0 and not do
 722                                          * anything. see expire_upcalls().
 723                                          */
 724                                         mfc_rt->mfc_timeout_id = 0;
 725                                         mutex_exit(&mfc_rt->mfc_mutex);
 726                                         (void) untimeout(
 727                                             mfc_rt->mfc_timeout_id);
 728                                         mfc_rt->mfc_timeout_id = 0;
 729                                         mutex_enter(&mfc_rt->mfc_mutex);
 730
 731                                         /*
 732                                          * all queued upcall packets
 733                                          * and mblk will be freed in
 734                                          * release_mfc().
 735                                          */
 736                                 }
 737                         }
 738
 739                         mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED;
 740
 741                         mutex_exit(&mfc_rt->mfc_mutex);
 742                         mfc_rt = mfc_rt->mfc_next;
 743                 }
 744                 MFCB_REFRELE(&ipst->ips_mfcs[i]);
 745         }
 746
 747         mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
 748         ipst->ips_ip_g_mrouter = NULL;
 749         mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
 750         return (0);
 751 }
 752
 753 void
 754 ip_mrouter_stack_destroy(ip_stack_t *ipst)
 755 {
 756         struct mfcb *mfcbp;
 757         struct mfc  *rt;
 758         int i;
 759
 760         for (i = 0; i < MFCTBLSIZ; i++) {
 761                 mfcbp = &ipst->ips_mfcs[i];
 762
 763                 while ((rt = mfcbp->mfcb_mfc) != NULL) {
 764                         (void) printf("ip_mrouter_stack_destroy: free for %d\n",
 765                             i);
 766
 767                         mfcbp->mfcb_mfc = rt->mfc_next;
 768                         free_queue(rt);
 769                         mi_free(rt);
 770                 }
 771         }
 772         kmem_free(ipst->ips_vifs, sizeof (struct vif) * (MAXVIFS+1));
 773         ipst->ips_vifs = NULL;
 774         kmem_free(ipst->ips_mrtstat, sizeof (struct mrtstat));
 775         ipst->ips_mrtstat = NULL;
 776         kmem_free(ipst->ips_mfcs, sizeof (struct mfcb) * MFCTBLSIZ);
 777         ipst->ips_mfcs = NULL;
 778         kmem_free(ipst->ips_tbfs, sizeof (struct tbf) * MAXVIFS);
 779         ipst->ips_tbfs = NULL;
 780
 781         mutex_destroy(&ipst->ips_last_encap_lock);
 782         mutex_destroy(&ipst->ips_ip_g_mrouter_mutex);
 783 }
 784
 785 static boolean_t
 786 is_mrouter_off(ip_stack_t *ipst)
 787 {
 788         conn_t  *mrouter;
 789
 790         mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
 791         if (ipst->ips_ip_g_mrouter == NULL) {
 792                 mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
 793                 return (B_TRUE);
 794         }
 795
 796         mrouter = ipst->ips_ip_g_mrouter;
 797         if (mrouter->conn_multi_router == 0) {
 798                 mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
 799                 return (B_TRUE);
 800         }
 801         mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
 802         return (B_FALSE);
 803 }
 804
 805 static void
 806 unlock_good_vif(struct vif *vifp)
 807 {
 808         ASSERT(vifp->v_ipif != NULL);
 809         ipif_refrele(vifp->v_ipif);
 810         VIF_REFRELE(vifp);
 811 }
 812
 813 static boolean_t
 814 lock_good_vif(struct vif *vifp)
 815 {
 816         mutex_enter(&vifp->v_lock);
 817         if (!(vifp->v_marks & VIF_MARK_GOOD)) {
 818                 mutex_exit(&vifp->v_lock);
 819                 return (B_FALSE);
 820         }
 821
 822         ASSERT(vifp->v_ipif != NULL);
 823         mutex_enter(&vifp->v_ipif->ipif_ill->ill_lock);
 824         if (!IPIF_CAN_LOOKUP(vifp->v_ipif)) {
 825                 mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock);
 826                 mutex_exit(&vifp->v_lock);
 827                 return (B_FALSE);
 828         }
 829         ipif_refhold_locked(vifp->v_ipif);
 830         mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock);
 831         vifp->v_refcnt++;
 832         mutex_exit(&vifp->v_lock);
 833         return (B_TRUE);
 834 }
 835
 836 /*
 837  * Add a vif to the vif table.
 838  */
 839 static int
 840 add_vif(struct vifctl *vifcp, conn_t *connp, ip_stack_t *ipst)
 841 {
 842         struct vif      *vifp = ipst->ips_vifs + vifcp->vifc_vifi;
 843         ipif_t          *ipif;
 844         int             error = 0;
 845         struct tbf      *v_tbf = ipst->ips_tbfs + vifcp->vifc_vifi;
 846         conn_t          *mrouter = ipst->ips_ip_g_mrouter;
 847         ilm_t           *ilm;
 848         ill_t           *ill;
 849
 850         ASSERT(connp != NULL);
 851
 852         if (vifcp->vifc_vifi >= MAXVIFS)
 853                 return (EINVAL);
 854
 855         if (is_mrouter_off(ipst))
 856                 return (EINVAL);
 857
 858         mutex_enter(&vifp->v_lock);
 859         /*
 860          * Viftable entry should be 0.
 861          * if v_marks == 0 but v_refcnt != 0 means struct is being
 862          * initialized.
 863          *
 864          * Also note that it is very unlikely that we will get a MRT_ADD_VIF
 865          * request while the delete is in progress, mrouted only sends add
 866          * requests when a new interface is added and the new interface cannot
 867          * have the same vifi as an existing interface. We make sure that
 868          * ill_delete will block till the vif is deleted by adding a refcnt
 869          * to ipif in del_vif().
 870          */
 871         if (vifp->v_lcl_addr.s_addr != 0 ||
 872             vifp->v_marks != 0 ||
 873             vifp->v_refcnt != 0) {
 874                 mutex_exit(&vifp->v_lock);
 875                 return (EADDRINUSE);
 876         }
 877
 878         /* Incoming vif should not be 0 */
 879         if (vifcp->vifc_lcl_addr.s_addr == 0) {
 880                 mutex_exit(&vifp->v_lock);
 881                 return (EINVAL);
 882         }
 883
 884         vifp->v_refcnt++;
 885         mutex_exit(&vifp->v_lock);
 886         /* Find the interface with the local address */
 887         ipif = ipif_lookup_addr((ipaddr_t)vifcp->vifc_lcl_addr.s_addr, NULL,
 888             IPCL_ZONEID(connp), ipst);
 889         if (ipif == NULL) {
 890                 VIF_REFRELE(vifp);
 891                 return (EADDRNOTAVAIL);
 892         }
 893
 894         if (ipst->ips_ip_mrtdebug > 1) {
 895                 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
 896                     "add_vif: src 0x%x enter",
 897                     vifcp->vifc_lcl_addr.s_addr);
 898         }
 899
 900         mutex_enter(&vifp->v_lock);
 901         /*
 902          * Always clear cache when vifs change.
 903          * Needed to ensure that src isn't left over from before vif was added.
 904          * No need to get last_encap_lock, since we are running as a writer.
 905          */
 906
 907         mutex_enter(&ipst->ips_last_encap_lock);
 908         ipst->ips_last_encap_src = 0;
 909         ipst->ips_last_encap_vif = NULL;
 910         mutex_exit(&ipst->ips_last_encap_lock);
 911
 912         if (vifcp->vifc_flags & VIFF_TUNNEL) {
 913                 if ((vifcp->vifc_flags & VIFF_SRCRT) != 0) {
 914                         cmn_err(CE_WARN,
 915                             "add_vif: source route tunnels not supported\n");
 916                         VIF_REFRELE_LOCKED(vifp);
 917                         ipif_refrele(ipif);
 918                         return (EOPNOTSUPP);
 919                 }
 920                 vifp->v_rmt_addr  = vifcp->vifc_rmt_addr;
 921
 922         } else {
 923                 /* Phyint or Register vif */
 924                 if (vifcp->vifc_flags & VIFF_REGISTER) {
 925                         /*
 926                          * Note: Since all IPPROTO_IP level options (including
 927                          * MRT_ADD_VIF) are done exclusively via
 928                          * ip_optmgmt_writer(), a lock is not necessary to
 929                          * protect reg_vif_num.
 930                          */
 931                         mutex_enter(&ipst->ips_numvifs_mutex);
 932                         if (ipst->ips_reg_vif_num == ALL_VIFS) {
 933                                 ipst->ips_reg_vif_num = vifcp->vifc_vifi;
 934                                 mutex_exit(&ipst->ips_numvifs_mutex);
 935                         } else {
 936                                 mutex_exit(&ipst->ips_numvifs_mutex);
 937                                 VIF_REFRELE_LOCKED(vifp);
 938                                 ipif_refrele(ipif);
 939                                 return (EADDRINUSE);
 940                         }
 941                 }
 942
 943                 /* Make sure the interface supports multicast */
 944                 if ((ipif->ipif_ill->ill_flags & ILLF_MULTICAST) == 0) {
 945                         VIF_REFRELE_LOCKED(vifp);
 946                         ipif_refrele(ipif);
 947                         if (vifcp->vifc_flags & VIFF_REGISTER) {
 948                                 mutex_enter(&ipst->ips_numvifs_mutex);
 949                                 ipst->ips_reg_vif_num = ALL_VIFS;
 950                                 mutex_exit(&ipst->ips_numvifs_mutex);
 951                         }
 952                         return (EOPNOTSUPP);
 953                 }
 954                 /* Enable promiscuous reception of all IP mcasts from the if */
 955                 mutex_exit(&vifp->v_lock);
 956
 957                 ill = ipif->ipif_ill;
 958                 if (IS_UNDER_IPMP(ill))
 959                         ill = ipmp_ill_hold_ipmp_ill(ill);
 960
 961                 if (ill == NULL) {
 962                         ilm = NULL;
 963                 } else {
 964                         ilm = ip_addmulti(&ipv6_all_zeros, ill,
 965                             ipif->ipif_zoneid, &error);
 966                         if (ilm != NULL)
 967                                 atomic_inc_32(&ill->ill_mrouter_cnt);
 968                         if (IS_UNDER_IPMP(ipif->ipif_ill)) {
 969                                 ill_refrele(ill);
 970                                 ill = ipif->ipif_ill;
 971                         }
 972                 }
 973
 974                 mutex_enter(&vifp->v_lock);
 975                 /*
 976                  * since we released the lock lets make sure that
 977                  * ip_mrouter_done() has not been called.
 978                  */
 979                 if (ilm == NULL || is_mrouter_off(ipst)) {
 980                         if (ilm != NULL) {
 981                                 (void) ip_delmulti(ilm);
 982                                 ASSERT(ill->ill_mrouter_cnt > 0);
 983                                 atomic_dec_32(&ill->ill_mrouter_cnt);
 984                         }
 985                         if (vifcp->vifc_flags & VIFF_REGISTER) {
 986                                 mutex_enter(&ipst->ips_numvifs_mutex);
 987                                 ipst->ips_reg_vif_num = ALL_VIFS;
 988                                 mutex_exit(&ipst->ips_numvifs_mutex);
 989                         }
 990                         VIF_REFRELE_LOCKED(vifp);
 991                         ipif_refrele(ipif);
 992                         return (error?error:EINVAL);
 993                 }
 994                 vifp->v_ilm = ilm;
 995         }
 996         /* Define parameters for the tbf structure */
 997         vifp->v_tbf = v_tbf;
 998         gethrestime(&vifp->v_tbf->tbf_last_pkt_t);
 999         vifp->v_tbf->tbf_n_tok = 0;
1000         vifp->v_tbf->tbf_q_len = 0;
1001         vifp->v_tbf->tbf_max_q_len = MAXQSIZE;
1002         vifp->v_tbf->tbf_q = vifp->v_tbf->tbf_t = NULL;
1003
1004         vifp->v_flags = vifcp->vifc_flags;
1005         vifp->v_threshold = vifcp->vifc_threshold;
1006         vifp->v_lcl_addr = vifcp->vifc_lcl_addr;
1007         vifp->v_ipif = ipif;
1008         ipif_refrele(ipif);
1009         /* Scaling up here, allows division by 1024 in critical code.   */
1010         vifp->v_rate_limit = vifcp->vifc_rate_limit * (1024/1000);
1011         vifp->v_timeout_id = 0;
1012         /* initialize per vif pkt counters */
1013         vifp->v_pkt_in = 0;
1014         vifp->v_pkt_out = 0;
1015         vifp->v_bytes_in = 0;
1016         vifp->v_bytes_out = 0;
1017         mutex_init(&vifp->v_tbf->tbf_lock, NULL, MUTEX_DEFAULT, NULL);
1018
1019         /* Adjust numvifs up, if the vifi is higher than numvifs */
1020         mutex_enter(&ipst->ips_numvifs_mutex);
1021         if (ipst->ips_numvifs <= vifcp->vifc_vifi)
1022                 ipst->ips_numvifs = vifcp->vifc_vifi + 1;
1023         mutex_exit(&ipst->ips_numvifs_mutex);
1024
1025         if (ipst->ips_ip_mrtdebug > 1) {
1026                 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1027                     "add_vif: #%d, lcladdr %x, %s %x, thresh %x, rate %d",
1028                     vifcp->vifc_vifi,
1029                     ntohl(vifcp->vifc_lcl_addr.s_addr),
1030                     (vifcp->vifc_flags & VIFF_TUNNEL) ? "rmtaddr" : "mask",
1031                     ntohl(vifcp->vifc_rmt_addr.s_addr),
1032                     vifcp->vifc_threshold, vifcp->vifc_rate_limit);
1033         }
1034
1035         vifp->v_marks = VIF_MARK_GOOD;
1036         mutex_exit(&vifp->v_lock);
1037         return (0);
1038 }
1039
1040
1041 /* Delete a vif from the vif table. */
1042 static void
1043 del_vifp(struct vif *vifp)
1044 {
1045         struct tbf      *t = vifp->v_tbf;
1046         mblk_t  *mp0;
1047         vifi_t  vifi;
1048         ip_stack_t      *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
1049         conn_t          *mrouter = ipst->ips_ip_g_mrouter;
1050
1051         ASSERT(vifp->v_marks & VIF_MARK_CONDEMNED);
1052         ASSERT(t != NULL);
1053
1054         if (ipst->ips_ip_mrtdebug > 1) {
1055                 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1056                     "del_vif: src 0x%x\n", vifp->v_lcl_addr.s_addr);
1057         }
1058
1059         if (vifp->v_timeout_id != 0) {
1060                 (void) untimeout(vifp->v_timeout_id);
1061                 vifp->v_timeout_id = 0;
1062         }
1063
1064         /*
1065          * Free packets queued at the interface.
1066          * Mrouted takes care of cleaning up mfcs - makes calls to del_mfc.
1067          */
1068         mutex_enter(&t->tbf_lock);
1069         while (t->tbf_q != NULL) {
1070                 mp0 = t->tbf_q;
1071                 t->tbf_q = t->tbf_q->b_next;
1072                 mp0->b_prev = mp0->b_next = NULL;
1073                 freemsg(mp0);
1074         }
1075         mutex_exit(&t->tbf_lock);
1076
1077         /*
1078          * Always clear cache when vifs change.
1079          * No need to get last_encap_lock since we are running as a writer.
1080          */
1081         mutex_enter(&ipst->ips_last_encap_lock);
1082         if (vifp == ipst->ips_last_encap_vif) {
1083                 ipst->ips_last_encap_vif = NULL;
1084                 ipst->ips_last_encap_src = 0;
1085         }
1086         mutex_exit(&ipst->ips_last_encap_lock);
1087
1088         mutex_destroy(&t->tbf_lock);
1089
1090         bzero(vifp->v_tbf, sizeof (*(vifp->v_tbf)));
1091
1092         /* Adjust numvifs down */
1093         mutex_enter(&ipst->ips_numvifs_mutex);
1094         for (vifi = ipst->ips_numvifs; vifi != 0; vifi--) /* vifi is unsigned */
1095                 if (ipst->ips_vifs[vifi - 1].v_lcl_addr.s_addr != 0)
1096                         break;
1097         ipst->ips_numvifs = vifi;
1098         mutex_exit(&ipst->ips_numvifs_mutex);
1099
1100         bzero(vifp, sizeof (*vifp));
1101 }
1102
1103 static int
1104 del_vif(vifi_t *vifip, ip_stack_t *ipst)
1105 {
1106         struct vif      *vifp = ipst->ips_vifs + *vifip;
1107
1108         if (*vifip >= ipst->ips_numvifs)
1109                 return (EINVAL);
1110
1111         mutex_enter(&vifp->v_lock);
1112         /*
1113          * Not initialized
1114          * Here we are not looking at the vif that is being initialized
1115          * i.e vifp->v_marks == 0 and refcnt > 0.
1116          */
1117         if (vifp->v_lcl_addr.s_addr == 0 ||
1118             !(vifp->v_marks & VIF_MARK_GOOD)) {
1119                 mutex_exit(&vifp->v_lock);
1120                 return (EADDRNOTAVAIL);
1121         }
1122
1123         /* Clear VIF_MARK_GOOD and set VIF_MARK_CONDEMNED. */
1124         vifp->v_marks &= ~VIF_MARK_GOOD;
1125         vifp->v_marks |= VIF_MARK_CONDEMNED;
1126
1127         /* Phyint only */
1128         if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
1129                 ipif_t *ipif = vifp->v_ipif;
1130                 ilm_t *ilm = vifp->v_ilm;
1131
1132                 vifp->v_ilm = NULL;
1133
1134                 ASSERT(ipif != NULL);
1135                 /*
1136                  * should be OK to drop the lock as we
1137                  * have marked this as CONDEMNED.
1138                  */
1139                 mutex_exit(&(vifp)->v_lock);
1140                 if (ilm != NULL) {
1141                         (void) ip_delmulti(ilm);
1142                         ASSERT(ipif->ipif_ill->ill_mrouter_cnt > 0);
1143                         atomic_dec_32(&ipif->ipif_ill->ill_mrouter_cnt);
1144                 }
1145                 mutex_enter(&(vifp)->v_lock);
1146         }
1147
1148         if (vifp->v_flags & VIFF_REGISTER) {
1149                 mutex_enter(&ipst->ips_numvifs_mutex);
1150                 ipst->ips_reg_vif_num = ALL_VIFS;
1151                 mutex_exit(&ipst->ips_numvifs_mutex);
1152         }
1153
1154         /*
1155          * decreases the refcnt added in add_vif.
1156          */
1157         VIF_REFRELE_LOCKED(vifp);
1158         return (0);
1159 }
1160
1161 /*
1162  * Add an mfc entry.
1163  */
1164 static int
1165 add_mfc(struct mfcctl *mfccp, ip_stack_t *ipst)
1166 {
1167         struct mfc *rt;
1168         struct rtdetq *rte;
1169         ushort_t nstl;
1170         int i;
1171         struct mfcb *mfcbp;
1172         conn_t          *mrouter = ipst->ips_ip_g_mrouter;
1173
1174         /*
1175          * The value of vifi is NO_VIF (==MAXVIFS) if Mrouted
1176          * did not have a real route for pkt.
1177          * We want this pkt without rt installed in the mfctable to prevent
1178          * multiiple tries, so go ahead and put it in mfctable, it will
1179          * be discarded later in ip_mdq() because the child is NULL.
1180          */
1181
1182         /* Error checking, out of bounds? */
1183         if (mfccp->mfcc_parent > MAXVIFS) {
1184                 ip0dbg(("ADD_MFC: mfcc_parent out of range %d",
1185                     (int)mfccp->mfcc_parent));
1186                 return (EINVAL);
1187         }
1188
1189         if ((mfccp->mfcc_parent != NO_VIF) &&
1190             (ipst->ips_vifs[mfccp->mfcc_parent].v_ipif == NULL)) {
1191                 ip0dbg(("ADD_MFC: NULL ipif for parent vif %d\n",
1192                     (int)mfccp->mfcc_parent));
1193                 return (EINVAL);
1194         }
1195
1196         if (is_mrouter_off(ipst)) {
1197                 return (EINVAL);
1198         }
1199
1200         mfcbp = &ipst->ips_mfcs[MFCHASH(mfccp->mfcc_origin.s_addr,
1201             mfccp->mfcc_mcastgrp.s_addr)];
1202         MFCB_REFHOLD(mfcbp);
1203         MFCFIND(mfcbp, mfccp->mfcc_origin.s_addr,
1204             mfccp->mfcc_mcastgrp.s_addr, rt);
1205
1206         /* If an entry already exists, just update the fields */
1207         if (rt) {
1208                 if (ipst->ips_ip_mrtdebug > 1) {
1209                         (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1210                             "add_mfc: update o %x grp %x parent %x",
1211                             ntohl(mfccp->mfcc_origin.s_addr),
1212                             ntohl(mfccp->mfcc_mcastgrp.s_addr),
1213                             mfccp->mfcc_parent);
1214                 }
1215                 mutex_enter(&rt->mfc_mutex);
1216                 rt->mfc_parent = mfccp->mfcc_parent;
1217
1218                 mutex_enter(&ipst->ips_numvifs_mutex);
1219                 for (i = 0; i < (int)ipst->ips_numvifs; i++)
1220                         rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
1221                 mutex_exit(&ipst->ips_numvifs_mutex);
1222                 mutex_exit(&rt->mfc_mutex);
1223
1224                 MFCB_REFRELE(mfcbp);
1225                 return (0);
1226         }
1227
1228         /*
1229          * Find the entry for which the upcall was made and update.
1230          */
1231         for (rt = mfcbp->mfcb_mfc, nstl = 0; rt; rt = rt->mfc_next) {
1232                 mutex_enter(&rt->mfc_mutex);
1233                 if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) &&
1234                     (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr) &&
1235                     (rt->mfc_rte != NULL) &&
1236                     !(rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
1237                         if (nstl++ != 0)
1238                                 cmn_err(CE_WARN,
1239                                     "add_mfc: %s o %x g %x p %x",
1240                                     "multiple kernel entries",
1241                                     ntohl(mfccp->mfcc_origin.s_addr),
1242                                     ntohl(mfccp->mfcc_mcastgrp.s_addr),
1243                                     mfccp->mfcc_parent);
1244
1245                         if (ipst->ips_ip_mrtdebug > 1) {
1246                                 (void) mi_strlog(mrouter->conn_rq, 1,
1247                                     SL_TRACE,
1248                                     "add_mfc: o %x g %x p %x",
1249                                     ntohl(mfccp->mfcc_origin.s_addr),
1250                                     ntohl(mfccp->mfcc_mcastgrp.s_addr),
1251                                     mfccp->mfcc_parent);
1252                         }
1253                         fill_route(rt, mfccp, ipst);
1254
1255                         /*
1256                          * Prevent cleanup of cache entry.
1257                          * Timer starts in ip_mforward.
1258                          */
1259                         if (rt->mfc_timeout_id != 0) {
1260                                 timeout_id_t id;
1261                                 id = rt->mfc_timeout_id;
1262                                 /*
1263                                  * setting id to zero will avoid this
1264                                  * entry from being cleaned up in
1265                                  * expire_up_calls().
1266                                  */
1267                                 rt->mfc_timeout_id = 0;
1268                                 /*
1269                                  * dropping the lock is fine as we
1270                                  * have a refhold on the bucket.
1271                                  * so mfc cannot be freed.
1272                                  * The timeout can fire but it will see
1273                                  * that mfc_timeout_id == 0 and not cleanup.
1274                                  */
1275                                 mutex_exit(&rt->mfc_mutex);
1276                                 (void) untimeout(id);
1277                                 mutex_enter(&rt->mfc_mutex);
1278                         }
1279
1280                         /*
1281                          * Send all pkts that are queued waiting for the upcall.
1282                          * ip_mdq param tun set to 0 -
1283                          * the return value of ip_mdq() isn't used here,
1284                          * so value we send doesn't matter.
1285                          */
1286                         while (rt->mfc_rte != NULL) {
1287                                 rte = rt->mfc_rte;
1288                                 rt->mfc_rte = rte->rte_next;
1289                                 mutex_exit(&rt->mfc_mutex);
1290                                 (void) ip_mdq(rte->mp, (ipha_t *)
1291                                     rte->mp->b_rptr, rte->ill, 0, rt);
1292                                 freemsg(rte->mp);
1293                                 mi_free((char *)rte);
1294                                 mutex_enter(&rt->mfc_mutex);
1295                         }
1296                 }
1297                 mutex_exit(&rt->mfc_mutex);
1298         }
1299
1300
1301         /*
1302          * It is possible that an entry is being inserted without an upcall
1303          */
1304         if (nstl == 0) {
1305                 mutex_enter(&(mfcbp->mfcb_lock));
1306                 if (ipst->ips_ip_mrtdebug > 1) {
1307                         (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1308                             "add_mfc: no upcall o %x g %x p %x",
1309                             ntohl(mfccp->mfcc_origin.s_addr),
1310                             ntohl(mfccp->mfcc_mcastgrp.s_addr),
1311                             mfccp->mfcc_parent);
1312                 }
1313                 if (is_mrouter_off(ipst)) {
1314                         mutex_exit(&mfcbp->mfcb_lock);
1315                         MFCB_REFRELE(mfcbp);
1316                         return (EINVAL);
1317                 }
1318
1319                 for (rt = mfcbp->mfcb_mfc; rt; rt = rt->mfc_next) {
1320
1321                         mutex_enter(&rt->mfc_mutex);
1322                         if ((rt->mfc_origin.s_addr ==
1323                             mfccp->mfcc_origin.s_addr) &&
1324                             (rt->mfc_mcastgrp.s_addr ==
1325                             mfccp->mfcc_mcastgrp.s_addr) &&
1326                             (!(rt->mfc_marks & MFCB_MARK_CONDEMNED))) {
1327                                 fill_route(rt, mfccp, ipst);
1328                                 mutex_exit(&rt->mfc_mutex);
1329                                 break;
1330                         }
1331                         mutex_exit(&rt->mfc_mutex);
1332                 }
1333
1334                 /* No upcall, so make a new entry into mfctable */
1335                 if (rt == NULL) {
1336                         rt = (struct mfc *)mi_zalloc(sizeof (struct mfc));
1337                         if (rt == NULL) {
1338                                 ip1dbg(("add_mfc: out of memory\n"));
1339                                 mutex_exit(&mfcbp->mfcb_lock);
1340                                 MFCB_REFRELE(mfcbp);
1341                                 return (ENOBUFS);
1342                         }
1343
1344                         /* Insert new entry at head of hash chain */
1345                         mutex_enter(&rt->mfc_mutex);
1346                         fill_route(rt, mfccp, ipst);
1347
1348                         /* Link into table */
1349                         rt->mfc_next   = mfcbp->mfcb_mfc;
1350                         mfcbp->mfcb_mfc = rt;
1351                         mutex_exit(&rt->mfc_mutex);
1352                 }
1353                 mutex_exit(&mfcbp->mfcb_lock);
1354         }
1355
1356         MFCB_REFRELE(mfcbp);
1357         return (0);
1358 }
1359
1360 /*
1361  * Fills in mfc structure from mrouted mfcctl.
1362  */
1363 static void
1364 fill_route(struct mfc *rt, struct mfcctl *mfccp, ip_stack_t *ipst)
1365 {
1366         int i;
1367
1368         rt->mfc_origin          = mfccp->mfcc_origin;
1369         rt->mfc_mcastgrp        = mfccp->mfcc_mcastgrp;
1370         rt->mfc_parent          = mfccp->mfcc_parent;
1371         mutex_enter(&ipst->ips_numvifs_mutex);
1372         for (i = 0; i < (int)ipst->ips_numvifs; i++) {
1373                 rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
1374         }
1375         mutex_exit(&ipst->ips_numvifs_mutex);
1376         /* Initialize pkt counters per src-grp */
1377         rt->mfc_pkt_cnt = 0;
1378         rt->mfc_byte_cnt        = 0;
1379         rt->mfc_wrong_if        = 0;
1380         rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_nsec = 0;
1381
1382 }
1383
1384 static void
1385 free_queue(struct mfc *mfcp)
1386 {
1387         struct rtdetq *rte0;
1388
1389         /*
1390          * Drop all queued upcall packets.
1391          * Free the mbuf with the pkt.
1392          */
1393         while ((rte0 = mfcp->mfc_rte) != NULL) {
1394                 mfcp->mfc_rte = rte0->rte_next;
1395                 freemsg(rte0->mp);
1396                 mi_free((char *)rte0);
1397         }
1398 }
1399 /*
1400  * go thorugh the hash bucket and free all the entries marked condemned.
1401  */
1402 void
1403 release_mfc(struct mfcb *mfcbp)
1404 {
1405         struct mfc *current_mfcp;
1406         struct mfc *prev_mfcp;
1407
1408         prev_mfcp = current_mfcp = mfcbp->mfcb_mfc;
1409
1410         while (current_mfcp != NULL) {
1411                 if (current_mfcp->mfc_marks & MFCB_MARK_CONDEMNED) {
1412                         if (current_mfcp == mfcbp->mfcb_mfc) {
1413                                 mfcbp->mfcb_mfc = current_mfcp->mfc_next;
1414                                 free_queue(current_mfcp);
1415                                 mi_free(current_mfcp);
1416                                 prev_mfcp = current_mfcp = mfcbp->mfcb_mfc;
1417                                 continue;
1418                         }
1419                         ASSERT(prev_mfcp != NULL);
1420                         prev_mfcp->mfc_next = current_mfcp->mfc_next;
1421                         free_queue(current_mfcp);
1422                         mi_free(current_mfcp);
1423                         current_mfcp = NULL;
1424                 } else {
1425                         prev_mfcp = current_mfcp;
1426                 }
1427
1428                 current_mfcp = prev_mfcp->mfc_next;
1429
1430         }
1431         mfcbp->mfcb_marks &= ~MFCB_MARK_CONDEMNED;
1432         ASSERT(mfcbp->mfcb_mfc != NULL || mfcbp->mfcb_marks == 0);
1433 }
1434
1435 /*
1436  * Delete an mfc entry.
1437  */
1438 static int
1439 del_mfc(struct mfcctl *mfccp, ip_stack_t *ipst)
1440 {
1441         struct in_addr  origin;
1442         struct in_addr  mcastgrp;
1443         struct mfc      *rt;
1444         uint_t          hash;
1445         conn_t          *mrouter = ipst->ips_ip_g_mrouter;
1446
1447         origin = mfccp->mfcc_origin;
1448         mcastgrp = mfccp->mfcc_mcastgrp;
1449         hash = MFCHASH(origin.s_addr, mcastgrp.s_addr);
1450
1451         if (ipst->ips_ip_mrtdebug > 1) {
1452                 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1453                     "del_mfc: o %x g %x",
1454                     ntohl(origin.s_addr),
1455                     ntohl(mcastgrp.s_addr));
1456         }
1457
1458         MFCB_REFHOLD(&ipst->ips_mfcs[hash]);
1459
1460         /* Find mfc in mfctable, finds only entries without upcalls */
1461         for (rt = ipst->ips_mfcs[hash].mfcb_mfc; rt; rt = rt->mfc_next) {
1462                 mutex_enter(&rt->mfc_mutex);
1463                 if (origin.s_addr == rt->mfc_origin.s_addr &&
1464                     mcastgrp.s_addr == rt->mfc_mcastgrp.s_addr &&
1465                     rt->mfc_rte == NULL &&
1466                     !(rt->mfc_marks & MFCB_MARK_CONDEMNED))
1467                         break;
1468                 mutex_exit(&rt->mfc_mutex);
1469         }
1470
1471         /*
1472          * Return if there was an upcall (mfc_rte != NULL,
1473          * or rt not in mfctable.
1474          */
1475         if (rt == NULL) {
1476                 MFCB_REFRELE(&ipst->ips_mfcs[hash]);
1477                 return (EADDRNOTAVAIL);
1478         }
1479
1480
1481         /*
1482          * no need to hold lock as we have a reference.
1483          */
1484         ipst->ips_mfcs[hash].mfcb_marks |= MFCB_MARK_CONDEMNED;
1485         /* error checking */
1486         if (rt->mfc_timeout_id != 0) {
1487                 ip0dbg(("del_mfc: TIMEOUT NOT 0, rte not null"));
1488                 /*
1489                  * Its ok to drop the lock,  the struct cannot be freed
1490                  * since we have a ref on the hash bucket.
1491                  */
1492                 rt->mfc_timeout_id = 0;
1493                 mutex_exit(&rt->mfc_mutex);
1494                 (void) untimeout(rt->mfc_timeout_id);
1495                 mutex_enter(&rt->mfc_mutex);
1496         }
1497
1498         ASSERT(rt->mfc_rte == NULL);
1499
1500
1501         /*
1502          * Delete the entry from the cache
1503          */
1504         rt->mfc_marks |= MFCB_MARK_CONDEMNED;
1505         mutex_exit(&rt->mfc_mutex);
1506
1507         MFCB_REFRELE(&ipst->ips_mfcs[hash]);
1508
1509         return (0);
1510 }
1511
/* Number of bytes of IP option used for tunnel encapsulation. */
#define TUNNEL_LEN      12
1513
1514 /*
1515  * IP multicast forwarding function. This function assumes that the packet
1516  * pointed to by ipha has arrived on (or is about to be sent to) the interface
1517  * pointed to by "ill", and the packet is to be relayed to other networks
1518  * that have members of the packet's destination IP multicast group.
1519  *
1520  * The packet is returned unscathed to the caller, unless it is
1521  * erroneous, in which case a -1 value tells the caller (IP)
1522  * to discard it.
1523  *
1524  * Unlike BSD, SunOS 5.x needs to return to IP info about
1525  * whether pkt came in thru a tunnel, so it can be discarded, unless
1526  * it's IGMP. In BSD, the ifp is bogus for tunnels, so pkt won't try
1527  * to be delivered.
1528  * Return values are 0 - pkt is okay and phyint
1529  *                  -1 - pkt is malformed and to be tossed
1530  *                   1 - pkt came in on tunnel
1531  */
/*
 * ip_mforward: look up the forwarding cache entry for the packet's
 * (source, group) pair and either forward the packet or queue a copy of it
 * while the routing daemon is asked for a route.
 *
 *   mp  - received packet; b_rptr is read as the IPv4 header.  mp is not
 *         freed in this function; whatever is queued on the rtdetq or
 *         passed to the mrouter conn is a copymsg() copy.
 *   ira - receive attributes.  ira_flags selects PIM-register or tunnel
 *         handling; ira_ill and ira_rill are cleared around the conn_recv
 *         upcall and restored before returning.
 *
 * On a cache hit the value returned by ip_mdq() is returned.  On a miss a
 * copy of the packet is appended to the entry's rtdetq list; for the first
 * packet queued on an entry an IGMPMSG_NOCACHE message is handed to the
 * mrouter conn and expire_upcalls() is scheduled with timeout().  -1 is
 * also returned when is_mrouter_off() is true, when an allocation fails,
 * and when more than MAX_UPQ packets are already queued on the entry.
 */
1532 int
1533 ip_mforward(mblk_t *mp, ip_recv_attr_t *ira)
1534 {
1535         ipha_t          *ipha = (ipha_t *)mp->b_rptr;
1536         ill_t           *ill = ira->ira_ill;
1537         struct mfc      *rt;
1538         ipaddr_t        src, dst, tunnel_src = 0;
             /* Static, so this counter is shared by every call. */
1539         static int      srctun = 0;
1540         vifi_t          vifi;
1541         boolean_t       pim_reg_packet = B_FALSE;
1542         struct mfcb     *mfcbp;
1543         ip_stack_t      *ipst = ill->ill_ipst;
1544         conn_t          *mrouter = ipst->ips_ip_g_mrouter;
             /* Saved so ira_rill can be restored after the upcall below. */
1545         ill_t           *rill = ira->ira_rill;
1546
1547         ASSERT(ira->ira_pktlen == msgdsize(mp));
1548
1549         if (ipst->ips_ip_mrtdebug > 1) {
1550                 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1551                     "ip_mforward: RECV ipha_src %x, ipha_dst %x, ill %s",
1552                     ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
1553                     ill->ill_name);
1554         }
1555
1556         dst = ipha->ipha_dst;
1557         if (ira->ira_flags & IRAF_PIM_REGISTER)
1558                 pim_reg_packet = B_TRUE;
1559         else if (ira->ira_flags & IRAF_MROUTE_TUNNEL_SET)
1560                 tunnel_src = ira->ira_mroute_tunnel;
1561
1562         /*
1563          * Don't forward a packet with time-to-live of zero or one,
1564          * or a packet destined to a local-only group.
1565          */
1566         if (CLASSD(dst) && (ipha->ipha_ttl <= 1 ||
1567             (ipaddr_t)ntohl(dst) <= INADDR_MAX_LOCAL_GROUP)) {
1568                 if (ipst->ips_ip_mrtdebug > 1) {
1569                         (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1570                             "ip_mforward: not forwarded ttl %d,"
1571                             " dst 0x%x ill %s",
1572                             ipha->ipha_ttl, ntohl(dst), ill->ill_name);
1573                 }
1574                 if (tunnel_src != 0)
1575                         return (1);
1576                 else
1577                         return (0);
1578         }
1579
1580         if ((tunnel_src != 0) || pim_reg_packet) {
1581                 /*
1582                  * Packet arrived over an encapsulated tunnel or via a PIM
1583                  * register message.
1584                  */
1585                 if (ipst->ips_ip_mrtdebug > 1) {
1586                         if (tunnel_src != 0) {
1587                                 (void) mi_strlog(mrouter->conn_rq, 1,
1588                                     SL_TRACE,
1589                                     "ip_mforward: ill %s arrived via ENCAP TUN",
1590                                     ill->ill_name);
1591                         } else if (pim_reg_packet) {
1592                                 (void) mi_strlog(mrouter->conn_rq, 1,
1593                                     SL_TRACE,
1594                                     "ip_mforward: ill %s arrived via"
1595                                     "  REGISTER VIF",
1596                                     ill->ill_name);
1597                         }
1598                 }
1599         } else if ((ipha->ipha_version_and_hdr_length & 0xf) <
1600             (uint_t)(IP_SIMPLE_HDR_LENGTH + TUNNEL_LEN) >> 2 ||
1601             ((uchar_t *)(ipha + 1))[1] != IPOPT_LSRR) {
1602                 /* Packet arrived via a physical interface. */
                     /*
                      * That is: the low four bits of ipha_version_and_hdr_length
                      * are below (IP_SIMPLE_HDR_LENGTH + TUNNEL_LEN) >> 2, or
                      * the second byte following the ipha_t is not IPOPT_LSRR.
                      */
1603                 if (ipst->ips_ip_mrtdebug > 1) {
1604                         (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1605                             "ip_mforward: ill %s arrived via PHYINT",
1606                             ill->ill_name);
1607                 }
1608
1609         } else {
1610                 /*
1611                  * Packet arrived through a SRCRT tunnel.
1612                  * Source-route tunnels are no longer supported.
1613                  * Error message printed every 1000 times.
1614                  */
1615                 if ((srctun++ % 1000) == 0) {
1616                         cmn_err(CE_WARN,
1617                             "ip_mforward: received source-routed pkt from %x",
1618                             ntohl(ipha->ipha_src));
1619                 }
1620                 return (-1);
1621         }
1622
1623         ipst->ips_mrtstat->mrts_fwd_in++;
1624         src = ipha->ipha_src;
1625
1626         /* Find route in cache, return NULL if not there or upcalls q'ed. */
1627
1628         /*
1629          * Lock the mfctable against changes made by ip_mforward.
1630          * Note that only add_mfc and del_mfc can remove entries and
1631          * they run with exclusive access to IP. So we do not need to
1632          * guard against the rt being deleted, so release lock after reading.
1633          */
1634
1635         if (is_mrouter_off(ipst))
1636                 return (-1);
1637
             /*
              * The bucket reference taken here is dropped on every return
              * path below.
              */
1638         mfcbp = &ipst->ips_mfcs[MFCHASH(src, dst)];
1639         MFCB_REFHOLD(mfcbp);
1640         MFCFIND(mfcbp, src, dst, rt);
1641
1642         /* Entry exists, so forward if necessary */
1643         if (rt != NULL) {
1644                 int ret = 0;
1645                 ipst->ips_mrtstat->mrts_mfc_hits++;
1646                 if (pim_reg_packet) {
1647                         ASSERT(ipst->ips_reg_vif_num != ALL_VIFS);
1648                         ret = ip_mdq(mp, ipha,
1649                             ipst->ips_vifs[ipst->ips_reg_vif_num].
1650                             v_ipif->ipif_ill,
1651                             0, rt);
1652                 } else {
1653                         ret = ip_mdq(mp, ipha, ill, tunnel_src, rt);
1654                 }
1655
1656                 MFCB_REFRELE(mfcbp);
1657                 return (ret);
1658
1659                 /*
1660                  * Don't forward if we don't have a cache entry.  Mrouted will
1661                  * always provide a cache entry in response to an upcall.
1662                  */
1663         } else {
1664                 /*
1665                  * If we don't have a route for packet's origin, make a copy
1666                  * of the packet and send message to routing daemon.
1667                  */
1668                 struct mfc      *mfc_rt  = NULL;
1669                 mblk_t          *mp0     = NULL;
1670                 mblk_t          *mp_copy = NULL;
1671                 struct rtdetq   *rte     = NULL;
1672                 struct rtdetq   *rte_m, *rte1, *prev_rte;
1673                 uint_t          hash;
1674                 int             npkts;
1675                 boolean_t       new_mfc = B_FALSE;
1676                 ipst->ips_mrtstat->mrts_mfc_misses++;
1677                 /* BSD uses mrts_no_route++ */
1678                 if (ipst->ips_ip_mrtdebug > 1) {
1679                         (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1680                             "ip_mforward: no rte ill %s src %x g %x misses %d",
1681                             ill->ill_name, ntohl(src), ntohl(dst),
1682                             (int)ipst->ips_mrtstat->mrts_mfc_misses);
1683                 }
1684                 /*
1685                  * The order of the following code differs from the BSD code.
1686                  * Pre-mc3.5, the BSD code was incorrect and SunOS 5.x
1687                  * code works, so SunOS 5.x wasn't changed to conform to the
1688                  * BSD version.
1689                  */
1690
1691                 /* Lock mfctable. */
1692                 hash = MFCHASH(src, dst);
1693                 mutex_enter(&(ipst->ips_mfcs[hash].mfcb_lock));
1694
1695                 /*
1696                  * If we are turning off mrouted return an error
1697                  */
1698                 if (is_mrouter_off(ipst)) {
                             /*
                              * mfcbp was computed from the same MFCHASH(src, dst),
                              * so this releases the lock taken just above.
                              */
1699                         mutex_exit(&mfcbp->mfcb_lock);
1700                         MFCB_REFRELE(mfcbp);
1701                         return (-1);
1702                 }
1703
1704                 /* Is there an upcall waiting for this packet? */
1705                 for (mfc_rt = ipst->ips_mfcs[hash].mfcb_mfc; mfc_rt;
1706                     mfc_rt = mfc_rt->mfc_next) {
1707                         mutex_enter(&mfc_rt->mfc_mutex);
1708                         if (ipst->ips_ip_mrtdebug > 1) {
1709                                 (void) mi_strlog(mrouter->conn_rq, 1,
1710                                     SL_TRACE,
1711                                     "ip_mforward: MFCTAB hash %d o 0x%x"
1712                                     " g 0x%x\n",
1713                                     hash, ntohl(mfc_rt->mfc_origin.s_addr),
1714                                     ntohl(mfc_rt->mfc_mcastgrp.s_addr));
1715                         }
1716                         /* There is an upcall */
1717                         if ((src == mfc_rt->mfc_origin.s_addr) &&
1718                             (dst == mfc_rt->mfc_mcastgrp.s_addr) &&
1719                             (mfc_rt->mfc_rte != NULL) &&
1720                             !(mfc_rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
1721                                 break;
1722                         }
1723                         mutex_exit(&mfc_rt->mfc_mutex);
1724                 }
                     /*
                      * If the loop ended through the break above, mfc_rt is the
                      * matching entry and its mfc_mutex is still held.
                      */
1725                 /* No upcall, so make a new entry into mfctable */
1726                 if (mfc_rt == NULL) {
1727                         mfc_rt = (struct mfc *)mi_zalloc(sizeof (struct mfc));
1728                         if (mfc_rt == NULL) {
1729                                 ipst->ips_mrtstat->mrts_fwd_drop++;
1730                                 ip1dbg(("ip_mforward: out of memory "
1731                                     "for mfc, mfc_rt\n"));
1732                                 goto error_return;
1733                         } else
1734                                 new_mfc = B_TRUE;
1735                         /* Get resources */
1736                         /* TODO could copy header and dup rest */
1737                         mp_copy = copymsg(mp);
1738                         if (mp_copy == NULL) {
1739                                 ipst->ips_mrtstat->mrts_fwd_drop++;
1740                                 ip1dbg(("ip_mforward: out of memory for "
1741                                     "mblk, mp_copy\n"));
1742                                 goto error_return;
1743                         }
1744                         mutex_enter(&mfc_rt->mfc_mutex);
1745                 }
1746                 /* Get resources for rte, whether first rte or not first. */
1747                 /* Add this packet into rtdetq */
1748                 rte = (struct rtdetq *)mi_zalloc(sizeof (struct rtdetq));
1749                 if (rte == NULL) {
1750                         ipst->ips_mrtstat->mrts_fwd_drop++;
1751                         mutex_exit(&mfc_rt->mfc_mutex);
1752                         ip1dbg(("ip_mforward: out of memory for"
1753                             " rtdetq, rte\n"));
1754                         goto error_return;
1755                 }
1756
1757                 mp0 = copymsg(mp);
1758                 if (mp0 == NULL) {
1759                         ipst->ips_mrtstat->mrts_fwd_drop++;
1760                         ip1dbg(("ip_mforward: out of memory for mblk, mp0\n"));
1761                         mutex_exit(&mfc_rt->mfc_mutex);
1762                         goto error_return;
1763                 }
1764                 rte->mp         = mp0;
1765                 if (pim_reg_packet) {
1766                         ASSERT(ipst->ips_reg_vif_num != ALL_VIFS);
1767                         rte->ill =
1768                             ipst->ips_vifs[ipst->ips_reg_vif_num].
1769                             v_ipif->ipif_ill;
1770                 } else {
1771                         rte->ill = ill;
1772                 }
1773                 rte->rte_next   = NULL;
1774
1775                 /*
1776                  * Determine if upcall q (rtdetq) has overflowed.
1777                  * mfc_rt->mfc_rte is null by mi_zalloc
1778                  * if it is the first message.
1779                  */
1780                 for (rte_m = mfc_rt->mfc_rte, npkts = 0; rte_m;
1781                     rte_m = rte_m->rte_next)
1782                         npkts++;
1783                 if (ipst->ips_ip_mrtdebug > 1) {
1784                         (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1785                             "ip_mforward: upcalls %d\n", npkts);
1786                 }
1787                 if (npkts > MAX_UPQ) {
1788                         ipst->ips_mrtstat->mrts_upq_ovflw++;
1789                         mutex_exit(&mfc_rt->mfc_mutex);
1790                         goto error_return;
1791                 }
1792
1793                 if (npkts == 0) {       /* first upcall */
1794                         int i = 0;
1795                         /*
1796                          * Now finish installing the new mfc! Now that we have
1797                          * resources!  Insert new entry at head of hash chain.
1798                          * Use src and dst which are ipaddr_t's.
1799                          */
1800                         mfc_rt->mfc_origin.s_addr = src;
1801                         mfc_rt->mfc_mcastgrp.s_addr = dst;
1802
1803                         mutex_enter(&ipst->ips_numvifs_mutex);
1804                         for (i = 0; i < (int)ipst->ips_numvifs; i++)
1805                                 mfc_rt->mfc_ttls[i] = 0;
1806                         mutex_exit(&ipst->ips_numvifs_mutex);
1807                         mfc_rt->mfc_parent = ALL_VIFS;
1808
1809                         /* Link into table */
1810                         if (ipst->ips_ip_mrtdebug > 1) {
1811                                 (void) mi_strlog(mrouter->conn_rq, 1,
1812                                     SL_TRACE,
1813                                     "ip_mforward: NEW MFCTAB hash %d o 0x%x "
1814                                     "g 0x%x\n", hash,
1815                                     ntohl(mfc_rt->mfc_origin.s_addr),
1816                                     ntohl(mfc_rt->mfc_mcastgrp.s_addr));
1817                         }
1818                         mfc_rt->mfc_next = ipst->ips_mfcs[hash].mfcb_mfc;
1819                         ipst->ips_mfcs[hash].mfcb_mfc = mfc_rt;
1820                         mfc_rt->mfc_rte = NULL;
1821                 }
1822
1823                 /* Link in the upcall */
1824                 /* First upcall */
1825                 if (mfc_rt->mfc_rte == NULL)
1826                         mfc_rt->mfc_rte = rte;
1827                 else {
1828                         /* not the first upcall */
                             /* Walk to the last queued rtdetq and append. */
1829                         prev_rte = mfc_rt->mfc_rte;
1830                         for (rte1 = mfc_rt->mfc_rte->rte_next; rte1;
1831                             prev_rte = rte1, rte1 = rte1->rte_next)
1832                                 ;
1833                         prev_rte->rte_next = rte;
1834                 }
1835
1836                 /*
1837                  * No upcalls waiting, this is first one, so send a message to
1838                  * routing daemon to install a route into kernel table.
1839                  */
1840                 if (npkts == 0) {
1841                         struct igmpmsg  *im;
1842                         /* ipha_protocol is 0, for upcall */
1843                         ASSERT(mp_copy != NULL);
                             /*
                              * The igmpmsg fields are written over the start of the
                              * copied packet, i.e. over its IP header bytes.
                              */
1844                         im = (struct igmpmsg *)mp_copy->b_rptr;
1845                         im->im_msgtype  = IGMPMSG_NOCACHE;
1846                         im->im_mbz = 0;
1847                         mutex_enter(&ipst->ips_numvifs_mutex);
1848                         if (pim_reg_packet) {
1849                                 im->im_vif = (uchar_t)ipst->ips_reg_vif_num;
1850                                 mutex_exit(&ipst->ips_numvifs_mutex);
1851                         } else {
1852                                 /*
1853                                  * XXX do we need to hold locks here ?
1854                                  */
1855                                 for (vifi = 0;
1856                                     vifi < ipst->ips_numvifs;
1857                                     vifi++) {
1858                                         if (ipst->ips_vifs[vifi].v_ipif == NULL)
1859                                                 continue;
1860                                         if (ipst->ips_vifs[vifi].
1861                                             v_ipif->ipif_ill == ill) {
1862                                                 im->im_vif = (uchar_t)vifi;
1863                                                 break;
1864                                         }
1865                                 }
1866                                 mutex_exit(&ipst->ips_numvifs_mutex);
1867                                 ASSERT(vifi < ipst->ips_numvifs);
1868                         }
1869
1870                         ipst->ips_mrtstat->mrts_upcalls++;
1871                         /* Timer to discard upcalls if mrouted is too slow */
1872                         mfc_rt->mfc_timeout_id = timeout(expire_upcalls,
1873                             mfc_rt, EXPIRE_TIMEOUT * UPCALL_EXPIRE);
1874                         mutex_exit(&mfc_rt->mfc_mutex);
1875                         mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
1876                         /* Pass to RAWIP */
                             /*
                              * ira_ill and ira_rill are cleared only for the
                              * duration of the conn_recv call and are then restored
                              * from the values saved in ill and rill.
                              */
1877                         ira->ira_ill = ira->ira_rill = NULL;
1878                         (mrouter->conn_recv)(mrouter, mp_copy, NULL, ira);
1879                         ira->ira_ill = ill;
1880                         ira->ira_rill = rill;
1881                 } else {
                             /*
                              * NOTE(review): mp_copy is only assigned when a new
                              * mfc was allocated above, and a freshly zeroed mfc
                              * has no queued rtdetq (npkts == 0), so mp_copy looks
                              * to be NULL on this path - confirm ip_drop_input()
                              * and freemsg() accept a NULL message.
                              */
1882                         mutex_exit(&mfc_rt->mfc_mutex);
1883                         mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
1884                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1885                         ip_drop_input("ip_mforward - upcall already waiting",
1886                             mp_copy, ill);
1887                         freemsg(mp_copy);
1888                 }
1889
1890                 MFCB_REFRELE(mfcbp);
1891                 if (tunnel_src != 0)
1892                         return (1);
1893                 else
1894                         return (0);
                     /*
                      * Common failure exit.  Every goto that reaches this label
                      * is taken with the bucket's mfcb_lock held and with
                      * mfc_rt->mfc_mutex either never taken or already dropped.
                      * mfc_rt is freed only when it was allocated by this call
                      * (new_mfc); rte, mp_copy and mp0 are freed when non-NULL.
                      */
1895         error_return:
1896                 mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
1897                 MFCB_REFRELE(mfcbp);
1898                 if (mfc_rt != NULL && (new_mfc == B_TRUE))
1899                         mi_free((char *)mfc_rt);
1900                 if (rte != NULL)
1901                         mi_free((char *)rte);
1902                 if (mp_copy != NULL) {
1903                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1904                         ip_drop_input("ip_mforward error", mp_copy, ill);
1905                         freemsg(mp_copy);
1906                 }
1907                 if (mp0 != NULL)
1908                         freemsg(mp0);
1909                 return (-1);
1910         }
1911 }
1912
1913 /*
1914  * Clean up the mfctable cache entry if upcall is not serviced.
1915  * SunOS 5.x has timeout per mfc, unlike BSD which has one timer.
1916  */
1917 static void
1918 expire_upcalls(void *arg)
1919 {
1920         struct mfc *mfc_rt = arg;
1921         uint_t hash;
1922         struct mfc *prev_mfc, *mfc0;
1923         ip_stack_t      *ipst;
1924         conn_t          *mrouter;
1925
1926         if (mfc_rt->mfc_rte == NULL || mfc_rt->mfc_rte->ill != NULL) {
1927                 cmn_err(CE_WARN, "expire_upcalls: no ILL\n");
1928                 return;
1929         }
1930         ipst = mfc_rt->mfc_rte->ill->ill_ipst;
1931         mrouter = ipst->ips_ip_g_mrouter;
1932
1933         hash = MFCHASH(mfc_rt->mfc_origin.s_addr, mfc_rt->mfc_mcastgrp.s_addr);
1934         if (ipst->ips_ip_mrtdebug > 1) {
1935                 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1936                     "expire_upcalls: hash %d s %x g %x",
1937                     hash, ntohl(mfc_rt->mfc_origin.s_addr),
1938                     ntohl(mfc_rt->mfc_mcastgrp.s_addr));
1939         }
1940         MFCB_REFHOLD(&ipst->ips_mfcs[hash]);
1941         mutex_enter(&mfc_rt->mfc_mutex);
1942         /*
1943          * if timeout has been set to zero, than the
1944          * entry has been filled, no need to delete it.
1945          */
1946         if (mfc_rt->mfc_timeout_id == 0)
1947                 goto done;
1948         ipst->ips_mrtstat->mrts_cache_cleanups++;
1949         mfc_rt->mfc_timeout_id = 0;
1950
1951         /* Determine entry to be cleaned up in cache table. */
1952         for (prev_mfc = mfc0 = ipst->ips_mfcs[hash].mfcb_mfc; mfc0;
1953             prev_mfc = mfc0, mfc0 = mfc0->mfc_next)
1954                 if (mfc0 == mfc_rt)
1955                         break;
1956
1957         /* del_mfc takes care of gone mfcs */
1958         ASSERT(prev_mfc != NULL);
1959         ASSERT(mfc0 != NULL);
1960
1961         /*
1962          * Delete the entry from the cache
1963          */
1964         ipst->ips_mfcs[hash].mfcb_marks |= MFCB_MARK_CONDEMNED;
1965         mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED;
1966
1967         /*
1968          * release_mfc will drop all queued upcall packets.
1969          * and will free the mbuf with the pkt, if, timing info.
1970          */
1971 done:
1972         mutex_exit(&mfc_rt->mfc_mutex);
1973         MFCB_REFRELE(&ipst->ips_mfcs[hash]);
1974 }
1975
1976 /*
1977  * Packet forwarding routine once entry in the cache is made.
1978  */
1979 static int
1980 ip_mdq(mblk_t *mp, ipha_t *ipha, ill_t *ill, ipaddr_t tunnel_src,
1981     struct mfc *rt)
1982 {
1983         vifi_t vifi;
1984         struct vif *vifp;
1985         ipaddr_t dst = ipha->ipha_dst;
1986         size_t  plen = msgdsize(mp);
1987         vifi_t num_of_vifs;
1988         ip_stack_t      *ipst = ill->ill_ipst;
1989         conn_t          *mrouter = ipst->ips_ip_g_mrouter;
1990         ip_recv_attr_t  iras;
1991
1992         if (ipst->ips_ip_mrtdebug > 1) {
1993                 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1994                     "ip_mdq: SEND src %x, ipha_dst %x, ill %s",
1995                     ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
1996                     ill->ill_name);
1997         }
1998
1999         /* Macro to send packet on vif */
2000 #define MC_SEND(ipha, mp, vifp, dst) { \
2001         if ((vifp)->v_flags & VIFF_TUNNEL) \
2002                 encap_send((ipha), (mp), (vifp), (dst)); \
2003         else if ((vifp)->v_flags & VIFF_REGISTER) \
2004                 register_send((ipha), (mp), (vifp), (dst)); \
2005         else \
2006                 phyint_send((ipha), (mp), (vifp), (dst)); \
2007 }
2008
2009         vifi = rt->mfc_parent;
2010
2011         /*
2012          * The value of vifi is MAXVIFS if the pkt had no parent, i.e.,
2013          * Mrouted had no route.
2014          * We wanted the route installed in the mfctable to prevent multiple
2015          * tries, so it passed add_mfc(), but is discarded here. The v_ipif is
2016          * NULL so we don't want to check the ill. Still needed as of Mrouted
2017          * 3.6.
2018          */
2019         if (vifi == NO_VIF) {
2020                 ip1dbg(("ip_mdq: no route for origin ill %s, vifi is NO_VIF\n",
2021                     ill->ill_name));
2022                 if (ipst->ips_ip_mrtdebug > 1) {
2023                         (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2024                             "ip_mdq: vifi is NO_VIF ill = %s", ill->ill_name);
2025                 }
2026                 return (-1);    /* drop pkt */
2027         }
2028
2029         if (!lock_good_vif(&ipst->ips_vifs[vifi]))
2030                 return (-1);
2031         /*
2032          * The MFC entries are not cleaned up when an ipif goes
2033          * away thus this code has to guard against an MFC referencing
2034          * an ipif that has been closed. Note: reset_mrt_vif_ipif
2035          * sets the v_ipif to NULL when the ipif disappears.
2036          */
2037         ASSERT(ipst->ips_vifs[vifi].v_ipif != NULL);
2038
2039         if (vifi >= ipst->ips_numvifs) {
2040                 cmn_err(CE_WARN, "ip_mdq: illegal vifi %d numvifs "
2041                     "%d ill %s viftable ill %s\n",
2042                     (int)vifi, (int)ipst->ips_numvifs, ill->ill_name,
2043                     ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name);
2044                 unlock_good_vif(&ipst->ips_vifs[vifi]);
2045                 return (-1);
2046         }
2047         /*
2048          * Don't forward if it didn't arrive from the parent vif for its
2049          * origin.
2050          */
2051         if ((ipst->ips_vifs[vifi].v_ipif->ipif_ill != ill) ||
2052             (ipst->ips_vifs[vifi].v_rmt_addr.s_addr != tunnel_src)) {
2053                 /* Came in the wrong interface */
2054                 ip1dbg(("ip_mdq: arrived wrong if, vifi %d "
2055                         "numvifs %d ill %s viftable ill %s\n",
2056                         (int)vifi, (int)ipst->ips_numvifs, ill->ill_name,
2057                         ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name));
2058                 if (ipst->ips_ip_mrtdebug > 1) {
2059                         (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2060                             "ip_mdq: arrived wrong if, vifi %d ill "
2061                             "%s viftable ill %s\n",
2062                             (int)vifi, ill->ill_name,
2063                             ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name);
2064                 }
2065                 ipst->ips_mrtstat->mrts_wrong_if++;
2066                 rt->mfc_wrong_if++;
2067
2068                 /*
2069                  * If we are doing PIM assert processing and we are forwarding
2070                  * packets on this interface, and it is a broadcast medium
2071                  * interface (and not a tunnel), send a message to the routing.
2072                  *
2073                  * We use the first ipif on the list, since it's all we have.
2074                  * Chances are the ipif_flags are the same for ipifs on the ill.
2075                  */
2076                 if (ipst->ips_pim_assert && rt->mfc_ttls[vifi] > 0 &&
2077                     (ill->ill_ipif->ipif_flags & IPIF_BROADCAST) &&
2078                     !(ipst->ips_vifs[vifi].v_flags & VIFF_TUNNEL)) {
2079                         mblk_t          *mp_copy;
2080                         struct igmpmsg  *im;
2081
2082                         /* TODO could copy header and dup rest */
2083                         mp_copy = copymsg(mp);
2084                         if (mp_copy == NULL) {
2085                                 ipst->ips_mrtstat->mrts_fwd_drop++;
2086                                 ip1dbg(("ip_mdq: out of memory "
2087                                     "for mblk, mp_copy\n"));
2088                                 unlock_good_vif(&ipst->ips_vifs[vifi]);
2089                                 return (-1);
2090                         }
2091
2092                         im = (struct igmpmsg *)mp_copy->b_rptr;
2093                         im->im_msgtype = IGMPMSG_WRONGVIF;
2094                         im->im_mbz = 0;
2095                         im->im_vif = (ushort_t)vifi;
2096                         /* Pass to RAWIP */
2097
2098                         bzero(&iras, sizeof (iras));
2099                         iras.ira_flags = IRAF_IS_IPV4;
2100                         iras.ira_ip_hdr_length =
2101                             IPH_HDR_LENGTH(mp_copy->b_rptr);
2102                         iras.ira_pktlen = msgdsize(mp_copy);
2103                         iras.ira_ttl = ipha->ipha_ttl;
2104                         (mrouter->conn_recv)(mrouter, mp_copy, NULL, &iras);
2105                         ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
2106                 }
2107                 unlock_good_vif(&ipst->ips_vifs[vifi]);
2108                 if (tunnel_src != 0)
2109                         return (1);
2110                 else
2111                         return (0);
2112         }
2113         /*
2114          * If I sourced this packet, it counts as output, else it was input.
2115          */
2116         if (ipha->ipha_src == ipst->ips_vifs[vifi].v_lcl_addr.s_addr) {
2117                 ipst->ips_vifs[vifi].v_pkt_out++;
2118                 ipst->ips_vifs[vifi].v_bytes_out += plen;
2119         } else {
2120                 ipst->ips_vifs[vifi].v_pkt_in++;
2121                 ipst->ips_vifs[vifi].v_bytes_in += plen;
2122         }
2123         mutex_enter(&rt->mfc_mutex);
2124         rt->mfc_pkt_cnt++;
2125         rt->mfc_byte_cnt += plen;
2126         mutex_exit(&rt->mfc_mutex);
2127         unlock_good_vif(&ipst->ips_vifs[vifi]);
2128         /*
2129          * For each vif, decide if a copy of the packet should be forwarded.
2130          * Forward if:
2131          *              - the vif threshold ttl is non-zero AND
2132          *              - the pkt ttl exceeds the vif's threshold
2133          * A non-zero mfc_ttl indicates that the vif is part of
2134          * the output set for the mfc entry.
2135          */
2136         mutex_enter(&ipst->ips_numvifs_mutex);
2137         num_of_vifs = ipst->ips_numvifs;
2138         mutex_exit(&ipst->ips_numvifs_mutex);
2139         for (vifp = ipst->ips_vifs, vifi = 0;
2140             vifi < num_of_vifs;
2141             vifp++, vifi++) {
2142                 if (!lock_good_vif(vifp))
2143                         continue;
2144                 if ((rt->mfc_ttls[vifi] > 0) &&
2145                     (ipha->ipha_ttl > rt->mfc_ttls[vifi])) {
2146                         /*
2147                          * lock_good_vif should not have succedded if
2148                          * v_ipif is null.
2149                          */
2150                         ASSERT(vifp->v_ipif != NULL);
2151                         vifp->v_pkt_out++;
2152                         vifp->v_bytes_out += plen;
2153                         MC_SEND(ipha, mp, vifp, dst);
2154                         ipst->ips_mrtstat->mrts_fwd_out++;
2155                 }
2156                 unlock_good_vif(vifp);
2157         }
2158         if (tunnel_src != 0)
2159                 return (1);
2160         else
2161                 return (0);
2162 }
2163
2164 /*
2165  * Send the packet on physical interface.
2166  * Caller assumes can continue to use mp on return.
2167  */
2168 /* ARGSUSED */
2169 static void
2170 phyint_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
2171 {
2172         mblk_t  *mp_copy;
2173         ip_stack_t      *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2174         conn_t          *mrouter = ipst->ips_ip_g_mrouter;
2175
2176         /* Make a new reference to the packet */
2177         mp_copy = copymsg(mp);  /* TODO could copy header and dup rest */
2178         if (mp_copy == NULL) {
2179                 ipst->ips_mrtstat->mrts_fwd_drop++;
2180                 ip1dbg(("phyint_send: out of memory for mblk, mp_copy\n"));
2181                 return;
2182         }
2183         if (vifp->v_rate_limit <= 0)
2184                 tbf_send_packet(vifp, mp_copy);
2185         else  {
2186                 if (ipst->ips_ip_mrtdebug > 1) {
2187                         (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2188                             "phyint_send: tbf_contr rate %d "
2189                             "vifp 0x%p mp 0x%p dst 0x%x",
2190                             vifp->v_rate_limit, (void *)vifp, (void *)mp, dst);
2191                 }
2192                 tbf_control(vifp, mp_copy, (ipha_t *)mp_copy->b_rptr);
2193         }
2194 }
2195
2196 /*
2197  * Send the whole packet for REGISTER encapsulation to PIM daemon
2198  * Caller assumes it can continue to use mp on return.
2199  */
2200 /* ARGSUSED */
2201 static void
2202 register_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
2203 {
2204         struct igmpmsg  *im;
2205         mblk_t          *mp_copy;
2206         ipha_t          *ipha_copy;
2207         ill_t           *ill = vifp->v_ipif->ipif_ill;
2208         ip_stack_t      *ipst = ill->ill_ipst;
2209         conn_t          *mrouter = ipst->ips_ip_g_mrouter;
2210         ip_recv_attr_t  iras;
2211
2212         if (ipst->ips_ip_mrtdebug > 1) {
2213                 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2214                     "register_send: src %x, dst %x\n",
2215                     ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst));
2216         }
2217
2218         /*
2219          * Copy the old packet & pullup its IP header into the new mblk_t so we
2220          * can modify it.  Try to fill the new mblk_t since if we don't the
2221          * ethernet driver will.
2222          */
2223         mp_copy = allocb(sizeof (struct igmpmsg) + sizeof (ipha_t), BPRI_MED);
2224         if (mp_copy == NULL) {
2225                 ++ipst->ips_mrtstat->mrts_pim_nomemory;
2226                 if (ipst->ips_ip_mrtdebug > 3) {
2227                         (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2228                             "register_send: allocb failure.");
2229                 }
2230                 return;
2231         }
2232
2233         /*
2234          * Bump write pointer to account for igmpmsg being added.
2235          */
2236         mp_copy->b_wptr = mp_copy->b_rptr + sizeof (struct igmpmsg);
2237
2238         /*
2239          * Chain packet to new mblk_t.
2240          */
2241         if ((mp_copy->b_cont = copymsg(mp)) == NULL) {
2242                 ++ipst->ips_mrtstat->mrts_pim_nomemory;
2243                 if (ipst->ips_ip_mrtdebug > 3) {
2244                         (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2245                             "register_send: copymsg failure.");
2246                 }
2247                 freeb(mp_copy);
2248                 return;
2249         }
2250
2251         /*
2252          * icmp_input() asserts that IP version field is set to an
2253          * appropriate version. Hence, the struct igmpmsg that this really
2254          * becomes, needs to have the correct IP version field.
2255          */
2256         ipha_copy = (ipha_t *)mp_copy->b_rptr;
2257         *ipha_copy = multicast_encap_iphdr;
2258
2259         /*
2260          * The kernel uses the struct igmpmsg header to encode the messages to
2261          * the multicast routing daemon. Fill in the fields in the header
2262          * starting with the message type which is IGMPMSG_WHOLEPKT
2263          */
2264         im = (struct igmpmsg *)mp_copy->b_rptr;
2265         im->im_msgtype = IGMPMSG_WHOLEPKT;
2266         im->im_src.s_addr = ipha->ipha_src;
2267         im->im_dst.s_addr = ipha->ipha_dst;
2268
2269         /*
2270          * Must Be Zero. This is because the struct igmpmsg is really an IP
2271          * header with renamed fields and the multicast routing daemon uses
2272          * an ipha_protocol (aka im_mbz) of 0 to distinguish these messages.
2273          */
2274         im->im_mbz = 0;
2275
2276         ++ipst->ips_mrtstat->mrts_upcalls;
2277         if (IPCL_IS_NONSTR(mrouter) ? mrouter->conn_flow_cntrld :
2278             !canputnext(mrouter->conn_rq)) {
2279                 ++ipst->ips_mrtstat->mrts_pim_regsend_drops;
2280                 if (ipst->ips_ip_mrtdebug > 3) {
2281                         (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2282                             "register_send: register upcall failure.");
2283                 }
2284                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2285                 ip_drop_input("mrts_pim_regsend_drops", mp_copy, ill);
2286                 freemsg(mp_copy);
2287         } else {
2288                 /* Pass to RAWIP */
2289                 bzero(&iras, sizeof (iras));
2290                 iras.ira_flags = IRAF_IS_IPV4;
2291                 iras.ira_ip_hdr_length = sizeof (ipha_t);
2292                 iras.ira_pktlen = msgdsize(mp_copy);
2293                 iras.ira_ttl = ipha->ipha_ttl;
2294                 (mrouter->conn_recv)(mrouter, mp_copy, NULL, &iras);
2295                 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
2296         }
2297 }
2298
2299 /*
2300  * pim_validate_cksum handles verification of the checksum in the
2301  * pim header.  For PIM Register packets, the checksum is calculated
2302  * across the PIM header only.  For all other packets, the checksum
2303  * is for the PIM header and remainder of the packet.
2304  *
2305  * returns: B_TRUE, if checksum is okay.
2306  *          B_FALSE, if checksum is not valid.
2307  */
2308 static boolean_t
2309 pim_validate_cksum(mblk_t *mp, ipha_t *ip, struct pim *pimp)
2310 {
2311         mblk_t *mp_dup;
2312
2313         if ((mp_dup = dupmsg(mp)) == NULL)
2314                 return (B_FALSE);
2315
2316         mp_dup->b_rptr += IPH_HDR_LENGTH(ip);
2317         if (pimp->pim_type == PIM_REGISTER)
2318                 mp_dup->b_wptr = mp_dup->b_rptr + PIM_MINLEN;
2319         if (IP_CSUM(mp_dup, 0, 0)) {
2320                 freemsg(mp_dup);
2321                 return (B_FALSE);
2322         }
2323         freemsg(mp_dup);
2324         return (B_TRUE);
2325 }
2326
2327 /*
2328  * Process PIM protocol packets i.e. IP Protocol 103.
2329  * Register messages are decapsulated and sent onto multicast forwarding.
2330  *
2331  * Return NULL for a bad packet that is discarded here.
2332  * Return mp if the message is OK and should be handed to "raw" receivers.
2333  * Callers of pim_input() may need to reinitialize variables that were copied
2334  * from the mblk as this calls pullupmsg().
2335  */
2336 mblk_t *
2337 pim_input(mblk_t *mp, ip_recv_attr_t *ira)
2338 {
2339         ipha_t          *eip, *ip;
2340         int             iplen, pimlen, iphlen;
2341         struct pim      *pimp;  /* pointer to a pim struct */
2342         uint32_t        *reghdr;
2343         ill_t           *ill = ira->ira_ill;
2344         ip_stack_t      *ipst = ill->ill_ipst;
2345         conn_t          *mrouter = ipst->ips_ip_g_mrouter;
2346
2347         /*
2348          * Pullup the msg for PIM protocol processing.
2349          */
2350         if (pullupmsg(mp, -1) == 0) {
2351                 ++ipst->ips_mrtstat->mrts_pim_nomemory;
2352                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2353                 ip_drop_input("mrts_pim_nomemory", mp, ill);
2354                 freemsg(mp);
2355                 return (NULL);
2356         }
2357
2358         ip = (ipha_t *)mp->b_rptr;
2359         iplen = ip->ipha_length;
2360         iphlen = IPH_HDR_LENGTH(ip);
2361         pimlen = ntohs(iplen) - iphlen;
2362
2363         /*
2364          * Validate lengths
2365          */
2366         if (pimlen < PIM_MINLEN) {
2367                 ++ipst->ips_mrtstat->mrts_pim_malformed;
2368                 if (ipst->ips_ip_mrtdebug > 1) {
2369                         (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2370                             "pim_input: length not at least minlen");
2371                 }
2372                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2373                 ip_drop_input("mrts_pim_malformed", mp, ill);
2374                 freemsg(mp);
2375                 return (NULL);
2376         }
2377
2378         /*
2379          * Point to the PIM header.
2380          */
2381         pimp = (struct pim *)((caddr_t)ip + iphlen);
2382
2383         /*
2384          * Check the version number.
2385          */
2386         if (pimp->pim_vers != PIM_VERSION) {
2387                 ++ipst->ips_mrtstat->mrts_pim_badversion;
2388                 if (ipst->ips_ip_mrtdebug > 1) {
2389                         (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2390                             "pim_input: unknown version of PIM");
2391                 }
2392                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2393                 ip_drop_input("mrts_pim_badversion", mp, ill);
2394                 freemsg(mp);
2395                 return (NULL);
2396         }
2397
2398         /*
2399          * Validate the checksum
2400          */
2401         if (!pim_validate_cksum(mp, ip, pimp)) {
2402                 ++ipst->ips_mrtstat->mrts_pim_rcv_badcsum;
2403                 if (ipst->ips_ip_mrtdebug > 1) {
2404                         (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2405                             "pim_input: invalid checksum");
2406                 }
2407                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2408                 ip_drop_input("pim_rcv_badcsum", mp, ill);
2409                 freemsg(mp);
2410                 return (NULL);
2411         }
2412
2413         if (pimp->pim_type != PIM_REGISTER)
2414                 return (mp);
2415
2416         reghdr = (uint32_t *)(pimp + 1);
2417         eip = (ipha_t *)(reghdr + 1);
2418
2419         /*
2420          * check if the inner packet is destined to mcast group
2421          */
2422         if (!CLASSD(eip->ipha_dst)) {
2423                 ++ipst->ips_mrtstat->mrts_pim_badregisters;
2424                 if (ipst->ips_ip_mrtdebug > 1) {
2425                         (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2426                             "pim_input: Inner pkt not mcast .. !");
2427                 }
2428                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2429                 ip_drop_input("mrts_pim_badregisters", mp, ill);
2430                 freemsg(mp);
2431                 return (NULL);
2432         }
2433         if (ipst->ips_ip_mrtdebug > 1) {
2434                 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2435                     "register from %x, to %x, len %d",
2436                     ntohl(eip->ipha_src),
2437                     ntohl(eip->ipha_dst),
2438                     ntohs(eip->ipha_length));
2439         }
2440         /*
2441          * If the null register bit is not set, decapsulate
2442          * the packet before forwarding it.
2443          * Avoid this in no register vif
2444          */
2445         if (!(ntohl(*reghdr) & PIM_NULL_REGISTER) &&
2446             ipst->ips_reg_vif_num != ALL_VIFS) {
2447                 mblk_t *mp_copy;
2448                 uint_t saved_pktlen;
2449
2450                 /* Copy the message */
2451                 if ((mp_copy = copymsg(mp)) == NULL) {
2452                         ++ipst->ips_mrtstat->mrts_pim_nomemory;
2453                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2454                         ip_drop_input("mrts_pim_nomemory", mp, ill);
2455                         freemsg(mp);
2456                         return (NULL);
2457                 }
2458
2459                 /*
2460                  * Decapsulate the packet and give it to
2461                  * register_mforward.
2462                  */
2463                 mp_copy->b_rptr += iphlen + sizeof (pim_t) + sizeof (*reghdr);
2464                 saved_pktlen = ira->ira_pktlen;
2465                 ira->ira_pktlen -= iphlen + sizeof (pim_t) + sizeof (*reghdr);
2466                 if (register_mforward(mp_copy, ira) != 0) {
2467                         /* register_mforward already called ip_drop_input */
2468                         freemsg(mp);
2469                         ira->ira_pktlen = saved_pktlen;
2470                         return (NULL);
2471                 }
2472                 ira->ira_pktlen = saved_pktlen;
2473         }
2474
2475         /*
2476          * Pass all valid PIM packets up to any process(es) listening on a raw
2477          * PIM socket. For Solaris it is done right after pim_input() is
2478          * called.
2479          */
2480         return (mp);
2481 }
2482
2483 /*
2484  * PIM sparse mode hook.  Called by pim_input after decapsulating
2485  * the packet. Loop back the packet, as if we have received it.
2486  * In pim_input() we have to check if the destination is a multicast address.
2487  */
2488 static int
2489 register_mforward(mblk_t *mp, ip_recv_attr_t *ira)
2490 {
2491         ire_t           *ire;
2492         ipha_t          *ipha = (ipha_t *)mp->b_rptr;
2493         ill_t           *ill = ira->ira_ill;
2494         ip_stack_t      *ipst = ill->ill_ipst;
2495         conn_t          *mrouter = ipst->ips_ip_g_mrouter;
2496
2497         ASSERT(ipst->ips_reg_vif_num <= ipst->ips_numvifs);
2498
2499         if (ipst->ips_ip_mrtdebug > 3) {
2500                 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2501                     "register_mforward: src %x, dst %x\n",
2502                     ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst));
2503         }
2504         /*
2505          * Need to pass in to ip_mforward() the information that the
2506          * packet has arrived on the register_vif. We mark it with
2507          * the IRAF_PIM_REGISTER attribute.
2508          * pim_input verified that the (inner) destination is multicast,
2509          * hence we skip the generic code in ip_input.
2510          */
2511         ira->ira_flags |= IRAF_PIM_REGISTER;
2512         ++ipst->ips_mrtstat->mrts_pim_regforwards;
2513
2514         if (!CLASSD(ipha->ipha_dst)) {
2515                 ire = ire_route_recursive_v4(ipha->ipha_dst, 0, NULL, ALL_ZONES,
2516                     ira->ira_tsl, MATCH_IRE_SECATTR, IRR_ALLOCATE, 0, ipst,
2517                     NULL, NULL, NULL);
2518         } else {
2519                 ire = ire_multicast(ill);
2520         }
2521         ASSERT(ire != NULL);
2522         /* Normally this will return the IRE_MULTICAST */
2523         if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
2524                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2525                 ip_drop_input("mrts_pim RTF_REJECT", mp, ill);
2526                 freemsg(mp);
2527                 ire_refrele(ire);
2528                 return (-1);
2529         }
2530         ASSERT(ire->ire_type & IRE_MULTICAST);
2531         (*ire->ire_recvfn)(ire, mp, ipha, ira);
2532         ire_refrele(ire);
2533
2534         return (0);
2535 }
2536
2537 /*
2538  * Send an encapsulated packet.
2539  * Caller assumes can continue to use mp when routine returns.
2540  */
2541 /* ARGSUSED */
2542 static void
2543 encap_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
2544 {
2545         mblk_t  *mp_copy;
2546         ipha_t  *ipha_copy;
2547         size_t  len;
2548         ip_stack_t      *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2549         conn_t          *mrouter = ipst->ips_ip_g_mrouter;
2550
2551         if (ipst->ips_ip_mrtdebug > 1) {
2552                 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2553                     "encap_send: vif %ld enter",
2554                     (ptrdiff_t)(vifp - ipst->ips_vifs));
2555         }
2556         len = ntohs(ipha->ipha_length);
2557
2558         /*
2559          * Copy the old packet & pullup it's IP header into the
2560          * new mbuf so we can modify it.  Try to fill the new
2561          * mbuf since if we don't the ethernet driver will.
2562          */
2563         mp_copy = allocb(32 + sizeof (multicast_encap_iphdr), BPRI_MED);
2564         if (mp_copy == NULL)
2565                 return;
2566         mp_copy->b_rptr += 32;
2567         mp_copy->b_wptr = mp_copy->b_rptr + sizeof (multicast_encap_iphdr);
2568         if ((mp_copy->b_cont = copymsg(mp)) == NULL) {
2569                 freeb(mp_copy);
2570                 return;
2571         }
2572
2573         /*
2574          * Fill in the encapsulating IP header.
2575          * Remote tunnel dst in rmt_addr, from add_vif().
2576          */
2577         ipha_copy = (ipha_t *)mp_copy->b_rptr;
2578         *ipha_copy = multicast_encap_iphdr;
2579         ASSERT((len + sizeof (ipha_t)) <= IP_MAXPACKET);
2580         ipha_copy->ipha_length = htons(len + sizeof (ipha_t));
2581         ipha_copy->ipha_src = vifp->v_lcl_addr.s_addr;
2582         ipha_copy->ipha_dst = vifp->v_rmt_addr.s_addr;
2583         ASSERT(ipha_copy->ipha_ident == 0);
2584
2585         /* Turn the encapsulated IP header back into a valid one. */
2586         ipha = (ipha_t *)mp_copy->b_cont->b_rptr;
2587         ipha->ipha_ttl--;
2588         ipha->ipha_hdr_checksum = 0;
2589         ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
2590
2591         ipha_copy->ipha_ttl = ipha->ipha_ttl;
2592
2593         if (ipst->ips_ip_mrtdebug > 1) {
2594                 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2595                     "encap_send: group 0x%x", ntohl(ipha->ipha_dst));
2596         }
2597         if (vifp->v_rate_limit <= 0)
2598                 tbf_send_packet(vifp, mp_copy);
2599         else
2600                 /* ipha is from the original header */
2601                 tbf_control(vifp, mp_copy, ipha);
2602 }
2603
2604 /*
2605  * De-encapsulate a packet and feed it back through IP input if it
2606  * matches one of our multicast tunnels.
2607  *
2608  * This routine is called whenever IP gets a packet with prototype
2609  * IPPROTO_ENCAP and a local destination address and the packet didn't
2610  * match one of our configured IP-in-IP tunnels.
2611  */
2612 void
2613 ip_mroute_decap(mblk_t *mp, ip_recv_attr_t *ira)
2614 {
2615         ipha_t          *ipha = (ipha_t *)mp->b_rptr;
2616         ipha_t          *ipha_encap;
2617         int             hlen = IPH_HDR_LENGTH(ipha);
2618         int             hlen_encap;
2619         ipaddr_t        src;
2620         struct vif      *vifp;
2621         ire_t           *ire;
2622         ill_t           *ill = ira->ira_ill;
2623         ip_stack_t      *ipst = ill->ill_ipst;
2624         conn_t          *mrouter = ipst->ips_ip_g_mrouter;
2625
2626         /* Make sure we have all of the inner header */
2627         ipha_encap = (ipha_t *)((char *)ipha + hlen);
2628         if (mp->b_wptr - mp->b_rptr < hlen + IP_SIMPLE_HDR_LENGTH) {
2629                 ipha = ip_pullup(mp, hlen + IP_SIMPLE_HDR_LENGTH, ira);
2630                 if (ipha == NULL) {
2631                         ipst->ips_mrtstat->mrts_bad_tunnel++;
2632                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2633                         ip_drop_input("ip_mroute_decap: too short", mp, ill);
2634                         freemsg(mp);
2635                         return;
2636                 }
2637                 ipha_encap = (ipha_t *)((char *)ipha + hlen);
2638         }
2639         hlen_encap = IPH_HDR_LENGTH(ipha_encap);
2640         if (mp->b_wptr - mp->b_rptr < hlen + hlen_encap) {
2641                 ipha = ip_pullup(mp, hlen + hlen_encap, ira);
2642                 if (ipha == NULL) {
2643                         ipst->ips_mrtstat->mrts_bad_tunnel++;
2644                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2645                         ip_drop_input("ip_mroute_decap: too short", mp, ill);
2646                         freemsg(mp);
2647                         return;
2648                 }
2649                 ipha_encap = (ipha_t *)((char *)ipha + hlen);
2650         }
2651
2652         /*
2653          * Dump the packet if it's not to a multicast destination or if
2654          * we don't have an encapsulating tunnel with the source.
2655          * Note:  This code assumes that the remote site IP address
2656          * uniquely identifies the tunnel (i.e., that this site has
2657          * at most one tunnel with the remote site).
2658          */
2659         if (!CLASSD(ipha_encap->ipha_dst)) {
2660                 ipst->ips_mrtstat->mrts_bad_tunnel++;
2661                 ip1dbg(("ip_mroute_decap: bad tunnel\n"));
2662                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2663                 ip_drop_input("mrts_bad_tunnel", mp, ill);
2664                 freemsg(mp);
2665                 return;
2666         }
2667         src = (ipaddr_t)ipha->ipha_src;
2668         mutex_enter(&ipst->ips_last_encap_lock);
2669         if (src != ipst->ips_last_encap_src) {
2670                 struct vif *vife;
2671
2672                 vifp = ipst->ips_vifs;
2673                 vife = vifp + ipst->ips_numvifs;
2674                 ipst->ips_last_encap_src = src;
2675                 ipst->ips_last_encap_vif = 0;
2676                 for (; vifp < vife; ++vifp) {
2677                         if (!lock_good_vif(vifp))
2678                                 continue;
2679                         if (vifp->v_rmt_addr.s_addr == src) {
2680                                 if (vifp->v_flags & VIFF_TUNNEL)
2681                                         ipst->ips_last_encap_vif = vifp;
2682                                 if (ipst->ips_ip_mrtdebug > 1) {
2683                                         (void) mi_strlog(mrouter->conn_rq,
2684                                             1, SL_TRACE,
2685                                             "ip_mroute_decap: good tun "
2686                                             "vif %ld with %x",
2687                                             (ptrdiff_t)(vifp - ipst->ips_vifs),
2688                                             ntohl(src));
2689                                 }
2690                                 unlock_good_vif(vifp);
2691                                 break;
2692                         }
2693                         unlock_good_vif(vifp);
2694                 }
2695         }
2696         if ((vifp = ipst->ips_last_encap_vif) == 0) {
2697                 mutex_exit(&ipst->ips_last_encap_lock);
2698                 ipst->ips_mrtstat->mrts_bad_tunnel++;
2699                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2700                 ip_drop_input("mrts_bad_tunnel", mp, ill);
2701                 freemsg(mp);
2702                 ip1dbg(("ip_mroute_decap: vif %ld no tunnel with %x\n",
2703                     (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(src)));
2704                 return;
2705         }
2706         mutex_exit(&ipst->ips_last_encap_lock);
2707
2708         /*
2709          * Need to pass in the tunnel source to ip_mforward (so that it can
2710          * verify that the packet arrived over the correct vif.)
2711          */
2712         ira->ira_flags |= IRAF_MROUTE_TUNNEL_SET;
2713         ira->ira_mroute_tunnel = src;
2714         mp->b_rptr += hlen;
2715         ira->ira_pktlen -= hlen;
2716         ira->ira_ip_hdr_length = hlen_encap;
2717
2718         /*
2719          * We don't redo any of the filtering in ill_input_full_v4 and we
2720          * have checked that all of ipha_encap and any IP options are
2721          * pulled up. Hence we call ire_recv_multicast_v4 directly.
2722          * However, we have to check for RSVP as in ip_input_full_v4
2723          * and if so we pass it to ire_recv_broadcast_v4 for local delivery
2724          * to the rsvpd.
2725          */
2726         if (ipha_encap->ipha_protocol == IPPROTO_RSVP &&
2727             ipst->ips_ipcl_proto_fanout_v4[IPPROTO_RSVP].connf_head != NULL) {
2728                 ire = ire_route_recursive_v4(INADDR_BROADCAST, 0, ill,
2729                     ALL_ZONES, ira->ira_tsl, MATCH_IRE_ILL|MATCH_IRE_SECATTR,
2730                     IRR_ALLOCATE, 0, ipst, NULL, NULL, NULL);
2731         } else {
2732                 ire = ire_multicast(ill);
2733         }
2734         ASSERT(ire != NULL);
2735         /* Normally this will return the IRE_MULTICAST or IRE_BROADCAST */
2736         if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
2737                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2738                 ip_drop_input("ip_mroute_decap: RTF_REJECT", mp, ill);
2739                 freemsg(mp);
2740                 ire_refrele(ire);
2741                 return;
2742         }
2743         ire->ire_ib_pkt_count++;
2744         ASSERT(ire->ire_type & (IRE_MULTICAST|IRE_BROADCAST));
2745         (*ire->ire_recvfn)(ire, mp, ipha_encap, ira);
2746         ire_refrele(ire);
2747 }
2748
2749 /*
2750  * Remove all records with v_ipif == ipif.  Called when an interface goes away
2751  * (stream closed).  Called as writer.
2752  */
2753 void
2754 reset_mrt_vif_ipif(ipif_t *ipif)
2755 {
2756         vifi_t vifi, tmp_vifi;
2757         vifi_t num_of_vifs;
2758         ip_stack_t      *ipst = ipif->ipif_ill->ill_ipst;
2759
2760         /* Can't check vifi >= 0 since vifi_t is unsigned! */
2761
2762         mutex_enter(&ipst->ips_numvifs_mutex);
2763         num_of_vifs = ipst->ips_numvifs;
2764         mutex_exit(&ipst->ips_numvifs_mutex);
2765
2766         for (vifi = num_of_vifs; vifi != 0; vifi--) {
2767                 tmp_vifi = vifi - 1;
2768                 if (ipst->ips_vifs[tmp_vifi].v_ipif == ipif) {
2769                         (void) del_vif(&tmp_vifi, ipst);
2770                 }
2771         }
2772 }
2773
2774 /* Remove pending upcall msgs when ill goes away.  Called by ill_delete.  */
2775 void
2776 reset_mrt_ill(ill_t *ill)
2777 {
2778         struct mfc      *rt;
2779         struct rtdetq   *rte;
2780         int             i;
2781         ip_stack_t      *ipst = ill->ill_ipst;
2782         conn_t          *mrouter = ipst->ips_ip_g_mrouter;
2783         timeout_id_t    id;
2784
2785         for (i = 0; i < MFCTBLSIZ; i++) {
2786                 MFCB_REFHOLD(&ipst->ips_mfcs[i]);
2787                 if ((rt = ipst->ips_mfcs[i].mfcb_mfc) != NULL) {
2788                         if (ipst->ips_ip_mrtdebug > 1) {
2789                                 (void) mi_strlog(mrouter->conn_rq, 1,
2790                                     SL_TRACE,
2791                                     "reset_mrt_ill: mfctable [%d]", i);
2792                         }
2793                         while (rt != NULL) {
2794                                 mutex_enter(&rt->mfc_mutex);
2795                                 while ((rte = rt->mfc_rte) != NULL) {
2796                                         if (rte->ill == ill &&
2797                                             (id = rt->mfc_timeout_id) != 0) {
2798                                                 /*
2799                                                  * Its ok to drop the lock,  the
2800                                                  * struct cannot be freed since
2801                                                  * we have a ref on the hash
2802                                                  * bucket.
2803                                                  */
2804                                                 mutex_exit(&rt->mfc_mutex);
2805                                                 (void) untimeout(id);
2806                                                 mutex_enter(&rt->mfc_mutex);
2807                                         }
2808                                         if (rte->ill == ill) {
2809                                                 if (ipst->ips_ip_mrtdebug > 1) {
2810                                                 (void) mi_strlog(
2811                                                     mrouter->conn_rq,
2812                                                     1, SL_TRACE,
2813                                                     "reset_mrt_ill: "
2814                                                     "ill 0x%p", (void *)ill);
2815                                                 }
2816                                                 rt->mfc_rte = rte->rte_next;
2817                                                 freemsg(rte->mp);
2818                                                 mi_free((char *)rte);
2819                                         }
2820                                 }
2821                                 mutex_exit(&rt->mfc_mutex);
2822                                 rt = rt->mfc_next;
2823                         }
2824                 }
2825                 MFCB_REFRELE(&ipst->ips_mfcs[i]);
2826         }
2827 }
2828
2829 /*
2830  * Token bucket filter module.
2831  * The ipha is for mcastgrp destination for phyint and encap.
2832  */
2833 static void
2834 tbf_control(struct vif *vifp, mblk_t *mp, ipha_t *ipha)
2835 {
2836         size_t  p_len =  msgdsize(mp);
2837         struct tbf      *t    = vifp->v_tbf;
2838         timeout_id_t id = 0;
2839         ill_t           *ill = vifp->v_ipif->ipif_ill;
2840         ip_stack_t      *ipst = ill->ill_ipst;
2841         conn_t          *mrouter = ipst->ips_ip_g_mrouter;
2842
2843         /* Drop if packet is too large */
2844         if (p_len > MAX_BKT_SIZE) {
2845                 ipst->ips_mrtstat->mrts_pkt2large++;
2846                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2847                 ip_drop_output("tbf_control - too large", mp, ill);
2848                 freemsg(mp);
2849                 return;
2850         }
2851         if (ipst->ips_ip_mrtdebug > 1) {
2852                 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2853                     "tbf_ctrl: SEND vif %ld, qlen %d, ipha_dst 0x%x",
2854                     (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_q_len,
2855                     ntohl(ipha->ipha_dst));
2856         }
2857
2858         mutex_enter(&t->tbf_lock);
2859
2860         tbf_update_tokens(vifp);
2861
2862         /*
2863          * If there are enough tokens,
2864          * and the queue is empty, send this packet out.
2865          */
2866         if (ipst->ips_ip_mrtdebug > 1) {
2867                 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2868                     "tbf_control: vif %ld, TOKENS  %d, pkt len  %lu, qlen  %d",
2869                     (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_n_tok, p_len,
2870                     t->tbf_q_len);
2871         }
2872         /* No packets are queued */
2873         if (t->tbf_q_len == 0) {
2874                 /* queue empty, send packet if enough tokens */
2875                 if (p_len <= t->tbf_n_tok) {
2876                         t->tbf_n_tok -= p_len;
2877                         mutex_exit(&t->tbf_lock);
2878                         tbf_send_packet(vifp, mp);
2879                         return;
2880                 } else {
2881                         /* Queue packet and timeout till later */
2882                         tbf_queue(vifp, mp);
2883                         ASSERT(vifp->v_timeout_id == 0);
2884                         vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp,
2885                             TBF_REPROCESS);
2886                 }
2887         } else if (t->tbf_q_len < t->tbf_max_q_len) {
2888                 /* Finite queue length, so queue pkts and process queue */
2889                 tbf_queue(vifp, mp);
2890                 tbf_process_q(vifp);
2891         } else {
2892                 /* Check that we have UDP header with IP header */
2893                 size_t hdr_length = IPH_HDR_LENGTH(ipha) +
2894                     sizeof (struct udphdr);
2895
2896                 if ((mp->b_wptr - mp->b_rptr) < hdr_length) {
2897                         if (!pullupmsg(mp, hdr_length)) {
2898                                 BUMP_MIB(ill->ill_ip_mib,
2899                                     ipIfStatsOutDiscards);
2900                                 ip_drop_output("tbf_control - pullup", mp, ill);
2901                                 freemsg(mp);
2902                                 ip1dbg(("tbf_ctl: couldn't pullup udp hdr, "
2903                                     "vif %ld src 0x%x dst 0x%x\n",
2904                                     (ptrdiff_t)(vifp - ipst->ips_vifs),
2905                                     ntohl(ipha->ipha_src),
2906                                     ntohl(ipha->ipha_dst)));
2907                                 mutex_exit(&vifp->v_tbf->tbf_lock);
2908                                 return;
2909                         } else
2910                                 /* Have to reassign ipha after pullupmsg */
2911                                 ipha = (ipha_t *)mp->b_rptr;
2912                 }
2913                 /*
2914                  * Queue length too much,
2915                  * try to selectively dq, or queue and process
2916                  */
2917                 if (!tbf_dq_sel(vifp, ipha)) {
2918                         ipst->ips_mrtstat->mrts_q_overflow++;
2919                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2920                         ip_drop_output("mrts_q_overflow", mp, ill);
2921                         freemsg(mp);
2922                 } else {
2923                         tbf_queue(vifp, mp);
2924                         tbf_process_q(vifp);
2925                 }
2926         }
2927         if (t->tbf_q_len == 0) {
2928                 id = vifp->v_timeout_id;
2929                 vifp->v_timeout_id = 0;
2930         }
2931         mutex_exit(&vifp->v_tbf->tbf_lock);
2932         if (id != 0)
2933                 (void) untimeout(id);
2934 }
2935
2936 /*
2937  * Adds a packet to the tbf queue at the interface.
2938  * The ipha is for mcastgrp destination for phyint and encap.
2939  */
2940 static void
2941 tbf_queue(struct vif *vifp, mblk_t *mp)
2942 {
2943         struct tbf      *t = vifp->v_tbf;
2944         ip_stack_t      *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2945         conn_t          *mrouter = ipst->ips_ip_g_mrouter;
2946
2947         if (ipst->ips_ip_mrtdebug > 1) {
2948                 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2949                     "tbf_queue: vif %ld", (ptrdiff_t)(vifp - ipst->ips_vifs));
2950         }
2951         ASSERT(MUTEX_HELD(&t->tbf_lock));
2952
2953         if (t->tbf_t == NULL) {
2954                 /* Queue was empty */
2955                 t->tbf_q = mp;
2956         } else {
2957                 /* Insert at tail */
2958                 t->tbf_t->b_next = mp;
2959         }
2960         /* set new tail pointer */
2961         t->tbf_t = mp;
2962
2963         mp->b_next = mp->b_prev = NULL;
2964
2965         t->tbf_q_len++;
2966 }
2967
2968 /*
2969  * Process the queue at the vif interface.
2970  * Drops the tbf_lock when sending packets.
2971  *
2972  * NOTE : The caller should quntimeout if the queue length is 0.
2973  */
2974 static void
2975 tbf_process_q(struct vif *vifp)
2976 {
2977         mblk_t  *mp;
2978         struct tbf      *t = vifp->v_tbf;
2979         size_t  len;
2980         ip_stack_t      *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2981         conn_t          *mrouter = ipst->ips_ip_g_mrouter;
2982
2983         if (ipst->ips_ip_mrtdebug > 1) {
2984                 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2985                     "tbf_process_q 1: vif %ld qlen = %d",
2986                     (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_q_len);
2987         }
2988
2989         /*
2990          * Loop through the queue at the interface and send
2991          * as many packets as possible.
2992          */
2993         ASSERT(MUTEX_HELD(&t->tbf_lock));
2994
2995         while (t->tbf_q_len > 0) {
2996                 mp = t->tbf_q;
2997                 len = (size_t)msgdsize(mp); /* length of ip pkt */
2998
2999                 /* Determine if the packet can be sent */
3000                 if (len <= t->tbf_n_tok) {
3001                         /*
3002                          * If so, reduce no. of tokens, dequeue the packet,
3003                          * send the packet.
3004                          */
3005                         t->tbf_n_tok -= len;
3006
3007                         t->tbf_q = mp->b_next;
3008                         if (--t->tbf_q_len == 0) {
3009                                 t->tbf_t = NULL;
3010                         }
3011                         mp->b_next = NULL;
3012                         /* Exit mutex before sending packet, then re-enter */
3013                         mutex_exit(&t->tbf_lock);
3014                         tbf_send_packet(vifp, mp);
3015                         mutex_enter(&t->tbf_lock);
3016                 } else
3017                         break;
3018         }
3019 }
3020
3021 /* Called at tbf timeout to update tokens, process q and reset timer.  */
3022 static void
3023 tbf_reprocess_q(void *arg)
3024 {
3025         struct vif *vifp = arg;
3026         ip_stack_t      *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
3027         conn_t          *mrouter = ipst->ips_ip_g_mrouter;
3028
3029         mutex_enter(&vifp->v_tbf->tbf_lock);
3030         vifp->v_timeout_id = 0;
3031         tbf_update_tokens(vifp);
3032
3033         tbf_process_q(vifp);
3034
3035         if (vifp->v_tbf->tbf_q_len > 0) {
3036                 vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp,
3037                     TBF_REPROCESS);
3038         }
3039         mutex_exit(&vifp->v_tbf->tbf_lock);
3040
3041         if (ipst->ips_ip_mrtdebug > 1) {
3042                 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3043                     "tbf_reprcess_q: vif %ld timeout id = %p",
3044                     (ptrdiff_t)(vifp - ipst->ips_vifs), vifp->v_timeout_id);
3045         }
3046 }
3047
3048 /*
3049  * Function that will selectively discard a member of the tbf queue,
3050  * based on the precedence value and the priority.
3051  *
3052  * NOTE : The caller should quntimeout if the queue length is 0.
3053  */
3054 static int
3055 tbf_dq_sel(struct vif *vifp, ipha_t *ipha)
3056 {
3057         uint_t          p;
3058         struct tbf              *t = vifp->v_tbf;
3059         mblk_t          **np;
3060         mblk_t          *last, *mp;
3061         ill_t           *ill = vifp->v_ipif->ipif_ill;
3062         ip_stack_t      *ipst = ill->ill_ipst;
3063         conn_t          *mrouter = ipst->ips_ip_g_mrouter;
3064
3065         if (ipst->ips_ip_mrtdebug > 1) {
3066                 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3067                     "dq_sel: vif %ld dst 0x%x",
3068                     (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(ipha->ipha_dst));
3069         }
3070
3071         ASSERT(MUTEX_HELD(&t->tbf_lock));
3072         p = priority(vifp, ipha);
3073
3074         np = &t->tbf_q;
3075         last = NULL;
3076         while ((mp = *np) != NULL) {
3077                 if (p > (priority(vifp, (ipha_t *)mp->b_rptr))) {
3078                         *np = mp->b_next;
3079                         /* If removing the last packet, fix the tail pointer */
3080                         if (mp == t->tbf_t)
3081                                 t->tbf_t = last;
3082                         mp->b_prev = mp->b_next = NULL;
3083                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
3084                         ip_drop_output("tbf_dq_send", mp, ill);
3085                         freemsg(mp);
3086                         /*
3087                          * It's impossible for the queue to be empty, but
3088                          * we check anyway.
3089                          */
3090                         if (--t->tbf_q_len == 0) {
3091                                 t->tbf_t = NULL;
3092                         }
3093                         ipst->ips_mrtstat->mrts_drop_sel++;
3094                         return (1);
3095                 }
3096                 np = &mp->b_next;
3097                 last = mp;
3098         }
3099         return (0);
3100 }
3101
3102 /* Sends packet, 2 cases - encap tunnel, phyint.  */
3103 static void
3104 tbf_send_packet(struct vif *vifp, mblk_t *mp)
3105 {
3106         ipif_t          *ipif = vifp->v_ipif;
3107         ill_t           *ill = ipif->ipif_ill;
3108         ip_stack_t      *ipst = ill->ill_ipst;
3109         conn_t          *mrouter = ipst->ips_ip_g_mrouter;
3110         ipha_t          *ipha;
3111
3112         ipha = (ipha_t *)mp->b_rptr;
3113         /* If encap tunnel options */
3114         if (vifp->v_flags & VIFF_TUNNEL)  {
3115                 ip_xmit_attr_t  ixas;
3116
3117                 if (ipst->ips_ip_mrtdebug > 1) {
3118                         (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3119                             "tbf_send_packet: ENCAP tunnel vif %ld",
3120                             (ptrdiff_t)(vifp - ipst->ips_vifs));
3121                 }
3122                 bzero(&ixas, sizeof (ixas));
3123                 ixas.ixa_flags =
3124                     IXAF_IS_IPV4 | IXAF_NO_TTL_CHANGE | IXAF_VERIFY_SOURCE;
3125                 ixas.ixa_ipst = ipst;
3126                 ixas.ixa_ifindex = 0;
3127                 ixas.ixa_cred = kcred;
3128                 ixas.ixa_cpid = NOPID;
3129                 ixas.ixa_tsl = NULL;
3130                 ixas.ixa_zoneid = GLOBAL_ZONEID; /* Multicast router in GZ */
3131                 ixas.ixa_pktlen = ntohs(ipha->ipha_length);
3132                 ixas.ixa_ip_hdr_length = IPH_HDR_LENGTH(ipha);
3133
3134                 /*
3135                  * Feed into ip_output_simple which will set the ident field
3136                  * and checksum the encapsulating header.
3137                  * BSD gets the cached route vifp->v_route from ip_output()
3138                  * to speed up route table lookups. Not necessary in SunOS 5.x.
3139                  * One could make multicast forwarding faster by putting an
3140                  * ip_xmit_attr_t in each vif thereby caching the ire/nce.
3141                  */
3142                 (void) ip_output_simple(mp, &ixas);
3143                 ixa_cleanup(&ixas);
3144                 return;
3145
3146                 /* phyint */
3147         } else {
3148                 /* Need to loop back to members on the outgoing interface. */
3149                 ipaddr_t        dst;
3150                 ip_recv_attr_t  iras;
3151                 nce_t           *nce;
3152
3153                 bzero(&iras, sizeof (iras));
3154                 iras.ira_flags = IRAF_IS_IPV4;
3155                 iras.ira_ill = iras.ira_rill = ill;
3156                 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
3157                 iras.ira_zoneid = GLOBAL_ZONEID; /* Multicast router in GZ */
3158                 iras.ira_pktlen = ntohs(ipha->ipha_length);
3159                 iras.ira_ip_hdr_length = IPH_HDR_LENGTH(ipha);
3160
3161                 dst = ipha->ipha_dst;
3162                 if (ill_hasmembers_v4(ill, dst)) {
3163                         iras.ira_flags |= IRAF_LOOPBACK_COPY;
3164                 }
3165                 if (ipst->ips_ip_mrtdebug > 1) {
3166                         (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3167                             "tbf_send_pkt: phyint forward  vif %ld dst = 0x%x",
3168                             (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(dst));
3169                 }
3170                 /*
3171                  * Find an NCE which matches the nexthop.
3172                  * For a pt-pt interface we use the other end of the pt-pt
3173                  * link.
3174                  */
3175                 if (ipif->ipif_flags & IPIF_POINTOPOINT) {
3176                         dst = ipif->ipif_pp_dst_addr;
3177                         nce = arp_nce_init(ill, dst, ill->ill_net_type);
3178                 } else {
3179                         nce = arp_nce_init(ill, dst, IRE_MULTICAST);
3180                 }
3181                 if (nce == NULL) {
3182                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
3183                         ip_drop_output("tbf_send_packet - no nce", mp, ill);
3184                         freemsg(mp);
3185                         return;
3186                 }
3187
3188                 /*
3189                  * We don't remeber the incoming ill. Thus we
3190                  * pretend the  packet arrived on the outbound ill. This means
3191                  * statistics for input errors will be increased on the wrong
3192                  * ill but that isn't a big deal.
3193                  */
3194                 ip_forward_xmit_v4(nce, ill, mp, ipha, &iras, ill->ill_mc_mtu,
3195                     0);
3196                 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
3197
3198                 nce_refrele(nce);
3199         }
3200 }
3201
3202 /*
3203  * Determine the current time and then the elapsed time (between the last time
3204  * and time now).  Update the no. of tokens in the bucket.
3205  */
3206 static void
3207 tbf_update_tokens(struct vif *vifp)
3208 {
3209         timespec_t      tp;
3210         hrtime_t        tm;
3211         struct tbf      *t = vifp->v_tbf;
3212         ip_stack_t      *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
3213         conn_t          *mrouter = ipst->ips_ip_g_mrouter;
3214
3215         ASSERT(MUTEX_HELD(&t->tbf_lock));
3216
3217         /* Time in secs and nsecs, rate limit in kbits/sec */
3218         gethrestime(&tp);
3219
3220         /*LINTED*/
3221         TV_DELTA(tp, t->tbf_last_pkt_t, tm);
3222
3223         /*
3224          * This formula is actually
3225          * "time in seconds" * "bytes/second".  Scaled for nsec.
3226          * (tm/1000000000) * (v_rate_limit * 1000 * (1000/1024) /8)
3227          *
3228          * The (1000/1024) was introduced in add_vif to optimize
3229          * this divide into a shift.
3230          */
3231         t->tbf_n_tok += (tm/1000) * vifp->v_rate_limit / 1024 / 8;
3232         t->tbf_last_pkt_t = tp;
3233
3234         if (t->tbf_n_tok > MAX_BKT_SIZE)
3235                 t->tbf_n_tok = MAX_BKT_SIZE;
3236         if (ipst->ips_ip_mrtdebug > 1) {
3237                 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3238                     "tbf_update_tok: tm %lld tok %d vif %ld",
3239                     tm, t->tbf_n_tok, (ptrdiff_t)(vifp - ipst->ips_vifs));
3240         }
3241 }
3242
3243 /*
3244  * Priority currently is based on port nos.
3245  * Different forwarding mechanisms have different ways
3246  * of obtaining the port no. Hence, the vif must be
3247  * given along with the packet itself.
3248  *
3249  */
3250 static int
3251 priority(struct vif *vifp, ipha_t *ipha)
3252 {
3253         int prio;
3254         ip_stack_t      *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
3255         conn_t          *mrouter = ipst->ips_ip_g_mrouter;
3256
3257         /* Temporary hack; may add general packet classifier some day */
3258
3259         ASSERT(MUTEX_HELD(&vifp->v_tbf->tbf_lock));
3260
3261         /*
3262          * The UDP port space is divided up into four priority ranges:
3263          * [0, 16384)   : unclassified - lowest priority
3264          * [16384, 32768)       : audio - highest priority
3265          * [32768, 49152)       : whiteboard - medium priority
3266          * [49152, 65536)       : video - low priority
3267          */
3268
3269         if (ipha->ipha_protocol == IPPROTO_UDP) {
3270                 struct udphdr *udp =
3271                     (struct udphdr *)((char *)ipha + IPH_HDR_LENGTH(ipha));
3272                 switch (ntohs(udp->uh_dport) & 0xc000) {
3273                 case 0x4000:
3274                         prio = 70;
3275                         break;
3276                 case 0x8000:
3277                         prio = 60;
3278                         break;
3279                 case 0xc000:
3280                         prio = 55;
3281                         break;
3282                 default:
3283                         prio = 50;
3284                         break;
3285                 }
3286                 if (ipst->ips_ip_mrtdebug > 1) {
3287                         (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3288                             "priority: port %x prio %d\n",
3289                             ntohs(udp->uh_dport), prio);
3290                 }
3291         } else
3292                 prio = 50;  /* default priority */
3293         return (prio);
3294 }
3295
3296 /*
3297  * End of token bucket filter modifications
3298  */
3299
3300
3301
3302 /*
3303  * Produces data for netstat -M.
3304  */
3305 int
3306 ip_mroute_stats(mblk_t *mp, ip_stack_t *ipst)
3307 {
3308         ipst->ips_mrtstat->mrts_vifctlSize = sizeof (struct vifctl);
3309         ipst->ips_mrtstat->mrts_mfcctlSize = sizeof (struct mfcctl);
3310         if (!snmp_append_data(mp, (char *)ipst->ips_mrtstat,
3311                 sizeof (struct mrtstat))) {
3312                 ip0dbg(("ip_mroute_stats: failed %ld bytes\n",
3313                     (size_t)sizeof (struct mrtstat)));
3314                 return (0);
3315         }
3316         return (1);
3317 }
3318
3319 /*
3320  * Sends info for SNMP's MIB.
3321  */
3322 int
3323 ip_mroute_vif(mblk_t *mp, ip_stack_t *ipst)
3324 {
3325         struct vifctl   vi;
3326         vifi_t          vifi;
3327
3328         mutex_enter(&ipst->ips_numvifs_mutex);
3329         for (vifi = 0; vifi < ipst->ips_numvifs; vifi++) {
3330                 if (ipst->ips_vifs[vifi].v_lcl_addr.s_addr == 0)
3331                         continue;
3332                 /*
3333                  * No locks here, an approximation is fine.
3334                  */
3335                 vi.vifc_vifi = vifi;
3336                 vi.vifc_flags = ipst->ips_vifs[vifi].v_flags;
3337                 vi.vifc_threshold = ipst->ips_vifs[vifi].v_threshold;
3338                 vi.vifc_rate_limit      = ipst->ips_vifs[vifi].v_rate_limit;
3339                 vi.vifc_lcl_addr        = ipst->ips_vifs[vifi].v_lcl_addr;
3340                 vi.vifc_rmt_addr        = ipst->ips_vifs[vifi].v_rmt_addr;
3341                 vi.vifc_pkt_in          = ipst->ips_vifs[vifi].v_pkt_in;
3342                 vi.vifc_pkt_out         = ipst->ips_vifs[vifi].v_pkt_out;
3343
3344                 if (!snmp_append_data(mp, (char *)&vi, sizeof (vi))) {
3345                         ip0dbg(("ip_mroute_vif: failed %ld bytes\n",
3346                             (size_t)sizeof (vi)));
3347                         mutex_exit(&ipst->ips_numvifs_mutex);
3348                         return (0);
3349                 }
3350         }
3351         mutex_exit(&ipst->ips_numvifs_mutex);
3352         return (1);
3353 }
3354
3355 /*
3356  * Called by ip_snmp_get to send up multicast routing table.
3357  */
3358 int
3359 ip_mroute_mrt(mblk_t *mp, ip_stack_t *ipst)
3360 {
3361         int                     i, j;
3362         struct mfc              *rt;
3363         struct mfcctl   mfcc;
3364
3365         /*
3366          * Make sure multicast has not been turned off.
3367          */
3368         if (is_mrouter_off(ipst))
3369                 return (1);
3370
3371         /* Loop over all hash buckets and their chains */
3372         for (i = 0; i < MFCTBLSIZ; i++) {
3373                 MFCB_REFHOLD(&ipst->ips_mfcs[i]);
3374                 for (rt = ipst->ips_mfcs[i].mfcb_mfc; rt; rt = rt->mfc_next) {
3375                         mutex_enter(&rt->mfc_mutex);
3376                         if (rt->mfc_rte != NULL ||
3377                             (rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
3378                                 mutex_exit(&rt->mfc_mutex);
3379                                 continue;
3380                         }
3381                         mfcc.mfcc_origin = rt->mfc_origin;
3382                         mfcc.mfcc_mcastgrp = rt->mfc_mcastgrp;
3383                         mfcc.mfcc_parent = rt->mfc_parent;
3384                         mfcc.mfcc_pkt_cnt = rt->mfc_pkt_cnt;
3385                         mutex_enter(&ipst->ips_numvifs_mutex);
3386                         for (j = 0; j < (int)ipst->ips_numvifs; j++)
3387                                 mfcc.mfcc_ttls[j] = rt->mfc_ttls[j];
3388                         for (j = (int)ipst->ips_numvifs; j < MAXVIFS; j++)
3389                                 mfcc.mfcc_ttls[j] = 0;
3390                         mutex_exit(&ipst->ips_numvifs_mutex);
3391
3392                         mutex_exit(&rt->mfc_mutex);
3393                         if (!snmp_append_data(mp, (char *)&mfcc,
3394                             sizeof (mfcc))) {
3395                                 MFCB_REFRELE(&ipst->ips_mfcs[i]);
3396                                 ip0dbg(("ip_mroute_mrt: failed %ld bytes\n",
3397                                     (size_t)sizeof (mfcc)));
3398                                 return (0);
3399                         }
3400                 }
3401                 MFCB_REFRELE(&ipst->ips_mfcs[i]);
3402         }
3403         return (1);
3404 }