usr/src/uts/common/inet/ip/ip_mroute.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 /* Copyright (c) 1990 Mentat Inc. */
  25
  26 /*
  27  * Procedures for the kernel part of DVMRP,
  28  * a Distance-Vector Multicast Routing Protocol.
  29  * (See RFC-1075)
  30  * Written by David Waitzman, BBN Labs, August 1988.
  31  * Modified by Steve Deering, Stanford, February 1989.
  32  * Modified by Mark J. Steiglitz, Stanford, May, 1991
  33  * Modified by Van Jacobson, LBL, January 1993
  34  * Modified by Ajit Thyagarajan, PARC, August 1993
  35  * Modified by Bill Fenner, PARC, April 1995
  36  *
  37  * MROUTING 3.5
  38  */
  39
  40 /*
  41  * TODO
  42  * - function pointer field in vif, void *vif_sendit()
  43  */
  44
  45 #include <sys/types.h>
  46 #include <sys/stream.h>
  47 #include <sys/stropts.h>
  48 #include <sys/strlog.h>
  49 #include <sys/systm.h>
  50 #include <sys/ddi.h>
  51 #include <sys/cmn_err.h>
  52 #include <sys/zone.h>
  53
  54 #include <sys/param.h>
  55 #include <sys/socket.h>
  56 #include <sys/vtrace.h>
  57 #include <sys/debug.h>
  58 #include <net/if.h>
  59 #include <sys/sockio.h>
  60 #include <netinet/in.h>
  61 #include <net/if_dl.h>
  62
  63 #include <inet/ipsec_impl.h>
  64 #include <inet/common.h>
  65 #include <inet/mi.h>
  66 #include <inet/nd.h>
  67 #include <inet/tunables.h>
  68 #include <inet/mib2.h>
  69 #include <netinet/ip6.h>
  70 #include <inet/ip.h>
  71 #include <inet/snmpcom.h>
  72
  73 #include <netinet/igmp.h>
  74 #include <netinet/igmp_var.h>
  75 #include <netinet/udp.h>
  76 #include <netinet/ip_mroute.h>
  77 #include <inet/ip_multi.h>
  78 #include <inet/ip_ire.h>
  79 #include <inet/ip_ndp.h>
  80 #include <inet/ip_if.h>
  81 #include <inet/ipclassifier.h>
  82
  83 #include <netinet/pim.h>
  84
  85
  86 /*
  87  * MT Design:
  88  *
  89  * There are three main data structures viftable, mfctable and tbftable that
  90  * need to be protected against MT races.
  91  *
   92  * viftable is a fixed length array of vif structs. There is no lock to protect
   93  * the whole array, instead each struct is protected by its own individual lock.
   94  * The value of v_marks in conjunction with the value of v_refcnt determines the
   95  * current state of a vif structure. One special state that needs mention
   96  * is when the vif is marked VIF_MARK_NOTINUSE but refcnt != 0. This indicates
   97  * that vif is being initialized.
   98  * Each structure is freed when the refcnt goes down to zero. If a delete comes
   99  * in when the refcnt is > 1, the vif structure is marked VIF_MARK_CONDEMNED
 100  * which prevents the struct from further use.  When the refcnt goes to zero
 101  * the struct is freed and is marked VIF_MARK_NOTINUSE.
 102  * vif struct stores a pointer to the ipif in v_ipif, to prevent ipif/ill
 103  * from  going away a refhold is put on the ipif before using it. see
 104  * lock_good_vif() and unlock_good_vif().
 105  *
 106  * VIF_REFHOLD and VIF_REFRELE macros have been provided to manipulate refcnts
 107  * of the vif struct.
 108  *
 109  * tbftable is also a fixed length array of tbf structs and is only accessed
 110  * via v_tbf.  It is protected by its own lock tbf_lock.
 111  *
 112  * Lock Ordering is
 113  * v_lock --> tbf_lock
  114  * v_lock --> ill_lock
  115  *
  116  * mfctable is a fixed size hash table of mfc bucket structs (struct mfcb).
 117  * Each mfc bucket struct (struct mfcb) maintains a refcnt for each walker,
 118  * it also maintains a state. These fields are protected by a lock (mfcb_lock).
 119  * mfc structs only maintain a state and have no refcnt. mfc_mutex is used to
 120  * protect the struct elements.
 121  *
 122  * mfc structs are dynamically allocated and are singly linked
 123  * at the head of the chain. When an mfc structure is to be deleted
 124  * it is marked condemned and so is the state in the bucket struct.
  125  * When the last walker of the hash bucket exits all the mfc structs
  126  * marked condemned are freed.
 127  *
 128  * Locking Hierarchy:
 129  * The bucket lock should be acquired before the mfc struct lock.
 130  * MFCB_REFHOLD and MFCB_REFRELE macros are provided for locking
 131  * operations on the bucket struct.
 132  *
  133  * last_encap_lock and numvifs_mutex should be acquired after
  134  * acquiring vif or mfc locks. These locks protect some global variables.
  135  *
  136  * The statistics are not currently protected by a lock
  137  * causing the stats to be approximate, not exact.
 138  */
 139
 140 #define NO_VIF  MAXVIFS         /* from mrouted, no route for src */
 141
 142 /*
 143  * Timeouts:
 144  *      Upcall timeouts - BSD uses boolean_t mfc->expire and
 145  *      nexpire[MFCTBLSIZE], the number of times expire has been called.
 146  *      SunOS 5.x uses mfc->timeout for each mfc.
 147  *      Some Unixes are limited in the number of simultaneous timeouts
 148  *      that can be run, SunOS 5.x does not have this restriction.
 149  */
 150
 151 /*
 152  * In BSD, EXPIRE_TIMEOUT is how often expire_upcalls() is called and
 153  * UPCALL_EXPIRE is the nmber of timeouts before a particular upcall
 154  * expires. Thus the time till expiration is EXPIRE_TIMEOUT * UPCALL_EXPIRE
 155  */
 156 #define         EXPIRE_TIMEOUT  (hz/4)  /* 4x / second  */
 157 #define         UPCALL_EXPIRE   6       /* number of timeouts   */
 158
 159 /*
 160  * Hash function for a source, group entry
 161  */
 162 #define MFCHASH(a, g) MFCHASHMOD(((a) >> 20) ^ ((a) >> 10) ^ (a) ^ \
 163         ((g) >> 20) ^ ((g) >> 10) ^ (g))
 164
 165 #define                 TBF_REPROCESS   (hz / 100)      /* 100x /second */
 166
 167 /* Identify PIM packet that came on a Register interface */
 168 #define PIM_REGISTER_MARKER     0xffffffff
 169
 170 /* Function declarations */
 171 static int      add_mfc(struct mfcctl *, ip_stack_t *);
 172 static int      add_vif(struct vifctl *, conn_t *, ip_stack_t *);
 173 static int      del_mfc(struct mfcctl *, ip_stack_t *);
 174 static int      del_vif(vifi_t *, ip_stack_t *);
 175 static void     del_vifp(struct vif *);
 176 static void     encap_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
 177 static void     expire_upcalls(void *);
 178 static void     fill_route(struct mfc *, struct mfcctl *, ip_stack_t *);
 179 static void     free_queue(struct mfc *);
 180 static int      get_assert(uchar_t *, ip_stack_t *);
 181 static int      get_lsg_cnt(struct sioc_lsg_req *, ip_stack_t *);
 182 static int      get_sg_cnt(struct sioc_sg_req *, ip_stack_t *);
 183 static int      get_version(uchar_t *);
 184 static int      get_vif_cnt(struct sioc_vif_req *, ip_stack_t *);
 185 static int      ip_mdq(mblk_t *, ipha_t *, ill_t *,
 186                     ipaddr_t, struct mfc *);
 187 static int      ip_mrouter_init(conn_t *, uchar_t *, int, ip_stack_t *);
 188 static void     phyint_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
 189 static int      register_mforward(mblk_t *, ip_recv_attr_t *);
 190 static void     register_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
 191 static int      set_assert(int *, ip_stack_t *);
 192
 193 /*
 194  * Token Bucket Filter functions
 195  */
 196 static int  priority(struct vif *, ipha_t *);
 197 static void tbf_control(struct vif *, mblk_t *, ipha_t *);
 198 static int  tbf_dq_sel(struct vif *, ipha_t *);
 199 static void tbf_process_q(struct vif *);
 200 static void tbf_queue(struct vif *, mblk_t *);
 201 static void tbf_reprocess_q(void *);
 202 static void tbf_send_packet(struct vif *, mblk_t *);
 203 static void tbf_update_tokens(struct vif *);
 204 static void release_mfc(struct mfcb *);
 205
 206 static boolean_t is_mrouter_off(ip_stack_t *);
 207 /*
 208  * Encapsulation packets
 209  */
 210
 211 #define ENCAP_TTL       64
 212
 213 /* prototype IP hdr for encapsulated packets */
 214 static ipha_t multicast_encap_iphdr = {
 215         IP_SIMPLE_HDR_VERSION,
 216         0,                              /* tos */
 217         sizeof (ipha_t),                /* total length */
 218         0,                              /* id */
 219         0,                              /* frag offset */
 220         ENCAP_TTL, IPPROTO_ENCAP,
 221         0,                              /* checksum */
 222 };
 223
 224 /*
 225  * Rate limit for assert notification messages, in nsec.
 226  */
 227 #define ASSERT_MSG_TIME         3000000000
 228
 229
 230 #define VIF_REFHOLD(vifp) {                     \
 231         mutex_enter(&(vifp)->v_lock);           \
 232         (vifp)->v_refcnt++;                     \
 233         mutex_exit(&(vifp)->v_lock);            \
 234 }
 235
 236 #define VIF_REFRELE_LOCKED(vifp) {                              \
 237         (vifp)->v_refcnt--;                                     \
 238         if ((vifp)->v_refcnt == 0 &&                            \
 239                 ((vifp)->v_marks & VIF_MARK_CONDEMNED)) {       \
 240                         del_vifp(vifp);                         \
 241         } else {                                                \
 242                 mutex_exit(&(vifp)->v_lock);                    \
 243         }                                                       \
 244 }
 245
 246 #define VIF_REFRELE(vifp) {                                     \
 247         mutex_enter(&(vifp)->v_lock);                           \
 248         (vifp)->v_refcnt--;                                     \
 249         if ((vifp)->v_refcnt == 0 &&                            \
 250                 ((vifp)->v_marks & VIF_MARK_CONDEMNED)) {       \
 251                         del_vifp(vifp);                         \
 252         } else {                                                \
 253                 mutex_exit(&(vifp)->v_lock);                    \
 254         }                                                       \
 255 }
 256
 257 #define MFCB_REFHOLD(mfcb) {                            \
 258         mutex_enter(&(mfcb)->mfcb_lock);                \
 259         (mfcb)->mfcb_refcnt++;                          \
 260         ASSERT((mfcb)->mfcb_refcnt != 0);               \
 261         mutex_exit(&(mfcb)->mfcb_lock);                 \
 262 }
 263
 264 #define MFCB_REFRELE(mfcb) {                                    \
 265         mutex_enter(&(mfcb)->mfcb_lock);                        \
 266         ASSERT((mfcb)->mfcb_refcnt != 0);                       \
 267         if (--(mfcb)->mfcb_refcnt == 0 &&                       \
 268                 ((mfcb)->mfcb_marks & MFCB_MARK_CONDEMNED)) {   \
 269                         release_mfc(mfcb);                      \
 270         }                                                       \
 271         mutex_exit(&(mfcb)->mfcb_lock);                         \
 272 }
 273
 274 /*
 275  * MFCFIND:
 276  * Find a route for a given origin IP address and multicast group address.
 277  * Skip entries with pending upcalls.
 278  * Type of service parameter to be added in the future!
 279  */
 280 #define MFCFIND(mfcbp, o, g, rt) { \
 281         struct mfc *_mb_rt = NULL; \
 282         rt = NULL; \
 283         _mb_rt = mfcbp->mfcb_mfc; \
 284         while (_mb_rt) { \
 285                 if ((_mb_rt->mfc_origin.s_addr == o) && \
 286                     (_mb_rt->mfc_mcastgrp.s_addr == g) && \
 287                     (_mb_rt->mfc_rte == NULL) && \
 288                     (!(_mb_rt->mfc_marks & MFCB_MARK_CONDEMNED))) {        \
 289                     rt = _mb_rt; \
 290                     break; \
 291                 } \
 292         _mb_rt = _mb_rt->mfc_next; \
 293         } \
 294 }
 295
 296 /*
 297  * BSD uses timeval with sec and usec. In SunOS 5.x uniqtime() and gethrtime()
 298  * are inefficient. We use gethrestime() which returns a timespec_t with
 299  * sec and nsec, the resolution is machine dependent.
 300  * The following 2 macros have been changed to use nsec instead of usec.
 301  */
 302 /*
 303  * Macros to compute elapsed time efficiently.
 304  * Borrowed from Van Jacobson's scheduling code.
 305  * Delta should be a hrtime_t.
 306  */
 307 #define TV_DELTA(a, b, delta) { \
 308         int xxs; \
 309  \
 310         delta = (a).tv_nsec - (b).tv_nsec; \
 311         if ((xxs = (a).tv_sec - (b).tv_sec) != 0) { \
 312                 switch (xxs) { \
 313                 case 2: \
 314                     delta += 1000000000; \
 315                     /*FALLTHROUGH*/ \
 316                 case 1: \
 317                     delta += 1000000000; \
 318                     break; \
 319                 default: \
 320                     delta += (1000000000 * xxs); \
 321                 } \
 322         } \
 323 }
 324
 325 #define TV_LT(a, b) (((a).tv_nsec < (b).tv_nsec && \
 326         (a).tv_sec <= (b).tv_sec) || (a).tv_sec < (b).tv_sec)
 327
 328 /*
 329  * Handle MRT setsockopt commands to modify the multicast routing tables.
 330  */
 331 int
 332 ip_mrouter_set(int cmd, conn_t *connp, int checkonly, uchar_t *data,
 333     int datalen)
 334 {
 335         ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
 336
 337         mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
 338         if (cmd != MRT_INIT && connp != ipst->ips_ip_g_mrouter) {
 339                 mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
 340                 return (EACCES);
 341         }
 342         mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
 343
 344         if (checkonly) {
 345                 /*
 346                  * do not do operation, just pretend to - new T_CHECK
 347                  * Note: Even routines further on can probably fail but
 348                  * this T_CHECK stuff is only to please XTI so it not
 349                  * necessary to be perfect.
 350                  */
 351                 switch (cmd) {
 352                 case MRT_INIT:
 353                 case MRT_DONE:
 354                 case MRT_ADD_VIF:
 355                 case MRT_DEL_VIF:
 356                 case MRT_ADD_MFC:
 357                 case MRT_DEL_MFC:
 358                 case MRT_ASSERT:
 359                         return (0);
 360                 default:
 361                         return (EOPNOTSUPP);
 362                 }
 363         }
 364
 365         /*
 366          * make sure no command is issued after multicast routing has been
 367          * turned off.
 368          */
 369         if (cmd != MRT_INIT && cmd != MRT_DONE) {
 370                 if (is_mrouter_off(ipst))
 371                         return (EINVAL);
 372         }
 373
 374         switch (cmd) {
 375         case MRT_INIT:  return (ip_mrouter_init(connp, data, datalen, ipst));
 376         case MRT_DONE:  return (ip_mrouter_done(ipst));
 377         case MRT_ADD_VIF:  return (add_vif((struct vifctl *)data, connp, ipst));
 378         case MRT_DEL_VIF:  return (del_vif((vifi_t *)data, ipst));
 379         case MRT_ADD_MFC:  return (add_mfc((struct mfcctl *)data, ipst));
 380         case MRT_DEL_MFC:  return (del_mfc((struct mfcctl *)data, ipst));
 381         case MRT_ASSERT:   return (set_assert((int *)data, ipst));
 382         default:           return (EOPNOTSUPP);
 383         }
 384 }
 385
 386 /*
 387  * Handle MRT getsockopt commands
 388  */
 389 int
 390 ip_mrouter_get(int cmd, conn_t *connp, uchar_t *data)
 391 {
 392         ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
 393
 394         if (connp != ipst->ips_ip_g_mrouter)
 395                 return (EACCES);
 396
 397         switch (cmd) {
 398         case MRT_VERSION:       return (get_version((uchar_t *)data));
 399         case MRT_ASSERT:        return (get_assert((uchar_t *)data, ipst));
 400         default:                return (EOPNOTSUPP);
 401         }
 402 }
 403
 404 /*
 405  * Handle ioctl commands to obtain information from the cache.
 406  * Called with shared access to IP. These are read_only ioctls.
 407  */
 408 /* ARGSUSED */
 409 int
 410 mrt_ioctl(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
 411     ip_ioctl_cmd_t *ipip, void *if_req)
 412 {
 413         mblk_t  *mp1;
 414         struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
 415         conn_t          *connp = Q_TO_CONN(q);
 416         ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
 417
 418         /* Existence verified in ip_wput_nondata */
 419         mp1 = mp->b_cont->b_cont;
 420
 421         switch (iocp->ioc_cmd) {
 422         case (SIOCGETVIFCNT):
 423                 return (get_vif_cnt((struct sioc_vif_req *)mp1->b_rptr, ipst));
 424         case (SIOCGETSGCNT):
 425                 return (get_sg_cnt((struct sioc_sg_req *)mp1->b_rptr, ipst));
 426         case (SIOCGETLSGCNT):
 427                 return (get_lsg_cnt((struct sioc_lsg_req *)mp1->b_rptr, ipst));
 428         default:
 429                 return (EINVAL);
 430         }
 431 }
 432
 433 /*
 434  * Returns the packet, byte, rpf-failure count for the source, group provided.
 435  */
 436 static int
 437 get_sg_cnt(struct sioc_sg_req *req, ip_stack_t *ipst)
 438 {
 439         struct mfc *rt;
 440         struct mfcb *mfcbp;
 441
 442         mfcbp = &ipst->ips_mfcs[MFCHASH(req->src.s_addr, req->grp.s_addr)];
 443         MFCB_REFHOLD(mfcbp);
 444         MFCFIND(mfcbp, req->src.s_addr, req->grp.s_addr, rt);
 445
 446         if (rt != NULL) {
 447                 mutex_enter(&rt->mfc_mutex);
 448                 req->pktcnt   = rt->mfc_pkt_cnt;
 449                 req->bytecnt  = rt->mfc_byte_cnt;
 450                 req->wrong_if = rt->mfc_wrong_if;
 451                 mutex_exit(&rt->mfc_mutex);
 452         } else
 453                 req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffffU;
 454
 455         MFCB_REFRELE(mfcbp);
 456         return (0);
 457 }
 458
 459 /*
 460  * Returns the packet, byte, rpf-failure count for the source, group provided.
 461  * Uses larger counters and IPv6 addresses.
 462  */
 463 /* ARGSUSED XXX until implemented */
 464 static int
 465 get_lsg_cnt(struct sioc_lsg_req *req, ip_stack_t *ipst)
 466 {
 467         /* XXX TODO SIOCGETLSGCNT */
 468         return (ENXIO);
 469 }
 470
 471 /*
 472  * Returns the input and output packet and byte counts on the vif provided.
 473  */
 474 static int
 475 get_vif_cnt(struct sioc_vif_req *req, ip_stack_t *ipst)
 476 {
 477         vifi_t vifi = req->vifi;
 478
 479         if (vifi >= ipst->ips_numvifs)
 480                 return (EINVAL);
 481
 482         /*
 483          * No locks here, an approximation is fine.
 484          */
 485         req->icount = ipst->ips_vifs[vifi].v_pkt_in;
 486         req->ocount = ipst->ips_vifs[vifi].v_pkt_out;
 487         req->ibytes = ipst->ips_vifs[vifi].v_bytes_in;
 488         req->obytes = ipst->ips_vifs[vifi].v_bytes_out;
 489
 490         return (0);
 491 }
 492
 493 static int
 494 get_version(uchar_t *data)
 495 {
 496         int *v = (int *)data;
 497
 498         *v = 0x0305;    /* XXX !!!! */
 499
 500         return (0);
 501 }
 502
 503 /*
 504  * Set PIM assert processing global.
 505  */
 506 static int
 507 set_assert(int *i, ip_stack_t *ipst)
 508 {
 509         if ((*i != 1) && (*i != 0))
 510                 return (EINVAL);
 511
 512         ipst->ips_pim_assert = *i;
 513
 514         return (0);
 515 }
 516
 517 /*
 518  * Get PIM assert processing global.
 519  */
 520 static int
 521 get_assert(uchar_t *data, ip_stack_t *ipst)
 522 {
 523         int *i = (int *)data;
 524
 525         *i = ipst->ips_pim_assert;
 526
 527         return (0);
 528 }
 529
 530 /*
 531  * Enable multicast routing.
 532  */
 533 static int
 534 ip_mrouter_init(conn_t *connp, uchar_t *data, int datalen, ip_stack_t *ipst)
 535 {
 536         int     *v;
 537
 538         if (data == NULL || (datalen != sizeof (int)))
 539                 return (ENOPROTOOPT);
 540
 541         v = (int *)data;
 542         if (*v != 1)
 543                 return (ENOPROTOOPT);
 544
 545         mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
 546         if (ipst->ips_ip_g_mrouter != NULL) {
 547                 mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
 548                 return (EADDRINUSE);
 549         }
 550
 551         /*
 552          * MRT_INIT should only be allowed for RAW sockets, but we double
 553          * check.
 554          */
 555         if (!IPCL_IS_RAWIP(connp)) {
 556                 mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
 557                 return (EINVAL);
 558         }
 559
 560         ipst->ips_ip_g_mrouter = connp;
 561         connp->conn_multi_router = 1;
 562         /* In order for tunnels to work we have to turn ip_g_forward on */
 563         if (!WE_ARE_FORWARDING(ipst)) {
 564                 if (ipst->ips_ip_mrtdebug > 1) {
 565                         (void) mi_strlog(connp->conn_rq, 1, SL_TRACE,
 566                             "ip_mrouter_init: turning on forwarding");
 567                 }
 568                 ipst->ips_saved_ip_forwarding = ipst->ips_ip_forwarding;
 569                 ipst->ips_ip_forwarding = IP_FORWARD_ALWAYS;
 570         }
 571
 572         mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
 573         return (0);
 574 }
 575
 576 void
 577 ip_mrouter_stack_init(ip_stack_t *ipst)
 578 {
 579         mutex_init(&ipst->ips_ip_g_mrouter_mutex, NULL, MUTEX_DEFAULT, NULL);
 580
 581         ipst->ips_vifs = kmem_zalloc(sizeof (struct vif) * (MAXVIFS+1),
 582             KM_SLEEP);
 583         ipst->ips_mrtstat = kmem_zalloc(sizeof (struct mrtstat), KM_SLEEP);
 584         /*
 585          * mfctable:
 586          * Includes all mfcs, including waiting upcalls.
 587          * Multiple mfcs per bucket.
 588          */
 589         ipst->ips_mfcs = kmem_zalloc(sizeof (struct mfcb) * MFCTBLSIZ,
 590             KM_SLEEP);
 591         /*
 592          * Define the token bucket filter structures.
 593          * tbftable -> each vif has one of these for storing info.
 594          */
 595         ipst->ips_tbfs = kmem_zalloc(sizeof (struct tbf) * MAXVIFS, KM_SLEEP);
 596
 597         mutex_init(&ipst->ips_last_encap_lock, NULL, MUTEX_DEFAULT, NULL);
 598
 599         ipst->ips_mrtstat->mrts_vifctlSize = sizeof (struct vifctl);
 600         ipst->ips_mrtstat->mrts_mfcctlSize = sizeof (struct mfcctl);
 601 }
 602
 603 /*
 604  * Disable multicast routing.
 605  * Didn't use global timeout_val (BSD version), instead check the mfctable.
 606  */
 607 int
 608 ip_mrouter_done(ip_stack_t *ipst)
 609 {
 610         conn_t          *mrouter;
 611         vifi_t          vifi;
 612         struct mfc      *mfc_rt;
 613         int             i;
 614
 615         mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
 616         if (ipst->ips_ip_g_mrouter == NULL) {
 617                 mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
 618                 return (EINVAL);
 619         }
 620
 621         mrouter = ipst->ips_ip_g_mrouter;
 622
 623         if (ipst->ips_saved_ip_forwarding != -1) {
 624                 if (ipst->ips_ip_mrtdebug > 1) {
 625                         (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
 626                             "ip_mrouter_done: turning off forwarding");
 627                 }
 628                 ipst->ips_ip_forwarding = ipst->ips_saved_ip_forwarding;
 629                 ipst->ips_saved_ip_forwarding = -1;
 630         }
 631
 632         /*
 633          * Always clear cache when vifs change.
 634          * No need to get ipst->ips_last_encap_lock since we are running as
 635          * a writer.
 636          */
 637         mutex_enter(&ipst->ips_last_encap_lock);
 638         ipst->ips_last_encap_src = 0;
 639         ipst->ips_last_encap_vif = NULL;
 640         mutex_exit(&ipst->ips_last_encap_lock);
 641         mrouter->conn_multi_router = 0;
 642
 643         mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
 644
 645         /*
 646          * For each phyint in use,
 647          * disable promiscuous reception of all IP multicasts.
 648          */
 649         for (vifi = 0; vifi < MAXVIFS; vifi++) {
 650                 struct vif *vifp = ipst->ips_vifs + vifi;
 651
 652                 mutex_enter(&vifp->v_lock);
 653                 /*
 654                  * if the vif is active mark it condemned.
 655                  */
 656                 if (vifp->v_marks & VIF_MARK_GOOD) {
 657                         ASSERT(vifp->v_ipif != NULL);
 658                         ipif_refhold(vifp->v_ipif);
 659                         /* Phyint only */
 660                         if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
 661                                 ipif_t *ipif = vifp->v_ipif;
 662                                 ilm_t *ilm = vifp->v_ilm;
 663
 664                                 vifp->v_ilm = NULL;
 665                                 vifp->v_marks &= ~VIF_MARK_GOOD;
 666                                 vifp->v_marks |= VIF_MARK_CONDEMNED;
 667
 668                                 mutex_exit(&(vifp)->v_lock);
 669                                 if (ilm != NULL) {
 670                                         ill_t *ill = ipif->ipif_ill;
 671
 672                                         (void) ip_delmulti(ilm);
 673                                         ASSERT(ill->ill_mrouter_cnt > 0);
 674                                         atomic_dec_32(&ill->ill_mrouter_cnt);
 675                                 }
 676                                 mutex_enter(&vifp->v_lock);
 677                         }
 678                         ipif_refrele(vifp->v_ipif);
 679                         /*
 680                          * decreases the refcnt added in add_vif.
 681                          * and release v_lock.
 682                          */
 683                         VIF_REFRELE_LOCKED(vifp);
 684                 } else {
 685                         mutex_exit(&vifp->v_lock);
 686                         continue;
 687                 }
 688         }
 689
 690         mutex_enter(&ipst->ips_numvifs_mutex);
 691         ipst->ips_numvifs = 0;
 692         ipst->ips_pim_assert = 0;
 693         ipst->ips_reg_vif_num = ALL_VIFS;
 694         mutex_exit(&ipst->ips_numvifs_mutex);
 695
 696         /*
 697          * Free upcall msgs.
 698          * Go through mfctable and stop any outstanding upcall
 699          * timeouts remaining on mfcs.
 700          */
 701         for (i = 0; i < MFCTBLSIZ; i++) {
 702                 mutex_enter(&ipst->ips_mfcs[i].mfcb_lock);
 703                 ipst->ips_mfcs[i].mfcb_refcnt++;
 704                 ipst->ips_mfcs[i].mfcb_marks |= MFCB_MARK_CONDEMNED;
 705                 mutex_exit(&ipst->ips_mfcs[i].mfcb_lock);
 706                 mfc_rt = ipst->ips_mfcs[i].mfcb_mfc;
 707                 while (mfc_rt) {
 708                         /* Free upcalls */
 709                         mutex_enter(&mfc_rt->mfc_mutex);
 710                         if (mfc_rt->mfc_rte != NULL) {
 711                                 if (mfc_rt->mfc_timeout_id != 0) {
 712                                         /*
 713                                          * OK to drop the lock as we have
 714                                          * a refcnt on the bucket. timeout
 715                                          * can fire but it will see that
 716                                          * mfc_timeout_id == 0 and not do
 717                                          * anything. see expire_upcalls().
 718                                          */
 719                                         mfc_rt->mfc_timeout_id = 0;
 720                                         mutex_exit(&mfc_rt->mfc_mutex);
 721                                         (void) untimeout(
 722                                             mfc_rt->mfc_timeout_id);
 723                                                 mfc_rt->mfc_timeout_id = 0;
 724                                         mutex_enter(&mfc_rt->mfc_mutex);
 725
 726                                         /*
 727                                          * all queued upcall packets
 728                                          * and mblk will be freed in
 729                                          * release_mfc().
 730                                          */
 731                                 }
 732                         }
 733
 734                         mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED;
 735
 736                         mutex_exit(&mfc_rt->mfc_mutex);
 737                         mfc_rt = mfc_rt->mfc_next;
 738                 }
 739                 MFCB_REFRELE(&ipst->ips_mfcs[i]);
 740         }
 741
 742         mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
 743         ipst->ips_ip_g_mrouter = NULL;
 744         mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
 745         return (0);
 746 }
 747
 748 void
 749 ip_mrouter_stack_destroy(ip_stack_t *ipst)
 750 {
 751         struct mfcb *mfcbp;
 752         struct mfc  *rt;
 753         int i;
 754
 755         for (i = 0; i < MFCTBLSIZ; i++) {
 756                 mfcbp = &ipst->ips_mfcs[i];
 757
 758                 while ((rt = mfcbp->mfcb_mfc) != NULL) {
 759                         (void) printf("ip_mrouter_stack_destroy: free for %d\n",
 760                             i);
 761
 762                         mfcbp->mfcb_mfc = rt->mfc_next;
 763                         free_queue(rt);
 764                         mi_free(rt);
 765                 }
 766         }
 767         kmem_free(ipst->ips_vifs, sizeof (struct vif) * (MAXVIFS+1));
 768         ipst->ips_vifs = NULL;
 769         kmem_free(ipst->ips_mrtstat, sizeof (struct mrtstat));
 770         ipst->ips_mrtstat = NULL;
 771         kmem_free(ipst->ips_mfcs, sizeof (struct mfcb) * MFCTBLSIZ);
 772         ipst->ips_mfcs = NULL;
 773         kmem_free(ipst->ips_tbfs, sizeof (struct tbf) * MAXVIFS);
 774         ipst->ips_tbfs = NULL;
 775
 776         mutex_destroy(&ipst->ips_last_encap_lock);
 777         mutex_destroy(&ipst->ips_ip_g_mrouter_mutex);
 778 }
 779
 780 static boolean_t
 781 is_mrouter_off(ip_stack_t *ipst)
 782 {
 783         conn_t  *mrouter;
 784
 785         mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
 786         if (ipst->ips_ip_g_mrouter == NULL) {
 787                 mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
 788                 return (B_TRUE);
 789         }
 790
 791         mrouter = ipst->ips_ip_g_mrouter;
 792         if (mrouter->conn_multi_router == 0) {
 793                 mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
 794                 return (B_TRUE);
 795         }
 796         mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
 797         return (B_FALSE);
 798 }
 799
 800 static void
 801 unlock_good_vif(struct vif *vifp)
 802 {
 803         ASSERT(vifp->v_ipif != NULL);
 804         ipif_refrele(vifp->v_ipif);
 805         VIF_REFRELE(vifp);
 806 }
 807
 808 static boolean_t
 809 lock_good_vif(struct vif *vifp)
 810 {
 811         mutex_enter(&vifp->v_lock);
 812         if (!(vifp->v_marks & VIF_MARK_GOOD)) {
 813                 mutex_exit(&vifp->v_lock);
 814                 return (B_FALSE);
 815         }
 816
 817         ASSERT(vifp->v_ipif != NULL);
 818         mutex_enter(&vifp->v_ipif->ipif_ill->ill_lock);
 819         if (!IPIF_CAN_LOOKUP(vifp->v_ipif)) {
 820                 mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock);
 821                 mutex_exit(&vifp->v_lock);
 822                 return (B_FALSE);
 823         }
 824         ipif_refhold_locked(vifp->v_ipif);
 825         mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock);
 826         vifp->v_refcnt++;
 827         mutex_exit(&vifp->v_lock);
 828         return (B_TRUE);
 829 }
 830
 831 /*
 832  * Add a vif to the vif table.
 833  */
 834 static int
 835 add_vif(struct vifctl *vifcp, conn_t *connp, ip_stack_t *ipst)
 836 {
 837         struct vif      *vifp = ipst->ips_vifs + vifcp->vifc_vifi;
 838         ipif_t          *ipif;
 839         int             error = 0;
 840         struct tbf      *v_tbf = ipst->ips_tbfs + vifcp->vifc_vifi;
 841         conn_t          *mrouter = ipst->ips_ip_g_mrouter;
 842         ilm_t           *ilm;
 843         ill_t           *ill;
 844
 845         ASSERT(connp != NULL);
 846
 847         if (vifcp->vifc_vifi >= MAXVIFS)
 848                 return (EINVAL);
 849
 850         if (is_mrouter_off(ipst))
 851                 return (EINVAL);
 852
 853         mutex_enter(&vifp->v_lock);
 854         /*
 855          * Viftable entry should be 0.
 856          * if v_marks == 0 but v_refcnt != 0 means struct is being
 857          * initialized.
 858          *
 859          * Also note that it is very unlikely that we will get a MRT_ADD_VIF
 860          * request while the delete is in progress, mrouted only sends add
 861          * requests when a new interface is added and the new interface cannot
 862          * have the same vifi as an existing interface. We make sure that
 863          * ill_delete will block till the vif is deleted by adding a refcnt
 864          * to ipif in del_vif().
 865          */
 866         if (vifp->v_lcl_addr.s_addr != 0 ||
 867             vifp->v_marks != 0 ||
 868             vifp->v_refcnt != 0) {
 869                 mutex_exit(&vifp->v_lock);
 870                 return (EADDRINUSE);
 871         }
 872
 873         /* Incoming vif should not be 0 */
 874         if (vifcp->vifc_lcl_addr.s_addr == 0) {
 875                 mutex_exit(&vifp->v_lock);
 876                 return (EINVAL);
 877         }
 878
 879         vifp->v_refcnt++;
 880         mutex_exit(&vifp->v_lock);
 881         /* Find the interface with the local address */
 882         ipif = ipif_lookup_addr((ipaddr_t)vifcp->vifc_lcl_addr.s_addr, NULL,
 883             IPCL_ZONEID(connp), ipst);
 884         if (ipif == NULL) {
 885                 VIF_REFRELE(vifp);
 886                 return (EADDRNOTAVAIL);
 887         }
 888
 889         if (ipst->ips_ip_mrtdebug > 1) {
 890                 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
 891                     "add_vif: src 0x%x enter",
 892                     vifcp->vifc_lcl_addr.s_addr);
 893         }
 894
 895         mutex_enter(&vifp->v_lock);
 896         /*
 897          * Always clear cache when vifs change.
 898          * Needed to ensure that src isn't left over from before vif was added.
 899          * No need to get last_encap_lock, since we are running as a writer.
 900          */
 901
 902         mutex_enter(&ipst->ips_last_encap_lock);
 903         ipst->ips_last_encap_src = 0;
 904         ipst->ips_last_encap_vif = NULL;
 905         mutex_exit(&ipst->ips_last_encap_lock);
 906
 907         if (vifcp->vifc_flags & VIFF_TUNNEL) {
 908                 if ((vifcp->vifc_flags & VIFF_SRCRT) != 0) {
 909                         cmn_err(CE_WARN,
 910                             "add_vif: source route tunnels not supported\n");
 911                         VIF_REFRELE_LOCKED(vifp);
 912                         ipif_refrele(ipif);
 913                         return (EOPNOTSUPP);
 914                 }
 915                 vifp->v_rmt_addr  = vifcp->vifc_rmt_addr;
 916
 917         } else {
 918                 /* Phyint or Register vif */
 919                 if (vifcp->vifc_flags & VIFF_REGISTER) {
 920                         /*
 921                          * Note: Since all IPPROTO_IP level options (including
 922                          * MRT_ADD_VIF) are done exclusively via
 923                          * ip_optmgmt_writer(), a lock is not necessary to
 924                          * protect reg_vif_num.
 925                          */
 926                         mutex_enter(&ipst->ips_numvifs_mutex);
 927                         if (ipst->ips_reg_vif_num == ALL_VIFS) {
 928                                 ipst->ips_reg_vif_num = vifcp->vifc_vifi;
 929                                 mutex_exit(&ipst->ips_numvifs_mutex);
 930                         } else {
 931                                 mutex_exit(&ipst->ips_numvifs_mutex);
 932                                 VIF_REFRELE_LOCKED(vifp);
 933                                 ipif_refrele(ipif);
 934                                 return (EADDRINUSE);
 935                         }
 936                 }
 937
 938                 /* Make sure the interface supports multicast */
 939                 if ((ipif->ipif_ill->ill_flags & ILLF_MULTICAST) == 0) {
 940                         VIF_REFRELE_LOCKED(vifp);
 941                         ipif_refrele(ipif);
 942                         if (vifcp->vifc_flags & VIFF_REGISTER) {
 943                                 mutex_enter(&ipst->ips_numvifs_mutex);
 944                                 ipst->ips_reg_vif_num = ALL_VIFS;
 945                                 mutex_exit(&ipst->ips_numvifs_mutex);
 946                         }
 947                         return (EOPNOTSUPP);
 948                 }
 949                 /* Enable promiscuous reception of all IP mcasts from the if */
 950                 mutex_exit(&vifp->v_lock);
 951
 952                 ill = ipif->ipif_ill;
 953                 if (IS_UNDER_IPMP(ill))
 954                         ill = ipmp_ill_hold_ipmp_ill(ill);
 955
 956                 if (ill == NULL) {
 957                         ilm = NULL;
 958                 } else {
 959                         ilm = ip_addmulti(&ipv6_all_zeros, ill,
 960                             ipif->ipif_zoneid, &error);
 961                         if (ilm != NULL)
 962                                 atomic_inc_32(&ill->ill_mrouter_cnt);
 963                         if (IS_UNDER_IPMP(ipif->ipif_ill)) {
 964                                 ill_refrele(ill);
 965                                 ill = ipif->ipif_ill;
 966                         }
 967                 }
 968
 969                 mutex_enter(&vifp->v_lock);
 970                 /*
 971                  * since we released the lock lets make sure that
 972                  * ip_mrouter_done() has not been called.
 973                  */
 974                 if (ilm == NULL || is_mrouter_off(ipst)) {
 975                         if (ilm != NULL) {
 976                                 (void) ip_delmulti(ilm);
 977                                 ASSERT(ill->ill_mrouter_cnt > 0);
 978                                 atomic_dec_32(&ill->ill_mrouter_cnt);
 979                         }
 980                         if (vifcp->vifc_flags & VIFF_REGISTER) {
 981                                 mutex_enter(&ipst->ips_numvifs_mutex);
 982                                 ipst->ips_reg_vif_num = ALL_VIFS;
 983                                 mutex_exit(&ipst->ips_numvifs_mutex);
 984                         }
 985                         VIF_REFRELE_LOCKED(vifp);
 986                         ipif_refrele(ipif);
 987                         return (error?error:EINVAL);
 988                 }
 989                 vifp->v_ilm = ilm;
 990         }
 991         /* Define parameters for the tbf structure */
 992         vifp->v_tbf = v_tbf;
 993         gethrestime(&vifp->v_tbf->tbf_last_pkt_t);
 994         vifp->v_tbf->tbf_n_tok = 0;
 995         vifp->v_tbf->tbf_q_len = 0;
 996         vifp->v_tbf->tbf_max_q_len = MAXQSIZE;
 997         vifp->v_tbf->tbf_q = vifp->v_tbf->tbf_t = NULL;
 998
 999         vifp->v_flags = vifcp->vifc_flags;
1000         vifp->v_threshold = vifcp->vifc_threshold;
1001         vifp->v_lcl_addr = vifcp->vifc_lcl_addr;
1002         vifp->v_ipif = ipif;
1003         ipif_refrele(ipif);
1004         /* Scaling up here, allows division by 1024 in critical code.   */
1005         vifp->v_rate_limit = vifcp->vifc_rate_limit * (1024/1000);
1006         vifp->v_timeout_id = 0;
1007         /* initialize per vif pkt counters */
1008         vifp->v_pkt_in = 0;
1009         vifp->v_pkt_out = 0;
1010         vifp->v_bytes_in = 0;
1011         vifp->v_bytes_out = 0;
1012         mutex_init(&vifp->v_tbf->tbf_lock, NULL, MUTEX_DEFAULT, NULL);
1013
1014         /* Adjust numvifs up, if the vifi is higher than numvifs */
1015         mutex_enter(&ipst->ips_numvifs_mutex);
1016         if (ipst->ips_numvifs <= vifcp->vifc_vifi)
1017                 ipst->ips_numvifs = vifcp->vifc_vifi + 1;
1018         mutex_exit(&ipst->ips_numvifs_mutex);
1019
1020         if (ipst->ips_ip_mrtdebug > 1) {
1021                 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1022                     "add_vif: #%d, lcladdr %x, %s %x, thresh %x, rate %d",
1023                     vifcp->vifc_vifi,
1024                     ntohl(vifcp->vifc_lcl_addr.s_addr),
1025                     (vifcp->vifc_flags & VIFF_TUNNEL) ? "rmtaddr" : "mask",
1026                     ntohl(vifcp->vifc_rmt_addr.s_addr),
1027                     vifcp->vifc_threshold, vifcp->vifc_rate_limit);
1028         }
1029
1030         vifp->v_marks = VIF_MARK_GOOD;
1031         mutex_exit(&vifp->v_lock);
1032         return (0);
1033 }
1034
1035
1036 /* Delete a vif from the vif table. */
1037 static void
1038 del_vifp(struct vif *vifp)
1039 {
1040         struct tbf      *t = vifp->v_tbf;
1041         mblk_t  *mp0;
1042         vifi_t  vifi;
1043         ip_stack_t      *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
1044         conn_t          *mrouter = ipst->ips_ip_g_mrouter;
1045
1046         ASSERT(vifp->v_marks & VIF_MARK_CONDEMNED);
1047         ASSERT(t != NULL);
1048
1049         if (ipst->ips_ip_mrtdebug > 1) {
1050                 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1051                     "del_vif: src 0x%x\n", vifp->v_lcl_addr.s_addr);
1052         }
1053
1054         if (vifp->v_timeout_id != 0) {
1055                 (void) untimeout(vifp->v_timeout_id);
1056                 vifp->v_timeout_id = 0;
1057         }
1058
1059         /*
1060          * Free packets queued at the interface.
1061          * Mrouted takes care of cleaning up mfcs - makes calls to del_mfc.
1062          */
1063         mutex_enter(&t->tbf_lock);
1064         while (t->tbf_q != NULL) {
1065                 mp0 = t->tbf_q;
1066                 t->tbf_q = t->tbf_q->b_next;
1067                 mp0->b_prev = mp0->b_next = NULL;
1068                 freemsg(mp0);
1069         }
1070         mutex_exit(&t->tbf_lock);
1071
1072         /*
1073          * Always clear cache when vifs change.
1074          * No need to get last_encap_lock since we are running as a writer.
1075          */
1076         mutex_enter(&ipst->ips_last_encap_lock);
1077         if (vifp == ipst->ips_last_encap_vif) {
1078                 ipst->ips_last_encap_vif = NULL;
1079                 ipst->ips_last_encap_src = 0;
1080         }
1081         mutex_exit(&ipst->ips_last_encap_lock);
1082
1083         mutex_destroy(&t->tbf_lock);
1084
1085         bzero(vifp->v_tbf, sizeof (*(vifp->v_tbf)));
1086
1087         /* Adjust numvifs down */
1088         mutex_enter(&ipst->ips_numvifs_mutex);
1089         for (vifi = ipst->ips_numvifs; vifi != 0; vifi--) /* vifi is unsigned */
1090                 if (ipst->ips_vifs[vifi - 1].v_lcl_addr.s_addr != 0)
1091                         break;
1092         ipst->ips_numvifs = vifi;
1093         mutex_exit(&ipst->ips_numvifs_mutex);
1094
1095         bzero(vifp, sizeof (*vifp));
1096 }
1097
1098 static int
1099 del_vif(vifi_t *vifip, ip_stack_t *ipst)
1100 {
1101         struct vif      *vifp = ipst->ips_vifs + *vifip;
1102
1103         if (*vifip >= ipst->ips_numvifs)
1104                 return (EINVAL);
1105
1106         mutex_enter(&vifp->v_lock);
1107         /*
1108          * Not initialized
1109          * Here we are not looking at the vif that is being initialized
1110          * i.e vifp->v_marks == 0 and refcnt > 0.
1111          */
1112         if (vifp->v_lcl_addr.s_addr == 0 ||
1113             !(vifp->v_marks & VIF_MARK_GOOD)) {
1114                 mutex_exit(&vifp->v_lock);
1115                 return (EADDRNOTAVAIL);
1116         }
1117
1118         /* Clear VIF_MARK_GOOD and set VIF_MARK_CONDEMNED. */
1119         vifp->v_marks &= ~VIF_MARK_GOOD;
1120         vifp->v_marks |= VIF_MARK_CONDEMNED;
1121
1122         /* Phyint only */
1123         if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
1124                 ipif_t *ipif = vifp->v_ipif;
1125                 ilm_t *ilm = vifp->v_ilm;
1126
1127                 vifp->v_ilm = NULL;
1128
1129                 ASSERT(ipif != NULL);
1130                 /*
1131                  * should be OK to drop the lock as we
1132                  * have marked this as CONDEMNED.
1133                  */
1134                 mutex_exit(&(vifp)->v_lock);
1135                 if (ilm != NULL) {
1136                         (void) ip_delmulti(ilm);
1137                         ASSERT(ipif->ipif_ill->ill_mrouter_cnt > 0);
1138                         atomic_dec_32(&ipif->ipif_ill->ill_mrouter_cnt);
1139                 }
1140                 mutex_enter(&(vifp)->v_lock);
1141         }
1142
1143         if (vifp->v_flags & VIFF_REGISTER) {
1144                 mutex_enter(&ipst->ips_numvifs_mutex);
1145                 ipst->ips_reg_vif_num = ALL_VIFS;
1146                 mutex_exit(&ipst->ips_numvifs_mutex);
1147         }
1148
1149         /*
1150          * decreases the refcnt added in add_vif.
1151          */
1152         VIF_REFRELE_LOCKED(vifp);
1153         return (0);
1154 }
1155
1156 /*
1157  * Add an mfc entry.
1158  */
1159 static int
1160 add_mfc(struct mfcctl *mfccp, ip_stack_t *ipst)
1161 {
1162         struct mfc *rt;
1163         struct rtdetq *rte;
1164         ushort_t nstl;
1165         int i;
1166         struct mfcb *mfcbp;
1167         conn_t          *mrouter = ipst->ips_ip_g_mrouter;
1168
1169         /*
1170          * The value of vifi is NO_VIF (==MAXVIFS) if Mrouted
1171          * did not have a real route for pkt.
1172          * We want this pkt without rt installed in the mfctable to prevent
1173          * multiiple tries, so go ahead and put it in mfctable, it will
1174          * be discarded later in ip_mdq() because the child is NULL.
1175          */
1176
1177         /* Error checking, out of bounds? */
1178         if (mfccp->mfcc_parent > MAXVIFS) {
1179                 ip0dbg(("ADD_MFC: mfcc_parent out of range %d",
1180                     (int)mfccp->mfcc_parent));
1181                 return (EINVAL);
1182         }
1183
1184         if ((mfccp->mfcc_parent != NO_VIF) &&
1185             (ipst->ips_vifs[mfccp->mfcc_parent].v_ipif == NULL)) {
1186                 ip0dbg(("ADD_MFC: NULL ipif for parent vif %d\n",
1187                     (int)mfccp->mfcc_parent));
1188                 return (EINVAL);
1189         }
1190
1191         if (is_mrouter_off(ipst)) {
1192                 return (EINVAL);
1193         }
1194
1195         mfcbp = &ipst->ips_mfcs[MFCHASH(mfccp->mfcc_origin.s_addr,
1196             mfccp->mfcc_mcastgrp.s_addr)];
1197         MFCB_REFHOLD(mfcbp);
1198         MFCFIND(mfcbp, mfccp->mfcc_origin.s_addr,
1199             mfccp->mfcc_mcastgrp.s_addr, rt);
1200
1201         /* If an entry already exists, just update the fields */
1202         if (rt) {
1203                 if (ipst->ips_ip_mrtdebug > 1) {
1204                         (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1205                             "add_mfc: update o %x grp %x parent %x",
1206                             ntohl(mfccp->mfcc_origin.s_addr),
1207                             ntohl(mfccp->mfcc_mcastgrp.s_addr),
1208                             mfccp->mfcc_parent);
1209                 }
1210                 mutex_enter(&rt->mfc_mutex);
1211                 rt->mfc_parent = mfccp->mfcc_parent;
1212
1213                 mutex_enter(&ipst->ips_numvifs_mutex);
1214                 for (i = 0; i < (int)ipst->ips_numvifs; i++)
1215                         rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
1216                 mutex_exit(&ipst->ips_numvifs_mutex);
1217                 mutex_exit(&rt->mfc_mutex);
1218
1219                 MFCB_REFRELE(mfcbp);
1220                 return (0);
1221         }
1222
1223         /*
1224          * Find the entry for which the upcall was made and update.
1225          */
1226         for (rt = mfcbp->mfcb_mfc, nstl = 0; rt; rt = rt->mfc_next) {
1227                 mutex_enter(&rt->mfc_mutex);
1228                 if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) &&
1229                     (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr) &&
1230                     (rt->mfc_rte != NULL) &&
1231                     !(rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
1232                         if (nstl++ != 0)
1233                                 cmn_err(CE_WARN,
1234                                     "add_mfc: %s o %x g %x p %x",
1235                                     "multiple kernel entries",
1236                                     ntohl(mfccp->mfcc_origin.s_addr),
1237                                     ntohl(mfccp->mfcc_mcastgrp.s_addr),
1238                                     mfccp->mfcc_parent);
1239
1240                         if (ipst->ips_ip_mrtdebug > 1) {
1241                                 (void) mi_strlog(mrouter->conn_rq, 1,
1242                                     SL_TRACE,
1243                                     "add_mfc: o %x g %x p %x",
1244                                     ntohl(mfccp->mfcc_origin.s_addr),
1245                                     ntohl(mfccp->mfcc_mcastgrp.s_addr),
1246                                     mfccp->mfcc_parent);
1247                         }
1248                         fill_route(rt, mfccp, ipst);
1249
1250                         /*
1251                          * Prevent cleanup of cache entry.
1252                          * Timer starts in ip_mforward.
1253                          */
1254                         if (rt->mfc_timeout_id != 0) {
1255                                 timeout_id_t id;
1256                                 id = rt->mfc_timeout_id;
1257                                 /*
1258                                  * setting id to zero will avoid this
1259                                  * entry from being cleaned up in
1260                                  * expire_up_calls().
1261                                  */
1262                                 rt->mfc_timeout_id = 0;
1263                                 /*
1264                                  * dropping the lock is fine as we
1265                                  * have a refhold on the bucket.
1266                                  * so mfc cannot be freed.
1267                                  * The timeout can fire but it will see
1268                                  * that mfc_timeout_id == 0 and not cleanup.
1269                                  */
1270                                 mutex_exit(&rt->mfc_mutex);
1271                                 (void) untimeout(id);
1272                                 mutex_enter(&rt->mfc_mutex);
1273                         }
1274
1275                         /*
1276                          * Send all pkts that are queued waiting for the upcall.
1277                          * ip_mdq param tun set to 0 -
1278                          * the return value of ip_mdq() isn't used here,
1279                          * so value we send doesn't matter.
1280                          */
1281                         while (rt->mfc_rte != NULL) {
1282                                 rte = rt->mfc_rte;
1283                                 rt->mfc_rte = rte->rte_next;
1284                                 mutex_exit(&rt->mfc_mutex);
1285                                 (void) ip_mdq(rte->mp, (ipha_t *)
1286                                     rte->mp->b_rptr, rte->ill, 0, rt);
1287                                 freemsg(rte->mp);
1288                                 mi_free((char *)rte);
1289                                 mutex_enter(&rt->mfc_mutex);
1290                         }
1291                 }
1292                 mutex_exit(&rt->mfc_mutex);
1293         }
1294
1295
1296         /*
1297          * It is possible that an entry is being inserted without an upcall
1298          */
1299         if (nstl == 0) {
1300                 mutex_enter(&(mfcbp->mfcb_lock));
1301                 if (ipst->ips_ip_mrtdebug > 1) {
1302                         (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1303                             "add_mfc: no upcall o %x g %x p %x",
1304                             ntohl(mfccp->mfcc_origin.s_addr),
1305                             ntohl(mfccp->mfcc_mcastgrp.s_addr),
1306                             mfccp->mfcc_parent);
1307                 }
1308                 if (is_mrouter_off(ipst)) {
1309                         mutex_exit(&mfcbp->mfcb_lock);
1310                         MFCB_REFRELE(mfcbp);
1311                         return (EINVAL);
1312                 }
1313
1314                 for (rt = mfcbp->mfcb_mfc; rt; rt = rt->mfc_next) {
1315
1316                         mutex_enter(&rt->mfc_mutex);
1317                         if ((rt->mfc_origin.s_addr ==
1318                             mfccp->mfcc_origin.s_addr) &&
1319                             (rt->mfc_mcastgrp.s_addr ==
1320                             mfccp->mfcc_mcastgrp.s_addr) &&
1321                             (!(rt->mfc_marks & MFCB_MARK_CONDEMNED))) {
1322                                 fill_route(rt, mfccp, ipst);
1323                                 mutex_exit(&rt->mfc_mutex);
1324                                 break;
1325                         }
1326                         mutex_exit(&rt->mfc_mutex);
1327                 }
1328
1329                 /* No upcall, so make a new entry into mfctable */
1330                 if (rt == NULL) {
1331                         rt = (struct mfc *)mi_zalloc(sizeof (struct mfc));
1332                         if (rt == NULL) {
1333                                 ip1dbg(("add_mfc: out of memory\n"));
1334                                 mutex_exit(&mfcbp->mfcb_lock);
1335                                 MFCB_REFRELE(mfcbp);
1336                                 return (ENOBUFS);
1337                         }
1338
1339                         /* Insert new entry at head of hash chain */
1340                         mutex_enter(&rt->mfc_mutex);
1341                         fill_route(rt, mfccp, ipst);
1342
1343                         /* Link into table */
1344                         rt->mfc_next   = mfcbp->mfcb_mfc;
1345                         mfcbp->mfcb_mfc = rt;
1346                         mutex_exit(&rt->mfc_mutex);
1347                 }
1348                 mutex_exit(&mfcbp->mfcb_lock);
1349         }
1350
1351         MFCB_REFRELE(mfcbp);
1352         return (0);
1353 }
1354
1355 /*
1356  * Fills in mfc structure from mrouted mfcctl.
1357  */
1358 static void
1359 fill_route(struct mfc *rt, struct mfcctl *mfccp, ip_stack_t *ipst)
1360 {
1361         int i;
1362
1363         rt->mfc_origin          = mfccp->mfcc_origin;
1364         rt->mfc_mcastgrp        = mfccp->mfcc_mcastgrp;
1365         rt->mfc_parent          = mfccp->mfcc_parent;
1366         mutex_enter(&ipst->ips_numvifs_mutex);
1367         for (i = 0; i < (int)ipst->ips_numvifs; i++) {
1368                 rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
1369         }
1370         mutex_exit(&ipst->ips_numvifs_mutex);
1371         /* Initialize pkt counters per src-grp */
1372         rt->mfc_pkt_cnt = 0;
1373         rt->mfc_byte_cnt        = 0;
1374         rt->mfc_wrong_if        = 0;
1375         rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_nsec = 0;
1376
1377 }
1378
1379 static void
1380 free_queue(struct mfc *mfcp)
1381 {
1382         struct rtdetq *rte0;
1383
1384         /*
1385          * Drop all queued upcall packets.
1386          * Free the mbuf with the pkt.
1387          */
1388         while ((rte0 = mfcp->mfc_rte) != NULL) {
1389                 mfcp->mfc_rte = rte0->rte_next;
1390                 freemsg(rte0->mp);
1391                 mi_free((char *)rte0);
1392         }
1393 }
1394 /*
1395  * go thorugh the hash bucket and free all the entries marked condemned.
1396  */
1397 void
1398 release_mfc(struct mfcb *mfcbp)
1399 {
1400         struct mfc *current_mfcp;
1401         struct mfc *prev_mfcp;
1402
1403         prev_mfcp = current_mfcp = mfcbp->mfcb_mfc;
1404
1405         while (current_mfcp != NULL) {
1406                 if (current_mfcp->mfc_marks & MFCB_MARK_CONDEMNED) {
1407                         if (current_mfcp == mfcbp->mfcb_mfc) {
1408                                 mfcbp->mfcb_mfc = current_mfcp->mfc_next;
1409                                 free_queue(current_mfcp);
1410                                 mi_free(current_mfcp);
1411                                 prev_mfcp = current_mfcp = mfcbp->mfcb_mfc;
1412                                 continue;
1413                         }
1414                         ASSERT(prev_mfcp != NULL);
1415                         prev_mfcp->mfc_next = current_mfcp->mfc_next;
1416                         free_queue(current_mfcp);
1417                         mi_free(current_mfcp);
1418                         current_mfcp = NULL;
1419                 } else {
1420                         prev_mfcp = current_mfcp;
1421                 }
1422
1423                 current_mfcp = prev_mfcp->mfc_next;
1424
1425         }
1426         mfcbp->mfcb_marks &= ~MFCB_MARK_CONDEMNED;
1427         ASSERT(mfcbp->mfcb_mfc != NULL || mfcbp->mfcb_marks == 0);
1428 }
1429
1430 /*
1431  * Delete an mfc entry.
1432  */
1433 static int
1434 del_mfc(struct mfcctl *mfccp, ip_stack_t *ipst)
1435 {
1436         struct in_addr  origin;
1437         struct in_addr  mcastgrp;
1438         struct mfc      *rt;
1439         uint_t          hash;
1440         conn_t          *mrouter = ipst->ips_ip_g_mrouter;
1441
1442         origin = mfccp->mfcc_origin;
1443         mcastgrp = mfccp->mfcc_mcastgrp;
1444         hash = MFCHASH(origin.s_addr, mcastgrp.s_addr);
1445
1446         if (ipst->ips_ip_mrtdebug > 1) {
1447                 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1448                     "del_mfc: o %x g %x",
1449                     ntohl(origin.s_addr),
1450                     ntohl(mcastgrp.s_addr));
1451         }
1452
1453         MFCB_REFHOLD(&ipst->ips_mfcs[hash]);
1454
1455         /* Find mfc in mfctable, finds only entries without upcalls */
1456         for (rt = ipst->ips_mfcs[hash].mfcb_mfc; rt; rt = rt->mfc_next) {
1457                 mutex_enter(&rt->mfc_mutex);
1458                 if (origin.s_addr == rt->mfc_origin.s_addr &&
1459                     mcastgrp.s_addr == rt->mfc_mcastgrp.s_addr &&
1460                     rt->mfc_rte == NULL &&
1461                     !(rt->mfc_marks & MFCB_MARK_CONDEMNED))
1462                         break;
1463                 mutex_exit(&rt->mfc_mutex);
1464         }
1465
1466         /*
1467          * Return if there was an upcall (mfc_rte != NULL,
1468          * or rt not in mfctable.
1469          */
1470         if (rt == NULL) {
1471                 MFCB_REFRELE(&ipst->ips_mfcs[hash]);
1472                 return (EADDRNOTAVAIL);
1473         }
1474
1475
1476         /*
1477          * no need to hold lock as we have a reference.
1478          */
1479         ipst->ips_mfcs[hash].mfcb_marks |= MFCB_MARK_CONDEMNED;
1480         /* error checking */
1481         if (rt->mfc_timeout_id != 0) {
1482                 ip0dbg(("del_mfc: TIMEOUT NOT 0, rte not null"));
1483                 /*
1484                  * Its ok to drop the lock,  the struct cannot be freed
1485                  * since we have a ref on the hash bucket.
1486                  */
1487                 rt->mfc_timeout_id = 0;
1488                 mutex_exit(&rt->mfc_mutex);
1489                 (void) untimeout(rt->mfc_timeout_id);
1490                 mutex_enter(&rt->mfc_mutex);
1491         }
1492
1493         ASSERT(rt->mfc_rte == NULL);
1494
1495
1496         /*
1497          * Delete the entry from the cache
1498          */
1499         rt->mfc_marks |= MFCB_MARK_CONDEMNED;
1500         mutex_exit(&rt->mfc_mutex);
1501
1502         MFCB_REFRELE(&ipst->ips_mfcs[hash]);
1503
1504         return (0);
1505 }
1506
1507 #define TUNNEL_LEN  12  /* # bytes of IP option for tunnel encapsulation  */
1508
1509 /*
1510  * IP multicast forwarding function. This function assumes that the packet
1511  * pointed to by ipha has arrived on (or is about to be sent to) the interface
1512  * pointed to by "ill", and the packet is to be relayed to other networks
1513  * that have members of the packet's destination IP multicast group.
1514  *
1515  * The packet is returned unscathed to the caller, unless it is
1516  * erroneous, in which case a -1 value tells the caller (IP)
1517  * to discard it.
1518  *
1519  * Unlike BSD, SunOS 5.x needs to return to IP info about
1520  * whether pkt came in thru a tunnel, so it can be discarded, unless
1521  * it's IGMP. In BSD, the ifp is bogus for tunnels, so pkt won't try
1522  * to be delivered.
1523  * Return values are 0 - pkt is okay and phyint
1524  *                  -1 - pkt is malformed and to be tossed
1525  *                   1 - pkt came in on tunnel
1526  */
1527 int
1528 ip_mforward(mblk_t *mp, ip_recv_attr_t *ira)
1529 {
1530         ipha_t          *ipha = (ipha_t *)mp->b_rptr;
1531         ill_t           *ill = ira->ira_ill;
1532         struct mfc      *rt;
1533         ipaddr_t        src, dst, tunnel_src = 0;
             /* Counts source-routed packets across all calls (see below). */
1534         static int      srctun = 0;
1535         vifi_t          vifi;
1536         boolean_t       pim_reg_packet = B_FALSE;
1537         struct mfcb     *mfcbp;
1538         ip_stack_t      *ipst = ill->ill_ipst;
1539         conn_t          *mrouter = ipst->ips_ip_g_mrouter;
             /* Saved so ira->ira_rill can be restored after the upcall. */
1540         ill_t           *rill = ira->ira_rill;
1541
1542         ASSERT(ira->ira_pktlen == msgdsize(mp));
1543
1544         if (ipst->ips_ip_mrtdebug > 1) {
1545                 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1546                     "ip_mforward: RECV ipha_src %x, ipha_dst %x, ill %s",
1547                     ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
1548                     ill->ill_name);
1549         }
1550
1551         dst = ipha->ipha_dst;
             /*
              * Classify the arrival path from the receive attributes.  The
              * PIM register flag is tested first; the tunnel source is only
              * picked up when that flag is not set.
              */
1552         if (ira->ira_flags & IRAF_PIM_REGISTER)
1553                 pim_reg_packet = B_TRUE;
1554         else if (ira->ira_flags & IRAF_MROUTE_TUNNEL_SET)
1555                 tunnel_src = ira->ira_mroute_tunnel;
1556
1557         /*
1558          * Don't forward a packet with time-to-live of zero or one,
1559          * or a packet destined to a local-only group.
1560          */
1561         if (CLASSD(dst) && (ipha->ipha_ttl <= 1 ||
1562             (ipaddr_t)ntohl(dst) <= INADDR_MAX_LOCAL_GROUP)) {
1563                 if (ipst->ips_ip_mrtdebug > 1) {
1564                         (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1565                             "ip_mforward: not forwarded ttl %d,"
1566                             " dst 0x%x ill %s",
1567                             ipha->ipha_ttl, ntohl(dst), ill->ill_name);
1568                 }
1569                 if (tunnel_src != 0)
1570                         return (1);
1571                 else
1572                         return (0);
1573         }
1574
1575         if ((tunnel_src != 0) || pim_reg_packet) {
1576                 /*
1577                  * Packet arrived over an encapsulated tunnel or via a PIM
1578                  * register message.
1579                  */
1580                 if (ipst->ips_ip_mrtdebug > 1) {
1581                         if (tunnel_src != 0) {
1582                                 (void) mi_strlog(mrouter->conn_rq, 1,
1583                                     SL_TRACE,
1584                                     "ip_mforward: ill %s arrived via ENCAP TUN",
1585                                     ill->ill_name);
1586                         } else if (pim_reg_packet) {
1587                                 (void) mi_strlog(mrouter->conn_rq, 1,
1588                                     SL_TRACE,
1589                                     "ip_mforward: ill %s arrived via"
1590                                     "  REGISTER VIF",
1591                                     ill->ill_name);
1592                         }
1593                 }
1594         } else if ((ipha->ipha_version_and_hdr_length & 0xf) <
1595             (uint_t)(IP_SIMPLE_HDR_LENGTH + TUNNEL_LEN) >> 2 ||
1596             ((uchar_t *)(ipha + 1))[1] != IPOPT_LSRR) {
1597                 /* Packet arrived via a physical interface. */
1598                 if (ipst->ips_ip_mrtdebug > 1) {
1599                         (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1600                             "ip_mforward: ill %s arrived via PHYINT",
1601                             ill->ill_name);
1602                 }
1603
1604         } else {
1605                 /*
1606                  * Packet arrived through a SRCRT tunnel.
1607                  * Source-route tunnels are no longer supported.
1608                  * Error message printed every 1000 times.
1609                  */
1610                 if ((srctun++ % 1000) == 0) {
1611                         cmn_err(CE_WARN,
1612                             "ip_mforward: received source-routed pkt from %x",
1613                             ntohl(ipha->ipha_src));
1614                 }
1615                 return (-1);
1616         }
1617
1618         ipst->ips_mrtstat->mrts_fwd_in++;
1619         src = ipha->ipha_src;
1620
1621         /* Find route in cache, return NULL if not there or upcalls q'ed. */
1622
1623         /*
1624          * Lock the mfctable against changes made by ip_mforward.
1625          * Note that only add_mfc and del_mfc can remove entries and
1626          * they run with exclusive access to IP. So we do not need to
1627          * guard against the rt being deleted, so release lock after reading.
1628          */
1629
1630         if (is_mrouter_off(ipst))
1631                 return (-1);
1632
             /*
              * Take a reference on the hash bucket for (src, dst).  Every
              * return path below this point drops it with MFCB_REFRELE().
              */
1633         mfcbp = &ipst->ips_mfcs[MFCHASH(src, dst)];
1634         MFCB_REFHOLD(mfcbp);
1635         MFCFIND(mfcbp, src, dst, rt);
1636
1637         /* Entry exists, so forward if necessary */
1638         if (rt != NULL) {
1639                 int ret = 0;
1640                 ipst->ips_mrtstat->mrts_mfc_hits++;
1641                 if (pim_reg_packet) {
                             /*
                              * Register packets are forwarded as if they had
                              * arrived on the register vif's ill, with no
                              * tunnel source.
                              */
1642                         ASSERT(ipst->ips_reg_vif_num != ALL_VIFS);
1643                         ret = ip_mdq(mp, ipha,
1644                             ipst->ips_vifs[ipst->ips_reg_vif_num].
1645                             v_ipif->ipif_ill,
1646                             0, rt);
1647                 } else {
1648                         ret = ip_mdq(mp, ipha, ill, tunnel_src, rt);
1649                 }
1650
1651                 MFCB_REFRELE(mfcbp);
1652                 return (ret);
1653
1654                 /*
1655                  * Don't forward if we don't have a cache entry.  Mrouted will
1656                  * always provide a cache entry in response to an upcall.
1657                  */
1658         } else {
1659                 /*
1660                  * If we don't have a route for packet's origin, make a copy
1661                  * of the packet and send message to routing daemon.
1662                  */
1663                 struct mfc      *mfc_rt  = NULL;
1664                 mblk_t          *mp0     = NULL;
1665                 mblk_t          *mp_copy = NULL;
1666                 struct rtdetq   *rte     = NULL;
1667                 struct rtdetq   *rte_m, *rte1, *prev_rte;
1668                 uint_t          hash;
1669                 int             npkts;
1670                 boolean_t       new_mfc = B_FALSE;
1671                 ipst->ips_mrtstat->mrts_mfc_misses++;
1672                 /* BSD uses mrts_no_route++ */
1673                 if (ipst->ips_ip_mrtdebug > 1) {
1674                         (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1675                             "ip_mforward: no rte ill %s src %x g %x misses %d",
1676                             ill->ill_name, ntohl(src), ntohl(dst),
1677                             (int)ipst->ips_mrtstat->mrts_mfc_misses);
1678                 }
1679                 /*
1680                  * The order of the following code differs from the BSD code.
1681                  * Pre-mc3.5, the BSD code was incorrect and SunOS 5.x
1682                  * code works, so SunOS 5.x wasn't changed to conform to the
1683                  * BSD version.
1684                  */
1685
1686                 /* Lock mfctable. */
1687                 hash = MFCHASH(src, dst);
1688                 mutex_enter(&(ipst->ips_mfcs[hash].mfcb_lock));
1689
1690                 /*
1691                  * If we are turning off mrouted return an error
1692                  */
1693                 if (is_mrouter_off(ipst)) {
                             /*
                              * mfcbp was derived from the same MFCHASH(src, dst)
                              * index, so this releases the lock taken just above.
                              */
1694                         mutex_exit(&mfcbp->mfcb_lock);
1695                         MFCB_REFRELE(mfcbp);
1696                         return (-1);
1697                 }
1698
1699                 /* Is there an upcall waiting for this packet? */
                     /*
                      * Each entry's mfc_mutex is held while it is examined.  On a
                      * match the loop exits via break with that mutex still held;
                      * otherwise the mutex is dropped before moving on.
                      */
1700                 for (mfc_rt = ipst->ips_mfcs[hash].mfcb_mfc; mfc_rt;
1701                     mfc_rt = mfc_rt->mfc_next) {
1702                         mutex_enter(&mfc_rt->mfc_mutex);
1703                         if (ipst->ips_ip_mrtdebug > 1) {
1704                                 (void) mi_strlog(mrouter->conn_rq, 1,
1705                                     SL_TRACE,
1706                                     "ip_mforward: MFCTAB hash %d o 0x%x"
1707                                     " g 0x%x\n",
1708                                     hash, ntohl(mfc_rt->mfc_origin.s_addr),
1709                                     ntohl(mfc_rt->mfc_mcastgrp.s_addr));
1710                         }
1711                         /* There is an upcall */
1712                         if ((src == mfc_rt->mfc_origin.s_addr) &&
1713                             (dst == mfc_rt->mfc_mcastgrp.s_addr) &&
1714                             (mfc_rt->mfc_rte != NULL) &&
1715                             !(mfc_rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
1716                                 break;
1717                         }
1718                         mutex_exit(&mfc_rt->mfc_mutex);
1719                 }
1720                 /* No upcall, so make a new entry into mfctable */
1721                 if (mfc_rt == NULL) {
1722                         mfc_rt = (struct mfc *)mi_zalloc(sizeof (struct mfc));
1723                         if (mfc_rt == NULL) {
1724                                 ipst->ips_mrtstat->mrts_fwd_drop++;
1725                                 ip1dbg(("ip_mforward: out of memory "
1726                                     "for mfc, mfc_rt\n"));
1727                                 goto error_return;
1728                         } else
1729                                 new_mfc = B_TRUE;
1730                         /* Get resources */
1731                         /* TODO could copy header and dup rest */
                             /* mp_copy becomes the message sent up to the daemon. */
1732                         mp_copy = copymsg(mp);
1733                         if (mp_copy == NULL) {
1734                                 ipst->ips_mrtstat->mrts_fwd_drop++;
1735                                 ip1dbg(("ip_mforward: out of memory for "
1736                                     "mblk, mp_copy\n"));
1737                                 goto error_return;
1738                         }
                             /*
                              * Take the new entry's mutex so that, as with a
                              * matched entry, mfc_mutex is held from here on.
                              */
1739                         mutex_enter(&mfc_rt->mfc_mutex);
1740                 }
1741                 /* Get resources for rte, whether first rte or not first. */
1742                 /* Add this packet into rtdetq */
1743                 rte = (struct rtdetq *)mi_zalloc(sizeof (struct rtdetq));
1744                 if (rte == NULL) {
1745                         ipst->ips_mrtstat->mrts_fwd_drop++;
1746                         mutex_exit(&mfc_rt->mfc_mutex);
1747                         ip1dbg(("ip_mforward: out of memory for"
1748                             " rtdetq, rte\n"));
1749                         goto error_return;
1750                 }
1751
                     /* mp0 is the copy that stays queued on the rtdetq entry. */
1752                 mp0 = copymsg(mp);
1753                 if (mp0 == NULL) {
1754                         ipst->ips_mrtstat->mrts_fwd_drop++;
1755                         ip1dbg(("ip_mforward: out of memory for mblk, mp0\n"));
1756                         mutex_exit(&mfc_rt->mfc_mutex);
1757                         goto error_return;
1758                 }
1759                 rte->mp         = mp0;
1760                 if (pim_reg_packet) {
1761                         ASSERT(ipst->ips_reg_vif_num != ALL_VIFS);
1762                         rte->ill =
1763                             ipst->ips_vifs[ipst->ips_reg_vif_num].
1764                             v_ipif->ipif_ill;
1765                 } else {
1766                         rte->ill = ill;
1767                 }
1768                 rte->rte_next   = NULL;
1769
1770                 /*
1771                  * Determine if upcall q (rtdetq) has overflowed.
1772                  * mfc_rt->mfc_rte is null by mi_zalloc
1773                  * if it is the first message.
1774                  */
1775                 for (rte_m = mfc_rt->mfc_rte, npkts = 0; rte_m;
1776                     rte_m = rte_m->rte_next)
1777                         npkts++;
1778                 if (ipst->ips_ip_mrtdebug > 1) {
1779                         (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1780                             "ip_mforward: upcalls %d\n", npkts);
1781                 }
1782                 if (npkts > MAX_UPQ) {
1783                         ipst->ips_mrtstat->mrts_upq_ovflw++;
1784                         mutex_exit(&mfc_rt->mfc_mutex);
1785                         goto error_return;
1786                 }
1787
                     /*
                      * npkts == 0 means mfc_rt was allocated above: an entry
                      * matched by the search loop always has a non-NULL mfc_rte.
                      */
1788                 if (npkts == 0) {       /* first upcall */
1789                         int i = 0;
1790                         /*
1791                          * Now finish installing the new mfc! Now that we have
1792                          * resources!  Insert new entry at head of hash chain.
1793                          * Use src and dst which are ipaddr_t's.
1794                          */
1795                         mfc_rt->mfc_origin.s_addr = src;
1796                         mfc_rt->mfc_mcastgrp.s_addr = dst;
1797
1798                         mutex_enter(&ipst->ips_numvifs_mutex);
1799                         for (i = 0; i < (int)ipst->ips_numvifs; i++)
1800                                 mfc_rt->mfc_ttls[i] = 0;
1801                         mutex_exit(&ipst->ips_numvifs_mutex);
1802                         mfc_rt->mfc_parent = ALL_VIFS;
1803
1804                         /* Link into table */
1805                         if (ipst->ips_ip_mrtdebug > 1) {
1806                                 (void) mi_strlog(mrouter->conn_rq, 1,
1807                                     SL_TRACE,
1808                                     "ip_mforward: NEW MFCTAB hash %d o 0x%x "
1809                                     "g 0x%x\n", hash,
1810                                     ntohl(mfc_rt->mfc_origin.s_addr),
1811                                     ntohl(mfc_rt->mfc_mcastgrp.s_addr));
1812                         }
1813                         mfc_rt->mfc_next = ipst->ips_mfcs[hash].mfcb_mfc;
1814                         ipst->ips_mfcs[hash].mfcb_mfc = mfc_rt;
1815                         mfc_rt->mfc_rte = NULL;
1816                 }
1817
1818                 /* Link in the upcall */
1819                 /* First upcall */
1820                 if (mfc_rt->mfc_rte == NULL)
1821                         mfc_rt->mfc_rte = rte;
1822                 else {
1823                         /* not the first upcall */
                             /* Walk to the last queued entry and append rte. */
1824                         prev_rte = mfc_rt->mfc_rte;
1825                         for (rte1 = mfc_rt->mfc_rte->rte_next; rte1;
1826                             prev_rte = rte1, rte1 = rte1->rte_next)
1827                                 ;
1828                         prev_rte->rte_next = rte;
1829                 }
1830
1831                 /*
1832                  * No upcalls waiting, this is first one, so send a message to
1833                  * routing daemon to install a route into kernel table.
1834                  */
1835                 if (npkts == 0) {
1836                         struct igmpmsg  *im;
1837                         /* ipha_protocol is 0, for upcall */
                             /*
                              * The start of the copied packet is overwritten in
                              * place through the struct igmpmsg view.
                              */
1838                         ASSERT(mp_copy != NULL);
1839                         im = (struct igmpmsg *)mp_copy->b_rptr;
1840                         im->im_msgtype  = IGMPMSG_NOCACHE;
1841                         im->im_mbz = 0;
1842                         mutex_enter(&ipst->ips_numvifs_mutex);
1843                         if (pim_reg_packet) {
1844                                 im->im_vif = (uchar_t)ipst->ips_reg_vif_num;
1845                                 mutex_exit(&ipst->ips_numvifs_mutex);
1846                         } else {
1847                                 /*
1848                                  * XXX do we need to hold locks here ?
1849                                  */
                                     /* Report the first vif whose ill matches. */
1850                                 for (vifi = 0;
1851                                     vifi < ipst->ips_numvifs;
1852                                     vifi++) {
1853                                         if (ipst->ips_vifs[vifi].v_ipif == NULL)
1854                                                 continue;
1855                                         if (ipst->ips_vifs[vifi].
1856                                             v_ipif->ipif_ill == ill) {
1857                                                 im->im_vif = (uchar_t)vifi;
1858                                                 break;
1859                                         }
1860                                 }
1861                                 mutex_exit(&ipst->ips_numvifs_mutex);
1862                                 ASSERT(vifi < ipst->ips_numvifs);
1863                         }
1864
1865                         ipst->ips_mrtstat->mrts_upcalls++;
1866                         /* Timer to discard upcalls if mrouted is too slow */
1867                         mfc_rt->mfc_timeout_id = timeout(expire_upcalls,
1868                             mfc_rt, EXPIRE_TIMEOUT * UPCALL_EXPIRE);
1869                         mutex_exit(&mfc_rt->mfc_mutex);
1870                         mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
1871                         /* Pass to RAWIP */
                             /*
                              * Both locks are dropped before calling up.  The
                              * ill pointers in ira are cleared for the duration
                              * of the call and restored afterwards.
                              */
1872                         ira->ira_ill = ira->ira_rill = NULL;
1873                         (mrouter->conn_recv)(mrouter, mp_copy, NULL, ira);
1874                         ira->ira_ill = ill;
1875                         ira->ira_rill = rill;
1876                 } else {
                             /*
                              * NOTE(review): mp_copy is only assigned when a new
                              * mfc was allocated above, and that case always has
                              * npkts == 0, so mp_copy appears to be NULL on this
                              * path even though the packet copy (mp0) was queued.
                              * Confirm that ip_drop_input() and freemsg() accept
                              * NULL and that counting a discard here is intended.
                              */
1877                         mutex_exit(&mfc_rt->mfc_mutex);
1878                         mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
1879                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1880                         ip_drop_input("ip_mforward - upcall already waiting",
1881                             mp_copy, ill);
1882                         freemsg(mp_copy);
1883                 }
1884
1885                 MFCB_REFRELE(mfcbp);
1886                 if (tunnel_src != 0)
1887                         return (1);
1888                 else
1889                         return (0);
                     /*
                      * Common failure exit.  Every goto above arrives with the
                      * bucket's mfcb_lock held and with any mfc_mutex already
                      * released.  An mfc is freed only if it was allocated by
                      * this call (new_mfc); a matched entry stays in the table.
                      */
1890         error_return:
1891                 mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
1892                 MFCB_REFRELE(mfcbp);
1893                 if (mfc_rt != NULL && (new_mfc == B_TRUE))
1894                         mi_free((char *)mfc_rt);
1895                 if (rte != NULL)
1896                         mi_free((char *)rte);
1897                 if (mp_copy != NULL) {
1898                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1899                         ip_drop_input("ip_mforward error", mp_copy, ill);
1900                         freemsg(mp_copy);
1901                 }
1902                 if (mp0 != NULL)
1903                         freemsg(mp0);
1904                 return (-1);
1905         }
1906 }
1907
1908 /*
1909  * Clean up the mfctable cache entry if upcall is not serviced.
1910  * SunOS 5.x has timeout per mfc, unlike BSD which has one timer.
      *
      * arg is the struct mfc handed to timeout() when the first upcall for
      * the entry was queued.  The entry is not unlinked here; it and its hash
      * bucket are only marked MFCB_MARK_CONDEMNED.  Nothing is done if
      * mfc_timeout_id has already been cleared.
1911  */
1912 static void
1913 expire_upcalls(void *arg)
1914 {
1915         struct mfc *mfc_rt = arg;
1916         uint_t hash;
1917         struct mfc *prev_mfc, *mfc0;
1918         ip_stack_t      *ipst;
1919         conn_t          *mrouter;
1920
             /*
              * The IP stack is reached through the ill recorded on the first
              * queued upcall, so bail out when there is no queued upcall or it
              * has no ill.  (The test used to be "!= NULL", which returned in
              * the normal case and dereferenced a NULL ill otherwise.)
              */
1921         if (mfc_rt->mfc_rte == NULL || mfc_rt->mfc_rte->ill == NULL) {
1922                 cmn_err(CE_WARN, "expire_upcalls: no ILL\n");
1923                 return;
1924         }
1925         ipst = mfc_rt->mfc_rte->ill->ill_ipst;
1926         mrouter = ipst->ips_ip_g_mrouter;
1927
1928         hash = MFCHASH(mfc_rt->mfc_origin.s_addr, mfc_rt->mfc_mcastgrp.s_addr);
1929         if (ipst->ips_ip_mrtdebug > 1) {
1930                 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1931                     "expire_upcalls: hash %d s %x g %x",
1932                     hash, ntohl(mfc_rt->mfc_origin.s_addr),
1933                     ntohl(mfc_rt->mfc_mcastgrp.s_addr));
1934         }
             /* Hold the bucket while the entry is inspected and marked. */
1935         MFCB_REFHOLD(&ipst->ips_mfcs[hash]);
1936         mutex_enter(&mfc_rt->mfc_mutex);
1937         /*
1938          * if timeout has been set to zero, then the
1939          * entry has been filled, no need to delete it.
1940          */
1941         if (mfc_rt->mfc_timeout_id == 0)
1942                 goto done;
1943         ipst->ips_mrtstat->mrts_cache_cleanups++;
1944         mfc_rt->mfc_timeout_id = 0;
1945
1946         /* Determine entry to be cleaned up in cache table. */
             /* The walk only feeds the ASSERTs below. */
1947         for (prev_mfc = mfc0 = ipst->ips_mfcs[hash].mfcb_mfc; mfc0;
1948             prev_mfc = mfc0, mfc0 = mfc0->mfc_next)
1949                 if (mfc0 == mfc_rt)
1950                         break;
1951
1952         /* del_mfc takes care of gone mfcs */
1953         ASSERT(prev_mfc != NULL);
1954         ASSERT(mfc0 != NULL);
1955
1956         /*
1957          * Delete the entry from the cache
1958          */
1959         ipst->ips_mfcs[hash].mfcb_marks |= MFCB_MARK_CONDEMNED;
1960         mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED;
1961
1962         /*
1963          * release_mfc will drop all queued upcall packets.
1964          * and will free the mbuf with the pkt, if, timing info.
1965          */
1966 done:
1967         mutex_exit(&mfc_rt->mfc_mutex);
1968         MFCB_REFRELE(&ipst->ips_mfcs[hash]);
1969 }
1970
1971 /*
1972  * Packet forwarding routine once entry in the cache is made.
      *
      * mp and ipha are the packet and its IP header, ill is the interface the
      * packet is treated as having arrived on, tunnel_src is the tunnel source
      * (0 when not tunneled) and rt is the matching cache entry.  mp itself is
      * not consumed here; the send routines are handed mp and the ones visible
      * in this file work on copies of it.
      *
      * Returns -1 when the packet is dropped (parent is NO_VIF, the parent vif
      * cannot be locked, the parent index is out of range, or a copy for the
      * wrong-interface upcall cannot be allocated).  Otherwise returns 1 if
      * tunnel_src is non-zero and 0 if it is zero, including when the packet
      * arrived on the wrong interface and was not forwarded.
1973  */
1974 static int
1975 ip_mdq(mblk_t *mp, ipha_t *ipha, ill_t *ill, ipaddr_t tunnel_src,
1976     struct mfc *rt)
1977 {
1978         vifi_t vifi;
1979         struct vif *vifp;
1980         ipaddr_t dst = ipha->ipha_dst;
1981         size_t  plen = msgdsize(mp);
1982         vifi_t num_of_vifs;
1983         ip_stack_t      *ipst = ill->ill_ipst;
1984         conn_t          *mrouter = ipst->ips_ip_g_mrouter;
1985         ip_recv_attr_t  iras;
1986
1987         if (ipst->ips_ip_mrtdebug > 1) {
1988                 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1989                     "ip_mdq: SEND src %x, ipha_dst %x, ill %s",
1990                     ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
1991                     ill->ill_name);
1992         }
1993
1994         /* Macro to send packet on vif */
             /* Dispatch on the vif flags: tunnel, then register, else physical. */
1995 #define MC_SEND(ipha, mp, vifp, dst) { \
1996         if ((vifp)->v_flags & VIFF_TUNNEL) \
1997                 encap_send((ipha), (mp), (vifp), (dst)); \
1998         else if ((vifp)->v_flags & VIFF_REGISTER) \
1999                 register_send((ipha), (mp), (vifp), (dst)); \
2000         else \
2001                 phyint_send((ipha), (mp), (vifp), (dst)); \
2002 }
2003
2004         vifi = rt->mfc_parent;
2005
2006         /*
2007          * The value of vifi is MAXVIFS if the pkt had no parent, i.e.,
2008          * Mrouted had no route.
2009          * We wanted the route installed in the mfctable to prevent multiple
2010          * tries, so it passed add_mfc(), but is discarded here. The v_ipif is
2011          * NULL so we don't want to check the ill. Still needed as of Mrouted
2012          * 3.6.
2013          */
2014         if (vifi == NO_VIF) {
2015                 ip1dbg(("ip_mdq: no route for origin ill %s, vifi is NO_VIF\n",
2016                     ill->ill_name));
2017                 if (ipst->ips_ip_mrtdebug > 1) {
2018                         (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2019                             "ip_mdq: vifi is NO_VIF ill = %s", ill->ill_name);
2020                 }
2021                 return (-1);    /* drop pkt */
2022         }
2023
             /*
              * From here until the matching unlock_good_vif() the parent vif is
              * held; every return in between releases it first.
              */
2024         if (!lock_good_vif(&ipst->ips_vifs[vifi]))
2025                 return (-1);
2026         /*
2027          * The MFC entries are not cleaned up when an ipif goes
2028          * away thus this code has to guard against an MFC referencing
2029          * an ipif that has been closed. Note: reset_mrt_vif_ipif
2030          * sets the v_ipif to NULL when the ipif disappears.
2031          */
2032         ASSERT(ipst->ips_vifs[vifi].v_ipif != NULL);
2033
             /*
              * NOTE(review): this range check runs after ips_vifs[vifi] has
              * already been indexed above, and the warning below indexes it
              * again.  ips_numvifs is also read here without
              * ips_numvifs_mutex.  Presumably the parent index is validated
              * when the entry is installed -- confirm.
              */
2034         if (vifi >= ipst->ips_numvifs) {
2035                 cmn_err(CE_WARN, "ip_mdq: illegal vifi %d numvifs "
2036                     "%d ill %s viftable ill %s\n",
2037                     (int)vifi, (int)ipst->ips_numvifs, ill->ill_name,
2038                     ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name);
2039                 unlock_good_vif(&ipst->ips_vifs[vifi]);
2040                 return (-1);
2041         }
2042         /*
2043          * Don't forward if it didn't arrive from the parent vif for its
2044          * origin.
2045          */
2046         if ((ipst->ips_vifs[vifi].v_ipif->ipif_ill != ill) ||
2047             (ipst->ips_vifs[vifi].v_rmt_addr.s_addr != tunnel_src)) {
2048                 /* Came in the wrong interface */
2049                 ip1dbg(("ip_mdq: arrived wrong if, vifi %d "
2050                         "numvifs %d ill %s viftable ill %s\n",
2051                         (int)vifi, (int)ipst->ips_numvifs, ill->ill_name,
2052                         ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name));
2053                 if (ipst->ips_ip_mrtdebug > 1) {
2054                         (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2055                             "ip_mdq: arrived wrong if, vifi %d ill "
2056                             "%s viftable ill %s\n",
2057                             (int)vifi, ill->ill_name,
2058                             ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name);
2059                 }
2060                 ipst->ips_mrtstat->mrts_wrong_if++;
2061                 rt->mfc_wrong_if++;
2062
2063                 /*
2064                  * If we are doing PIM assert processing and we are forwarding
2065                  * packets on this interface, and it is a broadcast medium
2066                  * interface (and not a tunnel), send a message to the routing.
2067                  *
2068                  * We use the first ipif on the list, since it's all we have.
2069                  * Chances are the ipif_flags are the same for ipifs on the ill.
2070                  */
2071                 if (ipst->ips_pim_assert && rt->mfc_ttls[vifi] > 0 &&
2072                     (ill->ill_ipif->ipif_flags & IPIF_BROADCAST) &&
2073                     !(ipst->ips_vifs[vifi].v_flags & VIFF_TUNNEL)) {
2074                         mblk_t          *mp_copy;
2075                         struct igmpmsg  *im;
2076
2077                         /* TODO could copy header and dup rest */
2078                         mp_copy = copymsg(mp);
2079                         if (mp_copy == NULL) {
2080                                 ipst->ips_mrtstat->mrts_fwd_drop++;
2081                                 ip1dbg(("ip_mdq: out of memory "
2082                                     "for mblk, mp_copy\n"));
2083                                 unlock_good_vif(&ipst->ips_vifs[vifi]);
2084                                 return (-1);
2085                         }
2086
                             /*
                              * The start of the copy is overwritten in place
                              * through the struct igmpmsg view.
                              */
2087                         im = (struct igmpmsg *)mp_copy->b_rptr;
2088                         im->im_msgtype = IGMPMSG_WRONGVIF;
2089                         im->im_mbz = 0;
2090                         im->im_vif = (ushort_t)vifi;
2091                         /* Pass to RAWIP */
2092
                             /*
                              * A zeroed local attribute structure is used for
                              * the upcall; only the fields set below are
                              * filled in.
                              */
2093                         bzero(&iras, sizeof (iras));
2094                         iras.ira_flags = IRAF_IS_IPV4;
2095                         iras.ira_ip_hdr_length =
2096                             IPH_HDR_LENGTH(mp_copy->b_rptr);
2097                         iras.ira_pktlen = msgdsize(mp_copy);
2098                         (mrouter->conn_recv)(mrouter, mp_copy, NULL, &iras);
2099                         ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
2100                 }
2101                 unlock_good_vif(&ipst->ips_vifs[vifi]);
2102                 if (tunnel_src != 0)
2103                         return (1);
2104                 else
2105                         return (0);
2106         }
2107         /*
2108          * If I sourced this packet, it counts as output, else it was input.
2109          */
2110         if (ipha->ipha_src == ipst->ips_vifs[vifi].v_lcl_addr.s_addr) {
2111                 ipst->ips_vifs[vifi].v_pkt_out++;
2112                 ipst->ips_vifs[vifi].v_bytes_out += plen;
2113         } else {
2114                 ipst->ips_vifs[vifi].v_pkt_in++;
2115                 ipst->ips_vifs[vifi].v_bytes_in += plen;
2116         }
             /* Per-route counters are updated under the entry's own mutex. */
2117         mutex_enter(&rt->mfc_mutex);
2118         rt->mfc_pkt_cnt++;
2119         rt->mfc_byte_cnt += plen;
2120         mutex_exit(&rt->mfc_mutex);
2121         unlock_good_vif(&ipst->ips_vifs[vifi]);
2122         /*
2123          * For each vif, decide if a copy of the packet should be forwarded.
2124          * Forward if:
2125          *              - the vif threshold ttl is non-zero AND
2126          *              - the pkt ttl exceeds the vif's threshold
2127          * A non-zero mfc_ttl indicates that the vif is part of
2128          * the output set for the mfc entry.
2129          */
             /*
              * The vif count is sampled once under ips_numvifs_mutex; the loop
              * then runs against that snapshot without the mutex.
              */
2130         mutex_enter(&ipst->ips_numvifs_mutex);
2131         num_of_vifs = ipst->ips_numvifs;
2132         mutex_exit(&ipst->ips_numvifs_mutex);
2133         for (vifp = ipst->ips_vifs, vifi = 0;
2134             vifi < num_of_vifs;
2135             vifp++, vifi++) {
                     /* Skip vifs that cannot be locked. */
2136                 if (!lock_good_vif(vifp))
2137                         continue;
2138                 if ((rt->mfc_ttls[vifi] > 0) &&
2139                     (ipha->ipha_ttl > rt->mfc_ttls[vifi])) {
2140                         /*
2141                          * lock_good_vif should not have succeeded if
2142                          * v_ipif is null.
2143                          */
2144                         ASSERT(vifp->v_ipif != NULL);
2145                         vifp->v_pkt_out++;
2146                         vifp->v_bytes_out += plen;
2147                         MC_SEND(ipha, mp, vifp, dst);
2148                         ipst->ips_mrtstat->mrts_fwd_out++;
2149                 }
2150                 unlock_good_vif(vifp);
2151         }
2152         if (tunnel_src != 0)
2153                 return (1);
2154         else
2155                 return (0);
2156 }
2157
2158 /*
2159  * Transmit a private copy of mp on the physical interface behind vifp.
2160  * mp itself is left untouched, so the caller may keep using it on return.
2161  */
2162 /* ARGSUSED */
2163 static void
2164 phyint_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
2165 {
2166         ip_stack_t      *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2167         conn_t          *mrouter = ipst->ips_ip_g_mrouter;
2168         mblk_t          *pkt = copymsg(mp);
2169
2170         /* TODO could copy header and dup rest */
2171         if (pkt == NULL) {
2172                 ipst->ips_mrtstat->mrts_fwd_drop++;
2173                 ip1dbg(("phyint_send: out of memory for mblk, mp_copy\n"));
2174                 return;
2175         }
2176         /* A non-positive rate limit skips tbf_control() entirely. */
2177         if (vifp->v_rate_limit <= 0) {
2178                 tbf_send_packet(vifp, pkt);
2179                 return;
2180         }
2181         if (ipst->ips_ip_mrtdebug > 1) {
2182                 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2183                     "phyint_send: tbf_contr rate %d "
2184                     "vifp 0x%p mp 0x%p dst 0x%x",
2185                     vifp->v_rate_limit, (void *)vifp, (void *)mp, dst);
2186         }
2187         tbf_control(vifp, pkt, (ipha_t *)pkt->b_rptr);
2188 }
2189
2190 /*
2191  * Send the whole packet for REGISTER encapsulation to PIM daemon
2192  * Caller assumes it can continue to use mp on return.
2193  */
2194 /* ARGSUSED */
2195 static void
2196 register_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
2197 {
2198         struct igmpmsg  *im;
2199         mblk_t          *mp_copy;
2200         ipha_t          *ipha_copy;
2201         ill_t           *ill = vifp->v_ipif->ipif_ill;
2202         ip_stack_t      *ipst = ill->ill_ipst;
2203         conn_t          *mrouter = ipst->ips_ip_g_mrouter;
2204         ip_recv_attr_t  iras;
2205
2206         if (ipst->ips_ip_mrtdebug > 1) {
2207                 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2208                     "register_send: src %x, dst %x\n",
2209                     ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst));
2210         }
2211
2212         /*
2213          * Copy the old packet & pullup its IP header into the new mblk_t so we
2214          * can modify it.  Try to fill the new mblk_t since if we don't the
2215          * ethernet driver will.
2216          */
2217         mp_copy = allocb(sizeof (struct igmpmsg) + sizeof (ipha_t), BPRI_MED);
2218         if (mp_copy == NULL) {
2219                 ++ipst->ips_mrtstat->mrts_pim_nomemory;
2220                 if (ipst->ips_ip_mrtdebug > 3) {
2221                         (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2222                             "register_send: allocb failure.");
2223                 }
2224                 return;
2225         }
2226
2227         /*
2228          * Bump write pointer to account for igmpmsg being added.
2229          */
2230         mp_copy->b_wptr = mp_copy->b_rptr + sizeof (struct igmpmsg);
2231
2232         /*
2233          * Chain packet to new mblk_t.
2234          */
2235         if ((mp_copy->b_cont = copymsg(mp)) == NULL) {
2236                 ++ipst->ips_mrtstat->mrts_pim_nomemory;
2237                 if (ipst->ips_ip_mrtdebug > 3) {
2238                         (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2239                             "register_send: copymsg failure.");
2240                 }
2241                 freeb(mp_copy);
2242                 return;
2243         }
2244
2245         /*
2246          * icmp_input() asserts that IP version field is set to an
2247          * appropriate version. Hence, the struct igmpmsg that this really
2248          * becomes, needs to have the correct IP version field.
2249          */
2250         ipha_copy = (ipha_t *)mp_copy->b_rptr;
2251         *ipha_copy = multicast_encap_iphdr;
2252
2253         /*
2254          * The kernel uses the struct igmpmsg header to encode the messages to
2255          * the multicast routing daemon. Fill in the fields in the header
2256          * starting with the message type which is IGMPMSG_WHOLEPKT
2257          */
2258         im = (struct igmpmsg *)mp_copy->b_rptr;
2259         im->im_msgtype = IGMPMSG_WHOLEPKT;
2260         im->im_src.s_addr = ipha->ipha_src;
2261         im->im_dst.s_addr = ipha->ipha_dst;
2262
2263         /*
2264          * Must Be Zero. This is because the struct igmpmsg is really an IP
2265          * header with renamed fields and the multicast routing daemon uses
2266          * an ipha_protocol (aka im_mbz) of 0 to distinguish these messages.
2267          */
2268         im->im_mbz = 0;
2269
2270         ++ipst->ips_mrtstat->mrts_upcalls;
2271         if (IPCL_IS_NONSTR(mrouter) ? mrouter->conn_flow_cntrld :
2272             !canputnext(mrouter->conn_rq)) {
2273                 ++ipst->ips_mrtstat->mrts_pim_regsend_drops;
2274                 if (ipst->ips_ip_mrtdebug > 3) {
2275                         (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2276                             "register_send: register upcall failure.");
2277                 }
2278                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2279                 ip_drop_input("mrts_pim_regsend_drops", mp_copy, ill);
2280                 freemsg(mp_copy);
2281         } else {
2282                 /* Pass to RAWIP */
2283                 bzero(&iras, sizeof (iras));
2284                 iras.ira_flags = IRAF_IS_IPV4;
2285                 iras.ira_ip_hdr_length = sizeof (ipha_t);
2286                 iras.ira_pktlen = msgdsize(mp_copy);
2287                 (mrouter->conn_recv)(mrouter, mp_copy, NULL, &iras);
2288                 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
2289         }
2290 }
2291
2292 /*
2293  * pim_validate_cksum handles verification of the checksum in the
2294  * pim header.  For PIM Register packets, the checksum is calculated
2295  * across the PIM header only.  For all other packets, the checksum
2296  * is for the PIM header and remainder of the packet.
2297  *
2298  * returns: B_TRUE, if checksum is okay.
2299  *          B_FALSE, if checksum is not valid.
2300  */
2301 static boolean_t
2302 pim_validate_cksum(mblk_t *mp, ipha_t *ip, struct pim *pimp)
2303 {
2304         mblk_t *mp_dup;
2305
2306         if ((mp_dup = dupmsg(mp)) == NULL)
2307                 return (B_FALSE);
2308
2309         mp_dup->b_rptr += IPH_HDR_LENGTH(ip);
2310         if (pimp->pim_type == PIM_REGISTER)
2311                 mp_dup->b_wptr = mp_dup->b_rptr + PIM_MINLEN;
2312         if (IP_CSUM(mp_dup, 0, 0)) {
2313                 freemsg(mp_dup);
2314                 return (B_FALSE);
2315         }
2316         freemsg(mp_dup);
2317         return (B_TRUE);
2318 }
2319
2320 /*
2321  * Process PIM protocol packets i.e. IP Protocol 103.
2322  * Register messages are decapsulated and sent onto multicast forwarding.
2323  *
2324  * Return NULL for a bad packet that is discarded here.
2325  * Return mp if the message is OK and should be handed to "raw" receivers.
2326  * Callers of pim_input() may need to reinitialize variables that were copied
2327  * from the mblk as this calls pullupmsg().
2328  */
2329 mblk_t *
2330 pim_input(mblk_t *mp, ip_recv_attr_t *ira)
2331 {
2332         ipha_t          *eip, *ip;
2333         int             iplen, pimlen, iphlen;
2334         struct pim      *pimp;  /* pointer to a pim struct */
2335         uint32_t        *reghdr;
2336         ill_t           *ill = ira->ira_ill;
2337         ip_stack_t      *ipst = ill->ill_ipst;
2338         conn_t          *mrouter = ipst->ips_ip_g_mrouter;
2339
2340         /*
2341          * Pullup the msg for PIM protocol processing.
2342          */
2343         if (pullupmsg(mp, -1) == 0) {
2344                 ++ipst->ips_mrtstat->mrts_pim_nomemory;
2345                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2346                 ip_drop_input("mrts_pim_nomemory", mp, ill);
2347                 freemsg(mp);
2348                 return (NULL);
2349         }
2350
2351         ip = (ipha_t *)mp->b_rptr;
2352         iplen = ip->ipha_length;
2353         iphlen = IPH_HDR_LENGTH(ip);
2354         pimlen = ntohs(iplen) - iphlen;
2355
2356         /*
2357          * Validate lengths
2358          */
2359         if (pimlen < PIM_MINLEN) {
2360                 ++ipst->ips_mrtstat->mrts_pim_malformed;
2361                 if (ipst->ips_ip_mrtdebug > 1) {
2362                         (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2363                             "pim_input: length not at least minlen");
2364                 }
2365                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2366                 ip_drop_input("mrts_pim_malformed", mp, ill);
2367                 freemsg(mp);
2368                 return (NULL);
2369         }
2370
2371         /*
2372          * Point to the PIM header.
2373          */
2374         pimp = (struct pim *)((caddr_t)ip + iphlen);
2375
2376         /*
2377          * Check the version number.
2378          */
2379         if (pimp->pim_vers != PIM_VERSION) {
2380                 ++ipst->ips_mrtstat->mrts_pim_badversion;
2381                 if (ipst->ips_ip_mrtdebug > 1) {
2382                         (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2383                             "pim_input: unknown version of PIM");
2384                 }
2385                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2386                 ip_drop_input("mrts_pim_badversion", mp, ill);
2387                 freemsg(mp);
2388                 return (NULL);
2389         }
2390
2391         /*
2392          * Validate the checksum
2393          */
2394         if (!pim_validate_cksum(mp, ip, pimp)) {
2395                 ++ipst->ips_mrtstat->mrts_pim_rcv_badcsum;
2396                 if (ipst->ips_ip_mrtdebug > 1) {
2397                         (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2398                             "pim_input: invalid checksum");
2399                 }
2400                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2401                 ip_drop_input("pim_rcv_badcsum", mp, ill);
2402                 freemsg(mp);
2403                 return (NULL);
2404         }
2405
2406         if (pimp->pim_type != PIM_REGISTER)
2407                 return (mp);
2408
2409         reghdr = (uint32_t *)(pimp + 1);
2410         eip = (ipha_t *)(reghdr + 1);
2411
2412         /*
2413          * check if the inner packet is destined to mcast group
2414          */
2415         if (!CLASSD(eip->ipha_dst)) {
2416                 ++ipst->ips_mrtstat->mrts_pim_badregisters;
2417                 if (ipst->ips_ip_mrtdebug > 1) {
2418                         (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2419                             "pim_input: Inner pkt not mcast .. !");
2420                 }
2421                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2422                 ip_drop_input("mrts_pim_badregisters", mp, ill);
2423                 freemsg(mp);
2424                 return (NULL);
2425         }
2426         if (ipst->ips_ip_mrtdebug > 1) {
2427                 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2428                     "register from %x, to %x, len %d",
2429                     ntohl(eip->ipha_src),
2430                     ntohl(eip->ipha_dst),
2431                     ntohs(eip->ipha_length));
2432         }
2433         /*
2434          * If the null register bit is not set, decapsulate
2435          * the packet before forwarding it.
2436          * Avoid this in no register vif
2437          */
2438         if (!(ntohl(*reghdr) & PIM_NULL_REGISTER) &&
2439             ipst->ips_reg_vif_num != ALL_VIFS) {
2440                 mblk_t *mp_copy;
2441                 uint_t saved_pktlen;
2442
2443                 /* Copy the message */
2444                 if ((mp_copy = copymsg(mp)) == NULL) {
2445                         ++ipst->ips_mrtstat->mrts_pim_nomemory;
2446                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2447                         ip_drop_input("mrts_pim_nomemory", mp, ill);
2448                         freemsg(mp);
2449                         return (NULL);
2450                 }
2451
2452                 /*
2453                  * Decapsulate the packet and give it to
2454                  * register_mforward.
2455                  */
2456                 mp_copy->b_rptr += iphlen + sizeof (pim_t) + sizeof (*reghdr);
2457                 saved_pktlen = ira->ira_pktlen;
2458                 ira->ira_pktlen -= iphlen + sizeof (pim_t) + sizeof (*reghdr);
2459                 if (register_mforward(mp_copy, ira) != 0) {
2460                         /* register_mforward already called ip_drop_input */
2461                         freemsg(mp);
2462                         ira->ira_pktlen = saved_pktlen;
2463                         return (NULL);
2464                 }
2465                 ira->ira_pktlen = saved_pktlen;
2466         }
2467
2468         /*
2469          * Pass all valid PIM packets up to any process(es) listening on a raw
2470          * PIM socket. For Solaris it is done right after pim_input() is
2471          * called.
2472          */
2473         return (mp);
2474 }
2475
2476 /*
2477  * PIM sparse mode hook.  Called by pim_input after decapsulating
2478  * the packet. Loop back the packet, as if we have received it.
2479  * In pim_input() we have to check if the destination is a multicast address.
2480  */
2481 static int
2482 register_mforward(mblk_t *mp, ip_recv_attr_t *ira)
2483 {
2484         ire_t           *ire;
2485         ipha_t          *ipha = (ipha_t *)mp->b_rptr;
2486         ill_t           *ill = ira->ira_ill;
2487         ip_stack_t      *ipst = ill->ill_ipst;
2488         conn_t          *mrouter = ipst->ips_ip_g_mrouter;
2489
2490         ASSERT(ipst->ips_reg_vif_num <= ipst->ips_numvifs);
2491
2492         if (ipst->ips_ip_mrtdebug > 3) {
2493                 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2494                     "register_mforward: src %x, dst %x\n",
2495                     ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst));
2496         }
2497         /*
2498          * Need to pass in to ip_mforward() the information that the
2499          * packet has arrived on the register_vif. We mark it with
2500          * the IRAF_PIM_REGISTER attribute.
2501          * pim_input verified that the (inner) destination is multicast,
2502          * hence we skip the generic code in ip_input.
2503          */
2504         ira->ira_flags |= IRAF_PIM_REGISTER;
2505         ++ipst->ips_mrtstat->mrts_pim_regforwards;
2506
2507         if (!CLASSD(ipha->ipha_dst)) {
2508                 ire = ire_route_recursive_v4(ipha->ipha_dst, 0, NULL, ALL_ZONES,
2509                     ira->ira_tsl, MATCH_IRE_SECATTR, IRR_ALLOCATE, 0, ipst,
2510                     NULL, NULL, NULL);
2511         } else {
2512                 ire = ire_multicast(ill);
2513         }
2514         ASSERT(ire != NULL);
2515         /* Normally this will return the IRE_MULTICAST */
2516         if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
2517                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2518                 ip_drop_input("mrts_pim RTF_REJECT", mp, ill);
2519                 freemsg(mp);
2520                 ire_refrele(ire);
2521                 return (-1);
2522         }
2523         ASSERT(ire->ire_type & IRE_MULTICAST);
2524         (*ire->ire_recvfn)(ire, mp, ipha, ira);
2525         ire_refrele(ire);
2526
2527         return (0);
2528 }
2529
2530 /*
2531  * Send an encapsulated packet.
2532  * Caller assumes can continue to use mp when routine returns.
2533  */
2534 /* ARGSUSED */
2535 static void
2536 encap_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
2537 {
2538         mblk_t  *mp_copy;
2539         ipha_t  *ipha_copy;
2540         size_t  len;
2541         ip_stack_t      *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2542         conn_t          *mrouter = ipst->ips_ip_g_mrouter;
2543
2544         if (ipst->ips_ip_mrtdebug > 1) {
2545                 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2546                     "encap_send: vif %ld enter",
2547                     (ptrdiff_t)(vifp - ipst->ips_vifs));
2548         }
2549         len = ntohs(ipha->ipha_length);
2550
2551         /*
2552          * Copy the old packet & pullup it's IP header into the
2553          * new mbuf so we can modify it.  Try to fill the new
2554          * mbuf since if we don't the ethernet driver will.
2555          */
2556         mp_copy = allocb(32 + sizeof (multicast_encap_iphdr), BPRI_MED);
2557         if (mp_copy == NULL)
2558                 return;
2559         mp_copy->b_rptr += 32;
2560         mp_copy->b_wptr = mp_copy->b_rptr + sizeof (multicast_encap_iphdr);
2561         if ((mp_copy->b_cont = copymsg(mp)) == NULL) {
2562                 freeb(mp_copy);
2563                 return;
2564         }
2565
2566         /*
2567          * Fill in the encapsulating IP header.
2568          * Remote tunnel dst in rmt_addr, from add_vif().
2569          */
2570         ipha_copy = (ipha_t *)mp_copy->b_rptr;
2571         *ipha_copy = multicast_encap_iphdr;
2572         ASSERT((len + sizeof (ipha_t)) <= IP_MAXPACKET);
2573         ipha_copy->ipha_length = htons(len + sizeof (ipha_t));
2574         ipha_copy->ipha_src = vifp->v_lcl_addr.s_addr;
2575         ipha_copy->ipha_dst = vifp->v_rmt_addr.s_addr;
2576         ASSERT(ipha_copy->ipha_ident == 0);
2577
2578         /* Turn the encapsulated IP header back into a valid one. */
2579         ipha = (ipha_t *)mp_copy->b_cont->b_rptr;
2580         ipha->ipha_ttl--;
2581         ipha->ipha_hdr_checksum = 0;
2582         ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
2583
2584         ipha_copy->ipha_ttl = ipha->ipha_ttl;
2585
2586         if (ipst->ips_ip_mrtdebug > 1) {
2587                 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2588                     "encap_send: group 0x%x", ntohl(ipha->ipha_dst));
2589         }
2590         if (vifp->v_rate_limit <= 0)
2591                 tbf_send_packet(vifp, mp_copy);
2592         else
2593                 /* ipha is from the original header */
2594                 tbf_control(vifp, mp_copy, ipha);
2595 }
2596
2597 /*
2598  * De-encapsulate a packet and feed it back through IP input if it
2599  * matches one of our multicast tunnels.
2600  *
2601  * This routine is called whenever IP gets a packet with prototype
2602  * IPPROTO_ENCAP and a local destination address and the packet didn't
2603  * match one of our configured IP-in-IP tunnels.
2604  */
2605 void
2606 ip_mroute_decap(mblk_t *mp, ip_recv_attr_t *ira)
2607 {
2608         ipha_t          *ipha = (ipha_t *)mp->b_rptr;
2609         ipha_t          *ipha_encap;
2610         int             hlen = IPH_HDR_LENGTH(ipha);
2611         int             hlen_encap;
2612         ipaddr_t        src;
2613         struct vif      *vifp;
2614         ire_t           *ire;
2615         ill_t           *ill = ira->ira_ill;
2616         ip_stack_t      *ipst = ill->ill_ipst;
2617         conn_t          *mrouter = ipst->ips_ip_g_mrouter;
2618
2619         /* Make sure we have all of the inner header */
2620         ipha_encap = (ipha_t *)((char *)ipha + hlen);
2621         if (mp->b_wptr - mp->b_rptr < hlen + IP_SIMPLE_HDR_LENGTH) {
2622                 ipha = ip_pullup(mp, hlen + IP_SIMPLE_HDR_LENGTH, ira);
2623                 if (ipha == NULL) {
2624                         ipst->ips_mrtstat->mrts_bad_tunnel++;
2625                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2626                         ip_drop_input("ip_mroute_decap: too short", mp, ill);
2627                         freemsg(mp);
2628                         return;
2629                 }
2630                 ipha_encap = (ipha_t *)((char *)ipha + hlen);
2631         }
2632         hlen_encap = IPH_HDR_LENGTH(ipha_encap);
2633         if (mp->b_wptr - mp->b_rptr < hlen + hlen_encap) {
2634                 ipha = ip_pullup(mp, hlen + hlen_encap, ira);
2635                 if (ipha == NULL) {
2636                         ipst->ips_mrtstat->mrts_bad_tunnel++;
2637                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2638                         ip_drop_input("ip_mroute_decap: too short", mp, ill);
2639                         freemsg(mp);
2640                         return;
2641                 }
2642                 ipha_encap = (ipha_t *)((char *)ipha + hlen);
2643         }
2644
2645         /*
2646          * Dump the packet if it's not to a multicast destination or if
2647          * we don't have an encapsulating tunnel with the source.
2648          * Note:  This code assumes that the remote site IP address
2649          * uniquely identifies the tunnel (i.e., that this site has
2650          * at most one tunnel with the remote site).
2651          */
2652         if (!CLASSD(ipha_encap->ipha_dst)) {
2653                 ipst->ips_mrtstat->mrts_bad_tunnel++;
2654                 ip1dbg(("ip_mroute_decap: bad tunnel\n"));
2655                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2656                 ip_drop_input("mrts_bad_tunnel", mp, ill);
2657                 freemsg(mp);
2658                 return;
2659         }
2660         src = (ipaddr_t)ipha->ipha_src;
2661         mutex_enter(&ipst->ips_last_encap_lock);
2662         if (src != ipst->ips_last_encap_src) {
2663                 struct vif *vife;
2664
2665                 vifp = ipst->ips_vifs;
2666                 vife = vifp + ipst->ips_numvifs;
2667                 ipst->ips_last_encap_src = src;
2668                 ipst->ips_last_encap_vif = 0;
2669                 for (; vifp < vife; ++vifp) {
2670                         if (!lock_good_vif(vifp))
2671                                 continue;
2672                         if (vifp->v_rmt_addr.s_addr == src) {
2673                                 if (vifp->v_flags & VIFF_TUNNEL)
2674                                         ipst->ips_last_encap_vif = vifp;
2675                                 if (ipst->ips_ip_mrtdebug > 1) {
2676                                         (void) mi_strlog(mrouter->conn_rq,
2677                                             1, SL_TRACE,
2678                                             "ip_mroute_decap: good tun "
2679                                             "vif %ld with %x",
2680                                             (ptrdiff_t)(vifp - ipst->ips_vifs),
2681                                             ntohl(src));
2682                                 }
2683                                 unlock_good_vif(vifp);
2684                                 break;
2685                         }
2686                         unlock_good_vif(vifp);
2687                 }
2688         }
2689         if ((vifp = ipst->ips_last_encap_vif) == 0) {
2690                 mutex_exit(&ipst->ips_last_encap_lock);
2691                 ipst->ips_mrtstat->mrts_bad_tunnel++;
2692                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2693                 ip_drop_input("mrts_bad_tunnel", mp, ill);
2694                 freemsg(mp);
2695                 ip1dbg(("ip_mroute_decap: vif %ld no tunnel with %x\n",
2696                     (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(src)));
2697                 return;
2698         }
2699         mutex_exit(&ipst->ips_last_encap_lock);
2700
2701         /*
2702          * Need to pass in the tunnel source to ip_mforward (so that it can
2703          * verify that the packet arrived over the correct vif.)
2704          */
2705         ira->ira_flags |= IRAF_MROUTE_TUNNEL_SET;
2706         ira->ira_mroute_tunnel = src;
2707         mp->b_rptr += hlen;
2708         ira->ira_pktlen -= hlen;
2709         ira->ira_ip_hdr_length = hlen_encap;
2710
2711         /*
2712          * We don't redo any of the filtering in ill_input_full_v4 and we
2713          * have checked that all of ipha_encap and any IP options are
2714          * pulled up. Hence we call ire_recv_multicast_v4 directly.
2715          * However, we have to check for RSVP as in ip_input_full_v4
2716          * and if so we pass it to ire_recv_broadcast_v4 for local delivery
2717          * to the rsvpd.
2718          */
2719         if (ipha_encap->ipha_protocol == IPPROTO_RSVP &&
2720             ipst->ips_ipcl_proto_fanout_v4[IPPROTO_RSVP].connf_head != NULL) {
2721                 ire = ire_route_recursive_v4(INADDR_BROADCAST, 0, ill,
2722                     ALL_ZONES, ira->ira_tsl, MATCH_IRE_ILL|MATCH_IRE_SECATTR,
2723                     IRR_ALLOCATE, 0, ipst, NULL, NULL, NULL);
2724         } else {
2725                 ire = ire_multicast(ill);
2726         }
2727         ASSERT(ire != NULL);
2728         /* Normally this will return the IRE_MULTICAST or IRE_BROADCAST */
2729         if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
2730                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2731                 ip_drop_input("ip_mroute_decap: RTF_REJECT", mp, ill);
2732                 freemsg(mp);
2733                 ire_refrele(ire);
2734                 return;
2735         }
2736         ire->ire_ib_pkt_count++;
2737         ASSERT(ire->ire_type & (IRE_MULTICAST|IRE_BROADCAST));
2738         (*ire->ire_recvfn)(ire, mp, ipha_encap, ira);
2739         ire_refrele(ire);
2740 }
2741
2742 /*
2743  * Remove all records with v_ipif == ipif.  Called when an interface goes away
2744  * (stream closed).  Called as writer.
2745  */
2746 void
2747 reset_mrt_vif_ipif(ipif_t *ipif)
2748 {
2749         vifi_t vifi, tmp_vifi;
2750         vifi_t num_of_vifs;
2751         ip_stack_t      *ipst = ipif->ipif_ill->ill_ipst;
2752
2753         /* Can't check vifi >= 0 since vifi_t is unsigned! */
2754
2755         mutex_enter(&ipst->ips_numvifs_mutex);
2756         num_of_vifs = ipst->ips_numvifs;
2757         mutex_exit(&ipst->ips_numvifs_mutex);
2758
2759         for (vifi = num_of_vifs; vifi != 0; vifi--) {
2760                 tmp_vifi = vifi - 1;
2761                 if (ipst->ips_vifs[tmp_vifi].v_ipif == ipif) {
2762                         (void) del_vif(&tmp_vifi, ipst);
2763                 }
2764         }
2765 }
2766
2767 /* Remove pending upcall msgs when ill goes away.  Called by ill_delete.  */
2768 void
2769 reset_mrt_ill(ill_t *ill)
2770 {
2771         struct mfc      *rt;
2772         struct rtdetq   *rte;
2773         int             i;
2774         ip_stack_t      *ipst = ill->ill_ipst;
2775         conn_t          *mrouter = ipst->ips_ip_g_mrouter;
2776         timeout_id_t    id;
2777
2778         for (i = 0; i < MFCTBLSIZ; i++) {
2779                 MFCB_REFHOLD(&ipst->ips_mfcs[i]);
2780                 if ((rt = ipst->ips_mfcs[i].mfcb_mfc) != NULL) {
2781                         if (ipst->ips_ip_mrtdebug > 1) {
2782                                 (void) mi_strlog(mrouter->conn_rq, 1,
2783                                     SL_TRACE,
2784                                     "reset_mrt_ill: mfctable [%d]", i);
2785                         }
2786                         while (rt != NULL) {
2787                                 mutex_enter(&rt->mfc_mutex);
2788                                 while ((rte = rt->mfc_rte) != NULL) {
2789                                         if (rte->ill == ill &&
2790                                             (id = rt->mfc_timeout_id) != 0) {
2791                                                 /*
2792                                                  * Its ok to drop the lock,  the
2793                                                  * struct cannot be freed since
2794                                                  * we have a ref on the hash
2795                                                  * bucket.
2796                                                  */
2797                                                 mutex_exit(&rt->mfc_mutex);
2798                                                 (void) untimeout(id);
2799                                                 mutex_enter(&rt->mfc_mutex);
2800                                         }
2801                                         if (rte->ill == ill) {
2802                                                 if (ipst->ips_ip_mrtdebug > 1) {
2803                                                 (void) mi_strlog(
2804                                                     mrouter->conn_rq,
2805                                                     1, SL_TRACE,
2806                                                     "reset_mrt_ill: "
2807                                                     "ill 0x%p", (void *)ill);
2808                                                 }
2809                                                 rt->mfc_rte = rte->rte_next;
2810                                                 freemsg(rte->mp);
2811                                                 mi_free((char *)rte);
2812                                         }
2813                                 }
2814                                 mutex_exit(&rt->mfc_mutex);
2815                                 rt = rt->mfc_next;
2816                         }
2817                 }
2818                 MFCB_REFRELE(&ipst->ips_mfcs[i]);
2819         }
2820 }
2821
2822 /*
2823  * Token bucket filter module.
2824  * The ipha is for mcastgrp destination for phyint and encap.
2825  */
2826 static void
2827 tbf_control(struct vif *vifp, mblk_t *mp, ipha_t *ipha)
2828 {
2829         size_t  p_len =  msgdsize(mp);
2830         struct tbf      *t    = vifp->v_tbf;
2831         timeout_id_t id = 0;
2832         ill_t           *ill = vifp->v_ipif->ipif_ill;
2833         ip_stack_t      *ipst = ill->ill_ipst;
2834         conn_t          *mrouter = ipst->ips_ip_g_mrouter;
2835
2836         /* Drop if packet is too large */
2837         if (p_len > MAX_BKT_SIZE) {
2838                 ipst->ips_mrtstat->mrts_pkt2large++;
2839                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2840                 ip_drop_output("tbf_control - too large", mp, ill);
2841                 freemsg(mp);
2842                 return;
2843         }
2844         if (ipst->ips_ip_mrtdebug > 1) {
2845                 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2846                     "tbf_ctrl: SEND vif %ld, qlen %d, ipha_dst 0x%x",
2847                     (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_q_len,
2848                     ntohl(ipha->ipha_dst));
2849         }
2850
2851         mutex_enter(&t->tbf_lock);
2852
2853         tbf_update_tokens(vifp);
2854
2855         /*
2856          * If there are enough tokens,
2857          * and the queue is empty, send this packet out.
2858          */
2859         if (ipst->ips_ip_mrtdebug > 1) {
2860                 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2861                     "tbf_control: vif %ld, TOKENS  %d, pkt len  %lu, qlen  %d",
2862                     (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_n_tok, p_len,
2863                     t->tbf_q_len);
2864         }
2865         /* No packets are queued */
2866         if (t->tbf_q_len == 0) {
2867                 /* queue empty, send packet if enough tokens */
2868                 if (p_len <= t->tbf_n_tok) {
2869                         t->tbf_n_tok -= p_len;
2870                         mutex_exit(&t->tbf_lock);
2871                         tbf_send_packet(vifp, mp);
2872                         return;
2873                 } else {
2874                         /* Queue packet and timeout till later */
2875                         tbf_queue(vifp, mp);
2876                         ASSERT(vifp->v_timeout_id == 0);
2877                         vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp,
2878                             TBF_REPROCESS);
2879                 }
2880         } else if (t->tbf_q_len < t->tbf_max_q_len) {
2881                 /* Finite queue length, so queue pkts and process queue */
2882                 tbf_queue(vifp, mp);
2883                 tbf_process_q(vifp);
2884         } else {
2885                 /* Check that we have UDP header with IP header */
2886                 size_t hdr_length = IPH_HDR_LENGTH(ipha) +
2887                     sizeof (struct udphdr);
2888
2889                 if ((mp->b_wptr - mp->b_rptr) < hdr_length) {
2890                         if (!pullupmsg(mp, hdr_length)) {
2891                                 BUMP_MIB(ill->ill_ip_mib,
2892                                     ipIfStatsOutDiscards);
2893                                 ip_drop_output("tbf_control - pullup", mp, ill);
2894                                 freemsg(mp);
2895                                 ip1dbg(("tbf_ctl: couldn't pullup udp hdr, "
2896                                     "vif %ld src 0x%x dst 0x%x\n",
2897                                     (ptrdiff_t)(vifp - ipst->ips_vifs),
2898                                     ntohl(ipha->ipha_src),
2899                                     ntohl(ipha->ipha_dst)));
2900                                 mutex_exit(&vifp->v_tbf->tbf_lock);
2901                                 return;
2902                         } else
2903                                 /* Have to reassign ipha after pullupmsg */
2904                                 ipha = (ipha_t *)mp->b_rptr;
2905                 }
2906                 /*
2907                  * Queue length too much,
2908                  * try to selectively dq, or queue and process
2909                  */
2910                 if (!tbf_dq_sel(vifp, ipha)) {
2911                         ipst->ips_mrtstat->mrts_q_overflow++;
2912                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2913                         ip_drop_output("mrts_q_overflow", mp, ill);
2914                         freemsg(mp);
2915                 } else {
2916                         tbf_queue(vifp, mp);
2917                         tbf_process_q(vifp);
2918                 }
2919         }
2920         if (t->tbf_q_len == 0) {
2921                 id = vifp->v_timeout_id;
2922                 vifp->v_timeout_id = 0;
2923         }
2924         mutex_exit(&vifp->v_tbf->tbf_lock);
2925         if (id != 0)
2926                 (void) untimeout(id);
2927 }
2928
2929 /*
2930  * Adds a packet to the tbf queue at the interface.
2931  * The ipha is for mcastgrp destination for phyint and encap.
2932  */
2933 static void
2934 tbf_queue(struct vif *vifp, mblk_t *mp)
2935 {
2936         struct tbf      *t = vifp->v_tbf;
2937         ip_stack_t      *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2938         conn_t          *mrouter = ipst->ips_ip_g_mrouter;
2939
2940         if (ipst->ips_ip_mrtdebug > 1) {
2941                 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2942                     "tbf_queue: vif %ld", (ptrdiff_t)(vifp - ipst->ips_vifs));
2943         }
2944         ASSERT(MUTEX_HELD(&t->tbf_lock));
2945
2946         if (t->tbf_t == NULL) {
2947                 /* Queue was empty */
2948                 t->tbf_q = mp;
2949         } else {
2950                 /* Insert at tail */
2951                 t->tbf_t->b_next = mp;
2952         }
2953         /* set new tail pointer */
2954         t->tbf_t = mp;
2955
2956         mp->b_next = mp->b_prev = NULL;
2957
2958         t->tbf_q_len++;
2959 }
2960
2961 /*
2962  * Process the queue at the vif interface.
2963  * Drops the tbf_lock when sending packets.
2964  *
2965  * NOTE : The caller should quntimeout if the queue length is 0.
2966  */
2967 static void
2968 tbf_process_q(struct vif *vifp)
2969 {
2970         mblk_t  *mp;
2971         struct tbf      *t = vifp->v_tbf;
2972         size_t  len;
2973         ip_stack_t      *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2974         conn_t          *mrouter = ipst->ips_ip_g_mrouter;
2975
2976         if (ipst->ips_ip_mrtdebug > 1) {
2977                 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2978                     "tbf_process_q 1: vif %ld qlen = %d",
2979                     (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_q_len);
2980         }
2981
2982         /*
2983          * Loop through the queue at the interface and send
2984          * as many packets as possible.
2985          */
2986         ASSERT(MUTEX_HELD(&t->tbf_lock));
2987
2988         while (t->tbf_q_len > 0) {
2989                 mp = t->tbf_q;
2990                 len = (size_t)msgdsize(mp); /* length of ip pkt */
2991
2992                 /* Determine if the packet can be sent */
2993                 if (len <= t->tbf_n_tok) {
2994                         /*
2995                          * If so, reduce no. of tokens, dequeue the packet,
2996                          * send the packet.
2997                          */
2998                         t->tbf_n_tok -= len;
2999
3000                         t->tbf_q = mp->b_next;
3001                         if (--t->tbf_q_len == 0) {
3002                                 t->tbf_t = NULL;
3003                         }
3004                         mp->b_next = NULL;
3005                         /* Exit mutex before sending packet, then re-enter */
3006                         mutex_exit(&t->tbf_lock);
3007                         tbf_send_packet(vifp, mp);
3008                         mutex_enter(&t->tbf_lock);
3009                 } else
3010                         break;
3011         }
3012 }
3013
3014 /* Called at tbf timeout to update tokens, process q and reset timer.  */
3015 static void
3016 tbf_reprocess_q(void *arg)
3017 {
3018         struct vif *vifp = arg;
3019         ip_stack_t      *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
3020         conn_t          *mrouter = ipst->ips_ip_g_mrouter;
3021
3022         mutex_enter(&vifp->v_tbf->tbf_lock);
3023         vifp->v_timeout_id = 0;
3024         tbf_update_tokens(vifp);
3025
3026         tbf_process_q(vifp);
3027
3028         if (vifp->v_tbf->tbf_q_len > 0) {
3029                 vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp,
3030                     TBF_REPROCESS);
3031         }
3032         mutex_exit(&vifp->v_tbf->tbf_lock);
3033
3034         if (ipst->ips_ip_mrtdebug > 1) {
3035                 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3036                     "tbf_reprcess_q: vif %ld timeout id = %p",
3037                     (ptrdiff_t)(vifp - ipst->ips_vifs), vifp->v_timeout_id);
3038         }
3039 }
3040
3041 /*
3042  * Function that will selectively discard a member of the tbf queue,
3043  * based on the precedence value and the priority.
3044  *
3045  * NOTE : The caller should quntimeout if the queue length is 0.
3046  */
3047 static int
3048 tbf_dq_sel(struct vif *vifp, ipha_t *ipha)
3049 {
3050         uint_t          p;
3051         struct tbf              *t = vifp->v_tbf;
3052         mblk_t          **np;
3053         mblk_t          *last, *mp;
3054         ill_t           *ill = vifp->v_ipif->ipif_ill;
3055         ip_stack_t      *ipst = ill->ill_ipst;
3056         conn_t          *mrouter = ipst->ips_ip_g_mrouter;
3057
3058         if (ipst->ips_ip_mrtdebug > 1) {
3059                 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3060                     "dq_sel: vif %ld dst 0x%x",
3061                     (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(ipha->ipha_dst));
3062         }
3063
3064         ASSERT(MUTEX_HELD(&t->tbf_lock));
3065         p = priority(vifp, ipha);
3066
3067         np = &t->tbf_q;
3068         last = NULL;
3069         while ((mp = *np) != NULL) {
3070                 if (p > (priority(vifp, (ipha_t *)mp->b_rptr))) {
3071                         *np = mp->b_next;
3072                         /* If removing the last packet, fix the tail pointer */
3073                         if (mp == t->tbf_t)
3074                                 t->tbf_t = last;
3075                         mp->b_prev = mp->b_next = NULL;
3076                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
3077                         ip_drop_output("tbf_dq_send", mp, ill);
3078                         freemsg(mp);
3079                         /*
3080                          * It's impossible for the queue to be empty, but
3081                          * we check anyway.
3082                          */
3083                         if (--t->tbf_q_len == 0) {
3084                                 t->tbf_t = NULL;
3085                         }
3086                         ipst->ips_mrtstat->mrts_drop_sel++;
3087                         return (1);
3088                 }
3089                 np = &mp->b_next;
3090                 last = mp;
3091         }
3092         return (0);
3093 }
3094
3095 /* Sends packet, 2 cases - encap tunnel, phyint.  */
3096 static void
3097 tbf_send_packet(struct vif *vifp, mblk_t *mp)
3098 {
3099         ipif_t          *ipif = vifp->v_ipif;
3100         ill_t           *ill = ipif->ipif_ill;
3101         ip_stack_t      *ipst = ill->ill_ipst;
3102         conn_t          *mrouter = ipst->ips_ip_g_mrouter;
3103         ipha_t          *ipha;
3104
3105         ipha = (ipha_t *)mp->b_rptr;
3106         /* If encap tunnel options */
3107         if (vifp->v_flags & VIFF_TUNNEL)  {
3108                 ip_xmit_attr_t  ixas;
3109
3110                 if (ipst->ips_ip_mrtdebug > 1) {
3111                         (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3112                             "tbf_send_packet: ENCAP tunnel vif %ld",
3113                             (ptrdiff_t)(vifp - ipst->ips_vifs));
3114                 }
3115                 bzero(&ixas, sizeof (ixas));
3116                 ixas.ixa_flags =
3117                     IXAF_IS_IPV4 | IXAF_NO_TTL_CHANGE | IXAF_VERIFY_SOURCE;
3118                 ixas.ixa_ipst = ipst;
3119                 ixas.ixa_ifindex = 0;
3120                 ixas.ixa_cred = kcred;
3121                 ixas.ixa_cpid = NOPID;
3122                 ixas.ixa_tsl = NULL;
3123                 ixas.ixa_zoneid = GLOBAL_ZONEID; /* Multicast router in GZ */
3124                 ixas.ixa_pktlen = ntohs(ipha->ipha_length);
3125                 ixas.ixa_ip_hdr_length = IPH_HDR_LENGTH(ipha);
3126
3127                 /*
3128                  * Feed into ip_output_simple which will set the ident field
3129                  * and checksum the encapsulating header.
3130                  * BSD gets the cached route vifp->v_route from ip_output()
3131                  * to speed up route table lookups. Not necessary in SunOS 5.x.
3132                  * One could make multicast forwarding faster by putting an
3133                  * ip_xmit_attr_t in each vif thereby caching the ire/nce.
3134                  */
3135                 (void) ip_output_simple(mp, &ixas);
3136                 ixa_cleanup(&ixas);
3137                 return;
3138
3139                 /* phyint */
3140         } else {
3141                 /* Need to loop back to members on the outgoing interface. */
3142                 ipaddr_t        dst;
3143                 ip_recv_attr_t  iras;
3144                 nce_t           *nce;
3145
3146                 bzero(&iras, sizeof (iras));
3147                 iras.ira_flags = IRAF_IS_IPV4;
3148                 iras.ira_ill = iras.ira_rill = ill;
3149                 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
3150                 iras.ira_zoneid = GLOBAL_ZONEID; /* Multicast router in GZ */
3151                 iras.ira_pktlen = ntohs(ipha->ipha_length);
3152                 iras.ira_ip_hdr_length = IPH_HDR_LENGTH(ipha);
3153
3154                 dst = ipha->ipha_dst;
3155                 if (ill_hasmembers_v4(ill, dst)) {
3156                         iras.ira_flags |= IRAF_LOOPBACK_COPY;
3157                 }
3158                 if (ipst->ips_ip_mrtdebug > 1) {
3159                         (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3160                             "tbf_send_pkt: phyint forward  vif %ld dst = 0x%x",
3161                             (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(dst));
3162                 }
3163                 /*
3164                  * Find an NCE which matches the nexthop.
3165                  * For a pt-pt interface we use the other end of the pt-pt
3166                  * link.
3167                  */
3168                 if (ipif->ipif_flags & IPIF_POINTOPOINT) {
3169                         dst = ipif->ipif_pp_dst_addr;
3170                         nce = arp_nce_init(ill, dst, ill->ill_net_type);
3171                 } else {
3172                         nce = arp_nce_init(ill, dst, IRE_MULTICAST);
3173                 }
3174                 if (nce == NULL) {
3175                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
3176                         ip_drop_output("tbf_send_packet - no nce", mp, ill);
3177                         freemsg(mp);
3178                         return;
3179                 }
3180
3181                 /*
3182                  * We don't remeber the incoming ill. Thus we
3183                  * pretend the  packet arrived on the outbound ill. This means
3184                  * statistics for input errors will be increased on the wrong
3185                  * ill but that isn't a big deal.
3186                  */
3187                 ip_forward_xmit_v4(nce, ill, mp, ipha, &iras, ill->ill_mc_mtu,
3188                     0);
3189                 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
3190
3191                 nce_refrele(nce);
3192         }
3193 }
3194
3195 /*
3196  * Determine the current time and then the elapsed time (between the last time
3197  * and time now).  Update the no. of tokens in the bucket.
3198  */
3199 static void
3200 tbf_update_tokens(struct vif *vifp)
3201 {
3202         timespec_t      tp;
3203         hrtime_t        tm;
3204         struct tbf      *t = vifp->v_tbf;
3205         ip_stack_t      *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
3206         conn_t          *mrouter = ipst->ips_ip_g_mrouter;
3207
3208         ASSERT(MUTEX_HELD(&t->tbf_lock));
3209
3210         /* Time in secs and nsecs, rate limit in kbits/sec */
3211         gethrestime(&tp);
3212
3213         /*LINTED*/
3214         TV_DELTA(tp, t->tbf_last_pkt_t, tm);
3215
3216         /*
3217          * This formula is actually
3218          * "time in seconds" * "bytes/second".  Scaled for nsec.
3219          * (tm/1000000000) * (v_rate_limit * 1000 * (1000/1024) /8)
3220          *
3221          * The (1000/1024) was introduced in add_vif to optimize
3222          * this divide into a shift.
3223          */
3224         t->tbf_n_tok += (tm/1000) * vifp->v_rate_limit / 1024 / 8;
3225         t->tbf_last_pkt_t = tp;
3226
3227         if (t->tbf_n_tok > MAX_BKT_SIZE)
3228                 t->tbf_n_tok = MAX_BKT_SIZE;
3229         if (ipst->ips_ip_mrtdebug > 1) {
3230                 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3231                     "tbf_update_tok: tm %lld tok %d vif %ld",
3232                     tm, t->tbf_n_tok, (ptrdiff_t)(vifp - ipst->ips_vifs));
3233         }
3234 }
3235
3236 /*
3237  * Priority currently is based on port nos.
3238  * Different forwarding mechanisms have different ways
3239  * of obtaining the port no. Hence, the vif must be
3240  * given along with the packet itself.
3241  *
3242  */
3243 static int
3244 priority(struct vif *vifp, ipha_t *ipha)
3245 {
3246         int prio;
3247         ip_stack_t      *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
3248         conn_t          *mrouter = ipst->ips_ip_g_mrouter;
3249
3250         /* Temporary hack; may add general packet classifier some day */
3251
3252         ASSERT(MUTEX_HELD(&vifp->v_tbf->tbf_lock));
3253
3254         /*
3255          * The UDP port space is divided up into four priority ranges:
3256          * [0, 16384)   : unclassified - lowest priority
3257          * [16384, 32768)       : audio - highest priority
3258          * [32768, 49152)       : whiteboard - medium priority
3259          * [49152, 65536)       : video - low priority
3260          */
3261
3262         if (ipha->ipha_protocol == IPPROTO_UDP) {
3263                 struct udphdr *udp =
3264                     (struct udphdr *)((char *)ipha + IPH_HDR_LENGTH(ipha));
3265                 switch (ntohs(udp->uh_dport) & 0xc000) {
3266                 case 0x4000:
3267                         prio = 70;
3268                         break;
3269                 case 0x8000:
3270                         prio = 60;
3271                         break;
3272                 case 0xc000:
3273                         prio = 55;
3274                         break;
3275                 default:
3276                         prio = 50;
3277                         break;
3278                 }
3279                 if (ipst->ips_ip_mrtdebug > 1) {
3280                         (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3281                             "priority: port %x prio %d\n",
3282                             ntohs(udp->uh_dport), prio);
3283                 }
3284         } else
3285                 prio = 50;  /* default priority */
3286         return (prio);
3287 }
3288
3289 /*
3290  * End of token bucket filter modifications
3291  */
3292
3293
3294
3295 /*
3296  * Produces data for netstat -M.
3297  */
3298 int
3299 ip_mroute_stats(mblk_t *mp, ip_stack_t *ipst)
3300 {
3301         ipst->ips_mrtstat->mrts_vifctlSize = sizeof (struct vifctl);
3302         ipst->ips_mrtstat->mrts_mfcctlSize = sizeof (struct mfcctl);
3303         if (!snmp_append_data(mp, (char *)ipst->ips_mrtstat,
3304                 sizeof (struct mrtstat))) {
3305                 ip0dbg(("ip_mroute_stats: failed %ld bytes\n",
3306                     (size_t)sizeof (struct mrtstat)));
3307                 return (0);
3308         }
3309         return (1);
3310 }
3311
3312 /*
3313  * Sends info for SNMP's MIB.
3314  */
3315 int
3316 ip_mroute_vif(mblk_t *mp, ip_stack_t *ipst)
3317 {
3318         struct vifctl   vi;
3319         vifi_t          vifi;
3320
3321         mutex_enter(&ipst->ips_numvifs_mutex);
3322         for (vifi = 0; vifi < ipst->ips_numvifs; vifi++) {
3323                 if (ipst->ips_vifs[vifi].v_lcl_addr.s_addr == 0)
3324                         continue;
3325                 /*
3326                  * No locks here, an approximation is fine.
3327                  */
3328                 vi.vifc_vifi = vifi;
3329                 vi.vifc_flags = ipst->ips_vifs[vifi].v_flags;
3330                 vi.vifc_threshold = ipst->ips_vifs[vifi].v_threshold;
3331                 vi.vifc_rate_limit      = ipst->ips_vifs[vifi].v_rate_limit;
3332                 vi.vifc_lcl_addr        = ipst->ips_vifs[vifi].v_lcl_addr;
3333                 vi.vifc_rmt_addr        = ipst->ips_vifs[vifi].v_rmt_addr;
3334                 vi.vifc_pkt_in          = ipst->ips_vifs[vifi].v_pkt_in;
3335                 vi.vifc_pkt_out         = ipst->ips_vifs[vifi].v_pkt_out;
3336
3337                 if (!snmp_append_data(mp, (char *)&vi, sizeof (vi))) {
3338                         ip0dbg(("ip_mroute_vif: failed %ld bytes\n",
3339                             (size_t)sizeof (vi)));
3340                         mutex_exit(&ipst->ips_numvifs_mutex);
3341                         return (0);
3342                 }
3343         }
3344         mutex_exit(&ipst->ips_numvifs_mutex);
3345         return (1);
3346 }
3347
3348 /*
3349  * Called by ip_snmp_get to send up multicast routing table.
3350  */
3351 int
3352 ip_mroute_mrt(mblk_t *mp, ip_stack_t *ipst)
3353 {
3354         int                     i, j;
3355         struct mfc              *rt;
3356         struct mfcctl   mfcc;
3357
3358         /*
3359          * Make sure multicast has not been turned off.
3360          */
3361         if (is_mrouter_off(ipst))
3362                 return (1);
3363
3364         /* Loop over all hash buckets and their chains */
3365         for (i = 0; i < MFCTBLSIZ; i++) {
3366                 MFCB_REFHOLD(&ipst->ips_mfcs[i]);
3367                 for (rt = ipst->ips_mfcs[i].mfcb_mfc; rt; rt = rt->mfc_next) {
3368                         mutex_enter(&rt->mfc_mutex);
3369                         if (rt->mfc_rte != NULL ||
3370                             (rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
3371                                 mutex_exit(&rt->mfc_mutex);
3372                                 continue;
3373                         }
3374                         mfcc.mfcc_origin = rt->mfc_origin;
3375                         mfcc.mfcc_mcastgrp = rt->mfc_mcastgrp;
3376                         mfcc.mfcc_parent = rt->mfc_parent;
3377                         mfcc.mfcc_pkt_cnt = rt->mfc_pkt_cnt;
3378                         mutex_enter(&ipst->ips_numvifs_mutex);
3379                         for (j = 0; j < (int)ipst->ips_numvifs; j++)
3380                                 mfcc.mfcc_ttls[j] = rt->mfc_ttls[j];
3381                         for (j = (int)ipst->ips_numvifs; j < MAXVIFS; j++)
3382                                 mfcc.mfcc_ttls[j] = 0;
3383                         mutex_exit(&ipst->ips_numvifs_mutex);
3384
3385                         mutex_exit(&rt->mfc_mutex);
3386                         if (!snmp_append_data(mp, (char *)&mfcc,
3387                             sizeof (mfcc))) {
3388                                 MFCB_REFRELE(&ipst->ips_mfcs[i]);
3389                                 ip0dbg(("ip_mroute_mrt: failed %ld bytes\n",
3390                                     (size_t)sizeof (mfcc)));
3391                                 return (0);
3392                         }
3393                 }
3394                 MFCB_REFRELE(&ipst->ips_mfcs[i]);
3395         }
3396         return (1);
3397 }