kernel/net/ip/ip_if.c (unleashed.git, Unleashed v1.4)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 1990 Mentat Inc.
 * Copyright (c) 2013 by Delphix. All rights reserved.
 * Copyright (c) 2016, Joyent, Inc. All rights reserved.
 * Copyright (c) 2014, OmniTI Computer Consulting, Inc. All rights reserved.
 */

/*
 * This file contains the interface control functions for IP.
 */
#include <sys/types.h>
#include <sys/stream.h>
#include <sys/dlpi.h>
#include <sys/stropts.h>
#include <sys/strsun.h>
#include <sys/sysmacros.h>
#include <sys/strsubr.h>
#include <sys/strlog.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/debug.h>
#include <sys/zone.h>
#include <sys/sunldi.h>
#include <sys/file.h>
#include <sys/bitmap.h>
#include <sys/cpuvar.h>
#include <sys/time.h>
#include <sys/ctype.h>
#include <sys/kmem.h>
#include <sys/systm.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/isa_defs.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <net/if_types.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <sys/sockio.h>
#include <netinet/in.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet/igmp_var.h>
#include <sys/policy.h>
#include <sys/ethernet.h>
#include <sys/callb.h>
#include <sys/md5.h>

#include <inet/common.h>	/* for various inet/mi.h and inet/nd.h needs */
#include <inet/mi.h>
#include <inet/nd.h>
#include <inet/tunables.h>
#include <inet/arp.h>
#include <inet/ip_arp.h>
#include <inet/mib2.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/ip6_asp.h>
#include <inet/tcp.h>
#include <inet/ip_multi.h>
#include <inet/ip_ire.h>
#include <inet/ip_ftable.h>
#include <inet/ip_rts.h>
#include <inet/ip_ndp.h>
#include <inet/ip_if.h>
#include <inet/ip_impl.h>
#include <inet/sctp_ip.h>
#include <inet/ip_netinfo.h>
#include <inet/ilb_ip.h>

#include <netinet/igmp.h>
#include <inet/ip_listutils.h>
#include <inet/ipclassifier.h>
#include <sys/mac_client.h>
#include <sys/dld.h>
#include <sys/mac_flow.h>

#include <sys/systeminfo.h>
#include <sys/bootconf.h>

#include <inet/rawip_impl.h>	/* needed for icmp_stack_t */
#include <inet/udp_impl.h>	/* needed for udp_stack_t */
/* The character which tells where the ill_name ends */
#define	IPIF_SEPARATOR_CHAR	':'

/* IP ioctl function table entry */
typedef struct ipft_s {
	int	ipft_cmd;
	pfi_t	ipft_pfi;
	int	ipft_min_size;
	int	ipft_flags;
} ipft_t;
#define	IPFT_F_NO_REPLY		0x1	/* IP ioctl does not expect any reply */
#define	IPFT_F_SELF_REPLY	0x2	/* ioctl callee does the ioctl reply */
static int	nd_ill_forward_get(queue_t *, mblk_t *, caddr_t, cred_t *);
static int	nd_ill_forward_set(queue_t *q, mblk_t *mp,
    char *value, caddr_t cp, cred_t *ioc_cr);

static boolean_t ill_is_quiescent(ill_t *);
static boolean_t ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask);
static ip_m_t	*ip_m_lookup(t_uscalar_t mac_type);
static int	ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q,
    mblk_t *mp);
static int	ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp);
static int	ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t, in6_addr_t,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp,
    int ioccmd, struct linkblk *li);
static ipaddr_t	ip_subnet_mask(ipaddr_t addr, ipif_t **, ip_stack_t *);
static void	ip_wput_ioctl(queue_t *q, mblk_t *mp);
static void	ipsq_flush(ill_t *ill);

static int	ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static void	ipsq_delete(ipsq_t *);

static ipif_t	*ipif_allocate(ill_t *ill, int id, uint_t ire_type,
    boolean_t initialize, boolean_t insert, int *errorp);
static ire_t	**ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep);
static void	ipif_delete_bcast_ires(ipif_t *ipif);
static int	ipif_add_ires_v4(ipif_t *, boolean_t);
static boolean_t ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif,
    boolean_t isv6);
static int	ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp);
static void	ipif_free(ipif_t *ipif);
static void	ipif_free_tail(ipif_t *ipif);
static void	ipif_set_default(ipif_t *ipif);
static int	ipif_set_values(queue_t *q, mblk_t *mp,
    char *interf_name, uint_t *ppa);
static int	ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp,
    queue_t *q);
static ipif_t	*ipif_lookup_on_name(char *name, size_t namelen,
    boolean_t do_alloc, boolean_t *exists, boolean_t isv6, zoneid_t zoneid,
    ip_stack_t *);
static ipif_t	*ipif_lookup_on_name_async(char *name, size_t namelen,
    boolean_t isv6, zoneid_t zoneid, queue_t *q, mblk_t *mp, ipsq_func_t func,
    int *error, ip_stack_t *);

static int	ill_alloc_ppa(ill_if_t *, ill_t *);
static void	ill_delete_interface_type(ill_if_t *);
static int	ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q);
static void	ill_dl_down(ill_t *ill);
static void	ill_down(ill_t *ill);
static void	ill_down_ipifs(ill_t *, boolean_t);
static void	ill_free_mib(ill_t *ill);
static void	ill_glist_delete(ill_t *);
static void	ill_phyint_reinit(ill_t *ill);
static void	ill_set_nce_router_flags(ill_t *, boolean_t);
static void	ill_set_phys_addr_tail(ipsq_t *, queue_t *, mblk_t *, void *);
static void	ill_replumb_tail(ipsq_t *, queue_t *, mblk_t *, void *);

static ip_v6intfid_func_t ip_ether_v6intfid, ip_ib_v6intfid;
static ip_v6intfid_func_t ip_ipv4_v6intfid, ip_ipv6_v6intfid;
static ip_v6intfid_func_t ip_ipmp_v6intfid, ip_nodef_v6intfid;
static ip_v6intfid_func_t ip_ipv4_v6destintfid, ip_ipv6_v6destintfid;
static ip_v4mapinfo_func_t ip_ether_v4_mapping;
static ip_v6mapinfo_func_t ip_ether_v6_mapping;
static ip_v4mapinfo_func_t ip_ib_v4_mapping;
static ip_v6mapinfo_func_t ip_ib_v6_mapping;
static ip_v4mapinfo_func_t ip_mbcast_mapping;
static void	phyint_free(phyint_t *);

static void ill_capability_dispatch(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_id_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_vrrp_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_hcksum_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_hcksum_reset_fill(ill_t *, mblk_t *);
static void ill_capability_zerocopy_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void ill_capability_zerocopy_reset_fill(ill_t *, mblk_t *);
static void ill_capability_dld_reset_fill(ill_t *, mblk_t *);
static void ill_capability_dld_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void ill_capability_dld_enable(ill_t *);
static void ill_capability_ack_thr(void *);
static void ill_capability_lso_enable(ill_t *);

static ill_t	*ill_prev_usesrc(ill_t *);
static int	ill_relink_usesrc_ills(ill_t *, ill_t *, uint_t);
static void	ill_disband_usesrc_group(ill_t *);
static void	ip_sioctl_garp_reply(mblk_t *, ill_t *, void *, int);

#ifdef DEBUG
static void	ill_trace_cleanup(const ill_t *);
static void	ipif_trace_cleanup(const ipif_t *);
#endif

static void	ill_dlpi_clear_deferred(ill_t *ill);

static void	phyint_flags_init(phyint_t *, t_uscalar_t);
/*
 * if we go over the memory footprint limit more than once in this msec
 * interval, we'll start pruning aggressively.
 */
int ip_min_frag_prune_time = 0;
static ipft_t ip_ioctl_ftbl[] = {
	{ IP_IOC_IRE_DELETE, ip_ire_delete, sizeof (ipid_t), 0 },
	{ IP_IOC_IRE_DELETE_NO_REPLY, ip_ire_delete, sizeof (ipid_t),
	    IPFT_F_NO_REPLY },
	{ IP_IOC_RTS_REQUEST, ip_rts_request, 0, IPFT_F_SELF_REPLY },
	{ 0 }
};
/* Simple ICMP IP Header Template */
static ipha_t icmp_ipha = {
	IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
};

static uchar_t	ip_six_byte_all_ones[] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
static ip_m_t   ip_m_tbl[] = {
	{ DL_ETHER, IFT_ETHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_ether_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_CSMACD, IFT_ISO88023, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_TPB, IFT_ISO88024, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_TPR, IFT_ISO88025, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_FDDI, IFT_FDDI, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_ether_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_IB, IFT_IB, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ib_v4_mapping, ip_ib_v6_mapping, ip_ib_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_IPV4, IFT_IPV4, IPPROTO_ENCAP, IPPROTO_IPV6,
	    ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv4_v6intfid,
	    ip_ipv4_v6destintfid },
	{ DL_IPV6, IFT_IPV6, IPPROTO_ENCAP, IPPROTO_IPV6,
	    ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv6_v6intfid,
	    ip_ipv6_v6destintfid },
	{ DL_6TO4, IFT_6TO4, IPPROTO_ENCAP, IPPROTO_IPV6,
	    ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv4_v6intfid,
	    ip_nodef_v6intfid },
	{ SUNW_DL_VNI, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    NULL, NULL, ip_nodef_v6intfid, ip_nodef_v6intfid },
	{ SUNW_DL_IPMP, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    NULL, NULL, ip_ipmp_v6intfid, ip_nodef_v6intfid },
	{ DL_OTHER, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
	    ip_nodef_v6intfid }
};
char	ipif_loopback_name[] = "lo0";

/* These are used by all IP network modules. */
sin6_t	sin6_null;	/* Zero address for quick clears */
sin_t	sin_null;	/* Zero address for quick clears */

/* When set search for unused ipif_seqid */
static ipif_t	ipif_zero;
/*
 * ppa arena is created after these many
 * interfaces have been plumbed.
 */
uint_t	ill_no_arena = 12;	/* Settable in /etc/system */
/*
 * Allocate per-interface mibs.
 * Returns true if ok. False otherwise.
 * ipsq may not yet be allocated (loopback case).
 */
static boolean_t
ill_allocate_mibs(ill_t *ill)
{
	/* Already allocated? */
	if (ill->ill_ip_mib != NULL) {
		if (ill->ill_isv6)
			ASSERT(ill->ill_icmp6_mib != NULL);
		return (B_TRUE);
	}

	ill->ill_ip_mib = kmem_zalloc(sizeof (*ill->ill_ip_mib),
	    KM_NOSLEEP);
	if (ill->ill_ip_mib == NULL) {
		return (B_FALSE);
	}

	/* Setup static information */
	SET_MIB(ill->ill_ip_mib->ipIfStatsEntrySize,
	    sizeof (mib2_ipIfStatsEntry_t));
	if (ill->ill_isv6) {
		ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv6;
		SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize,
		    sizeof (mib2_ipv6AddrEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize,
		    sizeof (mib2_ipv6RouteEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize,
		    sizeof (mib2_ipv6NetToMediaEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize,
		    sizeof (ipv6_member_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize,
		    sizeof (ipv6_grpsrc_t));
	} else {
		ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv4;
		SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize,
		    sizeof (mib2_ipAddrEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize,
		    sizeof (mib2_ipRouteEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize,
		    sizeof (mib2_ipNetToMediaEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize,
		    sizeof (ip_member_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize,
		    sizeof (ip_grpsrc_t));

		/*
		 * For a v4 ill, we are done at this point, because per ill
		 * icmp mibs are only used for v6.
		 */
		return (B_TRUE);
	}

	ill->ill_icmp6_mib = kmem_zalloc(sizeof (*ill->ill_icmp6_mib),
	    KM_NOSLEEP);
	if (ill->ill_icmp6_mib == NULL) {
		kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib));
		ill->ill_ip_mib = NULL;
		return (B_FALSE);
	}
	/* static icmp info */
	ill->ill_icmp6_mib->ipv6IfIcmpEntrySize =
	    sizeof (mib2_ipv6IfIcmpEntry_t);
	/*
	 * The ipIfStatsIfindex and ipv6IfIcmpIndex will be assigned later
	 * after the phyint merge occurs in ipif_set_values -> ill_glist_insert
	 * -> ill_phyint_reinit
	 */
	return (B_TRUE);
}
/*
 * Completely vaporize a lower level tap and all associated interfaces.
 * ill_delete is called only out of ip_close when the device control
 * stream is being closed.
 */
void
ill_delete(ill_t *ill)
{
	ipif_t	*ipif;
	ill_t	*prev_ill;
	ip_stack_t	*ipst = ill->ill_ipst;

	/*
	 * ill_delete may be forcibly entering the ipsq. The previous
	 * ioctl may not have completed and may need to be aborted.
	 * ipsq_flush takes care of it. If we don't need to enter the
	 * ipsq forcibly, the 2nd invocation of ipsq_flush in
	 * ill_delete_tail is sufficient.
	 */
	ipsq_flush(ill);

	/*
	 * Nuke all interfaces. ipif_free will take down the interface,
	 * remove it from the list, and free the data structure.
	 * Walk down the ipif list and remove the logical interfaces
	 * first before removing the main ipif. We can't unplumb
	 * zeroth interface first in the case of IPv6 as update_conn_ill
	 * -> ip_ll_multireq de-references ill_ipif for checking
	 * POINTOPOINT.
	 *
	 * If ill_ipif was not properly initialized (i.e. low on memory),
	 * then there are no interfaces to clean up. In this case just clean
	 * up the ill.
	 */
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
		ipif_free(ipif);

	/*
	 * clean out all the nce_t entries that depend on this
	 * ill for the ill_phys_addr.
	 */
	nce_flush(ill, B_TRUE);

	/* Clean up msgs on pending upcalls for mrouted */
	reset_mrt_ill(ill);

	update_conn_ill(ill, ipst);

	/*
	 * Remove multicast references added as a result of calls to
	 * ip_join_allmulti().
	 */
	ip_purge_allmulti(ill);

	/*
	 * If the ill being deleted is under IPMP, boot it out of the illgrp.
	 */
	if (IS_UNDER_IPMP(ill))
		ipmp_ill_leave_illgrp(ill);

	/*
	 * ill_down will arrange to blow off any IRE's dependent on this
	 * ILL, and shut down fragmentation reassembly.
	 */
	ill_down(ill);

	/* Let SCTP know, so that it can remove this from its list. */
	sctp_update_ill(ill, SCTP_ILL_REMOVE);

	/*
	 * Walk all CONNs that can have a reference on an ire or nce for this
	 * ill (we actually walk all that now have stale references).
	 */
	ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ipst);

	/* With IPv6 we have dce_ifindex. Cleanup for neatness */
	if (ill->ill_isv6)
		dce_cleanup(ill->ill_phyint->phyint_ifindex, ipst);

	/*
	 * If an address on this ILL is being used as a source address then
	 * clear out the pointers in other ILLs that point to this ILL.
	 */
	rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER);
	if (ill->ill_usesrc_grp_next != NULL) {
		if (ill->ill_usesrc_ifindex == 0) { /* usesrc ILL ? */
			ill_disband_usesrc_group(ill);
		} else {	/* consumer of the usesrc ILL */
			prev_ill = ill_prev_usesrc(ill);
			prev_ill->ill_usesrc_grp_next =
			    ill->ill_usesrc_grp_next;
		}
	}
	rw_exit(&ipst->ips_ill_g_usesrc_lock);
}
static void
ipif_non_duplicate(ipif_t *ipif)
{
	ill_t *ill = ipif->ipif_ill;
	mutex_enter(&ill->ill_lock);
	if (ipif->ipif_flags & IPIF_DUPLICATE) {
		ipif->ipif_flags &= ~IPIF_DUPLICATE;
		ASSERT(ill->ill_ipif_dup_count > 0);
		ill->ill_ipif_dup_count--;
	}
	mutex_exit(&ill->ill_lock);
}
/*
 * ill_delete_tail is called from ip_modclose after all references
 * to the closing ill are gone. The wait is done in ip_modclose.
 */
void
ill_delete_tail(ill_t *ill)
{
	mblk_t	**mpp;
	ipif_t	*ipif;
	ip_stack_t *ipst = ill->ill_ipst;

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		ipif_non_duplicate(ipif);
		(void) ipif_down_tail(ipif);
	}

	ASSERT(ill->ill_ipif_dup_count == 0);

	/*
	 * If polling capability is enabled (which signifies direct
	 * upcall into IP and driver has ill saved as a handle),
	 * we need to make sure that unbind has completed before we
	 * let the ill disappear and driver no longer has any reference
	 * to this ill.
	 */
	mutex_enter(&ill->ill_lock);
	while (ill->ill_state_flags & ILL_DL_UNBIND_IN_PROGRESS)
		cv_wait(&ill->ill_cv, &ill->ill_lock);
	mutex_exit(&ill->ill_lock);
	ASSERT(!(ill->ill_capabilities &
	    (ILL_CAPAB_DLD | ILL_CAPAB_DLD_POLL | ILL_CAPAB_DLD_DIRECT)));

	if (ill->ill_net_type != IRE_LOOPBACK)
		qprocsoff(ill->ill_rq);

	/*
	 * We do an ipsq_flush once again now. New messages could have
	 * landed up from below (M_ERROR or M_HANGUP). Similarly ioctls
	 * could also have landed up if an ioctl thread had looked up
	 * the ill before we set the ILL_CONDEMNED flag, but not yet
	 * enqueued the ioctl when we did the ipsq_flush last time.
	 */
	ipsq_flush(ill);

	/*
	 * Free capabilities.
	 */
	if (ill->ill_hcksum_capab != NULL) {
		kmem_free(ill->ill_hcksum_capab, sizeof (ill_hcksum_capab_t));
		ill->ill_hcksum_capab = NULL;
	}

	if (ill->ill_zerocopy_capab != NULL) {
		kmem_free(ill->ill_zerocopy_capab,
		    sizeof (ill_zerocopy_capab_t));
		ill->ill_zerocopy_capab = NULL;
	}

	if (ill->ill_lso_capab != NULL) {
		kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t));
		ill->ill_lso_capab = NULL;
	}

	if (ill->ill_dld_capab != NULL) {
		kmem_free(ill->ill_dld_capab, sizeof (ill_dld_capab_t));
		ill->ill_dld_capab = NULL;
	}

	/* Clean up ill_allowed_ips* related state */
	if (ill->ill_allowed_ips != NULL) {
		ASSERT(ill->ill_allowed_ips_cnt > 0);
		kmem_free(ill->ill_allowed_ips,
		    ill->ill_allowed_ips_cnt * sizeof (in6_addr_t));
		ill->ill_allowed_ips = NULL;
		ill->ill_allowed_ips_cnt = 0;
	}

	while (ill->ill_ipif != NULL)
		ipif_free_tail(ill->ill_ipif);

	/*
	 * We have removed all references to ilm from conn and the ones joined
	 * within the kernel.
	 *
	 * We don't walk conns, mrts and ires because
	 *
	 * 1) update_conn_ill and reset_mrt_ill cleans up conns and mrts.
	 * 2) ill_down -> ill_downi walks all the ires and cleans up
	 *    ill references.
	 */

	/*
	 * If this ill is an IPMP meta-interface, blow away the illgrp. This
	 * is safe to do because the illgrp has already been unlinked from the
	 * group by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find it.
	 */
	if (IS_IPMP(ill)) {
		ipmp_illgrp_destroy(ill->ill_grp);
		ill->ill_grp = NULL;
	}

	if (ill->ill_mphysaddr_list != NULL) {
		multiphysaddr_t *mpa, *tmpa;

		mpa = ill->ill_mphysaddr_list;
		ill->ill_mphysaddr_list = NULL;
		while (mpa) {
			tmpa = mpa->mpa_next;
			kmem_free(mpa, sizeof (*mpa));
			mpa = tmpa;
		}
	}

	/*
	 * Take us out of the list of ILLs. ill_glist_delete -> phyint_free
	 * could free the phyint. No more reference to the phyint after this
	 * point.
	 */
	(void) ill_glist_delete(ill);

	if (ill->ill_frag_ptr != NULL) {
		uint_t count;

		for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) {
			mutex_destroy(&ill->ill_frag_hash_tbl[count].ipfb_lock);
		}
		mi_free(ill->ill_frag_ptr);
		ill->ill_frag_ptr = NULL;
		ill->ill_frag_hash_tbl = NULL;
	}

	freemsg(ill->ill_nd_lla_mp);
	/* Free all retained control messages. */
	mpp = &ill->ill_first_mp_to_free;
	do {
		while (mpp[0]) {
			mblk_t	*mp;
			mblk_t	*mp1;

			mp = mpp[0];
			mpp[0] = mp->b_next;
			for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) {
				mp1->b_next = NULL;
				mp1->b_prev = NULL;
			}
			freemsg(mp);
		}
	} while (mpp++ != &ill->ill_last_mp_to_free);

	ill_free_mib(ill);

#ifdef DEBUG
	ill_trace_cleanup(ill);
#endif

	/* The default multicast interface might have changed */
	ire_increment_multicast_generation(ipst, ill->ill_isv6);

	/* Drop refcnt here */
	netstack_rele(ill->ill_ipst->ips_netstack);
	ill->ill_ipst = NULL;
}
static void
ill_free_mib(ill_t *ill)
{
	ip_stack_t *ipst = ill->ill_ipst;

	/*
	 * MIB statistics must not be lost, so when an interface
	 * goes away the counter values will be added to the global
	 * MIBs.
	 */
	if (ill->ill_ip_mib != NULL) {
		if (ill->ill_isv6) {
			ip_mib2_add_ip_stats(&ipst->ips_ip6_mib,
			    ill->ill_ip_mib);
		} else {
			ip_mib2_add_ip_stats(&ipst->ips_ip_mib,
			    ill->ill_ip_mib);
		}

		kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib));
		ill->ill_ip_mib = NULL;
	}
	if (ill->ill_icmp6_mib != NULL) {
		ip_mib2_add_icmp6_stats(&ipst->ips_icmp6_mib,
		    ill->ill_icmp6_mib);
		kmem_free(ill->ill_icmp6_mib, sizeof (*ill->ill_icmp6_mib));
		ill->ill_icmp6_mib = NULL;
	}
}
/*
 * Concatenate together a physical address and a sap.
 *
 * Sap_lengths are interpreted as follows:
 *   sap_length == 0	==> no sap
 *   sap_length > 0	==> sap is at the head of the dlpi address
 *   sap_length < 0	==> sap is at the tail of the dlpi address
 */
static void
ill_dlur_copy_address(uchar_t *phys_src, uint_t phys_length,
    t_scalar_t sap_src, t_scalar_t sap_length, uchar_t *dst)
{
	uint16_t sap_addr = (uint16_t)sap_src;

	if (sap_length == 0) {
		if (phys_src == NULL)
			bzero(dst, phys_length);
		else
			bcopy(phys_src, dst, phys_length);
	} else if (sap_length < 0) {
		if (phys_src == NULL)
			bzero(dst, phys_length);
		else
			bcopy(phys_src, dst, phys_length);
		bcopy(&sap_addr, (char *)dst + phys_length, sizeof (sap_addr));
	} else {
		bcopy(&sap_addr, dst, sizeof (sap_addr));
		if (phys_src == NULL)
			bzero((char *)dst + sap_length, phys_length);
		else
			bcopy(phys_src, (char *)dst + sap_length, phys_length);
	}
}
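
/*
 * Illustration (editorial sketch, not part of the original source): for a
 * hypothetical Ethernet-style link with the 6-byte physical address
 * 00:11:22:33:44:55 and a 2-byte sap, the three cases above lay out `dst'
 * as follows:
 *
 *	sap_length == 0:	00 11 22 33 44 55		(no sap)
 *	sap_length == -2:	00 11 22 33 44 55 <sap>		(sap at tail)
 *	sap_length == 2:	<sap> 00 11 22 33 44 55		(sap at head)
 *
 * where <sap> is the 2-byte sap value copied with bcopy() from a uint16_t,
 * i.e. in native byte order.
 */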
/*
 * Generate a dl_unitdata_req mblk for the device and address given.
 * addr_length is the length of the physical portion of the address.
 * If addr is NULL include an all zero address of the specified length.
 * TRUE? In any case, addr_length is taken to be the entire length of the
 * dlpi address, including the absolute value of sap_length.
 */
mblk_t *
ill_dlur_gen(uchar_t *addr, uint_t addr_length, t_uscalar_t sap,
    t_scalar_t sap_length)
{
	dl_unitdata_req_t *dlur;
	mblk_t	*mp;
	t_scalar_t	abs_sap_length;		/* absolute value */

	abs_sap_length = ABS(sap_length);
	mp = ip_dlpi_alloc(sizeof (*dlur) + addr_length + abs_sap_length,
	    DL_UNITDATA_REQ);
	if (mp == NULL)
		return (NULL);
	dlur = (dl_unitdata_req_t *)mp->b_rptr;
	/* HACK: accommodate incompatible DLPI drivers */
	if (addr_length == 8)
		addr_length = 6;
	dlur->dl_dest_addr_length = addr_length + abs_sap_length;
	dlur->dl_dest_addr_offset = sizeof (*dlur);
	dlur->dl_priority.dl_min = 0;
	dlur->dl_priority.dl_max = 0;
	ill_dlur_copy_address(addr, addr_length, sap, sap_length,
	    (uchar_t *)&dlur[1]);
	return (mp);
}
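
/*
 * Usage sketch (editorial, not from the original source): a caller that
 * wants the DLPI destination template for IPv4 over an Ethernet-style
 * device whose sap sits at the tail of the address might build it as
 *
 *	mblk_t *dlur_mp = ill_dlur_gen(lla, 6, ETHERTYPE_IP, -2);
 *
 * where `lla' (hypothetical) points at the 6-byte link-layer address, or is
 * NULL for an all-zero placeholder address. A template mblk of this shape
 * is what ill_fastpath_probe() below sends downstream.
 */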
/*
 * Add the pending mp to the list. There can be only 1 pending mp
 * in the list. Any exclusive ioctl that needs to wait for a response
 * from another module or driver needs to use this function to set
 * the ipx_pending_mp to the ioctl mblk and wait for the response from
 * the other module/driver. This is also used while waiting for the
 * ipif/ill/ire refcnts to drop to zero in bringing down an ipif.
 */
boolean_t
ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp,
    int waitfor)
{
	ipxop_t	*ipx = ipif->ipif_ill->ill_phyint->phyint_ipsq->ipsq_xop;

	ASSERT(IAM_WRITER_IPIF(ipif));
	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
	ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL));
	ASSERT(ipx->ipx_pending_mp == NULL);
	/*
	 * The caller may be using a different ipif than the one passed into
	 * ipsq_current_start() (e.g., suppose an ioctl that came in on the V4
	 * ill needs to wait for the V6 ill to quiesce). So we can't ASSERT
	 * that `ipx_current_ipif == ipif'.
	 */
	ASSERT(ipx->ipx_current_ipif != NULL);

	/*
	 * M_IOCDATA from ioctls, M_ERROR/M_HANGUP/M_PROTO/M_PCPROTO from the
	 * driver.
	 */
	ASSERT((DB_TYPE(add_mp) == M_IOCDATA) || (DB_TYPE(add_mp) == M_ERROR) ||
	    (DB_TYPE(add_mp) == M_HANGUP) || (DB_TYPE(add_mp) == M_PROTO) ||
	    (DB_TYPE(add_mp) == M_PCPROTO));

	if (connp != NULL) {
		ASSERT(MUTEX_HELD(&connp->conn_lock));
		/*
		 * Return error if the conn has started closing. The conn
		 * could have finished cleaning up the pending mp list.
		 * If so we should not add another mp to the list negating
		 * the cleanup.
		 */
		if (connp->conn_state_flags & CONN_CLOSING)
			return (B_FALSE);
	}
	mutex_enter(&ipx->ipx_lock);
	ipx->ipx_pending_ipif = ipif;
	/*
	 * Note down the queue in b_queue. This will be returned by
	 * ipsq_pending_mp_get. Caller will then use these values to restart
	 * the processing
	 */
	add_mp->b_next = NULL;
	add_mp->b_queue = q;
	ipx->ipx_pending_mp = add_mp;
	ipx->ipx_waitfor = waitfor;
	mutex_exit(&ipx->ipx_lock);

	if (connp != NULL)
		connp->conn_oper_pending_ill = ipif->ipif_ill;

	return (B_TRUE);
}
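
/*
 * Caller pattern (editorial sketch, not from the original source): a writer
 * thread bringing an ill down parks its mblk here only while references are
 * still active, holding ill_lock so that the quiescence test and the add
 * are atomic; ill_down_start() below does exactly this:
 *
 *	mutex_enter(&ill->ill_lock);
 *	if (!ill_is_quiescent(ill)) {
 *		(void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq,
 *		    mp, ILL_DOWN);
 *		mutex_exit(&ill->ill_lock);
 *		return (B_FALSE);
 *	}
 *	mutex_exit(&ill->ill_lock);
 */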
/*
 * Retrieve the ipx_pending_mp and return it. There can be only 1 mp
 * queued in the list.
 */
mblk_t *
ipsq_pending_mp_get(ipsq_t *ipsq, conn_t **connpp)
{
	mblk_t	*curr = NULL;
	ipxop_t	*ipx = ipsq->ipsq_xop;

	*connpp = NULL;
	mutex_enter(&ipx->ipx_lock);
	if (ipx->ipx_pending_mp == NULL) {
		mutex_exit(&ipx->ipx_lock);
		return (NULL);
	}

	/* There can be only 1 such excl message */
	curr = ipx->ipx_pending_mp;
	ASSERT(curr->b_next == NULL);
	ipx->ipx_pending_ipif = NULL;
	ipx->ipx_pending_mp = NULL;
	ipx->ipx_waitfor = 0;
	mutex_exit(&ipx->ipx_lock);

	if (CONN_Q(curr->b_queue)) {
		/*
		 * This mp did a refhold on the conn, at the start of the ioctl.
		 * So we can safely return a pointer to the conn to the caller.
		 */
		*connpp = Q_TO_CONN(curr->b_queue);
	} else {
		*connpp = NULL;
	}
	curr->b_next = NULL;
	curr->b_prev = NULL;
	return (curr);
}
/*
 * Cleanup the ioctl mp queued in ipx_pending_mp
 * - Called in the ill_delete path
 * - Called in the M_ERROR or M_HANGUP path on the ill.
 * - Called in the conn close path.
 *
 * Returns success on finding the pending mblk associated with the ioctl or
 * exclusive operation in progress, failure otherwise.
 */
boolean_t
ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp)
{
	mblk_t	*mp;
	ipxop_t	*ipx;
	queue_t	*q;
	ipif_t	*ipif;
	int	cmd;

	ASSERT(IAM_WRITER_ILL(ill));
	ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop;

	mutex_enter(&ipx->ipx_lock);
	mp = ipx->ipx_pending_mp;
	if (connp != NULL) {
		if (mp == NULL || mp->b_queue != CONNP_TO_WQ(connp)) {
			/*
			 * Nothing to clean since the conn that is closing
			 * does not have a matching pending mblk in
			 * ipx_pending_mp.
			 */
			mutex_exit(&ipx->ipx_lock);
			return (B_FALSE);
		}
	} else {
		/*
		 * A non-zero ill_error signifies we are called in the
		 * M_ERROR or M_HANGUP path and we need to unconditionally
		 * abort any current ioctl and do the corresponding cleanup.
		 * A zero ill_error means we are in the ill_delete path and
		 * we do the cleanup only if there is a pending mp.
		 */
		if (mp == NULL && ill->ill_error == 0) {
			mutex_exit(&ipx->ipx_lock);
			return (B_FALSE);
		}
	}

	/* Now remove from the ipx_pending_mp */
	ipx->ipx_pending_mp = NULL;
	ipif = ipx->ipx_pending_ipif;
	ipx->ipx_pending_ipif = NULL;
	ipx->ipx_waitfor = 0;
	ipx->ipx_current_ipif = NULL;
	cmd = ipx->ipx_current_ioctl;
	ipx->ipx_current_ioctl = 0;
	ipx->ipx_current_done = B_TRUE;
	mutex_exit(&ipx->ipx_lock);

	if (mp == NULL)
		return (B_FALSE);

	q = mp->b_queue;
	mp->b_next = NULL;
	mp->b_prev = NULL;
	mp->b_queue = NULL;

	if (DB_TYPE(mp) == M_IOCTL || DB_TYPE(mp) == M_IOCDATA) {
		DTRACE_PROBE4(ipif__ioctl,
		    char *, "ipsq_pending_mp_cleanup",
		    int, cmd, ill_t *, ipif == NULL ? NULL : ipif->ipif_ill,
		    ipif_t *, ipif);
		if (connp == NULL) {
			ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL);
		} else {
			ip_ioctl_finish(q, mp, ENXIO, CONN_CLOSE, NULL);
			mutex_enter(&ipif->ipif_ill->ill_lock);
			ipif->ipif_state_flags &= ~IPIF_CHANGING;
			mutex_exit(&ipif->ipif_ill->ill_lock);
		}
	} else {
		inet_freemsg(mp);
	}
	return (B_TRUE);
}
/*
 * Called in the conn close path and ill delete path
 */
static void
ipsq_xopq_mp_cleanup(ill_t *ill, conn_t *connp)
{
	ipsq_t	*ipsq;
	mblk_t	*prev;
	mblk_t	*curr;
	mblk_t	*next;
	queue_t	*wq, *rq = NULL;
	mblk_t	*tmp_list = NULL;

	ASSERT(IAM_WRITER_ILL(ill));
	if (connp != NULL)
		wq = CONNP_TO_WQ(connp);
	else
		wq = ill->ill_wq;

	/*
	 * In the case of lo0 being unplumbed, ill_wq will be NULL. Guard
	 * against this here.
	 */
	if (wq != NULL)
		rq = RD(wq);

	ipsq = ill->ill_phyint->phyint_ipsq;
	/*
	 * Cleanup the ioctl mp's queued in ipsq_xopq_pending_mp if any.
	 * In the case of ioctl from a conn, there can be only 1 mp
	 * queued on the ipsq. If an ill is being unplumbed flush all
	 * the messages.
	 */
	mutex_enter(&ipsq->ipsq_lock);
	for (prev = NULL, curr = ipsq->ipsq_xopq_mphead; curr != NULL;
	    curr = next) {
		next = curr->b_next;
		if (connp == NULL ||
		    (curr->b_queue == wq || curr->b_queue == rq)) {
			/* Unlink the mblk from the pending mp list */
			if (prev != NULL) {
				prev->b_next = curr->b_next;
			} else {
				ASSERT(ipsq->ipsq_xopq_mphead == curr);
				ipsq->ipsq_xopq_mphead = curr->b_next;
			}
			if (ipsq->ipsq_xopq_mptail == curr)
				ipsq->ipsq_xopq_mptail = prev;
			/*
			 * Create a temporary list and release the ipsq lock
			 * New elements are added to the head of the tmp_list
			 */
			curr->b_next = tmp_list;
			tmp_list = curr;
		} else {
			prev = curr;
		}
	}
	mutex_exit(&ipsq->ipsq_lock);

	while (tmp_list != NULL) {
		curr = tmp_list;
		tmp_list = curr->b_next;
		curr->b_next = NULL;
		curr->b_prev = NULL;
		wq = curr->b_queue;
		curr->b_queue = NULL;
		if (DB_TYPE(curr) == M_IOCTL || DB_TYPE(curr) == M_IOCDATA) {
			DTRACE_PROBE4(ipif__ioctl,
			    char *, "ipsq_xopq_mp_cleanup",
			    int, 0, ill_t *, NULL, ipif_t *, NULL);
			ip_ioctl_finish(wq, curr, ENXIO, connp != NULL ?
			    CONN_CLOSE : NO_COPYOUT, NULL);
		} else {
			/*
			 * IP-MT XXX In the case of TLI/XTI bind / optmgmt
			 * this can't be just inet_freemsg. we have to
			 * restart it otherwise the thread will be stuck.
			 */
			inet_freemsg(curr);
		}
	}
}
/*
 * This conn has started closing. Cleanup any pending ioctl from this conn.
 * STREAMS ensures that there can be at most 1 active ioctl on a stream.
 */
void
conn_ioctl_cleanup(conn_t *connp)
{
	ipsq_t	*ipsq;
	ill_t	*ill;
	boolean_t refheld;

	/*
	 * Check for a queued ioctl. If the ioctl has not yet started, the mp
	 * is pending in the list headed by ipsq_xopq_head. If the ioctl has
	 * started the mp could be present in ipx_pending_mp. Note that if
	 * conn_oper_pending_ill is NULL, the ioctl may still be in flight and
	 * not yet queued anywhere. In this case, the conn close code will wait
	 * until the conn_ref is dropped. If the stream was a tcp stream, then
	 * tcp_close will wait first until all ioctls have completed for this
	 * conn.
	 */
	mutex_enter(&connp->conn_lock);
	ill = connp->conn_oper_pending_ill;
	if (ill == NULL) {
		mutex_exit(&connp->conn_lock);
		return;
	}

	/*
	 * We may not be able to refhold the ill if the ill/ipif
	 * is changing. But we need to make sure that the ill will
	 * not vanish. So we just bump up the ill_waiter count.
	 */
	refheld = ill_waiter_inc(ill);
	mutex_exit(&connp->conn_lock);
	if (refheld) {
		if (ipsq_enter(ill, B_TRUE, NEW_OP)) {
			ill_waiter_dcr(ill);
			/*
			 * Check whether this ioctl has started and is
			 * pending. If it is not found there then check
			 * whether this ioctl has not even started and is in
			 * the ipsq_xopq list.
			 */
			if (!ipsq_pending_mp_cleanup(ill, connp))
				ipsq_xopq_mp_cleanup(ill, connp);
			ipsq = ill->ill_phyint->phyint_ipsq;
			ipsq_exit(ipsq);
			return;
		}
	}

	/*
	 * The ill is also closing and we could not bump up the
	 * ill_waiter_count or we could not enter the ipsq. Leave
	 * the cleanup to ill_delete
	 */
	mutex_enter(&connp->conn_lock);
	while (connp->conn_oper_pending_ill != NULL)
		cv_wait(&connp->conn_refcv, &connp->conn_lock);
	mutex_exit(&connp->conn_lock);
	if (refheld)
		ill_waiter_dcr(ill);
}
/*
 * ipcl_walk function for cleaning up conn_*_ill fields.
 * Note that we leave ixa_multicast_ifindex, conn_incoming_ifindex, and
 * conn_bound_if in place. We prefer dropping
 * packets instead of sending them out the wrong interface, or accepting
 * packets from the wrong ifindex.
 */
static void
conn_cleanup_ill(conn_t *connp, caddr_t arg)
{
	ill_t	*ill = (ill_t *)arg;

	mutex_enter(&connp->conn_lock);
	if (connp->conn_dhcpinit_ill == ill) {
		connp->conn_dhcpinit_ill = NULL;
		ASSERT(ill->ill_dhcpinit != 0);
		atomic_dec_32(&ill->ill_dhcpinit);
		ill_set_inputfn(ill);
	}
	mutex_exit(&connp->conn_lock);
}
static int
ill_down_ipifs_tail(ill_t *ill)
{
	ipif_t	*ipif;
	int err;

	ASSERT(IAM_WRITER_ILL(ill));
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		ipif_non_duplicate(ipif);
		/*
		 * ipif_down_tail will call arp_ll_down on the last ipif
		 * and typically return EINPROGRESS when the DL_UNBIND is sent.
		 */
		if ((err = ipif_down_tail(ipif)) != 0)
			return (err);
	}
	return (0);
}
/* ARGSUSED */
void
ipif_all_down_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
{
	ASSERT(IAM_WRITER_IPSQ(ipsq));
	(void) ill_down_ipifs_tail(q->q_ptr);
	freemsg(mp);
	ipsq_current_finish(ipsq);
}
/*
 * ill_down_start is called when we want to down this ill and bring it up
 * again. It is called when we receive an M_ERROR / M_HANGUP. In this case
 * we shut down all interfaces, but don't tear down any plumbing.
 */
boolean_t
ill_down_start(queue_t *q, mblk_t *mp)
{
	ill_t	*ill = q->q_ptr;
	ipif_t	*ipif;

	ASSERT(IAM_WRITER_ILL(ill));
	/*
	 * It is possible that some ioctl is already in progress while we
	 * received the M_ERROR / M_HANGUP in which case, we need to abort
	 * the ioctl. ill_down_start() is being processed as CUR_OP rather
	 * than as NEW_OP since the cause of the M_ERROR / M_HANGUP may prevent
	 * the in progress ioctl from ever completing.
	 *
	 * The thread that started the ioctl (if any) must have returned,
	 * since we are now executing as writer. After the 2 calls below,
	 * the state of the ipsq and the ill would reflect no trace of any
	 * pending operation. Subsequently if there is any response to the
	 * original ioctl from the driver, it would be discarded as an
	 * unsolicited message from the driver.
	 */
	(void) ipsq_pending_mp_cleanup(ill, NULL);
	ill_dlpi_clear_deferred(ill);

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
		(void) ipif_down(ipif, NULL, NULL);

	ill_down(ill);

	/*
	 * Walk all CONNs that can have a reference on an ire or nce for this
	 * ill (we actually walk all that now have stale references).
	 */
	ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ill->ill_ipst);

	/* With IPv6 we have dce_ifindex. Cleanup for neatness */
	if (ill->ill_isv6)
		dce_cleanup(ill->ill_phyint->phyint_ifindex, ill->ill_ipst);

	ipsq_current_start(ill->ill_phyint->phyint_ipsq, ill->ill_ipif, 0);

	/*
	 * Atomically test and add the pending mp if references are active.
	 */
	mutex_enter(&ill->ill_lock);
	if (!ill_is_quiescent(ill)) {
		/* call cannot fail since `conn_t *' argument is NULL */
		(void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq,
		    mp, ILL_DOWN);
		mutex_exit(&ill->ill_lock);
		return (B_FALSE);
	}
	mutex_exit(&ill->ill_lock);
	return (B_TRUE);
}
static void
ill_down(ill_t *ill)
{
	mblk_t	*mp;
	ip_stack_t	*ipst = ill->ill_ipst;

	/*
	 * Blow off any IREs dependent on this ILL.
	 * The caller needs to handle conn_ixa_cleanup
	 */
	ill_delete_ires(ill);

	ire_walk_ill(0, 0, ill_downi, ill, ill);

	/* Remove any conn_*_ill depending on this ill */
	ipcl_walk(conn_cleanup_ill, (caddr_t)ill, ipst);

	/*
	 * Free state for additional IREs.
	 */
	mutex_enter(&ill->ill_saved_ire_lock);
	mp = ill->ill_saved_ire_mp;
	ill->ill_saved_ire_mp = NULL;
	ill->ill_saved_ire_cnt = 0;
	mutex_exit(&ill->ill_saved_ire_lock);
	freemsg(mp);
}
/*
 * ire_walk routine used to delete every IRE that depends on
 * 'ill'. (Always called as writer, and may only be called from ire_walk.)
 *
 * Note: since the routes added by the kernel are deleted separately,
 * this will only be 1) IRE_IF_CLONE and 2) manually added IRE_INTERFACE.
 *
 * We also remove references on ire_nce_cache entries that refer to the ill.
 */
void
ill_downi(ire_t *ire, char *ill_arg)
{
	ill_t	*ill = (ill_t *)ill_arg;
	nce_t	*nce;

	mutex_enter(&ire->ire_lock);
	nce = ire->ire_nce_cache;
	if (nce != NULL && nce->nce_ill == ill)
		ire->ire_nce_cache = NULL;
	else
		nce = NULL;
	mutex_exit(&ire->ire_lock);
	if (nce != NULL)
		nce_refrele(nce);
	if (ire->ire_ill == ill) {
		/*
		 * The existing interface binding for ire must be
		 * deleted before trying to bind the route to another
		 * interface. However, since we are using the contents of the
		 * ire after ire_delete, the caller has to ensure that
		 * CONDEMNED (deleted) ire's are not removed from the list
		 * when ire_delete() returns. Currently ill_downi() is
		 * only called as part of ire_walk*() routines, so that
		 * the irb_refhold() done by ire_walk*() will ensure that
		 * ire_delete() does not lead to ire_inactive().
		 */
		ASSERT(ire->ire_bucket->irb_refcnt > 0);
		ire_delete(ire);
		if (ire->ire_unbound)
			ire_rebind(ire);
	}
}
/* Remove IRE_IF_CLONE on this ill */
void
ill_downi_if_clone(ire_t *ire, char *ill_arg)
{
	ill_t	*ill = (ill_t *)ill_arg;

	ASSERT(ire->ire_type & IRE_IF_CLONE);
	if (ire->ire_ill == ill)
		ire_delete(ire);
}
/* Consume an M_IOCACK of the fastpath probe. */
void
ill_fastpath_ack(ill_t *ill, mblk_t *mp)
{
	mblk_t	*mp1 = mp;

	/*
	 * If this was the first attempt turn on the fastpath probing.
	 */
	mutex_enter(&ill->ill_lock);
	if (ill->ill_dlpi_fastpath_state == IDS_INPROGRESS)
		ill->ill_dlpi_fastpath_state = IDS_OK;
	mutex_exit(&ill->ill_lock);

	/* Free the M_IOCACK mblk, hold on to the data */
	mp = mp->b_cont;
	freeb(mp1);
	if (mp == NULL)
		return;
	if (mp->b_cont != NULL)
		nce_fastpath_update(ill, mp);
	else
		ip0dbg(("ill_fastpath_ack: no b_cont\n"));
	freemsg(mp);
}
/*
 * Throw an M_IOCTL message downstream asking "do you know fastpath?"
 * The data portion of the request is a dl_unitdata_req_t template for
 * what we would send downstream in the absence of a fastpath confirmation.
 */
int
ill_fastpath_probe(ill_t *ill, mblk_t *dlur_mp)
{
	struct iocblk	*ioc;
	mblk_t	*mp;

	if (dlur_mp == NULL)
		return (EINVAL);

	mutex_enter(&ill->ill_lock);
	switch (ill->ill_dlpi_fastpath_state) {
	case IDS_FAILED:
		/*
		 * Driver NAKed the first fastpath ioctl - assume it doesn't
		 * support it.
		 */
		mutex_exit(&ill->ill_lock);
		return (ENOTSUP);
	case IDS_UNKNOWN:
		/* This is the first probe */
		ill->ill_dlpi_fastpath_state = IDS_INPROGRESS;
		break;
	default:
		break;
	}
	mutex_exit(&ill->ill_lock);

	if ((mp = mkiocb(DL_IOC_HDR_INFO)) == NULL)
		return (EAGAIN);

	mp->b_cont = copyb(dlur_mp);
	if (mp->b_cont == NULL) {
		freeb(mp);
		return (EAGAIN);
	}

	ioc = (struct iocblk *)mp->b_rptr;
	ioc->ioc_count = msgdsize(mp->b_cont);

	DTRACE_PROBE3(ill__dlpi, char *, "ill_fastpath_probe",
	    char *, "DL_IOC_HDR_INFO", ill_t *, ill);
	putnext(ill->ill_wq, mp);
	return (0);
}
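
/*
 * Probe/ack flow (editorial sketch, not from the original source):
 *
 *	IP					driver
 *	ill_fastpath_probe()
 *	    M_IOCTL(DL_IOC_HDR_INFO) with a
 *	    dl_unitdata_req template  ---->
 *					<----	M_IOCACK carrying the template
 *						plus the prebuilt header
 *	ill_fastpath_ack()
 *	    nce_fastpath_update()
 *
 * A NAK of the first probe is recorded as IDS_FAILED (see the IDS_FAILED
 * case above), after which further probes return ENOTSUP; a successful ack
 * moves IDS_INPROGRESS to IDS_OK.
 */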
void
ill_capability_probe(ill_t *ill)
{
	mblk_t	*mp;

	ASSERT(IAM_WRITER_ILL(ill));

	if (ill->ill_dlpi_capab_state != IDCS_UNKNOWN &&
	    ill->ill_dlpi_capab_state != IDCS_FAILED)
		return;

	/*
	 * We are starting a new cycle of capability negotiation.
	 * Free up the capab reset messages of any previous incarnation.
	 * We will do a fresh allocation when we get the response to our probe
	 */
	if (ill->ill_capab_reset_mp != NULL) {
		freemsg(ill->ill_capab_reset_mp);
		ill->ill_capab_reset_mp = NULL;
	}

	ip1dbg(("ill_capability_probe: starting capability negotiation\n"));

	mp = ip_dlpi_alloc(sizeof (dl_capability_req_t), DL_CAPABILITY_REQ);
	if (mp == NULL)
		return;

	ill_capability_send(ill, mp);
	ill->ill_dlpi_capab_state = IDCS_PROBE_SENT;
}
void
ill_capability_reset(ill_t *ill, boolean_t reneg)
{
	ASSERT(IAM_WRITER_ILL(ill));

	if (ill->ill_dlpi_capab_state != IDCS_OK)
		return;

	ill->ill_dlpi_capab_state = reneg ? IDCS_RENEG : IDCS_RESET_SENT;

	ill_capability_send(ill, ill->ill_capab_reset_mp);
	ill->ill_capab_reset_mp = NULL;
	/*
	 * We turn off all capabilities except those pertaining to
	 * direct function call capabilities viz. ILL_CAPAB_DLD*
	 * which will be turned off by the corresponding reset functions.
	 */
	ill->ill_capabilities &= ~(ILL_CAPAB_HCKSUM | ILL_CAPAB_ZEROCOPY);
}
static void
ill_capability_reset_alloc(ill_t *ill)
{
	mblk_t *mp;
	size_t	size = 0;
	int	err;
	dl_capability_req_t	*capb;

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(ill->ill_capab_reset_mp == NULL);

	if (ILL_HCKSUM_CAPABLE(ill)) {
		size += sizeof (dl_capability_sub_t) +
		    sizeof (dl_capab_hcksum_t);
	}

	if (ill->ill_capabilities & ILL_CAPAB_ZEROCOPY) {
		size += sizeof (dl_capability_sub_t) +
		    sizeof (dl_capab_zerocopy_t);
	}

	if (ill->ill_capabilities & ILL_CAPAB_DLD) {
		size += sizeof (dl_capability_sub_t) +
		    sizeof (dl_capab_dld_t);
	}

	mp = allocb_wait(size + sizeof (dl_capability_req_t), BPRI_MED,
	    STR_NOSIG, &err);

	mp->b_datap->db_type = M_PROTO;
	bzero(mp->b_rptr, size + sizeof (dl_capability_req_t));

	capb = (dl_capability_req_t *)mp->b_rptr;
	capb->dl_primitive = DL_CAPABILITY_REQ;
	capb->dl_sub_offset = sizeof (dl_capability_req_t);
	capb->dl_sub_length = size;

	mp->b_wptr += sizeof (dl_capability_req_t);

	/*
	 * Each handler fills in the corresponding dl_capability_sub_t
	 * inside the mblk.
	 */
	ill_capability_hcksum_reset_fill(ill, mp);
	ill_capability_zerocopy_reset_fill(ill, mp);
	ill_capability_dld_reset_fill(ill, mp);

	ill->ill_capab_reset_mp = mp;
}
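
/*
 * Resulting message layout (editorial sketch, not from the original source),
 * assuming all three capabilities are currently enabled:
 *
 *	+--------------------------+
 *	| dl_capability_req_t      |	dl_sub_offset / dl_sub_length
 *	+--------------------------+
 *	| dl_capability_sub_t      |	DL_CAPAB_HCKSUM
 *	| dl_capab_hcksum_t        |	hcksum_txflags == 0 (reset)
 *	+--------------------------+
 *	| dl_capability_sub_t      |	DL_CAPAB_ZEROCOPY
 *	| dl_capab_zerocopy_t      |	zerocopy_flags == 0 (reset)
 *	+--------------------------+
 *	| dl_capability_sub_t      |	DL_CAPAB_DLD
 *	| dl_capab_dld_t           |	body left uninitialized; unused by DLD
 *	+--------------------------+
 *
 * Each *_reset_fill handler advances mp->b_wptr past the sub-capability it
 * appends.
 */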
static void
ill_capability_id_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *outers)
{
	dl_capab_id_t *id_ic;
	uint_t sub_dl_cap = outers->dl_cap;
	dl_capability_sub_t *inners;
	uint8_t *capend;

	ASSERT(sub_dl_cap == DL_CAPAB_ID_WRAPPER);

	/*
	 * Note: range checks here are not absolutely sufficient to
	 * make us robust against malformed messages sent by drivers;
	 * this is in keeping with the rest of IP's dlpi handling.
	 * (Remember, it's coming from something else in the kernel
	 * address space)
	 */

	capend = (uint8_t *)(outers + 1) + outers->dl_length;
	if (capend > mp->b_wptr) {
		cmn_err(CE_WARN, "ill_capability_id_ack: "
		    "malformed sub-capability too long for mblk");
		return;
	}

	id_ic = (dl_capab_id_t *)(outers + 1);

	if (outers->dl_length < sizeof (*id_ic) ||
	    (inners = &id_ic->id_subcap,
	    inners->dl_length > (outers->dl_length - sizeof (*inners)))) {
		cmn_err(CE_WARN, "ill_capability_id_ack: malformed "
		    "encapsulated capab type %d too long for mblk",
		    inners->dl_cap);
		return;
	}

	if (!dlcapabcheckqid(&id_ic->id_mid, ill->ill_lmod_rq)) {
		ip1dbg(("ill_capability_id_ack: mid token for capab type %d "
		    "isn't as expected; pass-thru module(s) detected, "
		    "discarding capability\n", inners->dl_cap));
		return;
	}

	/* Process the encapsulated sub-capability */
	ill_capability_dispatch(ill, mp, inners);
}
static void
ill_capability_dld_reset_fill(ill_t *ill, mblk_t *mp)
{
	dl_capability_sub_t *dl_subcap;

	if (!(ill->ill_capabilities & ILL_CAPAB_DLD))
		return;

	/*
	 * The dl_capab_dld_t that follows the dl_capability_sub_t is not
	 * initialized below since it is not used by DLD.
	 */
	dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
	dl_subcap->dl_cap = DL_CAPAB_DLD;
	dl_subcap->dl_length = sizeof (dl_capab_dld_t);

	mp->b_wptr += sizeof (dl_capability_sub_t) + sizeof (dl_capab_dld_t);
}
static void
ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp)
{
	/*
	 * If no ipif was brought up over this ill, this DL_CAPABILITY_REQ/ACK
	 * is only to get the VRRP capability.
	 *
	 * Note that we cannot check ill_ipif_up_count here since
	 * ill_ipif_up_count is only incremented when the resolver is setup.
	 * That is done asynchronously, and can race with this function.
	 */
	if (!ill->ill_dl_up) {
		if (subp->dl_cap == DL_CAPAB_VRRP)
			ill_capability_vrrp_ack(ill, mp, subp);
		return;
	}

	switch (subp->dl_cap) {
	case DL_CAPAB_HCKSUM:
		ill_capability_hcksum_ack(ill, mp, subp);
		break;
	case DL_CAPAB_ZEROCOPY:
		ill_capability_zerocopy_ack(ill, mp, subp);
		break;
	case DL_CAPAB_DLD:
		ill_capability_dld_ack(ill, mp, subp);
		break;
	case DL_CAPAB_VRRP:
		break;
	default:
		ip1dbg(("ill_capability_dispatch: unknown capab type %d\n",
		    subp->dl_cap));
	}
}
/*
 * Process the vrrp capability received from a DLS Provider. isub must point
 * to the sub-capability (DL_CAPAB_VRRP) of a DL_CAPABILITY_ACK message.
 */
static void
ill_capability_vrrp_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
{
	dl_capab_vrrp_t	*vrrp;
	uint_t		sub_dl_cap = isub->dl_cap;
	uint8_t		*capend;

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(sub_dl_cap == DL_CAPAB_VRRP);

	/*
	 * Note: range checks here are not absolutely sufficient to
	 * make us robust against malformed messages sent by drivers;
	 * this is in keeping with the rest of IP's dlpi handling.
	 * (Remember, it's coming from something else in the kernel
	 * address space)
	 */
	capend = (uint8_t *)(isub + 1) + isub->dl_length;
	if (capend > mp->b_wptr) {
		cmn_err(CE_WARN, "ill_capability_vrrp_ack: "
		    "malformed sub-capability too long for mblk");
		return;
	}
	vrrp = (dl_capab_vrrp_t *)(isub + 1);

	/*
	 * Compare the IP address family and set ILLF_VRRP for the right ill.
	 */
	if ((vrrp->vrrp_af == AF_INET6 && ill->ill_isv6) ||
	    (vrrp->vrrp_af == AF_INET && !ill->ill_isv6)) {
		ill->ill_flags |= ILLF_VRRP;
	}
}
/*
 * Process a hardware checksum offload capability negotiation ack received
 * from a DLS Provider. isub must point to the sub-capability
 * (DL_CAPAB_HCKSUM) of a DL_CAPABILITY_ACK message.
 */
static void
ill_capability_hcksum_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
{
	dl_capability_req_t	*ocap;
	dl_capab_hcksum_t	*ihck, *ohck;
	ill_hcksum_capab_t	**ill_hcksum;
	mblk_t			*nmp = NULL;
	uint_t			sub_dl_cap = isub->dl_cap;
	uint8_t			*capend;

	ASSERT(sub_dl_cap == DL_CAPAB_HCKSUM);

	ill_hcksum = (ill_hcksum_capab_t **)&ill->ill_hcksum_capab;

	/*
	 * Note: range checks here are not absolutely sufficient to
	 * make us robust against malformed messages sent by drivers;
	 * this is in keeping with the rest of IP's dlpi handling.
	 * (Remember, it's coming from something else in the kernel
	 * address space)
	 */
	capend = (uint8_t *)(isub + 1) + isub->dl_length;
	if (capend > mp->b_wptr) {
		cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
		    "malformed sub-capability too long for mblk");
		return;
	}

	/*
	 * There are two types of acks we process here:
	 * 1. acks in reply to a (first form) generic capability req
	 *    (no ENABLE flag set)
	 * 2. acks in reply to a ENABLE capability req.
	 *    (ENABLE flag set)
	 */
	ihck = (dl_capab_hcksum_t *)(isub + 1);

	if (ihck->hcksum_version != HCKSUM_VERSION_1) {
		cmn_err(CE_CONT, "ill_capability_hcksum_ack: "
		    "unsupported hardware checksum "
		    "sub-capability (version %d, expected %d)",
		    ihck->hcksum_version, HCKSUM_VERSION_1);
		return;
	}

	if (!dlcapabcheckqid(&ihck->hcksum_mid, ill->ill_lmod_rq)) {
		ip1dbg(("ill_capability_hcksum_ack: mid token for hardware "
		    "checksum capability isn't as expected; pass-thru "
		    "module(s) detected, discarding capability\n"));
		return;
	}

#define	CURR_HCKSUM_CAPAB				\
	(HCKSUM_INET_PARTIAL | HCKSUM_INET_FULL_V4 |	\
	HCKSUM_INET_FULL_V6 | HCKSUM_IPHDRCKSUM)

	if ((ihck->hcksum_txflags & HCKSUM_ENABLE) &&
	    (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB)) {
		/* do ENABLE processing */
		if (*ill_hcksum == NULL) {
			*ill_hcksum = kmem_zalloc(sizeof (ill_hcksum_capab_t),
			    KM_NOSLEEP);

			if (*ill_hcksum == NULL) {
				cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
				    "could not enable hcksum version %d "
				    "for %s (ENOMEM)\n", HCKSUM_CURRENT_VERSION,
				    ill->ill_name);
				return;
			}
		}

		(*ill_hcksum)->ill_hcksum_version = ihck->hcksum_version;
		(*ill_hcksum)->ill_hcksum_txflags = ihck->hcksum_txflags;
		ill->ill_capabilities |= ILL_CAPAB_HCKSUM;
		ip1dbg(("ill_capability_hcksum_ack: interface %s "
		    "has enabled hardware checksumming\n ",
		    ill->ill_name));
	} else if (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB) {
		/*
		 * Enabling hardware checksum offload
		 * Currently IP supports {TCP,UDP}/IPv4
		 * partial and full cksum offload and
		 * IPv4 header checksum offload.
		 * Allocate new mblk which will
		 * contain a new capability request
		 * to enable hardware checksum offload.
		 */
		uint_t	size;
		uchar_t	*rptr;

		size = sizeof (dl_capability_req_t) +
		    sizeof (dl_capability_sub_t) + isub->dl_length;

		if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) {
			cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
			    "could not enable hardware cksum for %s (ENOMEM)\n",
			    ill->ill_name);
			return;
		}

		rptr = nmp->b_rptr;
		/* initialize dl_capability_req_t */
		ocap = (dl_capability_req_t *)nmp->b_rptr;
		ocap->dl_sub_offset =
		    sizeof (dl_capability_req_t);
		ocap->dl_sub_length =
		    sizeof (dl_capability_sub_t) +
		    isub->dl_length;
		nmp->b_rptr += sizeof (dl_capability_req_t);

		/* initialize dl_capability_sub_t */
		bcopy(isub, nmp->b_rptr, sizeof (*isub));
		nmp->b_rptr += sizeof (*isub);

		/* initialize dl_capab_hcksum_t */
		ohck = (dl_capab_hcksum_t *)nmp->b_rptr;
		bcopy(ihck, ohck, sizeof (*ihck));

		nmp->b_rptr = rptr;
		ASSERT(nmp->b_wptr == (nmp->b_rptr + size));

		/* Set ENABLE flag */
		ohck->hcksum_txflags &= CURR_HCKSUM_CAPAB;
		ohck->hcksum_txflags |= HCKSUM_ENABLE;

		/*
		 * nmp points to a DL_CAPABILITY_REQ message to enable
		 * hardware checksum acceleration.
		 */
		ill_capability_send(ill, nmp);
	} else {
		ip1dbg(("ill_capability_hcksum_ack: interface %s has "
		    "advertised %x hardware checksum capability flags\n",
		    ill->ill_name, ihck->hcksum_txflags));
	}
}
static void
ill_capability_hcksum_reset_fill(ill_t *ill, mblk_t *mp)
{
	dl_capab_hcksum_t *hck_subcap;
	dl_capability_sub_t *dl_subcap;

	if (!ILL_HCKSUM_CAPABLE(ill))
		return;

	ASSERT(ill->ill_hcksum_capab != NULL);

	dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
	dl_subcap->dl_cap = DL_CAPAB_HCKSUM;
	dl_subcap->dl_length = sizeof (*hck_subcap);

	hck_subcap = (dl_capab_hcksum_t *)(dl_subcap + 1);
	hck_subcap->hcksum_version = ill->ill_hcksum_capab->ill_hcksum_version;
	hck_subcap->hcksum_txflags = 0;

	mp->b_wptr += sizeof (*dl_subcap) + sizeof (*hck_subcap);
}
static void
ill_capability_zerocopy_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
{
	mblk_t *nmp = NULL;
	dl_capability_req_t *oc;
	dl_capab_zerocopy_t *zc_ic, *zc_oc;
	ill_zerocopy_capab_t **ill_zerocopy_capab;
	uint_t sub_dl_cap = isub->dl_cap;
	uint8_t *capend;

	ASSERT(sub_dl_cap == DL_CAPAB_ZEROCOPY);

	ill_zerocopy_capab = (ill_zerocopy_capab_t **)&ill->ill_zerocopy_capab;

	/*
	 * Note: range checks here are not absolutely sufficient to
	 * make us robust against malformed messages sent by drivers;
	 * this is in keeping with the rest of IP's dlpi handling.
	 * (Remember, it's coming from something else in the kernel
	 * address space)
	 */
	capend = (uint8_t *)(isub + 1) + isub->dl_length;
	if (capend > mp->b_wptr) {
		cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
		    "malformed sub-capability too long for mblk");
		return;
	}

	zc_ic = (dl_capab_zerocopy_t *)(isub + 1);
	if (zc_ic->zerocopy_version != ZEROCOPY_VERSION_1) {
		cmn_err(CE_CONT, "ill_capability_zerocopy_ack: "
		    "unsupported ZEROCOPY sub-capability (version %d, "
		    "expected %d)", zc_ic->zerocopy_version,
		    ZEROCOPY_VERSION_1);
		return;
	}

	if (!dlcapabcheckqid(&zc_ic->zerocopy_mid, ill->ill_lmod_rq)) {
		ip1dbg(("ill_capability_zerocopy_ack: mid token for zerocopy "
		    "capability isn't as expected; pass-thru module(s) "
		    "detected, discarding capability\n"));
		return;
	}

	if ((zc_ic->zerocopy_flags & DL_CAPAB_VMSAFE_MEM) != 0) {
		if (*ill_zerocopy_capab == NULL) {
			*ill_zerocopy_capab =
			    kmem_zalloc(sizeof (ill_zerocopy_capab_t),
			    KM_NOSLEEP);

			if (*ill_zerocopy_capab == NULL) {
				cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
				    "could not enable Zero-copy version %d "
				    "for %s (ENOMEM)\n", ZEROCOPY_VERSION_1,
				    ill->ill_name);
				return;
			}
		}

		ip1dbg(("ill_capability_zerocopy_ack: interface %s "
		    "supports Zero-copy version %d\n", ill->ill_name,
		    ZEROCOPY_VERSION_1));

		(*ill_zerocopy_capab)->ill_zerocopy_version =
		    zc_ic->zerocopy_version;
		(*ill_zerocopy_capab)->ill_zerocopy_flags =
		    zc_ic->zerocopy_flags;

		ill->ill_capabilities |= ILL_CAPAB_ZEROCOPY;
	} else {
		uint_t size;
		uchar_t *rptr;

		size = sizeof (dl_capability_req_t) +
		    sizeof (dl_capability_sub_t) +
		    sizeof (dl_capab_zerocopy_t);

		if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) {
			cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
			    "could not enable zerocopy for %s (ENOMEM)\n",
			    ill->ill_name);
			return;
		}

		rptr = nmp->b_rptr;
		/* initialize dl_capability_req_t */
		oc = (dl_capability_req_t *)rptr;
		oc->dl_sub_offset = sizeof (dl_capability_req_t);
		oc->dl_sub_length = sizeof (dl_capability_sub_t) +
		    sizeof (dl_capab_zerocopy_t);
		rptr += sizeof (dl_capability_req_t);

		/* initialize dl_capability_sub_t */
		bcopy(isub, rptr, sizeof (*isub));
		rptr += sizeof (*isub);

		/* initialize dl_capab_zerocopy_t */
		zc_oc = (dl_capab_zerocopy_t *)rptr;
		*zc_oc = *zc_ic;

		ip1dbg(("ill_capability_zerocopy_ack: asking interface %s "
		    "to enable zero-copy version %d\n", ill->ill_name,
		    ZEROCOPY_VERSION_1));

		/* set VMSAFE_MEM flag */
		zc_oc->zerocopy_flags |= DL_CAPAB_VMSAFE_MEM;

		/* nmp points to a DL_CAPABILITY_REQ message to enable zcopy */
		ill_capability_send(ill, nmp);
	}
}
static void
ill_capability_zerocopy_reset_fill(ill_t *ill, mblk_t *mp)
{
	dl_capab_zerocopy_t *zerocopy_subcap;
	dl_capability_sub_t *dl_subcap;

	if (!(ill->ill_capabilities & ILL_CAPAB_ZEROCOPY))
		return;

	ASSERT(ill->ill_zerocopy_capab != NULL);

	dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
	dl_subcap->dl_cap = DL_CAPAB_ZEROCOPY;
	dl_subcap->dl_length = sizeof (*zerocopy_subcap);

	zerocopy_subcap = (dl_capab_zerocopy_t *)(dl_subcap + 1);
	zerocopy_subcap->zerocopy_version =
	    ill->ill_zerocopy_capab->ill_zerocopy_version;
	zerocopy_subcap->zerocopy_flags = 0;

	mp->b_wptr += sizeof (*dl_subcap) + sizeof (*zerocopy_subcap);
}
1888 * DLD capability
1889 * Refer to dld.h for more information regarding the purpose and usage
1890 * of this capability.
1892 static void
1893 ill_capability_dld_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
1895 dl_capab_dld_t *dld_ic, dld;
1896 uint_t sub_dl_cap = isub->dl_cap;
1897 uint8_t *capend;
1898 ill_dld_capab_t *idc;
1900 ASSERT(IAM_WRITER_ILL(ill));
1901 ASSERT(sub_dl_cap == DL_CAPAB_DLD);
1904 * Note: range checks here are not absolutely sufficient to
1905 * make us robust against malformed messages sent by drivers;
1906 * this is in keeping with the rest of IP's dlpi handling.
1907 * (Remember, it's coming from something else in the kernel
1908 * address space)
1910 capend = (uint8_t *)(isub + 1) + isub->dl_length;
1911 if (capend > mp->b_wptr) {
1912 cmn_err(CE_WARN, "ill_capability_dld_ack: "
1913 "malformed sub-capability too long for mblk");
1914 return;
1916 dld_ic = (dl_capab_dld_t *)(isub + 1);
1917 if (dld_ic->dld_version != DLD_CURRENT_VERSION) {
1918 cmn_err(CE_CONT, "ill_capability_dld_ack: "
1919 "unsupported DLD sub-capability (version %d, "
1920 "expected %d)", dld_ic->dld_version,
1921 DLD_CURRENT_VERSION);
1922 return;
1924 if (!dlcapabcheckqid(&dld_ic->dld_mid, ill->ill_lmod_rq)) {
1925 ip1dbg(("ill_capability_dld_ack: mid token for dld "
1926 "capability isn't as expected; pass-thru module(s) "
1927 "detected, discarding capability\n"));
1928 return;
1932 * Copy locally to ensure alignment.
1934 bcopy(dld_ic, &dld, sizeof (dl_capab_dld_t));
1936 if ((idc = ill->ill_dld_capab) == NULL) {
1937 idc = kmem_zalloc(sizeof (ill_dld_capab_t), KM_NOSLEEP);
1938 if (idc == NULL) {
1939 cmn_err(CE_WARN, "ill_capability_dld_ack: "
1940 "could not enable DLD version %d "
1941 "for %s (ENOMEM)\n", DLD_CURRENT_VERSION,
1942 ill->ill_name);
1943 return;
1945 ill->ill_dld_capab = idc;
1947 idc->idc_capab_df = (ip_capab_func_t)dld.dld_capab;
1948 idc->idc_capab_dh = (void *)dld.dld_capab_handle;
1949 ip1dbg(("ill_capability_dld_ack: interface %s "
1950 "supports DLD version %d\n", ill->ill_name, DLD_CURRENT_VERSION));
1952 ill_capability_dld_enable(ill);
1956 * Typically capability negotiation between IP and the driver happens via
1957 * DLPI message exchange. However GLD also offers a direct function call
1958 * mechanism to exchange the DLD_DIRECT_CAPAB and DLD_POLL_CAPAB capabilities.
1959 * But arbitrary function calls into IP or GLD are not permitted, since both
1960 * of them are protected by their own perimeter mechanism. The perimeter can
1961 * be viewed as a coarse lock or serialization mechanism. The hierarchy of
1962 * these perimeters is IP -> MAC. Thus for example to enable the squeue
1963 * polling, IP needs to enter its perimeter, then call ill_mac_perim_enter
1964 * to enter the mac perimeter and then do the direct function calls into
1965 * GLD to enable squeue polling. The ring related callbacks from the mac into
1966 * the stack to add, bind, quiesce, restart or cleanup a ring are all
1967 * protected by the mac perimeter.
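 *
 * The enable path below follows exactly that pattern (see
 * ill_capability_dld_enable()): enter the mac perimeter with
 * ill_mac_perim_enter(), make the direct DLD_ENABLE calls into GLD via
 * idc_capab_df(), then leave with ill_mac_perim_exit().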
1969 static void
1970 ill_mac_perim_enter(ill_t *ill, mac_perim_handle_t *mphp)
1972 ill_dld_capab_t *idc = ill->ill_dld_capab;
1973 int err;
1975 err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mphp,
1976 DLD_ENABLE);
1977 ASSERT(err == 0);
1980 static void
1981 ill_mac_perim_exit(ill_t *ill, mac_perim_handle_t mph)
1983 ill_dld_capab_t *idc = ill->ill_dld_capab;
1984 int err;
1986 err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mph,
1987 DLD_DISABLE);
1988 ASSERT(err == 0);
1991 boolean_t
1992 ill_mac_perim_held(ill_t *ill)
1994 ill_dld_capab_t *idc = ill->ill_dld_capab;
1996 return (idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, NULL,
1997 DLD_QUERY));
2000 static void
2001 ill_capability_direct_enable(ill_t *ill)
2003 ill_dld_capab_t *idc = ill->ill_dld_capab;
2004 ill_dld_direct_t *idd = &idc->idc_direct;
2005 dld_capab_direct_t direct;
2006 int rc;
2008 ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill));
2010 bzero(&direct, sizeof (direct));
2011 direct.di_rx_cf = (uintptr_t)ip_input;
2012 direct.di_rx_ch = ill;
2014 rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT, &direct,
2015 DLD_ENABLE);
2016 if (rc == 0) {
2017 idd->idd_tx_df = (ip_dld_tx_t)direct.di_tx_df;
2018 idd->idd_tx_dh = direct.di_tx_dh;
2019 idd->idd_tx_cb_df = (ip_dld_callb_t)direct.di_tx_cb_df;
2020 idd->idd_tx_cb_dh = direct.di_tx_cb_dh;
2021 idd->idd_tx_fctl_df = (ip_dld_fctl_t)direct.di_tx_fctl_df;
2022 idd->idd_tx_fctl_dh = direct.di_tx_fctl_dh;
2023 ASSERT(idd->idd_tx_cb_df != NULL);
2024 ASSERT(idd->idd_tx_fctl_df != NULL);
2025 ASSERT(idd->idd_tx_df != NULL);
2027 * One time registration of flow enable callback function
2029 ill->ill_flownotify_mh = idd->idd_tx_cb_df(idd->idd_tx_cb_dh,
2030 ill_flow_enable, ill);
2031 ill->ill_capabilities |= ILL_CAPAB_DLD_DIRECT;
2032 DTRACE_PROBE1(direct_on, (ill_t *), ill);
2033 } else {
2034 cmn_err(CE_WARN, "warning: could not enable DIRECT "
2035 "capability, rc = %d\n", rc);
2036 DTRACE_PROBE2(direct_off, (ill_t *), ill, (int), rc);
2040 static void
2041 ill_capability_poll_enable(ill_t *ill)
2043 ill_dld_capab_t *idc = ill->ill_dld_capab;
2044 dld_capab_poll_t poll;
2045 int rc;
2047 ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill));
2049 bzero(&poll, sizeof (poll));
2050 poll.poll_ring_add_cf = (uintptr_t)ip_squeue_add_ring;
2051 poll.poll_ring_remove_cf = (uintptr_t)ip_squeue_clean_ring;
2052 poll.poll_ring_quiesce_cf = (uintptr_t)ip_squeue_quiesce_ring;
2053 poll.poll_ring_restart_cf = (uintptr_t)ip_squeue_restart_ring;
2054 poll.poll_ring_bind_cf = (uintptr_t)ip_squeue_bind_ring;
2055 poll.poll_ring_ch = ill;
2056 rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL, &poll,
2057 DLD_ENABLE);
2058 if (rc == 0) {
2059 ill->ill_capabilities |= ILL_CAPAB_DLD_POLL;
2060 DTRACE_PROBE1(poll_on, (ill_t *), ill);
2061 } else {
2062 ip1dbg(("warning: could not enable POLL "
2063 "capability, rc = %d\n", rc));
2064 DTRACE_PROBE2(poll_off, (ill_t *), ill, (int), rc);
2069 * Enable the LSO capability.
2071 static void
2072 ill_capability_lso_enable(ill_t *ill)
2074 ill_dld_capab_t *idc = ill->ill_dld_capab;
2075 dld_capab_lso_t lso;
2076 int rc;
2078 ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill));
2080 if (ill->ill_lso_capab == NULL) {
2081 ill->ill_lso_capab = kmem_zalloc(sizeof (ill_lso_capab_t),
2082 KM_NOSLEEP);
2083 if (ill->ill_lso_capab == NULL) {
2084 cmn_err(CE_WARN, "ill_capability_lso_enable: "
2085 "could not enable LSO for %s (ENOMEM)\n",
2086 ill->ill_name);
2087 return;
2091 bzero(&lso, sizeof (lso));
2092 if ((rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO, &lso,
2093 DLD_ENABLE)) == 0) {
2094 ill->ill_lso_capab->ill_lso_flags = lso.lso_flags;
2095 ill->ill_lso_capab->ill_lso_max = lso.lso_max;
2096 ill->ill_capabilities |= ILL_CAPAB_LSO;
2097 ip1dbg(("ill_capability_lso_enable: interface %s "
2098 "has enabled LSO\n ", ill->ill_name));
2099 } else {
2100 kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t));
2101 ill->ill_lso_capab = NULL;
2102 DTRACE_PROBE2(lso_off, (ill_t *), ill, (int), rc);
2106 static void
2107 ill_capability_dld_enable(ill_t *ill)
2109 mac_perim_handle_t mph;
2111 ASSERT(IAM_WRITER_ILL(ill));
2113 if (ill->ill_isv6)
2114 return;
2116 ill_mac_perim_enter(ill, &mph);
2117 if (!ill->ill_isv6) {
2118 ill_capability_direct_enable(ill);
2119 ill_capability_poll_enable(ill);
2120 ill_capability_lso_enable(ill);
2122 ill->ill_capabilities |= ILL_CAPAB_DLD;
2123 ill_mac_perim_exit(ill, mph);
2126 static void
2127 ill_capability_dld_disable(ill_t *ill)
2129 ill_dld_capab_t *idc;
2130 ill_dld_direct_t *idd;
2131 mac_perim_handle_t mph;
2133 ASSERT(IAM_WRITER_ILL(ill));
2135 if (!(ill->ill_capabilities & ILL_CAPAB_DLD))
2136 return;
2138 ill_mac_perim_enter(ill, &mph);
2140 idc = ill->ill_dld_capab;
2141 if ((ill->ill_capabilities & ILL_CAPAB_DLD_DIRECT) != 0) {
2143 * For performance we avoid locks in the transmit data path
2144 * and don't maintain a count of the number of threads using
2145 * direct calls. Thus some threads could be using direct
2146 * transmit calls to GLD, even after the capability mechanism
2147 * turns it off. This is still safe since the handles used in
2148 * the direct calls continue to be valid until the unplumb is
2149 * completed. Remove the callback that was added (1-time) at
2150 * capab enable time.
2152 mutex_enter(&ill->ill_lock);
2153 ill->ill_capabilities &= ~ILL_CAPAB_DLD_DIRECT;
2154 mutex_exit(&ill->ill_lock);
2155 if (ill->ill_flownotify_mh != NULL) {
2156 idd = &idc->idc_direct;
2157 idd->idd_tx_cb_df(idd->idd_tx_cb_dh, NULL,
2158 ill->ill_flownotify_mh);
2159 ill->ill_flownotify_mh = NULL;
2161 (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT,
2162 NULL, DLD_DISABLE);
2165 if ((ill->ill_capabilities & ILL_CAPAB_DLD_POLL) != 0) {
2166 ill->ill_capabilities &= ~ILL_CAPAB_DLD_POLL;
2167 ip_squeue_clean_all(ill);
2168 (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL,
2169 NULL, DLD_DISABLE);
2172 if ((ill->ill_capabilities & ILL_CAPAB_LSO) != 0) {
2173 ASSERT(ill->ill_lso_capab != NULL);
2175 * Clear the capability flag for LSO but retain the
2176 * ill_lso_capab structure since it's possible that another
2177 * thread is still referring to it. The structure only gets
2178 * deallocated when we destroy the ill.
2181 ill->ill_capabilities &= ~ILL_CAPAB_LSO;
2182 (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO,
2183 NULL, DLD_DISABLE);
2186 ill->ill_capabilities &= ~ILL_CAPAB_DLD;
2187 ill_mac_perim_exit(ill, mph);
2191 * Capability Negotiation protocol
2193 * We don't wait for DLPI capability operations to finish during interface
2194 * bringup or teardown. Doing so would introduce more asynchrony and the
2195 * interface up/down operations would need multiple returns and restarts.
2196 * Instead the 'ipsq_current_ipif' of the ipsq is not cleared as long as
2197 * the 'ill_dlpi_deferred' chain is non-empty. This ensures that the next
2198 * exclusive operation won't start until the DLPI operations of the previous
2199 * exclusive operation complete.
2201 * The capability state machine is shown below.
2203 * state next state event, action
2205 * IDCS_UNKNOWN IDCS_PROBE_SENT ill_capability_probe
2206 * IDCS_PROBE_SENT IDCS_OK ill_capability_ack
2207 * IDCS_PROBE_SENT IDCS_FAILED ip_rput_dlpi_writer (nack)
2208 * IDCS_OK IDCS_RENEG Receipt of DL_NOTE_CAPAB_RENEG
2209 * IDCS_OK IDCS_RESET_SENT ill_capability_reset
2210 * IDCS_RESET_SENT IDCS_UNKNOWN ill_capability_ack_thr
2211 * IDCS_RENEG IDCS_PROBE_SENT ill_capability_ack_thr ->
2212 * ill_capability_probe.
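 *
 * For example, a normal plumb moves IDCS_UNKNOWN -> IDCS_PROBE_SENT ->
 * IDCS_OK, a teardown moves IDCS_OK -> IDCS_RESET_SENT -> IDCS_UNKNOWN,
 * and a driver-initiated DL_NOTE_CAPAB_RENEG moves IDCS_OK -> IDCS_RENEG
 * -> IDCS_PROBE_SENT, after which the probe runs again.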
2216 * Dedicated thread started from ip_stack_init that handles capability
2217 * disable. This thread ensures the taskq dispatch does not fail by waiting
2218 * for resources using TQ_SLEEP. The taskq mechanism is used to ensure
2219 * that direct calls to DLD are done in a cv_waitable context.
2221 void
2222 ill_taskq_dispatch(ip_stack_t *ipst)
2224 callb_cpr_t cprinfo;
2225 char name[64];
2226 mblk_t *mp;
2228 (void) snprintf(name, sizeof (name), "ill_taskq_dispatch_%d",
2229 ipst->ips_netstack->netstack_stackid);
2230 CALLB_CPR_INIT(&cprinfo, &ipst->ips_capab_taskq_lock, callb_generic_cpr,
2231 name);
2232 mutex_enter(&ipst->ips_capab_taskq_lock);
2234 for (;;) {
2235 mp = ipst->ips_capab_taskq_head;
2236 while (mp != NULL) {
2237 ipst->ips_capab_taskq_head = mp->b_next;
2238 if (ipst->ips_capab_taskq_head == NULL)
2239 ipst->ips_capab_taskq_tail = NULL;
2240 mutex_exit(&ipst->ips_capab_taskq_lock);
2241 mp->b_next = NULL;
2243 VERIFY(taskq_dispatch(system_taskq,
2244 ill_capability_ack_thr, mp, TQ_SLEEP) != 0);
2245 mutex_enter(&ipst->ips_capab_taskq_lock);
2246 mp = ipst->ips_capab_taskq_head;
2249 if (ipst->ips_capab_taskq_quit)
2250 break;
2251 CALLB_CPR_SAFE_BEGIN(&cprinfo);
2252 cv_wait(&ipst->ips_capab_taskq_cv, &ipst->ips_capab_taskq_lock);
2253 CALLB_CPR_SAFE_END(&cprinfo, &ipst->ips_capab_taskq_lock);
2255 VERIFY(ipst->ips_capab_taskq_head == NULL);
2256 VERIFY(ipst->ips_capab_taskq_tail == NULL);
2257 CALLB_CPR_EXIT(&cprinfo);
2258 thread_exit();
2262 * Consume a new-style hardware capabilities negotiation ack.
2263 * Called via taskq on receipt of DL_CAPABILITY_ACK.
2265 static void
2266 ill_capability_ack_thr(void *arg)
2268 mblk_t *mp = arg;
2269 dl_capability_ack_t *capp;
2270 dl_capability_sub_t *subp, *endp;
2271 ill_t *ill;
2272 boolean_t reneg;
2274 ill = (ill_t *)mp->b_prev;
2275 mp->b_prev = NULL;
2277 VERIFY(ipsq_enter(ill, B_FALSE, CUR_OP) == B_TRUE);
2279 if (ill->ill_dlpi_capab_state == IDCS_RESET_SENT ||
2280 ill->ill_dlpi_capab_state == IDCS_RENEG) {
2282 * We have received the ack for our DL_CAPAB reset request.
2283 * There isn't anything in the message that needs processing.
2284 * All message based capabilities have been disabled, now
2285 * do the function call based capability disable.
2287 reneg = ill->ill_dlpi_capab_state == IDCS_RENEG;
2288 ill_capability_dld_disable(ill);
2289 ill->ill_dlpi_capab_state = IDCS_UNKNOWN;
2290 if (reneg)
2291 ill_capability_probe(ill);
2292 goto done;
2295 if (ill->ill_dlpi_capab_state == IDCS_PROBE_SENT)
2296 ill->ill_dlpi_capab_state = IDCS_OK;
2298 capp = (dl_capability_ack_t *)mp->b_rptr;
2300 if (capp->dl_sub_length == 0) {
2301 /* no new-style capabilities */
2302 goto done;
2305 /* make sure the driver supplied correct dl_sub_length */
2306 if ((sizeof (*capp) + capp->dl_sub_length) > MBLKL(mp)) {
2307 ip0dbg(("ill_capability_ack: bad DL_CAPABILITY_ACK, "
2308 "invalid dl_sub_length (%d)\n", capp->dl_sub_length));
2309 goto done;
2312 #define SC(base, offset) (dl_capability_sub_t *)(((uchar_t *)(base))+(offset))
2314 * There are sub-capabilities. Process the ones we know about.
2315 * Loop until we don't have room for another sub-cap header.
2317 for (subp = SC(capp, capp->dl_sub_offset),
2318 endp = SC(subp, capp->dl_sub_length - sizeof (*subp));
2319 subp <= endp;
2320 subp = SC(subp, sizeof (dl_capability_sub_t) + subp->dl_length)) {
2322 switch (subp->dl_cap) {
2323 case DL_CAPAB_ID_WRAPPER:
2324 ill_capability_id_ack(ill, mp, subp);
2325 break;
2326 default:
2327 ill_capability_dispatch(ill, mp, subp);
2328 break;
2331 #undef SC
2332 done:
2333 inet_freemsg(mp);
2334 ill_capability_done(ill);
2335 ipsq_exit(ill->ill_phyint->phyint_ipsq);
2339 * This needs to be started in a taskq thread to provide a cv_waitable
2340 * context.
2342 void
2343 ill_capability_ack(ill_t *ill, mblk_t *mp)
2345 ip_stack_t *ipst = ill->ill_ipst;
2347 mp->b_prev = (mblk_t *)ill;
2348 ASSERT(mp->b_next == NULL);
2350 if (taskq_dispatch(system_taskq, ill_capability_ack_thr, mp,
2351 TQ_NOSLEEP) != 0)
2352 return;
2355 * The taskq dispatch failed. Signal the ill_taskq_dispatch thread
2356 * which will do the dispatch using TQ_SLEEP to guarantee success.
2358 mutex_enter(&ipst->ips_capab_taskq_lock);
2359 if (ipst->ips_capab_taskq_head == NULL) {
2360 ASSERT(ipst->ips_capab_taskq_tail == NULL);
2361 ipst->ips_capab_taskq_head = mp;
2362 } else {
2363 ipst->ips_capab_taskq_tail->b_next = mp;
2365 ipst->ips_capab_taskq_tail = mp;
2367 cv_signal(&ipst->ips_capab_taskq_cv);
2368 mutex_exit(&ipst->ips_capab_taskq_lock);
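/*
 * Design note: in the common case the TQ_NOSLEEP dispatch above succeeds;
 * the ips_capab_taskq list and the dedicated ill_taskq_dispatch() thread
 * exist only so that a transient memory shortage cannot cause a
 * DL_CAPABILITY_ACK to be dropped.
 */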
2372 * This routine is called to scan the fragmentation reassembly table for
2373 * the specified ILL for any packets that are starting to smell.
2374 * dead_interval is the maximum time in seconds that will be tolerated. It
2375 * will either be the value specified in ip_g_frag_timeout, or zero if the
2376 * ILL is shutting down and it is time to blow everything off.
2378 * It returns the number of seconds (as a time_t) that the next frag timer
2379 * should be scheduled for, 0 meaning that the timer doesn't need to be
2380 * re-started. Note that the method of calculating next_timeout isn't
2381 * entirely accurate since time will flow between the time we grab
2382 * current_time and the time we schedule the next timeout. This isn't a
2383 * big problem since this is the timer for sending ICMP reassembly time
2384 * exceeded messages, and it doesn't have to be exactly accurate.
2386 * This function is
2387 * sometimes called as writer, although this is not required.
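 *
 * For example, with a dead_interval of 60 seconds and an oldest fragment
 * that arrived 45 seconds ago, nothing is freed and the return value asks
 * for the timer to fire again in 15 seconds.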
2389 time_t
2390 ill_frag_timeout(ill_t *ill, time_t dead_interval)
2392 ipfb_t *ipfb;
2393 ipfb_t *endp;
2394 ipf_t *ipf;
2395 ipf_t *ipfnext;
2396 mblk_t *mp;
2397 time_t current_time = gethrestime_sec();
2398 time_t next_timeout = 0;
2399 uint32_t hdr_length;
2400 mblk_t *send_icmp_head;
2401 mblk_t *send_icmp_head_v6;
2402 ip_stack_t *ipst = ill->ill_ipst;
2403 ip_recv_attr_t iras;
2405 bzero(&iras, sizeof (iras));
2406 iras.ira_flags = 0;
2407 iras.ira_ill = iras.ira_rill = ill;
2408 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
2409 iras.ira_rifindex = iras.ira_ruifindex;
2411 ipfb = ill->ill_frag_hash_tbl;
2412 if (ipfb == NULL)
2413 return (0);
2414 endp = &ipfb[ILL_FRAG_HASH_TBL_COUNT];
2415 /* Walk the frag hash table. */
2416 for (; ipfb < endp; ipfb++) {
2417 send_icmp_head = NULL;
2418 send_icmp_head_v6 = NULL;
2419 mutex_enter(&ipfb->ipfb_lock);
2420 while ((ipf = ipfb->ipfb_ipf) != 0) {
2421 time_t frag_time = current_time - ipf->ipf_timestamp;
2422 time_t frag_timeout;
2424 if (frag_time < dead_interval) {
2426 * There are some outstanding fragments
2427 * that will timeout later. Make note of
2428 * the time so that we can reschedule the
2429 * next timeout appropriately.
2431 frag_timeout = dead_interval - frag_time;
2432 if (next_timeout == 0 ||
2433 frag_timeout < next_timeout) {
2434 next_timeout = frag_timeout;
2436 break;
2438 /* Time's up. Get it out of here. */
2439 hdr_length = ipf->ipf_nf_hdr_len;
2440 ipfnext = ipf->ipf_hash_next;
2441 if (ipfnext)
2442 ipfnext->ipf_ptphn = ipf->ipf_ptphn;
2443 *ipf->ipf_ptphn = ipfnext;
2444 mp = ipf->ipf_mp->b_cont;
2445 for (; mp; mp = mp->b_cont) {
2446 /* Extra points for neatness. */
2447 IP_REASS_SET_START(mp, 0);
2448 IP_REASS_SET_END(mp, 0);
2450 mp = ipf->ipf_mp->b_cont;
2451 atomic_add_32(&ill->ill_frag_count, -ipf->ipf_count);
2452 ASSERT(ipfb->ipfb_count >= ipf->ipf_count);
2453 ipfb->ipfb_count -= ipf->ipf_count;
2454 ASSERT(ipfb->ipfb_frag_pkts > 0);
2455 ipfb->ipfb_frag_pkts--;
2457 * We do not send any icmp message from here because
2458 * we currently are holding the ipfb_lock for this
2459 * hash chain. If we try and send any icmp messages
2460 * from here we may end up via a put back into ip
2461 * trying to get the same lock, causing a recursive
2462 * mutex panic. Instead we build a list and send all
2463 * the icmp messages after we have dropped the lock.
2465 if (ill->ill_isv6) {
2466 if (hdr_length != 0) {
2467 mp->b_next = send_icmp_head_v6;
2468 send_icmp_head_v6 = mp;
2469 } else {
2470 freemsg(mp);
2472 } else {
2473 if (hdr_length != 0) {
2474 mp->b_next = send_icmp_head;
2475 send_icmp_head = mp;
2476 } else {
2477 freemsg(mp);
2480 BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails);
2481 ip_drop_input("ipIfStatsReasmFails", ipf->ipf_mp, ill);
2482 freeb(ipf->ipf_mp);
2484 mutex_exit(&ipfb->ipfb_lock);
2486 * Now need to send any icmp messages that we delayed from
2487 * above.
2489 while (send_icmp_head_v6 != NULL) {
2490 ip6_t *ip6h;
2492 mp = send_icmp_head_v6;
2493 send_icmp_head_v6 = send_icmp_head_v6->b_next;
2494 mp->b_next = NULL;
2495 ip6h = (ip6_t *)mp->b_rptr;
2496 iras.ira_flags = 0;
2498 * This will result in an incorrect ALL_ZONES zoneid
2499 * for multicast packets, but we
2500 * don't send ICMP errors for those in any case.
2502 iras.ira_zoneid =
2503 ipif_lookup_addr_zoneid_v6(&ip6h->ip6_dst,
2504 ill, ipst);
2505 ip_drop_input("ICMP_TIME_EXCEEDED reass", mp, ill);
2506 icmp_time_exceeded_v6(mp,
2507 ICMP_REASSEMBLY_TIME_EXCEEDED, B_FALSE,
2508 &iras);
2509 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
2511 while (send_icmp_head != NULL) {
2512 ipaddr_t dst;
2514 mp = send_icmp_head;
2515 send_icmp_head = send_icmp_head->b_next;
2516 mp->b_next = NULL;
2518 dst = ((ipha_t *)mp->b_rptr)->ipha_dst;
2520 iras.ira_flags = IRAF_IS_IPV4;
2522 * This will result in an incorrect ALL_ZONES zoneid
2523 * for broadcast and multicast packets, but we
2524 * don't send ICMP errors for those in any case.
2526 iras.ira_zoneid = ipif_lookup_addr_zoneid(dst,
2527 ill, ipst);
2528 ip_drop_input("ICMP_TIME_EXCEEDED reass", mp, ill);
2529 icmp_time_exceeded(mp,
2530 ICMP_REASSEMBLY_TIME_EXCEEDED, &iras);
2531 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
2535 * A non-dying ILL will use the return value to decide whether to
2536 * restart the frag timer, and for how long.
2538 return (next_timeout);
2542 * This routine is called when the approximate count of mblk memory used
2543 * for the specified ILL has exceeded max_count.
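 * Pruning happens in two phases: if called again within
 * ip_min_frag_prune_time msecs, an escalating number of the oldest packets
 * (ill_frag_free_num_pkts) is freed from every hash bucket; then, while
 * ill_frag_count still exceeds max_count, the single oldest fragment queue
 * across all buckets is freed, one queue at a time.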
2545 void
2546 ill_frag_prune(ill_t *ill, uint_t max_count)
2548 ipfb_t *ipfb;
2549 ipf_t *ipf;
2550 size_t count;
2551 clock_t now;
2554 * If we are here within ip_min_frag_prune_time msecs remove
2555 * ill_frag_free_num_pkts oldest packets from each bucket and increment
2556 * ill_frag_free_num_pkts.
2558 mutex_enter(&ill->ill_lock);
2559 now = ddi_get_lbolt();
2560 if (TICK_TO_MSEC(now - ill->ill_last_frag_clean_time) <=
2561 (ip_min_frag_prune_time != 0 ?
2562 ip_min_frag_prune_time : msec_per_tick)) {
2564 ill->ill_frag_free_num_pkts++;
2566 } else {
2567 ill->ill_frag_free_num_pkts = 0;
2569 ill->ill_last_frag_clean_time = now;
2570 mutex_exit(&ill->ill_lock);
2573 * free ill_frag_free_num_pkts oldest packets from each bucket.
2575 if (ill->ill_frag_free_num_pkts != 0) {
2576 int ix;
2578 for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) {
2579 ipfb = &ill->ill_frag_hash_tbl[ix];
2580 mutex_enter(&ipfb->ipfb_lock);
2581 if (ipfb->ipfb_ipf != NULL) {
2582 ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf,
2583 ill->ill_frag_free_num_pkts);
2585 mutex_exit(&ipfb->ipfb_lock);
2589 * While the reassembly list for this ILL is too big, prune a fragment
2590 * queue by age, oldest first.
2592 while (ill->ill_frag_count > max_count) {
2593 int ix;
2594 ipfb_t *oipfb = NULL;
2595 uint_t oldest = UINT_MAX;
2597 count = 0;
2598 for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) {
2599 ipfb = &ill->ill_frag_hash_tbl[ix];
2600 mutex_enter(&ipfb->ipfb_lock);
2601 ipf = ipfb->ipfb_ipf;
2602 if (ipf != NULL && ipf->ipf_gen < oldest) {
2603 oldest = ipf->ipf_gen;
2604 oipfb = ipfb;
2606 count += ipfb->ipfb_count;
2607 mutex_exit(&ipfb->ipfb_lock);
2609 if (oipfb == NULL)
2610 break;
2612 if (count <= max_count)
2613 return; /* Somebody beat us to it, nothing to do */
2614 mutex_enter(&oipfb->ipfb_lock);
2615 ipf = oipfb->ipfb_ipf;
2616 if (ipf != NULL) {
2617 ill_frag_free_pkts(ill, oipfb, ipf, 1);
2619 mutex_exit(&oipfb->ipfb_lock);
2624 * free 'free_cnt' fragmented packets starting at ipf.
2626 void
2627 ill_frag_free_pkts(ill_t *ill, ipfb_t *ipfb, ipf_t *ipf, int free_cnt)
2629 size_t count;
2630 mblk_t *mp;
2631 mblk_t *tmp;
2632 ipf_t **ipfp = ipf->ipf_ptphn;
2634 ASSERT(MUTEX_HELD(&ipfb->ipfb_lock));
2635 ASSERT(ipfp != NULL);
2636 ASSERT(ipf != NULL);
2638 while (ipf != NULL && free_cnt-- > 0) {
2639 count = ipf->ipf_count;
2640 mp = ipf->ipf_mp;
2641 ipf = ipf->ipf_hash_next;
2642 for (tmp = mp; tmp; tmp = tmp->b_cont) {
2643 IP_REASS_SET_START(tmp, 0);
2644 IP_REASS_SET_END(tmp, 0);
2646 atomic_add_32(&ill->ill_frag_count, -count);
2647 ASSERT(ipfb->ipfb_count >= count);
2648 ipfb->ipfb_count -= count;
2649 ASSERT(ipfb->ipfb_frag_pkts > 0);
2650 ipfb->ipfb_frag_pkts--;
2651 BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails);
2652 ip_drop_input("ipIfStatsReasmFails", mp, ill);
2653 freemsg(mp);
2656 if (ipf)
2657 ipf->ipf_ptphn = ipfp;
2658 ipfp[0] = ipf;
2662 * Helper function for ill_forward_set().
2664 static void
2665 ill_forward_set_on_ill(ill_t *ill, boolean_t enable)
2667 ip_stack_t *ipst = ill->ill_ipst;
2669 ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock));
2671 ip1dbg(("ill_forward_set: %s %s forwarding on %s",
2672 (enable ? "Enabling" : "Disabling"),
2673 (ill->ill_isv6 ? "IPv6" : "IPv4"), ill->ill_name));
2674 mutex_enter(&ill->ill_lock);
2675 if (enable)
2676 ill->ill_flags |= ILLF_ROUTER;
2677 else
2678 ill->ill_flags &= ~ILLF_ROUTER;
2679 mutex_exit(&ill->ill_lock);
2680 if (ill->ill_isv6)
2681 ill_set_nce_router_flags(ill, enable);
2682 /* Notify routing socket listeners of this change. */
2683 if (ill->ill_ipif != NULL)
2684 ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT);
2688 * Set an ill's ILLF_ROUTER flag appropriately. Send up RTS_IFINFO routing
2689 * socket messages for each interface whose flags we change.
2692 ill_forward_set(ill_t *ill, boolean_t enable)
2694 ipmp_illgrp_t *illg;
2695 ip_stack_t *ipst = ill->ill_ipst;
2697 ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock));
2699 if ((enable && (ill->ill_flags & ILLF_ROUTER)) ||
2700 (!enable && !(ill->ill_flags & ILLF_ROUTER)))
2701 return (0);
2703 if (IS_LOOPBACK(ill))
2704 return (EINVAL);
2706 if (enable && ill->ill_allowed_ips_cnt > 0)
2707 return (EPERM);
2709 if (IS_IPMP(ill) || IS_UNDER_IPMP(ill)) {
2711 * Update all of the interfaces in the group.
2713 illg = ill->ill_grp;
2714 ill = list_head(&illg->ig_if);
2715 for (; ill != NULL; ill = list_next(&illg->ig_if, ill))
2716 ill_forward_set_on_ill(ill, enable);
2719 * Update the IPMP meta-interface.
2721 ill_forward_set_on_ill(ipmp_illgrp_ipmp_ill(illg), enable);
2722 return (0);
2725 ill_forward_set_on_ill(ill, enable);
2726 return (0);
2730 * Based on the ILLF_ROUTER flag of an ill, make sure all local nce's for
2731 * addresses assigned to the ill have the NCE_F_ISROUTER flag appropriately
2732 * set or clear.
2734 static void
2735 ill_set_nce_router_flags(ill_t *ill, boolean_t enable)
2737 ipif_t *ipif;
2738 ncec_t *ncec;
2739 nce_t *nce;
2741 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
2743 * NOTE: we match across the illgrp because nce's for
2744 * addresses on IPMP interfaces have an nce_ill that points to
2745 * the bound underlying ill.
2747 nce = nce_lookup_v6(ill, &ipif->ipif_v6lcl_addr);
2748 if (nce != NULL) {
2749 ncec = nce->nce_common;
2750 mutex_enter(&ncec->ncec_lock);
2751 if (enable)
2752 ncec->ncec_flags |= NCE_F_ISROUTER;
2753 else
2754 ncec->ncec_flags &= ~NCE_F_ISROUTER;
2755 mutex_exit(&ncec->ncec_lock);
2756 nce_refrele(nce);
2762 * Initializes the context structure and returns the first ill in the list.
2763 * Currently start_list and end_list can have values:
2764 * MAX_G_HEADS Traverse both IPV4 and IPV6 lists.
2765 * IP_V4_G_HEAD Traverse IPV4 list only.
2766 * IP_V6_G_HEAD Traverse IPV6 list only.
2770 * We don't check for CONDEMNED ills here. Caller must do that if
2771 * necessary under the ill lock.
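 *
 * A typical walk, holding ill_g_lock as reader (a sketch; the
 * ILL_START_WALK_* convenience macros in ip.h wrap the ill_first() call):
 *
 *	ill_walk_context_t ctx;
 *	ill_t *ill;
 *
 *	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
 *	for (ill = ILL_START_WALK_ALL(&ctx, ipst); ill != NULL;
 *	    ill = ill_next(&ctx, ill)) {
 *		... per-ill work ...
 *	}
 *	rw_exit(&ipst->ips_ill_g_lock);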
2773 ill_t *
2774 ill_first(int start_list, int end_list, ill_walk_context_t *ctx,
2775 ip_stack_t *ipst)
2777 ill_if_t *ifp;
2778 ill_t *ill;
2779 avl_tree_t *avl_tree;
2781 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));
2782 ASSERT(end_list <= MAX_G_HEADS && start_list >= 0);
2785 * setup the lists to search
2787 if (end_list != MAX_G_HEADS) {
2788 ctx->ctx_current_list = start_list;
2789 ctx->ctx_last_list = end_list;
2790 } else {
2791 ctx->ctx_last_list = MAX_G_HEADS - 1;
2792 ctx->ctx_current_list = 0;
2795 while (ctx->ctx_current_list <= ctx->ctx_last_list) {
2796 ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst);
2797 if (ifp != (ill_if_t *)
2798 &IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)) {
2799 avl_tree = &ifp->illif_avl_by_ppa;
2800 ill = avl_first(avl_tree);
2802 * ill is guaranteed to be non-NULL; otherwise ifp would
2803 * not have existed.
2805 ASSERT(ill != NULL);
2806 return (ill);
2808 ctx->ctx_current_list++;
2811 return (NULL);
2815 * returns the next ill in the list. ill_first() must have been called
2816 * before calling ill_next() or bad things will happen.
2820 * We don't check for CONDEMNED ills here. Caller must do that if
2821 * necessary under the ill lock.
2823 ill_t *
2824 ill_next(ill_walk_context_t *ctx, ill_t *lastill)
2826 ill_if_t *ifp;
2827 ill_t *ill;
2828 ip_stack_t *ipst = lastill->ill_ipst;
2830 ASSERT(lastill->ill_ifptr != (ill_if_t *)
2831 &IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst));
2832 if ((ill = avl_walk(&lastill->ill_ifptr->illif_avl_by_ppa, lastill,
2833 AVL_AFTER)) != NULL) {
2834 return (ill);
2837 /* goto next ill_ifp in the list. */
2838 ifp = lastill->ill_ifptr->illif_next;
2840 /* make sure not at end of circular list */
2841 while (ifp ==
2842 (ill_if_t *)&IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)) {
2843 if (++ctx->ctx_current_list > ctx->ctx_last_list)
2844 return (NULL);
2845 ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst);
2848 return (avl_first(&ifp->illif_avl_by_ppa));
2852 * Check interface name for correct format: [a-zA-Z]+[a-zA-Z0-9._]*[0-9]+
2853 * The final number (PPA) must not have any leading zeros. Upon success, a
2854 * pointer to the start of the PPA is returned; otherwise NULL is returned.
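 *
 * For example, "e1000g0" yields a pointer to the trailing "0", while
 * "hme01" is rejected for its leading zero in the PPA and "0ge0" for its
 * leading digit.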
2856 static char *
2857 ill_get_ppa_ptr(char *name)
2859 int namelen = strlen(name);
2860 int end_ndx = namelen - 1;
2861 int ppa_ndx, i;
2864 * Check that the first character is [a-zA-Z], and that the last
2865 * character is [0-9].
2867 if (namelen == 0 || !isalpha(name[0]) || !isdigit(name[end_ndx]))
2868 return (NULL);
2871 * Set `ppa_ndx' to the PPA start, and check for leading zeroes.
2873 for (ppa_ndx = end_ndx; ppa_ndx > 0; ppa_ndx--)
2874 if (!isdigit(name[ppa_ndx - 1]))
2875 break;
2877 if (name[ppa_ndx] == '0' && ppa_ndx < end_ndx)
2878 return (NULL);
2881 * Check that the intermediate characters are [a-zA-Z0-9._]
2883 for (i = 1; i < ppa_ndx; i++) {
2884 if (!isalpha(name[i]) && !isdigit(name[i]) &&
2885 name[i] != '.' && name[i] != '_') {
2886 return (NULL);
2890 return (name + ppa_ndx);
2894 * use avl tree to locate the ill.
2896 static ill_t *
2897 ill_find_by_name(char *name, boolean_t isv6, ip_stack_t *ipst)
2899 char *ppa_ptr = NULL;
2900 int len;
2901 uint_t ppa;
2902 ill_t *ill = NULL;
2903 ill_if_t *ifp;
2904 int list;
2907 * get ppa ptr
2909 if (isv6)
2910 list = IP_V6_G_HEAD;
2911 else
2912 list = IP_V4_G_HEAD;
2914 if ((ppa_ptr = ill_get_ppa_ptr(name)) == NULL) {
2915 return (NULL);
2918 len = ppa_ptr - name + 1;
2920 ppa = stoi(&ppa_ptr);
2922 ifp = IP_VX_ILL_G_LIST(list, ipst);
2924 while (ifp != (ill_if_t *)&IP_VX_ILL_G_LIST(list, ipst)) {
2926 * The match is done on len - 1 since the name is not null-
2927 * terminated; it contains the ppa in addition to the interface
2928 * name.
2930 if ((ifp->illif_name_len == len) &&
2931 bcmp(ifp->illif_name, name, len - 1) == 0) {
2932 break;
2933 } else {
2934 ifp = ifp->illif_next;
2938 if (ifp == (ill_if_t *)&IP_VX_ILL_G_LIST(list, ipst)) {
2940 * Not even the interface type exists.
2942 return (NULL);
2945 ill = avl_find(&ifp->illif_avl_by_ppa, (void *) &ppa, NULL);
2946 if (ill != NULL) {
2947 mutex_enter(&ill->ill_lock);
2948 if (ILL_CAN_LOOKUP(ill)) {
2949 ill_refhold_locked(ill);
2950 mutex_exit(&ill->ill_lock);
2951 return (ill);
2953 mutex_exit(&ill->ill_lock);
2955 return (NULL);
2959 * comparison function for use with avl.
2961 static int
2962 ill_compare_ppa(const void *ppa_ptr, const void *ill_ptr)
2964 uint_t ppa;
2965 uint_t ill_ppa;
2967 ASSERT(ppa_ptr != NULL && ill_ptr != NULL);
2969 ppa = *((uint_t *)ppa_ptr);
2970 ill_ppa = ((const ill_t *)ill_ptr)->ill_ppa;
2972 * We want the ill with the lowest ppa to be on the
2973 * top.
2975 if (ill_ppa < ppa)
2976 return (1);
2977 if (ill_ppa > ppa)
2978 return (-1);
2979 return (0);
2983 * remove an interface type from the global list.
2985 static void
2986 ill_delete_interface_type(ill_if_t *interface)
2988 ASSERT(interface != NULL);
2989 ASSERT(avl_numnodes(&interface->illif_avl_by_ppa) == 0);
2991 avl_destroy(&interface->illif_avl_by_ppa);
2992 if (interface->illif_ppa_arena != NULL)
2993 vmem_destroy(interface->illif_ppa_arena);
2995 remque(interface);
2997 mi_free(interface);
3001 * remove ill from the global list.
3003 static void
3004 ill_glist_delete(ill_t *ill)
3006 ip_stack_t *ipst;
3007 phyint_t *phyi;
3009 if (ill == NULL)
3010 return;
3011 ipst = ill->ill_ipst;
3012 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
3015 * If the ill was never inserted into the AVL tree
3016 * we skip the if branch.
3018 if (ill->ill_ifptr != NULL) {
3020 * remove from AVL tree and free ppa number
3022 avl_remove(&ill->ill_ifptr->illif_avl_by_ppa, ill);
3024 if (ill->ill_ifptr->illif_ppa_arena != NULL) {
3025 vmem_free(ill->ill_ifptr->illif_ppa_arena,
3026 (void *)(uintptr_t)(ill->ill_ppa+1), 1);
3028 if (avl_numnodes(&ill->ill_ifptr->illif_avl_by_ppa) == 0) {
3029 ill_delete_interface_type(ill->ill_ifptr);
3033 * Indicate ill is no longer in the list.
3035 ill->ill_ifptr = NULL;
3036 ill->ill_name_length = 0;
3037 ill->ill_name[0] = '\0';
3038 ill->ill_ppa = UINT_MAX;
3041 /* Generate one last event for this ill. */
3042 ill_nic_event_dispatch(ill, 0, NE_UNPLUMB, ill->ill_name,
3043 ill->ill_name_length);
3045 ASSERT(ill->ill_phyint != NULL);
3046 phyi = ill->ill_phyint;
3047 ill->ill_phyint = NULL;
3050 * ill_init always allocates a phyint to store the copy
3051 * of flags relevant to phyint. At that point in time, we could
3052 * not assign the name and hence phyint_illv4/v6 could not be
3053 * initialized. Later in ipif_set_values, we assign the name to
3054 * the ill, at which point in time we assign phyint_illv4/v6.
3055 * Thus we don't rely on phyint_illv6 to be initialized always.
3057 if (ill->ill_flags & ILLF_IPV6)
3058 phyi->phyint_illv6 = NULL;
3059 else
3060 phyi->phyint_illv4 = NULL;
3062 if (phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL) {
3063 rw_exit(&ipst->ips_ill_g_lock);
3064 return;
3068 * There are no ills left on this phyint; pull it out of the phyint
3069 * avl trees, and free it.
3071 if (phyi->phyint_ifindex > 0) {
3072 avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
3073 phyi);
3074 avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
3075 phyi);
3077 rw_exit(&ipst->ips_ill_g_lock);
3079 phyint_free(phyi);
3083 * Allocate a ppa. If the number of plumbed interfaces of this type is
3084 * less than ill_no_arena, do a linear search to find an unused ppa.
3085 * When the number goes beyond ill_no_arena, switch to using an arena.
3086 * Note: a ppa value of zero cannot be allocated from the vmem arena, as
3087 * zero is the return value for an error condition; so allocation starts
3088 * at one and the result is decremented by one.
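 * For example, ppa 0 is represented by vmem address 1: a successful
 * vmem_alloc() return of 1 is decremented to yield ill_ppa 0, and a
 * return of 0 still unambiguously signals failure.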
3090 static int
3091 ill_alloc_ppa(ill_if_t *ifp, ill_t *ill)
3093 ill_t *tmp_ill;
3094 uint_t start, end;
3095 int ppa;
3097 if (ifp->illif_ppa_arena == NULL &&
3098 (avl_numnodes(&ifp->illif_avl_by_ppa) + 1 > ill_no_arena)) {
3100 * Create an arena.
3102 ifp->illif_ppa_arena = vmem_create(ifp->illif_name,
3103 (void *)1, UINT_MAX - 1, 1, NULL, NULL,
3104 NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
3105 /* allocate what has already been assigned */
3106 for (tmp_ill = avl_first(&ifp->illif_avl_by_ppa);
3107 tmp_ill != NULL; tmp_ill = avl_walk(&ifp->illif_avl_by_ppa,
3108 tmp_ill, AVL_AFTER)) {
3109 ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena,
3110 1, /* size */
3111 1, /* align/quantum */
3112 0, /* phase */
3113 0, /* nocross */
3114 /* minaddr */
3115 (void *)((uintptr_t)tmp_ill->ill_ppa + 1),
3116 /* maxaddr */
3117 (void *)((uintptr_t)tmp_ill->ill_ppa + 2),
3118 VM_NOSLEEP|VM_FIRSTFIT);
3119 if (ppa == 0) {
3120 ip1dbg(("ill_alloc_ppa: ppa allocation"
3121 " failed while switching"));
3122 vmem_destroy(ifp->illif_ppa_arena);
3123 ifp->illif_ppa_arena = NULL;
3124 break;
3129 if (ifp->illif_ppa_arena != NULL) {
3130 if (ill->ill_ppa == UINT_MAX) {
3131 ppa = (int)(uintptr_t)vmem_alloc(ifp->illif_ppa_arena,
3132 1, VM_NOSLEEP|VM_FIRSTFIT);
3133 if (ppa == 0)
3134 return (EAGAIN);
3135 ill->ill_ppa = --ppa;
3136 } else {
3137 ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena,
3138 1, /* size */
3139 1, /* align/quantum */
3140 0, /* phase */
3141 0, /* nocross */
3142 (void *)(uintptr_t)(ill->ill_ppa + 1), /* minaddr */
3143 (void *)(uintptr_t)(ill->ill_ppa + 2), /* maxaddr */
3144 VM_NOSLEEP|VM_FIRSTFIT);
3146 * Most likely the allocation failed because
3147 * the requested ppa was in use.
3149 if (ppa == 0)
3150 return (EEXIST);
3152 return (0);
3156 * No arena is in use and not enough (>ill_no_arena) interfaces have
3157 * been plumbed to create one. Do a linear search to get an unused ppa.
3159 if (ill->ill_ppa == UINT_MAX) {
3160 end = UINT_MAX - 1;
3161 start = 0;
3162 } else {
3163 end = start = ill->ill_ppa;
3166 tmp_ill = avl_find(&ifp->illif_avl_by_ppa, (void *)&start, NULL);
3167 while (tmp_ill != NULL && tmp_ill->ill_ppa == start) {
3168 if (start++ >= end) {
3169 if (ill->ill_ppa == UINT_MAX)
3170 return (EAGAIN);
3171 else
3172 return (EEXIST);
3174 tmp_ill = avl_walk(&ifp->illif_avl_by_ppa, tmp_ill, AVL_AFTER);
3176 ill->ill_ppa = start;
3177 return (0);
3181 * Insert ill into the list of configured ill's. Once this function completes,
3182 * the ill is globally visible and is available through lookups. More precisely
3183 * this happens after the caller drops the ill_g_lock.
3185 static int
3186 ill_glist_insert(ill_t *ill, char *name, boolean_t isv6)
3188 ill_if_t *ill_interface;
3189 avl_index_t where = 0;
3190 int error;
3191 int name_length;
3192 int index;
3193 boolean_t check_length = B_FALSE;
3194 ip_stack_t *ipst = ill->ill_ipst;
3196 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
3198 name_length = mi_strlen(name) + 1;
3200 if (isv6)
3201 index = IP_V6_G_HEAD;
3202 else
3203 index = IP_V4_G_HEAD;
3205 ill_interface = IP_VX_ILL_G_LIST(index, ipst);
3207 * Search for interface type based on name
3209 while (ill_interface != (ill_if_t *)&IP_VX_ILL_G_LIST(index, ipst)) {
3210 if ((ill_interface->illif_name_len == name_length) &&
3211 (strcmp(ill_interface->illif_name, name) == 0)) {
3212 break;
3214 ill_interface = ill_interface->illif_next;
3218 * Interface type not found, create one.
3220 if (ill_interface == (ill_if_t *)&IP_VX_ILL_G_LIST(index, ipst)) {
3221 ill_g_head_t ghead;
3224 * allocate ill_if_t structure
3226 ill_interface = (ill_if_t *)mi_zalloc(sizeof (ill_if_t));
3227 if (ill_interface == NULL) {
3228 return (ENOMEM);
3231 (void) strcpy(ill_interface->illif_name, name);
3232 ill_interface->illif_name_len = name_length;
3234 avl_create(&ill_interface->illif_avl_by_ppa,
3235 ill_compare_ppa, sizeof (ill_t),
3236 offsetof(struct ill_s, ill_avl_byppa));
3239 * link the structure in at the back to maintain the order
3240 * of configuration for ifconfig output.
3242 ghead = ipst->ips_ill_g_heads[index];
3243 insque(ill_interface, ghead.ill_g_list_tail);
3246 if (ill->ill_ppa == UINT_MAX)
3247 check_length = B_TRUE;
3249 error = ill_alloc_ppa(ill_interface, ill);
3250 if (error != 0) {
3251 if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 0)
3252 ill_delete_interface_type(ill->ill_ifptr);
3253 return (error);
3257 * When the ppa is chosen by the system, check that there is
3258 * enough space to insert the ppa. If a specific ppa was passed in, this
3259 * check is not required as the interface name passed in will have
3260 * the right ppa in it.
3262 if (check_length) {
3264 * UINT_MAX - 1 should fit in 10 chars, alloc 12 chars.
3266 char buf[sizeof (uint_t) * 3];
3269 * convert ppa to string to calculate the amount of space
3270 * required for it in the name.
3272 numtos(ill->ill_ppa, buf);
3274 /* Do we have enough space to insert ppa ? */
3276 if ((mi_strlen(name) + mi_strlen(buf) + 1) > LIFNAMSIZ) {
3277 /* Free ppa and interface type struct */
3278 if (ill_interface->illif_ppa_arena != NULL) {
3279 vmem_free(ill_interface->illif_ppa_arena,
3280 (void *)(uintptr_t)(ill->ill_ppa+1), 1);
3282 if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 0)
3283 ill_delete_interface_type(ill->ill_ifptr);
3285 return (EINVAL);
3289 (void) sprintf(ill->ill_name, "%s%u", name, ill->ill_ppa);
3290 ill->ill_name_length = mi_strlen(ill->ill_name) + 1;
3292 (void) avl_find(&ill_interface->illif_avl_by_ppa, &ill->ill_ppa,
3293 &where);
3294 ill->ill_ifptr = ill_interface;
3295 avl_insert(&ill_interface->illif_avl_by_ppa, ill, where);
3297 ill_phyint_reinit(ill);
3298 return (0);
3301 /* Initialize the per phyint ipsq used for serialization */
3302 static boolean_t
3303 ipsq_init(ill_t *ill, boolean_t enter)
3305 ipsq_t *ipsq;
3306 ipxop_t *ipx;
3308 if ((ipsq = kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP)) == NULL)
3309 return (B_FALSE);
3311 ill->ill_phyint->phyint_ipsq = ipsq;
3312 ipx = ipsq->ipsq_xop = &ipsq->ipsq_ownxop;
3313 ipx->ipx_ipsq = ipsq;
3314 ipsq->ipsq_next = ipsq;
3315 ipsq->ipsq_phyint = ill->ill_phyint;
3316 mutex_init(&ipsq->ipsq_lock, NULL, MUTEX_DEFAULT, 0);
3317 mutex_init(&ipx->ipx_lock, NULL, MUTEX_DEFAULT, 0);
3318 ipsq->ipsq_ipst = ill->ill_ipst; /* No netstack_hold */
3319 if (enter) {
3320 ipx->ipx_writer = curthread;
3321 ipx->ipx_forced = B_FALSE;
3322 ipx->ipx_reentry_cnt = 1;
3323 #ifdef DEBUG
3324 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH);
3325 #endif
3327 return (B_TRUE);
3331 * Here we perform initialisation of the ill_t common to both regular
3332 * interface ILLs and the special loopback ILL created by ill_lookup_on_name.
3334 static int
3335 ill_init_common(ill_t *ill, queue_t *q, boolean_t isv6, boolean_t is_loopback,
3336 boolean_t ipsq_enter)
3338 int count;
3339 uchar_t *frag_ptr;
3341 mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, 0);
3342 mutex_init(&ill->ill_saved_ire_lock, NULL, MUTEX_DEFAULT, NULL);
3343 ill->ill_saved_ire_cnt = 0;
3345 if (is_loopback) {
3346 ill->ill_max_frag = isv6 ? ip_loopback_mtu_v6plus :
3347 ip_loopback_mtuplus;
3349 * No resolver here.
3351 ill->ill_net_type = IRE_LOOPBACK;
3352 } else {
3353 ill->ill_rq = q;
3354 ill->ill_wq = WR(q);
3355 ill->ill_ppa = UINT_MAX;
3358 ill->ill_isv6 = isv6;
3361 * Allocate sufficient space to contain our fragment hash table and
3362 * the device name.
3364 frag_ptr = (uchar_t *)mi_zalloc(ILL_FRAG_HASH_TBL_SIZE + 2 * LIFNAMSIZ);
3365 if (frag_ptr == NULL)
3366 return (ENOMEM);
3367 ill->ill_frag_ptr = frag_ptr;
3368 ill->ill_frag_free_num_pkts = 0;
3369 ill->ill_last_frag_clean_time = 0;
3370 ill->ill_frag_hash_tbl = (ipfb_t *)frag_ptr;
3371 ill->ill_name = (char *)(frag_ptr + ILL_FRAG_HASH_TBL_SIZE);
3372 for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) {
3373 mutex_init(&ill->ill_frag_hash_tbl[count].ipfb_lock,
3374 NULL, MUTEX_DEFAULT, NULL);
3377 ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t));
3378 if (ill->ill_phyint == NULL) {
3379 mi_free(frag_ptr);
3380 return (ENOMEM);
3383 mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0);
3384 if (isv6) {
3385 ill->ill_phyint->phyint_illv6 = ill;
3386 } else {
3387 ill->ill_phyint->phyint_illv4 = ill;
3389 if (is_loopback) {
3390 phyint_flags_init(ill->ill_phyint, DL_LOOP);
3393 list_create(&ill->ill_nce, sizeof (nce_t), offsetof(nce_t, nce_node));
3395 ill_set_inputfn(ill);
3397 if (!ipsq_init(ill, ipsq_enter)) {
3398 mi_free(frag_ptr);
3399 mi_free(ill->ill_phyint);
3400 return (ENOMEM);
3403 /* Frag queue limit stuff */
3404 ill->ill_frag_count = 0;
3405 ill->ill_ipf_gen = 0;
3407 rw_init(&ill->ill_mcast_lock, NULL, RW_DEFAULT, NULL);
3408 mutex_init(&ill->ill_mcast_serializer, NULL, MUTEX_DEFAULT, NULL);
3409 ill->ill_global_timer = INFINITY;
3410 ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0;
3411 ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0;
3412 ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS;
3413 ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL;
3416 * Initialize IPv6 configuration variables. The IP module is always
3417 * opened as an IPv4 module. Instead of tracking down the cases where
3418 * it switches to do IPv6, we'll just initialize the IPv6 configuration
3419 * here for convenience; this has no effect until the ill is set to do
3420 * IPv6.
3422 ill->ill_reachable_time = ND_REACHABLE_TIME;
3423 ill->ill_xmit_count = ND_MAX_MULTICAST_SOLICIT;
3424 ill->ill_max_buf = ND_MAX_Q;
3425 ill->ill_refcnt = 0;
3427 return (0);
3431 * ill_init is called by ip_open when a device control stream is opened.
3432 * It does a few initializations, and shoots a DL_INFO_REQ message down
3433 * to the driver. The response is later picked up in ip_rput_dlpi and
3434 * used to set up default mechanisms for talking to the driver. (Always
3435 * called as writer.)
3437 * If this function returns error, ip_open will call ip_close which in
3438 * turn will call ill_delete to clean up any memory allocated here that
3439 * is not yet freed.
3441 * Note: ill_ipst and ill_zoneid must be set before calling ill_init.
3444 ill_init(queue_t *q, ill_t *ill)
3446 int ret;
3447 dl_info_req_t *dlir;
3448 mblk_t *info_mp;
3450 info_mp = allocb(MAX(sizeof (dl_info_req_t), sizeof (dl_info_ack_t)),
3451 BPRI_HI);
3452 if (info_mp == NULL)
3453 return (ENOMEM);
3456 * For now pretend this is a v4 ill. We need to set phyint_ill*
3457 * at this point for the following reason: if we can't
3458 * enter the ipsq at some point and cv_wait, the writer that
3459 * wakes us up tries to locate us using the list of all phyints
3460 * in an ipsq and the ills from the phyint thru the phyint_ill*.
3461 * If we don't set it now, we risk a missed wakeup.
3463 if ((ret = ill_init_common(ill, q, B_FALSE, B_FALSE, B_TRUE)) != 0) {
3464 freemsg(info_mp);
3465 return (ret);
3468 ill->ill_state_flags |= ILL_LL_SUBNET_PENDING;
3470 /* Send down the Info Request to the driver. */
3471 info_mp->b_datap->db_type = M_PCPROTO;
3472 dlir = (dl_info_req_t *)info_mp->b_rptr;
3473 info_mp->b_wptr = (uchar_t *)&dlir[1];
3474 dlir->dl_primitive = DL_INFO_REQ;
3476 ill->ill_dlpi_pending = DL_PRIM_INVAL;
3478 qprocson(q);
3479 ill_dlpi_send(ill, info_mp);
3481 return (0);
3485 * ill_dls_info
3486 * creates datalink socket info from the device.
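 * The link name is placed at the front of sdl_data, and the physical
 * address (when present) immediately follows it, giving the usual
 * sockaddr_dl layout of sdl_nlen name bytes followed by sdl_alen
 * address bytes.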
3489 ill_dls_info(struct sockaddr_dl *sdl, const ill_t *ill)
3491 size_t len;
3493 sdl->sdl_family = AF_LINK;
3494 sdl->sdl_index = ill_get_upper_ifindex(ill);
3495 sdl->sdl_type = ill->ill_type;
3496 ill_get_name(ill, sdl->sdl_data, sizeof (sdl->sdl_data));
3497 len = strlen(sdl->sdl_data);
3498 ASSERT(len < 256);
3499 sdl->sdl_nlen = (uchar_t)len;
3500 sdl->sdl_alen = ill->ill_phys_addr_length;
3501 sdl->sdl_slen = 0;
3502 if (ill->ill_phys_addr_length != 0 && ill->ill_phys_addr != NULL)
3503 bcopy(ill->ill_phys_addr, &sdl->sdl_data[len], sdl->sdl_alen);
3505 return (sizeof (struct sockaddr_dl));
3509 * ill_xarp_info
3510 * creates xarp info from the device.
3512 static int
3513 ill_xarp_info(struct sockaddr_dl *sdl, ill_t *ill)
3515 sdl->sdl_family = AF_LINK;
3516 sdl->sdl_index = ill->ill_phyint->phyint_ifindex;
3517 sdl->sdl_type = ill->ill_type;
3518 ill_get_name(ill, sdl->sdl_data, sizeof (sdl->sdl_data));
3519 sdl->sdl_nlen = (uchar_t)mi_strlen(sdl->sdl_data);
3520 sdl->sdl_alen = ill->ill_phys_addr_length;
3521 sdl->sdl_slen = 0;
3522 return (sdl->sdl_nlen);
3525 static int
3526 loopback_kstat_update(kstat_t *ksp, int rw)
3528 kstat_named_t *kn;
3529 netstackid_t stackid;
3530 netstack_t *ns;
3531 ip_stack_t *ipst;
3533 if (ksp == NULL || ksp->ks_data == NULL)
3534 return (EIO);
3536 if (rw == KSTAT_WRITE)
3537 return (EACCES);
3539 kn = KSTAT_NAMED_PTR(ksp);
3540 stackid = (zoneid_t)(uintptr_t)ksp->ks_private;
3542 ns = netstack_find_by_stackid(stackid);
3543 if (ns == NULL)
3544 return (-1);
3546 ipst = ns->netstack_ip;
3547 if (ipst == NULL) {
3548 netstack_rele(ns);
3549 return (-1);
3551 kn[0].value.ui32 = ipst->ips_loopback_packets;
3552 kn[1].value.ui32 = ipst->ips_loopback_packets;
3553 netstack_rele(ns);
3554 return (0);
3558 * Has ifindex been plumbed already?
3560 static boolean_t
3561 phyint_exists(uint_t index, ip_stack_t *ipst)
3563 ASSERT(index != 0);
3564 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));
3566 return (avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
3567 &index, NULL) != NULL);
3571 * Pick a unique ifindex.
3572 * When the index counter passes IF_INDEX_MAX for the first time, the wrap
3573 * flag is set so that the next time ip_assign_ifindex() is called, it
3574 * falls through and resets the index counter back to 1, the minimum value
3575 * for the interface index. The logic below assumes that ips_ill_index
3576 * can hold a value of IF_INDEX_MAX+1 without any loss
3577 * (i.e. without wrapping back to 0).
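 *
 * For example, the first pass simply hands out 1, 2, 3, ... up to
 * IF_INDEX_MAX; once the wrap flag is set, each call instead probes
 * linearly from the current counter for an index with no existing phyint.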
3579 boolean_t
3580 ip_assign_ifindex(uint_t *indexp, ip_stack_t *ipst)
3582 uint_t loops;
3584 if (!ipst->ips_ill_index_wrap) {
3585 *indexp = ipst->ips_ill_index++;
3586 if (ipst->ips_ill_index > IF_INDEX_MAX) {
3588 * Reached the maximum ifindex value, set the wrap
3589 * flag to indicate that it is no longer possible
3590 * to assume that a given index is unallocated.
3592 ipst->ips_ill_index_wrap = B_TRUE;
3594 return (B_TRUE);
3597 if (ipst->ips_ill_index > IF_INDEX_MAX)
3598 ipst->ips_ill_index = 1;
3601 * Start reusing unused indexes. Note that we hold the ill_g_lock
3602 * at this point and don't want to call any function that attempts
3603 * to get the lock again.
3605 for (loops = IF_INDEX_MAX; loops > 0; loops--) {
3606 if (!phyint_exists(ipst->ips_ill_index, ipst)) {
3607 /* found unused index - use it */
3608 *indexp = ipst->ips_ill_index;
3609 return (B_TRUE);
3612 ipst->ips_ill_index++;
3613 if (ipst->ips_ill_index > IF_INDEX_MAX)
3614 ipst->ips_ill_index = 1;
3618 * all interface indices are in use.
3620 return (B_FALSE);
3624 * Assign a unique interface index for the phyint.
3626 static boolean_t
3627 phyint_assign_ifindex(phyint_t *phyi, ip_stack_t *ipst)
3629 ASSERT(phyi->phyint_ifindex == 0);
3630 return (ip_assign_ifindex(&phyi->phyint_ifindex, ipst));
3634 * Initialize the flags on `phyi' as per the provided mactype.
3636 static void
3637 phyint_flags_init(phyint_t *phyi, t_uscalar_t mactype)
3639 uint64_t flags = 0;
3642 * Initialize PHYI_RUNNING and PHYI_FAILED. For non-IPMP interfaces,
3643 * we always presume the underlying hardware is working and set
3644 * PHYI_RUNNING (if it's not, the driver will subsequently send a
3645 * DL_NOTE_LINK_DOWN message). For IPMP interfaces, at initialization
3646 * there are no active interfaces in the group so we set PHYI_FAILED.
3648 if (mactype == SUNW_DL_IPMP)
3649 flags |= PHYI_FAILED;
3650 else
3651 flags |= PHYI_RUNNING;
3653 switch (mactype) {
3654 case SUNW_DL_VNI:
3655 flags |= PHYI_VIRTUAL;
3656 break;
3657 case SUNW_DL_IPMP:
3658 flags |= PHYI_IPMP;
3659 break;
3660 case DL_LOOP:
3661 flags |= (PHYI_LOOPBACK | PHYI_VIRTUAL);
3662 break;
3665 mutex_enter(&phyi->phyint_lock);
3666 phyi->phyint_flags |= flags;
3667 mutex_exit(&phyi->phyint_lock);
3671 * Return a pointer to the ill which matches the supplied name. Note that
3672 * the ill name length includes the null termination character. (May be
3673 * called as writer.)
3674 * If do_alloc and the interface is "lo0" it will be automatically created.
3675 * We cannot bump up the reference on condemned ills, so duplicate
3676 * detection can't be done using this function.
3678 ill_t *
3679 ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6,
3680 boolean_t *did_alloc, ip_stack_t *ipst)
3682 ill_t *ill;
3683 ipif_t *ipif;
3684 ipsq_t *ipsq;
3685 kstat_named_t *kn;
3686 boolean_t isloopback;
3687 in6_addr_t ov6addr;
3689 isloopback = mi_strcmp(name, ipif_loopback_name) == 0;
3691 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
3692 ill = ill_find_by_name(name, isv6, ipst);
3693 rw_exit(&ipst->ips_ill_g_lock);
3694 if (ill != NULL)
3695 return (ill);
3698 * Couldn't find it. Does this happen to be a lookup for the
3699 * loopback device and are we allowed to allocate it?
3701 if (!isloopback || !do_alloc)
3702 return (NULL);
3704 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
3705 ill = ill_find_by_name(name, isv6, ipst);
3706 if (ill != NULL) {
3707 rw_exit(&ipst->ips_ill_g_lock);
3708 return (ill);
3711 /* Create the loopback device on demand */
3712 ill = (ill_t *)(mi_alloc(sizeof (ill_t) +
3713 sizeof (ipif_loopback_name), BPRI_MED));
3714 if (ill == NULL)
3715 goto done;
3717 bzero(ill, sizeof (*ill));
3718 ill->ill_ipst = ipst;
3719 netstack_hold(ipst->ips_netstack);
3721 * For exclusive stacks we set the zoneid to zero
3722 * to make IP operate as if in the global zone.
3724 ill->ill_zoneid = GLOBAL_ZONEID;
3726 if (ill_init_common(ill, NULL, isv6, B_TRUE, B_FALSE) != 0)
3727 goto done;
3729 if (!ill_allocate_mibs(ill))
3730 goto done;
3732 ill->ill_current_frag = ill->ill_max_frag;
3733 ill->ill_mtu = ill->ill_max_frag; /* Initial value */
3734 ill->ill_mc_mtu = ill->ill_mtu;
3736 * ipif_loopback_name can't be pointed at directly because it's used
3737 * by both the ipv4 and ipv6 interfaces. When the ill is removed
3738 * from the glist, ill_glist_delete() sets the first character of
3739 * ill_name to '\0'.
3741 ill->ill_name = (char *)ill + sizeof (*ill);
3742 (void) strcpy(ill->ill_name, ipif_loopback_name);
3743 ill->ill_name_length = sizeof (ipif_loopback_name);
3744 /* Set ill_dlpi_pending for ipsq_current_finish() to work properly */
3745 ill->ill_dlpi_pending = DL_PRIM_INVAL;
3747 ipif = ipif_allocate(ill, 0L, IRE_LOOPBACK, B_TRUE, B_TRUE, NULL);
3748 if (ipif == NULL)
3749 goto done;
3751 ill->ill_flags = ILLF_MULTICAST;
3753 ov6addr = ipif->ipif_v6lcl_addr;
3754 /* Set up default loopback address and mask. */
3755 if (!isv6) {
3756 ipaddr_t inaddr_loopback = htonl(INADDR_LOOPBACK);
3758 IN6_IPADDR_TO_V4MAPPED(inaddr_loopback, &ipif->ipif_v6lcl_addr);
3759 V4MASK_TO_V6(htonl(IN_CLASSA_NET), ipif->ipif_v6net_mask);
3760 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
3761 ipif->ipif_v6subnet);
3762 ill->ill_flags |= ILLF_IPV4;
3763 } else {
3764 ipif->ipif_v6lcl_addr = ipv6_loopback;
3765 ipif->ipif_v6net_mask = ipv6_all_ones;
3766 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
3767 ipif->ipif_v6subnet);
3768 ill->ill_flags |= ILLF_IPV6;
3772 * Chain us in at the end of the ill list. Hold the ill
3773 * before we make it globally visible. 1 for the lookup.
3775 ill_refhold(ill);
3777 ipsq = ill->ill_phyint->phyint_ipsq;
3779 if (ill_glist_insert(ill, "lo", isv6) != 0)
3780 cmn_err(CE_PANIC, "cannot insert loopback interface");
3782 /* Let SCTP know so that it can add this to its list */
3783 sctp_update_ill(ill, SCTP_ILL_INSERT);
3786 * We have already assigned ipif_v6lcl_addr above, but we need to
3787 * call sctp_update_ipif_addr() after SCTP_ILL_INSERT, which
3788 * needs to happen after ill_glist_insert() since we need the
3789 * ill_index set. Pass on ipv6_loopback as the old address.
3791 sctp_update_ipif_addr(ipif, ov6addr);
3793 ip_rts_newaddrmsg(RTM_CHGADDR, 0, ipif, RTSQ_DEFAULT);
3796 * ill_glist_insert() -> ill_phyint_reinit() may have merged IPSQs.
3797 * If so, free our original one.
3799 if (ipsq != ill->ill_phyint->phyint_ipsq)
3800 ipsq_delete(ipsq);
3802 if (ipst->ips_loopback_ksp == NULL) {
3803 /* Export loopback interface statistics */
3804 ipst->ips_loopback_ksp = kstat_create_netstack("lo", 0,
3805 ipif_loopback_name, "net",
3806 KSTAT_TYPE_NAMED, 2, 0,
3807 ipst->ips_netstack->netstack_stackid);
3808 if (ipst->ips_loopback_ksp != NULL) {
3809 ipst->ips_loopback_ksp->ks_update =
3810 loopback_kstat_update;
3811 kn = KSTAT_NAMED_PTR(ipst->ips_loopback_ksp);
3812 kstat_named_init(&kn[0], "ipackets", KSTAT_DATA_UINT32);
3813 kstat_named_init(&kn[1], "opackets", KSTAT_DATA_UINT32);
3814 ipst->ips_loopback_ksp->ks_private =
3815 (void *)(uintptr_t)ipst->ips_netstack->
3816 netstack_stackid;
3817 kstat_install(ipst->ips_loopback_ksp);
3821 *did_alloc = B_TRUE;
3822 rw_exit(&ipst->ips_ill_g_lock);
3823 ill_nic_event_dispatch(ill, MAP_IPIF_ID(ill->ill_ipif->ipif_id),
3824 NE_PLUMB, ill->ill_name, ill->ill_name_length);
3825 return (ill);
3826 done:
3827 if (ill != NULL) {
3828 if (ill->ill_phyint != NULL) {
3829 ipsq = ill->ill_phyint->phyint_ipsq;
3830 if (ipsq != NULL) {
3831 ipsq->ipsq_phyint = NULL;
3832 ipsq_delete(ipsq);
3834 mi_free(ill->ill_phyint);
3836 ill_free_mib(ill);
3837 if (ill->ill_ipst != NULL)
3838 netstack_rele(ill->ill_ipst->ips_netstack);
3839 mi_free(ill);
3841 rw_exit(&ipst->ips_ill_g_lock);
3842 return (NULL);
3846 * For IPP calls - use the ip_stack_t for global stack.
3848 ill_t *
3849 ill_lookup_on_ifindex_global_instance(uint_t index, boolean_t isv6)
3851 ip_stack_t *ipst;
3852 ill_t *ill;
3853 netstack_t *ns;
3855 ns = netstack_find_by_stackid(GLOBAL_NETSTACKID);
3857 if ((ipst = ns->netstack_ip) == NULL) {
3858 cmn_err(CE_WARN, "No ip_stack_t for zoneid zero!\n");
3859 netstack_rele(ns);
3860 return (NULL);
3863 ill = ill_lookup_on_ifindex(index, isv6, ipst);
3864 netstack_rele(ns);
3865 return (ill);
3869 * Return a pointer to the ill which matches the index and IP version type.
3871 ill_t *
3872 ill_lookup_on_ifindex(uint_t index, boolean_t isv6, ip_stack_t *ipst)
3874 ill_t *ill;
3875 phyint_t *phyi;
3878 * Indexes are stored in the phyint - a structure common
3879 * to both IPv4 and IPv6.
3881 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
3882 phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
3883 (void *) &index, NULL);
3884 if (phyi != NULL) {
3885 ill = isv6 ? phyi->phyint_illv6 : phyi->phyint_illv4;
3886 if (ill != NULL) {
3887 mutex_enter(&ill->ill_lock);
3888 if (!ILL_IS_CONDEMNED(ill)) {
3889 ill_refhold_locked(ill);
3890 mutex_exit(&ill->ill_lock);
3891 rw_exit(&ipst->ips_ill_g_lock);
3892 return (ill);
3894 mutex_exit(&ill->ill_lock);
3897 rw_exit(&ipst->ips_ill_g_lock);
3898 return (NULL);
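
/*
 * Illustrative sketch (hypothetical helper, not part of the surrounding
 * API): the canonical consumer pattern for the lookup above.
 * ill_lookup_on_ifindex() returns a refheld ill (or NULL), so every
 * successful lookup must be paired with an ill_refrele().
 */
static boolean_t
example_ifindex_is_ipmp(uint_t ifindex, ip_stack_t *ipst)
{
	ill_t *ill;
	boolean_t ipmp;

	ill = ill_lookup_on_ifindex(ifindex, B_FALSE, ipst);
	if (ill == NULL)
		return (B_FALSE);
	ipmp = IS_IPMP(ill);	/* use the ill only while the ref is held */
	ill_refrele(ill);	/* drop the reference taken by the lookup */
	return (ipmp);
}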
3902 * Verify whether or not an interface index is valid for the specified zoneid
3903 * to transmit packets.
3904 * It can be zero (meaning "reset") or an interface index assigned
3905 * to a non-VNI interface. (We don't use VNI interfaces to send packets.)
3907 boolean_t
3908 ip_xmit_ifindex_valid(uint_t ifindex, zoneid_t zoneid, boolean_t isv6,
3909 ip_stack_t *ipst)
3911 ill_t *ill;
3913 if (ifindex == 0)
3914 return (B_TRUE);
3916 ill = ill_lookup_on_ifindex_zoneid(ifindex, zoneid, isv6, ipst);
3917 if (ill == NULL)
3918 return (B_FALSE);
3919 if (IS_VNI(ill)) {
3920 ill_refrele(ill);
3921 return (B_FALSE);
3923 ill_refrele(ill);
3924 return (B_TRUE);
3928 * Return the ifindex next in sequence after the passed in ifindex.
3929 * If there is no next ifindex for the given protocol, return 0.
3931 uint_t
3932 ill_get_next_ifindex(uint_t index, boolean_t isv6, ip_stack_t *ipst)
3934 phyint_t *phyi;
3935 phyint_t *phyi_initial;
3936 uint_t ifindex;
3938 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
3940 if (index == 0) {
3941 phyi = avl_first(
3942 &ipst->ips_phyint_g_list->phyint_list_avl_by_index);
3943 } else {
3944 phyi = phyi_initial = avl_find(
3945 &ipst->ips_phyint_g_list->phyint_list_avl_by_index,
3946 (void *) &index, NULL);
3949 for (; phyi != NULL;
3950 phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
3951 phyi, AVL_AFTER)) {
3953 * If we're not returning the first interface in the tree
3954 * and we still haven't moved past the phyint_t that
3955 * corresponds to index, avl_walk needs to be called again
3957 if (!((index != 0) && (phyi == phyi_initial))) {
3958 if (isv6) {
3959 if ((phyi->phyint_illv6) &&
3960 ILL_CAN_LOOKUP(phyi->phyint_illv6) &&
3961 (phyi->phyint_illv6->ill_isv6 == 1))
3962 break;
3963 } else {
3964 if ((phyi->phyint_illv4) &&
3965 ILL_CAN_LOOKUP(phyi->phyint_illv4) &&
3966 (phyi->phyint_illv4->ill_isv6 == 0))
3967 break;
3972 rw_exit(&ipst->ips_ill_g_lock);
3974 if (phyi != NULL)
3975 ifindex = phyi->phyint_ifindex;
3976 else
3977 ifindex = 0;
3979 return (ifindex);
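
/*
 * Illustrative sketch (hypothetical helper): walking every ifindex for
 * one protocol by feeding each result of ill_get_next_ifindex() back
 * in; a return value of 0 ends the walk.
 */
static void
example_walk_ifindexes(boolean_t isv6, ip_stack_t *ipst)
{
	uint_t ifindex;

	for (ifindex = ill_get_next_ifindex(0, isv6, ipst); ifindex != 0;
	    ifindex = ill_get_next_ifindex(ifindex, isv6, ipst)) {
		/* every ifindex seen here has a plumbed ill of this type */
	}
}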
3983 * Return the ifindex for the named interface.
3984 * If there is no such interface, return 0.
3986 uint_t
3987 ill_get_ifindex_by_name(char *name, ip_stack_t *ipst)
3989 phyint_t *phyi;
3990 avl_index_t where = 0;
3991 uint_t ifindex;
3993 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
3995 if ((phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
3996 name, &where)) == NULL) {
3997 rw_exit(&ipst->ips_ill_g_lock);
3998 return (0);
4001 ifindex = phyi->phyint_ifindex;
4003 rw_exit(&ipst->ips_ill_g_lock);
4005 return (ifindex);
4009 * Return the ifindex to be used by upper layer protocols, for instance
4010 * for IPV6_RECVPKTINFO. Under IPMP this is the ifindex of the upper ill.
4012 uint_t
4013 ill_get_upper_ifindex(const ill_t *ill)
4015 if (IS_UNDER_IPMP(ill))
4016 return (ipmp_ill_get_ipmp_ifindex(ill));
4017 else
4018 return (ill->ill_phyint->phyint_ifindex);
4023 * Obtain a reference to the ill. The ill_refcnt is a dynamic refcnt
4024 * that gives a running thread a reference to the ill. This reference must be
4025 * released by the thread when it is done accessing the ill and related
4026 * objects. ill_refcnt cannot be used to account for static references
4027 * such as other structures pointing to an ill. Callers must generally
4028 * check whether an ill can be refheld by using ILL_CAN_LOOKUP macros
4029 * or be sure that the ill is not being deleted or changing state before
4030 * calling the refhold functions. A non-zero ill_refcnt ensures that the
4031 * ill won't change any of its critical state such as address, netmask etc.
4033 void
4034 ill_refhold(ill_t *ill)
4036 mutex_enter(&ill->ill_lock);
4037 ill->ill_refcnt++;
4038 ILL_TRACE_REF(ill);
4039 mutex_exit(&ill->ill_lock);
4042 void
4043 ill_refhold_locked(ill_t *ill)
4045 ASSERT(MUTEX_HELD(&ill->ill_lock));
4046 ill->ill_refcnt++;
4047 ILL_TRACE_REF(ill);
4050 /* Returns true if we managed to get a refhold */
4051 boolean_t
4052 ill_check_and_refhold(ill_t *ill)
4054 mutex_enter(&ill->ill_lock);
4055 if (!ILL_IS_CONDEMNED(ill)) {
4056 ill_refhold_locked(ill);
4057 mutex_exit(&ill->ill_lock);
4058 return (B_TRUE);
4060 mutex_exit(&ill->ill_lock);
4061 return (B_FALSE);
4065 * Must not be called while holding any locks. Otherwise if this is
4066 * the last reference to be released, there is a chance of recursive mutex
4067 * panic due to ill_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying
4068 * to restart an ioctl.
4070 void
4071 ill_refrele(ill_t *ill)
4073 mutex_enter(&ill->ill_lock);
4074 ASSERT(ill->ill_refcnt != 0);
4075 ill->ill_refcnt--;
4076 ILL_UNTRACE_REF(ill);
4077 if (ill->ill_refcnt != 0) {
4078 /* Every ire pointing to the ill adds 1 to ill_refcnt */
4079 mutex_exit(&ill->ill_lock);
4080 return;
4083 /* Drops the ill_lock */
4084 ipif_ill_refrele_tail(ill);
4088 * Obtain a weak reference count on the ill. This reference ensures the
4089 * ill won't be freed, but the ill may change any of its critical state
4090 * such as netmask, address, etc. Returns B_FALSE if the ill has
4091 * started closing.
4093 boolean_t
4094 ill_waiter_inc(ill_t *ill)
4096 mutex_enter(&ill->ill_lock);
4097 if (ill->ill_state_flags & ILL_CONDEMNED) {
4098 mutex_exit(&ill->ill_lock);
4099 return (B_FALSE);
4101 ill->ill_waiters++;
4102 mutex_exit(&ill->ill_lock);
4103 return (B_TRUE);
4106 void
4107 ill_waiter_dcr(ill_t *ill)
4109 mutex_enter(&ill->ill_lock);
4110 ill->ill_waiters--;
4111 if (ill->ill_waiters == 0)
4112 cv_broadcast(&ill->ill_cv);
4113 mutex_exit(&ill->ill_lock);
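
/*
 * Illustrative sketch (hypothetical helper): the waiter pair above
 * brackets a window in which the ill may still change state but is
 * guaranteed not to be freed.
 */
static boolean_t
example_with_ill_pinned(ill_t *ill)
{
	if (!ill_waiter_inc(ill))
		return (B_FALSE);	/* the ill has started closing */
	/* the ill cannot be freed here, though its state may change */
	ill_waiter_dcr(ill);
	return (B_TRUE);
}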
4117 * ip_ll_subnet_defaults is called when we get the DL_INFO_ACK back from the
4118 * driver. We construct best guess defaults for lower level information that
4119 * we need. If an interface is brought up without injection of any overriding
4120 * information from outside, we have to be ready to go with these defaults.
4121 * When we get the first DL_INFO_ACK (from ip_open() sending a DL_INFO_REQ)
4122 * we primarily want the dl_provider_style.
4123 * The subsequent DL_INFO_ACK is received after doing a DL_ATTACH and DL_BIND
4124 * at which point we assume the other part of the information is valid.
4126 void
4127 ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp)
4129 uchar_t *brdcst_addr;
4130 uint_t brdcst_addr_length, phys_addr_length;
4131 t_scalar_t sap_length;
4132 dl_info_ack_t *dlia;
4133 ip_m_t *ipm;
4134 dl_qos_cl_sel1_t *sel1;
4135 int min_mtu;
4137 ASSERT(IAM_WRITER_ILL(ill));
4140 * Until the ill is fully up, the ill is not globally visible,
4141 * so there is no need for a lock.
4143 dlia = (dl_info_ack_t *)mp->b_rptr;
4144 ill->ill_mactype = dlia->dl_mac_type;
4146 ipm = ip_m_lookup(dlia->dl_mac_type);
4147 if (ipm == NULL) {
4148 ipm = ip_m_lookup(DL_OTHER);
4149 ASSERT(ipm != NULL);
4151 ill->ill_media = ipm;
4154 * When the new DLPI stuff is ready we'll pull lengths
4155 * from dlia.
4157 if (dlia->dl_version == DL_VERSION_2) {
4158 brdcst_addr_length = dlia->dl_brdcst_addr_length;
4159 brdcst_addr = mi_offset_param(mp, dlia->dl_brdcst_addr_offset,
4160 brdcst_addr_length);
4161 if (brdcst_addr == NULL) {
4162 brdcst_addr_length = 0;
4164 sap_length = dlia->dl_sap_length;
4165 phys_addr_length = dlia->dl_addr_length - ABS(sap_length);
4166 ip1dbg(("ip: bcast_len %d, sap_len %d, phys_len %d\n",
4167 brdcst_addr_length, sap_length, phys_addr_length));
4168 } else {
4169 brdcst_addr_length = 6;
4170 brdcst_addr = ip_six_byte_all_ones;
4171 sap_length = -2;
4172 phys_addr_length = brdcst_addr_length;
4175 ill->ill_bcast_addr_length = brdcst_addr_length;
4176 ill->ill_phys_addr_length = phys_addr_length;
4177 ill->ill_sap_length = sap_length;
4180 * Synthetic DLPI types such as SUNW_DL_IPMP specify a zero SDU,
4181 * but we must ensure a minimum IP MTU is used since other bits of
4182 * IP will fly apart otherwise.
4184 min_mtu = ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU;
4185 ill->ill_max_frag = MAX(min_mtu, dlia->dl_max_sdu);
4186 ill->ill_current_frag = ill->ill_max_frag;
4187 ill->ill_mtu = ill->ill_max_frag;
4188 ill->ill_mc_mtu = ill->ill_mtu; /* Overridden by DL_NOTE_SDU_SIZE2 */
4190 ill->ill_type = ipm->ip_m_type;
4192 if (!ill->ill_dlpi_style_set) {
4193 if (dlia->dl_provider_style == DL_STYLE2)
4194 ill->ill_needs_attach = 1;
4196 phyint_flags_init(ill->ill_phyint, ill->ill_mactype);
4199 * Allocate the first ipif on this ill. We don't delay it
4200 * further as ioctl handling assumes at least one ipif exists.
4202 * At this point we don't know whether the ill is v4 or v6.
4203 * We will know this when the SIOCSLIFNAME happens and
4204 * the correct value for ill_isv6 will be assigned in
4205 * ipif_set_values(). We need to hold the ill lock and
4206 * clear the ILL_LL_SUBNET_PENDING flag and atomically do
4207 * the wakeup.
4209 (void) ipif_allocate(ill, 0, IRE_LOCAL,
4210 dlia->dl_provider_style != DL_STYLE2, B_TRUE, NULL);
4211 mutex_enter(&ill->ill_lock);
4212 ASSERT(ill->ill_dlpi_style_set == 0);
4213 ill->ill_dlpi_style_set = 1;
4214 ill->ill_state_flags &= ~ILL_LL_SUBNET_PENDING;
4215 cv_broadcast(&ill->ill_cv);
4216 mutex_exit(&ill->ill_lock);
4217 freemsg(mp);
4218 return;
4220 ASSERT(ill->ill_ipif != NULL);
4222 * We know whether it is IPv4 or IPv6 now, as this is the
4223 * second DL_INFO_ACK we are receiving in response to the
4224 * DL_INFO_REQ sent in ipif_set_values.
4226 ill->ill_sap = (ill->ill_isv6) ? ipm->ip_m_ipv6sap : ipm->ip_m_ipv4sap;
4228 * Clear all the flags that were set based on ill_bcast_addr_length
4229 * and ill_phys_addr_length (in ipif_set_values) as these could have
4230 * changed now and we need to re-evaluate.
4232 ill->ill_flags &= ~(ILLF_MULTICAST | ILLF_NONUD | ILLF_NOARP);
4233 ill->ill_ipif->ipif_flags &= ~(IPIF_BROADCAST | IPIF_POINTOPOINT);
4236 * Free ill_bcast_mp as things could have changed now.
4238 * NOTE: The IPMP meta-interface is special-cased because it starts
4239 * with no underlying interfaces (and thus an unknown broadcast
4240 * address length), but we enforce that an interface is broadcast-
4241 * capable as part of allowing it to join a group.
4243 if (ill->ill_bcast_addr_length == 0 && !IS_IPMP(ill)) {
4244 if (ill->ill_bcast_mp != NULL)
4245 freemsg(ill->ill_bcast_mp);
4246 ill->ill_net_type = IRE_IF_NORESOLVER;
4248 ill->ill_bcast_mp = ill_dlur_gen(NULL,
4249 ill->ill_phys_addr_length,
4250 ill->ill_sap,
4251 ill->ill_sap_length);
4253 if (ill->ill_isv6)
4255 * Note: xresolv interfaces will eventually need NOARP
4256 * set here as well, but that will require those
4257 * external resolvers to have some knowledge of
4258 * that flag and act appropriately. Not to be changed
4259 * at present.
4261 ill->ill_flags |= ILLF_NONUD;
4262 else
4263 ill->ill_flags |= ILLF_NOARP;
4265 if (ill->ill_mactype == SUNW_DL_VNI) {
4266 ill->ill_ipif->ipif_flags |= IPIF_NOXMIT;
4267 } else if (ill->ill_phys_addr_length == 0 ||
4268 ill->ill_mactype == DL_IPV4 ||
4269 ill->ill_mactype == DL_IPV6) {
4271 * The underlying link is point-to-point, so mark the
4272 * interface as such. We can do IP multicast over
4273 * such a link since it transmits all network-layer
4274 * packets to the remote side the same way.
4276 ill->ill_flags |= ILLF_MULTICAST;
4277 ill->ill_ipif->ipif_flags |= IPIF_POINTOPOINT;
4279 } else {
4280 ill->ill_net_type = IRE_IF_RESOLVER;
4281 if (ill->ill_bcast_mp != NULL)
4282 freemsg(ill->ill_bcast_mp);
4283 ill->ill_bcast_mp = ill_dlur_gen(brdcst_addr,
4284 ill->ill_bcast_addr_length, ill->ill_sap,
4285 ill->ill_sap_length);
4287 * Later detect lack of DLPI driver multicast
4288 * capability by catching DL_ENABMULTI errors in
4289 * ip_rput_dlpi.
4291 ill->ill_flags |= ILLF_MULTICAST;
4292 if (!ill->ill_isv6)
4293 ill->ill_ipif->ipif_flags |= IPIF_BROADCAST;
4296 /* For IPMP, PHYI_IPMP should already be set by phyint_flags_init() */
4297 if (ill->ill_mactype == SUNW_DL_IPMP)
4298 ASSERT(ill->ill_phyint->phyint_flags & PHYI_IPMP);
4300 /* By default an interface does not support any CoS marking */
4301 ill->ill_flags &= ~ILLF_COS_ENABLED;
4304 * If we get QoS information in DL_INFO_ACK, the device supports
4305 * some form of CoS marking, set ILLF_COS_ENABLED.
4307 sel1 = (dl_qos_cl_sel1_t *)mi_offset_param(mp, dlia->dl_qos_offset,
4308 dlia->dl_qos_length);
4309 if ((sel1 != NULL) && (sel1->dl_qos_type == DL_QOS_CL_SEL1)) {
4310 ill->ill_flags |= ILLF_COS_ENABLED;
4313 /* Clear any previous error indication. */
4314 ill->ill_error = 0;
4315 freemsg(mp);
4319 * Perform various checks to verify that an address would make sense as a
4320 * local, remote, or subnet interface address.
4322 static boolean_t
4323 ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask)
4325 ipaddr_t net_mask;
4328 * Don't allow all zeroes or all ones, but do allow
4329 * an all-ones netmask.
4331 if ((net_mask = ip_net_mask(addr)) == 0)
4332 return (B_FALSE);
4333 /* A given netmask overrides the "guess" netmask */
4334 if (subnet_mask != 0)
4335 net_mask = subnet_mask;
4336 if ((net_mask != ~(ipaddr_t)0) && ((addr == (addr & net_mask)) ||
4337 (addr == (addr | ~net_mask)))) {
4338 return (B_FALSE);
4342 * Even if the netmask is all ones, we do not allow address to be
4343 * 255.255.255.255
4345 if (addr == INADDR_BROADCAST)
4346 return (B_FALSE);
4348 if (CLASSD(addr))
4349 return (B_FALSE);
4351 return (B_TRUE);
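
/*
 * Illustrative sketch (hypothetical helper): what the checks above
 * accept and reject for a 255.255.255.0 subnet mask.
 */
static void
example_addr_ok_v4(void)
{
	ipaddr_t mask = htonl(0xffffff00);	/* 255.255.255.0 */

	ASSERT(ip_addr_ok_v4(htonl(0xc0000201), mask));	 /* 192.0.2.1 */
	ASSERT(!ip_addr_ok_v4(htonl(0xc0000200), mask)); /* all-zeroes host */
	ASSERT(!ip_addr_ok_v4(htonl(0xc00002ff), mask)); /* all-ones host */
	ASSERT(!ip_addr_ok_v4(INADDR_BROADCAST, mask));	 /* 255.255.255.255 */
	ASSERT(!ip_addr_ok_v4(htonl(0xe0000001), mask)); /* CLASSD/multicast */
}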
4354 #define V6_IPIF_LINKLOCAL(p) \
4355 IN6_IS_ADDR_LINKLOCAL(&(p)->ipif_v6lcl_addr)
4358 * Compare two given ipifs and check if the second one is better than
4359 * the first one using the order of preference (not taking deprecated
4360 * into account) specified in ipif_lookup_multicast().
4362 static boolean_t
4363 ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif, boolean_t isv6)
4365 /* Check the least preferred first. */
4366 if (IS_LOOPBACK(old_ipif->ipif_ill)) {
4367 /* If both ipifs are the same, use the first one. */
4368 if (IS_LOOPBACK(new_ipif->ipif_ill))
4369 return (B_FALSE);
4370 else
4371 return (B_TRUE);
4374 /* For IPv6, check for link local address. */
4375 if (isv6 && V6_IPIF_LINKLOCAL(old_ipif)) {
4376 if (IS_LOOPBACK(new_ipif->ipif_ill) ||
4377 V6_IPIF_LINKLOCAL(new_ipif)) {
4378 /* The second one is equal or less preferred. */
4379 return (B_FALSE);
4380 } else {
4381 return (B_TRUE);
4385 /* Then check for point to point interface. */
4386 if (old_ipif->ipif_flags & IPIF_POINTOPOINT) {
4387 if (IS_LOOPBACK(new_ipif->ipif_ill) ||
4388 (isv6 && V6_IPIF_LINKLOCAL(new_ipif)) ||
4389 (new_ipif->ipif_flags & IPIF_POINTOPOINT)) {
4390 return (B_FALSE);
4391 } else {
4392 return (B_TRUE);
4396 /* old_ipif is a normal interface, so no need to use the new one. */
4397 return (B_FALSE);
4401 * Find a multicast-capable ipif given an IP instance and zoneid.
4402 * The ipif must be up, and its ill must be multicast-capable, not
4403 * condemned, not an underlying interface in an IPMP group, and
4404 * not a VNI interface. Order of preference:
4406 * 1a. normal
4407 * 1b. normal, but deprecated
4408 * 2a. point to point
4409 * 2b. point to point, but deprecated
4410 * 3a. link local
4411 * 3b. link local, but deprecated
4412 * 4. loopback.
4414 static ipif_t *
4415 ipif_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6)
4417 ill_t *ill;
4418 ill_walk_context_t ctx;
4419 ipif_t *ipif;
4420 ipif_t *saved_ipif = NULL;
4421 ipif_t *dep_ipif = NULL;
4423 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
4424 if (isv6)
4425 ill = ILL_START_WALK_V6(&ctx, ipst);
4426 else
4427 ill = ILL_START_WALK_V4(&ctx, ipst);
4429 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
4430 mutex_enter(&ill->ill_lock);
4431 if (IS_VNI(ill) || IS_UNDER_IPMP(ill) ||
4432 ILL_IS_CONDEMNED(ill) ||
4433 !(ill->ill_flags & ILLF_MULTICAST)) {
4434 mutex_exit(&ill->ill_lock);
4435 continue;
4437 for (ipif = ill->ill_ipif; ipif != NULL;
4438 ipif = ipif->ipif_next) {
4439 if (zoneid != ipif->ipif_zoneid &&
4440 zoneid != ALL_ZONES &&
4441 ipif->ipif_zoneid != ALL_ZONES) {
4442 continue;
4444 if (!(ipif->ipif_flags & IPIF_UP) ||
4445 IPIF_IS_CONDEMNED(ipif)) {
4446 continue;
4450 * Found one candidate. If it is deprecated,
4451 * remember it in dep_ipif. If it is not deprecated,
4452 * remember it in saved_ipif.
4454 if (ipif->ipif_flags & IPIF_DEPRECATED) {
4455 if (dep_ipif == NULL) {
4456 dep_ipif = ipif;
4457 } else if (ipif_comp_multi(dep_ipif, ipif,
4458 isv6)) {
4460 * If the previous dep_ipif does not
4461 * belong to the same ill, we've done
4462 * an ipif_refhold() on it. So we need
4463 * to release it.
4465 if (dep_ipif->ipif_ill != ill)
4466 ipif_refrele(dep_ipif);
4467 dep_ipif = ipif;
4469 continue;
4471 if (saved_ipif == NULL) {
4472 saved_ipif = ipif;
4473 } else {
4474 if (ipif_comp_multi(saved_ipif, ipif, isv6)) {
4475 if (saved_ipif->ipif_ill != ill)
4476 ipif_refrele(saved_ipif);
4477 saved_ipif = ipif;
4482 * Before going to the next ill, do an ipif_refhold() on the
4483 * saved ones.
4485 if (saved_ipif != NULL && saved_ipif->ipif_ill == ill)
4486 ipif_refhold_locked(saved_ipif);
4487 if (dep_ipif != NULL && dep_ipif->ipif_ill == ill)
4488 ipif_refhold_locked(dep_ipif);
4489 mutex_exit(&ill->ill_lock);
4491 rw_exit(&ipst->ips_ill_g_lock);
4494 * If we have only the saved_ipif, return it. But if we have both
4495 * saved_ipif and dep_ipif, check to see which one is better.
4497 if (saved_ipif != NULL) {
4498 if (dep_ipif != NULL) {
4499 if (ipif_comp_multi(saved_ipif, dep_ipif, isv6)) {
4500 ipif_refrele(saved_ipif);
4501 return (dep_ipif);
4502 } else {
4503 ipif_refrele(dep_ipif);
4504 return (saved_ipif);
4507 return (saved_ipif);
4508 } else {
4509 return (dep_ipif);
4513 ill_t *
4514 ill_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6)
4516 ipif_t *ipif;
4517 ill_t *ill;
4519 ipif = ipif_lookup_multicast(ipst, zoneid, isv6);
4520 if (ipif == NULL)
4521 return (NULL);
4523 ill = ipif->ipif_ill;
4524 ill_refhold(ill);
4525 ipif_refrele(ipif);
4526 return (ill);
4530 * This function is called when an application does not specify an interface
4531 * to be used for multicast traffic (joining a group/sending data). It
4532 * calls ire_lookup_multi() to look for an interface route for the
4533 * specified multicast group. Doing this allows the administrator to add
4534 * prefix routes for multicast to indicate which interface to be used for
4535 * multicast traffic in the above scenario. The route could be for all
4536 * multicast (224.0/4), for a single multicast group (a /32 route) or
4537 * anything in between. If there is no such multicast route, we just find
4538 * any multicast-capable interface and return it. The returned ill
4539 * is refheld.
4541 ill_t *
4542 ill_lookup_group_v4(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst,
4543 ipaddr_t *setsrcp)
4545 ill_t *ill;
4547 ill = ire_lookup_multi_ill_v4(group, zoneid, ipst, setsrcp);
4548 if (ill != NULL)
4549 return (ill);
4551 return (ill_lookup_multicast(ipst, zoneid, B_FALSE));
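
/*
 * Illustrative sketch (hypothetical helper): resolving the outbound ill
 * for a join with no interface specified, as described above. setsrc
 * is only meaningful if the matched route carried RTF_SETSRC.
 */
static ill_t *
example_resolve_multicast_ill(ipaddr_t group, zoneid_t zoneid,
    ip_stack_t *ipst)
{
	ipaddr_t setsrc = INADDR_ANY;
	ill_t *ill;

	/* prefers an admin-configured multicast route, e.g. 224.0.0.0/4 */
	ill = ill_lookup_group_v4(group, zoneid, ipst, &setsrc);
	return (ill);	/* refheld; the caller must ill_refrele() it */
}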
4555 * Look for an ipif with the specified interface address and destination.
4556 * The destination address is used only for matching point-to-point interfaces.
4558 ipif_t *
4559 ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, ip_stack_t *ipst)
4561 ipif_t *ipif;
4562 ill_t *ill;
4563 ill_walk_context_t ctx;
4566 * First match all the point-to-point interfaces
4567 * before looking at non-point-to-point interfaces.
4568 * This is done to avoid returning a non-point-to-point
4569 * ipif instead of an unnumbered point-to-point ipif.
4571 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
4572 ill = ILL_START_WALK_V4(&ctx, ipst);
4573 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
4574 mutex_enter(&ill->ill_lock);
4575 for (ipif = ill->ill_ipif; ipif != NULL;
4576 ipif = ipif->ipif_next) {
4577 /* Allow the ipif to be down */
4578 if ((ipif->ipif_flags & IPIF_POINTOPOINT) &&
4579 (ipif->ipif_lcl_addr == if_addr) &&
4580 (ipif->ipif_pp_dst_addr == dst)) {
4581 if (!IPIF_IS_CONDEMNED(ipif)) {
4582 ipif_refhold_locked(ipif);
4583 mutex_exit(&ill->ill_lock);
4584 rw_exit(&ipst->ips_ill_g_lock);
4585 return (ipif);
4589 mutex_exit(&ill->ill_lock);
4591 rw_exit(&ipst->ips_ill_g_lock);
4593 /* lookup the ipif based on interface address */
4594 ipif = ipif_lookup_addr(if_addr, NULL, ALL_ZONES, ipst);
4595 ASSERT(ipif == NULL || !ipif->ipif_isv6);
4596 return (ipif);
4600 * Common function for ipif_lookup_addr() and ipif_lookup_addr_exact().
4602 static ipif_t *
4603 ipif_lookup_addr_common(ipaddr_t addr, ill_t *match_ill, uint32_t match_flags,
4604 zoneid_t zoneid, ip_stack_t *ipst)
4606 ipif_t *ipif;
4607 ill_t *ill;
4608 boolean_t ptp = B_FALSE;
4609 ill_walk_context_t ctx;
4610 boolean_t match_illgrp = (match_flags & IPIF_MATCH_ILLGRP);
4611 boolean_t no_duplicate = (match_flags & IPIF_MATCH_NONDUP);
4613 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
4615 * Repeat twice, first matching on local addresses and
4616 * then on point-to-point destination addresses.
4618 repeat:
4619 ill = ILL_START_WALK_V4(&ctx, ipst);
4620 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
4621 if (match_ill != NULL && ill != match_ill &&
4622 (!match_illgrp || !IS_IN_SAME_ILLGRP(ill, match_ill))) {
4623 continue;
4625 mutex_enter(&ill->ill_lock);
4626 for (ipif = ill->ill_ipif; ipif != NULL;
4627 ipif = ipif->ipif_next) {
4628 if (zoneid != ALL_ZONES &&
4629 zoneid != ipif->ipif_zoneid &&
4630 ipif->ipif_zoneid != ALL_ZONES)
4631 continue;
4633 if (no_duplicate && !(ipif->ipif_flags & IPIF_UP))
4634 continue;
4636 /* Allow the ipif to be down */
4637 if ((!ptp && (ipif->ipif_lcl_addr == addr) &&
4638 ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) ||
4639 (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) &&
4640 (ipif->ipif_pp_dst_addr == addr))) {
4641 if (!IPIF_IS_CONDEMNED(ipif)) {
4642 ipif_refhold_locked(ipif);
4643 mutex_exit(&ill->ill_lock);
4644 rw_exit(&ipst->ips_ill_g_lock);
4645 return (ipif);
4649 mutex_exit(&ill->ill_lock);
4652 /* If we already did the ptp case, then we are done */
4653 if (ptp) {
4654 rw_exit(&ipst->ips_ill_g_lock);
4655 return (NULL);
4657 ptp = B_TRUE;
4658 goto repeat;
4662 * Lookup an ipif with the specified address. For point-to-point links we
4663 * look for matches on either the destination address or the local address,
4664 * but we skip the local address check if IPIF_UNNUMBERED is set. If the
4665 * `match_ill' argument is non-NULL, the lookup is restricted to that ill
4666 * (or illgrp if `match_ill' is in an IPMP group).
4668 ipif_t *
4669 ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid,
4670 ip_stack_t *ipst)
4672 return (ipif_lookup_addr_common(addr, match_ill, IPIF_MATCH_ILLGRP,
4673 zoneid, ipst));
4677 * Lookup an ipif with the specified address. Similar to ipif_lookup_addr,
4678 * except that we will only return an ipif if it is not marked as
4679 * IPIF_DUPLICATE.
4681 ipif_t *
4682 ipif_lookup_addr_nondup(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid,
4683 ip_stack_t *ipst)
4685 return (ipif_lookup_addr_common(addr, match_ill,
4686 (IPIF_MATCH_ILLGRP | IPIF_MATCH_NONDUP),
4687 zoneid, ipst));
4691 * Special abbreviated version of ipif_lookup_addr() that doesn't match
4692 * `match_ill' across the IPMP group. This function is only needed in some
4693 * corner-cases; almost everything should use ipif_lookup_addr().
4695 ipif_t *
4696 ipif_lookup_addr_exact(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst)
4698 ASSERT(match_ill != NULL);
4699 return (ipif_lookup_addr_common(addr, match_ill, 0, ALL_ZONES,
4700 ipst));
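
/*
 * Illustrative sketch (hypothetical helper) contrasting the lookups
 * above: under IPMP, ipif_lookup_addr() may match anywhere in the
 * illgrp of match_ill, while ipif_lookup_addr_exact() insists on that
 * very ill.
 */
static boolean_t
example_addr_on_this_ill(ipaddr_t addr, ill_t *ill, ip_stack_t *ipst)
{
	ipif_t *ipif;

	ipif = ipif_lookup_addr_exact(addr, ill, ipst);
	if (ipif == NULL)
		return (B_FALSE);
	ipif_refrele(ipif);	/* the lookup returned a held ipif */
	return (B_TRUE);
}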
4704 * Look for an ipif with the specified address. For point-to-point links
4705 * we look for matches on either the destination address or the local
4706 * address, but we skip the check on the local address if IPIF_UNNUMBERED
4707 * is set.
4708 * If the `match_ill' argument is non-NULL, the lookup is restricted to that
4709 * ill (or illgrp if `match_ill' is in an IPMP group).
4710 * Return the zoneid for the ipif which matches. ALL_ZONES if no match.
4712 zoneid_t
4713 ipif_lookup_addr_zoneid(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst)
4715 zoneid_t zoneid;
4716 ipif_t *ipif;
4717 ill_t *ill;
4718 boolean_t ptp = B_FALSE;
4719 ill_walk_context_t ctx;
4721 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
4723 * Repeat twice, first matching on local addresses and
4724 * then on point-to-point destination addresses.
4726 repeat:
4727 ill = ILL_START_WALK_V4(&ctx, ipst);
4728 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
4729 if (match_ill != NULL && ill != match_ill &&
4730 !IS_IN_SAME_ILLGRP(ill, match_ill)) {
4731 continue;
4733 mutex_enter(&ill->ill_lock);
4734 for (ipif = ill->ill_ipif; ipif != NULL;
4735 ipif = ipif->ipif_next) {
4736 /* Allow the ipif to be down */
4737 if ((!ptp && (ipif->ipif_lcl_addr == addr) &&
4738 ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) ||
4739 (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) &&
4740 (ipif->ipif_pp_dst_addr == addr)) &&
4741 !(ipif->ipif_state_flags & IPIF_CONDEMNED)) {
4742 zoneid = ipif->ipif_zoneid;
4743 mutex_exit(&ill->ill_lock);
4744 rw_exit(&ipst->ips_ill_g_lock);
4745 return (zoneid);
4748 mutex_exit(&ill->ill_lock);
4751 /* If we already did the ptp case, then we are done */
4752 if (ptp) {
4753 rw_exit(&ipst->ips_ill_g_lock);
4754 return (ALL_ZONES);
4756 ptp = B_TRUE;
4757 goto repeat;
4761 * Look for an ipif that matches the specified remote address i.e. the
4762 * ipif that would receive the specified packet.
4763 * First look for directly connected interfaces and then do a recursive
4764 * IRE lookup and pick the first ipif corresponding to the source address in the
4765 * ire.
4766 * Returns: held ipif
4768 * This is only used for ICMP_ADDRESS_MASK_REQUESTs
4770 ipif_t *
4771 ipif_lookup_remote(ill_t *ill, ipaddr_t addr, zoneid_t zoneid)
4773 ipif_t *ipif;
4775 ASSERT(!ill->ill_isv6);
4778 * Someone could be changing this ipif currently or change it
4779 * after we return this. Thus a few packets could use the old
4780 * values. However, structure updates/creates (ire, ilg, ilm, etc.)
4781 * will atomically be updated or cleaned up with the new value.
4782 * Thus we don't need a lock to check the flags or other attrs below.
4784 mutex_enter(&ill->ill_lock);
4785 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
4786 if (IPIF_IS_CONDEMNED(ipif))
4787 continue;
4788 if (zoneid != ALL_ZONES && zoneid != ipif->ipif_zoneid &&
4789 ipif->ipif_zoneid != ALL_ZONES)
4790 continue;
4791 /* Allow the ipif to be down */
4792 if (ipif->ipif_flags & IPIF_POINTOPOINT) {
4793 if ((ipif->ipif_pp_dst_addr == addr) ||
4794 (!(ipif->ipif_flags & IPIF_UNNUMBERED) &&
4795 ipif->ipif_lcl_addr == addr)) {
4796 ipif_refhold_locked(ipif);
4797 mutex_exit(&ill->ill_lock);
4798 return (ipif);
4800 } else if (ipif->ipif_subnet == (addr & ipif->ipif_net_mask)) {
4801 ipif_refhold_locked(ipif);
4802 mutex_exit(&ill->ill_lock);
4803 return (ipif);
4806 mutex_exit(&ill->ill_lock);
4808 * For a remote destination it isn't possible to nail down a particular
4809 * ipif.
4812 /* Pick the first interface */
4813 ipif = ipif_get_next_ipif(NULL, ill);
4814 return (ipif);
4818 * This func does not prevent refcnt from increasing. But if
4819 * the caller has taken steps to that effect, then this func
4820 * can be used to determine whether the ill has become quiescent
4822 static boolean_t
4823 ill_is_quiescent(ill_t *ill)
4825 ipif_t *ipif;
4827 ASSERT(MUTEX_HELD(&ill->ill_lock));
4829 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
4830 if (ipif->ipif_refcnt != 0)
4831 return (B_FALSE);
4833 if (!ILL_DOWN_OK(ill) || ill->ill_refcnt != 0) {
4834 return (B_FALSE);
4836 return (B_TRUE);
4839 boolean_t
4840 ill_is_freeable(ill_t *ill)
4842 ipif_t *ipif;
4844 ASSERT(MUTEX_HELD(&ill->ill_lock));
4846 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
4847 if (ipif->ipif_refcnt != 0) {
4848 return (B_FALSE);
4851 if (!ILL_FREE_OK(ill) || ill->ill_refcnt != 0) {
4852 return (B_FALSE);
4854 return (B_TRUE);
4858 * This func does not prevent refcnt from increasing. But if
4859 * the caller has taken steps to that effect, then this func
4860 * can be used to determine whether the ipif has become quiescent
4862 static boolean_t
4863 ipif_is_quiescent(ipif_t *ipif)
4865 ill_t *ill;
4867 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
4869 if (ipif->ipif_refcnt != 0)
4870 return (B_FALSE);
4872 ill = ipif->ipif_ill;
4873 if (ill->ill_ipif_up_count != 0 || ill->ill_ipif_dup_count != 0 ||
4874 ill->ill_logical_down) {
4875 return (B_TRUE);
4878 /* This is the last ipif going down or being deleted on this ill */
4879 if (ill->ill_ire_cnt != 0 || ill->ill_refcnt != 0) {
4880 return (B_FALSE);
4883 return (B_TRUE);
4887 * Return true if the ipif can be destroyed: the ipif has to be quiescent
4888 * with zero references from ire/ilm to it.
4890 static boolean_t
4891 ipif_is_freeable(ipif_t *ipif)
4893 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
4894 ASSERT(ipif->ipif_id != 0);
4895 return (ipif->ipif_refcnt == 0);
4899 * The ipif/ill/ire has been refreled. Do the tail processing.
4900 * Determine if the ipif or ill in question has become quiescent and if so
4901 * wake up close and/or restart any queued pending ioctl that is waiting
4902 * for the ipif_down (or ill_down).
4904 void
4905 ipif_ill_refrele_tail(ill_t *ill)
4907 mblk_t *mp;
4908 conn_t *connp;
4909 ipsq_t *ipsq;
4910 ipxop_t *ipx;
4911 ipif_t *ipif;
4912 dl_notify_ind_t *dlindp;
4914 ASSERT(MUTEX_HELD(&ill->ill_lock));
4916 if ((ill->ill_state_flags & ILL_CONDEMNED) && ill_is_freeable(ill)) {
4917 /* ip_modclose() may be waiting */
4918 cv_broadcast(&ill->ill_cv);
4921 ipsq = ill->ill_phyint->phyint_ipsq;
4922 mutex_enter(&ipsq->ipsq_lock);
4923 ipx = ipsq->ipsq_xop;
4924 mutex_enter(&ipx->ipx_lock);
4925 if (ipx->ipx_waitfor == 0) /* no one's waiting; bail */
4926 goto unlock;
4928 ASSERT(ipx->ipx_pending_mp != NULL && ipx->ipx_pending_ipif != NULL);
4930 ipif = ipx->ipx_pending_ipif;
4931 if (ipif->ipif_ill != ill) /* wait is for another ill; bail */
4932 goto unlock;
4934 switch (ipx->ipx_waitfor) {
4935 case IPIF_DOWN:
4936 if (!ipif_is_quiescent(ipif))
4937 goto unlock;
4938 break;
4939 case IPIF_FREE:
4940 if (!ipif_is_freeable(ipif))
4941 goto unlock;
4942 break;
4943 case ILL_DOWN:
4944 if (!ill_is_quiescent(ill))
4945 goto unlock;
4946 break;
4947 case ILL_FREE:
4949 * ILL_FREE is only for loopback; normal ill teardown waits
4950 * synchronously in ip_modclose() without using ipx_waitfor,
4951 * handled by the cv_broadcast() at the top of this function.
4953 if (!ill_is_freeable(ill))
4954 goto unlock;
4955 break;
4956 default:
4957 cmn_err(CE_PANIC, "ipsq: %p unknown ipx_waitfor %d\n",
4958 (void *)ipsq, ipx->ipx_waitfor);
4961 ill_refhold_locked(ill); /* for qwriter_ip() call below */
4962 mutex_exit(&ipx->ipx_lock);
4963 mp = ipsq_pending_mp_get(ipsq, &connp);
4964 mutex_exit(&ipsq->ipsq_lock);
4965 mutex_exit(&ill->ill_lock);
4967 ASSERT(mp != NULL);
4969 * NOTE: all of the qwriter_ip() calls below use CUR_OP since
4970 * we can only get here when the current operation decides
4971 * it needs to quiesce via ipsq_pending_mp_add().
4973 switch (mp->b_datap->db_type) {
4974 case M_PCPROTO:
4975 case M_PROTO:
4977 * For now, only DL_NOTIFY_IND messages can use this facility.
4979 dlindp = (dl_notify_ind_t *)mp->b_rptr;
4980 ASSERT(dlindp->dl_primitive == DL_NOTIFY_IND);
4982 switch (dlindp->dl_notification) {
4983 case DL_NOTE_PHYS_ADDR:
4984 qwriter_ip(ill, ill->ill_rq, mp,
4985 ill_set_phys_addr_tail, CUR_OP, B_TRUE);
4986 return;
4987 case DL_NOTE_REPLUMB:
4988 qwriter_ip(ill, ill->ill_rq, mp,
4989 ill_replumb_tail, CUR_OP, B_TRUE);
4990 return;
4991 default:
4992 ASSERT(0);
4993 ill_refrele(ill);
4995 break;
4997 case M_ERROR:
4998 case M_HANGUP:
4999 qwriter_ip(ill, ill->ill_rq, mp, ipif_all_down_tail, CUR_OP,
5000 B_TRUE);
5001 return;
5003 case M_IOCTL:
5004 case M_IOCDATA:
5005 qwriter_ip(ill, (connp != NULL ? CONNP_TO_WQ(connp) :
5006 ill->ill_wq), mp, ip_reprocess_ioctl, CUR_OP, B_TRUE);
5007 return;
5009 default:
5010 cmn_err(CE_PANIC, "ipif_ill_refrele_tail mp %p "
5011 "db_type %d\n", (void *)mp, mp->b_datap->db_type);
5013 return;
5014 unlock:
5015 mutex_exit(&ipsq->ipsq_lock);
5016 mutex_exit(&ipx->ipx_lock);
5017 mutex_exit(&ill->ill_lock);
5020 #ifdef DEBUG
5021 /* Reuse trace buffer from beginning (if reached the end) and record trace */
5022 static void
5023 th_trace_rrecord(th_trace_t *th_trace)
5025 tr_buf_t *tr_buf;
5026 uint_t lastref;
5028 lastref = th_trace->th_trace_lastref;
5029 lastref++;
5030 if (lastref == TR_BUF_MAX)
5031 lastref = 0;
5032 th_trace->th_trace_lastref = lastref;
5033 tr_buf = &th_trace->th_trbuf[lastref];
5034 tr_buf->tr_time = ddi_get_lbolt();
5035 tr_buf->tr_depth = getpcstack(tr_buf->tr_stack, TR_STACK_DEPTH);
5038 static void
5039 th_trace_free(void *value)
5041 th_trace_t *th_trace = value;
5043 ASSERT(th_trace->th_refcnt == 0);
5044 kmem_free(th_trace, sizeof (*th_trace));
5048 * Find or create the per-thread hash table used to track object references.
5049 * The ipst argument is NULL if we shouldn't allocate.
5051 * Accesses per-thread data, so there's no need to lock here.
5053 static mod_hash_t *
5054 th_trace_gethash(ip_stack_t *ipst)
5056 th_hash_t *thh;
5058 if ((thh = tsd_get(ip_thread_data)) == NULL && ipst != NULL) {
5059 mod_hash_t *mh;
5060 char name[256];
5061 size_t objsize, rshift;
5062 int retv;
5064 if ((thh = kmem_alloc(sizeof (*thh), KM_NOSLEEP)) == NULL)
5065 return (NULL);
5066 (void) snprintf(name, sizeof (name), "th_trace_%p",
5067 (void *)curthread);
5070 * We use mod_hash_create_extended here rather than the more
5071 * obvious mod_hash_create_ptrhash because the latter has a
5072 * hard-coded KM_SLEEP, and we'd prefer to fail rather than
5073 * block.
5075 objsize = MAX(MAX(sizeof (ill_t), sizeof (ipif_t)),
5076 MAX(sizeof (ire_t), sizeof (ncec_t)));
5077 rshift = highbit(objsize);
5078 mh = mod_hash_create_extended(name, 64, mod_hash_null_keydtor,
5079 th_trace_free, mod_hash_byptr, (void *)rshift,
5080 mod_hash_ptrkey_cmp, KM_NOSLEEP);
5081 if (mh == NULL) {
5082 kmem_free(thh, sizeof (*thh));
5083 return (NULL);
5085 thh->thh_hash = mh;
5086 thh->thh_ipst = ipst;
5088 * We trace ills, ipifs, ires, and nces. All of these are
5089 * per-IP-stack, so the lock on the thread list is as well.
5091 rw_enter(&ip_thread_rwlock, RW_WRITER);
5092 list_insert_tail(&ip_thread_list, thh);
5093 rw_exit(&ip_thread_rwlock);
5094 retv = tsd_set(ip_thread_data, thh);
5095 ASSERT(retv == 0);
5097 return (thh != NULL ? thh->thh_hash : NULL);
5100 boolean_t
5101 th_trace_ref(const void *obj, ip_stack_t *ipst)
5103 th_trace_t *th_trace;
5104 mod_hash_t *mh;
5105 mod_hash_val_t val;
5107 if ((mh = th_trace_gethash(ipst)) == NULL)
5108 return (B_FALSE);
5111 * Attempt to locate the trace buffer for this obj and thread.
5112 * If it does not exist, then allocate a new trace buffer and
5113 * insert into the hash.
5115 if (mod_hash_find(mh, (mod_hash_key_t)obj, &val) == MH_ERR_NOTFOUND) {
5116 th_trace = kmem_zalloc(sizeof (th_trace_t), KM_NOSLEEP);
5117 if (th_trace == NULL)
5118 return (B_FALSE);
5120 th_trace->th_id = curthread;
5121 if (mod_hash_insert(mh, (mod_hash_key_t)obj,
5122 (mod_hash_val_t)th_trace) != 0) {
5123 kmem_free(th_trace, sizeof (th_trace_t));
5124 return (B_FALSE);
5126 } else {
5127 th_trace = (th_trace_t *)val;
5130 ASSERT(th_trace->th_refcnt >= 0 &&
5131 th_trace->th_refcnt < TR_BUF_MAX - 1);
5133 th_trace->th_refcnt++;
5134 th_trace_rrecord(th_trace);
5135 return (B_TRUE);
5139 * For the purpose of tracing a reference release, we assume that global
5140 * tracing is always on and that the same thread that initiated the
5141 * reference hold is also releasing it.
5143 void
5144 th_trace_unref(const void *obj)
5146 int retv;
5147 mod_hash_t *mh;
5148 th_trace_t *th_trace;
5149 mod_hash_val_t val;
5151 mh = th_trace_gethash(NULL);
5152 retv = mod_hash_find(mh, (mod_hash_key_t)obj, &val);
5153 ASSERT(retv == 0);
5154 th_trace = (th_trace_t *)val;
5156 ASSERT(th_trace->th_refcnt > 0);
5157 th_trace->th_refcnt--;
5158 th_trace_rrecord(th_trace);
5162 * If tracing has been disabled, then we assume that the reference counts are
5163 * now useless, and we clear them out before destroying the entries.
5165 void
5166 th_trace_cleanup(const void *obj, boolean_t trace_disable)
5168 th_hash_t *thh;
5169 mod_hash_t *mh;
5170 mod_hash_val_t val;
5171 th_trace_t *th_trace;
5172 int retv;
5174 rw_enter(&ip_thread_rwlock, RW_READER);
5175 for (thh = list_head(&ip_thread_list); thh != NULL;
5176 thh = list_next(&ip_thread_list, thh)) {
5177 if (mod_hash_find(mh = thh->thh_hash, (mod_hash_key_t)obj,
5178 &val) == 0) {
5179 th_trace = (th_trace_t *)val;
5180 if (trace_disable)
5181 th_trace->th_refcnt = 0;
5182 retv = mod_hash_destroy(mh, (mod_hash_key_t)obj);
5183 ASSERT(retv == 0);
5186 rw_exit(&ip_thread_rwlock);
5189 void
5190 ipif_trace_ref(ipif_t *ipif)
5192 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
5194 if (ipif->ipif_trace_disable)
5195 return;
5197 if (!th_trace_ref(ipif, ipif->ipif_ill->ill_ipst)) {
5198 ipif->ipif_trace_disable = B_TRUE;
5199 ipif_trace_cleanup(ipif);
5203 void
5204 ipif_untrace_ref(ipif_t *ipif)
5206 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
5208 if (!ipif->ipif_trace_disable)
5209 th_trace_unref(ipif);
5212 void
5213 ill_trace_ref(ill_t *ill)
5215 ASSERT(MUTEX_HELD(&ill->ill_lock));
5217 if (ill->ill_trace_disable)
5218 return;
5220 if (!th_trace_ref(ill, ill->ill_ipst)) {
5221 ill->ill_trace_disable = B_TRUE;
5222 ill_trace_cleanup(ill);
5226 void
5227 ill_untrace_ref(ill_t *ill)
5229 ASSERT(MUTEX_HELD(&ill->ill_lock));
5231 if (!ill->ill_trace_disable)
5232 th_trace_unref(ill);
5236 * Called when ipif is unplumbed or when memory alloc fails. Note that on
5237 * failure, ipif_trace_disable is set.
5239 static void
5240 ipif_trace_cleanup(const ipif_t *ipif)
5242 th_trace_cleanup(ipif, ipif->ipif_trace_disable);
5246 * Called when ill is unplumbed or when memory alloc fails. Note that on
5247 * failure, ill_trace_disable is set.
5249 static void
5250 ill_trace_cleanup(const ill_t *ill)
5252 th_trace_cleanup(ill, ill->ill_trace_disable);
5254 #endif /* DEBUG */
5256 void
5257 ipif_refhold_locked(ipif_t *ipif)
5259 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
5260 ipif->ipif_refcnt++;
5261 IPIF_TRACE_REF(ipif);
5264 void
5265 ipif_refhold(ipif_t *ipif)
5267 ill_t *ill;
5269 ill = ipif->ipif_ill;
5270 mutex_enter(&ill->ill_lock);
5271 ipif->ipif_refcnt++;
5272 IPIF_TRACE_REF(ipif);
5273 mutex_exit(&ill->ill_lock);
5277 * Must not be called while holding any locks. Otherwise if this is
5278 * the last reference to be released there is a chance of recursive mutex
5279 * panic due to ipif_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying
5280 * to restart an ioctl.
5282 void
5283 ipif_refrele(ipif_t *ipif)
5285 ill_t *ill;
5287 ill = ipif->ipif_ill;
5289 mutex_enter(&ill->ill_lock);
5290 ASSERT(ipif->ipif_refcnt != 0);
5291 ipif->ipif_refcnt--;
5292 IPIF_UNTRACE_REF(ipif);
5293 if (ipif->ipif_refcnt != 0) {
5294 mutex_exit(&ill->ill_lock);
5295 return;
5298 /* Drops the ill_lock */
5299 ipif_ill_refrele_tail(ill);
5302 ipif_t *
5303 ipif_get_next_ipif(ipif_t *curr, ill_t *ill)
5305 ipif_t *ipif;
5307 mutex_enter(&ill->ill_lock);
5308 for (ipif = (curr == NULL ? ill->ill_ipif : curr->ipif_next);
5309 ipif != NULL; ipif = ipif->ipif_next) {
5310 if (IPIF_IS_CONDEMNED(ipif))
5311 continue;
5312 ipif_refhold_locked(ipif);
5313 mutex_exit(&ill->ill_lock);
5314 return (ipif);
5316 mutex_exit(&ill->ill_lock);
5317 return (NULL);
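
/*
 * Illustrative sketch (hypothetical helper): iterating the ipifs of an
 * ill with ipif_get_next_ipif(). Each call returns a held ipif, so the
 * current one is released only after the next has been obtained from
 * it.
 */
static uint_t
example_count_live_ipifs(ill_t *ill)
{
	ipif_t *ipif, *next;
	uint_t count = 0;

	for (ipif = ipif_get_next_ipif(NULL, ill); ipif != NULL;
	    ipif = next) {
		count++;
		next = ipif_get_next_ipif(ipif, ill);
		ipif_refrele(ipif);
	}
	return (count);
}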
5321 * TODO: make this table extensible at run time.
5322 * Return a pointer to the mac type info for 'mac_type'.
5324 static ip_m_t *
5325 ip_m_lookup(t_uscalar_t mac_type)
5327 ip_m_t *ipm;
5329 for (ipm = ip_m_tbl; ipm < A_END(ip_m_tbl); ipm++)
5330 if (ipm->ip_m_mac_type == mac_type)
5331 return (ipm);
5332 return (NULL);
5336 * Make a link layer address from the multicast IP address *addr.
5337 * To form the link layer address, invoke the ip_m_v*mapping function
5338 * associated with the link-layer type.
5340 void
5341 ip_mcast_mapping(ill_t *ill, uchar_t *addr, uchar_t *hwaddr)
5343 ip_m_t *ipm;
5345 if (ill->ill_net_type == IRE_IF_NORESOLVER)
5346 return;
5348 ASSERT(addr != NULL);
5350 ipm = ip_m_lookup(ill->ill_mactype);
5351 if (ipm == NULL ||
5352 (ill->ill_isv6 && ipm->ip_m_v6mapping == NULL) ||
5353 (!ill->ill_isv6 && ipm->ip_m_v4mapping == NULL)) {
5354 ip0dbg(("no mapping for ill %s mactype 0x%x\n",
5355 ill->ill_name, ill->ill_mactype));
5356 return;
5358 if (ill->ill_isv6)
5359 (*ipm->ip_m_v6mapping)(ill, addr, hwaddr);
5360 else
5361 (*ipm->ip_m_v4mapping)(ill, addr, hwaddr);
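
/*
 * Illustrative sketch (hypothetical, stand-alone demonstration): the
 * flavor of mapping an ip_m_v4mapping function performs for
 * Ethernet-like media (RFC 1112): 01:00:5e plus the low 23 bits of the
 * group address. The real per-media mapping functions live in the
 * ip_m_tbl entries.
 */
static void
example_v4_mcast_to_ether(ipaddr_t group, uchar_t hwaddr[6])
{
	uint32_t g = ntohl(group);

	hwaddr[0] = 0x01;
	hwaddr[1] = 0x00;
	hwaddr[2] = 0x5e;
	hwaddr[3] = (g >> 16) & 0x7f;	/* only 23 group bits survive */
	hwaddr[4] = (g >> 8) & 0xff;
	hwaddr[5] = g & 0xff;
}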
5365 * Returns B_FALSE if the IPv4 netmask pointed to by `mask' is
5366 * non-contiguous. Otherwise returns B_TRUE.
5368 * The netmask can be verified to be contiguous with 32 shift-and-OR
5369 * operations. Take the mask (in host byte order) and compute
5370 * mask | mask << 1 | mask << 2 | ... | mask << 31
5371 * the result will equal 'mask' exactly when the mask is contiguous.
5373 static boolean_t
5374 ip_contiguous_mask(uint32_t mask)
5376 uint32_t m = mask;
5377 int i;
5379 for (i = 1; i < 32; i++)
5380 m |= (mask << i);
5382 return (m == mask);
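
/*
 * Illustrative sketch (hypothetical examples) of the shift trick above,
 * in host byte order: for 0xffffff00 (255.255.255.0) every left shift
 * is a subset of the mask, so the OR equals the mask; for 0xff00ff00
 * (255.0.255.0) the shifts fill the 0x00ff0000 hole, so the OR differs
 * and the mask is rejected.
 */
static void
example_contiguous_mask(void)
{
	ASSERT(ip_contiguous_mask(0xffffff00));
	ASSERT(!ip_contiguous_mask(0xff00ff00));
}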
5386 * ip_rt_add is called to add an IPv4 route to the forwarding table.
5387 * ill is passed in to associate it with the correct interface.
5388 * If ire_arg is set, then we return the held IRE in that location.
5391 ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr,
5392 ipaddr_t src_addr, int flags, ill_t *ill, ire_t **ire_arg,
5393 boolean_t ioctl_msg, ip_stack_t *ipst, zoneid_t zoneid)
5395 ire_t *ire, *nire;
5396 ire_t *gw_ire = NULL;
5397 ipif_t *ipif = NULL;
5398 uint_t type;
5399 int match_flags = MATCH_IRE_TYPE;
5400 boolean_t unbound = B_FALSE;
5402 ip1dbg(("ip_rt_add:"));
5404 if (ire_arg != NULL)
5405 *ire_arg = NULL;
5407 /* disallow non-contiguous netmasks */
5408 if (!ip_contiguous_mask(ntohl(mask)))
5409 return (ENOTSUP);
5412 * If RTF_HOST is set, then we set the netmask to all ones
5413 * (regardless of whether one was supplied).
5415 if (flags & RTF_HOST)
5416 mask = IP_HOST_MASK;
5419 * Prevent routes with a zero gateway from being created (since
5420 * interfaces can currently be plumbed and brought up with no
5421 * assigned address).
5423 if (gw_addr == 0)
5424 return (ENETUNREACH);
5426 * Get the ipif, if any, corresponding to the gw_addr.
5427 * If -ifp was specified we restrict ourselves to the ill, otherwise
5428 * we match on the gateway and destination to handle unnumbered pt-pt
5429 * interfaces.
5431 if (ill != NULL)
5432 ipif = ipif_lookup_addr(gw_addr, ill, ALL_ZONES, ipst);
5433 else
5434 ipif = ipif_lookup_interface(gw_addr, dst_addr, ipst);
5435 if (ipif != NULL) {
5436 if (IS_VNI(ipif->ipif_ill)) {
5437 ipif_refrele(ipif);
5438 return (EINVAL);
5443 * GateD will attempt to create routes with a loopback interface
5444 * address as the gateway and with RTF_GATEWAY set. We allow
5445 * these routes to be added, but create them as interface routes
5446 * since the gateway is an interface address.
5448 if ((ipif != NULL) && (ipif->ipif_ire_type == IRE_LOOPBACK)) {
5449 flags &= ~RTF_GATEWAY;
5450 if (gw_addr == INADDR_LOOPBACK && dst_addr == INADDR_LOOPBACK &&
5451 mask == IP_HOST_MASK) {
5452 ire = ire_ftable_lookup_v4(dst_addr, 0, 0, IRE_LOOPBACK,
5453 NULL, ALL_ZONES, MATCH_IRE_TYPE, 0, ipst, NULL);
5454 if (ire != NULL) {
5455 ire_refrele(ire);
5456 ipif_refrele(ipif);
5457 return (EEXIST);
5459 ip1dbg(("ip_rt_add: 0x%p creating IRE 0x%x"
5460 "for 0x%x\n", (void *)ipif,
5461 ipif->ipif_ire_type,
5462 ntohl(ipif->ipif_lcl_addr)));
5463 ire = ire_create(
5464 (uchar_t *)&dst_addr, /* dest address */
5465 (uchar_t *)&mask, /* mask */
5466 NULL, /* no gateway */
5467 ipif->ipif_ire_type, /* LOOPBACK */
5468 ipif->ipif_ill,
5469 zoneid,
5470 (ipif->ipif_flags & IPIF_PRIVATE) ? RTF_PRIVATE : 0,
5471 ipst);
5473 if (ire == NULL) {
5474 ipif_refrele(ipif);
5475 return (ENOMEM);
5477 /* src address assigned by the caller? */
5478 if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC))
5479 ire->ire_setsrc_addr = src_addr;
5481 nire = ire_add(ire);
5482 if (nire == NULL) {
5484 * In the event of failure, ire_add() will have
5485 * already deleted the ire in question, so there
5486 * is no need to do that here.
5488 ipif_refrele(ipif);
5489 return (ENOMEM);
5492 * Check if it was a duplicate entry. This handles
5493 * the case of two racing route adds for the same route
5495 if (nire != ire) {
5496 ASSERT(nire->ire_identical_ref > 1);
5497 ire_delete(nire);
5498 ire_refrele(nire);
5499 ipif_refrele(ipif);
5500 return (EEXIST);
5502 ire = nire;
5503 goto save_ire;
5508 * Traditionally, interface routes are ones where RTF_GATEWAY isn't set
5509 * and the gateway address provided is one of the system's interface
5510 * addresses. By using the routing socket interface and supplying an
5511 * RTA_IFP sockaddr with an interface index, an alternate method of
5512 * specifying an interface route to be created is available which uses
5513 * the interface index that specifies the outgoing interface rather than
5514 * the address of an outgoing interface (which may not be able to
5515 * uniquely identify an interface). When coupled with the RTF_GATEWAY
5516 * flag, routes can be specified which not only specify the next-hop to
5517 * be used when routing to a certain prefix, but also which outgoing
5518 * interface should be used.
5520 * Previously, interfaces would have unique addresses assigned to them
5521 * and so the address assigned to a particular interface could be used
5522 * to identify a particular interface. One exception to this was the
5523 * case of an unnumbered interface (where IPIF_UNNUMBERED was set).
5525 * With the advent of IPv6 and its link-local addresses, this
5526 * restriction was relaxed and interfaces could share addresses between
5527 * themselves. In fact, typically all of the link-local interfaces on
5528 * an IPv6 node or router will have the same link-local address. In
5529 * order to differentiate between these interfaces, the use of an
5530 * interface index is necessary and this index can be carried inside a
5531 * RTA_IFP sockaddr (which is actually a sockaddr_dl). One restriction
5532 * of using the interface index, however, is that all of the ipif's that
5533 * are part of an ill have the same index and so the RTA_IFP sockaddr
5534 * cannot be used to differentiate between ipif's (or logical
5535 * interfaces) that belong to the same ill (physical interface).
5537 * For example, in the following case involving IPv4 interfaces and
5538 * logical interfaces
5540 * 192.0.2.32 255.255.255.224 192.0.2.33 U if0
5541 * 192.0.2.32 255.255.255.224 192.0.2.34 U if0
5542 * 192.0.2.32 255.255.255.224 192.0.2.35 U if0
5544 * the ipif's corresponding to each of these interface routes can be
5545 * uniquely identified by the "gateway" (actually interface address).
5547 * In this case involving multiple IPv6 default routes to a particular
5548 * link-local gateway, the use of RTA_IFP is necessary to specify which
5549 * default route is of interest:
5551 * default fe80::123:4567:89ab:cdef U if0
5552 * default fe80::123:4567:89ab:cdef U if1
5555 /* RTF_GATEWAY not set */
5556 if (!(flags & RTF_GATEWAY)) {
5558 * Whether or not ill (RTA_IFP) is set, we require that
5559 * the gateway is one of our local addresses.
5561 if (ipif == NULL)
5562 return (ENETUNREACH);
5565 * We use MATCH_IRE_ILL here. If the caller specified an
5566 * interface (from the RTA_IFP sockaddr) we use it, otherwise
5567 * we use the ill derived from the gateway address.
5568 * We can always match the gateway address since we record it
5569 * in ire_gateway_addr.
5570 * We don't allow RTA_IFP to specify a different ill than the
5571 * one matching the ipif to make sure we can delete the route.
5573 match_flags |= MATCH_IRE_GW | MATCH_IRE_ILL;
5574 if (ill == NULL) {
5575 ill = ipif->ipif_ill;
5576 } else if (ill != ipif->ipif_ill) {
5577 ipif_refrele(ipif);
5578 return (EINVAL);
5582 * We check for an existing entry at this point.
5584 * Since a netmask isn't passed in via the ioctl interface
5585 * (SIOCADDRT), we don't check for a matching netmask in that
5586 * case.
5588 if (!ioctl_msg)
5589 match_flags |= MATCH_IRE_MASK;
5590 ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr,
5591 IRE_INTERFACE, ill, ALL_ZONES, match_flags, 0, ipst, NULL);
5592 if (ire != NULL) {
5593 ire_refrele(ire);
5594 ipif_refrele(ipif);
5595 return (EEXIST);
5599 * Some software (for example, GateD and Sun Cluster) attempts
5600 * to create (what amount to) IRE_PREFIX routes with the
5601 * loopback address as the gateway. This is primarily done to
5602 * set up prefixes with the RTF_REJECT flag set (for example,
5603 * when generating aggregate routes.)
5605 * If the IRE type (as defined by ill->ill_net_type) would be
5606 * IRE_LOOPBACK, then we map the request into a
5607 * IRE_IF_NORESOLVER. We also OR in the RTF_BLACKHOLE flag as
5608 * these interface routes, by definition, can only be that.
5610 * Needless to say, the real IRE_LOOPBACK is NOT created by this
5611 * routine, but rather using ire_create() directly.
5614 type = ill->ill_net_type;
5615 if (type == IRE_LOOPBACK) {
5616 type = IRE_IF_NORESOLVER;
5617 flags |= RTF_BLACKHOLE;
5621 * Create a copy of the IRE_IF_NORESOLVER or
5622 * IRE_IF_RESOLVER with the modified address, netmask, and
5623 * gateway.
5625 ire = ire_create(
5626 (uchar_t *)&dst_addr,
5627 (uint8_t *)&mask,
5628 (uint8_t *)&gw_addr,
5629 type,
5630 ill,
5631 zoneid,
5632 flags,
5633 ipst);
5634 if (ire == NULL) {
5635 ipif_refrele(ipif);
5636 return (ENOMEM);
5639 /* src address assigned by the caller? */
5640 if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC))
5641 ire->ire_setsrc_addr = src_addr;
5643 nire = ire_add(ire);
5644 if (nire == NULL) {
5646 * In the event of failure, ire_add() will have
5647 * already deleted the ire in question, so there
5648 * is no need to do that here.
5650 ipif_refrele(ipif);
5651 return (ENOMEM);
5654 * Check if it was a duplicate entry. This handles
5655 * the case of two racing route adds for the same route
5657 if (nire != ire) {
5658 ire_delete(nire);
5659 ire_refrele(nire);
5660 ipif_refrele(ipif);
5661 return (EEXIST);
5663 ire = nire;
5664 goto save_ire;
5668 * Get an interface IRE for the specified gateway.
5669 * If we don't have an IRE_IF_NORESOLVER or IRE_IF_RESOLVER for the
5670 * gateway, it is currently unreachable and we fail the request
5671 * accordingly. We reject any RTF_GATEWAY routes where the gateway
5672 * is an IRE_LOCAL or IRE_LOOPBACK.
5673 * If RTA_IFP was specified we look on that particular ill.
5675 if (ill != NULL)
5676 match_flags |= MATCH_IRE_ILL;
5678 /* Check whether the gateway is reachable. */
5679 again:
5680 type = IRE_INTERFACE | IRE_LOCAL | IRE_LOOPBACK;
5681 if (flags & RTF_INDIRECT)
5682 type |= IRE_OFFLINK;
5684 gw_ire = ire_ftable_lookup_v4(gw_addr, 0, 0, type, ill,
5685 ALL_ZONES, match_flags, 0, ipst, NULL);
5686 if (gw_ire == NULL) {
5688 * With IPMP, we allow host routes to influence in.mpathd's
5689 * target selection. However, if the test addresses are on
5690 * their own network, the above lookup will fail since the
5691 * underlying IRE_INTERFACEs are marked hidden. So allow
5692 * hidden test IREs to be found and try again.
5694 if (!(match_flags & MATCH_IRE_TESTHIDDEN)) {
5695 match_flags |= MATCH_IRE_TESTHIDDEN;
5696 goto again;
5698 if (ipif != NULL)
5699 ipif_refrele(ipif);
5700 return (ENETUNREACH);
5702 if (gw_ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK)) {
5703 ire_refrele(gw_ire);
5704 if (ipif != NULL)
5705 ipif_refrele(ipif);
5706 return (ENETUNREACH);
5709 if (ill == NULL && !(flags & RTF_INDIRECT)) {
5710 unbound = B_TRUE;
5711 if (ipst->ips_ip_strict_src_multihoming > 0)
5712 ill = gw_ire->ire_ill;
5716 * We create one of three types of IREs as a result of this request
5717 * based on the netmask. A netmask of all ones (which is automatically
5718 * assumed when RTF_HOST is set) results in an IRE_HOST being created.
5719 * An all zeroes netmask implies a default route so an IRE_DEFAULT is
5720 * created. Otherwise, an IRE_PREFIX route is created for the
5721 * destination prefix.
5723 if (mask == IP_HOST_MASK)
5724 type = IRE_HOST;
5725 else if (mask == 0)
5726 type = IRE_DEFAULT;
5727 else
5728 type = IRE_PREFIX;
5730 /* check for a duplicate entry */
5731 ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, type, ill,
5732 ALL_ZONES, match_flags | MATCH_IRE_MASK | MATCH_IRE_GW, 0, ipst,
5733 NULL);
5734 if (ire != NULL) {
5735 if (ipif != NULL)
5736 ipif_refrele(ipif);
5737 ire_refrele(gw_ire);
5738 ire_refrele(ire);
5739 return (EEXIST);
5742 /* Create the IRE. */
5743 ire = ire_create(
5744 (uchar_t *)&dst_addr, /* dest address */
5745 (uchar_t *)&mask, /* mask */
5746 (uchar_t *)&gw_addr, /* gateway address */
5747 (ushort_t)type, /* IRE type */
5748 ill,
5749 zoneid,
5750 flags,
5751 ipst);
5753 if (ire == NULL) {
5754 if (ipif != NULL)
5755 ipif_refrele(ipif);
5756 ire_refrele(gw_ire);
5757 return (ENOMEM);
5760 /* src address assigned by the caller? */
5761 if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC))
5762 ire->ire_setsrc_addr = src_addr;
5764 ire->ire_unbound = unbound;
5767 * POLICY: should we allow an RTF_HOST with address INADDR_ANY?
5768 * SUN/OS socket stuff does but do we really want to allow 0.0.0.0?
5771 /* Add the new IRE. */
5772 nire = ire_add(ire);
5773 if (nire == NULL) {
5775 * In the event of failure, ire_add() will have
5776 * already deleted the ire in question, so there
5777 * is no need to do that here.
5779 if (ipif != NULL)
5780 ipif_refrele(ipif);
5781 ire_refrele(gw_ire);
5782 return (ENOMEM);
5785 * Check if it was a duplicate entry. This handles
5786 * the case of two racing route adds for the same route
5788 if (nire != ire) {
5789 ire_delete(nire);
5790 ire_refrele(nire);
5791 if (ipif != NULL)
5792 ipif_refrele(ipif);
5793 ire_refrele(gw_ire);
5794 return (EEXIST);
5796 ire = nire;
5798 save_ire:
5799 if (gw_ire != NULL) {
5800 ire_refrele(gw_ire);
5801 gw_ire = NULL;
5803 if (ill != NULL) {
5805 * Save enough information so that we can recreate the IRE if
5806 * the interface goes down and then up. The metrics associated
5807 * with the route will be saved as well when rts_setmetrics() is
5808 * called after the IRE has been created. In the case where
5809 * memory cannot be allocated, none of this information will be
5810 * saved.
5812 ill_save_ire(ill, ire);
5814 if (ioctl_msg)
5815 ip_rts_rtmsg(RTM_OLDADD, ire, 0, ipst);
5816 if (ire_arg != NULL) {
5818 * Store the ire that was successfully added into where ire_arg
5819 * points to so that callers don't have to look it up
5820 * themselves (but they are responsible for ire_refrele()ing
5821 * the ire when they are finished with it).
5823 *ire_arg = ire;
5824 } else {
5825 ire_refrele(ire); /* Held in ire_add */
5827 if (ipif != NULL)
5828 ipif_refrele(ipif);
5829 return (0);
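
/*
 * Illustrative sketch (hypothetical helper; the addresses are examples):
 * adding a default route through a gateway, roughly what a
 * routing-socket RTM_ADD for "default via 192.0.2.1" reduces to. An
 * all-zeroes mask selects IRE_DEFAULT above.
 */
static int
example_add_default_route(ip_stack_t *ipst, zoneid_t zoneid)
{
	ipaddr_t dst = INADDR_ANY;		/* 0.0.0.0 */
	ipaddr_t mask = 0;			/* all zeroes: default route */
	ipaddr_t gw = htonl(0xc0000201);	/* 192.0.2.1 */

	return (ip_rt_add(dst, mask, gw, INADDR_ANY, RTF_GATEWAY | RTF_UP,
	    NULL, NULL, B_FALSE, ipst, zoneid));
}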
5833 * ip_rt_delete is called to delete an IPv4 route.
5834 * ill is passed in to associate it with the correct interface.
5836 /* ARGSUSED4 */
5838 ip_rt_delete(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr,
5839 uint_t rtm_addrs, int flags, ill_t *ill, boolean_t ioctl_msg,
5840 ip_stack_t *ipst, zoneid_t zoneid)
5842 ire_t *ire = NULL;
5843 ipif_t *ipif;
5844 uint_t type;
5845 uint_t match_flags = MATCH_IRE_TYPE;
5846 int err = 0;
5848 ip1dbg(("ip_rt_delete:"));
5850 * If RTF_HOST is set, then we set the netmask to all ones.
5851 * Otherwise, we use the netmask if one was supplied.
5853 if (flags & RTF_HOST) {
5854 mask = IP_HOST_MASK;
5855 match_flags |= MATCH_IRE_MASK;
5856 } else if (rtm_addrs & RTA_NETMASK) {
5857 match_flags |= MATCH_IRE_MASK;
5861 * Note that RTF_GATEWAY is never set on a delete, therefore
5862 * we check if the gateway address is one of our interfaces first,
5863 * and fall back on RTF_GATEWAY routes.
5865 * This makes it possible to delete an original
5866 * IRE_IF_NORESOLVER/IRE_IF_RESOLVER - consistent with SunOS 4.1.
5867 * However, we have RTF_KERNEL set on the ones created by ipif_up
5868 * and those cannot be deleted here.
5870 * We use MATCH_IRE_ILL if we know the interface. If the caller
5871 * specified an interface (from the RTA_IFP sockaddr) we use it,
5872 * otherwise we use the ill derived from the gateway address.
5873 * We can always match the gateway address since we record it
5874 * in ire_gateway_addr.
5876 * For more detail on specifying routes by gateway address and by
5877 * interface index, see the comments in ip_rt_add().
5879 ipif = ipif_lookup_interface(gw_addr, dst_addr, ipst);
5880 if (ipif != NULL) {
5881 ill_t *ill_match;
5883 if (ill != NULL)
5884 ill_match = ill;
5885 else
5886 ill_match = ipif->ipif_ill;
5888 match_flags |= MATCH_IRE_ILL;
5889 if (ipif->ipif_ire_type == IRE_LOOPBACK) {
5890 ire = ire_ftable_lookup_v4(dst_addr, mask, 0,
5891 IRE_LOOPBACK, ill_match, ALL_ZONES, match_flags, 0,
5892 ipst, NULL);
5894 if (ire == NULL) {
5895 match_flags |= MATCH_IRE_GW;
5896 ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr,
5897 IRE_INTERFACE, ill_match, ALL_ZONES, match_flags,
5898 0, ipst, NULL);
5900 /* Avoid deleting routes created by kernel from an ipif */
5901 if (ire != NULL && (ire->ire_flags & RTF_KERNEL)) {
5902 ire_refrele(ire);
5903 ire = NULL;
5906 /* Restore in case we didn't find a match */
5907 match_flags &= ~(MATCH_IRE_GW|MATCH_IRE_ILL);
5910 if (ire == NULL) {
5912 * At this point, the gateway address is not one of our own
5913 * addresses or a matching interface route was not found. We
5914 * set the IRE type to lookup based on whether
5915 * this is a host route, a default route or just a prefix.
5917 * If an ill was passed in, then the lookup is based on an
5918 * interface index so MATCH_IRE_ILL is added to match_flags.
5920 match_flags |= MATCH_IRE_GW;
5921 if (ill != NULL)
5922 match_flags |= MATCH_IRE_ILL;
5923 if (mask == IP_HOST_MASK)
5924 type = IRE_HOST;
5925 else if (mask == 0)
5926 type = IRE_DEFAULT;
5927 else
5928 type = IRE_PREFIX;
5929 ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, type, ill,
5930 ALL_ZONES, match_flags, 0, ipst, NULL);
5933 if (ipif != NULL) {
5934 ipif_refrele(ipif);
5935 ipif = NULL;
5938 if (ire == NULL)
5939 return (ESRCH);
5941 ill = ire->ire_ill;
5942 if (ill != NULL)
5943 ill_remove_saved_ire(ill, ire);
5944 if (ioctl_msg)
5945 ip_rts_rtmsg(RTM_OLDDEL, ire, 0, ipst);
5946 ire_delete(ire);
5947 ire_refrele(ire);
5948 return (err);
5949 }
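/*
 * Illustrative sketch (not part of this file): given the mask-to-type
 * mapping above, deleting "default" (mask 0) looks up an IRE_DEFAULT,
 * a host mask looks up an IRE_HOST, and anything else an IRE_PREFIX.
 * A hypothetical in-kernel caller removing a host route might do:
 *
 *	err = ip_rt_delete(dst, IP_HOST_MASK, gw,
 *	    RTA_DST | RTA_GATEWAY, RTF_HOST, NULL,
 *	    B_FALSE, ipst, ALL_ZONES);
 *
 * where dst, gw, and ipst are assumed to be in scope.
 */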
5950
5951 /*
5952 * ip_siocaddrt is called to complete processing of an SIOCADDRT IOCTL.
5953 */
5954 /* ARGSUSED */
5955 int
5956 ip_siocaddrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
5957 ip_ioctl_cmd_t *ipip, void *dummy_if_req)
5958 {
5959 ipaddr_t dst_addr;
5960 ipaddr_t gw_addr;
5961 ipaddr_t mask;
5962 int error = 0;
5963 mblk_t *mp1;
5964 struct rtentry *rt;
5965 ipif_t *ipif = NULL;
5966 ip_stack_t *ipst;
5968 ASSERT(q->q_next == NULL);
5969 ipst = CONNQ_TO_IPST(q);
5971 ip1dbg(("ip_siocaddrt:"));
5972 /* Existence of mp1 verified in ip_wput_nondata */
5973 mp1 = mp->b_cont->b_cont;
5974 rt = (struct rtentry *)mp1->b_rptr;
5976 dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr;
5977 gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr;
5980 * If the RTF_HOST flag is on, this is a request to assign a gateway
5981 * to a particular host address. In this case, we set the netmask to
5982 * all ones for the particular destination address. Otherwise,
5983 * determine the netmask to be used based on dst_addr and the interfaces
5984 * in use.
5986 if (rt->rt_flags & RTF_HOST) {
5987 mask = IP_HOST_MASK;
5988 } else {
5990 * Note that ip_subnet_mask returns a zero mask in the case of
5991 * default (an all-zeroes address).
5993 mask = ip_subnet_mask(dst_addr, &ipif, ipst);
5996 error = ip_rt_add(dst_addr, mask, gw_addr, 0, rt->rt_flags, NULL, NULL,
5997 B_TRUE, ipst, ALL_ZONES);
5998 if (ipif != NULL)
5999 ipif_refrele(ipif);
6000 return (error);
6001 }
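/*
 * Illustrative sketch (not part of this file): a user process reaches
 * ip_siocaddrt() via an SIOCADDRT ioctl on an AF_INET socket, e.g.:
 *
 *	struct rtentry rt;
 *	struct sockaddr_in *sin;
 *
 *	bzero(&rt, sizeof (rt));
 *	sin = (struct sockaddr_in *)&rt.rt_dst;
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = inet_addr("192.0.2.0");	(example prefix)
 *	sin = (struct sockaddr_in *)&rt.rt_gateway;
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = inet_addr("192.0.2.1");	(example gateway)
 *	rt.rt_flags = RTF_UP | RTF_GATEWAY;
 *	(void) ioctl(s, SIOCADDRT, &rt);	(s: AF_INET socket fd)
 *
 * Since RTF_HOST is not set, the netmask is derived via ip_subnet_mask().
 */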
6002
6003 /*
6004 * ip_siocdelrt is called to complete processing of an SIOCDELRT IOCTL.
6005 */
6006 /* ARGSUSED */
6007 int
6008 ip_siocdelrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
6009 ip_ioctl_cmd_t *ipip, void *dummy_if_req)
6010 {
6011 ipaddr_t dst_addr;
6012 ipaddr_t gw_addr;
6013 ipaddr_t mask;
6014 int error;
6015 mblk_t *mp1;
6016 struct rtentry *rt;
6017 ipif_t *ipif = NULL;
6018 ip_stack_t *ipst;
6020 ASSERT(q->q_next == NULL);
6021 ipst = CONNQ_TO_IPST(q);
6023 ip1dbg(("ip_siocdelrt:"));
6024 /* Existence of mp1 verified in ip_wput_nondata */
6025 mp1 = mp->b_cont->b_cont;
6026 rt = (struct rtentry *)mp1->b_rptr;
6028 dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr;
6029 gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr;
6032 * If the RTF_HOST flag is on, this is a request to delete a gateway
6033 * to a particular host address. In this case, we set the netmask to
6034 * all ones for the particular destination address. Otherwise,
6035 * determine the netmask to be used based on dst_addr and the interfaces
6036 * in use.
6038 if (rt->rt_flags & RTF_HOST) {
6039 mask = IP_HOST_MASK;
6040 } else {
6042 * Note that ip_subnet_mask returns a zero mask in the case of
6043 * default (an all-zeroes address).
6045 mask = ip_subnet_mask(dst_addr, &ipif, ipst);
6048 error = ip_rt_delete(dst_addr, mask, gw_addr,
6049 RTA_DST | RTA_GATEWAY | RTA_NETMASK, rt->rt_flags, NULL, B_TRUE,
6050 ipst, ALL_ZONES);
6051 if (ipif != NULL)
6052 ipif_refrele(ipif);
6053 return (error);
6057 * Enqueue the mp onto the ipsq, chained by b_next.
6058 * b_prev stores the function to be executed later, and b_queue the queue
6059 * where this mp originated.
6061 void
6062 ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type,
6063 ill_t *pending_ill)
6065 conn_t *connp;
6066 ipxop_t *ipx = ipsq->ipsq_xop;
6068 ASSERT(MUTEX_HELD(&ipsq->ipsq_lock));
6069 ASSERT(MUTEX_HELD(&ipx->ipx_lock));
6070 ASSERT(func != NULL);
6072 mp->b_queue = q;
6073 mp->b_prev = (void *)func;
6074 mp->b_next = NULL;
6076 switch (type) {
6077 case CUR_OP:
6078 if (ipx->ipx_mptail != NULL) {
6079 ASSERT(ipx->ipx_mphead != NULL);
6080 ipx->ipx_mptail->b_next = mp;
6081 } else {
6082 ASSERT(ipx->ipx_mphead == NULL);
6083 ipx->ipx_mphead = mp;
6085 ipx->ipx_mptail = mp;
6086 break;
6088 case NEW_OP:
6089 if (ipsq->ipsq_xopq_mptail != NULL) {
6090 ASSERT(ipsq->ipsq_xopq_mphead != NULL);
6091 ipsq->ipsq_xopq_mptail->b_next = mp;
6092 } else {
6093 ASSERT(ipsq->ipsq_xopq_mphead == NULL);
6094 ipsq->ipsq_xopq_mphead = mp;
6096 ipsq->ipsq_xopq_mptail = mp;
6097 ipx->ipx_ipsq_queued = B_TRUE;
6098 break;
6100 case SWITCH_OP:
6101 ASSERT(ipsq->ipsq_swxop != NULL);
6102 /* only one switch operation is currently allowed */
6103 ASSERT(ipsq->ipsq_switch_mp == NULL);
6104 ipsq->ipsq_switch_mp = mp;
6105 ipx->ipx_ipsq_queued = B_TRUE;
6106 break;
6107 default:
6108 cmn_err(CE_PANIC, "ipsq_enq: unknown type %d\n", type);
6111 if (CONN_Q(q) && pending_ill != NULL) {
6112 connp = Q_TO_CONN(q);
6113 ASSERT(MUTEX_HELD(&connp->conn_lock));
6114 connp->conn_oper_pending_ill = pending_ill;
6115 }
6116 }
6117
6118 /*
6119 * Dequeue the next message that requested exclusive access to this IPSQ's
6120 * xop. Specifically:
6122 * 1. If we're still processing the current operation on `ipsq', then
6123 * dequeue the next message for the operation (from ipx_mphead), or
6124 * return NULL if there are no queued messages for the operation.
6125 * These messages are queued via CUR_OP to qwriter_ip() and friends.
6127 * 2. If the current operation on `ipsq' has completed (ipx_current_ipif is
6128 * not set) see if the ipsq has requested an xop switch. If so, switch
6129 * `ipsq' to a different xop. Xop switches only happen when joining or
6130 * leaving IPMP groups and require a careful dance -- see the comments
6131 * in-line below for details. If we're leaving a group xop or if we're
6132 * joining a group xop and become writer on it, then we proceed to (3).
6133 * Otherwise, we return NULL and exit the xop.
6135 * 3. For each IPSQ in the xop, return any switch operation stored on
6136 * ipsq_switch_mp (set via SWITCH_OP); these must be processed before
6137 * any other messages queued on the IPSQ. Otherwise, dequeue the next
6138 * exclusive operation (queued via NEW_OP) stored on ipsq_xopq_mphead.
6139 * Note that if the phyint tied to `ipsq' is not using IPMP there will
6140 * only be one IPSQ in the xop. Otherwise, there will be one IPSQ for
6141 * each phyint in the group, including the IPMP meta-interface phyint.
6142 */
6143 static mblk_t *
6144 ipsq_dq(ipsq_t *ipsq)
6146 ill_t *illv4, *illv6;
6147 mblk_t *mp;
6148 ipsq_t *xopipsq;
6149 ipsq_t *leftipsq = NULL;
6150 ipxop_t *ipx;
6151 phyint_t *phyi = ipsq->ipsq_phyint;
6152 ip_stack_t *ipst = ipsq->ipsq_ipst;
6153 boolean_t emptied = B_FALSE;
6156 * Grab all the locks we need in the defined order (ill_g_lock ->
6157 * ipsq_lock -> ipx_lock); ill_g_lock is needed to use ipsq_next.
6159 rw_enter(&ipst->ips_ill_g_lock,
6160 ipsq->ipsq_swxop != NULL ? RW_WRITER : RW_READER);
6161 mutex_enter(&ipsq->ipsq_lock);
6162 ipx = ipsq->ipsq_xop;
6163 mutex_enter(&ipx->ipx_lock);
6166 * Dequeue the next message associated with the current exclusive
6167 * operation, if any.
6169 if ((mp = ipx->ipx_mphead) != NULL) {
6170 ipx->ipx_mphead = mp->b_next;
6171 if (ipx->ipx_mphead == NULL)
6172 ipx->ipx_mptail = NULL;
6173 mp->b_next = (void *)ipsq;
6174 goto out;
6177 if (ipx->ipx_current_ipif != NULL)
6178 goto empty;
6180 if (ipsq->ipsq_swxop != NULL) {
6182 * The exclusive operation that is now being completed has
6183 * requested a switch to a different xop. This happens
6184 * when an interface joins or leaves an IPMP group. Joins
6185 * happen through SIOCSLIFGROUPNAME (ip_sioctl_groupname()).
6186 * Leaves happen via SIOCSLIFGROUPNAME, interface unplumb
6187 * (phyint_free()), or interface plumb for an ill type
6188 * not in the IPMP group (ip_rput_dlpi_writer()).
6190 * Xop switches are not allowed on the IPMP meta-interface.
6192 ASSERT(phyi == NULL || !(phyi->phyint_flags & PHYI_IPMP));
6193 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
6194 DTRACE_PROBE1(ipsq__switch, (ipsq_t *), ipsq);
6196 if (ipsq->ipsq_swxop == &ipsq->ipsq_ownxop) {
6198 * We're switching back to our own xop, so we have two
6199 * xop's to drain/exit: our own, and the group xop
6200 * that we are leaving.
6202 * First, pull ourselves out of the group ipsq list.
6203 * This is safe since we're writer on ill_g_lock.
6205 ASSERT(ipsq->ipsq_xop != &ipsq->ipsq_ownxop);
6207 xopipsq = ipx->ipx_ipsq;
6208 while (xopipsq->ipsq_next != ipsq)
6209 xopipsq = xopipsq->ipsq_next;
6211 xopipsq->ipsq_next = ipsq->ipsq_next;
6212 ipsq->ipsq_next = ipsq;
6213 ipsq->ipsq_xop = ipsq->ipsq_swxop;
6214 ipsq->ipsq_swxop = NULL;
6217 * Second, prepare to exit the group xop. The actual
6218 * ipsq_exit() is done at the end of this function
6219 * since we cannot hold any locks across ipsq_exit().
6220 * Note that although we drop the group's ipx_lock, no
6221 * threads can proceed since we're still ipx_writer.
6223 leftipsq = xopipsq;
6224 mutex_exit(&ipx->ipx_lock);
6227 * Third, set ipx to point to our own xop (which was
6228 * inactive and therefore can be entered).
6230 ipx = ipsq->ipsq_xop;
6231 mutex_enter(&ipx->ipx_lock);
6232 ASSERT(ipx->ipx_writer == NULL);
6233 ASSERT(ipx->ipx_current_ipif == NULL);
6234 } else {
6236 * We're switching from our own xop to a group xop.
6237 * The requestor of the switch must ensure that the
6238 * group xop cannot go away (e.g. by ensuring the
6239 * phyint associated with the xop cannot go away).
6241 * If we can become writer on our new xop, then we'll
6242 * do the drain. Otherwise, the current writer of our
6243 * new xop will do the drain when it exits.
6245 * First, splice ourselves into the group IPSQ list.
6246 * This is safe since we're writer on ill_g_lock.
6248 ASSERT(ipsq->ipsq_xop == &ipsq->ipsq_ownxop);
6250 xopipsq = ipsq->ipsq_swxop->ipx_ipsq;
6251 while (xopipsq->ipsq_next != ipsq->ipsq_swxop->ipx_ipsq)
6252 xopipsq = xopipsq->ipsq_next;
6254 xopipsq->ipsq_next = ipsq;
6255 ipsq->ipsq_next = ipsq->ipsq_swxop->ipx_ipsq;
6256 ipsq->ipsq_xop = ipsq->ipsq_swxop;
6257 ipsq->ipsq_swxop = NULL;
6260 * Second, exit our own xop, since it's now unused.
6261 * This is safe since we've got the only reference.
6263 ASSERT(ipx->ipx_writer == curthread);
6264 ipx->ipx_writer = NULL;
6265 VERIFY(--ipx->ipx_reentry_cnt == 0);
6266 ipx->ipx_ipsq_queued = B_FALSE;
6267 mutex_exit(&ipx->ipx_lock);
6270 * Third, set ipx to point to our new xop, and check
6271 * if we can become writer on it. If we cannot, then
6272 * the current writer will drain the IPSQ group when
6273 * it exits. Our ipsq_xop is guaranteed to be stable
6274 * because we're still holding ipsq_lock.
6276 ipx = ipsq->ipsq_xop;
6277 mutex_enter(&ipx->ipx_lock);
6278 if (ipx->ipx_writer != NULL ||
6279 ipx->ipx_current_ipif != NULL) {
6280 goto out;
6285 * Fourth, become writer on our new ipx before we continue
6286 * with the drain. Note that we never dropped ipsq_lock
6287 * above, so no other thread could've raced with us to
6288 * become writer first. Also, we're holding ipx_lock, so
6289 * no other thread can examine the ipx right now.
6291 ASSERT(ipx->ipx_current_ipif == NULL);
6292 ASSERT(ipx->ipx_mphead == NULL && ipx->ipx_mptail == NULL);
6293 VERIFY(ipx->ipx_reentry_cnt++ == 0);
6294 ipx->ipx_writer = curthread;
6295 ipx->ipx_forced = B_FALSE;
6296 #ifdef DEBUG
6297 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH);
6298 #endif
6301 xopipsq = ipsq;
6302 do {
6304 * So that other operations operate on a consistent and
6305 * complete phyint, a switch message on an IPSQ must be
6306 * handled prior to any other operations on that IPSQ.
6308 if ((mp = xopipsq->ipsq_switch_mp) != NULL) {
6309 xopipsq->ipsq_switch_mp = NULL;
6310 ASSERT(mp->b_next == NULL);
6311 mp->b_next = (void *)xopipsq;
6312 goto out;
6315 if ((mp = xopipsq->ipsq_xopq_mphead) != NULL) {
6316 xopipsq->ipsq_xopq_mphead = mp->b_next;
6317 if (xopipsq->ipsq_xopq_mphead == NULL)
6318 xopipsq->ipsq_xopq_mptail = NULL;
6319 mp->b_next = (void *)xopipsq;
6320 goto out;
6322 } while ((xopipsq = xopipsq->ipsq_next) != ipsq);
6323 empty:
6325 * There are no messages. Further, we are holding ipx_lock, hence no
6326 * new messages can end up on any IPSQ in the xop.
6328 ipx->ipx_writer = NULL;
6329 ipx->ipx_forced = B_FALSE;
6330 VERIFY(--ipx->ipx_reentry_cnt == 0);
6331 ipx->ipx_ipsq_queued = B_FALSE;
6332 emptied = B_TRUE;
6333 #ifdef DEBUG
6334 ipx->ipx_depth = 0;
6335 #endif
6336 out:
6337 mutex_exit(&ipx->ipx_lock);
6338 mutex_exit(&ipsq->ipsq_lock);
6341 * If we completely emptied the xop, then wake up any threads waiting
6342 * to enter any of the IPSQ's associated with it.
6344 if (emptied) {
6345 xopipsq = ipsq;
6346 do {
6347 if ((phyi = xopipsq->ipsq_phyint) == NULL)
6348 continue;
6350 illv4 = phyi->phyint_illv4;
6351 illv6 = phyi->phyint_illv6;
6353 GRAB_ILL_LOCKS(illv4, illv6);
6354 if (illv4 != NULL)
6355 cv_broadcast(&illv4->ill_cv);
6356 if (illv6 != NULL)
6357 cv_broadcast(&illv6->ill_cv);
6358 RELEASE_ILL_LOCKS(illv4, illv6);
6359 } while ((xopipsq = xopipsq->ipsq_next) != ipsq);
6361 rw_exit(&ipst->ips_ill_g_lock);
6364 * Now that all locks are dropped, exit the IPSQ we left.
6366 if (leftipsq != NULL)
6367 ipsq_exit(leftipsq);
6369 return (mp);
6370 }
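/*
 * Illustrative summary (not part of this file) of the dequeue priority
 * implemented above, for a given xop:
 *
 *	1. ipx_mphead		messages for the current operation (CUR_OP)
 *	2. ipsq_switch_mp	a pending xop switch, per IPSQ (SWITCH_OP)
 *	3. ipsq_xopq_mphead	the next exclusive operation, per IPSQ (NEW_OP)
 *
 * with (2) and (3) scanned across every IPSQ sharing the xop.
 */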
6371
6372 /*
6373 * Return completion status of previously initiated DLPI operations on
6374 * ills in the purview of an ipsq.
6375 */
6376 static boolean_t
6377 ipsq_dlpi_done(ipsq_t *ipsq)
6379 ipsq_t *ipsq_start;
6380 phyint_t *phyi;
6381 ill_t *ill;
6383 ASSERT(RW_LOCK_HELD(&ipsq->ipsq_ipst->ips_ill_g_lock));
6384 ipsq_start = ipsq;
6386 do {
6388 * The only current users of this function are ipsq_try_enter
6389 * and ipsq_enter which have made sure that ipsq_writer is
6390 * NULL before we reach here. ill_dlpi_pending is modified
6391 * only by an ipsq writer.
6393 ASSERT(ipsq->ipsq_xop->ipx_writer == NULL);
6394 phyi = ipsq->ipsq_phyint;
6396 * phyi could be NULL if a phyint that is part of an
6397 * IPMP group is being unplumbed. A more detailed
6398 * comment is in ipmp_grp_update_kstats()
6400 if (phyi != NULL) {
6401 ill = phyi->phyint_illv4;
6402 if (ill != NULL &&
6403 (ill->ill_dlpi_pending != DL_PRIM_INVAL ||
6404 ill->ill_arl_dlpi_pending))
6405 return (B_FALSE);
6407 ill = phyi->phyint_illv6;
6408 if (ill != NULL &&
6409 ill->ill_dlpi_pending != DL_PRIM_INVAL)
6410 return (B_FALSE);
6413 } while ((ipsq = ipsq->ipsq_next) != ipsq_start);
6415 return (B_TRUE);
6416 }
6417
6418 /*
6419 * Enter the ipsq corresponding to ill, by waiting synchronously till
6420 * we can enter the ipsq exclusively. Unless 'force' is used, the ipsq
6421 * will have to drain completely before ipsq_enter returns success.
6422 * ipx_current_ipif will be set if some exclusive op is in progress,
6423 * and the ipsq_exit logic will start the next enqueued op after
6424 * completion of the current op. If 'force' is used, we don't wait
6425 * for the enqueued ops. This is needed when a conn_close wants to
6426 * enter the ipsq and abort an ioctl that is somehow stuck. Unplumb
6427 * of an ill can also use this option, but we don't use it currently.
6428 */
6429 #define ENTER_SQ_WAIT_TICKS 100
6430 boolean_t
6431 ipsq_enter(ill_t *ill, boolean_t force, int type)
6433 ipsq_t *ipsq;
6434 ipxop_t *ipx;
6435 boolean_t waited_enough = B_FALSE;
6436 ip_stack_t *ipst = ill->ill_ipst;
6439 * Note that the relationship between ill and ipsq is fixed as long as
6440 * the ill is not ILL_CONDEMNED. Holding ipsq_lock ensures the
6441 * relationship between the IPSQ and xop cannot change. However,
6442 * since we cannot hold ipsq_lock across the cv_wait(), it may change
6443 * while we're waiting. We wait on ill_cv and rely on ipsq_exit()
6444 * waking up all ills in the xop when it becomes available.
6446 for (;;) {
6447 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
6448 mutex_enter(&ill->ill_lock);
6449 if (ill->ill_state_flags & ILL_CONDEMNED) {
6450 mutex_exit(&ill->ill_lock);
6451 rw_exit(&ipst->ips_ill_g_lock);
6452 return (B_FALSE);
6455 ipsq = ill->ill_phyint->phyint_ipsq;
6456 mutex_enter(&ipsq->ipsq_lock);
6457 ipx = ipsq->ipsq_xop;
6458 mutex_enter(&ipx->ipx_lock);
6460 if (ipx->ipx_writer == NULL && (type == CUR_OP ||
6461 (ipx->ipx_current_ipif == NULL && ipsq_dlpi_done(ipsq)) ||
6462 waited_enough))
6463 break;
6465 rw_exit(&ipst->ips_ill_g_lock);
6467 if (!force || ipx->ipx_writer != NULL) {
6468 mutex_exit(&ipx->ipx_lock);
6469 mutex_exit(&ipsq->ipsq_lock);
6470 cv_wait(&ill->ill_cv, &ill->ill_lock);
6471 } else {
6472 mutex_exit(&ipx->ipx_lock);
6473 mutex_exit(&ipsq->ipsq_lock);
6474 (void) cv_reltimedwait(&ill->ill_cv,
6475 &ill->ill_lock, ENTER_SQ_WAIT_TICKS, TR_CLOCK_TICK);
6476 waited_enough = B_TRUE;
6478 mutex_exit(&ill->ill_lock);
6481 ASSERT(ipx->ipx_mphead == NULL && ipx->ipx_mptail == NULL);
6482 ASSERT(ipx->ipx_reentry_cnt == 0);
6483 ipx->ipx_writer = curthread;
6484 ipx->ipx_forced = (ipx->ipx_current_ipif != NULL);
6485 ipx->ipx_reentry_cnt++;
6486 #ifdef DEBUG
6487 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH);
6488 #endif
6489 mutex_exit(&ipx->ipx_lock);
6490 mutex_exit(&ipsq->ipsq_lock);
6491 mutex_exit(&ill->ill_lock);
6492 rw_exit(&ipst->ips_ill_g_lock);
6494 return (B_TRUE);
6495 }
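/*
 * Illustrative sketch (not part of this file): a caller that must abort
 * a stuck exclusive operation can force its way in:
 *
 *	if (ipsq_enter(ill, B_TRUE, NEW_OP)) {
 *		(abort the stuck operation)
 *		ipsq_exit(ill->ill_phyint->phyint_ipsq);
 *	}
 *
 * With force set, entry succeeds after ENTER_SQ_WAIT_TICKS even if an
 * operation is still in progress; ipx_forced records that this happened.
 */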
6496
6497 /*
6498 * ipif_set_values() has a constraint that it cannot drop the ips_ill_g_lock
6499 * across the call to the core interface ipsq_try_enter() and hence calls this
6500 * function directly. This is explained more fully in ipif_set_values().
6501 * In order to support the above constraint, ipsq_try_enter is implemented as
6502 * a wrapper that grabs the ips_ill_g_lock and then calls this function.
6503 */
6504 static ipsq_t *
6505 ipsq_try_enter_internal(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func,
6506 int type, boolean_t reentry_ok)
6508 ipsq_t *ipsq;
6509 ipxop_t *ipx;
6510 ip_stack_t *ipst = ill->ill_ipst;
6513 * lock ordering:
6514 * ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock -> ipx_lock.
6516 * ipx of an ipsq can't change when ipsq_lock is held.
6518 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));
6519 GRAB_CONN_LOCK(q);
6520 mutex_enter(&ill->ill_lock);
6521 ipsq = ill->ill_phyint->phyint_ipsq;
6522 mutex_enter(&ipsq->ipsq_lock);
6523 ipx = ipsq->ipsq_xop;
6524 mutex_enter(&ipx->ipx_lock);
6527 * 1. Enter the ipsq if we are already writer and reentry is ok.
6528 * (Note: If the caller does not specify reentry_ok then neither
6529 * 'func' nor any of its callees must ever attempt to enter the ipsq
6530 * again. Otherwise it can lead to an infinite loop.)
6531 * 2. Enter the ipsq if there is no current writer and this attempted
6532 * entry is part of the current operation
6533 * 3. Enter the ipsq if there is no current writer and this is a new
6534 * operation and the operation queue is empty and there is no
6535 * operation currently in progress and if all previously initiated
6536 * DLPI operations have completed.
6538 if ((ipx->ipx_writer == curthread && reentry_ok) ||
6539 (ipx->ipx_writer == NULL && (type == CUR_OP || (type == NEW_OP &&
6540 !ipx->ipx_ipsq_queued && ipx->ipx_current_ipif == NULL &&
6541 ipsq_dlpi_done(ipsq))))) {
6542 /* Success. */
6543 ipx->ipx_reentry_cnt++;
6544 ipx->ipx_writer = curthread;
6545 ipx->ipx_forced = B_FALSE;
6546 mutex_exit(&ipx->ipx_lock);
6547 mutex_exit(&ipsq->ipsq_lock);
6548 mutex_exit(&ill->ill_lock);
6549 RELEASE_CONN_LOCK(q);
6550 #ifdef DEBUG
6551 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH);
6552 #endif
6553 return (ipsq);
6556 if (func != NULL)
6557 ipsq_enq(ipsq, q, mp, func, type, ill);
6559 mutex_exit(&ipx->ipx_lock);
6560 mutex_exit(&ipsq->ipsq_lock);
6561 mutex_exit(&ill->ill_lock);
6562 RELEASE_CONN_LOCK(q);
6563 return (NULL);
6564 }
6565
6566 /*
6567 * The ipsq_t (ipsq) is the synchronization data structure used to serialize
6568 * certain critical operations like plumbing (i.e. most set ioctls), etc.
6569 * There is one ipsq per phyint. The ipsq
6570 * serializes exclusive ioctls issued by applications on a per ipsq basis in
6571 * ipsq_xopq_mphead. It also protects against multiple threads executing in
6572 * the ipsq. Responses from the driver pertain to the current ioctl (say a
6573 * DL_BIND_ACK in response to a DL_BIND_REQ initiated as part of bringing
6574 * up the interface) and are enqueued in ipx_mphead.
6576 * If a thread does not want to reenter the ipsq when it is already writer,
6577 * it must make sure that neither the specified reentry point, to be called
6578 * later when the ipsq is empty, nor any code path starting from that reentry
6579 * point, ever tries to enter the ipsq again. Otherwise it can lead
6580 * to an infinite loop. The reentry point ip_rput_dlpi_writer is an example.
6581 * When the thread that is currently exclusive finishes, it (ipsq_exit)
6582 * dequeues the requests waiting to become exclusive in ipx_mphead and calls
6583 * the reentry point. When the list at ipx_mphead becomes empty ipsq_exit
6584 * proceeds to dequeue the next ioctl in ipsq_xopq_mphead and start the next
6585 * ioctl if the current ioctl has completed. If the current ioctl is still
6586 * in progress it simply returns. The current ioctl could be waiting for
6587 * a response from another module (the driver), or could be waiting for
6588 * the ipif/ill/ire refcnts to drop to zero. In such a case the ipx_pending_mp
6589 * and ipx_pending_ipif are set. ipx_current_ipif is set throughout the
6590 * execution of the ioctl and ipsq_exit does not start the next ioctl unless
6591 * ipx_current_ipif is NULL which happens only once the ioctl is complete and
6592 * all associated DLPI operations have completed.
6593 */
6594
6595 /*
6596 * Try to enter the IPSQ corresponding to `ipif' or `ill' exclusively (`ipif'
6597 * and `ill' cannot both be specified). Returns a pointer to the entered IPSQ
6598 * on success, or NULL on failure. The caller ensures ipif/ill is valid by
6599 * refholding it as necessary. If the IPSQ cannot be entered and `func' is
6600 * non-NULL, then `func' will be called back with `q' and `mp' once the IPSQ
6601 * can be entered. If `func' is NULL, then `q' and `mp' are ignored.
6602 */
6603 ipsq_t *
6604 ipsq_try_enter(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp,
6605 ipsq_func_t func, int type, boolean_t reentry_ok)
6607 ip_stack_t *ipst;
6608 ipsq_t *ipsq;
6610 /* Only 1 of ipif or ill can be specified */
6611 ASSERT((ipif != NULL) ^ (ill != NULL));
6613 if (ipif != NULL)
6614 ill = ipif->ipif_ill;
6615 ipst = ill->ill_ipst;
6617 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
6618 ipsq = ipsq_try_enter_internal(ill, q, mp, func, type, reentry_ok);
6619 rw_exit(&ipst->ips_ill_g_lock);
6621 return (ipsq);
6622 }
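/*
 * Illustrative sketch (not part of this file): the common pattern is to
 * try to enter and rely on requeueing when the IPSQ is busy, e.g.:
 *
 *	ipsq = ipsq_try_enter(ipif, NULL, q, mp, ip_process_ioctl,
 *	    NEW_OP, B_TRUE);
 *	if (ipsq == NULL)
 *		return;		(mp was queued; the callback runs later)
 *	(perform the exclusive operation)
 *	ipsq_exit(ipsq);
 *
 * ip_process_ioctl is used here as a plausible callback; any ipsq_func_t
 * works the same way.
 */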
6623
6624 /*
6625 * Try to enter the IPSQ corresponding to `ill' as writer. The caller ensures
6626 * ill is valid by refholding it if necessary; we will refrele. If the IPSQ
6627 * cannot be entered, the mp is queued for completion.
6628 */
6629 void
6630 qwriter_ip(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func, int type,
6631 boolean_t reentry_ok)
6633 ipsq_t *ipsq;
6635 ipsq = ipsq_try_enter(NULL, ill, q, mp, func, type, reentry_ok);
6638 * Drop the caller's refhold on the ill. This is safe since we either
6639 * entered the IPSQ (and thus are exclusive), or failed to enter the
6640 * IPSQ, in which case we return without accessing ill anymore. This
6641 * is needed because func needs to see the correct refcount.
6642 * e.g. removeif can work only then.
6644 ill_refrele(ill);
6645 if (ipsq != NULL) {
6646 (*func)(ipsq, q, mp, NULL);
6647 ipsq_exit(ipsq);
6648 }
6649 }
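/*
 * Illustrative sketch (not part of this file): DLPI ack processing uses
 * this routine to become writer before touching ill state, e.g.:
 *
 *	ill_refhold(ill);
 *	qwriter_ip(ill, ill->ill_rq, mp, ip_rput_dlpi_writer,
 *	    CUR_OP, B_FALSE);
 *
 * qwriter_ip() consumes the refhold; ip_rput_dlpi_writer() runs either
 * immediately (if the IPSQ could be entered) or when the IPSQ drains.
 */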
6650
6651 /*
6652 * Exit the specified IPSQ. If this is the final exit on it then drain it
6653 * prior to exiting. Caller must be writer on the specified IPSQ.
6654 */
6655 void
6656 ipsq_exit(ipsq_t *ipsq)
6658 mblk_t *mp;
6659 ipsq_t *mp_ipsq;
6660 queue_t *q;
6661 phyint_t *phyi;
6662 ipsq_func_t func;
6664 ASSERT(IAM_WRITER_IPSQ(ipsq));
6666 ASSERT(ipsq->ipsq_xop->ipx_reentry_cnt >= 1);
6667 if (ipsq->ipsq_xop->ipx_reentry_cnt != 1) {
6668 ipsq->ipsq_xop->ipx_reentry_cnt--;
6669 return;
6672 for (;;) {
6673 phyi = ipsq->ipsq_phyint;
6674 mp = ipsq_dq(ipsq);
6675 mp_ipsq = (mp == NULL) ? NULL : (ipsq_t *)mp->b_next;
6678 * If we've changed to a new IPSQ, and the phyint associated
6679 * with the old one has gone away, free the old IPSQ. Note
6680 * that this cannot happen while the IPSQ is in a group.
6682 if (mp_ipsq != ipsq && phyi == NULL) {
6683 ASSERT(ipsq->ipsq_next == ipsq);
6684 ASSERT(ipsq->ipsq_xop == &ipsq->ipsq_ownxop);
6685 ipsq_delete(ipsq);
6688 if (mp == NULL)
6689 break;
6691 q = mp->b_queue;
6692 func = (ipsq_func_t)mp->b_prev;
6693 ipsq = mp_ipsq;
6694 mp->b_next = mp->b_prev = NULL;
6695 mp->b_queue = NULL;
6698 * If 'q' is a conn queue, it is valid, since we did a
6699 * refhold on the conn at the start of the ioctl.
6700 * If 'q' is an ill queue, it is valid, since close of an
6701 * ill will clean up its IPSQ.
6703 (*func)(ipsq, q, mp, NULL);
6708 * Used to start any igmp or mld timers that could not be started
6709 * while holding ill_mcast_lock. The timers can't be started while holding
6710 * the lock, since mld/igmp_start_timers may need to call untimeout()
6711 * which can't be done while holding the lock which the timeout handler
6712 * acquires; otherwise there could be a deadlock, since the timeout handlers
6714 * mld_timeout_handler_per_ill/igmp_timeout_handler_per_ill also acquire
6715 * ill_mcast_lock.
6717 void
6718 ill_mcast_timer_start(ip_stack_t *ipst)
6720 int next;
6722 mutex_enter(&ipst->ips_igmp_timer_lock);
6723 next = ipst->ips_igmp_deferred_next;
6724 ipst->ips_igmp_deferred_next = INFINITY;
6725 mutex_exit(&ipst->ips_igmp_timer_lock);
6727 if (next != INFINITY)
6728 igmp_start_timers(next, ipst);
6730 mutex_enter(&ipst->ips_mld_timer_lock);
6731 next = ipst->ips_mld_deferred_next;
6732 ipst->ips_mld_deferred_next = INFINITY;
6733 mutex_exit(&ipst->ips_mld_timer_lock);
6735 if (next != INFINITY)
6736 mld_start_timers(next, ipst);
6737 }
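/*
 * Illustrative sketch (not part of this file; the caller context and the
 * lock type are assumptions): code records deferred timer starts while
 * holding ill_mcast_lock and starts them once the lock is dropped:
 *
 *	rw_exit(&ill->ill_mcast_lock);
 *	ill_mcast_timer_start(ill->ill_ipst);
 */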
6738
6739 /*
6740 * Start the current exclusive operation on `ipsq'; associate it with `ipif'
6741 * and `ioccmd'.
6742 */
6743 void
6744 ipsq_current_start(ipsq_t *ipsq, ipif_t *ipif, int ioccmd)
6746 ill_t *ill = ipif->ipif_ill;
6747 ipxop_t *ipx = ipsq->ipsq_xop;
6749 ASSERT(IAM_WRITER_IPSQ(ipsq));
6750 ASSERT(ipx->ipx_current_ipif == NULL);
6751 ASSERT(ipx->ipx_current_ioctl == 0);
6753 ipx->ipx_current_done = B_FALSE;
6754 ipx->ipx_current_ioctl = ioccmd;
6755 mutex_enter(&ipx->ipx_lock);
6756 ipx->ipx_current_ipif = ipif;
6757 mutex_exit(&ipx->ipx_lock);
6760 * Set IPIF_CHANGING on one or more ipifs associated with the
6761 * current exclusive operation. IPIF_CHANGING prevents any new
6762 * references to the ipif (so that the references will eventually
6763 * drop to zero) and also prevents any "get" operations (e.g.,
6764 * SIOCGLIFFLAGS) from being able to access the ipif until the
6765 * operation has completed and the ipif is again in a stable state.
6767 * For ioctls, IPIF_CHANGING is set on the ipif associated with the
6768 * ioctl. For internal operations (where ioccmd is zero), all ipifs
6769 * on the ill are marked with IPIF_CHANGING since it's unclear which
6770 * ipifs will be affected.
6772 * Note that SIOCLIFREMOVEIF is a special case as it sets
6773 * IPIF_CONDEMNED internally after identifying the right ipif to
6774 * operate on.
6776 switch (ioccmd) {
6777 case SIOCLIFREMOVEIF:
6778 break;
6779 case 0:
6780 mutex_enter(&ill->ill_lock);
6781 ipif = ipif->ipif_ill->ill_ipif;
6782 for (; ipif != NULL; ipif = ipif->ipif_next)
6783 ipif->ipif_state_flags |= IPIF_CHANGING;
6784 mutex_exit(&ill->ill_lock);
6785 break;
6786 default:
6787 mutex_enter(&ill->ill_lock);
6788 ipif->ipif_state_flags |= IPIF_CHANGING;
6789 mutex_exit(&ill->ill_lock);
6790 }
6791 }
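/*
 * Illustrative sketch (not part of this file): ipsq_current_start() and
 * ipsq_current_finish() bracket each exclusive operation, e.g.:
 *
 *	ipsq_current_start(ipsq, ipif, SIOCSLIFFLAGS);
 *	(carry out the ioctl, possibly across DLPI round-trips)
 *	ipsq_current_finish(ipsq);
 *
 * SIOCSLIFFLAGS is just an example ioctl; internal operations pass 0.
 */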
6792
6793 /*
6794 * Finish the current exclusive operation on `ipsq'. Usually, this will allow
6795 * the next exclusive operation to begin once we ipsq_exit(). However, if
6796 * pending DLPI operations remain, then we will wait for the queue to drain
6797 * before allowing the next exclusive operation to begin. This ensures that
6798 * DLPI operations from one exclusive operation are never improperly processed
6799 * as part of a subsequent exclusive operation.
6800 */
6801 void
6802 ipsq_current_finish(ipsq_t *ipsq)
6804 ipxop_t *ipx = ipsq->ipsq_xop;
6805 t_uscalar_t dlpi_pending = DL_PRIM_INVAL;
6806 ipif_t *ipif = ipx->ipx_current_ipif;
6808 ASSERT(IAM_WRITER_IPSQ(ipsq));
6811 * For SIOCLIFREMOVEIF, the ipif has been already been blown away
6812 * (but in that case, IPIF_CHANGING will already be clear and no
6813 * pending DLPI messages can remain).
6815 if (ipx->ipx_current_ioctl != SIOCLIFREMOVEIF) {
6816 ill_t *ill = ipif->ipif_ill;
6818 mutex_enter(&ill->ill_lock);
6819 dlpi_pending = ill->ill_dlpi_pending;
6820 if (ipx->ipx_current_ioctl == 0) {
6821 ipif = ill->ill_ipif;
6822 for (; ipif != NULL; ipif = ipif->ipif_next)
6823 ipif->ipif_state_flags &= ~IPIF_CHANGING;
6824 } else {
6825 ipif->ipif_state_flags &= ~IPIF_CHANGING;
6827 mutex_exit(&ill->ill_lock);
6830 ASSERT(!ipx->ipx_current_done);
6831 ipx->ipx_current_done = B_TRUE;
6832 ipx->ipx_current_ioctl = 0;
6833 if (dlpi_pending == DL_PRIM_INVAL) {
6834 mutex_enter(&ipx->ipx_lock);
6835 ipx->ipx_current_ipif = NULL;
6836 mutex_exit(&ipx->ipx_lock);
6841 * The ill is closing. Flush all messages on the ipsq that originated
6842 * from this ill. Usually there won't be any messages on the ipsq_xopq_mphead
6843 * for this ill since ipsq_enter could not have entered until then.
6844 * New messages can't be queued since the CONDEMNED flag is set.
6846 static void
6847 ipsq_flush(ill_t *ill)
6849 queue_t *q;
6850 mblk_t *prev;
6851 mblk_t *mp;
6852 mblk_t *mp_next;
6853 ipxop_t *ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop;
6855 ASSERT(IAM_WRITER_ILL(ill));
6858 * Flush any messages sent up by the driver.
6860 mutex_enter(&ipx->ipx_lock);
6861 for (prev = NULL, mp = ipx->ipx_mphead; mp != NULL; mp = mp_next) {
6862 mp_next = mp->b_next;
6863 q = mp->b_queue;
6864 if (q == ill->ill_rq || q == ill->ill_wq) {
6865 /* dequeue mp */
6866 if (prev == NULL)
6867 ipx->ipx_mphead = mp->b_next;
6868 else
6869 prev->b_next = mp->b_next;
6870 if (ipx->ipx_mptail == mp) {
6871 ASSERT(mp_next == NULL);
6872 ipx->ipx_mptail = prev;
6874 inet_freemsg(mp);
6875 } else {
6876 prev = mp;
6879 mutex_exit(&ipx->ipx_lock);
6880 (void) ipsq_pending_mp_cleanup(ill, NULL);
6881 ipsq_xopq_mp_cleanup(ill, NULL);
6885 * Parse an ifreq or lifreq struct coming down ioctls and refhold
6886 * and return the associated ipif.
6887 * Return value:
6888 * Non-zero: an error has occurred; ci may not be filled out.
6889 * Zero: ci is filled out with the ioctl cmd in ci.ci_name, and
6890 * a held ipif in ci.ci_ipif.
6891 */
6892 int
6893 ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip,
6894 cmd_info_t *ci)
6895 {
6896 char *name;
6897 struct ifreq *ifr;
6898 struct lifreq *lifr;
6899 ipif_t *ipif = NULL;
6900 ill_t *ill;
6901 conn_t *connp;
6902 boolean_t isv6;
6903 int err;
6904 mblk_t *mp1;
6905 zoneid_t zoneid;
6906 ip_stack_t *ipst;
6908 if (q->q_next != NULL) {
6909 ill = (ill_t *)q->q_ptr;
6910 isv6 = ill->ill_isv6;
6911 connp = NULL;
6912 zoneid = ALL_ZONES;
6913 ipst = ill->ill_ipst;
6914 } else {
6915 ill = NULL;
6916 connp = Q_TO_CONN(q);
6917 isv6 = (connp->conn_family == AF_INET6);
6918 zoneid = connp->conn_zoneid;
6919 if (zoneid == GLOBAL_ZONEID) {
6920 /* global zone can access ipifs in all zones */
6921 zoneid = ALL_ZONES;
6923 ipst = connp->conn_netstack->netstack_ip;
6926 /* Has been checked in ip_wput_nondata */
6927 mp1 = mp->b_cont->b_cont;
6929 if (ipip->ipi_cmd_type == IF_CMD) {
6930 /* This is an old-style SIOC[GS]IF* command */
6931 ifr = (struct ifreq *)mp1->b_rptr;
6933 * Null terminate the string to protect against buffer
6934 * overrun. String was generated by user code and may not
6935 * be trusted.
6937 ifr->ifr_name[IFNAMSIZ - 1] = '\0';
6938 name = ifr->ifr_name;
6939 ci->ci_sin = (sin_t *)&ifr->ifr_addr;
6940 ci->ci_sin6 = NULL;
6941 ci->ci_lifr = (struct lifreq *)ifr;
6942 } else {
6943 /* This is a new-style SIOC[GS]LIF* command */
6944 ASSERT(ipip->ipi_cmd_type == LIF_CMD);
6945 lifr = (struct lifreq *)mp1->b_rptr;
6947 * Null terminate the string to protect against buffer
6948 * overrun. String was generated by user code and may not
6949 * be trusted.
6951 lifr->lifr_name[LIFNAMSIZ - 1] = '\0';
6952 name = lifr->lifr_name;
6953 ci->ci_sin = (sin_t *)&lifr->lifr_addr;
6954 ci->ci_sin6 = (sin6_t *)&lifr->lifr_addr;
6955 ci->ci_lifr = lifr;
6958 if (ipip->ipi_cmd == SIOCSLIFNAME) {
6959 /*
6960 * The ioctl will fail if it comes down
6961 * a conn stream.
6962 */
6963 if (ill == NULL) {
6964 /*
6965 * Not an ill queue; fail with ENXIO.
6966 */
6968 return (ENXIO);
6970 ipif = ill->ill_ipif;
6971 ipif_refhold(ipif);
6972 } else {
6974 * Ensure that ioctls don't see any internal state changes
6975 * caused by set ioctls by deferring them if IPIF_CHANGING is
6976 * set.
6978 ipif = ipif_lookup_on_name_async(name, mi_strlen(name),
6979 isv6, zoneid, q, mp, ip_process_ioctl, &err, ipst);
6980 if (ipif == NULL) {
6981 if (err == EINPROGRESS)
6982 return (err);
6983 err = 0; /* Ensure we don't use it below */
6987 /*
6988 * An old-style [GS]IFCMD does not admit an IPv6 ipif.
6989 */
6990 if (ipif != NULL && ipif->ipif_isv6 && ipip->ipi_cmd_type == IF_CMD) {
6991 ipif_refrele(ipif);
6992 return (ENXIO);
6995 if (ipif == NULL && ill != NULL && ill->ill_ipif != NULL &&
6996 name[0] == '\0') {
6998 * Handle a SIOC?IF* with a null name
6999 * during plumb (on the ill queue before the I_PLINK).
7001 ipif = ill->ill_ipif;
7002 ipif_refhold(ipif);
7005 if (ipif == NULL)
7006 return (ENXIO);
7008 DTRACE_PROBE4(ipif__ioctl, char *, "ip_extract_lifreq",
7009 int, ipip->ipi_cmd, ill_t *, ipif->ipif_ill, ipif_t *, ipif);
7011 ci->ci_ipif = ipif;
7012 return (0);
7013 }
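/*
 * Illustrative sketch (not part of this file): a hypothetical caller
 * owns the ipif reference on success and must release it:
 *
 *	cmd_info_t ci;
 *	int err;
 *
 *	if ((err = ip_extract_lifreq(q, mp, ipip, &ci)) != 0)
 *		return (err);
 *	(operate on ci.ci_ipif)
 *	ipif_refrele(ci.ci_ipif);
 */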
7014
7015 /*
7016 * Return the total number of ipifs.
7017 */
7018 static uint_t
7019 ip_get_numifs(zoneid_t zoneid, ip_stack_t *ipst)
7021 uint_t numifs = 0;
7022 ill_t *ill;
7023 ill_walk_context_t ctx;
7024 ipif_t *ipif;
7026 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
7027 ill = ILL_START_WALK_V4(&ctx, ipst);
7028 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
7029 if (IS_UNDER_IPMP(ill))
7030 continue;
7031 for (ipif = ill->ill_ipif; ipif != NULL;
7032 ipif = ipif->ipif_next) {
7033 if (ipif->ipif_zoneid == zoneid ||
7034 ipif->ipif_zoneid == ALL_ZONES)
7035 numifs++;
7038 rw_exit(&ipst->ips_ill_g_lock);
7039 return (numifs);
7040 }
7041
7042 /*
7043 * Return the total number of ipifs, filtered by family, lifn_flags, and zone.
7044 */
7045 static uint_t
7046 ip_get_numlifs(int family, int lifn_flags, zoneid_t zoneid, ip_stack_t *ipst)
7048 uint_t numifs = 0;
7049 ill_t *ill;
7050 ipif_t *ipif;
7051 ill_walk_context_t ctx;
7053 ip1dbg(("ip_get_numlifs(%d %u %d)\n", family, lifn_flags, (int)zoneid));
7055 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
7056 if (family == AF_INET)
7057 ill = ILL_START_WALK_V4(&ctx, ipst);
7058 else if (family == AF_INET6)
7059 ill = ILL_START_WALK_V6(&ctx, ipst);
7060 else
7061 ill = ILL_START_WALK_ALL(&ctx, ipst);
7063 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
7064 if (IS_UNDER_IPMP(ill) && !(lifn_flags & LIFC_UNDER_IPMP))
7065 continue;
7067 for (ipif = ill->ill_ipif; ipif != NULL;
7068 ipif = ipif->ipif_next) {
7069 if ((ipif->ipif_flags & IPIF_NOXMIT) &&
7070 !(lifn_flags & LIFC_NOXMIT))
7071 continue;
7072 if ((ipif->ipif_flags & IPIF_TEMPORARY) &&
7073 !(lifn_flags & LIFC_TEMPORARY))
7074 continue;
7075 if (((ipif->ipif_flags &
7076 (IPIF_NOXMIT|IPIF_NOLOCAL|
7077 IPIF_DEPRECATED)) ||
7078 IS_LOOPBACK(ill) ||
7079 !(ipif->ipif_flags & IPIF_UP)) &&
7080 (lifn_flags & LIFC_EXTERNAL_SOURCE))
7081 continue;
7083 if (zoneid != ipif->ipif_zoneid &&
7084 ipif->ipif_zoneid != ALL_ZONES &&
7085 (zoneid != GLOBAL_ZONEID ||
7086 !(lifn_flags & LIFC_ALLZONES)))
7087 continue;
7089 numifs++;
7092 rw_exit(&ipst->ips_ill_g_lock);
7093 return (numifs);
7096 uint_t
7097 ip_get_lifsrcofnum(ill_t *ill)
7099 uint_t numifs = 0;
7100 ill_t *ill_head = ill;
7101 ip_stack_t *ipst = ill->ill_ipst;
7103 /*
7104 * ill_g_usesrc_lock protects ill_usesrc_grp_next; without it, some
7105 * other thread may be trying to relink the ILLs in this usesrc group
7106 * and adjusting the ill_usesrc_grp_next pointers.
7107 */
7108 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER);
7109 if ((ill->ill_usesrc_ifindex == 0) &&
7110 (ill->ill_usesrc_grp_next != NULL)) {
7111 for (; (ill != NULL) && (ill->ill_usesrc_grp_next != ill_head);
7112 ill = ill->ill_usesrc_grp_next)
7113 numifs++;
7115 rw_exit(&ipst->ips_ill_g_usesrc_lock);
7117 return (numifs);
7120 /* Null values are passed in for ipif, sin, and ifreq */
7121 /* ARGSUSED */
7122 int
7123 ip_sioctl_get_ifnum(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
7124 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
7125 {
7126 int *nump;
7127 conn_t *connp = Q_TO_CONN(q);
7129 ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */
7131 /* Existence of b_cont->b_cont checked in ip_wput_nondata */
7132 nump = (int *)mp->b_cont->b_cont->b_rptr;
7134 *nump = ip_get_numifs(connp->conn_zoneid,
7135 connp->conn_netstack->netstack_ip);
7136 ip1dbg(("ip_sioctl_get_ifnum numifs %d", *nump));
7137 return (0);
7140 /* Null values are passed in for ipif, sin, and ifreq */
7141 /* ARGSUSED */
7142 int
7143 ip_sioctl_get_lifnum(ipif_t *dummy_ipif, sin_t *dummy_sin,
7144 queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
7145 {
7146 struct lifnum *lifn;
7147 mblk_t *mp1;
7148 conn_t *connp = Q_TO_CONN(q);
7150 ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */
7152 /* Existence checked in ip_wput_nondata */
7153 mp1 = mp->b_cont->b_cont;
7155 lifn = (struct lifnum *)mp1->b_rptr;
7156 switch (lifn->lifn_family) {
7157 case AF_UNSPEC:
7158 case AF_INET:
7159 case AF_INET6:
7160 break;
7161 default:
7162 return (EAFNOSUPPORT);
7165 lifn->lifn_count = ip_get_numlifs(lifn->lifn_family, lifn->lifn_flags,
7166 connp->conn_zoneid, connp->conn_netstack->netstack_ip);
7167 ip1dbg(("ip_sioctl_get_lifnum numifs %d", lifn->lifn_count));
7168 return (0);
7169 }
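/*
 * Illustrative sketch (not part of this file): applications typically
 * use SIOCGLIFNUM to size the buffer they hand to SIOCGLIFCONF:
 *
 *	struct lifnum lifn;
 *
 *	bzero(&lifn, sizeof (lifn));
 *	lifn.lifn_family = AF_UNSPEC;
 *	(void) ioctl(s, SIOCGLIFNUM, &lifn);
 *	buflen = lifn.lifn_count * sizeof (struct lifreq);
 *
 * The count can change between the two ioctls, so careful callers pad
 * the buffer or retry.
 */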
7171 /* ARGSUSED */
7172 int
7173 ip_sioctl_get_ifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
7174 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
7175 {
7176 STRUCT_HANDLE(ifconf, ifc);
7177 mblk_t *mp1;
7178 struct iocblk *iocp;
7179 struct ifreq *ifr;
7180 ill_walk_context_t ctx;
7181 ill_t *ill;
7182 ipif_t *ipif;
7183 struct sockaddr_in *sin;
7184 int32_t ifclen;
7185 zoneid_t zoneid;
7186 ip_stack_t *ipst = CONNQ_TO_IPST(q);
7188 ASSERT(q->q_next == NULL); /* not valid ioctls for ip as a module */
7190 ip1dbg(("ip_sioctl_get_ifconf"));
7191 /* Existence verified in ip_wput_nondata */
7192 mp1 = mp->b_cont->b_cont;
7193 iocp = (struct iocblk *)mp->b_rptr;
7194 zoneid = Q_TO_CONN(q)->conn_zoneid;
7197 * The original SIOCGIFCONF passed in a struct ifconf which specified
7198 * the user buffer address and length into which the list of struct
7199 * ifreqs was to be copied. Since AT&T Streams does not seem to
7200 * allow M_COPYOUT to be used in conjunction with I_STR IOCTLS,
7201 * the SIOCGIFCONF operation was redefined to simply provide
7202 * a large output buffer into which we are supposed to jam the ifreq
7203 * array. The same ioctl command code was used, despite the fact that
7204 * both the applications and the kernel code had to change, thus making
7205 * it impossible to support both interfaces.
7207 * For reasons not good enough to try to explain, the following
7208 * algorithm is used for deciding what to do with one of these:
7209 * If the IOCTL comes in as an I_STR, it is assumed to be of the new
7210 * form with the output buffer coming down as the continuation message.
7211 * If it arrives as a TRANSPARENT IOCTL, it is assumed to be old style,
7212 * and we have to copy in the ifconf structure to find out how big the
7213 * output buffer is and where to copy out to. Sure no problem...
7216 STRUCT_SET_HANDLE(ifc, iocp->ioc_flag, NULL);
7217 if ((mp1->b_wptr - mp1->b_rptr) == STRUCT_SIZE(ifc)) {
7218 int numifs = 0;
7219 size_t ifc_bufsize;
7222 * Must be (better be!) continuation of a TRANSPARENT
7223 * IOCTL. We just copied in the ifconf structure.
7225 STRUCT_SET_HANDLE(ifc, iocp->ioc_flag,
7226 (struct ifconf *)mp1->b_rptr);
7229 * Allocate a buffer to hold requested information.
7231 * If ifc_len is larger than what is needed, we only
7232 * allocate what we will use.
7234 * If ifc_len is smaller than what is needed, return
7235 * EINVAL.
7237 * XXX: the ill_t structure can have 2 counters, for
7238 * v4 and v6 (not just ill_ipif_up_count) to store the
7239 * number of interfaces for a device, so we don't need
7240 * to count them here...
7242 numifs = ip_get_numifs(zoneid, ipst);
7244 ifclen = STRUCT_FGET(ifc, ifc_len);
7245 ifc_bufsize = numifs * sizeof (struct ifreq);
7246 if (ifc_bufsize > ifclen) {
7247 if (iocp->ioc_cmd == O_SIOCGIFCONF) {
7248 /* old behaviour */
7249 return (EINVAL);
7250 } else {
7251 ifc_bufsize = ifclen;
7255 mp1 = mi_copyout_alloc(q, mp,
7256 STRUCT_FGETP(ifc, ifc_buf), ifc_bufsize, B_FALSE);
7257 if (mp1 == NULL)
7258 return (ENOMEM);
7260 mp1->b_wptr = mp1->b_rptr + ifc_bufsize;
7262 bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr);
7264 * the SIOCGIFCONF ioctl only knows about
7265 * IPv4 addresses, so don't try to tell
7266 * it about interfaces with IPv6-only
7267 * addresses. (Last parm 'isv6' is B_FALSE)
7270 ifr = (struct ifreq *)mp1->b_rptr;
7272 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
7273 ill = ILL_START_WALK_V4(&ctx, ipst);
7274 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
7275 if (IS_UNDER_IPMP(ill))
7276 continue;
7277 for (ipif = ill->ill_ipif; ipif != NULL;
7278 ipif = ipif->ipif_next) {
7279 if (zoneid != ipif->ipif_zoneid &&
7280 ipif->ipif_zoneid != ALL_ZONES)
7281 continue;
7282 if ((uchar_t *)&ifr[1] > mp1->b_wptr) {
7283 if (iocp->ioc_cmd == O_SIOCGIFCONF) {
7284 /* old behaviour */
7285 rw_exit(&ipst->ips_ill_g_lock);
7286 return (EINVAL);
7287 } else {
7288 goto if_copydone;
7291 ipif_get_name(ipif, ifr->ifr_name,
7292 sizeof (ifr->ifr_name));
7293 sin = (sin_t *)&ifr->ifr_addr;
7294 *sin = sin_null;
7295 sin->sin_family = AF_INET;
7296 sin->sin_addr.s_addr = ipif->ipif_lcl_addr;
7297 ifr++;
7300 if_copydone:
7301 rw_exit(&ipst->ips_ill_g_lock);
7302 mp1->b_wptr = (uchar_t *)ifr;
7304 if (STRUCT_BUF(ifc) != NULL) {
7305 STRUCT_FSET(ifc, ifc_len,
7306 (int)((uchar_t *)ifr - mp1->b_rptr));
7308 return (0);
7309 }
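/*
 * Illustrative sketch (not part of this file): the new-style user-level
 * counterpart of the above simply supplies an output buffer:
 *
 *	struct ifconf ifc;
 *	char buf[8192];		(example size)
 *
 *	ifc.ifc_len = sizeof (buf);
 *	ifc.ifc_buf = buf;
 *	(void) ioctl(s, SIOCGIFCONF, &ifc);
 *	nifs = ifc.ifc_len / sizeof (struct ifreq);
 */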
7310
7311 /*
7312 * Get the interfaces using the address hosted on the interface passed in,
7313 * as a source address.
7314 */
7315 /* ARGSUSED */
7316 int
7317 ip_sioctl_get_lifsrcof(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
7318 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
7319 {
7320 mblk_t *mp1;
7321 ill_t *ill, *ill_head;
7322 ipif_t *ipif, *orig_ipif;
7323 int numlifs = 0;
7324 size_t lifs_bufsize, lifsmaxlen;
7325 struct lifreq *lifr;
7326 struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
7327 uint_t ifindex;
7328 zoneid_t zoneid;
7329 boolean_t isv6 = B_FALSE;
7330 struct sockaddr_in *sin;
7331 struct sockaddr_in6 *sin6;
7332 STRUCT_HANDLE(lifsrcof, lifs);
7333 ip_stack_t *ipst;
7335 ipst = CONNQ_TO_IPST(q);
7337 ASSERT(q->q_next == NULL);
7339 zoneid = Q_TO_CONN(q)->conn_zoneid;
7341 /* Existence verified in ip_wput_nondata */
7342 mp1 = mp->b_cont->b_cont;
7345 * Must be (better be!) continuation of a TRANSPARENT
7346 * IOCTL. We just copied in the lifsrcof structure.
7348 STRUCT_SET_HANDLE(lifs, iocp->ioc_flag,
7349 (struct lifsrcof *)mp1->b_rptr);
7351 if (MBLKL(mp1) != STRUCT_SIZE(lifs))
7352 return (EINVAL);
7354 ifindex = STRUCT_FGET(lifs, lifs_ifindex);
7355 isv6 = (Q_TO_CONN(q))->conn_family == AF_INET6;
7356 ipif = ipif_lookup_on_ifindex(ifindex, isv6, zoneid, ipst);
7357 if (ipif == NULL) {
7358 ip1dbg(("ip_sioctl_get_lifsrcof: no ipif for ifindex %d\n",
7359 ifindex));
7360 return (ENXIO);
7363 /* Allocate a buffer to hold requested information */
7364 numlifs = ip_get_lifsrcofnum(ipif->ipif_ill);
7365 lifs_bufsize = numlifs * sizeof (struct lifreq);
7366 lifsmaxlen = STRUCT_FGET(lifs, lifs_maxlen);
7367 /* The actual size needed is always returned in lifs_len */
7368 STRUCT_FSET(lifs, lifs_len, lifs_bufsize);
7370 /* If the amount we need is more than what is passed in, abort */
7371 if (lifs_bufsize > lifsmaxlen || lifs_bufsize == 0) {
7372 ipif_refrele(ipif);
7373 return (0);
7376 mp1 = mi_copyout_alloc(q, mp,
7377 STRUCT_FGETP(lifs, lifs_buf), lifs_bufsize, B_FALSE);
7378 if (mp1 == NULL) {
7379 ipif_refrele(ipif);
7380 return (ENOMEM);
7383 mp1->b_wptr = mp1->b_rptr + lifs_bufsize;
7384 bzero(mp1->b_rptr, lifs_bufsize);
7386 lifr = (struct lifreq *)mp1->b_rptr;
7388 ill = ill_head = ipif->ipif_ill;
7389 orig_ipif = ipif;
7391 /* ill_g_usesrc_lock protects ill_usesrc_grp_next */
7392 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER);
7393 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
7395 ill = ill->ill_usesrc_grp_next; /* start from next ill */
7396 for (; (ill != NULL) && (ill != ill_head);
7397 ill = ill->ill_usesrc_grp_next) {
7399 if ((uchar_t *)&lifr[1] > mp1->b_wptr)
7400 break;
7402 ipif = ill->ill_ipif;
7403 ipif_get_name(ipif, lifr->lifr_name, sizeof (lifr->lifr_name));
7404 if (ipif->ipif_isv6) {
7405 sin6 = (sin6_t *)&lifr->lifr_addr;
7406 *sin6 = sin6_null;
7407 sin6->sin6_family = AF_INET6;
7408 sin6->sin6_addr = ipif->ipif_v6lcl_addr;
7409 lifr->lifr_addrlen = ip_mask_to_plen_v6(
7410 &ipif->ipif_v6net_mask);
7411 } else {
7412 sin = (sin_t *)&lifr->lifr_addr;
7413 *sin = sin_null;
7414 sin->sin_family = AF_INET;
7415 sin->sin_addr.s_addr = ipif->ipif_lcl_addr;
7416 lifr->lifr_addrlen = ip_mask_to_plen(
7417 ipif->ipif_net_mask);
7419 lifr++;
7421 rw_exit(&ipst->ips_ill_g_lock);
7422 rw_exit(&ipst->ips_ill_g_usesrc_lock);
7423 ipif_refrele(orig_ipif);
7424 mp1->b_wptr = (uchar_t *)lifr;
7425 STRUCT_FSET(lifs, lifs_len, (int)((uchar_t *)lifr - mp1->b_rptr));
7427 return (0);
7430 /* ARGSUSED */
7431 int
7432 ip_sioctl_get_lifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
7433 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
7434 {
7435 mblk_t *mp1;
7436 int list;
7437 ill_t *ill;
7438 ipif_t *ipif;
7439 int flags;
7440 int numlifs = 0;
7441 size_t lifc_bufsize;
7442 struct lifreq *lifr;
7443 sa_family_t family;
7444 struct sockaddr_in *sin;
7445 struct sockaddr_in6 *sin6;
7446 ill_walk_context_t ctx;
7447 struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
7448 int32_t lifclen;
7449 zoneid_t zoneid;
7450 STRUCT_HANDLE(lifconf, lifc);
7451 ip_stack_t *ipst = CONNQ_TO_IPST(q);
7453 ip1dbg(("ip_sioctl_get_lifconf"));
7455 ASSERT(q->q_next == NULL);
7457 zoneid = Q_TO_CONN(q)->conn_zoneid;
7459 /* Existence verified in ip_wput_nondata */
7460 mp1 = mp->b_cont->b_cont;
7463 * An extended version of SIOCGIFCONF that takes an
7464 * additional address family and flags field.
7465 * AF_UNSPEC retrieve both IPv4 and IPv6.
7466 * Unless LIFC_NOXMIT is specified the IPIF_NOXMIT
7467 * interfaces are omitted.
7468 * Similarly, IPIF_TEMPORARY interfaces are omitted
7469 * unless LIFC_TEMPORARY is specified.
7470 * If LIFC_EXTERNAL_SOURCE is specified, IPIF_NOXMIT,
7471 * IPIF_NOLOCAL, PHYI_LOOPBACK, IPIF_DEPRECATED and
7472 * not IPIF_UP interfaces are omitted. LIFC_EXTERNAL_SOURCE
7473 * has priority over LIFC_NOXMIT.
7475 STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, NULL);
7477 if ((mp1->b_wptr - mp1->b_rptr) != STRUCT_SIZE(lifc))
7478 return (EINVAL);
7481 * Must be (better be!) continuation of a TRANSPARENT
7482 * IOCTL. We just copied in the lifconf structure.
7484 STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, (struct lifconf *)mp1->b_rptr);
7486 family = STRUCT_FGET(lifc, lifc_family);
7487 flags = STRUCT_FGET(lifc, lifc_flags);
7489 switch (family) {
7490 case AF_UNSPEC:
7492 * walk all ILL's.
7494 list = MAX_G_HEADS;
7495 break;
7496 case AF_INET:
7498 * walk only IPV4 ILL's.
7500 list = IP_V4_G_HEAD;
7501 break;
7502 case AF_INET6:
7504 * walk only IPV6 ILL's.
7506 list = IP_V6_G_HEAD;
7507 break;
7508 default:
7509 return (EAFNOSUPPORT);
7513 * Allocate a buffer to hold requested information.
7515 * If lifc_len is larger than what is needed, we only
7516 * allocate what we will use.
7518 * If lifc_len is smaller than what is needed, return
7519 * EINVAL.
7521 numlifs = ip_get_numlifs(family, flags, zoneid, ipst);
7522 lifc_bufsize = numlifs * sizeof (struct lifreq);
7523 lifclen = STRUCT_FGET(lifc, lifc_len);
7524 if (lifc_bufsize > lifclen) {
7525 if (iocp->ioc_cmd == O_SIOCGLIFCONF)
7526 return (EINVAL);
7527 else
7528 lifc_bufsize = lifclen;
7531 mp1 = mi_copyout_alloc(q, mp,
7532 STRUCT_FGETP(lifc, lifc_buf), lifc_bufsize, B_FALSE);
7533 if (mp1 == NULL)
7534 return (ENOMEM);
7536 mp1->b_wptr = mp1->b_rptr + lifc_bufsize;
7537 bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr);
7539 lifr = (struct lifreq *)mp1->b_rptr;
7541 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
7542 ill = ill_first(list, list, &ctx, ipst);
7543 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
7544 if (IS_UNDER_IPMP(ill) && !(flags & LIFC_UNDER_IPMP))
7545 continue;
7547 for (ipif = ill->ill_ipif; ipif != NULL;
7548 ipif = ipif->ipif_next) {
7549 if ((ipif->ipif_flags & IPIF_NOXMIT) &&
7550 !(flags & LIFC_NOXMIT))
7551 continue;
7553 if ((ipif->ipif_flags & IPIF_TEMPORARY) &&
7554 !(flags & LIFC_TEMPORARY))
7555 continue;
7557 if (((ipif->ipif_flags &
7558 (IPIF_NOXMIT|IPIF_NOLOCAL|
7559 IPIF_DEPRECATED)) ||
7560 IS_LOOPBACK(ill) ||
7561 !(ipif->ipif_flags & IPIF_UP)) &&
7562 (flags & LIFC_EXTERNAL_SOURCE))
7563 continue;
7565 if (zoneid != ipif->ipif_zoneid &&
7566 ipif->ipif_zoneid != ALL_ZONES &&
7567 (zoneid != GLOBAL_ZONEID ||
7568 !(flags & LIFC_ALLZONES)))
7569 continue;
7571 if ((uchar_t *)&lifr[1] > mp1->b_wptr) {
7572 if (iocp->ioc_cmd == O_SIOCGLIFCONF) {
7573 rw_exit(&ipst->ips_ill_g_lock);
7574 return (EINVAL);
7575 } else {
7576 goto lif_copydone;
7580 ipif_get_name(ipif, lifr->lifr_name,
7581 sizeof (lifr->lifr_name));
7582 lifr->lifr_type = ill->ill_type;
7583 if (ipif->ipif_isv6) {
7584 sin6 = (sin6_t *)&lifr->lifr_addr;
7585 *sin6 = sin6_null;
7586 sin6->sin6_family = AF_INET6;
7587 sin6->sin6_addr =
7588 ipif->ipif_v6lcl_addr;
7589 lifr->lifr_addrlen =
7590 ip_mask_to_plen_v6(
7591 &ipif->ipif_v6net_mask);
7592 } else {
7593 sin = (sin_t *)&lifr->lifr_addr;
7594 *sin = sin_null;
7595 sin->sin_family = AF_INET;
7596 sin->sin_addr.s_addr =
7597 ipif->ipif_lcl_addr;
7598 lifr->lifr_addrlen =
7599 ip_mask_to_plen(
7600 ipif->ipif_net_mask);
7602 lifr++;
7605 lif_copydone:
7606 rw_exit(&ipst->ips_ill_g_lock);
7608 mp1->b_wptr = (uchar_t *)lifr;
7609 if (STRUCT_BUF(lifc) != NULL) {
7610 STRUCT_FSET(lifc, lifc_len,
7611 (int)((uchar_t *)lifr - mp1->b_rptr));
7613 return (0);
7616 static void
7617 ip_sioctl_ip6addrpolicy(queue_t *q, mblk_t *mp)
7619 ip6_asp_t *table;
7620 size_t table_size;
7621 mblk_t *data_mp;
7622 struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
7623 ip_stack_t *ipst;
7625 if (q->q_next == NULL)
7626 ipst = CONNQ_TO_IPST(q);
7627 else
7628 ipst = ILLQ_TO_IPST(q);
7630 /* These two ioctls are I_STR only */
7631 if (iocp->ioc_count == TRANSPARENT) {
7632 miocnak(q, mp, 0, EINVAL);
7633 return;
7636 data_mp = mp->b_cont;
7637 if (data_mp == NULL) {
7638 /* The user passed us a NULL argument */
7639 table = NULL;
7640 table_size = iocp->ioc_count;
7641 } else {
7643 * The user provided a table. The stream head
7644 * may have copied in the user data in chunks,
7645 * so make sure everything is pulled up
7646 * properly.
7648 if (MBLKL(data_mp) < iocp->ioc_count) {
7649 mblk_t *new_data_mp;
7650 if ((new_data_mp = msgpullup(data_mp, -1)) ==
7651 NULL) {
7652 miocnak(q, mp, 0, ENOMEM);
7653 return;
7655 freemsg(data_mp);
7656 data_mp = new_data_mp;
7657 mp->b_cont = data_mp;
7659 table = (ip6_asp_t *)data_mp->b_rptr;
7660 table_size = iocp->ioc_count;
7663 switch (iocp->ioc_cmd) {
7664 case SIOCGIP6ADDRPOLICY:
7665 iocp->ioc_rval = ip6_asp_get(table, table_size, ipst);
7666 if (iocp->ioc_rval == -1)
7667 iocp->ioc_error = EINVAL;
7668 #if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4
7669 else if (table != NULL &&
7670 (iocp->ioc_flag & IOC_MODELS) == IOC_ILP32) {
7671 ip6_asp_t *src = table;
7672 ip6_asp32_t *dst = (void *)table;
7673 int count = table_size / sizeof (ip6_asp_t);
7674 int i;
7677 * We need to do an in-place shrink of the array
7678 * to match the alignment attributes of the
7679 * 32-bit ABI looking at it.
7681 /* LINTED: logical expression always true: op "||" */
7682 ASSERT(sizeof (*src) > sizeof (*dst));
7683 for (i = 1; i < count; i++)
7684 bcopy(src + i, dst + i, sizeof (*dst));
7686 #endif
7687 break;
7689 case SIOCSIP6ADDRPOLICY:
7690 ASSERT(mp->b_prev == NULL);
7691 mp->b_prev = (void *)q;
7692 #if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4
7694 * We pass in the datamodel here so that the ip6_asp_replace()
7695 * routine can handle converting from 32-bit to native formats
7696 * where necessary.
7698 * A better way to handle this might be to convert the inbound
7699 * data structure here, and hang it off a new 'mp'; thus the
7700 * ip6_asp_replace() logic would always be dealing with native
7701 * format data structures.
7703 * (An even simpler way to handle these ioctls is to just
7704 * add a 32-bit trailing 'pad' field to the ip6_asp_t structure
7705 * and just recompile everything that depends on it.)
7707 #endif
7708 ip6_asp_replace(mp, table, table_size, B_FALSE, ipst,
7709 iocp->ioc_flag & IOC_MODELS);
7710 return;
7713 DB_TYPE(mp) = (iocp->ioc_error == 0) ? M_IOCACK : M_IOCNAK;
7714 qreply(q, mp);
7717 static void
7718 ip_sioctl_dstinfo(queue_t *q, mblk_t *mp)
7720 mblk_t *data_mp;
7721 struct dstinforeq *dir;
7722 uint8_t *end, *cur;
7723 in6_addr_t *daddr, *saddr;
7724 ipaddr_t v4daddr;
7725 ire_t *ire;
7726 ipaddr_t v4setsrc;
7727 in6_addr_t v6setsrc;
7728 char *slabel, *dlabel;
7729 boolean_t isipv4;
7730 int match_ire;
7731 ill_t *dst_ill;
7732 struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
7733 conn_t *connp = Q_TO_CONN(q);
7734 zoneid_t zoneid = IPCL_ZONEID(connp);
7735 ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
7736 uint64_t ipif_flags;
7738 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */
7741 * This ioctl is I_STR only, and must have a
7742 * data mblk following the M_IOCTL mblk.
7744 data_mp = mp->b_cont;
7745 if (iocp->ioc_count == TRANSPARENT || data_mp == NULL) {
7746 miocnak(q, mp, 0, EINVAL);
7747 return;
7750 if (MBLKL(data_mp) < iocp->ioc_count) {
7751 mblk_t *new_data_mp;
7753 if ((new_data_mp = msgpullup(data_mp, -1)) == NULL) {
7754 miocnak(q, mp, 0, ENOMEM);
7755 return;
7757 freemsg(data_mp);
7758 data_mp = new_data_mp;
7759 mp->b_cont = data_mp;
7761 match_ire = MATCH_IRE_DSTONLY;
7763 for (cur = data_mp->b_rptr, end = data_mp->b_wptr;
7764 end - cur >= sizeof (struct dstinforeq);
7765 cur += sizeof (struct dstinforeq)) {
7766 dir = (struct dstinforeq *)cur;
7767 daddr = &dir->dir_daddr;
7768 saddr = &dir->dir_saddr;
7771 * ip_addr_scope_v6() and ip6_asp_lookup() handle
7772 * v4 mapped addresses; ire_ftable_lookup_v6()
7773 * and ip_select_source_v6() do not.
7775 dir->dir_dscope = ip_addr_scope_v6(daddr);
7776 dlabel = ip6_asp_lookup(daddr, &dir->dir_precedence, ipst);
7778 isipv4 = IN6_IS_ADDR_V4MAPPED(daddr);
7779 if (isipv4) {
7780 IN6_V4MAPPED_TO_IPADDR(daddr, v4daddr);
7781 v4setsrc = INADDR_ANY;
7782 ire = ire_route_recursive_v4(v4daddr, 0, NULL, zoneid,
7783 match_ire, IRR_ALLOCATE, 0, ipst, &v4setsrc, NULL);
7784 } else {
7785 v6setsrc = ipv6_all_zeros;
7786 ire = ire_route_recursive_v6(daddr, 0, NULL, zoneid,
7787 match_ire, IRR_ALLOCATE, 0, ipst, &v6setsrc, NULL);
7789 ASSERT(ire != NULL);
7790 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
7791 ire_refrele(ire);
7792 dir->dir_dreachable = 0;
7794 /* move on to next dst addr */
7795 continue;
7797 dir->dir_dreachable = 1;
7799 dst_ill = ire_nexthop_ill(ire);
7800 if (dst_ill == NULL) {
7801 ire_refrele(ire);
7802 continue;
7805 /* With ipmp we most likely look at the ipmp ill here */
7806 dir->dir_dmactype = dst_ill->ill_mactype;
7808 if (isipv4) {
7809 ipaddr_t v4saddr;
7811 if (ip_select_source_v4(dst_ill, v4setsrc, v4daddr,
7812 connp->conn_ixa->ixa_multicast_ifaddr, zoneid, ipst,
7813 &v4saddr, NULL, &ipif_flags) != 0) {
7814 v4saddr = INADDR_ANY;
7815 ipif_flags = 0;
7817 IN6_IPADDR_TO_V4MAPPED(v4saddr, saddr);
7818 } else {
7819 if (ip_select_source_v6(dst_ill, &v6setsrc, daddr,
7820 zoneid, ipst, B_FALSE, IPV6_PREFER_SRC_DEFAULT,
7821 saddr, NULL, &ipif_flags) != 0) {
7822 *saddr = ipv6_all_zeros;
7823 ipif_flags = 0;
7827 dir->dir_sscope = ip_addr_scope_v6(saddr);
7828 slabel = ip6_asp_lookup(saddr, NULL, ipst);
7829 dir->dir_labelmatch = ip6_asp_labelcmp(dlabel, slabel);
7830 dir->dir_sdeprecated = (ipif_flags & IPIF_DEPRECATED) ? 1 : 0;
7831 ire_refrele(ire);
7832 ill_refrele(dst_ill);
7834 miocack(q, mp, iocp->ioc_count, 0);
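/*
 * Illustrative sketch (not part of the original source): how a userland
 * caller, e.g. a getaddrinfo() implementation doing RFC 3484 sorting,
 * might drive the SIOCGDSTINFO handler above.  The socket `s', the
 * candidate destination `dst' and the use of <sys/sockio.h> for struct
 * dstinforeq are assumptions; error handling is omitted.
 *
 *	#include <stropts.h>
 *	#include <sys/sockio.h>
 *	#include <netinet/in.h>
 *	#include <strings.h>
 *
 *	struct dstinforeq dir;
 *	struct strioctl str;
 *
 *	bzero(&dir, sizeof (dir));
 *	dir.dir_daddr = dst;			// in6_addr_t to evaluate
 *	str.ic_cmd = SIOCGDSTINFO;
 *	str.ic_timout = -1;
 *	str.ic_len = sizeof (dir);
 *	str.ic_dp = (char *)&dir;
 *	if (ioctl(s, I_STR, &str) == 0 && dir.dir_dreachable)
 *		;	// dir.dir_saddr holds the kernel-selected source
 */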
7835 }
7837 /*
7838 * Check if this is an address assigned to this machine.
7839 * Skips interfaces that are down by using ire checks.
7840 * Translates mapped addresses to v4 addresses and then
7841 * treats them as such, returning true if the v4 address
7842 * associated with this mapped address is configured.
7843 * Note: Applications will have to be careful what they do
7844 * with the response; use of mapped addresses limits
7845 * what can be done with the socket, especially with
7846 * respect to socket options and ioctls - neither IPv4
7847 * options nor IPv6 sticky options/ancillary data options
7848 * may be used.
7849 */
7850 /* ARGSUSED */
7851 int
7852 ip_sioctl_tmyaddr(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
7853 ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
7855 struct sioc_addrreq *sia;
7856 sin_t *sin;
7857 ire_t *ire;
7858 mblk_t *mp1;
7859 zoneid_t zoneid;
7860 ip_stack_t *ipst;
7862 ip1dbg(("ip_sioctl_tmyaddr"));
7864 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */
7865 zoneid = Q_TO_CONN(q)->conn_zoneid;
7866 ipst = CONNQ_TO_IPST(q);
7868 /* Existence verified in ip_wput_nondata */
7869 mp1 = mp->b_cont->b_cont;
7870 sia = (struct sioc_addrreq *)mp1->b_rptr;
7871 sin = (sin_t *)&sia->sa_addr;
7872 switch (sin->sin_family) {
7873 case AF_INET6: {
7874 sin6_t *sin6 = (sin6_t *)sin;
7876 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
7877 ipaddr_t v4_addr;
7879 IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr,
7880 v4_addr);
7881 ire = ire_ftable_lookup_v4(v4_addr, 0, 0,
7882 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid,
7883 MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, 0, ipst, NULL);
7884 } else {
7885 in6_addr_t v6addr;
7887 v6addr = sin6->sin6_addr;
7888 ire = ire_ftable_lookup_v6(&v6addr, 0, 0,
7889 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid,
7890 MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, 0, ipst, NULL);
7892 break;
7894 case AF_INET: {
7895 ipaddr_t v4addr;
7897 v4addr = sin->sin_addr.s_addr;
7898 ire = ire_ftable_lookup_v4(v4addr, 0, 0,
7899 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid,
7900 MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, 0, ipst, NULL);
7901 break;
7903 default:
7904 return (EAFNOSUPPORT);
7906 if (ire != NULL) {
7907 sia->sa_res = 1;
7908 ire_refrele(ire);
7909 } else {
7910 sia->sa_res = 0;
7912 return (0);
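/*
 * Illustrative sketch (not part of the original source): issuing
 * SIOCTMYADDR from userland with the struct sioc_addrreq parsed above;
 * SIOCTONLINK below is driven the same way.  The socket `s' is assumed.
 *
 *	#include <sys/sockio.h>
 *	#include <netinet/in.h>
 *	#include <arpa/inet.h>
 *	#include <strings.h>
 *
 *	struct sioc_addrreq sar;
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&sar.sa_addr;
 *
 *	bzero(&sar, sizeof (sar));
 *	sin->sin_family = AF_INET;
 *	(void) inet_pton(AF_INET, "192.0.2.1", &sin->sin_addr);
 *	if (ioctl(s, SIOCTMYADDR, &sar) == 0 && sar.sa_res != 0)
 *		;	// 192.0.2.1 is configured on this machine
 */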
7913 }
7915 /*
7916 * Check if this is an on-link (i.e., neighbor) address and make
7917 * sure it's reachable from the current zone.
7918 * Returns true for my addresses as well.
7919 * Translates mapped addresses to v4 addresses and then
7920 * treats them as such, returning true if the v4 address
7921 * associated with this mapped address is configured.
7922 * Note: Applications will have to be careful what they do
7923 * with the response; use of mapped addresses limits
7924 * what can be done with the socket, especially with
7925 * respect to socket options and ioctls - neither IPv4
7926 * options nor IPv6 sticky options/ancillary data options
7927 * may be used.
7928 */
7929 /* ARGSUSED */
7930 int
7931 ip_sioctl_tonlink(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
7932 ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
7934 struct sioc_addrreq *sia;
7935 sin_t *sin;
7936 mblk_t *mp1;
7937 ire_t *ire = NULL;
7938 zoneid_t zoneid;
7939 ip_stack_t *ipst;
7941 ip1dbg(("ip_sioctl_tonlink"));
7943 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */
7944 zoneid = Q_TO_CONN(q)->conn_zoneid;
7945 ipst = CONNQ_TO_IPST(q);
7947 /* Existence verified in ip_wput_nondata */
7948 mp1 = mp->b_cont->b_cont;
7949 sia = (struct sioc_addrreq *)mp1->b_rptr;
7950 sin = (sin_t *)&sia->sa_addr;
7952 /*
7953 * We check for IRE_ONLINK and exclude IRE_BROADCAST|IRE_MULTICAST
7954 * to make sure we only look at on-link unicast addresses.
7955 */
7956 switch (sin->sin_family) {
7957 case AF_INET6: {
7958 sin6_t *sin6 = (sin6_t *)sin;
7960 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
7961 ipaddr_t v4_addr;
7963 IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr,
7964 v4_addr);
7965 if (!CLASSD(v4_addr)) {
7966 ire = ire_ftable_lookup_v4(v4_addr, 0, 0, 0,
7967 NULL, zoneid, MATCH_IRE_DSTONLY, 0, ipst,
7968 NULL);
7970 } else {
7971 in6_addr_t v6addr;
7973 v6addr = sin6->sin6_addr;
7974 if (!IN6_IS_ADDR_MULTICAST(&v6addr)) {
7975 ire = ire_ftable_lookup_v6(&v6addr, 0, 0, 0,
7976 NULL, zoneid, MATCH_IRE_DSTONLY, 0, ipst,
7977 NULL);
7980 break;
7982 case AF_INET: {
7983 ipaddr_t v4addr;
7985 v4addr = sin->sin_addr.s_addr;
7986 if (!CLASSD(v4addr)) {
7987 ire = ire_ftable_lookup_v4(v4addr, 0, 0, 0, NULL,
7988 zoneid, MATCH_IRE_DSTONLY, 0, ipst, NULL);
7990 break;
7992 default:
7993 return (EAFNOSUPPORT);
7995 sia->sa_res = 0;
7996 if (ire != NULL) {
7997 ASSERT(!(ire->ire_type & IRE_MULTICAST));
7999 if ((ire->ire_type & IRE_ONLINK) &&
8000 !(ire->ire_type & IRE_BROADCAST))
8001 sia->sa_res = 1;
8002 ire_refrele(ire);
8004 return (0);
8005 }
8007 /*
8008 * TBD: implement when the kernel maintains a list of site prefixes.
8009 */
8010 /* ARGSUSED */
8011 int
8012 ip_sioctl_tmysite(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
8013 ip_ioctl_cmd_t *ipip, void *ifreq)
8014 {
8015 return (ENXIO);
8016 }
8018 /* ARP IOCTLs. */
8019 /* ARGSUSED */
8020 int
8021 ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
8022 ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
8024 int err;
8025 ipaddr_t ipaddr;
8026 struct iocblk *iocp;
8027 conn_t *connp;
8028 struct arpreq *ar;
8029 struct xarpreq *xar;
8030 int arp_flags, flags, alength;
8031 uchar_t *lladdr;
8032 ip_stack_t *ipst;
8033 ill_t *ill = ipif->ipif_ill;
8034 ill_t *proxy_ill = NULL;
8035 ipmp_arpent_t *entp = NULL;
8036 boolean_t proxyarp = B_FALSE;
8037 boolean_t if_arp_ioctl = B_FALSE;
8038 ncec_t *ncec = NULL;
8039 nce_t *nce;
8041 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL);
8042 connp = Q_TO_CONN(q);
8043 ipst = connp->conn_netstack->netstack_ip;
8044 iocp = (struct iocblk *)mp->b_rptr;
8046 if (ipip->ipi_cmd_type == XARP_CMD) {
8047 /* We have a chain - M_IOCTL-->MI_COPY_MBLK-->XARPREQ_MBLK */
8048 xar = (struct xarpreq *)mp->b_cont->b_cont->b_rptr;
8049 ar = NULL;
8051 arp_flags = xar->xarp_flags;
8052 lladdr = (uchar_t *)LLADDR(&xar->xarp_ha);
8053 if_arp_ioctl = (xar->xarp_ha.sdl_nlen != 0);
8054 /*
8055 * Validate against user's link layer address length
8056 * input and name and addr length limits.
8057 */
8058 alength = ill->ill_phys_addr_length;
8059 if (ipip->ipi_cmd == SIOCSXARP) {
8060 if (alength != xar->xarp_ha.sdl_alen ||
8061 (alength + xar->xarp_ha.sdl_nlen >
8062 sizeof (xar->xarp_ha.sdl_data)))
8063 return (EINVAL);
8065 } else {
8066 /* We have a chain - M_IOCTL-->MI_COPY_MBLK-->ARPREQ_MBLK */
8067 ar = (struct arpreq *)mp->b_cont->b_cont->b_rptr;
8068 xar = NULL;
8070 arp_flags = ar->arp_flags;
8071 lladdr = (uchar_t *)ar->arp_ha.sa_data;
8072 /*
8073 * Theoretically, the sa_family could tell us what link
8074 * layer type this operation is trying to deal with. By
8075 * common usage AF_UNSPEC means ethernet. We'll assume
8076 * any attempt to use the SIOC?ARP ioctls is for ethernet,
8077 * for now. Our new SIOC*XARP ioctls can be used more
8078 * generally.
8079 *
8080 * If the underlying media happens to have a non-6-byte
8081 * address, the arp module will fail set/get, but the del
8082 * operation will succeed.
8083 */
8084 alength = 6;
8085 if ((ipip->ipi_cmd != SIOCDARP) &&
8086 (alength != ill->ill_phys_addr_length)) {
8087 return (EINVAL);
8091 /* Translate ATF* flags to NCE* flags */
8092 flags = 0;
8093 if (arp_flags & ATF_AUTHORITY)
8094 flags |= NCE_F_AUTHORITY;
8095 if (arp_flags & ATF_PERM)
8096 flags |= NCE_F_NONUD; /* not subject to aging */
8097 if (arp_flags & ATF_PUBL)
8098 flags |= NCE_F_PUBLISH;
8100 /*
8101 * IPMP ARP special handling:
8102 *
8103 * 1. Since ARP mappings must appear consistent across the group,
8104 * prohibit changing ARP mappings on the underlying interfaces.
8105 *
8106 * 2. Since ARP mappings for IPMP data addresses are maintained by
8107 * IP itself, prohibit changing them.
8108 *
8109 * 3. For proxy ARP, use a functioning hardware address in the group,
8110 * provided one exists. If one doesn't, just add the entry as-is;
8111 * ipmp_illgrp_refresh_arpent() will refresh it if things change.
8112 */
8113 if (IS_UNDER_IPMP(ill)) {
8114 if (ipip->ipi_cmd != SIOCGARP && ipip->ipi_cmd != SIOCGXARP)
8115 return (EPERM);
8117 if (IS_IPMP(ill)) {
8118 ipmp_illgrp_t *illg = ill->ill_grp;
8120 switch (ipip->ipi_cmd) {
8121 case SIOCSARP:
8122 case SIOCSXARP:
8123 proxy_ill = ipmp_illgrp_find_ill(illg, lladdr, alength);
8124 if (proxy_ill != NULL) {
8125 proxyarp = B_TRUE;
8126 if (!ipmp_ill_is_active(proxy_ill))
8127 proxy_ill = ipmp_illgrp_next_ill(illg);
8128 if (proxy_ill != NULL)
8129 lladdr = proxy_ill->ill_phys_addr;
8131 /* FALLTHRU */
8132 }
8133 }
8135 ipaddr = sin->sin_addr.s_addr;
8136 /*
8137 * don't match across illgrp per case (1) and (2).
8138 * XXX use IS_IPMP(ill) like ndp_sioc_update?
8139 */
8140 nce = nce_lookup_v4(ill, &ipaddr);
8141 if (nce != NULL)
8142 ncec = nce->nce_common;
8144 switch (iocp->ioc_cmd) {
8145 case SIOCDARP:
8146 case SIOCDXARP: {
8147 /*
8148 * Delete the NCE if any.
8149 */
8150 if (ncec == NULL) {
8151 iocp->ioc_error = ENXIO;
8152 break;
8154 /* Don't allow changes to arp mappings of local addresses. */
8155 if (NCE_MYADDR(ncec)) {
8156 nce_refrele(nce);
8157 return (ENOTSUP);
8159 iocp->ioc_error = 0;
8161 /*
8162 * Delete the nce_common which has ncec_ill set to ipmp_ill.
8163 * This will delete all the nce entries on the under_ills.
8164 */
8165 ncec_delete(ncec);
8166 /*
8167 * Once the NCE has been deleted, then the ire_dep* consistency
8168 * mechanism will find any IRE which depended on the now
8169 * condemned NCE (as part of sending packets).
8170 * That mechanism handles redirects by deleting redirects
8171 * that refer to UNREACHABLE nces.
8172 */
8173 break;
8175 case SIOCGARP:
8176 case SIOCGXARP:
8177 if (ncec != NULL) {
8178 lladdr = ncec->ncec_lladdr;
8179 flags = ncec->ncec_flags;
8180 iocp->ioc_error = 0;
8181 ip_sioctl_garp_reply(mp, ncec->ncec_ill, lladdr, flags);
8182 } else {
8183 iocp->ioc_error = ENXIO;
8185 break;
8186 case SIOCSARP:
8187 case SIOCSXARP:
8188 /* Don't allow changes to arp mappings of local addresses. */
8189 if (ncec != NULL && NCE_MYADDR(ncec)) {
8190 nce_refrele(nce);
8191 return (ENOTSUP);
8194 /* static arp entries will undergo NUD if ATF_PERM is not set */
8195 flags |= NCE_F_STATIC;
8196 if (!if_arp_ioctl) {
8197 ip_nce_lookup_and_update(&ipaddr, NULL, ipst,
8198 lladdr, alength, flags);
8199 } else {
8200 ipif_t *ipif = ipif_get_next_ipif(NULL, ill);
8201 if (ipif != NULL) {
8202 ip_nce_lookup_and_update(&ipaddr, ipif, ipst,
8203 lladdr, alength, flags);
8204 ipif_refrele(ipif);
8207 if (nce != NULL) {
8208 nce_refrele(nce);
8209 nce = NULL;
8210 }
8211 /*
8212 * NCE_F_STATIC entries will be added in state ND_REACHABLE
8213 * by nce_add_common()
8214 */
8215 err = nce_lookup_then_add_v4(ill, lladdr,
8216 ill->ill_phys_addr_length, &ipaddr, flags, ND_UNCHANGED,
8217 &nce);
8218 if (err == EEXIST) {
8219 ncec = nce->nce_common;
8220 mutex_enter(&ncec->ncec_lock);
8221 ncec->ncec_state = ND_REACHABLE;
8222 ncec->ncec_flags = flags;
8223 nce_update(ncec, ND_UNCHANGED, lladdr);
8224 mutex_exit(&ncec->ncec_lock);
8225 err = 0;
8227 if (nce != NULL) {
8228 nce_refrele(nce);
8229 nce = NULL;
8231 if (IS_IPMP(ill) && err == 0) {
8232 entp = ipmp_illgrp_create_arpent(ill->ill_grp,
8233 proxyarp, ipaddr, lladdr, ill->ill_phys_addr_length,
8234 flags);
8235 if (entp == NULL || (proxyarp && proxy_ill == NULL)) {
8236 iocp->ioc_error = (entp == NULL ? ENOMEM : 0);
8237 break;
8240 iocp->ioc_error = err;
8243 if (nce != NULL) {
8244 nce_refrele(nce);
8245 }
8247 /*
8248 * If we created an IPMP ARP entry, mark that we've notified ARP.
8249 */
8250 if (entp != NULL)
8251 ipmp_illgrp_mark_arpent(ill->ill_grp, entp);
8253 return (iocp->ioc_error);
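/*
 * Illustrative sketch (not part of the original source): adding a
 * permanent, published (proxy) ARP entry from userland, the request the
 * SIOCSARP arm above services.  `s' is an AF_INET socket and `mac' a
 * 6-byte ethernet address; error handling is omitted.
 *
 *	#include <sys/sockio.h>
 *	#include <net/if_arp.h>
 *	#include <netinet/in.h>
 *	#include <arpa/inet.h>
 *	#include <strings.h>
 *
 *	struct arpreq ar;
 *	struct sockaddr_in *pa = (struct sockaddr_in *)&ar.arp_pa;
 *
 *	bzero(&ar, sizeof (ar));
 *	pa->sin_family = AF_INET;
 *	(void) inet_pton(AF_INET, "192.0.2.10", &pa->sin_addr);
 *	ar.arp_ha.sa_family = AF_UNSPEC;	// 6-byte ethernet assumed
 *	bcopy(mac, ar.arp_ha.sa_data, 6);
 *	ar.arp_flags = ATF_PERM | ATF_PUBL;	// NCE_F_NONUD|NCE_F_PUBLISH
 *	(void) ioctl(s, SIOCSARP, &ar);
 */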
8254 }
8256 /*
8257 * Parse an [x]arpreq structure coming down SIOC[GSD][X]ARP ioctls, identify
8258 * the associated sin and refhold and return the associated ipif via `ci'.
8259 */
8260 static int
8261 ip_extract_arpreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip,
8262 cmd_info_t *ci)
8263 {
8264 mblk_t *mp1;
8265 sin_t *sin;
8266 conn_t *connp;
8267 ipif_t *ipif;
8268 ire_t *ire = NULL;
8269 ill_t *ill = NULL;
8270 boolean_t exists;
8271 ip_stack_t *ipst;
8272 struct arpreq *ar;
8273 struct xarpreq *xar;
8274 struct sockaddr_dl *sdl;
8276 /* ioctl comes down on a conn */
8277 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL);
8278 connp = Q_TO_CONN(q);
8279 if (connp->conn_family == AF_INET6)
8280 return (ENXIO);
8282 ipst = connp->conn_netstack->netstack_ip;
8284 /* Verified in ip_wput_nondata */
8285 mp1 = mp->b_cont->b_cont;
8287 if (ipip->ipi_cmd_type == XARP_CMD) {
8288 ASSERT(MBLKL(mp1) >= sizeof (struct xarpreq));
8289 xar = (struct xarpreq *)mp1->b_rptr;
8290 sin = (sin_t *)&xar->xarp_pa;
8291 sdl = &xar->xarp_ha;
8293 if (sdl->sdl_family != AF_LINK || sin->sin_family != AF_INET)
8294 return (ENXIO);
8295 if (sdl->sdl_nlen >= LIFNAMSIZ)
8296 return (EINVAL);
8297 } else {
8298 ASSERT(ipip->ipi_cmd_type == ARP_CMD);
8299 ASSERT(MBLKL(mp1) >= sizeof (struct arpreq));
8300 ar = (struct arpreq *)mp1->b_rptr;
8301 sin = (sin_t *)&ar->arp_pa;
8304 if (ipip->ipi_cmd_type == XARP_CMD && sdl->sdl_nlen != 0) {
8305 ipif = ipif_lookup_on_name(sdl->sdl_data, sdl->sdl_nlen,
8306 B_FALSE, &exists, B_FALSE, ALL_ZONES, ipst);
8307 if (ipif == NULL)
8308 return (ENXIO);
8309 if (ipif->ipif_id != 0) {
8310 ipif_refrele(ipif);
8311 return (ENXIO);
8313 } else {
8314 /*
8315 * Either an SIOC[DGS]ARP or an SIOC[DGS]XARP with an sdl_nlen
8316 * of 0: use the IP address to find the ipif. If the IP
8317 * address is an IPMP test address, ire_ftable_lookup() will
8318 * find the wrong ill, so we first do an ipif_lookup_addr().
8319 */
8320 ipif = ipif_lookup_addr(sin->sin_addr.s_addr, NULL, ALL_ZONES,
8321 ipst);
8322 if (ipif == NULL) {
8323 ire = ire_ftable_lookup_v4(sin->sin_addr.s_addr,
8324 0, 0, IRE_IF_RESOLVER, NULL, ALL_ZONES,
8325 MATCH_IRE_TYPE, 0, ipst, NULL);
8326 if (ire == NULL || ((ill = ire->ire_ill) == NULL)) {
8327 if (ire != NULL)
8328 ire_refrele(ire);
8329 return (ENXIO);
8331 ASSERT(ire != NULL && ill != NULL);
8332 ipif = ill->ill_ipif;
8333 ipif_refhold(ipif);
8334 ire_refrele(ire);
8338 if (ipif->ipif_ill->ill_net_type != IRE_IF_RESOLVER) {
8339 ipif_refrele(ipif);
8340 return (ENXIO);
8343 ci->ci_sin = sin;
8344 ci->ci_ipif = ipif;
8345 return (0);
8346 }
8348 /*
8349 * Link or unlink the illgrp on IPMP meta-interface `ill' depending on the
8350 * value of `ioccmd'. While an illgrp is linked to an ipmp_grp_t, it is
8351 * accessible from that ipmp_grp_t, which means SIOCSLIFGROUPNAME can look it
8352 * up and thus an ill can join that illgrp.
8353 *
8354 * We use I_PLINK/I_PUNLINK to do the link/unlink operations rather than
8355 * open()/close() primarily because close() is not allowed to fail or block
8356 * forever. On the other hand, I_PUNLINK *can* fail, and there's no reason
8357 * why anyone should ever need to I_PUNLINK an in-use IPMP stream. To ensure
8358 * symmetric behavior (e.g., doing an I_PLINK after an I_PUNLINK undoes the
8359 * I_PUNLINK) we defer linking to I_PLINK. Separately, we also fail attempts
8360 * to I_LINK since I_UNLINK is optional and we'd end up in an inconsistent
8361 * state if I_UNLINK didn't occur.
8362 *
8363 * Note that for each plumb/unplumb operation, we may end up here more than
8364 * once because of the way ifconfig works. However, it's OK to link the same
8365 * illgrp more than once, or unlink an illgrp that's already unlinked.
8366 */
8367 static int
8368 ip_sioctl_plink_ipmp(ill_t *ill, int ioccmd)
8370 int err;
8371 ip_stack_t *ipst = ill->ill_ipst;
8373 ASSERT(IS_IPMP(ill));
8374 ASSERT(IAM_WRITER_ILL(ill));
8376 switch (ioccmd) {
8377 case I_LINK:
8378 return (ENOTSUP);
8380 case I_PLINK:
8381 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
8382 ipmp_illgrp_link_grp(ill->ill_grp, ill->ill_phyint->phyint_grp);
8383 rw_exit(&ipst->ips_ipmp_lock);
8384 break;
8386 case I_PUNLINK:
8387 /*
8388 * Require all UP ipifs be brought down prior to unlinking the
8389 * illgrp so any associated IREs (and other state) are torched.
8390 */
8391 if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0)
8392 return (EBUSY);
8394 /*
8395 * NOTE: We hold ipmp_lock across the unlink to prevent a race
8396 * with an SIOCSLIFGROUPNAME request from an ill trying to
8397 * join this group. Specifically: ills trying to join grab
8398 * ipmp_lock and bump a "pending join" counter checked by
8399 * ipmp_illgrp_unlink_grp(). During the unlink no new pending
8400 * joins can occur (since we have ipmp_lock). Once we drop
8401 * ipmp_lock, subsequent SIOCSLIFGROUPNAME requests will not
8402 * find the illgrp (since we unlinked it) and will return
8403 * EAFNOSUPPORT. This will then take them back through the
8404 * IPMP meta-interface plumbing logic in ifconfig, and thus
8405 * back through I_PLINK above.
8406 */
8407 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
8408 err = ipmp_illgrp_unlink_grp(ill->ill_grp);
8409 rw_exit(&ipst->ips_ipmp_lock);
8410 return (err);
8411 default:
8412 break;
8413 }
8414 return (0);
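/*
 * Illustrative sketch (not part of the original source): the userland
 * side of a persistent STREAMS link, the operation the function above
 * reacts to.  The device paths are placeholders; ifconfig(8) performs
 * the real sequence when plumbing interfaces.
 *
 *	#include <stropts.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int mux = open("/dev/udp", O_RDWR);	// multiplexor (upper stream)
 *	int low = open("/dev/ip", O_RDWR);	// stream to link beneath it
 *	int muxid = ioctl(mux, I_PLINK, low);	// reaches ip_sioctl_plink()
 *	(void) close(low);			// the link persists past close
 *	...
 *	(void) ioctl(mux, I_PUNLINK, muxid);	// later, undo the link
 */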
8415 }
8417 /*
8418 * Do I_PLINK/I_LINK or I_PUNLINK/I_UNLINK with consistency checks and also
8419 * atomically set/clear the muxids. Also complete the ioctl by acking or
8420 * naking it. Note that the code is structured such that the link type,
8421 * whether it's persistent or not, is treated equally. ifconfig(8) and
8422 * its clones use the persistent link, while pppd(8) and perhaps many
8423 * other daemons may use a non-persistent link. When combined with some
8424 * ill_t states, linking and unlinking lower streams may be used as
8425 * indicators of dynamic re-plumbing events [see PSARC/1999/348].
8426 */
8427 /* ARGSUSED */
8428 void
8429 ip_sioctl_plink(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
8431 mblk_t *mp1;
8432 struct linkblk *li;
8433 int ioccmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd;
8434 int err = 0;
8436 ASSERT(ioccmd == I_PLINK || ioccmd == I_PUNLINK ||
8437 ioccmd == I_LINK || ioccmd == I_UNLINK);
8439 mp1 = mp->b_cont; /* This is the linkblk info */
8440 li = (struct linkblk *)mp1->b_rptr;
8442 err = ip_sioctl_plink_ipmod(ipsq, q, mp, ioccmd, li);
8443 if (err == EINPROGRESS)
8444 return;
8445 if (err == 0)
8446 miocack(q, mp, 0, 0);
8447 else
8448 miocnak(q, mp, 0, err);
8450 /* Conn was refheld in ip_sioctl_copyin_setup */
8451 if (CONN_Q(q)) {
8452 CONN_DEC_IOCTLREF(Q_TO_CONN(q));
8453 CONN_OPER_PENDING_DONE(Q_TO_CONN(q));
8454 }
8455 }
8457 /*
8458 * Process I_{P}LINK and I_{P}UNLINK requests named by `ioccmd' and pointed to
8459 * by `mp' and `li' for the IP module stream (if li->l_qbot is in fact an IP
8460 * module stream).
8461 * Returns zero on success, EINPROGRESS if the operation is still pending, or
8462 * an error code on failure.
8463 */
8464 static int
8465 ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd,
8466 struct linkblk *li)
8468 int err = 0;
8469 ill_t *ill;
8470 queue_t *ipwq, *dwq;
8471 const char *name;
8472 struct qinit *qinfo;
8473 boolean_t islink = (ioccmd == I_PLINK || ioccmd == I_LINK);
8474 boolean_t entered_ipsq = B_FALSE;
8475 boolean_t is_ip = B_FALSE;
8476 arl_t *arl;
8478 /*
8479 * Walk the lower stream to verify it's the IP module stream.
8480 * The IP module is identified by its name, wput function,
8481 * and non-NULL q_next. STREAMS ensures that the lower stream
8482 * (li->l_qbot) will not vanish until this ioctl completes.
8483 */
8484 for (ipwq = li->l_qbot; ipwq != NULL; ipwq = ipwq->q_next) {
8485 qinfo = ipwq->q_qinfo;
8486 name = qinfo->qi_minfo->mi_idname;
8487 if (name != NULL && strcmp(name, ip_mod_info.mi_idname) == 0 &&
8488 qinfo->qi_putp != ip_lwput && ipwq->q_next != NULL) {
8489 is_ip = B_TRUE;
8490 break;
8492 if (name != NULL && strcmp(name, arp_mod_info.mi_idname) == 0 &&
8493 qinfo->qi_putp != ip_lwput && ipwq->q_next != NULL) {
8494 break;
8495 }
8496 }
8501 /* If this isn't an IP module stream, bail. */
8501 if (ipwq == NULL)
8502 return (0);
8504 if (!is_ip) {
8505 arl = (arl_t *)ipwq->q_ptr;
8506 ill = arl_to_ill(arl);
8507 if (ill == NULL)
8508 return (0);
8509 } else {
8510 ill = ipwq->q_ptr;
8512 ASSERT(ill != NULL);
8514 if (ipsq == NULL) {
8515 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_sioctl_plink,
8516 NEW_OP, B_FALSE);
8517 if (ipsq == NULL) {
8518 if (!is_ip)
8519 ill_refrele(ill);
8520 return (EINPROGRESS);
8522 entered_ipsq = B_TRUE;
8524 ASSERT(IAM_WRITER_ILL(ill));
8525 mutex_enter(&ill->ill_lock);
8526 if (!is_ip) {
8527 if (islink && ill->ill_muxid == 0) {
8528 /*
8529 * Plumbing has to be done with IP plumbed first, arp
8530 * second, but here we have arp being plumbed first.
8531 */
8532 mutex_exit(&ill->ill_lock);
8533 if (entered_ipsq)
8534 ipsq_exit(ipsq);
8535 ill_refrele(ill);
8536 return (EINVAL);
8539 mutex_exit(&ill->ill_lock);
8540 if (!is_ip) {
8541 arl->arl_muxid = islink ? li->l_index : 0;
8542 ill_refrele(ill);
8543 goto done;
8546 if (IS_IPMP(ill) && (err = ip_sioctl_plink_ipmp(ill, ioccmd)) != 0)
8547 goto done;
8549 /*
8550 * As part of I_{P}LINKing, stash the number of downstream modules and
8551 * the read queue of the module immediately below IP in the ill.
8552 * These are used during the capability negotiation below.
8553 */
8554 ill->ill_lmod_rq = NULL;
8555 ill->ill_lmod_cnt = 0;
8556 if (islink && ((dwq = ipwq->q_next) != NULL)) {
8557 ill->ill_lmod_rq = RD(dwq);
8558 for (; dwq != NULL; dwq = dwq->q_next)
8559 ill->ill_lmod_cnt++;
8562 ill->ill_muxid = islink ? li->l_index : 0;
8564 /*
8565 * Mark the ipsq busy until the capability operations initiated below
8566 * complete. The PLINK/UNLINK ioctl itself completes when our caller
8567 * returns, but the capability operation may complete asynchronously
8568 * much later.
8569 */
8570 ipsq_current_start(ipsq, ill->ill_ipif, ioccmd);
8571 /*
8572 * If there's at least one up ipif on this ill, then we're bound to
8573 * the underlying driver via DLPI. In that case, renegotiate
8574 * capabilities to account for any possible change in modules
8575 * interposed between IP and the driver.
8576 */
8577 if (ill->ill_ipif_up_count > 0) {
8578 if (islink)
8579 ill_capability_probe(ill);
8580 else
8581 ill_capability_reset(ill, B_FALSE);
8583 ipsq_current_finish(ipsq);
8584 done:
8585 if (entered_ipsq)
8586 ipsq_exit(ipsq);
8588 return (err);
8589 }
8591 /*
8592 * Search for the ioctl command in the ioctl tables and return a pointer
8593 * to the ioctl command information. The ioctl command tables are
8594 * static and fully populated at compile time.
8595 */
8596 ip_ioctl_cmd_t *
8597 ip_sioctl_lookup(int ioc_cmd)
8599 int index;
8600 ip_ioctl_cmd_t *ipip;
8601 ip_ioctl_cmd_t *ipip_end;
8603 if (ioc_cmd == IPI_DONTCARE)
8604 return (NULL);
8606 /*
8607 * Do a 2-step search. First search the indexed table
8608 * based on the least significant byte of the ioctl cmd.
8609 * If we don't find a match, then search the misc table
8610 * serially.
8611 */
8612 index = ioc_cmd & 0xFF;
8613 if (index < ip_ndx_ioctl_count) {
8614 ipip = &ip_ndx_ioctl_table[index];
8615 if (ipip->ipi_cmd == ioc_cmd) {
8616 /* Found a match in the ndx table */
8617 return (ipip);
8621 /* Search the misc table */
8622 ipip_end = &ip_misc_ioctl_table[ip_misc_ioctl_count];
8623 for (ipip = ip_misc_ioctl_table; ipip < ipip_end; ipip++) {
8624 if (ipip->ipi_cmd == ioc_cmd)
8625 /* Found a match in the misc table */
8626 return (ipip);
8629 return (NULL);
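/*
 * Illustrative sketch (not part of the original source) of the same
 * two-step dispatch: an O(1) table indexed by the command's low byte,
 * with a serial scan of a small "misc" table as the fallback.  The types
 * are hypothetical.
 *
 *	typedef struct {
 *		int	cmd;
 *		int	(*handler)(void *);
 *	} ent_t;
 *
 *	static const ent_t *
 *	tbl_lookup(int cmd, const ent_t *ndx, int nndx,
 *	    const ent_t *misc, int nmisc)
 *	{
 *		int i = cmd & 0xFF;
 *
 *		if (i < nndx && ndx[i].cmd == cmd)
 *			return (&ndx[i]);	// hit in the indexed table
 *		for (i = 0; i < nmisc; i++)
 *			if (misc[i].cmd == cmd)
 *				return (&misc[i]);
 *		return (NULL);
 *	}
 */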
8632 /*
8633 * Helper function for ip_sioctl_getsetprop(), which does some sanity checks.
8634 */
8635 static boolean_t
8636 getset_ioctl_checks(mod_ioc_prop_t *pioc, int ioc_cmd)
8638 uint_t flags = pioc->mpr_flags;
8639 if (ioc_cmd == SIOCSETPROP) {
8640 /*
8641 * One can either reset the value to its default value or
8642 * change the current value or append/remove the value from
8643 * a multi-valued property.
8644 */
8645 if ((flags & MOD_PROP_DEFAULT) != MOD_PROP_DEFAULT &&
8646 flags != MOD_PROP_ACTIVE &&
8647 flags != (MOD_PROP_ACTIVE|MOD_PROP_APPEND) &&
8648 flags != (MOD_PROP_ACTIVE|MOD_PROP_REMOVE))
8649 return (B_FALSE);
8650 } else {
8651 ASSERT(ioc_cmd == SIOCGETPROP);
8653 /*
8654 * One can retrieve only one kind of property information
8655 * at a time.
8656 */
8657 if ((flags & MOD_PROP_ACTIVE) != MOD_PROP_ACTIVE &&
8658 (flags & MOD_PROP_DEFAULT) != MOD_PROP_DEFAULT &&
8659 (flags & MOD_PROP_POSSIBLE) != MOD_PROP_POSSIBLE &&
8660 (flags & MOD_PROP_PERM) != MOD_PROP_PERM)
8661 return (B_FALSE);
8664 return (B_TRUE);
8665 }
8667 /*
8668 * Process the SIOC{SET|GET}PROP ioctls.
8669 */
8670 /* ARGSUSED */
8671 int
8672 ip_sioctl_getsetprop(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
8673 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_if_req)
8675 int ioc_cmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd;
8676 mblk_t *mp1;
8677 mod_ioc_prop_t *pioc;
8678 mod_prop_info_t *ptbl = NULL, *pinfo = NULL;
8679 ip_stack_t *ipst;
8680 netstack_t *stack;
8681 cred_t *cr;
8682 boolean_t set;
8683 int err;
8685 ASSERT(q->q_next == NULL);
8686 ASSERT(CONN_Q(q));
8688 mp1 = mp->b_cont->b_cont;
8689 ipst = CONNQ_TO_IPST(q);
8690 stack = ipst->ips_netstack;
8691 pioc = (mod_ioc_prop_t *)mp1->b_rptr;
8692 if (!getset_ioctl_checks(pioc, ioc_cmd))
8693 return (EINVAL);
8695 switch (pioc->mpr_proto) {
8696 case MOD_PROTO_IP:
8697 case MOD_PROTO_IPV4:
8698 case MOD_PROTO_IPV6:
8699 ptbl = ipst->ips_propinfo_tbl;
8700 break;
8701 case MOD_PROTO_RAWIP:
8702 ptbl = stack->netstack_icmp->is_propinfo_tbl;
8703 break;
8704 case MOD_PROTO_TCP:
8705 ptbl = stack->netstack_tcp->tcps_propinfo_tbl;
8706 break;
8707 case MOD_PROTO_UDP:
8708 ptbl = stack->netstack_udp->us_propinfo_tbl;
8709 break;
8710 case MOD_PROTO_SCTP:
8711 ptbl = stack->netstack_sctp->sctps_propinfo_tbl;
8712 break;
8713 default:
8714 return (EINVAL);
8715 }
8717 pioc->mpr_ifname[sizeof (pioc->mpr_ifname) - 1] = '\0';
8718 pioc->mpr_name[sizeof (pioc->mpr_name) - 1] = '\0';
8719 pioc->mpr_val[sizeof (pioc->mpr_val) - 1] = '\0';
8721 pinfo = mod_prop_lookup(ptbl, pioc->mpr_name, pioc->mpr_proto);
8722 if (pinfo == NULL)
8723 return (ENOENT);
8725 set = (ioc_cmd == SIOCSETPROP) ? B_TRUE : B_FALSE;
8726 if (set && pinfo->mpi_setf != NULL) {
8727 cr = msg_getcred(mp, NULL);
8728 err = pinfo->mpi_setf(stack, cr, pinfo, pioc->mpr_ifname,
8729 pioc->mpr_val, pioc->mpr_flags);
8730 } else if (!set && pinfo->mpi_getf != NULL) {
8731 err = pinfo->mpi_getf(stack, pinfo, pioc->mpr_ifname,
8732 pioc->mpr_val, sizeof (pioc->mpr_val), pioc->mpr_flags);
8733 } else {
8734 err = EPERM;
8737 return (err);
8738 }
8740 /*
8741 * Process the legacy ND_GET, ND_SET ioctls just for {ip|ip6}_forwarding,
8742 * as several routing daemons have unfortunately used these 'unpublished'
8743 * but well-known ioctls.
8744 */
8745 /* ARGSUSED */
8746 static void
8747 ip_process_legacy_nddprop(queue_t *q, mblk_t *mp)
8749 struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
8750 mblk_t *mp1 = mp->b_cont;
8751 char *pname, *pval, *buf;
8752 uint_t bufsize, proto;
8753 mod_prop_info_t *pinfo = NULL;
8754 ip_stack_t *ipst;
8755 int err = 0;
8757 ASSERT(CONN_Q(q));
8758 ipst = CONNQ_TO_IPST(q);
8760 if (iocp->ioc_count == 0 || mp1 == NULL) {
8761 miocnak(q, mp, 0, EINVAL);
8762 return;
8765 mp1->b_datap->db_lim[-1] = '\0'; /* Force null termination */
8766 pval = buf = pname = (char *)mp1->b_rptr;
8767 bufsize = MBLKL(mp1);
8769 if (strcmp(pname, "ip_forwarding") == 0) {
8770 pname = "forwarding";
8771 proto = MOD_PROTO_IPV4;
8772 } else if (strcmp(pname, "ip6_forwarding") == 0) {
8773 pname = "forwarding";
8774 proto = MOD_PROTO_IPV6;
8775 } else {
8776 miocnak(q, mp, 0, EINVAL);
8777 return;
8780 pinfo = mod_prop_lookup(ipst->ips_propinfo_tbl, pname, proto);
8782 switch (iocp->ioc_cmd) {
8783 case ND_GET:
8784 if ((err = pinfo->mpi_getf(ipst->ips_netstack, pinfo, NULL, buf,
8785 bufsize, 0)) == 0) {
8786 miocack(q, mp, iocp->ioc_count, 0);
8787 return;
8789 break;
8790 case ND_SET:
8791 /*
8792 * The buffer holds the property name and value in the
8793 * following format:
8794 * <property name>'\0'<property value>'\0'; extract them.
8795 */
8796 while (*pval++)
8797 noop;
8799 if (!*pval || pval >= (char *)mp1->b_wptr) {
8800 err = EINVAL;
8801 } else if ((err = pinfo->mpi_setf(ipst->ips_netstack, NULL,
8802 pinfo, NULL, pval, 0)) == 0) {
8803 miocack(q, mp, 0, 0);
8804 return;
8806 break;
8807 default:
8808 err = EINVAL;
8809 break;
8811 miocnak(q, mp, 0, err);
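/*
 * Illustrative sketch (not part of the original source): the ndd(8)-style
 * userland sequence that lands here.  For ND_SET the buffer carries
 * name'\0'value'\0', matching the parsing above.  The descriptor `fd'
 * (an IP stream or socket) is assumed; error handling is omitted.
 *
 *	#include <stropts.h>
 *	#include <inet/nd.h>
 *	#include <string.h>
 *
 *	char buf[128];
 *	struct strioctl str;
 *
 *	(void) memcpy(buf, "ip_forwarding\0001\000", 16);
 *	str.ic_cmd = ND_SET;
 *	str.ic_timout = -1;
 *	str.ic_len = 16;
 *	str.ic_dp = buf;
 *	(void) ioctl(fd, I_STR, &str);	// enable IPv4 forwarding
 */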
8812 }
8814 /*
8815 * Wrapper function for resuming deferred ioctl processing.
8816 * Used for SIOCGDSTINFO, SIOCGIP6ADDRPOLICY, SIOCGMSFILTER,
8817 * SIOCSMSFILTER, SIOCGIPMSFILTER, and SIOCSIPMSFILTER currently.
8818 */
8819 /* ARGSUSED */
8820 void
8821 ip_sioctl_copyin_resume(ipsq_t *dummy_ipsq, queue_t *q, mblk_t *mp,
8822 void *dummy_arg)
8824 ip_sioctl_copyin_setup(q, mp);
8825 }
8827 /*
8828 * ip_sioctl_copyin_setup is called by ip_wput_nondata with any M_IOCTL message
8829 * that arrives. Most of the IOCTLs are "socket" IOCTLs which we handle
8830 * in either I_STR or TRANSPARENT form, using the mi_copy facility.
8831 * We establish here the size of the block to be copied in. mi_copyin
8832 * arranges for this to happen, and processing continues in ip_wput_nondata
8833 * with an M_IOCDATA message.
8834 */
8835 void
8836 ip_sioctl_copyin_setup(queue_t *q, mblk_t *mp)
8838 int copyin_size;
8839 struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
8840 ip_ioctl_cmd_t *ipip;
8841 cred_t *cr;
8842 ip_stack_t *ipst;
8844 if (CONN_Q(q))
8845 ipst = CONNQ_TO_IPST(q);
8846 else
8847 ipst = ILLQ_TO_IPST(q);
8849 ipip = ip_sioctl_lookup(iocp->ioc_cmd);
8850 if (ipip == NULL) {
8851 /*
8852 * The ioctl is not one we understand or own.
8853 * Pass it along to be processed downstream,
8854 * if this is a module instance of IP; else nak
8855 * the ioctl.
8856 */
8857 if (q->q_next == NULL) {
8858 goto nak;
8859 } else {
8860 putnext(q, mp);
8861 return;
8862 }
8863 }
8865 /*
8866 * If this is deferred, then we will do all the checks when we
8867 * come back.
8868 */
8869 if ((iocp->ioc_cmd == SIOCGDSTINFO ||
8870 iocp->ioc_cmd == SIOCGIP6ADDRPOLICY) && !ip6_asp_can_lookup(ipst)) {
8871 ip6_asp_pending_op(q, mp, ip_sioctl_copyin_resume);
8872 return;
8873 }
8875 /*
8876 * Only allow a very small subset of IP ioctls on this stream if
8877 * IP is a module and not a driver. Allowing ioctls to be processed
8878 * in this case may cause assert failures or data corruption.
8879 * Typically G[L]IFFLAGS, SLIFNAME/IF_UNITSEL are the only few
8880 * ioctls allowed on an IP module stream, after which this stream
8881 * normally becomes a multiplexor (at which time the stream head
8882 * will fail all ioctls).
8883 */
8884 if ((q->q_next != NULL) && !(ipip->ipi_flags & IPI_MODOK)) {
8885 goto nak;
8888 /* Make sure we have ioctl data to process. */
8889 if (mp->b_cont == NULL && !(ipip->ipi_flags & IPI_NULL_BCONT))
8890 goto nak;
8892 /*
8893 * Prefer dblk credential over ioctl credential; some synthesized
8894 * ioctls have kcred set because there's no way to crhold()
8895 * a credential in some contexts. (ioc_cr is not crfree()d by
8896 * the framework; the caller of ioctl needs to hold the reference
8897 * for the duration of the call).
8898 */
8899 cr = msg_getcred(mp, NULL);
8900 if (cr == NULL)
8901 cr = iocp->ioc_cr;
8903 /* Make sure normal users don't send down privileged ioctls */
8904 if ((ipip->ipi_flags & IPI_PRIV) &&
8905 (cr != NULL) && secpolicy_ip_config(cr, B_TRUE) != 0) {
8906 /* We checked the privilege earlier but log it here */
8907 miocnak(q, mp, 0, secpolicy_ip_config(cr, B_FALSE));
8908 return;
8909 }
8911 /*
8912 * The ioctl command tables can only encode fixed length
8913 * ioctl data. If the length is variable, the table will
8914 * encode the length as zero. Such special cases are handled
8915 * below in the switch.
8916 */
8917 if (ipip->ipi_copyin_size != 0) {
8918 mi_copyin(q, mp, NULL, ipip->ipi_copyin_size);
8919 return;
8922 switch (iocp->ioc_cmd) {
8923 case O_SIOCGIFCONF:
8924 case SIOCGIFCONF:
8925 /*
8926 * This IOCTL is hilarious. See comments in
8927 * ip_sioctl_get_ifconf for the story.
8928 */
8929 if (iocp->ioc_count == TRANSPARENT)
8930 copyin_size = SIZEOF_STRUCT(ifconf,
8931 iocp->ioc_flag);
8932 else
8933 copyin_size = iocp->ioc_count;
8934 mi_copyin(q, mp, NULL, copyin_size);
8935 return;
8937 case O_SIOCGLIFCONF:
8938 case SIOCGLIFCONF:
8939 copyin_size = SIZEOF_STRUCT(lifconf, iocp->ioc_flag);
8940 mi_copyin(q, mp, NULL, copyin_size);
8941 return;
8943 case SIOCGLIFSRCOF:
8944 copyin_size = SIZEOF_STRUCT(lifsrcof, iocp->ioc_flag);
8945 mi_copyin(q, mp, NULL, copyin_size);
8946 return;
8948 case SIOCGIP6ADDRPOLICY:
8949 ip_sioctl_ip6addrpolicy(q, mp);
8950 ip6_asp_table_refrele(ipst);
8951 return;
8953 case SIOCSIP6ADDRPOLICY:
8954 ip_sioctl_ip6addrpolicy(q, mp);
8955 return;
8957 case SIOCGDSTINFO:
8958 ip_sioctl_dstinfo(q, mp);
8959 ip6_asp_table_refrele(ipst);
8960 return;
8962 case ND_SET:
8963 case ND_GET:
8964 ip_process_legacy_nddprop(q, mp);
8965 return;
8967 case I_PLINK:
8968 case I_PUNLINK:
8969 case I_LINK:
8970 case I_UNLINK:
8971 /*
8972 * We treat the non-persistent link similarly to the persistent
8973 * link case, in terms of plumbing/unplumbing, as well as
8974 * dynamic re-plumbing events indicator. See comments
8975 * in ip_sioctl_plink() for more.
8976 *
8977 * Request can be enqueued in the 'ipsq' while waiting
8978 * to become exclusive. So bump up the conn ref.
8979 */
8980 if (CONN_Q(q)) {
8981 CONN_INC_REF(Q_TO_CONN(q));
8982 CONN_INC_IOCTLREF(Q_TO_CONN(q))
8983 }
8984 ip_sioctl_plink(NULL, q, mp, NULL);
8985 return;
8987 case IP_IOCTL:
8988 ip_wput_ioctl(q, mp);
8989 return;
8991 case SIOCILB:
8992 /* The ioctl length varies depending on the ILB command. */
8993 copyin_size = iocp->ioc_count;
8994 if (copyin_size < sizeof (ilb_cmd_t))
8995 goto nak;
8996 mi_copyin(q, mp, NULL, copyin_size);
8997 return;
8999 default:
9000 cmn_err(CE_WARN, "Unknown ioctl %d/0x%x slipped through.",
9001 iocp->ioc_cmd, iocp->ioc_cmd);
9002 /* FALLTHRU */
9004 nak:
9005 if (mp->b_cont != NULL) {
9006 freemsg(mp->b_cont);
9007 mp->b_cont = NULL;
9009 iocp->ioc_error = EINVAL;
9010 mp->b_datap->db_type = M_IOCNAK;
9011 iocp->ioc_count = 0;
9012 qreply(q, mp);
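/*
 * Illustrative sketch (not part of the original source): a typical
 * userland consumer of the variable-length SIOCGLIFCONF path set up
 * above, sizing the buffer with SIOCGLIFNUM first.  The socket `s' is
 * assumed; error handling is omitted.
 *
 *	#include <sys/sockio.h>
 *	#include <net/if.h>
 *	#include <stdlib.h>
 *	#include <strings.h>
 *
 *	struct lifnum ln;
 *	struct lifconf lc;
 *
 *	bzero(&ln, sizeof (ln));
 *	ln.lifn_family = AF_UNSPEC;
 *	(void) ioctl(s, SIOCGLIFNUM, &ln);
 *	bzero(&lc, sizeof (lc));
 *	lc.lifc_family = AF_UNSPEC;
 *	lc.lifc_len = ln.lifn_count * sizeof (struct lifreq);
 *	lc.lifc_buf = malloc(lc.lifc_len);
 *	(void) ioctl(s, SIOCGLIFCONF, &lc);
 *	// lc.lifc_len now holds the bytes filled in; walk lc.lifc_req[]
 */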
9013 }
9015 static void
9016 ip_sioctl_garp_reply(mblk_t *mp, ill_t *ill, void *hwaddr, int flags)
9018 struct arpreq *ar;
9019 struct xarpreq *xar;
9020 mblk_t *tmp;
9021 struct iocblk *iocp;
9022 int x_arp_ioctl = B_FALSE;
9023 int *flagsp;
9024 char *storage = NULL;
9026 ASSERT(ill != NULL);
9028 iocp = (struct iocblk *)mp->b_rptr;
9029 ASSERT(iocp->ioc_cmd == SIOCGXARP || iocp->ioc_cmd == SIOCGARP);
9031 tmp = (mp->b_cont)->b_cont; /* xarpreq/arpreq */
9032 if ((iocp->ioc_cmd == SIOCGXARP) ||
9033 (iocp->ioc_cmd == SIOCSXARP)) {
9034 x_arp_ioctl = B_TRUE;
9035 xar = (struct xarpreq *)tmp->b_rptr;
9036 flagsp = &xar->xarp_flags;
9037 storage = xar->xarp_ha.sdl_data;
9038 } else {
9039 ar = (struct arpreq *)tmp->b_rptr;
9040 flagsp = &ar->arp_flags;
9041 storage = ar->arp_ha.sa_data;
9042 }
9044 /*
9045 * We're done if this is not an SIOCG{X}ARP
9046 */
9047 if (x_arp_ioctl) {
9048 storage += ill_xarp_info(&xar->xarp_ha, ill);
9049 if ((ill->ill_phys_addr_length + ill->ill_name_length) >
9050 sizeof (xar->xarp_ha.sdl_data)) {
9051 iocp->ioc_error = EINVAL;
9052 return;
9055 *flagsp = ATF_INUSE;
9056 /*
9057 * If /sbin/arp told us we are the authority using the "permanent"
9058 * flag, or if this is one of my addresses, print "permanent"
9059 * in the /sbin/arp output.
9060 */
9061 if ((flags & NCE_F_MYADDR) || (flags & NCE_F_AUTHORITY))
9062 *flagsp |= ATF_AUTHORITY;
9063 if (flags & NCE_F_NONUD)
9064 *flagsp |= ATF_PERM; /* not subject to aging */
9065 if (flags & NCE_F_PUBLISH)
9066 *flagsp |= ATF_PUBL;
9067 if (hwaddr != NULL) {
9068 *flagsp |= ATF_COM;
9069 bcopy((char *)hwaddr, storage, ill->ill_phys_addr_length);
9070 }
9071 }
9073 /*
9074 * Create a new logical interface. If ipif_id is zero (i.e. not a logical
9075 * interface) create the next available logical interface for this
9076 * physical interface.
9077 * If ipif is NULL (i.e. the lookup didn't find one) attempt to create an
9078 * ipif with the specified name.
9079 *
9080 * If the address family is not AF_UNSPEC then set the address as well.
9081 *
9082 * If ip_sioctl_addr returns EINPROGRESS then the ioctl (the copyout)
9083 * is completed when the DL_BIND_ACK arrives in ip_rput_dlpi_writer.
9084 *
9085 * Executed as a writer on the ill.
9086 * So no lock is needed to traverse the ipif chain, or examine the
9087 * phyint flags.
9088 */
9089 /* ARGSUSED */
9090 int
9091 ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
9092 ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq)
9094 mblk_t *mp1;
9095 struct lifreq *lifr;
9096 boolean_t isv6;
9097 boolean_t exists;
9098 char *name;
9099 char *endp;
9100 char *cp;
9101 int namelen;
9102 ipif_t *ipif;
9103 long id;
9104 ipsq_t *ipsq;
9105 ill_t *ill;
9106 sin_t *sin;
9107 int err = 0;
9108 boolean_t found_sep = B_FALSE;
9109 conn_t *connp;
9110 zoneid_t zoneid;
9111 ip_stack_t *ipst = CONNQ_TO_IPST(q);
9113 ASSERT(q->q_next == NULL);
9114 ip1dbg(("ip_sioctl_addif\n"));
9115 /* Existence of mp1 has been checked in ip_wput_nondata */
9116 mp1 = mp->b_cont->b_cont;
9117 /*
9118 * Null terminate the string to protect against buffer
9119 * overrun. String was generated by user code and may not
9120 * be trusted.
9121 */
9122 lifr = (struct lifreq *)mp1->b_rptr;
9123 lifr->lifr_name[LIFNAMSIZ - 1] = '\0';
9124 name = lifr->lifr_name;
9125 ASSERT(CONN_Q(q));
9126 connp = Q_TO_CONN(q);
9127 isv6 = (connp->conn_family == AF_INET6);
9128 zoneid = connp->conn_zoneid;
9129 namelen = mi_strlen(name);
9130 if (namelen == 0)
9131 return (EINVAL);
9133 exists = B_FALSE;
9134 if ((namelen + 1 == sizeof (ipif_loopback_name)) &&
9135 (mi_strcmp(name, ipif_loopback_name) == 0)) {
9136 /*
9137 * Allow creating lo0 using SIOCLIFADDIF.
9138 * There can't be any other writer thread, so we can pass null below
9139 * for the last 4 args to ipif_lookup_on_name.
9140 */
9141 ipif = ipif_lookup_on_name(lifr->lifr_name, namelen, B_TRUE,
9142 &exists, isv6, zoneid, ipst);
9143 /* Prevent any further action */
9144 if (ipif == NULL) {
9145 return (ENOBUFS);
9146 } else if (!exists) {
9147 /* We created the ipif now and as writer */
9148 ipif_refrele(ipif);
9149 return (0);
9150 } else {
9151 ill = ipif->ipif_ill;
9152 ill_refhold(ill);
9153 ipif_refrele(ipif);
9155 } else {
9156 /* Look for a colon in the name. */
9157 endp = &name[namelen];
9158 for (cp = endp; --cp > name; ) {
9159 if (*cp == IPIF_SEPARATOR_CHAR) {
9160 found_sep = B_TRUE;
9161 /*
9162 * Reject any non-decimal aliases for plumbing
9163 * of logical interfaces. Aliases with leading
9164 * zeroes are also rejected as they introduce
9165 * ambiguity in the naming of the interfaces.
9166 * Comparing with "0" takes care of all such
9167 * cases.
9168 */
9169 if ((strncmp("0", cp+1, 1)) == 0)
9170 return (EINVAL);
9172 if (ddi_strtol(cp+1, &endp, 10, &id) != 0 ||
9173 id <= 0 || *endp != '\0') {
9174 return (EINVAL);
9176 *cp = '\0';
9177 break;
9180 ill = ill_lookup_on_name(name, B_FALSE, isv6, NULL, ipst);
9181 if (found_sep)
9182 *cp = IPIF_SEPARATOR_CHAR;
9183 if (ill == NULL)
9184 return (ENXIO);
9187 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_process_ioctl, NEW_OP,
9188 B_TRUE);
9190 /*
9191 * Release the refhold due to the lookup, now that we are excl
9192 * or we are just returning.
9193 */
9194 ill_refrele(ill);
9196 if (ipsq == NULL)
9197 return (EINPROGRESS);
9199 /* We are now exclusive on the IPSQ */
9200 ASSERT(IAM_WRITER_ILL(ill));
9202 if (found_sep) {
9203 /* Now see if there is an IPIF with this unit number. */
9204 for (ipif = ill->ill_ipif; ipif != NULL;
9205 ipif = ipif->ipif_next) {
9206 if (ipif->ipif_id == id) {
9207 err = EEXIST;
9208 goto done;
9209 }
9210 }
9211 }
9213 /*
9214 * We use IRE_LOCAL for lo0:1 etc. for "receive only" use
9215 * of lo0. Plumbing for lo0:0 happens in ipif_lookup_on_name()
9216 * instead.
9217 */
9218 if ((ipif = ipif_allocate(ill, found_sep ? id : -1, IRE_LOCAL,
9219 B_TRUE, B_TRUE, &err)) == NULL) {
9220 goto done;
9223 /* Return created name with ioctl */
9224 (void) sprintf(lifr->lifr_name, "%s%c%d", ill->ill_name,
9225 IPIF_SEPARATOR_CHAR, ipif->ipif_id);
9226 ip1dbg(("created %s\n", lifr->lifr_name));
9228 /* Set address */
9229 sin = (sin_t *)&lifr->lifr_addr;
9230 if (sin->sin_family != AF_UNSPEC) {
9231 err = ip_sioctl_addr(ipif, sin, q, mp,
9232 &ip_ndx_ioctl_table[SIOCLIFADDR_NDX], lifr);
9235 done:
9236 ipsq_exit(ipsq);
9237 return (err);
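/*
 * Illustrative sketch (not part of the original source): plumbing a new
 * logical interface from userland via SIOCLIFADDIF, the request handled
 * above.  `s' is an AF_INET socket; "net0" is a placeholder name.
 *
 *	#include <sys/sockio.h>
 *	#include <net/if.h>
 *	#include <string.h>
 *	#include <strings.h>
 *
 *	struct lifreq lifr;
 *
 *	bzero(&lifr, sizeof (lifr));
 *	(void) strlcpy(lifr.lifr_name, "net0", sizeof (lifr.lifr_name));
 *	lifr.lifr_addr.ss_family = AF_UNSPEC;	// no address yet
 *	if (ioctl(s, SIOCLIFADDIF, &lifr) == 0)
 *		;	// lifr.lifr_name now reads e.g. "net0:1"
 *
 * The logical interface can later be removed with SIOCLIFREMOVEIF using
 * the same lifr_name, which enters ip_sioctl_removeif() below.
 */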
9238 }
9240 /*
9241 * Remove an existing logical interface. If ipif_id is zero (i.e. not a
9242 * logical interface) delete it based on the IP address (on this physical
9243 * interface). Otherwise delete it based on the ipif_id.
9244 * Also, special handling to allow a removeif of lo0.
9245 */
9246 /* ARGSUSED */
9247 int
9248 ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
9249 ip_ioctl_cmd_t *ipip, void *dummy_if_req)
9251 conn_t *connp;
9252 ill_t *ill = ipif->ipif_ill;
9253 boolean_t success;
9254 ip_stack_t *ipst;
9256 ipst = CONNQ_TO_IPST(q);
9258 ASSERT(q->q_next == NULL);
9259 ip1dbg(("ip_sioctl_remove_if(%s:%u %p)\n",
9260 ill->ill_name, ipif->ipif_id, (void *)ipif));
9261 ASSERT(IAM_WRITER_IPIF(ipif));
9263 connp = Q_TO_CONN(q);
9264 /*
9265 * Special case for unplumbing lo0 (the loopback physical interface).
9266 * If unplumbing lo0, the incoming address structure has been
9267 * initialized to all zeros. When unplumbing lo0, all its logical
9268 * interfaces must be removed too.
9269 *
9270 * Note that this interface may be called to remove a specific
9271 * loopback logical interface (eg, lo0:1). But in that case
9272 * ipif->ipif_id != 0 so that the code path for that case is the
9273 * same as any other interface (meaning it skips the code directly
9274 * below).
9275 */
9276 if (ipif->ipif_id == 0 && ill->ill_net_type == IRE_LOOPBACK) {
9277 if (sin->sin_family == AF_UNSPEC &&
9278 (IN6_IS_ADDR_UNSPECIFIED(&((sin6_t *)sin)->sin6_addr))) {
9279 /*
9280 * Mark it condemned. No new ref. will be made to ill.
9281 */
9282 mutex_enter(&ill->ill_lock);
9283 ill->ill_state_flags |= ILL_CONDEMNED;
9284 for (ipif = ill->ill_ipif; ipif != NULL;
9285 ipif = ipif->ipif_next) {
9286 ipif->ipif_state_flags |= IPIF_CONDEMNED;
9288 mutex_exit(&ill->ill_lock);
9290 ipif = ill->ill_ipif;
9291 /* unplumb the loopback interface */
9292 ill_delete(ill);
9293 mutex_enter(&connp->conn_lock);
9294 mutex_enter(&ill->ill_lock);
9296 /* Are any references to this ill active */
9297 if (ill_is_freeable(ill)) {
9298 mutex_exit(&ill->ill_lock);
9299 mutex_exit(&connp->conn_lock);
9300 ill_delete_tail(ill);
9301 mi_free(ill);
9302 return (0);
9304 success = ipsq_pending_mp_add(connp, ipif,
9305 CONNP_TO_WQ(connp), mp, ILL_FREE);
9306 mutex_exit(&connp->conn_lock);
9307 mutex_exit(&ill->ill_lock);
9308 if (success)
9309 return (EINPROGRESS);
9310 else
9311 return (EINTR);
9315 if (ipif->ipif_id == 0) {
9316 ipsq_t *ipsq;
9318 /* Find based on address */
9319 if (ipif->ipif_isv6) {
9320 sin6_t *sin6;
9322 if (sin->sin_family != AF_INET6)
9323 return (EAFNOSUPPORT);
9325 sin6 = (sin6_t *)sin;
9326 /* We are a writer, so we should be able to lookup */
9327 ipif = ipif_lookup_addr_exact_v6(&sin6->sin6_addr, ill,
9328 ipst);
9329 } else {
9330 if (sin->sin_family != AF_INET)
9331 return (EAFNOSUPPORT);
9333 /* We are a writer, so we should be able to lookup */
9334 ipif = ipif_lookup_addr_exact(sin->sin_addr.s_addr, ill,
9335 ipst);
9337 if (ipif == NULL) {
9338 return (EADDRNOTAVAIL);
9339 }
9341 /*
9342 * It is possible for a user to send an SIOCLIFREMOVEIF with
9343 * lifr_name of the physical interface but with an ip address
9344 * lifr_addr of a logical interface plumbed over it.
9345 * So update ipx_current_ipif now that ipif points to the
9346 * correct one.
9347 */
9348 ipsq = ipif->ipif_ill->ill_phyint->phyint_ipsq;
9349 ipsq->ipsq_xop->ipx_current_ipif = ipif;
9351 /* This is a writer */
9352 ipif_refrele(ipif);
9353 }
9355 /*
9356 * Cannot delete instance zero since it is tied to the ill.
9357 */
9358 if (ipif->ipif_id == 0)
9359 return (EBUSY);
9361 mutex_enter(&ill->ill_lock);
9362 ipif->ipif_state_flags |= IPIF_CONDEMNED;
9363 mutex_exit(&ill->ill_lock);
9365 ipif_free(ipif);
9367 mutex_enter(&connp->conn_lock);
9368 mutex_enter(&ill->ill_lock);
9370 /* Are any references to this ipif active */
9371 if (ipif_is_freeable(ipif)) {
9372 mutex_exit(&ill->ill_lock);
9373 mutex_exit(&connp->conn_lock);
9374 ipif_non_duplicate(ipif);
9375 (void) ipif_down_tail(ipif);
9376 ipif_free_tail(ipif); /* frees ipif */
9377 return (0);
9379 success = ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp), mp,
9380 IPIF_FREE);
9381 mutex_exit(&ill->ill_lock);
9382 mutex_exit(&connp->conn_lock);
9383 if (success)
9384 return (EINPROGRESS);
9385 else
9386 return (EINTR);
9387 }
9389 /*
9390 * Restart the removeif ioctl. The refcnt has gone down to 0.
9391 * The ipif is already condemned. So can't find it thru lookups.
9392 */
9393 /* ARGSUSED */
9394 int
9395 ip_sioctl_removeif_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q,
9396 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_if_req)
9398 ill_t *ill = ipif->ipif_ill;
9400 ASSERT(IAM_WRITER_IPIF(ipif));
9401 ASSERT(ipif->ipif_state_flags & IPIF_CONDEMNED);
9403 ip1dbg(("ip_sioctl_removeif_restart(%s:%u %p)\n",
9404 ill->ill_name, ipif->ipif_id, (void *)ipif));
9406 if (ipif->ipif_id == 0 && ill->ill_net_type == IRE_LOOPBACK) {
9407 ASSERT(ill->ill_state_flags & ILL_CONDEMNED);
9408 ill_delete_tail(ill);
9409 mi_free(ill);
9410 return (0);
9413 ipif_non_duplicate(ipif);
9414 (void) ipif_down_tail(ipif);
9415 ipif_free_tail(ipif);
9417 return (0);
9418 }
9420 /*
9421 * Set the local interface address using the given prefix and ill_token.
9422 */
9423 /* ARGSUSED */
9424 int
9425 ip_sioctl_prefix(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
9426 ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq)
9428 int err;
9429 in6_addr_t v6addr;
9430 sin6_t *sin6;
9431 ill_t *ill;
9432 int i;
9434 ip1dbg(("ip_sioctl_prefix(%s:%u %p)\n",
9435 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
9437 ASSERT(IAM_WRITER_IPIF(ipif));
9439 if (!ipif->ipif_isv6)
9440 return (EINVAL);
9442 if (sin->sin_family != AF_INET6)
9443 return (EAFNOSUPPORT);
9445 sin6 = (sin6_t *)sin;
9446 v6addr = sin6->sin6_addr;
9447 ill = ipif->ipif_ill;
9449 if (IN6_IS_ADDR_UNSPECIFIED(&v6addr) ||
9450 IN6_IS_ADDR_UNSPECIFIED(&ill->ill_token))
9451 return (EADDRNOTAVAIL);
9453 for (i = 0; i < 4; i++)
9454 sin6->sin6_addr.s6_addr32[i] |= ill->ill_token.s6_addr32[i];
9456 err = ip_sioctl_addr(ipif, sin, q, mp,
9457 &ip_ndx_ioctl_table[SIOCLIFADDR_NDX], dummy_ifreq);
9458 return (err);
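/*
 * Illustrative sketch (not part of the original source) of the address
 * composition performed above: the caller-supplied prefix is OR-ed with
 * the interface token, shown here byte-wise on the portable in6_addr
 * fields.  With a /64 prefix and a 64-bit token the halves do not
 * overlap, so the OR simply concatenates them.
 *
 *	#include <netinet/in.h>
 *
 *	static void
 *	compose_v6(const struct in6_addr *pfx, const struct in6_addr *tok,
 *	    struct in6_addr *out)
 *	{
 *		int i;
 *
 *		for (i = 0; i < 16; i++)
 *			out->s6_addr[i] = pfx->s6_addr[i] | tok->s6_addr[i];
 *	}
 */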
9459 }
9461 /*
9462 * Restart entry point to restart the address set operation after the
9463 * refcounts have dropped to zero.
9464 */
9465 /* ARGSUSED */
9466 int
9467 ip_sioctl_prefix_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
9468 ip_ioctl_cmd_t *ipip, void *ifreq)
9470 ip1dbg(("ip_sioctl_prefix_restart(%s:%u %p)\n",
9471 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
9472 return (ip_sioctl_addr_restart(ipif, sin, q, mp, ipip, ifreq));
9473 }
9475 /*
9476 * Set the local interface address.
9477 * Allow an address of all zero when the interface is down.
9478 */
9479 /* ARGSUSED */
9480 int
9481 ip_sioctl_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
9482 ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq)
9484 int err = 0;
9485 in6_addr_t v6addr;
9486 boolean_t need_up = B_FALSE;
9487 ill_t *ill;
9488 int i;
9490 ip1dbg(("ip_sioctl_addr(%s:%u %p)\n",
9491 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
9493 ASSERT(IAM_WRITER_IPIF(ipif));
9495 ill = ipif->ipif_ill;
9496 if (ipif->ipif_isv6) {
9497 sin6_t *sin6;
9498 phyint_t *phyi;
9500 if (sin->sin_family != AF_INET6)
9501 return (EAFNOSUPPORT);
9503 sin6 = (sin6_t *)sin;
9504 v6addr = sin6->sin6_addr;
9505 phyi = ill->ill_phyint;
9507 /*
9508 * Enforce that true multicast interfaces have a link-local
9509 * address for logical unit 0.
9510 *
9511 * However for those ipif's for which link-local address was
9512 * not created by default, also allow setting :: as the address.
9513 * This scenario arises when we delete an address on the ipif
9514 * with logical unit 0 and then want to set :: as the address.
9515 */
9516 if (ipif->ipif_id == 0 &&
9517 (ill->ill_flags & ILLF_MULTICAST) &&
9518 !(ipif->ipif_flags & (IPIF_POINTOPOINT)) &&
9519 !(phyi->phyint_flags & (PHYI_LOOPBACK)) &&
9520 !IN6_IS_ADDR_LINKLOCAL(&v6addr)) {
9522 /*
9523 * If default link-local was not created by the kernel for
9524 * this ill, allow setting :: as the address on ipif:0.
9525 */
9526 if (ill->ill_flags & ILLF_NOLINKLOCAL) {
9527 if (!IN6_IS_ADDR_UNSPECIFIED(&v6addr))
9528 return (EADDRNOTAVAIL);
9529 } else {
9530 return (EADDRNOTAVAIL);
9531 }
9532 }
9534 /*
9535 * Up interfaces shouldn't have the unspecified address
9536 * unless they also have the IPIF_NOLOCAL flag set and
9537 * have a subnet assigned.
9538 */
9539 if ((ipif->ipif_flags & IPIF_UP) &&
9540 IN6_IS_ADDR_UNSPECIFIED(&v6addr) &&
9541 (!(ipif->ipif_flags & IPIF_NOLOCAL) ||
9542 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) {
9543 return (EADDRNOTAVAIL);
9546 if (!ip_local_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask))
9547 return (EADDRNOTAVAIL);
9548 } else {
9549 ipaddr_t addr;
9551 if (sin->sin_family != AF_INET)
9552 return (EAFNOSUPPORT);
9554 addr = sin->sin_addr.s_addr;
9556 /* Allow INADDR_ANY as the local address. */
9557 if (addr != INADDR_ANY &&
9558 !ip_addr_ok_v4(addr, ipif->ipif_net_mask))
9559 return (EADDRNOTAVAIL);
9561 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
9562 }
9563 /*
9564 * Verify that the address being configured is permitted by the
9565 * ill_allowed_ips[] for the interface.
9566 */
9567 if (ill->ill_allowed_ips_cnt > 0) {
9568 for (i = 0; i < ill->ill_allowed_ips_cnt; i++) {
9569 if (IN6_ARE_ADDR_EQUAL(&ill->ill_allowed_ips[i],
9570 &v6addr))
9571 break;
9572 }
9573 if (i == ill->ill_allowed_ips_cnt) {
9574 pr_addr_dbg("!allowed addr %s\n", AF_INET6, &v6addr);
9575 return (EPERM);
9576 }
9577 }
9578 /*
9579 * Even if there is no change we redo things just to rerun
9580 * ipif_set_default.
9581 */
9582 if (ipif->ipif_flags & IPIF_UP) {
9583 /*
9584 * Setting a new local address, make sure
9585 * we have net and subnet bcast ire's for
9586 * the old address if we need them.
9587 *
9589 * If the interface is already marked up,
9590 * we call ipif_down which will take care
9591 * of ditching any IREs that have been set
9592 * up based on the old interface address.
9593 */
9594 err = ipif_logical_down(ipif, q, mp);
9595 if (err == EINPROGRESS)
9596 return (err);
9597 (void) ipif_down_tail(ipif);
9598 need_up = B_TRUE;
9599 }
9601 err = ip_sioctl_addr_tail(ipif, sin, q, mp, need_up);
9602 return (err);
9603 }
9605 int
9606 ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
9607 boolean_t need_up)
9609 in6_addr_t v6addr;
9610 in6_addr_t ov6addr;
9611 ipaddr_t addr;
9612 sin6_t *sin6;
9613 int sinlen;
9614 int err = 0;
9615 ill_t *ill = ipif->ipif_ill;
9616 boolean_t need_dl_down;
9617 boolean_t need_arp_down;
9618 struct iocblk *iocp;
9620 iocp = (mp != NULL) ? (struct iocblk *)mp->b_rptr : NULL;
9622 ip1dbg(("ip_sioctl_addr_tail(%s:%u %p)\n",
9623 ill->ill_name, ipif->ipif_id, (void *)ipif));
9624 ASSERT(IAM_WRITER_IPIF(ipif));
9626 /* Must cancel any pending timer before taking the ill_lock */
9627 if (ipif->ipif_recovery_id != 0)
9628 (void) untimeout(ipif->ipif_recovery_id);
9629 ipif->ipif_recovery_id = 0;
9631 if (ipif->ipif_isv6) {
9632 sin6 = (sin6_t *)sin;
9633 v6addr = sin6->sin6_addr;
9634 sinlen = sizeof (struct sockaddr_in6);
9635 } else {
9636 addr = sin->sin_addr.s_addr;
9637 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
9638 sinlen = sizeof (struct sockaddr_in);
9640 mutex_enter(&ill->ill_lock);
9641 ov6addr = ipif->ipif_v6lcl_addr;
9642 ipif->ipif_v6lcl_addr = v6addr;
9643 sctp_update_ipif_addr(ipif, ov6addr);
9644 ipif->ipif_addr_ready = 0;
9646 ip_rts_newaddrmsg(RTM_CHGADDR, 0, ipif, RTSQ_DEFAULT);
9648 /*
9649 * If the interface was previously marked as a duplicate, then since
9650 * we've now got a "new" address, it should no longer be considered a
9651 * duplicate -- even if the "new" address is the same as the old one.
9652 * Note that if all ipifs are down, we may have a pending ARP down
9653 * event to handle. This is because we want to recover from duplicates
9654 * and thus delay tearing down ARP until the duplicates have been
9655 * removed or disabled.
9656 */
9657 need_dl_down = need_arp_down = B_FALSE;
9658 if (ipif->ipif_flags & IPIF_DUPLICATE) {
9659 need_arp_down = !need_up;
9660 ipif->ipif_flags &= ~IPIF_DUPLICATE;
9661 if (--ill->ill_ipif_dup_count == 0 && !need_up &&
9662 ill->ill_ipif_up_count == 0 && ill->ill_dl_up) {
9663 need_dl_down = B_TRUE;
9664 }
9665 }
9667 ipif_set_default(ipif);
9669 /*
9670 * If we've just manually set the IPv6 link-local address (0th ipif),
9671 * tag the ill so that future updates to the interface ID don't result
9672 * in this address getting automatically reconfigured from under the
9673 * administrator.
9674 */
9675 if (ipif->ipif_isv6 && ipif->ipif_id == 0) {
9676 if (iocp == NULL || (iocp->ioc_cmd == SIOCSLIFADDR &&
9677 !IN6_IS_ADDR_UNSPECIFIED(&v6addr)))
9678 ill->ill_manual_linklocal = 1;
9679 }
9681 /*
9682 * When publishing an interface address change event, we only notify
9683 * the event listeners of the new address. It is assumed that if they
9684 * actively care about the addresses assigned that they will have
9685 * already discovered the previous address assigned (if there was one.)
9686 *
9687 * Don't attach nic event message for SIOCLIFADDIF ioctl.
9688 */
9689 if (iocp != NULL && iocp->ioc_cmd != SIOCLIFADDIF) {
9690 ill_nic_event_dispatch(ill, MAP_IPIF_ID(ipif->ipif_id),
9691 NE_ADDRESS_CHANGE, sin, sinlen);
9694 mutex_exit(&ill->ill_lock);
9696 if (need_up) {
9697 /*
9698 * Now bring the interface back up. If this
9699 * is the only IPIF for the ILL, ipif_up
9700 * will have to re-bind to the device, so
9701 * we may get back EINPROGRESS, in which
9702 * case, this IOCTL will get completed in
9703 * ip_rput_dlpi when we see the DL_BIND_ACK.
9704 */
9705 err = ipif_up(ipif, q, mp);
9706 } else {
9707 /* Perhaps ilgs should use this ill */
9708 update_conn_ill(NULL, ill->ill_ipst);
9711 if (need_dl_down)
9712 ill_dl_down(ill);
9714 if (need_arp_down && !ill->ill_isv6)
9715 (void) ipif_arp_down(ipif);
9717 /*
9718 * The default multicast interface might have changed (for
9719 * instance if the IPv6 scope of the address changed).
9720 */
9721 ire_increment_multicast_generation(ill->ill_ipst, ill->ill_isv6);
9723 return (err);
9724 }
9726 /*
9727 * Restart entry point to restart the address set operation after the
9728 * refcounts have dropped to zero.
9729 */
9730 /* ARGSUSED */
9731 int
9732 ip_sioctl_addr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
9733 ip_ioctl_cmd_t *ipip, void *ifreq)
9735 ip1dbg(("ip_sioctl_addr_restart(%s:%u %p)\n",
9736 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
9737 ASSERT(IAM_WRITER_IPIF(ipif));
9738 (void) ipif_down_tail(ipif);
9739 return (ip_sioctl_addr_tail(ipif, sin, q, mp, B_TRUE));
9740 }
9742 /* ARGSUSED */
9743 int
9744 ip_sioctl_get_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
9745 ip_ioctl_cmd_t *ipip, void *if_req)
9747 sin6_t *sin6 = (struct sockaddr_in6 *)sin;
9748 struct lifreq *lifr = (struct lifreq *)if_req;
9750 ip1dbg(("ip_sioctl_get_addr(%s:%u %p)\n",
9751 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
9752 /*
9753 * The net mask and address can't change since we have a
9754 * reference to the ipif. So no lock is necessary.
9755 */
9756 if (ipif->ipif_isv6) {
9757 *sin6 = sin6_null;
9758 sin6->sin6_family = AF_INET6;
9759 sin6->sin6_addr = ipif->ipif_v6lcl_addr;
9760 if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) {
9761 sin6->sin6_scope_id =
9762 ipif->ipif_ill->ill_phyint->phyint_ifindex;
9764 ASSERT(ipip->ipi_cmd_type == LIF_CMD);
9765 lifr->lifr_addrlen =
9766 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask);
9767 } else {
9768 *sin = sin_null;
9769 sin->sin_family = AF_INET;
9770 sin->sin_addr.s_addr = ipif->ipif_lcl_addr;
9771 if (ipip->ipi_cmd_type == LIF_CMD) {
9772 lifr->lifr_addrlen =
9773 ip_mask_to_plen(ipif->ipif_net_mask);
9776 return (0);
9777 }
9779 /*
9780 * Set the destination address for a pt-pt interface.
9781 */
9782 /* ARGSUSED */
9783 int
9784 ip_sioctl_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
9785 ip_ioctl_cmd_t *ipip, void *if_req)
9787 int err = 0;
9788 in6_addr_t v6addr;
9789 boolean_t need_up = B_FALSE;
9791 ip1dbg(("ip_sioctl_dstaddr(%s:%u %p)\n",
9792 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
9793 ASSERT(IAM_WRITER_IPIF(ipif));
9795 if (ipif->ipif_isv6) {
9796 sin6_t *sin6;
9798 if (sin->sin_family != AF_INET6)
9799 return (EAFNOSUPPORT);
9801 sin6 = (sin6_t *)sin;
9802 v6addr = sin6->sin6_addr;
9804 if (!ip_remote_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask))
9805 return (EADDRNOTAVAIL);
9806 } else {
9807 ipaddr_t addr;
9809 if (sin->sin_family != AF_INET)
9810 return (EAFNOSUPPORT);
9812 addr = sin->sin_addr.s_addr;
9813 if (addr != INADDR_ANY &&
9814 !ip_addr_ok_v4(addr, ipif->ipif_net_mask)) {
9815 return (EADDRNOTAVAIL);
9818 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
9821 if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6pp_dst_addr, &v6addr))
9822 return (0); /* No change */
9824 if (ipif->ipif_flags & IPIF_UP) {
9826 * If the interface is already marked up,
9827 * we call ipif_down which will take care
9828 * of ditching any IREs that have been set
9829 * up based on the old pp dst address.
9831 err = ipif_logical_down(ipif, q, mp);
9832 if (err == EINPROGRESS)
9833 return (err);
9834 (void) ipif_down_tail(ipif);
9835 need_up = B_TRUE;
9838 * could return EINPROGRESS. If so ioctl will complete in
9839 * ip_rput_dlpi_writer
9841 err = ip_sioctl_dstaddr_tail(ipif, sin, q, mp, need_up);
9842 return (err);
9845 static int
9846 ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
9847 boolean_t need_up)
9849 in6_addr_t v6addr;
9850 ill_t *ill = ipif->ipif_ill;
9851 int err = 0;
9852 boolean_t need_dl_down;
9853 boolean_t need_arp_down;
9855 ip1dbg(("ip_sioctl_dstaddr_tail(%s:%u %p)\n", ill->ill_name,
9856 ipif->ipif_id, (void *)ipif));
9858 /* Must cancel any pending timer before taking the ill_lock */
9859 if (ipif->ipif_recovery_id != 0)
9860 (void) untimeout(ipif->ipif_recovery_id);
9861 ipif->ipif_recovery_id = 0;
9863 if (ipif->ipif_isv6) {
9864 sin6_t *sin6;
9866 sin6 = (sin6_t *)sin;
9867 v6addr = sin6->sin6_addr;
9868 } else {
9869 ipaddr_t addr;
9871 addr = sin->sin_addr.s_addr;
9872 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
9874 mutex_enter(&ill->ill_lock);
9875 /* Set point to point destination address. */
9876 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) {
9878 * Allow this as a means of creating logical
9879 * pt-pt interfaces on top of e.g. an Ethernet.
9880 * XXX Undocumented HACK for testing.
9881 * pt-pt interfaces are created with NUD disabled.
9883 ipif->ipif_flags |= IPIF_POINTOPOINT;
9884 ipif->ipif_flags &= ~IPIF_BROADCAST;
9885 if (ipif->ipif_isv6)
9886 ill->ill_flags |= ILLF_NONUD;
9890 * If the interface was previously marked as a duplicate, then since
9891 * we've now got a "new" address, it should no longer be considered a
9892 * duplicate -- even if the "new" address is the same as the old one.
9893 * Note that if all ipifs are down, we may have a pending ARP down
9894 * event to handle.
9896 need_dl_down = need_arp_down = B_FALSE;
9897 if (ipif->ipif_flags & IPIF_DUPLICATE) {
9898 need_arp_down = !need_up;
9899 ipif->ipif_flags &= ~IPIF_DUPLICATE;
9900 if (--ill->ill_ipif_dup_count == 0 && !need_up &&
9901 ill->ill_ipif_up_count == 0 && ill->ill_dl_up) {
9902 need_dl_down = B_TRUE;
9907 * If we've just manually set the IPv6 destination link-local address
9908 * (0th ipif), tag the ill so that future updates to the destination
9909 * interface ID (as can happen with interfaces over IP tunnels) don't
9910 * result in this address getting automatically reconfigured from
9911 * under the administrator.
9913 if (ipif->ipif_isv6 && ipif->ipif_id == 0)
9914 ill->ill_manual_dst_linklocal = 1;
9916 /* Set the new address. */
9917 ipif->ipif_v6pp_dst_addr = v6addr;
9918 /* Make sure subnet tracks pp_dst */
9919 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr;
9920 mutex_exit(&ill->ill_lock);
9922 if (need_up) {
9924 * Now bring the interface back up. If this
9925 * is the only IPIF for the ILL, ipif_up
9926 * will have to re-bind to the device, so
9927 * we may get back EINPROGRESS, in which
9928 * case, this IOCTL will get completed in
9929 * ip_rput_dlpi when we see the DL_BIND_ACK.
9931 err = ipif_up(ipif, q, mp);
9934 if (need_dl_down)
9935 ill_dl_down(ill);
9936 if (need_arp_down && !ipif->ipif_isv6)
9937 (void) ipif_arp_down(ipif);
9939 return (err);
9943 * Restart entry point to restart the dstaddress set operation after the
9944 * refcounts have dropped to zero.
9946 /* ARGSUSED */
9948 ip_sioctl_dstaddr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
9949 ip_ioctl_cmd_t *ipip, void *ifreq)
9951 ip1dbg(("ip_sioctl_dstaddr_restart(%s:%u %p)\n",
9952 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
9953 (void) ipif_down_tail(ipif);
9954 return (ip_sioctl_dstaddr_tail(ipif, sin, q, mp, B_TRUE));
9957 /* ARGSUSED */
9959 ip_sioctl_get_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
9960 ip_ioctl_cmd_t *ipip, void *if_req)
9962 sin6_t *sin6 = (struct sockaddr_in6 *)sin;
9964 ip1dbg(("ip_sioctl_get_dstaddr(%s:%u %p)\n",
9965 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
9967 * Get point to point destination address. The addresses can't
9968 * change since we hold a reference to the ipif.
9970 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0)
9971 return (EADDRNOTAVAIL);
9973 if (ipif->ipif_isv6) {
9974 ASSERT(ipip->ipi_cmd_type == LIF_CMD);
9975 *sin6 = sin6_null;
9976 sin6->sin6_family = AF_INET6;
9977 sin6->sin6_addr = ipif->ipif_v6pp_dst_addr;
9978 } else {
9979 *sin = sin_null;
9980 sin->sin_family = AF_INET;
9981 sin->sin_addr.s_addr = ipif->ipif_pp_dst_addr;
9983 return (0);
9987 * Check which flags will change when the given flags are set;
9988 * silently ignore flags which userland is not allowed to control.
9989 * (Because these flags may change between SIOCGLIFFLAGS and
9990 * SIOCSLIFFLAGS, and that's outside of userland's control,
9991 * we need to silently ignore them rather than fail.)
9993 static void
9994 ip_sioctl_flags_onoff(ipif_t *ipif, uint64_t flags, uint64_t *onp,
9995 uint64_t *offp)
9997 ill_t *ill = ipif->ipif_ill;
9998 phyint_t *phyi = ill->ill_phyint;
9999 uint64_t cantchange_flags, intf_flags;
10000 uint64_t turn_on, turn_off;
10002 intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags;
10003 cantchange_flags = IFF_CANTCHANGE;
10004 if (IS_IPMP(ill))
10005 cantchange_flags |= IFF_IPMP_CANTCHANGE;
10006 turn_on = (flags ^ intf_flags) & ~cantchange_flags;
10007 turn_off = intf_flags & turn_on;
10008 turn_on ^= turn_off;
10009 *onp = turn_on;
10010 *offp = turn_off;
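/*
 * A worked example of the partitioning in ip_sioctl_flags_onoff() above,
 * assuming no unchangeable bits: with intf_flags = 0b1010 and
 * flags = 0b0110,
 *
 *	flags ^ intf_flags         = 0b1100   (all changing bits)
 *	turn_off = 0b1010 & 0b1100 = 0b1000   (changing bits currently set)
 *	turn_on  = 0b1100 ^ 0b1000 = 0b0100   (changing bits currently clear)
 *
 * so bit 0b0100 is to be turned on and bit 0b1000 turned off.
 */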
10014 * Set interface flags. Many flags require special handling (e.g.,
10015 * bringing the interface down); see below for details.
10017 * NOTE : We really don't enforce that ipif_id zero should be used
10018 * for setting any flags other than IFF_LOGINT_FLAGS. This
10019 * is because applications generally do a SIOCGLIFFLAGS,
10020 * OR in the new flags (which affect the logical interface), and
10021 * then do a SIOCSLIFFLAGS. Thus, "flags" below could contain bits
10022 * other than IFF_LOGINT_FLAGS. One could check whether "turn_on"
10023 * (the flags that will be turned on) is correct with respect to
10024 * ipif_id 0. For backward compatibility reasons, that is not done.
10026 /* ARGSUSED */
10028 ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10029 ip_ioctl_cmd_t *ipip, void *if_req)
10031 uint64_t turn_on;
10032 uint64_t turn_off;
10033 int err = 0;
10034 phyint_t *phyi;
10035 ill_t *ill;
10036 conn_t *connp;
10037 uint64_t intf_flags;
10038 boolean_t phyint_flags_modified = B_FALSE;
10039 uint64_t flags;
10040 struct ifreq *ifr;
10041 struct lifreq *lifr;
10042 boolean_t set_linklocal = B_FALSE;
10044 ip1dbg(("ip_sioctl_flags(%s:%u %p)\n",
10045 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10047 ASSERT(IAM_WRITER_IPIF(ipif));
10049 ill = ipif->ipif_ill;
10050 phyi = ill->ill_phyint;
10052 if (ipip->ipi_cmd_type == IF_CMD) {
10053 ifr = (struct ifreq *)if_req;
10054 flags = (uint64_t)(ifr->ifr_flags & 0x0000ffff);
10055 } else {
10056 lifr = (struct lifreq *)if_req;
10057 flags = lifr->lifr_flags;
10060 intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags;
10063 * Have the flags been set correctly until now?
10065 ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0);
10066 ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0);
10067 ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0);
10069 * Compare the new flags to the old, and partition
10070 * into those coming on and those going off.
10071 * For the 16-bit command, keep the bits above bit 16 unchanged.
10073 if (ipip->ipi_cmd == SIOCSIFFLAGS)
10074 flags |= intf_flags & ~0xFFFF;
10077 * Explicitly fail attempts to change flags that are always invalid on
10078 * an IPMP meta-interface.
10080 if (IS_IPMP(ill) && ((flags ^ intf_flags) & IFF_IPMP_INVALID))
10081 return (EINVAL);
10083 ip_sioctl_flags_onoff(ipif, flags, &turn_on, &turn_off);
10084 if ((turn_on|turn_off) == 0)
10085 return (0); /* No change */
10088 * All test addresses must be IFF_DEPRECATED (to ensure source address
10089 * selection avoids them) -- so force IFF_DEPRECATED on, and do not
10090 * allow it to be turned off.
10092 if ((turn_off & (IFF_DEPRECATED|IFF_NOFAILOVER)) == IFF_DEPRECATED &&
10093 (turn_on|intf_flags) & IFF_NOFAILOVER)
10094 return (EINVAL);
10096 if ((connp = Q_TO_CONN(q)) == NULL)
10097 return (EINVAL);
10100 * Only vrrp control socket is allowed to change IFF_UP and
10101 * IFF_NOACCEPT flags when IFF_VRRP is set.
10103 if ((intf_flags & IFF_VRRP) && ((turn_off | turn_on) & IFF_UP)) {
10104 if (!connp->conn_isvrrp)
10105 return (EINVAL);
10109 * The IFF_NOACCEPT flag can only be set on an IFF_VRRP IP address by
10110 * VRRP control socket.
10112 if ((turn_off | turn_on) & IFF_NOACCEPT) {
10113 if (!connp->conn_isvrrp || !(intf_flags & IFF_VRRP))
10114 return (EINVAL);
10117 if (turn_on & IFF_NOFAILOVER) {
10118 turn_on |= IFF_DEPRECATED;
10119 flags |= IFF_DEPRECATED;
10123 * On underlying interfaces, only allow applications to manage test
10124 * addresses -- otherwise, they may get confused when the address
10125 * moves as part of being brought up. Likewise, prevent an
10126 * application-managed test address from being converted to a data
10127 * address. To prevent migration of administratively up addresses in
10128 * the kernel, we don't allow them to be converted either.
10130 if (IS_UNDER_IPMP(ill)) {
10131 const uint64_t appflags = IFF_DHCPRUNNING | IFF_ADDRCONF;
10133 if ((turn_on & appflags) && !(flags & IFF_NOFAILOVER))
10134 return (EINVAL);
10136 if ((turn_off & IFF_NOFAILOVER) &&
10137 (flags & (appflags | IFF_UP | IFF_DUPLICATE)))
10138 return (EINVAL);
10142 * Only allow IFF_TEMPORARY flag to be set on
10143 * IPv6 interfaces.
10145 if ((turn_on & IFF_TEMPORARY) && !(ipif->ipif_isv6))
10146 return (EINVAL);
10149 * Cannot turn off IFF_NOXMIT on VNI interfaces.
10151 if ((turn_off & IFF_NOXMIT) && IS_VNI(ipif->ipif_ill))
10152 return (EINVAL);
10155 * Don't allow the IFF_ROUTER flag to be turned on for loopback
10156 * interfaces. It makes no sense in that context.
10158 if ((turn_on & IFF_ROUTER) && (phyi->phyint_flags & PHYI_LOOPBACK))
10159 return (EINVAL);
10162 * For IPv6 ipif_id 0, don't allow the interface to be up without
10163 * a link local address if IFF_NOLOCAL or IFF_ANYCAST are not set.
10164 * If the link local address isn't set, and can be set, it will get
10165 * set later on in this function.
10167 if (ipif->ipif_id == 0 && ipif->ipif_isv6 &&
10168 (flags & IFF_UP) && !(flags & (IFF_NOLOCAL|IFF_ANYCAST)) &&
10169 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) {
10170 if (ipif_cant_setlinklocal(ipif))
10171 return (EINVAL);
10172 set_linklocal = B_TRUE;
10176 * If we modify physical interface flags, we'll potentially need to
10177 * send up two routing socket messages for the changes (one for the
10178 * IPv4 ill, and another for the IPv6 ill). Record that fact here.
10180 if ((turn_on|turn_off) & IFF_PHYINT_FLAGS)
10181 phyint_flags_modified = B_TRUE;
10184 * All functioning PHYI_STANDBY interfaces start life PHYI_INACTIVE
10185 * (otherwise, we'd immediately use them, defeating standby). Also,
10186 * since PHYI_INACTIVE has a separate meaning when PHYI_STANDBY is not
10187 * set, don't allow PHYI_STANDBY to be set if PHYI_INACTIVE is already
10188 * set, and clear PHYI_INACTIVE if PHYI_STANDBY is being cleared. We
10189 * also don't allow PHYI_STANDBY if VNI is enabled since its semantics
10190 * will not be honored.
10192 if (turn_on & PHYI_STANDBY) {
10194 * No need to grab ill_g_usesrc_lock here; see the
10195 * synchronization notes in ip.c.
10197 if (ill->ill_usesrc_grp_next != NULL ||
10198 intf_flags & PHYI_INACTIVE)
10199 return (EINVAL);
10200 if (!(flags & PHYI_FAILED)) {
10201 flags |= PHYI_INACTIVE;
10202 turn_on |= PHYI_INACTIVE;
10206 if (turn_off & PHYI_STANDBY) {
10207 flags &= ~PHYI_INACTIVE;
10208 turn_off |= PHYI_INACTIVE;
10212 * PHYI_FAILED and PHYI_INACTIVE are mutually exclusive; fail if both
10213 * would end up on.
10215 if ((flags & (PHYI_FAILED | PHYI_INACTIVE)) ==
10216 (PHYI_FAILED | PHYI_INACTIVE))
10217 return (EINVAL);
10220 * If ILLF_ROUTER changes, we need to change the ip forwarding
10221 * status of the interface.
10223 if ((turn_on | turn_off) & ILLF_ROUTER) {
10224 err = ill_forward_set(ill, ((turn_on & ILLF_ROUTER) != 0));
10225 if (err != 0)
10226 return (err);
10230 * If the interface is not UP and we are not going to
10231 * bring it UP, record the flags and return. When the
10232 * interface comes UP later, the right actions will be
10233 * taken.
10235 if (!(ipif->ipif_flags & IPIF_UP) &&
10236 !(turn_on & IPIF_UP)) {
10237 /* Record new flags in their respective places. */
10238 mutex_enter(&ill->ill_lock);
10239 mutex_enter(&ill->ill_phyint->phyint_lock);
10240 ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS);
10241 ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS);
10242 ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS);
10243 ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS);
10244 phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS);
10245 phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS);
10246 mutex_exit(&ill->ill_lock);
10247 mutex_exit(&ill->ill_phyint->phyint_lock);
10250 * PHYI_FAILED, PHYI_INACTIVE, and PHYI_OFFLINE are all the
10251 * same to the kernel: if any of them has been set by
10252 * userland, the interface cannot be used for data traffic.
10254 if ((turn_on|turn_off) &
10255 (PHYI_FAILED | PHYI_INACTIVE | PHYI_OFFLINE)) {
10256 ASSERT(!IS_IPMP(ill));
10258 * It's possible the ill is part of an "anonymous"
10259 * IPMP group rather than a real group. In that case,
10260 * there are no other interfaces in the group and thus
10261 * no need to call ipmp_phyint_refresh_active().
10263 if (IS_UNDER_IPMP(ill))
10264 ipmp_phyint_refresh_active(phyi);
10267 if (phyint_flags_modified) {
10268 if (phyi->phyint_illv4 != NULL) {
10269 ip_rts_ifmsg(phyi->phyint_illv4->
10270 ill_ipif, RTSQ_DEFAULT);
10272 if (phyi->phyint_illv6 != NULL) {
10273 ip_rts_ifmsg(phyi->phyint_illv6->
10274 ill_ipif, RTSQ_DEFAULT);
10277 /* The default multicast interface might have changed */
10278 ire_increment_multicast_generation(ill->ill_ipst,
10279 ill->ill_isv6);
10281 return (0);
10282 } else if (set_linklocal) {
10283 mutex_enter(&ill->ill_lock);
10284 if (set_linklocal)
10285 ipif->ipif_state_flags |= IPIF_SET_LINKLOCAL;
10286 mutex_exit(&ill->ill_lock);
10290 * Disallow bringing up IPv6 interfaces that have the unspecified
10291 * address, or point-to-point interfaces with an unspecified destination. We do
10292 * allow the address to be unspecified for IPIF_NOLOCAL interfaces that
10293 * have a subnet assigned, which is how in.ndpd currently manages its
10294 * onlink prefix list when no addresses are configured with those
10295 * prefixes.
10297 if (ipif->ipif_isv6 &&
10298 ((IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) &&
10299 (!(ipif->ipif_flags & IPIF_NOLOCAL) && !(turn_on & IPIF_NOLOCAL) ||
10300 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) ||
10301 ((ipif->ipif_flags & IPIF_POINTOPOINT) &&
10302 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6pp_dst_addr)))) {
10303 return (EINVAL);
10307 * Prevent IPv4 point-to-point interfaces with a 0.0.0.0 destination
10308 * from being brought up.
10310 if (!ipif->ipif_isv6 &&
10311 ((ipif->ipif_flags & IPIF_POINTOPOINT) &&
10312 ipif->ipif_pp_dst_addr == INADDR_ANY)) {
10313 return (EINVAL);
10317 * If we are going to change one or more of the flags that are
10318 * IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT, IPIF_NOLOCAL, ILLF_NOARP,
10319 * ILLF_NONUD, IPIF_PRIVATE, IPIF_ANYCAST, IPIF_PREFERRED, and
10320 * IPIF_NOFAILOVER, we will take special action. This is
10321 * done by bringing the ipif down, changing the flags, and bringing
10322 * it back up again. For IPIF_NOFAILOVER, the act of bringing it
10323 * back up will trigger the address to be moved.
10325 * If we are going to change IFF_NOACCEPT, we need to bring
10326 * all the ipifs down then bring them up again. The act of
10327 * bringing all the ipifs back up will trigger the local
10328 * IREs being recreated with "no_accept" set or cleared.
10330 * Note that ILLF_NOACCEPT is always set separately from the
10331 * other flags.
10333 if ((turn_on|turn_off) &
10334 (IPIF_UP|IPIF_DEPRECATED|IPIF_NOXMIT|IPIF_NOLOCAL|ILLF_NOARP|
10335 ILLF_NONUD|IPIF_PRIVATE|IPIF_ANYCAST|IPIF_PREFERRED|
10336 IPIF_NOFAILOVER)) {
10338 * ipif_down() will ire_delete bcast ire's for the subnet,
10339 * while the ire_identical_ref tracks the case of IRE_BROADCAST
10340 * entries shared between multiple ipifs on the same subnet.
10342 if (((ipif->ipif_flags | turn_on) & IPIF_UP) &&
10343 !(turn_off & IPIF_UP)) {
10344 if (ipif->ipif_flags & IPIF_UP)
10345 ill->ill_logical_down = 1;
10346 turn_on &= ~IPIF_UP;
10348 err = ipif_down(ipif, q, mp);
10349 ip1dbg(("ipif_down returns %d err ", err));
10350 if (err == EINPROGRESS)
10351 return (err);
10352 (void) ipif_down_tail(ipif);
10353 } else if ((turn_on|turn_off) & ILLF_NOACCEPT) {
10355 * If we can quiesce the ill, then continue. If not, then
10356 * ip_sioctl_flags_tail() will be called from
10357 * ipif_ill_refrele_tail().
10359 ill_down_ipifs(ill, B_TRUE);
10361 mutex_enter(&connp->conn_lock);
10362 mutex_enter(&ill->ill_lock);
10363 if (!ill_is_quiescent(ill)) {
10364 boolean_t success;
10366 success = ipsq_pending_mp_add(connp, ill->ill_ipif,
10367 q, mp, ILL_DOWN);
10368 mutex_exit(&ill->ill_lock);
10369 mutex_exit(&connp->conn_lock);
10370 return (success ? EINPROGRESS : EINTR);
10372 mutex_exit(&ill->ill_lock);
10373 mutex_exit(&connp->conn_lock);
10375 return (ip_sioctl_flags_tail(ipif, flags, q, mp));
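/*
 * The NOTE above refers to the usual userland read-modify-write pattern;
 * a minimal sketch, assuming a hypothetical interface name "net0" and an
 * already-open socket s:
 *
 *	struct lifreq lifr;
 *
 *	(void) strlcpy(lifr.lifr_name, "net0", sizeof (lifr.lifr_name));
 *	(void) ioctl(s, SIOCGLIFFLAGS, &lifr);
 *	lifr.lifr_flags |= IFF_UP;
 *	(void) ioctl(s, SIOCSLIFFLAGS, &lifr);
 */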
10378 static int
10379 ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp)
10381 ill_t *ill;
10382 phyint_t *phyi;
10383 uint64_t turn_on, turn_off;
10384 boolean_t phyint_flags_modified = B_FALSE;
10385 int err = 0;
10386 boolean_t set_linklocal = B_FALSE;
10388 ip1dbg(("ip_sioctl_flags_tail(%s:%u)\n",
10389 ipif->ipif_ill->ill_name, ipif->ipif_id));
10391 ASSERT(IAM_WRITER_IPIF(ipif));
10393 ill = ipif->ipif_ill;
10394 phyi = ill->ill_phyint;
10396 ip_sioctl_flags_onoff(ipif, flags, &turn_on, &turn_off);
10399 * IFF_UP is handled separately.
10401 turn_on &= ~IFF_UP;
10402 turn_off &= ~IFF_UP;
10404 if ((turn_on|turn_off) & IFF_PHYINT_FLAGS)
10405 phyint_flags_modified = B_TRUE;
10408 * Now we change the flags. Track current value of
10409 * other flags in their respective places.
10411 mutex_enter(&ill->ill_lock);
10412 mutex_enter(&phyi->phyint_lock);
10413 ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS);
10414 ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS);
10415 ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS);
10416 ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS);
10417 phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS);
10418 phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS);
10419 if (ipif->ipif_state_flags & IPIF_SET_LINKLOCAL) {
10420 set_linklocal = B_TRUE;
10421 ipif->ipif_state_flags &= ~IPIF_SET_LINKLOCAL;
10424 mutex_exit(&ill->ill_lock);
10425 mutex_exit(&phyi->phyint_lock);
10427 if (set_linklocal)
10428 (void) ipif_setlinklocal(ipif);
10431 * PHYI_FAILED, PHYI_INACTIVE, and PHYI_OFFLINE are all the same to
10432 * the kernel: if any of them has been set by userland, the interface
10433 * cannot be used for data traffic.
10435 if ((turn_on|turn_off) & (PHYI_FAILED | PHYI_INACTIVE | PHYI_OFFLINE)) {
10436 ASSERT(!IS_IPMP(ill));
10438 * It's possible the ill is part of an "anonymous" IPMP group
10439 * rather than a real group. In that case, there are no other
10440 * interfaces in the group and thus no need for us to call
10441 * ipmp_phyint_refresh_active().
10443 if (IS_UNDER_IPMP(ill))
10444 ipmp_phyint_refresh_active(phyi);
10447 if ((turn_on|turn_off) & ILLF_NOACCEPT) {
10449 * If the ILLF_NOACCEPT flag is changed, bring up all the
10450 * ipifs that were brought down.
10452 * The routing socket messages are sent as a result
10453 * of ill_up_ipifs(); SCTP's IPIF list is updated
10454 * as well.
10456 err = ill_up_ipifs(ill, q, mp);
10457 } else if ((flags & IFF_UP) && !(ipif->ipif_flags & IPIF_UP)) {
10459 * XXX ipif_up really does not know whether a phyint flag
10460 * was modified or not. So it sends up information in
10461 * only one routing socket message. As we don't bring up
10462 * the interface and also set PHYI_ flags simultaneously,
10463 * it should be okay.
10465 err = ipif_up(ipif, q, mp);
10466 } else {
10468 * Make sure routing socket sees all changes to the flags.
10469 * ipif_up_done* handles this when we use ipif_up.
10471 if (phyint_flags_modified) {
10472 if (phyi->phyint_illv4 != NULL) {
10473 ip_rts_ifmsg(phyi->phyint_illv4->
10474 ill_ipif, RTSQ_DEFAULT);
10476 if (phyi->phyint_illv6 != NULL) {
10477 ip_rts_ifmsg(phyi->phyint_illv6->
10478 ill_ipif, RTSQ_DEFAULT);
10480 } else {
10481 ip_rts_ifmsg(ipif, RTSQ_DEFAULT);
10484 * Update the flags in SCTP's IPIF list; ipif_up() will do
10485 * this in the need_up case.
10487 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE);
10490 /* The default multicast interface might have changed */
10491 ire_increment_multicast_generation(ill->ill_ipst, ill->ill_isv6);
10492 return (err);
10496 * Restart the flags operation now that the refcounts have dropped to zero.
10498 /* ARGSUSED */
10500 ip_sioctl_flags_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10501 ip_ioctl_cmd_t *ipip, void *if_req)
10503 uint64_t flags;
10504 struct ifreq *ifr = if_req;
10505 struct lifreq *lifr = if_req;
10506 uint64_t turn_on, turn_off;
10508 ip1dbg(("ip_sioctl_flags_restart(%s:%u %p)\n",
10509 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10511 if (ipip->ipi_cmd_type == IF_CMD) {
10512 /* cast to uint16_t prevents unwanted sign extension */
10513 flags = (uint16_t)ifr->ifr_flags;
10514 } else {
10515 flags = lifr->lifr_flags;
10519 * If this function call is a result of the ILLF_NOACCEPT flag
10520 * change, do not call ipif_down_tail(). See ip_sioctl_flags().
10522 ip_sioctl_flags_onoff(ipif, flags, &turn_on, &turn_off);
10523 if (!((turn_on|turn_off) & ILLF_NOACCEPT))
10524 (void) ipif_down_tail(ipif);
10526 return (ip_sioctl_flags_tail(ipif, flags, q, mp));
10530 * Can operate on either a module or a driver queue.
10532 /* ARGSUSED */
10534 ip_sioctl_get_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10535 ip_ioctl_cmd_t *ipip, void *if_req)
10538 * Have the flags been set correctly until now?
10540 ill_t *ill = ipif->ipif_ill;
10541 phyint_t *phyi = ill->ill_phyint;
10543 ip1dbg(("ip_sioctl_get_flags(%s:%u %p)\n",
10544 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10545 ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0);
10546 ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0);
10547 ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0);
10550 * Need a lock since some flags can be set even when there are
10551 * references to the ipif.
10553 mutex_enter(&ill->ill_lock);
10554 if (ipip->ipi_cmd_type == IF_CMD) {
10555 struct ifreq *ifr = (struct ifreq *)if_req;
10557 /* Get interface flags (low 16 only). */
10558 ifr->ifr_flags = ((ipif->ipif_flags |
10559 ill->ill_flags | phyi->phyint_flags) & 0xffff);
10560 } else {
10561 struct lifreq *lifr = (struct lifreq *)if_req;
10563 /* Get interface flags. */
10564 lifr->lifr_flags = ipif->ipif_flags |
10565 ill->ill_flags | phyi->phyint_flags;
10567 mutex_exit(&ill->ill_lock);
10568 return (0);
10572 * We allow the MTU to be set on an ILL, but not have it be different
10573 * for different IPIFs since we don't actually send packets on IPIFs.
10575 /* ARGSUSED */
10577 ip_sioctl_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10578 ip_ioctl_cmd_t *ipip, void *if_req)
10580 int mtu;
10581 int ip_min_mtu;
10582 struct ifreq *ifr;
10583 struct lifreq *lifr;
10584 ill_t *ill;
10586 ip1dbg(("ip_sioctl_mtu(%s:%u %p)\n", ipif->ipif_ill->ill_name,
10587 ipif->ipif_id, (void *)ipif));
10588 if (ipip->ipi_cmd_type == IF_CMD) {
10589 ifr = (struct ifreq *)if_req;
10590 mtu = ifr->ifr_metric;
10591 } else {
10592 lifr = (struct lifreq *)if_req;
10593 mtu = lifr->lifr_mtu;
10595 /* Only allow for logical unit zero i.e. not on "bge0:17" */
10596 if (ipif->ipif_id != 0)
10597 return (EINVAL);
10599 ill = ipif->ipif_ill;
10600 if (ipif->ipif_isv6)
10601 ip_min_mtu = IPV6_MIN_MTU;
10602 else
10603 ip_min_mtu = IP_MIN_MTU;
10605 mutex_enter(&ill->ill_lock);
10606 if (mtu > ill->ill_max_frag || mtu < ip_min_mtu) {
10607 mutex_exit(&ill->ill_lock);
10608 return (EINVAL);
10610 /* Avoid increasing ill_mc_mtu */
10611 if (ill->ill_mc_mtu > mtu)
10612 ill->ill_mc_mtu = mtu;
10615 * The dce and fragmentation code can handle changes to ill_mtu
10616 * concurrent with sending/fragmenting packets.
10618 ill->ill_mtu = mtu;
10619 ill->ill_flags |= ILLF_FIXEDMTU;
10620 mutex_exit(&ill->ill_lock);
10623 * Make sure all dce_generation checks find out
10624 * that ill_mtu/ill_mc_mtu has changed.
10626 dce_increment_all_generations(ill->ill_isv6, ill->ill_ipst);
10629 * Refresh IPMP meta-interface MTU if necessary.
10631 if (IS_UNDER_IPMP(ill))
10632 ipmp_illgrp_refresh_mtu(ill->ill_grp);
10634 /* Update the MTU in SCTP's list */
10635 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE);
10636 return (0);
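/*
 * A minimal userland sketch of the corresponding set, assuming a
 * hypothetical interface name "net0" (which must name logical unit zero,
 * as enforced above):
 *
 *	struct lifreq lifr;
 *
 *	(void) strlcpy(lifr.lifr_name, "net0", sizeof (lifr.lifr_name));
 *	lifr.lifr_mtu = 1400;
 *	if (ioctl(s, SIOCSLIFMTU, &lifr) == -1)
 *		... fails with EINVAL if 1400 lies outside
 *		[ip_min_mtu, ill_max_frag] ...
 */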
10639 /* Get interface MTU. */
10640 /* ARGSUSED */
10642 ip_sioctl_get_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10643 ip_ioctl_cmd_t *ipip, void *if_req)
10645 struct ifreq *ifr;
10646 struct lifreq *lifr;
10648 ip1dbg(("ip_sioctl_get_mtu(%s:%u %p)\n",
10649 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10652 * We allow a get on any logical interface even though the set
10653 * can only be done on logical unit 0.
10655 if (ipip->ipi_cmd_type == IF_CMD) {
10656 ifr = (struct ifreq *)if_req;
10657 ifr->ifr_metric = ipif->ipif_ill->ill_mtu;
10658 } else {
10659 lifr = (struct lifreq *)if_req;
10660 lifr->lifr_mtu = ipif->ipif_ill->ill_mtu;
10662 return (0);
10665 /* Set interface broadcast address. */
10666 /* ARGSUSED2 */
10668 ip_sioctl_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10669 ip_ioctl_cmd_t *ipip, void *if_req)
10671 ipaddr_t addr;
10672 ire_t *ire;
10673 ill_t *ill = ipif->ipif_ill;
10674 ip_stack_t *ipst = ill->ill_ipst;
10676 ip1dbg(("ip_sioctl_brdaddr(%s:%u)\n", ill->ill_name,
10677 ipif->ipif_id));
10679 ASSERT(IAM_WRITER_IPIF(ipif));
10680 if (!(ipif->ipif_flags & IPIF_BROADCAST))
10681 return (EADDRNOTAVAIL);
10683 ASSERT(!(ipif->ipif_isv6)); /* No IPv6 broadcast */
10685 if (sin->sin_family != AF_INET)
10686 return (EAFNOSUPPORT);
10688 addr = sin->sin_addr.s_addr;
10690 if (ipif->ipif_flags & IPIF_UP) {
10692 * If we are already up, make sure the new
10693 * broadcast address makes sense. If it does,
10694 * there should be an IRE for it already.
10696 ire = ire_ftable_lookup_v4(addr, 0, 0, IRE_BROADCAST,
10697 ill, ipif->ipif_zoneid,
10698 (MATCH_IRE_ILL | MATCH_IRE_TYPE), 0, ipst, NULL);
10699 if (ire == NULL) {
10700 return (EINVAL);
10701 } else {
10702 ire_refrele(ire);
10706 * Changing the broadcast addr for this ipif. Since the IRE_BROADCAST
10707 * needs to already exist, we never need to change the set of
10708 * IRE_BROADCASTs when we are UP.
10710 if (addr != ipif->ipif_brd_addr)
10711 IN6_IPADDR_TO_V4MAPPED(addr, &ipif->ipif_v6brd_addr);
10713 return (0);
10716 /* Get interface broadcast address. */
10717 /* ARGSUSED */
10719 ip_sioctl_get_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10720 ip_ioctl_cmd_t *ipip, void *if_req)
10722 ip1dbg(("ip_sioctl_get_brdaddr(%s:%u %p)\n",
10723 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10724 if (!(ipif->ipif_flags & IPIF_BROADCAST))
10725 return (EADDRNOTAVAIL);
10727 /* IPIF_BROADCAST not possible with IPv6 */
10728 ASSERT(!ipif->ipif_isv6);
10729 *sin = sin_null;
10730 sin->sin_family = AF_INET;
10731 sin->sin_addr.s_addr = ipif->ipif_brd_addr;
10732 return (0);
10736 * This routine is called to handle the SIOCS*IFNETMASK IOCTL.
10738 /* ARGSUSED */
10740 ip_sioctl_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10741 ip_ioctl_cmd_t *ipip, void *if_req)
10743 int err = 0;
10744 in6_addr_t v6mask;
10746 ip1dbg(("ip_sioctl_netmask(%s:%u %p)\n",
10747 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10749 ASSERT(IAM_WRITER_IPIF(ipif));
10751 if (ipif->ipif_isv6) {
10752 sin6_t *sin6;
10754 if (sin->sin_family != AF_INET6)
10755 return (EAFNOSUPPORT);
10757 sin6 = (sin6_t *)sin;
10758 v6mask = sin6->sin6_addr;
10759 } else {
10760 ipaddr_t mask;
10762 if (sin->sin_family != AF_INET)
10763 return (EAFNOSUPPORT);
10765 mask = sin->sin_addr.s_addr;
10766 if (!ip_contiguous_mask(ntohl(mask)))
10767 return (ENOTSUP);
10768 V4MASK_TO_V6(mask, v6mask);
10772 * No big deal if the interface isn't already up, or the mask
10773 * isn't really changing, or this is pt-pt.
10775 if (!(ipif->ipif_flags & IPIF_UP) ||
10776 IN6_ARE_ADDR_EQUAL(&v6mask, &ipif->ipif_v6net_mask) ||
10777 (ipif->ipif_flags & IPIF_POINTOPOINT)) {
10778 ipif->ipif_v6net_mask = v6mask;
10779 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) {
10780 V6_MASK_COPY(ipif->ipif_v6lcl_addr,
10781 ipif->ipif_v6net_mask,
10782 ipif->ipif_v6subnet);
10784 return (0);
10787 * Make sure we have valid net and subnet broadcast ire's
10788 * for the old netmask, if needed by other logical interfaces.
10790 err = ipif_logical_down(ipif, q, mp);
10791 if (err == EINPROGRESS)
10792 return (err);
10793 (void) ipif_down_tail(ipif);
10794 err = ip_sioctl_netmask_tail(ipif, sin, q, mp);
10795 return (err);
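/*
 * A contiguous (CIDR) mask has all of its one-bits leading: e.g.,
 * 255.255.255.0 (0xffffff00) passes the ip_contiguous_mask() check above,
 * while 255.0.255.0 (0xff00ff00) fails it with ENOTSUP. A minimal sketch
 * of such a check, assuming a host-byte-order mask:
 *
 *	boolean_t
 *	mask_is_contiguous(uint32_t mask)
 *	{
 *		return ((mask | (mask - 1)) == 0xffffffff);
 *	}
 */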
10798 static int
10799 ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp)
10801 in6_addr_t v6mask;
10802 int err = 0;
10804 ip1dbg(("ip_sioctl_netmask_tail(%s:%u %p)\n",
10805 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10807 if (ipif->ipif_isv6) {
10808 sin6_t *sin6;
10810 sin6 = (sin6_t *)sin;
10811 v6mask = sin6->sin6_addr;
10812 } else {
10813 ipaddr_t mask;
10815 mask = sin->sin_addr.s_addr;
10816 V4MASK_TO_V6(mask, v6mask);
10819 ipif->ipif_v6net_mask = v6mask;
10820 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) {
10821 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
10822 ipif->ipif_v6subnet);
10824 err = ipif_up(ipif, q, mp);
10826 if (err == 0 || err == EINPROGRESS) {
10828 * The interface must be DL_BOUND if this packet has to
10829 * go out on the wire. Since we only go through a logical
10830 * down and remain bound to the driver during an internal
10831 * down/up, that requirement is satisfied.
10833 if (!ipif->ipif_isv6 && ipif->ipif_ill->ill_wq != NULL) {
10834 /* Potentially broadcast an address mask reply. */
10835 ipif_mask_reply(ipif);
10838 return (err);
10841 /* ARGSUSED */
10843 ip_sioctl_netmask_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10844 ip_ioctl_cmd_t *ipip, void *if_req)
10846 ip1dbg(("ip_sioctl_netmask_restart(%s:%u %p)\n",
10847 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10848 (void) ipif_down_tail(ipif);
10849 return (ip_sioctl_netmask_tail(ipif, sin, q, mp));
10852 /* Get interface net mask. */
10853 /* ARGSUSED */
10855 ip_sioctl_get_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10856 ip_ioctl_cmd_t *ipip, void *if_req)
10858 struct lifreq *lifr = (struct lifreq *)if_req;
10859 struct sockaddr_in6 *sin6 = (sin6_t *)sin;
10861 ip1dbg(("ip_sioctl_get_netmask(%s:%u %p)\n",
10862 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10865 * The netmask can't change since we have a reference to the ipif.
10867 if (ipif->ipif_isv6) {
10868 ASSERT(ipip->ipi_cmd_type == LIF_CMD);
10869 *sin6 = sin6_null;
10870 sin6->sin6_family = AF_INET6;
10871 sin6->sin6_addr = ipif->ipif_v6net_mask;
10872 lifr->lifr_addrlen =
10873 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask);
10874 } else {
10875 *sin = sin_null;
10876 sin->sin_family = AF_INET;
10877 sin->sin_addr.s_addr = ipif->ipif_net_mask;
10878 if (ipip->ipi_cmd_type == LIF_CMD) {
10879 lifr->lifr_addrlen =
10880 ip_mask_to_plen(ipif->ipif_net_mask);
10883 return (0);
10886 /* ARGSUSED */
10888 ip_sioctl_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10889 ip_ioctl_cmd_t *ipip, void *if_req)
10891 ip1dbg(("ip_sioctl_metric(%s:%u %p)\n",
10892 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10895 * Since no applications should ever be setting metrics on underlying
10896 * interfaces, we explicitly fail to smoke 'em out.
10898 if (IS_UNDER_IPMP(ipif->ipif_ill))
10899 return (EINVAL);
10902 * Set interface metric. We don't use this for
10903 * anything, but we keep track of it in case it is
10904 * important to routing applications or such.
10906 if (ipip->ipi_cmd_type == IF_CMD) {
10907 struct ifreq *ifr;
10909 ifr = (struct ifreq *)if_req;
10910 ipif->ipif_ill->ill_metric = ifr->ifr_metric;
10911 } else {
10912 struct lifreq *lifr;
10914 lifr = (struct lifreq *)if_req;
10915 ipif->ipif_ill->ill_metric = lifr->lifr_metric;
10917 return (0);
10920 /* ARGSUSED */
10922 ip_sioctl_get_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10923 ip_ioctl_cmd_t *ipip, void *if_req)
10925 /* Get interface metric. */
10926 ip1dbg(("ip_sioctl_get_metric(%s:%u %p)\n",
10927 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10929 if (ipip->ipi_cmd_type == IF_CMD) {
10930 struct ifreq *ifr;
10932 ifr = (struct ifreq *)if_req;
10933 ifr->ifr_metric = ipif->ipif_ill->ill_metric;
10934 } else {
10935 struct lifreq *lifr;
10937 lifr = (struct lifreq *)if_req;
10938 lifr->lifr_metric = ipif->ipif_ill->ill_metric;
10941 return (0);
10944 /* ARGSUSED */
10946 ip_sioctl_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10947 ip_ioctl_cmd_t *ipip, void *if_req)
10949 int arp_muxid;
10951 ip1dbg(("ip_sioctl_muxid(%s:%u %p)\n",
10952 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10954 * Set the muxid returned from I_PLINK.
10956 if (ipip->ipi_cmd_type == IF_CMD) {
10957 struct ifreq *ifr = (struct ifreq *)if_req;
10959 ipif->ipif_ill->ill_muxid = ifr->ifr_ip_muxid;
10960 arp_muxid = ifr->ifr_arp_muxid;
10961 } else {
10962 struct lifreq *lifr = (struct lifreq *)if_req;
10964 ipif->ipif_ill->ill_muxid = lifr->lifr_ip_muxid;
10965 arp_muxid = lifr->lifr_arp_muxid;
10967 arl_set_muxid(ipif->ipif_ill, arp_muxid);
10968 return (0);
10971 /* ARGSUSED */
10973 ip_sioctl_get_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10974 ip_ioctl_cmd_t *ipip, void *if_req)
10976 int arp_muxid = 0;
10978 ip1dbg(("ip_sioctl_get_muxid(%s:%u %p)\n",
10979 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10981 * Get the muxid saved in ill for I_PUNLINK.
10983 arp_muxid = arl_get_muxid(ipif->ipif_ill);
10984 if (ipip->ipi_cmd_type == IF_CMD) {
10985 struct ifreq *ifr = (struct ifreq *)if_req;
10987 ifr->ifr_ip_muxid = ipif->ipif_ill->ill_muxid;
10988 ifr->ifr_arp_muxid = arp_muxid;
10989 } else {
10990 struct lifreq *lifr = (struct lifreq *)if_req;
10992 lifr->lifr_ip_muxid = ipif->ipif_ill->ill_muxid;
10993 lifr->lifr_arp_muxid = arp_muxid;
10995 return (0);
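/*
 * These muxids come from the STREAMS persistent-link plumbing done by
 * ifconfig; roughly (a simplified sketch, with assumed device paths):
 *
 *	int ip_fd = open("/dev/udp", O_RDWR);
 *	int dev_fd = open("/dev/net/net0", O_RDWR);
 *	int muxid = ioctl(ip_fd, I_PLINK, dev_fd);
 *
 * followed by SIOCSLIFMUXID to record the muxid here, so that a later
 * unplumb can fetch it via SIOCGLIFMUXID and issue I_PUNLINK.
 */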
10999 * Set the subnet prefix. Does not modify the broadcast address.
11001 /* ARGSUSED */
11003 ip_sioctl_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11004 ip_ioctl_cmd_t *ipip, void *if_req)
11006 int err = 0;
11007 in6_addr_t v6addr;
11008 in6_addr_t v6mask;
11009 boolean_t need_up = B_FALSE;
11010 int addrlen;
11012 ip1dbg(("ip_sioctl_subnet(%s:%u %p)\n",
11013 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11015 ASSERT(IAM_WRITER_IPIF(ipif));
11016 addrlen = ((struct lifreq *)if_req)->lifr_addrlen;
11018 if (ipif->ipif_isv6) {
11019 sin6_t *sin6;
11021 if (sin->sin_family != AF_INET6)
11022 return (EAFNOSUPPORT);
11024 sin6 = (sin6_t *)sin;
11025 v6addr = sin6->sin6_addr;
11026 if (!ip_remote_addr_ok_v6(&v6addr, &ipv6_all_ones))
11027 return (EADDRNOTAVAIL);
11028 } else {
11029 ipaddr_t addr;
11031 if (sin->sin_family != AF_INET)
11032 return (EAFNOSUPPORT);
11034 addr = sin->sin_addr.s_addr;
11035 if (!ip_addr_ok_v4(addr, 0xFFFFFFFF))
11036 return (EADDRNOTAVAIL);
11037 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
11038 /* Add the 96 bits of the IPv4-mapped prefix */
11039 addrlen += IPV6_ABITS - IP_ABITS;
11042 if (ip_plen_to_mask_v6(addrlen, &v6mask) == NULL)
11043 return (EINVAL);
11045 /* Check if any bits in the address are set past the mask */
11046 if (!V6_MASK_EQ(v6addr, v6mask, v6addr))
11047 return (EINVAL);
11049 if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6subnet, &v6addr) &&
11050 IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6net_mask, &v6mask))
11051 return (0); /* No change */
11053 if (ipif->ipif_flags & IPIF_UP) {
11055 * If the interface is already marked up,
11056 * we call ipif_down which will take care
11057 * of ditching any IREs that have been set
11058 * up based on the old interface address.
11060 err = ipif_logical_down(ipif, q, mp);
11061 if (err == EINPROGRESS)
11062 return (err);
11063 (void) ipif_down_tail(ipif);
11064 need_up = B_TRUE;
11067 err = ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, need_up);
11068 return (err);
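/*
 * In the IPv4 case above, the subnet is stored in IPv4-mapped form, so
 * the prefix length is widened by the 96 bits of the ::ffff:0:0/96
 * mapping prefix: e.g., 192.168.1.0/24 becomes ::ffff:192.168.1.0/120
 * (24 + 96 = 120) before the mask and subnet checks.
 */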
11071 static int
11072 ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t v6addr, in6_addr_t v6mask,
11073 queue_t *q, mblk_t *mp, boolean_t need_up)
11075 ill_t *ill = ipif->ipif_ill;
11076 int err = 0;
11078 ip1dbg(("ip_sioctl_subnet_tail(%s:%u %p)\n",
11079 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11081 /* Set the new address. */
11082 mutex_enter(&ill->ill_lock);
11083 ipif->ipif_v6net_mask = v6mask;
11084 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) {
11085 V6_MASK_COPY(v6addr, ipif->ipif_v6net_mask,
11086 ipif->ipif_v6subnet);
11088 mutex_exit(&ill->ill_lock);
11090 if (need_up) {
11092 * Now bring the interface back up. If this
11093 * is the only IPIF for the ILL, ipif_up
11094 * will have to re-bind to the device, so
11095 * we may get back EINPROGRESS, in which
11096 * case, this IOCTL will get completed in
11097 * ip_rput_dlpi when we see the DL_BIND_ACK.
11099 err = ipif_up(ipif, q, mp);
11100 if (err == EINPROGRESS)
11101 return (err);
11103 return (err);
11106 /* ARGSUSED */
11108 ip_sioctl_subnet_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11109 ip_ioctl_cmd_t *ipip, void *if_req)
11111 int addrlen;
11112 in6_addr_t v6addr;
11113 in6_addr_t v6mask;
11114 struct lifreq *lifr = (struct lifreq *)if_req;
11116 ip1dbg(("ip_sioctl_subnet_restart(%s:%u %p)\n",
11117 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11118 (void) ipif_down_tail(ipif);
11120 addrlen = lifr->lifr_addrlen;
11121 if (ipif->ipif_isv6) {
11122 sin6_t *sin6;
11124 sin6 = (sin6_t *)sin;
11125 v6addr = sin6->sin6_addr;
11126 } else {
11127 ipaddr_t addr;
11129 addr = sin->sin_addr.s_addr;
11130 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
11131 addrlen += IPV6_ABITS - IP_ABITS;
11133 (void) ip_plen_to_mask_v6(addrlen, &v6mask);
11135 return (ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, B_TRUE));
11138 /* ARGSUSED */
11140 ip_sioctl_get_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11141 ip_ioctl_cmd_t *ipip, void *if_req)
11143 struct lifreq *lifr = (struct lifreq *)if_req;
11144 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sin;
11146 ip1dbg(("ip_sioctl_get_subnet(%s:%u %p)\n",
11147 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11148 ASSERT(ipip->ipi_cmd_type == LIF_CMD);
11150 if (ipif->ipif_isv6) {
11151 *sin6 = sin6_null;
11152 sin6->sin6_family = AF_INET6;
11153 sin6->sin6_addr = ipif->ipif_v6subnet;
11154 lifr->lifr_addrlen =
11155 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask);
11156 } else {
11157 *sin = sin_null;
11158 sin->sin_family = AF_INET;
11159 sin->sin_addr.s_addr = ipif->ipif_subnet;
11160 lifr->lifr_addrlen = ip_mask_to_plen(ipif->ipif_net_mask);
11162 return (0);
11166 * Set the IPv6 address token.
11168 /* ARGSUSED */
11170 ip_sioctl_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11171 ip_ioctl_cmd_t *ipi, void *if_req)
11173 ill_t *ill = ipif->ipif_ill;
11174 int err;
11175 in6_addr_t v6addr;
11176 in6_addr_t v6mask;
11177 boolean_t need_up = B_FALSE;
11178 int i;
11179 sin6_t *sin6 = (sin6_t *)sin;
11180 struct lifreq *lifr = (struct lifreq *)if_req;
11181 int addrlen;
11183 ip1dbg(("ip_sioctl_token(%s:%u %p)\n",
11184 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11185 ASSERT(IAM_WRITER_IPIF(ipif));
11187 addrlen = lifr->lifr_addrlen;
11188 /* Only allow for logical unit zero i.e. not on "le0:17" */
11189 if (ipif->ipif_id != 0)
11190 return (EINVAL);
11192 if (!ipif->ipif_isv6)
11193 return (EINVAL);
11195 if (addrlen > IPV6_ABITS)
11196 return (EINVAL);
11198 v6addr = sin6->sin6_addr;
11201 * The length of the token is measured from the end of the address.
11202 * To get the proper mask for this, compute the mask of the bits not
11203 * in the token (i.e., the prefix), and then invert it to get the mask.
11205 if (ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask) == NULL)
11206 return (EINVAL);
11207 for (i = 0; i < 4; i++) {
11208 v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff;
11211 if (V6_MASK_EQ(v6addr, v6mask, ill->ill_token) &&
11212 ill->ill_token_length == addrlen)
11213 return (0); /* No change */
11215 if (ipif->ipif_flags & IPIF_UP) {
11216 err = ipif_logical_down(ipif, q, mp);
11217 if (err == EINPROGRESS)
11218 return (err);
11219 (void) ipif_down_tail(ipif);
11220 need_up = B_TRUE;
11222 err = ip_sioctl_token_tail(ipif, sin6, addrlen, q, mp, need_up);
11223 return (err);
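/*
 * Worked example of the mask computation above: for a 64-bit token,
 * ip_plen_to_mask_v6(128 - 64) yields the prefix mask
 * ffff:ffff:ffff:ffff::, and inverting each 32-bit word gives
 * ::ffff:ffff:ffff:ffff, the mask covering the low-order (token) bits
 * that V6_MASK_EQ() and V6_MASK_COPY() then operate on.
 */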
11226 static int
11227 ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen, queue_t *q,
11228 mblk_t *mp, boolean_t need_up)
11230 in6_addr_t v6addr;
11231 in6_addr_t v6mask;
11232 ill_t *ill = ipif->ipif_ill;
11233 int i;
11234 int err = 0;
11236 ip1dbg(("ip_sioctl_token_tail(%s:%u %p)\n",
11237 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11238 v6addr = sin6->sin6_addr;
11240 * The length of the token is measured from the end of the address.
11241 * To get the proper mask for this, compute the mask of the bits not
11242 * in the token (i.e., the prefix), and then invert it to get the mask.
11244 (void) ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask);
11245 for (i = 0; i < 4; i++)
11246 v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff;
11248 mutex_enter(&ill->ill_lock);
11249 V6_MASK_COPY(v6addr, v6mask, ill->ill_token);
11250 ill->ill_token_length = addrlen;
11251 ill->ill_manual_token = 1;
11253 /* Reconfigure the link-local address based on this new token */
11254 ipif_setlinklocal(ill->ill_ipif);
11256 mutex_exit(&ill->ill_lock);
11258 if (need_up) {
11260 * Now bring the interface back up. If this
11261 * is the only IPIF for the ILL, ipif_up
11262 * will have to re-bind to the device, so
11263 * we may get back EINPROGRESS, in which
11264 * case, this IOCTL will get completed in
11265 * ip_rput_dlpi when we see the DL_BIND_ACK.
11267 err = ipif_up(ipif, q, mp);
11268 if (err == EINPROGRESS)
11269 return (err);
11271 return (err);
11274 /* ARGSUSED */
11276 ip_sioctl_get_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11277 ip_ioctl_cmd_t *ipi, void *if_req)
11279 ill_t *ill;
11280 sin6_t *sin6 = (sin6_t *)sin;
11281 struct lifreq *lifr = (struct lifreq *)if_req;
11283 ip1dbg(("ip_sioctl_get_token(%s:%u %p)\n",
11284 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11285 if (ipif->ipif_id != 0)
11286 return (EINVAL);
11288 ill = ipif->ipif_ill;
11289 if (!ill->ill_isv6)
11290 return (ENXIO);
11292 *sin6 = sin6_null;
11293 sin6->sin6_family = AF_INET6;
11294 ASSERT(!IN6_IS_ADDR_V4MAPPED(&ill->ill_token));
11295 sin6->sin6_addr = ill->ill_token;
11296 lifr->lifr_addrlen = ill->ill_token_length;
11297 return (0);
11301 * Set (hardware) link-specific information that might override
11302 * what was acquired through the DL_INFO_ACK.
11304 /* ARGSUSED */
11306 ip_sioctl_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11307 ip_ioctl_cmd_t *ipi, void *if_req)
11309 ill_t *ill = ipif->ipif_ill;
11310 int ip_min_mtu;
11311 struct lifreq *lifr = (struct lifreq *)if_req;
11312 lif_ifinfo_req_t *lir;
11314 ip1dbg(("ip_sioctl_lnkinfo(%s:%u %p)\n",
11315 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11316 lir = &lifr->lifr_ifinfo;
11317 ASSERT(IAM_WRITER_IPIF(ipif));
11319 /* Only allow for logical unit zero i.e. not on "bge0:17" */
11320 if (ipif->ipif_id != 0)
11321 return (EINVAL);
11323 /* Set interface MTU. */
11324 if (ipif->ipif_isv6)
11325 ip_min_mtu = IPV6_MIN_MTU;
11326 else
11327 ip_min_mtu = IP_MIN_MTU;
11330 * Verify values before we set anything. Allow zero to
11331 * mean unspecified.
11333 * XXX We should be able to set the user-defined lir_mtu to some value
11334 * that is greater than ill_current_frag but less than ill_max_frag; the
11335 * ill_max_frag value tells us the max MTU that can be handled by the
11336 * datalink, whereas the ill_current_frag is dynamically computed for
11337 * some link-types like tunnels, based on the tunnel PMTU. However,
11338 * since there is currently no way of distinguishing between
11339 * administratively fixed link mtu values (e.g., those set via
11340 * /sbin/dladm) and dynamically discovered MTUs (e.g., those discovered
11341 * for tunnels) we conservatively choose the ill_current_frag as the
11342 * upper-bound.
11344 if (lir->lir_maxmtu != 0 &&
11345 (lir->lir_maxmtu > ill->ill_current_frag ||
11346 lir->lir_maxmtu < ip_min_mtu))
11347 return (EINVAL);
11348 if (lir->lir_reachtime != 0 &&
11349 lir->lir_reachtime > ND_MAX_REACHTIME)
11350 return (EINVAL);
11351 if (lir->lir_reachretrans != 0 &&
11352 lir->lir_reachretrans > ND_MAX_REACHRETRANSTIME)
11353 return (EINVAL);
11355 mutex_enter(&ill->ill_lock);
11357 * The dce and fragmentation code can handle changes to ill_mtu
11358 * concurrent with sending/fragmenting packets.
11360 if (lir->lir_maxmtu != 0)
11361 ill->ill_user_mtu = lir->lir_maxmtu;
11363 if (lir->lir_reachtime != 0)
11364 ill->ill_reachable_time = lir->lir_reachtime;
11366 if (lir->lir_reachretrans != 0)
11367 ill->ill_reachable_retrans_time = lir->lir_reachretrans;
11369 ill->ill_max_hops = lir->lir_maxhops;
11370 ill->ill_max_buf = ND_MAX_Q;
11371 if (!(ill->ill_flags & ILLF_FIXEDMTU) && ill->ill_user_mtu != 0) {
11373 * ill_mtu is the actual interface MTU, obtained as the min
11374 * of user-configured mtu and the value announced by the
11375 * driver (via DL_NOTE_SDU_SIZE/DL_INFO_ACK). Note that since
11376 * we have already made the choice of requiring
11377 * ill_user_mtu <= ill_current_frag by the time we get here,
11378 * ill_mtu effectively gets assigned the ill_user_mtu value
11379 * here.
11381 ill->ill_mtu = MIN(ill->ill_current_frag, ill->ill_user_mtu);
11382 ill->ill_mc_mtu = MIN(ill->ill_mc_mtu, ill->ill_user_mtu);
11384 mutex_exit(&ill->ill_lock);
11387 * Make sure all dce_generation checks find out
11388 * that ill_mtu/ill_mc_mtu has changed.
11390 if (!(ill->ill_flags & ILLF_FIXEDMTU) && (lir->lir_maxmtu != 0))
11391 dce_increment_all_generations(ill->ill_isv6, ill->ill_ipst);
11394 * Refresh IPMP meta-interface MTU if necessary.
11396 if (IS_UNDER_IPMP(ill))
11397 ipmp_illgrp_refresh_mtu(ill->ill_grp);
11399 return (0);
11402 /* ARGSUSED */
11404 ip_sioctl_get_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11405 ip_ioctl_cmd_t *ipi, void *if_req)
11407 struct lif_ifinfo_req *lir;
11408 ill_t *ill = ipif->ipif_ill;
11410 ip1dbg(("ip_sioctl_get_lnkinfo(%s:%u %p)\n",
11411 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11412 if (ipif->ipif_id != 0)
11413 return (EINVAL);
11415 lir = &((struct lifreq *)if_req)->lifr_ifinfo;
11416 lir->lir_maxhops = ill->ill_max_hops;
11417 lir->lir_reachtime = ill->ill_reachable_time;
11418 lir->lir_reachretrans = ill->ill_reachable_retrans_time;
11419 lir->lir_maxmtu = ill->ill_mtu;
11421 return (0);
11425 * Return best guess as to the subnet mask for the specified address.
11426 * Based on the subnet masks for all the configured interfaces.
11428 * We end up returning a zero mask in the case of default, multicast, or
11429 * experimental addresses.
11431 static ipaddr_t
11432 ip_subnet_mask(ipaddr_t addr, ipif_t **ipifp, ip_stack_t *ipst)
11434 ipaddr_t net_mask;
11435 ill_t *ill;
11436 ipif_t *ipif;
11437 ill_walk_context_t ctx;
11438 ipif_t *fallback_ipif = NULL;
11440 net_mask = ip_net_mask(addr);
11441 if (net_mask == 0) {
11442 *ipifp = NULL;
11443 return (0);
11446 /* Let's check to see if this is maybe a local subnet route. */
11447 /* This function only applies to IPv4 interfaces. */
11448 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
11449 ill = ILL_START_WALK_V4(&ctx, ipst);
11450 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
11451 mutex_enter(&ill->ill_lock);
11452 for (ipif = ill->ill_ipif; ipif != NULL;
11453 ipif = ipif->ipif_next) {
11454 if (IPIF_IS_CONDEMNED(ipif))
11455 continue;
11456 if (!(ipif->ipif_flags & IPIF_UP))
11457 continue;
11458 if ((ipif->ipif_subnet & net_mask) ==
11459 (addr & net_mask)) {
11461 * Don't trust pt-pt interfaces if there are
11462 * other interfaces.
11464 if (ipif->ipif_flags & IPIF_POINTOPOINT) {
11465 if (fallback_ipif == NULL) {
11466 ipif_refhold_locked(ipif);
11467 fallback_ipif = ipif;
11469 continue;
11473 * Fine. Just assume the same net mask as the
11474 * directly attached subnet interface is using.
11476 ipif_refhold_locked(ipif);
11477 mutex_exit(&ill->ill_lock);
11478 rw_exit(&ipst->ips_ill_g_lock);
11479 if (fallback_ipif != NULL)
11480 ipif_refrele(fallback_ipif);
11481 *ipifp = ipif;
11482 return (ipif->ipif_net_mask);
11485 mutex_exit(&ill->ill_lock);
11487 rw_exit(&ipst->ips_ill_g_lock);
11489 *ipifp = fallback_ipif;
11490 return ((fallback_ipif != NULL) ?
11491 fallback_ipif->ipif_net_mask : net_mask);
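/*
 * For example, a lookup for 10.1.2.3 with no matching configured
 * interface falls back to the classful ip_net_mask() result
 * (255.0.0.0 for a class A address), while a lookup for the multicast
 * address 224.0.0.1 returns a zero mask immediately.
 */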
11495 * ip_sioctl_copyin_setup calls ip_wput_ioctl to process the IP_IOCTL ioctl.
11497 static void
11498 ip_wput_ioctl(queue_t *q, mblk_t *mp)
11500 IOCP iocp;
11501 ipft_t *ipft;
11502 ipllc_t *ipllc;
11503 mblk_t *mp1;
11504 cred_t *cr;
11505 int error = 0;
11506 conn_t *connp;
11508 ip1dbg(("ip_wput_ioctl"));
11509 iocp = (IOCP)mp->b_rptr;
11510 mp1 = mp->b_cont;
11511 if (mp1 == NULL) {
11512 iocp->ioc_error = EINVAL;
11513 mp->b_datap->db_type = M_IOCNAK;
11514 iocp->ioc_count = 0;
11515 qreply(q, mp);
11516 return;
11520 * These IOCTLs provide various control capabilities to
11521 * upstream agents such as ULPs and processes. There
11522 * are currently two such IOCTLs implemented. They
11523 * are used by TCP to provide update information for
11524 * existing IREs and to forcibly delete an IRE for a
11525 * host that is not responding, thereby forcing an
11526 * attempt at a new route.
11528 iocp->ioc_error = EINVAL;
11529 if (!pullupmsg(mp1, sizeof (ipllc->ipllc_cmd)))
11530 goto done;
11532 ipllc = (ipllc_t *)mp1->b_rptr;
11533 for (ipft = ip_ioctl_ftbl; ipft->ipft_pfi; ipft++) {
11534 if (ipllc->ipllc_cmd == ipft->ipft_cmd)
11535 break;
11538 * prefer credential from mblk over ioctl;
11539 * see ip_sioctl_copyin_setup
11541 cr = msg_getcred(mp, NULL);
11542 if (cr == NULL)
11543 cr = iocp->ioc_cr;
11546 * Refhold the conn in case the request gets queued up in some lookup
11548 ASSERT(CONN_Q(q));
11549 connp = Q_TO_CONN(q);
11550 CONN_INC_REF(connp);
11551 CONN_INC_IOCTLREF(connp);
11552 if (ipft->ipft_pfi &&
11553 ((mp1->b_wptr - mp1->b_rptr) >= ipft->ipft_min_size ||
11554 pullupmsg(mp1, ipft->ipft_min_size))) {
11555 error = (*ipft->ipft_pfi)(q,
11556 (ipft->ipft_flags & IPFT_F_SELF_REPLY) ? mp : mp1, cr);
11558 if (ipft->ipft_flags & IPFT_F_SELF_REPLY) {
11560 * CONN_OPER_PENDING_DONE happens in the function called
11561 * through ipft_pfi above.
11563 return;
11566 CONN_DEC_IOCTLREF(connp);
11567 CONN_OPER_PENDING_DONE(connp);
11568 if (ipft->ipft_flags & IPFT_F_NO_REPLY) {
11569 freemsg(mp);
11570 return;
11572 iocp->ioc_error = error;
11574 done:
11575 mp->b_datap->db_type = M_IOCACK;
11576 if (iocp->ioc_error)
11577 iocp->ioc_count = 0;
11578 qreply(q, mp);
11582 * Assign a unique id to the ipif. This is used by sctp_addr.c.
11583 * Note: remove if sctp_addr.c is redone to not shadow ill/ipif data structures.
11585 static void
11586 ipif_assign_seqid(ipif_t *ipif)
11588 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
11590 ipif->ipif_seqid = atomic_inc_64_nv(&ipst->ips_ipif_g_seqid);
11594 * Clone the contents of `sipif' to `dipif'. Requires that both ipifs are
11595 * administratively down (i.e., no DAD), of the same type, and locked. Note
11596 * that the clone is complete -- including the seqid -- and the expectation is
11597 * that the caller will either free or overwrite `sipif' before it's unlocked.
11599 static void
11600 ipif_clone(const ipif_t *sipif, ipif_t *dipif)
11602 ASSERT(MUTEX_HELD(&sipif->ipif_ill->ill_lock));
11603 ASSERT(MUTEX_HELD(&dipif->ipif_ill->ill_lock));
11604 ASSERT(!(sipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE)));
11605 ASSERT(!(dipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE)));
11606 ASSERT(sipif->ipif_ire_type == dipif->ipif_ire_type);
11608 dipif->ipif_flags = sipif->ipif_flags;
11609 dipif->ipif_zoneid = sipif->ipif_zoneid;
11610 dipif->ipif_v6subnet = sipif->ipif_v6subnet;
11611 dipif->ipif_v6lcl_addr = sipif->ipif_v6lcl_addr;
11612 dipif->ipif_v6net_mask = sipif->ipif_v6net_mask;
11613 dipif->ipif_v6brd_addr = sipif->ipif_v6brd_addr;
11614 dipif->ipif_v6pp_dst_addr = sipif->ipif_v6pp_dst_addr;
11617 * As per the comment atop the function, we assume that these sipif
11618 * fields will be changed before sipif is unlocked.
11620 dipif->ipif_seqid = sipif->ipif_seqid;
11621 dipif->ipif_state_flags = sipif->ipif_state_flags;
11625 * Transfer the contents of `sipif' to `dipif', and then free (if `virgipif'
11626 * is NULL) or overwrite `sipif' with `virgipif', which must be a virgin
11627 * (unreferenced) ipif. Also, if `sipif' is used by the current xop, then
11628 * transfer the xop to `dipif'. Requires that all ipifs are administratively
11629 * down (i.e., no DAD), of the same type, and unlocked.
11631 static void
11632 ipif_transfer(ipif_t *sipif, ipif_t *dipif, ipif_t *virgipif)
11634 ipsq_t *ipsq = sipif->ipif_ill->ill_phyint->phyint_ipsq;
11635 ipxop_t *ipx = ipsq->ipsq_xop;
11637 ASSERT(sipif != dipif);
11638 ASSERT(sipif != virgipif);
11641 * Grab all of the locks that protect the ipif in a defined order.
11643 GRAB_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill);
11645 ipif_clone(sipif, dipif);
11646 if (virgipif != NULL) {
11647 ipif_clone(virgipif, sipif);
11648 mi_free(virgipif);
11651 RELEASE_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill);
11654 * Transfer ownership of the current xop, if necessary.
11656 if (ipx->ipx_current_ipif == sipif) {
11657 ASSERT(ipx->ipx_pending_ipif == NULL);
11658 mutex_enter(&ipx->ipx_lock);
11659 ipx->ipx_current_ipif = dipif;
11660 mutex_exit(&ipx->ipx_lock);
11663 if (virgipif == NULL)
11664 mi_free(sipif);
11668 * Checks that:
11669 * - <ill_name>:<ipif_id> is at most LIFNAMSIZ - 1 characters long, and
11670 * - the logical interface id is within the allowed range.
11672 static int
11673 is_lifname_valid(ill_t *ill, unsigned int ipif_id)
11675 if (snprintf(NULL, 0, "%s:%d", ill->ill_name, ipif_id) >= LIFNAMSIZ)
11676 return (ENAMETOOLONG);
11678 if (ipif_id >= ill->ill_ipst->ips_ip_addrs_per_if)
11679 return (ERANGE);
11680 return (0);
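/*
 * snprintf(NULL, 0, ...) is the standard C99 idiom for computing a
 * formatted length without writing anything: e.g., for ill_name "net0"
 * and ipif_id 17 it returns strlen("net0:17") == 7, which is compared
 * against LIFNAMSIZ (the buffer size, so the name may occupy at most
 * LIFNAMSIZ - 1 characters plus the terminating NUL).
 */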
11684 * Insert the ipif, so that the list of ipifs on the ill will be sorted
11685 * with respect to ipif_id. Note that an ipif with an ipif_id of -1 will
11686 * be inserted into the first space available in the list. The value of
11687 * ipif_id will then be set to the appropriate value for its position.
11689 static int
11690 ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock)
11692 ill_t *ill;
11693 ipif_t *tipif;
11694 ipif_t **tipifp;
11695 int id, err;
11696 ip_stack_t *ipst;
11698 ASSERT(ipif->ipif_ill->ill_net_type == IRE_LOOPBACK ||
11699 IAM_WRITER_IPIF(ipif));
11701 ill = ipif->ipif_ill;
11702 ASSERT(ill != NULL);
11703 ipst = ill->ill_ipst;
11706 * In the case of lo0:0 we already hold the ill_g_lock.
11707 * ill_lookup_on_name (acquires ill_g_lock) -> ipif_allocate ->
11708 * ipif_insert.
11710 if (acquire_g_lock)
11711 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
11712 mutex_enter(&ill->ill_lock);
11713 id = ipif->ipif_id;
11714 tipifp = &(ill->ill_ipif);
11715 if (id == -1) { /* need to find a real id */
11716 id = 0;
11717 while ((tipif = *tipifp) != NULL) {
11718 ASSERT(tipif->ipif_id >= id);
11719 if (tipif->ipif_id != id)
11720 break; /* non-consecutive id */
11721 id++;
11722 tipifp = &(tipif->ipif_next);
11724 if ((err = is_lifname_valid(ill, id)) != 0) {
11725 mutex_exit(&ill->ill_lock);
11726 if (acquire_g_lock)
11727 rw_exit(&ipst->ips_ill_g_lock);
11728 return (err);
11730 ipif->ipif_id = id; /* assign new id */
11731 } else if ((err = is_lifname_valid(ill, id)) == 0) {
11732 /* we have a real id; insert ipif in the right place */
11733 while ((tipif = *tipifp) != NULL) {
11734 ASSERT(tipif->ipif_id != id);
11735 if (tipif->ipif_id > id)
11736 break; /* found correct location */
11737 tipifp = &(tipif->ipif_next);
11739 } else {
11740 mutex_exit(&ill->ill_lock);
11741 if (acquire_g_lock)
11742 rw_exit(&ipst->ips_ill_g_lock);
11743 return (err);
11746 ASSERT(tipifp != &(ill->ill_ipif) || id == 0);
11748 ipif->ipif_next = tipif;
11749 *tipifp = ipif;
11750 mutex_exit(&ill->ill_lock);
11751 if (acquire_g_lock)
11752 rw_exit(&ipst->ips_ill_g_lock);
11754 return (0);
11757 static void
11758 ipif_remove(ipif_t *ipif)
11760 ipif_t **ipifp;
11761 ill_t *ill = ipif->ipif_ill;
11763 ASSERT(RW_WRITE_HELD(&ill->ill_ipst->ips_ill_g_lock));
11765 mutex_enter(&ill->ill_lock);
11766 ipifp = &ill->ill_ipif;
11767 for (; *ipifp != NULL; ipifp = &ipifp[0]->ipif_next) {
11768 if (*ipifp == ipif) {
11769 *ipifp = ipif->ipif_next;
11770 break;
11773 mutex_exit(&ill->ill_lock);
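/*
 * Standalone sketch of the pointer-to-pointer list idiom used by
 * ipif_insert() and ipif_remove() above: walking a `node_t **' lets one
 * loop handle insertion or unlinking at the head and in the middle
 * identically, with no special case for the first element and no back
 * pointer. The node_t type here is hypothetical.
 */
#include <stddef.h>

typedef struct node { int id; struct node *next; } node_t;

static void
sorted_insert(node_t **headp, node_t *n)
{
	node_t **npp;

	for (npp = headp; *npp != NULL; npp = &(*npp)->next) {
		if ((*npp)->id > n->id)
			break;			/* insertion point found */
	}
	n->next = *npp;
	*npp = n;
}

static void
list_remove(node_t **headp, node_t *n)
{
	node_t **npp;

	for (npp = headp; *npp != NULL; npp = &(*npp)->next) {
		if (*npp == n) {
			*npp = n->next;		/* unlink in place */
			break;
		}
	}
}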
11777 * Allocate and initialize a new interface control structure. (Always
11778 * called as writer.)
11779 * When ipif_allocate() is called from ip_ll_subnet_defaults, the ill
11780 * is not part of the global linked list of ills. ipif_seqid is unique
11781 * in the system and to preserve the uniqueness, it is assigned only
11782 * when ill becomes part of the global list. At that point ill will
11783 * have a name. If it doesn't get assigned here, it will get assigned
11784 * in ipif_set_values() as part of SIOCSLIFNAME processing.
11785  * Additionally, if we come here from ip_ll_subnet_defaults, we don't set
11786 * the interface flags or any other information from the DL_INFO_ACK for
11787 * DL_STYLE2 drivers (initialize == B_FALSE), since we won't have them at
11788 * this point. The flags etc. will be set in ip_ll_subnet_defaults when the
11789 * second DL_INFO_ACK comes in from the driver.
11791 static ipif_t *
11792 ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize,
11793 boolean_t insert, int *errorp)
11795 int err;
11796 ipif_t *ipif;
11797 ip_stack_t *ipst = ill->ill_ipst;
11799 ip1dbg(("ipif_allocate(%s:%d ill %p)\n",
11800 ill->ill_name, id, (void *)ill));
11801 ASSERT(ire_type == IRE_LOOPBACK || IAM_WRITER_ILL(ill));
11803 if (errorp != NULL)
11804 *errorp = 0;
11806 if ((ipif = mi_alloc(sizeof (ipif_t), BPRI_MED)) == NULL) {
11807 if (errorp != NULL)
11808 *errorp = ENOMEM;
11809 return (NULL);
11811 *ipif = ipif_zero; /* start clean */
11813 ipif->ipif_ill = ill;
11814 ipif->ipif_id = id; /* could be -1 */
11816 * Inherit the zoneid from the ill; for the shared stack instance
11817 * this is always the global zone
11819 ipif->ipif_zoneid = ill->ill_zoneid;
11821 ipif->ipif_refcnt = 0;
11823 if (insert) {
11824 if ((err = ipif_insert(ipif, ire_type != IRE_LOOPBACK)) != 0) {
11825 mi_free(ipif);
11826 if (errorp != NULL)
11827 *errorp = err;
11828 return (NULL);
11830 /* -1 id should have been replaced by real id */
11831 id = ipif->ipif_id;
11832 ASSERT(id >= 0);
11835 if (ill->ill_name[0] != '\0')
11836 ipif_assign_seqid(ipif);
11839 * If this is the zeroth ipif on the IPMP ill, create the illgrp
11840 * (which must not exist yet because the zeroth ipif is created once
11841  * per ill). However, do not link it to the ipmp_grp_t until
11842 * I_PLINK is called; see ip_sioctl_plink_ipmp() for details.
11844 if (id == 0 && IS_IPMP(ill)) {
11845 if (ipmp_illgrp_create(ill) == NULL) {
11846 if (insert) {
11847 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
11848 ipif_remove(ipif);
11849 rw_exit(&ipst->ips_ill_g_lock);
11851 mi_free(ipif);
11852 if (errorp != NULL)
11853 *errorp = ENOMEM;
11854 return (NULL);
11859 * We grab ill_lock to protect the flag changes. The ipif is still
11860 * not up and can't be looked up until the ioctl completes and the
11861 * IPIF_CHANGING flag is cleared.
11863 mutex_enter(&ill->ill_lock);
11865 ipif->ipif_ire_type = ire_type;
11867 if (ipif->ipif_isv6) {
11868 ill->ill_flags |= ILLF_IPV6;
11869 } else {
11870 ipaddr_t inaddr_any = INADDR_ANY;
11872 ill->ill_flags |= ILLF_IPV4;
11874 /* Keep the IN6_IS_ADDR_V4MAPPED assertions happy */
11875 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
11876 &ipif->ipif_v6lcl_addr);
11877 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
11878 &ipif->ipif_v6subnet);
11879 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
11880 &ipif->ipif_v6net_mask);
11881 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
11882 &ipif->ipif_v6brd_addr);
11883 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
11884 &ipif->ipif_v6pp_dst_addr);
11888  * Don't set the interface flags etc. now; we'll do it in
11889  * ip_ll_subnet_defaults.
11891 if (!initialize)
11892 goto out;
11895 * NOTE: The IPMP meta-interface is special-cased because it starts
11896 * with no underlying interfaces (and thus an unknown broadcast
11897 * address length), but all interfaces that can be placed into an IPMP
11898 * group are required to be broadcast-capable.
11900 if (ill->ill_bcast_addr_length != 0 || IS_IPMP(ill)) {
11902 * Later detect lack of DLPI driver multicast capability by
11903 * catching DL_ENABMULTI_REQ errors in ip_rput_dlpi().
11905 ill->ill_flags |= ILLF_MULTICAST;
11906 if (!ipif->ipif_isv6)
11907 ipif->ipif_flags |= IPIF_BROADCAST;
11908 } else {
11909 if (ill->ill_net_type != IRE_LOOPBACK) {
11910 if (ipif->ipif_isv6)
11912 * Note: xresolv interfaces will eventually need
11913 * NOARP set here as well, but that will require
11914 * those external resolvers to have some
11915 * knowledge of that flag and act appropriately.
11916 * Not to be changed at present.
11918 ill->ill_flags |= ILLF_NONUD;
11919 else
11920 ill->ill_flags |= ILLF_NOARP;
11922 if (ill->ill_phys_addr_length == 0) {
11923 if (IS_VNI(ill)) {
11924 ipif->ipif_flags |= IPIF_NOXMIT;
11925 } else {
11926 /* pt-pt supports multicast. */
11927 ill->ill_flags |= ILLF_MULTICAST;
11928 if (ill->ill_net_type != IRE_LOOPBACK)
11929 ipif->ipif_flags |= IPIF_POINTOPOINT;
11933 out:
11934 mutex_exit(&ill->ill_lock);
11935 return (ipif);
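/*
 * Illustration of what the IN6_IPADDR_TO_V4MAPPED() initialization above
 * produces: an IPv4 address embedded in the ::ffff:a.b.c.d ("v4-mapped")
 * form of an IPv6 address. This is a plain libc sketch of the layout,
 * not the kernel macro itself.
 */
#include <netinet/in.h>
#include <string.h>

static void
v4_to_v4mapped(const struct in_addr *v4, struct in6_addr *v6)
{
	memset(v6, 0, sizeof (*v6));	/* bytes 0-9 are zero */
	v6->s6_addr[10] = 0xff;		/* bytes 10-11 are 0xffff */
	v6->s6_addr[11] = 0xff;
	memcpy(&v6->s6_addr[12], &v4->s_addr, sizeof (v4->s_addr));
}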
11939 * Remove the neighbor cache entries associated with this logical
11940 * interface.
11943 ipif_arp_down(ipif_t *ipif)
11945 ill_t *ill = ipif->ipif_ill;
11946 int err = 0;
11948 ip1dbg(("ipif_arp_down(%s:%u)\n", ill->ill_name, ipif->ipif_id));
11949 ASSERT(IAM_WRITER_IPIF(ipif));
11951 DTRACE_PROBE3(ipif__downup, char *, "ipif_arp_down",
11952 ill_t *, ill, ipif_t *, ipif);
11953 ipif_nce_down(ipif);
11956 * If this is the last ipif that is going down and there are no
11957 * duplicate addresses we may yet attempt to re-probe, then we need to
11958 * clean up ARP completely.
11960 if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0 &&
11961 !ill->ill_logical_down && ill->ill_net_type == IRE_IF_RESOLVER) {
11963 * If this was the last ipif on an IPMP interface, purge any
11964 * static ARP entries associated with it.
11966 if (IS_IPMP(ill))
11967 ipmp_illgrp_refresh_arpent(ill->ill_grp);
11969 /* UNBIND, DETACH */
11970 err = arp_ll_down(ill);
11973 return (err);
11977 * Get the resolver set up for a new IP address. (Always called as writer.)
11978 * Called both for IPv4 and IPv6 interfaces, though it only does some
11979 * basic DAD related initialization for IPv6. Honors ILLF_NOARP.
11981 * The enumerated value res_act tunes the behavior:
11982 * * Res_act_initial: set up all the resolver structures for a new
11983 * IP address.
11984 * * Res_act_defend: tell ARP that it needs to send a single gratuitous
11985 * ARP message in defense of the address.
11986 * * Res_act_rebind: tell ARP to change the hardware address for an IP
11987 * address (and issue gratuitous ARPs). Used by ipmp_ill_bind_ipif().
11989 * Returns zero on success, or an errno upon failure.
11992 ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act)
11994 ill_t *ill = ipif->ipif_ill;
11995 int err;
11996 boolean_t was_dup;
11998 ip1dbg(("ipif_resolver_up(%s:%u) flags 0x%x\n",
11999 ill->ill_name, ipif->ipif_id, (uint_t)ipif->ipif_flags));
12000 ASSERT(IAM_WRITER_IPIF(ipif));
12002 was_dup = B_FALSE;
12003 if (res_act == Res_act_initial) {
12004 ipif->ipif_addr_ready = 0;
12006 * We're bringing an interface up here. There's no way that we
12007 * should need to shut down ARP now.
12009 mutex_enter(&ill->ill_lock);
12010 if (ipif->ipif_flags & IPIF_DUPLICATE) {
12011 ipif->ipif_flags &= ~IPIF_DUPLICATE;
12012 ill->ill_ipif_dup_count--;
12013 was_dup = B_TRUE;
12015 mutex_exit(&ill->ill_lock);
12017 if (ipif->ipif_recovery_id != 0)
12018 (void) untimeout(ipif->ipif_recovery_id);
12019 ipif->ipif_recovery_id = 0;
12020 if (ill->ill_net_type != IRE_IF_RESOLVER) {
12021 ipif->ipif_addr_ready = 1;
12022 return (0);
12024 /* NDP will set the ipif_addr_ready flag when it's ready */
12025 if (ill->ill_isv6)
12026 return (0);
12028 err = ipif_arp_up(ipif, res_act, was_dup);
12029 return (err);
12033 * This routine restarts IPv4/IPv6 duplicate address detection (DAD)
12034 * when a link has just gone back up.
12036 static void
12037 ipif_nce_start_dad(ipif_t *ipif)
12039 ncec_t *ncec;
12040 ill_t *ill = ipif->ipif_ill;
12041 boolean_t isv6 = ill->ill_isv6;
12043 if (isv6) {
12044 ncec = ncec_lookup_illgrp_v6(ipif->ipif_ill,
12045 &ipif->ipif_v6lcl_addr);
12046 } else {
12047 ipaddr_t v4addr;
12049 if (ill->ill_net_type != IRE_IF_RESOLVER ||
12050 (ipif->ipif_flags & IPIF_UNNUMBERED) ||
12051 ipif->ipif_lcl_addr == INADDR_ANY) {
12053 * If we can't contact ARP for some reason,
12054 * that's not really a problem. Just send
12055 * out the routing socket notification that
12056 * DAD completion would have done, and continue.
12058 ipif_mask_reply(ipif);
12059 ipif_up_notify(ipif);
12060 ipif->ipif_addr_ready = 1;
12061 return;
12064 IN6_V4MAPPED_TO_IPADDR(&ipif->ipif_v6lcl_addr, v4addr);
12065 ncec = ncec_lookup_illgrp_v4(ipif->ipif_ill, &v4addr);
12068 if (ncec == NULL) {
12069 ip1dbg(("couldn't find ncec for ipif %p leaving !ready\n",
12070 (void *)ipif));
12071 return;
12073 if (!nce_restart_dad(ncec)) {
12075 * If we can't restart DAD for some reason, that's not really a
12076 * problem. Just send out the routing socket notification that
12077 * DAD completion would have done, and continue.
12079 ipif_up_notify(ipif);
12080 ipif->ipif_addr_ready = 1;
12082 ncec_refrele(ncec);
12086 * Restart duplicate address detection on all interfaces on the given ill.
12088 * This is called when an interface transitions from down to up
12089 * (DL_NOTE_LINK_UP) or up to down (DL_NOTE_LINK_DOWN).
12091 * Note that since the underlying physical link has transitioned, we must cause
12092 * at least one routing socket message to be sent here, either via DAD
12093 * completion or just by default on the first ipif. (If we don't do this, then
12094 * in.mpathd will see long delays when doing link-based failure recovery.)
12096 void
12097 ill_restart_dad(ill_t *ill, boolean_t went_up)
12099 ipif_t *ipif;
12101 if (ill == NULL)
12102 return;
12105 * If layer two doesn't support duplicate address detection, then just
12106 * send the routing socket message now and be done with it.
12108 if (!ill->ill_isv6 && arp_no_defense) {
12109 ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT);
12110 return;
12113 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
12114 if (went_up) {
12116 if (ipif->ipif_flags & IPIF_UP) {
12117 ipif_nce_start_dad(ipif);
12118 } else if (ipif->ipif_flags & IPIF_DUPLICATE) {
12120 * kick off the bring-up process now.
12122 ipif_do_recovery(ipif);
12123 } else {
12125 * Unfortunately, the first ipif is "special"
12126 * and represents the underlying ill in the
12127 * routing socket messages. Thus, when this
12128 * one ipif is down, we must still notify so
12129 * that the user knows the IFF_RUNNING status
12130 * change. (If the first ipif is up, then
12131 * we'll handle eventual routing socket
12132 * notification via DAD completion.)
12134 if (ipif == ill->ill_ipif) {
12135 ip_rts_ifmsg(ill->ill_ipif,
12136 RTSQ_DEFAULT);
12139 } else {
12141 * After link down, we'll need to send a new routing
12142 * message when the link comes back, so clear
12143 * ipif_addr_ready.
12145 ipif->ipif_addr_ready = 0;
12150 * If we've torn down links, then notify the user right away.
12152 if (!went_up)
12153 ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT);
12156 static void
12157 ipsq_delete(ipsq_t *ipsq)
12159 ipxop_t *ipx = ipsq->ipsq_xop;
12161 ipsq->ipsq_ipst = NULL;
12162 ASSERT(ipsq->ipsq_phyint == NULL);
12163 ASSERT(ipsq->ipsq_xop != NULL);
12164 ASSERT(ipsq->ipsq_xopq_mphead == NULL && ipx->ipx_mphead == NULL);
12165 ASSERT(ipx->ipx_pending_mp == NULL);
12166 kmem_free(ipsq, sizeof (ipsq_t));
12169 static int
12170 ill_up_ipifs_on_ill(ill_t *ill, queue_t *q, mblk_t *mp)
12172 int err = 0;
12173 ipif_t *ipif;
12175 if (ill == NULL)
12176 return (0);
12178 ASSERT(IAM_WRITER_ILL(ill));
12179 ill->ill_up_ipifs = B_TRUE;
12180 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
12181 if (ipif->ipif_was_up) {
12182 if (!(ipif->ipif_flags & IPIF_UP))
12183 err = ipif_up(ipif, q, mp);
12184 ipif->ipif_was_up = B_FALSE;
12185 if (err != 0) {
12186 ASSERT(err == EINPROGRESS);
12187 return (err);
12191 ill->ill_up_ipifs = B_FALSE;
12192 return (0);
12196 * This function is called to bring up all the ipifs that were up before
12197 * bringing the ill down via ill_down_ipifs().
12200 ill_up_ipifs(ill_t *ill, queue_t *q, mblk_t *mp)
12202 int err;
12204 ASSERT(IAM_WRITER_ILL(ill));
12206 if (ill->ill_replumbing) {
12207 ill->ill_replumbing = 0;
12209 * Send down REPLUMB_DONE notification followed by the
12210 * BIND_REQ on the arp stream.
12212 if (!ill->ill_isv6)
12213 arp_send_replumb_conf(ill);
12215 err = ill_up_ipifs_on_ill(ill->ill_phyint->phyint_illv4, q, mp);
12216 if (err != 0)
12217 return (err);
12219 return (ill_up_ipifs_on_ill(ill->ill_phyint->phyint_illv6, q, mp));
12223 * Bring down any IPIF_UP ipifs on ill. If "logical" is B_TRUE, we bring
12224 * down the ipifs without sending DL_UNBIND_REQ to the driver.
12226 static void
12227 ill_down_ipifs(ill_t *ill, boolean_t logical)
12229 ipif_t *ipif;
12231 ASSERT(IAM_WRITER_ILL(ill));
12233 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
12235 * We go through the ipif_down logic even if the ipif
12236 * is already down, since routes can be added based
12237 * on down ipifs. Going through ipif_down once again
12238 * will delete any IREs created based on these routes.
12240 if (ipif->ipif_flags & IPIF_UP)
12241 ipif->ipif_was_up = B_TRUE;
12243 if (logical) {
12244 (void) ipif_logical_down(ipif, NULL, NULL);
12245 ipif_non_duplicate(ipif);
12246 (void) ipif_down_tail(ipif);
12247 } else {
12248 (void) ipif_down(ipif, NULL, NULL);
12254  * Redo source address selection. This makes IXAF_VERIFY_SOURCE
12255  * re-examine the set of valid source addresses.
12256  * This should be called each time the set of source addresses
12257  * changes.
12259 void
12260 ip_update_source_selection(ip_stack_t *ipst)
12262 /* We skip past SRC_GENERATION_VERIFY */
12263 if (atomic_inc_32_nv(&ipst->ips_src_generation) ==
12264 SRC_GENERATION_VERIFY)
12265 atomic_inc_32(&ipst->ips_src_generation);
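/*
 * Self-contained sketch of the generation-counter idiom above: one value
 * (GEN_VERIFY here, standing in for SRC_GENERATION_VERIFY) is reserved as
 * a sentinel, so the bump must skip over it when the counter wraps onto
 * it. C11 atomics stand in for atomic_inc_32_nv(); names are hypothetical.
 */
#include <stdatomic.h>
#include <stdint.h>

#define	GEN_VERIFY	0	/* hypothetical reserved sentinel value */

static _Atomic uint32_t src_generation = 1;

static void
bump_generation(void)
{
	/* atomic_fetch_add returns the old value, so +1 is the new one */
	if (atomic_fetch_add(&src_generation, 1) + 1 == GEN_VERIFY)
		atomic_fetch_add(&src_generation, 1);
}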
12269 * Finish the group join started in ip_sioctl_groupname().
12271 /* ARGSUSED */
12272 static void
12273 ip_join_illgrps(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy)
12275 ill_t *ill = q->q_ptr;
12276 phyint_t *phyi = ill->ill_phyint;
12277 ipmp_grp_t *grp = phyi->phyint_grp;
12278 ip_stack_t *ipst = ill->ill_ipst;
12280 /* IS_UNDER_IPMP() won't work until ipmp_ill_join_illgrp() is called */
12281 ASSERT(!IS_IPMP(ill) && grp != NULL);
12282 ASSERT(IAM_WRITER_IPSQ(ipsq));
12284 if (phyi->phyint_illv4 != NULL) {
12285 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
12286 VERIFY(grp->gr_pendv4-- > 0);
12287 rw_exit(&ipst->ips_ipmp_lock);
12288 ipmp_ill_join_illgrp(phyi->phyint_illv4, grp->gr_v4);
12290 if (phyi->phyint_illv6 != NULL) {
12291 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
12292 VERIFY(grp->gr_pendv6-- > 0);
12293 rw_exit(&ipst->ips_ipmp_lock);
12294 ipmp_ill_join_illgrp(phyi->phyint_illv6, grp->gr_v6);
12296 freemsg(mp);
12300 * Process an SIOCSLIFGROUPNAME request.
12302 /* ARGSUSED */
12304 ip_sioctl_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12305 ip_ioctl_cmd_t *ipip, void *ifreq)
12307 struct lifreq *lifr = ifreq;
12308 ill_t *ill = ipif->ipif_ill;
12309 ip_stack_t *ipst = ill->ill_ipst;
12310 phyint_t *phyi = ill->ill_phyint;
12311 ipmp_grp_t *grp = phyi->phyint_grp;
12312 mblk_t *ipsq_mp;
12313 int err = 0;
12316 * Note that phyint_grp can only change here, where we're exclusive.
12318 ASSERT(IAM_WRITER_ILL(ill));
12320 if (ipif->ipif_id != 0 || ill->ill_usesrc_grp_next != NULL ||
12321 (phyi->phyint_flags & PHYI_VIRTUAL))
12322 return (EINVAL);
12324 lifr->lifr_groupname[LIFGRNAMSIZ - 1] = '\0';
12326 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
12329 * If the name hasn't changed, there's nothing to do.
12331 if (grp != NULL && strcmp(grp->gr_name, lifr->lifr_groupname) == 0)
12332 goto unlock;
12335 * Handle requests to rename an IPMP meta-interface.
12337 * Note that creation of the IPMP meta-interface is handled in
12338 * userland through the standard plumbing sequence. As part of the
12339 * plumbing the IPMP meta-interface, its initial groupname is set to
12340 * the name of the interface (see ipif_set_values_tail()).
12342 if (IS_IPMP(ill)) {
12343 err = ipmp_grp_rename(grp, lifr->lifr_groupname);
12344 goto unlock;
12348 * Handle requests to add or remove an IP interface from a group.
12350 if (lifr->lifr_groupname[0] != '\0') { /* add */
12352 * Moves are handled by first removing the interface from
12353 * its existing group, and then adding it to another group.
12354 * So, fail if it's already in a group.
12356 if (IS_UNDER_IPMP(ill)) {
12357 err = EALREADY;
12358 goto unlock;
12361 grp = ipmp_grp_lookup(lifr->lifr_groupname, ipst);
12362 if (grp == NULL) {
12363 err = ENOENT;
12364 goto unlock;
12368 * Check if the phyint and its ills are suitable for
12369 * inclusion into the group.
12371 if ((err = ipmp_grp_vet_phyint(grp, phyi)) != 0)
12372 goto unlock;
12375 * Checks pass; join the group, and enqueue the remaining
12376 * illgrp joins for when we've become part of the group xop
12377 * and are exclusive across its IPSQs. Since qwriter_ip()
12378 * requires an mblk_t to scribble on, and since `mp' will be
12379 * freed as part of completing the ioctl, allocate another.
12381 if ((ipsq_mp = allocb(0, BPRI_MED)) == NULL) {
12382 err = ENOMEM;
12383 goto unlock;
12387 * Before we drop ipmp_lock, bump gr_pend* to ensure that the
12388 * IPMP meta-interface ills needed by `phyi' cannot go away
12389 * before ip_join_illgrps() is called back. See the comments
12390 * in ip_sioctl_plink_ipmp() for more.
12392 if (phyi->phyint_illv4 != NULL)
12393 grp->gr_pendv4++;
12394 if (phyi->phyint_illv6 != NULL)
12395 grp->gr_pendv6++;
12397 rw_exit(&ipst->ips_ipmp_lock);
12399 ipmp_phyint_join_grp(phyi, grp);
12400 ill_refhold(ill);
12401 qwriter_ip(ill, ill->ill_rq, ipsq_mp, ip_join_illgrps,
12402 SWITCH_OP, B_FALSE);
12403 return (0);
12404 } else {
12406 * Request to remove the interface from a group. If the
12407 * interface is not in a group, this trivially succeeds.
12409 rw_exit(&ipst->ips_ipmp_lock);
12410 if (IS_UNDER_IPMP(ill))
12411 ipmp_phyint_leave_grp(phyi);
12412 return (0);
12414 unlock:
12415 rw_exit(&ipst->ips_ipmp_lock);
12416 return (err);
12420 * Process an SIOCGLIFBINDING request.
12422 /* ARGSUSED */
12424 ip_sioctl_get_binding(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12425 ip_ioctl_cmd_t *ipip, void *ifreq)
12427 ill_t *ill;
12428 struct lifreq *lifr = ifreq;
12429 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
12431 if (!IS_IPMP(ipif->ipif_ill))
12432 return (EINVAL);
12434 rw_enter(&ipst->ips_ipmp_lock, RW_READER);
12435 if ((ill = ipif->ipif_bound_ill) == NULL)
12436 lifr->lifr_binding[0] = '\0';
12437 else
12438 (void) strlcpy(lifr->lifr_binding, ill->ill_name, LIFNAMSIZ);
12439 rw_exit(&ipst->ips_ipmp_lock);
12440 return (0);
12444 * Process an SIOCGLIFGROUPNAME request.
12446 /* ARGSUSED */
12448 ip_sioctl_get_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12449 ip_ioctl_cmd_t *ipip, void *ifreq)
12451 ipmp_grp_t *grp;
12452 struct lifreq *lifr = ifreq;
12453 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
12455 rw_enter(&ipst->ips_ipmp_lock, RW_READER);
12456 if ((grp = ipif->ipif_ill->ill_phyint->phyint_grp) == NULL)
12457 lifr->lifr_groupname[0] = '\0';
12458 else
12459 (void) strlcpy(lifr->lifr_groupname, grp->gr_name, LIFGRNAMSIZ);
12460 rw_exit(&ipst->ips_ipmp_lock);
12461 return (0);
12465 * Process an SIOCGLIFGROUPINFO request.
12467 /* ARGSUSED */
12469 ip_sioctl_groupinfo(ipif_t *dummy_ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12470 ip_ioctl_cmd_t *ipip, void *dummy)
12472 ipmp_grp_t *grp;
12473 lifgroupinfo_t *lifgr;
12474 ip_stack_t *ipst = CONNQ_TO_IPST(q);
12476 /* ip_wput_nondata() verified mp->b_cont->b_cont */
12477 lifgr = (lifgroupinfo_t *)mp->b_cont->b_cont->b_rptr;
12478 lifgr->gi_grname[LIFGRNAMSIZ - 1] = '\0';
12480 rw_enter(&ipst->ips_ipmp_lock, RW_READER);
12481 if ((grp = ipmp_grp_lookup(lifgr->gi_grname, ipst)) == NULL) {
12482 rw_exit(&ipst->ips_ipmp_lock);
12483 return (ENOENT);
12485 ipmp_grp_info(grp, lifgr);
12486 rw_exit(&ipst->ips_ipmp_lock);
12487 return (0);
12490 static void
12491 ill_dl_down(ill_t *ill)
12493 DTRACE_PROBE2(ill__downup, char *, "ill_dl_down", ill_t *, ill);
12496 * The ill is down; unbind but stay attached since we're still
12497  * associated with a PPA. If we have negotiated DLPI capabilities
12498  * with the data link service provider (IDS_OK), then reset them.
12499  * The interval between unbinding and rebinding is potentially
12500  * unbounded, hence we cannot assume things will be the same.
12501 * The DLPI capabilities will be probed again when the data link
12502 * is brought up.
12504 mblk_t *mp = ill->ill_unbind_mp;
12506 ip1dbg(("ill_dl_down(%s)\n", ill->ill_name));
12508 if (!ill->ill_replumbing) {
12509 /* Free all ilms for this ill */
12510 update_conn_ill(ill, ill->ill_ipst);
12511 } else {
12512 ill_leave_multicast(ill);
12515 ill->ill_unbind_mp = NULL;
12516 if (mp != NULL) {
12517 ip1dbg(("ill_dl_down: %s (%u) for %s\n",
12518 dl_primstr(*(int *)mp->b_rptr), *(int *)mp->b_rptr,
12519 ill->ill_name));
12520 mutex_enter(&ill->ill_lock);
12521 ill->ill_state_flags |= ILL_DL_UNBIND_IN_PROGRESS;
12522 mutex_exit(&ill->ill_lock);
12524 * ip_rput does not pass up normal (M_PROTO) DLPI messages
12525 * after ILL_CONDEMNED is set. So in the unplumb case, we call
12526  * ill_capability_dld_disable() right away. If this is not
12527 * an unplumb operation then the disable happens on receipt of
12528 * the capab ack via ip_rput_dlpi_writer ->
12529 * ill_capability_ack_thr. In both cases the order of
12530 * the operations seen by DLD is capability disable followed
12531 * by DL_UNBIND. Also the DLD capability disable needs a
12532 * cv_wait'able context.
12534 if (ill->ill_state_flags & ILL_CONDEMNED)
12535 ill_capability_dld_disable(ill);
12536 ill_capability_reset(ill, B_FALSE);
12537 ill_dlpi_send(ill, mp);
12539 mutex_enter(&ill->ill_lock);
12540 ill->ill_dl_up = 0;
12541 ill_nic_event_dispatch(ill, 0, NE_DOWN, NULL, 0);
12542 mutex_exit(&ill->ill_lock);
12545 void
12546 ill_dlpi_dispatch(ill_t *ill, mblk_t *mp)
12548 union DL_primitives *dlp;
12549 t_uscalar_t prim;
12550 boolean_t waitack = B_FALSE;
12552 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);
12554 dlp = (union DL_primitives *)mp->b_rptr;
12555 prim = dlp->dl_primitive;
12557 ip1dbg(("ill_dlpi_dispatch: sending %s (%u) to %s\n",
12558 dl_primstr(prim), prim, ill->ill_name));
12560 switch (prim) {
12561 case DL_PHYS_ADDR_REQ:
12563 dl_phys_addr_req_t *dlpap = (dl_phys_addr_req_t *)mp->b_rptr;
12564 ill->ill_phys_addr_pend = dlpap->dl_addr_type;
12565 break;
12567 case DL_BIND_REQ:
12568 mutex_enter(&ill->ill_lock);
12569 ill->ill_state_flags &= ~ILL_DL_UNBIND_IN_PROGRESS;
12570 mutex_exit(&ill->ill_lock);
12571 break;
12575 * Except for the ACKs for the M_PCPROTO messages, all other ACKs
12576 * are dropped by ip_rput() if ILL_CONDEMNED is set. Therefore
12577 * we only wait for the ACK of the DL_UNBIND_REQ.
12579 mutex_enter(&ill->ill_lock);
12580 if (!(ill->ill_state_flags & ILL_CONDEMNED) ||
12581 (prim == DL_UNBIND_REQ)) {
12582 ill->ill_dlpi_pending = prim;
12583 waitack = B_TRUE;
12586 mutex_exit(&ill->ill_lock);
12587 DTRACE_PROBE3(ill__dlpi, char *, "ill_dlpi_dispatch",
12588 char *, dl_primstr(prim), ill_t *, ill);
12589 putnext(ill->ill_wq, mp);
12592 * There is no ack for DL_NOTIFY_CONF messages
12594 if (waitack && prim == DL_NOTIFY_CONF)
12595 ill_dlpi_done(ill, prim);
12599 * Helper function for ill_dlpi_send().
12601 /* ARGSUSED */
12602 static void
12603 ill_dlpi_send_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg)
12605 ill_dlpi_send(q->q_ptr, mp);
12609 * Send a DLPI control message to the driver but make sure there
12610 * is only one outstanding message. Uses ill_dlpi_pending to tell
12611 * when it must queue. ip_rput_dlpi_writer calls ill_dlpi_done()
12612 * when an ACK or a NAK is received to process the next queued message.
12614 void
12615 ill_dlpi_send(ill_t *ill, mblk_t *mp)
12617 mblk_t **mpp;
12619 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);
12622 * To ensure that any DLPI requests for current exclusive operation
12623 * are always completely sent before any DLPI messages for other
12624 * operations, require writer access before enqueuing.
12626 if (!IAM_WRITER_ILL(ill)) {
12627 ill_refhold(ill);
12628 /* qwriter_ip() does the ill_refrele() */
12629 qwriter_ip(ill, ill->ill_wq, mp, ill_dlpi_send_writer,
12630 NEW_OP, B_TRUE);
12631 return;
12634 mutex_enter(&ill->ill_lock);
12635 if (ill->ill_dlpi_pending != DL_PRIM_INVAL) {
12636 /* Must queue message. Tail insertion */
12637 mpp = &ill->ill_dlpi_deferred;
12638 while (*mpp != NULL)
12639 mpp = &((*mpp)->b_next);
12641 ip1dbg(("ill_dlpi_send: deferring request for %s "
12642 "while %s pending\n", ill->ill_name,
12643 dl_primstr(ill->ill_dlpi_pending)));
12645 *mpp = mp;
12646 mutex_exit(&ill->ill_lock);
12647 return;
12649 mutex_exit(&ill->ill_lock);
12650 ill_dlpi_dispatch(ill, mp);
12653 void
12654 ill_capability_send(ill_t *ill, mblk_t *mp)
12656 ill->ill_capab_pending_cnt++;
12657 ill_dlpi_send(ill, mp);
12660 void
12661 ill_capability_done(ill_t *ill)
12663 ASSERT(ill->ill_capab_pending_cnt != 0);
12665 ill_dlpi_done(ill, DL_CAPABILITY_REQ);
12667 ill->ill_capab_pending_cnt--;
12668 if (ill->ill_capab_pending_cnt == 0 &&
12669 ill->ill_dlpi_capab_state == IDCS_OK)
12670 ill_capability_reset_alloc(ill);
12674 * Send all deferred DLPI messages without waiting for their ACKs.
12676 void
12677 ill_dlpi_send_deferred(ill_t *ill)
12679 mblk_t *mp, *nextmp;
12682 * Clear ill_dlpi_pending so that the message is not queued in
12683 * ill_dlpi_send().
12685 mutex_enter(&ill->ill_lock);
12686 ill->ill_dlpi_pending = DL_PRIM_INVAL;
12687 mp = ill->ill_dlpi_deferred;
12688 ill->ill_dlpi_deferred = NULL;
12689 mutex_exit(&ill->ill_lock);
12691 for (; mp != NULL; mp = nextmp) {
12692 nextmp = mp->b_next;
12693 mp->b_next = NULL;
12694 ill_dlpi_send(ill, mp);
12699 * Clear all the deferred DLPI messages. Called on receiving an M_ERROR
12700 * or M_HANGUP
12702 static void
12703 ill_dlpi_clear_deferred(ill_t *ill)
12705 mblk_t *mp, *nextmp;
12707 mutex_enter(&ill->ill_lock);
12708 ill->ill_dlpi_pending = DL_PRIM_INVAL;
12709 mp = ill->ill_dlpi_deferred;
12710 ill->ill_dlpi_deferred = NULL;
12711 mutex_exit(&ill->ill_lock);
12713 for (; mp != NULL; mp = nextmp) {
12714 nextmp = mp->b_next;
12715 inet_freemsg(mp);
12720 * Check if the DLPI primitive `prim' is pending; print a warning if not.
12722 boolean_t
12723 ill_dlpi_pending(ill_t *ill, t_uscalar_t prim)
12725 t_uscalar_t pending;
12727 mutex_enter(&ill->ill_lock);
12728 if (ill->ill_dlpi_pending == prim) {
12729 mutex_exit(&ill->ill_lock);
12730 return (B_TRUE);
12734 * During teardown, ill_dlpi_dispatch() will send DLPI requests
12735 * without waiting, so don't print any warnings in that case.
12737 if (ill->ill_state_flags & ILL_CONDEMNED) {
12738 mutex_exit(&ill->ill_lock);
12739 return (B_FALSE);
12741 pending = ill->ill_dlpi_pending;
12742 mutex_exit(&ill->ill_lock);
12744 if (pending == DL_PRIM_INVAL) {
12745 (void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE,
12746 "received unsolicited ack for %s on %s\n",
12747 dl_primstr(prim), ill->ill_name);
12748 } else {
12749 (void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE,
12750 "received unexpected ack for %s on %s (expecting %s)\n",
12751 dl_primstr(prim), ill->ill_name, dl_primstr(pending));
12753 return (B_FALSE);
12757 * Complete the current DLPI operation associated with `prim' on `ill' and
12758 * start the next queued DLPI operation (if any). If there are no queued DLPI
12759 * operations and the ill's current exclusive IPSQ operation has finished
12760 * (i.e., ipsq_current_finish() was called), then clear ipsq_current_ipif to
12761 * allow the next exclusive IPSQ operation to begin upon ipsq_exit(). See
12762 * the comments above ipsq_current_finish() for details.
12764 void
12765 ill_dlpi_done(ill_t *ill, t_uscalar_t prim)
12767 mblk_t *mp;
12768 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq;
12769 ipxop_t *ipx = ipsq->ipsq_xop;
12771 ASSERT(IAM_WRITER_IPSQ(ipsq));
12772 mutex_enter(&ill->ill_lock);
12774 ASSERT(prim != DL_PRIM_INVAL);
12775 ASSERT(ill->ill_dlpi_pending == prim);
12777 ip1dbg(("ill_dlpi_done: %s has completed %s (%u)\n", ill->ill_name,
12778 dl_primstr(ill->ill_dlpi_pending), ill->ill_dlpi_pending));
12780 if ((mp = ill->ill_dlpi_deferred) == NULL) {
12781 ill->ill_dlpi_pending = DL_PRIM_INVAL;
12782 if (ipx->ipx_current_done) {
12783 mutex_enter(&ipx->ipx_lock);
12784 ipx->ipx_current_ipif = NULL;
12785 mutex_exit(&ipx->ipx_lock);
12787 cv_signal(&ill->ill_cv);
12788 mutex_exit(&ill->ill_lock);
12789 return;
12792 ill->ill_dlpi_deferred = mp->b_next;
12793 mp->b_next = NULL;
12794 mutex_exit(&ill->ill_lock);
12796 ill_dlpi_dispatch(ill, mp);
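/*
 * User-space sketch of the single-outstanding-request discipline that
 * ill_dlpi_send() and ill_dlpi_done() implement above: one request may be
 * in flight, later ones are queued in FIFO order, and completing the
 * current request dispatches the next. The channel_t/req_t types and the
 * dispatch() callback are hypothetical; the kernel's locking is elided.
 */
#include <stddef.h>

typedef struct req { struct req *next; } req_t;

typedef struct channel {
	req_t	*pending;	/* request currently awaiting its ack */
	req_t	*deferred;	/* FIFO of requests waiting their turn */
	void	(*dispatch)(req_t *);
} channel_t;

static void
chan_send(channel_t *ch, req_t *r)
{
	req_t **rpp;

	if (ch->pending != NULL) {
		/* busy: tail-insert and wait for chan_done() */
		for (rpp = &ch->deferred; *rpp != NULL; rpp = &(*rpp)->next)
			;
		*rpp = r;
		return;
	}
	ch->pending = r;
	ch->dispatch(r);
}

static void
chan_done(channel_t *ch)
{
	req_t *r;

	if ((r = ch->deferred) == NULL) {
		ch->pending = NULL;	/* nothing queued: go idle */
		return;
	}
	ch->deferred = r->next;
	r->next = NULL;
	ch->pending = r;
	ch->dispatch(r);		/* start the next queued request */
}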
12800 * Queue a (multicast) DLPI control message to be sent to the driver by
12801 * later calling ill_dlpi_send_queued.
12802 * We queue them while holding a lock (ill_mcast_lock) to ensure that they
12803  * are sent in order, i.e., prevent a DL_DISABMULTI_REQ and a
12804  * DL_ENABMULTI_REQ for the same group from racing.
12805 * We send DLPI control messages in order using ill_lock.
12806 * For IPMP we should be called on the cast_ill.
12808 void
12809 ill_dlpi_queue(ill_t *ill, mblk_t *mp)
12811 mblk_t **mpp;
12813 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);
12815 mutex_enter(&ill->ill_lock);
12816 /* Must queue message. Tail insertion */
12817 mpp = &ill->ill_dlpi_deferred;
12818 while (*mpp != NULL)
12819 mpp = &((*mpp)->b_next);
12821 *mpp = mp;
12822 mutex_exit(&ill->ill_lock);
12826 * Send the messages that were queued. Make sure there is only
12827 * one outstanding message. ip_rput_dlpi_writer calls ill_dlpi_done()
12828 * when an ACK or a NAK is received to process the next queued message.
12829  * For IPMP we are called on the upper ill, but we send what is queued
12830  * on the cast_ill.
12832 void
12833 ill_dlpi_send_queued(ill_t *ill)
12835 mblk_t *mp;
12836 union DL_primitives *dlp;
12837 t_uscalar_t prim;
12838 ill_t *release_ill = NULL;
12840 if (IS_IPMP(ill)) {
12841 /* On the upper IPMP ill. */
12842 release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp);
12843 if (release_ill == NULL) {
12844 /* Avoid ever sending anything down to the ipmpstub */
12845 return;
12847 ill = release_ill;
12849 mutex_enter(&ill->ill_lock);
12850 while ((mp = ill->ill_dlpi_deferred) != NULL) {
12851 if (ill->ill_dlpi_pending != DL_PRIM_INVAL) {
12852 /* Can't send. Somebody else will send it */
12853 mutex_exit(&ill->ill_lock);
12854 goto done;
12856 ill->ill_dlpi_deferred = mp->b_next;
12857 mp->b_next = NULL;
12858 if (!ill->ill_dl_up) {
12860 * Nobody there. All multicast addresses will be
12861 * re-joined when we get the DL_BIND_ACK bringing the
12862 * interface up.
12864 freemsg(mp);
12865 continue;
12867 dlp = (union DL_primitives *)mp->b_rptr;
12868 prim = dlp->dl_primitive;
12870 if (!(ill->ill_state_flags & ILL_CONDEMNED) ||
12871 (prim == DL_UNBIND_REQ)) {
12872 ill->ill_dlpi_pending = prim;
12874 mutex_exit(&ill->ill_lock);
12876 DTRACE_PROBE3(ill__dlpi, char *, "ill_dlpi_send_queued",
12877 char *, dl_primstr(prim), ill_t *, ill);
12878 putnext(ill->ill_wq, mp);
12879 mutex_enter(&ill->ill_lock);
12881 mutex_exit(&ill->ill_lock);
12882 done:
12883 if (release_ill != NULL)
12884 ill_refrele(release_ill);
12888 * Queue an IP (IGMP/MLD) message to be sent by IP from
12889 * ill_mcast_send_queued
12890 * We queue them while holding a lock (ill_mcast_lock) to ensure that they
12891  * are sent in order, i.e., prevent an IGMP leave and an IGMP join for
12892  * the same group from racing.
12893 * We send them in order using ill_lock.
12894 * For IPMP we are called on the upper ill, but we queue on the cast_ill.
12896 void
12897 ill_mcast_queue(ill_t *ill, mblk_t *mp)
12899 mblk_t **mpp;
12900 ill_t *release_ill = NULL;
12902 ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock));
12904 if (IS_IPMP(ill)) {
12905 /* On the upper IPMP ill. */
12906 release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp);
12907 if (release_ill == NULL) {
12908 /* Discard instead of queuing for the ipmp interface */
12909 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
12910 ip_drop_output("ipIfStatsOutDiscards - no cast_ill",
12911 mp, ill);
12912 freemsg(mp);
12913 return;
12915 ill = release_ill;
12918 mutex_enter(&ill->ill_lock);
12919 /* Must queue message. Tail insertion */
12920 mpp = &ill->ill_mcast_deferred;
12921 while (*mpp != NULL)
12922 mpp = &((*mpp)->b_next);
12924 *mpp = mp;
12925 mutex_exit(&ill->ill_lock);
12926 if (release_ill != NULL)
12927 ill_refrele(release_ill);
12931 * Send the IP packets that were queued by ill_mcast_queue.
12932 * These are IGMP/MLD packets.
12934  * For IPMP we are called on the upper ill, but we send what is queued
12935  * on the cast_ill.
12937  * Request loopback of the report if we are acting as a multicast
12938  * router, so that the process-level routing daemon can hear it.
12939  * This will run multiple times for the same group if there are members
12940  * on the same group for multiple ipifs on the same ill. The
12941  * igmp_input/mld_input code will suppress duplicates due to the
12942  * loopback, thus we always loop back the membership report.
12944 * We also need to make sure that this does not get load balanced
12945 * by IPMP. We do this by passing an ill to ip_output_simple.
12947 void
12948 ill_mcast_send_queued(ill_t *ill)
12950 mblk_t *mp;
12951 ip_xmit_attr_t ixas;
12952 ill_t *release_ill = NULL;
12954 if (IS_IPMP(ill)) {
12955 /* On the upper IPMP ill. */
12956 release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp);
12957 if (release_ill == NULL) {
12959  * We should have no messages queued on the ipmp interface,
12960  * but there is no point in trying to send any.
12962 return;
12964 ill = release_ill;
12966 bzero(&ixas, sizeof (ixas));
12967 ixas.ixa_zoneid = ALL_ZONES;
12968 ixas.ixa_cred = kcred;
12969 ixas.ixa_cpid = NOPID;
12971  * Here we set ixa_ifindex. For IPMP it will be the lower ill, which
12972  * makes ip_select_route pick the IRE_MULTICAST for the cast_ill.
12973 * That is necessary to handle IGMP/MLD snooping switches.
12975 ixas.ixa_ifindex = ill->ill_phyint->phyint_ifindex;
12976 ixas.ixa_ipst = ill->ill_ipst;
12978 mutex_enter(&ill->ill_lock);
12979 while ((mp = ill->ill_mcast_deferred) != NULL) {
12980 ill->ill_mcast_deferred = mp->b_next;
12981 mp->b_next = NULL;
12982 if (!ill->ill_dl_up) {
12984 * Nobody there. Just drop the ip packets.
12985 * IGMP/MLD will resend later, if this is a replumb.
12987 freemsg(mp);
12988 continue;
12990 mutex_enter(&ill->ill_phyint->phyint_lock);
12991 if (IS_UNDER_IPMP(ill) && !ipmp_ill_is_active(ill)) {
12993 * When the ill is getting deactivated, we only want to
12994 * send the DLPI messages, so drop IGMP/MLD packets.
12995 * DLPI messages are handled by ill_dlpi_send_queued()
12997 mutex_exit(&ill->ill_phyint->phyint_lock);
12998 freemsg(mp);
12999 continue;
13001 mutex_exit(&ill->ill_phyint->phyint_lock);
13002 mutex_exit(&ill->ill_lock);
13004 /* Check whether we are sending IPv4 or IPv6. */
13005 if (ill->ill_isv6) {
13006 ip6_t *ip6h = (ip6_t *)mp->b_rptr;
13008 ixas.ixa_multicast_ttl = ip6h->ip6_hops;
13009 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6;
13010 } else {
13011 ipha_t *ipha = (ipha_t *)mp->b_rptr;
13013 ixas.ixa_multicast_ttl = ipha->ipha_ttl;
13014 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
13015 ixas.ixa_flags &= ~IXAF_SET_ULP_CKSUM;
13017 ixas.ixa_flags &= ~IXAF_VERIFY_SOURCE;
13018 ixas.ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_SOURCE;
13019 (void) ip_output_simple(mp, &ixas);
13020 ixa_cleanup(&ixas);
13022 mutex_enter(&ill->ill_lock);
13024 mutex_exit(&ill->ill_lock);
13026 done:
13027 if (release_ill != NULL)
13028 ill_refrele(release_ill);
13032 * Take down a specific interface, but don't lose any information about it.
13033 * (Always called as writer.)
13034 * This function goes through the down sequence even if the interface is
13035 * already down. There are 2 reasons.
13036 * a. Currently we permit interface routes that depend on down interfaces
13037 * to be added. This behaviour itself is questionable. However it appears
13038 * that both Solaris and 4.3 BSD have exhibited this behaviour for a long
13039 * time. We go thru the cleanup in order to remove these routes.
13040 * b. The bringup of the interface could fail in ill_dl_up i.e. we get
13041 * DL_ERROR_ACK in response to the DL_BIND request. The interface is
13042 * down, but we need to cleanup i.e. do ill_dl_down and
13043 * ip_rput_dlpi_writer (DL_ERROR_ACK) -> ipif_down.
13045 * IP-MT notes:
13047 * Model of reference to interfaces.
13049 * The following members in ipif_t track references to the ipif.
13050 * int ipif_refcnt; Active reference count
13052 * The following members in ill_t track references to the ill.
13053 * int ill_refcnt; active refcnt
13054 * uint_t ill_ire_cnt; Number of ires referencing ill
13055 * uint_t ill_ncec_cnt; Number of ncecs referencing ill
13056 * uint_t ill_nce_cnt; Number of nces referencing ill
13057 * uint_t ill_ilm_cnt; Number of ilms referencing ill
13059 * Reference to an ipif or ill can be obtained in any of the following ways.
13061 * Through the lookup functions ipif_lookup_* / ill_lookup_* functions
13062 * Pointers to ipif / ill from other data structures viz ire and conn.
13063 * Implicit reference to the ipif / ill by holding a reference to the ire.
13065 * The ipif/ill lookup functions return a reference held ipif / ill.
13066 * ipif_refcnt and ill_refcnt track the reference counts respectively.
13067 * This is a purely dynamic reference count associated with threads holding
13068 * references to the ipif / ill. Pointers from other structures do not
13069 * count towards this reference count.
13071 * ill_ire_cnt is the number of ire's associated with the
13072 * ill. This is incremented whenever a new ire is created referencing the
13073 * ill. This is done atomically inside ire_add_v[46] where the ire is
13074 * actually added to the ire hash table. The count is decremented in
13075 * ire_inactive where the ire is destroyed.
13077 * ill_ncec_cnt is the number of ncec's referencing the ill thru ncec_ill.
13078 * This is incremented atomically in
13079 * ndp_add_v4()/ndp_add_v6() where the nce is actually added to the
13080 * table. Similarly it is decremented in ncec_inactive() where the ncec
13081 * is destroyed.
13083 * ill_nce_cnt is the number of nce's referencing the ill thru nce_ill. This is
13084 * incremented atomically in nce_add() where the nce is actually added to the
13085 * ill_nce. Similarly it is decremented in nce_inactive() where the nce
13086 * is destroyed.
13088 * ill_ilm_cnt is the ilm's reference to the ill. It is incremented in
13089 * ilm_add() and decremented before the ilm is freed in ilm_delete().
13091 * Flow of ioctls involving interface down/up
13093 * The following is the sequence of an attempt to set some critical flags on an
13094 * up interface.
13095 * ip_sioctl_flags
13096 * ipif_down
13097 * wait for ipif to be quiescent
13098 * ipif_down_tail
13099 * ip_sioctl_flags_tail
13101 * All set ioctls that involve down/up sequence would have a skeleton similar
13102 * to the above. All the *tail functions are called after the refcounts have
13103 * dropped to the appropriate values.
13105 * SIOC ioctls during the IPIF_CHANGING interval.
13107 * Threads handling SIOC set ioctls serialize on the squeue, but this
13108 * is not done for SIOC get ioctls. Since a set ioctl can cause several
13109 * steps of internal changes to the state, some of which are visible in
13110 * ipif_flags (such as IFF_UP being cleared and later set), and we want
13111 * the set ioctl to be atomic related to the get ioctls, the SIOC get code
13112 * will wait and restart ioctls if IPIF_CHANGING is set. The mblk is then
13113 * enqueued in the ipsq and the operation is restarted by ipsq_exit() when
13114 * the current exclusive operation completes. The IPIF_CHANGING check
13115 * and enqueue is atomic using the ill_lock and ipsq_lock. The
13116 * lookup is done holding the ill_lock. Hence the ill/ipif state flags can't
13117 * change while the ill_lock is held. Before dropping the ill_lock we acquire
13118 * the ipsq_lock and call ipsq_enq. This ensures that ipsq_exit can't finish
13119 * until we release the ipsq_lock, even though the ill/ipif state flags
13120 * can change after we drop the ill_lock.
13123 ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp)
13125 ill_t *ill = ipif->ipif_ill;
13126 conn_t *connp;
13127 boolean_t success;
13128 boolean_t ipif_was_up = B_FALSE;
13129 ip_stack_t *ipst = ill->ill_ipst;
13131 ASSERT(IAM_WRITER_IPIF(ipif));
13133 ip1dbg(("ipif_down(%s:%u)\n", ill->ill_name, ipif->ipif_id));
13135 DTRACE_PROBE3(ipif__downup, char *, "ipif_down",
13136 ill_t *, ill, ipif_t *, ipif);
13138 if (ipif->ipif_flags & IPIF_UP) {
13139 mutex_enter(&ill->ill_lock);
13140 ipif->ipif_flags &= ~IPIF_UP;
13141 ASSERT(ill->ill_ipif_up_count > 0);
13142 --ill->ill_ipif_up_count;
13143 mutex_exit(&ill->ill_lock);
13144 ipif_was_up = B_TRUE;
13145 /* Update status in SCTP's list */
13146 sctp_update_ipif(ipif, SCTP_IPIF_DOWN);
13147 ill_nic_event_dispatch(ipif->ipif_ill,
13148 MAP_IPIF_ID(ipif->ipif_id), NE_LIF_DOWN, NULL, 0);
13152 * Removal of the last ipif from an ill may result in a DL_UNBIND
13153 * being sent to the driver, and we must not send any data packets to
13154 * the driver after the DL_UNBIND_REQ. To ensure this, all the
13155 * ire and nce entries used in the data path will be cleaned
13156 * up, and we also set the ILL_DOWN_IN_PROGRESS bit to make
13157  * sure no new entries will be added until the ill is bound
13158 * again. The ILL_DOWN_IN_PROGRESS bit is turned off upon
13159 * receipt of a DL_BIND_ACK.
13161 if (ill->ill_wq != NULL && !ill->ill_logical_down &&
13162 ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0 &&
13163 ill->ill_dl_up) {
13164 ill->ill_state_flags |= ILL_DOWN_IN_PROGRESS;
13168 * Blow away memberships we established in ipif_multicast_up().
13170 ipif_multicast_down(ipif);
13173 * Remove from the mapping for __sin6_src_id. We insert only
13174 * when the address is not INADDR_ANY. As IPv4 addresses are
13175 * stored as mapped addresses, we need to check for mapped
13176 * INADDR_ANY also.
13178 if (ipif_was_up && !IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) &&
13179 !IN6_IS_ADDR_V4MAPPED_ANY(&ipif->ipif_v6lcl_addr) &&
13180 !(ipif->ipif_flags & IPIF_NOLOCAL)) {
13181 int err;
13183 err = ip_srcid_remove(&ipif->ipif_v6lcl_addr,
13184 ipif->ipif_zoneid, ipst);
13185 if (err != 0) {
13186 ip0dbg(("ipif_down: srcid_remove %d\n", err));
13190 if (ipif_was_up) {
13191 /* only delete if we'd added ire's before */
13192 if (ipif->ipif_isv6)
13193 ipif_delete_ires_v6(ipif);
13194 else
13195 ipif_delete_ires_v4(ipif);
13198 if (ipif_was_up && ill->ill_ipif_up_count == 0) {
13200 * Since the interface is now down, it may have just become
13201 * inactive. Note that this needs to be done even for a
13202  * ipif_logical_down(), or ARP entries will not get correctly
13203 * restored when the interface comes back up.
13205 if (IS_UNDER_IPMP(ill))
13206 ipmp_ill_refresh_active(ill);
13210  * Delete the neighbor-discovery or arp entries for this interface. The ipif
13211  * has to be quiesced, so we walk all the nce's and delete those
13212  * that point at the ipif->ipif_ill. At the same time, we also
13213  * update IPMP so that ipifs for data addresses are unbound. We don't
13214  * call ipif_arp_down to DL_UNBIND the arp stream itself here, but defer
13215  * that to ipif_down_tail().
13217 ipif_nce_down(ipif);
13220 * If this is the last ipif on the ill, we also need to remove
13221 * any IREs with ire_ill set. Otherwise ipif_is_quiescent() will
13222 * never succeed.
13224 if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0)
13225 ire_walk_ill(0, 0, ill_downi, ill, ill);
13228 * Walk all CONNs that can have a reference on an ire for this
13229 * ipif (we actually walk all that now have stale references).
13231 ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ipst);
13234 * If mp is NULL the caller will wait for the appropriate refcnt.
13235 * Eg. ip_sioctl_removeif -> ipif_free -> ipif_down
13236 * and ill_delete -> ipif_free -> ipif_down
13238 if (mp == NULL) {
13239 ASSERT(q == NULL);
13240 return (0);
13243 if (CONN_Q(q)) {
13244 connp = Q_TO_CONN(q);
13245 mutex_enter(&connp->conn_lock);
13246 } else {
13247 connp = NULL;
13249 mutex_enter(&ill->ill_lock);
13251 * Are there any ire's pointing to this ipif that are still active ?
13252 * If this is the last ipif going down, are there any ire's pointing
13253 * to this ill that are still active ?
13255 if (ipif_is_quiescent(ipif)) {
13256 mutex_exit(&ill->ill_lock);
13257 if (connp != NULL)
13258 mutex_exit(&connp->conn_lock);
13259 return (0);
13262 ip1dbg(("ipif_down: need to wait, adding pending mp %s ill %p",
13263 ill->ill_name, (void *)ill));
13265 * Enqueue the mp atomically in ipsq_pending_mp. When the refcount
13266 * drops down, the operation will be restarted by ipif_ill_refrele_tail
13267 * which in turn is called by the last refrele on the ipif/ill/ire.
13269 success = ipsq_pending_mp_add(connp, ipif, q, mp, IPIF_DOWN);
13270 if (!success) {
13271 /* The conn is closing. So just return */
13272 ASSERT(connp != NULL);
13273 mutex_exit(&ill->ill_lock);
13274 mutex_exit(&connp->conn_lock);
13275 return (EINTR);
13278 mutex_exit(&ill->ill_lock);
13279 if (connp != NULL)
13280 mutex_exit(&connp->conn_lock);
13281 return (EINPROGRESS);
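/*
 * Hedged sketch of the quiesce-or-defer pattern that ipif_down() ends
 * with: if the object is already quiescent the caller can finish at once;
 * otherwise a continuation is parked and EINPROGRESS tells the caller the
 * operation will be restarted when the last reference is released. All
 * names below are hypothetical stand-ins, with locking omitted.
 */
#include <errno.h>
#include <stddef.h>

typedef struct obj {
	int	refcnt;
	void	(*cont)(struct obj *);	/* parked continuation, if any */
} obj_t;

static int
obj_teardown(obj_t *o, void (*cont)(obj_t *))
{
	if (o->refcnt == 0)
		return (0);		/* quiescent: finish synchronously */
	o->cont = cont;			/* park the rest of the work */
	return (EINPROGRESS);		/* last refrele will restart it */
}

static void
obj_refrele(obj_t *o)
{
	if (--o->refcnt == 0 && o->cont != NULL)
		o->cont(o);		/* run the deferred tail */
}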
13285 ipif_down_tail(ipif_t *ipif)
13287 ill_t *ill = ipif->ipif_ill;
13288 int err = 0;
13290 DTRACE_PROBE3(ipif__downup, char *, "ipif_down_tail",
13291 ill_t *, ill, ipif_t *, ipif);
13294 * Skip any loopback interface (null wq).
13295  * If this is the last logical interface on the ill,
13296  * have ill_dl_down tell the driver we are gone (unbind).
13297  * Note that lun 0 can ipif_down even though
13298  * there are other logical units that are up.
13299 * This occurs e.g. when we change a "significant" IFF_ flag.
13301 if (ill->ill_wq != NULL && !ill->ill_logical_down &&
13302 ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0 &&
13303 ill->ill_dl_up) {
13304 ill_dl_down(ill);
13306 if (!ipif->ipif_isv6)
13307 err = ipif_arp_down(ipif);
13309 ill->ill_logical_down = 0;
13311 ip_rts_ifmsg(ipif, RTSQ_DEFAULT);
13312 ip_rts_newaddrmsg(RTM_DELETE, 0, ipif, RTSQ_DEFAULT);
13313 return (err);
13317 * Bring interface logically down without bringing the physical interface
13318  * down, e.g. when the netmask is changed. This avoids long-lasting link
13319  * negotiations between an ethernet interface and certain switches.
13321 static int
13322 ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp)
13324 DTRACE_PROBE3(ipif__downup, char *, "ipif_logical_down",
13325 ill_t *, ipif->ipif_ill, ipif_t *, ipif);
13328 * The ill_logical_down flag is a transient flag. It is set here
13329 * and is cleared once the down has completed in ipif_down_tail.
13330 * This flag does not indicate whether the ill stream is in the
13331 * DL_BOUND state with the driver. Instead this flag is used by
13332 * ipif_down_tail to determine whether to DL_UNBIND the stream with
13333 * the driver. The state of the ill stream i.e. whether it is
13334 * DL_BOUND with the driver or not is indicated by the ill_dl_up flag.
13336 ipif->ipif_ill->ill_logical_down = 1;
13337 return (ipif_down(ipif, q, mp));
13341 * Initiate deallocate of an IPIF. Always called as writer. Called by
13342 * ill_delete or ip_sioctl_removeif.
13344 static void
13345 ipif_free(ipif_t *ipif)
13347 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
13349 ASSERT(IAM_WRITER_IPIF(ipif));
13351 if (ipif->ipif_recovery_id != 0)
13352 (void) untimeout(ipif->ipif_recovery_id);
13353 ipif->ipif_recovery_id = 0;
13356 * Take down the interface. We can be called either from ill_delete
13357 * or from ip_sioctl_removeif.
13359 (void) ipif_down(ipif, NULL, NULL);
13362 * Now that the interface is down, there's no chance it can still
13363 * become a duplicate. Cancel any timer that may have been set while
13364 * tearing down.
13366 if (ipif->ipif_recovery_id != 0)
13367 (void) untimeout(ipif->ipif_recovery_id);
13368 ipif->ipif_recovery_id = 0;
13370 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
13371 /* Remove pointers to this ill in the multicast routing tables */
13372 reset_mrt_vif_ipif(ipif);
13373 /* If necessary, clear the cached source ipif rotor. */
13374 if (ipif->ipif_ill->ill_src_ipif == ipif)
13375 ipif->ipif_ill->ill_src_ipif = NULL;
13376 rw_exit(&ipst->ips_ill_g_lock);
13379 static void
13380 ipif_free_tail(ipif_t *ipif)
13382 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
13385 * Need to hold both ill_g_lock and ill_lock while
13386 * inserting or removing an ipif from the linked list
13387 * of ipifs hanging off the ill.
13389 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
13391 #ifdef DEBUG
13392 ipif_trace_cleanup(ipif);
13393 #endif
13395  /* Ask SCTP to take it out of its list */
13396 sctp_update_ipif(ipif, SCTP_IPIF_REMOVE);
13397 ip_rts_newaddrmsg(RTM_FREEADDR, 0, ipif, RTSQ_DEFAULT);
13399 /* Get it out of the ILL interface list. */
13400 ipif_remove(ipif);
13401 rw_exit(&ipst->ips_ill_g_lock);
13403 ASSERT(!(ipif->ipif_flags & (IPIF_UP | IPIF_DUPLICATE)));
13404 ASSERT(ipif->ipif_recovery_id == 0);
13405 ASSERT(ipif->ipif_ire_local == NULL);
13406 ASSERT(ipif->ipif_ire_if == NULL);
13408 /* Free the memory. */
13409 mi_free(ipif);
13413 * Sets `buf' to an ipif name of the form "ill_name:id", or "ill_name" if "id"
13414 * is zero.
13416 void
13417 ipif_get_name(const ipif_t *ipif, char *buf, int len)
13419 char lbuf[LIFNAMSIZ];
13420 char *name;
13421 size_t name_len;
13423 buf[0] = '\0';
13424 name = ipif->ipif_ill->ill_name;
13425 name_len = ipif->ipif_ill->ill_name_length;
13426 if (ipif->ipif_id != 0) {
13427 (void) sprintf(lbuf, "%s%c%d", name, IPIF_SEPARATOR_CHAR,
13428 ipif->ipif_id);
13429 name = lbuf;
13430 name_len = mi_strlen(name) + 1;
13432 len -= 1;
13433 buf[len] = '\0';
13434 len = MIN(len, name_len);
13435 bcopy(name, buf, len);
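/*
 * A minimal sketch of the "ill_name[:id]" formatting that ipif_get_name()
 * performs above, using snprintf truncation in place of the explicit
 * terminate-and-bcopy dance. Purely illustrative.
 */
#include <stdio.h>

static void
format_lifname(const char *ill_name, unsigned int id, char *buf, size_t len)
{
	if (id == 0)
		(void) snprintf(buf, len, "%s", ill_name);
	else
		(void) snprintf(buf, len, "%s:%u", ill_name, id);
}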
13439 * Sets `buf' to an ill name.
13441 void
13442 ill_get_name(const ill_t *ill, char *buf, int len)
13444 char *name;
13445 size_t name_len;
13447 name = ill->ill_name;
13448 name_len = ill->ill_name_length;
13449 len -= 1;
13450 buf[len] = '\0';
13451 len = MIN(len, name_len);
13452 bcopy(name, buf, len);
13456 * Find an IPIF based on the name passed in. Names can be of the form <phys>
13457 * (e.g., le0) or <phys>:<#> (e.g., le0:1). When there is no colon, the
13458 * implied unit id is zero. <phys> must correspond to the name of an ILL.
13459 * (May be called as writer.)
13461 static ipif_t *
13462 ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc,
13463 boolean_t *exists, boolean_t isv6, zoneid_t zoneid, ip_stack_t *ipst)
13465 char *cp;
13466 char *endp;
13467 long id;
13468 ill_t *ill;
13469 ipif_t *ipif;
13470 uint_t ire_type;
13471 boolean_t did_alloc = B_FALSE;
13472 char last;
13475  * If the caller wants us to create the ipif, make sure we have a
13476  * valid zoneid.
13478 ASSERT(!do_alloc || zoneid != ALL_ZONES);
13480 if (namelen == 0) {
13481 return (NULL);
13484 *exists = B_FALSE;
13485 /* Look for a colon in the name. */
13486 endp = &name[namelen];
13487 for (cp = endp; --cp > name; ) {
13488 if (*cp == IPIF_SEPARATOR_CHAR)
13489 break;
13492 if (*cp == IPIF_SEPARATOR_CHAR) {
13494 * Reject any non-decimal aliases for logical
13495 * interfaces. Aliases with leading zeroes
13496 * are also rejected as they introduce ambiguity
13497 * in the naming of the interfaces.
12498  * In order to conform to existing semantics,
12499  * and to not break any programs/scripts relying
12500  * on that behaviour, if<0>:0 is considered to be
12501  * a valid interface.
12503  * If the alias has two or more digits and the first
12504  * is zero, fail.
13506 if (&cp[2] < endp && cp[1] == '0') {
13507 return (NULL);
13511 if (cp <= name) {
13512 cp = endp;
13514 last = *cp;
13515 *cp = '\0';
13518 * Look up the ILL, based on the portion of the name
13519  * before the colon. ill_lookup_on_name returns a held ill.
13520  * `did_alloc' indicates whether a new ill had to be
13521  * allocated (i.e., the ill did not exist already).
13523 ill = ill_lookup_on_name(name, do_alloc, isv6,
13524 &did_alloc, ipst);
13525 *cp = last;
13526 if (ill == NULL)
13527 return (NULL);
13529 /* Establish the unit number in the name. */
13530 id = 0;
13531 if (cp < endp && *endp == '\0') {
13532 /* If there was a colon, the unit number follows. */
13533 cp++;
13534 if (ddi_strtol(cp, NULL, 0, &id) != 0) {
13535 ill_refrele(ill);
13536 return (NULL);
13540 mutex_enter(&ill->ill_lock);
13541 /* Now see if there is an IPIF with this unit number. */
13542 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
13543 if (ipif->ipif_id == id) {
13544 if (zoneid != ALL_ZONES &&
13545 zoneid != ipif->ipif_zoneid &&
13546 ipif->ipif_zoneid != ALL_ZONES) {
13547 mutex_exit(&ill->ill_lock);
13548 ill_refrele(ill);
13549 return (NULL);
13551 if (IPIF_CAN_LOOKUP(ipif)) {
13552 ipif_refhold_locked(ipif);
13553 mutex_exit(&ill->ill_lock);
13554 if (!did_alloc)
13555 *exists = B_TRUE;
13557 * Drop locks before calling ill_refrele
13558 * since it can potentially call into
13559  * ipif_ill_refrele_tail, which can end up
13560  * trying to acquire any lock.
13562 ill_refrele(ill);
13563 return (ipif);
13568 if (!do_alloc) {
13569 mutex_exit(&ill->ill_lock);
13570 ill_refrele(ill);
13571 return (NULL);
13575 * If none found, atomically allocate and return a new one.
13576 * Historically, we used IRE_LOOPBACK only for lun 0, and IRE_LOCAL
13577 * to support "receive only" use of lo0:1 etc. as is still done
13578 * below as an initial guess.
13579  * However, this is now likely to be overridden later in ipif_up_done()
13580 * when we know for sure what address has been configured on the
13581 * interface, since we might have more than one loopback interface
13582 * with a loopback address, e.g. in the case of zones, and all the
13583 * interfaces with loopback addresses need to be marked IRE_LOOPBACK.
13585 if (ill->ill_net_type == IRE_LOOPBACK && id == 0)
13586 ire_type = IRE_LOOPBACK;
13587 else
13588 ire_type = IRE_LOCAL;
13589 ipif = ipif_allocate(ill, id, ire_type, B_TRUE, B_TRUE, NULL);
13590 if (ipif != NULL)
13591 ipif_refhold_locked(ipif);
13592 mutex_exit(&ill->ill_lock);
13593 ill_refrele(ill);
13594 return (ipif);
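/*
 * Sketch of the naming rules enforced above, as a standalone predicate:
 * a logical interface name is <phys>[:<id>] where <id> is decimal with no
 * leading zero (except the single digit "0" itself). So "hme0", "hme0:1"
 * and "hme0:0" parse, while "hme0:01" and "hme0:" are rejected. This is
 * an approximation for illustration, not the kernel's exact parser.
 */
#include <ctype.h>
#include <string.h>

static int
parse_lifname(const char *name, long *idp)
{
	const char *cp = strrchr(name, ':');

	*idp = 0;
	if (cp == NULL)
		return (1);		/* no colon: implied unit 0 */
	if (cp[1] == '\0' || (cp[1] == '0' && cp[2] != '\0'))
		return (0);		/* empty id, or leading zero */
	for (cp++; *cp != '\0'; cp++) {
		if (!isdigit((unsigned char)*cp))
			return (0);	/* reject non-decimal aliases */
		*idp = *idp * 10 + (*cp - '0');
	}
	return (1);
}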
13598 * Variant of the above that queues the request on the ipsq when
13599 * IPIF_CHANGING is set.
13601 static ipif_t *
13602 ipif_lookup_on_name_async(char *name, size_t namelen, boolean_t isv6,
13603 zoneid_t zoneid, queue_t *q, mblk_t *mp, ipsq_func_t func, int *error,
13604 ip_stack_t *ipst)
13606 char *cp;
13607 char *endp;
13608 long id;
13609 ill_t *ill;
13610 ipif_t *ipif;
13611 boolean_t did_alloc = B_FALSE;
13612 ipsq_t *ipsq;
13614 if (error != NULL)
13615 *error = 0;
13617 if (namelen == 0) {
13618 if (error != NULL)
13619 *error = ENXIO;
13620 return (NULL);
13623 /* Look for a colon in the name. */
13624 endp = &name[namelen];
13625 for (cp = endp; --cp > name; ) {
13626 if (*cp == IPIF_SEPARATOR_CHAR)
13627 break;
13630 if (*cp == IPIF_SEPARATOR_CHAR) {
13632 * Reject any non-decimal aliases for logical
13633 * interfaces. Aliases with leading zeroes
13634 * are also rejected as they introduce ambiguity
13635 * in the naming of the interfaces.
13636 * In order to conform to existing semantics,
13637 * and to not break any programs/scripts relying
13638 * on that behaviour, if<0>:0 is considered to be
13639 * a valid interface.
13641 * If alias has two or more digits and the first
13642 * is zero, fail.
13644 if (&cp[2] < endp && cp[1] == '0') {
13645 if (error != NULL)
13646 *error = EINVAL;
13647 return (NULL);
13651 if (cp <= name) {
13652 cp = endp;
13653 } else {
13654 *cp = '\0';
13658 * Look up the ILL, based on the portion of the name
13659 * before the colon. ill_lookup_on_name returns a held ill.
13660 * did_alloc is a temporary used to check whether the ill
13661 * exists already; if so, ill_lookup_on_name will clear it.
13663 ill = ill_lookup_on_name(name, B_FALSE, isv6, &did_alloc, ipst);
13664 if (cp != endp)
13665 *cp = IPIF_SEPARATOR_CHAR;
13666 if (ill == NULL)
13667 return (NULL);
13669 /* Establish the unit number in the name. */
13670 id = 0;
13671 if (cp < endp && *endp == '\0') {
13672 /* If there was a colon, the unit number follows. */
13673 cp++;
13674 if (ddi_strtol(cp, NULL, 0, &id) != 0) {
13675 ill_refrele(ill);
13676 if (error != NULL)
13677 *error = ENXIO;
13678 return (NULL);
13682 GRAB_CONN_LOCK(q);
13683 mutex_enter(&ill->ill_lock);
13684 /* Now see if there is an IPIF with this unit number. */
13685 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
13686 if (ipif->ipif_id == id) {
13687 if (zoneid != ALL_ZONES &&
13688 zoneid != ipif->ipif_zoneid &&
13689 ipif->ipif_zoneid != ALL_ZONES) {
13690 mutex_exit(&ill->ill_lock);
13691 RELEASE_CONN_LOCK(q);
13692 ill_refrele(ill);
13693 if (error != NULL)
13694 *error = ENXIO;
13695 return (NULL);
13698 if (!(IPIF_IS_CHANGING(ipif) ||
13699 IPIF_IS_CONDEMNED(ipif)) ||
13700 IAM_WRITER_IPIF(ipif)) {
13701 ipif_refhold_locked(ipif);
13702 mutex_exit(&ill->ill_lock);
13704 * Drop locks before calling ill_refrele
13705 * since it can potentially call into
13706 * ipif_ill_refrele_tail which can end up
13707 * in trying to acquire any lock.
13709 RELEASE_CONN_LOCK(q);
13710 ill_refrele(ill);
13711 return (ipif);
13712 } else if (q != NULL && !IPIF_IS_CONDEMNED(ipif)) {
13713 ipsq = ill->ill_phyint->phyint_ipsq;
13714 mutex_enter(&ipsq->ipsq_lock);
13715 mutex_enter(&ipsq->ipsq_xop->ipx_lock);
13716 mutex_exit(&ill->ill_lock);
13717 ipsq_enq(ipsq, q, mp, func, NEW_OP, ill);
13718 mutex_exit(&ipsq->ipsq_xop->ipx_lock);
13719 mutex_exit(&ipsq->ipsq_lock);
13720 RELEASE_CONN_LOCK(q);
13721 ill_refrele(ill);
13722 if (error != NULL)
13723 *error = EINPROGRESS;
13724 return (NULL);
13728 RELEASE_CONN_LOCK(q);
13729 mutex_exit(&ill->ill_lock);
13730 ill_refrele(ill);
13731 if (error != NULL)
13732 *error = ENXIO;
13733 return (NULL);
13737 * This routine is called whenever a new address comes up on an ipif. If
13738 * we are configured to respond to address mask requests, then we are supposed
13739 * to broadcast an address mask reply at this time. This routine is also
13740 * called if we are already up, but a netmask change is made. This is legal
13741 * but might not make the system manager very popular. (May be called
13742 * as writer.)
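* The reply is a minimal IPv4 datagram, laid out as sketched below
* (all values taken from the code that follows):
*
*	ipha_t	src = ipif_lcl_addr, dst = ipif_brd_addr,
*		ttl = ips_ip_broadcast_ttl
*	icmph_t	type = ICMP_ADDRESS_MASK_REPLY
*	4 bytes	the netmask (ipif_net_mask)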
13744 void
13745 ipif_mask_reply(ipif_t *ipif)
13747 icmph_t *icmph;
13748 ipha_t *ipha;
13749 mblk_t *mp;
13750 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
13751 ip_xmit_attr_t ixas;
13753 #define REPLY_LEN (sizeof (icmp_ipha) + sizeof (icmph_t) + IP_ADDR_LEN)
13755 if (!ipst->ips_ip_respond_to_address_mask_broadcast)
13756 return;
13758 /* ICMP mask reply is IPv4 only */
13759 ASSERT(!ipif->ipif_isv6);
13760 /* ICMP mask reply is not for a loopback interface */
13761 ASSERT(ipif->ipif_ill->ill_wq != NULL);
13763 if (ipif->ipif_lcl_addr == INADDR_ANY)
13764 return;
13766 mp = allocb(REPLY_LEN, BPRI_HI);
13767 if (mp == NULL)
13768 return;
13769 mp->b_wptr = mp->b_rptr + REPLY_LEN;
13771 ipha = (ipha_t *)mp->b_rptr;
13772 bzero(ipha, REPLY_LEN);
13773 *ipha = icmp_ipha;
13774 ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl;
13775 ipha->ipha_src = ipif->ipif_lcl_addr;
13776 ipha->ipha_dst = ipif->ipif_brd_addr;
13777 ipha->ipha_length = htons(REPLY_LEN);
13778 ipha->ipha_ident = 0;
13780 icmph = (icmph_t *)&ipha[1];
13781 icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY;
13782 bcopy(&ipif->ipif_net_mask, &icmph[1], IP_ADDR_LEN);
13783 icmph->icmph_checksum = IP_CSUM(mp, sizeof (ipha_t), 0);
13785 bzero(&ixas, sizeof (ixas));
13786 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
13787 ixas.ixa_zoneid = ALL_ZONES;
13788 ixas.ixa_ifindex = 0;
13789 ixas.ixa_ipst = ipst;
13790 ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
13791 (void) ip_output_simple(mp, &ixas);
13792 ixa_cleanup(&ixas);
13793 #undef REPLY_LEN
13797 * Join the ipif specific multicast groups.
13798 * Must be called after a mapping has been set up in the resolver. (Always
13799 * called as writer.)
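* For IPv4 this is the all-hosts group 224.0.0.1.  For IPv6 it is
* the all-hosts group ff02::1 plus the solicited-node group formed
* from the low-order bits of the local address (RFC 4291); e.g. a
* hypothetical address 2001:db8::1234:5678 would join
* ff02::1:ff34:5678.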
13801 void
13802 ipif_multicast_up(ipif_t *ipif)
13804 int err;
13805 ill_t *ill;
13806 ilm_t *ilm;
13808 ASSERT(IAM_WRITER_IPIF(ipif));
13810 ill = ipif->ipif_ill;
13812 ip1dbg(("ipif_multicast_up\n"));
13813 if (!(ill->ill_flags & ILLF_MULTICAST) ||
13814 ipif->ipif_allhosts_ilm != NULL)
13815 return;
13817 if (ipif->ipif_isv6) {
13818 in6_addr_t v6allmc = ipv6_all_hosts_mcast;
13819 in6_addr_t v6solmc = ipv6_solicited_node_mcast;
13821 v6solmc.s6_addr32[3] |= ipif->ipif_v6lcl_addr.s6_addr32[3];
13823 if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr))
13824 return;
13826 ip1dbg(("ipif_multicast_up - addmulti\n"));
13829 * Join the all hosts multicast address. We skip this for
13830 * underlying IPMP interfaces since they should be invisible.
13832 if (!IS_UNDER_IPMP(ill)) {
13833 ilm = ip_addmulti(&v6allmc, ill, ipif->ipif_zoneid,
13834 &err);
13835 if (ilm == NULL) {
13836 ASSERT(err != 0);
13837 ip0dbg(("ipif_multicast_up: "
13838 "all_hosts_mcast failed %d\n", err));
13839 return;
13841 ipif->ipif_allhosts_ilm = ilm;
13845 * Enable multicast for the solicited node multicast address.
13846 * If IPMP we need to put the membership on the upper ill.
13848 if (!(ipif->ipif_flags & IPIF_NOLOCAL)) {
13849 ill_t *mcast_ill = NULL;
13850 boolean_t need_refrele;
13852 if (IS_UNDER_IPMP(ill) &&
13853 (mcast_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) {
13854 need_refrele = B_TRUE;
13855 } else {
13856 mcast_ill = ill;
13857 need_refrele = B_FALSE;
13860 ilm = ip_addmulti(&v6solmc, mcast_ill,
13861 ipif->ipif_zoneid, &err);
13862 if (need_refrele)
13863 ill_refrele(mcast_ill);
13865 if (ilm == NULL) {
13866 ASSERT(err != 0);
13867 ip0dbg(("ipif_multicast_up: solicited MC"
13868 " failed %d\n", err));
13869 if ((ilm = ipif->ipif_allhosts_ilm) != NULL) {
13870 ipif->ipif_allhosts_ilm = NULL;
13871 (void) ip_delmulti(ilm);
13873 return;
13875 ipif->ipif_solmulti_ilm = ilm;
13877 } else {
13878 in6_addr_t v6group;
13880 if (ipif->ipif_lcl_addr == INADDR_ANY || IS_UNDER_IPMP(ill))
13881 return;
13883 /* Join the all hosts multicast address */
13884 ip1dbg(("ipif_multicast_up - addmulti\n"));
13885 IN6_IPADDR_TO_V4MAPPED(htonl(INADDR_ALLHOSTS_GROUP), &v6group);
13887 ilm = ip_addmulti(&v6group, ill, ipif->ipif_zoneid, &err);
13888 if (ilm == NULL) {
13889 ASSERT(err != 0);
13890 ip0dbg(("ipif_multicast_up: failed %d\n", err));
13891 return;
13893 ipif->ipif_allhosts_ilm = ilm;
13898 * Blow away any multicast groups that we joined in ipif_multicast_up().
13899 * (ilms from explicit memberships are handled in conn_update_ill.)
13901 void
13902 ipif_multicast_down(ipif_t *ipif)
13904 ASSERT(IAM_WRITER_IPIF(ipif));
13906 ip1dbg(("ipif_multicast_down\n"));
13908 if (ipif->ipif_allhosts_ilm != NULL) {
13909 (void) ip_delmulti(ipif->ipif_allhosts_ilm);
13910 ipif->ipif_allhosts_ilm = NULL;
13912 if (ipif->ipif_solmulti_ilm != NULL) {
13913 (void) ip_delmulti(ipif->ipif_solmulti_ilm);
13914 ipif->ipif_solmulti_ilm = NULL;
13919 * Used when an interface comes up to recreate any extra routes on this
13920 * interface.
13923 ill_recover_saved_ire(ill_t *ill)
13925 mblk_t *mp;
13926 ip_stack_t *ipst = ill->ill_ipst;
13928 ip1dbg(("ill_recover_saved_ire(%s)", ill->ill_name));
13930 mutex_enter(&ill->ill_saved_ire_lock);
13931 for (mp = ill->ill_saved_ire_mp; mp != NULL; mp = mp->b_cont) {
13932 ire_t *ire, *nire;
13933 ifrt_t *ifrt;
13935 ifrt = (ifrt_t *)mp->b_rptr;
13937 * Create a copy of the IRE with the saved address and netmask.
13939 if (ill->ill_isv6) {
13940 ire = ire_create_v6(
13941 &ifrt->ifrt_v6addr,
13942 &ifrt->ifrt_v6mask,
13943 &ifrt->ifrt_v6gateway_addr,
13944 ifrt->ifrt_type,
13945 ill,
13946 ifrt->ifrt_zoneid,
13947 ifrt->ifrt_flags,
13948 ipst);
13949 } else {
13950 ire = ire_create(
13951 (uint8_t *)&ifrt->ifrt_addr,
13952 (uint8_t *)&ifrt->ifrt_mask,
13953 (uint8_t *)&ifrt->ifrt_gateway_addr,
13954 ifrt->ifrt_type,
13955 ill,
13956 ifrt->ifrt_zoneid,
13957 ifrt->ifrt_flags,
13958 ipst);
13960 if (ire == NULL) {
13961 mutex_exit(&ill->ill_saved_ire_lock);
13962 return (ENOMEM);
13965 if (ifrt->ifrt_flags & RTF_SETSRC) {
13966 if (ill->ill_isv6) {
13967 ire->ire_setsrc_addr_v6 =
13968 ifrt->ifrt_v6setsrc_addr;
13969 } else {
13970 ire->ire_setsrc_addr = ifrt->ifrt_setsrc_addr;
13975 * Some software (for example, GateD and Sun Cluster) attempts
13976 * to create (what amount to) IRE_PREFIX routes with the
13977 * loopback address as the gateway. This is primarily done to
13978 * set up prefixes with the RTF_REJECT flag set (for example,
13979 * when generating aggregate routes.)
13981 * If the IRE type (as defined by ill->ill_net_type) is
13982 * IRE_LOOPBACK, then we map the request into a
13983 * IRE_IF_NORESOLVER.
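* For instance, a hypothetical RTF_REJECT aggregate for 10.0.0.0/8
* with 127.0.0.1 as the gateway lands on the loopback ill and is
* remapped to an IRE_IF_NORESOLVER below.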
13985 if (ill->ill_net_type == IRE_LOOPBACK)
13986 ire->ire_type = IRE_IF_NORESOLVER;
13989 * The ire is held by ire_add; it will be refrele'd toward
13990 * the end of ipif_up_done.
13992 nire = ire_add(ire);
13994 * Check if it was a duplicate entry. This handles
13995 * the case of two racing route adds for the same route
13997 if (nire == NULL) {
13998 ip1dbg(("ill_recover_saved_ire: FAILED\n"));
13999 } else if (nire != ire) {
14000 ip1dbg(("ill_recover_saved_ire: duplicate ire %p\n",
14001 (void *)nire));
14002 ire_delete(nire);
14003 } else {
14004 ip1dbg(("ill_recover_saved_ire: added ire %p\n",
14005 (void *)nire));
14007 if (nire != NULL)
14008 ire_refrele(nire);
14010 mutex_exit(&ill->ill_saved_ire_lock);
14011 return (0);
14015 * Used to set the netmask and broadcast address to default values when the
14016 * interface is brought up. (Always called as writer.)
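* For example, a hypothetical IPv4 address 192.168.1.5 with no
* netmask configured gets the natural class C mask 255.255.255.0,
* and (when IPIF_BROADCAST is set) a default broadcast address of
* subnet | ~mask = 192.168.1.255.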
14018 static void
14019 ipif_set_default(ipif_t *ipif)
14021 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
14023 if (!ipif->ipif_isv6) {
14025 * Interface holds an IPv4 address. Default
14026 * mask is the natural netmask.
14028 if (!ipif->ipif_net_mask) {
14029 ipaddr_t v4mask;
14031 v4mask = ip_net_mask(ipif->ipif_lcl_addr);
14032 V4MASK_TO_V6(v4mask, ipif->ipif_v6net_mask);
14034 if (ipif->ipif_flags & IPIF_POINTOPOINT) {
14035 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */
14036 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr;
14037 } else {
14038 V6_MASK_COPY(ipif->ipif_v6lcl_addr,
14039 ipif->ipif_v6net_mask, ipif->ipif_v6subnet);
14042 * NOTE: SunOS 4.X does this even if the broadcast address
14043 * has already been set, thus we do the same here.
14045 if (ipif->ipif_flags & IPIF_BROADCAST) {
14046 ipaddr_t v4addr;
14048 v4addr = ipif->ipif_subnet | ~ipif->ipif_net_mask;
14049 IN6_IPADDR_TO_V4MAPPED(v4addr, &ipif->ipif_v6brd_addr);
14051 } else {
14053 * Interface holds an IPv6-only address. Default
14054 * mask is all-ones.
14056 if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6net_mask))
14057 ipif->ipif_v6net_mask = ipv6_all_ones;
14058 if (ipif->ipif_flags & IPIF_POINTOPOINT) {
14059 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */
14060 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr;
14061 } else {
14062 V6_MASK_COPY(ipif->ipif_v6lcl_addr,
14063 ipif->ipif_v6net_mask, ipif->ipif_v6subnet);
14069 * Return 0 if this address can be used as a local address without causing
14070 * duplicate address problems. Otherwise, return EADDRNOTAVAIL if the address
14071 * is already up on a different ill, and EADDRINUSE if it's up on the same ill.
14072 * Note that the same IPv6 link-local address is allowed as long as the ills
14073 * are not on the same link.
14076 ip_addr_availability_check(ipif_t *new_ipif)
14078 in6_addr_t our_v6addr;
14079 ill_t *ill;
14080 ipif_t *ipif;
14081 ill_walk_context_t ctx;
14082 ip_stack_t *ipst = new_ipif->ipif_ill->ill_ipst;
14084 ASSERT(IAM_WRITER_IPIF(new_ipif));
14085 ASSERT(MUTEX_HELD(&ipst->ips_ip_addr_avail_lock));
14086 ASSERT(RW_READ_HELD(&ipst->ips_ill_g_lock));
14088 new_ipif->ipif_flags &= ~IPIF_UNNUMBERED;
14089 if (IN6_IS_ADDR_UNSPECIFIED(&new_ipif->ipif_v6lcl_addr) ||
14090 IN6_IS_ADDR_V4MAPPED_ANY(&new_ipif->ipif_v6lcl_addr))
14091 return (0);
14093 our_v6addr = new_ipif->ipif_v6lcl_addr;
14095 if (new_ipif->ipif_isv6)
14096 ill = ILL_START_WALK_V6(&ctx, ipst);
14097 else
14098 ill = ILL_START_WALK_V4(&ctx, ipst);
14100 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
14101 for (ipif = ill->ill_ipif; ipif != NULL;
14102 ipif = ipif->ipif_next) {
14103 if ((ipif == new_ipif) ||
14104 !(ipif->ipif_flags & IPIF_UP) ||
14105 (ipif->ipif_flags & IPIF_UNNUMBERED) ||
14106 !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
14107 &our_v6addr))
14108 continue;
14110 if (new_ipif->ipif_flags & IPIF_POINTOPOINT)
14111 new_ipif->ipif_flags |= IPIF_UNNUMBERED;
14112 else if (ipif->ipif_flags & IPIF_POINTOPOINT)
14113 ipif->ipif_flags |= IPIF_UNNUMBERED;
14114 else if ((IN6_IS_ADDR_LINKLOCAL(&our_v6addr) ||
14115 IN6_IS_ADDR_SITELOCAL(&our_v6addr)) &&
14116 !IS_ON_SAME_LAN(ill, new_ipif->ipif_ill))
14117 continue;
14118 else if (new_ipif->ipif_zoneid != ipif->ipif_zoneid &&
14119 ipif->ipif_zoneid != ALL_ZONES && IS_LOOPBACK(ill))
14120 continue;
14121 else if (new_ipif->ipif_ill == ill)
14122 return (EADDRINUSE);
14123 else
14124 return (EADDRNOTAVAIL);
14128 return (0);
14132 * Bring up an ipif: bring up arp/ndp, bring up the DLPI stream, and add
14133 * IREs for the ipif.
14134 * When the routine returns EINPROGRESS then mp has been consumed and
14135 * the ioctl will be acked from ip_rput_dlpi.
14138 ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp)
14140 ill_t *ill = ipif->ipif_ill;
14141 boolean_t isv6 = ipif->ipif_isv6;
14142 int err = 0;
14143 boolean_t success;
14144 uint_t ipif_orig_id;
14145 ip_stack_t *ipst = ill->ill_ipst;
14147 ASSERT(IAM_WRITER_IPIF(ipif));
14149 ip1dbg(("ipif_up(%s:%u)\n", ill->ill_name, ipif->ipif_id));
14150 DTRACE_PROBE3(ipif__downup, char *, "ipif_up",
14151 ill_t *, ill, ipif_t *, ipif);
14153 /* Shouldn't get here if it is already up. */
14154 if (ipif->ipif_flags & IPIF_UP)
14155 return (EALREADY);
14158 * If this is a request to bring up a data address on an interface
14159 * under IPMP, then move the address to its IPMP meta-interface and
14160 * try to bring it up. One complication is that the zeroth ipif for
14161 * an ill is special, in that every ill always has one, and that code
14162 * throughout IP dereferences ill->ill_ipif without holding any locks.
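* For example, bringing up a data address on a hypothetical
* underlying interface net0 in group ipmp0 first migrates the ipif
* to ipmp0 and then brings it up there.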
14164 if (IS_UNDER_IPMP(ill) && ipmp_ipif_is_dataaddr(ipif) &&
14165 (!ipif->ipif_isv6 || !V6_IPIF_LINKLOCAL(ipif))) {
14166 ipif_t *stubipif = NULL, *moveipif = NULL;
14167 ill_t *ipmp_ill = ipmp_illgrp_ipmp_ill(ill->ill_grp);
14170 * The ipif being brought up should be quiesced. If it's not,
14171 * something has gone amiss and we need to bail out. (If it's
14172 * quiesced, we know it will remain so via IPIF_CONDEMNED.)
14174 mutex_enter(&ill->ill_lock);
14175 if (!ipif_is_quiescent(ipif)) {
14176 mutex_exit(&ill->ill_lock);
14177 return (EINVAL);
14179 mutex_exit(&ill->ill_lock);
14182 * If we're going to need to allocate ipifs, do it prior
14183 * to starting the move (and grabbing locks).
14185 if (ipif->ipif_id == 0) {
14186 if ((moveipif = ipif_allocate(ill, 0, IRE_LOCAL, B_TRUE,
14187 B_FALSE, &err)) == NULL) {
14188 return (err);
14190 if ((stubipif = ipif_allocate(ill, 0, IRE_LOCAL, B_TRUE,
14191 B_FALSE, &err)) == NULL) {
14192 mi_free(moveipif);
14193 return (err);
14198 * Grab or transfer the ipif to move. During the move, keep
14199 * ill_g_lock held to prevent any ill walker threads from
14200 * seeing things in an inconsistent state.
14202 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
14203 if (ipif->ipif_id != 0) {
14204 ipif_remove(ipif);
14205 } else {
14206 ipif_transfer(ipif, moveipif, stubipif);
14207 ipif = moveipif;
14211 * Place the ipif on the IPMP ill. If the zeroth ipif on
14212 * the IPMP ill is a stub (0.0.0.0 down address) then we
14213 * replace that one. Otherwise, pick the next available slot.
14215 ipif->ipif_ill = ipmp_ill;
14216 ipif_orig_id = ipif->ipif_id;
14218 if (ipmp_ipif_is_stubaddr(ipmp_ill->ill_ipif)) {
14219 ipif_transfer(ipif, ipmp_ill->ill_ipif, NULL);
14220 ipif = ipmp_ill->ill_ipif;
14221 } else {
14222 ipif->ipif_id = -1;
14223 if ((err = ipif_insert(ipif, B_FALSE)) != 0) {
14225 * No more available ipif_id's -- put it back
14226 * on the original ill and fail the operation.
14227 * Since we're writer on the ill, we can be
14228 * sure our old slot is still available.
14230 ipif->ipif_id = ipif_orig_id;
14231 ipif->ipif_ill = ill;
14232 if (ipif_orig_id == 0) {
14233 ipif_transfer(ipif, ill->ill_ipif,
14234 NULL);
14235 } else {
14236 VERIFY(ipif_insert(ipif, B_FALSE) == 0);
14238 rw_exit(&ipst->ips_ill_g_lock);
14239 return (err);
14242 rw_exit(&ipst->ips_ill_g_lock);
14245 * Tell SCTP that the ipif has moved. Note that even if we
14246 * had to allocate a new ipif, the original sequence id was
14247 * preserved and therefore SCTP won't know.
14249 sctp_move_ipif(ipif, ill, ipmp_ill);
14252 * If the ipif being brought up was on slot zero, then we
14253 * first need to bring up the placeholder we stuck there. In
14254 * ip_rput_dlpi_writer(), arp_bringup_done(), or the recursive
14255 * call to ipif_up() itself, if we successfully bring up the
14256 * placeholder, we'll check ill_move_ipif and bring it up too.
14258 if (ipif_orig_id == 0) {
14259 ASSERT(ill->ill_move_ipif == NULL);
14260 ill->ill_move_ipif = ipif;
14261 if ((err = ipif_up(ill->ill_ipif, q, mp)) == 0)
14262 ASSERT(ill->ill_move_ipif == NULL);
14263 if (err != EINPROGRESS)
14264 ill->ill_move_ipif = NULL;
14265 return (err);
14269 * Bring it up on the IPMP ill.
14271 return (ipif_up(ipif, q, mp));
14274 /* Skip arp/ndp for any loopback interface. */
14275 if (ill->ill_wq != NULL) {
14276 conn_t *connp = CONN_Q(q) ? Q_TO_CONN(q) : NULL;
14277 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq;
14279 if (!ill->ill_dl_up) {
14281 * ill_dl_up is not yet set, i.e. we have yet to
14282 * DL_BIND with the driver and this is the first
14283 * logical interface on the ill to become "up".
14284 * Tell the driver to get going (via DL_BIND_REQ).
14285 * Note that changing "significant" IFF_ flags,
14286 * address/netmask etc. causes a down/up dance, but
14287 * does not cause an unbind (DL_UNBIND) with the driver.
14289 return (ill_dl_up(ill, ipif, mp, q));
14293 * ipif_resolver_up may end up needing to bind/attach
14294 * the ARP stream, which in turn necessitates a
14295 * DLPI message exchange with the driver. ioctls are
14296 * serialized and so we cannot send more than one
14297 * interface up message at a time. If ipif_resolver_up
14298 * does need to wait for the DLPI handshake for the ARP stream,
14299 * we get EINPROGRESS and we will complete in arp_bringup_done.
14302 ASSERT(connp != NULL || !CONN_Q(q));
14303 if (connp != NULL)
14304 mutex_enter(&connp->conn_lock);
14305 mutex_enter(&ill->ill_lock);
14306 success = ipsq_pending_mp_add(connp, ipif, q, mp, 0);
14307 mutex_exit(&ill->ill_lock);
14308 if (connp != NULL)
14309 mutex_exit(&connp->conn_lock);
14310 if (!success)
14311 return (EINTR);
14314 * Crank up IPv6 neighbor discovery. Unlike ARP, this should
14315 * complete when ipif_ndp_up returns.
14317 err = ipif_resolver_up(ipif, Res_act_initial);
14318 if (err == EINPROGRESS) {
14319 /* We will complete it in arp_bringup_done() */
14320 return (err);
14323 if (isv6 && err == 0)
14324 err = ipif_ndp_up(ipif, B_TRUE);
14326 ASSERT(err != EINPROGRESS);
14327 mp = ipsq_pending_mp_get(ipsq, &connp);
14328 ASSERT(mp != NULL);
14329 if (err != 0)
14330 return (err);
14331 } else {
14333 * Interfaces without underlying hardware don't do duplicate
14334 * address detection.
14336 ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
14337 ipif->ipif_addr_ready = 1;
14338 err = ill_add_ires(ill);
14339 /* allocation failure? */
14340 if (err != 0)
14341 return (err);
14344 err = (isv6 ? ipif_up_done_v6(ipif) : ipif_up_done(ipif));
14345 if (err == 0 && ill->ill_move_ipif != NULL) {
14346 ipif = ill->ill_move_ipif;
14347 ill->ill_move_ipif = NULL;
14348 return (ipif_up(ipif, q, mp));
14350 return (err);
14354 * Add any IREs tied to the ill. For now this is just an IRE_MULTICAST.
14355 * The identical set of IREs needs to be removed in ill_delete_ires().
14358 ill_add_ires(ill_t *ill)
14360 ire_t *ire;
14361 in6_addr_t dummy6 = IN6ADDR_INITIALIZER(V6_MCAST, 0, 0, 1);
14362 in_addr_t dummy4 = htonl(INADDR_ALLHOSTS_GROUP);
14364 if (ill->ill_ire_multicast != NULL)
14365 return (0);
14368 * provide some dummy ire_addr for creating the ire.
14370 if (ill->ill_isv6) {
14371 ire = ire_create_v6(&dummy6, 0, 0, IRE_MULTICAST, ill,
14372 ALL_ZONES, RTF_UP, ill->ill_ipst);
14373 } else {
14374 ire = ire_create((uchar_t *)&dummy4, 0, 0, IRE_MULTICAST, ill,
14375 ALL_ZONES, RTF_UP, ill->ill_ipst);
14377 if (ire == NULL)
14378 return (ENOMEM);
14380 ill->ill_ire_multicast = ire;
14381 return (0);
14384 void
14385 ill_delete_ires(ill_t *ill)
14387 if (ill->ill_ire_multicast != NULL) {
14389 * BIND/ATTACH completed; Release the ref for ill_ire_multicast
14390 * which was taken without any th_tracing enabled.
14391 * We also mark it as condemned (note that it was never added)
14392 * so that caching conn's can move off of it.
14394 ire_make_condemned(ill->ill_ire_multicast);
14395 ire_refrele_notr(ill->ill_ire_multicast);
14396 ill->ill_ire_multicast = NULL;
14401 * Perform a bind for the physical device.
14402 * When the routine returns EINPROGRESS then mp has been consumed and
14403 * the ioctl will be acked from ip_rput_dlpi.
14404 * Allocate an unbind message and save it until ipif_down.
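* In the simple case the exchange with the driver is:
*
*	IP				driver
*	DL_BIND_REQ	  ->
*			  <-	DL_BIND_ACK (or DL_ERROR_ACK)
*
* The pre-allocated DL_UNBIND_REQ is stashed in ill_unbind_mp and
* is sent from ill_dl_down() when the interface goes down.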
14406 static int
14407 ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q)
14409 mblk_t *bind_mp = NULL;
14410 mblk_t *unbind_mp = NULL;
14411 conn_t *connp;
14412 boolean_t success;
14413 int err;
14415 DTRACE_PROBE2(ill__downup, char *, "ill_dl_up", ill_t *, ill);
14417 ip1dbg(("ill_dl_up(%s)\n", ill->ill_name));
14418 ASSERT(IAM_WRITER_ILL(ill));
14419 ASSERT(mp != NULL);
14422 * Make sure we have an IRE_MULTICAST in case we immediately
14423 * start receiving packets.
14425 err = ill_add_ires(ill);
14426 if (err != 0)
14427 goto bad;
14429 bind_mp = ip_dlpi_alloc(sizeof (dl_bind_req_t) + sizeof (long),
14430 DL_BIND_REQ);
14431 if (bind_mp == NULL)
14432 goto bad;
14433 ((dl_bind_req_t *)bind_mp->b_rptr)->dl_sap = ill->ill_sap;
14434 ((dl_bind_req_t *)bind_mp->b_rptr)->dl_service_mode = DL_CLDLS;
14437 * ill_unbind_mp would be non-null if the following sequence had
14438 * happened:
14439 * - send DL_BIND_REQ to driver, wait for response
14440 * - multiple ioctls that need to bring the ipif up are encountered,
14441 * but they cannot enter the ipsq due to the outstanding DL_BIND_REQ.
14442 * These ioctls will then be enqueued on the ipsq
14443 * - a DL_ERROR_ACK is returned for the DL_BIND_REQ
14444 * At this point, the pending ioctls in the ipsq will be drained, and
14445 * since ill->ill_dl_up was not set, ill_dl_up would be invoked with
14446 * a non-null ill->ill_unbind_mp
14448 if (ill->ill_unbind_mp == NULL) {
14449 unbind_mp = ip_dlpi_alloc(sizeof (dl_unbind_req_t),
14450 DL_UNBIND_REQ);
14451 if (unbind_mp == NULL)
14452 goto bad;
14455 * Record state needed to complete this operation when the
14456 * DL_BIND_ACK shows up. Also remember the pre-allocated mblks.
14458 connp = CONN_Q(q) ? Q_TO_CONN(q) : NULL;
14459 ASSERT(connp != NULL || !CONN_Q(q));
14460 GRAB_CONN_LOCK(q);
14461 mutex_enter(&ipif->ipif_ill->ill_lock);
14462 success = ipsq_pending_mp_add(connp, ipif, q, mp, 0);
14463 mutex_exit(&ipif->ipif_ill->ill_lock);
14464 RELEASE_CONN_LOCK(q);
14465 if (!success)
14466 goto bad;
14469 * Save the unbind message for ill_dl_down(); it will be consumed when
14470 * the interface goes down.
14472 if (ill->ill_unbind_mp == NULL)
14473 ill->ill_unbind_mp = unbind_mp;
14475 ill_dlpi_send(ill, bind_mp);
14476 /* Send down link-layer capabilities probe if not already done. */
14477 ill_capability_probe(ill);
14480 * Sysid used to rely on the fact that netboots set domainname
14481 * and the like. Now that miniroot boots aren't strictly netboots
14482 * and miniroot network configuration is driven from userland
14483 * these things still need to be set. This situation can be detected
14484 * by comparing the interface being configured here to the one
14485 * dhcifname was set to reference by the boot loader. Once sysid is
14486 * converted to use dhcp_ipc_getinfo() this call can go away.
14488 if ((ipif->ipif_flags & IPIF_DHCPRUNNING) &&
14489 (strcmp(ill->ill_name, dhcifname) == 0) &&
14490 (strlen(srpc_domain) == 0)) {
14491 if (dhcpinit() != 0)
14492 cmn_err(CE_WARN, "no cached dhcp response");
14496 * This operation will complete in ip_rput_dlpi with either
14497 * a DL_BIND_ACK or DL_ERROR_ACK.
14499 return (EINPROGRESS);
14500 bad:
14501 ip1dbg(("ill_dl_up(%s) FAILED\n", ill->ill_name));
14503 freemsg(bind_mp);
14504 freemsg(unbind_mp);
14505 return (ENOMEM);
14508 /* Add room for tcp+ip headers */
14509 uint_t ip_loopback_mtuplus = IP_LOOPBACK_MTU + IP_SIMPLE_HDR_LENGTH + 20;
14512 * DLPI and ARP are up.
14513 * Create all the IREs associated with an interface. Bring up multicast.
14514 * Set the interface flag and finish other initialization
14515 * that potentially had to be deferred to after DL_BIND_ACK.
14518 ipif_up_done(ipif_t *ipif)
14520 ill_t *ill = ipif->ipif_ill;
14521 int err = 0;
14522 boolean_t loopback = B_FALSE;
14523 boolean_t update_src_selection = B_TRUE;
14524 ipif_t *tmp_ipif;
14526 ip1dbg(("ipif_up_done(%s:%u)\n",
14527 ipif->ipif_ill->ill_name, ipif->ipif_id));
14528 DTRACE_PROBE3(ipif__downup, char *, "ipif_up_done",
14529 ill_t *, ill, ipif_t *, ipif);
14531 /* Check if this is a loopback interface */
14532 if (ipif->ipif_ill->ill_wq == NULL)
14533 loopback = B_TRUE;
14535 ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock));
14538 * If all other interfaces for this ill are down or DEPRECATED,
14539 * or otherwise unsuitable for source address selection,
14540 * reset the src generation numbers to make sure source
14541 * address selection gets to take this new ipif into account.
14542 * No need to hold ill_lock while traversing the ipif list since
14543 * we are writer
14545 for (tmp_ipif = ill->ill_ipif; tmp_ipif;
14546 tmp_ipif = tmp_ipif->ipif_next) {
14547 if (((tmp_ipif->ipif_flags &
14548 (IPIF_NOXMIT|IPIF_ANYCAST|IPIF_NOLOCAL|IPIF_DEPRECATED)) ||
14549 !(tmp_ipif->ipif_flags & IPIF_UP)) ||
14550 (tmp_ipif == ipif))
14551 continue;
14552 /* first usable pre-existing interface */
14553 update_src_selection = B_FALSE;
14554 break;
14556 if (update_src_selection)
14557 ip_update_source_selection(ill->ill_ipst);
14559 if (IS_LOOPBACK(ill) || ill->ill_net_type == IRE_IF_NORESOLVER) {
14560 nce_t *loop_nce = NULL;
14561 uint16_t flags = (NCE_F_MYADDR | NCE_F_AUTHORITY | NCE_F_NONUD);
14564 * lo0:1 and subsequent ipifs were marked IRE_LOCAL in
14565 * ipif_lookup_on_name(), but in the case of zones we can have
14566 * several loopback addresses on lo0. So all the interfaces with
14567 * loopback addresses need to be marked IRE_LOOPBACK.
14569 if (V4_PART_OF_V6(ipif->ipif_v6lcl_addr) ==
14570 htonl(INADDR_LOOPBACK))
14571 ipif->ipif_ire_type = IRE_LOOPBACK;
14572 else
14573 ipif->ipif_ire_type = IRE_LOCAL;
14574 if (ill->ill_net_type != IRE_LOOPBACK)
14575 flags |= NCE_F_PUBLISH;
14577 /* add unicast nce for the local addr */
14578 err = nce_lookup_then_add_v4(ill, NULL,
14579 ill->ill_phys_addr_length, &ipif->ipif_lcl_addr, flags,
14580 ND_REACHABLE, &loop_nce);
14581 /* A shared-IP zone sees EEXIST for lo0:N */
14582 if (err == 0 || err == EEXIST) {
14583 ipif->ipif_added_nce = 1;
14584 loop_nce->nce_ipif_cnt++;
14585 nce_refrele(loop_nce);
14586 err = 0;
14587 } else {
14588 ASSERT(loop_nce == NULL);
14589 return (err);
14593 /* Create all the IREs associated with this interface */
14594 err = ipif_add_ires_v4(ipif, loopback);
14595 if (err != 0) {
14597 * see comments about return value from
14598 * ip_addr_availability_check() in ipif_add_ires_v4().
14600 if (err != EADDRINUSE) {
14601 (void) ipif_arp_down(ipif);
14602 } else {
14604 * Make IPMP aware of the deleted ipif so that
14605 * the needed ipmp cleanup (e.g., of ipif_bound_ill)
14606 * can be completed. Note that we do not want to
14607 * destroy the nce that was created on the ipmp_ill
14608 * for the active copy of the duplicate address in
14609 * use.
14611 if (IS_IPMP(ill))
14612 ipmp_illgrp_del_ipif(ill->ill_grp, ipif);
14613 err = EADDRNOTAVAIL;
14615 return (err);
14618 if (ill->ill_ipif_up_count == 1 && !loopback) {
14619 /* Recover any additional IRE entries for this ill */
14620 (void) ill_recover_saved_ire(ill);
14623 if (ill->ill_need_recover_multicast) {
14625 * Need to recover all multicast memberships in the driver.
14626 * This had to be deferred until we had attached. The same
14627 * code exists in ipif_up_done_v6() to recover IPv6
14628 * memberships.
14630 * Note that it would be preferable to unconditionally do the
14631 * ill_recover_multicast() in ill_dl_up(), but we cannot do
14632 * that since ill_join_allmulti() depends on ill_dl_up being
14633 * set, and it is not set until we receive a DL_BIND_ACK after
14634 * having called ill_dl_up().
14636 ill_recover_multicast(ill);
14639 if (ill->ill_ipif_up_count == 1) {
14641 * Since the interface is now up, it may now be active.
14643 if (IS_UNDER_IPMP(ill))
14644 ipmp_ill_refresh_active(ill);
14647 * If this is an IPMP interface, we may now be able to
14648 * establish ARP entries.
14650 if (IS_IPMP(ill))
14651 ipmp_illgrp_refresh_arpent(ill->ill_grp);
14654 /* Join the allhosts multicast address */
14655 ipif_multicast_up(ipif);
14657 if (!loopback && !update_src_selection &&
14658 !(ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED)))
14659 ip_update_source_selection(ill->ill_ipst);
14661 if (!loopback && ipif->ipif_addr_ready) {
14662 /* Broadcast an address mask reply. */
14663 ipif_mask_reply(ipif);
14665 /* Perhaps ilgs should use this ill */
14666 update_conn_ill(NULL, ill->ill_ipst);
14669 * This had to be deferred until we had bound. Tell routing sockets and
14670 * others that this interface is up if it looks like the address has
14671 * been validated. Otherwise, if it isn't ready yet, wait for
14672 * duplicate address detection to do its thing.
14674 if (ipif->ipif_addr_ready)
14675 ipif_up_notify(ipif);
14676 return (0);
14680 * Add the IREs associated with the ipif.
14681 * Those MUST be explicitly removed in ipif_delete_ires_v4.
14683 static int
14684 ipif_add_ires_v4(ipif_t *ipif, boolean_t loopback)
14686 ill_t *ill = ipif->ipif_ill;
14687 ip_stack_t *ipst = ill->ill_ipst;
14688 ire_t *ire_array[20];
14689 ire_t **irep = ire_array;
14690 ire_t **irep1;
14691 ipaddr_t net_mask = 0;
14692 ipaddr_t subnet_mask, route_mask;
14693 int err;
14694 ire_t *ire_local = NULL; /* LOCAL or LOOPBACK */
14695 ire_t *ire_if = NULL;
14696 uchar_t *gw;
14698 if ((ipif->ipif_lcl_addr != INADDR_ANY) &&
14699 !(ipif->ipif_flags & IPIF_NOLOCAL)) {
14700 /* Register the source address for __sin6_src_id */
14701 err = ip_srcid_insert(&ipif->ipif_v6lcl_addr,
14702 ipif->ipif_zoneid, ipst);
14703 if (err != 0) {
14704 ip0dbg(("ipif_add_ires: srcid_insert %d\n", err));
14705 return (err);
14708 if (loopback)
14709 gw = (uchar_t *)&ipif->ipif_lcl_addr;
14710 else
14711 gw = NULL;
14713 /* If the interface address is set, create the local IRE. */
14714 ire_local = ire_create(
14715 (uchar_t *)&ipif->ipif_lcl_addr, /* dest address */
14716 (uchar_t *)&ip_g_all_ones, /* mask */
14717 gw, /* gateway */
14718 ipif->ipif_ire_type, /* LOCAL or LOOPBACK */
14719 ipif->ipif_ill,
14720 ipif->ipif_zoneid,
14721 ((ipif->ipif_flags & IPIF_PRIVATE) ?
14722 RTF_PRIVATE : 0) | RTF_KERNEL,
14723 ipst);
14724 ip1dbg(("ipif_add_ires: 0x%p creating IRE %p type 0x%x"
14725 " for 0x%x\n", (void *)ipif, (void *)ire_local,
14726 ipif->ipif_ire_type,
14727 ntohl(ipif->ipif_lcl_addr)));
14728 if (ire_local == NULL) {
14729 ip1dbg(("ipif_up_done: NULL ire_local\n"));
14730 err = ENOMEM;
14731 goto bad;
14733 } else {
14734 ip1dbg((
14735 "ipif_add_ires: not creating IRE %d for 0x%x: flags 0x%x\n",
14736 ipif->ipif_ire_type,
14737 ntohl(ipif->ipif_lcl_addr),
14738 (uint_t)ipif->ipif_flags));
14740 if ((ipif->ipif_lcl_addr != INADDR_ANY) &&
14741 !(ipif->ipif_flags & IPIF_NOLOCAL)) {
14742 net_mask = ip_net_mask(ipif->ipif_lcl_addr);
14743 } else {
14744 net_mask = htonl(IN_CLASSA_NET); /* fallback */
14747 subnet_mask = ipif->ipif_net_mask;
14750 * If mask was not specified, use natural netmask of
14751 * interface address. Also, store this mask back into the
14752 * ipif struct.
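* For example, a hypothetical address 10.1.2.3 with no mask
* configured would get the natural class A mask 255.0.0.0 here.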
14754 if (subnet_mask == 0) {
14755 subnet_mask = net_mask;
14756 V4MASK_TO_V6(subnet_mask, ipif->ipif_v6net_mask);
14757 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
14758 ipif->ipif_v6subnet);
14761 /* Set up the IRE_IF_RESOLVER or IRE_IF_NORESOLVER, as appropriate. */
14762 if (!loopback && !(ipif->ipif_flags & IPIF_NOXMIT) &&
14763 ipif->ipif_subnet != INADDR_ANY) {
14764 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */
14766 if (ipif->ipif_flags & IPIF_POINTOPOINT) {
14767 route_mask = IP_HOST_MASK;
14768 } else {
14769 route_mask = subnet_mask;
14772 ip1dbg(("ipif_add_ires: ipif 0x%p ill 0x%p "
14773 "creating if IRE ill_net_type 0x%x for 0x%x\n",
14774 (void *)ipif, (void *)ill, ill->ill_net_type,
14775 ntohl(ipif->ipif_subnet)));
14776 ire_if = ire_create(
14777 (uchar_t *)&ipif->ipif_subnet,
14778 (uchar_t *)&route_mask,
14779 (uchar_t *)&ipif->ipif_lcl_addr,
14780 ill->ill_net_type,
14781 ill,
14782 ipif->ipif_zoneid,
14783 ((ipif->ipif_flags & IPIF_PRIVATE) ?
14784 RTF_PRIVATE: 0) | RTF_KERNEL,
14785 ipst);
14786 if (ire_if == NULL) {
14787 ip1dbg(("ipif_up_done: NULL ire_if\n"));
14788 err = ENOMEM;
14789 goto bad;
14794 * Create any necessary broadcast IREs.
14796 if ((ipif->ipif_flags & IPIF_BROADCAST) &&
14797 !(ipif->ipif_flags & IPIF_NOXMIT))
14798 irep = ipif_create_bcast_ires(ipif, irep);
14800 /* If an earlier ire_create failed, get out now */
14801 for (irep1 = irep; irep1 > ire_array; ) {
14802 irep1--;
14803 if (*irep1 == NULL) {
14804 ip1dbg(("ipif_up_done: NULL ire found in ire_array\n"));
14805 err = ENOMEM;
14806 goto bad;
14811 * Need to atomically check for IP address availability under
14812 * ip_addr_avail_lock. ill_g_lock is held as reader to ensure no new
14813 * ills or new ipifs can be added while we are checking availability.
14815 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
14816 mutex_enter(&ipst->ips_ip_addr_avail_lock);
14817 /* Mark it up, and increment counters. */
14818 ipif->ipif_flags |= IPIF_UP;
14819 ill->ill_ipif_up_count++;
14820 err = ip_addr_availability_check(ipif);
14821 mutex_exit(&ipst->ips_ip_addr_avail_lock);
14822 rw_exit(&ipst->ips_ill_g_lock);
14824 if (err != 0) {
14826 * Our address may already be up on the same ill. In this case,
14827 * the ARP entry for our ipif replaced the one for the other
14828 * ipif. So we don't want to delete it (otherwise the other ipif
14829 * would be unable to send packets).
14830 * ip_addr_availability_check() identifies this case for us and
14831 * returns EADDRINUSE; Caller should turn it into EADDRNOTAVAIL
14832 * which is the expected error code.
14834 ill->ill_ipif_up_count--;
14835 ipif->ipif_flags &= ~IPIF_UP;
14836 goto bad;
14840 * Add in all newly created IREs. ire_create_bcast() has
14841 * already checked for duplicates of the IRE_BROADCAST type.
14842 * We add the IRE_INTERFACE before the IRE_LOCAL to ensure
14843 * that lookups find the IRE_LOCAL even if the IRE_INTERFACE is
14844 * a /32 route.
14846 if (ire_if != NULL) {
14847 ire_if = ire_add(ire_if);
14848 if (ire_if == NULL) {
14849 err = ENOMEM;
14850 goto bad2;
14852 #ifdef DEBUG
14853 ire_refhold_notr(ire_if);
14854 ire_refrele(ire_if);
14855 #endif
14857 if (ire_local != NULL) {
14858 ire_local = ire_add(ire_local);
14859 if (ire_local == NULL) {
14860 err = ENOMEM;
14861 goto bad2;
14863 #ifdef DEBUG
14864 ire_refhold_notr(ire_local);
14865 ire_refrele(ire_local);
14866 #endif
14868 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
14869 if (ire_local != NULL)
14870 ipif->ipif_ire_local = ire_local;
14871 if (ire_if != NULL)
14872 ipif->ipif_ire_if = ire_if;
14873 rw_exit(&ipst->ips_ill_g_lock);
14874 ire_local = NULL;
14875 ire_if = NULL;
14878 * We first add all of them, and if that succeeds we refrele the
14879 * bunch. That enables us to delete all of them should any of the
14880 * ire_adds fail.
14882 for (irep1 = irep; irep1 > ire_array; ) {
14883 irep1--;
14884 ASSERT(!MUTEX_HELD(&((*irep1)->ire_ill->ill_lock)));
14885 *irep1 = ire_add(*irep1);
14886 if (*irep1 == NULL) {
14887 err = ENOMEM;
14888 goto bad2;
14892 for (irep1 = irep; irep1 > ire_array; ) {
14893 irep1--;
14894 /* refheld by ire_add. */
14895 if (*irep1 != NULL) {
14896 ire_refrele(*irep1);
14897 *irep1 = NULL;
14901 if (!loopback) {
14903 * If the broadcast address has been set, make sure it makes
14904 * sense based on the interface address.
14905 * Only match on ill since we are sharing broadcast addresses.
14907 if ((ipif->ipif_brd_addr != INADDR_ANY) &&
14908 (ipif->ipif_flags & IPIF_BROADCAST)) {
14909 ire_t *ire;
14911 ire = ire_ftable_lookup_v4(ipif->ipif_brd_addr, 0, 0,
14912 IRE_BROADCAST, ipif->ipif_ill, ALL_ZONES,
14913 (MATCH_IRE_TYPE | MATCH_IRE_ILL), 0, ipst, NULL);
14915 if (ire == NULL) {
14917 * If there isn't a matching broadcast IRE,
14918 * revert to the default for this netmask.
14920 ipif->ipif_v6brd_addr = ipv6_all_zeros;
14921 mutex_enter(&ipif->ipif_ill->ill_lock);
14922 ipif_set_default(ipif);
14923 mutex_exit(&ipif->ipif_ill->ill_lock);
14924 } else {
14925 ire_refrele(ire);
14930 return (0);
14932 bad2:
14933 ill->ill_ipif_up_count--;
14934 ipif->ipif_flags &= ~IPIF_UP;
14936 bad:
14937 ip1dbg(("ipif_add_ires: FAILED \n"));
14938 if (ire_local != NULL)
14939 ire_delete(ire_local);
14940 if (ire_if != NULL)
14941 ire_delete(ire_if);
14943 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
14944 ire_local = ipif->ipif_ire_local;
14945 ipif->ipif_ire_local = NULL;
14946 ire_if = ipif->ipif_ire_if;
14947 ipif->ipif_ire_if = NULL;
14948 rw_exit(&ipst->ips_ill_g_lock);
14949 if (ire_local != NULL) {
14950 ire_delete(ire_local);
14951 ire_refrele_notr(ire_local);
14953 if (ire_if != NULL) {
14954 ire_delete(ire_if);
14955 ire_refrele_notr(ire_if);
14958 while (irep > ire_array) {
14959 irep--;
14960 if (*irep != NULL) {
14961 ire_delete(*irep);
14964 (void) ip_srcid_remove(&ipif->ipif_v6lcl_addr, ipif->ipif_zoneid, ipst);
14966 return (err);
14969 /* Remove all the IREs created by ipif_add_ires_v4 */
14970 void
14971 ipif_delete_ires_v4(ipif_t *ipif)
14973 ill_t *ill = ipif->ipif_ill;
14974 ip_stack_t *ipst = ill->ill_ipst;
14975 ire_t *ire;
14977 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
14978 ire = ipif->ipif_ire_local;
14979 ipif->ipif_ire_local = NULL;
14980 rw_exit(&ipst->ips_ill_g_lock);
14981 if (ire != NULL) {
14983 * Move count to ipif so we don't lose the count due to
14984 * a down/up dance.
14986 atomic_add_32(&ipif->ipif_ib_pkt_count, ire->ire_ib_pkt_count);
14988 ire_delete(ire);
14989 ire_refrele_notr(ire);
14991 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
14992 ire = ipif->ipif_ire_if;
14993 ipif->ipif_ire_if = NULL;
14994 rw_exit(&ipst->ips_ill_g_lock);
14995 if (ire != NULL) {
14996 ire_delete(ire);
14997 ire_refrele_notr(ire);
15001 * Delete the broadcast IREs.
15003 if ((ipif->ipif_flags & IPIF_BROADCAST) &&
15004 !(ipif->ipif_flags & IPIF_NOXMIT))
15005 ipif_delete_bcast_ires(ipif);
15009 * Checks for availability of a usable source address (if there is one) when the
15010 * destination ILL has the ill_usesrc_ifindex pointing to another ILL. Note
15011 * this selection is done regardless of the destination.
15013 boolean_t
15014 ipif_zone_avail(uint_t ifindex, boolean_t isv6, zoneid_t zoneid,
15015 ip_stack_t *ipst)
15017 ipif_t *ipif = NULL;
15018 ill_t *uill;
15020 ASSERT(ifindex != 0);
15022 uill = ill_lookup_on_ifindex(ifindex, isv6, ipst);
15023 if (uill == NULL)
15024 return (B_FALSE);
15026 mutex_enter(&uill->ill_lock);
15027 for (ipif = uill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
15028 if (IPIF_IS_CONDEMNED(ipif))
15029 continue;
15030 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST))
15031 continue;
15032 if (!(ipif->ipif_flags & IPIF_UP))
15033 continue;
15034 if (ipif->ipif_zoneid != zoneid)
15035 continue;
15036 if (isv6 ? IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) :
15037 ipif->ipif_lcl_addr == INADDR_ANY)
15038 continue;
15039 mutex_exit(&uill->ill_lock);
15040 ill_refrele(uill);
15041 return (B_TRUE);
15043 mutex_exit(&uill->ill_lock);
15044 ill_refrele(uill);
15045 return (B_FALSE);
15049 * Find an ipif with a good local address on the ill+zoneid.
15051 ipif_t *
15052 ipif_good_addr(ill_t *ill, zoneid_t zoneid)
15054 ipif_t *ipif;
15056 mutex_enter(&ill->ill_lock);
15057 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
15058 if (IPIF_IS_CONDEMNED(ipif))
15059 continue;
15060 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST))
15061 continue;
15062 if (!(ipif->ipif_flags & IPIF_UP))
15063 continue;
15064 if (ipif->ipif_zoneid != zoneid &&
15065 ipif->ipif_zoneid != ALL_ZONES && zoneid != ALL_ZONES)
15066 continue;
15067 if (ill->ill_isv6 ?
15068 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) :
15069 ipif->ipif_lcl_addr == INADDR_ANY)
15070 continue;
15071 ipif_refhold_locked(ipif);
15072 mutex_exit(&ill->ill_lock);
15073 return (ipif);
15075 mutex_exit(&ill->ill_lock);
15076 return (NULL);
15080 * IP source address type, sorted from worst to best. For a given type,
15081 * always prefer IP addresses on the same subnet. All-zones addresses are
15082 * suboptimal because they pose problems with unlabeled destinations.
15084 typedef enum {
15085 IPIF_NONE,
15086 IPIF_DIFFNET_DEPRECATED, /* deprecated and different subnet */
15087 IPIF_SAMENET_DEPRECATED, /* deprecated and same subnet */
15088 IPIF_DIFFNET_ALLZONES, /* allzones and different subnet */
15089 IPIF_SAMENET_ALLZONES, /* allzones and same subnet */
15090 IPIF_DIFFNET, /* normal and different subnet */
15091 IPIF_SAMENET, /* normal and same subnet */
15092 IPIF_LOCALADDR /* local loopback */
15093 } ipif_type_t;
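/*
* For example, a non-deprecated address on the destination's subnet
* (IPIF_SAMENET) is preferred over an all-zones address on the same
* subnet (IPIF_SAMENET_ALLZONES), which in turn beats any deprecated
* address.
*/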
15096 * Pick the optimal ipif on `ill' for sending to destination `dst' from zone
15097 * `zoneid'. We rate usable ipifs from low -> high as per the ipif_type_t
15098 * enumeration, and return the highest-rated ipif. If there's a tie, we pick
15099 * the first one, unless IPMP is used in which case we round-robin among them;
15100 * see below for more.
15102 * Returns NULL if there is no suitable source address for the ill,
15103 * i.e., when the ill has no valid source address at all.
15105 ipif_t *
15106 ipif_select_source_v4(ill_t *ill, ipaddr_t dst, zoneid_t zoneid,
15107 boolean_t allow_usesrc, boolean_t *notreadyp)
15109 ill_t *usill = NULL;
15110 ill_t *ipmp_ill = NULL;
15111 ipif_t *start_ipif, *next_ipif, *ipif, *best_ipif;
15112 ipif_type_t type, best_type;
15113 ip_stack_t *ipst = ill->ill_ipst;
15114 boolean_t samenet;
15116 if (ill->ill_usesrc_ifindex != 0 && allow_usesrc) {
15117 usill = ill_lookup_on_ifindex(ill->ill_usesrc_ifindex,
15118 B_FALSE, ipst);
15119 if (usill != NULL)
15120 ill = usill; /* Select source from usesrc ILL */
15121 else
15122 return (NULL);
15126 * Test addresses should never be used for source address selection,
15127 * so if we were passed one, switch to the IPMP meta-interface.
15129 if (IS_UNDER_IPMP(ill)) {
15130 if ((ipmp_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL)
15131 ill = ipmp_ill; /* Select source from IPMP ill */
15132 else
15133 return (NULL);
15137 * Hold the ill_g_lock as reader. This makes sure that no ipif/ill
15138 * can be deleted. But an ipif/ill can get CONDEMNED any time.
15139 * After selecting the right ipif, under ill_lock make sure ipif is
15140 * not condemned, and increment refcnt. If ipif is CONDEMNED,
15141 * we retry. Inside the loop we still need to check for CONDEMNED,
15142 * but not under a lock.
15144 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
15145 retry:
15147 * For source address selection, we treat the ipif list as circular
15148 * and continue until we get back to where we started. This allows
15149 * IPMP to vary source address selection (which improves inbound load
15150 * spreading) by caching its last ending point and starting from
15151 * there. NOTE: we don't have to worry about ill_src_ipif changing
15152 * ills since that can't happen on the IPMP ill.
15154 start_ipif = ill->ill_ipif;
15155 if (IS_IPMP(ill) && ill->ill_src_ipif != NULL)
15156 start_ipif = ill->ill_src_ipif;
15158 ipif = start_ipif;
15159 best_ipif = NULL;
15160 best_type = IPIF_NONE;
15161 do {
15162 if ((next_ipif = ipif->ipif_next) == NULL)
15163 next_ipif = ill->ill_ipif;
15165 if (IPIF_IS_CONDEMNED(ipif))
15166 continue;
15167 /* Always skip NOLOCAL and ANYCAST interfaces */
15168 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST))
15169 continue;
15170 /* Always skip NOACCEPT interfaces */
15171 if (ipif->ipif_ill->ill_flags & ILLF_NOACCEPT)
15172 continue;
15173 if (!(ipif->ipif_flags & IPIF_UP))
15174 continue;
15176 if (!ipif->ipif_addr_ready) {
15177 if (notreadyp != NULL)
15178 *notreadyp = B_TRUE;
15179 continue;
15182 if (zoneid != ALL_ZONES &&
15183 ipif->ipif_zoneid != zoneid &&
15184 ipif->ipif_zoneid != ALL_ZONES)
15185 continue;
15188 * Interfaces with 0.0.0.0 address are allowed to be UP, but
15189 * are not valid as source addresses.
15191 if (ipif->ipif_lcl_addr == INADDR_ANY)
15192 continue;
15194 samenet = ((ipif->ipif_net_mask & dst) == ipif->ipif_subnet);
15196 if (ipif->ipif_lcl_addr == dst) {
15197 type = IPIF_LOCALADDR;
15198 } else if (ipif->ipif_flags & IPIF_DEPRECATED) {
15199 type = samenet ? IPIF_SAMENET_DEPRECATED :
15200 IPIF_DIFFNET_DEPRECATED;
15201 } else if (ipif->ipif_zoneid == ALL_ZONES) {
15202 type = samenet ? IPIF_SAMENET_ALLZONES :
15203 IPIF_DIFFNET_ALLZONES;
15204 } else {
15205 type = samenet ? IPIF_SAMENET : IPIF_DIFFNET;
15208 if (type > best_type) {
15209 best_type = type;
15210 best_ipif = ipif;
15211 if (best_type == IPIF_LOCALADDR)
15212 break; /* can't get better */
15214 } while ((ipif = next_ipif) != start_ipif);
15216 if ((ipif = best_ipif) != NULL) {
15217 mutex_enter(&ipif->ipif_ill->ill_lock);
15218 if (IPIF_IS_CONDEMNED(ipif)) {
15219 mutex_exit(&ipif->ipif_ill->ill_lock);
15220 goto retry;
15222 ipif_refhold_locked(ipif);
15225 * For IPMP, update the source ipif rotor to the next ipif,
15226 * provided we can look it up. (We must not use it if it's
15227 * IPIF_CONDEMNED since we may have grabbed ill_g_lock after
15228 * ipif_free() checked ill_src_ipif.)
15230 if (IS_IPMP(ill) && ipif != NULL) {
15231 next_ipif = ipif->ipif_next;
15232 if (next_ipif != NULL && !IPIF_IS_CONDEMNED(next_ipif))
15233 ill->ill_src_ipif = next_ipif;
15234 else
15235 ill->ill_src_ipif = NULL;
15237 mutex_exit(&ipif->ipif_ill->ill_lock);
15240 rw_exit(&ipst->ips_ill_g_lock);
15241 if (usill != NULL)
15242 ill_refrele(usill);
15243 if (ipmp_ill != NULL)
15244 ill_refrele(ipmp_ill);
15246 #ifdef DEBUG
15247 if (ipif == NULL) {
15248 char buf1[INET6_ADDRSTRLEN];
15250 ip1dbg(("ipif_select_source_v4(%s, %s) -> NULL\n",
15251 ill->ill_name,
15252 inet_ntop(AF_INET, &dst, buf1, sizeof (buf1))));
15253 } else {
15254 char buf1[INET6_ADDRSTRLEN];
15255 char buf2[INET6_ADDRSTRLEN];
15257 ip1dbg(("ipif_select_source_v4(%s, %s) -> %s\n",
15258 ipif->ipif_ill->ill_name,
15259 inet_ntop(AF_INET, &dst, buf1, sizeof (buf1)),
15260 inet_ntop(AF_INET, &ipif->ipif_lcl_addr,
15261 buf2, sizeof (buf2))));
15263 #endif /* DEBUG */
15264 return (ipif);
15268 * Pick a source address based on the destination ill and an optional setsrc
15269 * address.
15270 * The result is stored in srcp. If generation is set, then put the source
15271 * generation number there before we look for the source address (to avoid
15272 * missing changes in the set of source addresses).
15273 * If flagsp is set, then use it to pass back ipif_flags.
15275 * If the caller wants to cache the returned source address and detect when
15276 * that might be stale, the caller should pass in a generation argument,
15277 * which the caller can later compare against ips_src_generation
15279 * The precedence order for selecting an IPv4 source address is:
15280 * - RTF_SETSRC on the offlink ire always wins.
15281 * - If usesrc is set, swap the ill to be the usesrc one.
15282 * - If IPMP is used on the ill, select a random address from the most
15283 * preferred ones below:
15284 * 1. If onlink destination, same subnet and not deprecated, not ALL_ZONES
15285 * 2. Not deprecated, not ALL_ZONES
15286 * 3. If onlink destination, same subnet and not deprecated, ALL_ZONES
15287 * 4. Not deprecated, ALL_ZONES
15288 * 5. If onlink destination, same subnet and deprecated
15289 * 6. Deprecated.
15291 * We have lower preference for ALL_ZONES IP addresses,
15292 * as they pose problems with unlabeled destinations.
15294 * Note that when multiple IP addresses match e.g., #1 we pick
15295 * the first one if IPMP is not in use. With IPMP we round-robin.
15298 ip_select_source_v4(ill_t *ill, ipaddr_t setsrc, ipaddr_t dst,
15299 ipaddr_t multicast_ifaddr,
15300 zoneid_t zoneid, ip_stack_t *ipst, ipaddr_t *srcp,
15301 uint32_t *generation, uint64_t *flagsp)
15303 ipif_t *ipif;
15304 boolean_t notready = B_FALSE; /* Set if !ipif_addr_ready found */
15306 if (flagsp != NULL)
15307 *flagsp = 0;
15310 * Need to grab the generation number before we check to
15311 * avoid a race with a change to the set of local addresses.
15312 * No lock needed since the thread which updates the set of local
15313 * addresses uses ipif/ill locks and exits those (hence a store memory
15314 * barrier) before doing the atomic increase of ips_src_generation.
15316 if (generation != NULL) {
15317 *generation = ipst->ips_src_generation;
15320 if (CLASSD(dst) && multicast_ifaddr != INADDR_ANY) {
15321 *srcp = multicast_ifaddr;
15322 return (0);
15325 /* Was RTF_SETSRC set on the first IRE in the recursive lookup? */
15326 if (setsrc != INADDR_ANY) {
15327 *srcp = setsrc;
15328 return (0);
15330 ipif = ipif_select_source_v4(ill, dst, zoneid, B_TRUE, &notready);
15331 if (ipif == NULL) {
15332 if (notready)
15333 return (ENETDOWN);
15334 else
15335 return (EADDRNOTAVAIL);
15337 *srcp = ipif->ipif_lcl_addr;
15338 if (flagsp != NULL)
15339 *flagsp = ipif->ipif_flags;
15340 ipif_refrele(ipif);
15341 return (0);
15344 /* ARGSUSED */
15346 if_unitsel_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
15347 ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
15350 * ill_phyint_reinit merged the v4 and v6 into a single
15351 * ipsq. We might not have been able to complete the
15352 * operation in ipif_set_values, if we could not become
15353 * exclusive. If so restart it here.
15355 return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q));
15359 * Can operate on either a module or a driver queue.
15360 * Returns an error if not a module queue.
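* The unit name is formed from the bottom driver's mi_idname plus
* the selected ppa; e.g. ppa 3 on a hypothetical driver named "hme"
* yields the interface name "hme3".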
15362 /* ARGSUSED */
15364 if_unitsel(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
15365 ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
15367 queue_t *q1 = q;
15368 char *cp;
15369 char interf_name[LIFNAMSIZ];
15370 uint_t ppa = *(uint_t *)mp->b_cont->b_cont->b_rptr;
15372 if (q->q_next == NULL) {
15373 ip1dbg((
15374 "if_unitsel: IF_UNITSEL: no q_next\n"));
15375 return (EINVAL);
15378 if (((ill_t *)(q->q_ptr))->ill_name[0] != '\0')
15379 return (EALREADY);
15381 do {
15382 q1 = q1->q_next;
15383 } while (q1->q_next);
15384 cp = q1->q_qinfo->qi_minfo->mi_idname;
15385 (void) sprintf(interf_name, "%s%d", cp, ppa);
15388 * Here we are not going to delay the ioack until after
15389 * ACKs from DL_ATTACH_REQ/DL_BIND_REQ. So no need to save the
15390 * original ioctl message before sending the requests.
15392 return (ipif_set_values(q, mp, interf_name, &ppa));
15395 /* ARGSUSED */
15397 ip_sioctl_sifname(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
15398 ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
15400 return (ENXIO);
15404 * Create any IRE_BROADCAST entries for `ipif', and store those entries in
15405 * `irep'. Returns a pointer to the next free `irep' entry
15406 * A mirror exists in ipif_delete_bcast_ires().
15408 * The management of any "extra" or seemingly duplicate IRE_BROADCASTs is
15409 * done in ire_add.
15411 static ire_t **
15412 ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep)
15414 ipaddr_t addr;
15415 ipaddr_t netmask = ip_net_mask(ipif->ipif_lcl_addr);
15416 ipaddr_t subnetmask = ipif->ipif_net_mask;
15417 ill_t *ill = ipif->ipif_ill;
15418 zoneid_t zoneid = ipif->ipif_zoneid;
15420 ip1dbg(("ipif_create_bcast_ires: creating broadcast IREs\n"));
15422 ASSERT(ipif->ipif_flags & IPIF_BROADCAST);
15423 ASSERT(!(ipif->ipif_flags & IPIF_NOXMIT));
15425 if (ipif->ipif_lcl_addr == INADDR_ANY ||
15426 (ipif->ipif_flags & IPIF_NOLOCAL))
15427 netmask = htonl(IN_CLASSA_NET); /* fallback */
15429 irep = ire_create_bcast(ill, 0, zoneid, irep);
15430 irep = ire_create_bcast(ill, INADDR_BROADCAST, zoneid, irep);
15433 * For backward compatibility, we create net broadcast IREs based on
15434 * the old "IP address class system", since some old machines only
15435 * respond to these class-derived net broadcasts. However, we must not
15436 * create these net broadcast IREs if the subnetmask is shorter than
15437 * the IP address class based derived netmask. Otherwise, we may
15438 * create a net broadcast address which is the same as an IP address
15439 * on the subnet -- and then TCP will refuse to talk to that address.
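* As a worked example, a hypothetical subnetted class B interface
* at 172.16.5.9/24 has the class-derived netmask 255.255.0.0, which
* is shorter than the subnetmask, so the net broadcasts 172.16.0.0
* and 172.16.255.255 are created here in addition to 0.0.0.0,
* 255.255.255.255, and the subnet broadcasts 172.16.5.0 and
* 172.16.5.255 created elsewhere in this function.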
15441 if (netmask < subnetmask) {
15442 addr = netmask & ipif->ipif_subnet;
15443 irep = ire_create_bcast(ill, addr, zoneid, irep);
15444 irep = ire_create_bcast(ill, ~netmask | addr, zoneid, irep);
15448 * Don't create IRE_BROADCAST IREs for the interface if the subnetmask
15449 * is 0xFFFFFFFF, as an IRE_LOCAL for that interface is already
15450 * created. Creating these broadcast IREs will only create confusion
15451 * as `addr' will be the same as the IP address.
15453 if (subnetmask != 0xFFFFFFFF) {
15454 addr = ipif->ipif_subnet;
15455 irep = ire_create_bcast(ill, addr, zoneid, irep);
15456 irep = ire_create_bcast(ill, ~subnetmask | addr, zoneid, irep);
15459 return (irep);
15463 * Mirror of ipif_create_bcast_ires()
15465 static void
15466 ipif_delete_bcast_ires(ipif_t *ipif)
15468 ipaddr_t addr;
15469 ipaddr_t netmask = ip_net_mask(ipif->ipif_lcl_addr);
15470 ipaddr_t subnetmask = ipif->ipif_net_mask;
15471 ill_t *ill = ipif->ipif_ill;
15472 zoneid_t zoneid = ipif->ipif_zoneid;
15473 ire_t *ire;
15475 ASSERT(ipif->ipif_flags & IPIF_BROADCAST);
15476 ASSERT(!(ipif->ipif_flags & IPIF_NOXMIT));
15478 if (ipif->ipif_lcl_addr == INADDR_ANY ||
15479 (ipif->ipif_flags & IPIF_NOLOCAL))
15480 netmask = htonl(IN_CLASSA_NET); /* fallback */
15482 ire = ire_lookup_bcast(ill, 0, zoneid);
15483 ASSERT(ire != NULL);
15484 ire_delete(ire); ire_refrele(ire);
15485 ire = ire_lookup_bcast(ill, INADDR_BROADCAST, zoneid);
15486 ASSERT(ire != NULL);
15487 ire_delete(ire); ire_refrele(ire);
15490 * For backward compatibility, we create net broadcast IREs based on
15491 * the old "IP address class system", since some old machines only
15492 * respond to these class-derived net broadcasts. However, we must not
15493 * create these net broadcast IREs if the subnetmask is shorter than
15494 * the IP address class based derived netmask. Otherwise, we may
15495 * create a net broadcast address which is the same as an IP address
15496 * on the subnet -- and then TCP will refuse to talk to that address.
15498 if (netmask < subnetmask) {
15499 addr = netmask & ipif->ipif_subnet;
15500 ire = ire_lookup_bcast(ill, addr, zoneid);
15501 ASSERT(ire != NULL);
15502 ire_delete(ire); ire_refrele(ire);
15503 ire = ire_lookup_bcast(ill, ~netmask | addr, zoneid);
15504 ASSERT(ire != NULL);
15505 ire_delete(ire); ire_refrele(ire);
15509 * Don't create IRE_BROADCAST IREs for the interface if the subnetmask
15510 * is 0xFFFFFFFF, as an IRE_LOCAL for that interface is already
15511 * created. Creating these broadcast IREs will only create confusion
15512 * as `addr' will be the same as the IP address.
15514 if (subnetmask != 0xFFFFFFFF) {
15515 addr = ipif->ipif_subnet;
15516 ire = ire_lookup_bcast(ill, addr, zoneid);
15517 ASSERT(ire != NULL);
15518 ire_delete(ire); ire_refrele(ire);
15519 ire = ire_lookup_bcast(ill, ~subnetmask | addr, zoneid);
15520 ASSERT(ire != NULL);
15521 ire_delete(ire); ire_refrele(ire);
15526 * Extract the flags (including IFF_CANTCHANGE flags such as IFF_IPV*)
15527 * from lifr_flags and the name from lifr_name.
15528 * Set IFF_IPV* and ill_isv6 prior to doing the lookup
15529 * since ipif_lookup_on_name uses the _isv6 flags when matching.
15530 * Returns EINPROGRESS when mp has been consumed by queueing it on
15531 * ipx_pending_mp and the ioctl will complete in ip_rput.
15533 * Can only operate on a module queue; returns an error if invoked
15534 * on a driver queue (i.e., when q->q_next is NULL).
15536 /* ARGSUSED */
15538 ip_sioctl_slifname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
15539 ip_ioctl_cmd_t *ipip, void *if_req)
15541 ill_t *ill = q->q_ptr;
15542 phyint_t *phyi;
15543 ip_stack_t *ipst;
15544 struct lifreq *lifr = if_req;
15545 uint64_t new_flags;
15547 ASSERT(ipif != NULL);
15548 ip1dbg(("ip_sioctl_slifname %s\n", lifr->lifr_name));
15550 if (q->q_next == NULL) {
15551 ip1dbg(("if_sioctl_slifname: SIOCSLIFNAME: no q_next\n"));
15552 return (EINVAL);
15556 * If we are not writer on 'q' then this interface exists already
15557 * and previous lookups (ip_extract_lifreq()) found this ipif --
15558 * so return EALREADY.
15560 if (ill != ipif->ipif_ill)
15561 return (EALREADY);
15563 if (ill->ill_name[0] != '\0')
15564 return (EALREADY);
15567 * If there's another ill already with the requested name, ensure
15568 * that it's of the same type. Otherwise, ill_phyint_reinit() will
15569 * fuse together two unrelated ills, which will cause chaos.
15571 ipst = ill->ill_ipst;
15572 phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
15573 lifr->lifr_name, NULL);
15574 if (phyi != NULL) {
15575 ill_t *ill_mate = phyi->phyint_illv4;
15577 if (ill_mate == NULL)
15578 ill_mate = phyi->phyint_illv6;
15579 ASSERT(ill_mate != NULL);
15581 if (ill_mate->ill_media->ip_m_mac_type !=
15582 ill->ill_media->ip_m_mac_type) {
15583 ip1dbg(("if_sioctl_slifname: SIOCSLIFNAME: attempt to "
15584 "use the same ill name on differing media\n"));
15585 return (EINVAL);
15590 * We start off as IFF_IPV4 in ipif_allocate and become
15591 * IFF_IPV4 or IFF_IPV6 here depending on lifr_flags value.
15592 * The only flags that we read from user space are IFF_IPV4,
15593 * IFF_IPV6, and IFF_BROADCAST.
15595 * This ill has not been inserted into the global list.
15596 * So we are still single threaded and don't need any lock.
15598 * Sanity check the flags.
15601 if ((lifr->lifr_flags & IFF_BROADCAST) &&
15602 ((lifr->lifr_flags & IFF_IPV6) ||
15603 (!ill->ill_needs_attach && ill->ill_bcast_addr_length == 0))) {
15604 ip1dbg(("ip_sioctl_slifname: link not broadcast capable "
15605 "or IPv6 i.e., no broadcast \n"));
15606 return (EINVAL);
15609 new_flags =
15610 lifr->lifr_flags & (IFF_IPV6|IFF_IPV4|IFF_BROADCAST);
15612 if ((new_flags ^ (IFF_IPV6|IFF_IPV4)) == 0) {
15613 ip1dbg(("ip_sioctl_slifname: flags must be exactly one of "
15614 "IFF_IPV4 or IFF_IPV6\n"));
15615 return (EINVAL);
15619 * We always start off as IPv4, so we only need to check for IPv6.
15621 if ((new_flags & IFF_IPV6) != 0) {
15622 ill->ill_flags |= ILLF_IPV6;
15623 ill->ill_flags &= ~ILLF_IPV4;
15625 if (lifr->lifr_flags & IFF_NOLINKLOCAL)
15626 ill->ill_flags |= ILLF_NOLINKLOCAL;
15629 if ((new_flags & IFF_BROADCAST) != 0)
15630 ipif->ipif_flags |= IPIF_BROADCAST;
15631 else
15632 ipif->ipif_flags &= ~IPIF_BROADCAST;
15634 /* We started off as V4. */
15635 if (ill->ill_flags & ILLF_IPV6) {
15636 ill->ill_phyint->phyint_illv6 = ill;
15637 ill->ill_phyint->phyint_illv4 = NULL;
15640 return (ipif_set_values(q, mp, lifr->lifr_name, &lifr->lifr_ppa));
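/*
 * For illustration, a sketch of the userland side (names here are
 * assumptions, not part of this file): plumbing an interface sends
 * SIOCSLIFNAME down the newly built stream with the name, ppa and one of
 * IFF_IPV4/IFF_IPV6 filled in, roughly:
 *
 *	struct lifreq lifr;
 *
 *	(void) strlcpy(lifr.lifr_name, "hme0", sizeof (lifr.lifr_name));
 *	lifr.lifr_ppa = 0;
 *	lifr.lifr_flags = IFF_IPV4 | IFF_BROADCAST;
 *	(void) ioctl(ip_fd, SIOCSLIFNAME, &lifr);
 */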
15643 /* ARGSUSED */
15645 ip_sioctl_slifname_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
15646 ip_ioctl_cmd_t *ipip, void *if_req)
15649 * ill_phyint_reinit merged the v4 and v6 into a single
15650 * ipsq. We might not have been able to complete the
15651 * slifname in ipif_set_values if we could not become
15652 * exclusive. If so, restart it here.
15654 return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q));
15658 * Return a pointer to the ipif which matches the index, IP version, and
15659 * zoneid.
15661 ipif_t *
15662 ipif_lookup_on_ifindex(uint_t index, boolean_t isv6, zoneid_t zoneid,
15663 ip_stack_t *ipst)
15665 ill_t *ill;
15666 ipif_t *ipif = NULL;
15668 ill = ill_lookup_on_ifindex(index, isv6, ipst);
15669 if (ill != NULL) {
15670 mutex_enter(&ill->ill_lock);
15671 for (ipif = ill->ill_ipif; ipif != NULL;
15672 ipif = ipif->ipif_next) {
15673 if (!IPIF_IS_CONDEMNED(ipif) && (zoneid == ALL_ZONES ||
15674 zoneid == ipif->ipif_zoneid ||
15675 ipif->ipif_zoneid == ALL_ZONES)) {
15676 ipif_refhold_locked(ipif);
15677 break;
15680 mutex_exit(&ill->ill_lock);
15681 ill_refrele(ill);
15683 return (ipif);
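/*
 * A typical caller pairs the lookup with ipif_refrele(); a minimal sketch:
 *
 *	ipif_t *ipif;
 *
 *	ipif = ipif_lookup_on_ifindex(ifindex, B_FALSE, zoneid, ipst);
 *	if (ipif != NULL) {
 *		... use ipif; the hold keeps it from going away ...
 *		ipif_refrele(ipif);
 *	}
 */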
15687 * Change an existing physical interface's index. If the new index
15688 * is acceptable we update the index and the phyint_list_avl_by_index tree.
15689 * Finally, we update other systems which may have a dependence on the
15690 * index value.
15692 /* ARGSUSED */
15694 ip_sioctl_slifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
15695 ip_ioctl_cmd_t *ipip, void *ifreq)
15697 ill_t *ill;
15698 phyint_t *phyi;
15699 struct ifreq *ifr = (struct ifreq *)ifreq;
15700 struct lifreq *lifr = (struct lifreq *)ifreq;
15701 uint_t old_index, index;
15702 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
15703 avl_index_t where;
15705 if (ipip->ipi_cmd_type == IF_CMD)
15706 index = ifr->ifr_index;
15707 else
15708 index = lifr->lifr_index;
15711 * Only allow on the physical interface (ipif_id 0); index zero is illegal.
15713 ill = ipif->ipif_ill;
15714 phyi = ill->ill_phyint;
15715 if (ipif->ipif_id != 0 || index == 0 || index > IF_INDEX_MAX) {
15716 return (EINVAL);
15719 /* If the index is not changing, no work to do */
15720 if (phyi->phyint_ifindex == index)
15721 return (0);
15724 * Use phyint_exists() to determine if the new interface index
15725 * is already in use. If the index is unused then we need to
15726 * change the phyint's position in the phyint_list_avl_by_index
15727 * tree. If we do not do this, subsequent lookups (using the new
15728 * index value) will not find the phyint.
15730 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
15731 if (phyint_exists(index, ipst)) {
15732 rw_exit(&ipst->ips_ill_g_lock);
15733 return (EEXIST);
15737 * The new index is unused. Set it in the phyint. However, we must
15738 * not forget to trigger the NE_IFINDEX_CHANGE event before the
15739 * ifindex changes; the event must be bound to the old ifindex value.
15741 ill_nic_event_dispatch(ill, 0, NE_IFINDEX_CHANGE,
15742 &index, sizeof (index));
15744 old_index = phyi->phyint_ifindex;
15745 phyi->phyint_ifindex = index;
15747 avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, phyi);
15748 (void) avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
15749 &index, &where);
15750 avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
15751 phyi, where);
15752 rw_exit(&ipst->ips_ill_g_lock);
15754 /* Update SCTP's ILL list */
15755 sctp_ill_reindex(ill, old_index);
15757 /* Send the routing sockets message */
15758 ip_rts_ifmsg(ipif, RTSQ_DEFAULT);
15759 if (ILL_OTHER(ill))
15760 ip_rts_ifmsg(ILL_OTHER(ill)->ill_ipif, RTSQ_DEFAULT);
15762 /* Perhaps ilgs should use this ill */
15763 update_conn_ill(NULL, ill->ill_ipst);
15764 return (0);
15767 /* ARGSUSED */
15769 ip_sioctl_get_lifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
15770 ip_ioctl_cmd_t *ipip, void *ifreq)
15772 struct ifreq *ifr = (struct ifreq *)ifreq;
15773 struct lifreq *lifr = (struct lifreq *)ifreq;
15775 ip1dbg(("ip_sioctl_get_lifindex(%s:%u %p)\n",
15776 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
15777 /* Get the interface index */
15778 if (ipip->ipi_cmd_type == IF_CMD) {
15779 ifr->ifr_index = ipif->ipif_ill->ill_phyint->phyint_ifindex;
15780 } else {
15781 lifr->lifr_index = ipif->ipif_ill->ill_phyint->phyint_ifindex;
15783 return (0);
15786 /* ARGSUSED */
15788 ip_sioctl_get_lifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
15789 ip_ioctl_cmd_t *ipip, void *ifreq)
15791 struct lifreq *lifr = (struct lifreq *)ifreq;
15793 ip1dbg(("ip_sioctl_get_lifzone(%s:%u %p)\n",
15794 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
15795 /* Get the interface zone */
15796 ASSERT(ipip->ipi_cmd_type == LIF_CMD);
15797 lifr->lifr_zoneid = ipif->ipif_zoneid;
15798 return (0);
15802 * Set the zoneid of an interface.
15804 /* ARGSUSED */
15806 ip_sioctl_slifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
15807 ip_ioctl_cmd_t *ipip, void *ifreq)
15809 struct lifreq *lifr = (struct lifreq *)ifreq;
15810 int err = 0;
15811 boolean_t need_up = B_FALSE;
15812 zone_t *zptr;
15813 zone_status_t status;
15814 zoneid_t zoneid;
15816 ASSERT(ipip->ipi_cmd_type == LIF_CMD);
15817 if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES)
15818 return (ENOTSUP);
15820 /* cannot assign instance zero to a non-global zone */
15821 if (ipif->ipif_id == 0 && zoneid != GLOBAL_ZONEID)
15822 return (ENOTSUP);
15825 * Cannot assign to a zone that doesn't exist or is shutting down. In
15826 * the event of a race with the zone shutdown processing, since IP
15827 * serializes this ioctl and SIOCGLIFCONF/SIOCLIFREMOVEIF, we know the
15828 * interface will be cleaned up even if the zone is shut down
15829 * immediately after the status check. If the interface can't be brought
15830 * down right away, and the zone is shut down before the restart
15831 * function is called, we resolve the possible races by rechecking the
15832 * zone status in the restart function.
15834 if ((zptr = zone_find_by_id(zoneid)) == NULL)
15835 return (EINVAL);
15836 status = zone_status_get(zptr);
15837 zone_rele(zptr);
15839 if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING)
15840 return (EINVAL);
15842 if (ipif->ipif_flags & IPIF_UP) {
15844 * If the interface is already marked up,
15845 * we call ipif_down which will take care
15846 * of ditching any IREs that have been set
15847 * up based on the old interface address.
15849 err = ipif_logical_down(ipif, q, mp);
15850 if (err == EINPROGRESS)
15851 return (err);
15852 (void) ipif_down_tail(ipif);
15853 need_up = B_TRUE;
15856 err = ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp, need_up);
15857 return (err);
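/*
 * For illustration: this is the ioctl behind "ifconfig hme0:1 zone myzone"
 * (the interface name is hypothetical); userland has already mapped the
 * zone name to lifr_zoneid, e.g. via getzoneidbyname("myzone").
 */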
15860 static int
15861 ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid,
15862 queue_t *q, mblk_t *mp, boolean_t need_up)
15864 int err = 0;
15865 ip_stack_t *ipst;
15867 ip1dbg(("ip_sioctl_zoneid_tail(%s:%u %p)\n",
15868 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
15870 if (CONN_Q(q))
15871 ipst = CONNQ_TO_IPST(q);
15872 else
15873 ipst = ILLQ_TO_IPST(q);
15876 * For exclusive stacks we don't allow a zoneid other than the
15877 * global zone.
15879 if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID &&
15880 zoneid != GLOBAL_ZONEID)
15881 return (EINVAL);
15883 /* Set the new zone id. */
15884 ipif->ipif_zoneid = zoneid;
15886 /* Update sctp list */
15887 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE);
15889 /* The default multicast interface might have changed */
15890 ire_increment_multicast_generation(ipst, ipif->ipif_ill->ill_isv6);
15892 if (need_up) {
15894 * Now bring the interface back up. If this
15895 * is the only IPIF for the ILL, ipif_up
15896 * will have to re-bind to the device, so
15897 * we may get back EINPROGRESS, in which
15898 * case, this IOCTL will get completed in
15899 * ip_rput_dlpi when we see the DL_BIND_ACK.
15901 err = ipif_up(ipif, q, mp);
15903 return (err);
15906 /* ARGSUSED */
15908 ip_sioctl_slifzone_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
15909 ip_ioctl_cmd_t *ipip, void *if_req)
15911 struct lifreq *lifr = (struct lifreq *)if_req;
15912 zoneid_t zoneid;
15913 zone_t *zptr;
15914 zone_status_t status;
15916 ASSERT(ipip->ipi_cmd_type == LIF_CMD);
15917 if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES)
15918 zoneid = GLOBAL_ZONEID;
15920 ip1dbg(("ip_sioctl_slifzone_restart(%s:%u %p)\n",
15921 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
15924 * We recheck the zone status to resolve the following race condition:
15925 * 1) process sends SIOCSLIFZONE to put hme0:1 in zone "myzone";
15926 * 2) hme0:1 is up and can't be brought down right away;
15927 * ip_sioctl_slifzone() returns EINPROGRESS and the request is queued;
15928 * 3) zone "myzone" is halted; the zone status switches to
15929 * 'shutting_down' and the zones framework sends SIOCGLIFCONF to list
15930 * the interfaces to remove - hme0:1 is not returned because it's not
15931 * yet in "myzone", so it won't be removed;
15932 * 4) the restart function for SIOCSLIFZONE is called; without the
15933 * status check here, we would have hme0:1 in "myzone" after it's been
15934 * destroyed.
15935 * Note that if the status check fails, we need to bring the interface
15936 * back to its state prior to ip_sioctl_slifzone(), hence the call to
15937 * ipif_up_done[_v6]().
15939 status = ZONE_IS_UNINITIALIZED;
15940 if ((zptr = zone_find_by_id(zoneid)) != NULL) {
15941 status = zone_status_get(zptr);
15942 zone_rele(zptr);
15944 if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING) {
15945 if (ipif->ipif_isv6) {
15946 (void) ipif_up_done_v6(ipif);
15947 } else {
15948 (void) ipif_up_done(ipif);
15950 return (EINVAL);
15953 (void) ipif_down_tail(ipif);
15955 return (ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp,
15956 B_TRUE));
15960 * Return the number of addresses on `ill' with one or more of the values
15961 * in `set' set and all of the values in `clear' clear.
15963 static uint_t
15964 ill_flagaddr_cnt(const ill_t *ill, uint64_t set, uint64_t clear)
15966 ipif_t *ipif;
15967 uint_t cnt = 0;
15969 ASSERT(IAM_WRITER_ILL(ill));
15971 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
15972 if ((ipif->ipif_flags & set) && !(ipif->ipif_flags & clear))
15973 cnt++;
15975 return (cnt);
15979 * Return the number of migratable addresses on `ill' that are under
15980 * application control.
15982 uint_t
15983 ill_appaddr_cnt(const ill_t *ill)
15985 return (ill_flagaddr_cnt(ill, IPIF_DHCPRUNNING | IPIF_ADDRCONF,
15986 IPIF_NOFAILOVER));
15990 * Return the number of point-to-point addresses on `ill'.
15992 uint_t
15993 ill_ptpaddr_cnt(const ill_t *ill)
15995 return (ill_flagaddr_cnt(ill, IPIF_POINTOPOINT, 0));
15998 /* ARGSUSED */
16000 ip_sioctl_get_lifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
16001 ip_ioctl_cmd_t *ipip, void *ifreq)
16003 struct lifreq *lifr = ifreq;
16005 ASSERT(q->q_next == NULL);
16006 ASSERT(CONN_Q(q));
16008 ip1dbg(("ip_sioctl_get_lifusesrc(%s:%u %p)\n",
16009 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
16010 lifr->lifr_index = ipif->ipif_ill->ill_usesrc_ifindex;
16011 ip1dbg(("ip_sioctl_get_lifusesrc:lifr_index = %d\n", lifr->lifr_index));
16013 return (0);
16016 /* Find the previous ILL in this usesrc group */
16017 static ill_t *
16018 ill_prev_usesrc(ill_t *uill)
16020 ill_t *ill;
16022 for (ill = uill->ill_usesrc_grp_next;
16023 ASSERT(ill), ill->ill_usesrc_grp_next != uill;
16024 ill = ill->ill_usesrc_grp_next)
16025 /* do nothing */;
16026 return (ill);
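/*
 * The usesrc group is a circular singly-linked list: only the head (the
 * ill whose source addresses are borrowed) has ill_usesrc_ifindex == 0,
 * while each client stores the head's ifindex. For example, with two
 * clients of "uill":
 *
 *	uill -> cli1 -> cli2 -> uill
 *
 * so ill_prev_usesrc() finds the predecessor by walking all the way
 * around the ring.
 */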
16030 * Release all members of the usesrc group. This routine is called
16031 * from ill_delete when the interface being unplumbed is the
16032 * group head.
16034 * This silently clears the usesrc that ifconfig set up.
16035 * An alternative would be to keep that ifindex, and drop packets on the floor
16036 * since no source address can be selected.
16037 * Even if we keep the current semantics, we don't need a lock and a linked
16038 * list: we could walk all the ills, checking whether each has an
16039 * ill_usesrc_ifindex matching the one being removed. The issue is how we
16040 * return the usesrc users (SIOCGLIFSRCOF): we want to find the ills whose
16041 * ill_usesrc_ifindex matches a target ill. We could do that with an ill
16042 * walk, but the walker would need to insert in the ioctl response.
16044 static void
16045 ill_disband_usesrc_group(ill_t *uill)
16047 ill_t *next_ill, *tmp_ill;
16048 ip_stack_t *ipst = uill->ill_ipst;
16050 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_usesrc_lock));
16051 next_ill = uill->ill_usesrc_grp_next;
16053 do {
16054 ASSERT(next_ill != NULL);
16055 tmp_ill = next_ill->ill_usesrc_grp_next;
16056 ASSERT(tmp_ill != NULL);
16057 next_ill->ill_usesrc_grp_next = NULL;
16058 next_ill->ill_usesrc_ifindex = 0;
16059 next_ill = tmp_ill;
16060 } while (next_ill->ill_usesrc_ifindex != 0);
16061 uill->ill_usesrc_grp_next = NULL;
16065 * Remove the client usesrc ILL from the list and relink to a new list
16068 ill_relink_usesrc_ills(ill_t *ucill, ill_t *uill, uint_t ifindex)
16070 ill_t *ill, *tmp_ill;
16071 ip_stack_t *ipst = ucill->ill_ipst;
16073 ASSERT((ucill != NULL) && (ucill->ill_usesrc_grp_next != NULL) &&
16074 (uill != NULL) && RW_WRITE_HELD(&ipst->ips_ill_g_usesrc_lock));
16077 * Verify that the usesrc client ILL passed in is actually in use as a
16078 * usesrc client (i.e., its ill_usesrc_ifindex is set) and that the
16079 * usesrc ILL is not itself in use as a usesrc client; otherwise this
16080 * relink request is invalid.
16082 if ((ucill->ill_usesrc_ifindex == 0) ||
16083 (uill->ill_usesrc_ifindex != 0)) {
16084 return (-1);
16087 ill = ill_prev_usesrc(ucill);
16088 ASSERT(ill->ill_usesrc_grp_next != NULL);
16090 /* Remove from the current list */
16091 if (ill->ill_usesrc_grp_next->ill_usesrc_grp_next == ill) {
16092 /* Only two elements in the list */
16093 ASSERT(ill->ill_usesrc_ifindex == 0);
16094 ill->ill_usesrc_grp_next = NULL;
16095 } else {
16096 ill->ill_usesrc_grp_next = ucill->ill_usesrc_grp_next;
16099 if (ifindex == 0) {
16100 ucill->ill_usesrc_ifindex = 0;
16101 ucill->ill_usesrc_grp_next = NULL;
16102 return (0);
16105 ucill->ill_usesrc_ifindex = ifindex;
16106 tmp_ill = uill->ill_usesrc_grp_next;
16107 uill->ill_usesrc_grp_next = ucill;
16108 ucill->ill_usesrc_grp_next =
16109 (tmp_ill != NULL) ? tmp_ill : uill;
16110 return (0);
16114 * Set the ill_usesrc_ifindex and ill_usesrc_grp_next fields. See
16115 * synchronization notes in ip.c for locking details.
16117 /* ARGSUSED */
16119 ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
16120 ip_ioctl_cmd_t *ipip, void *ifreq)
16122 struct lifreq *lifr = (struct lifreq *)ifreq;
16123 boolean_t isv6 = B_FALSE, reset_flg = B_FALSE;
16124 ill_t *usesrc_ill, *usesrc_cli_ill = ipif->ipif_ill;
16125 int err = 0, ret;
16126 uint_t ifindex;
16127 ipsq_t *ipsq = NULL;
16128 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
16130 ASSERT(IAM_WRITER_IPIF(ipif));
16131 ASSERT(q->q_next == NULL);
16132 ASSERT(CONN_Q(q));
16134 isv6 = (Q_TO_CONN(q))->conn_family == AF_INET6;
16136 ifindex = lifr->lifr_index;
16137 if (ifindex == 0) {
16138 if (usesrc_cli_ill->ill_usesrc_grp_next == NULL) {
16139 /* non usesrc group interface, nothing to reset */
16140 return (0);
16142 ifindex = usesrc_cli_ill->ill_usesrc_ifindex;
16143 /* valid reset request */
16144 reset_flg = B_TRUE;
16147 usesrc_ill = ill_lookup_on_ifindex(ifindex, isv6, ipst);
16148 if (usesrc_ill == NULL)
16149 return (ENXIO);
16150 if (usesrc_ill == ipif->ipif_ill) {
16151 ill_refrele(usesrc_ill);
16152 return (EINVAL);
16155 ipsq = ipsq_try_enter(NULL, usesrc_ill, q, mp, ip_process_ioctl,
16156 NEW_OP, B_TRUE);
16157 if (ipsq == NULL) {
16158 err = EINPROGRESS;
16159 /* Operation enqueued on the ipsq of the usesrc ILL */
16160 goto done;
16163 /* USESRC isn't currently supported with IPMP */
16164 if (IS_IPMP(usesrc_ill) || IS_UNDER_IPMP(usesrc_ill)) {
16165 err = ENOTSUP;
16166 goto done;
16170 * USESRC isn't compatible with the STANDBY flag. (STANDBY is only
16171 * used by IPMP underlying interfaces, but someone might think it's
16172 * more general and try to use it independently with VNI.)
16174 if (usesrc_ill->ill_phyint->phyint_flags & PHYI_STANDBY) {
16175 err = ENOTSUP;
16176 goto done;
16180 * If the client is already in use as a usesrc_ill or a usesrc_ill is
16181 * already a client then return EINVAL
16183 if (IS_USESRC_ILL(usesrc_cli_ill) || IS_USESRC_CLI_ILL(usesrc_ill)) {
16184 err = EINVAL;
16185 goto done;
16189 * If the ill_usesrc_ifindex field is already set to what it needs to
16190 * be then this is a duplicate operation.
16192 if (!reset_flg && usesrc_cli_ill->ill_usesrc_ifindex == ifindex) {
16193 err = 0;
16194 goto done;
16197 ip1dbg(("ip_sioctl_slifusesrc: usesrc_cli_ill %s, usesrc_ill %s,"
16198 " v6 = %d", usesrc_cli_ill->ill_name, usesrc_ill->ill_name,
16199 usesrc_ill->ill_isv6));
16202 * ill_g_usesrc_lock global lock protects the ill_usesrc_grp_next
16203 * and the ill_usesrc_ifindex fields
16205 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER);
16207 if (reset_flg) {
16208 ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill, 0);
16209 if (ret != 0) {
16210 err = EINVAL;
16212 rw_exit(&ipst->ips_ill_g_usesrc_lock);
16213 goto done;
16217 * Four possibilities to consider:
16218 * 1. Both usesrc_ill and usesrc_cli_ill are not part of any usesrc grp
16219 * 2. usesrc_ill is part of a group but usesrc_cli_ill isn't
16220 * 3. usesrc_cli_ill is part of a group but usesrc_ill isn't
16221 * 4. Both are part of their respective usesrc groups
16223 if ((usesrc_ill->ill_usesrc_grp_next == NULL) &&
16224 (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) {
16225 ASSERT(usesrc_ill->ill_usesrc_ifindex == 0);
16226 usesrc_cli_ill->ill_usesrc_ifindex = ifindex;
16227 usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill;
16228 usesrc_cli_ill->ill_usesrc_grp_next = usesrc_ill;
16229 } else if ((usesrc_ill->ill_usesrc_grp_next != NULL) &&
16230 (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) {
16231 usesrc_cli_ill->ill_usesrc_ifindex = ifindex;
16232 /* Insert at head of list */
16233 usesrc_cli_ill->ill_usesrc_grp_next =
16234 usesrc_ill->ill_usesrc_grp_next;
16235 usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill;
16236 } else {
16237 ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill,
16238 ifindex);
16239 if (ret != 0)
16240 err = EINVAL;
16242 rw_exit(&ipst->ips_ill_g_usesrc_lock);
16244 done:
16245 if (ipsq != NULL)
16246 ipsq_exit(ipsq);
16247 /* The refrele on the lifr_name ipif is done by ip_process_ioctl */
16248 ill_refrele(usesrc_ill);
16250 /* Let conn_ixa caching know that source address selection changed */
16251 ip_update_source_selection(ipst);
16253 return (err);
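/*
 * For illustration: "ifconfig vni0 usesrc hme0" (interface names are
 * hypothetical) resolves hme0 to its ifindex and issues this ioctl with
 * lifr_name "vni0" and lifr_index set to that ifindex; "usesrc none"
 * issues it with lifr_index 0 to take the reset path above.
 */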
16256 /* ARGSUSED */
16258 ip_sioctl_get_dadstate(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
16259 ip_ioctl_cmd_t *ipip, void *if_req)
16261 struct lifreq *lifr = (struct lifreq *)if_req;
16262 ill_t *ill = ipif->ipif_ill;
16265 * Need a lock since IFF_UP can be set even when there are
16266 * references to the ipif.
16268 mutex_enter(&ill->ill_lock);
16269 if ((ipif->ipif_flags & IPIF_UP) && ipif->ipif_addr_ready == 0)
16270 lifr->lifr_dadstate = DAD_IN_PROGRESS;
16271 else
16272 lifr->lifr_dadstate = DAD_DONE;
16273 mutex_exit(&ill->ill_lock);
16274 return (0);
16278 * Comparison function used by avl: orders phyints by interface index.
16280 static int
16281 ill_phyint_compare_index(const void *index_ptr, const void *phyip)
16284 uint_t index;
16286 ASSERT(phyip != NULL && index_ptr != NULL);
16288 index = *((uint_t *)index_ptr);
16290 * let the phyint with the lowest index be on top.
16292 if (((phyint_t *)phyip)->phyint_ifindex < index)
16293 return (1);
16294 if (((phyint_t *)phyip)->phyint_ifindex > index)
16295 return (-1);
16296 return (0);
16300 * Comparison function used by avl: orders phyints by interface name.
16302 static int
16303 ill_phyint_compare_name(const void *name_ptr, const void *phyip)
16305 ill_t *ill;
16306 int res = 0;
16308 ASSERT(phyip != NULL && name_ptr != NULL);
16310 if (((phyint_t *)phyip)->phyint_illv4)
16311 ill = ((phyint_t *)phyip)->phyint_illv4;
16312 else
16313 ill = ((phyint_t *)phyip)->phyint_illv6;
16314 ASSERT(ill != NULL);
16316 res = strcmp(ill->ill_name, (char *)name_ptr);
16317 if (res > 0)
16318 return (1);
16319 else if (res < 0)
16320 return (-1);
16321 return (0);
16325 * This function is called on the unplumb path via ill_glist_delete() when
16326 * there are no ills left on the phyint and thus the phyint can be freed.
16328 static void
16329 phyint_free(phyint_t *phyi)
16331 ip_stack_t *ipst = PHYINT_TO_IPST(phyi);
16333 ASSERT(phyi->phyint_illv4 == NULL && phyi->phyint_illv6 == NULL);
16336 * If this phyint was an IPMP meta-interface, blow away the group.
16337 * This is safe to do because all of the illgrps have already been
16338 * removed by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find us.
16339 * If we're cleaning up as a result of failed initialization,
16340 * phyint_grp may be NULL.
16342 if ((phyi->phyint_flags & PHYI_IPMP) && (phyi->phyint_grp != NULL)) {
16343 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
16344 ipmp_grp_destroy(phyi->phyint_grp);
16345 phyi->phyint_grp = NULL;
16346 rw_exit(&ipst->ips_ipmp_lock);
16350 * If this interface was under IPMP, take it out of the group.
16352 if (phyi->phyint_grp != NULL)
16353 ipmp_phyint_leave_grp(phyi);
16356 * Delete the phyint and disassociate its ipsq. The ipsq itself
16357 * will be freed in ipsq_exit().
16359 phyi->phyint_ipsq->ipsq_phyint = NULL;
16360 phyi->phyint_name[0] = '\0';
16362 mi_free(phyi);
16366 * Attach the ill to the phyint structure which can be shared by both
16367 * IPv4 and IPv6 ills. ill_init allocates a phyint to just hold flags. This
16368 * function is called from ipif_set_values and ill_lookup_on_name (for
16369 * loopback) where we know the name of the ill. We look up the ill and, if
16370 * one is already present with that name, use its phyint. Otherwise we
16371 * reuse the one allocated by ill_init.
16373 static void
16374 ill_phyint_reinit(ill_t *ill)
16376 boolean_t isv6 = ill->ill_isv6;
16377 phyint_t *phyi_old;
16378 phyint_t *phyi;
16379 avl_index_t where = 0;
16380 ill_t *ill_other = NULL;
16381 ip_stack_t *ipst = ill->ill_ipst;
16383 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
16385 phyi_old = ill->ill_phyint;
16386 ASSERT(isv6 || (phyi_old->phyint_illv4 == ill &&
16387 phyi_old->phyint_illv6 == NULL));
16388 ASSERT(!isv6 || (phyi_old->phyint_illv6 == ill &&
16389 phyi_old->phyint_illv4 == NULL));
16390 ASSERT(phyi_old->phyint_ifindex == 0);
16393 * Now that our ill has a name, set it in the phyint.
16395 (void) strlcpy(ill->ill_phyint->phyint_name, ill->ill_name, LIFNAMSIZ);
16397 phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
16398 ill->ill_name, &where);
16401 * 1. We grabbed the ill_g_lock before inserting this ill into
16402 * the global list of ills. So no other thread could have located
16403 * this ill and hence the ipsq of this ill is guaranteed to be empty.
16404 * 2. Now locate the other protocol instance of this ill.
16405 * 3. Now grab both ill locks in the right order, and the phyint lock of
16406 * the new ipsq. Holding ill locks + ill_g_lock ensures that the ipsq
16407 * of neither ill can change.
16408 * 4. Merge the phyint and thus the ipsq as well of this ill onto the
16409 * other ill.
16410 * 5. Release all locks.
16414 * Look for IPv4 if we are initializing IPv6 or look for IPv6 if
16415 * we are initializing IPv4.
16417 if (phyi != NULL) {
16418 ill_other = (isv6) ? phyi->phyint_illv4 : phyi->phyint_illv6;
16419 ASSERT(ill_other->ill_phyint != NULL);
16420 ASSERT((isv6 && !ill_other->ill_isv6) ||
16421 (!isv6 && ill_other->ill_isv6));
16422 GRAB_ILL_LOCKS(ill, ill_other);
16424 * We are potentially throwing away phyint_flags which
16425 * could be different from the one that we obtain from
16426 * ill_other->ill_phyint. But it is okay as we are assuming
16427 * that the state maintained within IP is correct.
16429 mutex_enter(&phyi->phyint_lock);
16430 if (isv6) {
16431 ASSERT(phyi->phyint_illv6 == NULL);
16432 phyi->phyint_illv6 = ill;
16433 } else {
16434 ASSERT(phyi->phyint_illv4 == NULL);
16435 phyi->phyint_illv4 = ill;
16439 * Delete the old phyint and make its ipsq eligible
16440 * to be freed in ipsq_exit().
16442 phyi_old->phyint_illv4 = NULL;
16443 phyi_old->phyint_illv6 = NULL;
16444 phyi_old->phyint_ipsq->ipsq_phyint = NULL;
16445 phyi_old->phyint_name[0] = '\0';
16446 mi_free(phyi_old);
16447 } else {
16448 mutex_enter(&ill->ill_lock);
16450 * We don't need to acquire any lock, since
16451 * the ill is not yet visible globally and we
16452 * have not yet released the ill_g_lock.
16454 phyi = phyi_old;
16455 mutex_enter(&phyi->phyint_lock);
16456 /* XXX We need a recovery strategy here. */
16457 if (!phyint_assign_ifindex(phyi, ipst))
16458 cmn_err(CE_PANIC, "phyint_assign_ifindex() failed");
16460 avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
16461 (void *)phyi, where);
16463 (void) avl_find(&ipst->ips_phyint_g_list->
16464 phyint_list_avl_by_index,
16465 &phyi->phyint_ifindex, &where);
16466 avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
16467 (void *)phyi, where);
16471 * Reassigning ill_phyint automatically reassigns the ipsq also.
16472 * The pending mp is not affected because that is kept on a per-ill basis.
16474 ill->ill_phyint = phyi;
16477 * Now that the phyint's ifindex has been assigned, complete the
16478 * remaining MIB and multicast initialization that depends on it.
16480 ill->ill_ip_mib->ipIfStatsIfIndex = ill->ill_phyint->phyint_ifindex;
16481 if (ill->ill_isv6) {
16482 ill->ill_icmp6_mib->ipv6IfIcmpIfIndex =
16483 ill->ill_phyint->phyint_ifindex;
16484 ill->ill_mcast_type = ipst->ips_mld_max_version;
16485 } else {
16486 ill->ill_mcast_type = ipst->ips_igmp_max_version;
16490 * Generate an event within the hooks framework to indicate that
16491 * a new interface has just been added to IP. For this event to
16492 * be generated, the network interface must, at least, have an
16493 * ifindex assigned to it. (We don't generate the event for
16494 * loopback since ill_lookup_on_name() has its own NE_PLUMB event.)
16496 * This needs to be run inside the ill_g_lock perimeter to ensure
16497 * that the ordering of delivered events to listeners matches the
16498 * order of them in the kernel.
16500 if (!IS_LOOPBACK(ill)) {
16501 ill_nic_event_dispatch(ill, 0, NE_PLUMB, ill->ill_name,
16502 ill->ill_name_length);
16504 RELEASE_ILL_LOCKS(ill, ill_other);
16505 mutex_exit(&phyi->phyint_lock);
16509 * Notify any downstream modules of the name of this interface.
16510 * An M_IOCTL is used even though we don't expect a successful reply.
16511 * Any reply message from the driver (presumably an M_IOCNAK) will
16512 * eventually get discarded somewhere upstream. The message format is
16513 * simply an SIOCSLIFNAME ioctl just as might be sent from ifconfig
16514 * to IP.
16516 static void
16517 ip_ifname_notify(ill_t *ill, queue_t *q)
16519 mblk_t *mp1, *mp2;
16520 struct iocblk *iocp;
16521 struct lifreq *lifr;
16523 mp1 = mkiocb(SIOCSLIFNAME);
16524 if (mp1 == NULL)
16525 return;
16526 mp2 = allocb(sizeof (struct lifreq), BPRI_HI);
16527 if (mp2 == NULL) {
16528 freeb(mp1);
16529 return;
16532 mp1->b_cont = mp2;
16533 iocp = (struct iocblk *)mp1->b_rptr;
16534 iocp->ioc_count = sizeof (struct lifreq);
16536 lifr = (struct lifreq *)mp2->b_rptr;
16537 mp2->b_wptr += sizeof (struct lifreq);
16538 bzero(lifr, sizeof (struct lifreq));
16540 (void) strncpy(lifr->lifr_name, ill->ill_name, LIFNAMSIZ);
16541 lifr->lifr_ppa = ill->ill_ppa;
16542 lifr->lifr_flags = (ill->ill_flags & (ILLF_IPV4|ILLF_IPV6));
16544 DTRACE_PROBE3(ill__dlpi, char *, "ip_ifname_notify",
16545 char *, "SIOCSLIFNAME", ill_t *, ill);
16546 putnext(q, mp1);
16549 static int
16550 ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q)
16552 int err;
16553 ip_stack_t *ipst = ill->ill_ipst;
16554 phyint_t *phyi = ill->ill_phyint;
16557 * Now that ill_name is set, the configuration for the IPMP
16558 * meta-interface can be performed.
16560 if (IS_IPMP(ill)) {
16561 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
16563 * If phyi->phyint_grp is NULL, then this is the first IPMP
16564 * meta-interface and we need to create the IPMP group.
16566 if (phyi->phyint_grp == NULL) {
16568 * If someone has renamed another IPMP group to have
16569 * the same name as our interface, bail.
16571 if (ipmp_grp_lookup(ill->ill_name, ipst) != NULL) {
16572 rw_exit(&ipst->ips_ipmp_lock);
16573 return (EEXIST);
16575 phyi->phyint_grp = ipmp_grp_create(ill->ill_name, phyi);
16576 if (phyi->phyint_grp == NULL) {
16577 rw_exit(&ipst->ips_ipmp_lock);
16578 return (ENOMEM);
16581 rw_exit(&ipst->ips_ipmp_lock);
16584 /* Tell downstream modules where they are. */
16585 ip_ifname_notify(ill, q);
16588 * ill_dl_phys returns EINPROGRESS in the usual case.
16589 * Error cases are ENOMEM ...
16591 err = ill_dl_phys(ill, ipif, mp, q);
16593 if (ill->ill_isv6) {
16594 mutex_enter(&ipst->ips_mld_slowtimeout_lock);
16595 if (ipst->ips_mld_slowtimeout_id == 0) {
16596 ipst->ips_mld_slowtimeout_id = timeout(mld_slowtimo,
16597 (void *)ipst,
16598 MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
16600 mutex_exit(&ipst->ips_mld_slowtimeout_lock);
16601 } else {
16602 mutex_enter(&ipst->ips_igmp_slowtimeout_lock);
16603 if (ipst->ips_igmp_slowtimeout_id == 0) {
16604 ipst->ips_igmp_slowtimeout_id = timeout(igmp_slowtimo,
16605 (void *)ipst,
16606 MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
16608 mutex_exit(&ipst->ips_igmp_slowtimeout_lock);
16611 return (err);
16615 * Common routine for ppa and ifname setting. Should be called exclusive.
16617 * Returns EINPROGRESS when mp has been consumed by queueing it on
16618 * ipx_pending_mp and the ioctl will complete in ip_rput.
16620 * NOTE : If ppa is UINT_MAX, we assign the next valid ppa and return
16621 * the new name and new ppa in lifr_name and lifr_ppa respectively.
16622 * For SLIFNAME, we pass these values back to the userland.
16624 static int
16625 ipif_set_values(queue_t *q, mblk_t *mp, char *interf_name, uint_t *new_ppa_ptr)
16627 ill_t *ill;
16628 ipif_t *ipif;
16629 ipsq_t *ipsq;
16630 char *ppa_ptr;
16631 char *old_ptr;
16632 char old_char;
16633 int error;
16634 ip_stack_t *ipst;
16636 ip1dbg(("ipif_set_values: interface %s\n", interf_name));
16637 ASSERT(q->q_next != NULL);
16638 ASSERT(interf_name != NULL);
16640 ill = (ill_t *)q->q_ptr;
16641 ipst = ill->ill_ipst;
16643 ASSERT(ill->ill_ipst != NULL);
16644 ASSERT(ill->ill_name[0] == '\0');
16645 ASSERT(IAM_WRITER_ILL(ill));
16646 ASSERT((mi_strlen(interf_name) + 1) <= LIFNAMSIZ);
16647 ASSERT(ill->ill_ppa == UINT_MAX);
16649 ill->ill_defend_start = ill->ill_defend_count = 0;
16650 /* The ppa is sent down by ifconfig or is chosen */
16651 if ((ppa_ptr = ill_get_ppa_ptr(interf_name)) == NULL) {
16652 return (EINVAL);
16656 * Make sure the ppa passed in is the same as the ppa in the name.
16657 * This check is not made when ppa == UINT_MAX; in that case the ppa
16658 * in the name could be anything. The system will choose a ppa and
16659 * update new_ppa_ptr and interf_name to contain the chosen ppa.
16661 if (*new_ppa_ptr != UINT_MAX) {
16662 /* stoi changes the pointer */
16663 old_ptr = ppa_ptr;
16665 * ifconfig passed in 0 for the ppa for DLPI 1 style devices
16666 * (they don't have an externally visible ppa). We assign one
16667 * here so that we can manage the interface. Note that in
16668 * the past this value was always 0 for DLPI 1 drivers.
16670 if (*new_ppa_ptr == 0)
16671 *new_ppa_ptr = stoi(&old_ptr);
16672 else if (*new_ppa_ptr != (uint_t)stoi(&old_ptr))
16673 return (EINVAL);
16676 * Terminate the string before the ppa and
16677 * save the char at that location.
16679 old_char = ppa_ptr[0];
16680 ppa_ptr[0] = '\0';
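/*
 * For example (a hypothetical name): with interf_name "ce3", ppa_ptr
 * points at "3", stoi() yields a ppa of 3, and the store above leaves the
 * buffer holding "ce" until old_char is restored further down.
 */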
16682 ill->ill_ppa = *new_ppa_ptr;
16684 * Finish as much work now as possible before calling ill_glist_insert
16685 * which makes the ill globally visible and also merges it with the
16686 * other protocol instance of this phyint. The remaining work is
16687 * done after entering the ipsq which may happen sometime later.
16689 ipif = ill->ill_ipif;
16691 /* We didn't do this when we allocated ipif in ip_ll_subnet_defaults */
16692 ipif_assign_seqid(ipif);
16694 if (!(ill->ill_flags & (ILLF_IPV4|ILLF_IPV6)))
16695 ill->ill_flags |= ILLF_IPV4;
16697 ASSERT(ipif->ipif_next == NULL); /* Only one ipif on ill */
16698 ASSERT((ipif->ipif_flags & IPIF_UP) == 0);
16700 if (ill->ill_flags & ILLF_IPV6) {
16702 ill->ill_isv6 = B_TRUE;
16703 ill_set_inputfn(ill);
16704 if (ill->ill_rq != NULL) {
16705 ill->ill_rq->q_qinfo = &iprinitv6;
16708 /* Keep the !IN6_IS_ADDR_V4MAPPED assertions happy */
16709 ipif->ipif_v6lcl_addr = ipv6_all_zeros;
16710 ipif->ipif_v6subnet = ipv6_all_zeros;
16711 ipif->ipif_v6net_mask = ipv6_all_zeros;
16712 ipif->ipif_v6brd_addr = ipv6_all_zeros;
16713 ipif->ipif_v6pp_dst_addr = ipv6_all_zeros;
16714 ill->ill_reachable_retrans_time = ND_RETRANS_TIMER;
16716 * Point-to-point or non-multicast-capable
16717 * interfaces won't do NUD unless explicitly
16718 * configured to do so.
16720 if (ipif->ipif_flags & IPIF_POINTOPOINT ||
16721 !(ill->ill_flags & ILLF_MULTICAST)) {
16722 ill->ill_flags |= ILLF_NONUD;
16724 /* Make sure the IPv4-specific flag is not set on an IPv6 interface */
16725 if (ill->ill_flags & ILLF_NOARP) {
16727 * Note: xresolv interfaces will eventually need
16728 * NOARP set here as well, but that will require
16729 * those external resolvers to have some
16730 * knowledge of that flag and act appropriately.
16731 * Not to be changed at present.
16733 ill->ill_flags &= ~ILLF_NOARP;
16736 * Set the ILLF_ROUTER flag according to the global
16737 * IPv6 forwarding policy.
16739 if (ipst->ips_ipv6_forwarding != 0)
16740 ill->ill_flags |= ILLF_ROUTER;
16741 } else if (ill->ill_flags & ILLF_IPV4) {
16742 ill->ill_isv6 = B_FALSE;
16743 ill_set_inputfn(ill);
16744 ill->ill_reachable_retrans_time = ARP_RETRANS_TIMER;
16745 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6lcl_addr);
16746 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6subnet);
16747 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6net_mask);
16748 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6brd_addr);
16749 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6pp_dst_addr);
16751 * Set the ILLF_ROUTER flag according to the global
16752 * IPv4 forwarding policy.
16754 if (ipst->ips_ip_forwarding != 0)
16755 ill->ill_flags |= ILLF_ROUTER;
16758 ASSERT(ill->ill_phyint != NULL);
16761 * The ipIfStatsIfindex and ipv6IfIcmpIfIndex assignments will
16762 * be completed in ill_glist_insert -> ill_phyint_reinit
16764 if (!ill_allocate_mibs(ill))
16765 return (ENOMEM);
16768 * Pick a default sap until we get the DL_INFO_ACK back from
16769 * the driver.
16771 ill->ill_sap = (ill->ill_isv6) ? ill->ill_media->ip_m_ipv6sap :
16772 ill->ill_media->ip_m_ipv4sap;
16774 ill->ill_ifname_pending = 1;
16775 ill->ill_ifname_pending_err = 0;
16778 * When the first ipif comes up in ipif_up_done(), multicast groups
16779 * that were joined while this ill was not bound to the DLPI link need
16780 * to be recovered by ill_recover_multicast().
16782 ill->ill_need_recover_multicast = 1;
16784 ill_refhold(ill);
16785 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
16786 if ((error = ill_glist_insert(ill, interf_name,
16787 (ill->ill_flags & ILLF_IPV6) == ILLF_IPV6)) > 0) {
16788 ill->ill_ppa = UINT_MAX;
16789 ill->ill_name[0] = '\0';
16791 * undo null termination done above.
16793 ppa_ptr[0] = old_char;
16794 rw_exit(&ipst->ips_ill_g_lock);
16795 ill_refrele(ill);
16796 return (error);
16799 ASSERT(ill->ill_name_length <= LIFNAMSIZ);
16802 * When we return, the buffer pointed to by interf_name should contain
16803 * the same name as in ill_name.
16804 * If a ppa was chosen by the system (the ppa passed in was UINT_MAX),
16805 * the buffer pointed to by new_ppa_ptr would not contain the right ppa,
16806 * so copy the full name and update the ppa ptr.
16807 * When the ppa passed in != UINT_MAX, all values are correct; just undo
16808 * the null termination, which saves a bcopy.
16810 if (*new_ppa_ptr == UINT_MAX) {
16811 bcopy(ill->ill_name, interf_name, ill->ill_name_length);
16812 *new_ppa_ptr = ill->ill_ppa;
16813 } else {
16815 * undo null termination done above.
16817 ppa_ptr[0] = old_char;
16820 /* Let SCTP know about this ILL */
16821 sctp_update_ill(ill, SCTP_ILL_INSERT);
16824 * ill_glist_insert has made the ill visible globally, and
16825 * ill_phyint_reinit could have changed the ipsq. At this point,
16826 * we need to hold the ips_ill_g_lock across the call to enter the
16827 * ipsq to enforce atomicity and prevent reordering. In the event
16828 * the ipsq has changed, and if the new ipsq is currently busy,
16829 * we need to make sure that this half-completed ioctl is ahead of
16830 * any subsequent ioctl. We achieve this by not dropping the
16831 * ips_ill_g_lock which prevents any ill lookup itself thereby
16832 * ensuring that new ioctls can't start.
16834 ipsq = ipsq_try_enter_internal(ill, q, mp, ip_reprocess_ioctl, NEW_OP,
16835 B_TRUE);
16837 rw_exit(&ipst->ips_ill_g_lock);
16838 ill_refrele(ill);
16839 if (ipsq == NULL)
16840 return (EINPROGRESS);
16843 * If ill_phyint_reinit() changed our ipsq, then start on the new ipsq.
16845 if (ipsq->ipsq_xop->ipx_current_ipif == NULL)
16846 ipsq_current_start(ipsq, ipif, SIOCSLIFNAME);
16847 else
16848 ASSERT(ipsq->ipsq_xop->ipx_current_ipif == ipif);
16850 error = ipif_set_values_tail(ill, ipif, mp, q);
16851 ipsq_exit(ipsq);
16852 if (error != 0 && error != EINPROGRESS) {
16854 * restore previous values
16856 ill->ill_isv6 = B_FALSE;
16857 ill_set_inputfn(ill);
16859 return (error);
16862 void
16863 ipif_init(ip_stack_t *ipst)
16865 int i;
16867 for (i = 0; i < MAX_G_HEADS; i++) {
16868 ipst->ips_ill_g_heads[i].ill_g_list_head =
16869 (ill_if_t *)&ipst->ips_ill_g_heads[i];
16870 ipst->ips_ill_g_heads[i].ill_g_list_tail =
16871 (ill_if_t *)&ipst->ips_ill_g_heads[i];
16874 avl_create(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
16875 ill_phyint_compare_index,
16876 sizeof (phyint_t),
16877 offsetof(struct phyint, phyint_avl_by_index));
16878 avl_create(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
16879 ill_phyint_compare_name,
16880 sizeof (phyint_t),
16881 offsetof(struct phyint, phyint_avl_by_name));
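/*
 * A minimal sketch of how these trees are consumed (mirroring the lookups
 * earlier in this file):
 *
 *	avl_index_t where;
 *	phyint_t *phyi;
 *
 *	phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
 *	    "hme0", &where);
 *
 * avl_find() returns the matching phyint, or NULL while filling in
 * `where' with the insertion point for a later avl_insert().
 */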
16885 * Save enough information so that we can recreate the IRE if
16886 * the interface goes down and then up.
16888 void
16889 ill_save_ire(ill_t *ill, ire_t *ire)
16891 mblk_t *save_mp;
16893 save_mp = allocb(sizeof (ifrt_t), BPRI_MED);
16894 if (save_mp != NULL) {
16895 ifrt_t *ifrt;
16897 save_mp->b_wptr += sizeof (ifrt_t);
16898 ifrt = (ifrt_t *)save_mp->b_rptr;
16899 bzero(ifrt, sizeof (ifrt_t));
16900 ifrt->ifrt_type = ire->ire_type;
16901 if (ire->ire_ipversion == IPV4_VERSION) {
16902 ASSERT(!ill->ill_isv6);
16903 ifrt->ifrt_addr = ire->ire_addr;
16904 ifrt->ifrt_gateway_addr = ire->ire_gateway_addr;
16905 ifrt->ifrt_setsrc_addr = ire->ire_setsrc_addr;
16906 ifrt->ifrt_mask = ire->ire_mask;
16907 } else {
16908 ASSERT(ill->ill_isv6);
16909 ifrt->ifrt_v6addr = ire->ire_addr_v6;
16910 /* ire_gateway_addr_v6 can change due to RTM_CHANGE */
16911 mutex_enter(&ire->ire_lock);
16912 ifrt->ifrt_v6gateway_addr = ire->ire_gateway_addr_v6;
16913 mutex_exit(&ire->ire_lock);
16914 ifrt->ifrt_v6setsrc_addr = ire->ire_setsrc_addr_v6;
16915 ifrt->ifrt_v6mask = ire->ire_mask_v6;
16917 ifrt->ifrt_flags = ire->ire_flags;
16918 ifrt->ifrt_zoneid = ire->ire_zoneid;
16919 mutex_enter(&ill->ill_saved_ire_lock);
16920 save_mp->b_cont = ill->ill_saved_ire_mp;
16921 ill->ill_saved_ire_mp = save_mp;
16922 ill->ill_saved_ire_cnt++;
16923 mutex_exit(&ill->ill_saved_ire_lock);
16928 * Remove one entry from ill_saved_ire_mp.
16930 void
16931 ill_remove_saved_ire(ill_t *ill, ire_t *ire)
16933 mblk_t **mpp;
16934 mblk_t *mp;
16935 ifrt_t *ifrt;
16937 /* Remove from ill_saved_ire_mp list if it is there */
16938 mutex_enter(&ill->ill_saved_ire_lock);
16939 for (mpp = &ill->ill_saved_ire_mp; *mpp != NULL;
16940 mpp = &(*mpp)->b_cont) {
16941 in6_addr_t gw_addr_v6;
16944 * On a given ill, the tuple of address, gateway, mask,
16945 * ire_type, and zoneid is unique for each saved IRE.
16947 mp = *mpp;
16948 ifrt = (ifrt_t *)mp->b_rptr;
16949 /* ire_gateway_addr_v6 can change - need lock */
16950 mutex_enter(&ire->ire_lock);
16951 gw_addr_v6 = ire->ire_gateway_addr_v6;
16952 mutex_exit(&ire->ire_lock);
16954 if (ifrt->ifrt_zoneid != ire->ire_zoneid ||
16955 ifrt->ifrt_type != ire->ire_type)
16956 continue;
16958 if (ill->ill_isv6 ?
16959 (IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6addr,
16960 &ire->ire_addr_v6) &&
16961 IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6gateway_addr,
16962 &gw_addr_v6) &&
16963 IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6mask,
16964 &ire->ire_mask_v6)) :
16965 (ifrt->ifrt_addr == ire->ire_addr &&
16966 ifrt->ifrt_gateway_addr == ire->ire_gateway_addr &&
16967 ifrt->ifrt_mask == ire->ire_mask)) {
16968 *mpp = mp->b_cont;
16969 ill->ill_saved_ire_cnt--;
16970 freeb(mp);
16971 break;
16974 mutex_exit(&ill->ill_saved_ire_lock);
16978 * Derive an interface id from the link layer address.
16979 * Knows about IEEE 802 and IEEE EUI-64 mappings.
16981 static void
16982 ip_ether_v6intfid(ill_t *ill, in6_addr_t *v6addr)
16984 char *addr;
16987 * Note that some IPv6 interfaces get plumbed over links that claim to
16988 * be DL_ETHER, but don't actually have Ethernet MAC addresses (e.g.
16989 * PPP links). The ETHERADDRL check here ensures that we only set the
16990 * interface ID on IPv6 interfaces above links that actually have real
16991 * Ethernet addresses.
16993 if (ill->ill_phys_addr_length == ETHERADDRL) {
16994 /* Form EUI-64 like address */
16995 addr = (char *)&v6addr->s6_addr32[2];
16996 bcopy(ill->ill_phys_addr, addr, 3);
16997 addr[0] ^= 0x2; /* Toggle Universal/Local bit */
16998 addr[3] = (char)0xff;
16999 addr[4] = (char)0xfe;
17000 bcopy(ill->ill_phys_addr + 3, addr + 5, 3);
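/*
 * A worked example, for illustration: the MAC address 0:14:4f:24:68:ac
 * yields the interface ID 02:14:4f:ff:fe:24:68:ac -- 0xfffe is inserted
 * in the middle and the universal/local bit is toggled -- so a link-local
 * address formed from it would be fe80::214:4fff:fe24:68ac.
 */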
17004 /* ARGSUSED */
17005 static void
17006 ip_nodef_v6intfid(ill_t *ill, in6_addr_t *v6addr)
17010 typedef struct ipmp_ifcookie {
17011 uint32_t ic_hostid;
17012 char ic_ifname[LIFNAMSIZ];
17013 char ic_zonename[ZONENAME_MAX];
17014 } ipmp_ifcookie_t;
17017 * Construct a pseudo-random interface ID for the IPMP interface that's both
17018 * predictable and (almost) guaranteed to be unique.
17020 static void
17021 ip_ipmp_v6intfid(ill_t *ill, in6_addr_t *v6addr)
17023 zone_t *zp;
17024 uint8_t *addr;
17025 uchar_t hash[16];
17026 ulong_t hostid;
17027 MD5_CTX ctx;
17028 ipmp_ifcookie_t ic = { 0 };
17030 ASSERT(IS_IPMP(ill));
17032 (void) ddi_strtoul(hw_serial, NULL, 10, &hostid);
17033 ic.ic_hostid = htonl((uint32_t)hostid);
17035 (void) strlcpy(ic.ic_ifname, ill->ill_name, LIFNAMSIZ);
17037 if ((zp = zone_find_by_id(ill->ill_zoneid)) != NULL) {
17038 (void) strlcpy(ic.ic_zonename, zp->zone_name, ZONENAME_MAX);
17039 zone_rele(zp);
17042 MD5Init(&ctx);
17043 MD5Update(&ctx, &ic, sizeof (ic));
17044 MD5Final(hash, &ctx);
17047 * Map the hash to an interface ID per the basic approach in RFC3041.
17049 addr = &v6addr->s6_addr8[8];
17050 bcopy(hash + 8, addr, sizeof (uint64_t));
17051 addr[0] &= ~0x2; /* clear universal/local bit: the ID is local */
17055 * Map the multicast in6_addr_t in m_ip6addr to the physaddr for ethernet.
17057 static void
17058 ip_ether_v6_mapping(ill_t *ill, uchar_t *m_ip6addr, uchar_t *m_physaddr)
17060 phyint_t *phyi = ill->ill_phyint;
17063 * Check PHYI_MULTI_BCAST and length of physical
17064 * address to determine if we use the mapping or the
17065 * broadcast address.
17067 if ((phyi->phyint_flags & PHYI_MULTI_BCAST) != 0 ||
17068 ill->ill_phys_addr_length != ETHERADDRL) {
17069 ip_mbcast_mapping(ill, m_ip6addr, m_physaddr);
17070 return;
17072 m_physaddr[0] = 0x33;
17073 m_physaddr[1] = 0x33;
17074 m_physaddr[2] = m_ip6addr[12];
17075 m_physaddr[3] = m_ip6addr[13];
17076 m_physaddr[4] = m_ip6addr[14];
17077 m_physaddr[5] = m_ip6addr[15];
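/*
 * For example: the solicited-node group ff02::1:ff24:68ac maps to the
 * Ethernet address 33:33:ff:24:68:ac. Only the low 32 bits of the IPv6
 * group survive, so distinct groups can share one link-layer address.
 */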
17081 * Map the multicast ipaddr_t in m_ipaddr to the physaddr for ethernet.
17083 static void
17084 ip_ether_v4_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr)
17086 phyint_t *phyi = ill->ill_phyint;
17089 * Check PHYI_MULTI_BCAST and length of physical
17090 * address to determine if we use the mapping or the
17091 * broadcast address.
17093 if ((phyi->phyint_flags & PHYI_MULTI_BCAST) != 0 ||
17094 ill->ill_phys_addr_length != ETHERADDRL) {
17095 ip_mbcast_mapping(ill, m_ipaddr, m_physaddr);
17096 return;
17098 m_physaddr[0] = 0x01;
17099 m_physaddr[1] = 0x00;
17100 m_physaddr[2] = 0x5e;
17101 m_physaddr[3] = m_ipaddr[1] & 0x7f;
17102 m_physaddr[4] = m_ipaddr[2];
17103 m_physaddr[5] = m_ipaddr[3];
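/*
 * For example: 224.1.2.3 maps to 01:00:5e:01:02:03. Since only the low
 * 23 bits of the group are used (note the m_ipaddr[1] & 0x7f above), 32
 * IPv4 groups -- e.g. 224.1.2.3 and 225.129.2.3 -- share each Ethernet
 * multicast address.
 */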
17106 /* ARGSUSED */
17107 static void
17108 ip_mbcast_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr)
17111 * for the MULTI_BCAST case and other cases when we want to
17112 * use the link-layer broadcast address for multicast.
17114 uint8_t *bphys_addr;
17115 dl_unitdata_req_t *dlur;
17117 dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr;
17118 if (ill->ill_sap_length < 0) {
17119 bphys_addr = (uchar_t *)dlur +
17120 dlur->dl_dest_addr_offset;
17121 } else {
17122 bphys_addr = (uchar_t *)dlur +
17123 dlur->dl_dest_addr_offset + ill->ill_sap_length;
17126 bcopy(bphys_addr, m_physaddr, ill->ill_phys_addr_length);
17130 * Derive IPoIB interface id from the link layer address.
17132 static void
17133 ip_ib_v6intfid(ill_t *ill, in6_addr_t *v6addr)
17135 char *addr;
17137 ASSERT(ill->ill_phys_addr_length == 20);
17138 addr = (char *)&v6addr->s6_addr32[2];
17139 bcopy(ill->ill_phys_addr + 12, addr, 8);
17141 * In IBA 1.1 timeframe, some vendors erroneously set the u/l bit
17142 * in the globally assigned EUI-64 GUID to 1, in violation of IEEE
17143 * rules. In these cases, the IBA considers these GUIDs to be in
17144 * "Modified EUI-64" format, and thus toggling the u/l bit is not
17145 * required; vendors are required not to assign global EUI-64's
17146 * that differ only in u/l bit values, thus guaranteeing uniqueness
17147 * of the interface identifier. Whether the GUID is in modified
17148 * or proper EUI-64 format, the ipv6 identifier must have the u/l
17149 * bit set to 1.
17151 addr[0] |= 2; /* Set Universal/Local bit to 1 */
17155 * Map the multicast ipaddr_t in m_ipaddr to the physaddr for InfiniBand.
17156 * Note on mapping from multicast IP addresses to IPoIB multicast link
17157 * addresses. IPoIB multicast link addresses are based on IBA link addresses.
17158 * The format of an IPoIB multicast address is:
17160 * 4 byte QPN Scope Sign. Pkey
17161 * +--------------------------------------------+
17162 * | 00FFFFFF | FF | 1X | X01B | Pkey | GroupID |
17163 * +--------------------------------------------+
17165 * The Scope and Pkey components are properties of the IBA port and
17166 * network interface. They can be ascertained from the broadcast address.
17167 * The Sign. part is the signature, and is 401B for IPv4 and 601B for IPv6.
17169 static void
17170 ip_ib_v4_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr)
17172 static uint8_t ipv4_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff,
17173 0xff, 0x10, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
17174 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
17175 uint8_t *bphys_addr;
17176 dl_unitdata_req_t *dlur;
17178 bcopy(ipv4_g_phys_ibmulti_addr, m_physaddr, ill->ill_phys_addr_length);
17181 * RFC 4391: the IPv4 MGID is 28 bits long.
17183 m_physaddr[16] = m_ipaddr[0] & 0x0f;
17184 m_physaddr[17] = m_ipaddr[1];
17185 m_physaddr[18] = m_ipaddr[2];
17186 m_physaddr[19] = m_ipaddr[3];
17189 dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr;
17190 if (ill->ill_sap_length < 0) {
17191 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset;
17192 } else {
17193 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset +
17194 ill->ill_sap_length;
17197 * Now fill in the IBA scope/Pkey values from the broadcast address.
17199 m_physaddr[5] = bphys_addr[5];
17200 m_physaddr[8] = bphys_addr[8];
17201 m_physaddr[9] = bphys_addr[9];
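/*
 * For example: the group 239.255.1.2 contributes the 28-bit MGID suffix
 * 0f:ff:01:02 in m_physaddr[16..19], while bytes 6 and 7 of the template
 * above (0x40, 0x1b) carry the 401B IPv4 signature described earlier.
 */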
17204 static void
17205 ip_ib_v6_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr)
17207 static uint8_t ipv6_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff,
17208 0xff, 0x10, 0x60, 0x1b, 0x00, 0x00, 0x00, 0x00,
17209 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
17210 uint8_t *bphys_addr;
17211 dl_unitdata_req_t *dlur;
17213 bcopy(ipv6_g_phys_ibmulti_addr, m_physaddr, ill->ill_phys_addr_length);
17216 * RFC 4391: the IPv6 MGID is 80 bits long.
17218 bcopy(&m_ipaddr[6], &m_physaddr[10], 10);
17220 dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr;
17221 if (ill->ill_sap_length < 0) {
17222 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset;
17223 } else {
17224 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset +
17225 ill->ill_sap_length;
17228 * Now fill in the IBA scope/Pkey values from the broadcast address.
17230 m_physaddr[5] = bphys_addr[5];
17231 m_physaddr[8] = bphys_addr[8];
17232 m_physaddr[9] = bphys_addr[9];
17236 * Derive IPv6 interface id from an IPv4 link-layer address (e.g. from an IPv4
17237 * tunnel). The IPv4 address simply gets placed in the lower 4 bytes of the
17238 * IPv6 interface id. This is a suggested mechanism described in section 3.7
17239 * of RFC4213.
17241 static void
17242 ip_ipv4_genv6intfid(ill_t *ill, uint8_t *physaddr, in6_addr_t *v6addr)
17244 ASSERT(ill->ill_phys_addr_length == sizeof (ipaddr_t));
17245 v6addr->s6_addr32[2] = 0;
17246 bcopy(physaddr, &v6addr->s6_addr32[3], sizeof (ipaddr_t));
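/*
 * For example: a tunnel source address of 192.0.2.1 produces the
 * interface ID ::c000:201 in the low 8 bytes, so a link-local address
 * over that tunnel would be fe80::c000:201 (per RFC 4213 section 3.7).
 */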
17250 * Derive IPv6 interface id from an IPv6 link-layer address (e.g. from an IPv6
17251 * tunnel). The lower 8 bytes of the IPv6 address simply become the interface
17252 * id.
17254 static void
17255 ip_ipv6_genv6intfid(ill_t *ill, uint8_t *physaddr, in6_addr_t *v6addr)
17257 in6_addr_t *v6lladdr = (in6_addr_t *)physaddr;
17259 ASSERT(ill->ill_phys_addr_length == sizeof (in6_addr_t));
17260 bcopy(&v6lladdr->s6_addr32[2], &v6addr->s6_addr32[2], 8);
17263 static void
17264 ip_ipv6_v6intfid(ill_t *ill, in6_addr_t *v6addr)
17266 ip_ipv6_genv6intfid(ill, ill->ill_phys_addr, v6addr);
17269 static void
17270 ip_ipv6_v6destintfid(ill_t *ill, in6_addr_t *v6addr)
17272 ip_ipv6_genv6intfid(ill, ill->ill_dest_addr, v6addr);
17275 static void
17276 ip_ipv4_v6intfid(ill_t *ill, in6_addr_t *v6addr)
17278 ip_ipv4_genv6intfid(ill, ill->ill_phys_addr, v6addr);
17281 static void
17282 ip_ipv4_v6destintfid(ill_t *ill, in6_addr_t *v6addr)
17284 ip_ipv4_genv6intfid(ill, ill->ill_dest_addr, v6addr);
17288 * Look up an ill and verify that the zoneid has an ipif on that ill.
17289 * Returns a held ill, or NULL.
17291 ill_t *
17292 ill_lookup_on_ifindex_zoneid(uint_t index, zoneid_t zoneid, boolean_t isv6,
17293 ip_stack_t *ipst)
17295 ill_t *ill;
17296 ipif_t *ipif;
17298 ill = ill_lookup_on_ifindex(index, isv6, ipst);
17299 if (ill == NULL)
17300 return (NULL);
17302 mutex_enter(&ill->ill_lock);
17303 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
17304 if (IPIF_IS_CONDEMNED(ipif))
17305 continue;
17306 if (zoneid != ALL_ZONES && ipif->ipif_zoneid != zoneid &&
17307 ipif->ipif_zoneid != ALL_ZONES)
17308 continue;
17310 mutex_exit(&ill->ill_lock);
17311 return (ill);
17313 mutex_exit(&ill->ill_lock);
17314 ill_refrele(ill);
17315 return (NULL);
17319 * Return a pointer to an ipif_t given a combination of (ill_idx, ipif_id).
17320 * If a pointer to an ipif_t is returned then the caller will need to do
17321 * an ipif_refrele().
17323 ipif_t *
17324 ipif_getby_indexes(uint_t ifindex, uint_t lifidx, boolean_t isv6,
17325 ip_stack_t *ipst)
17327 ipif_t *ipif;
17328 ill_t *ill;
17330 ill = ill_lookup_on_ifindex(ifindex, isv6, ipst);
17331 if (ill == NULL)
17332 return (NULL);
17334 mutex_enter(&ill->ill_lock);
17335 if (ill->ill_state_flags & ILL_CONDEMNED) {
17336 mutex_exit(&ill->ill_lock);
17337 ill_refrele(ill);
17338 return (NULL);
17341 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
17342 if (!IPIF_CAN_LOOKUP(ipif))
17343 continue;
17344 if (lifidx == ipif->ipif_id) {
17345 ipif_refhold_locked(ipif);
17346 break;
17350 mutex_exit(&ill->ill_lock);
17351 ill_refrele(ill);
17352 return (ipif);
17356 * Set ill_inputfn based on the current known state.
17357 * This needs to be called when any of the factors taken into
17358 * account changes.
17360 void
17361 ill_set_inputfn(ill_t *ill)
17363 ip_stack_t *ipst = ill->ill_ipst;
17365 if (ill->ill_isv6) {
17366 ill->ill_inputfn = ill_input_short_v6;
17367 } else {
17368 if (ill->ill_dhcpinit != 0)
17369 ill->ill_inputfn = ill_input_full_v4;
17370 else if (ipst->ips_ipcl_proto_fanout_v4[IPPROTO_RSVP].connf_head
17371 != NULL)
17372 ill->ill_inputfn = ill_input_full_v4;
17373 else
17374 ill->ill_inputfn = ill_input_short_v4;
17379 * Re-evaluate ill_inputfn for all the IPv4 ills.
17380 * Used when RSVP comes and goes.
17382 void
17383 ill_set_inputfn_all(ip_stack_t *ipst)
17385 ill_walk_context_t ctx;
17386 ill_t *ill;
17388 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
17389 ill = ILL_START_WALK_V4(&ctx, ipst);
17390 for (; ill != NULL; ill = ill_next(&ctx, ill))
17391 ill_set_inputfn(ill);
17393 rw_exit(&ipst->ips_ill_g_lock);
17395
17396 /*
17397 * Set the physical address information for `ill' to the contents of the
17398 * dl_notify_ind_t pointed to by `mp'. Must be called as writer, and will be
17399 * asynchronous if `ill' cannot immediately be quiesced -- in which case
17400 * EINPROGRESS will be returned.
17401 */
17402 int
17403 ill_set_phys_addr(ill_t *ill, mblk_t *mp)
17404 {
17405 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq;
17406 dl_notify_ind_t *dlindp = (dl_notify_ind_t *)mp->b_rptr;
17408 ASSERT(IAM_WRITER_IPSQ(ipsq));
17410 if (dlindp->dl_data != DL_IPV6_LINK_LAYER_ADDR &&
17411 dlindp->dl_data != DL_CURR_DEST_ADDR &&
17412 dlindp->dl_data != DL_CURR_PHYS_ADDR) {
17413 /* Changing DL_IPV6_TOKEN is not yet supported */
17414 return (0);
17418 * We need to store up to two copies of `mp' in `ill'. Due to the
17419 * design of ipsq_pending_mp_add(), we can't pass them as separate
17420 * arguments to ill_set_phys_addr_tail(). Instead, chain them
17421 * together here, then pull 'em apart in ill_set_phys_addr_tail().
17423 if ((mp = copyb(mp)) == NULL || (mp->b_cont = copyb(mp)) == NULL) {
17424 freemsg(mp);
17425 return (ENOMEM);
17428 ipsq_current_start(ipsq, ill->ill_ipif, 0);
17431 * Since we'll only do a logical down, we can't rely on ipif_down
17432 * to turn on ILL_DOWN_IN_PROGRESS, or for the DL_BIND_ACK to reset
17433 * ILL_DOWN_IN_PROGRESS. We instead manage this separately for this
17434 * case, to quiesce ire's and nce's for ill_is_quiescent.
17436 mutex_enter(&ill->ill_lock);
17437 ill->ill_state_flags |= ILL_DOWN_IN_PROGRESS;
17438 /* no more ire/nce addition allowed */
17439 mutex_exit(&ill->ill_lock);
17442 * If we can quiesce the ill, then set the address. If not, then
17443 * ill_set_phys_addr_tail() will be called from ipif_ill_refrele_tail().
17445 ill_down_ipifs(ill, B_TRUE);
17446 mutex_enter(&ill->ill_lock);
17447 if (!ill_is_quiescent(ill)) {
17448 /* call cannot fail since `conn_t *' argument is NULL */
17449 (void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq,
17450 mp, ILL_DOWN);
17451 mutex_exit(&ill->ill_lock);
17452 return (EINPROGRESS);
17454 mutex_exit(&ill->ill_lock);
17456 ill_set_phys_addr_tail(ipsq, ill->ill_rq, mp, NULL);
17457 return (0);
17458 }
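/*
 * For illustration, a compressed sketch of the mblk chaining used above:
 * two copies of the notification travel as a single b_cont chain and are
 * pulled apart again in ill_set_phys_addr_tail(). NULL checks elided.
 */
#if 0
	mblk_t *chain, *second;

	chain = copyb(mp);		/* first copy of the notification */
	chain->b_cont = copyb(chain);	/* second copy, linked via b_cont */
	/* ... later, in ill_set_phys_addr_tail() ... */
	second = unlinkb(chain);	/* detach the second copy */
#endif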
17459
17460 /*
17461 * When the allowed-ips link property is set on the datalink, IP receives a
17462 * DL_NOTE_ALLOWED_IPS notification that is processed in ill_set_allowed_ips()
17463 * to initialize the ill_allowed_ips[] array in the ill_t. This array is then
17464 * used to vet addresses passed to ip_sioctl_addr() and to ensure that the
17465 * only IP addresses configured on the ill_t are those in the ill_allowed_ips[]
17466 * array.
17467 */
17468 void
17469 ill_set_allowed_ips(ill_t *ill, mblk_t *mp)
17471 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq;
17472 dl_notify_ind_t *dlip = (dl_notify_ind_t *)mp->b_rptr;
17473 mac_protect_t *mrp;
17474 int i;
17476 ASSERT(IAM_WRITER_IPSQ(ipsq));
17477 mrp = (mac_protect_t *)&dlip[1];
17479 if (mrp->mp_ipaddrcnt == 0) { /* reset allowed-ips */
17480 kmem_free(ill->ill_allowed_ips,
17481 ill->ill_allowed_ips_cnt * sizeof (in6_addr_t));
17482 ill->ill_allowed_ips_cnt = 0;
17483 ill->ill_allowed_ips = NULL;
17484 mutex_enter(&ill->ill_phyint->phyint_lock);
17485 ill->ill_phyint->phyint_flags &= ~PHYI_L3PROTECT;
17486 mutex_exit(&ill->ill_phyint->phyint_lock);
17487 return;
17490 if (ill->ill_allowed_ips != NULL) {
17491 kmem_free(ill->ill_allowed_ips,
17492 ill->ill_allowed_ips_cnt * sizeof (in6_addr_t));
17494 ill->ill_allowed_ips_cnt = mrp->mp_ipaddrcnt;
17495 ill->ill_allowed_ips = kmem_alloc(
17496 ill->ill_allowed_ips_cnt * sizeof (in6_addr_t), KM_SLEEP);
17497 for (i = 0; i < mrp->mp_ipaddrcnt; i++)
17498 ill->ill_allowed_ips[i] = mrp->mp_ipaddrs[i].ip_addr;
17500 mutex_enter(&ill->ill_phyint->phyint_lock);
17501 ill->ill_phyint->phyint_flags |= PHYI_L3PROTECT;
17502 mutex_exit(&ill->ill_phyint->phyint_lock);
17503 }
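/*
 * For illustration, the kind of vetting check ip_sioctl_addr() can apply
 * against ill_allowed_ips[]; this helper is hypothetical, not part of
 * this file.
 */
#if 0
static boolean_t
ill_addr_is_allowed(ill_t *ill, const in6_addr_t *addr)
{
	uint_t i;

	/* no allowed-ips property in force; any address is acceptable */
	if (ill->ill_allowed_ips_cnt == 0)
		return (B_TRUE);

	for (i = 0; i < ill->ill_allowed_ips_cnt; i++) {
		if (IN6_ARE_ADDR_EQUAL(&ill->ill_allowed_ips[i], addr))
			return (B_TRUE);
	}
	return (B_FALSE);
}
#endif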
17504
17505 /*
17506 * Once the ill associated with `q' has quiesced, set its physical address
17507 * information to the values in `addrmp'. Note that two copies of `addrmp'
17508 * are passed (linked by b_cont), since we sometimes need to save two distinct
17509 * copies in the ill_t, and our context doesn't permit sleeping or allocation
17510 * failure (we'll free the other copy if it's not needed). Since the ill_t
17511 * is quiesced, we know any stale nce's with the old address information have
17512 * already been removed, so we don't need to call nce_flush().
17513 */
17514 /* ARGSUSED */
17515 static void
17516 ill_set_phys_addr_tail(ipsq_t *ipsq, queue_t *q, mblk_t *addrmp, void *dummy)
17518 ill_t *ill = q->q_ptr;
17519 mblk_t *addrmp2 = unlinkb(addrmp);
17520 dl_notify_ind_t *dlindp = (dl_notify_ind_t *)addrmp->b_rptr;
17521 uint_t addrlen, addroff;
17522 int status;
17524 ASSERT(IAM_WRITER_IPSQ(ipsq));
17526 addroff = dlindp->dl_addr_offset;
17527 addrlen = dlindp->dl_addr_length - ABS(ill->ill_sap_length);
17529 switch (dlindp->dl_data) {
17530 case DL_IPV6_LINK_LAYER_ADDR:
17531 ill_set_ndmp(ill, addrmp, addroff, addrlen);
17532 freemsg(addrmp2);
17533 break;
17535 case DL_CURR_DEST_ADDR:
17536 freemsg(ill->ill_dest_addr_mp);
17537 ill->ill_dest_addr = addrmp->b_rptr + addroff;
17538 ill->ill_dest_addr_mp = addrmp;
17539 if (ill->ill_isv6) {
17540 ill_setdesttoken(ill);
17541 ipif_setdestlinklocal(ill->ill_ipif);
17543 freemsg(addrmp2);
17544 break;
17546 case DL_CURR_PHYS_ADDR:
17547 freemsg(ill->ill_phys_addr_mp);
17548 ill->ill_phys_addr = addrmp->b_rptr + addroff;
17549 ill->ill_phys_addr_mp = addrmp;
17550 ill->ill_phys_addr_length = addrlen;
17551 if (ill->ill_isv6)
17552 ill_set_ndmp(ill, addrmp2, addroff, addrlen);
17553 else
17554 freemsg(addrmp2);
17555 if (ill->ill_isv6) {
17556 ill_setdefaulttoken(ill);
17557 ipif_setlinklocal(ill->ill_ipif);
17559 break;
17560 default:
17561 ASSERT(0);
17565 * reset ILL_DOWN_IN_PROGRESS so that we can successfully add ires
17566 * as we bring the ipifs up again.
17568 mutex_enter(&ill->ill_lock);
17569 ill->ill_state_flags &= ~ILL_DOWN_IN_PROGRESS;
17570 mutex_exit(&ill->ill_lock);
17572 * If there are ipifs to bring up, ill_up_ipifs() will return
17573 * EINPROGRESS, and ipsq_current_finish() will be called by
17574 * ip_rput_dlpi_writer() or arp_bringup_done() when the last ipif is
17575 * brought up.
17577 status = ill_up_ipifs(ill, q, addrmp);
17578 if (status != EINPROGRESS)
17579 ipsq_current_finish(ipsq);
17583 * Helper routine for setting the ill_nd_lla fields.
17585 void
17586 ill_set_ndmp(ill_t *ill, mblk_t *ndmp, uint_t addroff, uint_t addrlen)
17588 freemsg(ill->ill_nd_lla_mp);
17589 ill->ill_nd_lla = ndmp->b_rptr + addroff;
17590 ill->ill_nd_lla_mp = ndmp;
17591 ill->ill_nd_lla_len = addrlen;
17592 }
17593
17594 /*
17595 * Replumb the ill.
17596 */
17597 int
17598 ill_replumb(ill_t *ill, mblk_t *mp)
17599 {
17600 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq;
17602 ASSERT(IAM_WRITER_IPSQ(ipsq));
17604 ipsq_current_start(ipsq, ill->ill_ipif, 0);
17607 * If we can quiesce the ill, then continue. If not, then
17608 * ill_replumb_tail() will be called from ipif_ill_refrele_tail().
17610 ill_down_ipifs(ill, B_FALSE);
17612 mutex_enter(&ill->ill_lock);
17613 if (!ill_is_quiescent(ill)) {
17614 /* call cannot fail since `conn_t *' argument is NULL */
17615 (void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq,
17616 mp, ILL_DOWN);
17617 mutex_exit(&ill->ill_lock);
17618 return (EINPROGRESS);
17620 mutex_exit(&ill->ill_lock);
17622 ill_replumb_tail(ipsq, ill->ill_rq, mp, NULL);
17623 return (0);
17626 /* ARGSUSED */
17627 static void
17628 ill_replumb_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy)
17630 ill_t *ill = q->q_ptr;
17631 int err;
17632 conn_t *connp = NULL;
17634 ASSERT(IAM_WRITER_IPSQ(ipsq));
17635 freemsg(ill->ill_replumb_mp);
17636 ill->ill_replumb_mp = copyb(mp);
17638 if (ill->ill_replumb_mp == NULL) {
17639 /* out of memory */
17640 ipsq_current_finish(ipsq);
17641 return;
17644 mutex_enter(&ill->ill_lock);
17645 ill->ill_up_ipifs = ipsq_pending_mp_add(NULL, ill->ill_ipif,
17646 ill->ill_rq, ill->ill_replumb_mp, 0);
17647 mutex_exit(&ill->ill_lock);
17649 if (!ill->ill_up_ipifs) {
17650 /* already closing */
17651 ipsq_current_finish(ipsq);
17652 return;
17654 ill->ill_replumbing = 1;
17655 err = ill_down_ipifs_tail(ill);
17656
17657 /*
17658 * Having successfully quiesced and brought down the interface, we now
17659 * send the DL_NOTE_REPLUMB_DONE message down to the driver. Reuse the
17660 * DL_NOTE_REPLUMB message.
17661 */
17662 mp = mexchange(NULL, mp, sizeof (dl_notify_conf_t), M_PROTO,
17663 DL_NOTIFY_CONF);
17664 ASSERT(mp != NULL);
17665 ((dl_notify_conf_t *)mp->b_rptr)->dl_notification =
17666 DL_NOTE_REPLUMB_DONE;
17667 ill_dlpi_send(ill, mp);
17670 * For IPv4, we would usually get EINPROGRESS because the ETHERTYPE_ARP
17671 * streams have to be unbound. When all the DLPI exchanges are done,
17672 * ipsq_current_finish() will be called by arp_bringup_done(). The
17673 * remainder of ipif bringup via ill_up_ipifs() will also be done in
17674 * arp_bringup_done().
17676 ASSERT(ill->ill_replumb_mp != NULL);
17677 if (err == EINPROGRESS)
17678 return;
17679 else
17680 ill->ill_replumb_mp = ipsq_pending_mp_get(ipsq, &connp);
17681 ASSERT(connp == NULL);
17682 if (err == 0 && ill->ill_replumb_mp != NULL &&
17683 ill_up_ipifs(ill, q, ill->ill_replumb_mp) == EINPROGRESS) {
17684 return;
17686 ipsq_current_finish(ipsq);
17690 * Issue ioctl `cmd' on `lh'; caller provides the initial payload in `buf'
17691 * which is `bufsize' bytes. On success, zero is returned and `buf' updated
17692 * as per the ioctl. On failure, an errno is returned.
17694 static int
17695 ip_ioctl(ldi_handle_t lh, int cmd, void *buf, uint_t bufsize, cred_t *cr)
17697 int rval;
17698 struct strioctl iocb;
17700 iocb.ic_cmd = cmd;
17701 iocb.ic_timout = 15;
17702 iocb.ic_len = bufsize;
17703 iocb.ic_dp = buf;
17705 return (ldi_ioctl(lh, I_STR, (intptr_t)&iocb, FKIOCTL, cr, &rval));
17706 }
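/*
 * For illustration, ip_ioctl() is the kernel (LDI) analogue of a plain
 * ioctl(2) issued from userland. An assumed userland equivalent of the
 * SIOCGLIFNUM call made below (headers and error handling omitted):
 */
#if 0
	struct lifnum lifn;
	int s = socket(AF_INET, SOCK_DGRAM, 0);

	bzero(&lifn, sizeof (lifn));
	lifn.lifn_family = AF_INET;
	if (ioctl(s, SIOCGLIFNUM, &lifn) < 0)
		perror("SIOCGLIFNUM");
#endif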
17707
17708 /*
17709 * Issue an SIOCGLIFCONF for address family `af' and store the result in
17710 * `lifcp', whose dynamically-allocated `lifc_buf' is `*bufsizep' bytes on success.
17711 */
17712 static int
17713 ip_lifconf_ioctl(ldi_handle_t lh, int af, struct lifconf *lifcp,
17714 uint_t *bufsizep, cred_t *cr)
17716 int err;
17717 struct lifnum lifn;
17719 bzero(&lifn, sizeof (lifn));
17720 lifn.lifn_family = af;
17721 lifn.lifn_flags = LIFC_UNDER_IPMP;
17723 if ((err = ip_ioctl(lh, SIOCGLIFNUM, &lifn, sizeof (lifn), cr)) != 0)
17724 return (err);
17727 * Pad the interface count to account for additional interfaces that
17728 * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF.
17730 lifn.lifn_count += 4;
17731 bzero(lifcp, sizeof (*lifcp));
17732 lifcp->lifc_flags = LIFC_UNDER_IPMP;
17733 lifcp->lifc_family = af;
17734 lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq);
17735 lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP);
17737 err = ip_ioctl(lh, SIOCGLIFCONF, lifcp, sizeof (*lifcp), cr);
17738 if (err != 0) {
17739 kmem_free(lifcp->lifc_buf, *bufsizep);
17740 return (err);
17743 return (0);
17744 }
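/*
 * For illustration, the allocation contract of ip_lifconf_ioctl(): on
 * success the caller owns `lifc_buf' and must free it using the returned
 * buffer size, as ip_ipmp_cleanup() does below. Hypothetical caller:
 */
#if 0
	struct lifconf lifc;
	uint_t bufsize;

	if (ip_lifconf_ioctl(lh, AF_INET, &lifc, &bufsize, cr) == 0) {
		/* lifc.lifc_len / sizeof (struct lifreq) entries are valid */
		kmem_free(lifc.lifc_buf, bufsize);
	}
#endif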
17745
17746 /*
17747 * Helper for ip_interface_cleanup() that removes the loopback interface.
17748 */
17749 static void
17750 ip_loopback_removeif(ldi_handle_t lh, boolean_t isv6, cred_t *cr)
17752 int err;
17753 struct lifreq lifr;
17755 bzero(&lifr, sizeof (lifr));
17756 (void) strcpy(lifr.lifr_name, ipif_loopback_name);
17759 * Attempt to remove the interface. It may legitimately not exist
17760 * (e.g. the zone administrator unplumbed it), so ignore ENXIO.
17762 err = ip_ioctl(lh, SIOCLIFREMOVEIF, &lifr, sizeof (lifr), cr);
17763 if (err != 0 && err != ENXIO) {
17764 ip0dbg(("ip_loopback_removeif: IP%s SIOCLIFREMOVEIF failed: "
17765 "error %d\n", isv6 ? "v6" : "v4", err));
17770 * Helper for ip_interface_cleanup() that ensures no IP interfaces are in IPMP
17771 * groups and that IPMP data addresses are down. These conditions must be met
17772 * so that IPMP interfaces can be I_PUNLINK'd, as per ip_sioctl_plink_ipmp().
17774 static void
17775 ip_ipmp_cleanup(ldi_handle_t lh, boolean_t isv6, cred_t *cr)
17777 int af = isv6 ? AF_INET6 : AF_INET;
17778 int i, nifs;
17779 int err;
17780 uint_t bufsize;
17781 uint_t lifrsize = sizeof (struct lifreq);
17782 struct lifconf lifc;
17783 struct lifreq *lifrp;
17785 if ((err = ip_lifconf_ioctl(lh, af, &lifc, &bufsize, cr)) != 0) {
17786 cmn_err(CE_WARN, "ip_ipmp_cleanup: cannot get interface list "
17787 "(error %d); IPMP interfaces cannot be shut down", err);
17788 return;
17791 nifs = lifc.lifc_len / lifrsize;
17792 for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) {
17793 err = ip_ioctl(lh, SIOCGLIFFLAGS, lifrp, lifrsize, cr);
17794 if (err != 0) {
17795 cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot get "
17796 "flags: error %d", lifrp->lifr_name, err);
17797 continue;
17800 if (lifrp->lifr_flags & IFF_IPMP) {
17801 if ((lifrp->lifr_flags & (IFF_UP|IFF_DUPLICATE)) == 0)
17802 continue;
17804 lifrp->lifr_flags &= ~IFF_UP;
17805 err = ip_ioctl(lh, SIOCSLIFFLAGS, lifrp, lifrsize, cr);
17806 if (err != 0) {
17807 cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot "
17808 "bring down (error %d); IPMP interface may "
17809 "not be shutdown", lifrp->lifr_name, err);
17813 * Check if IFF_DUPLICATE is still set -- and if so,
17814 * reset the address to clear it.
17816 err = ip_ioctl(lh, SIOCGLIFFLAGS, lifrp, lifrsize, cr);
17817 if (err != 0 || !(lifrp->lifr_flags & IFF_DUPLICATE))
17818 continue;
17820 err = ip_ioctl(lh, SIOCGLIFADDR, lifrp, lifrsize, cr);
17821 if (err != 0 || (err = ip_ioctl(lh, SIOCSLIFADDR,
17822 lifrp, lifrsize, cr)) != 0) {
17823 cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot "
17824 "reset DAD (error %d); IPMP interface may "
17825 "not be shut down", lifrp->lifr_name, err);
17827 continue;
17830 if (strchr(lifrp->lifr_name, IPIF_SEPARATOR_CHAR) == NULL) {
17831 lifrp->lifr_groupname[0] = '\0';
17832 if ((err = ip_ioctl(lh, SIOCSLIFGROUPNAME, lifrp,
17833 lifrsize, cr)) != 0) {
17834 cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot "
17835 "leave IPMP group (error %d); associated "
17836 "IPMP interface may not be shutdown",
17837 lifrp->lifr_name, err);
17838 continue;
17843 kmem_free(lifc.lifc_buf, bufsize);
17846 #define UDPDEV "/devices/pseudo/udp@0:udp"
17847 #define UDP6DEV "/devices/pseudo/udp6@0:udp6"
17850 * Remove the loopback interfaces and prep the IPMP interfaces to be torn down.
17851 * Non-loopback interfaces are either I_LINK'd or I_PLINK'd; the former go away
17852 * when the user-level processes in the zone are killed and the latter are
17853 * cleaned up by str_stack_shutdown().
17855 void
17856 ip_interface_cleanup(ip_stack_t *ipst)
17858 ldi_handle_t lh;
17859 ldi_ident_t li;
17860 cred_t *cr;
17861 int err;
17862 int i;
17863 char *devs[] = { UDP6DEV, UDPDEV };
17864 netstackid_t stackid = ipst->ips_netstack->netstack_stackid;
17866 if ((err = ldi_ident_from_major(ddi_name_to_major("ip"), &li)) != 0) {
17867 cmn_err(CE_WARN, "ip_interface_cleanup: cannot get ldi ident:"
17868 " error %d", err);
17869 return;
17872 cr = zone_get_kcred(netstackid_to_zoneid(stackid));
17873 ASSERT(cr != NULL);
17876 * NOTE: loop executes exactly twice and is hardcoded to know that the
17877 * first iteration is IPv6. (Unrolling yields repetitious code, hence
17878 * the loop.)
17880 for (i = 0; i < 2; i++) {
17881 err = ldi_open_by_name(devs[i], FREAD|FWRITE, cr, &lh, li);
17882 if (err != 0) {
17883 cmn_err(CE_WARN, "ip_interface_cleanup: cannot open %s:"
17884 " error %d", devs[i], err);
17885 continue;
17888 ip_loopback_removeif(lh, i == 0, cr);
17889 ip_ipmp_cleanup(lh, i == 0, cr);
17891 (void) ldi_close(lh, FREAD|FWRITE, cr);
17894 ldi_ident_release(li);
17895 crfree(cr);
17896 }
17897
17898 /*
17899 * This needs to be kept in sync with the nic_event_t definition.
17900 */
17901 static const char *
17902 ill_hook_event2str(nic_event_t event)
17904 switch (event) {
17905 case NE_PLUMB:
17906 return ("PLUMB");
17907 case NE_UNPLUMB:
17908 return ("UNPLUMB");
17909 case NE_UP:
17910 return ("UP");
17911 case NE_DOWN:
17912 return ("DOWN");
17913 case NE_ADDRESS_CHANGE:
17914 return ("ADDRESS_CHANGE");
17915 case NE_LIF_UP:
17916 return ("LIF_UP");
17917 case NE_LIF_DOWN:
17918 return ("LIF_DOWN");
17919 case NE_IFINDEX_CHANGE:
17920 return ("IFINDEX_CHANGE");
17921 default:
17922 return ("UNKNOWN");
17926 void
17927 ill_nic_event_dispatch(ill_t *ill, lif_if_t lif, nic_event_t event,
17928 nic_event_data_t data, size_t datalen)
17930 ip_stack_t *ipst = ill->ill_ipst;
17931 hook_nic_event_int_t *info;
17932 const char *str = NULL;
17934 /* create a new nic event info */
17935 if ((info = kmem_alloc(sizeof (*info), KM_NOSLEEP)) == NULL)
17936 goto fail;
17938 info->hnei_event.hne_nic = ill->ill_phyint->phyint_ifindex;
17939 info->hnei_event.hne_lif = lif;
17940 info->hnei_event.hne_event = event;
17941 info->hnei_event.hne_protocol = ill->ill_isv6 ?
17942 ipst->ips_ipv6_net_data : ipst->ips_ipv4_net_data;
17943 info->hnei_event.hne_data = NULL;
17944 info->hnei_event.hne_datalen = 0;
17945 info->hnei_stackid = ipst->ips_netstack->netstack_stackid;
17947 if (data != NULL && datalen != 0) {
17948 info->hnei_event.hne_data = kmem_alloc(datalen, KM_NOSLEEP);
17949 if (info->hnei_event.hne_data == NULL)
17950 goto fail;
17951 bcopy(data, info->hnei_event.hne_data, datalen);
17952 info->hnei_event.hne_datalen = datalen;
17955 if (ddi_taskq_dispatch(eventq_queue_nic, ip_ne_queue_func, info,
17956 DDI_NOSLEEP) == DDI_SUCCESS)
17957 return;
17959 fail:
17960 if (info != NULL) {
17961 if (info->hnei_event.hne_data != NULL) {
17962 kmem_free(info->hnei_event.hne_data,
17963 info->hnei_event.hne_datalen);
17965 kmem_free(info, sizeof (*info)); /* must match the kmem_alloc'd size */
17967 str = ill_hook_event2str(event);
17968 ip2dbg(("ill_nic_event_dispatch: could not dispatch %s nic event "
17969 "information for %s (ENOMEM)\n", str, ill->ill_name));
17972 static int
17973 ipif_arp_up_done_tail(ipif_t *ipif, enum ip_resolver_action res_act)
17975 int err = 0;
17976 const in_addr_t *addr = NULL;
17977 nce_t *nce = NULL;
17978 ill_t *ill = ipif->ipif_ill;
17979 ill_t *bound_ill;
17980 boolean_t added_ipif = B_FALSE;
17981 uint16_t state;
17982 uint16_t flags;
17984 DTRACE_PROBE3(ipif__downup, char *, "ipif_arp_up_done_tail",
17985 ill_t *, ill, ipif_t *, ipif);
17986 if (ipif->ipif_lcl_addr != INADDR_ANY) {
17987 addr = &ipif->ipif_lcl_addr;
17990 if ((ipif->ipif_flags & IPIF_UNNUMBERED) || addr == NULL) {
17991 if (res_act != Res_act_initial)
17992 return (EINVAL);
17995 if (addr != NULL) {
17996 ipmp_illgrp_t *illg = ill->ill_grp;
17998 /* add unicast nce for the local addr */
18000 if (IS_IPMP(ill)) {
18002 * If we're here via ipif_up(), then the ipif
18003 * won't be bound yet -- add it to the group,
18004 * which will bind it if possible. (We would
18005 * add it in ipif_up(), but deleting on failure
18006 * there is gruesome.) If we're here via
18007 * ipmp_ill_bind_ipif(), then the ipif has
18008 * already been added to the group and we
18009 * just need to use the binding.
18011 if ((bound_ill = ipmp_ipif_bound_ill(ipif)) == NULL) {
18012 bound_ill = ipmp_illgrp_add_ipif(illg, ipif);
18013 if (bound_ill == NULL) {
18015 * We couldn't bind the ipif to an ill
18016 * yet, so we have nothing to publish.
18017 * Mark the address as ready and return.
18019 ipif->ipif_addr_ready = 1;
18020 return (0);
18022 added_ipif = B_TRUE;
18024 } else {
18025 bound_ill = ill;
18028 flags = (NCE_F_MYADDR | NCE_F_PUBLISH | NCE_F_AUTHORITY |
18029 NCE_F_NONUD);
18031 * If this is an initial bring-up (or the ipif was never
18032 * completely brought up), do DAD. Otherwise, we're here
18033 * because IPMP has rebound an address to this ill: send
18034 * unsolicited advertisements (ARP announcements) to
18035 * inform others.
18037 if (res_act == Res_act_initial || !ipif->ipif_addr_ready) {
18038 state = ND_UNCHANGED; /* compute in nce_add_common() */
18039 } else {
18040 state = ND_REACHABLE;
18041 flags |= NCE_F_UNSOL_ADV;
18044 retry:
18045 err = nce_lookup_then_add_v4(ill,
18046 bound_ill->ill_phys_addr, bound_ill->ill_phys_addr_length,
18047 addr, flags, state, &nce);
18050 * note that we may encounter EEXIST if we are moving
18051 * the nce as a result of a rebind operation.
18053 switch (err) {
18054 case 0:
18055 ipif->ipif_added_nce = 1;
18056 nce->nce_ipif_cnt++;
18057 break;
18058 case EEXIST:
18059 ip1dbg(("ipif_arp_up: NCE already exists for %s\n",
18060 ill->ill_name));
18061 if (!NCE_MYADDR(nce->nce_common)) {
18063 * A leftover nce from before this address
18064 * existed
18066 ncec_delete(nce->nce_common);
18067 nce_refrele(nce);
18068 nce = NULL;
18069 goto retry;
18071 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) {
18072 nce_refrele(nce);
18073 nce = NULL;
18074 ip1dbg(("ipif_arp_up: NCE already exists "
18075 "for %s:%u\n", ill->ill_name,
18076 ipif->ipif_id));
18077 goto arp_up_done;
18080 * Duplicate local addresses are permissible for
18081 * IPIF_POINTOPOINT interfaces which will get marked
18082 * IPIF_UNNUMBERED later in
18083 * ip_addr_availability_check().
18085 * The nce_ipif_cnt field tracks the number of
18086 * ipifs that have nce_addr as their local address.
18088 ipif->ipif_addr_ready = 1;
18089 ipif->ipif_added_nce = 1;
18090 nce->nce_ipif_cnt++;
18091 err = 0;
18092 break;
18093 default:
18094 ASSERT(nce == NULL);
18095 goto arp_up_done;
18097 if (arp_no_defense) {
18098 if ((ipif->ipif_flags & IPIF_UP) &&
18099 !ipif->ipif_addr_ready)
18100 ipif_up_notify(ipif);
18101 ipif->ipif_addr_ready = 1;
18103 } else {
18104 /* zero address. nothing to publish */
18105 ipif->ipif_addr_ready = 1;
18107 if (nce != NULL)
18108 nce_refrele(nce);
18109 arp_up_done:
18110 if (added_ipif && err != 0)
18111 ipmp_illgrp_del_ipif(ill->ill_grp, ipif);
18112 return (err);
18113 }
18114
18115 int
18116 ipif_arp_up(ipif_t *ipif, enum ip_resolver_action res_act, boolean_t was_dup)
18117 {
18118 int err = 0;
18119 ill_t *ill = ipif->ipif_ill;
18120 boolean_t first_interface, wait_for_dlpi = B_FALSE;
18122 DTRACE_PROBE3(ipif__downup, char *, "ipif_arp_up",
18123 ill_t *, ill, ipif_t *, ipif);
18124
18125 /*
18126 * We need to bring up ARP or set up multicast mapping only
18127 * when the first interface is coming UP.
18128 */
18129 first_interface = (ill->ill_ipif_up_count == 0 &&
18130 ill->ill_ipif_dup_count == 0 && !was_dup);
18132 if (res_act == Res_act_initial && first_interface) {
18134 * Send ATTACH + BIND
18136 err = arp_ll_up(ill);
18137 if (err != EINPROGRESS && err != 0)
18138 return (err);
18139
18140 /*
18141 * Add an NCE for the local address and start DAD.
18142 * We'll wait to hear that DAD has finished
18143 * before using the interface.
18144 */
18145 if (err == EINPROGRESS)
18146 wait_for_dlpi = B_TRUE;
18149 if (!wait_for_dlpi)
18150 (void) ipif_arp_up_done_tail(ipif, res_act);
18152 return (!wait_for_dlpi ? 0 : EINPROGRESS);
18156 * Finish processing of "arp_up" after all the DLPI message
18157 * exchanges have completed between arp and the driver.
18159 void
18160 arp_bringup_done(ill_t *ill, int err)
18162 mblk_t *mp1;
18163 ipif_t *ipif;
18164 conn_t *connp = NULL;
18165 ipsq_t *ipsq;
18166 queue_t *q;
18168 ip1dbg(("arp_bringup_done(%s)\n", ill->ill_name));
18170 ASSERT(IAM_WRITER_ILL(ill));
18172 ipsq = ill->ill_phyint->phyint_ipsq;
18173 ipif = ipsq->ipsq_xop->ipx_pending_ipif;
18174 mp1 = ipsq_pending_mp_get(ipsq, &connp);
18175 ASSERT(!((mp1 != NULL) ^ (ipif != NULL)));
18176 if (mp1 == NULL) /* bringup was aborted by the user */
18177 return;
18180 * If an IOCTL is waiting on this (ipsq_current_ioctl != 0), then we
18181 * must have an associated conn_t. Otherwise, we're bringing this
18182 * interface back up as part of handling an asynchronous event (e.g.,
18183 * physical address change).
18185 if (ipsq->ipsq_xop->ipx_current_ioctl != 0) {
18186 ASSERT(connp != NULL);
18187 q = CONNP_TO_WQ(connp);
18188 } else {
18189 ASSERT(connp == NULL);
18190 q = ill->ill_rq;
18192 if (err == 0) {
18193 if (ipif->ipif_isv6) {
18194 if ((err = ipif_up_done_v6(ipif)) != 0)
18195 ip0dbg(("arp_bringup_done: init failed\n"));
18196 } else {
18197 err = ipif_arp_up_done_tail(ipif, Res_act_initial);
18198 if (err != 0 ||
18199 (err = ipif_up_done(ipif)) != 0) {
18200 ip0dbg(("arp_bringup_done: "
18201 "init failed err %x\n", err));
18202 (void) ipif_arp_down(ipif);
18206 } else {
18207 ip0dbg(("arp_bringup_done: DL_BIND_REQ failed\n"));
18210 if ((err == 0) && (ill->ill_up_ipifs)) {
18211 err = ill_up_ipifs(ill, q, mp1);
18212 if (err == EINPROGRESS)
18213 return;
18217 * If we have a moved ipif to bring up, and everything has succeeded
18218 * to this point, bring it up on the IPMP ill. Otherwise, leave it
18219 * down -- the admin can try to bring it up by hand if need be.
18221 if (ill->ill_move_ipif != NULL) {
18222 ipif = ill->ill_move_ipif;
18223 ip1dbg(("bringing up ipif %p on ill %s\n", (void *)ipif,
18224 ipif->ipif_ill->ill_name));
18225 ill->ill_move_ipif = NULL;
18226 if (err == 0) {
18227 err = ipif_up(ipif, q, mp1);
18228 if (err == EINPROGRESS)
18229 return;
18234 * The operation must complete without EINPROGRESS since
18235 * ipsq_pending_mp_get() has removed the mblk from ipsq_pending_mp.
18236 * Otherwise, the operation will be stuck forever in the ipsq.
18238 ASSERT(err != EINPROGRESS);
18239 if (ipsq->ipsq_xop->ipx_current_ioctl != 0) {
18240 DTRACE_PROBE4(ipif__ioctl, char *, "arp_bringup_done finish",
18241 int, ipsq->ipsq_xop->ipx_current_ioctl,
18242 ill_t *, ill, ipif_t *, ipif);
18243 ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq);
18244 } else {
18245 ipsq_current_finish(ipsq);
18250 * Finish processing of arp replumb after all the DLPI message
18251 * exchanges have completed between arp and the driver.
18253 void
18254 arp_replumb_done(ill_t *ill, int err)
18256 mblk_t *mp1;
18257 ipif_t *ipif;
18258 conn_t *connp = NULL;
18259 ipsq_t *ipsq;
18260 queue_t *q;
18262 ASSERT(IAM_WRITER_ILL(ill));
18264 ipsq = ill->ill_phyint->phyint_ipsq;
18265 ipif = ipsq->ipsq_xop->ipx_pending_ipif;
18266 mp1 = ipsq_pending_mp_get(ipsq, &connp);
18267 ASSERT(!((mp1 != NULL) ^ (ipif != NULL)));
18268 if (mp1 == NULL) {
18269 ip0dbg(("arp_replumb_done: bringup aborted ioctl %x\n",
18270 ipsq->ipsq_xop->ipx_current_ioctl));
18271 /* bringup was aborted by the user */
18272 return;
18275 * If an IOCTL is waiting on this (ipsq_current_ioctl != 0), then we
18276 * must have an associated conn_t. Otherwise, we're bringing this
18277 * interface back up as part of handling an asynchronous event (e.g.,
18278 * physical address change).
18280 if (ipsq->ipsq_xop->ipx_current_ioctl != 0) {
18281 ASSERT(connp != NULL);
18282 q = CONNP_TO_WQ(connp);
18283 } else {
18284 ASSERT(connp == NULL);
18285 q = ill->ill_rq;
18287 if ((err == 0) && (ill->ill_up_ipifs)) {
18288 err = ill_up_ipifs(ill, q, mp1);
18289 if (err == EINPROGRESS)
18290 return;
18293 * The operation must complete without EINPROGRESS since
18294 * ipsq_pending_mp_get() has removed the mblk from ipsq_pending_mp.
18295 * Otherwise, the operation will be stuck forever in the ipsq.
18297 ASSERT(err != EINPROGRESS);
18298 if (ipsq->ipsq_xop->ipx_current_ioctl != 0) {
18299 DTRACE_PROBE4(ipif__ioctl, char *,
18300 "arp_replumb_done finish",
18301 int, ipsq->ipsq_xop->ipx_current_ioctl,
18302 ill_t *, ill, ipif_t *, ipif);
18303 ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq);
18304 } else {
18305 ipsq_current_finish(ipsq);
18309 void
18310 ipif_up_notify(ipif_t *ipif)
18312 ip_rts_ifmsg(ipif, RTSQ_DEFAULT);
18313 ip_rts_newaddrmsg(RTM_ADD, 0, ipif, RTSQ_DEFAULT);
18314 sctp_update_ipif(ipif, SCTP_IPIF_UP);
18315 ill_nic_event_dispatch(ipif->ipif_ill, MAP_IPIF_ID(ipif->ipif_id),
18316 NE_LIF_UP, NULL, 0);
18317 }
18318
18319 /*
18320 * The ILB ioctls use cv_wait (e.g., when deleting a rule or adding a server),
18321 * so this assumes the context is cv_wait'able. Hence it shouldn't be used on
18322 * TPI end points with STREAMS modules pushed above. This is assured by not
18323 * having the IPI_MODOK flag for the ioctl. And IP ensures the ILB ioctl
18324 * never ends up on an ipsq, otherwise we may end up processing the ioctl
18325 * while unwinding from the ipsq and that could be a thread from the bottom.
18326 */
18327 /* ARGSUSED */
18328 int
18329 ip_sioctl_ilb_cmd(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
18330 ip_ioctl_cmd_t *ipip, void *arg)
18331 {
18332 mblk_t *cmd_mp = mp->b_cont->b_cont;
18333 ilb_cmd_t command = *((ilb_cmd_t *)cmd_mp->b_rptr);
18334 int ret = 0;
18335 int i;
18336 size_t size;
18337 ip_stack_t *ipst;
18338 zoneid_t zoneid;
18339 ilb_stack_t *ilbs;
18341 ipst = CONNQ_TO_IPST(q);
18342 ilbs = ipst->ips_netstack->netstack_ilb;
18343 zoneid = Q_TO_CONN(q)->conn_zoneid;
18345 switch (command) {
18346 case ILB_CREATE_RULE: {
18347 ilb_rule_cmd_t *cmd = (ilb_rule_cmd_t *)cmd_mp->b_rptr;
18349 if (MBLKL(cmd_mp) != sizeof (ilb_rule_cmd_t)) {
18350 ret = EINVAL;
18351 break;
18354 ret = ilb_rule_add(ilbs, zoneid, cmd);
18355 break;
18357 case ILB_DESTROY_RULE:
18358 case ILB_ENABLE_RULE:
18359 case ILB_DISABLE_RULE: {
18360 ilb_name_cmd_t *cmd = (ilb_name_cmd_t *)cmd_mp->b_rptr;
18362 if (MBLKL(cmd_mp) != sizeof (ilb_name_cmd_t)) {
18363 ret = EINVAL;
18364 break;
18367 if (cmd->flags & ILB_RULE_ALLRULES) {
18368 if (command == ILB_DESTROY_RULE) {
18369 ilb_rule_del_all(ilbs, zoneid);
18370 break;
18371 } else if (command == ILB_ENABLE_RULE) {
18372 ilb_rule_enable_all(ilbs, zoneid);
18373 break;
18374 } else if (command == ILB_DISABLE_RULE) {
18375 ilb_rule_disable_all(ilbs, zoneid);
18376 break;
18378 } else {
18379 if (command == ILB_DESTROY_RULE) {
18380 ret = ilb_rule_del(ilbs, zoneid, cmd->name);
18381 } else if (command == ILB_ENABLE_RULE) {
18382 ret = ilb_rule_enable(ilbs, zoneid, cmd->name,
18383 NULL);
18384 } else if (command == ILB_DISABLE_RULE) {
18385 ret = ilb_rule_disable(ilbs, zoneid, cmd->name,
18386 NULL);
18389 break;
18391 case ILB_NUM_RULES: {
18392 ilb_num_rules_cmd_t *cmd;
18394 if (MBLKL(cmd_mp) != sizeof (ilb_num_rules_cmd_t)) {
18395 ret = EINVAL;
18396 break;
18398 cmd = (ilb_num_rules_cmd_t *)cmd_mp->b_rptr;
18399 ilb_get_num_rules(ilbs, zoneid, &(cmd->num));
18400 break;
18402 case ILB_RULE_NAMES: {
18403 ilb_rule_names_cmd_t *cmd;
18405 cmd = (ilb_rule_names_cmd_t *)cmd_mp->b_rptr;
18406 if (MBLKL(cmd_mp) < sizeof (ilb_rule_names_cmd_t) ||
18407 cmd->num_names == 0) {
18408 ret = EINVAL;
18409 break;
18411 size = cmd->num_names * ILB_RULE_NAMESZ;
18412 if (cmd_mp->b_rptr + offsetof(ilb_rule_names_cmd_t, buf) +
18413 size != cmd_mp->b_wptr) {
18414 ret = EINVAL;
18415 break;
18417 ilb_get_rulenames(ilbs, zoneid, &cmd->num_names, cmd->buf);
18418 break;
18420 case ILB_NUM_SERVERS: {
18421 ilb_num_servers_cmd_t *cmd;
18423 if (MBLKL(cmd_mp) != sizeof (ilb_num_servers_cmd_t)) {
18424 ret = EINVAL;
18425 break;
18427 cmd = (ilb_num_servers_cmd_t *)cmd_mp->b_rptr;
18428 ret = ilb_get_num_servers(ilbs, zoneid, cmd->name,
18429 &(cmd->num));
18430 break;
18432 case ILB_LIST_RULE: {
18433 ilb_rule_cmd_t *cmd = (ilb_rule_cmd_t *)cmd_mp->b_rptr;
18435 if (MBLKL(cmd_mp) != sizeof (ilb_rule_cmd_t)) {
18436 ret = EINVAL;
18437 break;
18439 ret = ilb_rule_list(ilbs, zoneid, cmd);
18440 break;
18442 case ILB_LIST_SERVERS: {
18443 ilb_servers_info_cmd_t *cmd;
18445 cmd = (ilb_servers_info_cmd_t *)cmd_mp->b_rptr;
18446 if (MBLKL(cmd_mp) < sizeof (ilb_servers_info_cmd_t) ||
18447 cmd->num_servers == 0) {
18448 ret = EINVAL;
18449 break;
18451 size = cmd->num_servers * sizeof (ilb_server_info_t);
18452 if (cmd_mp->b_rptr + offsetof(ilb_servers_info_cmd_t, servers) +
18453 size != cmd_mp->b_wptr) {
18454 ret = EINVAL;
18455 break;
18458 ret = ilb_get_servers(ilbs, zoneid, cmd->name, cmd->servers,
18459 &cmd->num_servers);
18460 break;
18462 case ILB_ADD_SERVERS: {
18463 ilb_servers_info_cmd_t *cmd;
18464 ilb_rule_t *rule;
18466 cmd = (ilb_servers_info_cmd_t *)cmd_mp->b_rptr;
18467 if (MBLKL(cmd_mp) < sizeof (ilb_servers_info_cmd_t)) {
18468 ret = EINVAL;
18469 break;
18471 size = cmd->num_servers * sizeof (ilb_server_info_t);
18472 if (cmd_mp->b_rptr + offsetof(ilb_servers_info_cmd_t, servers) +
18473 size != cmd_mp->b_wptr) {
18474 ret = EINVAL;
18475 break;
18477 rule = ilb_find_rule(ilbs, zoneid, cmd->name, &ret);
18478 if (rule == NULL) {
18479 ASSERT(ret != 0);
18480 break;
18482 for (i = 0; i < cmd->num_servers; i++) {
18483 ilb_server_info_t *s;
18485 s = &cmd->servers[i];
18486 s->err = ilb_server_add(ilbs, rule, s);
18488 ILB_RULE_REFRELE(rule);
18489 break;
18491 case ILB_DEL_SERVERS:
18492 case ILB_ENABLE_SERVERS:
18493 case ILB_DISABLE_SERVERS: {
18494 ilb_servers_cmd_t *cmd;
18495 ilb_rule_t *rule;
18496 int (*f)();
18498 cmd = (ilb_servers_cmd_t *)cmd_mp->b_rptr;
18499 if (MBLKL(cmd_mp) < sizeof (ilb_servers_cmd_t)) {
18500 ret = EINVAL;
18501 break;
18503 size = cmd->num_servers * sizeof (ilb_server_arg_t);
18504 if (cmd_mp->b_rptr + offsetof(ilb_servers_cmd_t, servers) +
18505 size != cmd_mp->b_wptr) {
18506 ret = EINVAL;
18507 break;
18510 if (command == ILB_DEL_SERVERS)
18511 f = ilb_server_del;
18512 else if (command == ILB_ENABLE_SERVERS)
18513 f = ilb_server_enable;
18514 else if (command == ILB_DISABLE_SERVERS)
18515 f = ilb_server_disable;
18517 rule = ilb_find_rule(ilbs, zoneid, cmd->name, &ret);
18518 if (rule == NULL) {
18519 ASSERT(ret != 0);
18520 break;
18523 for (i = 0; i < cmd->num_servers; i++) {
18524 ilb_server_arg_t *s;
18526 s = &cmd->servers[i];
18527 s->err = f(ilbs, zoneid, NULL, rule, &s->addr);
18529 ILB_RULE_REFRELE(rule);
18530 break;
18532 case ILB_LIST_NAT_TABLE: {
18533 ilb_list_nat_cmd_t *cmd;
18535 cmd = (ilb_list_nat_cmd_t *)cmd_mp->b_rptr;
18536 if (MBLKL(cmd_mp) < sizeof (ilb_list_nat_cmd_t)) {
18537 ret = EINVAL;
18538 break;
18540 size = cmd->num_nat * sizeof (ilb_nat_entry_t);
18541 if (cmd_mp->b_rptr + offsetof(ilb_list_nat_cmd_t, entries) +
18542 size != cmd_mp->b_wptr) {
18543 ret = EINVAL;
18544 break;
18547 ret = ilb_list_nat(ilbs, zoneid, cmd->entries, &cmd->num_nat,
18548 &cmd->flags);
18549 break;
18551 case ILB_LIST_STICKY_TABLE: {
18552 ilb_list_sticky_cmd_t *cmd;
18554 cmd = (ilb_list_sticky_cmd_t *)cmd_mp->b_rptr;
18555 if (MBLKL(cmd_mp) < sizeof (ilb_list_sticky_cmd_t)) {
18556 ret = EINVAL;
18557 break;
18559 size = cmd->num_sticky * sizeof (ilb_sticky_entry_t);
18560 if (cmd_mp->b_rptr + offsetof(ilb_list_sticky_cmd_t, entries) +
18561 size != cmd_mp->b_wptr) {
18562 ret = EINVAL;
18563 break;
18566 ret = ilb_list_sticky(ilbs, zoneid, cmd->entries,
18567 &cmd->num_sticky, &cmd->flags);
18568 break;
18570 default:
18571 ret = EINVAL;
18572 break;
18574 done:
18575 return (ret);
18576 }
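/*
 * For illustration, the variable-length validation pattern shared by the
 * list-style ILB commands above: the mblk must end exactly at the fixed
 * header plus the caller-declared array. `cmd_type_t', `array', `nelem'
 * and `elem_type_t' are generic placeholders, not real types.
 */
#if 0
	if (cmd_mp->b_rptr + offsetof(cmd_type_t, array) +
	    nelem * sizeof (elem_type_t) != cmd_mp->b_wptr) {
		ret = EINVAL;	/* reject short or oversized requests */
		break;
	}
#endif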
18578 /* Remove all cache entries for this logical interface */
18579 void
18580 ipif_nce_down(ipif_t *ipif)
18582 ill_t *ill = ipif->ipif_ill;
18583 nce_t *nce;
18585 DTRACE_PROBE3(ipif__downup, char *, "ipif_nce_down",
18586 ill_t *, ill, ipif_t *, ipif);
18587 if (ipif->ipif_added_nce) {
18588 if (ipif->ipif_isv6)
18589 nce = nce_lookup_v6(ill, &ipif->ipif_v6lcl_addr);
18590 else
18591 nce = nce_lookup_v4(ill, &ipif->ipif_lcl_addr);
18592 if (nce != NULL) {
18593 if (--nce->nce_ipif_cnt == 0)
18594 ncec_delete(nce->nce_common);
18595 ipif->ipif_added_nce = 0;
18596 nce_refrele(nce);
18597 } else {
18599 * nce may already be NULL because it was already
18600 * flushed, e.g., due to a call to nce_flush
18602 ipif->ipif_added_nce = 0;
18606 * Make IPMP aware of the deleted data address.
18608 if (IS_IPMP(ill))
18609 ipmp_illgrp_del_ipif(ill->ill_grp, ipif);
18612 * Remove all other nces dependent on this ill when the last ipif
18613 * is going away.
18615 if (ill->ill_ipif_up_count == 0) {
18616 ncec_walk(ill, ncec_delete_per_ill, ill, ill->ill_ipst);
18617 if (IS_UNDER_IPMP(ill))
18618 nce_flush(ill, B_TRUE);
18619 }
18620 }
18621
18622 /*
18623 * Find the first interface that uses usill for its source address.
18624 */
18625 ill_t *
18626 ill_lookup_usesrc(ill_t *usill)
18628 ip_stack_t *ipst = usill->ill_ipst;
18629 ill_t *ill;
18631 ASSERT(usill != NULL);
18633 /* ill_g_usesrc_lock protects ill_usesrc_grp_next */
18634 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER);
18635 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
18636 for (ill = usill->ill_usesrc_grp_next; ill != NULL && ill != usill;
18637 ill = ill->ill_usesrc_grp_next) {
18638 if (!IS_UNDER_IPMP(ill) && (ill->ill_flags & ILLF_MULTICAST) &&
18639 !ILL_IS_CONDEMNED(ill)) {
18640 ill_refhold(ill);
18641 break;
18644 rw_exit(&ipst->ips_ill_g_lock);
18645 rw_exit(&ipst->ips_ill_g_usesrc_lock);
18646 return (ill);
18647 }
18648
18649 /*
18650 * This comment applies to both ip_sioctl_get_ifhwaddr and
18651 * ip_sioctl_get_lifhwaddr, as the basic function of the two ioctls
18652 * is the same.
18653 *
18654 * The goal here is to find an IP interface that corresponds to the name
18655 * provided by the caller in the ifreq/lifreq structure held in the mblk_t
18656 * chain and to fill out a sockaddr/sockaddr_storage structure with the
18657 * mac address.
18658 *
18659 * The SIOCGIFHWADDR/SIOCGLIFHWADDR ioctl may return an error for a number
18660 * of different reasons:
18661 * ENXIO - the device name is not known to IP.
18662 * EADDRNOTAVAIL - the device has no hardware address. This is indicated
18663 * by ill_phys_addr not pointing to an actual address.
18664 * EPFNOSUPPORT - this indicates that the request is for a mac address
18665 * that will not fit in the data structure supplied (struct
18666 * sockaddr).
18667 */
18668
18669 /* ARGSUSED */
18670 int
18671 ip_sioctl_get_ifhwaddr(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
18672 ip_ioctl_cmd_t *ipip, void *if_req)
18673 {
18674 struct sockaddr *sock;
18675 struct ifreq *ifr;
18676 mblk_t *mp1;
18677 ill_t *ill;
18679 ASSERT(ipif != NULL);
18680 ill = ipif->ipif_ill;
18682 if (ill->ill_phys_addr == NULL) {
18683 return (EADDRNOTAVAIL);
18685 if (ill->ill_phys_addr_length > sizeof (sock->sa_data)) {
18686 return (EPFNOSUPPORT);
18689 ip1dbg(("ip_sioctl_get_ifhwaddr(%s)\n", ill->ill_name));
18691 /* Existence of mp1 has been checked in ip_wput_nondata */
18692 mp1 = mp->b_cont->b_cont;
18693 ifr = (struct ifreq *)mp1->b_rptr;
18695 sock = &ifr->ifr_addr;
18697 * The "family" field in the returned structure is set to a value
18698 * that represents the type of device to which the address belongs.
18699 * The value returned may differ to that on Linux but it will still
18700 * represent the correct symbol on Solaris.
18702 sock->sa_family = arp_hw_type(ill->ill_mactype);
18703 bcopy(ill->ill_phys_addr, &sock->sa_data, ill->ill_phys_addr_length);
18705 return (0);
18706 }
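/*
 * For illustration, a minimal userland consumer of SIOCGIFHWADDR; the
 * interface name is assumed, and headers and error handling are trimmed.
 */
#if 0
	struct ifreq ifr;
	int s = socket(AF_INET, SOCK_DGRAM, 0);

	bzero(&ifr, sizeof (ifr));
	(void) strlcpy(ifr.ifr_name, "net0", sizeof (ifr.ifr_name));
	if (ioctl(s, SIOCGIFHWADDR, &ifr) == 0) {
		/* ifr.ifr_addr.sa_family: the arp_hw_type() device mapping */
		/* ifr.ifr_addr.sa_data: the raw link-layer address bytes */
	}
#endif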
18707
18708 /*
18709 * The expectation of applications using SIOCGIFHWADDR is that data will
18710 * be returned in the sa_data field of the sockaddr structure. With
18711 * SIOCGLIFHWADDR, we're breaking new ground as there is no Linux
18712 * equivalent. In light of this, struct sockaddr_dl is used as it
18713 * offers more space for address storage in sdl_data.
18714 */
18715 /* ARGSUSED */
18716 int
18717 ip_sioctl_get_lifhwaddr(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
18718 ip_ioctl_cmd_t *ipip, void *if_req)
18719 {
18720 struct sockaddr_dl *sock;
18721 struct lifreq *lifr;
18722 mblk_t *mp1;
18723 ill_t *ill;
18725 ASSERT(ipif != NULL);
18726 ill = ipif->ipif_ill;
18728 if (ill->ill_phys_addr == NULL) {
18729 return (EADDRNOTAVAIL);
18731 if (ill->ill_phys_addr_length > sizeof (sock->sdl_data)) {
18732 return (EPFNOSUPPORT);
18735 ip1dbg(("ip_sioctl_get_lifhwaddr(%s)\n", ill->ill_name));
18737 /* Existence of mp1 has been checked in ip_wput_nondata */
18738 mp1 = mp->b_cont->b_cont;
18739 lifr = (struct lifreq *)mp1->b_rptr;
18740
18741 /*
18742 * sockaddr_dl is used here, paralleling the sockaddr_ll structure used
18743 * in responding to the same ioctl in sockpfp. sockaddr_ll contains
18744 * fields that are not required here because its purpose is different,
18745 * so the native sockaddr_dl is used instead.
18746 */
18747 lifr->lifr_type = ill->ill_type;
18748 sock = (struct sockaddr_dl *)&lifr->lifr_addr;
18749 sock->sdl_family = AF_LINK;
18750 sock->sdl_index = ill->ill_phyint->phyint_ifindex;
18751 sock->sdl_type = ill->ill_mactype;
18752 sock->sdl_nlen = 0;
18753 sock->sdl_slen = 0;
18754 sock->sdl_alen = ill->ill_phys_addr_length;
18755 bcopy(ill->ill_phys_addr, sock->sdl_data, ill->ill_phys_addr_length);
18757 return (0);
18758 }
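/*
 * For illustration, the SIOCGLIFHWADDR counterpart, which returns the
 * address in a sockaddr_dl; interface name assumed, headers and error
 * handling trimmed.
 */
#if 0
	struct lifreq lifr;
	struct sockaddr_dl *sdl;
	int s = socket(AF_INET, SOCK_DGRAM, 0);

	bzero(&lifr, sizeof (lifr));
	(void) strlcpy(lifr.lifr_name, "net0", sizeof (lifr.lifr_name));
	if (ioctl(s, SIOCGLIFHWADDR, &lifr) == 0) {
		sdl = (struct sockaddr_dl *)&lifr.lifr_addr;
		/* address bytes are sdl->sdl_data[0 .. sdl->sdl_alen - 1] */
	}
#endif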