usr/src/uts/common/inet/ip/ip_if.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 /* Copyright (c) 1990 Mentat Inc. */
  26
  27 #pragma ident   "%Z%%M% %I%     %E% SMI"
  28
  29 /*
  30  * This file contains the interface control functions for IP.
  31  */
  32
  33 #include <sys/types.h>
  34 #include <sys/stream.h>
  35 #include <sys/dlpi.h>
  36 #include <sys/stropts.h>
  37 #include <sys/strsun.h>
  38 #include <sys/sysmacros.h>
  39 #include <sys/strlog.h>
  40 #include <sys/ddi.h>
  41 #include <sys/sunddi.h>
  42 #include <sys/cmn_err.h>
  43 #include <sys/kstat.h>
  44 #include <sys/debug.h>
  45 #include <sys/zone.h>
  46 #include <sys/sunldi.h>
  47 #include <sys/file.h>
  48 #include <sys/bitmap.h>
  49
  50 #include <sys/kmem.h>
  51 #include <sys/systm.h>
  52 #include <sys/param.h>
  53 #include <sys/socket.h>
  54 #include <sys/isa_defs.h>
  55 #include <net/if.h>
  56 #include <net/if_arp.h>
  57 #include <net/if_types.h>
  58 #include <net/if_dl.h>
  59 #include <net/route.h>
  60 #include <sys/sockio.h>
  61 #include <netinet/in.h>
  62 #include <netinet/ip6.h>
  63 #include <netinet/icmp6.h>
  64 #include <netinet/igmp_var.h>
  65 #include <sys/strsun.h>
  66 #include <sys/policy.h>
  67 #include <sys/ethernet.h>
  68
  69 #include <inet/common.h>   /* for various inet/mi.h and inet/nd.h needs */
  70 #include <inet/mi.h>
  71 #include <inet/nd.h>
  72 #include <inet/arp.h>
  73 #include <inet/mib2.h>
  74 #include <inet/ip.h>
  75 #include <inet/ip6.h>
  76 #include <inet/ip6_asp.h>
  77 #include <inet/tcp.h>
  78 #include <inet/ip_multi.h>
  79 #include <inet/ip_ire.h>
  80 #include <inet/ip_ftable.h>
  81 #include <inet/ip_rts.h>
  82 #include <inet/ip_ndp.h>
  83 #include <inet/ip_if.h>
  84 #include <inet/ip_impl.h>
  85 #include <inet/tun.h>
  86 #include <inet/sctp_ip.h>
  87 #include <inet/ip_netinfo.h>
  88 #include <inet/mib2.h>
  89
  90 #include <net/pfkeyv2.h>
  91 #include <inet/ipsec_info.h>
  92 #include <inet/sadb.h>
  93 #include <inet/ipsec_impl.h>
  94 #include <sys/iphada.h>
  95
  96
  97 #include <netinet/igmp.h>
  98 #include <inet/ip_listutils.h>
  99 #include <inet/ipclassifier.h>
 100 #include <sys/mac.h>
 101
 102 #include <sys/systeminfo.h>
 103 #include <sys/bootconf.h>
 104
 105 #include <sys/tsol/tndb.h>
 106 #include <sys/tsol/tnet.h>
 107
 108 /* The character which tells where the ill_name ends */
 109 #define IPIF_SEPARATOR_CHAR     ':'
 110
 111 /* IP ioctl function table entry */
 112 typedef struct ipft_s {
 113         int     ipft_cmd;
 114         pfi_t   ipft_pfi;
 115         int     ipft_min_size;
 116         int     ipft_flags;
 117 } ipft_t;
 118 #define IPFT_F_NO_REPLY         0x1     /* IP ioctl does not expect any reply */
 119 #define IPFT_F_SELF_REPLY       0x2     /* ioctl callee does the ioctl reply */
 120
 121 typedef struct ip_sock_ar_s {
 122         union {
 123                 area_t  ip_sock_area;
 124                 ared_t  ip_sock_ared;
 125                 areq_t  ip_sock_areq;
 126         } ip_sock_ar_u;
 127         queue_t *ip_sock_ar_q;
 128 } ip_sock_ar_t;
 129
 130 static int      nd_ill_forward_get(queue_t *, mblk_t *, caddr_t, cred_t *);
 131 static int      nd_ill_forward_set(queue_t *q, mblk_t *mp,
 132                     char *value, caddr_t cp, cred_t *ioc_cr);
 133
 134 static boolean_t ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask);
 135 static ip_m_t   *ip_m_lookup(t_uscalar_t mac_type);
 136 static int      ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
 137     mblk_t *mp, boolean_t need_up);
 138 static int      ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
 139     mblk_t *mp, boolean_t need_up);
 140 static int      ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid,
 141     queue_t *q, mblk_t *mp, boolean_t need_up);
 142 static int      ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q,
 143     mblk_t *mp, boolean_t need_up);
 144 static int      ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
 145     mblk_t *mp);
 146 static int      ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t, in6_addr_t,
 147     queue_t *q, mblk_t *mp, boolean_t need_up);
 148 static int      ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp,
 149     int ioccmd, struct linkblk *li, boolean_t doconsist);
 150 static ipaddr_t ip_subnet_mask(ipaddr_t addr, ipif_t **, ip_stack_t *);
 151 static void     ip_wput_ioctl(queue_t *q, mblk_t *mp);
 152 static void     ipsq_flush(ill_t *ill);
 153
 154 static  int     ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen,
 155     queue_t *q, mblk_t *mp, boolean_t need_up);
 156 static void     ipsq_delete(ipsq_t *);
 157
 158 static ipif_t   *ipif_allocate(ill_t *ill, int id, uint_t ire_type,
 159                     boolean_t initialize);
 160 static void     ipif_check_bcast_ires(ipif_t *test_ipif);
 161 static ire_t    **ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep);
 162 static boolean_t ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif,
 163                     boolean_t isv6);
 164 static void     ipif_down_delete_ire(ire_t *ire, char *ipif);
 165 static void     ipif_delete_cache_ire(ire_t *, char *);
 166 static int      ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp);
 167 static void     ipif_free(ipif_t *ipif);
 168 static void     ipif_free_tail(ipif_t *ipif);
 169 static void     ipif_mtu_change(ire_t *ire, char *ipif_arg);
 170 static void     ipif_multicast_down(ipif_t *ipif);
 171 static void     ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif);
 172 static void     ipif_set_default(ipif_t *ipif);
 173 static int      ipif_set_values(queue_t *q, mblk_t *mp,
 174     char *interf_name, uint_t *ppa);
 175 static int      ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp,
 176     queue_t *q);
 177 static ipif_t   *ipif_lookup_on_name(char *name, size_t namelen,
 178     boolean_t do_alloc, boolean_t *exists, boolean_t isv6, zoneid_t zoneid,
 179     queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *);
 180 static int      ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp);
 181 static void     ipif_update_other_ipifs(ipif_t *old_ipif, ill_group_t *illgrp);
 182
 183 static int      ill_alloc_ppa(ill_if_t *, ill_t *);
 184 static int      ill_arp_off(ill_t *ill);
 185 static int      ill_arp_on(ill_t *ill);
 186 static void     ill_delete_interface_type(ill_if_t *);
 187 static int      ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q);
 188 static void     ill_dl_down(ill_t *ill);
 189 static void     ill_down(ill_t *ill);
 190 static void     ill_downi(ire_t *ire, char *ill_arg);
 191 static void     ill_free_mib(ill_t *ill);
 192 static void     ill_glist_delete(ill_t *);
 193 static boolean_t ill_has_usable_ipif(ill_t *);
 194 static int      ill_lock_ipsq_ills(ipsq_t *sq, ill_t **list, int);
 195 static void     ill_nominate_bcast_rcv(ill_group_t *illgrp);
 196 static void     ill_phyint_free(ill_t *ill);
 197 static void     ill_phyint_reinit(ill_t *ill);
 198 static void     ill_set_nce_router_flags(ill_t *, boolean_t);
 199 static void     ill_set_phys_addr_tail(ipsq_t *, queue_t *, mblk_t *, void *);
 200 static void     ill_signal_ipsq_ills(ipsq_t *, boolean_t);
 201 static boolean_t ill_split_ipsq(ipsq_t *cur_sq);
 202 static void     ill_stq_cache_delete(ire_t *, char *);
 203
 204 static boolean_t ip_ether_v6intfid(uint_t, uint8_t *, in6_addr_t *);
 205 static boolean_t ip_nodef_v6intfid(uint_t, uint8_t *, in6_addr_t *);
 206 static boolean_t ip_ether_v6mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *,
 207     in6_addr_t *);
 208 static boolean_t ip_ether_v4mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *,
 209     ipaddr_t *);
 210 static boolean_t ip_ib_v6intfid(uint_t, uint8_t *, in6_addr_t *);
 211 static boolean_t ip_ib_v6mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *,
 212     in6_addr_t *);
 213 static boolean_t ip_ib_v4mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *,
 214     ipaddr_t *);
 215
 216 static void     ipif_save_ire(ipif_t *, ire_t *);
 217 static void     ipif_remove_ire(ipif_t *, ire_t *);
 218 static void     ip_cgtp_bcast_add(ire_t *, ire_t *, ip_stack_t *);
 219 static void     ip_cgtp_bcast_delete(ire_t *, ip_stack_t *);
 220
 221 /*
 222  * Per-ill IPsec capabilities management.
 223  */
 224 static ill_ipsec_capab_t *ill_ipsec_capab_alloc(void);
 225 static void     ill_ipsec_capab_free(ill_ipsec_capab_t *);
 226 static void     ill_ipsec_capab_add(ill_t *, uint_t, boolean_t);
 227 static void     ill_ipsec_capab_delete(ill_t *, uint_t);
 228 static boolean_t ill_ipsec_capab_resize_algparm(ill_ipsec_capab_t *, int);
 229 static void ill_capability_proto(ill_t *, int, mblk_t *);
 230 static void ill_capability_dispatch(ill_t *, mblk_t *, dl_capability_sub_t *,
 231     boolean_t);
 232 static void ill_capability_id_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
 233 static void ill_capability_mdt_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
 234 static void ill_capability_mdt_reset(ill_t *, mblk_t **);
 235 static void ill_capability_ipsec_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
 236 static void ill_capability_ipsec_reset(ill_t *, mblk_t **);
 237 static void ill_capability_hcksum_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
 238 static void ill_capability_hcksum_reset(ill_t *, mblk_t **);
 239 static void ill_capability_zerocopy_ack(ill_t *, mblk_t *,
 240     dl_capability_sub_t *);
 241 static void ill_capability_zerocopy_reset(ill_t *, mblk_t **);
 242 static void ill_capability_lso_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
 243 static void ill_capability_lso_reset(ill_t *, mblk_t **);
 244 static void ill_capability_dls_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
 245 static mac_resource_handle_t ill_ring_add(void *, mac_resource_t *);
 246 static void     ill_capability_dls_reset(ill_t *, mblk_t **);
 247 static void     ill_capability_dls_disable(ill_t *);
 248
 249 static void     illgrp_cache_delete(ire_t *, char *);
 250 static void     illgrp_delete(ill_t *ill);
 251 static void     illgrp_reset_schednext(ill_t *ill);
 252
 253 static ill_t    *ill_prev_usesrc(ill_t *);
 254 static int      ill_relink_usesrc_ills(ill_t *, ill_t *, uint_t);
 255 static void     ill_disband_usesrc_group(ill_t *);
 256
 257 static void     conn_cleanup_stale_ire(conn_t *, caddr_t);
 258
 259 #ifdef DEBUG
 260 static  void    ill_trace_cleanup(const ill_t *);
 261 static  void    ipif_trace_cleanup(const ipif_t *);
 262 #endif
 263
 264 /*
 265  * if we go over the memory footprint limit more than once in this msec
 266  * interval, we'll start pruning aggressively.
 267  */
 268 int ip_min_frag_prune_time = 0;
 269
 270 /*
 271  * max # of IPsec algorithms supported.  Limited to 1 byte by PF_KEY
 272  * and the IPsec DOI
 273  */
 274 #define MAX_IPSEC_ALGS  256
 275
 276 #define BITSPERBYTE     8
 277 #define BITS(type)      (BITSPERBYTE * (long)sizeof (type))
 278
 279 #define IPSEC_ALG_ENABLE(algs, algid) \
 280                 ((algs)[(algid) / BITS(ipsec_capab_elem_t)] |= \
 281                 (1 << ((algid) % BITS(ipsec_capab_elem_t))))
 282
 283 #define IPSEC_ALG_IS_ENABLED(algid, algs) \
 284                 ((algs)[(algid) / BITS(ipsec_capab_elem_t)] & \
 285                 (1 << ((algid) % BITS(ipsec_capab_elem_t))))
 286
 287 typedef uint8_t ipsec_capab_elem_t;
 288
 289 /*
 290  * Per-algorithm parameters.  Note that at present, only encryption
 291  * algorithms have variable keysize (IKE does not provide a way to negotiate
 292  * auth algorithm keysize).
 293  *
 294  * All sizes here are in bits.
 295  */
 296 typedef struct
 297 {
 298         uint16_t        minkeylen;
 299         uint16_t        maxkeylen;
 300 } ipsec_capab_algparm_t;
 301
 302 /*
 303  * Per-ill capabilities.
 304  */
 305 struct ill_ipsec_capab_s {
 306         ipsec_capab_elem_t *encr_hw_algs;
 307         ipsec_capab_elem_t *auth_hw_algs;
 308         uint32_t algs_size;     /* size of _hw_algs in bytes */
 309         /* algorithm key lengths */
 310         ipsec_capab_algparm_t *encr_algparm;
 311         uint32_t encr_algparm_size;
 312         uint32_t encr_algparm_end;
 313 };
 314
 315 /*
 316  * The field values are larger than strictly necessary for simple
 317  * AR_ENTRY_ADDs but the padding lets us accomodate the socket ioctls.
 318  */
 319 static area_t   ip_area_template = {
 320         AR_ENTRY_ADD,                   /* area_cmd */
 321         sizeof (ip_sock_ar_t) + (IP_ADDR_LEN*2) + sizeof (struct sockaddr_dl),
 322                                         /* area_name_offset */
 323         /* area_name_length temporarily holds this structure length */
 324         sizeof (area_t),                        /* area_name_length */
 325         IP_ARP_PROTO_TYPE,              /* area_proto */
 326         sizeof (ip_sock_ar_t),          /* area_proto_addr_offset */
 327         IP_ADDR_LEN,                    /* area_proto_addr_length */
 328         sizeof (ip_sock_ar_t) + IP_ADDR_LEN,
 329                                         /* area_proto_mask_offset */
 330         0,                              /* area_flags */
 331         sizeof (ip_sock_ar_t) + IP_ADDR_LEN + IP_ADDR_LEN,
 332                                         /* area_hw_addr_offset */
 333         /* Zero length hw_addr_length means 'use your idea of the address' */
 334         0                               /* area_hw_addr_length */
 335 };
 336
 337 /*
 338  * AR_ENTRY_ADD/DELETE templates have been added for IPv6 external resolver
 339  * support
 340  */
 341 static area_t   ip6_area_template = {
 342         AR_ENTRY_ADD,                   /* area_cmd */
 343         sizeof (ip_sock_ar_t) + (IPV6_ADDR_LEN*2) + sizeof (sin6_t),
 344                                         /* area_name_offset */
 345         /* area_name_length temporarily holds this structure length */
 346         sizeof (area_t),                        /* area_name_length */
 347         IP_ARP_PROTO_TYPE,              /* area_proto */
 348         sizeof (ip_sock_ar_t),          /* area_proto_addr_offset */
 349         IPV6_ADDR_LEN,                  /* area_proto_addr_length */
 350         sizeof (ip_sock_ar_t) + IPV6_ADDR_LEN,
 351                                         /* area_proto_mask_offset */
 352         0,                              /* area_flags */
 353         sizeof (ip_sock_ar_t) + IPV6_ADDR_LEN + IPV6_ADDR_LEN,
 354                                         /* area_hw_addr_offset */
 355         /* Zero length hw_addr_length means 'use your idea of the address' */
 356         0                               /* area_hw_addr_length */
 357 };
 358
 359 static ared_t   ip_ared_template = {
 360         AR_ENTRY_DELETE,
 361         sizeof (ared_t) + IP_ADDR_LEN,
 362         sizeof (ared_t),
 363         IP_ARP_PROTO_TYPE,
 364         sizeof (ared_t),
 365         IP_ADDR_LEN
 366 };
 367
 368 static ared_t   ip6_ared_template = {
 369         AR_ENTRY_DELETE,
 370         sizeof (ared_t) + IPV6_ADDR_LEN,
 371         sizeof (ared_t),
 372         IP_ARP_PROTO_TYPE,
 373         sizeof (ared_t),
 374         IPV6_ADDR_LEN
 375 };
 376
 377 /*
 378  * A template for an IPv6 AR_ENTRY_QUERY template has not been created, as
 379  * as the areq doesn't include an IP address in ill_dl_up() (the only place a
 380  * areq is used).
 381  */
 382 static areq_t   ip_areq_template = {
 383         AR_ENTRY_QUERY,                 /* cmd */
 384         sizeof (areq_t)+(2*IP_ADDR_LEN),        /* name offset */
 385         sizeof (areq_t),        /* name len (filled by ill_arp_alloc) */
 386         IP_ARP_PROTO_TYPE,              /* protocol, from arps perspective */
 387         sizeof (areq_t),                        /* target addr offset */
 388         IP_ADDR_LEN,                    /* target addr_length */
 389         0,                              /* flags */
 390         sizeof (areq_t) + IP_ADDR_LEN,  /* sender addr offset */
 391         IP_ADDR_LEN,                    /* sender addr length */
 392         AR_EQ_DEFAULT_XMIT_COUNT,       /* xmit_count */
 393         AR_EQ_DEFAULT_XMIT_INTERVAL,    /* (re)xmit_interval in milliseconds */
 394         AR_EQ_DEFAULT_MAX_BUFFERED      /* max # of requests to buffer */
 395         /* anything else filled in by the code */
 396 };
 397
 398 static arc_t    ip_aru_template = {
 399         AR_INTERFACE_UP,
 400         sizeof (arc_t),         /* Name offset */
 401         sizeof (arc_t)          /* Name length (set by ill_arp_alloc) */
 402 };
 403
 404 static arc_t    ip_ard_template = {
 405         AR_INTERFACE_DOWN,
 406         sizeof (arc_t),         /* Name offset */
 407         sizeof (arc_t)          /* Name length (set by ill_arp_alloc) */
 408 };
 409
 410 static arc_t    ip_aron_template = {
 411         AR_INTERFACE_ON,
 412         sizeof (arc_t),         /* Name offset */
 413         sizeof (arc_t)          /* Name length (set by ill_arp_alloc) */
 414 };
 415
 416 static arc_t    ip_aroff_template = {
 417         AR_INTERFACE_OFF,
 418         sizeof (arc_t),         /* Name offset */
 419         sizeof (arc_t)          /* Name length (set by ill_arp_alloc) */
 420 };
 421
 422
 423 static arma_t   ip_arma_multi_template = {
 424         AR_MAPPING_ADD,
 425         sizeof (arma_t) + 3*IP_ADDR_LEN + IP_MAX_HW_LEN,
 426                                 /* Name offset */
 427         sizeof (arma_t),        /* Name length (set by ill_arp_alloc) */
 428         IP_ARP_PROTO_TYPE,
 429         sizeof (arma_t),                        /* proto_addr_offset */
 430         IP_ADDR_LEN,                            /* proto_addr_length */
 431         sizeof (arma_t) + IP_ADDR_LEN,          /* proto_mask_offset */
 432         sizeof (arma_t) + 2*IP_ADDR_LEN,        /* proto_extract_mask_offset */
 433         ACE_F_PERMANENT | ACE_F_MAPPING,        /* flags */
 434         sizeof (arma_t) + 3*IP_ADDR_LEN,        /* hw_addr_offset */
 435         IP_MAX_HW_LEN,                          /* hw_addr_length */
 436         0,                                      /* hw_mapping_start */
 437 };
 438
 439 static ipft_t   ip_ioctl_ftbl[] = {
 440         { IP_IOC_IRE_DELETE, ip_ire_delete, sizeof (ipid_t), 0 },
 441         { IP_IOC_IRE_DELETE_NO_REPLY, ip_ire_delete, sizeof (ipid_t),
 442                 IPFT_F_NO_REPLY },
 443         { IP_IOC_IRE_ADVISE_NO_REPLY, ip_ire_advise, sizeof (ipic_t),
 444                 IPFT_F_NO_REPLY },
 445         { IP_IOC_RTS_REQUEST, ip_rts_request, 0, IPFT_F_SELF_REPLY },
 446         { 0 }
 447 };
 448
 449 /* Simple ICMP IP Header Template */
 450 static ipha_t icmp_ipha = {
 451         IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
 452 };
 453
 454 /* Flag descriptors for ip_ipif_report */
 455 static nv_t     ipif_nv_tbl[] = {
 456         { IPIF_UP,              "UP" },
 457         { IPIF_BROADCAST,       "BROADCAST" },
 458         { ILLF_DEBUG,           "DEBUG" },
 459         { PHYI_LOOPBACK,        "LOOPBACK" },
 460         { IPIF_POINTOPOINT,     "POINTOPOINT" },
 461         { ILLF_NOTRAILERS,      "NOTRAILERS" },
 462         { PHYI_RUNNING,         "RUNNING" },
 463         { ILLF_NOARP,           "NOARP" },
 464         { PHYI_PROMISC,         "PROMISC" },
 465         { PHYI_ALLMULTI,        "ALLMULTI" },
 466         { PHYI_INTELLIGENT,     "INTELLIGENT" },
 467         { ILLF_MULTICAST,       "MULTICAST" },
 468         { PHYI_MULTI_BCAST,     "MULTI_BCAST" },
 469         { IPIF_UNNUMBERED,      "UNNUMBERED" },
 470         { IPIF_DHCPRUNNING,     "DHCP" },
 471         { IPIF_PRIVATE,         "PRIVATE" },
 472         { IPIF_NOXMIT,          "NOXMIT" },
 473         { IPIF_NOLOCAL,         "NOLOCAL" },
 474         { IPIF_DEPRECATED,      "DEPRECATED" },
 475         { IPIF_PREFERRED,       "PREFERRED" },
 476         { IPIF_TEMPORARY,       "TEMPORARY" },
 477         { IPIF_ADDRCONF,        "ADDRCONF" },
 478         { PHYI_VIRTUAL,         "VIRTUAL" },
 479         { ILLF_ROUTER,          "ROUTER" },
 480         { ILLF_NONUD,           "NONUD" },
 481         { IPIF_ANYCAST,         "ANYCAST" },
 482         { ILLF_NORTEXCH,        "NORTEXCH" },
 483         { ILLF_IPV4,            "IPV4" },
 484         { ILLF_IPV6,            "IPV6" },
 485         { IPIF_NOFAILOVER,      "NOFAILOVER" },
 486         { PHYI_FAILED,          "FAILED" },
 487         { PHYI_STANDBY,         "STANDBY" },
 488         { PHYI_INACTIVE,        "INACTIVE" },
 489         { PHYI_OFFLINE,         "OFFLINE" },
 490 };
 491
 492 static uchar_t  ip_six_byte_all_ones[] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
 493
 494 static ip_m_t   ip_m_tbl[] = {
 495         { DL_ETHER, IFT_ETHER, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
 496             ip_ether_v6intfid },
 497         { DL_CSMACD, IFT_ISO88023, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
 498             ip_nodef_v6intfid },
 499         { DL_TPB, IFT_ISO88024, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
 500             ip_nodef_v6intfid },
 501         { DL_TPR, IFT_ISO88025, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
 502             ip_nodef_v6intfid },
 503         { DL_FDDI, IFT_FDDI, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
 504             ip_ether_v6intfid },
 505         { DL_IB, IFT_IB, ip_ib_v4mapinfo, ip_ib_v6mapinfo,
 506             ip_ib_v6intfid },
 507         { SUNW_DL_VNI, IFT_OTHER, NULL, NULL, NULL},
 508         { DL_OTHER, IFT_OTHER, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
 509             ip_nodef_v6intfid }
 510 };
 511
 512 static ill_t    ill_null;               /* Empty ILL for init. */
 513 char    ipif_loopback_name[] = "lo0";
 514 static char *ipv4_forward_suffix = ":ip_forwarding";
 515 static char *ipv6_forward_suffix = ":ip6_forwarding";
 516 static  sin6_t  sin6_null;      /* Zero address for quick clears */
 517 static  sin_t   sin_null;       /* Zero address for quick clears */
 518
 519 /* When set search for unused ipif_seqid */
 520 static ipif_t   ipif_zero;
 521
 522 /*
 523  * ppa arena is created after these many
 524  * interfaces have been plumbed.
 525  */
 526 uint_t  ill_no_arena = 12;      /* Setable in /etc/system */
 527
 528 /*
 529  * Enable soft rings if ip_squeue_soft_ring or ip_squeue_fanout
 530  * is set and ip_soft_rings_cnt > 0. ip_squeue_soft_ring is
 531  * set through platform specific code (Niagara/Ontario).
 532  */
 533 #define SOFT_RINGS_ENABLED()    (ip_soft_rings_cnt ? \
 534                 (ip_squeue_soft_ring || ip_squeue_fanout) : B_FALSE)
 535
 536 #define ILL_CAPAB_DLS   (ILL_CAPAB_SOFT_RING | ILL_CAPAB_POLL)
 537
 538 static uint_t
 539 ipif_rand(ip_stack_t *ipst)
 540 {
 541         ipst->ips_ipif_src_random = ipst->ips_ipif_src_random * 1103515245 +
 542             12345;
 543         return ((ipst->ips_ipif_src_random >> 16) & 0x7fff);
 544 }
 545
 546 /*
 547  * Allocate per-interface mibs.
 548  * Returns true if ok. False otherwise.
 549  *  ipsq  may not yet be allocated (loopback case ).
 550  */
 551 static boolean_t
 552 ill_allocate_mibs(ill_t *ill)
 553 {
 554         /* Already allocated? */
 555         if (ill->ill_ip_mib != NULL) {
 556                 if (ill->ill_isv6)
 557                         ASSERT(ill->ill_icmp6_mib != NULL);
 558                 return (B_TRUE);
 559         }
 560
 561         ill->ill_ip_mib = kmem_zalloc(sizeof (*ill->ill_ip_mib),
 562             KM_NOSLEEP);
 563         if (ill->ill_ip_mib == NULL) {
 564                 return (B_FALSE);
 565         }
 566
 567         /* Setup static information */
 568         SET_MIB(ill->ill_ip_mib->ipIfStatsEntrySize,
 569             sizeof (mib2_ipIfStatsEntry_t));
 570         if (ill->ill_isv6) {
 571                 ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv6;
 572                 SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize,
 573                     sizeof (mib2_ipv6AddrEntry_t));
 574                 SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize,
 575                     sizeof (mib2_ipv6RouteEntry_t));
 576                 SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize,
 577                     sizeof (mib2_ipv6NetToMediaEntry_t));
 578                 SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize,
 579                     sizeof (ipv6_member_t));
 580                 SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize,
 581                     sizeof (ipv6_grpsrc_t));
 582         } else {
 583                 ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv4;
 584                 SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize,
 585                     sizeof (mib2_ipAddrEntry_t));
 586                 SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize,
 587                     sizeof (mib2_ipRouteEntry_t));
 588                 SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize,
 589                     sizeof (mib2_ipNetToMediaEntry_t));
 590                 SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize,
 591                     sizeof (ip_member_t));
 592                 SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize,
 593                     sizeof (ip_grpsrc_t));
 594
 595                 /*
 596                  * For a v4 ill, we are done at this point, because per ill
 597                  * icmp mibs are only used for v6.
 598                  */
 599                 return (B_TRUE);
 600         }
 601
 602         ill->ill_icmp6_mib = kmem_zalloc(sizeof (*ill->ill_icmp6_mib),
 603             KM_NOSLEEP);
 604         if (ill->ill_icmp6_mib == NULL) {
 605                 kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib));
 606                 ill->ill_ip_mib = NULL;
 607                 return (B_FALSE);
 608         }
 609         /* static icmp info */
 610         ill->ill_icmp6_mib->ipv6IfIcmpEntrySize =
 611             sizeof (mib2_ipv6IfIcmpEntry_t);
 612         /*
 613          * The ipIfStatsIfindex and ipv6IfIcmpIndex will be assigned later
 614          * after the phyint merge occurs in ipif_set_values -> ill_glist_insert
 615          * -> ill_phyint_reinit
 616          */
 617         return (B_TRUE);
 618 }
 619
 620 /*
 621  * Common code for preparation of ARP commands.  Two points to remember:
 622  *      1) The ill_name is tacked on at the end of the allocated space so
 623  *         the templates name_offset field must contain the total space
 624  *         to allocate less the name length.
 625  *
 626  *      2) The templates name_length field should contain the *template*
 627  *         length.  We use it as a parameter to bcopy() and then write
 628  *         the real ill_name_length into the name_length field of the copy.
 629  * (Always called as writer.)
 630  */
 631 mblk_t *
 632 ill_arp_alloc(ill_t *ill, uchar_t *template, caddr_t addr)
 633 {
 634         arc_t   *arc = (arc_t *)template;
 635         char    *cp;
 636         int     len;
 637         mblk_t  *mp;
 638         uint_t  name_length = ill->ill_name_length;
 639         uint_t  template_len = arc->arc_name_length;
 640
 641         len = arc->arc_name_offset + name_length;
 642         mp = allocb(len, BPRI_HI);
 643         if (mp == NULL)
 644                 return (NULL);
 645         cp = (char *)mp->b_rptr;
 646         mp->b_wptr = (uchar_t *)&cp[len];
 647         if (template_len)
 648                 bcopy(template, cp, template_len);
 649         if (len > template_len)
 650                 bzero(&cp[template_len], len - template_len);
 651         mp->b_datap->db_type = M_PROTO;
 652
 653         arc = (arc_t *)cp;
 654         arc->arc_name_length = name_length;
 655         cp = (char *)arc + arc->arc_name_offset;
 656         bcopy(ill->ill_name, cp, name_length);
 657
 658         if (addr) {
 659                 area_t  *area = (area_t *)mp->b_rptr;
 660
 661                 cp = (char *)area + area->area_proto_addr_offset;
 662                 bcopy(addr, cp, area->area_proto_addr_length);
 663                 if (area->area_cmd == AR_ENTRY_ADD) {
 664                         cp = (char *)area;
 665                         len = area->area_proto_addr_length;
 666                         if (area->area_proto_mask_offset)
 667                                 cp += area->area_proto_mask_offset;
 668                         else
 669                                 cp += area->area_proto_addr_offset + len;
 670                         while (len-- > 0)
 671                                 *cp++ = (char)~0;
 672                 }
 673         }
 674         return (mp);
 675 }
 676
 677 mblk_t *
 678 ipif_area_alloc(ipif_t *ipif)
 679 {
 680         return (ill_arp_alloc(ipif->ipif_ill, (uchar_t *)&ip_area_template,
 681             (char *)&ipif->ipif_lcl_addr));
 682 }
 683
 684 mblk_t *
 685 ipif_ared_alloc(ipif_t *ipif)
 686 {
 687         return (ill_arp_alloc(ipif->ipif_ill, (uchar_t *)&ip_ared_template,
 688             (char *)&ipif->ipif_lcl_addr));
 689 }
 690
 691 mblk_t *
 692 ill_ared_alloc(ill_t *ill, ipaddr_t addr)
 693 {
 694         return (ill_arp_alloc(ill, (uchar_t *)&ip_ared_template,
 695             (char *)&addr));
 696 }
 697
 698 /*
 699  * Completely vaporize a lower level tap and all associated interfaces.
 700  * ill_delete is called only out of ip_close when the device control
 701  * stream is being closed.
 702  */
 703 void
 704 ill_delete(ill_t *ill)
 705 {
 706         ipif_t  *ipif;
 707         ill_t   *prev_ill;
 708         ip_stack_t      *ipst = ill->ill_ipst;
 709
 710         /*
 711          * ill_delete may be forcibly entering the ipsq. The previous
 712          * ioctl may not have completed and may need to be aborted.
 713          * ipsq_flush takes care of it. If we don't need to enter the
 714          * the ipsq forcibly, the 2nd invocation of ipsq_flush in
 715          * ill_delete_tail is sufficient.
 716          */
 717         ipsq_flush(ill);
 718
 719         /*
 720          * Nuke all interfaces.  ipif_free will take down the interface,
 721          * remove it from the list, and free the data structure.
 722          * Walk down the ipif list and remove the logical interfaces
 723          * first before removing the main ipif. We can't unplumb
 724          * zeroth interface first in the case of IPv6 as reset_conn_ill
 725          * -> ip_ll_delmulti_v6 de-references ill_ipif for checking
 726          * POINTOPOINT.
 727          *
 728          * If ill_ipif was not properly initialized (i.e low on memory),
 729          * then no interfaces to clean up. In this case just clean up the
 730          * ill.
 731          */
 732         for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
 733                 ipif_free(ipif);
 734
 735         /*
 736          * Used only by ill_arp_on and ill_arp_off, which are writers.
 737          * So nobody can be using this mp now. Free the mp allocated for
 738          * honoring ILLF_NOARP
 739          */
 740         freemsg(ill->ill_arp_on_mp);
 741         ill->ill_arp_on_mp = NULL;
 742
 743         /* Clean up msgs on pending upcalls for mrouted */
 744         reset_mrt_ill(ill);
 745
 746         /*
 747          * ipif_free -> reset_conn_ipif will remove all multicast
 748          * references for IPv4. For IPv6, we need to do it here as
 749          * it points only at ills.
 750          */
 751         reset_conn_ill(ill);
 752
 753         /*
 754          * ill_down will arrange to blow off any IRE's dependent on this
 755          * ILL, and shut down fragmentation reassembly.
 756          */
 757         ill_down(ill);
 758
 759         /* Let SCTP know, so that it can remove this from its list. */
 760         sctp_update_ill(ill, SCTP_ILL_REMOVE);
 761
 762         /*
 763          * If an address on this ILL is being used as a source address then
 764          * clear out the pointers in other ILLs that point to this ILL.
 765          */
 766         rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER);
 767         if (ill->ill_usesrc_grp_next != NULL) {
 768                 if (ill->ill_usesrc_ifindex == 0) { /* usesrc ILL ? */
 769                         ill_disband_usesrc_group(ill);
 770                 } else {        /* consumer of the usesrc ILL */
 771                         prev_ill = ill_prev_usesrc(ill);
 772                         prev_ill->ill_usesrc_grp_next =
 773                             ill->ill_usesrc_grp_next;
 774                 }
 775         }
 776         rw_exit(&ipst->ips_ill_g_usesrc_lock);
 777 }
 778
 779 static void
 780 ipif_non_duplicate(ipif_t *ipif)
 781 {
 782         ill_t *ill = ipif->ipif_ill;
 783         mutex_enter(&ill->ill_lock);
 784         if (ipif->ipif_flags & IPIF_DUPLICATE) {
 785                 ipif->ipif_flags &= ~IPIF_DUPLICATE;
 786                 ASSERT(ill->ill_ipif_dup_count > 0);
 787                 ill->ill_ipif_dup_count--;
 788         }
 789         mutex_exit(&ill->ill_lock);
 790 }
 791
/*
 * ill_delete_tail is called from ip_modclose after all references
 * to the closing ill are gone. The wait is done in ip_modclose.
 *
 * This is the second half of ill teardown.  In order, it:
 *   - finishes bringing down every ipif (ipif_non_duplicate +
 *     ipif_down_tail),
 *   - waits for any in-progress DLPI unbind to complete,
 *   - disables polling / soft ring capabilities and turns off the read
 *     queue's procedures (non-loopback only),
 *   - flushes the ipsq one more time,
 *   - frees the per-ill capability structures,
 *   - frees every ipif via ipif_free_tail,
 *   - removes the ill from the global list and from the ndd table,
 *   - frees the fragmentation hash table, the retained control messages
 *     and the MIB counters,
 *   - drops the netstack reference and clears ill_ipst.
 *
 * The ill structure itself is not freed here.
 */
void
ill_delete_tail(ill_t *ill)
{
        mblk_t  **mpp;
        ipif_t  *ipif;
        ip_stack_t      *ipst = ill->ill_ipst;

        /*
         * Clear any leftover duplicate-address accounting and run the
         * tail of the down processing on each ipif.
         */
        for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
                ipif_non_duplicate(ipif);
                ipif_down_tail(ipif);
        }

        /* After the loop above none of these may remain outstanding. */
        ASSERT(ill->ill_ipif_dup_count == 0 &&
            ill->ill_arp_down_mp == NULL &&
            ill->ill_arp_del_mapping_mp == NULL);

        /*
         * If polling capability is enabled (which signifies direct
         * upcall into IP and driver has ill saved as a handle),
         * we need to make sure that unbind has completed before we
         * let the ill disappear and driver no longer has any reference
         * to this ill.
         */
        mutex_enter(&ill->ill_lock);
        while (ill->ill_state_flags & ILL_DL_UNBIND_IN_PROGRESS)
                cv_wait(&ill->ill_cv, &ill->ill_lock);
        mutex_exit(&ill->ill_lock);

        /*
         * Clean up polling and soft ring capabilities
         */
        if (ill->ill_capabilities & (ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING))
                ill_capability_dls_disable(ill);

        /* Loopback ills are skipped here; all others get qprocsoff(). */
        if (ill->ill_net_type != IRE_LOOPBACK)
                qprocsoff(ill->ill_rq);

        /*
         * We do an ipsq_flush once again now. New messages could have
         * landed up from below (M_ERROR or M_HANGUP). Similarly ioctls
         * could also have landed up if an ioctl thread had looked up
         * the ill before we set the ILL_CONDEMNED flag, but not yet
         * enqueued the ioctl when we did the ipsq_flush last time.
         */
        ipsq_flush(ill);

        /*
         * Free capabilities.  Each pointer is cleared after its structure
         * is released.
         */
        if (ill->ill_ipsec_capab_ah != NULL) {
                ill_ipsec_capab_delete(ill, DL_CAPAB_IPSEC_AH);
                ill_ipsec_capab_free(ill->ill_ipsec_capab_ah);
                ill->ill_ipsec_capab_ah = NULL;
        }

        if (ill->ill_ipsec_capab_esp != NULL) {
                ill_ipsec_capab_delete(ill, DL_CAPAB_IPSEC_ESP);
                ill_ipsec_capab_free(ill->ill_ipsec_capab_esp);
                ill->ill_ipsec_capab_esp = NULL;
        }

        if (ill->ill_mdt_capab != NULL) {
                kmem_free(ill->ill_mdt_capab, sizeof (ill_mdt_capab_t));
                ill->ill_mdt_capab = NULL;
        }

        if (ill->ill_hcksum_capab != NULL) {
                kmem_free(ill->ill_hcksum_capab, sizeof (ill_hcksum_capab_t));
                ill->ill_hcksum_capab = NULL;
        }

        if (ill->ill_zerocopy_capab != NULL) {
                kmem_free(ill->ill_zerocopy_capab,
                    sizeof (ill_zerocopy_capab_t));
                ill->ill_zerocopy_capab = NULL;
        }

        if (ill->ill_lso_capab != NULL) {
                kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t));
                ill->ill_lso_capab = NULL;
        }

        /*
         * The dls capability holds a conn reference (ill_unbind_conn) that
         * is dropped first.  The size freed covers the structure plus
         * ILL_MAX_RINGS ill_rx_ring_t entries; presumably this mirrors the
         * size used when it was allocated -- verify against the allocator.
         */
        if (ill->ill_dls_capab != NULL) {
                CONN_DEC_REF(ill->ill_dls_capab->ill_unbind_conn);
                ill->ill_dls_capab->ill_unbind_conn = NULL;
                kmem_free(ill->ill_dls_capab,
                    sizeof (ill_dls_capab_t) +
                    (sizeof (ill_rx_ring_t) * ILL_MAX_RINGS));
                ill->ill_dls_capab = NULL;
        }

        ASSERT(!(ill->ill_capabilities & ILL_CAPAB_POLL));

        /*
         * Release the ipifs for good.  The loop re-reads ill_ipif each
         * time, so it relies on ipif_free_tail() taking the ipif it is
         * given off the ill's list.
         */
        while (ill->ill_ipif != NULL)
                ipif_free_tail(ill->ill_ipif);

        /*
         * We have removed all references to ilm from conn and the ones joined
         * within the kernel.
         *
         * We don't walk conns, mrts and ires because
         *
         * 1) reset_conn_ill and reset_mrt_ill cleans up conns and mrts.
         * 2) ill_down ->ill_downi walks all the ires and cleans up
         *    ill references.
         */
        ASSERT(ilm_walk_ill(ill) == 0);
        /*
         * Take us out of the list of ILLs. ill_glist_delete -> ill_phyint_free
         * could free the phyint. No more reference to the phyint after this
         * point.
         */
        (void) ill_glist_delete(ill);

        /* Unregister this ill's ndd name, if it has one. */
        rw_enter(&ipst->ips_ip_g_nd_lock, RW_WRITER);
        if (ill->ill_ndd_name != NULL)
                nd_unload(&ipst->ips_ip_g_nd, ill->ill_ndd_name);
        rw_exit(&ipst->ips_ip_g_nd_lock);


        /*
         * Tear down the fragmentation hash table: destroy the lock in each
         * of the ILL_FRAG_HASH_TBL_COUNT buckets, then free the backing
         * memory through ill_frag_ptr.
         */
        if (ill->ill_frag_ptr != NULL) {
                uint_t count;

                for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) {
                        mutex_destroy(&ill->ill_frag_hash_tbl[count].ipfb_lock);
                }
                mi_free(ill->ill_frag_ptr);
                ill->ill_frag_ptr = NULL;
                ill->ill_frag_hash_tbl = NULL;
        }

        freemsg(ill->ill_nd_lla_mp);
        /*
         * Free all retained control messages.  The mblk pointers from
         * ill_first_mp_to_free through ill_last_mp_to_free (inclusive) are
         * walked as an array of slots.  Each slot heads a b_next chain;
         * every message is popped off the chain, b_next/b_prev are cleared
         * on each of its b_cont segments, and the message is then freed.
         */
        mpp = &ill->ill_first_mp_to_free;
        do {
                while (mpp[0]) {
                        mblk_t  *mp;
                        mblk_t  *mp1;

                        mp = mpp[0];
                        mpp[0] = mp->b_next;
                        for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) {
                                mp1->b_next = NULL;
                                mp1->b_prev = NULL;
                        }
                        freemsg(mp);
                }
        } while (mpp++ != &ill->ill_last_mp_to_free);

        /* Fold the per-ill MIB counters into the global ones and free. */
        ill_free_mib(ill);

#ifdef DEBUG
        ill_trace_cleanup(ill);
#endif

        /* Drop refcnt here */
        netstack_rele(ill->ill_ipst->ips_netstack);
        ill->ill_ipst = NULL;
}
 955
 956 static void
 957 ill_free_mib(ill_t *ill)
 958 {
 959         ip_stack_t *ipst = ill->ill_ipst;
 960
 961         /*
 962          * MIB statistics must not be lost, so when an interface
 963          * goes away the counter values will be added to the global
 964          * MIBs.
 965          */
 966         if (ill->ill_ip_mib != NULL) {
 967                 if (ill->ill_isv6) {
 968                         ip_mib2_add_ip_stats(&ipst->ips_ip6_mib,
 969                             ill->ill_ip_mib);
 970                 } else {
 971                         ip_mib2_add_ip_stats(&ipst->ips_ip_mib,
 972                             ill->ill_ip_mib);
 973                 }
 974
 975                 kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib));
 976                 ill->ill_ip_mib = NULL;
 977         }
 978         if (ill->ill_icmp6_mib != NULL) {
 979                 ip_mib2_add_icmp6_stats(&ipst->ips_icmp6_mib,
 980                     ill->ill_icmp6_mib);
 981                 kmem_free(ill->ill_icmp6_mib, sizeof (*ill->ill_icmp6_mib));
 982                 ill->ill_icmp6_mib = NULL;
 983         }
 984 }
 985
 986 /*
 987  * Concatenate together a physical address and a sap.
 988  *
 989  * Sap_lengths are interpreted as follows:
 990  *   sap_length == 0    ==>     no sap
 991  *   sap_length > 0     ==>     sap is at the head of the dlpi address
 992  *   sap_length < 0     ==>     sap is at the tail of the dlpi address
 993  */
 994 static void
 995 ill_dlur_copy_address(uchar_t *phys_src, uint_t phys_length,
 996     t_scalar_t sap_src, t_scalar_t sap_length, uchar_t *dst)
 997 {
 998         uint16_t sap_addr = (uint16_t)sap_src;
 999
1000         if (sap_length == 0) {
1001                 if (phys_src == NULL)
1002                         bzero(dst, phys_length);
1003                 else
1004                         bcopy(phys_src, dst, phys_length);
1005         } else if (sap_length < 0) {
1006                 if (phys_src == NULL)
1007                         bzero(dst, phys_length);
1008                 else
1009                         bcopy(phys_src, dst, phys_length);
1010                 bcopy(&sap_addr, (char *)dst + phys_length, sizeof (sap_addr));
1011         } else {
1012                 bcopy(&sap_addr, dst, sizeof (sap_addr));
1013                 if (phys_src == NULL)
1014                         bzero((char *)dst + sap_length, phys_length);
1015                 else
1016                         bcopy(phys_src, (char *)dst + sap_length, phys_length);
1017         }
1018 }
1019
1020 /*
1021  * Generate a dl_unitdata_req mblk for the device and address given.
1022  * addr_length is the length of the physical portion of the address.
1023  * If addr is NULL include an all zero address of the specified length.
1024  * TRUE? In any case, addr_length is taken to be the entire length of the
1025  * dlpi address, including the absolute value of sap_length.
1026  */
1027 mblk_t *
1028 ill_dlur_gen(uchar_t *addr, uint_t addr_length, t_uscalar_t sap,
1029                 t_scalar_t sap_length)
1030 {
1031         dl_unitdata_req_t *dlur;
1032         mblk_t  *mp;
1033         t_scalar_t      abs_sap_length;         /* absolute value */
1034
1035         abs_sap_length = ABS(sap_length);
1036         mp = ip_dlpi_alloc(sizeof (*dlur) + addr_length + abs_sap_length,
1037             DL_UNITDATA_REQ);
1038         if (mp == NULL)
1039                 return (NULL);
1040         dlur = (dl_unitdata_req_t *)mp->b_rptr;
1041         /* HACK: accomodate incompatible DLPI drivers */
1042         if (addr_length == 8)
1043                 addr_length = 6;
1044         dlur->dl_dest_addr_length = addr_length + abs_sap_length;
1045         dlur->dl_dest_addr_offset = sizeof (*dlur);
1046         dlur->dl_priority.dl_min = 0;
1047         dlur->dl_priority.dl_max = 0;
1048         ill_dlur_copy_address(addr, addr_length, sap, sap_length,
1049             (uchar_t *)&dlur[1]);
1050         return (mp);
1051 }
1052
1053 /*
1054  * Add the 'mp' to the list of pending mp's headed by ill_pending_mp
1055  * Return an error if we already have 1 or more ioctls in progress.
1056  * This is used only for non-exclusive ioctls. Currently this is used
1057  * for SIOC*ARP and SIOCGTUNPARAM ioctls. Most set ioctls are exclusive
1058  * and thus need to use ipsq_pending_mp_add.
1059  */
1060 boolean_t
1061 ill_pending_mp_add(ill_t *ill, conn_t *connp, mblk_t *add_mp)
1062 {
1063         ASSERT(MUTEX_HELD(&ill->ill_lock));
1064         ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL));
1065         /*
1066          * M_IOCDATA from ioctls, M_IOCTL from tunnel ioctls.
1067          */
1068         ASSERT((add_mp->b_datap->db_type == M_IOCDATA) ||
1069             (add_mp->b_datap->db_type == M_IOCTL));
1070
1071         ASSERT(MUTEX_HELD(&connp->conn_lock));
1072         /*
1073          * Return error if the conn has started closing. The conn
1074          * could have finished cleaning up the pending mp list,
1075          * If so we should not add another mp to the list negating
1076          * the cleanup.
1077          */
1078         if (connp->conn_state_flags & CONN_CLOSING)
1079                 return (B_FALSE);
1080         /*
1081          * Add the pending mp to the head of the list, chained by b_next.
1082          * Note down the conn on which the ioctl request came, in b_prev.
1083          * This will be used to later get the conn, when we get a response
1084          * on the ill queue, from some other module (typically arp)
1085          */
1086         add_mp->b_next = (void *)ill->ill_pending_mp;
1087         add_mp->b_queue = CONNP_TO_WQ(connp);
1088         ill->ill_pending_mp = add_mp;
1089         if (connp != NULL)
1090                 connp->conn_oper_pending_ill = ill;
1091         return (B_TRUE);
1092 }
1093
1094 /*
1095  * Retrieve the ill_pending_mp and return it. We have to walk the list
1096  * of mblks starting at ill_pending_mp, and match based on the ioc_id.
1097  */
1098 mblk_t *
1099 ill_pending_mp_get(ill_t *ill, conn_t **connpp, uint_t ioc_id)
1100 {
1101         mblk_t  *prev = NULL;
1102         mblk_t  *curr = NULL;
1103         uint_t  id;
1104         conn_t  *connp;
1105
1106         /*
1107          * When the conn closes, conn_ioctl_cleanup needs to clean
1108          * up the pending mp, but it does not know the ioc_id and
1109          * passes in a zero for it.
1110          */
1111         mutex_enter(&ill->ill_lock);
1112         if (ioc_id != 0)
1113                 *connpp = NULL;
1114
1115         /* Search the list for the appropriate ioctl based on ioc_id */
1116         for (prev = NULL, curr = ill->ill_pending_mp; curr != NULL;
1117             prev = curr, curr = curr->b_next) {
1118                 id = ((struct iocblk *)curr->b_rptr)->ioc_id;
1119                 connp = Q_TO_CONN(curr->b_queue);
1120                 /* Match based on the ioc_id or based on the conn */
1121                 if ((id == ioc_id) || (ioc_id == 0 && connp == *connpp))
1122                         break;
1123         }
1124
1125         if (curr != NULL) {
1126                 /* Unlink the mblk from the pending mp list */
1127                 if (prev != NULL) {
1128                         prev->b_next = curr->b_next;
1129                 } else {
1130                         ASSERT(ill->ill_pending_mp == curr);
1131                         ill->ill_pending_mp = curr->b_next;
1132                 }
1133
1134                 /*
1135                  * conn refcnt must have been bumped up at the start of
1136                  * the ioctl. So we can safely access the conn.
1137                  */
1138                 ASSERT(CONN_Q(curr->b_queue));
1139                 *connpp = Q_TO_CONN(curr->b_queue);
1140                 curr->b_next = NULL;
1141                 curr->b_queue = NULL;
1142         }
1143
1144         mutex_exit(&ill->ill_lock);
1145
1146         return (curr);
1147 }
1148
1149 /*
1150  * Add the pending mp to the list. There can be only 1 pending mp
1151  * in the list. Any exclusive ioctl that needs to wait for a response
1152  * from another module or driver needs to use this function to set
1153  * the ipsq_pending_mp to the ioctl mblk and wait for the response from
1154  * the other module/driver. This is also used while waiting for the
1155  * ipif/ill/ire refcnts to drop to zero in bringing down an ipif.
1156  */
1157 boolean_t
1158 ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp,
1159     int waitfor)
1160 {
1161         ipsq_t  *ipsq = ipif->ipif_ill->ill_phyint->phyint_ipsq;
1162
1163         ASSERT(IAM_WRITER_IPIF(ipif));
1164         ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
1165         ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL));
1166         ASSERT(ipsq->ipsq_pending_mp == NULL);
1167         /*
1168          * The caller may be using a different ipif than the one passed into
1169          * ipsq_current_start() (e.g., suppose an ioctl that came in on the V4
1170          * ill needs to wait for the V6 ill to quiesce).  So we can't ASSERT
1171          * that `ipsq_current_ipif == ipif'.
1172          */
1173         ASSERT(ipsq->ipsq_current_ipif != NULL);
1174
1175         /*
1176          * M_IOCDATA from ioctls, M_IOCTL from tunnel ioctls,
1177          * M_ERROR/M_HANGUP/M_PROTO/M_PCPROTO from the driver.
1178          */
1179         ASSERT((DB_TYPE(add_mp) == M_IOCDATA) || (DB_TYPE(add_mp) == M_IOCTL) ||
1180             (DB_TYPE(add_mp) == M_ERROR) || (DB_TYPE(add_mp) == M_HANGUP) ||
1181             (DB_TYPE(add_mp) == M_PROTO) || (DB_TYPE(add_mp) == M_PCPROTO));
1182
1183         if (connp != NULL) {
1184                 ASSERT(MUTEX_HELD(&connp->conn_lock));
1185                 /*
1186                  * Return error if the conn has started closing. The conn
1187                  * could have finished cleaning up the pending mp list,
1188                  * If so we should not add another mp to the list negating
1189                  * the cleanup.
1190                  */
1191                 if (connp->conn_state_flags & CONN_CLOSING)
1192                         return (B_FALSE);
1193         }
1194         mutex_enter(&ipsq->ipsq_lock);
1195         ipsq->ipsq_pending_ipif = ipif;
1196         /*
1197          * Note down the queue in b_queue. This will be returned by
1198          * ipsq_pending_mp_get. Caller will then use these values to restart
1199          * the processing
1200          */
1201         add_mp->b_next = NULL;
1202         add_mp->b_queue = q;
1203         ipsq->ipsq_pending_mp = add_mp;
1204         ipsq->ipsq_waitfor = waitfor;
1205
1206         if (connp != NULL)
1207                 connp->conn_oper_pending_ill = ipif->ipif_ill;
1208         mutex_exit(&ipsq->ipsq_lock);
1209         return (B_TRUE);
1210 }
1211
1212 /*
1213  * Retrieve the ipsq_pending_mp and return it. There can be only 1 mp
1214  * queued in the list.
1215  */
1216 mblk_t *
1217 ipsq_pending_mp_get(ipsq_t *ipsq, conn_t **connpp)
1218 {
1219         mblk_t  *curr = NULL;
1220
1221         mutex_enter(&ipsq->ipsq_lock);
1222         *connpp = NULL;
1223         if (ipsq->ipsq_pending_mp == NULL) {
1224                 mutex_exit(&ipsq->ipsq_lock);
1225                 return (NULL);
1226         }
1227
1228         /* There can be only 1 such excl message */
1229         curr = ipsq->ipsq_pending_mp;
1230         ASSERT(curr != NULL && curr->b_next == NULL);
1231         ipsq->ipsq_pending_ipif = NULL;
1232         ipsq->ipsq_pending_mp = NULL;
1233         ipsq->ipsq_waitfor = 0;
1234         mutex_exit(&ipsq->ipsq_lock);
1235
1236         if (CONN_Q(curr->b_queue)) {
1237                 /*
1238                  * This mp did a refhold on the conn, at the start of the ioctl.
1239                  * So we can safely return a pointer to the conn to the caller.
1240                  */
1241                 *connpp = Q_TO_CONN(curr->b_queue);
1242         } else {
1243                 *connpp = NULL;
1244         }
1245         curr->b_next = NULL;
1246         curr->b_prev = NULL;
1247         return (curr);
1248 }
1249
1250 /*
1251  * Cleanup the ioctl mp queued in ipsq_pending_mp
1252  * - Called in the ill_delete path
1253  * - Called in the M_ERROR or M_HANGUP path on the ill.
1254  * - Called in the conn close path.
1255  */
1256 boolean_t
1257 ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp)
1258 {
1259         mblk_t  *mp;
1260         ipsq_t  *ipsq;
1261         queue_t *q;
1262         ipif_t  *ipif;
1263
1264         ASSERT(IAM_WRITER_ILL(ill));
1265         ipsq = ill->ill_phyint->phyint_ipsq;
1266         mutex_enter(&ipsq->ipsq_lock);
1267         /*
1268          * If connp is null, unconditionally clean up the ipsq_pending_mp.
1269          * This happens in M_ERROR/M_HANGUP. We need to abort the current ioctl
1270          * even if it is meant for another ill, since we have to enqueue
1271          * a new mp now in ipsq_pending_mp to complete the ipif_down.
1272          * If connp is non-null we are called from the conn close path.
1273          */
1274         mp = ipsq->ipsq_pending_mp;
1275         if (mp == NULL || (connp != NULL &&
1276             mp->b_queue != CONNP_TO_WQ(connp))) {
1277                 mutex_exit(&ipsq->ipsq_lock);
1278                 return (B_FALSE);
1279         }
1280         /* Now remove from the ipsq_pending_mp */
1281         ipsq->ipsq_pending_mp = NULL;
1282         q = mp->b_queue;
1283         mp->b_next = NULL;
1284         mp->b_prev = NULL;
1285         mp->b_queue = NULL;
1286
1287         /* If MOVE was in progress, clear the move_in_progress fields also. */
1288         ill = ipsq->ipsq_pending_ipif->ipif_ill;
1289         if (ill->ill_move_in_progress) {
1290                 ILL_CLEAR_MOVE(ill);
1291         } else if (ill->ill_up_ipifs) {
1292                 ill_group_cleanup(ill);
1293         }
1294
1295         ipif = ipsq->ipsq_pending_ipif;
1296         ipsq->ipsq_pending_ipif = NULL;
1297         ipsq->ipsq_waitfor = 0;
1298         ipsq->ipsq_current_ipif = NULL;
1299         ipsq->ipsq_current_ioctl = 0;
1300         mutex_exit(&ipsq->ipsq_lock);
1301
1302         if (DB_TYPE(mp) == M_IOCTL || DB_TYPE(mp) == M_IOCDATA) {
1303                 if (connp == NULL) {
1304                         ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL);
1305                 } else {
1306                         ip_ioctl_finish(q, mp, ENXIO, CONN_CLOSE, NULL);
1307                         mutex_enter(&ipif->ipif_ill->ill_lock);
1308                         ipif->ipif_state_flags &= ~IPIF_CHANGING;
1309                         mutex_exit(&ipif->ipif_ill->ill_lock);
1310                 }
1311         } else {
1312                 /*
1313                  * IP-MT XXX In the case of TLI/XTI bind / optmgmt this can't
1314                  * be just inet_freemsg. we have to restart it
1315                  * otherwise the thread will be stuck.
1316                  */
1317                 inet_freemsg(mp);
1318         }
1319         return (B_TRUE);
1320 }
1321
1322 /*
1323  * The ill is closing. Cleanup all the pending mps. Called exclusively
1324  * towards the end of ill_delete. The refcount has gone to 0. So nobody
1325  * knows this ill, and hence nobody can add an mp to this list
1326  */
1327 static void
1328 ill_pending_mp_cleanup(ill_t *ill)
1329 {
1330         mblk_t  *mp;
1331         queue_t *q;
1332
1333         ASSERT(IAM_WRITER_ILL(ill));
1334
1335         mutex_enter(&ill->ill_lock);
1336         /*
1337          * Every mp on the pending mp list originating from an ioctl
1338          * added 1 to the conn refcnt, at the start of the ioctl.
1339          * So bump it down now.  See comments in ip_wput_nondata()
1340          */
1341         while (ill->ill_pending_mp != NULL) {
1342                 mp = ill->ill_pending_mp;
1343                 ill->ill_pending_mp = mp->b_next;
1344                 mutex_exit(&ill->ill_lock);
1345
1346                 q = mp->b_queue;
1347                 ASSERT(CONN_Q(q));
1348                 mp->b_next = NULL;
1349                 mp->b_prev = NULL;
1350                 mp->b_queue = NULL;
1351                 ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL);
1352                 mutex_enter(&ill->ill_lock);
1353         }
1354         ill->ill_pending_ipif = NULL;
1355
1356         mutex_exit(&ill->ill_lock);
1357 }
1358
1359 /*
1360  * Called in the conn close path and ill delete path
1361  */
1362 static void
1363 ipsq_xopq_mp_cleanup(ill_t *ill, conn_t *connp)
1364 {
1365         ipsq_t  *ipsq;
1366         mblk_t  *prev;
1367         mblk_t  *curr;
1368         mblk_t  *next;
1369         queue_t *q;
1370         mblk_t  *tmp_list = NULL;
1371
1372         ASSERT(IAM_WRITER_ILL(ill));
1373         if (connp != NULL)
1374                 q = CONNP_TO_WQ(connp);
1375         else
1376                 q = ill->ill_wq;
1377
1378         ipsq = ill->ill_phyint->phyint_ipsq;
1379         /*
1380          * Cleanup the ioctl mp's queued in ipsq_xopq_pending_mp if any.
1381          * In the case of ioctl from a conn, there can be only 1 mp
1382          * queued on the ipsq. If an ill is being unplumbed, only messages
1383          * related to this ill are flushed, like M_ERROR or M_HANGUP message.
1384          * ioctls meant for this ill form conn's are not flushed. They will
1385          * be processed during ipsq_exit and will not find the ill and will
1386          * return error.
1387          */
1388         mutex_enter(&ipsq->ipsq_lock);
1389         for (prev = NULL, curr = ipsq->ipsq_xopq_mphead; curr != NULL;
1390             curr = next) {
1391                 next = curr->b_next;
1392                 if (curr->b_queue == q || curr->b_queue == RD(q)) {
1393                         /* Unlink the mblk from the pending mp list */
1394                         if (prev != NULL) {
1395                                 prev->b_next = curr->b_next;
1396                         } else {
1397                                 ASSERT(ipsq->ipsq_xopq_mphead == curr);
1398                                 ipsq->ipsq_xopq_mphead = curr->b_next;
1399                         }
1400                         if (ipsq->ipsq_xopq_mptail == curr)
1401                                 ipsq->ipsq_xopq_mptail = prev;
1402                         /*
1403                          * Create a temporary list and release the ipsq lock
1404                          * New elements are added to the head of the tmp_list
1405                          */
1406                         curr->b_next = tmp_list;
1407                         tmp_list = curr;
1408                 } else {
1409                         prev = curr;
1410                 }
1411         }
1412         mutex_exit(&ipsq->ipsq_lock);
1413
1414         while (tmp_list != NULL) {
1415                 curr = tmp_list;
1416                 tmp_list = curr->b_next;
1417                 curr->b_next = NULL;
1418                 curr->b_prev = NULL;
1419                 curr->b_queue = NULL;
1420                 if (DB_TYPE(curr) == M_IOCTL || DB_TYPE(curr) == M_IOCDATA) {
1421                         ip_ioctl_finish(q, curr, ENXIO, connp != NULL ?
1422                             CONN_CLOSE : NO_COPYOUT, NULL);
1423                 } else {
1424                         /*
1425                          * IP-MT XXX In the case of TLI/XTI bind / optmgmt
1426                          * this can't be just inet_freemsg. we have to
1427                          * restart it otherwise the thread will be stuck.
1428                          */
1429                         inet_freemsg(curr);
1430                 }
1431         }
1432 }
1433
1434 /*
1435  * This conn has started closing. Cleanup any pending ioctl from this conn.
1436  * STREAMS ensures that there can be at most 1 ioctl pending on a stream.
1437  */
1438 void
1439 conn_ioctl_cleanup(conn_t *connp)
1440 {
1441         mblk_t *curr;
1442         ipsq_t  *ipsq;
1443         ill_t   *ill;
1444         boolean_t refheld;
1445
1446         /*
1447          * Is any exclusive ioctl pending ? If so clean it up. If the
1448          * ioctl has not yet started, the mp is pending in the list headed by
1449          * ipsq_xopq_head. If the ioctl has started the mp could be present in
1450          * ipsq_pending_mp. If the ioctl timed out in the streamhead but
1451          * is currently executing now the mp is not queued anywhere but
1452          * conn_oper_pending_ill is null. The conn close will wait
1453          * till the conn_ref drops to zero.
1454          */
1455         mutex_enter(&connp->conn_lock);
1456         ill = connp->conn_oper_pending_ill;
1457         if (ill == NULL) {
1458                 mutex_exit(&connp->conn_lock);
1459                 return;
1460         }
1461
1462         curr = ill_pending_mp_get(ill, &connp, 0);
1463         if (curr != NULL) {
1464                 mutex_exit(&connp->conn_lock);
1465                 CONN_DEC_REF(connp);
1466                 inet_freemsg(curr);
1467                 return;
1468         }
1469         /*
1470          * We may not be able to refhold the ill if the ill/ipif
1471          * is changing. But we need to make sure that the ill will
1472          * not vanish. So we just bump up the ill_waiter count.
1473          */
1474         refheld = ill_waiter_inc(ill);
1475         mutex_exit(&connp->conn_lock);
1476         if (refheld) {
1477                 if (ipsq_enter(ill, B_TRUE)) {
1478                         ill_waiter_dcr(ill);
1479                         /*
1480                          * Check whether this ioctl has started and is
1481                          * pending now in ipsq_pending_mp. If it is not
1482                          * found there then check whether this ioctl has
1483                          * not even started and is in the ipsq_xopq list.
1484                          */
1485                         if (!ipsq_pending_mp_cleanup(ill, connp))
1486                                 ipsq_xopq_mp_cleanup(ill, connp);
1487                         ipsq = ill->ill_phyint->phyint_ipsq;
1488                         ipsq_exit(ipsq, B_TRUE, B_TRUE);
1489                         return;
1490                 }
1491         }
1492
1493         /*
1494          * The ill is also closing and we could not bump up the
1495          * ill_waiter_count or we could not enter the ipsq. Leave
1496          * the cleanup to ill_delete
1497          */
1498         mutex_enter(&connp->conn_lock);
1499         while (connp->conn_oper_pending_ill != NULL)
1500                 cv_wait(&connp->conn_refcv, &connp->conn_lock);
1501         mutex_exit(&connp->conn_lock);
1502         if (refheld)
1503                 ill_waiter_dcr(ill);
1504 }
1505
1506 /*
1507  * ipcl_walk function for cleaning up conn_*_ill fields.
1508  */
1509 static void
1510 conn_cleanup_ill(conn_t *connp, caddr_t arg)
1511 {
1512         ill_t   *ill = (ill_t *)arg;
1513         ire_t   *ire;
1514
1515         mutex_enter(&connp->conn_lock);
1516         if (connp->conn_multicast_ill == ill) {
1517                 /* Revert to late binding */
1518                 connp->conn_multicast_ill = NULL;
1519                 connp->conn_orig_multicast_ifindex = 0;
1520         }
1521         if (connp->conn_incoming_ill == ill)
1522                 connp->conn_incoming_ill = NULL;
1523         if (connp->conn_outgoing_ill == ill)
1524                 connp->conn_outgoing_ill = NULL;
1525         if (connp->conn_outgoing_pill == ill)
1526                 connp->conn_outgoing_pill = NULL;
1527         if (connp->conn_nofailover_ill == ill)
1528                 connp->conn_nofailover_ill = NULL;
1529         if (connp->conn_xmit_if_ill == ill)
1530                 connp->conn_xmit_if_ill = NULL;
1531         if (connp->conn_ire_cache != NULL) {
1532                 ire = connp->conn_ire_cache;
1533                 /*
1534                  * ip_newroute creates IRE_CACHE with ire_stq coming from
1535                  * interface X and ipif coming from interface Y, if interface
1536                  * X and Y are part of the same IPMPgroup. Thus whenever
1537                  * interface X goes down, remove all references to it by
1538                  * checking both on ire_ipif and ire_stq.
1539                  */
1540                 if ((ire->ire_ipif != NULL && ire->ire_ipif->ipif_ill == ill) ||
1541                     (ire->ire_type == IRE_CACHE &&
1542                     ire->ire_stq == ill->ill_wq)) {
1543                         connp->conn_ire_cache = NULL;
1544                         mutex_exit(&connp->conn_lock);
1545                         ire_refrele_notr(ire);
1546                         return;
1547                 }
1548         }
1549         mutex_exit(&connp->conn_lock);
1550
1551 }
1552
1553 /* ARGSUSED */
1554 void
1555 ipif_all_down_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
1556 {
1557         ill_t   *ill = q->q_ptr;
1558         ipif_t  *ipif;
1559
1560         ASSERT(IAM_WRITER_IPSQ(ipsq));
1561         for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
1562                 ipif_non_duplicate(ipif);
1563                 ipif_down_tail(ipif);
1564         }
1565         freemsg(mp);
1566         ipsq_current_finish(ipsq);
1567 }
1568
1569 /*
1570  * ill_down_start is called when we want to down this ill and bring it up again
1571  * It is called when we receive an M_ERROR / M_HANGUP. In this case we shut down
1572  * all interfaces, but don't tear down any plumbing.
1573  */
1574 boolean_t
1575 ill_down_start(queue_t *q, mblk_t *mp)
1576 {
1577         ill_t   *ill = q->q_ptr;
1578         ipif_t  *ipif;
1579
1580         ASSERT(IAM_WRITER_ILL(ill));
1581
1582         for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
1583                 (void) ipif_down(ipif, NULL, NULL);
1584
1585         ill_down(ill);
1586
1587         (void) ipsq_pending_mp_cleanup(ill, NULL);
1588
1589         ipsq_current_start(ill->ill_phyint->phyint_ipsq, ill->ill_ipif, 0);
1590
1591         /*
1592          * Atomically test and add the pending mp if references are active.
1593          */
1594         mutex_enter(&ill->ill_lock);
1595         if (!ill_is_quiescent(ill)) {
1596                 /* call cannot fail since `conn_t *' argument is NULL */
1597                 (void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq,
1598                     mp, ILL_DOWN);
1599                 mutex_exit(&ill->ill_lock);
1600                 return (B_FALSE);
1601         }
1602         mutex_exit(&ill->ill_lock);
1603         return (B_TRUE);
1604 }
1605
1606 static void
1607 ill_down(ill_t *ill)
1608 {
1609         ip_stack_t      *ipst = ill->ill_ipst;
1610
1611         /* Blow off any IREs dependent on this ILL. */
1612         ire_walk(ill_downi, (char *)ill, ipst);
1613
1614         /* Remove any conn_*_ill depending on this ill */
1615         ipcl_walk(conn_cleanup_ill, (caddr_t)ill, ipst);
1616
1617         if (ill->ill_group != NULL) {
1618                 illgrp_delete(ill);
1619         }
1620 }
1621
1622 /*
1623  * ire_walk routine used to delete every IRE that depends on queues
1624  * associated with 'ill'.  (Always called as writer.)
1625  */
1626 static void
1627 ill_downi(ire_t *ire, char *ill_arg)
1628 {
1629         ill_t   *ill = (ill_t *)ill_arg;
1630
1631         /*
1632          * ip_newroute creates IRE_CACHE with ire_stq coming from
1633          * interface X and ipif coming from interface Y, if interface
1634          * X and Y are part of the same IPMP group. Thus whenever interface
1635          * X goes down, remove all references to it by checking both
1636          * on ire_ipif and ire_stq.
1637          */
1638         if ((ire->ire_ipif != NULL && ire->ire_ipif->ipif_ill == ill) ||
1639             (ire->ire_type == IRE_CACHE && ire->ire_stq == ill->ill_wq)) {
1640                 ire_delete(ire);
1641         }
1642 }
1643
1644 /*
1645  * Remove ire/nce from the fastpath list.
1646  */
1647 void
1648 ill_fastpath_nack(ill_t *ill)
1649 {
1650         nce_fastpath_list_dispatch(ill, NULL, NULL);
1651 }
1652
1653 /* Consume an M_IOCACK of the fastpath probe. */
1654 void
1655 ill_fastpath_ack(ill_t *ill, mblk_t *mp)
1656 {
1657         mblk_t  *mp1 = mp;
1658
1659         /*
1660          * If this was the first attempt turn on the fastpath probing.
1661          */
1662         mutex_enter(&ill->ill_lock);
1663         if (ill->ill_dlpi_fastpath_state == IDS_INPROGRESS)
1664                 ill->ill_dlpi_fastpath_state = IDS_OK;
1665         mutex_exit(&ill->ill_lock);
1666
1667         /* Free the M_IOCACK mblk, hold on to the data */
1668         mp = mp->b_cont;
1669         freeb(mp1);
1670         if (mp == NULL)
1671                 return;
1672         if (mp->b_cont != NULL) {
1673                 /*
1674                  * Update all IRE's or NCE's that are waiting for
1675                  * fastpath update.
1676                  */
1677                 nce_fastpath_list_dispatch(ill, ndp_fastpath_update, mp);
1678                 mp1 = mp->b_cont;
1679                 freeb(mp);
1680                 mp = mp1;
1681         } else {
1682                 ip0dbg(("ill_fastpath_ack:  no b_cont\n"));
1683         }
1684
1685         freeb(mp);
1686 }
1687
1688 /*
1689  * Throw an M_IOCTL message downstream asking "do you know fastpath?"
1690  * The data portion of the request is a dl_unitdata_req_t template for
1691  * what we would send downstream in the absence of a fastpath confirmation.
1692  */
1693 int
1694 ill_fastpath_probe(ill_t *ill, mblk_t *dlur_mp)
1695 {
1696         struct iocblk   *ioc;
1697         mblk_t  *mp;
1698
1699         if (dlur_mp == NULL)
1700                 return (EINVAL);
1701
1702         mutex_enter(&ill->ill_lock);
1703         switch (ill->ill_dlpi_fastpath_state) {
1704         case IDS_FAILED:
1705                 /*
1706                  * Driver NAKed the first fastpath ioctl - assume it doesn't
1707                  * support it.
1708                  */
1709                 mutex_exit(&ill->ill_lock);
1710                 return (ENOTSUP);
1711         case IDS_UNKNOWN:
1712                 /* This is the first probe */
1713                 ill->ill_dlpi_fastpath_state = IDS_INPROGRESS;
1714                 break;
1715         default:
1716                 break;
1717         }
1718         mutex_exit(&ill->ill_lock);
1719
1720         if ((mp = mkiocb(DL_IOC_HDR_INFO)) == NULL)
1721                 return (EAGAIN);
1722
1723         mp->b_cont = copyb(dlur_mp);
1724         if (mp->b_cont == NULL) {
1725                 freeb(mp);
1726                 return (EAGAIN);
1727         }
1728
1729         ioc = (struct iocblk *)mp->b_rptr;
1730         ioc->ioc_count = msgdsize(mp->b_cont);
1731
1732         putnext(ill->ill_wq, mp);
1733         return (0);
1734 }
1735
1736 void
1737 ill_capability_probe(ill_t *ill)
1738 {
1739         /*
1740          * Do so only if capabilities are still unknown.
1741          */
1742         if (ill->ill_dlpi_capab_state != IDS_UNKNOWN)
1743                 return;
1744
1745         ill->ill_dlpi_capab_state = IDS_INPROGRESS;
1746         ip1dbg(("ill_capability_probe: starting capability negotiation\n"));
1747         ill_capability_proto(ill, DL_CAPABILITY_REQ, NULL);
1748 }
1749
1750 void
1751 ill_capability_reset(ill_t *ill)
1752 {
1753         mblk_t *sc_mp = NULL;
1754         mblk_t *tmp;
1755
1756         /*
1757          * Note here that we reset the state to UNKNOWN, and later send
1758          * down the DL_CAPABILITY_REQ without first setting the state to
1759          * INPROGRESS.  We do this in order to distinguish the
1760          * DL_CAPABILITY_ACK response which may come back in response to
1761          * a "reset" apart from the "probe" DL_CAPABILITY_REQ.  This would
1762          * also handle the case where the driver doesn't send us back
1763          * a DL_CAPABILITY_ACK in response, since the "probe" routine
1764          * requires the state to be in UNKNOWN anyway.  In any case, all
1765          * features are turned off until the state reaches IDS_OK.
1766          */
1767         ill->ill_dlpi_capab_state = IDS_UNKNOWN;
1768         ill->ill_capab_reneg = B_FALSE;
1769
1770         /*
1771          * Disable sub-capabilities and request a list of sub-capability
1772          * messages which will be sent down to the driver.  Each handler
1773          * allocates the corresponding dl_capability_sub_t inside an
1774          * mblk, and links it to the existing sc_mp mblk, or return it
1775          * as sc_mp if it's the first sub-capability (the passed in
1776          * sc_mp is NULL).  Upon returning from all capability handlers,
1777          * sc_mp will be pulled-up, before passing it downstream.
1778          */
1779         ill_capability_mdt_reset(ill, &sc_mp);
1780         ill_capability_hcksum_reset(ill, &sc_mp);
1781         ill_capability_zerocopy_reset(ill, &sc_mp);
1782         ill_capability_ipsec_reset(ill, &sc_mp);
1783         ill_capability_dls_reset(ill, &sc_mp);
1784         ill_capability_lso_reset(ill, &sc_mp);
1785
1786         /* Nothing to send down in order to disable the capabilities? */
1787         if (sc_mp == NULL)
1788                 return;
1789
1790         tmp = msgpullup(sc_mp, -1);
1791         freemsg(sc_mp);
1792         if ((sc_mp = tmp) == NULL) {
1793                 cmn_err(CE_WARN, "ill_capability_reset: unable to send down "
1794                     "DL_CAPABILITY_REQ (ENOMEM)\n");
1795                 return;
1796         }
1797
1798         ip1dbg(("ill_capability_reset: resetting negotiated capabilities\n"));
1799         ill_capability_proto(ill, DL_CAPABILITY_REQ, sc_mp);
1800 }
1801
1802 /*
1803  * Request or set new-style hardware capabilities supported by DLS provider.
1804  */
1805 static void
1806 ill_capability_proto(ill_t *ill, int type, mblk_t *reqp)
1807 {
1808         mblk_t *mp;
1809         dl_capability_req_t *capb;
1810         size_t size = 0;
1811         uint8_t *ptr;
1812
1813         if (reqp != NULL)
1814                 size = MBLKL(reqp);
1815
1816         mp = ip_dlpi_alloc(sizeof (dl_capability_req_t) + size, type);
1817         if (mp == NULL) {
1818                 freemsg(reqp);
1819                 return;
1820         }
1821         ptr = mp->b_rptr;
1822
1823         capb = (dl_capability_req_t *)ptr;
1824         ptr += sizeof (dl_capability_req_t);
1825
1826         if (reqp != NULL) {
1827                 capb->dl_sub_offset = sizeof (dl_capability_req_t);
1828                 capb->dl_sub_length = size;
1829                 bcopy(reqp->b_rptr, ptr, size);
1830                 ptr += size;
1831                 mp->b_cont = reqp->b_cont;
1832                 freeb(reqp);
1833         }
1834         ASSERT(ptr == mp->b_wptr);
1835
1836         ill_dlpi_send(ill, mp);
1837 }
1838
1839 static void
1840 ill_capability_id_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *outers)
1841 {
1842         dl_capab_id_t *id_ic;
1843         uint_t sub_dl_cap = outers->dl_cap;
1844         dl_capability_sub_t *inners;
1845         uint8_t *capend;
1846
1847         ASSERT(sub_dl_cap == DL_CAPAB_ID_WRAPPER);
1848
1849         /*
1850          * Note: range checks here are not absolutely sufficient to
1851          * make us robust against malformed messages sent by drivers;
1852          * this is in keeping with the rest of IP's dlpi handling.
1853          * (Remember, it's coming from something else in the kernel
1854          * address space)
1855          */
1856
1857         capend = (uint8_t *)(outers + 1) + outers->dl_length;
1858         if (capend > mp->b_wptr) {
1859                 cmn_err(CE_WARN, "ill_capability_id_ack: "
1860                     "malformed sub-capability too long for mblk");
1861                 return;
1862         }
1863
1864         id_ic = (dl_capab_id_t *)(outers + 1);
1865
1866         if (outers->dl_length < sizeof (*id_ic) ||
1867             (inners = &id_ic->id_subcap,
1868             inners->dl_length > (outers->dl_length - sizeof (*inners)))) {
1869                 cmn_err(CE_WARN, "ill_capability_id_ack: malformed "
1870                     "encapsulated capab type %d too long for mblk",
1871                     inners->dl_cap);
1872                 return;
1873         }
1874
1875         if (!dlcapabcheckqid(&id_ic->id_mid, ill->ill_lmod_rq)) {
1876                 ip1dbg(("ill_capability_id_ack: mid token for capab type %d "
1877                     "isn't as expected; pass-thru module(s) detected, "
1878                     "discarding capability\n", inners->dl_cap));
1879                 return;
1880         }
1881
1882         /* Process the encapsulated sub-capability */
1883         ill_capability_dispatch(ill, mp, inners, B_TRUE);
1884 }
1885
1886 /*
1887  * Process Multidata Transmit capability negotiation ack received from a
1888  * DLS Provider.  isub must point to the sub-capability (DL_CAPAB_MDT) of a
1889  * DL_CAPABILITY_ACK message.
1890  */
1891 static void
1892 ill_capability_mdt_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
1893 {
1894         mblk_t *nmp = NULL;
1895         dl_capability_req_t *oc;
1896         dl_capab_mdt_t *mdt_ic, *mdt_oc;
1897         ill_mdt_capab_t **ill_mdt_capab;
1898         uint_t sub_dl_cap = isub->dl_cap;
1899         uint8_t *capend;
1900
1901         ASSERT(sub_dl_cap == DL_CAPAB_MDT);
1902
1903         ill_mdt_capab = (ill_mdt_capab_t **)&ill->ill_mdt_capab;
1904
1905         /*
1906          * Note: range checks here are not absolutely sufficient to
1907          * make us robust against malformed messages sent by drivers;
1908          * this is in keeping with the rest of IP's dlpi handling.
1909          * (Remember, it's coming from something else in the kernel
1910          * address space)
1911          */
1912
1913         capend = (uint8_t *)(isub + 1) + isub->dl_length;
1914         if (capend > mp->b_wptr) {
1915                 cmn_err(CE_WARN, "ill_capability_mdt_ack: "
1916                     "malformed sub-capability too long for mblk");
1917                 return;
1918         }
1919
1920         mdt_ic = (dl_capab_mdt_t *)(isub + 1);
1921
1922         if (mdt_ic->mdt_version != MDT_VERSION_2) {
1923                 cmn_err(CE_CONT, "ill_capability_mdt_ack: "
1924                     "unsupported MDT sub-capability (version %d, expected %d)",
1925                     mdt_ic->mdt_version, MDT_VERSION_2);
1926                 return;
1927         }
1928
1929         if (!dlcapabcheckqid(&mdt_ic->mdt_mid, ill->ill_lmod_rq)) {
1930                 ip1dbg(("ill_capability_mdt_ack: mid token for MDT "
1931                     "capability isn't as expected; pass-thru module(s) "
1932                     "detected, discarding capability\n"));
1933                 return;
1934         }
1935
1936         if (mdt_ic->mdt_flags & DL_CAPAB_MDT_ENABLE) {
1937
1938                 if (*ill_mdt_capab == NULL) {
1939                         *ill_mdt_capab = kmem_zalloc(sizeof (ill_mdt_capab_t),
1940                             KM_NOSLEEP);
1941
1942                         if (*ill_mdt_capab == NULL) {
1943                                 cmn_err(CE_WARN, "ill_capability_mdt_ack: "
1944                                     "could not enable MDT version %d "
1945                                     "for %s (ENOMEM)\n", MDT_VERSION_2,
1946                                     ill->ill_name);
1947                                 return;
1948                         }
1949                 }
1950
1951                 ip1dbg(("ill_capability_mdt_ack: interface %s supports "
1952                     "MDT version %d (%d bytes leading, %d bytes trailing "
1953                     "header spaces, %d max pld bufs, %d span limit)\n",
1954                     ill->ill_name, MDT_VERSION_2,
1955                     mdt_ic->mdt_hdr_head, mdt_ic->mdt_hdr_tail,
1956                     mdt_ic->mdt_max_pld, mdt_ic->mdt_span_limit));
1957
1958                 (*ill_mdt_capab)->ill_mdt_version = MDT_VERSION_2;
1959                 (*ill_mdt_capab)->ill_mdt_on = 1;
1960                 /*
1961                  * Round the following values to the nearest 32-bit; ULP
1962                  * may further adjust them to accomodate for additional
1963                  * protocol headers.  We pass these values to ULP during
1964                  * bind time.
1965                  */
1966                 (*ill_mdt_capab)->ill_mdt_hdr_head =
1967                     roundup(mdt_ic->mdt_hdr_head, 4);
1968                 (*ill_mdt_capab)->ill_mdt_hdr_tail =
1969                     roundup(mdt_ic->mdt_hdr_tail, 4);
1970                 (*ill_mdt_capab)->ill_mdt_max_pld = mdt_ic->mdt_max_pld;
1971                 (*ill_mdt_capab)->ill_mdt_span_limit = mdt_ic->mdt_span_limit;
1972
1973                 ill->ill_capabilities |= ILL_CAPAB_MDT;
1974         } else {
1975                 uint_t size;
1976                 uchar_t *rptr;
1977
1978                 size = sizeof (dl_capability_req_t) +
1979                     sizeof (dl_capability_sub_t) + sizeof (dl_capab_mdt_t);
1980
1981                 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) {
1982                         cmn_err(CE_WARN, "ill_capability_mdt_ack: "
1983                             "could not enable MDT for %s (ENOMEM)\n",
1984                             ill->ill_name);
1985                         return;
1986                 }
1987
1988                 rptr = nmp->b_rptr;
1989                 /* initialize dl_capability_req_t */
1990                 oc = (dl_capability_req_t *)nmp->b_rptr;
1991                 oc->dl_sub_offset = sizeof (dl_capability_req_t);
1992                 oc->dl_sub_length = sizeof (dl_capability_sub_t) +
1993                     sizeof (dl_capab_mdt_t);
1994                 nmp->b_rptr += sizeof (dl_capability_req_t);
1995
1996                 /* initialize dl_capability_sub_t */
1997                 bcopy(isub, nmp->b_rptr, sizeof (*isub));
1998                 nmp->b_rptr += sizeof (*isub);
1999
2000                 /* initialize dl_capab_mdt_t */
2001                 mdt_oc = (dl_capab_mdt_t *)nmp->b_rptr;
2002                 bcopy(mdt_ic, mdt_oc, sizeof (*mdt_ic));
2003
2004                 nmp->b_rptr = rptr;
2005
2006                 ip1dbg(("ill_capability_mdt_ack: asking interface %s "
2007                     "to enable MDT version %d\n", ill->ill_name,
2008                     MDT_VERSION_2));
2009
2010                 /* set ENABLE flag */
2011                 mdt_oc->mdt_flags |= DL_CAPAB_MDT_ENABLE;
2012
2013                 /* nmp points to a DL_CAPABILITY_REQ message to enable MDT */
2014                 ill_dlpi_send(ill, nmp);
2015         }
2016 }
2017
2018 static void
2019 ill_capability_mdt_reset(ill_t *ill, mblk_t **sc_mp)
2020 {
2021         mblk_t *mp;
2022         dl_capab_mdt_t *mdt_subcap;
2023         dl_capability_sub_t *dl_subcap;
2024         int size;
2025
2026         if (!ILL_MDT_CAPABLE(ill))
2027                 return;
2028
2029         ASSERT(ill->ill_mdt_capab != NULL);
2030         /*
2031          * Clear the capability flag for MDT but retain the ill_mdt_capab
2032          * structure since it's possible that another thread is still
2033          * referring to it.  The structure only gets deallocated when
2034          * we destroy the ill.
2035          */
2036         ill->ill_capabilities &= ~ILL_CAPAB_MDT;
2037
2038         size = sizeof (*dl_subcap) + sizeof (*mdt_subcap);
2039
2040         mp = allocb(size, BPRI_HI);
2041         if (mp == NULL) {
2042                 ip1dbg(("ill_capability_mdt_reset: unable to allocate "
2043                     "request to disable MDT\n"));
2044                 return;
2045         }
2046
2047         mp->b_wptr = mp->b_rptr + size;
2048
2049         dl_subcap = (dl_capability_sub_t *)mp->b_rptr;
2050         dl_subcap->dl_cap = DL_CAPAB_MDT;
2051         dl_subcap->dl_length = sizeof (*mdt_subcap);
2052
2053         mdt_subcap = (dl_capab_mdt_t *)(dl_subcap + 1);
2054         mdt_subcap->mdt_version = ill->ill_mdt_capab->ill_mdt_version;
2055         mdt_subcap->mdt_flags = 0;
2056         mdt_subcap->mdt_hdr_head = 0;
2057         mdt_subcap->mdt_hdr_tail = 0;
2058
2059         if (*sc_mp != NULL)
2060                 linkb(*sc_mp, mp);
2061         else
2062                 *sc_mp = mp;
2063 }
2064
2065 /*
2066  * Send a DL_NOTIFY_REQ to the specified ill to enable
2067  * DL_NOTE_PROMISC_ON/OFF_PHYS notifications.
2068  * Invoked by ill_capability_ipsec_ack() before enabling IPsec hardware
2069  * acceleration.
2070  * Returns B_TRUE on success, B_FALSE if the message could not be sent.
2071  */
2072 static boolean_t
2073 ill_enable_promisc_notify(ill_t *ill)
2074 {
2075         mblk_t *mp;
2076         dl_notify_req_t *req;
2077
2078         IPSECHW_DEBUG(IPSECHW_PKT, ("ill_enable_promisc_notify:\n"));
2079
2080         mp = ip_dlpi_alloc(sizeof (dl_notify_req_t), DL_NOTIFY_REQ);
2081         if (mp == NULL)
2082                 return (B_FALSE);
2083
2084         req = (dl_notify_req_t *)mp->b_rptr;
2085         req->dl_notifications = DL_NOTE_PROMISC_ON_PHYS |
2086             DL_NOTE_PROMISC_OFF_PHYS;
2087
2088         ill_dlpi_send(ill, mp);
2089
2090         return (B_TRUE);
2091 }
2092
2093
2094 /*
2095  * Allocate an IPsec capability request which will be filled by our
2096  * caller to turn on support for one or more algorithms.
2097  */
2098 static mblk_t *
2099 ill_alloc_ipsec_cap_req(ill_t *ill, dl_capability_sub_t *isub)
2100 {
2101         mblk_t *nmp;
2102         dl_capability_req_t     *ocap;
2103         dl_capab_ipsec_t        *ocip;
2104         dl_capab_ipsec_t        *icip;
2105         uint8_t                 *ptr;
2106         icip = (dl_capab_ipsec_t *)(isub + 1);
2107
2108         /*
2109          * The first time around, we send a DL_NOTIFY_REQ to enable
2110          * PROMISC_ON/OFF notification from the provider. We need to
2111          * do this before enabling the algorithms to avoid leakage of
2112          * cleartext packets.
2113          */
2114
2115         if (!ill_enable_promisc_notify(ill))
2116                 return (NULL);
2117
2118         /*
2119          * Allocate new mblk which will contain a new capability
2120          * request to enable the capabilities.
2121          */
2122
2123         nmp = ip_dlpi_alloc(sizeof (dl_capability_req_t) +
2124             sizeof (dl_capability_sub_t) + isub->dl_length, DL_CAPABILITY_REQ);
2125         if (nmp == NULL)
2126                 return (NULL);
2127
2128         ptr = nmp->b_rptr;
2129
2130         /* initialize dl_capability_req_t */
2131         ocap = (dl_capability_req_t *)ptr;
2132         ocap->dl_sub_offset = sizeof (dl_capability_req_t);
2133         ocap->dl_sub_length = sizeof (dl_capability_sub_t) + isub->dl_length;
2134         ptr += sizeof (dl_capability_req_t);
2135
2136         /* initialize dl_capability_sub_t */
2137         bcopy(isub, ptr, sizeof (*isub));
2138         ptr += sizeof (*isub);
2139
2140         /* initialize dl_capab_ipsec_t */
2141         ocip = (dl_capab_ipsec_t *)ptr;
2142         bcopy(icip, ocip, sizeof (*icip));
2143
2144         nmp->b_wptr = (uchar_t *)(&ocip->cip_data[0]);
2145         return (nmp);
2146 }
2147
2148 /*
2149  * Process an IPsec capability negotiation ack received from a DLS Provider.
2150  * isub must point to the sub-capability (DL_CAPAB_IPSEC_AH or
2151  * DL_CAPAB_IPSEC_ESP) of a DL_CAPABILITY_ACK message.
2152  */
2153 static void
2154 ill_capability_ipsec_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
2155 {
2156         dl_capab_ipsec_t        *icip;
2157         dl_capab_ipsec_alg_t    *ialg;  /* ptr to input alg spec. */
2158         dl_capab_ipsec_alg_t    *oalg;  /* ptr to output alg spec. */
2159         uint_t cipher, nciphers;
2160         mblk_t *nmp;
2161         uint_t alg_len;
2162         boolean_t need_sadb_dump;
2163         uint_t sub_dl_cap = isub->dl_cap;
2164         ill_ipsec_capab_t **ill_capab;
2165         uint64_t ill_capab_flag;
2166         uint8_t *capend, *ciphend;
2167         boolean_t sadb_resync;
2168
2169         ASSERT(sub_dl_cap == DL_CAPAB_IPSEC_AH ||
2170             sub_dl_cap == DL_CAPAB_IPSEC_ESP);
2171
2172         if (sub_dl_cap == DL_CAPAB_IPSEC_AH) {
2173                 ill_capab = (ill_ipsec_capab_t **)&ill->ill_ipsec_capab_ah;
2174                 ill_capab_flag = ILL_CAPAB_AH;
2175         } else {
2176                 ill_capab = (ill_ipsec_capab_t **)&ill->ill_ipsec_capab_esp;
2177                 ill_capab_flag = ILL_CAPAB_ESP;
2178         }
2179
2180         /*
2181          * If the ill capability structure exists, then this incoming
2182          * DL_CAPABILITY_ACK is a response to a "renegotiation" cycle.
2183          * If this is so, then we'd need to resynchronize the SADB
2184          * after re-enabling the offloaded ciphers.
2185          */
2186         sadb_resync = (*ill_capab != NULL);
2187
2188         /*
2189          * Note: range checks here are not absolutely sufficient to
2190          * make us robust against malformed messages sent by drivers;
2191          * this is in keeping with the rest of IP's dlpi handling.
2192          * (Remember, it's coming from something else in the kernel
2193          * address space)
2194          */
2195
2196         capend = (uint8_t *)(isub + 1) + isub->dl_length;
2197         if (capend > mp->b_wptr) {
2198                 cmn_err(CE_WARN, "ill_capability_ipsec_ack: "
2199                     "malformed sub-capability too long for mblk");
2200                 return;
2201         }
2202
2203         /*
2204          * There are two types of acks we process here:
2205          * 1. acks in reply to a (first form) generic capability req
2206          *    (no ENABLE flag set)
2207          * 2. acks in reply to a ENABLE capability req.
2208          *    (ENABLE flag set)
2209          *
2210          * We process the subcapability passed as argument as follows:
2211          * 1 do initializations
2212          *   1.1 initialize nmp = NULL
2213          *   1.2 set need_sadb_dump to B_FALSE
2214          * 2 for each cipher in subcapability:
2215          *   2.1 if ENABLE flag is set:
2216          *      2.1.1 update per-ill ipsec capabilities info
2217          *      2.1.2 set need_sadb_dump to B_TRUE
2218          *   2.2 if ENABLE flag is not set:
2219          *      2.2.1 if nmp is NULL:
2220          *              2.2.1.1 allocate and initialize nmp
2221          *              2.2.1.2 init current pos in nmp
2222          *      2.2.2 copy current cipher to current pos in nmp
2223          *      2.2.3 set ENABLE flag in nmp
2224          *      2.2.4 update current pos
2225          * 3 if nmp is not equal to NULL, send enable request
2226          *   3.1 send capability request
2227          * 4 if need_sadb_dump is B_TRUE
2228          *   4.1 enable promiscuous on/off notifications
2229          *   4.2 call ill_dlpi_send(isub->dlcap) to send all
2230          *      AH or ESP SA's to interface.
2231          */
2232
2233         nmp = NULL;
2234         oalg = NULL;
2235         need_sadb_dump = B_FALSE;
2236         icip = (dl_capab_ipsec_t *)(isub + 1);
2237         ialg = (dl_capab_ipsec_alg_t *)(&icip->cip_data[0]);
2238
2239         nciphers = icip->cip_nciphers;
2240         ciphend = (uint8_t *)(ialg + icip->cip_nciphers);
2241
2242         if (ciphend > capend) {
2243                 cmn_err(CE_WARN, "ill_capability_ipsec_ack: "
2244                     "too many ciphers for sub-capability len");
2245                 return;
2246         }
2247
2248         for (cipher = 0; cipher < nciphers; cipher++) {
2249                 alg_len = sizeof (dl_capab_ipsec_alg_t);
2250
2251                 if (ialg->alg_flag & DL_CAPAB_ALG_ENABLE) {
2252                         /*
2253                          * TBD: when we provide a way to disable capabilities
2254                          * from above, need to manage the request-pending state
2255                          * and fail if we were not expecting this ACK.
2256                          */
2257                         IPSECHW_DEBUG(IPSECHW_CAPAB,
2258                             ("ill_capability_ipsec_ack: got ENABLE ACK\n"));
2259
2260                         /*
2261                          * Update IPsec capabilities for this ill
2262                          */
2263
2264                         if (*ill_capab == NULL) {
2265                                 IPSECHW_DEBUG(IPSECHW_CAPAB,
2266                                     ("ill_capability_ipsec_ack: "
2267                                     "allocating ipsec_capab for ill\n"));
2268                                 *ill_capab = ill_ipsec_capab_alloc();
2269
2270                                 if (*ill_capab == NULL) {
2271                                         cmn_err(CE_WARN,
2272                                             "ill_capability_ipsec_ack: "
2273                                             "could not enable IPsec Hardware "
2274                                             "acceleration for %s (ENOMEM)\n",
2275                                             ill->ill_name);
2276                                         return;
2277                                 }
2278                         }
2279
2280                         ASSERT(ialg->alg_type == DL_CAPAB_IPSEC_ALG_AUTH ||
2281                             ialg->alg_type == DL_CAPAB_IPSEC_ALG_ENCR);
2282
2283                         if (ialg->alg_prim >= MAX_IPSEC_ALGS) {
2284                                 cmn_err(CE_WARN,
2285                                     "ill_capability_ipsec_ack: "
2286                                     "malformed IPsec algorithm id %d",
2287                                     ialg->alg_prim);
2288                                 continue;
2289                         }
2290
2291                         if (ialg->alg_type == DL_CAPAB_IPSEC_ALG_AUTH) {
2292                                 IPSEC_ALG_ENABLE((*ill_capab)->auth_hw_algs,
2293                                     ialg->alg_prim);
2294                         } else {
2295                                 ipsec_capab_algparm_t *alp;
2296
2297                                 IPSEC_ALG_ENABLE((*ill_capab)->encr_hw_algs,
2298                                     ialg->alg_prim);
2299                                 if (!ill_ipsec_capab_resize_algparm(*ill_capab,
2300                                     ialg->alg_prim)) {
2301                                         cmn_err(CE_WARN,
2302                                             "ill_capability_ipsec_ack: "
2303                                             "no space for IPsec alg id %d",
2304                                             ialg->alg_prim);
2305                                         continue;
2306                                 }
2307                                 alp = &((*ill_capab)->encr_algparm[
2308                                     ialg->alg_prim]);
2309                                 alp->minkeylen = ialg->alg_minbits;
2310                                 alp->maxkeylen = ialg->alg_maxbits;
2311                         }
2312                         ill->ill_capabilities |= ill_capab_flag;
2313                         /*
2314                          * indicate that a capability was enabled, which
2315                          * will be used below to kick off a SADB dump
2316                          * to the ill.
2317                          */
2318                         need_sadb_dump = B_TRUE;
2319                 } else {
2320                         IPSECHW_DEBUG(IPSECHW_CAPAB,
2321                             ("ill_capability_ipsec_ack: enabling alg 0x%x\n",
2322                             ialg->alg_prim));
2323
2324                         if (nmp == NULL) {
2325                                 nmp = ill_alloc_ipsec_cap_req(ill, isub);
2326                                 if (nmp == NULL) {
2327                                         /*
2328                                          * Sending the PROMISC_ON/OFF
2329                                          * notification request failed.
2330                                          * We cannot enable the algorithms
2331                                          * since the Provider will not
2332                                          * notify IP of promiscous mode
2333                                          * changes, which could lead
2334                                          * to leakage of packets.
2335                                          */
2336                                         cmn_err(CE_WARN,
2337                                             "ill_capability_ipsec_ack: "
2338                                             "could not enable IPsec Hardware "
2339                                             "acceleration for %s (ENOMEM)\n",
2340                                             ill->ill_name);
2341                                         return;
2342                                 }
2343                                 /* ptr to current output alg specifier */
2344                                 oalg = (dl_capab_ipsec_alg_t *)nmp->b_wptr;
2345                         }
2346
2347                         /*
2348                          * Copy current alg specifier, set ENABLE
2349                          * flag, and advance to next output alg.
2350                          * For now we enable all IPsec capabilities.
2351                          */
2352                         ASSERT(oalg != NULL);
2353                         bcopy(ialg, oalg, alg_len);
2354                         oalg->alg_flag |= DL_CAPAB_ALG_ENABLE;
2355                         nmp->b_wptr += alg_len;
2356                         oalg = (dl_capab_ipsec_alg_t *)nmp->b_wptr;
2357                 }
2358
2359                 /* move to next input algorithm specifier */
2360                 ialg = (dl_capab_ipsec_alg_t *)
2361                     ((char *)ialg + alg_len);
2362         }
2363
2364         if (nmp != NULL)
2365                 /*
2366                  * nmp points to a DL_CAPABILITY_REQ message to enable
2367                  * IPsec hardware acceleration.
2368                  */
2369                 ill_dlpi_send(ill, nmp);
2370
2371         if (need_sadb_dump)
2372                 /*
2373                  * An acknowledgement corresponding to a request to
2374                  * enable acceleration was received, notify SADB.
2375                  */
2376                 ill_ipsec_capab_add(ill, sub_dl_cap, sadb_resync);
2377 }
2378
2379 /*
2380  * Given an mblk with enough space in it, create sub-capability entries for
2381  * DL_CAPAB_IPSEC_{AH,ESP} types which consist of previously-advertised
2382  * offloaded ciphers (both AUTH and ENCR) with their enable flags cleared,
2383  * in preparation for the reset the DL_CAPABILITY_REQ message.
2384  */
2385 static void
2386 ill_fill_ipsec_reset(uint_t nciphers, int stype, uint_t slen,
2387     ill_ipsec_capab_t *ill_cap, mblk_t *mp)
2388 {
2389         dl_capab_ipsec_t *oipsec;
2390         dl_capab_ipsec_alg_t *oalg;
2391         dl_capability_sub_t *dl_subcap;
2392         int i, k;
2393
2394         ASSERT(nciphers > 0);
2395         ASSERT(ill_cap != NULL);
2396         ASSERT(mp != NULL);
2397         ASSERT(MBLKTAIL(mp) >= sizeof (*dl_subcap) + sizeof (*oipsec) + slen);
2398
2399         /* dl_capability_sub_t for "stype" */
2400         dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
2401         dl_subcap->dl_cap = stype;
2402         dl_subcap->dl_length = sizeof (dl_capab_ipsec_t) + slen;
2403         mp->b_wptr += sizeof (dl_capability_sub_t);
2404
2405         /* dl_capab_ipsec_t for "stype" */
2406         oipsec = (dl_capab_ipsec_t *)mp->b_wptr;
2407         oipsec->cip_version = 1;
2408         oipsec->cip_nciphers = nciphers;
2409         mp->b_wptr = (uchar_t *)&oipsec->cip_data[0];
2410
2411         /* create entries for "stype" AUTH ciphers */
2412         for (i = 0; i < ill_cap->algs_size; i++) {
2413                 for (k = 0; k < BITSPERBYTE; k++) {
2414                         if ((ill_cap->auth_hw_algs[i] & (1 << k)) == 0)
2415                                 continue;
2416
2417                         oalg = (dl_capab_ipsec_alg_t *)mp->b_wptr;
2418                         bzero((void *)oalg, sizeof (*oalg));
2419                         oalg->alg_type = DL_CAPAB_IPSEC_ALG_AUTH;
2420                         oalg->alg_prim = k + (BITSPERBYTE * i);
2421                         mp->b_wptr += sizeof (dl_capab_ipsec_alg_t);
2422                 }
2423         }
2424         /* create entries for "stype" ENCR ciphers */
2425         for (i = 0; i < ill_cap->algs_size; i++) {
2426                 for (k = 0; k < BITSPERBYTE; k++) {
2427                         if ((ill_cap->encr_hw_algs[i] & (1 << k)) == 0)
2428                                 continue;
2429
2430                         oalg = (dl_capab_ipsec_alg_t *)mp->b_wptr;
2431                         bzero((void *)oalg, sizeof (*oalg));
2432                         oalg->alg_type = DL_CAPAB_IPSEC_ALG_ENCR;
2433                         oalg->alg_prim = k + (BITSPERBYTE * i);
2434                         mp->b_wptr += sizeof (dl_capab_ipsec_alg_t);
2435                 }
2436         }
2437 }
2438
/*
 * Macro to count number of 1s in a byte (8-bit word).  The total count is
 * accumulated into the passed-in argument (sum).  We could use SPARCv9's
 * POPC instruction, but our macro is more flexible for an arbitrary length
 * of bytes, such as {auth,encr}_hw_algs.  These variables are currently
 * 256-bits long (MAX_IPSEC_ALGS), so if we know for sure that the length
 * stays that way, we can reduce the number of iterations required.
 *
 * Only the low 8 bits of val are examined.  The body is wrapped in
 * do { } while (0) so that "COUNT_1S(v, s);" behaves as a single statement
 * (e.g. as the unbraced body of an if/else), both arguments are
 * parenthesized, and the temporary has a name unlikely to collide with a
 * caller's variable appearing in val or sum.
 */
#define COUNT_1S(val, sum) do {                                         \
        uint8_t c1s_bits_ = (uint8_t)((val) & 0xff);                    \
        /* pairwise sums: 2-bit fields, then 4-bit fields */            \
        c1s_bits_ = (c1s_bits_ & 0x55) + ((c1s_bits_ >> 1) & 0x55);     \
        c1s_bits_ = (c1s_bits_ & 0x33) + ((c1s_bits_ >> 2) & 0x33);     \
        (sum) += (c1s_bits_ & 0xf) + ((c1s_bits_ >> 4) & 0xf);          \
} while (0)
2453
2454 /* ARGSUSED */
2455 static void
2456 ill_capability_ipsec_reset(ill_t *ill, mblk_t **sc_mp)
2457 {
2458         mblk_t *mp;
2459         ill_ipsec_capab_t *cap_ah = ill->ill_ipsec_capab_ah;
2460         ill_ipsec_capab_t *cap_esp = ill->ill_ipsec_capab_esp;
2461         uint64_t ill_capabilities = ill->ill_capabilities;
2462         int ah_cnt = 0, esp_cnt = 0;
2463         int ah_len = 0, esp_len = 0;
2464         int i, size = 0;
2465
2466         if (!(ill_capabilities & (ILL_CAPAB_AH | ILL_CAPAB_ESP)))
2467                 return;
2468
2469         ASSERT(cap_ah != NULL || !(ill_capabilities & ILL_CAPAB_AH));
2470         ASSERT(cap_esp != NULL || !(ill_capabilities & ILL_CAPAB_ESP));
2471
2472         /* Find out the number of ciphers for AH */
2473         if (cap_ah != NULL) {
2474                 for (i = 0; i < cap_ah->algs_size; i++) {
2475                         COUNT_1S(cap_ah->auth_hw_algs[i], ah_cnt);
2476                         COUNT_1S(cap_ah->encr_hw_algs[i], ah_cnt);
2477                 }
2478                 if (ah_cnt > 0) {
2479                         size += sizeof (dl_capability_sub_t) +
2480                             sizeof (dl_capab_ipsec_t);
2481                         /* dl_capab_ipsec_t contains one dl_capab_ipsec_alg_t */
2482                         ah_len = (ah_cnt - 1) * sizeof (dl_capab_ipsec_alg_t);
2483                         size += ah_len;
2484                 }
2485         }
2486
2487         /* Find out the number of ciphers for ESP */
2488         if (cap_esp != NULL) {
2489                 for (i = 0; i < cap_esp->algs_size; i++) {
2490                         COUNT_1S(cap_esp->auth_hw_algs[i], esp_cnt);
2491                         COUNT_1S(cap_esp->encr_hw_algs[i], esp_cnt);
2492                 }
2493                 if (esp_cnt > 0) {
2494                         size += sizeof (dl_capability_sub_t) +
2495                             sizeof (dl_capab_ipsec_t);
2496                         /* dl_capab_ipsec_t contains one dl_capab_ipsec_alg_t */
2497                         esp_len = (esp_cnt - 1) * sizeof (dl_capab_ipsec_alg_t);
2498                         size += esp_len;
2499                 }
2500         }
2501
2502         if (size == 0) {
2503                 ip1dbg(("ill_capability_ipsec_reset: capabilities exist but "
2504                     "there's nothing to reset\n"));
2505                 return;
2506         }
2507
2508         mp = allocb(size, BPRI_HI);
2509         if (mp == NULL) {
2510                 ip1dbg(("ill_capability_ipsec_reset: unable to allocate "
2511                     "request to disable IPSEC Hardware Acceleration\n"));
2512                 return;
2513         }
2514
2515         /*
2516          * Clear the capability flags for IPsec HA but retain the ill
2517          * capability structures since it's possible that another thread
2518          * is still referring to them.  The structures only get deallocated
2519          * when we destroy the ill.
2520          *
2521          * Various places check the flags to see if the ill is capable of
2522          * hardware acceleration, and by clearing them we ensure that new
2523          * outbound IPsec packets are sent down encrypted.
2524          */
2525         ill->ill_capabilities &= ~(ILL_CAPAB_AH | ILL_CAPAB_ESP);
2526
2527         /* Fill in DL_CAPAB_IPSEC_AH sub-capability entries */
2528         if (ah_cnt > 0) {
2529                 ill_fill_ipsec_reset(ah_cnt, DL_CAPAB_IPSEC_AH, ah_len,
2530                     cap_ah, mp);
2531                 ASSERT(mp->b_rptr + size >= mp->b_wptr);
2532         }
2533
2534         /* Fill in DL_CAPAB_IPSEC_ESP sub-capability entries */
2535         if (esp_cnt > 0) {
2536                 ill_fill_ipsec_reset(esp_cnt, DL_CAPAB_IPSEC_ESP, esp_len,
2537                     cap_esp, mp);
2538                 ASSERT(mp->b_rptr + size >= mp->b_wptr);
2539         }
2540
2541         /*
2542          * At this point we've composed a bunch of sub-capabilities to be
2543          * encapsulated in a DL_CAPABILITY_REQ and later sent downstream
2544          * by the caller.  Upon receiving this reset message, the driver
2545          * must stop inbound decryption (by destroying all inbound SAs)
2546          * and let the corresponding packets come in encrypted.
2547          */
2548
2549         if (*sc_mp != NULL)
2550                 linkb(*sc_mp, mp);
2551         else
2552                 *sc_mp = mp;
2553 }
2554
2555 static void
2556 ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp,
2557     boolean_t encapsulated)
2558 {
2559         boolean_t legacy = B_FALSE;
2560
2561         /*
2562          * If this DL_CAPABILITY_ACK came in as a response to our "reset"
2563          * DL_CAPABILITY_REQ, ignore it during this cycle.  We've just
2564          * instructed the driver to disable its advertised capabilities,
2565          * so there's no point in accepting any response at this moment.
2566          */
2567         if (ill->ill_dlpi_capab_state == IDS_UNKNOWN)
2568                 return;
2569
2570         /*
2571          * Note that only the following two sub-capabilities may be
2572          * considered as "legacy", since their original definitions
2573          * do not incorporate the dl_mid_t module ID token, and hence
2574          * may require the use of the wrapper sub-capability.
2575          */
2576         switch (subp->dl_cap) {
2577         case DL_CAPAB_IPSEC_AH:
2578         case DL_CAPAB_IPSEC_ESP:
2579                 legacy = B_TRUE;
2580                 break;
2581         }
2582
2583         /*
2584          * For legacy sub-capabilities which don't incorporate a queue_t
2585          * pointer in their structures, discard them if we detect that
2586          * there are intermediate modules in between IP and the driver.
2587          */
2588         if (!encapsulated && legacy && ill->ill_lmod_cnt > 1) {
2589                 ip1dbg(("ill_capability_dispatch: unencapsulated capab type "
2590                     "%d discarded; %d module(s) present below IP\n",
2591                     subp->dl_cap, ill->ill_lmod_cnt));
2592                 return;
2593         }
2594
2595         switch (subp->dl_cap) {
2596         case DL_CAPAB_IPSEC_AH:
2597         case DL_CAPAB_IPSEC_ESP:
2598                 ill_capability_ipsec_ack(ill, mp, subp);
2599                 break;
2600         case DL_CAPAB_MDT:
2601                 ill_capability_mdt_ack(ill, mp, subp);
2602                 break;
2603         case DL_CAPAB_HCKSUM:
2604                 ill_capability_hcksum_ack(ill, mp, subp);
2605                 break;
2606         case DL_CAPAB_ZEROCOPY:
2607                 ill_capability_zerocopy_ack(ill, mp, subp);
2608                 break;
2609         case DL_CAPAB_POLL:
2610                 if (!SOFT_RINGS_ENABLED())
2611                         ill_capability_dls_ack(ill, mp, subp);
2612                 break;
2613         case DL_CAPAB_SOFT_RING:
2614                 if (SOFT_RINGS_ENABLED())
2615                         ill_capability_dls_ack(ill, mp, subp);
2616                 break;
2617         case DL_CAPAB_LSO:
2618                 ill_capability_lso_ack(ill, mp, subp);
2619                 break;
2620         default:
2621                 ip1dbg(("ill_capability_dispatch: unknown capab type %d\n",
2622                     subp->dl_cap));
2623         }
2624 }
2625
/*
 * As part of negotiating polling capability, the driver tells us
 * the default (or normal) blanking interval and packet threshold
 * (the receive timer fires if blanking interval is reached or
 * the packet threshold is reached).
 *
 * As part of manipulating the polling interval, we always use our
 * estimated interval (avg service time * number of packets queued
 * on the squeue) but we try to blank for a minimum of
 * rr_normal_blank_time * rr_max_blank_ratio. We disable the
 * packet threshold during this time. When we are not in polling mode
 * we set the blank interval typically lower, rr_normal_blank_time *
 * rr_min_blank_ratio but up the packet cnt by a ratio of
 * rr_min_pkt_cnt_ratio so that we are still getting chains if
 * possible although for a shorter interval.
 */
/* Default values for the tunable ratios defined below. */
#define RR_MAX_BLANK_RATIO      20
#define RR_MIN_BLANK_RATIO      10
#define RR_MAX_PKT_CNT_RATIO    3
#define RR_MIN_PKT_CNT_RATIO    3

/*
 * These can be tuned via /etc/system.
 * ill_ring_add() multiplies the driver-supplied normal blank time and
 * normal packet count by these ratios to derive each ring's min/max
 * blank time and min/max packet count.
 */
int rr_max_blank_ratio = RR_MAX_BLANK_RATIO;
int rr_min_blank_ratio = RR_MIN_BLANK_RATIO;
int rr_max_pkt_cnt_ratio = RR_MAX_PKT_CNT_RATIO;
int rr_min_pkt_cnt_ratio = RR_MIN_PKT_CNT_RATIO;
2654
2655 static mac_resource_handle_t
2656 ill_ring_add(void *arg, mac_resource_t *mrp)
2657 {
2658         ill_t                   *ill = (ill_t *)arg;
2659         mac_rx_fifo_t           *mrfp = (mac_rx_fifo_t *)mrp;
2660         ill_rx_ring_t           *rx_ring;
2661         int                     ip_rx_index;
2662
2663         ASSERT(mrp != NULL);
2664         if (mrp->mr_type != MAC_RX_FIFO) {
2665                 return (NULL);
2666         }
2667         ASSERT(ill != NULL);
2668         ASSERT(ill->ill_dls_capab != NULL);
2669
2670         mutex_enter(&ill->ill_lock);
2671         for (ip_rx_index = 0; ip_rx_index < ILL_MAX_RINGS; ip_rx_index++) {
2672                 rx_ring = &ill->ill_dls_capab->ill_ring_tbl[ip_rx_index];
2673                 ASSERT(rx_ring != NULL);
2674
2675                 if (rx_ring->rr_ring_state == ILL_RING_FREE) {
2676                         time_t normal_blank_time =
2677                             mrfp->mrf_normal_blank_time;
2678                         uint_t normal_pkt_cnt =
2679                             mrfp->mrf_normal_pkt_count;
2680
2681         bzero(rx_ring, sizeof (ill_rx_ring_t));
2682
2683         rx_ring->rr_blank = mrfp->mrf_blank;
2684         rx_ring->rr_handle = mrfp->mrf_arg;
2685         rx_ring->rr_ill = ill;
2686         rx_ring->rr_normal_blank_time = normal_blank_time;
2687         rx_ring->rr_normal_pkt_cnt = normal_pkt_cnt;
2688
2689                         rx_ring->rr_max_blank_time =
2690                             normal_blank_time * rr_max_blank_ratio;
2691                         rx_ring->rr_min_blank_time =
2692                             normal_blank_time * rr_min_blank_ratio;
2693                         rx_ring->rr_max_pkt_cnt =
2694                             normal_pkt_cnt * rr_max_pkt_cnt_ratio;
2695                         rx_ring->rr_min_pkt_cnt =
2696                             normal_pkt_cnt * rr_min_pkt_cnt_ratio;
2697
2698                         rx_ring->rr_ring_state = ILL_RING_INUSE;
2699                         mutex_exit(&ill->ill_lock);
2700
2701                         DTRACE_PROBE2(ill__ring__add, (void *), ill,
2702                             (int), ip_rx_index);
2703                         return ((mac_resource_handle_t)rx_ring);
2704                 }
2705         }
2706
2707         /*
2708          * We ran out of ILL_MAX_RINGS worth rx_ring structures. If
2709          * we have devices which can overwhelm this limit, ILL_MAX_RING
2710          * should be made configurable. Meanwhile it cause no panic because
2711          * driver will pass ip_input a NULL handle which will make
2712          * IP allocate the default squeue and Polling mode will not
2713          * be used for this ring.
2714          */
2715         cmn_err(CE_NOTE, "Reached maximum number of receiving rings (%d) "
2716             "for %s\n", ILL_MAX_RINGS, ill->ill_name);
2717
2718         mutex_exit(&ill->ill_lock);
2719         return (NULL);
2720 }
2721
2722 static boolean_t
2723 ill_capability_dls_init(ill_t *ill)
2724 {
2725         ill_dls_capab_t *ill_dls = ill->ill_dls_capab;
2726         conn_t                  *connp;
2727         size_t                  sz;
2728         ip_stack_t *ipst = ill->ill_ipst;
2729
2730         if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) {
2731                 if (ill_dls == NULL) {
2732                         cmn_err(CE_PANIC, "ill_capability_dls_init: "
2733                             "soft_ring enabled for ill=%s (%p) but data "
2734                             "structs uninitialized\n", ill->ill_name,
2735                             (void *)ill);
2736                 }
2737                 return (B_TRUE);
2738         } else if (ill->ill_capabilities & ILL_CAPAB_POLL) {
2739                 if (ill_dls == NULL) {
2740                         cmn_err(CE_PANIC, "ill_capability_dls_init: "
2741                             "polling enabled for ill=%s (%p) but data "
2742                             "structs uninitialized\n", ill->ill_name,
2743                             (void *)ill);
2744                 }
2745                 return (B_TRUE);
2746         }
2747
2748         if (ill_dls != NULL) {
2749                 ill_rx_ring_t   *rx_ring = ill_dls->ill_ring_tbl;
2750                 /* Soft_Ring or polling is being re-enabled */
2751
2752                 connp = ill_dls->ill_unbind_conn;
2753                 ASSERT(rx_ring != NULL);
2754                 bzero((void *)ill_dls, sizeof (ill_dls_capab_t));
2755                 bzero((void *)rx_ring,
2756                     sizeof (ill_rx_ring_t) * ILL_MAX_RINGS);
2757                 ill_dls->ill_ring_tbl = rx_ring;
2758                 ill_dls->ill_unbind_conn = connp;
2759                 return (B_TRUE);
2760         }
2761
2762         if ((connp = ipcl_conn_create(IPCL_TCPCONN, KM_NOSLEEP,
2763             ipst->ips_netstack)) == NULL)
2764                 return (B_FALSE);
2765
2766         sz = sizeof (ill_dls_capab_t);
2767         sz += sizeof (ill_rx_ring_t) * ILL_MAX_RINGS;
2768
2769         ill_dls = kmem_zalloc(sz, KM_NOSLEEP);
2770         if (ill_dls == NULL) {
2771                 cmn_err(CE_WARN, "ill_capability_dls_init: could not "
2772                     "allocate dls_capab for %s (%p)\n", ill->ill_name,
2773                     (void *)ill);
2774                 CONN_DEC_REF(connp);
2775                 return (B_FALSE);
2776         }
2777
2778         /* Allocate space to hold ring table */
2779         ill_dls->ill_ring_tbl = (ill_rx_ring_t *)&ill_dls[1];
2780         ill->ill_dls_capab = ill_dls;
2781         ill_dls->ill_unbind_conn = connp;
2782         return (B_TRUE);
2783 }
2784
2785 /*
2786  * ill_capability_dls_disable: disable soft_ring and/or polling
2787  * capability. Since any of the rings might already be in use, need
2788  * to call ip_squeue_clean_all() which gets behind the squeue to disable
2789  * direct calls if necessary.
2790  */
2791 static void
2792 ill_capability_dls_disable(ill_t *ill)
2793 {
2794         ill_dls_capab_t *ill_dls = ill->ill_dls_capab;
2795
2796         if (ill->ill_capabilities & ILL_CAPAB_DLS) {
2797                 ip_squeue_clean_all(ill);
2798                 ill_dls->ill_tx = NULL;
2799                 ill_dls->ill_tx_handle = NULL;
2800                 ill_dls->ill_dls_change_status = NULL;
2801                 ill_dls->ill_dls_bind = NULL;
2802                 ill_dls->ill_dls_unbind = NULL;
2803         }
2804
2805         ASSERT(!(ill->ill_capabilities & ILL_CAPAB_DLS));
2806 }
2807
2808 static void
2809 ill_capability_dls_capable(ill_t *ill, dl_capab_dls_t *idls,
2810     dl_capability_sub_t *isub)
2811 {
2812         uint_t                  size;
2813         uchar_t                 *rptr;
2814         dl_capab_dls_t  dls, *odls;
2815         ill_dls_capab_t *ill_dls;
2816         mblk_t                  *nmp = NULL;
2817         dl_capability_req_t     *ocap;
2818         uint_t                  sub_dl_cap = isub->dl_cap;
2819
2820         if (!ill_capability_dls_init(ill))
2821                 return;
2822         ill_dls = ill->ill_dls_capab;
2823
2824         /* Copy locally to get the members aligned */
2825         bcopy((void *)idls, (void *)&dls,
2826             sizeof (dl_capab_dls_t));
2827
2828         /* Get the tx function and handle from dld */
2829         ill_dls->ill_tx = (ip_dld_tx_t)dls.dls_tx;
2830         ill_dls->ill_tx_handle = (void *)dls.dls_tx_handle;
2831
2832         if (sub_dl_cap == DL_CAPAB_SOFT_RING) {
2833                 ill_dls->ill_dls_change_status =
2834                     (ip_dls_chg_soft_ring_t)dls.dls_ring_change_status;
2835                 ill_dls->ill_dls_bind = (ip_dls_bind_t)dls.dls_ring_bind;
2836                 ill_dls->ill_dls_unbind =
2837                     (ip_dls_unbind_t)dls.dls_ring_unbind;
2838                 ill_dls->ill_dls_soft_ring_cnt = ip_soft_rings_cnt;
2839         }
2840
2841         size = sizeof (dl_capability_req_t) + sizeof (dl_capability_sub_t) +
2842             isub->dl_length;
2843
2844         if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) {
2845                 cmn_err(CE_WARN, "ill_capability_dls_capable: could "
2846                     "not allocate memory for CAPAB_REQ for %s (%p)\n",
2847                     ill->ill_name, (void *)ill);
2848                 return;
2849         }
2850
2851         /* initialize dl_capability_req_t */
2852         rptr = nmp->b_rptr;
2853         ocap = (dl_capability_req_t *)rptr;
2854         ocap->dl_sub_offset = sizeof (dl_capability_req_t);
2855         ocap->dl_sub_length = sizeof (dl_capability_sub_t) + isub->dl_length;
2856         rptr += sizeof (dl_capability_req_t);
2857
2858         /* initialize dl_capability_sub_t */
2859         bcopy(isub, rptr, sizeof (*isub));
2860         rptr += sizeof (*isub);
2861
2862         odls = (dl_capab_dls_t *)rptr;
2863         rptr += sizeof (dl_capab_dls_t);
2864
2865         /* initialize dl_capab_dls_t to be sent down */
2866         dls.dls_rx_handle = (uintptr_t)ill;
2867         dls.dls_rx = (uintptr_t)ip_input;
2868         dls.dls_ring_add = (uintptr_t)ill_ring_add;
2869
2870         if (sub_dl_cap == DL_CAPAB_SOFT_RING) {
2871                 dls.dls_ring_cnt = ip_soft_rings_cnt;
2872                 dls.dls_ring_assign = (uintptr_t)ip_soft_ring_assignment;
2873                 dls.dls_flags = SOFT_RING_ENABLE;
2874         } else {
2875                 dls.dls_flags = POLL_ENABLE;
2876                 ip1dbg(("ill_capability_dls_capable: asking interface %s "
2877                     "to enable polling\n", ill->ill_name));
2878         }
2879         bcopy((void *)&dls, (void *)odls,
2880             sizeof (dl_capab_dls_t));
2881         ASSERT(nmp->b_wptr == (nmp->b_rptr + size));
2882         /*
2883          * nmp points to a DL_CAPABILITY_REQ message to
2884          * enable either soft_ring or polling
2885          */
2886         ill_dlpi_send(ill, nmp);
2887 }
2888
2889 static void
2890 ill_capability_dls_reset(ill_t *ill, mblk_t **sc_mp)
2891 {
2892         mblk_t *mp;
2893         dl_capab_dls_t *idls;
2894         dl_capability_sub_t *dl_subcap;
2895         int size;
2896
2897         if (!(ill->ill_capabilities & ILL_CAPAB_DLS))
2898                 return;
2899
2900         ASSERT(ill->ill_dls_capab != NULL);
2901
2902         size = sizeof (*dl_subcap) + sizeof (*idls);
2903
2904         mp = allocb(size, BPRI_HI);
2905         if (mp == NULL) {
2906                 ip1dbg(("ill_capability_dls_reset: unable to allocate "
2907                     "request to disable soft_ring\n"));
2908                 return;
2909         }
2910
2911         mp->b_wptr = mp->b_rptr + size;
2912
2913         dl_subcap = (dl_capability_sub_t *)mp->b_rptr;
2914         dl_subcap->dl_length = sizeof (*idls);
2915         if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING)
2916                 dl_subcap->dl_cap = DL_CAPAB_SOFT_RING;
2917         else
2918                 dl_subcap->dl_cap = DL_CAPAB_POLL;
2919
2920         idls = (dl_capab_dls_t *)(dl_subcap + 1);
2921         if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING)
2922                 idls->dls_flags = SOFT_RING_DISABLE;
2923         else
2924                 idls->dls_flags = POLL_DISABLE;
2925
2926         if (*sc_mp != NULL)
2927                 linkb(*sc_mp, mp);
2928         else
2929                 *sc_mp = mp;
2930 }
2931
2932 /*
2933  * Process a soft_ring/poll capability negotiation ack received
2934  * from a DLS Provider.isub must point to the sub-capability
2935  * (DL_CAPAB_SOFT_RING/DL_CAPAB_POLL) of a DL_CAPABILITY_ACK message.
2936  */
2937 static void
2938 ill_capability_dls_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
2939 {
2940         dl_capab_dls_t          *idls;
2941         uint_t                  sub_dl_cap = isub->dl_cap;
2942         uint8_t                 *capend;
2943
2944         ASSERT(sub_dl_cap == DL_CAPAB_SOFT_RING ||
2945             sub_dl_cap == DL_CAPAB_POLL);
2946
2947         if (ill->ill_isv6)
2948                 return;
2949
2950         /*
2951          * Note: range checks here are not absolutely sufficient to
2952          * make us robust against malformed messages sent by drivers;
2953          * this is in keeping with the rest of IP's dlpi handling.
2954          * (Remember, it's coming from something else in the kernel
2955          * address space)
2956          */
2957         capend = (uint8_t *)(isub + 1) + isub->dl_length;
2958         if (capend > mp->b_wptr) {
2959                 cmn_err(CE_WARN, "ill_capability_dls_ack: "
2960                     "malformed sub-capability too long for mblk");
2961                 return;
2962         }
2963
2964         /*
2965          * There are two types of acks we process here:
2966          * 1. acks in reply to a (first form) generic capability req
2967          *    (dls_flag will be set to SOFT_RING_CAPABLE or POLL_CAPABLE)
2968          * 2. acks in reply to a SOFT_RING_ENABLE or POLL_ENABLE
2969          *    capability req.
2970          */
2971         idls = (dl_capab_dls_t *)(isub + 1);
2972
2973         if (!dlcapabcheckqid(&idls->dls_mid, ill->ill_lmod_rq)) {
2974                 ip1dbg(("ill_capability_dls_ack: mid token for dls "
2975                     "capability isn't as expected; pass-thru "
2976                     "module(s) detected, discarding capability\n"));
2977                 if (ill->ill_capabilities & ILL_CAPAB_DLS) {
2978                         /*
2979                          * This is a capability renegotitation case.
2980                          * The interface better be unusable at this
2981                          * point other wise bad things will happen
2982                          * if we disable direct calls on a running
2983                          * and up interface.
2984                          */
2985                         ill_capability_dls_disable(ill);
2986                 }
2987                 return;
2988         }
2989
2990         switch (idls->dls_flags) {
2991         default:
2992                 /* Disable if unknown flag */
2993         case SOFT_RING_DISABLE:
2994         case POLL_DISABLE:
2995                 ill_capability_dls_disable(ill);
2996                 break;
2997         case SOFT_RING_CAPABLE:
2998         case POLL_CAPABLE:
2999                 /*
3000                  * If the capability was already enabled, its safe
3001                  * to disable it first to get rid of stale information
3002                  * and then start enabling it again.
3003                  */
3004                 ill_capability_dls_disable(ill);
3005                 ill_capability_dls_capable(ill, idls, isub);
3006                 break;
3007         case SOFT_RING_ENABLE:
3008         case POLL_ENABLE:
3009                 mutex_enter(&ill->ill_lock);
3010                 if (sub_dl_cap == DL_CAPAB_SOFT_RING &&
3011                     !(ill->ill_capabilities & ILL_CAPAB_SOFT_RING)) {
3012                         ASSERT(ill->ill_dls_capab != NULL);
3013                         ill->ill_capabilities |= ILL_CAPAB_SOFT_RING;
3014                 }
3015                 if (sub_dl_cap == DL_CAPAB_POLL &&
3016                     !(ill->ill_capabilities & ILL_CAPAB_POLL)) {
3017                         ASSERT(ill->ill_dls_capab != NULL);
3018                         ill->ill_capabilities |= ILL_CAPAB_POLL;
3019                         ip1dbg(("ill_capability_dls_ack: interface %s "
3020                             "has enabled polling\n", ill->ill_name));
3021                 }
3022                 mutex_exit(&ill->ill_lock);
3023                 break;
3024         }
3025 }
3026
/*
 * Process a hardware checksum offload capability negotiation ack received
 * from a DLS Provider.  isub must point to the sub-capability
 * (DL_CAPAB_HCKSUM) of a DL_CAPABILITY_ACK message.
 */
3032 static void
3033 ill_capability_hcksum_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
3034 {
3035         dl_capability_req_t     *ocap;
3036         dl_capab_hcksum_t       *ihck, *ohck;
3037         ill_hcksum_capab_t      **ill_hcksum;
3038         mblk_t                  *nmp = NULL;
3039         uint_t                  sub_dl_cap = isub->dl_cap;
3040         uint8_t                 *capend;
3041
3042         ASSERT(sub_dl_cap == DL_CAPAB_HCKSUM);
3043
3044         ill_hcksum = (ill_hcksum_capab_t **)&ill->ill_hcksum_capab;
3045
3046         /*
3047          * Note: range checks here are not absolutely sufficient to
3048          * make us robust against malformed messages sent by drivers;
3049          * this is in keeping with the rest of IP's dlpi handling.
3050          * (Remember, it's coming from something else in the kernel
3051          * address space)
3052          */
3053         capend = (uint8_t *)(isub + 1) + isub->dl_length;
3054         if (capend > mp->b_wptr) {
3055                 cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
3056                     "malformed sub-capability too long for mblk");
3057                 return;
3058         }
3059
3060         /*
3061          * There are two types of acks we process here:
3062          * 1. acks in reply to a (first form) generic capability req
3063          *    (no ENABLE flag set)
3064          * 2. acks in reply to a ENABLE capability req.
3065          *    (ENABLE flag set)
3066          */
3067         ihck = (dl_capab_hcksum_t *)(isub + 1);
3068
3069         if (ihck->hcksum_version != HCKSUM_VERSION_1) {
3070                 cmn_err(CE_CONT, "ill_capability_hcksum_ack: "
3071                     "unsupported hardware checksum "
3072                     "sub-capability (version %d, expected %d)",
3073                     ihck->hcksum_version, HCKSUM_VERSION_1);
3074                 return;
3075         }
3076
3077         if (!dlcapabcheckqid(&ihck->hcksum_mid, ill->ill_lmod_rq)) {
3078                 ip1dbg(("ill_capability_hcksum_ack: mid token for hardware "
3079                     "checksum capability isn't as expected; pass-thru "
3080                     "module(s) detected, discarding capability\n"));
3081                 return;
3082         }
3083
3084 #define CURR_HCKSUM_CAPAB                               \
3085         (HCKSUM_INET_PARTIAL | HCKSUM_INET_FULL_V4 |    \
3086         HCKSUM_INET_FULL_V6 | HCKSUM_IPHDRCKSUM)
3087
3088         if ((ihck->hcksum_txflags & HCKSUM_ENABLE) &&
3089             (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB)) {
3090                 /* do ENABLE processing */
3091                 if (*ill_hcksum == NULL) {
3092                         *ill_hcksum = kmem_zalloc(sizeof (ill_hcksum_capab_t),
3093                             KM_NOSLEEP);
3094
3095                         if (*ill_hcksum == NULL) {
3096                                 cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
3097                                     "could not enable hcksum version %d "
3098                                     "for %s (ENOMEM)\n", HCKSUM_CURRENT_VERSION,
3099                                     ill->ill_name);
3100                                 return;
3101                         }
3102                 }
3103
3104                 (*ill_hcksum)->ill_hcksum_version = ihck->hcksum_version;
3105                 (*ill_hcksum)->ill_hcksum_txflags = ihck->hcksum_txflags;
3106                 ill->ill_capabilities |= ILL_CAPAB_HCKSUM;
3107                 ip1dbg(("ill_capability_hcksum_ack: interface %s "
3108                     "has enabled hardware checksumming\n ",
3109                     ill->ill_name));
3110         } else if (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB) {
3111                 /*
3112                  * Enabling hardware checksum offload
3113                  * Currently IP supports {TCP,UDP}/IPv4
3114                  * partial and full cksum offload and
3115                  * IPv4 header checksum offload.
3116                  * Allocate new mblk which will
3117                  * contain a new capability request
3118                  * to enable hardware checksum offload.
3119                  */
3120                 uint_t  size;
3121                 uchar_t *rptr;
3122
3123                 size = sizeof (dl_capability_req_t) +
3124                     sizeof (dl_capability_sub_t) + isub->dl_length;
3125
3126                 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) {
3127                         cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
3128                             "could not enable hardware cksum for %s (ENOMEM)\n",
3129                             ill->ill_name);
3130                         return;
3131                 }
3132
3133                 rptr = nmp->b_rptr;
3134                 /* initialize dl_capability_req_t */
3135                 ocap = (dl_capability_req_t *)nmp->b_rptr;
3136                 ocap->dl_sub_offset =
3137                     sizeof (dl_capability_req_t);
3138                 ocap->dl_sub_length =
3139                     sizeof (dl_capability_sub_t) +
3140                     isub->dl_length;
3141                 nmp->b_rptr += sizeof (dl_capability_req_t);
3142
3143                 /* initialize dl_capability_sub_t */
3144                 bcopy(isub, nmp->b_rptr, sizeof (*isub));
3145                 nmp->b_rptr += sizeof (*isub);
3146
3147                 /* initialize dl_capab_hcksum_t */
3148                 ohck = (dl_capab_hcksum_t *)nmp->b_rptr;
3149                 bcopy(ihck, ohck, sizeof (*ihck));
3150
3151                 nmp->b_rptr = rptr;
3152                 ASSERT(nmp->b_wptr == (nmp->b_rptr + size));
3153
3154                 /* Set ENABLE flag */
3155                 ohck->hcksum_txflags &= CURR_HCKSUM_CAPAB;
3156                 ohck->hcksum_txflags |= HCKSUM_ENABLE;
3157
3158                 /*
3159                  * nmp points to a DL_CAPABILITY_REQ message to enable
3160                  * hardware checksum acceleration.
3161                  */
3162                 ill_dlpi_send(ill, nmp);
3163         } else {
3164                 ip1dbg(("ill_capability_hcksum_ack: interface %s has "
3165                     "advertised %x hardware checksum capability flags\n",
3166                     ill->ill_name, ihck->hcksum_txflags));
3167         }
3168 }
3169
3170 static void
3171 ill_capability_hcksum_reset(ill_t *ill, mblk_t **sc_mp)
3172 {
3173         mblk_t *mp;
3174         dl_capab_hcksum_t *hck_subcap;
3175         dl_capability_sub_t *dl_subcap;
3176         int size;
3177
3178         if (!ILL_HCKSUM_CAPABLE(ill))
3179                 return;
3180
3181         ASSERT(ill->ill_hcksum_capab != NULL);
3182         /*
3183          * Clear the capability flag for hardware checksum offload but
3184          * retain the ill_hcksum_capab structure since it's possible that
3185          * another thread is still referring to it.  The structure only
3186          * gets deallocated when we destroy the ill.
3187          */
3188         ill->ill_capabilities &= ~ILL_CAPAB_HCKSUM;
3189
3190         size = sizeof (*dl_subcap) + sizeof (*hck_subcap);
3191
3192         mp = allocb(size, BPRI_HI);
3193         if (mp == NULL) {
3194                 ip1dbg(("ill_capability_hcksum_reset: unable to allocate "
3195                     "request to disable hardware checksum offload\n"));
3196                 return;
3197         }
3198
3199         mp->b_wptr = mp->b_rptr + size;
3200
3201         dl_subcap = (dl_capability_sub_t *)mp->b_rptr;
3202         dl_subcap->dl_cap = DL_CAPAB_HCKSUM;
3203         dl_subcap->dl_length = sizeof (*hck_subcap);
3204
3205         hck_subcap = (dl_capab_hcksum_t *)(dl_subcap + 1);
3206         hck_subcap->hcksum_version = ill->ill_hcksum_capab->ill_hcksum_version;
3207         hck_subcap->hcksum_txflags = 0;
3208
3209         if (*sc_mp != NULL)
3210                 linkb(*sc_mp, mp);
3211         else
3212                 *sc_mp = mp;
3213 }
3214
3215 static void
3216 ill_capability_zerocopy_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
3217 {
3218         mblk_t *nmp = NULL;
3219         dl_capability_req_t *oc;
3220         dl_capab_zerocopy_t *zc_ic, *zc_oc;
3221         ill_zerocopy_capab_t **ill_zerocopy_capab;
3222         uint_t sub_dl_cap = isub->dl_cap;
3223         uint8_t *capend;
3224
3225         ASSERT(sub_dl_cap == DL_CAPAB_ZEROCOPY);
3226
3227         ill_zerocopy_capab = (ill_zerocopy_capab_t **)&ill->ill_zerocopy_capab;
3228
3229         /*
3230          * Note: range checks here are not absolutely sufficient to
3231          * make us robust against malformed messages sent by drivers;
3232          * this is in keeping with the rest of IP's dlpi handling.
3233          * (Remember, it's coming from something else in the kernel
3234          * address space)
3235          */
3236         capend = (uint8_t *)(isub + 1) + isub->dl_length;
3237         if (capend > mp->b_wptr) {
3238                 cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
3239                     "malformed sub-capability too long for mblk");
3240                 return;
3241         }
3242
3243         zc_ic = (dl_capab_zerocopy_t *)(isub + 1);
3244         if (zc_ic->zerocopy_version != ZEROCOPY_VERSION_1) {
3245                 cmn_err(CE_CONT, "ill_capability_zerocopy_ack: "
3246                     "unsupported ZEROCOPY sub-capability (version %d, "
3247                     "expected %d)", zc_ic->zerocopy_version,
3248                     ZEROCOPY_VERSION_1);
3249                 return;
3250         }
3251
3252         if (!dlcapabcheckqid(&zc_ic->zerocopy_mid, ill->ill_lmod_rq)) {
3253                 ip1dbg(("ill_capability_zerocopy_ack: mid token for zerocopy "
3254                     "capability isn't as expected; pass-thru module(s) "
3255                     "detected, discarding capability\n"));
3256                 return;
3257         }
3258
3259         if ((zc_ic->zerocopy_flags & DL_CAPAB_VMSAFE_MEM) != 0) {
3260                 if (*ill_zerocopy_capab == NULL) {
3261                         *ill_zerocopy_capab =
3262                             kmem_zalloc(sizeof (ill_zerocopy_capab_t),
3263                             KM_NOSLEEP);
3264
3265                         if (*ill_zerocopy_capab == NULL) {
3266                                 cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
3267                                     "could not enable Zero-copy version %d "
3268                                     "for %s (ENOMEM)\n", ZEROCOPY_VERSION_1,
3269                                     ill->ill_name);
3270                                 return;
3271                         }
3272                 }
3273
3274                 ip1dbg(("ill_capability_zerocopy_ack: interface %s "
3275                     "supports Zero-copy version %d\n", ill->ill_name,
3276                     ZEROCOPY_VERSION_1));
3277
3278                 (*ill_zerocopy_capab)->ill_zerocopy_version =
3279                     zc_ic->zerocopy_version;
3280                 (*ill_zerocopy_capab)->ill_zerocopy_flags =
3281                     zc_ic->zerocopy_flags;
3282
3283                 ill->ill_capabilities |= ILL_CAPAB_ZEROCOPY;
3284         } else {
3285                 uint_t size;
3286                 uchar_t *rptr;
3287
3288                 size = sizeof (dl_capability_req_t) +
3289                     sizeof (dl_capability_sub_t) +
3290                     sizeof (dl_capab_zerocopy_t);
3291
3292                 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) {
3293                         cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
3294                             "could not enable zerocopy for %s (ENOMEM)\n",
3295                             ill->ill_name);
3296                         return;
3297                 }
3298
3299                 rptr = nmp->b_rptr;
3300                 /* initialize dl_capability_req_t */
3301                 oc = (dl_capability_req_t *)rptr;
3302                 oc->dl_sub_offset = sizeof (dl_capability_req_t);
3303                 oc->dl_sub_length = sizeof (dl_capability_sub_t) +
3304                     sizeof (dl_capab_zerocopy_t);
3305                 rptr += sizeof (dl_capability_req_t);
3306
3307                 /* initialize dl_capability_sub_t */
3308                 bcopy(isub, rptr, sizeof (*isub));
3309                 rptr += sizeof (*isub);
3310
3311                 /* initialize dl_capab_zerocopy_t */
3312                 zc_oc = (dl_capab_zerocopy_t *)rptr;
3313                 *zc_oc = *zc_ic;
3314
3315                 ip1dbg(("ill_capability_zerocopy_ack: asking interface %s "
3316                     "to enable zero-copy version %d\n", ill->ill_name,
3317                     ZEROCOPY_VERSION_1));
3318
3319                 /* set VMSAFE_MEM flag */
3320                 zc_oc->zerocopy_flags |= DL_CAPAB_VMSAFE_MEM;
3321
3322                 /* nmp points to a DL_CAPABILITY_REQ message to enable zcopy */
3323                 ill_dlpi_send(ill, nmp);
3324         }
3325 }
3326
3327 static void
3328 ill_capability_zerocopy_reset(ill_t *ill, mblk_t **sc_mp)
3329 {
3330         mblk_t *mp;
3331         dl_capab_zerocopy_t *zerocopy_subcap;
3332         dl_capability_sub_t *dl_subcap;
3333         int size;
3334
3335         if (!(ill->ill_capabilities & ILL_CAPAB_ZEROCOPY))
3336                 return;
3337
3338         ASSERT(ill->ill_zerocopy_capab != NULL);
3339         /*
3340          * Clear the capability flag for Zero-copy but retain the
3341          * ill_zerocopy_capab structure since it's possible that another
3342          * thread is still referring to it.  The structure only gets
3343          * deallocated when we destroy the ill.
3344          */
3345         ill->ill_capabilities &= ~ILL_CAPAB_ZEROCOPY;
3346
3347         size = sizeof (*dl_subcap) + sizeof (*zerocopy_subcap);
3348
3349         mp = allocb(size, BPRI_HI);
3350         if (mp == NULL) {
3351                 ip1dbg(("ill_capability_zerocopy_reset: unable to allocate "
3352                     "request to disable Zero-copy\n"));
3353                 return;
3354         }
3355
3356         mp->b_wptr = mp->b_rptr + size;
3357
3358         dl_subcap = (dl_capability_sub_t *)mp->b_rptr;
3359         dl_subcap->dl_cap = DL_CAPAB_ZEROCOPY;
3360         dl_subcap->dl_length = sizeof (*zerocopy_subcap);
3361
3362         zerocopy_subcap = (dl_capab_zerocopy_t *)(dl_subcap + 1);
3363         zerocopy_subcap->zerocopy_version =
3364             ill->ill_zerocopy_capab->ill_zerocopy_version;
3365         zerocopy_subcap->zerocopy_flags = 0;
3366
3367         if (*sc_mp != NULL)
3368                 linkb(*sc_mp, mp);
3369         else
3370                 *sc_mp = mp;
3371 }
3372
3373 /*
3374  * Process Large Segment Offload capability negotiation ack received from a
3375  * DLS Provider.  isub must point to the sub-capability (DL_CAPAB_LSO) of a
3376  * DL_CAPABILITY_ACK message.
3377  */
3378 static void
3379 ill_capability_lso_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
3380 {
3381         mblk_t *nmp = NULL;
3382         dl_capability_req_t *oc;
3383         dl_capab_lso_t *lso_ic, *lso_oc;
3384         ill_lso_capab_t **ill_lso_capab;
3385         uint_t sub_dl_cap = isub->dl_cap;
3386         uint8_t *capend;
3387
3388         ASSERT(sub_dl_cap == DL_CAPAB_LSO);
3389
3390         ill_lso_capab = (ill_lso_capab_t **)&ill->ill_lso_capab;
3391
3392         /*
3393          * Note: range checks here are not absolutely sufficient to
3394          * make us robust against malformed messages sent by drivers;
3395          * this is in keeping with the rest of IP's dlpi handling.
3396          * (Remember, it's coming from something else in the kernel
3397          * address space)
3398          */
3399         capend = (uint8_t *)(isub + 1) + isub->dl_length;
3400         if (capend > mp->b_wptr) {
3401                 cmn_err(CE_WARN, "ill_capability_lso_ack: "
3402                     "malformed sub-capability too long for mblk");
3403                 return;
3404         }
3405
3406         lso_ic = (dl_capab_lso_t *)(isub + 1);
3407
3408         if (lso_ic->lso_version != LSO_VERSION_1) {
3409                 cmn_err(CE_CONT, "ill_capability_lso_ack: "
3410                     "unsupported LSO sub-capability (version %d, expected %d)",
3411                     lso_ic->lso_version, LSO_VERSION_1);
3412                 return;
3413         }
3414
3415         if (!dlcapabcheckqid(&lso_ic->lso_mid, ill->ill_lmod_rq)) {
3416                 ip1dbg(("ill_capability_lso_ack: mid token for LSO "
3417                     "capability isn't as expected; pass-thru module(s) "
3418                     "detected, discarding capability\n"));
3419                 return;
3420         }
3421
3422         if ((lso_ic->lso_flags & LSO_TX_ENABLE) &&
3423             (lso_ic->lso_flags & LSO_TX_BASIC_TCP_IPV4)) {
3424                 if (*ill_lso_capab == NULL) {
3425                         *ill_lso_capab = kmem_zalloc(sizeof (ill_lso_capab_t),
3426                             KM_NOSLEEP);
3427
3428                         if (*ill_lso_capab == NULL) {
3429                                 cmn_err(CE_WARN, "ill_capability_lso_ack: "
3430                                     "could not enable LSO version %d "
3431                                     "for %s (ENOMEM)\n", LSO_VERSION_1,
3432                                     ill->ill_name);
3433                                 return;
3434                         }
3435                 }
3436
3437                 (*ill_lso_capab)->ill_lso_version = lso_ic->lso_version;
3438                 (*ill_lso_capab)->ill_lso_flags = lso_ic->lso_flags;
3439                 (*ill_lso_capab)->ill_lso_max = lso_ic->lso_max;
3440                 ill->ill_capabilities |= ILL_CAPAB_LSO;
3441
3442                 ip1dbg(("ill_capability_lso_ack: interface %s "
3443                     "has enabled LSO\n ", ill->ill_name));
3444         } else if (lso_ic->lso_flags & LSO_TX_BASIC_TCP_IPV4) {
3445                 uint_t size;
3446                 uchar_t *rptr;
3447
3448                 size = sizeof (dl_capability_req_t) +
3449                     sizeof (dl_capability_sub_t) + sizeof (dl_capab_lso_t);
3450
3451                 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) {
3452                         cmn_err(CE_WARN, "ill_capability_lso_ack: "
3453                             "could not enable LSO for %s (ENOMEM)\n",
3454                             ill->ill_name);
3455                         return;
3456                 }
3457
3458                 rptr = nmp->b_rptr;
3459                 /* initialize dl_capability_req_t */
3460                 oc = (dl_capability_req_t *)nmp->b_rptr;
3461                 oc->dl_sub_offset = sizeof (dl_capability_req_t);
3462                 oc->dl_sub_length = sizeof (dl_capability_sub_t) +
3463                     sizeof (dl_capab_lso_t);
3464                 nmp->b_rptr += sizeof (dl_capability_req_t);
3465
3466                 /* initialize dl_capability_sub_t */
3467                 bcopy(isub, nmp->b_rptr, sizeof (*isub));
3468                 nmp->b_rptr += sizeof (*isub);
3469
3470                 /* initialize dl_capab_lso_t */
3471                 lso_oc = (dl_capab_lso_t *)nmp->b_rptr;
3472                 bcopy(lso_ic, lso_oc, sizeof (*lso_ic));
3473
3474                 nmp->b_rptr = rptr;
3475                 ASSERT(nmp->b_wptr == (nmp->b_rptr + size));
3476
3477                 /* set ENABLE flag */
3478                 lso_oc->lso_flags |= LSO_TX_ENABLE;
3479
3480                 /* nmp points to a DL_CAPABILITY_REQ message to enable LSO */
3481                 ill_dlpi_send(ill, nmp);
3482         } else {
3483                 ip1dbg(("ill_capability_lso_ack: interface %s has "
3484                     "advertised %x LSO capability flags\n",
3485                     ill->ill_name, lso_ic->lso_flags));
3486         }
3487 }
3488
3489
3490 static void
3491 ill_capability_lso_reset(ill_t *ill, mblk_t **sc_mp)
3492 {
3493         mblk_t *mp;
3494         dl_capab_lso_t *lso_subcap;
3495         dl_capability_sub_t *dl_subcap;
3496         int size;
3497
3498         if (!(ill->ill_capabilities & ILL_CAPAB_LSO))
3499                 return;
3500
3501         ASSERT(ill->ill_lso_capab != NULL);
3502         /*
3503          * Clear the capability flag for LSO but retain the
3504          * ill_lso_capab structure since it's possible that another
3505          * thread is still referring to it.  The structure only gets
3506          * deallocated when we destroy the ill.
3507          */
3508         ill->ill_capabilities &= ~ILL_CAPAB_LSO;
3509
3510         size = sizeof (*dl_subcap) + sizeof (*lso_subcap);
3511
3512         mp = allocb(size, BPRI_HI);
3513         if (mp == NULL) {
3514                 ip1dbg(("ill_capability_lso_reset: unable to allocate "
3515                     "request to disable LSO\n"));
3516                 return;
3517         }
3518
3519         mp->b_wptr = mp->b_rptr + size;
3520
3521         dl_subcap = (dl_capability_sub_t *)mp->b_rptr;
3522         dl_subcap->dl_cap = DL_CAPAB_LSO;
3523         dl_subcap->dl_length = sizeof (*lso_subcap);
3524
3525         lso_subcap = (dl_capab_lso_t *)(dl_subcap + 1);
3526         lso_subcap->lso_version = ill->ill_lso_capab->ill_lso_version;
3527         lso_subcap->lso_flags = 0;
3528
3529         if (*sc_mp != NULL)
3530                 linkb(*sc_mp, mp);
3531         else
3532                 *sc_mp = mp;
3533 }
3534
3535 /*
3536  * Consume a new-style hardware capabilities negotiation ack.
3537  * Called from ip_rput_dlpi_writer().
3538  */
3539 void
3540 ill_capability_ack(ill_t *ill, mblk_t *mp)
3541 {
3542         dl_capability_ack_t *capp;
3543         dl_capability_sub_t *subp, *endp;
3544
3545         if (ill->ill_dlpi_capab_state == IDS_INPROGRESS)
3546                 ill->ill_dlpi_capab_state = IDS_OK;
3547
3548         capp = (dl_capability_ack_t *)mp->b_rptr;
3549
3550         if (capp->dl_sub_length == 0)
3551                 /* no new-style capabilities */
3552                 return;
3553
3554         /* make sure the driver supplied correct dl_sub_length */
3555         if ((sizeof (*capp) + capp->dl_sub_length) > MBLKL(mp)) {
3556                 ip0dbg(("ill_capability_ack: bad DL_CAPABILITY_ACK, "
3557                     "invalid dl_sub_length (%d)\n", capp->dl_sub_length));
3558                 return;
3559         }
3560
3561 #define SC(base, offset) (dl_capability_sub_t *)(((uchar_t *)(base))+(offset))
3562         /*
3563          * There are sub-capabilities. Process the ones we know about.
3564          * Loop until we don't have room for another sub-cap header..
3565          */
3566         for (subp = SC(capp, capp->dl_sub_offset),
3567             endp = SC(subp, capp->dl_sub_length - sizeof (*subp));
3568             subp <= endp;
3569             subp = SC(subp, sizeof (dl_capability_sub_t) + subp->dl_length)) {
3570
3571                 switch (subp->dl_cap) {
3572                 case DL_CAPAB_ID_WRAPPER:
3573                         ill_capability_id_ack(ill, mp, subp);
3574                         break;
3575                 default:
3576                         ill_capability_dispatch(ill, mp, subp, B_FALSE);
3577                         break;
3578                 }
3579         }
3580 #undef SC
3581 }
3582
3583 /*
3584  * This routine is called to scan the fragmentation reassembly table for
3585  * the specified ILL for any packets that are starting to smell.
3586  * dead_interval is the maximum time in seconds that will be tolerated.  It
3587  * will either be the value specified in ip_g_frag_timeout, or zero if the
3588  * ILL is shutting down and it is time to blow everything off.
3589  *
3590  * It returns the number of seconds (as a time_t) that the next frag timer
3591  * should be scheduled for, 0 meaning that the timer doesn't need to be
3592  * re-started.  Note that the method of calculating next_timeout isn't
3593  * entirely accurate since time will flow between the time we grab
3594  * current_time and the time we schedule the next timeout.  This isn't a
3595  * big problem since this is the timer for sending an ICMP reassembly time
3596  * exceeded messages, and it doesn't have to be exactly accurate.
3597  *
3598  * This function is
3599  * sometimes called as writer, although this is not required.
3600  */
3601 time_t
3602 ill_frag_timeout(ill_t *ill, time_t dead_interval)
3603 {
3604         ipfb_t  *ipfb;
3605         ipfb_t  *endp;
3606         ipf_t   *ipf;
3607         ipf_t   *ipfnext;
3608         mblk_t  *mp;
3609         time_t  current_time = gethrestime_sec();
3610         time_t  next_timeout = 0;
3611         uint32_t        hdr_length;
3612         mblk_t  *send_icmp_head;
3613         mblk_t  *send_icmp_head_v6;
3614         zoneid_t zoneid;
3615         ip_stack_t *ipst = ill->ill_ipst;
3616
3617         ipfb = ill->ill_frag_hash_tbl;
3618         if (ipfb == NULL)
3619                 return (B_FALSE);
3620         endp = &ipfb[ILL_FRAG_HASH_TBL_COUNT];
3621         /* Walk the frag hash table. */
3622         for (; ipfb < endp; ipfb++) {
3623                 send_icmp_head = NULL;
3624                 send_icmp_head_v6 = NULL;
3625                 mutex_enter(&ipfb->ipfb_lock);
3626                 while ((ipf = ipfb->ipfb_ipf) != 0) {
3627                         time_t frag_time = current_time - ipf->ipf_timestamp;
3628                         time_t frag_timeout;
3629
3630                         if (frag_time < dead_interval) {
3631                                 /*
3632                                  * There are some outstanding fragments
3633                                  * that will timeout later.  Make note of
3634                                  * the time so that we can reschedule the
3635                                  * next timeout appropriately.
3636                                  */
3637                                 frag_timeout = dead_interval - frag_time;
3638                                 if (next_timeout == 0 ||
3639                                     frag_timeout < next_timeout) {
3640                                         next_timeout = frag_timeout;
3641                                 }
3642                                 break;
3643                         }
3644                         /* Time's up.  Get it out of here. */
3645                         hdr_length = ipf->ipf_nf_hdr_len;
3646                         ipfnext = ipf->ipf_hash_next;
3647                         if (ipfnext)
3648                                 ipfnext->ipf_ptphn = ipf->ipf_ptphn;
3649                         *ipf->ipf_ptphn = ipfnext;
3650                         mp = ipf->ipf_mp->b_cont;
3651                         for (; mp; mp = mp->b_cont) {
3652                                 /* Extra points for neatness. */
3653                                 IP_REASS_SET_START(mp, 0);
3654                                 IP_REASS_SET_END(mp, 0);
3655                         }
3656                         mp = ipf->ipf_mp->b_cont;
3657                         ill->ill_frag_count -= ipf->ipf_count;
3658                         ASSERT(ipfb->ipfb_count >= ipf->ipf_count);
3659                         ipfb->ipfb_count -= ipf->ipf_count;
3660                         ASSERT(ipfb->ipfb_frag_pkts > 0);
3661                         ipfb->ipfb_frag_pkts--;
3662                         /*
3663                          * We do not send any icmp message from here because
3664                          * we currently are holding the ipfb_lock for this
3665                          * hash chain. If we try and send any icmp messages
3666                          * from here we may end up via a put back into ip
3667                          * trying to get the same lock, causing a recursive
3668                          * mutex panic. Instead we build a list and send all
3669                          * the icmp messages after we have dropped the lock.
3670                          */
3671                         if (ill->ill_isv6) {
3672                                 if (hdr_length != 0) {
3673                                         mp->b_next = send_icmp_head_v6;
3674                                         send_icmp_head_v6 = mp;
3675                                 } else {
3676                                         freemsg(mp);
3677                                 }
3678                         } else {
3679                                 if (hdr_length != 0) {
3680                                         mp->b_next = send_icmp_head;
3681                                         send_icmp_head = mp;
3682                                 } else {
3683                                         freemsg(mp);
3684                                 }
3685                         }
3686                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails);
3687                         freeb(ipf->ipf_mp);
3688                 }
3689                 mutex_exit(&ipfb->ipfb_lock);
3690                 /*
3691                  * Now need to send any icmp messages that we delayed from
3692                  * above.
3693                  */
3694                 while (send_icmp_head_v6 != NULL) {
3695                         ip6_t *ip6h;
3696
3697                         mp = send_icmp_head_v6;
3698                         send_icmp_head_v6 = send_icmp_head_v6->b_next;
3699                         mp->b_next = NULL;
3700                         if (mp->b_datap->db_type == M_CTL)
3701                                 ip6h = (ip6_t *)mp->b_cont->b_rptr;
3702                         else
3703                                 ip6h = (ip6_t *)mp->b_rptr;
3704                         zoneid = ipif_lookup_addr_zoneid_v6(&ip6h->ip6_dst,
3705                             ill, ipst);
3706                         if (zoneid == ALL_ZONES) {
3707                                 freemsg(mp);
3708                         } else {
3709                                 icmp_time_exceeded_v6(ill->ill_wq, mp,
3710                                     ICMP_REASSEMBLY_TIME_EXCEEDED, B_FALSE,
3711                                     B_FALSE, zoneid, ipst);
3712                         }
3713                 }
3714                 while (send_icmp_head != NULL) {
3715                         ipaddr_t dst;
3716
3717                         mp = send_icmp_head;
3718                         send_icmp_head = send_icmp_head->b_next;
3719                         mp->b_next = NULL;
3720
3721                         if (mp->b_datap->db_type == M_CTL)
3722                                 dst = ((ipha_t *)mp->b_cont->b_rptr)->ipha_dst;
3723                         else
3724                                 dst = ((ipha_t *)mp->b_rptr)->ipha_dst;
3725
3726                         zoneid = ipif_lookup_addr_zoneid(dst, ill, ipst);
3727                         if (zoneid == ALL_ZONES) {
3728                                 freemsg(mp);
3729                         } else {
3730                                 icmp_time_exceeded(ill->ill_wq, mp,
3731                                     ICMP_REASSEMBLY_TIME_EXCEEDED, zoneid,
3732                                     ipst);
3733                         }
3734                 }
3735         }
3736         /*
3737          * A non-dying ILL will use the return value to decide whether to
3738          * restart the frag timer, and for how long.
3739          */
3740         return (next_timeout);
3741 }
3742
3743 /*
3744  * This routine is called when the approximate count of mblk memory used
3745  * for the specified ILL has exceeded max_count.
3746  */
3747 void
3748 ill_frag_prune(ill_t *ill, uint_t max_count)
3749 {
3750         ipfb_t  *ipfb;
3751         ipf_t   *ipf;
3752         size_t  count;
3753
3754         /*
3755          * If we are here within ip_min_frag_prune_time msecs remove
3756          * ill_frag_free_num_pkts oldest packets from each bucket and increment
3757          * ill_frag_free_num_pkts.
3758          */
3759         mutex_enter(&ill->ill_lock);
3760         if (TICK_TO_MSEC(lbolt - ill->ill_last_frag_clean_time) <=
3761             (ip_min_frag_prune_time != 0 ?
3762             ip_min_frag_prune_time : msec_per_tick)) {
3763
3764                 ill->ill_frag_free_num_pkts++;
3765
3766         } else {
3767                 ill->ill_frag_free_num_pkts = 0;
3768         }
3769         ill->ill_last_frag_clean_time = lbolt;
3770         mutex_exit(&ill->ill_lock);
3771
3772         /*
3773          * free ill_frag_free_num_pkts oldest packets from each bucket.
3774          */
3775         if (ill->ill_frag_free_num_pkts != 0) {
3776                 int ix;
3777
3778                 for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) {
3779                         ipfb = &ill->ill_frag_hash_tbl[ix];
3780                         mutex_enter(&ipfb->ipfb_lock);
3781                         if (ipfb->ipfb_ipf != NULL) {
3782                                 ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf,
3783                                     ill->ill_frag_free_num_pkts);
3784                         }
3785                         mutex_exit(&ipfb->ipfb_lock);
3786                 }
3787         }
3788         /*
3789          * While the reassembly list for this ILL is too big, prune a fragment
3790          * queue by age, oldest first.  Note that the per ILL count is
3791          * approximate, while the per frag hash bucket counts are accurate.
3792          */
3793         while (ill->ill_frag_count > max_count) {
3794                 int     ix;
3795                 ipfb_t  *oipfb = NULL;
3796                 uint_t  oldest = UINT_MAX;
3797
3798                 count = 0;
3799                 for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) {
3800                         ipfb = &ill->ill_frag_hash_tbl[ix];
3801                         mutex_enter(&ipfb->ipfb_lock);
3802                         ipf = ipfb->ipfb_ipf;
3803                         if (ipf != NULL && ipf->ipf_gen < oldest) {
3804                                 oldest = ipf->ipf_gen;
3805                                 oipfb = ipfb;
3806                         }
3807                         count += ipfb->ipfb_count;
3808                         mutex_exit(&ipfb->ipfb_lock);
3809                 }
3810                 /* Refresh the per ILL count */
3811                 ill->ill_frag_count = count;
3812                 if (oipfb == NULL) {
3813                         ill->ill_frag_count = 0;
3814                         break;
3815                 }
3816                 if (count <= max_count)
3817                         return; /* Somebody beat us to it, nothing to do */
3818                 mutex_enter(&oipfb->ipfb_lock);
3819                 ipf = oipfb->ipfb_ipf;
3820                 if (ipf != NULL) {
3821                         ill_frag_free_pkts(ill, oipfb, ipf, 1);
3822                 }
3823                 mutex_exit(&oipfb->ipfb_lock);
3824         }
3825 }
3826
3827 /*
3828  * free 'free_cnt' fragmented packets starting at ipf.
3829  */
3830 void
3831 ill_frag_free_pkts(ill_t *ill, ipfb_t *ipfb, ipf_t *ipf, int free_cnt)
3832 {
3833         size_t  count;
3834         mblk_t  *mp;
3835         mblk_t  *tmp;
3836         ipf_t **ipfp = ipf->ipf_ptphn;
3837
3838         ASSERT(MUTEX_HELD(&ipfb->ipfb_lock));
3839         ASSERT(ipfp != NULL);
3840         ASSERT(ipf != NULL);
3841
3842         while (ipf != NULL && free_cnt-- > 0) {
3843                 count = ipf->ipf_count;
3844                 mp = ipf->ipf_mp;
3845                 ipf = ipf->ipf_hash_next;
3846                 for (tmp = mp; tmp; tmp = tmp->b_cont) {
3847                         IP_REASS_SET_START(tmp, 0);
3848                         IP_REASS_SET_END(tmp, 0);
3849                 }
3850                 ill->ill_frag_count -= count;
3851                 ASSERT(ipfb->ipfb_count >= count);
3852                 ipfb->ipfb_count -= count;
3853                 ASSERT(ipfb->ipfb_frag_pkts > 0);
3854                 ipfb->ipfb_frag_pkts--;
3855                 freemsg(mp);
3856                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails);
3857         }
3858
3859         if (ipf)
3860                 ipf->ipf_ptphn = ipfp;
3861         ipfp[0] = ipf;
3862 }
3863
3864 #define ND_FORWARD_WARNING      "The <if>:ip*_forwarding ndd variables are " \
3865         "obsolete and may be removed in a future release of Solaris.  Use " \
3866         "ifconfig(1M) to manipulate the forwarding status of an interface."
3867
3868 /*
3869  * For obsolete per-interface forwarding configuration;
3870  * called in response to ND_GET.
3871  */
3872 /* ARGSUSED */
3873 static int
3874 nd_ill_forward_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *ioc_cr)
3875 {
3876         ill_t *ill = (ill_t *)cp;
3877
3878         cmn_err(CE_WARN, ND_FORWARD_WARNING);
3879
3880         (void) mi_mpprintf(mp, "%d", (ill->ill_flags & ILLF_ROUTER) != 0);
3881         return (0);
3882 }
3883
3884 /*
3885  * For obsolete per-interface forwarding configuration;
3886  * called in response to ND_SET.
3887  */
3888 /* ARGSUSED */
3889 static int
3890 nd_ill_forward_set(queue_t *q, mblk_t *mp, char *valuestr, caddr_t cp,
3891     cred_t *ioc_cr)
3892 {
3893         long value;
3894         int retval;
3895         ip_stack_t *ipst = CONNQ_TO_IPST(q);
3896
3897         cmn_err(CE_WARN, ND_FORWARD_WARNING);
3898
3899         if (ddi_strtol(valuestr, NULL, 10, &value) != 0 ||
3900             value < 0 || value > 1) {
3901                 return (EINVAL);
3902         }
3903
3904         rw_enter(&ipst->ips_ill_g_lock, RW_READER);
3905         retval = ill_forward_set((ill_t *)cp, (value != 0));
3906         rw_exit(&ipst->ips_ill_g_lock);
3907         return (retval);
3908 }
3909
3910 /*
3911  * Set an ill's ILLF_ROUTER flag appropriately.  If the ill is part of an
3912  * IPMP group, make sure all ill's in the group adopt the new policy.  Send
3913  * up RTS_IFINFO routing socket messages for each interface whose flags we
3914  * change.
3915  */
3916 int
3917 ill_forward_set(ill_t *ill, boolean_t enable)
3918 {
3919         ill_group_t *illgrp;
3920         ip_stack_t      *ipst = ill->ill_ipst;
3921
3922         ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock));
3923
3924         if ((enable && (ill->ill_flags & ILLF_ROUTER)) ||
3925             (!enable && !(ill->ill_flags & ILLF_ROUTER)))
3926                 return (0);
3927
3928         if (IS_LOOPBACK(ill))
3929                 return (EINVAL);
3930
3931         /*
3932          * If the ill is in an IPMP group, set the forwarding policy on all
3933          * members of the group to the same value.
3934          */
3935         illgrp = ill->ill_group;
3936         if (illgrp != NULL) {
3937                 ill_t *tmp_ill;
3938
3939                 for (tmp_ill = illgrp->illgrp_ill; tmp_ill != NULL;
3940                     tmp_ill = tmp_ill->ill_group_next) {
3941                         ip1dbg(("ill_forward_set: %s %s forwarding on %s",
3942                             (enable ? "Enabling" : "Disabling"),
3943                             (tmp_ill->ill_isv6 ? "IPv6" : "IPv4"),
3944                             tmp_ill->ill_name));
3945                         mutex_enter(&tmp_ill->ill_lock);
3946                         if (enable)
3947                                 tmp_ill->ill_flags |= ILLF_ROUTER;
3948                         else
3949                                 tmp_ill->ill_flags &= ~ILLF_ROUTER;
3950                         mutex_exit(&tmp_ill->ill_lock);
3951                         if (tmp_ill->ill_isv6)
3952                                 ill_set_nce_router_flags(tmp_ill, enable);
3953                         /* Notify routing socket listeners of this change. */
3954                         ip_rts_ifmsg(tmp_ill->ill_ipif);
3955                 }
3956         } else {
3957                 ip1dbg(("ill_forward_set: %s %s forwarding on %s",
3958                     (enable ? "Enabling" : "Disabling"),
3959                     (ill->ill_isv6 ? "IPv6" : "IPv4"), ill->ill_name));
3960                 mutex_enter(&ill->ill_lock);
3961                 if (enable)
3962                         ill->ill_flags |= ILLF_ROUTER;
3963                 else
3964                         ill->ill_flags &= ~ILLF_ROUTER;
3965                 mutex_exit(&ill->ill_lock);
3966                 if (ill->ill_isv6)
3967                         ill_set_nce_router_flags(ill, enable);
3968                 /* Notify routing socket listeners of this change. */
3969                 ip_rts_ifmsg(ill->ill_ipif);
3970         }
3971
3972         return (0);
3973 }
3974
3975 /*
3976  * Based on the ILLF_ROUTER flag of an ill, make sure all local nce's for
3977  * addresses assigned to the ill have the NCE_F_ISROUTER flag appropriately
3978  * set or clear.
3979  */
3980 static void
3981 ill_set_nce_router_flags(ill_t *ill, boolean_t enable)
3982 {
3983         ipif_t *ipif;
3984         nce_t *nce;
3985
3986         for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
3987                 nce = ndp_lookup_v6(ill, &ipif->ipif_v6lcl_addr, B_FALSE);
3988                 if (nce != NULL) {
3989                         mutex_enter(&nce->nce_lock);
3990                         if (enable)
3991                                 nce->nce_flags |= NCE_F_ISROUTER;
3992                         else
3993                                 nce->nce_flags &= ~NCE_F_ISROUTER;
3994                         mutex_exit(&nce->nce_lock);
3995                         NCE_REFRELE(nce);
3996                 }
3997         }
3998 }
3999
4000 /*
4001  * Given an ill with a _valid_ name, add the ip_forwarding ndd variable
4002  * for this ill.  Make sure the v6/v4 question has been answered about this
4003  * ill.  The creation of this ndd variable is only for backwards compatibility.
4004  * The preferred way to control per-interface IP forwarding is through the
4005  * ILLF_ROUTER interface flag.
4006  */
4007 static int
4008 ill_set_ndd_name(ill_t *ill)
4009 {
4010         char *suffix;
4011         ip_stack_t      *ipst = ill->ill_ipst;
4012
4013         ASSERT(IAM_WRITER_ILL(ill));
4014
4015         if (ill->ill_isv6)
4016                 suffix = ipv6_forward_suffix;
4017         else
4018                 suffix = ipv4_forward_suffix;
4019
4020         ill->ill_ndd_name = ill->ill_name + ill->ill_name_length;
4021         bcopy(ill->ill_name, ill->ill_ndd_name, ill->ill_name_length - 1);
4022         /*
4023          * Copies over the '\0'.
4024          * Note that strlen(suffix) is always bounded.
4025          */
4026         bcopy(suffix, ill->ill_ndd_name + ill->ill_name_length - 1,
4027             strlen(suffix) + 1);
4028
4029         /*
4030          * Use of the nd table requires holding the reader lock.
4031          * Modifying the nd table thru nd_load/nd_unload requires
4032          * the writer lock.
4033          */
4034         rw_enter(&ipst->ips_ip_g_nd_lock, RW_WRITER);
4035         if (!nd_load(&ipst->ips_ip_g_nd, ill->ill_ndd_name, nd_ill_forward_get,
4036             nd_ill_forward_set, (caddr_t)ill)) {
4037                 /*
4038                  * If the nd_load failed, it only meant that it could not
4039                  * allocate a new bunch of room for further NDD expansion.
4040                  * Because of that, the ill_ndd_name will be set to 0, and
4041                  * this interface is at the mercy of the global ip_forwarding
4042                  * variable.
4043                  */
4044                 rw_exit(&ipst->ips_ip_g_nd_lock);
4045                 ill->ill_ndd_name = NULL;
4046                 return (ENOMEM);
4047         }
4048         rw_exit(&ipst->ips_ip_g_nd_lock);
4049         return (0);
4050 }
4051
4052 /*
4053  * Intializes the context structure and returns the first ill in the list
4054  * cuurently start_list and end_list can have values:
4055  * MAX_G_HEADS          Traverse both IPV4 and IPV6 lists.
4056  * IP_V4_G_HEAD         Traverse IPV4 list only.
4057  * IP_V6_G_HEAD         Traverse IPV6 list only.
4058  */
4059
4060 /*
4061  * We don't check for CONDEMNED ills here. Caller must do that if
4062  * necessary under the ill lock.
4063  */
4064 ill_t *
4065 ill_first(int start_list, int end_list, ill_walk_context_t *ctx,
4066     ip_stack_t *ipst)
4067 {
4068         ill_if_t *ifp;
4069         ill_t *ill;
4070         avl_tree_t *avl_tree;
4071
4072         ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));
4073         ASSERT(end_list <= MAX_G_HEADS && start_list >= 0);
4074
4075         /*
4076          * setup the lists to search
4077          */
4078         if (end_list != MAX_G_HEADS) {
4079                 ctx->ctx_current_list = start_list;
4080                 ctx->ctx_last_list = end_list;
4081         } else {
4082                 ctx->ctx_last_list = MAX_G_HEADS - 1;
4083                 ctx->ctx_current_list = 0;
4084         }
4085
4086         while (ctx->ctx_current_list <= ctx->ctx_last_list) {
4087                 ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst);
4088                 if (ifp != (ill_if_t *)
4089                     &IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)) {
4090                         avl_tree = &ifp->illif_avl_by_ppa;
4091                         ill = avl_first(avl_tree);
4092                         /*
4093                          * ill is guaranteed to be non NULL or ifp should have
4094                          * not existed.
4095                          */
4096                         ASSERT(ill != NULL);
4097                         return (ill);
4098                 }
4099                 ctx->ctx_current_list++;
4100         }
4101
4102         return (NULL);
4103 }
4104
4105 /*
4106  * returns the next ill in the list. ill_first() must have been called
4107  * before calling ill_next() or bad things will happen.
4108  */
4109
4110 /*
4111  * We don't check for CONDEMNED ills here. Caller must do that if
4112  * necessary under the ill lock.
4113  */
4114 ill_t *
4115 ill_next(ill_walk_context_t *ctx, ill_t *lastill)
4116 {
4117         ill_if_t *ifp;
4118         ill_t *ill;
4119         ip_stack_t      *ipst = lastill->ill_ipst;
4120
4121         ASSERT(lastill->ill_ifptr != (ill_if_t *)
4122             &IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst));
4123         if ((ill = avl_walk(&lastill->ill_ifptr->illif_avl_by_ppa, lastill,
4124             AVL_AFTER)) != NULL) {
4125                 return (ill);
4126         }
4127
4128         /* goto next ill_ifp in the list. */
4129         ifp = lastill->ill_ifptr->illif_next;
4130
4131         /* make sure not at end of circular list */
4132         while (ifp ==
4133             (ill_if_t *)&IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)) {
4134                 if (++ctx->ctx_current_list > ctx->ctx_last_list)
4135                         return (NULL);
4136                 ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst);
4137         }
4138
4139         return (avl_first(&ifp->illif_avl_by_ppa));
4140 }
4141
4142 /*
4143  * Check interface name for correct format which is name+ppa.
4144  * name can contain characters and digits, the right most digits
4145  * make up the ppa number. use of octal is not allowed, name must contain
4146  * a ppa, return pointer to the start of ppa.
4147  * In case of error return NULL.
4148  */
4149 static char *
4150 ill_get_ppa_ptr(char *name)
4151 {
4152         int namelen = mi_strlen(name);
4153
4154         int len = namelen;
4155
4156         name += len;
4157         while (len > 0) {
4158                 name--;
4159                 if (*name < '0' || *name > '9')
4160                         break;
4161                 len--;
4162         }
4163
4164         /* empty string, all digits, or no trailing digits */
4165         if (len == 0 || len == (int)namelen)
4166                 return (NULL);
4167
4168         name++;
4169         /* check for attempted use of octal */
4170         if (*name == '0' && len != (int)namelen - 1)
4171                 return (NULL);
4172         return (name);
4173 }
4174
4175 /*
4176  * use avl tree to locate the ill.
4177  */
4178 static ill_t *
4179 ill_find_by_name(char *name, boolean_t isv6, queue_t *q, mblk_t *mp,
4180     ipsq_func_t func, int *error, ip_stack_t *ipst)
4181 {
4182         char *ppa_ptr = NULL;
4183         int len;
4184         uint_t ppa;
4185         ill_t *ill = NULL;
4186         ill_if_t *ifp;
4187         int list;
4188         ipsq_t *ipsq;
4189
4190         if (error != NULL)
4191                 *error = 0;
4192
4193         /*
4194          * get ppa ptr
4195          */
4196         if (isv6)
4197                 list = IP_V6_G_HEAD;
4198         else
4199                 list = IP_V4_G_HEAD;
4200
4201         if ((ppa_ptr = ill_get_ppa_ptr(name)) == NULL) {
4202                 if (error != NULL)
4203                         *error = ENXIO;
4204                 return (NULL);
4205         }
4206
4207         len = ppa_ptr - name + 1;
4208
4209         ppa = stoi(&ppa_ptr);
4210
4211         ifp = IP_VX_ILL_G_LIST(list, ipst);
4212
4213         while (ifp != (ill_if_t *)&IP_VX_ILL_G_LIST(list, ipst)) {
4214                 /*
4215                  * match is done on len - 1 as the name is not null
4216                  * terminated it contains ppa in addition to the interface
4217                  * name.
4218                  */
4219                 if ((ifp->illif_name_len == len) &&
4220                     bcmp(ifp->illif_name, name, len - 1) == 0) {
4221                         break;
4222                 } else {
4223                         ifp = ifp->illif_next;
4224                 }
4225         }
4226
4227
4228         if (ifp == (ill_if_t *)&IP_VX_ILL_G_LIST(list, ipst)) {
4229                 /*
4230                  * Even the interface type does not exist.
4231                  */
4232                 if (error != NULL)
4233                         *error = ENXIO;
4234                 return (NULL);
4235         }
4236
4237         ill = avl_find(&ifp->illif_avl_by_ppa, (void *) &ppa, NULL);
4238         if (ill != NULL) {
4239                 /*
4240                  * The block comment at the start of ipif_down
4241                  * explains the use of the macros used below
4242                  */
4243                 GRAB_CONN_LOCK(q);
4244                 mutex_enter(&ill->ill_lock);
4245                 if (ILL_CAN_LOOKUP(ill)) {
4246                         ill_refhold_locked(ill);
4247                         mutex_exit(&ill->ill_lock);
4248                         RELEASE_CONN_LOCK(q);
4249                         return (ill);
4250                 } else if (ILL_CAN_WAIT(ill, q)) {
4251                         ipsq = ill->ill_phyint->phyint_ipsq;
4252                         mutex_enter(&ipsq->ipsq_lock);
4253                         mutex_exit(&ill->ill_lock);
4254                         ipsq_enq(ipsq, q, mp, func, NEW_OP, ill);
4255                         mutex_exit(&ipsq->ipsq_lock);
4256                         RELEASE_CONN_LOCK(q);
4257                         if (error != NULL)
4258                                 *error = EINPROGRESS;
4259                         return (NULL);
4260                 }
4261                 mutex_exit(&ill->ill_lock);
4262                 RELEASE_CONN_LOCK(q);
4263         }
4264         if (error != NULL)
4265                 *error = ENXIO;
4266         return (NULL);
4267 }
4268
4269 /*
4270  * comparison function for use with avl.
4271  */
4272 static int
4273 ill_compare_ppa(const void *ppa_ptr, const void *ill_ptr)
4274 {
4275         uint_t ppa;
4276         uint_t ill_ppa;
4277
4278         ASSERT(ppa_ptr != NULL && ill_ptr != NULL);
4279
4280         ppa = *((uint_t *)ppa_ptr);
4281         ill_ppa = ((const ill_t *)ill_ptr)->ill_ppa;
4282         /*
4283          * We want the ill with the lowest ppa to be on the
4284          * top.
4285          */
4286         if (ill_ppa < ppa)
4287                 return (1);
4288         if (ill_ppa > ppa)
4289                 return (-1);
4290         return (0);
4291 }
4292
4293 /*
4294  * remove an interface type from the global list.
4295  */
4296 static void
4297 ill_delete_interface_type(ill_if_t *interface)
4298 {
4299         ASSERT(interface != NULL);
4300         ASSERT(avl_numnodes(&interface->illif_avl_by_ppa) == 0);
4301
4302         avl_destroy(&interface->illif_avl_by_ppa);
4303         if (interface->illif_ppa_arena != NULL)
4304                 vmem_destroy(interface->illif_ppa_arena);
4305
4306         remque(interface);
4307
4308         mi_free(interface);
4309 }
4310
4311 /* Defined in ip_netinfo.c */
4312 extern ddi_taskq_t      *eventq_queue_nic;
4313
4314 /*
4315  * remove ill from the global list.
4316  */
4317 static void
4318 ill_glist_delete(ill_t *ill)
4319 {
4320         char *nicname;
4321         size_t nicnamelen;
4322         hook_nic_event_t *info;
4323         ip_stack_t      *ipst;
4324
4325         if (ill == NULL)
4326                 return;
4327         ipst = ill->ill_ipst;
4328         rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
4329
4330         if (ill->ill_name != NULL) {
4331                 nicname = kmem_alloc(ill->ill_name_length, KM_NOSLEEP);
4332                 if (nicname != NULL) {
4333                         bcopy(ill->ill_name, nicname, ill->ill_name_length);
4334                         nicnamelen = ill->ill_name_length;
4335                 }
4336         } else {
4337                 nicname = NULL;
4338                 nicnamelen = 0;
4339         }
4340
4341         /*
4342          * If the ill was never inserted into the AVL tree
4343          * we skip the if branch.
4344          */
4345         if (ill->ill_ifptr != NULL) {
4346                 /*
4347                  * remove from AVL tree and free ppa number
4348                  */
4349                 avl_remove(&ill->ill_ifptr->illif_avl_by_ppa, ill);
4350
4351                 if (ill->ill_ifptr->illif_ppa_arena != NULL) {
4352                         vmem_free(ill->ill_ifptr->illif_ppa_arena,
4353                             (void *)(uintptr_t)(ill->ill_ppa+1), 1);
4354                 }
4355                 if (avl_numnodes(&ill->ill_ifptr->illif_avl_by_ppa) == 0) {
4356                         ill_delete_interface_type(ill->ill_ifptr);
4357                 }
4358
4359                 /*
4360                  * Indicate ill is no longer in the list.
4361                  */
4362                 ill->ill_ifptr = NULL;
4363                 ill->ill_name_length = 0;
4364                 ill->ill_name[0] = '\0';
4365                 ill->ill_ppa = UINT_MAX;
4366         }
4367
4368         /*
4369          * Run the unplumb hook after the NIC has disappeared from being
4370          * visible so that attempts to revalidate its existance will fail.
4371          *
4372          * This needs to be run inside the ill_g_lock perimeter to ensure
4373          * that the ordering of delivered events to listeners matches the
4374          * order of them in the kernel.
4375          */
4376         if ((info = ill->ill_nic_event_info) != NULL) {
4377                 if (info->hne_event != NE_DOWN) {
4378                         ip2dbg(("ill_glist_delete: unexpected nic event %d "
4379                             "attached for %s\n", info->hne_event,
4380                             ill->ill_name));
4381                         if (info->hne_data != NULL)
4382                                 kmem_free(info->hne_data, info->hne_datalen);
4383                         kmem_free(info, sizeof (hook_nic_event_t));
4384                 } else {
4385                         if (ddi_taskq_dispatch(eventq_queue_nic,
4386                             ip_ne_queue_func, (void *)info, DDI_SLEEP)
4387                             == DDI_FAILURE) {
4388                                 ip2dbg(("ill_glist_delete: ddi_taskq_dispatch "
4389                                     "failed\n"));
4390                                 if (info->hne_data != NULL)
4391                                         kmem_free(info->hne_data,
4392                                             info->hne_datalen);
4393                                 kmem_free(info, sizeof (hook_nic_event_t));
4394                         }
4395                 }
4396         }
4397
4398         /* Generate NE_UNPLUMB event for ill_name. */
4399         info = kmem_alloc(sizeof (hook_nic_event_t), KM_NOSLEEP);
4400         if (info != NULL) {
4401                 info->hne_nic = ill->ill_phyint->phyint_ifindex;
4402                 info->hne_lif = 0;
4403                 info->hne_event = NE_UNPLUMB;
4404                 info->hne_data = nicname;
4405                 info->hne_datalen = nicnamelen;
4406                 info->hne_family = ill->ill_isv6 ?
4407                     ipst->ips_ipv6_net_data : ipst->ips_ipv4_net_data;
4408         } else {
4409                 ip2dbg(("ill_glist_delete: could not attach UNPLUMB nic event "
4410                     "information for %s (ENOMEM)\n", ill->ill_name));
4411                 if (nicname != NULL)
4412                         kmem_free(nicname, nicnamelen);
4413         }
4414
4415         ill->ill_nic_event_info = info;
4416
4417         ill_phyint_free(ill);
4418         rw_exit(&ipst->ips_ill_g_lock);
4419 }
4420
4421 /*
4422  * allocate a ppa, if the number of plumbed interfaces of this type are
4423  * less than ill_no_arena do a linear search to find a unused ppa.
4424  * When the number goes beyond ill_no_arena switch to using an arena.
4425  * Note: ppa value of zero cannot be allocated from vmem_arena as it
4426  * is the return value for an error condition, so allocation starts at one
4427  * and is decremented by one.
4428  */
4429 static int
4430 ill_alloc_ppa(ill_if_t *ifp, ill_t *ill)
4431 {
4432         ill_t *tmp_ill;
4433         uint_t start, end;
4434         int ppa;
4435
4436         if (ifp->illif_ppa_arena == NULL &&
4437             (avl_numnodes(&ifp->illif_avl_by_ppa) + 1 > ill_no_arena)) {
4438                 /*
4439                  * Create an arena.
4440                  */
4441                 ifp->illif_ppa_arena = vmem_create(ifp->illif_name,
4442                     (void *)1, UINT_MAX - 1, 1, NULL, NULL,
4443                     NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
4444                         /* allocate what has already been assigned */
4445                 for (tmp_ill = avl_first(&ifp->illif_avl_by_ppa);
4446                     tmp_ill != NULL; tmp_ill = avl_walk(&ifp->illif_avl_by_ppa,
4447                     tmp_ill, AVL_AFTER)) {
4448                         ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena,
4449                             1,          /* size */
4450                             1,          /* align/quantum */
4451                             0,          /* phase */
4452                             0,          /* nocross */
4453                             /* minaddr */
4454                             (void *)((uintptr_t)tmp_ill->ill_ppa + 1),
4455                             /* maxaddr */
4456                             (void *)((uintptr_t)tmp_ill->ill_ppa + 2),
4457                             VM_NOSLEEP|VM_FIRSTFIT);
4458                         if (ppa == 0) {
4459                                 ip1dbg(("ill_alloc_ppa: ppa allocation"
4460                                     " failed while switching"));
4461                                 vmem_destroy(ifp->illif_ppa_arena);
4462                                 ifp->illif_ppa_arena = NULL;
4463                                 break;
4464                         }
4465                 }
4466         }
4467
4468         if (ifp->illif_ppa_arena != NULL) {
4469                 if (ill->ill_ppa == UINT_MAX) {
4470                         ppa = (int)(uintptr_t)vmem_alloc(ifp->illif_ppa_arena,
4471                             1, VM_NOSLEEP|VM_FIRSTFIT);
4472                         if (ppa == 0)
4473                                 return (EAGAIN);
4474                         ill->ill_ppa = --ppa;
4475                 } else {
4476                         ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena,
4477                             1,          /* size */
4478                             1,          /* align/quantum */
4479                             0,          /* phase */
4480                             0,          /* nocross */
4481                             (void *)(uintptr_t)(ill->ill_ppa + 1), /* minaddr */
4482                             (void *)(uintptr_t)(ill->ill_ppa + 2), /* maxaddr */
4483                             VM_NOSLEEP|VM_FIRSTFIT);
4484                         /*
4485                          * Most likely the allocation failed because
4486                          * the requested ppa was in use.
4487                          */
4488                         if (ppa == 0)
4489                                 return (EEXIST);
4490                 }
4491                 return (0);
4492         }
4493
4494         /*
4495          * No arena is in use and not enough (>ill_no_arena) interfaces have
4496          * been plumbed to create one. Do a linear search to get a unused ppa.
4497          */
4498         if (ill->ill_ppa == UINT_MAX) {
4499                 end = UINT_MAX - 1;
4500                 start = 0;
4501         } else {
4502                 end = start = ill->ill_ppa;
4503         }
4504
4505         tmp_ill = avl_find(&ifp->illif_avl_by_ppa, (void *)&start, NULL);
4506         while (tmp_ill != NULL && tmp_ill->ill_ppa == start) {
4507                 if (start++ >= end) {
4508                         if (ill->ill_ppa == UINT_MAX)
4509                                 return (EAGAIN);
4510                         else
4511                                 return (EEXIST);
4512                 }
4513                 tmp_ill = avl_walk(&ifp->illif_avl_by_ppa, tmp_ill, AVL_AFTER);
4514         }
4515         ill->ill_ppa = start;
4516         return (0);
4517 }
4518
4519 /*
4520  * Insert ill into the list of configured ill's. Once this function completes,
4521  * the ill is globally visible and is available through lookups. More precisely
4522  * this happens after the caller drops the ill_g_lock.
4523  */
4524 static int
4525 ill_glist_insert(ill_t *ill, char *name, boolean_t isv6)
4526 {
4527         ill_if_t *ill_interface;
4528         avl_index_t where = 0;
4529         int error;
4530         int name_length;
4531         int index;
4532         boolean_t check_length = B_FALSE;
4533         ip_stack_t      *ipst = ill->ill_ipst;
4534
4535         ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
4536
4537         name_length = mi_strlen(name) + 1;
4538
4539         if (isv6)
4540                 index = IP_V6_G_HEAD;
4541         else
4542                 index = IP_V4_G_HEAD;
4543
4544         ill_interface = IP_VX_ILL_G_LIST(index, ipst);
4545         /*
4546          * Search for interface type based on name
4547          */
4548         while (ill_interface != (ill_if_t *)&IP_VX_ILL_G_LIST(index, ipst)) {
4549                 if ((ill_interface->illif_name_len == name_length) &&
4550                     (strcmp(ill_interface->illif_name, name) == 0)) {
4551                         break;
4552                 }
4553                 ill_interface = ill_interface->illif_next;
4554         }
4555
4556         /*
4557          * Interface type not found, create one.
4558          */
4559         if (ill_interface == (ill_if_t *)&IP_VX_ILL_G_LIST(index, ipst)) {
4560
4561                 ill_g_head_t ghead;
4562
4563                 /*
4564                  * allocate ill_if_t structure
4565                  */
4566
4567                 ill_interface = (ill_if_t *)mi_zalloc(sizeof (ill_if_t));
4568                 if (ill_interface == NULL) {
4569                         return (ENOMEM);
4570                 }
4571
4572
4573
4574                 (void) strcpy(ill_interface->illif_name, name);
4575                 ill_interface->illif_name_len = name_length;
4576
4577                 avl_create(&ill_interface->illif_avl_by_ppa,
4578                     ill_compare_ppa, sizeof (ill_t),
4579                     offsetof(struct ill_s, ill_avl_byppa));
4580
4581                 /*
4582                  * link the structure in the back to maintain order
4583                  * of configuration for ifconfig output.
4584                  */
4585                 ghead = ipst->ips_ill_g_heads[index];
4586                 insque(ill_interface, ghead.ill_g_list_tail);
4587
4588         }
4589
4590         if (ill->ill_ppa == UINT_MAX)
4591                 check_length = B_TRUE;
4592
4593         error = ill_alloc_ppa(ill_interface, ill);
4594         if (error != 0) {
4595                 if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 0)
4596                         ill_delete_interface_type(ill->ill_ifptr);
4597                 return (error);
4598         }
4599
4600         /*
4601          * When the ppa is choosen by the system, check that there is
4602          * enough space to insert ppa. if a specific ppa was passed in this
4603          * check is not required as the interface name passed in will have
4604          * the right ppa in it.
4605          */
4606         if (check_length) {
4607                 /*
4608                  * UINT_MAX - 1 should fit in 10 chars, alloc 12 chars.
4609                  */
4610                 char buf[sizeof (uint_t) * 3];
4611
4612                 /*
4613                  * convert ppa to string to calculate the amount of space
4614                  * required for it in the name.
4615                  */
4616                 numtos(ill->ill_ppa, buf);
4617
4618                 /* Do we have enough space to insert ppa ? */
4619
4620                 if ((mi_strlen(name) + mi_strlen(buf) + 1) > LIFNAMSIZ) {
4621                         /* Free ppa and interface type struct */
4622                         if (ill_interface->illif_ppa_arena != NULL) {
4623                                 vmem_free(ill_interface->illif_ppa_arena,
4624                                     (void *)(uintptr_t)(ill->ill_ppa+1), 1);
4625                         }
4626                         if (avl_numnodes(&ill_interface->illif_avl_by_ppa) ==
4627                             0) {
4628                                 ill_delete_interface_type(ill->ill_ifptr);
4629                         }
4630
4631                         return (EINVAL);
4632                 }
4633         }
4634
4635         (void) sprintf(ill->ill_name, "%s%u", name, ill->ill_ppa);
4636         ill->ill_name_length = mi_strlen(ill->ill_name) + 1;
4637
4638         (void) avl_find(&ill_interface->illif_avl_by_ppa, &ill->ill_ppa,
4639             &where);
4640         ill->ill_ifptr = ill_interface;
4641         avl_insert(&ill_interface->illif_avl_by_ppa, ill, where);
4642
4643         ill_phyint_reinit(ill);
4644         return (0);
4645 }
4646
4647 /* Initialize the per phyint (per IPMP group) ipsq used for serialization */
4648 static boolean_t
4649 ipsq_init(ill_t *ill)
4650 {
4651         ipsq_t  *ipsq;
4652
4653         /* Init the ipsq and impicitly enter as writer */
4654         ill->ill_phyint->phyint_ipsq =
4655             kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP);
4656         if (ill->ill_phyint->phyint_ipsq == NULL)
4657                 return (B_FALSE);
4658         ipsq = ill->ill_phyint->phyint_ipsq;
4659         ipsq->ipsq_phyint_list = ill->ill_phyint;
4660         ill->ill_phyint->phyint_ipsq_next = NULL;
4661         mutex_init(&ipsq->ipsq_lock, NULL, MUTEX_DEFAULT, 0);
4662         ipsq->ipsq_refs = 1;
4663         ipsq->ipsq_writer = curthread;
4664         ipsq->ipsq_reentry_cnt = 1;
4665         ipsq->ipsq_ipst = ill->ill_ipst;        /* No netstack_hold */
4666 #ifdef DEBUG
4667         ipsq->ipsq_depth = getpcstack((pc_t *)ipsq->ipsq_stack,
4668             IPSQ_STACK_DEPTH);
4669 #endif
4670         (void) strcpy(ipsq->ipsq_name, ill->ill_name);
4671         return (B_TRUE);
4672 }
4673
4674 /*
4675  * ill_init is called by ip_open when a device control stream is opened.
4676  * It does a few initializations, and shoots a DL_INFO_REQ message down
4677  * to the driver.  The response is later picked up in ip_rput_dlpi and
4678  * used to set up default mechanisms for talking to the driver.  (Always
4679  * called as writer.)
4680  *
4681  * If this function returns error, ip_open will call ip_close which in
4682  * turn will call ill_delete to clean up any memory allocated here that
4683  * is not yet freed.
4684  */
4685 int
4686 ill_init(queue_t *q, ill_t *ill)
4687 {
4688         int     count;
4689         dl_info_req_t   *dlir;
4690         mblk_t  *info_mp;
4691         uchar_t *frag_ptr;
4692
4693         /*
4694          * The ill is initialized to zero by mi_alloc*(). In addition
4695          * some fields already contain valid values, initialized in
4696          * ip_open(), before we reach here.
4697          */
4698         mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, 0);
4699
4700         ill->ill_rq = q;
4701         ill->ill_wq = WR(q);
4702
4703         info_mp = allocb(MAX(sizeof (dl_info_req_t), sizeof (dl_info_ack_t)),
4704             BPRI_HI);
4705         if (info_mp == NULL)
4706                 return (ENOMEM);
4707
4708         /*
4709          * Allocate sufficient space to contain our fragment hash table and
4710          * the device name.
4711          */
4712         frag_ptr = (uchar_t *)mi_zalloc(ILL_FRAG_HASH_TBL_SIZE +
4713             2 * LIFNAMSIZ + 5 + strlen(ipv6_forward_suffix));
4714         if (frag_ptr == NULL) {
4715                 freemsg(info_mp);
4716                 return (ENOMEM);
4717         }
4718         ill->ill_frag_ptr = frag_ptr;
4719         ill->ill_frag_free_num_pkts = 0;
4720         ill->ill_last_frag_clean_time = 0;
4721         ill->ill_frag_hash_tbl = (ipfb_t *)frag_ptr;
4722         ill->ill_name = (char *)(frag_ptr + ILL_FRAG_HASH_TBL_SIZE);
4723         for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) {
4724                 mutex_init(&ill->ill_frag_hash_tbl[count].ipfb_lock,
4725                     NULL, MUTEX_DEFAULT, NULL);
4726         }
4727
4728         ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t));
4729         if (ill->ill_phyint == NULL) {
4730                 freemsg(info_mp);
4731                 mi_free(frag_ptr);
4732                 return (ENOMEM);
4733         }
4734
4735         mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0);
4736         /*
4737          * For now pretend this is a v4 ill. We need to set phyint_ill*
4738          * at this point because of the following reason. If we can't
4739          * enter the ipsq at some point and cv_wait, the writer that
4740          * wakes us up tries to locate us using the list of all phyints
4741          * in an ipsq and the ills from the phyint thru the phyint_ill*.
4742          * If we don't set it now, we risk a missed wakeup.
4743          */
4744         ill->ill_phyint->phyint_illv4 = ill;
4745         ill->ill_ppa = UINT_MAX;
4746         ill->ill_fastpath_list = &ill->ill_fastpath_list;
4747
4748         if (!ipsq_init(ill)) {
4749                 freemsg(info_mp);
4750                 mi_free(frag_ptr);
4751                 mi_free(ill->ill_phyint);
4752                 return (ENOMEM);
4753         }
4754
4755         ill->ill_state_flags |= ILL_LL_SUBNET_PENDING;
4756
4757
4758         /* Frag queue limit stuff */
4759         ill->ill_frag_count = 0;
4760         ill->ill_ipf_gen = 0;
4761
4762         ill->ill_global_timer = INFINITY;
4763         ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0;
4764         ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0;
4765         ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS;
4766         ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL;
4767
4768         /*
4769          * Initialize IPv6 configuration variables.  The IP module is always
4770          * opened as an IPv4 module.  Instead tracking down the cases where
4771          * it switches to do ipv6, we'll just initialize the IPv6 configuration
4772          * here for convenience, this has no effect until the ill is set to do
4773          * IPv6.
4774          */
4775         ill->ill_reachable_time = ND_REACHABLE_TIME;
4776         ill->ill_reachable_retrans_time = ND_RETRANS_TIMER;
4777         ill->ill_xmit_count = ND_MAX_MULTICAST_SOLICIT;
4778         ill->ill_max_buf = ND_MAX_Q;
4779         ill->ill_refcnt = 0;
4780
4781         /* Send down the Info Request to the driver. */
4782         info_mp->b_datap->db_type = M_PCPROTO;
4783         dlir = (dl_info_req_t *)info_mp->b_rptr;
4784         info_mp->b_wptr = (uchar_t *)&dlir[1];
4785         dlir->dl_primitive = DL_INFO_REQ;
4786
4787         ill->ill_dlpi_pending = DL_PRIM_INVAL;
4788
4789         qprocson(q);
4790         ill_dlpi_send(ill, info_mp);
4791
4792         return (0);
4793 }
4794
4795 /*
4796  * ill_dls_info
4797  * creates datalink socket info from the device.
4798  */
4799 int
4800 ill_dls_info(struct sockaddr_dl *sdl, const ipif_t *ipif)
4801 {
4802         size_t  len;
4803         ill_t   *ill = ipif->ipif_ill;
4804
4805         sdl->sdl_family = AF_LINK;
4806         sdl->sdl_index = ill->ill_phyint->phyint_ifindex;
4807         sdl->sdl_type = ill->ill_type;
4808         ipif_get_name(ipif, sdl->sdl_data, sizeof (sdl->sdl_data));
4809         len = strlen(sdl->sdl_data);
4810         ASSERT(len < 256);
4811         sdl->sdl_nlen = (uchar_t)len;
4812         sdl->sdl_alen = ill->ill_phys_addr_length;
4813         sdl->sdl_slen = 0;
4814         if (ill->ill_phys_addr_length != 0 && ill->ill_phys_addr != NULL)
4815                 bcopy(ill->ill_phys_addr, &sdl->sdl_data[len], sdl->sdl_alen);
4816
4817         return (sizeof (struct sockaddr_dl));
4818 }
4819
4820 /*
4821  * ill_xarp_info
4822  * creates xarp info from the device.
4823  */
4824 static int
4825 ill_xarp_info(struct sockaddr_dl *sdl, ill_t *ill)
4826 {
4827         sdl->sdl_family = AF_LINK;
4828         sdl->sdl_index = ill->ill_phyint->phyint_ifindex;
4829         sdl->sdl_type = ill->ill_type;
4830         ipif_get_name(ill->ill_ipif, sdl->sdl_data, sizeof (sdl->sdl_data));
4831         sdl->sdl_nlen = (uchar_t)mi_strlen(sdl->sdl_data);
4832         sdl->sdl_alen = ill->ill_phys_addr_length;
4833         sdl->sdl_slen = 0;
4834         return (sdl->sdl_nlen);
4835 }
4836
4837 static int
4838 loopback_kstat_update(kstat_t *ksp, int rw)
4839 {
4840         kstat_named_t *kn;
4841         netstackid_t    stackid;
4842         netstack_t      *ns;
4843         ip_stack_t      *ipst;
4844
4845         if (ksp == NULL || ksp->ks_data == NULL)
4846                 return (EIO);
4847
4848         if (rw == KSTAT_WRITE)
4849                 return (EACCES);
4850
4851         kn = KSTAT_NAMED_PTR(ksp);
4852         stackid = (zoneid_t)(uintptr_t)ksp->ks_private;
4853
4854         ns = netstack_find_by_stackid(stackid);
4855         if (ns == NULL)
4856                 return (-1);
4857
4858         ipst = ns->netstack_ip;
4859         if (ipst == NULL) {
4860                 netstack_rele(ns);
4861                 return (-1);
4862         }
4863         kn[0].value.ui32 = ipst->ips_loopback_packets;
4864         kn[1].value.ui32 = ipst->ips_loopback_packets;
4865         netstack_rele(ns);
4866         return (0);
4867 }
4868
4869
4870 /*
4871  * Has ifindex been plumbed already.
4872  * Compares both phyint_ifindex and phyint_group_ifindex.
4873  */
4874 static boolean_t
4875 phyint_exists(uint_t index, ip_stack_t *ipst)
4876 {
4877         phyint_t *phyi;
4878
4879         ASSERT(index != 0);
4880         ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));
4881         /*
4882          * Indexes are stored in the phyint - a common structure
4883          * to both IPv4 and IPv6.
4884          */
4885         phyi = avl_first(&ipst->ips_phyint_g_list->phyint_list_avl_by_index);
4886         for (; phyi != NULL;
4887             phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
4888             phyi, AVL_AFTER)) {
4889                 if (phyi->phyint_ifindex == index ||
4890                     phyi->phyint_group_ifindex == index)
4891                         return (B_TRUE);
4892         }
4893         return (B_FALSE);
4894 }
4895
4896 /* Pick a unique ifindex */
4897 boolean_t
4898 ip_assign_ifindex(uint_t *indexp, ip_stack_t *ipst)
4899 {
4900         uint_t starting_index;
4901
4902         if (!ipst->ips_ill_index_wrap) {
4903                 *indexp = ipst->ips_ill_index++;
4904                 if (ipst->ips_ill_index == 0) {
4905                         /* Reached the uint_t limit Next time wrap  */
4906                         ipst->ips_ill_index_wrap = B_TRUE;
4907                 }
4908                 return (B_TRUE);
4909         }
4910
4911         /*
4912          * Start reusing unused indexes. Note that we hold the ill_g_lock
4913          * at this point and don't want to call any function that attempts
4914          * to get the lock again.
4915          */
4916         starting_index = ipst->ips_ill_index++;
4917         for (; ipst->ips_ill_index != starting_index; ipst->ips_ill_index++) {
4918                 if (ipst->ips_ill_index != 0 &&
4919                     !phyint_exists(ipst->ips_ill_index, ipst)) {
4920                         /* found unused index - use it */
4921                         *indexp = ipst->ips_ill_index;
4922                         return (B_TRUE);
4923                 }
4924         }
4925
4926         /*
4927          * all interface indicies are inuse.
4928          */
4929         return (B_FALSE);
4930 }
4931
4932 /*
4933  * Assign a unique interface index for the phyint.
4934  */
4935 static boolean_t
4936 phyint_assign_ifindex(phyint_t *phyi, ip_stack_t *ipst)
4937 {
4938         ASSERT(phyi->phyint_ifindex == 0);
4939         return (ip_assign_ifindex(&phyi->phyint_ifindex, ipst));
4940 }
4941
4942 /*
4943  * Return a pointer to the ill which matches the supplied name.  Note that
4944  * the ill name length includes the null termination character.  (May be
4945  * called as writer.)
4946  * If do_alloc and the interface is "lo0" it will be automatically created.
4947  * Cannot bump up reference on condemned ills. So dup detect can't be done
4948  * using this func.
4949  */
4950 ill_t *
4951 ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6,
4952     queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, boolean_t *did_alloc,
4953     ip_stack_t *ipst)
4954 {
4955         ill_t   *ill;
4956         ipif_t  *ipif;
4957         kstat_named_t   *kn;
4958         boolean_t isloopback;
4959         ipsq_t *old_ipsq;
4960         in6_addr_t ov6addr;
4961
4962         isloopback = mi_strcmp(name, ipif_loopback_name) == 0;
4963
4964         rw_enter(&ipst->ips_ill_g_lock, RW_READER);
4965         ill = ill_find_by_name(name, isv6, q, mp, func, error, ipst);
4966         rw_exit(&ipst->ips_ill_g_lock);
4967         if (ill != NULL || (error != NULL && *error == EINPROGRESS))
4968                 return (ill);
4969
4970         /*
4971          * Couldn't find it.  Does this happen to be a lookup for the
4972          * loopback device and are we allowed to allocate it?
4973          */
4974         if (!isloopback || !do_alloc)
4975                 return (NULL);
4976
4977         rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
4978
4979         ill = ill_find_by_name(name, isv6, q, mp, func, error, ipst);
4980         if (ill != NULL || (error != NULL && *error == EINPROGRESS)) {
4981                 rw_exit(&ipst->ips_ill_g_lock);
4982                 return (ill);
4983         }
4984
4985         /* Create the loopback device on demand */
4986         ill = (ill_t *)(mi_alloc(sizeof (ill_t) +
4987             sizeof (ipif_loopback_name), BPRI_MED));
4988         if (ill == NULL)
4989                 goto done;
4990
4991         *ill = ill_null;
4992         mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, NULL);
4993         ill->ill_ipst = ipst;
4994         netstack_hold(ipst->ips_netstack);
4995         /*
4996          * For exclusive stacks we set the zoneid to zero
4997          * to make IP operate as if in the global zone.
4998          */
4999         ill->ill_zoneid = GLOBAL_ZONEID;
5000
5001         ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t));
5002         if (ill->ill_phyint == NULL)
5003                 goto done;
5004
5005         if (isv6)
5006                 ill->ill_phyint->phyint_illv6 = ill;
5007         else
5008                 ill->ill_phyint->phyint_illv4 = ill;
5009         mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0);
5010         ill->ill_max_frag = IP_LOOPBACK_MTU;
5011         /* Add room for tcp+ip headers */
5012         if (isv6) {
5013                 ill->ill_isv6 = B_TRUE;
5014                 ill->ill_max_frag += IPV6_HDR_LEN + 20; /* for TCP */
5015         } else {
5016                 ill->ill_max_frag += IP_SIMPLE_HDR_LENGTH + 20;
5017         }
5018         if (!ill_allocate_mibs(ill))
5019                 goto done;
5020         ill->ill_max_mtu = ill->ill_max_frag;
5021         /*
5022          * ipif_loopback_name can't be pointed at directly because its used
5023          * by both the ipv4 and ipv6 interfaces.  When the ill is removed
5024          * from the glist, ill_glist_delete() sets the first character of
5025          * ill_name to '\0'.
5026          */
5027         ill->ill_name = (char *)ill + sizeof (*ill);
5028         (void) strcpy(ill->ill_name, ipif_loopback_name);
5029         ill->ill_name_length = sizeof (ipif_loopback_name);
5030         /* Set ill_name_set for ill_phyint_reinit to work properly */
5031
5032         ill->ill_global_timer = INFINITY;
5033         ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0;
5034         ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0;
5035         ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS;
5036         ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL;
5037
5038         /* No resolver here. */
5039         ill->ill_net_type = IRE_LOOPBACK;
5040
5041         /* Initialize the ipsq */
5042         if (!ipsq_init(ill))
5043                 goto done;
5044
5045         ill->ill_phyint->phyint_ipsq->ipsq_writer = NULL;
5046         ill->ill_phyint->phyint_ipsq->ipsq_reentry_cnt--;
5047         ASSERT(ill->ill_phyint->phyint_ipsq->ipsq_reentry_cnt == 0);
5048 #ifdef DEBUG
5049         ill->ill_phyint->phyint_ipsq->ipsq_depth = 0;
5050 #endif
5051         ipif = ipif_allocate(ill, 0L, IRE_LOOPBACK, B_TRUE);
5052         if (ipif == NULL)
5053                 goto done;
5054
5055         ill->ill_flags = ILLF_MULTICAST;
5056
5057         ov6addr = ipif->ipif_v6lcl_addr;
5058         /* Set up default loopback address and mask. */
5059         if (!isv6) {
5060                 ipaddr_t inaddr_loopback = htonl(INADDR_LOOPBACK);
5061
5062                 IN6_IPADDR_TO_V4MAPPED(inaddr_loopback, &ipif->ipif_v6lcl_addr);
5063                 ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr;
5064                 V4MASK_TO_V6(htonl(IN_CLASSA_NET), ipif->ipif_v6net_mask);
5065                 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
5066                     ipif->ipif_v6subnet);
5067                 ill->ill_flags |= ILLF_IPV4;
5068         } else {
5069                 ipif->ipif_v6lcl_addr = ipv6_loopback;
5070                 ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr;
5071                 ipif->ipif_v6net_mask = ipv6_all_ones;
5072                 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
5073                     ipif->ipif_v6subnet);
5074                 ill->ill_flags |= ILLF_IPV6;
5075         }
5076
5077         /*
5078          * Chain us in at the end of the ill list. hold the ill
5079          * before we make it globally visible. 1 for the lookup.
5080          */
5081         ill->ill_refcnt = 0;
5082         ill_refhold(ill);
5083
5084         ill->ill_frag_count = 0;
5085         ill->ill_frag_free_num_pkts = 0;
5086         ill->ill_last_frag_clean_time = 0;
5087
5088         old_ipsq = ill->ill_phyint->phyint_ipsq;
5089
5090         if (ill_glist_insert(ill, "lo", isv6) != 0)
5091                 cmn_err(CE_PANIC, "cannot insert loopback interface");
5092
5093         /* Let SCTP know so that it can add this to its list */
5094         sctp_update_ill(ill, SCTP_ILL_INSERT);
5095
5096         /*
5097          * We have already assigned ipif_v6lcl_addr above, but we need to
5098          * call sctp_update_ipif_addr() after SCTP_ILL_INSERT, which
5099          * requires to be after ill_glist_insert() since we need the
5100          * ill_index set. Pass on ipv6_loopback as the old address.
5101          */
5102         sctp_update_ipif_addr(ipif, ov6addr);
5103
5104         /*
5105          * If the ipsq was changed in ill_phyint_reinit free the old ipsq.
5106          */
5107         if (old_ipsq != ill->ill_phyint->phyint_ipsq) {
5108                 /* Loopback ills aren't in any IPMP group */
5109                 ASSERT(!(old_ipsq->ipsq_flags & IPSQ_GROUP));
5110                 ipsq_delete(old_ipsq);
5111         }
5112
5113         /*
5114          * Delay this till the ipif is allocated as ipif_allocate
5115          * de-references ill_phyint for getting the ifindex. We
5116          * can't do this before ipif_allocate because ill_phyint_reinit
5117          * -> phyint_assign_ifindex expects ipif to be present.
5118          */
5119         mutex_enter(&ill->ill_phyint->phyint_lock);
5120         ill->ill_phyint->phyint_flags |= PHYI_LOOPBACK | PHYI_VIRTUAL;
5121         mutex_exit(&ill->ill_phyint->phyint_lock);
5122
5123         if (ipst->ips_loopback_ksp == NULL) {
5124                 /* Export loopback interface statistics */
5125                 ipst->ips_loopback_ksp = kstat_create_netstack("lo", 0,
5126                     ipif_loopback_name, "net",
5127                     KSTAT_TYPE_NAMED, 2, 0,
5128                     ipst->ips_netstack->netstack_stackid);
5129                 if (ipst->ips_loopback_ksp != NULL) {
5130                         ipst->ips_loopback_ksp->ks_update =
5131                             loopback_kstat_update;
5132                         kn = KSTAT_NAMED_PTR(ipst->ips_loopback_ksp);
5133                         kstat_named_init(&kn[0], "ipackets", KSTAT_DATA_UINT32);
5134                         kstat_named_init(&kn[1], "opackets", KSTAT_DATA_UINT32);
5135                         ipst->ips_loopback_ksp->ks_private =
5136                             (void *)(uintptr_t)ipst->ips_netstack->
5137                             netstack_stackid;
5138                         kstat_install(ipst->ips_loopback_ksp);
5139                 }
5140         }
5141
5142         if (error != NULL)
5143                 *error = 0;
5144         *did_alloc = B_TRUE;
5145         rw_exit(&ipst->ips_ill_g_lock);
5146         return (ill);
5147 done:
5148         if (ill != NULL) {
5149                 if (ill->ill_phyint != NULL) {
5150                         ipsq_t  *ipsq;
5151
5152                         ipsq = ill->ill_phyint->phyint_ipsq;
5153                         if (ipsq != NULL) {
5154                                 ipsq->ipsq_ipst = NULL;
5155                                 kmem_free(ipsq, sizeof (ipsq_t));
5156                         }
5157                         mi_free(ill->ill_phyint);
5158                 }
5159                 ill_free_mib(ill);
5160                 if (ill->ill_ipst != NULL)
5161                         netstack_rele(ill->ill_ipst->ips_netstack);
5162                 mi_free(ill);
5163         }
5164         rw_exit(&ipst->ips_ill_g_lock);
5165         if (error != NULL)
5166                 *error = ENOMEM;
5167         return (NULL);
5168 }
5169
5170 /*
5171  * For IPP calls - use the ip_stack_t for global stack.
5172  */
5173 ill_t *
5174 ill_lookup_on_ifindex_global_instance(uint_t index, boolean_t isv6,
5175     queue_t *q, mblk_t *mp, ipsq_func_t func, int *err)
5176 {
5177         ip_stack_t      *ipst;
5178         ill_t           *ill;
5179
5180         ipst = netstack_find_by_stackid(GLOBAL_NETSTACKID)->netstack_ip;
5181         if (ipst == NULL) {
5182                 cmn_err(CE_WARN, "No ip_stack_t for zoneid zero!\n");
5183                 return (NULL);
5184         }
5185
5186         ill = ill_lookup_on_ifindex(index, isv6, q, mp, func, err, ipst);
5187         netstack_rele(ipst->ips_netstack);
5188         return (ill);
5189 }
5190
5191 /*
5192  * Return a pointer to the ill which matches the index and IP version type.
5193  */
5194 ill_t *
5195 ill_lookup_on_ifindex(uint_t index, boolean_t isv6, queue_t *q, mblk_t *mp,
5196     ipsq_func_t func, int *err, ip_stack_t *ipst)
5197 {
5198         ill_t   *ill;
5199         ipsq_t  *ipsq;
5200         phyint_t *phyi;
5201
5202         ASSERT((q == NULL && mp == NULL && func == NULL && err == NULL) ||
5203             (q != NULL && mp != NULL && func != NULL && err != NULL));
5204
5205         if (err != NULL)
5206                 *err = 0;
5207
5208         /*
5209          * Indexes are stored in the phyint - a common structure
5210          * to both IPv4 and IPv6.
5211          */
5212         rw_enter(&ipst->ips_ill_g_lock, RW_READER);
5213         phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
5214             (void *) &index, NULL);
5215         if (phyi != NULL) {
5216                 ill = isv6 ? phyi->phyint_illv6: phyi->phyint_illv4;
5217                 if (ill != NULL) {
5218                         /*
5219                          * The block comment at the start of ipif_down
5220                          * explains the use of the macros used below
5221                          */
5222                         GRAB_CONN_LOCK(q);
5223                         mutex_enter(&ill->ill_lock);
5224                         if (ILL_CAN_LOOKUP(ill)) {
5225                                 ill_refhold_locked(ill);
5226                                 mutex_exit(&ill->ill_lock);
5227                                 RELEASE_CONN_LOCK(q);
5228                                 rw_exit(&ipst->ips_ill_g_lock);
5229                                 return (ill);
5230                         } else if (ILL_CAN_WAIT(ill, q)) {
5231                                 ipsq = ill->ill_phyint->phyint_ipsq;
5232                                 mutex_enter(&ipsq->ipsq_lock);
5233                                 rw_exit(&ipst->ips_ill_g_lock);
5234                                 mutex_exit(&ill->ill_lock);
5235                                 ipsq_enq(ipsq, q, mp, func, NEW_OP, ill);
5236                                 mutex_exit(&ipsq->ipsq_lock);
5237                                 RELEASE_CONN_LOCK(q);
5238                                 if (err != NULL)
5239                                         *err = EINPROGRESS;
5240                                 return (NULL);
5241                         }
5242                         RELEASE_CONN_LOCK(q);
5243                         mutex_exit(&ill->ill_lock);
5244                 }
5245         }
5246         rw_exit(&ipst->ips_ill_g_lock);
5247         if (err != NULL)
5248                 *err = ENXIO;
5249         return (NULL);
5250 }
5251
5252 /*
5253  * Return the ifindex next in sequence after the passed in ifindex.
5254  * If there is no next ifindex for the given protocol, return 0.
5255  */
5256 uint_t
5257 ill_get_next_ifindex(uint_t index, boolean_t isv6, ip_stack_t *ipst)
5258 {
5259         phyint_t *phyi;
5260         phyint_t *phyi_initial;
5261         uint_t   ifindex;
5262
5263         rw_enter(&ipst->ips_ill_g_lock, RW_READER);
5264
5265         if (index == 0) {
5266                 phyi = avl_first(
5267                     &ipst->ips_phyint_g_list->phyint_list_avl_by_index);
5268         } else {
5269                 phyi = phyi_initial = avl_find(
5270                     &ipst->ips_phyint_g_list->phyint_list_avl_by_index,
5271                     (void *) &index, NULL);
5272         }
5273
5274         for (; phyi != NULL;
5275             phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
5276             phyi, AVL_AFTER)) {
5277                 /*
5278                  * If we're not returning the first interface in the tree
5279                  * and we still haven't moved past the phyint_t that
5280                  * corresponds to index, avl_walk needs to be called again
5281                  */
5282                 if (!((index != 0) && (phyi == phyi_initial))) {
5283                         if (isv6) {
5284                                 if ((phyi->phyint_illv6) &&
5285                                     ILL_CAN_LOOKUP(phyi->phyint_illv6) &&
5286                                     (phyi->phyint_illv6->ill_isv6 == 1))
5287                                         break;
5288                         } else {
5289                                 if ((phyi->phyint_illv4) &&
5290                                     ILL_CAN_LOOKUP(phyi->phyint_illv4) &&
5291                                     (phyi->phyint_illv4->ill_isv6 == 0))
5292                                         break;
5293                         }
5294                 }
5295         }
5296
5297         rw_exit(&ipst->ips_ill_g_lock);
5298
5299         if (phyi != NULL)
5300                 ifindex = phyi->phyint_ifindex;
5301         else
5302                 ifindex = 0;
5303
5304         return (ifindex);
5305 }
5306
5307
5308 /*
5309  * Return the ifindex for the named interface.
5310  * If there is no next ifindex for the interface, return 0.
5311  */
5312 uint_t
5313 ill_get_ifindex_by_name(char *name, ip_stack_t *ipst)
5314 {
5315         phyint_t        *phyi;
5316         avl_index_t     where = 0;
5317         uint_t          ifindex;
5318
5319         rw_enter(&ipst->ips_ill_g_lock, RW_READER);
5320
5321         if ((phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
5322             name, &where)) == NULL) {
5323                 rw_exit(&ipst->ips_ill_g_lock);
5324                 return (0);
5325         }
5326
5327         ifindex = phyi->phyint_ifindex;
5328
5329         rw_exit(&ipst->ips_ill_g_lock);
5330
5331         return (ifindex);
5332 }
5333
5334
5335 /*
5336  * Obtain a reference to the ill. The ill_refcnt is a dynamic refcnt
5337  * that gives a running thread a reference to the ill. This reference must be
5338  * released by the thread when it is done accessing the ill and related
5339  * objects. ill_refcnt can not be used to account for static references
5340  * such as other structures pointing to an ill. Callers must generally
5341  * check whether an ill can be refheld by using ILL_CAN_LOOKUP macros
5342  * or be sure that the ill is not being deleted or changing state before
5343  * calling the refhold functions. A non-zero ill_refcnt ensures that the
5344  * ill won't change any of its critical state such as address, netmask etc.
5345  */
5346 void
5347 ill_refhold(ill_t *ill)
5348 {
5349         mutex_enter(&ill->ill_lock);
5350         ill->ill_refcnt++;
5351         ILL_TRACE_REF(ill);
5352         mutex_exit(&ill->ill_lock);
5353 }
5354
5355 void
5356 ill_refhold_locked(ill_t *ill)
5357 {
5358         ASSERT(MUTEX_HELD(&ill->ill_lock));
5359         ill->ill_refcnt++;
5360         ILL_TRACE_REF(ill);
5361 }
5362
5363 int
5364 ill_check_and_refhold(ill_t *ill)
5365 {
5366         mutex_enter(&ill->ill_lock);
5367         if (ILL_CAN_LOOKUP(ill)) {
5368                 ill_refhold_locked(ill);
5369                 mutex_exit(&ill->ill_lock);
5370                 return (0);
5371         }
5372         mutex_exit(&ill->ill_lock);
5373         return (ILL_LOOKUP_FAILED);
5374 }
5375
5376 /*
5377  * Must not be called while holding any locks. Otherwise if this is
5378  * the last reference to be released, there is a chance of recursive mutex
5379  * panic due to ill_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying
5380  * to restart an ioctl.
5381  */
5382 void
5383 ill_refrele(ill_t *ill)
5384 {
5385         mutex_enter(&ill->ill_lock);
5386         ASSERT(ill->ill_refcnt != 0);
5387         ill->ill_refcnt--;
5388         ILL_UNTRACE_REF(ill);
5389         if (ill->ill_refcnt != 0) {
5390                 /* Every ire pointing to the ill adds 1 to ill_refcnt */
5391                 mutex_exit(&ill->ill_lock);
5392                 return;
5393         }
5394
5395         /* Drops the ill_lock */
5396         ipif_ill_refrele_tail(ill);
5397 }
5398
5399 /*
5400  * Obtain a weak reference count on the ill. This reference ensures the
5401  * ill won't be freed, but the ill may change any of its critical state
5402  * such as netmask, address etc. Returns an error if the ill has started
5403  * closing.
5404  */
5405 boolean_t
5406 ill_waiter_inc(ill_t *ill)
5407 {
5408         mutex_enter(&ill->ill_lock);
5409         if (ill->ill_state_flags & ILL_CONDEMNED) {
5410                 mutex_exit(&ill->ill_lock);
5411                 return (B_FALSE);
5412         }
5413         ill->ill_waiters++;
5414         mutex_exit(&ill->ill_lock);
5415         return (B_TRUE);
5416 }
5417
5418 void
5419 ill_waiter_dcr(ill_t *ill)
5420 {
5421         mutex_enter(&ill->ill_lock);
5422         ill->ill_waiters--;
5423         if (ill->ill_waiters == 0)
5424                 cv_broadcast(&ill->ill_cv);
5425         mutex_exit(&ill->ill_lock);
5426 }
5427
5428 /*
5429  * Named Dispatch routine to produce a formatted report on all ILLs.
5430  * This report is accessed by using the ndd utility to "get" ND variable
5431  * "ip_ill_status".
5432  */
5433 /* ARGSUSED */
5434 int
5435 ip_ill_report(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr)
5436 {
5437         ill_t           *ill;
5438         ill_walk_context_t ctx;
5439         ip_stack_t      *ipst;
5440
5441         ipst = CONNQ_TO_IPST(q);
5442
5443         (void) mi_mpprintf(mp,
5444             "ILL      " MI_COL_HDRPAD_STR
5445         /*   01234567[89ABCDEF] */
5446             "rq       " MI_COL_HDRPAD_STR
5447         /*   01234567[89ABCDEF] */
5448             "wq       " MI_COL_HDRPAD_STR
5449         /*   01234567[89ABCDEF] */
5450             "upcnt mxfrg err name");
5451         /*   12345 12345 123 xxxxxxxx  */
5452
5453         rw_enter(&ipst->ips_ill_g_lock, RW_READER);
5454         ill = ILL_START_WALK_ALL(&ctx, ipst);
5455         for (; ill != NULL; ill = ill_next(&ctx, ill)) {
5456                 (void) mi_mpprintf(mp,
5457                     MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR
5458                     "%05u %05u %03d %s",
5459                     (void *)ill, (void *)ill->ill_rq, (void *)ill->ill_wq,
5460                     ill->ill_ipif_up_count,
5461                     ill->ill_max_frag, ill->ill_error, ill->ill_name);
5462         }
5463         rw_exit(&ipst->ips_ill_g_lock);
5464
5465         return (0);
5466 }
5467
5468 /*
5469  * Named Dispatch routine to produce a formatted report on all IPIFs.
5470  * This report is accessed by using the ndd utility to "get" ND variable
5471  * "ip_ipif_status".
5472  */
5473 /* ARGSUSED */
5474 int
5475 ip_ipif_report(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr)
5476 {
5477         char    buf1[INET6_ADDRSTRLEN];
5478         char    buf2[INET6_ADDRSTRLEN];
5479         char    buf3[INET6_ADDRSTRLEN];
5480         char    buf4[INET6_ADDRSTRLEN];
5481         char    buf5[INET6_ADDRSTRLEN];
5482         char    buf6[INET6_ADDRSTRLEN];
5483         char    buf[LIFNAMSIZ];
5484         ill_t   *ill;
5485         ipif_t  *ipif;
5486         nv_t    *nvp;
5487         uint64_t flags;
5488         zoneid_t zoneid;
5489         ill_walk_context_t ctx;
5490         ip_stack_t *ipst = CONNQ_TO_IPST(q);
5491
5492         (void) mi_mpprintf(mp,
5493             "IPIF metric mtu in/out/forward name zone flags...\n"
5494             "\tlocal address\n"
5495             "\tsrc address\n"
5496             "\tsubnet\n"
5497             "\tmask\n"
5498             "\tbroadcast\n"
5499             "\tp-p-dst");
5500
5501         ASSERT(q->q_next == NULL);
5502         zoneid = Q_TO_CONN(q)->conn_zoneid;     /* IP is a driver */
5503
5504         rw_enter(&ipst->ips_ill_g_lock, RW_READER);
5505         ill = ILL_START_WALK_ALL(&ctx, ipst);
5506         for (; ill != NULL; ill = ill_next(&ctx, ill)) {
5507                 for (ipif = ill->ill_ipif; ipif != NULL;
5508                     ipif = ipif->ipif_next) {
5509                         if (zoneid != GLOBAL_ZONEID &&
5510                             zoneid != ipif->ipif_zoneid &&
5511                             ipif->ipif_zoneid != ALL_ZONES)
5512                                 continue;
5513
5514                         ipif_get_name(ipif, buf, sizeof (buf));
5515                         (void) mi_mpprintf(mp,
5516                             MI_COL_PTRFMT_STR
5517                             "%04u %05u %u/%u/%u %s %d",
5518                             (void *)ipif,
5519                             ipif->ipif_metric, ipif->ipif_mtu,
5520                             ipif->ipif_ib_pkt_count,
5521                             ipif->ipif_ob_pkt_count,
5522                             ipif->ipif_fo_pkt_count,
5523                             buf,
5524                             ipif->ipif_zoneid);
5525
5526                 flags = ipif->ipif_flags | ipif->ipif_ill->ill_flags |
5527                     ipif->ipif_ill->ill_phyint->phyint_flags;
5528
5529                 /* Tack on text strings for any flags. */
5530                 nvp = ipif_nv_tbl;
5531                 for (; nvp < A_END(ipif_nv_tbl); nvp++) {
5532                         if (nvp->nv_value & flags)
5533                                 (void) mi_mpprintf_nr(mp, " %s",
5534                                     nvp->nv_name);
5535                 }
5536                 (void) mi_mpprintf(mp,
5537                     "\t%s\n\t%s\n\t%s\n\t%s\n\t%s\n\t%s",
5538                     inet_ntop(AF_INET6,
5539                     &ipif->ipif_v6lcl_addr, buf1, sizeof (buf1)),
5540                     inet_ntop(AF_INET6,
5541                     &ipif->ipif_v6src_addr, buf2, sizeof (buf2)),
5542                     inet_ntop(AF_INET6,
5543                     &ipif->ipif_v6subnet, buf3, sizeof (buf3)),
5544                     inet_ntop(AF_INET6,
5545                     &ipif->ipif_v6net_mask, buf4, sizeof (buf4)),
5546                     inet_ntop(AF_INET6,
5547                     &ipif->ipif_v6brd_addr, buf5, sizeof (buf5)),
5548                     inet_ntop(AF_INET6,
5549                     &ipif->ipif_v6pp_dst_addr, buf6, sizeof (buf6)));
5550                 }
5551         }
5552         rw_exit(&ipst->ips_ill_g_lock);
5553         return (0);
5554 }
5555
5556 /*
5557  * ip_ll_subnet_defaults is called when we get the DL_INFO_ACK back from the
5558  * driver.  We construct best guess defaults for lower level information that
5559  * we need.  If an interface is brought up without injection of any overriding
5560  * information from outside, we have to be ready to go with these defaults.
5561  * When we get the first DL_INFO_ACK (from ip_open() sending a DL_INFO_REQ)
5562  * we primarely want the dl_provider_style.
5563  * The subsequent DL_INFO_ACK is received after doing a DL_ATTACH and DL_BIND
5564  * at which point we assume the other part of the information is valid.
5565  */
5566 void
5567 ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp)
5568 {
5569         uchar_t         *brdcst_addr;
5570         uint_t          brdcst_addr_length, phys_addr_length;
5571         t_scalar_t      sap_length;
5572         dl_info_ack_t   *dlia;
5573         ip_m_t          *ipm;
5574         dl_qos_cl_sel1_t *sel1;
5575
5576         ASSERT(IAM_WRITER_ILL(ill));
5577
5578         /*
5579          * Till the ill is fully up ILL_CHANGING will be set and
5580          * the ill is not globally visible. So no need for a lock.
5581          */
5582         dlia = (dl_info_ack_t *)mp->b_rptr;
5583         ill->ill_mactype = dlia->dl_mac_type;
5584
5585         ipm = ip_m_lookup(dlia->dl_mac_type);
5586         if (ipm == NULL) {
5587                 ipm = ip_m_lookup(DL_OTHER);
5588                 ASSERT(ipm != NULL);
5589         }
5590         ill->ill_media = ipm;
5591
5592         /*
5593          * When the new DLPI stuff is ready we'll pull lengths
5594          * from dlia.
5595          */
5596         if (dlia->dl_version == DL_VERSION_2) {
5597                 brdcst_addr_length = dlia->dl_brdcst_addr_length;
5598                 brdcst_addr = mi_offset_param(mp, dlia->dl_brdcst_addr_offset,
5599                     brdcst_addr_length);
5600                 if (brdcst_addr == NULL) {
5601                         brdcst_addr_length = 0;
5602                 }
5603                 sap_length = dlia->dl_sap_length;
5604                 phys_addr_length = dlia->dl_addr_length - ABS(sap_length);
5605                 ip1dbg(("ip: bcast_len %d, sap_len %d, phys_len %d\n",
5606                     brdcst_addr_length, sap_length, phys_addr_length));
5607         } else {
5608                 brdcst_addr_length = 6;
5609                 brdcst_addr = ip_six_byte_all_ones;
5610                 sap_length = -2;
5611                 phys_addr_length = brdcst_addr_length;
5612         }
5613
5614         ill->ill_bcast_addr_length = brdcst_addr_length;
5615         ill->ill_phys_addr_length = phys_addr_length;
5616         ill->ill_sap_length = sap_length;
5617         ill->ill_max_frag = dlia->dl_max_sdu;
5618         ill->ill_max_mtu = ill->ill_max_frag;
5619
5620         ill->ill_type = ipm->ip_m_type;
5621
5622         if (!ill->ill_dlpi_style_set) {
5623                 if (dlia->dl_provider_style == DL_STYLE2)
5624                         ill->ill_needs_attach = 1;
5625
5626                 /*
5627                  * Allocate the first ipif on this ill. We don't delay it
5628                  * further as ioctl handling assumes atleast one ipif to
5629                  * be present.
5630                  *
5631                  * At this point we don't know whether the ill is v4 or v6.
5632                  * We will know this whan the SIOCSLIFNAME happens and
5633                  * the correct value for ill_isv6 will be assigned in
5634                  * ipif_set_values(). We need to hold the ill lock and
5635                  * clear the ILL_LL_SUBNET_PENDING flag and atomically do
5636                  * the wakeup.
5637                  */
5638                 (void) ipif_allocate(ill, 0, IRE_LOCAL,
5639                     dlia->dl_provider_style == DL_STYLE2 ? B_FALSE : B_TRUE);
5640                 mutex_enter(&ill->ill_lock);
5641                 ASSERT(ill->ill_dlpi_style_set == 0);
5642                 ill->ill_dlpi_style_set = 1;
5643                 ill->ill_state_flags &= ~ILL_LL_SUBNET_PENDING;
5644                 cv_broadcast(&ill->ill_cv);
5645                 mutex_exit(&ill->ill_lock);
5646                 freemsg(mp);
5647                 return;
5648         }
5649         ASSERT(ill->ill_ipif != NULL);
5650         /*
5651          * We know whether it is IPv4 or IPv6 now, as this is the
5652          * second DL_INFO_ACK we are recieving in response to the
5653          * DL_INFO_REQ sent in ipif_set_values.
5654          */
5655         if (ill->ill_isv6)
5656                 ill->ill_sap = IP6_DL_SAP;
5657         else
5658                 ill->ill_sap = IP_DL_SAP;
5659         /*
5660          * Set ipif_mtu which is used to set the IRE's
5661          * ire_max_frag value. The driver could have sent
5662          * a different mtu from what it sent last time. No
5663          * need to call ipif_mtu_change because IREs have
5664          * not yet been created.
5665          */
5666         ill->ill_ipif->ipif_mtu = ill->ill_max_mtu;
5667         /*
5668          * Clear all the flags that were set based on ill_bcast_addr_length
5669          * and ill_phys_addr_length (in ipif_set_values) as these could have
5670          * changed now and we need to re-evaluate.
5671          */
5672         ill->ill_flags &= ~(ILLF_MULTICAST | ILLF_NONUD | ILLF_NOARP);
5673         ill->ill_ipif->ipif_flags &= ~(IPIF_BROADCAST | IPIF_POINTOPOINT);
5674
5675         /*
5676          * Free ill_resolver_mp and ill_bcast_mp as things could have
5677          * changed now.
5678          */
5679         if (ill->ill_bcast_addr_length == 0) {
5680                 if (ill->ill_resolver_mp != NULL)
5681                         freemsg(ill->ill_resolver_mp);
5682                 if (ill->ill_bcast_mp != NULL)
5683                         freemsg(ill->ill_bcast_mp);
5684                 if (ill->ill_flags & ILLF_XRESOLV)
5685                         ill->ill_net_type = IRE_IF_RESOLVER;
5686                 else
5687                         ill->ill_net_type = IRE_IF_NORESOLVER;
5688                 ill->ill_resolver_mp = ill_dlur_gen(NULL,
5689                     ill->ill_phys_addr_length,
5690                     ill->ill_sap,
5691                     ill->ill_sap_length);
5692                 ill->ill_bcast_mp = copymsg(ill->ill_resolver_mp);
5693
5694                 if (ill->ill_isv6)
5695                         /*
5696                          * Note: xresolv interfaces will eventually need NOARP
5697                          * set here as well, but that will require those
5698                          * external resolvers to have some knowledge of
5699                          * that flag and act appropriately. Not to be changed
5700                          * at present.
5701                          */
5702                         ill->ill_flags |= ILLF_NONUD;
5703                 else
5704                         ill->ill_flags |= ILLF_NOARP;
5705
5706                 if (ill->ill_phys_addr_length == 0) {
5707                         if (ill->ill_media->ip_m_mac_type == SUNW_DL_VNI) {
5708                                 ill->ill_ipif->ipif_flags |= IPIF_NOXMIT;
5709                                 ill->ill_phyint->phyint_flags |= PHYI_VIRTUAL;
5710                         } else {
5711                                 /* pt-pt supports multicast. */
5712                                 ill->ill_flags |= ILLF_MULTICAST;
5713                                 ill->ill_ipif->ipif_flags |= IPIF_POINTOPOINT;
5714                         }
5715                 }
5716         } else {
5717                 ill->ill_net_type = IRE_IF_RESOLVER;
5718                 if (ill->ill_bcast_mp != NULL)
5719                         freemsg(ill->ill_bcast_mp);
5720                 ill->ill_bcast_mp = ill_dlur_gen(brdcst_addr,
5721                     ill->ill_bcast_addr_length, ill->ill_sap,
5722                     ill->ill_sap_length);
5723                 /*
5724                  * Later detect lack of DLPI driver multicast
5725                  * capability by catching DL_ENABMULTI errors in
5726                  * ip_rput_dlpi.
5727                  */
5728                 ill->ill_flags |= ILLF_MULTICAST;
5729                 if (!ill->ill_isv6)
5730                         ill->ill_ipif->ipif_flags |= IPIF_BROADCAST;
5731         }
5732         /* By default an interface does not support any CoS marking */
5733         ill->ill_flags &= ~ILLF_COS_ENABLED;
5734
5735         /*
5736          * If we get QoS information in DL_INFO_ACK, the device supports
5737          * some form of CoS marking, set ILLF_COS_ENABLED.
5738          */
5739         sel1 = (dl_qos_cl_sel1_t *)mi_offset_param(mp, dlia->dl_qos_offset,
5740             dlia->dl_qos_length);
5741         if ((sel1 != NULL) && (sel1->dl_qos_type == DL_QOS_CL_SEL1)) {
5742                 ill->ill_flags |= ILLF_COS_ENABLED;
5743         }
5744
5745         /* Clear any previous error indication. */
5746         ill->ill_error = 0;
5747         freemsg(mp);
5748 }
5749
5750 /*
5751  * Perform various checks to verify that an address would make sense as a
5752  * local, remote, or subnet interface address.
5753  */
5754 static boolean_t
5755 ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask)
5756 {
5757         ipaddr_t        net_mask;
5758
5759         /*
5760          * Don't allow all zeroes, all ones or experimental address, but allow
5761          * all ones netmask.
5762          */
5763         if ((net_mask = ip_net_mask(addr)) == 0)
5764                 return (B_FALSE);
5765         /* A given netmask overrides the "guess" netmask */
5766         if (subnet_mask != 0)
5767                 net_mask = subnet_mask;
5768         if ((net_mask != ~(ipaddr_t)0) && ((addr == (addr & net_mask)) ||
5769             (addr == (addr | ~net_mask)))) {
5770                 return (B_FALSE);
5771         }
5772         if (CLASSD(addr))
5773                 return (B_FALSE);
5774
5775         return (B_TRUE);
5776 }
5777
5778 #define V6_IPIF_LINKLOCAL(p)    \
5779         IN6_IS_ADDR_LINKLOCAL(&(p)->ipif_v6lcl_addr)
5780
5781 /*
5782  * Compare two given ipifs and check if the second one is better than
5783  * the first one using the order of preference (not taking deprecated
5784  * into acount) specified in ipif_lookup_multicast().
5785  */
5786 static boolean_t
5787 ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif, boolean_t isv6)
5788 {
5789         /* Check the least preferred first. */
5790         if (IS_LOOPBACK(old_ipif->ipif_ill)) {
5791                 /* If both ipifs are the same, use the first one. */
5792                 if (IS_LOOPBACK(new_ipif->ipif_ill))
5793                         return (B_FALSE);
5794                 else
5795                         return (B_TRUE);
5796         }
5797
5798         /* For IPv6, check for link local address. */
5799         if (isv6 && V6_IPIF_LINKLOCAL(old_ipif)) {
5800                 if (IS_LOOPBACK(new_ipif->ipif_ill) ||
5801                     V6_IPIF_LINKLOCAL(new_ipif)) {
5802                         /* The second one is equal or less preferred. */
5803                         return (B_FALSE);
5804                 } else {
5805                         return (B_TRUE);
5806                 }
5807         }
5808
5809         /* Then check for point to point interface. */
5810         if (old_ipif->ipif_flags & IPIF_POINTOPOINT) {
5811                 if (IS_LOOPBACK(new_ipif->ipif_ill) ||
5812                     (isv6 && V6_IPIF_LINKLOCAL(new_ipif)) ||
5813                     (new_ipif->ipif_flags & IPIF_POINTOPOINT)) {
5814                         return (B_FALSE);
5815                 } else {
5816                         return (B_TRUE);
5817                 }
5818         }
5819
5820         /* old_ipif is a normal interface, so no need to use the new one. */
5821         return (B_FALSE);
5822 }
5823
5824 /*
5825  * Find any non-virtual, not condemned, and up multicast capable interface
5826  * given an IP instance and zoneid.  Order of preference is:
5827  *
5828  * 1. normal
5829  * 1.1 normal, but deprecated
5830  * 2. point to point
5831  * 2.1 point to point, but deprecated
5832  * 3. link local
5833  * 3.1 link local, but deprecated
5834  * 4. loopback.
5835  */
5836 ipif_t *
5837 ipif_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6)
5838 {
5839         ill_t                   *ill;
5840         ill_walk_context_t      ctx;
5841         ipif_t                  *ipif;
5842         ipif_t                  *saved_ipif = NULL;
5843         ipif_t                  *dep_ipif = NULL;
5844
5845         rw_enter(&ipst->ips_ill_g_lock, RW_READER);
5846         if (isv6)
5847                 ill = ILL_START_WALK_V6(&ctx, ipst);
5848         else
5849                 ill = ILL_START_WALK_V4(&ctx, ipst);
5850
5851         for (; ill != NULL; ill = ill_next(&ctx, ill)) {
5852                 mutex_enter(&ill->ill_lock);
5853                 if (IS_VNI(ill) || !ILL_CAN_LOOKUP(ill) ||
5854                     !(ill->ill_flags & ILLF_MULTICAST)) {
5855                         mutex_exit(&ill->ill_lock);
5856                         continue;
5857                 }
5858                 for (ipif = ill->ill_ipif; ipif != NULL;
5859                     ipif = ipif->ipif_next) {
5860                         if (zoneid != ipif->ipif_zoneid &&
5861                             zoneid != ALL_ZONES &&
5862                             ipif->ipif_zoneid != ALL_ZONES) {
5863                                 continue;
5864                         }
5865                         if (!(ipif->ipif_flags & IPIF_UP) ||
5866                             !IPIF_CAN_LOOKUP(ipif)) {
5867                                 continue;
5868                         }
5869
5870                         /*
5871                          * Found one candidate.  If it is deprecated,
5872                          * remember it in dep_ipif.  If it is not deprecated,
5873                          * remember it in saved_ipif.
5874                          */
5875                         if (ipif->ipif_flags & IPIF_DEPRECATED) {
5876                                 if (dep_ipif == NULL) {
5877                                         dep_ipif = ipif;
5878                                 } else if (ipif_comp_multi(dep_ipif, ipif,
5879                                     isv6)) {
5880                                         /*
5881                                          * If the previous dep_ipif does not
5882                                          * belong to the same ill, we've done
5883                                          * a ipif_refhold() on it.  So we need
5884                                          * to release it.
5885                                          */
5886                                         if (dep_ipif->ipif_ill != ill)
5887                                                 ipif_refrele(dep_ipif);
5888                                         dep_ipif = ipif;
5889                                 }
5890                                 continue;
5891                         }
5892                         if (saved_ipif == NULL) {
5893                                 saved_ipif = ipif;
5894                         } else {
5895                                 if (ipif_comp_multi(saved_ipif, ipif, isv6)) {
5896                                         if (saved_ipif->ipif_ill != ill)
5897                                                 ipif_refrele(saved_ipif);
5898                                         saved_ipif = ipif;
5899                                 }
5900                         }
5901                 }
5902                 /*
5903                  * Before going to the next ill, do a ipif_refhold() on the
5904                  * saved ones.
5905                  */
5906                 if (saved_ipif != NULL && saved_ipif->ipif_ill == ill)
5907                         ipif_refhold_locked(saved_ipif);
5908                 if (dep_ipif != NULL && dep_ipif->ipif_ill == ill)
5909                         ipif_refhold_locked(dep_ipif);
5910                 mutex_exit(&ill->ill_lock);
5911         }
5912         rw_exit(&ipst->ips_ill_g_lock);
5913
5914         /*
5915          * If we have only the saved_ipif, return it.  But if we have both
5916          * saved_ipif and dep_ipif, check to see which one is better.
5917          */
5918         if (saved_ipif != NULL) {
5919                 if (dep_ipif != NULL) {
5920                         if (ipif_comp_multi(saved_ipif, dep_ipif, isv6)) {
5921                                 ipif_refrele(saved_ipif);
5922                                 return (dep_ipif);
5923                         } else {
5924                                 ipif_refrele(dep_ipif);
5925                                 return (saved_ipif);
5926                         }
5927                 }
5928                 return (saved_ipif);
5929         } else {
5930                 return (dep_ipif);
5931         }
5932 }
5933
5934 /*
5935  * This function is called when an application does not specify an interface
5936  * to be used for multicast traffic (joining a group/sending data).  It
5937  * calls ire_lookup_multi() to look for an interface route for the
5938  * specified multicast group.  Doing this allows the administrator to add
5939  * prefix routes for multicast to indicate which interface to be used for
5940  * multicast traffic in the above scenario.  The route could be for all
5941  * multicast (224.0/4), for a single multicast group (a /32 route) or
5942  * anything in between.  If there is no such multicast route, we just find
5943  * any multicast capable interface and return it.  The returned ipif
5944  * is refhold'ed.
5945  */
5946 ipif_t *
5947 ipif_lookup_group(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst)
5948 {
5949         ire_t                   *ire;
5950         ipif_t                  *ipif;
5951
5952         ire = ire_lookup_multi(group, zoneid, ipst);
5953         if (ire != NULL) {
5954                 ipif = ire->ire_ipif;
5955                 ipif_refhold(ipif);
5956                 ire_refrele(ire);
5957                 return (ipif);
5958         }
5959
5960         return (ipif_lookup_multicast(ipst, zoneid, B_FALSE));
5961 }
5962
5963 /*
5964  * Look for an ipif with the specified interface address and destination.
5965  * The destination address is used only for matching point-to-point interfaces.
5966  */
5967 ipif_t *
5968 ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, queue_t *q, mblk_t *mp,
5969     ipsq_func_t func, int *error, ip_stack_t *ipst)
5970 {
5971         ipif_t  *ipif;
5972         ill_t   *ill;
5973         ill_walk_context_t ctx;
5974         ipsq_t  *ipsq;
5975
5976         if (error != NULL)
5977                 *error = 0;
5978
5979         /*
5980          * First match all the point-to-point interfaces
5981          * before looking at non-point-to-point interfaces.
5982          * This is done to avoid returning non-point-to-point
5983          * ipif instead of unnumbered point-to-point ipif.
5984          */
5985         rw_enter(&ipst->ips_ill_g_lock, RW_READER);
5986         ill = ILL_START_WALK_V4(&ctx, ipst);
5987         for (; ill != NULL; ill = ill_next(&ctx, ill)) {
5988                 GRAB_CONN_LOCK(q);
5989                 mutex_enter(&ill->ill_lock);
5990                 for (ipif = ill->ill_ipif; ipif != NULL;
5991                     ipif = ipif->ipif_next) {
5992                         /* Allow the ipif to be down */
5993                         if ((ipif->ipif_flags & IPIF_POINTOPOINT) &&
5994                             (ipif->ipif_lcl_addr == if_addr) &&
5995                             (ipif->ipif_pp_dst_addr == dst)) {
5996                                 /*
5997                                  * The block comment at the start of ipif_down
5998                                  * explains the use of the macros used below
5999                                  */
6000                                 if (IPIF_CAN_LOOKUP(ipif)) {
6001                                         ipif_refhold_locked(ipif);
6002                                         mutex_exit(&ill->ill_lock);
6003                                         RELEASE_CONN_LOCK(q);
6004                                         rw_exit(&ipst->ips_ill_g_lock);
6005                                         return (ipif);
6006                                 } else if (IPIF_CAN_WAIT(ipif, q)) {
6007                                         ipsq = ill->ill_phyint->phyint_ipsq;
6008                                         mutex_enter(&ipsq->ipsq_lock);
6009                                         mutex_exit(&ill->ill_lock);
6010                                         rw_exit(&ipst->ips_ill_g_lock);
6011                                         ipsq_enq(ipsq, q, mp, func, NEW_OP,
6012                                             ill);
6013                                         mutex_exit(&ipsq->ipsq_lock);
6014                                         RELEASE_CONN_LOCK(q);
6015                                         if (error != NULL)
6016                                                 *error = EINPROGRESS;
6017                                         return (NULL);
6018                                 }
6019                         }
6020                 }
6021                 mutex_exit(&ill->ill_lock);
6022                 RELEASE_CONN_LOCK(q);
6023         }
6024         rw_exit(&ipst->ips_ill_g_lock);
6025
6026         /* lookup the ipif based on interface address */
6027         ipif = ipif_lookup_addr(if_addr, NULL, ALL_ZONES, q, mp, func, error,
6028             ipst);
6029         ASSERT(ipif == NULL || !ipif->ipif_isv6);
6030         return (ipif);
6031 }
6032
6033 /*
6034  * Look for an ipif with the specified address. For point-point links
6035  * we look for matches on either the destination address and the local
6036  * address, but we ignore the check on the local address if IPIF_UNNUMBERED
6037  * is set.
6038  * Matches on a specific ill if match_ill is set.
6039  */
6040 ipif_t *
6041 ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, queue_t *q,
6042     mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst)
6043 {
6044         ipif_t  *ipif;
6045         ill_t   *ill;
6046         boolean_t ptp = B_FALSE;
6047         ipsq_t  *ipsq;
6048         ill_walk_context_t      ctx;
6049
6050         if (error != NULL)
6051                 *error = 0;
6052
6053         rw_enter(&ipst->ips_ill_g_lock, RW_READER);
6054         /*
6055          * Repeat twice, first based on local addresses and
6056          * next time for pointopoint.
6057          */
6058 repeat:
6059         ill = ILL_START_WALK_V4(&ctx, ipst);
6060         for (; ill != NULL; ill = ill_next(&ctx, ill)) {
6061                 if (match_ill != NULL && ill != match_ill) {
6062                         continue;
6063                 }
6064                 GRAB_CONN_LOCK(q);
6065                 mutex_enter(&ill->ill_lock);
6066                 for (ipif = ill->ill_ipif; ipif != NULL;
6067                     ipif = ipif->ipif_next) {
6068                         if (zoneid != ALL_ZONES &&
6069                             zoneid != ipif->ipif_zoneid &&
6070                             ipif->ipif_zoneid != ALL_ZONES)
6071                                 continue;
6072                         /* Allow the ipif to be down */
6073                         if ((!ptp && (ipif->ipif_lcl_addr == addr) &&
6074                             ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) ||
6075                             (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) &&
6076                             (ipif->ipif_pp_dst_addr == addr))) {
6077                                 /*
6078                                  * The block comment at the start of ipif_down
6079                                  * explains the use of the macros used below
6080                                  */
6081                                 if (IPIF_CAN_LOOKUP(ipif)) {
6082                                         ipif_refhold_locked(ipif);
6083                                         mutex_exit(&ill->ill_lock);
6084                                         RELEASE_CONN_LOCK(q);
6085                                         rw_exit(&ipst->ips_ill_g_lock);
6086                                         return (ipif);
6087                                 } else if (IPIF_CAN_WAIT(ipif, q)) {
6088                                         ipsq = ill->ill_phyint->phyint_ipsq;
6089                                         mutex_enter(&ipsq->ipsq_lock);
6090                                         mutex_exit(&ill->ill_lock);
6091                                         rw_exit(&ipst->ips_ill_g_lock);
6092                                         ipsq_enq(ipsq, q, mp, func, NEW_OP,
6093                                             ill);
6094                                         mutex_exit(&ipsq->ipsq_lock);
6095                                         RELEASE_CONN_LOCK(q);
6096                                         if (error != NULL)
6097                                                 *error = EINPROGRESS;
6098                                         return (NULL);
6099                                 }
6100                         }
6101                 }
6102                 mutex_exit(&ill->ill_lock);
6103                 RELEASE_CONN_LOCK(q);
6104         }
6105
6106         /* If we already did the ptp case, then we are done */
6107         if (ptp) {
6108                 rw_exit(&ipst->ips_ill_g_lock);
6109                 if (error != NULL)
6110                         *error = ENXIO;
6111                 return (NULL);
6112         }
6113         ptp = B_TRUE;
6114         goto repeat;
6115 }
6116
6117 /*
6118  * Look for an ipif with the specified address. For point-point links
6119  * we look for matches on either the destination address and the local
6120  * address, but we ignore the check on the local address if IPIF_UNNUMBERED
6121  * is set.
6122  * Matches on a specific ill if match_ill is set.
6123  * Return the zoneid for the ipif which matches. ALL_ZONES if no match.
6124  */
6125 zoneid_t
6126 ipif_lookup_addr_zoneid(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst)
6127 {
6128         zoneid_t zoneid;
6129         ipif_t  *ipif;
6130         ill_t   *ill;
6131         boolean_t ptp = B_FALSE;
6132         ill_walk_context_t      ctx;
6133
6134         rw_enter(&ipst->ips_ill_g_lock, RW_READER);
6135         /*
6136          * Repeat twice, first based on local addresses and
6137          * next time for pointopoint.
6138          */
6139 repeat:
6140         ill = ILL_START_WALK_V4(&ctx, ipst);
6141         for (; ill != NULL; ill = ill_next(&ctx, ill)) {
6142                 if (match_ill != NULL && ill != match_ill) {
6143                         continue;
6144                 }
6145                 mutex_enter(&ill->ill_lock);
6146                 for (ipif = ill->ill_ipif; ipif != NULL;
6147                     ipif = ipif->ipif_next) {
6148                         /* Allow the ipif to be down */
6149                         if ((!ptp && (ipif->ipif_lcl_addr == addr) &&
6150                             ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) ||
6151                             (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) &&
6152                             (ipif->ipif_pp_dst_addr == addr)) &&
6153                             !(ipif->ipif_state_flags & IPIF_CONDEMNED)) {
6154                                 zoneid = ipif->ipif_zoneid;
6155                                 mutex_exit(&ill->ill_lock);
6156                                 rw_exit(&ipst->ips_ill_g_lock);
6157                                 /*
6158                                  * If ipif_zoneid was ALL_ZONES then we have
6159                                  * a trusted extensions shared IP address.
6160                                  * In that case GLOBAL_ZONEID works to send.
6161                                  */
6162                                 if (zoneid == ALL_ZONES)
6163                                         zoneid = GLOBAL_ZONEID;
6164                                 return (zoneid);
6165                         }
6166                 }
6167                 mutex_exit(&ill->ill_lock);
6168         }
6169
6170         /* If we already did the ptp case, then we are done */
6171         if (ptp) {
6172                 rw_exit(&ipst->ips_ill_g_lock);
6173                 return (ALL_ZONES);
6174         }
6175         ptp = B_TRUE;
6176         goto repeat;
6177 }
6178
6179 /*
6180  * Look for an ipif that matches the specified remote address i.e. the
6181  * ipif that would receive the specified packet.
6182  * First look for directly connected interfaces and then do a recursive
6183  * IRE lookup and pick the first ipif corresponding to the source address in the
6184  * ire.
6185  * Returns: held ipif
6186  */
6187 ipif_t *
6188 ipif_lookup_remote(ill_t *ill, ipaddr_t addr, zoneid_t zoneid)
6189 {
6190         ipif_t  *ipif;
6191         ire_t   *ire;
6192         ip_stack_t      *ipst = ill->ill_ipst;
6193
6194         ASSERT(!ill->ill_isv6);
6195
6196         /*
6197          * Someone could be changing this ipif currently or change it
6198          * after we return this. Thus  a few packets could use the old
6199          * old values. However structure updates/creates (ire, ilg, ilm etc)
6200          * will atomically be updated or cleaned up with the new value
6201          * Thus we don't need a lock to check the flags or other attrs below.
6202          */
6203         mutex_enter(&ill->ill_lock);
6204         for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
6205                 if (!IPIF_CAN_LOOKUP(ipif))
6206                         continue;
6207                 if (zoneid != ALL_ZONES && zoneid != ipif->ipif_zoneid &&
6208                     ipif->ipif_zoneid != ALL_ZONES)
6209                         continue;
6210                 /* Allow the ipif to be down */
6211                 if (ipif->ipif_flags & IPIF_POINTOPOINT) {
6212                         if ((ipif->ipif_pp_dst_addr == addr) ||
6213                             (!(ipif->ipif_flags & IPIF_UNNUMBERED) &&
6214                             ipif->ipif_lcl_addr == addr)) {
6215                                 ipif_refhold_locked(ipif);
6216                                 mutex_exit(&ill->ill_lock);
6217                                 return (ipif);
6218                         }
6219                 } else if (ipif->ipif_subnet == (addr & ipif->ipif_net_mask)) {
6220                         ipif_refhold_locked(ipif);
6221                         mutex_exit(&ill->ill_lock);
6222                         return (ipif);
6223                 }
6224         }
6225         mutex_exit(&ill->ill_lock);
6226         ire = ire_route_lookup(addr, 0, 0, 0, NULL, NULL, zoneid,
6227             NULL, MATCH_IRE_RECURSIVE, ipst);
6228         if (ire != NULL) {
6229                 /*
6230                  * The callers of this function wants to know the
6231                  * interface on which they have to send the replies
6232                  * back. For IRE_CACHES that have ire_stq and ire_ipif
6233                  * derived from different ills, we really don't care
6234                  * what we return here.
6235                  */
6236                 ipif = ire->ire_ipif;
6237                 if (ipif != NULL) {
6238                         ipif_refhold(ipif);
6239                         ire_refrele(ire);
6240                         return (ipif);
6241                 }
6242                 ire_refrele(ire);
6243         }
6244         /* Pick the first interface */
6245         ipif = ipif_get_next_ipif(NULL, ill);
6246         return (ipif);
6247 }
6248
6249 /*
6250  * This func does not prevent refcnt from increasing. But if
6251  * the caller has taken steps to that effect, then this func
6252  * can be used to determine whether the ill has become quiescent
6253  */
6254 boolean_t
6255 ill_is_quiescent(ill_t *ill)
6256 {
6257         ipif_t  *ipif;
6258
6259         ASSERT(MUTEX_HELD(&ill->ill_lock));
6260
6261         for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
6262                 if (ipif->ipif_refcnt != 0 || ipif->ipif_ire_cnt != 0) {
6263                         return (B_FALSE);
6264                 }
6265         }
6266         if (ill->ill_ire_cnt != 0 || ill->ill_refcnt != 0 ||
6267             ill->ill_nce_cnt != 0) {
6268                 return (B_FALSE);
6269         }
6270         return (B_TRUE);
6271 }
6272
6273 /*
6274  * This func does not prevent refcnt from increasing. But if
6275  * the caller has taken steps to that effect, then this func
6276  * can be used to determine whether the ipif has become quiescent
6277  */
6278 static boolean_t
6279 ipif_is_quiescent(ipif_t *ipif)
6280 {
6281         ill_t *ill;
6282
6283         ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
6284
6285         if (ipif->ipif_refcnt != 0 || ipif->ipif_ire_cnt != 0) {
6286                 return (B_FALSE);
6287         }
6288
6289         ill = ipif->ipif_ill;
6290         if (ill->ill_ipif_up_count != 0 || ill->ill_ipif_dup_count != 0 ||
6291             ill->ill_logical_down) {
6292                 return (B_TRUE);
6293         }
6294
6295         /* This is the last ipif going down or being deleted on this ill */
6296         if (ill->ill_ire_cnt != 0 || ill->ill_refcnt != 0) {
6297                 return (B_FALSE);
6298         }
6299
6300         return (B_TRUE);
6301 }
6302
6303 /*
6304  * This func does not prevent refcnt from increasing. But if
6305  * the caller has taken steps to that effect, then this func
6306  * can be used to determine whether the ipifs marked with IPIF_MOVING
6307  * have become quiescent and can be moved in a failover/failback.
6308  */
6309 static ipif_t *
6310 ill_quiescent_to_move(ill_t *ill)
6311 {
6312         ipif_t  *ipif;
6313
6314         ASSERT(MUTEX_HELD(&ill->ill_lock));
6315
6316         for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
6317                 if (ipif->ipif_state_flags & IPIF_MOVING) {
6318                         if (ipif->ipif_refcnt != 0 || ipif->ipif_ire_cnt != 0) {
6319                                 return (ipif);
6320                         }
6321                 }
6322         }
6323         return (NULL);
6324 }
6325
/*
 * The ipif/ill/ire has been refreled. Do the tail processing.
 * Determine if the ipif or ill in question has become quiescent and if so
 * wakeup close and/or restart any queued pending ioctl that is waiting
 * for the ipif_down (or ill_down).
 *
 * Locking: must be entered with ill->ill_lock held.  The lock is dropped
 * on every return path below before the function returns.
 *
 * Outline:
 *   1. If the ill is condemned and quiescent, wake up waiters on ill_cv.
 *   2. If the ipsq is not waiting for anything, or the pending ipif lives
 *      on a different ill, there is nothing more to do.
 *   3. Depending on ipsq_waitfor, verify that the relevant object (ipif,
 *      ill, or the set of moving ipifs) is quiescent; bail out if not.
 *   4. Take an ill hold, retrieve the pending message and hand it to
 *      qwriter_ip() with the restart function matching the message type.
 */
void
ipif_ill_refrele_tail(ill_t *ill)
{
        mblk_t  *mp;
        conn_t  *connp;
        ipsq_t  *ipsq;
        ipif_t  *ipif;
        dl_notify_ind_t *dlindp;

        ASSERT(MUTEX_HELD(&ill->ill_lock));

        if ((ill->ill_state_flags & ILL_CONDEMNED) &&
            ill_is_quiescent(ill)) {
                /* ill_close may be waiting */
                cv_broadcast(&ill->ill_cv);
        }

        /* ipsq can't change because ill_lock is held */
        ipsq = ill->ill_phyint->phyint_ipsq;
        if (ipsq->ipsq_waitfor == 0) {
                /* Not waiting for anything, just return. */
                mutex_exit(&ill->ill_lock);
                return;
        }
        ASSERT(ipsq->ipsq_pending_mp != NULL &&
            ipsq->ipsq_pending_ipif != NULL);
        /*
         * ipif->ipif_refcnt must go down to zero for restarting REMOVEIF.
         * Last ipif going down needs to down the ill, so ill_ire_cnt must
         * be zero for restarting an ioctl that ends up downing the ill.
         */
        ipif = ipsq->ipsq_pending_ipif;
        if (ipif->ipif_ill != ill) {
                /* The ioctl is pending on some other ill. */
                mutex_exit(&ill->ill_lock);
                return;
        }

        /*
         * Check quiescence of whatever the pending operation is waiting
         * for.  Each "not yet quiescent" outcome drops the lock and
         * returns; a later refrele will bring us back here.
         */
        switch (ipsq->ipsq_waitfor) {
        case IPIF_DOWN:
        case IPIF_FREE:
                if (!ipif_is_quiescent(ipif)) {
                        mutex_exit(&ill->ill_lock);
                        return;
                }
                break;

        case ILL_DOWN:
        case ILL_FREE:
                /*
                 * case ILL_FREE arises only for loopback. otherwise ill_delete
                 * waits synchronously in ip_close, and no message is queued in
                 * ipsq_pending_mp at all in this case
                 */
                if (!ill_is_quiescent(ill)) {
                        mutex_exit(&ill->ill_lock);
                        return;
                }

                break;

        case ILL_MOVE_OK:
                /* A non-NULL return is a moving ipif that still has refs */
                if (ill_quiescent_to_move(ill) != NULL) {
                        mutex_exit(&ill->ill_lock);
                        return;
                }

                break;
        default:
                cmn_err(CE_PANIC, "ipsq: %p unknown ipsq_waitfor %d\n",
                    (void *)ipsq, ipsq->ipsq_waitfor);
        }

        /*
         * Incr refcnt for the qwriter_ip call below which
         * does a refrele
         */
        ill_refhold_locked(ill);
        mutex_exit(&ill->ill_lock);

        /* Take the pending message (and its conn, if any) off the ipsq */
        mp = ipsq_pending_mp_get(ipsq, &connp);
        ASSERT(mp != NULL);

        /*
         * NOTE: all of the qwriter_ip() calls below use CUR_OP since
         * we can only get here when the current operation decides
         * it needs to quiesce via ipsq_pending_mp_add().
         */
        switch (mp->b_datap->db_type) {
        case M_PCPROTO:
        case M_PROTO:
                /*
                 * For now, only DL_NOTIFY_IND messages can use this facility.
                 */
                dlindp = (dl_notify_ind_t *)mp->b_rptr;
                ASSERT(dlindp->dl_primitive == DL_NOTIFY_IND);

                switch (dlindp->dl_notification) {
                case DL_NOTE_PHYS_ADDR:
                        qwriter_ip(ill, ill->ill_rq, mp,
                            ill_set_phys_addr_tail, CUR_OP, B_TRUE);
                        return;
                default:
                        /*
                         * NOTE(review): where ASSERT compiles away, this
                         * path leaves the function without calling
                         * qwriter_ip(), so the ill hold taken above and mp
                         * are not handed off here - confirm this is
                         * unreachable.
                         */
                        ASSERT(0);
                }
                break;

        case M_ERROR:
        case M_HANGUP:
                qwriter_ip(ill, ill->ill_rq, mp, ipif_all_down_tail, CUR_OP,
                    B_TRUE);
                return;

        case M_IOCTL:
        case M_IOCDATA:
                /* Restart on the conn's write queue if there is a conn */
                qwriter_ip(ill, (connp != NULL ? CONNP_TO_WQ(connp) :
                    ill->ill_wq), mp, ip_reprocess_ioctl, CUR_OP, B_TRUE);
                return;

        default:
                cmn_err(CE_PANIC, "ipif_ill_refrele_tail mp %p "
                    "db_type %d\n", (void *)mp, mp->b_datap->db_type);
        }
}
6456
6457 #ifdef DEBUG
6458 /* Reuse trace buffer from beginning (if reached the end) and record trace */
6459 static void
6460 th_trace_rrecord(th_trace_t *th_trace)
6461 {
6462         tr_buf_t *tr_buf;
6463         uint_t lastref;
6464
6465         lastref = th_trace->th_trace_lastref;
6466         lastref++;
6467         if (lastref == TR_BUF_MAX)
6468                 lastref = 0;
6469         th_trace->th_trace_lastref = lastref;
6470         tr_buf = &th_trace->th_trbuf[lastref];
6471         tr_buf->tr_time = lbolt;
6472         tr_buf->tr_depth = getpcstack(tr_buf->tr_stack, TR_STACK_DEPTH);
6473 }
6474
6475 static void
6476 th_trace_free(void *value)
6477 {
6478         th_trace_t *th_trace = value;
6479
6480         ASSERT(th_trace->th_refcnt == 0);
6481         kmem_free(th_trace, sizeof (*th_trace));
6482 }
6483
6484 /*
6485  * Find or create the per-thread hash table used to track object references.
6486  * The ipst argument is NULL if we shouldn't allocate.
6487  *
6488  * Accesses per-thread data, so there's no need to lock here.
6489  */
6490 static mod_hash_t *
6491 th_trace_gethash(ip_stack_t *ipst)
6492 {
6493         th_hash_t *thh;
6494
6495         if ((thh = tsd_get(ip_thread_data)) == NULL && ipst != NULL) {
6496                 mod_hash_t *mh;
6497                 char name[256];
6498                 size_t objsize, rshift;
6499                 int retv;
6500
6501                 if ((thh = kmem_alloc(sizeof (*thh), KM_NOSLEEP)) == NULL)
6502                         return (NULL);
6503                 (void) snprintf(name, sizeof (name), "th_trace_%p", curthread);
6504
6505                 /*
6506                  * We use mod_hash_create_extended here rather than the more
6507                  * obvious mod_hash_create_ptrhash because the latter has a
6508                  * hard-coded KM_SLEEP, and we'd prefer to fail rather than
6509                  * block.
6510                  */
6511                 objsize = MAX(MAX(sizeof (ill_t), sizeof (ipif_t)),
6512                     MAX(sizeof (ire_t), sizeof (nce_t)));
6513                 rshift = highbit(objsize);
6514                 mh = mod_hash_create_extended(name, 64, mod_hash_null_keydtor,
6515                     th_trace_free, mod_hash_byptr, (void *)rshift,
6516                     mod_hash_ptrkey_cmp, KM_NOSLEEP);
6517                 if (mh == NULL) {
6518                         kmem_free(thh, sizeof (*thh));
6519                         return (NULL);
6520                 }
6521                 thh->thh_hash = mh;
6522                 thh->thh_ipst = ipst;
6523                 /*
6524                  * We trace ills, ipifs, ires, and nces.  All of these are
6525                  * per-IP-stack, so the lock on the thread list is as well.
6526                  */
6527                 rw_enter(&ip_thread_rwlock, RW_WRITER);
6528                 list_insert_tail(&ip_thread_list, thh);
6529                 rw_exit(&ip_thread_rwlock);
6530                 retv = tsd_set(ip_thread_data, thh);
6531                 ASSERT(retv == 0);
6532         }
6533         return (thh != NULL ? thh->thh_hash : NULL);
6534 }
6535
6536 boolean_t
6537 th_trace_ref(const void *obj, ip_stack_t *ipst)
6538 {
6539         th_trace_t *th_trace;
6540         mod_hash_t *mh;
6541         mod_hash_val_t val;
6542
6543         if ((mh = th_trace_gethash(ipst)) == NULL)
6544                 return (B_FALSE);
6545
6546         /*
6547          * Attempt to locate the trace buffer for this obj and thread.
6548          * If it does not exist, then allocate a new trace buffer and
6549          * insert into the hash.
6550          */
6551         if (mod_hash_find(mh, (mod_hash_key_t)obj, &val) == MH_ERR_NOTFOUND) {
6552                 th_trace = kmem_zalloc(sizeof (th_trace_t), KM_NOSLEEP);
6553                 if (th_trace == NULL)
6554                         return (B_FALSE);
6555
6556                 th_trace->th_id = curthread;
6557                 if (mod_hash_insert(mh, (mod_hash_key_t)obj,
6558                     (mod_hash_val_t)th_trace) != 0) {
6559                         kmem_free(th_trace, sizeof (th_trace_t));
6560                         return (B_FALSE);
6561                 }
6562         } else {
6563                 th_trace = (th_trace_t *)val;
6564         }
6565
6566         ASSERT(th_trace->th_refcnt >= 0 &&
6567             th_trace->th_refcnt < TR_BUF_MAX - 1);
6568
6569         th_trace->th_refcnt++;
6570         th_trace_rrecord(th_trace);
6571         return (B_TRUE);
6572 }
6573
6574 /*
6575  * For the purpose of tracing a reference release, we assume that global
6576  * tracing is always on and that the same thread initiated the reference hold
6577  * is releasing.
6578  */
6579 void
6580 th_trace_unref(const void *obj)
6581 {
6582         int retv;
6583         mod_hash_t *mh;
6584         th_trace_t *th_trace;
6585         mod_hash_val_t val;
6586
6587         mh = th_trace_gethash(NULL);
6588         retv = mod_hash_find(mh, (mod_hash_key_t)obj, &val);
6589         ASSERT(retv == 0);
6590         th_trace = (th_trace_t *)val;
6591
6592         ASSERT(th_trace->th_refcnt > 0);
6593         th_trace->th_refcnt--;
6594         th_trace_rrecord(th_trace);
6595 }
6596
6597 /*
6598  * If tracing has been disabled, then we assume that the reference counts are
6599  * now useless, and we clear them out before destroying the entries.
6600  */
6601 void
6602 th_trace_cleanup(const void *obj, boolean_t trace_disable)
6603 {
6604         th_hash_t       *thh;
6605         mod_hash_t      *mh;
6606         mod_hash_val_t  val;
6607         th_trace_t      *th_trace;
6608         int             retv;
6609
6610         rw_enter(&ip_thread_rwlock, RW_READER);
6611         for (thh = list_head(&ip_thread_list); thh != NULL;
6612             thh = list_next(&ip_thread_list, thh)) {
6613                 if (mod_hash_find(mh = thh->thh_hash, (mod_hash_key_t)obj,
6614                     &val) == 0) {
6615                         th_trace = (th_trace_t *)val;
6616                         if (trace_disable)
6617                                 th_trace->th_refcnt = 0;
6618                         retv = mod_hash_destroy(mh, (mod_hash_key_t)obj);
6619                         ASSERT(retv == 0);
6620                 }
6621         }
6622         rw_exit(&ip_thread_rwlock);
6623 }
6624
6625 void
6626 ipif_trace_ref(ipif_t *ipif)
6627 {
6628         ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
6629
6630         if (ipif->ipif_trace_disable)
6631                 return;
6632
6633         if (!th_trace_ref(ipif, ipif->ipif_ill->ill_ipst)) {
6634                 ipif->ipif_trace_disable = B_TRUE;
6635                 ipif_trace_cleanup(ipif);
6636         }
6637 }
6638
6639 void
6640 ipif_untrace_ref(ipif_t *ipif)
6641 {
6642         ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
6643
6644         if (!ipif->ipif_trace_disable)
6645                 th_trace_unref(ipif);
6646 }
6647
6648 void
6649 ill_trace_ref(ill_t *ill)
6650 {
6651         ASSERT(MUTEX_HELD(&ill->ill_lock));
6652
6653         if (ill->ill_trace_disable)
6654                 return;
6655
6656         if (!th_trace_ref(ill, ill->ill_ipst)) {
6657                 ill->ill_trace_disable = B_TRUE;
6658                 ill_trace_cleanup(ill);
6659         }
6660 }
6661
6662 void
6663 ill_untrace_ref(ill_t *ill)
6664 {
6665         ASSERT(MUTEX_HELD(&ill->ill_lock));
6666
6667         if (!ill->ill_trace_disable)
6668                 th_trace_unref(ill);
6669 }
6670
6671 /*
6672  * Called when ipif is unplumbed or when memory alloc fails.  Note that on
6673  * failure, ipif_trace_disable is set.
6674  */
6675 static void
6676 ipif_trace_cleanup(const ipif_t *ipif)
6677 {
6678         th_trace_cleanup(ipif, ipif->ipif_trace_disable);
6679 }
6680
6681 /*
6682  * Called when ill is unplumbed or when memory alloc fails.  Note that on
6683  * failure, ill_trace_disable is set.
6684  */
6685 static void
6686 ill_trace_cleanup(const ill_t *ill)
6687 {
6688         th_trace_cleanup(ill, ill->ill_trace_disable);
6689 }
6690 #endif /* DEBUG */
6691
6692 void
6693 ipif_refhold_locked(ipif_t *ipif)
6694 {
6695         ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
6696         ipif->ipif_refcnt++;
6697         IPIF_TRACE_REF(ipif);
6698 }
6699
6700 void
6701 ipif_refhold(ipif_t *ipif)
6702 {
6703         ill_t   *ill;
6704
6705         ill = ipif->ipif_ill;
6706         mutex_enter(&ill->ill_lock);
6707         ipif->ipif_refcnt++;
6708         IPIF_TRACE_REF(ipif);
6709         mutex_exit(&ill->ill_lock);
6710 }
6711
6712 /*
6713  * Must not be called while holding any locks. Otherwise if this is
6714  * the last reference to be released there is a chance of recursive mutex
6715  * panic due to ipif_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying
6716  * to restart an ioctl.
6717  */
6718 void
6719 ipif_refrele(ipif_t *ipif)
6720 {
6721         ill_t   *ill;
6722
6723         ill = ipif->ipif_ill;
6724
6725         mutex_enter(&ill->ill_lock);
6726         ASSERT(ipif->ipif_refcnt != 0);
6727         ipif->ipif_refcnt--;
6728         IPIF_UNTRACE_REF(ipif);
6729         if (ipif->ipif_refcnt != 0) {
6730                 mutex_exit(&ill->ill_lock);
6731                 return;
6732         }
6733
6734         /* Drops the ill_lock */
6735         ipif_ill_refrele_tail(ill);
6736 }
6737
6738 ipif_t *
6739 ipif_get_next_ipif(ipif_t *curr, ill_t *ill)
6740 {
6741         ipif_t  *ipif;
6742
6743         mutex_enter(&ill->ill_lock);
6744         for (ipif = (curr == NULL ? ill->ill_ipif : curr->ipif_next);
6745             ipif != NULL; ipif = ipif->ipif_next) {
6746                 if (!IPIF_CAN_LOOKUP(ipif))
6747                         continue;
6748                 ipif_refhold_locked(ipif);
6749                 mutex_exit(&ill->ill_lock);
6750                 return (ipif);
6751         }
6752         mutex_exit(&ill->ill_lock);
6753         return (NULL);
6754 }
6755
6756 /*
6757  * TODO: make this table extendible at run time
6758  * Return a pointer to the mac type info for 'mac_type'
6759  */
6760 static ip_m_t *
6761 ip_m_lookup(t_uscalar_t mac_type)
6762 {
6763         ip_m_t  *ipm;
6764
6765         for (ipm = ip_m_tbl; ipm < A_END(ip_m_tbl); ipm++)
6766                 if (ipm->ip_m_mac_type == mac_type)
6767                         return (ipm);
6768         return (NULL);
6769 }
6770
6771 /*
6772  * ip_rt_add is called to add an IPv4 route to the forwarding table.
6773  * ipif_arg is passed in to associate it with the correct interface.
6774  * We may need to restart this operation if the ipif cannot be looked up
6775  * due to an exclusive operation that is currently in progress. The restart
6776  * entry point is specified by 'func'
6777  */
6778 int
6779 ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr,
6780     ipaddr_t src_addr, int flags, ipif_t *ipif_arg, ire_t **ire_arg,
6781     boolean_t ioctl_msg, queue_t *q, mblk_t *mp, ipsq_func_t func,
6782     struct rtsa_s *sp, ip_stack_t *ipst)
6783 {
6784         ire_t   *ire;
6785         ire_t   *gw_ire = NULL;
6786         ipif_t  *ipif = NULL;
6787         boolean_t ipif_refheld = B_FALSE;
6788         uint_t  type;
6789         int     match_flags = MATCH_IRE_TYPE;
6790         int     error;
6791         tsol_gc_t *gc = NULL;
6792         tsol_gcgrp_t *gcgrp = NULL;
6793         boolean_t gcgrp_xtraref = B_FALSE;
6794
6795         ip1dbg(("ip_rt_add:"));
6796
6797         if (ire_arg != NULL)
6798                 *ire_arg = NULL;
6799
6800         /*
6801          * If this is the case of RTF_HOST being set, then we set the netmask
6802          * to all ones (regardless if one was supplied).
6803          */
6804         if (flags & RTF_HOST)
6805                 mask = IP_HOST_MASK;
6806
6807         /*
6808          * Prevent routes with a zero gateway from being created (since
6809          * interfaces can currently be plumbed and brought up no assigned
6810          * address).
6811          */
6812         if (gw_addr == 0)
6813                 return (ENETUNREACH);
6814         /*
6815          * Get the ipif, if any, corresponding to the gw_addr
6816          */
6817         ipif = ipif_lookup_interface(gw_addr, dst_addr, q, mp, func, &error,
6818             ipst);
6819         if (ipif != NULL) {
6820                 if (IS_VNI(ipif->ipif_ill)) {
6821                         ipif_refrele(ipif);
6822                         return (EINVAL);
6823                 }
6824                 ipif_refheld = B_TRUE;
6825         } else if (error == EINPROGRESS) {
6826                 ip1dbg(("ip_rt_add: null and EINPROGRESS"));
6827                 return (EINPROGRESS);
6828         } else {
6829                 error = 0;
6830         }
6831
6832         if (ipif != NULL) {
6833                 ip1dbg(("ip_rt_add: ipif_lookup_interface done ipif nonnull"));
6834                 ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock));
6835         } else {
6836                 ip1dbg(("ip_rt_add: ipif_lookup_interface done ipif is null"));
6837         }
6838
6839         /*
6840          * GateD will attempt to create routes with a loopback interface
6841          * address as the gateway and with RTF_GATEWAY set.  We allow
6842          * these routes to be added, but create them as interface routes
6843          * since the gateway is an interface address.
6844          */
6845         if ((ipif != NULL) && (ipif->ipif_ire_type == IRE_LOOPBACK)) {
6846                 flags &= ~RTF_GATEWAY;
6847                 if (gw_addr == INADDR_LOOPBACK && dst_addr == INADDR_LOOPBACK &&
6848                     mask == IP_HOST_MASK) {
6849                         ire = ire_ctable_lookup(dst_addr, 0, IRE_LOOPBACK, ipif,
6850                             ALL_ZONES, NULL, match_flags, ipst);
6851                         if (ire != NULL) {
6852                                 ire_refrele(ire);
6853                                 if (ipif_refheld)
6854                                         ipif_refrele(ipif);
6855                                 return (EEXIST);
6856                         }
6857                         ip1dbg(("ipif_up_done: 0x%p creating IRE 0x%x"
6858                             "for 0x%x\n", (void *)ipif,
6859                             ipif->ipif_ire_type,
6860                             ntohl(ipif->ipif_lcl_addr)));
6861                         ire = ire_create(
6862                             (uchar_t *)&dst_addr,       /* dest address */
6863                             (uchar_t *)&mask,           /* mask */
6864                             (uchar_t *)&ipif->ipif_src_addr,
6865                             NULL,                       /* no gateway */
6866                             &ipif->ipif_mtu,
6867                             NULL,
6868                             ipif->ipif_rq,              /* recv-from queue */
6869                             NULL,                       /* no send-to queue */
6870                             ipif->ipif_ire_type,        /* LOOPBACK */
6871                             ipif,
6872                             0,
6873                             0,
6874                             0,
6875                             (ipif->ipif_flags & IPIF_PRIVATE) ?
6876                             RTF_PRIVATE : 0,
6877                             &ire_uinfo_null,
6878                             NULL,
6879                             NULL,
6880                             ipst);
6881
6882                         if (ire == NULL) {
6883                                 if (ipif_refheld)
6884                                         ipif_refrele(ipif);
6885                                 return (ENOMEM);
6886                         }
6887                         error = ire_add(&ire, q, mp, func, B_FALSE);
6888                         if (error == 0)
6889                                 goto save_ire;
6890                         if (ipif_refheld)
6891                                 ipif_refrele(ipif);
6892                         return (error);
6893
6894                 }
6895         }
6896
6897         /*
6898          * Traditionally, interface routes are ones where RTF_GATEWAY isn't set
6899          * and the gateway address provided is one of the system's interface
6900          * addresses.  By using the routing socket interface and supplying an
6901          * RTA_IFP sockaddr with an interface index, an alternate method of
6902          * specifying an interface route to be created is available which uses
6903          * the interface index that specifies the outgoing interface rather than
6904          * the address of an outgoing interface (which may not be able to
6905          * uniquely identify an interface).  When coupled with the RTF_GATEWAY
6906          * flag, routes can be specified which not only specify the next-hop to
6907          * be used when routing to a certain prefix, but also which outgoing
6908          * interface should be used.
6909          *
6910          * Previously, interfaces would have unique addresses assigned to them
6911          * and so the address assigned to a particular interface could be used
6912          * to identify a particular interface.  One exception to this was the
6913          * case of an unnumbered interface (where IPIF_UNNUMBERED was set).
6914          *
6915          * With the advent of IPv6 and its link-local addresses, this
6916          * restriction was relaxed and interfaces could share addresses between
6917          * themselves.  In fact, typically all of the link-local interfaces on
6918          * an IPv6 node or router will have the same link-local address.  In
6919          * order to differentiate between these interfaces, the use of an
6920          * interface index is necessary and this index can be carried inside a
6921          * RTA_IFP sockaddr (which is actually a sockaddr_dl).  One restriction
6922          * of using the interface index, however, is that all of the ipif's that
6923          * are part of an ill have the same index and so the RTA_IFP sockaddr
6924          * cannot be used to differentiate between ipif's (or logical
6925          * interfaces) that belong to the same ill (physical interface).
6926          *
6927          * For example, in the following case involving IPv4 interfaces and
6928          * logical interfaces
6929          *
6930          *      192.0.2.32      255.255.255.224 192.0.2.33      U       if0
6931          *      192.0.2.32      255.255.255.224 192.0.2.34      U       if0:1
6932          *      192.0.2.32      255.255.255.224 192.0.2.35      U       if0:2
6933          *
6934          * the ipif's corresponding to each of these interface routes can be
6935          * uniquely identified by the "gateway" (actually interface address).
6936          *
6937          * In this case involving multiple IPv6 default routes to a particular
6938          * link-local gateway, the use of RTA_IFP is necessary to specify which
6939          * default route is of interest:
6940          *
6941          *      default         fe80::123:4567:89ab:cdef        U       if0
6942          *      default         fe80::123:4567:89ab:cdef        U       if1
6943          */
6944
6945         /* RTF_GATEWAY not set */
6946         if (!(flags & RTF_GATEWAY)) {
6947                 queue_t *stq;
6948
6949                 if (sp != NULL) {
6950                         ip2dbg(("ip_rt_add: gateway security attributes "
6951                             "cannot be set with interface route\n"));
6952                         if (ipif_refheld)
6953                                 ipif_refrele(ipif);
6954                         return (EINVAL);
6955                 }
6956
6957                 /*
6958                  * As the interface index specified with the RTA_IFP sockaddr is
6959                  * the same for all ipif's off of an ill, the matching logic
6960                  * below uses MATCH_IRE_ILL if such an index was specified.
6961                  * This means that routes sharing the same prefix when added
6962                  * using a RTA_IFP sockaddr must have distinct interface
6963                  * indices (namely, they must be on distinct ill's).
6964                  *
6965                  * On the other hand, since the gateway address will usually be
6966                  * different for each ipif on the system, the matching logic
6967                  * uses MATCH_IRE_IPIF in the case of a traditional interface
6968                  * route.  This means that interface routes for the same prefix
6969                  * can be created if they belong to distinct ipif's and if a
6970                  * RTA_IFP sockaddr is not present.
6971                  */
6972                 if (ipif_arg != NULL) {
6973                         if (ipif_refheld)  {
6974                                 ipif_refrele(ipif);
6975                                 ipif_refheld = B_FALSE;
6976                         }
6977                         ipif = ipif_arg;
6978                         match_flags |= MATCH_IRE_ILL;
6979                 } else {
6980                         /*
6981                          * Check the ipif corresponding to the gw_addr
6982                          */
6983                         if (ipif == NULL)
6984                                 return (ENETUNREACH);
6985                         match_flags |= MATCH_IRE_IPIF;
6986                 }
6987                 ASSERT(ipif != NULL);
6988
6989                 /*
6990                  * We check for an existing entry at this point.
6991                  *
6992                  * Since a netmask isn't passed in via the ioctl interface
6993                  * (SIOCADDRT), we don't check for a matching netmask in that
6994                  * case.
6995                  */
6996                 if (!ioctl_msg)
6997                         match_flags |= MATCH_IRE_MASK;
6998                 ire = ire_ftable_lookup(dst_addr, mask, 0, IRE_INTERFACE, ipif,
6999                     NULL, ALL_ZONES, 0, NULL, match_flags, ipst);
7000                 if (ire != NULL) {
7001                         ire_refrele(ire);
7002                         if (ipif_refheld)
7003                                 ipif_refrele(ipif);
7004                         return (EEXIST);
7005                 }
7006
7007                 stq = (ipif->ipif_net_type == IRE_IF_RESOLVER)
7008                     ? ipif->ipif_rq : ipif->ipif_wq;
7009
7010                 /*
7011                  * Create a copy of the IRE_LOOPBACK,
7012                  * IRE_IF_NORESOLVER or IRE_IF_RESOLVER with
7013                  * the modified address and netmask.
7014                  */
7015                 ire = ire_create(
7016                     (uchar_t *)&dst_addr,
7017                     (uint8_t *)&mask,
7018                     (uint8_t *)&ipif->ipif_src_addr,
7019                     NULL,
7020                     &ipif->ipif_mtu,
7021                     NULL,
7022                     NULL,
7023                     stq,
7024                     ipif->ipif_net_type,
7025                     ipif,
7026                     0,
7027                     0,
7028                     0,
7029                     flags,
7030                     &ire_uinfo_null,
7031                     NULL,
7032                     NULL,
7033                     ipst);
7034                 if (ire == NULL) {
7035                         if (ipif_refheld)
7036                                 ipif_refrele(ipif);
7037                         return (ENOMEM);
7038                 }
7039
7040                 /*
7041                  * Some software (for example, GateD and Sun Cluster) attempts
7042                  * to create (what amount to) IRE_PREFIX routes with the
7043                  * loopback address as the gateway.  This is primarily done to
7044                  * set up prefixes with the RTF_REJECT flag set (for example,
7045                  * when generating aggregate routes.)
7046                  *
7047                  * If the IRE type (as defined by ipif->ipif_net_type) is
7048                  * IRE_LOOPBACK, then we map the request into a
7049                  * IRE_IF_NORESOLVER.
7050                  *
7051                  * Needless to say, the real IRE_LOOPBACK is NOT created by this
7052                  * routine, but rather using ire_create() directly.
7053                  *
7054                  */
7055                 if (ipif->ipif_net_type == IRE_LOOPBACK)
7056                         ire->ire_type = IRE_IF_NORESOLVER;
7057
7058                 error = ire_add(&ire, q, mp, func, B_FALSE);
7059                 if (error == 0)
7060                         goto save_ire;
7061
7062                 /*
7063                  * In the event of failure, ire_add() will have already
7064                  * deleted the ire in question, so there is no need to
7065                  * do that here.
7066                  */
7067                 if (ipif_refheld)
7068                         ipif_refrele(ipif);
7069                 return (error);
7070         }
7071         if (ipif_refheld) {
7072                 ipif_refrele(ipif);
7073                 ipif_refheld = B_FALSE;
7074         }
7075
7076         /*
7077          * Get an interface IRE for the specified gateway.
7078          * If we don't have an IRE_IF_NORESOLVER or IRE_IF_RESOLVER for the
7079          * gateway, it is currently unreachable and we fail the request
7080          * accordingly.
7081          */
7082         ipif = ipif_arg;
7083         if (ipif_arg != NULL)
7084                 match_flags |= MATCH_IRE_ILL;
7085         gw_ire = ire_ftable_lookup(gw_addr, 0, 0, IRE_INTERFACE, ipif_arg, NULL,
7086             ALL_ZONES, 0, NULL, match_flags, ipst);
7087         if (gw_ire == NULL)
7088                 return (ENETUNREACH);
7089
7090         /*
7091          * We create one of three types of IREs as a result of this request
7092          * based on the netmask.  A netmask of all ones (which is automatically
7093          * assumed when RTF_HOST is set) results in an IRE_HOST being created.
7094          * An all zeroes netmask implies a default route so an IRE_DEFAULT is
7095          * created.  Otherwise, an IRE_PREFIX route is created for the
7096          * destination prefix.
7097          */
7098         if (mask == IP_HOST_MASK)
7099                 type = IRE_HOST;
7100         else if (mask == 0)
7101                 type = IRE_DEFAULT;
7102         else
7103                 type = IRE_PREFIX;
7104
7105         /* check for a duplicate entry */
7106         ire = ire_ftable_lookup(dst_addr, mask, gw_addr, type, ipif_arg,
7107             NULL, ALL_ZONES, 0, NULL,
7108             match_flags | MATCH_IRE_MASK | MATCH_IRE_GW, ipst);
7109         if (ire != NULL) {
7110                 ire_refrele(gw_ire);
7111                 ire_refrele(ire);
7112                 return (EEXIST);
7113         }
7114
7115         /* Security attribute exists */
7116         if (sp != NULL) {
7117                 tsol_gcgrp_addr_t ga;
7118
7119                 /* find or create the gateway credentials group */
7120                 ga.ga_af = AF_INET;
7121                 IN6_IPADDR_TO_V4MAPPED(gw_addr, &ga.ga_addr);
7122
7123                 /* we hold reference to it upon success */
7124                 gcgrp = gcgrp_lookup(&ga, B_TRUE);
7125                 if (gcgrp == NULL) {
7126                         ire_refrele(gw_ire);
7127                         return (ENOMEM);
7128                 }
7129
7130                 /*
7131                  * Create and add the security attribute to the group; a
7132                  * reference to the group is made upon allocating a new
7133                  * entry successfully.  If it finds an already-existing
7134                  * entry for the security attribute in the group, it simply
7135                  * returns it and no new reference is made to the group.
7136                  */
7137                 gc = gc_create(sp, gcgrp, &gcgrp_xtraref);
7138                 if (gc == NULL) {
7139                         /* release reference held by gcgrp_lookup */
7140                         GCGRP_REFRELE(gcgrp);
7141                         ire_refrele(gw_ire);
7142                         return (ENOMEM);
7143                 }
7144         }
7145
7146         /* Create the IRE. */
7147         ire = ire_create(
7148             (uchar_t *)&dst_addr,               /* dest address */
7149             (uchar_t *)&mask,                   /* mask */
7150             /* src address assigned by the caller? */
7151             (uchar_t *)(((src_addr != INADDR_ANY) &&
7152             (flags & RTF_SETSRC)) ?  &src_addr : NULL),
7153             (uchar_t *)&gw_addr,                /* gateway address */
7154             &gw_ire->ire_max_frag,
7155             NULL,                               /* no src nce */
7156             NULL,                               /* no recv-from queue */
7157             NULL,                               /* no send-to queue */
7158             (ushort_t)type,                     /* IRE type */
7159             ipif_arg,
7160             0,
7161             0,
7162             0,
7163             flags,
7164             &gw_ire->ire_uinfo,                 /* Inherit ULP info from gw */
7165             gc,                                 /* security attribute */
7166             NULL,
7167             ipst);
7168
7169         /*
7170          * The ire holds a reference to the 'gc' and the 'gc' holds a
7171          * reference to the 'gcgrp'. We can now release the extra reference
7172          * the 'gcgrp' acquired in the gcgrp_lookup, if it was not used.
7173          */
7174         if (gcgrp_xtraref)
7175                 GCGRP_REFRELE(gcgrp);
7176         if (ire == NULL) {
7177                 if (gc != NULL)
7178                         GC_REFRELE(gc);
7179                 ire_refrele(gw_ire);
7180                 return (ENOMEM);
7181         }
7182
7183         /*
7184          * POLICY: should we allow an RTF_HOST with address INADDR_ANY?
7185          * SUN/OS socket stuff does but do we really want to allow 0.0.0.0?
7186          */
7187
7188         /* Add the new IRE. */
7189         error = ire_add(&ire, q, mp, func, B_FALSE);
7190         if (error != 0) {
7191                 /*
7192                  * In the event of failure, ire_add() will have already
7193                  * deleted the ire in question, so there is no need to
7194                  * do that here.
7195                  */
7196                 ire_refrele(gw_ire);
7197                 return (error);
7198         }
7199
7200         if (flags & RTF_MULTIRT) {
7201                 /*
7202                  * Invoke the CGTP (multirouting) filtering module
7203                  * to add the dst address in the filtering database.
7204                  * Replicated inbound packets coming from that address
7205                  * will be filtered to discard the duplicates.
7206                  * It is not necessary to call the CGTP filter hook
7207                  * when the dst address is a broadcast or multicast,
7208                  * because an IP source address cannot be a broadcast
7209                  * or a multicast.
7210                  */
7211                 ire_t *ire_dst = ire_ctable_lookup(ire->ire_addr, 0,
7212                     IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
7213                 if (ire_dst != NULL) {
7214                         ip_cgtp_bcast_add(ire, ire_dst, ipst);
7215                         ire_refrele(ire_dst);
7216                         goto save_ire;
7217                 }
7218                 if (ipst->ips_ip_cgtp_filter_ops != NULL &&
7219                     !CLASSD(ire->ire_addr)) {
7220                         int res = ipst->ips_ip_cgtp_filter_ops->cfo_add_dest_v4(
7221                             ipst->ips_netstack->netstack_stackid,
7222                             ire->ire_addr,
7223                             ire->ire_gateway_addr,
7224                             ire->ire_src_addr,
7225                             gw_ire->ire_src_addr);
7226                         if (res != 0) {
7227                                 ire_refrele(gw_ire);
7228                                 ire_delete(ire);
7229                                 return (res);
7230                         }
7231                 }
7232         }
7233
7234         /*
7235          * Now that the prefix IRE entry has been created, delete any
7236          * existing gateway IRE cache entries as well as any IRE caches
7237          * using the gateway, and force them to be created through
7238          * ip_newroute.
7239          */
7240         if (gc != NULL) {
7241                 ASSERT(gcgrp != NULL);
7242                 ire_clookup_delete_cache_gw(gw_addr, ALL_ZONES, ipst);
7243         }
7244
7245 save_ire:
7246         if (gw_ire != NULL) {
7247                 ire_refrele(gw_ire);
7248         }
7249         if (ipif != NULL) {
7250                 /*
7251                  * Save enough information so that we can recreate the IRE if
7252                  * the interface goes down and then up.  The metrics associated
7253                  * with the route will be saved as well when rts_setmetrics() is
7254                  * called after the IRE has been created.  In the case where
7255                  * memory cannot be allocated, none of this information will be
7256                  * saved.
7257                  */
7258                 ipif_save_ire(ipif, ire);
7259         }
7260         if (ioctl_msg)
7261                 ip_rts_rtmsg(RTM_OLDADD, ire, 0, ipst);
7262         if (ire_arg != NULL) {
7263                 /*
7264                  * Store the ire that was successfully added into where ire_arg
7265                  * points to so that callers don't have to look it up
7266                  * themselves (but they are responsible for ire_refrele()ing
7267                  * the ire when they are finished with it).
7268                  */
7269                 *ire_arg = ire;
7270         } else {
7271                 ire_refrele(ire);               /* Held in ire_add */
7272         }
7273         if (ipif_refheld)
7274                 ipif_refrele(ipif);
7275         return (0);
7276 }
7277
7278 /*
7279  * ip_rt_delete is called to delete an IPv4 route.
7280  * ipif_arg is passed in to associate it with the correct interface.
7281  * We may need to restart this operation if the ipif cannot be looked up
7282  * due to an exclusive operation that is currently in progress. The restart
7283  * entry point is specified by 'func'
      *
      * Lookup order: when ipif_lookup_interface() finds an ipif for gw_addr,
      * an interface route for dst_addr is tried first (preceded by an
      * IRE_LOOPBACK cache lookup when the ipif's ipif_ire_type is
      * IRE_LOOPBACK).  If that yields nothing, a gateway route of type
      * IRE_HOST, IRE_DEFAULT or IRE_PREFIX (selected from the mask) is looked
      * up with MATCH_IRE_GW.
      *
      * Returns EINPROGRESS if the ipif lookup reported that error, ESRCH if
      * no matching route was found, and otherwise 0 or the value returned by
      * the CGTP filter's cfo_del_dest_v4 callback; the route is deleted in
      * either of the latter two cases.  When ioctl_msg is set,
      * ip_rts_rtmsg(RTM_OLDDEL, ...) is called for the route before it is
      * deleted.
7284  */
7285 /* ARGSUSED4 */
7286 int
7287 ip_rt_delete(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr,
7288     uint_t rtm_addrs, int flags, ipif_t *ipif_arg, boolean_t ioctl_msg,
7289     queue_t *q, mblk_t *mp, ipsq_func_t func, ip_stack_t *ipst)
7290 {
7291         ire_t   *ire = NULL;
7292         ipif_t  *ipif;
7293         boolean_t ipif_refheld = B_FALSE;
7294         uint_t  type;
7295         uint_t  match_flags = MATCH_IRE_TYPE;
7296         int     err = 0;
7297
7298         ip1dbg(("ip_rt_delete:"));
7299         /*
7300          * If this is the case of RTF_HOST being set, then we set the netmask
7301          * to all ones.  Otherwise, we use the netmask if one was supplied.
7302          */
7303         if (flags & RTF_HOST) {
7304                 mask = IP_HOST_MASK;
7305                 match_flags |= MATCH_IRE_MASK;
7306         } else if (rtm_addrs & RTA_NETMASK) {
7307                 match_flags |= MATCH_IRE_MASK;
7308         }
7309
7310         /*
7311          * Note that RTF_GATEWAY is never set on a delete, therefore
7312          * we check if the gateway address is one of our interfaces first,
7313          * and fall back on RTF_GATEWAY routes.
7314          *
7315          * This makes it possible to delete an original
7316          * IRE_IF_NORESOLVER/IRE_IF_RESOLVER - consistent with SunOS 4.1.
7317          *
7318          * As the interface index specified with the RTA_IFP sockaddr is the
7319          * same for all ipif's off of an ill, the matching logic below uses
7320          * MATCH_IRE_ILL if such an index was specified.  This means a route
7321          * sharing the same prefix and interface index as the route
7322          * intended to be deleted might be deleted instead if a RTA_IFP sockaddr
7323          * is specified in the request.
7324          *
7325          * On the other hand, since the gateway address will usually be
7326          * different for each ipif on the system, the matching logic
7327          * uses MATCH_IRE_IPIF in the case of a traditional interface
7328          * route.  This means that interface routes for the same prefix can be
7329          * uniquely identified if they belong to distinct ipif's and if a
7330          * RTA_IFP sockaddr is not present.
7331          *
7332          * For more detail on specifying routes by gateway address and by
7333          * interface index, see the comments in ip_rt_add().
7334          */
7335         ipif = ipif_lookup_interface(gw_addr, dst_addr, q, mp, func, &err,
7336             ipst);
             /*
              * A non-NULL ipif is released below via ipif_refheld.  Any lookup
              * error other than EINPROGRESS is discarded so that the gateway
              * route lookup further down still gets a chance.
              */
7337         if (ipif != NULL)
7338                 ipif_refheld = B_TRUE;
7339         else if (err == EINPROGRESS)
7340                 return (err);
7341         else
7342                 err = 0;
7343         if (ipif != NULL) {
7344                 if (ipif_arg != NULL) {
                             /* The caller's ipif_arg replaces the looked-up ipif. */
7345                         if (ipif_refheld) {
7346                                 ipif_refrele(ipif);
7347                                 ipif_refheld = B_FALSE;
7348                         }
7349                         ipif = ipif_arg;
7350                         match_flags |= MATCH_IRE_ILL;
7351                 } else {
7352                         match_flags |= MATCH_IRE_IPIF;
7353                 }
7354                 if (ipif->ipif_ire_type == IRE_LOOPBACK) {
7355                         ire = ire_ctable_lookup(dst_addr, 0, IRE_LOOPBACK, ipif,
7356                             ALL_ZONES, NULL, match_flags, ipst);
7357                 }
7358                 if (ire == NULL) {
7359                         ire = ire_ftable_lookup(dst_addr, mask, 0,
7360                             IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, NULL,
7361                             match_flags, ipst);
7362                 }
7363         }
7364
7365         if (ire == NULL) {
7366                 /*
7367                  * At this point, the gateway address is not one of our own
7368                  * addresses or a matching interface route was not found.  We
7369                  * set the IRE type to lookup based on whether
7370                  * this is a host route, a default route or just a prefix.
7371                  *
7372                  * If an ipif_arg was passed in, then the lookup is based on an
7373                  * interface index so MATCH_IRE_ILL is added to match_flags.
7374                  * In any case, MATCH_IRE_IPIF is cleared and MATCH_IRE_GW is
7375                  * set as the route being looked up is not a traditional
7376                  * interface route.
7377                  */
7378                 match_flags &= ~MATCH_IRE_IPIF;
7379                 match_flags |= MATCH_IRE_GW;
7380                 if (ipif_arg != NULL)
7381                         match_flags |= MATCH_IRE_ILL;
7382                 if (mask == IP_HOST_MASK)
7383                         type = IRE_HOST;
7384                 else if (mask == 0)
7385                         type = IRE_DEFAULT;
7386                 else
7387                         type = IRE_PREFIX;
7388                 ire = ire_ftable_lookup(dst_addr, mask, gw_addr, type, ipif_arg,
7389                     NULL, ALL_ZONES, 0, NULL, match_flags, ipst);
7390         }
7391
7392         if (ipif_refheld)
7393                 ipif_refrele(ipif);
7394
7395         /* ipif is not refheld anymore */
7396         if (ire == NULL)
7397                 return (ESRCH);
7398
7399         if (ire->ire_flags & RTF_MULTIRT) {
7400                 /*
7401                  * Invoke the CGTP (multirouting) filtering module
7402                  * to remove the dst address from the filtering database.
7403                  * Packets coming from that address will no longer be
7404                  * filtered to remove duplicates.
7405                  */
7406                 if (ipst->ips_ip_cgtp_filter_ops != NULL) {
                             /* err is only reported; the delete still proceeds. */
7407                         err = ipst->ips_ip_cgtp_filter_ops->cfo_del_dest_v4(
7408                             ipst->ips_netstack->netstack_stackid,
7409                             ire->ire_addr, ire->ire_gateway_addr);
7410                 }
7411                 ip_cgtp_bcast_delete(ire, ipst);
7412         }
7413
             /* From here on ipif is the one recorded in the ire itself. */
7414         ipif = ire->ire_ipif;
7415         if (ipif != NULL)
7416                 ipif_remove_ire(ipif, ire);
7417         if (ioctl_msg)
7418                 ip_rts_rtmsg(RTM_OLDDEL, ire, 0, ipst);
7419         ire_delete(ire);
             /* Release the reference on the ire found by the lookups above. */
7420         ire_refrele(ire);
7421         return (err);
7422 }
7423
7424 /*
7425  * Completion handler for the SIOCADDRT ioctl.  The destination and gateway
7426  * come from the struct rtentry in the ioctl's data block; the netmask is
7427  * all ones for RTF_HOST and is otherwise obtained from ip_subnet_mask().
7428  * The route is then added through ip_rt_add(), whose result is returned.
7429  * None of the arguments other than q and mp are used.
7430  */
7431 /* ARGSUSED */
7432 int
7433 ip_siocaddrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
7434     ip_ioctl_cmd_t *ipip, void *dummy_if_req)
7435 {
7436         struct rtentry *rte;
7437         mblk_t  *data_mp;
7438         ip_stack_t      *ipst;
7439         ipif_t  *mask_ipif = NULL;
7440         ipaddr_t dest;
7441         ipaddr_t gateway;
7442         ipaddr_t netmask;
7443         int     rval;
7444
7445         ASSERT(q->q_next == NULL);
7446         ipst = CONNQ_TO_IPST(q);
7447
7448         ip1dbg(("ip_siocaddrt:"));
7449         /* ip_wput_nondata has already verified that this mblk exists */
7450         data_mp = mp->b_cont->b_cont;
7451         rte = (struct rtentry *)data_mp->b_rptr;
7452
7453         dest = ((sin_t *)&rte->rt_dst)->sin_addr.s_addr;
7454         gateway = ((sin_t *)&rte->rt_gateway)->sin_addr.s_addr;
7455
7456         /*
7457          * Assigning a gateway to a single host (RTF_HOST) means an all-ones
7458          * netmask.  Otherwise the mask follows from dest and the interfaces
7459          * in use; ip_subnet_mask returns a zero mask for the all-zeroes
7460          * (default) destination.
7461          */
7462         if (!(rte->rt_flags & RTF_HOST))
7463                 netmask = ip_subnet_mask(dest, &mask_ipif, ipst);
7464         else
7465                 netmask = IP_HOST_MASK;
7466
7467         rval = ip_rt_add(dest, netmask, gateway, 0, rte->rt_flags, NULL, NULL,
7468             B_TRUE, q, mp, ip_process_ioctl, NULL, ipst);
7469
7470         /* Drop the ipif that ip_subnet_mask may have left in mask_ipif. */
7471         if (mask_ipif != NULL)
7472                 ipif_refrele(mask_ipif);
7473         return (rval);
7474 }
7475
7476 /*
7477  * Completion handler for the SIOCDELRT ioctl.  The destination and gateway
7478  * come from the struct rtentry in the ioctl's data block; the netmask is
7479  * all ones for RTF_HOST and is otherwise obtained from ip_subnet_mask().
7480  * ip_rt_delete() is then called with no ipif_arg, with ioctl_msg set and
7481  * with ip_process_ioctl as the restart function; its result is returned.
7482  * None of the arguments other than q and mp are used.
7483  */
7484 /* ARGSUSED */
7485 int
7486 ip_siocdelrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
7487     ip_ioctl_cmd_t *ipip, void *dummy_if_req)
7488 {
7489         struct rtentry *rte;
7490         mblk_t  *data_mp;
7491         ip_stack_t      *ipst;
7492         ipif_t  *mask_ipif = NULL;
7493         ipaddr_t dest;
7494         ipaddr_t gateway;
7495         ipaddr_t netmask;
7496         int     rval;
7497
7498         ASSERT(q->q_next == NULL);
7499         ipst = CONNQ_TO_IPST(q);
7500
7501         ip1dbg(("ip_siocdelrt:"));
7502         /* ip_wput_nondata has already verified that this mblk exists */
7503         data_mp = mp->b_cont->b_cont;
7504         rte = (struct rtentry *)data_mp->b_rptr;
7505
7506         dest = ((sin_t *)&rte->rt_dst)->sin_addr.s_addr;
7507         gateway = ((sin_t *)&rte->rt_gateway)->sin_addr.s_addr;
7508
7509         /*
7510          * Deleting a gateway to a single host (RTF_HOST) means an all-ones
7511          * netmask.  Otherwise the mask follows from dest and the interfaces
7512          * in use; ip_subnet_mask returns a zero mask for the all-zeroes
7513          * (default) destination.
7514          */
7515         if (!(rte->rt_flags & RTF_HOST))
7516                 netmask = ip_subnet_mask(dest, &mask_ipif, ipst);
7517         else
7518                 netmask = IP_HOST_MASK;
7519
7520         rval = ip_rt_delete(dest, netmask, gateway,
7521             RTA_DST | RTA_GATEWAY | RTA_NETMASK, rte->rt_flags, NULL, B_TRUE,
7522             q, mp, ip_process_ioctl, ipst);
7523         /* Drop the ipif that ip_subnet_mask may have left in mask_ipif. */
7524         if (mask_ipif != NULL)
7525                 ipif_refrele(mask_ipif);
7526         return (rval);
7527 }
7528
7529 /*
7530  * Append mp to one of the ipsq's two message lists, linking through b_next.
7531  * A CUR_OP message goes on the ipsq_mphead/ipsq_mptail list and a NEW_OP
7532  * message on the ipsq_xopq_mphead/ipsq_xopq_mptail list; any other type
7533  * ends in cmn_err(CE_PANIC).  The function to execute later is stored in
7534  * b_prev and the originating queue in b_queue.  If CONN_Q(q) holds and
7535  * pending_ill is non-NULL, pending_ill is saved in conn_oper_pending_ill.
7536  * The caller must hold ipsq_lock (and conn_lock in the latter case).
7537  */
7538 void
7539 ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type,
7540     ill_t *pending_ill)
7541 {
7542         conn_t  *connp;
7543
7544         ASSERT(MUTEX_HELD(&ipsq->ipsq_lock));
7545         ASSERT(func != NULL);
7546
7547         mp->b_next = NULL;
7548         mp->b_prev = (void *)func;
7549         mp->b_queue = q;
7550
7551         if (type == CUR_OP) {
7552                 if (ipsq->ipsq_mptail == NULL) {
7553                         ASSERT(ipsq->ipsq_mphead == NULL);
7554                         ipsq->ipsq_mphead = mp;
7555                 } else {
7556                         ASSERT(ipsq->ipsq_mphead != NULL);
7557                         ipsq->ipsq_mptail->b_next = mp;
7558                 }
7559                 ipsq->ipsq_mptail = mp;
7560         } else if (type == NEW_OP) {
7561                 if (ipsq->ipsq_xopq_mptail == NULL) {
7562                         ASSERT(ipsq->ipsq_xopq_mphead == NULL);
7563                         ipsq->ipsq_xopq_mphead = mp;
7564                 } else {
7565                         ASSERT(ipsq->ipsq_xopq_mphead != NULL);
7566                         ipsq->ipsq_xopq_mptail->b_next = mp;
7567                 }
7568                 ipsq->ipsq_xopq_mptail = mp;
7569         } else {
7570                 cmn_err(CE_PANIC, "ipsq_enq %d type \n", type);
7571         }
7572
7573         if (!CONN_Q(q) || pending_ill == NULL)
7574                 return;
7575
7576         connp = Q_TO_CONN(q);
7577         ASSERT(MUTEX_HELD(&connp->conn_lock));
7578         connp->conn_oper_pending_ill = pending_ill;
7579 }
7580
7581 /*
7582  * Remove and return the next message to process from the ipsq, or NULL.
7583  * Messages on the ipsq_mphead list are always handed out first.  Once that
7584  * list is empty, the ipsq_xopq_mphead list is consulted, but only if
7585  * ipsq_current_ipif is NULL: while it is set the current ioctl has not
7586  * completed and we resume when it does.  Caller holds ipsq_lock.
7587  */
7588 static mblk_t *
7589 ipsq_dq(ipsq_t *ipsq)
7590 {
7591         mblk_t  *first;
7592
7593         ASSERT(MUTEX_HELD(&ipsq->ipsq_lock));
7594
7595         first = ipsq->ipsq_mphead;
7596         if (first == NULL) {
7597                 /* The ipsq_mphead list is empty. */
7598                 if (ipsq->ipsq_current_ipif != NULL)
7599                         return (NULL);
7600
7601                 first = ipsq->ipsq_xopq_mphead;
7602                 if (first == NULL)
7603                         return (NULL);
7604                 ipsq->ipsq_xopq_mphead = first->b_next;
7605                 if (ipsq->ipsq_xopq_mphead == NULL)
7606                         ipsq->ipsq_xopq_mptail = NULL;
7607         } else {
7608                 ipsq->ipsq_mphead = first->b_next;
7609                 if (ipsq->ipsq_mphead == NULL)
7610                         ipsq->ipsq_mptail = NULL;
7611         }
7612         first->b_next = NULL;
7613         return (first);
7614 }
7615
7616 /*
7617  * Enter the ipsq corresponding to ill, by waiting synchronously till
7618  * we can enter the ipsq exclusively. Unless 'force' is used, the ipsq
7619  * will have to drain completely before ipsq_enter returns success.
7620  * ipsq_current_ipif will be set if some exclusive ioctl is in progress,
7621  * and the ipsq_exit logic will start the next enqueued ioctl after
7622  * completion of the current ioctl. If 'force' is used, we don't wait
7623  * for the enqueued ioctls. This is needed when a conn_close wants to
7624  * enter the ipsq and abort an ioctl that is somehow stuck. Unplumb
7625  * of an ill can also use this option. But we don't use it currently.
      *
      * Returns B_FALSE, without entering, if the ill is found to be
      * ILL_CONDEMNED; otherwise returns B_TRUE with curthread recorded as
      * ipsq_writer and ipsq_reentry_cnt incremented.  With 'force', at most
      * one timed wait (lbolt + ENTER_SQ_WAIT_TICKS) is made for
      * ipsq_current_ipif to clear; after that the ipsq is entered as soon as
      * it has no writer.
7626  */
7627 #define ENTER_SQ_WAIT_TICKS 100
7628 boolean_t
7629 ipsq_enter(ill_t *ill, boolean_t force)
7630 {
7631         ipsq_t  *ipsq;
7632         boolean_t waited_enough = B_FALSE;
7633
7634         /*
7635          * Holding the ill_lock prevents <ill-ipsq> assocs from changing.
7636          * Since the <ill-ipsq> assocs could change while we wait for the
7637          * writer, it is easier to wait on a fixed global rather than try to
7638          * cv_wait on a changing ipsq.
7639          */
7640         mutex_enter(&ill->ill_lock);
7641         for (;;) {
7642                 if (ill->ill_state_flags & ILL_CONDEMNED) {
7643                         mutex_exit(&ill->ill_lock);
7644                         return (B_FALSE);
7645                 }
7646
                     /* The ipsq is looked up afresh on every pass of the loop. */
7647                 ipsq = ill->ill_phyint->phyint_ipsq;
7648                 mutex_enter(&ipsq->ipsq_lock);
7649                 if (ipsq->ipsq_writer == NULL &&
7650                     (ipsq->ipsq_current_ipif == NULL || waited_enough)) {
                             /* Leave the loop with ill_lock and ipsq_lock held. */
7651                         break;
7652                 } else if (ipsq->ipsq_writer != NULL) {
                             /* ipsq_lock is dropped; only ill_lock covers the wait. */
7653                         mutex_exit(&ipsq->ipsq_lock);
7654                         cv_wait(&ill->ill_cv, &ill->ill_lock);
7655                 } else {
                             /* No writer, but ipsq_current_ipif is still set. */
7656                         mutex_exit(&ipsq->ipsq_lock);
7657                         if (force) {
7658                                 (void) cv_timedwait(&ill->ill_cv,
7659                                     &ill->ill_lock,
7660                                     lbolt + ENTER_SQ_WAIT_TICKS);
7661                                 waited_enough = B_TRUE;
7662                                 continue;
7663                         } else {
7664                                 cv_wait(&ill->ill_cv, &ill->ill_lock);
7665                         }
7666                 }
7667         }
7668
7669         ASSERT(ipsq->ipsq_mphead == NULL && ipsq->ipsq_mptail == NULL);
7670         ASSERT(ipsq->ipsq_reentry_cnt == 0);
7671         ipsq->ipsq_writer = curthread;
7672         ipsq->ipsq_reentry_cnt++;
7673 #ifdef DEBUG
7674         ipsq->ipsq_depth = getpcstack(ipsq->ipsq_stack, IPSQ_STACK_DEPTH);
7675 #endif
7676         mutex_exit(&ipsq->ipsq_lock);
7677         mutex_exit(&ill->ill_lock);
7678         return (B_TRUE);
7679 }
7680
/*
 * The ipsq_t (ipsq) is the synchronization data structure used to serialize
 * certain critical operations like plumbing (i.e. most set ioctls),
 * multicast joins, igmp/mld timers, IPMP operations etc. On a non-IPMP
 * system there is 1 ipsq per phyint. On an IPMP system there is 1 ipsq per
 * IPMP group. The ipsq serializes exclusive ioctls issued by applications
 * on a per ipsq basis in ipsq_xopq_mphead. It also protects against multiple
 * threads executing in the ipsq. Responses from the driver pertain to the
 * current ioctl (say a DL_BIND_ACK in response to a DL_BIND_REQUEST initiated
 * as part of bringing up the interface) and are enqueued in ipsq_mphead.
 *
 * If a thread does not want to reenter the ipsq when it is already writer,
 * it must make sure that neither the specified reentry point (which is called
 * later, when the ipsq is empty) nor any code path starting from that reentry
 * point ever tries to enter the ipsq again. Otherwise it can lead
 * to an infinite loop. The reentry point ip_rput_dlpi_writer is an example.
 * When the thread that is currently exclusive finishes, it (ipsq_exit)
 * dequeues the requests waiting to become exclusive in ipsq_mphead and calls
 * the reentry point. When the list at ipsq_mphead becomes empty ipsq_exit
 * proceeds to dequeue the next ioctl in ipsq_xopq_mphead and start the next
 * ioctl if the current ioctl has completed. If the current ioctl is still
 * in progress it simply returns. The current ioctl could be waiting for
 * a response from another module (arp or the driver), or could be waiting for
 * the ipif/ill/ire refcnts to drop to zero. In such a case the ipsq_pending_mp
 * and ipsq_pending_ipif are set. ipsq_current_ipif is set throughout the
 * execution of the ioctl and ipsq_exit does not start the next ioctl unless
 * ipsq_current_ipif is clear, which happens only on ioctl completion.
 */
7709
7710 /*
7711  * Try to enter the ipsq exclusively, corresponding to ipif or ill. (only 1 of
7712  * ipif or ill can be specified). The caller ensures ipif or ill is valid by
7713  * ref-holding it if necessary. If the ipsq cannot be entered, the mp is queued
7714  * completion.
7715  */
7716 ipsq_t *
7717 ipsq_try_enter(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp,
7718     ipsq_func_t func, int type, boolean_t reentry_ok)
7719 {
7720         ipsq_t  *ipsq;
7721
7722         /* Only 1 of ipif or ill can be specified */
7723         ASSERT((ipif != NULL) ^ (ill != NULL));
7724         if (ipif != NULL)
7725                 ill = ipif->ipif_ill;
7726
7727         /*
7728          * lock ordering ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock
7729          * ipsq of an ill can't change when ill_lock is held.
7730          */
7731         GRAB_CONN_LOCK(q);
7732         mutex_enter(&ill->ill_lock);
7733         ipsq = ill->ill_phyint->phyint_ipsq;
7734         mutex_enter(&ipsq->ipsq_lock);
7735
7736         /*
7737          * 1. Enter the ipsq if we are already writer and reentry is ok.
7738          *    (Note: If the caller does not specify reentry_ok then neither
7739          *    'func' nor any of its callees must ever attempt to enter the ipsq
7740          *    again. Otherwise it can lead to an infinite loop
7741          * 2. Enter the ipsq if there is no current writer and this attempted
7742          *    entry is part of the current ioctl or operation
7743          * 3. Enter the ipsq if there is no current writer and this is a new
7744          *    ioctl (or operation) and the ioctl (or operation) queue is
7745          *    empty and there is no ioctl (or operation) currently in progress
7746          */
7747         if ((ipsq->ipsq_writer == NULL && ((type == CUR_OP) ||
7748             (type == NEW_OP && ipsq->ipsq_xopq_mphead == NULL &&
7749             ipsq->ipsq_current_ipif == NULL))) ||
7750             (ipsq->ipsq_writer == curthread && reentry_ok)) {
7751                 /* Success. */
7752                 ipsq->ipsq_reentry_cnt++;
7753                 ipsq->ipsq_writer = curthread;
7754                 mutex_exit(&ipsq->ipsq_lock);
7755                 mutex_exit(&ill->ill_lock);
7756                 RELEASE_CONN_LOCK(q);
7757 #ifdef DEBUG
7758                 ipsq->ipsq_depth = getpcstack(ipsq->ipsq_stack,
7759                     IPSQ_STACK_DEPTH);
7760 #endif
7761                 return (ipsq);
7762         }
7763
7764         ipsq_enq(ipsq, q, mp, func, type, ill);
7765
7766         mutex_exit(&ipsq->ipsq_lock);
7767         mutex_exit(&ill->ill_lock);
7768         RELEASE_CONN_LOCK(q);
7769         return (NULL);
7770 }
7771
7772 /*
7773  * Try to enter the IPSQ corresponding to `ill' as writer.  The caller ensures
7774  * ill is valid by refholding it if necessary; we will refrele.  If the IPSQ
7775  * cannot be entered, the mp is queued for completion.
7776  */
7777 void
7778 qwriter_ip(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func, int type,
7779     boolean_t reentry_ok)
7780 {
7781         ipsq_t  *ipsq;
7782
7783         ipsq = ipsq_try_enter(NULL, ill, q, mp, func, type, reentry_ok);
7784
7785         /*
7786          * Drop the caller's refhold on the ill.  This is safe since we either
7787          * entered the IPSQ (and thus are exclusive), or failed to enter the
7788          * IPSQ, in which case we return without accessing ill anymore.  This
7789          * is needed because func needs to see the correct refcount.
7790          * e.g. removeif can work only then.
7791          */
7792         ill_refrele(ill);
7793         if (ipsq != NULL) {
7794                 (*func)(ipsq, q, mp, NULL);
7795                 ipsq_exit(ipsq, B_TRUE, B_TRUE);
7796         }
7797 }
7798
7799 /*
7800  * If there are more than ILL_GRP_CNT ills in a group,
7801  * we use kmem alloc'd buffers, else use the stack
7802  */
7803 #define ILL_GRP_CNT     14
7804 /*
7805  * Drain the ipsq, if there are messages on it, and then leave the ipsq.
7806  * Called by a thread that is currently exclusive on this ipsq.
7807  */
7808 void
7809 ipsq_exit(ipsq_t *ipsq, boolean_t start_igmp_timer, boolean_t start_mld_timer)
7810 {
7811         queue_t *q;
7812         mblk_t  *mp;
7813         ipsq_func_t     func;
7814         int     next;
7815         ill_t   **ill_list = NULL;
7816         size_t  ill_list_size = 0;
7817         int     cnt = 0;
7818         boolean_t need_ipsq_free = B_FALSE;
7819         ip_stack_t      *ipst = ipsq->ipsq_ipst;
7820
7821         ASSERT(IAM_WRITER_IPSQ(ipsq));
7822         mutex_enter(&ipsq->ipsq_lock);
7823         ASSERT(ipsq->ipsq_reentry_cnt >= 1);
7824         if (ipsq->ipsq_reentry_cnt != 1) {
7825                 ipsq->ipsq_reentry_cnt--;
7826                 mutex_exit(&ipsq->ipsq_lock);
7827                 return;
7828         }
7829
7830         mp = ipsq_dq(ipsq);
7831         while (mp != NULL) {
7832 again:
7833                 mutex_exit(&ipsq->ipsq_lock);
7834                 func = (ipsq_func_t)mp->b_prev;
7835                 q = (queue_t *)mp->b_queue;
7836                 mp->b_prev = NULL;
7837                 mp->b_queue = NULL;
7838
7839                 /*
7840                  * If 'q' is an conn queue, it is valid, since we did a
7841                  * a refhold on the connp, at the start of the ioctl.
7842                  * If 'q' is an ill queue, it is valid, since close of an
7843                  * ill will clean up the 'ipsq'.
7844                  */
7845                 (*func)(ipsq, q, mp, NULL);
7846
7847                 mutex_enter(&ipsq->ipsq_lock);
7848                 mp = ipsq_dq(ipsq);
7849         }
7850
7851         mutex_exit(&ipsq->ipsq_lock);
7852
7853         /*
7854          * Need to grab the locks in the right order. Need to
7855          * atomically check (under ipsq_lock) that there are no
7856          * messages before relinquishing the ipsq. Also need to
7857          * atomically wakeup waiters on ill_cv while holding ill_lock.
7858          * Holding ill_g_lock ensures that ipsq list of ills is stable.
7859          * If we need to call ill_split_ipsq and change <ill-ipsq> we need
7860          * to grab ill_g_lock as writer.
7861          */
7862         rw_enter(&ipst->ips_ill_g_lock,
7863             ipsq->ipsq_split ? RW_WRITER : RW_READER);
7864
7865         /* ipsq_refs can't change while ill_g_lock is held as reader */
7866         if (ipsq->ipsq_refs != 0) {
7867                 /* At most 2 ills v4/v6 per phyint */
7868                 cnt = ipsq->ipsq_refs << 1;
7869                 ill_list_size = cnt * sizeof (ill_t *);
7870                 /*
7871                  * If memory allocation fails, we will do the split
7872                  * the next time ipsq_exit is called for whatever reason.
7873                  * As long as the ipsq_split flag is set the need to
7874                  * split is remembered.
7875                  */
7876                 ill_list = kmem_zalloc(ill_list_size, KM_NOSLEEP);
7877                 if (ill_list != NULL)
7878                         cnt = ill_lock_ipsq_ills(ipsq, ill_list, cnt);
7879         }
7880         mutex_enter(&ipsq->ipsq_lock);
7881         mp = ipsq_dq(ipsq);
7882         if (mp != NULL) {
7883                 /* oops, some message has landed up, we can't get out */
7884                 if (ill_list != NULL)
7885                         ill_unlock_ills(ill_list, cnt);
7886                 rw_exit(&ipst->ips_ill_g_lock);
7887                 if (ill_list != NULL)
7888                         kmem_free(ill_list, ill_list_size);
7889                 ill_list = NULL;
7890                 ill_list_size = 0;
7891                 cnt = 0;
7892                 goto again;
7893         }
7894
7895         /*
7896          * Split only if no ioctl is pending and if memory alloc succeeded
7897          * above.
7898          */
7899         if (ipsq->ipsq_split && ipsq->ipsq_current_ipif == NULL &&
7900             ill_list != NULL) {
7901                 /*
7902                  * No new ill can join this ipsq since we are holding the
7903                  * ill_g_lock. Hence ill_split_ipsq can safely traverse the
7904                  * ipsq. ill_split_ipsq may fail due to memory shortage.
7905                  * If so we will retry on the next ipsq_exit.
7906                  */
7907                 ipsq->ipsq_split = ill_split_ipsq(ipsq);
7908         }
7909
7910         /*
7911          * We are holding the ipsq lock, hence no new messages can
7912          * land up on the ipsq, and there are no messages currently.
7913          * Now safe to get out. Wake up waiters and relinquish ipsq
7914          * atomically while holding ill locks.
7915          */
7916         ipsq->ipsq_writer = NULL;
7917         ipsq->ipsq_reentry_cnt--;
7918         ASSERT(ipsq->ipsq_reentry_cnt == 0);
7919 #ifdef DEBUG
7920         ipsq->ipsq_depth = 0;
7921 #endif
7922         mutex_exit(&ipsq->ipsq_lock);
7923         /*
7924          * For IPMP this should wake up all ills in this ipsq.
7925          * We need to hold the ill_lock while waking up waiters to
7926          * avoid missed wakeups. But there is no need to acquire all
7927          * the ill locks and then wakeup. If we have not acquired all
7928          * the locks (due to memory failure above) ill_signal_ipsq_ills
7929          * wakes up ills one at a time after getting the right ill_lock
7930          */
7931         ill_signal_ipsq_ills(ipsq, ill_list != NULL);
7932         if (ill_list != NULL)
7933                 ill_unlock_ills(ill_list, cnt);
7934         if (ipsq->ipsq_refs == 0)
7935                 need_ipsq_free = B_TRUE;
7936         rw_exit(&ipst->ips_ill_g_lock);
7937         if (ill_list != 0)
7938                 kmem_free(ill_list, ill_list_size);
7939
7940         if (need_ipsq_free) {
7941                 /*
7942                  * Free the ipsq. ipsq_refs can't increase because ipsq can't be
7943                  * looked up. ipsq can be looked up only thru ill or phyint
7944                  * and there are no ills/phyint on this ipsq.
7945                  */
7946                 ipsq_delete(ipsq);
7947         }
7948         /*
7949          * Now start any igmp or mld timers that could not be started
7950          * while inside the ipsq. The timers can't be started while inside
7951          * the ipsq, since igmp_start_timers may need to call untimeout()
7952          * which can't be done while holding a lock i.e. the ipsq. Otherwise
7953          * there could be a deadlock since the timeout handlers
7954          * mld_timeout_handler / igmp_timeout_handler also synchronously
7955          * wait in ipsq_enter() trying to get the ipsq.
7956          *
7957          * However there is one exception to the above. If this thread is
7958          * itself the igmp/mld timeout handler thread, then we don't want
7959          * to start any new timer until the current handler is done. The
7960          * handler thread passes in B_FALSE for start_igmp/mld_timers, while
7961          * all others pass B_TRUE.
7962          */
7963         if (start_igmp_timer) {
7964                 mutex_enter(&ipst->ips_igmp_timer_lock);
7965                 next = ipst->ips_igmp_deferred_next;
7966                 ipst->ips_igmp_deferred_next = INFINITY;
7967                 mutex_exit(&ipst->ips_igmp_timer_lock);
7968
7969                 if (next != INFINITY)
7970                         igmp_start_timers(next, ipst);
7971         }
7972
7973         if (start_mld_timer) {
7974                 mutex_enter(&ipst->ips_mld_timer_lock);
7975                 next = ipst->ips_mld_deferred_next;
7976                 ipst->ips_mld_deferred_next = INFINITY;
7977                 mutex_exit(&ipst->ips_mld_timer_lock);
7978
7979                 if (next != INFINITY)
7980                         mld_start_timers(next, ipst);
7981         }
7982 }
7983
7984 /*
7985  * Start the current exclusive operation on `ipsq'; associate it with `ipif'
7986  * and `ioccmd'.
7987  */
7988 void
7989 ipsq_current_start(ipsq_t *ipsq, ipif_t *ipif, int ioccmd)
7990 {
7991         ASSERT(IAM_WRITER_IPSQ(ipsq));
7992
7993         mutex_enter(&ipsq->ipsq_lock);
7994         ASSERT(ipsq->ipsq_current_ipif == NULL);
7995         ASSERT(ipsq->ipsq_current_ioctl == 0);
7996         ipsq->ipsq_current_ipif = ipif;
7997         ipsq->ipsq_current_ioctl = ioccmd;
7998         mutex_exit(&ipsq->ipsq_lock);
7999 }
8000
8001 /*
8002  * Finish the current exclusive operation on `ipsq'.  Note that other
8003  * operations will not be able to proceed until an ipsq_exit() is done.
8004  */
8005 void
8006 ipsq_current_finish(ipsq_t *ipsq)
8007 {
8008         ipif_t *ipif = ipsq->ipsq_current_ipif;
8009
8010         ASSERT(IAM_WRITER_IPSQ(ipsq));
8011
8012         /*
8013          * For SIOCSLIFREMOVEIF, the ipif has been already been blown away
8014          * (but we're careful to never set IPIF_CHANGING in that case).
8015          */
8016         if (ipsq->ipsq_current_ioctl != SIOCLIFREMOVEIF) {
8017                 mutex_enter(&ipif->ipif_ill->ill_lock);
8018                 ipif->ipif_state_flags &= ~IPIF_CHANGING;
8019
8020                 /* Send any queued event */
8021                 ill_nic_info_dispatch(ipif->ipif_ill);
8022                 mutex_exit(&ipif->ipif_ill->ill_lock);
8023         }
8024
8025         mutex_enter(&ipsq->ipsq_lock);
8026         ASSERT(ipsq->ipsq_current_ipif != NULL);
8027         ipsq->ipsq_current_ipif = NULL;
8028         ipsq->ipsq_current_ioctl = 0;
8029         mutex_exit(&ipsq->ipsq_lock);
8030 }
8031
8032 /*
8033  * The ill is closing. Flush all messages on the ipsq that originated
8034  * from this ill. Usually there wont' be any messages on the ipsq_xopq_mphead
8035  * for this ill since ipsq_enter could not have entered until then.
8036  * New messages can't be queued since the CONDEMNED flag is set.
8037  */
8038 static void
8039 ipsq_flush(ill_t *ill)
8040 {
8041         queue_t *q;
8042         mblk_t  *prev;
8043         mblk_t  *mp;
8044         mblk_t  *mp_next;
8045         ipsq_t  *ipsq;
8046
8047         ASSERT(IAM_WRITER_ILL(ill));
8048         ipsq = ill->ill_phyint->phyint_ipsq;
8049         /*
8050          * Flush any messages sent up by the driver.
8051          */
8052         mutex_enter(&ipsq->ipsq_lock);
8053         for (prev = NULL, mp = ipsq->ipsq_mphead; mp != NULL; mp = mp_next) {
8054                 mp_next = mp->b_next;
8055                 q = mp->b_queue;
8056                 if (q == ill->ill_rq || q == ill->ill_wq) {
8057                         /* Remove the mp from the ipsq */
8058                         if (prev == NULL)
8059                                 ipsq->ipsq_mphead = mp->b_next;
8060                         else
8061                                 prev->b_next = mp->b_next;
8062                         if (ipsq->ipsq_mptail == mp) {
8063                                 ASSERT(mp_next == NULL);
8064                                 ipsq->ipsq_mptail = prev;
8065                         }
8066                         inet_freemsg(mp);
8067                 } else {
8068                         prev = mp;
8069                 }
8070         }
8071         mutex_exit(&ipsq->ipsq_lock);
8072         (void) ipsq_pending_mp_cleanup(ill, NULL);
8073         ipsq_xopq_mp_cleanup(ill, NULL);
8074         ill_pending_mp_cleanup(ill);
8075 }
8076
8077 /* ARGSUSED */
8078 int
8079 ip_sioctl_slifoindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
8080     ip_ioctl_cmd_t *ipip, void *ifreq)
8081 {
8082         ill_t   *ill;
8083         struct lifreq   *lifr = (struct lifreq *)ifreq;
8084         boolean_t isv6;
8085         conn_t  *connp;
8086         ip_stack_t      *ipst;
8087
8088         connp = Q_TO_CONN(q);
8089         ipst = connp->conn_netstack->netstack_ip;
8090         isv6 = connp->conn_af_isv6;
8091         /*
8092          * Set original index.
8093          * Failover and failback move logical interfaces
8094          * from one physical interface to another.  The
8095          * original index indicates the parent of a logical
8096          * interface, in other words, the physical interface
8097          * the logical interface will be moved back to on
8098          * failback.
8099          */
8100
8101         /*
8102          * Don't allow the original index to be changed
8103          * for non-failover addresses, autoconfigured
8104          * addresses, or IPv6 link local addresses.
8105          */
8106         if (((ipif->ipif_flags & (IPIF_NOFAILOVER | IPIF_ADDRCONF)) != NULL) ||
8107             (isv6 && IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6lcl_addr))) {
8108                 return (EINVAL);
8109         }
8110         /*
8111          * The new original index must be in use by some
8112          * physical interface.
8113          */
8114         ill = ill_lookup_on_ifindex(lifr->lifr_index, isv6, NULL, NULL,
8115             NULL, NULL, ipst);
8116         if (ill == NULL)
8117                 return (ENXIO);
8118         ill_refrele(ill);
8119
8120         ipif->ipif_orig_ifindex = lifr->lifr_index;
8121         /*
8122          * When this ipif gets failed back, don't
8123          * preserve the original id, as it is no
8124          * longer applicable.
8125          */
8126         ipif->ipif_orig_ipifid = 0;
8127         /*
8128          * For IPv4, change the original index of any
8129          * multicast addresses associated with the
8130          * ipif to the new value.
8131          */
8132         if (!isv6) {
8133                 ilm_t *ilm;
8134
8135                 mutex_enter(&ipif->ipif_ill->ill_lock);
8136                 for (ilm = ipif->ipif_ill->ill_ilm; ilm != NULL;
8137                     ilm = ilm->ilm_next) {
8138                         if (ilm->ilm_ipif == ipif) {
8139                                 ilm->ilm_orig_ifindex = lifr->lifr_index;
8140                         }
8141                 }
8142                 mutex_exit(&ipif->ipif_ill->ill_lock);
8143         }
8144         return (0);
8145 }
8146
8147 /* ARGSUSED */
8148 int
8149 ip_sioctl_get_oindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
8150     ip_ioctl_cmd_t *ipip, void *ifreq)
8151 {
8152         struct lifreq *lifr = (struct lifreq *)ifreq;
8153
8154         /*
8155          * Get the original interface index i.e the one
8156          * before FAILOVER if it ever happened.
8157          */
8158         lifr->lifr_index = ipif->ipif_orig_ifindex;
8159         return (0);
8160 }
8161
8162 /*
8163  * Parse an iftun_req structure coming down SIOC[GS]TUNPARAM ioctls,
8164  * refhold and return the associated ipif
8165  */
8166 /* ARGSUSED */
8167 int
8168 ip_extract_tunreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip,
8169     cmd_info_t *ci, ipsq_func_t func)
8170 {
8171         boolean_t exists;
8172         struct iftun_req *ta;
8173         ipif_t  *ipif;
8174         ill_t   *ill;
8175         boolean_t isv6;
8176         mblk_t  *mp1;
8177         int     error;
8178         conn_t  *connp;
8179         ip_stack_t      *ipst;
8180
8181         /* Existence verified in ip_wput_nondata */
8182         mp1 = mp->b_cont->b_cont;
8183         ta = (struct iftun_req *)mp1->b_rptr;
8184         /*
8185          * Null terminate the string to protect against buffer
8186          * overrun. String was generated by user code and may not
8187          * be trusted.
8188          */
8189         ta->ifta_lifr_name[LIFNAMSIZ - 1] = '\0';
8190
8191         connp = Q_TO_CONN(q);
8192         isv6 = connp->conn_af_isv6;
8193         ipst = connp->conn_netstack->netstack_ip;
8194
8195         /* Disallows implicit create */
8196         ipif = ipif_lookup_on_name(ta->ifta_lifr_name,
8197             mi_strlen(ta->ifta_lifr_name), B_FALSE, &exists, isv6,
8198             connp->conn_zoneid, CONNP_TO_WQ(connp), mp, func, &error, ipst);
8199         if (ipif == NULL)
8200                 return (error);
8201
8202         if (ipif->ipif_id != 0) {
8203                 /*
8204                  * We really don't want to set/get tunnel parameters
8205                  * on virtual tunnel interfaces.  Only allow the
8206                  * base tunnel to do these.
8207                  */
8208                 ipif_refrele(ipif);
8209                 return (EINVAL);
8210         }
8211
8212         /*
8213          * Send down to tunnel mod for ioctl processing.
8214          * Will finish ioctl in ip_rput_other().
8215          */
8216         ill = ipif->ipif_ill;
8217         if (ill->ill_net_type == IRE_LOOPBACK) {
8218                 ipif_refrele(ipif);
8219                 return (EOPNOTSUPP);
8220         }
8221
8222         if (ill->ill_wq == NULL) {
8223                 ipif_refrele(ipif);
8224                 return (ENXIO);
8225         }
8226         /*
8227          * Mark the ioctl as coming from an IPv6 interface for
8228          * tun's convenience.
8229          */
8230         if (ill->ill_isv6)
8231                 ta->ifta_flags |= 0x80000000;
8232         ci->ci_ipif = ipif;
8233         return (0);
8234 }
8235
8236 /*
8237  * Parse an ifreq or lifreq struct coming down ioctls and refhold
8238  * and return the associated ipif.
8239  * Return value:
8240  *      Non zero: An error has occurred. ci may not be filled out.
8241  *      zero : ci is filled out with the ioctl cmd in ci.ci_name, and
8242  *      a held ipif in ci.ci_ipif.
8243  */
8244 int
8245 ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip,
8246     cmd_info_t *ci, ipsq_func_t func)
8247 {
8248         sin_t           *sin;
8249         sin6_t          *sin6;
8250         char            *name;
8251         struct ifreq    *ifr;
8252         struct lifreq    *lifr;
8253         ipif_t          *ipif = NULL;
8254         ill_t           *ill;
8255         conn_t          *connp;
8256         boolean_t       isv6;
8257         boolean_t       exists;
8258         int             err;
8259         mblk_t          *mp1;
8260         zoneid_t        zoneid;
8261         ip_stack_t      *ipst;
8262
8263         if (q->q_next != NULL) {
8264                 ill = (ill_t *)q->q_ptr;
8265                 isv6 = ill->ill_isv6;
8266                 connp = NULL;
8267                 zoneid = ALL_ZONES;
8268                 ipst = ill->ill_ipst;
8269         } else {
8270                 ill = NULL;
8271                 connp = Q_TO_CONN(q);
8272                 isv6 = connp->conn_af_isv6;
8273                 zoneid = connp->conn_zoneid;
8274                 if (zoneid == GLOBAL_ZONEID) {
8275                         /* global zone can access ipifs in all zones */
8276                         zoneid = ALL_ZONES;
8277                 }
8278                 ipst = connp->conn_netstack->netstack_ip;
8279         }
8280
8281         /* Has been checked in ip_wput_nondata */
8282         mp1 = mp->b_cont->b_cont;
8283
8284         if (ipip->ipi_cmd_type == IF_CMD) {
8285                 /* This a old style SIOC[GS]IF* command */
8286                 ifr = (struct ifreq *)mp1->b_rptr;
8287                 /*
8288                  * Null terminate the string to protect against buffer
8289                  * overrun. String was generated by user code and may not
8290                  * be trusted.
8291                  */
8292                 ifr->ifr_name[IFNAMSIZ - 1] = '\0';
8293                 sin = (sin_t *)&ifr->ifr_addr;
8294                 name = ifr->ifr_name;
8295                 ci->ci_sin = sin;
8296                 ci->ci_sin6 = NULL;
8297                 ci->ci_lifr = (struct lifreq *)ifr;
8298         } else {
8299                 /* This a new style SIOC[GS]LIF* command */
8300                 ASSERT(ipip->ipi_cmd_type == LIF_CMD);
8301                 lifr = (struct lifreq *)mp1->b_rptr;
8302                 /*
8303                  * Null terminate the string to protect against buffer
8304                  * overrun. String was generated by user code and may not
8305                  * be trusted.
8306                  */
8307                 lifr->lifr_name[LIFNAMSIZ - 1] = '\0';
8308                 name = lifr->lifr_name;
8309                 sin = (sin_t *)&lifr->lifr_addr;
8310                 sin6 = (sin6_t *)&lifr->lifr_addr;
8311                 if (ipip->ipi_cmd == SIOCSLIFGROUPNAME) {
8312                         (void) strncpy(ci->ci_groupname, lifr->lifr_groupname,
8313                             LIFNAMSIZ);
8314                 }
8315                 ci->ci_sin = sin;
8316                 ci->ci_sin6 = sin6;
8317                 ci->ci_lifr = lifr;
8318         }
8319
8320         if (ipip->ipi_cmd == SIOCSLIFNAME) {
8321                 /*
8322                  * The ioctl will be failed if the ioctl comes down
8323                  * an conn stream
8324                  */
8325                 if (ill == NULL) {
8326                         /*
8327                          * Not an ill queue, return EINVAL same as the
8328                          * old error code.
8329                          */
8330                         return (ENXIO);
8331                 }
8332                 ipif = ill->ill_ipif;
8333                 ipif_refhold(ipif);
8334         } else {
8335                 ipif = ipif_lookup_on_name(name, mi_strlen(name), B_FALSE,
8336                     &exists, isv6, zoneid,
8337                     (connp == NULL) ? q : CONNP_TO_WQ(connp), mp, func, &err,
8338                     ipst);
8339                 if (ipif == NULL) {
8340                         if (err == EINPROGRESS)
8341                                 return (err);
8342                         if (ipip->ipi_cmd == SIOCLIFFAILOVER ||
8343                             ipip->ipi_cmd == SIOCLIFFAILBACK) {
8344                                 /*
8345                                  * Need to try both v4 and v6 since this
8346                                  * ioctl can come down either v4 or v6
8347                                  * socket. The lifreq.lifr_family passed
8348                                  * down by this ioctl is AF_UNSPEC.
8349                                  */
8350                                 ipif = ipif_lookup_on_name(name,
8351                                     mi_strlen(name), B_FALSE, &exists, !isv6,
8352                                     zoneid, (connp == NULL) ? q :
8353                                     CONNP_TO_WQ(connp), mp, func, &err, ipst);
8354                                 if (err == EINPROGRESS)
8355                                         return (err);
8356                         }
8357                         err = 0;        /* Ensure we don't use it below */
8358                 }
8359         }
8360
8361         /*
8362          * Old style [GS]IFCMD does not admit IPv6 ipif
8363          */
8364         if (ipif != NULL && ipif->ipif_isv6 && ipip->ipi_cmd_type == IF_CMD) {
8365                 ipif_refrele(ipif);
8366                 return (ENXIO);
8367         }
8368
8369         if (ipif == NULL && ill != NULL && ill->ill_ipif != NULL &&
8370             name[0] == '\0') {
8371                 /*
8372                  * Handle a or a SIOC?IF* with a null name
8373                  * during plumb (on the ill queue before the I_PLINK).
8374                  */
8375                 ipif = ill->ill_ipif;
8376                 ipif_refhold(ipif);
8377         }
8378
8379         if (ipif == NULL)
8380                 return (ENXIO);
8381
8382         /*
8383          * Allow only GET operations if this ipif has been created
8384          * temporarily due to a MOVE operation.
8385          */
8386         if (ipif->ipif_replace_zero && !(ipip->ipi_flags & IPI_REPL)) {
8387                 ipif_refrele(ipif);
8388                 return (EINVAL);
8389         }
8390
8391         ci->ci_ipif = ipif;
8392         return (0);
8393 }
8394
8395 /*
8396  * Return the total number of ipifs.
8397  */
8398 static uint_t
8399 ip_get_numifs(zoneid_t zoneid, ip_stack_t *ipst)
8400 {
8401         uint_t numifs = 0;
8402         ill_t   *ill;
8403         ill_walk_context_t      ctx;
8404         ipif_t  *ipif;
8405
8406         rw_enter(&ipst->ips_ill_g_lock, RW_READER);
8407         ill = ILL_START_WALK_V4(&ctx, ipst);
8408
8409         while (ill != NULL) {
8410                 for (ipif = ill->ill_ipif; ipif != NULL;
8411                     ipif = ipif->ipif_next) {
8412                         if (ipif->ipif_zoneid == zoneid ||
8413                             ipif->ipif_zoneid == ALL_ZONES)
8414                                 numifs++;
8415                 }
8416                 ill = ill_next(&ctx, ill);
8417         }
8418         rw_exit(&ipst->ips_ill_g_lock);
8419         return (numifs);
8420 }
8421
8422 /*
8423  * Return the total number of ipifs.
8424  */
8425 static uint_t
8426 ip_get_numlifs(int family, int lifn_flags, zoneid_t zoneid, ip_stack_t *ipst)
8427 {
8428         uint_t numifs = 0;
8429         ill_t   *ill;
8430         ipif_t  *ipif;
8431         ill_walk_context_t      ctx;
8432
8433         ip1dbg(("ip_get_numlifs(%d %u %d)\n", family, lifn_flags, (int)zoneid));
8434
8435         rw_enter(&ipst->ips_ill_g_lock, RW_READER);
8436         if (family == AF_INET)
8437                 ill = ILL_START_WALK_V4(&ctx, ipst);
8438         else if (family == AF_INET6)
8439                 ill = ILL_START_WALK_V6(&ctx, ipst);
8440         else
8441                 ill = ILL_START_WALK_ALL(&ctx, ipst);
8442
8443         for (; ill != NULL; ill = ill_next(&ctx, ill)) {
8444                 for (ipif = ill->ill_ipif; ipif != NULL;
8445                     ipif = ipif->ipif_next) {
8446                         if ((ipif->ipif_flags & IPIF_NOXMIT) &&
8447                             !(lifn_flags & LIFC_NOXMIT))
8448                                 continue;
8449                         if ((ipif->ipif_flags & IPIF_TEMPORARY) &&
8450                             !(lifn_flags & LIFC_TEMPORARY))
8451                                 continue;
8452                         if (((ipif->ipif_flags &
8453                             (IPIF_NOXMIT|IPIF_NOLOCAL|
8454                             IPIF_DEPRECATED)) ||
8455                             IS_LOOPBACK(ill) ||
8456                             !(ipif->ipif_flags & IPIF_UP)) &&
8457                             (lifn_flags & LIFC_EXTERNAL_SOURCE))
8458                                 continue;
8459
8460                         if (zoneid != ipif->ipif_zoneid &&
8461                             ipif->ipif_zoneid != ALL_ZONES &&
8462                             (zoneid != GLOBAL_ZONEID ||
8463                             !(lifn_flags & LIFC_ALLZONES)))
8464                                 continue;
8465
8466                         numifs++;
8467                 }
8468         }
8469         rw_exit(&ipst->ips_ill_g_lock);
8470         return (numifs);
8471 }
8472
8473 uint_t
8474 ip_get_lifsrcofnum(ill_t *ill)
8475 {
8476         uint_t numifs = 0;
8477         ill_t   *ill_head = ill;
8478         ip_stack_t      *ipst = ill->ill_ipst;
8479
8480         /*
8481          * ill_g_usesrc_lock protects ill_usesrc_grp_next, for example, some
8482          * other thread may be trying to relink the ILLs in this usesrc group
8483          * and adjusting the ill_usesrc_grp_next pointers
8484          */
8485         rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER);
8486         if ((ill->ill_usesrc_ifindex == 0) &&
8487             (ill->ill_usesrc_grp_next != NULL)) {
8488                 for (; (ill != NULL) && (ill->ill_usesrc_grp_next != ill_head);
8489                     ill = ill->ill_usesrc_grp_next)
8490                         numifs++;
8491         }
8492         rw_exit(&ipst->ips_ill_g_usesrc_lock);
8493
8494         return (numifs);
8495 }
8496
8497 /* Null values are passed in for ipif, sin, and ifreq */
8498 /* ARGSUSED */
8499 int
8500 ip_sioctl_get_ifnum(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
8501     mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
8502 {
8503         int *nump;
8504         conn_t *connp = Q_TO_CONN(q);
8505
8506         ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */
8507
8508         /* Existence of b_cont->b_cont checked in ip_wput_nondata */
8509         nump = (int *)mp->b_cont->b_cont->b_rptr;
8510
8511         *nump = ip_get_numifs(connp->conn_zoneid,
8512             connp->conn_netstack->netstack_ip);
8513         ip1dbg(("ip_sioctl_get_ifnum numifs %d", *nump));
8514         return (0);
8515 }
8516
8517 /* Null values are passed in for ipif, sin, and ifreq */
8518 /* ARGSUSED */
8519 int
8520 ip_sioctl_get_lifnum(ipif_t *dummy_ipif, sin_t *dummy_sin,
8521     queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
8522 {
8523         struct lifnum *lifn;
8524         mblk_t  *mp1;
8525         conn_t *connp = Q_TO_CONN(q);
8526
8527         ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */
8528
8529         /* Existence checked in ip_wput_nondata */
8530         mp1 = mp->b_cont->b_cont;
8531
8532         lifn = (struct lifnum *)mp1->b_rptr;
8533         switch (lifn->lifn_family) {
8534         case AF_UNSPEC:
8535         case AF_INET:
8536         case AF_INET6:
8537                 break;
8538         default:
8539                 return (EAFNOSUPPORT);
8540         }
8541
8542         lifn->lifn_count = ip_get_numlifs(lifn->lifn_family, lifn->lifn_flags,
8543             connp->conn_zoneid, connp->conn_netstack->netstack_ip);
8544         ip1dbg(("ip_sioctl_get_lifnum numifs %d", lifn->lifn_count));
8545         return (0);
8546 }
8547
8548 /* ARGSUSED */
8549 int
8550 ip_sioctl_get_ifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
8551     mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
8552 {
8553         STRUCT_HANDLE(ifconf, ifc);
8554         mblk_t *mp1;
8555         struct iocblk *iocp;
8556         struct ifreq *ifr;
8557         ill_walk_context_t      ctx;
8558         ill_t   *ill;
8559         ipif_t  *ipif;
8560         struct sockaddr_in *sin;
8561         int32_t ifclen;
8562         zoneid_t zoneid;
8563         ip_stack_t *ipst = CONNQ_TO_IPST(q);
8564
8565         ASSERT(q->q_next == NULL); /* not valid ioctls for ip as a module */
8566
8567         ip1dbg(("ip_sioctl_get_ifconf"));
8568         /* Existence verified in ip_wput_nondata */
8569         mp1 = mp->b_cont->b_cont;
8570         iocp = (struct iocblk *)mp->b_rptr;
8571         zoneid = Q_TO_CONN(q)->conn_zoneid;
8572
8573         /*
8574          * The original SIOCGIFCONF passed in a struct ifconf which specified
8575          * the user buffer address and length into which the list of struct
8576          * ifreqs was to be copied.  Since AT&T Streams does not seem to
8577          * allow M_COPYOUT to be used in conjunction with I_STR IOCTLS,
8578          * the SIOCGIFCONF operation was redefined to simply provide
8579          * a large output buffer into which we are supposed to jam the ifreq
8580          * array.  The same ioctl command code was used, despite the fact that
8581          * both the applications and the kernel code had to change, thus making
8582          * it impossible to support both interfaces.
8583          *
8584          * For reasons not good enough to try to explain, the following
8585          * algorithm is used for deciding what to do with one of these:
8586          * If the IOCTL comes in as an I_STR, it is assumed to be of the new
8587          * form with the output buffer coming down as the continuation message.
8588          * If it arrives as a TRANSPARENT IOCTL, it is assumed to be old style,
8589          * and we have to copy in the ifconf structure to find out how big the
8590          * output buffer is and where to copy out to.  Sure no problem...
8591          *
8592          */
8593         STRUCT_SET_HANDLE(ifc, iocp->ioc_flag, NULL);
8594         if ((mp1->b_wptr - mp1->b_rptr) == STRUCT_SIZE(ifc)) {
8595                 int numifs = 0;
8596                 size_t ifc_bufsize;
8597
8598                 /*
8599                  * Must be (better be!) continuation of a TRANSPARENT
8600                  * IOCTL.  We just copied in the ifconf structure.
8601                  */
8602                 STRUCT_SET_HANDLE(ifc, iocp->ioc_flag,
8603                     (struct ifconf *)mp1->b_rptr);
8604
8605                 /*
8606                  * Allocate a buffer to hold requested information.
8607                  *
8608                  * If ifc_len is larger than what is needed, we only
8609                  * allocate what we will use.
8610                  *
8611                  * If ifc_len is smaller than what is needed, return
8612                  * EINVAL.
8613                  *
8614                  * XXX: the ill_t structure can hava 2 counters, for
8615                  * v4 and v6 (not just ill_ipif_up_count) to store the
8616                  * number of interfaces for a device, so we don't need
8617                  * to count them here...
8618                  */
8619                 numifs = ip_get_numifs(zoneid, ipst);
8620
8621                 ifclen = STRUCT_FGET(ifc, ifc_len);
8622                 ifc_bufsize = numifs * sizeof (struct ifreq);
8623                 if (ifc_bufsize > ifclen) {
8624                         if (iocp->ioc_cmd == O_SIOCGIFCONF) {
8625                                 /* old behaviour */
8626                                 return (EINVAL);
8627                         } else {
8628                                 ifc_bufsize = ifclen;
8629                         }
8630                 }
8631
8632                 mp1 = mi_copyout_alloc(q, mp,
8633                     STRUCT_FGETP(ifc, ifc_buf), ifc_bufsize, B_FALSE);
8634                 if (mp1 == NULL)
8635                         return (ENOMEM);
8636
8637                 mp1->b_wptr = mp1->b_rptr + ifc_bufsize;
8638         }
8639         bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr);
8640         /*
8641          * the SIOCGIFCONF ioctl only knows about
8642          * IPv4 addresses, so don't try to tell
8643          * it about interfaces with IPv6-only
8644          * addresses. (Last parm 'isv6' is B_FALSE)
8645          */
8646
8647         ifr = (struct ifreq *)mp1->b_rptr;
8648
8649         rw_enter(&ipst->ips_ill_g_lock, RW_READER);
8650         ill = ILL_START_WALK_V4(&ctx, ipst);
8651         for (; ill != NULL; ill = ill_next(&ctx, ill)) {
8652                 for (ipif = ill->ill_ipif; ipif != NULL;
8653                     ipif = ipif->ipif_next) {
8654                         if (zoneid != ipif->ipif_zoneid &&
8655                             ipif->ipif_zoneid != ALL_ZONES)
8656                                 continue;
8657                         if ((uchar_t *)&ifr[1] > mp1->b_wptr) {
8658                                 if (iocp->ioc_cmd == O_SIOCGIFCONF) {
8659                                         /* old behaviour */
8660                                         rw_exit(&ipst->ips_ill_g_lock);
8661                                         return (EINVAL);
8662                                 } else {
8663                                         goto if_copydone;
8664                                 }
8665                         }
8666                         ipif_get_name(ipif, ifr->ifr_name,
8667                             sizeof (ifr->ifr_name));
8668                         sin = (sin_t *)&ifr->ifr_addr;
8669                         *sin = sin_null;
8670                         sin->sin_family = AF_INET;
8671                         sin->sin_addr.s_addr = ipif->ipif_lcl_addr;
8672                         ifr++;
8673                 }
8674         }
8675 if_copydone:
8676         rw_exit(&ipst->ips_ill_g_lock);
8677         mp1->b_wptr = (uchar_t *)ifr;
8678
8679         if (STRUCT_BUF(ifc) != NULL) {
8680                 STRUCT_FSET(ifc, ifc_len,
8681                     (int)((uchar_t *)ifr - mp1->b_rptr));
8682         }
8683         return (0);
8684 }
8685
8686 /*
8687  * Get the interfaces using the address hosted on the interface passed in,
8688  * as a source adddress
8689  */
8690 /* ARGSUSED */
8691 int
8692 ip_sioctl_get_lifsrcof(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
8693     mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
8694 {
8695         mblk_t *mp1;
8696         ill_t   *ill, *ill_head;
8697         ipif_t  *ipif, *orig_ipif;
8698         int     numlifs = 0;
8699         size_t  lifs_bufsize, lifsmaxlen;
8700         struct  lifreq *lifr;
8701         struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
8702         uint_t  ifindex;
8703         zoneid_t zoneid;
8704         int err = 0;
8705         boolean_t isv6 = B_FALSE;
8706         struct  sockaddr_in     *sin;
8707         struct  sockaddr_in6    *sin6;
8708         STRUCT_HANDLE(lifsrcof, lifs);
8709         ip_stack_t              *ipst;
8710
8711         ipst = CONNQ_TO_IPST(q);
8712
8713         ASSERT(q->q_next == NULL);
8714
8715         zoneid = Q_TO_CONN(q)->conn_zoneid;
8716
8717         /* Existence verified in ip_wput_nondata */
8718         mp1 = mp->b_cont->b_cont;
8719
8720         /*
8721          * Must be (better be!) continuation of a TRANSPARENT
8722          * IOCTL.  We just copied in the lifsrcof structure.
8723          */
8724         STRUCT_SET_HANDLE(lifs, iocp->ioc_flag,
8725             (struct lifsrcof *)mp1->b_rptr);
8726
8727         if (MBLKL(mp1) != STRUCT_SIZE(lifs))
8728                 return (EINVAL);
8729
8730         ifindex = STRUCT_FGET(lifs, lifs_ifindex);
8731         isv6 = (Q_TO_CONN(q))->conn_af_isv6;
8732         ipif = ipif_lookup_on_ifindex(ifindex, isv6, zoneid, q, mp,
8733             ip_process_ioctl, &err, ipst);
8734         if (ipif == NULL) {
8735                 ip1dbg(("ip_sioctl_get_lifsrcof: no ipif for ifindex %d\n",
8736                     ifindex));
8737                 return (err);
8738         }
8739
8740
8741         /* Allocate a buffer to hold requested information */
8742         numlifs = ip_get_lifsrcofnum(ipif->ipif_ill);
8743         lifs_bufsize = numlifs * sizeof (struct lifreq);
8744         lifsmaxlen =  STRUCT_FGET(lifs, lifs_maxlen);
8745         /* The actual size needed is always returned in lifs_len */
8746         STRUCT_FSET(lifs, lifs_len, lifs_bufsize);
8747
8748         /* If the amount we need is more than what is passed in, abort */
8749         if (lifs_bufsize > lifsmaxlen || lifs_bufsize == 0) {
8750                 ipif_refrele(ipif);
8751                 return (0);
8752         }
8753
8754         mp1 = mi_copyout_alloc(q, mp,
8755             STRUCT_FGETP(lifs, lifs_buf), lifs_bufsize, B_FALSE);
8756         if (mp1 == NULL) {
8757                 ipif_refrele(ipif);
8758                 return (ENOMEM);
8759         }
8760
8761         mp1->b_wptr = mp1->b_rptr + lifs_bufsize;
8762         bzero(mp1->b_rptr, lifs_bufsize);
8763
8764         lifr = (struct lifreq *)mp1->b_rptr;
8765
8766         ill = ill_head = ipif->ipif_ill;
8767         orig_ipif = ipif;
8768
8769         /* ill_g_usesrc_lock protects ill_usesrc_grp_next */
8770         rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER);
8771         rw_enter(&ipst->ips_ill_g_lock, RW_READER);
8772
8773         ill = ill->ill_usesrc_grp_next; /* start from next ill */
8774         for (; (ill != NULL) && (ill != ill_head);
8775             ill = ill->ill_usesrc_grp_next) {
8776
8777                 if ((uchar_t *)&lifr[1] > mp1->b_wptr)
8778                         break;
8779
8780                 ipif = ill->ill_ipif;
8781                 ipif_get_name(ipif, lifr->lifr_name, sizeof (lifr->lifr_name));
8782                 if (ipif->ipif_isv6) {
8783                         sin6 = (sin6_t *)&lifr->lifr_addr;
8784                         *sin6 = sin6_null;
8785                         sin6->sin6_family = AF_INET6;
8786                         sin6->sin6_addr = ipif->ipif_v6lcl_addr;
8787                         lifr->lifr_addrlen = ip_mask_to_plen_v6(
8788                             &ipif->ipif_v6net_mask);
8789                 } else {
8790                         sin = (sin_t *)&lifr->lifr_addr;
8791                         *sin = sin_null;
8792                         sin->sin_family = AF_INET;
8793                         sin->sin_addr.s_addr = ipif->ipif_lcl_addr;
8794                         lifr->lifr_addrlen = ip_mask_to_plen(
8795                             ipif->ipif_net_mask);
8796                 }
8797                 lifr++;
8798         }
8799         rw_exit(&ipst->ips_ill_g_usesrc_lock);
8800         rw_exit(&ipst->ips_ill_g_lock);
8801         ipif_refrele(orig_ipif);
8802         mp1->b_wptr = (uchar_t *)lifr;
8803         STRUCT_FSET(lifs, lifs_len, (int)((uchar_t *)lifr - mp1->b_rptr));
8804
8805         return (0);
8806 }
8807
8808 /* ARGSUSED */
8809 int
8810 ip_sioctl_get_lifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
8811     mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
8812 {
8813         mblk_t *mp1;
8814         int     list;
8815         ill_t   *ill;
8816         ipif_t  *ipif;
8817         int     flags;
8818         int     numlifs = 0;
8819         size_t  lifc_bufsize;
8820         struct  lifreq *lifr;
8821         sa_family_t     family;
8822         struct  sockaddr_in     *sin;
8823         struct  sockaddr_in6    *sin6;
8824         ill_walk_context_t      ctx;
8825         struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
8826         int32_t lifclen;
8827         zoneid_t zoneid;
8828         STRUCT_HANDLE(lifconf, lifc);
8829         ip_stack_t *ipst = CONNQ_TO_IPST(q);
8830
8831         ip1dbg(("ip_sioctl_get_lifconf"));
8832
8833         ASSERT(q->q_next == NULL);
8834
8835         zoneid = Q_TO_CONN(q)->conn_zoneid;
8836
8837         /* Existence verified in ip_wput_nondata */
8838         mp1 = mp->b_cont->b_cont;
8839
8840         /*
8841          * An extended version of SIOCGIFCONF that takes an
8842          * additional address family and flags field.
8843          * AF_UNSPEC retrieve both IPv4 and IPv6.
8844          * Unless LIFC_NOXMIT is specified the IPIF_NOXMIT
8845          * interfaces are omitted.
8846          * Similarly, IPIF_TEMPORARY interfaces are omitted
8847          * unless LIFC_TEMPORARY is specified.
8848          * If LIFC_EXTERNAL_SOURCE is specified, IPIF_NOXMIT,
8849          * IPIF_NOLOCAL, PHYI_LOOPBACK, IPIF_DEPRECATED and
8850          * not IPIF_UP interfaces are omitted. LIFC_EXTERNAL_SOURCE
8851          * has priority over LIFC_NOXMIT.
8852          */
8853         STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, NULL);
8854
8855         if ((mp1->b_wptr - mp1->b_rptr) != STRUCT_SIZE(lifc))
8856                 return (EINVAL);
8857
8858         /*
8859          * Must be (better be!) continuation of a TRANSPARENT
8860          * IOCTL.  We just copied in the lifconf structure.
8861          */
8862         STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, (struct lifconf *)mp1->b_rptr);
8863
8864         family = STRUCT_FGET(lifc, lifc_family);
8865         flags = STRUCT_FGET(lifc, lifc_flags);
8866
8867         switch (family) {
8868         case AF_UNSPEC:
8869                 /*
8870                  * walk all ILL's.
8871                  */
8872                 list = MAX_G_HEADS;
8873                 break;
8874         case AF_INET:
8875                 /*
8876                  * walk only IPV4 ILL's.
8877                  */
8878                 list = IP_V4_G_HEAD;
8879                 break;
8880         case AF_INET6:
8881                 /*
8882                  * walk only IPV6 ILL's.
8883                  */
8884                 list = IP_V6_G_HEAD;
8885                 break;
8886         default:
8887                 return (EAFNOSUPPORT);
8888         }
8889
8890         /*
8891          * Allocate a buffer to hold requested information.
8892          *
8893          * If lifc_len is larger than what is needed, we only
8894          * allocate what we will use.
8895          *
8896          * If lifc_len is smaller than what is needed, return
8897          * EINVAL.
8898          */
8899         numlifs = ip_get_numlifs(family, flags, zoneid, ipst);
8900         lifc_bufsize = numlifs * sizeof (struct lifreq);
8901         lifclen = STRUCT_FGET(lifc, lifc_len);
8902         if (lifc_bufsize > lifclen) {
8903                 if (iocp->ioc_cmd == O_SIOCGLIFCONF)
8904                         return (EINVAL);
8905                 else
8906                         lifc_bufsize = lifclen;
8907         }
8908
8909         mp1 = mi_copyout_alloc(q, mp,
8910             STRUCT_FGETP(lifc, lifc_buf), lifc_bufsize, B_FALSE);
8911         if (mp1 == NULL)
8912                 return (ENOMEM);
8913
8914         mp1->b_wptr = mp1->b_rptr + lifc_bufsize;
8915         bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr);
8916
8917         lifr = (struct lifreq *)mp1->b_rptr;
8918
8919         rw_enter(&ipst->ips_ill_g_lock, RW_READER);
8920         ill = ill_first(list, list, &ctx, ipst);
8921         for (; ill != NULL; ill = ill_next(&ctx, ill)) {
8922                 for (ipif = ill->ill_ipif; ipif != NULL;
8923                     ipif = ipif->ipif_next) {
8924                         if ((ipif->ipif_flags & IPIF_NOXMIT) &&
8925                             !(flags & LIFC_NOXMIT))
8926                                 continue;
8927
8928                         if ((ipif->ipif_flags & IPIF_TEMPORARY) &&
8929                             !(flags & LIFC_TEMPORARY))
8930                                 continue;
8931
8932                         if (((ipif->ipif_flags &
8933                             (IPIF_NOXMIT|IPIF_NOLOCAL|
8934                             IPIF_DEPRECATED)) ||
8935                             IS_LOOPBACK(ill) ||
8936                             !(ipif->ipif_flags & IPIF_UP)) &&
8937                             (flags & LIFC_EXTERNAL_SOURCE))
8938                                 continue;
8939
8940                         if (zoneid != ipif->ipif_zoneid &&
8941                             ipif->ipif_zoneid != ALL_ZONES &&
8942                             (zoneid != GLOBAL_ZONEID ||
8943                             !(flags & LIFC_ALLZONES)))
8944                                 continue;
8945
8946                         if ((uchar_t *)&lifr[1] > mp1->b_wptr) {
8947                                 if (iocp->ioc_cmd == O_SIOCGLIFCONF) {
8948                                         rw_exit(&ipst->ips_ill_g_lock);
8949                                         return (EINVAL);
8950                                 } else {
8951                                         goto lif_copydone;
8952                                 }
8953                         }
8954
8955                         ipif_get_name(ipif, lifr->lifr_name,
8956                             sizeof (lifr->lifr_name));
8957                         if (ipif->ipif_isv6) {
8958                                 sin6 = (sin6_t *)&lifr->lifr_addr;
8959                                 *sin6 = sin6_null;
8960                                 sin6->sin6_family = AF_INET6;
8961                                 sin6->sin6_addr =
8962                                     ipif->ipif_v6lcl_addr;
8963                                 lifr->lifr_addrlen =
8964                                     ip_mask_to_plen_v6(
8965                                     &ipif->ipif_v6net_mask);
8966                         } else {
8967                                 sin = (sin_t *)&lifr->lifr_addr;
8968                                 *sin = sin_null;
8969                                 sin->sin_family = AF_INET;
8970                                 sin->sin_addr.s_addr =
8971                                     ipif->ipif_lcl_addr;
8972                                 lifr->lifr_addrlen =
8973                                     ip_mask_to_plen(
8974                                     ipif->ipif_net_mask);
8975                         }
8976                         lifr++;
8977                 }
8978         }
8979 lif_copydone:
8980         rw_exit(&ipst->ips_ill_g_lock);
8981
8982         mp1->b_wptr = (uchar_t *)lifr;
8983         if (STRUCT_BUF(lifc) != NULL) {
8984                 STRUCT_FSET(lifc, lifc_len,
8985                     (int)((uchar_t *)lifr - mp1->b_rptr));
8986         }
8987         return (0);
8988 }
8989
8990 /* ARGSUSED */
8991 int
8992 ip_sioctl_set_ipmpfailback(ipif_t *dummy_ipif, sin_t *dummy_sin,
8993     queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
8994 {
8995         ip_stack_t      *ipst;
8996
8997         if (q->q_next == NULL)
8998                 ipst = CONNQ_TO_IPST(q);
8999         else
9000                 ipst = ILLQ_TO_IPST(q);
9001
9002         /* Existence of b_cont->b_cont checked in ip_wput_nondata */
9003         ipst->ips_ipmp_enable_failback = *(int *)mp->b_cont->b_cont->b_rptr;
9004         return (0);
9005 }
9006
9007 static void
9008 ip_sioctl_ip6addrpolicy(queue_t *q, mblk_t *mp)
9009 {
9010         ip6_asp_t *table;
9011         size_t table_size;
9012         mblk_t *data_mp;
9013         struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
9014         ip_stack_t      *ipst;
9015
9016         if (q->q_next == NULL)
9017                 ipst = CONNQ_TO_IPST(q);
9018         else
9019                 ipst = ILLQ_TO_IPST(q);
9020
9021         /* These two ioctls are I_STR only */
9022         if (iocp->ioc_count == TRANSPARENT) {
9023                 miocnak(q, mp, 0, EINVAL);
9024                 return;
9025         }
9026
9027         data_mp = mp->b_cont;
9028         if (data_mp == NULL) {
9029                 /* The user passed us a NULL argument */
9030                 table = NULL;
9031                 table_size = iocp->ioc_count;
9032         } else {
9033                 /*
9034                  * The user provided a table.  The stream head
9035                  * may have copied in the user data in chunks,
9036                  * so make sure everything is pulled up
9037                  * properly.
9038                  */
9039                 if (MBLKL(data_mp) < iocp->ioc_count) {
9040                         mblk_t *new_data_mp;
9041                         if ((new_data_mp = msgpullup(data_mp, -1)) ==
9042                             NULL) {
9043                                 miocnak(q, mp, 0, ENOMEM);
9044                                 return;
9045                         }
9046                         freemsg(data_mp);
9047                         data_mp = new_data_mp;
9048                         mp->b_cont = data_mp;
9049                 }
9050                 table = (ip6_asp_t *)data_mp->b_rptr;
9051                 table_size = iocp->ioc_count;
9052         }
9053
9054         switch (iocp->ioc_cmd) {
9055         case SIOCGIP6ADDRPOLICY:
9056                 iocp->ioc_rval = ip6_asp_get(table, table_size, ipst);
9057                 if (iocp->ioc_rval == -1)
9058                         iocp->ioc_error = EINVAL;
9059 #if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4
9060                 else if (table != NULL &&
9061                     (iocp->ioc_flag & IOC_MODELS) == IOC_ILP32) {
9062                         ip6_asp_t *src = table;
9063                         ip6_asp32_t *dst = (void *)table;
9064                         int count = table_size / sizeof (ip6_asp_t);
9065                         int i;
9066
9067                         /*
9068                          * We need to do an in-place shrink of the array
9069                          * to match the alignment attributes of the
9070                          * 32-bit ABI looking at it.
9071                          */
9072                         /* LINTED: logical expression always true: op "||" */
9073                         ASSERT(sizeof (*src) > sizeof (*dst));
9074                         for (i = 1; i < count; i++)
9075                                 bcopy(src + i, dst + i, sizeof (*dst));
9076                 }
9077 #endif
9078                 break;
9079
9080         case SIOCSIP6ADDRPOLICY:
9081                 ASSERT(mp->b_prev == NULL);
9082                 mp->b_prev = (void *)q;
9083 #if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4
9084                 /*
9085                  * We pass in the datamodel here so that the ip6_asp_replace()
9086                  * routine can handle converting from 32-bit to native formats
9087                  * where necessary.
9088                  *
9089                  * A better way to handle this might be to convert the inbound
9090                  * data structure here, and hang it off a new 'mp'; thus the
9091                  * ip6_asp_replace() logic would always be dealing with native
9092                  * format data structures..
9093                  *
9094                  * (An even simpler way to handle these ioctls is to just
9095                  * add a 32-bit trailing 'pad' field to the ip6_asp_t structure
9096                  * and just recompile everything that depends on it.)
9097                  */
9098 #endif
9099                 ip6_asp_replace(mp, table, table_size, B_FALSE, ipst,
9100                     iocp->ioc_flag & IOC_MODELS);
9101                 return;
9102         }
9103
9104         DB_TYPE(mp) =  (iocp->ioc_error == 0) ? M_IOCACK : M_IOCNAK;
9105         qreply(q, mp);
9106 }
9107
9108 static void
9109 ip_sioctl_dstinfo(queue_t *q, mblk_t *mp)
9110 {
9111         mblk_t          *data_mp;
9112         struct dstinforeq       *dir;
9113         uint8_t         *end, *cur;
9114         in6_addr_t      *daddr, *saddr;
9115         ipaddr_t        v4daddr;
9116         ire_t           *ire;
9117         char            *slabel, *dlabel;
9118         boolean_t       isipv4;
9119         int             match_ire;
9120         ill_t           *dst_ill;
9121         ipif_t          *src_ipif, *ire_ipif;
9122         struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
9123         zoneid_t        zoneid;
9124         ip_stack_t      *ipst = CONNQ_TO_IPST(q);
9125
9126         ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */
9127         zoneid = Q_TO_CONN(q)->conn_zoneid;
9128
9129         /*
9130          * This ioctl is I_STR only, and must have a
9131          * data mblk following the M_IOCTL mblk.
9132          */
9133         data_mp = mp->b_cont;
9134         if (iocp->ioc_count == TRANSPARENT || data_mp == NULL) {
9135                 miocnak(q, mp, 0, EINVAL);
9136                 return;
9137         }
9138
9139         if (MBLKL(data_mp) < iocp->ioc_count) {
9140                 mblk_t *new_data_mp;
9141
9142                 if ((new_data_mp = msgpullup(data_mp, -1)) == NULL) {
9143                         miocnak(q, mp, 0, ENOMEM);
9144                         return;
9145                 }
9146                 freemsg(data_mp);
9147                 data_mp = new_data_mp;
9148                 mp->b_cont = data_mp;
9149         }
9150         match_ire = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | MATCH_IRE_PARENT;
9151
9152         for (cur = data_mp->b_rptr, end = data_mp->b_wptr;
9153             end - cur >= sizeof (struct dstinforeq);
9154             cur += sizeof (struct dstinforeq)) {
9155                 dir = (struct dstinforeq *)cur;
9156                 daddr = &dir->dir_daddr;
9157                 saddr = &dir->dir_saddr;
9158
9159                 /*
9160                  * ip_addr_scope_v6() and ip6_asp_lookup() handle
9161                  * v4 mapped addresses; ire_ftable_lookup[_v6]()
9162                  * and ipif_select_source[_v6]() do not.
9163                  */
9164                 dir->dir_dscope = ip_addr_scope_v6(daddr);
9165                 dlabel = ip6_asp_lookup(daddr, &dir->dir_precedence, ipst);
9166
9167                 isipv4 = IN6_IS_ADDR_V4MAPPED(daddr);
9168                 if (isipv4) {
9169                         IN6_V4MAPPED_TO_IPADDR(daddr, v4daddr);
9170                         ire = ire_ftable_lookup(v4daddr, NULL, NULL,
9171                             0, NULL, NULL, zoneid, 0, NULL, match_ire, ipst);
9172                 } else {
9173                         ire = ire_ftable_lookup_v6(daddr, NULL, NULL,
9174                             0, NULL, NULL, zoneid, 0, NULL, match_ire, ipst);
9175                 }
9176                 if (ire == NULL) {
9177                         dir->dir_dreachable = 0;
9178
9179                         /* move on to next dst addr */
9180                         continue;
9181                 }
9182                 dir->dir_dreachable = 1;
9183
9184                 ire_ipif = ire->ire_ipif;
9185                 if (ire_ipif == NULL)
9186                         goto next_dst;
9187
9188                 /*
9189                  * We expect to get back an interface ire or a
9190                  * gateway ire cache entry.  For both types, the
9191                  * output interface is ire_ipif->ipif_ill.
9192                  */
9193                 dst_ill = ire_ipif->ipif_ill;
9194                 dir->dir_dmactype = dst_ill->ill_mactype;
9195
9196                 if (isipv4) {
9197                         src_ipif = ipif_select_source(dst_ill, v4daddr, zoneid);
9198                 } else {
9199                         src_ipif = ipif_select_source_v6(dst_ill,
9200                             daddr, RESTRICT_TO_NONE, IPV6_PREFER_SRC_DEFAULT,
9201                             zoneid);
9202                 }
9203                 if (src_ipif == NULL)
9204                         goto next_dst;
9205
9206                 *saddr = src_ipif->ipif_v6lcl_addr;
9207                 dir->dir_sscope = ip_addr_scope_v6(saddr);
9208                 slabel = ip6_asp_lookup(saddr, NULL, ipst);
9209                 dir->dir_labelmatch = ip6_asp_labelcmp(dlabel, slabel);
9210                 dir->dir_sdeprecated =
9211                     (src_ipif->ipif_flags & IPIF_DEPRECATED) ? 1 : 0;
9212                 ipif_refrele(src_ipif);
9213 next_dst:
9214                 ire_refrele(ire);
9215         }
9216         miocack(q, mp, iocp->ioc_count, 0);
9217 }
9218
9219
9220 /*
9221  * Check if this is an address assigned to this machine.
9222  * Skips interfaces that are down by using ire checks.
9223  * Translates mapped addresses to v4 addresses and then
9224  * treats them as such, returning true if the v4 address
9225  * associated with this mapped address is configured.
9226  * Note: Applications will have to be careful what they do
9227  * with the response; use of mapped addresses limits
9228  * what can be done with the socket, especially with
9229  * respect to socket options and ioctls - neither IPv4
9230  * options nor IPv6 sticky options/ancillary data options
9231  * may be used.
9232  */
9233 /* ARGSUSED */
9234 int
9235 ip_sioctl_tmyaddr(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
9236     ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
9237 {
9238         struct sioc_addrreq *sia;
9239         sin_t *sin;
9240         ire_t *ire;
9241         mblk_t *mp1;
9242         zoneid_t zoneid;
9243         ip_stack_t      *ipst;
9244
9245         ip1dbg(("ip_sioctl_tmyaddr"));
9246
9247         ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */
9248         zoneid = Q_TO_CONN(q)->conn_zoneid;
9249         ipst = CONNQ_TO_IPST(q);
9250
9251         /* Existence verified in ip_wput_nondata */
9252         mp1 = mp->b_cont->b_cont;
9253         sia = (struct sioc_addrreq *)mp1->b_rptr;
9254         sin = (sin_t *)&sia->sa_addr;
9255         switch (sin->sin_family) {
9256         case AF_INET6: {
9257                 sin6_t *sin6 = (sin6_t *)sin;
9258
9259                 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
9260                         ipaddr_t v4_addr;
9261
9262                         IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr,
9263                             v4_addr);
9264                         ire = ire_ctable_lookup(v4_addr, 0,
9265                             IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid,
9266                             NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, ipst);
9267                 } else {
9268                         in6_addr_t v6addr;
9269
9270                         v6addr = sin6->sin6_addr;
9271                         ire = ire_ctable_lookup_v6(&v6addr, 0,
9272                             IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid,
9273                             NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, ipst);
9274                 }
9275                 break;
9276         }
9277         case AF_INET: {
9278                 ipaddr_t v4addr;
9279
9280                 v4addr = sin->sin_addr.s_addr;
9281                 ire = ire_ctable_lookup(v4addr, 0,
9282                     IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid,
9283                     NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, ipst);
9284                 break;
9285         }
9286         default:
9287                 return (EAFNOSUPPORT);
9288         }
9289         if (ire != NULL) {
9290                 sia->sa_res = 1;
9291                 ire_refrele(ire);
9292         } else {
9293                 sia->sa_res = 0;
9294         }
9295         return (0);
9296 }
9297
9298 /*
9299  * Check if this is an address assigned on-link i.e. neighbor,
9300  * and makes sure it's reachable from the current zone.
9301  * Returns true for my addresses as well.
9302  * Translates mapped addresses to v4 addresses and then
9303  * treats them as such, returning true if the v4 address
9304  * associated with this mapped address is configured.
9305  * Note: Applications will have to be careful what they do
9306  * with the response; use of mapped addresses limits
9307  * what can be done with the socket, especially with
9308  * respect to socket options and ioctls - neither IPv4
9309  * options nor IPv6 sticky options/ancillary data options
9310  * may be used.
9311  */
9312 /* ARGSUSED */
9313 int
9314 ip_sioctl_tonlink(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
9315     ip_ioctl_cmd_t *ipip, void *duymmy_ifreq)
9316 {
9317         struct sioc_addrreq *sia;
9318         sin_t *sin;
9319         mblk_t  *mp1;
9320         ire_t *ire = NULL;
9321         zoneid_t zoneid;
9322         ip_stack_t      *ipst;
9323
9324         ip1dbg(("ip_sioctl_tonlink"));
9325
9326         ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */
9327         zoneid = Q_TO_CONN(q)->conn_zoneid;
9328         ipst = CONNQ_TO_IPST(q);
9329
9330         /* Existence verified in ip_wput_nondata */
9331         mp1 = mp->b_cont->b_cont;
9332         sia = (struct sioc_addrreq *)mp1->b_rptr;
9333         sin = (sin_t *)&sia->sa_addr;
9334
9335         /*
9336          * Match addresses with a zero gateway field to avoid
9337          * routes going through a router.
9338          * Exclude broadcast and multicast addresses.
9339          */
9340         switch (sin->sin_family) {
9341         case AF_INET6: {
9342                 sin6_t *sin6 = (sin6_t *)sin;
9343
9344                 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
9345                         ipaddr_t v4_addr;
9346
9347                         IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr,
9348                             v4_addr);
9349                         if (!CLASSD(v4_addr)) {
9350                                 ire = ire_route_lookup(v4_addr, 0, 0, 0,
9351                                     NULL, NULL, zoneid, NULL,
9352                                     MATCH_IRE_GW, ipst);
9353                         }
9354                 } else {
9355                         in6_addr_t v6addr;
9356                         in6_addr_t v6gw;
9357
9358                         v6addr = sin6->sin6_addr;
9359                         v6gw = ipv6_all_zeros;
9360                         if (!IN6_IS_ADDR_MULTICAST(&v6addr)) {
9361                                 ire = ire_route_lookup_v6(&v6addr, 0,
9362                                     &v6gw, 0, NULL, NULL, zoneid,
9363                                     NULL, MATCH_IRE_GW, ipst);
9364                         }
9365                 }
9366                 break;
9367         }
9368         case AF_INET: {
9369                 ipaddr_t v4addr;
9370
9371                 v4addr = sin->sin_addr.s_addr;
9372                 if (!CLASSD(v4addr)) {
9373                         ire = ire_route_lookup(v4addr, 0, 0, 0,
9374                             NULL, NULL, zoneid, NULL,
9375                             MATCH_IRE_GW, ipst);
9376                 }
9377                 break;
9378         }
9379         default:
9380                 return (EAFNOSUPPORT);
9381         }
9382         sia->sa_res = 0;
9383         if (ire != NULL) {
9384                 if (ire->ire_type & (IRE_INTERFACE|IRE_CACHE|
9385                     IRE_LOCAL|IRE_LOOPBACK)) {
9386                         sia->sa_res = 1;
9387                 }
9388                 ire_refrele(ire);
9389         }
9390         return (0);
9391 }
9392
9393 /*
9394  * TBD: implement when kernel maintaines a list of site prefixes.
9395  */
9396 /* ARGSUSED */
9397 int
9398 ip_sioctl_tmysite(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
9399     ip_ioctl_cmd_t *ipip, void *ifreq)
9400 {
9401         return (ENXIO);
9402 }
9403
9404 /* ARGSUSED */
9405 int
9406 ip_sioctl_tunparam(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
9407     ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
9408 {
9409         ill_t           *ill;
9410         mblk_t          *mp1;
9411         conn_t          *connp;
9412         boolean_t       success;
9413
9414         ip1dbg(("ip_sioctl_tunparam(%s:%u %p)\n",
9415             ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
9416         /* ioctl comes down on an conn */
9417         ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL);
9418         connp = Q_TO_CONN(q);
9419
9420         mp->b_datap->db_type = M_IOCTL;
9421
9422         /*
9423          * Send down a copy. (copymsg does not copy b_next/b_prev).
9424          * The original mp contains contaminated b_next values due to 'mi',
9425          * which is needed to do the mi_copy_done. Unfortunately if we
9426          * send down the original mblk itself and if we are popped due to an
9427          * an unplumb before the response comes back from tunnel,
9428          * the streamhead (which does a freemsg) will see this contaminated
9429          * message and the assertion in freemsg about non-null b_next/b_prev
9430          * will panic a DEBUG kernel.
9431          */
9432         mp1 = copymsg(mp);
9433         if (mp1 == NULL)
9434                 return (ENOMEM);
9435
9436         ill = ipif->ipif_ill;
9437         mutex_enter(&connp->conn_lock);
9438         mutex_enter(&ill->ill_lock);
9439         if (ipip->ipi_cmd == SIOCSTUNPARAM || ipip->ipi_cmd == OSIOCSTUNPARAM) {
9440                 success = ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp),
9441                     mp, 0);
9442         } else {
9443                 success = ill_pending_mp_add(ill, connp, mp);
9444         }
9445         mutex_exit(&ill->ill_lock);
9446         mutex_exit(&connp->conn_lock);
9447
9448         if (success) {
9449                 ip1dbg(("sending down tunparam request "));
9450                 putnext(ill->ill_wq, mp1);
9451                 return (EINPROGRESS);
9452         } else {
9453                 /* The conn has started closing */
9454                 freemsg(mp1);
9455                 return (EINTR);
9456         }
9457 }
9458
9459 /*
9460  * ARP IOCTLs.
9461  * How does IP get in the business of fronting ARP configuration/queries?
9462  * Well it's like this, the Berkeley ARP IOCTLs (SIOCGARP, SIOCDARP, SIOCSARP)
9463  * are by tradition passed in through a datagram socket.  That lands in IP.
9464  * As it happens, this is just as well since the interface is quite crude in
9465  * that it passes in no information about protocol or hardware types, or
9466  * interface association.  After making the protocol assumption, IP is in
9467  * the position to look up the name of the ILL, which ARP will need, and
9468  * format a request that can be handled by ARP.  The request is passed up
9469  * stream to ARP, and the original IOCTL is completed by IP when ARP passes
9470  * back a response.  ARP supports its own set of more general IOCTLs, in
9471  * case anyone is interested.
9472  */
9473 /* ARGSUSED */
9474 int
9475 ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
9476     ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
9477 {
9478         mblk_t *mp1;
9479         mblk_t *mp2;
9480         mblk_t *pending_mp;
9481         ipaddr_t ipaddr;
9482         area_t *area;
9483         struct iocblk *iocp;
9484         conn_t *connp;
9485         struct arpreq *ar;
9486         struct xarpreq *xar;
9487         int flags, alength;
9488         char *lladdr;
9489         ip_stack_t      *ipst;
9490         ill_t *ill = ipif->ipif_ill;
9491         boolean_t if_arp_ioctl = B_FALSE;
9492
9493         ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL);
9494         connp = Q_TO_CONN(q);
9495         ipst = connp->conn_netstack->netstack_ip;
9496
9497         if (ipip->ipi_cmd_type == XARP_CMD) {
9498                 /* We have a chain - M_IOCTL-->MI_COPY_MBLK-->XARPREQ_MBLK */
9499                 xar = (struct xarpreq *)mp->b_cont->b_cont->b_rptr;
9500                 ar = NULL;
9501
9502                 flags = xar->xarp_flags;
9503                 lladdr = LLADDR(&xar->xarp_ha);
9504                 if_arp_ioctl = (xar->xarp_ha.sdl_nlen != 0);
9505                 /*
9506                  * Validate against user's link layer address length
9507                  * input and name and addr length limits.
9508                  */
9509                 alength = ill->ill_phys_addr_length;
9510                 if (ipip->ipi_cmd == SIOCSXARP) {
9511                         if (alength != xar->xarp_ha.sdl_alen ||
9512                             (alength + xar->xarp_ha.sdl_nlen >
9513                             sizeof (xar->xarp_ha.sdl_data)))
9514                                 return (EINVAL);
9515                 }
9516         } else {
9517                 /* We have a chain - M_IOCTL-->MI_COPY_MBLK-->ARPREQ_MBLK */
9518                 ar = (struct arpreq *)mp->b_cont->b_cont->b_rptr;
9519                 xar = NULL;
9520
9521                 flags = ar->arp_flags;
9522                 lladdr = ar->arp_ha.sa_data;
9523                 /*
9524                  * Theoretically, the sa_family could tell us what link
9525                  * layer type this operation is trying to deal with. By
9526                  * common usage AF_UNSPEC means ethernet. We'll assume
9527                  * any attempt to use the SIOC?ARP ioctls is for ethernet,
9528                  * for now. Our new SIOC*XARP ioctls can be used more
9529                  * generally.
9530                  *
9531                  * If the underlying media happens to have a non 6 byte
9532                  * address, arp module will fail set/get, but the del
9533                  * operation will succeed.
9534                  */
9535                 alength = 6;
9536                 if ((ipip->ipi_cmd != SIOCDARP) &&
9537                     (alength != ill->ill_phys_addr_length)) {
9538                         return (EINVAL);
9539                 }
9540         }
9541
9542         /*
9543          * We are going to pass up to ARP a packet chain that looks
9544          * like:
9545          *
9546          * M_IOCTL-->ARP_op_MBLK-->ORIG_M_IOCTL-->MI_COPY_MBLK-->[X]ARPREQ_MBLK
9547          *
9548          * Get a copy of the original IOCTL mblk to head the chain,
9549          * to be sent up (in mp1). Also get another copy to store
9550          * in the ill_pending_mp list, for matching the response
9551          * when it comes back from ARP.
9552          */
9553         mp1 = copyb(mp);
9554         pending_mp = copymsg(mp);
9555         if (mp1 == NULL || pending_mp == NULL) {
9556                 if (mp1 != NULL)
9557                         freeb(mp1);
9558                 if (pending_mp != NULL)
9559                         inet_freemsg(pending_mp);
9560                 return (ENOMEM);
9561         }
9562
9563         ipaddr = sin->sin_addr.s_addr;
9564
9565         mp2 = ill_arp_alloc(ill, (uchar_t *)&ip_area_template,
9566             (caddr_t)&ipaddr);
9567         if (mp2 == NULL) {
9568                 freeb(mp1);
9569                 inet_freemsg(pending_mp);
9570                 return (ENOMEM);
9571         }
9572         /* Put together the chain. */
9573         mp1->b_cont = mp2;
9574         mp1->b_datap->db_type = M_IOCTL;
9575         mp2->b_cont = mp;
9576         mp2->b_datap->db_type = M_DATA;
9577
9578         iocp = (struct iocblk *)mp1->b_rptr;
9579
9580         /*
9581          * An M_IOCDATA's payload (struct copyresp) is mostly the same as an
9582          * M_IOCTL's payload (struct iocblk), but 'struct copyresp' has a
9583          * cp_private field (or cp_rval on 32-bit systems) in place of the
9584          * ioc_count field; set ioc_count to be correct.
9585          */
9586         iocp->ioc_count = MBLKL(mp1->b_cont);
9587
9588         /*
9589          * Set the proper command in the ARP message.
9590          * Convert the SIOC{G|S|D}ARP calls into our
9591          * AR_ENTRY_xxx calls.
9592          */
9593         area = (area_t *)mp2->b_rptr;
9594         switch (iocp->ioc_cmd) {
9595         case SIOCDARP:
9596         case SIOCDXARP:
9597                 /*
9598                  * We defer deleting the corresponding IRE until
9599                  * we return from arp.
9600                  */
9601                 area->area_cmd = AR_ENTRY_DELETE;
9602                 area->area_proto_mask_offset = 0;
9603                 break;
9604         case SIOCGARP:
9605         case SIOCGXARP:
9606                 area->area_cmd = AR_ENTRY_SQUERY;
9607                 area->area_proto_mask_offset = 0;
9608                 break;
9609         case SIOCSARP:
9610         case SIOCSXARP:
9611                 /*
9612                  * Delete the corresponding ire to make sure IP will
9613                  * pick up any change from arp.
9614                  */
9615                 if (!if_arp_ioctl) {
9616                         (void) ip_ire_clookup_and_delete(ipaddr, NULL, ipst);
9617                 } else {
9618                         ipif_t *ipif = ipif_get_next_ipif(NULL, ill);
9619                         if (ipif != NULL) {
9620                                 (void) ip_ire_clookup_and_delete(ipaddr, ipif,
9621                                     ipst);
9622                                 ipif_refrele(ipif);
9623                         }
9624                 }
9625                 break;
9626         }
9627         iocp->ioc_cmd = area->area_cmd;
9628
9629         /*
9630          * Fill in the rest of the ARP operation fields.
9631          */
9632         area->area_hw_addr_length = alength;
9633         bcopy(lladdr, (char *)area + area->area_hw_addr_offset, alength);
9634
9635         /* Translate the flags. */
9636         if (flags & ATF_PERM)
9637                 area->area_flags |= ACE_F_PERMANENT;
9638         if (flags & ATF_PUBL)
9639                 area->area_flags |= ACE_F_PUBLISH;
9640         if (flags & ATF_AUTHORITY)
9641                 area->area_flags |= ACE_F_AUTHORITY;
9642
9643         /*
9644          * Before sending 'mp' to ARP, we have to clear the b_next
9645          * and b_prev. Otherwise if STREAMS encounters such a message
9646          * in freemsg(), (because ARP can close any time) it can cause
9647          * a panic. But mi code needs the b_next and b_prev values of
9648          * mp->b_cont, to complete the ioctl. So we store it here
9649          * in pending_mp->bcont, and restore it in ip_sioctl_iocack()
9650          * when the response comes down from ARP.
9651          */
9652         pending_mp->b_cont->b_next = mp->b_cont->b_next;
9653         pending_mp->b_cont->b_prev = mp->b_cont->b_prev;
9654         mp->b_cont->b_next = NULL;
9655         mp->b_cont->b_prev = NULL;
9656
9657         mutex_enter(&connp->conn_lock);
9658         mutex_enter(&ill->ill_lock);
9659         /* conn has not yet started closing, hence this can't fail */
9660         VERIFY(ill_pending_mp_add(ill, connp, pending_mp) != 0);
9661         mutex_exit(&ill->ill_lock);
9662         mutex_exit(&connp->conn_lock);
9663
9664         /*
9665          * Up to ARP it goes.  The response will come back in ip_wput() as an
9666          * M_IOCACK, and will be handed to ip_sioctl_iocack() for completion.
9667          */
9668         putnext(ill->ill_rq, mp1);
9669         return (EINPROGRESS);
9670 }
9671
9672 /*
9673  * Parse an [x]arpreq structure coming down SIOC[GSD][X]ARP ioctls, identify
9674  * the associated sin and refhold and return the associated ipif via `ci'.
9675  */
9676 int
9677 ip_extract_arpreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip,
9678     cmd_info_t *ci, ipsq_func_t func)
9679 {
9680         mblk_t  *mp1;
9681         int     err;
9682         sin_t   *sin;
9683         conn_t  *connp;
9684         ipif_t  *ipif;
9685         ire_t   *ire = NULL;
9686         ill_t   *ill = NULL;
9687         boolean_t exists;
9688         ip_stack_t *ipst;
9689         struct arpreq *ar;
9690         struct xarpreq *xar;
9691         struct sockaddr_dl *sdl;
9692
9693         /* ioctl comes down on a conn */
9694         ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL);
9695         connp = Q_TO_CONN(q);
9696         if (connp->conn_af_isv6)
9697                 return (ENXIO);
9698
9699         ipst = connp->conn_netstack->netstack_ip;
9700
9701         /* Verified in ip_wput_nondata */
9702         mp1 = mp->b_cont->b_cont;
9703
9704         if (ipip->ipi_cmd_type == XARP_CMD) {
9705                 ASSERT(MBLKL(mp1) >= sizeof (struct xarpreq));
9706                 xar = (struct xarpreq *)mp1->b_rptr;
9707                 sin = (sin_t *)&xar->xarp_pa;
9708                 sdl = &xar->xarp_ha;
9709
9710                 if (sdl->sdl_family != AF_LINK || sin->sin_family != AF_INET)
9711                         return (ENXIO);
9712                 if (sdl->sdl_nlen >= LIFNAMSIZ)
9713                         return (EINVAL);
9714         } else {
9715                 ASSERT(ipip->ipi_cmd_type == ARP_CMD);
9716                 ASSERT(MBLKL(mp1) >= sizeof (struct arpreq));
9717                 ar = (struct arpreq *)mp1->b_rptr;
9718                 sin = (sin_t *)&ar->arp_pa;
9719         }
9720
9721         if (ipip->ipi_cmd_type == XARP_CMD && sdl->sdl_nlen != 0) {
9722                 ipif = ipif_lookup_on_name(sdl->sdl_data, sdl->sdl_nlen,
9723                     B_FALSE, &exists, B_FALSE, ALL_ZONES, CONNP_TO_WQ(connp),
9724                     mp, func, &err, ipst);
9725                 if (ipif == NULL)
9726                         return (err);
9727                 if (ipif->ipif_id != 0 ||
9728                     ipif->ipif_net_type != IRE_IF_RESOLVER) {
9729                         ipif_refrele(ipif);
9730                         return (ENXIO);
9731                 }
9732         } else {
9733                 /*
9734                  * Either an SIOC[DGS]ARP or an SIOC[DGS]XARP with sdl_nlen ==
9735                  * 0: use the IP address to figure out the ill.  In the IPMP
9736                  * case, a simple forwarding table lookup will return the
9737                  * IRE_IF_RESOLVER for the first interface in the group, which
9738                  * might not be the interface on which the requested IP
9739                  * address was resolved due to the ill selection algorithm
9740                  * (see ip_newroute_get_dst_ill()).  So we do a cache table
9741                  * lookup first: if the IRE cache entry for the IP address is
9742                  * still there, it will contain the ill pointer for the right
9743                  * interface, so we use that. If the cache entry has been
9744                  * flushed, we fall back to the forwarding table lookup. This
9745                  * should be rare enough since IRE cache entries have a longer
9746                  * life expectancy than ARP cache entries.
9747                  */
9748                 ire = ire_cache_lookup(sin->sin_addr.s_addr, ALL_ZONES, NULL,
9749                     ipst);
9750                 if ((ire == NULL) || (ire->ire_type == IRE_LOOPBACK) ||
9751                     ((ill = ire_to_ill(ire)) == NULL) ||
9752                     (ill->ill_net_type != IRE_IF_RESOLVER)) {
9753                         if (ire != NULL)
9754                                 ire_refrele(ire);
9755                         ire = ire_ftable_lookup(sin->sin_addr.s_addr,
9756                             0, 0, IRE_IF_RESOLVER, NULL, NULL, ALL_ZONES, 0,
9757                             NULL, MATCH_IRE_TYPE, ipst);
9758                         if (ire == NULL || ((ill = ire_to_ill(ire)) == NULL)) {
9759
9760                                 if (ire != NULL)
9761                                         ire_refrele(ire);
9762                                 return (ENXIO);
9763                         }
9764                 }
9765                 ASSERT(ire != NULL && ill != NULL);
9766                 ipif = ill->ill_ipif;
9767                 ipif_refhold(ipif);
9768                 ire_refrele(ire);
9769         }
9770         ci->ci_sin = sin;
9771         ci->ci_ipif = ipif;
9772         return (0);
9773 }
9774
9775 /*
9776  * Do I_PLINK/I_LINK or I_PUNLINK/I_UNLINK with consistency checks and also
9777  * atomically set/clear the muxids. Also complete the ioctl by acking or
9778  * naking it.  Note that the code is structured such that the link type,
9779  * whether it's persistent or not, is treated equally.  ifconfig(1M) and
9780  * its clones use the persistent link, while pppd(1M) and perhaps many
9781  * other daemons may use non-persistent link.  When combined with some
9782  * ill_t states, linking and unlinking lower streams may be used as
9783  * indicators of dynamic re-plumbing events [see PSARC/1999/348].
9784  */
9785 /* ARGSUSED */
9786 void
9787 ip_sioctl_plink(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
9788 {
9789         mblk_t          *mp1, *mp2;
9790         struct linkblk  *li;
9791         struct ipmx_s   *ipmxp;
9792         ill_t           *ill;
9793         int             ioccmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd;
9794         int             err = 0;
9795         boolean_t       entered_ipsq = B_FALSE;
9796         boolean_t       islink;
9797         ip_stack_t      *ipst;
9798
9799         if (CONN_Q(q))
9800                 ipst = CONNQ_TO_IPST(q);
9801         else
9802                 ipst = ILLQ_TO_IPST(q);
9803
9804         ASSERT(ioccmd == I_PLINK || ioccmd == I_PUNLINK ||
9805             ioccmd == I_LINK || ioccmd == I_UNLINK);
9806
9807         islink = (ioccmd == I_PLINK || ioccmd == I_LINK);
9808
9809         mp1 = mp->b_cont;       /* This is the linkblk info */
9810         li = (struct linkblk *)mp1->b_rptr;
9811
9812         /*
9813          * ARP has added this special mblk, and the utility is asking us
9814          * to perform consistency checks, and also atomically set the
9815          * muxid. Ifconfig is an example.  It achieves this by using
9816          * /dev/arp as the mux to plink the arp stream, and pushes arp on
9817          * to /dev/udp[6] stream for use as the mux when plinking the IP
9818          * stream. SIOCSLIFMUXID is not required.  See ifconfig.c, arp.c
9819          * and other comments in this routine for more details.
9820          */
9821         mp2 = mp1->b_cont;      /* This is added by ARP */
9822
9823         /*
9824          * If I_{P}LINK/I_{P}UNLINK is issued by a utility other than
9825          * ifconfig which didn't push ARP on top of the dummy mux, we won't
9826          * get the special mblk above.  For backward compatibility, we
9827          * request ip_sioctl_plink_ipmod() to skip the consistency checks.
9828          * The utility will use SIOCSLIFMUXID to store the muxids.  This is
9829          * not atomic, and can leave the streams unplumbable if the utility
9830          * is interrupted before it does the SIOCSLIFMUXID.
9831          */
9832         if (mp2 == NULL) {
9833                 err = ip_sioctl_plink_ipmod(ipsq, q, mp, ioccmd, li, B_FALSE);
9834                 if (err == EINPROGRESS)
9835                         return;
9836                 goto done;
9837         }
9838
9839         /*
9840          * This is an I_{P}LINK sent down by ifconfig through the ARP module;
9841          * ARP has appended this last mblk to tell us whether the lower stream
9842          * is an arp-dev stream or an IP module stream.
9843          */
9844         ipmxp = (struct ipmx_s *)mp2->b_rptr;
9845         if (ipmxp->ipmx_arpdev_stream) {
9846                 /*
9847                  * The lower stream is the arp-dev stream.
9848                  */
9849                 ill = ill_lookup_on_name(ipmxp->ipmx_name, B_FALSE, B_FALSE,
9850                     q, mp, ip_sioctl_plink, &err, NULL, ipst);
9851                 if (ill == NULL) {
9852                         if (err == EINPROGRESS)
9853                                 return;
9854                         err = EINVAL;
9855                         goto done;
9856                 }
9857
9858                 if (ipsq == NULL) {
9859                         ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_sioctl_plink,
9860                             NEW_OP, B_TRUE);
9861                         if (ipsq == NULL) {
9862                                 ill_refrele(ill);
9863                                 return;
9864                         }
9865                         entered_ipsq = B_TRUE;
9866                 }
9867                 ASSERT(IAM_WRITER_ILL(ill));
9868                 ill_refrele(ill);
9869
9870                 /*
9871                  * To ensure consistency between IP and ARP, the following
9872                  * LIFO scheme is used in plink/punlink. (IP first, ARP last).
9873                  * This is because the muxid's are stored in the IP stream on
9874                  * the ill.
9875                  *
9876                  * I_{P}LINK: ifconfig plinks the IP stream before plinking
9877                  * the ARP stream. On an arp-dev stream, IP checks that it is
9878                  * not yet plinked, and it also checks that the corresponding
9879                  * IP stream is already plinked.
9880                  *
9881                  * I_{P}UNLINK: ifconfig punlinks the ARP stream before
9882                  * punlinking the IP stream. IP does not allow punlink of the
9883                  * IP stream unless the arp stream has been punlinked.
9884                  */
9885                 if ((islink &&
9886                     (ill->ill_arp_muxid != 0 || ill->ill_ip_muxid == 0)) ||
9887                     (!islink && ill->ill_arp_muxid != li->l_index)) {
9888                         err = EINVAL;
9889                         goto done;
9890                 }
9891                 ill->ill_arp_muxid = islink ? li->l_index : 0;
9892         } else {
9893                 /*
9894                  * The lower stream is probably an IP module stream.  Do
9895                  * consistency checking.
9896                  */
9897                 err = ip_sioctl_plink_ipmod(ipsq, q, mp, ioccmd, li, B_TRUE);
9898                 if (err == EINPROGRESS)
9899                         return;
9900         }
9901 done:
9902         if (err == 0)
9903                 miocack(q, mp, 0, 0);
9904         else
9905                 miocnak(q, mp, 0, err);
9906
9907         /* Conn was refheld in ip_sioctl_copyin_setup */
9908         if (CONN_Q(q))
9909                 CONN_OPER_PENDING_DONE(Q_TO_CONN(q));
9910         if (entered_ipsq)
9911                 ipsq_exit(ipsq, B_TRUE, B_TRUE);
9912 }
9913
9914 /*
9915  * Process I_{P}LINK and I_{P}UNLINK requests named by `ioccmd' and pointed to
9916  * by `mp' and `li' for the IP module stream (if li->q_bot is in fact an IP
9917  * module stream).  If `doconsist' is set, then do the extended consistency
9918  * checks requested by ifconfig(1M) and (atomically) set ill_ip_muxid here.
9919  * Returns zero on success, EINPROGRESS if the operation is still pending, or
9920  * an error code on failure.
9921  */
9922 static int
9923 ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd,
9924     struct linkblk *li, boolean_t doconsist)
9925 {
9926         ill_t           *ill;
9927         queue_t         *ipwq, *dwq;
9928         const char      *name;
9929         struct qinit    *qinfo;
9930         boolean_t       islink = (ioccmd == I_PLINK || ioccmd == I_LINK);
9931         boolean_t       entered_ipsq = B_FALSE;
9932
9933         /*
9934          * Walk the lower stream to verify it's the IP module stream.
9935          * The IP module is identified by its name, wput function,
9936          * and non-NULL q_next.  STREAMS ensures that the lower stream
9937          * (li->l_qbot) will not vanish until this ioctl completes.
9938          */
9939         for (ipwq = li->l_qbot; ipwq != NULL; ipwq = ipwq->q_next) {
9940                 qinfo = ipwq->q_qinfo;
9941                 name = qinfo->qi_minfo->mi_idname;
9942                 if (name != NULL && strcmp(name, ip_mod_info.mi_idname) == 0 &&
9943                     qinfo->qi_putp != (pfi_t)ip_lwput && ipwq->q_next != NULL) {
9944                         break;
9945                 }
9946         }
9947
9948         /*
9949          * If this isn't an IP module stream, bail.
9950          */
9951         if (ipwq == NULL)
9952                 return (0);
9953
9954         ill = ipwq->q_ptr;
9955         ASSERT(ill != NULL);
9956
9957         if (ipsq == NULL) {
9958                 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_sioctl_plink,
9959                     NEW_OP, B_TRUE);
9960                 if (ipsq == NULL)
9961                         return (EINPROGRESS);
9962                 entered_ipsq = B_TRUE;
9963         }
9964         ASSERT(IAM_WRITER_ILL(ill));
9965
9966         if (doconsist) {
9967                 /*
9968                  * Consistency checking requires that I_{P}LINK occurs
9969                  * prior to setting ill_ip_muxid, and that I_{P}UNLINK
9970                  * occurs prior to clearing ill_arp_muxid.
9971                  */
9972                 if ((islink && ill->ill_ip_muxid != 0) ||
9973                     (!islink && ill->ill_arp_muxid != 0)) {
9974                         if (entered_ipsq)
9975                                 ipsq_exit(ipsq, B_TRUE, B_TRUE);
9976                         return (EINVAL);
9977                 }
9978         }
9979
9980         /*
9981          * As part of I_{P}LINKing, stash the number of downstream modules and
9982          * the read queue of the module immediately below IP in the ill.
9983          * These are used during the capability negotiation below.
9984          */
9985         ill->ill_lmod_rq = NULL;
9986         ill->ill_lmod_cnt = 0;
9987         if (islink && ((dwq = ipwq->q_next) != NULL)) {
9988                 ill->ill_lmod_rq = RD(dwq);
9989                 for (; dwq != NULL; dwq = dwq->q_next)
9990                         ill->ill_lmod_cnt++;
9991         }
9992
9993         if (doconsist)
9994                 ill->ill_ip_muxid = islink ? li->l_index : 0;
9995
9996         /*
9997          * If there's at least one up ipif on this ill, then we're bound to
9998          * the underlying driver via DLPI.  In that case, renegotiate
9999          * capabilities to account for any possible change in modules
10000          * interposed between IP and the driver.
10001          */
10002         if (ill->ill_ipif_up_count > 0) {
10003                 if (islink)
10004                         ill_capability_probe(ill);
10005                 else
10006                         ill_capability_reset(ill);
10007         }
10008
10009         if (entered_ipsq)
10010                 ipsq_exit(ipsq, B_TRUE, B_TRUE);
10011
10012         return (0);
10013 }
10014
10015 /*
10016  * Search the ioctl command in the ioctl tables and return a pointer
10017  * to the ioctl command information. The ioctl command tables are
10018  * static and fully populated at compile time.
10019  */
10020 ip_ioctl_cmd_t *
10021 ip_sioctl_lookup(int ioc_cmd)
10022 {
10023         int index;
10024         ip_ioctl_cmd_t *ipip;
10025         ip_ioctl_cmd_t *ipip_end;
10026
10027         if (ioc_cmd == IPI_DONTCARE)
10028                 return (NULL);
10029
10030         /*
10031          * Do a 2 step search. First search the indexed table
10032          * based on the least significant byte of the ioctl cmd.
10033          * If we don't find a match, then search the misc table
10034          * serially.
10035          */
10036         index = ioc_cmd & 0xFF;
10037         if (index < ip_ndx_ioctl_count) {
10038                 ipip = &ip_ndx_ioctl_table[index];
10039                 if (ipip->ipi_cmd == ioc_cmd) {
10040                         /* Found a match in the ndx table */
10041                         return (ipip);
10042                 }
10043         }
10044
10045         /* Search the misc table */
10046         ipip_end = &ip_misc_ioctl_table[ip_misc_ioctl_count];
10047         for (ipip = ip_misc_ioctl_table; ipip < ipip_end; ipip++) {
10048                 if (ipip->ipi_cmd == ioc_cmd)
10049                         /* Found a match in the misc table */
10050                         return (ipip);
10051         }
10052
10053         return (NULL);
10054 }
10055
10056 /*
10057  * Wrapper function for resuming deferred ioctl processing
10058  * Used for SIOCGDSTINFO, SIOCGIP6ADDRPOLICY, SIOCGMSFILTER,
10059  * SIOCSMSFILTER, SIOCGIPMSFILTER, and SIOCSIPMSFILTER currently.
       *
       * The deferred ioctl message `mp' is handed back to
       * ip_sioctl_copyin_setup() on `q', so it is processed again from the
       * start.  `dummy_ipsq' and `dummy_arg' are not used; they are present
       * only so that this function has the signature expected of a resume
       * callback (ip_sioctl_copyin_setup() passes it to ip6_asp_pending_op()).
10060  */
10061 /* ARGSUSED */
10062 void
10063 ip_sioctl_copyin_resume(ipsq_t *dummy_ipsq, queue_t *q, mblk_t *mp,
10064     void *dummy_arg)
10065 {
10066         ip_sioctl_copyin_setup(q, mp);
10067 }
10068
10069 /*
10070  * ip_sioctl_copyin_setup is called by ip_wput with any M_IOCTL message
10071  * that arrives.  Most of the IOCTLs are "socket" IOCTLs which we handle
10072  * in either I_STR or TRANSPARENT form, using the mi_copy facility.
10073  * We establish here the size of the block to be copied in.  mi_copyin
10074  * arranges for this to happen, an processing continues in ip_wput with
10075  * an M_IOCDATA message.
10076  */
10077 void
10078 ip_sioctl_copyin_setup(queue_t *q, mblk_t *mp)
10079 {
10080         int     copyin_size;
10081         struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
10082         ip_ioctl_cmd_t *ipip;
10083         cred_t *cr;
10084         ip_stack_t      *ipst;
10085
10086         if (CONN_Q(q))
10087                 ipst = CONNQ_TO_IPST(q);
10088         else
10089                 ipst = ILLQ_TO_IPST(q);
10090
10091         ipip = ip_sioctl_lookup(iocp->ioc_cmd);
10092         if (ipip == NULL) {
10093                 /*
10094                  * The ioctl is not one we understand or own.
10095                  * Pass it along to be processed down stream,
10096                  * if this is a module instance of IP, else nak
10097                  * the ioctl.
10098                  */
10099                 if (q->q_next == NULL) {
10100                         goto nak;
10101                 } else {
10102                         putnext(q, mp);
10103                         return;
10104                 }
10105         }
10106
10107         /*
10108          * If this is deferred, then we will do all the checks when we
10109          * come back.
10110          */
10111         if ((iocp->ioc_cmd == SIOCGDSTINFO ||
10112             iocp->ioc_cmd == SIOCGIP6ADDRPOLICY) && !ip6_asp_can_lookup(ipst)) {
10113                 ip6_asp_pending_op(q, mp, ip_sioctl_copyin_resume);
10114                 return;
10115         }
10116
10117         /*
10118          * Only allow a very small subset of IP ioctls on this stream if
10119          * IP is a module and not a driver. Allowing ioctls to be processed
10120          * in this case may cause assert failures or data corruption.
10121          * Typically G[L]IFFLAGS, SLIFNAME/IF_UNITSEL are the only few
10122          * ioctls allowed on an IP module stream, after which this stream
10123          * normally becomes a multiplexor (at which time the stream head
10124          * will fail all ioctls).
10125          */
10126         if ((q->q_next != NULL) && !(ipip->ipi_flags & IPI_MODOK)) {
10127                 if (ipip->ipi_flags & IPI_PASS_DOWN) {
10128                         /*
10129                          * Pass common Streams ioctls which the IP
10130                          * module does not own or consume along to
10131                          * be processed down stream.
10132                          */
10133                         putnext(q, mp);
10134                         return;
10135                 } else {
10136                         goto nak;
10137                 }
10138         }
10139
10140         /* Make sure we have ioctl data to process. */
10141         if (mp->b_cont == NULL && !(ipip->ipi_flags & IPI_NULL_BCONT))
10142                 goto nak;
10143
10144         /*
10145          * Prefer dblk credential over ioctl credential; some synthesized
10146          * ioctls have kcred set because there's no way to crhold()
10147          * a credential in some contexts.  (ioc_cr is not crfree() by
10148          * the framework; the caller of ioctl needs to hold the reference
10149          * for the duration of the call).
10150          */
10151         cr = DB_CREDDEF(mp, iocp->ioc_cr);
10152
10153         /* Make sure normal users don't send down privileged ioctls */
10154         if ((ipip->ipi_flags & IPI_PRIV) &&
10155             (cr != NULL) && secpolicy_ip_config(cr, B_TRUE) != 0) {
10156                 /* We checked the privilege earlier but log it here */
10157                 miocnak(q, mp, 0, secpolicy_ip_config(cr, B_FALSE));
10158                 return;
10159         }
10160
10161         /*
10162          * The ioctl command tables can only encode fixed length
10163          * ioctl data. If the length is variable, the table will
10164          * encode the length as zero. Such special cases are handled
10165          * below in the switch.
10166          */
10167         if (ipip->ipi_copyin_size != 0) {
10168                 mi_copyin(q, mp, NULL, ipip->ipi_copyin_size);
10169                 return;
10170         }
10171
10172         switch (iocp->ioc_cmd) {
10173         case O_SIOCGIFCONF:
10174         case SIOCGIFCONF:
10175                 /*
10176                  * This IOCTL is hilarious.  See comments in
10177                  * ip_sioctl_get_ifconf for the story.
10178                  */
10179                 if (iocp->ioc_count == TRANSPARENT)
10180                         copyin_size = SIZEOF_STRUCT(ifconf,
10181                             iocp->ioc_flag);
10182                 else
10183                         copyin_size = iocp->ioc_count;
10184                 mi_copyin(q, mp, NULL, copyin_size);
10185                 return;
10186
10187         case O_SIOCGLIFCONF:
10188         case SIOCGLIFCONF:
10189                 copyin_size = SIZEOF_STRUCT(lifconf, iocp->ioc_flag);
10190                 mi_copyin(q, mp, NULL, copyin_size);
10191                 return;
10192
10193         case SIOCGLIFSRCOF:
10194                 copyin_size = SIZEOF_STRUCT(lifsrcof, iocp->ioc_flag);
10195                 mi_copyin(q, mp, NULL, copyin_size);
10196                 return;
10197         case SIOCGIP6ADDRPOLICY:
10198                 ip_sioctl_ip6addrpolicy(q, mp);
10199                 ip6_asp_table_refrele(ipst);
10200                 return;
10201
10202         case SIOCSIP6ADDRPOLICY:
10203                 ip_sioctl_ip6addrpolicy(q, mp);
10204                 return;
10205
10206         case SIOCGDSTINFO:
10207                 ip_sioctl_dstinfo(q, mp);
10208                 ip6_asp_table_refrele(ipst);
10209                 return;
10210
10211         case I_PLINK:
10212         case I_PUNLINK:
10213         case I_LINK:
10214         case I_UNLINK:
10215                 /*
10216                  * We treat non-persistent link similarly as the persistent
10217                  * link case, in terms of plumbing/unplumbing, as well as
10218                  * dynamic re-plumbing events indicator.  See comments
10219                  * in ip_sioctl_plink() for more.
10220                  *
10221                  * Request can be enqueued in the 'ipsq' while waiting
10222                  * to become exclusive. So bump up the conn ref.
10223                  */
10224                 if (CONN_Q(q))
10225                         CONN_INC_REF(Q_TO_CONN(q));
10226                 ip_sioctl_plink(NULL, q, mp, NULL);
10227                 return;
10228
10229         case ND_GET:
10230         case ND_SET:
10231                 /*
10232                  * Use of the nd table requires holding the reader lock.
10233                  * Modifying the nd table thru nd_load/nd_unload requires
10234                  * the writer lock.
10235                  */
10236                 rw_enter(&ipst->ips_ip_g_nd_lock, RW_READER);
10237                 if (nd_getset(q, ipst->ips_ip_g_nd, mp)) {
10238                         rw_exit(&ipst->ips_ip_g_nd_lock);
10239
10240                         if (iocp->ioc_error)
10241                                 iocp->ioc_count = 0;
10242                         mp->b_datap->db_type = M_IOCACK;
10243                         qreply(q, mp);
10244                         return;
10245                 }
10246                 rw_exit(&ipst->ips_ip_g_nd_lock);
10247                 /*
10248                  * We don't understand this subioctl of ND_GET / ND_SET.
10249                  * Maybe intended for some driver / module below us
10250                  */
10251                 if (q->q_next) {
10252                         putnext(q, mp);
10253                 } else {
10254                         iocp->ioc_error = ENOENT;
10255                         mp->b_datap->db_type = M_IOCNAK;
10256                         iocp->ioc_count = 0;
10257                         qreply(q, mp);
10258                 }
10259                 return;
10260
10261         case IP_IOCTL:
10262                 ip_wput_ioctl(q, mp);
10263                 return;
10264         default:
10265                 cmn_err(CE_PANIC, "should not happen ");
10266         }
10267 nak:
10268         if (mp->b_cont != NULL) {
10269                 freemsg(mp->b_cont);
10270                 mp->b_cont = NULL;
10271         }
10272         iocp->ioc_error = EINVAL;
10273         mp->b_datap->db_type = M_IOCNAK;
10274         iocp->ioc_count = 0;
10275         qreply(q, mp);
10276 }
10277
10278 /*
       * ip_wput hands off ARP IOCTL responses to us.
       *
       * `q' is the queue of the ill the response arrived on (q->q_ptr is the
       * ill) and `mp' is the M_IOCACK chain returned by ARP.  The original
       * socket ioctl is recovered from that chain and completed here through
       * ip_ioctl_finish() on the write queue of the conn that issued it:
       *
       *  - A malformed chain, or one with no matching pending request on the
       *    ill, is freed without completing any ioctl.
       *  - If an AR_ENTRY_SQUERY failed in ARP but an IRE_CACHE entry is found
       *    whose nce (if any) is ND_REACHABLE, the reply is built from that
       *    ire instead.
       *  - For AR_ENTRY_DELETE the matching IRE_CACHE entries are removed as
       *    well, and the ARP error is cleared when
       *    ip_ire_clookup_and_delete() reports success.
       *  - Any remaining error, or any command other than AR_ENTRY_SQUERY,
       *    completes without a copyout; a successful AR_ENTRY_SQUERY is
       *    translated from the area_t into the caller's struct {x}arpreq.
       */
10279 void
10280 ip_sioctl_iocack(queue_t *q, mblk_t *mp)
10281 {
10282         struct arpreq *ar;
10283         struct xarpreq *xar;
10284         area_t  *area;
10285         mblk_t  *area_mp;
10286         struct iocblk *iocp;
10287         mblk_t  *orig_ioc_mp, *tmp;
10288         struct iocblk   *orig_iocp;
10289         ill_t *ill;
10290         conn_t *connp = NULL;
10291         uint_t ioc_id;
10292         mblk_t *pending_mp;
10293         int x_arp_ioctl = B_FALSE, ifx_arp_ioctl = B_FALSE;
10294         int *flagsp;
10295         char *storage = NULL;
10296         sin_t *sin;
10297         ipaddr_t addr;
10298         int err;
10299         ip_stack_t *ipst;
10300
10301         ill = q->q_ptr;
10302         ASSERT(ill != NULL);
10303         ipst = ill->ill_ipst;
10304
10305         /*
10306          * We should get back from ARP a packet chain that looks like:
10307          * M_IOCACK-->ARP_op_MBLK-->ORIG_M_IOCTL-->MI_COPY_MBLK-->[X]ARPREQ_MBLK
10308          */
10309         if (!(area_mp = mp->b_cont) ||
10310             (area_mp->b_wptr - area_mp->b_rptr) < sizeof (ip_sock_ar_t) ||
10311             !(orig_ioc_mp = area_mp->b_cont) ||
10312             !orig_ioc_mp->b_cont || !orig_ioc_mp->b_cont->b_cont) {
10313                 freemsg(mp);
10314                 return;
10315         }
10316
10317         orig_iocp = (struct iocblk *)orig_ioc_mp->b_rptr;
10318
              /*
               * Point sin, flagsp and storage at the address, flags and
               * hardware address fields of the caller's request, which is a
               * struct xarpreq for the SIOC*XARP commands and a struct arpreq
               * otherwise.
               */
10319         tmp = (orig_ioc_mp->b_cont)->b_cont;
10320         if ((orig_iocp->ioc_cmd == SIOCGXARP) ||
10321             (orig_iocp->ioc_cmd == SIOCSXARP) ||
10322             (orig_iocp->ioc_cmd == SIOCDXARP)) {
10323                 x_arp_ioctl = B_TRUE;
10324                 xar = (struct xarpreq *)tmp->b_rptr;
10325                 sin = (sin_t *)&xar->xarp_pa;
10326                 flagsp = &xar->xarp_flags;
10327                 storage = xar->xarp_ha.sdl_data;
10328                 if (xar->xarp_ha.sdl_nlen != 0)
10329                         ifx_arp_ioctl = B_TRUE;
10330         } else {
10331                 ar = (struct arpreq *)tmp->b_rptr;
10332                 sin = (sin_t *)&ar->arp_pa;
10333                 flagsp = &ar->arp_flags;
10334                 storage = ar->arp_ha.sa_data;
10335         }
10336
10337         iocp = (struct iocblk *)mp->b_rptr;
10338
10339         /*
10340          * Pick out the originating queue based on the ioc_id.
10341          */
10342         ioc_id = iocp->ioc_id;
10343         pending_mp = ill_pending_mp_get(ill, &connp, ioc_id);
10344         if (pending_mp == NULL) {
10345                 ASSERT(connp == NULL);
10346                 inet_freemsg(mp);
10347                 return;
10348         }
10349         ASSERT(connp != NULL);
              /* From here on `q' is the write queue of the originating conn. */
10350         q = CONNP_TO_WQ(connp);
10351
10352         /* Uncouple the internally generated IOCTL from the original one */
10353         area = (area_t *)area_mp->b_rptr;
10354         area_mp->b_cont = NULL;
10355
10356         /*
10357          * Restore the b_next and b_prev used by mi code. This is needed
10358          * to complete the ioctl using mi* functions. We stored them in
10359          * the pending mp prior to sending the request to ARP.
10360          */
10361         orig_ioc_mp->b_cont->b_next = pending_mp->b_cont->b_next;
10362         orig_ioc_mp->b_cont->b_prev = pending_mp->b_cont->b_prev;
10363         inet_freemsg(pending_mp);
10364
10365         /*
10366          * We're done if there was an error or if this is not an SIOCG{X}ARP
10367          * Catch the case where there is an IRE_CACHE but no entry in the
10368          * arp table.
10369          */
10370         addr = sin->sin_addr.s_addr;
10371         if (iocp->ioc_error && iocp->ioc_cmd == AR_ENTRY_SQUERY) {
10372                 ire_t                   *ire;
10373                 dl_unitdata_req_t       *dlup;
10374                 mblk_t                  *llmp;
10375                 int                     addr_len;
10376                 ill_t                   *ipsqill = NULL;
10377
10378                 if (ifx_arp_ioctl) {
10379                         /*
10380                          * There's no need to lookup the ill, since
10381                          * we've already done that when we started
10382                          * processing the ioctl and sent the message
10383                          * to ARP on that ill.  So use the ill that
10384                          * is stored in q->q_ptr.
10385                          */
10386                         ipsqill = ill;
10387                         ire = ire_ctable_lookup(addr, 0, IRE_CACHE,
10388                             ipsqill->ill_ipif, ALL_ZONES,
10389                             NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst);
10390                 } else {
10391                         ire = ire_ctable_lookup(addr, 0, IRE_CACHE,
10392                             NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
10393                         if (ire != NULL)
10394                                 ipsqill = ire_to_ill(ire);
10395                 }
10396
10397                 if ((x_arp_ioctl) && (ipsqill != NULL))
10398                         storage += ill_xarp_info(&xar->xarp_ha, ipsqill);
10399
10400                 if (ire != NULL) {
10401                         /*
10402                          * Since the ire obtained from cachetable is used for
10403                          * mac addr copying below, treat an incomplete ire as
10404                          * if we never found it.
10405                          */
10406                         if (ire->ire_nce != NULL &&
10407                             ire->ire_nce->nce_state != ND_REACHABLE) {
10408                                 ire_refrele(ire);
10409                                 ire = NULL;
10410                                 ipsqill = NULL;
                                      /*
                                       * ioc_error is still set on this path, so
                                       * errack completes the ioctl with that
                                       * error.
                                       */
10411                                 goto errack;
10412                         }
10413                         *flagsp = ATF_INUSE;
10414                         llmp = (ire->ire_nce != NULL ?
10415                             ire->ire_nce->nce_res_mp : NULL);
10416                         if (llmp != NULL && ipsqill != NULL) {
10417                                 uchar_t *macaddr;
10418
10419                                 addr_len = ipsqill->ill_phys_addr_length;
10420                                 if (x_arp_ioctl && ((addr_len +
10421                                     ipsqill->ill_name_length) >
10422                                     sizeof (xar->xarp_ha.sdl_data))) {
10423                                         ire_refrele(ire);
10424                                         freemsg(mp);
10425                                         ip_ioctl_finish(q, orig_ioc_mp,
10426                                             EINVAL, NO_COPYOUT, NULL);
10427                                         return;
10428                                 }
10429                                 *flagsp |= ATF_COM;
10430                                 dlup = (dl_unitdata_req_t *)llmp->b_rptr;
10431                                 if (ipsqill->ill_sap_length < 0)
10432                                         macaddr = llmp->b_rptr +
10433                                             dlup->dl_dest_addr_offset;
10434                                 else
10435                                         macaddr = llmp->b_rptr +
10436                                             dlup->dl_dest_addr_offset +
10437                                             ipsqill->ill_sap_length;
10438                                 /*
10439                                  * For SIOCGARP, MAC address length
10440                                  * validation has already been done
10441                                  * before the ioctl was issued to ARP to
10442                                  * allow it to progress only on 6 byte
10443                                  * addressable (ethernet like) media. Thus
10444                                  * the mac address copying can not overwrite
10445                                  * the sa_data area below.
10446                                  */
10447                                 bcopy(macaddr, storage, addr_len);
10448                         }
10449                         /* Ditch the internal IOCTL. */
10450                         freemsg(mp);
10451                         ire_refrele(ire);
10452                         ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, NULL);
10453                         return;
10454                 }
10455         }
10456
10457         /*
10458          * Delete the corresponding IRE_CACHE if any.
10459          * Reset the error if there was one (in case there was no entry
10460          * in arp.)
10461          */
10462         if (iocp->ioc_cmd == AR_ENTRY_DELETE) {
10463                 ipif_t *ipintf = NULL;
10464
10465                 if (ifx_arp_ioctl) {
10466                         /*
10467                          * There's no need to lookup the ill, since
10468                          * we've already done that when we started
10469                          * processing the ioctl and sent the message
10470                          * to ARP on that ill.  So use the ill that
10471                          * is stored in q->q_ptr.
10472                          */
10473                         ipintf = ill->ill_ipif;
10474                 }
10475                 if (ip_ire_clookup_and_delete(addr, ipintf, ipst)) {
10476                         /*
10477                          * The address in "addr" may be an entry for a
10478                          * router. If that's true, then any off-net
10479                          * IRE_CACHE entries that go through the router
10480                          * with address "addr" must be clobbered. Use
10481                          * ire_walk to achieve this goal.
10482                          */
10483                         if (ifx_arp_ioctl)
10484                                 ire_walk_ill_v4(MATCH_IRE_ILL, 0,
10485                                     ire_delete_cache_gw, (char *)&addr, ill);
10486                         else
10487                                 ire_walk_v4(ire_delete_cache_gw, (char *)&addr,
10488                                     ALL_ZONES, ipst);
10489                         iocp->ioc_error = 0;
10490                 }
10491         }
10492 errack:
10493         if (iocp->ioc_error || iocp->ioc_cmd != AR_ENTRY_SQUERY) {
10494                 err = iocp->ioc_error;
10495                 freemsg(mp);
10496                 ip_ioctl_finish(q, orig_ioc_mp, err, NO_COPYOUT, NULL);
10497                 return;
10498         }
10499
10500         /*
10501          * Completion of an SIOCG{X}ARP.  Translate the information from
10502          * the area_t into the struct {x}arpreq.
10503          */
10504         if (x_arp_ioctl) {
10505                 storage += ill_xarp_info(&xar->xarp_ha, ill);
10506                 if ((ill->ill_phys_addr_length + ill->ill_name_length) >
10507                     sizeof (xar->xarp_ha.sdl_data)) {
10508                         freemsg(mp);
10509                         ip_ioctl_finish(q, orig_ioc_mp, EINVAL, NO_COPYOUT,
10510                             NULL);
10511                         return;
10512                 }
10513         }
10514         *flagsp = ATF_INUSE;
10515         if (area->area_flags & ACE_F_PERMANENT)
10516                 *flagsp |= ATF_PERM;
10517         if (area->area_flags & ACE_F_PUBLISH)
10518                 *flagsp |= ATF_PUBL;
10519         if (area->area_flags & ACE_F_AUTHORITY)
10520                 *flagsp |= ATF_AUTHORITY;
10521         if (area->area_hw_addr_length != 0) {
10522                 *flagsp |= ATF_COM;
10523                 /*
10524                  * For SIOCGARP, MAC address length validation has
10525                  * already been done before the ioctl was issued to ARP
10526                  * to allow it to progress only on 6 byte addressable
10527                  * (ethernet like) media. Thus the mac address copying
10528                  * can not overwrite the sa_data area below.
10529                  */
10530                 bcopy((char *)area + area->area_hw_addr_offset,
10531                     storage, area->area_hw_addr_length);
10532         }
10533
10534         /* Ditch the internal IOCTL. */
10535         freemsg(mp);
10536         /* Complete the original. */
10537         ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, NULL);
10538 }
10539
10540 /*
10541  * Create a new logical interface. If ipif_id is zero (i.e. not a logical
10542  * interface) create the next available logical interface for this
10543  * physical interface.
10544  * If ipif is NULL (i.e. the lookup didn't find one) attempt to create an
10545  * ipif with the specified name.
10546  *
10547  * If the address family is not AF_UNSPEC then set the address as well.
10548  *
10549  * If ip_sioctl_addr returns EINPROGRESS then the ioctl (the copyout)
10550  * is completed when the DL_BIND_ACK arrives in ip_rput_dlpi_writer.
10551  *
10552  * Executed as a writer on the ill or ill group.
10553  * So no lock is needed to traverse the ipif chain, or examine the
10554  * phyint flags.
10555  */
10556 /* ARGSUSED */
10557 int
10558 ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
10559     ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq)
10560 {
10561         mblk_t  *mp1;
10562         struct lifreq *lifr;
10563         boolean_t       isv6;
10564         boolean_t       exists;
10565         char    *name;
10566         char    *endp;
10567         char    *cp;
10568         int     namelen;
10569         ipif_t  *ipif;
10570         long    id;
10571         ipsq_t  *ipsq;
10572         ill_t   *ill;
10573         sin_t   *sin;
10574         int     err = 0;
10575         boolean_t found_sep = B_FALSE;
10576         conn_t  *connp;
10577         zoneid_t zoneid;
10578         int     orig_ifindex = 0;
10579         ip_stack_t *ipst = CONNQ_TO_IPST(q);
10580
10581         ASSERT(q->q_next == NULL);
10582         ip1dbg(("ip_sioctl_addif\n"));
10583         /* Existence of mp1 has been checked in ip_wput_nondata */
10584         mp1 = mp->b_cont->b_cont;
10585         /*
10586          * Null terminate the string to protect against buffer
10587          * overrun. String was generated by user code and may not
10588          * be trusted.
10589          */
10590         lifr = (struct lifreq *)mp1->b_rptr;
10591         lifr->lifr_name[LIFNAMSIZ - 1] = '\0';
10592         name = lifr->lifr_name;
10593         ASSERT(CONN_Q(q));
10594         connp = Q_TO_CONN(q);
10595         isv6 = connp->conn_af_isv6;
10596         zoneid = connp->conn_zoneid;
10597         namelen = mi_strlen(name);
10598         if (namelen == 0)
10599                 return (EINVAL);
10600
10601         exists = B_FALSE;
10602         if ((namelen + 1 == sizeof (ipif_loopback_name)) &&
10603             (mi_strcmp(name, ipif_loopback_name) == 0)) {
10604                 /*
10605                  * Allow creating lo0 using SIOCLIFADDIF.
10606                  * can't be any other writer thread. So can pass null below
10607                  * for the last 4 args to ipif_lookup_name.
10608                  */
10609                 ipif = ipif_lookup_on_name(lifr->lifr_name, namelen, B_TRUE,
10610                     &exists, isv6, zoneid, NULL, NULL, NULL, NULL, ipst);
10611                 /* Prevent any further action */
10612                 if (ipif == NULL) {
10613                         return (ENOBUFS);
10614                 } else if (!exists) {
10615                         /* We created the ipif now and as writer */
10616                         ipif_refrele(ipif);
10617                         return (0);
10618                 } else {
10619                         ill = ipif->ipif_ill;
10620                         ill_refhold(ill);
10621                         ipif_refrele(ipif);
10622                 }
10623         } else {
10624                 /* Look for a colon in the name. */
10625                 endp = &name[namelen];
10626                 for (cp = endp; --cp > name; ) {
10627                         if (*cp == IPIF_SEPARATOR_CHAR) {
10628                                 found_sep = B_TRUE;
10629                                 /*
10630                                  * Reject any non-decimal aliases for plumbing
10631                                  * of logical interfaces. Aliases with leading
10632                                  * zeroes are also rejected as they introduce
10633                                  * ambiguity in the naming of the interfaces.
10634                                  * Comparing with "0" takes care of all such
10635                                  * cases.
10636                                  */
10637                                 if ((strncmp("0", cp+1, 1)) == 0)
10638                                         return (EINVAL);
10639
10640                                 if (ddi_strtol(cp+1, &endp, 10, &id) != 0 ||
10641                                     id <= 0 || *endp != '\0') {
10642                                         return (EINVAL);
10643                                 }
10644                                 *cp = '\0';
10645                                 break;
10646                         }
10647                 }
10648                 ill = ill_lookup_on_name(name, B_FALSE, isv6,
10649                     CONNP_TO_WQ(connp), mp, ip_process_ioctl, &err, NULL, ipst);
10650                 if (found_sep)
10651                         *cp = IPIF_SEPARATOR_CHAR;
10652                 if (ill == NULL)
10653                         return (err);
10654         }
10655
10656         ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_process_ioctl, NEW_OP,
10657             B_TRUE);
10658
10659         /*
10660          * Release the refhold due to the lookup, now that we are excl
10661          * or we are just returning
10662          */
10663         ill_refrele(ill);
10664
10665         if (ipsq == NULL)
10666                 return (EINPROGRESS);
10667
10668         /*
10669          * If the interface is failed, inactive or offlined, look for a working
10670          * interface in the ill group and create the ipif there. If we can't
10671          * find a good interface, create the ipif anyway so that in.mpathd can
10672          * move it to the first repaired interface.
10673          */
10674         if ((ill->ill_phyint->phyint_flags &
10675             (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) &&
10676             ill->ill_phyint->phyint_groupname_len != 0) {
10677                 phyint_t *phyi;
10678                 char *groupname = ill->ill_phyint->phyint_groupname;
10679
10680                 /*
10681                  * We're looking for a working interface, but it doesn't matter
10682                  * if it's up or down; so instead of following the group lists,
10683                  * we look at each physical interface and compare the groupname.
10684                  * We're only interested in interfaces with IPv4 (resp. IPv6)
10685                  * plumbed when we're adding an IPv4 (resp. IPv6) ipif.
10686                  * Otherwise we create the ipif on the failed interface.
10687                  */
10688                 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
10689                 phyi = avl_first(&ipst->ips_phyint_g_list->
10690                     phyint_list_avl_by_index);
10691                 for (; phyi != NULL;
10692                     phyi = avl_walk(&ipst->ips_phyint_g_list->
10693                     phyint_list_avl_by_index,
10694                     phyi, AVL_AFTER)) {
10695                         if (phyi->phyint_groupname_len == 0)
10696                                 continue;
10697                         ASSERT(phyi->phyint_groupname != NULL);
10698                         if (mi_strcmp(groupname, phyi->phyint_groupname) == 0 &&
10699                             !(phyi->phyint_flags &
10700                             (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) &&
10701                             (ill->ill_isv6 ? (phyi->phyint_illv6 != NULL) :
10702                             (phyi->phyint_illv4 != NULL))) {
10703                                 break;
10704                         }
10705                 }
10706                 rw_exit(&ipst->ips_ill_g_lock);
10707
10708                 if (phyi != NULL) {
10709                         orig_ifindex = ill->ill_phyint->phyint_ifindex;
10710                         ill = (ill->ill_isv6 ? phyi->phyint_illv6 :
10711                             phyi->phyint_illv4);
10712                 }
10713         }
10714
10715         /*
10716          * We are now exclusive on the ipsq, so an ill move will be serialized
10717          * before or after us.
10718          */
10719         ASSERT(IAM_WRITER_ILL(ill));
10720         ASSERT(ill->ill_move_in_progress == B_FALSE);
10721
10722         if (found_sep && orig_ifindex == 0) {
10723                 /* Now see if there is an IPIF with this unit number. */
10724                 for (ipif = ill->ill_ipif; ipif != NULL;
10725                     ipif = ipif->ipif_next) {
10726                         if (ipif->ipif_id == id) {
10727                                 err = EEXIST;
10728                                 goto done;
10729                         }
10730                 }
10731         }
10732
10733         /*
10734          * We use IRE_LOCAL for lo0:1 etc. for "receive only" use
10735          * of lo0. We never come here when we plumb lo0:0. It
10736          * happens in ipif_lookup_on_name.
10737          * The specified unit number is ignored when we create the ipif on a
10738          * different interface. However, we save it in ipif_orig_ipifid below so
10739          * that the ipif fails back to the right position.
10740          */
10741         if ((ipif = ipif_allocate(ill, (found_sep && orig_ifindex == 0) ?
10742             id : -1, IRE_LOCAL, B_TRUE)) == NULL) {
10743                 err = ENOBUFS;
10744                 goto done;
10745         }
10746
10747         /* Return created name with ioctl */
10748         (void) sprintf(lifr->lifr_name, "%s%c%d", ill->ill_name,
10749             IPIF_SEPARATOR_CHAR, ipif->ipif_id);
10750         ip1dbg(("created %s\n", lifr->lifr_name));
10751
10752         /* Set address */
10753         sin = (sin_t *)&lifr->lifr_addr;
10754         if (sin->sin_family != AF_UNSPEC) {
10755                 err = ip_sioctl_addr(ipif, sin, q, mp,
10756                     &ip_ndx_ioctl_table[SIOCLIFADDR_NDX], lifr);
10757         }
10758
10759         /* Set ifindex and unit number for failback */
10760         if (err == 0 && orig_ifindex != 0) {
10761                 ipif->ipif_orig_ifindex = orig_ifindex;
10762                 if (found_sep) {
10763                         ipif->ipif_orig_ipifid = id;
10764                 }
10765         }
10766
10767 done:
10768         ipsq_exit(ipsq, B_TRUE, B_TRUE);
10769         return (err);
10770 }
10771
10772 /*
10773  * Remove an existing logical interface. If ipif_id is zero (i.e. not a logical
10774  * interface) delete it based on the IP address (on this physical interface).
10775  * Otherwise delete it based on the ipif_id.
10776  * Also, special handling to allow a removeif of lo0.
10777  */
10778 /* ARGSUSED */
10779 int
10780 ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10781     ip_ioctl_cmd_t *ipip, void *dummy_if_req)
10782 {
10783         conn_t          *connp;
10784         ill_t           *ill = ipif->ipif_ill;
10785         boolean_t        success;
10786         ip_stack_t      *ipst;
10787
10788         ipst = CONNQ_TO_IPST(q);
10789
10790         ASSERT(q->q_next == NULL);
10791         ip1dbg(("ip_sioctl_remove_if(%s:%u %p)\n",
10792             ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10793         ASSERT(IAM_WRITER_IPIF(ipif));
10794
10795         connp = Q_TO_CONN(q);
10796         /*
10797          * Special case for unplumbing lo0 (the loopback physical interface).
10798          * If unplumbing lo0, the incoming address structure has been
10799          * initialized to all zeros. When unplumbing lo0, all its logical
10800          * interfaces must be removed too.
10801          *
10802          * Note that this interface may be called to remove a specific
10803          * loopback logical interface (eg, lo0:1). But in that case
10804          * ipif->ipif_id != 0 so that the code path for that case is the
10805          * same as any other interface (meaning it skips the code directly
10806          * below).
10807          */
10808         if (ipif->ipif_id == 0 && ipif->ipif_net_type == IRE_LOOPBACK) {
10809                 if (sin->sin_family == AF_UNSPEC &&
10810                     (IN6_IS_ADDR_UNSPECIFIED(&((sin6_t *)sin)->sin6_addr))) {
10811                         /*
10812                          * Mark it condemned. No new ref. will be made to ill.
10813                          */
10814                         mutex_enter(&ill->ill_lock);
10815                         ill->ill_state_flags |= ILL_CONDEMNED;
10816                         for (ipif = ill->ill_ipif; ipif != NULL;
10817                             ipif = ipif->ipif_next) {
10818                                 ipif->ipif_state_flags |= IPIF_CONDEMNED;
10819                         }
10820                         mutex_exit(&ill->ill_lock);
10821
10822                         ipif = ill->ill_ipif;
10823                         /* unplumb the loopback interface */
10824                         ill_delete(ill);
10825                         mutex_enter(&connp->conn_lock);
10826                         mutex_enter(&ill->ill_lock);
10827                         ASSERT(ill->ill_group == NULL);
10828
10829                         /* Are any references to this ill active */
10830                         if (ill_is_quiescent(ill)) {
10831                                 mutex_exit(&ill->ill_lock);
10832                                 mutex_exit(&connp->conn_lock);
10833                                 ill_delete_tail(ill);
10834                                 mi_free(ill);
10835                                 return (0);
10836                         }
10837                         success = ipsq_pending_mp_add(connp, ipif,
10838                             CONNP_TO_WQ(connp), mp, ILL_FREE);
10839                         mutex_exit(&connp->conn_lock);
10840                         mutex_exit(&ill->ill_lock);
10841                         if (success)
10842                                 return (EINPROGRESS);
10843                         else
10844                                 return (EINTR);
10845                 }
10846         }
10847
10848         /*
10849          * We are exclusive on the ipsq, so an ill move will be serialized
10850          * before or after us.
10851          */
10852         ASSERT(ill->ill_move_in_progress == B_FALSE);
10853
10854         if (ipif->ipif_id == 0) {
10855                 /* Find based on address */
10856                 if (ipif->ipif_isv6) {
10857                         sin6_t *sin6;
10858
10859                         if (sin->sin_family != AF_INET6)
10860                                 return (EAFNOSUPPORT);
10861
10862                         sin6 = (sin6_t *)sin;
10863                         /* We are a writer, so we should be able to lookup */
10864                         ipif = ipif_lookup_addr_v6(&sin6->sin6_addr,
10865                             ill, ALL_ZONES, NULL, NULL, NULL, NULL, ipst);
10866                         if (ipif == NULL) {
10867                                 /*
10868                                  * Maybe the address in on another interface in
10869                                  * the same IPMP group? We check this below.
10870                                  */
10871                                 ipif = ipif_lookup_addr_v6(&sin6->sin6_addr,
10872                                     NULL, ALL_ZONES, NULL, NULL, NULL, NULL,
10873                                     ipst);
10874                         }
10875                 } else {
10876                         ipaddr_t addr;
10877
10878                         if (sin->sin_family != AF_INET)
10879                                 return (EAFNOSUPPORT);
10880
10881                         addr = sin->sin_addr.s_addr;
10882                         /* We are a writer, so we should be able to lookup */
10883                         ipif = ipif_lookup_addr(addr, ill, ALL_ZONES, NULL,
10884                             NULL, NULL, NULL, ipst);
10885                         if (ipif == NULL) {
10886                                 /*
10887                                  * Maybe the address in on another interface in
10888                                  * the same IPMP group? We check this below.
10889                                  */
10890                                 ipif = ipif_lookup_addr(addr, NULL, ALL_ZONES,
10891                                     NULL, NULL, NULL, NULL, ipst);
10892                         }
10893                 }
10894                 if (ipif == NULL) {
10895                         return (EADDRNOTAVAIL);
10896                 }
10897                 /*
10898                  * When the address to be removed is hosted on a different
10899                  * interface, we check if the interface is in the same IPMP
10900                  * group as the specified one; if so we proceed with the
10901                  * removal.
10902                  * ill->ill_group is NULL when the ill is down, so we have to
10903                  * compare the group names instead.
10904                  */
10905                 if (ipif->ipif_ill != ill &&
10906                     (ipif->ipif_ill->ill_phyint->phyint_groupname_len == 0 ||
10907                     ill->ill_phyint->phyint_groupname_len == 0 ||
10908                     mi_strcmp(ipif->ipif_ill->ill_phyint->phyint_groupname,
10909                     ill->ill_phyint->phyint_groupname) != 0)) {
10910                         ipif_refrele(ipif);
10911                         return (EADDRNOTAVAIL);
10912                 }
10913
10914                 /* This is a writer */
10915                 ipif_refrele(ipif);
10916         }
10917
10918         /*
10919          * Can not delete instance zero since it is tied to the ill.
10920          */
10921         if (ipif->ipif_id == 0)
10922                 return (EBUSY);
10923
10924         mutex_enter(&ill->ill_lock);
10925         ipif->ipif_state_flags |= IPIF_CONDEMNED;
10926         mutex_exit(&ill->ill_lock);
10927
10928         ipif_free(ipif);
10929
10930         mutex_enter(&connp->conn_lock);
10931         mutex_enter(&ill->ill_lock);
10932
10933         /* Are any references to this ipif active */
10934         if (ipif->ipif_refcnt == 0 && ipif->ipif_ire_cnt == 0) {
10935                 mutex_exit(&ill->ill_lock);
10936                 mutex_exit(&connp->conn_lock);
10937                 ipif_non_duplicate(ipif);
10938                 ipif_down_tail(ipif);
10939                 ipif_free_tail(ipif);
10940                 return (0);
10941         }
10942         success = ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp), mp,
10943             IPIF_FREE);
10944         mutex_exit(&ill->ill_lock);
10945         mutex_exit(&connp->conn_lock);
10946         if (success)
10947                 return (EINPROGRESS);
10948         else
10949                 return (EINTR);
10950 }
10951
10952 /*
10953  * Restart the removeif ioctl. The refcnt has gone down to 0.
10954  * The ipif is already condemned. So can't find it thru lookups.
10955  */
10956 /* ARGSUSED */
10957 int
10958 ip_sioctl_removeif_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q,
10959     mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_if_req)
10960 {
10961         ill_t *ill = ipif->ipif_ill;
10962
10963         ASSERT(IAM_WRITER_IPIF(ipif));
10964         ASSERT(ipif->ipif_state_flags & IPIF_CONDEMNED);
10965
10966         ip1dbg(("ip_sioctl_removeif_restart(%s:%u %p)\n",
10967             ill->ill_name, ipif->ipif_id, (void *)ipif));
10968
10969         if (ipif->ipif_id == 0 && ipif->ipif_net_type == IRE_LOOPBACK) {
10970                 ASSERT(ill->ill_state_flags & ILL_CONDEMNED);
10971                 ill_delete_tail(ill);
10972                 mi_free(ill);
10973                 return (0);
10974         }
10975
10976         ipif_non_duplicate(ipif);
10977         ipif_down_tail(ipif);
10978         ipif_free_tail(ipif);
10979
10980         ILL_UNMARK_CHANGING(ill);
10981         return (0);
10982 }
10983
10984 /*
10985  * Set the local interface address.
10986  * Allow an address of all zero when the interface is down.
10987  */
10988 /* ARGSUSED */
10989 int
10990 ip_sioctl_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10991     ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq)
10992 {
10993         int err = 0;
10994         in6_addr_t v6addr;
10995         boolean_t need_up = B_FALSE;
10996
10997         ip1dbg(("ip_sioctl_addr(%s:%u %p)\n",
10998             ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10999
11000         ASSERT(IAM_WRITER_IPIF(ipif));
11001
11002         if (ipif->ipif_isv6) {
11003                 sin6_t *sin6;
11004                 ill_t *ill;
11005                 phyint_t *phyi;
11006
11007                 if (sin->sin_family != AF_INET6)
11008                         return (EAFNOSUPPORT);
11009
11010                 sin6 = (sin6_t *)sin;
11011                 v6addr = sin6->sin6_addr;
11012                 ill = ipif->ipif_ill;
11013                 phyi = ill->ill_phyint;
11014
11015                 /*
11016                  * Enforce that true multicast interfaces have a link-local
11017                  * address for logical unit 0.
11018                  */
11019                 if (ipif->ipif_id == 0 &&
11020                     (ill->ill_flags & ILLF_MULTICAST) &&
11021                     !(ipif->ipif_flags & (IPIF_POINTOPOINT)) &&
11022                     !(phyi->phyint_flags & (PHYI_LOOPBACK)) &&
11023                     !IN6_IS_ADDR_LINKLOCAL(&v6addr)) {
11024                         return (EADDRNOTAVAIL);
11025                 }
11026
11027                 /*
11028                  * up interfaces shouldn't have the unspecified address
11029                  * unless they also have the IPIF_NOLOCAL flags set and
11030                  * have a subnet assigned.
11031                  */
11032                 if ((ipif->ipif_flags & IPIF_UP) &&
11033                     IN6_IS_ADDR_UNSPECIFIED(&v6addr) &&
11034                     (!(ipif->ipif_flags & IPIF_NOLOCAL) ||
11035                     IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) {
11036                         return (EADDRNOTAVAIL);
11037                 }
11038
11039                 if (!ip_local_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask))
11040                         return (EADDRNOTAVAIL);
11041         } else {
11042                 ipaddr_t addr;
11043
11044                 if (sin->sin_family != AF_INET)
11045                         return (EAFNOSUPPORT);
11046
11047                 addr = sin->sin_addr.s_addr;
11048
11049                 /* Allow 0 as the local address. */
11050                 if (addr != 0 && !ip_addr_ok_v4(addr, ipif->ipif_net_mask))
11051                         return (EADDRNOTAVAIL);
11052
11053                 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
11054         }
11055
11056
11057         /*
11058          * Even if there is no change we redo things just to rerun
11059          * ipif_set_default.
11060          */
11061         if (ipif->ipif_flags & IPIF_UP) {
11062                 /*
11063                  * Setting a new local address, make sure
11064                  * we have net and subnet bcast ire's for
11065                  * the old address if we need them.
11066                  */
11067                 if (!ipif->ipif_isv6)
11068                         ipif_check_bcast_ires(ipif);
11069                 /*
11070                  * If the interface is already marked up,
11071                  * we call ipif_down which will take care
11072                  * of ditching any IREs that have been set
11073                  * up based on the old interface address.
11074                  */
11075                 err = ipif_logical_down(ipif, q, mp);
11076                 if (err == EINPROGRESS)
11077                         return (err);
11078                 ipif_down_tail(ipif);
11079                 need_up = 1;
11080         }
11081
11082         err = ip_sioctl_addr_tail(ipif, sin, q, mp, need_up);
11083         return (err);
11084 }
11085
11086 int
11087 ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11088     boolean_t need_up)
11089 {
11090         in6_addr_t v6addr;
11091         in6_addr_t ov6addr;
11092         ipaddr_t addr;
11093         sin6_t  *sin6;
11094         int     sinlen;
11095         int     err = 0;
11096         ill_t   *ill = ipif->ipif_ill;
11097         boolean_t need_dl_down;
11098         boolean_t need_arp_down;
11099         struct iocblk *iocp;
11100
11101         iocp = (mp != NULL) ? (struct iocblk *)mp->b_rptr : NULL;
11102
11103         ip1dbg(("ip_sioctl_addr_tail(%s:%u %p)\n",
11104             ill->ill_name, ipif->ipif_id, (void *)ipif));
11105         ASSERT(IAM_WRITER_IPIF(ipif));
11106
11107         /* Must cancel any pending timer before taking the ill_lock */
11108         if (ipif->ipif_recovery_id != 0)
11109                 (void) untimeout(ipif->ipif_recovery_id);
11110         ipif->ipif_recovery_id = 0;
11111
11112         if (ipif->ipif_isv6) {
11113                 sin6 = (sin6_t *)sin;
11114                 v6addr = sin6->sin6_addr;
11115                 sinlen = sizeof (struct sockaddr_in6);
11116         } else {
11117                 addr = sin->sin_addr.s_addr;
11118                 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
11119                 sinlen = sizeof (struct sockaddr_in);
11120         }
11121         mutex_enter(&ill->ill_lock);
11122         ov6addr = ipif->ipif_v6lcl_addr;
11123         ipif->ipif_v6lcl_addr = v6addr;
11124         sctp_update_ipif_addr(ipif, ov6addr);
11125         if (ipif->ipif_flags & (IPIF_ANYCAST | IPIF_NOLOCAL)) {
11126                 ipif->ipif_v6src_addr = ipv6_all_zeros;
11127         } else {
11128                 ipif->ipif_v6src_addr = v6addr;
11129         }
11130         ipif->ipif_addr_ready = 0;
11131
11132         /*
11133          * If the interface was previously marked as a duplicate, then since
11134          * we've now got a "new" address, it should no longer be considered a
11135          * duplicate -- even if the "new" address is the same as the old one.
11136          * Note that if all ipifs are down, we may have a pending ARP down
11137          * event to handle.  This is because we want to recover from duplicates
11138          * and thus delay tearing down ARP until the duplicates have been
11139          * removed or disabled.
11140          */
11141         need_dl_down = need_arp_down = B_FALSE;
11142         if (ipif->ipif_flags & IPIF_DUPLICATE) {
11143                 need_arp_down = !need_up;
11144                 ipif->ipif_flags &= ~IPIF_DUPLICATE;
11145                 if (--ill->ill_ipif_dup_count == 0 && !need_up &&
11146                     ill->ill_ipif_up_count == 0 && ill->ill_dl_up) {
11147                         need_dl_down = B_TRUE;
11148                 }
11149         }
11150
11151         if (ipif->ipif_isv6 && IN6_IS_ADDR_6TO4(&v6addr) &&
11152             !ill->ill_is_6to4tun) {
11153                 queue_t *wqp = ill->ill_wq;
11154
11155                 /*
11156                  * The local address of this interface is a 6to4 address,
11157                  * check if this interface is in fact a 6to4 tunnel or just
11158                  * an interface configured with a 6to4 address.  We are only
11159                  * interested in the former.
11160                  */
11161                 if (wqp != NULL) {
11162                         while ((wqp->q_next != NULL) &&
11163                             (wqp->q_next->q_qinfo != NULL) &&
11164                             (wqp->q_next->q_qinfo->qi_minfo != NULL)) {
11165
11166                                 if (wqp->q_next->q_qinfo->qi_minfo->mi_idnum
11167                                     == TUN6TO4_MODID) {
11168                                         /* set for use in IP */
11169                                         ill->ill_is_6to4tun = 1;
11170                                         break;
11171                                 }
11172                                 wqp = wqp->q_next;
11173                         }
11174                 }
11175         }
11176
11177         ipif_set_default(ipif);
11178
11179         /*
11180          * When publishing an interface address change event, we only notify
11181          * the event listeners of the new address.  It is assumed that if they
11182          * actively care about the addresses assigned that they will have
11183          * already discovered the previous address assigned (if there was one.)
11184          *
11185          * Don't attach nic event message for SIOCLIFADDIF ioctl.
11186          */
11187         if (iocp != NULL && iocp->ioc_cmd != SIOCLIFADDIF) {
11188                 hook_nic_event_t *info;
11189                 if ((info = ipif->ipif_ill->ill_nic_event_info) != NULL) {
11190                         ip2dbg(("ip_sioctl_addr_tail: unexpected nic event %d "
11191                             "attached for %s\n", info->hne_event,
11192                             ill->ill_name));
11193                         if (info->hne_data != NULL)
11194                                 kmem_free(info->hne_data, info->hne_datalen);
11195                         kmem_free(info, sizeof (hook_nic_event_t));
11196                 }
11197
11198                 info = kmem_alloc(sizeof (hook_nic_event_t), KM_NOSLEEP);
11199                 if (info != NULL) {
11200                         ip_stack_t      *ipst = ill->ill_ipst;
11201
11202                         info->hne_nic =
11203                             ipif->ipif_ill->ill_phyint->phyint_hook_ifindex;
11204                         info->hne_lif = MAP_IPIF_ID(ipif->ipif_id);
11205                         info->hne_event = NE_ADDRESS_CHANGE;
11206                         info->hne_family = ipif->ipif_isv6 ?
11207                             ipst->ips_ipv6_net_data : ipst->ips_ipv4_net_data;
11208                         info->hne_data = kmem_alloc(sinlen, KM_NOSLEEP);
11209                         if (info->hne_data != NULL) {
11210                                 info->hne_datalen = sinlen;
11211                                 bcopy(sin, info->hne_data, sinlen);
11212                         } else {
11213                                 ip2dbg(("ip_sioctl_addr_tail: could not attach "
11214                                     "address information for ADDRESS_CHANGE nic"
11215                                     " event of %s (ENOMEM)\n",
11216                                     ipif->ipif_ill->ill_name));
11217                                 kmem_free(info, sizeof (hook_nic_event_t));
11218                         }
11219                 } else
11220                         ip2dbg(("ip_sioctl_addr_tail: could not attach "
11221                             "ADDRESS_CHANGE nic event information for %s "
11222                             "(ENOMEM)\n", ipif->ipif_ill->ill_name));
11223
11224                 ipif->ipif_ill->ill_nic_event_info = info;
11225         }
11226
11227         mutex_exit(&ill->ill_lock);
11228
11229         if (need_up) {
11230                 /*
11231                  * Now bring the interface back up.  If this
11232                  * is the only IPIF for the ILL, ipif_up
11233                  * will have to re-bind to the device, so
11234                  * we may get back EINPROGRESS, in which
11235                  * case, this IOCTL will get completed in
11236                  * ip_rput_dlpi when we see the DL_BIND_ACK.
11237                  */
11238                 err = ipif_up(ipif, q, mp);
11239         }
11240
11241         if (need_dl_down)
11242                 ill_dl_down(ill);
11243         if (need_arp_down)
11244                 ipif_arp_down(ipif);
11245
11246         return (err);
11247 }
11248
11249
11250 /*
11251  * Restart entry point to restart the address set operation after the
11252  * refcounts have dropped to zero.
11253  */
11254 /* ARGSUSED */
11255 int
11256 ip_sioctl_addr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11257     ip_ioctl_cmd_t *ipip, void *ifreq)
11258 {
11259         ip1dbg(("ip_sioctl_addr_restart(%s:%u %p)\n",
11260             ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11261         ASSERT(IAM_WRITER_IPIF(ipif));
11262         ipif_down_tail(ipif);
11263         return (ip_sioctl_addr_tail(ipif, sin, q, mp, B_TRUE));
11264 }
11265
11266 /* ARGSUSED */
11267 int
11268 ip_sioctl_get_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11269     ip_ioctl_cmd_t *ipip, void *if_req)
11270 {
11271         sin6_t *sin6 = (struct sockaddr_in6 *)sin;
11272         struct lifreq *lifr = (struct lifreq *)if_req;
11273
11274         ip1dbg(("ip_sioctl_get_addr(%s:%u %p)\n",
11275             ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11276         /*
11277          * The net mask and address can't change since we have a
11278          * reference to the ipif. So no lock is necessary.
11279          */
11280         if (ipif->ipif_isv6) {
11281                 *sin6 = sin6_null;
11282                 sin6->sin6_family = AF_INET6;
11283                 sin6->sin6_addr = ipif->ipif_v6lcl_addr;
11284                 ASSERT(ipip->ipi_cmd_type == LIF_CMD);
11285                 lifr->lifr_addrlen =
11286                     ip_mask_to_plen_v6(&ipif->ipif_v6net_mask);
11287         } else {
11288                 *sin = sin_null;
11289                 sin->sin_family = AF_INET;
11290                 sin->sin_addr.s_addr = ipif->ipif_lcl_addr;
11291                 if (ipip->ipi_cmd_type == LIF_CMD) {
11292                         lifr->lifr_addrlen =
11293                             ip_mask_to_plen(ipif->ipif_net_mask);
11294                 }
11295         }
11296         return (0);
11297 }
11298
11299 /*
11300  * Set the destination address for a pt-pt interface.
11301  */
11302 /* ARGSUSED */
11303 int
11304 ip_sioctl_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11305     ip_ioctl_cmd_t *ipip, void *if_req)
11306 {
11307         int err = 0;
11308         in6_addr_t v6addr;
11309         boolean_t need_up = B_FALSE;
11310
11311         ip1dbg(("ip_sioctl_dstaddr(%s:%u %p)\n",
11312             ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11313         ASSERT(IAM_WRITER_IPIF(ipif));
11314
11315         if (ipif->ipif_isv6) {
11316                 sin6_t *sin6;
11317
11318                 if (sin->sin_family != AF_INET6)
11319                         return (EAFNOSUPPORT);
11320
11321                 sin6 = (sin6_t *)sin;
11322                 v6addr = sin6->sin6_addr;
11323
11324                 if (!ip_remote_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask))
11325                         return (EADDRNOTAVAIL);
11326         } else {
11327                 ipaddr_t addr;
11328
11329                 if (sin->sin_family != AF_INET)
11330                         return (EAFNOSUPPORT);
11331
11332                 addr = sin->sin_addr.s_addr;
11333                 if (!ip_addr_ok_v4(addr, ipif->ipif_net_mask))
11334                         return (EADDRNOTAVAIL);
11335
11336                 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
11337         }
11338
11339         if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6pp_dst_addr, &v6addr))
11340                 return (0);     /* No change */
11341
11342         if (ipif->ipif_flags & IPIF_UP) {
11343                 /*
11344                  * If the interface is already marked up,
11345                  * we call ipif_down which will take care
11346                  * of ditching any IREs that have been set
11347                  * up based on the old pp dst address.
11348                  */
11349                 err = ipif_logical_down(ipif, q, mp);
11350                 if (err == EINPROGRESS)
11351                         return (err);
11352                 ipif_down_tail(ipif);
11353                 need_up = B_TRUE;
11354         }
11355         /*
11356          * could return EINPROGRESS. If so ioctl will complete in
11357          * ip_rput_dlpi_writer
11358          */
11359         err = ip_sioctl_dstaddr_tail(ipif, sin, q, mp, need_up);
11360         return (err);
11361 }
11362
11363 static int
11364 ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11365     boolean_t need_up)
11366 {
11367         in6_addr_t v6addr;
11368         ill_t   *ill = ipif->ipif_ill;
11369         int     err = 0;
11370         boolean_t need_dl_down;
11371         boolean_t need_arp_down;
11372
11373         ip1dbg(("ip_sioctl_dstaddr_tail(%s:%u %p)\n", ill->ill_name,
11374             ipif->ipif_id, (void *)ipif));
11375
11376         /* Must cancel any pending timer before taking the ill_lock */
11377         if (ipif->ipif_recovery_id != 0)
11378                 (void) untimeout(ipif->ipif_recovery_id);
11379         ipif->ipif_recovery_id = 0;
11380
11381         if (ipif->ipif_isv6) {
11382                 sin6_t *sin6;
11383
11384                 sin6 = (sin6_t *)sin;
11385                 v6addr = sin6->sin6_addr;
11386         } else {
11387                 ipaddr_t addr;
11388
11389                 addr = sin->sin_addr.s_addr;
11390                 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
11391         }
11392         mutex_enter(&ill->ill_lock);
11393         /* Set point to point destination address. */
11394         if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) {
11395                 /*
11396                  * Allow this as a means of creating logical
11397                  * pt-pt interfaces on top of e.g. an Ethernet.
11398                  * XXX Undocumented HACK for testing.
11399                  * pt-pt interfaces are created with NUD disabled.
11400                  */
11401                 ipif->ipif_flags |= IPIF_POINTOPOINT;
11402                 ipif->ipif_flags &= ~IPIF_BROADCAST;
11403                 if (ipif->ipif_isv6)
11404                         ill->ill_flags |= ILLF_NONUD;
11405         }
11406
11407         /*
11408          * If the interface was previously marked as a duplicate, then since
11409          * we've now got a "new" address, it should no longer be considered a
11410          * duplicate -- even if the "new" address is the same as the old one.
11411          * Note that if all ipifs are down, we may have a pending ARP down
11412          * event to handle.
11413          */
11414         need_dl_down = need_arp_down = B_FALSE;
11415         if (ipif->ipif_flags & IPIF_DUPLICATE) {
11416                 need_arp_down = !need_up;
11417                 ipif->ipif_flags &= ~IPIF_DUPLICATE;
11418                 if (--ill->ill_ipif_dup_count == 0 && !need_up &&
11419                     ill->ill_ipif_up_count == 0 && ill->ill_dl_up) {
11420                         need_dl_down = B_TRUE;
11421                 }
11422         }
11423
11424         /* Set the new address. */
11425         ipif->ipif_v6pp_dst_addr = v6addr;
11426         /* Make sure subnet tracks pp_dst */
11427         ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr;
11428         mutex_exit(&ill->ill_lock);
11429
11430         if (need_up) {
11431                 /*
11432                  * Now bring the interface back up.  If this
11433                  * is the only IPIF for the ILL, ipif_up
11434                  * will have to re-bind to the device, so
11435                  * we may get back EINPROGRESS, in which
11436                  * case, this IOCTL will get completed in
11437                  * ip_rput_dlpi when we see the DL_BIND_ACK.
11438                  */
11439                 err = ipif_up(ipif, q, mp);
11440         }
11441
11442         if (need_dl_down)
11443                 ill_dl_down(ill);
11444
11445         if (need_arp_down)
11446                 ipif_arp_down(ipif);
11447         return (err);
11448 }
11449
11450 /*
11451  * Restart entry point to restart the dstaddress set operation after the
11452  * refcounts have dropped to zero.
11453  */
11454 /* ARGSUSED */
11455 int
11456 ip_sioctl_dstaddr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11457     ip_ioctl_cmd_t *ipip, void *ifreq)
11458 {
11459         ip1dbg(("ip_sioctl_dstaddr_restart(%s:%u %p)\n",
11460             ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11461         ipif_down_tail(ipif);
11462         return (ip_sioctl_dstaddr_tail(ipif, sin, q, mp, B_TRUE));
11463 }
11464
11465 /* ARGSUSED */
11466 int
11467 ip_sioctl_get_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11468     ip_ioctl_cmd_t *ipip, void *if_req)
11469 {
11470         sin6_t  *sin6 = (struct sockaddr_in6 *)sin;
11471
11472         ip1dbg(("ip_sioctl_get_dstaddr(%s:%u %p)\n",
11473             ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11474         /*
11475          * Get point to point destination address. The addresses can't
11476          * change since we hold a reference to the ipif.
11477          */
11478         if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0)
11479                 return (EADDRNOTAVAIL);
11480
11481         if (ipif->ipif_isv6) {
11482                 ASSERT(ipip->ipi_cmd_type == LIF_CMD);
11483                 *sin6 = sin6_null;
11484                 sin6->sin6_family = AF_INET6;
11485                 sin6->sin6_addr = ipif->ipif_v6pp_dst_addr;
11486         } else {
11487                 *sin = sin_null;
11488                 sin->sin_family = AF_INET;
11489                 sin->sin_addr.s_addr = ipif->ipif_pp_dst_addr;
11490         }
11491         return (0);
11492 }
11493
11494 /*
11495  * part of ipmp, make this func return the active/inactive state and
11496  * caller can set once atomically instead of multiple mutex_enter/mutex_exit
11497  */
11498 /*
11499  * This function either sets or clears the IFF_INACTIVE flag.
11500  *
11501  * As long as there are some addresses or multicast memberships on the
11502  * IPv4 or IPv6 interface of the "phyi" that does not belong in here, we
11503  * will consider it to be ACTIVE (clear IFF_INACTIVE) i.e the interface
11504  * will be used for outbound packets.
11505  *
11506  * Caller needs to verify the validity of setting IFF_INACTIVE.
11507  */
11508 static void
11509 phyint_inactive(phyint_t *phyi)
11510 {
11511         ill_t *ill_v4;
11512         ill_t *ill_v6;
11513         ipif_t *ipif;
11514         ilm_t *ilm;
11515
11516         ill_v4 = phyi->phyint_illv4;
11517         ill_v6 = phyi->phyint_illv6;
11518
11519         /*
11520          * No need for a lock while traversing the list since iam
11521          * a writer
11522          */
11523         if (ill_v4 != NULL) {
11524                 ASSERT(IAM_WRITER_ILL(ill_v4));
11525                 for (ipif = ill_v4->ill_ipif; ipif != NULL;
11526                     ipif = ipif->ipif_next) {
11527                         if (ipif->ipif_orig_ifindex != phyi->phyint_ifindex) {
11528                                 mutex_enter(&phyi->phyint_lock);
11529                                 phyi->phyint_flags &= ~PHYI_INACTIVE;
11530                                 mutex_exit(&phyi->phyint_lock);
11531                                 return;
11532                         }
11533                 }
11534                 for (ilm = ill_v4->ill_ilm; ilm != NULL;
11535                     ilm = ilm->ilm_next) {
11536                         if (ilm->ilm_orig_ifindex != phyi->phyint_ifindex) {
11537                                 mutex_enter(&phyi->phyint_lock);
11538                                 phyi->phyint_flags &= ~PHYI_INACTIVE;
11539                                 mutex_exit(&phyi->phyint_lock);
11540                                 return;
11541                         }
11542                 }
11543         }
11544         if (ill_v6 != NULL) {
11545                 ill_v6 = phyi->phyint_illv6;
11546                 for (ipif = ill_v6->ill_ipif; ipif != NULL;
11547                     ipif = ipif->ipif_next) {
11548                         if (ipif->ipif_orig_ifindex != phyi->phyint_ifindex) {
11549                                 mutex_enter(&phyi->phyint_lock);
11550                                 phyi->phyint_flags &= ~PHYI_INACTIVE;
11551                                 mutex_exit(&phyi->phyint_lock);
11552                                 return;
11553                         }
11554                 }
11555                 for (ilm = ill_v6->ill_ilm; ilm != NULL;
11556                     ilm = ilm->ilm_next) {
11557                         if (ilm->ilm_orig_ifindex != phyi->phyint_ifindex) {
11558                                 mutex_enter(&phyi->phyint_lock);
11559                                 phyi->phyint_flags &= ~PHYI_INACTIVE;
11560                                 mutex_exit(&phyi->phyint_lock);
11561                                 return;
11562                         }
11563                 }
11564         }
11565         mutex_enter(&phyi->phyint_lock);
11566         phyi->phyint_flags |= PHYI_INACTIVE;
11567         mutex_exit(&phyi->phyint_lock);
11568 }
11569
11570 /*
11571  * This function is called only when the phyint flags change. Currently
11572  * called from ip_sioctl_flags. We re-do the broadcast nomination so
11573  * that we can select a good ill.
11574  */
11575 static void
11576 ip_redo_nomination(phyint_t *phyi)
11577 {
11578         ill_t *ill_v4;
11579
11580         ill_v4 = phyi->phyint_illv4;
11581
11582         if (ill_v4 != NULL && ill_v4->ill_group != NULL) {
11583                 ASSERT(IAM_WRITER_ILL(ill_v4));
11584                 if (ill_v4->ill_group->illgrp_ill_count > 1)
11585                         ill_nominate_bcast_rcv(ill_v4->ill_group);
11586         }
11587 }
11588
11589 /*
11590  * Heuristic to check if ill is INACTIVE.
11591  * Checks if ill has an ipif with an usable ip address.
11592  *
11593  * Return values:
11594  *      B_TRUE  - ill is INACTIVE; has no usable ipif
11595  *      B_FALSE - ill is not INACTIVE; ill has at least one usable ipif
11596  */
11597 static boolean_t
11598 ill_is_inactive(ill_t *ill)
11599 {
11600         ipif_t *ipif;
11601
11602         /* Check whether it is in an IPMP group */
11603         if (ill->ill_phyint->phyint_groupname == NULL)
11604                 return (B_FALSE);
11605
11606         if (ill->ill_ipif_up_count == 0)
11607                 return (B_TRUE);
11608
11609         for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
11610                 uint64_t flags = ipif->ipif_flags;
11611
11612                 /*
11613                  * This ipif is usable if it is IPIF_UP and not a
11614                  * dedicated test address.  A dedicated test address
11615                  * is marked IPIF_NOFAILOVER *and* IPIF_DEPRECATED
11616                  * (note in particular that V6 test addresses are
11617                  * link-local data addresses and thus are marked
11618                  * IPIF_NOFAILOVER but not IPIF_DEPRECATED).
11619                  */
11620                 if ((flags & IPIF_UP) &&
11621                     ((flags & (IPIF_DEPRECATED|IPIF_NOFAILOVER)) !=
11622                     (IPIF_DEPRECATED|IPIF_NOFAILOVER)))
11623                         return (B_FALSE);
11624         }
11625         return (B_TRUE);
11626 }
11627
11628 /*
11629  * Set interface flags.
       *
       * The requested flags are read from `if_req': a struct ifreq for IF_CMD
       * ioctls (only the low 16 bits are used) and a struct lifreq otherwise.
       * They are compared against the union of the current ipif, ill and
       * phyint flags to work out which bits go on and which go off.
       *
       * Returns 0 if nothing changes or if the flags were simply recorded on
       * an ipif that is down and stays down, EINVAL if a requested change is
       * rejected, EINPROGRESS if ipif_down() returns it, and otherwise the
       * value returned by ip_sioctl_flags_tail().
       *
11630  * Need to do special action for IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT,
11631  * IPIF_NOLOCAL, ILLF_NONUD, ILLF_NOARP, IPIF_PRIVATE, IPIF_ANYCAST,
11632  * IPIF_PREFERRED, PHYI_STANDBY, PHYI_FAILED and PHYI_OFFLINE.
11633  *
11634  * NOTE : We really don't enforce that ipif_id zero should be used
11635  *        for setting any flags other than IFF_LOGINT_FLAGS. This
11636  *        is because applications generally do a SIOCGLIFFLAGS,
11637  *        OR in the new flags (that affect the logical) and do a
11638  *        SIOCSLIFFLAGS. Thus, "flags" below could contain bits other
11639  *        than IFF_LOGINT_FLAGS. One could check whether "turn_on" - the
11640  *        flags that will be turned on is correct with respect to
11641  *        ipif_id 0. For backward compatibility reasons, it is not done.
11642  */
11643 /* ARGSUSED */
11644 int
11645 ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11646     ip_ioctl_cmd_t *ipip, void *if_req)
11647 {
11648         uint64_t turn_on;
11649         uint64_t turn_off;
11650         int     err;
11651         boolean_t need_up = B_FALSE;
11652         phyint_t *phyi;
11653         ill_t *ill;
11654         uint64_t intf_flags;
11655         boolean_t phyint_flags_modified = B_FALSE;
11656         uint64_t flags;
11657         struct ifreq *ifr;
11658         struct lifreq *lifr;
11659         boolean_t set_linklocal = B_FALSE;
11660         boolean_t zero_source = B_FALSE;
11661         ip_stack_t *ipst;
11662
11663         ip1dbg(("ip_sioctl_flags(%s:%u %p)\n",
11664             ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11665
11666         ASSERT(IAM_WRITER_IPIF(ipif));
11667
11668         ill = ipif->ipif_ill;
11669         phyi = ill->ill_phyint;
11670         ipst = ill->ill_ipst;
11671
11672         if (ipip->ipi_cmd_type == IF_CMD) {
11673                 ifr = (struct ifreq *)if_req;
11674                 flags =  (uint64_t)(ifr->ifr_flags & 0x0000ffff);
11675         } else {
11676                 lifr = (struct lifreq *)if_req;
11677                 flags = lifr->lifr_flags;
11678         }
11679
11680         intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags;
11681
11682         /*
11683          * Have the flags been set correctly up to now?
11684          */
11685         ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0);
11686         ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0);
11687         ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0);
11688         /*
11689          * Compare the new flags to the old, and partition
11690          * into those coming on and those going off.
11691          * For the 16-bit command (SIOCSIFFLAGS) keep the bits above the
               * low 16 unchanged.
11692          */
11693         if (ipip->ipi_cmd == SIOCSIFFLAGS)
11694                 flags |= intf_flags & ~0xFFFF;
11695
11696         /*
11697          * First check which bits will change and then which will
11698          * go on and off.  Until the split below, turn_on holds every
               * changing bit (IFF_CANTCHANGE bits are never changed).
11699          */
11700         turn_on = (flags ^ intf_flags) & ~IFF_CANTCHANGE;
11701         if (!turn_on)
11702                 return (0);     /* No change */
11703
11704         turn_off = intf_flags & turn_on;
11705         turn_on ^= turn_off;
11706         err = 0;
11707
11708         /*
11709          * Don't allow any bits belonging to the logical interface
11710          * to be set or cleared on the replacement ipif that was
11711          * created temporarily during a MOVE.
11712          */
11713         if (ipif->ipif_replace_zero &&
11714             ((turn_on|turn_off) & IFF_LOGINT_FLAGS) != 0) {
11715                 return (EINVAL);
11716         }
11717
11718         /*
11719          * Only allow the IFF_XRESOLV and IFF_TEMPORARY flags to be set on
11720          * IPv6 interfaces.
11721          */
11722         if ((turn_on & (IFF_XRESOLV|IFF_TEMPORARY)) && !(ipif->ipif_isv6))
11723                 return (EINVAL);
11724
11725         /*
11726          * Cannot turn off IFF_NOXMIT on VNI interfaces.
11727          */
11728         if ((turn_off & IFF_NOXMIT) && IS_VNI(ipif->ipif_ill))
11729                 return (EINVAL);
11730
11731         /*
11732          * Don't allow the IFF_ROUTER flag to be turned on on loopback
11733          * interfaces.  It makes no sense in that context.
11734          */
11735         if ((turn_on & IFF_ROUTER) && (phyi->phyint_flags & PHYI_LOOPBACK))
11736                 return (EINVAL);
11737
11738         if (flags & (IFF_NOLOCAL|IFF_ANYCAST))
11739                 zero_source = B_TRUE;
11740
11741         /*
11742          * For IPv6 ipif_id 0, don't allow the interface to be up without
11743          * a link local address if IFF_NOLOCAL or IFF_ANYCAST are not set.
11744          * If the link local address isn't set, and can be set, it will get
11745          * set later on in this function.
11746          */
11747         if (ipif->ipif_id == 0 && ipif->ipif_isv6 &&
11748             (flags & IFF_UP) && !zero_source &&
11749             IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) {
11750                 if (ipif_cant_setlinklocal(ipif))
11751                         return (EINVAL);
11752                 set_linklocal = B_TRUE;
11753         }
11754
11755         /*
11756          * ILL cannot be part of a usesrc group and an IPMP group at the
11757          * same time. No need to grab ill_g_usesrc_lock here, see
11758          * synchronization notes in ip.c
11759          */
11760         if (turn_on & PHYI_STANDBY &&
11761             ipif->ipif_ill->ill_usesrc_grp_next != NULL) {
11762                 return (EINVAL);
11763         }
11764
11765         /*
11766          * If we modify physical interface flags, we'll potentially need to
11767          * send up two routing socket messages for the changes (one for the
11768          * IPv4 ill, and another for the IPv6 ill).  Note that here.
11769          */
11770         if ((turn_on|turn_off) & IFF_PHYINT_FLAGS)
11771                 phyint_flags_modified = B_TRUE;
11772
11773         /*
11774          * If we are setting or clearing FAILED or STANDBY or OFFLINE,
11775          * we need to flush the IRE_CACHES belonging to this ill.
11776          * We handle this case here without doing the DOWN/UP dance
11777          * like it is done for other flags. If some other flags are
11778          * being turned on/off with FAILED/STANDBY/OFFLINE, the code
11779          * below will handle it by bringing it down and then
11780          * bringing it UP.
11781          */
11782         if ((turn_on|turn_off) & (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE)) {
11783                 ill_t *ill_v4, *ill_v6;
11784
11785                 ill_v4 = phyi->phyint_illv4;
11786                 ill_v6 = phyi->phyint_illv6;
11787
11788                 /*
11789                  * First set the INACTIVE flag if needed. Then delete the ires.
11790                  * ire_add will atomically prevent creating new IRE_CACHEs
11791                  * unless hidden flag is set.
11792                  * PHYI_FAILED and PHYI_INACTIVE are exclusive
                       *
                       * NOTE(review): phyint_flags is changed below without
                       * taking phyint_lock, while phyint_inactive() takes it;
                       * presumably being the writer makes that safe -- confirm.
11793                  */
11794                 if ((turn_on & PHYI_FAILED) &&
11795                     ((intf_flags & PHYI_STANDBY) ||
11796                     !ipst->ips_ipmp_enable_failback)) {
11797                         /* Reset PHYI_INACTIVE when PHYI_FAILED is being set */
11798                         phyi->phyint_flags &= ~PHYI_INACTIVE;
11799                 }
11800                 if ((turn_off & PHYI_FAILED) &&
11801                     ((intf_flags & PHYI_STANDBY) ||
11802                     (!ipst->ips_ipmp_enable_failback &&
11803                     ill_is_inactive(ill)))) {
11804                         phyint_inactive(phyi);
11805                 }
11806
11807                 if (turn_on & PHYI_STANDBY) {
11808                         /*
11809                          * We implicitly set INACTIVE only when STANDBY is set.
11810                          * INACTIVE is also set on non-STANDBY phyint when user
11811                          * disables FAILBACK using configuration file.
11812                          * Do not allow STANDBY to be set on such INACTIVE
11813                          * phyint
                               *
                               * NOTE(review): this EINVAL return happens after
                               * the PHYI_FAILED handling above may already
                               * have changed phyint_flags.
11814                          */
11815                         if (phyi->phyint_flags & PHYI_INACTIVE)
11816                                 return (EINVAL);
11817                         if (!(phyi->phyint_flags & PHYI_FAILED))
11818                                 phyint_inactive(phyi);
11819                 }
11820                 if (turn_off & PHYI_STANDBY) {
11821                         if (ipst->ips_ipmp_enable_failback) {
11822                                 /*
11823                                  * Reset PHYI_INACTIVE.
11824                                  */
11825                                 phyi->phyint_flags &= ~PHYI_INACTIVE;
11826                         } else if (ill_is_inactive(ill) &&
11827                             !(phyi->phyint_flags & PHYI_FAILED)) {
11828                                 /*
11829                                  * Need to set INACTIVE, when user sets
11830                                  * STANDBY on a non-STANDBY phyint and
11831                                  * later resets STANDBY
11832                                  */
11833                                 phyint_inactive(phyi);
11834                         }
11835                 }
11836                 /*
11837                  * We should always send up a message so that the
11838                  * daemons come to know of it. Note that the zeroth
11839                  * interface can be down and the check below for IPIF_UP
11840                  * will not make sense as we are actually setting
11841                  * a phyint flag here. We assume that the ipif used
11842                  * is always the zeroth ipif. (ip_rts_ifmsg does not
11843                  * send up any message for non-zero ipifs).
11844                  */
11845                 phyint_flags_modified = B_TRUE;
11846
11847                 if (ill_v4 != NULL) {
11848                         ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE,
11849                             IRE_CACHE, ill_stq_cache_delete,
11850                             (char *)ill_v4, ill_v4);
11851                         illgrp_reset_schednext(ill_v4);
11852                 }
11853                 if (ill_v6 != NULL) {
11854                         ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE,
11855                             IRE_CACHE, ill_stq_cache_delete,
11856                             (char *)ill_v6, ill_v6);
11857                         illgrp_reset_schednext(ill_v6);
11858                 }
11859         }
11860
11861         /*
11862          * If ILLF_ROUTER changes, we need to change the ip forwarding
11863          * status of the interface and, if the interface is part of an IPMP
11864          * group, all other interfaces that are part of the same IPMP
11865          * group.
11866          */
11867         if ((turn_on | turn_off) & ILLF_ROUTER)
11868                 (void) ill_forward_set(ill, ((turn_on & ILLF_ROUTER) != 0));
11869
11870         /*
11871          * If the interface is not UP and we are not going to
11872          * bring it UP, record the flags and return. When the
11873          * interface comes UP later, the right actions will be
11874          * taken.
11875          */
11876         if (!(ipif->ipif_flags & IPIF_UP) &&
11877             !(turn_on & IPIF_UP)) {
11878                 /* Record new flags in their respective places. */
11879                 mutex_enter(&ill->ill_lock);
11880                 mutex_enter(&ill->ill_phyint->phyint_lock);
11881                 ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS);
11882                 ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS);
11883                 ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS);
11884                 ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS);
11885                 phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS);
11886                 phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS);
11887                 mutex_exit(&ill->ill_lock);
11888                 mutex_exit(&ill->ill_phyint->phyint_lock);
11889
11890                 /*
11891                  * We do the broadcast and nomination here rather
11892                  * than waiting for a FAILOVER/FAILBACK to happen. In
11893                  * the case of FAILBACK from INACTIVE standby to the
11894                  * interface that has been repaired, PHYI_FAILED has not
11895                  * been cleared yet. If there are only two interfaces in
11896                  * that group, all we have is a FAILED and INACTIVE
11897                  * interface. If we do the nomination soon after a failback,
11898                  * the broadcast nomination code would select the
11899                  * INACTIVE interface for receiving broadcasts as FAILED is
11900                  * not yet cleared. As we don't want STANDBY/INACTIVE to
11901                  * receive broadcast packets, we need to redo nomination
11902                  * when the FAILED is cleared here. Thus, in general we
11903                  * always do the nomination here for FAILED, STANDBY
11904                  * and OFFLINE.
11905                  */
11906                 if (((turn_on | turn_off) &
11907                     (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE))) {
11908                         ip_redo_nomination(phyi);
11909                 }
11910                 if (phyint_flags_modified) {
11911                         if (phyi->phyint_illv4 != NULL) {
11912                                 ip_rts_ifmsg(phyi->phyint_illv4->
11913                                     ill_ipif);
11914                         }
11915                         if (phyi->phyint_illv6 != NULL) {
11916                                 ip_rts_ifmsg(phyi->phyint_illv6->
11917                                     ill_ipif);
11918                         }
11919                 }
11920                 return (0);
11921         } else if (set_linklocal || zero_source) {
                      /*
                       * Remember these decisions in ipif_state_flags;
                       * ip_sioctl_flags_tail() reads and clears them.
                       */
11922                 mutex_enter(&ill->ill_lock);
11923                 if (set_linklocal)
11924                         ipif->ipif_state_flags |= IPIF_SET_LINKLOCAL;
11925                 if (zero_source)
11926                         ipif->ipif_state_flags |= IPIF_ZERO_SOURCE;
11927                 mutex_exit(&ill->ill_lock);
11928         }
11929
11930         /*
11931          * Disallow IPv6 interfaces coming up that have the unspecified address,
11932          * or point-to-point interfaces with an unspecified destination. We do
11933          * allow the address to be unspecified for IPIF_NOLOCAL interfaces that
11934          * have a subnet assigned, which is how in.ndpd currently manages its
11935          * onlink prefix list when no addresses are configured with those
11936          * prefixes.
11937          */
11938         if (ipif->ipif_isv6 &&
11939             ((IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) &&
11940             (!(ipif->ipif_flags & IPIF_NOLOCAL) && !(turn_on & IPIF_NOLOCAL) ||
11941             IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) ||
11942             ((ipif->ipif_flags & IPIF_POINTOPOINT) &&
11943             IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6pp_dst_addr)))) {
11944                 return (EINVAL);
11945         }
11946
11947         /*
11948          * Prevent IPv4 point-to-point interfaces with a 0.0.0.0 destination
11949          * from being brought up.
11950          */
11951         if (!ipif->ipif_isv6 &&
11952             ((ipif->ipif_flags & IPIF_POINTOPOINT) &&
11953             ipif->ipif_pp_dst_addr == INADDR_ANY)) {
11954                 return (EINVAL);
11955         }
11956
11957         /*
11958          * The only flag changes that we currently take specific action on
11959          * are IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT, IPIF_NOLOCAL,
11960          * ILLF_NOARP, ILLF_NONUD, IPIF_PRIVATE, IPIF_ANYCAST, and
11961          * IPIF_PREFERRED.  This is done by bringing the ipif down, changing
11962          * the flags and bringing it back up again.
11963          */
11964         if ((turn_on|turn_off) &
11965             (IPIF_UP|IPIF_DEPRECATED|IPIF_NOXMIT|IPIF_NOLOCAL|ILLF_NOARP|
11966             ILLF_NONUD|IPIF_PRIVATE|IPIF_ANYCAST|IPIF_PREFERRED)) {
11967                 /*
11968                  * Taking this ipif down, make sure we have
11969                  * valid net and subnet bcast ire's for other
11970                  * logical interfaces, if we need them.
11971                  */
11972                 if (!ipif->ipif_isv6)
11973                         ipif_check_bcast_ires(ipif);
11974
                      /*
                       * The ipif must come (back) up afterwards when it is up
                       * now or being turned on, and IPIF_UP is not being
                       * turned off.
                       */
11975                 if (((ipif->ipif_flags | turn_on) & IPIF_UP) &&
11976                     !(turn_off & IPIF_UP)) {
11977                         need_up = B_TRUE;
11978                         if (ipif->ipif_flags & IPIF_UP)
11979                                 ill->ill_logical_down = 1;
11980                         turn_on &= ~IPIF_UP;
11981                 }
11982                 err = ipif_down(ipif, q, mp);
11983                 ip1dbg(("ipif_down returns %d err ", err));
11984                 if (err == EINPROGRESS)
11985                         return (err);
11986                 ipif_down_tail(ipif);
11987         }
11988         return (ip_sioctl_flags_tail(ipif, flags, q, mp, need_up));
11989 }
11990
11991 static int
11992 ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp,
11993     boolean_t need_up)
11994 {
11995         ill_t   *ill;
11996         phyint_t *phyi;
11997         uint64_t turn_on;
11998         uint64_t turn_off;
11999         uint64_t intf_flags;
12000         boolean_t phyint_flags_modified = B_FALSE;
12001         int     err = 0;
12002         boolean_t set_linklocal = B_FALSE;
12003         boolean_t zero_source = B_FALSE;
12004
12005         ip1dbg(("ip_sioctl_flags_tail(%s:%u)\n",
12006             ipif->ipif_ill->ill_name, ipif->ipif_id));
12007
12008         ASSERT(IAM_WRITER_IPIF(ipif));
12009
12010         ill = ipif->ipif_ill;
12011         phyi = ill->ill_phyint;
12012
12013         intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags;
12014         turn_on = (flags ^ intf_flags) & ~(IFF_CANTCHANGE | IFF_UP);
12015
12016         turn_off = intf_flags & turn_on;
12017         turn_on ^= turn_off;
12018
12019         if ((turn_on|turn_off) & (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE))
12020                 phyint_flags_modified = B_TRUE;
12021
12022         /*
12023          * Now we change the flags. Track current value of
12024          * other flags in their respective places.
12025          */
12026         mutex_enter(&ill->ill_lock);
12027         mutex_enter(&phyi->phyint_lock);
12028         ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS);
12029         ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS);
12030         ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS);
12031         ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS);
12032         phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS);
12033         phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS);
12034         if (ipif->ipif_state_flags & IPIF_SET_LINKLOCAL) {
12035                 set_linklocal = B_TRUE;
12036                 ipif->ipif_state_flags &= ~IPIF_SET_LINKLOCAL;
12037         }
12038         if (ipif->ipif_state_flags & IPIF_ZERO_SOURCE) {
12039                 zero_source = B_TRUE;
12040                 ipif->ipif_state_flags &= ~IPIF_ZERO_SOURCE;
12041         }
12042         mutex_exit(&ill->ill_lock);
12043         mutex_exit(&phyi->phyint_lock);
12044
12045         if (((turn_on | turn_off) & (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE)))
12046                 ip_redo_nomination(phyi);
12047
12048         if (set_linklocal)
12049                 (void) ipif_setlinklocal(ipif);
12050
12051         if (zero_source)
12052                 ipif->ipif_v6src_addr = ipv6_all_zeros;
12053         else
12054                 ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr;
12055
12056         if (need_up) {
12057                 /*
12058                  * XXX ipif_up really does not know whether a phyint flags
12059                  * was modified or not. So, it sends up information on
12060                  * only one routing sockets message. As we don't bring up
12061                  * the interface and also set STANDBY/FAILED simultaneously
12062                  * it should be okay.
12063                  */
12064                 err = ipif_up(ipif, q, mp);
12065         } else {
12066                 /*
12067                  * Make sure routing socket sees all changes to the flags.
12068                  * ipif_up_done* handles this when we use ipif_up.
12069                  */
12070                 if (phyint_flags_modified) {
12071                         if (phyi->phyint_illv4 != NULL) {
12072                                 ip_rts_ifmsg(phyi->phyint_illv4->
12073                                     ill_ipif);
12074                         }
12075                         if (phyi->phyint_illv6 != NULL) {
12076                                 ip_rts_ifmsg(phyi->phyint_illv6->
12077                                     ill_ipif);
12078                         }
12079                 } else {
12080                         ip_rts_ifmsg(ipif);
12081                 }
12082                 /*
12083                  * Update the flags in SCTP's IPIF list, ipif_up() will do
12084                  * this in need_up case.
12085                  */
12086                 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE);
12087         }
12088         return (err);
12089 }
12090
12091 /*
12092  * Restart entry point to restart the flags restart operation after the
12093  * refcounts have dropped to zero.
12094  */
12095 /* ARGSUSED */
12096 int
12097 ip_sioctl_flags_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12098     ip_ioctl_cmd_t *ipip, void *if_req)
12099 {
12100         int     err;
12101         struct ifreq *ifr = (struct ifreq *)if_req;
12102         struct lifreq *lifr = (struct lifreq *)if_req;
12103
12104         ip1dbg(("ip_sioctl_flags_restart(%s:%u %p)\n",
12105             ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
12106
12107         ipif_down_tail(ipif);
12108         if (ipip->ipi_cmd_type == IF_CMD) {
12109                 /*
12110                  * Since ip_sioctl_flags expects an int and ifr_flags
12111                  * is a short we need to cast ifr_flags into an int
12112                  * to avoid having sign extension cause bits to get
12113                  * set that should not be.
12114                  */
12115                 err = ip_sioctl_flags_tail(ipif,
12116                     (uint64_t)(ifr->ifr_flags & 0x0000ffff),
12117                     q, mp, B_TRUE);
12118         } else {
12119                 err = ip_sioctl_flags_tail(ipif, lifr->lifr_flags,
12120                     q, mp, B_TRUE);
12121         }
12122         return (err);
12123 }
12124
12125 /*
12126  * Can operate on either a module or a driver queue.
12127  */
12128 /* ARGSUSED */
12129 int
12130 ip_sioctl_get_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12131     ip_ioctl_cmd_t *ipip, void *if_req)
12132 {
12133         /*
12134          * Has the flags been set correctly till now ?
12135          */
12136         ill_t *ill = ipif->ipif_ill;
12137         phyint_t *phyi = ill->ill_phyint;
12138
12139         ip1dbg(("ip_sioctl_get_flags(%s:%u %p)\n",
12140             ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
12141         ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0);
12142         ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0);
12143         ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0);
12144
12145         /*
12146          * Need a lock since some flags can be set even when there are
12147          * references to the ipif.
12148          */
12149         mutex_enter(&ill->ill_lock);
12150         if (ipip->ipi_cmd_type == IF_CMD) {
12151                 struct ifreq *ifr = (struct ifreq *)if_req;
12152
12153                 /* Get interface flags (low 16 only). */
12154                 ifr->ifr_flags = ((ipif->ipif_flags |
12155                     ill->ill_flags | phyi->phyint_flags) & 0xffff);
12156         } else {
12157                 struct lifreq *lifr = (struct lifreq *)if_req;
12158
12159                 /* Get interface flags. */
12160                 lifr->lifr_flags = ipif->ipif_flags |
12161                     ill->ill_flags | phyi->phyint_flags;
12162         }
12163         mutex_exit(&ill->ill_lock);
12164         return (0);
12165 }
12166
12167 /* ARGSUSED */
12168 int
12169 ip_sioctl_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12170     ip_ioctl_cmd_t *ipip, void *if_req)
12171 {
12172         int mtu;
12173         int ip_min_mtu;
12174         struct ifreq    *ifr;
12175         struct lifreq *lifr;
12176         ire_t   *ire;
12177         ip_stack_t *ipst;
12178
12179         ip1dbg(("ip_sioctl_mtu(%s:%u %p)\n", ipif->ipif_ill->ill_name,
12180             ipif->ipif_id, (void *)ipif));
12181         if (ipip->ipi_cmd_type == IF_CMD) {
12182                 ifr = (struct ifreq *)if_req;
12183                 mtu = ifr->ifr_metric;
12184         } else {
12185                 lifr = (struct lifreq *)if_req;
12186                 mtu = lifr->lifr_mtu;
12187         }
12188
12189         if (ipif->ipif_isv6)
12190                 ip_min_mtu = IPV6_MIN_MTU;
12191         else
12192                 ip_min_mtu = IP_MIN_MTU;
12193
12194         if (mtu > ipif->ipif_ill->ill_max_frag || mtu < ip_min_mtu)
12195                 return (EINVAL);
12196
12197         /*
12198          * Change the MTU size in all relevant ire's.
12199          * Mtu change Vs. new ire creation - protocol below.
12200          * First change ipif_mtu and the ire_max_frag of the
12201          * interface ire. Then do an ire walk and change the
12202          * ire_max_frag of all affected ires. During ire_add
12203          * under the bucket lock, set the ire_max_frag of the
12204          * new ire being created from the ipif/ire from which
12205          * it is being derived. If an mtu change happens after
12206          * the ire is added, the new ire will be cleaned up.
12207          * Conversely if the mtu change happens before the ire
12208          * is added, ire_add will see the new value of the mtu.
12209          */
12210         ipif->ipif_mtu = mtu;
12211         ipif->ipif_flags |= IPIF_FIXEDMTU;
12212
12213         if (ipif->ipif_isv6)
12214                 ire = ipif_to_ire_v6(ipif);
12215         else
12216                 ire = ipif_to_ire(ipif);
12217         if (ire != NULL) {
12218                 ire->ire_max_frag = ipif->ipif_mtu;
12219                 ire_refrele(ire);
12220         }
12221         ipst = ipif->ipif_ill->ill_ipst;
12222         if (ipif->ipif_flags & IPIF_UP) {
12223                 if (ipif->ipif_isv6)
12224                         ire_walk_v6(ipif_mtu_change, (char *)ipif, ALL_ZONES,
12225                             ipst);
12226                 else
12227                         ire_walk_v4(ipif_mtu_change, (char *)ipif, ALL_ZONES,
12228                             ipst);
12229         }
12230         /* Update the MTU in SCTP's list */
12231         sctp_update_ipif(ipif, SCTP_IPIF_UPDATE);
12232         return (0);
12233 }
12234
12235 /* Get interface MTU. */
12236 /* ARGSUSED */
12237 int
12238 ip_sioctl_get_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12239         ip_ioctl_cmd_t *ipip, void *if_req)
12240 {
12241         struct ifreq    *ifr;
12242         struct lifreq   *lifr;
12243
12244         ip1dbg(("ip_sioctl_get_mtu(%s:%u %p)\n",
12245             ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
12246         if (ipip->ipi_cmd_type == IF_CMD) {
12247                 ifr = (struct ifreq *)if_req;
12248                 ifr->ifr_metric = ipif->ipif_mtu;
12249         } else {
12250                 lifr = (struct lifreq *)if_req;
12251                 lifr->lifr_mtu = ipif->ipif_mtu;
12252         }
12253         return (0);
12254 }
12255
12256 /* Set interface broadcast address. */
12257 /* ARGSUSED2 */
12258 int
12259 ip_sioctl_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12260         ip_ioctl_cmd_t *ipip, void *if_req)
12261 {
12262         ipaddr_t addr;
12263         ire_t   *ire;
12264         ip_stack_t      *ipst = ipif->ipif_ill->ill_ipst;
12265
12266         ip1dbg(("ip_sioctl_brdaddr(%s:%u)\n", ipif->ipif_ill->ill_name,
12267             ipif->ipif_id));
12268
12269         ASSERT(IAM_WRITER_IPIF(ipif));
12270         if (!(ipif->ipif_flags & IPIF_BROADCAST))
12271                 return (EADDRNOTAVAIL);
12272
12273         ASSERT(!(ipif->ipif_isv6));     /* No IPv6 broadcast */
12274
12275         if (sin->sin_family != AF_INET)
12276                 return (EAFNOSUPPORT);
12277
12278         addr = sin->sin_addr.s_addr;
12279         if (ipif->ipif_flags & IPIF_UP) {
12280                 /*
12281                  * If we are already up, make sure the new
12282                  * broadcast address makes sense.  If it does,
12283                  * there should be an IRE for it already.
12284                  * Don't match on ipif, only on the ill
12285                  * since we are sharing these now. Don't use
12286                  * MATCH_IRE_ILL_GROUP as we are looking for
12287                  * the broadcast ire on this ill and each ill
12288                  * in the group has its own broadcast ire.
12289                  */
12290                 ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST,
12291                     ipif, ALL_ZONES, NULL,
12292                     (MATCH_IRE_ILL | MATCH_IRE_TYPE), ipst);
12293                 if (ire == NULL) {
12294                         return (EINVAL);
12295                 } else {
12296                         ire_refrele(ire);
12297                 }
12298         }
12299         /*
12300          * Changing the broadcast addr for this ipif.
12301          * Make sure we have valid net and subnet bcast
12302          * ire's for other logical interfaces, if needed.
12303          */
12304         if (addr != ipif->ipif_brd_addr)
12305                 ipif_check_bcast_ires(ipif);
12306         IN6_IPADDR_TO_V4MAPPED(addr, &ipif->ipif_v6brd_addr);
12307         return (0);
12308 }
12309
12310 /* Get interface broadcast address. */
12311 /* ARGSUSED */
12312 int
12313 ip_sioctl_get_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12314     ip_ioctl_cmd_t *ipip, void *if_req)
12315 {
12316         ip1dbg(("ip_sioctl_get_brdaddr(%s:%u %p)\n",
12317             ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
12318         if (!(ipif->ipif_flags & IPIF_BROADCAST))
12319                 return (EADDRNOTAVAIL);
12320
12321         /* IPIF_BROADCAST not possible with IPv6 */
12322         ASSERT(!ipif->ipif_isv6);
12323         *sin = sin_null;
12324         sin->sin_family = AF_INET;
12325         sin->sin_addr.s_addr = ipif->ipif_brd_addr;
12326         return (0);
12327 }
12328
12329 /*
12330  * This routine is called to handle the SIOCS*IFNETMASK IOCTL.
12331  */
12332 /* ARGSUSED */
12333 int
12334 ip_sioctl_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12335     ip_ioctl_cmd_t *ipip, void *if_req)
12336 {
12337         int err = 0;
12338         in6_addr_t v6mask;
12339
12340         ip1dbg(("ip_sioctl_netmask(%s:%u %p)\n",
12341             ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
12342
12343         ASSERT(IAM_WRITER_IPIF(ipif));
12344
12345         if (ipif->ipif_isv6) {
12346                 sin6_t *sin6;
12347
12348                 if (sin->sin_family != AF_INET6)
12349                         return (EAFNOSUPPORT);
12350
12351                 sin6 = (sin6_t *)sin;
12352                 v6mask = sin6->sin6_addr;
12353         } else {
12354                 ipaddr_t mask;
12355
12356                 if (sin->sin_family != AF_INET)
12357                         return (EAFNOSUPPORT);
12358
12359                 mask = sin->sin_addr.s_addr;
12360                 V4MASK_TO_V6(mask, v6mask);
12361         }
12362
12363         /*
12364          * No big deal if the interface isn't already up, or the mask
12365          * isn't really changing, or this is pt-pt.
12366          */
12367         if (!(ipif->ipif_flags & IPIF_UP) ||
12368             IN6_ARE_ADDR_EQUAL(&v6mask, &ipif->ipif_v6net_mask) ||
12369             (ipif->ipif_flags & IPIF_POINTOPOINT)) {
12370                 ipif->ipif_v6net_mask = v6mask;
12371                 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) {
12372                         V6_MASK_COPY(ipif->ipif_v6lcl_addr,
12373                             ipif->ipif_v6net_mask,
12374                             ipif->ipif_v6subnet);
12375                 }
12376                 return (0);
12377         }
12378         /*
12379          * Make sure we have valid net and subnet broadcast ire's
12380          * for the old netmask, if needed by other logical interfaces.
12381          */
12382         if (!ipif->ipif_isv6)
12383                 ipif_check_bcast_ires(ipif);
12384
12385         err = ipif_logical_down(ipif, q, mp);
12386         if (err == EINPROGRESS)
12387                 return (err);
12388         ipif_down_tail(ipif);
12389         err = ip_sioctl_netmask_tail(ipif, sin, q, mp);
12390         return (err);
12391 }
12392
12393 static int
12394 ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp)
12395 {
12396         in6_addr_t v6mask;
12397         int err = 0;
12398
12399         ip1dbg(("ip_sioctl_netmask_tail(%s:%u %p)\n",
12400             ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
12401
12402         if (ipif->ipif_isv6) {
12403                 sin6_t *sin6;
12404
12405                 sin6 = (sin6_t *)sin;
12406                 v6mask = sin6->sin6_addr;
12407         } else {
12408                 ipaddr_t mask;
12409
12410                 mask = sin->sin_addr.s_addr;
12411                 V4MASK_TO_V6(mask, v6mask);
12412         }
12413
12414         ipif->ipif_v6net_mask = v6mask;
12415         if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) {
12416                 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
12417                     ipif->ipif_v6subnet);
12418         }
12419         err = ipif_up(ipif, q, mp);
12420
12421         if (err == 0 || err == EINPROGRESS) {
12422                 /*
12423                  * The interface must be DL_BOUND if this packet has to
12424                  * go out on the wire. Since we only go through a logical
12425                  * down and are bound with the driver during an internal
12426                  * down/up that is satisfied.
12427                  */
12428                 if (!ipif->ipif_isv6 && ipif->ipif_ill->ill_wq != NULL) {
12429                         /* Potentially broadcast an address mask reply. */
12430                         ipif_mask_reply(ipif);
12431                 }
12432         }
12433         return (err);
12434 }
12435
12436 /* ARGSUSED */
12437 int
12438 ip_sioctl_netmask_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12439     ip_ioctl_cmd_t *ipip, void *if_req)
12440 {
12441         ip1dbg(("ip_sioctl_netmask_restart(%s:%u %p)\n",
12442             ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
12443         ipif_down_tail(ipif);
12444         return (ip_sioctl_netmask_tail(ipif, sin, q, mp));
12445 }
12446
12447 /* Get interface net mask. */
12448 /* ARGSUSED */
12449 int
12450 ip_sioctl_get_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12451     ip_ioctl_cmd_t *ipip, void *if_req)
12452 {
12453         struct lifreq *lifr = (struct lifreq *)if_req;
12454         struct sockaddr_in6 *sin6 = (sin6_t *)sin;
12455
12456         ip1dbg(("ip_sioctl_get_netmask(%s:%u %p)\n",
12457             ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
12458
12459         /*
12460          * net mask can't change since we have a reference to the ipif.
12461          */
12462         if (ipif->ipif_isv6) {
12463                 ASSERT(ipip->ipi_cmd_type == LIF_CMD);
12464                 *sin6 = sin6_null;
12465                 sin6->sin6_family = AF_INET6;
12466                 sin6->sin6_addr = ipif->ipif_v6net_mask;
12467                 lifr->lifr_addrlen =
12468                     ip_mask_to_plen_v6(&ipif->ipif_v6net_mask);
12469         } else {
12470                 *sin = sin_null;
12471                 sin->sin_family = AF_INET;
12472                 sin->sin_addr.s_addr = ipif->ipif_net_mask;
12473                 if (ipip->ipi_cmd_type == LIF_CMD) {
12474                         lifr->lifr_addrlen =
12475                             ip_mask_to_plen(ipif->ipif_net_mask);
12476                 }
12477         }
12478         return (0);
12479 }
12480
12481 /* ARGSUSED */
12482 int
12483 ip_sioctl_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12484     ip_ioctl_cmd_t *ipip, void *if_req)
12485 {
12486
12487         ip1dbg(("ip_sioctl_metric(%s:%u %p)\n",
12488             ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
12489         /*
12490          * Set interface metric.  We don't use this for
12491          * anything but we keep track of it in case it is
12492          * important to routing applications or such.
12493          */
12494         if (ipip->ipi_cmd_type == IF_CMD) {
12495                 struct ifreq    *ifr;
12496
12497                 ifr = (struct ifreq *)if_req;
12498                 ipif->ipif_metric = ifr->ifr_metric;
12499         } else {
12500                 struct lifreq   *lifr;
12501
12502                 lifr = (struct lifreq *)if_req;
12503                 ipif->ipif_metric = lifr->lifr_metric;
12504         }
12505         return (0);
12506 }
12507
12508
12509 /* ARGSUSED */
12510 int
12511 ip_sioctl_get_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12512     ip_ioctl_cmd_t *ipip, void *if_req)
12513 {
12514
12515         /* Get interface metric. */
12516         ip1dbg(("ip_sioctl_get_metric(%s:%u %p)\n",
12517             ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
12518         if (ipip->ipi_cmd_type == IF_CMD) {
12519                 struct ifreq    *ifr;
12520
12521                 ifr = (struct ifreq *)if_req;
12522                 ifr->ifr_metric = ipif->ipif_metric;
12523         } else {
12524                 struct lifreq   *lifr;
12525
12526                 lifr = (struct lifreq *)if_req;
12527                 lifr->lifr_metric = ipif->ipif_metric;
12528         }
12529
12530         return (0);
12531 }
12532
12533 /* ARGSUSED */
12534 int
12535 ip_sioctl_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12536     ip_ioctl_cmd_t *ipip, void *if_req)
12537 {
12538
12539         ip1dbg(("ip_sioctl_muxid(%s:%u %p)\n",
12540             ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
12541         /*
12542          * Set the muxid returned from I_PLINK.
12543          */
12544         if (ipip->ipi_cmd_type == IF_CMD) {
12545                 struct ifreq *ifr = (struct ifreq *)if_req;
12546
12547                 ipif->ipif_ill->ill_ip_muxid = ifr->ifr_ip_muxid;
12548                 ipif->ipif_ill->ill_arp_muxid = ifr->ifr_arp_muxid;
12549         } else {
12550                 struct lifreq *lifr = (struct lifreq *)if_req;
12551
12552                 ipif->ipif_ill->ill_ip_muxid = lifr->lifr_ip_muxid;
12553                 ipif->ipif_ill->ill_arp_muxid = lifr->lifr_arp_muxid;
12554         }
12555         return (0);
12556 }
12557
12558 /* ARGSUSED */
12559 int
12560 ip_sioctl_get_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12561     ip_ioctl_cmd_t *ipip, void *if_req)
12562 {
12563
12564         ip1dbg(("ip_sioctl_get_muxid(%s:%u %p)\n",
12565             ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
12566         /*
12567          * Get the muxid saved in ill for I_PUNLINK.
12568          */
12569         if (ipip->ipi_cmd_type == IF_CMD) {
12570                 struct ifreq *ifr = (struct ifreq *)if_req;
12571
12572                 ifr->ifr_ip_muxid = ipif->ipif_ill->ill_ip_muxid;
12573                 ifr->ifr_arp_muxid = ipif->ipif_ill->ill_arp_muxid;
12574         } else {
12575                 struct lifreq *lifr = (struct lifreq *)if_req;
12576
12577                 lifr->lifr_ip_muxid = ipif->ipif_ill->ill_ip_muxid;
12578                 lifr->lifr_arp_muxid = ipif->ipif_ill->ill_arp_muxid;
12579         }
12580         return (0);
12581 }
12582
12583 /*
12584  * Set the subnet prefix. Does not modify the broadcast address.
12585  */
12586 /* ARGSUSED */
12587 int
12588 ip_sioctl_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12589     ip_ioctl_cmd_t *ipip, void *if_req)
12590 {
12591         int err = 0;
12592         in6_addr_t v6addr;
12593         in6_addr_t v6mask;
12594         boolean_t need_up = B_FALSE;
12595         int addrlen;
12596
12597         ip1dbg(("ip_sioctl_subnet(%s:%u %p)\n",
12598             ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
12599
12600         ASSERT(IAM_WRITER_IPIF(ipif));
12601         addrlen = ((struct lifreq *)if_req)->lifr_addrlen;
12602
12603         if (ipif->ipif_isv6) {
12604                 sin6_t *sin6;
12605
12606                 if (sin->sin_family != AF_INET6)
12607                         return (EAFNOSUPPORT);
12608
12609                 sin6 = (sin6_t *)sin;
12610                 v6addr = sin6->sin6_addr;
12611                 if (!ip_remote_addr_ok_v6(&v6addr, &ipv6_all_ones))
12612                         return (EADDRNOTAVAIL);
12613         } else {
12614                 ipaddr_t addr;
12615
12616                 if (sin->sin_family != AF_INET)
12617                         return (EAFNOSUPPORT);
12618
12619                 addr = sin->sin_addr.s_addr;
12620                 if (!ip_addr_ok_v4(addr, 0xFFFFFFFF))
12621                         return (EADDRNOTAVAIL);
12622                 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
12623                 /* Add 96 bits */
12624                 addrlen += IPV6_ABITS - IP_ABITS;
12625         }
12626
12627         if (ip_plen_to_mask_v6(addrlen, &v6mask) == NULL)
12628                 return (EINVAL);
12629
12630         /* Check if bits in the address is set past the mask */
12631         if (!V6_MASK_EQ(v6addr, v6mask, v6addr))
12632                 return (EINVAL);
12633
12634         if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6subnet, &v6addr) &&
12635             IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6net_mask, &v6mask))
12636                 return (0);     /* No change */
12637
12638         if (ipif->ipif_flags & IPIF_UP) {
12639                 /*
12640                  * If the interface is already marked up,
12641                  * we call ipif_down which will take care
12642                  * of ditching any IREs that have been set
12643                  * up based on the old interface address.
12644                  */
12645                 err = ipif_logical_down(ipif, q, mp);
12646                 if (err == EINPROGRESS)
12647                         return (err);
12648                 ipif_down_tail(ipif);
12649                 need_up = B_TRUE;
12650         }
12651
12652         err = ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, need_up);
12653         return (err);
12654 }
12655
12656 static int
12657 ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t v6addr, in6_addr_t v6mask,
12658     queue_t *q, mblk_t *mp, boolean_t need_up)
12659 {
12660         ill_t   *ill = ipif->ipif_ill;
12661         int     err = 0;
12662
12663         ip1dbg(("ip_sioctl_subnet_tail(%s:%u %p)\n",
12664             ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
12665
12666         /* Set the new address. */
12667         mutex_enter(&ill->ill_lock);
12668         ipif->ipif_v6net_mask = v6mask;
12669         if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) {
12670                 V6_MASK_COPY(v6addr, ipif->ipif_v6net_mask,
12671                     ipif->ipif_v6subnet);
12672         }
12673         mutex_exit(&ill->ill_lock);
12674
12675         if (need_up) {
12676                 /*
12677                  * Now bring the interface back up.  If this
12678                  * is the only IPIF for the ILL, ipif_up
12679                  * will have to re-bind to the device, so
12680                  * we may get back EINPROGRESS, in which
12681                  * case, this IOCTL will get completed in
12682                  * ip_rput_dlpi when we see the DL_BIND_ACK.
12683                  */
12684                 err = ipif_up(ipif, q, mp);
12685                 if (err == EINPROGRESS)
12686                         return (err);
12687         }
12688         return (err);
12689 }
12690
12691 /* ARGSUSED */
12692 int
12693 ip_sioctl_subnet_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12694     ip_ioctl_cmd_t *ipip, void *if_req)
12695 {
12696         int     addrlen;
12697         in6_addr_t v6addr;
12698         in6_addr_t v6mask;
12699         struct lifreq *lifr = (struct lifreq *)if_req;
12700
12701         ip1dbg(("ip_sioctl_subnet_restart(%s:%u %p)\n",
12702             ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
12703         ipif_down_tail(ipif);
12704
12705         addrlen = lifr->lifr_addrlen;
12706         if (ipif->ipif_isv6) {
12707                 sin6_t *sin6;
12708
12709                 sin6 = (sin6_t *)sin;
12710                 v6addr = sin6->sin6_addr;
12711         } else {
12712                 ipaddr_t addr;
12713
12714                 addr = sin->sin_addr.s_addr;
12715                 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
12716                 addrlen += IPV6_ABITS - IP_ABITS;
12717         }
12718         (void) ip_plen_to_mask_v6(addrlen, &v6mask);
12719
12720         return (ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, B_TRUE));
12721 }
12722
12723 /* ARGSUSED */
12724 int
12725 ip_sioctl_get_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12726     ip_ioctl_cmd_t *ipip, void *if_req)
12727 {
12728         struct lifreq *lifr = (struct lifreq *)if_req;
12729         struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sin;
12730
12731         ip1dbg(("ip_sioctl_get_subnet(%s:%u %p)\n",
12732             ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
12733         ASSERT(ipip->ipi_cmd_type == LIF_CMD);
12734
12735         if (ipif->ipif_isv6) {
12736                 *sin6 = sin6_null;
12737                 sin6->sin6_family = AF_INET6;
12738                 sin6->sin6_addr = ipif->ipif_v6subnet;
12739                 lifr->lifr_addrlen =
12740                     ip_mask_to_plen_v6(&ipif->ipif_v6net_mask);
12741         } else {
12742                 *sin = sin_null;
12743                 sin->sin_family = AF_INET;
12744                 sin->sin_addr.s_addr = ipif->ipif_subnet;
12745                 lifr->lifr_addrlen = ip_mask_to_plen(ipif->ipif_net_mask);
12746         }
12747         return (0);
12748 }
12749
12750 /*
12751  * Set the IPv6 address token.
12752  */
12753 /* ARGSUSED */
12754 int
12755 ip_sioctl_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12756     ip_ioctl_cmd_t *ipi, void *if_req)
12757 {
12758         ill_t *ill = ipif->ipif_ill;
12759         int err;
12760         in6_addr_t v6addr;
12761         in6_addr_t v6mask;
12762         boolean_t need_up = B_FALSE;
12763         int i;
12764         sin6_t *sin6 = (sin6_t *)sin;
12765         struct lifreq *lifr = (struct lifreq *)if_req;
12766         int addrlen;
12767
12768         ip1dbg(("ip_sioctl_token(%s:%u %p)\n",
12769             ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
12770         ASSERT(IAM_WRITER_IPIF(ipif));
12771
12772         addrlen = lifr->lifr_addrlen;
12773         /* Only allow for logical unit zero i.e. not on "le0:17" */
12774         if (ipif->ipif_id != 0)
12775                 return (EINVAL);
12776
12777         if (!ipif->ipif_isv6)
12778                 return (EINVAL);
12779
12780         if (addrlen > IPV6_ABITS)
12781                 return (EINVAL);
12782
12783         v6addr = sin6->sin6_addr;
12784
12785         /*
12786          * The length of the token is the length from the end.  To get
12787          * the proper mask for this, compute the mask of the bits not
12788          * in the token; ie. the prefix, and then xor to get the mask.
12789          */
12790         if (ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask) == NULL)
12791                 return (EINVAL);
12792         for (i = 0; i < 4; i++) {
12793                 v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff;
12794         }
12795
12796         if (V6_MASK_EQ(v6addr, v6mask, ill->ill_token) &&
12797             ill->ill_token_length == addrlen)
12798                 return (0);     /* No change */
12799
12800         if (ipif->ipif_flags & IPIF_UP) {
12801                 err = ipif_logical_down(ipif, q, mp);
12802                 if (err == EINPROGRESS)
12803                         return (err);
12804                 ipif_down_tail(ipif);
12805                 need_up = B_TRUE;
12806         }
12807         err = ip_sioctl_token_tail(ipif, sin6, addrlen, q, mp, need_up);
12808         return (err);
12809 }
12810
12811 static int
12812 ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen, queue_t *q,
12813     mblk_t *mp, boolean_t need_up)
12814 {
12815         in6_addr_t v6addr;
12816         in6_addr_t v6mask;
12817         ill_t   *ill = ipif->ipif_ill;
12818         int     i;
12819         int     err = 0;
12820
12821         ip1dbg(("ip_sioctl_token_tail(%s:%u %p)\n",
12822             ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
12823         v6addr = sin6->sin6_addr;
12824         /*
12825          * The length of the token is the length from the end.  To get
12826          * the proper mask for this, compute the mask of the bits not
12827          * in the token; ie. the prefix, and then xor to get the mask.
12828          */
12829         (void) ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask);
12830         for (i = 0; i < 4; i++)
12831                 v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff;
12832
12833         mutex_enter(&ill->ill_lock);
12834         V6_MASK_COPY(v6addr, v6mask, ill->ill_token);
12835         ill->ill_token_length = addrlen;
12836         mutex_exit(&ill->ill_lock);
12837
12838         if (need_up) {
12839                 /*
12840                  * Now bring the interface back up.  If this
12841                  * is the only IPIF for the ILL, ipif_up
12842                  * will have to re-bind to the device, so
12843                  * we may get back EINPROGRESS, in which
12844                  * case, this IOCTL will get completed in
12845                  * ip_rput_dlpi when we see the DL_BIND_ACK.
12846                  */
12847                 err = ipif_up(ipif, q, mp);
12848                 if (err == EINPROGRESS)
12849                         return (err);
12850         }
12851         return (err);
12852 }
12853
12854 /* ARGSUSED */
12855 int
12856 ip_sioctl_get_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12857     ip_ioctl_cmd_t *ipi, void *if_req)
12858 {
12859         ill_t *ill;
12860         sin6_t *sin6 = (sin6_t *)sin;
12861         struct lifreq *lifr = (struct lifreq *)if_req;
12862
12863         ip1dbg(("ip_sioctl_get_token(%s:%u %p)\n",
12864             ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
12865         if (ipif->ipif_id != 0)
12866                 return (EINVAL);
12867
12868         ill = ipif->ipif_ill;
12869         if (!ill->ill_isv6)
12870                 return (ENXIO);
12871
12872         *sin6 = sin6_null;
12873         sin6->sin6_family = AF_INET6;
12874         ASSERT(!IN6_IS_ADDR_V4MAPPED(&ill->ill_token));
12875         sin6->sin6_addr = ill->ill_token;
12876         lifr->lifr_addrlen = ill->ill_token_length;
12877         return (0);
12878 }
12879
12880 /*
12881  * Set (hardware) link specific information that might override
12882  * what was acquired through the DL_INFO_ACK.
12883  * The logic is as follows.
12884  *
12885  * become exclusive
12886  * set CHANGING flag
12887  * change mtu on affected IREs
12888  * clear CHANGING flag
12889  *
12890  * An ire add that occurs before the CHANGING flag is set will have its mtu
12891  * changed by the ip_sioctl_lnkinfo.
12892  *
12893  * During the time the CHANGING flag is set, no new ires will be added to the
12894  * bucket, and ire add will fail (due the CHANGING flag).
12895  *
12896  * An ire add that occurs after the CHANGING flag is set will have the right mtu
12897  * before it is added to the bucket.
12898  *
12899  * Obviously only 1 thread can set the CHANGING flag and we need to become
12900  * exclusive to set the flag.
12901  */
12902 /* ARGSUSED */
12903 int
12904 ip_sioctl_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12905     ip_ioctl_cmd_t *ipi, void *if_req)
12906 {
12907         ill_t           *ill = ipif->ipif_ill;
12908         ipif_t          *nipif;
12909         int             ip_min_mtu;
12910         boolean_t       mtu_walk = B_FALSE;
12911         struct lifreq   *lifr = (struct lifreq *)if_req;
12912         lif_ifinfo_req_t *lir;
12913         ire_t           *ire;
12914
12915         ip1dbg(("ip_sioctl_lnkinfo(%s:%u %p)\n",
12916             ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
12917         lir = &lifr->lifr_ifinfo;
12918         ASSERT(IAM_WRITER_IPIF(ipif));
12919
12920         /* Only allow for logical unit zero i.e. not on "le0:17" */
12921         if (ipif->ipif_id != 0)
12922                 return (EINVAL);
12923
12924         /* Set interface MTU. */
12925         if (ipif->ipif_isv6)
12926                 ip_min_mtu = IPV6_MIN_MTU;
12927         else
12928                 ip_min_mtu = IP_MIN_MTU;
12929
12930         /*
12931          * Verify values before we set anything. Allow zero to
12932          * mean unspecified.
12933          */
12934         if (lir->lir_maxmtu != 0 &&
12935             (lir->lir_maxmtu > ill->ill_max_frag ||
12936             lir->lir_maxmtu < ip_min_mtu))
12937                 return (EINVAL);
12938         if (lir->lir_reachtime != 0 &&
12939             lir->lir_reachtime > ND_MAX_REACHTIME)
12940                 return (EINVAL);
12941         if (lir->lir_reachretrans != 0 &&
12942             lir->lir_reachretrans > ND_MAX_REACHRETRANSTIME)
12943                 return (EINVAL);
12944
12945         mutex_enter(&ill->ill_lock);
12946         ill->ill_state_flags |= ILL_CHANGING;
12947         for (nipif = ill->ill_ipif; nipif != NULL;
12948             nipif = nipif->ipif_next) {
12949                 nipif->ipif_state_flags |= IPIF_CHANGING;
12950         }
12951
12952         mutex_exit(&ill->ill_lock);
12953
12954         if (lir->lir_maxmtu != 0) {
12955                 ill->ill_max_mtu = lir->lir_maxmtu;
12956                 ill->ill_mtu_userspecified = 1;
12957                 mtu_walk = B_TRUE;
12958         }
12959
12960         if (lir->lir_reachtime != 0)
12961                 ill->ill_reachable_time = lir->lir_reachtime;
12962
12963         if (lir->lir_reachretrans != 0)
12964                 ill->ill_reachable_retrans_time = lir->lir_reachretrans;
12965
12966         ill->ill_max_hops = lir->lir_maxhops;
12967
12968         ill->ill_max_buf = ND_MAX_Q;
12969
12970         if (mtu_walk) {
12971                 /*
12972                  * Set the MTU on all ipifs associated with this ill except
12973                  * for those whose MTU was fixed via SIOCSLIFMTU.
12974                  */
12975                 for (nipif = ill->ill_ipif; nipif != NULL;
12976                     nipif = nipif->ipif_next) {
12977                         if (nipif->ipif_flags & IPIF_FIXEDMTU)
12978                                 continue;
12979
12980                         nipif->ipif_mtu = ill->ill_max_mtu;
12981
12982                         if (!(nipif->ipif_flags & IPIF_UP))
12983                                 continue;
12984
12985                         if (nipif->ipif_isv6)
12986                                 ire = ipif_to_ire_v6(nipif);
12987                         else
12988                                 ire = ipif_to_ire(nipif);
12989                         if (ire != NULL) {
12990                                 ire->ire_max_frag = ipif->ipif_mtu;
12991                                 ire_refrele(ire);
12992                         }
12993                         if (ill->ill_isv6) {
12994                                 ire_walk_ill_v6(MATCH_IRE_ILL, 0,
12995                                     ipif_mtu_change, (char *)nipif,
12996                                     ill);
12997                         } else {
12998                                 ire_walk_ill_v4(MATCH_IRE_ILL, 0,
12999                                     ipif_mtu_change, (char *)nipif,
13000                                     ill);
13001                         }
13002                 }
13003         }
13004
13005         mutex_enter(&ill->ill_lock);
13006         for (nipif = ill->ill_ipif; nipif != NULL;
13007             nipif = nipif->ipif_next) {
13008                 nipif->ipif_state_flags &= ~IPIF_CHANGING;
13009         }
13010         ILL_UNMARK_CHANGING(ill);
13011         mutex_exit(&ill->ill_lock);
13012
13013         return (0);
13014 }
13015
13016 /* ARGSUSED */
13017 int
13018 ip_sioctl_get_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
13019     ip_ioctl_cmd_t *ipi, void *if_req)
13020 {
13021         struct lif_ifinfo_req *lir;
13022         ill_t *ill = ipif->ipif_ill;
13023
13024         ip1dbg(("ip_sioctl_get_lnkinfo(%s:%u %p)\n",
13025             ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
13026         if (ipif->ipif_id != 0)
13027                 return (EINVAL);
13028
13029         lir = &((struct lifreq *)if_req)->lifr_ifinfo;
13030         lir->lir_maxhops = ill->ill_max_hops;
13031         lir->lir_reachtime = ill->ill_reachable_time;
13032         lir->lir_reachretrans = ill->ill_reachable_retrans_time;
13033         lir->lir_maxmtu = ill->ill_max_mtu;
13034
13035         return (0);
13036 }
13037
13038 /*
13039  * Return best guess as to the subnet mask for the specified address.
13040  * Based on the subnet masks for all the configured interfaces.
13041  *
13042  * We end up returning a zero mask in the case of default, multicast or
13043  * experimental.
13044  */
13045 static ipaddr_t
13046 ip_subnet_mask(ipaddr_t addr, ipif_t **ipifp, ip_stack_t *ipst)
13047 {
13048         ipaddr_t net_mask;
13049         ill_t   *ill;
13050         ipif_t  *ipif;
13051         ill_walk_context_t ctx;
13052         ipif_t  *fallback_ipif = NULL;
13053
13054         net_mask = ip_net_mask(addr);
13055         if (net_mask == 0) {
13056                 *ipifp = NULL;
13057                 return (0);
13058         }
13059
13060         /* Let's check to see if this is maybe a local subnet route. */
13061         /* this function only applies to IPv4 interfaces */
13062         rw_enter(&ipst->ips_ill_g_lock, RW_READER);
13063         ill = ILL_START_WALK_V4(&ctx, ipst);
13064         for (; ill != NULL; ill = ill_next(&ctx, ill)) {
13065                 mutex_enter(&ill->ill_lock);
13066                 for (ipif = ill->ill_ipif; ipif != NULL;
13067                     ipif = ipif->ipif_next) {
13068                         if (!IPIF_CAN_LOOKUP(ipif))
13069                                 continue;
13070                         if (!(ipif->ipif_flags & IPIF_UP))
13071                                 continue;
13072                         if ((ipif->ipif_subnet & net_mask) ==
13073                             (addr & net_mask)) {
13074                                 /*
13075                                  * Don't trust pt-pt interfaces if there are
13076                                  * other interfaces.
13077                                  */
13078                                 if (ipif->ipif_flags & IPIF_POINTOPOINT) {
13079                                         if (fallback_ipif == NULL) {
13080                                                 ipif_refhold_locked(ipif);
13081                                                 fallback_ipif = ipif;
13082                                         }
13083                                         continue;
13084                                 }
13085
13086                                 /*
13087                                  * Fine. Just assume the same net mask as the
13088                                  * directly attached subnet interface is using.
13089                                  */
13090                                 ipif_refhold_locked(ipif);
13091                                 mutex_exit(&ill->ill_lock);
13092                                 rw_exit(&ipst->ips_ill_g_lock);
13093                                 if (fallback_ipif != NULL)
13094                                         ipif_refrele(fallback_ipif);
13095                                 *ipifp = ipif;
13096                                 return (ipif->ipif_net_mask);
13097                         }
13098                 }
13099                 mutex_exit(&ill->ill_lock);
13100         }
13101         rw_exit(&ipst->ips_ill_g_lock);
13102
13103         *ipifp = fallback_ipif;
13104         return ((fallback_ipif != NULL) ?
13105             fallback_ipif->ipif_net_mask : net_mask);
13106 }
13107
13108 /*
13109  * ip_sioctl_copyin_setup calls ip_wput_ioctl to process the IP_IOCTL ioctl.
13110  */
13111 static void
13112 ip_wput_ioctl(queue_t *q, mblk_t *mp)
13113 {
13114         IOCP    iocp;
13115         ipft_t  *ipft;
13116         ipllc_t *ipllc;
13117         mblk_t  *mp1;
13118         cred_t  *cr;
13119         int     error = 0;
13120         conn_t  *connp;
13121
13122         ip1dbg(("ip_wput_ioctl"));
13123         iocp = (IOCP)mp->b_rptr;
13124         mp1 = mp->b_cont;
13125         if (mp1 == NULL) {
13126                 iocp->ioc_error = EINVAL;
13127                 mp->b_datap->db_type = M_IOCNAK;
13128                 iocp->ioc_count = 0;
13129                 qreply(q, mp);
13130                 return;
13131         }
13132
13133         /*
13134          * These IOCTLs provide various control capabilities to
13135          * upstream agents such as ULPs and processes.  There
13136          * are currently two such IOCTLs implemented.  They
13137          * are used by TCP to provide update information for
13138          * existing IREs and to forcibly delete an IRE for a
13139          * host that is not responding, thereby forcing an
13140          * attempt at a new route.
13141          */
13142         iocp->ioc_error = EINVAL;
13143         if (!pullupmsg(mp1, sizeof (ipllc->ipllc_cmd)))
13144                 goto done;
13145
13146         ipllc = (ipllc_t *)mp1->b_rptr;
13147         for (ipft = ip_ioctl_ftbl; ipft->ipft_pfi; ipft++) {
13148                 if (ipllc->ipllc_cmd == ipft->ipft_cmd)
13149                         break;
13150         }
13151         /*
13152          * prefer credential from mblk over ioctl;
13153          * see ip_sioctl_copyin_setup
13154          */
13155         cr = DB_CREDDEF(mp, iocp->ioc_cr);
13156
13157         /*
13158          * Refhold the conn in case the request gets queued up in some lookup
13159          */
13160         ASSERT(CONN_Q(q));
13161         connp = Q_TO_CONN(q);
13162         CONN_INC_REF(connp);
13163         if (ipft->ipft_pfi &&
13164             ((mp1->b_wptr - mp1->b_rptr) >= ipft->ipft_min_size ||
13165             pullupmsg(mp1, ipft->ipft_min_size))) {
13166                 error = (*ipft->ipft_pfi)(q,
13167                     (ipft->ipft_flags & IPFT_F_SELF_REPLY) ? mp : mp1, cr);
13168         }
13169         if (ipft->ipft_flags & IPFT_F_SELF_REPLY) {
13170                 /*
13171                  * CONN_OPER_PENDING_DONE happens in the function called
13172                  * through ipft_pfi above.
13173                  */
13174                 return;
13175         }
13176
13177         CONN_OPER_PENDING_DONE(connp);
13178         if (ipft->ipft_flags & IPFT_F_NO_REPLY) {
13179                 freemsg(mp);
13180                 return;
13181         }
13182         iocp->ioc_error = error;
13183
13184 done:
13185         mp->b_datap->db_type = M_IOCACK;
13186         if (iocp->ioc_error)
13187                 iocp->ioc_count = 0;
13188         qreply(q, mp);
13189 }
13190
13191 /*
13192  * Lookup an ipif using the sequence id (ipif_seqid)
13193  */
13194 ipif_t *
13195 ipif_lookup_seqid(ill_t *ill, uint_t seqid)
13196 {
13197         ipif_t *ipif;
13198
13199         ASSERT(MUTEX_HELD(&ill->ill_lock));
13200
13201         for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
13202                 if (ipif->ipif_seqid == seqid && IPIF_CAN_LOOKUP(ipif))
13203                         return (ipif);
13204         }
13205         return (NULL);
13206 }
13207
13208 /*
13209  * Assign a unique id for the ipif. This is used later when we send
13210  * IRES to ARP for resolution where we initialize ire_ipif_seqid
13211  * to the value pointed by ire_ipif->ipif_seqid. Later when the
13212  * IRE is added, we verify that ipif has not disappeared.
13213  */
13214
13215 static void
13216 ipif_assign_seqid(ipif_t *ipif)
13217 {
13218         ip_stack_t      *ipst = ipif->ipif_ill->ill_ipst;
13219
13220         ipif->ipif_seqid = atomic_add_64_nv(&ipst->ips_ipif_g_seqid, 1);
13221 }
13222
13223 /*
13224  * Insert the ipif, so that the list of ipifs on the ill will be sorted
13225  * with respect to ipif_id. Note that an ipif with an ipif_id of -1 will
13226  * be inserted into the first space available in the list. The value of
13227  * ipif_id will then be set to the appropriate value for its position.
13228  */
13229 static int
13230 ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock, boolean_t acquire_ill_lock)
13231 {
13232         ill_t *ill;
13233         ipif_t *tipif;
13234         ipif_t **tipifp;
13235         int id;
13236         ip_stack_t      *ipst;
13237
13238         ASSERT(ipif->ipif_ill->ill_net_type == IRE_LOOPBACK ||
13239             IAM_WRITER_IPIF(ipif));
13240
13241         ill = ipif->ipif_ill;
13242         ASSERT(ill != NULL);
13243         ipst = ill->ill_ipst;
13244
13245         /*
13246          * In the case of lo0:0 we already hold the ill_g_lock.
13247          * ill_lookup_on_name (acquires ill_g_lock) -> ipif_allocate ->
13248          * ipif_insert. Another such caller is ipif_move.
13249          */
13250         if (acquire_g_lock)
13251                 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
13252         if (acquire_ill_lock)
13253                 mutex_enter(&ill->ill_lock);
13254         id = ipif->ipif_id;
13255         tipifp = &(ill->ill_ipif);
13256         if (id == -1) { /* need to find a real id */
13257                 id = 0;
13258                 while ((tipif = *tipifp) != NULL) {
13259                         ASSERT(tipif->ipif_id >= id);
13260                         if (tipif->ipif_id != id)
13261                                 break; /* non-consecutive id */
13262                         id++;
13263                         tipifp = &(tipif->ipif_next);
13264                 }
13265                 /* limit number of logical interfaces */
13266                 if (id >= ipst->ips_ip_addrs_per_if) {
13267                         if (acquire_ill_lock)
13268                                 mutex_exit(&ill->ill_lock);
13269                         if (acquire_g_lock)
13270                                 rw_exit(&ipst->ips_ill_g_lock);
13271                         return (-1);
13272                 }
13273                 ipif->ipif_id = id; /* assign new id */
13274         } else if (id < ipst->ips_ip_addrs_per_if) {
13275                 /* we have a real id; insert ipif in the right place */
13276                 while ((tipif = *tipifp) != NULL) {
13277                         ASSERT(tipif->ipif_id != id);
13278                         if (tipif->ipif_id > id)
13279                                 break; /* found correct location */
13280                         tipifp = &(tipif->ipif_next);
13281                 }
13282         } else {
13283                 if (acquire_ill_lock)
13284                         mutex_exit(&ill->ill_lock);
13285                 if (acquire_g_lock)
13286                         rw_exit(&ipst->ips_ill_g_lock);
13287                 return (-1);
13288         }
13289
13290         ASSERT(tipifp != &(ill->ill_ipif) || id == 0);
13291
13292         ipif->ipif_next = tipif;
13293         *tipifp = ipif;
13294         if (acquire_ill_lock)
13295                 mutex_exit(&ill->ill_lock);
13296         if (acquire_g_lock)
13297                 rw_exit(&ipst->ips_ill_g_lock);
13298         return (0);
13299 }
13300
13301 static void
13302 ipif_remove(ipif_t *ipif, boolean_t acquire_ill_lock)
13303 {
13304         ipif_t  **ipifp;
13305         ill_t   *ill = ipif->ipif_ill;
13306
13307         ASSERT(RW_WRITE_HELD(&ill->ill_ipst->ips_ill_g_lock));
13308         if (acquire_ill_lock)
13309                 mutex_enter(&ill->ill_lock);
13310         else
13311                 ASSERT(MUTEX_HELD(&ill->ill_lock));
13312
13313         ipifp = &ill->ill_ipif;
13314         for (; *ipifp != NULL; ipifp = &ipifp[0]->ipif_next) {
13315                 if (*ipifp == ipif) {
13316                         *ipifp = ipif->ipif_next;
13317                         break;
13318                 }
13319         }
13320
13321         if (acquire_ill_lock)
13322                 mutex_exit(&ill->ill_lock);
13323 }
13324
13325 /*
13326  * Allocate and initialize a new interface control structure.  (Always
13327  * called as writer.)
13328  * When ipif_allocate() is called from ip_ll_subnet_defaults, the ill
13329  * is not part of the global linked list of ills. ipif_seqid is unique
13330  * in the system and to preserve the uniqueness, it is assigned only
13331  * when ill becomes part of the global list. At that point ill will
13332  * have a name. If it doesn't get assigned here, it will get assigned
13333  * in ipif_set_values() as part of SIOCSLIFNAME processing.
13334  * Aditionally, if we come here from ip_ll_subnet_defaults, we don't set
13335  * the interface flags or any other information from the DL_INFO_ACK for
13336  * DL_STYLE2 drivers (initialize == B_FALSE), since we won't have them at
13337  * this point. The flags etc. will be set in ip_ll_subnet_defaults when the
13338  * second DL_INFO_ACK comes in from the driver.
13339  */
13340 static ipif_t *
13341 ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize)
13342 {
13343         ipif_t  *ipif;
13344         phyint_t *phyi;
13345
13346         ip1dbg(("ipif_allocate(%s:%d ill %p)\n",
13347             ill->ill_name, id, (void *)ill));
13348         ASSERT(ire_type == IRE_LOOPBACK || IAM_WRITER_ILL(ill));
13349
13350         if ((ipif = (ipif_t *)mi_alloc(sizeof (ipif_t), BPRI_MED)) == NULL)
13351                 return (NULL);
13352         *ipif = ipif_zero;      /* start clean */
13353
13354         ipif->ipif_ill = ill;
13355         ipif->ipif_id = id;     /* could be -1 */
13356         /*
13357          * Inherit the zoneid from the ill; for the shared stack instance
13358          * this is always the global zone
13359          */
13360         ipif->ipif_zoneid = ill->ill_zoneid;
13361
13362         mutex_init(&ipif->ipif_saved_ire_lock, NULL, MUTEX_DEFAULT, NULL);
13363
13364         ipif->ipif_refcnt = 0;
13365         ipif->ipif_saved_ire_cnt = 0;
13366
13367         if (ipif_insert(ipif, ire_type != IRE_LOOPBACK, B_TRUE)) {
13368                 mi_free(ipif);
13369                 return (NULL);
13370         }
13371         /* -1 id should have been replaced by real id */
13372         id = ipif->ipif_id;
13373         ASSERT(id >= 0);
13374
13375         if (ill->ill_name[0] != '\0')
13376                 ipif_assign_seqid(ipif);
13377
13378         /*
13379          * Keep a copy of original id in ipif_orig_ipifid.  Failback
13380          * will attempt to restore the original id.  The SIOCSLIFOINDEX
13381          * ioctl sets ipif_orig_ipifid to zero.
13382          */
13383         ipif->ipif_orig_ipifid = id;
13384
13385         /*
13386          * We grab the ill_lock and phyint_lock to protect the flag changes.
13387          * The ipif is still not up and can't be looked up until the
13388          * ioctl completes and the IPIF_CHANGING flag is cleared.
13389          */
13390         mutex_enter(&ill->ill_lock);
13391         mutex_enter(&ill->ill_phyint->phyint_lock);
13392         /*
13393          * Set the running flag when logical interface zero is created.
13394          * For subsequent logical interfaces, a DLPI link down
13395          * notification message may have cleared the running flag to
13396          * indicate the link is down, so we shouldn't just blindly set it.
13397          */
13398         if (id == 0)
13399                 ill->ill_phyint->phyint_flags |= PHYI_RUNNING;
13400         ipif->ipif_ire_type = ire_type;
13401         phyi = ill->ill_phyint;
13402         ipif->ipif_orig_ifindex = phyi->phyint_ifindex;
13403
13404         if (ipif->ipif_isv6) {
13405                 ill->ill_flags |= ILLF_IPV6;
13406         } else {
13407                 ipaddr_t inaddr_any = INADDR_ANY;
13408
13409                 ill->ill_flags |= ILLF_IPV4;
13410
13411                 /* Keep the IN6_IS_ADDR_V4MAPPED assertions happy */
13412                 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
13413                     &ipif->ipif_v6lcl_addr);
13414                 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
13415                     &ipif->ipif_v6src_addr);
13416                 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
13417                     &ipif->ipif_v6subnet);
13418                 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
13419                     &ipif->ipif_v6net_mask);
13420                 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
13421                     &ipif->ipif_v6brd_addr);
13422                 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
13423                     &ipif->ipif_v6pp_dst_addr);
13424         }
13425
13426         /*
13427          * Don't set the interface flags etc. now, will do it in
13428          * ip_ll_subnet_defaults.
13429          */
13430         if (!initialize) {
13431                 mutex_exit(&ill->ill_lock);
13432                 mutex_exit(&ill->ill_phyint->phyint_lock);
13433                 return (ipif);
13434         }
13435         ipif->ipif_mtu = ill->ill_max_mtu;
13436
13437         if (ill->ill_bcast_addr_length != 0) {
13438                 /*
13439                  * Later detect lack of DLPI driver multicast
13440                  * capability by catching DL_ENABMULTI errors in
13441                  * ip_rput_dlpi.
13442                  */
13443                 ill->ill_flags |= ILLF_MULTICAST;
13444                 if (!ipif->ipif_isv6)
13445                         ipif->ipif_flags |= IPIF_BROADCAST;
13446         } else {
13447                 if (ill->ill_net_type != IRE_LOOPBACK) {
13448                         if (ipif->ipif_isv6)
13449                                 /*
13450                                  * Note: xresolv interfaces will eventually need
13451                                  * NOARP set here as well, but that will require
13452                                  * those external resolvers to have some
13453                                  * knowledge of that flag and act appropriately.
13454                                  * Not to be changed at present.
13455                                  */
13456                                 ill->ill_flags |= ILLF_NONUD;
13457                         else
13458                                 ill->ill_flags |= ILLF_NOARP;
13459                 }
13460                 if (ill->ill_phys_addr_length == 0) {
13461                         if (ill->ill_media &&
13462                             ill->ill_media->ip_m_mac_type == SUNW_DL_VNI) {
13463                                 ipif->ipif_flags |= IPIF_NOXMIT;
13464                                 phyi->phyint_flags |= PHYI_VIRTUAL;
13465                         } else {
13466                                 /* pt-pt supports multicast. */
13467                                 ill->ill_flags |= ILLF_MULTICAST;
13468                                 if (ill->ill_net_type == IRE_LOOPBACK) {
13469                                         phyi->phyint_flags |=
13470                                             (PHYI_LOOPBACK | PHYI_VIRTUAL);
13471                                 } else {
13472                                         ipif->ipif_flags |= IPIF_POINTOPOINT;
13473                                 }
13474                         }
13475                 }
13476         }
13477         mutex_exit(&ill->ill_lock);
13478         mutex_exit(&ill->ill_phyint->phyint_lock);
13479         return (ipif);
13480 }
13481
13482 /*
13483  * If appropriate, send a message up to the resolver delete the entry
13484  * for the address of this interface which is going out of business.
13485  * (Always called as writer).
13486  *
13487  * NOTE : We need to check for NULL mps as some of the fields are
13488  *        initialized only for some interface types. See ipif_resolver_up()
13489  *        for details.
13490  */
13491 void
13492 ipif_arp_down(ipif_t *ipif)
13493 {
13494         mblk_t  *mp;
13495         ill_t   *ill = ipif->ipif_ill;
13496
13497         ip1dbg(("ipif_arp_down(%s:%u)\n", ill->ill_name, ipif->ipif_id));
13498         ASSERT(IAM_WRITER_IPIF(ipif));
13499
13500         /* Delete the mapping for the local address */
13501         mp = ipif->ipif_arp_del_mp;
13502         if (mp != NULL) {
13503                 ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n",
13504                     *(unsigned *)mp->b_rptr, ill->ill_name, ipif->ipif_id));
13505                 putnext(ill->ill_rq, mp);
13506                 ipif->ipif_arp_del_mp = NULL;
13507         }
13508
13509         /*
13510          * If this is the last ipif that is going down and there are no
13511          * duplicate addresses we may yet attempt to re-probe, then we need to
13512          * clean up ARP completely.
13513          */
13514         if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0) {
13515
13516                 /* Send up AR_INTERFACE_DOWN message */
13517                 mp = ill->ill_arp_down_mp;
13518                 if (mp != NULL) {
13519                         ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n",
13520                             *(unsigned *)mp->b_rptr, ill->ill_name,
13521                             ipif->ipif_id));
13522                         putnext(ill->ill_rq, mp);
13523                         ill->ill_arp_down_mp = NULL;
13524                 }
13525
13526                 /* Tell ARP to delete the multicast mappings */
13527                 mp = ill->ill_arp_del_mapping_mp;
13528                 if (mp != NULL) {
13529                         ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n",
13530                             *(unsigned *)mp->b_rptr, ill->ill_name,
13531                             ipif->ipif_id));
13532                         putnext(ill->ill_rq, mp);
13533                         ill->ill_arp_del_mapping_mp = NULL;
13534                 }
13535         }
13536 }
13537
13538 /*
13539  * This function sets up the multicast mappings in ARP. When ipif_resolver_up
13540  * calls this function, it passes a non-NULL arp_add_mapping_mp indicating
13541  * that it wants the add_mp allocated in this function to be returned
13542  * wihtout sending it to arp. When ip_rput_dlpi_writer calls this to
13543  * just re-do the multicast, it wants us to send the add_mp to ARP also.
13544  * ipif_resolver_up does not want us to do the "add" i.e sending to ARP,
13545  * as it does a ipif_arp_down after calling this function - which will
13546  * remove what we add here.
13547  *
13548  * Returns -1 on failures and 0 on success.
13549  */
13550 int
13551 ipif_arp_setup_multicast(ipif_t *ipif, mblk_t **arp_add_mapping_mp)
13552 {
13553         mblk_t  *del_mp = NULL;
13554         mblk_t *add_mp = NULL;
13555         mblk_t *mp;
13556         ill_t   *ill = ipif->ipif_ill;
13557         phyint_t *phyi = ill->ill_phyint;
13558         ipaddr_t addr, mask, extract_mask = 0;
13559         arma_t  *arma;
13560         uint8_t *maddr, *bphys_addr;
13561         uint32_t hw_start;
13562         dl_unitdata_req_t *dlur;
13563
13564         ASSERT(IAM_WRITER_IPIF(ipif));
13565         if (ipif->ipif_flags & IPIF_POINTOPOINT)
13566                 return (0);
13567
13568         /*
13569          * Delete the existing mapping from ARP. Normally ipif_down
13570          * -> ipif_arp_down should send this up to ARP. The only
13571          * reason we would find this when we are switching from
13572          * Multicast to Broadcast where we did not do a down.
13573          */
13574         mp = ill->ill_arp_del_mapping_mp;
13575         if (mp != NULL) {
13576                 ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n",
13577                     *(unsigned *)mp->b_rptr, ill->ill_name, ipif->ipif_id));
13578                 putnext(ill->ill_rq, mp);
13579                 ill->ill_arp_del_mapping_mp = NULL;
13580         }
13581
13582         if (arp_add_mapping_mp != NULL)
13583                 *arp_add_mapping_mp = NULL;
13584
13585         /*
13586          * Check that the address is not to long for the constant
13587          * length reserved in the template arma_t.
13588          */
13589         if (ill->ill_phys_addr_length > IP_MAX_HW_LEN)
13590                 return (-1);
13591
13592         /* Add mapping mblk */
13593         addr = (ipaddr_t)htonl(INADDR_UNSPEC_GROUP);
13594         mask = (ipaddr_t)htonl(IN_CLASSD_NET);
13595         add_mp = ill_arp_alloc(ill, (uchar_t *)&ip_arma_multi_template,
13596             (caddr_t)&addr);
13597         if (add_mp == NULL)
13598                 return (-1);
13599         arma = (arma_t *)add_mp->b_rptr;
13600         maddr = (uint8_t *)arma + arma->arma_hw_addr_offset;
13601         bcopy(&mask, (char *)arma + arma->arma_proto_mask_offset, IP_ADDR_LEN);
13602         arma->arma_hw_addr_length = ill->ill_phys_addr_length;
13603
13604         /*
13605          * Determine the broadcast address.
13606          */
13607         dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr;
13608         if (ill->ill_sap_length < 0)
13609                 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset;
13610         else
13611                 bphys_addr = (uchar_t *)dlur +
13612                     dlur->dl_dest_addr_offset + ill->ill_sap_length;
13613         /*
13614          * Check PHYI_MULTI_BCAST and length of physical
13615          * address to determine if we use the mapping or the
13616          * broadcast address.
13617          */
13618         if (!(phyi->phyint_flags & PHYI_MULTI_BCAST))
13619                 if (!MEDIA_V4MINFO(ill->ill_media, ill->ill_phys_addr_length,
13620                     bphys_addr, maddr, &hw_start, &extract_mask))
13621                         phyi->phyint_flags |= PHYI_MULTI_BCAST;
13622
13623         if ((phyi->phyint_flags & PHYI_MULTI_BCAST) ||
13624             (ill->ill_flags & ILLF_MULTICAST)) {
13625                 /* Make sure this will not match the "exact" entry. */
13626                 addr = (ipaddr_t)htonl(INADDR_ALLHOSTS_GROUP);
13627                 del_mp = ill_arp_alloc(ill, (uchar_t *)&ip_ared_template,
13628                     (caddr_t)&addr);
13629                 if (del_mp == NULL) {
13630                         freemsg(add_mp);
13631                         return (-1);
13632                 }
13633                 bcopy(&extract_mask, (char *)arma +
13634                     arma->arma_proto_extract_mask_offset, IP_ADDR_LEN);
13635                 if (phyi->phyint_flags & PHYI_MULTI_BCAST) {
13636                         /* Use link-layer broadcast address for MULTI_BCAST */
13637                         bcopy(bphys_addr, maddr, ill->ill_phys_addr_length);
13638                         ip2dbg(("ipif_arp_setup_multicast: adding"
13639                             " MULTI_BCAST ARP setup for %s\n", ill->ill_name));
13640                 } else {
13641                         arma->arma_hw_mapping_start = hw_start;
13642                         ip2dbg(("ipif_arp_setup_multicast: adding multicast"
13643                             " ARP setup for %s\n", ill->ill_name));
13644                 }
13645         } else {
13646                 freemsg(add_mp);
13647                 ASSERT(del_mp == NULL);
13648                 /* It is neither MULTICAST nor MULTI_BCAST */
13649                 return (0);
13650         }
13651         ASSERT(add_mp != NULL && del_mp != NULL);
13652         ASSERT(ill->ill_arp_del_mapping_mp == NULL);
13653         ill->ill_arp_del_mapping_mp = del_mp;
13654         if (arp_add_mapping_mp != NULL) {
13655                 /* The caller just wants the mblks allocated */
13656                 *arp_add_mapping_mp = add_mp;
13657         } else {
13658                 /* The caller wants us to send it to arp */
13659                 putnext(ill->ill_rq, add_mp);
13660         }
13661         return (0);
13662 }
13663
13664 /*
13665  * Get the resolver set up for a new interface address.
13666  * (Always called as writer.)
13667  * Called both for IPv4 and IPv6 interfaces,
13668  * though it only sets up the resolver for v6
13669  * if it's an xresolv interface (one using an external resolver).
13670  * Honors ILLF_NOARP.
13671  * The enumerated value res_act is used to tune the behavior.
13672  * If set to Res_act_initial, then we set up all the resolver
13673  * structures for a new interface.  If set to Res_act_move, then
13674  * we just send an AR_ENTRY_ADD message up to ARP for IPv4
13675  * interfaces; this is called by ip_rput_dlpi_writer() to handle
13676  * asynchronous hardware address change notification.  If set to
13677  * Res_act_defend, then we tell ARP that it needs to send a single
13678  * gratuitous message in defense of the address.
13679  * Returns error on failure.
13680  */
13681 int
13682 ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act)
13683 {
13684         caddr_t addr;
13685         mblk_t  *arp_up_mp = NULL;
13686         mblk_t  *arp_down_mp = NULL;
13687         mblk_t  *arp_add_mp = NULL;
13688         mblk_t  *arp_del_mp = NULL;
13689         mblk_t  *arp_add_mapping_mp = NULL;
13690         mblk_t  *arp_del_mapping_mp = NULL;
13691         ill_t   *ill = ipif->ipif_ill;
13692         uchar_t *area_p = NULL;
13693         uchar_t *ared_p = NULL;
13694         int     err = ENOMEM;
13695         boolean_t was_dup;
13696
13697         ip1dbg(("ipif_resolver_up(%s:%u) flags 0x%x\n",
13698             ill->ill_name, ipif->ipif_id, (uint_t)ipif->ipif_flags));
13699         ASSERT(IAM_WRITER_IPIF(ipif));
13700
13701         was_dup = B_FALSE;
13702         if (res_act == Res_act_initial) {
13703                 ipif->ipif_addr_ready = 0;
13704                 /*
13705                  * We're bringing an interface up here.  There's no way that we
13706                  * should need to shut down ARP now.
13707                  */
13708                 mutex_enter(&ill->ill_lock);
13709                 if (ipif->ipif_flags & IPIF_DUPLICATE) {
13710                         ipif->ipif_flags &= ~IPIF_DUPLICATE;
13711                         ill->ill_ipif_dup_count--;
13712                         was_dup = B_TRUE;
13713                 }
13714                 mutex_exit(&ill->ill_lock);
13715         }
13716         if (ipif->ipif_recovery_id != 0)
13717                 (void) untimeout(ipif->ipif_recovery_id);
13718         ipif->ipif_recovery_id = 0;
13719         if (ill->ill_net_type != IRE_IF_RESOLVER) {
13720                 ipif->ipif_addr_ready = 1;
13721                 return (0);
13722         }
13723         /* NDP will set the ipif_addr_ready flag when it's ready */
13724         if (ill->ill_isv6 && !(ill->ill_flags & ILLF_XRESOLV))
13725                 return (0);
13726
13727         if (ill->ill_isv6) {
13728                 /*
13729                  * External resolver for IPv6
13730                  */
13731                 ASSERT(res_act == Res_act_initial);
13732                 if (!IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) {
13733                         addr = (caddr_t)&ipif->ipif_v6lcl_addr;
13734                         area_p = (uchar_t *)&ip6_area_template;
13735                         ared_p = (uchar_t *)&ip6_ared_template;
13736                 }
13737         } else {
13738                 /*
13739                  * IPv4 arp case. If the ARP stream has already started
13740                  * closing, fail this request for ARP bringup. Else
13741                  * record the fact that an ARP bringup is pending.
13742                  */
13743                 mutex_enter(&ill->ill_lock);
13744                 if (ill->ill_arp_closing) {
13745                         mutex_exit(&ill->ill_lock);
13746                         err = EINVAL;
13747                         goto failed;
13748                 } else {
13749                         if (ill->ill_ipif_up_count == 0 &&
13750                             ill->ill_ipif_dup_count == 0 && !was_dup)
13751                                 ill->ill_arp_bringup_pending = 1;
13752                         mutex_exit(&ill->ill_lock);
13753                 }
13754                 if (ipif->ipif_lcl_addr != INADDR_ANY) {
13755                         addr = (caddr_t)&ipif->ipif_lcl_addr;
13756                         area_p = (uchar_t *)&ip_area_template;
13757                         ared_p = (uchar_t *)&ip_ared_template;
13758                 }
13759         }
13760
13761         /*
13762          * Add an entry for the local address in ARP only if it
13763          * is not UNNUMBERED and the address is not INADDR_ANY.
13764          */
13765         if (!(ipif->ipif_flags & IPIF_UNNUMBERED) && area_p != NULL) {
13766                 area_t *area;
13767
13768                 /* Now ask ARP to publish our address. */
13769                 arp_add_mp = ill_arp_alloc(ill, area_p, addr);
13770                 if (arp_add_mp == NULL)
13771                         goto failed;
13772                 area = (area_t *)arp_add_mp->b_rptr;
13773                 if (res_act != Res_act_initial) {
13774                         /*
13775                          * Copy the new hardware address and length into
13776                          * arp_add_mp to be sent to ARP.
13777                          */
13778                         area->area_hw_addr_length = ill->ill_phys_addr_length;
13779                         bcopy(ill->ill_phys_addr,
13780                             ((char *)area + area->area_hw_addr_offset),
13781                             area->area_hw_addr_length);
13782                 }
13783
13784                 area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH |
13785                     ACE_F_MYADDR;
13786
13787                 if (res_act == Res_act_defend) {
13788                         area->area_flags |= ACE_F_DEFEND;
13789                         /*
13790                          * If we're just defending our address now, then
13791                          * there's no need to set up ARP multicast mappings.
13792                          * The publish command is enough.
13793                          */
13794                         goto done;
13795                 }
13796
13797                 if (res_act != Res_act_initial)
13798                         goto arp_setup_multicast;
13799
13800                 /*
13801                  * Allocate an ARP deletion message so we know we can tell ARP
13802                  * when the interface goes down.
13803                  */
13804                 arp_del_mp = ill_arp_alloc(ill, ared_p, addr);
13805                 if (arp_del_mp == NULL)
13806                         goto failed;
13807
13808         } else {
13809                 if (res_act != Res_act_initial)
13810                         goto done;
13811         }
13812         /*
13813          * Need to bring up ARP or setup multicast mapping only
13814          * when the first interface is coming UP.
13815          */
13816         if (ill->ill_ipif_up_count != 0 || ill->ill_ipif_dup_count != 0 ||
13817             was_dup) {
13818                 goto done;
13819         }
13820
13821         /*
13822          * Allocate an ARP down message (to be saved) and an ARP up
13823          * message.
13824          */
13825         arp_down_mp = ill_arp_alloc(ill, (uchar_t *)&ip_ard_template, 0);
13826         if (arp_down_mp == NULL)
13827                 goto failed;
13828
13829         arp_up_mp = ill_arp_alloc(ill, (uchar_t *)&ip_aru_template, 0);
13830         if (arp_up_mp == NULL)
13831                 goto failed;
13832
13833         if (ipif->ipif_flags & IPIF_POINTOPOINT)
13834                 goto done;
13835
13836 arp_setup_multicast:
13837         /*
13838          * Setup the multicast mappings. This function initializes
13839          * ill_arp_del_mapping_mp also. This does not need to be done for
13840          * IPv6.
13841          */
13842         if (!ill->ill_isv6) {
13843                 err = ipif_arp_setup_multicast(ipif, &arp_add_mapping_mp);
13844                 if (err != 0)
13845                         goto failed;
13846                 ASSERT(ill->ill_arp_del_mapping_mp != NULL);
13847                 ASSERT(arp_add_mapping_mp != NULL);
13848         }
13849
13850 done:
13851         if (arp_del_mp != NULL) {
13852                 ASSERT(ipif->ipif_arp_del_mp == NULL);
13853                 ipif->ipif_arp_del_mp = arp_del_mp;
13854         }
13855         if (arp_down_mp != NULL) {
13856                 ASSERT(ill->ill_arp_down_mp == NULL);
13857                 ill->ill_arp_down_mp = arp_down_mp;
13858         }
13859         if (arp_del_mapping_mp != NULL) {
13860                 ASSERT(ill->ill_arp_del_mapping_mp == NULL);
13861                 ill->ill_arp_del_mapping_mp = arp_del_mapping_mp;
13862         }
13863         if (arp_up_mp != NULL) {
13864                 ip1dbg(("ipif_resolver_up: ARP_UP for %s:%u\n",
13865                     ill->ill_name, ipif->ipif_id));
13866                 putnext(ill->ill_rq, arp_up_mp);
13867         }
13868         if (arp_add_mp != NULL) {
13869                 ip1dbg(("ipif_resolver_up: ARP_ADD for %s:%u\n",
13870                     ill->ill_name, ipif->ipif_id));
13871                 /*
13872                  * If it's an extended ARP implementation, then we'll wait to
13873                  * hear that DAD has finished before using the interface.
13874                  */
13875                 if (!ill->ill_arp_extend)
13876                         ipif->ipif_addr_ready = 1;
13877                 putnext(ill->ill_rq, arp_add_mp);
13878         } else {
13879                 ipif->ipif_addr_ready = 1;
13880         }
13881         if (arp_add_mapping_mp != NULL) {
13882                 ip1dbg(("ipif_resolver_up: MAPPING_ADD for %s:%u\n",
13883                     ill->ill_name, ipif->ipif_id));
13884                 putnext(ill->ill_rq, arp_add_mapping_mp);
13885         }
13886         if (res_act != Res_act_initial)
13887                 return (0);
13888
13889         if (ill->ill_flags & ILLF_NOARP)
13890                 err = ill_arp_off(ill);
13891         else
13892                 err = ill_arp_on(ill);
13893         if (err != 0) {
13894                 ip0dbg(("ipif_resolver_up: arp_on/off failed %d\n", err));
13895                 freemsg(ipif->ipif_arp_del_mp);
13896                 freemsg(ill->ill_arp_down_mp);
13897                 freemsg(ill->ill_arp_del_mapping_mp);
13898                 ipif->ipif_arp_del_mp = NULL;
13899                 ill->ill_arp_down_mp = NULL;
13900                 ill->ill_arp_del_mapping_mp = NULL;
13901                 return (err);
13902         }
13903         return ((ill->ill_ipif_up_count != 0 || was_dup ||
13904             ill->ill_ipif_dup_count != 0) ? 0 : EINPROGRESS);
13905
13906 failed:
13907         ip1dbg(("ipif_resolver_up: FAILED\n"));
13908         freemsg(arp_add_mp);
13909         freemsg(arp_del_mp);
13910         freemsg(arp_add_mapping_mp);
13911         freemsg(arp_up_mp);
13912         freemsg(arp_down_mp);
13913         ill->ill_arp_bringup_pending = 0;
13914         return (err);
13915 }
13916
13917 /*
13918  * This routine restarts IPv4 duplicate address detection (DAD) when a link has
13919  * just gone back up.
13920  */
13921 static void
13922 ipif_arp_start_dad(ipif_t *ipif)
13923 {
13924         ill_t *ill = ipif->ipif_ill;
13925         mblk_t *arp_add_mp;
13926         area_t *area;
13927
13928         if (ill->ill_net_type != IRE_IF_RESOLVER || ill->ill_arp_closing ||
13929             (ipif->ipif_flags & IPIF_UNNUMBERED) ||
13930             ipif->ipif_lcl_addr == INADDR_ANY ||
13931             (arp_add_mp = ill_arp_alloc(ill, (uchar_t *)&ip_area_template,
13932             (char *)&ipif->ipif_lcl_addr)) == NULL) {
13933                 /*
13934                  * If we can't contact ARP for some reason, that's not really a
13935                  * problem.  Just send out the routing socket notification that
13936                  * DAD completion would have done, and continue.
13937                  */
13938                 ipif_mask_reply(ipif);
13939                 ip_rts_ifmsg(ipif);
13940                 ip_rts_newaddrmsg(RTM_ADD, 0, ipif);
13941                 sctp_update_ipif(ipif, SCTP_IPIF_UP);
13942                 ipif->ipif_addr_ready = 1;
13943                 return;
13944         }
13945
13946         /* Setting the 'unverified' flag restarts DAD */
13947         area = (area_t *)arp_add_mp->b_rptr;
13948         area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH | ACE_F_MYADDR |
13949             ACE_F_UNVERIFIED;
13950         putnext(ill->ill_rq, arp_add_mp);
13951 }
13952
13953 static void
13954 ipif_ndp_start_dad(ipif_t *ipif)
13955 {
13956         nce_t *nce;
13957
13958         nce = ndp_lookup_v6(ipif->ipif_ill, &ipif->ipif_v6lcl_addr, B_FALSE);
13959         if (nce == NULL)
13960                 return;
13961
13962         if (!ndp_restart_dad(nce)) {
13963                 /*
13964                  * If we can't restart DAD for some reason, that's not really a
13965                  * problem.  Just send out the routing socket notification that
13966                  * DAD completion would have done, and continue.
13967                  */
13968                 ip_rts_ifmsg(ipif);
13969                 ip_rts_newaddrmsg(RTM_ADD, 0, ipif);
13970                 sctp_update_ipif(ipif, SCTP_IPIF_UP);
13971                 ipif->ipif_addr_ready = 1;
13972         }
13973         NCE_REFRELE(nce);
13974 }
13975
13976 /*
13977  * Restart duplicate address detection on all interfaces on the given ill.
13978  *
13979  * This is called when an interface transitions from down to up
13980  * (DL_NOTE_LINK_UP) or up to down (DL_NOTE_LINK_DOWN).
13981  *
13982  * Note that since the underlying physical link has transitioned, we must cause
13983  * at least one routing socket message to be sent here, either via DAD
13984  * completion or just by default on the first ipif.  (If we don't do this, then
13985  * in.mpathd will see long delays when doing link-based failure recovery.)
13986  */
13987 void
13988 ill_restart_dad(ill_t *ill, boolean_t went_up)
13989 {
13990         ipif_t *ipif;
13991
13992         if (ill == NULL)
13993                 return;
13994
13995         /*
13996          * If layer two doesn't support duplicate address detection, then just
13997          * send the routing socket message now and be done with it.
13998          */
13999         if ((ill->ill_isv6 && (ill->ill_flags & ILLF_XRESOLV)) ||
14000             (!ill->ill_isv6 && !ill->ill_arp_extend)) {
14001                 ip_rts_ifmsg(ill->ill_ipif);
14002                 return;
14003         }
14004
14005         for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
14006                 if (went_up) {
14007                         if (ipif->ipif_flags & IPIF_UP) {
14008                                 if (ill->ill_isv6)
14009                                         ipif_ndp_start_dad(ipif);
14010                                 else
14011                                         ipif_arp_start_dad(ipif);
14012                         } else if (ill->ill_isv6 &&
14013                             (ipif->ipif_flags & IPIF_DUPLICATE)) {
14014                                 /*
14015                                  * For IPv4, the ARP module itself will
14016                                  * automatically start the DAD process when it
14017                                  * sees DL_NOTE_LINK_UP.  We respond to the
14018                                  * AR_CN_READY at the completion of that task.
14019                                  * For IPv6, we must kick off the bring-up
14020                                  * process now.
14021                                  */
14022                                 ndp_do_recovery(ipif);
14023                         } else {
14024                                 /*
14025                                  * Unfortunately, the first ipif is "special"
14026                                  * and represents the underlying ill in the
14027                                  * routing socket messages.  Thus, when this
14028                                  * one ipif is down, we must still notify so
14029                                  * that the user knows the IFF_RUNNING status
14030                                  * change.  (If the first ipif is up, then
14031                                  * we'll handle eventual routing socket
14032                                  * notification via DAD completion.)
14033                                  */
14034                                 if (ipif == ill->ill_ipif)
14035                                         ip_rts_ifmsg(ill->ill_ipif);
14036                         }
14037                 } else {
14038                         /*
14039                          * After link down, we'll need to send a new routing
14040                          * message when the link comes back, so clear
14041                          * ipif_addr_ready.
14042                          */
14043                         ipif->ipif_addr_ready = 0;
14044                 }
14045         }
14046
14047         /*
14048          * If we've torn down links, then notify the user right away.
14049          */
14050         if (!went_up)
14051                 ip_rts_ifmsg(ill->ill_ipif);
14052 }
14053
14054 /*
14055  * Wakeup all threads waiting to enter the ipsq, and sleeping
14056  * on any of the ills in this ipsq. The ill_lock of the ill
14057  * must be held so that waiters don't miss wakeups
14058  */
14059 static void
14060 ill_signal_ipsq_ills(ipsq_t *ipsq, boolean_t caller_holds_lock)
14061 {
14062         phyint_t *phyint;
14063
14064         phyint = ipsq->ipsq_phyint_list;
14065         while (phyint != NULL) {
14066                 if (phyint->phyint_illv4) {
14067                         if (!caller_holds_lock)
14068                                 mutex_enter(&phyint->phyint_illv4->ill_lock);
14069                         ASSERT(MUTEX_HELD(&phyint->phyint_illv4->ill_lock));
14070                         cv_broadcast(&phyint->phyint_illv4->ill_cv);
14071                         if (!caller_holds_lock)
14072                                 mutex_exit(&phyint->phyint_illv4->ill_lock);
14073                 }
14074                 if (phyint->phyint_illv6) {
14075                         if (!caller_holds_lock)
14076                                 mutex_enter(&phyint->phyint_illv6->ill_lock);
14077                         ASSERT(MUTEX_HELD(&phyint->phyint_illv6->ill_lock));
14078                         cv_broadcast(&phyint->phyint_illv6->ill_cv);
14079                         if (!caller_holds_lock)
14080                                 mutex_exit(&phyint->phyint_illv6->ill_lock);
14081                 }
14082                 phyint = phyint->phyint_ipsq_next;
14083         }
14084 }
14085
14086 static ipsq_t *
14087 ipsq_create(char *groupname, ip_stack_t *ipst)
14088 {
14089         ipsq_t  *ipsq;
14090
14091         ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
14092         ipsq = kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP);
14093         if (ipsq == NULL) {
14094                 return (NULL);
14095         }
14096
14097         if (groupname != NULL)
14098                 (void) strcpy(ipsq->ipsq_name, groupname);
14099         else
14100                 ipsq->ipsq_name[0] = '\0';
14101
14102         mutex_init(&ipsq->ipsq_lock, NULL, MUTEX_DEFAULT, NULL);
14103         ipsq->ipsq_flags |= IPSQ_GROUP;
14104         ipsq->ipsq_next = ipst->ips_ipsq_g_head;
14105         ipst->ips_ipsq_g_head = ipsq;
14106         ipsq->ipsq_ipst = ipst;         /* No netstack_hold */
14107         return (ipsq);
14108 }
14109
14110 /*
14111  * Return an ipsq correspoding to the groupname. If 'create' is true
14112  * allocate a new ipsq if one does not exist. Usually an ipsq is associated
14113  * uniquely with an IPMP group. However during IPMP groupname operations,
14114  * multiple IPMP groups may be associated with a single ipsq. But no
14115  * IPMP group can be associated with more than 1 ipsq at any time.
14116  * For example
14117  *      Interfaces              IPMP grpname    ipsq    ipsq_name      ipsq_refs
14118  *      hme1, hme2              mpk17-84        ipsq1   mpk17-84        2
14119  *      hme3, hme4              mpk17-85        ipsq2   mpk17-85        2
14120  *
14121  * Now the command ifconfig hme3 group mpk17-84 results in the temporary
14122  * status shown below during the execution of the above command.
14123  *      hme1, hme2, hme3, hme4  mpk17-84, mpk17-85      ipsq1   mpk17-84  4
14124  *
14125  * After the completion of the above groupname command we return to the stable
14126  * state shown below.
14127  *      hme1, hme2, hme3        mpk17-84        ipsq1   mpk17-84        3
14128  *      hme4                    mpk17-85        ipsq2   mpk17-85        1
14129  *
14130  * Because of the above, we don't search based on the ipsq_name since that
14131  * would miss the correct ipsq during certain windows as shown above.
14132  * The ipsq_name is only used during split of an ipsq to return the ipsq to its
14133  * natural state.
14134  */
14135 static ipsq_t *
14136 ip_ipsq_lookup(char *groupname, boolean_t create, ipsq_t *exclude_ipsq,
14137     ip_stack_t *ipst)
14138 {
14139         ipsq_t  *ipsq;
14140         int     group_len;
14141         phyint_t *phyint;
14142
14143         ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));
14144
14145         group_len = strlen(groupname);
14146         ASSERT(group_len != 0);
14147         group_len++;
14148
14149         for (ipsq = ipst->ips_ipsq_g_head;
14150             ipsq != NULL;
14151             ipsq = ipsq->ipsq_next) {
14152                 /*
14153                  * When an ipsq is being split, and ill_split_ipsq
14154                  * calls this function, we exclude it from being considered.
14155                  */
14156                 if (ipsq == exclude_ipsq)
14157                         continue;
14158
14159                 /*
14160                  * Compare against the ipsq_name. The groupname change happens
14161                  * in 2 phases. The 1st phase merges the from group into
14162                  * the to group's ipsq, by calling ill_merge_groups and restarts
14163                  * the ioctl. The 2nd phase then locates the ipsq again thru
14164                  * ipsq_name. At this point the phyint_groupname has not been
14165                  * updated.
14166                  */
14167                 if ((group_len == strlen(ipsq->ipsq_name) + 1) &&
14168                     (bcmp(ipsq->ipsq_name, groupname, group_len) == 0)) {
14169                         /*
14170                          * Verify that an ipmp groupname is exactly
14171                          * part of 1 ipsq and is not found in any other
14172                          * ipsq.
14173                          */
14174                         ASSERT(ip_ipsq_lookup(groupname, B_FALSE, ipsq, ipst) ==
14175                             NULL);
14176                         return (ipsq);
14177                 }
14178
14179                 /*
14180                  * Comparison against ipsq_name alone is not sufficient.
14181                  * In the case when groups are currently being
14182                  * merged, the ipsq could hold other IPMP groups temporarily.
14183                  * so we walk the phyint list and compare against the
14184                  * phyint_groupname as well.
14185                  */
14186                 phyint = ipsq->ipsq_phyint_list;
14187                 while (phyint != NULL) {
14188                         if ((group_len == phyint->phyint_groupname_len) &&
14189                             (bcmp(phyint->phyint_groupname, groupname,
14190                             group_len) == 0)) {
14191                                 /*
14192                                  * Verify that an ipmp groupname is exactly
14193                                  * part of 1 ipsq and is not found in any other
14194                                  * ipsq.
14195                                  */
14196                                 ASSERT(ip_ipsq_lookup(groupname, B_FALSE, ipsq,
14197                                     ipst) == NULL);
14198                                 return (ipsq);
14199                         }
14200                         phyint = phyint->phyint_ipsq_next;
14201                 }
14202         }
14203         if (create)
14204                 ipsq = ipsq_create(groupname, ipst);
14205         return (ipsq);
14206 }
14207
14208 static void
14209 ipsq_delete(ipsq_t *ipsq)
14210 {
14211         ipsq_t *nipsq;
14212         ipsq_t *pipsq = NULL;
14213         ip_stack_t *ipst = ipsq->ipsq_ipst;
14214
14215         /*
14216          * We don't hold the ipsq lock, but we are sure no new
14217          * messages can land up, since the ipsq_refs is zero.
14218          * i.e. this ipsq is unnamed and no phyint or phyint group
14219          * is associated with this ipsq. (Lookups are based on ill_name
14220          * or phyint_groupname)
14221          */
14222         ASSERT(ipsq->ipsq_refs == 0);
14223         ASSERT(ipsq->ipsq_xopq_mphead == NULL && ipsq->ipsq_mphead == NULL);
14224         ASSERT(ipsq->ipsq_pending_mp == NULL);
14225         if (!(ipsq->ipsq_flags & IPSQ_GROUP)) {
14226                 /*
14227                  * This is not the ipsq of an IPMP group.
14228                  */
14229                 ipsq->ipsq_ipst = NULL;
14230                 kmem_free(ipsq, sizeof (ipsq_t));
14231                 return;
14232         }
14233
14234         rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
14235
14236         /*
14237          * Locate the ipsq  before we can remove it from
14238          * the singly linked list of ipsq's.
14239          */
14240         for (nipsq = ipst->ips_ipsq_g_head; nipsq != NULL;
14241             nipsq = nipsq->ipsq_next) {
14242                 if (nipsq == ipsq) {
14243                         break;
14244                 }
14245                 pipsq = nipsq;
14246         }
14247
14248         ASSERT(nipsq == ipsq);
14249
14250         /* unlink ipsq from the list */
14251         if (pipsq != NULL)
14252                 pipsq->ipsq_next = ipsq->ipsq_next;
14253         else
14254                 ipst->ips_ipsq_g_head = ipsq->ipsq_next;
14255         ipsq->ipsq_ipst = NULL;
14256         kmem_free(ipsq, sizeof (ipsq_t));
14257         rw_exit(&ipst->ips_ill_g_lock);
14258 }
14259
14260 static void
14261 ill_move_to_new_ipsq(ipsq_t *old_ipsq, ipsq_t *new_ipsq, mblk_t *current_mp,
14262     queue_t *q)
14263 {
14264         ASSERT(MUTEX_HELD(&new_ipsq->ipsq_lock));
14265         ASSERT(old_ipsq->ipsq_mphead == NULL && old_ipsq->ipsq_mptail == NULL);
14266         ASSERT(old_ipsq->ipsq_pending_ipif == NULL);
14267         ASSERT(old_ipsq->ipsq_pending_mp == NULL);
14268         ASSERT(current_mp != NULL);
14269
14270         ipsq_enq(new_ipsq, q, current_mp, (ipsq_func_t)ip_process_ioctl,
14271             NEW_OP, NULL);
14272
14273         ASSERT(new_ipsq->ipsq_xopq_mptail != NULL &&
14274             new_ipsq->ipsq_xopq_mphead != NULL);
14275
14276         /*
14277          * move from old ipsq to the new ipsq.
14278          */
14279         new_ipsq->ipsq_xopq_mptail->b_next = old_ipsq->ipsq_xopq_mphead;
14280         if (old_ipsq->ipsq_xopq_mphead != NULL)
14281                 new_ipsq->ipsq_xopq_mptail = old_ipsq->ipsq_xopq_mptail;
14282
14283         old_ipsq->ipsq_xopq_mphead = old_ipsq->ipsq_xopq_mptail = NULL;
14284 }
14285
14286 void
14287 ill_group_cleanup(ill_t *ill)
14288 {
14289         ill_t *ill_v4;
14290         ill_t *ill_v6;
14291         ipif_t *ipif;
14292
14293         ill_v4 = ill->ill_phyint->phyint_illv4;
14294         ill_v6 = ill->ill_phyint->phyint_illv6;
14295
14296         if (ill_v4 != NULL) {
14297                 mutex_enter(&ill_v4->ill_lock);
14298                 for (ipif = ill_v4->ill_ipif; ipif != NULL;
14299                     ipif = ipif->ipif_next) {
14300                         IPIF_UNMARK_MOVING(ipif);
14301                 }
14302                 ill_v4->ill_up_ipifs = B_FALSE;
14303                 mutex_exit(&ill_v4->ill_lock);
14304         }
14305
14306         if (ill_v6 != NULL) {
14307                 mutex_enter(&ill_v6->ill_lock);
14308                 for (ipif = ill_v6->ill_ipif; ipif != NULL;
14309                     ipif = ipif->ipif_next) {
14310                         IPIF_UNMARK_MOVING(ipif);
14311                 }
14312                 ill_v6->ill_up_ipifs = B_FALSE;
14313                 mutex_exit(&ill_v6->ill_lock);
14314         }
14315 }
14316 /*
14317  * This function is called when an ill has had a change in its group status
14318  * to bring up all the ipifs that were up before the change.
14319  */
14320 int
14321 ill_up_ipifs(ill_t *ill, queue_t *q, mblk_t *mp)
14322 {
14323         ipif_t *ipif;
14324         ill_t *ill_v4;
14325         ill_t *ill_v6;
14326         ill_t *from_ill;
14327         int err = 0;
14328
14329
14330         ASSERT(IAM_WRITER_ILL(ill));
14331
14332         /*
14333          * Except for ipif_state_flags and ill_state_flags the other
14334          * fields of the ipif/ill that are modified below are protected
14335          * implicitly since we are a writer. We would have tried to down
14336          * even an ipif that was already down, in ill_down_ipifs. So we
14337          * just blindly clear the IPIF_CHANGING flag here on all ipifs.
14338          */
14339         ill_v4 = ill->ill_phyint->phyint_illv4;
14340         ill_v6 = ill->ill_phyint->phyint_illv6;
14341         if (ill_v4 != NULL) {
14342                 ill_v4->ill_up_ipifs = B_TRUE;
14343                 for (ipif = ill_v4->ill_ipif; ipif != NULL;
14344                     ipif = ipif->ipif_next) {
14345                         mutex_enter(&ill_v4->ill_lock);
14346                         ipif->ipif_state_flags &= ~IPIF_CHANGING;
14347                         IPIF_UNMARK_MOVING(ipif);
14348                         mutex_exit(&ill_v4->ill_lock);
14349                         if (ipif->ipif_was_up) {
14350                                 if (!(ipif->ipif_flags & IPIF_UP))
14351                                         err = ipif_up(ipif, q, mp);
14352                                 ipif->ipif_was_up = B_FALSE;
14353                                 if (err != 0) {
14354                                         /*
14355                                          * Can there be any other error ?
14356                                          */
14357                                         ASSERT(err == EINPROGRESS);
14358                                         return (err);
14359                                 }
14360                         }
14361                 }
14362                 mutex_enter(&ill_v4->ill_lock);
14363                 ill_v4->ill_state_flags &= ~ILL_CHANGING;
14364                 mutex_exit(&ill_v4->ill_lock);
14365                 ill_v4->ill_up_ipifs = B_FALSE;
14366                 if (ill_v4->ill_move_in_progress) {
14367                         ASSERT(ill_v4->ill_move_peer != NULL);
14368                         ill_v4->ill_move_in_progress = B_FALSE;
14369                         from_ill = ill_v4->ill_move_peer;
14370                         from_ill->ill_move_in_progress = B_FALSE;
14371                         from_ill->ill_move_peer = NULL;
14372                         mutex_enter(&from_ill->ill_lock);
14373                         from_ill->ill_state_flags &= ~ILL_CHANGING;
14374                         mutex_exit(&from_ill->ill_lock);
14375                         if (ill_v6 == NULL) {
14376                                 if (from_ill->ill_phyint->phyint_flags &
14377                                     PHYI_STANDBY) {
14378                                         phyint_inactive(from_ill->ill_phyint);
14379                                 }
14380                                 if (ill_v4->ill_phyint->phyint_flags &
14381                                     PHYI_STANDBY) {
14382                                         phyint_inactive(ill_v4->ill_phyint);
14383                                 }
14384                         }
14385                         ill_v4->ill_move_peer = NULL;
14386                 }
14387         }
14388
14389         if (ill_v6 != NULL) {
14390                 ill_v6->ill_up_ipifs = B_TRUE;
14391                 for (ipif = ill_v6->ill_ipif; ipif != NULL;
14392                     ipif = ipif->ipif_next) {
14393                         mutex_enter(&ill_v6->ill_lock);
14394                         ipif->ipif_state_flags &= ~IPIF_CHANGING;
14395                         IPIF_UNMARK_MOVING(ipif);
14396                         mutex_exit(&ill_v6->ill_lock);
14397                         if (ipif->ipif_was_up) {
14398                                 if (!(ipif->ipif_flags & IPIF_UP))
14399                                         err = ipif_up(ipif, q, mp);
14400                                 ipif->ipif_was_up = B_FALSE;
14401                                 if (err != 0) {
14402                                         /*
14403                                          * Can there be any other error ?
14404                                          */
14405                                         ASSERT(err == EINPROGRESS);
14406                                         return (err);
14407                                 }
14408                         }
14409                 }
14410                 mutex_enter(&ill_v6->ill_lock);
14411                 ill_v6->ill_state_flags &= ~ILL_CHANGING;
14412                 mutex_exit(&ill_v6->ill_lock);
14413                 ill_v6->ill_up_ipifs = B_FALSE;
14414                 if (ill_v6->ill_move_in_progress) {
14415                         ASSERT(ill_v6->ill_move_peer != NULL);
14416                         ill_v6->ill_move_in_progress = B_FALSE;
14417                         from_ill = ill_v6->ill_move_peer;
14418                         from_ill->ill_move_in_progress = B_FALSE;
14419                         from_ill->ill_move_peer = NULL;
14420                         mutex_enter(&from_ill->ill_lock);
14421                         from_ill->ill_state_flags &= ~ILL_CHANGING;
14422                         mutex_exit(&from_ill->ill_lock);
14423                         if (from_ill->ill_phyint->phyint_flags & PHYI_STANDBY) {
14424                                 phyint_inactive(from_ill->ill_phyint);
14425                         }
14426                         if (ill_v6->ill_phyint->phyint_flags & PHYI_STANDBY) {
14427                                 phyint_inactive(ill_v6->ill_phyint);
14428                         }
14429                         ill_v6->ill_move_peer = NULL;
14430                 }
14431         }
14432         return (0);
14433 }
14434
14435 /*
14436  * bring down all the approriate ipifs.
14437  */
14438 /* ARGSUSED */
14439 static void
14440 ill_down_ipifs(ill_t *ill, mblk_t *mp, int index, boolean_t chk_nofailover)
14441 {
14442         ipif_t *ipif;
14443
14444         ASSERT(IAM_WRITER_ILL(ill));
14445
14446         /*
14447          * Except for ipif_state_flags the other fields of the ipif/ill that
14448          * are modified below are protected implicitly since we are a writer
14449          */
14450         for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
14451                 if (chk_nofailover && (ipif->ipif_flags & IPIF_NOFAILOVER))
14452                         continue;
14453                 if (index == 0 || index == ipif->ipif_orig_ifindex) {
14454                         /*
14455                          * We go through the ipif_down logic even if the ipif
14456                          * is already down, since routes can be added based
14457                          * on down ipifs. Going through ipif_down once again
14458                          * will delete any IREs created based on these routes.
14459                          */
14460                         if (ipif->ipif_flags & IPIF_UP)
14461                                 ipif->ipif_was_up = B_TRUE;
14462                         /*
14463                          * If called with chk_nofailover true ipif is moving.
14464                          */
14465                         mutex_enter(&ill->ill_lock);
14466                         if (chk_nofailover) {
14467                                 ipif->ipif_state_flags |=
14468                                     IPIF_MOVING | IPIF_CHANGING;
14469                         } else {
14470                                 ipif->ipif_state_flags |= IPIF_CHANGING;
14471                         }
14472                         mutex_exit(&ill->ill_lock);
14473                         /*
14474                          * Need to re-create net/subnet bcast ires if
14475                          * they are dependent on ipif.
14476                          */
14477                         if (!ipif->ipif_isv6)
14478                                 ipif_check_bcast_ires(ipif);
14479                         (void) ipif_logical_down(ipif, NULL, NULL);
14480                         ipif_non_duplicate(ipif);
14481                         ipif_down_tail(ipif);
14482                 }
14483         }
14484 }
14485
/*
 * Reference counting on an ipsq.  Both macros assert that
 * ipst->ips_ill_g_lock is held as writer.
 *
 * IPSQ_INC_REF adds one reference.  IPSQ_DEC_REF drops one and, when the
 * count reaches zero, empties ipsq_name.
 *
 * Both arguments are parenthesized in the expansion so that expressions
 * such as '&stack' can be passed.  'ipsq' is evaluated more than once, so
 * it must not have side effects.
 */
#define	IPSQ_INC_REF(ipsq, ipst)	{			\
	ASSERT(RW_WRITE_HELD(&(ipst)->ips_ill_g_lock));		\
	(ipsq)->ipsq_refs++;					\
}

#define	IPSQ_DEC_REF(ipsq, ipst)	{			\
	ASSERT(RW_WRITE_HELD(&(ipst)->ips_ill_g_lock));		\
	(ipsq)->ipsq_refs--;					\
	if ((ipsq)->ipsq_refs == 0)				\
		(ipsq)->ipsq_name[0] = '\0';			\
}
14497
14498 /*
14499  * Change the ipsq of all the ill's whose current ipsq is 'cur_ipsq' to
14500  * new_ipsq.
14501  */
14502 static void
14503 ill_merge_ipsq(ipsq_t *cur_ipsq, ipsq_t *new_ipsq, ip_stack_t *ipst)
14504 {
14505         phyint_t *phyint;
14506         phyint_t *next_phyint;
14507
14508         /*
14509          * To change the ipsq of an ill, we need to hold the ill_g_lock as
14510          * writer and the ill_lock of the ill in question. Also the dest
14511          * ipsq can't vanish while we hold the ill_g_lock as writer.
14512          */
14513         ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
14514
14515         phyint = cur_ipsq->ipsq_phyint_list;
14516         cur_ipsq->ipsq_phyint_list = NULL;
14517         while (phyint != NULL) {
14518                 next_phyint = phyint->phyint_ipsq_next;
14519                 IPSQ_DEC_REF(cur_ipsq, ipst);
14520                 phyint->phyint_ipsq_next = new_ipsq->ipsq_phyint_list;
14521                 new_ipsq->ipsq_phyint_list = phyint;
14522                 IPSQ_INC_REF(new_ipsq, ipst);
14523                 phyint->phyint_ipsq = new_ipsq;
14524                 phyint = next_phyint;
14525         }
14526 }
14527
/*
 * Result codes of ill_split_to_grp_ipsq() and ill_split_to_own_ipsq(),
 * consumed by ill_split_ipsq().
 */
/* the phyint now belongs to a different ipsq */
#define SPLIT_SUCCESS           0
/* the phyint's group name already matches the current ipsq's name */
#define SPLIT_NOT_NEEDED        1
/* no ipsq could be obtained (memory shortage); phyint stays where it is */
#define SPLIT_FAILED            2
14531
14532 int
14533 ill_split_to_grp_ipsq(phyint_t *phyint, ipsq_t *cur_ipsq, boolean_t need_retry,
14534     ip_stack_t *ipst)
14535 {
14536         ipsq_t *newipsq = NULL;
14537
14538         /*
14539          * Assertions denote pre-requisites for changing the ipsq of
14540          * a phyint
14541          */
14542         ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
14543         /*
14544          * <ill-phyint> assocs can't change while ill_g_lock
14545          * is held as writer. See ill_phyint_reinit()
14546          */
14547         ASSERT(phyint->phyint_illv4 == NULL ||
14548             MUTEX_HELD(&phyint->phyint_illv4->ill_lock));
14549         ASSERT(phyint->phyint_illv6 == NULL ||
14550             MUTEX_HELD(&phyint->phyint_illv6->ill_lock));
14551
14552         if ((phyint->phyint_groupname_len !=
14553             (strlen(cur_ipsq->ipsq_name) + 1) ||
14554             bcmp(phyint->phyint_groupname, cur_ipsq->ipsq_name,
14555             phyint->phyint_groupname_len) != 0)) {
14556                 /*
14557                  * Once we fail in creating a new ipsq due to memory shortage,
14558                  * don't attempt to create new ipsq again, based on another
14559                  * phyint, since we want all phyints belonging to an IPMP group
14560                  * to be in the same ipsq even in the event of mem alloc fails.
14561                  */
14562                 newipsq = ip_ipsq_lookup(phyint->phyint_groupname, !need_retry,
14563                     cur_ipsq, ipst);
14564                 if (newipsq == NULL) {
14565                         /* Memory allocation failure */
14566                         return (SPLIT_FAILED);
14567                 } else {
14568                         /* ipsq_refs protected by ill_g_lock (writer) */
14569                         IPSQ_DEC_REF(cur_ipsq, ipst);
14570                         phyint->phyint_ipsq = newipsq;
14571                         phyint->phyint_ipsq_next = newipsq->ipsq_phyint_list;
14572                         newipsq->ipsq_phyint_list = phyint;
14573                         IPSQ_INC_REF(newipsq, ipst);
14574                         return (SPLIT_SUCCESS);
14575                 }
14576         }
14577         return (SPLIT_NOT_NEEDED);
14578 }
14579
14580 /*
14581  * The ill locks of the phyint and the ill_g_lock (writer) must be held
14582  * to do this split
14583  */
14584 static int
14585 ill_split_to_own_ipsq(phyint_t *phyint, ipsq_t *cur_ipsq, ip_stack_t *ipst)
14586 {
14587         ipsq_t *newipsq;
14588
14589         ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
14590         /*
14591          * <ill-phyint> assocs can't change while ill_g_lock
14592          * is held as writer. See ill_phyint_reinit()
14593          */
14594
14595         ASSERT(phyint->phyint_illv4 == NULL ||
14596             MUTEX_HELD(&phyint->phyint_illv4->ill_lock));
14597         ASSERT(phyint->phyint_illv6 == NULL ||
14598             MUTEX_HELD(&phyint->phyint_illv6->ill_lock));
14599
14600         if (!ipsq_init((phyint->phyint_illv4 != NULL) ?
14601             phyint->phyint_illv4: phyint->phyint_illv6)) {
14602                 /*
14603                  * ipsq_init failed due to no memory
14604                  * caller will use the same ipsq
14605                  */
14606                 return (SPLIT_FAILED);
14607         }
14608
14609         /* ipsq_ref is protected by ill_g_lock (writer) */
14610         IPSQ_DEC_REF(cur_ipsq, ipst);
14611
14612         /*
14613          * This is a new ipsq that is unknown to the world.
14614          * So we don't need to hold ipsq_lock,
14615          */
14616         newipsq = phyint->phyint_ipsq;
14617         newipsq->ipsq_writer = NULL;
14618         newipsq->ipsq_reentry_cnt--;
14619         ASSERT(newipsq->ipsq_reentry_cnt == 0);
14620 #ifdef DEBUG
14621         newipsq->ipsq_depth = 0;
14622 #endif
14623
14624         return (SPLIT_SUCCESS);
14625 }
14626
14627 /*
14628  * Change the ipsq of all the ill's whose current ipsq is 'cur_ipsq' to
14629  * ipsq's representing their individual groups or themselves. Return
14630  * whether split needs to be retried again later.
14631  */
14632 static boolean_t
14633 ill_split_ipsq(ipsq_t *cur_ipsq)
14634 {
14635         phyint_t *phyint;
14636         phyint_t *next_phyint;
14637         int     error;
14638         boolean_t need_retry = B_FALSE;
14639         ip_stack_t      *ipst = cur_ipsq->ipsq_ipst;
14640
14641         phyint = cur_ipsq->ipsq_phyint_list;
14642         cur_ipsq->ipsq_phyint_list = NULL;
14643         while (phyint != NULL) {
14644                 next_phyint = phyint->phyint_ipsq_next;
14645                 /*
14646                  * 'created' will tell us whether the callee actually
14647                  * created an ipsq. Lack of memory may force the callee
14648                  * to return without creating an ipsq.
14649                  */
14650                 if (phyint->phyint_groupname == NULL) {
14651                         error = ill_split_to_own_ipsq(phyint, cur_ipsq, ipst);
14652                 } else {
14653                         error = ill_split_to_grp_ipsq(phyint, cur_ipsq,
14654                             need_retry, ipst);
14655                 }
14656
14657                 switch (error) {
14658                 case SPLIT_FAILED:
14659                         need_retry = B_TRUE;
14660                         /* FALLTHRU */
14661                 case SPLIT_NOT_NEEDED:
14662                         /*
14663                          * Keep it on the list.
14664                          */
14665                         phyint->phyint_ipsq_next = cur_ipsq->ipsq_phyint_list;
14666                         cur_ipsq->ipsq_phyint_list = phyint;
14667                         break;
14668                 case SPLIT_SUCCESS:
14669                         break;
14670                 default:
14671                         ASSERT(0);
14672                 }
14673
14674                 phyint = next_phyint;
14675         }
14676         return (need_retry);
14677 }
14678
14679 /*
14680  * given an ipsq 'ipsq' lock all ills associated with this ipsq.
14681  * and return the ills in the list. This list will be
14682  * needed to unlock all the ills later on by the caller.
14683  * The <ill-ipsq> associations could change between the
14684  * lock and unlock. Hence the unlock can't traverse the
14685  * ipsq to get the list of ills.
14686  */
14687 static int
14688 ill_lock_ipsq_ills(ipsq_t *ipsq, ill_t **list, int list_max)
14689 {
14690         int     cnt = 0;
14691         phyint_t        *phyint;
14692         ip_stack_t      *ipst = ipsq->ipsq_ipst;
14693
14694         /*
14695          * The caller holds ill_g_lock to ensure that the ill memberships
14696          * of the ipsq don't change
14697          */
14698         ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));
14699
14700         phyint = ipsq->ipsq_phyint_list;
14701         while (phyint != NULL) {
14702                 if (phyint->phyint_illv4 != NULL) {
14703                         ASSERT(cnt < list_max);
14704                         list[cnt++] = phyint->phyint_illv4;
14705                 }
14706                 if (phyint->phyint_illv6 != NULL) {
14707                         ASSERT(cnt < list_max);
14708                         list[cnt++] = phyint->phyint_illv6;
14709                 }
14710                 phyint = phyint->phyint_ipsq_next;
14711         }
14712         ill_lock_ills(list, cnt);
14713         return (cnt);
14714 }
14715
14716 void
14717 ill_lock_ills(ill_t **list, int cnt)
14718 {
14719         int     i;
14720
14721         if (cnt > 1) {
14722                 boolean_t try_again;
14723                 do {
14724                         try_again = B_FALSE;
14725                         for (i = 0; i < cnt - 1; i++) {
14726                                 if (list[i] < list[i + 1]) {
14727                                         ill_t   *tmp;
14728
14729                                         /* swap the elements */
14730                                         tmp = list[i];
14731                                         list[i] = list[i + 1];
14732                                         list[i + 1] = tmp;
14733                                         try_again = B_TRUE;
14734                                 }
14735                         }
14736                 } while (try_again);
14737         }
14738
14739         for (i = 0; i < cnt; i++) {
14740                 if (i == 0) {
14741                         if (list[i] != NULL)
14742                                 mutex_enter(&list[i]->ill_lock);
14743                         else
14744                                 return;
14745                 } else if ((list[i-1] != list[i]) && (list[i] != NULL)) {
14746                         mutex_enter(&list[i]->ill_lock);
14747                 }
14748         }
14749 }
14750
14751 void
14752 ill_unlock_ills(ill_t **list, int cnt)
14753 {
14754         int     i;
14755
14756         for (i = 0; i < cnt; i++) {
14757                 if ((i == 0) && (list[i] != NULL)) {
14758                         mutex_exit(&list[i]->ill_lock);
14759                 } else if ((list[i-1] != list[i]) && (list[i] != NULL)) {
14760                         mutex_exit(&list[i]->ill_lock);
14761                 }
14762         }
14763 }
14764
14765 /*
14766  * Merge all the ills from 1 ipsq group into another ipsq group.
14767  * The source ipsq group is specified by the ipsq associated with
14768  * 'from_ill'. The destination ipsq group is specified by the ipsq
14769  * associated with 'to_ill' or 'groupname' respectively.
14770  * Note that ipsq itself does not have a reference count mechanism
14771  * and functions don't look up an ipsq and pass it around. Instead
14772  * functions pass around an ill or groupname, and the ipsq is looked
14773  * up from the ill or groupname and the required operation performed
14774  * atomically with the lookup on the ipsq.
14775  */
14776 static int
14777 ill_merge_groups(ill_t *from_ill, ill_t *to_ill, char *groupname, mblk_t *mp,
14778     queue_t *q)
14779 {
14780         ipsq_t *old_ipsq;
14781         ipsq_t *new_ipsq;
14782         ill_t   **ill_list;
14783         int     cnt;
14784         size_t  ill_list_size;
14785         boolean_t became_writer_on_new_sq = B_FALSE;
14786         ip_stack_t      *ipst = from_ill->ill_ipst;
14787
14788         ASSERT(to_ill == NULL || ipst == to_ill->ill_ipst);
14789         /* Exactly 1 of 'to_ill' and groupname can be specified. */
14790         ASSERT((to_ill != NULL) ^ (groupname != NULL));
14791
14792         /*
14793          * Need to hold ill_g_lock as writer and also the ill_lock to
14794          * change the <ill-ipsq> assoc of an ill. Need to hold the
14795          * ipsq_lock to prevent new messages from landing on an ipsq.
14796          */
14797         rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
14798
14799         old_ipsq = from_ill->ill_phyint->phyint_ipsq;
14800         if (groupname != NULL)
14801                 new_ipsq = ip_ipsq_lookup(groupname, B_TRUE, NULL, ipst);
14802         else {
14803                 new_ipsq = to_ill->ill_phyint->phyint_ipsq;
14804         }
14805
14806         ASSERT(old_ipsq != NULL && new_ipsq != NULL);
14807
14808         /*
14809          * both groups are on the same ipsq.
14810          */
14811         if (old_ipsq == new_ipsq) {
14812                 rw_exit(&ipst->ips_ill_g_lock);
14813                 return (0);
14814         }
14815
14816         cnt = old_ipsq->ipsq_refs << 1;
14817         ill_list_size = cnt * sizeof (ill_t *);
14818         ill_list = kmem_zalloc(ill_list_size, KM_NOSLEEP);
14819         if (ill_list == NULL) {
14820                 rw_exit(&ipst->ips_ill_g_lock);
14821                 return (ENOMEM);
14822         }
14823         cnt = ill_lock_ipsq_ills(old_ipsq, ill_list, cnt);
14824
14825         /* Need ipsq lock to enque messages on new ipsq or to become writer */
14826         mutex_enter(&new_ipsq->ipsq_lock);
14827         if ((new_ipsq->ipsq_writer == NULL &&
14828             new_ipsq->ipsq_current_ipif == NULL) ||
14829             (new_ipsq->ipsq_writer == curthread)) {
14830                 new_ipsq->ipsq_writer = curthread;
14831                 new_ipsq->ipsq_reentry_cnt++;
14832                 became_writer_on_new_sq = B_TRUE;
14833         }
14834
14835         /*
14836          * We are holding ill_g_lock as writer and all the ill locks of
14837          * the old ipsq. So the old_ipsq can't be looked up, and hence no new
14838          * message can land up on the old ipsq even though we don't hold the
14839          * ipsq_lock of the old_ipsq. Now move all messages to the newipsq.
14840          */
14841         ill_move_to_new_ipsq(old_ipsq, new_ipsq, mp, q);
14842
14843         /*
14844          * now change the ipsq of all ills in the 'old_ipsq' to 'new_ipsq'.
14845          * 'new_ipsq' has been looked up, and it can't change its <ill-ipsq>
14846          * assocs. till we release the ill_g_lock, and hence it can't vanish.
14847          */
14848         ill_merge_ipsq(old_ipsq, new_ipsq, ipst);
14849
14850         /*
14851          * Mark the new ipsq as needing a split since it is currently
14852          * being shared by more than 1 IPMP group. The split will
14853          * occur at the end of ipsq_exit
14854          */
14855         new_ipsq->ipsq_split = B_TRUE;
14856
14857         /* Now release all the locks */
14858         mutex_exit(&new_ipsq->ipsq_lock);
14859         ill_unlock_ills(ill_list, cnt);
14860         rw_exit(&ipst->ips_ill_g_lock);
14861
14862         kmem_free(ill_list, ill_list_size);
14863
14864         /*
14865          * If we succeeded in becoming writer on the new ipsq, then
14866          * drain the new ipsq and start processing  all enqueued messages
14867          * including the current ioctl we are processing which is either
14868          * a set groupname or failover/failback.
14869          */
14870         if (became_writer_on_new_sq)
14871                 ipsq_exit(new_ipsq, B_TRUE, B_TRUE);
14872
14873         /*
14874          * syncq has been changed and all the messages have been moved.
14875          */
14876         mutex_enter(&old_ipsq->ipsq_lock);
14877         old_ipsq->ipsq_current_ipif = NULL;
14878         old_ipsq->ipsq_current_ioctl = 0;
14879         mutex_exit(&old_ipsq->ipsq_lock);
14880         return (EINPROGRESS);
14881 }
14882
14883 /*
14884  * Delete and add the loopback copy and non-loopback copy of
14885  * the BROADCAST ire corresponding to ill and addr. Used to
14886  * group broadcast ires together when ill becomes part of
14887  * a group.
14888  *
14889  * This function is also called when ill is leaving the group
14890  * so that the ires belonging to the group gets re-grouped.
14891  */
14892 static void
14893 ill_bcast_delete_and_add(ill_t *ill, ipaddr_t addr)
14894 {
14895         ire_t *ire, *nire, *nire_next, *ire_head = NULL;
14896         ire_t **ire_ptpn = &ire_head;
14897         ip_stack_t      *ipst = ill->ill_ipst;
14898
14899         /*
14900          * The loopback and non-loopback IREs are inserted in the order in which
14901          * they're found, on the basis that they are correctly ordered (loopback
14902          * first).
14903          */
14904         for (;;) {
14905                 ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ill->ill_ipif,
14906                     ALL_ZONES, NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst);
14907                 if (ire == NULL)
14908                         break;
14909
14910                 /*
14911                  * we are passing in KM_SLEEP because it is not easy to
14912                  * go back to a sane state in case of memory failure.
14913                  */
14914                 nire = kmem_cache_alloc(ire_cache, KM_SLEEP);
14915                 ASSERT(nire != NULL);
14916                 bzero(nire, sizeof (ire_t));
14917                 /*
14918                  * Don't use ire_max_frag directly since we don't
14919                  * hold on to 'ire' until we add the new ire 'nire' and
14920                  * we don't want the new ire to have a dangling reference
14921                  * to 'ire'. The ire_max_frag of a broadcast ire must
14922                  * be in sync with the ipif_mtu of the associate ipif.
14923                  * For eg. this happens as a result of SIOCSLIFNAME,
14924                  * SIOCSLIFLNKINFO or a DL_NOTE_SDU_SIZE inititated by
14925                  * the driver. A change in ire_max_frag triggered as
14926                  * as a result of path mtu discovery, or due to an
14927                  * IP_IOC_IRE_ADVISE_NOREPLY from the transport or due a
14928                  * route change -mtu command does not apply to broadcast ires.
14929                  *
14930                  * XXX We need a recovery strategy here if ire_init fails
14931                  */
14932                 if (ire_init(nire,
14933                     (uchar_t *)&ire->ire_addr,
14934                     (uchar_t *)&ire->ire_mask,
14935                     (uchar_t *)&ire->ire_src_addr,
14936                     (uchar_t *)&ire->ire_gateway_addr,
14937                     ire->ire_stq == NULL ? &ip_loopback_mtu :
14938                     &ire->ire_ipif->ipif_mtu,
14939                     ire->ire_nce,
14940                     ire->ire_rfq,
14941                     ire->ire_stq,
14942                     ire->ire_type,
14943                     ire->ire_ipif,
14944                     ire->ire_cmask,
14945                     ire->ire_phandle,
14946                     ire->ire_ihandle,
14947                     ire->ire_flags,
14948                     &ire->ire_uinfo,
14949                     NULL,
14950                     NULL,
14951                     ipst) == NULL) {
14952                         cmn_err(CE_PANIC, "ire_init() failed");
14953                 }
14954                 ire_delete(ire);
14955                 ire_refrele(ire);
14956
14957                 /*
14958                  * The newly created IREs are inserted at the tail of the list
14959                  * starting with ire_head. As we've just allocated them no one
14960                  * knows about them so it's safe.
14961                  */
14962                 *ire_ptpn = nire;
14963                 ire_ptpn = &nire->ire_next;
14964         }
14965
14966         for (nire = ire_head; nire != NULL; nire = nire_next) {
14967                 int error;
14968                 ire_t *oire;
14969                 /* unlink the IRE from our list before calling ire_add() */
14970                 nire_next = nire->ire_next;
14971                 nire->ire_next = NULL;
14972
14973                 /* ire_add adds the ire at the right place in the list */
14974                 oire = nire;
14975                 error = ire_add(&nire, NULL, NULL, NULL, B_FALSE);
14976                 ASSERT(error == 0);
14977                 ASSERT(oire == nire);
14978                 ire_refrele(nire);      /* Held in ire_add */
14979         }
14980 }
14981
14982 /*
14983  * This function is usually called when an ill is inserted in
14984  * a group and all the ipifs are already UP. As all the ipifs
14985  * are already UP, the broadcast ires have already been created
14986  * and been inserted. But, ire_add_v4 would not have grouped properly.
14987  * We need to re-group for the benefit of ip_wput_ire which
14988  * expects BROADCAST ires to be grouped properly to avoid sending
14989  * more than one copy of the broadcast packet per group.
14990  *
14991  * NOTE : We don't check for ill_ipif_up_count to be non-zero here
14992  *        because when ipif_up_done ends up calling this, ires have
14993  *        already been added before illgrp_insert i.e before ill_group
14994  *        has been initialized.
14995  */
14996 static void
14997 ill_group_bcast_for_xmit(ill_t *ill)
14998 {
14999         ill_group_t *illgrp;
15000         ipif_t *ipif;
15001         ipaddr_t addr;
15002         ipaddr_t net_mask;
15003         ipaddr_t subnet_netmask;
15004
15005         illgrp = ill->ill_group;
15006
15007         /*
15008          * This function is called even when an ill is deleted from
15009          * the group. Hence, illgrp could be null.
15010          */
15011         if (illgrp != NULL && illgrp->illgrp_ill_count == 1)
15012                 return;
15013
15014         /*
15015          * Delete all the BROADCAST ires matching this ill and add
15016          * them back. This time, ire_add_v4 should take care of
15017          * grouping them with others because ill is part of the
15018          * group.
15019          */
15020         ill_bcast_delete_and_add(ill, 0);
15021         ill_bcast_delete_and_add(ill, INADDR_BROADCAST);
15022
15023         for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
15024
15025                 if ((ipif->ipif_lcl_addr != INADDR_ANY) &&
15026                     !(ipif->ipif_flags & IPIF_NOLOCAL)) {
15027                         net_mask = ip_net_mask(ipif->ipif_lcl_addr);
15028                 } else {
15029                         net_mask = htonl(IN_CLASSA_NET);
15030                 }
15031                 addr = net_mask & ipif->ipif_subnet;
15032                 ill_bcast_delete_and_add(ill, addr);
15033                 ill_bcast_delete_and_add(ill, ~net_mask | addr);
15034
15035                 subnet_netmask = ipif->ipif_net_mask;
15036                 addr = ipif->ipif_subnet;
15037                 ill_bcast_delete_and_add(ill, addr);
15038                 ill_bcast_delete_and_add(ill, ~subnet_netmask | addr);
15039         }
15040 }
15041
15042 /*
15043  * This function is called from illgrp_delete (by way of
15044  * ill_handoff_responsibility) when ill is being deleted from the group.
15045  *
15046  * As ill is not there in the group anymore, every IRE_BROADCAST ire for
15047  * addr that belongs to this ill should be cleared of IRE_MARK_NORECV.
       *
       * ill:  the ill that has left its group; ill_group must already be NULL.
       * addr: the broadcast address whose ires are to be unmarked.
       *
       * Nothing is done when no matching broadcast ire is found.
15048  */
15049 static void
15050 ill_clear_bcast_mark(ill_t *ill, ipaddr_t addr)
15051 {
15052         ire_t *ire;
15053         irb_t *irb;
15054         ip_stack_t      *ipst = ill->ill_ipst;
15055
15056         ASSERT(ill->ill_group == NULL);
15057
15058         ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ill->ill_ipif,
15059             ALL_ZONES, NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst);
15060
15061         if (ire != NULL) {
15062                 /*
15063                  * IPMP and plumbing operations are serialized on the ipsq, so
15064                  * no one will insert or delete a broadcast ire under our feet.
                       *
                       * The bucket lock is taken as reader before the reference
                       * from the lookup is dropped; the walk below then starts
                       * at the ire that was found and follows ire_next.
15065                  */
15066                 irb = ire->ire_bucket;
15067                 rw_enter(&irb->irb_lock, RW_READER);
15068                 ire_refrele(ire);
15069
15070                 for (; ire != NULL; ire = ire->ire_next) {
                              /* Stop at the first ire for a different address. */
15071                         if (ire->ire_addr != addr)
15072                                 break;
                              /* Leave ires that map to some other ill alone. */
15073                         if (ire_to_ill(ire) != ill)
15074                                 continue;
15075
15076                         ASSERT(!(ire->ire_marks & IRE_MARK_CONDEMNED));
15077                         ire->ire_marks &= ~IRE_MARK_NORECV;
15078                 }
15079                 rw_exit(&irb->irb_lock);
15080         }
15081 }
15082
15083 /*
15084  * This function must be called only after the broadcast ires
15085  * have been grouped together. For a given address addr, nominate
15086  * only one of the ires whose interface is not FAILED or OFFLINE.
15087  *
15088  * This is also called when an ipif goes down, so that we can nominate
15089  * a different ire with the same address for receiving.
       *
       * illgrp: the ill group whose broadcast ires for addr are (re)marked.
       * addr:   the IPv4 broadcast address being nominated for.
       * ipst:   the IP stack instance, handed to ire_ctable_lookup and ire_init.
       *
       * The nominated ire has IRE_MARK_NORECV cleared; every other ire for
       * addr in the group has it set. OFFLINE interfaces are never nominated.
       * FAILED and INACTIVE interfaces are skipped on the first pass; if that
       * pass nominates nobody but saw an interface that is INACTIVE and not
       * FAILED, a second pass is made that skips only FAILED (and OFFLINE).
       *
       * If the nominated ire is not already the first of the group's ires for
       * addr, it is marked condemned and a freshly initialized copy is linked
       * in at that first position (together with a copy of the following
       * ire when that one is the stq-bearing ire for the same address and ill).
       *
       * This is best-effort: the function returns without changing any marks
       * if no broadcast ire for addr is found or if either of the two spare
       * ires cannot be allocated.
15090  */
15091 static void
15092 ill_mark_bcast(ill_group_t *illgrp, ipaddr_t addr, ip_stack_t *ipst)
15093 {
15094         irb_t *irb;
15095         ire_t *ire;
15096         ire_t *ire1;
15097         ire_t *save_ire;
15098         ire_t **irep = NULL;
15099         boolean_t first = B_TRUE;
15100         ire_t *clear_ire = NULL;
15101         ire_t *start_ire = NULL;
15102         ire_t   *new_lb_ire;
15103         ire_t   *new_nlb_ire;
15104         boolean_t new_lb_ire_used = B_FALSE;
15105         boolean_t new_nlb_ire_used = B_FALSE;
15106         uint64_t match_flags;
15107         uint64_t phyi_flags;
15108         boolean_t fallback = B_FALSE;
15109         uint_t  max_frag;
15110
15111         ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, NULL, ALL_ZONES,
15112             NULL, MATCH_IRE_TYPE, ipst);
15113         /*
15114          * We may not be able to find some ires if a previous
15115          * ire_create failed. This happens when an ipif goes
15116          * down and we are unable to create BROADCAST ires due
15117          * to memory failure. Thus, we have to check for NULL
15118          * below. This should handle the case for LOOPBACK,
15119          * POINTOPOINT and interfaces with some POINTOPOINT
15120          * logicals for which there are no BROADCAST ires.
15121          */
15122         if (ire == NULL)
15123                 return;
15124         /*
15125          * Currently IRE_BROADCASTS are deleted when an ipif
15126          * goes down which runs exclusively. Thus, setting or clearing
15127          * IRE_MARK_NORECV should not race with ire_delete marking
15128          * IRE_MARK_CONDEMNED. We grab the lock below just to
15129          * be consistent with other parts of the code that walks
15130          * a given bucket.
               *
               * The two spare ires that may be needed for the re-insertion
               * further down are allocated here, with KM_NOSLEEP, before the
               * bucket lock is taken. If either allocation fails we drop the
               * lookup reference and return without touching any marks.
               * save_ire remembers the looked-up ire so that its reference
               * can be released once the walk is over.
15131          */
15132         save_ire = ire;
15133         irb = ire->ire_bucket;
15134         new_lb_ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP);
15135         if (new_lb_ire == NULL) {
15136                 ire_refrele(ire);
15137                 return;
15138         }
15139         new_nlb_ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP);
15140         if (new_nlb_ire == NULL) {
15141                 ire_refrele(ire);
15142                 kmem_cache_free(ire_cache, new_lb_ire);
15143                 return;
15144         }
15145         IRB_REFHOLD(irb);
15146         rw_enter(&irb->irb_lock, RW_WRITER);
15147         /*
15148          * Get to the first ire matching the address and the
15149          * group. If the address does not match we are done
15150          * as we could not find the IRE. If the address matches
15151          * we should get to the first one matching the group.
15152          */
15153         while (ire != NULL) {
15154                 if (ire->ire_addr != addr ||
15155                     ire->ire_ipif->ipif_ill->ill_group == illgrp) {
15156                         break;
15157                 }
15158                 ire = ire->ire_next;
15159         }
              /*
               * First pass: refuse both FAILED and INACTIVE interfaces.
               * start_ire is where a second pass, if needed, begins again.
               */
15160         match_flags = PHYI_FAILED | PHYI_INACTIVE;
15161         start_ire = ire;
15162 redo:
15163         while (ire != NULL && ire->ire_addr == addr &&
15164             ire->ire_ipif->ipif_ill->ill_group == illgrp) {
15165                 /*
15166                  * The first ire for any address within a group
15167                  * should always be the one with IRE_MARK_NORECV cleared
15168                  * so that ip_wput_ire can avoid searching for one.
15169                  * Note down the insertion point which will be used
15170                  * later.
15171                  */
15172                 if (first && (irep == NULL))
15173                         irep = ire->ire_ptpn;
15174                 /*
15175                  * PHYI_FAILED is set when the interface fails.
15176                  * This interface might have become good, but the
15177                  * daemon has not yet detected. We should still
15178                  * not receive on this. PHYI_OFFLINE should never
15179                  * be picked as this has been offlined and will soon
15180                  * be removed.
                       *
                       * While skipping, remember in `fallback' whether we
                       * saw an interface that is INACTIVE but not FAILED;
                       * only then is a second pass worth making.
15181                  */
15182                 phyi_flags = ire->ire_ipif->ipif_ill->ill_phyint->phyint_flags;
15183                 if (phyi_flags & PHYI_OFFLINE) {
15184                         ire->ire_marks |= IRE_MARK_NORECV;
15185                         ire = ire->ire_next;
15186                         continue;
15187                 }
15188                 if (phyi_flags & match_flags) {
15189                         ire->ire_marks |= IRE_MARK_NORECV;
15190                         ire = ire->ire_next;
15191                         if ((phyi_flags & (PHYI_FAILED | PHYI_INACTIVE)) ==
15192                             PHYI_INACTIVE) {
15193                                 fallback = B_TRUE;
15194                         }
15195                         continue;
15196                 }
15197                 if (first) {
15198                         /*
15199                          * We will move this to the front of the list later
15200                          * on.
15201                          */
15202                         clear_ire = ire;
15203                         ire->ire_marks &= ~IRE_MARK_NORECV;
15204                 } else {
15205                         ire->ire_marks |= IRE_MARK_NORECV;
15206                 }
15207                 first = B_FALSE;
15208                 ire = ire->ire_next;
15209         }
15210         /*
15211          * If we never nominated anybody, try nominating at least
15212          * an INACTIVE, if we found one. Do it only once though.
               * The second pass restarts at start_ire with match_flags
               * narrowed to PHYI_FAILED, and irep is reset so that the
               * insertion point is noted afresh. Because match_flags is
               * changed here, this test cannot succeed a second time.
15213          */
15214         if (first && (match_flags == (PHYI_FAILED | PHYI_INACTIVE)) &&
15215             fallback) {
15216                 match_flags = PHYI_FAILED;
15217                 ire = start_ire;
15218                 irep = NULL;
15219                 goto redo;
15220         }
15221         ire_refrele(save_ire);
15222
15223         /*
15224          * irep non-NULL indicates that we entered the while loop
15225          * above. If clear_ire is at the insertion point, we don't
15226          * have to do anything. clear_ire will be NULL if all the
15227          * interfaces are failed.
15228          *
15229          * We cannot unlink and reinsert the ire at the right place
15230          * in the list since there can be other walkers of this bucket.
15231          * Instead we delete and recreate the ire
               *
               * new_lb_ire receives the copy of clear_ire. new_nlb_ire is
               * used only when clear_ire has no ire_stq and the next ire in
               * the bucket is the one with an ire_stq for the same address
               * and ill (presumably "lb"/"nlb" stand for the loopback and
               * non-loopback broadcast ires).
15232          */
15233         if (clear_ire != NULL && irep != NULL && *irep != clear_ire) {
15234                 ire_t *clear_ire_stq = NULL;
15235
15236                 bzero(new_lb_ire, sizeof (ire_t));
15237                 /* XXX We need a recovery strategy here. */
15238                 if (ire_init(new_lb_ire,
15239                     (uchar_t *)&clear_ire->ire_addr,
15240                     (uchar_t *)&clear_ire->ire_mask,
15241                     (uchar_t *)&clear_ire->ire_src_addr,
15242                     (uchar_t *)&clear_ire->ire_gateway_addr,
15243                     &clear_ire->ire_max_frag,
15244                     NULL, /* let ire_nce_init derive the resolver info */
15245                     clear_ire->ire_rfq,
15246                     clear_ire->ire_stq,
15247                     clear_ire->ire_type,
15248                     clear_ire->ire_ipif,
15249                     clear_ire->ire_cmask,
15250                     clear_ire->ire_phandle,
15251                     clear_ire->ire_ihandle,
15252                     clear_ire->ire_flags,
15253                     &clear_ire->ire_uinfo,
15254                     NULL,
15255                     NULL,
15256                     ipst) == NULL)
15257                         cmn_err(CE_PANIC, "ire_init() failed");
                      /*
                       * If the nominated ire has no send queue, see whether the
                       * ire right after it is its counterpart with a send queue
                       * (same address, same ill). If so, prepare a copy of that
                       * one as well; it is linked in right behind new_lb_ire.
                       */
15258                 if (clear_ire->ire_stq == NULL) {
15259                         ire_t *ire_next = clear_ire->ire_next;
15260                         if (ire_next != NULL &&
15261                             ire_next->ire_stq != NULL &&
15262                             ire_next->ire_addr == clear_ire->ire_addr &&
15263                             ire_next->ire_ipif->ipif_ill ==
15264                             clear_ire->ire_ipif->ipif_ill) {
15265                                 clear_ire_stq = ire_next;
15266
15267                                 bzero(new_nlb_ire, sizeof (ire_t));
15268                                 /* XXX We need a recovery strategy here. */
15269                                 if (ire_init(new_nlb_ire,
15270                                     (uchar_t *)&clear_ire_stq->ire_addr,
15271                                     (uchar_t *)&clear_ire_stq->ire_mask,
15272                                     (uchar_t *)&clear_ire_stq->ire_src_addr,
15273                                     (uchar_t *)&clear_ire_stq->ire_gateway_addr,
15274                                     &clear_ire_stq->ire_max_frag,
15275                                     NULL,
15276                                     clear_ire_stq->ire_rfq,
15277                                     clear_ire_stq->ire_stq,
15278                                     clear_ire_stq->ire_type,
15279                                     clear_ire_stq->ire_ipif,
15280                                     clear_ire_stq->ire_cmask,
15281                                     clear_ire_stq->ire_phandle,
15282                                     clear_ire_stq->ire_ihandle,
15283                                     clear_ire_stq->ire_flags,
15284                                     &clear_ire_stq->ire_uinfo,
15285                                     NULL,
15286                                     NULL,
15287                                     ipst) == NULL)
15288                                         cmn_err(CE_PANIC, "ire_init() failed");
15289                         }
15290                 }
15291
15292                 /*
15293                  * Delete the ire. We can't call ire_delete() since
15294                  * we are holding the bucket lock. We can't release the
15295                  * bucket lock since we can't allow irep to change. So just
15296                  * mark it CONDEMNED. The IRB_REFRELE will delete the
15297                  * ire from the list and do the refrele.
15298                  */
15299                 clear_ire->ire_marks |= IRE_MARK_CONDEMNED;
15300                 irb->irb_marks |= IRB_MARK_CONDEMNED;
15301
                      /*
                       * NOTE(review): the old stq ire is condemned only when it
                       * has an ire_nce, whereas its copy is inserted below
                       * whenever clear_ire_stq is non-NULL - confirm that an
                       * stq ire here always has an nce.
                       */
15302                 if (clear_ire_stq != NULL && clear_ire_stq->ire_nce != NULL) {
15303                         nce_fastpath_list_delete(clear_ire_stq->ire_nce);
15304                         clear_ire_stq->ire_marks |= IRE_MARK_CONDEMNED;
15305                 }
15306
15307                 /*
15308                  * Also take care of other fields like ib/ob pkt count
15309                  * etc. Need to dup them. ditto in ill_bcast_delete_and_add
15310                  */
15311
15312                 /* Set the max_frag before adding the ire */
15313                 max_frag = *new_lb_ire->ire_max_fragp;
15314                 new_lb_ire->ire_max_fragp = NULL;
15315                 new_lb_ire->ire_max_frag = max_frag;
15316
15317                 /* Add the new ire's. Insert at *irep */
15318                 new_lb_ire->ire_bucket = clear_ire->ire_bucket;
15319                 ire1 = *irep;
15320                 if (ire1 != NULL)
15321                         ire1->ire_ptpn = &new_lb_ire->ire_next;
15322                 new_lb_ire->ire_next = ire1;
15323                 /* Link the new one in. */
15324                 new_lb_ire->ire_ptpn = irep;
                      /*
                       * The new ire's links are filled in above; the barrier is
                       * issued before it is published through *irep.
                       */
15325                 membar_producer();
15326                 *irep = new_lb_ire;
15327                 new_lb_ire_used = B_TRUE;
15328                 BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_inserted);
15329                 new_lb_ire->ire_bucket->irb_ire_cnt++;
15330                 new_lb_ire->ire_ipif->ipif_ire_cnt++;
15331
15332                 if (clear_ire_stq != NULL) {
15333                         /* Set the max_frag before adding the ire */
15334                         max_frag = *new_nlb_ire->ire_max_fragp;
15335                         new_nlb_ire->ire_max_fragp = NULL;
15336                         new_nlb_ire->ire_max_frag = max_frag;
15337
15338                         new_nlb_ire->ire_bucket = clear_ire->ire_bucket;
                              /* The second copy goes right after new_lb_ire. */
15339                         irep = &new_lb_ire->ire_next;
15340                         /* Add the new ire. Insert at *irep */
15341                         ire1 = *irep;
15342                         if (ire1 != NULL)
15343                                 ire1->ire_ptpn = &new_nlb_ire->ire_next;
15344                         new_nlb_ire->ire_next = ire1;
15345                         /* Link the new one in. */
15346                         new_nlb_ire->ire_ptpn = irep;
15347                         membar_producer();
15348                         *irep = new_nlb_ire;
15349                         new_nlb_ire_used = B_TRUE;
15350                         BUMP_IRE_STATS(ipst->ips_ire_stats_v4,
15351                             ire_stats_inserted);
15352                         new_nlb_ire->ire_bucket->irb_ire_cnt++;
15353                         new_nlb_ire->ire_ipif->ipif_ire_cnt++;
15354                         ((ill_t *)new_nlb_ire->ire_stq->q_ptr)->ill_ire_cnt++;
15355                 }
15356         }
15357         rw_exit(&irb->irb_lock);
              /* Give back whichever preallocated ires were not linked in. */
15358         if (!new_lb_ire_used)
15359                 kmem_cache_free(ire_cache, new_lb_ire);
15360         if (!new_nlb_ire_used)
15361                 kmem_cache_free(ire_cache, new_nlb_ire);
15362         IRB_REFRELE(irb);
15363 }
15364
15365 /*
15366  * Whenever an ipif goes down we have to renominate a different
15367  * broadcast ire to receive. Whenever an ipif comes up, we need
15368  * to make sure that we have only one nominated to receive.
       *
       * ipif: the ipif that went down or came up; its ill must still belong
       *       to a group.
       *
       * ill_mark_bcast is run on the group for 0.0.0.0, 255.255.255.255 and
       * the four addresses derived from this ipif: the network number and
       * network broadcast (using net_mask) and the subnet number and subnet
       * broadcast (using ipif_net_mask). Nothing is done if ipif_subnet is 0.
15369  */
15370 static void
15371 ipif_renominate_bcast(ipif_t *ipif)
15372 {
15373         ill_t *ill = ipif->ipif_ill;
15374         ipaddr_t subnet_addr;
15375         ipaddr_t net_addr;
15376         ipaddr_t net_mask = 0;
15377         ipaddr_t subnet_netmask;
15378         ipaddr_t addr;
15379         ill_group_t *illgrp;
15380         ip_stack_t      *ipst = ill->ill_ipst;
15381
15382         illgrp = ill->ill_group;
15383         /*
15384          * If this is the last ipif going down, it might take
15385          * the ill out of the group. In that case ipif_down ->
15386          * illgrp_delete takes care of doing the nomination.
15387          * ipif_down does not call for this case.
15388          */
15389         ASSERT(illgrp != NULL);
15390
15391         /* There could not have been any ires associated with this */
15392         if (ipif->ipif_subnet == 0)
15393                 return;
15394
15395         ill_mark_bcast(illgrp, 0, ipst);
15396         ill_mark_bcast(illgrp, INADDR_BROADCAST, ipst);
15397
              /*
               * Take the mask that ip_net_mask returns for the local address,
               * unless the ipif has no usable local address (INADDR_ANY, or
               * IPIF_NOLOCAL is set); then fall back to IN_CLASSA_NET.
               */
15398         if ((ipif->ipif_lcl_addr != INADDR_ANY) &&
15399             !(ipif->ipif_flags & IPIF_NOLOCAL)) {
15400                 net_mask = ip_net_mask(ipif->ipif_lcl_addr);
15401         } else {
15402                 net_mask = htonl(IN_CLASSA_NET);
15403         }
              /* Network number, then the matching all-ones host broadcast. */
15404         addr = net_mask & ipif->ipif_subnet;
15405         ill_mark_bcast(illgrp, addr, ipst);
15406
15407         net_addr = ~net_mask | addr;
15408         ill_mark_bcast(illgrp, net_addr, ipst);
15409
              /* Subnet number, then the subnet broadcast. */
15410         subnet_netmask = ipif->ipif_net_mask;
15411         addr = ipif->ipif_subnet;
15412         ill_mark_bcast(illgrp, addr, ipst);
15413
15414         subnet_addr = ~subnet_netmask | addr;
15415         ill_mark_bcast(illgrp, subnet_addr, ipst);
15416 }
15417
15418 /*
15419  * Whenever we form or delete ill groups, we need to nominate one set of
15420  * BROADCAST ires for receiving in the group.
15421  *
15422  * 1) When ipif_up_done -> illgrp_insert calls this function, BROADCAST ires
15423  *    have been added, but ill_ipif_up_count is 0. Thus, we don't assert
15424  *    for ill_ipif_up_count to be non-zero. This is the only case where
15425  *    ill_ipif_up_count is zero and we would still find the ires.
15426  *
15427  * 2) ip_sioctl_group_name/ifgrp_insert calls this function, at least one
15428  *    ipif is UP and we just have to do the nomination.
15429  *
15430  * 3) When ill_handoff_responsibility calls us, some ill has been removed
15431  *    from the group. So, we have to do the nomination.
15432  *
15433  * Because of (3), there could be just one ill in the group. But we have
15434  * to nominate still as IRE_MARK_NORECV may have been marked on this.
15435  * Thus, this function does not optimize when there is only one ill as
15436  * it is not correct for (3).
       *
       * illgrp: the group to nominate for. Its member ills are walked from
       *         illgrp_ill along ill_group_next. Nothing is done when
       *         illgrp_ill_count is 0.
15437  */
15438 static void
15439 ill_nominate_bcast_rcv(ill_group_t *illgrp)
15440 {
15441         ill_t *ill;
15442         ipif_t *ipif;
15443         ipaddr_t subnet_addr;
15444         ipaddr_t prev_subnet_addr = 0;
15445         ipaddr_t net_addr;
15446         ipaddr_t prev_net_addr = 0;
15447         ipaddr_t net_mask = 0;
15448         ipaddr_t subnet_netmask;
15449         ipaddr_t addr;
15450         ip_stack_t      *ipst;
15451
15452         /*
15453          * When the last member is leaving, there is nothing to
15454          * nominate.
15455          */
15456         if (illgrp->illgrp_ill_count == 0) {
15457                 ASSERT(illgrp->illgrp_ill == NULL);
15458                 return;
15459         }
15460
15461         ill = illgrp->illgrp_ill;
15462         ASSERT(!ill->ill_isv6);
15463         ipst = ill->ill_ipst;
15464         /*
15465          * We assume that ires with same address and belonging to the
15466          * same group, have been grouped together. Nominating a *single*
15467          * ill in the group for sending and receiving broadcast is done
15468          * by making sure that the first BROADCAST ire (which will be
15469          * the one returned by ire_ctable_lookup for ip_rput and the
15470          * one that will be used in ip_wput_ire) will be the one that
15471          * will not have IRE_MARK_NORECV set.
15472          *
15473          * 1) ip_rput checks and discards packets received on ires marked
15474          *    with IRE_MARK_NORECV. Thus, we don't send up duplicate
15475          *    broadcast packets. We need to clear IRE_MARK_NORECV on the
15476          *    first ire in the group for every broadcast address in the group.
15477          *    ip_rput will accept packets only on the first ire i.e only
15478          *    one copy of the ill.
15479          *
15480          * 2) ip_wput_ire needs to send out just one copy of the broadcast
15481          *    packet for the whole group. It needs to send out on the ill
15482          *    whose ire has not been marked with IRE_MARK_NORECV. If it sends
15483          *    on the one marked with IRE_MARK_NORECV, ip_rput will accept
15484          *    the copy echoed back on other port where the ire is not marked
15485          *    with IRE_MARK_NORECV.
15486          *
15487          * Note that we just need to have the first IRE either loopback or
15488          * non-loopback (either of them may not exist if ire_create failed
15489          * during ipif_down) with IRE_MARK_NORECV not set. ip_rput will
15490          * always hit the first one and hence will always accept one copy.
15491          *
15492          * We have a broadcast ire per ill for all the unique prefixes
15493          * hosted on that ill. As we don't have a way of knowing the
15494          * unique prefixes on a given ill and hence in the whole group,
15495          * we just call ill_mark_bcast on all the prefixes that exist
15496          * in the group. For the common case of one prefix, the code
15497          * below optimizes by remembering the last address used for
15498          * marking. In the case of multiple prefixes, this will still
15499          * optimize depending on the order of prefixes.
15500          *
15501          * The only unique address across the whole group is 0.0.0.0 and
15502          * 255.255.255.255 and thus we call only once. ill_mark_bcast enables
15503          * the first ire in the bucket for receiving and disables the
15504          * others.
15505          */
15506         ill_mark_bcast(illgrp, 0, ipst);
15507         ill_mark_bcast(illgrp, INADDR_BROADCAST, ipst);
15508         for (; ill != NULL; ill = ill->ill_group_next) {
15509
15510                 for (ipif = ill->ill_ipif; ipif != NULL;
15511                     ipif = ipif->ipif_next) {
15512
                              /* Only ipifs that are UP and have a subnet count. */
15513                         if (!(ipif->ipif_flags & IPIF_UP) ||
15514                             ipif->ipif_subnet == 0) {
15515                                 continue;
15516                         }
15517                         if ((ipif->ipif_lcl_addr != INADDR_ANY) &&
15518                             !(ipif->ipif_flags & IPIF_NOLOCAL)) {
15519                                 net_mask = ip_net_mask(ipif->ipif_lcl_addr);
15520                         } else {
15521                                 net_mask = htonl(IN_CLASSA_NET);
15522                         }
                              /*
                               * Mark the network number and network broadcast,
                               * unless this is the same network number that
                               * the previous ipif just had marked.
                               */
15523                         addr = net_mask & ipif->ipif_subnet;
15524                         if (prev_net_addr == 0 || prev_net_addr != addr) {
15525                                 ill_mark_bcast(illgrp, addr, ipst);
15526                                 net_addr = ~net_mask | addr;
15527                                 ill_mark_bcast(illgrp, net_addr, ipst);
15528                         }
15529                         prev_net_addr = addr;
15530
                              /* Likewise for the subnet number and broadcast. */
15531                         subnet_netmask = ipif->ipif_net_mask;
15532                         addr = ipif->ipif_subnet;
15533                         if (prev_subnet_addr == 0 ||
15534                             prev_subnet_addr != addr) {
15535                                 ill_mark_bcast(illgrp, addr, ipst);
15536                                 subnet_addr = ~subnet_netmask | addr;
15537                                 ill_mark_bcast(illgrp, subnet_addr, ipst);
15538                         }
15539                         prev_subnet_addr = addr;
15540                 }
15541         }
15542 }
15543
15544 /*
15545  * This function is called while forming ill groups.
15546  *
15547  * Currently, we handle only allmulti groups. We want to join
15548  * allmulti on only one of the ills in the groups. In future,
15549  * when we have link aggregation, we may have to join normal
15550  * multicast groups on multiple ills as switch does inbound load
15551  * balancing. Following are the functions that call this
15552  * function :
15553  *
15554  * 1) ill_recover_multicast : Interface is coming back UP.
15555  *    When the first ipif comes back UP, ipif_up_done/ipif_up_done_v6
15556  *    will call ill_recover_multicast to recover all the multicast
15557  *    groups. We need to make sure that only one member is joined
15558  *    in the ill group.
15559  *
15560  * 2) ip_addmulti/ip_addmulti_v6 : ill groups have already been formed.
15561  *    Somebody is joining allmulti. We need to make sure that only one
15562  *    member is joined in the group.
15563  *
15564  * 3) illgrp_insert : If allmulti has already joined, we need to make
15565  *    sure that only one member is joined in the group.
15566  *
15567  * 4) ip_delmulti/ip_delmulti_v6 : Somebody in the group is leaving
15568  *    allmulti who we have nominated. We need to pick some other ill.
15569  *
15570  * 5) illgrp_delete : The ill we nominated is leaving the group,
15571  *    we need to pick a new ill to join the group.
15572  *
15573  * For (1), (2), (5) - we just have to check whether there is
15574  * a good ill joined in the group. If we could not find any ills
15575  * joined the group, we should join.
15576  *
15577  * For (4), the one that was nominated to receive, left the group.
15578  * There could be nobody joined in the group when this function is
15579  * called.
15580  *
15581  * For (3) - we need to explicitly check whether there are multiple
15582  * ills joined in the group.
15583  *
15584  * For simplicity, we don't differentiate any of the above cases. We
15585  * just leave the group if it is joined on any of them and join on
15586  * the first good ill.
       *
       * illgrp: the group whose member ills (illgrp_ill, linked through
       *         ill_group_next) are considered.
       *
       * Returns 0 if a good ill ended up with ill_join_allmulti set, or if
       * no join was attempted at all. Otherwise returns the error from the
       * most recent ip_join_allmulti on a good ill or, when a fallback ill
       * was used, whatever ip_join_allmulti returned for it.
15587  */
15588 int
15589 ill_nominate_mcast_rcv(ill_group_t *illgrp)
15590 {
15591         ilm_t *ilm;
15592         ill_t *ill;
15593         ill_t *fallback_inactive_ill = NULL;
15594         ill_t *fallback_failed_ill = NULL;
15595         int ret = 0;
15596
15597         /*
15598          * Leave the allmulti on all the ills and start fresh.
15599          */
15600         for (ill = illgrp->illgrp_ill; ill != NULL;
15601             ill = ill->ill_group_next) {
15602                 if (ill->ill_join_allmulti)
15603                         (void) ip_leave_allmulti(ill->ill_ipif);
15604         }
15605
15606         /*
15607          * Choose a good ill. Fallback to inactive or failed if
15608          * none available. We need to fallback to FAILED in the
15609          * case where we have 2 interfaces in a group - where
15610          * one of them is failed and another is a good one and
15611          * the good one (not marked inactive) is leaving the group.
               *
               * An ill that is both FAILED and INACTIVE is recorded as the
               * failed fallback, since FAILED is tested first. Each fallback
               * variable ends up holding the last such ill in the list.
15612          */
15613         ret = 0;
15614         for (ill = illgrp->illgrp_ill; ill != NULL;
15615             ill = ill->ill_group_next) {
15616                 /* Never pick an offline interface */
15617                 if (ill->ill_phyint->phyint_flags & PHYI_OFFLINE)
15618                         continue;
15619
15620                 if (ill->ill_phyint->phyint_flags & PHYI_FAILED) {
15621                         fallback_failed_ill = ill;
15622                         continue;
15623                 }
15624                 if (ill->ill_phyint->phyint_flags & PHYI_INACTIVE) {
15625                         fallback_inactive_ill = ill;
15626                         continue;
15627                 }
                      /*
                       * A good ill: join allmulti on it if it has an ilm
                       * for the unspecified address.
                       */
15628                 for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
15629                         if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) {
15630                                 ret = ip_join_allmulti(ill->ill_ipif);
15631                                 /*
15632                                  * ip_join_allmulti can fail because of memory
15633                                  * failures. So, make sure we join at least
15634                                  * on one ill.
15635                                  */
15636                                 if (ill->ill_join_allmulti)
15637                                         return (0);
15638                         }
15639                 }
15640         }
15641         if (ret != 0) {
15642                 /*
15643                  * If we tried nominating above and failed to do so,
15644                  * return error. We might have tried multiple times.
15645                  * But, return the latest error.
15646                  */
15647                 return (ret);
15648         }
              /*
               * No good ill joined. Prefer the INACTIVE fallback over the
               * FAILED one; the FAILED one is considered only when no
               * INACTIVE ill was seen. The join is attempted once, for the
               * first ilm found with the unspecified address.
               */
15649         if ((ill = fallback_inactive_ill) != NULL) {
15650                 for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
15651                         if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) {
15652                                 ret = ip_join_allmulti(ill->ill_ipif);
15653                                 return (ret);
15654                         }
15655                 }
15656         } else if ((ill = fallback_failed_ill) != NULL) {
15657                 for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
15658                         if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) {
15659                                 ret = ip_join_allmulti(ill->ill_ipif);
15660                                 return (ret);
15661                         }
15662                 }
15663         }
15664         return (0);
15665 }
15666
15667 /*
15668  * This function is called from illgrp_delete after it is
15669  * deleted from the group to reschedule responsibilities
15670  * to a different ill.
15671  */
15672 static void
15673 ill_handoff_responsibility(ill_t *ill, ill_group_t *illgrp)
15674 {
15675         ilm_t   *ilm;
15676         ipif_t  *ipif;
15677         ipaddr_t subnet_addr;
15678         ipaddr_t net_addr;
15679         ipaddr_t net_mask = 0;
15680         ipaddr_t subnet_netmask;
15681         ipaddr_t addr;
15682         ip_stack_t *ipst = ill->ill_ipst;
15683
15684         ASSERT(ill->ill_group == NULL);
15685         /*
15686          * Broadcast Responsibility:
15687          *
15688          * 1. If this ill has been nominated for receiving broadcast
15689          * packets, we need to find a new one. Before we find a new
15690          * one, we need to re-group the ires that are part of this new
15691          * group (assumed by ill_nominate_bcast_rcv). We do this by
15692          * calling ill_group_bcast_for_xmit(ill) which will do the right
15693          * thing for us.
15694          *
15695          * 2. If this ill was not nominated for receiving broadcast
15696          * packets, we need to clear the IRE_MARK_NORECV flag
15697          * so that we continue to send up broadcast packets.
15698          */
15699         if (!ill->ill_isv6) {
15700                 /*
15701                  * Case 1 above : No optimization here. Just redo the
15702                  * nomination.
15703                  */
15704                 ill_group_bcast_for_xmit(ill);
15705                 ill_nominate_bcast_rcv(illgrp);
15706
15707                 /*
15708                  * Case 2 above : Lookup and clear IRE_MARK_NORECV.
15709                  */
15710                 ill_clear_bcast_mark(ill, 0);
15711                 ill_clear_bcast_mark(ill, INADDR_BROADCAST);
15712
15713                 for (ipif = ill->ill_ipif; ipif != NULL;
15714                     ipif = ipif->ipif_next) {
15715
15716                         if (!(ipif->ipif_flags & IPIF_UP) ||
15717                             ipif->ipif_subnet == 0) {
15718                                 continue;
15719                         }
15720                         if ((ipif->ipif_lcl_addr != INADDR_ANY) &&
15721                             !(ipif->ipif_flags & IPIF_NOLOCAL)) {
15722                                 net_mask = ip_net_mask(ipif->ipif_lcl_addr);
15723                         } else {
15724                                 net_mask = htonl(IN_CLASSA_NET);
15725                         }
15726                         addr = net_mask & ipif->ipif_subnet;
15727                         ill_clear_bcast_mark(ill, addr);
15728
15729                         net_addr = ~net_mask | addr;
15730                         ill_clear_bcast_mark(ill, net_addr);
15731
15732                         subnet_netmask = ipif->ipif_net_mask;
15733                         addr = ipif->ipif_subnet;
15734                         ill_clear_bcast_mark(ill, addr);
15735
15736                         subnet_addr = ~subnet_netmask | addr;
15737                         ill_clear_bcast_mark(ill, subnet_addr);
15738                 }
15739         }
15740
15741         /*
15742          * Multicast Responsibility.
15743          *
15744          * If we have joined allmulti on this one, find a new member
15745          * in the group to join allmulti. As this ill is already part
15746          * of allmulti, we don't have to join on this one.
15747          *
15748          * If we have not joined allmulti on this one, there is no
15749          * responsibility to handoff. But we need to take new
15750          * responsibility i.e, join allmulti on this one if we need
15751          * to.
15752          */
15753         if (ill->ill_join_allmulti) {
15754                 (void) ill_nominate_mcast_rcv(illgrp);
15755         } else {
15756                 for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
15757                         if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) {
15758                                 (void) ip_join_allmulti(ill->ill_ipif);
15759                                 break;
15760                         }
15761                 }
15762         }
15763
15764         /*
15765          * We intentionally do the flushing of IRE_CACHES only matching
15766          * on the ill and not on groups. Note that we are already deleted
15767          * from the group.
15768          *
15769          * This will make sure that all IRE_CACHES whose stq is pointing
15770          * at ill_wq or ire_ipif->ipif_ill pointing at this ill will get
15771          * deleted and IRE_CACHES that are not pointing at this ill will
15772          * be left alone.
15773          */
15774         if (ill->ill_isv6) {
15775                 ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE,
15776                     IRE_CACHE, illgrp_cache_delete, (char *)ill, ill);
15777         } else {
15778                 ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE,
15779                     IRE_CACHE, illgrp_cache_delete, (char *)ill, ill);
15780         }
15781
15782         /*
15783          * Some conn may have cached one of the IREs deleted above. By removing
15784          * the ire reference, we clean up the extra reference to the ill held in
15785          * ire->ire_stq.
15786          */
15787         ipcl_walk(conn_cleanup_stale_ire, NULL, ipst);
15788
15789         /*
15790          * Re-do source address selection for all the members in the
15791          * group, if they borrowed source address from one of the ipifs
15792          * in this ill.
15793          */
15794         for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
15795                 if (ill->ill_isv6) {
15796                         ipif_update_other_ipifs_v6(ipif, illgrp);
15797                 } else {
15798                         ipif_update_other_ipifs(ipif, illgrp);
15799                 }
15800         }
15801 }
15802
15803 /*
15804  * Delete the ill from the group. The caller makes sure that it is
15805  * in a group and it okay to delete from the group. So, we always
15806  * delete here.
15807  */
15808 static void
15809 illgrp_delete(ill_t *ill)
15810 {
15811         ill_group_t *illgrp;
15812         ill_group_t *tmpg;
15813         ill_t *tmp_ill;
15814         ip_stack_t      *ipst = ill->ill_ipst;
15815
15816         /*
15817          * Reset illgrp_ill_schednext if it was pointing at us.
15818          * We need to do this before we set ill_group to NULL.
15819          */
15820         rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
15821         mutex_enter(&ill->ill_lock);
15822
15823         illgrp_reset_schednext(ill);
15824
15825         illgrp = ill->ill_group;
15826
15827         /* Delete the ill from illgrp. */
15828         if (illgrp->illgrp_ill == ill) {
15829                 illgrp->illgrp_ill = ill->ill_group_next;
15830         } else {
15831                 tmp_ill = illgrp->illgrp_ill;
15832                 while (tmp_ill->ill_group_next != ill) {
15833                         tmp_ill = tmp_ill->ill_group_next;
15834                         ASSERT(tmp_ill != NULL);
15835                 }
15836                 tmp_ill->ill_group_next = ill->ill_group_next;
15837         }
15838         ill->ill_group = NULL;
15839         ill->ill_group_next = NULL;
15840
15841         illgrp->illgrp_ill_count--;
15842         mutex_exit(&ill->ill_lock);
15843         rw_exit(&ipst->ips_ill_g_lock);
15844
15845         /*
15846          * As this ill is leaving the group, we need to hand off
15847          * the responsibilities to the other ills in the group, if
15848          * this ill had some responsibilities.
15849          */
15850
15851         ill_handoff_responsibility(ill, illgrp);
15852
15853         rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
15854
15855         if (illgrp->illgrp_ill_count == 0) {
15856
15857                 ASSERT(illgrp->illgrp_ill == NULL);
15858                 if (ill->ill_isv6) {
15859                         if (illgrp == ipst->ips_illgrp_head_v6) {
15860                                 ipst->ips_illgrp_head_v6 = illgrp->illgrp_next;
15861                         } else {
15862                                 tmpg = ipst->ips_illgrp_head_v6;
15863                                 while (tmpg->illgrp_next != illgrp) {
15864                                         tmpg = tmpg->illgrp_next;
15865                                         ASSERT(tmpg != NULL);
15866                                 }
15867                                 tmpg->illgrp_next = illgrp->illgrp_next;
15868                         }
15869                 } else {
15870                         if (illgrp == ipst->ips_illgrp_head_v4) {
15871                                 ipst->ips_illgrp_head_v4 = illgrp->illgrp_next;
15872                         } else {
15873                                 tmpg = ipst->ips_illgrp_head_v4;
15874                                 while (tmpg->illgrp_next != illgrp) {
15875                                         tmpg = tmpg->illgrp_next;
15876                                         ASSERT(tmpg != NULL);
15877                                 }
15878                                 tmpg->illgrp_next = illgrp->illgrp_next;
15879                         }
15880                 }
15881                 mutex_destroy(&illgrp->illgrp_lock);
15882                 mi_free(illgrp);
15883         }
15884         rw_exit(&ipst->ips_ill_g_lock);
15885
15886         /*
15887          * Even though the ill is out of the group its not necessary
15888          * to set ipsq_split as TRUE as the ipifs could be down temporarily
15889          * We will split the ipsq when phyint_groupname is set to NULL.
15890          */
15891
15892         /*
15893          * Send a routing sockets message if we are deleting from
15894          * groups with names.
15895          */
15896         if (ill->ill_phyint->phyint_groupname_len != 0)
15897                 ip_rts_ifmsg(ill->ill_ipif);
15898 }
15899
15900 /*
15901  * Re-do source address selection. This is normally called when
15902  * an ill joins the group or when a non-NOLOCAL/DEPRECATED/ANYCAST
15903  * ipif comes up.
15904  */
15905 void
15906 ill_update_source_selection(ill_t *ill)
15907 {
15908         ipif_t *ipif;
15909
15910         ASSERT(IAM_WRITER_ILL(ill));
15911
15912         if (ill->ill_group != NULL)
15913                 ill = ill->ill_group->illgrp_ill;
15914
15915         for (; ill != NULL; ill = ill->ill_group_next) {
15916                 for (ipif = ill->ill_ipif; ipif != NULL;
15917                     ipif = ipif->ipif_next) {
15918                         if (ill->ill_isv6)
15919                                 ipif_recreate_interface_routes_v6(NULL, ipif);
15920                         else
15921                                 ipif_recreate_interface_routes(NULL, ipif);
15922                 }
15923         }
15924 }
15925
15926 /*
15927  * Insert ill in a group headed by illgrp_head. The caller can either
15928  * pass a groupname in which case we search for a group with the
15929  * same name to insert in or pass a group to insert in. This function
15930  * would only search groups with names.
15931  *
15932  * NOTE : The caller should make sure that there is at least one ipif
15933  *        UP on this ill so that illgrp_scheduler can pick this ill
15934  *        for outbound packets. If ill_ipif_up_count is zero, we have
15935  *        already sent a DL_UNBIND to the driver and we don't want to
15936  *        send anymore packets. We don't assert for ipif_up_count
15937  *        to be greater than zero, because ipif_up_done wants to call
15938  *        this function before bumping up the ipif_up_count. See
15939  *        ipif_up_done() for details.
15940  */
15941 int
15942 illgrp_insert(ill_group_t **illgrp_head, ill_t *ill, char *groupname,
15943     ill_group_t *grp_to_insert, boolean_t ipif_is_coming_up)
15944 {
15945         ill_group_t *illgrp;
15946         ill_t *prev_ill;
15947         phyint_t *phyi;
15948         ip_stack_t      *ipst = ill->ill_ipst;
15949
15950         ASSERT(ill->ill_group == NULL);
15951
15952         rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
15953         mutex_enter(&ill->ill_lock);
15954
15955         if (groupname != NULL) {
15956                 /*
15957                  * Look for a group with a matching groupname to insert.
15958                  */
15959                 for (illgrp = *illgrp_head; illgrp != NULL;
15960                     illgrp = illgrp->illgrp_next) {
15961
15962                         ill_t *tmp_ill;
15963
15964                         /*
15965                          * If we have an ill_group_t in the list which has
15966                          * no ill_t assigned then we must be in the process of
15967                          * removing this group. We skip this as illgrp_delete()
15968                          * will remove it from the list.
15969                          */
15970                         if ((tmp_ill = illgrp->illgrp_ill) == NULL) {
15971                                 ASSERT(illgrp->illgrp_ill_count == 0);
15972                                 continue;
15973                         }
15974
15975                         ASSERT(tmp_ill->ill_phyint != NULL);
15976                         phyi = tmp_ill->ill_phyint;
15977                         /*
15978                          * Look at groups which has names only.
15979                          */
15980                         if (phyi->phyint_groupname_len == 0)
15981                                 continue;
15982                         /*
15983                          * Names are stored in the phyint common to both
15984                          * IPv4 and IPv6.
15985                          */
15986                         if (mi_strcmp(phyi->phyint_groupname,
15987                             groupname) == 0) {
15988                                 break;
15989                         }
15990                 }
15991         } else {
15992                 /*
15993                  * If the caller passes in a NULL "grp_to_insert", we
15994                  * allocate one below and insert this singleton.
15995                  */
15996                 illgrp = grp_to_insert;
15997         }
15998
15999         ill->ill_group_next = NULL;
16000
16001         if (illgrp == NULL) {
16002                 illgrp = (ill_group_t *)mi_zalloc(sizeof (ill_group_t));
16003                 if (illgrp == NULL) {
16004                         return (ENOMEM);
16005                 }
16006                 illgrp->illgrp_next = *illgrp_head;
16007                 *illgrp_head = illgrp;
16008                 illgrp->illgrp_ill = ill;
16009                 illgrp->illgrp_ill_count = 1;
16010                 ill->ill_group = illgrp;
16011                 /*
16012                  * Used in illgrp_scheduler to protect multiple threads
16013                  * from traversing the list.
16014                  */
16015                 mutex_init(&illgrp->illgrp_lock, NULL, MUTEX_DEFAULT, 0);
16016         } else {
16017                 ASSERT(ill->ill_net_type ==
16018                     illgrp->illgrp_ill->ill_net_type);
16019                 ASSERT(ill->ill_type == illgrp->illgrp_ill->ill_type);
16020
16021                 /* Insert ill at tail of this group */
16022                 prev_ill = illgrp->illgrp_ill;
16023                 while (prev_ill->ill_group_next != NULL)
16024                         prev_ill = prev_ill->ill_group_next;
16025                 prev_ill->ill_group_next = ill;
16026                 ill->ill_group = illgrp;
16027                 illgrp->illgrp_ill_count++;
16028                 /*
16029                  * Inherit group properties. Currently only forwarding
16030                  * is the property we try to keep the same with all the
16031                  * ills. When there are more, we will abstract this into
16032                  * a function.
16033                  */
16034                 ill->ill_flags &= ~ILLF_ROUTER;
16035                 ill->ill_flags |= (illgrp->illgrp_ill->ill_flags & ILLF_ROUTER);
16036         }
16037         mutex_exit(&ill->ill_lock);
16038         rw_exit(&ipst->ips_ill_g_lock);
16039
16040         /*
16041          * 1) When ipif_up_done() calls this function, ipif_up_count
16042          *    may be zero as it has not yet been bumped. But the ires
16043          *    have already been added. So, we do the nomination here
16044          *    itself. But, when ip_sioctl_groupname calls this, it checks
16045          *    for ill_ipif_up_count != 0. Thus we don't check for
16046          *    ill_ipif_up_count here while nominating broadcast ires for
16047          *    receive.
16048          *
16049          * 2) Similarly, we need to call ill_group_bcast_for_xmit here
16050          *    to group them properly as ire_add() has already happened
16051          *    in the ipif_up_done() case. For ip_sioctl_groupname/ifgrp_insert
16052          *    case, we need to do it here anyway.
16053          */
16054         if (!ill->ill_isv6) {
16055                 ill_group_bcast_for_xmit(ill);
16056                 ill_nominate_bcast_rcv(illgrp);
16057         }
16058
16059         if (!ipif_is_coming_up) {
16060                 /*
16061                  * When ipif_up_done() calls this function, the multicast
16062                  * groups have not been joined yet. So, there is no point in
16063                  * nomination. ip_join_allmulti will handle groups when
16064                  * ill_recover_multicast is called from ipif_up_done() later.
16065                  */
16066                 (void) ill_nominate_mcast_rcv(illgrp);
16067                 /*
16068                  * ipif_up_done calls ill_update_source_selection
16069                  * anyway. Moreover, we don't want to re-create
16070                  * interface routes while ipif_up_done() still has reference
16071                  * to them. Refer to ipif_up_done() for more details.
16072                  */
16073                 ill_update_source_selection(ill);
16074         }
16075
16076         /*
16077          * Send a routing sockets message if we are inserting into
16078          * groups with names.
16079          */
16080         if (groupname != NULL)
16081                 ip_rts_ifmsg(ill->ill_ipif);
16082         return (0);
16083 }
16084
16085 /*
16086  * Return the first phyint matching the groupname. There could
16087  * be more than one when there are ill groups.
16088  *
16089  * If 'usable' is set, then we exclude ones that are marked with any of
16090  * (PHYI_FAILED|PHYI_OFFLINE|PHYI_INACTIVE).
16091  * Needs work: called only from ip_sioctl_groupname and from the ipmp/netinfo
16092  * emulation of ipmp.
16093  */
16094 phyint_t *
16095 phyint_lookup_group(char *groupname, boolean_t usable, ip_stack_t *ipst)
16096 {
16097         phyint_t *phyi;
16098
16099         ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));
16100         /*
16101          * Group names are stored in the phyint - a common structure
16102          * to both IPv4 and IPv6.
16103          */
16104         phyi = avl_first(&ipst->ips_phyint_g_list->phyint_list_avl_by_index);
16105         for (; phyi != NULL;
16106             phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
16107             phyi, AVL_AFTER)) {
16108                 if (phyi->phyint_groupname_len == 0)
16109                         continue;
16110                 /*
16111                  * Skip the ones that should not be used since the callers
16112                  * sometime use this for sending packets.
16113                  */
16114                 if (usable && (phyi->phyint_flags &
16115                     (PHYI_FAILED|PHYI_OFFLINE|PHYI_INACTIVE)))
16116                         continue;
16117
16118                 ASSERT(phyi->phyint_groupname != NULL);
16119                 if (mi_strcmp(groupname, phyi->phyint_groupname) == 0)
16120                         return (phyi);
16121         }
16122         return (NULL);
16123 }
16124
16125
16126 /*
16127  * Return the first usable phyint matching the group index. By 'usable'
16128  * we exclude ones that are marked ununsable with any of
16129  * (PHYI_FAILED|PHYI_OFFLINE|PHYI_INACTIVE).
16130  *
16131  * Used only for the ipmp/netinfo emulation of ipmp.
16132  */
16133 phyint_t *
16134 phyint_lookup_group_ifindex(uint_t group_ifindex, ip_stack_t *ipst)
16135 {
16136         phyint_t *phyi;
16137
16138         ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));
16139
16140         if (!ipst->ips_ipmp_hook_emulation)
16141                 return (NULL);
16142
16143         /*
16144          * Group indicies are stored in the phyint - a common structure
16145          * to both IPv4 and IPv6.
16146          */
16147         phyi = avl_first(&ipst->ips_phyint_g_list->phyint_list_avl_by_index);
16148         for (; phyi != NULL;
16149             phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
16150             phyi, AVL_AFTER)) {
16151                 /* Ignore the ones that do not have a group */
16152                 if (phyi->phyint_groupname_len == 0)
16153                         continue;
16154
16155                 ASSERT(phyi->phyint_group_ifindex != 0);
16156                 /*
16157                  * Skip the ones that should not be used since the callers
16158                  * sometime use this for sending packets.
16159                  */
16160                 if (phyi->phyint_flags &
16161                     (PHYI_FAILED|PHYI_OFFLINE|PHYI_INACTIVE))
16162                         continue;
16163                 if (phyi->phyint_group_ifindex == group_ifindex)
16164                         return (phyi);
16165         }
16166         return (NULL);
16167 }
16168
16169
16170 /*
16171  * MT notes on creation and deletion of IPMP groups
16172  *
16173  * Creation and deletion of IPMP groups introduce the need to merge or
16174  * split the associated serialization objects i.e the ipsq's. Normally all
16175  * the ills in an IPMP group would map to a single ipsq. If IPMP is not enabled
16176  * an ill-pair(v4, v6) i.e. phyint would map to a single ipsq. However during
16177  * the execution of the SIOCSLIFGROUPNAME command the picture changes. There
16178  * is a need to change the <ill-ipsq> association and we have to operate on both
16179  * the source and destination IPMP groups. For eg. attempting to set the
16180  * groupname of hme0 to mpk17-85 when it already belongs to mpk17-84 has to
16181  * handle 2 IPMP groups and 2 ipsqs. All the ills belonging to either of the
16182  * source or destination IPMP group are mapped to a single ipsq for executing
16183  * the SIOCSLIFGROUPNAME command. This is termed as a merge of the ipsq's.
16184  * The <ill-ipsq> mapping is restored back to normal at a later point. This is
16185  * termed as a split of the ipsq. The converse of the merge i.e. a split of the
16186  * ipsq happens while unwinding from ipsq_exit. If at least 1 set groupname
16187  * occurred on the ipsq, then the ipsq_split flag is set. This indicates the
16188  * ipsq has to be examined for redoing the <ill-ipsq> associations.
16189  *
16190  * In the above example the ioctl handling code locates the current ipsq of hme0
16191  * which is ipsq(mpk17-84). It then enters the above ipsq immediately or
16192  * eventually (after queueing the ioctl in ipsq(mpk17-84)). Then it locates
16193  * the destination ipsq which is ipsq(mpk17-85) and merges the source ipsq into
16194  * the destination ipsq. If the destination ipsq is not busy, it also enters
16195  * the destination ipsq exclusively. Now the actual groupname setting operation
16196  * can proceed. If the destination ipsq is busy, the operation is enqueued
16197  * on the destination (merged) ipsq and will be handled in the unwind from
16198  * ipsq_exit.
16199  *
16200  * To prevent other threads accessing the ill while the group name change is
16201  * in progress, we bring down the ipifs which also removes the ill from the
16202  * group. The group is changed in phyint and when the first ipif on the ill
16203  * is brought up, the ill is inserted into the right IPMP group by
16204  * illgrp_insert.
16205  */
16206 /* ARGSUSED */
16207 int
16208 ip_sioctl_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
16209     ip_ioctl_cmd_t *ipip, void *ifreq)
16210 {
16211         int i;
16212         char *tmp;
16213         int namelen;
16214         ill_t *ill = ipif->ipif_ill;
16215         ill_t *ill_v4, *ill_v6;
16216         int err = 0;
16217         phyint_t *phyi;
16218         phyint_t *phyi_tmp;
16219         struct lifreq *lifr;
16220         mblk_t  *mp1;
16221         char *groupname;
16222         ipsq_t *ipsq;
16223         ip_stack_t      *ipst = ill->ill_ipst;
16224
16225         ASSERT(IAM_WRITER_IPIF(ipif));
16226
16227         /* Existance verified in ip_wput_nondata */
16228         mp1 = mp->b_cont->b_cont;
16229         lifr = (struct lifreq *)mp1->b_rptr;
16230         groupname = lifr->lifr_groupname;
16231
16232         if (ipif->ipif_id != 0)
16233                 return (EINVAL);
16234
16235         phyi = ill->ill_phyint;
16236         ASSERT(phyi != NULL);
16237
16238         if (phyi->phyint_flags & PHYI_VIRTUAL)
16239                 return (EINVAL);
16240
16241         tmp = groupname;
16242         for (i = 0; i < LIFNAMSIZ && *tmp != '\0'; tmp++, i++)
16243                 ;
16244
16245         if (i == LIFNAMSIZ) {
16246                 /* no null termination */
16247                 return (EINVAL);
16248         }
16249
16250         /*
16251          * Calculate the namelen exclusive of the null
16252          * termination character.
16253          */
16254         namelen = tmp - groupname;
16255
16256         ill_v4 = phyi->phyint_illv4;
16257         ill_v6 = phyi->phyint_illv6;
16258
16259         /*
16260          * ILL cannot be part of a usesrc group and and IPMP group at the
16261          * same time. No need to grab the ill_g_usesrc_lock here, see
16262          * synchronization notes in ip.c
16263          */
16264         if (ipif->ipif_ill->ill_usesrc_grp_next != NULL) {
16265                 return (EINVAL);
16266         }
16267
16268         /*
16269          * mark the ill as changing.
16270          * this should queue all new requests on the syncq.
16271          */
16272         GRAB_ILL_LOCKS(ill_v4, ill_v6);
16273
16274         if (ill_v4 != NULL)
16275                 ill_v4->ill_state_flags |= ILL_CHANGING;
16276         if (ill_v6 != NULL)
16277                 ill_v6->ill_state_flags |= ILL_CHANGING;
16278         RELEASE_ILL_LOCKS(ill_v4, ill_v6);
16279
16280         if (namelen == 0) {
16281                 /*
16282                  * Null string means remove this interface from the
16283                  * existing group.
16284                  */
16285                 if (phyi->phyint_groupname_len == 0) {
16286                         /*
16287                          * Never was in a group.
16288                          */
16289                         err = 0;
16290                         goto done;
16291                 }
16292
16293                 /*
16294                  * IPv4 or IPv6 may be temporarily out of the group when all
16295                  * the ipifs are down. Thus, we need to check for ill_group to
16296                  * be non-NULL.
16297                  */
16298                 if (ill_v4 != NULL && ill_v4->ill_group != NULL) {
16299                         ill_down_ipifs(ill_v4, mp, 0, B_FALSE);
16300                         mutex_enter(&ill_v4->ill_lock);
16301                         if (!ill_is_quiescent(ill_v4)) {
16302                                 /*
16303                                  * ipsq_pending_mp_add will not fail since
16304                                  * connp is NULL
16305                                  */
16306                                 (void) ipsq_pending_mp_add(NULL,
16307                                     ill_v4->ill_ipif, q, mp, ILL_DOWN);
16308                                 mutex_exit(&ill_v4->ill_lock);
16309                                 err = EINPROGRESS;
16310                                 goto done;
16311                         }
16312                         mutex_exit(&ill_v4->ill_lock);
16313                 }
16314
16315                 if (ill_v6 != NULL && ill_v6->ill_group != NULL) {
16316                         ill_down_ipifs(ill_v6, mp, 0, B_FALSE);
16317                         mutex_enter(&ill_v6->ill_lock);
16318                         if (!ill_is_quiescent(ill_v6)) {
16319                                 (void) ipsq_pending_mp_add(NULL,
16320                                     ill_v6->ill_ipif, q, mp, ILL_DOWN);
16321                                 mutex_exit(&ill_v6->ill_lock);
16322                                 err = EINPROGRESS;
16323                                 goto done;
16324                         }
16325                         mutex_exit(&ill_v6->ill_lock);
16326                 }
16327
16328                 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
16329                 GRAB_ILL_LOCKS(ill_v4, ill_v6);
16330                 mutex_enter(&phyi->phyint_lock);
16331                 ASSERT(phyi->phyint_groupname != NULL);
16332                 mi_free(phyi->phyint_groupname);
16333                 phyi->phyint_groupname = NULL;
16334                 phyi->phyint_groupname_len = 0;
16335
16336                 /* Restore the ifindex used to be the per interface one */
16337                 phyi->phyint_group_ifindex = 0;
16338                 phyi->phyint_hook_ifindex = phyi->phyint_ifindex;
16339                 mutex_exit(&phyi->phyint_lock);
16340                 RELEASE_ILL_LOCKS(ill_v4, ill_v6);
16341                 rw_exit(&ipst->ips_ill_g_lock);
16342                 err = ill_up_ipifs(ill, q, mp);
16343
16344                 /*
16345                  * set the split flag so that the ipsq can be split
16346                  */
16347                 mutex_enter(&phyi->phyint_ipsq->ipsq_lock);
16348                 phyi->phyint_ipsq->ipsq_split = B_TRUE;
16349                 mutex_exit(&phyi->phyint_ipsq->ipsq_lock);
16350
16351         } else {
16352                 if (phyi->phyint_groupname_len != 0) {
16353                         ASSERT(phyi->phyint_groupname != NULL);
16354                         /* Are we inserting in the same group ? */
16355                         if (mi_strcmp(groupname,
16356                             phyi->phyint_groupname) == 0) {
16357                                 err = 0;
16358                                 goto done;
16359                         }
16360                 }
16361
16362                 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
16363                 /*
16364                  * Merge ipsq for the group's.
16365                  * This check is here as multiple groups/ills might be
16366                  * sharing the same ipsq.
16367                  * If we have to merege than the operation is restarted
16368                  * on the new ipsq.
16369                  */
16370                 ipsq = ip_ipsq_lookup(groupname, B_FALSE, NULL, ipst);
16371                 if (phyi->phyint_ipsq != ipsq) {
16372                         rw_exit(&ipst->ips_ill_g_lock);
16373                         err = ill_merge_groups(ill, NULL, groupname, mp, q);
16374                         goto done;
16375                 }
16376                 /*
16377                  * Running exclusive on new ipsq.
16378                  */
16379
16380                 ASSERT(ipsq != NULL);
16381                 ASSERT(ipsq->ipsq_writer == curthread);
16382
16383                 /*
16384                  * Check whether the ill_type and ill_net_type matches before
16385                  * we allocate any memory so that the cleanup is easier.
16386                  *
16387                  * We can't group dissimilar ones as we can't load spread
16388                  * packets across the group because of potential link-level
16389                  * header differences.
16390                  */
16391                 phyi_tmp = phyint_lookup_group(groupname, B_FALSE, ipst);
16392                 if (phyi_tmp != NULL) {
16393                         if ((ill_v4 != NULL &&
16394                             phyi_tmp->phyint_illv4 != NULL) &&
16395                             ((ill_v4->ill_net_type !=
16396                             phyi_tmp->phyint_illv4->ill_net_type) ||
16397                             (ill_v4->ill_type !=
16398                             phyi_tmp->phyint_illv4->ill_type))) {
16399                                 mutex_enter(&phyi->phyint_ipsq->ipsq_lock);
16400                                 phyi->phyint_ipsq->ipsq_split = B_TRUE;
16401                                 mutex_exit(&phyi->phyint_ipsq->ipsq_lock);
16402                                 rw_exit(&ipst->ips_ill_g_lock);
16403                                 return (EINVAL);
16404                         }
16405                         if ((ill_v6 != NULL &&
16406                             phyi_tmp->phyint_illv6 != NULL) &&
16407                             ((ill_v6->ill_net_type !=
16408                             phyi_tmp->phyint_illv6->ill_net_type) ||
16409                             (ill_v6->ill_type !=
16410                             phyi_tmp->phyint_illv6->ill_type))) {
16411                                 mutex_enter(&phyi->phyint_ipsq->ipsq_lock);
16412                                 phyi->phyint_ipsq->ipsq_split = B_TRUE;
16413                                 mutex_exit(&phyi->phyint_ipsq->ipsq_lock);
16414                                 rw_exit(&ipst->ips_ill_g_lock);
16415                                 return (EINVAL);
16416                         }
16417                 }
16418
16419                 rw_exit(&ipst->ips_ill_g_lock);
16420
16421                 /*
16422                  * bring down all v4 ipifs.
16423                  */
16424                 if (ill_v4 != NULL) {
16425                         ill_down_ipifs(ill_v4, mp, 0, B_FALSE);
16426                 }
16427
16428                 /*
16429                  * bring down all v6 ipifs.
16430                  */
16431                 if (ill_v6 != NULL) {
16432                         ill_down_ipifs(ill_v6, mp, 0, B_FALSE);
16433                 }
16434
16435                 /*
16436                  * make sure all ipifs are down and there are no active
16437                  * references. Call to ipsq_pending_mp_add will not fail
16438                  * since connp is NULL.
16439                  */
16440                 if (ill_v4 != NULL) {
16441                         mutex_enter(&ill_v4->ill_lock);
16442                         if (!ill_is_quiescent(ill_v4)) {
16443                                 (void) ipsq_pending_mp_add(NULL,
16444                                     ill_v4->ill_ipif, q, mp, ILL_DOWN);
16445                                 mutex_exit(&ill_v4->ill_lock);
16446                                 err = EINPROGRESS;
16447                                 goto done;
16448                         }
16449                         mutex_exit(&ill_v4->ill_lock);
16450                 }
16451
16452                 if (ill_v6 != NULL) {
16453                         mutex_enter(&ill_v6->ill_lock);
16454                         if (!ill_is_quiescent(ill_v6)) {
16455                                 (void) ipsq_pending_mp_add(NULL,
16456                                     ill_v6->ill_ipif, q, mp, ILL_DOWN);
16457                                 mutex_exit(&ill_v6->ill_lock);
16458                                 err = EINPROGRESS;
16459                                 goto done;
16460                         }
16461                         mutex_exit(&ill_v6->ill_lock);
16462                 }
16463
16464                 /*
16465                  * allocate including space for null terminator
16466                  * before we insert.
16467                  */
16468                 tmp = (char *)mi_alloc(namelen + 1, BPRI_MED);
16469                 if (tmp == NULL)
16470                         return (ENOMEM);
16471
16472                 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
16473                 GRAB_ILL_LOCKS(ill_v4, ill_v6);
16474                 mutex_enter(&phyi->phyint_lock);
16475                 if (phyi->phyint_groupname_len != 0) {
16476                         ASSERT(phyi->phyint_groupname != NULL);
16477                         mi_free(phyi->phyint_groupname);
16478                 }
16479
16480                 /*
16481                  * setup the new group name.
16482                  */
16483                 phyi->phyint_groupname = tmp;
16484                 bcopy(groupname, phyi->phyint_groupname, namelen + 1);
16485                 phyi->phyint_groupname_len = namelen + 1;
16486
16487                 if (ipst->ips_ipmp_hook_emulation) {
16488                         /*
16489                          * If the group already exists we use the existing
16490                          * group_ifindex, otherwise we pick a new index here.
16491                          */
16492                         if (phyi_tmp != NULL) {
16493                                 phyi->phyint_group_ifindex =
16494                                     phyi_tmp->phyint_group_ifindex;
16495                         } else {
16496                                 /* XXX We need a recovery strategy here. */
16497                                 if (!ip_assign_ifindex(
16498                                     &phyi->phyint_group_ifindex, ipst))
16499                                         cmn_err(CE_PANIC,
16500                                             "ip_assign_ifindex() failed");
16501                         }
16502                 }
16503                 /*
16504                  * Select whether the netinfo and hook use the per-interface
16505                  * or per-group ifindex.
16506                  */
16507                 if (ipst->ips_ipmp_hook_emulation)
16508                         phyi->phyint_hook_ifindex = phyi->phyint_group_ifindex;
16509                 else
16510                         phyi->phyint_hook_ifindex = phyi->phyint_ifindex;
16511
16512                 if (ipst->ips_ipmp_hook_emulation &&
16513                     phyi_tmp != NULL) {
16514                         /* First phyint in group - group PLUMB event */
16515                         ill_nic_info_plumb(ill, B_TRUE);
16516                 }
16517                 mutex_exit(&phyi->phyint_lock);
16518                 RELEASE_ILL_LOCKS(ill_v4, ill_v6);
16519                 rw_exit(&ipst->ips_ill_g_lock);
16520
16521                 err = ill_up_ipifs(ill, q, mp);
16522         }
16523
16524 done:
16525         /*
16526          *  normally ILL_CHANGING is cleared in ill_up_ipifs.
16527          */
16528         if (err != EINPROGRESS) {
16529                 GRAB_ILL_LOCKS(ill_v4, ill_v6);
16530                 if (ill_v4 != NULL)
16531                         ill_v4->ill_state_flags &= ~ILL_CHANGING;
16532                 if (ill_v6 != NULL)
16533                         ill_v6->ill_state_flags &= ~ILL_CHANGING;
16534                 RELEASE_ILL_LOCKS(ill_v4, ill_v6);
16535         }
16536         return (err);
16537 }
16538
16539 /* ARGSUSED */
16540 int
16541 ip_sioctl_get_groupname(ipif_t *ipif, sin_t *dummy_sin, queue_t *q,
16542     mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
16543 {
16544         ill_t *ill;
16545         phyint_t *phyi;
16546         struct lifreq *lifr;
16547         mblk_t  *mp1;
16548
16549         /* Existence verified in ip_wput_nondata */
16550         mp1 = mp->b_cont->b_cont;
16551         lifr = (struct lifreq *)mp1->b_rptr;
16552         ill = ipif->ipif_ill;
16553         phyi = ill->ill_phyint;
16554
16555         lifr->lifr_groupname[0] = '\0';
16556         /*
16557          * ill_group may be null if all the interfaces
16558          * are down. But still, the phyint should always
16559          * hold the name.
16560          */
16561         if (phyi->phyint_groupname_len != 0) {
16562                 bcopy(phyi->phyint_groupname, lifr->lifr_groupname,
16563                     phyi->phyint_groupname_len);
16564         }
16565
16566         return (0);
16567 }
16568
16569
16570 typedef struct conn_move_s {
16571         ill_t   *cm_from_ill;
16572         ill_t   *cm_to_ill;
16573         int     cm_ifindex;
16574 } conn_move_t;
16575
16576 /*
16577  * ipcl_walk function for moving conn_multicast_ill for a given ill.
16578  */
16579 static void
16580 conn_move(conn_t *connp, caddr_t arg)
16581 {
16582         conn_move_t *connm;
16583         int ifindex;
16584         int i;
16585         ill_t *from_ill;
16586         ill_t *to_ill;
16587         ilg_t *ilg;
16588         ilm_t *ret_ilm;
16589
16590         connm = (conn_move_t *)arg;
16591         ifindex = connm->cm_ifindex;
16592         from_ill = connm->cm_from_ill;
16593         to_ill = connm->cm_to_ill;
16594
16595         /* Change IP_BOUND_IF/IPV6_BOUND_IF associations. */
16596
16597         /* All multicast fields protected by conn_lock */
16598         mutex_enter(&connp->conn_lock);
16599         ASSERT(connp->conn_outgoing_ill == connp->conn_incoming_ill);
16600         if ((connp->conn_outgoing_ill == from_ill) &&
16601             (ifindex == 0 || connp->conn_orig_bound_ifindex == ifindex)) {
16602                 connp->conn_outgoing_ill = to_ill;
16603                 connp->conn_incoming_ill = to_ill;
16604         }
16605
16606         /* Change IP_MULTICAST_IF/IPV6_MULTICAST_IF associations */
16607
16608         if ((connp->conn_multicast_ill == from_ill) &&
16609             (ifindex == 0 || connp->conn_orig_multicast_ifindex == ifindex)) {
16610                 connp->conn_multicast_ill = connm->cm_to_ill;
16611         }
16612
16613         /* Change IP_XMIT_IF associations */
16614         if ((connp->conn_xmit_if_ill == from_ill) &&
16615             (ifindex == 0 || connp->conn_orig_xmit_ifindex == ifindex)) {
16616                 connp->conn_xmit_if_ill = to_ill;
16617         }
16618         /*
16619          * Change the ilg_ill to point to the new one. This assumes
16620          * ilm_move_v6 has moved the ilms to new_ill and the driver
16621          * has been told to receive packets on this interface.
16622          * ilm_move_v6 FAILBACKS all the ilms successfully always.
16623          * But when doing a FAILOVER, it might fail with ENOMEM and so
16624          * some ilms may not have moved. We check to see whether
16625          * the ilms have moved to to_ill. We can't check on from_ill
16626          * as in the process of moving, we could have split an ilm
16627          * in to two - which has the same orig_ifindex and v6group.
16628          *
16629          * For IPv4, ilg_ipif moves implicitly. The code below really
16630          * does not do anything for IPv4 as ilg_ill is NULL for IPv4.
16631          */
16632         for (i = connp->conn_ilg_inuse - 1; i >= 0; i--) {
16633                 ilg = &connp->conn_ilg[i];
16634                 if ((ilg->ilg_ill == from_ill) &&
16635                     (ifindex == 0 || ilg->ilg_orig_ifindex == ifindex)) {
16636                         /* ifindex != 0 indicates failback */
16637                         if (ifindex != 0) {
16638                                 connp->conn_ilg[i].ilg_ill = to_ill;
16639                                 continue;
16640                         }
16641
16642                         ret_ilm = ilm_lookup_ill_index_v6(to_ill,
16643                             &ilg->ilg_v6group, ilg->ilg_orig_ifindex,
16644                             connp->conn_zoneid);
16645
16646                         if (ret_ilm != NULL)
16647                                 connp->conn_ilg[i].ilg_ill = to_ill;
16648                 }
16649         }
16650         mutex_exit(&connp->conn_lock);
16651 }
16652
16653 static void
16654 conn_move_ill(ill_t *from_ill, ill_t *to_ill, int ifindex)
16655 {
16656         conn_move_t connm;
16657         ip_stack_t      *ipst = from_ill->ill_ipst;
16658
16659         connm.cm_from_ill = from_ill;
16660         connm.cm_to_ill = to_ill;
16661         connm.cm_ifindex = ifindex;
16662
16663         ipcl_walk(conn_move, (caddr_t)&connm, ipst);
16664 }
16665
16666 /*
16667  * ilm has been moved from from_ill to to_ill.
16668  * Send DL_DISABMULTI_REQ to ill and DL_ENABMULTI_REQ on to_ill.
16669  * appropriately.
16670  *
16671  * NOTE : We can't reuse the code in ip_ll_addmulti/delmulti because
16672  *        the code there de-references ipif_ill to get the ill to
16673  *        send multicast requests. It does not work as ipif is on its
16674  *        move and already moved when this function is called.
16675  *        Thus, we need to use from_ill and to_ill send down multicast
16676  *        requests.
16677  */
16678 static void
16679 ilm_send_multicast_reqs(ill_t *from_ill, ill_t *to_ill)
16680 {
16681         ipif_t *ipif;
16682         ilm_t *ilm;
16683
16684         /*
16685          * See whether we need to send down DL_ENABMULTI_REQ on
16686          * to_ill as ilm has just been added.
16687          */
16688         ASSERT(IAM_WRITER_ILL(to_ill));
16689         ASSERT(IAM_WRITER_ILL(from_ill));
16690
16691         ILM_WALKER_HOLD(to_ill);
16692         for (ilm = to_ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
16693
16694                 if (!ilm->ilm_is_new || (ilm->ilm_flags & ILM_DELETED))
16695                         continue;
16696                 /*
16697                  * no locks held, ill/ipif cannot dissappear as long
16698                  * as we are writer.
16699                  */
16700                 ipif = to_ill->ill_ipif;
16701                 /*
16702                  * No need to hold any lock as we are the writer and this
16703                  * can only be changed by a writer.
16704                  */
16705                 ilm->ilm_is_new = B_FALSE;
16706
16707                 if (to_ill->ill_net_type != IRE_IF_RESOLVER ||
16708                     ipif->ipif_flags & IPIF_POINTOPOINT) {
16709                         ip1dbg(("ilm_send_multicast_reqs: to_ill not "
16710                             "resolver\n"));
16711                         continue;               /* Must be IRE_IF_NORESOLVER */
16712                 }
16713
16714
16715                 if (to_ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST) {
16716                         ip1dbg(("ilm_send_multicast_reqs: "
16717                             "to_ill MULTI_BCAST\n"));
16718                         goto from;
16719                 }
16720
16721                 if (to_ill->ill_isv6)
16722                         mld_joingroup(ilm);
16723                 else
16724                         igmp_joingroup(ilm);
16725
16726                 if (to_ill->ill_ipif_up_count == 0) {
16727                         /*
16728                          * Nobody there. All multicast addresses will be
16729                          * re-joined when we get the DL_BIND_ACK bringing the
16730                          * interface up.
16731                          */
16732                         ilm->ilm_notify_driver = B_FALSE;
16733                         ip1dbg(("ilm_send_multicast_reqs: to_ill nobody up\n"));
16734                         goto from;
16735                 }
16736
16737                 /*
16738                  * For allmulti address, we want to join on only one interface.
16739                  * Checking for ilm_numentries_v6 is not correct as you may
16740                  * find an ilm with zero address on to_ill, but we may not
16741                  * have nominated to_ill for receiving. Thus, if we have
16742                  * nominated from_ill (ill_join_allmulti is set), nominate
16743                  * only if to_ill is not already nominated (to_ill normally
16744                  * should not have been nominated if "from_ill" has already
16745                  * been nominated. As we don't prevent failovers from happening
16746                  * across groups, we don't assert).
16747                  */
16748                 if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) {
16749                         /*
16750                          * There is no need to hold ill locks as we are
16751                          * writer on both ills and when ill_join_allmulti
16752                          * is changed the thread is always a writer.
16753                          */
16754                         if (from_ill->ill_join_allmulti &&
16755                             !to_ill->ill_join_allmulti) {
16756                                 (void) ip_join_allmulti(to_ill->ill_ipif);
16757                         }
16758                 } else if (ilm->ilm_notify_driver) {
16759
16760                         /*
16761                          * This is a newly moved ilm so we need to tell the
16762                          * driver about the new group. There can be more than
16763                          * one ilm's for the same group in the list each with a
16764                          * different orig_ifindex. We have to inform the driver
16765                          * once. In ilm_move_v[4,6] we only set the flag
16766                          * ilm_notify_driver for the first ilm.
16767                          */
16768
16769                         (void) ip_ll_send_enabmulti_req(to_ill,
16770                             &ilm->ilm_v6addr);
16771                 }
16772
16773                 ilm->ilm_notify_driver = B_FALSE;
16774
16775                 /*
16776                  * See whether we need to send down DL_DISABMULTI_REQ on
16777                  * from_ill as ilm has just been removed.
16778                  */
16779 from:
16780                 ipif = from_ill->ill_ipif;
16781                 if (from_ill->ill_net_type != IRE_IF_RESOLVER ||
16782                     ipif->ipif_flags & IPIF_POINTOPOINT) {
16783                         ip1dbg(("ilm_send_multicast_reqs: "
16784                             "from_ill not resolver\n"));
16785                         continue;               /* Must be IRE_IF_NORESOLVER */
16786                 }
16787
16788                 if (from_ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST) {
16789                         ip1dbg(("ilm_send_multicast_reqs: "
16790                             "from_ill MULTI_BCAST\n"));
16791                         continue;
16792                 }
16793
16794                 if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) {
16795                         if (from_ill->ill_join_allmulti)
16796                                 (void) ip_leave_allmulti(from_ill->ill_ipif);
16797                 } else if (ilm_numentries_v6(from_ill, &ilm->ilm_v6addr) == 0) {
16798                         (void) ip_ll_send_disabmulti_req(from_ill,
16799                             &ilm->ilm_v6addr);
16800                 }
16801         }
16802         ILM_WALKER_RELE(to_ill);
16803 }
16804
16805 /*
16806  * This function is called when all multicast memberships need
16807  * to be moved from "from_ill" to "to_ill" for IPv6. This function is
16808  * called only once, unlike the IPv4 counterpart, which is called after
16809  * every logical interface is moved. The reason is that multicast
16810  * memberships are joined using an interface address in IPv4, while in
16811  * IPv6 the interface index is used.
16812  */
16813 static void
16814 ilm_move_v6(ill_t *from_ill, ill_t *to_ill, int ifindex)
16815 {
16816         ilm_t   *ilm;
16817         ilm_t   *ilm_next;
16818         ilm_t   *new_ilm;
16819         ilm_t   **ilmp;
16820         int     count;
16821         char buf[INET6_ADDRSTRLEN];
16822         in6_addr_t ipv6_snm = ipv6_solicited_node_mcast;
16823         ip_stack_t      *ipst = from_ill->ill_ipst;
16824
16825         ASSERT(MUTEX_HELD(&to_ill->ill_lock));
16826         ASSERT(MUTEX_HELD(&from_ill->ill_lock));
16827         ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
16828
16829         if (ifindex == 0) {
16830                 /*
16831                  * Form the solicited node mcast address which is used later.
16832                  */
16833                 ipif_t *ipif;
16834
16835                 ipif = from_ill->ill_ipif;
16836                 ASSERT(ipif->ipif_id == 0);
16837
16838                 ipv6_snm.s6_addr32[3] |= ipif->ipif_v6lcl_addr.s6_addr32[3];
16839         }
16840
16841         ilmp = &from_ill->ill_ilm;
16842         for (ilm = from_ill->ill_ilm; ilm != NULL; ilm = ilm_next) {
16843                 ilm_next = ilm->ilm_next;
16844
16845                 if (ilm->ilm_flags & ILM_DELETED) {
16846                         ilmp = &ilm->ilm_next;
16847                         continue;
16848                 }
16849
16850                 new_ilm = ilm_lookup_ill_index_v6(to_ill, &ilm->ilm_v6addr,
16851                     ilm->ilm_orig_ifindex, ilm->ilm_zoneid);
16852                 ASSERT(ilm->ilm_orig_ifindex != 0);
16853                 if (ilm->ilm_orig_ifindex == ifindex) {
16854                         /*
16855                          * We are failing back multicast memberships.
16856                          * If the same ilm exists in to_ill, it means somebody
16857                          * has joined the same group there e.g. ff02::1
16858                          * is joined within the kernel when the interfaces
16859                          * came UP.
16860                          */
16861                         ASSERT(ilm->ilm_ipif == NULL);
16862                         if (new_ilm != NULL) {
16863                                 new_ilm->ilm_refcnt += ilm->ilm_refcnt;
16864                                 if (new_ilm->ilm_fmode != MODE_IS_EXCLUDE ||
16865                                     !SLIST_IS_EMPTY(new_ilm->ilm_filter)) {
16866                                         new_ilm->ilm_is_new = B_TRUE;
16867                                 }
16868                         } else {
16869                                 /*
16870                                  * check if we can just move the ilm
16871                                  */
16872                                 if (from_ill->ill_ilm_walker_cnt != 0) {
16873                                         /*
16874                                          * We have walkers we cannot move
16875                                          * the ilm, so allocate a new ilm,
16876                                          * this (old) ilm will be marked
16877                                          * ILM_DELETED at the end of the loop
16878                                          * and will be freed when the
16879                                          * last walker exits.
16880                                          */
16881                                         new_ilm = (ilm_t *)mi_zalloc
16882                                             (sizeof (ilm_t));
16883                                         if (new_ilm == NULL) {
16884                                                 ip0dbg(("ilm_move_v6: "
16885                                                     "FAILBACK of IPv6"
16886                                                     " multicast address %s : "
16887                                                     "from %s to"
16888                                                     " %s failed : ENOMEM \n",
16889                                                     inet_ntop(AF_INET6,
16890                                                     &ilm->ilm_v6addr, buf,
16891                                                     sizeof (buf)),
16892                                                     from_ill->ill_name,
16893                                                     to_ill->ill_name));
16894
16895                                                         ilmp = &ilm->ilm_next;
16896                                                         continue;
16897                                         }
16898                                         *new_ilm = *ilm;
16899                                         /*
16900                                          * we don't want new_ilm linked to
16901                                          * ilm's filter list.
16902                                          */
16903                                         new_ilm->ilm_filter = NULL;
16904                                 } else {
16905                                         /*
16906                                          * No walkers we can move the ilm.
16907                                          * lets take it out of the list.
16908                                          */
16909                                         *ilmp = ilm->ilm_next;
16910                                         ilm->ilm_next = NULL;
16911                                         new_ilm = ilm;
16912                                 }
16913
16914                                 /*
16915                                  * if this is the first ilm for the group
16916                                  * set ilm_notify_driver so that we notify the
16917                                  * driver in ilm_send_multicast_reqs.
16918                                  */
16919                                 if (ilm_lookup_ill_v6(to_ill,
16920                                     &new_ilm->ilm_v6addr, ALL_ZONES) == NULL)
16921                                         new_ilm->ilm_notify_driver = B_TRUE;
16922
16923                                 new_ilm->ilm_ill = to_ill;
16924                                 /* Add to the to_ill's list */
16925                                 new_ilm->ilm_next = to_ill->ill_ilm;
16926                                 to_ill->ill_ilm = new_ilm;
16927                                 /*
16928                                  * set the flag so that mld_joingroup is
16929                                  * called in ilm_send_multicast_reqs().
16930                                  */
16931                                 new_ilm->ilm_is_new = B_TRUE;
16932                         }
16933                         goto bottom;
16934                 } else if (ifindex != 0) {
16935                         /*
16936                          * If this is FAILBACK (ifindex != 0) and the ifindex
16937                          * has not matched above, look at the next ilm.
16938                          */
16939                         ilmp = &ilm->ilm_next;
16940                         continue;
16941                 }
16942                 /*
16943                  * If we are here, it means ifindex is 0. Failover
16944                  * everything.
16945                  *
16946                  * We need to handle solicited node mcast address
16947                  * and all_nodes mcast address differently as they
16948                  * are joined witin the kenrel (ipif_multicast_up)
16949                  * and potentially from the userland. We are called
16950                  * after the ipifs of from_ill has been moved.
16951                  * If we still find ilms on ill with solicited node
16952                  * mcast address or all_nodes mcast address, it must
16953                  * belong to the UP interface that has not moved e.g.
16954                  * ipif_id 0 with the link local prefix does not move.
16955                  * We join this on the new ill accounting for all the
16956                  * userland memberships so that applications don't
16957                  * see any failure.
16958                  *
16959                  * We need to make sure that we account only for the
16960                  * solicited node and all node multicast addresses
16961                  * that was brought UP on these. In the case of
16962                  * a failover from A to B, we might have ilms belonging
16963                  * to A (ilm_orig_ifindex pointing at A) on B accounting
16964                  * for the membership from the userland. If we are failing
16965                  * over from B to C now, we will find the ones belonging
16966                  * to A on B. These don't account for the ill_ipif_up_count.
16967                  * They just move from B to C. The check below on
16968                  * ilm_orig_ifindex ensures that.
16969                  */
16970                 if ((ilm->ilm_orig_ifindex ==
16971                     from_ill->ill_phyint->phyint_ifindex) &&
16972                     (IN6_ARE_ADDR_EQUAL(&ipv6_snm, &ilm->ilm_v6addr) ||
16973                     IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast,
16974                     &ilm->ilm_v6addr))) {
16975                         ASSERT(ilm->ilm_refcnt > 0);
16976                         count = ilm->ilm_refcnt - from_ill->ill_ipif_up_count;
16977                         /*
16978                          * For indentation reasons, we are not using a
16979                          * "else" here.
16980                          */
16981                         if (count == 0) {
16982                                 ilmp = &ilm->ilm_next;
16983                                 continue;
16984                         }
16985                         ilm->ilm_refcnt -= count;
16986                         if (new_ilm != NULL) {
16987                                 /*
16988                                  * Can find one with the same
16989                                  * ilm_orig_ifindex, if we are failing
16990                                  * over to a STANDBY. This happens
16991                                  * when somebody wants to join a group
16992                                  * on a STANDBY interface and we
16993                                  * internally join on a different one.
16994                                  * If we had joined on from_ill then, a
16995                                  * failover now will find a new ilm
16996                                  * with this index.
16997                                  */
16998                                 ip1dbg(("ilm_move_v6: FAILOVER, found"
16999                                     " new ilm on %s, group address %s\n",
17000                                     to_ill->ill_name,
17001                                     inet_ntop(AF_INET6,
17002                                     &ilm->ilm_v6addr, buf,
17003                                     sizeof (buf))));
17004                                 new_ilm->ilm_refcnt += count;
17005                                 if (new_ilm->ilm_fmode != MODE_IS_EXCLUDE ||
17006                                     !SLIST_IS_EMPTY(new_ilm->ilm_filter)) {
17007                                         new_ilm->ilm_is_new = B_TRUE;
17008                                 }
17009                         } else {
17010                                 new_ilm = (ilm_t *)mi_zalloc(sizeof (ilm_t));
17011                                 if (new_ilm == NULL) {
17012                                         ip0dbg(("ilm_move_v6: FAILOVER of IPv6"
17013                                             " multicast address %s : from %s to"
17014                                             " %s failed : ENOMEM \n",
17015                                             inet_ntop(AF_INET6,
17016                                             &ilm->ilm_v6addr, buf,
17017                                             sizeof (buf)), from_ill->ill_name,
17018                                             to_ill->ill_name));
17019                                         ilmp = &ilm->ilm_next;
17020                                         continue;
17021                                 }
17022                                 *new_ilm = *ilm;
17023                                 new_ilm->ilm_filter = NULL;
17024                                 new_ilm->ilm_refcnt = count;
17025                                 new_ilm->ilm_timer = INFINITY;
17026                                 new_ilm->ilm_rtx.rtx_timer = INFINITY;
17027                                 new_ilm->ilm_is_new = B_TRUE;
17028                                 /*
17029                                  * If the to_ill has not joined this
17030                                  * group we need to tell the driver in
17031                                  * ill_send_multicast_reqs.
17032                                  */
17033                                 if (ilm_lookup_ill_v6(to_ill,
17034                                     &new_ilm->ilm_v6addr, ALL_ZONES) == NULL)
17035                                         new_ilm->ilm_notify_driver = B_TRUE;
17036
17037                                 new_ilm->ilm_ill = to_ill;
17038                                 /* Add to the to_ill's list */
17039                                 new_ilm->ilm_next = to_ill->ill_ilm;
17040                                 to_ill->ill_ilm = new_ilm;
17041                                 ASSERT(new_ilm->ilm_ipif == NULL);
17042                         }
17043                         if (ilm->ilm_refcnt == 0) {
17044                                 goto bottom;
17045                         } else {
17046                                 new_ilm->ilm_fmode = MODE_IS_EXCLUDE;
17047                                 CLEAR_SLIST(new_ilm->ilm_filter);
17048                                 ilmp = &ilm->ilm_next;
17049                         }
17050                         continue;
17051                 } else {
17052                         /*
17053                          * ifindex = 0 means, move everything pointing at
17054                          * from_ill. We are doing this becuase ill has
17055                          * either FAILED or became INACTIVE.
17056                          *
17057                          * As we would like to move things later back to
17058                          * from_ill, we want to retain the identity of this
17059                          * ilm. Thus, we don't blindly increment the reference
17060                          * count on the ilms matching the address alone. We
17061                          * need to match on the ilm_orig_index also. new_ilm
17062                          * was obtained by matching ilm_orig_index also.
17063                          */
17064                         if (new_ilm != NULL) {
17065                                 /*
17066                                  * This is possible only if a previous restore
17067                                  * was incomplete i.e restore to
17068                                  * ilm_orig_ifindex left some ilms because
17069                                  * of some failures. Thus when we are failing
17070                                  * again, we might find our old friends there.
17071                                  */
17072                                 ip1dbg(("ilm_move_v6: FAILOVER, found new ilm"
17073                                     " on %s, group address %s\n",
17074                                     to_ill->ill_name,
17075                                     inet_ntop(AF_INET6,
17076                                     &ilm->ilm_v6addr, buf,
17077                                     sizeof (buf))));
17078                                 new_ilm->ilm_refcnt += ilm->ilm_refcnt;
17079                                 if (new_ilm->ilm_fmode != MODE_IS_EXCLUDE ||
17080                                     !SLIST_IS_EMPTY(new_ilm->ilm_filter)) {
17081                                         new_ilm->ilm_is_new = B_TRUE;
17082                                 }
17083                         } else {
17084                                 if (from_ill->ill_ilm_walker_cnt != 0) {
17085                                         new_ilm = (ilm_t *)
17086                                             mi_zalloc(sizeof (ilm_t));
17087                                         if (new_ilm == NULL) {
17088                                                 ip0dbg(("ilm_move_v6: "
17089                                                     "FAILOVER of IPv6"
17090                                                     " multicast address %s : "
17091                                                     "from %s to"
17092                                                     " %s failed : ENOMEM \n",
17093                                                     inet_ntop(AF_INET6,
17094                                                     &ilm->ilm_v6addr, buf,
17095                                                     sizeof (buf)),
17096                                                     from_ill->ill_name,
17097                                                     to_ill->ill_name));
17098
17099                                                         ilmp = &ilm->ilm_next;
17100                                                         continue;
17101                                         }
17102                                         *new_ilm = *ilm;
17103                                         new_ilm->ilm_filter = NULL;
17104                                 } else {
17105                                         *ilmp = ilm->ilm_next;
17106                                         new_ilm = ilm;
17107                                 }
17108                                 /*
17109                                  * If the to_ill has not joined this
17110                                  * group we need to tell the driver in
17111                                  * ill_send_multicast_reqs.
17112                                  */
17113                                 if (ilm_lookup_ill_v6(to_ill,
17114                                     &new_ilm->ilm_v6addr, ALL_ZONES) == NULL)
17115                                         new_ilm->ilm_notify_driver = B_TRUE;
17116
17117                                 /* Add to the to_ill's list */
17118                                 new_ilm->ilm_next = to_ill->ill_ilm;
17119                                 to_ill->ill_ilm = new_ilm;
17120                                 ASSERT(ilm->ilm_ipif == NULL);
17121                                 new_ilm->ilm_ill = to_ill;
17122                                 new_ilm->ilm_is_new = B_TRUE;
17123                         }
17124
17125                 }
17126
17127 bottom:
17128                 /*
17129                  * Revert multicast filter state to (EXCLUDE, NULL).
17130                  * new_ilm->ilm_is_new should already be set if needed.
17131                  */
17132                 new_ilm->ilm_fmode = MODE_IS_EXCLUDE;
17133                 CLEAR_SLIST(new_ilm->ilm_filter);
17134                 /*
17135                  * We allocated/got a new ilm, free the old one.
17136                  */
17137                 if (new_ilm != ilm) {
17138                         if (from_ill->ill_ilm_walker_cnt == 0) {
17139                                 *ilmp = ilm->ilm_next;
17140                                 ilm->ilm_next = NULL;
17141                                 FREE_SLIST(ilm->ilm_filter);
17142                                 FREE_SLIST(ilm->ilm_pendsrcs);
17143                                 FREE_SLIST(ilm->ilm_rtx.rtx_allow);
17144                                 FREE_SLIST(ilm->ilm_rtx.rtx_block);
17145                                 mi_free((char *)ilm);
17146                         } else {
17147                                 ilm->ilm_flags |= ILM_DELETED;
17148                                 from_ill->ill_ilm_cleanup_reqd = 1;
17149                                 ilmp = &ilm->ilm_next;
17150                         }
17151                 }
17152         }
17153 }
17154
17155 /*
17156  * Move all the multicast memberships to to_ill. Called when
17157  * an ipif moves from "from_ill" to "to_ill". This function is slightly
17158  * different from IPv6 counterpart as multicast memberships are associated
17159  * with ills in IPv6. This function is called after every ipif is moved
17160  * unlike IPv6, where it is moved only once.
17161  */
17162 static void
17163 ilm_move_v4(ill_t *from_ill, ill_t *to_ill, ipif_t *ipif)
17164 {
17165         ilm_t   *ilm;
17166         ilm_t   *ilm_next;
17167         ilm_t   *new_ilm;
17168         ilm_t   **ilmp;
17169         ip_stack_t      *ipst = from_ill->ill_ipst;
17170
17171         ASSERT(MUTEX_HELD(&to_ill->ill_lock));
17172         ASSERT(MUTEX_HELD(&from_ill->ill_lock));
17173         ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
17174
17175         ilmp = &from_ill->ill_ilm;
17176         for (ilm = from_ill->ill_ilm; ilm != NULL; ilm = ilm_next) {
17177                 ilm_next = ilm->ilm_next;
17178
17179                 if (ilm->ilm_flags & ILM_DELETED) {
17180                         ilmp = &ilm->ilm_next;
17181                         continue;
17182                 }
17183
17184                 ASSERT(ilm->ilm_ipif != NULL);
17185
17186                 if (ilm->ilm_ipif != ipif) {
17187                         ilmp = &ilm->ilm_next;
17188                         continue;
17189                 }
17190
17191                 if (V4_PART_OF_V6(ilm->ilm_v6addr) ==
17192                     htonl(INADDR_ALLHOSTS_GROUP)) {
17193                         new_ilm = ilm_lookup_ipif(ipif,
17194                             V4_PART_OF_V6(ilm->ilm_v6addr));
17195                         if (new_ilm != NULL) {
17196                                 new_ilm->ilm_refcnt += ilm->ilm_refcnt;
17197                                 /*
17198                                  * We still need to deal with the from_ill.
17199                                  */
17200                                 new_ilm->ilm_is_new = B_TRUE;
17201                                 new_ilm->ilm_fmode = MODE_IS_EXCLUDE;
17202                                 CLEAR_SLIST(new_ilm->ilm_filter);
17203                                 goto delete_ilm;
17204                         }
17205                         /*
17206                          * If we could not find one e.g. ipif is
17207                          * still down on to_ill, we add this ilm
17208                          * on ill_new to preserve the reference
17209                          * count.
17210                          */
17211                 }
17212                 /*
17213                  * When ipifs move, ilms always move with it
17214                  * to the NEW ill. Thus we should never be
17215                  * able to find ilm till we really move it here.
17216                  */
17217                 ASSERT(ilm_lookup_ipif(ipif,
17218                     V4_PART_OF_V6(ilm->ilm_v6addr)) == NULL);
17219
17220                 if (from_ill->ill_ilm_walker_cnt != 0) {
17221                         new_ilm = (ilm_t *)mi_zalloc(sizeof (ilm_t));
17222                         if (new_ilm == NULL) {
17223                                 char buf[INET6_ADDRSTRLEN];
17224                                 ip0dbg(("ilm_move_v4: FAILBACK of IPv4"
17225                                     " multicast address %s : "
17226                                     "from %s to"
17227                                     " %s failed : ENOMEM \n",
17228                                     inet_ntop(AF_INET,
17229                                     &ilm->ilm_v6addr, buf,
17230                                     sizeof (buf)),
17231                                     from_ill->ill_name,
17232                                     to_ill->ill_name));
17233
17234                                 ilmp = &ilm->ilm_next;
17235                                 continue;
17236                         }
17237                         *new_ilm = *ilm;
17238                         /* We don't want new_ilm linked to ilm's filter list */
17239                         new_ilm->ilm_filter = NULL;
17240                 } else {
17241                         /* Remove from the list */
17242                         *ilmp = ilm->ilm_next;
17243                         new_ilm = ilm;
17244                 }
17245
17246                 /*
17247                  * If we have never joined this group on the to_ill
17248                  * make sure we tell the driver.
17249                  */
17250                 if (ilm_lookup_ill_v6(to_ill, &new_ilm->ilm_v6addr,
17251                     ALL_ZONES) == NULL)
17252                         new_ilm->ilm_notify_driver = B_TRUE;
17253
17254                 /* Add to the to_ill's list */
17255                 new_ilm->ilm_next = to_ill->ill_ilm;
17256                 to_ill->ill_ilm = new_ilm;
17257                 new_ilm->ilm_is_new = B_TRUE;
17258
17259                 /*
17260                  * Revert multicast filter state to (EXCLUDE, NULL)
17261                  */
17262                 new_ilm->ilm_fmode = MODE_IS_EXCLUDE;
17263                 CLEAR_SLIST(new_ilm->ilm_filter);
17264
17265                 /*
17266                  * Delete only if we have allocated a new ilm.
17267                  */
17268                 if (new_ilm != ilm) {
17269 delete_ilm:
17270                         if (from_ill->ill_ilm_walker_cnt == 0) {
17271                                 /* Remove from the list */
17272                                 *ilmp = ilm->ilm_next;
17273                                 ilm->ilm_next = NULL;
17274                                 FREE_SLIST(ilm->ilm_filter);
17275                                 FREE_SLIST(ilm->ilm_pendsrcs);
17276                                 FREE_SLIST(ilm->ilm_rtx.rtx_allow);
17277                                 FREE_SLIST(ilm->ilm_rtx.rtx_block);
17278                                 mi_free((char *)ilm);
17279                         } else {
17280                                 ilm->ilm_flags |= ILM_DELETED;
17281                                 from_ill->ill_ilm_cleanup_reqd = 1;
17282                                 ilmp = &ilm->ilm_next;
17283                         }
17284                 }
17285         }
17286 }
17287
17288 static uint_t
17289 ipif_get_id(ill_t *ill, uint_t id)
17290 {
17291         uint_t  unit;
17292         ipif_t  *tipif;
17293         boolean_t found = B_FALSE;
17294         ip_stack_t      *ipst = ill->ill_ipst;
17295
17296         /*
17297          * During failback, we want to go back to the same id
17298          * instead of the smallest id so that the original
17299          * configuration is maintained. id is non-zero in that
17300          * case.
17301          */
17302         if (id != 0) {
17303                 /*
17304                  * While failing back, if we still have an ipif with
17305                  * MAX_ADDRS_PER_IF, it means this will be replaced
17306                  * as soon as we return from this function. It was
17307                  * to set to MAX_ADDRS_PER_IF by the caller so that
17308                  * we can choose the smallest id. Thus we return zero
17309                  * in that case ignoring the hint.
17310                  */
17311                 if (ill->ill_ipif->ipif_id == MAX_ADDRS_PER_IF)
17312                         return (0);
17313                 for (tipif = ill->ill_ipif; tipif != NULL;
17314                     tipif = tipif->ipif_next) {
17315                         if (tipif->ipif_id == id) {
17316                                 found = B_TRUE;
17317                                 break;
17318                         }
17319                 }
17320                 /*
17321                  * If somebody already plumbed another logical
17322                  * with the same id, we won't be able to find it.
17323                  */
17324                 if (!found)
17325                         return (id);
17326         }
17327         for (unit = 0; unit <= ipst->ips_ip_addrs_per_if; unit++) {
17328                 found = B_FALSE;
17329                 for (tipif = ill->ill_ipif; tipif != NULL;
17330                     tipif = tipif->ipif_next) {
17331                         if (tipif->ipif_id == unit) {
17332                                 found = B_TRUE;
17333                                 break;
17334                         }
17335                 }
17336                 if (!found)
17337                         break;
17338         }
17339         return (unit);
17340 }
17341
17342 /* ARGSUSED */
17343 static int
17344 ipif_move(ipif_t *ipif, ill_t *to_ill, queue_t *q, mblk_t *mp,
17345     ipif_t **rep_ipif_ptr)
17346 {
17347         ill_t   *from_ill;
17348         ipif_t  *rep_ipif;
17349         uint_t  unit;
17350         int err = 0;
17351         ipif_t  *to_ipif;
17352         struct iocblk   *iocp;
17353         boolean_t failback_cmd;
17354         boolean_t remove_ipif;
17355         int     rc;
17356         ip_stack_t      *ipst;
17357
17358         ASSERT(IAM_WRITER_ILL(to_ill));
17359         ASSERT(IAM_WRITER_IPIF(ipif));
17360
17361         iocp = (struct iocblk *)mp->b_rptr;
17362         failback_cmd = (iocp->ioc_cmd == SIOCLIFFAILBACK);
17363         remove_ipif = B_FALSE;
17364
17365         from_ill = ipif->ipif_ill;
17366         ipst = from_ill->ill_ipst;
17367
17368         ASSERT(MUTEX_HELD(&to_ill->ill_lock));
17369         ASSERT(MUTEX_HELD(&from_ill->ill_lock));
17370         ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
17371
17372         /*
17373          * Don't move LINK LOCAL addresses as they are tied to
17374          * physical interface.
17375          */
17376         if (from_ill->ill_isv6 &&
17377             IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6lcl_addr)) {
17378                 ipif->ipif_was_up = B_FALSE;
17379                 IPIF_UNMARK_MOVING(ipif);
17380                 return (0);
17381         }
17382
17383         /*
17384          * We set the ipif_id to maximum so that the search for
17385          * ipif_id will pick the lowest number i.e 0 in the
17386          * following 2 cases :
17387          *
17388          * 1) We have a replacement ipif at the head of to_ill.
17389          *    We can't remove it yet as we can exceed ip_addrs_per_if
17390          *    on to_ill and hence the MOVE might fail. We want to
17391          *    remove it only if we could move the ipif. Thus, by
17392          *    setting it to the MAX value, we make the search in
17393          *    ipif_get_id return the zeroth id.
17394          *
17395          * 2) When DR pulls out the NIC and re-plumbs the interface,
17396          *    we might just have a zero address plumbed on the ipif
17397          *    with zero id in the case of IPv4. We remove that while
17398          *    doing the failback. We want to remove it only if we
17399          *    could move the ipif. Thus, by setting it to the MAX
17400          *    value, we make the search in ipif_get_id return the
17401          *    zeroth id.
17402          *
17403          * Both (1) and (2) are done only when when we are moving
17404          * an ipif (either due to failover/failback) which originally
17405          * belonged to this interface i.e the ipif_orig_ifindex is
17406          * the same as to_ill's ifindex. This is needed so that
17407          * FAILOVER from A -> B ( A failed) followed by FAILOVER
17408          * from B -> A (B is being removed from the group) and
17409          * FAILBACK from A -> B restores the original configuration.
17410          * Without the check for orig_ifindex, the second FAILOVER
17411          * could make the ipif belonging to B replace the A's zeroth
17412          * ipif and the subsequent failback re-creating the replacement
17413          * ipif again.
17414          *
17415          * NOTE : We created the replacement ipif when we did a
17416          * FAILOVER (See below). We could check for FAILBACK and
17417          * then look for replacement ipif to be removed. But we don't
17418          * want to do that because we wan't to allow the possibility
17419          * of a FAILOVER from A -> B (which creates the replacement ipif),
17420          * followed by a *FAILOVER* from B -> A instead of a FAILBACK
17421          * from B -> A.
17422          */
17423         to_ipif = to_ill->ill_ipif;
17424         if ((to_ill->ill_phyint->phyint_ifindex ==
17425             ipif->ipif_orig_ifindex) &&
17426             IPIF_REPL_CHECK(to_ipif, failback_cmd)) {
17427                 ASSERT(to_ipif->ipif_id == 0);
17428                 remove_ipif = B_TRUE;
17429                 to_ipif->ipif_id = MAX_ADDRS_PER_IF;
17430         }
17431         /*
17432          * Find the lowest logical unit number on the to_ill.
17433          * If we are failing back, try to get the original id
17434          * rather than the lowest one so that the original
17435          * configuration is maintained.
17436          *
17437          * XXX need a better scheme for this.
17438          */
17439         if (failback_cmd) {
17440                 unit = ipif_get_id(to_ill, ipif->ipif_orig_ipifid);
17441         } else {
17442                 unit = ipif_get_id(to_ill, 0);
17443         }
17444
17445         /* Reset back to zero in case we fail below */
17446         if (to_ipif->ipif_id == MAX_ADDRS_PER_IF)
17447                 to_ipif->ipif_id = 0;
17448
17449         if (unit == ipst->ips_ip_addrs_per_if) {
17450                 ipif->ipif_was_up = B_FALSE;
17451                 IPIF_UNMARK_MOVING(ipif);
17452                 return (EINVAL);
17453         }
17454
17455         /*
17456          * ipif is ready to move from "from_ill" to "to_ill".
17457          *
17458          * 1) If we are moving ipif with id zero, create a
17459          *    replacement ipif for this ipif on from_ill. If this fails
17460          *    fail the MOVE operation.
17461          *
17462          * 2) Remove the replacement ipif on to_ill if any.
17463          *    We could remove the replacement ipif when we are moving
17464          *    the ipif with id zero. But what if somebody already
17465          *    unplumbed it ? Thus we always remove it if it is present.
17466          *    We want to do it only if we are sure we are going to
17467          *    move the ipif to to_ill which is why there are no
17468          *    returns due to error till ipif is linked to to_ill.
17469          *    Note that the first ipif that we failback will always
17470          *    be zero if it is present.
17471          */
17472         if (ipif->ipif_id == 0) {
17473                 ipaddr_t inaddr_any = INADDR_ANY;
17474
17475                 rep_ipif = (ipif_t *)mi_alloc(sizeof (ipif_t), BPRI_MED);
17476                 if (rep_ipif == NULL) {
17477                         ipif->ipif_was_up = B_FALSE;
17478                         IPIF_UNMARK_MOVING(ipif);
17479                         return (ENOMEM);
17480                 }
17481                 *rep_ipif = ipif_zero;
17482                 /*
17483                  * Before we put the ipif on the list, store the addresses
17484                  * as mapped addresses as some of the ioctls e.g SIOCGIFADDR
17485                  * assumes so. This logic is not any different from what
17486                  * ipif_allocate does.
17487                  */
17488                 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
17489                     &rep_ipif->ipif_v6lcl_addr);
17490                 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
17491                     &rep_ipif->ipif_v6src_addr);
17492                 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
17493                     &rep_ipif->ipif_v6subnet);
17494                 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
17495                     &rep_ipif->ipif_v6net_mask);
17496                 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
17497                     &rep_ipif->ipif_v6brd_addr);
17498                 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
17499                     &rep_ipif->ipif_v6pp_dst_addr);
17500                 /*
17501                  * We mark IPIF_NOFAILOVER so that this can never
17502                  * move.
17503                  */
17504                 rep_ipif->ipif_flags = ipif->ipif_flags | IPIF_NOFAILOVER;
17505                 rep_ipif->ipif_flags &= ~IPIF_UP & ~IPIF_DUPLICATE;
17506                 rep_ipif->ipif_replace_zero = B_TRUE;
17507                 mutex_init(&rep_ipif->ipif_saved_ire_lock, NULL,
17508                     MUTEX_DEFAULT, NULL);
17509                 rep_ipif->ipif_id = 0;
17510                 rep_ipif->ipif_ire_type = ipif->ipif_ire_type;
17511                 rep_ipif->ipif_ill = from_ill;
17512                 rep_ipif->ipif_orig_ifindex =
17513                     from_ill->ill_phyint->phyint_ifindex;
17514                 /* Insert at head */
17515                 rep_ipif->ipif_next = from_ill->ill_ipif;
17516                 from_ill->ill_ipif = rep_ipif;
17517                 /*
17518                  * We don't really care to let apps know about
17519                  * this interface.
17520                  */
17521         }
17522
17523         if (remove_ipif) {
17524                 /*
17525                  * We set to a max value above for this case to get
17526                  * id zero. ASSERT that we did get one.
17527                  */
17528                 ASSERT((to_ipif->ipif_id == 0) && (unit == 0));
17529                 rep_ipif = to_ipif;
17530                 to_ill->ill_ipif = rep_ipif->ipif_next;
17531                 rep_ipif->ipif_next = NULL;
17532                 /*
17533                  * If some apps scanned and find this interface,
17534                  * it is time to let them know, so that they can
17535                  * delete it.
17536                  */
17537
17538                 *rep_ipif_ptr = rep_ipif;
17539         }
17540
17541         /* Get it out of the ILL interface list. */
17542         ipif_remove(ipif, B_FALSE);
17543
17544         /* Assign the new ill */
17545         ipif->ipif_ill = to_ill;
17546         ipif->ipif_id = unit;
17547         /* id has already been checked */
17548         rc = ipif_insert(ipif, B_FALSE, B_FALSE);
17549         ASSERT(rc == 0);
17550         /* Let SCTP update its list */
17551         sctp_move_ipif(ipif, from_ill, to_ill);
17552         /*
17553          * Handle the failover and failback of ipif_t between
17554          * ill_t that have differing maximum mtu values.
17555          */
17556         if (ipif->ipif_mtu > to_ill->ill_max_mtu) {
17557                 if (ipif->ipif_saved_mtu == 0) {
17558                         /*
17559                          * As this ipif_t is moving to an ill_t
17560                          * that has a lower ill_max_mtu, its
17561                          * ipif_mtu needs to be saved so it can
17562                          * be restored during failback or during
17563                          * failover to an ill_t which has a
17564                          * higher ill_max_mtu.
17565                          */
17566                         ipif->ipif_saved_mtu = ipif->ipif_mtu;
17567                         ipif->ipif_mtu = to_ill->ill_max_mtu;
17568                 } else {
17569                         /*
17570                          * The ipif_t is, once again, moving to
17571                          * an ill_t that has a lower maximum mtu
17572                          * value.
17573                          */
17574                         ipif->ipif_mtu = to_ill->ill_max_mtu;
17575                 }
17576         } else if (ipif->ipif_mtu < to_ill->ill_max_mtu &&
17577             ipif->ipif_saved_mtu != 0) {
17578                 /*
17579                  * The mtu of this ipif_t had to be reduced
17580                  * during an earlier failover; this is an
17581                  * opportunity for it to be increased (either as
17582                  * part of another failover or a failback).
17583                  */
17584                 if (ipif->ipif_saved_mtu <= to_ill->ill_max_mtu) {
17585                         ipif->ipif_mtu = ipif->ipif_saved_mtu;
17586                         ipif->ipif_saved_mtu = 0;
17587                 } else {
17588                         ipif->ipif_mtu = to_ill->ill_max_mtu;
17589                 }
17590         }
17591
17592         /*
17593          * We preserve all the other fields of the ipif including
17594          * ipif_saved_ire_mp. The routes that are saved here will
17595          * be recreated on the new interface and back on the old
17596          * interface when we move back.
17597          */
17598         ASSERT(ipif->ipif_arp_del_mp == NULL);
17599
17600         return (err);
17601 }
17602
17603 static int
17604 ipif_move_all(ill_t *from_ill, ill_t *to_ill, queue_t *q, mblk_t *mp,
17605     int ifindex, ipif_t **rep_ipif_ptr)
17606 {
17607         ipif_t *mipif;
17608         ipif_t *ipif_next;
17609         int err;
17610
17611         /*
17612          * We don't really try to MOVE back things if some of the
17613          * operations fail. The daemon will take care of moving again
17614          * later on.
17615          */
17616         for (mipif = from_ill->ill_ipif; mipif != NULL; mipif = ipif_next) {
17617                 ipif_next = mipif->ipif_next;
17618                 if (!(mipif->ipif_flags & IPIF_NOFAILOVER) &&
17619                     (ifindex == 0 || ifindex == mipif->ipif_orig_ifindex)) {
17620
17621                         err = ipif_move(mipif, to_ill, q, mp, rep_ipif_ptr);
17622
17623                         /*
17624                          * When the MOVE fails, it is the job of the
17625                          * application to take care of this properly
17626                          * i.e try again if it is ENOMEM.
17627                          */
17628                         if (mipif->ipif_ill != from_ill) {
17629                                 /*
17630                                  * ipif has moved.
17631                                  *
17632                                  * Move the multicast memberships associated
17633                                  * with this ipif to the new ill. For IPv6, we
17634                                  * do it once after all the ipifs are moved
17635                                  * (in ill_move) as they are not associated
17636                                  * with ipifs.
17637                                  *
17638                                  * We need to move the ilms as the ipif has
17639                                  * already been moved to a new ill even
17640                                  * in the case of errors. Neither
17641                                  * ilm_free(ipif) will find the ilm
17642                                  * when somebody unplumbs this ipif nor
17643                                  * ilm_delete(ilm) will be able to find the
17644                                  * ilm, if we don't move now.
17645                                  */
17646                                 if (!from_ill->ill_isv6)
17647                                         ilm_move_v4(from_ill, to_ill, mipif);
17648                         }
17649
17650                         if (err != 0)
17651                                 return (err);
17652                 }
17653         }
17654         return (0);
17655 }
17656
17657 static int
17658 ill_move(ill_t *from_ill, ill_t *to_ill, queue_t *q, mblk_t *mp)
17659 {
17660         int ifindex;
17661         int err;
17662         struct iocblk   *iocp;
17663         ipif_t  *ipif;
17664         ipif_t *rep_ipif_ptr = NULL;
17665         ipif_t  *from_ipif = NULL;
17666         boolean_t check_rep_if = B_FALSE;
17667         ip_stack_t      *ipst = from_ill->ill_ipst;
17668
17669         iocp = (struct iocblk *)mp->b_rptr;
17670         if (iocp->ioc_cmd == SIOCLIFFAILOVER) {
17671                 /*
17672                  * Move everything pointing at from_ill to to_ill.
17673                  * We acheive this by passing in 0 as ifindex.
17674                  */
17675                 ifindex = 0;
17676         } else {
17677                 /*
17678                  * Move everything pointing at from_ill whose original
17679                  * ifindex of connp, ipif, ilm points at to_ill->ill_index.
17680                  * We acheive this by passing in ifindex rather than 0.
17681                  * Multicast vifs, ilgs move implicitly because ipifs move.
17682                  */
17683                 ASSERT(iocp->ioc_cmd == SIOCLIFFAILBACK);
17684                 ifindex = to_ill->ill_phyint->phyint_ifindex;
17685         }
17686
17687         /*
17688          * Determine if there is at least one ipif that would move from
17689          * 'from_ill' to 'to_ill'. If so, it is possible that the replacement
17690          * ipif (if it exists) on the to_ill would be consumed as a result of
17691          * the move, in which case we need to quiesce the replacement ipif also.
17692          */
17693         for (from_ipif = from_ill->ill_ipif; from_ipif != NULL;
17694             from_ipif = from_ipif->ipif_next) {
17695                 if (((ifindex == 0) ||
17696                     (ifindex == from_ipif->ipif_orig_ifindex)) &&
17697                     !(from_ipif->ipif_flags & IPIF_NOFAILOVER)) {
17698                         check_rep_if = B_TRUE;
17699                         break;
17700                 }
17701         }
17702
17703
17704         ill_down_ipifs(from_ill, mp, ifindex, B_TRUE);
17705
17706         GRAB_ILL_LOCKS(from_ill, to_ill);
17707         if ((ipif = ill_quiescent_to_move(from_ill)) != NULL) {
17708                 (void) ipsq_pending_mp_add(NULL, ipif, q,
17709                     mp, ILL_MOVE_OK);
17710                 RELEASE_ILL_LOCKS(from_ill, to_ill);
17711                 return (EINPROGRESS);
17712         }
17713
17714         /* Check if the replacement ipif is quiescent to delete */
17715         if (check_rep_if && IPIF_REPL_CHECK(to_ill->ill_ipif,
17716             (iocp->ioc_cmd == SIOCLIFFAILBACK))) {
17717                 to_ill->ill_ipif->ipif_state_flags |=
17718                     IPIF_MOVING | IPIF_CHANGING;
17719                 if ((ipif = ill_quiescent_to_move(to_ill)) != NULL) {
17720                         (void) ipsq_pending_mp_add(NULL, ipif, q,
17721                             mp, ILL_MOVE_OK);
17722                         RELEASE_ILL_LOCKS(from_ill, to_ill);
17723                         return (EINPROGRESS);
17724                 }
17725         }
17726         RELEASE_ILL_LOCKS(from_ill, to_ill);
17727
17728         ASSERT(!MUTEX_HELD(&to_ill->ill_lock));
17729         rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
17730         GRAB_ILL_LOCKS(from_ill, to_ill);
17731         err = ipif_move_all(from_ill, to_ill, q, mp, ifindex, &rep_ipif_ptr);
17732
17733         /* ilm_move is done inside ipif_move for IPv4 */
17734         if (err == 0 && from_ill->ill_isv6)
17735                 ilm_move_v6(from_ill, to_ill, ifindex);
17736
17737         RELEASE_ILL_LOCKS(from_ill, to_ill);
17738         rw_exit(&ipst->ips_ill_g_lock);
17739
17740         /*
17741          * send rts messages and multicast messages.
17742          */
17743         if (rep_ipif_ptr != NULL) {
17744                 if (rep_ipif_ptr->ipif_recovery_id != 0) {
17745                         (void) untimeout(rep_ipif_ptr->ipif_recovery_id);
17746                         rep_ipif_ptr->ipif_recovery_id = 0;
17747                 }
17748                 ip_rts_ifmsg(rep_ipif_ptr);
17749                 ip_rts_newaddrmsg(RTM_DELETE, 0, rep_ipif_ptr);
17750 #ifdef DEBUG
17751                 ipif_trace_cleanup(rep_ipif_ptr);
17752 #endif
17753                 mi_free(rep_ipif_ptr);
17754         }
17755
17756         conn_move_ill(from_ill, to_ill, ifindex);
17757
17758         return (err);
17759 }
17760
17761 /*
17762  * Used to extract arguments for FAILOVER/FAILBACK ioctls.
17763  * Also checks for the validity of the arguments.
17764  * Note: We are already exclusive inside the from group.
17765  * It is upto the caller to release refcnt on the to_ill's.
17766  */
17767 static int
17768 ip_extract_move_args(queue_t *q, mblk_t *mp, ill_t **ill_from_v4,
17769     ill_t **ill_from_v6, ill_t **ill_to_v4, ill_t **ill_to_v6)
17770 {
17771         int dst_index;
17772         ipif_t *ipif_v4, *ipif_v6;
17773         struct lifreq *lifr;
17774         mblk_t *mp1;
17775         boolean_t exists;
17776         sin_t   *sin;
17777         int     err = 0;
17778         ip_stack_t      *ipst;
17779
17780         if (CONN_Q(q))
17781                 ipst = CONNQ_TO_IPST(q);
17782         else
17783                 ipst = ILLQ_TO_IPST(q);
17784
17785
17786         if ((mp1 = mp->b_cont) == NULL)
17787                 return (EPROTO);
17788
17789         if ((mp1 = mp1->b_cont) == NULL)
17790                 return (EPROTO);
17791
17792         lifr = (struct lifreq *)mp1->b_rptr;
17793         sin = (sin_t *)&lifr->lifr_addr;
17794
17795         /*
17796          * We operate on both IPv4 and IPv6. Thus, we don't allow IPv4/IPv6
17797          * specific operations.
17798          */
17799         if (sin->sin_family != AF_UNSPEC)
17800                 return (EINVAL);
17801
17802         /*
17803          * Get ipif with id 0. We are writer on the from ill. So we can pass
17804          * NULLs for the last 4 args and we know the lookup won't fail
17805          * with EINPROGRESS.
17806          */
17807         ipif_v4 = ipif_lookup_on_name(lifr->lifr_name,
17808             mi_strlen(lifr->lifr_name), B_FALSE, &exists, B_FALSE,
17809             ALL_ZONES, NULL, NULL, NULL, NULL, ipst);
17810         ipif_v6 = ipif_lookup_on_name(lifr->lifr_name,
17811             mi_strlen(lifr->lifr_name), B_FALSE, &exists, B_TRUE,
17812             ALL_ZONES, NULL, NULL, NULL, NULL, ipst);
17813
17814         if (ipif_v4 == NULL && ipif_v6 == NULL)
17815                 return (ENXIO);
17816
17817         if (ipif_v4 != NULL) {
17818                 ASSERT(ipif_v4->ipif_refcnt != 0);
17819                 if (ipif_v4->ipif_id != 0) {
17820                         err = EINVAL;
17821                         goto done;
17822                 }
17823
17824                 ASSERT(IAM_WRITER_IPIF(ipif_v4));
17825                 *ill_from_v4 = ipif_v4->ipif_ill;
17826         }
17827
17828         if (ipif_v6 != NULL) {
17829                 ASSERT(ipif_v6->ipif_refcnt != 0);
17830                 if (ipif_v6->ipif_id != 0) {
17831                         err = EINVAL;
17832                         goto done;
17833                 }
17834
17835                 ASSERT(IAM_WRITER_IPIF(ipif_v6));
17836                 *ill_from_v6 = ipif_v6->ipif_ill;
17837         }
17838
17839         err = 0;
17840         dst_index = lifr->lifr_movetoindex;
17841         *ill_to_v4 = ill_lookup_on_ifindex(dst_index, B_FALSE,
17842             q, mp, ip_process_ioctl, &err, ipst);
17843         if (err != 0) {
17844                 /*
17845                  * There could be only v6.
17846                  */
17847                 if (err != ENXIO)
17848                         goto done;
17849                 err = 0;
17850         }
17851
17852         *ill_to_v6 = ill_lookup_on_ifindex(dst_index, B_TRUE,
17853             q, mp, ip_process_ioctl, &err, ipst);
17854         if (err != 0) {
17855                 if (err != ENXIO)
17856                         goto done;
17857                 if (*ill_to_v4 == NULL) {
17858                         err = ENXIO;
17859                         goto done;
17860                 }
17861                 err = 0;
17862         }
17863
17864         /*
17865          * If we have something to MOVE i.e "from" not NULL,
17866          * "to" should be non-NULL.
17867          */
17868         if ((*ill_from_v4 != NULL && *ill_to_v4 == NULL) ||
17869             (*ill_from_v6 != NULL && *ill_to_v6 == NULL)) {
17870                 err = EINVAL;
17871         }
17872
17873 done:
17874         if (ipif_v4 != NULL)
17875                 ipif_refrele(ipif_v4);
17876         if (ipif_v6 != NULL)
17877                 ipif_refrele(ipif_v6);
17878         return (err);
17879 }
17880
17881 /*
17882  * FAILOVER and FAILBACK are modelled as MOVE operations.
17883  *
17884  * We don't check whether the MOVE is within the same group or
17885  * not, because this ioctl can be used as a generic mechanism
17886  * to failover from interface A to B, though things will function
17887  * only if they are really part of the same group. Moreover,
17888  * all ipifs may be down and hence temporarily out of the group.
17889  *
17890  * ipif's that need to be moved are first brought down; V4 ipifs are brought
17891  * down first and then V6.  For each we wait for the ipif's to become quiescent.
17892  * Bringing down the ipifs ensures that all ires pointing to these ipifs's
17893  * have been deleted and there are no active references. Once quiescent the
17894  * ipif's are moved and brought up on the new ill.
17895  *
17896  * Normally the source ill and destination ill belong to the same IPMP group
17897  * and hence the same ipsq_t. In the event they don't belong to the same
17898  * same group the two ipsq's are first merged into one ipsq - that of the
17899  * to_ill. The multicast memberships on the source and destination ill cannot
17900  * change during the move operation since multicast joins/leaves also have to
17901  * execute on the same ipsq and are hence serialized.
17902  */
17903 /* ARGSUSED */
17904 int
17905 ip_sioctl_move(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
17906     ip_ioctl_cmd_t *ipip, void *ifreq)
17907 {
17908         ill_t *ill_to_v4 = NULL;
17909         ill_t *ill_to_v6 = NULL;
17910         ill_t *ill_from_v4 = NULL;
17911         ill_t *ill_from_v6 = NULL;
17912         int err = 0;
17913
17914         /*
17915          * setup from and to ill's, we can get EINPROGRESS only for
17916          * to_ill's.
17917          */
17918         err = ip_extract_move_args(q, mp, &ill_from_v4, &ill_from_v6,
17919             &ill_to_v4, &ill_to_v6);
17920
17921         if (err != 0) {
17922                 ip0dbg(("ip_sioctl_move: extract args failed\n"));
17923                 goto done;
17924         }
17925
17926         /*
17927          * nothing to do.
17928          */
17929         if ((ill_from_v4 != NULL) && (ill_from_v4 == ill_to_v4)) {
17930                 goto done;
17931         }
17932
17933         /*
17934          * nothing to do.
17935          */
17936         if ((ill_from_v6 != NULL) && (ill_from_v6 == ill_to_v6)) {
17937                 goto done;
17938         }
17939
17940         /*
17941          * Mark the ill as changing.
17942          * ILL_CHANGING flag is cleared when the ipif's are brought up
17943          * in ill_up_ipifs in case of error they are cleared below.
17944          */
17945
17946         GRAB_ILL_LOCKS(ill_from_v4, ill_from_v6);
17947         if (ill_from_v4 != NULL)
17948                 ill_from_v4->ill_state_flags |= ILL_CHANGING;
17949         if (ill_from_v6 != NULL)
17950                 ill_from_v6->ill_state_flags |= ILL_CHANGING;
17951         RELEASE_ILL_LOCKS(ill_from_v4, ill_from_v6);
17952
17953         /*
17954          * Make sure that both src and dst are
17955          * in the same syncq group. If not make it happen.
17956          * We are not holding any locks because we are the writer
17957          * on the from_ipsq and we will hold locks in ill_merge_groups
17958          * to protect to_ipsq against changing.
17959          */
17960         if (ill_from_v4 != NULL) {
17961                 if (ill_from_v4->ill_phyint->phyint_ipsq !=
17962                     ill_to_v4->ill_phyint->phyint_ipsq) {
17963                         err = ill_merge_groups(ill_from_v4, ill_to_v4,
17964                             NULL, mp, q);
17965                         goto err_ret;
17966
17967                 }
17968                 ASSERT(!MUTEX_HELD(&ill_to_v4->ill_lock));
17969         } else {
17970
17971                 if (ill_from_v6->ill_phyint->phyint_ipsq !=
17972                     ill_to_v6->ill_phyint->phyint_ipsq) {
17973                         err = ill_merge_groups(ill_from_v6, ill_to_v6,
17974                             NULL, mp, q);
17975                         goto err_ret;
17976
17977                 }
17978                 ASSERT(!MUTEX_HELD(&ill_to_v6->ill_lock));
17979         }
17980
17981         /*
17982          * Now that the ipsq's have been merged and we are the writer
17983          * lets mark to_ill as changing as well.
17984          */
17985
17986         GRAB_ILL_LOCKS(ill_to_v4, ill_to_v6);
17987         if (ill_to_v4 != NULL)
17988                 ill_to_v4->ill_state_flags |= ILL_CHANGING;
17989         if (ill_to_v6 != NULL)
17990                 ill_to_v6->ill_state_flags |= ILL_CHANGING;
17991         RELEASE_ILL_LOCKS(ill_to_v4, ill_to_v6);
17992
17993         /*
17994          * Its ok for us to proceed with the move even if
17995          * ill_pending_mp is non null on one of the from ill's as the reply
17996          * should not be looking at the ipif, it should only care about the
17997          * ill itself.
17998          */
17999
18000         /*
18001          * lets move ipv4 first.
18002          */
18003         if (ill_from_v4 != NULL) {
18004                 ASSERT(IAM_WRITER_ILL(ill_to_v4));
18005                 ill_from_v4->ill_move_in_progress = B_TRUE;
18006                 ill_to_v4->ill_move_in_progress = B_TRUE;
18007                 ill_to_v4->ill_move_peer = ill_from_v4;
18008                 ill_from_v4->ill_move_peer = ill_to_v4;
18009                 err = ill_move(ill_from_v4, ill_to_v4, q, mp);
18010         }
18011
18012         /*
18013          * Now lets move ipv6.
18014          */
18015         if (err == 0 && ill_from_v6 != NULL) {
18016                 ASSERT(IAM_WRITER_ILL(ill_to_v6));
18017                 ill_from_v6->ill_move_in_progress = B_TRUE;
18018                 ill_to_v6->ill_move_in_progress = B_TRUE;
18019                 ill_to_v6->ill_move_peer = ill_from_v6;
18020                 ill_from_v6->ill_move_peer = ill_to_v6;
18021                 err = ill_move(ill_from_v6, ill_to_v6, q, mp);
18022         }
18023
18024 err_ret:
18025         /*
18026          * EINPROGRESS means we are waiting for the ipif's that need to be
18027          * moved to become quiescent.
18028          */
18029         if (err == EINPROGRESS) {
18030                 goto done;
18031         }
18032
18033         /*
18034          * if err is set ill_up_ipifs will not be called
18035          * lets clear the flags.
18036          */
18037
18038         GRAB_ILL_LOCKS(ill_to_v4, ill_to_v6);
18039         GRAB_ILL_LOCKS(ill_from_v4, ill_from_v6);
18040         /*
18041          * Some of the clearing may be redundant. But it is simple
18042          * not making any extra checks.
18043          */
18044         if (ill_from_v6 != NULL) {
18045                 ill_from_v6->ill_move_in_progress = B_FALSE;
18046                 ill_from_v6->ill_move_peer = NULL;
18047                 ill_from_v6->ill_state_flags &= ~ILL_CHANGING;
18048         }
18049         if (ill_from_v4 != NULL) {
18050                 ill_from_v4->ill_move_in_progress = B_FALSE;
18051                 ill_from_v4->ill_move_peer = NULL;
18052                 ill_from_v4->ill_state_flags &= ~ILL_CHANGING;
18053         }
18054         if (ill_to_v6 != NULL) {
18055                 ill_to_v6->ill_move_in_progress = B_FALSE;
18056                 ill_to_v6->ill_move_peer = NULL;
18057                 ill_to_v6->ill_state_flags &= ~ILL_CHANGING;
18058         }
18059         if (ill_to_v4 != NULL) {
18060                 ill_to_v4->ill_move_in_progress = B_FALSE;
18061                 ill_to_v4->ill_move_peer = NULL;
18062                 ill_to_v4->ill_state_flags &= ~ILL_CHANGING;
18063         }
18064
18065         /*
18066          * Check for setting INACTIVE, if STANDBY is set and FAILED is not set.
18067          * Do this always to maintain proper state i.e even in case of errors.
18068          * As phyint_inactive looks at both v4 and v6 interfaces,
18069          * we need not call on both v4 and v6 interfaces.
18070          */
18071         if (ill_from_v4 != NULL) {
18072                 if ((ill_from_v4->ill_phyint->phyint_flags &
18073                     (PHYI_STANDBY | PHYI_FAILED)) == PHYI_STANDBY) {
18074                         phyint_inactive(ill_from_v4->ill_phyint);
18075                 }
18076         } else if (ill_from_v6 != NULL) {
18077                 if ((ill_from_v6->ill_phyint->phyint_flags &
18078                     (PHYI_STANDBY | PHYI_FAILED)) == PHYI_STANDBY) {
18079                         phyint_inactive(ill_from_v6->ill_phyint);
18080                 }
18081         }
18082
18083         if (ill_to_v4 != NULL) {
18084                 if (ill_to_v4->ill_phyint->phyint_flags & PHYI_INACTIVE) {
18085                         ill_to_v4->ill_phyint->phyint_flags &= ~PHYI_INACTIVE;
18086                 }
18087         } else if (ill_to_v6 != NULL) {
18088                 if (ill_to_v6->ill_phyint->phyint_flags & PHYI_INACTIVE) {
18089                         ill_to_v6->ill_phyint->phyint_flags &= ~PHYI_INACTIVE;
18090                 }
18091         }
18092
18093         RELEASE_ILL_LOCKS(ill_to_v4, ill_to_v6);
18094         RELEASE_ILL_LOCKS(ill_from_v4, ill_from_v6);
18095
18096 no_err:
18097         /*
18098          * lets bring the interfaces up on the to_ill.
18099          */
18100         if (err == 0) {
18101                 err = ill_up_ipifs(ill_to_v4 == NULL ? ill_to_v6:ill_to_v4,
18102                     q, mp);
18103         }
18104
18105         if (err == 0) {
18106                 if (ill_from_v4 != NULL && ill_to_v4 != NULL)
18107                         ilm_send_multicast_reqs(ill_from_v4, ill_to_v4);
18108
18109                 if (ill_from_v6 != NULL && ill_to_v6 != NULL)
18110                         ilm_send_multicast_reqs(ill_from_v6, ill_to_v6);
18111         }
18112 done:
18113
18114         if (ill_to_v4 != NULL) {
18115                 ill_refrele(ill_to_v4);
18116         }
18117         if (ill_to_v6 != NULL) {
18118                 ill_refrele(ill_to_v6);
18119         }
18120
18121         return (err);
18122 }
18123
18124 static void
18125 ill_dl_down(ill_t *ill)
18126 {
18127         /*
18128          * The ill is down; unbind but stay attached since we're still
18129          * associated with a PPA. If we have negotiated DLPI capabilites
18130          * with the data link service provider (IDS_OK) then reset them.
18131          * The interval between unbinding and rebinding is potentially
18132          * unbounded hence we cannot assume things will be the same.
18133          * The DLPI capabilities will be probed again when the data link
18134          * is brought up.
18135          */
18136         mblk_t  *mp = ill->ill_unbind_mp;
18137         hook_nic_event_t *info;
18138
18139         ip1dbg(("ill_dl_down(%s)\n", ill->ill_name));
18140
18141         ill->ill_unbind_mp = NULL;
18142         if (mp != NULL) {
18143                 ip1dbg(("ill_dl_down: %s (%u) for %s\n",
18144                     dlpi_prim_str(*(int *)mp->b_rptr), *(int *)mp->b_rptr,
18145                     ill->ill_name));
18146                 mutex_enter(&ill->ill_lock);
18147                 ill->ill_state_flags |= ILL_DL_UNBIND_IN_PROGRESS;
18148                 mutex_exit(&ill->ill_lock);
18149                 /*
18150                  * Reset the capabilities if the negotiation is done or is
18151                  * still in progress. Note that ill_capability_reset() will
18152                  * set ill_dlpi_capab_state to IDS_UNKNOWN, so the subsequent
18153                  * DL_CAPABILITY_ACK and DL_NOTE_CAPAB_RENEG will be ignored.
18154                  *
18155                  * Further, reset ill_capab_reneg to be B_FALSE so that the
18156                  * subsequent DL_CAPABILITY_ACK can be ignored, to prevent
18157                  * the capabilities renegotiation from happening.
18158                  */
18159                 if (ill->ill_dlpi_capab_state != IDS_UNKNOWN)
18160                         ill_capability_reset(ill);
18161                 ill->ill_capab_reneg = B_FALSE;
18162
18163                 ill_dlpi_send(ill, mp);
18164         }
18165
18166         /*
18167          * Toss all of our multicast memberships.  We could keep them, but
18168          * then we'd have to do bookkeeping of any joins and leaves performed
18169          * by the application while the the interface is down (we can't just
18170          * issue them because arp cannot currently process AR_ENTRY_SQUERY's
18171          * on a downed interface).
18172          */
18173         ill_leave_multicast(ill);
18174
18175         mutex_enter(&ill->ill_lock);
18176
18177         ill->ill_dl_up = 0;
18178
18179         if ((info = ill->ill_nic_event_info) != NULL) {
18180                 ip2dbg(("ill_dl_down:unexpected nic event %d attached for %s\n",
18181                     info->hne_event, ill->ill_name));
18182                 if (info->hne_data != NULL)
18183                         kmem_free(info->hne_data, info->hne_datalen);
18184                 kmem_free(info, sizeof (hook_nic_event_t));
18185         }
18186
18187         info = kmem_alloc(sizeof (hook_nic_event_t), KM_NOSLEEP);
18188         if (info != NULL) {
18189                 ip_stack_t      *ipst = ill->ill_ipst;
18190
18191                 info->hne_nic = ill->ill_phyint->phyint_hook_ifindex;
18192                 info->hne_lif = 0;
18193                 info->hne_event = NE_DOWN;
18194                 info->hne_data = NULL;
18195                 info->hne_datalen = 0;
18196                 info->hne_family = ill->ill_isv6 ?
18197                     ipst->ips_ipv6_net_data : ipst->ips_ipv4_net_data;
18198         } else
18199                 ip2dbg(("ill_dl_down: could not attach DOWN nic event "
18200                     "information for %s (ENOMEM)\n", ill->ill_name));
18201
18202         ill->ill_nic_event_info = info;
18203
18204         mutex_exit(&ill->ill_lock);
18205 }
18206
18207 static void
18208 ill_dlpi_dispatch(ill_t *ill, mblk_t *mp)
18209 {
18210         union DL_primitives *dlp;
18211         t_uscalar_t prim;
18212
18213         ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);
18214
18215         dlp = (union DL_primitives *)mp->b_rptr;
18216         prim = dlp->dl_primitive;
18217
18218         ip1dbg(("ill_dlpi_dispatch: sending %s (%u) to %s\n",
18219             dlpi_prim_str(prim), prim, ill->ill_name));
18220
18221         switch (prim) {
18222         case DL_PHYS_ADDR_REQ:
18223         {
18224                 dl_phys_addr_req_t *dlpap = (dl_phys_addr_req_t *)mp->b_rptr;
18225                 ill->ill_phys_addr_pend = dlpap->dl_addr_type;
18226                 break;
18227         }
18228         case DL_BIND_REQ:
18229                 mutex_enter(&ill->ill_lock);
18230                 ill->ill_state_flags &= ~ILL_DL_UNBIND_IN_PROGRESS;
18231                 mutex_exit(&ill->ill_lock);
18232                 break;
18233         }
18234
18235         /*
18236          * Except for the ACKs for the M_PCPROTO messages, all other ACKs
18237          * are dropped by ip_rput() if ILL_CONDEMNED is set. Therefore
18238          * we only wait for the ACK of the DL_UNBIND_REQ.
18239          */
18240         mutex_enter(&ill->ill_lock);
18241         if (!(ill->ill_state_flags & ILL_CONDEMNED) ||
18242             (prim == DL_UNBIND_REQ)) {
18243                 ill->ill_dlpi_pending = prim;
18244         }
18245         mutex_exit(&ill->ill_lock);
18246
18247         putnext(ill->ill_wq, mp);
18248 }
18249
18250 /*
18251  * Helper function for ill_dlpi_send().
18252  */
18253 /* ARGSUSED */
18254 static void
18255 ill_dlpi_send_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg)
18256 {
18257         ill_dlpi_send((ill_t *)q->q_ptr, mp);
18258 }
18259
18260 /*
18261  * Send a DLPI control message to the driver but make sure there
18262  * is only one outstanding message. Uses ill_dlpi_pending to tell
18263  * when it must queue. ip_rput_dlpi_writer calls ill_dlpi_done()
18264  * when an ACK or a NAK is received to process the next queued message.
18265  */
18266 void
18267 ill_dlpi_send(ill_t *ill, mblk_t *mp)
18268 {
18269         mblk_t **mpp;
18270
18271         ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);
18272
18273         /*
18274          * To ensure that any DLPI requests for current exclusive operation
18275          * are always completely sent before any DLPI messages for other
18276          * operations, require writer access before enqueuing.
18277          */
18278         if (!IAM_WRITER_ILL(ill)) {
18279                 ill_refhold(ill);
18280                 /* qwriter_ip() does the ill_refrele() */
18281                 qwriter_ip(ill, ill->ill_wq, mp, ill_dlpi_send_writer,
18282                     NEW_OP, B_TRUE);
18283                 return;
18284         }
18285
18286         mutex_enter(&ill->ill_lock);
18287         if (ill->ill_dlpi_pending != DL_PRIM_INVAL) {
18288                 /* Must queue message. Tail insertion */
18289                 mpp = &ill->ill_dlpi_deferred;
18290                 while (*mpp != NULL)
18291                         mpp = &((*mpp)->b_next);
18292
18293                 ip1dbg(("ill_dlpi_send: deferring request for %s\n",
18294                     ill->ill_name));
18295
18296                 *mpp = mp;
18297                 mutex_exit(&ill->ill_lock);
18298                 return;
18299         }
18300         mutex_exit(&ill->ill_lock);
18301         ill_dlpi_dispatch(ill, mp);
18302 }
18303
18304 /*
18305  * Send all deferred DLPI messages without waiting for their ACKs.
18306  */
18307 void
18308 ill_dlpi_send_deferred(ill_t *ill)
18309 {
18310         mblk_t *mp, *nextmp;
18311
18312         /*
18313          * Clear ill_dlpi_pending so that the message is not queued in
18314          * ill_dlpi_send().
18315          */
18316         mutex_enter(&ill->ill_lock);
18317         ill->ill_dlpi_pending = DL_PRIM_INVAL;
18318         mp = ill->ill_dlpi_deferred;
18319         ill->ill_dlpi_deferred = NULL;
18320         mutex_exit(&ill->ill_lock);
18321
18322         for (; mp != NULL; mp = nextmp) {
18323                 nextmp = mp->b_next;
18324                 mp->b_next = NULL;
18325                 ill_dlpi_send(ill, mp);
18326         }
18327 }
18328
18329 /*
18330  * Check if the DLPI primitive `prim' is pending; print a warning if not.
18331  */
18332 boolean_t
18333 ill_dlpi_pending(ill_t *ill, t_uscalar_t prim)
18334 {
18335         t_uscalar_t pending;
18336
18337         mutex_enter(&ill->ill_lock);
18338         if (ill->ill_dlpi_pending == prim) {
18339                 mutex_exit(&ill->ill_lock);
18340                 return (B_TRUE);
18341         }
18342
18343         /*
18344          * During teardown, ill_dlpi_dispatch() will send DLPI requests
18345          * without waiting, so don't print any warnings in that case.
18346          */
18347         if (ill->ill_state_flags & ILL_CONDEMNED) {
18348                 mutex_exit(&ill->ill_lock);
18349                 return (B_FALSE);
18350         }
18351         pending = ill->ill_dlpi_pending;
18352         mutex_exit(&ill->ill_lock);
18353
18354         if (pending == DL_PRIM_INVAL) {
18355                 (void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE,
18356                     "received unsolicited ack for %s on %s\n",
18357                     dlpi_prim_str(prim), ill->ill_name);
18358         } else {
18359                 (void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE,
18360                     "received unexpected ack for %s on %s (expecting %s)\n",
18361                     dlpi_prim_str(prim), ill->ill_name, dlpi_prim_str(pending));
18362         }
18363         return (B_FALSE);
18364 }
18365
18366 /*
18367  * Called when an DLPI control message has been acked or nacked to
18368  * send down the next queued message (if any).
18369  */
18370 void
18371 ill_dlpi_done(ill_t *ill, t_uscalar_t prim)
18372 {
18373         mblk_t *mp;
18374
18375         ASSERT(IAM_WRITER_ILL(ill));
18376         mutex_enter(&ill->ill_lock);
18377
18378         ASSERT(prim != DL_PRIM_INVAL);
18379         ASSERT(ill->ill_dlpi_pending == prim);
18380
18381         ip1dbg(("ill_dlpi_done: %s has completed %s (%u)\n", ill->ill_name,
18382             dlpi_prim_str(ill->ill_dlpi_pending), ill->ill_dlpi_pending));
18383
18384         if ((mp = ill->ill_dlpi_deferred) == NULL) {
18385                 ill->ill_dlpi_pending = DL_PRIM_INVAL;
18386                 cv_signal(&ill->ill_cv);
18387                 mutex_exit(&ill->ill_lock);
18388                 return;
18389         }
18390
18391         ill->ill_dlpi_deferred = mp->b_next;
18392         mp->b_next = NULL;
18393         mutex_exit(&ill->ill_lock);
18394
18395         ill_dlpi_dispatch(ill, mp);
18396 }
18397
18398 void
18399 conn_delete_ire(conn_t *connp, caddr_t arg)
18400 {
18401         ipif_t  *ipif = (ipif_t *)arg;
18402         ire_t   *ire;
18403
18404         /*
18405          * Look at the cached ires on conns which has pointers to ipifs.
18406          * We just call ire_refrele which clears up the reference
18407          * to ire. Called when a conn closes. Also called from ipif_free
18408          * to cleanup indirect references to the stale ipif via the cached ire.
18409          */
18410         mutex_enter(&connp->conn_lock);
18411         ire = connp->conn_ire_cache;
18412         if (ire != NULL && (ipif == NULL || ire->ire_ipif == ipif)) {
18413                 connp->conn_ire_cache = NULL;
18414                 mutex_exit(&connp->conn_lock);
18415                 IRE_REFRELE_NOTR(ire);
18416                 return;
18417         }
18418         mutex_exit(&connp->conn_lock);
18419
18420 }
18421
18422 /*
18423  * Some operations (illgrp_delete(), ipif_down()) conditionally delete a number
18424  * of IREs. Those IREs may have been previously cached in the conn structure.
18425  * This ipcl_walk() walker function releases all references to such IREs based
18426  * on the condemned flag.
18427  */
18428 /* ARGSUSED */
18429 void
18430 conn_cleanup_stale_ire(conn_t *connp, caddr_t arg)
18431 {
18432         ire_t   *ire;
18433
18434         mutex_enter(&connp->conn_lock);
18435         ire = connp->conn_ire_cache;
18436         if (ire != NULL && (ire->ire_marks & IRE_MARK_CONDEMNED)) {
18437                 connp->conn_ire_cache = NULL;
18438                 mutex_exit(&connp->conn_lock);
18439                 IRE_REFRELE_NOTR(ire);
18440                 return;
18441         }
18442         mutex_exit(&connp->conn_lock);
18443 }
18444
18445 /*
18446  * Take down a specific interface, but don't lose any information about it.
18447  * Also delete interface from its interface group (ifgrp).
18448  * (Always called as writer.)
18449  * This function goes through the down sequence even if the interface is
18450  * already down. There are 2 reasons.
18451  * a. Currently we permit interface routes that depend on down interfaces
18452  *    to be added. This behaviour itself is questionable. However it appears
18453  *    that both Solaris and 4.3 BSD have exhibited this behaviour for a long
18454  *    time. We go thru the cleanup in order to remove these routes.
18455  * b. The bringup of the interface could fail in ill_dl_up i.e. we get
18456  *    DL_ERROR_ACK in response to the DL_BIND request. The interface is
18457  *    down, but we need to cleanup i.e. do ill_dl_down and
18458  *    ip_rput_dlpi_writer (DL_ERROR_ACK) -> ipif_down.
18459  *
18460  * IP-MT notes:
18461  *
18462  * Model of reference to interfaces.
18463  *
18464  * The following members in ipif_t track references to the ipif.
18465  *      int     ipif_refcnt;    Active reference count
18466  *      uint_t  ipif_ire_cnt;   Number of ire's referencing this ipif
18467  * The following members in ill_t track references to the ill.
18468  *      int             ill_refcnt;     active refcnt
18469  *      uint_t          ill_ire_cnt;    Number of ires referencing ill
18470  *      uint_t          ill_nce_cnt;    Number of nces referencing ill
18471  *
18472  * Reference to an ipif or ill can be obtained in any of the following ways.
18473  *
18474  * Through the lookup functions ipif_lookup_* / ill_lookup_* functions
18475  * Pointers to ipif / ill from other data structures viz ire and conn.
18476  * Implicit reference to the ipif / ill by holding a reference to the ire.
18477  *
18478  * The ipif/ill lookup functions return a reference held ipif / ill.
18479  * ipif_refcnt and ill_refcnt track the reference counts respectively.
18480  * This is a purely dynamic reference count associated with threads holding
18481  * references to the ipif / ill. Pointers from other structures do not
18482  * count towards this reference count.
18483  *
18484  * ipif_ire_cnt/ill_ire_cnt is the number of ire's associated with the
18485  * ipif/ill. This is incremented whenever a new ire is created referencing the
18486  * ipif/ill. This is done atomically inside ire_add_v[46] where the ire is
18487  * actually added to the ire hash table. The count is decremented in
18488  * ire_inactive where the ire is destroyed.
18489  *
18490  * nce's reference ill's thru nce_ill and the count of nce's associated with
18491  * an ill is recorded in ill_nce_cnt. This is incremented atomically in
18492  * ndp_add_v4()/ndp_add_v6() where the nce is actually added to the
18493  * table. Similarly it is decremented in ndp_inactive() where the nce
18494  * is destroyed.
18495  *
18496  * Flow of ioctls involving interface down/up
18497  *
18498  * The following is the sequence of an attempt to set some critical flags on an
18499  * up interface.
18500  * ip_sioctl_flags
18501  * ipif_down
18502  * wait for ipif to be quiescent
18503  * ipif_down_tail
18504  * ip_sioctl_flags_tail
18505  *
18506  * All set ioctls that involve down/up sequence would have a skeleton similar
18507  * to the above. All the *tail functions are called after the refcounts have
18508  * dropped to the appropriate values.
18509  *
18510  * The mechanism to quiesce an ipif is as follows.
18511  *
18512  * Mark the ipif as IPIF_CHANGING. No more lookups will be allowed
18513  * on the ipif. Callers either pass a flag requesting wait or the lookup
18514  *  functions will return NULL.
18515  *
18516  * Delete all ires referencing this ipif
18517  *
18518  * Any thread attempting to do an ipif_refhold on an ipif that has been
18519  * obtained thru a cached pointer will first make sure that
18520  * the ipif can be refheld using the macro IPIF_CAN_LOOKUP and only then
18521  * increment the refcount.
18522  *
18523  * The above guarantees that the ipif refcount will eventually come down to
18524  * zero and the ipif will quiesce, once all threads that currently hold a
18525  * reference to the ipif refrelease the ipif. The ipif is quiescent after the
18526  * ipif_refcount has dropped to zero and all ire's associated with this ipif
18527  * have also been ire_inactive'd. i.e. when ipif_ire_cnt and ipif_refcnt both
18528  * drop to zero.
18529  *
18530  * Lookups during the IPIF_CHANGING/ILL_CHANGING interval.
18531  *
18532  * Threads trying to lookup an ipif or ill can pass a flag requesting
18533  * wait and restart if the ipif / ill cannot be looked up currently.
18534  * For eg. bind, and route operations (Eg. route add / delete) cannot return
18535  * failure if the ipif is currently undergoing an exclusive operation, and
18536  * hence pass the flag. The mblk is then enqueued in the ipsq and the operation
18537  * is restarted by ipsq_exit() when the currently exclusive ioctl completes.
18538  * The lookup and enqueue is atomic using the ill_lock and ipsq_lock. The
18539  * lookup is done holding the ill_lock. Hence the ill/ipif state flags can't
18540  * change while the ill_lock is held. Before dropping the ill_lock we acquire
18541  * the ipsq_lock and call ipsq_enq. This ensures that ipsq_exit can't finish
18542  * until we release the ipsq_lock, even though the ill/ipif state flags
18543  * can change after we drop the ill_lock.
18544  *
18545  * An attempt to send out a packet using an ipif that is currently
18546  * IPIF_CHANGING will fail. No attempt is made in this case to enqueue this
18547  * operation and restart it later when the exclusive condition on the ipif ends.
18548  * This is an example of not passing the wait flag to the lookup functions. For
18549  * example an attempt to refhold and use conn->conn_multicast_ipif and send
18550  * out a multicast packet on that ipif will fail while the ipif is
18551  * IPIF_CHANGING. An attempt to create an IRE_CACHE using an ipif that is
18552  * currently IPIF_CHANGING will also fail.
18553  */
18554 int
18555 ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp)
18556 {
18557         ill_t           *ill = ipif->ipif_ill;
18558         phyint_t        *phyi;
18559         conn_t          *connp;
18560         boolean_t       success;
18561         boolean_t       ipif_was_up = B_FALSE;
18562         ip_stack_t      *ipst = ill->ill_ipst;
18563
18564         ASSERT(IAM_WRITER_IPIF(ipif));
18565
18566         ip1dbg(("ipif_down(%s:%u)\n", ill->ill_name, ipif->ipif_id));
18567
18568         if (ipif->ipif_flags & IPIF_UP) {
18569                 mutex_enter(&ill->ill_lock);
18570                 ipif->ipif_flags &= ~IPIF_UP;
18571                 ASSERT(ill->ill_ipif_up_count > 0);
18572                 --ill->ill_ipif_up_count;
18573                 mutex_exit(&ill->ill_lock);
18574                 ipif_was_up = B_TRUE;
18575                 /* Update status in SCTP's list */
18576                 sctp_update_ipif(ipif, SCTP_IPIF_DOWN);
18577         }
18578
18579         /*
18580          * Blow away memberships we established in ipif_multicast_up().
18581          */
18582         ipif_multicast_down(ipif);
18583
18584         /*
18585          * Remove from the mapping for __sin6_src_id. We insert only
18586          * when the address is not INADDR_ANY. As IPv4 addresses are
18587          * stored as mapped addresses, we need to check for mapped
18588          * INADDR_ANY also.
18589          */
18590         if (ipif_was_up && !IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) &&
18591             !IN6_IS_ADDR_V4MAPPED_ANY(&ipif->ipif_v6lcl_addr) &&
18592             !(ipif->ipif_flags & IPIF_NOLOCAL)) {
18593                 int err;
18594
18595                 err = ip_srcid_remove(&ipif->ipif_v6lcl_addr,
18596                     ipif->ipif_zoneid, ipst);
18597                 if (err != 0) {
18598                         ip0dbg(("ipif_down: srcid_remove %d\n", err));
18599                 }
18600         }
18601
18602         /*
18603          * Before we delete the ill from the group (if any), we need
18604          * to make sure that we delete all the routes dependent on
18605          * this and also any ipifs dependent on this ipif for
18606          * source address. We need to do before we delete from
18607          * the group because
18608          *
18609          * 1) ipif_down_delete_ire de-references ill->ill_group.
18610          *
18611          * 2) ipif_update_other_ipifs needs to walk the whole group
18612          *    for re-doing source address selection. Note that
18613          *    ipif_select_source[_v6] called from
18614          *    ipif_update_other_ipifs[_v6] will not pick this ipif
18615          *    because we have already marked down here i.e cleared
18616          *    IPIF_UP.
18617          */
18618         if (ipif->ipif_isv6) {
18619                 ire_walk_v6(ipif_down_delete_ire, (char *)ipif, ALL_ZONES,
18620                     ipst);
18621         } else {
18622                 ire_walk_v4(ipif_down_delete_ire, (char *)ipif, ALL_ZONES,
18623                     ipst);
18624         }
18625
18626         /*
18627          * Cleaning up the conn_ire_cache or conns must be done only after the
18628          * ires have been deleted above. Otherwise a thread could end up
18629          * caching an ire in a conn after we have finished the cleanup of the
18630          * conn. The caching is done after making sure that the ire is not yet
18631          * condemned. Also documented in the block comment above ip_output
18632          */
18633         ipcl_walk(conn_cleanup_stale_ire, NULL, ipst);
18634         /* Also, delete the ires cached in SCTP */
18635         sctp_ire_cache_flush(ipif);
18636
18637         /*
18638          * Update any other ipifs which have used "our" local address as
18639          * a source address. This entails removing and recreating IRE_INTERFACE
18640          * entries for such ipifs.
18641          */
18642         if (ipif->ipif_isv6)
18643                 ipif_update_other_ipifs_v6(ipif, ill->ill_group);
18644         else
18645                 ipif_update_other_ipifs(ipif, ill->ill_group);
18646
18647         if (ipif_was_up) {
18648                 /*
18649                  * Check whether it is last ipif to leave this group.
18650                  * If this is the last ipif to leave, we should remove
18651                  * this ill from the group as ipif_select_source will not
18652                  * be able to find any useful ipifs if this ill is selected
18653                  * for load balancing.
18654                  *
18655                  * For nameless groups, we should call ifgrp_delete if this
18656                  * belongs to some group. As this ipif is going down, we may
18657                  * need to reconstruct groups.
18658                  */
18659                 phyi = ill->ill_phyint;
18660                 /*
18661                  * If the phyint_groupname_len is 0, it may or may not
18662                  * be in the nameless group. If the phyint_groupname_len is
18663                  * not 0, then this ill should be part of some group.
18664                  * As we always insert this ill in the group if
18665                  * phyint_groupname_len is not zero when the first ipif
18666                  * comes up (in ipif_up_done), it should be in a group
18667                  * when the namelen is not 0.
18668                  *
18669                  * NOTE : When we delete the ill from the group,it will
18670                  * blow away all the IRE_CACHES pointing either at this ipif or
18671                  * ill_wq (illgrp_cache_delete does this). Thus, no IRES
18672                  * should be pointing at this ill.
18673                  */
18674                 ASSERT(phyi->phyint_groupname_len == 0 ||
18675                     (phyi->phyint_groupname != NULL && ill->ill_group != NULL));
18676
18677                 if (phyi->phyint_groupname_len != 0) {
18678                         if (ill->ill_ipif_up_count == 0)
18679                                 illgrp_delete(ill);
18680                 }
18681
18682                 /*
18683                  * If we have deleted some of the broadcast ires associated
18684                  * with this ipif, we need to re-nominate somebody else if
18685                  * the ires that we deleted were the nominated ones.
18686                  */
18687                 if (ill->ill_group != NULL && !ill->ill_isv6)
18688                         ipif_renominate_bcast(ipif);
18689         }
18690
18691         /*
18692          * neighbor-discovery or arp entries for this interface.
18693          */
18694         ipif_ndp_down(ipif);
18695
18696         /*
18697          * If mp is NULL the caller will wait for the appropriate refcnt.
18698          * Eg. ip_sioctl_removeif -> ipif_free  -> ipif_down
18699          * and ill_delete -> ipif_free -> ipif_down
18700          */
18701         if (mp == NULL) {
18702                 ASSERT(q == NULL);
18703                 return (0);
18704         }
18705
18706         if (CONN_Q(q)) {
18707                 connp = Q_TO_CONN(q);
18708                 mutex_enter(&connp->conn_lock);
18709         } else {
18710                 connp = NULL;
18711         }
18712         mutex_enter(&ill->ill_lock);
18713         /*
18714          * Are there any ire's pointing to this ipif that are still active ?
18715          * If this is the last ipif going down, are there any ire's pointing
18716          * to this ill that are still active ?
18717          */
18718         if (ipif_is_quiescent(ipif)) {
18719                 mutex_exit(&ill->ill_lock);
18720                 if (connp != NULL)
18721                         mutex_exit(&connp->conn_lock);
18722                 return (0);
18723         }
18724
18725         ip1dbg(("ipif_down: need to wait, adding pending mp %s ill %p",
18726             ill->ill_name, (void *)ill));
18727         /*
18728          * Enqueue the mp atomically in ipsq_pending_mp. When the refcount
18729          * drops down, the operation will be restarted by ipif_ill_refrele_tail
18730          * which in turn is called by the last refrele on the ipif/ill/ire.
18731          */
18732         success = ipsq_pending_mp_add(connp, ipif, q, mp, IPIF_DOWN);
18733         if (!success) {
18734                 /* The conn is closing. So just return */
18735                 ASSERT(connp != NULL);
18736                 mutex_exit(&ill->ill_lock);
18737                 mutex_exit(&connp->conn_lock);
18738                 return (EINTR);
18739         }
18740
18741         mutex_exit(&ill->ill_lock);
18742         if (connp != NULL)
18743                 mutex_exit(&connp->conn_lock);
18744         return (EINPROGRESS);
18745 }
18746
18747 void
18748 ipif_down_tail(ipif_t *ipif)
18749 {
18750         ill_t   *ill = ipif->ipif_ill;
18751
18752         /*
18753          * Skip any loopback interface (null wq).
18754          * If this is the last logical interface on the ill
18755          * have ill_dl_down tell the driver we are gone (unbind)
18756          * Note that lun 0 can ipif_down even though
18757          * there are other logical units that are up.
18758          * This occurs e.g. when we change a "significant" IFF_ flag.
18759          */
18760         if (ill->ill_wq != NULL && !ill->ill_logical_down &&
18761             ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0 &&
18762             ill->ill_dl_up) {
18763                 ill_dl_down(ill);
18764         }
18765         ill->ill_logical_down = 0;
18766
18767         /*
18768          * Have to be after removing the routes in ipif_down_delete_ire.
18769          */
18770         if (ipif->ipif_isv6) {
18771                 if (ill->ill_flags & ILLF_XRESOLV)
18772                         ipif_arp_down(ipif);
18773         } else {
18774                 ipif_arp_down(ipif);
18775         }
18776
18777         ip_rts_ifmsg(ipif);
18778         ip_rts_newaddrmsg(RTM_DELETE, 0, ipif);
18779 }
18780
18781 /*
18782  * Bring interface logically down without bringing the physical interface
18783  * down e.g. when the netmask is changed. This avoids long lasting link
18784  * negotiations between an ethernet interface and a certain switches.
18785  */
18786 static int
18787 ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp)
18788 {
18789         /*
18790          * The ill_logical_down flag is a transient flag. It is set here
18791          * and is cleared once the down has completed in ipif_down_tail.
18792          * This flag does not indicate whether the ill stream is in the
18793          * DL_BOUND state with the driver. Instead this flag is used by
18794          * ipif_down_tail to determine whether to DL_UNBIND the stream with
18795          * the driver. The state of the ill stream i.e. whether it is
18796          * DL_BOUND with the driver or not is indicated by the ill_dl_up flag.
18797          */
18798         ipif->ipif_ill->ill_logical_down = 1;
18799         return (ipif_down(ipif, q, mp));
18800 }
18801
18802 /*
18803  * This is called when the SIOCSLIFUSESRC ioctl is processed in IP.
18804  * If the usesrc client ILL is already part of a usesrc group or not,
18805  * in either case a ire_stq with the matching usesrc client ILL will
18806  * locate the IRE's that need to be deleted. We want IREs to be created
18807  * with the new source address.
18808  */
18809 static void
18810 ipif_delete_cache_ire(ire_t *ire, char *ill_arg)
18811 {
18812         ill_t   *ucill = (ill_t *)ill_arg;
18813
18814         ASSERT(IAM_WRITER_ILL(ucill));
18815
18816         if (ire->ire_stq == NULL)
18817                 return;
18818
18819         if ((ire->ire_type == IRE_CACHE) &&
18820             ((ill_t *)ire->ire_stq->q_ptr == ucill))
18821                 ire_delete(ire);
18822 }
18823
18824 /*
18825  * ire_walk routine to delete every IRE dependent on the interface
18826  * address that is going down.  (Always called as writer.)
18827  * Works for both v4 and v6.
18828  * In addition for checking for ire_ipif matches it also checks for
18829  * IRE_CACHE entries which have the same source address as the
18830  * disappearing ipif since ipif_select_source might have picked
18831  * that source. Note that ipif_down/ipif_update_other_ipifs takes
18832  * care of any IRE_INTERFACE with the disappearing source address.
18833  */
18834 static void
18835 ipif_down_delete_ire(ire_t *ire, char *ipif_arg)
18836 {
18837         ipif_t  *ipif = (ipif_t *)ipif_arg;
18838         ill_t *ire_ill;
18839         ill_t *ipif_ill;
18840
18841         ASSERT(IAM_WRITER_IPIF(ipif));
18842         if (ire->ire_ipif == NULL)
18843                 return;
18844
18845         /*
18846          * For IPv4, we derive source addresses for an IRE from ipif's
18847          * belonging to the same IPMP group as the IRE's outgoing
18848          * interface.  If an IRE's outgoing interface isn't in the
18849          * same IPMP group as a particular ipif, then that ipif
18850          * couldn't have been used as a source address for this IRE.
18851          *
18852          * For IPv6, source addresses are only restricted to the IPMP group
18853          * if the IRE is for a link-local address or a multicast address.
18854          * Otherwise, source addresses for an IRE can be chosen from
18855          * interfaces other than the the outgoing interface for that IRE.
18856          *
18857          * For source address selection details, see ipif_select_source()
18858          * and ipif_select_source_v6().
18859          */
18860         if (ire->ire_ipversion == IPV4_VERSION ||
18861             IN6_IS_ADDR_LINKLOCAL(&ire->ire_addr_v6) ||
18862             IN6_IS_ADDR_MULTICAST(&ire->ire_addr_v6)) {
18863                 ire_ill = ire->ire_ipif->ipif_ill;
18864                 ipif_ill = ipif->ipif_ill;
18865
18866                 if (ire_ill->ill_group != ipif_ill->ill_group) {
18867                         return;
18868                 }
18869         }
18870
18871
18872         if (ire->ire_ipif != ipif) {
18873                 /*
18874                  * Look for a matching source address.
18875                  */
18876                 if (ire->ire_type != IRE_CACHE)
18877                         return;
18878                 if (ipif->ipif_flags & IPIF_NOLOCAL)
18879                         return;
18880
18881                 if (ire->ire_ipversion == IPV4_VERSION) {
18882                         if (ire->ire_src_addr != ipif->ipif_src_addr)
18883                                 return;
18884                 } else {
18885                         if (!IN6_ARE_ADDR_EQUAL(&ire->ire_src_addr_v6,
18886                             &ipif->ipif_v6lcl_addr))
18887                                 return;
18888                 }
18889                 ire_delete(ire);
18890                 return;
18891         }
18892         /*
18893          * ire_delete() will do an ire_flush_cache which will delete
18894          * all ire_ipif matches
18895          */
18896         ire_delete(ire);
18897 }
18898
18899 /*
18900  * ire_walk_ill function for deleting all IRE_CACHE entries for an ill when
18901  * 1) an ipif (on that ill) changes the IPIF_DEPRECATED flags, or
18902  * 2) when an interface is brought up or down (on that ill).
18903  * This ensures that the IRE_CACHE entries don't retain stale source
18904  * address selection results.
18905  */
18906 void
18907 ill_ipif_cache_delete(ire_t *ire, char *ill_arg)
18908 {
18909         ill_t   *ill = (ill_t *)ill_arg;
18910         ill_t   *ipif_ill;
18911
18912         ASSERT(IAM_WRITER_ILL(ill));
18913         /*
18914          * We use MATCH_IRE_TYPE/IRE_CACHE while calling ire_walk_ill_v4.
18915          * Hence this should be IRE_CACHE.
18916          */
18917         ASSERT(ire->ire_type == IRE_CACHE);
18918
18919         /*
18920          * We are called for IRE_CACHES whose ire_ipif matches ill.
18921          * We are only interested in IRE_CACHES that has borrowed
18922          * the source address from ill_arg e.g. ipif_up_done[_v6]
18923          * for which we need to look at ire_ipif->ipif_ill match
18924          * with ill.
18925          */
18926         ASSERT(ire->ire_ipif != NULL);
18927         ipif_ill = ire->ire_ipif->ipif_ill;
18928         if (ipif_ill == ill || (ill->ill_group != NULL &&
18929             ipif_ill->ill_group == ill->ill_group)) {
18930                 ire_delete(ire);
18931         }
18932 }
18933
18934 /*
18935  * Delete all the ire whose stq references ill_arg.
18936  */
18937 static void
18938 ill_stq_cache_delete(ire_t *ire, char *ill_arg)
18939 {
18940         ill_t   *ill = (ill_t *)ill_arg;
18941         ill_t   *ire_ill;
18942
18943         ASSERT(IAM_WRITER_ILL(ill));
18944         /*
18945          * We use MATCH_IRE_TYPE/IRE_CACHE while calling ire_walk_ill_v4.
18946          * Hence this should be IRE_CACHE.
18947          */
18948         ASSERT(ire->ire_type == IRE_CACHE);
18949
18950         /*
18951          * We are called for IRE_CACHES whose ire_stq and ire_ipif
18952          * matches ill. We are only interested in IRE_CACHES that
18953          * has ire_stq->q_ptr pointing at ill_arg. Thus we do the
18954          * filtering here.
18955          */
18956         ire_ill = (ill_t *)ire->ire_stq->q_ptr;
18957
18958         if (ire_ill == ill)
18959                 ire_delete(ire);
18960 }
18961
18962 /*
18963  * This is called when an ill leaves the group. We want to delete
18964  * all IRE_CACHES whose stq is pointing at ill_wq or ire_ipif is
18965  * pointing at ill.
18966  */
18967 static void
18968 illgrp_cache_delete(ire_t *ire, char *ill_arg)
18969 {
18970         ill_t   *ill = (ill_t *)ill_arg;
18971
18972         ASSERT(IAM_WRITER_ILL(ill));
18973         ASSERT(ill->ill_group == NULL);
18974         /*
18975          * We use MATCH_IRE_TYPE/IRE_CACHE while calling ire_walk_ill_v4.
18976          * Hence this should be IRE_CACHE.
18977          */
18978         ASSERT(ire->ire_type == IRE_CACHE);
18979         /*
18980          * We are called for IRE_CACHES whose ire_stq and ire_ipif
18981          * matches ill. We are interested in both.
18982          */
18983         ASSERT((ill == (ill_t *)ire->ire_stq->q_ptr) ||
18984             (ire->ire_ipif->ipif_ill == ill));
18985
18986         ire_delete(ire);
18987 }
18988
18989 /*
18990  * Initiate deallocate of an IPIF. Always called as writer. Called by
18991  * ill_delete or ip_sioctl_removeif.
18992  */
18993 static void
18994 ipif_free(ipif_t *ipif)
18995 {
18996         ip_stack_t      *ipst = ipif->ipif_ill->ill_ipst;
18997
18998         ASSERT(IAM_WRITER_IPIF(ipif));
18999
19000         if (ipif->ipif_recovery_id != 0)
19001                 (void) untimeout(ipif->ipif_recovery_id);
19002         ipif->ipif_recovery_id = 0;
19003
19004         /* Remove conn references */
19005         reset_conn_ipif(ipif);
19006
19007         /*
19008          * Make sure we have valid net and subnet broadcast ire's for the
19009          * other ipif's which share them with this ipif.
19010          */
19011         if (!ipif->ipif_isv6)
19012                 ipif_check_bcast_ires(ipif);
19013
19014         /*
19015          * Take down the interface. We can be called either from ill_delete
19016          * or from ip_sioctl_removeif.
19017          */
19018         (void) ipif_down(ipif, NULL, NULL);
19019
19020         /*
19021          * Now that the interface is down, there's no chance it can still
19022          * become a duplicate.  Cancel any timer that may have been set while
19023          * tearing down.
19024          */
19025         if (ipif->ipif_recovery_id != 0)
19026                 (void) untimeout(ipif->ipif_recovery_id);
19027         ipif->ipif_recovery_id = 0;
19028
19029         rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
19030         /* Remove pointers to this ill in the multicast routing tables */
19031         reset_mrt_vif_ipif(ipif);
19032         rw_exit(&ipst->ips_ill_g_lock);
19033 }
19034
19035 /*
19036  * Warning: this is not the only function that calls mi_free on an ipif_t.  See
19037  * also ill_move().
19038  */
19039 static void
19040 ipif_free_tail(ipif_t *ipif)
19041 {
19042         mblk_t  *mp;
19043         ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
19044
19045         /*
19046          * Free state for addition IRE_IF_[NO]RESOLVER ire's.
19047          */
19048         mutex_enter(&ipif->ipif_saved_ire_lock);
19049         mp = ipif->ipif_saved_ire_mp;
19050         ipif->ipif_saved_ire_mp = NULL;
19051         mutex_exit(&ipif->ipif_saved_ire_lock);
19052         freemsg(mp);
19053
19054         /*
19055          * Need to hold both ill_g_lock and ill_lock while
19056          * inserting or removing an ipif from the linked list
19057          * of ipifs hanging off the ill.
19058          */
19059         rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
19060         /*
19061          * Remove all IPv4 multicast memberships on the interface now.
19062          * IPv6 is not handled here as the multicast memberships are
19063          * tied to the ill rather than the ipif.
19064          */
19065         ilm_free(ipif);
19066
19067         /*
19068          * Since we held the ill_g_lock while doing the ilm_free above,
19069          * we can assert the ilms were really deleted and not just marked
19070          * ILM_DELETED.
19071          */
19072         ASSERT(ilm_walk_ipif(ipif) == 0);
19073
19074 #ifdef DEBUG
19075         ipif_trace_cleanup(ipif);
19076 #endif
19077
19078         /* Ask SCTP to take it out of it list */
19079         sctp_update_ipif(ipif, SCTP_IPIF_REMOVE);
19080
19081         /* Get it out of the ILL interface list. */
19082         ipif_remove(ipif, B_TRUE);
19083         rw_exit(&ipst->ips_ill_g_lock);
19084
19085         mutex_destroy(&ipif->ipif_saved_ire_lock);
19086
19087         ASSERT(!(ipif->ipif_flags & (IPIF_UP | IPIF_DUPLICATE)));
19088         ASSERT(ipif->ipif_recovery_id == 0);
19089
19090         /* Free the memory. */
19091         mi_free(ipif);
19092 }
19093
19094 /*
19095  * Sets `buf' to an ipif name of the form "ill_name:id", or "ill_name" if "id"
19096  * is zero.
19097  */
19098 void
19099 ipif_get_name(const ipif_t *ipif, char *buf, int len)
19100 {
19101         char    lbuf[LIFNAMSIZ];
19102         char    *name;
19103         size_t  name_len;
19104
19105         buf[0] = '\0';
19106         name = ipif->ipif_ill->ill_name;
19107         name_len = ipif->ipif_ill->ill_name_length;
19108         if (ipif->ipif_id != 0) {
19109                 (void) sprintf(lbuf, "%s%c%d", name, IPIF_SEPARATOR_CHAR,
19110                     ipif->ipif_id);
19111                 name = lbuf;
19112                 name_len = mi_strlen(name) + 1;
19113         }
19114         len -= 1;
19115         buf[len] = '\0';
19116         len = MIN(len, name_len);
19117         bcopy(name, buf, len);
19118 }
19119
19120 /*
19121  * Find an IPIF based on the name passed in.  Names can be of the
19122  * form <phys> (e.g., le0), <phys>:<#> (e.g., le0:1),
19123  * The <phys> string can have forms like <dev><#> (e.g., le0),
19124  * <dev><#>.<module> (e.g. le0.foo), or <dev>.<module><#> (e.g. ip.tun3).
19125  * When there is no colon, the implied unit id is zero. <phys> must
19126  * correspond to the name of an ILL.  (May be called as writer.)
19127  */
19128 static ipif_t *
19129 ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc,
19130     boolean_t *exists, boolean_t isv6, zoneid_t zoneid, queue_t *q,
19131     mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst)
19132 {
19133         char    *cp;
19134         char    *endp;
19135         long    id;
19136         ill_t   *ill;
19137         ipif_t  *ipif;
19138         uint_t  ire_type;
19139         boolean_t did_alloc = B_FALSE;
19140         ipsq_t  *ipsq;
19141
19142         if (error != NULL)
19143                 *error = 0;
19144
19145         /*
19146          * If the caller wants to us to create the ipif, make sure we have a
19147          * valid zoneid
19148          */
19149         ASSERT(!do_alloc || zoneid != ALL_ZONES);
19150
19151         if (namelen == 0) {
19152                 if (error != NULL)
19153                         *error = ENXIO;
19154                 return (NULL);
19155         }
19156
19157         *exists = B_FALSE;
19158         /* Look for a colon in the name. */
19159         endp = &name[namelen];
19160         for (cp = endp; --cp > name; ) {
19161                 if (*cp == IPIF_SEPARATOR_CHAR)
19162                         break;
19163         }
19164
19165         if (*cp == IPIF_SEPARATOR_CHAR) {
19166                 /*
19167                  * Reject any non-decimal aliases for logical
19168                  * interfaces. Aliases with leading zeroes
19169                  * are also rejected as they introduce ambiguity
19170                  * in the naming of the interfaces.
19171                  * In order to confirm with existing semantics,
19172                  * and to not break any programs/script relying
19173                  * on that behaviour, if<0>:0 is considered to be
19174                  * a valid interface.
19175                  *
19176                  * If alias has two or more digits and the first
19177                  * is zero, fail.
19178                  */
19179                 if (&cp[2] < endp && cp[1] == '0') {
19180                         if (error != NULL)
19181                                 *error = EINVAL;
19182                         return (NULL);
19183                 }
19184         }
19185
19186         if (cp <= name) {
19187                 cp = endp;
19188         } else {
19189                 *cp = '\0';
19190         }
19191
19192         /*
19193          * Look up the ILL, based on the portion of the name
19194          * before the slash. ill_lookup_on_name returns a held ill.
19195          * Temporary to check whether ill exists already. If so
19196          * ill_lookup_on_name will clear it.
19197          */
19198         ill = ill_lookup_on_name(name, do_alloc, isv6,
19199             q, mp, func, error, &did_alloc, ipst);
19200         if (cp != endp)
19201                 *cp = IPIF_SEPARATOR_CHAR;
19202         if (ill == NULL)
19203                 return (NULL);
19204
19205         /* Establish the unit number in the name. */
19206         id = 0;
19207         if (cp < endp && *endp == '\0') {
19208                 /* If there was a colon, the unit number follows. */
19209                 cp++;
19210                 if (ddi_strtol(cp, NULL, 0, &id) != 0) {
19211                         ill_refrele(ill);
19212                         if (error != NULL)
19213                                 *error = ENXIO;
19214                         return (NULL);
19215                 }
19216         }
19217
19218         GRAB_CONN_LOCK(q);
19219         mutex_enter(&ill->ill_lock);
19220         /* Now see if there is an IPIF with this unit number. */
19221         for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
19222                 if (ipif->ipif_id == id) {
19223                         if (zoneid != ALL_ZONES &&
19224                             zoneid != ipif->ipif_zoneid &&
19225                             ipif->ipif_zoneid != ALL_ZONES) {
19226                                 mutex_exit(&ill->ill_lock);
19227                                 RELEASE_CONN_LOCK(q);
19228                                 ill_refrele(ill);
19229                                 if (error != NULL)
19230                                         *error = ENXIO;
19231                                 return (NULL);
19232                         }
19233                         /*
19234                          * The block comment at the start of ipif_down
19235                          * explains the use of the macros used below
19236                          */
19237                         if (IPIF_CAN_LOOKUP(ipif)) {
19238                                 ipif_refhold_locked(ipif);
19239                                 mutex_exit(&ill->ill_lock);
19240                                 if (!did_alloc)
19241                                         *exists = B_TRUE;
19242                                 /*
19243                                  * Drop locks before calling ill_refrele
19244                                  * since it can potentially call into
19245                                  * ipif_ill_refrele_tail which can end up
19246                                  * in trying to acquire any lock.
19247                                  */
19248                                 RELEASE_CONN_LOCK(q);
19249                                 ill_refrele(ill);
19250                                 return (ipif);
19251                         } else if (IPIF_CAN_WAIT(ipif, q)) {
19252                                 ipsq = ill->ill_phyint->phyint_ipsq;
19253                                 mutex_enter(&ipsq->ipsq_lock);
19254                                 mutex_exit(&ill->ill_lock);
19255                                 ipsq_enq(ipsq, q, mp, func, NEW_OP, ill);
19256                                 mutex_exit(&ipsq->ipsq_lock);
19257                                 RELEASE_CONN_LOCK(q);
19258                                 ill_refrele(ill);
19259                                 if (error != NULL)
19260                                         *error = EINPROGRESS;
19261                                 return (NULL);
19262                         }
19263                 }
19264         }
19265         RELEASE_CONN_LOCK(q);
19266
19267         if (!do_alloc) {
19268                 mutex_exit(&ill->ill_lock);
19269                 ill_refrele(ill);
19270                 if (error != NULL)
19271                         *error = ENXIO;
19272                 return (NULL);
19273         }
19274
        /*
         * If none found, atomically allocate and return a new one.
         * Historically, we used IRE_LOOPBACK only for lun 0, and IRE_LOCAL
         * to support "receive only" use of lo0:1 etc. as is still done
         * below as an initial guess.
         * However, this is now likely to be overridden later in
         * ipif_up_done() when we know for sure what address has been
         * configured on the interface, since we might have more than one
         * loopback interface with a loopback address, e.g. in the case of
         * zones, and all the interfaces with loopback addresses need to be
         * marked IRE_LOOPBACK.
         */
19286         if (ill->ill_net_type == IRE_LOOPBACK && id == 0)
19287                 ire_type = IRE_LOOPBACK;
19288         else
19289                 ire_type = IRE_LOCAL;
19290         ipif = ipif_allocate(ill, id, ire_type, B_TRUE);
19291         if (ipif != NULL)
19292                 ipif_refhold_locked(ipif);
19293         else if (error != NULL)
19294                 *error = ENOMEM;
19295         mutex_exit(&ill->ill_lock);
19296         ill_refrele(ill);
19297         return (ipif);
19298 }
19299
19300 /*
19301  * This routine is called whenever a new address comes up on an ipif.  If
19302  * we are configured to respond to address mask requests, then we are supposed
19303  * to broadcast an address mask reply at this time.  This routine is also
19304  * called if we are already up, but a netmask change is made.  This is legal
19305  * but might not make the system manager very popular.  (May be called
19306  * as writer.)
19307  */
19308 void
19309 ipif_mask_reply(ipif_t *ipif)
19310 {
19311         icmph_t *icmph;
19312         ipha_t  *ipha;
19313         mblk_t  *mp;
19314         ip_stack_t      *ipst = ipif->ipif_ill->ill_ipst;
19315
19316 #define REPLY_LEN       (sizeof (icmp_ipha) + sizeof (icmph_t) + IP_ADDR_LEN)
19317
19318         if (!ipst->ips_ip_respond_to_address_mask_broadcast)
19319                 return;
19320
19321         /* ICMP mask reply is IPv4 only */
19322         ASSERT(!ipif->ipif_isv6);
19323         /* ICMP mask reply is not for a loopback interface */
19324         ASSERT(ipif->ipif_ill->ill_wq != NULL);
19325
19326         mp = allocb(REPLY_LEN, BPRI_HI);
19327         if (mp == NULL)
19328                 return;
19329         mp->b_wptr = mp->b_rptr + REPLY_LEN;
19330
19331         ipha = (ipha_t *)mp->b_rptr;
19332         bzero(ipha, REPLY_LEN);
19333         *ipha = icmp_ipha;
19334         ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl;
19335         ipha->ipha_src = ipif->ipif_src_addr;
19336         ipha->ipha_dst = ipif->ipif_brd_addr;
19337         ipha->ipha_length = htons(REPLY_LEN);
19338         ipha->ipha_ident = 0;
19339
19340         icmph = (icmph_t *)&ipha[1];
19341         icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY;
19342         bcopy(&ipif->ipif_net_mask, &icmph[1], IP_ADDR_LEN);
19343         icmph->icmph_checksum = IP_CSUM(mp, sizeof (ipha_t), 0);
19344
19345         put(ipif->ipif_wq, mp);
19346
19347 #undef  REPLY_LEN
19348 }
19349
19350 /*
19351  * When the mtu in the ipif changes, we call this routine through ire_walk
19352  * to update all the relevant IREs.
19353  * Skip IRE_LOCAL and "loopback" IRE_BROADCAST by checking ire_stq.
19354  */
19355 static void
19356 ipif_mtu_change(ire_t *ire, char *ipif_arg)
19357 {
19358         ipif_t *ipif = (ipif_t *)ipif_arg;
19359
19360         if (ire->ire_stq == NULL || ire->ire_ipif != ipif)
19361                 return;
19362         ire->ire_max_frag = MIN(ipif->ipif_mtu, IP_MAXPACKET);
19363 }
19364
19365 /*
19366  * When the mtu in the ill changes, we call this routine through ire_walk
19367  * to update all the relevant IREs.
19368  * Skip IRE_LOCAL and "loopback" IRE_BROADCAST by checking ire_stq.
19369  */
19370 void
19371 ill_mtu_change(ire_t *ire, char *ill_arg)
19372 {
19373         ill_t   *ill = (ill_t *)ill_arg;
19374
19375         if (ire->ire_stq == NULL || ire->ire_ipif->ipif_ill != ill)
19376                 return;
19377         ire->ire_max_frag = ire->ire_ipif->ipif_mtu;
19378 }
19379
19380 /*
19381  * Join the ipif specific multicast groups.
19382  * Must be called after a mapping has been set up in the resolver.  (Always
19383  * called as writer.)
19384  */
19385 void
19386 ipif_multicast_up(ipif_t *ipif)
19387 {
19388         int err, index;
19389         ill_t *ill;
19390
19391         ASSERT(IAM_WRITER_IPIF(ipif));
19392
19393         ill = ipif->ipif_ill;
19394         index = ill->ill_phyint->phyint_ifindex;
19395
19396         ip1dbg(("ipif_multicast_up\n"));
19397         if (!(ill->ill_flags & ILLF_MULTICAST) || ipif->ipif_multicast_up)
19398                 return;
19399
19400         if (ipif->ipif_isv6) {
19401                 if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr))
19402                         return;
19403
19404                 /* Join the all hosts multicast address */
19405                 ip1dbg(("ipif_multicast_up - addmulti\n"));
19406                 /*
19407                  * Passing B_TRUE means we have to join the multicast
19408                  * membership on this interface even though this is
19409                  * FAILED. If we join on a different one in the group,
19410                  * we will not be able to delete the membership later
19411                  * as we currently don't track where we join when we
19412                  * join within the kernel unlike applications where
19413                  * we have ilg/ilg_orig_index. See ip_addmulti_v6
19414                  * for more on this.
19415                  */
19416                 err = ip_addmulti_v6(&ipv6_all_hosts_mcast, ill, index,
19417                     ipif->ipif_zoneid, ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL);
19418                 if (err != 0) {
19419                         ip0dbg(("ipif_multicast_up: "
19420                             "all_hosts_mcast failed %d\n",
19421                             err));
19422                         return;
19423                 }
19424                 /*
19425                  * Enable multicast for the solicited node multicast address
19426                  */
19427                 if (!(ipif->ipif_flags & IPIF_NOLOCAL)) {
19428                         in6_addr_t ipv6_multi = ipv6_solicited_node_mcast;
19429
19430                         ipv6_multi.s6_addr32[3] |=
19431                             ipif->ipif_v6lcl_addr.s6_addr32[3];
19432
19433                         err = ip_addmulti_v6(&ipv6_multi, ill, index,
19434                             ipif->ipif_zoneid, ILGSTAT_NONE, MODE_IS_EXCLUDE,
19435                             NULL);
19436                         if (err != 0) {
19437                                 ip0dbg(("ipif_multicast_up: solicited MC"
19438                                     " failed %d\n", err));
19439                                 (void) ip_delmulti_v6(&ipv6_all_hosts_mcast,
19440                                     ill, ill->ill_phyint->phyint_ifindex,
19441                                     ipif->ipif_zoneid, B_TRUE, B_TRUE);
19442                                 return;
19443                         }
19444                 }
19445         } else {
19446                 if (ipif->ipif_lcl_addr == INADDR_ANY)
19447                         return;
19448
19449                 /* Join the all hosts multicast address */
19450                 ip1dbg(("ipif_multicast_up - addmulti\n"));
19451                 err = ip_addmulti(htonl(INADDR_ALLHOSTS_GROUP), ipif,
19452                     ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL);
19453                 if (err) {
19454                         ip0dbg(("ipif_multicast_up: failed %d\n", err));
19455                         return;
19456                 }
19457         }
19458         ipif->ipif_multicast_up = 1;
19459 }
19460
19461 /*
19462  * Blow away any multicast groups that we joined in ipif_multicast_up().
19463  * (Explicit memberships are blown away in ill_leave_multicast() when the
19464  * ill is brought down.)
19465  */
19466 static void
19467 ipif_multicast_down(ipif_t *ipif)
19468 {
19469         int err;
19470
19471         ASSERT(IAM_WRITER_IPIF(ipif));
19472
19473         ip1dbg(("ipif_multicast_down\n"));
19474         if (!ipif->ipif_multicast_up)
19475                 return;
19476
19477         ip1dbg(("ipif_multicast_down - delmulti\n"));
19478
19479         if (!ipif->ipif_isv6) {
19480                 err = ip_delmulti(htonl(INADDR_ALLHOSTS_GROUP), ipif, B_TRUE,
19481                     B_TRUE);
19482                 if (err != 0)
19483                         ip0dbg(("ipif_multicast_down: failed %d\n", err));
19484
19485                 ipif->ipif_multicast_up = 0;
19486                 return;
19487         }
19488
19489         /*
19490          * Leave the all hosts multicast address. Similar to ip_addmulti_v6,
19491          * we should look for ilms on this ill rather than the ones that have
19492          * been failed over here.  They are here temporarily. As
19493          * ipif_multicast_up has joined on this ill, we should delete only
19494          * from this ill.
19495          */
19496         err = ip_delmulti_v6(&ipv6_all_hosts_mcast, ipif->ipif_ill,
19497             ipif->ipif_ill->ill_phyint->phyint_ifindex, ipif->ipif_zoneid,
19498             B_TRUE, B_TRUE);
19499         if (err != 0) {
19500                 ip0dbg(("ipif_multicast_down: all_hosts_mcast failed %d\n",
19501                     err));
19502         }
19503         /*
19504          * Disable multicast for the solicited node multicast address
19505          */
19506         if (!(ipif->ipif_flags & IPIF_NOLOCAL)) {
19507                 in6_addr_t ipv6_multi = ipv6_solicited_node_mcast;
19508
19509                 ipv6_multi.s6_addr32[3] |=
19510                     ipif->ipif_v6lcl_addr.s6_addr32[3];
19511
19512                 err = ip_delmulti_v6(&ipv6_multi, ipif->ipif_ill,
19513                     ipif->ipif_ill->ill_phyint->phyint_ifindex,
19514                     ipif->ipif_zoneid, B_TRUE, B_TRUE);
19515
19516                 if (err != 0) {
19517                         ip0dbg(("ipif_multicast_down: sol MC failed %d\n",
19518                             err));
19519                 }
19520         }
19521
19522         ipif->ipif_multicast_up = 0;
19523 }
19524
19525 /*
19526  * Used when an interface comes up to recreate any extra routes on this
19527  * interface.
19528  */
19529 static ire_t **
19530 ipif_recover_ire(ipif_t *ipif)
19531 {
19532         mblk_t  *mp;
19533         ire_t   **ipif_saved_irep;
19534         ire_t   **irep;
19535         ip_stack_t      *ipst = ipif->ipif_ill->ill_ipst;
19536
19537         ip1dbg(("ipif_recover_ire(%s:%u)", ipif->ipif_ill->ill_name,
19538             ipif->ipif_id));
19539
19540         mutex_enter(&ipif->ipif_saved_ire_lock);
19541         ipif_saved_irep = (ire_t **)kmem_zalloc(sizeof (ire_t *) *
19542             ipif->ipif_saved_ire_cnt, KM_NOSLEEP);
19543         if (ipif_saved_irep == NULL) {
19544                 mutex_exit(&ipif->ipif_saved_ire_lock);
19545                 return (NULL);
19546         }
19547
19548         irep = ipif_saved_irep;
19549         for (mp = ipif->ipif_saved_ire_mp; mp != NULL; mp = mp->b_cont) {
19550                 ire_t           *ire;
19551                 queue_t         *rfq;
19552                 queue_t         *stq;
19553                 ifrt_t          *ifrt;
19554                 uchar_t         *src_addr;
19555                 uchar_t         *gateway_addr;
19556                 ushort_t        type;
19557
19558                 /*
19559                  * When the ire was initially created and then added in
19560                  * ip_rt_add(), it was created either using ipif->ipif_net_type
19561                  * in the case of a traditional interface route, or as one of
19562                  * the IRE_OFFSUBNET types (with the exception of
19563                  * IRE_HOST types ire which is created by icmp_redirect() and
19564                  * which we don't need to save or recover).  In the case where
19565                  * ipif->ipif_net_type was IRE_LOOPBACK, ip_rt_add() will update
19566                  * the ire_type to IRE_IF_NORESOLVER before calling ire_add()
19567                  * to satisfy software like GateD and Sun Cluster which creates
19568                  * routes using the the loopback interface's address as a
19569                  * gateway.
19570                  *
19571                  * As ifrt->ifrt_type reflects the already updated ire_type,
19572                  * ire_create() will be called in the same way here as
19573                  * in ip_rt_add(), namely using ipif->ipif_net_type when
19574                  * the route looks like a traditional interface route (where
19575                  * ifrt->ifrt_type & IRE_INTERFACE is true) and otherwise using
19576                  * the saved ifrt->ifrt_type.  This means that in the case where
19577                  * ipif->ipif_net_type is IRE_LOOPBACK, the ire created by
19578                  * ire_create() will be an IRE_LOOPBACK, it will then be turned
19579                  * into an IRE_IF_NORESOLVER and then added by ire_add().
19580                  */
19581                 ifrt = (ifrt_t *)mp->b_rptr;
19582                 ASSERT(ifrt->ifrt_type != IRE_CACHE);
19583                 if (ifrt->ifrt_type & IRE_INTERFACE) {
19584                         rfq = NULL;
19585                         stq = (ipif->ipif_net_type == IRE_IF_RESOLVER)
19586                             ? ipif->ipif_rq : ipif->ipif_wq;
19587                         src_addr = (ifrt->ifrt_flags & RTF_SETSRC)
19588                             ? (uint8_t *)&ifrt->ifrt_src_addr
19589                             : (uint8_t *)&ipif->ipif_src_addr;
19590                         gateway_addr = NULL;
19591                         type = ipif->ipif_net_type;
19592                 } else if (ifrt->ifrt_type & IRE_BROADCAST) {
19593                         /* Recover multiroute broadcast IRE. */
19594                         rfq = ipif->ipif_rq;
19595                         stq = ipif->ipif_wq;
19596                         src_addr = (ifrt->ifrt_flags & RTF_SETSRC)
19597                             ? (uint8_t *)&ifrt->ifrt_src_addr
19598                             : (uint8_t *)&ipif->ipif_src_addr;
19599                         gateway_addr = (uint8_t *)&ifrt->ifrt_gateway_addr;
19600                         type = ifrt->ifrt_type;
19601                 } else {
19602                         rfq = NULL;
19603                         stq = NULL;
19604                         src_addr = (ifrt->ifrt_flags & RTF_SETSRC)
19605                             ? (uint8_t *)&ifrt->ifrt_src_addr : NULL;
19606                         gateway_addr = (uint8_t *)&ifrt->ifrt_gateway_addr;
19607                         type = ifrt->ifrt_type;
19608                 }
19609
19610                 /*
19611                  * Create a copy of the IRE with the saved address and netmask.
19612                  */
19613                 ip1dbg(("ipif_recover_ire: creating IRE %s (%d) for "
19614                     "0x%x/0x%x\n",
19615                     ip_nv_lookup(ire_nv_tbl, ifrt->ifrt_type), ifrt->ifrt_type,
19616                     ntohl(ifrt->ifrt_addr),
19617                     ntohl(ifrt->ifrt_mask)));
19618                 ire = ire_create(
19619                     (uint8_t *)&ifrt->ifrt_addr,
19620                     (uint8_t *)&ifrt->ifrt_mask,
19621                     src_addr,
19622                     gateway_addr,
19623                     &ifrt->ifrt_max_frag,
19624                     NULL,
19625                     rfq,
19626                     stq,
19627                     type,
19628                     ipif,
19629                     0,
19630                     0,
19631                     0,
19632                     ifrt->ifrt_flags,
19633                     &ifrt->ifrt_iulp_info,
19634                     NULL,
19635                     NULL,
19636                     ipst);
19637
19638                 if (ire == NULL) {
19639                         mutex_exit(&ipif->ipif_saved_ire_lock);
19640                         kmem_free(ipif_saved_irep,
19641                             ipif->ipif_saved_ire_cnt * sizeof (ire_t *));
19642                         return (NULL);
19643                 }
19644
19645                 /*
19646                  * Some software (for example, GateD and Sun Cluster) attempts
19647                  * to create (what amount to) IRE_PREFIX routes with the
19648                  * loopback address as the gateway.  This is primarily done to
19649                  * set up prefixes with the RTF_REJECT flag set (for example,
19650                  * when generating aggregate routes.)
19651                  *
19652                  * If the IRE type (as defined by ipif->ipif_net_type) is
19653                  * IRE_LOOPBACK, then we map the request into a
19654                  * IRE_IF_NORESOLVER.
19655                  */
19656                 if (ipif->ipif_net_type == IRE_LOOPBACK)
19657                         ire->ire_type = IRE_IF_NORESOLVER;
19658                 /*
19659                  * ire held by ire_add, will be refreled' towards the
19660                  * the end of ipif_up_done
19661                  */
19662                 (void) ire_add(&ire, NULL, NULL, NULL, B_FALSE);
19663                 *irep = ire;
19664                 irep++;
19665                 ip1dbg(("ipif_recover_ire: added ire %p\n", (void *)ire));
19666         }
19667         mutex_exit(&ipif->ipif_saved_ire_lock);
19668         return (ipif_saved_irep);
19669 }
19670
19671 /*
19672  * Used to set the netmask and broadcast address to default values when the
19673  * interface is brought up.  (Always called as writer.)
19674  */
19675 static void
19676 ipif_set_default(ipif_t *ipif)
19677 {
19678         ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
19679
19680         if (!ipif->ipif_isv6) {
19681                 /*
19682                  * Interface holds an IPv4 address. Default
19683                  * mask is the natural netmask.
19684                  */
19685                 if (!ipif->ipif_net_mask) {
19686                         ipaddr_t        v4mask;
19687
19688                         v4mask = ip_net_mask(ipif->ipif_lcl_addr);
19689                         V4MASK_TO_V6(v4mask, ipif->ipif_v6net_mask);
19690                 }
19691                 if (ipif->ipif_flags & IPIF_POINTOPOINT) {
19692                         /* ipif_subnet is ipif_pp_dst_addr for pt-pt */
19693                         ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr;
19694                 } else {
19695                         V6_MASK_COPY(ipif->ipif_v6lcl_addr,
19696                             ipif->ipif_v6net_mask, ipif->ipif_v6subnet);
19697                 }
19698                 /*
19699                  * NOTE: SunOS 4.X does this even if the broadcast address
19700                  * has been already set thus we do the same here.
19701                  */
19702                 if (ipif->ipif_flags & IPIF_BROADCAST) {
19703                         ipaddr_t        v4addr;
19704
19705                         v4addr = ipif->ipif_subnet | ~ipif->ipif_net_mask;
19706                         IN6_IPADDR_TO_V4MAPPED(v4addr, &ipif->ipif_v6brd_addr);
19707                 }
19708         } else {
19709                 /*
19710                  * Interface holds an IPv6-only address.  Default
19711                  * mask is all-ones.
19712                  */
19713                 if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6net_mask))
19714                         ipif->ipif_v6net_mask = ipv6_all_ones;
19715                 if (ipif->ipif_flags & IPIF_POINTOPOINT) {
19716                         /* ipif_subnet is ipif_pp_dst_addr for pt-pt */
19717                         ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr;
19718                 } else {
19719                         V6_MASK_COPY(ipif->ipif_v6lcl_addr,
19720                             ipif->ipif_v6net_mask, ipif->ipif_v6subnet);
19721                 }
19722         }
19723 }
19724
19725 /*
19726  * Return 0 if this address can be used as local address without causing
19727  * duplicate address problems. Otherwise, return EADDRNOTAVAIL if the address
19728  * is already up on a different ill, and EADDRINUSE if it's up on the same ill.
19729  * Special checks are needed to allow the same IPv6 link-local address
19730  * on different ills.
19731  * TODO: allowing the same site-local address on different ill's.
19732  */
19733 int
19734 ip_addr_availability_check(ipif_t *new_ipif)
19735 {
19736         in6_addr_t our_v6addr;
19737         ill_t *ill;
19738         ipif_t *ipif;
19739         ill_walk_context_t ctx;
19740         ip_stack_t      *ipst = new_ipif->ipif_ill->ill_ipst;
19741
19742         ASSERT(IAM_WRITER_IPIF(new_ipif));
19743         ASSERT(MUTEX_HELD(&ipst->ips_ip_addr_avail_lock));
19744         ASSERT(RW_READ_HELD(&ipst->ips_ill_g_lock));
19745
19746         new_ipif->ipif_flags &= ~IPIF_UNNUMBERED;
19747         if (IN6_IS_ADDR_UNSPECIFIED(&new_ipif->ipif_v6lcl_addr) ||
19748             IN6_IS_ADDR_V4MAPPED_ANY(&new_ipif->ipif_v6lcl_addr))
19749                 return (0);
19750
19751         our_v6addr = new_ipif->ipif_v6lcl_addr;
19752
19753         if (new_ipif->ipif_isv6)
19754                 ill = ILL_START_WALK_V6(&ctx, ipst);
19755         else
19756                 ill = ILL_START_WALK_V4(&ctx, ipst);
19757
19758         for (; ill != NULL; ill = ill_next(&ctx, ill)) {
19759                 for (ipif = ill->ill_ipif; ipif != NULL;
19760                     ipif = ipif->ipif_next) {
19761                         if ((ipif == new_ipif) ||
19762                             !(ipif->ipif_flags & IPIF_UP) ||
19763                             (ipif->ipif_flags & IPIF_UNNUMBERED))
19764                                 continue;
19765                         if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
19766                             &our_v6addr)) {
19767                                 if (new_ipif->ipif_flags & IPIF_POINTOPOINT)
19768                                         new_ipif->ipif_flags |= IPIF_UNNUMBERED;
19769                                 else if (ipif->ipif_flags & IPIF_POINTOPOINT)
19770                                         ipif->ipif_flags |= IPIF_UNNUMBERED;
19771                                 else if (IN6_IS_ADDR_LINKLOCAL(&our_v6addr) &&
19772                                     new_ipif->ipif_ill != ill)
19773                                         continue;
19774                                 else if (IN6_IS_ADDR_SITELOCAL(&our_v6addr) &&
19775                                     new_ipif->ipif_ill != ill)
19776                                         continue;
19777                                 else if (new_ipif->ipif_zoneid !=
19778                                     ipif->ipif_zoneid &&
19779                                     ipif->ipif_zoneid != ALL_ZONES &&
19780                                     IS_LOOPBACK(ill))
19781                                         continue;
19782                                 else if (new_ipif->ipif_ill == ill)
19783                                         return (EADDRINUSE);
19784                                 else
19785                                         return (EADDRNOTAVAIL);
19786                         }
19787                 }
19788         }
19789
19790         return (0);
19791 }
19792
19793 /*
19794  * Bring up an ipif: bring up arp/ndp, bring up the DLPI stream, and add
19795  * IREs for the ipif.
19796  * When the routine returns EINPROGRESS then mp has been consumed and
19797  * the ioctl will be acked from ip_rput_dlpi.
19798  */
19799 static int
19800 ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp)
19801 {
19802         ill_t   *ill = ipif->ipif_ill;
19803         boolean_t isv6 = ipif->ipif_isv6;
19804         int     err = 0;
19805         boolean_t success;
19806
19807         ASSERT(IAM_WRITER_IPIF(ipif));
19808
19809         ip1dbg(("ipif_up(%s:%u)\n", ill->ill_name, ipif->ipif_id));
19810
19811         /* Shouldn't get here if it is already up. */
19812         if (ipif->ipif_flags & IPIF_UP)
19813                 return (EALREADY);
19814
19815         /* Skip arp/ndp for any loopback interface. */
19816         if (ill->ill_wq != NULL) {
19817                 conn_t *connp = CONN_Q(q) ? Q_TO_CONN(q) : NULL;
19818                 ipsq_t  *ipsq = ill->ill_phyint->phyint_ipsq;
19819
19820                 if (!ill->ill_dl_up) {
19821                         /*
19822                          * ill_dl_up is not yet set. i.e. we are yet to
19823                          * DL_BIND with the driver and this is the first
19824                          * logical interface on the ill to become "up".
19825                          * Tell the driver to get going (via DL_BIND_REQ).
19826                          * Note that changing "significant" IFF_ flags
19827                          * address/netmask etc cause a down/up dance, but
19828                          * does not cause an unbind (DL_UNBIND) with the driver
19829                          */
19830                         return (ill_dl_up(ill, ipif, mp, q));
19831                 }
19832
19833                 /*
19834                  * ipif_resolver_up may end up sending an
19835                  * AR_INTERFACE_UP message to ARP, which would, in
19836                  * turn send a DLPI message to the driver. ioctls are
19837                  * serialized and so we cannot send more than one
19838                  * interface up message at a time. If ipif_resolver_up
19839                  * does send an interface up message to ARP, we get
19840                  * EINPROGRESS and we will complete in ip_arp_done.
19841                  */
19842
19843                 ASSERT(connp != NULL || !CONN_Q(q));
19844                 ASSERT(ipsq->ipsq_pending_mp == NULL);
19845                 if (connp != NULL)
19846                         mutex_enter(&connp->conn_lock);
19847                 mutex_enter(&ill->ill_lock);
19848                 success = ipsq_pending_mp_add(connp, ipif, q, mp, 0);
19849                 mutex_exit(&ill->ill_lock);
19850                 if (connp != NULL)
19851                         mutex_exit(&connp->conn_lock);
19852                 if (!success)
19853                         return (EINTR);
19854
19855                 /*
19856                  * Crank up IPv6 neighbor discovery
19857                  * Unlike ARP, this should complete when
19858                  * ipif_ndp_up returns. However, for
19859                  * ILLF_XRESOLV interfaces we also send a
19860                  * AR_INTERFACE_UP to the external resolver.
19861                  * That ioctl will complete in ip_rput.
19862                  */
19863                 if (isv6) {
19864                         err = ipif_ndp_up(ipif);
19865                         if (err != 0) {
19866                                 if (err != EINPROGRESS)
19867                                         mp = ipsq_pending_mp_get(ipsq, &connp);
19868                                 return (err);
19869                         }
19870                 }
19871                 /* Now, ARP */
19872                 err = ipif_resolver_up(ipif, Res_act_initial);
19873                 if (err == EINPROGRESS) {
19874                         /* We will complete it in ip_arp_done */
19875                         return (err);
19876                 }
19877                 mp = ipsq_pending_mp_get(ipsq, &connp);
19878                 ASSERT(mp != NULL);
19879                 if (err != 0)
19880                         return (err);
19881         } else {
19882                 /*
19883                  * Interfaces without underlying hardware don't do duplicate
19884                  * address detection.
19885                  */
19886                 ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
19887                 ipif->ipif_addr_ready = 1;
19888         }
19889         return (isv6 ? ipif_up_done_v6(ipif) : ipif_up_done(ipif));
19890 }
19891
19892 /*
19893  * Perform a bind for the physical device.
19894  * When the routine returns EINPROGRESS then mp has been consumed and
19895  * the ioctl will be acked from ip_rput_dlpi.
19896  * Allocate an unbind message and save it until ipif_down.
19897  */
19898 static int
19899 ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q)
19900 {
19901         areq_t  *areq;
19902         mblk_t  *areq_mp = NULL;
19903         mblk_t  *bind_mp = NULL;
19904         mblk_t  *unbind_mp = NULL;
19905         conn_t  *connp;
19906         boolean_t success;
19907         uint16_t sap_addr;
19908
19909         ip1dbg(("ill_dl_up(%s)\n", ill->ill_name));
19910         ASSERT(IAM_WRITER_ILL(ill));
19911         ASSERT(mp != NULL);
19912
19913         /* Create a resolver cookie for ARP */
19914         if (!ill->ill_isv6 && ill->ill_net_type == IRE_IF_RESOLVER) {
19915                 areq_mp = ill_arp_alloc(ill, (uchar_t *)&ip_areq_template, 0);
19916                 if (areq_mp == NULL)
19917                         return (ENOMEM);
19918
19919                 freemsg(ill->ill_resolver_mp);
19920                 ill->ill_resolver_mp = areq_mp;
19921                 areq = (areq_t *)areq_mp->b_rptr;
19922                 sap_addr = ill->ill_sap;
19923                 bcopy(&sap_addr, areq->areq_sap, sizeof (sap_addr));
19924         }
19925         bind_mp = ip_dlpi_alloc(sizeof (dl_bind_req_t) + sizeof (long),
19926             DL_BIND_REQ);
19927         if (bind_mp == NULL)
19928                 goto bad;
19929         ((dl_bind_req_t *)bind_mp->b_rptr)->dl_sap = ill->ill_sap;
19930         ((dl_bind_req_t *)bind_mp->b_rptr)->dl_service_mode = DL_CLDLS;
19931
19932         unbind_mp = ip_dlpi_alloc(sizeof (dl_unbind_req_t), DL_UNBIND_REQ);
19933         if (unbind_mp == NULL)
19934                 goto bad;
19935
19936         /*
19937          * Record state needed to complete this operation when the
19938          * DL_BIND_ACK shows up.  Also remember the pre-allocated mblks.
19939          */
19940         ASSERT(WR(q)->q_next == NULL);
19941         connp = Q_TO_CONN(q);
19942
19943         mutex_enter(&connp->conn_lock);
19944         mutex_enter(&ipif->ipif_ill->ill_lock);
19945         success = ipsq_pending_mp_add(connp, ipif, q, mp, 0);
19946         mutex_exit(&ipif->ipif_ill->ill_lock);
19947         mutex_exit(&connp->conn_lock);
19948         if (!success)
19949                 goto bad;
19950
19951         /*
19952          * Save the unbind message for ill_dl_down(); it will be consumed when
19953          * the interface goes down.
19954          */
19955         ASSERT(ill->ill_unbind_mp == NULL);
19956         ill->ill_unbind_mp = unbind_mp;
19957
19958         ill_dlpi_send(ill, bind_mp);
19959         /* Send down link-layer capabilities probe if not already done. */
19960         ill_capability_probe(ill);
19961
19962         /*
19963          * Sysid used to rely on the fact that netboots set domainname
19964          * and the like. Now that miniroot boots aren't strictly netboots
19965          * and miniroot network configuration is driven from userland
19966          * these things still need to be set. This situation can be detected
19967          * by comparing the interface being configured here to the one
19968          * dhcack was set to reference by the boot loader. Once sysid is
19969          * converted to use dhcp_ipc_getinfo() this call can go away.
19970          */
19971         if ((ipif->ipif_flags & IPIF_DHCPRUNNING) && (dhcack != NULL) &&
19972             (strcmp(ill->ill_name, dhcack) == 0) &&
19973             (strlen(srpc_domain) == 0)) {
19974                 if (dhcpinit() != 0)
19975                         cmn_err(CE_WARN, "no cached dhcp response");
19976         }
19977
19978         /*
19979          * This operation will complete in ip_rput_dlpi with either
19980          * a DL_BIND_ACK or DL_ERROR_ACK.
19981          */
19982         return (EINPROGRESS);
19983 bad:
19984         ip1dbg(("ill_dl_up(%s) FAILED\n", ill->ill_name));
19985         /*
19986          * We don't have to check for possible removal from illgrp
19987          * as we have not yet inserted in illgrp. For groups
19988          * without names, this ipif is still not UP and hence
19989          * this could not have possibly had any influence in forming
19990          * groups.
19991          */
19992
19993         freemsg(bind_mp);
19994         freemsg(unbind_mp);
19995         return (ENOMEM);
19996 }
19997
/*
 * Max frag size handed to ire_create() for the IRE_LOCAL/IRE_LOOPBACK entry
 * that ipif_up_done() builds for an interface address.
 * NOTE(review): the trailing 20 is presumably headroom for a minimal
 * transport header on top of the simple IP header -- confirm before changing.
 */
uint_t ip_loopback_mtuplus = IP_LOOPBACK_MTU + IP_SIMPLE_HDR_LENGTH + 20;
19999
20000 /*
20001  * DLPI and ARP is up.
20002  * Create all the IREs associated with an interface bring up multicast.
20003  * Set the interface flag and finish other initialization
20004  * that potentially had to be differed to after DL_BIND_ACK.
20005  */
20006 int
20007 ipif_up_done(ipif_t *ipif)
20008 {
20009         ire_t   *ire_array[20];
20010         ire_t   **irep = ire_array;
20011         ire_t   **irep1;
20012         ipaddr_t net_mask = 0;
20013         ipaddr_t subnet_mask, route_mask;
20014         ill_t   *ill = ipif->ipif_ill;
20015         queue_t *stq;
20016         ipif_t   *src_ipif;
20017         ipif_t   *tmp_ipif;
20018         boolean_t       flush_ire_cache = B_TRUE;
20019         int     err = 0;
20020         phyint_t *phyi;
20021         ire_t   **ipif_saved_irep = NULL;
20022         int ipif_saved_ire_cnt;
20023         int     cnt;
20024         boolean_t       src_ipif_held = B_FALSE;
20025         boolean_t       ire_added = B_FALSE;
20026         boolean_t       loopback = B_FALSE;
20027         ip_stack_t      *ipst = ill->ill_ipst;
20028
20029         ip1dbg(("ipif_up_done(%s:%u)\n",
20030             ipif->ipif_ill->ill_name, ipif->ipif_id));
20031         /* Check if this is a loopback interface */
20032         if (ipif->ipif_ill->ill_wq == NULL)
20033                 loopback = B_TRUE;
20034
20035         ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock));
20036         /*
20037          * If all other interfaces for this ill are down or DEPRECATED,
20038          * or otherwise unsuitable for source address selection, remove
20039          * any IRE_CACHE entries for this ill to make sure source
20040          * address selection gets to take this new ipif into account.
20041          * No need to hold ill_lock while traversing the ipif list since
20042          * we are writer
20043          */
20044         for (tmp_ipif = ill->ill_ipif; tmp_ipif;
20045             tmp_ipif = tmp_ipif->ipif_next) {
20046                 if (((tmp_ipif->ipif_flags &
20047                     (IPIF_NOXMIT|IPIF_ANYCAST|IPIF_NOLOCAL|IPIF_DEPRECATED)) ||
20048                     !(tmp_ipif->ipif_flags & IPIF_UP)) ||
20049                     (tmp_ipif == ipif))
20050                         continue;
20051                 /* first useable pre-existing interface */
20052                 flush_ire_cache = B_FALSE;
20053                 break;
20054         }
20055         if (flush_ire_cache)
20056                 ire_walk_ill_v4(MATCH_IRE_ILL_GROUP | MATCH_IRE_TYPE,
20057                     IRE_CACHE, ill_ipif_cache_delete, (char *)ill, ill);
20058
20059         /*
20060          * Figure out which way the send-to queue should go.  Only
20061          * IRE_IF_RESOLVER or IRE_IF_NORESOLVER or IRE_LOOPBACK
20062          * should show up here.
20063          */
20064         switch (ill->ill_net_type) {
20065         case IRE_IF_RESOLVER:
20066                 stq = ill->ill_rq;
20067                 break;
20068         case IRE_IF_NORESOLVER:
20069         case IRE_LOOPBACK:
20070                 stq = ill->ill_wq;
20071                 break;
20072         default:
20073                 return (EINVAL);
20074         }
20075
20076         if (IS_LOOPBACK(ill)) {
20077                 /*
20078                  * lo0:1 and subsequent ipifs were marked IRE_LOCAL in
20079                  * ipif_lookup_on_name(), but in the case of zones we can have
20080                  * several loopback addresses on lo0. So all the interfaces with
20081                  * loopback addresses need to be marked IRE_LOOPBACK.
20082                  */
20083                 if (V4_PART_OF_V6(ipif->ipif_v6lcl_addr) ==
20084                     htonl(INADDR_LOOPBACK))
20085                         ipif->ipif_ire_type = IRE_LOOPBACK;
20086                 else
20087                         ipif->ipif_ire_type = IRE_LOCAL;
20088         }
20089
20090         if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED)) {
20091                 /*
20092                  * Can't use our source address. Select a different
20093                  * source address for the IRE_INTERFACE and IRE_LOCAL
20094                  */
20095                 src_ipif = ipif_select_source(ipif->ipif_ill,
20096                     ipif->ipif_subnet, ipif->ipif_zoneid);
20097                 if (src_ipif == NULL)
20098                         src_ipif = ipif;        /* Last resort */
20099                 else
20100                         src_ipif_held = B_TRUE;
20101         } else {
20102                 src_ipif = ipif;
20103         }
20104
20105         /* Create all the IREs associated with this interface */
20106         if ((ipif->ipif_lcl_addr != INADDR_ANY) &&
20107             !(ipif->ipif_flags & IPIF_NOLOCAL)) {
20108
20109                 /*
20110                  * If we're on a labeled system then make sure that zone-
20111                  * private addresses have proper remote host database entries.
20112                  */
20113                 if (is_system_labeled() &&
20114                     ipif->ipif_ire_type != IRE_LOOPBACK &&
20115                     !tsol_check_interface_address(ipif))
20116                         return (EINVAL);
20117
20118                 /* Register the source address for __sin6_src_id */
20119                 err = ip_srcid_insert(&ipif->ipif_v6lcl_addr,
20120                     ipif->ipif_zoneid, ipst);
20121                 if (err != 0) {
20122                         ip0dbg(("ipif_up_done: srcid_insert %d\n", err));
20123                         return (err);
20124                 }
20125
20126                 /* If the interface address is set, create the local IRE. */
20127                 ip1dbg(("ipif_up_done: 0x%p creating IRE 0x%x for 0x%x\n",
20128                     (void *)ipif,
20129                     ipif->ipif_ire_type,
20130                     ntohl(ipif->ipif_lcl_addr)));
20131                 *irep++ = ire_create(
20132                     (uchar_t *)&ipif->ipif_lcl_addr,    /* dest address */
20133                     (uchar_t *)&ip_g_all_ones,          /* mask */
20134                     (uchar_t *)&src_ipif->ipif_src_addr, /* source address */
20135                     NULL,                               /* no gateway */
20136                     &ip_loopback_mtuplus,               /* max frag size */
20137                     NULL,
20138                     ipif->ipif_rq,                      /* recv-from queue */
20139                     NULL,                               /* no send-to queue */
20140                     ipif->ipif_ire_type,                /* LOCAL or LOOPBACK */
20141                     ipif,
20142                     0,
20143                     0,
20144                     0,
20145                     (ipif->ipif_flags & IPIF_PRIVATE) ?
20146                     RTF_PRIVATE : 0,
20147                     &ire_uinfo_null,
20148                     NULL,
20149                     NULL,
20150                     ipst);
20151         } else {
20152                 ip1dbg((
20153                     "ipif_up_done: not creating IRE %d for 0x%x: flags 0x%x\n",
20154                     ipif->ipif_ire_type,
20155                     ntohl(ipif->ipif_lcl_addr),
20156                     (uint_t)ipif->ipif_flags));
20157         }
20158         if ((ipif->ipif_lcl_addr != INADDR_ANY) &&
20159             !(ipif->ipif_flags & IPIF_NOLOCAL)) {
20160                 net_mask = ip_net_mask(ipif->ipif_lcl_addr);
20161         } else {
20162                 net_mask = htonl(IN_CLASSA_NET);        /* fallback */
20163         }
20164
20165         subnet_mask = ipif->ipif_net_mask;
20166
20167         /*
20168          * If mask was not specified, use natural netmask of
20169          * interface address. Also, store this mask back into the
20170          * ipif struct.
20171          */
20172         if (subnet_mask == 0) {
20173                 subnet_mask = net_mask;
20174                 V4MASK_TO_V6(subnet_mask, ipif->ipif_v6net_mask);
20175                 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
20176                     ipif->ipif_v6subnet);
20177         }
20178
20179         /* Set up the IRE_IF_RESOLVER or IRE_IF_NORESOLVER, as appropriate. */
20180         if (stq != NULL && !(ipif->ipif_flags & IPIF_NOXMIT) &&
20181             ipif->ipif_subnet != INADDR_ANY) {
20182                 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */
20183
20184                 if (ipif->ipif_flags & IPIF_POINTOPOINT) {
20185                         route_mask = IP_HOST_MASK;
20186                 } else {
20187                         route_mask = subnet_mask;
20188                 }
20189
20190                 ip1dbg(("ipif_up_done: ipif 0x%p ill 0x%p "
20191                     "creating if IRE ill_net_type 0x%x for 0x%x\n",
20192                     (void *)ipif, (void *)ill,
20193                     ill->ill_net_type,
20194                     ntohl(ipif->ipif_subnet)));
20195                 *irep++ = ire_create(
20196                     (uchar_t *)&ipif->ipif_subnet,      /* dest address */
20197                     (uchar_t *)&route_mask,             /* mask */
20198                     (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */
20199                     NULL,                               /* no gateway */
20200                     &ipif->ipif_mtu,                    /* max frag */
20201                     NULL,
20202                     NULL,                               /* no recv queue */
20203                     stq,                                /* send-to queue */
20204                     ill->ill_net_type,                  /* IF_[NO]RESOLVER */
20205                     ipif,
20206                     0,
20207                     0,
20208                     0,
20209                     (ipif->ipif_flags & IPIF_PRIVATE) ? RTF_PRIVATE: 0,
20210                     &ire_uinfo_null,
20211                     NULL,
20212                     NULL,
20213                     ipst);
20214         }
20215
20216         /*
20217          * Create any necessary broadcast IREs.
20218          */
20219         if ((ipif->ipif_subnet != INADDR_ANY) &&
20220             (ipif->ipif_flags & IPIF_BROADCAST))
20221                 irep = ipif_create_bcast_ires(ipif, irep);
20222
20223         ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock));
20224
20225         /* If an earlier ire_create failed, get out now */
20226         for (irep1 = irep; irep1 > ire_array; ) {
20227                 irep1--;
20228                 if (*irep1 == NULL) {
20229                         ip1dbg(("ipif_up_done: NULL ire found in ire_array\n"));
20230                         err = ENOMEM;
20231                         goto bad;
20232                 }
20233         }
20234
20235         /*
20236          * Need to atomically check for ip_addr_availablity_check
20237          * under ip_addr_avail_lock, and if it fails got bad, and remove
20238          * from group also.The ill_g_lock is grabbed as reader
20239          * just to make sure no new ills or new ipifs are being added
20240          * to the system while we are checking the uniqueness of addresses.
20241          */
20242         rw_enter(&ipst->ips_ill_g_lock, RW_READER);
20243         mutex_enter(&ipst->ips_ip_addr_avail_lock);
20244         /* Mark it up, and increment counters. */
20245         ipif->ipif_flags |= IPIF_UP;
20246         ill->ill_ipif_up_count++;
20247         err = ip_addr_availability_check(ipif);
20248         mutex_exit(&ipst->ips_ip_addr_avail_lock);
20249         rw_exit(&ipst->ips_ill_g_lock);
20250
20251         if (err != 0) {
20252                 /*
20253                  * Our address may already be up on the same ill. In this case,
20254                  * the ARP entry for our ipif replaced the one for the other
20255                  * ipif. So we don't want to delete it (otherwise the other ipif
20256                  * would be unable to send packets).
20257                  * ip_addr_availability_check() identifies this case for us and
20258                  * returns EADDRINUSE; we need to turn it into EADDRNOTAVAIL
20259                  * which is the expected error code.
20260                  */
20261                 if (err == EADDRINUSE) {
20262                         freemsg(ipif->ipif_arp_del_mp);
20263                         ipif->ipif_arp_del_mp = NULL;
20264                         err = EADDRNOTAVAIL;
20265                 }
20266                 ill->ill_ipif_up_count--;
20267                 ipif->ipif_flags &= ~IPIF_UP;
20268                 goto bad;
20269         }
20270
20271         /*
20272          * Add in all newly created IREs.  ire_create_bcast() has
20273          * already checked for duplicates of the IRE_BROADCAST type.
20274          * We want to add before we call ifgrp_insert which wants
20275          * to know whether IRE_IF_RESOLVER exists or not.
20276          *
20277          * NOTE : We refrele the ire though we may branch to "bad"
20278          *        later on where we do ire_delete. This is okay
20279          *        because nobody can delete it as we are running
20280          *        exclusively.
20281          */
20282         for (irep1 = irep; irep1 > ire_array; ) {
20283                 irep1--;
20284                 ASSERT(!MUTEX_HELD(&((*irep1)->ire_ipif->ipif_ill->ill_lock)));
20285                 /*
20286                  * refheld by ire_add. refele towards the end of the func
20287                  */
20288                 (void) ire_add(irep1, NULL, NULL, NULL, B_FALSE);
20289         }
20290         ire_added = B_TRUE;
20291         /*
20292          * Form groups if possible.
20293          *
20294          * If we are supposed to be in a ill_group with a name, insert it
20295          * now as we know that at least one ipif is UP. Otherwise form
20296          * nameless groups.
20297          *
20298          * If ip_enable_group_ifs is set and ipif address is not 0, insert
20299          * this ipif into the appropriate interface group, or create a
20300          * new one. If this is already in a nameless group, we try to form
20301          * a bigger group looking at other ills potentially sharing this
20302          * ipif's prefix.
20303          */
20304         phyi = ill->ill_phyint;
20305         if (phyi->phyint_groupname_len != 0) {
20306                 ASSERT(phyi->phyint_groupname != NULL);
20307                 if (ill->ill_ipif_up_count == 1) {
20308                         ASSERT(ill->ill_group == NULL);
20309                         err = illgrp_insert(&ipst->ips_illgrp_head_v4, ill,
20310                             phyi->phyint_groupname, NULL, B_TRUE);
20311                         if (err != 0) {
20312                                 ip1dbg(("ipif_up_done: illgrp allocation "
20313                                     "failed, error %d\n", err));
20314                                 goto bad;
20315                         }
20316                 }
20317                 ASSERT(ill->ill_group != NULL);
20318         }
20319
20320         /*
20321          * When this is part of group, we need to make sure that
20322          * any broadcast ires created because of this ipif coming
20323          * UP gets marked/cleared with IRE_MARK_NORECV appropriately
20324          * so that we don't receive duplicate broadcast packets.
20325          */
20326         if (ill->ill_group != NULL && ill->ill_ipif_up_count != 0)
20327                 ipif_renominate_bcast(ipif);
20328
20329         /* Recover any additional IRE_IF_[NO]RESOLVER entries for this ipif */
20330         ipif_saved_ire_cnt = ipif->ipif_saved_ire_cnt;
20331         ipif_saved_irep = ipif_recover_ire(ipif);
20332
20333         if (!loopback) {
20334                 /*
20335                  * If the broadcast address has been set, make sure it makes
20336                  * sense based on the interface address.
20337                  * Only match on ill since we are sharing broadcast addresses.
20338                  */
20339                 if ((ipif->ipif_brd_addr != INADDR_ANY) &&
20340                     (ipif->ipif_flags & IPIF_BROADCAST)) {
20341                         ire_t   *ire;
20342
20343                         ire = ire_ctable_lookup(ipif->ipif_brd_addr, 0,
20344                             IRE_BROADCAST, ipif, ALL_ZONES,
20345                             NULL, (MATCH_IRE_TYPE | MATCH_IRE_ILL), ipst);
20346
20347                         if (ire == NULL) {
20348                                 /*
20349                                  * If there isn't a matching broadcast IRE,
20350                                  * revert to the default for this netmask.
20351                                  */
20352                                 ipif->ipif_v6brd_addr = ipv6_all_zeros;
20353                                 mutex_enter(&ipif->ipif_ill->ill_lock);
20354                                 ipif_set_default(ipif);
20355                                 mutex_exit(&ipif->ipif_ill->ill_lock);
20356                         } else {
20357                                 ire_refrele(ire);
20358                         }
20359                 }
20360
20361         }
20362
20363         /* This is the first interface on this ill */
20364         if (ipif->ipif_ipif_up_count == 1 && !loopback) {
20365                 /*
20366                  * Need to recover all multicast memberships in the driver.
20367                  * This had to be deferred until we had attached.
20368                  */
20369                 ill_recover_multicast(ill);
20370         }
20371         /* Join the allhosts multicast address */
20372         ipif_multicast_up(ipif);
20373
20374         if (!loopback) {
20375                 /*
20376                  * See whether anybody else would benefit from the
20377                  * new ipif that we added. We call this always rather
20378                  * than while adding a non-IPIF_NOLOCAL/DEPRECATED/ANYCAST
20379                  * ipif is for the benefit of illgrp_insert (done above)
20380                  * which does not do source address selection as it does
20381                  * not want to re-create interface routes that we are
20382                  * having reference to it here.
20383                  */
20384                 ill_update_source_selection(ill);
20385         }
20386
20387         for (irep1 = irep; irep1 > ire_array; ) {
20388                 irep1--;
20389                 if (*irep1 != NULL) {
20390                         /* was held in ire_add */
20391                         ire_refrele(*irep1);
20392                 }
20393         }
20394
20395         cnt = ipif_saved_ire_cnt;
20396         for (irep1 = ipif_saved_irep; cnt > 0; irep1++, cnt--) {
20397                 if (*irep1 != NULL) {
20398                         /* was held in ire_add */
20399                         ire_refrele(*irep1);
20400                 }
20401         }
20402
20403         if (!loopback && ipif->ipif_addr_ready) {
20404                 /* Broadcast an address mask reply. */
20405                 ipif_mask_reply(ipif);
20406         }
20407         if (ipif_saved_irep != NULL) {
20408                 kmem_free(ipif_saved_irep,
20409                     ipif_saved_ire_cnt * sizeof (ire_t *));
20410         }
20411         if (src_ipif_held)
20412                 ipif_refrele(src_ipif);
20413
20414         /*
20415          * This had to be deferred until we had bound.  Tell routing sockets and
20416          * others that this interface is up if it looks like the address has
20417          * been validated.  Otherwise, if it isn't ready yet, wait for
20418          * duplicate address detection to do its thing.
20419          */
20420         if (ipif->ipif_addr_ready) {
20421                 ip_rts_ifmsg(ipif);
20422                 ip_rts_newaddrmsg(RTM_ADD, 0, ipif);
20423                 /* Let SCTP update the status for this ipif */
20424                 sctp_update_ipif(ipif, SCTP_IPIF_UP);
20425         }
20426         return (0);
20427
20428 bad:
20429         ip1dbg(("ipif_up_done: FAILED \n"));
20430         /*
20431          * We don't have to bother removing from ill groups because
20432          *
20433          * 1) For groups with names, we insert only when the first ipif
20434          *    comes up. In that case if it fails, it will not be in any
20435          *    group. So, we need not try to remove for that case.
20436          *
20437          * 2) For groups without names, either we tried to insert ipif_ill
20438          *    in a group as singleton or found some other group to become
20439          *    a bigger group. For the former, if it fails we don't have
20440          *    anything to do as ipif_ill is not in the group and for the
20441          *    latter, there are no failures in illgrp_insert/illgrp_delete
20442          *    (ENOMEM can't occur for this. Check ifgrp_insert).
20443          */
20444         while (irep > ire_array) {
20445                 irep--;
20446                 if (*irep != NULL) {
20447                         ire_delete(*irep);
20448                         if (ire_added)
20449                                 ire_refrele(*irep);
20450                 }
20451         }
20452         (void) ip_srcid_remove(&ipif->ipif_v6lcl_addr, ipif->ipif_zoneid, ipst);
20453
20454         if (ipif_saved_irep != NULL) {
20455                 kmem_free(ipif_saved_irep,
20456                     ipif_saved_ire_cnt * sizeof (ire_t *));
20457         }
20458         if (src_ipif_held)
20459                 ipif_refrele(src_ipif);
20460
20461         ipif_arp_down(ipif);
20462         return (err);
20463 }
20464
20465 /*
20466  * Turn off the ARP with the ILLF_NOARP flag.
20467  */
20468 static int
20469 ill_arp_off(ill_t *ill)
20470 {
20471         mblk_t  *arp_off_mp = NULL;
20472         mblk_t  *arp_on_mp = NULL;
20473
20474         ip1dbg(("ill_arp_off(%s)\n", ill->ill_name));
20475
20476         ASSERT(IAM_WRITER_ILL(ill));
20477         ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
20478
20479         /*
20480          * If the on message is still around we've already done
20481          * an arp_off without doing an arp_on thus there is no
20482          * work needed.
20483          */
20484         if (ill->ill_arp_on_mp != NULL)
20485                 return (0);
20486
20487         /*
20488          * Allocate an ARP on message (to be saved) and an ARP off message
20489          */
20490         arp_off_mp = ill_arp_alloc(ill, (uchar_t *)&ip_aroff_template, 0);
20491         if (!arp_off_mp)
20492                 return (ENOMEM);
20493
20494         arp_on_mp = ill_arp_alloc(ill, (uchar_t *)&ip_aron_template, 0);
20495         if (!arp_on_mp)
20496                 goto failed;
20497
20498         ASSERT(ill->ill_arp_on_mp == NULL);
20499         ill->ill_arp_on_mp = arp_on_mp;
20500
20501         /* Send an AR_INTERFACE_OFF request */
20502         putnext(ill->ill_rq, arp_off_mp);
20503         return (0);
20504 failed:
20505
20506         if (arp_off_mp)
20507                 freemsg(arp_off_mp);
20508         return (ENOMEM);
20509 }
20510
20511 /*
20512  * Turn on ARP by turning off the ILLF_NOARP flag.
20513  */
20514 static int
20515 ill_arp_on(ill_t *ill)
20516 {
20517         mblk_t  *mp;
20518
20519         ip1dbg(("ipif_arp_on(%s)\n", ill->ill_name));
20520
20521         ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
20522
20523         ASSERT(IAM_WRITER_ILL(ill));
20524         /*
20525          * Send an AR_INTERFACE_ON request if we have already done
20526          * an arp_off (which allocated the message).
20527          */
20528         if (ill->ill_arp_on_mp != NULL) {
20529                 mp = ill->ill_arp_on_mp;
20530                 ill->ill_arp_on_mp = NULL;
20531                 putnext(ill->ill_rq, mp);
20532         }
20533         return (0);
20534 }
20535
20536 /*
20537  * Called after either deleting ill from the group or when setting
20538  * FAILED or STANDBY on the interface.
20539  */
20540 static void
20541 illgrp_reset_schednext(ill_t *ill)
20542 {
20543         ill_group_t *illgrp;
20544         ill_t *save_ill;
20545
20546         ASSERT(IAM_WRITER_ILL(ill));
20547         /*
20548          * When called from illgrp_delete, ill_group will be non-NULL.
20549          * But when called from ip_sioctl_flags, it could be NULL if
20550          * somebody is setting FAILED/INACTIVE on some interface which
20551          * is not part of a group.
20552          */
20553         illgrp = ill->ill_group;
20554         if (illgrp == NULL)
20555                 return;
20556         if (illgrp->illgrp_ill_schednext != ill)
20557                 return;
20558
20559         illgrp->illgrp_ill_schednext = NULL;
20560         save_ill = ill;
20561         /*
20562          * Choose a good ill to be the next one for
20563          * outbound traffic. As the flags FAILED/STANDBY is
20564          * not yet marked when called from ip_sioctl_flags,
20565          * we check for ill separately.
20566          */
20567         for (ill = illgrp->illgrp_ill; ill != NULL;
20568             ill = ill->ill_group_next) {
20569                 if ((ill != save_ill) &&
20570                     !(ill->ill_phyint->phyint_flags &
20571                     (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE))) {
20572                         illgrp->illgrp_ill_schednext = ill;
20573                         return;
20574                 }
20575         }
20576 }
20577
20578 /*
20579  * Given an ill, find the next ill in the group to be scheduled.
20580  * (This should be called by ip_newroute() before ire_create().)
20581  * The passed in ill may be pulled out of the group, after we have picked
20582  * up a different outgoing ill from the same group. However ire add will
20583  * atomically check this.
20584  */
20585 ill_t *
20586 illgrp_scheduler(ill_t *ill)
20587 {
20588         ill_t *retill;
20589         ill_group_t *illgrp;
20590         int illcnt;
20591         int i;
20592         uint64_t flags;
20593         ip_stack_t      *ipst = ill->ill_ipst;
20594
20595         /*
20596          * We don't use a lock to check for the ill_group. If this ill
20597          * is currently being inserted we may end up just returning this
20598          * ill itself. That is ok.
20599          */
20600         if (ill->ill_group == NULL) {
20601                 ill_refhold(ill);
20602                 return (ill);
20603         }
20604
20605         /*
20606          * Grab the ill_g_lock as reader to make sure we are dealing with
20607          * a set of stable ills. No ill can be added or deleted or change
20608          * group while we hold the reader lock.
20609          */
20610         rw_enter(&ipst->ips_ill_g_lock, RW_READER);
20611         if ((illgrp = ill->ill_group) == NULL) {
20612                 rw_exit(&ipst->ips_ill_g_lock);
20613                 ill_refhold(ill);
20614                 return (ill);
20615         }
20616
20617         illcnt = illgrp->illgrp_ill_count;
20618         mutex_enter(&illgrp->illgrp_lock);
20619         retill = illgrp->illgrp_ill_schednext;
20620
20621         if (retill == NULL)
20622                 retill = illgrp->illgrp_ill;
20623
20624         /*
20625          * We do a circular search beginning at illgrp_ill_schednext
20626          * or illgrp_ill. We don't check the flags against the ill lock
20627          * since it can change anytime. The ire creation will be atomic
20628          * and will fail if the ill is FAILED or OFFLINE.
20629          */
20630         for (i = 0; i < illcnt; i++) {
20631                 flags = retill->ill_phyint->phyint_flags;
20632
20633                 if (!(flags & (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) &&
20634                     ILL_CAN_LOOKUP(retill)) {
20635                         illgrp->illgrp_ill_schednext = retill->ill_group_next;
20636                         ill_refhold(retill);
20637                         break;
20638                 }
20639                 retill = retill->ill_group_next;
20640                 if (retill == NULL)
20641                         retill = illgrp->illgrp_ill;
20642         }
20643         mutex_exit(&illgrp->illgrp_lock);
20644         rw_exit(&ipst->ips_ill_g_lock);
20645
20646         return (i == illcnt ? NULL : retill);
20647 }
20648
20649 /*
20650  * Checks for availbility of a usable source address (if there is one) when the
20651  * destination ILL has the ill_usesrc_ifindex pointing to another ILL. Note
20652  * this selection is done regardless of the destination.
20653  */
20654 boolean_t
20655 ipif_usesrc_avail(ill_t *ill, zoneid_t zoneid)
20656 {
20657         uint_t  ifindex;
20658         ipif_t  *ipif = NULL;
20659         ill_t   *uill;
20660         boolean_t isv6;
20661         ip_stack_t      *ipst = ill->ill_ipst;
20662
20663         ASSERT(ill != NULL);
20664
20665         isv6 = ill->ill_isv6;
20666         ifindex = ill->ill_usesrc_ifindex;
20667         if (ifindex != 0) {
20668                 uill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL,
20669                     NULL, ipst);
20670                 if (uill == NULL)
20671                         return (NULL);
20672                 mutex_enter(&uill->ill_lock);
20673                 for (ipif = uill->ill_ipif; ipif != NULL;
20674                     ipif = ipif->ipif_next) {
20675                         if (!IPIF_CAN_LOOKUP(ipif))
20676                                 continue;
20677                         if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST))
20678                                 continue;
20679                         if (!(ipif->ipif_flags & IPIF_UP))
20680                                 continue;
20681                         if (ipif->ipif_zoneid != zoneid)
20682                                 continue;
20683                         if ((isv6 &&
20684                             IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) ||
20685                             (ipif->ipif_lcl_addr == INADDR_ANY))
20686                                 continue;
20687                         mutex_exit(&uill->ill_lock);
20688                         ill_refrele(uill);
20689                         return (B_TRUE);
20690                 }
20691                 mutex_exit(&uill->ill_lock);
20692                 ill_refrele(uill);
20693         }
20694         return (B_FALSE);
20695 }
20696
20697 /*
20698  * Determine the best source address given a destination address and an ill.
20699  * Prefers non-deprecated over deprecated but will return a deprecated
20700  * address if there is no other choice. If there is a usable source address
20701  * on the interface pointed to by ill_usesrc_ifindex then that is given
20702  * first preference.
20703  *
20704  * Returns NULL if there is no suitable source address for the ill.
20705  * This only occurs when there is no valid source address for the ill.
20706  */
20707 ipif_t *
20708 ipif_select_source(ill_t *ill, ipaddr_t dst, zoneid_t zoneid)
20709 {
20710         ipif_t *ipif;
20711         ipif_t *ipif_dep = NULL;        /* Fallback to deprecated */
20712         ipif_t *ipif_arr[MAX_IPIF_SELECT_SOURCE];
20713         int index = 0;
20714         boolean_t wrapped = B_FALSE;
20715         boolean_t same_subnet_only = B_FALSE;
20716         boolean_t ipif_same_found, ipif_other_found;
20717         boolean_t specific_found;
20718         ill_t   *till, *usill = NULL;
20719         tsol_tpc_t *src_rhtp, *dst_rhtp;
20720         ip_stack_t      *ipst = ill->ill_ipst;
20721
20722         if (ill->ill_usesrc_ifindex != 0) {
20723                 usill = ill_lookup_on_ifindex(ill->ill_usesrc_ifindex,
20724                     B_FALSE, NULL, NULL, NULL, NULL, ipst);
20725                 if (usill != NULL)
20726                         ill = usill;    /* Select source from usesrc ILL */
20727                 else
20728                         return (NULL);
20729         }
20730
20731         /*
20732          * If we're dealing with an unlabeled destination on a labeled system,
20733          * make sure that we ignore source addresses that are incompatible with
20734          * the destination's default label.  That destination's default label
20735          * must dominate the minimum label on the source address.
20736          */
20737         dst_rhtp = NULL;
20738         if (is_system_labeled()) {
20739                 dst_rhtp = find_tpc(&dst, IPV4_VERSION, B_FALSE);
20740                 if (dst_rhtp == NULL)
20741                         return (NULL);
20742                 if (dst_rhtp->tpc_tp.host_type != UNLABELED) {
20743                         TPC_RELE(dst_rhtp);
20744                         dst_rhtp = NULL;
20745                 }
20746         }
20747
20748         /*
20749          * Holds the ill_g_lock as reader. This makes sure that no ipif/ill
20750          * can be deleted. But an ipif/ill can get CONDEMNED any time.
20751          * After selecting the right ipif, under ill_lock make sure ipif is
20752          * not condemned, and increment refcnt. If ipif is CONDEMNED,
20753          * we retry. Inside the loop we still need to check for CONDEMNED,
20754          * but not under a lock.
20755          */
20756         rw_enter(&ipst->ips_ill_g_lock, RW_READER);
20757
20758 retry:
20759         till = ill;
20760         ipif_arr[0] = NULL;
20761
20762         if (till->ill_group != NULL)
20763                 till = till->ill_group->illgrp_ill;
20764
20765         /*
20766          * Choose one good source address from each ill across the group.
20767          * If possible choose a source address in the same subnet as
20768          * the destination address.
20769          *
20770          * We don't check for PHYI_FAILED or PHYI_INACTIVE or PHYI_OFFLINE
20771          * This is okay because of the following.
20772          *
20773          *    If PHYI_FAILED is set and we still have non-deprecated
20774          *    addresses, it means the addresses have not yet been
20775          *    failed over to a different interface. We potentially
20776          *    select them to create IRE_CACHES, which will be later
20777          *    flushed when the addresses move over.
20778          *
20779          *    If PHYI_INACTIVE is set and we still have non-deprecated
20780          *    addresses, it means either the user has configured them
20781          *    or PHYI_INACTIVE has not been cleared after the addresses
20782          *    been moved over. For the former, in.mpathd does a failover
20783          *    when the interface becomes INACTIVE and hence we should
20784          *    not find them. Once INACTIVE is set, we don't allow them
20785          *    to create logical interfaces anymore. For the latter, a
20786          *    flush will happen when INACTIVE is cleared which will
20787          *    flush the IRE_CACHES.
20788          *
20789          *    If PHYI_OFFLINE is set, all the addresses will be failed
20790          *    over soon. We potentially select them to create IRE_CACHEs,
20791          *    which will be later flushed when the addresses move over.
20792          *
20793          * NOTE : As ipif_select_source is called to borrow source address
20794          * for an ipif that is part of a group, source address selection
20795          * will be re-done whenever the group changes i.e either an
20796          * insertion/deletion in the group.
20797          *
20798          * Fill ipif_arr[] with source addresses, using these rules:
20799          *
20800          *      1. At most one source address from a given ill ends up
20801          *         in ipif_arr[] -- that is, at most one of the ipif's
20802          *         associated with a given ill ends up in ipif_arr[].
20803          *
20804          *      2. If there is at least one non-deprecated ipif in the
20805          *         IPMP group with a source address on the same subnet as
20806          *         our destination, then fill ipif_arr[] only with
20807          *         source addresses on the same subnet as our destination.
20808          *         Note that because of (1), only the first
20809          *         non-deprecated ipif found with a source address
20810          *         matching the destination ends up in ipif_arr[].
20811          *
20812          *      3. Otherwise, fill ipif_arr[] with non-deprecated source
20813          *         addresses not in the same subnet as our destination.
20814          *         Again, because of (1), only the first off-subnet source
20815          *         address will be chosen.
20816          *
20817          *      4. If there are no non-deprecated ipifs, then just use
20818          *         the source address associated with the last deprecated
20819          *         one we find that happens to be on the same subnet,
20820          *         otherwise the first one not in the same subnet.
20821          */
20822         specific_found = B_FALSE;
20823         for (; till != NULL; till = till->ill_group_next) {
20824                 ipif_same_found = B_FALSE;
20825                 ipif_other_found = B_FALSE;
20826                 for (ipif = till->ill_ipif; ipif != NULL;
20827                     ipif = ipif->ipif_next) {
20828                         if (!IPIF_CAN_LOOKUP(ipif))
20829                                 continue;
20830                         /* Always skip NOLOCAL and ANYCAST interfaces */
20831                         if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST))
20832                                 continue;
20833                         if (!(ipif->ipif_flags & IPIF_UP) ||
20834                             !ipif->ipif_addr_ready)
20835                                 continue;
20836                         if (ipif->ipif_zoneid != zoneid &&
20837                             ipif->ipif_zoneid != ALL_ZONES)
20838                                 continue;
20839                         /*
20840                          * Interfaces with 0.0.0.0 address are allowed to be UP,
20841                          * but are not valid as source addresses.
20842                          */
20843                         if (ipif->ipif_lcl_addr == INADDR_ANY)
20844                                 continue;
20845
20846                         /*
20847                          * Check compatibility of local address for
20848                          * destination's default label if we're on a labeled
20849                          * system.  Incompatible addresses can't be used at
20850                          * all.
20851                          */
20852                         if (dst_rhtp != NULL) {
20853                                 boolean_t incompat;
20854
20855                                 src_rhtp = find_tpc(&ipif->ipif_lcl_addr,
20856                                     IPV4_VERSION, B_FALSE);
20857                                 if (src_rhtp == NULL)
20858                                         continue;
20859                                 incompat =
20860                                     src_rhtp->tpc_tp.host_type != SUN_CIPSO ||
20861                                     src_rhtp->tpc_tp.tp_doi !=
20862                                     dst_rhtp->tpc_tp.tp_doi ||
20863                                     (!_blinrange(&dst_rhtp->tpc_tp.tp_def_label,
20864                                     &src_rhtp->tpc_tp.tp_sl_range_cipso) &&
20865                                     !blinlset(&dst_rhtp->tpc_tp.tp_def_label,
20866                                     src_rhtp->tpc_tp.tp_sl_set_cipso));
20867                                 TPC_RELE(src_rhtp);
20868                                 if (incompat)
20869                                         continue;
20870                         }
20871
20872                         /*
20873                          * We prefer not to use all all-zones addresses, if we
20874                          * can avoid it, as they pose problems with unlabeled
20875                          * destinations.
20876                          */
20877                         if (ipif->ipif_zoneid != ALL_ZONES) {
20878                                 if (!specific_found &&
20879                                     (!same_subnet_only ||
20880                                     (ipif->ipif_net_mask & dst) ==
20881                                     ipif->ipif_subnet)) {
20882                                         index = 0;
20883                                         specific_found = B_TRUE;
20884                                         ipif_other_found = B_FALSE;
20885                                 }
20886                         } else {
20887                                 if (specific_found)
20888                                         continue;
20889                         }
20890                         if (ipif->ipif_flags & IPIF_DEPRECATED) {
20891                                 if (ipif_dep == NULL ||
20892                                     (ipif->ipif_net_mask & dst) ==
20893                                     ipif->ipif_subnet)
20894                                         ipif_dep = ipif;
20895                                 continue;
20896                         }
20897                         if ((ipif->ipif_net_mask & dst) == ipif->ipif_subnet) {
20898                                 /* found a source address in the same subnet */
20899                                 if (!same_subnet_only) {
20900                                         same_subnet_only = B_TRUE;
20901                                         index = 0;
20902                                 }
20903                                 ipif_same_found = B_TRUE;
20904                         } else {
20905                                 if (same_subnet_only || ipif_other_found)
20906                                         continue;
20907                                 ipif_other_found = B_TRUE;
20908                         }
20909                         ipif_arr[index++] = ipif;
20910                         if (index == MAX_IPIF_SELECT_SOURCE) {
20911                                 wrapped = B_TRUE;
20912                                 index = 0;
20913                         }
20914                         if (ipif_same_found)
20915                                 break;
20916                 }
20917         }
20918
20919         if (ipif_arr[0] == NULL) {
20920                 ipif = ipif_dep;
20921         } else {
20922                 if (wrapped)
20923                         index = MAX_IPIF_SELECT_SOURCE;
20924                 ipif = ipif_arr[ipif_rand(ipst) % index];
20925                 ASSERT(ipif != NULL);
20926         }
20927
20928         if (ipif != NULL) {
20929                 mutex_enter(&ipif->ipif_ill->ill_lock);
20930                 if (!IPIF_CAN_LOOKUP(ipif)) {
20931                         mutex_exit(&ipif->ipif_ill->ill_lock);
20932                         goto retry;
20933                 }
20934                 ipif_refhold_locked(ipif);
20935                 mutex_exit(&ipif->ipif_ill->ill_lock);
20936         }
20937
20938         rw_exit(&ipst->ips_ill_g_lock);
20939         if (usill != NULL)
20940                 ill_refrele(usill);
20941         if (dst_rhtp != NULL)
20942                 TPC_RELE(dst_rhtp);
20943
20944 #ifdef DEBUG
20945         if (ipif == NULL) {
20946                 char buf1[INET6_ADDRSTRLEN];
20947
20948                 ip1dbg(("ipif_select_source(%s, %s) -> NULL\n",
20949                     ill->ill_name,
20950                     inet_ntop(AF_INET, &dst, buf1, sizeof (buf1))));
20951         } else {
20952                 char buf1[INET6_ADDRSTRLEN];
20953                 char buf2[INET6_ADDRSTRLEN];
20954
20955                 ip1dbg(("ipif_select_source(%s, %s) -> %s\n",
20956                     ipif->ipif_ill->ill_name,
20957                     inet_ntop(AF_INET, &dst, buf1, sizeof (buf1)),
20958                     inet_ntop(AF_INET, &ipif->ipif_lcl_addr,
20959                     buf2, sizeof (buf2))));
20960         }
20961 #endif /* DEBUG */
20962         return (ipif);
20963 }
20964
20965
20966 /*
20967  * If old_ipif is not NULL, see if ipif was derived from old
20968  * ipif and if so, recreate the interface route by re-doing
20969  * source address selection. This happens when ipif_down ->
20970  * ipif_update_other_ipifs calls us.
20971  *
20972  * If old_ipif is NULL, just redo the source address selection
20973  * if needed. This happens when illgrp_insert or ipif_up_done
20974  * calls us.
20975  */
20976 static void
20977 ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif)
20978 {
20979         ire_t *ire;
20980         ire_t *ipif_ire;
20981         queue_t *stq;
20982         ipif_t *nipif;
20983         ill_t *ill;
20984         boolean_t need_rele = B_FALSE;
20985         ip_stack_t      *ipst = ipif->ipif_ill->ill_ipst;
20986
20987         ASSERT(old_ipif == NULL || IAM_WRITER_IPIF(old_ipif));
20988         ASSERT(IAM_WRITER_IPIF(ipif));
20989
20990         ill = ipif->ipif_ill;
20991         if (!(ipif->ipif_flags &
20992             (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED))) {
20993                 /*
20994                  * Can't possibly have borrowed the source
20995                  * from old_ipif.
20996                  */
20997                 return;
20998         }
20999
21000         /*
21001          * Is there any work to be done? No work if the address
21002          * is INADDR_ANY, loopback or NOLOCAL or ANYCAST (
21003          * ipif_select_source() does not borrow addresses from
21004          * NOLOCAL and ANYCAST interfaces).
21005          */
21006         if ((old_ipif != NULL) &&
21007             ((old_ipif->ipif_lcl_addr == INADDR_ANY) ||
21008             (old_ipif->ipif_ill->ill_wq == NULL) ||
21009             (old_ipif->ipif_flags &
21010             (IPIF_NOLOCAL|IPIF_ANYCAST)))) {
21011                 return;
21012         }
21013
21014         /*
21015          * Perform the same checks as when creating the
21016          * IRE_INTERFACE in ipif_up_done.
21017          */
21018         if (!(ipif->ipif_flags & IPIF_UP))
21019                 return;
21020
21021         if ((ipif->ipif_flags & IPIF_NOXMIT) ||
21022             (ipif->ipif_subnet == INADDR_ANY))
21023                 return;
21024
21025         ipif_ire = ipif_to_ire(ipif);
21026         if (ipif_ire == NULL)
21027                 return;
21028
21029         /*
21030          * We know that ipif uses some other source for its
21031          * IRE_INTERFACE. Is it using the source of this
21032          * old_ipif?
21033          */
21034         if (old_ipif != NULL &&
21035             old_ipif->ipif_lcl_addr != ipif_ire->ire_src_addr) {
21036                 ire_refrele(ipif_ire);
21037                 return;
21038         }
21039         if (ip_debug > 2) {
21040                 /* ip1dbg */
21041                 pr_addr_dbg("ipif_recreate_interface_routes: deleting IRE for"
21042                     " src %s\n", AF_INET, &ipif_ire->ire_src_addr);
21043         }
21044
21045         stq = ipif_ire->ire_stq;
21046
21047         /*
21048          * Can't use our source address. Select a different
21049          * source address for the IRE_INTERFACE.
21050          */
21051         nipif = ipif_select_source(ill, ipif->ipif_subnet, ipif->ipif_zoneid);
21052         if (nipif == NULL) {
21053                 /* Last resort - all ipif's have IPIF_NOLOCAL */
21054                 nipif = ipif;
21055         } else {
21056                 need_rele = B_TRUE;
21057         }
21058
21059         ire = ire_create(
21060             (uchar_t *)&ipif->ipif_subnet,      /* dest pref */
21061             (uchar_t *)&ipif->ipif_net_mask,    /* mask */
21062             (uchar_t *)&nipif->ipif_src_addr,   /* src addr */
21063             NULL,                               /* no gateway */
21064             &ipif->ipif_mtu,                    /* max frag */
21065             NULL,                               /* no src nce */
21066             NULL,                               /* no recv from queue */
21067             stq,                                /* send-to queue */
21068             ill->ill_net_type,                  /* IF_[NO]RESOLVER */
21069             ipif,
21070             0,
21071             0,
21072             0,
21073             0,
21074             &ire_uinfo_null,
21075             NULL,
21076             NULL,
21077             ipst);
21078
21079         if (ire != NULL) {
21080                 ire_t *ret_ire;
21081                 int error;
21082
21083                 /*
21084                  * We don't need ipif_ire anymore. We need to delete
21085                  * before we add so that ire_add does not detect
21086                  * duplicates.
21087                  */
21088                 ire_delete(ipif_ire);
21089                 ret_ire = ire;
21090                 error = ire_add(&ret_ire, NULL, NULL, NULL, B_FALSE);
21091                 ASSERT(error == 0);
21092                 ASSERT(ire == ret_ire);
21093                 /* Held in ire_add */
21094                 ire_refrele(ret_ire);
21095         }
21096         /*
21097          * Either we are falling through from above or could not
21098          * allocate a replacement.
21099          */
21100         ire_refrele(ipif_ire);
21101         if (need_rele)
21102                 ipif_refrele(nipif);
21103 }
21104
21105 /*
21106  * This old_ipif is going away.
21107  *
21108  * Determine if any other ipif's is using our address as
21109  * ipif_lcl_addr (due to those being IPIF_NOLOCAL, IPIF_ANYCAST, or
21110  * IPIF_DEPRECATED).
21111  * Find the IRE_INTERFACE for such ipifs and recreate them
21112  * to use an different source address following the rules in
21113  * ipif_up_done.
21114  *
21115  * This function takes an illgrp as an argument so that illgrp_delete
21116  * can call this to update source address even after deleting the
21117  * old_ipif->ipif_ill from the ill group.
21118  */
21119 static void
21120 ipif_update_other_ipifs(ipif_t *old_ipif, ill_group_t *illgrp)
21121 {
21122         ipif_t *ipif;
21123         ill_t *ill;
21124         char    buf[INET6_ADDRSTRLEN];
21125
21126         ASSERT(IAM_WRITER_IPIF(old_ipif));
21127         ASSERT(illgrp == NULL || IAM_WRITER_IPIF(old_ipif));
21128
21129         ill = old_ipif->ipif_ill;
21130
21131         ip1dbg(("ipif_update_other_ipifs(%s, %s)\n",
21132             ill->ill_name,
21133             inet_ntop(AF_INET, &old_ipif->ipif_lcl_addr,
21134             buf, sizeof (buf))));
21135         /*
21136          * If this part of a group, look at all ills as ipif_select_source
21137          * borrows source address across all the ills in the group.
21138          */
21139         if (illgrp != NULL)
21140                 ill = illgrp->illgrp_ill;
21141
21142         for (; ill != NULL; ill = ill->ill_group_next) {
21143                 for (ipif = ill->ill_ipif; ipif != NULL;
21144                     ipif = ipif->ipif_next) {
21145
21146                         if (ipif == old_ipif)
21147                                 continue;
21148
21149                         ipif_recreate_interface_routes(old_ipif, ipif);
21150                 }
21151         }
21152 }
21153
21154 /* ARGSUSED */
21155 int
21156 if_unitsel_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
21157         ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
21158 {
21159         /*
21160          * ill_phyint_reinit merged the v4 and v6 into a single
21161          * ipsq. Could also have become part of a ipmp group in the
21162          * process, and we might not have been able to complete the
21163          * operation in ipif_set_values, if we could not become
21164          * exclusive.  If so restart it here.
21165          */
21166         return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q));
21167 }
21168
21169
21170 /*
21171  * Can operate on either a module or a driver queue.
21172  * Returns an error if not a module queue.
21173  */
21174 /* ARGSUSED */
21175 int
21176 if_unitsel(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
21177     ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
21178 {
21179         queue_t         *q1 = q;
21180         char            *cp;
21181         char            interf_name[LIFNAMSIZ];
21182         uint_t          ppa = *(uint_t *)mp->b_cont->b_cont->b_rptr;
21183
21184         if (q->q_next == NULL) {
21185                 ip1dbg((
21186                     "if_unitsel: IF_UNITSEL: no q_next\n"));
21187                 return (EINVAL);
21188         }
21189
21190         if (((ill_t *)(q->q_ptr))->ill_name[0] != '\0')
21191                 return (EALREADY);
21192
21193         do {
21194                 q1 = q1->q_next;
21195         } while (q1->q_next);
21196         cp = q1->q_qinfo->qi_minfo->mi_idname;
21197         (void) sprintf(interf_name, "%s%d", cp, ppa);
21198
21199         /*
21200          * Here we are not going to delay the ioack until after
21201          * ACKs from DL_ATTACH_REQ/DL_BIND_REQ. So no need to save the
21202          * original ioctl message before sending the requests.
21203          */
21204         return (ipif_set_values(q, mp, interf_name, &ppa));
21205 }
21206
21207 /* ARGSUSED */
21208 int
21209 ip_sioctl_sifname(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
21210     ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
21211 {
21212         return (ENXIO);
21213 }
21214
21215 /*
21216  * Create any IRE_BROADCAST entries for `ipif', and store those entries in
21217  * `irep'.  Returns a pointer to the next free `irep' entry (just like
21218  * ire_check_and_create_bcast()).
21219  */
21220 static ire_t **
21221 ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep)
21222 {
21223         ipaddr_t addr;
21224         ipaddr_t netmask = ip_net_mask(ipif->ipif_lcl_addr);
21225         ipaddr_t subnetmask = ipif->ipif_net_mask;
21226         int flags = MATCH_IRE_TYPE | MATCH_IRE_ILL;
21227
21228         ip1dbg(("ipif_create_bcast_ires: creating broadcast IREs\n"));
21229
21230         ASSERT(ipif->ipif_flags & IPIF_BROADCAST);
21231
21232         if (ipif->ipif_lcl_addr == INADDR_ANY ||
21233             (ipif->ipif_flags & IPIF_NOLOCAL))
21234                 netmask = htonl(IN_CLASSA_NET);         /* fallback */
21235
21236         irep = ire_check_and_create_bcast(ipif, 0, irep, flags);
21237         irep = ire_check_and_create_bcast(ipif, INADDR_BROADCAST, irep, flags);
21238
21239         /*
21240          * For backward compatibility, we create net broadcast IREs based on
21241          * the old "IP address class system", since some old machines only
21242          * respond to these class derived net broadcast.  However, we must not
21243          * create these net broadcast IREs if the subnetmask is shorter than
21244          * the IP address class based derived netmask.  Otherwise, we may
21245          * create a net broadcast address which is the same as an IP address
21246          * on the subnet -- and then TCP will refuse to talk to that address.
21247          */
21248         if (netmask < subnetmask) {
21249                 addr = netmask & ipif->ipif_subnet;
21250                 irep = ire_check_and_create_bcast(ipif, addr, irep, flags);
21251                 irep = ire_check_and_create_bcast(ipif, ~netmask | addr, irep,
21252                     flags);
21253         }
21254
21255         /*
21256          * Don't create IRE_BROADCAST IREs for the interface if the subnetmask
21257          * is 0xFFFFFFFF, as an IRE_LOCAL for that interface is already
21258          * created.  Creating these broadcast IREs will only create confusion
21259          * as `addr' will be the same as the IP address.
21260          */
21261         if (subnetmask != 0xFFFFFFFF) {
21262                 addr = ipif->ipif_subnet;
21263                 irep = ire_check_and_create_bcast(ipif, addr, irep, flags);
21264                 irep = ire_check_and_create_bcast(ipif, ~subnetmask | addr,
21265                     irep, flags);
21266         }
21267
21268         return (irep);
21269 }
21270
21271 /*
21272  * Broadcast IRE info structure used in the functions below.  Since we
21273  * allocate BCAST_COUNT of them on the stack, keep the bit layout compact.
21274  */
21275 typedef struct bcast_ireinfo {
21276         uchar_t         bi_type;        /* BCAST_* value from below */
21277         uchar_t         bi_willdie:1,   /* will this IRE be going away? */
21278                         bi_needrep:1,   /* do we need to replace it? */
21279                         bi_haverep:1,   /* have we replaced it? */
21280                         bi_pad:5;
21281         ipaddr_t        bi_addr;        /* IRE address */
21282         ipif_t          *bi_backup;     /* last-ditch ipif to replace it on */
21283 } bcast_ireinfo_t;
21284
21285 enum { BCAST_ALLONES, BCAST_ALLZEROES, BCAST_NET, BCAST_SUBNET, BCAST_COUNT };
21286
21287 /*
21288  * Check if `ipif' needs the dying broadcast IRE described by `bireinfop', and
21289  * return B_TRUE if it should immediately be used to recreate the IRE.
21290  */
21291 static boolean_t
21292 ipif_consider_bcast(ipif_t *ipif, bcast_ireinfo_t *bireinfop)
21293 {
21294         ipaddr_t addr;
21295
21296         ASSERT(!bireinfop->bi_haverep && bireinfop->bi_willdie);
21297
21298         switch (bireinfop->bi_type) {
21299         case BCAST_NET:
21300                 addr = ipif->ipif_subnet & ip_net_mask(ipif->ipif_subnet);
21301                 if (addr != bireinfop->bi_addr)
21302                         return (B_FALSE);
21303                 break;
21304         case BCAST_SUBNET:
21305                 if (ipif->ipif_subnet != bireinfop->bi_addr)
21306                         return (B_FALSE);
21307                 break;
21308         }
21309
21310         bireinfop->bi_needrep = 1;
21311         if (ipif->ipif_flags & (IPIF_DEPRECATED|IPIF_NOLOCAL|IPIF_ANYCAST)) {
21312                 if (bireinfop->bi_backup == NULL)
21313                         bireinfop->bi_backup = ipif;
21314                 return (B_FALSE);
21315         }
21316         return (B_TRUE);
21317 }
21318
21319 /*
21320  * Create the broadcast IREs described by `bireinfop' on `ipif', and return
21321  * them ala ire_check_and_create_bcast().
21322  */
21323 static ire_t **
21324 ipif_create_bcast(ipif_t *ipif, bcast_ireinfo_t *bireinfop, ire_t **irep)
21325 {
21326         ipaddr_t mask, addr;
21327
21328         ASSERT(!bireinfop->bi_haverep && bireinfop->bi_needrep);
21329
21330         addr = bireinfop->bi_addr;
21331         irep = ire_create_bcast(ipif, addr, irep);
21332
21333         switch (bireinfop->bi_type) {
21334         case BCAST_NET:
21335                 mask = ip_net_mask(ipif->ipif_subnet);
21336                 irep = ire_create_bcast(ipif, addr | ~mask, irep);
21337                 break;
21338         case BCAST_SUBNET:
21339                 mask = ipif->ipif_net_mask;
21340                 irep = ire_create_bcast(ipif, addr | ~mask, irep);
21341                 break;
21342         }
21343
21344         bireinfop->bi_haverep = 1;
21345         return (irep);
21346 }
21347
21348 /*
21349  * Walk through all of the ipifs on `ill' that will be affected by `test_ipif'
21350  * going away, and determine if any of the broadcast IREs (named by `bireinfop')
21351  * that are going away are still needed.  If so, have ipif_create_bcast()
21352  * recreate them (except for the deprecated case, as explained below).
21353  */
21354 static ire_t **
21355 ill_create_bcast(ill_t *ill, ipif_t *test_ipif, bcast_ireinfo_t *bireinfo,
21356     ire_t **irep)
21357 {
21358         int i;
21359         ipif_t *ipif;
21360
21361         ASSERT(!ill->ill_isv6);
21362         for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
21363                 /*
21364                  * Skip this ipif if it's (a) the one being taken down, (b)
21365                  * not in the same zone, or (c) has no valid local address.
21366                  */
21367                 if (ipif == test_ipif ||
21368                     ipif->ipif_zoneid != test_ipif->ipif_zoneid ||
21369                     ipif->ipif_subnet == 0 ||
21370                     (ipif->ipif_flags & (IPIF_UP|IPIF_BROADCAST|IPIF_NOXMIT)) !=
21371                     (IPIF_UP|IPIF_BROADCAST))
21372                         continue;
21373
21374                 /*
21375                  * For each dying IRE that hasn't yet been replaced, see if
21376                  * `ipif' needs it and whether the IRE should be recreated on
21377                  * `ipif'.  If `ipif' is deprecated, ipif_consider_bcast()
21378                  * will return B_FALSE even if `ipif' needs the IRE on the
21379                  * hopes that we'll later find a needy non-deprecated ipif.
21380                  * However, the ipif is recorded in bi_backup for possible
21381                  * subsequent use by ipif_check_bcast_ires().
21382                  */
21383                 for (i = 0; i < BCAST_COUNT; i++) {
21384                         if (!bireinfo[i].bi_willdie || bireinfo[i].bi_haverep)
21385                                 continue;
21386                         if (!ipif_consider_bcast(ipif, &bireinfo[i]))
21387                                 continue;
21388                         irep = ipif_create_bcast(ipif, &bireinfo[i], irep);
21389                 }
21390
21391                 /*
21392                  * If we've replaced all of the broadcast IREs that are going
21393                  * to be taken down, we know we're done.
21394                  */
21395                 for (i = 0; i < BCAST_COUNT; i++) {
21396                         if (bireinfo[i].bi_willdie && !bireinfo[i].bi_haverep)
21397                                 break;
21398                 }
21399                 if (i == BCAST_COUNT)
21400                         break;
21401         }
21402         return (irep);
21403 }
21404
21405 /*
21406  * Check if `test_ipif' (which is going away) is associated with any existing
21407  * broadcast IREs, and whether any other ipifs (e.g., on the same ill) were
21408  * using those broadcast IREs.  If so, recreate the broadcast IREs on one or
21409  * more of those other ipifs.  (The old IREs will be deleted in ipif_down().)
21410  *
21411  * This is necessary because broadcast IREs are shared.  In particular, a
21412  * given ill has one set of all-zeroes and all-ones broadcast IREs (for every
21413  * zone), plus one set of all-subnet-ones, all-subnet-zeroes, all-net-ones,
21414  * and all-net-zeroes for every net/subnet (and every zone) it has IPIF_UP
21415  * ipifs on.  Thus, if there are two IPIF_UP ipifs on the same subnet with the
21416  * same zone, they will share the same set of broadcast IREs.
21417  *
21418  * Note: the upper bound of 12 IREs comes from the worst case of replacing all
21419  * six pairs (loopback and non-loopback) of broadcast IREs (all-zeroes,
21420  * all-ones, subnet-zeroes, subnet-ones, net-zeroes, and net-ones).
21421  */
21422 static void
21423 ipif_check_bcast_ires(ipif_t *test_ipif)
21424 {
21425         ill_t           *ill = test_ipif->ipif_ill;
21426         ire_t           *ire, *ire_array[12];           /* see note above */
21427         ire_t           **irep1, **irep = &ire_array[0];
21428         uint_t          i, willdie;
21429         ipaddr_t        mask = ip_net_mask(test_ipif->ipif_subnet);
21430         bcast_ireinfo_t bireinfo[BCAST_COUNT];
21431
21432         ASSERT(!test_ipif->ipif_isv6);
21433         ASSERT(IAM_WRITER_IPIF(test_ipif));
21434
21435         /*
21436          * No broadcast IREs for the LOOPBACK interface
21437          * or others such as point to point and IPIF_NOXMIT.
21438          */
21439         if (!(test_ipif->ipif_flags & IPIF_BROADCAST) ||
21440             (test_ipif->ipif_flags & IPIF_NOXMIT))
21441                 return;
21442
21443         bzero(bireinfo, sizeof (bireinfo));
21444         bireinfo[0].bi_type = BCAST_ALLZEROES;
21445         bireinfo[0].bi_addr = 0;
21446
21447         bireinfo[1].bi_type = BCAST_ALLONES;
21448         bireinfo[1].bi_addr = INADDR_BROADCAST;
21449
21450         bireinfo[2].bi_type = BCAST_NET;
21451         bireinfo[2].bi_addr = test_ipif->ipif_subnet & mask;
21452
21453         if (test_ipif->ipif_net_mask != 0)
21454                 mask = test_ipif->ipif_net_mask;
21455         bireinfo[3].bi_type = BCAST_SUBNET;
21456         bireinfo[3].bi_addr = test_ipif->ipif_subnet & mask;
21457
21458         /*
21459          * Figure out what (if any) broadcast IREs will die as a result of
21460          * `test_ipif' going away.  If none will die, we're done.
21461          */
21462         for (i = 0, willdie = 0; i < BCAST_COUNT; i++) {
21463                 ire = ire_ctable_lookup(bireinfo[i].bi_addr, 0, IRE_BROADCAST,
21464                     test_ipif, ALL_ZONES, NULL,
21465                     (MATCH_IRE_TYPE | MATCH_IRE_IPIF), ill->ill_ipst);
21466                 if (ire != NULL) {
21467                         willdie++;
21468                         bireinfo[i].bi_willdie = 1;
21469                         ire_refrele(ire);
21470                 }
21471         }
21472
21473         if (willdie == 0)
21474                 return;
21475
21476         /*
21477          * Walk through all the ipifs that will be affected by the dying IREs,
21478          * and recreate the IREs as necessary.
21479          */
21480         irep = ill_create_bcast(ill, test_ipif, bireinfo, irep);
21481
21482         /*
21483          * Scan through the set of broadcast IREs and see if there are any
21484          * that we need to replace that have not yet been replaced.  If so,
21485          * replace them using the appropriate backup ipif.
21486          */
21487         for (i = 0; i < BCAST_COUNT; i++) {
21488                 if (bireinfo[i].bi_needrep && !bireinfo[i].bi_haverep)
21489                         irep = ipif_create_bcast(bireinfo[i].bi_backup,
21490                             &bireinfo[i], irep);
21491         }
21492
21493         /*
21494          * If we can't create all of them, don't add any of them.  (Code in
21495          * ip_wput_ire() and ire_to_ill() assumes that we always have a
21496          * non-loopback copy and loopback copy for a given address.)
21497          */
21498         for (irep1 = irep; irep1 > ire_array; ) {
21499                 irep1--;
21500                 if (*irep1 == NULL) {
21501                         ip0dbg(("ipif_check_bcast_ires: can't create "
21502                             "IRE_BROADCAST, memory allocation failure\n"));
21503                         while (irep > ire_array) {
21504                                 irep--;
21505                                 if (*irep != NULL)
21506                                         ire_delete(*irep);
21507                         }
21508                         return;
21509                 }
21510         }
21511
21512         for (irep1 = irep; irep1 > ire_array; ) {
21513                 irep1--;
21514                 if (ire_add(irep1, NULL, NULL, NULL, B_FALSE) == 0)
21515                         ire_refrele(*irep1);            /* Held in ire_add */
21516         }
21517 }
21518
21519 /*
21520  * Extract both the flags (including IFF_CANTCHANGE) such as IFF_IPV*
21521  * from lifr_flags and the name from lifr_name.
21522  * Set IFF_IPV* and ill_isv6 prior to doing the lookup
21523  * since ipif_lookup_on_name uses the _isv6 flags when matching.
21524  * Returns EINPROGRESS when mp has been consumed by queueing it on
21525  * ill_pending_mp and the ioctl will complete in ip_rput.
21526  *
21527  * Can operate on either a module or a driver queue.
21528  * Returns an error if not a module queue.
21529  */
21530 /* ARGSUSED */
21531 int
21532 ip_sioctl_slifname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
21533     ip_ioctl_cmd_t *ipip, void *if_req)
21534 {
21535         int     err;
21536         ill_t   *ill;
21537         struct lifreq *lifr = (struct lifreq *)if_req;
21538
21539         ASSERT(ipif != NULL);
21540         ip1dbg(("ip_sioctl_slifname %s\n", lifr->lifr_name));
21541
21542         if (q->q_next == NULL) {
21543                 ip1dbg((
21544                     "if_sioctl_slifname: SIOCSLIFNAME: no q_next\n"));
21545                 return (EINVAL);
21546         }
21547
21548         ill = (ill_t *)q->q_ptr;
21549         /*
21550          * If we are not writer on 'q' then this interface exists already
21551          * and previous lookups (ipif_extract_lifreq()) found this ipif.
21552          * So return EALREADY
21553          */
21554         if (ill != ipif->ipif_ill)
21555                 return (EALREADY);
21556
21557         if (ill->ill_name[0] != '\0')
21558                 return (EALREADY);
21559
21560         /*
21561          * Set all the flags. Allows all kinds of override. Provide some
21562          * sanity checking by not allowing IFF_BROADCAST and IFF_MULTICAST
21563          * unless there is either multicast/broadcast support in the driver
21564          * or it is a pt-pt link.
21565          */
21566         if (lifr->lifr_flags & (IFF_PROMISC|IFF_ALLMULTI)) {
21567                 /* Meaningless to IP thus don't allow them to be set. */
21568                 ip1dbg(("ip_setname: EINVAL 1\n"));
21569                 return (EINVAL);
21570         }
21571         /*
21572          * For a DL_STYLE2 driver (ill_needs_attach), we would not have the
21573          * ill_bcast_addr_length info.
21574          */
21575         if (!ill->ill_needs_attach &&
21576             ((lifr->lifr_flags & IFF_MULTICAST) &&
21577             !(lifr->lifr_flags & IFF_POINTOPOINT) &&
21578             ill->ill_bcast_addr_length == 0)) {
21579                 /* Link not broadcast/pt-pt capable i.e. no multicast */
21580                 ip1dbg(("ip_setname: EINVAL 2\n"));
21581                 return (EINVAL);
21582         }
21583         if ((lifr->lifr_flags & IFF_BROADCAST) &&
21584             ((lifr->lifr_flags & IFF_IPV6) ||
21585             (!ill->ill_needs_attach && ill->ill_bcast_addr_length == 0))) {
21586                 /* Link not broadcast capable or IPv6 i.e. no broadcast */
21587                 ip1dbg(("ip_setname: EINVAL 3\n"));
21588                 return (EINVAL);
21589         }
21590         if (lifr->lifr_flags & IFF_UP) {
21591                 /* Can only be set with SIOCSLIFFLAGS */
21592                 ip1dbg(("ip_setname: EINVAL 4\n"));
21593                 return (EINVAL);
21594         }
21595         if ((lifr->lifr_flags & (IFF_IPV6|IFF_IPV4)) != IFF_IPV6 &&
21596             (lifr->lifr_flags & (IFF_IPV6|IFF_IPV4)) != IFF_IPV4) {
21597                 ip1dbg(("ip_setname: EINVAL 5\n"));
21598                 return (EINVAL);
21599         }
21600         /*
21601          * Only allow the IFF_XRESOLV flag to be set on IPv6 interfaces.
21602          */
21603         if ((lifr->lifr_flags & IFF_XRESOLV) &&
21604             !(lifr->lifr_flags & IFF_IPV6) &&
21605             !(ipif->ipif_isv6)) {
21606                 ip1dbg(("ip_setname: EINVAL 6\n"));
21607                 return (EINVAL);
21608         }
21609
21610         /*
21611          * The user has done SIOCGLIFFLAGS prior to this ioctl and hence
21612          * we have all the flags here. So, we assign rather than we OR.
21613          * We can't OR the flags here because we don't want to set
21614          * both IFF_IPV4 and IFF_IPV6. We start off as IFF_IPV4 in
21615          * ipif_allocate and become IFF_IPV4 or IFF_IPV6 here depending
21616          * on lifr_flags value here.
21617          */
21618         /*
21619          * This ill has not been inserted into the global list.
21620          * So we are still single threaded and don't need any lock
21621          */
21622         ipif->ipif_flags = lifr->lifr_flags & IFF_LOGINT_FLAGS &
21623             ~IFF_DUPLICATE;
21624         ill->ill_flags = lifr->lifr_flags & IFF_PHYINTINST_FLAGS;
21625         ill->ill_phyint->phyint_flags = lifr->lifr_flags & IFF_PHYINT_FLAGS;
21626
21627         /* We started off as V4. */
21628         if (ill->ill_flags & ILLF_IPV6) {
21629                 ill->ill_phyint->phyint_illv6 = ill;
21630                 ill->ill_phyint->phyint_illv4 = NULL;
21631         }
21632         err = ipif_set_values(q, mp, lifr->lifr_name, &lifr->lifr_ppa);
21633         return (err);
21634 }
21635
21636 /* ARGSUSED */
21637 int
21638 ip_sioctl_slifname_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
21639     ip_ioctl_cmd_t *ipip, void *if_req)
21640 {
21641         /*
21642          * ill_phyint_reinit merged the v4 and v6 into a single
21643          * ipsq. Could also have become part of a ipmp group in the
21644          * process, and we might not have been able to complete the
21645          * slifname in ipif_set_values, if we could not become
21646          * exclusive.  If so restart it here
21647          */
21648         return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q));
21649 }
21650
21651 /*
21652  * Return a pointer to the ipif which matches the index, IP version type and
21653  * zoneid.
21654  */
21655 ipif_t *
21656 ipif_lookup_on_ifindex(uint_t index, boolean_t isv6, zoneid_t zoneid,
21657     queue_t *q, mblk_t *mp, ipsq_func_t func, int *err, ip_stack_t *ipst)
21658 {
21659         ill_t   *ill;
21660         ipif_t  *ipif = NULL;
21661
21662         ASSERT((q == NULL && mp == NULL && func == NULL && err == NULL) ||
21663             (q != NULL && mp != NULL && func != NULL && err != NULL));
21664
21665         if (err != NULL)
21666                 *err = 0;
21667
21668         ill = ill_lookup_on_ifindex(index, isv6, q, mp, func, err, ipst);
21669         if (ill != NULL) {
21670                 mutex_enter(&ill->ill_lock);
21671                 for (ipif = ill->ill_ipif; ipif != NULL;
21672                     ipif = ipif->ipif_next) {
21673                         if (IPIF_CAN_LOOKUP(ipif) && (zoneid == ALL_ZONES ||
21674                             zoneid == ipif->ipif_zoneid ||
21675                             ipif->ipif_zoneid == ALL_ZONES)) {
21676                                 ipif_refhold_locked(ipif);
21677                                 break;
21678                         }
21679                 }
21680                 mutex_exit(&ill->ill_lock);
21681                 ill_refrele(ill);
21682                 if (ipif == NULL && err != NULL)
21683                         *err = ENXIO;
21684         }
21685         return (ipif);
21686 }
21687
21688 typedef struct conn_change_s {
21689         uint_t cc_old_ifindex;
21690         uint_t cc_new_ifindex;
21691 } conn_change_t;
21692
21693 /*
21694  * ipcl_walk function for changing interface index.
21695  */
21696 static void
21697 conn_change_ifindex(conn_t *connp, caddr_t arg)
21698 {
21699         conn_change_t *connc;
21700         uint_t old_ifindex;
21701         uint_t new_ifindex;
21702         int i;
21703         ilg_t *ilg;
21704
21705         connc = (conn_change_t *)arg;
21706         old_ifindex = connc->cc_old_ifindex;
21707         new_ifindex = connc->cc_new_ifindex;
21708
21709         if (connp->conn_orig_bound_ifindex == old_ifindex)
21710                 connp->conn_orig_bound_ifindex = new_ifindex;
21711
21712         if (connp->conn_orig_multicast_ifindex == old_ifindex)
21713                 connp->conn_orig_multicast_ifindex = new_ifindex;
21714
21715         if (connp->conn_orig_xmit_ifindex == old_ifindex)
21716                 connp->conn_orig_xmit_ifindex = new_ifindex;
21717
21718         for (i = connp->conn_ilg_inuse - 1; i >= 0; i--) {
21719                 ilg = &connp->conn_ilg[i];
21720                 if (ilg->ilg_orig_ifindex == old_ifindex)
21721                         ilg->ilg_orig_ifindex = new_ifindex;
21722         }
21723 }
21724
21725 /*
21726  * Walk all the ipifs and ilms on this ill and change the orig_ifindex
21727  * to new_index if it matches the old_index.
21728  *
21729  * Failovers typically happen within a group of ills. But somebody
21730  * can remove an ill from the group after a failover happened. If
21731  * we are setting the ifindex after this, we potentially need to
21732  * look at all the ills rather than just the ones in the group.
21733  * We cut down the work by looking at matching ill_net_types
21734  * and ill_types as we could not possibly grouped them together.
21735  */
21736 static void
21737 ip_change_ifindex(ill_t *ill_orig, conn_change_t *connc)
21738 {
21739         ill_t *ill;
21740         ipif_t *ipif;
21741         uint_t old_ifindex;
21742         uint_t new_ifindex;
21743         ilm_t *ilm;
21744         ill_walk_context_t ctx;
21745         ip_stack_t      *ipst = ill_orig->ill_ipst;
21746
21747         old_ifindex = connc->cc_old_ifindex;
21748         new_ifindex = connc->cc_new_ifindex;
21749
21750         rw_enter(&ipst->ips_ill_g_lock, RW_READER);
21751         ill = ILL_START_WALK_ALL(&ctx, ipst);
21752         for (; ill != NULL; ill = ill_next(&ctx, ill)) {
21753                 if ((ill_orig->ill_net_type != ill->ill_net_type) ||
21754                     (ill_orig->ill_type != ill->ill_type)) {
21755                         continue;
21756                 }
21757                 for (ipif = ill->ill_ipif; ipif != NULL;
21758                     ipif = ipif->ipif_next) {
21759                         if (ipif->ipif_orig_ifindex == old_ifindex)
21760                                 ipif->ipif_orig_ifindex = new_ifindex;
21761                 }
21762                 for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
21763                         if (ilm->ilm_orig_ifindex == old_ifindex)
21764                                 ilm->ilm_orig_ifindex = new_ifindex;
21765                 }
21766         }
21767         rw_exit(&ipst->ips_ill_g_lock);
21768 }
21769
21770 /*
21771  * We first need to ensure that the new index is unique, and
21772  * then carry the change across both v4 and v6 ill representation
21773  * of the physical interface.
21774  */
21775 /* ARGSUSED */
21776 int
21777 ip_sioctl_slifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
21778     ip_ioctl_cmd_t *ipip, void *ifreq)
21779 {
21780         ill_t           *ill;
21781         ill_t           *ill_other;
21782         phyint_t        *phyi;
21783         int             old_index;
21784         conn_change_t   connc;
21785         struct ifreq    *ifr = (struct ifreq *)ifreq;
21786         struct lifreq   *lifr = (struct lifreq *)ifreq;
21787         uint_t  index;
21788         ill_t   *ill_v4;
21789         ill_t   *ill_v6;
21790         ip_stack_t      *ipst = ipif->ipif_ill->ill_ipst;
21791
21792         if (ipip->ipi_cmd_type == IF_CMD)
21793                 index = ifr->ifr_index;
21794         else
21795                 index = lifr->lifr_index;
21796
21797         /*
21798          * Only allow on physical interface. Also, index zero is illegal.
21799          *
21800          * Need to check for PHYI_FAILED and PHYI_INACTIVE
21801          *
21802          * 1) If PHYI_FAILED is set, a failover could have happened which
21803          *    implies a possible failback might have to happen. As failback
21804          *    depends on the old index, we should fail setting the index.
21805          *
21806          * 2) If PHYI_INACTIVE is set, in.mpathd does a failover so that
21807          *    any addresses or multicast memberships are failed over to
21808          *    a non-STANDBY interface. As failback depends on the old
21809          *    index, we should fail setting the index for this case also.
21810          *
21811          * 3) If PHYI_OFFLINE is set, a possible failover has happened.
21812          *    Be consistent with PHYI_FAILED and fail the ioctl.
21813          */
21814         ill = ipif->ipif_ill;
21815         phyi = ill->ill_phyint;
21816         if ((phyi->phyint_flags & (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) ||
21817             ipif->ipif_id != 0 || index == 0) {
21818                 return (EINVAL);
21819         }
21820         old_index = phyi->phyint_ifindex;
21821
21822         /* If the index is not changing, no work to do */
21823         if (old_index == index)
21824                 return (0);
21825
21826         /*
21827          * Use ill_lookup_on_ifindex to determine if the
21828          * new index is unused and if so allow the change.
21829          */
21830         ill_v6 = ill_lookup_on_ifindex(index, B_TRUE, NULL, NULL, NULL, NULL,
21831             ipst);
21832         ill_v4 = ill_lookup_on_ifindex(index, B_FALSE, NULL, NULL, NULL, NULL,
21833             ipst);
21834         if (ill_v6 != NULL || ill_v4 != NULL) {
21835                 if (ill_v4 != NULL)
21836                         ill_refrele(ill_v4);
21837                 if (ill_v6 != NULL)
21838                         ill_refrele(ill_v6);
21839                 return (EBUSY);
21840         }
21841
21842         /*
21843          * The new index is unused. Set it in the phyint.
21844          * Locate the other ill so that we can send a routing
21845          * sockets message.
21846          */
21847         if (ill->ill_isv6) {
21848                 ill_other = phyi->phyint_illv4;
21849         } else {
21850                 ill_other = phyi->phyint_illv6;
21851         }
21852
21853         phyi->phyint_ifindex = index;
21854
21855         /* Update SCTP's ILL list */
21856         sctp_ill_reindex(ill, old_index);
21857
21858         connc.cc_old_ifindex = old_index;
21859         connc.cc_new_ifindex = index;
21860         ip_change_ifindex(ill, &connc);
21861         ipcl_walk(conn_change_ifindex, (caddr_t)&connc, ipst);
21862
21863         /* Send the routing sockets message */
21864         ip_rts_ifmsg(ipif);
21865         if (ill_other != NULL)
21866                 ip_rts_ifmsg(ill_other->ill_ipif);
21867
21868         return (0);
21869 }
21870
21871 /* ARGSUSED */
21872 int
21873 ip_sioctl_get_lifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
21874     ip_ioctl_cmd_t *ipip, void *ifreq)
21875 {
21876         struct ifreq    *ifr = (struct ifreq *)ifreq;
21877         struct lifreq   *lifr = (struct lifreq *)ifreq;
21878
21879         ip1dbg(("ip_sioctl_get_lifindex(%s:%u %p)\n",
21880             ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
21881         /* Get the interface index */
21882         if (ipip->ipi_cmd_type == IF_CMD) {
21883                 ifr->ifr_index = ipif->ipif_ill->ill_phyint->phyint_ifindex;
21884         } else {
21885                 lifr->lifr_index = ipif->ipif_ill->ill_phyint->phyint_ifindex;
21886         }
21887         return (0);
21888 }
21889
21890 /* ARGSUSED */
21891 int
21892 ip_sioctl_get_lifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
21893     ip_ioctl_cmd_t *ipip, void *ifreq)
21894 {
21895         struct lifreq   *lifr = (struct lifreq *)ifreq;
21896
21897         ip1dbg(("ip_sioctl_get_lifzone(%s:%u %p)\n",
21898             ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
21899         /* Get the interface zone */
21900         ASSERT(ipip->ipi_cmd_type == LIF_CMD);
21901         lifr->lifr_zoneid = ipif->ipif_zoneid;
21902         return (0);
21903 }
21904
21905 /*
21906  * Set the zoneid of an interface.
21907  */
21908 /* ARGSUSED */
21909 int
21910 ip_sioctl_slifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
21911     ip_ioctl_cmd_t *ipip, void *ifreq)
21912 {
21913         struct lifreq   *lifr = (struct lifreq *)ifreq;
21914         int err = 0;
21915         boolean_t need_up = B_FALSE;
21916         zone_t *zptr;
21917         zone_status_t status;
21918         zoneid_t zoneid;
21919
21920         ASSERT(ipip->ipi_cmd_type == LIF_CMD);
21921         if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES) {
21922                 if (!is_system_labeled())
21923                         return (ENOTSUP);
21924                 zoneid = GLOBAL_ZONEID;
21925         }
21926
21927         /* cannot assign instance zero to a non-global zone */
21928         if (ipif->ipif_id == 0 && zoneid != GLOBAL_ZONEID)
21929                 return (ENOTSUP);
21930
21931         /*
21932          * Cannot assign to a zone that doesn't exist or is shutting down.  In
21933          * the event of a race with the zone shutdown processing, since IP
21934          * serializes this ioctl and SIOCGLIFCONF/SIOCLIFREMOVEIF, we know the
21935          * interface will be cleaned up even if the zone is shut down
21936          * immediately after the status check. If the interface can't be brought
21937          * down right away, and the zone is shut down before the restart
21938          * function is called, we resolve the possible races by rechecking the
21939          * zone status in the restart function.
21940          */
21941         if ((zptr = zone_find_by_id(zoneid)) == NULL)
21942                 return (EINVAL);
21943         status = zone_status_get(zptr);
21944         zone_rele(zptr);
21945
21946         if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING)
21947                 return (EINVAL);
21948
21949         if (ipif->ipif_flags & IPIF_UP) {
21950                 /*
21951                  * If the interface is already marked up,
21952                  * we call ipif_down which will take care
21953                  * of ditching any IREs that have been set
21954                  * up based on the old interface address.
21955                  */
21956                 err = ipif_logical_down(ipif, q, mp);
21957                 if (err == EINPROGRESS)
21958                         return (err);
21959                 ipif_down_tail(ipif);
21960                 need_up = B_TRUE;
21961         }
21962
21963         err = ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp, need_up);
21964         return (err);
21965 }
21966
21967 static int
21968 ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid,
21969     queue_t *q, mblk_t *mp, boolean_t need_up)
21970 {
21971         int     err = 0;
21972         ip_stack_t      *ipst;
21973
21974         ip1dbg(("ip_sioctl_zoneid_tail(%s:%u %p)\n",
21975             ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
21976
21977         if (CONN_Q(q))
21978                 ipst = CONNQ_TO_IPST(q);
21979         else
21980                 ipst = ILLQ_TO_IPST(q);
21981
21982         /*
21983          * For exclusive stacks we don't allow a different zoneid than
21984          * global.
21985          */
21986         if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID &&
21987             zoneid != GLOBAL_ZONEID)
21988                 return (EINVAL);
21989
21990         /* Set the new zone id. */
21991         ipif->ipif_zoneid = zoneid;
21992
21993         /* Update sctp list */
21994         sctp_update_ipif(ipif, SCTP_IPIF_UPDATE);
21995
21996         if (need_up) {
21997                 /*
21998                  * Now bring the interface back up.  If this
21999                  * is the only IPIF for the ILL, ipif_up
22000                  * will have to re-bind to the device, so
22001                  * we may get back EINPROGRESS, in which
22002                  * case, this IOCTL will get completed in
22003                  * ip_rput_dlpi when we see the DL_BIND_ACK.
22004                  */
22005                 err = ipif_up(ipif, q, mp);
22006         }
22007         return (err);
22008 }
22009
22010 /* ARGSUSED */
22011 int
22012 ip_sioctl_slifzone_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
22013     ip_ioctl_cmd_t *ipip, void *if_req)
22014 {
22015         struct lifreq *lifr = (struct lifreq *)if_req;
22016         zoneid_t zoneid;
22017         zone_t *zptr;
22018         zone_status_t status;
22019
22020         ASSERT(ipif->ipif_id != 0);
22021         ASSERT(ipip->ipi_cmd_type == LIF_CMD);
22022         if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES)
22023                 zoneid = GLOBAL_ZONEID;
22024
22025         ip1dbg(("ip_sioctl_slifzone_restart(%s:%u %p)\n",
22026             ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
22027
22028         /*
22029          * We recheck the zone status to resolve the following race condition:
22030          * 1) process sends SIOCSLIFZONE to put hme0:1 in zone "myzone";
22031          * 2) hme0:1 is up and can't be brought down right away;
22032          * ip_sioctl_slifzone() returns EINPROGRESS and the request is queued;
22033          * 3) zone "myzone" is halted; the zone status switches to
22034          * 'shutting_down' and the zones framework sends SIOCGLIFCONF to list
22035          * the interfaces to remove - hme0:1 is not returned because it's not
22036          * yet in "myzone", so it won't be removed;
22037          * 4) the restart function for SIOCSLIFZONE is called; without the
22038          * status check here, we would have hme0:1 in "myzone" after it's been
22039          * destroyed.
22040          * Note that if the status check fails, we need to bring the interface
22041          * back to its state prior to ip_sioctl_slifzone(), hence the call to
22042          * ipif_up_done[_v6]().
22043          */
22044         status = ZONE_IS_UNINITIALIZED;
22045         if ((zptr = zone_find_by_id(zoneid)) != NULL) {
22046                 status = zone_status_get(zptr);
22047                 zone_rele(zptr);
22048         }
22049         if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING) {
22050                 if (ipif->ipif_isv6) {
22051                         (void) ipif_up_done_v6(ipif);
22052                 } else {
22053                         (void) ipif_up_done(ipif);
22054                 }
22055                 return (EINVAL);
22056         }
22057
22058         ipif_down_tail(ipif);
22059
22060         return (ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp,
22061             B_TRUE));
22062 }
22063
22064 /* ARGSUSED */
22065 int
22066 ip_sioctl_get_lifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
22067         ip_ioctl_cmd_t *ipip, void *ifreq)
22068 {
22069         struct lifreq   *lifr = ifreq;
22070
22071         ASSERT(q->q_next == NULL);
22072         ASSERT(CONN_Q(q));
22073
22074         ip1dbg(("ip_sioctl_get_lifusesrc(%s:%u %p)\n",
22075             ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
22076         lifr->lifr_index = ipif->ipif_ill->ill_usesrc_ifindex;
22077         ip1dbg(("ip_sioctl_get_lifusesrc:lifr_index = %d\n", lifr->lifr_index));
22078
22079         return (0);
22080 }
22081
22082
22083 /* Find the previous ILL in this usesrc group */
22084 static ill_t *
22085 ill_prev_usesrc(ill_t *uill)
22086 {
22087         ill_t *ill;
22088
22089         for (ill = uill->ill_usesrc_grp_next;
22090             ASSERT(ill), ill->ill_usesrc_grp_next != uill;
22091             ill = ill->ill_usesrc_grp_next)
22092                 /* do nothing */;
22093         return (ill);
22094 }
22095
22096 /*
22097  * Release all members of the usesrc group. This routine is called
22098  * from ill_delete when the interface being unplumbed is the
22099  * group head.
22100  */
22101 static void
22102 ill_disband_usesrc_group(ill_t *uill)
22103 {
22104         ill_t *next_ill, *tmp_ill;
22105         ip_stack_t      *ipst = uill->ill_ipst;
22106
22107         ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_usesrc_lock));
22108         next_ill = uill->ill_usesrc_grp_next;
22109
22110         do {
22111                 ASSERT(next_ill != NULL);
22112                 tmp_ill = next_ill->ill_usesrc_grp_next;
22113                 ASSERT(tmp_ill != NULL);
22114                 next_ill->ill_usesrc_grp_next = NULL;
22115                 next_ill->ill_usesrc_ifindex = 0;
22116                 next_ill = tmp_ill;
22117         } while (next_ill->ill_usesrc_ifindex != 0);
22118         uill->ill_usesrc_grp_next = NULL;
22119 }
22120
22121 /*
22122  * Remove the client usesrc ILL from the list and relink to a new list
22123  */
22124 int
22125 ill_relink_usesrc_ills(ill_t *ucill, ill_t *uill, uint_t ifindex)
22126 {
22127         ill_t *ill, *tmp_ill;
22128         ip_stack_t      *ipst = ucill->ill_ipst;
22129
22130         ASSERT((ucill != NULL) && (ucill->ill_usesrc_grp_next != NULL) &&
22131             (uill != NULL) && RW_WRITE_HELD(&ipst->ips_ill_g_usesrc_lock));
22132
22133         /*
22134          * Check if the usesrc client ILL passed in is not already
22135          * in use as a usesrc ILL i.e one whose source address is
22136          * in use OR a usesrc ILL is not already in use as a usesrc
22137          * client ILL
22138          */
22139         if ((ucill->ill_usesrc_ifindex == 0) ||
22140             (uill->ill_usesrc_ifindex != 0)) {
22141                 return (-1);
22142         }
22143
22144         ill = ill_prev_usesrc(ucill);
22145         ASSERT(ill->ill_usesrc_grp_next != NULL);
22146
22147         /* Remove from the current list */
22148         if (ill->ill_usesrc_grp_next->ill_usesrc_grp_next == ill) {
22149                 /* Only two elements in the list */
22150                 ASSERT(ill->ill_usesrc_ifindex == 0);
22151                 ill->ill_usesrc_grp_next = NULL;
22152         } else {
22153                 ill->ill_usesrc_grp_next = ucill->ill_usesrc_grp_next;
22154         }
22155
22156         if (ifindex == 0) {
22157                 ucill->ill_usesrc_ifindex = 0;
22158                 ucill->ill_usesrc_grp_next = NULL;
22159                 return (0);
22160         }
22161
22162         ucill->ill_usesrc_ifindex = ifindex;
22163         tmp_ill = uill->ill_usesrc_grp_next;
22164         uill->ill_usesrc_grp_next = ucill;
22165         ucill->ill_usesrc_grp_next =
22166             (tmp_ill != NULL) ? tmp_ill : uill;
22167         return (0);
22168 }
22169
22170 /*
22171  * Set or reset the usesrc relationship of the client ill (ipif->ipif_ill) by
22172  * updating its ill_usesrc_ifindex and ill_usesrc_grp_next linkage. See the
       * synchronization notes in ip.c for locking details.
       *
       * lifr_index in the lifreq selects the usesrc ill. An index of 0 is a reset
       * request: it returns 0 immediately when the client is not linked into a
       * usesrc group; otherwise the client's current ill_usesrc_ifindex is used
       * for the lookup and the client is unlinked through
       * ill_relink_usesrc_ills(..., 0).
       *
       * Returns 0 on success or for a duplicate request; EINVAL when either ill
       * has an IPMP group name or PHYI_STANDBY set, when the client is already a
       * usesrc ill, when the usesrc ill is already a client, or when relinking
       * fails; EINPROGRESS when the operation was enqueued on the usesrc ill's
       * ipsq; otherwise the error reported by ill_lookup_on_ifindex().
22173  */
22174 /* ARGSUSED */
22175 int
22176 ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
22177     ip_ioctl_cmd_t *ipip, void *ifreq)
22178 {
22179         struct lifreq *lifr = (struct lifreq *)ifreq;
22180         boolean_t isv6 = B_FALSE, reset_flg = B_FALSE,
22181             ill_flag_changed = B_FALSE;
22182         ill_t *usesrc_ill, *usesrc_cli_ill = ipif->ipif_ill;
22183         int err = 0, ret;
22184         uint_t ifindex;
22185         phyint_t *us_phyint, *us_cli_phyint;
22186         ipsq_t *ipsq = NULL;
22187         ip_stack_t      *ipst = ipif->ipif_ill->ill_ipst;
22188
22189         ASSERT(IAM_WRITER_IPIF(ipif));
22190         ASSERT(q->q_next == NULL);
22191         ASSERT(CONN_Q(q));
22192
              /* The address family is taken from the conn that issued the ioctl */
22193         isv6 = (Q_TO_CONN(q))->conn_af_isv6;
22194         us_cli_phyint = usesrc_cli_ill->ill_phyint;
22195
22196         ASSERT(us_cli_phyint != NULL);
22197
22198         /*
22199          * If the client ILL is being used for IPMP, abort.
22200          * Note, this can be done before ipsq_try_enter since we are already
22201          * exclusive on this ILL
22202          */
22203         if ((us_cli_phyint->phyint_groupname != NULL) ||
22204             (us_cli_phyint->phyint_flags & PHYI_STANDBY)) {
22205                 return (EINVAL);
22206         }
22207
22208         ifindex = lifr->lifr_index;
22209         if (ifindex == 0) {
22210                 if (usesrc_cli_ill->ill_usesrc_grp_next == NULL) {
22211                         /* non usesrc group interface, nothing to reset */
22212                         return (0);
22213                 }
                      /*
                       * Reset: look up the ill the client currently refers to.
                       * NOTE(review): if this ill is the usesrc ill of a group
                       * rather than a client, ill_usesrc_ifindex is presumably
                       * still 0 here and the lookup below is expected to fail
                       * - confirm.
                       */
22214                 ifindex = usesrc_cli_ill->ill_usesrc_ifindex;
22215                 /* valid reset request */
22216                 reset_flg = B_TRUE;
22217         }
22218
              /*
               * On success this returns a held ill; the hold is dropped by the
               * ill_refrele() at the end of this function.
               */
22219         usesrc_ill = ill_lookup_on_ifindex(ifindex, isv6, q, mp,
22220             ip_process_ioctl, &err, ipst);
22221
22222         if (usesrc_ill == NULL) {
                      /* err holds whatever ill_lookup_on_ifindex() stored in it */
22223                 return (err);
22224         }
22225
22226         /*
22227          * The usesrc_cli_ill or the usesrc_ill cannot be part of an IPMP
22228          * group nor can either of the interfaces be used for standby. So
22229          * to guarantee mutual exclusion with ip_sioctl_flags (which sets
22230          * PHYI_STANDBY) and ip_sioctl_groupname (which sets the groupname)
22231          * we need to be exclusive on the ipsq belonging to the usesrc_ill.
22232          * We are already exclusive on this ipsq i.e ipsq corresponding to
22233          * the usesrc_cli_ill
22234          */
22235         ipsq = ipsq_try_enter(NULL, usesrc_ill, q, mp, ip_process_ioctl,
22236             NEW_OP, B_TRUE);
22237         if (ipsq == NULL) {
22238                 err = EINPROGRESS;
22239                 /* Operation enqueued on the ipsq of the usesrc ILL */
22240                 goto done;
22241         }
22242
22243         /* Check if the usesrc_ill is used for IPMP */
22244         us_phyint = usesrc_ill->ill_phyint;
22245         if ((us_phyint->phyint_groupname != NULL) ||
22246             (us_phyint->phyint_flags & PHYI_STANDBY)) {
22247                 err = EINVAL;
22248                 goto done;
22249         }
22250
22251         /*
22252          * If the client is already in use as a usesrc_ill or a usesrc_ill is
22253          * already a client then return EINVAL
22254          */
22255         if (IS_USESRC_ILL(usesrc_cli_ill) || IS_USESRC_CLI_ILL(usesrc_ill)) {
22256                 err = EINVAL;
22257                 goto done;
22258         }
22259
22260         /*
22261          * If the ill_usesrc_ifindex field is already set to what it needs to
22262          * be then this is a duplicate operation.
               * (A reset always matches, since ifindex was copied from that
               * field above, so it is excluded from this shortcut.)
22263          */
22264         if (!reset_flg && usesrc_cli_ill->ill_usesrc_ifindex == ifindex) {
22265                 err = 0;
22266                 goto done;
22267         }
22268
22269         ip1dbg(("ip_sioctl_slifusesrc: usesrc_cli_ill %s, usesrc_ill %s,"
22270             " v6 = %d", usesrc_cli_ill->ill_name, usesrc_ill->ill_name,
22271             usesrc_ill->ill_isv6));
22272
22273         /*
22274          * The next step ensures that no new ires will be created referencing
22275          * the client ill, until the ILL_CHANGING flag is cleared. Then
22276          * we go through an ire walk deleting all ire caches that reference
22277          * the client ill. New ires referencing the client ill that are added
22278          * to the ire table before the ILL_CHANGING flag is set, will be
22279          * cleaned up by the ire walk below. Attempt to add new ires referencing
22280          * the client ill while the ILL_CHANGING flag is set will be failed
22281          * during the ire_add in ire_atomic_start. ire_atomic_start atomically
22282          * checks (under the ill_g_usesrc_lock) that the ire being added
22283          * is not stale, i.e the ire_stq and ire_ipif are consistent and
22284          * belong to the same usesrc group.
22285          */
22286         mutex_enter(&usesrc_cli_ill->ill_lock);
22287         usesrc_cli_ill->ill_state_flags |= ILL_CHANGING;
22288         mutex_exit(&usesrc_cli_ill->ill_lock);
              /* Remember to clear ILL_CHANGING again on the way out (see done) */
22289         ill_flag_changed = B_TRUE;
22290
22291         if (ipif->ipif_isv6)
22292                 ire_walk_v6(ipif_delete_cache_ire, (char *)usesrc_cli_ill,
22293                     ALL_ZONES, ipst);
22294         else
22295                 ire_walk_v4(ipif_delete_cache_ire, (char *)usesrc_cli_ill,
22296                     ALL_ZONES, ipst);
22297
22298         /*
22299          * ill_g_usesrc_lock global lock protects the ill_usesrc_grp_next
22300          * and the ill_usesrc_ifindex fields
22301          */
22302         rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER);
22303
22304         if (reset_flg) {
                      /* Passing ifindex 0 unlinks the client from its group */
22305                 ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill, 0);
22306                 if (ret != 0) {
22307                         err = EINVAL;
22308                 }
22309                 rw_exit(&ipst->ips_ill_g_usesrc_lock);
22310                 goto done;
22311         }
22312
22313         /*
22314          * Four possibilities to consider:
22315          * 1. Both usesrc_ill and usesrc_cli_ill are not part of any usesrc grp
22316          * 2. usesrc_ill is part of a group but usesrc_cli_ill isn't
22317          * 3. usesrc_cli_ill is part of a group but usesrc_ill isn't
22318          * 4. Both are part of their respective usesrc groups
               *
               * Cases 1 and 2 are handled inline below; cases 3 and 4 both fall
               * through to ill_relink_usesrc_ills().
22319          */
22320         if ((usesrc_ill->ill_usesrc_grp_next == NULL) &&
22321             (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) {
                      /* Case 1: create a new two-element group */
22322                 ASSERT(usesrc_ill->ill_usesrc_ifindex == 0);
22323                 usesrc_cli_ill->ill_usesrc_ifindex = ifindex;
22324                 usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill;
22325                 usesrc_cli_ill->ill_usesrc_grp_next = usesrc_ill;
22326         } else if ((usesrc_ill->ill_usesrc_grp_next != NULL) &&
22327             (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) {
                      /* Case 2: add the client to the usesrc ill's existing group */
22328                 usesrc_cli_ill->ill_usesrc_ifindex = ifindex;
22329                 /* Insert at head of list */
22330                 usesrc_cli_ill->ill_usesrc_grp_next =
22331                     usesrc_ill->ill_usesrc_grp_next;
22332                 usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill;
22333         } else {
                      /* Cases 3 and 4: move the client out of its current group */
22334                 ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill,
22335                     ifindex);
22336                 if (ret != 0)
22337                         err = EINVAL;
22338         }
22339         rw_exit(&ipst->ips_ill_g_usesrc_lock);
22340
      /*
       * Common exit once usesrc_ill has been looked up: undo ILL_CHANGING if it
       * was set, leave the usesrc ill's ipsq if it was entered, and drop the hold
       * on usesrc_ill.
       */
22341 done:
22342         if (ill_flag_changed) {
22343                 mutex_enter(&usesrc_cli_ill->ill_lock);
22344                 usesrc_cli_ill->ill_state_flags &= ~ILL_CHANGING;
22345                 mutex_exit(&usesrc_cli_ill->ill_lock);
22346         }
22347         if (ipsq != NULL)
22348                 ipsq_exit(ipsq, B_TRUE, B_TRUE);
22349         /* The refrele on the lifr_name ipif is done by ip_process_ioctl */
22350         ill_refrele(usesrc_ill);
22351         return (err);
22352 }
22353
22354 /*
22355  * comparison function used by avl.
22356  */
22357 static int
22358 ill_phyint_compare_index(const void *index_ptr, const void *phyip)
22359 {
22360
22361         uint_t index;
22362
22363         ASSERT(phyip != NULL && index_ptr != NULL);
22364
22365         index = *((uint_t *)index_ptr);
22366         /*
22367          * let the phyint with the lowest index be on top.
22368          */
22369         if (((phyint_t *)phyip)->phyint_ifindex < index)
22370                 return (1);
22371         if (((phyint_t *)phyip)->phyint_ifindex > index)
22372                 return (-1);
22373         return (0);
22374 }
22375
22376 /*
22377  * comparison function used by avl.
22378  */
22379 static int
22380 ill_phyint_compare_name(const void *name_ptr, const void *phyip)
22381 {
22382         ill_t *ill;
22383         int res = 0;
22384
22385         ASSERT(phyip != NULL && name_ptr != NULL);
22386
22387         if (((phyint_t *)phyip)->phyint_illv4)
22388                 ill = ((phyint_t *)phyip)->phyint_illv4;
22389         else
22390                 ill = ((phyint_t *)phyip)->phyint_illv6;
22391         ASSERT(ill != NULL);
22392
22393         res = strcmp(ill->ill_name, (char *)name_ptr);
22394         if (res > 0)
22395                 return (1);
22396         else if (res < 0)
22397                 return (-1);
22398         return (0);
22399 }
22400 /*
22401  * This function is called from ill_delete when the ill is being
22402  * unplumbed. We remove the reference from the phyint and we also
22403  * free the phyint when there are no more references to it.
22404  */
22405 static void
22406 ill_phyint_free(ill_t *ill)
22407 {
22408         phyint_t *phyi;
22409         phyint_t *next_phyint;
22410         ipsq_t *cur_ipsq;
22411         ip_stack_t      *ipst = ill->ill_ipst;
22412
22413         ASSERT(ill->ill_phyint != NULL);
22414
22415         ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
22416         phyi = ill->ill_phyint;
22417         ill->ill_phyint = NULL;
22418         /*
22419          * ill_init allocates a phyint always to store the copy
22420          * of flags relevant to phyint. At that point in time, we could
22421          * not assign the name and hence phyint_illv4/v6 could not be
22422          * initialized. Later in ipif_set_values, we assign the name to
22423          * the ill, at which point in time we assign phyint_illv4/v6.
22424          * Thus we don't rely on phyint_illv6 to be initialized always.
22425          */
22426         if (ill->ill_flags & ILLF_IPV6) {
22427                 phyi->phyint_illv6 = NULL;
22428         } else {
22429                 phyi->phyint_illv4 = NULL;
22430         }
22431         /*
22432          * ipif_down removes it from the group when the last ipif goes
22433          * down.
22434          */
22435         ASSERT(ill->ill_group == NULL);
22436
22437         if (phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL)
22438                 return;
22439
22440         /*
22441          * Make sure this phyint was put in the list.
22442          */
22443         if (phyi->phyint_ifindex > 0) {
22444                 avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
22445                     phyi);
22446                 avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
22447                     phyi);
22448         }
22449         /*
22450          * remove phyint from the ipsq list.
22451          */
22452         cur_ipsq = phyi->phyint_ipsq;
22453         if (phyi == cur_ipsq->ipsq_phyint_list) {
22454                 cur_ipsq->ipsq_phyint_list = phyi->phyint_ipsq_next;
22455         } else {
22456                 next_phyint = cur_ipsq->ipsq_phyint_list;
22457                 while (next_phyint != NULL) {
22458                         if (next_phyint->phyint_ipsq_next == phyi) {
22459                                 next_phyint->phyint_ipsq_next =
22460                                     phyi->phyint_ipsq_next;
22461                                 break;
22462                         }
22463                         next_phyint = next_phyint->phyint_ipsq_next;
22464                 }
22465                 ASSERT(next_phyint != NULL);
22466         }
22467         IPSQ_DEC_REF(cur_ipsq, ipst);
22468
22469         if (phyi->phyint_groupname_len != 0) {
22470                 ASSERT(phyi->phyint_groupname != NULL);
22471                 mi_free(phyi->phyint_groupname);
22472         }
22473         mi_free(phyi);
22474 }
22475
22476 /*
22477  * Attach the ill to the phyint structure which can be shared by both
22478  * IPv4 and IPv6 ill. ill_init allocates a phyint to just hold flags. This
22479  * function is called from ipif_set_values and ill_lookup_on_name (for
22480  * loopback) where we know the name of the ill. We lookup the ill and if
22481  * there is one present already with the name use that phyint. Otherwise
22482  * reuse the one allocated by ill_init.
       *
       * The caller must hold ips_ill_g_lock as writer, and the ill must still
       * carry the placeholder phyint (ifindex 0) with ill_name already set.
       * When an existing phyint is adopted the placeholder is freed; otherwise
       * the placeholder is assigned an ifindex and inserted into both the
       * by-name and by-index AVL trees. In either case the ifindex is then
       * propagated to the ill's first ipif and MIB structures and, except for
       * names longer than two characters that begin with "lo", a plumb nic
       * event is prepared.
22483  */
22484 static void
22485 ill_phyint_reinit(ill_t *ill)
22486 {
22487         boolean_t isv6 = ill->ill_isv6;
22488         phyint_t *phyi_old;
22489         phyint_t *phyi;
22490         avl_index_t where = 0;
22491         ill_t   *ill_other = NULL;
22492         ipsq_t  *ipsq;
22493         ip_stack_t      *ipst = ill->ill_ipst;
22494
22495         ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
22496
22497         phyi_old = ill->ill_phyint;
              /* The placeholder phyint refers only to this ill and has no ifindex */
22498         ASSERT(isv6 || (phyi_old->phyint_illv4 == ill &&
22499             phyi_old->phyint_illv6 == NULL));
22500         ASSERT(!isv6 || (phyi_old->phyint_illv6 == ill &&
22501             phyi_old->phyint_illv4 == NULL));
22502         ASSERT(phyi_old->phyint_ifindex == 0);
22503
              /*
               * Look for a phyint already registered under this name; 'where'
               * records the insertion point used below if none is found.
               */
22504         phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
22505             ill->ill_name, &where);
22506
22507         /*
22508          * 1. We grabbed the ill_g_lock before inserting this ill into
22509          *    the global list of ills. So no other thread could have located
22510          *    this ill and hence the ipsq of this ill is guaranteed to be empty.
22511          * 2. Now locate the other protocol instance of this ill.
22512          * 3. Now grab both ill locks in the right order, and the phyint lock of
22513          *    the new ipsq. Holding ill locks + ill_g_lock ensures that the ipsq
22514          *    of neither ill can change.
22515          * 4. Merge the phyint and thus the ipsq as well of this ill onto the
22516          *    other ill.
22517          * 5. Release all locks.
22518          */
22519
22520         /*
22521          * Look for IPv4 if we are initializing IPv6 or look for IPv6 if
22522          * we are initializing IPv4.
22523          */
22524         if (phyi != NULL) {
22525                 ill_other = (isv6) ? phyi->phyint_illv4 :
22526                     phyi->phyint_illv6;
                      /*
                       * ill_other is dereferenced without a NULL check; the code
                       * relies on a phyint found by name still having the other
                       * protocol's ill attached.
                       */
22527                 ASSERT(ill_other->ill_phyint != NULL);
22528                 ASSERT((isv6 && !ill_other->ill_isv6) ||
22529                     (!isv6 && ill_other->ill_isv6));
22530                 GRAB_ILL_LOCKS(ill, ill_other);
22531                 /*
22532                  * We are potentially throwing away phyint_flags which
22533                  * could be different from the one that we obtain from
22534                  * ill_other->ill_phyint. But it is okay as we are assuming
22535                  * that the state maintained within IP is correct.
22536                  */
22537                 mutex_enter(&phyi->phyint_lock);
22538                 if (isv6) {
22539                         ASSERT(phyi->phyint_illv6 == NULL);
22540                         phyi->phyint_illv6 = ill;
22541                 } else {
22542                         ASSERT(phyi->phyint_illv4 == NULL);
22543                         phyi->phyint_illv4 = ill;
22544                 }
22545                 /*
22546                  * This is a new ill, currently undergoing SLIFNAME
22547                  * So we could not have joined an IPMP group until now.
22548                  */
22549                 ASSERT(phyi_old->phyint_ipsq_next == NULL &&
22550                     phyi_old->phyint_groupname == NULL);
22551
22552                 /*
22553                  * This phyi_old is going away. Decref ipsq_refs and
22554                  * assert it is zero. The ipsq itself will be freed in
22555                  * ipsq_exit
22556                  */
22557                 ipsq = phyi_old->phyint_ipsq;
22558                 IPSQ_DEC_REF(ipsq, ipst);
22559                 ASSERT(ipsq->ipsq_refs == 0);
22560                 /* Get the singleton phyint out of the ipsq list */
22561                 ASSERT(phyi_old->phyint_ipsq_next == NULL);
22562                 ipsq->ipsq_phyint_list = NULL;
22563                 phyi_old->phyint_illv4 = NULL;
22564                 phyi_old->phyint_illv6 = NULL;
22565                 mi_free(phyi_old);
22566         } else {
22567                 mutex_enter(&ill->ill_lock);
22568                 /*
22569                  * We don't need to acquire any lock, since
22570                  * the ill is not yet visible globally  and we
22571                  * have not yet released the ill_g_lock.
                       *
                       * ill_lock and phyint_lock are taken regardless so that the
                       * code shared with the other branch works unchanged:
                       * ill_nic_info_plumb() asserts that ill_lock is held, and
                       * both locks are released unconditionally at the end of
                       * this function.
22572                  */
22573                 phyi = phyi_old;
22574                 mutex_enter(&phyi->phyint_lock);
22575                 /* XXX We need a recovery strategy here. */
22576                 if (!phyint_assign_ifindex(phyi, ipst))
22577                         cmn_err(CE_PANIC, "phyint_assign_ifindex() failed");
22578
22579                 /* No IPMP group yet, thus the hook uses the ifindex */
22580                 phyi->phyint_hook_ifindex = phyi->phyint_ifindex;
22581
                      /* 'where' still holds the slot found by the name lookup above */
22582                 avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
22583                     (void *)phyi, where);
22584
                      /*
                       * This avl_find is only used to compute the insertion point
                       * in the by-index tree; its return value is not needed.
                       */
22585                 (void) avl_find(&ipst->ips_phyint_g_list->
22586                     phyint_list_avl_by_index,
22587                     &phyi->phyint_ifindex, &where);
22588                 avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
22589                     (void *)phyi, where);
22590         }
22591
22592         /*
22593          * Reassigning ill_phyint automatically reassigns the ipsq also.
22594          * pending mp is not affected because that is per ill basis.
22595          */
22596         ill->ill_phyint = phyi;
22597
22598         /*
22599          * Keep the index on ipif_orig_index to be used by FAILOVER.
22600          * We do this here as when the first ipif was allocated,
22601          * ipif_allocate does not know the right interface index.
22602          */
22603
22604         ill->ill_ipif->ipif_orig_ifindex = ill->ill_phyint->phyint_ifindex;
22605         /*
22606          * Now that the phyint's ifindex has been assigned, complete the
22607          * remaining per-ill state that depends on it: the ifindex stored in
               * the MIB structures, plus the initial multicast protocol version.
22608          */
22609
22610         ill->ill_ip_mib->ipIfStatsIfIndex = ill->ill_phyint->phyint_ifindex;
22611         if (ill->ill_isv6) {
22612                 ill->ill_icmp6_mib->ipv6IfIcmpIfIndex =
22613                     ill->ill_phyint->phyint_ifindex;
22614                 ill->ill_mcast_type = ipst->ips_mld_max_version;
22615         } else {
22616                 ill->ill_mcast_type = ipst->ips_igmp_max_version;
22617         }
22618
22619         /*
22620          * Generate an event within the hooks framework to indicate that
22621          * a new interface has just been added to IP.  For this event to
22622          * be generated, the network interface must, at least, have an
22623          * ifindex assigned to it.
22624          *
22625          * This needs to be run inside the ill_g_lock perimeter to ensure
22626          * that the ordering of delivered events to listeners matches the
22627          * order of them in the kernel.
22628          *
22629          * This function could be called from ill_lookup_on_name. In that case
22630          * the interface is loopback "lo", which will not generate a NIC event.
               *
               * The test below skips the event only when ill_name_length exceeds
               * 2 and the name starts with 'l', 'o'.
22631          */
22632         if (ill->ill_name_length <= 2 ||
22633             ill->ill_name[0] != 'l' || ill->ill_name[1] != 'o') {
22634                 /*
22635                  * Generate nic plumb event for ill_name even if
22636                  * ipmp_hook_emulation is set. That avoids generating events
22637                  * for the ill_names should ipmp_hook_emulation be turned on
22638                  * later.
22639                  */
22640                 ill_nic_info_plumb(ill, B_FALSE);
22641         }
              /*
               * ill_other is NULL when no existing phyint was found.
               * NOTE(review): RELEASE_ILL_LOCKS is defined elsewhere and is
               * presumably written to tolerate a NULL second ill - confirm.
               */
22642         RELEASE_ILL_LOCKS(ill, ill_other);
22643         mutex_exit(&phyi->phyint_lock);
22644 }
22645
22646 /*
22647  * Allocate a NE_PLUMB nic info event and store in the ill.
22648  * If 'group' is set we do it for the group name, otherwise the ill name.
22649  * It will be sent when we leave the ipsq.
22650  */
22651 void
22652 ill_nic_info_plumb(ill_t *ill, boolean_t group)
22653 {
22654         phyint_t        *phyi = ill->ill_phyint;
22655         ip_stack_t      *ipst = ill->ill_ipst;
22656         hook_nic_event_t *info;
22657         char            *name;
22658         int             namelen;
22659
22660         ASSERT(MUTEX_HELD(&ill->ill_lock));
22661
22662         if ((info = ill->ill_nic_event_info) != NULL) {
22663                 ip2dbg(("ill_nic_info_plumb: unexpected nic event %d "
22664                     "attached for %s\n", info->hne_event,
22665                     ill->ill_name));
22666                 if (info->hne_data != NULL)
22667                         kmem_free(info->hne_data, info->hne_datalen);
22668                 kmem_free(info, sizeof (hook_nic_event_t));
22669                 ill->ill_nic_event_info = NULL;
22670         }
22671
22672         info = kmem_alloc(sizeof (hook_nic_event_t), KM_NOSLEEP);
22673         if (info == NULL) {
22674                 ip2dbg(("ill_nic_info_plumb: could not attach PLUMB nic "
22675                     "event information for %s (ENOMEM)\n",
22676                     ill->ill_name));
22677                 return;
22678         }
22679
22680         if (group) {
22681                 ASSERT(phyi->phyint_groupname_len != 0);
22682                 namelen = phyi->phyint_groupname_len;
22683                 name = phyi->phyint_groupname;
22684         } else {
22685                 namelen = ill->ill_name_length;
22686                 name = ill->ill_name;
22687         }
22688
22689         info->hne_nic = phyi->phyint_hook_ifindex;
22690         info->hne_lif = 0;
22691         info->hne_event = NE_PLUMB;
22692         info->hne_family = ill->ill_isv6 ?
22693             ipst->ips_ipv6_net_data : ipst->ips_ipv4_net_data;
22694
22695         info->hne_data = kmem_alloc(namelen, KM_NOSLEEP);
22696         if (info->hne_data != NULL) {
22697                 info->hne_datalen = namelen;
22698                 bcopy(name, info->hne_data, info->hne_datalen);
22699         } else {
22700                 ip2dbg(("ill_nic_info_plumb: could not attach "
22701                     "name information for PLUMB nic event "
22702                     "of %s (ENOMEM)\n", name));
22703                 kmem_free(info, sizeof (hook_nic_event_t));
22704                 info = NULL;
22705         }
22706         ill->ill_nic_event_info = info;
22707 }
22708
22709 /*
22710  * Unhook the nic event message from the ill and enqueue it
22711  * into the nic event taskq.
22712  */
22713 void
22714 ill_nic_info_dispatch(ill_t *ill)
22715 {
22716         hook_nic_event_t *info;
22717
22718         ASSERT(MUTEX_HELD(&ill->ill_lock));
22719
22720         if ((info = ill->ill_nic_event_info) != NULL) {
22721                 if (ddi_taskq_dispatch(eventq_queue_nic,
22722                     ip_ne_queue_func, info, DDI_SLEEP) == DDI_FAILURE) {
22723                         ip2dbg(("ill_nic_info_dispatch: "
22724                             "ddi_taskq_dispatch failed\n"));
22725                         if (info->hne_data != NULL)
22726                                 kmem_free(info->hne_data, info->hne_datalen);
22727                         kmem_free(info, sizeof (hook_nic_event_t));
22728                 }
22729                 ill->ill_nic_event_info = NULL;
22730         }
22731 }
22732
22733 /*
22734  * Notify any downstream modules of the name of this interface.
22735  * An M_IOCTL is used even though we don't expect a successful reply.
22736  * Any reply message from the driver (presumably an M_IOCNAK) will
22737  * eventually get discarded somewhere upstream.  The message format is
22738  * simply an SIOCSLIFNAME ioctl just as might be sent from ifconfig
22739  * to IP.
22740  */
22741 static void
22742 ip_ifname_notify(ill_t *ill, queue_t *q)
22743 {
22744         mblk_t *mp1, *mp2;
22745         struct iocblk *iocp;
22746         struct lifreq *lifr;
22747
22748         mp1 = mkiocb(SIOCSLIFNAME);
22749         if (mp1 == NULL)
22750                 return;
22751         mp2 = allocb(sizeof (struct lifreq), BPRI_HI);
22752         if (mp2 == NULL) {
22753                 freeb(mp1);
22754                 return;
22755         }
22756
22757         mp1->b_cont = mp2;
22758         iocp = (struct iocblk *)mp1->b_rptr;
22759         iocp->ioc_count = sizeof (struct lifreq);
22760
22761         lifr = (struct lifreq *)mp2->b_rptr;
22762         mp2->b_wptr += sizeof (struct lifreq);
22763         bzero(lifr, sizeof (struct lifreq));
22764
22765         (void) strncpy(lifr->lifr_name, ill->ill_name, LIFNAMSIZ);
22766         lifr->lifr_ppa = ill->ill_ppa;
22767         lifr->lifr_flags = (ill->ill_flags & (ILLF_IPV4|ILLF_IPV6));
22768
22769         putnext(q, mp1);
22770 }
22771
22772 static int
22773 ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q)
22774 {
22775         int err;
22776         ip_stack_t      *ipst = ill->ill_ipst;
22777
22778         /* Set the obsolete NDD per-interface forwarding name. */
22779         err = ill_set_ndd_name(ill);
22780         if (err != 0) {
22781                 cmn_err(CE_WARN, "ipif_set_values: ill_set_ndd_name (%d)\n",
22782                     err);
22783         }
22784
22785         /* Tell downstream modules where they are. */
22786         ip_ifname_notify(ill, q);
22787
22788         /*
22789          * ill_dl_phys returns EINPROGRESS in the usual case.
22790          * Error cases are ENOMEM ...
22791          */
22792         err = ill_dl_phys(ill, ipif, mp, q);
22793
22794         /*
22795          * If there is no IRE expiration timer running, get one started.
22796          * igmp and mld timers will be triggered by the first multicast
22797          */
22798         if (ipst->ips_ip_ire_expire_id == 0) {
22799                 /*
22800                  * acquire the lock and check again.
22801                  */
22802                 mutex_enter(&ipst->ips_ip_trash_timer_lock);
22803                 if (ipst->ips_ip_ire_expire_id == 0) {
22804                         ipst->ips_ip_ire_expire_id = timeout(
22805                             ip_trash_timer_expire, ipst,
22806                             MSEC_TO_TICK(ipst->ips_ip_timer_interval));
22807                 }
22808                 mutex_exit(&ipst->ips_ip_trash_timer_lock);
22809         }
22810
22811         if (ill->ill_isv6) {
22812                 mutex_enter(&ipst->ips_mld_slowtimeout_lock);
22813                 if (ipst->ips_mld_slowtimeout_id == 0) {
22814                         ipst->ips_mld_slowtimeout_id = timeout(mld_slowtimo,
22815                             (void *)ipst,
22816                             MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
22817                 }
22818                 mutex_exit(&ipst->ips_mld_slowtimeout_lock);
22819         } else {
22820                 mutex_enter(&ipst->ips_igmp_slowtimeout_lock);
22821                 if (ipst->ips_igmp_slowtimeout_id == 0) {
22822                         ipst->ips_igmp_slowtimeout_id = timeout(igmp_slowtimo,
22823                             (void *)ipst,
22824                             MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
22825                 }
22826                 mutex_exit(&ipst->ips_igmp_slowtimeout_lock);
22827         }
22828
22829         return (err);
22830 }
22831
22832 /*
22833  * Common routine for ppa and ifname setting. Should be called exclusive.
22834  *
22835  * Returns EINPROGRESS when mp has been consumed by queueing it on
22836  * ill_pending_mp and the ioctl will complete in ip_rput.
22837  *
22838  * NOTE : If ppa is UINT_MAX, we assign the next valid ppa and return
22839  * the new name and new ppa in lifr_name and lifr_ppa respectively.
22840  * For SLIFNAME, we pass these values back to the userland.
22841  */
22842 static int
22843 ipif_set_values(queue_t *q, mblk_t *mp, char *interf_name, uint_t *new_ppa_ptr)
22844 {
22845         ill_t   *ill;
22846         ipif_t  *ipif;
22847         ipsq_t  *ipsq;
22848         char    *ppa_ptr;
22849         char    *old_ptr;
22850         char    old_char;
22851         int     error;
22852         ip_stack_t      *ipst;
22853
22854         ip1dbg(("ipif_set_values: interface %s\n", interf_name));
22855         ASSERT(q->q_next != NULL);
22856         ASSERT(interf_name != NULL);
22857
22858         ill = (ill_t *)q->q_ptr;
22859         ipst = ill->ill_ipst;
22860
22861         ASSERT(ill->ill_ipst != NULL);
22862         ASSERT(ill->ill_name[0] == '\0');
22863         ASSERT(IAM_WRITER_ILL(ill));
22864         ASSERT((mi_strlen(interf_name) + 1) <= LIFNAMSIZ);
22865         ASSERT(ill->ill_ppa == UINT_MAX);
22866
22867         /* The ppa is sent down by ifconfig or is chosen */
22868         if ((ppa_ptr = ill_get_ppa_ptr(interf_name)) == NULL) {
22869                 return (EINVAL);
22870         }
22871
22872         /*
22873          * make sure ppa passed in is same as ppa in the name.
22874          * This check is not made when ppa == UINT_MAX in that case ppa
22875          * in the name could be anything. System will choose a ppa and
22876          * update new_ppa_ptr and inter_name to contain the choosen ppa.
22877          */
22878         if (*new_ppa_ptr != UINT_MAX) {
22879                 /* stoi changes the pointer */
22880                 old_ptr = ppa_ptr;
22881                 /*
22882                  * ifconfig passed in 0 for the ppa for DLPI 1 style devices
22883                  * (they don't have an externally visible ppa).  We assign one
22884                  * here so that we can manage the interface.  Note that in
22885                  * the past this value was always 0 for DLPI 1 drivers.
22886                  */
22887                 if (*new_ppa_ptr == 0)
22888                         *new_ppa_ptr = stoi(&old_ptr);
22889                 else if (*new_ppa_ptr != (uint_t)stoi(&old_ptr))
22890                         return (EINVAL);
22891         }
22892         /*
22893          * terminate string before ppa
22894          * save char at that location.
22895          */
22896         old_char = ppa_ptr[0];
22897         ppa_ptr[0] = '\0';
22898
22899         ill->ill_ppa = *new_ppa_ptr;
22900         /*
22901          * Finish as much work now as possible before calling ill_glist_insert
22902          * which makes the ill globally visible and also merges it with the
22903          * other protocol instance of this phyint. The remaining work is
22904          * done after entering the ipsq which may happen sometime later.
22905          * ill_set_ndd_name occurs after the ill has been made globally visible.
22906          */
22907         ipif = ill->ill_ipif;
22908
22909         /* We didn't do this when we allocated ipif in ip_ll_subnet_defaults */
22910         ipif_assign_seqid(ipif);
22911
22912         if (!(ill->ill_flags & (ILLF_IPV4|ILLF_IPV6)))
22913                 ill->ill_flags |= ILLF_IPV4;
22914
22915         ASSERT(ipif->ipif_next == NULL);        /* Only one ipif on ill */
22916         ASSERT((ipif->ipif_flags & IPIF_UP) == 0);
22917
22918         if (ill->ill_flags & ILLF_IPV6) {
22919
22920                 ill->ill_isv6 = B_TRUE;
22921                 if (ill->ill_rq != NULL) {
22922                         ill->ill_rq->q_qinfo = &iprinitv6;
22923                         ill->ill_wq->q_qinfo = &ipwinitv6;
22924                 }
22925
22926                 /* Keep the !IN6_IS_ADDR_V4MAPPED assertions happy */
22927                 ipif->ipif_v6lcl_addr = ipv6_all_zeros;
22928                 ipif->ipif_v6src_addr = ipv6_all_zeros;
22929                 ipif->ipif_v6subnet = ipv6_all_zeros;
22930                 ipif->ipif_v6net_mask = ipv6_all_zeros;
22931                 ipif->ipif_v6brd_addr = ipv6_all_zeros;
22932                 ipif->ipif_v6pp_dst_addr = ipv6_all_zeros;
22933                 /*
22934                  * point-to-point or Non-mulicast capable
22935                  * interfaces won't do NUD unless explicitly
22936                  * configured to do so.
22937                  */
22938                 if (ipif->ipif_flags & IPIF_POINTOPOINT ||
22939                     !(ill->ill_flags & ILLF_MULTICAST)) {
22940                         ill->ill_flags |= ILLF_NONUD;
22941                 }
22942                 /* Make sure IPv4 specific flag is not set on IPv6 if */
22943                 if (ill->ill_flags & ILLF_NOARP) {
22944                         /*
22945                          * Note: xresolv interfaces will eventually need
22946                          * NOARP set here as well, but that will require
22947                          * those external resolvers to have some
22948                          * knowledge of that flag and act appropriately.
22949                          * Not to be changed at present.
22950                          */
22951                         ill->ill_flags &= ~ILLF_NOARP;
22952                 }
22953                 /*
22954                  * Set the ILLF_ROUTER flag according to the global
22955                  * IPv6 forwarding policy.
22956                  */
22957                 if (ipst->ips_ipv6_forward != 0)
22958                         ill->ill_flags |= ILLF_ROUTER;
22959         } else if (ill->ill_flags & ILLF_IPV4) {
22960                 ill->ill_isv6 = B_FALSE;
22961                 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6lcl_addr);
22962                 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6src_addr);
22963                 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6subnet);
22964                 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6net_mask);
22965                 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6brd_addr);
22966                 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6pp_dst_addr);
22967                 /*
22968                  * Set the ILLF_ROUTER flag according to the global
22969                  * IPv4 forwarding policy.
22970                  */
22971                 if (ipst->ips_ip_g_forward != 0)
22972                         ill->ill_flags |= ILLF_ROUTER;
22973         }
22974
22975         ASSERT(ill->ill_phyint != NULL);
22976
22977         /*
22978          * The ipIfStatsIfindex and ipv6IfIcmpIfIndex assignments will
22979          * be completed in ill_glist_insert -> ill_phyint_reinit
22980          */
22981         if (!ill_allocate_mibs(ill))
22982                 return (ENOMEM);
22983
22984         /*
22985          * Pick a default sap until we get the DL_INFO_ACK back from
22986          * the driver.
22987          */
22988         if (ill->ill_sap == 0) {
22989                 if (ill->ill_isv6)
22990                         ill->ill_sap  = IP6_DL_SAP;
22991                 else
22992                         ill->ill_sap  = IP_DL_SAP;
22993         }
22994
22995         ill->ill_ifname_pending = 1;
22996         ill->ill_ifname_pending_err = 0;
22997
22998         ill_refhold(ill);
22999         rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
23000         if ((error = ill_glist_insert(ill, interf_name,
23001             (ill->ill_flags & ILLF_IPV6) == ILLF_IPV6)) > 0) {
23002                 ill->ill_ppa = UINT_MAX;
23003                 ill->ill_name[0] = '\0';
23004                 /*
23005                  * undo null termination done above.
23006                  */
23007                 ppa_ptr[0] = old_char;
23008                 rw_exit(&ipst->ips_ill_g_lock);
23009                 ill_refrele(ill);
23010                 return (error);
23011         }
23012
23013         ASSERT(ill->ill_name_length <= LIFNAMSIZ);
23014
23015         /*
23016          * When we return the buffer pointed to by interf_name should contain
23017          * the same name as in ill_name.
23018          * If a ppa was chosen by the system (ppa passed in was UINT_MAX)
23019          * the buffer pointed to by new_ppa_ptr would not contain the right ppa
23020          * so copy full name and update the ppa ptr.
23021          * When ppa passed in != UINT_MAX all values are correct just undo
23022          * null termination, this saves a bcopy.
23023          */
23024         if (*new_ppa_ptr == UINT_MAX) {
23025                 bcopy(ill->ill_name, interf_name, ill->ill_name_length);
23026                 *new_ppa_ptr = ill->ill_ppa;
23027         } else {
23028                 /*
23029                  * undo null termination done above.
23030                  */
23031                 ppa_ptr[0] = old_char;
23032         }
23033
23034         /* Let SCTP know about this ILL */
23035         sctp_update_ill(ill, SCTP_ILL_INSERT);
23036
23037         ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_reprocess_ioctl, NEW_OP,
23038             B_TRUE);
23039
23040         rw_exit(&ipst->ips_ill_g_lock);
23041         ill_refrele(ill);
23042         if (ipsq == NULL)
23043                 return (EINPROGRESS);
23044
23045         /*
23046          * If ill_phyint_reinit() changed our ipsq, then start on the new ipsq.
23047          */
23048         if (ipsq->ipsq_current_ipif == NULL)
23049                 ipsq_current_start(ipsq, ipif, SIOCSLIFNAME);
23050         else
23051                 ASSERT(ipsq->ipsq_current_ipif == ipif);
23052
23053         error = ipif_set_values_tail(ill, ipif, mp, q);
23054         ipsq_exit(ipsq, B_TRUE, B_TRUE);
23055         if (error != 0 && error != EINPROGRESS) {
23056                 /*
23057                  * restore previous values
23058                  */
23059                 ill->ill_isv6 = B_FALSE;
23060         }
23061         return (error);
23062 }
23063
23064
23065 void
23066 ipif_init(ip_stack_t *ipst)
23067 {
23068         hrtime_t hrt;
23069         int i;
23070
23071         /*
23072          * Can't call drv_getparm here as it is too early in the boot.
23073          * As we use ipif_src_random just for picking a different
23074          * source address everytime, this need not be really random.
23075          */
23076         hrt = gethrtime();
23077         ipst->ips_ipif_src_random =
23078             ((hrt >> 32) & 0xffffffff) * (hrt & 0xffffffff);
23079
23080         for (i = 0; i < MAX_G_HEADS; i++) {
23081                 ipst->ips_ill_g_heads[i].ill_g_list_head =
23082                     (ill_if_t *)&ipst->ips_ill_g_heads[i];
23083                 ipst->ips_ill_g_heads[i].ill_g_list_tail =
23084                     (ill_if_t *)&ipst->ips_ill_g_heads[i];
23085         }
23086
23087         avl_create(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
23088             ill_phyint_compare_index,
23089             sizeof (phyint_t),
23090             offsetof(struct phyint, phyint_avl_by_index));
23091         avl_create(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
23092             ill_phyint_compare_name,
23093             sizeof (phyint_t),
23094             offsetof(struct phyint, phyint_avl_by_name));
23095 }
23096
23097 /*
23098  * Lookup the ipif corresponding to the onlink destination address. For
23099  * point-to-point interfaces, it matches with remote endpoint destination
23100  * address. For point-to-multipoint interfaces it only tries to match the
23101  * destination with the interface's subnet address. The longest, most specific
23102  * match is found to take care of such rare network configurations like -
23103  * le0: 129.146.1.1/16
23104  * le1: 129.146.2.2/24
23105  * It is used only by SO_DONTROUTE at the moment.
23106  */
23107 ipif_t *
23108 ipif_lookup_onlink_addr(ipaddr_t addr, zoneid_t zoneid, ip_stack_t *ipst)
23109 {
23110         ipif_t  *ipif, *best_ipif;
23111         ill_t   *ill;
23112         ill_walk_context_t ctx;
23113
23114         ASSERT(zoneid != ALL_ZONES);
23115         best_ipif = NULL;
23116
23117         rw_enter(&ipst->ips_ill_g_lock, RW_READER);
23118         ill = ILL_START_WALK_V4(&ctx, ipst);
23119         for (; ill != NULL; ill = ill_next(&ctx, ill)) {
23120                 mutex_enter(&ill->ill_lock);
23121                 for (ipif = ill->ill_ipif; ipif != NULL;
23122                     ipif = ipif->ipif_next) {
23123                         if (!IPIF_CAN_LOOKUP(ipif))
23124                                 continue;
23125                         if (ipif->ipif_zoneid != zoneid &&
23126                             ipif->ipif_zoneid != ALL_ZONES)
23127                                 continue;
23128                         /*
23129                          * Point-to-point case. Look for exact match with
23130                          * destination address.
23131                          */
23132                         if (ipif->ipif_flags & IPIF_POINTOPOINT) {
23133                                 if (ipif->ipif_pp_dst_addr == addr) {
23134                                         ipif_refhold_locked(ipif);
23135                                         mutex_exit(&ill->ill_lock);
23136                                         rw_exit(&ipst->ips_ill_g_lock);
23137                                         if (best_ipif != NULL)
23138                                                 ipif_refrele(best_ipif);
23139                                         return (ipif);
23140                                 }
23141                         } else if (ipif->ipif_subnet == (addr &
23142                             ipif->ipif_net_mask)) {
23143                                 /*
23144                                  * Point-to-multipoint case. Looping through to
23145                                  * find the most specific match. If there are
23146                                  * multiple best match ipif's then prefer ipif's
23147                                  * that are UP. If there is only one best match
23148                                  * ipif and it is DOWN we must still return it.
23149                                  */
23150                                 if ((best_ipif == NULL) ||
23151                                     (ipif->ipif_net_mask >
23152                                     best_ipif->ipif_net_mask) ||
23153                                     ((ipif->ipif_net_mask ==
23154                                     best_ipif->ipif_net_mask) &&
23155                                     ((ipif->ipif_flags & IPIF_UP) &&
23156                                     (!(best_ipif->ipif_flags & IPIF_UP))))) {
23157                                         ipif_refhold_locked(ipif);
23158                                         mutex_exit(&ill->ill_lock);
23159                                         rw_exit(&ipst->ips_ill_g_lock);
23160                                         if (best_ipif != NULL)
23161                                                 ipif_refrele(best_ipif);
23162                                         best_ipif = ipif;
23163                                         rw_enter(&ipst->ips_ill_g_lock,
23164                                             RW_READER);
23165                                         mutex_enter(&ill->ill_lock);
23166                                 }
23167                         }
23168                 }
23169                 mutex_exit(&ill->ill_lock);
23170         }
23171         rw_exit(&ipst->ips_ill_g_lock);
23172         return (best_ipif);
23173 }
23174
23175
23176 /*
23177  * Save enough information so that we can recreate the IRE if
23178  * the interface goes down and then up.
23179  */
23180 static void
23181 ipif_save_ire(ipif_t *ipif, ire_t *ire)
23182 {
23183         mblk_t  *save_mp;
23184
23185         save_mp = allocb(sizeof (ifrt_t), BPRI_MED);
23186         if (save_mp != NULL) {
23187                 ifrt_t  *ifrt;
23188
23189                 save_mp->b_wptr += sizeof (ifrt_t);
23190                 ifrt = (ifrt_t *)save_mp->b_rptr;
23191                 bzero(ifrt, sizeof (ifrt_t));
23192                 ifrt->ifrt_type = ire->ire_type;
23193                 ifrt->ifrt_addr = ire->ire_addr;
23194                 ifrt->ifrt_gateway_addr = ire->ire_gateway_addr;
23195                 ifrt->ifrt_src_addr = ire->ire_src_addr;
23196                 ifrt->ifrt_mask = ire->ire_mask;
23197                 ifrt->ifrt_flags = ire->ire_flags;
23198                 ifrt->ifrt_max_frag = ire->ire_max_frag;
23199                 mutex_enter(&ipif->ipif_saved_ire_lock);
23200                 save_mp->b_cont = ipif->ipif_saved_ire_mp;
23201                 ipif->ipif_saved_ire_mp = save_mp;
23202                 ipif->ipif_saved_ire_cnt++;
23203                 mutex_exit(&ipif->ipif_saved_ire_lock);
23204         }
23205 }
23206
23207
23208 static void
23209 ipif_remove_ire(ipif_t *ipif, ire_t *ire)
23210 {
23211         mblk_t  **mpp;
23212         mblk_t  *mp;
23213         ifrt_t  *ifrt;
23214
23215         /* Remove from ipif_saved_ire_mp list if it is there */
23216         mutex_enter(&ipif->ipif_saved_ire_lock);
23217         for (mpp = &ipif->ipif_saved_ire_mp; *mpp != NULL;
23218             mpp = &(*mpp)->b_cont) {
23219                 /*
23220                  * On a given ipif, the triple of address, gateway and
23221                  * mask is unique for each saved IRE (in the case of
23222                  * ordinary interface routes, the gateway address is
23223                  * all-zeroes).
23224                  */
23225                 mp = *mpp;
23226                 ifrt = (ifrt_t *)mp->b_rptr;
23227                 if (ifrt->ifrt_addr == ire->ire_addr &&
23228                     ifrt->ifrt_gateway_addr == ire->ire_gateway_addr &&
23229                     ifrt->ifrt_mask == ire->ire_mask) {
23230                         *mpp = mp->b_cont;
23231                         ipif->ipif_saved_ire_cnt--;
23232                         freeb(mp);
23233                         break;
23234                 }
23235         }
23236         mutex_exit(&ipif->ipif_saved_ire_lock);
23237 }
23238
23239
23240 /*
23241  * IP multirouting broadcast routes handling
23242  * Append CGTP broadcast IREs to regular ones created
23243  * at ifconfig time.
23244  */
23245 static void
23246 ip_cgtp_bcast_add(ire_t *ire, ire_t *ire_dst, ip_stack_t *ipst)
23247 {
23248         ire_t *ire_prim;
23249
23250         ASSERT(ire != NULL);
23251         ASSERT(ire_dst != NULL);
23252
23253         ire_prim = ire_ctable_lookup(ire->ire_gateway_addr, 0,
23254             IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
23255         if (ire_prim != NULL) {
23256                 /*
23257                  * We are in the special case of broadcasts for
23258                  * CGTP. We add an IRE_BROADCAST that holds
23259                  * the RTF_MULTIRT flag, the destination
23260                  * address of ire_dst and the low level
23261                  * info of ire_prim. In other words, CGTP
23262                  * broadcast is added to the redundant ipif.
23263                  */
23264                 ipif_t *ipif_prim;
23265                 ire_t  *bcast_ire;
23266
23267                 ipif_prim = ire_prim->ire_ipif;
23268
23269                 ip2dbg(("ip_cgtp_filter_bcast_add: "
23270                     "ire_dst %p, ire_prim %p, ipif_prim %p\n",
23271                     (void *)ire_dst, (void *)ire_prim,
23272                     (void *)ipif_prim));
23273
23274                 bcast_ire = ire_create(
23275                     (uchar_t *)&ire->ire_addr,
23276                     (uchar_t *)&ip_g_all_ones,
23277                     (uchar_t *)&ire_dst->ire_src_addr,
23278                     (uchar_t *)&ire->ire_gateway_addr,
23279                     &ipif_prim->ipif_mtu,
23280                     NULL,
23281                     ipif_prim->ipif_rq,
23282                     ipif_prim->ipif_wq,
23283                     IRE_BROADCAST,
23284                     ipif_prim,
23285                     0,
23286                     0,
23287                     0,
23288                     ire->ire_flags,
23289                     &ire_uinfo_null,
23290                     NULL,
23291                     NULL,
23292                     ipst);
23293
23294                 if (bcast_ire != NULL) {
23295
23296                         if (ire_add(&bcast_ire, NULL, NULL, NULL,
23297                             B_FALSE) == 0) {
23298                                 ip2dbg(("ip_cgtp_filter_bcast_add: "
23299                                     "added bcast_ire %p\n",
23300                                     (void *)bcast_ire));
23301
23302                                 ipif_save_ire(bcast_ire->ire_ipif,
23303                                     bcast_ire);
23304                                 ire_refrele(bcast_ire);
23305                         }
23306                 }
23307                 ire_refrele(ire_prim);
23308         }
23309 }
23310
23311
23312 /*
23313  * IP multirouting broadcast routes handling
23314  * Remove the broadcast ire
23315  */
23316 static void
23317 ip_cgtp_bcast_delete(ire_t *ire, ip_stack_t *ipst)
23318 {
23319         ire_t *ire_dst;
23320
23321         ASSERT(ire != NULL);
23322         ire_dst = ire_ctable_lookup(ire->ire_addr, 0, IRE_BROADCAST,
23323             NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
23324         if (ire_dst != NULL) {
23325                 ire_t *ire_prim;
23326
23327                 ire_prim = ire_ctable_lookup(ire->ire_gateway_addr, 0,
23328                     IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
23329                 if (ire_prim != NULL) {
23330                         ipif_t *ipif_prim;
23331                         ire_t  *bcast_ire;
23332
23333                         ipif_prim = ire_prim->ire_ipif;
23334
23335                         ip2dbg(("ip_cgtp_filter_bcast_delete: "
23336                             "ire_dst %p, ire_prim %p, ipif_prim %p\n",
23337                             (void *)ire_dst, (void *)ire_prim,
23338                             (void *)ipif_prim));
23339
23340                         bcast_ire = ire_ctable_lookup(ire->ire_addr,
23341                             ire->ire_gateway_addr,
23342                             IRE_BROADCAST,
23343                             ipif_prim, ALL_ZONES,
23344                             NULL,
23345                             MATCH_IRE_TYPE | MATCH_IRE_GW | MATCH_IRE_IPIF |
23346                             MATCH_IRE_MASK, ipst);
23347
23348                         if (bcast_ire != NULL) {
23349                                 ip2dbg(("ip_cgtp_filter_bcast_delete: "
23350                                     "looked up bcast_ire %p\n",
23351                                     (void *)bcast_ire));
23352                                 ipif_remove_ire(bcast_ire->ire_ipif,
23353                                     bcast_ire);
23354                                 ire_delete(bcast_ire);
23355                         }
23356                         ire_refrele(ire_prim);
23357                 }
23358                 ire_refrele(ire_dst);
23359         }
23360 }
23361
23362 /*
23363  * IPsec hardware acceleration capabilities related functions.
23364  */
23365
23366 /*
23367  * Free a per-ill IPsec capabilities structure.
23368  */
23369 static void
23370 ill_ipsec_capab_free(ill_ipsec_capab_t *capab)
23371 {
23372         if (capab->auth_hw_algs != NULL)
23373                 kmem_free(capab->auth_hw_algs, capab->algs_size);
23374         if (capab->encr_hw_algs != NULL)
23375                 kmem_free(capab->encr_hw_algs, capab->algs_size);
23376         if (capab->encr_algparm != NULL)
23377                 kmem_free(capab->encr_algparm, capab->encr_algparm_size);
23378         kmem_free(capab, sizeof (ill_ipsec_capab_t));
23379 }
23380
23381 /*
23382  * Allocate a new per-ill IPsec capabilities structure. This structure
23383  * is specific to an IPsec protocol (AH or ESP). It is implemented as
23384  * an array which specifies, for each algorithm, whether this algorithm
23385  * is supported by the ill or not.
23386  */
23387 static ill_ipsec_capab_t *
23388 ill_ipsec_capab_alloc(void)
23389 {
23390         ill_ipsec_capab_t *capab;
23391         uint_t nelems;
23392
23393         capab = kmem_zalloc(sizeof (ill_ipsec_capab_t), KM_NOSLEEP);
23394         if (capab == NULL)
23395                 return (NULL);
23396
23397         /* we need one bit per algorithm */
23398         nelems = MAX_IPSEC_ALGS / BITS(ipsec_capab_elem_t);
23399         capab->algs_size = nelems * sizeof (ipsec_capab_elem_t);
23400
23401         /* allocate memory to store algorithm flags */
23402         capab->encr_hw_algs = kmem_zalloc(capab->algs_size, KM_NOSLEEP);
23403         if (capab->encr_hw_algs == NULL)
23404                 goto nomem;
23405         capab->auth_hw_algs = kmem_zalloc(capab->algs_size, KM_NOSLEEP);
23406         if (capab->auth_hw_algs == NULL)
23407                 goto nomem;
23408         /*
23409          * Leave encr_algparm NULL for now since we won't need it half
23410          * the time
23411          */
23412         return (capab);
23413
23414 nomem:
23415         ill_ipsec_capab_free(capab);
23416         return (NULL);
23417 }
23418
23419 /*
23420  * Resize capability array.  Since we're exclusive, this is OK.
23421  */
23422 static boolean_t
23423 ill_ipsec_capab_resize_algparm(ill_ipsec_capab_t *capab, int algid)
23424 {
23425         ipsec_capab_algparm_t *nalp, *oalp;
23426         uint32_t olen, nlen;
23427
23428         oalp = capab->encr_algparm;
23429         olen = capab->encr_algparm_size;
23430
23431         if (oalp != NULL) {
23432                 if (algid < capab->encr_algparm_end)
23433                         return (B_TRUE);
23434         }
23435
23436         nlen = (algid + 1) * sizeof (*nalp);
23437         nalp = kmem_zalloc(nlen, KM_NOSLEEP);
23438         if (nalp == NULL)
23439                 return (B_FALSE);
23440
23441         if (oalp != NULL) {
23442                 bcopy(oalp, nalp, olen);
23443                 kmem_free(oalp, olen);
23444         }
23445         capab->encr_algparm = nalp;
23446         capab->encr_algparm_size = nlen;
23447         capab->encr_algparm_end = algid + 1;
23448
23449         return (B_TRUE);
23450 }
23451
23452 /*
23453  * Compare the capabilities of the specified ill with the protocol
23454  * and algorithms specified by the SA passed as argument.
23455  * If they match, returns B_TRUE, B_FALSE if they do not match.
23456  *
23457  * The ill can be passed as a pointer to it, or by specifying its index
23458  * and whether it is an IPv6 ill (ill_index and ill_isv6 arguments).
23459  *
23460  * Called by ipsec_out_is_accelerated() do decide whether an outbound
23461  * packet is eligible for hardware acceleration, and by
23462  * ill_ipsec_capab_send_all() to decide whether a SA must be sent down
23463  * to a particular ill.
23464  */
23465 boolean_t
23466 ipsec_capab_match(ill_t *ill, uint_t ill_index, boolean_t ill_isv6,
23467     ipsa_t *sa, netstack_t *ns)
23468 {
23469         boolean_t sa_isv6;
23470         uint_t algid;
23471         struct ill_ipsec_capab_s *cpp;
23472         boolean_t need_refrele = B_FALSE;
23473         ip_stack_t      *ipst = ns->netstack_ip;
23474
23475         if (ill == NULL) {
23476                 ill = ill_lookup_on_ifindex(ill_index, ill_isv6, NULL,
23477                     NULL, NULL, NULL, ipst);
23478                 if (ill == NULL) {
23479                         ip0dbg(("ipsec_capab_match: ill doesn't exist\n"));
23480                         return (B_FALSE);
23481                 }
23482                 need_refrele = B_TRUE;
23483         }
23484
23485         /*
23486          * Use the address length specified by the SA to determine
23487          * if it corresponds to a IPv6 address, and fail the matching
23488          * if the isv6 flag passed as argument does not match.
23489          * Note: this check is used for SADB capability checking before
23490          * sending SA information to an ill.
23491          */
23492         sa_isv6 = (sa->ipsa_addrfam == AF_INET6);
23493         if (sa_isv6 != ill_isv6)
23494                 /* protocol mismatch */
23495                 goto done;
23496
23497         /*
23498          * Check if the ill supports the protocol, algorithm(s) and
23499          * key size(s) specified by the SA, and get the pointers to
23500          * the algorithms supported by the ill.
23501          */
23502         switch (sa->ipsa_type) {
23503
23504         case SADB_SATYPE_ESP:
23505                 if (!(ill->ill_capabilities & ILL_CAPAB_ESP))
23506                         /* ill does not support ESP acceleration */
23507                         goto done;
23508                 cpp = ill->ill_ipsec_capab_esp;
23509                 algid = sa->ipsa_auth_alg;
23510                 if (!IPSEC_ALG_IS_ENABLED(algid, cpp->auth_hw_algs))
23511                         goto done;
23512                 algid = sa->ipsa_encr_alg;
23513                 if (!IPSEC_ALG_IS_ENABLED(algid, cpp->encr_hw_algs))
23514                         goto done;
23515                 if (algid < cpp->encr_algparm_end) {
23516                         ipsec_capab_algparm_t *alp = &cpp->encr_algparm[algid];
23517                         if (sa->ipsa_encrkeybits < alp->minkeylen)
23518                                 goto done;
23519                         if (sa->ipsa_encrkeybits > alp->maxkeylen)
23520                                 goto done;
23521                 }
23522                 break;
23523
23524         case SADB_SATYPE_AH:
23525                 if (!(ill->ill_capabilities & ILL_CAPAB_AH))
23526                         /* ill does not support AH acceleration */
23527                         goto done;
23528                 if (!IPSEC_ALG_IS_ENABLED(sa->ipsa_auth_alg,
23529                     ill->ill_ipsec_capab_ah->auth_hw_algs))
23530                         goto done;
23531                 break;
23532         }
23533
23534         if (need_refrele)
23535                 ill_refrele(ill);
23536         return (B_TRUE);
23537 done:
23538         if (need_refrele)
23539                 ill_refrele(ill);
23540         return (B_FALSE);
23541 }
23542
23543
23544 /*
23545  * Add a new ill to the list of IPsec capable ills.
23546  * Called from ill_capability_ipsec_ack() when an ACK was received
23547  * indicating that IPsec hardware processing was enabled for an ill.
23548  *
23549  * ill must point to the ill for which acceleration was enabled.
23550  * dl_cap must be set to DL_CAPAB_IPSEC_AH or DL_CAPAB_IPSEC_ESP.
23551  */
23552 static void
23553 ill_ipsec_capab_add(ill_t *ill, uint_t dl_cap, boolean_t sadb_resync)
23554 {
23555         ipsec_capab_ill_t **ills, *cur_ill, *new_ill;
23556         uint_t sa_type;
23557         uint_t ipproto;
23558         ip_stack_t      *ipst = ill->ill_ipst;
23559
23560         ASSERT((dl_cap == DL_CAPAB_IPSEC_AH) ||
23561             (dl_cap == DL_CAPAB_IPSEC_ESP));
23562
23563         switch (dl_cap) {
23564         case DL_CAPAB_IPSEC_AH:
23565                 sa_type = SADB_SATYPE_AH;
23566                 ills = &ipst->ips_ipsec_capab_ills_ah;
23567                 ipproto = IPPROTO_AH;
23568                 break;
23569         case DL_CAPAB_IPSEC_ESP:
23570                 sa_type = SADB_SATYPE_ESP;
23571                 ills = &ipst->ips_ipsec_capab_ills_esp;
23572                 ipproto = IPPROTO_ESP;
23573                 break;
23574         }
23575
23576         rw_enter(&ipst->ips_ipsec_capab_ills_lock, RW_WRITER);
23577
23578         /*
23579          * Add ill index to list of hardware accelerators. If
23580          * already in list, do nothing.
23581          */
23582         for (cur_ill = *ills; cur_ill != NULL &&
23583             (cur_ill->ill_index != ill->ill_phyint->phyint_ifindex ||
23584             cur_ill->ill_isv6 != ill->ill_isv6); cur_ill = cur_ill->next)
23585                 ;
23586
23587         if (cur_ill == NULL) {
23588                 /* if this is a new entry for this ill */
23589                 new_ill = kmem_zalloc(sizeof (ipsec_capab_ill_t), KM_NOSLEEP);
23590                 if (new_ill == NULL) {
23591                         rw_exit(&ipst->ips_ipsec_capab_ills_lock);
23592                         return;
23593                 }
23594
23595                 new_ill->ill_index = ill->ill_phyint->phyint_ifindex;
23596                 new_ill->ill_isv6 = ill->ill_isv6;
23597                 new_ill->next = *ills;
23598                 *ills = new_ill;
23599         } else if (!sadb_resync) {
23600                 /* not resync'ing SADB and an entry exists for this ill */
23601                 rw_exit(&ipst->ips_ipsec_capab_ills_lock);
23602                 return;
23603         }
23604
23605         rw_exit(&ipst->ips_ipsec_capab_ills_lock);
23606
23607         if (ipst->ips_ipcl_proto_fanout_v6[ipproto].connf_head != NULL)
23608                 /*
23609                  * IPsec module for protocol loaded, initiate dump
23610                  * of the SADB to this ill.
23611                  */
23612                 sadb_ill_download(ill, sa_type);
23613 }
23614
23615 /*
23616  * Remove an ill from the list of IPsec capable ills.
23617  */
23618 static void
23619 ill_ipsec_capab_delete(ill_t *ill, uint_t dl_cap)
23620 {
23621         ipsec_capab_ill_t **ills, *cur_ill, *prev_ill;
23622         ip_stack_t      *ipst = ill->ill_ipst;
23623
23624         ASSERT(dl_cap == DL_CAPAB_IPSEC_AH ||
23625             dl_cap == DL_CAPAB_IPSEC_ESP);
23626
23627         ills = (dl_cap == DL_CAPAB_IPSEC_AH) ? &ipst->ips_ipsec_capab_ills_ah :
23628             &ipst->ips_ipsec_capab_ills_esp;
23629
23630         rw_enter(&ipst->ips_ipsec_capab_ills_lock, RW_WRITER);
23631
23632         prev_ill = NULL;
23633         for (cur_ill = *ills; cur_ill != NULL && (cur_ill->ill_index !=
23634             ill->ill_phyint->phyint_ifindex || cur_ill->ill_isv6 !=
23635             ill->ill_isv6); prev_ill = cur_ill, cur_ill = cur_ill->next)
23636                 ;
23637         if (cur_ill == NULL) {
23638                 /* entry not found */
23639                 rw_exit(&ipst->ips_ipsec_capab_ills_lock);
23640                 return;
23641         }
23642         if (prev_ill == NULL) {
23643                 /* entry at front of list */
23644                 *ills = NULL;
23645         } else {
23646                 prev_ill->next = cur_ill->next;
23647         }
23648         kmem_free(cur_ill, sizeof (ipsec_capab_ill_t));
23649         rw_exit(&ipst->ips_ipsec_capab_ills_lock);
23650 }
23651
23652 /*
23653  * Called by SADB to send a DL_CONTROL_REQ message to every ill
23654  * supporting the specified IPsec protocol acceleration.
23655  * sa_type must be SADB_SATYPE_AH or SADB_SATYPE_ESP.
23656  * We free the mblk and, if sa is non-null, release the held reference.
23657  */
23658 void
23659 ill_ipsec_capab_send_all(uint_t sa_type, mblk_t *mp, ipsa_t *sa,
23660     netstack_t *ns)
23661 {
23662         ipsec_capab_ill_t *ici, *cur_ici;
23663         ill_t *ill;
23664         mblk_t *nmp, *mp_ship_list = NULL, *next_mp;
23665         ip_stack_t      *ipst = ns->netstack_ip;
23666
23667         ici = (sa_type == SADB_SATYPE_AH) ? ipst->ips_ipsec_capab_ills_ah :
23668             ipst->ips_ipsec_capab_ills_esp;
23669
23670         rw_enter(&ipst->ips_ipsec_capab_ills_lock, RW_READER);
23671
23672         for (cur_ici = ici; cur_ici != NULL; cur_ici = cur_ici->next) {
23673                 ill = ill_lookup_on_ifindex(cur_ici->ill_index,
23674                     cur_ici->ill_isv6, NULL, NULL, NULL, NULL, ipst);
23675
23676                 /*
23677                  * Handle the case where the ill goes away while the SADB is
23678                  * attempting to send messages.  If it's going away, it's
23679                  * nuking its shadow SADB, so we don't care..
23680                  */
23681
23682                 if (ill == NULL)
23683                         continue;
23684
23685                 if (sa != NULL) {
23686                         /*
23687                          * Make sure capabilities match before
23688                          * sending SA to ill.
23689                          */
23690                         if (!ipsec_capab_match(ill, cur_ici->ill_index,
23691                             cur_ici->ill_isv6, sa, ipst->ips_netstack)) {
23692                                 ill_refrele(ill);
23693                                 continue;
23694                         }
23695
23696                         mutex_enter(&sa->ipsa_lock);
23697                         sa->ipsa_flags |= IPSA_F_HW;
23698                         mutex_exit(&sa->ipsa_lock);
23699                 }
23700
23701                 /*
23702                  * Copy template message, and add it to the front
23703                  * of the mblk ship list. We want to avoid holding
23704                  * the ipsec_capab_ills_lock while sending the
23705                  * message to the ills.
23706                  *
23707                  * The b_next and b_prev are temporarily used
23708                  * to build a list of mblks to be sent down, and to
23709                  * save the ill to which they must be sent.
23710                  */
23711                 nmp = copymsg(mp);
23712                 if (nmp == NULL) {
23713                         ill_refrele(ill);
23714                         continue;
23715                 }
23716                 ASSERT(nmp->b_next == NULL && nmp->b_prev == NULL);
23717                 nmp->b_next = mp_ship_list;
23718                 mp_ship_list = nmp;
23719                 nmp->b_prev = (mblk_t *)ill;
23720         }
23721
23722         rw_exit(&ipst->ips_ipsec_capab_ills_lock);
23723
23724         for (nmp = mp_ship_list; nmp != NULL; nmp = next_mp) {
23725                 /* restore the mblk to a sane state */
23726                 next_mp = nmp->b_next;
23727                 nmp->b_next = NULL;
23728                 ill = (ill_t *)nmp->b_prev;
23729                 nmp->b_prev = NULL;
23730
23731                 ill_dlpi_send(ill, nmp);
23732                 ill_refrele(ill);
23733         }
23734
23735         if (sa != NULL)
23736                 IPSA_REFRELE(sa);
23737         freemsg(mp);
23738 }
23739
23740 /*
23741  * Derive an interface id from the link layer address.
23742  * Knows about IEEE 802 and IEEE EUI-64 mappings.
23743  */
23744 static boolean_t
23745 ip_ether_v6intfid(uint_t phys_length, uint8_t *phys_addr, in6_addr_t *v6addr)
23746 {
23747         char            *addr;
23748
23749         if (phys_length != ETHERADDRL)
23750                 return (B_FALSE);
23751
23752         /* Form EUI-64 like address */
23753         addr = (char *)&v6addr->s6_addr32[2];
23754         bcopy((char *)phys_addr, addr, 3);
23755         addr[0] ^= 0x2;         /* Toggle Universal/Local bit */
23756         addr[3] = (char)0xff;
23757         addr[4] = (char)0xfe;
23758         bcopy((char *)phys_addr + 3, addr + 5, 3);
23759         return (B_TRUE);
23760 }
23761
23762 /* ARGSUSED */
23763 static boolean_t
23764 ip_nodef_v6intfid(uint_t phys_length, uint8_t *phys_addr, in6_addr_t *v6addr)
23765 {
23766         return (B_FALSE);
23767 }
23768
23769 /* ARGSUSED */
23770 static boolean_t
23771 ip_ether_v6mapinfo(uint_t lla_length, uint8_t *bphys_addr, uint8_t *maddr,
23772     uint32_t *hw_start, in6_addr_t *v6_extract_mask)
23773 {
23774         /*
23775          * Multicast address mappings used over Ethernet/802.X.
23776          * This address is used as a base for mappings.
23777          */
23778         static uint8_t ipv6_g_phys_multi_addr[] = {0x33, 0x33, 0x00,
23779             0x00, 0x00, 0x00};
23780
23781         /*
23782          * Extract low order 32 bits from IPv6 multicast address.
23783          * Or that into the link layer address, starting from the
23784          * second byte.
23785          */
23786         *hw_start = 2;
23787         v6_extract_mask->s6_addr32[0] = 0;
23788         v6_extract_mask->s6_addr32[1] = 0;
23789         v6_extract_mask->s6_addr32[2] = 0;
23790         v6_extract_mask->s6_addr32[3] = 0xffffffffU;
23791         bcopy(ipv6_g_phys_multi_addr, maddr, lla_length);
23792         return (B_TRUE);
23793 }
23794
23795 /*
23796  * Indicate by return value whether multicast is supported. If not,
23797  * this code should not touch/change any parameters.
23798  */
23799 /* ARGSUSED */
23800 static boolean_t
23801 ip_ether_v4mapinfo(uint_t phys_length, uint8_t *bphys_addr, uint8_t *maddr,
23802     uint32_t *hw_start, ipaddr_t *extract_mask)
23803 {
23804         /*
23805          * Multicast address mappings used over Ethernet/802.X.
23806          * This address is used as a base for mappings.
23807          */
23808         static uint8_t ip_g_phys_multi_addr[] = { 0x01, 0x00, 0x5e,
23809             0x00, 0x00, 0x00 };
23810
23811         if (phys_length != ETHERADDRL)
23812                 return (B_FALSE);
23813
23814         *extract_mask = htonl(0x007fffff);
23815         *hw_start = 2;
23816         bcopy(ip_g_phys_multi_addr, maddr, ETHERADDRL);
23817         return (B_TRUE);
23818 }
23819
23820 /*
23821  * Derive IPoIB interface id from the link layer address.
23822  */
23823 static boolean_t
23824 ip_ib_v6intfid(uint_t phys_length, uint8_t *phys_addr, in6_addr_t *v6addr)
23825 {
23826         char            *addr;
23827
23828         if (phys_length != 20)
23829                 return (B_FALSE);
23830         addr = (char *)&v6addr->s6_addr32[2];
23831         bcopy(phys_addr + 12, addr, 8);
23832         /*
23833          * In IBA 1.1 timeframe, some vendors erroneously set the u/l bit
23834          * in the globally assigned EUI-64 GUID to 1, in violation of IEEE
23835          * rules. In these cases, the IBA considers these GUIDs to be in
23836          * "Modified EUI-64" format, and thus toggling the u/l bit is not
23837          * required; vendors are required not to assign global EUI-64's
23838          * that differ only in u/l bit values, thus guaranteeing uniqueness
23839          * of the interface identifier. Whether the GUID is in modified
23840          * or proper EUI-64 format, the ipv6 identifier must have the u/l
23841          * bit set to 1.
23842          */
23843         addr[0] |= 2;                   /* Set Universal/Local bit to 1 */
23844         return (B_TRUE);
23845 }
23846
23847 /*
23848  * Note on mapping from multicast IP addresses to IPoIB multicast link
23849  * addresses. IPoIB multicast link addresses are based on IBA link addresses.
23850  * The format of an IPoIB multicast address is:
23851  *
23852  *  4 byte QPN      Scope Sign.  Pkey
23853  * +--------------------------------------------+
23854  * | 00FFFFFF | FF | 1X | X01B | Pkey | GroupID |
23855  * +--------------------------------------------+
23856  *
23857  * The Scope and Pkey components are properties of the IBA port and
23858  * network interface. They can be ascertained from the broadcast address.
23859  * The Sign. part is the signature, and is 401B for IPv4 and 601B for IPv6.
23860  */
23861
23862 static boolean_t
23863 ip_ib_v6mapinfo(uint_t lla_length, uint8_t *bphys_addr, uint8_t *maddr,
23864     uint32_t *hw_start, in6_addr_t *v6_extract_mask)
23865 {
23866         /*
23867          * Base IPoIB IPv6 multicast address used for mappings.
23868          * Does not contain the IBA scope/Pkey values.
23869          */
23870         static uint8_t ipv6_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff,
23871             0xff, 0x10, 0x60, 0x1b, 0x00, 0x00, 0x00, 0x00,
23872             0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
23873
23874         /*
23875          * Extract low order 80 bits from IPv6 multicast address.
23876          * Or that into the link layer address, starting from the
23877          * sixth byte.
23878          */
23879         *hw_start = 6;
23880         bcopy(ipv6_g_phys_ibmulti_addr, maddr, lla_length);
23881
23882         /*
23883          * Now fill in the IBA scope/Pkey values from the broadcast address.
23884          */
23885         *(maddr + 5) = *(bphys_addr + 5);
23886         *(maddr + 8) = *(bphys_addr + 8);
23887         *(maddr + 9) = *(bphys_addr + 9);
23888
23889         v6_extract_mask->s6_addr32[0] = 0;
23890         v6_extract_mask->s6_addr32[1] = htonl(0x0000ffff);
23891         v6_extract_mask->s6_addr32[2] = 0xffffffffU;
23892         v6_extract_mask->s6_addr32[3] = 0xffffffffU;
23893         return (B_TRUE);
23894 }
23895
23896 static boolean_t
23897 ip_ib_v4mapinfo(uint_t phys_length, uint8_t *bphys_addr, uint8_t *maddr,
23898     uint32_t *hw_start, ipaddr_t *extract_mask)
23899 {
23900         /*
23901          * Base IPoIB IPv4 multicast address used for mappings.
23902          * Does not contain the IBA scope/Pkey values.
23903          */
23904         static uint8_t ipv4_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff,
23905             0xff, 0x10, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
23906             0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
23907
23908         if (phys_length != sizeof (ipv4_g_phys_ibmulti_addr))
23909                 return (B_FALSE);
23910
23911         /*
23912          * Extract low order 28 bits from IPv4 multicast address.
23913          * Or that into the link layer address, starting from the
23914          * sixteenth byte.
23915          */
23916         *extract_mask = htonl(0x0fffffff);
23917         *hw_start = 16;
23918         bcopy(ipv4_g_phys_ibmulti_addr, maddr, phys_length);
23919
23920         /*
23921          * Now fill in the IBA scope/Pkey values from the broadcast address.
23922          */
23923         *(maddr + 5) = *(bphys_addr + 5);
23924         *(maddr + 8) = *(bphys_addr + 8);
23925         *(maddr + 9) = *(bphys_addr + 9);
23926         return (B_TRUE);
23927 }
23928
23929 /*
23930  * Returns B_TRUE if an ipif is present in the given zone, matching some flags
23931  * (typically IPIF_UP). If ipifp is non-null, the held ipif is returned there.
23932  * This works for both IPv4 and IPv6; if the passed-in ill is v6, the ipif with
23933  * the link-local address is preferred.
23934  */
23935 boolean_t
23936 ipif_lookup_zoneid(ill_t *ill, zoneid_t zoneid, int flags, ipif_t **ipifp)
23937 {
23938         ipif_t  *ipif;
23939         ipif_t  *maybe_ipif = NULL;
23940
23941         mutex_enter(&ill->ill_lock);
23942         if (ill->ill_state_flags & ILL_CONDEMNED) {
23943                 mutex_exit(&ill->ill_lock);
23944                 if (ipifp != NULL)
23945                         *ipifp = NULL;
23946                 return (B_FALSE);
23947         }
23948         for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
23949                 if (!IPIF_CAN_LOOKUP(ipif))
23950                         continue;
23951                 if (zoneid != ALL_ZONES && ipif->ipif_zoneid != zoneid &&
23952                     ipif->ipif_zoneid != ALL_ZONES)
23953                         continue;
23954                 if ((ipif->ipif_flags & flags) != flags)
23955                         continue;
23956
23957                 if (ipifp == NULL) {
23958                         mutex_exit(&ill->ill_lock);
23959                         ASSERT(maybe_ipif == NULL);
23960                         return (B_TRUE);
23961                 }
23962                 if (!ill->ill_isv6 ||
23963                     IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6src_addr)) {
23964                         ipif_refhold_locked(ipif);
23965                         mutex_exit(&ill->ill_lock);
23966                         *ipifp = ipif;
23967                         return (B_TRUE);
23968                 }
23969                 if (maybe_ipif == NULL)
23970                         maybe_ipif = ipif;
23971         }
23972         if (ipifp != NULL) {
23973                 if (maybe_ipif != NULL)
23974                         ipif_refhold_locked(maybe_ipif);
23975                 *ipifp = maybe_ipif;
23976         }
23977         mutex_exit(&ill->ill_lock);
23978         return (maybe_ipif != NULL);
23979 }
23980
23981 /*
23982  * Same as ipif_lookup_zoneid() but looks at all the ills in the same group.
23983  */
23984 boolean_t
23985 ipif_lookup_zoneid_group(ill_t *ill, zoneid_t zoneid, int flags, ipif_t **ipifp)
23986 {
23987         ill_t *illg;
23988         ip_stack_t      *ipst = ill->ill_ipst;
23989
23990         /*
23991          * We look at the passed-in ill first without grabbing ill_g_lock.
23992          */
23993         if (ipif_lookup_zoneid(ill, zoneid, flags, ipifp)) {
23994                 return (B_TRUE);
23995         }
23996         rw_enter(&ipst->ips_ill_g_lock, RW_READER);
23997         if (ill->ill_group == NULL) {
23998                 /* ill not in a group */
23999                 rw_exit(&ipst->ips_ill_g_lock);
24000                 return (B_FALSE);
24001         }
24002
24003         /*
24004          * There's no ipif in the zone on ill, however ill is part of an IPMP
24005          * group. We need to look for an ipif in the zone on all the ills in the
24006          * group.
24007          */
24008         illg = ill->ill_group->illgrp_ill;
24009         do {
24010                 /*
24011                  * We don't call ipif_lookup_zoneid() on ill as we already know
24012                  * that it's not there.
24013                  */
24014                 if (illg != ill &&
24015                     ipif_lookup_zoneid(illg, zoneid, flags, ipifp)) {
24016                         break;
24017                 }
24018         } while ((illg = illg->ill_group_next) != NULL);
24019         rw_exit(&ipst->ips_ill_g_lock);
24020         return (illg != NULL);
24021 }
24022
24023 /*
24024  * Check if this ill is only being used to send ICMP probes for IPMP
24025  */
24026 boolean_t
24027 ill_is_probeonly(ill_t *ill)
24028 {
24029         /*
24030          * Check if the interface is FAILED, or INACTIVE
24031          */
24032         if (ill->ill_phyint->phyint_flags & (PHYI_FAILED|PHYI_INACTIVE))
24033                 return (B_TRUE);
24034
24035         return (B_FALSE);
24036 }
24037
24038 /*
24039  * Return a pointer to an ipif_t given a combination of (ill_idx,ipif_id)
24040  * If a pointer to an ipif_t is returned then the caller will need to do
24041  * an ill_refrele().
24042  *
24043  * If there is no real interface which matches the ifindex, then it looks
24044  * for a group that has a matching index. In the case of a group match the
24045  * lifidx must be zero. We don't need emulate the logical interfaces
24046  * since IP Filter's use of netinfo doesn't use that.
24047  */
24048 ipif_t *
24049 ipif_getby_indexes(uint_t ifindex, uint_t lifidx, boolean_t isv6,
24050     ip_stack_t *ipst)
24051 {
24052         ipif_t *ipif;
24053         ill_t *ill;
24054
24055         ill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL, NULL,
24056             ipst);
24057
24058         if (ill == NULL) {
24059                 /* Fallback to group names only if hook_emulation set */
24060                 if (!ipst->ips_ipmp_hook_emulation)
24061                         return (NULL);
24062
24063                 if (lifidx != 0)
24064                         return (NULL);
24065                 ill = ill_group_lookup_on_ifindex(ifindex, isv6, ipst);
24066                 if (ill == NULL)
24067                         return (NULL);
24068         }
24069
24070         mutex_enter(&ill->ill_lock);
24071         if (ill->ill_state_flags & ILL_CONDEMNED) {
24072                 mutex_exit(&ill->ill_lock);
24073                 ill_refrele(ill);
24074                 return (NULL);
24075         }
24076
24077         for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
24078                 if (!IPIF_CAN_LOOKUP(ipif))
24079                         continue;
24080                 if (lifidx == ipif->ipif_id) {
24081                         ipif_refhold_locked(ipif);
24082                         break;
24083                 }
24084         }
24085
24086         mutex_exit(&ill->ill_lock);
24087         ill_refrele(ill);
24088         return (ipif);
24089 }
24090
24091 /*
24092  * Flush the fastpath by deleting any nce's that are waiting for the fastpath,
24093  * There is one exceptions IRE_BROADCAST are difficult to recreate,
24094  * so instead we just nuke their nce_fp_mp's; see ndp_fastpath_flush()
24095  * for details.
24096  */
24097 void
24098 ill_fastpath_flush(ill_t *ill)
24099 {
24100         ip_stack_t *ipst = ill->ill_ipst;
24101
24102         nce_fastpath_list_dispatch(ill, NULL, NULL);
24103         ndp_walk_common((ill->ill_isv6 ? ipst->ips_ndp6 : ipst->ips_ndp4),
24104             ill, (pfi_t)ndp_fastpath_flush, NULL, B_TRUE);
24105 }
24106
24107 /*
24108  * Set the physical address information for `ill' to the contents of the
24109  * dl_notify_ind_t pointed to by `mp'.  Must be called as writer, and will be
24110  * asynchronous if `ill' cannot immediately be quiesced -- in which case
24111  * EINPROGRESS will be returned.
24112  */
24113 int
24114 ill_set_phys_addr(ill_t *ill, mblk_t *mp)
24115 {
24116         ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq;
24117         dl_notify_ind_t *dlindp = (dl_notify_ind_t *)mp->b_rptr;
24118
24119         ASSERT(IAM_WRITER_IPSQ(ipsq));
24120
24121         if (dlindp->dl_data != DL_IPV6_LINK_LAYER_ADDR &&
24122             dlindp->dl_data != DL_CURR_PHYS_ADDR) {
24123                 /* Changing DL_IPV6_TOKEN is not yet supported */
24124                 return (0);
24125         }
24126
24127         /*
24128          * We need to store up to two copies of `mp' in `ill'.  Due to the
24129          * design of ipsq_pending_mp_add(), we can't pass them as separate
24130          * arguments to ill_set_phys_addr_tail().  Instead, chain them
24131          * together here, then pull 'em apart in ill_set_phys_addr_tail().
24132          */
24133         if ((mp = copyb(mp)) == NULL || (mp->b_cont = copyb(mp)) == NULL) {
24134                 freemsg(mp);
24135                 return (ENOMEM);
24136         }
24137
24138         ipsq_current_start(ipsq, ill->ill_ipif, 0);
24139
24140         /*
24141          * If we can quiesce the ill, then set the address.  If not, then
24142          * ill_set_phys_addr_tail() will be called from ipif_ill_refrele_tail().
24143          */
24144         ill_down_ipifs(ill, NULL, 0, B_FALSE);
24145         mutex_enter(&ill->ill_lock);
24146         if (!ill_is_quiescent(ill)) {
24147                 /* call cannot fail since `conn_t *' argument is NULL */
24148                 (void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq,
24149                     mp, ILL_DOWN);
24150                 mutex_exit(&ill->ill_lock);
24151                 return (EINPROGRESS);
24152         }
24153         mutex_exit(&ill->ill_lock);
24154
24155         ill_set_phys_addr_tail(ipsq, ill->ill_rq, mp, NULL);
24156         return (0);
24157 }
24158
24159 /*
24160  * Once the ill associated with `q' has quiesced, set its physical address
24161  * information to the values in `addrmp'.  Note that two copies of `addrmp'
24162  * are passed (linked by b_cont), since we sometimes need to save two distinct
24163  * copies in the ill_t, and our context doesn't permit sleeping or allocation
24164  * failure (we'll free the other copy if it's not needed).  Since the ill_t
24165  * is quiesced, we know any stale IREs with the old address information have
24166  * already been removed, so we don't need to call ill_fastpath_flush().
24167  */
24168 /* ARGSUSED */
24169 static void
24170 ill_set_phys_addr_tail(ipsq_t *ipsq, queue_t *q, mblk_t *addrmp, void *dummy)
24171 {
24172         ill_t           *ill = q->q_ptr;
24173         mblk_t          *addrmp2 = unlinkb(addrmp);
24174         dl_notify_ind_t *dlindp = (dl_notify_ind_t *)addrmp->b_rptr;
24175         uint_t          addrlen, addroff;
24176
24177         ASSERT(IAM_WRITER_IPSQ(ipsq));
24178
24179         addroff = dlindp->dl_addr_offset;
24180         addrlen = dlindp->dl_addr_length - ABS(ill->ill_sap_length);
24181
24182         switch (dlindp->dl_data) {
24183         case DL_IPV6_LINK_LAYER_ADDR:
24184                 ill_set_ndmp(ill, addrmp, addroff, addrlen);
24185                 freemsg(addrmp2);
24186                 break;
24187
24188         case DL_CURR_PHYS_ADDR:
24189                 freemsg(ill->ill_phys_addr_mp);
24190                 ill->ill_phys_addr = addrmp->b_rptr + addroff;
24191                 ill->ill_phys_addr_mp = addrmp;
24192                 ill->ill_phys_addr_length = addrlen;
24193
24194                 if (ill->ill_isv6 && !(ill->ill_flags & ILLF_XRESOLV))
24195                         ill_set_ndmp(ill, addrmp2, addroff, addrlen);
24196                 else
24197                         freemsg(addrmp2);
24198                 break;
24199         default:
24200                 ASSERT(0);
24201         }
24202
24203         /*
24204          * If there are ipifs to bring up, ill_up_ipifs() will return
24205          * EINPROGRESS, and ipsq_current_finish() will be called by
24206          * ip_rput_dlpi_writer() or ip_arp_done() when the last ipif is
24207          * brought up.
24208          */
24209         if (ill_up_ipifs(ill, q, addrmp) != EINPROGRESS)
24210                 ipsq_current_finish(ipsq);
24211 }
24212
24213 /*
24214  * Helper routine for setting the ill_nd_lla fields.
24215  */
24216 void
24217 ill_set_ndmp(ill_t *ill, mblk_t *ndmp, uint_t addroff, uint_t addrlen)
24218 {
24219         freemsg(ill->ill_nd_lla_mp);
24220         ill->ill_nd_lla = ndmp->b_rptr + addroff;
24221         ill->ill_nd_lla_mp = ndmp;
24222         ill->ill_nd_lla_len = addrlen;
24223 }
24224
24225 major_t IP_MAJ;
24226 #define IP      "ip"
24227
24228 #define UDP6DEV         "/devices/pseudo/udp6@0:udp6"
24229 #define UDPDEV          "/devices/pseudo/udp@0:udp"
24230
24231 /*
24232  * Issue REMOVEIF ioctls to have the loopback interfaces
24233  * go away.  Other interfaces are either I_LINKed or I_PLINKed;
24234  * the former going away when the user-level processes in the zone
24235  * are killed  * and the latter are cleaned up by the stream head
24236  * str_stack_shutdown callback that undoes all I_PLINKs.
24237  */
24238 void
24239 ip_loopback_cleanup(ip_stack_t *ipst)
24240 {
24241         int error;
24242         ldi_handle_t    lh = NULL;
24243         ldi_ident_t     li = NULL;
24244         int             rval;
24245         cred_t          *cr;
24246         struct strioctl iocb;
24247         struct lifreq   lifreq;
24248
24249         IP_MAJ = ddi_name_to_major(IP);
24250
24251 #ifdef NS_DEBUG
24252         (void) printf("ip_loopback_cleanup() stackid %d\n",
24253             ipst->ips_netstack->netstack_stackid);
24254 #endif
24255
24256         bzero(&lifreq, sizeof (lifreq));
24257         (void) strcpy(lifreq.lifr_name, ipif_loopback_name);
24258
24259         error = ldi_ident_from_major(IP_MAJ, &li);
24260         if (error) {
24261 #ifdef DEBUG
24262                 printf("ip_loopback_cleanup: lyr ident get failed error %d\n",
24263                     error);
24264 #endif
24265                 return;
24266         }
24267
24268         cr = zone_get_kcred(netstackid_to_zoneid(
24269             ipst->ips_netstack->netstack_stackid));
24270         ASSERT(cr != NULL);
24271         error = ldi_open_by_name(UDP6DEV, FREAD|FWRITE, cr, &lh, li);
24272         if (error) {
24273 #ifdef DEBUG
24274                 printf("ip_loopback_cleanup: open of UDP6DEV failed error %d\n",
24275                     error);
24276 #endif
24277                 goto out;
24278         }
24279         iocb.ic_cmd = SIOCLIFREMOVEIF;
24280         iocb.ic_timout = 15;
24281         iocb.ic_len = sizeof (lifreq);
24282         iocb.ic_dp = (char *)&lifreq;
24283
24284         error = ldi_ioctl(lh, I_STR, (intptr_t)&iocb, FKIOCTL, cr, &rval);
24285         /* LINTED - statement has no consequent */
24286         if (error) {
24287 #ifdef NS_DEBUG
24288                 printf("ip_loopback_cleanup: ioctl SIOCLIFREMOVEIF failed on "
24289                     "UDP6 error %d\n", error);
24290 #endif
24291         }
24292         (void) ldi_close(lh, FREAD|FWRITE, cr);
24293         lh = NULL;
24294
24295         error = ldi_open_by_name(UDPDEV, FREAD|FWRITE, cr, &lh, li);
24296         if (error) {
24297 #ifdef NS_DEBUG
24298                 printf("ip_loopback_cleanup: open of UDPDEV failed error %d\n",
24299                     error);
24300 #endif
24301                 goto out;
24302         }
24303
24304         iocb.ic_cmd = SIOCLIFREMOVEIF;
24305         iocb.ic_timout = 15;
24306         iocb.ic_len = sizeof (lifreq);
24307         iocb.ic_dp = (char *)&lifreq;
24308
24309         error = ldi_ioctl(lh, I_STR, (intptr_t)&iocb, FKIOCTL, cr, &rval);
24310         /* LINTED - statement has no consequent */
24311         if (error) {
24312 #ifdef NS_DEBUG
24313                 printf("ip_loopback_cleanup: ioctl SIOCLIFREMOVEIF failed on "
24314                     "UDP error %d\n", error);
24315 #endif
24316         }
24317         (void) ldi_close(lh, FREAD|FWRITE, cr);
24318         lh = NULL;
24319
24320 out:
24321         /* Close layered handles */
24322         if (lh)
24323                 (void) ldi_close(lh, FREAD|FWRITE, cr);
24324         if (li)
24325                 ldi_ident_release(li);
24326
24327         crfree(cr);
24328 }