kernel/net/ip/ip_if.c (unleashed.git, Unleashed v1.4)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 1990 Mentat Inc.
 * Copyright (c) 2013 by Delphix. All rights reserved.
 * Copyright (c) 2016, Joyent, Inc. All rights reserved.
 * Copyright (c) 2014, OmniTI Computer Consulting, Inc. All rights reserved.
 */

/*
 * This file contains the interface control functions for IP.
 */
#include <sys/types.h>
#include <sys/stream.h>
#include <sys/dlpi.h>
#include <sys/stropts.h>
#include <sys/strsun.h>
#include <sys/sysmacros.h>
#include <sys/strsubr.h>
#include <sys/strlog.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/debug.h>
#include <sys/zone.h>
#include <sys/sunldi.h>
#include <sys/file.h>
#include <sys/bitmap.h>
#include <sys/cpuvar.h>
#include <sys/time.h>
#include <sys/ctype.h>
#include <sys/kmem.h>
#include <sys/systm.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/isa_defs.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <net/if_types.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <sys/sockio.h>
#include <netinet/in.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet/igmp_var.h>
#include <sys/policy.h>
#include <sys/ethernet.h>
#include <sys/callb.h>
#include <sys/md5.h>

#include <inet/common.h>	/* for various inet/mi.h and inet/nd.h needs */
#include <inet/mi.h>
#include <inet/nd.h>
#include <inet/tunables.h>
#include <inet/arp.h>
#include <inet/ip_arp.h>
#include <inet/mib2.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/ip6_asp.h>
#include <inet/tcp.h>
#include <inet/ip_multi.h>
#include <inet/ip_ire.h>
#include <inet/ip_ftable.h>
#include <inet/ip_rts.h>
#include <inet/ip_ndp.h>
#include <inet/ip_if.h>
#include <inet/ip_impl.h>
#include <inet/sctp_ip.h>
#include <inet/ip_netinfo.h>
#include <inet/ilb_ip.h>

#include <netinet/igmp.h>
#include <inet/ip_listutils.h>
#include <inet/ipclassifier.h>
#include <sys/mac_client.h>
#include <sys/dld.h>
#include <sys/mac_flow.h>

#include <sys/systeminfo.h>
#include <sys/bootconf.h>

#include <inet/rawip_impl.h>	/* needed for icmp_stack_t */
#include <inet/udp_impl.h>	/* needed for udp_stack_t */
/* The character which tells where the ill_name ends */
#define	IPIF_SEPARATOR_CHAR	':'

/* IP ioctl function table entry */
typedef struct ipft_s {
	int	ipft_cmd;
	pfi_t	ipft_pfi;
	int	ipft_min_size;
	int	ipft_flags;
} ipft_t;
#define	IPFT_F_NO_REPLY		0x1	/* IP ioctl does not expect any reply */
#define	IPFT_F_SELF_REPLY	0x2	/* ioctl callee does the ioctl reply */
static int	nd_ill_forward_get(queue_t *, mblk_t *, caddr_t, cred_t *);
static int	nd_ill_forward_set(queue_t *q, mblk_t *mp,
    char *value, caddr_t cp, cred_t *ioc_cr);

static boolean_t ill_is_quiescent(ill_t *);
static boolean_t ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask);
static ip_m_t	*ip_m_lookup(t_uscalar_t mac_type);
static int	ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q,
    mblk_t *mp);
static int	ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp);
static int	ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t, in6_addr_t,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp,
    int ioccmd, struct linkblk *li);
static ipaddr_t	ip_subnet_mask(ipaddr_t addr, ipif_t **, ip_stack_t *);
static void	ip_wput_ioctl(queue_t *q, mblk_t *mp);
static void	ipsq_flush(ill_t *ill);

static int	ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static void	ipsq_delete(ipsq_t *);

static ipif_t	*ipif_allocate(ill_t *ill, int id, uint_t ire_type,
    boolean_t initialize, boolean_t insert, int *errorp);
static ire_t	**ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep);
static void	ipif_delete_bcast_ires(ipif_t *ipif);
static int	ipif_add_ires_v4(ipif_t *, boolean_t);
static boolean_t ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif,
    boolean_t isv6);
static int	ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp);
static void	ipif_free(ipif_t *ipif);
static void	ipif_free_tail(ipif_t *ipif);
static void	ipif_set_default(ipif_t *ipif);
static int	ipif_set_values(queue_t *q, mblk_t *mp,
    char *interf_name, uint_t *ppa);
static int	ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp,
    queue_t *q);
static ipif_t	*ipif_lookup_on_name(char *name, size_t namelen,
    boolean_t do_alloc, boolean_t *exists, boolean_t isv6, zoneid_t zoneid,
    ip_stack_t *);
static ipif_t	*ipif_lookup_on_name_async(char *name, size_t namelen,
    boolean_t isv6, zoneid_t zoneid, queue_t *q, mblk_t *mp, ipsq_func_t func,
    int *error, ip_stack_t *);

static int	ill_alloc_ppa(ill_if_t *, ill_t *);
static void	ill_delete_interface_type(ill_if_t *);
static int	ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q);
static void	ill_dl_down(ill_t *ill);
static void	ill_down(ill_t *ill);
static void	ill_down_ipifs(ill_t *, boolean_t);
static void	ill_free_mib(ill_t *ill);
static void	ill_glist_delete(ill_t *);
static void	ill_phyint_reinit(ill_t *ill);
static void	ill_set_nce_router_flags(ill_t *, boolean_t);
static void	ill_set_phys_addr_tail(ipsq_t *, queue_t *, mblk_t *, void *);
static void	ill_replumb_tail(ipsq_t *, queue_t *, mblk_t *, void *);

static ip_v6intfid_func_t ip_ether_v6intfid, ip_ib_v6intfid;
static ip_v6intfid_func_t ip_ipv4_v6intfid, ip_ipv6_v6intfid;
static ip_v6intfid_func_t ip_ipmp_v6intfid, ip_nodef_v6intfid;
static ip_v6intfid_func_t ip_ipv4_v6destintfid, ip_ipv6_v6destintfid;
static ip_v4mapinfo_func_t ip_ether_v4_mapping;
static ip_v6mapinfo_func_t ip_ether_v6_mapping;
static ip_v4mapinfo_func_t ip_ib_v4_mapping;
static ip_v6mapinfo_func_t ip_ib_v6_mapping;
static ip_v4mapinfo_func_t ip_mbcast_mapping;
static void	phyint_free(phyint_t *);

static void ill_capability_dispatch(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_id_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_vrrp_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_hcksum_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_hcksum_reset_fill(ill_t *, mblk_t *);
static void ill_capability_zerocopy_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void ill_capability_zerocopy_reset_fill(ill_t *, mblk_t *);
static void ill_capability_dld_reset_fill(ill_t *, mblk_t *);
static void ill_capability_dld_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void ill_capability_dld_enable(ill_t *);
static void ill_capability_ack_thr(void *);
static void ill_capability_lso_enable(ill_t *);

static ill_t	*ill_prev_usesrc(ill_t *);
static int	ill_relink_usesrc_ills(ill_t *, ill_t *, uint_t);
static void	ill_disband_usesrc_group(ill_t *);
static void	ip_sioctl_garp_reply(mblk_t *, ill_t *, void *, int);

#ifdef DEBUG
static void	ill_trace_cleanup(const ill_t *);
static void	ipif_trace_cleanup(const ipif_t *);
#endif

static void	ill_dlpi_clear_deferred(ill_t *ill);

static void	phyint_flags_init(phyint_t *, t_uscalar_t);
/*
 * if we go over the memory footprint limit more than once in this msec
 * interval, we'll start pruning aggressively.
 */
int ip_min_frag_prune_time = 0;
static ipft_t ip_ioctl_ftbl[] = {
	{ IP_IOC_IRE_DELETE, ip_ire_delete, sizeof (ipid_t), 0 },
	{ IP_IOC_IRE_DELETE_NO_REPLY, ip_ire_delete, sizeof (ipid_t),
	    IPFT_F_NO_REPLY },
	{ IP_IOC_RTS_REQUEST, ip_rts_request, 0, IPFT_F_SELF_REPLY },
	{ 0 }
};
/* Simple ICMP IP Header Template */
static ipha_t icmp_ipha = {
	IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
};

static uchar_t	ip_six_byte_all_ones[] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
static ip_m_t   ip_m_tbl[] = {
	{ DL_ETHER, IFT_ETHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_ether_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_CSMACD, IFT_ISO88023, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_TPB, IFT_ISO88024, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_TPR, IFT_ISO88025, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_FDDI, IFT_FDDI, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_ether_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_IB, IFT_IB, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ib_v4_mapping, ip_ib_v6_mapping, ip_ib_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_IPV4, IFT_IPV4, IPPROTO_ENCAP, IPPROTO_IPV6,
	    ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv4_v6intfid,
	    ip_ipv4_v6destintfid },
	{ DL_IPV6, IFT_IPV6, IPPROTO_ENCAP, IPPROTO_IPV6,
	    ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv6_v6intfid,
	    ip_ipv6_v6destintfid },
	{ DL_6TO4, IFT_6TO4, IPPROTO_ENCAP, IPPROTO_IPV6,
	    ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv4_v6intfid,
	    ip_nodef_v6intfid },
	{ SUNW_DL_VNI, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    NULL, NULL, ip_nodef_v6intfid, ip_nodef_v6intfid },
	{ SUNW_DL_IPMP, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    NULL, NULL, ip_ipmp_v6intfid, ip_nodef_v6intfid },
	{ DL_OTHER, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
	    ip_nodef_v6intfid }
};
char	ipif_loopback_name[] = "lo0";

/* These are used by all IP network modules. */
sin6_t	sin6_null;	/* Zero address for quick clears */
sin_t	sin_null;	/* Zero address for quick clears */

/* When set search for unused ipif_seqid */
static ipif_t	ipif_zero;
/*
 * ppa arena is created after these many
 * interfaces have been plumbed.
 */
uint_t	ill_no_arena = 12;	/* Settable in /etc/system */
/*
 * Allocate per-interface mibs.
 * Returns true if ok. False otherwise.
 * ipsq may not yet be allocated (loopback case).
 */
static boolean_t
ill_allocate_mibs(ill_t *ill)
{
	/* Already allocated? */
	if (ill->ill_ip_mib != NULL) {
		if (ill->ill_isv6)
			ASSERT(ill->ill_icmp6_mib != NULL);
		return (B_TRUE);
	}

	ill->ill_ip_mib = kmem_zalloc(sizeof (*ill->ill_ip_mib),
	    KM_NOSLEEP);
	if (ill->ill_ip_mib == NULL) {
		return (B_FALSE);
	}

	/* Setup static information */
	SET_MIB(ill->ill_ip_mib->ipIfStatsEntrySize,
	    sizeof (mib2_ipIfStatsEntry_t));
	if (ill->ill_isv6) {
		ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv6;
		SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize,
		    sizeof (mib2_ipv6AddrEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize,
		    sizeof (mib2_ipv6RouteEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize,
		    sizeof (mib2_ipv6NetToMediaEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize,
		    sizeof (ipv6_member_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize,
		    sizeof (ipv6_grpsrc_t));
	} else {
		ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv4;
		SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize,
		    sizeof (mib2_ipAddrEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize,
		    sizeof (mib2_ipRouteEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize,
		    sizeof (mib2_ipNetToMediaEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize,
		    sizeof (ip_member_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize,
		    sizeof (ip_grpsrc_t));

		/*
		 * For a v4 ill, we are done at this point, because per ill
		 * icmp mibs are only used for v6.
		 */
		return (B_TRUE);
	}

	ill->ill_icmp6_mib = kmem_zalloc(sizeof (*ill->ill_icmp6_mib),
	    KM_NOSLEEP);
	if (ill->ill_icmp6_mib == NULL) {
		kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib));
		ill->ill_ip_mib = NULL;
		return (B_FALSE);
	}
	/* static icmp info */
	ill->ill_icmp6_mib->ipv6IfIcmpEntrySize =
	    sizeof (mib2_ipv6IfIcmpEntry_t);
	/*
	 * The ipIfStatsIfindex and ipv6IfIcmpIndex will be assigned later
	 * after the phyint merge occurs in ipif_set_values -> ill_glist_insert
	 * -> ill_phyint_reinit
	 */
	return (B_TRUE);
}
/*
 * Completely vaporize a lower level tap and all associated interfaces.
 * ill_delete is called only out of ip_close when the device control
 * stream is being closed.
 */
void
ill_delete(ill_t *ill)
{
	ipif_t	*ipif;
	ill_t	*prev_ill;
	ip_stack_t	*ipst = ill->ill_ipst;

	/*
	 * ill_delete may be forcibly entering the ipsq. The previous
	 * ioctl may not have completed and may need to be aborted.
	 * ipsq_flush takes care of it. If we don't need to enter the
	 * ipsq forcibly, the 2nd invocation of ipsq_flush in
	 * ill_delete_tail is sufficient.
	 */
	ipsq_flush(ill);

	/*
	 * Nuke all interfaces. ipif_free will take down the interface,
	 * remove it from the list, and free the data structure.
	 * Walk down the ipif list and remove the logical interfaces
	 * first before removing the main ipif. We can't unplumb
	 * zeroth interface first in the case of IPv6 as update_conn_ill
	 * -> ip_ll_multireq de-references ill_ipif for checking
	 * POINTOPOINT.
	 *
	 * If ill_ipif was not properly initialized (i.e. low on memory),
	 * then there are no interfaces to clean up. In this case just clean
	 * up the ill.
	 */
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
		ipif_free(ipif);

	/*
	 * clean out all the nce_t entries that depend on this
	 * ill for the ill_phys_addr.
	 */
	nce_flush(ill, B_TRUE);

	/* Clean up msgs on pending upcalls for mrouted */
	reset_mrt_ill(ill);

	update_conn_ill(ill, ipst);

	/*
	 * Remove multicast references added as a result of calls to
	 * ip_join_allmulti().
	 */
	ip_purge_allmulti(ill);

	/*
	 * If the ill being deleted is under IPMP, boot it out of the illgrp.
	 */
	if (IS_UNDER_IPMP(ill))
		ipmp_ill_leave_illgrp(ill);

	/*
	 * ill_down will arrange to blow off any IRE's dependent on this
	 * ILL, and shut down fragmentation reassembly.
	 */
	ill_down(ill);

	/* Let SCTP know, so that it can remove this from its list. */
	sctp_update_ill(ill, SCTP_ILL_REMOVE);

	/*
	 * Walk all CONNs that can have a reference on an ire or nce for this
	 * ill (we actually walk all that now have stale references).
	 */
	ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ipst);

	/* With IPv6 we have dce_ifindex. Cleanup for neatness */
	if (ill->ill_isv6)
		dce_cleanup(ill->ill_phyint->phyint_ifindex, ipst);

	/*
	 * If an address on this ILL is being used as a source address then
	 * clear out the pointers in other ILLs that point to this ILL.
	 */
	rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER);
	if (ill->ill_usesrc_grp_next != NULL) {
		if (ill->ill_usesrc_ifindex == 0) { /* usesrc ILL ? */
			ill_disband_usesrc_group(ill);
		} else {	/* consumer of the usesrc ILL */
			prev_ill = ill_prev_usesrc(ill);
			prev_ill->ill_usesrc_grp_next =
			    ill->ill_usesrc_grp_next;
		}
	}
	rw_exit(&ipst->ips_ill_g_usesrc_lock);
}
static void
ipif_non_duplicate(ipif_t *ipif)
{
	ill_t *ill = ipif->ipif_ill;
	mutex_enter(&ill->ill_lock);
	if (ipif->ipif_flags & IPIF_DUPLICATE) {
		ipif->ipif_flags &= ~IPIF_DUPLICATE;
		ASSERT(ill->ill_ipif_dup_count > 0);
		ill->ill_ipif_dup_count--;
	}
	mutex_exit(&ill->ill_lock);
}
/*
 * ill_delete_tail is called from ip_modclose after all references
 * to the closing ill are gone. The wait is done in ip_modclose.
 */
void
ill_delete_tail(ill_t *ill)
{
	mblk_t	**mpp;
	ipif_t	*ipif;
	ip_stack_t *ipst = ill->ill_ipst;

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		ipif_non_duplicate(ipif);
		(void) ipif_down_tail(ipif);
	}

	ASSERT(ill->ill_ipif_dup_count == 0);

	/*
	 * If polling capability is enabled (which signifies direct
	 * upcall into IP and driver has ill saved as a handle),
	 * we need to make sure that unbind has completed before we
	 * let the ill disappear and driver no longer has any reference
	 * to this ill.
	 */
	mutex_enter(&ill->ill_lock);
	while (ill->ill_state_flags & ILL_DL_UNBIND_IN_PROGRESS)
		cv_wait(&ill->ill_cv, &ill->ill_lock);
	mutex_exit(&ill->ill_lock);
	ASSERT(!(ill->ill_capabilities &
	    (ILL_CAPAB_DLD | ILL_CAPAB_DLD_POLL | ILL_CAPAB_DLD_DIRECT)));

	if (ill->ill_net_type != IRE_LOOPBACK)
		qprocsoff(ill->ill_rq);

	/*
	 * We do an ipsq_flush once again now. New messages could have
	 * landed up from below (M_ERROR or M_HANGUP). Similarly ioctls
	 * could also have landed up if an ioctl thread had looked up
	 * the ill before we set the ILL_CONDEMNED flag, but not yet
	 * enqueued the ioctl when we did the ipsq_flush last time.
	 */
	ipsq_flush(ill);

	/*
	 * Free capabilities.
	 */
	if (ill->ill_hcksum_capab != NULL) {
		kmem_free(ill->ill_hcksum_capab, sizeof (ill_hcksum_capab_t));
		ill->ill_hcksum_capab = NULL;
	}

	if (ill->ill_zerocopy_capab != NULL) {
		kmem_free(ill->ill_zerocopy_capab,
		    sizeof (ill_zerocopy_capab_t));
		ill->ill_zerocopy_capab = NULL;
	}

	if (ill->ill_lso_capab != NULL) {
		kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t));
		ill->ill_lso_capab = NULL;
	}

	if (ill->ill_dld_capab != NULL) {
		kmem_free(ill->ill_dld_capab, sizeof (ill_dld_capab_t));
		ill->ill_dld_capab = NULL;
	}

	/* Clean up ill_allowed_ips* related state */
	if (ill->ill_allowed_ips != NULL) {
		ASSERT(ill->ill_allowed_ips_cnt > 0);
		kmem_free(ill->ill_allowed_ips,
		    ill->ill_allowed_ips_cnt * sizeof (in6_addr_t));
		ill->ill_allowed_ips = NULL;
		ill->ill_allowed_ips_cnt = 0;
	}

	while (ill->ill_ipif != NULL)
		ipif_free_tail(ill->ill_ipif);

	/*
	 * We have removed all references to ilm from conn and the ones joined
	 * within the kernel.
	 *
	 * We don't walk conns, mrts and ires because
	 *
	 * 1) update_conn_ill and reset_mrt_ill cleans up conns and mrts.
	 * 2) ill_down -> ill_downi walks all the ires and cleans up
	 *    ill references.
	 */

	/*
	 * If this ill is an IPMP meta-interface, blow away the illgrp. This
	 * is safe to do because the illgrp has already been unlinked from the
	 * group by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find it.
	 */
	if (IS_IPMP(ill)) {
		ipmp_illgrp_destroy(ill->ill_grp);
		ill->ill_grp = NULL;
	}

	if (ill->ill_mphysaddr_list != NULL) {
		multiphysaddr_t *mpa, *tmpa;

		mpa = ill->ill_mphysaddr_list;
		ill->ill_mphysaddr_list = NULL;
		while (mpa) {
			tmpa = mpa->mpa_next;
			kmem_free(mpa, sizeof (*mpa));
			mpa = tmpa;
		}
	}

	/*
	 * Take us out of the list of ILLs. ill_glist_delete -> phyint_free
	 * could free the phyint. No more reference to the phyint after this
	 * point.
	 */
	(void) ill_glist_delete(ill);

	if (ill->ill_frag_ptr != NULL) {
		uint_t count;

		for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) {
			mutex_destroy(&ill->ill_frag_hash_tbl[count].ipfb_lock);
		}
		mi_free(ill->ill_frag_ptr);
		ill->ill_frag_ptr = NULL;
		ill->ill_frag_hash_tbl = NULL;
	}

	freemsg(ill->ill_nd_lla_mp);
	/* Free all retained control messages. */
	mpp = &ill->ill_first_mp_to_free;
	do {
		while (mpp[0]) {
			mblk_t	*mp;
			mblk_t	*mp1;

			mp = mpp[0];
			mpp[0] = mp->b_next;
			for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) {
				mp1->b_next = NULL;
				mp1->b_prev = NULL;
			}
			freemsg(mp);
		}
	} while (mpp++ != &ill->ill_last_mp_to_free);

	ill_free_mib(ill);

#ifdef DEBUG
	ill_trace_cleanup(ill);
#endif

	/* The default multicast interface might have changed */
	ire_increment_multicast_generation(ipst, ill->ill_isv6);

	/* Drop refcnt here */
	netstack_rele(ill->ill_ipst->ips_netstack);
	ill->ill_ipst = NULL;
}
static void
ill_free_mib(ill_t *ill)
{
	ip_stack_t *ipst = ill->ill_ipst;

	/*
	 * MIB statistics must not be lost, so when an interface
	 * goes away the counter values will be added to the global
	 * MIBs.
	 */
	if (ill->ill_ip_mib != NULL) {
		if (ill->ill_isv6) {
			ip_mib2_add_ip_stats(&ipst->ips_ip6_mib,
			    ill->ill_ip_mib);
		} else {
			ip_mib2_add_ip_stats(&ipst->ips_ip_mib,
			    ill->ill_ip_mib);
		}

		kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib));
		ill->ill_ip_mib = NULL;
	}
	if (ill->ill_icmp6_mib != NULL) {
		ip_mib2_add_icmp6_stats(&ipst->ips_icmp6_mib,
		    ill->ill_icmp6_mib);
		kmem_free(ill->ill_icmp6_mib, sizeof (*ill->ill_icmp6_mib));
		ill->ill_icmp6_mib = NULL;
	}
}
/*
 * Concatenate together a physical address and a sap.
 *
 * Sap_lengths are interpreted as follows:
 *   sap_length == 0	==> no sap
 *   sap_length > 0	==> sap is at the head of the dlpi address
 *   sap_length < 0	==> sap is at the tail of the dlpi address
 */
static void
ill_dlur_copy_address(uchar_t *phys_src, uint_t phys_length,
    t_scalar_t sap_src, t_scalar_t sap_length, uchar_t *dst)
{
	uint16_t sap_addr = (uint16_t)sap_src;

	if (sap_length == 0) {
		if (phys_src == NULL)
			bzero(dst, phys_length);
		else
			bcopy(phys_src, dst, phys_length);
	} else if (sap_length < 0) {
		if (phys_src == NULL)
			bzero(dst, phys_length);
		else
			bcopy(phys_src, dst, phys_length);
		bcopy(&sap_addr, (char *)dst + phys_length, sizeof (sap_addr));
	} else {
		bcopy(&sap_addr, dst, sizeof (sap_addr));
		if (phys_src == NULL)
			bzero((char *)dst + sap_length, phys_length);
		else
			bcopy(phys_src, (char *)dst + sap_length, phys_length);
	}
}
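
/*
 * Illustration (editorial sketch, not part of the original source): for a
 * hypothetical Ethernet-style link with the 6-byte physical address
 * 00:11:22:33:44:55 and a 2-byte sap, the three cases above lay out `dst'
 * as follows:
 *
 *	sap_length == 0:	00 11 22 33 44 55		(no sap)
 *	sap_length == -2:	00 11 22 33 44 55 <sap>		(sap at tail)
 *	sap_length == 2:	<sap> 00 11 22 33 44 55		(sap at head)
 *
 * where <sap> is the 2-byte sap value copied with bcopy() from a uint16_t,
 * i.e. in native byte order.
 */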
/*
 * Generate a dl_unitdata_req mblk for the device and address given.
 * addr_length is the length of the physical portion of the address.
 * If addr is NULL include an all zero address of the specified length.
 * TRUE? In any case, addr_length is taken to be the entire length of the
 * dlpi address, including the absolute value of sap_length.
 */
mblk_t *
ill_dlur_gen(uchar_t *addr, uint_t addr_length, t_uscalar_t sap,
    t_scalar_t sap_length)
{
	dl_unitdata_req_t *dlur;
	mblk_t	*mp;
	t_scalar_t	abs_sap_length;		/* absolute value */

	abs_sap_length = ABS(sap_length);
	mp = ip_dlpi_alloc(sizeof (*dlur) + addr_length + abs_sap_length,
	    DL_UNITDATA_REQ);
	if (mp == NULL)
		return (NULL);
	dlur = (dl_unitdata_req_t *)mp->b_rptr;
	/* HACK: accommodate incompatible DLPI drivers */
	if (addr_length == 8)
		addr_length = 6;
	dlur->dl_dest_addr_length = addr_length + abs_sap_length;
	dlur->dl_dest_addr_offset = sizeof (*dlur);
	dlur->dl_priority.dl_min = 0;
	dlur->dl_priority.dl_max = 0;
	ill_dlur_copy_address(addr, addr_length, sap, sap_length,
	    (uchar_t *)&dlur[1]);
	return (mp);
}
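
/*
 * Usage sketch (editorial, not from the original source): a caller that
 * wants the DLPI destination template for IPv4 over an Ethernet-style
 * device whose sap sits at the tail of the address might build it as
 *
 *	mblk_t *dlur_mp = ill_dlur_gen(lla, 6, ETHERTYPE_IP, -2);
 *
 * where `lla' (hypothetical) points at the 6-byte link-layer address, or is
 * NULL for an all-zero placeholder address. A template mblk of this shape
 * is what ill_fastpath_probe() below sends downstream.
 */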
/*
 * Add the pending mp to the list. There can be only 1 pending mp
 * in the list. Any exclusive ioctl that needs to wait for a response
 * from another module or driver needs to use this function to set
 * the ipx_pending_mp to the ioctl mblk and wait for the response from
 * the other module/driver. This is also used while waiting for the
 * ipif/ill/ire refcnts to drop to zero in bringing down an ipif.
 */
boolean_t
ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp,
    int waitfor)
{
	ipxop_t	*ipx = ipif->ipif_ill->ill_phyint->phyint_ipsq->ipsq_xop;

	ASSERT(IAM_WRITER_IPIF(ipif));
	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
	ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL));
	ASSERT(ipx->ipx_pending_mp == NULL);
	/*
	 * The caller may be using a different ipif than the one passed into
	 * ipsq_current_start() (e.g., suppose an ioctl that came in on the V4
	 * ill needs to wait for the V6 ill to quiesce). So we can't ASSERT
	 * that `ipx_current_ipif == ipif'.
	 */
	ASSERT(ipx->ipx_current_ipif != NULL);

	/*
	 * M_IOCDATA from ioctls, M_ERROR/M_HANGUP/M_PROTO/M_PCPROTO from the
	 * driver.
	 */
	ASSERT((DB_TYPE(add_mp) == M_IOCDATA) || (DB_TYPE(add_mp) == M_ERROR) ||
	    (DB_TYPE(add_mp) == M_HANGUP) || (DB_TYPE(add_mp) == M_PROTO) ||
	    (DB_TYPE(add_mp) == M_PCPROTO));

	if (connp != NULL) {
		ASSERT(MUTEX_HELD(&connp->conn_lock));
		/*
		 * Return error if the conn has started closing. The conn
		 * could have finished cleaning up the pending mp list.
		 * If so we should not add another mp to the list negating
		 * the cleanup.
		 */
		if (connp->conn_state_flags & CONN_CLOSING)
			return (B_FALSE);
	}
	mutex_enter(&ipx->ipx_lock);
	ipx->ipx_pending_ipif = ipif;
	/*
	 * Note down the queue in b_queue. This will be returned by
	 * ipsq_pending_mp_get. Caller will then use these values to restart
	 * the processing
	 */
	add_mp->b_next = NULL;
	add_mp->b_queue = q;
	ipx->ipx_pending_mp = add_mp;
	ipx->ipx_waitfor = waitfor;
	mutex_exit(&ipx->ipx_lock);

	if (connp != NULL)
		connp->conn_oper_pending_ill = ipif->ipif_ill;

	return (B_TRUE);
}
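
/*
 * Caller pattern (editorial sketch, not from the original source): a writer
 * thread bringing an ill down parks its mblk here only while references are
 * still active, holding ill_lock so that the quiescence test and the add
 * are atomic; ill_down_start() below does exactly this:
 *
 *	mutex_enter(&ill->ill_lock);
 *	if (!ill_is_quiescent(ill)) {
 *		(void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq,
 *		    mp, ILL_DOWN);
 *		mutex_exit(&ill->ill_lock);
 *		return (B_FALSE);
 *	}
 *	mutex_exit(&ill->ill_lock);
 */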
/*
 * Retrieve the ipx_pending_mp and return it. There can be only 1 mp
 * queued in the list.
 */
mblk_t *
ipsq_pending_mp_get(ipsq_t *ipsq, conn_t **connpp)
{
	mblk_t	*curr = NULL;
	ipxop_t	*ipx = ipsq->ipsq_xop;

	*connpp = NULL;
	mutex_enter(&ipx->ipx_lock);
	if (ipx->ipx_pending_mp == NULL) {
		mutex_exit(&ipx->ipx_lock);
		return (NULL);
	}

	/* There can be only 1 such excl message */
	curr = ipx->ipx_pending_mp;
	ASSERT(curr->b_next == NULL);
	ipx->ipx_pending_ipif = NULL;
	ipx->ipx_pending_mp = NULL;
	ipx->ipx_waitfor = 0;
	mutex_exit(&ipx->ipx_lock);

	if (CONN_Q(curr->b_queue)) {
		/*
		 * This mp did a refhold on the conn, at the start of the ioctl.
		 * So we can safely return a pointer to the conn to the caller.
		 */
		*connpp = Q_TO_CONN(curr->b_queue);
	} else {
		*connpp = NULL;
	}
	curr->b_next = NULL;
	curr->b_prev = NULL;
	return (curr);
}
/*
 * Cleanup the ioctl mp queued in ipx_pending_mp
 * - Called in the ill_delete path
 * - Called in the M_ERROR or M_HANGUP path on the ill.
 * - Called in the conn close path.
 *
 * Returns success on finding the pending mblk associated with the ioctl or
 * exclusive operation in progress, failure otherwise.
 */
boolean_t
ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp)
{
	mblk_t	*mp;
	ipxop_t	*ipx;
	queue_t	*q;
	ipif_t	*ipif;
	int	cmd;

	ASSERT(IAM_WRITER_ILL(ill));
	ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop;

	mutex_enter(&ipx->ipx_lock);
	mp = ipx->ipx_pending_mp;
	if (connp != NULL) {
		if (mp == NULL || mp->b_queue != CONNP_TO_WQ(connp)) {
			/*
			 * Nothing to clean since the conn that is closing
			 * does not have a matching pending mblk in
			 * ipx_pending_mp.
			 */
			mutex_exit(&ipx->ipx_lock);
			return (B_FALSE);
		}
	} else {
		/*
		 * A non-zero ill_error signifies we are called in the
		 * M_ERROR or M_HANGUP path and we need to unconditionally
		 * abort any current ioctl and do the corresponding cleanup.
		 * A zero ill_error means we are in the ill_delete path and
		 * we do the cleanup only if there is a pending mp.
		 */
		if (mp == NULL && ill->ill_error == 0) {
			mutex_exit(&ipx->ipx_lock);
			return (B_FALSE);
		}
	}

	/* Now remove from the ipx_pending_mp */
	ipx->ipx_pending_mp = NULL;
	ipif = ipx->ipx_pending_ipif;
	ipx->ipx_pending_ipif = NULL;
	ipx->ipx_waitfor = 0;
	ipx->ipx_current_ipif = NULL;
	cmd = ipx->ipx_current_ioctl;
	ipx->ipx_current_ioctl = 0;
	ipx->ipx_current_done = B_TRUE;
	mutex_exit(&ipx->ipx_lock);

	if (mp == NULL)
		return (B_FALSE);

	q = mp->b_queue;
	mp->b_next = NULL;
	mp->b_prev = NULL;
	mp->b_queue = NULL;

	if (DB_TYPE(mp) == M_IOCTL || DB_TYPE(mp) == M_IOCDATA) {
		DTRACE_PROBE4(ipif__ioctl,
		    char *, "ipsq_pending_mp_cleanup",
		    int, cmd, ill_t *, ipif == NULL ? NULL : ipif->ipif_ill,
		    ipif_t *, ipif);
		if (connp == NULL) {
			ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL);
		} else {
			ip_ioctl_finish(q, mp, ENXIO, CONN_CLOSE, NULL);
			mutex_enter(&ipif->ipif_ill->ill_lock);
			ipif->ipif_state_flags &= ~IPIF_CHANGING;
			mutex_exit(&ipif->ipif_ill->ill_lock);
		}
	} else {
		inet_freemsg(mp);
	}
	return (B_TRUE);
}
/*
 * Called in the conn close path and ill delete path
 */
static void
ipsq_xopq_mp_cleanup(ill_t *ill, conn_t *connp)
{
	ipsq_t	*ipsq;
	mblk_t	*prev;
	mblk_t	*curr;
	mblk_t	*next;
	queue_t	*wq, *rq = NULL;
	mblk_t	*tmp_list = NULL;

	ASSERT(IAM_WRITER_ILL(ill));
	if (connp != NULL)
		wq = CONNP_TO_WQ(connp);
	else
		wq = ill->ill_wq;

	/*
	 * In the case of lo0 being unplumbed, ill_wq will be NULL. Guard
	 * against this here.
	 */
	if (wq != NULL)
		rq = RD(wq);

	ipsq = ill->ill_phyint->phyint_ipsq;
	/*
	 * Cleanup the ioctl mp's queued in ipsq_xopq_pending_mp if any.
	 * In the case of ioctl from a conn, there can be only 1 mp
	 * queued on the ipsq. If an ill is being unplumbed flush all
	 * the messages.
	 */
	mutex_enter(&ipsq->ipsq_lock);
	for (prev = NULL, curr = ipsq->ipsq_xopq_mphead; curr != NULL;
	    curr = next) {
		next = curr->b_next;
		if (connp == NULL ||
		    (curr->b_queue == wq || curr->b_queue == rq)) {
			/* Unlink the mblk from the pending mp list */
			if (prev != NULL) {
				prev->b_next = curr->b_next;
			} else {
				ASSERT(ipsq->ipsq_xopq_mphead == curr);
				ipsq->ipsq_xopq_mphead = curr->b_next;
			}
			if (ipsq->ipsq_xopq_mptail == curr)
				ipsq->ipsq_xopq_mptail = prev;
			/*
			 * Create a temporary list and release the ipsq lock
			 * New elements are added to the head of the tmp_list
			 */
			curr->b_next = tmp_list;
			tmp_list = curr;
		} else {
			prev = curr;
		}
	}
	mutex_exit(&ipsq->ipsq_lock);

	while (tmp_list != NULL) {
		curr = tmp_list;
		tmp_list = curr->b_next;
		curr->b_next = NULL;
		curr->b_prev = NULL;
		wq = curr->b_queue;
		curr->b_queue = NULL;
		if (DB_TYPE(curr) == M_IOCTL || DB_TYPE(curr) == M_IOCDATA) {
			DTRACE_PROBE4(ipif__ioctl,
			    char *, "ipsq_xopq_mp_cleanup",
			    int, 0, ill_t *, NULL, ipif_t *, NULL);
			ip_ioctl_finish(wq, curr, ENXIO, connp != NULL ?
			    CONN_CLOSE : NO_COPYOUT, NULL);
		} else {
			/*
			 * IP-MT XXX In the case of TLI/XTI bind / optmgmt
			 * this can't be just inet_freemsg. we have to
			 * restart it otherwise the thread will be stuck.
			 */
			inet_freemsg(curr);
		}
	}
}
/*
 * This conn has started closing. Cleanup any pending ioctl from this conn.
 * STREAMS ensures that there can be at most 1 active ioctl on a stream.
 */
void
conn_ioctl_cleanup(conn_t *connp)
{
	ipsq_t	*ipsq;
	ill_t	*ill;
	boolean_t refheld;

	/*
	 * Check for a queued ioctl. If the ioctl has not yet started, the mp
	 * is pending in the list headed by ipsq_xopq_head. If the ioctl has
	 * started the mp could be present in ipx_pending_mp. Note that if
	 * conn_oper_pending_ill is NULL, the ioctl may still be in flight and
	 * not yet queued anywhere. In this case, the conn close code will wait
	 * until the conn_ref is dropped. If the stream was a tcp stream, then
	 * tcp_close will wait first until all ioctls have completed for this
	 * conn.
	 */
	mutex_enter(&connp->conn_lock);
	ill = connp->conn_oper_pending_ill;
	if (ill == NULL) {
		mutex_exit(&connp->conn_lock);
		return;
	}

	/*
	 * We may not be able to refhold the ill if the ill/ipif
	 * is changing. But we need to make sure that the ill will
	 * not vanish. So we just bump up the ill_waiter count.
	 */
	refheld = ill_waiter_inc(ill);
	mutex_exit(&connp->conn_lock);
	if (refheld) {
		if (ipsq_enter(ill, B_TRUE, NEW_OP)) {
			ill_waiter_dcr(ill);
			/*
			 * Check whether this ioctl has started and is
			 * pending. If it is not found there then check
			 * whether this ioctl has not even started and is in
			 * the ipsq_xopq list.
			 */
			if (!ipsq_pending_mp_cleanup(ill, connp))
				ipsq_xopq_mp_cleanup(ill, connp);
			ipsq = ill->ill_phyint->phyint_ipsq;
			ipsq_exit(ipsq);
			return;
		}
	}

	/*
	 * The ill is also closing and we could not bump up the
	 * ill_waiter_count or we could not enter the ipsq. Leave
	 * the cleanup to ill_delete
	 */
	mutex_enter(&connp->conn_lock);
	while (connp->conn_oper_pending_ill != NULL)
		cv_wait(&connp->conn_refcv, &connp->conn_lock);
	mutex_exit(&connp->conn_lock);
	if (refheld)
		ill_waiter_dcr(ill);
}
/*
 * ipcl_walk function for cleaning up conn_*_ill fields.
 * Note that we leave ixa_multicast_ifindex, conn_incoming_ifindex, and
 * conn_bound_if in place. We prefer dropping
 * packets instead of sending them out the wrong interface, or accepting
 * packets from the wrong ifindex.
 */
static void
conn_cleanup_ill(conn_t *connp, caddr_t arg)
{
	ill_t	*ill = (ill_t *)arg;

	mutex_enter(&connp->conn_lock);
	if (connp->conn_dhcpinit_ill == ill) {
		connp->conn_dhcpinit_ill = NULL;
		ASSERT(ill->ill_dhcpinit != 0);
		atomic_dec_32(&ill->ill_dhcpinit);
		ill_set_inputfn(ill);
	}
	mutex_exit(&connp->conn_lock);
}
static int
ill_down_ipifs_tail(ill_t *ill)
{
	ipif_t	*ipif;
	int err;

	ASSERT(IAM_WRITER_ILL(ill));
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		ipif_non_duplicate(ipif);
		/*
		 * ipif_down_tail will call arp_ll_down on the last ipif
		 * and typically return EINPROGRESS when the DL_UNBIND is sent.
		 */
		if ((err = ipif_down_tail(ipif)) != 0)
			return (err);
	}
	return (0);
}
/* ARGSUSED */
void
ipif_all_down_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
{
	ASSERT(IAM_WRITER_IPSQ(ipsq));
	(void) ill_down_ipifs_tail(q->q_ptr);
	freemsg(mp);
	ipsq_current_finish(ipsq);
}
/*
 * ill_down_start is called when we want to down this ill and bring it up
 * again. It is called when we receive an M_ERROR / M_HANGUP. In this case
 * we shut down all interfaces, but don't tear down any plumbing.
 */
boolean_t
ill_down_start(queue_t *q, mblk_t *mp)
{
	ill_t	*ill = q->q_ptr;
	ipif_t	*ipif;

	ASSERT(IAM_WRITER_ILL(ill));
	/*
	 * It is possible that some ioctl is already in progress while we
	 * received the M_ERROR / M_HANGUP in which case, we need to abort
	 * the ioctl. ill_down_start() is being processed as CUR_OP rather
	 * than as NEW_OP since the cause of the M_ERROR / M_HANGUP may prevent
	 * the in progress ioctl from ever completing.
	 *
	 * The thread that started the ioctl (if any) must have returned,
	 * since we are now executing as writer. After the 2 calls below,
	 * the state of the ipsq and the ill would reflect no trace of any
	 * pending operation. Subsequently if there is any response to the
	 * original ioctl from the driver, it would be discarded as an
	 * unsolicited message from the driver.
	 */
	(void) ipsq_pending_mp_cleanup(ill, NULL);
	ill_dlpi_clear_deferred(ill);

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
		(void) ipif_down(ipif, NULL, NULL);

	ill_down(ill);

	/*
	 * Walk all CONNs that can have a reference on an ire or nce for this
	 * ill (we actually walk all that now have stale references).
	 */
	ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ill->ill_ipst);

	/* With IPv6 we have dce_ifindex. Cleanup for neatness */
	if (ill->ill_isv6)
		dce_cleanup(ill->ill_phyint->phyint_ifindex, ill->ill_ipst);

	ipsq_current_start(ill->ill_phyint->phyint_ipsq, ill->ill_ipif, 0);

	/*
	 * Atomically test and add the pending mp if references are active.
	 */
	mutex_enter(&ill->ill_lock);
	if (!ill_is_quiescent(ill)) {
		/* call cannot fail since `conn_t *' argument is NULL */
		(void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq,
		    mp, ILL_DOWN);
		mutex_exit(&ill->ill_lock);
		return (B_FALSE);
	}
	mutex_exit(&ill->ill_lock);
	return (B_TRUE);
}
static void
ill_down(ill_t *ill)
{
	mblk_t	*mp;
	ip_stack_t	*ipst = ill->ill_ipst;

	/*
	 * Blow off any IREs dependent on this ILL.
	 * The caller needs to handle conn_ixa_cleanup
	 */
	ill_delete_ires(ill);

	ire_walk_ill(0, 0, ill_downi, ill, ill);

	/* Remove any conn_*_ill depending on this ill */
	ipcl_walk(conn_cleanup_ill, (caddr_t)ill, ipst);

	/*
	 * Free state for additional IREs.
	 */
	mutex_enter(&ill->ill_saved_ire_lock);
	mp = ill->ill_saved_ire_mp;
	ill->ill_saved_ire_mp = NULL;
	ill->ill_saved_ire_cnt = 0;
	mutex_exit(&ill->ill_saved_ire_lock);
	freemsg(mp);
}
/*
 * ire_walk routine used to delete every IRE that depends on
 * 'ill'. (Always called as writer, and may only be called from ire_walk.)
 *
 * Note: since the routes added by the kernel are deleted separately,
 * this will only be 1) IRE_IF_CLONE and 2) manually added IRE_INTERFACE.
 *
 * We also remove references on ire_nce_cache entries that refer to the ill.
 */
void
ill_downi(ire_t *ire, char *ill_arg)
{
	ill_t	*ill = (ill_t *)ill_arg;
	nce_t	*nce;

	mutex_enter(&ire->ire_lock);
	nce = ire->ire_nce_cache;
	if (nce != NULL && nce->nce_ill == ill)
		ire->ire_nce_cache = NULL;
	else
		nce = NULL;
	mutex_exit(&ire->ire_lock);
	if (nce != NULL)
		nce_refrele(nce);
	if (ire->ire_ill == ill) {
		/*
		 * The existing interface binding for ire must be
		 * deleted before trying to bind the route to another
		 * interface. However, since we are using the contents of the
		 * ire after ire_delete, the caller has to ensure that
		 * CONDEMNED (deleted) ire's are not removed from the list
		 * when ire_delete() returns. Currently ill_downi() is
		 * only called as part of ire_walk*() routines, so that
		 * the irb_refhold() done by ire_walk*() will ensure that
		 * ire_delete() does not lead to ire_inactive().
		 */
		ASSERT(ire->ire_bucket->irb_refcnt > 0);
		ire_delete(ire);
		if (ire->ire_unbound)
			ire_rebind(ire);
	}
}
/* Remove IRE_IF_CLONE on this ill */
void
ill_downi_if_clone(ire_t *ire, char *ill_arg)
{
	ill_t	*ill = (ill_t *)ill_arg;

	ASSERT(ire->ire_type & IRE_IF_CLONE);
	if (ire->ire_ill == ill)
		ire_delete(ire);
}
/* Consume an M_IOCACK of the fastpath probe. */
void
ill_fastpath_ack(ill_t *ill, mblk_t *mp)
{
	mblk_t	*mp1 = mp;

	/*
	 * If this was the first attempt turn on the fastpath probing.
	 */
	mutex_enter(&ill->ill_lock);
	if (ill->ill_dlpi_fastpath_state == IDS_INPROGRESS)
		ill->ill_dlpi_fastpath_state = IDS_OK;
	mutex_exit(&ill->ill_lock);

	/* Free the M_IOCACK mblk, hold on to the data */
	mp = mp->b_cont;
	freeb(mp1);
	if (mp == NULL)
		return;
	if (mp->b_cont != NULL)
		nce_fastpath_update(ill, mp);
	else
		ip0dbg(("ill_fastpath_ack: no b_cont\n"));
	freemsg(mp);
}
/*
 * Throw an M_IOCTL message downstream asking "do you know fastpath?"
 * The data portion of the request is a dl_unitdata_req_t template for
 * what we would send downstream in the absence of a fastpath confirmation.
 */
int
ill_fastpath_probe(ill_t *ill, mblk_t *dlur_mp)
{
	struct iocblk	*ioc;
	mblk_t	*mp;

	if (dlur_mp == NULL)
		return (EINVAL);

	mutex_enter(&ill->ill_lock);
	switch (ill->ill_dlpi_fastpath_state) {
	case IDS_FAILED:
		/*
		 * Driver NAKed the first fastpath ioctl - assume it doesn't
		 * support it.
		 */
		mutex_exit(&ill->ill_lock);
		return (ENOTSUP);
	case IDS_UNKNOWN:
		/* This is the first probe */
		ill->ill_dlpi_fastpath_state = IDS_INPROGRESS;
		break;
	default:
		break;
	}
	mutex_exit(&ill->ill_lock);

	if ((mp = mkiocb(DL_IOC_HDR_INFO)) == NULL)
		return (EAGAIN);

	mp->b_cont = copyb(dlur_mp);
	if (mp->b_cont == NULL) {
		freeb(mp);
		return (EAGAIN);
	}

	ioc = (struct iocblk *)mp->b_rptr;
	ioc->ioc_count = msgdsize(mp->b_cont);

	DTRACE_PROBE3(ill__dlpi, char *, "ill_fastpath_probe",
	    char *, "DL_IOC_HDR_INFO", ill_t *, ill);
	putnext(ill->ill_wq, mp);
	return (0);
}
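
/*
 * Probe/ack flow (editorial sketch, not from the original source):
 *
 *	IP					driver
 *	ill_fastpath_probe()
 *	    M_IOCTL(DL_IOC_HDR_INFO) with a
 *	    dl_unitdata_req template  ---->
 *					<----	M_IOCACK carrying the template
 *						plus the prebuilt header
 *	ill_fastpath_ack()
 *	    nce_fastpath_update()
 *
 * A NAK of the first probe is recorded as IDS_FAILED (see the IDS_FAILED
 * case above), after which further probes return ENOTSUP; a successful ack
 * moves IDS_INPROGRESS to IDS_OK.
 */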
void
ill_capability_probe(ill_t *ill)
{
	mblk_t	*mp;

	ASSERT(IAM_WRITER_ILL(ill));

	if (ill->ill_dlpi_capab_state != IDCS_UNKNOWN &&
	    ill->ill_dlpi_capab_state != IDCS_FAILED)
		return;

	/*
	 * We are starting a new cycle of capability negotiation.
	 * Free up the capab reset messages of any previous incarnation.
	 * We will do a fresh allocation when we get the response to our probe
	 */
	if (ill->ill_capab_reset_mp != NULL) {
		freemsg(ill->ill_capab_reset_mp);
		ill->ill_capab_reset_mp = NULL;
	}

	ip1dbg(("ill_capability_probe: starting capability negotiation\n"));

	mp = ip_dlpi_alloc(sizeof (dl_capability_req_t), DL_CAPABILITY_REQ);
	if (mp == NULL)
		return;

	ill_capability_send(ill, mp);
	ill->ill_dlpi_capab_state = IDCS_PROBE_SENT;
}
void
ill_capability_reset(ill_t *ill, boolean_t reneg)
{
	ASSERT(IAM_WRITER_ILL(ill));

	if (ill->ill_dlpi_capab_state != IDCS_OK)
		return;

	ill->ill_dlpi_capab_state = reneg ? IDCS_RENEG : IDCS_RESET_SENT;

	ill_capability_send(ill, ill->ill_capab_reset_mp);
	ill->ill_capab_reset_mp = NULL;
	/*
	 * We turn off all capabilities except those pertaining to
	 * direct function call capabilities viz. ILL_CAPAB_DLD*
	 * which will be turned off by the corresponding reset functions.
	 */
	ill->ill_capabilities &= ~(ILL_CAPAB_HCKSUM | ILL_CAPAB_ZEROCOPY);
}
static void
ill_capability_reset_alloc(ill_t *ill)
{
	mblk_t *mp;
	size_t	size = 0;
	int	err;
	dl_capability_req_t	*capb;

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(ill->ill_capab_reset_mp == NULL);

	if (ILL_HCKSUM_CAPABLE(ill)) {
		size += sizeof (dl_capability_sub_t) +
		    sizeof (dl_capab_hcksum_t);
	}

	if (ill->ill_capabilities & ILL_CAPAB_ZEROCOPY) {
		size += sizeof (dl_capability_sub_t) +
		    sizeof (dl_capab_zerocopy_t);
	}

	if (ill->ill_capabilities & ILL_CAPAB_DLD) {
		size += sizeof (dl_capability_sub_t) +
		    sizeof (dl_capab_dld_t);
	}

	mp = allocb_wait(size + sizeof (dl_capability_req_t), BPRI_MED,
	    STR_NOSIG, &err);

	mp->b_datap->db_type = M_PROTO;
	bzero(mp->b_rptr, size + sizeof (dl_capability_req_t));

	capb = (dl_capability_req_t *)mp->b_rptr;
	capb->dl_primitive = DL_CAPABILITY_REQ;
	capb->dl_sub_offset = sizeof (dl_capability_req_t);
	capb->dl_sub_length = size;

	mp->b_wptr += sizeof (dl_capability_req_t);

	/*
	 * Each handler fills in the corresponding dl_capability_sub_t
	 * inside the mblk.
	 */
	ill_capability_hcksum_reset_fill(ill, mp);
	ill_capability_zerocopy_reset_fill(ill, mp);
	ill_capability_dld_reset_fill(ill, mp);

	ill->ill_capab_reset_mp = mp;
}
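
/*
 * Resulting message layout (editorial sketch, not from the original source),
 * assuming all three capabilities are currently enabled:
 *
 *	+--------------------------+
 *	| dl_capability_req_t      |	dl_sub_offset / dl_sub_length
 *	+--------------------------+
 *	| dl_capability_sub_t      |	DL_CAPAB_HCKSUM
 *	| dl_capab_hcksum_t        |	hcksum_txflags == 0 (reset)
 *	+--------------------------+
 *	| dl_capability_sub_t      |	DL_CAPAB_ZEROCOPY
 *	| dl_capab_zerocopy_t      |	zerocopy_flags == 0 (reset)
 *	+--------------------------+
 *	| dl_capability_sub_t      |	DL_CAPAB_DLD
 *	| dl_capab_dld_t           |	body left uninitialized; unused by DLD
 *	+--------------------------+
 *
 * Each *_reset_fill handler advances mp->b_wptr past the sub-capability it
 * appends.
 */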
static void
ill_capability_id_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *outers)
{
	dl_capab_id_t *id_ic;
	uint_t sub_dl_cap = outers->dl_cap;
	dl_capability_sub_t *inners;
	uint8_t *capend;

	ASSERT(sub_dl_cap == DL_CAPAB_ID_WRAPPER);

	/*
	 * Note: range checks here are not absolutely sufficient to
	 * make us robust against malformed messages sent by drivers;
	 * this is in keeping with the rest of IP's dlpi handling.
	 * (Remember, it's coming from something else in the kernel
	 * address space)
	 */

	capend = (uint8_t *)(outers + 1) + outers->dl_length;
	if (capend > mp->b_wptr) {
		cmn_err(CE_WARN, "ill_capability_id_ack: "
		    "malformed sub-capability too long for mblk");
		return;
	}

	id_ic = (dl_capab_id_t *)(outers + 1);

	if (outers->dl_length < sizeof (*id_ic) ||
	    (inners = &id_ic->id_subcap,
	    inners->dl_length > (outers->dl_length - sizeof (*inners)))) {
		cmn_err(CE_WARN, "ill_capability_id_ack: malformed "
		    "encapsulated capab type %d too long for mblk",
		    inners->dl_cap);
		return;
	}

	if (!dlcapabcheckqid(&id_ic->id_mid, ill->ill_lmod_rq)) {
		ip1dbg(("ill_capability_id_ack: mid token for capab type %d "
		    "isn't as expected; pass-thru module(s) detected, "
		    "discarding capability\n", inners->dl_cap));
		return;
	}

	/* Process the encapsulated sub-capability */
	ill_capability_dispatch(ill, mp, inners);
}
static void
ill_capability_dld_reset_fill(ill_t *ill, mblk_t *mp)
{
	dl_capability_sub_t *dl_subcap;

	if (!(ill->ill_capabilities & ILL_CAPAB_DLD))
		return;

	/*
	 * The dl_capab_dld_t that follows the dl_capability_sub_t is not
	 * initialized below since it is not used by DLD.
	 */
	dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
	dl_subcap->dl_cap = DL_CAPAB_DLD;
	dl_subcap->dl_length = sizeof (dl_capab_dld_t);

	mp->b_wptr += sizeof (dl_capability_sub_t) + sizeof (dl_capab_dld_t);
}
static void
ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp)
{
	/*
	 * If no ipif was brought up over this ill, this DL_CAPABILITY_REQ/ACK
	 * is only to get the VRRP capability.
	 *
	 * Note that we cannot check ill_ipif_up_count here since
	 * ill_ipif_up_count is only incremented when the resolver is setup.
	 * That is done asynchronously, and can race with this function.
	 */
	if (!ill->ill_dl_up) {
		if (subp->dl_cap == DL_CAPAB_VRRP)
			ill_capability_vrrp_ack(ill, mp, subp);
		return;
	}

	switch (subp->dl_cap) {
	case DL_CAPAB_HCKSUM:
		ill_capability_hcksum_ack(ill, mp, subp);
		break;
	case DL_CAPAB_ZEROCOPY:
		ill_capability_zerocopy_ack(ill, mp, subp);
		break;
	case DL_CAPAB_DLD:
		ill_capability_dld_ack(ill, mp, subp);
		break;
	case DL_CAPAB_VRRP:
		break;
	default:
		ip1dbg(("ill_capability_dispatch: unknown capab type %d\n",
		    subp->dl_cap));
	}
}
/*
 * Process the vrrp capability received from a DLS Provider. isub must point
 * to the sub-capability (DL_CAPAB_VRRP) of a DL_CAPABILITY_ACK message.
 */
static void
ill_capability_vrrp_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
{
	dl_capab_vrrp_t	*vrrp;
	uint_t		sub_dl_cap = isub->dl_cap;
	uint8_t		*capend;

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(sub_dl_cap == DL_CAPAB_VRRP);

	/*
	 * Note: range checks here are not absolutely sufficient to
	 * make us robust against malformed messages sent by drivers;
	 * this is in keeping with the rest of IP's dlpi handling.
	 * (Remember, it's coming from something else in the kernel
	 * address space)
	 */
	capend = (uint8_t *)(isub + 1) + isub->dl_length;
	if (capend > mp->b_wptr) {
		cmn_err(CE_WARN, "ill_capability_vrrp_ack: "
		    "malformed sub-capability too long for mblk");
		return;
	}
	vrrp = (dl_capab_vrrp_t *)(isub + 1);

	/*
	 * Compare the IP address family and set ILLF_VRRP for the right ill.
	 */
	if ((vrrp->vrrp_af == AF_INET6 && ill->ill_isv6) ||
	    (vrrp->vrrp_af == AF_INET && !ill->ill_isv6)) {
		ill->ill_flags |= ILLF_VRRP;
	}
}
/*
 * Process a hardware checksum offload capability negotiation ack received
 * from a DLS Provider. isub must point to the sub-capability
 * (DL_CAPAB_HCKSUM) of a DL_CAPABILITY_ACK message.
 */
static void
ill_capability_hcksum_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
{
	dl_capability_req_t	*ocap;
	dl_capab_hcksum_t	*ihck, *ohck;
	ill_hcksum_capab_t	**ill_hcksum;
	mblk_t			*nmp = NULL;
	uint_t			sub_dl_cap = isub->dl_cap;
	uint8_t			*capend;

	ASSERT(sub_dl_cap == DL_CAPAB_HCKSUM);

	ill_hcksum = (ill_hcksum_capab_t **)&ill->ill_hcksum_capab;

	/*
	 * Note: range checks here are not absolutely sufficient to
	 * make us robust against malformed messages sent by drivers;
	 * this is in keeping with the rest of IP's dlpi handling.
	 * (Remember, it's coming from something else in the kernel
	 * address space)
	 */
	capend = (uint8_t *)(isub + 1) + isub->dl_length;
	if (capend > mp->b_wptr) {
		cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
		    "malformed sub-capability too long for mblk");
		return;
	}

	/*
	 * There are two types of acks we process here:
	 * 1. acks in reply to a (first form) generic capability req
	 *    (no ENABLE flag set)
	 * 2. acks in reply to a ENABLE capability req.
	 *    (ENABLE flag set)
	 */
	ihck = (dl_capab_hcksum_t *)(isub + 1);

	if (ihck->hcksum_version != HCKSUM_VERSION_1) {
		cmn_err(CE_CONT, "ill_capability_hcksum_ack: "
		    "unsupported hardware checksum "
		    "sub-capability (version %d, expected %d)",
		    ihck->hcksum_version, HCKSUM_VERSION_1);
		return;
	}

	if (!dlcapabcheckqid(&ihck->hcksum_mid, ill->ill_lmod_rq)) {
		ip1dbg(("ill_capability_hcksum_ack: mid token for hardware "
		    "checksum capability isn't as expected; pass-thru "
		    "module(s) detected, discarding capability\n"));
		return;
	}

#define	CURR_HCKSUM_CAPAB				\
	(HCKSUM_INET_PARTIAL | HCKSUM_INET_FULL_V4 |	\
	HCKSUM_INET_FULL_V6 | HCKSUM_IPHDRCKSUM)

	if ((ihck->hcksum_txflags & HCKSUM_ENABLE) &&
	    (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB)) {
		/* do ENABLE processing */
		if (*ill_hcksum == NULL) {
			*ill_hcksum = kmem_zalloc(sizeof (ill_hcksum_capab_t),
			    KM_NOSLEEP);

			if (*ill_hcksum == NULL) {
				cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
				    "could not enable hcksum version %d "
				    "for %s (ENOMEM)\n", HCKSUM_CURRENT_VERSION,
				    ill->ill_name);
				return;
			}
		}

		(*ill_hcksum)->ill_hcksum_version = ihck->hcksum_version;
		(*ill_hcksum)->ill_hcksum_txflags = ihck->hcksum_txflags;
		ill->ill_capabilities |= ILL_CAPAB_HCKSUM;
		ip1dbg(("ill_capability_hcksum_ack: interface %s "
		    "has enabled hardware checksumming\n ",
		    ill->ill_name));
	} else if (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB) {
		/*
		 * Enabling hardware checksum offload
		 * Currently IP supports {TCP,UDP}/IPv4
		 * partial and full cksum offload and
		 * IPv4 header checksum offload.
		 * Allocate new mblk which will
		 * contain a new capability request
		 * to enable hardware checksum offload.
		 */
		uint_t	size;
		uchar_t	*rptr;

		size = sizeof (dl_capability_req_t) +
		    sizeof (dl_capability_sub_t) + isub->dl_length;

		if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) {
			cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
			    "could not enable hardware cksum for %s (ENOMEM)\n",
			    ill->ill_name);
			return;
		}

		rptr = nmp->b_rptr;
		/* initialize dl_capability_req_t */
		ocap = (dl_capability_req_t *)nmp->b_rptr;
		ocap->dl_sub_offset =
		    sizeof (dl_capability_req_t);
		ocap->dl_sub_length =
		    sizeof (dl_capability_sub_t) +
		    isub->dl_length;
		nmp->b_rptr += sizeof (dl_capability_req_t);

		/* initialize dl_capability_sub_t */
		bcopy(isub, nmp->b_rptr, sizeof (*isub));
		nmp->b_rptr += sizeof (*isub);

		/* initialize dl_capab_hcksum_t */
		ohck = (dl_capab_hcksum_t *)nmp->b_rptr;
		bcopy(ihck, ohck, sizeof (*ihck));

		nmp->b_rptr = rptr;
		ASSERT(nmp->b_wptr == (nmp->b_rptr + size));

		/* Set ENABLE flag */
		ohck->hcksum_txflags &= CURR_HCKSUM_CAPAB;
		ohck->hcksum_txflags |= HCKSUM_ENABLE;

		/*
		 * nmp points to a DL_CAPABILITY_REQ message to enable
		 * hardware checksum acceleration.
		 */
		ill_capability_send(ill, nmp);
	} else {
		ip1dbg(("ill_capability_hcksum_ack: interface %s has "
		    "advertised %x hardware checksum capability flags\n",
		    ill->ill_name, ihck->hcksum_txflags));
	}
}
static void
ill_capability_hcksum_reset_fill(ill_t *ill, mblk_t *mp)
{
	dl_capab_hcksum_t *hck_subcap;
	dl_capability_sub_t *dl_subcap;

	if (!ILL_HCKSUM_CAPABLE(ill))
		return;

	ASSERT(ill->ill_hcksum_capab != NULL);

	dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
	dl_subcap->dl_cap = DL_CAPAB_HCKSUM;
	dl_subcap->dl_length = sizeof (*hck_subcap);

	hck_subcap = (dl_capab_hcksum_t *)(dl_subcap + 1);
	hck_subcap->hcksum_version = ill->ill_hcksum_capab->ill_hcksum_version;
	hck_subcap->hcksum_txflags = 0;

	mp->b_wptr += sizeof (*dl_subcap) + sizeof (*hck_subcap);
}
static void
ill_capability_zerocopy_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
{
	mblk_t *nmp = NULL;
	dl_capability_req_t *oc;
	dl_capab_zerocopy_t *zc_ic, *zc_oc;
	ill_zerocopy_capab_t **ill_zerocopy_capab;
	uint_t sub_dl_cap = isub->dl_cap;
	uint8_t *capend;

	ASSERT(sub_dl_cap == DL_CAPAB_ZEROCOPY);

	ill_zerocopy_capab = (ill_zerocopy_capab_t **)&ill->ill_zerocopy_capab;

	/*
	 * Note: range checks here are not absolutely sufficient to
	 * make us robust against malformed messages sent by drivers;
	 * this is in keeping with the rest of IP's dlpi handling.
	 * (Remember, it's coming from something else in the kernel
	 * address space)
	 */
	capend = (uint8_t *)(isub + 1) + isub->dl_length;
	if (capend > mp->b_wptr) {
		cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
		    "malformed sub-capability too long for mblk");
		return;
	}

	zc_ic = (dl_capab_zerocopy_t *)(isub + 1);
	if (zc_ic->zerocopy_version != ZEROCOPY_VERSION_1) {
		cmn_err(CE_CONT, "ill_capability_zerocopy_ack: "
		    "unsupported ZEROCOPY sub-capability (version %d, "
		    "expected %d)", zc_ic->zerocopy_version,
		    ZEROCOPY_VERSION_1);
		return;
	}

	if (!dlcapabcheckqid(&zc_ic->zerocopy_mid, ill->ill_lmod_rq)) {
		ip1dbg(("ill_capability_zerocopy_ack: mid token for zerocopy "
		    "capability isn't as expected; pass-thru module(s) "
		    "detected, discarding capability\n"));
		return;
	}

	if ((zc_ic->zerocopy_flags & DL_CAPAB_VMSAFE_MEM) != 0) {
		if (*ill_zerocopy_capab == NULL) {
			*ill_zerocopy_capab =
			    kmem_zalloc(sizeof (ill_zerocopy_capab_t),
			    KM_NOSLEEP);

			if (*ill_zerocopy_capab == NULL) {
				cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
				    "could not enable Zero-copy version %d "
				    "for %s (ENOMEM)\n", ZEROCOPY_VERSION_1,
				    ill->ill_name);
				return;
			}
		}

		ip1dbg(("ill_capability_zerocopy_ack: interface %s "
		    "supports Zero-copy version %d\n", ill->ill_name,
		    ZEROCOPY_VERSION_1));

		(*ill_zerocopy_capab)->ill_zerocopy_version =
		    zc_ic->zerocopy_version;
		(*ill_zerocopy_capab)->ill_zerocopy_flags =
		    zc_ic->zerocopy_flags;

		ill->ill_capabilities |= ILL_CAPAB_ZEROCOPY;
	} else {
		uint_t size;
		uchar_t *rptr;

		size = sizeof (dl_capability_req_t) +
		    sizeof (dl_capability_sub_t) +
		    sizeof (dl_capab_zerocopy_t);

		if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) {
			cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
			    "could not enable zerocopy for %s (ENOMEM)\n",
			    ill->ill_name);
			return;
		}

		rptr = nmp->b_rptr;
		/* initialize dl_capability_req_t */
		oc = (dl_capability_req_t *)rptr;
		oc->dl_sub_offset = sizeof (dl_capability_req_t);
		oc->dl_sub_length = sizeof (dl_capability_sub_t) +
		    sizeof (dl_capab_zerocopy_t);
		rptr += sizeof (dl_capability_req_t);

		/* initialize dl_capability_sub_t */
		bcopy(isub, rptr, sizeof (*isub));
		rptr += sizeof (*isub);

		/* initialize dl_capab_zerocopy_t */
		zc_oc = (dl_capab_zerocopy_t *)rptr;
		*zc_oc = *zc_ic;

		ip1dbg(("ill_capability_zerocopy_ack: asking interface %s "
		    "to enable zero-copy version %d\n", ill->ill_name,
		    ZEROCOPY_VERSION_1));

		/* set VMSAFE_MEM flag */
		zc_oc->zerocopy_flags |= DL_CAPAB_VMSAFE_MEM;

		/* nmp points to a DL_CAPABILITY_REQ message to enable zcopy */
		ill_capability_send(ill, nmp);
	}
}
static void
ill_capability_zerocopy_reset_fill(ill_t *ill, mblk_t *mp)
{
	dl_capab_zerocopy_t *zerocopy_subcap;
	dl_capability_sub_t *dl_subcap;

	if (!(ill->ill_capabilities & ILL_CAPAB_ZEROCOPY))
		return;

	ASSERT(ill->ill_zerocopy_capab != NULL);

	dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
	dl_subcap->dl_cap = DL_CAPAB_ZEROCOPY;
	dl_subcap->dl_length = sizeof (*zerocopy_subcap);

	zerocopy_subcap = (dl_capab_zerocopy_t *)(dl_subcap + 1);
	zerocopy_subcap->zerocopy_version =
	    ill->ill_zerocopy_capab->ill_zerocopy_version;
	zerocopy_subcap->zerocopy_flags = 0;

	mp->b_wptr += sizeof (*dl_subcap) + sizeof (*zerocopy_subcap);
}
1888 * DLD capability
1889 * Refer to dld.h for more information regarding the purpose and usage
1890 * of this capability.
1892 static void
1893 ill_capability_dld_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
1895 dl_capab_dld_t *dld_ic, dld;
1896 uint_t sub_dl_cap = isub->dl_cap;
1897 uint8_t *capend;
1898 ill_dld_capab_t *idc;
1900 ASSERT(IAM_WRITER_ILL(ill));
1901 ASSERT(sub_dl_cap == DL_CAPAB_DLD);
1904 * Note: range checks here are not absolutely sufficient to
1905 * make us robust against malformed messages sent by drivers;
1906 * this is in keeping with the rest of IP's dlpi handling.
1907 * (Remember, it's coming from something else in the kernel
1908 * address space)
1910 capend = (uint8_t *)(isub + 1) + isub->dl_length;
1911 if (capend > mp->b_wptr) {
1912 cmn_err(CE_WARN, "ill_capability_dld_ack: "
1913 "malformed sub-capability too long for mblk");
1914 return;
1916 dld_ic = (dl_capab_dld_t *)(isub + 1);
1917 if (dld_ic->dld_version != DLD_CURRENT_VERSION) {
1918 cmn_err(CE_CONT, "ill_capability_dld_ack: "
1919 "unsupported DLD sub-capability (version %d, "
1920 "expected %d)", dld_ic->dld_version,
1921 DLD_CURRENT_VERSION);
1922 return;
1924 if (!dlcapabcheckqid(&dld_ic->dld_mid, ill->ill_lmod_rq)) {
1925 ip1dbg(("ill_capability_dld_ack: mid token for dld "
1926 "capability isn't as expected; pass-thru module(s) "
1927 "detected, discarding capability\n"));
1928 return;
1932 * Copy locally to ensure alignment.
1934 bcopy(dld_ic, &dld, sizeof (dl_capab_dld_t));
1936 if ((idc = ill->ill_dld_capab) == NULL) {
1937 idc = kmem_zalloc(sizeof (ill_dld_capab_t), KM_NOSLEEP);
1938 if (idc == NULL) {
1939 cmn_err(CE_WARN, "ill_capability_dld_ack: "
1940 "could not enable DLD version %d "
1941 "for %s (ENOMEM)\n", DLD_CURRENT_VERSION,
1942 ill->ill_name);
1943 return;
1945 ill->ill_dld_capab = idc;
1947 idc->idc_capab_df = (ip_capab_func_t)dld.dld_capab;
1948 idc->idc_capab_dh = (void *)dld.dld_capab_handle;
1949 ip1dbg(("ill_capability_dld_ack: interface %s "
1950 "supports DLD version %d\n", ill->ill_name, DLD_CURRENT_VERSION));
1952 ill_capability_dld_enable(ill);
1956 * Typically capability negotiation between IP and the driver happens via
1957 * DLPI message exchange. However GLD also offers a direct function call
1958 * mechanism to exchange the DLD_DIRECT_CAPAB and DLD_POLL_CAPAB capabilities.
1959 * But arbitrary function calls into IP or GLD are not permitted, since both
1960 * of them are protected by their own perimeter mechanism. The perimeter can
1961 * be viewed as a coarse lock or serialization mechanism. The hierarchy of
1962 * these perimeters is IP -> MAC. Thus for example to enable the squeue
1963 * polling, IP needs to enter its perimeter, then call ill_mac_perim_enter
1964 * to enter the mac perimeter and then do the direct function calls into
1965 * GLD to enable squeue polling. The ring related callbacks from the mac into
1966 * the stack to add, bind, quiesce, restart or cleanup a ring are all
1967 * protected by the mac perimeter.
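 *
 * The enable path below follows exactly that pattern (see
 * ill_capability_dld_enable()): enter the mac perimeter with
 * ill_mac_perim_enter(), make the direct DLD_ENABLE calls into GLD via
 * idc_capab_df(), then leave with ill_mac_perim_exit().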
1969 static void
1970 ill_mac_perim_enter(ill_t *ill, mac_perim_handle_t *mphp)
1972 ill_dld_capab_t *idc = ill->ill_dld_capab;
1973 int err;
1975 err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mphp,
1976 DLD_ENABLE);
1977 ASSERT(err == 0);
1980 static void
1981 ill_mac_perim_exit(ill_t *ill, mac_perim_handle_t mph)
1983 ill_dld_capab_t *idc = ill->ill_dld_capab;
1984 int err;
1986 err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mph,
1987 DLD_DISABLE);
1988 ASSERT(err == 0);
1991 boolean_t
1992 ill_mac_perim_held(ill_t *ill)
1994 ill_dld_capab_t *idc = ill->ill_dld_capab;
1996 return (idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, NULL,
1997 DLD_QUERY));
2000 static void
2001 ill_capability_direct_enable(ill_t *ill)
2003 ill_dld_capab_t *idc = ill->ill_dld_capab;
2004 ill_dld_direct_t *idd = &idc->idc_direct;
2005 dld_capab_direct_t direct;
2006 int rc;
2008 ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill));
2010 bzero(&direct, sizeof (direct));
2011 direct.di_rx_cf = (uintptr_t)ip_input;
2012 direct.di_rx_ch = ill;
2014 rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT, &direct,
2015 DLD_ENABLE);
2016 if (rc == 0) {
2017 idd->idd_tx_df = (ip_dld_tx_t)direct.di_tx_df;
2018 idd->idd_tx_dh = direct.di_tx_dh;
2019 idd->idd_tx_cb_df = (ip_dld_callb_t)direct.di_tx_cb_df;
2020 idd->idd_tx_cb_dh = direct.di_tx_cb_dh;
2021 idd->idd_tx_fctl_df = (ip_dld_fctl_t)direct.di_tx_fctl_df;
2022 idd->idd_tx_fctl_dh = direct.di_tx_fctl_dh;
2023 ASSERT(idd->idd_tx_cb_df != NULL);
2024 ASSERT(idd->idd_tx_fctl_df != NULL);
2025 ASSERT(idd->idd_tx_df != NULL);
2027 * One time registration of flow enable callback function
2029 ill->ill_flownotify_mh = idd->idd_tx_cb_df(idd->idd_tx_cb_dh,
2030 ill_flow_enable, ill);
2031 ill->ill_capabilities |= ILL_CAPAB_DLD_DIRECT;
2032 DTRACE_PROBE1(direct_on, (ill_t *), ill);
2033 } else {
2034 cmn_err(CE_WARN, "warning: could not enable DIRECT "
2035 "capability, rc = %d\n", rc);
2036 DTRACE_PROBE2(direct_off, (ill_t *), ill, (int), rc);
2040 static void
2041 ill_capability_poll_enable(ill_t *ill)
2043 ill_dld_capab_t *idc = ill->ill_dld_capab;
2044 dld_capab_poll_t poll;
2045 int rc;
2047 ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill));
2049 bzero(&poll, sizeof (poll));
2050 poll.poll_ring_add_cf = (uintptr_t)ip_squeue_add_ring;
2051 poll.poll_ring_remove_cf = (uintptr_t)ip_squeue_clean_ring;
2052 poll.poll_ring_quiesce_cf = (uintptr_t)ip_squeue_quiesce_ring;
2053 poll.poll_ring_restart_cf = (uintptr_t)ip_squeue_restart_ring;
2054 poll.poll_ring_bind_cf = (uintptr_t)ip_squeue_bind_ring;
2055 poll.poll_ring_ch = ill;
2056 rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL, &poll,
2057 DLD_ENABLE);
2058 if (rc == 0) {
2059 ill->ill_capabilities |= ILL_CAPAB_DLD_POLL;
2060 DTRACE_PROBE1(poll_on, (ill_t *), ill);
2061 } else {
2062 ip1dbg(("warning: could not enable POLL "
2063 "capability, rc = %d\n", rc));
2064 DTRACE_PROBE2(poll_off, (ill_t *), ill, (int), rc);
2069 * Enable the LSO capability.
2071 static void
2072 ill_capability_lso_enable(ill_t *ill)
2074 ill_dld_capab_t *idc = ill->ill_dld_capab;
2075 dld_capab_lso_t lso;
2076 int rc;
2078 ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill));
2080 if (ill->ill_lso_capab == NULL) {
2081 ill->ill_lso_capab = kmem_zalloc(sizeof (ill_lso_capab_t),
2082 KM_NOSLEEP);
2083 if (ill->ill_lso_capab == NULL) {
2084 cmn_err(CE_WARN, "ill_capability_lso_enable: "
2085 "could not enable LSO for %s (ENOMEM)\n",
2086 ill->ill_name);
2087 return;
2091 bzero(&lso, sizeof (lso));
2092 if ((rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO, &lso,
2093 DLD_ENABLE)) == 0) {
2094 ill->ill_lso_capab->ill_lso_flags = lso.lso_flags;
2095 ill->ill_lso_capab->ill_lso_max = lso.lso_max;
2096 ill->ill_capabilities |= ILL_CAPAB_LSO;
2097 ip1dbg(("ill_capability_lso_enable: interface %s "
2098 "has enabled LSO\n ", ill->ill_name));
2099 } else {
2100 kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t));
2101 ill->ill_lso_capab = NULL;
2102 DTRACE_PROBE2(lso_off, (ill_t *), ill, (int), rc);
2106 static void
2107 ill_capability_dld_enable(ill_t *ill)
2109 mac_perim_handle_t mph;
2111 ASSERT(IAM_WRITER_ILL(ill));
2113 if (ill->ill_isv6)
2114 return;
2116 ill_mac_perim_enter(ill, &mph);
2117 if (!ill->ill_isv6) {
2118 ill_capability_direct_enable(ill);
2119 ill_capability_poll_enable(ill);
2120 ill_capability_lso_enable(ill);
2122 ill->ill_capabilities |= ILL_CAPAB_DLD;
2123 ill_mac_perim_exit(ill, mph);
2126 static void
2127 ill_capability_dld_disable(ill_t *ill)
2129 ill_dld_capab_t *idc;
2130 ill_dld_direct_t *idd;
2131 mac_perim_handle_t mph;
2133 ASSERT(IAM_WRITER_ILL(ill));
2135 if (!(ill->ill_capabilities & ILL_CAPAB_DLD))
2136 return;
2138 ill_mac_perim_enter(ill, &mph);
2140 idc = ill->ill_dld_capab;
2141 if ((ill->ill_capabilities & ILL_CAPAB_DLD_DIRECT) != 0) {
2143 * For performance we avoid locks in the transmit data path
2144 * and don't maintain a count of the number of threads using
2145 * direct calls. Thus some threads could be using direct
2146 * transmit calls to GLD, even after the capability mechanism
2147 * turns it off. This is still safe since the handles used in
2148 * the direct calls continue to be valid until the unplumb is
2149 * completed. Remove the callback that was added (1-time) at
2150 * capab enable time.
2152 mutex_enter(&ill->ill_lock);
2153 ill->ill_capabilities &= ~ILL_CAPAB_DLD_DIRECT;
2154 mutex_exit(&ill->ill_lock);
2155 if (ill->ill_flownotify_mh != NULL) {
2156 idd = &idc->idc_direct;
2157 idd->idd_tx_cb_df(idd->idd_tx_cb_dh, NULL,
2158 ill->ill_flownotify_mh);
2159 ill->ill_flownotify_mh = NULL;
2161 (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT,
2162 NULL, DLD_DISABLE);
2165 if ((ill->ill_capabilities & ILL_CAPAB_DLD_POLL) != 0) {
2166 ill->ill_capabilities &= ~ILL_CAPAB_DLD_POLL;
2167 ip_squeue_clean_all(ill);
2168 (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL,
2169 NULL, DLD_DISABLE);
2172 if ((ill->ill_capabilities & ILL_CAPAB_LSO) != 0) {
2173 ASSERT(ill->ill_lso_capab != NULL);
2175 * Clear the capability flag for LSO but retain the
2176 * ill_lso_capab structure since it's possible that another
2177 * thread is still referring to it. The structure only gets
2178 * deallocated when we destroy the ill.
2181 ill->ill_capabilities &= ~ILL_CAPAB_LSO;
2182 (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO,
2183 NULL, DLD_DISABLE);
2186 ill->ill_capabilities &= ~ILL_CAPAB_DLD;
2187 ill_mac_perim_exit(ill, mph);
2191 * Capability Negotiation protocol
2193 * We don't wait for DLPI capability operations to finish during interface
2194 * bringup or teardown. Doing so would introduce more asynchrony and the
2195 * interface up/down operations would need multiple returns and restarts.
2196 * Instead the 'ipsq_current_ipif' of the ipsq is not cleared as long as
2197 * the 'ill_dlpi_deferred' chain is non-empty. This ensures that the next
2198 * exclusive operation won't start until the DLPI operations of the previous
2199 * exclusive operation complete.
2201 * The capability state machine is shown below.
2203 * state next state event, action
2205 * IDCS_UNKNOWN IDCS_PROBE_SENT ill_capability_probe
2206 * IDCS_PROBE_SENT IDCS_OK ill_capability_ack
2207 * IDCS_PROBE_SENT IDCS_FAILED ip_rput_dlpi_writer (nack)
2208 * IDCS_OK IDCS_RENEG Receipt of DL_NOTE_CAPAB_RENEG
2209 * IDCS_OK IDCS_RESET_SENT ill_capability_reset
2210 * IDCS_RESET_SENT IDCS_UNKNOWN ill_capability_ack_thr
2211 * IDCS_RENEG IDCS_PROBE_SENT ill_capability_ack_thr ->
2212 * ill_capability_probe.
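 *
 * For example, a normal plumb moves IDCS_UNKNOWN -> IDCS_PROBE_SENT ->
 * IDCS_OK, a teardown moves IDCS_OK -> IDCS_RESET_SENT -> IDCS_UNKNOWN,
 * and a driver-initiated DL_NOTE_CAPAB_RENEG moves IDCS_OK -> IDCS_RENEG
 * -> IDCS_PROBE_SENT, after which the probe runs again.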
2216 * Dedicated thread started from ip_stack_init that handles capability
2217 * disable. This thread ensures the taskq dispatch does not fail by waiting
2218 * for resources using TQ_SLEEP. The taskq mechanism is used to ensure
2219 * that direct calls to DLD are done in a cv_waitable context.
2221 void
2222 ill_taskq_dispatch(ip_stack_t *ipst)
2224 callb_cpr_t cprinfo;
2225 char name[64];
2226 mblk_t *mp;
2228 (void) snprintf(name, sizeof (name), "ill_taskq_dispatch_%d",
2229 ipst->ips_netstack->netstack_stackid);
2230 CALLB_CPR_INIT(&cprinfo, &ipst->ips_capab_taskq_lock, callb_generic_cpr,
2231 name);
2232 mutex_enter(&ipst->ips_capab_taskq_lock);
2234 for (;;) {
2235 mp = ipst->ips_capab_taskq_head;
2236 while (mp != NULL) {
2237 ipst->ips_capab_taskq_head = mp->b_next;
2238 if (ipst->ips_capab_taskq_head == NULL)
2239 ipst->ips_capab_taskq_tail = NULL;
2240 mutex_exit(&ipst->ips_capab_taskq_lock);
2241 mp->b_next = NULL;
2243 VERIFY(taskq_dispatch(system_taskq,
2244 ill_capability_ack_thr, mp, TQ_SLEEP) != 0);
2245 mutex_enter(&ipst->ips_capab_taskq_lock);
2246 mp = ipst->ips_capab_taskq_head;
2249 if (ipst->ips_capab_taskq_quit)
2250 break;
2251 CALLB_CPR_SAFE_BEGIN(&cprinfo);
2252 cv_wait(&ipst->ips_capab_taskq_cv, &ipst->ips_capab_taskq_lock);
2253 CALLB_CPR_SAFE_END(&cprinfo, &ipst->ips_capab_taskq_lock);
2255 VERIFY(ipst->ips_capab_taskq_head == NULL);
2256 VERIFY(ipst->ips_capab_taskq_tail == NULL);
2257 CALLB_CPR_EXIT(&cprinfo);
2258 thread_exit();
2262 * Consume a new-style hardware capabilities negotiation ack.
2263 * Called via taskq on receipt of DL_CAPABILITY_ACK.
2265 static void
2266 ill_capability_ack_thr(void *arg)
2268 mblk_t *mp = arg;
2269 dl_capability_ack_t *capp;
2270 dl_capability_sub_t *subp, *endp;
2271 ill_t *ill;
2272 boolean_t reneg;
2274 ill = (ill_t *)mp->b_prev;
2275 mp->b_prev = NULL;
2277 VERIFY(ipsq_enter(ill, B_FALSE, CUR_OP) == B_TRUE);
2279 if (ill->ill_dlpi_capab_state == IDCS_RESET_SENT ||
2280 ill->ill_dlpi_capab_state == IDCS_RENEG) {
2282 * We have received the ack for our DL_CAPAB reset request.
2283 * There isn't anything in the message that needs processing.
2284 * All message based capabilities have been disabled, now
2285 * do the function call based capability disable.
2287 reneg = ill->ill_dlpi_capab_state == IDCS_RENEG;
2288 ill_capability_dld_disable(ill);
2289 ill->ill_dlpi_capab_state = IDCS_UNKNOWN;
2290 if (reneg)
2291 ill_capability_probe(ill);
2292 goto done;
2295 if (ill->ill_dlpi_capab_state == IDCS_PROBE_SENT)
2296 ill->ill_dlpi_capab_state = IDCS_OK;
2298 capp = (dl_capability_ack_t *)mp->b_rptr;
2300 if (capp->dl_sub_length == 0) {
2301 /* no new-style capabilities */
2302 goto done;
2305 /* make sure the driver supplied correct dl_sub_length */
2306 if ((sizeof (*capp) + capp->dl_sub_length) > MBLKL(mp)) {
2307 ip0dbg(("ill_capability_ack: bad DL_CAPABILITY_ACK, "
2308 "invalid dl_sub_length (%d)\n", capp->dl_sub_length));
2309 goto done;
2312 #define SC(base, offset) (dl_capability_sub_t *)(((uchar_t *)(base))+(offset))
2314 * There are sub-capabilities. Process the ones we know about.
2315 * Loop until we don't have room for another sub-cap header.
2317 for (subp = SC(capp, capp->dl_sub_offset),
2318 endp = SC(subp, capp->dl_sub_length - sizeof (*subp));
2319 subp <= endp;
2320 subp = SC(subp, sizeof (dl_capability_sub_t) + subp->dl_length)) {
2322 switch (subp->dl_cap) {
2323 case DL_CAPAB_ID_WRAPPER:
2324 ill_capability_id_ack(ill, mp, subp);
2325 break;
2326 default:
2327 ill_capability_dispatch(ill, mp, subp);
2328 break;
2331 #undef SC
2332 done:
2333 inet_freemsg(mp);
2334 ill_capability_done(ill);
2335 ipsq_exit(ill->ill_phyint->phyint_ipsq);
2339 * This needs to be started in a taskq thread to provide a cv_waitable
2340 * context.
2342 void
2343 ill_capability_ack(ill_t *ill, mblk_t *mp)
2345 ip_stack_t *ipst = ill->ill_ipst;
2347 mp->b_prev = (mblk_t *)ill;
2348 ASSERT(mp->b_next == NULL);
2350 if (taskq_dispatch(system_taskq, ill_capability_ack_thr, mp,
2351 TQ_NOSLEEP) != 0)
2352 return;
2355 * The taskq dispatch failed. Signal the ill_taskq_dispatch thread
2356 * which will do the dispatch using TQ_SLEEP to guarantee success.
2358 mutex_enter(&ipst->ips_capab_taskq_lock);
2359 if (ipst->ips_capab_taskq_head == NULL) {
2360 ASSERT(ipst->ips_capab_taskq_tail == NULL);
2361 ipst->ips_capab_taskq_head = mp;
2362 } else {
2363 ipst->ips_capab_taskq_tail->b_next = mp;
2365 ipst->ips_capab_taskq_tail = mp;
2367 cv_signal(&ipst->ips_capab_taskq_cv);
2368 mutex_exit(&ipst->ips_capab_taskq_lock);
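/*
 * Design note: in the common case the TQ_NOSLEEP dispatch above succeeds;
 * the ips_capab_taskq list and the dedicated ill_taskq_dispatch() thread
 * exist only so that a transient memory shortage cannot cause a
 * DL_CAPABILITY_ACK to be dropped.
 */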
2372 * This routine is called to scan the fragmentation reassembly table for
2373 * the specified ILL for any packets that are starting to smell.
2374 * dead_interval is the maximum time in seconds that will be tolerated. It
2375 * will either be the value specified in ip_g_frag_timeout, or zero if the
2376 * ILL is shutting down and it is time to blow everything off.
2378 * It returns the number of seconds (as a time_t) that the next frag timer
2379 * should be scheduled for, 0 meaning that the timer doesn't need to be
2380 * re-started. Note that the method of calculating next_timeout isn't
2381 * entirely accurate since time will flow between the time we grab
2382 * current_time and the time we schedule the next timeout. This isn't a
2383 * big problem since this is the timer for sending ICMP reassembly time
2384 * exceeded messages, and it doesn't have to be exactly accurate.
2386 * This function is
2387 * sometimes called as writer, although this is not required.
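 *
 * For example, with a dead_interval of 60 seconds and an oldest fragment
 * that arrived 45 seconds ago, nothing is freed and the return value asks
 * for the timer to fire again in 15 seconds.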
2389 time_t
2390 ill_frag_timeout(ill_t *ill, time_t dead_interval)
2392 ipfb_t *ipfb;
2393 ipfb_t *endp;
2394 ipf_t *ipf;
2395 ipf_t *ipfnext;
2396 mblk_t *mp;
2397 time_t current_time = gethrestime_sec();
2398 time_t next_timeout = 0;
2399 uint32_t hdr_length;
2400 mblk_t *send_icmp_head;
2401 mblk_t *send_icmp_head_v6;
2402 ip_stack_t *ipst = ill->ill_ipst;
2403 ip_recv_attr_t iras;
2405 bzero(&iras, sizeof (iras));
2406 iras.ira_flags = 0;
2407 iras.ira_ill = iras.ira_rill = ill;
2408 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
2409 iras.ira_rifindex = iras.ira_ruifindex;
2411 ipfb = ill->ill_frag_hash_tbl;
2412 if (ipfb == NULL)
2413 return (0);
2414 endp = &ipfb[ILL_FRAG_HASH_TBL_COUNT];
2415 /* Walk the frag hash table. */
2416 for (; ipfb < endp; ipfb++) {
2417 send_icmp_head = NULL;
2418 send_icmp_head_v6 = NULL;
2419 mutex_enter(&ipfb->ipfb_lock);
2420 while ((ipf = ipfb->ipfb_ipf) != 0) {
2421 time_t frag_time = current_time - ipf->ipf_timestamp;
2422 time_t frag_timeout;
2424 if (frag_time < dead_interval) {
2426 * There are some outstanding fragments
2427 * that will timeout later. Make note of
2428 * the time so that we can reschedule the
2429 * next timeout appropriately.
2431 frag_timeout = dead_interval - frag_time;
2432 if (next_timeout == 0 ||
2433 frag_timeout < next_timeout) {
2434 next_timeout = frag_timeout;
2436 break;
2438 /* Time's up. Get it out of here. */
2439 hdr_length = ipf->ipf_nf_hdr_len;
2440 ipfnext = ipf->ipf_hash_next;
2441 if (ipfnext)
2442 ipfnext->ipf_ptphn = ipf->ipf_ptphn;
2443 *ipf->ipf_ptphn = ipfnext;
2444 mp = ipf->ipf_mp->b_cont;
2445 for (; mp; mp = mp->b_cont) {
2446 /* Extra points for neatness. */
2447 IP_REASS_SET_START(mp, 0);
2448 IP_REASS_SET_END(mp, 0);
2450 mp = ipf->ipf_mp->b_cont;
2451 atomic_add_32(&ill->ill_frag_count, -ipf->ipf_count);
2452 ASSERT(ipfb->ipfb_count >= ipf->ipf_count);
2453 ipfb->ipfb_count -= ipf->ipf_count;
2454 ASSERT(ipfb->ipfb_frag_pkts > 0);
2455 ipfb->ipfb_frag_pkts--;
2457 * We do not send any icmp message from here because
2458 * we currently are holding the ipfb_lock for this
2459 * hash chain. If we try and send any icmp messages
2460 * from here we may end up via a put back into ip
2461 * trying to get the same lock, causing a recursive
2462 * mutex panic. Instead we build a list and send all
2463 * the icmp messages after we have dropped the lock.
2465 if (ill->ill_isv6) {
2466 if (hdr_length != 0) {
2467 mp->b_next = send_icmp_head_v6;
2468 send_icmp_head_v6 = mp;
2469 } else {
2470 freemsg(mp);
2472 } else {
2473 if (hdr_length != 0) {
2474 mp->b_next = send_icmp_head;
2475 send_icmp_head = mp;
2476 } else {
2477 freemsg(mp);
2480 BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails);
2481 ip_drop_input("ipIfStatsReasmFails", ipf->ipf_mp, ill);
2482 freeb(ipf->ipf_mp);
2484 mutex_exit(&ipfb->ipfb_lock);
2486 * Now need to send any icmp messages that we delayed from
2487 * above.
2489 while (send_icmp_head_v6 != NULL) {
2490 ip6_t *ip6h;
2492 mp = send_icmp_head_v6;
2493 send_icmp_head_v6 = send_icmp_head_v6->b_next;
2494 mp->b_next = NULL;
2495 ip6h = (ip6_t *)mp->b_rptr;
2496 iras.ira_flags = 0;
2498 * This will result in an incorrect ALL_ZONES zoneid
2499 * for multicast packets, but we
2500 * don't send ICMP errors for those in any case.
2502 iras.ira_zoneid =
2503 ipif_lookup_addr_zoneid_v6(&ip6h->ip6_dst,
2504 ill, ipst);
2505 ip_drop_input("ICMP_TIME_EXCEEDED reass", mp, ill);
2506 icmp_time_exceeded_v6(mp,
2507 ICMP_REASSEMBLY_TIME_EXCEEDED, B_FALSE,
2508 &iras);
2509 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
2511 while (send_icmp_head != NULL) {
2512 ipaddr_t dst;
2514 mp = send_icmp_head;
2515 send_icmp_head = send_icmp_head->b_next;
2516 mp->b_next = NULL;
2518 dst = ((ipha_t *)mp->b_rptr)->ipha_dst;
2520 iras.ira_flags = IRAF_IS_IPV4;
2522 * This will result in an incorrect ALL_ZONES zoneid
2523 * for broadcast and multicast packets, but we
2524 * don't send ICMP errors for those in any case.
2526 iras.ira_zoneid = ipif_lookup_addr_zoneid(dst,
2527 ill, ipst);
2528 ip_drop_input("ICMP_TIME_EXCEEDED reass", mp, ill);
2529 icmp_time_exceeded(mp,
2530 ICMP_REASSEMBLY_TIME_EXCEEDED, &iras);
2531 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
2535 * A non-dying ILL will use the return value to decide whether to
2536 * restart the frag timer, and for how long.
2538 return (next_timeout);
2542 * This routine is called when the approximate count of mblk memory used
2543 * for the specified ILL has exceeded max_count.
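 * Pruning happens in two phases: if called again within
 * ip_min_frag_prune_time msecs, an escalating number of the oldest packets
 * (ill_frag_free_num_pkts) is freed from every hash bucket; then, while
 * ill_frag_count still exceeds max_count, the single oldest fragment queue
 * across all buckets is freed, one queue at a time.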
2545 void
2546 ill_frag_prune(ill_t *ill, uint_t max_count)
2548 ipfb_t *ipfb;
2549 ipf_t *ipf;
2550 size_t count;
2551 clock_t now;
2554 * If we are here within ip_min_frag_prune_time msecs remove
2555 * ill_frag_free_num_pkts oldest packets from each bucket and increment
2556 * ill_frag_free_num_pkts.
2558 mutex_enter(&ill->ill_lock);
2559 now = ddi_get_lbolt();
2560 if (TICK_TO_MSEC(now - ill->ill_last_frag_clean_time) <=
2561 (ip_min_frag_prune_time != 0 ?
2562 ip_min_frag_prune_time : msec_per_tick)) {
2564 ill->ill_frag_free_num_pkts++;
2566 } else {
2567 ill->ill_frag_free_num_pkts = 0;
2569 ill->ill_last_frag_clean_time = now;
2570 mutex_exit(&ill->ill_lock);
2573 * free ill_frag_free_num_pkts oldest packets from each bucket.
2575 if (ill->ill_frag_free_num_pkts != 0) {
2576 int ix;
2578 for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) {
2579 ipfb = &ill->ill_frag_hash_tbl[ix];
2580 mutex_enter(&ipfb->ipfb_lock);
2581 if (ipfb->ipfb_ipf != NULL) {
2582 ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf,
2583 ill->ill_frag_free_num_pkts);
2585 mutex_exit(&ipfb->ipfb_lock);
2589 * While the reassembly list for this ILL is too big, prune a fragment
2590 * queue by age, oldest first.
2592 while (ill->ill_frag_count > max_count) {
2593 int ix;
2594 ipfb_t *oipfb = NULL;
2595 uint_t oldest = UINT_MAX;
2597 count = 0;
2598 for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) {
2599 ipfb = &ill->ill_frag_hash_tbl[ix];
2600 mutex_enter(&ipfb->ipfb_lock);
2601 ipf = ipfb->ipfb_ipf;
2602 if (ipf != NULL && ipf->ipf_gen < oldest) {
2603 oldest = ipf->ipf_gen;
2604 oipfb = ipfb;
2606 count += ipfb->ipfb_count;
2607 mutex_exit(&ipfb->ipfb_lock);
2609 if (oipfb == NULL)
2610 break;
2612 if (count <= max_count)
2613 return; /* Somebody beat us to it, nothing to do */
2614 mutex_enter(&oipfb->ipfb_lock);
2615 ipf = oipfb->ipfb_ipf;
2616 if (ipf != NULL) {
2617 ill_frag_free_pkts(ill, oipfb, ipf, 1);
2619 mutex_exit(&oipfb->ipfb_lock);
2624 * free 'free_cnt' fragmented packets starting at ipf.
2626 void
2627 ill_frag_free_pkts(ill_t *ill, ipfb_t *ipfb, ipf_t *ipf, int free_cnt)
2629 size_t count;
2630 mblk_t *mp;
2631 mblk_t *tmp;
2632 ipf_t **ipfp = ipf->ipf_ptphn;
2634 ASSERT(MUTEX_HELD(&ipfb->ipfb_lock));
2635 ASSERT(ipfp != NULL);
2636 ASSERT(ipf != NULL);
2638 while (ipf != NULL && free_cnt-- > 0) {
2639 count = ipf->ipf_count;
2640 mp = ipf->ipf_mp;
2641 ipf = ipf->ipf_hash_next;
2642 for (tmp = mp; tmp; tmp = tmp->b_cont) {
2643 IP_REASS_SET_START(tmp, 0);
2644 IP_REASS_SET_END(tmp, 0);
2646 atomic_add_32(&ill->ill_frag_count, -count);
2647 ASSERT(ipfb->ipfb_count >= count);
2648 ipfb->ipfb_count -= count;
2649 ASSERT(ipfb->ipfb_frag_pkts > 0);
2650 ipfb->ipfb_frag_pkts--;
2651 BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails);
2652 ip_drop_input("ipIfStatsReasmFails", mp, ill);
2653 freemsg(mp);
2656 if (ipf)
2657 ipf->ipf_ptphn = ipfp;
2658 ipfp[0] = ipf;
2662 * Helper function for ill_forward_set().
2664 static void
2665 ill_forward_set_on_ill(ill_t *ill, boolean_t enable)
2667 ip_stack_t *ipst = ill->ill_ipst;
2669 ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock));
2671 ip1dbg(("ill_forward_set: %s %s forwarding on %s",
2672 (enable ? "Enabling" : "Disabling"),
2673 (ill->ill_isv6 ? "IPv6" : "IPv4"), ill->ill_name));
2674 mutex_enter(&ill->ill_lock);
2675 if (enable)
2676 ill->ill_flags |= ILLF_ROUTER;
2677 else
2678 ill->ill_flags &= ~ILLF_ROUTER;
2679 mutex_exit(&ill->ill_lock);
2680 if (ill->ill_isv6)
2681 ill_set_nce_router_flags(ill, enable);
2682 /* Notify routing socket listeners of this change. */
2683 if (ill->ill_ipif != NULL)
2684 ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT);
2688 * Set an ill's ILLF_ROUTER flag appropriately. Send up RTS_IFINFO routing
2689 * socket messages for each interface whose flags we change.
2692 ill_forward_set(ill_t *ill, boolean_t enable)
2694 ipmp_illgrp_t *illg;
2695 ip_stack_t *ipst = ill->ill_ipst;
2697 ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock));
2699 if ((enable && (ill->ill_flags & ILLF_ROUTER)) ||
2700 (!enable && !(ill->ill_flags & ILLF_ROUTER)))
2701 return (0);
2703 if (IS_LOOPBACK(ill))
2704 return (EINVAL);
2706 if (enable && ill->ill_allowed_ips_cnt > 0)
2707 return (EPERM);
2709 if (IS_IPMP(ill) || IS_UNDER_IPMP(ill)) {
2711 * Update all of the interfaces in the group.
2713 illg = ill->ill_grp;
2714 ill = list_head(&illg->ig_if);
2715 for (; ill != NULL; ill = list_next(&illg->ig_if, ill))
2716 ill_forward_set_on_ill(ill, enable);
2719 * Update the IPMP meta-interface.
2721 ill_forward_set_on_ill(ipmp_illgrp_ipmp_ill(illg), enable);
2722 return (0);
2725 ill_forward_set_on_ill(ill, enable);
2726 return (0);
2730 * Based on the ILLF_ROUTER flag of an ill, make sure all local nce's for
2731 * addresses assigned to the ill have the NCE_F_ISROUTER flag appropriately
2732 * set or clear.
2734 static void
2735 ill_set_nce_router_flags(ill_t *ill, boolean_t enable)
2737 ipif_t *ipif;
2738 ncec_t *ncec;
2739 nce_t *nce;
2741 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
2743 * NOTE: we match across the illgrp because nce's for
2744 * addresses on IPMP interfaces have an nce_ill that points to
2745 * the bound underlying ill.
2747 nce = nce_lookup_v6(ill, &ipif->ipif_v6lcl_addr);
2748 if (nce != NULL) {
2749 ncec = nce->nce_common;
2750 mutex_enter(&ncec->ncec_lock);
2751 if (enable)
2752 ncec->ncec_flags |= NCE_F_ISROUTER;
2753 else
2754 ncec->ncec_flags &= ~NCE_F_ISROUTER;
2755 mutex_exit(&ncec->ncec_lock);
2756 nce_refrele(nce);
2762 * Initializes the context structure and returns the first ill in the list.
2763 * Currently start_list and end_list can have values:
2764 * MAX_G_HEADS Traverse both IPV4 and IPV6 lists.
2765 * IP_V4_G_HEAD Traverse IPV4 list only.
2766 * IP_V6_G_HEAD Traverse IPV6 list only.
2770 * We don't check for CONDEMNED ills here. Caller must do that if
2771 * necessary under the ill lock.
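 *
 * A typical walk, holding ill_g_lock as reader (a sketch; the
 * ILL_START_WALK_* convenience macros in ip.h wrap the ill_first() call):
 *
 *	ill_walk_context_t ctx;
 *	ill_t *ill;
 *
 *	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
 *	for (ill = ILL_START_WALK_ALL(&ctx, ipst); ill != NULL;
 *	    ill = ill_next(&ctx, ill)) {
 *		... per-ill work ...
 *	}
 *	rw_exit(&ipst->ips_ill_g_lock);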
2773 ill_t *
2774 ill_first(int start_list, int end_list, ill_walk_context_t *ctx,
2775 ip_stack_t *ipst)
2777 ill_if_t *ifp;
2778 ill_t *ill;
2779 avl_tree_t *avl_tree;
2781 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));
2782 ASSERT(end_list <= MAX_G_HEADS && start_list >= 0);
2785 * setup the lists to search
2787 if (end_list != MAX_G_HEADS) {
2788 ctx->ctx_current_list = start_list;
2789 ctx->ctx_last_list = end_list;
2790 } else {
2791 ctx->ctx_last_list = MAX_G_HEADS - 1;
2792 ctx->ctx_current_list = 0;
2795 while (ctx->ctx_current_list <= ctx->ctx_last_list) {
2796 ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst);
2797 if (ifp != (ill_if_t *)
2798 &IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)) {
2799 avl_tree = &ifp->illif_avl_by_ppa;
2800 ill = avl_first(avl_tree);
2802 * ill is guaranteed to be non-NULL; otherwise ifp would
2803 * not have existed.
2805 ASSERT(ill != NULL);
2806 return (ill);
2808 ctx->ctx_current_list++;
2811 return (NULL);
2815 * returns the next ill in the list. ill_first() must have been called
2816 * before calling ill_next() or bad things will happen.
2820 * We don't check for CONDEMNED ills here. Caller must do that if
2821 * necessary under the ill lock.
2823 ill_t *
2824 ill_next(ill_walk_context_t *ctx, ill_t *lastill)
2826 ill_if_t *ifp;
2827 ill_t *ill;
2828 ip_stack_t *ipst = lastill->ill_ipst;
2830 ASSERT(lastill->ill_ifptr != (ill_if_t *)
2831 &IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst));
2832 if ((ill = avl_walk(&lastill->ill_ifptr->illif_avl_by_ppa, lastill,
2833 AVL_AFTER)) != NULL) {
2834 return (ill);
2837 /* goto next ill_ifp in the list. */
2838 ifp = lastill->ill_ifptr->illif_next;
2840 /* make sure not at end of circular list */
2841 while (ifp ==
2842 (ill_if_t *)&IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)) {
2843 if (++ctx->ctx_current_list > ctx->ctx_last_list)
2844 return (NULL);
2845 ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst);
2848 return (avl_first(&ifp->illif_avl_by_ppa));
2852 * Check interface name for correct format: [a-zA-Z]+[a-zA-Z0-9._]*[0-9]+
2853 * The final number (PPA) must not have any leading zeros. Upon success, a
2854 * pointer to the start of the PPA is returned; otherwise NULL is returned.
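 *
 * For example, "e1000g0" yields a pointer to the trailing "0", while
 * "hme01" is rejected for its leading zero in the PPA and "0ge0" for its
 * leading digit.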
2856 static char *
2857 ill_get_ppa_ptr(char *name)
2859 int namelen = strlen(name);
2860 int end_ndx = namelen - 1;
2861 int ppa_ndx, i;
2864 * Check that the first character is [a-zA-Z], and that the last
2865 * character is [0-9].
2867 if (namelen == 0 || !isalpha(name[0]) || !isdigit(name[end_ndx]))
2868 return (NULL);
2871 * Set `ppa_ndx' to the PPA start, and check for leading zeroes.
2873 for (ppa_ndx = end_ndx; ppa_ndx > 0; ppa_ndx--)
2874 if (!isdigit(name[ppa_ndx - 1]))
2875 break;
2877 if (name[ppa_ndx] == '0' && ppa_ndx < end_ndx)
2878 return (NULL);
2881 * Check that the intermediate characters are [a-zA-Z0-9._]
2883 for (i = 1; i < ppa_ndx; i++) {
2884 if (!isalpha(name[i]) && !isdigit(name[i]) &&
2885 name[i] != '.' && name[i] != '_') {
2886 return (NULL);
2890 return (name + ppa_ndx);
2894 * use avl tree to locate the ill.
2896 static ill_t *
2897 ill_find_by_name(char *name, boolean_t isv6, ip_stack_t *ipst)
2899 char *ppa_ptr = NULL;
2900 int len;
2901 uint_t ppa;
2902 ill_t *ill = NULL;
2903 ill_if_t *ifp;
2904 int list;
2907 * get ppa ptr
2909 if (isv6)
2910 list = IP_V6_G_HEAD;
2911 else
2912 list = IP_V4_G_HEAD;
2914 if ((ppa_ptr = ill_get_ppa_ptr(name)) == NULL) {
2915 return (NULL);
2918 len = ppa_ptr - name + 1;
2920 ppa = stoi(&ppa_ptr);
2922 ifp = IP_VX_ILL_G_LIST(list, ipst);
2924 while (ifp != (ill_if_t *)&IP_VX_ILL_G_LIST(list, ipst)) {
2926 * The match is done on len - 1 since the name is not null-
2927 * terminated; it contains the ppa in addition to the interface
2928 * name.
2930 if ((ifp->illif_name_len == len) &&
2931 bcmp(ifp->illif_name, name, len - 1) == 0) {
2932 break;
2933 } else {
2934 ifp = ifp->illif_next;
2938 if (ifp == (ill_if_t *)&IP_VX_ILL_G_LIST(list, ipst)) {
2940 * Not even the interface type exists.
2942 return (NULL);
2945 ill = avl_find(&ifp->illif_avl_by_ppa, (void *) &ppa, NULL);
2946 if (ill != NULL) {
2947 mutex_enter(&ill->ill_lock);
2948 if (ILL_CAN_LOOKUP(ill)) {
2949 ill_refhold_locked(ill);
2950 mutex_exit(&ill->ill_lock);
2951 return (ill);
2953 mutex_exit(&ill->ill_lock);
2955 return (NULL);
2959 * comparison function for use with avl.
2961 static int
2962 ill_compare_ppa(const void *ppa_ptr, const void *ill_ptr)
2964 uint_t ppa;
2965 uint_t ill_ppa;
2967 ASSERT(ppa_ptr != NULL && ill_ptr != NULL);
2969 ppa = *((uint_t *)ppa_ptr);
2970 ill_ppa = ((const ill_t *)ill_ptr)->ill_ppa;
2972 * We want the ill with the lowest ppa to be on the
2973 * top.
2975 if (ill_ppa < ppa)
2976 return (1);
2977 if (ill_ppa > ppa)
2978 return (-1);
2979 return (0);
2983 * remove an interface type from the global list.
2985 static void
2986 ill_delete_interface_type(ill_if_t *interface)
2988 ASSERT(interface != NULL);
2989 ASSERT(avl_numnodes(&interface->illif_avl_by_ppa) == 0);
2991 avl_destroy(&interface->illif_avl_by_ppa);
2992 if (interface->illif_ppa_arena != NULL)
2993 vmem_destroy(interface->illif_ppa_arena);
2995 remque(interface);
2997 mi_free(interface);
3001 * remove ill from the global list.
3003 static void
3004 ill_glist_delete(ill_t *ill)
3006 ip_stack_t *ipst;
3007 phyint_t *phyi;
3009 if (ill == NULL)
3010 return;
3011 ipst = ill->ill_ipst;
3012 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
3015 * If the ill was never inserted into the AVL tree
3016 * we skip the if branch.
3018 if (ill->ill_ifptr != NULL) {
3020 * remove from AVL tree and free ppa number
3022 avl_remove(&ill->ill_ifptr->illif_avl_by_ppa, ill);
3024 if (ill->ill_ifptr->illif_ppa_arena != NULL) {
3025 vmem_free(ill->ill_ifptr->illif_ppa_arena,
3026 (void *)(uintptr_t)(ill->ill_ppa+1), 1);
3028 if (avl_numnodes(&ill->ill_ifptr->illif_avl_by_ppa) == 0) {
3029 ill_delete_interface_type(ill->ill_ifptr);
3033 * Indicate ill is no longer in the list.
3035 ill->ill_ifptr = NULL;
3036 ill->ill_name_length = 0;
3037 ill->ill_name[0] = '\0';
3038 ill->ill_ppa = UINT_MAX;
3041 /* Generate one last event for this ill. */
3042 ill_nic_event_dispatch(ill, 0, NE_UNPLUMB, ill->ill_name,
3043 ill->ill_name_length);
3045 ASSERT(ill->ill_phyint != NULL);
3046 phyi = ill->ill_phyint;
3047 ill->ill_phyint = NULL;
3050 * ill_init always allocates a phyint to store the copy
3051 * of flags relevant to phyint. At that point in time, we could
3052 * not assign the name and hence phyint_illv4/v6 could not be
3053 * initialized. Later in ipif_set_values, we assign the name to
3054 * the ill, at which point in time we assign phyint_illv4/v6.
3055 * Thus we don't rely on phyint_illv6 to be initialized always.
3057 if (ill->ill_flags & ILLF_IPV6)
3058 phyi->phyint_illv6 = NULL;
3059 else
3060 phyi->phyint_illv4 = NULL;
3062 if (phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL) {
3063 rw_exit(&ipst->ips_ill_g_lock);
3064 return;
3068 * There are no ills left on this phyint; pull it out of the phyint
3069 * avl trees, and free it.
3071 if (phyi->phyint_ifindex > 0) {
3072 avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
3073 phyi);
3074 avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
3075 phyi);
3077 rw_exit(&ipst->ips_ill_g_lock);
3079 phyint_free(phyi);
3083 * Allocate a ppa. If the number of plumbed interfaces of this type is
3084 * less than ill_no_arena, do a linear search to find an unused ppa.
3085 * When the number goes beyond ill_no_arena, switch to using an arena.
3086 * Note: a ppa value of zero cannot be allocated from the vmem arena, as
3087 * zero is the return value for an error condition; so allocation starts
3088 * at one and the result is decremented by one.
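 * For example, ppa 0 is represented by vmem address 1: a successful
 * vmem_alloc() return of 1 is decremented to yield ill_ppa 0, and a
 * return of 0 still unambiguously signals failure.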
3090 static int
3091 ill_alloc_ppa(ill_if_t *ifp, ill_t *ill)
3093 ill_t *tmp_ill;
3094 uint_t start, end;
3095 int ppa;
3097 if (ifp->illif_ppa_arena == NULL &&
3098 (avl_numnodes(&ifp->illif_avl_by_ppa) + 1 > ill_no_arena)) {
3100 * Create an arena.
3102 ifp->illif_ppa_arena = vmem_create(ifp->illif_name,
3103 (void *)1, UINT_MAX - 1, 1, NULL, NULL,
3104 NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
3105 /* allocate what has already been assigned */
3106 for (tmp_ill = avl_first(&ifp->illif_avl_by_ppa);
3107 tmp_ill != NULL; tmp_ill = avl_walk(&ifp->illif_avl_by_ppa,
3108 tmp_ill, AVL_AFTER)) {
3109 ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena,
3110 1, /* size */
3111 1, /* align/quantum */
3112 0, /* phase */
3113 0, /* nocross */
3114 /* minaddr */
3115 (void *)((uintptr_t)tmp_ill->ill_ppa + 1),
3116 /* maxaddr */
3117 (void *)((uintptr_t)tmp_ill->ill_ppa + 2),
3118 VM_NOSLEEP|VM_FIRSTFIT);
3119 if (ppa == 0) {
3120 ip1dbg(("ill_alloc_ppa: ppa allocation"
3121 " failed while switching"));
3122 vmem_destroy(ifp->illif_ppa_arena);
3123 ifp->illif_ppa_arena = NULL;
3124 break;
3129 if (ifp->illif_ppa_arena != NULL) {
3130 if (ill->ill_ppa == UINT_MAX) {
3131 ppa = (int)(uintptr_t)vmem_alloc(ifp->illif_ppa_arena,
3132 1, VM_NOSLEEP|VM_FIRSTFIT);
3133 if (ppa == 0)
3134 return (EAGAIN);
3135 ill->ill_ppa = --ppa;
3136 } else {
3137 ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena,
3138 1, /* size */
3139 1, /* align/quantum */
3140 0, /* phase */
3141 0, /* nocross */
3142 (void *)(uintptr_t)(ill->ill_ppa + 1), /* minaddr */
3143 (void *)(uintptr_t)(ill->ill_ppa + 2), /* maxaddr */
3144 VM_NOSLEEP|VM_FIRSTFIT);
3146 * Most likely the allocation failed because
3147 * the requested ppa was in use.
3149 if (ppa == 0)
3150 return (EEXIST);
3152 return (0);
3156 * No arena is in use and not enough (>ill_no_arena) interfaces have
3157 * been plumbed to create one. Do a linear search to get an unused ppa.
3159 if (ill->ill_ppa == UINT_MAX) {
3160 end = UINT_MAX - 1;
3161 start = 0;
3162 } else {
3163 end = start = ill->ill_ppa;
3166 tmp_ill = avl_find(&ifp->illif_avl_by_ppa, (void *)&start, NULL);
3167 while (tmp_ill != NULL && tmp_ill->ill_ppa == start) {
3168 if (start++ >= end) {
3169 if (ill->ill_ppa == UINT_MAX)
3170 return (EAGAIN);
3171 else
3172 return (EEXIST);
3174 tmp_ill = avl_walk(&ifp->illif_avl_by_ppa, tmp_ill, AVL_AFTER);
3176 ill->ill_ppa = start;
3177 return (0);
3181 * Insert ill into the list of configured ill's. Once this function completes,
3182 * the ill is globally visible and is available through lookups. More precisely
3183 * this happens after the caller drops the ill_g_lock.
3185 static int
3186 ill_glist_insert(ill_t *ill, char *name, boolean_t isv6)
3188 ill_if_t *ill_interface;
3189 avl_index_t where = 0;
3190 int error;
3191 int name_length;
3192 int index;
3193 boolean_t check_length = B_FALSE;
3194 ip_stack_t *ipst = ill->ill_ipst;
3196 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
3198 name_length = mi_strlen(name) + 1;
3200 if (isv6)
3201 index = IP_V6_G_HEAD;
3202 else
3203 index = IP_V4_G_HEAD;
3205 ill_interface = IP_VX_ILL_G_LIST(index, ipst);
3207 * Search for interface type based on name
3209 while (ill_interface != (ill_if_t *)&IP_VX_ILL_G_LIST(index, ipst)) {
3210 if ((ill_interface->illif_name_len == name_length) &&
3211 (strcmp(ill_interface->illif_name, name) == 0)) {
3212 break;
3214 ill_interface = ill_interface->illif_next;
3218 * Interface type not found, create one.
3220 if (ill_interface == (ill_if_t *)&IP_VX_ILL_G_LIST(index, ipst)) {
3221 ill_g_head_t ghead;
3224 * allocate ill_if_t structure
3226 ill_interface = (ill_if_t *)mi_zalloc(sizeof (ill_if_t));
3227 if (ill_interface == NULL) {
3228 return (ENOMEM);
3231 (void) strcpy(ill_interface->illif_name, name);
3232 ill_interface->illif_name_len = name_length;
3234 avl_create(&ill_interface->illif_avl_by_ppa,
3235 ill_compare_ppa, sizeof (ill_t),
3236 offsetof(struct ill_s, ill_avl_byppa));
3239 * link the structure in at the back to maintain the order
3240 * of configuration for ifconfig output.
3242 ghead = ipst->ips_ill_g_heads[index];
3243 insque(ill_interface, ghead.ill_g_list_tail);
3246 if (ill->ill_ppa == UINT_MAX)
3247 check_length = B_TRUE;
3249 error = ill_alloc_ppa(ill_interface, ill);
3250 if (error != 0) {
3251 if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 0)
3252 ill_delete_interface_type(ill->ill_ifptr);
3253 return (error);
3257 * When the ppa is chosen by the system, check that there is
3258 * enough space to insert the ppa. If a specific ppa was passed in, this
3259 * check is not required as the interface name passed in will have
3260 * the right ppa in it.
3262 if (check_length) {
3264 * UINT_MAX - 1 should fit in 10 chars, alloc 12 chars.
3266 char buf[sizeof (uint_t) * 3];
3269 * convert ppa to string to calculate the amount of space
3270 * required for it in the name.
3272 numtos(ill->ill_ppa, buf);
3274 /* Do we have enough space to insert ppa ? */
3276 if ((mi_strlen(name) + mi_strlen(buf) + 1) > LIFNAMSIZ) {
3277 /* Free ppa and interface type struct */
3278 if (ill_interface->illif_ppa_arena != NULL) {
3279 vmem_free(ill_interface->illif_ppa_arena,
3280 (void *)(uintptr_t)(ill->ill_ppa+1), 1);
3282 if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 0)
3283 ill_delete_interface_type(ill->ill_ifptr);
3285 return (EINVAL);
3289 (void) sprintf(ill->ill_name, "%s%u", name, ill->ill_ppa);
3290 ill->ill_name_length = mi_strlen(ill->ill_name) + 1;
3292 (void) avl_find(&ill_interface->illif_avl_by_ppa, &ill->ill_ppa,
3293 &where);
3294 ill->ill_ifptr = ill_interface;
3295 avl_insert(&ill_interface->illif_avl_by_ppa, ill, where);
3297 ill_phyint_reinit(ill);
3298 return (0);
3301 /* Initialize the per phyint ipsq used for serialization */
3302 static boolean_t
3303 ipsq_init(ill_t *ill, boolean_t enter)
3305 ipsq_t *ipsq;
3306 ipxop_t *ipx;
3308 if ((ipsq = kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP)) == NULL)
3309 return (B_FALSE);
3311 ill->ill_phyint->phyint_ipsq = ipsq;
3312 ipx = ipsq->ipsq_xop = &ipsq->ipsq_ownxop;
3313 ipx->ipx_ipsq = ipsq;
3314 ipsq->ipsq_next = ipsq;
3315 ipsq->ipsq_phyint = ill->ill_phyint;
3316 mutex_init(&ipsq->ipsq_lock, NULL, MUTEX_DEFAULT, 0);
3317 mutex_init(&ipx->ipx_lock, NULL, MUTEX_DEFAULT, 0);
3318 ipsq->ipsq_ipst = ill->ill_ipst; /* No netstack_hold */
3319 if (enter) {
3320 ipx->ipx_writer = curthread;
3321 ipx->ipx_forced = B_FALSE;
3322 ipx->ipx_reentry_cnt = 1;
3323 #ifdef DEBUG
3324 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH);
3325 #endif
3327 return (B_TRUE);
3331 * Here we perform initialisation of the ill_t common to both regular
3332 * interface ILLs and the special loopback ILL created by ill_lookup_on_name.
3334 static int
3335 ill_init_common(ill_t *ill, queue_t *q, boolean_t isv6, boolean_t is_loopback,
3336 boolean_t ipsq_enter)
3338 int count;
3339 uchar_t *frag_ptr;
3341 mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, 0);
3342 mutex_init(&ill->ill_saved_ire_lock, NULL, MUTEX_DEFAULT, NULL);
3343 ill->ill_saved_ire_cnt = 0;
3345 if (is_loopback) {
3346 ill->ill_max_frag = isv6 ? ip_loopback_mtu_v6plus :
3347 ip_loopback_mtuplus;
3349 * No resolver here.
3351 ill->ill_net_type = IRE_LOOPBACK;
3352 } else {
3353 ill->ill_rq = q;
3354 ill->ill_wq = WR(q);
3355 ill->ill_ppa = UINT_MAX;
3358 ill->ill_isv6 = isv6;
3361 * Allocate sufficient space to contain our fragment hash table and
3362 * the device name.
3364 frag_ptr = (uchar_t *)mi_zalloc(ILL_FRAG_HASH_TBL_SIZE + 2 * LIFNAMSIZ);
3365 if (frag_ptr == NULL)
3366 return (ENOMEM);
3367 ill->ill_frag_ptr = frag_ptr;
3368 ill->ill_frag_free_num_pkts = 0;
3369 ill->ill_last_frag_clean_time = 0;
3370 ill->ill_frag_hash_tbl = (ipfb_t *)frag_ptr;
3371 ill->ill_name = (char *)(frag_ptr + ILL_FRAG_HASH_TBL_SIZE);
3372 for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) {
3373 mutex_init(&ill->ill_frag_hash_tbl[count].ipfb_lock,
3374 NULL, MUTEX_DEFAULT, NULL);
3377 ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t));
3378 if (ill->ill_phyint == NULL) {
3379 mi_free(frag_ptr);
3380 return (ENOMEM);
3383 mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0);
3384 if (isv6) {
3385 ill->ill_phyint->phyint_illv6 = ill;
3386 } else {
3387 ill->ill_phyint->phyint_illv4 = ill;
3389 if (is_loopback) {
3390 phyint_flags_init(ill->ill_phyint, DL_LOOP);
3393 list_create(&ill->ill_nce, sizeof (nce_t), offsetof(nce_t, nce_node));
3395 ill_set_inputfn(ill);
3397 if (!ipsq_init(ill, ipsq_enter)) {
3398 mi_free(frag_ptr);
3399 mi_free(ill->ill_phyint);
3400 return (ENOMEM);
3403 /* Frag queue limit stuff */
3404 ill->ill_frag_count = 0;
3405 ill->ill_ipf_gen = 0;
3407 rw_init(&ill->ill_mcast_lock, NULL, RW_DEFAULT, NULL);
3408 mutex_init(&ill->ill_mcast_serializer, NULL, MUTEX_DEFAULT, NULL);
3409 ill->ill_global_timer = INFINITY;
3410 ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0;
3411 ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0;
3412 ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS;
3413 ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL;
3416 * Initialize IPv6 configuration variables. The IP module is always
3417 * opened as an IPv4 module. Instead of tracking down the cases where
3418 * it switches to do IPv6, we'll just initialize the IPv6 configuration
3419 * here for convenience; this has no effect until the ill is set to do
3420 * IPv6.
3422 ill->ill_reachable_time = ND_REACHABLE_TIME;
3423 ill->ill_xmit_count = ND_MAX_MULTICAST_SOLICIT;
3424 ill->ill_max_buf = ND_MAX_Q;
3425 ill->ill_refcnt = 0;
3427 return (0);
3431 * ill_init is called by ip_open when a device control stream is opened.
3432 * It does a few initializations, and shoots a DL_INFO_REQ message down
3433 * to the driver. The response is later picked up in ip_rput_dlpi and
3434 * used to set up default mechanisms for talking to the driver. (Always
3435 * called as writer.)
3437 * If this function returns error, ip_open will call ip_close which in
3438 * turn will call ill_delete to clean up any memory allocated here that
3439 * is not yet freed.
3441 * Note: ill_ipst and ill_zoneid must be set before calling ill_init.
3444 ill_init(queue_t *q, ill_t *ill)
3446 int ret;
3447 dl_info_req_t *dlir;
3448 mblk_t *info_mp;
3450 info_mp = allocb(MAX(sizeof (dl_info_req_t), sizeof (dl_info_ack_t)),
3451 BPRI_HI);
3452 if (info_mp == NULL)
3453 return (ENOMEM);
3456 * For now pretend this is a v4 ill. We need to set phyint_ill*
3457 * at this point for the following reason: if we can't
3458 * enter the ipsq at some point and cv_wait, the writer that
3459 * wakes us up tries to locate us using the list of all phyints
3460 * in an ipsq and the ills from the phyint thru the phyint_ill*.
3461 * If we don't set it now, we risk a missed wakeup.
3463 if ((ret = ill_init_common(ill, q, B_FALSE, B_FALSE, B_TRUE)) != 0) {
3464 freemsg(info_mp);
3465 return (ret);
3468 ill->ill_state_flags |= ILL_LL_SUBNET_PENDING;
3470 /* Send down the Info Request to the driver. */
3471 info_mp->b_datap->db_type = M_PCPROTO;
3472 dlir = (dl_info_req_t *)info_mp->b_rptr;
3473 info_mp->b_wptr = (uchar_t *)&dlir[1];
3474 dlir->dl_primitive = DL_INFO_REQ;
3476 ill->ill_dlpi_pending = DL_PRIM_INVAL;
3478 qprocson(q);
3479 ill_dlpi_send(ill, info_mp);
3481 return (0);
3485 * ill_dls_info
3486 * creates datalink socket info from the device.
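 * The link name is placed at the front of sdl_data, and the physical
 * address (when present) immediately follows it, giving the usual
 * sockaddr_dl layout of sdl_nlen name bytes followed by sdl_alen
 * address bytes.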
3489 ill_dls_info(struct sockaddr_dl *sdl, const ill_t *ill)
3491 size_t len;
3493 sdl->sdl_family = AF_LINK;
3494 sdl->sdl_index = ill_get_upper_ifindex(ill);
3495 sdl->sdl_type = ill->ill_type;
3496 ill_get_name(ill, sdl->sdl_data, sizeof (sdl->sdl_data));
3497 len = strlen(sdl->sdl_data);
3498 ASSERT(len < 256);
3499 sdl->sdl_nlen = (uchar_t)len;
3500 sdl->sdl_alen = ill->ill_phys_addr_length;
3501 sdl->sdl_slen = 0;
3502 if (ill->ill_phys_addr_length != 0 && ill->ill_phys_addr != NULL)
3503 bcopy(ill->ill_phys_addr, &sdl->sdl_data[len], sdl->sdl_alen);
3505 return (sizeof (struct sockaddr_dl));
3509 * ill_xarp_info
3510 * creates xarp info from the device.
3512 static int
3513 ill_xarp_info(struct sockaddr_dl *sdl, ill_t *ill)
3515 sdl->sdl_family = AF_LINK;
3516 sdl->sdl_index = ill->ill_phyint->phyint_ifindex;
3517 sdl->sdl_type = ill->ill_type;
3518 ill_get_name(ill, sdl->sdl_data, sizeof (sdl->sdl_data));
3519 sdl->sdl_nlen = (uchar_t)mi_strlen(sdl->sdl_data);
3520 sdl->sdl_alen = ill->ill_phys_addr_length;
3521 sdl->sdl_slen = 0;
3522 return (sdl->sdl_nlen);
3525 static int
3526 loopback_kstat_update(kstat_t *ksp, int rw)
3528 kstat_named_t *kn;
3529 netstackid_t stackid;
3530 netstack_t *ns;
3531 ip_stack_t *ipst;
3533 if (ksp == NULL || ksp->ks_data == NULL)
3534 return (EIO);
3536 if (rw == KSTAT_WRITE)
3537 return (EACCES);
3539 kn = KSTAT_NAMED_PTR(ksp);
3540 stackid = (zoneid_t)(uintptr_t)ksp->ks_private;
3542 ns = netstack_find_by_stackid(stackid);
3543 if (ns == NULL)
3544 return (-1);
3546 ipst = ns->netstack_ip;
3547 if (ipst == NULL) {
3548 netstack_rele(ns);
3549 return (-1);
3551 kn[0].value.ui32 = ipst->ips_loopback_packets;
3552 kn[1].value.ui32 = ipst->ips_loopback_packets;
3553 netstack_rele(ns);
3554 return (0);
3558 * Has ifindex been plumbed already?
3560 static boolean_t
3561 phyint_exists(uint_t index, ip_stack_t *ipst)
3563 ASSERT(index != 0);
3564 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));
3566 return (avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
3567 &index, NULL) != NULL);
3571 * Pick a unique ifindex.
3572 * When the index counter passes IF_INDEX_MAX for the first time, the wrap
3573 * flag is set so that the next time ip_assign_ifindex() is called, it
3574 * falls through and resets the index counter back to 1, the minimum value
3575 * for the interface index. The logic below assumes that ips_ill_index
3576 * can hold a value of IF_INDEX_MAX+1 without any loss
3577 * (i.e. without wrapping back to 0).
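 *
 * For example, the first pass simply hands out 1, 2, 3, ... up to
 * IF_INDEX_MAX; once the wrap flag is set, each call instead probes
 * linearly from the current counter for an index with no existing phyint.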
3579 boolean_t
3580 ip_assign_ifindex(uint_t *indexp, ip_stack_t *ipst)
3582 uint_t loops;
3584 if (!ipst->ips_ill_index_wrap) {
3585 *indexp = ipst->ips_ill_index++;
3586 if (ipst->ips_ill_index > IF_INDEX_MAX) {
3588 * Reached the maximum ifindex value, set the wrap
3589 * flag to indicate that it is no longer possible
3590 * to assume that a given index is unallocated.
3592 ipst->ips_ill_index_wrap = B_TRUE;
3594 return (B_TRUE);
3597 if (ipst->ips_ill_index > IF_INDEX_MAX)
3598 ipst->ips_ill_index = 1;
3601 * Start reusing unused indexes. Note that we hold the ill_g_lock
3602 * at this point and don't want to call any function that attempts
3603 * to get the lock again.
3605 for (loops = IF_INDEX_MAX; loops > 0; loops--) {
3606 if (!phyint_exists(ipst->ips_ill_index, ipst)) {
3607 /* found unused index - use it */
3608 *indexp = ipst->ips_ill_index;
3609 return (B_TRUE);
3612 ipst->ips_ill_index++;
3613 if (ipst->ips_ill_index > IF_INDEX_MAX)
3614 ipst->ips_ill_index = 1;
3618 * all interface indices are in use.
3620 return (B_FALSE);
3624 * Assign a unique interface index for the phyint.
3626 static boolean_t
3627 phyint_assign_ifindex(phyint_t *phyi, ip_stack_t *ipst)
3629 ASSERT(phyi->phyint_ifindex == 0);
3630 return (ip_assign_ifindex(&phyi->phyint_ifindex, ipst));
3634 * Initialize the flags on `phyi' as per the provided mactype.
3636 static void
3637 phyint_flags_init(phyint_t *phyi, t_uscalar_t mactype)
3639 uint64_t flags = 0;
3642 * Initialize PHYI_RUNNING and PHYI_FAILED. For non-IPMP interfaces,
3643 * we always presume the underlying hardware is working and set
3644 * PHYI_RUNNING (if it's not, the driver will subsequently send a
3645 * DL_NOTE_LINK_DOWN message). For IPMP interfaces, at initialization
3646 * there are no active interfaces in the group so we set PHYI_FAILED.
3648 if (mactype == SUNW_DL_IPMP)
3649 flags |= PHYI_FAILED;
3650 else
3651 flags |= PHYI_RUNNING;
3653 switch (mactype) {
3654 case SUNW_DL_VNI:
3655 flags |= PHYI_VIRTUAL;
3656 break;
3657 case SUNW_DL_IPMP:
3658 flags |= PHYI_IPMP;
3659 break;
3660 case DL_LOOP:
3661 flags |= (PHYI_LOOPBACK | PHYI_VIRTUAL);
3662 break;
3665 mutex_enter(&phyi->phyint_lock);
3666 phyi->phyint_flags |= flags;
3667 mutex_exit(&phyi->phyint_lock);
3671 * Return a pointer to the ill which matches the supplied name. Note that
3672 * the ill name length includes the null termination character. (May be
3673 * called as writer.)
3674 * If do_alloc and the interface is "lo0" it will be automatically created.
3675 * We cannot bump up the reference on condemned ills, so duplicate
3676 * detection can't be done using this function.
3678 ill_t *
3679 ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6,
3680 boolean_t *did_alloc, ip_stack_t *ipst)
3682 ill_t *ill;
3683 ipif_t *ipif;
3684 ipsq_t *ipsq;
3685 kstat_named_t *kn;
3686 boolean_t isloopback;
3687 in6_addr_t ov6addr;
3689 isloopback = mi_strcmp(name, ipif_loopback_name) == 0;
3691 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
3692 ill = ill_find_by_name(name, isv6, ipst);
3693 rw_exit(&ipst->ips_ill_g_lock);
3694 if (ill != NULL)
3695 return (ill);
3698 * Couldn't find it. Does this happen to be a lookup for the
3699 * loopback device and are we allowed to allocate it?
3701 if (!isloopback || !do_alloc)
3702 return (NULL);
3704 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
3705 ill = ill_find_by_name(name, isv6, ipst);
3706 if (ill != NULL) {
3707 rw_exit(&ipst->ips_ill_g_lock);
3708 return (ill);
3711 /* Create the loopback device on demand */
3712 ill = (ill_t *)(mi_alloc(sizeof (ill_t) +
3713 sizeof (ipif_loopback_name), BPRI_MED));
3714 if (ill == NULL)
3715 goto done;
3717 bzero(ill, sizeof (*ill));
3718 ill->ill_ipst = ipst;
3719 netstack_hold(ipst->ips_netstack);
3721 * For exclusive stacks we set the zoneid to zero
3722 * to make IP operate as if in the global zone.
3724 ill->ill_zoneid = GLOBAL_ZONEID;
3726 if (ill_init_common(ill, NULL, isv6, B_TRUE, B_FALSE) != 0)
3727 goto done;
3729 if (!ill_allocate_mibs(ill))
3730 goto done;
3732 ill->ill_current_frag = ill->ill_max_frag;
3733 ill->ill_mtu = ill->ill_max_frag; /* Initial value */
3734 ill->ill_mc_mtu = ill->ill_mtu;
3736 * ipif_loopback_name can't be pointed at directly because it's used
3737 * by both the ipv4 and ipv6 interfaces. When the ill is removed
3738 * from the glist, ill_glist_delete() sets the first character of
3739 * ill_name to '\0'.
3741 ill->ill_name = (char *)ill + sizeof (*ill);
3742 (void) strcpy(ill->ill_name, ipif_loopback_name);
3743 ill->ill_name_length = sizeof (ipif_loopback_name);
3744 /* Set ill_dlpi_pending for ipsq_current_finish() to work properly */
3745 ill->ill_dlpi_pending = DL_PRIM_INVAL;
3747 ipif = ipif_allocate(ill, 0L, IRE_LOOPBACK, B_TRUE, B_TRUE, NULL);
3748 if (ipif == NULL)
3749 goto done;
3751 ill->ill_flags = ILLF_MULTICAST;
3753 ov6addr = ipif->ipif_v6lcl_addr;
3754 /* Set up default loopback address and mask. */
3755 if (!isv6) {
3756 ipaddr_t inaddr_loopback = htonl(INADDR_LOOPBACK);
3758 IN6_IPADDR_TO_V4MAPPED(inaddr_loopback, &ipif->ipif_v6lcl_addr);
3759 V4MASK_TO_V6(htonl(IN_CLASSA_NET), ipif->ipif_v6net_mask);
3760 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
3761 ipif->ipif_v6subnet);
3762 ill->ill_flags |= ILLF_IPV4;
3763 } else {
3764 ipif->ipif_v6lcl_addr = ipv6_loopback;
3765 ipif->ipif_v6net_mask = ipv6_all_ones;
3766 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
3767 ipif->ipif_v6subnet);
3768 ill->ill_flags |= ILLF_IPV6;
3772 * Chain us in at the end of the ill list. Hold the ill
3773 * before we make it globally visible. 1 for the lookup.
3775 ill_refhold(ill);
3777 ipsq = ill->ill_phyint->phyint_ipsq;
3779 if (ill_glist_insert(ill, "lo", isv6) != 0)
3780 cmn_err(CE_PANIC, "cannot insert loopback interface");
3782 /* Let SCTP know so that it can add this to its list */
3783 sctp_update_ill(ill, SCTP_ILL_INSERT);
3786 * We have already assigned ipif_v6lcl_addr above, but we need to
3787 * call sctp_update_ipif_addr() after SCTP_ILL_INSERT, which
3788 * needs to happen after ill_glist_insert() since we need the
3789 * ill_index set. Pass on ipv6_loopback as the old address.
3791 sctp_update_ipif_addr(ipif, ov6addr);
3793 ip_rts_newaddrmsg(RTM_CHGADDR, 0, ipif, RTSQ_DEFAULT);
3796 * ill_glist_insert() -> ill_phyint_reinit() may have merged IPSQs.
3797 * If so, free our original one.
3799 if (ipsq != ill->ill_phyint->phyint_ipsq)
3800 ipsq_delete(ipsq);
3802 if (ipst->ips_loopback_ksp == NULL) {
3803 /* Export loopback interface statistics */
3804 ipst->ips_loopback_ksp = kstat_create_netstack("lo", 0,
3805 ipif_loopback_name, "net",
3806 KSTAT_TYPE_NAMED, 2, 0,
3807 ipst->ips_netstack->netstack_stackid);
3808 if (ipst->ips_loopback_ksp != NULL) {
3809 ipst->ips_loopback_ksp->ks_update =
3810 loopback_kstat_update;
3811 kn = KSTAT_NAMED_PTR(ipst->ips_loopback_ksp);
3812 kstat_named_init(&kn[0], "ipackets", KSTAT_DATA_UINT32);
3813 kstat_named_init(&kn[1], "opackets", KSTAT_DATA_UINT32);
3814 ipst->ips_loopback_ksp->ks_private =
3815 (void *)(uintptr_t)ipst->ips_netstack->
3816 netstack_stackid;
3817 kstat_install(ipst->ips_loopback_ksp);
3821 *did_alloc = B_TRUE;
3822 rw_exit(&ipst->ips_ill_g_lock);
3823 ill_nic_event_dispatch(ill, MAP_IPIF_ID(ill->ill_ipif->ipif_id),
3824 NE_PLUMB, ill->ill_name, ill->ill_name_length);
3825 return (ill);
3826 done:
3827 if (ill != NULL) {
3828 if (ill->ill_phyint != NULL) {
3829 ipsq = ill->ill_phyint->phyint_ipsq;
3830 if (ipsq != NULL) {
3831 ipsq->ipsq_phyint = NULL;
3832 ipsq_delete(ipsq);
3834 mi_free(ill->ill_phyint);
3836 ill_free_mib(ill);
3837 if (ill->ill_ipst != NULL)
3838 netstack_rele(ill->ill_ipst->ips_netstack);
3839 mi_free(ill);
3841 rw_exit(&ipst->ips_ill_g_lock);
3842 return (NULL);
3846 * For IPP calls - use the ip_stack_t for global stack.
3848 ill_t *
3849 ill_lookup_on_ifindex_global_instance(uint_t index, boolean_t isv6)
3851 ip_stack_t *ipst;
3852 ill_t *ill;
3853 netstack_t *ns;
3855 ns = netstack_find_by_stackid(GLOBAL_NETSTACKID);
3857 if ((ipst = ns->netstack_ip) == NULL) {
3858 cmn_err(CE_WARN, "No ip_stack_t for zoneid zero!\n");
3859 netstack_rele(ns);
3860 return (NULL);
3863 ill = ill_lookup_on_ifindex(index, isv6, ipst);
3864 netstack_rele(ns);
3865 return (ill);
3869 * Return a pointer to the ill which matches the index and IP version type.
3871 ill_t *
3872 ill_lookup_on_ifindex(uint_t index, boolean_t isv6, ip_stack_t *ipst)
3874 ill_t *ill;
3875 phyint_t *phyi;
3878 * Indexes are stored in the phyint - a structure common
3879 * to both IPv4 and IPv6.
3881 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
3882 phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
3883 (void *) &index, NULL);
3884 if (phyi != NULL) {
3885 ill = isv6 ? phyi->phyint_illv6 : phyi->phyint_illv4;
3886 if (ill != NULL) {
3887 mutex_enter(&ill->ill_lock);
3888 if (!ILL_IS_CONDEMNED(ill)) {
3889 ill_refhold_locked(ill);
3890 mutex_exit(&ill->ill_lock);
3891 rw_exit(&ipst->ips_ill_g_lock);
3892 return (ill);
3894 mutex_exit(&ill->ill_lock);
3897 rw_exit(&ipst->ips_ill_g_lock);
3898 return (NULL);
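
/*
 * Illustrative sketch (hypothetical helper, not part of the surrounding
 * API): the canonical consumer pattern for the lookup above.
 * ill_lookup_on_ifindex() returns a refheld ill (or NULL), so every
 * successful lookup must be paired with an ill_refrele().
 */
static boolean_t
example_ifindex_is_ipmp(uint_t ifindex, ip_stack_t *ipst)
{
	ill_t *ill;
	boolean_t ipmp;

	ill = ill_lookup_on_ifindex(ifindex, B_FALSE, ipst);
	if (ill == NULL)
		return (B_FALSE);
	ipmp = IS_IPMP(ill);	/* use the ill only while the ref is held */
	ill_refrele(ill);	/* drop the reference taken by the lookup */
	return (ipmp);
}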
3902 * Verify whether or not an interface index is valid for the specified zoneid
3903 * to transmit packets.
3904 * It can be zero (meaning "reset") or an interface index assigned
3905 * to a non-VNI interface. (We don't use VNI interfaces to send packets.)
3907 boolean_t
3908 ip_xmit_ifindex_valid(uint_t ifindex, zoneid_t zoneid, boolean_t isv6,
3909 ip_stack_t *ipst)
3911 ill_t *ill;
3913 if (ifindex == 0)
3914 return (B_TRUE);
3916 ill = ill_lookup_on_ifindex_zoneid(ifindex, zoneid, isv6, ipst);
3917 if (ill == NULL)
3918 return (B_FALSE);
3919 if (IS_VNI(ill)) {
3920 ill_refrele(ill);
3921 return (B_FALSE);
3923 ill_refrele(ill);
3924 return (B_TRUE);
3928 * Return the ifindex next in sequence after the passed in ifindex.
3929 * If there is no next ifindex for the given protocol, return 0.
3931 uint_t
3932 ill_get_next_ifindex(uint_t index, boolean_t isv6, ip_stack_t *ipst)
3934 phyint_t *phyi;
3935 phyint_t *phyi_initial;
3936 uint_t ifindex;
3938 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
3940 if (index == 0) {
3941 phyi = avl_first(
3942 &ipst->ips_phyint_g_list->phyint_list_avl_by_index);
3943 } else {
3944 phyi = phyi_initial = avl_find(
3945 &ipst->ips_phyint_g_list->phyint_list_avl_by_index,
3946 (void *) &index, NULL);
3949 for (; phyi != NULL;
3950 phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
3951 phyi, AVL_AFTER)) {
3953 * If we're not returning the first interface in the tree
3954 * and we still haven't moved past the phyint_t that
3955 * corresponds to index, avl_walk needs to be called again
3957 if (!((index != 0) && (phyi == phyi_initial))) {
3958 if (isv6) {
3959 if ((phyi->phyint_illv6) &&
3960 ILL_CAN_LOOKUP(phyi->phyint_illv6) &&
3961 (phyi->phyint_illv6->ill_isv6 == 1))
3962 break;
3963 } else {
3964 if ((phyi->phyint_illv4) &&
3965 ILL_CAN_LOOKUP(phyi->phyint_illv4) &&
3966 (phyi->phyint_illv4->ill_isv6 == 0))
3967 break;
3972 rw_exit(&ipst->ips_ill_g_lock);
3974 if (phyi != NULL)
3975 ifindex = phyi->phyint_ifindex;
3976 else
3977 ifindex = 0;
3979 return (ifindex);
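
/*
 * Illustrative sketch (hypothetical helper): walking every ifindex for
 * one protocol by feeding each result of ill_get_next_ifindex() back
 * in; a return value of 0 ends the walk.
 */
static void
example_walk_ifindexes(boolean_t isv6, ip_stack_t *ipst)
{
	uint_t ifindex;

	for (ifindex = ill_get_next_ifindex(0, isv6, ipst); ifindex != 0;
	    ifindex = ill_get_next_ifindex(ifindex, isv6, ipst)) {
		/* every ifindex seen here has a plumbed ill of this type */
	}
}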
3983 * Return the ifindex for the named interface.
3984 * If there is no such interface, return 0.
3986 uint_t
3987 ill_get_ifindex_by_name(char *name, ip_stack_t *ipst)
3989 phyint_t *phyi;
3990 avl_index_t where = 0;
3991 uint_t ifindex;
3993 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
3995 if ((phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
3996 name, &where)) == NULL) {
3997 rw_exit(&ipst->ips_ill_g_lock);
3998 return (0);
4001 ifindex = phyi->phyint_ifindex;
4003 rw_exit(&ipst->ips_ill_g_lock);
4005 return (ifindex);
4009 * Return the ifindex to be used by upper layer protocols, for instance
4010 * for IPV6_RECVPKTINFO. Under IPMP this is the ifindex of the upper ill.
4012 uint_t
4013 ill_get_upper_ifindex(const ill_t *ill)
4015 if (IS_UNDER_IPMP(ill))
4016 return (ipmp_ill_get_ipmp_ifindex(ill));
4017 else
4018 return (ill->ill_phyint->phyint_ifindex);
4023 * Obtain a reference to the ill. The ill_refcnt is a dynamic refcnt
4024 * that gives a running thread a reference to the ill. This reference must be
4025 * released by the thread when it is done accessing the ill and related
4026 * objects. ill_refcnt cannot be used to account for static references
4027 * such as other structures pointing to an ill. Callers must generally
4028 * check whether an ill can be refheld by using ILL_CAN_LOOKUP macros
4029 * or be sure that the ill is not being deleted or changing state before
4030 * calling the refhold functions. A non-zero ill_refcnt ensures that the
4031 * ill won't change any of its critical state such as address, netmask etc.
4033 void
4034 ill_refhold(ill_t *ill)
4036 mutex_enter(&ill->ill_lock);
4037 ill->ill_refcnt++;
4038 ILL_TRACE_REF(ill);
4039 mutex_exit(&ill->ill_lock);
4042 void
4043 ill_refhold_locked(ill_t *ill)
4045 ASSERT(MUTEX_HELD(&ill->ill_lock));
4046 ill->ill_refcnt++;
4047 ILL_TRACE_REF(ill);
4050 /* Returns true if we managed to get a refhold */
4051 boolean_t
4052 ill_check_and_refhold(ill_t *ill)
4054 mutex_enter(&ill->ill_lock);
4055 if (!ILL_IS_CONDEMNED(ill)) {
4056 ill_refhold_locked(ill);
4057 mutex_exit(&ill->ill_lock);
4058 return (B_TRUE);
4060 mutex_exit(&ill->ill_lock);
4061 return (B_FALSE);
4065 * Must not be called while holding any locks. Otherwise if this is
4066 * the last reference to be released, there is a chance of recursive mutex
4067 * panic due to ill_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying
4068 * to restart an ioctl.
4070 void
4071 ill_refrele(ill_t *ill)
4073 mutex_enter(&ill->ill_lock);
4074 ASSERT(ill->ill_refcnt != 0);
4075 ill->ill_refcnt--;
4076 ILL_UNTRACE_REF(ill);
4077 if (ill->ill_refcnt != 0) {
4078 /* Every ire pointing to the ill adds 1 to ill_refcnt */
4079 mutex_exit(&ill->ill_lock);
4080 return;
4083 /* Drops the ill_lock */
4084 ipif_ill_refrele_tail(ill);
4088 * Obtain a weak reference count on the ill. This reference ensures the
4089 * ill won't be freed, but the ill may change any of its critical state
4090 * such as netmask, address, etc. Returns B_FALSE if the ill has
4091 * started closing.
4093 boolean_t
4094 ill_waiter_inc(ill_t *ill)
4096 mutex_enter(&ill->ill_lock);
4097 if (ill->ill_state_flags & ILL_CONDEMNED) {
4098 mutex_exit(&ill->ill_lock);
4099 return (B_FALSE);
4101 ill->ill_waiters++;
4102 mutex_exit(&ill->ill_lock);
4103 return (B_TRUE);
4106 void
4107 ill_waiter_dcr(ill_t *ill)
4109 mutex_enter(&ill->ill_lock);
4110 ill->ill_waiters--;
4111 if (ill->ill_waiters == 0)
4112 cv_broadcast(&ill->ill_cv);
4113 mutex_exit(&ill->ill_lock);
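
/*
 * Illustrative sketch (hypothetical helper): the waiter pair above
 * brackets a window in which the ill may still change state but is
 * guaranteed not to be freed.
 */
static boolean_t
example_with_ill_pinned(ill_t *ill)
{
	if (!ill_waiter_inc(ill))
		return (B_FALSE);	/* the ill has started closing */
	/* the ill cannot be freed here, though its state may change */
	ill_waiter_dcr(ill);
	return (B_TRUE);
}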
4117 * ip_ll_subnet_defaults is called when we get the DL_INFO_ACK back from the
4118 * driver. We construct best guess defaults for lower level information that
4119 * we need. If an interface is brought up without injection of any overriding
4120 * information from outside, we have to be ready to go with these defaults.
4121 * When we get the first DL_INFO_ACK (from ip_open() sending a DL_INFO_REQ)
4122 * we primarily want the dl_provider_style.
4123 * The subsequent DL_INFO_ACK is received after doing a DL_ATTACH and DL_BIND
4124 * at which point we assume the other part of the information is valid.
4126 void
4127 ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp)
4129 uchar_t *brdcst_addr;
4130 uint_t brdcst_addr_length, phys_addr_length;
4131 t_scalar_t sap_length;
4132 dl_info_ack_t *dlia;
4133 ip_m_t *ipm;
4134 dl_qos_cl_sel1_t *sel1;
4135 int min_mtu;
4137 ASSERT(IAM_WRITER_ILL(ill));
4140 * Until the ill is fully up, the ill is not globally visible,
4141 * so there is no need for a lock.
4143 dlia = (dl_info_ack_t *)mp->b_rptr;
4144 ill->ill_mactype = dlia->dl_mac_type;
4146 ipm = ip_m_lookup(dlia->dl_mac_type);
4147 if (ipm == NULL) {
4148 ipm = ip_m_lookup(DL_OTHER);
4149 ASSERT(ipm != NULL);
4151 ill->ill_media = ipm;
4154 * When the new DLPI stuff is ready we'll pull lengths
4155 * from dlia.
4157 if (dlia->dl_version == DL_VERSION_2) {
4158 brdcst_addr_length = dlia->dl_brdcst_addr_length;
4159 brdcst_addr = mi_offset_param(mp, dlia->dl_brdcst_addr_offset,
4160 brdcst_addr_length);
4161 if (brdcst_addr == NULL) {
4162 brdcst_addr_length = 0;
4164 sap_length = dlia->dl_sap_length;
4165 phys_addr_length = dlia->dl_addr_length - ABS(sap_length);
4166 ip1dbg(("ip: bcast_len %d, sap_len %d, phys_len %d\n",
4167 brdcst_addr_length, sap_length, phys_addr_length));
4168 } else {
4169 brdcst_addr_length = 6;
4170 brdcst_addr = ip_six_byte_all_ones;
4171 sap_length = -2;
4172 phys_addr_length = brdcst_addr_length;
4175 ill->ill_bcast_addr_length = brdcst_addr_length;
4176 ill->ill_phys_addr_length = phys_addr_length;
4177 ill->ill_sap_length = sap_length;
4180 * Synthetic DLPI types such as SUNW_DL_IPMP specify a zero SDU,
4181 * but we must ensure a minimum IP MTU is used since other bits of
4182 * IP will fly apart otherwise.
4184 min_mtu = ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU;
4185 ill->ill_max_frag = MAX(min_mtu, dlia->dl_max_sdu);
4186 ill->ill_current_frag = ill->ill_max_frag;
4187 ill->ill_mtu = ill->ill_max_frag;
4188 ill->ill_mc_mtu = ill->ill_mtu; /* Overridden by DL_NOTE_SDU_SIZE2 */
4190 ill->ill_type = ipm->ip_m_type;
4192 if (!ill->ill_dlpi_style_set) {
4193 if (dlia->dl_provider_style == DL_STYLE2)
4194 ill->ill_needs_attach = 1;
4196 phyint_flags_init(ill->ill_phyint, ill->ill_mactype);
4199 * Allocate the first ipif on this ill. We don't delay it
4200 * further as ioctl handling assumes at least one ipif exists.
4202 * At this point we don't know whether the ill is v4 or v6.
4203 * We will know this when the SIOCSLIFNAME happens and
4204 * the correct value for ill_isv6 will be assigned in
4205 * ipif_set_values(). We need to hold the ill lock and
4206 * clear the ILL_LL_SUBNET_PENDING flag and atomically do
4207 * the wakeup.
4209 (void) ipif_allocate(ill, 0, IRE_LOCAL,
4210 dlia->dl_provider_style != DL_STYLE2, B_TRUE, NULL);
4211 mutex_enter(&ill->ill_lock);
4212 ASSERT(ill->ill_dlpi_style_set == 0);
4213 ill->ill_dlpi_style_set = 1;
4214 ill->ill_state_flags &= ~ILL_LL_SUBNET_PENDING;
4215 cv_broadcast(&ill->ill_cv);
4216 mutex_exit(&ill->ill_lock);
4217 freemsg(mp);
4218 return;
4220 ASSERT(ill->ill_ipif != NULL);
4222 * We know whether it is IPv4 or IPv6 now, as this is the
4223 * second DL_INFO_ACK we are receiving in response to the
4224 * DL_INFO_REQ sent in ipif_set_values.
4226 ill->ill_sap = (ill->ill_isv6) ? ipm->ip_m_ipv6sap : ipm->ip_m_ipv4sap;
4228 * Clear all the flags that were set based on ill_bcast_addr_length
4229 * and ill_phys_addr_length (in ipif_set_values) as these could have
4230 * changed now and we need to re-evaluate.
4232 ill->ill_flags &= ~(ILLF_MULTICAST | ILLF_NONUD | ILLF_NOARP);
4233 ill->ill_ipif->ipif_flags &= ~(IPIF_BROADCAST | IPIF_POINTOPOINT);
4236 * Free ill_bcast_mp as things could have changed now.
4238 * NOTE: The IPMP meta-interface is special-cased because it starts
4239 * with no underlying interfaces (and thus an unknown broadcast
4240 * address length), but we enforce that an interface is broadcast-
4241 * capable as part of allowing it to join a group.
4243 if (ill->ill_bcast_addr_length == 0 && !IS_IPMP(ill)) {
4244 if (ill->ill_bcast_mp != NULL)
4245 freemsg(ill->ill_bcast_mp);
4246 ill->ill_net_type = IRE_IF_NORESOLVER;
4248 ill->ill_bcast_mp = ill_dlur_gen(NULL,
4249 ill->ill_phys_addr_length,
4250 ill->ill_sap,
4251 ill->ill_sap_length);
4253 if (ill->ill_isv6)
4255 * Note: xresolv interfaces will eventually need NOARP
4256 * set here as well, but that will require those
4257 * external resolvers to have some knowledge of
4258 * that flag and act appropriately. Not to be changed
4259 * at present.
4261 ill->ill_flags |= ILLF_NONUD;
4262 else
4263 ill->ill_flags |= ILLF_NOARP;
4265 if (ill->ill_mactype == SUNW_DL_VNI) {
4266 ill->ill_ipif->ipif_flags |= IPIF_NOXMIT;
4267 } else if (ill->ill_phys_addr_length == 0 ||
4268 ill->ill_mactype == DL_IPV4 ||
4269 ill->ill_mactype == DL_IPV6) {
4271 * The underlying link is point-to-point, so mark the
4272 * interface as such. We can do IP multicast over
4273 * such a link since it transmits all network-layer
4274 * packets to the remote side the same way.
4276 ill->ill_flags |= ILLF_MULTICAST;
4277 ill->ill_ipif->ipif_flags |= IPIF_POINTOPOINT;
4279 } else {
4280 ill->ill_net_type = IRE_IF_RESOLVER;
4281 if (ill->ill_bcast_mp != NULL)
4282 freemsg(ill->ill_bcast_mp);
4283 ill->ill_bcast_mp = ill_dlur_gen(brdcst_addr,
4284 ill->ill_bcast_addr_length, ill->ill_sap,
4285 ill->ill_sap_length);
4287 * Later detect lack of DLPI driver multicast
4288 * capability by catching DL_ENABMULTI errors in
4289 * ip_rput_dlpi.
4291 ill->ill_flags |= ILLF_MULTICAST;
4292 if (!ill->ill_isv6)
4293 ill->ill_ipif->ipif_flags |= IPIF_BROADCAST;
4296 /* For IPMP, PHYI_IPMP should already be set by phyint_flags_init() */
4297 if (ill->ill_mactype == SUNW_DL_IPMP)
4298 ASSERT(ill->ill_phyint->phyint_flags & PHYI_IPMP);
4300 /* By default an interface does not support any CoS marking */
4301 ill->ill_flags &= ~ILLF_COS_ENABLED;
4304 * If we get QoS information in DL_INFO_ACK, the device supports
4305 * some form of CoS marking, set ILLF_COS_ENABLED.
4307 sel1 = (dl_qos_cl_sel1_t *)mi_offset_param(mp, dlia->dl_qos_offset,
4308 dlia->dl_qos_length);
4309 if ((sel1 != NULL) && (sel1->dl_qos_type == DL_QOS_CL_SEL1)) {
4310 ill->ill_flags |= ILLF_COS_ENABLED;
4313 /* Clear any previous error indication. */
4314 ill->ill_error = 0;
4315 freemsg(mp);
4319 * Perform various checks to verify that an address would make sense as a
4320 * local, remote, or subnet interface address.
4322 static boolean_t
4323 ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask)
4325 ipaddr_t net_mask;
4328 * Don't allow all zeroes or all ones, but do allow
4329 * an all-ones netmask.
4331 if ((net_mask = ip_net_mask(addr)) == 0)
4332 return (B_FALSE);
4333 /* A given netmask overrides the "guess" netmask */
4334 if (subnet_mask != 0)
4335 net_mask = subnet_mask;
4336 if ((net_mask != ~(ipaddr_t)0) && ((addr == (addr & net_mask)) ||
4337 (addr == (addr | ~net_mask)))) {
4338 return (B_FALSE);
4342 * Even if the netmask is all ones, we do not allow address to be
4343 * 255.255.255.255
4345 if (addr == INADDR_BROADCAST)
4346 return (B_FALSE);
4348 if (CLASSD(addr))
4349 return (B_FALSE);
4351 return (B_TRUE);
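
/*
 * Illustrative sketch (hypothetical helper): what the checks above
 * accept and reject for a 255.255.255.0 subnet mask.
 */
static void
example_addr_ok_v4(void)
{
	ipaddr_t mask = htonl(0xffffff00);	/* 255.255.255.0 */

	ASSERT(ip_addr_ok_v4(htonl(0xc0000201), mask));	 /* 192.0.2.1 */
	ASSERT(!ip_addr_ok_v4(htonl(0xc0000200), mask)); /* all-zeroes host */
	ASSERT(!ip_addr_ok_v4(htonl(0xc00002ff), mask)); /* all-ones host */
	ASSERT(!ip_addr_ok_v4(INADDR_BROADCAST, mask));	 /* 255.255.255.255 */
	ASSERT(!ip_addr_ok_v4(htonl(0xe0000001), mask)); /* CLASSD/multicast */
}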
4354 #define V6_IPIF_LINKLOCAL(p) \
4355 IN6_IS_ADDR_LINKLOCAL(&(p)->ipif_v6lcl_addr)
4358 * Compare two given ipifs and check if the second one is better than
4359 * the first one using the order of preference (not taking deprecated
4360 * into account) specified in ipif_lookup_multicast().
4362 static boolean_t
4363 ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif, boolean_t isv6)
4365 /* Check the least preferred first. */
4366 if (IS_LOOPBACK(old_ipif->ipif_ill)) {
4367 /* If both ipifs are the same, use the first one. */
4368 if (IS_LOOPBACK(new_ipif->ipif_ill))
4369 return (B_FALSE);
4370 else
4371 return (B_TRUE);
4374 /* For IPv6, check for link local address. */
4375 if (isv6 && V6_IPIF_LINKLOCAL(old_ipif)) {
4376 if (IS_LOOPBACK(new_ipif->ipif_ill) ||
4377 V6_IPIF_LINKLOCAL(new_ipif)) {
4378 /* The second one is equal or less preferred. */
4379 return (B_FALSE);
4380 } else {
4381 return (B_TRUE);
4385 /* Then check for point to point interface. */
4386 if (old_ipif->ipif_flags & IPIF_POINTOPOINT) {
4387 if (IS_LOOPBACK(new_ipif->ipif_ill) ||
4388 (isv6 && V6_IPIF_LINKLOCAL(new_ipif)) ||
4389 (new_ipif->ipif_flags & IPIF_POINTOPOINT)) {
4390 return (B_FALSE);
4391 } else {
4392 return (B_TRUE);
4396 /* old_ipif is a normal interface, so no need to use the new one. */
4397 return (B_FALSE);
4401 * Find a multicast-capable ipif given an IP instance and zoneid.
4402 * The ipif must be up, and its ill must be multicast-capable, not
4403 * condemned, not an underlying interface in an IPMP group, and
4404 * not a VNI interface. Order of preference:
4406 * 1a. normal
4407 * 1b. normal, but deprecated
4408 * 2a. point to point
4409 * 2b. point to point, but deprecated
4410 * 3a. link local
4411 * 3b. link local, but deprecated
4412 * 4. loopback.
4414 static ipif_t *
4415 ipif_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6)
4417 ill_t *ill;
4418 ill_walk_context_t ctx;
4419 ipif_t *ipif;
4420 ipif_t *saved_ipif = NULL;
4421 ipif_t *dep_ipif = NULL;
4423 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
4424 if (isv6)
4425 ill = ILL_START_WALK_V6(&ctx, ipst);
4426 else
4427 ill = ILL_START_WALK_V4(&ctx, ipst);
4429 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
4430 mutex_enter(&ill->ill_lock);
4431 if (IS_VNI(ill) || IS_UNDER_IPMP(ill) ||
4432 ILL_IS_CONDEMNED(ill) ||
4433 !(ill->ill_flags & ILLF_MULTICAST)) {
4434 mutex_exit(&ill->ill_lock);
4435 continue;
4437 for (ipif = ill->ill_ipif; ipif != NULL;
4438 ipif = ipif->ipif_next) {
4439 if (zoneid != ipif->ipif_zoneid &&
4440 zoneid != ALL_ZONES &&
4441 ipif->ipif_zoneid != ALL_ZONES) {
4442 continue;
4444 if (!(ipif->ipif_flags & IPIF_UP) ||
4445 IPIF_IS_CONDEMNED(ipif)) {
4446 continue;
4450 * Found one candidate. If it is deprecated,
4451 * remember it in dep_ipif. If it is not deprecated,
4452 * remember it in saved_ipif.
4454 if (ipif->ipif_flags & IPIF_DEPRECATED) {
4455 if (dep_ipif == NULL) {
4456 dep_ipif = ipif;
4457 } else if (ipif_comp_multi(dep_ipif, ipif,
4458 isv6)) {
4460 * If the previous dep_ipif does not
4461 * belong to the same ill, we've done
4462 * an ipif_refhold() on it. So we need
4463 * to release it.
4465 if (dep_ipif->ipif_ill != ill)
4466 ipif_refrele(dep_ipif);
4467 dep_ipif = ipif;
4469 continue;
4471 if (saved_ipif == NULL) {
4472 saved_ipif = ipif;
4473 } else {
4474 if (ipif_comp_multi(saved_ipif, ipif, isv6)) {
4475 if (saved_ipif->ipif_ill != ill)
4476 ipif_refrele(saved_ipif);
4477 saved_ipif = ipif;
4482 * Before going to the next ill, do an ipif_refhold() on the
4483 * saved ones.
4485 if (saved_ipif != NULL && saved_ipif->ipif_ill == ill)
4486 ipif_refhold_locked(saved_ipif);
4487 if (dep_ipif != NULL && dep_ipif->ipif_ill == ill)
4488 ipif_refhold_locked(dep_ipif);
4489 mutex_exit(&ill->ill_lock);
4491 rw_exit(&ipst->ips_ill_g_lock);
4494 * If we have only the saved_ipif, return it. But if we have both
4495 * saved_ipif and dep_ipif, check to see which one is better.
4497 if (saved_ipif != NULL) {
4498 if (dep_ipif != NULL) {
4499 if (ipif_comp_multi(saved_ipif, dep_ipif, isv6)) {
4500 ipif_refrele(saved_ipif);
4501 return (dep_ipif);
4502 } else {
4503 ipif_refrele(dep_ipif);
4504 return (saved_ipif);
4507 return (saved_ipif);
4508 } else {
4509 return (dep_ipif);
4513 ill_t *
4514 ill_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6)
4516 ipif_t *ipif;
4517 ill_t *ill;
4519 ipif = ipif_lookup_multicast(ipst, zoneid, isv6);
4520 if (ipif == NULL)
4521 return (NULL);
4523 ill = ipif->ipif_ill;
4524 ill_refhold(ill);
4525 ipif_refrele(ipif);
4526 return (ill);
4530 * This function is called when an application does not specify an interface
4531 * to be used for multicast traffic (joining a group/sending data). It
4532 * calls ire_lookup_multi() to look for an interface route for the
4533 * specified multicast group. Doing this allows the administrator to add
4534 * prefix routes for multicast to indicate which interface to be used for
4535 * multicast traffic in the above scenario. The route could be for all
4536 * multicast (224.0/4), for a single multicast group (a /32 route) or
4537 * anything in between. If there is no such multicast route, we just find
4538 * any multicast-capable interface and return it. The returned ill
4539 * is refheld.
4541 ill_t *
4542 ill_lookup_group_v4(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst,
4543 ipaddr_t *setsrcp)
4545 ill_t *ill;
4547 ill = ire_lookup_multi_ill_v4(group, zoneid, ipst, setsrcp);
4548 if (ill != NULL)
4549 return (ill);
4551 return (ill_lookup_multicast(ipst, zoneid, B_FALSE));
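
/*
 * Illustrative sketch (hypothetical helper): resolving the outbound ill
 * for a join with no interface specified, as described above. setsrc
 * is only meaningful if the matched route carried RTF_SETSRC.
 */
static ill_t *
example_resolve_multicast_ill(ipaddr_t group, zoneid_t zoneid,
    ip_stack_t *ipst)
{
	ipaddr_t setsrc = INADDR_ANY;
	ill_t *ill;

	/* prefers an admin-configured multicast route, e.g. 224.0.0.0/4 */
	ill = ill_lookup_group_v4(group, zoneid, ipst, &setsrc);
	return (ill);	/* refheld; the caller must ill_refrele() it */
}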
4555 * Look for an ipif with the specified interface address and destination.
4556 * The destination address is used only for matching point-to-point interfaces.
4558 ipif_t *
4559 ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, ip_stack_t *ipst)
4561 ipif_t *ipif;
4562 ill_t *ill;
4563 ill_walk_context_t ctx;
4566 * First match all the point-to-point interfaces
4567 * before looking at non-point-to-point interfaces.
4568 * This is done to avoid returning a non-point-to-point
4569 * ipif instead of an unnumbered point-to-point ipif.
4571 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
4572 ill = ILL_START_WALK_V4(&ctx, ipst);
4573 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
4574 mutex_enter(&ill->ill_lock);
4575 for (ipif = ill->ill_ipif; ipif != NULL;
4576 ipif = ipif->ipif_next) {
4577 /* Allow the ipif to be down */
4578 if ((ipif->ipif_flags & IPIF_POINTOPOINT) &&
4579 (ipif->ipif_lcl_addr == if_addr) &&
4580 (ipif->ipif_pp_dst_addr == dst)) {
4581 if (!IPIF_IS_CONDEMNED(ipif)) {
4582 ipif_refhold_locked(ipif);
4583 mutex_exit(&ill->ill_lock);
4584 rw_exit(&ipst->ips_ill_g_lock);
4585 return (ipif);
4589 mutex_exit(&ill->ill_lock);
4591 rw_exit(&ipst->ips_ill_g_lock);
4593 /* lookup the ipif based on interface address */
4594 ipif = ipif_lookup_addr(if_addr, NULL, ALL_ZONES, ipst);
4595 ASSERT(ipif == NULL || !ipif->ipif_isv6);
4596 return (ipif);
4600 * Common function for ipif_lookup_addr() and ipif_lookup_addr_exact().
4602 static ipif_t *
4603 ipif_lookup_addr_common(ipaddr_t addr, ill_t *match_ill, uint32_t match_flags,
4604 zoneid_t zoneid, ip_stack_t *ipst)
4606 ipif_t *ipif;
4607 ill_t *ill;
4608 boolean_t ptp = B_FALSE;
4609 ill_walk_context_t ctx;
4610 boolean_t match_illgrp = (match_flags & IPIF_MATCH_ILLGRP);
4611 boolean_t no_duplicate = (match_flags & IPIF_MATCH_NONDUP);
4613 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
4615 * Repeat twice, first matching on local addresses and
4616 * then on point-to-point destination addresses.
4618 repeat:
4619 ill = ILL_START_WALK_V4(&ctx, ipst);
4620 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
4621 if (match_ill != NULL && ill != match_ill &&
4622 (!match_illgrp || !IS_IN_SAME_ILLGRP(ill, match_ill))) {
4623 continue;
4625 mutex_enter(&ill->ill_lock);
4626 for (ipif = ill->ill_ipif; ipif != NULL;
4627 ipif = ipif->ipif_next) {
4628 if (zoneid != ALL_ZONES &&
4629 zoneid != ipif->ipif_zoneid &&
4630 ipif->ipif_zoneid != ALL_ZONES)
4631 continue;
4633 if (no_duplicate && !(ipif->ipif_flags & IPIF_UP))
4634 continue;
4636 /* Allow the ipif to be down */
4637 if ((!ptp && (ipif->ipif_lcl_addr == addr) &&
4638 ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) ||
4639 (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) &&
4640 (ipif->ipif_pp_dst_addr == addr))) {
4641 if (!IPIF_IS_CONDEMNED(ipif)) {
4642 ipif_refhold_locked(ipif);
4643 mutex_exit(&ill->ill_lock);
4644 rw_exit(&ipst->ips_ill_g_lock);
4645 return (ipif);
4649 mutex_exit(&ill->ill_lock);
4652 /* If we already did the ptp case, then we are done */
4653 if (ptp) {
4654 rw_exit(&ipst->ips_ill_g_lock);
4655 return (NULL);
4657 ptp = B_TRUE;
4658 goto repeat;
4662 * Lookup an ipif with the specified address. For point-to-point links we
4663 * look for matches on either the destination address or the local address,
4664 * but we skip the local address check if IPIF_UNNUMBERED is set. If the
4665 * `match_ill' argument is non-NULL, the lookup is restricted to that ill
4666 * (or illgrp if `match_ill' is in an IPMP group).
4668 ipif_t *
4669 ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid,
4670 ip_stack_t *ipst)
4672 return (ipif_lookup_addr_common(addr, match_ill, IPIF_MATCH_ILLGRP,
4673 zoneid, ipst));
4677 * Lookup an ipif with the specified address. Similar to ipif_lookup_addr,
4678 * except that we will only return an ipif if it is not marked as
4679 * IPIF_DUPLICATE.
4681 ipif_t *
4682 ipif_lookup_addr_nondup(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid,
4683 ip_stack_t *ipst)
4685 return (ipif_lookup_addr_common(addr, match_ill,
4686 (IPIF_MATCH_ILLGRP | IPIF_MATCH_NONDUP),
4687 zoneid, ipst));
4691 * Special abbreviated version of ipif_lookup_addr() that doesn't match
4692 * `match_ill' across the IPMP group. This function is only needed in some
4693 * corner-cases; almost everything should use ipif_lookup_addr().
4695 ipif_t *
4696 ipif_lookup_addr_exact(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst)
4698 ASSERT(match_ill != NULL);
4699 return (ipif_lookup_addr_common(addr, match_ill, 0, ALL_ZONES,
4700 ipst));
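
/*
 * Illustrative sketch (hypothetical helper) contrasting the lookups
 * above: under IPMP, ipif_lookup_addr() may match anywhere in the
 * illgrp of match_ill, while ipif_lookup_addr_exact() insists on that
 * very ill.
 */
static boolean_t
example_addr_on_this_ill(ipaddr_t addr, ill_t *ill, ip_stack_t *ipst)
{
	ipif_t *ipif;

	ipif = ipif_lookup_addr_exact(addr, ill, ipst);
	if (ipif == NULL)
		return (B_FALSE);
	ipif_refrele(ipif);	/* the lookup returned a held ipif */
	return (B_TRUE);
}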
4704 * Look for an ipif with the specified address. For point-to-point links
4705 * we look for matches on either the destination address or the local
4706 * address, but we skip the check on the local address if IPIF_UNNUMBERED
4707 * is set.
4708 * If the `match_ill' argument is non-NULL, the lookup is restricted to that
4709 * ill (or illgrp if `match_ill' is in an IPMP group).
4710 * Return the zoneid for the ipif which matches. ALL_ZONES if no match.
4712 zoneid_t
4713 ipif_lookup_addr_zoneid(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst)
4715 zoneid_t zoneid;
4716 ipif_t *ipif;
4717 ill_t *ill;
4718 boolean_t ptp = B_FALSE;
4719 ill_walk_context_t ctx;
4721 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
4723 * Repeat twice, first matching on local addresses and
4724 * then on point-to-point destination addresses.
4726 repeat:
4727 ill = ILL_START_WALK_V4(&ctx, ipst);
4728 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
4729 if (match_ill != NULL && ill != match_ill &&
4730 !IS_IN_SAME_ILLGRP(ill, match_ill)) {
4731 continue;
4733 mutex_enter(&ill->ill_lock);
4734 for (ipif = ill->ill_ipif; ipif != NULL;
4735 ipif = ipif->ipif_next) {
4736 /* Allow the ipif to be down */
4737 if ((!ptp && (ipif->ipif_lcl_addr == addr) &&
4738 ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) ||
4739 (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) &&
4740 (ipif->ipif_pp_dst_addr == addr)) &&
4741 !(ipif->ipif_state_flags & IPIF_CONDEMNED)) {
4742 zoneid = ipif->ipif_zoneid;
4743 mutex_exit(&ill->ill_lock);
4744 rw_exit(&ipst->ips_ill_g_lock);
4745 return (zoneid);
4748 mutex_exit(&ill->ill_lock);
4751 /* If we already did the ptp case, then we are done */
4752 if (ptp) {
4753 rw_exit(&ipst->ips_ill_g_lock);
4754 return (ALL_ZONES);
4756 ptp = B_TRUE;
4757 goto repeat;
4761 * Look for an ipif that matches the specified remote address i.e. the
4762 * ipif that would receive the specified packet.
4763 * First look for directly connected interfaces and then do a recursive
4764 * IRE lookup and pick the first ipif corresponding to the source address in the
4765 * ire.
4766 * Returns: held ipif
4768 * This is only used for ICMP_ADDRESS_MASK_REQUESTs
4770 ipif_t *
4771 ipif_lookup_remote(ill_t *ill, ipaddr_t addr, zoneid_t zoneid)
4773 ipif_t *ipif;
4775 ASSERT(!ill->ill_isv6);
4778 * Someone could be changing this ipif currently or change it
4779 * after we return this. Thus a few packets could use the old
4780 * values. However, structure updates/creates (ire, ilg, ilm, etc.)
4781 * will atomically be updated or cleaned up with the new value.
4782 * Thus we don't need a lock to check the flags or other attrs below.
4784 mutex_enter(&ill->ill_lock);
4785 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
4786 if (IPIF_IS_CONDEMNED(ipif))
4787 continue;
4788 if (zoneid != ALL_ZONES && zoneid != ipif->ipif_zoneid &&
4789 ipif->ipif_zoneid != ALL_ZONES)
4790 continue;
4791 /* Allow the ipif to be down */
4792 if (ipif->ipif_flags & IPIF_POINTOPOINT) {
4793 if ((ipif->ipif_pp_dst_addr == addr) ||
4794 (!(ipif->ipif_flags & IPIF_UNNUMBERED) &&
4795 ipif->ipif_lcl_addr == addr)) {
4796 ipif_refhold_locked(ipif);
4797 mutex_exit(&ill->ill_lock);
4798 return (ipif);
4800 } else if (ipif->ipif_subnet == (addr & ipif->ipif_net_mask)) {
4801 ipif_refhold_locked(ipif);
4802 mutex_exit(&ill->ill_lock);
4803 return (ipif);
4806 mutex_exit(&ill->ill_lock);
4808 * For a remote destination it isn't possible to nail down a particular
4809 * ipif.
4812 /* Pick the first interface */
4813 ipif = ipif_get_next_ipif(NULL, ill);
4814 return (ipif);
4818 * This func does not prevent refcnt from increasing. But if
4819 * the caller has taken steps to that effect, then this func
4820 * can be used to determine whether the ill has become quiescent
4822 static boolean_t
4823 ill_is_quiescent(ill_t *ill)
4825 ipif_t *ipif;
4827 ASSERT(MUTEX_HELD(&ill->ill_lock));
4829 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
4830 if (ipif->ipif_refcnt != 0)
4831 return (B_FALSE);
4833 if (!ILL_DOWN_OK(ill) || ill->ill_refcnt != 0) {
4834 return (B_FALSE);
4836 return (B_TRUE);
4839 boolean_t
4840 ill_is_freeable(ill_t *ill)
4842 ipif_t *ipif;
4844 ASSERT(MUTEX_HELD(&ill->ill_lock));
4846 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
4847 if (ipif->ipif_refcnt != 0) {
4848 return (B_FALSE);
4851 if (!ILL_FREE_OK(ill) || ill->ill_refcnt != 0) {
4852 return (B_FALSE);
4854 return (B_TRUE);
4858 * This func does not prevent refcnt from increasing. But if
4859 * the caller has taken steps to that effect, then this func
4860 * can be used to determine whether the ipif has become quiescent
4862 static boolean_t
4863 ipif_is_quiescent(ipif_t *ipif)
4865 ill_t *ill;
4867 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
4869 if (ipif->ipif_refcnt != 0)
4870 return (B_FALSE);
4872 ill = ipif->ipif_ill;
4873 if (ill->ill_ipif_up_count != 0 || ill->ill_ipif_dup_count != 0 ||
4874 ill->ill_logical_down) {
4875 return (B_TRUE);
4878 /* This is the last ipif going down or being deleted on this ill */
4879 if (ill->ill_ire_cnt != 0 || ill->ill_refcnt != 0) {
4880 return (B_FALSE);
4883 return (B_TRUE);
4887 * Return true if the ipif can be destroyed: the ipif has to be quiescent
4888 * with zero references from ire/ilm to it.
4890 static boolean_t
4891 ipif_is_freeable(ipif_t *ipif)
4893 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
4894 ASSERT(ipif->ipif_id != 0);
4895 return (ipif->ipif_refcnt == 0);
4899 * The ipif/ill/ire has been refreled. Do the tail processing.
4900 * Determine if the ipif or ill in question has become quiescent and if so
4901 * wake up close and/or restart any queued pending ioctl that is waiting
4902 * for the ipif_down (or ill_down).
4904 void
4905 ipif_ill_refrele_tail(ill_t *ill)
4907 mblk_t *mp;
4908 conn_t *connp;
4909 ipsq_t *ipsq;
4910 ipxop_t *ipx;
4911 ipif_t *ipif;
4912 dl_notify_ind_t *dlindp;
4914 ASSERT(MUTEX_HELD(&ill->ill_lock));
4916 if ((ill->ill_state_flags & ILL_CONDEMNED) && ill_is_freeable(ill)) {
4917 /* ip_modclose() may be waiting */
4918 cv_broadcast(&ill->ill_cv);
4921 ipsq = ill->ill_phyint->phyint_ipsq;
4922 mutex_enter(&ipsq->ipsq_lock);
4923 ipx = ipsq->ipsq_xop;
4924 mutex_enter(&ipx->ipx_lock);
4925 if (ipx->ipx_waitfor == 0) /* no one's waiting; bail */
4926 goto unlock;
4928 ASSERT(ipx->ipx_pending_mp != NULL && ipx->ipx_pending_ipif != NULL);
4930 ipif = ipx->ipx_pending_ipif;
4931 if (ipif->ipif_ill != ill) /* wait is for another ill; bail */
4932 goto unlock;
4934 switch (ipx->ipx_waitfor) {
4935 case IPIF_DOWN:
4936 if (!ipif_is_quiescent(ipif))
4937 goto unlock;
4938 break;
4939 case IPIF_FREE:
4940 if (!ipif_is_freeable(ipif))
4941 goto unlock;
4942 break;
4943 case ILL_DOWN:
4944 if (!ill_is_quiescent(ill))
4945 goto unlock;
4946 break;
4947 case ILL_FREE:
4949 * ILL_FREE is only for loopback; normal ill teardown waits
4950 * synchronously in ip_modclose() without using ipx_waitfor,
4951 * handled by the cv_broadcast() at the top of this function.
4953 if (!ill_is_freeable(ill))
4954 goto unlock;
4955 break;
4956 default:
4957 cmn_err(CE_PANIC, "ipsq: %p unknown ipx_waitfor %d\n",
4958 (void *)ipsq, ipx->ipx_waitfor);
4961 ill_refhold_locked(ill); /* for qwriter_ip() call below */
4962 mutex_exit(&ipx->ipx_lock);
4963 mp = ipsq_pending_mp_get(ipsq, &connp);
4964 mutex_exit(&ipsq->ipsq_lock);
4965 mutex_exit(&ill->ill_lock);
4967 ASSERT(mp != NULL);
4969 * NOTE: all of the qwriter_ip() calls below use CUR_OP since
4970 * we can only get here when the current operation decides
4971 * it needs to quiesce via ipsq_pending_mp_add().
4973 switch (mp->b_datap->db_type) {
4974 case M_PCPROTO:
4975 case M_PROTO:
4977 * For now, only DL_NOTIFY_IND messages can use this facility.
4979 dlindp = (dl_notify_ind_t *)mp->b_rptr;
4980 ASSERT(dlindp->dl_primitive == DL_NOTIFY_IND);
4982 switch (dlindp->dl_notification) {
4983 case DL_NOTE_PHYS_ADDR:
4984 qwriter_ip(ill, ill->ill_rq, mp,
4985 ill_set_phys_addr_tail, CUR_OP, B_TRUE);
4986 return;
4987 case DL_NOTE_REPLUMB:
4988 qwriter_ip(ill, ill->ill_rq, mp,
4989 ill_replumb_tail, CUR_OP, B_TRUE);
4990 return;
4991 default:
4992 ASSERT(0);
4993 ill_refrele(ill);
4995 break;
4997 case M_ERROR:
4998 case M_HANGUP:
4999 qwriter_ip(ill, ill->ill_rq, mp, ipif_all_down_tail, CUR_OP,
5000 B_TRUE);
5001 return;
5003 case M_IOCTL:
5004 case M_IOCDATA:
5005 qwriter_ip(ill, (connp != NULL ? CONNP_TO_WQ(connp) :
5006 ill->ill_wq), mp, ip_reprocess_ioctl, CUR_OP, B_TRUE);
5007 return;
5009 default:
5010 cmn_err(CE_PANIC, "ipif_ill_refrele_tail mp %p "
5011 "db_type %d\n", (void *)mp, mp->b_datap->db_type);
5013 return;
5014 unlock:
5015 mutex_exit(&ipsq->ipsq_lock);
5016 mutex_exit(&ipx->ipx_lock);
5017 mutex_exit(&ill->ill_lock);
5020 #ifdef DEBUG
5021 /* Reuse trace buffer from beginning (if reached the end) and record trace */
5022 static void
5023 th_trace_rrecord(th_trace_t *th_trace)
5025 tr_buf_t *tr_buf;
5026 uint_t lastref;
5028 lastref = th_trace->th_trace_lastref;
5029 lastref++;
5030 if (lastref == TR_BUF_MAX)
5031 lastref = 0;
5032 th_trace->th_trace_lastref = lastref;
5033 tr_buf = &th_trace->th_trbuf[lastref];
5034 tr_buf->tr_time = ddi_get_lbolt();
5035 tr_buf->tr_depth = getpcstack(tr_buf->tr_stack, TR_STACK_DEPTH);
5038 static void
5039 th_trace_free(void *value)
5041 th_trace_t *th_trace = value;
5043 ASSERT(th_trace->th_refcnt == 0);
5044 kmem_free(th_trace, sizeof (*th_trace));
5048 * Find or create the per-thread hash table used to track object references.
5049 * The ipst argument is NULL if we shouldn't allocate.
5051 * Accesses per-thread data, so there's no need to lock here.
5053 static mod_hash_t *
5054 th_trace_gethash(ip_stack_t *ipst)
5056 th_hash_t *thh;
5058 if ((thh = tsd_get(ip_thread_data)) == NULL && ipst != NULL) {
5059 mod_hash_t *mh;
5060 char name[256];
5061 size_t objsize, rshift;
5062 int retv;
5064 if ((thh = kmem_alloc(sizeof (*thh), KM_NOSLEEP)) == NULL)
5065 return (NULL);
5066 (void) snprintf(name, sizeof (name), "th_trace_%p",
5067 (void *)curthread);
5070 * We use mod_hash_create_extended here rather than the more
5071 * obvious mod_hash_create_ptrhash because the latter has a
5072 * hard-coded KM_SLEEP, and we'd prefer to fail rather than
5073 * block.
5075 objsize = MAX(MAX(sizeof (ill_t), sizeof (ipif_t)),
5076 MAX(sizeof (ire_t), sizeof (ncec_t)));
5077 rshift = highbit(objsize);
5078 mh = mod_hash_create_extended(name, 64, mod_hash_null_keydtor,
5079 th_trace_free, mod_hash_byptr, (void *)rshift,
5080 mod_hash_ptrkey_cmp, KM_NOSLEEP);
5081 if (mh == NULL) {
5082 kmem_free(thh, sizeof (*thh));
5083 return (NULL);
5085 thh->thh_hash = mh;
5086 thh->thh_ipst = ipst;
5088 * We trace ills, ipifs, ires, and nces. All of these are
5089 * per-IP-stack, so the lock on the thread list is as well.
5091 rw_enter(&ip_thread_rwlock, RW_WRITER);
5092 list_insert_tail(&ip_thread_list, thh);
5093 rw_exit(&ip_thread_rwlock);
5094 retv = tsd_set(ip_thread_data, thh);
5095 ASSERT(retv == 0);
5097 return (thh != NULL ? thh->thh_hash : NULL);
5100 boolean_t
5101 th_trace_ref(const void *obj, ip_stack_t *ipst)
5103 th_trace_t *th_trace;
5104 mod_hash_t *mh;
5105 mod_hash_val_t val;
5107 if ((mh = th_trace_gethash(ipst)) == NULL)
5108 return (B_FALSE);
5111 * Attempt to locate the trace buffer for this obj and thread.
5112 * If it does not exist, then allocate a new trace buffer and
5113 * insert into the hash.
5115 if (mod_hash_find(mh, (mod_hash_key_t)obj, &val) == MH_ERR_NOTFOUND) {
5116 th_trace = kmem_zalloc(sizeof (th_trace_t), KM_NOSLEEP);
5117 if (th_trace == NULL)
5118 return (B_FALSE);
5120 th_trace->th_id = curthread;
5121 if (mod_hash_insert(mh, (mod_hash_key_t)obj,
5122 (mod_hash_val_t)th_trace) != 0) {
5123 kmem_free(th_trace, sizeof (th_trace_t));
5124 return (B_FALSE);
5126 } else {
5127 th_trace = (th_trace_t *)val;
5130 ASSERT(th_trace->th_refcnt >= 0 &&
5131 th_trace->th_refcnt < TR_BUF_MAX - 1);
5133 th_trace->th_refcnt++;
5134 th_trace_rrecord(th_trace);
5135 return (B_TRUE);
5139 * For the purpose of tracing a reference release, we assume that global
5140 * tracing is always on and that the same thread that initiated the
5141 * reference hold is also releasing it.
5143 void
5144 th_trace_unref(const void *obj)
5146 int retv;
5147 mod_hash_t *mh;
5148 th_trace_t *th_trace;
5149 mod_hash_val_t val;
5151 mh = th_trace_gethash(NULL);
5152 retv = mod_hash_find(mh, (mod_hash_key_t)obj, &val);
5153 ASSERT(retv == 0);
5154 th_trace = (th_trace_t *)val;
5156 ASSERT(th_trace->th_refcnt > 0);
5157 th_trace->th_refcnt--;
5158 th_trace_rrecord(th_trace);
5162 * If tracing has been disabled, then we assume that the reference counts are
5163 * now useless, and we clear them out before destroying the entries.
5165 void
5166 th_trace_cleanup(const void *obj, boolean_t trace_disable)
5168 th_hash_t *thh;
5169 mod_hash_t *mh;
5170 mod_hash_val_t val;
5171 th_trace_t *th_trace;
5172 int retv;
5174 rw_enter(&ip_thread_rwlock, RW_READER);
5175 for (thh = list_head(&ip_thread_list); thh != NULL;
5176 thh = list_next(&ip_thread_list, thh)) {
5177 if (mod_hash_find(mh = thh->thh_hash, (mod_hash_key_t)obj,
5178 &val) == 0) {
5179 th_trace = (th_trace_t *)val;
5180 if (trace_disable)
5181 th_trace->th_refcnt = 0;
5182 retv = mod_hash_destroy(mh, (mod_hash_key_t)obj);
5183 ASSERT(retv == 0);
5186 rw_exit(&ip_thread_rwlock);
5189 void
5190 ipif_trace_ref(ipif_t *ipif)
5192 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
5194 if (ipif->ipif_trace_disable)
5195 return;
5197 if (!th_trace_ref(ipif, ipif->ipif_ill->ill_ipst)) {
5198 ipif->ipif_trace_disable = B_TRUE;
5199 ipif_trace_cleanup(ipif);
5203 void
5204 ipif_untrace_ref(ipif_t *ipif)
5206 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
5208 if (!ipif->ipif_trace_disable)
5209 th_trace_unref(ipif);
5212 void
5213 ill_trace_ref(ill_t *ill)
5215 ASSERT(MUTEX_HELD(&ill->ill_lock));
5217 if (ill->ill_trace_disable)
5218 return;
5220 if (!th_trace_ref(ill, ill->ill_ipst)) {
5221 ill->ill_trace_disable = B_TRUE;
5222 ill_trace_cleanup(ill);
5226 void
5227 ill_untrace_ref(ill_t *ill)
5229 ASSERT(MUTEX_HELD(&ill->ill_lock));
5231 if (!ill->ill_trace_disable)
5232 th_trace_unref(ill);
5236 * Called when ipif is unplumbed or when memory alloc fails. Note that on
5237 * failure, ipif_trace_disable is set.
5239 static void
5240 ipif_trace_cleanup(const ipif_t *ipif)
5242 th_trace_cleanup(ipif, ipif->ipif_trace_disable);
5246 * Called when ill is unplumbed or when memory alloc fails. Note that on
5247 * failure, ill_trace_disable is set.
5249 static void
5250 ill_trace_cleanup(const ill_t *ill)
5252 th_trace_cleanup(ill, ill->ill_trace_disable);
5254 #endif /* DEBUG */
5256 void
5257 ipif_refhold_locked(ipif_t *ipif)
5259 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
5260 ipif->ipif_refcnt++;
5261 IPIF_TRACE_REF(ipif);
5264 void
5265 ipif_refhold(ipif_t *ipif)
5267 ill_t *ill;
5269 ill = ipif->ipif_ill;
5270 mutex_enter(&ill->ill_lock);
5271 ipif->ipif_refcnt++;
5272 IPIF_TRACE_REF(ipif);
5273 mutex_exit(&ill->ill_lock);
5277 * Must not be called while holding any locks. Otherwise if this is
5278 * the last reference to be released there is a chance of recursive mutex
5279 * panic due to ipif_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying
5280 * to restart an ioctl.
5282 void
5283 ipif_refrele(ipif_t *ipif)
5285 ill_t *ill;
5287 ill = ipif->ipif_ill;
5289 mutex_enter(&ill->ill_lock);
5290 ASSERT(ipif->ipif_refcnt != 0);
5291 ipif->ipif_refcnt--;
5292 IPIF_UNTRACE_REF(ipif);
5293 if (ipif->ipif_refcnt != 0) {
5294 mutex_exit(&ill->ill_lock);
5295 return;
5298 /* Drops the ill_lock */
5299 ipif_ill_refrele_tail(ill);
5302 ipif_t *
5303 ipif_get_next_ipif(ipif_t *curr, ill_t *ill)
5305 ipif_t *ipif;
5307 mutex_enter(&ill->ill_lock);
5308 for (ipif = (curr == NULL ? ill->ill_ipif : curr->ipif_next);
5309 ipif != NULL; ipif = ipif->ipif_next) {
5310 if (IPIF_IS_CONDEMNED(ipif))
5311 continue;
5312 ipif_refhold_locked(ipif);
5313 mutex_exit(&ill->ill_lock);
5314 return (ipif);
5316 mutex_exit(&ill->ill_lock);
5317 return (NULL);
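
/*
 * Illustrative sketch (hypothetical helper): iterating the ipifs of an
 * ill with ipif_get_next_ipif(). Each call returns a held ipif, so the
 * current one is released only after the next has been obtained from
 * it.
 */
static uint_t
example_count_live_ipifs(ill_t *ill)
{
	ipif_t *ipif, *next;
	uint_t count = 0;

	for (ipif = ipif_get_next_ipif(NULL, ill); ipif != NULL;
	    ipif = next) {
		count++;
		next = ipif_get_next_ipif(ipif, ill);
		ipif_refrele(ipif);
	}
	return (count);
}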
5321 * TODO: make this table extensible at run time.
5322 * Return a pointer to the mac type info for 'mac_type'.
5324 static ip_m_t *
5325 ip_m_lookup(t_uscalar_t mac_type)
5327 ip_m_t *ipm;
5329 for (ipm = ip_m_tbl; ipm < A_END(ip_m_tbl); ipm++)
5330 if (ipm->ip_m_mac_type == mac_type)
5331 return (ipm);
5332 return (NULL);
5336 * Make a link layer address from the multicast IP address *addr.
5337 * To form the link layer address, invoke the ip_m_v*mapping function
5338 * associated with the link-layer type.
5340 void
5341 ip_mcast_mapping(ill_t *ill, uchar_t *addr, uchar_t *hwaddr)
5343 ip_m_t *ipm;
5345 if (ill->ill_net_type == IRE_IF_NORESOLVER)
5346 return;
5348 ASSERT(addr != NULL);
5350 ipm = ip_m_lookup(ill->ill_mactype);
5351 if (ipm == NULL ||
5352 (ill->ill_isv6 && ipm->ip_m_v6mapping == NULL) ||
5353 (!ill->ill_isv6 && ipm->ip_m_v4mapping == NULL)) {
5354 ip0dbg(("no mapping for ill %s mactype 0x%x\n",
5355 ill->ill_name, ill->ill_mactype));
5356 return;
5358 if (ill->ill_isv6)
5359 (*ipm->ip_m_v6mapping)(ill, addr, hwaddr);
5360 else
5361 (*ipm->ip_m_v4mapping)(ill, addr, hwaddr);
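
/*
 * Illustrative sketch (hypothetical, stand-alone demonstration): the
 * flavor of mapping an ip_m_v4mapping function performs for
 * Ethernet-like media (RFC 1112): 01:00:5e plus the low 23 bits of the
 * group address. The real per-media mapping functions live in the
 * ip_m_tbl entries.
 */
static void
example_v4_mcast_to_ether(ipaddr_t group, uchar_t hwaddr[6])
{
	uint32_t g = ntohl(group);

	hwaddr[0] = 0x01;
	hwaddr[1] = 0x00;
	hwaddr[2] = 0x5e;
	hwaddr[3] = (g >> 16) & 0x7f;	/* only 23 group bits survive */
	hwaddr[4] = (g >> 8) & 0xff;
	hwaddr[5] = g & 0xff;
}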
5365 * Returns B_FALSE if the IPv4 netmask pointed to by `mask' is
5366 * non-contiguous. Otherwise returns B_TRUE.
5368 * The netmask can be verified to be contiguous with 32 shift-and-OR
5369 * operations. Take the mask (in host byte order) and compute
5370 * mask | mask << 1 | mask << 2 | ... | mask << 31
5371 * the result will equal 'mask' exactly when the mask is contiguous.
5373 static boolean_t
5374 ip_contiguous_mask(uint32_t mask)
5376 uint32_t m = mask;
5377 int i;
5379 for (i = 1; i < 32; i++)
5380 m |= (mask << i);
5382 return (m == mask);
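
/*
 * Illustrative sketch (hypothetical examples) of the shift trick above,
 * in host byte order: for 0xffffff00 (255.255.255.0) every left shift
 * is a subset of the mask, so the OR equals the mask; for 0xff00ff00
 * (255.0.255.0) the shifts fill the 0x00ff0000 hole, so the OR differs
 * and the mask is rejected.
 */
static void
example_contiguous_mask(void)
{
	ASSERT(ip_contiguous_mask(0xffffff00));
	ASSERT(!ip_contiguous_mask(0xff00ff00));
}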
5386 * ip_rt_add is called to add an IPv4 route to the forwarding table.
5387 * ill is passed in to associate it with the correct interface.
5388 * If ire_arg is set, then we return the held IRE in that location.
5391 ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr,
5392 ipaddr_t src_addr, int flags, ill_t *ill, ire_t **ire_arg,
5393 boolean_t ioctl_msg, ip_stack_t *ipst, zoneid_t zoneid)
5395 ire_t *ire, *nire;
5396 ire_t *gw_ire = NULL;
5397 ipif_t *ipif = NULL;
5398 uint_t type;
5399 int match_flags = MATCH_IRE_TYPE;
5400 boolean_t unbound = B_FALSE;
5402 ip1dbg(("ip_rt_add:"));
5404 if (ire_arg != NULL)
5405 *ire_arg = NULL;
5407 /* disallow non-contiguous netmasks */
5408 if (!ip_contiguous_mask(ntohl(mask)))
5409 return (ENOTSUP);
5412 * If RTF_HOST is set, then we set the netmask to all ones
5413 * (regardless of whether one was supplied).
5415 if (flags & RTF_HOST)
5416 mask = IP_HOST_MASK;
5419 * Prevent routes with a zero gateway from being created (since
5420 * interfaces can currently be plumbed and brought up with no
5421 * assigned address).
5423 if (gw_addr == 0)
5424 return (ENETUNREACH);
5426 * Get the ipif, if any, corresponding to the gw_addr.
5427 * If -ifp was specified we restrict ourselves to the ill, otherwise
5428 * we match on the gateway and destination to handle unnumbered pt-pt
5429 * interfaces.
5431 if (ill != NULL)
5432 ipif = ipif_lookup_addr(gw_addr, ill, ALL_ZONES, ipst);
5433 else
5434 ipif = ipif_lookup_interface(gw_addr, dst_addr, ipst);
5435 if (ipif != NULL) {
5436 if (IS_VNI(ipif->ipif_ill)) {
5437 ipif_refrele(ipif);
5438 return (EINVAL);
5443 * GateD will attempt to create routes with a loopback interface
5444 * address as the gateway and with RTF_GATEWAY set. We allow
5445 * these routes to be added, but create them as interface routes
5446 * since the gateway is an interface address.
5448 if ((ipif != NULL) && (ipif->ipif_ire_type == IRE_LOOPBACK)) {
5449 flags &= ~RTF_GATEWAY;
5450 if (gw_addr == INADDR_LOOPBACK && dst_addr == INADDR_LOOPBACK &&
5451 mask == IP_HOST_MASK) {
5452 ire = ire_ftable_lookup_v4(dst_addr, 0, 0, IRE_LOOPBACK,
5453 NULL, ALL_ZONES, MATCH_IRE_TYPE, 0, ipst, NULL);
5454 if (ire != NULL) {
5455 ire_refrele(ire);
5456 ipif_refrele(ipif);
5457 return (EEXIST);
5459 ip1dbg(("ip_rt_add: 0x%p creating IRE 0x%x"
5460 "for 0x%x\n", (void *)ipif,
5461 ipif->ipif_ire_type,
5462 ntohl(ipif->ipif_lcl_addr)));
5463 ire = ire_create(
5464 (uchar_t *)&dst_addr, /* dest address */
5465 (uchar_t *)&mask, /* mask */
5466 NULL, /* no gateway */
5467 ipif->ipif_ire_type, /* LOOPBACK */
5468 ipif->ipif_ill,
5469 zoneid,
5470 (ipif->ipif_flags & IPIF_PRIVATE) ? RTF_PRIVATE : 0,
5471 ipst);
5473 if (ire == NULL) {
5474 ipif_refrele(ipif);
5475 return (ENOMEM);
5477 /* src address assigned by the caller? */
5478 if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC))
5479 ire->ire_setsrc_addr = src_addr;
5481 nire = ire_add(ire);
5482 if (nire == NULL) {
5484 * In the event of failure, ire_add() will have
5485 * already deleted the ire in question, so there
5486 * is no need to do that here.
5488 ipif_refrele(ipif);
5489 return (ENOMEM);
5492 * Check if it was a duplicate entry. This handles
5493 * the case of two racing route adds for the same route
5495 if (nire != ire) {
5496 ASSERT(nire->ire_identical_ref > 1);
5497 ire_delete(nire);
5498 ire_refrele(nire);
5499 ipif_refrele(ipif);
5500 return (EEXIST);
5502 ire = nire;
5503 goto save_ire;
5508 * Traditionally, interface routes are ones where RTF_GATEWAY isn't set
5509 * and the gateway address provided is one of the system's interface
5510 * addresses. By using the routing socket interface and supplying an
5511 * RTA_IFP sockaddr with an interface index, an alternate method of
5512 * specifying an interface route to be created is available which uses
5513 * the interface index that specifies the outgoing interface rather than
5514 * the address of an outgoing interface (which may not be able to
5515 * uniquely identify an interface). When coupled with the RTF_GATEWAY
5516 * flag, routes can be specified which not only specify the next-hop to
5517 * be used when routing to a certain prefix, but also which outgoing
5518 * interface should be used.
5520 * Previously, interfaces would have unique addresses assigned to them
5521 * and so the address assigned to a particular interface could be used
5522 * to identify a particular interface. One exception to this was the
5523 * case of an unnumbered interface (where IPIF_UNNUMBERED was set).
5525 * With the advent of IPv6 and its link-local addresses, this
5526 * restriction was relaxed and interfaces could share addresses between
5527 * themselves. In fact, typically all of the link-local interfaces on
5528 * an IPv6 node or router will have the same link-local address. In
5529 * order to differentiate between these interfaces, the use of an
5530 * interface index is necessary and this index can be carried inside a
5531 * RTA_IFP sockaddr (which is actually a sockaddr_dl). One restriction
5532 * of using the interface index, however, is that all of the ipif's that
5533 * are part of an ill have the same index and so the RTA_IFP sockaddr
5534 * cannot be used to differentiate between ipif's (or logical
5535 * interfaces) that belong to the same ill (physical interface).
5537 * For example, in the following case involving IPv4 interfaces and
5538 * logical interfaces
5540 * 192.0.2.32 255.255.255.224 192.0.2.33 U if0
5541 * 192.0.2.32 255.255.255.224 192.0.2.34 U if0
5542 * 192.0.2.32 255.255.255.224 192.0.2.35 U if0
5544 * the ipif's corresponding to each of these interface routes can be
5545 * uniquely identified by the "gateway" (actually interface address).
5547 * In this case involving multiple IPv6 default routes to a particular
5548 * link-local gateway, the use of RTA_IFP is necessary to specify which
5549 * default route is of interest:
5551 * default fe80::123:4567:89ab:cdef U if0
5552 * default fe80::123:4567:89ab:cdef U if1
5555 /* RTF_GATEWAY not set */
5556 if (!(flags & RTF_GATEWAY)) {
5558 * Whether or not ill (RTA_IFP) is set, we require that
5559 * the gateway is one of our local addresses.
5561 if (ipif == NULL)
5562 return (ENETUNREACH);
5565 * We use MATCH_IRE_ILL here. If the caller specified an
5566 * interface (from the RTA_IFP sockaddr) we use it, otherwise
5567 * we use the ill derived from the gateway address.
5568 * We can always match the gateway address since we record it
5569 * in ire_gateway_addr.
5570 * We don't allow RTA_IFP to specify a different ill than the
5571 * one matching the ipif to make sure we can delete the route.
5573 match_flags |= MATCH_IRE_GW | MATCH_IRE_ILL;
5574 if (ill == NULL) {
5575 ill = ipif->ipif_ill;
5576 } else if (ill != ipif->ipif_ill) {
5577 ipif_refrele(ipif);
5578 return (EINVAL);
5582 * We check for an existing entry at this point.
5584 * Since a netmask isn't passed in via the ioctl interface
5585 * (SIOCADDRT), we don't check for a matching netmask in that
5586 * case.
5588 if (!ioctl_msg)
5589 match_flags |= MATCH_IRE_MASK;
5590 ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr,
5591 IRE_INTERFACE, ill, ALL_ZONES, match_flags, 0, ipst, NULL);
5592 if (ire != NULL) {
5593 ire_refrele(ire);
5594 ipif_refrele(ipif);
5595 return (EEXIST);
5599 * Some software (for example, GateD and Sun Cluster) attempts
5600 * to create (what amount to) IRE_PREFIX routes with the
5601 * loopback address as the gateway. This is primarily done to
5602 * set up prefixes with the RTF_REJECT flag set (for example,
5603 * when generating aggregate routes.)
5605 * If the IRE type (as defined by ill->ill_net_type) would be
5606 * IRE_LOOPBACK, then we map the request into a
5607 * IRE_IF_NORESOLVER. We also OR in the RTF_BLACKHOLE flag as
5608 * these interface routes, by definition, can only be that.
5610 * Needless to say, the real IRE_LOOPBACK is NOT created by this
5611 * routine, but rather using ire_create() directly.
5614 type = ill->ill_net_type;
5615 if (type == IRE_LOOPBACK) {
5616 type = IRE_IF_NORESOLVER;
5617 flags |= RTF_BLACKHOLE;
5621 * Create a copy of the IRE_IF_NORESOLVER or
5622 * IRE_IF_RESOLVER with the modified address, netmask, and
5623 * gateway.
5625 ire = ire_create(
5626 (uchar_t *)&dst_addr,
5627 (uint8_t *)&mask,
5628 (uint8_t *)&gw_addr,
5629 type,
5630 ill,
5631 zoneid,
5632 flags,
5633 ipst);
5634 if (ire == NULL) {
5635 ipif_refrele(ipif);
5636 return (ENOMEM);
5639 /* src address assigned by the caller? */
5640 if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC))
5641 ire->ire_setsrc_addr = src_addr;
5643 nire = ire_add(ire);
5644 if (nire == NULL) {
5646 * In the event of failure, ire_add() will have
5647 * already deleted the ire in question, so there
5648 * is no need to do that here.
5650 ipif_refrele(ipif);
5651 return (ENOMEM);
5654 * Check if it was a duplicate entry. This handles
5655 * the case of two racing route adds for the same route
5657 if (nire != ire) {
5658 ire_delete(nire);
5659 ire_refrele(nire);
5660 ipif_refrele(ipif);
5661 return (EEXIST);
5663 ire = nire;
5664 goto save_ire;
5668 * Get an interface IRE for the specified gateway.
5669 * If we don't have an IRE_IF_NORESOLVER or IRE_IF_RESOLVER for the
5670 * gateway, it is currently unreachable and we fail the request
5671 * accordingly. We reject any RTF_GATEWAY routes where the gateway
5672 * is an IRE_LOCAL or IRE_LOOPBACK.
5673 * If RTA_IFP was specified we look on that particular ill.
5675 if (ill != NULL)
5676 match_flags |= MATCH_IRE_ILL;
5678 /* Check whether the gateway is reachable. */
5679 again:
5680 type = IRE_INTERFACE | IRE_LOCAL | IRE_LOOPBACK;
5681 if (flags & RTF_INDIRECT)
5682 type |= IRE_OFFLINK;
5684 gw_ire = ire_ftable_lookup_v4(gw_addr, 0, 0, type, ill,
5685 ALL_ZONES, match_flags, 0, ipst, NULL);
5686 if (gw_ire == NULL) {
5688 * With IPMP, we allow host routes to influence in.mpathd's
5689 * target selection. However, if the test addresses are on
5690 * their own network, the above lookup will fail since the
5691 * underlying IRE_INTERFACEs are marked hidden. So allow
5692 * hidden test IREs to be found and try again.
5694 if (!(match_flags & MATCH_IRE_TESTHIDDEN)) {
5695 match_flags |= MATCH_IRE_TESTHIDDEN;
5696 goto again;
5698 if (ipif != NULL)
5699 ipif_refrele(ipif);
5700 return (ENETUNREACH);
5702 if (gw_ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK)) {
5703 ire_refrele(gw_ire);
5704 if (ipif != NULL)
5705 ipif_refrele(ipif);
5706 return (ENETUNREACH);
5709 if (ill == NULL && !(flags & RTF_INDIRECT)) {
5710 unbound = B_TRUE;
5711 if (ipst->ips_ip_strict_src_multihoming > 0)
5712 ill = gw_ire->ire_ill;
5716 * We create one of three types of IREs as a result of this request
5717 * based on the netmask. A netmask of all ones (which is automatically
5718 * assumed when RTF_HOST is set) results in an IRE_HOST being created.
5719 * An all zeroes netmask implies a default route so an IRE_DEFAULT is
5720 * created. Otherwise, an IRE_PREFIX route is created for the
5721 * destination prefix.
5723 if (mask == IP_HOST_MASK)
5724 type = IRE_HOST;
5725 else if (mask == 0)
5726 type = IRE_DEFAULT;
5727 else
5728 type = IRE_PREFIX;
5730 /* check for a duplicate entry */
5731 ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, type, ill,
5732 ALL_ZONES, match_flags | MATCH_IRE_MASK | MATCH_IRE_GW, 0, ipst,
5733 NULL);
5734 if (ire != NULL) {
5735 if (ipif != NULL)
5736 ipif_refrele(ipif);
5737 ire_refrele(gw_ire);
5738 ire_refrele(ire);
5739 return (EEXIST);
5742 /* Create the IRE. */
5743 ire = ire_create(
5744 (uchar_t *)&dst_addr, /* dest address */
5745 (uchar_t *)&mask, /* mask */
5746 (uchar_t *)&gw_addr, /* gateway address */
5747 (ushort_t)type, /* IRE type */
5748 ill,
5749 zoneid,
5750 flags,
5751 ipst);
5753 if (ire == NULL) {
5754 if (ipif != NULL)
5755 ipif_refrele(ipif);
5756 ire_refrele(gw_ire);
5757 return (ENOMEM);
5760 /* src address assigned by the caller? */
5761 if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC))
5762 ire->ire_setsrc_addr = src_addr;
5764 ire->ire_unbound = unbound;
5767 * POLICY: should we allow an RTF_HOST with address INADDR_ANY?
5768 * SUN/OS socket stuff does but do we really want to allow 0.0.0.0?
5771 /* Add the new IRE. */
5772 nire = ire_add(ire);
5773 if (nire == NULL) {
5775 * In the event of failure, ire_add() will have
5776 * already deleted the ire in question, so there
5777 * is no need to do that here.
5779 if (ipif != NULL)
5780 ipif_refrele(ipif);
5781 ire_refrele(gw_ire);
5782 return (ENOMEM);
5785 * Check if it was a duplicate entry. This handles
5786 * the case of two racing route adds for the same route
5788 if (nire != ire) {
5789 ire_delete(nire);
5790 ire_refrele(nire);
5791 if (ipif != NULL)
5792 ipif_refrele(ipif);
5793 ire_refrele(gw_ire);
5794 return (EEXIST);
5796 ire = nire;
5798 save_ire:
5799 if (gw_ire != NULL) {
5800 ire_refrele(gw_ire);
5801 gw_ire = NULL;
5803 if (ill != NULL) {
5805 * Save enough information so that we can recreate the IRE if
5806 * the interface goes down and then up. The metrics associated
5807 * with the route will be saved as well when rts_setmetrics() is
5808 * called after the IRE has been created. In the case where
5809 * memory cannot be allocated, none of this information will be
5810 * saved.
5812 ill_save_ire(ill, ire);
5814 if (ioctl_msg)
5815 ip_rts_rtmsg(RTM_OLDADD, ire, 0, ipst);
5816 if (ire_arg != NULL) {
5818 * Store the ire that was successfully added into where ire_arg
5819 * points to so that callers don't have to look it up
5820 * themselves (but they are responsible for ire_refrele()ing
5821 * the ire when they are finished with it).
5823 *ire_arg = ire;
5824 } else {
5825 ire_refrele(ire); /* Held in ire_add */
5827 if (ipif != NULL)
5828 ipif_refrele(ipif);
5829 return (0);
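
/*
 * Illustrative sketch (hypothetical helper; the addresses are examples):
 * adding a default route through a gateway, roughly what a
 * routing-socket RTM_ADD for "default via 192.0.2.1" reduces to. An
 * all-zeroes mask selects IRE_DEFAULT above.
 */
static int
example_add_default_route(ip_stack_t *ipst, zoneid_t zoneid)
{
	ipaddr_t dst = INADDR_ANY;		/* 0.0.0.0 */
	ipaddr_t mask = 0;			/* all zeroes: default route */
	ipaddr_t gw = htonl(0xc0000201);	/* 192.0.2.1 */

	return (ip_rt_add(dst, mask, gw, INADDR_ANY, RTF_GATEWAY | RTF_UP,
	    NULL, NULL, B_FALSE, ipst, zoneid));
}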
5833 * ip_rt_delete is called to delete an IPv4 route.
5834 * ill is passed in to associate it with the correct interface.
5836 /* ARGSUSED4 */
5838 ip_rt_delete(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr,
5839 uint_t rtm_addrs, int flags, ill_t *ill, boolean_t ioctl_msg,
5840 ip_stack_t *ipst, zoneid_t zoneid)
5842 ire_t *ire = NULL;
5843 ipif_t *ipif;
5844 uint_t type;
5845 uint_t match_flags = MATCH_IRE_TYPE;
5846 int err = 0;
5848 ip1dbg(("ip_rt_delete:"));
5850 * If RTF_HOST is set, then we set the netmask to all ones.
5851 * Otherwise, we use the netmask if one was supplied.
5853 if (flags & RTF_HOST) {
5854 mask = IP_HOST_MASK;
5855 match_flags |= MATCH_IRE_MASK;
5856 } else if (rtm_addrs & RTA_NETMASK) {
5857 match_flags |= MATCH_IRE_MASK;
5861 * Note that RTF_GATEWAY is never set on a delete, therefore
5862 * we check if the gateway address is one of our interfaces first,
5863 * and fall back on RTF_GATEWAY routes.
5865 * This makes it possible to delete an original
5866 * IRE_IF_NORESOLVER/IRE_IF_RESOLVER - consistent with SunOS 4.1.
5867 * However, we have RTF_KERNEL set on the ones created by ipif_up
5868 * and those cannot be deleted here.
5870 * We use MATCH_IRE_ILL if we know the interface. If the caller
5871 * specified an interface (from the RTA_IFP sockaddr) we use it,
5872 * otherwise we use the ill derived from the gateway address.
5873 * We can always match the gateway address since we record it
5874 * in ire_gateway_addr.
5876 * For more detail on specifying routes by gateway address and by
5877 * interface index, see the comments in ip_rt_add().
5879 ipif = ipif_lookup_interface(gw_addr, dst_addr, ipst);
5880 if (ipif != NULL) {
5881 ill_t *ill_match;
5883 if (ill != NULL)
5884 ill_match = ill;
5885 else
5886 ill_match = ipif->ipif_ill;
5888 match_flags |= MATCH_IRE_ILL;
5889 if (ipif->ipif_ire_type == IRE_LOOPBACK) {
5890 ire = ire_ftable_lookup_v4(dst_addr, mask, 0,
5891 IRE_LOOPBACK, ill_match, ALL_ZONES, match_flags, 0,
5892 ipst, NULL);
5894 if (ire == NULL) {
5895 match_flags |= MATCH_IRE_GW;
5896 ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr,
5897 IRE_INTERFACE, ill_match, ALL_ZONES, match_flags,
5898 0, ipst, NULL);
5900 /* Avoid deleting routes created by kernel from an ipif */
5901 if (ire != NULL && (ire->ire_flags & RTF_KERNEL)) {
5902 ire_refrele(ire);
5903 ire = NULL;
5906 /* Restore in case we didn't find a match */
5907 match_flags &= ~(MATCH_IRE_GW|MATCH_IRE_ILL);
5910 if (ire == NULL) {
5912 * At this point, the gateway address is not one of our own
5913 * addresses or a matching interface route was not found. We
5914 * set the IRE type to lookup based on whether
5915 * this is a host route, a default route or just a prefix.
5917 * If an ill was passed in, then the lookup is based on an
5918 * interface index so MATCH_IRE_ILL is added to match_flags.
5920 match_flags |= MATCH_IRE_GW;
5921 if (ill != NULL)
5922 match_flags |= MATCH_IRE_ILL;
5923 if (mask == IP_HOST_MASK)
5924 type = IRE_HOST;
5925 else if (mask == 0)
5926 type = IRE_DEFAULT;
5927 else
5928 type = IRE_PREFIX;
5929 ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, type, ill,
5930 ALL_ZONES, match_flags, 0, ipst, NULL);
5933 if (ipif != NULL) {
5934 ipif_refrele(ipif);
5935 ipif = NULL;
5938 if (ire == NULL)
5939 return (ESRCH);
5941 ill = ire->ire_ill;
5942 if (ill != NULL)
5943 ill_remove_saved_ire(ill, ire);
5944 if (ioctl_msg)
5945 ip_rts_rtmsg(RTM_OLDDEL, ire, 0, ipst);
5946 ire_delete(ire);
5947 ire_refrele(ire);
5948 return (err);
5949 }
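/*
 * Illustrative sketch (not part of this file): given the mask-to-type
 * mapping above, deleting "default" (mask 0) looks up an IRE_DEFAULT,
 * a host mask looks up an IRE_HOST, and anything else an IRE_PREFIX.
 * A hypothetical in-kernel caller removing a host route might do:
 *
 *	err = ip_rt_delete(dst, IP_HOST_MASK, gw,
 *	    RTA_DST | RTA_GATEWAY, RTF_HOST, NULL,
 *	    B_FALSE, ipst, ALL_ZONES);
 *
 * where dst, gw, and ipst are assumed to be in scope.
 */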
5950
5951 /*
5952 * ip_siocaddrt is called to complete processing of an SIOCADDRT IOCTL.
5953 */
5954 /* ARGSUSED */
5955 int
5956 ip_siocaddrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
5957 ip_ioctl_cmd_t *ipip, void *dummy_if_req)
5958 {
5959 ipaddr_t dst_addr;
5960 ipaddr_t gw_addr;
5961 ipaddr_t mask;
5962 int error = 0;
5963 mblk_t *mp1;
5964 struct rtentry *rt;
5965 ipif_t *ipif = NULL;
5966 ip_stack_t *ipst;
5968 ASSERT(q->q_next == NULL);
5969 ipst = CONNQ_TO_IPST(q);
5971 ip1dbg(("ip_siocaddrt:"));
5972 /* Existence of mp1 verified in ip_wput_nondata */
5973 mp1 = mp->b_cont->b_cont;
5974 rt = (struct rtentry *)mp1->b_rptr;
5976 dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr;
5977 gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr;
5980 * If the RTF_HOST flag is on, this is a request to assign a gateway
5981 * to a particular host address. In this case, we set the netmask to
5982 * all ones for the particular destination address. Otherwise,
5983 * determine the netmask to be used based on dst_addr and the interfaces
5984 * in use.
5986 if (rt->rt_flags & RTF_HOST) {
5987 mask = IP_HOST_MASK;
5988 } else {
5990 * Note that ip_subnet_mask returns a zero mask in the case of
5991 * default (an all-zeroes address).
5993 mask = ip_subnet_mask(dst_addr, &ipif, ipst);
5996 error = ip_rt_add(dst_addr, mask, gw_addr, 0, rt->rt_flags, NULL, NULL,
5997 B_TRUE, ipst, ALL_ZONES);
5998 if (ipif != NULL)
5999 ipif_refrele(ipif);
6000 return (error);
6001 }
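/*
 * Illustrative sketch (not part of this file): a user process reaches
 * ip_siocaddrt() via an SIOCADDRT ioctl on an AF_INET socket, e.g.:
 *
 *	struct rtentry rt;
 *	struct sockaddr_in *sin;
 *
 *	bzero(&rt, sizeof (rt));
 *	sin = (struct sockaddr_in *)&rt.rt_dst;
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = inet_addr("192.0.2.0");	(example prefix)
 *	sin = (struct sockaddr_in *)&rt.rt_gateway;
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = inet_addr("192.0.2.1");	(example gateway)
 *	rt.rt_flags = RTF_UP | RTF_GATEWAY;
 *	(void) ioctl(s, SIOCADDRT, &rt);	(s: AF_INET socket fd)
 *
 * Since RTF_HOST is not set, the netmask is derived via ip_subnet_mask().
 */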
6002
6003 /*
6004 * ip_siocdelrt is called to complete processing of an SIOCDELRT IOCTL.
6005 */
6006 /* ARGSUSED */
6007 int
6008 ip_siocdelrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
6009 ip_ioctl_cmd_t *ipip, void *dummy_if_req)
6010 {
6011 ipaddr_t dst_addr;
6012 ipaddr_t gw_addr;
6013 ipaddr_t mask;
6014 int error;
6015 mblk_t *mp1;
6016 struct rtentry *rt;
6017 ipif_t *ipif = NULL;
6018 ip_stack_t *ipst;
6020 ASSERT(q->q_next == NULL);
6021 ipst = CONNQ_TO_IPST(q);
6023 ip1dbg(("ip_siocdelrt:"));
6024 /* Existence of mp1 verified in ip_wput_nondata */
6025 mp1 = mp->b_cont->b_cont;
6026 rt = (struct rtentry *)mp1->b_rptr;
6028 dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr;
6029 gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr;
6032 * If the RTF_HOST flag is on, this is a request to delete a gateway
6033 * to a particular host address. In this case, we set the netmask to
6034 * all ones for the particular destination address. Otherwise,
6035 * determine the netmask to be used based on dst_addr and the interfaces
6036 * in use.
6038 if (rt->rt_flags & RTF_HOST) {
6039 mask = IP_HOST_MASK;
6040 } else {
6042 * Note that ip_subnet_mask returns a zero mask in the case of
6043 * default (an all-zeroes address).
6045 mask = ip_subnet_mask(dst_addr, &ipif, ipst);
6048 error = ip_rt_delete(dst_addr, mask, gw_addr,
6049 RTA_DST | RTA_GATEWAY | RTA_NETMASK, rt->rt_flags, NULL, B_TRUE,
6050 ipst, ALL_ZONES);
6051 if (ipif != NULL)
6052 ipif_refrele(ipif);
6053 return (error);
6057 * Enqueue the mp onto the ipsq, chained by b_next.
6058 * b_prev stores the function to be executed later, and b_queue the queue
6059 * where this mp originated.
6061 void
6062 ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type,
6063 ill_t *pending_ill)
6065 conn_t *connp;
6066 ipxop_t *ipx = ipsq->ipsq_xop;
6068 ASSERT(MUTEX_HELD(&ipsq->ipsq_lock));
6069 ASSERT(MUTEX_HELD(&ipx->ipx_lock));
6070 ASSERT(func != NULL);
6072 mp->b_queue = q;
6073 mp->b_prev = (void *)func;
6074 mp->b_next = NULL;
6076 switch (type) {
6077 case CUR_OP:
6078 if (ipx->ipx_mptail != NULL) {
6079 ASSERT(ipx->ipx_mphead != NULL);
6080 ipx->ipx_mptail->b_next = mp;
6081 } else {
6082 ASSERT(ipx->ipx_mphead == NULL);
6083 ipx->ipx_mphead = mp;
6085 ipx->ipx_mptail = mp;
6086 break;
6088 case NEW_OP:
6089 if (ipsq->ipsq_xopq_mptail != NULL) {
6090 ASSERT(ipsq->ipsq_xopq_mphead != NULL);
6091 ipsq->ipsq_xopq_mptail->b_next = mp;
6092 } else {
6093 ASSERT(ipsq->ipsq_xopq_mphead == NULL);
6094 ipsq->ipsq_xopq_mphead = mp;
6096 ipsq->ipsq_xopq_mptail = mp;
6097 ipx->ipx_ipsq_queued = B_TRUE;
6098 break;
6100 case SWITCH_OP:
6101 ASSERT(ipsq->ipsq_swxop != NULL);
6102 /* only one switch operation is currently allowed */
6103 ASSERT(ipsq->ipsq_switch_mp == NULL);
6104 ipsq->ipsq_switch_mp = mp;
6105 ipx->ipx_ipsq_queued = B_TRUE;
6106 break;
6107 default:
6108 cmn_err(CE_PANIC, "ipsq_enq: unknown type %d\n", type);
6111 if (CONN_Q(q) && pending_ill != NULL) {
6112 connp = Q_TO_CONN(q);
6113 ASSERT(MUTEX_HELD(&connp->conn_lock));
6114 connp->conn_oper_pending_ill = pending_ill;
6115 }
6116 }
6117
6118 /*
6119 * Dequeue the next message that requested exclusive access to this IPSQ's
6120 * xop. Specifically:
6122 * 1. If we're still processing the current operation on `ipsq', then
6123 * dequeue the next message for the operation (from ipx_mphead), or
6124 * return NULL if there are no queued messages for the operation.
6125 * These messages are queued via CUR_OP to qwriter_ip() and friends.
6127 * 2. If the current operation on `ipsq' has completed (ipx_current_ipif is
6128 * not set) see if the ipsq has requested an xop switch. If so, switch
6129 * `ipsq' to a different xop. Xop switches only happen when joining or
6130 * leaving IPMP groups and require a careful dance -- see the comments
6131 * in-line below for details. If we're leaving a group xop or if we're
6132 * joining a group xop and become writer on it, then we proceed to (3).
6133 * Otherwise, we return NULL and exit the xop.
6135 * 3. For each IPSQ in the xop, return any switch operation stored on
6136 * ipsq_switch_mp (set via SWITCH_OP); these must be processed before
6137 * any other messages queued on the IPSQ. Otherwise, dequeue the next
6138 * exclusive operation (queued via NEW_OP) stored on ipsq_xopq_mphead.
6139 * Note that if the phyint tied to `ipsq' is not using IPMP there will
6140 * only be one IPSQ in the xop. Otherwise, there will be one IPSQ for
6141 * each phyint in the group, including the IPMP meta-interface phyint.
6142 */
6143 static mblk_t *
6144 ipsq_dq(ipsq_t *ipsq)
6146 ill_t *illv4, *illv6;
6147 mblk_t *mp;
6148 ipsq_t *xopipsq;
6149 ipsq_t *leftipsq = NULL;
6150 ipxop_t *ipx;
6151 phyint_t *phyi = ipsq->ipsq_phyint;
6152 ip_stack_t *ipst = ipsq->ipsq_ipst;
6153 boolean_t emptied = B_FALSE;
6156 * Grab all the locks we need in the defined order (ill_g_lock ->
6157 * ipsq_lock -> ipx_lock); ill_g_lock is needed to use ipsq_next.
6159 rw_enter(&ipst->ips_ill_g_lock,
6160 ipsq->ipsq_swxop != NULL ? RW_WRITER : RW_READER);
6161 mutex_enter(&ipsq->ipsq_lock);
6162 ipx = ipsq->ipsq_xop;
6163 mutex_enter(&ipx->ipx_lock);
6166 * Dequeue the next message associated with the current exclusive
6167 * operation, if any.
6169 if ((mp = ipx->ipx_mphead) != NULL) {
6170 ipx->ipx_mphead = mp->b_next;
6171 if (ipx->ipx_mphead == NULL)
6172 ipx->ipx_mptail = NULL;
6173 mp->b_next = (void *)ipsq;
6174 goto out;
6177 if (ipx->ipx_current_ipif != NULL)
6178 goto empty;
6180 if (ipsq->ipsq_swxop != NULL) {
6182 * The exclusive operation that is now being completed has
6183 * requested a switch to a different xop. This happens
6184 * when an interface joins or leaves an IPMP group. Joins
6185 * happen through SIOCSLIFGROUPNAME (ip_sioctl_groupname()).
6186 * Leaves happen via SIOCSLIFGROUPNAME, interface unplumb
6187 * (phyint_free()), or interface plumb for an ill type
6188 * not in the IPMP group (ip_rput_dlpi_writer()).
6190 * Xop switches are not allowed on the IPMP meta-interface.
6192 ASSERT(phyi == NULL || !(phyi->phyint_flags & PHYI_IPMP));
6193 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
6194 DTRACE_PROBE1(ipsq__switch, (ipsq_t *), ipsq);
6196 if (ipsq->ipsq_swxop == &ipsq->ipsq_ownxop) {
6198 * We're switching back to our own xop, so we have two
6199 * xop's to drain/exit: our own, and the group xop
6200 * that we are leaving.
6202 * First, pull ourselves out of the group ipsq list.
6203 * This is safe since we're writer on ill_g_lock.
6205 ASSERT(ipsq->ipsq_xop != &ipsq->ipsq_ownxop);
6207 xopipsq = ipx->ipx_ipsq;
6208 while (xopipsq->ipsq_next != ipsq)
6209 xopipsq = xopipsq->ipsq_next;
6211 xopipsq->ipsq_next = ipsq->ipsq_next;
6212 ipsq->ipsq_next = ipsq;
6213 ipsq->ipsq_xop = ipsq->ipsq_swxop;
6214 ipsq->ipsq_swxop = NULL;
6217 * Second, prepare to exit the group xop. The actual
6218 * ipsq_exit() is done at the end of this function
6219 * since we cannot hold any locks across ipsq_exit().
6220 * Note that although we drop the group's ipx_lock, no
6221 * threads can proceed since we're still ipx_writer.
6223 leftipsq = xopipsq;
6224 mutex_exit(&ipx->ipx_lock);
6227 * Third, set ipx to point to our own xop (which was
6228 * inactive and therefore can be entered).
6230 ipx = ipsq->ipsq_xop;
6231 mutex_enter(&ipx->ipx_lock);
6232 ASSERT(ipx->ipx_writer == NULL);
6233 ASSERT(ipx->ipx_current_ipif == NULL);
6234 } else {
6236 * We're switching from our own xop to a group xop.
6237 * The requestor of the switch must ensure that the
6238 * group xop cannot go away (e.g. by ensuring the
6239 * phyint associated with the xop cannot go away).
6241 * If we can become writer on our new xop, then we'll
6242 * do the drain. Otherwise, the current writer of our
6243 * new xop will do the drain when it exits.
6245 * First, splice ourselves into the group IPSQ list.
6246 * This is safe since we're writer on ill_g_lock.
6248 ASSERT(ipsq->ipsq_xop == &ipsq->ipsq_ownxop);
6250 xopipsq = ipsq->ipsq_swxop->ipx_ipsq;
6251 while (xopipsq->ipsq_next != ipsq->ipsq_swxop->ipx_ipsq)
6252 xopipsq = xopipsq->ipsq_next;
6254 xopipsq->ipsq_next = ipsq;
6255 ipsq->ipsq_next = ipsq->ipsq_swxop->ipx_ipsq;
6256 ipsq->ipsq_xop = ipsq->ipsq_swxop;
6257 ipsq->ipsq_swxop = NULL;
6260 * Second, exit our own xop, since it's now unused.
6261 * This is safe since we've got the only reference.
6263 ASSERT(ipx->ipx_writer == curthread);
6264 ipx->ipx_writer = NULL;
6265 VERIFY(--ipx->ipx_reentry_cnt == 0);
6266 ipx->ipx_ipsq_queued = B_FALSE;
6267 mutex_exit(&ipx->ipx_lock);
6270 * Third, set ipx to point to our new xop, and check
6271 * if we can become writer on it. If we cannot, then
6272 * the current writer will drain the IPSQ group when
6273 * it exits. Our ipsq_xop is guaranteed to be stable
6274 * because we're still holding ipsq_lock.
6276 ipx = ipsq->ipsq_xop;
6277 mutex_enter(&ipx->ipx_lock);
6278 if (ipx->ipx_writer != NULL ||
6279 ipx->ipx_current_ipif != NULL) {
6280 goto out;
6285 * Fourth, become writer on our new ipx before we continue
6286 * with the drain. Note that we never dropped ipsq_lock
6287 * above, so no other thread could've raced with us to
6288 * become writer first. Also, we're holding ipx_lock, so
6289 * no other thread can examine the ipx right now.
6291 ASSERT(ipx->ipx_current_ipif == NULL);
6292 ASSERT(ipx->ipx_mphead == NULL && ipx->ipx_mptail == NULL);
6293 VERIFY(ipx->ipx_reentry_cnt++ == 0);
6294 ipx->ipx_writer = curthread;
6295 ipx->ipx_forced = B_FALSE;
6296 #ifdef DEBUG
6297 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH);
6298 #endif
6301 xopipsq = ipsq;
6302 do {
6304 * So that other operations operate on a consistent and
6305 * complete phyint, a switch message on an IPSQ must be
6306 * handled prior to any other operations on that IPSQ.
6308 if ((mp = xopipsq->ipsq_switch_mp) != NULL) {
6309 xopipsq->ipsq_switch_mp = NULL;
6310 ASSERT(mp->b_next == NULL);
6311 mp->b_next = (void *)xopipsq;
6312 goto out;
6315 if ((mp = xopipsq->ipsq_xopq_mphead) != NULL) {
6316 xopipsq->ipsq_xopq_mphead = mp->b_next;
6317 if (xopipsq->ipsq_xopq_mphead == NULL)
6318 xopipsq->ipsq_xopq_mptail = NULL;
6319 mp->b_next = (void *)xopipsq;
6320 goto out;
6322 } while ((xopipsq = xopipsq->ipsq_next) != ipsq);
6323 empty:
6325 * There are no messages. Further, we are holding ipx_lock, hence no
6326 * new messages can end up on any IPSQ in the xop.
6328 ipx->ipx_writer = NULL;
6329 ipx->ipx_forced = B_FALSE;
6330 VERIFY(--ipx->ipx_reentry_cnt == 0);
6331 ipx->ipx_ipsq_queued = B_FALSE;
6332 emptied = B_TRUE;
6333 #ifdef DEBUG
6334 ipx->ipx_depth = 0;
6335 #endif
6336 out:
6337 mutex_exit(&ipx->ipx_lock);
6338 mutex_exit(&ipsq->ipsq_lock);
6341 * If we completely emptied the xop, then wake up any threads waiting
6342 * to enter any of the IPSQ's associated with it.
6344 if (emptied) {
6345 xopipsq = ipsq;
6346 do {
6347 if ((phyi = xopipsq->ipsq_phyint) == NULL)
6348 continue;
6350 illv4 = phyi->phyint_illv4;
6351 illv6 = phyi->phyint_illv6;
6353 GRAB_ILL_LOCKS(illv4, illv6);
6354 if (illv4 != NULL)
6355 cv_broadcast(&illv4->ill_cv);
6356 if (illv6 != NULL)
6357 cv_broadcast(&illv6->ill_cv);
6358 RELEASE_ILL_LOCKS(illv4, illv6);
6359 } while ((xopipsq = xopipsq->ipsq_next) != ipsq);
6361 rw_exit(&ipst->ips_ill_g_lock);
6364 * Now that all locks are dropped, exit the IPSQ we left.
6366 if (leftipsq != NULL)
6367 ipsq_exit(leftipsq);
6369 return (mp);
6370 }
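/*
 * Illustrative summary (not part of this file) of the dequeue priority
 * implemented above, for a given xop:
 *
 *	1. ipx_mphead		messages for the current operation (CUR_OP)
 *	2. ipsq_switch_mp	a pending xop switch, per IPSQ (SWITCH_OP)
 *	3. ipsq_xopq_mphead	the next exclusive operation, per IPSQ (NEW_OP)
 *
 * with (2) and (3) scanned across every IPSQ sharing the xop.
 */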
6371
6372 /*
6373 * Return completion status of previously initiated DLPI operations on
6374 * ills in the purview of an ipsq.
6375 */
6376 static boolean_t
6377 ipsq_dlpi_done(ipsq_t *ipsq)
6379 ipsq_t *ipsq_start;
6380 phyint_t *phyi;
6381 ill_t *ill;
6383 ASSERT(RW_LOCK_HELD(&ipsq->ipsq_ipst->ips_ill_g_lock));
6384 ipsq_start = ipsq;
6386 do {
6388 * The only current users of this function are ipsq_try_enter
6389 * and ipsq_enter which have made sure that ipsq_writer is
6390 * NULL before we reach here. ill_dlpi_pending is modified
6391 * only by an ipsq writer.
6393 ASSERT(ipsq->ipsq_xop->ipx_writer == NULL);
6394 phyi = ipsq->ipsq_phyint;
6396 * phyi could be NULL if a phyint that is part of an
6397 * IPMP group is being unplumbed. A more detailed
6398 * comment is in ipmp_grp_update_kstats()
6400 if (phyi != NULL) {
6401 ill = phyi->phyint_illv4;
6402 if (ill != NULL &&
6403 (ill->ill_dlpi_pending != DL_PRIM_INVAL ||
6404 ill->ill_arl_dlpi_pending))
6405 return (B_FALSE);
6407 ill = phyi->phyint_illv6;
6408 if (ill != NULL &&
6409 ill->ill_dlpi_pending != DL_PRIM_INVAL)
6410 return (B_FALSE);
6413 } while ((ipsq = ipsq->ipsq_next) != ipsq_start);
6415 return (B_TRUE);
6416 }
6417
6418 /*
6419 * Enter the ipsq corresponding to ill, by waiting synchronously till
6420 * we can enter the ipsq exclusively. Unless 'force' is used, the ipsq
6421 * will have to drain completely before ipsq_enter returns success.
6422 * ipx_current_ipif will be set if some exclusive op is in progress,
6423 * and the ipsq_exit logic will start the next enqueued op after
6424 * completion of the current op. If 'force' is used, we don't wait
6425 * for the enqueued ops. This is needed when a conn_close wants to
6426 * enter the ipsq and abort an ioctl that is somehow stuck. Unplumb
6427 * of an ill can also use this option, but we don't use it currently.
6428 */
6429 #define ENTER_SQ_WAIT_TICKS 100
6430 boolean_t
6431 ipsq_enter(ill_t *ill, boolean_t force, int type)
6433 ipsq_t *ipsq;
6434 ipxop_t *ipx;
6435 boolean_t waited_enough = B_FALSE;
6436 ip_stack_t *ipst = ill->ill_ipst;
6439 * Note that the relationship between ill and ipsq is fixed as long as
6440 * the ill is not ILL_CONDEMNED. Holding ipsq_lock ensures the
6441 * relationship between the IPSQ and xop cannot change. However,
6442 * since we cannot hold ipsq_lock across the cv_wait(), it may change
6443 * while we're waiting. We wait on ill_cv and rely on ipsq_exit()
6444 * waking up all ills in the xop when it becomes available.
6446 for (;;) {
6447 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
6448 mutex_enter(&ill->ill_lock);
6449 if (ill->ill_state_flags & ILL_CONDEMNED) {
6450 mutex_exit(&ill->ill_lock);
6451 rw_exit(&ipst->ips_ill_g_lock);
6452 return (B_FALSE);
6455 ipsq = ill->ill_phyint->phyint_ipsq;
6456 mutex_enter(&ipsq->ipsq_lock);
6457 ipx = ipsq->ipsq_xop;
6458 mutex_enter(&ipx->ipx_lock);
6460 if (ipx->ipx_writer == NULL && (type == CUR_OP ||
6461 (ipx->ipx_current_ipif == NULL && ipsq_dlpi_done(ipsq)) ||
6462 waited_enough))
6463 break;
6465 rw_exit(&ipst->ips_ill_g_lock);
6467 if (!force || ipx->ipx_writer != NULL) {
6468 mutex_exit(&ipx->ipx_lock);
6469 mutex_exit(&ipsq->ipsq_lock);
6470 cv_wait(&ill->ill_cv, &ill->ill_lock);
6471 } else {
6472 mutex_exit(&ipx->ipx_lock);
6473 mutex_exit(&ipsq->ipsq_lock);
6474 (void) cv_reltimedwait(&ill->ill_cv,
6475 &ill->ill_lock, ENTER_SQ_WAIT_TICKS, TR_CLOCK_TICK);
6476 waited_enough = B_TRUE;
6478 mutex_exit(&ill->ill_lock);
6481 ASSERT(ipx->ipx_mphead == NULL && ipx->ipx_mptail == NULL);
6482 ASSERT(ipx->ipx_reentry_cnt == 0);
6483 ipx->ipx_writer = curthread;
6484 ipx->ipx_forced = (ipx->ipx_current_ipif != NULL);
6485 ipx->ipx_reentry_cnt++;
6486 #ifdef DEBUG
6487 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH);
6488 #endif
6489 mutex_exit(&ipx->ipx_lock);
6490 mutex_exit(&ipsq->ipsq_lock);
6491 mutex_exit(&ill->ill_lock);
6492 rw_exit(&ipst->ips_ill_g_lock);
6494 return (B_TRUE);
6495 }
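/*
 * Illustrative sketch (not part of this file): a caller that must abort
 * a stuck exclusive operation can force its way in:
 *
 *	if (ipsq_enter(ill, B_TRUE, NEW_OP)) {
 *		(abort the stuck operation)
 *		ipsq_exit(ill->ill_phyint->phyint_ipsq);
 *	}
 *
 * With force set, entry succeeds after ENTER_SQ_WAIT_TICKS even if an
 * operation is still in progress; ipx_forced records that this happened.
 */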
6496
6497 /*
6498 * ipif_set_values() has a constraint that it cannot drop the ips_ill_g_lock
6499 * across the call to the core interface ipsq_try_enter() and hence calls this
6500 * function directly. This is explained more fully in ipif_set_values().
6501 * In order to support the above constraint, ipsq_try_enter is implemented as
6502 * a wrapper that grabs the ips_ill_g_lock and then calls this function.
6503 */
6504 static ipsq_t *
6505 ipsq_try_enter_internal(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func,
6506 int type, boolean_t reentry_ok)
6508 ipsq_t *ipsq;
6509 ipxop_t *ipx;
6510 ip_stack_t *ipst = ill->ill_ipst;
6513 * lock ordering:
6514 * ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock -> ipx_lock.
6516 * ipx of an ipsq can't change when ipsq_lock is held.
6518 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));
6519 GRAB_CONN_LOCK(q);
6520 mutex_enter(&ill->ill_lock);
6521 ipsq = ill->ill_phyint->phyint_ipsq;
6522 mutex_enter(&ipsq->ipsq_lock);
6523 ipx = ipsq->ipsq_xop;
6524 mutex_enter(&ipx->ipx_lock);
6527 * 1. Enter the ipsq if we are already writer and reentry is ok.
6528 * (Note: If the caller does not specify reentry_ok then neither
6529 * 'func' nor any of its callees must ever attempt to enter the ipsq
6530 * again. Otherwise it can lead to an infinite loop.)
6531 * 2. Enter the ipsq if there is no current writer and this attempted
6532 * entry is part of the current operation
6533 * 3. Enter the ipsq if there is no current writer and this is a new
6534 * operation and the operation queue is empty and there is no
6535 * operation currently in progress and if all previously initiated
6536 * DLPI operations have completed.
6538 if ((ipx->ipx_writer == curthread && reentry_ok) ||
6539 (ipx->ipx_writer == NULL && (type == CUR_OP || (type == NEW_OP &&
6540 !ipx->ipx_ipsq_queued && ipx->ipx_current_ipif == NULL &&
6541 ipsq_dlpi_done(ipsq))))) {
6542 /* Success. */
6543 ipx->ipx_reentry_cnt++;
6544 ipx->ipx_writer = curthread;
6545 ipx->ipx_forced = B_FALSE;
6546 mutex_exit(&ipx->ipx_lock);
6547 mutex_exit(&ipsq->ipsq_lock);
6548 mutex_exit(&ill->ill_lock);
6549 RELEASE_CONN_LOCK(q);
6550 #ifdef DEBUG
6551 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH);
6552 #endif
6553 return (ipsq);
6556 if (func != NULL)
6557 ipsq_enq(ipsq, q, mp, func, type, ill);
6559 mutex_exit(&ipx->ipx_lock);
6560 mutex_exit(&ipsq->ipsq_lock);
6561 mutex_exit(&ill->ill_lock);
6562 RELEASE_CONN_LOCK(q);
6563 return (NULL);
6564 }
6565
6566 /*
6567 * The ipsq_t (ipsq) is the synchronization data structure used to serialize
6568 * certain critical operations like plumbing (i.e. most set ioctls), etc.
6569 * There is one ipsq per phyint. The ipsq
6570 * serializes exclusive ioctls issued by applications on a per ipsq basis in
6571 * ipsq_xopq_mphead. It also protects against multiple threads executing in
6572 * the ipsq. Responses from the driver pertain to the current ioctl (say a
6573 * DL_BIND_ACK in response to a DL_BIND_REQ initiated as part of bringing
6574 * up the interface) and are enqueued in ipx_mphead.
6576 * If a thread does not want to reenter the ipsq when it is already writer,
6577 * it must make sure that neither the specified reentry point, to be called
6578 * later when the ipsq is empty, nor any code path starting from that reentry
6579 * point, ever tries to enter the ipsq again. Otherwise it can lead
6580 * to an infinite loop. The reentry point ip_rput_dlpi_writer is an example.
6581 * When the thread that is currently exclusive finishes, it (ipsq_exit)
6582 * dequeues the requests waiting to become exclusive in ipx_mphead and calls
6583 * the reentry point. When the list at ipx_mphead becomes empty ipsq_exit
6584 * proceeds to dequeue the next ioctl in ipsq_xopq_mphead and start the next
6585 * ioctl if the current ioctl has completed. If the current ioctl is still
6586 * in progress it simply returns. The current ioctl could be waiting for
6587 * a response from another module (the driver), or could be waiting for
6588 * the ipif/ill/ire refcnts to drop to zero. In such a case the ipx_pending_mp
6589 * and ipx_pending_ipif are set. ipx_current_ipif is set throughout the
6590 * execution of the ioctl and ipsq_exit does not start the next ioctl unless
6591 * ipx_current_ipif is NULL which happens only once the ioctl is complete and
6592 * all associated DLPI operations have completed.
6593 */
6594
6595 /*
6596 * Try to enter the IPSQ corresponding to `ipif' or `ill' exclusively (`ipif'
6597 * and `ill' cannot both be specified). Returns a pointer to the entered IPSQ
6598 * on success, or NULL on failure. The caller ensures ipif/ill is valid by
6599 * refholding it as necessary. If the IPSQ cannot be entered and `func' is
6600 * non-NULL, then `func' will be called back with `q' and `mp' once the IPSQ
6601 * can be entered. If `func' is NULL, then `q' and `mp' are ignored.
6602 */
6603 ipsq_t *
6604 ipsq_try_enter(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp,
6605 ipsq_func_t func, int type, boolean_t reentry_ok)
6607 ip_stack_t *ipst;
6608 ipsq_t *ipsq;
6610 /* Only 1 of ipif or ill can be specified */
6611 ASSERT((ipif != NULL) ^ (ill != NULL));
6613 if (ipif != NULL)
6614 ill = ipif->ipif_ill;
6615 ipst = ill->ill_ipst;
6617 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
6618 ipsq = ipsq_try_enter_internal(ill, q, mp, func, type, reentry_ok);
6619 rw_exit(&ipst->ips_ill_g_lock);
6621 return (ipsq);
6622 }
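/*
 * Illustrative sketch (not part of this file): the common pattern is to
 * try to enter and rely on requeueing when the IPSQ is busy, e.g.:
 *
 *	ipsq = ipsq_try_enter(ipif, NULL, q, mp, ip_process_ioctl,
 *	    NEW_OP, B_TRUE);
 *	if (ipsq == NULL)
 *		return;		(mp was queued; the callback runs later)
 *	(perform the exclusive operation)
 *	ipsq_exit(ipsq);
 *
 * ip_process_ioctl is used here as a plausible callback; any ipsq_func_t
 * works the same way.
 */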
6623
6624 /*
6625 * Try to enter the IPSQ corresponding to `ill' as writer. The caller ensures
6626 * ill is valid by refholding it if necessary; we will refrele. If the IPSQ
6627 * cannot be entered, the mp is queued for completion.
6628 */
6629 void
6630 qwriter_ip(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func, int type,
6631 boolean_t reentry_ok)
6633 ipsq_t *ipsq;
6635 ipsq = ipsq_try_enter(NULL, ill, q, mp, func, type, reentry_ok);
6638 * Drop the caller's refhold on the ill. This is safe since we either
6639 * entered the IPSQ (and thus are exclusive), or failed to enter the
6640 * IPSQ, in which case we return without accessing ill anymore. This
6641 * is needed because func needs to see the correct refcount.
6642 * e.g. removeif can work only then.
6644 ill_refrele(ill);
6645 if (ipsq != NULL) {
6646 (*func)(ipsq, q, mp, NULL);
6647 ipsq_exit(ipsq);
6648 }
6649 }
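/*
 * Illustrative sketch (not part of this file): DLPI ack processing uses
 * this routine to become writer before touching ill state, e.g.:
 *
 *	ill_refhold(ill);
 *	qwriter_ip(ill, ill->ill_rq, mp, ip_rput_dlpi_writer,
 *	    CUR_OP, B_FALSE);
 *
 * qwriter_ip() consumes the refhold; ip_rput_dlpi_writer() runs either
 * immediately (if the IPSQ could be entered) or when the IPSQ drains.
 */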
6650
6651 /*
6652 * Exit the specified IPSQ. If this is the final exit on it then drain it
6653 * prior to exiting. Caller must be writer on the specified IPSQ.
6654 */
6655 void
6656 ipsq_exit(ipsq_t *ipsq)
6658 mblk_t *mp;
6659 ipsq_t *mp_ipsq;
6660 queue_t *q;
6661 phyint_t *phyi;
6662 ipsq_func_t func;
6664 ASSERT(IAM_WRITER_IPSQ(ipsq));
6666 ASSERT(ipsq->ipsq_xop->ipx_reentry_cnt >= 1);
6667 if (ipsq->ipsq_xop->ipx_reentry_cnt != 1) {
6668 ipsq->ipsq_xop->ipx_reentry_cnt--;
6669 return;
6672 for (;;) {
6673 phyi = ipsq->ipsq_phyint;
6674 mp = ipsq_dq(ipsq);
6675 mp_ipsq = (mp == NULL) ? NULL : (ipsq_t *)mp->b_next;
6678 * If we've changed to a new IPSQ, and the phyint associated
6679 * with the old one has gone away, free the old IPSQ. Note
6680 * that this cannot happen while the IPSQ is in a group.
6682 if (mp_ipsq != ipsq && phyi == NULL) {
6683 ASSERT(ipsq->ipsq_next == ipsq);
6684 ASSERT(ipsq->ipsq_xop == &ipsq->ipsq_ownxop);
6685 ipsq_delete(ipsq);
6688 if (mp == NULL)
6689 break;
6691 q = mp->b_queue;
6692 func = (ipsq_func_t)mp->b_prev;
6693 ipsq = mp_ipsq;
6694 mp->b_next = mp->b_prev = NULL;
6695 mp->b_queue = NULL;
6698 * If 'q' is a conn queue, it is valid, since we did a
6699 * refhold on the conn at the start of the ioctl.
6700 * If 'q' is an ill queue, it is valid, since close of an
6701 * ill will clean up its IPSQ.
6703 (*func)(ipsq, q, mp, NULL);
6708 * Used to start any igmp or mld timers that could not be started
6709 * while holding ill_mcast_lock. The timers can't be started while holding
6710 * the lock, since mld/igmp_start_timers may need to call untimeout()
6711 * which can't be done while holding the lock which the timeout handler
6712 * acquires; otherwise there could be a deadlock, since the timeout handlers
6714 * mld_timeout_handler_per_ill/igmp_timeout_handler_per_ill also acquire
6715 * ill_mcast_lock.
6717 void
6718 ill_mcast_timer_start(ip_stack_t *ipst)
6720 int next;
6722 mutex_enter(&ipst->ips_igmp_timer_lock);
6723 next = ipst->ips_igmp_deferred_next;
6724 ipst->ips_igmp_deferred_next = INFINITY;
6725 mutex_exit(&ipst->ips_igmp_timer_lock);
6727 if (next != INFINITY)
6728 igmp_start_timers(next, ipst);
6730 mutex_enter(&ipst->ips_mld_timer_lock);
6731 next = ipst->ips_mld_deferred_next;
6732 ipst->ips_mld_deferred_next = INFINITY;
6733 mutex_exit(&ipst->ips_mld_timer_lock);
6735 if (next != INFINITY)
6736 mld_start_timers(next, ipst);
6737 }
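/*
 * Illustrative sketch (not part of this file; the caller context and the
 * lock type are assumptions): code records deferred timer starts while
 * holding ill_mcast_lock and starts them once the lock is dropped:
 *
 *	rw_exit(&ill->ill_mcast_lock);
 *	ill_mcast_timer_start(ill->ill_ipst);
 */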
6738
6739 /*
6740 * Start the current exclusive operation on `ipsq'; associate it with `ipif'
6741 * and `ioccmd'.
6742 */
6743 void
6744 ipsq_current_start(ipsq_t *ipsq, ipif_t *ipif, int ioccmd)
6746 ill_t *ill = ipif->ipif_ill;
6747 ipxop_t *ipx = ipsq->ipsq_xop;
6749 ASSERT(IAM_WRITER_IPSQ(ipsq));
6750 ASSERT(ipx->ipx_current_ipif == NULL);
6751 ASSERT(ipx->ipx_current_ioctl == 0);
6753 ipx->ipx_current_done = B_FALSE;
6754 ipx->ipx_current_ioctl = ioccmd;
6755 mutex_enter(&ipx->ipx_lock);
6756 ipx->ipx_current_ipif = ipif;
6757 mutex_exit(&ipx->ipx_lock);
6760 * Set IPIF_CHANGING on one or more ipifs associated with the
6761 * current exclusive operation. IPIF_CHANGING prevents any new
6762 * references to the ipif (so that the references will eventually
6763 * drop to zero) and also prevents any "get" operations (e.g.,
6764 * SIOCGLIFFLAGS) from being able to access the ipif until the
6765 * operation has completed and the ipif is again in a stable state.
6767 * For ioctls, IPIF_CHANGING is set on the ipif associated with the
6768 * ioctl. For internal operations (where ioccmd is zero), all ipifs
6769 * on the ill are marked with IPIF_CHANGING since it's unclear which
6770 * ipifs will be affected.
6772 * Note that SIOCLIFREMOVEIF is a special case as it sets
6773 * IPIF_CONDEMNED internally after identifying the right ipif to
6774 * operate on.
6776 switch (ioccmd) {
6777 case SIOCLIFREMOVEIF:
6778 break;
6779 case 0:
6780 mutex_enter(&ill->ill_lock);
6781 ipif = ipif->ipif_ill->ill_ipif;
6782 for (; ipif != NULL; ipif = ipif->ipif_next)
6783 ipif->ipif_state_flags |= IPIF_CHANGING;
6784 mutex_exit(&ill->ill_lock);
6785 break;
6786 default:
6787 mutex_enter(&ill->ill_lock);
6788 ipif->ipif_state_flags |= IPIF_CHANGING;
6789 mutex_exit(&ill->ill_lock);
6790 }
6791 }
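/*
 * Illustrative sketch (not part of this file): ipsq_current_start() and
 * ipsq_current_finish() bracket each exclusive operation, e.g.:
 *
 *	ipsq_current_start(ipsq, ipif, SIOCSLIFFLAGS);
 *	(carry out the ioctl, possibly across DLPI round-trips)
 *	ipsq_current_finish(ipsq);
 *
 * SIOCSLIFFLAGS is just an example ioctl; internal operations pass 0.
 */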
6792
6793 /*
6794 * Finish the current exclusive operation on `ipsq'. Usually, this will allow
6795 * the next exclusive operation to begin once we ipsq_exit(). However, if
6796 * pending DLPI operations remain, then we will wait for the queue to drain
6797 * before allowing the next exclusive operation to begin. This ensures that
6798 * DLPI operations from one exclusive operation are never improperly processed
6799 * as part of a subsequent exclusive operation.
6800 */
6801 void
6802 ipsq_current_finish(ipsq_t *ipsq)
6804 ipxop_t *ipx = ipsq->ipsq_xop;
6805 t_uscalar_t dlpi_pending = DL_PRIM_INVAL;
6806 ipif_t *ipif = ipx->ipx_current_ipif;
6808 ASSERT(IAM_WRITER_IPSQ(ipsq));
6811 * For SIOCLIFREMOVEIF, the ipif has been already been blown away
6812 * (but in that case, IPIF_CHANGING will already be clear and no
6813 * pending DLPI messages can remain).
6815 if (ipx->ipx_current_ioctl != SIOCLIFREMOVEIF) {
6816 ill_t *ill = ipif->ipif_ill;
6818 mutex_enter(&ill->ill_lock);
6819 dlpi_pending = ill->ill_dlpi_pending;
6820 if (ipx->ipx_current_ioctl == 0) {
6821 ipif = ill->ill_ipif;
6822 for (; ipif != NULL; ipif = ipif->ipif_next)
6823 ipif->ipif_state_flags &= ~IPIF_CHANGING;
6824 } else {
6825 ipif->ipif_state_flags &= ~IPIF_CHANGING;
6827 mutex_exit(&ill->ill_lock);
6830 ASSERT(!ipx->ipx_current_done);
6831 ipx->ipx_current_done = B_TRUE;
6832 ipx->ipx_current_ioctl = 0;
6833 if (dlpi_pending == DL_PRIM_INVAL) {
6834 mutex_enter(&ipx->ipx_lock);
6835 ipx->ipx_current_ipif = NULL;
6836 mutex_exit(&ipx->ipx_lock);
6841 * The ill is closing. Flush all messages on the ipsq that originated
6842 * from this ill. Usually there won't be any messages on the ipsq_xopq_mphead
6843 * for this ill since ipsq_enter could not have entered until then.
6844 * New messages can't be queued since the CONDEMNED flag is set.
6846 static void
6847 ipsq_flush(ill_t *ill)
6849 queue_t *q;
6850 mblk_t *prev;
6851 mblk_t *mp;
6852 mblk_t *mp_next;
6853 ipxop_t *ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop;
6855 ASSERT(IAM_WRITER_ILL(ill));
6858 * Flush any messages sent up by the driver.
6860 mutex_enter(&ipx->ipx_lock);
6861 for (prev = NULL, mp = ipx->ipx_mphead; mp != NULL; mp = mp_next) {
6862 mp_next = mp->b_next;
6863 q = mp->b_queue;
6864 if (q == ill->ill_rq || q == ill->ill_wq) {
6865 /* dequeue mp */
6866 if (prev == NULL)
6867 ipx->ipx_mphead = mp->b_next;
6868 else
6869 prev->b_next = mp->b_next;
6870 if (ipx->ipx_mptail == mp) {
6871 ASSERT(mp_next == NULL);
6872 ipx->ipx_mptail = prev;
6874 inet_freemsg(mp);
6875 } else {
6876 prev = mp;
6879 mutex_exit(&ipx->ipx_lock);
6880 (void) ipsq_pending_mp_cleanup(ill, NULL);
6881 ipsq_xopq_mp_cleanup(ill, NULL);
6885 * Parse an ifreq or lifreq struct coming down ioctls and refhold
6886 * and return the associated ipif.
6887 * Return value:
6888 * Non-zero: an error has occurred; ci may not be filled out.
6889 * Zero: ci is filled out with the ioctl cmd in ci.ci_name, and
6890 * a held ipif in ci.ci_ipif.
6891 */
6892 int
6893 ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip,
6894 cmd_info_t *ci)
6895 {
6896 char *name;
6897 struct ifreq *ifr;
6898 struct lifreq *lifr;
6899 ipif_t *ipif = NULL;
6900 ill_t *ill;
6901 conn_t *connp;
6902 boolean_t isv6;
6903 int err;
6904 mblk_t *mp1;
6905 zoneid_t zoneid;
6906 ip_stack_t *ipst;
6908 if (q->q_next != NULL) {
6909 ill = (ill_t *)q->q_ptr;
6910 isv6 = ill->ill_isv6;
6911 connp = NULL;
6912 zoneid = ALL_ZONES;
6913 ipst = ill->ill_ipst;
6914 } else {
6915 ill = NULL;
6916 connp = Q_TO_CONN(q);
6917 isv6 = (connp->conn_family == AF_INET6);
6918 zoneid = connp->conn_zoneid;
6919 if (zoneid == GLOBAL_ZONEID) {
6920 /* global zone can access ipifs in all zones */
6921 zoneid = ALL_ZONES;
6923 ipst = connp->conn_netstack->netstack_ip;
6926 /* Has been checked in ip_wput_nondata */
6927 mp1 = mp->b_cont->b_cont;
6929 if (ipip->ipi_cmd_type == IF_CMD) {
6930 /* This is an old-style SIOC[GS]IF* command */
6931 ifr = (struct ifreq *)mp1->b_rptr;
6933 * Null terminate the string to protect against buffer
6934 * overrun. String was generated by user code and may not
6935 * be trusted.
6937 ifr->ifr_name[IFNAMSIZ - 1] = '\0';
6938 name = ifr->ifr_name;
6939 ci->ci_sin = (sin_t *)&ifr->ifr_addr;
6940 ci->ci_sin6 = NULL;
6941 ci->ci_lifr = (struct lifreq *)ifr;
6942 } else {
6943 /* This is a new-style SIOC[GS]LIF* command */
6944 ASSERT(ipip->ipi_cmd_type == LIF_CMD);
6945 lifr = (struct lifreq *)mp1->b_rptr;
6947 * Null terminate the string to protect against buffer
6948 * overrun. String was generated by user code and may not
6949 * be trusted.
6951 lifr->lifr_name[LIFNAMSIZ - 1] = '\0';
6952 name = lifr->lifr_name;
6953 ci->ci_sin = (sin_t *)&lifr->lifr_addr;
6954 ci->ci_sin6 = (sin6_t *)&lifr->lifr_addr;
6955 ci->ci_lifr = lifr;
6958 if (ipip->ipi_cmd == SIOCSLIFNAME) {
6959 /*
6960 * The ioctl will fail if it comes down
6961 * a conn stream.
6962 */
6963 if (ill == NULL) {
6964 /*
6965 * Not an ill queue; fail with ENXIO.
6966 */
6968 return (ENXIO);
6970 ipif = ill->ill_ipif;
6971 ipif_refhold(ipif);
6972 } else {
6974 * Ensure that ioctls don't see any internal state changes
6975 * caused by set ioctls by deferring them if IPIF_CHANGING is
6976 * set.
6978 ipif = ipif_lookup_on_name_async(name, mi_strlen(name),
6979 isv6, zoneid, q, mp, ip_process_ioctl, &err, ipst);
6980 if (ipif == NULL) {
6981 if (err == EINPROGRESS)
6982 return (err);
6983 err = 0; /* Ensure we don't use it below */
6987 /*
6988 * An old-style [GS]IFCMD does not admit an IPv6 ipif.
6989 */
6990 if (ipif != NULL && ipif->ipif_isv6 && ipip->ipi_cmd_type == IF_CMD) {
6991 ipif_refrele(ipif);
6992 return (ENXIO);
6995 if (ipif == NULL && ill != NULL && ill->ill_ipif != NULL &&
6996 name[0] == '\0') {
6998 * Handle a SIOC?IF* with a null name
6999 * during plumb (on the ill queue before the I_PLINK).
7001 ipif = ill->ill_ipif;
7002 ipif_refhold(ipif);
7005 if (ipif == NULL)
7006 return (ENXIO);
7008 DTRACE_PROBE4(ipif__ioctl, char *, "ip_extract_lifreq",
7009 int, ipip->ipi_cmd, ill_t *, ipif->ipif_ill, ipif_t *, ipif);
7011 ci->ci_ipif = ipif;
7012 return (0);
7013 }
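/*
 * Illustrative sketch (not part of this file): a hypothetical caller
 * owns the ipif reference on success and must release it:
 *
 *	cmd_info_t ci;
 *	int err;
 *
 *	if ((err = ip_extract_lifreq(q, mp, ipip, &ci)) != 0)
 *		return (err);
 *	(operate on ci.ci_ipif)
 *	ipif_refrele(ci.ci_ipif);
 */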
7014
7015 /*
7016 * Return the total number of ipifs.
7017 */
7018 static uint_t
7019 ip_get_numifs(zoneid_t zoneid, ip_stack_t *ipst)
7021 uint_t numifs = 0;
7022 ill_t *ill;
7023 ill_walk_context_t ctx;
7024 ipif_t *ipif;
7026 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
7027 ill = ILL_START_WALK_V4(&ctx, ipst);
7028 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
7029 if (IS_UNDER_IPMP(ill))
7030 continue;
7031 for (ipif = ill->ill_ipif; ipif != NULL;
7032 ipif = ipif->ipif_next) {
7033 if (ipif->ipif_zoneid == zoneid ||
7034 ipif->ipif_zoneid == ALL_ZONES)
7035 numifs++;
7038 rw_exit(&ipst->ips_ill_g_lock);
7039 return (numifs);
7040 }
7041
7042 /*
7043 * Return the total number of ipifs, filtered by family, lifn_flags, and zone.
7044 */
7045 static uint_t
7046 ip_get_numlifs(int family, int lifn_flags, zoneid_t zoneid, ip_stack_t *ipst)
7048 uint_t numifs = 0;
7049 ill_t *ill;
7050 ipif_t *ipif;
7051 ill_walk_context_t ctx;
7053 ip1dbg(("ip_get_numlifs(%d %u %d)\n", family, lifn_flags, (int)zoneid));
7055 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
7056 if (family == AF_INET)
7057 ill = ILL_START_WALK_V4(&ctx, ipst);
7058 else if (family == AF_INET6)
7059 ill = ILL_START_WALK_V6(&ctx, ipst);
7060 else
7061 ill = ILL_START_WALK_ALL(&ctx, ipst);
7063 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
7064 if (IS_UNDER_IPMP(ill) && !(lifn_flags & LIFC_UNDER_IPMP))
7065 continue;
7067 for (ipif = ill->ill_ipif; ipif != NULL;
7068 ipif = ipif->ipif_next) {
7069 if ((ipif->ipif_flags & IPIF_NOXMIT) &&
7070 !(lifn_flags & LIFC_NOXMIT))
7071 continue;
7072 if ((ipif->ipif_flags & IPIF_TEMPORARY) &&
7073 !(lifn_flags & LIFC_TEMPORARY))
7074 continue;
7075 if (((ipif->ipif_flags &
7076 (IPIF_NOXMIT|IPIF_NOLOCAL|
7077 IPIF_DEPRECATED)) ||
7078 IS_LOOPBACK(ill) ||
7079 !(ipif->ipif_flags & IPIF_UP)) &&
7080 (lifn_flags & LIFC_EXTERNAL_SOURCE))
7081 continue;
7083 if (zoneid != ipif->ipif_zoneid &&
7084 ipif->ipif_zoneid != ALL_ZONES &&
7085 (zoneid != GLOBAL_ZONEID ||
7086 !(lifn_flags & LIFC_ALLZONES)))
7087 continue;
7089 numifs++;
7092 rw_exit(&ipst->ips_ill_g_lock);
7093 return (numifs);
7096 uint_t
7097 ip_get_lifsrcofnum(ill_t *ill)
7099 uint_t numifs = 0;
7100 ill_t *ill_head = ill;
7101 ip_stack_t *ipst = ill->ill_ipst;
7103 /*
7104 * ill_g_usesrc_lock protects ill_usesrc_grp_next; without it, some
7105 * other thread may be trying to relink the ILLs in this usesrc group
7106 * and adjusting the ill_usesrc_grp_next pointers.
7107 */
7108 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER);
7109 if ((ill->ill_usesrc_ifindex == 0) &&
7110 (ill->ill_usesrc_grp_next != NULL)) {
7111 for (; (ill != NULL) && (ill->ill_usesrc_grp_next != ill_head);
7112 ill = ill->ill_usesrc_grp_next)
7113 numifs++;
7115 rw_exit(&ipst->ips_ill_g_usesrc_lock);
7117 return (numifs);
7120 /* Null values are passed in for ipif, sin, and ifreq */
7121 /* ARGSUSED */
7122 int
7123 ip_sioctl_get_ifnum(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
7124 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
7125 {
7126 int *nump;
7127 conn_t *connp = Q_TO_CONN(q);
7129 ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */
7131 /* Existence of b_cont->b_cont checked in ip_wput_nondata */
7132 nump = (int *)mp->b_cont->b_cont->b_rptr;
7134 *nump = ip_get_numifs(connp->conn_zoneid,
7135 connp->conn_netstack->netstack_ip);
7136 ip1dbg(("ip_sioctl_get_ifnum numifs %d", *nump));
7137 return (0);
7140 /* Null values are passed in for ipif, sin, and ifreq */
7141 /* ARGSUSED */
7142 int
7143 ip_sioctl_get_lifnum(ipif_t *dummy_ipif, sin_t *dummy_sin,
7144 queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
7145 {
7146 struct lifnum *lifn;
7147 mblk_t *mp1;
7148 conn_t *connp = Q_TO_CONN(q);
7150 ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */
7152 /* Existence checked in ip_wput_nondata */
7153 mp1 = mp->b_cont->b_cont;
7155 lifn = (struct lifnum *)mp1->b_rptr;
7156 switch (lifn->lifn_family) {
7157 case AF_UNSPEC:
7158 case AF_INET:
7159 case AF_INET6:
7160 break;
7161 default:
7162 return (EAFNOSUPPORT);
7165 lifn->lifn_count = ip_get_numlifs(lifn->lifn_family, lifn->lifn_flags,
7166 connp->conn_zoneid, connp->conn_netstack->netstack_ip);
7167 ip1dbg(("ip_sioctl_get_lifnum numifs %d", lifn->lifn_count));
7168 return (0);
7169 }
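/*
 * Illustrative sketch (not part of this file): applications typically
 * use SIOCGLIFNUM to size the buffer they hand to SIOCGLIFCONF:
 *
 *	struct lifnum lifn;
 *
 *	bzero(&lifn, sizeof (lifn));
 *	lifn.lifn_family = AF_UNSPEC;
 *	(void) ioctl(s, SIOCGLIFNUM, &lifn);
 *	buflen = lifn.lifn_count * sizeof (struct lifreq);
 *
 * The count can change between the two ioctls, so careful callers pad
 * the buffer or retry.
 */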
7171 /* ARGSUSED */
7172 int
7173 ip_sioctl_get_ifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
7174 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
7175 {
7176 STRUCT_HANDLE(ifconf, ifc);
7177 mblk_t *mp1;
7178 struct iocblk *iocp;
7179 struct ifreq *ifr;
7180 ill_walk_context_t ctx;
7181 ill_t *ill;
7182 ipif_t *ipif;
7183 struct sockaddr_in *sin;
7184 int32_t ifclen;
7185 zoneid_t zoneid;
7186 ip_stack_t *ipst = CONNQ_TO_IPST(q);
7188 ASSERT(q->q_next == NULL); /* not valid ioctls for ip as a module */
7190 ip1dbg(("ip_sioctl_get_ifconf"));
7191 /* Existence verified in ip_wput_nondata */
7192 mp1 = mp->b_cont->b_cont;
7193 iocp = (struct iocblk *)mp->b_rptr;
7194 zoneid = Q_TO_CONN(q)->conn_zoneid;
7197 * The original SIOCGIFCONF passed in a struct ifconf which specified
7198 * the user buffer address and length into which the list of struct
7199 * ifreqs was to be copied. Since AT&T Streams does not seem to
7200 * allow M_COPYOUT to be used in conjunction with I_STR IOCTLS,
7201 * the SIOCGIFCONF operation was redefined to simply provide
7202 * a large output buffer into which we are supposed to jam the ifreq
7203 * array. The same ioctl command code was used, despite the fact that
7204 * both the applications and the kernel code had to change, thus making
7205 * it impossible to support both interfaces.
7207 * For reasons not good enough to try to explain, the following
7208 * algorithm is used for deciding what to do with one of these:
7209 * If the IOCTL comes in as an I_STR, it is assumed to be of the new
7210 * form with the output buffer coming down as the continuation message.
7211 * If it arrives as a TRANSPARENT IOCTL, it is assumed to be old style,
7212 * and we have to copy in the ifconf structure to find out how big the
7213 * output buffer is and where to copy out to. Sure no problem...
7216 STRUCT_SET_HANDLE(ifc, iocp->ioc_flag, NULL);
7217 if ((mp1->b_wptr - mp1->b_rptr) == STRUCT_SIZE(ifc)) {
7218 int numifs = 0;
7219 size_t ifc_bufsize;
7222 * Must be (better be!) continuation of a TRANSPARENT
7223 * IOCTL. We just copied in the ifconf structure.
7225 STRUCT_SET_HANDLE(ifc, iocp->ioc_flag,
7226 (struct ifconf *)mp1->b_rptr);
7229 * Allocate a buffer to hold requested information.
7231 * If ifc_len is larger than what is needed, we only
7232 * allocate what we will use.
7234 * If ifc_len is smaller than what is needed, return
7235 * EINVAL.
7237 * XXX: the ill_t structure can have 2 counters, for
7238 * v4 and v6 (not just ill_ipif_up_count) to store the
7239 * number of interfaces for a device, so we don't need
7240 * to count them here...
7242 numifs = ip_get_numifs(zoneid, ipst);
7244 ifclen = STRUCT_FGET(ifc, ifc_len);
7245 ifc_bufsize = numifs * sizeof (struct ifreq);
7246 if (ifc_bufsize > ifclen) {
7247 if (iocp->ioc_cmd == O_SIOCGIFCONF) {
7248 /* old behaviour */
7249 return (EINVAL);
7250 } else {
7251 ifc_bufsize = ifclen;
7255 mp1 = mi_copyout_alloc(q, mp,
7256 STRUCT_FGETP(ifc, ifc_buf), ifc_bufsize, B_FALSE);
7257 if (mp1 == NULL)
7258 return (ENOMEM);
7260 mp1->b_wptr = mp1->b_rptr + ifc_bufsize;
7262 bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr);
7264 * the SIOCGIFCONF ioctl only knows about
7265 * IPv4 addresses, so don't try to tell
7266 * it about interfaces with IPv6-only
7267 * addresses. (Last parm 'isv6' is B_FALSE)
7270 ifr = (struct ifreq *)mp1->b_rptr;
7272 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
7273 ill = ILL_START_WALK_V4(&ctx, ipst);
7274 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
7275 if (IS_UNDER_IPMP(ill))
7276 continue;
7277 for (ipif = ill->ill_ipif; ipif != NULL;
7278 ipif = ipif->ipif_next) {
7279 if (zoneid != ipif->ipif_zoneid &&
7280 ipif->ipif_zoneid != ALL_ZONES)
7281 continue;
7282 if ((uchar_t *)&ifr[1] > mp1->b_wptr) {
7283 if (iocp->ioc_cmd == O_SIOCGIFCONF) {
7284 /* old behaviour */
7285 rw_exit(&ipst->ips_ill_g_lock);
7286 return (EINVAL);
7287 } else {
7288 goto if_copydone;
7291 ipif_get_name(ipif, ifr->ifr_name,
7292 sizeof (ifr->ifr_name));
7293 sin = (sin_t *)&ifr->ifr_addr;
7294 *sin = sin_null;
7295 sin->sin_family = AF_INET;
7296 sin->sin_addr.s_addr = ipif->ipif_lcl_addr;
7297 ifr++;
7300 if_copydone:
7301 rw_exit(&ipst->ips_ill_g_lock);
7302 mp1->b_wptr = (uchar_t *)ifr;
7304 if (STRUCT_BUF(ifc) != NULL) {
7305 STRUCT_FSET(ifc, ifc_len,
7306 (int)((uchar_t *)ifr - mp1->b_rptr));
7308 return (0);
7309 }
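/*
 * Illustrative sketch (not part of this file): the new-style user-level
 * counterpart of the above simply supplies an output buffer:
 *
 *	struct ifconf ifc;
 *	char buf[8192];		(example size)
 *
 *	ifc.ifc_len = sizeof (buf);
 *	ifc.ifc_buf = buf;
 *	(void) ioctl(s, SIOCGIFCONF, &ifc);
 *	nifs = ifc.ifc_len / sizeof (struct ifreq);
 */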
7310
7311 /*
7312 * Get the interfaces using the address hosted on the interface passed in,
7313 * as a source address.
7314 */
7315 /* ARGSUSED */
7316 int
7317 ip_sioctl_get_lifsrcof(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
7318 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
7319 {
7320 mblk_t *mp1;
7321 ill_t *ill, *ill_head;
7322 ipif_t *ipif, *orig_ipif;
7323 int numlifs = 0;
7324 size_t lifs_bufsize, lifsmaxlen;
7325 struct lifreq *lifr;
7326 struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
7327 uint_t ifindex;
7328 zoneid_t zoneid;
7329 boolean_t isv6 = B_FALSE;
7330 struct sockaddr_in *sin;
7331 struct sockaddr_in6 *sin6;
7332 STRUCT_HANDLE(lifsrcof, lifs);
7333 ip_stack_t *ipst;
7335 ipst = CONNQ_TO_IPST(q);
7337 ASSERT(q->q_next == NULL);
7339 zoneid = Q_TO_CONN(q)->conn_zoneid;
7341 /* Existence verified in ip_wput_nondata */
7342 mp1 = mp->b_cont->b_cont;
7345 * Must be (better be!) continuation of a TRANSPARENT
7346 * IOCTL. We just copied in the lifsrcof structure.
7348 STRUCT_SET_HANDLE(lifs, iocp->ioc_flag,
7349 (struct lifsrcof *)mp1->b_rptr);
7351 if (MBLKL(mp1) != STRUCT_SIZE(lifs))
7352 return (EINVAL);
7354 ifindex = STRUCT_FGET(lifs, lifs_ifindex);
7355 isv6 = (Q_TO_CONN(q))->conn_family == AF_INET6;
7356 ipif = ipif_lookup_on_ifindex(ifindex, isv6, zoneid, ipst);
7357 if (ipif == NULL) {
7358 ip1dbg(("ip_sioctl_get_lifsrcof: no ipif for ifindex %d\n",
7359 ifindex));
7360 return (ENXIO);
7363 /* Allocate a buffer to hold requested information */
7364 numlifs = ip_get_lifsrcofnum(ipif->ipif_ill);
7365 lifs_bufsize = numlifs * sizeof (struct lifreq);
7366 lifsmaxlen = STRUCT_FGET(lifs, lifs_maxlen);
7367 /* The actual size needed is always returned in lifs_len */
7368 STRUCT_FSET(lifs, lifs_len, lifs_bufsize);
7370 /* If the amount we need is more than what is passed in, abort */
7371 if (lifs_bufsize > lifsmaxlen || lifs_bufsize == 0) {
7372 ipif_refrele(ipif);
7373 return (0);
7376 mp1 = mi_copyout_alloc(q, mp,
7377 STRUCT_FGETP(lifs, lifs_buf), lifs_bufsize, B_FALSE);
7378 if (mp1 == NULL) {
7379 ipif_refrele(ipif);
7380 return (ENOMEM);
7383 mp1->b_wptr = mp1->b_rptr + lifs_bufsize;
7384 bzero(mp1->b_rptr, lifs_bufsize);
7386 lifr = (struct lifreq *)mp1->b_rptr;
7388 ill = ill_head = ipif->ipif_ill;
7389 orig_ipif = ipif;
7391 /* ill_g_usesrc_lock protects ill_usesrc_grp_next */
7392 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER);
7393 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
7395 ill = ill->ill_usesrc_grp_next; /* start from next ill */
7396 for (; (ill != NULL) && (ill != ill_head);
7397 ill = ill->ill_usesrc_grp_next) {
7399 if ((uchar_t *)&lifr[1] > mp1->b_wptr)
7400 break;
7402 ipif = ill->ill_ipif;
7403 ipif_get_name(ipif, lifr->lifr_name, sizeof (lifr->lifr_name));
7404 if (ipif->ipif_isv6) {
7405 sin6 = (sin6_t *)&lifr->lifr_addr;
7406 *sin6 = sin6_null;
7407 sin6->sin6_family = AF_INET6;
7408 sin6->sin6_addr = ipif->ipif_v6lcl_addr;
7409 lifr->lifr_addrlen = ip_mask_to_plen_v6(
7410 &ipif->ipif_v6net_mask);
7411 } else {
7412 sin = (sin_t *)&lifr->lifr_addr;
7413 *sin = sin_null;
7414 sin->sin_family = AF_INET;
7415 sin->sin_addr.s_addr = ipif->ipif_lcl_addr;
7416 lifr->lifr_addrlen = ip_mask_to_plen(
7417 ipif->ipif_net_mask);
7419 lifr++;
7421 rw_exit(&ipst->ips_ill_g_lock);
7422 rw_exit(&ipst->ips_ill_g_usesrc_lock);
7423 ipif_refrele(orig_ipif);
7424 mp1->b_wptr = (uchar_t *)lifr;
7425 STRUCT_FSET(lifs, lifs_len, (int)((uchar_t *)lifr - mp1->b_rptr));
7427 return (0);
7430 /* ARGSUSED */
7431 int
7432 ip_sioctl_get_lifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
7433 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
7434 {
7435 mblk_t *mp1;
7436 int list;
7437 ill_t *ill;
7438 ipif_t *ipif;
7439 int flags;
7440 int numlifs = 0;
7441 size_t lifc_bufsize;
7442 struct lifreq *lifr;
7443 sa_family_t family;
7444 struct sockaddr_in *sin;
7445 struct sockaddr_in6 *sin6;
7446 ill_walk_context_t ctx;
7447 struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
7448 int32_t lifclen;
7449 zoneid_t zoneid;
7450 STRUCT_HANDLE(lifconf, lifc);
7451 ip_stack_t *ipst = CONNQ_TO_IPST(q);
7453 ip1dbg(("ip_sioctl_get_lifconf"));
7455 ASSERT(q->q_next == NULL);
7457 zoneid = Q_TO_CONN(q)->conn_zoneid;
7459 /* Existence verified in ip_wput_nondata */
7460 mp1 = mp->b_cont->b_cont;
7463 * An extended version of SIOCGIFCONF that takes an
7464 * additional address family and flags field.
7465 * AF_UNSPEC retrieve both IPv4 and IPv6.
7466 * Unless LIFC_NOXMIT is specified the IPIF_NOXMIT
7467 * interfaces are omitted.
7468 * Similarly, IPIF_TEMPORARY interfaces are omitted
7469 * unless LIFC_TEMPORARY is specified.
7470 * If LIFC_EXTERNAL_SOURCE is specified, IPIF_NOXMIT,
7471 * IPIF_NOLOCAL, PHYI_LOOPBACK, IPIF_DEPRECATED and
7472 * not IPIF_UP interfaces are omitted. LIFC_EXTERNAL_SOURCE
7473 * has priority over LIFC_NOXMIT.
7475 STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, NULL);
7477 if ((mp1->b_wptr - mp1->b_rptr) != STRUCT_SIZE(lifc))
7478 return (EINVAL);
7481 * Must be (better be!) continuation of a TRANSPARENT
7482 * IOCTL. We just copied in the lifconf structure.
7484 STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, (struct lifconf *)mp1->b_rptr);
7486 family = STRUCT_FGET(lifc, lifc_family);
7487 flags = STRUCT_FGET(lifc, lifc_flags);
7489 switch (family) {
7490 case AF_UNSPEC:
7492 * walk all ILL's.
7494 list = MAX_G_HEADS;
7495 break;
7496 case AF_INET:
7498 * walk only IPV4 ILL's.
7500 list = IP_V4_G_HEAD;
7501 break;
7502 case AF_INET6:
7504 * walk only IPV6 ILL's.
7506 list = IP_V6_G_HEAD;
7507 break;
7508 default:
7509 return (EAFNOSUPPORT);
7513 * Allocate a buffer to hold requested information.
7515 * If lifc_len is larger than what is needed, we only
7516 * allocate what we will use.
7518 * If lifc_len is smaller than what is needed, return
7519 * EINVAL.
7521 numlifs = ip_get_numlifs(family, flags, zoneid, ipst);
7522 lifc_bufsize = numlifs * sizeof (struct lifreq);
7523 lifclen = STRUCT_FGET(lifc, lifc_len);
7524 if (lifc_bufsize > lifclen) {
7525 if (iocp->ioc_cmd == O_SIOCGLIFCONF)
7526 return (EINVAL);
7527 else
7528 lifc_bufsize = lifclen;
7531 mp1 = mi_copyout_alloc(q, mp,
7532 STRUCT_FGETP(lifc, lifc_buf), lifc_bufsize, B_FALSE);
7533 if (mp1 == NULL)
7534 return (ENOMEM);
7536 mp1->b_wptr = mp1->b_rptr + lifc_bufsize;
7537 bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr);
7539 lifr = (struct lifreq *)mp1->b_rptr;
7541 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
7542 ill = ill_first(list, list, &ctx, ipst);
7543 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
7544 if (IS_UNDER_IPMP(ill) && !(flags & LIFC_UNDER_IPMP))
7545 continue;
7547 for (ipif = ill->ill_ipif; ipif != NULL;
7548 ipif = ipif->ipif_next) {
7549 if ((ipif->ipif_flags & IPIF_NOXMIT) &&
7550 !(flags & LIFC_NOXMIT))
7551 continue;
7553 if ((ipif->ipif_flags & IPIF_TEMPORARY) &&
7554 !(flags & LIFC_TEMPORARY))
7555 continue;
7557 if (((ipif->ipif_flags &
7558 (IPIF_NOXMIT|IPIF_NOLOCAL|
7559 IPIF_DEPRECATED)) ||
7560 IS_LOOPBACK(ill) ||
7561 !(ipif->ipif_flags & IPIF_UP)) &&
7562 (flags & LIFC_EXTERNAL_SOURCE))
7563 continue;
7565 if (zoneid != ipif->ipif_zoneid &&
7566 ipif->ipif_zoneid != ALL_ZONES &&
7567 (zoneid != GLOBAL_ZONEID ||
7568 !(flags & LIFC_ALLZONES)))
7569 continue;
7571 if ((uchar_t *)&lifr[1] > mp1->b_wptr) {
7572 if (iocp->ioc_cmd == O_SIOCGLIFCONF) {
7573 rw_exit(&ipst->ips_ill_g_lock);
7574 return (EINVAL);
7575 } else {
7576 goto lif_copydone;
7580 ipif_get_name(ipif, lifr->lifr_name,
7581 sizeof (lifr->lifr_name));
7582 lifr->lifr_type = ill->ill_type;
7583 if (ipif->ipif_isv6) {
7584 sin6 = (sin6_t *)&lifr->lifr_addr;
7585 *sin6 = sin6_null;
7586 sin6->sin6_family = AF_INET6;
7587 sin6->sin6_addr =
7588 ipif->ipif_v6lcl_addr;
7589 lifr->lifr_addrlen =
7590 ip_mask_to_plen_v6(
7591 &ipif->ipif_v6net_mask);
7592 } else {
7593 sin = (sin_t *)&lifr->lifr_addr;
7594 *sin = sin_null;
7595 sin->sin_family = AF_INET;
7596 sin->sin_addr.s_addr =
7597 ipif->ipif_lcl_addr;
7598 lifr->lifr_addrlen =
7599 ip_mask_to_plen(
7600 ipif->ipif_net_mask);
7602 lifr++;
7605 lif_copydone:
7606 rw_exit(&ipst->ips_ill_g_lock);
7608 mp1->b_wptr = (uchar_t *)lifr;
7609 if (STRUCT_BUF(lifc) != NULL) {
7610 STRUCT_FSET(lifc, lifc_len,
7611 (int)((uchar_t *)lifr - mp1->b_rptr));
7613 return (0);
7616 static void
7617 ip_sioctl_ip6addrpolicy(queue_t *q, mblk_t *mp)
7619 ip6_asp_t *table;
7620 size_t table_size;
7621 mblk_t *data_mp;
7622 struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
7623 ip_stack_t *ipst;
7625 if (q->q_next == NULL)
7626 ipst = CONNQ_TO_IPST(q);
7627 else
7628 ipst = ILLQ_TO_IPST(q);
7630 /* These two ioctls are I_STR only */
7631 if (iocp->ioc_count == TRANSPARENT) {
7632 miocnak(q, mp, 0, EINVAL);
7633 return;
7636 data_mp = mp->b_cont;
7637 if (data_mp == NULL) {
7638 /* The user passed us a NULL argument */
7639 table = NULL;
7640 table_size = iocp->ioc_count;
7641 } else {
7643 * The user provided a table. The stream head
7644 * may have copied in the user data in chunks,
7645 * so make sure everything is pulled up
7646 * properly.
7648 if (MBLKL(data_mp) < iocp->ioc_count) {
7649 mblk_t *new_data_mp;
7650 if ((new_data_mp = msgpullup(data_mp, -1)) ==
7651 NULL) {
7652 miocnak(q, mp, 0, ENOMEM);
7653 return;
7655 freemsg(data_mp);
7656 data_mp = new_data_mp;
7657 mp->b_cont = data_mp;
7659 table = (ip6_asp_t *)data_mp->b_rptr;
7660 table_size = iocp->ioc_count;
7663 switch (iocp->ioc_cmd) {
7664 case SIOCGIP6ADDRPOLICY:
7665 iocp->ioc_rval = ip6_asp_get(table, table_size, ipst);
7666 if (iocp->ioc_rval == -1)
7667 iocp->ioc_error = EINVAL;
7668 #if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4
7669 else if (table != NULL &&
7670 (iocp->ioc_flag & IOC_MODELS) == IOC_ILP32) {
7671 ip6_asp_t *src = table;
7672 ip6_asp32_t *dst = (void *)table;
7673 int count = table_size / sizeof (ip6_asp_t);
7674 int i;
7677 * We need to do an in-place shrink of the array
7678 * to match the alignment attributes of the
7679 * 32-bit ABI looking at it.
7681 /* LINTED: logical expression always true: op "||" */
7682 ASSERT(sizeof (*src) > sizeof (*dst));
7683 for (i = 1; i < count; i++)
7684 bcopy(src + i, dst + i, sizeof (*dst));
7686 #endif
7687 break;
7689 case SIOCSIP6ADDRPOLICY:
7690 ASSERT(mp->b_prev == NULL);
7691 mp->b_prev = (void *)q;
7692 #if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4
7694 * We pass in the datamodel here so that the ip6_asp_replace()
7695 * routine can handle converting from 32-bit to native formats
7696 * where necessary.
7698 * A better way to handle this might be to convert the inbound
7699 * data structure here, and hang it off a new 'mp'; thus the
7700 * ip6_asp_replace() logic would always be dealing with native
7701 * format data structures.
7703 * (An even simpler way to handle these ioctls is to just
7704 * add a 32-bit trailing 'pad' field to the ip6_asp_t structure
7705 * and just recompile everything that depends on it.)
7707 #endif
7708 ip6_asp_replace(mp, table, table_size, B_FALSE, ipst,
7709 iocp->ioc_flag & IOC_MODELS);
7710 return;
7713 DB_TYPE(mp) = (iocp->ioc_error == 0) ? M_IOCACK : M_IOCNAK;
7714 qreply(q, mp);
7717 static void
7718 ip_sioctl_dstinfo(queue_t *q, mblk_t *mp)
7720 mblk_t *data_mp;
7721 struct dstinforeq *dir;
7722 uint8_t *end, *cur;
7723 in6_addr_t *daddr, *saddr;
7724 ipaddr_t v4daddr;
7725 ire_t *ire;
7726 ipaddr_t v4setsrc;
7727 in6_addr_t v6setsrc;
7728 char *slabel, *dlabel;
7729 boolean_t isipv4;
7730 int match_ire;
7731 ill_t *dst_ill;
7732 struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
7733 conn_t *connp = Q_TO_CONN(q);
7734 zoneid_t zoneid = IPCL_ZONEID(connp);
7735 ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
7736 uint64_t ipif_flags;
7738 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */
7741 * This ioctl is I_STR only, and must have a
7742 * data mblk following the M_IOCTL mblk.
7744 data_mp = mp->b_cont;
7745 if (iocp->ioc_count == TRANSPARENT || data_mp == NULL) {
7746 miocnak(q, mp, 0, EINVAL);
7747 return;
7750 if (MBLKL(data_mp) < iocp->ioc_count) {
7751 mblk_t *new_data_mp;
7753 if ((new_data_mp = msgpullup(data_mp, -1)) == NULL) {
7754 miocnak(q, mp, 0, ENOMEM);
7755 return;
7757 freemsg(data_mp);
7758 data_mp = new_data_mp;
7759 mp->b_cont = data_mp;
7761 match_ire = MATCH_IRE_DSTONLY;
7763 for (cur = data_mp->b_rptr, end = data_mp->b_wptr;
7764 end - cur >= sizeof (struct dstinforeq);
7765 cur += sizeof (struct dstinforeq)) {
7766 dir = (struct dstinforeq *)cur;
7767 daddr = &dir->dir_daddr;
7768 saddr = &dir->dir_saddr;
7771 * ip_addr_scope_v6() and ip6_asp_lookup() handle
7772 * v4 mapped addresses; ire_ftable_lookup_v6()
7773 * and ip_select_source_v6() do not.
7775 dir->dir_dscope = ip_addr_scope_v6(daddr);
7776 dlabel = ip6_asp_lookup(daddr, &dir->dir_precedence, ipst);
7778 isipv4 = IN6_IS_ADDR_V4MAPPED(daddr);
7779 if (isipv4) {
7780 IN6_V4MAPPED_TO_IPADDR(daddr, v4daddr);
7781 v4setsrc = INADDR_ANY;
7782 ire = ire_route_recursive_v4(v4daddr, 0, NULL, zoneid,
7783 match_ire, IRR_ALLOCATE, 0, ipst, &v4setsrc, NULL);
7784 } else {
7785 v6setsrc = ipv6_all_zeros;
7786 ire = ire_route_recursive_v6(daddr, 0, NULL, zoneid,
7787 match_ire, IRR_ALLOCATE, 0, ipst, &v6setsrc, NULL);
7789 ASSERT(ire != NULL);
7790 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
7791 ire_refrele(ire);
7792 dir->dir_dreachable = 0;
7794 /* move on to next dst addr */
7795 continue;
7797 dir->dir_dreachable = 1;
7799 dst_ill = ire_nexthop_ill(ire);
7800 if (dst_ill == NULL) {
7801 ire_refrele(ire);
7802 continue;
7805 /* With ipmp we most likely look at the ipmp ill here */
7806 dir->dir_dmactype = dst_ill->ill_mactype;
7808 if (isipv4) {
7809 ipaddr_t v4saddr;
7811 if (ip_select_source_v4(dst_ill, v4setsrc, v4daddr,
7812 connp->conn_ixa->ixa_multicast_ifaddr, zoneid, ipst,
7813 &v4saddr, NULL, &ipif_flags) != 0) {
7814 v4saddr = INADDR_ANY;
7815 ipif_flags = 0;
7817 IN6_IPADDR_TO_V4MAPPED(v4saddr, saddr);
7818 } else {
7819 if (ip_select_source_v6(dst_ill, &v6setsrc, daddr,
7820 zoneid, ipst, B_FALSE, IPV6_PREFER_SRC_DEFAULT,
7821 saddr, NULL, &ipif_flags) != 0) {
7822 *saddr = ipv6_all_zeros;
7823 ipif_flags = 0;
7827 dir->dir_sscope = ip_addr_scope_v6(saddr);
7828 slabel = ip6_asp_lookup(saddr, NULL, ipst);
7829 dir->dir_labelmatch = ip6_asp_labelcmp(dlabel, slabel);
7830 dir->dir_sdeprecated = (ipif_flags & IPIF_DEPRECATED) ? 1 : 0;
7831 ire_refrele(ire);
7832 ill_refrele(dst_ill);
7834 miocack(q, mp, iocp->ioc_count, 0);
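/*
 * Illustrative sketch (not part of the original source): how a userland
 * caller, e.g. a getaddrinfo() implementation doing RFC 3484 sorting,
 * might drive the SIOCGDSTINFO handler above.  The socket `s', the
 * candidate destination `dst' and the use of <sys/sockio.h> for struct
 * dstinforeq are assumptions; error handling is omitted.
 *
 *	#include <stropts.h>
 *	#include <sys/sockio.h>
 *	#include <netinet/in.h>
 *	#include <strings.h>
 *
 *	struct dstinforeq dir;
 *	struct strioctl str;
 *
 *	bzero(&dir, sizeof (dir));
 *	dir.dir_daddr = dst;			// in6_addr_t to evaluate
 *	str.ic_cmd = SIOCGDSTINFO;
 *	str.ic_timout = -1;
 *	str.ic_len = sizeof (dir);
 *	str.ic_dp = (char *)&dir;
 *	if (ioctl(s, I_STR, &str) == 0 && dir.dir_dreachable)
 *		;	// dir.dir_saddr holds the kernel-selected source
 */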
7835 }
7837 /*
7838 * Check if this is an address assigned to this machine.
7839 * Skips interfaces that are down by using ire checks.
7840 * Translates mapped addresses to v4 addresses and then
7841 * treats them as such, returning true if the v4 address
7842 * associated with this mapped address is configured.
7843 * Note: Applications will have to be careful what they do
7844 * with the response; use of mapped addresses limits
7845 * what can be done with the socket, especially with
7846 * respect to socket options and ioctls - neither IPv4
7847 * options nor IPv6 sticky options/ancillary data options
7848 * may be used.
7849 */
7850 /* ARGSUSED */
7851 int
7852 ip_sioctl_tmyaddr(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
7853 ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
7855 struct sioc_addrreq *sia;
7856 sin_t *sin;
7857 ire_t *ire;
7858 mblk_t *mp1;
7859 zoneid_t zoneid;
7860 ip_stack_t *ipst;
7862 ip1dbg(("ip_sioctl_tmyaddr"));
7864 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */
7865 zoneid = Q_TO_CONN(q)->conn_zoneid;
7866 ipst = CONNQ_TO_IPST(q);
7868 /* Existence verified in ip_wput_nondata */
7869 mp1 = mp->b_cont->b_cont;
7870 sia = (struct sioc_addrreq *)mp1->b_rptr;
7871 sin = (sin_t *)&sia->sa_addr;
7872 switch (sin->sin_family) {
7873 case AF_INET6: {
7874 sin6_t *sin6 = (sin6_t *)sin;
7876 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
7877 ipaddr_t v4_addr;
7879 IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr,
7880 v4_addr);
7881 ire = ire_ftable_lookup_v4(v4_addr, 0, 0,
7882 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid,
7883 MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, 0, ipst, NULL);
7884 } else {
7885 in6_addr_t v6addr;
7887 v6addr = sin6->sin6_addr;
7888 ire = ire_ftable_lookup_v6(&v6addr, 0, 0,
7889 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid,
7890 MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, 0, ipst, NULL);
7892 break;
7894 case AF_INET: {
7895 ipaddr_t v4addr;
7897 v4addr = sin->sin_addr.s_addr;
7898 ire = ire_ftable_lookup_v4(v4addr, 0, 0,
7899 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid,
7900 MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, 0, ipst, NULL);
7901 break;
7903 default:
7904 return (EAFNOSUPPORT);
7906 if (ire != NULL) {
7907 sia->sa_res = 1;
7908 ire_refrele(ire);
7909 } else {
7910 sia->sa_res = 0;
7912 return (0);
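/*
 * Illustrative sketch (not part of the original source): issuing
 * SIOCTMYADDR from userland with the struct sioc_addrreq parsed above;
 * SIOCTONLINK below is driven the same way.  The socket `s' is assumed.
 *
 *	#include <sys/sockio.h>
 *	#include <netinet/in.h>
 *	#include <arpa/inet.h>
 *	#include <strings.h>
 *
 *	struct sioc_addrreq sar;
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&sar.sa_addr;
 *
 *	bzero(&sar, sizeof (sar));
 *	sin->sin_family = AF_INET;
 *	(void) inet_pton(AF_INET, "192.0.2.1", &sin->sin_addr);
 *	if (ioctl(s, SIOCTMYADDR, &sar) == 0 && sar.sa_res != 0)
 *		;	// 192.0.2.1 is configured on this machine
 */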
7913 }
7915 /*
7916 * Check if this is an on-link (i.e., neighbor) address and make
7917 * sure it's reachable from the current zone.
7918 * Returns true for my addresses as well.
7919 * Translates mapped addresses to v4 addresses and then
7920 * treats them as such, returning true if the v4 address
7921 * associated with this mapped address is configured.
7922 * Note: Applications will have to be careful what they do
7923 * with the response; use of mapped addresses limits
7924 * what can be done with the socket, especially with
7925 * respect to socket options and ioctls - neither IPv4
7926 * options nor IPv6 sticky options/ancillary data options
7927 * may be used.
7928 */
7929 /* ARGSUSED */
7930 int
7931 ip_sioctl_tonlink(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
7932 ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
7934 struct sioc_addrreq *sia;
7935 sin_t *sin;
7936 mblk_t *mp1;
7937 ire_t *ire = NULL;
7938 zoneid_t zoneid;
7939 ip_stack_t *ipst;
7941 ip1dbg(("ip_sioctl_tonlink"));
7943 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */
7944 zoneid = Q_TO_CONN(q)->conn_zoneid;
7945 ipst = CONNQ_TO_IPST(q);
7947 /* Existence verified in ip_wput_nondata */
7948 mp1 = mp->b_cont->b_cont;
7949 sia = (struct sioc_addrreq *)mp1->b_rptr;
7950 sin = (sin_t *)&sia->sa_addr;
7952 /*
7953 * We check for IRE_ONLINK and exclude IRE_BROADCAST|IRE_MULTICAST
7954 * to make sure we only look at on-link unicast addresses.
7955 */
7956 switch (sin->sin_family) {
7957 case AF_INET6: {
7958 sin6_t *sin6 = (sin6_t *)sin;
7960 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
7961 ipaddr_t v4_addr;
7963 IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr,
7964 v4_addr);
7965 if (!CLASSD(v4_addr)) {
7966 ire = ire_ftable_lookup_v4(v4_addr, 0, 0, 0,
7967 NULL, zoneid, MATCH_IRE_DSTONLY, 0, ipst,
7968 NULL);
7970 } else {
7971 in6_addr_t v6addr;
7973 v6addr = sin6->sin6_addr;
7974 if (!IN6_IS_ADDR_MULTICAST(&v6addr)) {
7975 ire = ire_ftable_lookup_v6(&v6addr, 0, 0, 0,
7976 NULL, zoneid, MATCH_IRE_DSTONLY, 0, ipst,
7977 NULL);
7980 break;
7982 case AF_INET: {
7983 ipaddr_t v4addr;
7985 v4addr = sin->sin_addr.s_addr;
7986 if (!CLASSD(v4addr)) {
7987 ire = ire_ftable_lookup_v4(v4addr, 0, 0, 0, NULL,
7988 zoneid, MATCH_IRE_DSTONLY, 0, ipst, NULL);
7990 break;
7992 default:
7993 return (EAFNOSUPPORT);
7995 sia->sa_res = 0;
7996 if (ire != NULL) {
7997 ASSERT(!(ire->ire_type & IRE_MULTICAST));
7999 if ((ire->ire_type & IRE_ONLINK) &&
8000 !(ire->ire_type & IRE_BROADCAST))
8001 sia->sa_res = 1;
8002 ire_refrele(ire);
8004 return (0);
8005 }
8007 /*
8008 * TBD: implement when the kernel maintains a list of site prefixes.
8009 */
8010 /* ARGSUSED */
8011 int
8012 ip_sioctl_tmysite(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
8013 ip_ioctl_cmd_t *ipip, void *ifreq)
8014 {
8015 return (ENXIO);
8016 }
8018 /* ARP IOCTLs. */
8019 /* ARGSUSED */
8020 int
8021 ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
8022 ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
8024 int err;
8025 ipaddr_t ipaddr;
8026 struct iocblk *iocp;
8027 conn_t *connp;
8028 struct arpreq *ar;
8029 struct xarpreq *xar;
8030 int arp_flags, flags, alength;
8031 uchar_t *lladdr;
8032 ip_stack_t *ipst;
8033 ill_t *ill = ipif->ipif_ill;
8034 ill_t *proxy_ill = NULL;
8035 ipmp_arpent_t *entp = NULL;
8036 boolean_t proxyarp = B_FALSE;
8037 boolean_t if_arp_ioctl = B_FALSE;
8038 ncec_t *ncec = NULL;
8039 nce_t *nce;
8041 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL);
8042 connp = Q_TO_CONN(q);
8043 ipst = connp->conn_netstack->netstack_ip;
8044 iocp = (struct iocblk *)mp->b_rptr;
8046 if (ipip->ipi_cmd_type == XARP_CMD) {
8047 /* We have a chain - M_IOCTL-->MI_COPY_MBLK-->XARPREQ_MBLK */
8048 xar = (struct xarpreq *)mp->b_cont->b_cont->b_rptr;
8049 ar = NULL;
8051 arp_flags = xar->xarp_flags;
8052 lladdr = (uchar_t *)LLADDR(&xar->xarp_ha);
8053 if_arp_ioctl = (xar->xarp_ha.sdl_nlen != 0);
8054 /*
8055 * Validate against user's link layer address length
8056 * input and name and addr length limits.
8057 */
8058 alength = ill->ill_phys_addr_length;
8059 if (ipip->ipi_cmd == SIOCSXARP) {
8060 if (alength != xar->xarp_ha.sdl_alen ||
8061 (alength + xar->xarp_ha.sdl_nlen >
8062 sizeof (xar->xarp_ha.sdl_data)))
8063 return (EINVAL);
8065 } else {
8066 /* We have a chain - M_IOCTL-->MI_COPY_MBLK-->ARPREQ_MBLK */
8067 ar = (struct arpreq *)mp->b_cont->b_cont->b_rptr;
8068 xar = NULL;
8070 arp_flags = ar->arp_flags;
8071 lladdr = (uchar_t *)ar->arp_ha.sa_data;
8072 /*
8073 * Theoretically, the sa_family could tell us what link
8074 * layer type this operation is trying to deal with. By
8075 * common usage AF_UNSPEC means ethernet. We'll assume
8076 * any attempt to use the SIOC?ARP ioctls is for ethernet,
8077 * for now. Our new SIOC*XARP ioctls can be used more
8078 * generally.
8079 *
8080 * If the underlying media happens to have a non-6-byte
8081 * address, the arp module will fail set/get, but the del
8082 * operation will succeed.
8083 */
8084 alength = 6;
8085 if ((ipip->ipi_cmd != SIOCDARP) &&
8086 (alength != ill->ill_phys_addr_length)) {
8087 return (EINVAL);
8091 /* Translate ATF* flags to NCE* flags */
8092 flags = 0;
8093 if (arp_flags & ATF_AUTHORITY)
8094 flags |= NCE_F_AUTHORITY;
8095 if (arp_flags & ATF_PERM)
8096 flags |= NCE_F_NONUD; /* not subject to aging */
8097 if (arp_flags & ATF_PUBL)
8098 flags |= NCE_F_PUBLISH;
8100 /*
8101 * IPMP ARP special handling:
8102 *
8103 * 1. Since ARP mappings must appear consistent across the group,
8104 * prohibit changing ARP mappings on the underlying interfaces.
8105 *
8106 * 2. Since ARP mappings for IPMP data addresses are maintained by
8107 * IP itself, prohibit changing them.
8108 *
8109 * 3. For proxy ARP, use a functioning hardware address in the group,
8110 * provided one exists. If one doesn't, just add the entry as-is;
8111 * ipmp_illgrp_refresh_arpent() will refresh it if things change.
8112 */
8113 if (IS_UNDER_IPMP(ill)) {
8114 if (ipip->ipi_cmd != SIOCGARP && ipip->ipi_cmd != SIOCGXARP)
8115 return (EPERM);
8117 if (IS_IPMP(ill)) {
8118 ipmp_illgrp_t *illg = ill->ill_grp;
8120 switch (ipip->ipi_cmd) {
8121 case SIOCSARP:
8122 case SIOCSXARP:
8123 proxy_ill = ipmp_illgrp_find_ill(illg, lladdr, alength);
8124 if (proxy_ill != NULL) {
8125 proxyarp = B_TRUE;
8126 if (!ipmp_ill_is_active(proxy_ill))
8127 proxy_ill = ipmp_illgrp_next_ill(illg);
8128 if (proxy_ill != NULL)
8129 lladdr = proxy_ill->ill_phys_addr;
8131 /* FALLTHRU */
8132 }
8133 }
8135 ipaddr = sin->sin_addr.s_addr;
8136 /*
8137 * don't match across illgrp per case (1) and (2).
8138 * XXX use IS_IPMP(ill) like ndp_sioc_update?
8139 */
8140 nce = nce_lookup_v4(ill, &ipaddr);
8141 if (nce != NULL)
8142 ncec = nce->nce_common;
8144 switch (iocp->ioc_cmd) {
8145 case SIOCDARP:
8146 case SIOCDXARP: {
8147 /*
8148 * Delete the NCE if any.
8149 */
8150 if (ncec == NULL) {
8151 iocp->ioc_error = ENXIO;
8152 break;
8154 /* Don't allow changes to arp mappings of local addresses. */
8155 if (NCE_MYADDR(ncec)) {
8156 nce_refrele(nce);
8157 return (ENOTSUP);
8159 iocp->ioc_error = 0;
8161 /*
8162 * Delete the nce_common which has ncec_ill set to ipmp_ill.
8163 * This will delete all the nce entries on the under_ills.
8164 */
8165 ncec_delete(ncec);
8166 /*
8167 * Once the NCE has been deleted, then the ire_dep* consistency
8168 * mechanism will find any IRE which depended on the now
8169 * condemned NCE (as part of sending packets).
8170 * That mechanism handles redirects by deleting redirects
8171 * that refer to UNREACHABLE nces.
8172 */
8173 break;
8175 case SIOCGARP:
8176 case SIOCGXARP:
8177 if (ncec != NULL) {
8178 lladdr = ncec->ncec_lladdr;
8179 flags = ncec->ncec_flags;
8180 iocp->ioc_error = 0;
8181 ip_sioctl_garp_reply(mp, ncec->ncec_ill, lladdr, flags);
8182 } else {
8183 iocp->ioc_error = ENXIO;
8185 break;
8186 case SIOCSARP:
8187 case SIOCSXARP:
8188 /* Don't allow changes to arp mappings of local addresses. */
8189 if (ncec != NULL && NCE_MYADDR(ncec)) {
8190 nce_refrele(nce);
8191 return (ENOTSUP);
8194 /* static arp entries will undergo NUD if ATF_PERM is not set */
8195 flags |= NCE_F_STATIC;
8196 if (!if_arp_ioctl) {
8197 ip_nce_lookup_and_update(&ipaddr, NULL, ipst,
8198 lladdr, alength, flags);
8199 } else {
8200 ipif_t *ipif = ipif_get_next_ipif(NULL, ill);
8201 if (ipif != NULL) {
8202 ip_nce_lookup_and_update(&ipaddr, ipif, ipst,
8203 lladdr, alength, flags);
8204 ipif_refrele(ipif);
8207 if (nce != NULL) {
8208 nce_refrele(nce);
8209 nce = NULL;
8210 }
8211 /*
8212 * NCE_F_STATIC entries will be added in state ND_REACHABLE
8213 * by nce_add_common()
8214 */
8215 err = nce_lookup_then_add_v4(ill, lladdr,
8216 ill->ill_phys_addr_length, &ipaddr, flags, ND_UNCHANGED,
8217 &nce);
8218 if (err == EEXIST) {
8219 ncec = nce->nce_common;
8220 mutex_enter(&ncec->ncec_lock);
8221 ncec->ncec_state = ND_REACHABLE;
8222 ncec->ncec_flags = flags;
8223 nce_update(ncec, ND_UNCHANGED, lladdr);
8224 mutex_exit(&ncec->ncec_lock);
8225 err = 0;
8227 if (nce != NULL) {
8228 nce_refrele(nce);
8229 nce = NULL;
8231 if (IS_IPMP(ill) && err == 0) {
8232 entp = ipmp_illgrp_create_arpent(ill->ill_grp,
8233 proxyarp, ipaddr, lladdr, ill->ill_phys_addr_length,
8234 flags);
8235 if (entp == NULL || (proxyarp && proxy_ill == NULL)) {
8236 iocp->ioc_error = (entp == NULL ? ENOMEM : 0);
8237 break;
8240 iocp->ioc_error = err;
8243 if (nce != NULL) {
8244 nce_refrele(nce);
8245 }
8247 /*
8248 * If we created an IPMP ARP entry, mark that we've notified ARP.
8249 */
8250 if (entp != NULL)
8251 ipmp_illgrp_mark_arpent(ill->ill_grp, entp);
8253 return (iocp->ioc_error);
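/*
 * Illustrative sketch (not part of the original source): adding a
 * permanent, published (proxy) ARP entry from userland, the request the
 * SIOCSARP arm above services.  `s' is an AF_INET socket and `mac' a
 * 6-byte ethernet address; error handling is omitted.
 *
 *	#include <sys/sockio.h>
 *	#include <net/if_arp.h>
 *	#include <netinet/in.h>
 *	#include <arpa/inet.h>
 *	#include <strings.h>
 *
 *	struct arpreq ar;
 *	struct sockaddr_in *pa = (struct sockaddr_in *)&ar.arp_pa;
 *
 *	bzero(&ar, sizeof (ar));
 *	pa->sin_family = AF_INET;
 *	(void) inet_pton(AF_INET, "192.0.2.10", &pa->sin_addr);
 *	ar.arp_ha.sa_family = AF_UNSPEC;	// 6-byte ethernet assumed
 *	bcopy(mac, ar.arp_ha.sa_data, 6);
 *	ar.arp_flags = ATF_PERM | ATF_PUBL;	// NCE_F_NONUD|NCE_F_PUBLISH
 *	(void) ioctl(s, SIOCSARP, &ar);
 */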
8254 }
8256 /*
8257 * Parse an [x]arpreq structure coming down SIOC[GSD][X]ARP ioctls, identify
8258 * the associated sin and refhold and return the associated ipif via `ci'.
8259 */
8260 static int
8261 ip_extract_arpreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip,
8262 cmd_info_t *ci)
8263 {
8264 mblk_t *mp1;
8265 sin_t *sin;
8266 conn_t *connp;
8267 ipif_t *ipif;
8268 ire_t *ire = NULL;
8269 ill_t *ill = NULL;
8270 boolean_t exists;
8271 ip_stack_t *ipst;
8272 struct arpreq *ar;
8273 struct xarpreq *xar;
8274 struct sockaddr_dl *sdl;
8276 /* ioctl comes down on a conn */
8277 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL);
8278 connp = Q_TO_CONN(q);
8279 if (connp->conn_family == AF_INET6)
8280 return (ENXIO);
8282 ipst = connp->conn_netstack->netstack_ip;
8284 /* Verified in ip_wput_nondata */
8285 mp1 = mp->b_cont->b_cont;
8287 if (ipip->ipi_cmd_type == XARP_CMD) {
8288 ASSERT(MBLKL(mp1) >= sizeof (struct xarpreq));
8289 xar = (struct xarpreq *)mp1->b_rptr;
8290 sin = (sin_t *)&xar->xarp_pa;
8291 sdl = &xar->xarp_ha;
8293 if (sdl->sdl_family != AF_LINK || sin->sin_family != AF_INET)
8294 return (ENXIO);
8295 if (sdl->sdl_nlen >= LIFNAMSIZ)
8296 return (EINVAL);
8297 } else {
8298 ASSERT(ipip->ipi_cmd_type == ARP_CMD);
8299 ASSERT(MBLKL(mp1) >= sizeof (struct arpreq));
8300 ar = (struct arpreq *)mp1->b_rptr;
8301 sin = (sin_t *)&ar->arp_pa;
8304 if (ipip->ipi_cmd_type == XARP_CMD && sdl->sdl_nlen != 0) {
8305 ipif = ipif_lookup_on_name(sdl->sdl_data, sdl->sdl_nlen,
8306 B_FALSE, &exists, B_FALSE, ALL_ZONES, ipst);
8307 if (ipif == NULL)
8308 return (ENXIO);
8309 if (ipif->ipif_id != 0) {
8310 ipif_refrele(ipif);
8311 return (ENXIO);
8313 } else {
8314 /*
8315 * Either an SIOC[DGS]ARP or an SIOC[DGS]XARP with an sdl_nlen
8316 * of 0: use the IP address to find the ipif. If the IP
8317 * address is an IPMP test address, ire_ftable_lookup() will
8318 * find the wrong ill, so we first do an ipif_lookup_addr().
8319 */
8320 ipif = ipif_lookup_addr(sin->sin_addr.s_addr, NULL, ALL_ZONES,
8321 ipst);
8322 if (ipif == NULL) {
8323 ire = ire_ftable_lookup_v4(sin->sin_addr.s_addr,
8324 0, 0, IRE_IF_RESOLVER, NULL, ALL_ZONES,
8325 MATCH_IRE_TYPE, 0, ipst, NULL);
8326 if (ire == NULL || ((ill = ire->ire_ill) == NULL)) {
8327 if (ire != NULL)
8328 ire_refrele(ire);
8329 return (ENXIO);
8331 ASSERT(ire != NULL && ill != NULL);
8332 ipif = ill->ill_ipif;
8333 ipif_refhold(ipif);
8334 ire_refrele(ire);
8338 if (ipif->ipif_ill->ill_net_type != IRE_IF_RESOLVER) {
8339 ipif_refrele(ipif);
8340 return (ENXIO);
8343 ci->ci_sin = sin;
8344 ci->ci_ipif = ipif;
8345 return (0);
8346 }
8348 /*
8349 * Link or unlink the illgrp on IPMP meta-interface `ill' depending on the
8350 * value of `ioccmd'. While an illgrp is linked to an ipmp_grp_t, it is
8351 * accessible from that ipmp_grp_t, which means SIOCSLIFGROUPNAME can look it
8352 * up and thus an ill can join that illgrp.
8353 *
8354 * We use I_PLINK/I_PUNLINK to do the link/unlink operations rather than
8355 * open()/close() primarily because close() is not allowed to fail or block
8356 * forever. On the other hand, I_PUNLINK *can* fail, and there's no reason
8357 * why anyone should ever need to I_PUNLINK an in-use IPMP stream. To ensure
8358 * symmetric behavior (e.g., doing an I_PLINK after an I_PUNLINK undoes the
8359 * I_PUNLINK) we defer linking to I_PLINK. Separately, we also fail attempts
8360 * to I_LINK since I_UNLINK is optional and we'd end up in an inconsistent
8361 * state if I_UNLINK didn't occur.
8362 *
8363 * Note that for each plumb/unplumb operation, we may end up here more than
8364 * once because of the way ifconfig works. However, it's OK to link the same
8365 * illgrp more than once, or unlink an illgrp that's already unlinked.
8366 */
8367 static int
8368 ip_sioctl_plink_ipmp(ill_t *ill, int ioccmd)
8370 int err;
8371 ip_stack_t *ipst = ill->ill_ipst;
8373 ASSERT(IS_IPMP(ill));
8374 ASSERT(IAM_WRITER_ILL(ill));
8376 switch (ioccmd) {
8377 case I_LINK:
8378 return (ENOTSUP);
8380 case I_PLINK:
8381 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
8382 ipmp_illgrp_link_grp(ill->ill_grp, ill->ill_phyint->phyint_grp);
8383 rw_exit(&ipst->ips_ipmp_lock);
8384 break;
8386 case I_PUNLINK:
8387 /*
8388 * Require all UP ipifs be brought down prior to unlinking the
8389 * illgrp so any associated IREs (and other state) are torched.
8390 */
8391 if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0)
8392 return (EBUSY);
8394 /*
8395 * NOTE: We hold ipmp_lock across the unlink to prevent a race
8396 * with an SIOCSLIFGROUPNAME request from an ill trying to
8397 * join this group. Specifically: ills trying to join grab
8398 * ipmp_lock and bump a "pending join" counter checked by
8399 * ipmp_illgrp_unlink_grp(). During the unlink no new pending
8400 * joins can occur (since we have ipmp_lock). Once we drop
8401 * ipmp_lock, subsequent SIOCSLIFGROUPNAME requests will not
8402 * find the illgrp (since we unlinked it) and will return
8403 * EAFNOSUPPORT. This will then take them back through the
8404 * IPMP meta-interface plumbing logic in ifconfig, and thus
8405 * back through I_PLINK above.
8406 */
8407 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
8408 err = ipmp_illgrp_unlink_grp(ill->ill_grp);
8409 rw_exit(&ipst->ips_ipmp_lock);
8410 return (err);
8411 default:
8412 break;
8413 }
8414 return (0);
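/*
 * Illustrative sketch (not part of the original source): the userland
 * side of a persistent STREAMS link, the operation the function above
 * reacts to.  The device paths are placeholders; ifconfig(8) performs
 * the real sequence when plumbing interfaces.
 *
 *	#include <stropts.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int mux = open("/dev/udp", O_RDWR);	// multiplexor (upper stream)
 *	int low = open("/dev/ip", O_RDWR);	// stream to link beneath it
 *	int muxid = ioctl(mux, I_PLINK, low);	// reaches ip_sioctl_plink()
 *	(void) close(low);			// the link persists past close
 *	...
 *	(void) ioctl(mux, I_PUNLINK, muxid);	// later, undo the link
 */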
8415 }
8417 /*
8418 * Do I_PLINK/I_LINK or I_PUNLINK/I_UNLINK with consistency checks and also
8419 * atomically set/clear the muxids. Also complete the ioctl by acking or
8420 * naking it. Note that the code is structured such that the link type,
8421 * whether it's persistent or not, is treated equally. ifconfig(8) and
8422 * its clones use the persistent link, while pppd(8) and perhaps many
8423 * other daemons may use a non-persistent link. When combined with some
8424 * ill_t states, linking and unlinking lower streams may be used as
8425 * indicators of dynamic re-plumbing events [see PSARC/1999/348].
8426 */
8427 /* ARGSUSED */
8428 void
8429 ip_sioctl_plink(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
8431 mblk_t *mp1;
8432 struct linkblk *li;
8433 int ioccmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd;
8434 int err = 0;
8436 ASSERT(ioccmd == I_PLINK || ioccmd == I_PUNLINK ||
8437 ioccmd == I_LINK || ioccmd == I_UNLINK);
8439 mp1 = mp->b_cont; /* This is the linkblk info */
8440 li = (struct linkblk *)mp1->b_rptr;
8442 err = ip_sioctl_plink_ipmod(ipsq, q, mp, ioccmd, li);
8443 if (err == EINPROGRESS)
8444 return;
8445 if (err == 0)
8446 miocack(q, mp, 0, 0);
8447 else
8448 miocnak(q, mp, 0, err);
8450 /* Conn was refheld in ip_sioctl_copyin_setup */
8451 if (CONN_Q(q)) {
8452 CONN_DEC_IOCTLREF(Q_TO_CONN(q));
8453 CONN_OPER_PENDING_DONE(Q_TO_CONN(q));
8454 }
8455 }
8457 /*
8458 * Process I_{P}LINK and I_{P}UNLINK requests named by `ioccmd' and pointed to
8459 * by `mp' and `li' for the IP module stream (if li->l_qbot is in fact an IP
8460 * module stream).
8461 * Returns zero on success, EINPROGRESS if the operation is still pending, or
8462 * an error code on failure.
8463 */
8464 static int
8465 ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd,
8466 struct linkblk *li)
8468 int err = 0;
8469 ill_t *ill;
8470 queue_t *ipwq, *dwq;
8471 const char *name;
8472 struct qinit *qinfo;
8473 boolean_t islink = (ioccmd == I_PLINK || ioccmd == I_LINK);
8474 boolean_t entered_ipsq = B_FALSE;
8475 boolean_t is_ip = B_FALSE;
8476 arl_t *arl;
8478 /*
8479 * Walk the lower stream to verify it's the IP module stream.
8480 * The IP module is identified by its name, wput function,
8481 * and non-NULL q_next. STREAMS ensures that the lower stream
8482 * (li->l_qbot) will not vanish until this ioctl completes.
8483 */
8484 for (ipwq = li->l_qbot; ipwq != NULL; ipwq = ipwq->q_next) {
8485 qinfo = ipwq->q_qinfo;
8486 name = qinfo->qi_minfo->mi_idname;
8487 if (name != NULL && strcmp(name, ip_mod_info.mi_idname) == 0 &&
8488 qinfo->qi_putp != ip_lwput && ipwq->q_next != NULL) {
8489 is_ip = B_TRUE;
8490 break;
8492 if (name != NULL && strcmp(name, arp_mod_info.mi_idname) == 0 &&
8493 qinfo->qi_putp != ip_lwput && ipwq->q_next != NULL) {
8494 break;
8495 }
8496 }
8501 /* If this isn't an IP module stream, bail. */
8501 if (ipwq == NULL)
8502 return (0);
8504 if (!is_ip) {
8505 arl = (arl_t *)ipwq->q_ptr;
8506 ill = arl_to_ill(arl);
8507 if (ill == NULL)
8508 return (0);
8509 } else {
8510 ill = ipwq->q_ptr;
8512 ASSERT(ill != NULL);
8514 if (ipsq == NULL) {
8515 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_sioctl_plink,
8516 NEW_OP, B_FALSE);
8517 if (ipsq == NULL) {
8518 if (!is_ip)
8519 ill_refrele(ill);
8520 return (EINPROGRESS);
8522 entered_ipsq = B_TRUE;
8524 ASSERT(IAM_WRITER_ILL(ill));
8525 mutex_enter(&ill->ill_lock);
8526 if (!is_ip) {
8527 if (islink && ill->ill_muxid == 0) {
8528 /*
8529 * Plumbing has to be done with IP plumbed first, arp
8530 * second, but here we have arp being plumbed first.
8531 */
8532 mutex_exit(&ill->ill_lock);
8533 if (entered_ipsq)
8534 ipsq_exit(ipsq);
8535 ill_refrele(ill);
8536 return (EINVAL);
8539 mutex_exit(&ill->ill_lock);
8540 if (!is_ip) {
8541 arl->arl_muxid = islink ? li->l_index : 0;
8542 ill_refrele(ill);
8543 goto done;
8546 if (IS_IPMP(ill) && (err = ip_sioctl_plink_ipmp(ill, ioccmd)) != 0)
8547 goto done;
8549 /*
8550 * As part of I_{P}LINKing, stash the number of downstream modules and
8551 * the read queue of the module immediately below IP in the ill.
8552 * These are used during the capability negotiation below.
8553 */
8554 ill->ill_lmod_rq = NULL;
8555 ill->ill_lmod_cnt = 0;
8556 if (islink && ((dwq = ipwq->q_next) != NULL)) {
8557 ill->ill_lmod_rq = RD(dwq);
8558 for (; dwq != NULL; dwq = dwq->q_next)
8559 ill->ill_lmod_cnt++;
8562 ill->ill_muxid = islink ? li->l_index : 0;
8564 /*
8565 * Mark the ipsq busy until the capability operations initiated below
8566 * complete. The PLINK/UNLINK ioctl itself completes when our caller
8567 * returns, but the capability operation may complete asynchronously
8568 * much later.
8569 */
8570 ipsq_current_start(ipsq, ill->ill_ipif, ioccmd);
8571 /*
8572 * If there's at least one up ipif on this ill, then we're bound to
8573 * the underlying driver via DLPI. In that case, renegotiate
8574 * capabilities to account for any possible change in modules
8575 * interposed between IP and the driver.
8576 */
8577 if (ill->ill_ipif_up_count > 0) {
8578 if (islink)
8579 ill_capability_probe(ill);
8580 else
8581 ill_capability_reset(ill, B_FALSE);
8583 ipsq_current_finish(ipsq);
8584 done:
8585 if (entered_ipsq)
8586 ipsq_exit(ipsq);
8588 return (err);
8589 }
8591 /*
8592 * Search for the ioctl command in the ioctl tables and return a pointer
8593 * to the ioctl command information. The ioctl command tables are
8594 * static and fully populated at compile time.
8595 */
8596 ip_ioctl_cmd_t *
8597 ip_sioctl_lookup(int ioc_cmd)
8599 int index;
8600 ip_ioctl_cmd_t *ipip;
8601 ip_ioctl_cmd_t *ipip_end;
8603 if (ioc_cmd == IPI_DONTCARE)
8604 return (NULL);
8606 /*
8607 * Do a 2-step search. First search the indexed table
8608 * based on the least significant byte of the ioctl cmd.
8609 * If we don't find a match, then search the misc table
8610 * serially.
8611 */
8612 index = ioc_cmd & 0xFF;
8613 if (index < ip_ndx_ioctl_count) {
8614 ipip = &ip_ndx_ioctl_table[index];
8615 if (ipip->ipi_cmd == ioc_cmd) {
8616 /* Found a match in the ndx table */
8617 return (ipip);
8621 /* Search the misc table */
8622 ipip_end = &ip_misc_ioctl_table[ip_misc_ioctl_count];
8623 for (ipip = ip_misc_ioctl_table; ipip < ipip_end; ipip++) {
8624 if (ipip->ipi_cmd == ioc_cmd)
8625 /* Found a match in the misc table */
8626 return (ipip);
8629 return (NULL);
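/*
 * Illustrative sketch (not part of the original source) of the same
 * two-step dispatch: an O(1) table indexed by the command's low byte,
 * with a serial scan of a small "misc" table as the fallback.  The types
 * are hypothetical.
 *
 *	typedef struct {
 *		int	cmd;
 *		int	(*handler)(void *);
 *	} ent_t;
 *
 *	static const ent_t *
 *	tbl_lookup(int cmd, const ent_t *ndx, int nndx,
 *	    const ent_t *misc, int nmisc)
 *	{
 *		int i = cmd & 0xFF;
 *
 *		if (i < nndx && ndx[i].cmd == cmd)
 *			return (&ndx[i]);	// hit in the indexed table
 *		for (i = 0; i < nmisc; i++)
 *			if (misc[i].cmd == cmd)
 *				return (&misc[i]);
 *		return (NULL);
 *	}
 */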
8632 /*
8633 * Helper function for ip_sioctl_getsetprop(), which does some sanity checks.
8634 */
8635 static boolean_t
8636 getset_ioctl_checks(mod_ioc_prop_t *pioc, int ioc_cmd)
8638 uint_t flags = pioc->mpr_flags;
8639 if (ioc_cmd == SIOCSETPROP) {
8640 /*
8641 * One can either reset the value to its default value or
8642 * change the current value or append/remove the value from
8643 * a multi-valued property.
8644 */
8645 if ((flags & MOD_PROP_DEFAULT) != MOD_PROP_DEFAULT &&
8646 flags != MOD_PROP_ACTIVE &&
8647 flags != (MOD_PROP_ACTIVE|MOD_PROP_APPEND) &&
8648 flags != (MOD_PROP_ACTIVE|MOD_PROP_REMOVE))
8649 return (B_FALSE);
8650 } else {
8651 ASSERT(ioc_cmd == SIOCGETPROP);
8653 /*
8654 * One can retrieve only one kind of property information
8655 * at a time.
8656 */
8657 if ((flags & MOD_PROP_ACTIVE) != MOD_PROP_ACTIVE &&
8658 (flags & MOD_PROP_DEFAULT) != MOD_PROP_DEFAULT &&
8659 (flags & MOD_PROP_POSSIBLE) != MOD_PROP_POSSIBLE &&
8660 (flags & MOD_PROP_PERM) != MOD_PROP_PERM)
8661 return (B_FALSE);
8664 return (B_TRUE);
8665 }
8667 /*
8668 * Process the SIOC{SET|GET}PROP ioctls.
8669 */
8670 /* ARGSUSED */
8671 int
8672 ip_sioctl_getsetprop(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
8673 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_if_req)
8675 int ioc_cmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd;
8676 mblk_t *mp1;
8677 mod_ioc_prop_t *pioc;
8678 mod_prop_info_t *ptbl = NULL, *pinfo = NULL;
8679 ip_stack_t *ipst;
8680 netstack_t *stack;
8681 cred_t *cr;
8682 boolean_t set;
8683 int err;
8685 ASSERT(q->q_next == NULL);
8686 ASSERT(CONN_Q(q));
8688 mp1 = mp->b_cont->b_cont;
8689 ipst = CONNQ_TO_IPST(q);
8690 stack = ipst->ips_netstack;
8691 pioc = (mod_ioc_prop_t *)mp1->b_rptr;
8692 if (!getset_ioctl_checks(pioc, ioc_cmd))
8693 return (EINVAL);
8695 switch (pioc->mpr_proto) {
8696 case MOD_PROTO_IP:
8697 case MOD_PROTO_IPV4:
8698 case MOD_PROTO_IPV6:
8699 ptbl = ipst->ips_propinfo_tbl;
8700 break;
8701 case MOD_PROTO_RAWIP:
8702 ptbl = stack->netstack_icmp->is_propinfo_tbl;
8703 break;
8704 case MOD_PROTO_TCP:
8705 ptbl = stack->netstack_tcp->tcps_propinfo_tbl;
8706 break;
8707 case MOD_PROTO_UDP:
8708 ptbl = stack->netstack_udp->us_propinfo_tbl;
8709 break;
8710 case MOD_PROTO_SCTP:
8711 ptbl = stack->netstack_sctp->sctps_propinfo_tbl;
8712 break;
8713 default:
8714 return (EINVAL);
8715 }
8717 pioc->mpr_ifname[sizeof (pioc->mpr_ifname) - 1] = '\0';
8718 pioc->mpr_name[sizeof (pioc->mpr_name) - 1] = '\0';
8719 pioc->mpr_val[sizeof (pioc->mpr_val) - 1] = '\0';
8721 pinfo = mod_prop_lookup(ptbl, pioc->mpr_name, pioc->mpr_proto);
8722 if (pinfo == NULL)
8723 return (ENOENT);
8725 set = (ioc_cmd == SIOCSETPROP) ? B_TRUE : B_FALSE;
8726 if (set && pinfo->mpi_setf != NULL) {
8727 cr = msg_getcred(mp, NULL);
8728 err = pinfo->mpi_setf(stack, cr, pinfo, pioc->mpr_ifname,
8729 pioc->mpr_val, pioc->mpr_flags);
8730 } else if (!set && pinfo->mpi_getf != NULL) {
8731 err = pinfo->mpi_getf(stack, pinfo, pioc->mpr_ifname,
8732 pioc->mpr_val, sizeof (pioc->mpr_val), pioc->mpr_flags);
8733 } else {
8734 err = EPERM;
8737 return (err);
8738 }
8740 /*
8741 * Process the legacy ND_GET, ND_SET ioctls just for {ip|ip6}_forwarding,
8742 * as several routing daemons have unfortunately used these 'unpublished'
8743 * but well-known ioctls.
8744 */
8745 /* ARGSUSED */
8746 static void
8747 ip_process_legacy_nddprop(queue_t *q, mblk_t *mp)
8749 struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
8750 mblk_t *mp1 = mp->b_cont;
8751 char *pname, *pval, *buf;
8752 uint_t bufsize, proto;
8753 mod_prop_info_t *pinfo = NULL;
8754 ip_stack_t *ipst;
8755 int err = 0;
8757 ASSERT(CONN_Q(q));
8758 ipst = CONNQ_TO_IPST(q);
8760 if (iocp->ioc_count == 0 || mp1 == NULL) {
8761 miocnak(q, mp, 0, EINVAL);
8762 return;
8765 mp1->b_datap->db_lim[-1] = '\0'; /* Force null termination */
8766 pval = buf = pname = (char *)mp1->b_rptr;
8767 bufsize = MBLKL(mp1);
8769 if (strcmp(pname, "ip_forwarding") == 0) {
8770 pname = "forwarding";
8771 proto = MOD_PROTO_IPV4;
8772 } else if (strcmp(pname, "ip6_forwarding") == 0) {
8773 pname = "forwarding";
8774 proto = MOD_PROTO_IPV6;
8775 } else {
8776 miocnak(q, mp, 0, EINVAL);
8777 return;
8780 pinfo = mod_prop_lookup(ipst->ips_propinfo_tbl, pname, proto);
8782 switch (iocp->ioc_cmd) {
8783 case ND_GET:
8784 if ((err = pinfo->mpi_getf(ipst->ips_netstack, pinfo, NULL, buf,
8785 bufsize, 0)) == 0) {
8786 miocack(q, mp, iocp->ioc_count, 0);
8787 return;
8789 break;
8790 case ND_SET:
8791 /*
8792 * The buffer holds the property name and value in the
8793 * following format:
8794 * <property name>'\0'<property value>'\0'; extract them.
8795 */
8796 while (*pval++)
8797 noop;
8799 if (!*pval || pval >= (char *)mp1->b_wptr) {
8800 err = EINVAL;
8801 } else if ((err = pinfo->mpi_setf(ipst->ips_netstack, NULL,
8802 pinfo, NULL, pval, 0)) == 0) {
8803 miocack(q, mp, 0, 0);
8804 return;
8806 break;
8807 default:
8808 err = EINVAL;
8809 break;
8811 miocnak(q, mp, 0, err);
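/*
 * Illustrative sketch (not part of the original source): the ndd(8)-style
 * userland sequence that lands here.  For ND_SET the buffer carries
 * name'\0'value'\0', matching the parsing above.  The descriptor `fd'
 * (an IP stream or socket) is assumed; error handling is omitted.
 *
 *	#include <stropts.h>
 *	#include <inet/nd.h>
 *	#include <string.h>
 *
 *	char buf[128];
 *	struct strioctl str;
 *
 *	(void) memcpy(buf, "ip_forwarding\0001\000", 16);
 *	str.ic_cmd = ND_SET;
 *	str.ic_timout = -1;
 *	str.ic_len = 16;
 *	str.ic_dp = buf;
 *	(void) ioctl(fd, I_STR, &str);	// enable IPv4 forwarding
 */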
8812 }
8814 /*
8815 * Wrapper function for resuming deferred ioctl processing.
8816 * Used for SIOCGDSTINFO, SIOCGIP6ADDRPOLICY, SIOCGMSFILTER,
8817 * SIOCSMSFILTER, SIOCGIPMSFILTER, and SIOCSIPMSFILTER currently.
8818 */
8819 /* ARGSUSED */
8820 void
8821 ip_sioctl_copyin_resume(ipsq_t *dummy_ipsq, queue_t *q, mblk_t *mp,
8822 void *dummy_arg)
8824 ip_sioctl_copyin_setup(q, mp);
8825 }
8827 /*
8828 * ip_sioctl_copyin_setup is called by ip_wput_nondata with any M_IOCTL message
8829 * that arrives. Most of the IOCTLs are "socket" IOCTLs which we handle
8830 * in either I_STR or TRANSPARENT form, using the mi_copy facility.
8831 * We establish here the size of the block to be copied in. mi_copyin
8832 * arranges for this to happen, and processing continues in ip_wput_nondata
8833 * with an M_IOCDATA message.
8834 */
8835 void
8836 ip_sioctl_copyin_setup(queue_t *q, mblk_t *mp)
8838 int copyin_size;
8839 struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
8840 ip_ioctl_cmd_t *ipip;
8841 cred_t *cr;
8842 ip_stack_t *ipst;
8844 if (CONN_Q(q))
8845 ipst = CONNQ_TO_IPST(q);
8846 else
8847 ipst = ILLQ_TO_IPST(q);
8849 ipip = ip_sioctl_lookup(iocp->ioc_cmd);
8850 if (ipip == NULL) {
8851 /*
8852 * The ioctl is not one we understand or own.
8853 * Pass it along to be processed downstream,
8854 * if this is a module instance of IP; else nak
8855 * the ioctl.
8856 */
8857 if (q->q_next == NULL) {
8858 goto nak;
8859 } else {
8860 putnext(q, mp);
8861 return;
8862 }
8863 }
8865 /*
8866 * If this is deferred, then we will do all the checks when we
8867 * come back.
8868 */
8869 if ((iocp->ioc_cmd == SIOCGDSTINFO ||
8870 iocp->ioc_cmd == SIOCGIP6ADDRPOLICY) && !ip6_asp_can_lookup(ipst)) {
8871 ip6_asp_pending_op(q, mp, ip_sioctl_copyin_resume);
8872 return;
8873 }
8875 /*
8876 * Only allow a very small subset of IP ioctls on this stream if
8877 * IP is a module and not a driver. Allowing ioctls to be processed
8878 * in this case may cause assert failures or data corruption.
8879 * Typically G[L]IFFLAGS, SLIFNAME/IF_UNITSEL are the only few
8880 * ioctls allowed on an IP module stream, after which this stream
8881 * normally becomes a multiplexor (at which time the stream head
8882 * will fail all ioctls).
8883 */
8884 if ((q->q_next != NULL) && !(ipip->ipi_flags & IPI_MODOK)) {
8885 goto nak;
8888 /* Make sure we have ioctl data to process. */
8889 if (mp->b_cont == NULL && !(ipip->ipi_flags & IPI_NULL_BCONT))
8890 goto nak;
8892 /*
8893 * Prefer dblk credential over ioctl credential; some synthesized
8894 * ioctls have kcred set because there's no way to crhold()
8895 * a credential in some contexts. (ioc_cr is not crfree()d by
8896 * the framework; the caller of ioctl needs to hold the reference
8897 * for the duration of the call).
8898 */
8899 cr = msg_getcred(mp, NULL);
8900 if (cr == NULL)
8901 cr = iocp->ioc_cr;
8903 /* Make sure normal users don't send down privileged ioctls */
8904 if ((ipip->ipi_flags & IPI_PRIV) &&
8905 (cr != NULL) && secpolicy_ip_config(cr, B_TRUE) != 0) {
8906 /* We checked the privilege earlier but log it here */
8907 miocnak(q, mp, 0, secpolicy_ip_config(cr, B_FALSE));
8908 return;
8909 }
8911 /*
8912 * The ioctl command tables can only encode fixed length
8913 * ioctl data. If the length is variable, the table will
8914 * encode the length as zero. Such special cases are handled
8915 * below in the switch.
8916 */
8917 if (ipip->ipi_copyin_size != 0) {
8918 mi_copyin(q, mp, NULL, ipip->ipi_copyin_size);
8919 return;
8922 switch (iocp->ioc_cmd) {
8923 case O_SIOCGIFCONF:
8924 case SIOCGIFCONF:
8925 /*
8926 * This IOCTL is hilarious. See comments in
8927 * ip_sioctl_get_ifconf for the story.
8928 */
8929 if (iocp->ioc_count == TRANSPARENT)
8930 copyin_size = SIZEOF_STRUCT(ifconf,
8931 iocp->ioc_flag);
8932 else
8933 copyin_size = iocp->ioc_count;
8934 mi_copyin(q, mp, NULL, copyin_size);
8935 return;
8937 case O_SIOCGLIFCONF:
8938 case SIOCGLIFCONF:
8939 copyin_size = SIZEOF_STRUCT(lifconf, iocp->ioc_flag);
8940 mi_copyin(q, mp, NULL, copyin_size);
8941 return;
8943 case SIOCGLIFSRCOF:
8944 copyin_size = SIZEOF_STRUCT(lifsrcof, iocp->ioc_flag);
8945 mi_copyin(q, mp, NULL, copyin_size);
8946 return;
8948 case SIOCGIP6ADDRPOLICY:
8949 ip_sioctl_ip6addrpolicy(q, mp);
8950 ip6_asp_table_refrele(ipst);
8951 return;
8953 case SIOCSIP6ADDRPOLICY:
8954 ip_sioctl_ip6addrpolicy(q, mp);
8955 return;
8957 case SIOCGDSTINFO:
8958 ip_sioctl_dstinfo(q, mp);
8959 ip6_asp_table_refrele(ipst);
8960 return;
8962 case ND_SET:
8963 case ND_GET:
8964 ip_process_legacy_nddprop(q, mp);
8965 return;
8967 case I_PLINK:
8968 case I_PUNLINK:
8969 case I_LINK:
8970 case I_UNLINK:
8971 /*
8972 * We treat the non-persistent link similarly to the persistent
8973 * link case, in terms of plumbing/unplumbing, as well as
8974 * dynamic re-plumbing events indicator. See comments
8975 * in ip_sioctl_plink() for more.
8976 *
8977 * Request can be enqueued in the 'ipsq' while waiting
8978 * to become exclusive. So bump up the conn ref.
8979 */
8980 if (CONN_Q(q)) {
8981 CONN_INC_REF(Q_TO_CONN(q));
8982 CONN_INC_IOCTLREF(Q_TO_CONN(q))
8983 }
8984 ip_sioctl_plink(NULL, q, mp, NULL);
8985 return;
8987 case IP_IOCTL:
8988 ip_wput_ioctl(q, mp);
8989 return;
8991 case SIOCILB:
8992 /* The ioctl length varies depending on the ILB command. */
8993 copyin_size = iocp->ioc_count;
8994 if (copyin_size < sizeof (ilb_cmd_t))
8995 goto nak;
8996 mi_copyin(q, mp, NULL, copyin_size);
8997 return;
8999 default:
9000 cmn_err(CE_WARN, "Unknown ioctl %d/0x%x slipped through.",
9001 iocp->ioc_cmd, iocp->ioc_cmd);
9002 /* FALLTHRU */
9004 nak:
9005 if (mp->b_cont != NULL) {
9006 freemsg(mp->b_cont);
9007 mp->b_cont = NULL;
9009 iocp->ioc_error = EINVAL;
9010 mp->b_datap->db_type = M_IOCNAK;
9011 iocp->ioc_count = 0;
9012 qreply(q, mp);
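/*
 * Illustrative sketch (not part of the original source): a typical
 * userland consumer of the variable-length SIOCGLIFCONF path set up
 * above, sizing the buffer with SIOCGLIFNUM first.  The socket `s' is
 * assumed; error handling is omitted.
 *
 *	#include <sys/sockio.h>
 *	#include <net/if.h>
 *	#include <stdlib.h>
 *	#include <strings.h>
 *
 *	struct lifnum ln;
 *	struct lifconf lc;
 *
 *	bzero(&ln, sizeof (ln));
 *	ln.lifn_family = AF_UNSPEC;
 *	(void) ioctl(s, SIOCGLIFNUM, &ln);
 *	bzero(&lc, sizeof (lc));
 *	lc.lifc_family = AF_UNSPEC;
 *	lc.lifc_len = ln.lifn_count * sizeof (struct lifreq);
 *	lc.lifc_buf = malloc(lc.lifc_len);
 *	(void) ioctl(s, SIOCGLIFCONF, &lc);
 *	// lc.lifc_len now holds the bytes filled in; walk lc.lifc_req[]
 */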
9013 }
9015 static void
9016 ip_sioctl_garp_reply(mblk_t *mp, ill_t *ill, void *hwaddr, int flags)
9018 struct arpreq *ar;
9019 struct xarpreq *xar;
9020 mblk_t *tmp;
9021 struct iocblk *iocp;
9022 int x_arp_ioctl = B_FALSE;
9023 int *flagsp;
9024 char *storage = NULL;
9026 ASSERT(ill != NULL);
9028 iocp = (struct iocblk *)mp->b_rptr;
9029 ASSERT(iocp->ioc_cmd == SIOCGXARP || iocp->ioc_cmd == SIOCGARP);
9031 tmp = (mp->b_cont)->b_cont; /* xarpreq/arpreq */
9032 if ((iocp->ioc_cmd == SIOCGXARP) ||
9033 (iocp->ioc_cmd == SIOCSXARP)) {
9034 x_arp_ioctl = B_TRUE;
9035 xar = (struct xarpreq *)tmp->b_rptr;
9036 flagsp = &xar->xarp_flags;
9037 storage = xar->xarp_ha.sdl_data;
9038 } else {
9039 ar = (struct arpreq *)tmp->b_rptr;
9040 flagsp = &ar->arp_flags;
9041 storage = ar->arp_ha.sa_data;
9042 }
9044 /*
9045 * We're done if this is not an SIOCG{X}ARP
9046 */
9047 if (x_arp_ioctl) {
9048 storage += ill_xarp_info(&xar->xarp_ha, ill);
9049 if ((ill->ill_phys_addr_length + ill->ill_name_length) >
9050 sizeof (xar->xarp_ha.sdl_data)) {
9051 iocp->ioc_error = EINVAL;
9052 return;
9055 *flagsp = ATF_INUSE;
9056 /*
9057 * If /sbin/arp told us we are the authority using the "permanent"
9058 * flag, or if this is one of my addresses, print "permanent"
9059 * in the /sbin/arp output.
9060 */
9061 if ((flags & NCE_F_MYADDR) || (flags & NCE_F_AUTHORITY))
9062 *flagsp |= ATF_AUTHORITY;
9063 if (flags & NCE_F_NONUD)
9064 *flagsp |= ATF_PERM; /* not subject to aging */
9065 if (flags & NCE_F_PUBLISH)
9066 *flagsp |= ATF_PUBL;
9067 if (hwaddr != NULL) {
9068 *flagsp |= ATF_COM;
9069 bcopy((char *)hwaddr, storage, ill->ill_phys_addr_length);
9070 }
9071 }
9073 /*
9074 * Create a new logical interface. If ipif_id is zero (i.e. not a logical
9075 * interface) create the next available logical interface for this
9076 * physical interface.
9077 * If ipif is NULL (i.e. the lookup didn't find one) attempt to create an
9078 * ipif with the specified name.
9079 *
9080 * If the address family is not AF_UNSPEC then set the address as well.
9081 *
9082 * If ip_sioctl_addr returns EINPROGRESS then the ioctl (the copyout)
9083 * is completed when the DL_BIND_ACK arrives in ip_rput_dlpi_writer.
9084 *
9085 * Executed as a writer on the ill.
9086 * So no lock is needed to traverse the ipif chain, or examine the
9087 * phyint flags.
9088 */
9089 /* ARGSUSED */
9090 int
9091 ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
9092 ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq)
9094 mblk_t *mp1;
9095 struct lifreq *lifr;
9096 boolean_t isv6;
9097 boolean_t exists;
9098 char *name;
9099 char *endp;
9100 char *cp;
9101 int namelen;
9102 ipif_t *ipif;
9103 long id;
9104 ipsq_t *ipsq;
9105 ill_t *ill;
9106 sin_t *sin;
9107 int err = 0;
9108 boolean_t found_sep = B_FALSE;
9109 conn_t *connp;
9110 zoneid_t zoneid;
9111 ip_stack_t *ipst = CONNQ_TO_IPST(q);
9113 ASSERT(q->q_next == NULL);
9114 ip1dbg(("ip_sioctl_addif\n"));
9115 /* Existence of mp1 has been checked in ip_wput_nondata */
9116 mp1 = mp->b_cont->b_cont;
9117 /*
9118 * Null terminate the string to protect against buffer
9119 * overrun. String was generated by user code and may not
9120 * be trusted.
9121 */
9122 lifr = (struct lifreq *)mp1->b_rptr;
9123 lifr->lifr_name[LIFNAMSIZ - 1] = '\0';
9124 name = lifr->lifr_name;
9125 ASSERT(CONN_Q(q));
9126 connp = Q_TO_CONN(q);
9127 isv6 = (connp->conn_family == AF_INET6);
9128 zoneid = connp->conn_zoneid;
9129 namelen = mi_strlen(name);
9130 if (namelen == 0)
9131 return (EINVAL);
9133 exists = B_FALSE;
9134 if ((namelen + 1 == sizeof (ipif_loopback_name)) &&
9135 (mi_strcmp(name, ipif_loopback_name) == 0)) {
9136 /*
9137 * Allow creating lo0 using SIOCLIFADDIF.
9138 * There can't be any other writer thread, so we can pass null below
9139 * for the last 4 args to ipif_lookup_on_name.
9140 */
9141 ipif = ipif_lookup_on_name(lifr->lifr_name, namelen, B_TRUE,
9142 &exists, isv6, zoneid, ipst);
9143 /* Prevent any further action */
9144 if (ipif == NULL) {
9145 return (ENOBUFS);
9146 } else if (!exists) {
9147 /* We created the ipif now and as writer */
9148 ipif_refrele(ipif);
9149 return (0);
9150 } else {
9151 ill = ipif->ipif_ill;
9152 ill_refhold(ill);
9153 ipif_refrele(ipif);
9155 } else {
9156 /* Look for a colon in the name. */
9157 endp = &name[namelen];
9158 for (cp = endp; --cp > name; ) {
9159 if (*cp == IPIF_SEPARATOR_CHAR) {
9160 found_sep = B_TRUE;
9161 /*
9162 * Reject any non-decimal aliases for plumbing
9163 * of logical interfaces. Aliases with leading
9164 * zeroes are also rejected as they introduce
9165 * ambiguity in the naming of the interfaces.
9166 * Comparing with "0" takes care of all such
9167 * cases.
9168 */
9169 if ((strncmp("0", cp+1, 1)) == 0)
9170 return (EINVAL);
9172 if (ddi_strtol(cp+1, &endp, 10, &id) != 0 ||
9173 id <= 0 || *endp != '\0') {
9174 return (EINVAL);
9176 *cp = '\0';
9177 break;
9180 ill = ill_lookup_on_name(name, B_FALSE, isv6, NULL, ipst);
9181 if (found_sep)
9182 *cp = IPIF_SEPARATOR_CHAR;
9183 if (ill == NULL)
9184 return (ENXIO);
9187 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_process_ioctl, NEW_OP,
9188 B_TRUE);
9190 /*
9191 * Release the refhold due to the lookup, now that we are excl
9192 * or we are just returning.
9193 */
9194 ill_refrele(ill);
9196 if (ipsq == NULL)
9197 return (EINPROGRESS);
9199 /* We are now exclusive on the IPSQ */
9200 ASSERT(IAM_WRITER_ILL(ill));
9202 if (found_sep) {
9203 /* Now see if there is an IPIF with this unit number. */
9204 for (ipif = ill->ill_ipif; ipif != NULL;
9205 ipif = ipif->ipif_next) {
9206 if (ipif->ipif_id == id) {
9207 err = EEXIST;
9208 goto done;
9209 }
9210 }
9211 }
9213 /*
9214 * We use IRE_LOCAL for lo0:1 etc. for "receive only" use
9215 * of lo0. Plumbing for lo0:0 happens in ipif_lookup_on_name()
9216 * instead.
9217 */
9218 if ((ipif = ipif_allocate(ill, found_sep ? id : -1, IRE_LOCAL,
9219 B_TRUE, B_TRUE, &err)) == NULL) {
9220 goto done;
9223 /* Return created name with ioctl */
9224 (void) sprintf(lifr->lifr_name, "%s%c%d", ill->ill_name,
9225 IPIF_SEPARATOR_CHAR, ipif->ipif_id);
9226 ip1dbg(("created %s\n", lifr->lifr_name));
9228 /* Set address */
9229 sin = (sin_t *)&lifr->lifr_addr;
9230 if (sin->sin_family != AF_UNSPEC) {
9231 err = ip_sioctl_addr(ipif, sin, q, mp,
9232 &ip_ndx_ioctl_table[SIOCLIFADDR_NDX], lifr);
9235 done:
9236 ipsq_exit(ipsq);
9237 return (err);
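/*
 * Illustrative sketch (not part of the original source): plumbing a new
 * logical interface from userland via SIOCLIFADDIF, the request handled
 * above.  `s' is an AF_INET socket; "net0" is a placeholder name.
 *
 *	#include <sys/sockio.h>
 *	#include <net/if.h>
 *	#include <string.h>
 *	#include <strings.h>
 *
 *	struct lifreq lifr;
 *
 *	bzero(&lifr, sizeof (lifr));
 *	(void) strlcpy(lifr.lifr_name, "net0", sizeof (lifr.lifr_name));
 *	lifr.lifr_addr.ss_family = AF_UNSPEC;	// no address yet
 *	if (ioctl(s, SIOCLIFADDIF, &lifr) == 0)
 *		;	// lifr.lifr_name now reads e.g. "net0:1"
 *
 * The logical interface can later be removed with SIOCLIFREMOVEIF using
 * the same lifr_name, which enters ip_sioctl_removeif() below.
 */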
9238 }
9240 /*
9241 * Remove an existing logical interface. If ipif_id is zero (i.e. not a
9242 * logical interface) delete it based on the IP address (on this physical
9243 * interface). Otherwise delete it based on the ipif_id.
9244 * Also, special handling to allow a removeif of lo0.
9245 */
9246 /* ARGSUSED */
9247 int
9248 ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
9249 ip_ioctl_cmd_t *ipip, void *dummy_if_req)
9251 conn_t *connp;
9252 ill_t *ill = ipif->ipif_ill;
9253 boolean_t success;
9254 ip_stack_t *ipst;
9256 ipst = CONNQ_TO_IPST(q);
9258 ASSERT(q->q_next == NULL);
9259 ip1dbg(("ip_sioctl_remove_if(%s:%u %p)\n",
9260 ill->ill_name, ipif->ipif_id, (void *)ipif));
9261 ASSERT(IAM_WRITER_IPIF(ipif));
9263 connp = Q_TO_CONN(q);
9264 /*
9265 * Special case for unplumbing lo0 (the loopback physical interface).
9266 * If unplumbing lo0, the incoming address structure has been
9267 * initialized to all zeros. When unplumbing lo0, all its logical
9268 * interfaces must be removed too.
9269 *
9270 * Note that this interface may be called to remove a specific
9271 * loopback logical interface (eg, lo0:1). But in that case
9272 * ipif->ipif_id != 0 so that the code path for that case is the
9273 * same as any other interface (meaning it skips the code directly
9274 * below).
9275 */
9276 if (ipif->ipif_id == 0 && ill->ill_net_type == IRE_LOOPBACK) {
9277 if (sin->sin_family == AF_UNSPEC &&
9278 (IN6_IS_ADDR_UNSPECIFIED(&((sin6_t *)sin)->sin6_addr))) {
9279 /*
9280 * Mark it condemned. No new ref. will be made to ill.
9281 */
9282 mutex_enter(&ill->ill_lock);
9283 ill->ill_state_flags |= ILL_CONDEMNED;
9284 for (ipif = ill->ill_ipif; ipif != NULL;
9285 ipif = ipif->ipif_next) {
9286 ipif->ipif_state_flags |= IPIF_CONDEMNED;
9288 mutex_exit(&ill->ill_lock);
9290 ipif = ill->ill_ipif;
9291 /* unplumb the loopback interface */
9292 ill_delete(ill);
9293 mutex_enter(&connp->conn_lock);
9294 mutex_enter(&ill->ill_lock);
9296 /* Are any references to this ill active */
9297 if (ill_is_freeable(ill)) {
9298 mutex_exit(&ill->ill_lock);
9299 mutex_exit(&connp->conn_lock);
9300 ill_delete_tail(ill);
9301 mi_free(ill);
9302 return (0);
9304 success = ipsq_pending_mp_add(connp, ipif,
9305 CONNP_TO_WQ(connp), mp, ILL_FREE);
9306 mutex_exit(&connp->conn_lock);
9307 mutex_exit(&ill->ill_lock);
9308 if (success)
9309 return (EINPROGRESS);
9310 else
9311 return (EINTR);
9315 if (ipif->ipif_id == 0) {
9316 ipsq_t *ipsq;
9318 /* Find based on address */
9319 if (ipif->ipif_isv6) {
9320 sin6_t *sin6;
9322 if (sin->sin_family != AF_INET6)
9323 return (EAFNOSUPPORT);
9325 sin6 = (sin6_t *)sin;
9326 /* We are a writer, so we should be able to lookup */
9327 ipif = ipif_lookup_addr_exact_v6(&sin6->sin6_addr, ill,
9328 ipst);
9329 } else {
9330 if (sin->sin_family != AF_INET)
9331 return (EAFNOSUPPORT);
9333 /* We are a writer, so we should be able to lookup */
9334 ipif = ipif_lookup_addr_exact(sin->sin_addr.s_addr, ill,
9335 ipst);
9337 if (ipif == NULL) {
9338 return (EADDRNOTAVAIL);
9339 }
9341 /*
9342 * It is possible for a user to send an SIOCLIFREMOVEIF with
9343 * lifr_name of the physical interface but with an ip address
9344 * lifr_addr of a logical interface plumbed over it.
9345 * So update ipx_current_ipif now that ipif points to the
9346 * correct one.
9347 */
9348 ipsq = ipif->ipif_ill->ill_phyint->phyint_ipsq;
9349 ipsq->ipsq_xop->ipx_current_ipif = ipif;
9351 /* This is a writer */
9352 ipif_refrele(ipif);
9353 }
9355 /*
9356 * Cannot delete instance zero since it is tied to the ill.
9357 */
9358 if (ipif->ipif_id == 0)
9359 return (EBUSY);
9361 mutex_enter(&ill->ill_lock);
9362 ipif->ipif_state_flags |= IPIF_CONDEMNED;
9363 mutex_exit(&ill->ill_lock);
9365 ipif_free(ipif);
9367 mutex_enter(&connp->conn_lock);
9368 mutex_enter(&ill->ill_lock);
9370 /* Are any references to this ipif active */
9371 if (ipif_is_freeable(ipif)) {
9372 mutex_exit(&ill->ill_lock);
9373 mutex_exit(&connp->conn_lock);
9374 ipif_non_duplicate(ipif);
9375 (void) ipif_down_tail(ipif);
9376 ipif_free_tail(ipif); /* frees ipif */
9377 return (0);
9379 success = ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp), mp,
9380 IPIF_FREE);
9381 mutex_exit(&ill->ill_lock);
9382 mutex_exit(&connp->conn_lock);
9383 if (success)
9384 return (EINPROGRESS);
9385 else
9386 return (EINTR);
9387 }
9389 /*
9390 * Restart the removeif ioctl. The refcnt has gone down to 0.
9391 * The ipif is already condemned. So can't find it thru lookups.
9392 */
9393 /* ARGSUSED */
9394 int
9395 ip_sioctl_removeif_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q,
9396 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_if_req)
9398 ill_t *ill = ipif->ipif_ill;
9400 ASSERT(IAM_WRITER_IPIF(ipif));
9401 ASSERT(ipif->ipif_state_flags & IPIF_CONDEMNED);
9403 ip1dbg(("ip_sioctl_removeif_restart(%s:%u %p)\n",
9404 ill->ill_name, ipif->ipif_id, (void *)ipif));
9406 if (ipif->ipif_id == 0 && ill->ill_net_type == IRE_LOOPBACK) {
9407 ASSERT(ill->ill_state_flags & ILL_CONDEMNED);
9408 ill_delete_tail(ill);
9409 mi_free(ill);
9410 return (0);
9413 ipif_non_duplicate(ipif);
9414 (void) ipif_down_tail(ipif);
9415 ipif_free_tail(ipif);
9417 return (0);
9418 }
9420 /*
9421 * Set the local interface address using the given prefix and ill_token.
9422 */
9423 /* ARGSUSED */
9424 int
9425 ip_sioctl_prefix(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
9426 ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq)
9428 int err;
9429 in6_addr_t v6addr;
9430 sin6_t *sin6;
9431 ill_t *ill;
9432 int i;
9434 ip1dbg(("ip_sioctl_prefix(%s:%u %p)\n",
9435 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
9437 ASSERT(IAM_WRITER_IPIF(ipif));
9439 if (!ipif->ipif_isv6)
9440 return (EINVAL);
9442 if (sin->sin_family != AF_INET6)
9443 return (EAFNOSUPPORT);
9445 sin6 = (sin6_t *)sin;
9446 v6addr = sin6->sin6_addr;
9447 ill = ipif->ipif_ill;
9449 if (IN6_IS_ADDR_UNSPECIFIED(&v6addr) ||
9450 IN6_IS_ADDR_UNSPECIFIED(&ill->ill_token))
9451 return (EADDRNOTAVAIL);
9453 for (i = 0; i < 4; i++)
9454 sin6->sin6_addr.s6_addr32[i] |= ill->ill_token.s6_addr32[i];
9456 err = ip_sioctl_addr(ipif, sin, q, mp,
9457 &ip_ndx_ioctl_table[SIOCLIFADDR_NDX], dummy_ifreq);
9458 return (err);
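/*
 * Illustrative sketch (not part of the original source) of the address
 * composition performed above: the caller-supplied prefix is OR-ed with
 * the interface token, shown here byte-wise on the portable in6_addr
 * fields.  With a /64 prefix and a 64-bit token the halves do not
 * overlap, so the OR simply concatenates them.
 *
 *	#include <netinet/in.h>
 *
 *	static void
 *	compose_v6(const struct in6_addr *pfx, const struct in6_addr *tok,
 *	    struct in6_addr *out)
 *	{
 *		int i;
 *
 *		for (i = 0; i < 16; i++)
 *			out->s6_addr[i] = pfx->s6_addr[i] | tok->s6_addr[i];
 *	}
 */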
9459 }
9461 /*
9462 * Restart entry point to restart the address set operation after the
9463 * refcounts have dropped to zero.
9464 */
9465 /* ARGSUSED */
9466 int
9467 ip_sioctl_prefix_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
9468 ip_ioctl_cmd_t *ipip, void *ifreq)
9470 ip1dbg(("ip_sioctl_prefix_restart(%s:%u %p)\n",
9471 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
9472 return (ip_sioctl_addr_restart(ipif, sin, q, mp, ipip, ifreq));
9473 }
9475 /*
9476 * Set the local interface address.
9477 * Allow an address of all zero when the interface is down.
9478 */
9479 /* ARGSUSED */
9480 int
9481 ip_sioctl_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
9482 ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq)
9484 int err = 0;
9485 in6_addr_t v6addr;
9486 boolean_t need_up = B_FALSE;
9487 ill_t *ill;
9488 int i;
9490 ip1dbg(("ip_sioctl_addr(%s:%u %p)\n",
9491 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
9493 ASSERT(IAM_WRITER_IPIF(ipif));
9495 ill = ipif->ipif_ill;
9496 if (ipif->ipif_isv6) {
9497 sin6_t *sin6;
9498 phyint_t *phyi;
9500 if (sin->sin_family != AF_INET6)
9501 return (EAFNOSUPPORT);
9503 sin6 = (sin6_t *)sin;
9504 v6addr = sin6->sin6_addr;
9505 phyi = ill->ill_phyint;
9507 /*
9508 * Enforce that true multicast interfaces have a link-local
9509 * address for logical unit 0.
9510 *
9511 * However for those ipif's for which link-local address was
9512 * not created by default, also allow setting :: as the address.
9513 * This scenario arises when we delete an address on the ipif
9514 * with logical unit 0 and then want to set :: as the address.
9515 */
9516 if (ipif->ipif_id == 0 &&
9517 (ill->ill_flags & ILLF_MULTICAST) &&
9518 !(ipif->ipif_flags & (IPIF_POINTOPOINT)) &&
9519 !(phyi->phyint_flags & (PHYI_LOOPBACK)) &&
9520 !IN6_IS_ADDR_LINKLOCAL(&v6addr)) {
9522 /*
9523 * If default link-local was not created by the kernel for
9524 * this ill, allow setting :: as the address on ipif:0.
9525 */
9526 if (ill->ill_flags & ILLF_NOLINKLOCAL) {
9527 if (!IN6_IS_ADDR_UNSPECIFIED(&v6addr))
9528 return (EADDRNOTAVAIL);
9529 } else {
9530 return (EADDRNOTAVAIL);
9531 }
9532 }
9534 /*
9535 * Up interfaces shouldn't have the unspecified address
9536 * unless they also have the IPIF_NOLOCAL flag set and
9537 * have a subnet assigned.
9538 */
9539 if ((ipif->ipif_flags & IPIF_UP) &&
9540 IN6_IS_ADDR_UNSPECIFIED(&v6addr) &&
9541 (!(ipif->ipif_flags & IPIF_NOLOCAL) ||
9542 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) {
9543 return (EADDRNOTAVAIL);
9546 if (!ip_local_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask))
9547 return (EADDRNOTAVAIL);
9548 } else {
9549 ipaddr_t addr;
9551 if (sin->sin_family != AF_INET)
9552 return (EAFNOSUPPORT);
9554 addr = sin->sin_addr.s_addr;
9556 /* Allow INADDR_ANY as the local address. */
9557 if (addr != INADDR_ANY &&
9558 !ip_addr_ok_v4(addr, ipif->ipif_net_mask))
9559 return (EADDRNOTAVAIL);
9561 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
9562 }
9563 /*
9564 * Verify that the address being configured is permitted by the
9565 * ill_allowed_ips[] for the interface.
9566 */
9567 if (ill->ill_allowed_ips_cnt > 0) {
9568 for (i = 0; i < ill->ill_allowed_ips_cnt; i++) {
9569 if (IN6_ARE_ADDR_EQUAL(&ill->ill_allowed_ips[i],
9570 &v6addr))
9571 break;
9572 }
9573 if (i == ill->ill_allowed_ips_cnt) {
9574 pr_addr_dbg("!allowed addr %s\n", AF_INET6, &v6addr);
9575 return (EPERM);
9576 }
9577 }
9578 /*
9579 * Even if there is no change we redo things just to rerun
9580 * ipif_set_default.
9581 */
9582 if (ipif->ipif_flags & IPIF_UP) {
9583 /*
9584 * Setting a new local address, make sure
9585 * we have net and subnet bcast ire's for
9586 * the old address if we need them.
9587 *
9589 * If the interface is already marked up,
9590 * we call ipif_down which will take care
9591 * of ditching any IREs that have been set
9592 * up based on the old interface address.
9593 */
9594 err = ipif_logical_down(ipif, q, mp);
9595 if (err == EINPROGRESS)
9596 return (err);
9597 (void) ipif_down_tail(ipif);
9598 need_up = B_TRUE;
9599 }
9601 err = ip_sioctl_addr_tail(ipif, sin, q, mp, need_up);
9602 return (err);
9603 }
9605 int
9606 ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
9607 boolean_t need_up)
9609 in6_addr_t v6addr;
9610 in6_addr_t ov6addr;
9611 ipaddr_t addr;
9612 sin6_t *sin6;
9613 int sinlen;
9614 int err = 0;
9615 ill_t *ill = ipif->ipif_ill;
9616 boolean_t need_dl_down;
9617 boolean_t need_arp_down;
9618 struct iocblk *iocp;
9620 iocp = (mp != NULL) ? (struct iocblk *)mp->b_rptr : NULL;
9622 ip1dbg(("ip_sioctl_addr_tail(%s:%u %p)\n",
9623 ill->ill_name, ipif->ipif_id, (void *)ipif));
9624 ASSERT(IAM_WRITER_IPIF(ipif));
9626 /* Must cancel any pending timer before taking the ill_lock */
9627 if (ipif->ipif_recovery_id != 0)
9628 (void) untimeout(ipif->ipif_recovery_id);
9629 ipif->ipif_recovery_id = 0;
9631 if (ipif->ipif_isv6) {
9632 sin6 = (sin6_t *)sin;
9633 v6addr = sin6->sin6_addr;
9634 sinlen = sizeof (struct sockaddr_in6);
9635 } else {
9636 addr = sin->sin_addr.s_addr;
9637 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
9638 sinlen = sizeof (struct sockaddr_in);
9640 mutex_enter(&ill->ill_lock);
9641 ov6addr = ipif->ipif_v6lcl_addr;
9642 ipif->ipif_v6lcl_addr = v6addr;
9643 sctp_update_ipif_addr(ipif, ov6addr);
9644 ipif->ipif_addr_ready = 0;
9646 ip_rts_newaddrmsg(RTM_CHGADDR, 0, ipif, RTSQ_DEFAULT);
9648 /*
9649 * If the interface was previously marked as a duplicate, then since
9650 * we've now got a "new" address, it should no longer be considered a
9651 * duplicate -- even if the "new" address is the same as the old one.
9652 * Note that if all ipifs are down, we may have a pending ARP down
9653 * event to handle. This is because we want to recover from duplicates
9654 * and thus delay tearing down ARP until the duplicates have been
9655 * removed or disabled.
9656 */
9657 need_dl_down = need_arp_down = B_FALSE;
9658 if (ipif->ipif_flags & IPIF_DUPLICATE) {
9659 need_arp_down = !need_up;
9660 ipif->ipif_flags &= ~IPIF_DUPLICATE;
9661 if (--ill->ill_ipif_dup_count == 0 && !need_up &&
9662 ill->ill_ipif_up_count == 0 && ill->ill_dl_up) {
9663 need_dl_down = B_TRUE;
9664 }
9665 }
9667 ipif_set_default(ipif);
9669 /*
9670 * If we've just manually set the IPv6 link-local address (0th ipif),
9671 * tag the ill so that future updates to the interface ID don't result
9672 * in this address getting automatically reconfigured from under the
9673 * administrator.
9674 */
9675 if (ipif->ipif_isv6 && ipif->ipif_id == 0) {
9676 if (iocp == NULL || (iocp->ioc_cmd == SIOCSLIFADDR &&
9677 !IN6_IS_ADDR_UNSPECIFIED(&v6addr)))
9678 ill->ill_manual_linklocal = 1;
9679 }
9681 /*
9682 * When publishing an interface address change event, we only notify
9683 * the event listeners of the new address. It is assumed that if they
9684 * actively care about the addresses assigned that they will have
9685 * already discovered the previous address assigned (if there was one.)
9686 *
9687 * Don't attach nic event message for SIOCLIFADDIF ioctl.
9688 */
9689 if (iocp != NULL && iocp->ioc_cmd != SIOCLIFADDIF) {
9690 ill_nic_event_dispatch(ill, MAP_IPIF_ID(ipif->ipif_id),
9691 NE_ADDRESS_CHANGE, sin, sinlen);
9694 mutex_exit(&ill->ill_lock);
9696 if (need_up) {
9697 /*
9698 * Now bring the interface back up. If this
9699 * is the only IPIF for the ILL, ipif_up
9700 * will have to re-bind to the device, so
9701 * we may get back EINPROGRESS, in which
9702 * case, this IOCTL will get completed in
9703 * ip_rput_dlpi when we see the DL_BIND_ACK.
9704 */
9705 err = ipif_up(ipif, q, mp);
9706 } else {
9707 /* Perhaps ilgs should use this ill */
9708 update_conn_ill(NULL, ill->ill_ipst);
9711 if (need_dl_down)
9712 ill_dl_down(ill);
9714 if (need_arp_down && !ill->ill_isv6)
9715 (void) ipif_arp_down(ipif);
9717 /*
9718 * The default multicast interface might have changed (for
9719 * instance if the IPv6 scope of the address changed).
9720 */
9721 ire_increment_multicast_generation(ill->ill_ipst, ill->ill_isv6);
9723 return (err);
9724 }
9726 /*
9727 * Restart entry point to restart the address set operation after the
9728 * refcounts have dropped to zero.
9729 */
9730 /* ARGSUSED */
9731 int
9732 ip_sioctl_addr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
9733 ip_ioctl_cmd_t *ipip, void *ifreq)
9735 ip1dbg(("ip_sioctl_addr_restart(%s:%u %p)\n",
9736 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
9737 ASSERT(IAM_WRITER_IPIF(ipif));
9738 (void) ipif_down_tail(ipif);
9739 return (ip_sioctl_addr_tail(ipif, sin, q, mp, B_TRUE));
9740 }
9742 /* ARGSUSED */
9743 int
9744 ip_sioctl_get_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
9745 ip_ioctl_cmd_t *ipip, void *if_req)
9747 sin6_t *sin6 = (struct sockaddr_in6 *)sin;
9748 struct lifreq *lifr = (struct lifreq *)if_req;
9750 ip1dbg(("ip_sioctl_get_addr(%s:%u %p)\n",
9751 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
9752 /*
9753 * The net mask and address can't change since we have a
9754 * reference to the ipif. So no lock is necessary.
9755 */
9756 if (ipif->ipif_isv6) {
9757 *sin6 = sin6_null;
9758 sin6->sin6_family = AF_INET6;
9759 sin6->sin6_addr = ipif->ipif_v6lcl_addr;
9760 if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) {
9761 sin6->sin6_scope_id =
9762 ipif->ipif_ill->ill_phyint->phyint_ifindex;
9764 ASSERT(ipip->ipi_cmd_type == LIF_CMD);
9765 lifr->lifr_addrlen =
9766 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask);
9767 } else {
9768 *sin = sin_null;
9769 sin->sin_family = AF_INET;
9770 sin->sin_addr.s_addr = ipif->ipif_lcl_addr;
9771 if (ipip->ipi_cmd_type == LIF_CMD) {
9772 lifr->lifr_addrlen =
9773 ip_mask_to_plen(ipif->ipif_net_mask);
9776 return (0);
9777 }
9779 /*
9780 * Set the destination address for a pt-pt interface.
9781 */
9782 /* ARGSUSED */
9783 int
9784 ip_sioctl_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
9785 ip_ioctl_cmd_t *ipip, void *if_req)
9787 int err = 0;
9788 in6_addr_t v6addr;
9789 boolean_t need_up = B_FALSE;
9791 ip1dbg(("ip_sioctl_dstaddr(%s:%u %p)\n",
9792 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
9793 ASSERT(IAM_WRITER_IPIF(ipif));
9795 if (ipif->ipif_isv6) {
9796 sin6_t *sin6;
9798 if (sin->sin_family != AF_INET6)
9799 return (EAFNOSUPPORT);
9801 sin6 = (sin6_t *)sin;
9802 v6addr = sin6->sin6_addr;
9804 if (!ip_remote_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask))
9805 return (EADDRNOTAVAIL);
9806 } else {
9807 ipaddr_t addr;
9809 if (sin->sin_family != AF_INET)
9810 return (EAFNOSUPPORT);
9812 addr = sin->sin_addr.s_addr;
9813 if (addr != INADDR_ANY &&
9814 !ip_addr_ok_v4(addr, ipif->ipif_net_mask)) {
9815 return (EADDRNOTAVAIL);
9818 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
9821 if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6pp_dst_addr, &v6addr))
9822 return (0); /* No change */
9824 if (ipif->ipif_flags & IPIF_UP) {
9826 * If the interface is already marked up,
9827 * we call ipif_down which will take care
9828 * of ditching any IREs that have been set
9829 * up based on the old pp dst address.
9831 err = ipif_logical_down(ipif, q, mp);
9832 if (err == EINPROGRESS)
9833 return (err);
9834 (void) ipif_down_tail(ipif);
9835 need_up = B_TRUE;
9838 * could return EINPROGRESS. If so ioctl will complete in
9839 * ip_rput_dlpi_writer
9841 err = ip_sioctl_dstaddr_tail(ipif, sin, q, mp, need_up);
9842 return (err);
9845 static int
9846 ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
9847 boolean_t need_up)
9849 in6_addr_t v6addr;
9850 ill_t *ill = ipif->ipif_ill;
9851 int err = 0;
9852 boolean_t need_dl_down;
9853 boolean_t need_arp_down;
9855 ip1dbg(("ip_sioctl_dstaddr_tail(%s:%u %p)\n", ill->ill_name,
9856 ipif->ipif_id, (void *)ipif));
9858 /* Must cancel any pending timer before taking the ill_lock */
9859 if (ipif->ipif_recovery_id != 0)
9860 (void) untimeout(ipif->ipif_recovery_id);
9861 ipif->ipif_recovery_id = 0;
9863 if (ipif->ipif_isv6) {
9864 sin6_t *sin6;
9866 sin6 = (sin6_t *)sin;
9867 v6addr = sin6->sin6_addr;
9868 } else {
9869 ipaddr_t addr;
9871 addr = sin->sin_addr.s_addr;
9872 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
9874 mutex_enter(&ill->ill_lock);
9875 /* Set point to point destination address. */
9876 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) {
9878 * Allow this as a means of creating logical
9879 * pt-pt interfaces on top of e.g. an Ethernet.
9880 * XXX Undocumented HACK for testing.
9881 * pt-pt interfaces are created with NUD disabled.
9883 ipif->ipif_flags |= IPIF_POINTOPOINT;
9884 ipif->ipif_flags &= ~IPIF_BROADCAST;
9885 if (ipif->ipif_isv6)
9886 ill->ill_flags |= ILLF_NONUD;
9890 * If the interface was previously marked as a duplicate, then since
9891 * we've now got a "new" address, it should no longer be considered a
9892 * duplicate -- even if the "new" address is the same as the old one.
9893 * Note that if all ipifs are down, we may have a pending ARP down
9894 * event to handle.
9896 need_dl_down = need_arp_down = B_FALSE;
9897 if (ipif->ipif_flags & IPIF_DUPLICATE) {
9898 need_arp_down = !need_up;
9899 ipif->ipif_flags &= ~IPIF_DUPLICATE;
9900 if (--ill->ill_ipif_dup_count == 0 && !need_up &&
9901 ill->ill_ipif_up_count == 0 && ill->ill_dl_up) {
9902 need_dl_down = B_TRUE;
9907 * If we've just manually set the IPv6 destination link-local address
9908 * (0th ipif), tag the ill so that future updates to the destination
9909 * interface ID (as can happen with interfaces over IP tunnels) don't
9910 * result in this address getting automatically reconfigured from
9911 * under the administrator.
9913 if (ipif->ipif_isv6 && ipif->ipif_id == 0)
9914 ill->ill_manual_dst_linklocal = 1;
9916 /* Set the new address. */
9917 ipif->ipif_v6pp_dst_addr = v6addr;
9918 /* Make sure subnet tracks pp_dst */
9919 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr;
9920 mutex_exit(&ill->ill_lock);
9922 if (need_up) {
9924 * Now bring the interface back up. If this
9925 * is the only IPIF for the ILL, ipif_up
9926 * will have to re-bind to the device, so
9927 * we may get back EINPROGRESS, in which
9928 * case, this IOCTL will get completed in
9929 * ip_rput_dlpi when we see the DL_BIND_ACK.
9931 err = ipif_up(ipif, q, mp);
9934 if (need_dl_down)
9935 ill_dl_down(ill);
9936 if (need_arp_down && !ipif->ipif_isv6)
9937 (void) ipif_arp_down(ipif);
9939 return (err);
9943 * Restart entry point to restart the dstaddress set operation after the
9944 * refcounts have dropped to zero.
9946 /* ARGSUSED */
9948 ip_sioctl_dstaddr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
9949 ip_ioctl_cmd_t *ipip, void *ifreq)
9951 ip1dbg(("ip_sioctl_dstaddr_restart(%s:%u %p)\n",
9952 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
9953 (void) ipif_down_tail(ipif);
9954 return (ip_sioctl_dstaddr_tail(ipif, sin, q, mp, B_TRUE));
9957 /* ARGSUSED */
9959 ip_sioctl_get_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
9960 ip_ioctl_cmd_t *ipip, void *if_req)
9962 sin6_t *sin6 = (struct sockaddr_in6 *)sin;
9964 ip1dbg(("ip_sioctl_get_dstaddr(%s:%u %p)\n",
9965 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
9967 * Get point to point destination address. The addresses can't
9968 * change since we hold a reference to the ipif.
9970 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0)
9971 return (EADDRNOTAVAIL);
9973 if (ipif->ipif_isv6) {
9974 ASSERT(ipip->ipi_cmd_type == LIF_CMD);
9975 *sin6 = sin6_null;
9976 sin6->sin6_family = AF_INET6;
9977 sin6->sin6_addr = ipif->ipif_v6pp_dst_addr;
9978 } else {
9979 *sin = sin_null;
9980 sin->sin_family = AF_INET;
9981 sin->sin_addr.s_addr = ipif->ipif_pp_dst_addr;
9983 return (0);
9987 * Check which flags will change when the given flags are set;
9988 * silently ignore flags which userland is not allowed to control.
9989 * (Because these flags may change between SIOCGLIFFLAGS and
9990 * SIOCSLIFFLAGS, and that's outside of userland's control,
9991 * we need to silently ignore them rather than fail.)
9993 static void
9994 ip_sioctl_flags_onoff(ipif_t *ipif, uint64_t flags, uint64_t *onp,
9995 uint64_t *offp)
9997 ill_t *ill = ipif->ipif_ill;
9998 phyint_t *phyi = ill->ill_phyint;
9999 uint64_t cantchange_flags, intf_flags;
10000 uint64_t turn_on, turn_off;
10002 intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags;
10003 cantchange_flags = IFF_CANTCHANGE;
10004 if (IS_IPMP(ill))
10005 cantchange_flags |= IFF_IPMP_CANTCHANGE;
10006 turn_on = (flags ^ intf_flags) & ~cantchange_flags;
10007 turn_off = intf_flags & turn_on;
10008 turn_on ^= turn_off;
10009 *onp = turn_on;
10010 *offp = turn_off;
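/*
 * A worked example of the partitioning in ip_sioctl_flags_onoff() above,
 * assuming no unchangeable bits: with intf_flags = 0b1010 and
 * flags = 0b0110,
 *
 *	flags ^ intf_flags         = 0b1100   (all changing bits)
 *	turn_off = 0b1010 & 0b1100 = 0b1000   (changing bits currently set)
 *	turn_on  = 0b1100 ^ 0b1000 = 0b0100   (changing bits currently clear)
 *
 * so bit 0b0100 is to be turned on and bit 0b1000 turned off.
 */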
10014 * Set interface flags. Many flags require special handling (e.g.,
10015 * bringing the interface down); see below for details.
10017 * NOTE : We really don't enforce that ipif_id zero should be used
10018 * for setting any flags other than IFF_LOGINT_FLAGS. This
10019 * is because applications generally do a SIOCGLIFFLAGS,
10020 * OR in the new flags (which affect the logical interface), and
10021 * then do a SIOCSLIFFLAGS. Thus, "flags" below could contain bits
10022 * other than IFF_LOGINT_FLAGS. One could check whether "turn_on"
10023 * (the flags that will be turned on) is correct with respect to
10024 * ipif_id 0. For backward compatibility reasons, that is not done.
10026 /* ARGSUSED */
10028 ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10029 ip_ioctl_cmd_t *ipip, void *if_req)
10031 uint64_t turn_on;
10032 uint64_t turn_off;
10033 int err = 0;
10034 phyint_t *phyi;
10035 ill_t *ill;
10036 conn_t *connp;
10037 uint64_t intf_flags;
10038 boolean_t phyint_flags_modified = B_FALSE;
10039 uint64_t flags;
10040 struct ifreq *ifr;
10041 struct lifreq *lifr;
10042 boolean_t set_linklocal = B_FALSE;
10044 ip1dbg(("ip_sioctl_flags(%s:%u %p)\n",
10045 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10047 ASSERT(IAM_WRITER_IPIF(ipif));
10049 ill = ipif->ipif_ill;
10050 phyi = ill->ill_phyint;
10052 if (ipip->ipi_cmd_type == IF_CMD) {
10053 ifr = (struct ifreq *)if_req;
10054 flags = (uint64_t)(ifr->ifr_flags & 0x0000ffff);
10055 } else {
10056 lifr = (struct lifreq *)if_req;
10057 flags = lifr->lifr_flags;
10060 intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags;
10063 * Have the flags been set correctly until now?
10065 ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0);
10066 ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0);
10067 ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0);
10069 * Compare the new flags to the old, and partition
10070 * into those coming on and those going off.
10071 * For the 16-bit command, keep the bits above bit 16 unchanged.
10073 if (ipip->ipi_cmd == SIOCSIFFLAGS)
10074 flags |= intf_flags & ~0xFFFF;
10077 * Explicitly fail attempts to change flags that are always invalid on
10078 * an IPMP meta-interface.
10080 if (IS_IPMP(ill) && ((flags ^ intf_flags) & IFF_IPMP_INVALID))
10081 return (EINVAL);
10083 ip_sioctl_flags_onoff(ipif, flags, &turn_on, &turn_off);
10084 if ((turn_on|turn_off) == 0)
10085 return (0); /* No change */
10088 * All test addresses must be IFF_DEPRECATED (to ensure source address
10089 * selection avoids them) -- so force IFF_DEPRECATED on, and do not
10090 * allow it to be turned off.
10092 if ((turn_off & (IFF_DEPRECATED|IFF_NOFAILOVER)) == IFF_DEPRECATED &&
10093 (turn_on|intf_flags) & IFF_NOFAILOVER)
10094 return (EINVAL);
10096 if ((connp = Q_TO_CONN(q)) == NULL)
10097 return (EINVAL);
10100 * Only vrrp control socket is allowed to change IFF_UP and
10101 * IFF_NOACCEPT flags when IFF_VRRP is set.
10103 if ((intf_flags & IFF_VRRP) && ((turn_off | turn_on) & IFF_UP)) {
10104 if (!connp->conn_isvrrp)
10105 return (EINVAL);
10109 * The IFF_NOACCEPT flag can only be set on an IFF_VRRP IP address by
10110 * VRRP control socket.
10112 if ((turn_off | turn_on) & IFF_NOACCEPT) {
10113 if (!connp->conn_isvrrp || !(intf_flags & IFF_VRRP))
10114 return (EINVAL);
10117 if (turn_on & IFF_NOFAILOVER) {
10118 turn_on |= IFF_DEPRECATED;
10119 flags |= IFF_DEPRECATED;
10123 * On underlying interfaces, only allow applications to manage test
10124 * addresses -- otherwise, they may get confused when the address
10125 * moves as part of being brought up. Likewise, prevent an
10126 * application-managed test address from being converted to a data
10127 * address. To prevent migration of administratively up addresses in
10128 * the kernel, we don't allow them to be converted either.
10130 if (IS_UNDER_IPMP(ill)) {
10131 const uint64_t appflags = IFF_DHCPRUNNING | IFF_ADDRCONF;
10133 if ((turn_on & appflags) && !(flags & IFF_NOFAILOVER))
10134 return (EINVAL);
10136 if ((turn_off & IFF_NOFAILOVER) &&
10137 (flags & (appflags | IFF_UP | IFF_DUPLICATE)))
10138 return (EINVAL);
10142 * Only allow IFF_TEMPORARY flag to be set on
10143 * IPv6 interfaces.
10145 if ((turn_on & IFF_TEMPORARY) && !(ipif->ipif_isv6))
10146 return (EINVAL);
10149 * Cannot turn off IFF_NOXMIT on VNI interfaces.
10151 if ((turn_off & IFF_NOXMIT) && IS_VNI(ipif->ipif_ill))
10152 return (EINVAL);
10155 * Don't allow the IFF_ROUTER flag to be turned on for loopback
10156 * interfaces. It makes no sense in that context.
10158 if ((turn_on & IFF_ROUTER) && (phyi->phyint_flags & PHYI_LOOPBACK))
10159 return (EINVAL);
10162 * For IPv6 ipif_id 0, don't allow the interface to be up without
10163 * a link local address if IFF_NOLOCAL or IFF_ANYCAST are not set.
10164 * If the link local address isn't set, and can be set, it will get
10165 * set later on in this function.
10167 if (ipif->ipif_id == 0 && ipif->ipif_isv6 &&
10168 (flags & IFF_UP) && !(flags & (IFF_NOLOCAL|IFF_ANYCAST)) &&
10169 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) {
10170 if (ipif_cant_setlinklocal(ipif))
10171 return (EINVAL);
10172 set_linklocal = B_TRUE;
10176 * If we modify physical interface flags, we'll potentially need to
10177 * send up two routing socket messages for the changes (one for the
10178 * IPv4 ill, and another for the IPv6 ill). Record that fact here.
10180 if ((turn_on|turn_off) & IFF_PHYINT_FLAGS)
10181 phyint_flags_modified = B_TRUE;
10184 * All functioning PHYI_STANDBY interfaces start life PHYI_INACTIVE
10185 * (otherwise, we'd immediately use them, defeating standby). Also,
10186 * since PHYI_INACTIVE has a separate meaning when PHYI_STANDBY is not
10187 * set, don't allow PHYI_STANDBY to be set if PHYI_INACTIVE is already
10188 * set, and clear PHYI_INACTIVE if PHYI_STANDBY is being cleared. We
10189 * also don't allow PHYI_STANDBY if VNI is enabled since its semantics
10190 * will not be honored.
10192 if (turn_on & PHYI_STANDBY) {
10194 * No need to grab ill_g_usesrc_lock here; see the
10195 * synchronization notes in ip.c.
10197 if (ill->ill_usesrc_grp_next != NULL ||
10198 intf_flags & PHYI_INACTIVE)
10199 return (EINVAL);
10200 if (!(flags & PHYI_FAILED)) {
10201 flags |= PHYI_INACTIVE;
10202 turn_on |= PHYI_INACTIVE;
10206 if (turn_off & PHYI_STANDBY) {
10207 flags &= ~PHYI_INACTIVE;
10208 turn_off |= PHYI_INACTIVE;
10212 * PHYI_FAILED and PHYI_INACTIVE are mutually exclusive; fail if both
10213 * would end up on.
10215 if ((flags & (PHYI_FAILED | PHYI_INACTIVE)) ==
10216 (PHYI_FAILED | PHYI_INACTIVE))
10217 return (EINVAL);
10220 * If ILLF_ROUTER changes, we need to change the ip forwarding
10221 * status of the interface.
10223 if ((turn_on | turn_off) & ILLF_ROUTER) {
10224 err = ill_forward_set(ill, ((turn_on & ILLF_ROUTER) != 0));
10225 if (err != 0)
10226 return (err);
10230 * If the interface is not UP and we are not going to
10231 * bring it UP, record the flags and return. When the
10232 * interface comes UP later, the right actions will be
10233 * taken.
10235 if (!(ipif->ipif_flags & IPIF_UP) &&
10236 !(turn_on & IPIF_UP)) {
10237 /* Record new flags in their respective places. */
10238 mutex_enter(&ill->ill_lock);
10239 mutex_enter(&ill->ill_phyint->phyint_lock);
10240 ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS);
10241 ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS);
10242 ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS);
10243 ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS);
10244 phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS);
10245 phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS);
10246 mutex_exit(&ill->ill_lock);
10247 mutex_exit(&ill->ill_phyint->phyint_lock);
10250 * PHYI_FAILED, PHYI_INACTIVE, and PHYI_OFFLINE are all the
10251 * same to the kernel: if any of them has been set by
10252 * userland, the interface cannot be used for data traffic.
10254 if ((turn_on|turn_off) &
10255 (PHYI_FAILED | PHYI_INACTIVE | PHYI_OFFLINE)) {
10256 ASSERT(!IS_IPMP(ill));
10258 * It's possible the ill is part of an "anonymous"
10259 * IPMP group rather than a real group. In that case,
10260 * there are no other interfaces in the group and thus
10261 * no need to call ipmp_phyint_refresh_active().
10263 if (IS_UNDER_IPMP(ill))
10264 ipmp_phyint_refresh_active(phyi);
10267 if (phyint_flags_modified) {
10268 if (phyi->phyint_illv4 != NULL) {
10269 ip_rts_ifmsg(phyi->phyint_illv4->
10270 ill_ipif, RTSQ_DEFAULT);
10272 if (phyi->phyint_illv6 != NULL) {
10273 ip_rts_ifmsg(phyi->phyint_illv6->
10274 ill_ipif, RTSQ_DEFAULT);
10277 /* The default multicast interface might have changed */
10278 ire_increment_multicast_generation(ill->ill_ipst,
10279 ill->ill_isv6);
10281 return (0);
10282 } else if (set_linklocal) {
10283 mutex_enter(&ill->ill_lock);
10284 if (set_linklocal)
10285 ipif->ipif_state_flags |= IPIF_SET_LINKLOCAL;
10286 mutex_exit(&ill->ill_lock);
10290 * Disallow bringing up IPv6 interfaces that have the unspecified
10291 * address, or point-to-point interfaces with an unspecified destination. We do
10292 * allow the address to be unspecified for IPIF_NOLOCAL interfaces that
10293 * have a subnet assigned, which is how in.ndpd currently manages its
10294 * onlink prefix list when no addresses are configured with those
10295 * prefixes.
10297 if (ipif->ipif_isv6 &&
10298 ((IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) &&
10299 (!(ipif->ipif_flags & IPIF_NOLOCAL) && !(turn_on & IPIF_NOLOCAL) ||
10300 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) ||
10301 ((ipif->ipif_flags & IPIF_POINTOPOINT) &&
10302 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6pp_dst_addr)))) {
10303 return (EINVAL);
10307 * Prevent IPv4 point-to-point interfaces with a 0.0.0.0 destination
10308 * from being brought up.
10310 if (!ipif->ipif_isv6 &&
10311 ((ipif->ipif_flags & IPIF_POINTOPOINT) &&
10312 ipif->ipif_pp_dst_addr == INADDR_ANY)) {
10313 return (EINVAL);
10317 * If we are going to change one or more of the flags that are
10318 * IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT, IPIF_NOLOCAL, ILLF_NOARP,
10319 * ILLF_NONUD, IPIF_PRIVATE, IPIF_ANYCAST, IPIF_PREFERRED, and
10320 * IPIF_NOFAILOVER, we will take special action. This is
10321 * done by bringing the ipif down, changing the flags, and bringing
10322 * it back up again. For IPIF_NOFAILOVER, the act of bringing it
10323 * back up will trigger the address to be moved.
10325 * If we are going to change IFF_NOACCEPT, we need to bring
10326 * all the ipifs down then bring them up again. The act of
10327 * bringing all the ipifs back up will trigger the local
10328 * IREs being recreated with "no_accept" set or cleared.
10330 * Note that ILLF_NOACCEPT is always set separately from the
10331 * other flags.
10333 if ((turn_on|turn_off) &
10334 (IPIF_UP|IPIF_DEPRECATED|IPIF_NOXMIT|IPIF_NOLOCAL|ILLF_NOARP|
10335 ILLF_NONUD|IPIF_PRIVATE|IPIF_ANYCAST|IPIF_PREFERRED|
10336 IPIF_NOFAILOVER)) {
10338 * ipif_down() will ire_delete bcast ire's for the subnet,
10339 * while the ire_identical_ref tracks the case of IRE_BROADCAST
10340 * entries shared between multiple ipifs on the same subnet.
10342 if (((ipif->ipif_flags | turn_on) & IPIF_UP) &&
10343 !(turn_off & IPIF_UP)) {
10344 if (ipif->ipif_flags & IPIF_UP)
10345 ill->ill_logical_down = 1;
10346 turn_on &= ~IPIF_UP;
10348 err = ipif_down(ipif, q, mp);
10349 ip1dbg(("ipif_down returns %d err ", err));
10350 if (err == EINPROGRESS)
10351 return (err);
10352 (void) ipif_down_tail(ipif);
10353 } else if ((turn_on|turn_off) & ILLF_NOACCEPT) {
10355 * If we can quiesce the ill, then continue. If not, then
10356 * ip_sioctl_flags_tail() will be called from
10357 * ipif_ill_refrele_tail().
10359 ill_down_ipifs(ill, B_TRUE);
10361 mutex_enter(&connp->conn_lock);
10362 mutex_enter(&ill->ill_lock);
10363 if (!ill_is_quiescent(ill)) {
10364 boolean_t success;
10366 success = ipsq_pending_mp_add(connp, ill->ill_ipif,
10367 q, mp, ILL_DOWN);
10368 mutex_exit(&ill->ill_lock);
10369 mutex_exit(&connp->conn_lock);
10370 return (success ? EINPROGRESS : EINTR);
10372 mutex_exit(&ill->ill_lock);
10373 mutex_exit(&connp->conn_lock);
10375 return (ip_sioctl_flags_tail(ipif, flags, q, mp));
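/*
 * The NOTE above refers to the usual userland read-modify-write pattern;
 * a minimal sketch, assuming a hypothetical interface name "net0" and an
 * already-open socket s:
 *
 *	struct lifreq lifr;
 *
 *	(void) strlcpy(lifr.lifr_name, "net0", sizeof (lifr.lifr_name));
 *	(void) ioctl(s, SIOCGLIFFLAGS, &lifr);
 *	lifr.lifr_flags |= IFF_UP;
 *	(void) ioctl(s, SIOCSLIFFLAGS, &lifr);
 */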
10378 static int
10379 ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp)
10381 ill_t *ill;
10382 phyint_t *phyi;
10383 uint64_t turn_on, turn_off;
10384 boolean_t phyint_flags_modified = B_FALSE;
10385 int err = 0;
10386 boolean_t set_linklocal = B_FALSE;
10388 ip1dbg(("ip_sioctl_flags_tail(%s:%u)\n",
10389 ipif->ipif_ill->ill_name, ipif->ipif_id));
10391 ASSERT(IAM_WRITER_IPIF(ipif));
10393 ill = ipif->ipif_ill;
10394 phyi = ill->ill_phyint;
10396 ip_sioctl_flags_onoff(ipif, flags, &turn_on, &turn_off);
10399 * IFF_UP is handled separately.
10401 turn_on &= ~IFF_UP;
10402 turn_off &= ~IFF_UP;
10404 if ((turn_on|turn_off) & IFF_PHYINT_FLAGS)
10405 phyint_flags_modified = B_TRUE;
10408 * Now we change the flags. Track current value of
10409 * other flags in their respective places.
10411 mutex_enter(&ill->ill_lock);
10412 mutex_enter(&phyi->phyint_lock);
10413 ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS);
10414 ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS);
10415 ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS);
10416 ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS);
10417 phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS);
10418 phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS);
10419 if (ipif->ipif_state_flags & IPIF_SET_LINKLOCAL) {
10420 set_linklocal = B_TRUE;
10421 ipif->ipif_state_flags &= ~IPIF_SET_LINKLOCAL;
10424 mutex_exit(&ill->ill_lock);
10425 mutex_exit(&phyi->phyint_lock);
10427 if (set_linklocal)
10428 (void) ipif_setlinklocal(ipif);
10431 * PHYI_FAILED, PHYI_INACTIVE, and PHYI_OFFLINE are all the same to
10432 * the kernel: if any of them has been set by userland, the interface
10433 * cannot be used for data traffic.
10435 if ((turn_on|turn_off) & (PHYI_FAILED | PHYI_INACTIVE | PHYI_OFFLINE)) {
10436 ASSERT(!IS_IPMP(ill));
10438 * It's possible the ill is part of an "anonymous" IPMP group
10439 * rather than a real group. In that case, there are no other
10440 * interfaces in the group and thus no need for us to call
10441 * ipmp_phyint_refresh_active().
10443 if (IS_UNDER_IPMP(ill))
10444 ipmp_phyint_refresh_active(phyi);
10447 if ((turn_on|turn_off) & ILLF_NOACCEPT) {
10449 * If the ILLF_NOACCEPT flag is changed, bring up all the
10450 * ipifs that were brought down.
10452 * The routing socket messages are sent as a result
10453 * of ill_up_ipifs(); SCTP's IPIF list is updated
10454 * as well.
10456 err = ill_up_ipifs(ill, q, mp);
10457 } else if ((flags & IFF_UP) && !(ipif->ipif_flags & IPIF_UP)) {
10459 * XXX ipif_up really does not know whether a phyint flag
10460 * was modified or not. So it sends up information in
10461 * only one routing socket message. As we don't bring up
10462 * the interface and also set PHYI_ flags simultaneously,
10463 * it should be okay.
10465 err = ipif_up(ipif, q, mp);
10466 } else {
10468 * Make sure routing socket sees all changes to the flags.
10469 * ipif_up_done* handles this when we use ipif_up.
10471 if (phyint_flags_modified) {
10472 if (phyi->phyint_illv4 != NULL) {
10473 ip_rts_ifmsg(phyi->phyint_illv4->
10474 ill_ipif, RTSQ_DEFAULT);
10476 if (phyi->phyint_illv6 != NULL) {
10477 ip_rts_ifmsg(phyi->phyint_illv6->
10478 ill_ipif, RTSQ_DEFAULT);
10480 } else {
10481 ip_rts_ifmsg(ipif, RTSQ_DEFAULT);
10484 * Update the flags in SCTP's IPIF list; ipif_up() will do
10485 * this in the need_up case.
10487 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE);
10490 /* The default multicast interface might have changed */
10491 ire_increment_multicast_generation(ill->ill_ipst, ill->ill_isv6);
10492 return (err);
10496 * Restart the flags operation now that the refcounts have dropped to zero.
10498 /* ARGSUSED */
10500 ip_sioctl_flags_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10501 ip_ioctl_cmd_t *ipip, void *if_req)
10503 uint64_t flags;
10504 struct ifreq *ifr = if_req;
10505 struct lifreq *lifr = if_req;
10506 uint64_t turn_on, turn_off;
10508 ip1dbg(("ip_sioctl_flags_restart(%s:%u %p)\n",
10509 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10511 if (ipip->ipi_cmd_type == IF_CMD) {
10512 /* cast to uint16_t prevents unwanted sign extension */
10513 flags = (uint16_t)ifr->ifr_flags;
10514 } else {
10515 flags = lifr->lifr_flags;
10519 * If this function call is a result of the ILLF_NOACCEPT flag
10520 * change, do not call ipif_down_tail(). See ip_sioctl_flags().
10522 ip_sioctl_flags_onoff(ipif, flags, &turn_on, &turn_off);
10523 if (!((turn_on|turn_off) & ILLF_NOACCEPT))
10524 (void) ipif_down_tail(ipif);
10526 return (ip_sioctl_flags_tail(ipif, flags, q, mp));
10530 * Can operate on either a module or a driver queue.
10532 /* ARGSUSED */
10534 ip_sioctl_get_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10535 ip_ioctl_cmd_t *ipip, void *if_req)
10538 * Have the flags been set correctly until now?
10540 ill_t *ill = ipif->ipif_ill;
10541 phyint_t *phyi = ill->ill_phyint;
10543 ip1dbg(("ip_sioctl_get_flags(%s:%u %p)\n",
10544 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10545 ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0);
10546 ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0);
10547 ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0);
10550 * Need a lock since some flags can be set even when there are
10551 * references to the ipif.
10553 mutex_enter(&ill->ill_lock);
10554 if (ipip->ipi_cmd_type == IF_CMD) {
10555 struct ifreq *ifr = (struct ifreq *)if_req;
10557 /* Get interface flags (low 16 only). */
10558 ifr->ifr_flags = ((ipif->ipif_flags |
10559 ill->ill_flags | phyi->phyint_flags) & 0xffff);
10560 } else {
10561 struct lifreq *lifr = (struct lifreq *)if_req;
10563 /* Get interface flags. */
10564 lifr->lifr_flags = ipif->ipif_flags |
10565 ill->ill_flags | phyi->phyint_flags;
10567 mutex_exit(&ill->ill_lock);
10568 return (0);
10572 * We allow the MTU to be set on an ILL, but not have it be different
10573 * for different IPIFs since we don't actually send packets on IPIFs.
10575 /* ARGSUSED */
10577 ip_sioctl_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10578 ip_ioctl_cmd_t *ipip, void *if_req)
10580 int mtu;
10581 int ip_min_mtu;
10582 struct ifreq *ifr;
10583 struct lifreq *lifr;
10584 ill_t *ill;
10586 ip1dbg(("ip_sioctl_mtu(%s:%u %p)\n", ipif->ipif_ill->ill_name,
10587 ipif->ipif_id, (void *)ipif));
10588 if (ipip->ipi_cmd_type == IF_CMD) {
10589 ifr = (struct ifreq *)if_req;
10590 mtu = ifr->ifr_metric;
10591 } else {
10592 lifr = (struct lifreq *)if_req;
10593 mtu = lifr->lifr_mtu;
10595 /* Only allow for logical unit zero i.e. not on "bge0:17" */
10596 if (ipif->ipif_id != 0)
10597 return (EINVAL);
10599 ill = ipif->ipif_ill;
10600 if (ipif->ipif_isv6)
10601 ip_min_mtu = IPV6_MIN_MTU;
10602 else
10603 ip_min_mtu = IP_MIN_MTU;
10605 mutex_enter(&ill->ill_lock);
10606 if (mtu > ill->ill_max_frag || mtu < ip_min_mtu) {
10607 mutex_exit(&ill->ill_lock);
10608 return (EINVAL);
10610 /* Avoid increasing ill_mc_mtu */
10611 if (ill->ill_mc_mtu > mtu)
10612 ill->ill_mc_mtu = mtu;
10615 * The dce and fragmentation code can handle changes to ill_mtu
10616 * concurrent with sending/fragmenting packets.
10618 ill->ill_mtu = mtu;
10619 ill->ill_flags |= ILLF_FIXEDMTU;
10620 mutex_exit(&ill->ill_lock);
10623 * Make sure all dce_generation checks find out
10624 * that ill_mtu/ill_mc_mtu has changed.
10626 dce_increment_all_generations(ill->ill_isv6, ill->ill_ipst);
10629 * Refresh IPMP meta-interface MTU if necessary.
10631 if (IS_UNDER_IPMP(ill))
10632 ipmp_illgrp_refresh_mtu(ill->ill_grp);
10634 /* Update the MTU in SCTP's list */
10635 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE);
10636 return (0);
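/*
 * A minimal userland sketch of the corresponding set, assuming a
 * hypothetical interface name "net0" (which must name logical unit zero,
 * as enforced above):
 *
 *	struct lifreq lifr;
 *
 *	(void) strlcpy(lifr.lifr_name, "net0", sizeof (lifr.lifr_name));
 *	lifr.lifr_mtu = 1400;
 *	if (ioctl(s, SIOCSLIFMTU, &lifr) == -1)
 *		... fails with EINVAL if 1400 lies outside
 *		[ip_min_mtu, ill_max_frag] ...
 */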
10639 /* Get interface MTU. */
10640 /* ARGSUSED */
10642 ip_sioctl_get_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10643 ip_ioctl_cmd_t *ipip, void *if_req)
10645 struct ifreq *ifr;
10646 struct lifreq *lifr;
10648 ip1dbg(("ip_sioctl_get_mtu(%s:%u %p)\n",
10649 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10652 * We allow a get on any logical interface even though the set
10653 * can only be done on logical unit 0.
10655 if (ipip->ipi_cmd_type == IF_CMD) {
10656 ifr = (struct ifreq *)if_req;
10657 ifr->ifr_metric = ipif->ipif_ill->ill_mtu;
10658 } else {
10659 lifr = (struct lifreq *)if_req;
10660 lifr->lifr_mtu = ipif->ipif_ill->ill_mtu;
10662 return (0);
10665 /* Set interface broadcast address. */
10666 /* ARGSUSED2 */
10668 ip_sioctl_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10669 ip_ioctl_cmd_t *ipip, void *if_req)
10671 ipaddr_t addr;
10672 ire_t *ire;
10673 ill_t *ill = ipif->ipif_ill;
10674 ip_stack_t *ipst = ill->ill_ipst;
10676 ip1dbg(("ip_sioctl_brdaddr(%s:%u)\n", ill->ill_name,
10677 ipif->ipif_id));
10679 ASSERT(IAM_WRITER_IPIF(ipif));
10680 if (!(ipif->ipif_flags & IPIF_BROADCAST))
10681 return (EADDRNOTAVAIL);
10683 ASSERT(!(ipif->ipif_isv6)); /* No IPv6 broadcast */
10685 if (sin->sin_family != AF_INET)
10686 return (EAFNOSUPPORT);
10688 addr = sin->sin_addr.s_addr;
10690 if (ipif->ipif_flags & IPIF_UP) {
10692 * If we are already up, make sure the new
10693 * broadcast address makes sense. If it does,
10694 * there should be an IRE for it already.
10696 ire = ire_ftable_lookup_v4(addr, 0, 0, IRE_BROADCAST,
10697 ill, ipif->ipif_zoneid,
10698 (MATCH_IRE_ILL | MATCH_IRE_TYPE), 0, ipst, NULL);
10699 if (ire == NULL) {
10700 return (EINVAL);
10701 } else {
10702 ire_refrele(ire);
10706 * Changing the broadcast addr for this ipif. Since the IRE_BROADCAST
10707 * needs to already exist, we never need to change the set of
10708 * IRE_BROADCASTs when we are UP.
10710 if (addr != ipif->ipif_brd_addr)
10711 IN6_IPADDR_TO_V4MAPPED(addr, &ipif->ipif_v6brd_addr);
10713 return (0);
10716 /* Get interface broadcast address. */
10717 /* ARGSUSED */
10719 ip_sioctl_get_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10720 ip_ioctl_cmd_t *ipip, void *if_req)
10722 ip1dbg(("ip_sioctl_get_brdaddr(%s:%u %p)\n",
10723 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10724 if (!(ipif->ipif_flags & IPIF_BROADCAST))
10725 return (EADDRNOTAVAIL);
10727 /* IPIF_BROADCAST not possible with IPv6 */
10728 ASSERT(!ipif->ipif_isv6);
10729 *sin = sin_null;
10730 sin->sin_family = AF_INET;
10731 sin->sin_addr.s_addr = ipif->ipif_brd_addr;
10732 return (0);
10736 * This routine is called to handle the SIOCS*IFNETMASK IOCTL.
10738 /* ARGSUSED */
10740 ip_sioctl_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10741 ip_ioctl_cmd_t *ipip, void *if_req)
10743 int err = 0;
10744 in6_addr_t v6mask;
10746 ip1dbg(("ip_sioctl_netmask(%s:%u %p)\n",
10747 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10749 ASSERT(IAM_WRITER_IPIF(ipif));
10751 if (ipif->ipif_isv6) {
10752 sin6_t *sin6;
10754 if (sin->sin_family != AF_INET6)
10755 return (EAFNOSUPPORT);
10757 sin6 = (sin6_t *)sin;
10758 v6mask = sin6->sin6_addr;
10759 } else {
10760 ipaddr_t mask;
10762 if (sin->sin_family != AF_INET)
10763 return (EAFNOSUPPORT);
10765 mask = sin->sin_addr.s_addr;
10766 if (!ip_contiguous_mask(ntohl(mask)))
10767 return (ENOTSUP);
10768 V4MASK_TO_V6(mask, v6mask);
10772 * No big deal if the interface isn't already up, or the mask
10773 * isn't really changing, or this is pt-pt.
10775 if (!(ipif->ipif_flags & IPIF_UP) ||
10776 IN6_ARE_ADDR_EQUAL(&v6mask, &ipif->ipif_v6net_mask) ||
10777 (ipif->ipif_flags & IPIF_POINTOPOINT)) {
10778 ipif->ipif_v6net_mask = v6mask;
10779 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) {
10780 V6_MASK_COPY(ipif->ipif_v6lcl_addr,
10781 ipif->ipif_v6net_mask,
10782 ipif->ipif_v6subnet);
10784 return (0);
10787 * Make sure we have valid net and subnet broadcast ire's
10788 * for the old netmask, if needed by other logical interfaces.
10790 err = ipif_logical_down(ipif, q, mp);
10791 if (err == EINPROGRESS)
10792 return (err);
10793 (void) ipif_down_tail(ipif);
10794 err = ip_sioctl_netmask_tail(ipif, sin, q, mp);
10795 return (err);
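/*
 * A contiguous (CIDR) mask has all of its one-bits leading: e.g.,
 * 255.255.255.0 (0xffffff00) passes the ip_contiguous_mask() check above,
 * while 255.0.255.0 (0xff00ff00) fails it with ENOTSUP. A minimal sketch
 * of such a check, assuming a host-byte-order mask:
 *
 *	boolean_t
 *	mask_is_contiguous(uint32_t mask)
 *	{
 *		return ((mask | (mask - 1)) == 0xffffffff);
 *	}
 */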
10798 static int
10799 ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp)
10801 in6_addr_t v6mask;
10802 int err = 0;
10804 ip1dbg(("ip_sioctl_netmask_tail(%s:%u %p)\n",
10805 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10807 if (ipif->ipif_isv6) {
10808 sin6_t *sin6;
10810 sin6 = (sin6_t *)sin;
10811 v6mask = sin6->sin6_addr;
10812 } else {
10813 ipaddr_t mask;
10815 mask = sin->sin_addr.s_addr;
10816 V4MASK_TO_V6(mask, v6mask);
10819 ipif->ipif_v6net_mask = v6mask;
10820 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) {
10821 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
10822 ipif->ipif_v6subnet);
10824 err = ipif_up(ipif, q, mp);
10826 if (err == 0 || err == EINPROGRESS) {
10828 * The interface must be DL_BOUND if this packet has to
10829 * go out on the wire. Since we only go through a logical
10830 * down and remain bound to the driver during an internal
10831 * down/up, that requirement is satisfied.
10833 if (!ipif->ipif_isv6 && ipif->ipif_ill->ill_wq != NULL) {
10834 /* Potentially broadcast an address mask reply. */
10835 ipif_mask_reply(ipif);
10838 return (err);
10841 /* ARGSUSED */
10843 ip_sioctl_netmask_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10844 ip_ioctl_cmd_t *ipip, void *if_req)
10846 ip1dbg(("ip_sioctl_netmask_restart(%s:%u %p)\n",
10847 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10848 (void) ipif_down_tail(ipif);
10849 return (ip_sioctl_netmask_tail(ipif, sin, q, mp));
10852 /* Get interface net mask. */
10853 /* ARGSUSED */
10855 ip_sioctl_get_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10856 ip_ioctl_cmd_t *ipip, void *if_req)
10858 struct lifreq *lifr = (struct lifreq *)if_req;
10859 struct sockaddr_in6 *sin6 = (sin6_t *)sin;
10861 ip1dbg(("ip_sioctl_get_netmask(%s:%u %p)\n",
10862 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10865 * The netmask can't change since we have a reference to the ipif.
10867 if (ipif->ipif_isv6) {
10868 ASSERT(ipip->ipi_cmd_type == LIF_CMD);
10869 *sin6 = sin6_null;
10870 sin6->sin6_family = AF_INET6;
10871 sin6->sin6_addr = ipif->ipif_v6net_mask;
10872 lifr->lifr_addrlen =
10873 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask);
10874 } else {
10875 *sin = sin_null;
10876 sin->sin_family = AF_INET;
10877 sin->sin_addr.s_addr = ipif->ipif_net_mask;
10878 if (ipip->ipi_cmd_type == LIF_CMD) {
10879 lifr->lifr_addrlen =
10880 ip_mask_to_plen(ipif->ipif_net_mask);
10883 return (0);
10886 /* ARGSUSED */
10888 ip_sioctl_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10889 ip_ioctl_cmd_t *ipip, void *if_req)
10891 ip1dbg(("ip_sioctl_metric(%s:%u %p)\n",
10892 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10895 * Since no applications should ever be setting metrics on underlying
10896 * interfaces, we explicitly fail to smoke 'em out.
10898 if (IS_UNDER_IPMP(ipif->ipif_ill))
10899 return (EINVAL);
10902 * Set interface metric. We don't use this for
10903 * anything, but we keep track of it in case it is
10904 * important to routing applications or such.
10906 if (ipip->ipi_cmd_type == IF_CMD) {
10907 struct ifreq *ifr;
10909 ifr = (struct ifreq *)if_req;
10910 ipif->ipif_ill->ill_metric = ifr->ifr_metric;
10911 } else {
10912 struct lifreq *lifr;
10914 lifr = (struct lifreq *)if_req;
10915 ipif->ipif_ill->ill_metric = lifr->lifr_metric;
10917 return (0);
10920 /* ARGSUSED */
10922 ip_sioctl_get_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10923 ip_ioctl_cmd_t *ipip, void *if_req)
10925 /* Get interface metric. */
10926 ip1dbg(("ip_sioctl_get_metric(%s:%u %p)\n",
10927 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10929 if (ipip->ipi_cmd_type == IF_CMD) {
10930 struct ifreq *ifr;
10932 ifr = (struct ifreq *)if_req;
10933 ifr->ifr_metric = ipif->ipif_ill->ill_metric;
10934 } else {
10935 struct lifreq *lifr;
10937 lifr = (struct lifreq *)if_req;
10938 lifr->lifr_metric = ipif->ipif_ill->ill_metric;
10941 return (0);
10944 /* ARGSUSED */
10946 ip_sioctl_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10947 ip_ioctl_cmd_t *ipip, void *if_req)
10949 int arp_muxid;
10951 ip1dbg(("ip_sioctl_muxid(%s:%u %p)\n",
10952 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10954 * Set the muxid returned from I_PLINK.
10956 if (ipip->ipi_cmd_type == IF_CMD) {
10957 struct ifreq *ifr = (struct ifreq *)if_req;
10959 ipif->ipif_ill->ill_muxid = ifr->ifr_ip_muxid;
10960 arp_muxid = ifr->ifr_arp_muxid;
10961 } else {
10962 struct lifreq *lifr = (struct lifreq *)if_req;
10964 ipif->ipif_ill->ill_muxid = lifr->lifr_ip_muxid;
10965 arp_muxid = lifr->lifr_arp_muxid;
10967 arl_set_muxid(ipif->ipif_ill, arp_muxid);
10968 return (0);
10971 /* ARGSUSED */
10973 ip_sioctl_get_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10974 ip_ioctl_cmd_t *ipip, void *if_req)
10976 int arp_muxid = 0;
10978 ip1dbg(("ip_sioctl_get_muxid(%s:%u %p)\n",
10979 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10981 * Get the muxid saved in ill for I_PUNLINK.
10983 arp_muxid = arl_get_muxid(ipif->ipif_ill);
10984 if (ipip->ipi_cmd_type == IF_CMD) {
10985 struct ifreq *ifr = (struct ifreq *)if_req;
10987 ifr->ifr_ip_muxid = ipif->ipif_ill->ill_muxid;
10988 ifr->ifr_arp_muxid = arp_muxid;
10989 } else {
10990 struct lifreq *lifr = (struct lifreq *)if_req;
10992 lifr->lifr_ip_muxid = ipif->ipif_ill->ill_muxid;
10993 lifr->lifr_arp_muxid = arp_muxid;
10995 return (0);
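/*
 * These muxids come from the STREAMS persistent-link plumbing done by
 * ifconfig; roughly (a simplified sketch, with assumed device paths):
 *
 *	int ip_fd = open("/dev/udp", O_RDWR);
 *	int dev_fd = open("/dev/net/net0", O_RDWR);
 *	int muxid = ioctl(ip_fd, I_PLINK, dev_fd);
 *
 * followed by SIOCSLIFMUXID to record the muxid here, so that a later
 * unplumb can fetch it via SIOCGLIFMUXID and issue I_PUNLINK.
 */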
10999 * Set the subnet prefix. Does not modify the broadcast address.
11001 /* ARGSUSED */
11003 ip_sioctl_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11004 ip_ioctl_cmd_t *ipip, void *if_req)
11006 int err = 0;
11007 in6_addr_t v6addr;
11008 in6_addr_t v6mask;
11009 boolean_t need_up = B_FALSE;
11010 int addrlen;
11012 ip1dbg(("ip_sioctl_subnet(%s:%u %p)\n",
11013 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11015 ASSERT(IAM_WRITER_IPIF(ipif));
11016 addrlen = ((struct lifreq *)if_req)->lifr_addrlen;
11018 if (ipif->ipif_isv6) {
11019 sin6_t *sin6;
11021 if (sin->sin_family != AF_INET6)
11022 return (EAFNOSUPPORT);
11024 sin6 = (sin6_t *)sin;
11025 v6addr = sin6->sin6_addr;
11026 if (!ip_remote_addr_ok_v6(&v6addr, &ipv6_all_ones))
11027 return (EADDRNOTAVAIL);
11028 } else {
11029 ipaddr_t addr;
11031 if (sin->sin_family != AF_INET)
11032 return (EAFNOSUPPORT);
11034 addr = sin->sin_addr.s_addr;
11035 if (!ip_addr_ok_v4(addr, 0xFFFFFFFF))
11036 return (EADDRNOTAVAIL);
11037 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
11038 /* Add the 96 bits of the IPv4-mapped prefix */
11039 addrlen += IPV6_ABITS - IP_ABITS;
11042 if (ip_plen_to_mask_v6(addrlen, &v6mask) == NULL)
11043 return (EINVAL);
11045 /* Check if any bits in the address are set past the mask */
11046 if (!V6_MASK_EQ(v6addr, v6mask, v6addr))
11047 return (EINVAL);
11049 if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6subnet, &v6addr) &&
11050 IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6net_mask, &v6mask))
11051 return (0); /* No change */
11053 if (ipif->ipif_flags & IPIF_UP) {
11055 * If the interface is already marked up,
11056 * we call ipif_down which will take care
11057 * of ditching any IREs that have been set
11058 * up based on the old interface address.
11060 err = ipif_logical_down(ipif, q, mp);
11061 if (err == EINPROGRESS)
11062 return (err);
11063 (void) ipif_down_tail(ipif);
11064 need_up = B_TRUE;
11067 err = ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, need_up);
11068 return (err);
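/*
 * In the IPv4 case above, the subnet is stored in IPv4-mapped form, so
 * the prefix length is widened by the 96 bits of the ::ffff:0:0/96
 * mapping prefix: e.g., 192.168.1.0/24 becomes ::ffff:192.168.1.0/120
 * (24 + 96 = 120) before the mask and subnet checks.
 */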
11071 static int
11072 ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t v6addr, in6_addr_t v6mask,
11073 queue_t *q, mblk_t *mp, boolean_t need_up)
11075 ill_t *ill = ipif->ipif_ill;
11076 int err = 0;
11078 ip1dbg(("ip_sioctl_subnet_tail(%s:%u %p)\n",
11079 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11081 /* Set the new address. */
11082 mutex_enter(&ill->ill_lock);
11083 ipif->ipif_v6net_mask = v6mask;
11084 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) {
11085 V6_MASK_COPY(v6addr, ipif->ipif_v6net_mask,
11086 ipif->ipif_v6subnet);
11088 mutex_exit(&ill->ill_lock);
11090 if (need_up) {
11092 * Now bring the interface back up. If this
11093 * is the only IPIF for the ILL, ipif_up
11094 * will have to re-bind to the device, so
11095 * we may get back EINPROGRESS, in which
11096 * case, this IOCTL will get completed in
11097 * ip_rput_dlpi when we see the DL_BIND_ACK.
11099 err = ipif_up(ipif, q, mp);
11100 if (err == EINPROGRESS)
11101 return (err);
11103 return (err);
11106 /* ARGSUSED */
11108 ip_sioctl_subnet_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11109 ip_ioctl_cmd_t *ipip, void *if_req)
11111 int addrlen;
11112 in6_addr_t v6addr;
11113 in6_addr_t v6mask;
11114 struct lifreq *lifr = (struct lifreq *)if_req;
11116 ip1dbg(("ip_sioctl_subnet_restart(%s:%u %p)\n",
11117 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11118 (void) ipif_down_tail(ipif);
11120 addrlen = lifr->lifr_addrlen;
11121 if (ipif->ipif_isv6) {
11122 sin6_t *sin6;
11124 sin6 = (sin6_t *)sin;
11125 v6addr = sin6->sin6_addr;
11126 } else {
11127 ipaddr_t addr;
11129 addr = sin->sin_addr.s_addr;
11130 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
11131 addrlen += IPV6_ABITS - IP_ABITS;
11133 (void) ip_plen_to_mask_v6(addrlen, &v6mask);
11135 return (ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, B_TRUE));
11138 /* ARGSUSED */
11140 ip_sioctl_get_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11141 ip_ioctl_cmd_t *ipip, void *if_req)
11143 struct lifreq *lifr = (struct lifreq *)if_req;
11144 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sin;
11146 ip1dbg(("ip_sioctl_get_subnet(%s:%u %p)\n",
11147 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11148 ASSERT(ipip->ipi_cmd_type == LIF_CMD);
11150 if (ipif->ipif_isv6) {
11151 *sin6 = sin6_null;
11152 sin6->sin6_family = AF_INET6;
11153 sin6->sin6_addr = ipif->ipif_v6subnet;
11154 lifr->lifr_addrlen =
11155 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask);
11156 } else {
11157 *sin = sin_null;
11158 sin->sin_family = AF_INET;
11159 sin->sin_addr.s_addr = ipif->ipif_subnet;
11160 lifr->lifr_addrlen = ip_mask_to_plen(ipif->ipif_net_mask);
11162 return (0);
11166 * Set the IPv6 address token.
11168 /* ARGSUSED */
11170 ip_sioctl_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11171 ip_ioctl_cmd_t *ipi, void *if_req)
11173 ill_t *ill = ipif->ipif_ill;
11174 int err;
11175 in6_addr_t v6addr;
11176 in6_addr_t v6mask;
11177 boolean_t need_up = B_FALSE;
11178 int i;
11179 sin6_t *sin6 = (sin6_t *)sin;
11180 struct lifreq *lifr = (struct lifreq *)if_req;
11181 int addrlen;
11183 ip1dbg(("ip_sioctl_token(%s:%u %p)\n",
11184 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11185 ASSERT(IAM_WRITER_IPIF(ipif));
11187 addrlen = lifr->lifr_addrlen;
11188 /* Only allow for logical unit zero i.e. not on "le0:17" */
11189 if (ipif->ipif_id != 0)
11190 return (EINVAL);
11192 if (!ipif->ipif_isv6)
11193 return (EINVAL);
11195 if (addrlen > IPV6_ABITS)
11196 return (EINVAL);
11198 v6addr = sin6->sin6_addr;
11201 * The length of the token is measured from the end of the address.
11202 * To get the proper mask for this, compute the mask of the bits not
11203 * in the token (i.e., the prefix), and then invert it to get the mask.
11205 if (ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask) == NULL)
11206 return (EINVAL);
11207 for (i = 0; i < 4; i++) {
11208 v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff;
11211 if (V6_MASK_EQ(v6addr, v6mask, ill->ill_token) &&
11212 ill->ill_token_length == addrlen)
11213 return (0); /* No change */
11215 if (ipif->ipif_flags & IPIF_UP) {
11216 err = ipif_logical_down(ipif, q, mp);
11217 if (err == EINPROGRESS)
11218 return (err);
11219 (void) ipif_down_tail(ipif);
11220 need_up = B_TRUE;
11222 err = ip_sioctl_token_tail(ipif, sin6, addrlen, q, mp, need_up);
11223 return (err);
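/*
 * Worked example of the mask computation above: for a 64-bit token,
 * ip_plen_to_mask_v6(128 - 64) yields the prefix mask
 * ffff:ffff:ffff:ffff::, and inverting each 32-bit word gives
 * ::ffff:ffff:ffff:ffff, the mask covering the low-order (token) bits
 * that V6_MASK_EQ() and V6_MASK_COPY() then operate on.
 */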
11226 static int
11227 ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen, queue_t *q,
11228 mblk_t *mp, boolean_t need_up)
11230 in6_addr_t v6addr;
11231 in6_addr_t v6mask;
11232 ill_t *ill = ipif->ipif_ill;
11233 int i;
11234 int err = 0;
11236 ip1dbg(("ip_sioctl_token_tail(%s:%u %p)\n",
11237 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11238 v6addr = sin6->sin6_addr;
11240 * The length of the token is measured from the end of the address.
11241 * To get the proper mask for this, compute the mask of the bits not
11242 * in the token (i.e., the prefix), and then invert it to get the mask.
11244 (void) ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask);
11245 for (i = 0; i < 4; i++)
11246 v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff;
11248 mutex_enter(&ill->ill_lock);
11249 V6_MASK_COPY(v6addr, v6mask, ill->ill_token);
11250 ill->ill_token_length = addrlen;
11251 ill->ill_manual_token = 1;
11253 /* Reconfigure the link-local address based on this new token */
11254 ipif_setlinklocal(ill->ill_ipif);
11256 mutex_exit(&ill->ill_lock);
11258 if (need_up) {
11260 * Now bring the interface back up. If this
11261 * is the only IPIF for the ILL, ipif_up
11262 * will have to re-bind to the device, so
11263 * we may get back EINPROGRESS, in which
11264 * case, this IOCTL will get completed in
11265 * ip_rput_dlpi when we see the DL_BIND_ACK.
11267 err = ipif_up(ipif, q, mp);
11268 if (err == EINPROGRESS)
11269 return (err);
11271 return (err);
11274 /* ARGSUSED */
11276 ip_sioctl_get_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11277 ip_ioctl_cmd_t *ipi, void *if_req)
11279 ill_t *ill;
11280 sin6_t *sin6 = (sin6_t *)sin;
11281 struct lifreq *lifr = (struct lifreq *)if_req;
11283 ip1dbg(("ip_sioctl_get_token(%s:%u %p)\n",
11284 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11285 if (ipif->ipif_id != 0)
11286 return (EINVAL);
11288 ill = ipif->ipif_ill;
11289 if (!ill->ill_isv6)
11290 return (ENXIO);
11292 *sin6 = sin6_null;
11293 sin6->sin6_family = AF_INET6;
11294 ASSERT(!IN6_IS_ADDR_V4MAPPED(&ill->ill_token));
11295 sin6->sin6_addr = ill->ill_token;
11296 lifr->lifr_addrlen = ill->ill_token_length;
11297 return (0);
11301 * Set (hardware) link-specific information that might override
11302 * what was acquired through the DL_INFO_ACK.
11304 /* ARGSUSED */
11306 ip_sioctl_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11307 ip_ioctl_cmd_t *ipi, void *if_req)
11309 ill_t *ill = ipif->ipif_ill;
11310 int ip_min_mtu;
11311 struct lifreq *lifr = (struct lifreq *)if_req;
11312 lif_ifinfo_req_t *lir;
11314 ip1dbg(("ip_sioctl_lnkinfo(%s:%u %p)\n",
11315 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11316 lir = &lifr->lifr_ifinfo;
11317 ASSERT(IAM_WRITER_IPIF(ipif));
11319 /* Only allow for logical unit zero i.e. not on "bge0:17" */
11320 if (ipif->ipif_id != 0)
11321 return (EINVAL);
11323 /* Set interface MTU. */
11324 if (ipif->ipif_isv6)
11325 ip_min_mtu = IPV6_MIN_MTU;
11326 else
11327 ip_min_mtu = IP_MIN_MTU;
11330 * Verify values before we set anything. Allow zero to
11331 * mean unspecified.
11333 * XXX We should be able to set the user-defined lir_mtu to some value
11334 * that is greater than ill_current_frag but less than ill_max_frag; the
11335 * ill_max_frag value tells us the max MTU that can be handled by the
11336 * datalink, whereas the ill_current_frag is dynamically computed for
11337 * some link-types like tunnels, based on the tunnel PMTU. However,
11338 * since there is currently no way of distinguishing between
11339 * administratively fixed link mtu values (e.g., those set via
11340 * /sbin/dladm) and dynamically discovered MTUs (e.g., those discovered
11341 * for tunnels) we conservatively choose the ill_current_frag as the
11342 * upper-bound.
11344 if (lir->lir_maxmtu != 0 &&
11345 (lir->lir_maxmtu > ill->ill_current_frag ||
11346 lir->lir_maxmtu < ip_min_mtu))
11347 return (EINVAL);
11348 if (lir->lir_reachtime != 0 &&
11349 lir->lir_reachtime > ND_MAX_REACHTIME)
11350 return (EINVAL);
11351 if (lir->lir_reachretrans != 0 &&
11352 lir->lir_reachretrans > ND_MAX_REACHRETRANSTIME)
11353 return (EINVAL);
11355 mutex_enter(&ill->ill_lock);
11357 * The dce and fragmentation code can handle changes to ill_mtu
11358 * concurrent with sending/fragmenting packets.
11360 if (lir->lir_maxmtu != 0)
11361 ill->ill_user_mtu = lir->lir_maxmtu;
11363 if (lir->lir_reachtime != 0)
11364 ill->ill_reachable_time = lir->lir_reachtime;
11366 if (lir->lir_reachretrans != 0)
11367 ill->ill_reachable_retrans_time = lir->lir_reachretrans;
11369 ill->ill_max_hops = lir->lir_maxhops;
11370 ill->ill_max_buf = ND_MAX_Q;
11371 if (!(ill->ill_flags & ILLF_FIXEDMTU) && ill->ill_user_mtu != 0) {
11373 * ill_mtu is the actual interface MTU, obtained as the min
11374 * of user-configured mtu and the value announced by the
11375 * driver (via DL_NOTE_SDU_SIZE/DL_INFO_ACK). Note that since
11376 * we have already made the choice of requiring
11377 * ill_user_mtu <= ill_current_frag by the time we get here,
11378 * ill_mtu effectively gets assigned the ill_user_mtu value
11379 * here.
11381 ill->ill_mtu = MIN(ill->ill_current_frag, ill->ill_user_mtu);
11382 ill->ill_mc_mtu = MIN(ill->ill_mc_mtu, ill->ill_user_mtu);
11384 mutex_exit(&ill->ill_lock);
11387 * Make sure all dce_generation checks find out
11388 * that ill_mtu/ill_mc_mtu has changed.
11390 if (!(ill->ill_flags & ILLF_FIXEDMTU) && (lir->lir_maxmtu != 0))
11391 dce_increment_all_generations(ill->ill_isv6, ill->ill_ipst);
11394 * Refresh IPMP meta-interface MTU if necessary.
11396 if (IS_UNDER_IPMP(ill))
11397 ipmp_illgrp_refresh_mtu(ill->ill_grp);
11399 return (0);
11402 /* ARGSUSED */
11404 ip_sioctl_get_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11405 ip_ioctl_cmd_t *ipi, void *if_req)
11407 struct lif_ifinfo_req *lir;
11408 ill_t *ill = ipif->ipif_ill;
11410 ip1dbg(("ip_sioctl_get_lnkinfo(%s:%u %p)\n",
11411 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11412 if (ipif->ipif_id != 0)
11413 return (EINVAL);
11415 lir = &((struct lifreq *)if_req)->lifr_ifinfo;
11416 lir->lir_maxhops = ill->ill_max_hops;
11417 lir->lir_reachtime = ill->ill_reachable_time;
11418 lir->lir_reachretrans = ill->ill_reachable_retrans_time;
11419 lir->lir_maxmtu = ill->ill_mtu;
11421 return (0);
11425 * Return best guess as to the subnet mask for the specified address.
11426 * Based on the subnet masks for all the configured interfaces.
11428 * We end up returning a zero mask in the case of default, multicast, or
11429 * experimental addresses.
11431 static ipaddr_t
11432 ip_subnet_mask(ipaddr_t addr, ipif_t **ipifp, ip_stack_t *ipst)
11434 ipaddr_t net_mask;
11435 ill_t *ill;
11436 ipif_t *ipif;
11437 ill_walk_context_t ctx;
11438 ipif_t *fallback_ipif = NULL;
11440 net_mask = ip_net_mask(addr);
11441 if (net_mask == 0) {
11442 *ipifp = NULL;
11443 return (0);
11446 /* Let's check to see if this is maybe a local subnet route. */
11447 /* This function only applies to IPv4 interfaces. */
11448 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
11449 ill = ILL_START_WALK_V4(&ctx, ipst);
11450 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
11451 mutex_enter(&ill->ill_lock);
11452 for (ipif = ill->ill_ipif; ipif != NULL;
11453 ipif = ipif->ipif_next) {
11454 if (IPIF_IS_CONDEMNED(ipif))
11455 continue;
11456 if (!(ipif->ipif_flags & IPIF_UP))
11457 continue;
11458 if ((ipif->ipif_subnet & net_mask) ==
11459 (addr & net_mask)) {
11461 * Don't trust pt-pt interfaces if there are
11462 * other interfaces.
11464 if (ipif->ipif_flags & IPIF_POINTOPOINT) {
11465 if (fallback_ipif == NULL) {
11466 ipif_refhold_locked(ipif);
11467 fallback_ipif = ipif;
11469 continue;
11473 * Fine. Just assume the same net mask as the
11474 * directly attached subnet interface is using.
11476 ipif_refhold_locked(ipif);
11477 mutex_exit(&ill->ill_lock);
11478 rw_exit(&ipst->ips_ill_g_lock);
11479 if (fallback_ipif != NULL)
11480 ipif_refrele(fallback_ipif);
11481 *ipifp = ipif;
11482 return (ipif->ipif_net_mask);
11485 mutex_exit(&ill->ill_lock);
11487 rw_exit(&ipst->ips_ill_g_lock);
11489 *ipifp = fallback_ipif;
11490 return ((fallback_ipif != NULL) ?
11491 fallback_ipif->ipif_net_mask : net_mask);
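/*
 * For example, a lookup for 10.1.2.3 with no matching configured
 * interface falls back to the classful ip_net_mask() result
 * (255.0.0.0 for a class A address), while a lookup for the multicast
 * address 224.0.0.1 returns a zero mask immediately.
 */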
11495 * ip_sioctl_copyin_setup calls ip_wput_ioctl to process the IP_IOCTL ioctl.
11497 static void
11498 ip_wput_ioctl(queue_t *q, mblk_t *mp)
11500 IOCP iocp;
11501 ipft_t *ipft;
11502 ipllc_t *ipllc;
11503 mblk_t *mp1;
11504 cred_t *cr;
11505 int error = 0;
11506 conn_t *connp;
11508 ip1dbg(("ip_wput_ioctl"));
11509 iocp = (IOCP)mp->b_rptr;
11510 mp1 = mp->b_cont;
11511 if (mp1 == NULL) {
11512 iocp->ioc_error = EINVAL;
11513 mp->b_datap->db_type = M_IOCNAK;
11514 iocp->ioc_count = 0;
11515 qreply(q, mp);
11516 return;
11520 * These IOCTLs provide various control capabilities to
11521 * upstream agents such as ULPs and processes. There
11522 * are currently two such IOCTLs implemented. They
11523 * are used by TCP to provide update information for
11524 * existing IREs and to forcibly delete an IRE for a
11525 * host that is not responding, thereby forcing an
11526 * attempt at a new route.
11528 iocp->ioc_error = EINVAL;
11529 if (!pullupmsg(mp1, sizeof (ipllc->ipllc_cmd)))
11530 goto done;
11532 ipllc = (ipllc_t *)mp1->b_rptr;
11533 for (ipft = ip_ioctl_ftbl; ipft->ipft_pfi; ipft++) {
11534 if (ipllc->ipllc_cmd == ipft->ipft_cmd)
11535 break;
11538 * prefer credential from mblk over ioctl;
11539 * see ip_sioctl_copyin_setup
11541 cr = msg_getcred(mp, NULL);
11542 if (cr == NULL)
11543 cr = iocp->ioc_cr;
11546 * Refhold the conn in case the request gets queued up in some lookup
11548 ASSERT(CONN_Q(q));
11549 connp = Q_TO_CONN(q);
11550 CONN_INC_REF(connp);
11551 CONN_INC_IOCTLREF(connp);
11552 if (ipft->ipft_pfi &&
11553 ((mp1->b_wptr - mp1->b_rptr) >= ipft->ipft_min_size ||
11554 pullupmsg(mp1, ipft->ipft_min_size))) {
11555 error = (*ipft->ipft_pfi)(q,
11556 (ipft->ipft_flags & IPFT_F_SELF_REPLY) ? mp : mp1, cr);
11558 if (ipft->ipft_flags & IPFT_F_SELF_REPLY) {
11560 * CONN_OPER_PENDING_DONE happens in the function called
11561 * through ipft_pfi above.
11563 return;
11566 CONN_DEC_IOCTLREF(connp);
11567 CONN_OPER_PENDING_DONE(connp);
11568 if (ipft->ipft_flags & IPFT_F_NO_REPLY) {
11569 freemsg(mp);
11570 return;
11572 iocp->ioc_error = error;
11574 done:
11575 mp->b_datap->db_type = M_IOCACK;
11576 if (iocp->ioc_error)
11577 iocp->ioc_count = 0;
11578 qreply(q, mp);
11582 * Assign a unique id to the ipif. This is used by sctp_addr.c.
11583 * Note: remove if sctp_addr.c is redone to not shadow ill/ipif data structures.
11585 static void
11586 ipif_assign_seqid(ipif_t *ipif)
11588 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
11590 ipif->ipif_seqid = atomic_inc_64_nv(&ipst->ips_ipif_g_seqid);
11594 * Clone the contents of `sipif' to `dipif'. Requires that both ipifs are
11595 * administratively down (i.e., no DAD), of the same type, and locked. Note
11596 * that the clone is complete -- including the seqid -- and the expectation is
11597 * that the caller will either free or overwrite `sipif' before it's unlocked.
11599 static void
11600 ipif_clone(const ipif_t *sipif, ipif_t *dipif)
11602 ASSERT(MUTEX_HELD(&sipif->ipif_ill->ill_lock));
11603 ASSERT(MUTEX_HELD(&dipif->ipif_ill->ill_lock));
11604 ASSERT(!(sipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE)));
11605 ASSERT(!(dipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE)));
11606 ASSERT(sipif->ipif_ire_type == dipif->ipif_ire_type);
11608 dipif->ipif_flags = sipif->ipif_flags;
11609 dipif->ipif_zoneid = sipif->ipif_zoneid;
11610 dipif->ipif_v6subnet = sipif->ipif_v6subnet;
11611 dipif->ipif_v6lcl_addr = sipif->ipif_v6lcl_addr;
11612 dipif->ipif_v6net_mask = sipif->ipif_v6net_mask;
11613 dipif->ipif_v6brd_addr = sipif->ipif_v6brd_addr;
11614 dipif->ipif_v6pp_dst_addr = sipif->ipif_v6pp_dst_addr;
11617 * As per the comment atop the function, we assume that these sipif
11618 * fields will be changed before sipif is unlocked.
11620 dipif->ipif_seqid = sipif->ipif_seqid;
11621 dipif->ipif_state_flags = sipif->ipif_state_flags;
11625 * Transfer the contents of `sipif' to `dipif', and then free (if `virgipif'
11626 * is NULL) or overwrite `sipif' with `virgipif', which must be a virgin
11627 * (unreferenced) ipif. Also, if `sipif' is used by the current xop, then
11628 * transfer the xop to `dipif'. Requires that all ipifs are administratively
11629 * down (i.e., no DAD), of the same type, and unlocked.
11631 static void
11632 ipif_transfer(ipif_t *sipif, ipif_t *dipif, ipif_t *virgipif)
11634 ipsq_t *ipsq = sipif->ipif_ill->ill_phyint->phyint_ipsq;
11635 ipxop_t *ipx = ipsq->ipsq_xop;
11637 ASSERT(sipif != dipif);
11638 ASSERT(sipif != virgipif);
11641 * Grab all of the locks that protect the ipif in a defined order.
11643 GRAB_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill);
11645 ipif_clone(sipif, dipif);
11646 if (virgipif != NULL) {
11647 ipif_clone(virgipif, sipif);
11648 mi_free(virgipif);
11651 RELEASE_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill);
11654 * Transfer ownership of the current xop, if necessary.
11656 if (ipx->ipx_current_ipif == sipif) {
11657 ASSERT(ipx->ipx_pending_ipif == NULL);
11658 mutex_enter(&ipx->ipx_lock);
11659 ipx->ipx_current_ipif = dipif;
11660 mutex_exit(&ipx->ipx_lock);
11663 if (virgipif == NULL)
11664 mi_free(sipif);
11668 * Checks that:
11669 * - <ill_name>:<ipif_id> is at most LIFNAMSIZ - 1 characters long, and
11670 * - the logical interface id is within the allowed range.
11672 static int
11673 is_lifname_valid(ill_t *ill, unsigned int ipif_id)
11675 if (snprintf(NULL, 0, "%s:%d", ill->ill_name, ipif_id) >= LIFNAMSIZ)
11676 return (ENAMETOOLONG);
11678 if (ipif_id >= ill->ill_ipst->ips_ip_addrs_per_if)
11679 return (ERANGE);
11680 return (0);
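/*
 * snprintf(NULL, 0, ...) is the standard C99 idiom for computing a
 * formatted length without writing anything: e.g., for ill_name "net0"
 * and ipif_id 17 it returns strlen("net0:17") == 7, which is compared
 * against LIFNAMSIZ (the buffer size, so the name may occupy at most
 * LIFNAMSIZ - 1 characters plus the terminating NUL).
 */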
11684 * Insert the ipif, so that the list of ipifs on the ill will be sorted
11685 * with respect to ipif_id. Note that an ipif with an ipif_id of -1 will
11686 * be inserted into the first space available in the list. The value of
11687 * ipif_id will then be set to the appropriate value for its position.
11689 static int
11690 ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock)
11692 ill_t *ill;
11693 ipif_t *tipif;
11694 ipif_t **tipifp;
11695 int id, err;
11696 ip_stack_t *ipst;
11698 ASSERT(ipif->ipif_ill->ill_net_type == IRE_LOOPBACK ||
11699 IAM_WRITER_IPIF(ipif));
11701 ill = ipif->ipif_ill;
11702 ASSERT(ill != NULL);
11703 ipst = ill->ill_ipst;
11706 * In the case of lo0:0 we already hold the ill_g_lock.
11707 * ill_lookup_on_name (acquires ill_g_lock) -> ipif_allocate ->
11708 * ipif_insert.
11710 if (acquire_g_lock)
11711 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
11712 mutex_enter(&ill->ill_lock);
11713 id = ipif->ipif_id;
11714 tipifp = &(ill->ill_ipif);
11715 if (id == -1) { /* need to find a real id */
11716 id = 0;
11717 while ((tipif = *tipifp) != NULL) {
11718 ASSERT(tipif->ipif_id >= id);
11719 if (tipif->ipif_id != id)
11720 break; /* non-consecutive id */
11721 id++;
11722 tipifp = &(tipif->ipif_next);
11724 if ((err = is_lifname_valid(ill, id)) != 0) {
11725 mutex_exit(&ill->ill_lock);
11726 if (acquire_g_lock)
11727 rw_exit(&ipst->ips_ill_g_lock);
11728 return (err);
11730 ipif->ipif_id = id; /* assign new id */
11731 } else if ((err = is_lifname_valid(ill, id)) == 0) {
11732 /* we have a real id; insert ipif in the right place */
11733 while ((tipif = *tipifp) != NULL) {
11734 ASSERT(tipif->ipif_id != id);
11735 if (tipif->ipif_id > id)
11736 break; /* found correct location */
11737 tipifp = &(tipif->ipif_next);
11739 } else {
11740 mutex_exit(&ill->ill_lock);
11741 if (acquire_g_lock)
11742 rw_exit(&ipst->ips_ill_g_lock);
11743 return (err);
11746 ASSERT(tipifp != &(ill->ill_ipif) || id == 0);
11748 ipif->ipif_next = tipif;
11749 *tipifp = ipif;
11750 mutex_exit(&ill->ill_lock);
11751 if (acquire_g_lock)
11752 rw_exit(&ipst->ips_ill_g_lock);
11754 return (0);
11757 static void
11758 ipif_remove(ipif_t *ipif)
11760 ipif_t **ipifp;
11761 ill_t *ill = ipif->ipif_ill;
11763 ASSERT(RW_WRITE_HELD(&ill->ill_ipst->ips_ill_g_lock));
11765 mutex_enter(&ill->ill_lock);
11766 ipifp = &ill->ill_ipif;
11767 for (; *ipifp != NULL; ipifp = &ipifp[0]->ipif_next) {
11768 if (*ipifp == ipif) {
11769 *ipifp = ipif->ipif_next;
11770 break;
11773 mutex_exit(&ill->ill_lock);
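/*
 * Standalone sketch of the pointer-to-pointer list idiom used by
 * ipif_insert() and ipif_remove() above: walking a `node_t **' lets one
 * loop handle insertion or unlinking at the head and in the middle
 * identically, with no special case for the first element and no back
 * pointer. The node_t type here is hypothetical.
 */
#include <stddef.h>

typedef struct node { int id; struct node *next; } node_t;

static void
sorted_insert(node_t **headp, node_t *n)
{
	node_t **npp;

	for (npp = headp; *npp != NULL; npp = &(*npp)->next) {
		if ((*npp)->id > n->id)
			break;			/* insertion point found */
	}
	n->next = *npp;
	*npp = n;
}

static void
list_remove(node_t **headp, node_t *n)
{
	node_t **npp;

	for (npp = headp; *npp != NULL; npp = &(*npp)->next) {
		if (*npp == n) {
			*npp = n->next;		/* unlink in place */
			break;
		}
	}
}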
11777 * Allocate and initialize a new interface control structure. (Always
11778 * called as writer.)
11779 * When ipif_allocate() is called from ip_ll_subnet_defaults, the ill
11780 * is not part of the global linked list of ills. ipif_seqid is unique
11781 * in the system and to preserve the uniqueness, it is assigned only
11782 * when ill becomes part of the global list. At that point ill will
11783 * have a name. If it doesn't get assigned here, it will get assigned
11784 * in ipif_set_values() as part of SIOCSLIFNAME processing.
11785  * Additionally, if we come here from ip_ll_subnet_defaults, we don't set
11786 * the interface flags or any other information from the DL_INFO_ACK for
11787 * DL_STYLE2 drivers (initialize == B_FALSE), since we won't have them at
11788 * this point. The flags etc. will be set in ip_ll_subnet_defaults when the
11789 * second DL_INFO_ACK comes in from the driver.
11791 static ipif_t *
11792 ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize,
11793 boolean_t insert, int *errorp)
11795 int err;
11796 ipif_t *ipif;
11797 ip_stack_t *ipst = ill->ill_ipst;
11799 ip1dbg(("ipif_allocate(%s:%d ill %p)\n",
11800 ill->ill_name, id, (void *)ill));
11801 ASSERT(ire_type == IRE_LOOPBACK || IAM_WRITER_ILL(ill));
11803 if (errorp != NULL)
11804 *errorp = 0;
11806 if ((ipif = mi_alloc(sizeof (ipif_t), BPRI_MED)) == NULL) {
11807 if (errorp != NULL)
11808 *errorp = ENOMEM;
11809 return (NULL);
11811 *ipif = ipif_zero; /* start clean */
11813 ipif->ipif_ill = ill;
11814 ipif->ipif_id = id; /* could be -1 */
11816 * Inherit the zoneid from the ill; for the shared stack instance
11817 * this is always the global zone
11819 ipif->ipif_zoneid = ill->ill_zoneid;
11821 ipif->ipif_refcnt = 0;
11823 if (insert) {
11824 if ((err = ipif_insert(ipif, ire_type != IRE_LOOPBACK)) != 0) {
11825 mi_free(ipif);
11826 if (errorp != NULL)
11827 *errorp = err;
11828 return (NULL);
11830 /* -1 id should have been replaced by real id */
11831 id = ipif->ipif_id;
11832 ASSERT(id >= 0);
11835 if (ill->ill_name[0] != '\0')
11836 ipif_assign_seqid(ipif);
11839 * If this is the zeroth ipif on the IPMP ill, create the illgrp
11840 * (which must not exist yet because the zeroth ipif is created once
11841  * per ill). However, do not link it to the ipmp_grp_t until
11842 * I_PLINK is called; see ip_sioctl_plink_ipmp() for details.
11844 if (id == 0 && IS_IPMP(ill)) {
11845 if (ipmp_illgrp_create(ill) == NULL) {
11846 if (insert) {
11847 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
11848 ipif_remove(ipif);
11849 rw_exit(&ipst->ips_ill_g_lock);
11851 mi_free(ipif);
11852 if (errorp != NULL)
11853 *errorp = ENOMEM;
11854 return (NULL);
11859 * We grab ill_lock to protect the flag changes. The ipif is still
11860 * not up and can't be looked up until the ioctl completes and the
11861 * IPIF_CHANGING flag is cleared.
11863 mutex_enter(&ill->ill_lock);
11865 ipif->ipif_ire_type = ire_type;
11867 if (ipif->ipif_isv6) {
11868 ill->ill_flags |= ILLF_IPV6;
11869 } else {
11870 ipaddr_t inaddr_any = INADDR_ANY;
11872 ill->ill_flags |= ILLF_IPV4;
11874 /* Keep the IN6_IS_ADDR_V4MAPPED assertions happy */
11875 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
11876 &ipif->ipif_v6lcl_addr);
11877 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
11878 &ipif->ipif_v6subnet);
11879 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
11880 &ipif->ipif_v6net_mask);
11881 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
11882 &ipif->ipif_v6brd_addr);
11883 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
11884 &ipif->ipif_v6pp_dst_addr);
11888  * Don't set the interface flags etc. now; we'll do it in
11889  * ip_ll_subnet_defaults.
11891 if (!initialize)
11892 goto out;
11895 * NOTE: The IPMP meta-interface is special-cased because it starts
11896 * with no underlying interfaces (and thus an unknown broadcast
11897 * address length), but all interfaces that can be placed into an IPMP
11898 * group are required to be broadcast-capable.
11900 if (ill->ill_bcast_addr_length != 0 || IS_IPMP(ill)) {
11902 * Later detect lack of DLPI driver multicast capability by
11903 * catching DL_ENABMULTI_REQ errors in ip_rput_dlpi().
11905 ill->ill_flags |= ILLF_MULTICAST;
11906 if (!ipif->ipif_isv6)
11907 ipif->ipif_flags |= IPIF_BROADCAST;
11908 } else {
11909 if (ill->ill_net_type != IRE_LOOPBACK) {
11910 if (ipif->ipif_isv6)
11912 * Note: xresolv interfaces will eventually need
11913 * NOARP set here as well, but that will require
11914 * those external resolvers to have some
11915 * knowledge of that flag and act appropriately.
11916 * Not to be changed at present.
11918 ill->ill_flags |= ILLF_NONUD;
11919 else
11920 ill->ill_flags |= ILLF_NOARP;
11922 if (ill->ill_phys_addr_length == 0) {
11923 if (IS_VNI(ill)) {
11924 ipif->ipif_flags |= IPIF_NOXMIT;
11925 } else {
11926 /* pt-pt supports multicast. */
11927 ill->ill_flags |= ILLF_MULTICAST;
11928 if (ill->ill_net_type != IRE_LOOPBACK)
11929 ipif->ipif_flags |= IPIF_POINTOPOINT;
11933 out:
11934 mutex_exit(&ill->ill_lock);
11935 return (ipif);
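/*
 * Illustration of what the IN6_IPADDR_TO_V4MAPPED() initialization above
 * produces: an IPv4 address embedded in the ::ffff:a.b.c.d ("v4-mapped")
 * form of an IPv6 address. This is a plain libc sketch of the layout,
 * not the kernel macro itself.
 */
#include <netinet/in.h>
#include <string.h>

static void
v4_to_v4mapped(const struct in_addr *v4, struct in6_addr *v6)
{
	memset(v6, 0, sizeof (*v6));	/* bytes 0-9 are zero */
	v6->s6_addr[10] = 0xff;		/* bytes 10-11 are 0xffff */
	v6->s6_addr[11] = 0xff;
	memcpy(&v6->s6_addr[12], &v4->s_addr, sizeof (v4->s_addr));
}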
11939 * Remove the neighbor cache entries associated with this logical
11940 * interface.
11943 ipif_arp_down(ipif_t *ipif)
11945 ill_t *ill = ipif->ipif_ill;
11946 int err = 0;
11948 ip1dbg(("ipif_arp_down(%s:%u)\n", ill->ill_name, ipif->ipif_id));
11949 ASSERT(IAM_WRITER_IPIF(ipif));
11951 DTRACE_PROBE3(ipif__downup, char *, "ipif_arp_down",
11952 ill_t *, ill, ipif_t *, ipif);
11953 ipif_nce_down(ipif);
11956 * If this is the last ipif that is going down and there are no
11957 * duplicate addresses we may yet attempt to re-probe, then we need to
11958 * clean up ARP completely.
11960 if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0 &&
11961 !ill->ill_logical_down && ill->ill_net_type == IRE_IF_RESOLVER) {
11963 * If this was the last ipif on an IPMP interface, purge any
11964 * static ARP entries associated with it.
11966 if (IS_IPMP(ill))
11967 ipmp_illgrp_refresh_arpent(ill->ill_grp);
11969 /* UNBIND, DETACH */
11970 err = arp_ll_down(ill);
11973 return (err);
11977 * Get the resolver set up for a new IP address. (Always called as writer.)
11978 * Called both for IPv4 and IPv6 interfaces, though it only does some
11979 * basic DAD related initialization for IPv6. Honors ILLF_NOARP.
11981 * The enumerated value res_act tunes the behavior:
11982 * * Res_act_initial: set up all the resolver structures for a new
11983 * IP address.
11984 * * Res_act_defend: tell ARP that it needs to send a single gratuitous
11985 * ARP message in defense of the address.
11986 * * Res_act_rebind: tell ARP to change the hardware address for an IP
11987 * address (and issue gratuitous ARPs). Used by ipmp_ill_bind_ipif().
11989 * Returns zero on success, or an errno upon failure.
11992 ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act)
11994 ill_t *ill = ipif->ipif_ill;
11995 int err;
11996 boolean_t was_dup;
11998 ip1dbg(("ipif_resolver_up(%s:%u) flags 0x%x\n",
11999 ill->ill_name, ipif->ipif_id, (uint_t)ipif->ipif_flags));
12000 ASSERT(IAM_WRITER_IPIF(ipif));
12002 was_dup = B_FALSE;
12003 if (res_act == Res_act_initial) {
12004 ipif->ipif_addr_ready = 0;
12006 * We're bringing an interface up here. There's no way that we
12007 * should need to shut down ARP now.
12009 mutex_enter(&ill->ill_lock);
12010 if (ipif->ipif_flags & IPIF_DUPLICATE) {
12011 ipif->ipif_flags &= ~IPIF_DUPLICATE;
12012 ill->ill_ipif_dup_count--;
12013 was_dup = B_TRUE;
12015 mutex_exit(&ill->ill_lock);
12017 if (ipif->ipif_recovery_id != 0)
12018 (void) untimeout(ipif->ipif_recovery_id);
12019 ipif->ipif_recovery_id = 0;
12020 if (ill->ill_net_type != IRE_IF_RESOLVER) {
12021 ipif->ipif_addr_ready = 1;
12022 return (0);
12024 /* NDP will set the ipif_addr_ready flag when it's ready */
12025 if (ill->ill_isv6)
12026 return (0);
12028 err = ipif_arp_up(ipif, res_act, was_dup);
12029 return (err);
12033 * This routine restarts IPv4/IPv6 duplicate address detection (DAD)
12034 * when a link has just gone back up.
12036 static void
12037 ipif_nce_start_dad(ipif_t *ipif)
12039 ncec_t *ncec;
12040 ill_t *ill = ipif->ipif_ill;
12041 boolean_t isv6 = ill->ill_isv6;
12043 if (isv6) {
12044 ncec = ncec_lookup_illgrp_v6(ipif->ipif_ill,
12045 &ipif->ipif_v6lcl_addr);
12046 } else {
12047 ipaddr_t v4addr;
12049 if (ill->ill_net_type != IRE_IF_RESOLVER ||
12050 (ipif->ipif_flags & IPIF_UNNUMBERED) ||
12051 ipif->ipif_lcl_addr == INADDR_ANY) {
12053 * If we can't contact ARP for some reason,
12054 * that's not really a problem. Just send
12055 * out the routing socket notification that
12056 * DAD completion would have done, and continue.
12058 ipif_mask_reply(ipif);
12059 ipif_up_notify(ipif);
12060 ipif->ipif_addr_ready = 1;
12061 return;
12064 IN6_V4MAPPED_TO_IPADDR(&ipif->ipif_v6lcl_addr, v4addr);
12065 ncec = ncec_lookup_illgrp_v4(ipif->ipif_ill, &v4addr);
12068 if (ncec == NULL) {
12069 ip1dbg(("couldn't find ncec for ipif %p leaving !ready\n",
12070 (void *)ipif));
12071 return;
12073 if (!nce_restart_dad(ncec)) {
12075 * If we can't restart DAD for some reason, that's not really a
12076 * problem. Just send out the routing socket notification that
12077 * DAD completion would have done, and continue.
12079 ipif_up_notify(ipif);
12080 ipif->ipif_addr_ready = 1;
12082 ncec_refrele(ncec);
12086 * Restart duplicate address detection on all interfaces on the given ill.
12088 * This is called when an interface transitions from down to up
12089 * (DL_NOTE_LINK_UP) or up to down (DL_NOTE_LINK_DOWN).
12091 * Note that since the underlying physical link has transitioned, we must cause
12092 * at least one routing socket message to be sent here, either via DAD
12093 * completion or just by default on the first ipif. (If we don't do this, then
12094 * in.mpathd will see long delays when doing link-based failure recovery.)
12096 void
12097 ill_restart_dad(ill_t *ill, boolean_t went_up)
12099 ipif_t *ipif;
12101 if (ill == NULL)
12102 return;
12105 * If layer two doesn't support duplicate address detection, then just
12106 * send the routing socket message now and be done with it.
12108 if (!ill->ill_isv6 && arp_no_defense) {
12109 ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT);
12110 return;
12113 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
12114 if (went_up) {
12116 if (ipif->ipif_flags & IPIF_UP) {
12117 ipif_nce_start_dad(ipif);
12118 } else if (ipif->ipif_flags & IPIF_DUPLICATE) {
12120 * kick off the bring-up process now.
12122 ipif_do_recovery(ipif);
12123 } else {
12125 * Unfortunately, the first ipif is "special"
12126 * and represents the underlying ill in the
12127 * routing socket messages. Thus, when this
12128 * one ipif is down, we must still notify so
12129 * that the user knows the IFF_RUNNING status
12130 * change. (If the first ipif is up, then
12131 * we'll handle eventual routing socket
12132 * notification via DAD completion.)
12134 if (ipif == ill->ill_ipif) {
12135 ip_rts_ifmsg(ill->ill_ipif,
12136 RTSQ_DEFAULT);
12139 } else {
12141 * After link down, we'll need to send a new routing
12142 * message when the link comes back, so clear
12143 * ipif_addr_ready.
12145 ipif->ipif_addr_ready = 0;
12150 * If we've torn down links, then notify the user right away.
12152 if (!went_up)
12153 ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT);
12156 static void
12157 ipsq_delete(ipsq_t *ipsq)
12159 ipxop_t *ipx = ipsq->ipsq_xop;
12161 ipsq->ipsq_ipst = NULL;
12162 ASSERT(ipsq->ipsq_phyint == NULL);
12163 ASSERT(ipsq->ipsq_xop != NULL);
12164 ASSERT(ipsq->ipsq_xopq_mphead == NULL && ipx->ipx_mphead == NULL);
12165 ASSERT(ipx->ipx_pending_mp == NULL);
12166 kmem_free(ipsq, sizeof (ipsq_t));
12169 static int
12170 ill_up_ipifs_on_ill(ill_t *ill, queue_t *q, mblk_t *mp)
12172 int err = 0;
12173 ipif_t *ipif;
12175 if (ill == NULL)
12176 return (0);
12178 ASSERT(IAM_WRITER_ILL(ill));
12179 ill->ill_up_ipifs = B_TRUE;
12180 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
12181 if (ipif->ipif_was_up) {
12182 if (!(ipif->ipif_flags & IPIF_UP))
12183 err = ipif_up(ipif, q, mp);
12184 ipif->ipif_was_up = B_FALSE;
12185 if (err != 0) {
12186 ASSERT(err == EINPROGRESS);
12187 return (err);
12191 ill->ill_up_ipifs = B_FALSE;
12192 return (0);
12196 * This function is called to bring up all the ipifs that were up before
12197 * bringing the ill down via ill_down_ipifs().
12200 ill_up_ipifs(ill_t *ill, queue_t *q, mblk_t *mp)
12202 int err;
12204 ASSERT(IAM_WRITER_ILL(ill));
12206 if (ill->ill_replumbing) {
12207 ill->ill_replumbing = 0;
12209 * Send down REPLUMB_DONE notification followed by the
12210 * BIND_REQ on the arp stream.
12212 if (!ill->ill_isv6)
12213 arp_send_replumb_conf(ill);
12215 err = ill_up_ipifs_on_ill(ill->ill_phyint->phyint_illv4, q, mp);
12216 if (err != 0)
12217 return (err);
12219 return (ill_up_ipifs_on_ill(ill->ill_phyint->phyint_illv6, q, mp));
12223 * Bring down any IPIF_UP ipifs on ill. If "logical" is B_TRUE, we bring
12224 * down the ipifs without sending DL_UNBIND_REQ to the driver.
12226 static void
12227 ill_down_ipifs(ill_t *ill, boolean_t logical)
12229 ipif_t *ipif;
12231 ASSERT(IAM_WRITER_ILL(ill));
12233 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
12235 * We go through the ipif_down logic even if the ipif
12236 * is already down, since routes can be added based
12237 * on down ipifs. Going through ipif_down once again
12238 * will delete any IREs created based on these routes.
12240 if (ipif->ipif_flags & IPIF_UP)
12241 ipif->ipif_was_up = B_TRUE;
12243 if (logical) {
12244 (void) ipif_logical_down(ipif, NULL, NULL);
12245 ipif_non_duplicate(ipif);
12246 (void) ipif_down_tail(ipif);
12247 } else {
12248 (void) ipif_down(ipif, NULL, NULL);
12254  * Redo source address selection. This makes IXAF_VERIFY_SOURCE
12255  * re-examine the set of valid source addresses.
12256  * This should be called each time the set of source addresses
12257  * changes.
12259 void
12260 ip_update_source_selection(ip_stack_t *ipst)
12262 /* We skip past SRC_GENERATION_VERIFY */
12263 if (atomic_inc_32_nv(&ipst->ips_src_generation) ==
12264 SRC_GENERATION_VERIFY)
12265 atomic_inc_32(&ipst->ips_src_generation);
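/*
 * Self-contained sketch of the generation-counter idiom above: one value
 * (GEN_VERIFY here, standing in for SRC_GENERATION_VERIFY) is reserved as
 * a sentinel, so the bump must skip over it when the counter wraps onto
 * it. C11 atomics stand in for atomic_inc_32_nv(); names are hypothetical.
 */
#include <stdatomic.h>
#include <stdint.h>

#define	GEN_VERIFY	0	/* hypothetical reserved sentinel value */

static _Atomic uint32_t src_generation = 1;

static void
bump_generation(void)
{
	/* atomic_fetch_add returns the old value, so +1 is the new one */
	if (atomic_fetch_add(&src_generation, 1) + 1 == GEN_VERIFY)
		atomic_fetch_add(&src_generation, 1);
}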
12269 * Finish the group join started in ip_sioctl_groupname().
12271 /* ARGSUSED */
12272 static void
12273 ip_join_illgrps(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy)
12275 ill_t *ill = q->q_ptr;
12276 phyint_t *phyi = ill->ill_phyint;
12277 ipmp_grp_t *grp = phyi->phyint_grp;
12278 ip_stack_t *ipst = ill->ill_ipst;
12280 /* IS_UNDER_IPMP() won't work until ipmp_ill_join_illgrp() is called */
12281 ASSERT(!IS_IPMP(ill) && grp != NULL);
12282 ASSERT(IAM_WRITER_IPSQ(ipsq));
12284 if (phyi->phyint_illv4 != NULL) {
12285 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
12286 VERIFY(grp->gr_pendv4-- > 0);
12287 rw_exit(&ipst->ips_ipmp_lock);
12288 ipmp_ill_join_illgrp(phyi->phyint_illv4, grp->gr_v4);
12290 if (phyi->phyint_illv6 != NULL) {
12291 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
12292 VERIFY(grp->gr_pendv6-- > 0);
12293 rw_exit(&ipst->ips_ipmp_lock);
12294 ipmp_ill_join_illgrp(phyi->phyint_illv6, grp->gr_v6);
12296 freemsg(mp);
12300 * Process an SIOCSLIFGROUPNAME request.
12302 /* ARGSUSED */
12304 ip_sioctl_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12305 ip_ioctl_cmd_t *ipip, void *ifreq)
12307 struct lifreq *lifr = ifreq;
12308 ill_t *ill = ipif->ipif_ill;
12309 ip_stack_t *ipst = ill->ill_ipst;
12310 phyint_t *phyi = ill->ill_phyint;
12311 ipmp_grp_t *grp = phyi->phyint_grp;
12312 mblk_t *ipsq_mp;
12313 int err = 0;
12316 * Note that phyint_grp can only change here, where we're exclusive.
12318 ASSERT(IAM_WRITER_ILL(ill));
12320 if (ipif->ipif_id != 0 || ill->ill_usesrc_grp_next != NULL ||
12321 (phyi->phyint_flags & PHYI_VIRTUAL))
12322 return (EINVAL);
12324 lifr->lifr_groupname[LIFGRNAMSIZ - 1] = '\0';
12326 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
12329 * If the name hasn't changed, there's nothing to do.
12331 if (grp != NULL && strcmp(grp->gr_name, lifr->lifr_groupname) == 0)
12332 goto unlock;
12335 * Handle requests to rename an IPMP meta-interface.
12337 * Note that creation of the IPMP meta-interface is handled in
12338 * userland through the standard plumbing sequence. As part of the
12339 * plumbing the IPMP meta-interface, its initial groupname is set to
12340 * the name of the interface (see ipif_set_values_tail()).
12342 if (IS_IPMP(ill)) {
12343 err = ipmp_grp_rename(grp, lifr->lifr_groupname);
12344 goto unlock;
12348 * Handle requests to add or remove an IP interface from a group.
12350 if (lifr->lifr_groupname[0] != '\0') { /* add */
12352 * Moves are handled by first removing the interface from
12353 * its existing group, and then adding it to another group.
12354 * So, fail if it's already in a group.
12356 if (IS_UNDER_IPMP(ill)) {
12357 err = EALREADY;
12358 goto unlock;
12361 grp = ipmp_grp_lookup(lifr->lifr_groupname, ipst);
12362 if (grp == NULL) {
12363 err = ENOENT;
12364 goto unlock;
12368 * Check if the phyint and its ills are suitable for
12369 * inclusion into the group.
12371 if ((err = ipmp_grp_vet_phyint(grp, phyi)) != 0)
12372 goto unlock;
12375 * Checks pass; join the group, and enqueue the remaining
12376 * illgrp joins for when we've become part of the group xop
12377 * and are exclusive across its IPSQs. Since qwriter_ip()
12378 * requires an mblk_t to scribble on, and since `mp' will be
12379 * freed as part of completing the ioctl, allocate another.
12381 if ((ipsq_mp = allocb(0, BPRI_MED)) == NULL) {
12382 err = ENOMEM;
12383 goto unlock;
12387 * Before we drop ipmp_lock, bump gr_pend* to ensure that the
12388 * IPMP meta-interface ills needed by `phyi' cannot go away
12389 * before ip_join_illgrps() is called back. See the comments
12390 * in ip_sioctl_plink_ipmp() for more.
12392 if (phyi->phyint_illv4 != NULL)
12393 grp->gr_pendv4++;
12394 if (phyi->phyint_illv6 != NULL)
12395 grp->gr_pendv6++;
12397 rw_exit(&ipst->ips_ipmp_lock);
12399 ipmp_phyint_join_grp(phyi, grp);
12400 ill_refhold(ill);
12401 qwriter_ip(ill, ill->ill_rq, ipsq_mp, ip_join_illgrps,
12402 SWITCH_OP, B_FALSE);
12403 return (0);
12404 } else {
12406 * Request to remove the interface from a group. If the
12407 * interface is not in a group, this trivially succeeds.
12409 rw_exit(&ipst->ips_ipmp_lock);
12410 if (IS_UNDER_IPMP(ill))
12411 ipmp_phyint_leave_grp(phyi);
12412 return (0);
12414 unlock:
12415 rw_exit(&ipst->ips_ipmp_lock);
12416 return (err);
12420 * Process an SIOCGLIFBINDING request.
12422 /* ARGSUSED */
12424 ip_sioctl_get_binding(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12425 ip_ioctl_cmd_t *ipip, void *ifreq)
12427 ill_t *ill;
12428 struct lifreq *lifr = ifreq;
12429 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
12431 if (!IS_IPMP(ipif->ipif_ill))
12432 return (EINVAL);
12434 rw_enter(&ipst->ips_ipmp_lock, RW_READER);
12435 if ((ill = ipif->ipif_bound_ill) == NULL)
12436 lifr->lifr_binding[0] = '\0';
12437 else
12438 (void) strlcpy(lifr->lifr_binding, ill->ill_name, LIFNAMSIZ);
12439 rw_exit(&ipst->ips_ipmp_lock);
12440 return (0);
12444 * Process an SIOCGLIFGROUPNAME request.
12446 /* ARGSUSED */
12448 ip_sioctl_get_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12449 ip_ioctl_cmd_t *ipip, void *ifreq)
12451 ipmp_grp_t *grp;
12452 struct lifreq *lifr = ifreq;
12453 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
12455 rw_enter(&ipst->ips_ipmp_lock, RW_READER);
12456 if ((grp = ipif->ipif_ill->ill_phyint->phyint_grp) == NULL)
12457 lifr->lifr_groupname[0] = '\0';
12458 else
12459 (void) strlcpy(lifr->lifr_groupname, grp->gr_name, LIFGRNAMSIZ);
12460 rw_exit(&ipst->ips_ipmp_lock);
12461 return (0);
12465 * Process an SIOCGLIFGROUPINFO request.
12467 /* ARGSUSED */
12469 ip_sioctl_groupinfo(ipif_t *dummy_ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12470 ip_ioctl_cmd_t *ipip, void *dummy)
12472 ipmp_grp_t *grp;
12473 lifgroupinfo_t *lifgr;
12474 ip_stack_t *ipst = CONNQ_TO_IPST(q);
12476 /* ip_wput_nondata() verified mp->b_cont->b_cont */
12477 lifgr = (lifgroupinfo_t *)mp->b_cont->b_cont->b_rptr;
12478 lifgr->gi_grname[LIFGRNAMSIZ - 1] = '\0';
12480 rw_enter(&ipst->ips_ipmp_lock, RW_READER);
12481 if ((grp = ipmp_grp_lookup(lifgr->gi_grname, ipst)) == NULL) {
12482 rw_exit(&ipst->ips_ipmp_lock);
12483 return (ENOENT);
12485 ipmp_grp_info(grp, lifgr);
12486 rw_exit(&ipst->ips_ipmp_lock);
12487 return (0);
12490 static void
12491 ill_dl_down(ill_t *ill)
12493 DTRACE_PROBE2(ill__downup, char *, "ill_dl_down", ill_t *, ill);
12496 * The ill is down; unbind but stay attached since we're still
12497  * associated with a PPA. If we have negotiated DLPI capabilities
12498  * with the data link service provider (IDS_OK), then reset them.
12499  * The interval between unbinding and rebinding is potentially
12500  * unbounded, hence we cannot assume things will be the same.
12501 * The DLPI capabilities will be probed again when the data link
12502 * is brought up.
12504 mblk_t *mp = ill->ill_unbind_mp;
12506 ip1dbg(("ill_dl_down(%s)\n", ill->ill_name));
12508 if (!ill->ill_replumbing) {
12509 /* Free all ilms for this ill */
12510 update_conn_ill(ill, ill->ill_ipst);
12511 } else {
12512 ill_leave_multicast(ill);
12515 ill->ill_unbind_mp = NULL;
12516 if (mp != NULL) {
12517 ip1dbg(("ill_dl_down: %s (%u) for %s\n",
12518 dl_primstr(*(int *)mp->b_rptr), *(int *)mp->b_rptr,
12519 ill->ill_name));
12520 mutex_enter(&ill->ill_lock);
12521 ill->ill_state_flags |= ILL_DL_UNBIND_IN_PROGRESS;
12522 mutex_exit(&ill->ill_lock);
12524 * ip_rput does not pass up normal (M_PROTO) DLPI messages
12525 * after ILL_CONDEMNED is set. So in the unplumb case, we call
12526  * ill_capability_dld_disable() right away. If this is not
12527 * an unplumb operation then the disable happens on receipt of
12528 * the capab ack via ip_rput_dlpi_writer ->
12529 * ill_capability_ack_thr. In both cases the order of
12530 * the operations seen by DLD is capability disable followed
12531 * by DL_UNBIND. Also the DLD capability disable needs a
12532 * cv_wait'able context.
12534 if (ill->ill_state_flags & ILL_CONDEMNED)
12535 ill_capability_dld_disable(ill);
12536 ill_capability_reset(ill, B_FALSE);
12537 ill_dlpi_send(ill, mp);
12539 mutex_enter(&ill->ill_lock);
12540 ill->ill_dl_up = 0;
12541 ill_nic_event_dispatch(ill, 0, NE_DOWN, NULL, 0);
12542 mutex_exit(&ill->ill_lock);
12545 void
12546 ill_dlpi_dispatch(ill_t *ill, mblk_t *mp)
12548 union DL_primitives *dlp;
12549 t_uscalar_t prim;
12550 boolean_t waitack = B_FALSE;
12552 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);
12554 dlp = (union DL_primitives *)mp->b_rptr;
12555 prim = dlp->dl_primitive;
12557 ip1dbg(("ill_dlpi_dispatch: sending %s (%u) to %s\n",
12558 dl_primstr(prim), prim, ill->ill_name));
12560 switch (prim) {
12561 case DL_PHYS_ADDR_REQ:
12563 dl_phys_addr_req_t *dlpap = (dl_phys_addr_req_t *)mp->b_rptr;
12564 ill->ill_phys_addr_pend = dlpap->dl_addr_type;
12565 break;
12567 case DL_BIND_REQ:
12568 mutex_enter(&ill->ill_lock);
12569 ill->ill_state_flags &= ~ILL_DL_UNBIND_IN_PROGRESS;
12570 mutex_exit(&ill->ill_lock);
12571 break;
12575 * Except for the ACKs for the M_PCPROTO messages, all other ACKs
12576 * are dropped by ip_rput() if ILL_CONDEMNED is set. Therefore
12577 * we only wait for the ACK of the DL_UNBIND_REQ.
12579 mutex_enter(&ill->ill_lock);
12580 if (!(ill->ill_state_flags & ILL_CONDEMNED) ||
12581 (prim == DL_UNBIND_REQ)) {
12582 ill->ill_dlpi_pending = prim;
12583 waitack = B_TRUE;
12586 mutex_exit(&ill->ill_lock);
12587 DTRACE_PROBE3(ill__dlpi, char *, "ill_dlpi_dispatch",
12588 char *, dl_primstr(prim), ill_t *, ill);
12589 putnext(ill->ill_wq, mp);
12592 * There is no ack for DL_NOTIFY_CONF messages
12594 if (waitack && prim == DL_NOTIFY_CONF)
12595 ill_dlpi_done(ill, prim);
12599 * Helper function for ill_dlpi_send().
12601 /* ARGSUSED */
12602 static void
12603 ill_dlpi_send_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg)
12605 ill_dlpi_send(q->q_ptr, mp);
12609 * Send a DLPI control message to the driver but make sure there
12610 * is only one outstanding message. Uses ill_dlpi_pending to tell
12611 * when it must queue. ip_rput_dlpi_writer calls ill_dlpi_done()
12612 * when an ACK or a NAK is received to process the next queued message.
12614 void
12615 ill_dlpi_send(ill_t *ill, mblk_t *mp)
12617 mblk_t **mpp;
12619 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);
12622 * To ensure that any DLPI requests for current exclusive operation
12623 * are always completely sent before any DLPI messages for other
12624 * operations, require writer access before enqueuing.
12626 if (!IAM_WRITER_ILL(ill)) {
12627 ill_refhold(ill);
12628 /* qwriter_ip() does the ill_refrele() */
12629 qwriter_ip(ill, ill->ill_wq, mp, ill_dlpi_send_writer,
12630 NEW_OP, B_TRUE);
12631 return;
12634 mutex_enter(&ill->ill_lock);
12635 if (ill->ill_dlpi_pending != DL_PRIM_INVAL) {
12636 /* Must queue message. Tail insertion */
12637 mpp = &ill->ill_dlpi_deferred;
12638 while (*mpp != NULL)
12639 mpp = &((*mpp)->b_next);
12641 ip1dbg(("ill_dlpi_send: deferring request for %s "
12642 "while %s pending\n", ill->ill_name,
12643 dl_primstr(ill->ill_dlpi_pending)));
12645 *mpp = mp;
12646 mutex_exit(&ill->ill_lock);
12647 return;
12649 mutex_exit(&ill->ill_lock);
12650 ill_dlpi_dispatch(ill, mp);
12653 void
12654 ill_capability_send(ill_t *ill, mblk_t *mp)
12656 ill->ill_capab_pending_cnt++;
12657 ill_dlpi_send(ill, mp);
12660 void
12661 ill_capability_done(ill_t *ill)
12663 ASSERT(ill->ill_capab_pending_cnt != 0);
12665 ill_dlpi_done(ill, DL_CAPABILITY_REQ);
12667 ill->ill_capab_pending_cnt--;
12668 if (ill->ill_capab_pending_cnt == 0 &&
12669 ill->ill_dlpi_capab_state == IDCS_OK)
12670 ill_capability_reset_alloc(ill);
12674 * Send all deferred DLPI messages without waiting for their ACKs.
12676 void
12677 ill_dlpi_send_deferred(ill_t *ill)
12679 mblk_t *mp, *nextmp;
12682 * Clear ill_dlpi_pending so that the message is not queued in
12683 * ill_dlpi_send().
12685 mutex_enter(&ill->ill_lock);
12686 ill->ill_dlpi_pending = DL_PRIM_INVAL;
12687 mp = ill->ill_dlpi_deferred;
12688 ill->ill_dlpi_deferred = NULL;
12689 mutex_exit(&ill->ill_lock);
12691 for (; mp != NULL; mp = nextmp) {
12692 nextmp = mp->b_next;
12693 mp->b_next = NULL;
12694 ill_dlpi_send(ill, mp);
12699 * Clear all the deferred DLPI messages. Called on receiving an M_ERROR
12700 * or M_HANGUP
12702 static void
12703 ill_dlpi_clear_deferred(ill_t *ill)
12705 mblk_t *mp, *nextmp;
12707 mutex_enter(&ill->ill_lock);
12708 ill->ill_dlpi_pending = DL_PRIM_INVAL;
12709 mp = ill->ill_dlpi_deferred;
12710 ill->ill_dlpi_deferred = NULL;
12711 mutex_exit(&ill->ill_lock);
12713 for (; mp != NULL; mp = nextmp) {
12714 nextmp = mp->b_next;
12715 inet_freemsg(mp);
12720 * Check if the DLPI primitive `prim' is pending; print a warning if not.
12722 boolean_t
12723 ill_dlpi_pending(ill_t *ill, t_uscalar_t prim)
12725 t_uscalar_t pending;
12727 mutex_enter(&ill->ill_lock);
12728 if (ill->ill_dlpi_pending == prim) {
12729 mutex_exit(&ill->ill_lock);
12730 return (B_TRUE);
12734 * During teardown, ill_dlpi_dispatch() will send DLPI requests
12735 * without waiting, so don't print any warnings in that case.
12737 if (ill->ill_state_flags & ILL_CONDEMNED) {
12738 mutex_exit(&ill->ill_lock);
12739 return (B_FALSE);
12741 pending = ill->ill_dlpi_pending;
12742 mutex_exit(&ill->ill_lock);
12744 if (pending == DL_PRIM_INVAL) {
12745 (void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE,
12746 "received unsolicited ack for %s on %s\n",
12747 dl_primstr(prim), ill->ill_name);
12748 } else {
12749 (void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE,
12750 "received unexpected ack for %s on %s (expecting %s)\n",
12751 dl_primstr(prim), ill->ill_name, dl_primstr(pending));
12753 return (B_FALSE);
12757 * Complete the current DLPI operation associated with `prim' on `ill' and
12758 * start the next queued DLPI operation (if any). If there are no queued DLPI
12759 * operations and the ill's current exclusive IPSQ operation has finished
12760 * (i.e., ipsq_current_finish() was called), then clear ipsq_current_ipif to
12761 * allow the next exclusive IPSQ operation to begin upon ipsq_exit(). See
12762 * the comments above ipsq_current_finish() for details.
12764 void
12765 ill_dlpi_done(ill_t *ill, t_uscalar_t prim)
12767 mblk_t *mp;
12768 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq;
12769 ipxop_t *ipx = ipsq->ipsq_xop;
12771 ASSERT(IAM_WRITER_IPSQ(ipsq));
12772 mutex_enter(&ill->ill_lock);
12774 ASSERT(prim != DL_PRIM_INVAL);
12775 ASSERT(ill->ill_dlpi_pending == prim);
12777 ip1dbg(("ill_dlpi_done: %s has completed %s (%u)\n", ill->ill_name,
12778 dl_primstr(ill->ill_dlpi_pending), ill->ill_dlpi_pending));
12780 if ((mp = ill->ill_dlpi_deferred) == NULL) {
12781 ill->ill_dlpi_pending = DL_PRIM_INVAL;
12782 if (ipx->ipx_current_done) {
12783 mutex_enter(&ipx->ipx_lock);
12784 ipx->ipx_current_ipif = NULL;
12785 mutex_exit(&ipx->ipx_lock);
12787 cv_signal(&ill->ill_cv);
12788 mutex_exit(&ill->ill_lock);
12789 return;
12792 ill->ill_dlpi_deferred = mp->b_next;
12793 mp->b_next = NULL;
12794 mutex_exit(&ill->ill_lock);
12796 ill_dlpi_dispatch(ill, mp);
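/*
 * User-space sketch of the single-outstanding-request discipline that
 * ill_dlpi_send() and ill_dlpi_done() implement above: one request may be
 * in flight, later ones are queued in FIFO order, and completing the
 * current request dispatches the next. The channel_t/req_t types and the
 * dispatch() callback are hypothetical; the kernel's locking is elided.
 */
#include <stddef.h>

typedef struct req { struct req *next; } req_t;

typedef struct channel {
	req_t	*pending;	/* request currently awaiting its ack */
	req_t	*deferred;	/* FIFO of requests waiting their turn */
	void	(*dispatch)(req_t *);
} channel_t;

static void
chan_send(channel_t *ch, req_t *r)
{
	req_t **rpp;

	if (ch->pending != NULL) {
		/* busy: tail-insert and wait for chan_done() */
		for (rpp = &ch->deferred; *rpp != NULL; rpp = &(*rpp)->next)
			;
		*rpp = r;
		return;
	}
	ch->pending = r;
	ch->dispatch(r);
}

static void
chan_done(channel_t *ch)
{
	req_t *r;

	if ((r = ch->deferred) == NULL) {
		ch->pending = NULL;	/* nothing queued: go idle */
		return;
	}
	ch->deferred = r->next;
	r->next = NULL;
	ch->pending = r;
	ch->dispatch(r);		/* start the next queued request */
}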
12800 * Queue a (multicast) DLPI control message to be sent to the driver by
12801 * later calling ill_dlpi_send_queued.
12802 * We queue them while holding a lock (ill_mcast_lock) to ensure that they
12803  * are sent in order, i.e., prevent a DL_DISABMULTI_REQ and a
12804  * DL_ENABMULTI_REQ for the same group from racing.
12805 * We send DLPI control messages in order using ill_lock.
12806 * For IPMP we should be called on the cast_ill.
12808 void
12809 ill_dlpi_queue(ill_t *ill, mblk_t *mp)
12811 mblk_t **mpp;
12813 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);
12815 mutex_enter(&ill->ill_lock);
12816 /* Must queue message. Tail insertion */
12817 mpp = &ill->ill_dlpi_deferred;
12818 while (*mpp != NULL)
12819 mpp = &((*mpp)->b_next);
12821 *mpp = mp;
12822 mutex_exit(&ill->ill_lock);
12826 * Send the messages that were queued. Make sure there is only
12827 * one outstanding message. ip_rput_dlpi_writer calls ill_dlpi_done()
12828 * when an ACK or a NAK is received to process the next queued message.
12829  * For IPMP we are called on the upper ill, but we send what is queued
12830  * on the cast_ill.
12832 void
12833 ill_dlpi_send_queued(ill_t *ill)
12835 mblk_t *mp;
12836 union DL_primitives *dlp;
12837 t_uscalar_t prim;
12838 ill_t *release_ill = NULL;
12840 if (IS_IPMP(ill)) {
12841 /* On the upper IPMP ill. */
12842 release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp);
12843 if (release_ill == NULL) {
12844 /* Avoid ever sending anything down to the ipmpstub */
12845 return;
12847 ill = release_ill;
12849 mutex_enter(&ill->ill_lock);
12850 while ((mp = ill->ill_dlpi_deferred) != NULL) {
12851 if (ill->ill_dlpi_pending != DL_PRIM_INVAL) {
12852 /* Can't send. Somebody else will send it */
12853 mutex_exit(&ill->ill_lock);
12854 goto done;
12856 ill->ill_dlpi_deferred = mp->b_next;
12857 mp->b_next = NULL;
12858 if (!ill->ill_dl_up) {
12860 * Nobody there. All multicast addresses will be
12861 * re-joined when we get the DL_BIND_ACK bringing the
12862 * interface up.
12864 freemsg(mp);
12865 continue;
12867 dlp = (union DL_primitives *)mp->b_rptr;
12868 prim = dlp->dl_primitive;
12870 if (!(ill->ill_state_flags & ILL_CONDEMNED) ||
12871 (prim == DL_UNBIND_REQ)) {
12872 ill->ill_dlpi_pending = prim;
12874 mutex_exit(&ill->ill_lock);
12876 DTRACE_PROBE3(ill__dlpi, char *, "ill_dlpi_send_queued",
12877 char *, dl_primstr(prim), ill_t *, ill);
12878 putnext(ill->ill_wq, mp);
12879 mutex_enter(&ill->ill_lock);
12881 mutex_exit(&ill->ill_lock);
12882 done:
12883 if (release_ill != NULL)
12884 ill_refrele(release_ill);
12888 * Queue an IP (IGMP/MLD) message to be sent by IP from
12889 * ill_mcast_send_queued
12890 * We queue them while holding a lock (ill_mcast_lock) to ensure that they
12891  * are sent in order, i.e., prevent an IGMP leave and an IGMP join for
12892  * the same group from racing.
12893 * We send them in order using ill_lock.
12894 * For IPMP we are called on the upper ill, but we queue on the cast_ill.
12896 void
12897 ill_mcast_queue(ill_t *ill, mblk_t *mp)
12899 mblk_t **mpp;
12900 ill_t *release_ill = NULL;
12902 ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock));
12904 if (IS_IPMP(ill)) {
12905 /* On the upper IPMP ill. */
12906 release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp);
12907 if (release_ill == NULL) {
12908 /* Discard instead of queuing for the ipmp interface */
12909 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
12910 ip_drop_output("ipIfStatsOutDiscards - no cast_ill",
12911 mp, ill);
12912 freemsg(mp);
12913 return;
12915 ill = release_ill;
12918 mutex_enter(&ill->ill_lock);
12919 /* Must queue message. Tail insertion */
12920 mpp = &ill->ill_mcast_deferred;
12921 while (*mpp != NULL)
12922 mpp = &((*mpp)->b_next);
12924 *mpp = mp;
12925 mutex_exit(&ill->ill_lock);
12926 if (release_ill != NULL)
12927 ill_refrele(release_ill);
12931 * Send the IP packets that were queued by ill_mcast_queue.
12932 * These are IGMP/MLD packets.
12934  * For IPMP we are called on the upper ill, but we send what is queued
12935  * on the cast_ill.
12937  * Request loopback of the report if we are acting as a multicast
12938  * router, so that the process-level routing daemon can hear it.
12939  * This will run multiple times for the same group if there are members
12940  * on the same group for multiple ipifs on the same ill. The
12941  * igmp_input/mld_input code will suppress duplicates due to the
12942  * loopback, thus we always loop back the membership report.
12944 * We also need to make sure that this does not get load balanced
12945 * by IPMP. We do this by passing an ill to ip_output_simple.
12947 void
12948 ill_mcast_send_queued(ill_t *ill)
12950 mblk_t *mp;
12951 ip_xmit_attr_t ixas;
12952 ill_t *release_ill = NULL;
12954 if (IS_IPMP(ill)) {
12955 /* On the upper IPMP ill. */
12956 release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp);
12957 if (release_ill == NULL) {
12959  * We should have no messages queued on the ipmp interface,
12960  * but there is no point in trying to send any.
12962 return;
12964 ill = release_ill;
12966 bzero(&ixas, sizeof (ixas));
12967 ixas.ixa_zoneid = ALL_ZONES;
12968 ixas.ixa_cred = kcred;
12969 ixas.ixa_cpid = NOPID;
12971  * Here we set ixa_ifindex. For IPMP it will be the lower ill, which
12972  * makes ip_select_route pick the IRE_MULTICAST for the cast_ill.
12973 * That is necessary to handle IGMP/MLD snooping switches.
12975 ixas.ixa_ifindex = ill->ill_phyint->phyint_ifindex;
12976 ixas.ixa_ipst = ill->ill_ipst;
12978 mutex_enter(&ill->ill_lock);
12979 while ((mp = ill->ill_mcast_deferred) != NULL) {
12980 ill->ill_mcast_deferred = mp->b_next;
12981 mp->b_next = NULL;
12982 if (!ill->ill_dl_up) {
12984 * Nobody there. Just drop the ip packets.
12985 * IGMP/MLD will resend later, if this is a replumb.
12987 freemsg(mp);
12988 continue;
12990 mutex_enter(&ill->ill_phyint->phyint_lock);
12991 if (IS_UNDER_IPMP(ill) && !ipmp_ill_is_active(ill)) {
12993 * When the ill is getting deactivated, we only want to
12994 * send the DLPI messages, so drop IGMP/MLD packets.
12995 * DLPI messages are handled by ill_dlpi_send_queued()
12997 mutex_exit(&ill->ill_phyint->phyint_lock);
12998 freemsg(mp);
12999 continue;
13001 mutex_exit(&ill->ill_phyint->phyint_lock);
13002 mutex_exit(&ill->ill_lock);
13004 /* Check whether we are sending IPv4 or IPv6. */
13005 if (ill->ill_isv6) {
13006 ip6_t *ip6h = (ip6_t *)mp->b_rptr;
13008 ixas.ixa_multicast_ttl = ip6h->ip6_hops;
13009 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6;
13010 } else {
13011 ipha_t *ipha = (ipha_t *)mp->b_rptr;
13013 ixas.ixa_multicast_ttl = ipha->ipha_ttl;
13014 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
13015 ixas.ixa_flags &= ~IXAF_SET_ULP_CKSUM;
13017 ixas.ixa_flags &= ~IXAF_VERIFY_SOURCE;
13018 ixas.ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_SOURCE;
13019 (void) ip_output_simple(mp, &ixas);
13020 ixa_cleanup(&ixas);
13022 mutex_enter(&ill->ill_lock);
13024 mutex_exit(&ill->ill_lock);
13026 done:
13027 if (release_ill != NULL)
13028 ill_refrele(release_ill);
13032 * Take down a specific interface, but don't lose any information about it.
13033 * (Always called as writer.)
13034 * This function goes through the down sequence even if the interface is
13035 * already down. There are 2 reasons.
13036 * a. Currently we permit interface routes that depend on down interfaces
13037 * to be added. This behaviour itself is questionable. However it appears
13038 * that both Solaris and 4.3 BSD have exhibited this behaviour for a long
13039 * time. We go thru the cleanup in order to remove these routes.
13040 * b. The bringup of the interface could fail in ill_dl_up i.e. we get
13041 * DL_ERROR_ACK in response to the DL_BIND request. The interface is
13042 * down, but we need to cleanup i.e. do ill_dl_down and
13043 * ip_rput_dlpi_writer (DL_ERROR_ACK) -> ipif_down.
13045 * IP-MT notes:
13047 * Model of reference to interfaces.
13049 * The following members in ipif_t track references to the ipif.
13050 * int ipif_refcnt; Active reference count
13052 * The following members in ill_t track references to the ill.
13053 * int ill_refcnt; active refcnt
13054 * uint_t ill_ire_cnt; Number of ires referencing ill
13055 * uint_t ill_ncec_cnt; Number of ncecs referencing ill
13056 * uint_t ill_nce_cnt; Number of nces referencing ill
13057 * uint_t ill_ilm_cnt; Number of ilms referencing ill
13059 * Reference to an ipif or ill can be obtained in any of the following ways.
13061 * Through the lookup functions ipif_lookup_* / ill_lookup_* functions
13062 * Pointers to ipif / ill from other data structures viz ire and conn.
13063 * Implicit reference to the ipif / ill by holding a reference to the ire.
13065 * The ipif/ill lookup functions return a reference held ipif / ill.
13066 * ipif_refcnt and ill_refcnt track the reference counts respectively.
13067 * This is a purely dynamic reference count associated with threads holding
13068 * references to the ipif / ill. Pointers from other structures do not
13069 * count towards this reference count.
13071 * ill_ire_cnt is the number of ire's associated with the
13072 * ill. This is incremented whenever a new ire is created referencing the
13073 * ill. This is done atomically inside ire_add_v[46] where the ire is
13074 * actually added to the ire hash table. The count is decremented in
13075 * ire_inactive where the ire is destroyed.
13077 * ill_ncec_cnt is the number of ncec's referencing the ill thru ncec_ill.
13078 * This is incremented atomically in
13079 * ndp_add_v4()/ndp_add_v6() where the nce is actually added to the
13080 * table. Similarly it is decremented in ncec_inactive() where the ncec
13081 * is destroyed.
13083 * ill_nce_cnt is the number of nce's referencing the ill thru nce_ill. This is
13084 * incremented atomically in nce_add() where the nce is actually added to the
13085 * ill_nce. Similarly it is decremented in nce_inactive() where the nce
13086 * is destroyed.
13088 * ill_ilm_cnt is the ilm's reference to the ill. It is incremented in
13089 * ilm_add() and decremented before the ilm is freed in ilm_delete().
13091 * Flow of ioctls involving interface down/up
13093 * The following is the sequence of an attempt to set some critical flags on an
13094 * up interface.
13095 * ip_sioctl_flags
13096 * ipif_down
13097 * wait for ipif to be quiescent
13098 * ipif_down_tail
13099 * ip_sioctl_flags_tail
13101 * All set ioctls that involve down/up sequence would have a skeleton similar
13102 * to the above. All the *tail functions are called after the refcounts have
13103 * dropped to the appropriate values.
13105 * SIOC ioctls during the IPIF_CHANGING interval.
13107 * Threads handling SIOC set ioctls serialize on the squeue, but this
13108 * is not done for SIOC get ioctls. Since a set ioctl can cause several
13109 * steps of internal changes to the state, some of which are visible in
13110 * ipif_flags (such as IFF_UP being cleared and later set), and we want
13111 * the set ioctl to be atomic related to the get ioctls, the SIOC get code
13112 * will wait and restart ioctls if IPIF_CHANGING is set. The mblk is then
13113 * enqueued in the ipsq and the operation is restarted by ipsq_exit() when
13114 * the current exclusive operation completes. The IPIF_CHANGING check
13115 * and enqueue is atomic using the ill_lock and ipsq_lock. The
13116 * lookup is done holding the ill_lock. Hence the ill/ipif state flags can't
13117 * change while the ill_lock is held. Before dropping the ill_lock we acquire
13118 * the ipsq_lock and call ipsq_enq. This ensures that ipsq_exit can't finish
13119 * until we release the ipsq_lock, even though the ill/ipif state flags
13120 * can change after we drop the ill_lock.
13123 ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp)
13125 ill_t *ill = ipif->ipif_ill;
13126 conn_t *connp;
13127 boolean_t success;
13128 boolean_t ipif_was_up = B_FALSE;
13129 ip_stack_t *ipst = ill->ill_ipst;
13131 ASSERT(IAM_WRITER_IPIF(ipif));
13133 ip1dbg(("ipif_down(%s:%u)\n", ill->ill_name, ipif->ipif_id));
13135 DTRACE_PROBE3(ipif__downup, char *, "ipif_down",
13136 ill_t *, ill, ipif_t *, ipif);
13138 if (ipif->ipif_flags & IPIF_UP) {
13139 mutex_enter(&ill->ill_lock);
13140 ipif->ipif_flags &= ~IPIF_UP;
13141 ASSERT(ill->ill_ipif_up_count > 0);
13142 --ill->ill_ipif_up_count;
13143 mutex_exit(&ill->ill_lock);
13144 ipif_was_up = B_TRUE;
13145 /* Update status in SCTP's list */
13146 sctp_update_ipif(ipif, SCTP_IPIF_DOWN);
13147 ill_nic_event_dispatch(ipif->ipif_ill,
13148 MAP_IPIF_ID(ipif->ipif_id), NE_LIF_DOWN, NULL, 0);
13152 * Removal of the last ipif from an ill may result in a DL_UNBIND
13153 * being sent to the driver, and we must not send any data packets to
13154 * the driver after the DL_UNBIND_REQ. To ensure this, all the
13155 * ire and nce entries used in the data path will be cleaned
13156 * up, and we also set the ILL_DOWN_IN_PROGRESS bit to make
13157  * sure no new entries will be added until the ill is bound
13158 * again. The ILL_DOWN_IN_PROGRESS bit is turned off upon
13159 * receipt of a DL_BIND_ACK.
13161 if (ill->ill_wq != NULL && !ill->ill_logical_down &&
13162 ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0 &&
13163 ill->ill_dl_up) {
13164 ill->ill_state_flags |= ILL_DOWN_IN_PROGRESS;
13168 * Blow away memberships we established in ipif_multicast_up().
13170 ipif_multicast_down(ipif);
13173 * Remove from the mapping for __sin6_src_id. We insert only
13174 * when the address is not INADDR_ANY. As IPv4 addresses are
13175 * stored as mapped addresses, we need to check for mapped
13176 * INADDR_ANY also.
13178 if (ipif_was_up && !IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) &&
13179 !IN6_IS_ADDR_V4MAPPED_ANY(&ipif->ipif_v6lcl_addr) &&
13180 !(ipif->ipif_flags & IPIF_NOLOCAL)) {
13181 int err;
13183 err = ip_srcid_remove(&ipif->ipif_v6lcl_addr,
13184 ipif->ipif_zoneid, ipst);
13185 if (err != 0) {
13186 ip0dbg(("ipif_down: srcid_remove %d\n", err));
13190 if (ipif_was_up) {
13191 /* only delete if we'd added ire's before */
13192 if (ipif->ipif_isv6)
13193 ipif_delete_ires_v6(ipif);
13194 else
13195 ipif_delete_ires_v4(ipif);
13198 if (ipif_was_up && ill->ill_ipif_up_count == 0) {
13200 * Since the interface is now down, it may have just become
13201 * inactive. Note that this needs to be done even for a
13202  * ipif_logical_down(), or ARP entries will not get correctly
13203 * restored when the interface comes back up.
13205 if (IS_UNDER_IPMP(ill))
13206 ipmp_ill_refresh_active(ill);
13210  * Delete the neighbor-discovery or arp entries for this interface. The ipif
13211  * has to be quiesced, so we walk all the nce's and delete those
13212  * that point at the ipif->ipif_ill. At the same time, we also
13213  * update IPMP so that ipifs for data addresses are unbound. We don't
13214  * call ipif_arp_down to DL_UNBIND the arp stream itself here, but defer
13215  * that to ipif_down_tail().
13217 ipif_nce_down(ipif);
13220 * If this is the last ipif on the ill, we also need to remove
13221 * any IREs with ire_ill set. Otherwise ipif_is_quiescent() will
13222 * never succeed.
13224 if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0)
13225 ire_walk_ill(0, 0, ill_downi, ill, ill);
13228 * Walk all CONNs that can have a reference on an ire for this
13229 * ipif (we actually walk all that now have stale references).
13231 ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ipst);
13234 * If mp is NULL the caller will wait for the appropriate refcnt.
13235 * Eg. ip_sioctl_removeif -> ipif_free -> ipif_down
13236 * and ill_delete -> ipif_free -> ipif_down
13238 if (mp == NULL) {
13239 ASSERT(q == NULL);
13240 return (0);
13243 if (CONN_Q(q)) {
13244 connp = Q_TO_CONN(q);
13245 mutex_enter(&connp->conn_lock);
13246 } else {
13247 connp = NULL;
13249 mutex_enter(&ill->ill_lock);
13251 * Are there any ire's pointing to this ipif that are still active ?
13252 * If this is the last ipif going down, are there any ire's pointing
13253 * to this ill that are still active ?
13255 if (ipif_is_quiescent(ipif)) {
13256 mutex_exit(&ill->ill_lock);
13257 if (connp != NULL)
13258 mutex_exit(&connp->conn_lock);
13259 return (0);
13262 ip1dbg(("ipif_down: need to wait, adding pending mp %s ill %p",
13263 ill->ill_name, (void *)ill));
13265 * Enqueue the mp atomically in ipsq_pending_mp. When the refcount
13266 * drops down, the operation will be restarted by ipif_ill_refrele_tail
13267 * which in turn is called by the last refrele on the ipif/ill/ire.
13269 success = ipsq_pending_mp_add(connp, ipif, q, mp, IPIF_DOWN);
13270 if (!success) {
13271 /* The conn is closing. So just return */
13272 ASSERT(connp != NULL);
13273 mutex_exit(&ill->ill_lock);
13274 mutex_exit(&connp->conn_lock);
13275 return (EINTR);
13278 mutex_exit(&ill->ill_lock);
13279 if (connp != NULL)
13280 mutex_exit(&connp->conn_lock);
13281 return (EINPROGRESS);
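/*
 * Hedged sketch of the quiesce-or-defer pattern that ipif_down() ends
 * with: if the object is already quiescent the caller can finish at once;
 * otherwise a continuation is parked and EINPROGRESS tells the caller the
 * operation will be restarted when the last reference is released. All
 * names below are hypothetical stand-ins, with locking omitted.
 */
#include <errno.h>
#include <stddef.h>

typedef struct obj {
	int	refcnt;
	void	(*cont)(struct obj *);	/* parked continuation, if any */
} obj_t;

static int
obj_teardown(obj_t *o, void (*cont)(obj_t *))
{
	if (o->refcnt == 0)
		return (0);		/* quiescent: finish synchronously */
	o->cont = cont;			/* park the rest of the work */
	return (EINPROGRESS);		/* last refrele will restart it */
}

static void
obj_refrele(obj_t *o)
{
	if (--o->refcnt == 0 && o->cont != NULL)
		o->cont(o);		/* run the deferred tail */
}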
13285 ipif_down_tail(ipif_t *ipif)
13287 ill_t *ill = ipif->ipif_ill;
13288 int err = 0;
13290 DTRACE_PROBE3(ipif__downup, char *, "ipif_down_tail",
13291 ill_t *, ill, ipif_t *, ipif);
13294 * Skip any loopback interface (null wq).
13295  * If this is the last logical interface on the ill,
13296  * have ill_dl_down tell the driver we are gone (unbind).
13297  * Note that lun 0 can ipif_down even though
13298  * there are other logical units that are up.
13299 * This occurs e.g. when we change a "significant" IFF_ flag.
13301 if (ill->ill_wq != NULL && !ill->ill_logical_down &&
13302 ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0 &&
13303 ill->ill_dl_up) {
13304 ill_dl_down(ill);
13306 if (!ipif->ipif_isv6)
13307 err = ipif_arp_down(ipif);
13309 ill->ill_logical_down = 0;
13311 ip_rts_ifmsg(ipif, RTSQ_DEFAULT);
13312 ip_rts_newaddrmsg(RTM_DELETE, 0, ipif, RTSQ_DEFAULT);
13313 return (err);
13317 * Bring interface logically down without bringing the physical interface
13318  * down, e.g. when the netmask is changed. This avoids long-lasting link
13319  * negotiations between an ethernet interface and certain switches.
13321 static int
13322 ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp)
13324 DTRACE_PROBE3(ipif__downup, char *, "ipif_logical_down",
13325 ill_t *, ipif->ipif_ill, ipif_t *, ipif);
13328 * The ill_logical_down flag is a transient flag. It is set here
13329 * and is cleared once the down has completed in ipif_down_tail.
13330 * This flag does not indicate whether the ill stream is in the
13331 * DL_BOUND state with the driver. Instead this flag is used by
13332 * ipif_down_tail to determine whether to DL_UNBIND the stream with
13333 * the driver. The state of the ill stream i.e. whether it is
13334 * DL_BOUND with the driver or not is indicated by the ill_dl_up flag.
13336 ipif->ipif_ill->ill_logical_down = 1;
13337 return (ipif_down(ipif, q, mp));
13341 * Initiate deallocate of an IPIF. Always called as writer. Called by
13342 * ill_delete or ip_sioctl_removeif.
13344 static void
13345 ipif_free(ipif_t *ipif)
13347 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
13349 ASSERT(IAM_WRITER_IPIF(ipif));
13351 if (ipif->ipif_recovery_id != 0)
13352 (void) untimeout(ipif->ipif_recovery_id);
13353 ipif->ipif_recovery_id = 0;
13356 * Take down the interface. We can be called either from ill_delete
13357 * or from ip_sioctl_removeif.
13359 (void) ipif_down(ipif, NULL, NULL);
13362 * Now that the interface is down, there's no chance it can still
13363 * become a duplicate. Cancel any timer that may have been set while
13364 * tearing down.
13366 if (ipif->ipif_recovery_id != 0)
13367 (void) untimeout(ipif->ipif_recovery_id);
13368 ipif->ipif_recovery_id = 0;
13370 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
13371 /* Remove pointers to this ill in the multicast routing tables */
13372 reset_mrt_vif_ipif(ipif);
13373 /* If necessary, clear the cached source ipif rotor. */
13374 if (ipif->ipif_ill->ill_src_ipif == ipif)
13375 ipif->ipif_ill->ill_src_ipif = NULL;
13376 rw_exit(&ipst->ips_ill_g_lock);
13379 static void
13380 ipif_free_tail(ipif_t *ipif)
13382 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
13385 * Need to hold both ill_g_lock and ill_lock while
13386 * inserting or removing an ipif from the linked list
13387 * of ipifs hanging off the ill.
13389 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
13391 #ifdef DEBUG
13392 ipif_trace_cleanup(ipif);
13393 #endif
13395  /* Ask SCTP to take it out of its list */
13396 sctp_update_ipif(ipif, SCTP_IPIF_REMOVE);
13397 ip_rts_newaddrmsg(RTM_FREEADDR, 0, ipif, RTSQ_DEFAULT);
13399 /* Get it out of the ILL interface list. */
13400 ipif_remove(ipif);
13401 rw_exit(&ipst->ips_ill_g_lock);
13403 ASSERT(!(ipif->ipif_flags & (IPIF_UP | IPIF_DUPLICATE)));
13404 ASSERT(ipif->ipif_recovery_id == 0);
13405 ASSERT(ipif->ipif_ire_local == NULL);
13406 ASSERT(ipif->ipif_ire_if == NULL);
13408 /* Free the memory. */
13409 mi_free(ipif);
13413 * Sets `buf' to an ipif name of the form "ill_name:id", or "ill_name" if "id"
13414 * is zero.
13416 void
13417 ipif_get_name(const ipif_t *ipif, char *buf, int len)
13419 char lbuf[LIFNAMSIZ];
13420 char *name;
13421 size_t name_len;
13423 buf[0] = '\0';
13424 name = ipif->ipif_ill->ill_name;
13425 name_len = ipif->ipif_ill->ill_name_length;
13426 if (ipif->ipif_id != 0) {
13427 (void) sprintf(lbuf, "%s%c%d", name, IPIF_SEPARATOR_CHAR,
13428 ipif->ipif_id);
13429 name = lbuf;
13430 name_len = mi_strlen(name) + 1;
13432 len -= 1;
13433 buf[len] = '\0';
13434 len = MIN(len, name_len);
13435 bcopy(name, buf, len);
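/*
 * A minimal sketch of the "ill_name[:id]" formatting that ipif_get_name()
 * performs above, using snprintf truncation in place of the explicit
 * terminate-and-bcopy dance. Purely illustrative.
 */
#include <stdio.h>

static void
format_lifname(const char *ill_name, unsigned int id, char *buf, size_t len)
{
	if (id == 0)
		(void) snprintf(buf, len, "%s", ill_name);
	else
		(void) snprintf(buf, len, "%s:%u", ill_name, id);
}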
13439 * Sets `buf' to an ill name.
13441 void
13442 ill_get_name(const ill_t *ill, char *buf, int len)
13444 char *name;
13445 size_t name_len;
13447 name = ill->ill_name;
13448 name_len = ill->ill_name_length;
13449 len -= 1;
13450 buf[len] = '\0';
13451 len = MIN(len, name_len);
13452 bcopy(name, buf, len);
13456 * Find an IPIF based on the name passed in. Names can be of the form <phys>
13457 * (e.g., le0) or <phys>:<#> (e.g., le0:1). When there is no colon, the
13458 * implied unit id is zero. <phys> must correspond to the name of an ILL.
13459 * (May be called as writer.)
13461 static ipif_t *
13462 ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc,
13463 boolean_t *exists, boolean_t isv6, zoneid_t zoneid, ip_stack_t *ipst)
13465 char *cp;
13466 char *endp;
13467 long id;
13468 ill_t *ill;
13469 ipif_t *ipif;
13470 uint_t ire_type;
13471 boolean_t did_alloc = B_FALSE;
13472 char last;
13475  * If the caller wants us to create the ipif, make sure we have a
13476  * valid zoneid.
13478 ASSERT(!do_alloc || zoneid != ALL_ZONES);
13480 if (namelen == 0) {
13481 return (NULL);
13484 *exists = B_FALSE;
13485 /* Look for a colon in the name. */
13486 endp = &name[namelen];
13487 for (cp = endp; --cp > name; ) {
13488 if (*cp == IPIF_SEPARATOR_CHAR)
13489 break;
13492 if (*cp == IPIF_SEPARATOR_CHAR) {
13494 * Reject any non-decimal aliases for logical
13495 * interfaces. Aliases with leading zeroes
13496 * are also rejected as they introduce ambiguity
13497 * in the naming of the interfaces.
12498  * In order to conform to existing semantics,
12499  * and to not break any programs/scripts relying
12500  * on that behaviour, if<0>:0 is considered to be
12501  * a valid interface.
12503  * If the alias has two or more digits and the first
12504  * is zero, fail.
13506 if (&cp[2] < endp && cp[1] == '0') {
13507 return (NULL);
13511 if (cp <= name) {
13512 cp = endp;
13514 last = *cp;
13515 *cp = '\0';
13518 * Look up the ILL, based on the portion of the name
13519  * before the colon. ill_lookup_on_name returns a held ill.
13520  * `did_alloc' indicates whether a new ill had to be
13521  * allocated (i.e., the ill did not exist already).
13523 ill = ill_lookup_on_name(name, do_alloc, isv6,
13524 &did_alloc, ipst);
13525 *cp = last;
13526 if (ill == NULL)
13527 return (NULL);
13529 /* Establish the unit number in the name. */
13530 id = 0;
13531 if (cp < endp && *endp == '\0') {
13532 /* If there was a colon, the unit number follows. */
13533 cp++;
13534 if (ddi_strtol(cp, NULL, 0, &id) != 0) {
13535 ill_refrele(ill);
13536 return (NULL);
13540 mutex_enter(&ill->ill_lock);
13541 /* Now see if there is an IPIF with this unit number. */
13542 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
13543 if (ipif->ipif_id == id) {
13544 if (zoneid != ALL_ZONES &&
13545 zoneid != ipif->ipif_zoneid &&
13546 ipif->ipif_zoneid != ALL_ZONES) {
13547 mutex_exit(&ill->ill_lock);
13548 ill_refrele(ill);
13549 return (NULL);
13551 if (IPIF_CAN_LOOKUP(ipif)) {
13552 ipif_refhold_locked(ipif);
13553 mutex_exit(&ill->ill_lock);
13554 if (!did_alloc)
13555 *exists = B_TRUE;
13557 * Drop locks before calling ill_refrele
13558 * since it can potentially call into
13559  * ipif_ill_refrele_tail, which can end up
13560  * trying to acquire any lock.
13562 ill_refrele(ill);
13563 return (ipif);
13568 if (!do_alloc) {
13569 mutex_exit(&ill->ill_lock);
13570 ill_refrele(ill);
13571 return (NULL);
13575 * If none found, atomically allocate and return a new one.
13576 * Historically, we used IRE_LOOPBACK only for lun 0, and IRE_LOCAL
13577 * to support "receive only" use of lo0:1 etc. as is still done
13578 * below as an initial guess.
13579  * However, this is now likely to be overridden later in ipif_up_done()
13580 * when we know for sure what address has been configured on the
13581 * interface, since we might have more than one loopback interface
13582 * with a loopback address, e.g. in the case of zones, and all the
13583 * interfaces with loopback addresses need to be marked IRE_LOOPBACK.
13585 if (ill->ill_net_type == IRE_LOOPBACK && id == 0)
13586 ire_type = IRE_LOOPBACK;
13587 else
13588 ire_type = IRE_LOCAL;
13589 ipif = ipif_allocate(ill, id, ire_type, B_TRUE, B_TRUE, NULL);
13590 if (ipif != NULL)
13591 ipif_refhold_locked(ipif);
13592 mutex_exit(&ill->ill_lock);
13593 ill_refrele(ill);
13594 return (ipif);
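/*
 * Sketch of the naming rules enforced above, as a standalone predicate:
 * a logical interface name is <phys>[:<id>] where <id> is decimal with no
 * leading zero (except the single digit "0" itself). So "hme0", "hme0:1"
 * and "hme0:0" parse, while "hme0:01" and "hme0:" are rejected. This is
 * an approximation for illustration, not the kernel's exact parser.
 */
#include <ctype.h>
#include <string.h>

static int
parse_lifname(const char *name, long *idp)
{
	const char *cp = strrchr(name, ':');

	*idp = 0;
	if (cp == NULL)
		return (1);		/* no colon: implied unit 0 */
	if (cp[1] == '\0' || (cp[1] == '0' && cp[2] != '\0'))
		return (0);		/* empty id, or leading zero */
	for (cp++; *cp != '\0'; cp++) {
		if (!isdigit((unsigned char)*cp))
			return (0);	/* reject non-decimal aliases */
		*idp = *idp * 10 + (*cp - '0');
	}
	return (1);
}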
13598 * Variant of the above that queues the request on the ipsq when
13599 * IPIF_CHANGING is set.
13601 static ipif_t *
13602 ipif_lookup_on_name_async(char *name, size_t namelen, boolean_t isv6,
13603 zoneid_t zoneid, queue_t *q, mblk_t *mp, ipsq_func_t func, int *error,
13604 ip_stack_t *ipst)
13606 char *cp;
13607 char *endp;
13608 long id;
13609 ill_t *ill;
13610 ipif_t *ipif;
13611 boolean_t did_alloc = B_FALSE;
13612 ipsq_t *ipsq;
13614 if (error != NULL)
13615 *error = 0;
13617 if (namelen == 0) {
13618 if (error != NULL)
13619 *error = ENXIO;
13620 return (NULL);
13623 /* Look for a colon in the name. */
13624 endp = &name[namelen];
13625 for (cp = endp; --cp > name; ) {
13626 if (*cp == IPIF_SEPARATOR_CHAR)
13627 break;
13630 if (*cp == IPIF_SEPARATOR_CHAR) {
13632 * Reject any non-decimal aliases for logical
13633 * interfaces. Aliases with leading zeroes
13634 * are also rejected as they introduce ambiguity
13635 * in the naming of the interfaces.
13636 * In order to conform to existing semantics,
13637 * and to not break any programs/scripts relying
13638 * on that behaviour, if<0>:0 is considered to be
13639 * a valid interface.
13641 * If alias has two or more digits and the first
13642 * is zero, fail.
13644 if (&cp[2] < endp && cp[1] == '0') {
13645 if (error != NULL)
13646 *error = EINVAL;
13647 return (NULL);
13651 if (cp <= name) {
13652 cp = endp;
13653 } else {
13654 *cp = '\0';
13658 * Look up the ILL, based on the portion of the name
13659 * before the colon. ill_lookup_on_name returns a held ill.
13660 * did_alloc is a temporary used to check whether the ill
13661 * exists already; if so, ill_lookup_on_name will clear it.
13663 ill = ill_lookup_on_name(name, B_FALSE, isv6, &did_alloc, ipst);
13664 if (cp != endp)
13665 *cp = IPIF_SEPARATOR_CHAR;
13666 if (ill == NULL)
13667 return (NULL);
13669 /* Establish the unit number in the name. */
13670 id = 0;
13671 if (cp < endp && *endp == '\0') {
13672 /* If there was a colon, the unit number follows. */
13673 cp++;
13674 if (ddi_strtol(cp, NULL, 0, &id) != 0) {
13675 ill_refrele(ill);
13676 if (error != NULL)
13677 *error = ENXIO;
13678 return (NULL);
13682 GRAB_CONN_LOCK(q);
13683 mutex_enter(&ill->ill_lock);
13684 /* Now see if there is an IPIF with this unit number. */
13685 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
13686 if (ipif->ipif_id == id) {
13687 if (zoneid != ALL_ZONES &&
13688 zoneid != ipif->ipif_zoneid &&
13689 ipif->ipif_zoneid != ALL_ZONES) {
13690 mutex_exit(&ill->ill_lock);
13691 RELEASE_CONN_LOCK(q);
13692 ill_refrele(ill);
13693 if (error != NULL)
13694 *error = ENXIO;
13695 return (NULL);
13698 if (!(IPIF_IS_CHANGING(ipif) ||
13699 IPIF_IS_CONDEMNED(ipif)) ||
13700 IAM_WRITER_IPIF(ipif)) {
13701 ipif_refhold_locked(ipif);
13702 mutex_exit(&ill->ill_lock);
13704 * Drop locks before calling ill_refrele
13705 * since it can potentially call into
13706 * ipif_ill_refrele_tail which can end up
13707 * in trying to acquire any lock.
13709 RELEASE_CONN_LOCK(q);
13710 ill_refrele(ill);
13711 return (ipif);
13712 } else if (q != NULL && !IPIF_IS_CONDEMNED(ipif)) {
13713 ipsq = ill->ill_phyint->phyint_ipsq;
13714 mutex_enter(&ipsq->ipsq_lock);
13715 mutex_enter(&ipsq->ipsq_xop->ipx_lock);
13716 mutex_exit(&ill->ill_lock);
13717 ipsq_enq(ipsq, q, mp, func, NEW_OP, ill);
13718 mutex_exit(&ipsq->ipsq_xop->ipx_lock);
13719 mutex_exit(&ipsq->ipsq_lock);
13720 RELEASE_CONN_LOCK(q);
13721 ill_refrele(ill);
13722 if (error != NULL)
13723 *error = EINPROGRESS;
13724 return (NULL);
13728 RELEASE_CONN_LOCK(q);
13729 mutex_exit(&ill->ill_lock);
13730 ill_refrele(ill);
13731 if (error != NULL)
13732 *error = ENXIO;
13733 return (NULL);
13737 * This routine is called whenever a new address comes up on an ipif. If
13738 * we are configured to respond to address mask requests, then we are supposed
13739 * to broadcast an address mask reply at this time. This routine is also
13740 * called if we are already up, but a netmask change is made. This is legal
13741 * but might not make the system manager very popular. (May be called
13742 * as writer.)
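* The reply is a minimal IPv4 datagram, laid out as sketched below
* (all values taken from the code that follows):
*
*	ipha_t	src = ipif_lcl_addr, dst = ipif_brd_addr,
*		ttl = ips_ip_broadcast_ttl
*	icmph_t	type = ICMP_ADDRESS_MASK_REPLY
*	4 bytes	the netmask (ipif_net_mask)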
13744 void
13745 ipif_mask_reply(ipif_t *ipif)
13747 icmph_t *icmph;
13748 ipha_t *ipha;
13749 mblk_t *mp;
13750 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
13751 ip_xmit_attr_t ixas;
13753 #define REPLY_LEN (sizeof (icmp_ipha) + sizeof (icmph_t) + IP_ADDR_LEN)
13755 if (!ipst->ips_ip_respond_to_address_mask_broadcast)
13756 return;
13758 /* ICMP mask reply is IPv4 only */
13759 ASSERT(!ipif->ipif_isv6);
13760 /* ICMP mask reply is not for a loopback interface */
13761 ASSERT(ipif->ipif_ill->ill_wq != NULL);
13763 if (ipif->ipif_lcl_addr == INADDR_ANY)
13764 return;
13766 mp = allocb(REPLY_LEN, BPRI_HI);
13767 if (mp == NULL)
13768 return;
13769 mp->b_wptr = mp->b_rptr + REPLY_LEN;
13771 ipha = (ipha_t *)mp->b_rptr;
13772 bzero(ipha, REPLY_LEN);
13773 *ipha = icmp_ipha;
13774 ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl;
13775 ipha->ipha_src = ipif->ipif_lcl_addr;
13776 ipha->ipha_dst = ipif->ipif_brd_addr;
13777 ipha->ipha_length = htons(REPLY_LEN);
13778 ipha->ipha_ident = 0;
13780 icmph = (icmph_t *)&ipha[1];
13781 icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY;
13782 bcopy(&ipif->ipif_net_mask, &icmph[1], IP_ADDR_LEN);
13783 icmph->icmph_checksum = IP_CSUM(mp, sizeof (ipha_t), 0);
13785 bzero(&ixas, sizeof (ixas));
13786 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
13787 ixas.ixa_zoneid = ALL_ZONES;
13788 ixas.ixa_ifindex = 0;
13789 ixas.ixa_ipst = ipst;
13790 ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
13791 (void) ip_output_simple(mp, &ixas);
13792 ixa_cleanup(&ixas);
13793 #undef REPLY_LEN
13797 * Join the ipif specific multicast groups.
13798 * Must be called after a mapping has been set up in the resolver. (Always
13799 * called as writer.)
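* For IPv4 this is the all-hosts group 224.0.0.1.  For IPv6 it is
* the all-hosts group ff02::1 plus the solicited-node group formed
* from the low-order bits of the local address (RFC 4291); e.g. a
* hypothetical address 2001:db8::1234:5678 would join
* ff02::1:ff34:5678.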
13801 void
13802 ipif_multicast_up(ipif_t *ipif)
13804 int err;
13805 ill_t *ill;
13806 ilm_t *ilm;
13808 ASSERT(IAM_WRITER_IPIF(ipif));
13810 ill = ipif->ipif_ill;
13812 ip1dbg(("ipif_multicast_up\n"));
13813 if (!(ill->ill_flags & ILLF_MULTICAST) ||
13814 ipif->ipif_allhosts_ilm != NULL)
13815 return;
13817 if (ipif->ipif_isv6) {
13818 in6_addr_t v6allmc = ipv6_all_hosts_mcast;
13819 in6_addr_t v6solmc = ipv6_solicited_node_mcast;
13821 v6solmc.s6_addr32[3] |= ipif->ipif_v6lcl_addr.s6_addr32[3];
13823 if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr))
13824 return;
13826 ip1dbg(("ipif_multicast_up - addmulti\n"));
13829 * Join the all hosts multicast address. We skip this for
13830 * underlying IPMP interfaces since they should be invisible.
13832 if (!IS_UNDER_IPMP(ill)) {
13833 ilm = ip_addmulti(&v6allmc, ill, ipif->ipif_zoneid,
13834 &err);
13835 if (ilm == NULL) {
13836 ASSERT(err != 0);
13837 ip0dbg(("ipif_multicast_up: "
13838 "all_hosts_mcast failed %d\n", err));
13839 return;
13841 ipif->ipif_allhosts_ilm = ilm;
13845 * Enable multicast for the solicited node multicast address.
13846 * If IPMP we need to put the membership on the upper ill.
13848 if (!(ipif->ipif_flags & IPIF_NOLOCAL)) {
13849 ill_t *mcast_ill = NULL;
13850 boolean_t need_refrele;
13852 if (IS_UNDER_IPMP(ill) &&
13853 (mcast_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) {
13854 need_refrele = B_TRUE;
13855 } else {
13856 mcast_ill = ill;
13857 need_refrele = B_FALSE;
13860 ilm = ip_addmulti(&v6solmc, mcast_ill,
13861 ipif->ipif_zoneid, &err);
13862 if (need_refrele)
13863 ill_refrele(mcast_ill);
13865 if (ilm == NULL) {
13866 ASSERT(err != 0);
13867 ip0dbg(("ipif_multicast_up: solicited MC"
13868 " failed %d\n", err));
13869 if ((ilm = ipif->ipif_allhosts_ilm) != NULL) {
13870 ipif->ipif_allhosts_ilm = NULL;
13871 (void) ip_delmulti(ilm);
13873 return;
13875 ipif->ipif_solmulti_ilm = ilm;
13877 } else {
13878 in6_addr_t v6group;
13880 if (ipif->ipif_lcl_addr == INADDR_ANY || IS_UNDER_IPMP(ill))
13881 return;
13883 /* Join the all hosts multicast address */
13884 ip1dbg(("ipif_multicast_up - addmulti\n"));
13885 IN6_IPADDR_TO_V4MAPPED(htonl(INADDR_ALLHOSTS_GROUP), &v6group);
13887 ilm = ip_addmulti(&v6group, ill, ipif->ipif_zoneid, &err);
13888 if (ilm == NULL) {
13889 ASSERT(err != 0);
13890 ip0dbg(("ipif_multicast_up: failed %d\n", err));
13891 return;
13893 ipif->ipif_allhosts_ilm = ilm;
13898 * Blow away any multicast groups that we joined in ipif_multicast_up().
13899 * (ilms from explicit memberships are handled in conn_update_ill.)
13901 void
13902 ipif_multicast_down(ipif_t *ipif)
13904 ASSERT(IAM_WRITER_IPIF(ipif));
13906 ip1dbg(("ipif_multicast_down\n"));
13908 if (ipif->ipif_allhosts_ilm != NULL) {
13909 (void) ip_delmulti(ipif->ipif_allhosts_ilm);
13910 ipif->ipif_allhosts_ilm = NULL;
13912 if (ipif->ipif_solmulti_ilm != NULL) {
13913 (void) ip_delmulti(ipif->ipif_solmulti_ilm);
13914 ipif->ipif_solmulti_ilm = NULL;
13919 * Used when an interface comes up to recreate any extra routes on this
13920 * interface.
13923 ill_recover_saved_ire(ill_t *ill)
13925 mblk_t *mp;
13926 ip_stack_t *ipst = ill->ill_ipst;
13928 ip1dbg(("ill_recover_saved_ire(%s)", ill->ill_name));
13930 mutex_enter(&ill->ill_saved_ire_lock);
13931 for (mp = ill->ill_saved_ire_mp; mp != NULL; mp = mp->b_cont) {
13932 ire_t *ire, *nire;
13933 ifrt_t *ifrt;
13935 ifrt = (ifrt_t *)mp->b_rptr;
13937 * Create a copy of the IRE with the saved address and netmask.
13939 if (ill->ill_isv6) {
13940 ire = ire_create_v6(
13941 &ifrt->ifrt_v6addr,
13942 &ifrt->ifrt_v6mask,
13943 &ifrt->ifrt_v6gateway_addr,
13944 ifrt->ifrt_type,
13945 ill,
13946 ifrt->ifrt_zoneid,
13947 ifrt->ifrt_flags,
13948 ipst);
13949 } else {
13950 ire = ire_create(
13951 (uint8_t *)&ifrt->ifrt_addr,
13952 (uint8_t *)&ifrt->ifrt_mask,
13953 (uint8_t *)&ifrt->ifrt_gateway_addr,
13954 ifrt->ifrt_type,
13955 ill,
13956 ifrt->ifrt_zoneid,
13957 ifrt->ifrt_flags,
13958 ipst);
13960 if (ire == NULL) {
13961 mutex_exit(&ill->ill_saved_ire_lock);
13962 return (ENOMEM);
13965 if (ifrt->ifrt_flags & RTF_SETSRC) {
13966 if (ill->ill_isv6) {
13967 ire->ire_setsrc_addr_v6 =
13968 ifrt->ifrt_v6setsrc_addr;
13969 } else {
13970 ire->ire_setsrc_addr = ifrt->ifrt_setsrc_addr;
13975 * Some software (for example, GateD and Sun Cluster) attempts
13976 * to create (what amount to) IRE_PREFIX routes with the
13977 * loopback address as the gateway. This is primarily done to
13978 * set up prefixes with the RTF_REJECT flag set (for example,
13979 * when generating aggregate routes.)
13981 * If the IRE type (as defined by ill->ill_net_type) is
13982 * IRE_LOOPBACK, then we map the request into a
13983 * IRE_IF_NORESOLVER.
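* For instance, a hypothetical RTF_REJECT aggregate for 10.0.0.0/8
* with 127.0.0.1 as the gateway lands on the loopback ill and is
* remapped to an IRE_IF_NORESOLVER below.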
13985 if (ill->ill_net_type == IRE_LOOPBACK)
13986 ire->ire_type = IRE_IF_NORESOLVER;
13989 * The ire is held by ire_add; it will be refrele'd toward
13990 * the end of ipif_up_done.
13992 nire = ire_add(ire);
13994 * Check if it was a duplicate entry. This handles
13995 * the case of two racing route adds for the same route
13997 if (nire == NULL) {
13998 ip1dbg(("ill_recover_saved_ire: FAILED\n"));
13999 } else if (nire != ire) {
14000 ip1dbg(("ill_recover_saved_ire: duplicate ire %p\n",
14001 (void *)nire));
14002 ire_delete(nire);
14003 } else {
14004 ip1dbg(("ill_recover_saved_ire: added ire %p\n",
14005 (void *)nire));
14007 if (nire != NULL)
14008 ire_refrele(nire);
14010 mutex_exit(&ill->ill_saved_ire_lock);
14011 return (0);
14015 * Used to set the netmask and broadcast address to default values when the
14016 * interface is brought up. (Always called as writer.)
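* For example, a hypothetical IPv4 address 192.168.1.5 with no
* netmask configured gets the natural class C mask 255.255.255.0,
* and (when IPIF_BROADCAST is set) a default broadcast address of
* subnet | ~mask = 192.168.1.255.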
14018 static void
14019 ipif_set_default(ipif_t *ipif)
14021 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
14023 if (!ipif->ipif_isv6) {
14025 * Interface holds an IPv4 address. Default
14026 * mask is the natural netmask.
14028 if (!ipif->ipif_net_mask) {
14029 ipaddr_t v4mask;
14031 v4mask = ip_net_mask(ipif->ipif_lcl_addr);
14032 V4MASK_TO_V6(v4mask, ipif->ipif_v6net_mask);
14034 if (ipif->ipif_flags & IPIF_POINTOPOINT) {
14035 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */
14036 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr;
14037 } else {
14038 V6_MASK_COPY(ipif->ipif_v6lcl_addr,
14039 ipif->ipif_v6net_mask, ipif->ipif_v6subnet);
14042 * NOTE: SunOS 4.X does this even if the broadcast address
14043 * has already been set, thus we do the same here.
14045 if (ipif->ipif_flags & IPIF_BROADCAST) {
14046 ipaddr_t v4addr;
14048 v4addr = ipif->ipif_subnet | ~ipif->ipif_net_mask;
14049 IN6_IPADDR_TO_V4MAPPED(v4addr, &ipif->ipif_v6brd_addr);
14051 } else {
14053 * Interface holds an IPv6-only address. Default
14054 * mask is all-ones.
14056 if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6net_mask))
14057 ipif->ipif_v6net_mask = ipv6_all_ones;
14058 if (ipif->ipif_flags & IPIF_POINTOPOINT) {
14059 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */
14060 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr;
14061 } else {
14062 V6_MASK_COPY(ipif->ipif_v6lcl_addr,
14063 ipif->ipif_v6net_mask, ipif->ipif_v6subnet);
14069 * Return 0 if this address can be used as a local address without causing
14070 * duplicate address problems. Otherwise, return EADDRNOTAVAIL if the address
14071 * is already up on a different ill, and EADDRINUSE if it's up on the same ill.
14072 * Note that the same IPv6 link-local address is allowed as long as the ills
14073 * are not on the same link.
14076 ip_addr_availability_check(ipif_t *new_ipif)
14078 in6_addr_t our_v6addr;
14079 ill_t *ill;
14080 ipif_t *ipif;
14081 ill_walk_context_t ctx;
14082 ip_stack_t *ipst = new_ipif->ipif_ill->ill_ipst;
14084 ASSERT(IAM_WRITER_IPIF(new_ipif));
14085 ASSERT(MUTEX_HELD(&ipst->ips_ip_addr_avail_lock));
14086 ASSERT(RW_READ_HELD(&ipst->ips_ill_g_lock));
14088 new_ipif->ipif_flags &= ~IPIF_UNNUMBERED;
14089 if (IN6_IS_ADDR_UNSPECIFIED(&new_ipif->ipif_v6lcl_addr) ||
14090 IN6_IS_ADDR_V4MAPPED_ANY(&new_ipif->ipif_v6lcl_addr))
14091 return (0);
14093 our_v6addr = new_ipif->ipif_v6lcl_addr;
14095 if (new_ipif->ipif_isv6)
14096 ill = ILL_START_WALK_V6(&ctx, ipst);
14097 else
14098 ill = ILL_START_WALK_V4(&ctx, ipst);
14100 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
14101 for (ipif = ill->ill_ipif; ipif != NULL;
14102 ipif = ipif->ipif_next) {
14103 if ((ipif == new_ipif) ||
14104 !(ipif->ipif_flags & IPIF_UP) ||
14105 (ipif->ipif_flags & IPIF_UNNUMBERED) ||
14106 !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
14107 &our_v6addr))
14108 continue;
14110 if (new_ipif->ipif_flags & IPIF_POINTOPOINT)
14111 new_ipif->ipif_flags |= IPIF_UNNUMBERED;
14112 else if (ipif->ipif_flags & IPIF_POINTOPOINT)
14113 ipif->ipif_flags |= IPIF_UNNUMBERED;
14114 else if ((IN6_IS_ADDR_LINKLOCAL(&our_v6addr) ||
14115 IN6_IS_ADDR_SITELOCAL(&our_v6addr)) &&
14116 !IS_ON_SAME_LAN(ill, new_ipif->ipif_ill))
14117 continue;
14118 else if (new_ipif->ipif_zoneid != ipif->ipif_zoneid &&
14119 ipif->ipif_zoneid != ALL_ZONES && IS_LOOPBACK(ill))
14120 continue;
14121 else if (new_ipif->ipif_ill == ill)
14122 return (EADDRINUSE);
14123 else
14124 return (EADDRNOTAVAIL);
14128 return (0);
14132 * Bring up an ipif: bring up arp/ndp, bring up the DLPI stream, and add
14133 * IREs for the ipif.
14134 * When the routine returns EINPROGRESS then mp has been consumed and
14135 * the ioctl will be acked from ip_rput_dlpi.
14138 ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp)
14140 ill_t *ill = ipif->ipif_ill;
14141 boolean_t isv6 = ipif->ipif_isv6;
14142 int err = 0;
14143 boolean_t success;
14144 uint_t ipif_orig_id;
14145 ip_stack_t *ipst = ill->ill_ipst;
14147 ASSERT(IAM_WRITER_IPIF(ipif));
14149 ip1dbg(("ipif_up(%s:%u)\n", ill->ill_name, ipif->ipif_id));
14150 DTRACE_PROBE3(ipif__downup, char *, "ipif_up",
14151 ill_t *, ill, ipif_t *, ipif);
14153 /* Shouldn't get here if it is already up. */
14154 if (ipif->ipif_flags & IPIF_UP)
14155 return (EALREADY);
14158 * If this is a request to bring up a data address on an interface
14159 * under IPMP, then move the address to its IPMP meta-interface and
14160 * try to bring it up. One complication is that the zeroth ipif for
14161 * an ill is special, in that every ill always has one, and that code
14162 * throughout IP dereferences ill->ill_ipif without holding any locks.
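* For example, bringing up a data address on a hypothetical
* underlying interface net0 in group ipmp0 first migrates the ipif
* to ipmp0 and then brings it up there.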
14164 if (IS_UNDER_IPMP(ill) && ipmp_ipif_is_dataaddr(ipif) &&
14165 (!ipif->ipif_isv6 || !V6_IPIF_LINKLOCAL(ipif))) {
14166 ipif_t *stubipif = NULL, *moveipif = NULL;
14167 ill_t *ipmp_ill = ipmp_illgrp_ipmp_ill(ill->ill_grp);
14170 * The ipif being brought up should be quiesced. If it's not,
14171 * something has gone amiss and we need to bail out. (If it's
14172 * quiesced, we know it will remain so via IPIF_CONDEMNED.)
14174 mutex_enter(&ill->ill_lock);
14175 if (!ipif_is_quiescent(ipif)) {
14176 mutex_exit(&ill->ill_lock);
14177 return (EINVAL);
14179 mutex_exit(&ill->ill_lock);
14182 * If we're going to need to allocate ipifs, do it prior
14183 * to starting the move (and grabbing locks).
14185 if (ipif->ipif_id == 0) {
14186 if ((moveipif = ipif_allocate(ill, 0, IRE_LOCAL, B_TRUE,
14187 B_FALSE, &err)) == NULL) {
14188 return (err);
14190 if ((stubipif = ipif_allocate(ill, 0, IRE_LOCAL, B_TRUE,
14191 B_FALSE, &err)) == NULL) {
14192 mi_free(moveipif);
14193 return (err);
14198 * Grab or transfer the ipif to move. During the move, keep
14199 * ill_g_lock held to prevent any ill walker threads from
14200 * seeing things in an inconsistent state.
14202 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
14203 if (ipif->ipif_id != 0) {
14204 ipif_remove(ipif);
14205 } else {
14206 ipif_transfer(ipif, moveipif, stubipif);
14207 ipif = moveipif;
14211 * Place the ipif on the IPMP ill. If the zeroth ipif on
14212 * the IPMP ill is a stub (0.0.0.0 down address) then we
14213 * replace that one. Otherwise, pick the next available slot.
14215 ipif->ipif_ill = ipmp_ill;
14216 ipif_orig_id = ipif->ipif_id;
14218 if (ipmp_ipif_is_stubaddr(ipmp_ill->ill_ipif)) {
14219 ipif_transfer(ipif, ipmp_ill->ill_ipif, NULL);
14220 ipif = ipmp_ill->ill_ipif;
14221 } else {
14222 ipif->ipif_id = -1;
14223 if ((err = ipif_insert(ipif, B_FALSE)) != 0) {
14225 * No more available ipif_id's -- put it back
14226 * on the original ill and fail the operation.
14227 * Since we're writer on the ill, we can be
14228 * sure our old slot is still available.
14230 ipif->ipif_id = ipif_orig_id;
14231 ipif->ipif_ill = ill;
14232 if (ipif_orig_id == 0) {
14233 ipif_transfer(ipif, ill->ill_ipif,
14234 NULL);
14235 } else {
14236 VERIFY(ipif_insert(ipif, B_FALSE) == 0);
14238 rw_exit(&ipst->ips_ill_g_lock);
14239 return (err);
14242 rw_exit(&ipst->ips_ill_g_lock);
14245 * Tell SCTP that the ipif has moved. Note that even if we
14246 * had to allocate a new ipif, the original sequence id was
14247 * preserved and therefore SCTP won't know.
14249 sctp_move_ipif(ipif, ill, ipmp_ill);
14252 * If the ipif being brought up was on slot zero, then we
14253 * first need to bring up the placeholder we stuck there. In
14254 * ip_rput_dlpi_writer(), arp_bringup_done(), or the recursive
14255 * call to ipif_up() itself, if we successfully bring up the
14256 * placeholder, we'll check ill_move_ipif and bring it up too.
14258 if (ipif_orig_id == 0) {
14259 ASSERT(ill->ill_move_ipif == NULL);
14260 ill->ill_move_ipif = ipif;
14261 if ((err = ipif_up(ill->ill_ipif, q, mp)) == 0)
14262 ASSERT(ill->ill_move_ipif == NULL);
14263 if (err != EINPROGRESS)
14264 ill->ill_move_ipif = NULL;
14265 return (err);
14269 * Bring it up on the IPMP ill.
14271 return (ipif_up(ipif, q, mp));
14274 /* Skip arp/ndp for any loopback interface. */
14275 if (ill->ill_wq != NULL) {
14276 conn_t *connp = CONN_Q(q) ? Q_TO_CONN(q) : NULL;
14277 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq;
14279 if (!ill->ill_dl_up) {
14281 * ill_dl_up is not yet set, i.e. we have yet to
14282 * DL_BIND with the driver and this is the first
14283 * logical interface on the ill to become "up".
14284 * Tell the driver to get going (via DL_BIND_REQ).
14285 * Note that changing "significant" IFF_ flags,
14286 * address/netmask etc. causes a down/up dance, but
14287 * does not cause an unbind (DL_UNBIND) with the driver.
14289 return (ill_dl_up(ill, ipif, mp, q));
14293 * ipif_resolver_up may end up needing to bind/attach
14294 * the ARP stream, which in turn necessitates a
14295 * DLPI message exchange with the driver. ioctls are
14296 * serialized and so we cannot send more than one
14297 * interface up message at a time. If ipif_resolver_up
14298 * does need to wait for the DLPI handshake for the ARP stream,
14299 * we get EINPROGRESS and we will complete in arp_bringup_done.
14302 ASSERT(connp != NULL || !CONN_Q(q));
14303 if (connp != NULL)
14304 mutex_enter(&connp->conn_lock);
14305 mutex_enter(&ill->ill_lock);
14306 success = ipsq_pending_mp_add(connp, ipif, q, mp, 0);
14307 mutex_exit(&ill->ill_lock);
14308 if (connp != NULL)
14309 mutex_exit(&connp->conn_lock);
14310 if (!success)
14311 return (EINTR);
14314 * Crank up IPv6 neighbor discovery. Unlike ARP, this should
14315 * complete when ipif_ndp_up returns.
14317 err = ipif_resolver_up(ipif, Res_act_initial);
14318 if (err == EINPROGRESS) {
14319 /* We will complete it in arp_bringup_done() */
14320 return (err);
14323 if (isv6 && err == 0)
14324 err = ipif_ndp_up(ipif, B_TRUE);
14326 ASSERT(err != EINPROGRESS);
14327 mp = ipsq_pending_mp_get(ipsq, &connp);
14328 ASSERT(mp != NULL);
14329 if (err != 0)
14330 return (err);
14331 } else {
14333 * Interfaces without underlying hardware don't do duplicate
14334 * address detection.
14336 ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
14337 ipif->ipif_addr_ready = 1;
14338 err = ill_add_ires(ill);
14339 /* allocation failure? */
14340 if (err != 0)
14341 return (err);
14344 err = (isv6 ? ipif_up_done_v6(ipif) : ipif_up_done(ipif));
14345 if (err == 0 && ill->ill_move_ipif != NULL) {
14346 ipif = ill->ill_move_ipif;
14347 ill->ill_move_ipif = NULL;
14348 return (ipif_up(ipif, q, mp));
14350 return (err);
14354 * Add any IREs tied to the ill. For now this is just an IRE_MULTICAST.
14355 * The identical set of IREs needs to be removed in ill_delete_ires().
14358 ill_add_ires(ill_t *ill)
14360 ire_t *ire;
14361 in6_addr_t dummy6 = IN6ADDR_INITIALIZER(V6_MCAST, 0, 0, 1);
14362 in_addr_t dummy4 = htonl(INADDR_ALLHOSTS_GROUP);
14364 if (ill->ill_ire_multicast != NULL)
14365 return (0);
14368 * provide some dummy ire_addr for creating the ire.
14370 if (ill->ill_isv6) {
14371 ire = ire_create_v6(&dummy6, 0, 0, IRE_MULTICAST, ill,
14372 ALL_ZONES, RTF_UP, ill->ill_ipst);
14373 } else {
14374 ire = ire_create((uchar_t *)&dummy4, 0, 0, IRE_MULTICAST, ill,
14375 ALL_ZONES, RTF_UP, ill->ill_ipst);
14377 if (ire == NULL)
14378 return (ENOMEM);
14380 ill->ill_ire_multicast = ire;
14381 return (0);
14384 void
14385 ill_delete_ires(ill_t *ill)
14387 if (ill->ill_ire_multicast != NULL) {
14389 * BIND/ATTACH completed; Release the ref for ill_ire_multicast
14390 * which was taken without any th_tracing enabled.
14391 * We also mark it as condemned (note that it was never added)
14392 * so that caching conn's can move off of it.
14394 ire_make_condemned(ill->ill_ire_multicast);
14395 ire_refrele_notr(ill->ill_ire_multicast);
14396 ill->ill_ire_multicast = NULL;
14401 * Perform a bind for the physical device.
14402 * When the routine returns EINPROGRESS then mp has been consumed and
14403 * the ioctl will be acked from ip_rput_dlpi.
14404 * Allocate an unbind message and save it until ipif_down.
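* In the simple case the exchange with the driver is:
*
*	IP				driver
*	DL_BIND_REQ	  ->
*			  <-	DL_BIND_ACK (or DL_ERROR_ACK)
*
* The pre-allocated DL_UNBIND_REQ is stashed in ill_unbind_mp and
* is sent from ill_dl_down() when the interface goes down.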
14406 static int
14407 ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q)
14409 mblk_t *bind_mp = NULL;
14410 mblk_t *unbind_mp = NULL;
14411 conn_t *connp;
14412 boolean_t success;
14413 int err;
14415 DTRACE_PROBE2(ill__downup, char *, "ill_dl_up", ill_t *, ill);
14417 ip1dbg(("ill_dl_up(%s)\n", ill->ill_name));
14418 ASSERT(IAM_WRITER_ILL(ill));
14419 ASSERT(mp != NULL);
14422 * Make sure we have an IRE_MULTICAST in case we immediately
14423 * start receiving packets.
14425 err = ill_add_ires(ill);
14426 if (err != 0)
14427 goto bad;
14429 bind_mp = ip_dlpi_alloc(sizeof (dl_bind_req_t) + sizeof (long),
14430 DL_BIND_REQ);
14431 if (bind_mp == NULL)
14432 goto bad;
14433 ((dl_bind_req_t *)bind_mp->b_rptr)->dl_sap = ill->ill_sap;
14434 ((dl_bind_req_t *)bind_mp->b_rptr)->dl_service_mode = DL_CLDLS;
14437 * ill_unbind_mp would be non-null if the following sequence had
14438 * happened:
14439 * - send DL_BIND_REQ to driver, wait for response
14440 * - multiple ioctls that need to bring the ipif up are encountered,
14441 * but they cannot enter the ipsq due to the outstanding DL_BIND_REQ.
14442 * These ioctls will then be enqueued on the ipsq
14443 * - a DL_ERROR_ACK is returned for the DL_BIND_REQ
14444 * At this point, the pending ioctls in the ipsq will be drained, and
14445 * since ill->ill_dl_up was not set, ill_dl_up would be invoked with
14446 * a non-null ill->ill_unbind_mp
14448 if (ill->ill_unbind_mp == NULL) {
14449 unbind_mp = ip_dlpi_alloc(sizeof (dl_unbind_req_t),
14450 DL_UNBIND_REQ);
14451 if (unbind_mp == NULL)
14452 goto bad;
14455 * Record state needed to complete this operation when the
14456 * DL_BIND_ACK shows up. Also remember the pre-allocated mblks.
14458 connp = CONN_Q(q) ? Q_TO_CONN(q) : NULL;
14459 ASSERT(connp != NULL || !CONN_Q(q));
14460 GRAB_CONN_LOCK(q);
14461 mutex_enter(&ipif->ipif_ill->ill_lock);
14462 success = ipsq_pending_mp_add(connp, ipif, q, mp, 0);
14463 mutex_exit(&ipif->ipif_ill->ill_lock);
14464 RELEASE_CONN_LOCK(q);
14465 if (!success)
14466 goto bad;
14469 * Save the unbind message for ill_dl_down(); it will be consumed when
14470 * the interface goes down.
14472 if (ill->ill_unbind_mp == NULL)
14473 ill->ill_unbind_mp = unbind_mp;
14475 ill_dlpi_send(ill, bind_mp);
14476 /* Send down link-layer capabilities probe if not already done. */
14477 ill_capability_probe(ill);
14480 * Sysid used to rely on the fact that netboots set domainname
14481 * and the like. Now that miniroot boots aren't strictly netboots
14482 * and miniroot network configuration is driven from userland
14483 * these things still need to be set. This situation can be detected
14484 * by comparing the interface being configured here to the one
14485 * dhcifname was set to reference by the boot loader. Once sysid is
14486 * converted to use dhcp_ipc_getinfo() this call can go away.
14488 if ((ipif->ipif_flags & IPIF_DHCPRUNNING) &&
14489 (strcmp(ill->ill_name, dhcifname) == 0) &&
14490 (strlen(srpc_domain) == 0)) {
14491 if (dhcpinit() != 0)
14492 cmn_err(CE_WARN, "no cached dhcp response");
14496 * This operation will complete in ip_rput_dlpi with either
14497 * a DL_BIND_ACK or DL_ERROR_ACK.
14499 return (EINPROGRESS);
14500 bad:
14501 ip1dbg(("ill_dl_up(%s) FAILED\n", ill->ill_name));
14503 freemsg(bind_mp);
14504 freemsg(unbind_mp);
14505 return (ENOMEM);
14508 /* Add room for tcp+ip headers */
14509 uint_t ip_loopback_mtuplus = IP_LOOPBACK_MTU + IP_SIMPLE_HDR_LENGTH + 20;
14512 * DLPI and ARP are up.
14513 * Create all the IREs associated with an interface. Bring up multicast.
14514 * Set the interface flag and finish other initialization
14515 * that potentially had to be deferred to after DL_BIND_ACK.
14518 ipif_up_done(ipif_t *ipif)
14520 ill_t *ill = ipif->ipif_ill;
14521 int err = 0;
14522 boolean_t loopback = B_FALSE;
14523 boolean_t update_src_selection = B_TRUE;
14524 ipif_t *tmp_ipif;
14526 ip1dbg(("ipif_up_done(%s:%u)\n",
14527 ipif->ipif_ill->ill_name, ipif->ipif_id));
14528 DTRACE_PROBE3(ipif__downup, char *, "ipif_up_done",
14529 ill_t *, ill, ipif_t *, ipif);
14531 /* Check if this is a loopback interface */
14532 if (ipif->ipif_ill->ill_wq == NULL)
14533 loopback = B_TRUE;
14535 ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock));
14538 * If all other interfaces for this ill are down or DEPRECATED,
14539 * or otherwise unsuitable for source address selection,
14540 * reset the src generation numbers to make sure source
14541 * address selection gets to take this new ipif into account.
14542 * No need to hold ill_lock while traversing the ipif list since
14543 * we are writer
14545 for (tmp_ipif = ill->ill_ipif; tmp_ipif;
14546 tmp_ipif = tmp_ipif->ipif_next) {
14547 if (((tmp_ipif->ipif_flags &
14548 (IPIF_NOXMIT|IPIF_ANYCAST|IPIF_NOLOCAL|IPIF_DEPRECATED)) ||
14549 !(tmp_ipif->ipif_flags & IPIF_UP)) ||
14550 (tmp_ipif == ipif))
14551 continue;
14552 /* first usable pre-existing interface */
14553 update_src_selection = B_FALSE;
14554 break;
14556 if (update_src_selection)
14557 ip_update_source_selection(ill->ill_ipst);
14559 if (IS_LOOPBACK(ill) || ill->ill_net_type == IRE_IF_NORESOLVER) {
14560 nce_t *loop_nce = NULL;
14561 uint16_t flags = (NCE_F_MYADDR | NCE_F_AUTHORITY | NCE_F_NONUD);
14564 * lo0:1 and subsequent ipifs were marked IRE_LOCAL in
14565 * ipif_lookup_on_name(), but in the case of zones we can have
14566 * several loopback addresses on lo0. So all the interfaces with
14567 * loopback addresses need to be marked IRE_LOOPBACK.
14569 if (V4_PART_OF_V6(ipif->ipif_v6lcl_addr) ==
14570 htonl(INADDR_LOOPBACK))
14571 ipif->ipif_ire_type = IRE_LOOPBACK;
14572 else
14573 ipif->ipif_ire_type = IRE_LOCAL;
14574 if (ill->ill_net_type != IRE_LOOPBACK)
14575 flags |= NCE_F_PUBLISH;
14577 /* add unicast nce for the local addr */
14578 err = nce_lookup_then_add_v4(ill, NULL,
14579 ill->ill_phys_addr_length, &ipif->ipif_lcl_addr, flags,
14580 ND_REACHABLE, &loop_nce);
14581 /* A shared-IP zone sees EEXIST for lo0:N */
14582 if (err == 0 || err == EEXIST) {
14583 ipif->ipif_added_nce = 1;
14584 loop_nce->nce_ipif_cnt++;
14585 nce_refrele(loop_nce);
14586 err = 0;
14587 } else {
14588 ASSERT(loop_nce == NULL);
14589 return (err);
14593 /* Create all the IREs associated with this interface */
14594 err = ipif_add_ires_v4(ipif, loopback);
14595 if (err != 0) {
14597 * see comments about return value from
14598 * ip_addr_availability_check() in ipif_add_ires_v4().
14600 if (err != EADDRINUSE) {
14601 (void) ipif_arp_down(ipif);
14602 } else {
14604 * Make IPMP aware of the deleted ipif so that
14605 * the needed ipmp cleanup (e.g., of ipif_bound_ill)
14606 * can be completed. Note that we do not want to
14607 * destroy the nce that was created on the ipmp_ill
14608 * for the active copy of the duplicate address in
14609 * use.
14611 if (IS_IPMP(ill))
14612 ipmp_illgrp_del_ipif(ill->ill_grp, ipif);
14613 err = EADDRNOTAVAIL;
14615 return (err);
14618 if (ill->ill_ipif_up_count == 1 && !loopback) {
14619 /* Recover any additional IRE entries for this ill */
14620 (void) ill_recover_saved_ire(ill);
14623 if (ill->ill_need_recover_multicast) {
14625 * Need to recover all multicast memberships in the driver.
14626 * This had to be deferred until we had attached. The same
14627 * code exists in ipif_up_done_v6() to recover IPv6
14628 * memberships.
14630 * Note that it would be preferable to unconditionally do the
14631 * ill_recover_multicast() in ill_dl_up(), but we cannot do
14632 * that since ill_join_allmulti() depends on ill_dl_up being
14633 * set, and it is not set until we receive a DL_BIND_ACK after
14634 * having called ill_dl_up().
14636 ill_recover_multicast(ill);
14639 if (ill->ill_ipif_up_count == 1) {
14641 * Since the interface is now up, it may now be active.
14643 if (IS_UNDER_IPMP(ill))
14644 ipmp_ill_refresh_active(ill);
14647 * If this is an IPMP interface, we may now be able to
14648 * establish ARP entries.
14650 if (IS_IPMP(ill))
14651 ipmp_illgrp_refresh_arpent(ill->ill_grp);
14654 /* Join the allhosts multicast address */
14655 ipif_multicast_up(ipif);
14657 if (!loopback && !update_src_selection &&
14658 !(ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED)))
14659 ip_update_source_selection(ill->ill_ipst);
14661 if (!loopback && ipif->ipif_addr_ready) {
14662 /* Broadcast an address mask reply. */
14663 ipif_mask_reply(ipif);
14665 /* Perhaps ilgs should use this ill */
14666 update_conn_ill(NULL, ill->ill_ipst);
14669 * This had to be deferred until we had bound. Tell routing sockets and
14670 * others that this interface is up if it looks like the address has
14671 * been validated. Otherwise, if it isn't ready yet, wait for
14672 * duplicate address detection to do its thing.
14674 if (ipif->ipif_addr_ready)
14675 ipif_up_notify(ipif);
14676 return (0);
14680 * Add the IREs associated with the ipif.
14681 * Those MUST be explicitly removed in ipif_delete_ires_v4.
14683 static int
14684 ipif_add_ires_v4(ipif_t *ipif, boolean_t loopback)
14686 ill_t *ill = ipif->ipif_ill;
14687 ip_stack_t *ipst = ill->ill_ipst;
14688 ire_t *ire_array[20];
14689 ire_t **irep = ire_array;
14690 ire_t **irep1;
14691 ipaddr_t net_mask = 0;
14692 ipaddr_t subnet_mask, route_mask;
14693 int err;
14694 ire_t *ire_local = NULL; /* LOCAL or LOOPBACK */
14695 ire_t *ire_if = NULL;
14696 uchar_t *gw;
14698 if ((ipif->ipif_lcl_addr != INADDR_ANY) &&
14699 !(ipif->ipif_flags & IPIF_NOLOCAL)) {
14700 /* Register the source address for __sin6_src_id */
14701 err = ip_srcid_insert(&ipif->ipif_v6lcl_addr,
14702 ipif->ipif_zoneid, ipst);
14703 if (err != 0) {
14704 ip0dbg(("ipif_add_ires: srcid_insert %d\n", err));
14705 return (err);
14708 if (loopback)
14709 gw = (uchar_t *)&ipif->ipif_lcl_addr;
14710 else
14711 gw = NULL;
14713 /* If the interface address is set, create the local IRE. */
14714 ire_local = ire_create(
14715 (uchar_t *)&ipif->ipif_lcl_addr, /* dest address */
14716 (uchar_t *)&ip_g_all_ones, /* mask */
14717 gw, /* gateway */
14718 ipif->ipif_ire_type, /* LOCAL or LOOPBACK */
14719 ipif->ipif_ill,
14720 ipif->ipif_zoneid,
14721 ((ipif->ipif_flags & IPIF_PRIVATE) ?
14722 RTF_PRIVATE : 0) | RTF_KERNEL,
14723 ipst);
14724 ip1dbg(("ipif_add_ires: 0x%p creating IRE %p type 0x%x"
14725 " for 0x%x\n", (void *)ipif, (void *)ire_local,
14726 ipif->ipif_ire_type,
14727 ntohl(ipif->ipif_lcl_addr)));
14728 if (ire_local == NULL) {
14729 ip1dbg(("ipif_up_done: NULL ire_local\n"));
14730 err = ENOMEM;
14731 goto bad;
14733 } else {
14734 ip1dbg((
14735 "ipif_add_ires: not creating IRE %d for 0x%x: flags 0x%x\n",
14736 ipif->ipif_ire_type,
14737 ntohl(ipif->ipif_lcl_addr),
14738 (uint_t)ipif->ipif_flags));
14740 if ((ipif->ipif_lcl_addr != INADDR_ANY) &&
14741 !(ipif->ipif_flags & IPIF_NOLOCAL)) {
14742 net_mask = ip_net_mask(ipif->ipif_lcl_addr);
14743 } else {
14744 net_mask = htonl(IN_CLASSA_NET); /* fallback */
14747 subnet_mask = ipif->ipif_net_mask;
14750 * If mask was not specified, use natural netmask of
14751 * interface address. Also, store this mask back into the
14752 * ipif struct.
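* For example, a hypothetical address 10.1.2.3 with no mask
* configured would get the natural class A mask 255.0.0.0 here.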
14754 if (subnet_mask == 0) {
14755 subnet_mask = net_mask;
14756 V4MASK_TO_V6(subnet_mask, ipif->ipif_v6net_mask);
14757 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
14758 ipif->ipif_v6subnet);
14761 /* Set up the IRE_IF_RESOLVER or IRE_IF_NORESOLVER, as appropriate. */
14762 if (!loopback && !(ipif->ipif_flags & IPIF_NOXMIT) &&
14763 ipif->ipif_subnet != INADDR_ANY) {
14764 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */
14766 if (ipif->ipif_flags & IPIF_POINTOPOINT) {
14767 route_mask = IP_HOST_MASK;
14768 } else {
14769 route_mask = subnet_mask;
14772 ip1dbg(("ipif_add_ires: ipif 0x%p ill 0x%p "
14773 "creating if IRE ill_net_type 0x%x for 0x%x\n",
14774 (void *)ipif, (void *)ill, ill->ill_net_type,
14775 ntohl(ipif->ipif_subnet)));
14776 ire_if = ire_create(
14777 (uchar_t *)&ipif->ipif_subnet,
14778 (uchar_t *)&route_mask,
14779 (uchar_t *)&ipif->ipif_lcl_addr,
14780 ill->ill_net_type,
14781 ill,
14782 ipif->ipif_zoneid,
14783 ((ipif->ipif_flags & IPIF_PRIVATE) ?
14784 RTF_PRIVATE: 0) | RTF_KERNEL,
14785 ipst);
14786 if (ire_if == NULL) {
14787 ip1dbg(("ipif_up_done: NULL ire_if\n"));
14788 err = ENOMEM;
14789 goto bad;
14794 * Create any necessary broadcast IREs.
14796 if ((ipif->ipif_flags & IPIF_BROADCAST) &&
14797 !(ipif->ipif_flags & IPIF_NOXMIT))
14798 irep = ipif_create_bcast_ires(ipif, irep);
14800 /* If an earlier ire_create failed, get out now */
14801 for (irep1 = irep; irep1 > ire_array; ) {
14802 irep1--;
14803 if (*irep1 == NULL) {
14804 ip1dbg(("ipif_up_done: NULL ire found in ire_array\n"));
14805 err = ENOMEM;
14806 goto bad;
14811 * Need to atomically check for IP address availability under
14812 * ip_addr_avail_lock. ill_g_lock is held as reader to ensure no new
14813 * ills or new ipifs can be added while we are checking availability.
14815 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
14816 mutex_enter(&ipst->ips_ip_addr_avail_lock);
14817 /* Mark it up, and increment counters. */
14818 ipif->ipif_flags |= IPIF_UP;
14819 ill->ill_ipif_up_count++;
14820 err = ip_addr_availability_check(ipif);
14821 mutex_exit(&ipst->ips_ip_addr_avail_lock);
14822 rw_exit(&ipst->ips_ill_g_lock);
14824 if (err != 0) {
14826 * Our address may already be up on the same ill. In this case,
14827 * the ARP entry for our ipif replaced the one for the other
14828 * ipif. So we don't want to delete it (otherwise the other ipif
14829 * would be unable to send packets).
14830 * ip_addr_availability_check() identifies this case for us and
14831 * returns EADDRINUSE; Caller should turn it into EADDRNOTAVAIL
14832 * which is the expected error code.
14834 ill->ill_ipif_up_count--;
14835 ipif->ipif_flags &= ~IPIF_UP;
14836 goto bad;
14840 * Add in all newly created IREs. ire_create_bcast() has
14841 * already checked for duplicates of the IRE_BROADCAST type.
14842 * We add the IRE_INTERFACE before the IRE_LOCAL to ensure
14843 * that lookups find the IRE_LOCAL even if the IRE_INTERFACE is
14844 * a /32 route.
14846 if (ire_if != NULL) {
14847 ire_if = ire_add(ire_if);
14848 if (ire_if == NULL) {
14849 err = ENOMEM;
14850 goto bad2;
14852 #ifdef DEBUG
14853 ire_refhold_notr(ire_if);
14854 ire_refrele(ire_if);
14855 #endif
14857 if (ire_local != NULL) {
14858 ire_local = ire_add(ire_local);
14859 if (ire_local == NULL) {
14860 err = ENOMEM;
14861 goto bad2;
14863 #ifdef DEBUG
14864 ire_refhold_notr(ire_local);
14865 ire_refrele(ire_local);
14866 #endif
14868 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
14869 if (ire_local != NULL)
14870 ipif->ipif_ire_local = ire_local;
14871 if (ire_if != NULL)
14872 ipif->ipif_ire_if = ire_if;
14873 rw_exit(&ipst->ips_ill_g_lock);
14874 ire_local = NULL;
14875 ire_if = NULL;
14878 * We first add all of them, and if that succeeds we refrele the
14879 * bunch. That enables us to delete all of them should any of the
14880 * ire_adds fail.
14882 for (irep1 = irep; irep1 > ire_array; ) {
14883 irep1--;
14884 ASSERT(!MUTEX_HELD(&((*irep1)->ire_ill->ill_lock)));
14885 *irep1 = ire_add(*irep1);
14886 if (*irep1 == NULL) {
14887 err = ENOMEM;
14888 goto bad2;
14892 for (irep1 = irep; irep1 > ire_array; ) {
14893 irep1--;
14894 /* refheld by ire_add. */
14895 if (*irep1 != NULL) {
14896 ire_refrele(*irep1);
14897 *irep1 = NULL;
14901 if (!loopback) {
14903 * If the broadcast address has been set, make sure it makes
14904 * sense based on the interface address.
14905 * Only match on ill since we are sharing broadcast addresses.
14907 if ((ipif->ipif_brd_addr != INADDR_ANY) &&
14908 (ipif->ipif_flags & IPIF_BROADCAST)) {
14909 ire_t *ire;
14911 ire = ire_ftable_lookup_v4(ipif->ipif_brd_addr, 0, 0,
14912 IRE_BROADCAST, ipif->ipif_ill, ALL_ZONES,
14913 (MATCH_IRE_TYPE | MATCH_IRE_ILL), 0, ipst, NULL);
14915 if (ire == NULL) {
14917 * If there isn't a matching broadcast IRE,
14918 * revert to the default for this netmask.
14920 ipif->ipif_v6brd_addr = ipv6_all_zeros;
14921 mutex_enter(&ipif->ipif_ill->ill_lock);
14922 ipif_set_default(ipif);
14923 mutex_exit(&ipif->ipif_ill->ill_lock);
14924 } else {
14925 ire_refrele(ire);
14930 return (0);
14932 bad2:
14933 ill->ill_ipif_up_count--;
14934 ipif->ipif_flags &= ~IPIF_UP;
14936 bad:
14937 ip1dbg(("ipif_add_ires: FAILED \n"));
14938 if (ire_local != NULL)
14939 ire_delete(ire_local);
14940 if (ire_if != NULL)
14941 ire_delete(ire_if);
14943 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
14944 ire_local = ipif->ipif_ire_local;
14945 ipif->ipif_ire_local = NULL;
14946 ire_if = ipif->ipif_ire_if;
14947 ipif->ipif_ire_if = NULL;
14948 rw_exit(&ipst->ips_ill_g_lock);
14949 if (ire_local != NULL) {
14950 ire_delete(ire_local);
14951 ire_refrele_notr(ire_local);
14953 if (ire_if != NULL) {
14954 ire_delete(ire_if);
14955 ire_refrele_notr(ire_if);
14958 while (irep > ire_array) {
14959 irep--;
14960 if (*irep != NULL) {
14961 ire_delete(*irep);
14964 (void) ip_srcid_remove(&ipif->ipif_v6lcl_addr, ipif->ipif_zoneid, ipst);
14966 return (err);
14969 /* Remove all the IREs created by ipif_add_ires_v4 */
14970 void
14971 ipif_delete_ires_v4(ipif_t *ipif)
14973 ill_t *ill = ipif->ipif_ill;
14974 ip_stack_t *ipst = ill->ill_ipst;
14975 ire_t *ire;
14977 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
14978 ire = ipif->ipif_ire_local;
14979 ipif->ipif_ire_local = NULL;
14980 rw_exit(&ipst->ips_ill_g_lock);
14981 if (ire != NULL) {
14983 * Move count to ipif so we don't lose the count due to
14984 * a down/up dance.
14986 atomic_add_32(&ipif->ipif_ib_pkt_count, ire->ire_ib_pkt_count);
14988 ire_delete(ire);
14989 ire_refrele_notr(ire);
14991 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
14992 ire = ipif->ipif_ire_if;
14993 ipif->ipif_ire_if = NULL;
14994 rw_exit(&ipst->ips_ill_g_lock);
14995 if (ire != NULL) {
14996 ire_delete(ire);
14997 ire_refrele_notr(ire);
15001 * Delete the broadcast IREs.
15003 if ((ipif->ipif_flags & IPIF_BROADCAST) &&
15004 !(ipif->ipif_flags & IPIF_NOXMIT))
15005 ipif_delete_bcast_ires(ipif);
15009 * Checks for availability of a usable source address (if there is one) when the
15010 * destination ILL has the ill_usesrc_ifindex pointing to another ILL. Note
15011 * this selection is done regardless of the destination.
15013 boolean_t
15014 ipif_zone_avail(uint_t ifindex, boolean_t isv6, zoneid_t zoneid,
15015 ip_stack_t *ipst)
15017 ipif_t *ipif = NULL;
15018 ill_t *uill;
15020 ASSERT(ifindex != 0);
15022 uill = ill_lookup_on_ifindex(ifindex, isv6, ipst);
15023 if (uill == NULL)
15024 return (B_FALSE);
15026 mutex_enter(&uill->ill_lock);
15027 for (ipif = uill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
15028 if (IPIF_IS_CONDEMNED(ipif))
15029 continue;
15030 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST))
15031 continue;
15032 if (!(ipif->ipif_flags & IPIF_UP))
15033 continue;
15034 if (ipif->ipif_zoneid != zoneid)
15035 continue;
15036 if (isv6 ? IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) :
15037 ipif->ipif_lcl_addr == INADDR_ANY)
15038 continue;
15039 mutex_exit(&uill->ill_lock);
15040 ill_refrele(uill);
15041 return (B_TRUE);
15043 mutex_exit(&uill->ill_lock);
15044 ill_refrele(uill);
15045 return (B_FALSE);
15049 * Find an ipif with a good local address on the ill+zoneid.
15051 ipif_t *
15052 ipif_good_addr(ill_t *ill, zoneid_t zoneid)
15054 ipif_t *ipif;
15056 mutex_enter(&ill->ill_lock);
15057 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
15058 if (IPIF_IS_CONDEMNED(ipif))
15059 continue;
15060 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST))
15061 continue;
15062 if (!(ipif->ipif_flags & IPIF_UP))
15063 continue;
15064 if (ipif->ipif_zoneid != zoneid &&
15065 ipif->ipif_zoneid != ALL_ZONES && zoneid != ALL_ZONES)
15066 continue;
15067 if (ill->ill_isv6 ?
15068 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) :
15069 ipif->ipif_lcl_addr == INADDR_ANY)
15070 continue;
15071 ipif_refhold_locked(ipif);
15072 mutex_exit(&ill->ill_lock);
15073 return (ipif);
15075 mutex_exit(&ill->ill_lock);
15076 return (NULL);
15080 * IP source address type, sorted from worst to best. For a given type,
15081 * always prefer IP addresses on the same subnet. All-zones addresses are
15082 * suboptimal because they pose problems with unlabeled destinations.
15084 typedef enum {
15085 IPIF_NONE,
15086 IPIF_DIFFNET_DEPRECATED, /* deprecated and different subnet */
15087 IPIF_SAMENET_DEPRECATED, /* deprecated and same subnet */
15088 IPIF_DIFFNET_ALLZONES, /* allzones and different subnet */
15089 IPIF_SAMENET_ALLZONES, /* allzones and same subnet */
15090 IPIF_DIFFNET, /* normal and different subnet */
15091 IPIF_SAMENET, /* normal and same subnet */
15092 IPIF_LOCALADDR /* local loopback */
15093 } ipif_type_t;
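/*
* For example, a non-deprecated address on the destination's subnet
* (IPIF_SAMENET) is preferred over an all-zones address on the same
* subnet (IPIF_SAMENET_ALLZONES), which in turn beats any deprecated
* address.
*/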
15096 * Pick the optimal ipif on `ill' for sending to destination `dst' from zone
15097 * `zoneid'. We rate usable ipifs from low -> high as per the ipif_type_t
15098 * enumeration, and return the highest-rated ipif. If there's a tie, we pick
15099 * the first one, unless IPMP is used in which case we round-robin among them;
15100 * see below for more.
15102 * Returns NULL if there is no suitable source address for the ill,
15103 * i.e., when the ill has no valid source address at all.
15105 ipif_t *
15106 ipif_select_source_v4(ill_t *ill, ipaddr_t dst, zoneid_t zoneid,
15107 boolean_t allow_usesrc, boolean_t *notreadyp)
15109 ill_t *usill = NULL;
15110 ill_t *ipmp_ill = NULL;
15111 ipif_t *start_ipif, *next_ipif, *ipif, *best_ipif;
15112 ipif_type_t type, best_type;
15113 ip_stack_t *ipst = ill->ill_ipst;
15114 boolean_t samenet;
15116 if (ill->ill_usesrc_ifindex != 0 && allow_usesrc) {
15117 usill = ill_lookup_on_ifindex(ill->ill_usesrc_ifindex,
15118 B_FALSE, ipst);
15119 if (usill != NULL)
15120 ill = usill; /* Select source from usesrc ILL */
15121 else
15122 return (NULL);
15126 * Test addresses should never be used for source address selection,
15127 * so if we were passed one, switch to the IPMP meta-interface.
15129 if (IS_UNDER_IPMP(ill)) {
15130 if ((ipmp_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL)
15131 ill = ipmp_ill; /* Select source from IPMP ill */
15132 else
15133 return (NULL);
15137 * Hold the ill_g_lock as reader. This makes sure that no ipif/ill
15138 * can be deleted. But an ipif/ill can get CONDEMNED any time.
15139 * After selecting the right ipif, under ill_lock make sure ipif is
15140 * not condemned, and increment refcnt. If ipif is CONDEMNED,
15141 * we retry. Inside the loop we still need to check for CONDEMNED,
15142 * but not under a lock.
15144 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
15145 retry:
15147 * For source address selection, we treat the ipif list as circular
15148 * and continue until we get back to where we started. This allows
15149 * IPMP to vary source address selection (which improves inbound load
15150 * spreading) by caching its last ending point and starting from
15151 * there. NOTE: we don't have to worry about ill_src_ipif changing
15152 * ills since that can't happen on the IPMP ill.
15154 start_ipif = ill->ill_ipif;
15155 if (IS_IPMP(ill) && ill->ill_src_ipif != NULL)
15156 start_ipif = ill->ill_src_ipif;
15158 ipif = start_ipif;
15159 best_ipif = NULL;
15160 best_type = IPIF_NONE;
15161 do {
15162 if ((next_ipif = ipif->ipif_next) == NULL)
15163 next_ipif = ill->ill_ipif;
15165 if (IPIF_IS_CONDEMNED(ipif))
15166 continue;
15167 /* Always skip NOLOCAL and ANYCAST interfaces */
15168 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST))
15169 continue;
15170 /* Always skip NOACCEPT interfaces */
15171 if (ipif->ipif_ill->ill_flags & ILLF_NOACCEPT)
15172 continue;
15173 if (!(ipif->ipif_flags & IPIF_UP))
15174 continue;
15176 if (!ipif->ipif_addr_ready) {
15177 if (notreadyp != NULL)
15178 *notreadyp = B_TRUE;
15179 continue;
15182 if (zoneid != ALL_ZONES &&
15183 ipif->ipif_zoneid != zoneid &&
15184 ipif->ipif_zoneid != ALL_ZONES)
15185 continue;
15188 * Interfaces with 0.0.0.0 address are allowed to be UP, but
15189 * are not valid as source addresses.
15191 if (ipif->ipif_lcl_addr == INADDR_ANY)
15192 continue;
15194 samenet = ((ipif->ipif_net_mask & dst) == ipif->ipif_subnet);
15196 if (ipif->ipif_lcl_addr == dst) {
15197 type = IPIF_LOCALADDR;
15198 } else if (ipif->ipif_flags & IPIF_DEPRECATED) {
15199 type = samenet ? IPIF_SAMENET_DEPRECATED :
15200 IPIF_DIFFNET_DEPRECATED;
15201 } else if (ipif->ipif_zoneid == ALL_ZONES) {
15202 type = samenet ? IPIF_SAMENET_ALLZONES :
15203 IPIF_DIFFNET_ALLZONES;
15204 } else {
15205 type = samenet ? IPIF_SAMENET : IPIF_DIFFNET;
15208 if (type > best_type) {
15209 best_type = type;
15210 best_ipif = ipif;
15211 if (best_type == IPIF_LOCALADDR)
15212 break; /* can't get better */
15214 } while ((ipif = next_ipif) != start_ipif);
15216 if ((ipif = best_ipif) != NULL) {
15217 mutex_enter(&ipif->ipif_ill->ill_lock);
15218 if (IPIF_IS_CONDEMNED(ipif)) {
15219 mutex_exit(&ipif->ipif_ill->ill_lock);
15220 goto retry;
15222 ipif_refhold_locked(ipif);
15225 * For IPMP, update the source ipif rotor to the next ipif,
15226 * provided we can look it up. (We must not use it if it's
15227 * IPIF_CONDEMNED since we may have grabbed ill_g_lock after
15228 * ipif_free() checked ill_src_ipif.)
15230 if (IS_IPMP(ill) && ipif != NULL) {
15231 next_ipif = ipif->ipif_next;
15232 if (next_ipif != NULL && !IPIF_IS_CONDEMNED(next_ipif))
15233 ill->ill_src_ipif = next_ipif;
15234 else
15235 ill->ill_src_ipif = NULL;
15237 mutex_exit(&ipif->ipif_ill->ill_lock);
15240 rw_exit(&ipst->ips_ill_g_lock);
15241 if (usill != NULL)
15242 ill_refrele(usill);
15243 if (ipmp_ill != NULL)
15244 ill_refrele(ipmp_ill);
15246 #ifdef DEBUG
15247 if (ipif == NULL) {
15248 char buf1[INET6_ADDRSTRLEN];
15250 ip1dbg(("ipif_select_source_v4(%s, %s) -> NULL\n",
15251 ill->ill_name,
15252 inet_ntop(AF_INET, &dst, buf1, sizeof (buf1))));
15253 } else {
15254 char buf1[INET6_ADDRSTRLEN];
15255 char buf2[INET6_ADDRSTRLEN];
15257 ip1dbg(("ipif_select_source_v4(%s, %s) -> %s\n",
15258 ipif->ipif_ill->ill_name,
15259 inet_ntop(AF_INET, &dst, buf1, sizeof (buf1)),
15260 inet_ntop(AF_INET, &ipif->ipif_lcl_addr,
15261 buf2, sizeof (buf2))));
15263 #endif /* DEBUG */
15264 return (ipif);
15268 * Pick a source address based on the destination ill and an optional setsrc
15269 * address.
15270 * The result is stored in srcp. If generation is set, then put the source
15271 * generation number there before we look for the source address (to avoid
15272 * missing changes in the set of source addresses).
15273 * If flagsp is set, then use it to pass back ipif_flags.
15275 * If the caller wants to cache the returned source address and detect when
15276 * that might be stale, the caller should pass in a generation argument,
15277 * which the caller can later compare against ips_src_generation
15279 * The precedence order for selecting an IPv4 source address is:
15280 * - RTF_SETSRC on the offlink ire always wins.
15281 * - If usesrc is set, swap the ill to be the usesrc one.
15282 * - If IPMP is used on the ill, select a random address from the most
15283 * preferred ones below:
15284 * 1. If onlink destination, same subnet and not deprecated, not ALL_ZONES
15285 * 2. Not deprecated, not ALL_ZONES
15286 * 3. If onlink destination, same subnet and not deprecated, ALL_ZONES
15287 * 4. Not deprecated, ALL_ZONES
15288 * 5. If onlink destination, same subnet and deprecated
15289 * 6. Deprecated.
15291 * We have lower preference for ALL_ZONES IP addresses,
15292 * as they pose problems with unlabeled destinations.
15294 * Note that when multiple IP addresses match e.g., #1 we pick
15295 * the first one if IPMP is not in use. With IPMP we round-robin.
15298 ip_select_source_v4(ill_t *ill, ipaddr_t setsrc, ipaddr_t dst,
15299 ipaddr_t multicast_ifaddr,
15300 zoneid_t zoneid, ip_stack_t *ipst, ipaddr_t *srcp,
15301 uint32_t *generation, uint64_t *flagsp)
15303 ipif_t *ipif;
15304 boolean_t notready = B_FALSE; /* Set if !ipif_addr_ready found */
15306 if (flagsp != NULL)
15307 *flagsp = 0;
15310 * Need to grab the generation number before we check to
15311 * avoid a race with a change to the set of local addresses.
15312 * No lock needed since the thread which updates the set of local
15313 * addresses uses ipif/ill locks and exits those (hence a store memory
15314 * barrier) before doing the atomic increase of ips_src_generation.
15316 if (generation != NULL) {
15317 *generation = ipst->ips_src_generation;
15320 if (CLASSD(dst) && multicast_ifaddr != INADDR_ANY) {
15321 *srcp = multicast_ifaddr;
15322 return (0);
15325 /* Was RTF_SETSRC set on the first IRE in the recursive lookup? */
15326 if (setsrc != INADDR_ANY) {
15327 *srcp = setsrc;
15328 return (0);
15330 ipif = ipif_select_source_v4(ill, dst, zoneid, B_TRUE, &notready);
15331 if (ipif == NULL) {
15332 if (notready)
15333 return (ENETDOWN);
15334 else
15335 return (EADDRNOTAVAIL);
15337 *srcp = ipif->ipif_lcl_addr;
15338 if (flagsp != NULL)
15339 *flagsp = ipif->ipif_flags;
15340 ipif_refrele(ipif);
15341 return (0);
15344 /* ARGSUSED */
15346 if_unitsel_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
15347 ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
15350 * ill_phyint_reinit merged the v4 and v6 into a single
15351 * ipsq. We might not have been able to complete the
15352 * operation in ipif_set_values, if we could not become
15353 * exclusive. If so restart it here.
15355 return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q));
15359 * Can operate on either a module or a driver queue.
15360 * Returns an error if not a module queue.
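* The unit name is formed from the bottom driver's mi_idname plus
* the selected ppa; e.g. ppa 3 on a hypothetical driver named "hme"
* yields the interface name "hme3".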
15362 /* ARGSUSED */
15364 if_unitsel(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
15365 ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
15367 queue_t *q1 = q;
15368 char *cp;
15369 char interf_name[LIFNAMSIZ];
15370 uint_t ppa = *(uint_t *)mp->b_cont->b_cont->b_rptr;
15372 if (q->q_next == NULL) {
15373 ip1dbg((
15374 "if_unitsel: IF_UNITSEL: no q_next\n"));
15375 return (EINVAL);
15378 if (((ill_t *)(q->q_ptr))->ill_name[0] != '\0')
15379 return (EALREADY);
15381 do {
15382 q1 = q1->q_next;
15383 } while (q1->q_next);
15384 cp = q1->q_qinfo->qi_minfo->mi_idname;
15385 (void) sprintf(interf_name, "%s%d", cp, ppa);
15388 * Here we are not going to delay the ioack until after
15389 * ACKs from DL_ATTACH_REQ/DL_BIND_REQ. So no need to save the
15390 * original ioctl message before sending the requests.
15392 return (ipif_set_values(q, mp, interf_name, &ppa));
15395 /* ARGSUSED */
15397 ip_sioctl_sifname(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
15398 ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
15400 return (ENXIO);
15404 * Create any IRE_BROADCAST entries for `ipif', and store those entries in
15405 * `irep'. Returns a pointer to the next free `irep' entry
15406 * A mirror exists in ipif_delete_bcast_ires().
15408 * The management of any "extra" or seemingly duplicate IRE_BROADCASTs is
15409 * done in ire_add.
15411 static ire_t **
15412 ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep)
15414 ipaddr_t addr;
15415 ipaddr_t netmask = ip_net_mask(ipif->ipif_lcl_addr);
15416 ipaddr_t subnetmask = ipif->ipif_net_mask;
15417 ill_t *ill = ipif->ipif_ill;
15418 zoneid_t zoneid = ipif->ipif_zoneid;
15420 ip1dbg(("ipif_create_bcast_ires: creating broadcast IREs\n"));
15422 ASSERT(ipif->ipif_flags & IPIF_BROADCAST);
15423 ASSERT(!(ipif->ipif_flags & IPIF_NOXMIT));
15425 if (ipif->ipif_lcl_addr == INADDR_ANY ||
15426 (ipif->ipif_flags & IPIF_NOLOCAL))
15427 netmask = htonl(IN_CLASSA_NET); /* fallback */
15429 irep = ire_create_bcast(ill, 0, zoneid, irep);
15430 irep = ire_create_bcast(ill, INADDR_BROADCAST, zoneid, irep);
15433 * For backward compatibility, we create net broadcast IREs based on
15434 * the old "IP address class system", since some old machines only
15435 * respond to these class-derived net broadcasts. However, we must not
15436 * create these net broadcast IREs if the subnetmask is shorter than
15437 * the IP address class based derived netmask. Otherwise, we may
15438 * create a net broadcast address which is the same as an IP address
15439 * on the subnet -- and then TCP will refuse to talk to that address.
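* As a worked example, a hypothetical subnetted class B interface
* at 172.16.5.9/24 has the class-derived netmask 255.255.0.0, which
* is shorter than the subnetmask, so the net broadcasts 172.16.0.0
* and 172.16.255.255 are created here in addition to 0.0.0.0,
* 255.255.255.255, and the subnet broadcasts 172.16.5.0 and
* 172.16.5.255 created elsewhere in this function.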
15441 if (netmask < subnetmask) {
15442 addr = netmask & ipif->ipif_subnet;
15443 irep = ire_create_bcast(ill, addr, zoneid, irep);
15444 irep = ire_create_bcast(ill, ~netmask | addr, zoneid, irep);
15448 * Don't create IRE_BROADCAST IREs for the interface if the subnetmask
15449 * is 0xFFFFFFFF, as an IRE_LOCAL for that interface is already
15450 * created. Creating these broadcast IREs will only create confusion
15451 * as `addr' will be the same as the IP address.
15453 if (subnetmask != 0xFFFFFFFF) {
15454 addr = ipif->ipif_subnet;
15455 irep = ire_create_bcast(ill, addr, zoneid, irep);
15456 irep = ire_create_bcast(ill, ~subnetmask | addr, zoneid, irep);
15459 return (irep);
15463 * Mirror of ipif_create_bcast_ires()
15465 static void
15466 ipif_delete_bcast_ires(ipif_t *ipif)
15468 ipaddr_t addr;
15469 ipaddr_t netmask = ip_net_mask(ipif->ipif_lcl_addr);
15470 ipaddr_t subnetmask = ipif->ipif_net_mask;
15471 ill_t *ill = ipif->ipif_ill;
15472 zoneid_t zoneid = ipif->ipif_zoneid;
15473 ire_t *ire;
15475 ASSERT(ipif->ipif_flags & IPIF_BROADCAST);
15476 ASSERT(!(ipif->ipif_flags & IPIF_NOXMIT));
15478 if (ipif->ipif_lcl_addr == INADDR_ANY ||
15479 (ipif->ipif_flags & IPIF_NOLOCAL))
15480 netmask = htonl(IN_CLASSA_NET); /* fallback */
15482 ire = ire_lookup_bcast(ill, 0, zoneid);
15483 ASSERT(ire != NULL);
15484 ire_delete(ire); ire_refrele(ire);
15485 ire = ire_lookup_bcast(ill, INADDR_BROADCAST, zoneid);
15486 ASSERT(ire != NULL);
15487 ire_delete(ire); ire_refrele(ire);
15490 * For backward compatibility, we create net broadcast IREs based on
15491 * the old "IP address class system", since some old machines only
15492 * respond to these class-derived net broadcasts. However, we must not
15493 * create these net broadcast IREs if the subnetmask is shorter than
15494 * the IP address class based derived netmask. Otherwise, we may
15495 * create a net broadcast address which is the same as an IP address
15496 * on the subnet -- and then TCP will refuse to talk to that address.
15498 if (netmask < subnetmask) {
15499 addr = netmask & ipif->ipif_subnet;
15500 ire = ire_lookup_bcast(ill, addr, zoneid);
15501 ASSERT(ire != NULL);
15502 ire_delete(ire); ire_refrele(ire);
15503 ire = ire_lookup_bcast(ill, ~netmask | addr, zoneid);
15504 ASSERT(ire != NULL);
15505 ire_delete(ire); ire_refrele(ire);
15509 * Don't create IRE_BROADCAST IREs for the interface if the subnetmask
15510 * is 0xFFFFFFFF, as an IRE_LOCAL for that interface is already
15511 * created. Creating these broadcast IREs will only create confusion
15512 * as `addr' will be the same as the IP address.
15514 if (subnetmask != 0xFFFFFFFF) {
15515 addr = ipif->ipif_subnet;
15516 ire = ire_lookup_bcast(ill, addr, zoneid);
15517 ASSERT(ire != NULL);
15518 ire_delete(ire); ire_refrele(ire);
15519 ire = ire_lookup_bcast(ill, ~subnetmask | addr, zoneid);
15520 ASSERT(ire != NULL);
15521 ire_delete(ire); ire_refrele(ire);
15526 * Extract the flags (including IFF_CANTCHANGE flags such as IFF_IPV*)
15527 * from lifr_flags and the name from lifr_name.
15528 * Set IFF_IPV* and ill_isv6 prior to doing the lookup
15529 * since ipif_lookup_on_name uses the _isv6 flags when matching.
15530 * Returns EINPROGRESS when mp has been consumed by queueing it on
15531 * ipx_pending_mp and the ioctl will complete in ip_rput.
15533 * Can only operate on a module queue; returns an error if invoked
15534 * on a driver queue (i.e., when q->q_next is NULL).
15536 /* ARGSUSED */
15538 ip_sioctl_slifname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
15539 ip_ioctl_cmd_t *ipip, void *if_req)
15541 ill_t *ill = q->q_ptr;
15542 phyint_t *phyi;
15543 ip_stack_t *ipst;
15544 struct lifreq *lifr = if_req;
15545 uint64_t new_flags;
15547 ASSERT(ipif != NULL);
15548 ip1dbg(("ip_sioctl_slifname %s\n", lifr->lifr_name));
15550 if (q->q_next == NULL) {
15551 ip1dbg(("if_sioctl_slifname: SIOCSLIFNAME: no q_next\n"));
15552 return (EINVAL);
15556 * If we are not writer on 'q' then this interface exists already
15557 * and previous lookups (ip_extract_lifreq()) found this ipif --
15558 * so return EALREADY.
15560 if (ill != ipif->ipif_ill)
15561 return (EALREADY);
15563 if (ill->ill_name[0] != '\0')
15564 return (EALREADY);
15567 * If there's another ill already with the requested name, ensure
15568 * that it's of the same type. Otherwise, ill_phyint_reinit() will
15569 * fuse together two unrelated ills, which will cause chaos.
15571 ipst = ill->ill_ipst;
15572 phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
15573 lifr->lifr_name, NULL);
15574 if (phyi != NULL) {
15575 ill_t *ill_mate = phyi->phyint_illv4;
15577 if (ill_mate == NULL)
15578 ill_mate = phyi->phyint_illv6;
15579 ASSERT(ill_mate != NULL);
15581 if (ill_mate->ill_media->ip_m_mac_type !=
15582 ill->ill_media->ip_m_mac_type) {
15583 ip1dbg(("if_sioctl_slifname: SIOCSLIFNAME: attempt to "
15584 "use the same ill name on differing media\n"));
15585 return (EINVAL);
15590 * We start off as IFF_IPV4 in ipif_allocate and become
15591 * IFF_IPV4 or IFF_IPV6 here depending on lifr_flags value.
15592 * The only flags that we read from user space are IFF_IPV4,
15593 * IFF_IPV6, and IFF_BROADCAST.
15595 * This ill has not been inserted into the global list.
15596 * So we are still single threaded and don't need any lock.
15598 * Sanity check the flags.
15601 if ((lifr->lifr_flags & IFF_BROADCAST) &&
15602 ((lifr->lifr_flags & IFF_IPV6) ||
15603 (!ill->ill_needs_attach && ill->ill_bcast_addr_length == 0))) {
15604 ip1dbg(("ip_sioctl_slifname: link not broadcast capable "
15605 "or IPv6 i.e., no broadcast \n"));
15606 return (EINVAL);
15609 new_flags =
15610 lifr->lifr_flags & (IFF_IPV6|IFF_IPV4|IFF_BROADCAST);
15612 if ((new_flags ^ (IFF_IPV6|IFF_IPV4)) == 0) {
15613 ip1dbg(("ip_sioctl_slifname: flags must be exactly one of "
15614 "IFF_IPV4 or IFF_IPV6\n"));
15615 return (EINVAL);
15619 * We always start off as IPv4, so we only need to check for IPv6.
15621 if ((new_flags & IFF_IPV6) != 0) {
15622 ill->ill_flags |= ILLF_IPV6;
15623 ill->ill_flags &= ~ILLF_IPV4;
15625 if (lifr->lifr_flags & IFF_NOLINKLOCAL)
15626 ill->ill_flags |= ILLF_NOLINKLOCAL;
15629 if ((new_flags & IFF_BROADCAST) != 0)
15630 ipif->ipif_flags |= IPIF_BROADCAST;
15631 else
15632 ipif->ipif_flags &= ~IPIF_BROADCAST;
15634 /* We started off as V4. */
15635 if (ill->ill_flags & ILLF_IPV6) {
15636 ill->ill_phyint->phyint_illv6 = ill;
15637 ill->ill_phyint->phyint_illv4 = NULL;
15640 return (ipif_set_values(q, mp, lifr->lifr_name, &lifr->lifr_ppa));
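/*
 * For illustration, a sketch of the userland side (names here are
 * assumptions, not part of this file): plumbing an interface sends
 * SIOCSLIFNAME down the newly built stream with the name, ppa and one of
 * IFF_IPV4/IFF_IPV6 filled in, roughly:
 *
 *	struct lifreq lifr;
 *
 *	(void) strlcpy(lifr.lifr_name, "hme0", sizeof (lifr.lifr_name));
 *	lifr.lifr_ppa = 0;
 *	lifr.lifr_flags = IFF_IPV4 | IFF_BROADCAST;
 *	(void) ioctl(ip_fd, SIOCSLIFNAME, &lifr);
 */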
15643 /* ARGSUSED */
15645 ip_sioctl_slifname_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
15646 ip_ioctl_cmd_t *ipip, void *if_req)
15649 * ill_phyint_reinit merged the v4 and v6 into a single
15650 * ipsq. We might not have been able to complete the
15651 * slifname in ipif_set_values if we could not become
15652 * exclusive. If so, restart it here.
15654 return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q));
15658 * Return a pointer to the ipif which matches the index, IP version, and
15659 * zoneid.
15661 ipif_t *
15662 ipif_lookup_on_ifindex(uint_t index, boolean_t isv6, zoneid_t zoneid,
15663 ip_stack_t *ipst)
15665 ill_t *ill;
15666 ipif_t *ipif = NULL;
15668 ill = ill_lookup_on_ifindex(index, isv6, ipst);
15669 if (ill != NULL) {
15670 mutex_enter(&ill->ill_lock);
15671 for (ipif = ill->ill_ipif; ipif != NULL;
15672 ipif = ipif->ipif_next) {
15673 if (!IPIF_IS_CONDEMNED(ipif) && (zoneid == ALL_ZONES ||
15674 zoneid == ipif->ipif_zoneid ||
15675 ipif->ipif_zoneid == ALL_ZONES)) {
15676 ipif_refhold_locked(ipif);
15677 break;
15680 mutex_exit(&ill->ill_lock);
15681 ill_refrele(ill);
15683 return (ipif);
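/*
 * A typical caller pairs the lookup with ipif_refrele(); a minimal sketch:
 *
 *	ipif_t *ipif;
 *
 *	ipif = ipif_lookup_on_ifindex(ifindex, B_FALSE, zoneid, ipst);
 *	if (ipif != NULL) {
 *		... use ipif; the hold keeps it from going away ...
 *		ipif_refrele(ipif);
 *	}
 */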
15687 * Change an existing physical interface's index. If the new index
15688 * is acceptable we update the index and the phyint_list_avl_by_index tree.
15689 * Finally, we update other systems which may have a dependence on the
15690 * index value.
15692 /* ARGSUSED */
15694 ip_sioctl_slifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
15695 ip_ioctl_cmd_t *ipip, void *ifreq)
15697 ill_t *ill;
15698 phyint_t *phyi;
15699 struct ifreq *ifr = (struct ifreq *)ifreq;
15700 struct lifreq *lifr = (struct lifreq *)ifreq;
15701 uint_t old_index, index;
15702 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
15703 avl_index_t where;
15705 if (ipip->ipi_cmd_type == IF_CMD)
15706 index = ifr->ifr_index;
15707 else
15708 index = lifr->lifr_index;
15711 * Only allow on the physical interface (ipif_id 0); index zero is illegal.
15713 ill = ipif->ipif_ill;
15714 phyi = ill->ill_phyint;
15715 if (ipif->ipif_id != 0 || index == 0 || index > IF_INDEX_MAX) {
15716 return (EINVAL);
15719 /* If the index is not changing, no work to do */
15720 if (phyi->phyint_ifindex == index)
15721 return (0);
15724 * Use phyint_exists() to determine if the new interface index
15725 * is already in use. If the index is unused then we need to
15726 * change the phyint's position in the phyint_list_avl_by_index
15727 * tree. If we do not do this, subsequent lookups (using the new
15728 * index value) will not find the phyint.
15730 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
15731 if (phyint_exists(index, ipst)) {
15732 rw_exit(&ipst->ips_ill_g_lock);
15733 return (EEXIST);
15737 * The new index is unused. Set it in the phyint. However, we must
15738 * not forget to trigger the NE_IFINDEX_CHANGE event before the
15739 * ifindex changes; the event must be bound to the old ifindex value.
15741 ill_nic_event_dispatch(ill, 0, NE_IFINDEX_CHANGE,
15742 &index, sizeof (index));
15744 old_index = phyi->phyint_ifindex;
15745 phyi->phyint_ifindex = index;
15747 avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, phyi);
15748 (void) avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
15749 &index, &where);
15750 avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
15751 phyi, where);
15752 rw_exit(&ipst->ips_ill_g_lock);
15754 /* Update SCTP's ILL list */
15755 sctp_ill_reindex(ill, old_index);
15757 /* Send the routing sockets message */
15758 ip_rts_ifmsg(ipif, RTSQ_DEFAULT);
15759 if (ILL_OTHER(ill))
15760 ip_rts_ifmsg(ILL_OTHER(ill)->ill_ipif, RTSQ_DEFAULT);
15762 /* Perhaps ilgs should use this ill */
15763 update_conn_ill(NULL, ill->ill_ipst);
15764 return (0);
15767 /* ARGSUSED */
15769 ip_sioctl_get_lifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
15770 ip_ioctl_cmd_t *ipip, void *ifreq)
15772 struct ifreq *ifr = (struct ifreq *)ifreq;
15773 struct lifreq *lifr = (struct lifreq *)ifreq;
15775 ip1dbg(("ip_sioctl_get_lifindex(%s:%u %p)\n",
15776 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
15777 /* Get the interface index */
15778 if (ipip->ipi_cmd_type == IF_CMD) {
15779 ifr->ifr_index = ipif->ipif_ill->ill_phyint->phyint_ifindex;
15780 } else {
15781 lifr->lifr_index = ipif->ipif_ill->ill_phyint->phyint_ifindex;
15783 return (0);
15786 /* ARGSUSED */
15788 ip_sioctl_get_lifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
15789 ip_ioctl_cmd_t *ipip, void *ifreq)
15791 struct lifreq *lifr = (struct lifreq *)ifreq;
15793 ip1dbg(("ip_sioctl_get_lifzone(%s:%u %p)\n",
15794 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
15795 /* Get the interface zone */
15796 ASSERT(ipip->ipi_cmd_type == LIF_CMD);
15797 lifr->lifr_zoneid = ipif->ipif_zoneid;
15798 return (0);
15802 * Set the zoneid of an interface.
15804 /* ARGSUSED */
15806 ip_sioctl_slifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
15807 ip_ioctl_cmd_t *ipip, void *ifreq)
15809 struct lifreq *lifr = (struct lifreq *)ifreq;
15810 int err = 0;
15811 boolean_t need_up = B_FALSE;
15812 zone_t *zptr;
15813 zone_status_t status;
15814 zoneid_t zoneid;
15816 ASSERT(ipip->ipi_cmd_type == LIF_CMD);
15817 if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES)
15818 return (ENOTSUP);
15820 /* cannot assign instance zero to a non-global zone */
15821 if (ipif->ipif_id == 0 && zoneid != GLOBAL_ZONEID)
15822 return (ENOTSUP);
15825 * Cannot assign to a zone that doesn't exist or is shutting down. In
15826 * the event of a race with the zone shutdown processing, since IP
15827 * serializes this ioctl and SIOCGLIFCONF/SIOCLIFREMOVEIF, we know the
15828 * interface will be cleaned up even if the zone is shut down
15829 * immediately after the status check. If the interface can't be brought
15830 * down right away, and the zone is shut down before the restart
15831 * function is called, we resolve the possible races by rechecking the
15832 * zone status in the restart function.
15834 if ((zptr = zone_find_by_id(zoneid)) == NULL)
15835 return (EINVAL);
15836 status = zone_status_get(zptr);
15837 zone_rele(zptr);
15839 if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING)
15840 return (EINVAL);
15842 if (ipif->ipif_flags & IPIF_UP) {
15844 * If the interface is already marked up,
15845 * we call ipif_down which will take care
15846 * of ditching any IREs that have been set
15847 * up based on the old interface address.
15849 err = ipif_logical_down(ipif, q, mp);
15850 if (err == EINPROGRESS)
15851 return (err);
15852 (void) ipif_down_tail(ipif);
15853 need_up = B_TRUE;
15856 err = ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp, need_up);
15857 return (err);
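/*
 * For illustration: this is the ioctl behind "ifconfig hme0:1 zone myzone"
 * (the interface name is hypothetical); userland has already mapped the
 * zone name to lifr_zoneid, e.g. via getzoneidbyname("myzone").
 */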
15860 static int
15861 ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid,
15862 queue_t *q, mblk_t *mp, boolean_t need_up)
15864 int err = 0;
15865 ip_stack_t *ipst;
15867 ip1dbg(("ip_sioctl_zoneid_tail(%s:%u %p)\n",
15868 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
15870 if (CONN_Q(q))
15871 ipst = CONNQ_TO_IPST(q);
15872 else
15873 ipst = ILLQ_TO_IPST(q);
15876 * For exclusive stacks we don't allow a zoneid other than the
15877 * global zone.
15879 if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID &&
15880 zoneid != GLOBAL_ZONEID)
15881 return (EINVAL);
15883 /* Set the new zone id. */
15884 ipif->ipif_zoneid = zoneid;
15886 /* Update sctp list */
15887 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE);
15889 /* The default multicast interface might have changed */
15890 ire_increment_multicast_generation(ipst, ipif->ipif_ill->ill_isv6);
15892 if (need_up) {
15894 * Now bring the interface back up. If this
15895 * is the only IPIF for the ILL, ipif_up
15896 * will have to re-bind to the device, so
15897 * we may get back EINPROGRESS, in which
15898 * case, this IOCTL will get completed in
15899 * ip_rput_dlpi when we see the DL_BIND_ACK.
15901 err = ipif_up(ipif, q, mp);
15903 return (err);
15906 /* ARGSUSED */
15908 ip_sioctl_slifzone_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
15909 ip_ioctl_cmd_t *ipip, void *if_req)
15911 struct lifreq *lifr = (struct lifreq *)if_req;
15912 zoneid_t zoneid;
15913 zone_t *zptr;
15914 zone_status_t status;
15916 ASSERT(ipip->ipi_cmd_type == LIF_CMD);
15917 if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES)
15918 zoneid = GLOBAL_ZONEID;
15920 ip1dbg(("ip_sioctl_slifzone_restart(%s:%u %p)\n",
15921 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
15924 * We recheck the zone status to resolve the following race condition:
15925 * 1) process sends SIOCSLIFZONE to put hme0:1 in zone "myzone";
15926 * 2) hme0:1 is up and can't be brought down right away;
15927 * ip_sioctl_slifzone() returns EINPROGRESS and the request is queued;
15928 * 3) zone "myzone" is halted; the zone status switches to
15929 * 'shutting_down' and the zones framework sends SIOCGLIFCONF to list
15930 * the interfaces to remove - hme0:1 is not returned because it's not
15931 * yet in "myzone", so it won't be removed;
15932 * 4) the restart function for SIOCSLIFZONE is called; without the
15933 * status check here, we would have hme0:1 in "myzone" after it's been
15934 * destroyed.
15935 * Note that if the status check fails, we need to bring the interface
15936 * back to its state prior to ip_sioctl_slifzone(), hence the call to
15937 * ipif_up_done[_v6]().
15939 status = ZONE_IS_UNINITIALIZED;
15940 if ((zptr = zone_find_by_id(zoneid)) != NULL) {
15941 status = zone_status_get(zptr);
15942 zone_rele(zptr);
15944 if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING) {
15945 if (ipif->ipif_isv6) {
15946 (void) ipif_up_done_v6(ipif);
15947 } else {
15948 (void) ipif_up_done(ipif);
15950 return (EINVAL);
15953 (void) ipif_down_tail(ipif);
15955 return (ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp,
15956 B_TRUE));
15960 * Return the number of addresses on `ill' with one or more of the values
15961 * in `set' set and all of the values in `clear' clear.
15963 static uint_t
15964 ill_flagaddr_cnt(const ill_t *ill, uint64_t set, uint64_t clear)
15966 ipif_t *ipif;
15967 uint_t cnt = 0;
15969 ASSERT(IAM_WRITER_ILL(ill));
15971 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
15972 if ((ipif->ipif_flags & set) && !(ipif->ipif_flags & clear))
15973 cnt++;
15975 return (cnt);
15979 * Return the number of migratable addresses on `ill' that are under
15980 * application control.
15982 uint_t
15983 ill_appaddr_cnt(const ill_t *ill)
15985 return (ill_flagaddr_cnt(ill, IPIF_DHCPRUNNING | IPIF_ADDRCONF,
15986 IPIF_NOFAILOVER));
15990 * Return the number of point-to-point addresses on `ill'.
15992 uint_t
15993 ill_ptpaddr_cnt(const ill_t *ill)
15995 return (ill_flagaddr_cnt(ill, IPIF_POINTOPOINT, 0));
15998 /* ARGSUSED */
16000 ip_sioctl_get_lifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
16001 ip_ioctl_cmd_t *ipip, void *ifreq)
16003 struct lifreq *lifr = ifreq;
16005 ASSERT(q->q_next == NULL);
16006 ASSERT(CONN_Q(q));
16008 ip1dbg(("ip_sioctl_get_lifusesrc(%s:%u %p)\n",
16009 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
16010 lifr->lifr_index = ipif->ipif_ill->ill_usesrc_ifindex;
16011 ip1dbg(("ip_sioctl_get_lifusesrc:lifr_index = %d\n", lifr->lifr_index));
16013 return (0);
16016 /* Find the previous ILL in this usesrc group */
16017 static ill_t *
16018 ill_prev_usesrc(ill_t *uill)
16020 ill_t *ill;
16022 for (ill = uill->ill_usesrc_grp_next;
16023 ASSERT(ill), ill->ill_usesrc_grp_next != uill;
16024 ill = ill->ill_usesrc_grp_next)
16025 /* do nothing */;
16026 return (ill);
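/*
 * The usesrc group is a circular singly-linked list: only the head (the
 * ill whose source addresses are borrowed) has ill_usesrc_ifindex == 0,
 * while each client stores the head's ifindex. For example, with two
 * clients of "uill":
 *
 *	uill -> cli1 -> cli2 -> uill
 *
 * so ill_prev_usesrc() finds the predecessor by walking all the way
 * around the ring.
 */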
16030 * Release all members of the usesrc group. This routine is called
16031 * from ill_delete when the interface being unplumbed is the
16032 * group head.
16034 * This silently clears the usesrc that ifconfig set up.
16035 * An alternative would be to keep that ifindex, and drop packets on the floor
16036 * since no source address can be selected.
16037 * Even if we keep the current semantics, we don't need a lock and a linked
16038 * list: we could walk all the ills, checking whether each has an
16039 * ill_usesrc_ifindex matching the one being removed. The issue is how we
16040 * return the usesrc users (SIOCGLIFSRCOF): we want to find the ills whose
16041 * ill_usesrc_ifindex matches a target ill. We could do that with an ill
16042 * walk, but the walker would need to insert in the ioctl response.
16044 static void
16045 ill_disband_usesrc_group(ill_t *uill)
16047 ill_t *next_ill, *tmp_ill;
16048 ip_stack_t *ipst = uill->ill_ipst;
16050 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_usesrc_lock));
16051 next_ill = uill->ill_usesrc_grp_next;
16053 do {
16054 ASSERT(next_ill != NULL);
16055 tmp_ill = next_ill->ill_usesrc_grp_next;
16056 ASSERT(tmp_ill != NULL);
16057 next_ill->ill_usesrc_grp_next = NULL;
16058 next_ill->ill_usesrc_ifindex = 0;
16059 next_ill = tmp_ill;
16060 } while (next_ill->ill_usesrc_ifindex != 0);
16061 uill->ill_usesrc_grp_next = NULL;
16065 * Remove the client usesrc ILL from the list and relink to a new list
16068 ill_relink_usesrc_ills(ill_t *ucill, ill_t *uill, uint_t ifindex)
16070 ill_t *ill, *tmp_ill;
16071 ip_stack_t *ipst = ucill->ill_ipst;
16073 ASSERT((ucill != NULL) && (ucill->ill_usesrc_grp_next != NULL) &&
16074 (uill != NULL) && RW_WRITE_HELD(&ipst->ips_ill_g_usesrc_lock));
16077 * Verify that the usesrc client ILL passed in is actually in use as a
16078 * usesrc client (i.e., its ill_usesrc_ifindex is set) and that the
16079 * usesrc ILL is not itself in use as a usesrc client; otherwise this
16080 * relink request is invalid.
16082 if ((ucill->ill_usesrc_ifindex == 0) ||
16083 (uill->ill_usesrc_ifindex != 0)) {
16084 return (-1);
16087 ill = ill_prev_usesrc(ucill);
16088 ASSERT(ill->ill_usesrc_grp_next != NULL);
16090 /* Remove from the current list */
16091 if (ill->ill_usesrc_grp_next->ill_usesrc_grp_next == ill) {
16092 /* Only two elements in the list */
16093 ASSERT(ill->ill_usesrc_ifindex == 0);
16094 ill->ill_usesrc_grp_next = NULL;
16095 } else {
16096 ill->ill_usesrc_grp_next = ucill->ill_usesrc_grp_next;
16099 if (ifindex == 0) {
16100 ucill->ill_usesrc_ifindex = 0;
16101 ucill->ill_usesrc_grp_next = NULL;
16102 return (0);
16105 ucill->ill_usesrc_ifindex = ifindex;
16106 tmp_ill = uill->ill_usesrc_grp_next;
16107 uill->ill_usesrc_grp_next = ucill;
16108 ucill->ill_usesrc_grp_next =
16109 (tmp_ill != NULL) ? tmp_ill : uill;
16110 return (0);
16114 * Set the ill_usesrc_ifindex and ill_usesrc_grp_next fields. See
16115 * synchronization notes in ip.c for locking details.
16117 /* ARGSUSED */
16119 ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
16120 ip_ioctl_cmd_t *ipip, void *ifreq)
16122 struct lifreq *lifr = (struct lifreq *)ifreq;
16123 boolean_t isv6 = B_FALSE, reset_flg = B_FALSE;
16124 ill_t *usesrc_ill, *usesrc_cli_ill = ipif->ipif_ill;
16125 int err = 0, ret;
16126 uint_t ifindex;
16127 ipsq_t *ipsq = NULL;
16128 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
16130 ASSERT(IAM_WRITER_IPIF(ipif));
16131 ASSERT(q->q_next == NULL);
16132 ASSERT(CONN_Q(q));
16134 isv6 = (Q_TO_CONN(q))->conn_family == AF_INET6;
16136 ifindex = lifr->lifr_index;
16137 if (ifindex == 0) {
16138 if (usesrc_cli_ill->ill_usesrc_grp_next == NULL) {
16139 /* non usesrc group interface, nothing to reset */
16140 return (0);
16142 ifindex = usesrc_cli_ill->ill_usesrc_ifindex;
16143 /* valid reset request */
16144 reset_flg = B_TRUE;
16147 usesrc_ill = ill_lookup_on_ifindex(ifindex, isv6, ipst);
16148 if (usesrc_ill == NULL)
16149 return (ENXIO);
16150 if (usesrc_ill == ipif->ipif_ill) {
16151 ill_refrele(usesrc_ill);
16152 return (EINVAL);
16155 ipsq = ipsq_try_enter(NULL, usesrc_ill, q, mp, ip_process_ioctl,
16156 NEW_OP, B_TRUE);
16157 if (ipsq == NULL) {
16158 err = EINPROGRESS;
16159 /* Operation enqueued on the ipsq of the usesrc ILL */
16160 goto done;
16163 /* USESRC isn't currently supported with IPMP */
16164 if (IS_IPMP(usesrc_ill) || IS_UNDER_IPMP(usesrc_ill)) {
16165 err = ENOTSUP;
16166 goto done;
16170 * USESRC isn't compatible with the STANDBY flag. (STANDBY is only
16171 * used by IPMP underlying interfaces, but someone might think it's
16172 * more general and try to use it independently with VNI.)
16174 if (usesrc_ill->ill_phyint->phyint_flags & PHYI_STANDBY) {
16175 err = ENOTSUP;
16176 goto done;
16180 * If the client is already in use as a usesrc_ill or a usesrc_ill is
16181 * already a client then return EINVAL
16183 if (IS_USESRC_ILL(usesrc_cli_ill) || IS_USESRC_CLI_ILL(usesrc_ill)) {
16184 err = EINVAL;
16185 goto done;
16189 * If the ill_usesrc_ifindex field is already set to what it needs to
16190 * be then this is a duplicate operation.
16192 if (!reset_flg && usesrc_cli_ill->ill_usesrc_ifindex == ifindex) {
16193 err = 0;
16194 goto done;
16197 ip1dbg(("ip_sioctl_slifusesrc: usesrc_cli_ill %s, usesrc_ill %s,"
16198 " v6 = %d", usesrc_cli_ill->ill_name, usesrc_ill->ill_name,
16199 usesrc_ill->ill_isv6));
16202 * ill_g_usesrc_lock global lock protects the ill_usesrc_grp_next
16203 * and the ill_usesrc_ifindex fields
16205 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER);
16207 if (reset_flg) {
16208 ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill, 0);
16209 if (ret != 0) {
16210 err = EINVAL;
16212 rw_exit(&ipst->ips_ill_g_usesrc_lock);
16213 goto done;
16217 * Four possibilities to consider:
16218 * 1. Both usesrc_ill and usesrc_cli_ill are not part of any usesrc grp
16219 * 2. usesrc_ill is part of a group but usesrc_cli_ill isn't
16220 * 3. usesrc_cli_ill is part of a group but usesrc_ill isn't
16221 * 4. Both are part of their respective usesrc groups
16223 if ((usesrc_ill->ill_usesrc_grp_next == NULL) &&
16224 (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) {
16225 ASSERT(usesrc_ill->ill_usesrc_ifindex == 0);
16226 usesrc_cli_ill->ill_usesrc_ifindex = ifindex;
16227 usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill;
16228 usesrc_cli_ill->ill_usesrc_grp_next = usesrc_ill;
16229 } else if ((usesrc_ill->ill_usesrc_grp_next != NULL) &&
16230 (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) {
16231 usesrc_cli_ill->ill_usesrc_ifindex = ifindex;
16232 /* Insert at head of list */
16233 usesrc_cli_ill->ill_usesrc_grp_next =
16234 usesrc_ill->ill_usesrc_grp_next;
16235 usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill;
16236 } else {
16237 ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill,
16238 ifindex);
16239 if (ret != 0)
16240 err = EINVAL;
16242 rw_exit(&ipst->ips_ill_g_usesrc_lock);
16244 done:
16245 if (ipsq != NULL)
16246 ipsq_exit(ipsq);
16247 /* The refrele on the lifr_name ipif is done by ip_process_ioctl */
16248 ill_refrele(usesrc_ill);
16250 /* Let conn_ixa caching know that source address selection changed */
16251 ip_update_source_selection(ipst);
16253 return (err);
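/*
 * For illustration: "ifconfig vni0 usesrc hme0" (interface names are
 * hypothetical) resolves hme0 to its ifindex and issues this ioctl with
 * lifr_name "vni0" and lifr_index set to that ifindex; "usesrc none"
 * issues it with lifr_index 0 to take the reset path above.
 */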
16256 /* ARGSUSED */
16258 ip_sioctl_get_dadstate(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
16259 ip_ioctl_cmd_t *ipip, void *if_req)
16261 struct lifreq *lifr = (struct lifreq *)if_req;
16262 ill_t *ill = ipif->ipif_ill;
16265 * Need a lock since IFF_UP can be set even when there are
16266 * references to the ipif.
16268 mutex_enter(&ill->ill_lock);
16269 if ((ipif->ipif_flags & IPIF_UP) && ipif->ipif_addr_ready == 0)
16270 lifr->lifr_dadstate = DAD_IN_PROGRESS;
16271 else
16272 lifr->lifr_dadstate = DAD_DONE;
16273 mutex_exit(&ill->ill_lock);
16274 return (0);
16278 * Comparison function used by avl: orders phyints by interface index.
16280 static int
16281 ill_phyint_compare_index(const void *index_ptr, const void *phyip)
16284 uint_t index;
16286 ASSERT(phyip != NULL && index_ptr != NULL);
16288 index = *((uint_t *)index_ptr);
16290 * let the phyint with the lowest index be on top.
16292 if (((phyint_t *)phyip)->phyint_ifindex < index)
16293 return (1);
16294 if (((phyint_t *)phyip)->phyint_ifindex > index)
16295 return (-1);
16296 return (0);
16300 * Comparison function used by avl: orders phyints by interface name.
16302 static int
16303 ill_phyint_compare_name(const void *name_ptr, const void *phyip)
16305 ill_t *ill;
16306 int res = 0;
16308 ASSERT(phyip != NULL && name_ptr != NULL);
16310 if (((phyint_t *)phyip)->phyint_illv4)
16311 ill = ((phyint_t *)phyip)->phyint_illv4;
16312 else
16313 ill = ((phyint_t *)phyip)->phyint_illv6;
16314 ASSERT(ill != NULL);
16316 res = strcmp(ill->ill_name, (char *)name_ptr);
16317 if (res > 0)
16318 return (1);
16319 else if (res < 0)
16320 return (-1);
16321 return (0);
16325 * This function is called on the unplumb path via ill_glist_delete() when
16326 * there are no ills left on the phyint and thus the phyint can be freed.
16328 static void
16329 phyint_free(phyint_t *phyi)
16331 ip_stack_t *ipst = PHYINT_TO_IPST(phyi);
16333 ASSERT(phyi->phyint_illv4 == NULL && phyi->phyint_illv6 == NULL);
16336 * If this phyint was an IPMP meta-interface, blow away the group.
16337 * This is safe to do because all of the illgrps have already been
16338 * removed by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find us.
16339 * If we're cleaning up as a result of failed initialization,
16340 * phyint_grp may be NULL.
16342 if ((phyi->phyint_flags & PHYI_IPMP) && (phyi->phyint_grp != NULL)) {
16343 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
16344 ipmp_grp_destroy(phyi->phyint_grp);
16345 phyi->phyint_grp = NULL;
16346 rw_exit(&ipst->ips_ipmp_lock);
16350 * If this interface was under IPMP, take it out of the group.
16352 if (phyi->phyint_grp != NULL)
16353 ipmp_phyint_leave_grp(phyi);
16356 * Delete the phyint and disassociate its ipsq. The ipsq itself
16357 * will be freed in ipsq_exit().
16359 phyi->phyint_ipsq->ipsq_phyint = NULL;
16360 phyi->phyint_name[0] = '\0';
16362 mi_free(phyi);
16366 * Attach the ill to the phyint structure which can be shared by both
16367 * IPv4 and IPv6 ills. ill_init allocates a phyint to just hold flags. This
16368 * function is called from ipif_set_values and ill_lookup_on_name (for
16369 * loopback) where we know the name of the ill. We look up the ill and, if
16370 * one is already present with that name, use its phyint. Otherwise we
16371 * reuse the one allocated by ill_init.
16373 static void
16374 ill_phyint_reinit(ill_t *ill)
16376 boolean_t isv6 = ill->ill_isv6;
16377 phyint_t *phyi_old;
16378 phyint_t *phyi;
16379 avl_index_t where = 0;
16380 ill_t *ill_other = NULL;
16381 ip_stack_t *ipst = ill->ill_ipst;
16383 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
16385 phyi_old = ill->ill_phyint;
16386 ASSERT(isv6 || (phyi_old->phyint_illv4 == ill &&
16387 phyi_old->phyint_illv6 == NULL));
16388 ASSERT(!isv6 || (phyi_old->phyint_illv6 == ill &&
16389 phyi_old->phyint_illv4 == NULL));
16390 ASSERT(phyi_old->phyint_ifindex == 0);
16393 * Now that our ill has a name, set it in the phyint.
16395 (void) strlcpy(ill->ill_phyint->phyint_name, ill->ill_name, LIFNAMSIZ);
16397 phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
16398 ill->ill_name, &where);
16401 * 1. We grabbed the ill_g_lock before inserting this ill into
16402 * the global list of ills. So no other thread could have located
16403 * this ill and hence the ipsq of this ill is guaranteed to be empty.
16404 * 2. Now locate the other protocol instance of this ill.
16405 * 3. Now grab both ill locks in the right order, and the phyint lock of
16406 * the new ipsq. Holding ill locks + ill_g_lock ensures that the ipsq
16407 * of neither ill can change.
16408 * 4. Merge the phyint and thus the ipsq as well of this ill onto the
16409 * other ill.
16410 * 5. Release all locks.
16414 * Look for IPv4 if we are initializing IPv6 or look for IPv6 if
16415 * we are initializing IPv4.
16417 if (phyi != NULL) {
16418 ill_other = (isv6) ? phyi->phyint_illv4 : phyi->phyint_illv6;
16419 ASSERT(ill_other->ill_phyint != NULL);
16420 ASSERT((isv6 && !ill_other->ill_isv6) ||
16421 (!isv6 && ill_other->ill_isv6));
16422 GRAB_ILL_LOCKS(ill, ill_other);
16424 * We are potentially throwing away phyint_flags which
16425 * could be different from the one that we obtain from
16426 * ill_other->ill_phyint. But it is okay as we are assuming
16427 * that the state maintained within IP is correct.
16429 mutex_enter(&phyi->phyint_lock);
16430 if (isv6) {
16431 ASSERT(phyi->phyint_illv6 == NULL);
16432 phyi->phyint_illv6 = ill;
16433 } else {
16434 ASSERT(phyi->phyint_illv4 == NULL);
16435 phyi->phyint_illv4 = ill;
16439 * Delete the old phyint and make its ipsq eligible
16440 * to be freed in ipsq_exit().
16442 phyi_old->phyint_illv4 = NULL;
16443 phyi_old->phyint_illv6 = NULL;
16444 phyi_old->phyint_ipsq->ipsq_phyint = NULL;
16445 phyi_old->phyint_name[0] = '\0';
16446 mi_free(phyi_old);
16447 } else {
16448 mutex_enter(&ill->ill_lock);
16450 * We don't need to acquire any lock, since
16451 * the ill is not yet visible globally and we
16452 * have not yet released the ill_g_lock.
16454 phyi = phyi_old;
16455 mutex_enter(&phyi->phyint_lock);
16456 /* XXX We need a recovery strategy here. */
16457 if (!phyint_assign_ifindex(phyi, ipst))
16458 cmn_err(CE_PANIC, "phyint_assign_ifindex() failed");
16460 avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
16461 (void *)phyi, where);
16463 (void) avl_find(&ipst->ips_phyint_g_list->
16464 phyint_list_avl_by_index,
16465 &phyi->phyint_ifindex, &where);
16466 avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
16467 (void *)phyi, where);
16471 * Reassigning ill_phyint automatically reassigns the ipsq also.
16472 * The pending mp is not affected because that is kept on a per-ill basis.
16474 ill->ill_phyint = phyi;
16477 * Now that the phyint's ifindex has been assigned, complete the
16478 * remaining MIB and multicast initialization that depends on it.
16480 ill->ill_ip_mib->ipIfStatsIfIndex = ill->ill_phyint->phyint_ifindex;
16481 if (ill->ill_isv6) {
16482 ill->ill_icmp6_mib->ipv6IfIcmpIfIndex =
16483 ill->ill_phyint->phyint_ifindex;
16484 ill->ill_mcast_type = ipst->ips_mld_max_version;
16485 } else {
16486 ill->ill_mcast_type = ipst->ips_igmp_max_version;
16490 * Generate an event within the hooks framework to indicate that
16491 * a new interface has just been added to IP. For this event to
16492 * be generated, the network interface must, at least, have an
16493 * ifindex assigned to it. (We don't generate the event for
16494 * loopback since ill_lookup_on_name() has its own NE_PLUMB event.)
16496 * This needs to be run inside the ill_g_lock perimeter to ensure
16497 * that the ordering of delivered events to listeners matches the
16498 * order of them in the kernel.
16500 if (!IS_LOOPBACK(ill)) {
16501 ill_nic_event_dispatch(ill, 0, NE_PLUMB, ill->ill_name,
16502 ill->ill_name_length);
16504 RELEASE_ILL_LOCKS(ill, ill_other);
16505 mutex_exit(&phyi->phyint_lock);
16509 * Notify any downstream modules of the name of this interface.
16510 * An M_IOCTL is used even though we don't expect a successful reply.
16511 * Any reply message from the driver (presumably an M_IOCNAK) will
16512 * eventually get discarded somewhere upstream. The message format is
16513 * simply an SIOCSLIFNAME ioctl just as might be sent from ifconfig
16514 * to IP.
16516 static void
16517 ip_ifname_notify(ill_t *ill, queue_t *q)
16519 mblk_t *mp1, *mp2;
16520 struct iocblk *iocp;
16521 struct lifreq *lifr;
16523 mp1 = mkiocb(SIOCSLIFNAME);
16524 if (mp1 == NULL)
16525 return;
16526 mp2 = allocb(sizeof (struct lifreq), BPRI_HI);
16527 if (mp2 == NULL) {
16528 freeb(mp1);
16529 return;
16532 mp1->b_cont = mp2;
16533 iocp = (struct iocblk *)mp1->b_rptr;
16534 iocp->ioc_count = sizeof (struct lifreq);
16536 lifr = (struct lifreq *)mp2->b_rptr;
16537 mp2->b_wptr += sizeof (struct lifreq);
16538 bzero(lifr, sizeof (struct lifreq));
16540 (void) strncpy(lifr->lifr_name, ill->ill_name, LIFNAMSIZ);
16541 lifr->lifr_ppa = ill->ill_ppa;
16542 lifr->lifr_flags = (ill->ill_flags & (ILLF_IPV4|ILLF_IPV6));
16544 DTRACE_PROBE3(ill__dlpi, char *, "ip_ifname_notify",
16545 char *, "SIOCSLIFNAME", ill_t *, ill);
16546 putnext(q, mp1);
16549 static int
16550 ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q)
16552 int err;
16553 ip_stack_t *ipst = ill->ill_ipst;
16554 phyint_t *phyi = ill->ill_phyint;
16557 * Now that ill_name is set, the configuration for the IPMP
16558 * meta-interface can be performed.
16560 if (IS_IPMP(ill)) {
16561 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
16563 * If phyi->phyint_grp is NULL, then this is the first IPMP
16564 * meta-interface and we need to create the IPMP group.
16566 if (phyi->phyint_grp == NULL) {
16568 * If someone has renamed another IPMP group to have
16569 * the same name as our interface, bail.
16571 if (ipmp_grp_lookup(ill->ill_name, ipst) != NULL) {
16572 rw_exit(&ipst->ips_ipmp_lock);
16573 return (EEXIST);
16575 phyi->phyint_grp = ipmp_grp_create(ill->ill_name, phyi);
16576 if (phyi->phyint_grp == NULL) {
16577 rw_exit(&ipst->ips_ipmp_lock);
16578 return (ENOMEM);
16581 rw_exit(&ipst->ips_ipmp_lock);
16584 /* Tell downstream modules where they are. */
16585 ip_ifname_notify(ill, q);
16588 * ill_dl_phys returns EINPROGRESS in the usual case.
16589 * Error cases are ENOMEM ...
16591 err = ill_dl_phys(ill, ipif, mp, q);
16593 if (ill->ill_isv6) {
16594 mutex_enter(&ipst->ips_mld_slowtimeout_lock);
16595 if (ipst->ips_mld_slowtimeout_id == 0) {
16596 ipst->ips_mld_slowtimeout_id = timeout(mld_slowtimo,
16597 (void *)ipst,
16598 MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
16600 mutex_exit(&ipst->ips_mld_slowtimeout_lock);
16601 } else {
16602 mutex_enter(&ipst->ips_igmp_slowtimeout_lock);
16603 if (ipst->ips_igmp_slowtimeout_id == 0) {
16604 ipst->ips_igmp_slowtimeout_id = timeout(igmp_slowtimo,
16605 (void *)ipst,
16606 MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
16608 mutex_exit(&ipst->ips_igmp_slowtimeout_lock);
16611 return (err);
16615 * Common routine for ppa and ifname setting. Should be called exclusive.
16617 * Returns EINPROGRESS when mp has been consumed by queueing it on
16618 * ipx_pending_mp and the ioctl will complete in ip_rput.
16620 * NOTE : If ppa is UINT_MAX, we assign the next valid ppa and return
16621 * the new name and new ppa in lifr_name and lifr_ppa respectively.
16622 * For SLIFNAME, we pass these values back to the userland.
16624 static int
16625 ipif_set_values(queue_t *q, mblk_t *mp, char *interf_name, uint_t *new_ppa_ptr)
16627 ill_t *ill;
16628 ipif_t *ipif;
16629 ipsq_t *ipsq;
16630 char *ppa_ptr;
16631 char *old_ptr;
16632 char old_char;
16633 int error;
16634 ip_stack_t *ipst;
16636 ip1dbg(("ipif_set_values: interface %s\n", interf_name));
16637 ASSERT(q->q_next != NULL);
16638 ASSERT(interf_name != NULL);
16640 ill = (ill_t *)q->q_ptr;
16641 ipst = ill->ill_ipst;
16643 ASSERT(ill->ill_ipst != NULL);
16644 ASSERT(ill->ill_name[0] == '\0');
16645 ASSERT(IAM_WRITER_ILL(ill));
16646 ASSERT((mi_strlen(interf_name) + 1) <= LIFNAMSIZ);
16647 ASSERT(ill->ill_ppa == UINT_MAX);
16649 ill->ill_defend_start = ill->ill_defend_count = 0;
16650 /* The ppa is sent down by ifconfig or is chosen */
16651 if ((ppa_ptr = ill_get_ppa_ptr(interf_name)) == NULL) {
16652 return (EINVAL);
16656 * Make sure the ppa passed in is the same as the ppa in the name.
16657 * This check is not made when ppa == UINT_MAX; in that case the ppa
16658 * in the name could be anything. The system will choose a ppa and
16659 * update new_ppa_ptr and interf_name to contain the chosen ppa.
16661 if (*new_ppa_ptr != UINT_MAX) {
16662 /* stoi changes the pointer */
16663 old_ptr = ppa_ptr;
16665 * ifconfig passed in 0 for the ppa for DLPI 1 style devices
16666 * (they don't have an externally visible ppa). We assign one
16667 * here so that we can manage the interface. Note that in
16668 * the past this value was always 0 for DLPI 1 drivers.
16670 if (*new_ppa_ptr == 0)
16671 *new_ppa_ptr = stoi(&old_ptr);
16672 else if (*new_ppa_ptr != (uint_t)stoi(&old_ptr))
16673 return (EINVAL);
16676 * Terminate the string before the ppa and
16677 * save the char at that location.
16679 old_char = ppa_ptr[0];
16680 ppa_ptr[0] = '\0';
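/*
 * For example (a hypothetical name): with interf_name "ce3", ppa_ptr
 * points at "3", stoi() yields a ppa of 3, and the store above leaves the
 * buffer holding "ce" until old_char is restored further down.
 */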
16682 ill->ill_ppa = *new_ppa_ptr;
16684 * Finish as much work now as possible before calling ill_glist_insert
16685 * which makes the ill globally visible and also merges it with the
16686 * other protocol instance of this phyint. The remaining work is
16687 * done after entering the ipsq which may happen sometime later.
16689 ipif = ill->ill_ipif;
16691 /* We didn't do this when we allocated ipif in ip_ll_subnet_defaults */
16692 ipif_assign_seqid(ipif);
16694 if (!(ill->ill_flags & (ILLF_IPV4|ILLF_IPV6)))
16695 ill->ill_flags |= ILLF_IPV4;
16697 ASSERT(ipif->ipif_next == NULL); /* Only one ipif on ill */
16698 ASSERT((ipif->ipif_flags & IPIF_UP) == 0);
16700 if (ill->ill_flags & ILLF_IPV6) {
16702 ill->ill_isv6 = B_TRUE;
16703 ill_set_inputfn(ill);
16704 if (ill->ill_rq != NULL) {
16705 ill->ill_rq->q_qinfo = &iprinitv6;
16708 /* Keep the !IN6_IS_ADDR_V4MAPPED assertions happy */
16709 ipif->ipif_v6lcl_addr = ipv6_all_zeros;
16710 ipif->ipif_v6subnet = ipv6_all_zeros;
16711 ipif->ipif_v6net_mask = ipv6_all_zeros;
16712 ipif->ipif_v6brd_addr = ipv6_all_zeros;
16713 ipif->ipif_v6pp_dst_addr = ipv6_all_zeros;
16714 ill->ill_reachable_retrans_time = ND_RETRANS_TIMER;
16716 * Point-to-point or non-multicast-capable
16717 * interfaces won't do NUD unless explicitly
16718 * configured to do so.
16720 if (ipif->ipif_flags & IPIF_POINTOPOINT ||
16721 !(ill->ill_flags & ILLF_MULTICAST)) {
16722 ill->ill_flags |= ILLF_NONUD;
16724 /* Make sure the IPv4-specific flag is not set on an IPv6 interface */
16725 if (ill->ill_flags & ILLF_NOARP) {
16727 * Note: xresolv interfaces will eventually need
16728 * NOARP set here as well, but that will require
16729 * those external resolvers to have some
16730 * knowledge of that flag and act appropriately.
16731 * Not to be changed at present.
16733 ill->ill_flags &= ~ILLF_NOARP;
16736 * Set the ILLF_ROUTER flag according to the global
16737 * IPv6 forwarding policy.
16739 if (ipst->ips_ipv6_forwarding != 0)
16740 ill->ill_flags |= ILLF_ROUTER;
16741 } else if (ill->ill_flags & ILLF_IPV4) {
16742 ill->ill_isv6 = B_FALSE;
16743 ill_set_inputfn(ill);
16744 ill->ill_reachable_retrans_time = ARP_RETRANS_TIMER;
16745 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6lcl_addr);
16746 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6subnet);
16747 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6net_mask);
16748 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6brd_addr);
16749 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6pp_dst_addr);
16751 * Set the ILLF_ROUTER flag according to the global
16752 * IPv4 forwarding policy.
16754 if (ipst->ips_ip_forwarding != 0)
16755 ill->ill_flags |= ILLF_ROUTER;
16758 ASSERT(ill->ill_phyint != NULL);
16761 * The ipIfStatsIfindex and ipv6IfIcmpIfIndex assignments will
16762 * be completed in ill_glist_insert -> ill_phyint_reinit
16764 if (!ill_allocate_mibs(ill))
16765 return (ENOMEM);
16768 * Pick a default sap until we get the DL_INFO_ACK back from
16769 * the driver.
16771 ill->ill_sap = (ill->ill_isv6) ? ill->ill_media->ip_m_ipv6sap :
16772 ill->ill_media->ip_m_ipv4sap;
16774 ill->ill_ifname_pending = 1;
16775 ill->ill_ifname_pending_err = 0;
16778 * When the first ipif comes up in ipif_up_done(), multicast groups
16779 * that were joined while this ill was not bound to the DLPI link need
16780 * to be recovered by ill_recover_multicast().
16782 ill->ill_need_recover_multicast = 1;
16784 ill_refhold(ill);
16785 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
16786 if ((error = ill_glist_insert(ill, interf_name,
16787 (ill->ill_flags & ILLF_IPV6) == ILLF_IPV6)) > 0) {
16788 ill->ill_ppa = UINT_MAX;
16789 ill->ill_name[0] = '\0';
16791 * undo null termination done above.
16793 ppa_ptr[0] = old_char;
16794 rw_exit(&ipst->ips_ill_g_lock);
16795 ill_refrele(ill);
16796 return (error);
16799 ASSERT(ill->ill_name_length <= LIFNAMSIZ);
16802 * When we return, the buffer pointed to by interf_name should contain
16803 * the same name as in ill_name.
16804 * If a ppa was chosen by the system (the ppa passed in was UINT_MAX),
16805 * the buffer pointed to by new_ppa_ptr would not contain the right ppa,
16806 * so copy the full name and update the ppa ptr.
16807 * When the ppa passed in != UINT_MAX, all values are correct; just undo
16808 * the null termination, which saves a bcopy.
16810 if (*new_ppa_ptr == UINT_MAX) {
16811 bcopy(ill->ill_name, interf_name, ill->ill_name_length);
16812 *new_ppa_ptr = ill->ill_ppa;
16813 } else {
16815 * undo null termination done above.
16817 ppa_ptr[0] = old_char;
16820 /* Let SCTP know about this ILL */
16821 sctp_update_ill(ill, SCTP_ILL_INSERT);
16824 * ill_glist_insert has made the ill visible globally, and
16825 * ill_phyint_reinit could have changed the ipsq. At this point,
16826 * we need to hold the ips_ill_g_lock across the call to enter the
16827 * ipsq to enforce atomicity and prevent reordering. In the event
16828 * the ipsq has changed, and if the new ipsq is currently busy,
16829 * we need to make sure that this half-completed ioctl is ahead of
16830 * any subsequent ioctl. We achieve this by not dropping the
16831 * ips_ill_g_lock which prevents any ill lookup itself thereby
16832 * ensuring that new ioctls can't start.
16834 ipsq = ipsq_try_enter_internal(ill, q, mp, ip_reprocess_ioctl, NEW_OP,
16835 B_TRUE);
16837 rw_exit(&ipst->ips_ill_g_lock);
16838 ill_refrele(ill);
16839 if (ipsq == NULL)
16840 return (EINPROGRESS);
16843 * If ill_phyint_reinit() changed our ipsq, then start on the new ipsq.
16845 if (ipsq->ipsq_xop->ipx_current_ipif == NULL)
16846 ipsq_current_start(ipsq, ipif, SIOCSLIFNAME);
16847 else
16848 ASSERT(ipsq->ipsq_xop->ipx_current_ipif == ipif);
16850 error = ipif_set_values_tail(ill, ipif, mp, q);
16851 ipsq_exit(ipsq);
16852 if (error != 0 && error != EINPROGRESS) {
16854 * restore previous values
16856 ill->ill_isv6 = B_FALSE;
16857 ill_set_inputfn(ill);
16859 return (error);
16862 void
16863 ipif_init(ip_stack_t *ipst)
16865 int i;
16867 for (i = 0; i < MAX_G_HEADS; i++) {
16868 ipst->ips_ill_g_heads[i].ill_g_list_head =
16869 (ill_if_t *)&ipst->ips_ill_g_heads[i];
16870 ipst->ips_ill_g_heads[i].ill_g_list_tail =
16871 (ill_if_t *)&ipst->ips_ill_g_heads[i];
16874 avl_create(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
16875 ill_phyint_compare_index,
16876 sizeof (phyint_t),
16877 offsetof(struct phyint, phyint_avl_by_index));
16878 avl_create(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
16879 ill_phyint_compare_name,
16880 sizeof (phyint_t),
16881 offsetof(struct phyint, phyint_avl_by_name));
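/*
 * A minimal sketch of how these trees are consumed (mirroring the lookups
 * earlier in this file):
 *
 *	avl_index_t where;
 *	phyint_t *phyi;
 *
 *	phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
 *	    "hme0", &where);
 *
 * avl_find() returns the matching phyint, or NULL while filling in
 * `where' with the insertion point for a later avl_insert().
 */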
16885 * Save enough information so that we can recreate the IRE if
16886 * the interface goes down and then up.
16888 void
16889 ill_save_ire(ill_t *ill, ire_t *ire)
16891 mblk_t *save_mp;
16893 save_mp = allocb(sizeof (ifrt_t), BPRI_MED);
16894 if (save_mp != NULL) {
16895 ifrt_t *ifrt;
16897 save_mp->b_wptr += sizeof (ifrt_t);
16898 ifrt = (ifrt_t *)save_mp->b_rptr;
16899 bzero(ifrt, sizeof (ifrt_t));
16900 ifrt->ifrt_type = ire->ire_type;
16901 if (ire->ire_ipversion == IPV4_VERSION) {
16902 ASSERT(!ill->ill_isv6);
16903 ifrt->ifrt_addr = ire->ire_addr;
16904 ifrt->ifrt_gateway_addr = ire->ire_gateway_addr;
16905 ifrt->ifrt_setsrc_addr = ire->ire_setsrc_addr;
16906 ifrt->ifrt_mask = ire->ire_mask;
16907 } else {
16908 ASSERT(ill->ill_isv6);
16909 ifrt->ifrt_v6addr = ire->ire_addr_v6;
16910 /* ire_gateway_addr_v6 can change due to RTM_CHANGE */
16911 mutex_enter(&ire->ire_lock);
16912 ifrt->ifrt_v6gateway_addr = ire->ire_gateway_addr_v6;
16913 mutex_exit(&ire->ire_lock);
16914 ifrt->ifrt_v6setsrc_addr = ire->ire_setsrc_addr_v6;
16915 ifrt->ifrt_v6mask = ire->ire_mask_v6;
16917 ifrt->ifrt_flags = ire->ire_flags;
16918 ifrt->ifrt_zoneid = ire->ire_zoneid;
16919 mutex_enter(&ill->ill_saved_ire_lock);
16920 save_mp->b_cont = ill->ill_saved_ire_mp;
16921 ill->ill_saved_ire_mp = save_mp;
16922 ill->ill_saved_ire_cnt++;
16923 mutex_exit(&ill->ill_saved_ire_lock);
16928 * Remove one entry from ill_saved_ire_mp.
16930 void
16931 ill_remove_saved_ire(ill_t *ill, ire_t *ire)
16933 mblk_t **mpp;
16934 mblk_t *mp;
16935 ifrt_t *ifrt;
16937 /* Remove from ill_saved_ire_mp list if it is there */
16938 mutex_enter(&ill->ill_saved_ire_lock);
16939 for (mpp = &ill->ill_saved_ire_mp; *mpp != NULL;
16940 mpp = &(*mpp)->b_cont) {
16941 in6_addr_t gw_addr_v6;
16944 * On a given ill, the tuple of address, gateway, mask,
16945 * ire_type, and zoneid is unique for each saved IRE.
16947 mp = *mpp;
16948 ifrt = (ifrt_t *)mp->b_rptr;
16949 /* ire_gateway_addr_v6 can change - need lock */
16950 mutex_enter(&ire->ire_lock);
16951 gw_addr_v6 = ire->ire_gateway_addr_v6;
16952 mutex_exit(&ire->ire_lock);
16954 if (ifrt->ifrt_zoneid != ire->ire_zoneid ||
16955 ifrt->ifrt_type != ire->ire_type)
16956 continue;
16958 if (ill->ill_isv6 ?
16959 (IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6addr,
16960 &ire->ire_addr_v6) &&
16961 IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6gateway_addr,
16962 &gw_addr_v6) &&
16963 IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6mask,
16964 &ire->ire_mask_v6)) :
16965 (ifrt->ifrt_addr == ire->ire_addr &&
16966 ifrt->ifrt_gateway_addr == ire->ire_gateway_addr &&
16967 ifrt->ifrt_mask == ire->ire_mask)) {
16968 *mpp = mp->b_cont;
16969 ill->ill_saved_ire_cnt--;
16970 freeb(mp);
16971 break;
16974 mutex_exit(&ill->ill_saved_ire_lock);
16978 * Derive an interface id from the link layer address.
16979 * Knows about IEEE 802 and IEEE EUI-64 mappings.
16981 static void
16982 ip_ether_v6intfid(ill_t *ill, in6_addr_t *v6addr)
16984 char *addr;
16987 * Note that some IPv6 interfaces get plumbed over links that claim to
16988 * be DL_ETHER, but don't actually have Ethernet MAC addresses (e.g.
16989 * PPP links). The ETHERADDRL check here ensures that we only set the
16990 * interface ID on IPv6 interfaces above links that actually have real
16991 * Ethernet addresses.
16993 if (ill->ill_phys_addr_length == ETHERADDRL) {
16994 /* Form EUI-64 like address */
16995 addr = (char *)&v6addr->s6_addr32[2];
16996 bcopy(ill->ill_phys_addr, addr, 3);
16997 addr[0] ^= 0x2; /* Toggle Universal/Local bit */
16998 addr[3] = (char)0xff;
16999 addr[4] = (char)0xfe;
17000 bcopy(ill->ill_phys_addr + 3, addr + 5, 3);
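/*
 * A worked example, for illustration: the MAC address 0:14:4f:24:68:ac
 * yields the interface ID 02:14:4f:ff:fe:24:68:ac -- 0xfffe is inserted
 * in the middle and the universal/local bit is toggled -- so a link-local
 * address formed from it would be fe80::214:4fff:fe24:68ac.
 */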
17004 /* ARGSUSED */
17005 static void
17006 ip_nodef_v6intfid(ill_t *ill, in6_addr_t *v6addr)
17010 typedef struct ipmp_ifcookie {
17011 uint32_t ic_hostid;
17012 char ic_ifname[LIFNAMSIZ];
17013 char ic_zonename[ZONENAME_MAX];
17014 } ipmp_ifcookie_t;
17017 * Construct a pseudo-random interface ID for the IPMP interface that's both
17018 * predictable and (almost) guaranteed to be unique.
17020 static void
17021 ip_ipmp_v6intfid(ill_t *ill, in6_addr_t *v6addr)
17023 zone_t *zp;
17024 uint8_t *addr;
17025 uchar_t hash[16];
17026 ulong_t hostid;
17027 MD5_CTX ctx;
17028 ipmp_ifcookie_t ic = { 0 };
17030 ASSERT(IS_IPMP(ill));
17032 (void) ddi_strtoul(hw_serial, NULL, 10, &hostid);
17033 ic.ic_hostid = htonl((uint32_t)hostid);
17035 (void) strlcpy(ic.ic_ifname, ill->ill_name, LIFNAMSIZ);
17037 if ((zp = zone_find_by_id(ill->ill_zoneid)) != NULL) {
17038 (void) strlcpy(ic.ic_zonename, zp->zone_name, ZONENAME_MAX);
17039 zone_rele(zp);
17042 MD5Init(&ctx);
17043 MD5Update(&ctx, &ic, sizeof (ic));
17044 MD5Final(hash, &ctx);
17047 * Map the hash to an interface ID per the basic approach in RFC3041.
17049 addr = &v6addr->s6_addr8[8];
17050 bcopy(hash + 8, addr, sizeof (uint64_t));
17051 addr[0] &= ~0x2; /* clear universal/local bit: the ID is local */
17055 * Map the multicast in6_addr_t in m_ip6addr to the physaddr for ethernet.
17057 static void
17058 ip_ether_v6_mapping(ill_t *ill, uchar_t *m_ip6addr, uchar_t *m_physaddr)
17060 phyint_t *phyi = ill->ill_phyint;
17063 * Check PHYI_MULTI_BCAST and length of physical
17064 * address to determine if we use the mapping or the
17065 * broadcast address.
17067 if ((phyi->phyint_flags & PHYI_MULTI_BCAST) != 0 ||
17068 ill->ill_phys_addr_length != ETHERADDRL) {
17069 ip_mbcast_mapping(ill, m_ip6addr, m_physaddr);
17070 return;
17072 m_physaddr[0] = 0x33;
17073 m_physaddr[1] = 0x33;
17074 m_physaddr[2] = m_ip6addr[12];
17075 m_physaddr[3] = m_ip6addr[13];
17076 m_physaddr[4] = m_ip6addr[14];
17077 m_physaddr[5] = m_ip6addr[15];
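/*
 * For example: the solicited-node group ff02::1:ff24:68ac maps to the
 * Ethernet address 33:33:ff:24:68:ac. Only the low 32 bits of the IPv6
 * group survive, so distinct groups can share one link-layer address.
 */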
17081 * Map the multicast ipaddr_t in m_ipaddr to the physaddr for ethernet.
17083 static void
17084 ip_ether_v4_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr)
17086 phyint_t *phyi = ill->ill_phyint;
17089 * Check PHYI_MULTI_BCAST and length of physical
17090 * address to determine if we use the mapping or the
17091 * broadcast address.
17093 if ((phyi->phyint_flags & PHYI_MULTI_BCAST) != 0 ||
17094 ill->ill_phys_addr_length != ETHERADDRL) {
17095 ip_mbcast_mapping(ill, m_ipaddr, m_physaddr);
17096 return;
17098 m_physaddr[0] = 0x01;
17099 m_physaddr[1] = 0x00;
17100 m_physaddr[2] = 0x5e;
17101 m_physaddr[3] = m_ipaddr[1] & 0x7f;
17102 m_physaddr[4] = m_ipaddr[2];
17103 m_physaddr[5] = m_ipaddr[3];
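/*
 * For example: 224.1.2.3 maps to 01:00:5e:01:02:03. Since only the low
 * 23 bits of the group are used (note the m_ipaddr[1] & 0x7f above), 32
 * IPv4 groups -- e.g. 224.1.2.3 and 225.129.2.3 -- share each Ethernet
 * multicast address.
 */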
17106 /* ARGSUSED */
17107 static void
17108 ip_mbcast_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr)
17111 * for the MULTI_BCAST case and other cases when we want to
17112 * use the link-layer broadcast address for multicast.
17114 uint8_t *bphys_addr;
17115 dl_unitdata_req_t *dlur;
17117 dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr;
17118 if (ill->ill_sap_length < 0) {
17119 bphys_addr = (uchar_t *)dlur +
17120 dlur->dl_dest_addr_offset;
17121 } else {
17122 bphys_addr = (uchar_t *)dlur +
17123 dlur->dl_dest_addr_offset + ill->ill_sap_length;
17126 bcopy(bphys_addr, m_physaddr, ill->ill_phys_addr_length);
17130 * Derive IPoIB interface id from the link layer address.
17132 static void
17133 ip_ib_v6intfid(ill_t *ill, in6_addr_t *v6addr)
17135 char *addr;
17137 ASSERT(ill->ill_phys_addr_length == 20);
17138 addr = (char *)&v6addr->s6_addr32[2];
17139 bcopy(ill->ill_phys_addr + 12, addr, 8);
17141 * In IBA 1.1 timeframe, some vendors erroneously set the u/l bit
17142 * in the globally assigned EUI-64 GUID to 1, in violation of IEEE
17143 * rules. In these cases, the IBA considers these GUIDs to be in
17144 * "Modified EUI-64" format, and thus toggling the u/l bit is not
17145 * required; vendors are required not to assign global EUI-64's
17146 * that differ only in u/l bit values, thus guaranteeing uniqueness
17147 * of the interface identifier. Whether the GUID is in modified
17148 * or proper EUI-64 format, the ipv6 identifier must have the u/l
17149 * bit set to 1.
17151 addr[0] |= 2; /* Set Universal/Local bit to 1 */
17155 * Map the multicast ipaddr_t in m_ipaddr to the physaddr for InfiniBand.
17156 * Note on mapping from multicast IP addresses to IPoIB multicast link
17157 * addresses. IPoIB multicast link addresses are based on IBA link addresses.
17158 * The format of an IPoIB multicast address is:
17160 * 4 byte QPN Scope Sign. Pkey
17161 * +--------------------------------------------+
17162 * | 00FFFFFF | FF | 1X | X01B | Pkey | GroupID |
17163 * +--------------------------------------------+
17165 * The Scope and Pkey components are properties of the IBA port and
17166 * network interface. They can be ascertained from the broadcast address.
17167 * The Sign. part is the signature, and is 401B for IPv4 and 601B for IPv6.
17169 static void
17170 ip_ib_v4_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr)
17172 static uint8_t ipv4_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff,
17173 0xff, 0x10, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
17174 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
17175 uint8_t *bphys_addr;
17176 dl_unitdata_req_t *dlur;
17178 bcopy(ipv4_g_phys_ibmulti_addr, m_physaddr, ill->ill_phys_addr_length);
17181 * RFC 4391: the IPv4 MGID is 28 bits long.
17183 m_physaddr[16] = m_ipaddr[0] & 0x0f;
17184 m_physaddr[17] = m_ipaddr[1];
17185 m_physaddr[18] = m_ipaddr[2];
17186 m_physaddr[19] = m_ipaddr[3];
17189 dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr;
17190 if (ill->ill_sap_length < 0) {
17191 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset;
17192 } else {
17193 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset +
17194 ill->ill_sap_length;
17197 * Now fill in the IBA scope/Pkey values from the broadcast address.
17199 m_physaddr[5] = bphys_addr[5];
17200 m_physaddr[8] = bphys_addr[8];
17201 m_physaddr[9] = bphys_addr[9];
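/*
 * For example: the group 239.255.1.2 contributes the 28-bit MGID suffix
 * 0f:ff:01:02 in m_physaddr[16..19], while bytes 6 and 7 of the template
 * above (0x40, 0x1b) carry the 401B IPv4 signature described earlier.
 */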
17204 static void
17205 ip_ib_v6_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr)
17207 static uint8_t ipv6_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff,
17208 0xff, 0x10, 0x60, 0x1b, 0x00, 0x00, 0x00, 0x00,
17209 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
17210 uint8_t *bphys_addr;
17211 dl_unitdata_req_t *dlur;
17213 bcopy(ipv6_g_phys_ibmulti_addr, m_physaddr, ill->ill_phys_addr_length);
17216 * RFC 4391: the IPv6 MGID is 80 bits long.
17218 bcopy(&m_ipaddr[6], &m_physaddr[10], 10);
17220 dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr;
17221 if (ill->ill_sap_length < 0) {
17222 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset;
17223 } else {
17224 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset +
17225 ill->ill_sap_length;
17228 * Now fill in the IBA scope/Pkey values from the broadcast address.
17230 m_physaddr[5] = bphys_addr[5];
17231 m_physaddr[8] = bphys_addr[8];
17232 m_physaddr[9] = bphys_addr[9];
17236 * Derive IPv6 interface id from an IPv4 link-layer address (e.g. from an IPv4
17237 * tunnel). The IPv4 address simply gets placed in the lower 4 bytes of the
17238 * IPv6 interface id. This is a suggested mechanism described in section 3.7
17239 * of RFC4213.
17241 static void
17242 ip_ipv4_genv6intfid(ill_t *ill, uint8_t *physaddr, in6_addr_t *v6addr)
17244 ASSERT(ill->ill_phys_addr_length == sizeof (ipaddr_t));
17245 v6addr->s6_addr32[2] = 0;
17246 bcopy(physaddr, &v6addr->s6_addr32[3], sizeof (ipaddr_t));
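/*
 * For example: a tunnel source address of 192.0.2.1 produces the
 * interface ID ::c000:201 in the low 8 bytes, so a link-local address
 * over that tunnel would be fe80::c000:201 (per RFC 4213 section 3.7).
 */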
17250 * Derive IPv6 interface id from an IPv6 link-layer address (e.g. from an IPv6
17251 * tunnel). The lower 8 bytes of the IPv6 address simply become the interface
17252 * id.
17254 static void
17255 ip_ipv6_genv6intfid(ill_t *ill, uint8_t *physaddr, in6_addr_t *v6addr)
17257 in6_addr_t *v6lladdr = (in6_addr_t *)physaddr;
17259 ASSERT(ill->ill_phys_addr_length == sizeof (in6_addr_t));
17260 bcopy(&v6lladdr->s6_addr32[2], &v6addr->s6_addr32[2], 8);
17263 static void
17264 ip_ipv6_v6intfid(ill_t *ill, in6_addr_t *v6addr)
17266 ip_ipv6_genv6intfid(ill, ill->ill_phys_addr, v6addr);
17269 static void
17270 ip_ipv6_v6destintfid(ill_t *ill, in6_addr_t *v6addr)
17272 ip_ipv6_genv6intfid(ill, ill->ill_dest_addr, v6addr);
17275 static void
17276 ip_ipv4_v6intfid(ill_t *ill, in6_addr_t *v6addr)
17278 ip_ipv4_genv6intfid(ill, ill->ill_phys_addr, v6addr);
17281 static void
17282 ip_ipv4_v6destintfid(ill_t *ill, in6_addr_t *v6addr)
17284 ip_ipv4_genv6intfid(ill, ill->ill_dest_addr, v6addr);
17288 * Look up an ill and verify that the zoneid has an ipif on that ill.
17289 * Returns a held ill, or NULL.
17291 ill_t *
17292 ill_lookup_on_ifindex_zoneid(uint_t index, zoneid_t zoneid, boolean_t isv6,
17293 ip_stack_t *ipst)
17295 ill_t *ill;
17296 ipif_t *ipif;
17298 ill = ill_lookup_on_ifindex(index, isv6, ipst);
17299 if (ill == NULL)
17300 return (NULL);
17302 mutex_enter(&ill->ill_lock);
17303 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
17304 if (IPIF_IS_CONDEMNED(ipif))
17305 continue;
17306 if (zoneid != ALL_ZONES && ipif->ipif_zoneid != zoneid &&
17307 ipif->ipif_zoneid != ALL_ZONES)
17308 continue;
17310 mutex_exit(&ill->ill_lock);
17311 return (ill);
17313 mutex_exit(&ill->ill_lock);
17314 ill_refrele(ill);
17315 return (NULL);
17319 * Return a pointer to an ipif_t given a combination of (ill_idx, ipif_id).
17320 * If a pointer to an ipif_t is returned then the caller will need to do
17321 * an ipif_refrele().
17323 ipif_t *
17324 ipif_getby_indexes(uint_t ifindex, uint_t lifidx, boolean_t isv6,
17325 ip_stack_t *ipst)
17327 ipif_t *ipif;
17328 ill_t *ill;
17330 ill = ill_lookup_on_ifindex(ifindex, isv6, ipst);
17331 if (ill == NULL)
17332 return (NULL);
17334 mutex_enter(&ill->ill_lock);
17335 if (ill->ill_state_flags & ILL_CONDEMNED) {
17336 mutex_exit(&ill->ill_lock);
17337 ill_refrele(ill);
17338 return (NULL);
17341 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
17342 if (!IPIF_CAN_LOOKUP(ipif))
17343 continue;
17344 if (lifidx == ipif->ipif_id) {
17345 ipif_refhold_locked(ipif);
17346 break;
17350 mutex_exit(&ill->ill_lock);
17351 ill_refrele(ill);
17352 return (ipif);
17356 * Set ill_inputfn based on the current known state.
17357 * This needs to be called when any of the factors taken into
17358 * account changes.
17360 void
17361 ill_set_inputfn(ill_t *ill)
17363 ip_stack_t *ipst = ill->ill_ipst;
17365 if (ill->ill_isv6) {
17366 ill->ill_inputfn = ill_input_short_v6;
17367 } else {
17368 if (ill->ill_dhcpinit != 0)
17369 ill->ill_inputfn = ill_input_full_v4;
17370 else if (ipst->ips_ipcl_proto_fanout_v4[IPPROTO_RSVP].connf_head
17371 != NULL)
17372 ill->ill_inputfn = ill_input_full_v4;
17373 else
17374 ill->ill_inputfn = ill_input_short_v4;
17379 * Re-evaluate ill_inputfn for all the IPv4 ills.
17380 * Used when RSVP comes and goes.
17382 void
17383 ill_set_inputfn_all(ip_stack_t *ipst)
17385 ill_walk_context_t ctx;
17386 ill_t *ill;
17388 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
17389 ill = ILL_START_WALK_V4(&ctx, ipst);
17390 for (; ill != NULL; ill = ill_next(&ctx, ill))
17391 ill_set_inputfn(ill);
17393 rw_exit(&ipst->ips_ill_g_lock);
17395
17396 /*
17397 * Set the physical address information for `ill' to the contents of the
17398 * dl_notify_ind_t pointed to by `mp'. Must be called as writer, and will be
17399 * asynchronous if `ill' cannot immediately be quiesced -- in which case
17400 * EINPROGRESS will be returned.
17401 */
17402 int
17403 ill_set_phys_addr(ill_t *ill, mblk_t *mp)
17404 {
17405 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq;
17406 dl_notify_ind_t *dlindp = (dl_notify_ind_t *)mp->b_rptr;
17408 ASSERT(IAM_WRITER_IPSQ(ipsq));
17410 if (dlindp->dl_data != DL_IPV6_LINK_LAYER_ADDR &&
17411 dlindp->dl_data != DL_CURR_DEST_ADDR &&
17412 dlindp->dl_data != DL_CURR_PHYS_ADDR) {
17413 /* Changing DL_IPV6_TOKEN is not yet supported */
17414 return (0);
17418 * We need to store up to two copies of `mp' in `ill'. Due to the
17419 * design of ipsq_pending_mp_add(), we can't pass them as separate
17420 * arguments to ill_set_phys_addr_tail(). Instead, chain them
17421 * together here, then pull 'em apart in ill_set_phys_addr_tail().
17423 if ((mp = copyb(mp)) == NULL || (mp->b_cont = copyb(mp)) == NULL) {
17424 freemsg(mp);
17425 return (ENOMEM);
17428 ipsq_current_start(ipsq, ill->ill_ipif, 0);
17431 * Since we'll only do a logical down, we can't rely on ipif_down
17432 * to turn on ILL_DOWN_IN_PROGRESS, or for the DL_BIND_ACK to reset
17433 * ILL_DOWN_IN_PROGRESS. We instead manage this separately for this
17434 * case, to quiesce ire's and nce's for ill_is_quiescent.
17436 mutex_enter(&ill->ill_lock);
17437 ill->ill_state_flags |= ILL_DOWN_IN_PROGRESS;
17438 /* no more ire/nce addition allowed */
17439 mutex_exit(&ill->ill_lock);
17442 * If we can quiesce the ill, then set the address. If not, then
17443 * ill_set_phys_addr_tail() will be called from ipif_ill_refrele_tail().
17445 ill_down_ipifs(ill, B_TRUE);
17446 mutex_enter(&ill->ill_lock);
17447 if (!ill_is_quiescent(ill)) {
17448 /* call cannot fail since `conn_t *' argument is NULL */
17449 (void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq,
17450 mp, ILL_DOWN);
17451 mutex_exit(&ill->ill_lock);
17452 return (EINPROGRESS);
17454 mutex_exit(&ill->ill_lock);
17456 ill_set_phys_addr_tail(ipsq, ill->ill_rq, mp, NULL);
17457 return (0);
17458 }
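/*
 * For illustration, a compressed sketch of the mblk chaining used above:
 * two copies of the notification travel as a single b_cont chain and are
 * pulled apart again in ill_set_phys_addr_tail(). NULL checks elided.
 */
#if 0
	mblk_t *chain, *second;

	chain = copyb(mp);		/* first copy of the notification */
	chain->b_cont = copyb(chain);	/* second copy, linked via b_cont */
	/* ... later, in ill_set_phys_addr_tail() ... */
	second = unlinkb(chain);	/* detach the second copy */
#endif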
17459
17460 /*
17461 * When the allowed-ips link property is set on the datalink, IP receives a
17462 * DL_NOTE_ALLOWED_IPS notification that is processed in ill_set_allowed_ips()
17463 * to initialize the ill_allowed_ips[] array in the ill_t. This array is then
17464 * used to vet addresses passed to ip_sioctl_addr() and to ensure that the
17465 * only IP addresses configured on the ill_t are those in the ill_allowed_ips[]
17466 * array.
17467 */
17468 void
17469 ill_set_allowed_ips(ill_t *ill, mblk_t *mp)
17471 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq;
17472 dl_notify_ind_t *dlip = (dl_notify_ind_t *)mp->b_rptr;
17473 mac_protect_t *mrp;
17474 int i;
17476 ASSERT(IAM_WRITER_IPSQ(ipsq));
17477 mrp = (mac_protect_t *)&dlip[1];
17479 if (mrp->mp_ipaddrcnt == 0) { /* reset allowed-ips */
17480 kmem_free(ill->ill_allowed_ips,
17481 ill->ill_allowed_ips_cnt * sizeof (in6_addr_t));
17482 ill->ill_allowed_ips_cnt = 0;
17483 ill->ill_allowed_ips = NULL;
17484 mutex_enter(&ill->ill_phyint->phyint_lock);
17485 ill->ill_phyint->phyint_flags &= ~PHYI_L3PROTECT;
17486 mutex_exit(&ill->ill_phyint->phyint_lock);
17487 return;
17490 if (ill->ill_allowed_ips != NULL) {
17491 kmem_free(ill->ill_allowed_ips,
17492 ill->ill_allowed_ips_cnt * sizeof (in6_addr_t));
17494 ill->ill_allowed_ips_cnt = mrp->mp_ipaddrcnt;
17495 ill->ill_allowed_ips = kmem_alloc(
17496 ill->ill_allowed_ips_cnt * sizeof (in6_addr_t), KM_SLEEP);
17497 for (i = 0; i < mrp->mp_ipaddrcnt; i++)
17498 ill->ill_allowed_ips[i] = mrp->mp_ipaddrs[i].ip_addr;
17500 mutex_enter(&ill->ill_phyint->phyint_lock);
17501 ill->ill_phyint->phyint_flags |= PHYI_L3PROTECT;
17502 mutex_exit(&ill->ill_phyint->phyint_lock);
17503 }
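/*
 * For illustration, the kind of vetting check ip_sioctl_addr() can apply
 * against ill_allowed_ips[]; this helper is hypothetical, not part of
 * this file.
 */
#if 0
static boolean_t
ill_addr_is_allowed(ill_t *ill, const in6_addr_t *addr)
{
	uint_t i;

	/* no allowed-ips property in force; any address is acceptable */
	if (ill->ill_allowed_ips_cnt == 0)
		return (B_TRUE);

	for (i = 0; i < ill->ill_allowed_ips_cnt; i++) {
		if (IN6_ARE_ADDR_EQUAL(&ill->ill_allowed_ips[i], addr))
			return (B_TRUE);
	}
	return (B_FALSE);
}
#endif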
17504
17505 /*
17506 * Once the ill associated with `q' has quiesced, set its physical address
17507 * information to the values in `addrmp'. Note that two copies of `addrmp'
17508 * are passed (linked by b_cont), since we sometimes need to save two distinct
17509 * copies in the ill_t, and our context doesn't permit sleeping or allocation
17510 * failure (we'll free the other copy if it's not needed). Since the ill_t
17511 * is quiesced, we know any stale nce's with the old address information have
17512 * already been removed, so we don't need to call nce_flush().
17513 */
17514 /* ARGSUSED */
17515 static void
17516 ill_set_phys_addr_tail(ipsq_t *ipsq, queue_t *q, mblk_t *addrmp, void *dummy)
17518 ill_t *ill = q->q_ptr;
17519 mblk_t *addrmp2 = unlinkb(addrmp);
17520 dl_notify_ind_t *dlindp = (dl_notify_ind_t *)addrmp->b_rptr;
17521 uint_t addrlen, addroff;
17522 int status;
17524 ASSERT(IAM_WRITER_IPSQ(ipsq));
17526 addroff = dlindp->dl_addr_offset;
17527 addrlen = dlindp->dl_addr_length - ABS(ill->ill_sap_length);
17529 switch (dlindp->dl_data) {
17530 case DL_IPV6_LINK_LAYER_ADDR:
17531 ill_set_ndmp(ill, addrmp, addroff, addrlen);
17532 freemsg(addrmp2);
17533 break;
17535 case DL_CURR_DEST_ADDR:
17536 freemsg(ill->ill_dest_addr_mp);
17537 ill->ill_dest_addr = addrmp->b_rptr + addroff;
17538 ill->ill_dest_addr_mp = addrmp;
17539 if (ill->ill_isv6) {
17540 ill_setdesttoken(ill);
17541 ipif_setdestlinklocal(ill->ill_ipif);
17543 freemsg(addrmp2);
17544 break;
17546 case DL_CURR_PHYS_ADDR:
17547 freemsg(ill->ill_phys_addr_mp);
17548 ill->ill_phys_addr = addrmp->b_rptr + addroff;
17549 ill->ill_phys_addr_mp = addrmp;
17550 ill->ill_phys_addr_length = addrlen;
17551 if (ill->ill_isv6)
17552 ill_set_ndmp(ill, addrmp2, addroff, addrlen);
17553 else
17554 freemsg(addrmp2);
17555 if (ill->ill_isv6) {
17556 ill_setdefaulttoken(ill);
17557 ipif_setlinklocal(ill->ill_ipif);
17559 break;
17560 default:
17561 ASSERT(0);
17565 * reset ILL_DOWN_IN_PROGRESS so that we can successfully add ires
17566 * as we bring the ipifs up again.
17568 mutex_enter(&ill->ill_lock);
17569 ill->ill_state_flags &= ~ILL_DOWN_IN_PROGRESS;
17570 mutex_exit(&ill->ill_lock);
17572 * If there are ipifs to bring up, ill_up_ipifs() will return
17573 * EINPROGRESS, and ipsq_current_finish() will be called by
17574 * ip_rput_dlpi_writer() or arp_bringup_done() when the last ipif is
17575 * brought up.
17577 status = ill_up_ipifs(ill, q, addrmp);
17578 if (status != EINPROGRESS)
17579 ipsq_current_finish(ipsq);
17583 * Helper routine for setting the ill_nd_lla fields.
17585 void
17586 ill_set_ndmp(ill_t *ill, mblk_t *ndmp, uint_t addroff, uint_t addrlen)
17588 freemsg(ill->ill_nd_lla_mp);
17589 ill->ill_nd_lla = ndmp->b_rptr + addroff;
17590 ill->ill_nd_lla_mp = ndmp;
17591 ill->ill_nd_lla_len = addrlen;
17592 }
17593
17594 /*
17595 * Replumb the ill.
17596 */
17597 int
17598 ill_replumb(ill_t *ill, mblk_t *mp)
17599 {
17600 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq;
17602 ASSERT(IAM_WRITER_IPSQ(ipsq));
17604 ipsq_current_start(ipsq, ill->ill_ipif, 0);
17607 * If we can quiesce the ill, then continue. If not, then
17608 * ill_replumb_tail() will be called from ipif_ill_refrele_tail().
17610 ill_down_ipifs(ill, B_FALSE);
17612 mutex_enter(&ill->ill_lock);
17613 if (!ill_is_quiescent(ill)) {
17614 /* call cannot fail since `conn_t *' argument is NULL */
17615 (void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq,
17616 mp, ILL_DOWN);
17617 mutex_exit(&ill->ill_lock);
17618 return (EINPROGRESS);
17620 mutex_exit(&ill->ill_lock);
17622 ill_replumb_tail(ipsq, ill->ill_rq, mp, NULL);
17623 return (0);
17626 /* ARGSUSED */
17627 static void
17628 ill_replumb_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy)
17630 ill_t *ill = q->q_ptr;
17631 int err;
17632 conn_t *connp = NULL;
17634 ASSERT(IAM_WRITER_IPSQ(ipsq));
17635 freemsg(ill->ill_replumb_mp);
17636 ill->ill_replumb_mp = copyb(mp);
17638 if (ill->ill_replumb_mp == NULL) {
17639 /* out of memory */
17640 ipsq_current_finish(ipsq);
17641 return;
17644 mutex_enter(&ill->ill_lock);
17645 ill->ill_up_ipifs = ipsq_pending_mp_add(NULL, ill->ill_ipif,
17646 ill->ill_rq, ill->ill_replumb_mp, 0);
17647 mutex_exit(&ill->ill_lock);
17649 if (!ill->ill_up_ipifs) {
17650 /* already closing */
17651 ipsq_current_finish(ipsq);
17652 return;
17654 ill->ill_replumbing = 1;
17655 err = ill_down_ipifs_tail(ill);
17656
17657 /*
17658 * Having successfully quiesced and brought down the interface, we now
17659 * send the DL_NOTE_REPLUMB_DONE message down to the driver. Reuse the
17660 * DL_NOTE_REPLUMB message.
17661 */
17662 mp = mexchange(NULL, mp, sizeof (dl_notify_conf_t), M_PROTO,
17663 DL_NOTIFY_CONF);
17664 ASSERT(mp != NULL);
17665 ((dl_notify_conf_t *)mp->b_rptr)->dl_notification =
17666 DL_NOTE_REPLUMB_DONE;
17667 ill_dlpi_send(ill, mp);
17670 * For IPv4, we would usually get EINPROGRESS because the ETHERTYPE_ARP
17671 * streams have to be unbound. When all the DLPI exchanges are done,
17672 * ipsq_current_finish() will be called by arp_bringup_done(). The
17673 * remainder of ipif bringup via ill_up_ipifs() will also be done in
17674 * arp_bringup_done().
17676 ASSERT(ill->ill_replumb_mp != NULL);
17677 if (err == EINPROGRESS)
17678 return;
17679 else
17680 ill->ill_replumb_mp = ipsq_pending_mp_get(ipsq, &connp);
17681 ASSERT(connp == NULL);
17682 if (err == 0 && ill->ill_replumb_mp != NULL &&
17683 ill_up_ipifs(ill, q, ill->ill_replumb_mp) == EINPROGRESS) {
17684 return;
17686 ipsq_current_finish(ipsq);
17690 * Issue ioctl `cmd' on `lh'; caller provides the initial payload in `buf'
17691 * which is `bufsize' bytes. On success, zero is returned and `buf' updated
17692 * as per the ioctl. On failure, an errno is returned.
17694 static int
17695 ip_ioctl(ldi_handle_t lh, int cmd, void *buf, uint_t bufsize, cred_t *cr)
17697 int rval;
17698 struct strioctl iocb;
17700 iocb.ic_cmd = cmd;
17701 iocb.ic_timout = 15;
17702 iocb.ic_len = bufsize;
17703 iocb.ic_dp = buf;
17705 return (ldi_ioctl(lh, I_STR, (intptr_t)&iocb, FKIOCTL, cr, &rval));
17706 }
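/*
 * For illustration, ip_ioctl() is the kernel (LDI) analogue of a plain
 * ioctl(2) issued from userland. An assumed userland equivalent of the
 * SIOCGLIFNUM call made below (headers and error handling omitted):
 */
#if 0
	struct lifnum lifn;
	int s = socket(AF_INET, SOCK_DGRAM, 0);

	bzero(&lifn, sizeof (lifn));
	lifn.lifn_family = AF_INET;
	if (ioctl(s, SIOCGLIFNUM, &lifn) < 0)
		perror("SIOCGLIFNUM");
#endif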
17707
17708 /*
17709 * Issue an SIOCGLIFCONF for address family `af' and store the result in
17710 * `lifcp', whose dynamically-allocated `lifc_buf' is `*bufsizep' bytes on success.
17711 */
17712 static int
17713 ip_lifconf_ioctl(ldi_handle_t lh, int af, struct lifconf *lifcp,
17714 uint_t *bufsizep, cred_t *cr)
17716 int err;
17717 struct lifnum lifn;
17719 bzero(&lifn, sizeof (lifn));
17720 lifn.lifn_family = af;
17721 lifn.lifn_flags = LIFC_UNDER_IPMP;
17723 if ((err = ip_ioctl(lh, SIOCGLIFNUM, &lifn, sizeof (lifn), cr)) != 0)
17724 return (err);
17727 * Pad the interface count to account for additional interfaces that
17728 * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF.
17730 lifn.lifn_count += 4;
17731 bzero(lifcp, sizeof (*lifcp));
17732 lifcp->lifc_flags = LIFC_UNDER_IPMP;
17733 lifcp->lifc_family = af;
17734 lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq);
17735 lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP);
17737 err = ip_ioctl(lh, SIOCGLIFCONF, lifcp, sizeof (*lifcp), cr);
17738 if (err != 0) {
17739 kmem_free(lifcp->lifc_buf, *bufsizep);
17740 return (err);
17743 return (0);
17744 }
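/*
 * For illustration, the allocation contract of ip_lifconf_ioctl(): on
 * success the caller owns `lifc_buf' and must free it using the returned
 * buffer size, as ip_ipmp_cleanup() does below. Hypothetical caller:
 */
#if 0
	struct lifconf lifc;
	uint_t bufsize;

	if (ip_lifconf_ioctl(lh, AF_INET, &lifc, &bufsize, cr) == 0) {
		/* lifc.lifc_len / sizeof (struct lifreq) entries are valid */
		kmem_free(lifc.lifc_buf, bufsize);
	}
#endif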
17745
17746 /*
17747 * Helper for ip_interface_cleanup() that removes the loopback interface.
17748 */
17749 static void
17750 ip_loopback_removeif(ldi_handle_t lh, boolean_t isv6, cred_t *cr)
17752 int err;
17753 struct lifreq lifr;
17755 bzero(&lifr, sizeof (lifr));
17756 (void) strcpy(lifr.lifr_name, ipif_loopback_name);
17759 * Attempt to remove the interface. It may legitimately not exist
17760 * (e.g. the zone administrator unplumbed it), so ignore ENXIO.
17762 err = ip_ioctl(lh, SIOCLIFREMOVEIF, &lifr, sizeof (lifr), cr);
17763 if (err != 0 && err != ENXIO) {
17764 ip0dbg(("ip_loopback_removeif: IP%s SIOCLIFREMOVEIF failed: "
17765 "error %d\n", isv6 ? "v6" : "v4", err));
17770 * Helper for ip_interface_cleanup() that ensures no IP interfaces are in IPMP
17771 * groups and that IPMP data addresses are down. These conditions must be met
17772 * so that IPMP interfaces can be I_PUNLINK'd, as per ip_sioctl_plink_ipmp().
17774 static void
17775 ip_ipmp_cleanup(ldi_handle_t lh, boolean_t isv6, cred_t *cr)
17777 int af = isv6 ? AF_INET6 : AF_INET;
17778 int i, nifs;
17779 int err;
17780 uint_t bufsize;
17781 uint_t lifrsize = sizeof (struct lifreq);
17782 struct lifconf lifc;
17783 struct lifreq *lifrp;
17785 if ((err = ip_lifconf_ioctl(lh, af, &lifc, &bufsize, cr)) != 0) {
17786 cmn_err(CE_WARN, "ip_ipmp_cleanup: cannot get interface list "
17787 "(error %d); IPMP interfaces cannot be shut down", err);
17788 return;
17791 nifs = lifc.lifc_len / lifrsize;
17792 for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) {
17793 err = ip_ioctl(lh, SIOCGLIFFLAGS, lifrp, lifrsize, cr);
17794 if (err != 0) {
17795 cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot get "
17796 "flags: error %d", lifrp->lifr_name, err);
17797 continue;
17800 if (lifrp->lifr_flags & IFF_IPMP) {
17801 if ((lifrp->lifr_flags & (IFF_UP|IFF_DUPLICATE)) == 0)
17802 continue;
17804 lifrp->lifr_flags &= ~IFF_UP;
17805 err = ip_ioctl(lh, SIOCSLIFFLAGS, lifrp, lifrsize, cr);
17806 if (err != 0) {
17807 cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot "
17808 "bring down (error %d); IPMP interface may "
17809 "not be shutdown", lifrp->lifr_name, err);
17813 * Check if IFF_DUPLICATE is still set -- and if so,
17814 * reset the address to clear it.
17816 err = ip_ioctl(lh, SIOCGLIFFLAGS, lifrp, lifrsize, cr);
17817 if (err != 0 || !(lifrp->lifr_flags & IFF_DUPLICATE))
17818 continue;
17820 err = ip_ioctl(lh, SIOCGLIFADDR, lifrp, lifrsize, cr);
17821 if (err != 0 || (err = ip_ioctl(lh, SIOCSLIFADDR,
17822 lifrp, lifrsize, cr)) != 0) {
17823 cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot "
17824 "reset DAD (error %d); IPMP interface may "
17825 "not be shut down", lifrp->lifr_name, err);
17827 continue;
17830 if (strchr(lifrp->lifr_name, IPIF_SEPARATOR_CHAR) == NULL) {
17831 lifrp->lifr_groupname[0] = '\0';
17832 if ((err = ip_ioctl(lh, SIOCSLIFGROUPNAME, lifrp,
17833 lifrsize, cr)) != 0) {
17834 cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot "
17835 "leave IPMP group (error %d); associated "
17836 "IPMP interface may not be shutdown",
17837 lifrp->lifr_name, err);
17838 continue;
17843 kmem_free(lifc.lifc_buf, bufsize);
17846 #define UDPDEV "/devices/pseudo/udp@0:udp"
17847 #define UDP6DEV "/devices/pseudo/udp6@0:udp6"
17850 * Remove the loopback interfaces and prep the IPMP interfaces to be torn down.
17851 * Non-loopback interfaces are either I_LINK'd or I_PLINK'd; the former go away
17852 * when the user-level processes in the zone are killed and the latter are
17853 * cleaned up by str_stack_shutdown().
17855 void
17856 ip_interface_cleanup(ip_stack_t *ipst)
17858 ldi_handle_t lh;
17859 ldi_ident_t li;
17860 cred_t *cr;
17861 int err;
17862 int i;
17863 char *devs[] = { UDP6DEV, UDPDEV };
17864 netstackid_t stackid = ipst->ips_netstack->netstack_stackid;
17866 if ((err = ldi_ident_from_major(ddi_name_to_major("ip"), &li)) != 0) {
17867 cmn_err(CE_WARN, "ip_interface_cleanup: cannot get ldi ident:"
17868 " error %d", err);
17869 return;
17872 cr = zone_get_kcred(netstackid_to_zoneid(stackid));
17873 ASSERT(cr != NULL);
17876 * NOTE: loop executes exactly twice and is hardcoded to know that the
17877 * first iteration is IPv6. (Unrolling yields repetitious code, hence
17878 * the loop.)
17880 for (i = 0; i < 2; i++) {
17881 err = ldi_open_by_name(devs[i], FREAD|FWRITE, cr, &lh, li);
17882 if (err != 0) {
17883 cmn_err(CE_WARN, "ip_interface_cleanup: cannot open %s:"
17884 " error %d", devs[i], err);
17885 continue;
17888 ip_loopback_removeif(lh, i == 0, cr);
17889 ip_ipmp_cleanup(lh, i == 0, cr);
17891 (void) ldi_close(lh, FREAD|FWRITE, cr);
17894 ldi_ident_release(li);
17895 crfree(cr);
17896 }
17897
17898 /*
17899 * This needs to be kept in sync with the nic_event_t definition.
17900 */
17901 static const char *
17902 ill_hook_event2str(nic_event_t event)
17904 switch (event) {
17905 case NE_PLUMB:
17906 return ("PLUMB");
17907 case NE_UNPLUMB:
17908 return ("UNPLUMB");
17909 case NE_UP:
17910 return ("UP");
17911 case NE_DOWN:
17912 return ("DOWN");
17913 case NE_ADDRESS_CHANGE:
17914 return ("ADDRESS_CHANGE");
17915 case NE_LIF_UP:
17916 return ("LIF_UP");
17917 case NE_LIF_DOWN:
17918 return ("LIF_DOWN");
17919 case NE_IFINDEX_CHANGE:
17920 return ("IFINDEX_CHANGE");
17921 default:
17922 return ("UNKNOWN");
17926 void
17927 ill_nic_event_dispatch(ill_t *ill, lif_if_t lif, nic_event_t event,
17928 nic_event_data_t data, size_t datalen)
17930 ip_stack_t *ipst = ill->ill_ipst;
17931 hook_nic_event_int_t *info;
17932 const char *str = NULL;
17934 /* create a new nic event info */
17935 if ((info = kmem_alloc(sizeof (*info), KM_NOSLEEP)) == NULL)
17936 goto fail;
17938 info->hnei_event.hne_nic = ill->ill_phyint->phyint_ifindex;
17939 info->hnei_event.hne_lif = lif;
17940 info->hnei_event.hne_event = event;
17941 info->hnei_event.hne_protocol = ill->ill_isv6 ?
17942 ipst->ips_ipv6_net_data : ipst->ips_ipv4_net_data;
17943 info->hnei_event.hne_data = NULL;
17944 info->hnei_event.hne_datalen = 0;
17945 info->hnei_stackid = ipst->ips_netstack->netstack_stackid;
17947 if (data != NULL && datalen != 0) {
17948 info->hnei_event.hne_data = kmem_alloc(datalen, KM_NOSLEEP);
17949 if (info->hnei_event.hne_data == NULL)
17950 goto fail;
17951 bcopy(data, info->hnei_event.hne_data, datalen);
17952 info->hnei_event.hne_datalen = datalen;
17955 if (ddi_taskq_dispatch(eventq_queue_nic, ip_ne_queue_func, info,
17956 DDI_NOSLEEP) == DDI_SUCCESS)
17957 return;
17959 fail:
17960 if (info != NULL) {
17961 if (info->hnei_event.hne_data != NULL) {
17962 kmem_free(info->hnei_event.hne_data,
17963 info->hnei_event.hne_datalen);
17965 kmem_free(info, sizeof (*info)); /* must match the kmem_alloc'd size */
17967 str = ill_hook_event2str(event);
17968 ip2dbg(("ill_nic_event_dispatch: could not dispatch %s nic event "
17969 "information for %s (ENOMEM)\n", str, ill->ill_name));
17972 static int
17973 ipif_arp_up_done_tail(ipif_t *ipif, enum ip_resolver_action res_act)
17975 int err = 0;
17976 const in_addr_t *addr = NULL;
17977 nce_t *nce = NULL;
17978 ill_t *ill = ipif->ipif_ill;
17979 ill_t *bound_ill;
17980 boolean_t added_ipif = B_FALSE;
17981 uint16_t state;
17982 uint16_t flags;
17984 DTRACE_PROBE3(ipif__downup, char *, "ipif_arp_up_done_tail",
17985 ill_t *, ill, ipif_t *, ipif);
17986 if (ipif->ipif_lcl_addr != INADDR_ANY) {
17987 addr = &ipif->ipif_lcl_addr;
17990 if ((ipif->ipif_flags & IPIF_UNNUMBERED) || addr == NULL) {
17991 if (res_act != Res_act_initial)
17992 return (EINVAL);
17995 if (addr != NULL) {
17996 ipmp_illgrp_t *illg = ill->ill_grp;
17998 /* add unicast nce for the local addr */
18000 if (IS_IPMP(ill)) {
18002 * If we're here via ipif_up(), then the ipif
18003 * won't be bound yet -- add it to the group,
18004 * which will bind it if possible. (We would
18005 * add it in ipif_up(), but deleting on failure
18006 * there is gruesome.) If we're here via
18007 * ipmp_ill_bind_ipif(), then the ipif has
18008 * already been added to the group and we
18009 * just need to use the binding.
18011 if ((bound_ill = ipmp_ipif_bound_ill(ipif)) == NULL) {
18012 bound_ill = ipmp_illgrp_add_ipif(illg, ipif);
18013 if (bound_ill == NULL) {
18015 * We couldn't bind the ipif to an ill
18016 * yet, so we have nothing to publish.
18017 * Mark the address as ready and return.
18019 ipif->ipif_addr_ready = 1;
18020 return (0);
18022 added_ipif = B_TRUE;
18024 } else {
18025 bound_ill = ill;
18028 flags = (NCE_F_MYADDR | NCE_F_PUBLISH | NCE_F_AUTHORITY |
18029 NCE_F_NONUD);
18031 * If this is an initial bring-up (or the ipif was never
18032 * completely brought up), do DAD. Otherwise, we're here
18033 * because IPMP has rebound an address to this ill: send
18034 * unsolicited advertisements (ARP announcements) to
18035 * inform others.
18037 if (res_act == Res_act_initial || !ipif->ipif_addr_ready) {
18038 state = ND_UNCHANGED; /* compute in nce_add_common() */
18039 } else {
18040 state = ND_REACHABLE;
18041 flags |= NCE_F_UNSOL_ADV;
18044 retry:
18045 err = nce_lookup_then_add_v4(ill,
18046 bound_ill->ill_phys_addr, bound_ill->ill_phys_addr_length,
18047 addr, flags, state, &nce);
18050 * note that we may encounter EEXIST if we are moving
18051 * the nce as a result of a rebind operation.
18053 switch (err) {
18054 case 0:
18055 ipif->ipif_added_nce = 1;
18056 nce->nce_ipif_cnt++;
18057 break;
18058 case EEXIST:
18059 ip1dbg(("ipif_arp_up: NCE already exists for %s\n",
18060 ill->ill_name));
18061 if (!NCE_MYADDR(nce->nce_common)) {
18063 * A leftover nce from before this address
18064 * existed
18066 ncec_delete(nce->nce_common);
18067 nce_refrele(nce);
18068 nce = NULL;
18069 goto retry;
18071 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) {
18072 nce_refrele(nce);
18073 nce = NULL;
18074 ip1dbg(("ipif_arp_up: NCE already exists "
18075 "for %s:%u\n", ill->ill_name,
18076 ipif->ipif_id));
18077 goto arp_up_done;
18080 * Duplicate local addresses are permissible for
18081 * IPIF_POINTOPOINT interfaces which will get marked
18082 * IPIF_UNNUMBERED later in
18083 * ip_addr_availability_check().
18085 * The nce_ipif_cnt field tracks the number of
18086 * ipifs that have nce_addr as their local address.
18088 ipif->ipif_addr_ready = 1;
18089 ipif->ipif_added_nce = 1;
18090 nce->nce_ipif_cnt++;
18091 err = 0;
18092 break;
18093 default:
18094 ASSERT(nce == NULL);
18095 goto arp_up_done;
18097 if (arp_no_defense) {
18098 if ((ipif->ipif_flags & IPIF_UP) &&
18099 !ipif->ipif_addr_ready)
18100 ipif_up_notify(ipif);
18101 ipif->ipif_addr_ready = 1;
18103 } else {
18104 /* zero address. nothing to publish */
18105 ipif->ipif_addr_ready = 1;
18107 if (nce != NULL)
18108 nce_refrele(nce);
18109 arp_up_done:
18110 if (added_ipif && err != 0)
18111 ipmp_illgrp_del_ipif(ill->ill_grp, ipif);
18112 return (err);
18113 }
18114
18115 int
18116 ipif_arp_up(ipif_t *ipif, enum ip_resolver_action res_act, boolean_t was_dup)
18117 {
18118 int err = 0;
18119 ill_t *ill = ipif->ipif_ill;
18120 boolean_t first_interface, wait_for_dlpi = B_FALSE;
18122 DTRACE_PROBE3(ipif__downup, char *, "ipif_arp_up",
18123 ill_t *, ill, ipif_t *, ipif);
18124
18125 /*
18126 * We need to bring up ARP or set up multicast mapping only
18127 * when the first interface is coming UP.
18128 */
18129 first_interface = (ill->ill_ipif_up_count == 0 &&
18130 ill->ill_ipif_dup_count == 0 && !was_dup);
18132 if (res_act == Res_act_initial && first_interface) {
18134 * Send ATTACH + BIND
18136 err = arp_ll_up(ill);
18137 if (err != EINPROGRESS && err != 0)
18138 return (err);
18139
18140 /*
18141 * Add an NCE for the local address and start DAD.
18142 * We'll wait to hear that DAD has finished
18143 * before using the interface.
18144 */
18145 if (err == EINPROGRESS)
18146 wait_for_dlpi = B_TRUE;
18149 if (!wait_for_dlpi)
18150 (void) ipif_arp_up_done_tail(ipif, res_act);
18152 return (!wait_for_dlpi ? 0 : EINPROGRESS);
18156 * Finish processing of "arp_up" after all the DLPI message
18157 * exchanges have completed between arp and the driver.
18159 void
18160 arp_bringup_done(ill_t *ill, int err)
18162 mblk_t *mp1;
18163 ipif_t *ipif;
18164 conn_t *connp = NULL;
18165 ipsq_t *ipsq;
18166 queue_t *q;
18168 ip1dbg(("arp_bringup_done(%s)\n", ill->ill_name));
18170 ASSERT(IAM_WRITER_ILL(ill));
18172 ipsq = ill->ill_phyint->phyint_ipsq;
18173 ipif = ipsq->ipsq_xop->ipx_pending_ipif;
18174 mp1 = ipsq_pending_mp_get(ipsq, &connp);
18175 ASSERT(!((mp1 != NULL) ^ (ipif != NULL)));
18176 if (mp1 == NULL) /* bringup was aborted by the user */
18177 return;
18180 * If an IOCTL is waiting on this (ipsq_current_ioctl != 0), then we
18181 * must have an associated conn_t. Otherwise, we're bringing this
18182 * interface back up as part of handling an asynchronous event (e.g.,
18183 * physical address change).
18185 if (ipsq->ipsq_xop->ipx_current_ioctl != 0) {
18186 ASSERT(connp != NULL);
18187 q = CONNP_TO_WQ(connp);
18188 } else {
18189 ASSERT(connp == NULL);
18190 q = ill->ill_rq;
18192 if (err == 0) {
18193 if (ipif->ipif_isv6) {
18194 if ((err = ipif_up_done_v6(ipif)) != 0)
18195 ip0dbg(("arp_bringup_done: init failed\n"));
18196 } else {
18197 err = ipif_arp_up_done_tail(ipif, Res_act_initial);
18198 if (err != 0 ||
18199 (err = ipif_up_done(ipif)) != 0) {
18200 ip0dbg(("arp_bringup_done: "
18201 "init failed err %x\n", err));
18202 (void) ipif_arp_down(ipif);
18206 } else {
18207 ip0dbg(("arp_bringup_done: DL_BIND_REQ failed\n"));
18210 if ((err == 0) && (ill->ill_up_ipifs)) {
18211 err = ill_up_ipifs(ill, q, mp1);
18212 if (err == EINPROGRESS)
18213 return;
18217 * If we have a moved ipif to bring up, and everything has succeeded
18218 * to this point, bring it up on the IPMP ill. Otherwise, leave it
18219 * down -- the admin can try to bring it up by hand if need be.
18221 if (ill->ill_move_ipif != NULL) {
18222 ipif = ill->ill_move_ipif;
18223 ip1dbg(("bringing up ipif %p on ill %s\n", (void *)ipif,
18224 ipif->ipif_ill->ill_name));
18225 ill->ill_move_ipif = NULL;
18226 if (err == 0) {
18227 err = ipif_up(ipif, q, mp1);
18228 if (err == EINPROGRESS)
18229 return;
18234 * The operation must complete without EINPROGRESS since
18235 * ipsq_pending_mp_get() has removed the mblk from ipsq_pending_mp.
18236 * Otherwise, the operation will be stuck forever in the ipsq.
18238 ASSERT(err != EINPROGRESS);
18239 if (ipsq->ipsq_xop->ipx_current_ioctl != 0) {
18240 DTRACE_PROBE4(ipif__ioctl, char *, "arp_bringup_done finish",
18241 int, ipsq->ipsq_xop->ipx_current_ioctl,
18242 ill_t *, ill, ipif_t *, ipif);
18243 ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq);
18244 } else {
18245 ipsq_current_finish(ipsq);
18250 * Finish processing of arp replumb after all the DLPI message
18251 * exchanges have completed between arp and the driver.
18253 void
18254 arp_replumb_done(ill_t *ill, int err)
18256 mblk_t *mp1;
18257 ipif_t *ipif;
18258 conn_t *connp = NULL;
18259 ipsq_t *ipsq;
18260 queue_t *q;
18262 ASSERT(IAM_WRITER_ILL(ill));
18264 ipsq = ill->ill_phyint->phyint_ipsq;
18265 ipif = ipsq->ipsq_xop->ipx_pending_ipif;
18266 mp1 = ipsq_pending_mp_get(ipsq, &connp);
18267 ASSERT(!((mp1 != NULL) ^ (ipif != NULL)));
18268 if (mp1 == NULL) {
18269 ip0dbg(("arp_replumb_done: bringup aborted ioctl %x\n",
18270 ipsq->ipsq_xop->ipx_current_ioctl));
18271 /* bringup was aborted by the user */
18272 return;
18275 * If an IOCTL is waiting on this (ipsq_current_ioctl != 0), then we
18276 * must have an associated conn_t. Otherwise, we're bringing this
18277 * interface back up as part of handling an asynchronous event (e.g.,
18278 * physical address change).
18280 if (ipsq->ipsq_xop->ipx_current_ioctl != 0) {
18281 ASSERT(connp != NULL);
18282 q = CONNP_TO_WQ(connp);
18283 } else {
18284 ASSERT(connp == NULL);
18285 q = ill->ill_rq;
18287 if ((err == 0) && (ill->ill_up_ipifs)) {
18288 err = ill_up_ipifs(ill, q, mp1);
18289 if (err == EINPROGRESS)
18290 return;
18293 * The operation must complete without EINPROGRESS since
18294 * ipsq_pending_mp_get() has removed the mblk from ipsq_pending_mp.
18295 * Otherwise, the operation will be stuck forever in the ipsq.
18297 ASSERT(err != EINPROGRESS);
18298 if (ipsq->ipsq_xop->ipx_current_ioctl != 0) {
18299 DTRACE_PROBE4(ipif__ioctl, char *,
18300 "arp_replumb_done finish",
18301 int, ipsq->ipsq_xop->ipx_current_ioctl,
18302 ill_t *, ill, ipif_t *, ipif);
18303 ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq);
18304 } else {
18305 ipsq_current_finish(ipsq);
18309 void
18310 ipif_up_notify(ipif_t *ipif)
18312 ip_rts_ifmsg(ipif, RTSQ_DEFAULT);
18313 ip_rts_newaddrmsg(RTM_ADD, 0, ipif, RTSQ_DEFAULT);
18314 sctp_update_ipif(ipif, SCTP_IPIF_UP);
18315 ill_nic_event_dispatch(ipif->ipif_ill, MAP_IPIF_ID(ipif->ipif_id),
18316 NE_LIF_UP, NULL, 0);
18317 }
18318
18319 /*
18320 * The ILB ioctls use cv_wait (e.g., when deleting a rule or adding a server),
18321 * so this assumes the context is cv_wait'able. Hence it shouldn't be used on
18322 * TPI end points with STREAMS modules pushed above. This is assured by not
18323 * having the IPI_MODOK flag for the ioctl. And IP ensures the ILB ioctl
18324 * never ends up on an ipsq, otherwise we may end up processing the ioctl
18325 * while unwinding from the ipsq and that could be a thread from the bottom.
18326 */
18327 /* ARGSUSED */
18328 int
18329 ip_sioctl_ilb_cmd(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
18330 ip_ioctl_cmd_t *ipip, void *arg)
18331 {
18332 mblk_t *cmd_mp = mp->b_cont->b_cont;
18333 ilb_cmd_t command = *((ilb_cmd_t *)cmd_mp->b_rptr);
18334 int ret = 0;
18335 int i;
18336 size_t size;
18337 ip_stack_t *ipst;
18338 zoneid_t zoneid;
18339 ilb_stack_t *ilbs;
18341 ipst = CONNQ_TO_IPST(q);
18342 ilbs = ipst->ips_netstack->netstack_ilb;
18343 zoneid = Q_TO_CONN(q)->conn_zoneid;
18345 switch (command) {
18346 case ILB_CREATE_RULE: {
18347 ilb_rule_cmd_t *cmd = (ilb_rule_cmd_t *)cmd_mp->b_rptr;
18349 if (MBLKL(cmd_mp) != sizeof (ilb_rule_cmd_t)) {
18350 ret = EINVAL;
18351 break;
18354 ret = ilb_rule_add(ilbs, zoneid, cmd);
18355 break;
18357 case ILB_DESTROY_RULE:
18358 case ILB_ENABLE_RULE:
18359 case ILB_DISABLE_RULE: {
18360 ilb_name_cmd_t *cmd = (ilb_name_cmd_t *)cmd_mp->b_rptr;
18362 if (MBLKL(cmd_mp) != sizeof (ilb_name_cmd_t)) {
18363 ret = EINVAL;
18364 break;
18367 if (cmd->flags & ILB_RULE_ALLRULES) {
18368 if (command == ILB_DESTROY_RULE) {
18369 ilb_rule_del_all(ilbs, zoneid);
18370 break;
18371 } else if (command == ILB_ENABLE_RULE) {
18372 ilb_rule_enable_all(ilbs, zoneid);
18373 break;
18374 } else if (command == ILB_DISABLE_RULE) {
18375 ilb_rule_disable_all(ilbs, zoneid);
18376 break;
18378 } else {
18379 if (command == ILB_DESTROY_RULE) {
18380 ret = ilb_rule_del(ilbs, zoneid, cmd->name);
18381 } else if (command == ILB_ENABLE_RULE) {
18382 ret = ilb_rule_enable(ilbs, zoneid, cmd->name,
18383 NULL);
18384 } else if (command == ILB_DISABLE_RULE) {
18385 ret = ilb_rule_disable(ilbs, zoneid, cmd->name,
18386 NULL);
18389 break;
18391 case ILB_NUM_RULES: {
18392 ilb_num_rules_cmd_t *cmd;
18394 if (MBLKL(cmd_mp) != sizeof (ilb_num_rules_cmd_t)) {
18395 ret = EINVAL;
18396 break;
18398 cmd = (ilb_num_rules_cmd_t *)cmd_mp->b_rptr;
18399 ilb_get_num_rules(ilbs, zoneid, &(cmd->num));
18400 break;
18402 case ILB_RULE_NAMES: {
18403 ilb_rule_names_cmd_t *cmd;
18405 cmd = (ilb_rule_names_cmd_t *)cmd_mp->b_rptr;
18406 if (MBLKL(cmd_mp) < sizeof (ilb_rule_names_cmd_t) ||
18407 cmd->num_names == 0) {
18408 ret = EINVAL;
18409 break;
18411 size = cmd->num_names * ILB_RULE_NAMESZ;
18412 if (cmd_mp->b_rptr + offsetof(ilb_rule_names_cmd_t, buf) +
18413 size != cmd_mp->b_wptr) {
18414 ret = EINVAL;
18415 break;
18417 ilb_get_rulenames(ilbs, zoneid, &cmd->num_names, cmd->buf);
18418 break;
18420 case ILB_NUM_SERVERS: {
18421 ilb_num_servers_cmd_t *cmd;
18423 if (MBLKL(cmd_mp) != sizeof (ilb_num_servers_cmd_t)) {
18424 ret = EINVAL;
18425 break;
18427 cmd = (ilb_num_servers_cmd_t *)cmd_mp->b_rptr;
18428 ret = ilb_get_num_servers(ilbs, zoneid, cmd->name,
18429 &(cmd->num));
18430 break;
18432 case ILB_LIST_RULE: {
18433 ilb_rule_cmd_t *cmd = (ilb_rule_cmd_t *)cmd_mp->b_rptr;
18435 if (MBLKL(cmd_mp) != sizeof (ilb_rule_cmd_t)) {
18436 ret = EINVAL;
18437 break;
18439 ret = ilb_rule_list(ilbs, zoneid, cmd);
18440 break;
18442 case ILB_LIST_SERVERS: {
18443 ilb_servers_info_cmd_t *cmd;
18445 cmd = (ilb_servers_info_cmd_t *)cmd_mp->b_rptr;
18446 if (MBLKL(cmd_mp) < sizeof (ilb_servers_info_cmd_t) ||
18447 cmd->num_servers == 0) {
18448 ret = EINVAL;
18449 break;
18451 size = cmd->num_servers * sizeof (ilb_server_info_t);
18452 if (cmd_mp->b_rptr + offsetof(ilb_servers_info_cmd_t, servers) +
18453 size != cmd_mp->b_wptr) {
18454 ret = EINVAL;
18455 break;
18458 ret = ilb_get_servers(ilbs, zoneid, cmd->name, cmd->servers,
18459 &cmd->num_servers);
18460 break;
18462 case ILB_ADD_SERVERS: {
18463 ilb_servers_info_cmd_t *cmd;
18464 ilb_rule_t *rule;
18466 cmd = (ilb_servers_info_cmd_t *)cmd_mp->b_rptr;
18467 if (MBLKL(cmd_mp) < sizeof (ilb_servers_info_cmd_t)) {
18468 ret = EINVAL;
18469 break;
18471 size = cmd->num_servers * sizeof (ilb_server_info_t);
18472 if (cmd_mp->b_rptr + offsetof(ilb_servers_info_cmd_t, servers) +
18473 size != cmd_mp->b_wptr) {
18474 ret = EINVAL;
18475 break;
18477 rule = ilb_find_rule(ilbs, zoneid, cmd->name, &ret);
18478 if (rule == NULL) {
18479 ASSERT(ret != 0);
18480 break;
18482 for (i = 0; i < cmd->num_servers; i++) {
18483 ilb_server_info_t *s;
18485 s = &cmd->servers[i];
18486 s->err = ilb_server_add(ilbs, rule, s);
18488 ILB_RULE_REFRELE(rule);
18489 break;
18491 case ILB_DEL_SERVERS:
18492 case ILB_ENABLE_SERVERS:
18493 case ILB_DISABLE_SERVERS: {
18494 ilb_servers_cmd_t *cmd;
18495 ilb_rule_t *rule;
18496 int (*f)();
18498 cmd = (ilb_servers_cmd_t *)cmd_mp->b_rptr;
18499 if (MBLKL(cmd_mp) < sizeof (ilb_servers_cmd_t)) {
18500 ret = EINVAL;
18501 break;
18503 size = cmd->num_servers * sizeof (ilb_server_arg_t);
18504 if (cmd_mp->b_rptr + offsetof(ilb_servers_cmd_t, servers) +
18505 size != cmd_mp->b_wptr) {
18506 ret = EINVAL;
18507 break;
18510 if (command == ILB_DEL_SERVERS)
18511 f = ilb_server_del;
18512 else if (command == ILB_ENABLE_SERVERS)
18513 f = ilb_server_enable;
18514 else if (command == ILB_DISABLE_SERVERS)
18515 f = ilb_server_disable;
18517 rule = ilb_find_rule(ilbs, zoneid, cmd->name, &ret);
18518 if (rule == NULL) {
18519 ASSERT(ret != 0);
18520 break;
18523 for (i = 0; i < cmd->num_servers; i++) {
18524 ilb_server_arg_t *s;
18526 s = &cmd->servers[i];
18527 s->err = f(ilbs, zoneid, NULL, rule, &s->addr);
18529 ILB_RULE_REFRELE(rule);
18530 break;
18532 case ILB_LIST_NAT_TABLE: {
18533 ilb_list_nat_cmd_t *cmd;
18535 cmd = (ilb_list_nat_cmd_t *)cmd_mp->b_rptr;
18536 if (MBLKL(cmd_mp) < sizeof (ilb_list_nat_cmd_t)) {
18537 ret = EINVAL;
18538 break;
18540 size = cmd->num_nat * sizeof (ilb_nat_entry_t);
18541 if (cmd_mp->b_rptr + offsetof(ilb_list_nat_cmd_t, entries) +
18542 size != cmd_mp->b_wptr) {
18543 ret = EINVAL;
18544 break;
18547 ret = ilb_list_nat(ilbs, zoneid, cmd->entries, &cmd->num_nat,
18548 &cmd->flags);
18549 break;
18551 case ILB_LIST_STICKY_TABLE: {
18552 ilb_list_sticky_cmd_t *cmd;
18554 cmd = (ilb_list_sticky_cmd_t *)cmd_mp->b_rptr;
18555 if (MBLKL(cmd_mp) < sizeof (ilb_list_sticky_cmd_t)) {
18556 ret = EINVAL;
18557 break;
18559 size = cmd->num_sticky * sizeof (ilb_sticky_entry_t);
18560 if (cmd_mp->b_rptr + offsetof(ilb_list_sticky_cmd_t, entries) +
18561 size != cmd_mp->b_wptr) {
18562 ret = EINVAL;
18563 break;
18566 ret = ilb_list_sticky(ilbs, zoneid, cmd->entries,
18567 &cmd->num_sticky, &cmd->flags);
18568 break;
18570 default:
18571 ret = EINVAL;
18572 break;
18574 done:
18575 return (ret);
18576 }
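/*
 * For illustration, the variable-length validation pattern shared by the
 * list-style ILB commands above: the mblk must end exactly at the fixed
 * header plus the caller-declared array. `cmd_type_t', `array', `nelem'
 * and `elem_type_t' are generic placeholders, not real types.
 */
#if 0
	if (cmd_mp->b_rptr + offsetof(cmd_type_t, array) +
	    nelem * sizeof (elem_type_t) != cmd_mp->b_wptr) {
		ret = EINVAL;	/* reject short or oversized requests */
		break;
	}
#endif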
18578 /* Remove all cache entries for this logical interface */
18579 void
18580 ipif_nce_down(ipif_t *ipif)
18582 ill_t *ill = ipif->ipif_ill;
18583 nce_t *nce;
18585 DTRACE_PROBE3(ipif__downup, char *, "ipif_nce_down",
18586 ill_t *, ill, ipif_t *, ipif);
18587 if (ipif->ipif_added_nce) {
18588 if (ipif->ipif_isv6)
18589 nce = nce_lookup_v6(ill, &ipif->ipif_v6lcl_addr);
18590 else
18591 nce = nce_lookup_v4(ill, &ipif->ipif_lcl_addr);
18592 if (nce != NULL) {
18593 if (--nce->nce_ipif_cnt == 0)
18594 ncec_delete(nce->nce_common);
18595 ipif->ipif_added_nce = 0;
18596 nce_refrele(nce);
18597 } else {
18599 * nce may already be NULL because it was already
18600 * flushed, e.g., due to a call to nce_flush
18602 ipif->ipif_added_nce = 0;
18606 * Make IPMP aware of the deleted data address.
18608 if (IS_IPMP(ill))
18609 ipmp_illgrp_del_ipif(ill->ill_grp, ipif);
18612 * Remove all other nces dependent on this ill when the last ipif
18613 * is going away.
18615 if (ill->ill_ipif_up_count == 0) {
18616 ncec_walk(ill, ncec_delete_per_ill, ill, ill->ill_ipst);
18617 if (IS_UNDER_IPMP(ill))
18618 nce_flush(ill, B_TRUE);
18619 }
18620 }
18621
18622 /*
18623 * Find the first interface that uses usill for its source address.
18624 */
18625 ill_t *
18626 ill_lookup_usesrc(ill_t *usill)
18628 ip_stack_t *ipst = usill->ill_ipst;
18629 ill_t *ill;
18631 ASSERT(usill != NULL);
18633 /* ill_g_usesrc_lock protects ill_usesrc_grp_next */
18634 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER);
18635 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
18636 for (ill = usill->ill_usesrc_grp_next; ill != NULL && ill != usill;
18637 ill = ill->ill_usesrc_grp_next) {
18638 if (!IS_UNDER_IPMP(ill) && (ill->ill_flags & ILLF_MULTICAST) &&
18639 !ILL_IS_CONDEMNED(ill)) {
18640 ill_refhold(ill);
18641 break;
18644 rw_exit(&ipst->ips_ill_g_lock);
18645 rw_exit(&ipst->ips_ill_g_usesrc_lock);
18646 return (ill);
18647 }
18648
18649 /*
18650 * This comment applies to both ip_sioctl_get_ifhwaddr and
18651 * ip_sioctl_get_lifhwaddr, as the basic function of the two ioctls
18652 * is the same.
18653 *
18654 * The goal here is to find an IP interface that corresponds to the name
18655 * provided by the caller in the ifreq/lifreq structure held in the mblk_t
18656 * chain and to fill out a sockaddr/sockaddr_storage structure with the
18657 * mac address.
18658 *
18659 * The SIOCGIFHWADDR/SIOCGLIFHWADDR ioctl may return an error for a number
18660 * of different reasons:
18661 * ENXIO - the device name is not known to IP.
18662 * EADDRNOTAVAIL - the device has no hardware address. This is indicated
18663 * by ill_phys_addr not pointing to an actual address.
18664 * EPFNOSUPPORT - this indicates that the request is for a mac address
18665 * that will not fit in the data structure supplied (struct
18666 * sockaddr).
18667 */
18668
18669 /* ARGSUSED */
18670 int
18671 ip_sioctl_get_ifhwaddr(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
18672 ip_ioctl_cmd_t *ipip, void *if_req)
18673 {
18674 struct sockaddr *sock;
18675 struct ifreq *ifr;
18676 mblk_t *mp1;
18677 ill_t *ill;
18679 ASSERT(ipif != NULL);
18680 ill = ipif->ipif_ill;
18682 if (ill->ill_phys_addr == NULL) {
18683 return (EADDRNOTAVAIL);
18685 if (ill->ill_phys_addr_length > sizeof (sock->sa_data)) {
18686 return (EPFNOSUPPORT);
18689 ip1dbg(("ip_sioctl_get_ifhwaddr(%s)\n", ill->ill_name));
18691 /* Existence of mp1 has been checked in ip_wput_nondata */
18692 mp1 = mp->b_cont->b_cont;
18693 ifr = (struct ifreq *)mp1->b_rptr;
18695 sock = &ifr->ifr_addr;
18697 * The "family" field in the returned structure is set to a value
18698 * that represents the type of device to which the address belongs.
18699 * The value returned may differ to that on Linux but it will still
18700 * represent the correct symbol on Solaris.
18702 sock->sa_family = arp_hw_type(ill->ill_mactype);
18703 bcopy(ill->ill_phys_addr, &sock->sa_data, ill->ill_phys_addr_length);
18705 return (0);
18706 }
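/*
 * For illustration, a minimal userland consumer of SIOCGIFHWADDR; the
 * interface name is assumed, and headers and error handling are trimmed.
 */
#if 0
	struct ifreq ifr;
	int s = socket(AF_INET, SOCK_DGRAM, 0);

	bzero(&ifr, sizeof (ifr));
	(void) strlcpy(ifr.ifr_name, "net0", sizeof (ifr.ifr_name));
	if (ioctl(s, SIOCGIFHWADDR, &ifr) == 0) {
		/* ifr.ifr_addr.sa_family: the arp_hw_type() device mapping */
		/* ifr.ifr_addr.sa_data: the raw link-layer address bytes */
	}
#endif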
18707
18708 /*
18709 * The expectation of applications using SIOCGIFHWADDR is that data will
18710 * be returned in the sa_data field of the sockaddr structure. With
18711 * SIOCGLIFHWADDR, we're breaking new ground as there is no Linux
18712 * equivalent. In light of this, struct sockaddr_dl is used as it
18713 * offers more space for address storage in sdl_data.
18714 */
18715 /* ARGSUSED */
18716 int
18717 ip_sioctl_get_lifhwaddr(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
18718 ip_ioctl_cmd_t *ipip, void *if_req)
18719 {
18720 struct sockaddr_dl *sock;
18721 struct lifreq *lifr;
18722 mblk_t *mp1;
18723 ill_t *ill;
18725 ASSERT(ipif != NULL);
18726 ill = ipif->ipif_ill;
18728 if (ill->ill_phys_addr == NULL) {
18729 return (EADDRNOTAVAIL);
18731 if (ill->ill_phys_addr_length > sizeof (sock->sdl_data)) {
18732 return (EPFNOSUPPORT);
18735 ip1dbg(("ip_sioctl_get_lifhwaddr(%s)\n", ill->ill_name));
18737 /* Existence of mp1 has been checked in ip_wput_nondata */
18738 mp1 = mp->b_cont->b_cont;
18739 lifr = (struct lifreq *)mp1->b_rptr;
18740
18741 /*
18742 * sockaddr_dl is used here, paralleling the sockaddr_ll structure used
18743 * in responding to the same ioctl in sockpfp. sockaddr_ll contains
18744 * fields that are not required here because its purpose is different,
18745 * so the native sockaddr_dl is used instead.
18746 */
18747 lifr->lifr_type = ill->ill_type;
18748 sock = (struct sockaddr_dl *)&lifr->lifr_addr;
18749 sock->sdl_family = AF_LINK;
18750 sock->sdl_index = ill->ill_phyint->phyint_ifindex;
18751 sock->sdl_type = ill->ill_mactype;
18752 sock->sdl_nlen = 0;
18753 sock->sdl_slen = 0;
18754 sock->sdl_alen = ill->ill_phys_addr_length;
18755 bcopy(ill->ill_phys_addr, sock->sdl_data, ill->ill_phys_addr_length);
18757 return (0);
18758 }
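/*
 * For illustration, the SIOCGLIFHWADDR counterpart, which returns the
 * address in a sockaddr_dl; interface name assumed, headers and error
 * handling trimmed.
 */
#if 0
	struct lifreq lifr;
	struct sockaddr_dl *sdl;
	int s = socket(AF_INET, SOCK_DGRAM, 0);

	bzero(&lifr, sizeof (lifr));
	(void) strlcpy(lifr.lifr_name, "net0", sizeof (lifr.lifr_name));
	if (ioctl(s, SIOCGLIFHWADDR, &lifr) == 0) {
		sdl = (struct sockaddr_dl *)&lifr.lifr_addr;
		/* address bytes are sdl->sdl_data[0 .. sdl->sdl_alen - 1] */
	}
#endif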