6877528 flushing tunnel policy doesn't bump tunnel link MTU
[illumos-gate.git] / usr / src / uts / common / inet / iptun / iptun.c
blob1f2798872a9c4d217ae78681d87c192cd4c761e0
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
27 * iptun - IP Tunneling Driver
29 * This module is a GLDv3 driver that implements virtual datalinks over IP
30 * (a.k.a, IP tunneling). The datalinks are managed through a dld ioctl
31 * interface (see iptun_ctl.c), and registered with GLDv3 using
32 * mac_register(). It implements the logic for various forms of IP (IPv4 or
33 * IPv6) encapsulation within IP (IPv4 or IPv6) by interacting with the ip
34 * module below it. Each virtual IP tunnel datalink has a conn_t associated
35 * with it representing the "outer" IP connection.
37 * The module implements the following locking semantics:
39 * Lookups and deletions in iptun_hash are synchronized using iptun_hash_lock.
40 * See comments above iptun_hash_lock for details.
42 * No locks are ever held while calling up to GLDv3. The general architecture
43 * of GLDv3 requires this, as the mac perimeter (essentially a lock) for a
44 * given link will be held while making downcalls (iptun_m_*() callbacks).
45 * Because we need to hold locks while handling downcalls, holding these locks
46 * while issuing upcalls results in deadlock scenarios. See the block comment
47 * above iptun_task_cb() for details on how we safely issue upcalls without
48 * holding any locks.
50 * The contents of each iptun_t is protected by an iptun_mutex which is held
51 * in iptun_enter() (called by iptun_enter_by_linkid()), and exited in
52 * iptun_exit().
54 * See comments in iptun_delete() and iptun_free() for details on how the
55 * iptun_t is deleted safely.
58 #include <sys/types.h>
59 #include <sys/kmem.h>
60 #include <sys/errno.h>
61 #include <sys/modhash.h>
62 #include <sys/list.h>
63 #include <sys/strsun.h>
64 #include <sys/file.h>
65 #include <sys/systm.h>
66 #include <sys/tihdr.h>
67 #include <sys/param.h>
68 #include <sys/mac_provider.h>
69 #include <sys/mac_ipv4.h>
70 #include <sys/mac_ipv6.h>
71 #include <sys/mac_6to4.h>
72 #include <sys/tsol/tnet.h>
73 #include <sys/sunldi.h>
74 #include <netinet/in.h>
75 #include <netinet/ip6.h>
76 #include <inet/ip.h>
77 #include <inet/ip_ire.h>
78 #include <inet/ipsec_impl.h>
79 #include <sys/tsol/label.h>
80 #include <sys/tsol/tnet.h>
81 #include <inet/iptun.h>
82 #include "iptun_impl.h"
/* Do the tunnel type and address family match? */
#define	IPTUN_ADDR_MATCH(iptun_type, family)				\
	((iptun_type == IPTUN_TYPE_IPV4 && family == AF_INET) ||	\
	(iptun_type == IPTUN_TYPE_IPV6 && family == AF_INET6) ||	\
	(iptun_type == IPTUN_TYPE_6TO4 && family == AF_INET))

/* Convert a datalink_id_t into a mod_hash key. */
#define	IPTUN_HASH_KEY(key)	((mod_hash_key_t)(uintptr_t)(key))

/* MTU bounds enforced on the tunnel link (outer-header overhead excluded). */
#define	IPTUN_MIN_IPV4_MTU	576	/* ip.h still uses 68 (!) */
#define	IPTUN_MIN_IPV6_MTU	IPV6_MIN_MTU
#define	IPTUN_MAX_IPV4_MTU	(IP_MAXPACKET - sizeof (ipha_t))
#define	IPTUN_MAX_IPV6_MTU	(IP_MAXPACKET - sizeof (ip6_t) -	\
				    sizeof (iptun_encaplim_t))

/* Valid range for the outer-header hop limit / TTL property. */
#define	IPTUN_MIN_HOPLIMIT	1
#define	IPTUN_MAX_HOPLIMIT	UINT8_MAX

/* Valid range for the IPv6 tunnel encapsulation-limit property. */
#define	IPTUN_MIN_ENCAPLIMIT	0
#define	IPTUN_MAX_ENCAPLIMIT	UINT8_MAX

/* ipsec_req_t bits that indicate IPsec policy is actually requested. */
#define	IPTUN_IPSEC_REQ_MASK	(IPSEC_PREF_REQUIRED | IPSEC_PREF_NEVER)
/*
 * Template IPv6 destination-options header carrying a tunnel
 * encapsulation-limit option, copied into the outer-header template by
 * iptun_headergen() when an encapsulation limit is configured.
 * NOTE(review): the trailing option-length/pad fields were lost in this
 * copy of the file; values below follow the iptun_encaplim_t layout —
 * confirm against iptun_impl.h.
 */
static iptun_encaplim_t iptun_encaplim_init = {
	{ IPPROTO_NONE, 0 },
	IP6OPT_TUNNEL_LIMIT,
	1,
	IPTUN_DEFAULT_ENCAPLIMIT, /* filled in with actual value later */
	IP6OPT_PADN,
	1,
	0
};
/*
 * Table containing per-iptun-type information.
 * Since IPv6 can run over all of these we have the IPv6 min as the min MTU.
 * Fields: type, mac plugin ident, outer IP version, min MTU, max MTU,
 * whether the type uses a configured remote address (6to4 does not).
 */
static iptun_typeinfo_t	iptun_type_table[] = {
	{ IPTUN_TYPE_IPV4, MAC_PLUGIN_IDENT_IPV4, IPV4_VERSION,
	    IPTUN_MIN_IPV6_MTU, IPTUN_MAX_IPV4_MTU, B_TRUE },
	{ IPTUN_TYPE_IPV6, MAC_PLUGIN_IDENT_IPV6, IPV6_VERSION,
	    IPTUN_MIN_IPV6_MTU, IPTUN_MAX_IPV6_MTU, B_TRUE },
	{ IPTUN_TYPE_6TO4, MAC_PLUGIN_IDENT_6TO4, IPV4_VERSION,
	    IPTUN_MIN_IPV6_MTU, IPTUN_MAX_IPV4_MTU, B_FALSE },
	/* sentinel: iptun_gettypeinfo() returns this entry on no match */
	{ IPTUN_TYPE_UNKNOWN, NULL, 0, 0, 0, B_FALSE }
};
/*
 * iptun_hash is an iptun_t lookup table by link ID protected by
 * iptun_hash_lock.  While the hash table's integrity is maintained via
 * internal locking in the mod_hash_*() functions, we need additional locking
 * so that an iptun_t cannot be deleted after a hash lookup has returned an
 * iptun_t and before iptun_lock has been entered.  As such, we use
 * iptun_hash_lock when doing lookups and removals from iptun_hash.
 */
mod_hash_t	*iptun_hash;
static kmutex_t	iptun_hash_lock;

static uint_t	iptun_tunnelcount;	/* total for all stacks */

/* kmem cache for iptun_t allocation, and taskq for deferred mac upcalls */
kmem_cache_t	*iptun_cache;
ddi_taskq_t	*iptun_taskq;
/*
 * Kinds of upcalls to the mac module that must be deferred to iptun_taskq
 * because they cannot be issued while holding iptun locks (see the block
 * comment above iptun_task_cb()).
 */
typedef enum {
	IPTUN_TASK_MTU_UPDATE,	/* tell mac about new tunnel link MTU */
	IPTUN_TASK_LADDR_UPDATE, /* tell mac about new local address */
	IPTUN_TASK_RADDR_UPDATE, /* tell mac about new remote address */
	IPTUN_TASK_LINK_UPDATE,	/* tell mac about new link state */
	IPTUN_TASK_PDATA_UPDATE	/* tell mac about updated plugin data */
} iptun_task_t;

/*
 * Argument block passed to iptun_task_cb(); identifies the tunnel by link
 * ID (not pointer) so a deleted tunnel is simply not found at run time.
 */
typedef struct iptun_task_data_s {
	iptun_task_t	itd_task;
	datalink_id_t	itd_linkid;
} iptun_task_data_t;
158 static void iptun_task_dispatch(iptun_t *, iptun_task_t);
159 static int iptun_enter(iptun_t *);
160 static void iptun_exit(iptun_t *);
161 static void iptun_headergen(iptun_t *, boolean_t);
162 static void iptun_drop_pkt(mblk_t *, uint64_t *);
163 static void iptun_input(void *, mblk_t *, void *, ip_recv_attr_t *);
164 static void iptun_input_icmp(void *, mblk_t *, void *, ip_recv_attr_t *);
165 static void iptun_output(iptun_t *, mblk_t *);
166 static uint32_t iptun_get_maxmtu(iptun_t *, ip_xmit_attr_t *, uint32_t);
167 static uint32_t iptun_update_mtu(iptun_t *, ip_xmit_attr_t *, uint32_t);
168 static uint32_t iptun_get_dst_pmtu(iptun_t *, ip_xmit_attr_t *);
169 static void iptun_update_dst_pmtu(iptun_t *, ip_xmit_attr_t *);
170 static int iptun_setladdr(iptun_t *, const struct sockaddr_storage *);
172 static void iptun_output_6to4(iptun_t *, mblk_t *);
173 static void iptun_output_common(iptun_t *, ip_xmit_attr_t *, mblk_t *);
174 static boolean_t iptun_verifyicmp(conn_t *, void *, icmph_t *, icmp6_t *,
175 ip_recv_attr_t *);
177 static void iptun_notify(void *, ip_xmit_attr_t *, ixa_notify_type_t,
178 ixa_notify_arg_t);
180 static mac_callbacks_t iptun_m_callbacks;
182 static int
183 iptun_m_getstat(void *arg, uint_t stat, uint64_t *val)
185 iptun_t *iptun = arg;
186 int err = 0;
188 switch (stat) {
189 case MAC_STAT_IERRORS:
190 *val = iptun->iptun_ierrors;
191 break;
192 case MAC_STAT_OERRORS:
193 *val = iptun->iptun_oerrors;
194 break;
195 case MAC_STAT_RBYTES:
196 *val = iptun->iptun_rbytes;
197 break;
198 case MAC_STAT_IPACKETS:
199 *val = iptun->iptun_ipackets;
200 break;
201 case MAC_STAT_OBYTES:
202 *val = iptun->iptun_obytes;
203 break;
204 case MAC_STAT_OPACKETS:
205 *val = iptun->iptun_opackets;
206 break;
207 case MAC_STAT_NORCVBUF:
208 *val = iptun->iptun_norcvbuf;
209 break;
210 case MAC_STAT_NOXMTBUF:
211 *val = iptun->iptun_noxmtbuf;
212 break;
213 default:
214 err = ENOTSUP;
217 return (err);
/*
 * GLDv3 mc_start(9E) entry point: mark the link as started and have the
 * taskq tell mac the resulting link state.  Fails only if the tunnel is
 * being deleted (iptun_enter() returns non-zero).
 */
static int
iptun_m_start(void *arg)
{
	iptun_t	*iptun = arg;
	int	err;

	if ((err = iptun_enter(iptun)) == 0) {
		iptun->iptun_flags |= IPTUN_MAC_STARTED;
		iptun_task_dispatch(iptun, IPTUN_TASK_LINK_UPDATE);
		iptun_exit(iptun);
	}
	return (err);
}
/*
 * GLDv3 mc_stop(9E) entry point: clear the started flag and have the taskq
 * propagate the new (down) link state to mac.  A failed iptun_enter() means
 * the tunnel is condemned and there is nothing to do.
 */
static void
iptun_m_stop(void *arg)
{
	iptun_t	*iptun = arg;

	if (iptun_enter(iptun) == 0) {
		iptun->iptun_flags &= ~IPTUN_MAC_STARTED;
		iptun_task_dispatch(iptun, IPTUN_TASK_LINK_UPDATE);
		iptun_exit(iptun);
	}
}
247 * iptun_m_setpromisc() does nothing and always succeeds. This is because a
248 * tunnel data-link only ever receives packets that are destined exclusively
249 * for the local address of the tunnel.
251 /* ARGSUSED */
252 static int
253 iptun_m_setpromisc(void *arg, boolean_t on)
255 return (0);
258 /* ARGSUSED */
259 static int
260 iptun_m_multicst(void *arg, boolean_t add, const uint8_t *addrp)
262 return (ENOTSUP);
/*
 * iptun_m_unicst() sets the local address.  The raw mac-layer address is
 * re-interpreted as an IPv4 or IPv6 address according to the tunnel type's
 * outer IP version, then applied via iptun_setladdr().
 */
/* ARGSUSED */
static int
iptun_m_unicst(void *arg, const uint8_t *addrp)
{
	iptun_t			*iptun = arg;
	int			err;
	struct sockaddr_storage	ss;
	struct sockaddr_in	*sin;
	struct sockaddr_in6	*sin6;

	if ((err = iptun_enter(iptun)) == 0) {
		switch (iptun->iptun_typeinfo->iti_ipvers) {
		case IPV4_VERSION:
			sin = (struct sockaddr_in *)&ss;
			sin->sin_family = AF_INET;
			bcopy(addrp, &sin->sin_addr, sizeof (in_addr_t));
			break;
		case IPV6_VERSION:
			sin6 = (struct sockaddr_in6 *)&ss;
			sin6->sin6_family = AF_INET6;
			bcopy(addrp, &sin6->sin6_addr, sizeof (in6_addr_t));
			break;
		default:
			ASSERT(0);
		}
		/* iptun_setaddr() only reads ss_family and the address */
		err = iptun_setladdr(iptun, &ss);
		iptun_exit(iptun);
	}
	return (err);
}
/*
 * GLDv3 mc_tx(9E) entry point.  Unchains the message list and transmits
 * each packet through the tunnel.  If the tunnel is not fully up, the whole
 * chain is counted against iptun_noxmtbuf and freed.  The chain is always
 * consumed (NULL is returned), so mac never needs to re-queue.
 */
static mblk_t *
iptun_m_tx(void *arg, mblk_t *mpchain)
{
	mblk_t	*mp, *nmp;
	iptun_t	*iptun = arg;

	if (!IS_IPTUN_RUNNING(iptun)) {
		iptun_drop_pkt(mpchain, &iptun->iptun_noxmtbuf);
		return (NULL);
	}

	for (mp = mpchain; mp != NULL; mp = nmp) {
		nmp = mp->b_next;
		mp->b_next = NULL;
		iptun_output(iptun, mp);
	}

	return (NULL);
}
/*
 * GLDv3 mc_setprop(9E) entry point: set the hop limit, encapsulation limit,
 * or MTU of the tunnel link.  Values that would regenerate the outer header
 * template go through iptun_headergen(); an MTU change is propagated to mac
 * via the taskq.
 */
/* ARGSUSED */
static int
iptun_m_setprop(void *barg, const char *pr_name, mac_prop_id_t pr_num,
    uint_t pr_valsize, const void *pr_val)
{
	iptun_t		*iptun = barg;
	uint32_t	value = *(uint32_t *)pr_val;
	int		err;

	/*
	 * We need to enter this iptun_t since we'll be modifying the outer
	 * header.
	 */
	if ((err = iptun_enter(iptun)) != 0)
		return (err);

	switch (pr_num) {
	case MAC_PROP_IPTUN_HOPLIMIT:
		if (value < IPTUN_MIN_HOPLIMIT || value > IPTUN_MAX_HOPLIMIT) {
			err = EINVAL;
			break;
		}
		if (value != iptun->iptun_hoplimit) {
			iptun->iptun_hoplimit = (uint8_t)value;
			iptun_headergen(iptun, B_TRUE);
		}
		break;
	case MAC_PROP_IPTUN_ENCAPLIMIT:
		/* Encapsulation limits exist only for IPv6-over-IPv6 links. */
		if (iptun->iptun_typeinfo->iti_type != IPTUN_TYPE_IPV6 ||
		    value > IPTUN_MAX_ENCAPLIMIT) {
			err = EINVAL;
			break;
		}
		if (value != iptun->iptun_encaplimit) {
			iptun->iptun_encaplimit = (uint8_t)value;
			iptun_headergen(iptun, B_TRUE);
		}
		break;
	case MAC_PROP_MTU: {
		uint32_t maxmtu = iptun_get_maxmtu(iptun, NULL, 0);

		if (value < iptun->iptun_typeinfo->iti_minmtu ||
		    value > maxmtu) {
			err = EINVAL;
			break;
		}
		/* An administratively set MTU stops tracking the path MTU. */
		iptun->iptun_flags |= IPTUN_FIXED_MTU;
		if (value != iptun->iptun_mtu) {
			iptun->iptun_mtu = value;
			iptun_task_dispatch(iptun, IPTUN_TASK_MTU_UPDATE);
		}
		break;
	}
	default:
		err = EINVAL;
	}
	iptun_exit(iptun);
	return (err);
}
/*
 * GLDv3 mc_getprop(9E) entry point: report the current, default, or
 * possible-range value of a tunnel link property.  MAC_PROP_DEFAULT and
 * MAC_PROP_POSSIBLE are mutually exclusive; range queries are answered with
 * a single-entry mac_propval_range_t copied out at the end.
 */
/* ARGSUSED */
static int
iptun_m_getprop(void *barg, const char *pr_name, mac_prop_id_t pr_num,
    uint_t pr_flags, uint_t pr_valsize, void *pr_val, uint_t *perm)
{
	iptun_t			*iptun = barg;
	mac_propval_range_t	range;
	boolean_t		is_default = (pr_flags & MAC_PROP_DEFAULT);
	boolean_t		is_possible = (pr_flags & MAC_PROP_POSSIBLE);
	int			err;

	if ((err = iptun_enter(iptun)) != 0)
		return (err);

	if ((pr_flags & ~(MAC_PROP_DEFAULT | MAC_PROP_POSSIBLE)) != 0) {
		err = ENOTSUP;
		goto done;
	}
	if (is_default && is_possible) {
		err = EINVAL;
		goto done;
	}

	*perm = MAC_PROP_PERM_RW;

	/* Validate the output buffer size up front for either query kind. */
	if (is_possible) {
		if (pr_valsize < sizeof (mac_propval_range_t)) {
			err = EINVAL;
			goto done;
		}
		range.mpr_count = 1;
		range.mpr_type = MAC_PROPVAL_UINT32;
	} else if (pr_valsize < sizeof (uint32_t)) {
		err = EINVAL;
		goto done;
	}

	switch (pr_num) {
	case MAC_PROP_IPTUN_HOPLIMIT:
		if (is_possible) {
			range.range_uint32[0].mpur_min = IPTUN_MIN_HOPLIMIT;
			range.range_uint32[0].mpur_max = IPTUN_MAX_HOPLIMIT;
		} else if (is_default) {
			*(uint32_t *)pr_val = IPTUN_DEFAULT_HOPLIMIT;
		} else {
			*(uint32_t *)pr_val = iptun->iptun_hoplimit;
		}
		break;
	case MAC_PROP_IPTUN_ENCAPLIMIT:
		/* Encapsulation limits exist only for IPv6-over-IPv6 links. */
		if (iptun->iptun_typeinfo->iti_type != IPTUN_TYPE_IPV6) {
			err = ENOTSUP;
			goto done;
		}
		if (is_possible) {
			range.range_uint32[0].mpur_min = IPTUN_MIN_ENCAPLIMIT;
			range.range_uint32[0].mpur_max = IPTUN_MAX_ENCAPLIMIT;
		} else if (is_default) {
			*(uint32_t *)pr_val = IPTUN_DEFAULT_ENCAPLIMIT;
		} else {
			*(uint32_t *)pr_val = iptun->iptun_encaplimit;
		}
		break;
	case MAC_PROP_MTU: {
		uint32_t maxmtu = iptun_get_maxmtu(iptun, NULL, 0);

		if (is_possible) {
			range.range_uint32[0].mpur_min =
			    iptun->iptun_typeinfo->iti_minmtu;
			range.range_uint32[0].mpur_max = maxmtu;
		} else {
			/*
			 * The MAC module knows the current value and should
			 * never call us for it.  There is also no default
			 * MTU, as by default, it is a dynamic property.
			 */
			err = ENOTSUP;
			goto done;
		}
		break;
	}
	default:
		err = EINVAL;
		goto done;
	}
	if (is_possible)
		bcopy(&range, pr_val, sizeof (range));
done:
	iptun_exit(iptun);
	return (err);
}
/*
 * Return the total number of tunnels across all netstacks (read without
 * locking; callers tolerate a momentarily stale value).
 */
uint_t
iptun_count(void)
{
	return (iptun_tunnelcount);
}
/*
 * Enter an iptun_t exclusively.  This is essentially just a mutex, but we
 * don't allow iptun_enter() to succeed on a tunnel if it's in the process of
 * being deleted.  On success, returns 0 with iptun_lock held; the caller
 * must release it with iptun_exit().  Returns ENOENT (lock dropped) if the
 * tunnel has been condemned.
 */
static int
iptun_enter(iptun_t *iptun)
{
	mutex_enter(&iptun->iptun_lock);
	/* Wait out an in-progress delete before deciding our fate. */
	while (iptun->iptun_flags & IPTUN_DELETE_PENDING)
		cv_wait(&iptun->iptun_enter_cv, &iptun->iptun_lock);
	if (iptun->iptun_flags & IPTUN_CONDEMNED) {
		mutex_exit(&iptun->iptun_lock);
		return (ENOENT);
	}
	return (0);
}
/*
 * Exit the tunnel entered in iptun_enter(): simply drop iptun_lock.
 */
static void
iptun_exit(iptun_t *iptun)
{
	mutex_exit(&iptun->iptun_lock);
}
/*
 * Enter the IP tunnel instance by datalink ID.  iptun_hash_lock is held
 * across the lookup and iptun_enter() so the iptun_t cannot be deleted in
 * between (see the comment above iptun_hash).  On failure, *iptun is set to
 * NULL and ENOENT (or iptun_enter()'s error) is returned.
 */
static int
iptun_enter_by_linkid(datalink_id_t linkid, iptun_t **iptun)
{
	int err;

	mutex_enter(&iptun_hash_lock);
	if (mod_hash_find(iptun_hash, IPTUN_HASH_KEY(linkid),
	    (mod_hash_val_t *)iptun) == 0)
		err = iptun_enter(*iptun);
	else
		err = ENOENT;
	if (err != 0)
		*iptun = NULL;
	mutex_exit(&iptun_hash_lock);
	return (err);
}
/*
 * Handle tasks that were deferred through the iptun_taskq because they require
 * calling up to the mac module, and we can't call up to the mac module while
 * holding locks.
 *
 * This is tricky to get right without introducing race conditions and
 * deadlocks with the mac module, as we cannot issue an upcall while in the
 * iptun_t.  The reason is that upcalls may try and enter the mac perimeter,
 * while iptun callbacks (such as iptun_m_setprop()) called from the mac
 * module will already have the perimeter held, and will then try and enter
 * the iptun_t.  You can see the lock ordering problem with this; this will
 * deadlock.
 *
 * The safe way to do this is to enter the iptun_t in question and copy the
 * information we need out of it so that we can exit it and know that the
 * information being passed up to the upcalls won't be subject to modification
 * by other threads.  The problem now is that we need to exit it prior to
 * issuing the upcall, but once we do this, a thread could come along and
 * delete the iptun_t and thus the mac handle required to issue the upcall.
 * To prevent this, we set the IPTUN_UPCALL_PENDING flag prior to exiting the
 * iptun_t.  This flag is the condition associated with iptun_upcall_cv, which
 * iptun_delete() will cv_wait() on.  When the upcall completes, we clear
 * IPTUN_UPCALL_PENDING and cv_signal() any potentially waiting
 * iptun_delete().  We can thus still safely use iptun->iptun_mh after having
 * exited the iptun_t.
 */
static void
iptun_task_cb(void *arg)
{
	iptun_task_data_t	*itd = arg;
	iptun_task_t		task = itd->itd_task;
	datalink_id_t		linkid = itd->itd_linkid;
	iptun_t			*iptun;
	uint32_t		mtu;
	iptun_addr_t		addr;
	link_state_t		linkstate;
	size_t			header_size;
	iptun_header_t		header;

	/* The task data was copied out above; free it before any early out. */
	kmem_free(itd, sizeof (*itd));

	/*
	 * Note that if the lookup fails, it's because the tunnel was deleted
	 * between the time the task was dispatched and now.  That isn't an
	 * error.
	 */
	if (iptun_enter_by_linkid(linkid, &iptun) != 0)
		return;

	iptun->iptun_flags |= IPTUN_UPCALL_PENDING;

	/* First pass: snapshot the state needed for the upcall. */
	switch (task) {
	case IPTUN_TASK_MTU_UPDATE:
		mtu = iptun->iptun_mtu;
		break;
	case IPTUN_TASK_LADDR_UPDATE:
		addr = iptun->iptun_laddr;
		break;
	case IPTUN_TASK_RADDR_UPDATE:
		addr = iptun->iptun_raddr;
		break;
	case IPTUN_TASK_LINK_UPDATE:
		linkstate = IS_IPTUN_RUNNING(iptun) ?
		    LINK_STATE_UP : LINK_STATE_DOWN;
		break;
	case IPTUN_TASK_PDATA_UPDATE:
		header_size = iptun->iptun_header_size;
		header = iptun->iptun_header;
		break;
	default:
		ASSERT(0);
	}

	iptun_exit(iptun);

	/* Second pass: issue the upcall without holding the iptun_t. */
	switch (task) {
	case IPTUN_TASK_MTU_UPDATE:
		(void) mac_maxsdu_update(iptun->iptun_mh, mtu);
		break;
	case IPTUN_TASK_LADDR_UPDATE:
		mac_unicst_update(iptun->iptun_mh, (uint8_t *)&addr.ia_addr);
		break;
	case IPTUN_TASK_RADDR_UPDATE:
		mac_dst_update(iptun->iptun_mh, (uint8_t *)&addr.ia_addr);
		break;
	case IPTUN_TASK_LINK_UPDATE:
		mac_link_update(iptun->iptun_mh, linkstate);
		break;
	case IPTUN_TASK_PDATA_UPDATE:
		if (mac_pdata_update(iptun->iptun_mh,
		    header_size == 0 ? NULL : &header, header_size) != 0)
			atomic_inc_64(&iptun->iptun_taskq_fail);
		break;
	}

	/* Allow a waiting iptun_delete() to proceed. */
	mutex_enter(&iptun->iptun_lock);
	iptun->iptun_flags &= ~IPTUN_UPCALL_PENDING;
	cv_signal(&iptun->iptun_upcall_cv);
	mutex_exit(&iptun->iptun_lock);
}
/*
 * Queue a deferred mac upcall for iptun_task_cb().  Callers hold the
 * iptun_t and cannot block, so allocation and dispatch are KM_NOSLEEP /
 * DDI_NOSLEEP; failures are only recorded in the iptun_taskq_fail counter.
 */
static void
iptun_task_dispatch(iptun_t *iptun, iptun_task_t iptun_task)
{
	iptun_task_data_t *itd;

	itd = kmem_alloc(sizeof (*itd), KM_NOSLEEP);
	if (itd == NULL) {
		atomic_inc_64(&iptun->iptun_taskq_fail);
		return;
	}
	itd->itd_task = iptun_task;
	itd->itd_linkid = iptun->iptun_linkid;
	/* ddi_taskq_dispatch() returns DDI_SUCCESS (zero) when queued */
	if (ddi_taskq_dispatch(iptun_taskq, iptun_task_cb, itd, DDI_NOSLEEP)) {
		atomic_inc_64(&iptun->iptun_taskq_fail);
		kmem_free(itd, sizeof (*itd));
	}
}
/*
 * Convert an iptun_addr_t to sockaddr_storage.  The output is zeroed first,
 * so unused fields (port, scope, padding) read as zero.
 */
static void
iptun_getaddr(iptun_addr_t *iptun_addr, struct sockaddr_storage *ss)
{
	struct sockaddr_in	*sin;
	struct sockaddr_in6	*sin6;

	bzero(ss, sizeof (*ss));
	switch (iptun_addr->ia_family) {
	case AF_INET:
		sin = (struct sockaddr_in *)ss;
		sin->sin_addr.s_addr = iptun_addr->ia_addr.iau_addr4;
		break;
	case AF_INET6:
		sin6 = (struct sockaddr_in6 *)ss;
		sin6->sin6_addr = iptun_addr->ia_addr.iau_addr6;
		break;
	default:
		ASSERT(0);
	}
	ss->ss_family = iptun_addr->ia_family;
}
/*
 * General purpose function to set an IP tunnel source or destination address.
 * The address family must agree with the tunnel type (IPTUN_ADDR_MATCH),
 * and addresses that can never be valid tunnel endpoints (wildcard,
 * broadcast, multicast, v4-mapped) are rejected with EADDRNOTAVAIL.
 */
static int
iptun_setaddr(iptun_type_t iptun_type, iptun_addr_t *iptun_addr,
    const struct sockaddr_storage *ss)
{
	if (!IPTUN_ADDR_MATCH(iptun_type, ss->ss_family))
		return (EINVAL);

	switch (ss->ss_family) {
	case AF_INET: {
		struct sockaddr_in *sin = (struct sockaddr_in *)ss;

		if ((sin->sin_addr.s_addr == INADDR_ANY) ||
		    (sin->sin_addr.s_addr == INADDR_BROADCAST) ||
		    CLASSD(sin->sin_addr.s_addr)) {
			return (EADDRNOTAVAIL);
		}
		iptun_addr->ia_addr.iau_addr4 = sin->sin_addr.s_addr;
		break;
	}
	case AF_INET6: {
		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)ss;

		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) ||
		    IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr) ||
		    IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
			return (EADDRNOTAVAIL);
		}
		iptun_addr->ia_addr.iau_addr6 = sin6->sin6_addr;
		break;
	}
	default:
		return (EAFNOSUPPORT);
	}
	iptun_addr->ia_family = ss->ss_family;
	return (0);
}
/* Set the tunnel's local (outer source) address; see iptun_setaddr(). */
static int
iptun_setladdr(iptun_t *iptun, const struct sockaddr_storage *laddr)
{
	return (iptun_setaddr(iptun->iptun_typeinfo->iti_type,
	    &iptun->iptun_laddr, laddr));
}
/*
 * Set the tunnel's remote (outer destination) address.  Tunnel types
 * without a remote address (6to4, per iptun_type_table) reject this.
 */
static int
iptun_setraddr(iptun_t *iptun, const struct sockaddr_storage *raddr)
{
	if (!(iptun->iptun_typeinfo->iti_hasraddr))
		return (EINVAL);
	return (iptun_setaddr(iptun->iptun_typeinfo->iti_type,
	    &iptun->iptun_raddr, raddr));
}
723 static boolean_t
724 iptun_canbind(iptun_t *iptun)
727 * A tunnel may bind when its source address has been set, and if its
728 * tunnel type requires one, also its destination address.
730 return ((iptun->iptun_flags & IPTUN_LADDR) &&
731 ((iptun->iptun_flags & IPTUN_RADDR) ||
732 !(iptun->iptun_typeinfo->iti_hasraddr)));
/*
 * Verify that the local address is valid, and insert in the fanout.
 * Sets up the conn_t's addresses and the exclusive ip_xmit_attr_t, inserts
 * the conn into the classifier, and finally re-derives the link MTU.
 * Returns 0 on success or an errno (ENOMEM, EADDRNOTAVAIL, or an
 * ip_attr_connect()/ipcl_conn_insert() error).
 */
static int
iptun_bind(iptun_t *iptun)
{
	conn_t		*connp = iptun->iptun_connp;
	int		error = 0;
	ip_xmit_attr_t	*ixa;
	iulp_t		uinfo;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	/* Get an exclusive ixa for this thread, and replace conn_ixa */
	ixa = conn_get_ixa(connp, B_TRUE);
	if (ixa == NULL)
		return (ENOMEM);
	ASSERT(ixa->ixa_refcnt >= 2);
	ASSERT(ixa == connp->conn_ixa);

	/* We create PMTU state including for 6to4 */
	ixa->ixa_flags |= IXAF_PMTU_DISCOVERY;

	ASSERT(iptun_canbind(iptun));

	mutex_enter(&connp->conn_lock);
	/*
	 * Note that conn_proto can't be set since the upper protocol
	 * can be both 41 and 4 when IPv6 and IPv4 are over the same tunnel.
	 * ipcl_iptun_classify doesn't use conn_proto.
	 */
	connp->conn_ipversion = iptun->iptun_typeinfo->iti_ipvers;

	switch (iptun->iptun_typeinfo->iti_type) {
	case IPTUN_TYPE_IPV4:
		IN6_IPADDR_TO_V4MAPPED(iptun->iptun_laddr4,
		    &connp->conn_laddr_v6);
		IN6_IPADDR_TO_V4MAPPED(iptun->iptun_raddr4,
		    &connp->conn_faddr_v6);
		ixa->ixa_flags |= IXAF_IS_IPV4;
		if (ip_laddr_verify_v4(iptun->iptun_laddr4, IPCL_ZONEID(connp),
		    ipst, B_FALSE) != IPVL_UNICAST_UP) {
			mutex_exit(&connp->conn_lock);
			error = EADDRNOTAVAIL;
			goto done;
		}
		break;
	case IPTUN_TYPE_IPV6:
		connp->conn_laddr_v6 = iptun->iptun_laddr6;
		connp->conn_faddr_v6 = iptun->iptun_raddr6;
		ixa->ixa_flags &= ~IXAF_IS_IPV4;
		/* We use a zero scopeid for now */
		if (ip_laddr_verify_v6(&iptun->iptun_laddr6, IPCL_ZONEID(connp),
		    ipst, B_FALSE, 0) != IPVL_UNICAST_UP) {
			mutex_exit(&connp->conn_lock);
			error = EADDRNOTAVAIL;
			goto done;
		}
		break;
	case IPTUN_TYPE_6TO4:
		IN6_IPADDR_TO_V4MAPPED(iptun->iptun_laddr4,
		    &connp->conn_laddr_v6);
		IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &connp->conn_faddr_v6);
		ixa->ixa_flags |= IXAF_IS_IPV4;
		mutex_exit(&connp->conn_lock);

		/* 6to4 has no remote address; a DOWN local address is ok. */
		switch (ip_laddr_verify_v4(iptun->iptun_laddr4,
		    IPCL_ZONEID(connp), ipst, B_FALSE)) {
		case IPVL_UNICAST_UP:
		case IPVL_UNICAST_DOWN:
			break;
		default:
			error = EADDRNOTAVAIL;
			goto done;
		}
		/* No destination to connect to; skip straight to insertion. */
		goto insert;
	}

	/* In case previous destination was multirt */
	ip_attr_newdst(ixa);

	/*
	 * When we set a tunnel's destination address, we do not
	 * care if the destination is reachable.  Transient routing
	 * issues should not inhibit the creation of a tunnel
	 * interface, for example.  Thus we pass B_FALSE here.
	 */
	connp->conn_saddr_v6 = connp->conn_laddr_v6;
	mutex_exit(&connp->conn_lock);

	/* As long as the MTU is large we avoid fragmentation */
	ixa->ixa_flags |= IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF;

	/* We handle IPsec in iptun_output_common */
	error = ip_attr_connect(connp, ixa, &connp->conn_saddr_v6,
	    &connp->conn_faddr_v6, &connp->conn_faddr_v6, 0,
	    &connp->conn_saddr_v6, &uinfo, 0);

	if (error != 0)
		goto done;

	/* saddr shouldn't change since it was already set */
	ASSERT(IN6_ARE_ADDR_EQUAL(&connp->conn_laddr_v6,
	    &connp->conn_saddr_v6));

	/* We set IXAF_VERIFY_PMTU to catch PMTU increases */
	ixa->ixa_flags |= IXAF_VERIFY_PMTU;
	ASSERT(uinfo.iulp_mtu != 0);

	/*
	 * Allow setting new policies.
	 * The addresses/ports are already set, thus the IPsec policy calls
	 * can handle their passed-in conn's.
	 */
	connp->conn_policy_cached = B_FALSE;

insert:
	error = ipcl_conn_insert(connp);
	if (error != 0)
		goto done;

	/* Record this as the "last" send even though we haven't sent any */
	connp->conn_v6lastdst = connp->conn_faddr_v6;

	iptun->iptun_flags |= IPTUN_BOUND;

	/*
	 * Now that we're bound with ip below us, this is a good
	 * time to initialize the destination path MTU and to
	 * re-calculate the tunnel's link MTU.
	 */
	(void) iptun_update_mtu(iptun, ixa, 0);

	if (IS_IPTUN_RUNNING(iptun))
		iptun_task_dispatch(iptun, IPTUN_TASK_LINK_UPDATE);

done:
	ixa_refrele(ixa);
	return (error);
}
/*
 * Undo iptun_bind(): remove the conn from ip and clear IPTUN_BOUND.  A
 * condemned tunnel skips the link-state upcall since the mac handle is
 * going away.
 */
static void
iptun_unbind(iptun_t *iptun)
{
	ASSERT(iptun->iptun_flags & IPTUN_BOUND);
	ASSERT(mutex_owned(&iptun->iptun_lock) ||
	    (iptun->iptun_flags & IPTUN_CONDEMNED));
	ip_unbind(iptun->iptun_connp);
	iptun->iptun_flags &= ~IPTUN_BOUND;
	if (!(iptun->iptun_flags & IPTUN_CONDEMNED))
		iptun_task_dispatch(iptun, IPTUN_TASK_LINK_UPDATE);
}
/*
 * Re-generate the template data-link header for a given IP tunnel given the
 * tunnel's current parameters.  A header size of zero means "no custom
 * header": the defaults suffice.  When update_mac is set, the new template
 * is pushed to mac via a deferred IPTUN_TASK_PDATA_UPDATE.
 */
static void
iptun_headergen(iptun_t *iptun, boolean_t update_mac)
{
	switch (iptun->iptun_typeinfo->iti_ipvers) {
	case IPV4_VERSION:
		/*
		 * We only need to use a custom IP header if the administrator
		 * has supplied a non-default hoplimit.
		 */
		if (iptun->iptun_hoplimit == IPTUN_DEFAULT_HOPLIMIT) {
			iptun->iptun_header_size = 0;
			break;
		}
		iptun->iptun_header_size = sizeof (ipha_t);
		iptun->iptun_header4.ipha_version_and_hdr_length =
		    IP_SIMPLE_HDR_VERSION;
		iptun->iptun_header4.ipha_fragment_offset_and_flags =
		    htons(IPH_DF);
		iptun->iptun_header4.ipha_ttl = iptun->iptun_hoplimit;
		break;
	case IPV6_VERSION: {
		ip6_t	*ip6hp = &iptun->iptun_header6.it6h_ip6h;

		/*
		 * We only need to use a custom IPv6 header if either the
		 * administrator has supplied a non-default hoplimit, or we
		 * need to include an encapsulation limit option in the outer
		 * header.
		 */
		if (iptun->iptun_hoplimit == IPTUN_DEFAULT_HOPLIMIT &&
		    iptun->iptun_encaplimit == 0) {
			iptun->iptun_header_size = 0;
			break;
		}

		(void) memset(ip6hp, 0, sizeof (*ip6hp));
		if (iptun->iptun_encaplimit == 0) {
			iptun->iptun_header_size = sizeof (ip6_t);
			ip6hp->ip6_nxt = IPPROTO_NONE;
		} else {
			iptun_encaplim_t	*iel;

			iptun->iptun_header_size = sizeof (iptun_ipv6hdrs_t);
			/*
			 * The mac_ipv6 plugin requires ip6_plen to be in host
			 * byte order and reflect the extension headers
			 * present in the template.  The actual network byte
			 * order ip6_plen will be set on a per-packet basis on
			 * transmit.
			 */
			ip6hp->ip6_plen = sizeof (*iel);
			ip6hp->ip6_nxt = IPPROTO_DSTOPTS;
			iel = &iptun->iptun_header6.it6h_encaplim;
			*iel = iptun_encaplim_init;
			iel->iel_telopt.ip6ot_encap_limit =
			    iptun->iptun_encaplimit;
		}

		ip6hp->ip6_hlim = iptun->iptun_hoplimit;
		break;
	}
	}

	if (update_mac)
		iptun_task_dispatch(iptun, IPTUN_TASK_PDATA_UPDATE);
}
958 * Insert inbound and outbound IPv4 and IPv6 policy into the given policy
959 * head.
961 static boolean_t
962 iptun_insert_simple_policies(ipsec_policy_head_t *ph, ipsec_act_t *actp,
963 uint_t n, netstack_t *ns)
965 int f = IPSEC_AF_V4;
967 if (!ipsec_polhead_insert(ph, actp, n, f, IPSEC_TYPE_INBOUND, ns) ||
968 !ipsec_polhead_insert(ph, actp, n, f, IPSEC_TYPE_OUTBOUND, ns))
969 return (B_FALSE);
971 f = IPSEC_AF_V6;
972 return (ipsec_polhead_insert(ph, actp, n, f, IPSEC_TYPE_INBOUND, ns) &&
973 ipsec_polhead_insert(ph, actp, n, f, IPSEC_TYPE_OUTBOUND, ns));
977 * Used to set IPsec policy when policy is set through the IPTUN_CREATE or
978 * IPTUN_MODIFY ioctls.
980 static int
981 iptun_set_sec_simple(iptun_t *iptun, const ipsec_req_t *ipsr)
983 int rc = 0;
984 uint_t nact;
985 ipsec_act_t *actp = NULL;
986 boolean_t clear_all, old_policy = B_FALSE;
987 ipsec_tun_pol_t *itp;
988 char name[MAXLINKNAMELEN];
989 uint64_t gen;
990 netstack_t *ns = iptun->iptun_ns;
992 /* Can't specify self-encap on a tunnel. */
993 if (ipsr->ipsr_self_encap_req != 0)
994 return (EINVAL);
997 * If it's a "clear-all" entry, unset the security flags and resume
998 * normal cleartext (or inherit-from-global) policy.
1000 clear_all = ((ipsr->ipsr_ah_req & IPTUN_IPSEC_REQ_MASK) == 0 &&
1001 (ipsr->ipsr_esp_req & IPTUN_IPSEC_REQ_MASK) == 0);
1003 ASSERT(mutex_owned(&iptun->iptun_lock));
1004 itp = iptun->iptun_itp;
1005 if (itp == NULL) {
1006 if (clear_all)
1007 goto bail;
1008 if ((rc = dls_mgmt_get_linkinfo(iptun->iptun_linkid, name, NULL,
1009 NULL, NULL)) != 0)
1010 goto bail;
1011 ASSERT(name[0] != '\0');
1012 if ((itp = create_tunnel_policy(name, &rc, &gen, ns)) == NULL)
1013 goto bail;
1014 iptun->iptun_itp = itp;
1017 /* Allocate the actvec now, before holding itp or polhead locks. */
1018 ipsec_actvec_from_req(ipsr, &actp, &nact, ns);
1019 if (actp == NULL) {
1020 rc = ENOMEM;
1021 goto bail;
1025 * Just write on the active polhead. Save the primary/secondary stuff
1026 * for spdsock operations.
1028 * Mutex because we need to write to the polhead AND flags atomically.
1029 * Other threads will acquire the polhead lock as a reader if the
1030 * (unprotected) flag is set.
1032 mutex_enter(&itp->itp_lock);
1033 if (itp->itp_flags & ITPF_P_TUNNEL) {
1034 /* Oops, we lost a race. Let's get out of here. */
1035 rc = EBUSY;
1036 goto mutex_bail;
1038 old_policy = ((itp->itp_flags & ITPF_P_ACTIVE) != 0);
1040 if (old_policy) {
1041 ITPF_CLONE(itp->itp_flags);
1042 rc = ipsec_copy_polhead(itp->itp_policy, itp->itp_inactive, ns);
1043 if (rc != 0) {
1044 /* inactive has already been cleared. */
1045 itp->itp_flags &= ~ITPF_IFLAGS;
1046 goto mutex_bail;
1048 rw_enter(&itp->itp_policy->iph_lock, RW_WRITER);
1049 ipsec_polhead_flush(itp->itp_policy, ns);
1050 } else {
1051 /* Else assume itp->itp_policy is already flushed. */
1052 rw_enter(&itp->itp_policy->iph_lock, RW_WRITER);
1055 if (clear_all) {
1056 ASSERT(avl_numnodes(&itp->itp_policy->iph_rulebyid) == 0);
1057 itp->itp_flags &= ~ITPF_PFLAGS;
1058 rw_exit(&itp->itp_policy->iph_lock);
1059 old_policy = B_FALSE; /* Clear out the inactive one too. */
1060 goto recover_bail;
1063 if (iptun_insert_simple_policies(itp->itp_policy, actp, nact, ns)) {
1064 rw_exit(&itp->itp_policy->iph_lock);
1066 * Adjust MTU and make sure the DL side knows what's up.
1068 itp->itp_flags = ITPF_P_ACTIVE;
1069 (void) iptun_update_mtu(iptun, NULL, 0);
1070 old_policy = B_FALSE; /* Blank out inactive - we succeeded */
1071 } else {
1072 rw_exit(&itp->itp_policy->iph_lock);
1073 rc = ENOMEM;
1076 recover_bail:
1077 if (old_policy) {
1078 /* Recover policy in in active polhead. */
1079 ipsec_swap_policy(itp->itp_policy, itp->itp_inactive, ns);
1080 ITPF_SWAP(itp->itp_flags);
1083 /* Clear policy in inactive polhead. */
1084 itp->itp_flags &= ~ITPF_IFLAGS;
1085 rw_enter(&itp->itp_inactive->iph_lock, RW_WRITER);
1086 ipsec_polhead_flush(itp->itp_inactive, ns);
1087 rw_exit(&itp->itp_inactive->iph_lock);
1089 mutex_bail:
1090 mutex_exit(&itp->itp_lock);
1092 bail:
1093 if (actp != NULL)
1094 ipsec_actvec_free(actp, nact);
1096 return (rc);
1099 static iptun_typeinfo_t *
1100 iptun_gettypeinfo(iptun_type_t type)
1102 int i;
1104 for (i = 0; iptun_type_table[i].iti_type != IPTUN_TYPE_UNKNOWN; i++) {
1105 if (iptun_type_table[i].iti_type == type)
1106 break;
1108 return (&iptun_type_table[i]);
/*
 * Set the parameters included in ik on the tunnel iptun.  Parameters that can
 * only be set at creation time are set in iptun_create().
 *
 * On failure the original local/remote addresses and flags are restored so
 * the tunnel is left exactly as it was found.  Returns 0 or an errno value.
 */
static int
iptun_setparams(iptun_t *iptun, const iptun_kparams_t *ik)
{
	int		err = 0;
	netstack_t	*ns = iptun->iptun_ns;
	iptun_addr_t	orig_laddr, orig_raddr;
	uint_t		orig_flags = iptun->iptun_flags;

	if (ik->iptun_kparam_flags & IPTUN_KPARAM_LADDR) {
		/* Snapshot the old source so we can roll back on error. */
		if (orig_flags & IPTUN_LADDR)
			orig_laddr = iptun->iptun_laddr;
		if ((err = iptun_setladdr(iptun, &ik->iptun_kparam_laddr)) != 0)
			return (err);
		iptun->iptun_flags |= IPTUN_LADDR;
	}

	if (ik->iptun_kparam_flags & IPTUN_KPARAM_RADDR) {
		/* Snapshot the old destination so we can roll back on error. */
		if (orig_flags & IPTUN_RADDR)
			orig_raddr = iptun->iptun_raddr;
		if ((err = iptun_setraddr(iptun, &ik->iptun_kparam_raddr)) != 0)
			goto done;
		iptun->iptun_flags |= IPTUN_RADDR;
	}

	if (ik->iptun_kparam_flags & IPTUN_KPARAM_SECINFO) {
		/*
		 * Set IPsec policy originating from the ifconfig(1M) command
		 * line. This is traditionally called "simple" policy because
		 * the ipsec_req_t (iptun_kparam_secinfo) can only describe a
		 * simple policy of "do ESP on everything" and/or "do AH on
		 * everything" (as opposed to the rich policy that can be
		 * defined with ipsecconf(1M)).
		 */
		if (iptun->iptun_typeinfo->iti_type == IPTUN_TYPE_6TO4) {
			/*
			 * Can't set security properties for automatic
			 * tunnels.
			 */
			err = EINVAL;
			goto done;
		}

		if (!ipsec_loaded(ns->netstack_ipsec)) {
			/* If IPsec can be loaded, try and load it now. */
			if (ipsec_failed(ns->netstack_ipsec)) {
				err = EPROTONOSUPPORT;
				goto done;
			}
			ipsec_loader_loadnow(ns->netstack_ipsec);
			/*
			 * ipsec_loader_loadnow() returns while IPsec is
			 * loaded asynchronously. While a method exists to
			 * wait for IPsec to load (ipsec_loader_wait()), it
			 * requires use of a STREAMS queue to do a qwait().
			 * We're not in STREAMS context here, and so we can't
			 * use it. This is not a problem in practice because
			 * in the vast majority of cases, key management and
			 * global policy will have loaded before any tunnels
			 * are plumbed, and so IPsec will already have been
			 * loaded.
			 */
			err = EAGAIN;
			goto done;
		}

		err = iptun_set_sec_simple(iptun, &ik->iptun_kparam_secinfo);
		if (err == 0) {
			iptun->iptun_flags |= IPTUN_SIMPLE_POLICY;
			iptun->iptun_simple_policy = ik->iptun_kparam_secinfo;
		}
	}
done:
	if (err != 0) {
		/* Restore original source and destination. */
		if (ik->iptun_kparam_flags & IPTUN_KPARAM_LADDR &&
		    (orig_flags & IPTUN_LADDR))
			iptun->iptun_laddr = orig_laddr;
		if ((ik->iptun_kparam_flags & IPTUN_KPARAM_RADDR) &&
		    (orig_flags & IPTUN_RADDR))
			iptun->iptun_raddr = orig_raddr;
		iptun->iptun_flags = orig_flags;
	}
	return (err);
}
/*
 * Register this tunnel with the mac module (GLDv3).  On success the mac
 * handle is stored in iptun_mh and IPTUN_MAC_REGISTERED is set.  Returns 0
 * or an errno value from mac_register().
 */
static int
iptun_register(iptun_t *iptun)
{
	mac_register_t	*mac;
	int		err;

	ASSERT(!(iptun->iptun_flags & IPTUN_MAC_REGISTERED));

	if ((mac = mac_alloc(MAC_VERSION)) == NULL)
		return (EINVAL);

	mac->m_type_ident = iptun->iptun_typeinfo->iti_ident;
	mac->m_driver = iptun;
	mac->m_dip = iptun_dip;
	/* -1 tells mac to assign an instance number for us. */
	mac->m_instance = (uint_t)-1;
	mac->m_src_addr = (uint8_t *)&iptun->iptun_laddr.ia_addr;
	/* Automatic tunnel types have no fixed destination address. */
	mac->m_dst_addr = iptun->iptun_typeinfo->iti_hasraddr ?
	    (uint8_t *)&iptun->iptun_raddr.ia_addr : NULL;
	mac->m_callbacks = &iptun_m_callbacks;
	mac->m_min_sdu = iptun->iptun_typeinfo->iti_minmtu;
	mac->m_max_sdu = iptun->iptun_mtu;
	if (iptun->iptun_header_size != 0) {
		/* Export the pre-built encapsulation header as plugin data. */
		mac->m_pdata = &iptun->iptun_header;
		mac->m_pdata_size = iptun->iptun_header_size;
	}
	if ((err = mac_register(mac, &iptun->iptun_mh)) == 0)
		iptun->iptun_flags |= IPTUN_MAC_REGISTERED;
	mac_free(mac);
	return (err);
}
1231 static int
1232 iptun_unregister(iptun_t *iptun)
1234 int err;
1236 ASSERT(iptun->iptun_flags & IPTUN_MAC_REGISTERED);
1237 if ((err = mac_unregister(iptun->iptun_mh)) == 0)
1238 iptun->iptun_flags &= ~IPTUN_MAC_REGISTERED;
1239 return (err);
1242 static conn_t *
1243 iptun_conn_create(iptun_t *iptun, netstack_t *ns, cred_t *credp)
1245 conn_t *connp;
1247 if ((connp = ipcl_conn_create(IPCL_IPCCONN, KM_NOSLEEP, ns)) == NULL)
1248 return (NULL);
1250 connp->conn_flags |= IPCL_IPTUN;
1251 connp->conn_iptun = iptun;
1252 connp->conn_recv = iptun_input;
1253 connp->conn_recvicmp = iptun_input_icmp;
1254 connp->conn_verifyicmp = iptun_verifyicmp;
1257 * Register iptun_notify to listen to capability changes detected by IP.
1258 * This upcall is made in the context of the call to conn_ip_output.
1260 connp->conn_ixa->ixa_notify = iptun_notify;
1261 connp->conn_ixa->ixa_notify_cookie = iptun;
1264 * For exclusive stacks we set conn_zoneid to GLOBAL_ZONEID as is done
1265 * for all other conn_t's.
1267 * Note that there's an important distinction between iptun_zoneid and
1268 * conn_zoneid. The conn_zoneid is set to GLOBAL_ZONEID in non-global
1269 * exclusive stack zones to make the ip module believe that the
1270 * non-global zone is actually a global zone. Therefore, when
1271 * interacting with the ip module, we must always use conn_zoneid.
1273 connp->conn_zoneid = (ns->netstack_stackid == GLOBAL_NETSTACKID) ?
1274 crgetzoneid(credp) : GLOBAL_ZONEID;
1275 connp->conn_cred = credp;
1276 /* crfree() is done in ipcl_conn_destroy(), called by CONN_DEC_REF() */
1277 crhold(connp->conn_cred);
1278 connp->conn_cpid = NOPID;
1280 /* conn_allzones can not be set this early, hence no IPCL_ZONEID */
1281 connp->conn_ixa->ixa_zoneid = connp->conn_zoneid;
1282 ASSERT(connp->conn_ref == 1);
1284 /* Cache things in ixa without an extra refhold */
1285 connp->conn_ixa->ixa_cred = connp->conn_cred;
1286 connp->conn_ixa->ixa_cpid = connp->conn_cpid;
1287 if (is_system_labeled())
1288 connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred);
1291 * Have conn_ip_output drop packets should our outer source
1292 * go invalid
1294 connp->conn_ixa->ixa_flags |= IXAF_VERIFY_SOURCE;
1296 switch (iptun->iptun_typeinfo->iti_ipvers) {
1297 case IPV4_VERSION:
1298 connp->conn_family = AF_INET6;
1299 break;
1300 case IPV6_VERSION:
1301 connp->conn_family = AF_INET;
1302 break;
1304 mutex_enter(&connp->conn_lock);
1305 connp->conn_state_flags &= ~CONN_INCIPIENT;
1306 mutex_exit(&connp->conn_lock);
1307 return (connp);
/*
 * Quiesce and release the tunnel's conn_t.  ip_quiesce_conn() guarantees no
 * further upcalls from ip before we drop the last (and only) reference.
 */
static void
iptun_conn_destroy(conn_t *connp)
{
	ip_quiesce_conn(connp);
	connp->conn_iptun = NULL;
	ASSERT(connp->conn_ref == 1);
	CONN_DEC_REF(connp);
}
1319 static iptun_t *
1320 iptun_alloc(void)
1322 iptun_t *iptun;
1324 if ((iptun = kmem_cache_alloc(iptun_cache, KM_NOSLEEP)) != NULL) {
1325 bzero(iptun, sizeof (*iptun));
1326 atomic_inc_32(&iptun_tunnelcount);
1328 return (iptun);
/*
 * Tear down and free a condemned iptun_t.  Teardown order matters: hash and
 * per-stack list removal first (no new lookups), then unbind, then mac
 * unregister (no new mac downcalls), then conn destruction (no new ip
 * upcalls), and only then is the memory released.
 */
static void
iptun_free(iptun_t *iptun)
{
	ASSERT(iptun->iptun_flags & IPTUN_CONDEMNED);

	if (iptun->iptun_flags & IPTUN_HASH_INSERTED) {
		iptun_stack_t	*iptuns = iptun->iptun_iptuns;

		mutex_enter(&iptun_hash_lock);
		VERIFY(mod_hash_remove(iptun_hash,
		    IPTUN_HASH_KEY(iptun->iptun_linkid),
		    (mod_hash_val_t *)&iptun) == 0);
		mutex_exit(&iptun_hash_lock);
		iptun->iptun_flags &= ~IPTUN_HASH_INSERTED;
		mutex_enter(&iptuns->iptuns_lock);
		list_remove(&iptuns->iptuns_iptunlist, iptun);
		mutex_exit(&iptuns->iptuns_lock);
	}

	if (iptun->iptun_flags & IPTUN_BOUND)
		iptun_unbind(iptun);

	/*
	 * After iptun_unregister(), there will be no threads executing a
	 * downcall from the mac module, including in the tx datapath.
	 */
	if (iptun->iptun_flags & IPTUN_MAC_REGISTERED)
		VERIFY(iptun_unregister(iptun) == 0);

	if (iptun->iptun_itp != NULL) {
		/*
		 * Remove from the AVL tree, AND release the reference iptun_t
		 * itself holds on the ITP.
		 */
		itp_unlink(iptun->iptun_itp, iptun->iptun_ns);
		ITP_REFRELE(iptun->iptun_itp, iptun->iptun_ns);
		iptun->iptun_itp = NULL;
		iptun->iptun_flags &= ~IPTUN_SIMPLE_POLICY;
	}

	/*
	 * After ipcl_conn_destroy(), there will be no threads executing an
	 * upcall from ip (i.e., iptun_input()), and it is then safe to free
	 * the iptun_t.
	 */
	if (iptun->iptun_connp != NULL) {
		iptun_conn_destroy(iptun->iptun_connp);
		iptun->iptun_connp = NULL;
	}

	kmem_cache_free(iptun_cache, iptun);
	atomic_dec_32(&iptun_tunnelcount);
}
/*
 * Create a new IP tunnel datalink described by ik on behalf of credp.
 * Returns 0 on success or an errno value; on failure all partially
 * constructed state (devnet node, mac registration, conn_t, the iptun_t
 * itself) is rolled back.
 */
int
iptun_create(iptun_kparams_t *ik, cred_t *credp)
{
	iptun_t		*iptun = NULL;
	int		err = 0, mherr;
	char		linkname[MAXLINKNAMELEN];
	ipsec_tun_pol_t	*itp;
	netstack_t	*ns = NULL;
	iptun_stack_t	*iptuns;
	datalink_id_t	tmpid;
	zoneid_t	zoneid = crgetzoneid(credp);
	boolean_t	link_created = B_FALSE;

	/* The tunnel type is mandatory */
	if (!(ik->iptun_kparam_flags & IPTUN_KPARAM_TYPE))
		return (EINVAL);

	/*
	 * Is the linkid that the caller wishes to associate with this new
	 * tunnel assigned to this zone?
	 */
	if (zone_check_datalink(&zoneid, ik->iptun_kparam_linkid) != 0) {
		if (zoneid != GLOBAL_ZONEID)
			return (EINVAL);
	} else if (zoneid == GLOBAL_ZONEID) {
		return (EINVAL);
	}

	/*
	 * Make sure that we're not trying to create a tunnel that has already
	 * been created.
	 */
	if (iptun_enter_by_linkid(ik->iptun_kparam_linkid, &iptun) == 0) {
		iptun_exit(iptun);
		iptun = NULL;
		err = EEXIST;
		goto done;
	}

	ns = netstack_find_by_cred(credp);
	iptuns = ns->netstack_iptun;

	if ((iptun = iptun_alloc()) == NULL) {
		err = ENOMEM;
		goto done;
	}

	iptun->iptun_linkid = ik->iptun_kparam_linkid;
	iptun->iptun_zoneid = zoneid;
	iptun->iptun_ns = ns;

	iptun->iptun_typeinfo = iptun_gettypeinfo(ik->iptun_kparam_type);
	if (iptun->iptun_typeinfo->iti_type == IPTUN_TYPE_UNKNOWN) {
		err = EINVAL;
		goto done;
	}

	if (ik->iptun_kparam_flags & IPTUN_KPARAM_IMPLICIT)
		iptun->iptun_flags |= IPTUN_IMPLICIT;

	if ((err = iptun_setparams(iptun, ik)) != 0)
		goto done;

	iptun->iptun_hoplimit = IPTUN_DEFAULT_HOPLIMIT;
	/* Only IPv6-encapsulating tunnels carry an encapsulation limit. */
	if (iptun->iptun_typeinfo->iti_type == IPTUN_TYPE_IPV6)
		iptun->iptun_encaplimit = IPTUN_DEFAULT_ENCAPLIMIT;

	iptun_headergen(iptun, B_FALSE);

	iptun->iptun_connp = iptun_conn_create(iptun, ns, credp);
	if (iptun->iptun_connp == NULL) {
		err = ENOMEM;
		goto done;
	}

	/* Start with the largest possible MTU; refined once policy is known. */
	iptun->iptun_mtu = iptun->iptun_typeinfo->iti_maxmtu;
	iptun->iptun_dpmtu = iptun->iptun_mtu;

	/*
	 * Find an ITP based on linkname.  If we have parms already set via
	 * the iptun_setparams() call above, it may have created an ITP for
	 * us.  We always try get_tunnel_policy() for DEBUG correctness
	 * checks, and we may wish to refactor this to only check when
	 * iptun_itp is NULL.
	 */
	if ((err = dls_mgmt_get_linkinfo(iptun->iptun_linkid, linkname, NULL,
	    NULL, NULL)) != 0)
		goto done;
	if ((itp = get_tunnel_policy(linkname, ns)) != NULL)
		iptun->iptun_itp = itp;

	/*
	 * See if we have the necessary IP addresses assigned to this tunnel
	 * to try and bind them with ip underneath us.  If we're not ready to
	 * bind yet, then we'll defer the bind operation until the addresses
	 * are modified.
	 */
	if (iptun_canbind(iptun) && ((err = iptun_bind(iptun)) != 0))
		goto done;

	if ((err = iptun_register(iptun)) != 0)
		goto done;

	err = dls_devnet_create(iptun->iptun_mh, iptun->iptun_linkid,
	    iptun->iptun_zoneid);
	if (err != 0)
		goto done;
	link_created = B_TRUE;

	/*
	 * We hash by link-id as that is the key used by all other iptun
	 * interfaces (modify, delete, etc.).
	 */
	if ((mherr = mod_hash_insert(iptun_hash,
	    IPTUN_HASH_KEY(iptun->iptun_linkid), (mod_hash_val_t)iptun)) == 0) {
		mutex_enter(&iptuns->iptuns_lock);
		list_insert_head(&iptuns->iptuns_iptunlist, iptun);
		mutex_exit(&iptuns->iptuns_lock);
		iptun->iptun_flags |= IPTUN_HASH_INSERTED;
	} else if (mherr == MH_ERR_NOMEM) {
		err = ENOMEM;
	} else if (mherr == MH_ERR_DUPLICATE) {
		err = EEXIST;
	} else {
		err = EINVAL;
	}

done:
	/* On the error paths before iptun_alloc(), release the netstack. */
	if (iptun == NULL && ns != NULL)
		netstack_rele(ns);
	if (err != 0 && iptun != NULL) {
		if (link_created) {
			(void) dls_devnet_destroy(iptun->iptun_mh, &tmpid,
			    B_TRUE);
		}
		iptun->iptun_flags |= IPTUN_CONDEMNED;
		iptun_free(iptun);
	}
	return (err);
}
/*
 * Delete the tunnel datalink identified by linkid on behalf of credp.
 * Returns 0 on success or an errno value (EACCES for cross-zone attempts,
 * EBUSY from mac_disable() if the MAC is still referenced).
 */
int
iptun_delete(datalink_id_t linkid, cred_t *credp)
{
	int	err;
	iptun_t	*iptun = NULL;

	if ((err = iptun_enter_by_linkid(linkid, &iptun)) != 0)
		return (err);

	/* One cannot delete a tunnel that belongs to another zone. */
	if (iptun->iptun_zoneid != crgetzoneid(credp)) {
		iptun_exit(iptun);
		return (EACCES);
	}

	/*
	 * We need to exit iptun in order to issue calls up the stack such as
	 * dls_devnet_destroy().  If we call up while still in iptun, deadlock
	 * with calls coming down the stack is possible.  We prevent other
	 * threads from entering this iptun after we've exited it by setting
	 * the IPTUN_DELETE_PENDING flag.  This will cause callers of
	 * iptun_enter() to block waiting on iptun_enter_cv.  The assumption
	 * here is that the functions we're calling while IPTUN_DELETE_PENDING
	 * is set don't result in an iptun_enter() call, as that would result
	 * in deadlock.
	 */
	iptun->iptun_flags |= IPTUN_DELETE_PENDING;

	/* Wait for any pending upcall to the mac module to complete. */
	while (iptun->iptun_flags & IPTUN_UPCALL_PENDING)
		cv_wait(&iptun->iptun_upcall_cv, &iptun->iptun_lock);

	iptun_exit(iptun);

	if ((err = dls_devnet_destroy(iptun->iptun_mh, &linkid, B_TRUE)) == 0) {
		/*
		 * mac_disable() will fail with EBUSY if there are references
		 * to the iptun MAC.  If there are none, then mac_disable()
		 * will assure that none can be acquired until the MAC is
		 * unregistered.
		 *
		 * XXX CR 6791335 prevents us from calling mac_disable() prior
		 * to dls_devnet_destroy(), so we unfortunately need to
		 * attempt to re-create the devnet node if mac_disable()
		 * fails.
		 */
		if ((err = mac_disable(iptun->iptun_mh)) != 0) {
			(void) dls_devnet_create(iptun->iptun_mh, linkid,
			    iptun->iptun_zoneid);
		}
	}

	/*
	 * Now that we know the fate of this iptun_t, we need to clear
	 * IPTUN_DELETE_PENDING, and set IPTUN_CONDEMNED if the iptun_t is
	 * slated to be freed.  Either way, we need to signal the threads
	 * waiting in iptun_enter() so that they can either fail if
	 * IPTUN_CONDEMNED is set, or continue if it's not.
	 */
	mutex_enter(&iptun->iptun_lock);
	iptun->iptun_flags &= ~IPTUN_DELETE_PENDING;
	if (err == 0)
		iptun->iptun_flags |= IPTUN_CONDEMNED;
	cv_broadcast(&iptun->iptun_enter_cv);
	mutex_exit(&iptun->iptun_lock);

	/*
	 * Note that there is no danger in calling iptun_free() after having
	 * dropped the iptun_lock since callers of iptun_enter() at this point
	 * are doing so from iptun_enter_by_linkid() (mac_disable() got rid of
	 * threads entering from mac callbacks which call iptun_enter()
	 * directly) which holds iptun_hash_lock, and iptun_free() grabs this
	 * lock in order to remove the iptun_t from the hash table.
	 */
	if (err == 0)
		iptun_free(iptun);

	return (err);
}
/*
 * Modify an existing tunnel's parameters according to ik.  The tunnel type
 * itself cannot be changed.  Address changes trigger a rebind (when both
 * addresses are available) and asynchronous notification of the mac layer.
 * Returns 0 or an errno value.
 */
int
iptun_modify(const iptun_kparams_t *ik, cred_t *credp)
{
	iptun_t		*iptun;
	boolean_t	laddr_change = B_FALSE, raddr_change = B_FALSE;
	int		err;

	if ((err = iptun_enter_by_linkid(ik->iptun_kparam_linkid, &iptun)) != 0)
		return (err);

	/* One cannot modify a tunnel that belongs to another zone. */
	if (iptun->iptun_zoneid != crgetzoneid(credp)) {
		err = EACCES;
		goto done;
	}

	/* The tunnel type cannot be changed */
	if (ik->iptun_kparam_flags & IPTUN_KPARAM_TYPE) {
		err = EINVAL;
		goto done;
	}

	if ((err = iptun_setparams(iptun, ik)) != 0)
		goto done;
	/* Regenerate the encapsulation header with the new parameters. */
	iptun_headergen(iptun, B_FALSE);

	/*
	 * If any of the tunnel's addresses has been modified and the tunnel
	 * has the necessary addresses assigned to it, we need to try to bind
	 * with ip underneath us.  If we're not ready to bind yet, then we'll
	 * try again when the addresses are modified later.
	 */
	laddr_change = (ik->iptun_kparam_flags & IPTUN_KPARAM_LADDR);
	raddr_change = (ik->iptun_kparam_flags & IPTUN_KPARAM_RADDR);
	if (laddr_change || raddr_change) {
		if (iptun->iptun_flags & IPTUN_BOUND)
			iptun_unbind(iptun);
		if (iptun_canbind(iptun) && (err = iptun_bind(iptun)) != 0) {
			/* Bind failed: drop the flags for what was changed. */
			if (laddr_change)
				iptun->iptun_flags &= ~IPTUN_LADDR;
			if (raddr_change)
				iptun->iptun_flags &= ~IPTUN_RADDR;
			goto done;
		}
	}

	if (laddr_change)
		iptun_task_dispatch(iptun, IPTUN_TASK_LADDR_UPDATE);
	if (raddr_change)
		iptun_task_dispatch(iptun, IPTUN_TASK_RADDR_UPDATE);

done:
	iptun_exit(iptun);
	return (err);
}
/* Given an IP tunnel's datalink id, fill in its parameters. */
int
iptun_info(iptun_kparams_t *ik, cred_t *credp)
{
	iptun_t	*iptun;
	int	err;

	/* Is the tunnel link visible from the caller's zone? */
	if (!dls_devnet_islinkvisible(ik->iptun_kparam_linkid,
	    crgetzoneid(credp)))
		return (ENOENT);

	if ((err = iptun_enter_by_linkid(ik->iptun_kparam_linkid, &iptun)) != 0)
		return (err);

	/* ik is in/out: clear it, then report back only what is set. */
	bzero(ik, sizeof (iptun_kparams_t));

	ik->iptun_kparam_linkid = iptun->iptun_linkid;
	ik->iptun_kparam_type = iptun->iptun_typeinfo->iti_type;
	ik->iptun_kparam_flags |= IPTUN_KPARAM_TYPE;

	if (iptun->iptun_flags & IPTUN_LADDR) {
		iptun_getaddr(&iptun->iptun_laddr, &ik->iptun_kparam_laddr);
		ik->iptun_kparam_flags |= IPTUN_KPARAM_LADDR;
	}
	if (iptun->iptun_flags & IPTUN_RADDR) {
		iptun_getaddr(&iptun->iptun_raddr, &ik->iptun_kparam_raddr);
		ik->iptun_kparam_flags |= IPTUN_KPARAM_RADDR;
	}

	if (iptun->iptun_flags & IPTUN_IMPLICIT)
		ik->iptun_kparam_flags |= IPTUN_KPARAM_IMPLICIT;

	if (iptun->iptun_itp != NULL) {
		mutex_enter(&iptun->iptun_itp->itp_lock);
		if (iptun->iptun_itp->itp_flags & ITPF_P_ACTIVE) {
			ik->iptun_kparam_flags |= IPTUN_KPARAM_IPSECPOL;
			/* "Simple" policy is also reportable as secinfo. */
			if (iptun->iptun_flags & IPTUN_SIMPLE_POLICY) {
				ik->iptun_kparam_flags |= IPTUN_KPARAM_SECINFO;
				ik->iptun_kparam_secinfo =
				    iptun->iptun_simple_policy;
			}
		}
		mutex_exit(&iptun->iptun_itp->itp_lock);
	}

done:
	iptun_exit(iptun);
	return (err);
}
1714 iptun_set_6to4relay(netstack_t *ns, ipaddr_t relay_addr)
1716 if (relay_addr == INADDR_BROADCAST || CLASSD(relay_addr))
1717 return (EADDRNOTAVAIL);
1718 ns->netstack_iptun->iptuns_relay_rtr_addr = relay_addr;
1719 return (0);
/* Report this netstack's current 6to4 relay router address. */
void
iptun_get_6to4relay(netstack_t *ns, ipaddr_t *relay_addr)
{
	*relay_addr = ns->netstack_iptun->iptuns_relay_rtr_addr;
}
1728 void
1729 iptun_set_policy(datalink_id_t linkid, ipsec_tun_pol_t *itp)
1731 iptun_t *iptun;
1733 if (iptun_enter_by_linkid(linkid, &iptun) != 0)
1734 return;
1735 if (iptun->iptun_itp != itp) {
1736 ASSERT(iptun->iptun_itp == NULL);
1737 ITP_REFHOLD(itp);
1738 iptun->iptun_itp = itp;
1741 * IPsec policy means IPsec overhead, which means lower MTU.
1742 * Refresh the MTU for this tunnel.
1744 (void) iptun_update_mtu(iptun, NULL, 0);
1745 iptun_exit(iptun);
/*
 * Obtain the path MTU to the tunnel destination.
 * Can return zero in some cases (no remote address, no ixa available, or
 * nothing has been transmitted yet so no IRE is cached).
 */
static uint32_t
iptun_get_dst_pmtu(iptun_t *iptun, ip_xmit_attr_t *ixa)
{
	uint32_t	pmtu = 0;
	conn_t		*connp = iptun->iptun_connp;
	boolean_t	need_rele = B_FALSE;

	/*
	 * We only obtain the pmtu for tunnels that have a remote tunnel
	 * address.
	 */
	if (!(iptun->iptun_flags & IPTUN_RADDR))
		return (0);

	if (ixa == NULL) {
		ixa = conn_get_ixa(connp, B_FALSE);
		if (ixa == NULL)
			return (0);
		need_rele = B_TRUE;
	}
	/*
	 * Guard against ICMP errors before we have sent, as well as against
	 * a thread which held conn_ixa.
	 */
	if (ixa->ixa_ire != NULL) {
		pmtu = ip_get_pmtu(ixa);

		/*
		 * For both IPv4 and IPv6 we can have indication that the outer
		 * header needs fragmentation.
		 */
		if (ixa->ixa_flags & IXAF_PMTU_TOO_SMALL) {
			/* Must allow fragmentation in ip_output */
			ixa->ixa_flags &= ~IXAF_DONTFRAG;
		} else if (iptun->iptun_typeinfo->iti_type != IPTUN_TYPE_6TO4) {
			ixa->ixa_flags |= IXAF_DONTFRAG;
		} else {
			/* ip_get_pmtu might have set this - we don't want it */
			ixa->ixa_flags &= ~IXAF_PMTU_IPV4_DF;
		}
	}

	if (need_rele)
		ixa_refrele(ixa);
	return (pmtu);
}
/*
 * Update the ip_xmit_attr_t to capture the current lower path mtu as known
 * by ip.
 */
static void
iptun_update_dst_pmtu(iptun_t *iptun, ip_xmit_attr_t *ixa)
{
	uint32_t	pmtu;
	conn_t		*connp = iptun->iptun_connp;
	boolean_t	need_rele = B_FALSE;

	/* IXAF_VERIFY_PMTU is not set if we don't have a fixed destination */
	if (!(iptun->iptun_flags & IPTUN_RADDR))
		return;

	if (ixa == NULL) {
		ixa = conn_get_ixa(connp, B_FALSE);
		if (ixa == NULL)
			return;
		need_rele = B_TRUE;
	}
	/*
	 * Guard against ICMP errors before we have sent, as well as against
	 * a thread which held conn_ixa.
	 */
	if (ixa->ixa_ire != NULL) {
		pmtu = ip_get_pmtu(ixa);
		/*
		 * Update ixa_fragsize and ixa_pmtu.
		 */
		ixa->ixa_fragsize = ixa->ixa_pmtu = pmtu;

		/*
		 * For both IPv4 and IPv6 we can have indication that the outer
		 * header needs fragmentation.
		 */
		if (ixa->ixa_flags & IXAF_PMTU_TOO_SMALL) {
			/* Must allow fragmentation in ip_output */
			ixa->ixa_flags &= ~IXAF_DONTFRAG;
		} else if (iptun->iptun_typeinfo->iti_type != IPTUN_TYPE_6TO4) {
			ixa->ixa_flags |= IXAF_DONTFRAG;
		} else {
			/* ip_get_pmtu might have set this - we don't want it */
			ixa->ixa_flags &= ~IXAF_PMTU_IPV4_DF;
		}
	}

	if (need_rele)
		ixa_refrele(ixa);
}
/*
 * There is nothing that iptun can verify in addition to IP having
 * verified the IP addresses in the fanout, so always accept the ICMP
 * message as pertaining to this conn.
 */
/* ARGSUSED */
static boolean_t
iptun_verifyicmp(conn_t *connp, void *arg2, icmph_t *icmph, icmp6_t *icmp6,
    ip_recv_attr_t *ira)
{
	return (B_TRUE);
}
/*
 * Notify function registered with ip_xmit_attr_t.  Currently only path-MTU
 * changes (IXAN_PMTU) are of interest; narg carries the new path MTU.
 */
static void
iptun_notify(void *arg, ip_xmit_attr_t *ixa, ixa_notify_type_t ntype,
    ixa_notify_arg_t narg)
{
	iptun_t	*iptun = (iptun_t *)arg;

	switch (ntype) {
	case IXAN_PMTU:
		(void) iptun_update_mtu(iptun, ixa, narg);
		break;
	}
}
1879 * Returns the max of old_ovhd and the overhead associated with pol.
1881 static uint32_t
1882 iptun_max_policy_overhead(ipsec_policy_t *pol, uint32_t old_ovhd)
1884 uint32_t new_ovhd = old_ovhd;
1886 while (pol != NULL) {
1887 new_ovhd = max(new_ovhd,
1888 ipsec_act_ovhd(&pol->ipsp_act->ipa_act));
1889 pol = pol->ipsp_hash.hash_next;
1891 return (new_ovhd);
/*
 * Return the largest possible per-packet IPsec overhead (in bytes) for this
 * tunnel's outbound traffic.  If the tunnel has active per-tunnel policy,
 * scan all of its outbound rules; otherwise consult global system policy
 * (which requires both tunnel addresses to build a selector).
 */
static uint32_t
iptun_get_ipsec_overhead(iptun_t *iptun)
{
	ipsec_policy_root_t	*ipr;
	ipsec_policy_head_t	*iph;
	ipsec_policy_t		*pol;
	ipsec_selector_t	sel;
	int			i;
	uint32_t		ipsec_ovhd = 0;
	ipsec_tun_pol_t		*itp = iptun->iptun_itp;
	netstack_t		*ns = iptun->iptun_ns;

	if (itp == NULL || !(itp->itp_flags & ITPF_P_ACTIVE)) {
		/*
		 * Consult global policy, just in case. This will only work
		 * if we have both source and destination addresses to work
		 * with.
		 */
		if ((iptun->iptun_flags & (IPTUN_LADDR|IPTUN_RADDR)) !=
		    (IPTUN_LADDR|IPTUN_RADDR))
			return (0);

		iph = ipsec_system_policy(ns);
		bzero(&sel, sizeof (sel));
		sel.ips_isv4 =
		    (iptun->iptun_typeinfo->iti_ipvers == IPV4_VERSION);
		switch (iptun->iptun_typeinfo->iti_ipvers) {
		case IPV4_VERSION:
			sel.ips_local_addr_v4 = iptun->iptun_laddr4;
			sel.ips_remote_addr_v4 = iptun->iptun_raddr4;
			break;
		case IPV6_VERSION:
			sel.ips_local_addr_v6 = iptun->iptun_laddr6;
			sel.ips_remote_addr_v6 = iptun->iptun_raddr6;
			break;
		}
		/* Check for both IPv4 and IPv6. */
		sel.ips_protocol = IPPROTO_ENCAP;
		pol = ipsec_find_policy_head(NULL, iph, IPSEC_TYPE_OUTBOUND,
		    &sel);
		if (pol != NULL) {
			ipsec_ovhd = ipsec_act_ovhd(&pol->ipsp_act->ipa_act);
			IPPOL_REFRELE(pol);
		}
		sel.ips_protocol = IPPROTO_IPV6;
		pol = ipsec_find_policy_head(NULL, iph, IPSEC_TYPE_OUTBOUND,
		    &sel);
		if (pol != NULL) {
			/* Keep whichever inner protocol costs the most. */
			ipsec_ovhd = max(ipsec_ovhd,
			    ipsec_act_ovhd(&pol->ipsp_act->ipa_act));
			IPPOL_REFRELE(pol);
		}
		IPPH_REFRELE(iph, ns);
	} else {
		/*
		 * Look through all of the possible IPsec actions for the
		 * tunnel, and find the largest potential IPsec overhead.
		 */
		iph = itp->itp_policy;
		rw_enter(&iph->iph_lock, RW_READER);
		ipr = &(iph->iph_root[IPSEC_TYPE_OUTBOUND]);
		ipsec_ovhd = iptun_max_policy_overhead(
		    ipr->ipr_nonhash[IPSEC_AF_V4], 0);
		ipsec_ovhd = iptun_max_policy_overhead(
		    ipr->ipr_nonhash[IPSEC_AF_V6], ipsec_ovhd);
		for (i = 0; i < ipr->ipr_nchains; i++) {
			ipsec_ovhd = iptun_max_policy_overhead(
			    ipr->ipr_hash[i].hash_head, ipsec_ovhd);
		}
		rw_exit(&iph->iph_lock);
	}

	return (ipsec_ovhd);
}
/*
 * Calculate and return the maximum possible upper MTU for the given tunnel.
 *
 * If new_pmtu is set then we also need to update the lower path MTU information
 * in the ip_xmit_attr_t. That is needed since we set IXAF_VERIFY_PMTU so that
 * we are notified by conn_ip_output() when the path MTU increases.
 */
static uint32_t
iptun_get_maxmtu(iptun_t *iptun, ip_xmit_attr_t *ixa, uint32_t new_pmtu)
{
	size_t		header_size, ipsec_overhead;
	uint32_t	maxmtu, pmtu;

	/*
	 * Start with the path-MTU to the remote address, which is either
	 * provided as the new_pmtu argument, or obtained using
	 * iptun_get_dst_pmtu().
	 */
	if (new_pmtu != 0) {
		if (iptun->iptun_flags & IPTUN_RADDR)
			iptun->iptun_dpmtu = new_pmtu;
		pmtu = new_pmtu;
	} else if (iptun->iptun_flags & IPTUN_RADDR) {
		if ((pmtu = iptun_get_dst_pmtu(iptun, ixa)) == 0) {
			/*
			 * We weren't able to obtain the path-MTU of the
			 * destination. Use the previous value.
			 */
			pmtu = iptun->iptun_dpmtu;
		} else {
			iptun->iptun_dpmtu = pmtu;
		}
	} else {
		/*
		 * We have no path-MTU information to go on, use the maximum
		 * possible value.
		 */
		pmtu = iptun->iptun_typeinfo->iti_maxmtu;
	}

	/*
	 * Now calculate tunneling overhead and subtract that from the
	 * path-MTU information obtained above.
	 */
	if (iptun->iptun_header_size != 0) {
		header_size = iptun->iptun_header_size;
	} else {
		switch (iptun->iptun_typeinfo->iti_ipvers) {
		case IPV4_VERSION:
			header_size = sizeof (ipha_t);
			/* Labeled packets may carry a full CIPSO option. */
			if (is_system_labeled())
				header_size += IP_MAX_OPT_LENGTH;
			break;
		case IPV6_VERSION:
			header_size = sizeof (iptun_ipv6hdrs_t);
			break;
		}
	}

	ipsec_overhead = iptun_get_ipsec_overhead(iptun);

	/* Never report less than the tunnel type's minimum MTU. */
	maxmtu = pmtu - (header_size + ipsec_overhead);
	return (max(maxmtu, iptun->iptun_typeinfo->iti_minmtu));
}
/*
 * Re-calculate the tunnel's MTU as seen from above and notify the MAC layer
 * of any change in MTU. The new_pmtu argument is the new lower path MTU to
 * the tunnel destination to be used in the tunnel MTU calculation. Passing
 * in 0 for new_pmtu causes the lower path MTU to be dynamically updated using
 * ip_get_pmtu().
 *
 * If the calculated tunnel MTU is different than its previous value, then we
 * notify the MAC layer above us of this change using mac_maxsdu_update().
 */
static uint32_t
iptun_update_mtu(iptun_t *iptun, ip_xmit_attr_t *ixa, uint32_t new_pmtu)
{
	uint32_t newmtu;

	/* We always update the ixa since we might have set IXAF_VERIFY_PMTU */
	iptun_update_dst_pmtu(iptun, ixa);

	/*
	 * We return the current MTU without updating it if it was pegged to a
	 * static value using the MAC_PROP_MTU link property.
	 */
	if (iptun->iptun_flags & IPTUN_FIXED_MTU)
		return (iptun->iptun_mtu);

	/* If the MTU isn't fixed, then use the maximum possible value. */
	newmtu = iptun_get_maxmtu(iptun, ixa, new_pmtu);
	/*
	 * We only dynamically adjust the tunnel MTU for tunnels with
	 * destinations because dynamic MTU calculations are based on the
	 * destination path-MTU.
	 */
	if ((iptun->iptun_flags & IPTUN_RADDR) && newmtu != iptun->iptun_mtu) {
		iptun->iptun_mtu = newmtu;
		/* The MAC-layer notification must happen via taskq. */
		if (iptun->iptun_flags & IPTUN_MAC_REGISTERED)
			iptun_task_dispatch(iptun, IPTUN_TASK_MTU_UPDATE);
	}

	return (newmtu);
}
2076 * Frees a packet or packet chain and bumps stat for each freed packet.
2078 static void
2079 iptun_drop_pkt(mblk_t *mp, uint64_t *stat)
2081 mblk_t *pktmp;
2083 for (pktmp = mp; pktmp != NULL; pktmp = mp) {
2084 mp = mp->b_next;
2085 pktmp->b_next = NULL;
2086 if (stat != NULL)
2087 atomic_inc_64(stat);
2088 freemsg(pktmp);
2093 * Allocate and return a new mblk to hold an IP and ICMP header, and chain the
2094 * original packet to its b_cont. Returns NULL on failure.
2096 static mblk_t *
2097 iptun_build_icmperr(size_t hdrs_size, mblk_t *orig_pkt)
2099 mblk_t *icmperr_mp;
2101 if ((icmperr_mp = allocb(hdrs_size, BPRI_MED)) != NULL) {
2102 icmperr_mp->b_wptr += hdrs_size;
2103 /* tack on the offending packet */
2104 icmperr_mp->b_cont = orig_pkt;
2106 return (icmperr_mp);
/*
 * Transmit an ICMP error. mp->b_rptr points at the packet to be included in
 * the ICMP error.  On allocation failure the offending packet is dropped and
 * the tunnel's no-transmit-buffer stat is bumped.
 */
static void
iptun_sendicmp_v4(iptun_t *iptun, icmph_t *icmp, ipha_t *orig_ipha, mblk_t *mp,
    ts_label_t *tsl)
{
	size_t		orig_pktsize, hdrs_size;
	mblk_t		*icmperr_mp;
	ipha_t		*new_ipha;
	icmph_t		*new_icmp;
	ip_xmit_attr_t	ixas;
	conn_t		*connp = iptun->iptun_connp;

	orig_pktsize = msgdsize(mp);
	hdrs_size = sizeof (ipha_t) + sizeof (icmph_t);
	if ((icmperr_mp = iptun_build_icmperr(hdrs_size, mp)) == NULL) {
		iptun_drop_pkt(mp, &iptun->iptun_noxmtbuf);
		return;
	}

	new_ipha = (ipha_t *)icmperr_mp->b_rptr;
	new_icmp = (icmph_t *)(new_ipha + 1);

	new_ipha->ipha_version_and_hdr_length = IP_SIMPLE_HDR_VERSION;
	new_ipha->ipha_type_of_service = 0;
	new_ipha->ipha_ident = 0;
	new_ipha->ipha_fragment_offset_and_flags = 0;
	new_ipha->ipha_ttl = orig_ipha->ipha_ttl;
	new_ipha->ipha_protocol = IPPROTO_ICMP;
	/* The error is reflected back: swap source and destination. */
	new_ipha->ipha_src = orig_ipha->ipha_dst;
	new_ipha->ipha_dst = orig_ipha->ipha_src;
	new_ipha->ipha_hdr_checksum = 0; /* will be computed by ip */
	new_ipha->ipha_length = htons(hdrs_size + orig_pktsize);

	*new_icmp = *icmp;
	/* Zero the checksum field before computing over the whole message. */
	new_icmp->icmph_checksum = 0;
	new_icmp->icmph_checksum = IP_CSUM(icmperr_mp, sizeof (ipha_t), 0);

	bzero(&ixas, sizeof (ixas));
	ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
	/* Let ip pick a source address if the reflected source is unset. */
	if (new_ipha->ipha_src == INADDR_ANY)
		ixas.ixa_flags |= IXAF_SET_SOURCE;

	ixas.ixa_zoneid = IPCL_ZONEID(connp);
	ixas.ixa_ipst = connp->conn_netstack->netstack_ip;
	ixas.ixa_cred = connp->conn_cred;
	ixas.ixa_cpid = NOPID;
	if (is_system_labeled())
		ixas.ixa_tsl = tsl;

	ixas.ixa_ifindex = 0;
	ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;

	(void) ip_output_simple(icmperr_mp, &ixas);
	ixa_cleanup(&ixas);
}
/*
 * IPv6 counterpart of iptun_sendicmp_v4(): transmit an ICMPv6 error with the
 * offending packet appended.  On allocation failure the packet is dropped
 * and the tunnel's no-transmit-buffer stat is bumped.
 */
static void
iptun_sendicmp_v6(iptun_t *iptun, icmp6_t *icmp6, ip6_t *orig_ip6h, mblk_t *mp,
    ts_label_t *tsl)
{
	size_t		orig_pktsize, hdrs_size;
	mblk_t		*icmp6err_mp;
	ip6_t		*new_ip6h;
	icmp6_t		*new_icmp6;
	ip_xmit_attr_t	ixas;
	conn_t		*connp = iptun->iptun_connp;

	orig_pktsize = msgdsize(mp);
	hdrs_size = sizeof (ip6_t) + sizeof (icmp6_t);
	if ((icmp6err_mp = iptun_build_icmperr(hdrs_size, mp)) == NULL) {
		iptun_drop_pkt(mp, &iptun->iptun_noxmtbuf);
		return;
	}

	new_ip6h = (ip6_t *)icmp6err_mp->b_rptr;
	new_icmp6 = (icmp6_t *)(new_ip6h + 1);

	new_ip6h->ip6_vcf = orig_ip6h->ip6_vcf;
	new_ip6h->ip6_plen = htons(sizeof (icmp6_t) + orig_pktsize);
	new_ip6h->ip6_hops = orig_ip6h->ip6_hops;
	new_ip6h->ip6_nxt = IPPROTO_ICMPV6;
	/* The error is reflected back: swap source and destination. */
	new_ip6h->ip6_src = orig_ip6h->ip6_dst;
	new_ip6h->ip6_dst = orig_ip6h->ip6_src;

	*new_icmp6 = *icmp6;
	/* The checksum is calculated in ip_output_simple and friends. */
	new_icmp6->icmp6_cksum = new_ip6h->ip6_plen;

	bzero(&ixas, sizeof (ixas));
	ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6;
	/* Let ip pick a source address if the reflected source is unset. */
	if (IN6_IS_ADDR_UNSPECIFIED(&new_ip6h->ip6_src))
		ixas.ixa_flags |= IXAF_SET_SOURCE;

	ixas.ixa_zoneid = IPCL_ZONEID(connp);
	ixas.ixa_ipst = connp->conn_netstack->netstack_ip;
	ixas.ixa_cred = connp->conn_cred;
	ixas.ixa_cpid = NOPID;
	if (is_system_labeled())
		ixas.ixa_tsl = tsl;

	ixas.ixa_ifindex = 0;
	ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;

	(void) ip_output_simple(icmp6err_mp, &ixas);
	ixa_cleanup(&ixas);
}
2219 static void
2220 iptun_icmp_error_v4(iptun_t *iptun, ipha_t *orig_ipha, mblk_t *mp,
2221 uint8_t type, uint8_t code, ts_label_t *tsl)
2223 icmph_t icmp;
2225 bzero(&icmp, sizeof (icmp));
2226 icmp.icmph_type = type;
2227 icmp.icmph_code = code;
2229 iptun_sendicmp_v4(iptun, &icmp, orig_ipha, mp, tsl);
2232 static void
2233 iptun_icmp_fragneeded_v4(iptun_t *iptun, uint32_t newmtu, ipha_t *orig_ipha,
2234 mblk_t *mp, ts_label_t *tsl)
2236 icmph_t icmp;
2238 icmp.icmph_type = ICMP_DEST_UNREACHABLE;
2239 icmp.icmph_code = ICMP_FRAGMENTATION_NEEDED;
2240 icmp.icmph_du_zero = 0;
2241 icmp.icmph_du_mtu = htons(newmtu);
2243 iptun_sendicmp_v4(iptun, &icmp, orig_ipha, mp, tsl);
2246 static void
2247 iptun_icmp_error_v6(iptun_t *iptun, ip6_t *orig_ip6h, mblk_t *mp,
2248 uint8_t type, uint8_t code, uint32_t offset, ts_label_t *tsl)
2250 icmp6_t icmp6;
2252 bzero(&icmp6, sizeof (icmp6));
2253 icmp6.icmp6_type = type;
2254 icmp6.icmp6_code = code;
2255 if (type == ICMP6_PARAM_PROB)
2256 icmp6.icmp6_pptr = htonl(offset);
2258 iptun_sendicmp_v6(iptun, &icmp6, orig_ip6h, mp, tsl);
2261 static void
2262 iptun_icmp_toobig_v6(iptun_t *iptun, uint32_t newmtu, ip6_t *orig_ip6h,
2263 mblk_t *mp, ts_label_t *tsl)
2265 icmp6_t icmp6;
2267 icmp6.icmp6_type = ICMP6_PACKET_TOO_BIG;
2268 icmp6.icmp6_code = 0;
2269 icmp6.icmp6_mtu = htonl(newmtu);
2271 iptun_sendicmp_v6(iptun, &icmp6, orig_ip6h, mp, tsl);
2275 * Determines if the packet pointed to by ipha or ip6h is an ICMP error. The
2276 * mp argument is only used to do bounds checking.
2278 static boolean_t
2279 is_icmp_error(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h)
2281 uint16_t hlen;
2283 if (ipha != NULL) {
2284 icmph_t *icmph;
2286 ASSERT(ip6h == NULL);
2287 if (ipha->ipha_protocol != IPPROTO_ICMP)
2288 return (B_FALSE);
2290 hlen = IPH_HDR_LENGTH(ipha);
2291 icmph = (icmph_t *)((uint8_t *)ipha + hlen);
2292 return (ICMP_IS_ERROR(icmph->icmph_type) ||
2293 icmph->icmph_type == ICMP_REDIRECT);
2294 } else {
2295 icmp6_t *icmp6;
2296 uint8_t *nexthdrp;
2298 ASSERT(ip6h != NULL);
2299 if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &hlen, &nexthdrp) ||
2300 *nexthdrp != IPPROTO_ICMPV6) {
2301 return (B_FALSE);
2304 icmp6 = (icmp6_t *)((uint8_t *)ip6h + hlen);
2305 return (ICMP6_IS_ERROR(icmp6->icmp6_type) ||
2306 icmp6->icmp6_type == ND_REDIRECT);
/*
 * Find inner and outer IP headers from a tunneled packet as setup for calls
 * into ipsec_tun_{in,out}bound().
 * Note that we need to allow the outer header to be in a separate mblk from
 * the inner header.
 * If the caller knows the outer_hlen, the caller passes it in.  Otherwise
 * zero, and it is computed from the outer header itself.
 *
 * On success returns the outer header length and fills in exactly one of
 * *outer4/*outer6 and one of *inner4/*inner6 (the other of each pair is set
 * to NULL).  Returns 0 on any malformed/truncated packet; the out-pointers
 * may be partially written in that case, so callers must only trust them
 * when the return value is non-zero.
 */
static size_t
iptun_find_headers(mblk_t *mp, size_t outer_hlen, ipha_t **outer4,
    ipha_t **inner4, ip6_t **outer6, ip6_t **inner6)
{
	ipha_t *ipha;
	size_t first_mblkl = MBLKL(mp);
	mblk_t *inner_mp;

	/*
	 * Don't bother handling packets that don't have a full IP header in
	 * the first mblk.  For the input path, the ip module ensures that
	 * this won't happen, and on the output path, the IP tunneling
	 * MAC-type plugins ensure that this also won't happen.
	 */
	if (first_mblkl < sizeof (ipha_t))
		return (0);

	ipha = (ipha_t *)(mp->b_rptr);
	switch (IPH_HDR_VERSION(ipha)) {
	case IPV4_VERSION:
		*outer4 = ipha;
		*outer6 = NULL;
		if (outer_hlen == 0)
			outer_hlen = IPH_HDR_LENGTH(ipha);
		break;
	case IPV6_VERSION:
		*outer4 = NULL;
		*outer6 = (ip6_t *)ipha;
		if (outer_hlen == 0)
			outer_hlen = ip_hdr_length_v6(mp, (ip6_t *)ipha);
		break;
	default:
		return (0);
	}

	/*
	 * The outer header must fit entirely in the first mblk, and there
	 * must be inner-header bytes either after it or in a continuation.
	 */
	if (first_mblkl < outer_hlen ||
	    (first_mblkl == outer_hlen && mp->b_cont == NULL))
		return (0);

	/*
	 * We don't bother doing a pullup here since the outer header will
	 * just get stripped off soon on input anyway.  We just want to ensure
	 * that the inner* pointer points to a full header.
	 */
	if (first_mblkl == outer_hlen) {
		inner_mp = mp->b_cont;
		ipha = (ipha_t *)inner_mp->b_rptr;
	} else {
		inner_mp = mp;
		ipha = (ipha_t *)(mp->b_rptr + outer_hlen);
	}

	switch (IPH_HDR_VERSION(ipha)) {
	case IPV4_VERSION:
		if (inner_mp->b_wptr - (uint8_t *)ipha < sizeof (ipha_t))
			return (0);
		*inner4 = ipha;
		*inner6 = NULL;
		break;
	case IPV6_VERSION:
		if (inner_mp->b_wptr - (uint8_t *)ipha < sizeof (ip6_t))
			return (0);
		*inner4 = NULL;
		*inner6 = (ip6_t *)ipha;
		break;
	default:
		return (0);
	}

	return (outer_hlen);
}
2388 * Received ICMP error in response to an X over IPv4 packet that we
2389 * transmitted.
2391 * NOTE: "outer" refers to what's inside the ICMP payload. We will get one of
2392 * the following:
2394 * [IPv4(0)][ICMPv4][IPv4(1)][IPv4(2)][ULP]
2396 * or
2398 * [IPv4(0)][ICMPv4][IPv4(1)][IPv6][ULP]
2400 * And "outer4" will get set to IPv4(1), and inner[46] will correspond to
2401 * whatever the very-inner packet is (IPv4(2) or IPv6).
2403 static void
2404 iptun_input_icmp_v4(iptun_t *iptun, mblk_t *data_mp, icmph_t *icmph,
2405 ip_recv_attr_t *ira)
2407 uint8_t *orig;
2408 ipha_t *outer4, *inner4;
2409 ip6_t *outer6, *inner6;
2410 int outer_hlen;
2411 uint8_t type, code;
2413 ASSERT(data_mp->b_cont == NULL);
2415 * Temporarily move b_rptr forward so that iptun_find_headers() can
2416 * find headers in the ICMP packet payload.
2418 orig = data_mp->b_rptr;
2419 data_mp->b_rptr = (uint8_t *)(icmph + 1);
2421 * The ip module ensures that ICMP errors contain at least the
2422 * original IP header (otherwise, the error would never have made it
2423 * here).
2425 ASSERT(MBLKL(data_mp) >= 0);
2426 outer_hlen = iptun_find_headers(data_mp, 0, &outer4, &inner4, &outer6,
2427 &inner6);
2428 ASSERT(outer6 == NULL);
2429 data_mp->b_rptr = orig;
2430 if (outer_hlen == 0) {
2431 iptun_drop_pkt(data_mp, &iptun->iptun_ierrors);
2432 return;
2435 /* Only ICMP errors due to tunneled packets should reach here. */
2436 ASSERT(outer4->ipha_protocol == IPPROTO_ENCAP ||
2437 outer4->ipha_protocol == IPPROTO_IPV6);
2439 data_mp = ipsec_tun_inbound(ira, data_mp, iptun->iptun_itp,
2440 inner4, inner6, outer4, outer6, -outer_hlen, iptun->iptun_ns);
2441 if (data_mp == NULL) {
2442 /* Callee did all of the freeing. */
2443 atomic_inc_64(&iptun->iptun_ierrors);
2444 return;
2446 /* We should never see reassembled fragment here. */
2447 ASSERT(data_mp->b_next == NULL);
2449 data_mp->b_rptr = (uint8_t *)outer4 + outer_hlen;
2452 * If the original packet being transmitted was itself an ICMP error,
2453 * then drop this packet. We don't want to generate an ICMP error in
2454 * response to an ICMP error.
2456 if (is_icmp_error(data_mp, inner4, inner6)) {
2457 iptun_drop_pkt(data_mp, &iptun->iptun_norcvbuf);
2458 return;
2461 switch (icmph->icmph_type) {
2462 case ICMP_DEST_UNREACHABLE:
2463 type = (inner4 != NULL ? icmph->icmph_type : ICMP6_DST_UNREACH);
2464 switch (icmph->icmph_code) {
2465 case ICMP_FRAGMENTATION_NEEDED: {
2466 uint32_t newmtu;
2469 * We reconcile this with the fact that the tunnel may
2470 * also have IPsec policy by letting iptun_update_mtu
2471 * take care of it.
2473 newmtu = iptun_update_mtu(iptun, NULL,
2474 ntohs(icmph->icmph_du_mtu));
2476 if (inner4 != NULL) {
2477 iptun_icmp_fragneeded_v4(iptun, newmtu, inner4,
2478 data_mp, ira->ira_tsl);
2479 } else {
2480 iptun_icmp_toobig_v6(iptun, newmtu, inner6,
2481 data_mp, ira->ira_tsl);
2483 return;
2485 case ICMP_DEST_NET_UNREACH_ADMIN:
2486 case ICMP_DEST_HOST_UNREACH_ADMIN:
2487 code = (inner4 != NULL ? ICMP_DEST_NET_UNREACH_ADMIN :
2488 ICMP6_DST_UNREACH_ADMIN);
2489 break;
2490 default:
2491 code = (inner4 != NULL ? ICMP_HOST_UNREACHABLE :
2492 ICMP6_DST_UNREACH_ADDR);
2493 break;
2495 break;
2496 case ICMP_TIME_EXCEEDED:
2497 if (inner6 != NULL) {
2498 type = ICMP6_TIME_EXCEEDED;
2499 code = 0;
2500 } /* else we're already set. */
2501 break;
2502 case ICMP_PARAM_PROBLEM:
2504 * This is a problem with the outer header we transmitted.
2505 * Treat this as an output error.
2507 iptun_drop_pkt(data_mp, &iptun->iptun_oerrors);
2508 return;
2509 default:
2510 iptun_drop_pkt(data_mp, &iptun->iptun_norcvbuf);
2511 return;
2514 if (inner4 != NULL) {
2515 iptun_icmp_error_v4(iptun, inner4, data_mp, type, code,
2516 ira->ira_tsl);
2517 } else {
2518 iptun_icmp_error_v6(iptun, inner6, data_mp, type, code, 0,
2519 ira->ira_tsl);
/*
 * Return B_TRUE if the IPv6 packet pointed to by ip6h contains a Tunnel
 * Encapsulation Limit destination option (RFC 2473).  If there is one, set
 * encaplim_ptr to point to the one-byte option value inside the packet.
 */
static boolean_t
iptun_find_encaplimit(mblk_t *mp, ip6_t *ip6h, uint8_t **encaplim_ptr)
{
	ip_pkt_t	pkt;
	uint8_t		*endptr;
	ip6_dest_t	*destp;
	struct ip6_opt	*optp;

	pkt.ipp_fields = 0;	/* must be initialized */
	(void) ip_find_hdr_v6(mp, ip6h, B_FALSE, &pkt, NULL);
	/*
	 * The option may live in either a plain destination-options header
	 * or one that precedes a routing header.
	 */
	if ((pkt.ipp_fields & IPPF_DSTOPTS) != 0) {
		destp = pkt.ipp_dstopts;
	} else if ((pkt.ipp_fields & IPPF_RTHDRDSTOPTS) != 0) {
		destp = pkt.ipp_rthdrdstopts;
	} else {
		return (B_FALSE);
	}

	/* ip6d_len is in 8-byte units, excluding the first 8 bytes. */
	endptr = (uint8_t *)destp + 8 * (destp->ip6d_len + 1);
	optp = (struct ip6_opt *)(destp + 1);
	while (endptr - (uint8_t *)optp > sizeof (*optp)) {
		if (optp->ip6o_type == IP6OPT_TUNNEL_LIMIT) {
			/* The value byte must lie within the header. */
			if ((uint8_t *)(optp + 1) >= endptr)
				return (B_FALSE);
			*encaplim_ptr = (uint8_t *)&optp[1];
			return (B_TRUE);
		}
		/* Advance past this option: 2-byte type/len + len bytes. */
		optp = (struct ip6_opt *)((uint8_t *)optp + optp->ip6o_len + 2);
	}
	return (B_FALSE);
}
/*
 * Received ICMPv6 error in response to an X over IPv6 packet that we
 * transmitted.
 *
 * NOTE: "outer" refers to what's inside the ICMP payload.  We will get one of
 * the following:
 *
 * [IPv6(0)][ICMPv6][IPv6(1)][IPv4][ULP]
 *
 * or
 *
 * [IPv6(0)][ICMPv6][IPv6(1)][IPv6(2)][ULP]
 *
 * And "outer6" will get set to IPv6(1), and inner[46] will correspond to
 * whatever the very-inner packet is (IPv4 or IPv6(2)).
 *
 * Consumes data_mp in all paths.
 */
static void
iptun_input_icmp_v6(iptun_t *iptun, mblk_t *data_mp, icmp6_t *icmp6h,
    ip_recv_attr_t *ira)
{
	uint8_t	*orig;
	ipha_t	*outer4, *inner4;
	ip6_t	*outer6, *inner6;
	int	outer_hlen;
	uint8_t	type, code;

	ASSERT(data_mp->b_cont == NULL);

	/*
	 * Temporarily move b_rptr forward so that iptun_find_headers() can
	 * find IP headers in the ICMP packet payload.
	 */
	orig = data_mp->b_rptr;
	data_mp->b_rptr = (uint8_t *)(icmp6h + 1);
	/*
	 * The ip module ensures that ICMP errors contain at least the
	 * original IP header (otherwise, the error would never have made it
	 * here).
	 */
	ASSERT(MBLKL(data_mp) >= 0);
	outer_hlen = iptun_find_headers(data_mp, 0, &outer4, &inner4, &outer6,
	    &inner6);
	ASSERT(outer4 == NULL);
	data_mp->b_rptr = orig;	/* Restore r_ptr */
	if (outer_hlen == 0) {
		iptun_drop_pkt(data_mp, &iptun->iptun_ierrors);
		return;
	}

	data_mp = ipsec_tun_inbound(ira, data_mp, iptun->iptun_itp,
	    inner4, inner6, outer4, outer6, -outer_hlen, iptun->iptun_ns);
	if (data_mp == NULL) {
		/* Callee did all of the freeing. */
		atomic_inc_64(&iptun->iptun_ierrors);
		return;
	}
	/* We should never see reassembled fragment here. */
	ASSERT(data_mp->b_next == NULL);

	data_mp->b_rptr = (uint8_t *)outer6 + outer_hlen;

	/*
	 * If the original packet being transmitted was itself an ICMP error,
	 * then drop this packet.  We don't want to generate an ICMP error in
	 * response to an ICMP error.
	 */
	if (is_icmp_error(data_mp, inner4, inner6)) {
		iptun_drop_pkt(data_mp, &iptun->iptun_norcvbuf);
		return;
	}

	switch (icmp6h->icmp6_type) {
	case ICMP6_PARAM_PROB: {
		uint8_t *encaplim_ptr;

		/*
		 * If the ICMPv6 error points to a valid Tunnel Encapsulation
		 * Limit option and the limit value is 0, then fall through
		 * and send a host unreachable message.  Otherwise, treat the
		 * error as an output error, as there must have been a problem
		 * with a packet we sent.
		 */
		if (!iptun_find_encaplimit(data_mp, outer6, &encaplim_ptr) ||
		    (icmp6h->icmp6_pptr !=
		    ((ptrdiff_t)encaplim_ptr - (ptrdiff_t)outer6)) ||
		    *encaplim_ptr != 0) {
			iptun_drop_pkt(data_mp, &iptun->iptun_oerrors);
			return;
		}
		/* FALLTHRU */
	}
	case ICMP6_TIME_EXCEEDED:
	case ICMP6_DST_UNREACH:
		/* Relay the error inward, translating to ICMPv4 if needed. */
		type = (inner4 != NULL ? ICMP_DEST_UNREACHABLE :
		    ICMP6_DST_UNREACH);
		code = (inner4 != NULL ? ICMP_HOST_UNREACHABLE :
		    ICMP6_DST_UNREACH_ADDR);
		break;
	case ICMP6_PACKET_TOO_BIG: {
		uint32_t newmtu;

		/*
		 * We reconcile this with the fact that the tunnel may also
		 * have IPsec policy by letting iptun_update_mtu take care of
		 * it.
		 */
		newmtu = iptun_update_mtu(iptun, NULL,
		    ntohl(icmp6h->icmp6_mtu));

		if (inner4 != NULL) {
			iptun_icmp_fragneeded_v4(iptun, newmtu, inner4,
			    data_mp, ira->ira_tsl);
		} else {
			iptun_icmp_toobig_v6(iptun, newmtu, inner6, data_mp,
			    ira->ira_tsl);
		}
		return;
	}
	default:
		iptun_drop_pkt(data_mp, &iptun->iptun_norcvbuf);
		return;
	}

	if (inner4 != NULL) {
		iptun_icmp_error_v4(iptun, inner4, data_mp, type, code,
		    ira->ira_tsl);
	} else {
		iptun_icmp_error_v6(iptun, inner6, data_mp, type, code, 0,
		    ira->ira_tsl);
	}
}
/*
 * Called as conn_recvicmp from IP for ICMP errors.  Dispatches to the
 * v4/v6-specific handler based on the tunnel's outer IP version.  Consumes
 * mp.
 */
/* ARGSUSED2 */
static void
iptun_input_icmp(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
{
	conn_t	*connp = arg;
	iptun_t	*iptun = connp->conn_iptun;
	mblk_t	*tmpmp;
	size_t	hlen;

	ASSERT(IPCL_IS_IPTUN(connp));

	if (mp->b_cont != NULL) {
		/*
		 * Since ICMP error processing necessitates access to bits
		 * that are within the ICMP error payload (the original packet
		 * that caused the error), pull everything up into a single
		 * block for convenience.
		 */
		if ((tmpmp = msgpullup(mp, -1)) == NULL) {
			iptun_drop_pkt(mp, &iptun->iptun_norcvbuf);
			return;
		}
		freemsg(mp);
		mp = tmpmp;
	}

	hlen = ira->ira_ip_hdr_length;
	switch (iptun->iptun_typeinfo->iti_ipvers) {
	case IPV4_VERSION:
		/*
		 * The outer IP header coming up from IP is always ipha_t
		 * aligned (otherwise, we would have crashed in ip).
		 */
		iptun_input_icmp_v4(iptun, mp, (icmph_t *)(mp->b_rptr + hlen),
		    ira);
		break;
	case IPV6_VERSION:
		iptun_input_icmp_v6(iptun, mp, (icmp6_t *)(mp->b_rptr + hlen),
		    ira);
		break;
	}
}
2738 static boolean_t
2739 iptun_in_6to4_ok(iptun_t *iptun, ipha_t *outer4, ip6_t *inner6)
2741 ipaddr_t v4addr;
2744 * It's possible that someone sent us an IPv4-in-IPv4 packet with the
2745 * IPv4 address of a 6to4 tunnel as the destination.
2747 if (inner6 == NULL)
2748 return (B_FALSE);
2751 * Make sure that the IPv6 destination is within the site that this
2752 * 6to4 tunnel is routing for. We don't want people bouncing random
2753 * tunneled IPv6 packets through this 6to4 router.
2755 IN6_6TO4_TO_V4ADDR(&inner6->ip6_dst, (struct in_addr *)&v4addr);
2756 if (outer4->ipha_dst != v4addr)
2757 return (B_FALSE);
2759 if (IN6_IS_ADDR_6TO4(&inner6->ip6_src)) {
2761 * Section 9 of RFC 3056 (security considerations) suggests
2762 * that when a packet is from a 6to4 site (i.e., it's not a
2763 * global address being forwarded froma relay router), make
2764 * sure that the packet was tunneled by that site's 6to4
2765 * router.
2767 IN6_6TO4_TO_V4ADDR(&inner6->ip6_src, (struct in_addr *)&v4addr);
2768 if (outer4->ipha_src != v4addr)
2769 return (B_FALSE);
2770 } else {
2772 * Only accept packets from a relay router if we've configured
2773 * outbound relay router functionality.
2775 if (iptun->iptun_iptuns->iptuns_relay_rtr_addr == INADDR_ANY)
2776 return (B_FALSE);
2779 return (B_TRUE);
/*
 * Input function for everything that comes up from the ip module below us.
 * This is called directly from the ip module via connp->conn_recv().
 *
 * We receive M_DATA messages with IP-in-IP tunneled packets.  Strips the
 * outer encapsulation (and IPsec, if configured) and delivers the inner
 * packet(s) to GLDv3 via mac_rx().  Consumes data_mp.
 */
/* ARGSUSED2 */
static void
iptun_input(void *arg, mblk_t *data_mp, void *arg2, ip_recv_attr_t *ira)
{
	conn_t	*connp = arg;
	iptun_t	*iptun = connp->conn_iptun;
	int	outer_hlen;
	ipha_t	*outer4, *inner4;
	ip6_t	*outer6, *inner6;

	ASSERT(IPCL_IS_IPTUN(connp));
	ASSERT(DB_TYPE(data_mp) == M_DATA);

	outer_hlen = iptun_find_headers(data_mp, ira->ira_ip_hdr_length,
	    &outer4, &inner4, &outer6, &inner6);
	if (outer_hlen == 0)
		goto drop;

	/*
	 * If the system is labeled, we call tsol_check_dest() on the packet
	 * destination (our local tunnel address) to ensure that the packet as
	 * labeled should be allowed to be sent to us.  We don't need to call
	 * the more involved tsol_receive_local() since the tunnel link itself
	 * cannot be assigned to shared-stack non-global zones.
	 */
	if (ira->ira_flags & IRAF_SYSTEM_LABELED) {
		if (ira->ira_tsl == NULL)
			goto drop;
		if (tsol_check_dest(ira->ira_tsl, (outer4 != NULL ?
		    (void *)&outer4->ipha_dst : (void *)&outer6->ip6_dst),
		    (outer4 != NULL ? IPV4_VERSION : IPV6_VERSION),
		    CONN_MAC_DEFAULT, B_FALSE, NULL) != 0)
			goto drop;
	}

	data_mp = ipsec_tun_inbound(ira, data_mp, iptun->iptun_itp,
	    inner4, inner6, outer4, outer6, outer_hlen, iptun->iptun_ns);
	if (data_mp == NULL) {
		/* Callee did all of the freeing. */
		return;
	}

	/* 6to4 anti-spoofing checks (RFC 3056 section 9). */
	if (iptun->iptun_typeinfo->iti_type == IPTUN_TYPE_6TO4 &&
	    !iptun_in_6to4_ok(iptun, outer4, inner6))
		goto drop;

	/*
	 * We need to statistically account for each packet individually, so
	 * we might as well split up any b_next chains here.
	 */
	do {
		mblk_t	*mp;

		mp = data_mp->b_next;
		data_mp->b_next = NULL;

		atomic_inc_64(&iptun->iptun_ipackets);
		atomic_add_64(&iptun->iptun_rbytes, msgdsize(data_mp));
		mac_rx(iptun->iptun_mh, NULL, data_mp);

		data_mp = mp;
	} while (data_mp != NULL);

	return;
drop:
	iptun_drop_pkt(data_mp, &iptun->iptun_ierrors);
}
/*
 * Do 6to4-specific header-processing on output.  Return B_TRUE if the packet
 * was processed without issue, or B_FALSE if the packet had issues and should
 * be dropped.  On success, outer4->ipha_dst has been set from the inner IPv6
 * destination (or relay-router address).
 */
static boolean_t
iptun_out_process_6to4(iptun_t *iptun, ipha_t *outer4, ip6_t *inner6)
{
	ipaddr_t v4addr;

	/*
	 * IPv6 source must be a 6to4 address.  This is because a conscious
	 * decision was made to not allow a Solaris system to be used as a
	 * relay router (for security reasons) when 6to4 was initially
	 * integrated.  If this decision is ever reversed, the following check
	 * can be removed.
	 */
	if (!IN6_IS_ADDR_6TO4(&inner6->ip6_src))
		return (B_FALSE);

	/*
	 * RFC3056 mandates that the IPv4 source MUST be set to the IPv4
	 * portion of the 6to4 IPv6 source address.  In other words, make sure
	 * that we're tunneling packets from our own 6to4 site.
	 */
	IN6_6TO4_TO_V4ADDR(&inner6->ip6_src, (struct in_addr *)&v4addr);
	if (outer4->ipha_src != v4addr)
		return (B_FALSE);

	/*
	 * Automatically set the destination of the outer IPv4 header as
	 * described in RFC3056.  There are two possibilities:
	 *
	 * a. If the IPv6 destination is a 6to4 address, set the IPv4 address
	 *    to the IPv4 portion of the 6to4 address.
	 * b. If the IPv6 destination is a native IPv6 address, set the IPv4
	 *    destination to the address of a relay router.
	 *
	 * Design Note: b shouldn't be necessary here, and this is a flaw in
	 * the design of the 6to4relay command.  Instead of setting a 6to4
	 * relay address in this module via an ioctl, the 6to4relay command
	 * could simply add a IPv6 route for native IPv6 addresses (such as a
	 * default route) in the forwarding table that uses a 6to4 destination
	 * as its next hop, and the IPv4 portion of that address could be a
	 * 6to4 relay address.  In order for this to work, IP would have to
	 * resolve the next hop address, which would necessitate a link-layer
	 * address resolver for 6to4 links, which doesn't exist today.
	 *
	 * In fact, if a resolver existed for 6to4 links, then setting the
	 * IPv4 destination in the outer header could be done as part of
	 * link-layer address resolution and fast-path header generation, and
	 * not here.
	 */
	if (IN6_IS_ADDR_6TO4(&inner6->ip6_dst)) {
		/* destination is a 6to4 router */
		IN6_6TO4_TO_V4ADDR(&inner6->ip6_dst,
		    (struct in_addr *)&outer4->ipha_dst);

		/* Reject attempts to send to INADDR_ANY */
		if (outer4->ipha_dst == INADDR_ANY)
			return (B_FALSE);
	} else {
		/*
		 * The destination is a native IPv6 address.  If output to a
		 * relay-router is enabled, use the relay-router's IPv4
		 * address as the destination.
		 */
		if (iptun->iptun_iptuns->iptuns_relay_rtr_addr == INADDR_ANY)
			return (B_FALSE);
		outer4->ipha_dst = iptun->iptun_iptuns->iptuns_relay_rtr_addr;
	}

	/*
	 * If the outer source and destination are equal, this means that the
	 * 6to4 router somehow forwarded an IPv6 packet destined for its own
	 * 6to4 site to its 6to4 tunnel interface, which will result in this
	 * packet infinitely bouncing between ip and iptun.
	 */
	return (outer4->ipha_src != outer4->ipha_dst);
}
/*
 * Process output packets with outer IPv4 headers: copy the inner TOS, set up
 * the DF bit and fragmentation policy, and fill in the transmit attributes
 * (header length, packet length, protocol) and outer length field.  Returns
 * mp ready for transmission (this path does not fail).
 */
static mblk_t *
iptun_out_process_ipv4(iptun_t *iptun, mblk_t *mp, ipha_t *outer4,
    ipha_t *inner4, ip6_t *inner6, ip_xmit_attr_t *ixa)
{
	uint8_t	*innerptr = (inner4 != NULL ?
	    (uint8_t *)inner4 : (uint8_t *)inner6);
	size_t	minmtu = iptun->iptun_typeinfo->iti_minmtu;

	if (inner4 != NULL) {
		ASSERT(outer4->ipha_protocol == IPPROTO_ENCAP);
		/*
		 * Copy the tos from the inner IPv4 header.  We mask off ECN
		 * bits (bits 6 and 7, i.e. the two low-order bits of the TOS
		 * byte) because there is currently no tunnel-tunnel
		 * communication to determine if both sides support ECN.  We
		 * opt for the safe choice: don't copy the ECN bits when doing
		 * encapsulation.
		 */
		outer4->ipha_type_of_service =
		    inner4->ipha_type_of_service & ~0x03;
	} else {
		ASSERT(outer4->ipha_protocol == IPPROTO_IPV6 &&
		    inner6 != NULL);
	}
	/* Mirror the current path-MTU-discovery state into the DF bit. */
	if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF)
		outer4->ipha_fragment_offset_and_flags |= IPH_DF_HTONS;
	else
		outer4->ipha_fragment_offset_and_flags &= ~IPH_DF_HTONS;

	/*
	 * As described in section 3.2.2 of RFC4213, if the packet payload is
	 * less than or equal to the minimum MTU size, then we need to allow
	 * IPv4 to fragment the packet.  The reason is that even if we end up
	 * receiving an ICMP frag-needed, the interface above this tunnel
	 * won't be allowed to drop its MTU as a result, since the packet was
	 * already smaller than the smallest allowable MTU for that interface.
	 */
	if (mp->b_wptr - innerptr <= minmtu) {
		outer4->ipha_fragment_offset_and_flags = 0;
		ixa->ixa_flags &= ~IXAF_DONTFRAG;
	} else if (!(ixa->ixa_flags & IXAF_PMTU_TOO_SMALL) &&
	    (iptun->iptun_typeinfo->iti_type != IPTUN_TYPE_6TO4)) {
		ixa->ixa_flags |= IXAF_DONTFRAG;
	}

	ixa->ixa_ip_hdr_length = IPH_HDR_LENGTH(outer4);
	ixa->ixa_pktlen = msgdsize(mp);
	ixa->ixa_protocol = outer4->ipha_protocol;

	outer4->ipha_length = htons(ixa->ixa_pktlen);
	return (mp);
}
/*
 * Insert an encapsulation limit destination option in the packet provided.
 * Always consumes the mp argument and returns a new mblk pointer (NULL on
 * allocation failure, after bumping the no-transmit-buffer stat).
 */
static mblk_t *
iptun_insert_encaplimit(iptun_t *iptun, mblk_t *mp, ip6_t *outer6,
    uint8_t limit)
{
	mblk_t			*newmp;
	iptun_ipv6hdrs_t	*newouter6;

	ASSERT(outer6->ip6_nxt == IPPROTO_IPV6);
	ASSERT(mp->b_cont == NULL);

	/* Skip the outer IPv6 header; we re-emit it into the new mblk. */
	mp->b_rptr += sizeof (ip6_t);
	newmp = allocb(sizeof (iptun_ipv6hdrs_t) + MBLKL(mp), BPRI_MED);
	if (newmp == NULL) {
		iptun_drop_pkt(mp, &iptun->iptun_noxmtbuf);
		return (NULL);
	}
	newmp->b_wptr += sizeof (iptun_ipv6hdrs_t);
	/* Copy the payload (starting with the inner IPv6 header). */
	bcopy(mp->b_rptr, newmp->b_wptr, MBLKL(mp));
	newmp->b_wptr += MBLKL(mp);
	newouter6 = (iptun_ipv6hdrs_t *)newmp->b_rptr;
	/* Now copy the outer IPv6 header. */
	bcopy(outer6, &newouter6->it6h_ip6h, sizeof (ip6_t));
	/* Splice the dest-opts header between outer IPv6 and its payload. */
	newouter6->it6h_ip6h.ip6_nxt = IPPROTO_DSTOPTS;
	newouter6->it6h_encaplim = iptun_encaplim_init;
	newouter6->it6h_encaplim.iel_destopt.ip6d_nxt = outer6->ip6_nxt;
	newouter6->it6h_encaplim.iel_telopt.ip6ot_encap_limit = limit;

	/*
	 * The payload length will be set at the end of
	 * iptun_out_process_ipv6().
	 */

	freemsg(mp);
	return (newmp);
}
/*
 * Process output packets with outer IPv6 headers: enforce the RFC 2473
 * tunnel encapsulation limit, set up fragmentation policy, and fill in the
 * transmit attributes and outer payload length.  Frees mp and bumps stats on
 * error; returns NULL in that case.
 */
static mblk_t *
iptun_out_process_ipv6(iptun_t *iptun, mblk_t *mp, ip6_t *outer6,
    ipha_t *inner4, ip6_t *inner6, ip_xmit_attr_t *ixa)
{
	uint8_t	*innerptr = (inner4 != NULL ?
	    (uint8_t *)inner4 : (uint8_t *)inner6);
	size_t	minmtu = iptun->iptun_typeinfo->iti_minmtu;
	uint8_t	*limit, *configlimit;
	uint32_t offset;
	iptun_ipv6hdrs_t *v6hdrs;

	if (inner6 != NULL && iptun_find_encaplimit(mp, inner6, &limit)) {
		/*
		 * The inner packet is an IPv6 packet which itself contains an
		 * encapsulation limit option.  The limit variable points to
		 * the value in the embedded option.  Process the
		 * encapsulation limit option as specified in RFC 2473.
		 *
		 * If limit is 0, then we've exceeded the limit and we need to
		 * send back an ICMPv6 parameter problem message.
		 *
		 * If limit is > 0, then we decrement it by 1 and make sure
		 * that the encapsulation limit option in the outer header
		 * reflects that (adding an option if one isn't already
		 * there).
		 */
		ASSERT(limit > mp->b_rptr && limit < mp->b_wptr);
		if (*limit == 0) {
			mp->b_rptr = (uint8_t *)inner6;
			offset = limit - mp->b_rptr;
			iptun_icmp_error_v6(iptun, inner6, mp, ICMP6_PARAM_PROB,
			    0, offset, ixa->ixa_tsl);
			atomic_inc_64(&iptun->iptun_noxmtbuf);
			return (NULL);
		}

		/*
		 * The outer header requires an encapsulation limit option.
		 * If there isn't one already, add one.
		 */
		if (iptun->iptun_encaplimit == 0) {
			if ((mp = iptun_insert_encaplimit(iptun, mp, outer6,
			    (*limit - 1))) == NULL)
				return (NULL);
			v6hdrs = (iptun_ipv6hdrs_t *)mp->b_rptr;
		} else {
			/*
			 * There is an existing encapsulation limit option in
			 * the outer header.  If the inner encapsulation limit
			 * is less than the configured encapsulation limit,
			 * update the outer encapsulation limit to reflect
			 * this lesser value.
			 */
			v6hdrs = (iptun_ipv6hdrs_t *)mp->b_rptr;
			configlimit =
			    &v6hdrs->it6h_encaplim.iel_telopt.ip6ot_encap_limit;
			if ((*limit - 1) < *configlimit)
				*configlimit = (*limit - 1);
		}
		ixa->ixa_ip_hdr_length = sizeof (iptun_ipv6hdrs_t);
		ixa->ixa_protocol = v6hdrs->it6h_encaplim.iel_destopt.ip6d_nxt;
	} else {
		ixa->ixa_ip_hdr_length = sizeof (ip6_t);
		ixa->ixa_protocol = outer6->ip6_nxt;
	}

	/*
	 * See iptun_out_process_ipv4() for why we allow fragmentation of
	 * small packets.
	 */
	if (mp->b_wptr - innerptr <= minmtu)
		ixa->ixa_flags &= ~IXAF_DONTFRAG;
	else if (!(ixa->ixa_flags & IXAF_PMTU_TOO_SMALL))
		ixa->ixa_flags |= IXAF_DONTFRAG;

	ixa->ixa_pktlen = msgdsize(mp);
	outer6->ip6_plen = htons(ixa->ixa_pktlen - sizeof (ip6_t));
	return (mp);
}
/*
 * The IP tunneling MAC-type plugins have already done most of the header
 * processing and validity checks.  We are simply responsible for multiplexing
 * down to the ip module below us.  Consumes mp in all paths.
 */
static void
iptun_output(iptun_t *iptun, mblk_t *mp)
{
	conn_t	*connp = iptun->iptun_connp;
	mblk_t	*newmp;
	int	error;
	ip_xmit_attr_t	*ixa;

	ASSERT(mp->b_datap->db_type == M_DATA);

	if (mp->b_cont != NULL) {
		/* Later processing assumes a single contiguous mblk. */
		if ((newmp = msgpullup(mp, -1)) == NULL) {
			iptun_drop_pkt(mp, &iptun->iptun_noxmtbuf);
			return;
		}
		freemsg(mp);
		mp = newmp;
	}

	/* 6to4 tunnels require per-destination handling. */
	if (iptun->iptun_typeinfo->iti_type == IPTUN_TYPE_6TO4) {
		iptun_output_6to4(iptun, mp);
		return;
	}

	if (is_system_labeled()) {
		/*
		 * Since the label can be different, meaning a potentially
		 * different IRE, we always use a unique ip_xmit_attr_t.
		 */
		ixa = conn_get_ixa_exclusive(connp);
	} else {
		/*
		 * If no other thread is using conn_ixa this just gets a
		 * reference to conn_ixa.  Otherwise we get a safe copy of
		 * conn_ixa.
		 */
		ixa = conn_get_ixa(connp, B_FALSE);
	}
	if (ixa == NULL) {
		iptun_drop_pkt(mp, &iptun->iptun_oerrors);
		return;
	}

	/*
	 * In case we got a safe copy of conn_ixa, then we need
	 * to fill in any pointers in it.
	 */
	if (ixa->ixa_ire == NULL) {
		error = ip_attr_connect(connp, ixa, &connp->conn_saddr_v6,
		    &connp->conn_faddr_v6, &connp->conn_faddr_v6, 0,
		    NULL, NULL, 0);
		if (error != 0) {
			if (ixa->ixa_ire != NULL &&
			    (error == EHOSTUNREACH || error == ENETUNREACH)) {
				/*
				 * Let conn_ip_output/ire_send_noroute return
				 * the error and send any local ICMP error.
				 */
				error = 0;
			} else {
				ixa_refrele(ixa);
				iptun_drop_pkt(mp, &iptun->iptun_oerrors);
				return;
			}
		}
	}

	iptun_output_common(iptun, ixa, mp);
	ixa_refrele(ixa);
}
3193 * We use an ixa based on the last destination.
3195 static void
3196 iptun_output_6to4(iptun_t *iptun, mblk_t *mp)
3198 conn_t *connp = iptun->iptun_connp;
3199 ipha_t *outer4, *inner4;
3200 ip6_t *outer6, *inner6;
3201 ip_xmit_attr_t *ixa;
3202 ip_xmit_attr_t *oldixa;
3203 int error;
3204 boolean_t need_connect;
3205 in6_addr_t v6dst;
3207 ASSERT(mp->b_cont == NULL); /* Verified by iptun_output */
3209 /* Make sure we set ipha_dst before we look at ipha_dst */
3211 (void) iptun_find_headers(mp, 0, &outer4, &inner4, &outer6, &inner6);
3212 ASSERT(outer4 != NULL);
3213 if (!iptun_out_process_6to4(iptun, outer4, inner6)) {
3214 iptun_drop_pkt(mp, &iptun->iptun_oerrors);
3215 return;
3218 if (is_system_labeled()) {
3220 * Since the label can be different meaning a potentially
3221 * different IRE,we always use a unique ip_xmit_attr_t.
3223 ixa = conn_get_ixa_exclusive(connp);
3224 } else {
3226 * If no other thread is using conn_ixa this just gets a
3227 * reference to conn_ixa. Otherwise we get a safe copy of
3228 * conn_ixa.
3230 ixa = conn_get_ixa(connp, B_FALSE);
3232 if (ixa == NULL) {
3233 iptun_drop_pkt(mp, &iptun->iptun_oerrors);
3234 return;
3237 mutex_enter(&connp->conn_lock);
3238 if (connp->conn_v4lastdst == outer4->ipha_dst) {
3239 need_connect = (ixa->ixa_ire == NULL);
3240 } else {
3241 /* In case previous destination was multirt */
3242 ip_attr_newdst(ixa);
3245 * We later update conn_ixa when we update conn_v4lastdst
3246 * which enables subsequent packets to avoid redoing
3247 * ip_attr_connect
3249 need_connect = B_TRUE;
3251 mutex_exit(&connp->conn_lock);
3254 * In case we got a safe copy of conn_ixa, or otherwise we don't
3255 * have a current ixa_ire, then we need to fill in any pointers in
3256 * the ixa.
3258 if (need_connect) {
3259 IN6_IPADDR_TO_V4MAPPED(outer4->ipha_dst, &v6dst);
3261 /* We handle IPsec in iptun_output_common */
3262 error = ip_attr_connect(connp, ixa, &connp->conn_saddr_v6,
3263 &v6dst, &v6dst, 0, NULL, NULL, 0);
3264 if (error != 0) {
3265 if (ixa->ixa_ire != NULL &&
3266 (error == EHOSTUNREACH || error == ENETUNREACH)) {
3268 * Let conn_ip_output/ire_send_noroute return
3269 * the error and send any local ICMP error.
3271 error = 0;
3272 } else {
3273 ixa_refrele(ixa);
3274 iptun_drop_pkt(mp, &iptun->iptun_oerrors);
3275 return;
3280 iptun_output_common(iptun, ixa, mp);
3282 /* Atomically replace conn_ixa and conn_v4lastdst */
3283 mutex_enter(&connp->conn_lock);
3284 if (connp->conn_v4lastdst != outer4->ipha_dst) {
3285 /* Remember the dst which corresponds to conn_ixa */
3286 connp->conn_v6lastdst = v6dst;
3287 oldixa = conn_replace_ixa(connp, ixa);
3288 } else {
3289 oldixa = NULL;
3291 mutex_exit(&connp->conn_lock);
3292 ixa_refrele(ixa);
3293 if (oldixa != NULL)
3294 ixa_refrele(oldixa);
/*
 * Check the destination/label. Modifies *mpp by adding/removing CIPSO.
 *
 * We get the label from the message in order to honor the
 * ULPs/IPs choice of label. This will be NULL for forwarded
 * packets, neighbor discovery packets and some others.
 *
 * Returns zero on success or an errno from tsol_check_label_v[46]() on
 * failure (the caller drops the packet).  On success *mpp may have been
 * replaced by the label check, and ixa_pktlen/ixa_ip_hdr_length are
 * adjusted by however much the CIPSO option grew or shrank the outer
 * IP header.
 */
static int
iptun_output_check_label(mblk_t **mpp, ip_xmit_attr_t *ixa)
{
	cred_t		*cr;
	int		adjust;
	int		iplen;
	int		err;
	ts_label_t	*effective_tsl = NULL;

	ASSERT(is_system_labeled());

	/* No credentials attached to the message: nothing to check. */
	cr = msg_getcred(*mpp, NULL);
	if (cr == NULL)
		return (0);

	/*
	 * We need to start with a label based on the IP/ULP above us
	 */
	ip_xmit_attr_restore_tsl(ixa, cr);

	/*
	 * Need to update packet with any CIPSO option since
	 * conn_ip_output doesn't do that.
	 */
	if (ixa->ixa_flags & IXAF_IS_IPV4) {
		ipha_t *ipha;

		/*
		 * Record the pre-check length so we can compute how much
		 * the label check changed the header size.
		 */
		ipha = (ipha_t *)(*mpp)->b_rptr;
		iplen = ntohs(ipha->ipha_length);
		err = tsol_check_label_v4(ixa->ixa_tsl,
		    ixa->ixa_zoneid, mpp, CONN_MAC_DEFAULT, B_FALSE,
		    ixa->ixa_ipst, &effective_tsl);
		if (err != 0)
			return (err);

		/* Re-read: tsol_check_label_v4() may have replaced *mpp */
		ipha = (ipha_t *)(*mpp)->b_rptr;
		adjust = (int)ntohs(ipha->ipha_length) - iplen;
	} else {
		ip6_t *ip6h;

		ip6h = (ip6_t *)(*mpp)->b_rptr;
		iplen = ntohs(ip6h->ip6_plen);

		err = tsol_check_label_v6(ixa->ixa_tsl,
		    ixa->ixa_zoneid, mpp, CONN_MAC_DEFAULT, B_FALSE,
		    ixa->ixa_ipst, &effective_tsl);
		if (err != 0)
			return (err);

		/* Re-read: tsol_check_label_v6() may have replaced *mpp */
		ip6h = (ip6_t *)(*mpp)->b_rptr;
		adjust = (int)ntohs(ip6h->ip6_plen) - iplen;
	}

	if (effective_tsl != NULL) {
		/* Update the label */
		ip_xmit_attr_replace_tsl(ixa, effective_tsl);
	}

	/* Reflect the CIPSO size change in the transmit attributes */
	ixa->ixa_pktlen += adjust;
	ixa->ixa_ip_hdr_length += adjust;
	return (0);
}
3368 static void
3369 iptun_output_common(iptun_t *iptun, ip_xmit_attr_t *ixa, mblk_t *mp)
3371 ipsec_tun_pol_t *itp = iptun->iptun_itp;
3372 int outer_hlen;
3373 mblk_t *newmp;
3374 ipha_t *outer4, *inner4;
3375 ip6_t *outer6, *inner6;
3376 int error;
3377 boolean_t update_pktlen;
3379 ASSERT(ixa->ixa_ire != NULL);
3381 outer_hlen = iptun_find_headers(mp, 0, &outer4, &inner4, &outer6,
3382 &inner6);
3383 if (outer_hlen == 0) {
3384 iptun_drop_pkt(mp, &iptun->iptun_oerrors);
3385 return;
3388 /* Save IXAF_DONTFRAG value */
3389 iaflags_t dontfrag = ixa->ixa_flags & IXAF_DONTFRAG;
3391 /* Perform header processing. */
3392 if (outer4 != NULL) {
3393 mp = iptun_out_process_ipv4(iptun, mp, outer4, inner4, inner6,
3394 ixa);
3395 } else {
3396 mp = iptun_out_process_ipv6(iptun, mp, outer6, inner4, inner6,
3397 ixa);
3399 if (mp == NULL)
3400 return;
3403 * Let's hope the compiler optimizes this with "branch taken".
3405 if (itp != NULL && (itp->itp_flags & ITPF_P_ACTIVE)) {
3406 /* This updates the ip_xmit_attr_t */
3407 mp = ipsec_tun_outbound(mp, iptun, inner4, inner6, outer4,
3408 outer6, outer_hlen, ixa);
3409 if (mp == NULL) {
3410 atomic_inc_64(&iptun->iptun_oerrors);
3411 return;
3413 if (is_system_labeled()) {
3415 * Might change the packet by adding/removing CIPSO.
3416 * After this caller inner* and outer* and outer_hlen
3417 * might be invalid.
3419 error = iptun_output_check_label(&mp, ixa);
3420 if (error != 0) {
3421 ip2dbg(("label check failed (%d)\n", error));
3422 iptun_drop_pkt(mp, &iptun->iptun_oerrors);
3423 return;
3428 * ipsec_tun_outbound() returns a chain of tunneled IP
3429 * fragments linked with b_next (or a single message if the
3430 * tunneled packet wasn't a fragment).
3431 * If fragcache returned a list then we need to update
3432 * ixa_pktlen for all packets in the list.
3434 update_pktlen = (mp->b_next != NULL);
3437 * Otherwise, we're good to go. The ixa has been updated with
3438 * instructions for outbound IPsec processing.
3440 for (newmp = mp; newmp != NULL; newmp = mp) {
3441 size_t minmtu = iptun->iptun_typeinfo->iti_minmtu;
3443 atomic_inc_64(&iptun->iptun_opackets);
3444 atomic_add_64(&iptun->iptun_obytes, ixa->ixa_pktlen);
3445 mp = mp->b_next;
3446 newmp->b_next = NULL;
3449 * The IXAF_DONTFRAG flag is global, but there is
3450 * a chain here. Check if we're really already
3451 * smaller than the minimum allowed MTU and reset here
3452 * appropriately. Otherwise one small packet can kill
3453 * the whole chain's path mtu discovery.
3454 * In addition, update the pktlen to the length of
3455 * the actual packet being processed.
3457 if (update_pktlen) {
3458 ixa->ixa_pktlen = msgdsize(newmp);
3459 if (ixa->ixa_pktlen <= minmtu)
3460 ixa->ixa_flags &= ~IXAF_DONTFRAG;
3463 atomic_inc_64(&iptun->iptun_opackets);
3464 atomic_add_64(&iptun->iptun_obytes, ixa->ixa_pktlen);
3466 error = conn_ip_output(newmp, ixa);
3468 /* Restore IXAF_DONTFRAG value */
3469 ixa->ixa_flags |= dontfrag;
3471 if (error == EMSGSIZE) {
3472 /* IPsec policy might have changed */
3473 (void) iptun_update_mtu(iptun, ixa, 0);
3476 } else {
3478 * The ip module will potentially apply global policy to the
3479 * packet in its output path if there's no active tunnel
3480 * policy.
3482 ASSERT(ixa->ixa_ipsec_policy == NULL);
3483 mp = ip_output_attach_policy(mp, outer4, outer6, NULL, ixa);
3484 if (mp == NULL) {
3485 atomic_inc_64(&iptun->iptun_oerrors);
3486 return;
3488 if (is_system_labeled()) {
3490 * Might change the packet by adding/removing CIPSO.
3491 * After this caller inner* and outer* and outer_hlen
3492 * might be invalid.
3494 error = iptun_output_check_label(&mp, ixa);
3495 if (error != 0) {
3496 ip2dbg(("label check failed (%d)\n", error));
3497 iptun_drop_pkt(mp, &iptun->iptun_oerrors);
3498 return;
3502 atomic_inc_64(&iptun->iptun_opackets);
3503 atomic_add_64(&iptun->iptun_obytes, ixa->ixa_pktlen);
3505 error = conn_ip_output(mp, ixa);
3506 if (error == EMSGSIZE) {
3507 /* IPsec policy might have changed */
3508 (void) iptun_update_mtu(iptun, ixa, 0);
3511 if (ixa->ixa_flags & IXAF_IPSEC_SECURE)
3512 ipsec_out_release_refs(ixa);
3515 static mac_callbacks_t iptun_m_callbacks = {
3516 .mc_callbacks = (MC_SETPROP | MC_GETPROP),
3517 .mc_getstat = iptun_m_getstat,
3518 .mc_start = iptun_m_start,
3519 .mc_stop = iptun_m_stop,
3520 .mc_setpromisc = iptun_m_setpromisc,
3521 .mc_multicst = iptun_m_multicst,
3522 .mc_unicst = iptun_m_unicst,
3523 .mc_tx = iptun_m_tx,
3524 .mc_setprop = iptun_m_setprop,
3525 .mc_getprop = iptun_m_getprop