sys/netinet/in_rmx.c

   1 /*
   2  * Copyright 1994, 1995 Massachusetts Institute of Technology
   3  *
   4  * Permission to use, copy, modify, and distribute this software and
   5  * its documentation for any purpose and without fee is hereby
   6  * granted, provided that both the above copyright notice and this
   7  * permission notice appear in all copies, that both the above
   8  * copyright notice and this permission notice appear in all
   9  * supporting documentation, and that the name of M.I.T. not be used
  10  * in advertising or publicity pertaining to distribution of the
  11  * software without specific, written prior permission.  M.I.T. makes
  12  * no representations about the suitability of this software for any
  13  * purpose.  It is provided "as is" without express or implied
  14  * warranty.
  15  *
  16  * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''.  M.I.T. DISCLAIMS
  17  * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE,
  18  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  19  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT
  20  * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  21  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  22  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  23  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  24  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  25  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  26  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  27  * SUCH DAMAGE.
  28  *
  29  * $FreeBSD: src/sys/netinet/in_rmx.c,v 1.37.2.3 2002/08/09 14:49:23 ru Exp $
  30  * $DragonFly: src/sys/netinet/in_rmx.c,v 1.14 2006/04/11 06:59:34 dillon Exp $
  31  */
  32
  33 /*
  34  * This code does two things necessary for the enhanced TCP metrics to
  35  * function in a useful manner:
  36  *  1) It marks all non-host routes as `cloning', thus ensuring that
  37  *     every actual reference to such a route actually gets turned
  38  *     into a reference to a host route to the specific destination
  39  *     requested.
  40  *  2) When such routes lose all their references, it arranges for them
  41  *     to be deleted in some random collection of circumstances, so that
  42  *     a large quantity of stale routing data is not kept in kernel memory
  43  *     indefinitely.  See in_rtqtimo() below for the exact mechanism.
  44  */
  45
  46 #include <sys/param.h>
  47 #include <sys/systm.h>
  48 #include <sys/kernel.h>
  49 #include <sys/sysctl.h>
  50 #include <sys/socket.h>
  51 #include <sys/mbuf.h>
  52 #include <sys/syslog.h>
  53 #include <sys/globaldata.h>
  54 #include <sys/thread2.h>
  55
  56 #include <net/if.h>
  57 #include <net/route.h>
  58 #include <net/if_var.h>
  59 #include <netinet/in.h>
  60 #include <netinet/in_var.h>
  61 #include <netinet/ip_var.h>
  62 #include <netinet/ip_flow.h>
  63
   64 #define RTPRF_EXPIRING  RTF_PROTO3      /* set on routes we manage: in_closeroute() sets it, in_matchroute() clears it */
   65
   66 static struct callout in_rtqtimo_ch[MAXCPU];    /* indexed by mycpuid; in_rtqtimo() re-arms its own slot */
  67
  68 /*
  69  * Do what we need to do when inserting a route.
  70  */
  71 static struct radix_node *
  72 in_addroute(char *key, char *mask, struct radix_node_head *head,
  73             struct radix_node *treenodes)
  74 {
  75         struct rtentry *rt = (struct rtentry *)treenodes;
  76         struct sockaddr_in *sin = (struct sockaddr_in *)rt_key(rt);
  77         struct radix_node *ret;
  78         struct in_ifaddr_container *iac;
  79         struct in_ifaddr *ia;
  80
  81         /*
  82          * For IP, mark routes to multicast addresses as such, because
  83          * it's easy to do and might be useful (but this is much more
  84          * dubious since it's so easy to inspect the address).
  85          *
  86          * For IP, all unicast non-host routes are automatically cloning.
  87          */
  88         if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
  89                 rt->rt_flags |= RTF_MULTICAST;
  90
  91         if (!(rt->rt_flags & (RTF_HOST | RTF_CLONING | RTF_MULTICAST)))
  92                 rt->rt_flags |= RTF_PRCLONING;
  93
  94         /*
  95          *   For host routes, we make sure that RTF_BROADCAST
  96          *   is set for anything that looks like a broadcast address.
  97          *   This way, we can avoid an expensive call to in_broadcast()
  98          *   in ip_output() most of the time (because the route passed
  99          *   to ip_output() is almost always a host route).
 100          *
 101          *   For local routes we set RTF_LOCAL allowing various shortcuts.
 102          *
 103          *   A cloned network route will point to one of several possible
 104          *   addresses if an interface has aliases and must be repointed
 105          *   back to the correct address or arp_rtrequest() will not properly
 106          *   detect the local ip.
 107          */
 108         if (rt->rt_flags & RTF_HOST) {
 109                 if (in_broadcast(sin->sin_addr, rt->rt_ifp)) {
 110                         rt->rt_flags |= RTF_BROADCAST;
 111                 } else if (satosin(rt->rt_ifa->ifa_addr)->sin_addr.s_addr ==
 112                            sin->sin_addr.s_addr) {
 113                         rt->rt_flags |= RTF_LOCAL;
 114                 } else {
 115                         LIST_FOREACH(iac, INADDR_HASH(sin->sin_addr.s_addr),
 116                                      ia_hash) {
 117                                 ia = iac->ia;
 118                                 if (sin->sin_addr.s_addr ==
 119                                     ia->ia_addr.sin_addr.s_addr) {
 120                                         rt->rt_flags |= RTF_LOCAL;
 121                                         IFAREF(&ia->ia_ifa);
 122                                         IFAFREE(rt->rt_ifa);
 123                                         rt->rt_ifa = &ia->ia_ifa;
 124                                         rt->rt_ifp = rt->rt_ifa->ifa_ifp;
 125                                         break;
 126                                 }
 127                         }
 128                 }
 129         }
 130
 131         if (rt->rt_rmx.rmx_mtu != 0 && !(rt->rt_rmx.rmx_locks & RTV_MTU) &&
 132             rt->rt_ifp != NULL)
 133                 rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu;
 134
 135         ret = rn_addroute(key, mask, head, treenodes);
 136         if (ret == NULL && (rt->rt_flags & RTF_HOST)) {
 137                 struct rtentry *oldrt;
 138
 139                 /*
 140                  * We are trying to add a host route, but can't.
 141                  * Find out if it is because of an ARP entry and
 142                  * delete it if so.
 143                  */
 144                 oldrt = rtpurelookup((struct sockaddr *)sin);
 145                 if (oldrt != NULL) {
 146                         --oldrt->rt_refcnt;
 147                         if ((oldrt->rt_flags & RTF_LLINFO) &&
 148                             (oldrt->rt_flags & RTF_HOST) &&
 149                             oldrt->rt_gateway &&
 150                             oldrt->rt_gateway->sa_family == AF_LINK) {
 151                                 rtrequest(RTM_DELETE, rt_key(oldrt),
 152                                           oldrt->rt_gateway, rt_mask(oldrt),
 153                                           oldrt->rt_flags, NULL);
 154                                 ret = rn_addroute(key, mask, head, treenodes);
 155                         }
 156                 }
 157         }
 158
 159         /*
 160          * If the new route has been created successfully, and it is
 161          * not a multicast/broadcast or cloned route, then we will
 162          * have to flush the ipflow.  Otherwise, we may end up using
 163          * the wrong route.
 164          */
 165         if (ret != NULL &&
 166             (rt->rt_flags &
 167              (RTF_MULTICAST | RTF_BROADCAST | RTF_WASCLONED)) == 0) {
 168                 ipflow_flush_oncpu();
 169         }
 170         return ret;
 171 }
 172
 173 /*
 174  * This code is the inverse of in_closeroute: on first reference, if we
 175  * were managing the route, stop doing so and set the expiration timer
 176  * back off again.
 177  */
 178 static struct radix_node *
 179 in_matchroute(char *key, struct radix_node_head *head)
 180 {
 181         struct radix_node *rn = rn_match(key, head);
 182         struct rtentry *rt = (struct rtentry *)rn;
 183
 184         if (rt != NULL && rt->rt_refcnt == 0) { /* this is first reference */
 185                 if (rt->rt_flags & RTPRF_EXPIRING) {
 186                         rt->rt_flags &= ~RTPRF_EXPIRING;
 187                         rt->rt_rmx.rmx_expire = 0;
 188                 }
 189         }
 190         return rn;
 191 }
 192
  193 static int rtq_reallyold = 60*60;  /* one hour is ``really old'': seconds
                                          * in_closeroute() adds to time_second
                                          * to form a route's expiry; 0 means
                                          * delete at once.  in_rtqtimo() may
                                          * shrink this value. */
  194 SYSCTL_INT(_net_inet_ip, IPCTL_RTEXPIRE, rtexpire, CTLFLAG_RW,
  195     &rtq_reallyold , 0,
  196     "Default expiration time on cloned routes");
  197
  198 static int rtq_minreallyold = 10;  /* never automatically crank down to less:
                                          * floor applied by in_rtqtimo() when it
                                          * shortens rtq_reallyold */
  199 SYSCTL_INT(_net_inet_ip, IPCTL_RTMINEXPIRE, rtminexpire, CTLFLAG_RW,
  200     &rtq_minreallyold , 0,
  201     "Minimum time to attempt to hold onto cloned routes");
  202
  203 static int rtq_toomany = 128;      /* 128 cached routes is ``too many'': if
                                          * more expiring routes than this survive
                                          * a sweep, in_rtqtimo() shortens
                                          * rtq_reallyold */
  204 SYSCTL_INT(_net_inet_ip, IPCTL_RTMAXCACHE, rtmaxcache, CTLFLAG_RW,
  205     &rtq_toomany , 0, "Upper limit on cloned routes");
 206
 207 /*
 208  * On last reference drop, mark the route as belong to us so that it can be
 209  * timed out.
 210  */
 211 static void
 212 in_closeroute(struct radix_node *rn, struct radix_node_head *head)
 213 {
 214         struct rtentry *rt = (struct rtentry *)rn;
 215
 216         if (!(rt->rt_flags & RTF_UP))
 217                 return;         /* prophylactic measures */
 218
 219         if ((rt->rt_flags & (RTF_LLINFO | RTF_HOST)) != RTF_HOST)
 220                 return;
 221
 222         if ((rt->rt_flags & (RTF_WASCLONED | RTPRF_EXPIRING)) != RTF_WASCLONED)
 223                 return;
 224
 225         /*
 226          * As requested by David Greenman:
 227          * If rtq_reallyold is 0, just delete the route without
 228          * waiting for a timeout cycle to kill it.
 229          */
 230         if (rtq_reallyold != 0) {
 231                 rt->rt_flags |= RTPRF_EXPIRING;
 232                 rt->rt_rmx.rmx_expire = time_second + rtq_reallyold;
 233         } else {
 234                 /*
 235                  * Remove route from the radix tree, but defer deallocation
 236                  * until we return to rtfree().
 237                  */
 238                 rtrequest(RTM_DELETE, rt_key(rt), rt->rt_gateway, rt_mask(rt),
 239                           rt->rt_flags, &rt);
 240         }
 241 }
 242
  243 struct rtqk_arg {                  /* state handed to in_rtqkill() through rnh_walktree() */
  244         struct radix_node_head *rnh;    /* table being walked; set by callers, not read by in_rtqkill() */
  245         int draining;   /* nonzero: delete expiring routes even if not yet due */
  246         int killed;     /* out: number of routes successfully deleted */
  247         int found;      /* out: number of routes seen with RTPRF_EXPIRING set */
  248         int updating;   /* nonzero: clamp remaining lifetimes to rtq_reallyold */
  249         time_t nextstop;        /* in/out: lowered to the earliest expiry among surviving routes */
  250 };
 251
 252 /*
 253  * Get rid of old routes.  When draining, this deletes everything, even when
 254  * the timeout is not expired yet.  When updating, this makes sure that
 255  * nothing has a timeout longer than the current value of rtq_reallyold.
 256  */
 257 static int
 258 in_rtqkill(struct radix_node *rn, void *rock)
 259 {
 260         struct rtqk_arg *ap = rock;
 261         struct rtentry *rt = (struct rtentry *)rn;
 262         int err;
 263
 264         if (rt->rt_flags & RTPRF_EXPIRING) {
 265                 ap->found++;
 266                 if (ap->draining || rt->rt_rmx.rmx_expire <= time_second) {
 267                         if (rt->rt_refcnt > 0)
 268                                 panic("rtqkill route really not free");
 269
 270                         err = rtrequest(RTM_DELETE, rt_key(rt), rt->rt_gateway,
 271                                         rt_mask(rt), rt->rt_flags, NULL);
 272                         if (err)
 273                                 log(LOG_WARNING, "in_rtqkill: error %d\n", err);
 274                         else
 275                                 ap->killed++;
 276                 } else {
 277                         if (ap->updating &&
 278                             (rt->rt_rmx.rmx_expire - time_second >
 279                              rtq_reallyold)) {
 280                                 rt->rt_rmx.rmx_expire = time_second +
 281                                     rtq_reallyold;
 282                         }
 283                         ap->nextstop = lmin(ap->nextstop,
 284                                             rt->rt_rmx.rmx_expire);
 285                 }
 286         }
 287
 288         return 0;
 289 }
 290
 291 #define RTQ_TIMEOUT     60*10   /* run no less than once every ten minutes */
 292 static int rtq_timeout = RTQ_TIMEOUT;
 293
 294 static void
 295 in_rtqtimo(void *rock)
 296 {
 297         struct radix_node_head *rnh = rock;
 298         struct rtqk_arg arg;
 299         struct timeval atv;
 300         static time_t last_adjusted_timeout = 0;
 301
 302         arg.found = arg.killed = 0;
 303         arg.rnh = rnh;
 304         arg.nextstop = time_second + rtq_timeout;
 305         arg.draining = arg.updating = 0;
 306         crit_enter();
 307         rnh->rnh_walktree(rnh, in_rtqkill, &arg);
 308         crit_exit();
 309
 310         /*
 311          * Attempt to be somewhat dynamic about this:
 312          * If there are ``too many'' routes sitting around taking up space,
 313          * then crank down the timeout, and see if we can't make some more
 314          * go away.  However, we make sure that we will never adjust more
 315          * than once in rtq_timeout seconds, to keep from cranking down too
 316          * hard.
 317          */
 318         if ((arg.found - arg.killed > rtq_toomany) &&
 319             (time_second - last_adjusted_timeout >= rtq_timeout) &&
 320             rtq_reallyold > rtq_minreallyold) {
 321                 rtq_reallyold = 2*rtq_reallyold / 3;
 322                 if (rtq_reallyold < rtq_minreallyold) {
 323                         rtq_reallyold = rtq_minreallyold;
 324                 }
 325
 326                 last_adjusted_timeout = time_second;
 327 #ifdef DIAGNOSTIC
 328                 log(LOG_DEBUG, "in_rtqtimo: adjusted rtq_reallyold to %d\n",
 329                     rtq_reallyold);
 330 #endif
 331                 arg.found = arg.killed = 0;
 332                 arg.updating = 1;
 333                 crit_enter();
 334                 rnh->rnh_walktree(rnh, in_rtqkill, &arg);
 335                 crit_exit();
 336         }
 337
 338         atv.tv_usec = 0;
 339         atv.tv_sec = arg.nextstop - time_second;
 340         callout_reset(&in_rtqtimo_ch[mycpuid], tvtohz_high(&atv), in_rtqtimo,
 341                       rock);
 342 }
 343
 344 void
 345 in_rtqdrain(void)
 346 {
 347         struct radix_node_head *rnh = rt_tables[mycpuid][AF_INET];
 348         struct rtqk_arg arg;
 349
 350         arg.found = arg.killed = 0;
 351         arg.rnh = rnh;
 352         arg.nextstop = 0;
 353         arg.draining = 1;
 354         arg.updating = 0;
 355         crit_enter();
 356         rnh->rnh_walktree(rnh, in_rtqkill, &arg);
 357         crit_exit();
 358 }
 359
 360 /*
 361  * Initialize our routing tree.
 362  */
 363 int
 364 in_inithead(void **head, int off)
 365 {
 366         struct radix_node_head *rnh;
 367
 368         if (!rn_inithead(head, rn_cpumaskhead(mycpuid), off))
 369                 return 0;
 370
 371         if (head != (void **)&rt_tables[mycpuid][AF_INET]) /* BOGUS! */
 372                 return 1;       /* only do this for the real routing table */
 373
 374         rnh = *head;
 375         rnh->rnh_addaddr = in_addroute;
 376         rnh->rnh_matchaddr = in_matchroute;
 377         rnh->rnh_close = in_closeroute;
 378         callout_init(&in_rtqtimo_ch[mycpuid]);
 379         in_rtqtimo(rnh);        /* kick off timeout first time */
 380         return 1;
 381 }
 382
 383 /*
 384  * This zaps old routes when the interface goes down or interface
 385  * address is deleted.  In the latter case, it deletes static routes
 386  * that point to this address.  If we don't do this, we may end up
 387  * using the old address in the future.  The ones we always want to
 388  * get rid of are things like ARP entries, since the user might down
 389  * the interface, walk over to a completely different network, and
 390  * plug back in.
 391  *
 392  * in_ifadown() is typically called when an interface is being brought
 393  * down.  We must iterate through all per-cpu route tables and clean
 394  * them up.
 395  */
  396 struct in_ifadown_arg {            /* state handed to in_ifadownkill() through rnh_walktree() */
  397         struct radix_node_head *rnh;    /* table being walked; set by in_ifadown(), not read by in_ifadownkill() */
  398         struct ifaddr *ifa;     /* routes whose rt_ifa equals this address are removed */
  399         int del;        /* nonzero: remove RTF_STATIC routes as well */
  400 };
 401
 402 static int
 403 in_ifadownkill(struct radix_node *rn, void *xap)
 404 {
 405         struct in_ifadown_arg *ap = xap;
 406         struct rtentry *rt = (struct rtentry *)rn;
 407         int err;
 408
 409         if (rt->rt_ifa == ap->ifa &&
 410             (ap->del || !(rt->rt_flags & RTF_STATIC))) {
 411                 /*
 412                  * We need to disable the automatic prune that happens
 413                  * in this case in rtrequest() because it will blow
 414                  * away the pointers that rn_walktree() needs in order
 415                  * continue our descent.  We will end up deleting all
 416                  * the routes that rtrequest() would have in any case,
 417                  * so that behavior is not needed there.
 418                  */
 419                 rt->rt_flags &= ~(RTF_CLONING | RTF_PRCLONING);
 420                 err = rtrequest(RTM_DELETE, rt_key(rt), rt->rt_gateway,
 421                                 rt_mask(rt), rt->rt_flags, NULL);
 422                 if (err)
 423                         log(LOG_WARNING, "in_ifadownkill: error %d\n", err);
 424         }
 425         return 0;
 426 }
 427
 428 int
 429 in_ifadown(struct ifaddr *ifa, int delete)
 430 {
 431         struct in_ifadown_arg arg;
 432         struct radix_node_head *rnh;
 433         int origcpu;
 434         int cpu;
 435
 436         if (ifa->ifa_addr->sa_family != AF_INET)
 437                 return 1;
 438
 439         /*
 440          * XXX individual requests are not independantly chained,
 441          * which means that the per-cpu route tables will not be
 442          * consistent in the middle of the operation.  If routes
 443          * related to the interface are manipulated while we are
 444          * doing this the inconsistancy could trigger a panic.
 445          */
 446         origcpu = mycpuid;
 447         for (cpu = 0; cpu < ncpus2; cpu++) {
 448                 lwkt_migratecpu(cpu);
 449
 450                 arg.rnh = rnh = rt_tables[cpu][AF_INET];
 451                 arg.ifa = ifa;
 452                 arg.del = delete;
 453                 rnh->rnh_walktree(rnh, in_ifadownkill, &arg);
 454                 ifa->ifa_flags &= ~IFA_ROUTE;
 455         }
 456         lwkt_migratecpu(origcpu);
 457         return 0;
 458 }
 459