sys/net/netisr.c

   1 /*
   2  * Copyright (c) 2003, 2004 Matthew Dillon. All rights reserved.
   3  * Copyright (c) 2003, 2004 Jeffrey M. Hsu.  All rights reserved.
   4  * Copyright (c) 2003 Jonathan Lemon.  All rights reserved.
   5  * Copyright (c) 2003, 2004 The DragonFly Project.  All rights reserved.
   6  *
   7  * This code is derived from software contributed to The DragonFly Project
   8  * by Jonathan Lemon, Jeffrey M. Hsu, and Matthew Dillon.
   9  *
  10  * Jonathan Lemon gave Jeffrey Hsu permission to combine his copyright
  11  * into this one around July 8 2004.
  12  *
  13  * Redistribution and use in source and binary forms, with or without
  14  * modification, are permitted provided that the following conditions
  15  * are met:
  16  * 1. Redistributions of source code must retain the above copyright
  17  *    notice, this list of conditions and the following disclaimer.
  18  * 2. Redistributions in binary form must reproduce the above copyright
  19  *    notice, this list of conditions and the following disclaimer in the
  20  *    documentation and/or other materials provided with the distribution.
  21  * 3. Neither the name of The DragonFly Project nor the names of its
  22  *    contributors may be used to endorse or promote products derived
  23  *    from this software without specific, prior written permission.
  24  *
  25  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  26  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  27  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  28  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
  29  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  30  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
  31  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  32  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  33  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  34  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  35  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  36  * SUCH DAMAGE.
  37  *
  38  * $DragonFly: src/sys/net/netisr.c,v 1.49 2008/11/01 10:29:31 sephe Exp $
  39  */
  40
  41 #include <sys/param.h>
  42 #include <sys/systm.h>
  43 #include <sys/kernel.h>
  44 #include <sys/malloc.h>
  45 #include <sys/msgport.h>
  46 #include <sys/proc.h>
  47 #include <sys/interrupt.h>
  48 #include <sys/socket.h>
  49 #include <sys/sysctl.h>
  50 #include <sys/socketvar.h>
  51 #include <net/if.h>
  52 #include <net/if_var.h>
  53 #include <net/netisr.h>
  54 #include <machine/cpufunc.h>
  55
  56 #include <sys/thread2.h>
  57 #include <sys/msgport2.h>
  58 #include <net/netmsg2.h>
  59 #include <sys/mplock2.h>
  60
  61 static void netmsg_sync_func(netmsg_t msg);
  62 static void netmsg_service_loop(void *arg);
  63 static void cpu0_cpufn(struct mbuf **mp, int hoff);
  64
  65 struct netmsg_port_registration {
  66         TAILQ_ENTRY(netmsg_port_registration) npr_entry;
  67         lwkt_port_t     npr_port;
  68 };
  69
  70 struct netmsg_rollup {
  71         TAILQ_ENTRY(netmsg_rollup) ru_entry;
  72         netisr_ru_t     ru_func;
  73 };
  74
  75 static struct netisr netisrs[NETISR_MAX];
  76 static TAILQ_HEAD(,netmsg_port_registration) netreglist;
  77 static TAILQ_HEAD(,netmsg_rollup) netrulist;
  78
  79 /* Per-CPU thread to handle any protocol.  */
  80 static struct thread netisr_cpu[MAXCPU];
  81 lwkt_port netisr_afree_rport;
  82 lwkt_port netisr_afree_free_so_rport;
  83 lwkt_port netisr_adone_rport;
  84 lwkt_port netisr_apanic_rport;
  85 lwkt_port netisr_sync_port;
  86
  87 static int (*netmsg_fwd_port_fn)(lwkt_port_t, lwkt_msg_t);
  88
  89 SYSCTL_NODE(_net, OID_AUTO, netisr, CTLFLAG_RW, 0, "netisr");
  90
  91 /*
  92  * netisr_afree_rport replymsg function, only used to handle async
  93  * messages which the sender has abandoned to their fate.
  94  */
  95 static void
  96 netisr_autofree_reply(lwkt_port_t port, lwkt_msg_t msg)
  97 {
  98         kfree(msg, M_LWKTMSG);
  99 }
 100
 101 static void
 102 netisr_autofree_free_so_reply(lwkt_port_t port, lwkt_msg_t msg)
 103 {
 104         sofree(((netmsg_t)msg)->base.nm_so);
 105         kfree(msg, M_LWKTMSG);
 106 }
 107
 108 /*
 109  * We need a custom putport function to handle the case where the
 110  * message target is the current thread's message port.  This case
 111  * can occur when the TCP or UDP stack does a direct callback to NFS and NFS
 112  * then turns around and executes a network operation synchronously.
 113  *
 114  * To prevent deadlocking, we must execute these self-referential messages
 115  * synchronously, effectively turning the message into a glorified direct
 116  * procedure call back into the protocol stack.  The operation must be
 117  * complete on return or we will deadlock, so panic if it isn't.
 118  *
 119  * However, the target function is under no obligation to immediately
 120  * reply the message.  It may forward it elsewhere.
 121  */
 122 static int
 123 netmsg_put_port(lwkt_port_t port, lwkt_msg_t lmsg)
 124 {
 125         netmsg_base_t nmsg = (void *)lmsg;
 126
 127         if ((lmsg->ms_flags & MSGF_SYNC) && port == &curthread->td_msgport) {
 128                 nmsg->nm_dispatch((netmsg_t)nmsg);
 129                 return(EASYNC);
 130         } else {
 131                 return(netmsg_fwd_port_fn(port, lmsg));
 132         }
 133 }
 134
 135 /*
 136  * UNIX DOMAIN sockets still have to run their uipc functions synchronously,
 137  * because they depend on the user proc context for a number of things
 138  * (like creds) which we have not yet incorporated into the message structure.
 139  *
 140  * However, we maintain or message/port abstraction.  Having a special
 141  * synchronous port which runs the commands synchronously gives us the
 142  * ability to serialize operations in one place later on when we start
 143  * removing the BGL.
 144  */
 145 static int
 146 netmsg_sync_putport(lwkt_port_t port, lwkt_msg_t lmsg)
 147 {
 148         netmsg_base_t nmsg = (void *)lmsg;
 149
 150         KKASSERT((lmsg->ms_flags & MSGF_DONE) == 0);
 151
 152         lmsg->ms_target_port = port;    /* required for abort */
 153         nmsg->nm_dispatch((netmsg_t)nmsg);
 154         return(EASYNC);
 155 }
 156
 157 static void
 158 netisr_init(void)
 159 {
 160         int i;
 161
 162         TAILQ_INIT(&netreglist);
 163         TAILQ_INIT(&netrulist);
 164
 165         /*
 166          * Create default per-cpu threads for generic protocol handling.
 167          */
 168         for (i = 0; i < ncpus; ++i) {
 169                 lwkt_create(netmsg_service_loop, NULL, NULL,
 170                             &netisr_cpu[i], TDF_STOPREQ, i,
 171                             "netisr_cpu %d", i);
 172                 netmsg_service_port_init(&netisr_cpu[i].td_msgport);
 173                 lwkt_schedule(&netisr_cpu[i]);
 174         }
 175
 176         /*
 177          * The netisr_afree_rport is a special reply port which automatically
 178          * frees the replied message.  The netisr_adone_rport simply marks
 179          * the message as being done.  The netisr_apanic_rport panics if
 180          * the message is replied to.
 181          */
 182         lwkt_initport_replyonly(&netisr_afree_rport, netisr_autofree_reply);
 183         lwkt_initport_replyonly(&netisr_afree_free_so_rport,
 184                                 netisr_autofree_free_so_reply);
 185         lwkt_initport_replyonly_null(&netisr_adone_rport);
 186         lwkt_initport_panic(&netisr_apanic_rport);
 187
 188         /*
 189          * The netisr_syncport is a special port which executes the message
 190          * synchronously and waits for it if EASYNC is returned.
 191          */
 192         lwkt_initport_putonly(&netisr_sync_port, netmsg_sync_putport);
 193 }
 194
 195 SYSINIT(netisr, SI_SUB_PRE_DRIVERS, SI_ORDER_FIRST, netisr_init, NULL);
 196
 197 /*
 198  * Finish initializing the message port for a netmsg service.  This also
 199  * registers the port for synchronous cleanup operations such as when an
 200  * ifnet is being destroyed.  There is no deregistration API yet.
 201  */
 202 void
 203 netmsg_service_port_init(lwkt_port_t port)
 204 {
 205         struct netmsg_port_registration *reg;
 206
 207         /*
 208          * Override the putport function.  Our custom function checks for
 209          * self-references and executes such commands synchronously.
 210          */
 211         if (netmsg_fwd_port_fn == NULL)
 212                 netmsg_fwd_port_fn = port->mp_putport;
 213         KKASSERT(netmsg_fwd_port_fn == port->mp_putport);
 214         port->mp_putport = netmsg_put_port;
 215
 216         /*
 217          * Keep track of ports using the netmsg API so we can synchronize
 218          * certain operations (such as freeing an ifnet structure) across all
 219          * consumers.
 220          */
 221         reg = kmalloc(sizeof(*reg), M_TEMP, M_WAITOK|M_ZERO);
 222         reg->npr_port = port;
 223         TAILQ_INSERT_TAIL(&netreglist, reg, npr_entry);
 224 }
 225
 226 /*
 227  * This function synchronizes the caller with all netmsg services.  For
 228  * example, if an interface is being removed we must make sure that all
 229  * packets related to that interface complete processing before the structure
 230  * can actually be freed.  This sort of synchronization is an alternative to
 231  * ref-counting the netif, removing the ref counting overhead in favor of
 232  * placing additional overhead in the netif freeing sequence (where it is
 233  * inconsequential).
 234  */
 235 void
 236 netmsg_service_sync(void)
 237 {
 238         struct netmsg_port_registration *reg;
 239         struct netmsg_base smsg;
 240
 241         netmsg_init(&smsg, NULL, &curthread->td_msgport, 0, netmsg_sync_func);
 242
 243         TAILQ_FOREACH(reg, &netreglist, npr_entry) {
 244                 lwkt_domsg(reg->npr_port, &smsg.lmsg, 0);
 245         }
 246 }
 247
 248 /*
 249  * The netmsg function simply replies the message.  API semantics require
 250  * EASYNC to be returned if the netmsg function disposes of the message.
 251  */
 252 static void
 253 netmsg_sync_func(netmsg_t msg)
 254 {
 255         lwkt_replymsg(&msg->lmsg, 0);
 256 }
 257
 258 /*
 259  * Generic netmsg service loop.  Some protocols may roll their own but all
 260  * must do the basic command dispatch function call done here.
 261  */
 262 static void
 263 netmsg_service_loop(void *arg)
 264 {
 265         struct netmsg_rollup *ru;
 266         netmsg_base_t msg;
 267         thread_t td = curthread;;
 268         int limit;
 269
 270         while ((msg = lwkt_waitport(&td->td_msgport, 0))) {
 271                 /*
 272                  * Run up to 512 pending netmsgs.
 273                  */
 274                 limit = 512;
 275                 do {
 276                         KASSERT(msg->nm_dispatch != NULL,
 277                                 ("netmsg_service isr %d badmsg\n",
 278                                 msg->lmsg.u.ms_result));
 279                         if (msg->nm_so &&
 280                             msg->nm_so->so_port != &td->td_msgport) {
 281                                 /*
 282                                  * Sockets undergoing connect or disconnect
 283                                  * ops can change ports on us.  Chase the
 284                                  * port.
 285                                  */
 286                                 kprintf("netmsg_service_loop: Warning, "
 287                                         "port changed so=%p\n", msg->nm_so);
 288                                 lwkt_forwardmsg(msg->nm_so->so_port,
 289                                                 &msg->lmsg);
 290                         } else {
 291                                 /*
 292                                  * We are on the correct port, dispatch it.
 293                                  */
 294                                 msg->nm_dispatch((netmsg_t)msg);
 295                         }
 296                         if (--limit == 0)
 297                                 break;
 298                 } while ((msg = lwkt_getport(&td->td_msgport)) != NULL);
 299
 300                 /*
 301                  * Run all registered rollup functions for this cpu
 302                  * (e.g. tcp_willblock()).
 303                  */
 304                 TAILQ_FOREACH(ru, &netrulist, ru_entry)
 305                         ru->ru_func();
 306         }
 307 }
 308
 309 /*
 310  * Forward a packet to a netisr service function.
 311  *
 312  * If the packet has not been assigned to a protocol thread we call
 313  * the port characterization function to assign it.  The caller must
 314  * clear M_HASH (or not have set it in the first place) if the caller
 315  * wishes the packet to be recharacterized.
 316  */
 317 int
 318 netisr_queue(int num, struct mbuf *m)
 319 {
 320         struct netisr *ni;
 321         struct netmsg_packet *pmsg;
 322         lwkt_port_t port;
 323
 324         KASSERT((num > 0 && num <= (sizeof(netisrs)/sizeof(netisrs[0]))),
 325                 ("Bad isr %d", num));
 326
 327         ni = &netisrs[num];
 328         if (ni->ni_handler == NULL) {
 329                 kprintf("Unregistered isr %d\n", num);
 330                 m_freem(m);
 331                 return (EIO);
 332         }
 333
 334         /*
 335          * Figure out which protocol thread to send to.  This does not
 336          * have to be perfect but performance will be really good if it
 337          * is correct.  Major protocol inputs such as ip_input() will
 338          * re-characterize the packet as necessary.
 339          */
 340         if ((m->m_flags & M_HASH) == 0) {
 341                 ni->ni_cpufn(&m, 0);
 342                 if (m == NULL) {
 343                         m_freem(m);
 344                         return (EIO);
 345                 }
 346                 if ((m->m_flags & M_HASH) == 0) {
 347                         kprintf("netisr_queue(%d): packet hash failed\n", num);
 348                         m_freem(m);
 349                         return (EIO);
 350                 }
 351         }
 352
 353         /*
 354          * Get the protocol port based on the packet hash, initialize
 355          * the netmsg, and send it off.
 356          */
 357         port = cpu_portfn(m->m_pkthdr.hash);
 358         pmsg = &m->m_hdr.mh_netmsg;
 359         netmsg_init(&pmsg->base, NULL, &netisr_apanic_rport,
 360                     0, ni->ni_handler);
 361         pmsg->nm_packet = m;
 362         pmsg->base.lmsg.u.ms_result = num;
 363         lwkt_sendmsg(port, &pmsg->base.lmsg);
 364
 365         return (0);
 366 }
 367
 368 /*
 369  * Pre-characterization of a deeper portion of the packet for the
 370  * requested isr.
 371  *
 372  * The base of the ISR type (e.g. IP) that we want to characterize is
 373  * at (hoff) relative to the beginning of the mbuf.  This allows
 374  * e.g. ether_input_chain() to not have to adjust the m_data/m_len.
 375  */
 376 void
 377 netisr_characterize(int num, struct mbuf **mp, int hoff)
 378 {
 379         struct netisr *ni;
 380         struct mbuf *m;
 381
 382         /*
 383          * Validation
 384          */
 385         m = *mp;
 386         KKASSERT(m != NULL);
 387
 388         if (num < 0 || num >= NETISR_MAX) {
 389                 if (num == NETISR_MAX) {
 390                         m->m_flags |= M_HASH;
 391                         m->m_pkthdr.hash = 0;
 392                         return;
 393                 }
 394                 panic("Bad isr %d", num);
 395         }
 396
 397         /*
 398          * Valid netisr?
 399          */
 400         ni = &netisrs[num];
 401         if (ni->ni_handler == NULL) {
 402                 kprintf("Unregistered isr %d\n", num);
 403                 m_freem(m);
 404                 *mp = NULL;
 405         }
 406
 407         /*
 408          * Characterize the packet
 409          */
 410         if ((m->m_flags & M_HASH) == 0) {
 411                 ni->ni_cpufn(mp, hoff);
 412                 m = *mp;
 413                 if (m && (m->m_flags & M_HASH) == 0)
 414                         kprintf("netisr_queue(%d): packet hash failed\n", num);
 415         }
 416 }
 417
 418 void
 419 netisr_register(int num, netisr_fn_t handler, netisr_cpufn_t cpufn)
 420 {
 421         struct netisr *ni;
 422
 423         KASSERT((num > 0 && num <= (sizeof(netisrs)/sizeof(netisrs[0]))),
 424                 ("netisr_register: bad isr %d", num));
 425         KKASSERT(handler != NULL);
 426
 427         if (cpufn == NULL)
 428                 cpufn = cpu0_cpufn;
 429
 430         ni = &netisrs[num];
 431
 432         ni->ni_handler = handler;
 433         ni->ni_cpufn = cpufn;
 434         netmsg_init(&ni->ni_netmsg, NULL, &netisr_adone_rport, 0, NULL);
 435 }
 436
 437 void
 438 netisr_register_rollup(netisr_ru_t ru_func)
 439 {
 440         struct netmsg_rollup *ru;
 441
 442         ru = kmalloc(sizeof(*ru), M_TEMP, M_WAITOK|M_ZERO);
 443         ru->ru_func = ru_func;
 444         TAILQ_INSERT_TAIL(&netrulist, ru, ru_entry);
 445 }
 446
 447 /*
 448  * Return the message port for the general protocol message servicing
 449  * thread for a particular cpu.
 450  */
 451 lwkt_port_t
 452 cpu_portfn(int cpu)
 453 {
 454         KKASSERT(cpu >= 0 && cpu < ncpus);
 455         return (&netisr_cpu[cpu].td_msgport);
 456 }
 457
 458 /*
 459  * Return the current cpu's network protocol thread.
 460  */
 461 lwkt_port_t
 462 cur_netport(void)
 463 {
 464         return(cpu_portfn(mycpu->gd_cpuid));
 465 }
 466
 467 /*
 468  * Return a default protocol control message processing thread port
 469  */
 470 lwkt_port_t
 471 cpu0_ctlport(int cmd __unused, struct sockaddr *sa __unused,
 472              void *extra __unused)
 473 {
 474         return (&netisr_cpu[0].td_msgport);
 475 }
 476
 477 /*
 478  * This is a default netisr packet characterization function which
 479  * sets M_HASH.  If a netisr is registered with a NULL cpufn function
 480  * this one is assigned.
 481  *
 482  * This function makes no attempt to validate the packet.
 483  */
 484 static void
 485 cpu0_cpufn(struct mbuf **mp, int hoff __unused)
 486 {
 487         struct mbuf *m = *mp;
 488
 489         m->m_flags |= M_HASH;
 490         m->m_pkthdr.hash = 0;
 491 }
 492
 493 /*
 494  * schednetisr() is used to call the netisr handler from the appropriate
 495  * netisr thread for polling and other purposes.
 496  *
 497  * This function may be called from a hard interrupt or IPI and must be
 498  * MP SAFE and non-blocking.  We use a fixed per-cpu message instead of
 499  * trying to allocate one.  We must get ourselves onto the target cpu
 500  * to safely check the MSGF_DONE bit on the message but since the message
 501  * will be sent to that cpu anyway this does not add any extra work beyond
 502  * what lwkt_sendmsg() would have already had to do to schedule the target
 503  * thread.
 504  */
 505 static void
 506 schednetisr_remote(void *data)
 507 {
 508         int num = (int)(intptr_t)data;
 509         struct netisr *ni = &netisrs[num];
 510         lwkt_port_t port = &netisr_cpu[0].td_msgport;
 511         netmsg_base_t pmsg;
 512
 513         pmsg = &netisrs[num].ni_netmsg;
 514         if (pmsg->lmsg.ms_flags & MSGF_DONE) {
 515                 netmsg_init(pmsg, NULL, &netisr_adone_rport, 0, ni->ni_handler);
 516                 pmsg->lmsg.u.ms_result = num;
 517                 lwkt_sendmsg(port, &pmsg->lmsg);
 518         }
 519 }
 520
 521 void
 522 schednetisr(int num)
 523 {
 524         KASSERT((num > 0 && num <= (sizeof(netisrs)/sizeof(netisrs[0]))),
 525                 ("schednetisr: bad isr %d", num));
 526         KKASSERT(netisrs[num].ni_handler != NULL);
 527 #ifdef SMP
 528         if (mycpu->gd_cpuid != 0) {
 529                 lwkt_send_ipiq(globaldata_find(0),
 530                                schednetisr_remote, (void *)(intptr_t)num);
 531         } else {
 532                 crit_enter();
 533                 schednetisr_remote((void *)(intptr_t)num);
 534                 crit_exit();
 535         }
 536 #else
 537         crit_enter();
 538         schednetisr_remote((void *)(intptr_t)num);
 539         crit_exit();
 540 #endif
 541 }