sys/kern/uipc_mbuf.c

   1 /*
   2  * Copyright (c) 2004 Jeffrey M. Hsu.  All rights reserved.
   3  * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
   4  *
   5  * This code is derived from software contributed to The DragonFly Project
   6  * by Jeffrey M. Hsu.
   7  *
   8  * Redistribution and use in source and binary forms, with or without
   9  * modification, are permitted provided that the following conditions
  10  * are met:
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in the
  15  *    documentation and/or other materials provided with the distribution.
  16  * 3. Neither the name of The DragonFly Project nor the names of its
  17  *    contributors may be used to endorse or promote products derived
  18  *    from this software without specific, prior written permission.
  19  *
  20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  23  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
  24  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  25  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
  26  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  27  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  28  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  29  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  30  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  31  * SUCH DAMAGE.
  32  */
  33
  34 /*
  35  * Copyright (c) 1982, 1986, 1988, 1991, 1993
  36  *      The Regents of the University of California.  All rights reserved.
  37  *
  38  * Redistribution and use in source and binary forms, with or without
  39  * modification, are permitted provided that the following conditions
  40  * are met:
  41  * 1. Redistributions of source code must retain the above copyright
  42  *    notice, this list of conditions and the following disclaimer.
  43  * 2. Redistributions in binary form must reproduce the above copyright
  44  *    notice, this list of conditions and the following disclaimer in the
  45  *    documentation and/or other materials provided with the distribution.
  46  * 3. All advertising materials mentioning features or use of this software
  47  *    must display the following acknowledgement:
  48  *      This product includes software developed by the University of
  49  *      California, Berkeley and its contributors.
  50  * 4. Neither the name of the University nor the names of its contributors
  51  *    may be used to endorse or promote products derived from this software
  52  *    without specific prior written permission.
  53  *
  54  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  55  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  56  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  57  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  58  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  59  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  60  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  61  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  62  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  63  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  64  * SUCH DAMAGE.
  65  *
  66  * @(#)uipc_mbuf.c      8.2 (Berkeley) 1/4/94
  67  * $FreeBSD: src/sys/kern/uipc_mbuf.c,v 1.51.2.24 2003/04/15 06:59:29 silby Exp $
  68  * $DragonFly: src/sys/kern/uipc_mbuf.c,v 1.61 2007/04/30 07:18:54 dillon Exp $
  69  */
  70
  71 #include "opt_param.h"
  72 #include "opt_ddb.h"
  73 #include "opt_mbuf_stress_test.h"
  74 #include <sys/param.h>
  75 #include <sys/systm.h>
  76 #include <sys/malloc.h>
  77 #include <sys/mbuf.h>
  78 #include <sys/kernel.h>
  79 #include <sys/sysctl.h>
  80 #include <sys/domain.h>
  81 #include <sys/objcache.h>
  82 #include <sys/protosw.h>
  83 #include <sys/uio.h>
  84 #include <sys/thread.h>
  85 #include <sys/globaldata.h>
  86 #include <sys/serialize.h>
  87 #include <sys/thread2.h>
  88
  89 #include <vm/vm.h>
  90 #include <vm/vm_kern.h>
  91 #include <vm/vm_extern.h>
  92
  93 #ifdef INVARIANTS
  94 #include <machine/cpu.h>
  95 #endif
  96
/*
 * mbuf cluster meta-data
 *
 * One of these describes each cluster data buffer: the buffer itself, the
 * number of references currently held on it, and a serializer that
 * m_mclfree() takes when dropping a reference that may not be the last.
 */
struct mbcluster {
	int32_t	mcl_refs;	/* references held on mcl_data */
	void	*mcl_data;	/* MCLBYTES buffer allocated by mclmeta_ctor() */
	struct lwkt_serialize mcl_serializer;	/* see m_mclfree() */
};
 105
 106 static void mbinit(void *);
 107 SYSINIT(mbuf, SI_BOOT2_MACHDEP, SI_ORDER_FIRST, mbinit, NULL)
 108
 109 static u_long   mbtypes[MT_NTYPES];
 110
 111 struct mbstat mbstat;
 112 int     max_linkhdr;
 113 int     max_protohdr;
 114 int     max_hdr;
 115 int     max_datalen;
 116 int     m_defragpackets;
 117 int     m_defragbytes;
 118 int     m_defraguseless;
 119 int     m_defragfailure;
 120 #ifdef MBUF_STRESS_TEST
 121 int     m_defragrandomfailures;
 122 #endif
 123
 124 struct objcache *mbuf_cache, *mbufphdr_cache;
 125 struct objcache *mclmeta_cache;
 126 struct objcache *mbufcluster_cache, *mbufphdrcluster_cache;
 127
 128 int     nmbclusters;
 129 int     nmbufs;
 130
 131 SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RW,
 132            &max_linkhdr, 0, "");
 133 SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RW,
 134            &max_protohdr, 0, "");
 135 SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RW, &max_hdr, 0, "");
 136 SYSCTL_INT(_kern_ipc, KIPC_MAX_DATALEN, max_datalen, CTLFLAG_RW,
 137            &max_datalen, 0, "");
 138 SYSCTL_INT(_kern_ipc, OID_AUTO, mbuf_wait, CTLFLAG_RW,
 139            &mbuf_wait, 0, "");
 140 SYSCTL_STRUCT(_kern_ipc, KIPC_MBSTAT, mbstat, CTLFLAG_RW, &mbstat, mbstat, "");
 141 SYSCTL_OPAQUE(_kern_ipc, OID_AUTO, mbtypes, CTLFLAG_RD, mbtypes,
 142            sizeof(mbtypes), "LU", "");
 143 SYSCTL_INT(_kern_ipc, KIPC_NMBCLUSTERS, nmbclusters, CTLFLAG_RW,
 144            &nmbclusters, 0, "Maximum number of mbuf clusters available");
 145 SYSCTL_INT(_kern_ipc, OID_AUTO, nmbufs, CTLFLAG_RW, &nmbufs, 0,
 146            "Maximum number of mbufs available");
 147
 148 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragpackets, CTLFLAG_RD,
 149            &m_defragpackets, 0, "");
 150 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragbytes, CTLFLAG_RD,
 151            &m_defragbytes, 0, "");
 152 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defraguseless, CTLFLAG_RD,
 153            &m_defraguseless, 0, "");
 154 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragfailure, CTLFLAG_RD,
 155            &m_defragfailure, 0, "");
 156 #ifdef MBUF_STRESS_TEST
 157 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragrandomfailures, CTLFLAG_RW,
 158            &m_defragrandomfailures, 0, "");
 159 #endif
 160
 161 static MALLOC_DEFINE(M_MBUF, "mbuf", "mbuf");
 162 static MALLOC_DEFINE(M_MBUFCL, "mbufcl", "mbufcl");
 163 static MALLOC_DEFINE(M_MCLMETA, "mclmeta", "mclmeta");
 164
 165 static void m_reclaim (void);
 166 static void m_mclref(void *arg);
 167 static void m_mclfree(void *arg);
 168
 169 #ifndef NMBCLUSTERS
 170 #define NMBCLUSTERS     (512 + maxusers * 16)
 171 #endif
 172 #ifndef NMBUFS
 173 #define NMBUFS          (nmbclusters * 2)
 174 #endif
 175
 176 /*
 177  * Perform sanity checks of tunables declared above.
 178  */
 179 static void
 180 tunable_mbinit(void *dummy)
 181 {
 182         /*
 183          * This has to be done before VM init.
 184          */
 185         nmbclusters = NMBCLUSTERS;
 186         TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters);
 187         nmbufs = NMBUFS;
 188         TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs);
 189         /* Sanity checks */
 190         if (nmbufs < nmbclusters * 2)
 191                 nmbufs = nmbclusters * 2;
 192 }
 193 SYSINIT(tunable_mbinit, SI_BOOT1_TUNABLES, SI_ORDER_ANY,
 194         tunable_mbinit, NULL);
 195
/* "number of clusters of pages" */
#define NCL_INIT	1

/*
 * NOTE(review): neither NCL_INIT nor NMB_INIT is referenced in the part of
 * the file reviewed here; presumably leftover initial-allocation counts.
 * Confirm against the rest of the tree before removing.
 */
#define NMB_INIT	16
 200
 201 /*
 202  * The mbuf object cache only guarantees that m_next and m_nextpkt are
 203  * NULL and that m_data points to the beginning of the data area.  In
 204  * particular, m_len and m_pkthdr.len are uninitialized.  It is the
 205  * responsibility of the caller to initialize those fields before use.
 206  */
 207
 208 static boolean_t __inline
 209 mbuf_ctor(void *obj, void *private, int ocflags)
 210 {
 211         struct mbuf *m = obj;
 212
 213         m->m_next = NULL;
 214         m->m_nextpkt = NULL;
 215         m->m_data = m->m_dat;
 216         m->m_flags = 0;
 217
 218         return (TRUE);
 219 }
 220
 221 /*
 222  * Initialize the mbuf and the packet header fields.
 223  */
 224 static boolean_t
 225 mbufphdr_ctor(void *obj, void *private, int ocflags)
 226 {
 227         struct mbuf *m = obj;
 228
 229         m->m_next = NULL;
 230         m->m_nextpkt = NULL;
 231         m->m_data = m->m_pktdat;
 232         m->m_flags = M_PKTHDR | M_PHCACHE;
 233
 234         m->m_pkthdr.rcvif = NULL;       /* eliminate XXX JH */
 235         SLIST_INIT(&m->m_pkthdr.tags);
 236         m->m_pkthdr.csum_flags = 0;     /* eliminate XXX JH */
 237         m->m_pkthdr.fw_flags = 0;       /* eliminate XXX JH */
 238
 239         return (TRUE);
 240 }
 241
 242 /*
 243  * A mbcluster object consists of 2K (MCLBYTES) cluster and a refcount.
 244  */
 245 static boolean_t
 246 mclmeta_ctor(void *obj, void *private, int ocflags)
 247 {
 248         struct mbcluster *cl = obj;
 249         void *buf;
 250
 251         if (ocflags & M_NOWAIT)
 252                 buf = kmalloc(MCLBYTES, M_MBUFCL, M_NOWAIT | M_ZERO);
 253         else
 254                 buf = kmalloc(MCLBYTES, M_MBUFCL, M_INTWAIT | M_ZERO);
 255         if (buf == NULL)
 256                 return (FALSE);
 257         cl->mcl_refs = 0;
 258         cl->mcl_data = buf;
 259         lwkt_serialize_init(&cl->mcl_serializer);
 260         return (TRUE);
 261 }
 262
 263 static void
 264 mclmeta_dtor(void *obj, void *private)
 265 {
 266         struct mbcluster *mcl = obj;
 267
 268         KKASSERT(mcl->mcl_refs == 0);
 269         kfree(mcl->mcl_data, M_MBUFCL);
 270 }
 271
 272 static void
 273 linkcluster(struct mbuf *m, struct mbcluster *cl)
 274 {
 275         /*
 276          * Add the cluster to the mbuf.  The caller will detect that the
 277          * mbuf now has an attached cluster.
 278          */
 279         m->m_ext.ext_arg = cl;
 280         m->m_ext.ext_buf = cl->mcl_data;
 281         m->m_ext.ext_ref = m_mclref;
 282         m->m_ext.ext_free = m_mclfree;
 283         m->m_ext.ext_size = MCLBYTES;
 284         atomic_add_int(&cl->mcl_refs, 1);
 285
 286         m->m_data = m->m_ext.ext_buf;
 287         m->m_flags |= M_EXT | M_EXT_CLUSTER;
 288 }
 289
 290 static boolean_t
 291 mbufphdrcluster_ctor(void *obj, void *private, int ocflags)
 292 {
 293         struct mbuf *m = obj;
 294         struct mbcluster *cl;
 295
 296         mbufphdr_ctor(obj, private, ocflags);
 297         cl = objcache_get(mclmeta_cache, ocflags);
 298         if (cl == NULL)
 299                 return (FALSE);
 300         m->m_flags |= M_CLCACHE;
 301         linkcluster(m, cl);
 302         return (TRUE);
 303 }
 304
 305 static boolean_t
 306 mbufcluster_ctor(void *obj, void *private, int ocflags)
 307 {
 308         struct mbuf *m = obj;
 309         struct mbcluster *cl;
 310
 311         mbuf_ctor(obj, private, ocflags);
 312         cl = objcache_get(mclmeta_cache, ocflags);
 313         if (cl == NULL)
 314                 return (FALSE);
 315         m->m_flags |= M_CLCACHE;
 316         linkcluster(m, cl);
 317         return (TRUE);
 318 }
 319
 320 /*
 321  * Used for both the cluster and cluster PHDR caches.
 322  *
 323  * The mbuf may have lost its cluster due to sharing, deal
 324  * with the situation by checking M_EXT.
 325  */
 326 static void
 327 mbufcluster_dtor(void *obj, void *private)
 328 {
 329         struct mbuf *m = obj;
 330         struct mbcluster *mcl;
 331
 332         if (m->m_flags & M_EXT) {
 333                 KKASSERT((m->m_flags & M_EXT_CLUSTER) != 0);
 334                 mcl = m->m_ext.ext_arg;
 335                 KKASSERT(mcl->mcl_refs == 1);
 336                 mcl->mcl_refs = 0;
 337                 objcache_put(mclmeta_cache, mcl);
 338         }
 339 }
 340
/*
 * Allocator arguments handed to objcache_create() in mbinit(): an object
 * size paired with a malloc type.  mbuf_malloc_args is shared by all four
 * mbuf caches, mclmeta_malloc_args is used by the cluster meta-data cache.
 */
struct objcache_malloc_args mbuf_malloc_args = { MSIZE, M_MBUF };
struct objcache_malloc_args mclmeta_malloc_args =
	{ sizeof(struct mbcluster), M_MCLMETA };
 344
 345 /* ARGSUSED*/
 346 static void
 347 mbinit(void *dummy)
 348 {
 349         mbstat.m_msize = MSIZE;
 350         mbstat.m_mclbytes = MCLBYTES;
 351         mbstat.m_minclsize = MINCLSIZE;
 352         mbstat.m_mlen = MLEN;
 353         mbstat.m_mhlen = MHLEN;
 354
 355         mbuf_cache = objcache_create("mbuf", nmbufs, 0,
 356             mbuf_ctor, NULL, NULL,
 357             objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args);
 358         mbufphdr_cache = objcache_create("mbuf pkt hdr", nmbufs, 64,
 359             mbufphdr_ctor, NULL, NULL,
 360             objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args);
 361         mclmeta_cache = objcache_create("cluster mbuf", nmbclusters , 0,
 362             mclmeta_ctor, mclmeta_dtor, NULL,
 363             objcache_malloc_alloc, objcache_malloc_free, &mclmeta_malloc_args);
 364         mbufcluster_cache = objcache_create("mbuf + cluster", nmbclusters, 0,
 365             mbufcluster_ctor, mbufcluster_dtor, NULL,
 366             objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args);
 367         mbufphdrcluster_cache = objcache_create("mbuf pkt hdr + cluster",
 368             nmbclusters, 64, mbufphdrcluster_ctor, mbufcluster_dtor, NULL,
 369             objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args);
 370         return;
 371 }
 372
 373 /*
 374  * Return the number of references to this mbuf's data.  0 is returned
 375  * if the mbuf is not M_EXT, a reference count is returned if it is
 376  * M_EXT | M_EXT_CLUSTER, and 99 is returned if it is a special M_EXT.
 377  */
 378 int
 379 m_sharecount(struct mbuf *m)
 380 {
 381         switch (m->m_flags & (M_EXT | M_EXT_CLUSTER)) {
 382         case 0:
 383                 return (0);
 384         case M_EXT:
 385                 return (99);
 386         case M_EXT | M_EXT_CLUSTER:
 387                 return (((struct mbcluster *)m->m_ext.ext_arg)->mcl_refs);
 388         }
 389         /* NOTREACHED */
 390         return (0);             /* to shut up compiler */
 391 }
 392
 393 /*
 394  * change mbuf to new type
 395  */
 396 void
 397 m_chtype(struct mbuf *m, int type)
 398 {
 399         crit_enter();
 400         ++mbtypes[type];
 401         --mbtypes[m->m_type];
 402         m->m_type = type;
 403         crit_exit();
 404 }
 405
 406 static void
 407 m_reclaim(void)
 408 {
 409         struct domain *dp;
 410         struct protosw *pr;
 411
 412         crit_enter();
 413         SLIST_FOREACH(dp, &domains, dom_next) {
 414                 for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) {
 415                         if (pr->pr_drain)
 416                                 (*pr->pr_drain)();
 417                 }
 418         }
 419         crit_exit();
 420         mbstat.m_drain++;
 421 }
 422
 423 static void __inline
 424 updatestats(struct mbuf *m, int type)
 425 {
 426         m->m_type = type;
 427
 428         crit_enter();
 429         ++mbtypes[type];
 430         ++mbstat.m_mbufs;
 431         crit_exit();
 432 }
 433
 434 /*
 435  * Allocate an mbuf.
 436  */
 437 struct mbuf *
 438 m_get(int how, int type)
 439 {
 440         struct mbuf *m;
 441         int ntries = 0;
 442         int ocf = MBTOM(how);
 443
 444 retryonce:
 445
 446         m = objcache_get(mbuf_cache, ocf);
 447
 448         if (m == NULL) {
 449                 if ((how & MB_TRYWAIT) && ntries++ == 0) {
 450                         struct objcache *reclaimlist[] = {
 451                                 mbufphdr_cache,
 452                                 mbufcluster_cache, mbufphdrcluster_cache
 453                         };
 454                         const int nreclaims = __arysize(reclaimlist);
 455
 456                         if (!objcache_reclaimlist(reclaimlist, nreclaims, ocf))
 457                                 m_reclaim();
 458                         goto retryonce;
 459                 }
 460                 return (NULL);
 461         }
 462
 463         updatestats(m, type);
 464         return (m);
 465 }
 466
 467 struct mbuf *
 468 m_gethdr(int how, int type)
 469 {
 470         struct mbuf *m;
 471         int ocf = MBTOM(how);
 472         int ntries = 0;
 473
 474 retryonce:
 475
 476         m = objcache_get(mbufphdr_cache, ocf);
 477
 478         if (m == NULL) {
 479                 if ((how & MB_TRYWAIT) && ntries++ == 0) {
 480                         struct objcache *reclaimlist[] = {
 481                                 mbuf_cache,
 482                                 mbufcluster_cache, mbufphdrcluster_cache
 483                         };
 484                         const int nreclaims = __arysize(reclaimlist);
 485
 486                         if (!objcache_reclaimlist(reclaimlist, nreclaims, ocf))
 487                                 m_reclaim();
 488                         goto retryonce;
 489                 }
 490                 return (NULL);
 491         }
 492
 493         updatestats(m, type);
 494         return (m);
 495 }
 496
 497 /*
 498  * Get a mbuf (not a mbuf cluster!) and zero it.
 499  * Deprecated.
 500  */
 501 struct mbuf *
 502 m_getclr(int how, int type)
 503 {
 504         struct mbuf *m;
 505
 506         m = m_get(how, type);
 507         if (m != NULL)
 508                 bzero(m->m_data, MLEN);
 509         return (m);
 510 }
 511
 512 /*
 513  * Returns an mbuf with an attached cluster.
 514  * Because many network drivers use this kind of buffers a lot, it is
 515  * convenient to keep a small pool of free buffers of this kind.
 516  * Even a small size such as 10 gives about 10% improvement in the
 517  * forwarding rate in a bridge or router.
 518  */
 519 struct mbuf *
 520 m_getcl(int how, short type, int flags)
 521 {
 522         struct mbuf *m;
 523         int ocflags = MBTOM(how);
 524         int ntries = 0;
 525
 526 retryonce:
 527
 528         if (flags & M_PKTHDR)
 529                 m = objcache_get(mbufphdrcluster_cache, ocflags);
 530         else
 531                 m = objcache_get(mbufcluster_cache, ocflags);
 532
 533         if (m == NULL) {
 534                 if ((how & MB_TRYWAIT) && ntries++ == 0) {
 535                         struct objcache *reclaimlist[1];
 536
 537                         if (flags & M_PKTHDR)
 538                                 reclaimlist[0] = mbufcluster_cache;
 539                         else
 540                                 reclaimlist[0] = mbufphdrcluster_cache;
 541                         if (!objcache_reclaimlist(reclaimlist, 1, ocflags))
 542                                 m_reclaim();
 543                         goto retryonce;
 544                 }
 545                 return (NULL);
 546         }
 547
 548         m->m_type = type;
 549
 550         crit_enter();
 551         ++mbtypes[type];
 552         ++mbstat.m_clusters;
 553         crit_exit();
 554         return (m);
 555 }
 556
 557 /*
 558  * Allocate chain of requested length.
 559  */
 560 struct mbuf *
 561 m_getc(int len, int how, int type)
 562 {
 563         struct mbuf *n, *nfirst = NULL, **ntail = &nfirst;
 564         int nsize;
 565
 566         while (len > 0) {
 567                 n = m_getl(len, how, type, 0, &nsize);
 568                 if (n == NULL)
 569                         goto failed;
 570                 n->m_len = 0;
 571                 *ntail = n;
 572                 ntail = &n->m_next;
 573                 len -= nsize;
 574         }
 575         return (nfirst);
 576
 577 failed:
 578         m_freem(nfirst);
 579         return (NULL);
 580 }
 581
 582 /*
 583  * Allocate len-worth of mbufs and/or mbuf clusters (whatever fits best)
 584  * and return a pointer to the head of the allocated chain. If m0 is
 585  * non-null, then we assume that it is a single mbuf or an mbuf chain to
 586  * which we want len bytes worth of mbufs and/or clusters attached, and so
 587  * if we succeed in allocating it, we will just return a pointer to m0.
 588  *
 589  * If we happen to fail at any point during the allocation, we will free
 590  * up everything we have already allocated and return NULL.
 591  *
 592  * Deprecated.  Use m_getc() and m_cat() instead.
 593  */
 594 struct mbuf *
 595 m_getm(struct mbuf *m0, int len, int type, int how)
 596 {
 597         struct mbuf *nfirst;
 598
 599         nfirst = m_getc(len, how, type);
 600
 601         if (m0 != NULL) {
 602                 m_last(m0)->m_next = nfirst;
 603                 return (m0);
 604         }
 605
 606         return (nfirst);
 607 }
 608
 609 /*
 610  * Adds a cluster to a normal mbuf, M_EXT is set on success.
 611  * Deprecated.  Use m_getcl() instead.
 612  */
 613 void
 614 m_mclget(struct mbuf *m, int how)
 615 {
 616         struct mbcluster *mcl;
 617
 618         KKASSERT((m->m_flags & M_EXT) == 0);
 619         mcl = objcache_get(mclmeta_cache, MBTOM(how));
 620         if (mcl != NULL) {
 621                 linkcluster(m, mcl);
 622                 crit_enter();
 623                 ++mbstat.m_clusters;
 624                 /* leave the m_mbufs count intact for original mbuf */
 625                 crit_exit();
 626         }
 627 }
 628
 629 /*
 630  * Updates to mbcluster must be MPSAFE.  Only an entity which already has
 631  * a reference to the cluster can ref it, so we are in no danger of
 632  * racing an add with a subtract.  But the operation must still be atomic
 633  * since multiple entities may have a reference on the cluster.
 634  *
 635  * m_mclfree() is almost the same but it must contend with two entities
 636  * freeing the cluster at the same time.  If there is only one reference
 637  * count we are the only entity referencing the cluster and no further
 638  * locking is required.  Otherwise we must protect against a race to 0
 639  * with the serializer.
 640  */
 641 static void
 642 m_mclref(void *arg)
 643 {
 644         struct mbcluster *mcl = arg;
 645
 646         atomic_add_int(&mcl->mcl_refs, 1);
 647 }
 648
 649 static void
 650 m_mclfree(void *arg)
 651 {
 652         struct mbcluster *mcl = arg;
 653
 654         if (mcl->mcl_refs == 1) {
 655                 mcl->mcl_refs = 0;
 656                 objcache_put(mclmeta_cache, mcl);
 657         } else {
 658                 lwkt_serialize_enter(&mcl->mcl_serializer);
 659                 if (mcl->mcl_refs > 1) {
 660                         atomic_subtract_int(&mcl->mcl_refs, 1);
 661                         lwkt_serialize_exit(&mcl->mcl_serializer);
 662                 } else {
 663                         lwkt_serialize_exit(&mcl->mcl_serializer);
 664                         KKASSERT(mcl->mcl_refs == 1);
 665                         mcl->mcl_refs = 0;
 666                         objcache_put(mclmeta_cache, mcl);
 667                 }
 668         }
 669 }
 670
 671 extern void db_print_backtrace(void);
 672
 673 /*
 674  * Free a single mbuf and any associated external storage.  The successor,
 675  * if any, is returned.
 676  *
 677  * We do need to check non-first mbuf for m_aux, since some of existing
 678  * code does not call M_PREPEND properly.
 679  * (example: call to bpf_mtap from drivers)
 680  */
 681 struct mbuf *
 682 m_free(struct mbuf *m)
 683 {
 684         struct mbuf *n;
 685
 686         KASSERT(m->m_type != MT_FREE, ("freeing free mbuf %p", m));
 687         --mbtypes[m->m_type];
 688
 689         n = m->m_next;
 690
 691         /*
 692          * Make sure the mbuf is in constructed state before returning it
 693          * to the objcache.
 694          */
 695         m->m_next = NULL;
 696 #ifdef notyet
 697         KKASSERT(m->m_nextpkt == NULL);
 698 #else
 699         if (m->m_nextpkt != NULL) {
 700 #ifdef DDB
 701                 static int afewtimes = 10;
 702
 703                 if (afewtimes-- > 0) {
 704                         kprintf("mfree: m->m_nextpkt != NULL\n");
 705                         db_print_backtrace();
 706                 }
 707 #endif
 708                 m->m_nextpkt = NULL;
 709         }
 710 #endif
 711         if (m->m_flags & M_PKTHDR) {
 712                 m_tag_delete_chain(m);          /* eliminate XXX JH */
 713         }
 714
 715         m->m_flags &= (M_EXT | M_EXT_CLUSTER | M_CLCACHE | M_PHCACHE);
 716
 717         /*
 718          * Clean the M_PKTHDR state so we can return the mbuf to its original
 719          * cache.  This is based on the PHCACHE flag which tells us whether
 720          * the mbuf was originally allocated out of a packet-header cache
 721          * or a non-packet-header cache.
 722          */
 723         if (m->m_flags & M_PHCACHE) {
 724                 m->m_flags |= M_PKTHDR;
 725                 m->m_pkthdr.rcvif = NULL;       /* eliminate XXX JH */
 726                 m->m_pkthdr.csum_flags = 0;     /* eliminate XXX JH */
 727                 m->m_pkthdr.fw_flags = 0;       /* eliminate XXX JH */
 728                 SLIST_INIT(&m->m_pkthdr.tags);
 729         }
 730
 731         /*
 732          * Handle remaining flags combinations.  M_CLCACHE tells us whether
 733          * the mbuf was originally allocated from a cluster cache or not,
 734          * and is totally separate from whether the mbuf is currently
 735          * associated with a cluster.
 736          */
 737         crit_enter();
 738         switch(m->m_flags & (M_CLCACHE | M_EXT | M_EXT_CLUSTER)) {
 739         case M_CLCACHE | M_EXT | M_EXT_CLUSTER:
 740                 /*
 741                  * mbuf+cluster cache case.  The mbuf was allocated from the
 742                  * combined mbuf_cluster cache and can be returned to the
 743                  * cache if the cluster hasn't been shared.
 744                  */
 745                 if (m_sharecount(m) == 1) {
 746                         /*
 747                          * The cluster has not been shared, we can just
 748                          * reset the data pointer and return the mbuf
 749                          * to the cluster cache.  Note that the reference
 750                          * count is left intact (it is still associated with
 751                          * an mbuf).
 752                          */
 753                         m->m_data = m->m_ext.ext_buf;
 754                         if (m->m_flags & M_PHCACHE)
 755                                 objcache_put(mbufphdrcluster_cache, m);
 756                         else
 757                                 objcache_put(mbufcluster_cache, m);
 758                         --mbstat.m_clusters;
 759                 } else {
 760                         /*
 761                          * Hell.  Someone else has a ref on this cluster,
 762                          * we have to disconnect it which means we can't
 763                          * put it back into the mbufcluster_cache, we
 764                          * have to destroy the mbuf.
 765                          *
 766                          * Other mbuf references to the cluster will typically
 767                          * be M_EXT | M_EXT_CLUSTER but without M_CLCACHE.
 768                          *
 769                          * XXX we could try to connect another cluster to
 770                          * it.
 771                          */
 772                         m->m_ext.ext_free(m->m_ext.ext_arg);
 773                         m->m_flags &= ~(M_EXT | M_EXT_CLUSTER);
 774                         if (m->m_flags & M_PHCACHE)
 775                                 objcache_dtor(mbufphdrcluster_cache, m);
 776                         else
 777                                 objcache_dtor(mbufcluster_cache, m);
 778                 }
 779                 break;
 780         case M_EXT | M_EXT_CLUSTER:
 781                 /*
 782                  * Normal cluster associated with an mbuf that was allocated
 783                  * from the normal mbuf pool rather then the cluster pool.
 784                  * The cluster has to be independantly disassociated from the
 785                  * mbuf.
 786                  */
 787                 if (m_sharecount(m) == 1)
 788                         --mbstat.m_clusters;
 789                 /* fall through */
 790         case M_EXT:
 791                 /*
 792                  * Normal cluster association case, disconnect the cluster from
 793                  * the mbuf.  The cluster may or may not be custom.
 794                  */
 795                 m->m_ext.ext_free(m->m_ext.ext_arg);
 796                 m->m_flags &= ~(M_EXT | M_EXT_CLUSTER);
 797                 /* fall through */
 798         case 0:
 799                 /*
 800                  * return the mbuf to the mbuf cache.
 801                  */
 802                 if (m->m_flags & M_PHCACHE) {
 803                         m->m_data = m->m_pktdat;
 804                         objcache_put(mbufphdr_cache, m);
 805                 } else {
 806                         m->m_data = m->m_dat;
 807                         objcache_put(mbuf_cache, m);
 808                 }
 809                 --mbstat.m_mbufs;
 810                 break;
 811         default:
 812                 if (!panicstr)
 813                         panic("bad mbuf flags %p %08x\n", m, m->m_flags);
 814                 break;
 815         }
 816         crit_exit();
 817         return (n);
 818 }
 819
 820 void
 821 m_freem(struct mbuf *m)
 822 {
 823         crit_enter();
 824         while (m)
 825                 m = m_free(m);
 826         crit_exit();
 827 }
 828
 829 /*
 830  * mbuf utility routines
 831  */
 832
 833 /*
 834  * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain and
 835  * copy junk along.
 836  */
 837 struct mbuf *
 838 m_prepend(struct mbuf *m, int len, int how)
 839 {
 840         struct mbuf *mn;
 841
 842         if (m->m_flags & M_PKTHDR)
 843             mn = m_gethdr(how, m->m_type);
 844         else
 845             mn = m_get(how, m->m_type);
 846         if (mn == NULL) {
 847                 m_freem(m);
 848                 return (NULL);
 849         }
 850         if (m->m_flags & M_PKTHDR)
 851                 M_MOVE_PKTHDR(mn, m);
 852         mn->m_next = m;
 853         m = mn;
 854         if (len < MHLEN)
 855                 MH_ALIGN(m, len);
 856         m->m_len = len;
 857         return (m);
 858 }
 859
 860 /*
 861  * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
 862  * continuing for "len" bytes.  If len is M_COPYALL, copy to end of mbuf.
 863  * The wait parameter is a choice of MB_WAIT/MB_DONTWAIT from caller.
 864  * Note that the copy is read-only, because clusters are not copied,
 865  * only their reference counts are incremented.
 866  */
 867 struct mbuf *
 868 m_copym(const struct mbuf *m, int off0, int len, int wait)
 869 {
 870         struct mbuf *n, **np;
 871         int off = off0;
 872         struct mbuf *top;
 873         int copyhdr = 0;
 874
 875         KASSERT(off >= 0, ("m_copym, negative off %d", off));
 876         KASSERT(len >= 0, ("m_copym, negative len %d", len));
 877         if (off == 0 && m->m_flags & M_PKTHDR)
 878                 copyhdr = 1;
 879         while (off > 0) {
 880                 KASSERT(m != NULL, ("m_copym, offset > size of mbuf chain"));
 881                 if (off < m->m_len)
 882                         break;
 883                 off -= m->m_len;
 884                 m = m->m_next;
 885         }
 886         np = &top;
 887         top = 0;
 888         while (len > 0) {
 889                 if (m == NULL) {
 890                         KASSERT(len == M_COPYALL,
 891                             ("m_copym, length > size of mbuf chain"));
 892                         break;
 893                 }
 894                 /*
 895                  * Because we are sharing any cluster attachment below,
 896                  * be sure to get an mbuf that does not have a cluster
 897                  * associated with it.
 898                  */
 899                 if (copyhdr)
 900                         n = m_gethdr(wait, m->m_type);
 901                 else
 902                         n = m_get(wait, m->m_type);
 903                 *np = n;
 904                 if (n == NULL)
 905                         goto nospace;
 906                 if (copyhdr) {
 907                         if (!m_dup_pkthdr(n, m, wait))
 908                                 goto nospace;
 909                         if (len == M_COPYALL)
 910                                 n->m_pkthdr.len -= off0;
 911                         else
 912                                 n->m_pkthdr.len = len;
 913                         copyhdr = 0;
 914                 }
 915                 n->m_len = min(len, m->m_len - off);
 916                 if (m->m_flags & M_EXT) {
 917                         KKASSERT((n->m_flags & M_EXT) == 0);
 918                         n->m_data = m->m_data + off;
 919                         m->m_ext.ext_ref(m->m_ext.ext_arg);
 920                         n->m_ext = m->m_ext;
 921                         n->m_flags |= m->m_flags & (M_EXT | M_EXT_CLUSTER);
 922                 } else {
 923                         bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t),
 924                             (unsigned)n->m_len);
 925                 }
 926                 if (len != M_COPYALL)
 927                         len -= n->m_len;
 928                 off = 0;
 929                 m = m->m_next;
 930                 np = &n->m_next;
 931         }
 932         if (top == NULL)
 933                 mbstat.m_mcfail++;
 934         return (top);
 935 nospace:
 936         m_freem(top);
 937         mbstat.m_mcfail++;
 938         return (NULL);
 939 }
 940
 941 /*
 942  * Copy an entire packet, including header (which must be present).
 943  * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'.
 944  * Note that the copy is read-only, because clusters are not copied,
 945  * only their reference counts are incremented.
 946  * Preserve alignment of the first mbuf so if the creator has left
 947  * some room at the beginning (e.g. for inserting protocol headers)
 948  * the copies also have the room available.
 949  */
 950 struct mbuf *
 951 m_copypacket(struct mbuf *m, int how)
 952 {
 953         struct mbuf *top, *n, *o;
 954
 955         n = m_gethdr(how, m->m_type);
 956         top = n;
 957         if (!n)
 958                 goto nospace;
 959
 960         if (!m_dup_pkthdr(n, m, how))
 961                 goto nospace;
 962         n->m_len = m->m_len;
 963         if (m->m_flags & M_EXT) {
 964                 KKASSERT((n->m_flags & M_EXT) == 0);
 965                 n->m_data = m->m_data;
 966                 m->m_ext.ext_ref(m->m_ext.ext_arg);
 967                 n->m_ext = m->m_ext;
 968                 n->m_flags |= m->m_flags & (M_EXT | M_EXT_CLUSTER);
 969         } else {
 970                 n->m_data = n->m_pktdat + (m->m_data - m->m_pktdat );
 971                 bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
 972         }
 973
 974         m = m->m_next;
 975         while (m) {
 976                 o = m_get(how, m->m_type);
 977                 if (!o)
 978                         goto nospace;
 979
 980                 n->m_next = o;
 981                 n = n->m_next;
 982
 983                 n->m_len = m->m_len;
 984                 if (m->m_flags & M_EXT) {
 985                         KKASSERT((n->m_flags & M_EXT) == 0);
 986                         n->m_data = m->m_data;
 987                         m->m_ext.ext_ref(m->m_ext.ext_arg);
 988                         n->m_ext = m->m_ext;
 989                         n->m_flags |= m->m_flags & (M_EXT | M_EXT_CLUSTER);
 990                 } else {
 991                         bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
 992                 }
 993
 994                 m = m->m_next;
 995         }
 996         return top;
 997 nospace:
 998         m_freem(top);
 999         mbstat.m_mcfail++;
1000         return (NULL);
1001 }
1002
1003 /*
1004  * Copy data from an mbuf chain starting "off" bytes from the beginning,
1005  * continuing for "len" bytes, into the indicated buffer.
1006  */
1007 void
1008 m_copydata(const struct mbuf *m, int off, int len, caddr_t cp)
1009 {
1010         unsigned count;
1011
1012         KASSERT(off >= 0, ("m_copydata, negative off %d", off));
1013         KASSERT(len >= 0, ("m_copydata, negative len %d", len));
1014         while (off > 0) {
1015                 KASSERT(m != NULL, ("m_copydata, offset > size of mbuf chain"));
1016                 if (off < m->m_len)
1017                         break;
1018                 off -= m->m_len;
1019                 m = m->m_next;
1020         }
1021         while (len > 0) {
1022                 KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain"));
1023                 count = min(m->m_len - off, len);
1024                 bcopy(mtod(m, caddr_t) + off, cp, count);
1025                 len -= count;
1026                 cp += count;
1027                 off = 0;
1028                 m = m->m_next;
1029         }
1030 }
1031
1032 /*
1033  * Copy a packet header mbuf chain into a completely new chain, including
1034  * copying any mbuf clusters.  Use this instead of m_copypacket() when
1035  * you need a writable copy of an mbuf chain.
1036  */
1037 struct mbuf *
1038 m_dup(struct mbuf *m, int how)
1039 {
1040         struct mbuf **p, *top = NULL;
1041         int remain, moff, nsize;
1042
1043         /* Sanity check */
1044         if (m == NULL)
1045                 return (NULL);
1046         KASSERT((m->m_flags & M_PKTHDR) != 0, ("%s: !PKTHDR", __func__));
1047
1048         /* While there's more data, get a new mbuf, tack it on, and fill it */
1049         remain = m->m_pkthdr.len;
1050         moff = 0;
1051         p = &top;
1052         while (remain > 0 || top == NULL) {     /* allow m->m_pkthdr.len == 0 */
1053                 struct mbuf *n;
1054
1055                 /* Get the next new mbuf */
1056                 n = m_getl(remain, how, m->m_type, top == NULL ? M_PKTHDR : 0,
1057                            &nsize);
1058                 if (n == NULL)
1059                         goto nospace;
1060                 if (top == NULL)
1061                         if (!m_dup_pkthdr(n, m, how))
1062                                 goto nospace0;
1063
1064                 /* Link it into the new chain */
1065                 *p = n;
1066                 p = &n->m_next;
1067
1068                 /* Copy data from original mbuf(s) into new mbuf */
1069                 n->m_len = 0;
1070                 while (n->m_len < nsize && m != NULL) {
1071                         int chunk = min(nsize - n->m_len, m->m_len - moff);
1072
1073                         bcopy(m->m_data + moff, n->m_data + n->m_len, chunk);
1074                         moff += chunk;
1075                         n->m_len += chunk;
1076                         remain -= chunk;
1077                         if (moff == m->m_len) {
1078                                 m = m->m_next;
1079                                 moff = 0;
1080                         }
1081                 }
1082
1083                 /* Check correct total mbuf length */
1084                 KASSERT((remain > 0 && m != NULL) || (remain == 0 && m == NULL),
1085                         ("%s: bogus m_pkthdr.len", __func__));
1086         }
1087         return (top);
1088
1089 nospace:
1090         m_freem(top);
1091 nospace0:
1092         mbstat.m_mcfail++;
1093         return (NULL);
1094 }
1095
1096 /*
1097  * Concatenate mbuf chain n to m.
1098  * Both chains must be of the same type (e.g. MT_DATA).
1099  * Any m_pkthdr is not updated.
1100  */
1101 void
1102 m_cat(struct mbuf *m, struct mbuf *n)
1103 {
1104         m = m_last(m);
1105         while (n) {
1106                 if (m->m_flags & M_EXT ||
1107                     m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
1108                         /* just join the two chains */
1109                         m->m_next = n;
1110                         return;
1111                 }
1112                 /* splat the data from one into the other */
1113                 bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
1114                     (u_int)n->m_len);
1115                 m->m_len += n->m_len;
1116                 n = m_free(n);
1117         }
1118 }
1119
1120 void
1121 m_adj(struct mbuf *mp, int req_len)
1122 {
1123         int len = req_len;
1124         struct mbuf *m;
1125         int count;
1126
1127         if ((m = mp) == NULL)
1128                 return;
1129         if (len >= 0) {
1130                 /*
1131                  * Trim from head.
1132                  */
1133                 while (m != NULL && len > 0) {
1134                         if (m->m_len <= len) {
1135                                 len -= m->m_len;
1136                                 m->m_len = 0;
1137                                 m = m->m_next;
1138                         } else {
1139                                 m->m_len -= len;
1140                                 m->m_data += len;
1141                                 len = 0;
1142                         }
1143                 }
1144                 m = mp;
1145                 if (mp->m_flags & M_PKTHDR)
1146                         m->m_pkthdr.len -= (req_len - len);
1147         } else {
1148                 /*
1149                  * Trim from tail.  Scan the mbuf chain,
1150                  * calculating its length and finding the last mbuf.
1151                  * If the adjustment only affects this mbuf, then just
1152                  * adjust and return.  Otherwise, rescan and truncate
1153                  * after the remaining size.
1154                  */
1155                 len = -len;
1156                 count = 0;
1157                 for (;;) {
1158                         count += m->m_len;
1159                         if (m->m_next == (struct mbuf *)0)
1160                                 break;
1161                         m = m->m_next;
1162                 }
1163                 if (m->m_len >= len) {
1164                         m->m_len -= len;
1165                         if (mp->m_flags & M_PKTHDR)
1166                                 mp->m_pkthdr.len -= len;
1167                         return;
1168                 }
1169                 count -= len;
1170                 if (count < 0)
1171                         count = 0;
1172                 /*
1173                  * Correct length for chain is "count".
1174                  * Find the mbuf with last data, adjust its length,
1175                  * and toss data from remaining mbufs on chain.
1176                  */
1177                 m = mp;
1178                 if (m->m_flags & M_PKTHDR)
1179                         m->m_pkthdr.len = count;
1180                 for (; m; m = m->m_next) {
1181                         if (m->m_len >= count) {
1182                                 m->m_len = count;
1183                                 break;
1184                         }
1185                         count -= m->m_len;
1186                 }
1187                 while (m->m_next)
1188                         (m = m->m_next) ->m_len = 0;
1189         }
1190 }
1191
1192 /*
1193  * Rearrange an mbuf chain so that len bytes are contiguous
1194  * and in the data area of an mbuf (so that mtod will work for a structure
1195  * of size len).  Returns the resulting mbuf chain on success, frees it and
1196  * returns null on failure.  If there is room, it will add up to
1197  * max_protohdr-len extra bytes to the contiguous region in an attempt to
1198  * avoid being called next time.
1199  */
1200 struct mbuf *
1201 m_pullup(struct mbuf *n, int len)
1202 {
1203         struct mbuf *m;
1204         int count;
1205         int space;
1206
1207         /*
1208          * If first mbuf has no cluster, and has room for len bytes
1209          * without shifting current data, pullup into it,
1210          * otherwise allocate a new mbuf to prepend to the chain.
1211          */
1212         if (!(n->m_flags & M_EXT) &&
1213             n->m_data + len < &n->m_dat[MLEN] &&
1214             n->m_next) {
1215                 if (n->m_len >= len)
1216                         return (n);
1217                 m = n;
1218                 n = n->m_next;
1219                 len -= m->m_len;
1220         } else {
1221                 if (len > MHLEN)
1222                         goto bad;
1223                 if (n->m_flags & M_PKTHDR)
1224                         m = m_gethdr(MB_DONTWAIT, n->m_type);
1225                 else
1226                         m = m_get(MB_DONTWAIT, n->m_type);
1227                 if (m == NULL)
1228                         goto bad;
1229                 m->m_len = 0;
1230                 if (n->m_flags & M_PKTHDR)
1231                         M_MOVE_PKTHDR(m, n);
1232         }
1233         space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
1234         do {
1235                 count = min(min(max(len, max_protohdr), space), n->m_len);
1236                 bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
1237                   (unsigned)count);
1238                 len -= count;
1239                 m->m_len += count;
1240                 n->m_len -= count;
1241                 space -= count;
1242                 if (n->m_len)
1243                         n->m_data += count;
1244                 else
1245                         n = m_free(n);
1246         } while (len > 0 && n);
1247         if (len > 0) {
1248                 m_free(m);
1249                 goto bad;
1250         }
1251         m->m_next = n;
1252         return (m);
1253 bad:
1254         m_freem(n);
1255         mbstat.m_mpfail++;
1256         return (NULL);
1257 }
1258
1259 /*
1260  * Partition an mbuf chain in two pieces, returning the tail --
1261  * all but the first len0 bytes.  In case of failure, it returns NULL and
1262  * attempts to restore the chain to its original state.
1263  *
1264  * Note that the resulting mbufs might be read-only, because the new
1265  * mbuf can end up sharing an mbuf cluster with the original mbuf if
1266  * the "breaking point" happens to lie within a cluster mbuf. Use the
1267  * M_WRITABLE() macro to check for this case.
1268  */
1269 struct mbuf *
1270 m_split(struct mbuf *m0, int len0, int wait)
1271 {
1272         struct mbuf *m, *n;
1273         unsigned len = len0, remain;
1274
1275         for (m = m0; m && len > m->m_len; m = m->m_next)
1276                 len -= m->m_len;
1277         if (m == NULL)
1278                 return (NULL);
1279         remain = m->m_len - len;
1280         if (m0->m_flags & M_PKTHDR) {
1281                 n = m_gethdr(wait, m0->m_type);
1282                 if (n == NULL)
1283                         return (NULL);
1284                 n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
1285                 n->m_pkthdr.len = m0->m_pkthdr.len - len0;
1286                 m0->m_pkthdr.len = len0;
1287                 if (m->m_flags & M_EXT)
1288                         goto extpacket;
1289                 if (remain > MHLEN) {
1290                         /* m can't be the lead packet */
1291                         MH_ALIGN(n, 0);
1292                         n->m_next = m_split(m, len, wait);
1293                         if (n->m_next == NULL) {
1294                                 m_free(n);
1295                                 return (NULL);
1296                         } else {
1297                                 n->m_len = 0;
1298                                 return (n);
1299                         }
1300                 } else
1301                         MH_ALIGN(n, remain);
1302         } else if (remain == 0) {
1303                 n = m->m_next;
1304                 m->m_next = 0;
1305                 return (n);
1306         } else {
1307                 n = m_get(wait, m->m_type);
1308                 if (n == NULL)
1309                         return (NULL);
1310                 M_ALIGN(n, remain);
1311         }
1312 extpacket:
1313         if (m->m_flags & M_EXT) {
1314                 KKASSERT((n->m_flags & M_EXT) == 0);
1315                 n->m_data = m->m_data + len;
1316                 m->m_ext.ext_ref(m->m_ext.ext_arg);
1317                 n->m_ext = m->m_ext;
1318                 n->m_flags |= m->m_flags & (M_EXT | M_EXT_CLUSTER);
1319         } else {
1320                 bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain);
1321         }
1322         n->m_len = remain;
1323         m->m_len = len;
1324         n->m_next = m->m_next;
1325         m->m_next = 0;
1326         return (n);
1327 }
1328
1329 /*
1330  * Routine to copy from device local memory into mbufs.
1331  * Note: "offset" is ill-defined and always called as 0, so ignore it.
1332  */
1333 struct mbuf *
1334 m_devget(char *buf, int len, int offset, struct ifnet *ifp,
1335     void (*copy)(volatile const void *from, volatile void *to, size_t length))
1336 {
1337         struct mbuf *m, *mfirst = NULL, **mtail;
1338         int nsize, flags;
1339
1340         if (copy == NULL)
1341                 copy = bcopy;
1342         mtail = &mfirst;
1343         flags = M_PKTHDR;
1344
1345         while (len > 0) {
1346                 m = m_getl(len, MB_DONTWAIT, MT_DATA, flags, &nsize);
1347                 if (m == NULL) {
1348                         m_freem(mfirst);
1349                         return (NULL);
1350                 }
1351                 m->m_len = min(len, nsize);
1352
1353                 if (flags & M_PKTHDR) {
1354                         if (len + max_linkhdr <= nsize)
1355                                 m->m_data += max_linkhdr;
1356                         m->m_pkthdr.rcvif = ifp;
1357                         m->m_pkthdr.len = len;
1358                         flags = 0;
1359                 }
1360
1361                 copy(buf, m->m_data, (unsigned)m->m_len);
1362                 buf += m->m_len;
1363                 len -= m->m_len;
1364                 *mtail = m;
1365                 mtail = &m->m_next;
1366         }
1367
1368         return (mfirst);
1369 }
1370
1371 /*
1372  * Copy data from a buffer back into the indicated mbuf chain,
1373  * starting "off" bytes from the beginning, extending the mbuf
1374  * chain if necessary.
1375  */
1376 void
1377 m_copyback(struct mbuf *m0, int off, int len, caddr_t cp)
1378 {
1379         int mlen;
1380         struct mbuf *m = m0, *n;
1381         int totlen = 0;
1382
1383         if (m0 == NULL)
1384                 return;
1385         while (off > (mlen = m->m_len)) {
1386                 off -= mlen;
1387                 totlen += mlen;
1388                 if (m->m_next == NULL) {
1389                         n = m_getclr(MB_DONTWAIT, m->m_type);
1390                         if (n == NULL)
1391                                 goto out;
1392                         n->m_len = min(MLEN, len + off);
1393                         m->m_next = n;
1394                 }
1395                 m = m->m_next;
1396         }
1397         while (len > 0) {
1398                 mlen = min (m->m_len - off, len);
1399                 bcopy(cp, off + mtod(m, caddr_t), (unsigned)mlen);
1400                 cp += mlen;
1401                 len -= mlen;
1402                 mlen += off;
1403                 off = 0;
1404                 totlen += mlen;
1405                 if (len == 0)
1406                         break;
1407                 if (m->m_next == NULL) {
1408                         n = m_get(MB_DONTWAIT, m->m_type);
1409                         if (n == NULL)
1410                                 break;
1411                         n->m_len = min(MLEN, len);
1412                         m->m_next = n;
1413                 }
1414                 m = m->m_next;
1415         }
1416 out:    if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen))
1417                 m->m_pkthdr.len = totlen;
1418 }
1419
1420 void
1421 m_print(const struct mbuf *m)
1422 {
1423         int len;
1424         const struct mbuf *m2;
1425
1426         len = m->m_pkthdr.len;
1427         m2 = m;
1428         while (len) {
1429                 kprintf("%p %*D\n", m2, m2->m_len, (u_char *)m2->m_data, "-");
1430                 len -= m2->m_len;
1431                 m2 = m2->m_next;
1432         }
1433         return;
1434 }
1435
1436 /*
1437  * "Move" mbuf pkthdr from "from" to "to".
1438  * "from" must have M_PKTHDR set, and "to" must be empty.
1439  */
1440 void
1441 m_move_pkthdr(struct mbuf *to, struct mbuf *from)
1442 {
1443         KASSERT((to->m_flags & M_PKTHDR), ("m_move_pkthdr: not packet header"));
1444
1445         to->m_flags |= from->m_flags & M_COPYFLAGS;
1446         to->m_pkthdr = from->m_pkthdr;          /* especially tags */
1447         SLIST_INIT(&from->m_pkthdr.tags);       /* purge tags from src */
1448 }
1449
1450 /*
1451  * Duplicate "from"'s mbuf pkthdr in "to".
1452  * "from" must have M_PKTHDR set, and "to" must be empty.
1453  * In particular, this does a deep copy of the packet tags.
1454  */
1455 int
1456 m_dup_pkthdr(struct mbuf *to, const struct mbuf *from, int how)
1457 {
1458         KASSERT((to->m_flags & M_PKTHDR), ("m_dup_pkthdr: not packet header"));
1459
1460         to->m_flags = (from->m_flags & M_COPYFLAGS) |
1461                       (to->m_flags & ~M_COPYFLAGS);
1462         to->m_pkthdr = from->m_pkthdr;
1463         SLIST_INIT(&to->m_pkthdr.tags);
1464         return (m_tag_copy_chain(to, from, how));
1465 }
1466
1467 /*
1468  * Defragment a mbuf chain, returning the shortest possible
1469  * chain of mbufs and clusters.  If allocation fails and
1470  * this cannot be completed, NULL will be returned, but
1471  * the passed in chain will be unchanged.  Upon success,
1472  * the original chain will be freed, and the new chain
1473  * will be returned.
1474  *
1475  * If a non-packet header is passed in, the original
1476  * mbuf (chain?) will be returned unharmed.
1477  *
1478  * m_defrag_nofree doesn't free the passed in mbuf.
1479  */
1480 struct mbuf *
1481 m_defrag(struct mbuf *m0, int how)
1482 {
1483         struct mbuf *m_new;
1484
1485         if ((m_new = m_defrag_nofree(m0, how)) == NULL)
1486                 return (NULL);
1487         if (m_new != m0)
1488                 m_freem(m0);
1489         return (m_new);
1490 }
1491
1492 struct mbuf *
1493 m_defrag_nofree(struct mbuf *m0, int how)
1494 {
1495         struct mbuf     *m_new = NULL, *m_final = NULL;
1496         int             progress = 0, length, nsize;
1497
1498         if (!(m0->m_flags & M_PKTHDR))
1499                 return (m0);
1500
1501 #ifdef MBUF_STRESS_TEST
1502         if (m_defragrandomfailures) {
1503                 int temp = karc4random() & 0xff;
1504                 if (temp == 0xba)
1505                         goto nospace;
1506         }
1507 #endif
1508
1509         m_final = m_getl(m0->m_pkthdr.len, how, MT_DATA, M_PKTHDR, &nsize);
1510         if (m_final == NULL)
1511                 goto nospace;
1512         m_final->m_len = 0;     /* in case m0->m_pkthdr.len is zero */
1513
1514         if (m_dup_pkthdr(m_final, m0, how) == NULL)
1515                 goto nospace;
1516
1517         m_new = m_final;
1518
1519         while (progress < m0->m_pkthdr.len) {
1520                 length = m0->m_pkthdr.len - progress;
1521                 if (length > MCLBYTES)
1522                         length = MCLBYTES;
1523
1524                 if (m_new == NULL) {
1525                         m_new = m_getl(length, how, MT_DATA, 0, &nsize);
1526                         if (m_new == NULL)
1527                                 goto nospace;
1528                 }
1529
1530                 m_copydata(m0, progress, length, mtod(m_new, caddr_t));
1531                 progress += length;
1532                 m_new->m_len = length;
1533                 if (m_new != m_final)
1534                         m_cat(m_final, m_new);
1535                 m_new = NULL;
1536         }
1537         if (m0->m_next == NULL)
1538                 m_defraguseless++;
1539         m_defragpackets++;
1540         m_defragbytes += m_final->m_pkthdr.len;
1541         return (m_final);
1542 nospace:
1543         m_defragfailure++;
1544         if (m_new)
1545                 m_free(m_new);
1546         m_freem(m_final);
1547         return (NULL);
1548 }
1549
1550 /*
1551  * Move data from uio into mbufs.
1552  */
1553 struct mbuf *
1554 m_uiomove(struct uio *uio)
1555 {
1556         struct mbuf *m;                 /* current working mbuf */
1557         struct mbuf *head = NULL;       /* result mbuf chain */
1558         struct mbuf **mp = &head;
1559         int resid = uio->uio_resid, nsize, flags = M_PKTHDR, error;
1560
1561         do {
1562                 m = m_getl(resid, MB_WAIT, MT_DATA, flags, &nsize);
1563                 if (flags) {
1564                         m->m_pkthdr.len = 0;
1565                         /* Leave room for protocol headers. */
1566                         if (resid < MHLEN)
1567                                 MH_ALIGN(m, resid);
1568                         flags = 0;
1569                 }
1570                 m->m_len = min(nsize, resid);
1571                 error = uiomove(mtod(m, caddr_t), m->m_len, uio);
1572                 if (error) {
1573                         m_free(m);
1574                         goto failed;
1575                 }
1576                 *mp = m;
1577                 mp = &m->m_next;
1578                 head->m_pkthdr.len += m->m_len;
1579                 resid -= m->m_len;
1580         } while (resid > 0);
1581
1582         return (head);
1583
1584 failed:
1585         m_freem(head);
1586         return (NULL);
1587 }
1588
1589 struct mbuf *
1590 m_last(struct mbuf *m)
1591 {
1592         while (m->m_next)
1593                 m = m->m_next;
1594         return (m);
1595 }
1596
1597 /*
1598  * Return the number of bytes in an mbuf chain.
1599  * If lastm is not NULL, also return the last mbuf.
1600  */
1601 u_int
1602 m_lengthm(struct mbuf *m, struct mbuf **lastm)
1603 {
1604         u_int len = 0;
1605         struct mbuf *prev = m;
1606
1607         while (m) {
1608                 len += m->m_len;
1609                 prev = m;
1610                 m = m->m_next;
1611         }
1612         if (lastm != NULL)
1613                 *lastm = prev;
1614         return (len);
1615 }
1616
1617 /*
1618  * Like m_lengthm(), except also keep track of mbuf usage.
1619  */
1620 u_int
1621 m_countm(struct mbuf *m, struct mbuf **lastm, u_int *pmbcnt)
1622 {
1623         u_int len = 0, mbcnt = 0;
1624         struct mbuf *prev = m;
1625
1626         while (m) {
1627                 len += m->m_len;
1628                 mbcnt += MSIZE;
1629                 if (m->m_flags & M_EXT)
1630                         mbcnt += m->m_ext.ext_size;
1631                 prev = m;
1632                 m = m->m_next;
1633         }
1634         if (lastm != NULL)
1635                 *lastm = prev;
1636         *pmbcnt = mbcnt;
1637         return (len);
1638 }