sys/kern/uipc_mbuf.c

   1 /*
   2  * Copyright (c) 2004 Jeffrey M. Hsu.  All rights reserved.
   3  * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
   4  *
   5  * This code is derived from software contributed to The DragonFly Project
   6  * by Jeffrey M. Hsu.
   7  *
   8  * Redistribution and use in source and binary forms, with or without
   9  * modification, are permitted provided that the following conditions
  10  * are met:
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in the
  15  *    documentation and/or other materials provided with the distribution.
  16  * 3. Neither the name of The DragonFly Project nor the names of its
  17  *    contributors may be used to endorse or promote products derived
  18  *    from this software without specific, prior written permission.
  19  *
  20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  23  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
  24  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  25  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
  26  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  27  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  28  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  29  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  30  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  31  * SUCH DAMAGE.
  32  */
  33
  34 /*
  35  * Copyright (c) 1982, 1986, 1988, 1991, 1993
  36  *      The Regents of the University of California.  All rights reserved.
  37  *
  38  * Redistribution and use in source and binary forms, with or without
  39  * modification, are permitted provided that the following conditions
  40  * are met:
  41  * 1. Redistributions of source code must retain the above copyright
  42  *    notice, this list of conditions and the following disclaimer.
  43  * 2. Redistributions in binary form must reproduce the above copyright
  44  *    notice, this list of conditions and the following disclaimer in the
  45  *    documentation and/or other materials provided with the distribution.
  46  * 3. All advertising materials mentioning features or use of this software
  47  *    must display the following acknowledgement:
  48  *      This product includes software developed by the University of
  49  *      California, Berkeley and its contributors.
  50  * 4. Neither the name of the University nor the names of its contributors
  51  *    may be used to endorse or promote products derived from this software
  52  *    without specific prior written permission.
  53  *
  54  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  55  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  56  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  57  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  58  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  59  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  60  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  61  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  62  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  63  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  64  * SUCH DAMAGE.
  65  *
  66  * @(#)uipc_mbuf.c      8.2 (Berkeley) 1/4/94
  67  * $FreeBSD: src/sys/kern/uipc_mbuf.c,v 1.51.2.24 2003/04/15 06:59:29 silby Exp $
  68  * $DragonFly: src/sys/kern/uipc_mbuf.c,v 1.62 2007/05/13 22:56:59 dillon Exp $
  69  */
  70
  71 #include "opt_param.h"
  72 #include "opt_ddb.h"
  73 #include "opt_mbuf_stress_test.h"
  74 #include <sys/param.h>
  75 #include <sys/systm.h>
  76 #include <sys/malloc.h>
  77 #include <sys/mbuf.h>
  78 #include <sys/kernel.h>
  79 #include <sys/sysctl.h>
  80 #include <sys/domain.h>
  81 #include <sys/objcache.h>
  82 #include <sys/protosw.h>
  83 #include <sys/uio.h>
  84 #include <sys/thread.h>
  85 #include <sys/globaldata.h>
  86 #include <sys/serialize.h>
  87 #include <sys/thread2.h>
  88
  89 #include <vm/vm.h>
  90 #include <vm/vm_kern.h>
  91 #include <vm/vm_extern.h>
  92
  93 #ifdef INVARIANTS
  94 #include <machine/cpu.h>
  95 #endif
  96
/*
 * mbuf cluster meta-data
 *
 * One mbcluster describes one cluster data buffer.  The buffer is
 * allocated in mclmeta_ctor() and released in mclmeta_dtor().
 */
struct mbcluster {
        int32_t mcl_refs;       /* number of mbufs referencing mcl_data */
        void    *mcl_data;      /* MCLBYTES-sized cluster buffer */
        /* taken by m_mclfree() when dropping a reference to a shared cluster */
        struct lwkt_serialize mcl_serializer;
};
 105
/* mbinit() creates the object caches declared below. */
static void mbinit(void *);
SYSINIT(mbuf, SI_BOOT2_MACHDEP, SI_ORDER_FIRST, mbinit, NULL)

/*
 * Per-type counters, indexed by mbuf type.  Bumped when an mbuf is
 * handed out or retyped and dropped again in m_free().
 */
static u_long   mbtypes[MT_NTYPES];

/* Global mbuf statistics; exported through kern.ipc below. */
struct mbstat mbstat;
/* Header/data size limits; only exported as sysctls in this part of the file. */
int     max_linkhdr;
int     max_protohdr;
int     max_hdr;
int     max_datalen;
/* Defragmentation counters; exported read-only through kern.ipc below. */
int     m_defragpackets;
int     m_defragbytes;
int     m_defraguseless;
int     m_defragfailure;
#ifdef MBUF_STRESS_TEST
int     m_defragrandomfailures;
#endif

/* Object caches, all created in mbinit(). */
struct objcache *mbuf_cache, *mbufphdr_cache;
struct objcache *mclmeta_cache;
struct objcache *mbufcluster_cache, *mbufphdrcluster_cache;

/* Limits computed in tunable_mbinit() and passed to objcache_create(). */
int     nmbclusters;
int     nmbufs;
 130
/*
 * kern.ipc sysctl nodes for the header limits, the statistics structure
 * and the per-type counters.
 */
SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RW,
           &max_linkhdr, 0, "");
SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RW,
           &max_protohdr, 0, "");
SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RW, &max_hdr, 0, "");
SYSCTL_INT(_kern_ipc, KIPC_MAX_DATALEN, max_datalen, CTLFLAG_RW,
           &max_datalen, 0, "");
/* NOTE(review): mbuf_wait is not declared in this part of the file. */
SYSCTL_INT(_kern_ipc, OID_AUTO, mbuf_wait, CTLFLAG_RW,
           &mbuf_wait, 0, "");
SYSCTL_STRUCT(_kern_ipc, KIPC_MBSTAT, mbstat, CTLFLAG_RW, &mbstat, mbstat, "");
SYSCTL_OPAQUE(_kern_ipc, OID_AUTO, mbtypes, CTLFLAG_RD, mbtypes,
           sizeof(mbtypes), "LU", "");

/*
 * These are read-only because we do not currently have any code
 * to adjust the objcache limits after the fact.  The variables
 * may only be set as boot-time tunables.
 */
SYSCTL_INT(_kern_ipc, KIPC_NMBCLUSTERS, nmbclusters, CTLFLAG_RD,
           &nmbclusters, 0, "Maximum number of mbuf clusters available");
SYSCTL_INT(_kern_ipc, OID_AUTO, nmbufs, CTLFLAG_RD, &nmbufs, 0,
           "Maximum number of mbufs available");

/* Read-only defragmentation counters. */
SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragpackets, CTLFLAG_RD,
           &m_defragpackets, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragbytes, CTLFLAG_RD,
           &m_defragbytes, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, m_defraguseless, CTLFLAG_RD,
           &m_defraguseless, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragfailure, CTLFLAG_RD,
           &m_defragfailure, 0, "");
#ifdef MBUF_STRESS_TEST
SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragrandomfailures, CTLFLAG_RW,
           &m_defragrandomfailures, 0, "");
#endif
 165 #endif
 166
/*
 * Malloc types: M_MBUF backs the mbufs, M_MBUFCL the cluster data
 * buffers, and M_MCLMETA the struct mbcluster meta-data.
 */
static MALLOC_DEFINE(M_MBUF, "mbuf", "mbuf");
static MALLOC_DEFINE(M_MBUFCL, "mbufcl", "mbufcl");
static MALLOC_DEFINE(M_MCLMETA, "mclmeta", "mclmeta");

static void m_reclaim (void);
/* Installed as the ext_ref/ext_free callbacks by linkcluster(). */
static void m_mclref(void *arg);
static void m_mclfree(void *arg);

/*
 * Default limits, used by tunable_mbinit() unless the kernel
 * configuration already defines them.
 */
#ifndef NMBCLUSTERS
#define NMBCLUSTERS     (512 + maxusers * 16)
#endif
#ifndef NMBUFS
#define NMBUFS          (nmbclusters * 2)
#endif
 181
 182 /*
 183  * Perform sanity checks of tunables declared above.
 184  */
 185 static void
 186 tunable_mbinit(void *dummy)
 187 {
 188         /*
 189          * This has to be done before VM init.
 190          */
 191         nmbclusters = NMBCLUSTERS;
 192         TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters);
 193         nmbufs = NMBUFS;
 194         TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs);
 195         /* Sanity checks */
 196         if (nmbufs < nmbclusters * 2)
 197                 nmbufs = nmbclusters * 2;
 198 }
 199 SYSINIT(tunable_mbinit, SI_BOOT1_TUNABLES, SI_ORDER_ANY,
 200         tunable_mbinit, NULL);
 201
/* "number of clusters of pages" */
/* NOTE(review): neither constant is referenced in the visible part of this file. */
#define NCL_INIT        1

#define NMB_INIT        16
 206
 207 /*
 208  * The mbuf object cache only guarantees that m_next and m_nextpkt are
 209  * NULL and that m_data points to the beginning of the data area.  In
 210  * particular, m_len and m_pkthdr.len are uninitialized.  It is the
 211  * responsibility of the caller to initialize those fields before use.
 212  */
 213
 214 static boolean_t __inline
 215 mbuf_ctor(void *obj, void *private, int ocflags)
 216 {
 217         struct mbuf *m = obj;
 218
 219         m->m_next = NULL;
 220         m->m_nextpkt = NULL;
 221         m->m_data = m->m_dat;
 222         m->m_flags = 0;
 223
 224         return (TRUE);
 225 }
 226
 227 /*
 228  * Initialize the mbuf and the packet header fields.
 229  */
 230 static boolean_t
 231 mbufphdr_ctor(void *obj, void *private, int ocflags)
 232 {
 233         struct mbuf *m = obj;
 234
 235         m->m_next = NULL;
 236         m->m_nextpkt = NULL;
 237         m->m_data = m->m_pktdat;
 238         m->m_flags = M_PKTHDR | M_PHCACHE;
 239
 240         m->m_pkthdr.rcvif = NULL;       /* eliminate XXX JH */
 241         SLIST_INIT(&m->m_pkthdr.tags);
 242         m->m_pkthdr.csum_flags = 0;     /* eliminate XXX JH */
 243         m->m_pkthdr.fw_flags = 0;       /* eliminate XXX JH */
 244
 245         return (TRUE);
 246 }
 247
 248 /*
 249  * A mbcluster object consists of 2K (MCLBYTES) cluster and a refcount.
 250  */
 251 static boolean_t
 252 mclmeta_ctor(void *obj, void *private, int ocflags)
 253 {
 254         struct mbcluster *cl = obj;
 255         void *buf;
 256
 257         if (ocflags & M_NOWAIT)
 258                 buf = kmalloc(MCLBYTES, M_MBUFCL, M_NOWAIT | M_ZERO);
 259         else
 260                 buf = kmalloc(MCLBYTES, M_MBUFCL, M_INTWAIT | M_ZERO);
 261         if (buf == NULL)
 262                 return (FALSE);
 263         cl->mcl_refs = 0;
 264         cl->mcl_data = buf;
 265         lwkt_serialize_init(&cl->mcl_serializer);
 266         return (TRUE);
 267 }
 268
 269 static void
 270 mclmeta_dtor(void *obj, void *private)
 271 {
 272         struct mbcluster *mcl = obj;
 273
 274         KKASSERT(mcl->mcl_refs == 0);
 275         kfree(mcl->mcl_data, M_MBUFCL);
 276 }
 277
 278 static void
 279 linkcluster(struct mbuf *m, struct mbcluster *cl)
 280 {
 281         /*
 282          * Add the cluster to the mbuf.  The caller will detect that the
 283          * mbuf now has an attached cluster.
 284          */
 285         m->m_ext.ext_arg = cl;
 286         m->m_ext.ext_buf = cl->mcl_data;
 287         m->m_ext.ext_ref = m_mclref;
 288         m->m_ext.ext_free = m_mclfree;
 289         m->m_ext.ext_size = MCLBYTES;
 290         atomic_add_int(&cl->mcl_refs, 1);
 291
 292         m->m_data = m->m_ext.ext_buf;
 293         m->m_flags |= M_EXT | M_EXT_CLUSTER;
 294 }
 295
 296 static boolean_t
 297 mbufphdrcluster_ctor(void *obj, void *private, int ocflags)
 298 {
 299         struct mbuf *m = obj;
 300         struct mbcluster *cl;
 301
 302         mbufphdr_ctor(obj, private, ocflags);
 303         cl = objcache_get(mclmeta_cache, ocflags);
 304         if (cl == NULL)
 305                 return (FALSE);
 306         m->m_flags |= M_CLCACHE;
 307         linkcluster(m, cl);
 308         return (TRUE);
 309 }
 310
 311 static boolean_t
 312 mbufcluster_ctor(void *obj, void *private, int ocflags)
 313 {
 314         struct mbuf *m = obj;
 315         struct mbcluster *cl;
 316
 317         mbuf_ctor(obj, private, ocflags);
 318         cl = objcache_get(mclmeta_cache, ocflags);
 319         if (cl == NULL)
 320                 return (FALSE);
 321         m->m_flags |= M_CLCACHE;
 322         linkcluster(m, cl);
 323         return (TRUE);
 324 }
 325
 326 /*
 327  * Used for both the cluster and cluster PHDR caches.
 328  *
 329  * The mbuf may have lost its cluster due to sharing, deal
 330  * with the situation by checking M_EXT.
 331  */
 332 static void
 333 mbufcluster_dtor(void *obj, void *private)
 334 {
 335         struct mbuf *m = obj;
 336         struct mbcluster *mcl;
 337
 338         if (m->m_flags & M_EXT) {
 339                 KKASSERT((m->m_flags & M_EXT_CLUSTER) != 0);
 340                 mcl = m->m_ext.ext_arg;
 341                 KKASSERT(mcl->mcl_refs == 1);
 342                 mcl->mcl_refs = 0;
 343                 objcache_put(mclmeta_cache, mcl);
 344         }
 345 }
 346
/*
 * Allocator arguments (object size, malloc type) handed to
 * objcache_create() in mbinit(): one set for mbufs, one for the
 * cluster meta-data.
 */
struct objcache_malloc_args mbuf_malloc_args = { MSIZE, M_MBUF };
struct objcache_malloc_args mclmeta_malloc_args =
        { sizeof(struct mbcluster), M_MCLMETA };
 350
 351 /* ARGSUSED*/
 352 static void
 353 mbinit(void *dummy)
 354 {
 355         mbstat.m_msize = MSIZE;
 356         mbstat.m_mclbytes = MCLBYTES;
 357         mbstat.m_minclsize = MINCLSIZE;
 358         mbstat.m_mlen = MLEN;
 359         mbstat.m_mhlen = MHLEN;
 360
 361         mbuf_cache = objcache_create("mbuf", nmbufs, 0,
 362             mbuf_ctor, NULL, NULL,
 363             objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args);
 364         mbufphdr_cache = objcache_create("mbuf pkt hdr", nmbufs, 64,
 365             mbufphdr_ctor, NULL, NULL,
 366             objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args);
 367         mclmeta_cache = objcache_create("cluster mbuf", nmbclusters , 0,
 368             mclmeta_ctor, mclmeta_dtor, NULL,
 369             objcache_malloc_alloc, objcache_malloc_free, &mclmeta_malloc_args);
 370         mbufcluster_cache = objcache_create("mbuf + cluster", nmbclusters, 0,
 371             mbufcluster_ctor, mbufcluster_dtor, NULL,
 372             objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args);
 373         mbufphdrcluster_cache = objcache_create("mbuf pkt hdr + cluster",
 374             nmbclusters, 64, mbufphdrcluster_ctor, mbufcluster_dtor, NULL,
 375             objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args);
 376         return;
 377 }
 378
 379 /*
 380  * Return the number of references to this mbuf's data.  0 is returned
 381  * if the mbuf is not M_EXT, a reference count is returned if it is
 382  * M_EXT | M_EXT_CLUSTER, and 99 is returned if it is a special M_EXT.
 383  */
 384 int
 385 m_sharecount(struct mbuf *m)
 386 {
 387         switch (m->m_flags & (M_EXT | M_EXT_CLUSTER)) {
 388         case 0:
 389                 return (0);
 390         case M_EXT:
 391                 return (99);
 392         case M_EXT | M_EXT_CLUSTER:
 393                 return (((struct mbcluster *)m->m_ext.ext_arg)->mcl_refs);
 394         }
 395         /* NOTREACHED */
 396         return (0);             /* to shut up compiler */
 397 }
 398
 399 /*
 400  * change mbuf to new type
 401  */
 402 void
 403 m_chtype(struct mbuf *m, int type)
 404 {
 405         crit_enter();
 406         ++mbtypes[type];
 407         --mbtypes[m->m_type];
 408         m->m_type = type;
 409         crit_exit();
 410 }
 411
 412 static void
 413 m_reclaim(void)
 414 {
 415         struct domain *dp;
 416         struct protosw *pr;
 417
 418         crit_enter();
 419         SLIST_FOREACH(dp, &domains, dom_next) {
 420                 for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) {
 421                         if (pr->pr_drain)
 422                                 (*pr->pr_drain)();
 423                 }
 424         }
 425         crit_exit();
 426         mbstat.m_drain++;
 427 }
 428
 429 static void __inline
 430 updatestats(struct mbuf *m, int type)
 431 {
 432         m->m_type = type;
 433
 434         crit_enter();
 435         ++mbtypes[type];
 436         ++mbstat.m_mbufs;
 437         crit_exit();
 438 }
 439
 440 /*
 441  * Allocate an mbuf.
 442  */
 443 struct mbuf *
 444 m_get(int how, int type)
 445 {
 446         struct mbuf *m;
 447         int ntries = 0;
 448         int ocf = MBTOM(how);
 449
 450 retryonce:
 451
 452         m = objcache_get(mbuf_cache, ocf);
 453
 454         if (m == NULL) {
 455                 if ((how & MB_TRYWAIT) && ntries++ == 0) {
 456                         struct objcache *reclaimlist[] = {
 457                                 mbufphdr_cache,
 458                                 mbufcluster_cache, mbufphdrcluster_cache
 459                         };
 460                         const int nreclaims = __arysize(reclaimlist);
 461
 462                         if (!objcache_reclaimlist(reclaimlist, nreclaims, ocf))
 463                                 m_reclaim();
 464                         goto retryonce;
 465                 }
 466                 return (NULL);
 467         }
 468
 469         updatestats(m, type);
 470         return (m);
 471 }
 472
 473 struct mbuf *
 474 m_gethdr(int how, int type)
 475 {
 476         struct mbuf *m;
 477         int ocf = MBTOM(how);
 478         int ntries = 0;
 479
 480 retryonce:
 481
 482         m = objcache_get(mbufphdr_cache, ocf);
 483
 484         if (m == NULL) {
 485                 if ((how & MB_TRYWAIT) && ntries++ == 0) {
 486                         struct objcache *reclaimlist[] = {
 487                                 mbuf_cache,
 488                                 mbufcluster_cache, mbufphdrcluster_cache
 489                         };
 490                         const int nreclaims = __arysize(reclaimlist);
 491
 492                         if (!objcache_reclaimlist(reclaimlist, nreclaims, ocf))
 493                                 m_reclaim();
 494                         goto retryonce;
 495                 }
 496                 return (NULL);
 497         }
 498
 499         updatestats(m, type);
 500         return (m);
 501 }
 502
 503 /*
 504  * Get a mbuf (not a mbuf cluster!) and zero it.
 505  * Deprecated.
 506  */
 507 struct mbuf *
 508 m_getclr(int how, int type)
 509 {
 510         struct mbuf *m;
 511
 512         m = m_get(how, type);
 513         if (m != NULL)
 514                 bzero(m->m_data, MLEN);
 515         return (m);
 516 }
 517
 518 /*
 519  * Returns an mbuf with an attached cluster.
 520  * Because many network drivers use this kind of buffers a lot, it is
 521  * convenient to keep a small pool of free buffers of this kind.
 522  * Even a small size such as 10 gives about 10% improvement in the
 523  * forwarding rate in a bridge or router.
 524  */
 525 struct mbuf *
 526 m_getcl(int how, short type, int flags)
 527 {
 528         struct mbuf *m;
 529         int ocflags = MBTOM(how);
 530         int ntries = 0;
 531
 532 retryonce:
 533
 534         if (flags & M_PKTHDR)
 535                 m = objcache_get(mbufphdrcluster_cache, ocflags);
 536         else
 537                 m = objcache_get(mbufcluster_cache, ocflags);
 538
 539         if (m == NULL) {
 540                 if ((how & MB_TRYWAIT) && ntries++ == 0) {
 541                         struct objcache *reclaimlist[1];
 542
 543                         if (flags & M_PKTHDR)
 544                                 reclaimlist[0] = mbufcluster_cache;
 545                         else
 546                                 reclaimlist[0] = mbufphdrcluster_cache;
 547                         if (!objcache_reclaimlist(reclaimlist, 1, ocflags))
 548                                 m_reclaim();
 549                         goto retryonce;
 550                 }
 551                 return (NULL);
 552         }
 553
 554         m->m_type = type;
 555
 556         crit_enter();
 557         ++mbtypes[type];
 558         ++mbstat.m_clusters;
 559         crit_exit();
 560         return (m);
 561 }
 562
 563 /*
 564  * Allocate chain of requested length.
 565  */
 566 struct mbuf *
 567 m_getc(int len, int how, int type)
 568 {
 569         struct mbuf *n, *nfirst = NULL, **ntail = &nfirst;
 570         int nsize;
 571
 572         while (len > 0) {
 573                 n = m_getl(len, how, type, 0, &nsize);
 574                 if (n == NULL)
 575                         goto failed;
 576                 n->m_len = 0;
 577                 *ntail = n;
 578                 ntail = &n->m_next;
 579                 len -= nsize;
 580         }
 581         return (nfirst);
 582
 583 failed:
 584         m_freem(nfirst);
 585         return (NULL);
 586 }
 587
 588 /*
 589  * Allocate len-worth of mbufs and/or mbuf clusters (whatever fits best)
 590  * and return a pointer to the head of the allocated chain. If m0 is
 591  * non-null, then we assume that it is a single mbuf or an mbuf chain to
 592  * which we want len bytes worth of mbufs and/or clusters attached, and so
 593  * if we succeed in allocating it, we will just return a pointer to m0.
 594  *
 595  * If we happen to fail at any point during the allocation, we will free
 596  * up everything we have already allocated and return NULL.
 597  *
 598  * Deprecated.  Use m_getc() and m_cat() instead.
 599  */
 600 struct mbuf *
 601 m_getm(struct mbuf *m0, int len, int type, int how)
 602 {
 603         struct mbuf *nfirst;
 604
 605         nfirst = m_getc(len, how, type);
 606
 607         if (m0 != NULL) {
 608                 m_last(m0)->m_next = nfirst;
 609                 return (m0);
 610         }
 611
 612         return (nfirst);
 613 }
 614
 615 /*
 616  * Adds a cluster to a normal mbuf, M_EXT is set on success.
 617  * Deprecated.  Use m_getcl() instead.
 618  */
 619 void
 620 m_mclget(struct mbuf *m, int how)
 621 {
 622         struct mbcluster *mcl;
 623
 624         KKASSERT((m->m_flags & M_EXT) == 0);
 625         mcl = objcache_get(mclmeta_cache, MBTOM(how));
 626         if (mcl != NULL) {
 627                 linkcluster(m, mcl);
 628                 crit_enter();
 629                 ++mbstat.m_clusters;
 630                 /* leave the m_mbufs count intact for original mbuf */
 631                 crit_exit();
 632         }
 633 }
 634
 635 /*
 636  * Updates to mbcluster must be MPSAFE.  Only an entity which already has
 637  * a reference to the cluster can ref it, so we are in no danger of
 638  * racing an add with a subtract.  But the operation must still be atomic
 639  * since multiple entities may have a reference on the cluster.
 640  *
 641  * m_mclfree() is almost the same but it must contend with two entities
 642  * freeing the cluster at the same time.  If there is only one reference
 643  * count we are the only entity referencing the cluster and no further
 644  * locking is required.  Otherwise we must protect against a race to 0
 645  * with the serializer.
 646  */
 647 static void
 648 m_mclref(void *arg)
 649 {
 650         struct mbcluster *mcl = arg;
 651
 652         atomic_add_int(&mcl->mcl_refs, 1);
 653 }
 654
/*
 * ext_free callback for clusters: drop one reference.  Whoever drops
 * the last reference zeroes the count and returns the mbcluster to
 * mclmeta_cache.
 */
static void
m_mclfree(void *arg)
{
        struct mbcluster *mcl = arg;

        if (mcl->mcl_refs == 1) {
                /* Sole reference: release without taking the serializer. */
                mcl->mcl_refs = 0;
                objcache_put(mclmeta_cache, mcl);
        } else {
                /*
                 * Shared cluster: re-examine the count under the
                 * serializer to decide whether this is the last reference.
                 */
                lwkt_serialize_enter(&mcl->mcl_serializer);
                if (mcl->mcl_refs > 1) {
                        /* Other references remain; just drop ours. */
                        atomic_subtract_int(&mcl->mcl_refs, 1);
                        lwkt_serialize_exit(&mcl->mcl_serializer);
                } else {
                        /*
                         * The count is no longer above 1, so ours is
                         * the last reference; release the cluster.
                         */
                        lwkt_serialize_exit(&mcl->mcl_serializer);
                        KKASSERT(mcl->mcl_refs == 1);
                        mcl->mcl_refs = 0;
                        objcache_put(mclmeta_cache, mcl);
                }
        }
}
 676
/* Called by m_free() under DDB to report an mbuf freed with m_nextpkt set. */
extern void db_print_backtrace(void);
 678
 679 /*
 680  * Free a single mbuf and any associated external storage.  The successor,
 681  * if any, is returned.
 682  *
 683  * We do need to check non-first mbuf for m_aux, since some of existing
 684  * code does not call M_PREPEND properly.
 685  * (example: call to bpf_mtap from drivers)
 686  */
 687 struct mbuf *
 688 m_free(struct mbuf *m)
 689 {
 690         struct mbuf *n;
 691
 692         KASSERT(m->m_type != MT_FREE, ("freeing free mbuf %p", m));
 693         --mbtypes[m->m_type];
 694
 695         n = m->m_next;
 696
 697         /*
 698          * Make sure the mbuf is in constructed state before returning it
 699          * to the objcache.
 700          */
 701         m->m_next = NULL;
 702 #ifdef notyet
 703         KKASSERT(m->m_nextpkt == NULL);
 704 #else
 705         if (m->m_nextpkt != NULL) {
 706 #ifdef DDB
 707                 static int afewtimes = 10;
 708
 709                 if (afewtimes-- > 0) {
 710                         kprintf("mfree: m->m_nextpkt != NULL\n");
 711                         db_print_backtrace();
 712                 }
 713 #endif
 714                 m->m_nextpkt = NULL;
 715         }
 716 #endif
 717         if (m->m_flags & M_PKTHDR) {
 718                 m_tag_delete_chain(m);          /* eliminate XXX JH */
 719         }
 720
 721         m->m_flags &= (M_EXT | M_EXT_CLUSTER | M_CLCACHE | M_PHCACHE);
 722
 723         /*
 724          * Clean the M_PKTHDR state so we can return the mbuf to its original
 725          * cache.  This is based on the PHCACHE flag which tells us whether
 726          * the mbuf was originally allocated out of a packet-header cache
 727          * or a non-packet-header cache.
 728          */
 729         if (m->m_flags & M_PHCACHE) {
 730                 m->m_flags |= M_PKTHDR;
 731                 m->m_pkthdr.rcvif = NULL;       /* eliminate XXX JH */
 732                 m->m_pkthdr.csum_flags = 0;     /* eliminate XXX JH */
 733                 m->m_pkthdr.fw_flags = 0;       /* eliminate XXX JH */
 734                 SLIST_INIT(&m->m_pkthdr.tags);
 735         }
 736
 737         /*
 738          * Handle remaining flags combinations.  M_CLCACHE tells us whether
 739          * the mbuf was originally allocated from a cluster cache or not,
 740          * and is totally separate from whether the mbuf is currently
 741          * associated with a cluster.
 742          */
 743         crit_enter();
 744         switch(m->m_flags & (M_CLCACHE | M_EXT | M_EXT_CLUSTER)) {
 745         case M_CLCACHE | M_EXT | M_EXT_CLUSTER:
 746                 /*
 747                  * mbuf+cluster cache case.  The mbuf was allocated from the
 748                  * combined mbuf_cluster cache and can be returned to the
 749                  * cache if the cluster hasn't been shared.
 750                  */
 751                 if (m_sharecount(m) == 1) {
 752                         /*
 753                          * The cluster has not been shared, we can just
 754                          * reset the data pointer and return the mbuf
 755                          * to the cluster cache.  Note that the reference
 756                          * count is left intact (it is still associated with
 757                          * an mbuf).
 758                          */
 759                         m->m_data = m->m_ext.ext_buf;
 760                         if (m->m_flags & M_PHCACHE)
 761                                 objcache_put(mbufphdrcluster_cache, m);
 762                         else
 763                                 objcache_put(mbufcluster_cache, m);
 764                         --mbstat.m_clusters;
 765                 } else {
 766                         /*
 767                          * Hell.  Someone else has a ref on this cluster,
 768                          * we have to disconnect it which means we can't
 769                          * put it back into the mbufcluster_cache, we
 770                          * have to destroy the mbuf.
 771                          *
 772                          * Other mbuf references to the cluster will typically
 773                          * be M_EXT | M_EXT_CLUSTER but without M_CLCACHE.
 774                          *
 775                          * XXX we could try to connect another cluster to
 776                          * it.
 777                          */
 778                         m->m_ext.ext_free(m->m_ext.ext_arg);
 779                         m->m_flags &= ~(M_EXT | M_EXT_CLUSTER);
 780                         if (m->m_flags & M_PHCACHE)
 781                                 objcache_dtor(mbufphdrcluster_cache, m);
 782                         else
 783                                 objcache_dtor(mbufcluster_cache, m);
 784                 }
 785                 break;
 786         case M_EXT | M_EXT_CLUSTER:
 787                 /*
 788                  * Normal cluster associated with an mbuf that was allocated
 789                  * from the normal mbuf pool rather then the cluster pool.
 790                  * The cluster has to be independantly disassociated from the
 791                  * mbuf.
 792                  */
 793                 if (m_sharecount(m) == 1)
 794                         --mbstat.m_clusters;
 795                 /* fall through */
 796         case M_EXT:
 797                 /*
 798                  * Normal cluster association case, disconnect the cluster from
 799                  * the mbuf.  The cluster may or may not be custom.
 800                  */
 801                 m->m_ext.ext_free(m->m_ext.ext_arg);
 802                 m->m_flags &= ~(M_EXT | M_EXT_CLUSTER);
 803                 /* fall through */
 804         case 0:
 805                 /*
 806                  * return the mbuf to the mbuf cache.
 807                  */
 808                 if (m->m_flags & M_PHCACHE) {
 809                         m->m_data = m->m_pktdat;
 810                         objcache_put(mbufphdr_cache, m);
 811                 } else {
 812                         m->m_data = m->m_dat;
 813                         objcache_put(mbuf_cache, m);
 814                 }
 815                 --mbstat.m_mbufs;
 816                 break;
 817         default:
 818                 if (!panicstr)
 819                         panic("bad mbuf flags %p %08x\n", m, m->m_flags);
 820                 break;
 821         }
 822         crit_exit();
 823         return (n);
 824 }
 825
 826 void
 827 m_freem(struct mbuf *m)
 828 {
 829         crit_enter();
 830         while (m)
 831                 m = m_free(m);
 832         crit_exit();
 833 }
 834
 835 /*
 836  * mbuf utility routines
 837  */
 838
 839 /*
 840  * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain and
 841  * copy junk along.
 842  */
 843 struct mbuf *
 844 m_prepend(struct mbuf *m, int len, int how)
 845 {
 846         struct mbuf *mn;
 847
 848         if (m->m_flags & M_PKTHDR)
 849             mn = m_gethdr(how, m->m_type);
 850         else
 851             mn = m_get(how, m->m_type);
 852         if (mn == NULL) {
 853                 m_freem(m);
 854                 return (NULL);
 855         }
 856         if (m->m_flags & M_PKTHDR)
 857                 M_MOVE_PKTHDR(mn, m);
 858         mn->m_next = m;
 859         m = mn;
 860         if (len < MHLEN)
 861                 MH_ALIGN(m, len);
 862         m->m_len = len;
 863         return (m);
 864 }
 865
 866 /*
 867  * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
 868  * continuing for "len" bytes.  If len is M_COPYALL, copy to end of mbuf.
 869  * The wait parameter is a choice of MB_WAIT/MB_DONTWAIT from caller.
 870  * Note that the copy is read-only, because clusters are not copied,
 871  * only their reference counts are incremented.
 872  */
 873 struct mbuf *
 874 m_copym(const struct mbuf *m, int off0, int len, int wait)
 875 {
 876         struct mbuf *n, **np;
 877         int off = off0;
 878         struct mbuf *top;
 879         int copyhdr = 0;
 880
 881         KASSERT(off >= 0, ("m_copym, negative off %d", off));
 882         KASSERT(len >= 0, ("m_copym, negative len %d", len));
 883         if (off == 0 && m->m_flags & M_PKTHDR)
 884                 copyhdr = 1;
 885         while (off > 0) {
 886                 KASSERT(m != NULL, ("m_copym, offset > size of mbuf chain"));
 887                 if (off < m->m_len)
 888                         break;
 889                 off -= m->m_len;
 890                 m = m->m_next;
 891         }
 892         np = &top;
 893         top = 0;
 894         while (len > 0) {
 895                 if (m == NULL) {
 896                         KASSERT(len == M_COPYALL,
 897                             ("m_copym, length > size of mbuf chain"));
 898                         break;
 899                 }
 900                 /*
 901                  * Because we are sharing any cluster attachment below,
 902                  * be sure to get an mbuf that does not have a cluster
 903                  * associated with it.
 904                  */
 905                 if (copyhdr)
 906                         n = m_gethdr(wait, m->m_type);
 907                 else
 908                         n = m_get(wait, m->m_type);
 909                 *np = n;
 910                 if (n == NULL)
 911                         goto nospace;
 912                 if (copyhdr) {
 913                         if (!m_dup_pkthdr(n, m, wait))
 914                                 goto nospace;
 915                         if (len == M_COPYALL)
 916                                 n->m_pkthdr.len -= off0;
 917                         else
 918                                 n->m_pkthdr.len = len;
 919                         copyhdr = 0;
 920                 }
 921                 n->m_len = min(len, m->m_len - off);
 922                 if (m->m_flags & M_EXT) {
 923                         KKASSERT((n->m_flags & M_EXT) == 0);
 924                         n->m_data = m->m_data + off;
 925                         m->m_ext.ext_ref(m->m_ext.ext_arg);
 926                         n->m_ext = m->m_ext;
 927                         n->m_flags |= m->m_flags & (M_EXT | M_EXT_CLUSTER);
 928                 } else {
 929                         bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t),
 930                             (unsigned)n->m_len);
 931                 }
 932                 if (len != M_COPYALL)
 933                         len -= n->m_len;
 934                 off = 0;
 935                 m = m->m_next;
 936                 np = &n->m_next;
 937         }
 938         if (top == NULL)
 939                 mbstat.m_mcfail++;
 940         return (top);
 941 nospace:
 942         m_freem(top);
 943         mbstat.m_mcfail++;
 944         return (NULL);
 945 }
 946
 947 /*
 948  * Copy an entire packet, including header (which must be present).
 949  * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'.
 950  * Note that the copy is read-only, because clusters are not copied,
 951  * only their reference counts are incremented.
 952  * Preserve alignment of the first mbuf so if the creator has left
 953  * some room at the beginning (e.g. for inserting protocol headers)
 954  * the copies also have the room available.
 955  */
 956 struct mbuf *
 957 m_copypacket(struct mbuf *m, int how)
 958 {
 959         struct mbuf *top, *n, *o;
 960
 961         n = m_gethdr(how, m->m_type);
 962         top = n;
 963         if (!n)
 964                 goto nospace;
 965
 966         if (!m_dup_pkthdr(n, m, how))
 967                 goto nospace;
 968         n->m_len = m->m_len;
 969         if (m->m_flags & M_EXT) {
 970                 KKASSERT((n->m_flags & M_EXT) == 0);
 971                 n->m_data = m->m_data;
 972                 m->m_ext.ext_ref(m->m_ext.ext_arg);
 973                 n->m_ext = m->m_ext;
 974                 n->m_flags |= m->m_flags & (M_EXT | M_EXT_CLUSTER);
 975         } else {
 976                 n->m_data = n->m_pktdat + (m->m_data - m->m_pktdat );
 977                 bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
 978         }
 979
 980         m = m->m_next;
 981         while (m) {
 982                 o = m_get(how, m->m_type);
 983                 if (!o)
 984                         goto nospace;
 985
 986                 n->m_next = o;
 987                 n = n->m_next;
 988
 989                 n->m_len = m->m_len;
 990                 if (m->m_flags & M_EXT) {
 991                         KKASSERT((n->m_flags & M_EXT) == 0);
 992                         n->m_data = m->m_data;
 993                         m->m_ext.ext_ref(m->m_ext.ext_arg);
 994                         n->m_ext = m->m_ext;
 995                         n->m_flags |= m->m_flags & (M_EXT | M_EXT_CLUSTER);
 996                 } else {
 997                         bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
 998                 }
 999
1000                 m = m->m_next;
1001         }
1002         return top;
1003 nospace:
1004         m_freem(top);
1005         mbstat.m_mcfail++;
1006         return (NULL);
1007 }
1008
1009 /*
1010  * Copy data from an mbuf chain starting "off" bytes from the beginning,
1011  * continuing for "len" bytes, into the indicated buffer.
1012  */
1013 void
1014 m_copydata(const struct mbuf *m, int off, int len, caddr_t cp)
1015 {
1016         unsigned count;
1017
1018         KASSERT(off >= 0, ("m_copydata, negative off %d", off));
1019         KASSERT(len >= 0, ("m_copydata, negative len %d", len));
1020         while (off > 0) {
1021                 KASSERT(m != NULL, ("m_copydata, offset > size of mbuf chain"));
1022                 if (off < m->m_len)
1023                         break;
1024                 off -= m->m_len;
1025                 m = m->m_next;
1026         }
1027         while (len > 0) {
1028                 KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain"));
1029                 count = min(m->m_len - off, len);
1030                 bcopy(mtod(m, caddr_t) + off, cp, count);
1031                 len -= count;
1032                 cp += count;
1033                 off = 0;
1034                 m = m->m_next;
1035         }
1036 }
1037
1038 /*
1039  * Copy a packet header mbuf chain into a completely new chain, including
1040  * copying any mbuf clusters.  Use this instead of m_copypacket() when
1041  * you need a writable copy of an mbuf chain.
1042  */
1043 struct mbuf *
1044 m_dup(struct mbuf *m, int how)
1045 {
1046         struct mbuf **p, *top = NULL;
1047         int remain, moff, nsize;
1048
1049         /* Sanity check */
1050         if (m == NULL)
1051                 return (NULL);
1052         KASSERT((m->m_flags & M_PKTHDR) != 0, ("%s: !PKTHDR", __func__));
1053
1054         /* While there's more data, get a new mbuf, tack it on, and fill it */
1055         remain = m->m_pkthdr.len;
1056         moff = 0;
1057         p = &top;
1058         while (remain > 0 || top == NULL) {     /* allow m->m_pkthdr.len == 0 */
1059                 struct mbuf *n;
1060
1061                 /* Get the next new mbuf */
1062                 n = m_getl(remain, how, m->m_type, top == NULL ? M_PKTHDR : 0,
1063                            &nsize);
1064                 if (n == NULL)
1065                         goto nospace;
1066                 if (top == NULL)
1067                         if (!m_dup_pkthdr(n, m, how))
1068                                 goto nospace0;
1069
1070                 /* Link it into the new chain */
1071                 *p = n;
1072                 p = &n->m_next;
1073
1074                 /* Copy data from original mbuf(s) into new mbuf */
1075                 n->m_len = 0;
1076                 while (n->m_len < nsize && m != NULL) {
1077                         int chunk = min(nsize - n->m_len, m->m_len - moff);
1078
1079                         bcopy(m->m_data + moff, n->m_data + n->m_len, chunk);
1080                         moff += chunk;
1081                         n->m_len += chunk;
1082                         remain -= chunk;
1083                         if (moff == m->m_len) {
1084                                 m = m->m_next;
1085                                 moff = 0;
1086                         }
1087                 }
1088
1089                 /* Check correct total mbuf length */
1090                 KASSERT((remain > 0 && m != NULL) || (remain == 0 && m == NULL),
1091                         ("%s: bogus m_pkthdr.len", __func__));
1092         }
1093         return (top);
1094
1095 nospace:
1096         m_freem(top);
1097 nospace0:
1098         mbstat.m_mcfail++;
1099         return (NULL);
1100 }
1101
1102 /*
1103  * Concatenate mbuf chain n to m.
1104  * Both chains must be of the same type (e.g. MT_DATA).
1105  * Any m_pkthdr is not updated.
1106  */
1107 void
1108 m_cat(struct mbuf *m, struct mbuf *n)
1109 {
1110         m = m_last(m);
1111         while (n) {
1112                 if (m->m_flags & M_EXT ||
1113                     m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
1114                         /* just join the two chains */
1115                         m->m_next = n;
1116                         return;
1117                 }
1118                 /* splat the data from one into the other */
1119                 bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
1120                     (u_int)n->m_len);
1121                 m->m_len += n->m_len;
1122                 n = m_free(n);
1123         }
1124 }
1125
1126 void
1127 m_adj(struct mbuf *mp, int req_len)
1128 {
1129         int len = req_len;
1130         struct mbuf *m;
1131         int count;
1132
1133         if ((m = mp) == NULL)
1134                 return;
1135         if (len >= 0) {
1136                 /*
1137                  * Trim from head.
1138                  */
1139                 while (m != NULL && len > 0) {
1140                         if (m->m_len <= len) {
1141                                 len -= m->m_len;
1142                                 m->m_len = 0;
1143                                 m = m->m_next;
1144                         } else {
1145                                 m->m_len -= len;
1146                                 m->m_data += len;
1147                                 len = 0;
1148                         }
1149                 }
1150                 m = mp;
1151                 if (mp->m_flags & M_PKTHDR)
1152                         m->m_pkthdr.len -= (req_len - len);
1153         } else {
1154                 /*
1155                  * Trim from tail.  Scan the mbuf chain,
1156                  * calculating its length and finding the last mbuf.
1157                  * If the adjustment only affects this mbuf, then just
1158                  * adjust and return.  Otherwise, rescan and truncate
1159                  * after the remaining size.
1160                  */
1161                 len = -len;
1162                 count = 0;
1163                 for (;;) {
1164                         count += m->m_len;
1165                         if (m->m_next == (struct mbuf *)0)
1166                                 break;
1167                         m = m->m_next;
1168                 }
1169                 if (m->m_len >= len) {
1170                         m->m_len -= len;
1171                         if (mp->m_flags & M_PKTHDR)
1172                                 mp->m_pkthdr.len -= len;
1173                         return;
1174                 }
1175                 count -= len;
1176                 if (count < 0)
1177                         count = 0;
1178                 /*
1179                  * Correct length for chain is "count".
1180                  * Find the mbuf with last data, adjust its length,
1181                  * and toss data from remaining mbufs on chain.
1182                  */
1183                 m = mp;
1184                 if (m->m_flags & M_PKTHDR)
1185                         m->m_pkthdr.len = count;
1186                 for (; m; m = m->m_next) {
1187                         if (m->m_len >= count) {
1188                                 m->m_len = count;
1189                                 break;
1190                         }
1191                         count -= m->m_len;
1192                 }
1193                 while (m->m_next)
1194                         (m = m->m_next) ->m_len = 0;
1195         }
1196 }
1197
1198 /*
1199  * Rearrange an mbuf chain so that len bytes are contiguous
1200  * and in the data area of an mbuf (so that mtod will work for a structure
1201  * of size len).  Returns the resulting mbuf chain on success, frees it and
1202  * returns null on failure.  If there is room, it will add up to
1203  * max_protohdr-len extra bytes to the contiguous region in an attempt to
1204  * avoid being called next time.
1205  */
1206 struct mbuf *
1207 m_pullup(struct mbuf *n, int len)
1208 {
1209         struct mbuf *m;
1210         int count;
1211         int space;
1212
1213         /*
1214          * If first mbuf has no cluster, and has room for len bytes
1215          * without shifting current data, pullup into it,
1216          * otherwise allocate a new mbuf to prepend to the chain.
1217          */
1218         if (!(n->m_flags & M_EXT) &&
1219             n->m_data + len < &n->m_dat[MLEN] &&
1220             n->m_next) {
1221                 if (n->m_len >= len)
1222                         return (n);
1223                 m = n;
1224                 n = n->m_next;
1225                 len -= m->m_len;
1226         } else {
1227                 if (len > MHLEN)
1228                         goto bad;
1229                 if (n->m_flags & M_PKTHDR)
1230                         m = m_gethdr(MB_DONTWAIT, n->m_type);
1231                 else
1232                         m = m_get(MB_DONTWAIT, n->m_type);
1233                 if (m == NULL)
1234                         goto bad;
1235                 m->m_len = 0;
1236                 if (n->m_flags & M_PKTHDR)
1237                         M_MOVE_PKTHDR(m, n);
1238         }
1239         space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
1240         do {
1241                 count = min(min(max(len, max_protohdr), space), n->m_len);
1242                 bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
1243                   (unsigned)count);
1244                 len -= count;
1245                 m->m_len += count;
1246                 n->m_len -= count;
1247                 space -= count;
1248                 if (n->m_len)
1249                         n->m_data += count;
1250                 else
1251                         n = m_free(n);
1252         } while (len > 0 && n);
1253         if (len > 0) {
1254                 m_free(m);
1255                 goto bad;
1256         }
1257         m->m_next = n;
1258         return (m);
1259 bad:
1260         m_freem(n);
1261         mbstat.m_mpfail++;
1262         return (NULL);
1263 }
1264
1265 /*
1266  * Partition an mbuf chain in two pieces, returning the tail --
1267  * all but the first len0 bytes.  In case of failure, it returns NULL and
1268  * attempts to restore the chain to its original state.
1269  *
1270  * Note that the resulting mbufs might be read-only, because the new
1271  * mbuf can end up sharing an mbuf cluster with the original mbuf if
1272  * the "breaking point" happens to lie within a cluster mbuf. Use the
1273  * M_WRITABLE() macro to check for this case.
1274  */
1275 struct mbuf *
1276 m_split(struct mbuf *m0, int len0, int wait)
1277 {
1278         struct mbuf *m, *n;
1279         unsigned len = len0, remain;
1280
1281         for (m = m0; m && len > m->m_len; m = m->m_next)
1282                 len -= m->m_len;
1283         if (m == NULL)
1284                 return (NULL);
1285         remain = m->m_len - len;
1286         if (m0->m_flags & M_PKTHDR) {
1287                 n = m_gethdr(wait, m0->m_type);
1288                 if (n == NULL)
1289                         return (NULL);
1290                 n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
1291                 n->m_pkthdr.len = m0->m_pkthdr.len - len0;
1292                 m0->m_pkthdr.len = len0;
1293                 if (m->m_flags & M_EXT)
1294                         goto extpacket;
1295                 if (remain > MHLEN) {
1296                         /* m can't be the lead packet */
1297                         MH_ALIGN(n, 0);
1298                         n->m_next = m_split(m, len, wait);
1299                         if (n->m_next == NULL) {
1300                                 m_free(n);
1301                                 return (NULL);
1302                         } else {
1303                                 n->m_len = 0;
1304                                 return (n);
1305                         }
1306                 } else
1307                         MH_ALIGN(n, remain);
1308         } else if (remain == 0) {
1309                 n = m->m_next;
1310                 m->m_next = 0;
1311                 return (n);
1312         } else {
1313                 n = m_get(wait, m->m_type);
1314                 if (n == NULL)
1315                         return (NULL);
1316                 M_ALIGN(n, remain);
1317         }
1318 extpacket:
1319         if (m->m_flags & M_EXT) {
1320                 KKASSERT((n->m_flags & M_EXT) == 0);
1321                 n->m_data = m->m_data + len;
1322                 m->m_ext.ext_ref(m->m_ext.ext_arg);
1323                 n->m_ext = m->m_ext;
1324                 n->m_flags |= m->m_flags & (M_EXT | M_EXT_CLUSTER);
1325         } else {
1326                 bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain);
1327         }
1328         n->m_len = remain;
1329         m->m_len = len;
1330         n->m_next = m->m_next;
1331         m->m_next = 0;
1332         return (n);
1333 }
1334
1335 /*
1336  * Routine to copy from device local memory into mbufs.
1337  * Note: "offset" is ill-defined and always called as 0, so ignore it.
1338  */
1339 struct mbuf *
1340 m_devget(char *buf, int len, int offset, struct ifnet *ifp,
1341     void (*copy)(volatile const void *from, volatile void *to, size_t length))
1342 {
1343         struct mbuf *m, *mfirst = NULL, **mtail;
1344         int nsize, flags;
1345
1346         if (copy == NULL)
1347                 copy = bcopy;
1348         mtail = &mfirst;
1349         flags = M_PKTHDR;
1350
1351         while (len > 0) {
1352                 m = m_getl(len, MB_DONTWAIT, MT_DATA, flags, &nsize);
1353                 if (m == NULL) {
1354                         m_freem(mfirst);
1355                         return (NULL);
1356                 }
1357                 m->m_len = min(len, nsize);
1358
1359                 if (flags & M_PKTHDR) {
1360                         if (len + max_linkhdr <= nsize)
1361                                 m->m_data += max_linkhdr;
1362                         m->m_pkthdr.rcvif = ifp;
1363                         m->m_pkthdr.len = len;
1364                         flags = 0;
1365                 }
1366
1367                 copy(buf, m->m_data, (unsigned)m->m_len);
1368                 buf += m->m_len;
1369                 len -= m->m_len;
1370                 *mtail = m;
1371                 mtail = &m->m_next;
1372         }
1373
1374         return (mfirst);
1375 }
1376
1377 /*
1378  * Copy data from a buffer back into the indicated mbuf chain,
1379  * starting "off" bytes from the beginning, extending the mbuf
1380  * chain if necessary.
1381  */
1382 void
1383 m_copyback(struct mbuf *m0, int off, int len, caddr_t cp)
1384 {
1385         int mlen;
1386         struct mbuf *m = m0, *n;
1387         int totlen = 0;
1388
1389         if (m0 == NULL)
1390                 return;
1391         while (off > (mlen = m->m_len)) {
1392                 off -= mlen;
1393                 totlen += mlen;
1394                 if (m->m_next == NULL) {
1395                         n = m_getclr(MB_DONTWAIT, m->m_type);
1396                         if (n == NULL)
1397                                 goto out;
1398                         n->m_len = min(MLEN, len + off);
1399                         m->m_next = n;
1400                 }
1401                 m = m->m_next;
1402         }
1403         while (len > 0) {
1404                 mlen = min (m->m_len - off, len);
1405                 bcopy(cp, off + mtod(m, caddr_t), (unsigned)mlen);
1406                 cp += mlen;
1407                 len -= mlen;
1408                 mlen += off;
1409                 off = 0;
1410                 totlen += mlen;
1411                 if (len == 0)
1412                         break;
1413                 if (m->m_next == NULL) {
1414                         n = m_get(MB_DONTWAIT, m->m_type);
1415                         if (n == NULL)
1416                                 break;
1417                         n->m_len = min(MLEN, len);
1418                         m->m_next = n;
1419                 }
1420                 m = m->m_next;
1421         }
1422 out:    if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen))
1423                 m->m_pkthdr.len = totlen;
1424 }
1425
1426 void
1427 m_print(const struct mbuf *m)
1428 {
1429         int len;
1430         const struct mbuf *m2;
1431
1432         len = m->m_pkthdr.len;
1433         m2 = m;
1434         while (len) {
1435                 kprintf("%p %*D\n", m2, m2->m_len, (u_char *)m2->m_data, "-");
1436                 len -= m2->m_len;
1437                 m2 = m2->m_next;
1438         }
1439         return;
1440 }
1441
1442 /*
1443  * "Move" mbuf pkthdr from "from" to "to".
1444  * "from" must have M_PKTHDR set, and "to" must be empty.
1445  */
1446 void
1447 m_move_pkthdr(struct mbuf *to, struct mbuf *from)
1448 {
1449         KASSERT((to->m_flags & M_PKTHDR), ("m_move_pkthdr: not packet header"));
1450
1451         to->m_flags |= from->m_flags & M_COPYFLAGS;
1452         to->m_pkthdr = from->m_pkthdr;          /* especially tags */
1453         SLIST_INIT(&from->m_pkthdr.tags);       /* purge tags from src */
1454 }
1455
1456 /*
1457  * Duplicate "from"'s mbuf pkthdr in "to".
1458  * "from" must have M_PKTHDR set, and "to" must be empty.
1459  * In particular, this does a deep copy of the packet tags.
1460  */
1461 int
1462 m_dup_pkthdr(struct mbuf *to, const struct mbuf *from, int how)
1463 {
1464         KASSERT((to->m_flags & M_PKTHDR), ("m_dup_pkthdr: not packet header"));
1465
1466         to->m_flags = (from->m_flags & M_COPYFLAGS) |
1467                       (to->m_flags & ~M_COPYFLAGS);
1468         to->m_pkthdr = from->m_pkthdr;
1469         SLIST_INIT(&to->m_pkthdr.tags);
1470         return (m_tag_copy_chain(to, from, how));
1471 }
1472
1473 /*
1474  * Defragment a mbuf chain, returning the shortest possible
1475  * chain of mbufs and clusters.  If allocation fails and
1476  * this cannot be completed, NULL will be returned, but
1477  * the passed in chain will be unchanged.  Upon success,
1478  * the original chain will be freed, and the new chain
1479  * will be returned.
1480  *
1481  * If a non-packet header is passed in, the original
1482  * mbuf (chain?) will be returned unharmed.
1483  *
1484  * m_defrag_nofree doesn't free the passed in mbuf.
1485  */
1486 struct mbuf *
1487 m_defrag(struct mbuf *m0, int how)
1488 {
1489         struct mbuf *m_new;
1490
1491         if ((m_new = m_defrag_nofree(m0, how)) == NULL)
1492                 return (NULL);
1493         if (m_new != m0)
1494                 m_freem(m0);
1495         return (m_new);
1496 }
1497
1498 struct mbuf *
1499 m_defrag_nofree(struct mbuf *m0, int how)
1500 {
1501         struct mbuf     *m_new = NULL, *m_final = NULL;
1502         int             progress = 0, length, nsize;
1503
1504         if (!(m0->m_flags & M_PKTHDR))
1505                 return (m0);
1506
1507 #ifdef MBUF_STRESS_TEST
1508         if (m_defragrandomfailures) {
1509                 int temp = karc4random() & 0xff;
1510                 if (temp == 0xba)
1511                         goto nospace;
1512         }
1513 #endif
1514
1515         m_final = m_getl(m0->m_pkthdr.len, how, MT_DATA, M_PKTHDR, &nsize);
1516         if (m_final == NULL)
1517                 goto nospace;
1518         m_final->m_len = 0;     /* in case m0->m_pkthdr.len is zero */
1519
1520         if (m_dup_pkthdr(m_final, m0, how) == NULL)
1521                 goto nospace;
1522
1523         m_new = m_final;
1524
1525         while (progress < m0->m_pkthdr.len) {
1526                 length = m0->m_pkthdr.len - progress;
1527                 if (length > MCLBYTES)
1528                         length = MCLBYTES;
1529
1530                 if (m_new == NULL) {
1531                         m_new = m_getl(length, how, MT_DATA, 0, &nsize);
1532                         if (m_new == NULL)
1533                                 goto nospace;
1534                 }
1535
1536                 m_copydata(m0, progress, length, mtod(m_new, caddr_t));
1537                 progress += length;
1538                 m_new->m_len = length;
1539                 if (m_new != m_final)
1540                         m_cat(m_final, m_new);
1541                 m_new = NULL;
1542         }
1543         if (m0->m_next == NULL)
1544                 m_defraguseless++;
1545         m_defragpackets++;
1546         m_defragbytes += m_final->m_pkthdr.len;
1547         return (m_final);
1548 nospace:
1549         m_defragfailure++;
1550         if (m_new)
1551                 m_free(m_new);
1552         m_freem(m_final);
1553         return (NULL);
1554 }
1555
1556 /*
1557  * Move data from uio into mbufs.
1558  */
1559 struct mbuf *
1560 m_uiomove(struct uio *uio)
1561 {
1562         struct mbuf *m;                 /* current working mbuf */
1563         struct mbuf *head = NULL;       /* result mbuf chain */
1564         struct mbuf **mp = &head;
1565         int resid = uio->uio_resid, nsize, flags = M_PKTHDR, error;
1566
1567         do {
1568                 m = m_getl(resid, MB_WAIT, MT_DATA, flags, &nsize);
1569                 if (flags) {
1570                         m->m_pkthdr.len = 0;
1571                         /* Leave room for protocol headers. */
1572                         if (resid < MHLEN)
1573                                 MH_ALIGN(m, resid);
1574                         flags = 0;
1575                 }
1576                 m->m_len = min(nsize, resid);
1577                 error = uiomove(mtod(m, caddr_t), m->m_len, uio);
1578                 if (error) {
1579                         m_free(m);
1580                         goto failed;
1581                 }
1582                 *mp = m;
1583                 mp = &m->m_next;
1584                 head->m_pkthdr.len += m->m_len;
1585                 resid -= m->m_len;
1586         } while (resid > 0);
1587
1588         return (head);
1589
1590 failed:
1591         m_freem(head);
1592         return (NULL);
1593 }
1594
1595 struct mbuf *
1596 m_last(struct mbuf *m)
1597 {
1598         while (m->m_next)
1599                 m = m->m_next;
1600         return (m);
1601 }
1602
1603 /*
1604  * Return the number of bytes in an mbuf chain.
1605  * If lastm is not NULL, also return the last mbuf.
1606  */
1607 u_int
1608 m_lengthm(struct mbuf *m, struct mbuf **lastm)
1609 {
1610         u_int len = 0;
1611         struct mbuf *prev = m;
1612
1613         while (m) {
1614                 len += m->m_len;
1615                 prev = m;
1616                 m = m->m_next;
1617         }
1618         if (lastm != NULL)
1619                 *lastm = prev;
1620         return (len);
1621 }
1622
1623 /*
1624  * Like m_lengthm(), except also keep track of mbuf usage.
1625  */
1626 u_int
1627 m_countm(struct mbuf *m, struct mbuf **lastm, u_int *pmbcnt)
1628 {
1629         u_int len = 0, mbcnt = 0;
1630         struct mbuf *prev = m;
1631
1632         while (m) {
1633                 len += m->m_len;
1634                 mbcnt += MSIZE;
1635                 if (m->m_flags & M_EXT)
1636                         mbcnt += m->m_ext.ext_size;
1637                 prev = m;
1638                 m = m->m_next;
1639         }
1640         if (lastm != NULL)
1641                 *lastm = prev;
1642         *pmbcnt = mbcnt;
1643         return (len);
1644 }