/*
 * Copyright (c) 2004 Jeffrey M. Hsu.  All rights reserved.
 * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Jeffrey M. Hsu.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Copyright (c) 2004 Jeffrey M. Hsu.  All rights reserved.
 *
 * License terms: all terms for the DragonFly license above plus the following:
 *
 * 4. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *
 *	This product includes software developed by Jeffrey M. Hsu
 *	for the DragonFly Project.
 *
 *    This requirement may be waived with permission from Jeffrey Hsu.
 *    This requirement will sunset and may be removed on July 8 2005,
 *    after which the standard DragonFly license (as shown above) will
 *    apply.
 */

/*
 * Copyright (c) 1982, 1986, 1988, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_mbuf.c	8.2 (Berkeley) 1/4/94
 * $FreeBSD: src/sys/kern/uipc_mbuf.c,v 1.51.2.24 2003/04/15 06:59:29 silby Exp $
 * $DragonFly: src/sys/kern/uipc_mbuf.c,v 1.36 2005/04/20 21:37:06 hsu Exp $
 */
88 #include "opt_param.h"
89 #include "opt_mbuf_stress_test.h"
90 #include <sys/param.h>
91 #include <sys/systm.h>
92 #include <sys/malloc.h>
94 #include <sys/kernel.h>
95 #include <sys/sysctl.h>
96 #include <sys/domain.h>
97 #include <sys/protosw.h>
99 #include <sys/thread.h>
100 #include <sys/globaldata.h>
101 #include <sys/thread2.h>
104 #include <vm/vm_kern.h>
105 #include <vm/vm_extern.h>
108 #include <machine/cpu.h>
/*
 * mbuf cluster meta-data
 */
typedef struct mbcluster {
	struct mbcluster *mcl_next;
	int32_t	mcl_magic;
	int32_t	mcl_refs;
	void	*mcl_data;
} *mbcluster_t;

typedef struct mbuf *mbuf_t;

#define MCL_MAGIC	0x6d62636c
static void mbinit (void *);
SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbinit, NULL)

static u_long	mbtypes[MT_NTYPES];

struct mbstat mbstat;
int	nmbclusters;
int	nmbufs;
int	max_linkhdr;
int	max_protohdr;
int	max_hdr;
int	max_datalen;
int	m_defragpackets;
int	m_defragbytes;
int	m_defraguseless;
int	m_defragfailure;
#ifdef MBUF_STRESS_TEST
int	m_defragrandomfailures;
#endif

int	mbuf_wait = 32;		/* mbuf sleep time in ticks */
u_int	m_mballoc_wid = 0;
u_int	m_clalloc_wid = 0;
SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RW,
	&max_linkhdr, 0, "");
SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RW,
	&max_protohdr, 0, "");
SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RW, &max_hdr, 0, "");
SYSCTL_INT(_kern_ipc, KIPC_MAX_DATALEN, max_datalen, CTLFLAG_RW,
	&max_datalen, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, mbuf_wait, CTLFLAG_RW,
	&mbuf_wait, 0, "");
SYSCTL_STRUCT(_kern_ipc, KIPC_MBSTAT, mbstat, CTLFLAG_RW, &mbstat, mbstat, "");
SYSCTL_OPAQUE(_kern_ipc, OID_AUTO, mbtypes, CTLFLAG_RD, mbtypes,
	sizeof(mbtypes), "LU", "");
SYSCTL_INT(_kern_ipc, KIPC_NMBCLUSTERS, nmbclusters, CTLFLAG_RW,
	&nmbclusters, 0, "Maximum number of mbuf clusters available");
SYSCTL_INT(_kern_ipc, OID_AUTO, nmbufs, CTLFLAG_RW, &nmbufs, 0,
	"Maximum number of mbufs available");
SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragpackets, CTLFLAG_RD,
	&m_defragpackets, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragbytes, CTLFLAG_RD,
	&m_defragbytes, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, m_defraguseless, CTLFLAG_RD,
	&m_defraguseless, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragfailure, CTLFLAG_RD,
	&m_defragfailure, 0, "");
#ifdef MBUF_STRESS_TEST
SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragrandomfailures, CTLFLAG_RW,
	&m_defragrandomfailures, 0, "");
#endif
static int mcl_pool_count;
static int mcl_pool_max = 20;
static int mcl_free_max = 1000;
static int mbuf_free_max = 5000;

SYSCTL_INT(_kern_ipc, OID_AUTO, mcl_pool_max, CTLFLAG_RW, &mcl_pool_max, 0,
	"Maximum number of mbufs+cluster in free list");
SYSCTL_INT(_kern_ipc, OID_AUTO, mcl_pool_count, CTLFLAG_RD, &mcl_pool_count, 0,
	"Current number of mbufs+cluster in free list");
SYSCTL_INT(_kern_ipc, OID_AUTO, mcl_free_max, CTLFLAG_RW, &mcl_free_max, 0,
	"Maximum number of clusters on the free list");
SYSCTL_INT(_kern_ipc, OID_AUTO, mbuf_free_max, CTLFLAG_RW, &mbuf_free_max, 0,
	"Maximum number of mbufs on the free list");

static MALLOC_DEFINE(M_MBUF, "mbuf", "mbuf");
static MALLOC_DEFINE(M_MBUFCL, "mbufcl", "mbufcl");

static mbuf_t	mmbfree;
static mbcluster_t mclfree;
static struct mbuf *mcl_pool;
static void m_reclaim (void);
static int m_mballoc (int nmb, int how);
static int m_clalloc (int ncl, int how);
static struct mbuf *m_mballoc_wait (int caller, int type);
static void m_mclref (void *arg);
static void m_mclfree (void *arg);
#ifndef NMBCLUSTERS
#define NMBCLUSTERS	(512 + maxusers * 16)
#endif
#ifndef NMBUFS
#define NMBUFS		(nmbclusters * 4)
#endif

/*
 * Perform sanity checks of tunables declared above.
 */
static void
tunable_mbinit(void *dummy)
{
	/*
	 * This has to be done before VM init.
	 */
	nmbclusters = NMBCLUSTERS;
	TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters);
	nmbufs = NMBUFS;
	TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs);
	/* Sanity checks */
	if (nmbufs < nmbclusters * 2)
		nmbufs = nmbclusters * 2;
}
SYSINIT(tunable_mbinit, SI_SUB_TUNABLES, SI_ORDER_ANY, tunable_mbinit, NULL);
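
/*
 * Editor's note (illustrative, not part of the original file): because the
 * values above are fetched with TUNABLE_INT_FETCH() before VM init, they
 * can only be changed from the boot loader, e.g. in /boot/loader.conf:
 *
 *	kern.ipc.nmbclusters="65536"
 *	kern.ipc.nmbufs="131072"
 *
 * The numbers shown are examples only; the sanity check above clamps
 * nmbufs to at least twice nmbclusters regardless of the tunable.
 */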
234 /* "number of clusters of pages" */
245 mbstat
.m_msize
= MSIZE
;
246 mbstat
.m_mclbytes
= MCLBYTES
;
247 mbstat
.m_minclsize
= MINCLSIZE
;
248 mbstat
.m_mlen
= MLEN
;
249 mbstat
.m_mhlen
= MHLEN
;
252 if (m_mballoc(NMB_INIT
, MB_DONTWAIT
) == 0)
254 #if MCLBYTES <= PAGE_SIZE
255 if (m_clalloc(NCL_INIT
, MB_DONTWAIT
) == 0)
258 /* It's OK to call contigmalloc in this context. */
259 if (m_clalloc(16, MB_WAIT
) == 0)
/*
 * Allocate at least nmb mbufs and place on mbuf free list.
 * Returns the number of mbufs successfully allocated, 0 if none.
 *
 * Must be called while in a critical section.
 */
static int
m_mballoc(int nmb, int how)
{
	struct mbuf *m;
	int i;

	/*
	 * If we've hit the mbuf limit, stop allocating (or trying to)
	 * in order to avoid exhausting kernel memory entirely.
	 */
	if ((nmb + mbstat.m_mbufs) > nmbufs)
		return (0);

	/*
	 * Attempt to allocate the requested number of mbufs, terminate when
	 * the allocation fails but if blocking is allowed allocate at least
	 * one.
	 */
	for (i = 0; i < nmb; ++i) {
		m = malloc(MSIZE, M_MBUF, M_NOWAIT|M_NULLOK|M_ZERO);
		if (m == NULL) {
			if (how == MB_WAIT) {
				mbstat.m_wait++;
				m = malloc(MSIZE, M_MBUF,
					   M_WAITOK|M_NULLOK|M_ZERO);
			}
			if (m == NULL)
				break;
		}
		m->m_type = MT_FREE;
		m->m_next = mmbfree;
		mmbfree = m;
		++mbstat.m_mbufs;
		++mbtypes[MT_FREE];
		how = MB_DONTWAIT;
	}
	return (i);
}
/*
 * Once mbuf memory has been exhausted and if the call to the allocation macros
 * (or, in some cases, functions) is with MB_WAIT, then it is necessary to rely
 * solely on reclaimed mbufs. Here we wait for an mbuf to be freed for a
 * designated (mbuf_wait) time.
 */
static struct mbuf *
m_mballoc_wait(int caller, int type)
{
	struct mbuf *m = NULL;

	m_mballoc_wid++;
	if ((tsleep(&m_mballoc_wid, 0, "mballc", mbuf_wait)) == EWOULDBLOCK)
		m_mballoc_wid--;

	/*
	 * Now that we (think) that we've got something, we will redo an
	 * MGET, but avoid getting into another instance of m_mballoc_wait()
	 * XXX: We retry to fetch _even_ if the sleep timed out. This is left
	 *      this way, purposely, in the [unlikely] case that an mbuf was
	 *      freed but the sleep was not awakened in time.
	 */
	switch (caller) {
	case MGET_C:
		MGET(m, MB_DONTWAIT, type);
		break;
	case MGETHDR_C:
		MGETHDR(m, MB_DONTWAIT, type);
		break;
	default:
		panic("m_mballoc_wait: invalid caller (%d)", caller);
	}

	if (m != NULL) {		/* We waited and got something... */
		mbstat.m_wait++;
		/* Wake up another if we have more free. */
		if (mmbfree != NULL)
			MMBWAKEUP();
	}
	return (m);
}
360 #if MCLBYTES > PAGE_SIZE
361 static int i_want_my_mcl
;
370 tsleep(&i_want_my_mcl
, 0, "mclalloc", 0);
372 while (i_want_my_mcl
> 0) {
373 if (m_clalloc(1, MB_WAIT
) == 0)
374 printf("m_clalloc failed even in thread context!\n");
382 static struct thread
*mclallocthread
;
383 static struct kproc_desc mclalloc_kp
= {
388 SYSINIT(mclallocthread
, SI_SUB_KTHREAD_UPDATE
, SI_ORDER_ANY
, kproc_start
,
/*
 * Allocate at least ncl mbuf clusters and place on mbuf free list.
 * Returns the number of mbuf clusters successfully allocated, 0 if none.
 *
 * Must be called while in a critical section.
 */
static int
m_clalloc(int ncl, int how)
{
	static int last_report;
	mbcluster_t mcl;
	void *data;
	int i;

	/*
	 * If we've hit the mbuf cluster limit, stop allocating (or trying to).
	 */
	if ((ncl + mbstat.m_clusters) > nmbclusters)
		return (0);

	/*
	 * Attempt to allocate the requested number of mbuf clusters,
	 * terminate when the allocation fails but if blocking is allowed
	 * allocate at least one.
	 *
	 * We need to allocate two structures for each cluster... a
	 * ref counting / governing structure and the actual data.  MCLBYTES
	 * should be a power of 2 which means that the slab allocator will
	 * return a buffer that does not cross a page boundary.
	 */
	for (i = 0; i < ncl; ++i) {
		/*
		 * Meta structure
		 */
		mcl = malloc(sizeof(*mcl), M_MBUFCL, M_NOWAIT|M_NULLOK|M_ZERO);
		if (mcl == NULL) {
			if (how == MB_WAIT) {
				mbstat.m_wait++;
				mcl = malloc(sizeof(*mcl),
					     M_MBUFCL, M_WAITOK|M_NULLOK|M_ZERO);
			}
			if (mcl == NULL)
				break;
		}
		/*
		 * Physically contiguous data buffer.
		 */
#if MCLBYTES > PAGE_SIZE
		if (how != MB_WAIT) {
			i_want_my_mcl += ncl - i;
			wakeup(&i_want_my_mcl);
			mbstat.m_wait++;
			data = NULL;
		} else {
			data = contigmalloc_map(MCLBYTES, M_MBUFCL,
				M_WAITOK, 0ul, ~0ul, PAGE_SIZE, 0, kernel_map);
		}
#else
		data = malloc(MCLBYTES, M_MBUFCL, M_NOWAIT|M_NULLOK);
		if (data == NULL) {
			if (how == MB_WAIT) {
				mbstat.m_wait++;
				data = malloc(MCLBYTES, M_MBUFCL,
					      M_WAITOK|M_NULLOK);
			}
		}
#endif
		if (data == NULL) {
			free(mcl, M_MBUFCL);
			break;
		}

		mcl->mcl_next = mclfree;
		mcl->mcl_data = data;
		mcl->mcl_magic = MCL_MAGIC;
		mcl->mcl_refs = 0;
		mclfree = mcl;
		++mbstat.m_clfree;
		++mbstat.m_clusters;
		how = MB_DONTWAIT;
	}

	/*
	 * If we could not allocate any, report failure no more often than
	 * once a second.
	 */
	if (i == 0) {
		mbstat.m_drops++;
		if (ticks < last_report || (ticks - last_report) >= hz) {
			last_report = ticks;
			printf("All mbuf clusters exhausted, please see tuning(7).\n");
		}
	}
	return (i);
}
/*
 * Once cluster memory has been exhausted and the allocation is called with
 * MB_WAIT, we rely on the mclfree pointers. If nothing is free, we will
 * sleep for a designated amount of time (mbuf_wait) or until we're woken up
 * due to sudden mcluster availability.
 *
 * Must be called while in a critical section.
 */
static void
m_clalloc_wait(void)
{
	/* If in interrupt context, and INVARIANTS, maintain sanity and die. */
	KASSERT(mycpu->gd_intr_nesting_level == 0,
		("CLALLOC: CANNOT WAIT IN INTERRUPT"));

	/*
	 * Sleep until something's available or until we expire.
	 */
	m_clalloc_wid++;
	if ((tsleep(&m_clalloc_wid, 0, "mclalc", mbuf_wait)) == EWOULDBLOCK)
		m_clalloc_wid--;

	/*
	 * Try the allocation once more, and if we see more than two
	 * free entries wake up others as well.
	 */
	m_clalloc(1, MB_WAIT);
	if (mclfree && mclfree->mcl_next) {
		MCLWAKEUP();
	}
}
/*
 * Return the number of references to this mbuf's data.  0 is returned
 * if the mbuf is not M_EXT, a reference count is returned if it is
 * M_EXT|M_EXT_CLUSTER, and 99 is returned if it is a special M_EXT.
 */
int
m_sharecount(struct mbuf *m)
{
	int count;

	switch(m->m_flags & (M_EXT|M_EXT_CLUSTER)) {
	case 0:
		count = 0;
		break;
	case M_EXT:
		count = 99;
		break;
	case M_EXT|M_EXT_CLUSTER:
		count = ((mbcluster_t)m->m_ext.ext_arg)->mcl_refs;
		break;
	default:
		panic("bad mbuf flags: %p", m);
		count = 0;
		break;
	}
	return (count);
}
/*
 * change mbuf to new type
 */
void
m_chtype(struct mbuf *m, int type)
{
	crit_enter();
	--mbtypes[m->m_type];
	++mbtypes[type];
	m->m_type = type;
	crit_exit();
}
/*
 * When MGET fails, ask protocols to free space when short of memory,
 * then re-attempt to allocate an mbuf.
 */
struct mbuf *
m_retry(int how, int t)
{
	struct mbuf *m;

	/*
	 * Must only do the reclaim if not in an interrupt context.
	 */
	if (how == MB_WAIT) {
		KASSERT(mycpu->gd_intr_nesting_level == 0,
			("MBALLOC: CANNOT WAIT IN INTERRUPT"));
		m_reclaim();
	}

	/*
	 * Try to pull a new mbuf out of the cache, if the cache is empty
	 * try to allocate a new one and if that doesn't work we give up.
	 */
	crit_enter();
	if ((m = mmbfree) == NULL) {
		m_mballoc(1, how);
		if ((m = mmbfree) == NULL) {
			static int last_report;

			mbstat.m_drops++;
			crit_exit();
			if (ticks < last_report ||
			    (ticks - last_report) >= hz) {
				last_report = ticks;
				printf("All mbufs exhausted, please see tuning(7).\n");
			}
			return (NULL);
		}
	}

	/*
	 * Cache case, adjust globals before leaving the critical section
	 */
	mmbfree = m->m_next;
	--mbtypes[MT_FREE];
	++mbtypes[t];
	mbstat.m_wait++;
	crit_exit();

	m->m_type = t;
	m->m_next = NULL;
	m->m_nextpkt = NULL;
	m->m_data = m->m_dat;
	m->m_flags = 0;
	return (m);
}
/*
 * As above; retry an MGETHDR.
 */
struct mbuf *
m_retryhdr(int how, int t)
{
	struct mbuf *m;

	/*
	 * Must only do the reclaim if not in an interrupt context.
	 */
	if (how == MB_WAIT) {
		KASSERT(mycpu->gd_intr_nesting_level == 0,
			("MBALLOC: CANNOT WAIT IN INTERRUPT"));
		m_reclaim();
	}

	/*
	 * Try to pull a new mbuf out of the cache, if the cache is empty
	 * try to allocate a new one and if that doesn't work we give up.
	 */
	crit_enter();
	if ((m = mmbfree) == NULL) {
		m_mballoc(1, how);
		if ((m = mmbfree) == NULL) {
			static int last_report;

			mbstat.m_drops++;
			crit_exit();
			if (ticks < last_report ||
			    (ticks - last_report) >= hz) {
				last_report = ticks;
				printf("All mbufs exhausted, please see tuning(7).\n");
			}
			return (NULL);
		}
	}

	/*
	 * Cache case, adjust globals before leaving the critical section
	 */
	mmbfree = m->m_next;
	--mbtypes[MT_FREE];
	++mbtypes[t];
	mbstat.m_wait++;
	crit_exit();

	m->m_type = t;
	m->m_next = NULL;
	m->m_nextpkt = NULL;
	m->m_data = m->m_pktdat;
	m->m_flags = M_PKTHDR;
	m->m_pkthdr.rcvif = NULL;
	SLIST_INIT(&m->m_pkthdr.tags);
	m->m_pkthdr.csum_flags = 0;
	return (m);
}
static void
m_reclaim(void)
{
	struct domain *dp;
	struct protosw *pr;

	crit_enter();
	SLIST_FOREACH(dp, &domains, dom_next) {
		for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) {
			if (pr->pr_drain)
				(*pr->pr_drain)();
		}
	}
	crit_exit();
	mbstat.m_drain++;
}
/*
 * Allocate an mbuf.  If no mbufs are immediately available try to
 * bring a bunch more into our cache (mmbfree list).  A critical
 * section is required to protect the mmbfree list and counters
 * against interrupts.
 */
struct mbuf *
m_get(int how, int type)
{
	struct mbuf *m;

	/*
	 * Try to pull a new mbuf out of the cache, if the cache is empty
	 * try to allocate a new one and if that doesn't work try even harder
	 * by calling m_retry().
	 */
	crit_enter();
	if ((m = mmbfree) == NULL) {
		m_mballoc(1, how);
		if ((m = mmbfree) == NULL) {
			crit_exit();
			m = m_retry(how, type);
			if (m == NULL && how == MB_WAIT)
				m = m_mballoc_wait(MGET_C, type);
			return (m);
		}
	}

	/*
	 * Cache case, adjust globals before leaving the critical section
	 */
	mmbfree = m->m_next;
	--mbtypes[MT_FREE];
	++mbtypes[type];
	crit_exit();

	m->m_type = type;
	m->m_next = NULL;
	m->m_nextpkt = NULL;
	m->m_data = m->m_dat;
	m->m_flags = 0;
	return (m);
}
struct mbuf *
m_gethdr(int how, int type)
{
	struct mbuf *m;

	/*
	 * Try to pull a new mbuf out of the cache, if the cache is empty
	 * try to allocate a new one and if that doesn't work try even harder
	 * by calling m_retryhdr().
	 */
	crit_enter();
	if ((m = mmbfree) == NULL) {
		m_mballoc(1, how);
		if ((m = mmbfree) == NULL) {
			crit_exit();
			m = m_retryhdr(how, type);
			if (m == NULL && how == MB_WAIT)
				m = m_mballoc_wait(MGETHDR_C, type);
			return (m);
		}
	}

	/*
	 * Cache case, adjust globals before leaving the critical section
	 */
	mmbfree = m->m_next;
	--mbtypes[MT_FREE];
	++mbtypes[type];
	crit_exit();

	m->m_type = type;
	m->m_next = NULL;
	m->m_nextpkt = NULL;
	m->m_data = m->m_pktdat;
	m->m_flags = M_PKTHDR;
	m->m_pkthdr.rcvif = NULL;
	SLIST_INIT(&m->m_pkthdr.tags);
	m->m_pkthdr.csum_flags = 0;
	m->m_pkthdr.fw_flags = 0;
	return (m);
}
struct mbuf *
m_getclr(int how, int type)
{
	struct mbuf *m;

	if ((m = m_get(how, type)) != NULL) {
		bzero(mtod(m, caddr_t), MLEN);
	}
	return (m);
}
/*
 * m_getcl() returns an mbuf with an attached cluster.
 * Because many network drivers use this kind of buffers a lot, it is
 * convenient to keep a small pool of free buffers of this kind.
 * Even a small size such as 10 gives about 10% improvement in the
 * forwarding rate in a bridge or router.
 * The size of this free list is controlled by the sysctl variable
 * mcl_pool_max. The list is populated on m_freem(), and used in
 * m_getcl() if elements are available.
 */
struct mbuf *
m_getcl(int how, short type, int flags)
{
	struct mbuf *mp;

	crit_enter();
	if (flags & M_PKTHDR) {
		if (type == MT_DATA && mcl_pool) {
			mp = mcl_pool;
			mcl_pool = mp->m_nextpkt;
			--mcl_pool_count;
			crit_exit();
			mp->m_nextpkt = NULL;
			mp->m_data = mp->m_ext.ext_buf;
			mp->m_flags = M_PKTHDR|M_EXT|M_EXT_CLUSTER;
			mp->m_pkthdr.rcvif = NULL;
			mp->m_pkthdr.csum_flags = 0;
			return (mp);
		}
		crit_exit();
		MGETHDR(mp, how, type);
	} else {
		crit_exit();
		MGET(mp, how, type);
	}
	if (mp) {
		m_mclget(mp, how);
		if ((mp->m_flags & M_EXT) == 0) {
			m_free(mp);
			mp = NULL;
		}
	}
	return (mp);
}
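
#if 0
/*
 * Editor's illustrative sketch (not part of the original file): the typical
 * driver receive-ring refill that motivates the mcl_pool cache above.  The
 * function name "example_rxfill" and the variable "ifp" are hypothetical.
 */
static struct mbuf *
example_rxfill(struct ifnet *ifp)
{
	struct mbuf *m;

	/* One call yields a packet-header mbuf with a cluster attached. */
	m = m_getcl(MB_DONTWAIT, MT_DATA, M_PKTHDR);
	if (m == NULL)
		return (NULL);
	m->m_pkthdr.rcvif = ifp;
	m->m_len = m->m_pkthdr.len = MCLBYTES;
	return (m);
}
#endif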
/*
 * m_getm(m, len, how, type)
 *
 * This will allocate len-worth of mbufs and/or mbuf clusters (whatever fits
 * best) and return a pointer to the top of the allocated chain. If m is
 * non-null, then we assume that it is a single mbuf or an mbuf chain to
 * which we want len bytes worth of mbufs and/or clusters attached, and so
 * if we succeed in allocating it, we will just return a pointer to m.
 *
 * If we happen to fail at any point during the allocation, we will free
 * up everything we have already allocated and return NULL.
 */
struct mbuf *
m_getm(struct mbuf *m, int len, int how, int type)
{
	struct mbuf *top, *tail, *mp, *mtail = NULL;

	KASSERT(len >= 0, ("len is < 0 in m_getm"));

	mp = m_get(how, type);
	if (mp == NULL) {
		return (NULL);
	} else if (len > MINCLSIZE) {
		m_mclget(mp, how);
		if ((mp->m_flags & M_EXT) == 0) {
			m_free(mp);
			return (NULL);
		}
	}
	mp->m_len = 0;
	len -= M_TRAILINGSPACE(mp);

	if (m != NULL) {
		for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next)
			;
	} else {
		m = mp;
	}

	top = tail = mp;
	while (len > 0) {
		mp = m_get(how, type);
		if (mp == NULL)
			goto failed;

		tail->m_next = mp;
		tail = mp;
		if (len > MINCLSIZE) {
			m_mclget(mp, how);
			if ((mp->m_flags & M_EXT) == 0)
				goto failed;
		}

		mp->m_len = 0;
		len -= M_TRAILINGSPACE(mp);
	}

	if (mtail != NULL)
		mtail->m_next = top;
	return (m);

failed:
	m_freem(top);
	return (NULL);
}
/*
 * m_mclget() - Adds a cluster to a normal mbuf, M_EXT is set on success.
 */
void
m_mclget(struct mbuf *m, int how)
{
	mbcluster_t mcl;

	KKASSERT((m->m_flags & M_EXT_OLD) == 0);

	/*
	 * Allocate a cluster, return if we can't get one.
	 */
	crit_enter();
	if ((mcl = mclfree) == NULL) {
		m_clalloc(1, how);
		if ((mcl = mclfree) == NULL) {
			if (how == MB_WAIT) {
				m_clalloc_wait();
				mcl = mclfree;
			}
			if (mcl == NULL) {
				crit_exit();
				return;
			}
		}
	}

	/*
	 * We have a cluster, unlink it from the free list and set the ref
	 * count.
	 */
	KKASSERT(mcl->mcl_refs == 0);
	mclfree = mcl->mcl_next;
	mcl->mcl_refs = 1;
	--mbstat.m_clfree;
	crit_exit();

	/*
	 * Add the cluster to the mbuf.  The caller will detect that the
	 * mbuf now has an attached cluster.
	 */
	m->m_ext.ext_arg = mcl;
	m->m_ext.ext_buf = mcl->mcl_data;
	m->m_ext.ext_nref.new = m_mclref;
	m->m_ext.ext_nfree.new = m_mclfree;
	m->m_ext.ext_size = MCLBYTES;

	m->m_data = m->m_ext.ext_buf;
	m->m_flags |= M_EXT | M_EXT_CLUSTER;
}
static void
m_mclfree(void *arg)
{
	mbcluster_t mcl = arg;

	KKASSERT(mcl->mcl_magic == MCL_MAGIC);
	KKASSERT(mcl->mcl_refs > 0);
	crit_enter();
	if (--mcl->mcl_refs == 0) {
		if (mbstat.m_clfree < mcl_free_max) {
			mcl->mcl_next = mclfree;
			mclfree = mcl;
			++mbstat.m_clfree;
			MCLWAKEUP();
		} else {
			--mbstat.m_clusters;
			free(mcl->mcl_data, M_MBUFCL);
			free(mcl, M_MBUFCL);
		}
	}
	crit_exit();
}

static void
m_mclref(void *arg)
{
	mbcluster_t mcl = arg;

	KKASSERT(mcl->mcl_magic == MCL_MAGIC);
	crit_enter();
	++mcl->mcl_refs;
	crit_exit();
}
/*
 * Helper routines for M_EXT reference/free
 */
static __inline void
m_extref(const struct mbuf *m)
{
	KKASSERT(m->m_ext.ext_nfree.any != NULL);
	crit_enter();
	if (m->m_flags & M_EXT_OLD)
		m->m_ext.ext_nref.old(m->m_ext.ext_buf, m->m_ext.ext_size);
	else
		m->m_ext.ext_nref.new(m->m_ext.ext_arg);
	crit_exit();
}
/*
 * Free a single mbuf and any associated external storage.  The successor,
 * if any, is returned.
 *
 * We do need to check non-first mbuf for m_aux, since some of existing
 * code does not call M_PREPEND properly.
 * (example: call to bpf_mtap from drivers)
 */
struct mbuf *
m_free(struct mbuf *m)
{
	struct mbuf *n;

	crit_enter();
	KASSERT(m->m_type != MT_FREE, ("freeing free mbuf %p", m));

	/*
	 * Adjust our type count and delete any attached chains if the
	 * mbuf is a packet header.
	 */
	if ((m->m_flags & M_PKTHDR) != 0)
		m_tag_delete_chain(m, NULL);

	/*
	 * Place the mbuf on the appropriate free list.  Try to maintain a
	 * small cache of mbuf+cluster pairs.
	 */
	n = m->m_next;
	m->m_next = NULL;
	if (m->m_flags & M_EXT) {
		KKASSERT(m->m_ext.ext_nfree.any != NULL);
		if (mcl_pool_count < mcl_pool_max && m && m->m_next == NULL &&
		    (m->m_flags & (M_PKTHDR|M_EXT_CLUSTER)) == (M_PKTHDR|M_EXT_CLUSTER) &&
		    m->m_type == MT_DATA && M_EXT_WRITABLE(m) ) {
			KKASSERT(((mbcluster_t)m->m_ext.ext_arg)->mcl_magic == MCL_MAGIC);
			m->m_nextpkt = mcl_pool;
			mcl_pool = m;
			++mcl_pool_count;
			crit_exit();
			return (n);
		}
		if (m->m_flags & M_EXT_OLD)
			m->m_ext.ext_nfree.old(m->m_ext.ext_buf, m->m_ext.ext_size);
		else
			m->m_ext.ext_nfree.new(m->m_ext.ext_arg);
		m->m_flags = 0;
		m->m_ext.ext_arg = NULL;
		m->m_ext.ext_nref.new = NULL;
		m->m_ext.ext_nfree.new = NULL;
	}
	--mbtypes[m->m_type];
	if (mbtypes[MT_FREE] < mbuf_free_max) {
		m->m_type = MT_FREE;
		++mbtypes[MT_FREE];
		m->m_next = mmbfree;
		mmbfree = m;
		MMBWAKEUP();
	} else {
		free(m, M_MBUF);
		--mbstat.m_mbufs;
	}
	crit_exit();
	return (n);
}
void
m_freem(struct mbuf *m)
{
	crit_enter();
	while (m)
		m = m_free(m);
	crit_exit();
}
/*
 * mbuf utility routines
 */

/*
 * Lesser-used path for M_PREPEND:
 * allocate new mbuf to prepend to chain,
 * copy junk along.
 */
struct mbuf *
m_prepend(struct mbuf *m, int len, int how)
{
	struct mbuf *mn;

	MGET(mn, how, m->m_type);
	if (mn == (struct mbuf *)NULL) {
		m_freem(m);
		return ((struct mbuf *)NULL);
	}
	if (m->m_flags & M_PKTHDR)
		M_MOVE_PKTHDR(mn, m);
	mn->m_next = m;
	m = mn;
	if (len < MHLEN)
		MH_ALIGN(m, len);
	m->m_len = len;
	return (m);
}
/*
 * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
 * continuing for "len" bytes.  If len is M_COPYALL, copy to end of mbuf.
 * The wait parameter is a choice of MB_WAIT/MB_DONTWAIT from caller.
 * Note that the copy is read-only, because clusters are not copied,
 * only their reference counts are incremented.
 */
#define MCFail (mbstat.m_mcfail)

struct mbuf *
m_copym(const struct mbuf *m, int off0, int len, int wait)
{
	struct mbuf *n, **np;
	int off = off0;
	struct mbuf *top;
	int copyhdr = 0;

	KASSERT(off >= 0, ("m_copym, negative off %d", off));
	KASSERT(len >= 0, ("m_copym, negative len %d", len));
	if (off == 0 && m->m_flags & M_PKTHDR)
		copyhdr = 1;
	while (off > 0) {
		KASSERT(m != NULL, ("m_copym, offset > size of mbuf chain"));
		if (off < m->m_len)
			break;
		off -= m->m_len;
		m = m->m_next;
	}
	np = &top;
	top = NULL;
	while (len > 0) {
		if (m == NULL) {
			KASSERT(len == M_COPYALL,
			    ("m_copym, length > size of mbuf chain"));
			break;
		}
		MGET(n, wait, m->m_type);
		*np = n;
		if (n == NULL)
			goto nospace;
		if (copyhdr) {
			if (!m_dup_pkthdr(n, m, wait))
				goto nospace;
			if (len == M_COPYALL)
				n->m_pkthdr.len -= off0;
			else
				n->m_pkthdr.len = len;
			copyhdr = 0;
		}
		n->m_len = min(len, m->m_len - off);
		if (m->m_flags & M_EXT) {
			n->m_data = m->m_data + off;
			m_extref(m);
			n->m_ext = m->m_ext;
			n->m_flags |= m->m_flags &
				      (M_EXT | M_EXT_OLD | M_EXT_CLUSTER);
		} else {
			bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t),
			      (unsigned)n->m_len);
		}
		if (len != M_COPYALL)
			len -= n->m_len;
		off = 0;
		m = m->m_next;
		np = &n->m_next;
	}
	if (top == NULL)
		MCFail++;
	return (top);
nospace:
	m_freem(top);
	MCFail++;
	return (NULL);
}
/*
 * Copy an entire packet, including header (which must be present).
 * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'.
 * Note that the copy is read-only, because clusters are not copied,
 * only their reference counts are incremented.
 * Preserve alignment of the first mbuf so if the creator has left
 * some room at the beginning (e.g. for inserting protocol headers)
 * the copies also have the room available.
 */
struct mbuf *
m_copypacket(struct mbuf *m, int how)
{
	struct mbuf *top, *n, *o;

	MGET(n, how, m->m_type);
	top = n;
	if (!n)
		goto nospace;

	if (!m_dup_pkthdr(n, m, how))
		goto nospace;
	n->m_len = m->m_len;
	if (m->m_flags & M_EXT) {
		n->m_data = m->m_data;
		m_extref(m);
		n->m_ext = m->m_ext;
		n->m_flags |= m->m_flags & (M_EXT | M_EXT_OLD | M_EXT_CLUSTER);
	} else {
		n->m_data = n->m_pktdat + (m->m_data - m->m_pktdat);
		bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
	}

	m = m->m_next;
	while (m) {
		MGET(o, how, m->m_type);
		if (!o)
			goto nospace;

		n->m_next = o;
		n = n->m_next;

		n->m_len = m->m_len;
		if (m->m_flags & M_EXT) {
			n->m_data = m->m_data;
			m_extref(m);
			n->m_ext = m->m_ext;
			n->m_flags |= m->m_flags &
				      (M_EXT | M_EXT_OLD | M_EXT_CLUSTER);
		} else {
			bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
		}

		m = m->m_next;
	}
	return (top);
nospace:
	m_freem(top);
	MCFail++;
	return (NULL);
}
/*
 * Copy data from an mbuf chain starting "off" bytes from the beginning,
 * continuing for "len" bytes, into the indicated buffer.
 */
void
m_copydata(const struct mbuf *m, int off, int len, caddr_t cp)
{
	unsigned count;

	KASSERT(off >= 0, ("m_copydata, negative off %d", off));
	KASSERT(len >= 0, ("m_copydata, negative len %d", len));
	while (off > 0) {
		KASSERT(m != NULL, ("m_copydata, offset > size of mbuf chain"));
		if (off < m->m_len)
			break;
		off -= m->m_len;
		m = m->m_next;
	}
	while (len > 0) {
		KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain"));
		count = min(m->m_len - off, len);
		bcopy(mtod(m, caddr_t) + off, cp, count);
		len -= count;
		cp += count;
		off = 0;
		m = m->m_next;
	}
}
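
#if 0
/*
 * Editor's illustrative sketch (not part of the original file): pulling a
 * fixed-size header out of a chain into a local buffer, a common
 * m_copydata() pattern.  "struct example_hdr" is hypothetical.
 */
struct example_hdr {
	u_int32_t	eh_field;
};

static void
example_peek(const struct mbuf *m, int off)
{
	struct example_hdr hdr;

	/* Safe even when the header spans several mbufs in the chain. */
	m_copydata(m, off, sizeof(hdr), (caddr_t)&hdr);
}
#endif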
/*
 * Copy a packet header mbuf chain into a completely new chain, including
 * copying any mbuf clusters.  Use this instead of m_copypacket() when
 * you need a writable copy of an mbuf chain.
 */
struct mbuf *
m_dup(struct mbuf *m, int how)
{
	struct mbuf **p, *top = NULL;
	int remain, moff, nsize;

	/* Sanity check */
	if (m == NULL)
		return (NULL);
	KASSERT((m->m_flags & M_PKTHDR) != 0, ("%s: !PKTHDR", __func__));

	/* While there's more data, get a new mbuf, tack it on, and fill it */
	remain = m->m_pkthdr.len;
	moff = 0;
	p = &top;
	while (remain > 0 || top == NULL) {	/* allow m->m_pkthdr.len == 0 */
		struct mbuf *n;

		/* Get the next new mbuf */
		MGET(n, how, m->m_type);
		if (n == NULL)
			goto nospace;
		if (top == NULL) {		/* first one, must be PKTHDR */
			if (!m_dup_pkthdr(n, m, how))
				goto nospace;
			nsize = MHLEN;
		} else				/* not the first one */
			nsize = MLEN;
		if (remain >= MINCLSIZE) {
			MCLGET(n, how);
			if ((n->m_flags & M_EXT) == 0) {
				(void)m_free(n);
				goto nospace;
			}
			nsize = MCLBYTES;
		}
		n->m_len = 0;

		/* Link it into the new chain */
		*p = n;
		p = &n->m_next;

		/* Copy data from original mbuf(s) into new mbuf */
		while (n->m_len < nsize && m != NULL) {
			int chunk = min(nsize - n->m_len, m->m_len - moff);

			bcopy(m->m_data + moff, n->m_data + n->m_len, chunk);
			moff += chunk;
			n->m_len += chunk;
			remain -= chunk;
			if (moff == m->m_len) {
				m = m->m_next;
				moff = 0;
			}
		}

		/* Check correct total mbuf length */
		KASSERT((remain > 0 && m != NULL) || (remain == 0 && m == NULL),
			("%s: bogus m_pkthdr.len", __func__));
	}
	return (top);

nospace:
	m_freem(top);
	MCFail++;
	return (NULL);
}
/*
 * Concatenate mbuf chain n to m.
 * Both chains must be of the same type (e.g. MT_DATA).
 * Any m_pkthdr is not updated.
 */
void
m_cat(struct mbuf *m, struct mbuf *n)
{
	while (m->m_next)
		m = m->m_next;
	while (n) {
		if (m->m_flags & M_EXT ||
		    m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
			/* just join the two chains */
			m->m_next = n;
			return;
		}
		/* splat the data from one into the other */
		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
		      (u_int)n->m_len);
		m->m_len += n->m_len;
		n = m_free(n);
	}
}
void
m_adj(struct mbuf *mp, int req_len)
{
	int len = req_len;
	struct mbuf *m;
	int count;

	if ((m = mp) == NULL)
		return;
	if (len >= 0) {
		/*
		 * Trim from head.
		 */
		while (m != NULL && len > 0) {
			if (m->m_len <= len) {
				len -= m->m_len;
				m->m_len = 0;
				m = m->m_next;
			} else {
				m->m_len -= len;
				m->m_data += len;
				len = 0;
			}
		}
		m = mp;
		if (mp->m_flags & M_PKTHDR)
			m->m_pkthdr.len -= (req_len - len);
	} else {
		/*
		 * Trim from tail.  Scan the mbuf chain,
		 * calculating its length and finding the last mbuf.
		 * If the adjustment only affects this mbuf, then just
		 * adjust and return.  Otherwise, rescan and truncate
		 * after the remaining size.
		 */
		len = -len;
		count = 0;
		for (;;) {
			count += m->m_len;
			if (m->m_next == (struct mbuf *)0)
				break;
			m = m->m_next;
		}
		if (m->m_len >= len) {
			m->m_len -= len;
			if (mp->m_flags & M_PKTHDR)
				mp->m_pkthdr.len -= len;
			return;
		}
		count -= len;
		if (count < 0)
			count = 0;
		/*
		 * Correct length for chain is "count".
		 * Find the mbuf with last data, adjust its length,
		 * and toss data from remaining mbufs on chain.
		 */
		m = mp;
		if (m->m_flags & M_PKTHDR)
			m->m_pkthdr.len = count;
		for (; m; m = m->m_next) {
			if (m->m_len >= count) {
				m->m_len = count;
				break;
			}
			count -= m->m_len;
		}
		while (m->m_next)
			(m = m->m_next) ->m_len = 0;
	}
}
/*
 * Rearrange an mbuf chain so that len bytes are contiguous
 * and in the data area of an mbuf (so that mtod will work for a structure
 * of size len).  Returns the resulting mbuf chain on success, frees it and
 * returns null on failure.  If there is room, it will add up to
 * max_protohdr-len extra bytes to the contiguous region in an attempt to
 * avoid being called next time.
 */
#define MPFail (mbstat.m_mpfail)

struct mbuf *
m_pullup(struct mbuf *n, int len)
{
	struct mbuf *m;
	int count;
	int space;

	/*
	 * If first mbuf has no cluster, and has room for len bytes
	 * without shifting current data, pullup into it,
	 * otherwise allocate a new mbuf to prepend to the chain.
	 */
	if ((n->m_flags & M_EXT) == 0 &&
	    n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
		if (n->m_len >= len)
			return (n);
		m = n;
		n = n->m_next;
		len -= m->m_len;
	} else {
		if (len > MHLEN)
			goto bad;
		MGET(m, MB_DONTWAIT, n->m_type);
		if (m == NULL)
			goto bad;
		m->m_len = 0;
		if (n->m_flags & M_PKTHDR)
			M_MOVE_PKTHDR(m, n);
	}
	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
	do {
		count = min(min(max(len, max_protohdr), space), n->m_len);
		bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
		      (unsigned)count);
		len -= count;
		m->m_len += count;
		n->m_len -= count;
		space -= count;
		if (n->m_len)
			n->m_data += count;
		else
			n = m_free(n);
	} while (len > 0 && n);
	if (len > 0) {
		(void)m_free(m);
		goto bad;
	}
	m->m_next = n;
	return (m);
bad:
	m_freem(n);
	MPFail++;
	return (NULL);
}
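
#if 0
/*
 * Editor's illustrative sketch (not part of the original file): the
 * canonical protocol-input use of m_pullup(), making sure mtod() may be
 * applied to a struct ip.  Assumes <netinet/ip.h>; "example_iphdr" is
 * hypothetical.
 */
static struct ip *
example_iphdr(struct mbuf **mp)
{
	struct mbuf *m = *mp;

	if (m->m_len < sizeof(struct ip) &&
	    (m = m_pullup(m, sizeof(struct ip))) == NULL) {
		*mp = NULL;		/* chain was freed by m_pullup() */
		return (NULL);
	}
	*mp = m;
	return (mtod(m, struct ip *));
}
#endif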
/*
 * Partition an mbuf chain in two pieces, returning the tail --
 * all but the first len0 bytes.  In case of failure, it returns NULL and
 * attempts to restore the chain to its original state.
 *
 * Note that the resulting mbufs might be read-only, because the new
 * mbuf can end up sharing an mbuf cluster with the original mbuf if
 * the "breaking point" happens to lie within a cluster mbuf. Use the
 * M_WRITABLE() macro to check for this case.
 */
struct mbuf *
m_split(struct mbuf *m0, int len0, int wait)
{
	struct mbuf *m, *n;
	unsigned len = len0, remain;

	for (m = m0; m && len > m->m_len; m = m->m_next)
		len -= m->m_len;
	if (m == NULL)
		return (NULL);
	remain = m->m_len - len;
	if (m0->m_flags & M_PKTHDR) {
		MGETHDR(n, wait, m0->m_type);
		if (n == NULL)
			return (NULL);
		n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
		m0->m_pkthdr.len = len0;
		if (m->m_flags & M_EXT)
			goto extpacket;
		if (remain > MHLEN) {
			/* m can't be the lead packet */
			MH_ALIGN(n, 0);
			n->m_next = m_split(m, len, wait);
			if (n->m_next == NULL) {
				(void)m_free(n);
				return (NULL);
			} else {
				n->m_len = 0;
				return (n);
			}
		} else
			MH_ALIGN(n, remain);
	} else if (remain == 0) {
		n = m->m_next;
		m->m_next = NULL;
		return (n);
	} else {
		MGET(n, wait, m->m_type);
		if (n == NULL)
			return (NULL);
		M_ALIGN(n, remain);
	}
extpacket:
	if (m->m_flags & M_EXT) {
		n->m_data = m->m_data + len;
		m_extref(m);
		n->m_ext = m->m_ext;
		n->m_flags |= m->m_flags & (M_EXT | M_EXT_OLD | M_EXT_CLUSTER);
	} else {
		bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain);
	}
	n->m_len = remain;
	m->m_len = len;
	n->m_next = m->m_next;
	m->m_next = NULL;
	return (n);
}
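
#if 0
/*
 * Editor's illustrative sketch (not part of the original file): splitting
 * one record off the front of a chain, as a message-boundary protocol
 * might.  The tail may share clusters with the head, so M_WRITABLE()
 * should be checked before modifying either piece.  Names are hypothetical.
 */
static struct mbuf *
example_split_record(struct mbuf *m, int reclen)
{
	struct mbuf *tail;

	tail = m_split(m, reclen, MB_DONTWAIT);
	if (tail == NULL)
		return (NULL);	/* m is unchanged on failure */
	return (tail);
}
#endif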
/*
 * Routine to copy from device local memory into mbufs.
 */
struct mbuf *
m_devget(char *buf, int totlen, int off0, struct ifnet *ifp,
	 void (*copy) (char *from, caddr_t to, u_int len))
{
	struct mbuf *m;
	struct mbuf *top = NULL, **mp = &top;
	int off = off0, len;
	char *cp;
	char *epkt;

	cp = buf;
	epkt = cp + totlen;
	if (off) {
		cp += off + 2 * sizeof(u_short);
		totlen -= 2 * sizeof(u_short);
	}
	MGETHDR(m, MB_DONTWAIT, MT_DATA);
	if (m == NULL)
		return (NULL);
	m->m_pkthdr.rcvif = ifp;
	m->m_pkthdr.len = totlen;
	m->m_len = MHLEN;

	while (totlen > 0) {
		if (top) {
			MGET(m, MB_DONTWAIT, MT_DATA);
			if (m == NULL) {
				m_freem(top);
				return (NULL);
			}
			m->m_len = MLEN;
		}
		len = min(totlen, epkt - cp);
		if (len >= MINCLSIZE) {
			MCLGET(m, MB_DONTWAIT);
			if (m->m_flags & M_EXT)
				m->m_len = len = min(len, MCLBYTES);
			else
				len = m->m_len;
		} else {
			/*
			 * Place initial small packet/header at end of mbuf.
			 */
			if (len < m->m_len) {
				if (top == NULL && len + max_linkhdr <= m->m_len)
					m->m_data += max_linkhdr;
				m->m_len = len;
			} else
				len = m->m_len;
		}
		if (copy)
			copy(cp, mtod(m, caddr_t), (unsigned)len);
		else
			bcopy(cp, mtod(m, caddr_t), (unsigned)len);
		cp += len;
		*mp = m;
		mp = &m->m_next;
		totlen -= len;
		if (cp == epkt)
			cp = buf;
	}
	return (top);
}
/*
 * Copy data from a buffer back into the indicated mbuf chain,
 * starting "off" bytes from the beginning, extending the mbuf
 * chain if necessary.
 */
void
m_copyback(struct mbuf *m0, int off, int len, caddr_t cp)
{
	int mlen;
	struct mbuf *m = m0, *n;
	int totlen = 0;

	if (m0 == NULL)
		return;
	while (off > (mlen = m->m_len)) {
		off -= mlen;
		totlen += mlen;
		if (m->m_next == NULL) {
			n = m_getclr(MB_DONTWAIT, m->m_type);
			if (n == NULL)
				goto out;
			n->m_len = min(MLEN, len + off);
			m->m_next = n;
		}
		m = m->m_next;
	}
	while (len > 0) {
		mlen = min(m->m_len - off, len);
		bcopy(cp, off + mtod(m, caddr_t), (unsigned)mlen);
		cp += mlen;
		len -= mlen;
		mlen += off;
		off = 0;
		totlen += mlen;
		if (len == 0)
			break;
		if (m->m_next == NULL) {
			n = m_get(MB_DONTWAIT, m->m_type);
			if (n == NULL)
				break;
			n->m_len = min(MLEN, len);
			m->m_next = n;
		}
		m = m->m_next;
	}
out:	if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen))
		m->m_pkthdr.len = totlen;
}
void
m_print(const struct mbuf *m)
{
	int len;
	const struct mbuf *m2;

	len = m->m_pkthdr.len;
	m2 = m;
	while (len) {
		printf("%p %*D\n", m2, m2->m_len, (u_char *)m2->m_data, "-");
		len -= m2->m_len;
		m2 = m2->m_next;
	}
}
1709 * "Move" mbuf pkthdr from "from" to "to".
1710 * "from" must have M_PKTHDR set, and "to" must be empty.
1713 m_move_pkthdr(struct mbuf
*to
, struct mbuf
*from
)
1715 KASSERT((to
->m_flags
& M_EXT
) == 0, ("m_move_pkthdr: to has cluster"));
1717 to
->m_flags
= from
->m_flags
& M_COPYFLAGS
;
1718 to
->m_data
= to
->m_pktdat
;
1719 to
->m_pkthdr
= from
->m_pkthdr
; /* especially tags */
1720 SLIST_INIT(&from
->m_pkthdr
.tags
); /* purge tags from src */
1721 from
->m_flags
&= ~M_PKTHDR
;
1725 * Duplicate "from"'s mbuf pkthdr in "to".
1726 * "from" must have M_PKTHDR set, and "to" must be empty.
1727 * In particular, this does a deep copy of the packet tags.
1730 m_dup_pkthdr(struct mbuf
*to
, const struct mbuf
*from
, int how
)
1732 to
->m_flags
= (from
->m_flags
& M_COPYFLAGS
) | (to
->m_flags
& M_EXT
);
1733 if ((to
->m_flags
& M_EXT
) == 0)
1734 to
->m_data
= to
->m_pktdat
;
1735 to
->m_pkthdr
= from
->m_pkthdr
;
1736 SLIST_INIT(&to
->m_pkthdr
.tags
);
1737 return (m_tag_copy_chain(to
, from
, how
));
/*
 * Defragment a mbuf chain, returning the shortest possible
 * chain of mbufs and clusters.  If allocation fails and
 * this cannot be completed, NULL will be returned, but
 * the passed in chain will be unchanged.  Upon success,
 * the original chain will be freed, and the new chain
 * will be returned.
 *
 * If a non-packet header is passed in, the original
 * mbuf (chain?) will be returned unharmed.
 *
 * m_defrag_nofree doesn't free the passed in mbuf.
 */
struct mbuf *
m_defrag(struct mbuf *m0, int how)
{
	struct mbuf *m_new;

	if ((m_new = m_defrag_nofree(m0, how)) == NULL)
		return (NULL);
	if (m_new != m0)
		m_freem(m0);
	return (m_new);
}

struct mbuf *
m_defrag_nofree(struct mbuf *m0, int how)
{
	struct mbuf *m_new = NULL, *m_final = NULL;
	int progress = 0, length;

	if (!(m0->m_flags & M_PKTHDR))
		return (m0);

#ifdef MBUF_STRESS_TEST
	if (m_defragrandomfailures) {
		int temp = arc4random() & 0xff;

		if (temp == 0xba)
			goto nospace;
	}
#endif

	if (m0->m_pkthdr.len > MHLEN)
		m_final = m_getcl(how, MT_DATA, M_PKTHDR);
	else
		m_final = m_gethdr(how, MT_DATA);

	if (m_final == NULL)
		goto nospace;

	if (m_dup_pkthdr(m_final, m0, how) == NULL)
		goto nospace;

	m_new = m_final;

	while (progress < m0->m_pkthdr.len) {
		length = m0->m_pkthdr.len - progress;
		if (length > MCLBYTES)
			length = MCLBYTES;

		if (m_new == NULL) {
			if (length > MLEN)
				m_new = m_getcl(how, MT_DATA, 0);
			else
				m_new = m_get(how, MT_DATA);
			if (m_new == NULL)
				goto nospace;
		}

		m_copydata(m0, progress, length, mtod(m_new, caddr_t));
		progress += length;
		m_new->m_len = length;
		if (m_new != m_final)
			m_cat(m_final, m_new);
		m_new = NULL;
	}
	if (m0->m_next == NULL)
		m_defraguseless++;
	m_defragpackets++;
	m_defragbytes += m_final->m_pkthdr.len;
	return (m_final);
nospace:
	m_defragfailure++;
	if (m_new)
		m_free(m_new);
	if (m_final)
		m_freem(m_final);
	return (NULL);
}
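
#if 0
/*
 * Editor's illustrative sketch (not part of the original file): the usual
 * transmit-path use of m_defrag() by drivers with a limited number of DMA
 * segments.  "EXAMPLE_MAXSEGS" and the function name are hypothetical.
 */
#define EXAMPLE_MAXSEGS	8

static struct mbuf *
example_tx_prepare(struct mbuf *m, int nsegs)
{
	if (nsegs <= EXAMPLE_MAXSEGS)
		return (m);
	/* m_defrag() frees the original chain on success. */
	return (m_defrag(m, MB_DONTWAIT));	/* NULL if it failed */
}
#endif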
/*
 * Move data from uio into mbufs.
 * A length of zero means copy the whole uio.
 */
struct mbuf *
m_uiomove(struct uio *uio, int wait, int len0)
{
	struct mbuf *head;		/* result mbuf chain */
	struct mbuf *m;			/* current working mbuf */
	struct mbuf **mp;
	int resid, datalen, error;

	resid = (len0 == 0) ? uio->uio_resid : min(len0, uio->uio_resid);

	head = NULL;
	mp = &head;
	do {
		if (resid > MHLEN) {
			m = m_getcl(wait, MT_DATA, head == NULL ? M_PKTHDR : 0);
			if (m == NULL)
				goto failed;
			if (m->m_flags & M_PKTHDR)
				m->m_pkthdr.len = 0;
		} else {
			if (head == NULL) {
				MGETHDR(m, wait, MT_DATA);
				if (m == NULL)
					goto failed;
				m->m_pkthdr.len = 0;
				/* Leave room for protocol headers. */
				MH_ALIGN(m, resid);
			} else {
				MGET(m, wait, MT_DATA);
				if (m == NULL)
					goto failed;
			}
		}
		datalen = min(MCLBYTES, resid);
		error = uiomove(mtod(m, caddr_t), datalen, uio);
		if (error) {
			m_free(m);
			goto failed;
		}
		m->m_len = datalen;
		*mp = m;
		mp = &m->m_next;
		head->m_pkthdr.len += datalen;
		resid -= datalen;
	} while (resid > 0);

	return (head);

failed:
	if (head)
		m_freem(head);
	return (NULL);
}
/*
 * Return the number of bytes in an mbuf chain.
 * If lastm is not NULL, also return the last mbuf.
 */
u_int
m_lengthm(struct mbuf *m, struct mbuf **lastm)
{
	u_int len = 0;
	struct mbuf *prev = m;

	while (m) {
		len += m->m_len;
		prev = m;
		m = m->m_next;
	}
	if (lastm != NULL)
		*lastm = prev;
	return (len);
}

/*
 * Like m_lengthm(), except also keep track of mbuf usage.
 */
u_int
m_countm(struct mbuf *m, struct mbuf **lastm, u_int *pmbcnt)
{
	u_int len = 0, mbcnt = 0;
	struct mbuf *prev = m;

	while (m) {
		len += m->m_len;
		mbcnt += MSIZE;
		if (m->m_flags & M_EXT)
			mbcnt += m->m_ext.ext_size;
		prev = m;
		m = m->m_next;
	}
	if (lastm != NULL)
		*lastm = prev;
	*pmbcnt = mbcnt;
	return (len);
}