2 * Copyright (c) 2005 Jeffrey M. Hsu. All rights reserved.
3 * Copyright (c) 1982, 1986, 1988, 1990, 1993
4 * The Regents of the University of California. All rights reserved.
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * 3. Neither the name of the University nor the names of its contributors
15 * may be used to endorse or promote products derived from this software
16 * without specific prior written permission.
18 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * @(#)uipc_socket2.c 8.1 (Berkeley) 6/10/93
31 * $FreeBSD: src/sys/kern/uipc_socket2.c,v 1.55.2.17 2002/08/31 19:04:55 dwmalone Exp $
34 #include "opt_param.h"
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/domain.h>
38 #include <sys/file.h> /* for maxfiles */
39 #include <sys/kernel.h>
41 #include <sys/malloc.h>
43 #include <sys/protosw.h>
44 #include <sys/resourcevar.h>
46 #include <sys/socket.h>
47 #include <sys/socketvar.h>
49 #include <sys/thread2.h>
50 #include <sys/msgport2.h>
53 * Routines to add and remove data from an mbuf queue.
55 * The routines sbappend() or sbappendrecord() are normally called to
56 * append new mbufs to a socket buffer. sbappendrecord() differs from
57 * sbappend() in that data supplied is treated as the beginning of a new
58 * record. sbappend() only begins a new record if the last mbuf in the
59 * sockbuf is marked M_EOR.
61 * To place a sender's address, optional access rights, and data in a
62 * socket receive buffer, sbappendaddr() or sbappendcontrol() should be
63 * used. These functions also begin a new record.
65 * Reliable protocols may use the socket send buffer to hold data
66 * awaiting acknowledgement. Data is normally copied from a socket
67 * send buffer in a protocol with m_copy for output to a peer,
68 * and then removed from the socket buffer with sbdrop()
69 * or sbdroprecord() when the data is acknowledged by the peer.
73 * Append mbuf chain m to the last record in the socket buffer sb.
74 * The additional space associated with the mbuf chain is recorded in sb.
75 * Empty mbufs are discarded and mbufs are compacted where possible.
77 * If M_EOR is set in the first or last mbuf of the last record, the
78 * mbuf chain is appended as a new record. M_EOR is usually just set
79 * in the last mbuf of the last record's mbuf chain (see sbcompress()),
80 * but this may be changed in the future since there is no real need
81 * to propagate the flag any more.
84 sbappend(struct sockbuf
*sb
, struct mbuf
*m
)
91 n
= sb
->sb_lastrecord
;
93 if (n
->m_flags
& M_EOR
) {
94 sbappendrecord(sb
, m
);
100 if (n
->m_flags
& M_EOR
) {
101 sbappendrecord(sb
, m
);
105 sbcompress(sb
, m
, n
);
110 * sbappendstream() is an optimized form of sbappend() for protocols
111 * such as TCP that only have one record in the socket buffer, are
112 * not PR_ATOMIC, nor allow MT_CONTROL data. A protocol that uses
113 * sbappendstream() must use sbappendstream() exclusively.
116 sbappendstream(struct sockbuf
*sb
, struct mbuf
*m
)
119 KKASSERT(m
->m_nextpkt
== NULL
);
120 sbcompress(sb
, m
, sb
->sb_lastmbuf
);
126 _sbcheck(struct sockbuf
*sb
)
129 struct mbuf
*n
= NULL
;
130 u_long len
= 0, mbcnt
= 0;
132 for (m
= sb
->sb_mb
; m
; m
= n
) {
134 if (n
== NULL
&& sb
->sb_lastrecord
!= m
) {
135 kprintf("sockbuf %p mismatched lastrecord %p vs %p\n", sb
, sb
->sb_lastrecord
, m
);
139 for (; m
; m
= m
->m_next
) {
142 if (m
->m_flags
& M_EXT
) /*XXX*/ /* pretty sure this is bogus */
143 mbcnt
+= m
->m_ext
.ext_size
;
144 if (n
== NULL
&& m
->m_next
== NULL
) {
145 if (sb
->sb_lastmbuf
!= m
) {
146 kprintf("sockbuf %p mismatched lastmbuf %p vs %p\n", sb
, sb
->sb_lastmbuf
, m
);
152 if (sb
->sb_mb
== NULL
) {
153 if (sb
->sb_lastrecord
!= NULL
) {
154 kprintf("sockbuf %p is empty, lastrecord not NULL: %p\n",
155 sb
, sb
->sb_lastrecord
);
158 if (sb
->sb_lastmbuf
!= NULL
) {
159 kprintf("sockbuf %p is empty, lastmbuf not NULL: %p\n",
160 sb
, sb
->sb_lastmbuf
);
164 if (len
!= sb
->sb_cc
|| mbcnt
!= sb
->sb_mbcnt
) {
165 kprintf("sockbuf %p cc %ld != %ld || mbcnt %ld != %ld\n",
166 sb
, len
, sb
->sb_cc
, mbcnt
, sb
->sb_mbcnt
);
174 * Same as sbappend(), except the mbuf chain begins a new record.
177 sbappendrecord(struct sockbuf
*sb
, struct mbuf
*m0
)
179 struct mbuf
*firstmbuf
;
180 struct mbuf
*secondmbuf
;
189 * Break the first mbuf off from the rest of the mbuf chain.
192 secondmbuf
= m0
->m_next
;
196 * Insert the first mbuf of the m0 mbuf chain as the last record of
197 * the sockbuf. Note this permits zero length records! Keep the
198 * sockbuf state consistent.
200 if (sb
->sb_mb
== NULL
)
201 sb
->sb_mb
= firstmbuf
;
203 sb
->sb_lastrecord
->m_nextpkt
= firstmbuf
;
204 sb
->sb_lastrecord
= firstmbuf
; /* update hint for new last record */
205 sb
->sb_lastmbuf
= firstmbuf
; /* update hint for new last mbuf */
208 * propagate the EOR flag so sbcompress() can pick it up
210 if ((firstmbuf
->m_flags
& M_EOR
) && (secondmbuf
!= NULL
)) {
211 firstmbuf
->m_flags
&= ~M_EOR
;
212 secondmbuf
->m_flags
|= M_EOR
;
216 * The succeeding call to sbcompress() omits accounting for
217 * the first mbuf, so do it here.
219 sballoc(sb
, firstmbuf
);
221 /* Compact the rest of the mbuf chain in after the first mbuf. */
222 sbcompress(sb
, secondmbuf
, firstmbuf
);
226 * Append address and data, and optionally, control (ancillary) data
227 * to the receive queue of a socket. If present,
228 * m0 must include a packet header with total length.
229 * Returns 0 if insufficient mbufs.
232 sbappendaddr(struct sockbuf
*sb
, const struct sockaddr
*asa
, struct mbuf
*m0
,
233 struct mbuf
*control
)
239 mbuftrackid(control
, 20);
240 if (m0
&& (m0
->m_flags
& M_PKTHDR
) == 0)
241 panic("sbappendaddr");
244 for (n
= control
; n
; n
= n
->m_next
) {
245 if (n
->m_next
== NULL
) /* keep pointer to last control buf */
248 if (asa
->sa_len
> MLEN
)
250 MGET(m
, M_NOWAIT
, MT_SONAME
);
253 KKASSERT(m
->m_nextpkt
== NULL
);
254 m
->m_len
= asa
->sa_len
;
255 bcopy(asa
, mtod(m
, caddr_t
), asa
->sa_len
);
257 n
->m_next
= m0
; /* concatenate data to control */
261 for (n
= m
; n
; n
= n
->m_next
)
264 if (sb
->sb_mb
== NULL
)
267 sb
->sb_lastrecord
->m_nextpkt
= m
;
268 sb
->sb_lastrecord
= m
;
271 * Propagate M_EOR to the last mbuf and calculate sb_lastmbuf
272 * so sbappend() can find it.
276 m
->m_flags
&= ~M_EOR
;
280 m
->m_flags
|= eor
& M_EOR
;
287 * Append control information followed by data. Both the control and data
291 sbappendcontrol(struct sockbuf
*sb
, struct mbuf
*m0
, struct mbuf
*control
)
294 u_int length
, cmbcnt
, m0mbcnt
;
297 KASSERT(control
!= NULL
, ("sbappendcontrol"));
298 KKASSERT(control
->m_nextpkt
== NULL
);
302 mbuftrackid(control
, 22);
304 length
= m_countm(control
, &n
, &cmbcnt
) + m_countm(m0
, NULL
, &m0mbcnt
);
306 KKASSERT(m0
!= NULL
);
308 n
->m_next
= m0
; /* concatenate data to control */
310 if (sb
->sb_mb
== NULL
)
313 sb
->sb_lastrecord
->m_nextpkt
= control
;
314 sb
->sb_lastrecord
= control
;
317 * Propagate M_EOR to the last mbuf and calculate sb_lastmbuf
318 * so sbappend() can find it.
322 m0
->m_flags
&= ~M_EOR
;
326 m0
->m_flags
|= eor
& M_EOR
;
327 sb
->sb_lastmbuf
= m0
;
330 sb
->sb_mbcnt
+= cmbcnt
+ m0mbcnt
;
336 * Compress mbuf chain m into the socket buffer sb following mbuf tailm.
337 * If tailm is null, the buffer is presumed empty. Also, as a side-effect,
338 * increment the sockbuf counts for each mbuf in the chain.
341 sbcompress(struct sockbuf
*sb
, struct mbuf
*m
, struct mbuf
*tailm
)
344 struct mbuf
*free_chain
= NULL
;
352 eor
|= m
->m_flags
& M_EOR
;
354 * Disregard empty mbufs as long as we don't encounter
355 * an end-of-record or there is a trailing mbuf of
356 * the same type to propagate the EOR flag to.
358 * Defer the m_free() call because it can block and break
359 * the atomicity of the sockbuf.
363 (((o
= m
->m_next
) || (o
= tailm
)) &&
364 o
->m_type
== m
->m_type
))) {
366 m
->m_next
= free_chain
;
373 * See if we can coalesce with preceding mbuf. Never try
374 * to coalesce a mbuf representing an end-of-record or
375 * a mbuf locked by userland for reading.
377 if (tailm
&& !(tailm
->m_flags
& (M_EOR
| M_SOLOCKED
)) &&
379 m
->m_len
<= MCLBYTES
/ 4 && /* XXX: Don't copy too much */
380 m
->m_len
<= M_TRAILINGSPACE(tailm
) &&
381 tailm
->m_type
== m
->m_type
) {
384 bcopy(mtod(m
, caddr_t
),
385 mtod(tailm
, caddr_t
) + tailm
->m_len
,
387 tailm
->m_len
+= m
->m_len
;
389 sb
->sb_cc
+= m
->m_len
; /* update sb counter */
392 * Fix the wrongly updated mbcnt_prealloc
395 if (m
->m_flags
& M_EXT
)
396 mbcnt_sz
+= m
->m_ext
.ext_size
;
397 atomic_subtract_long(&sb
->sb_mbcnt_prealloc
, mbcnt_sz
);
400 m
->m_next
= free_chain
;
406 /* Insert whole mbuf. */
408 KASSERT(sb
->sb_mb
== NULL
,
409 ("sbcompress: sb_mb not NULL"));
410 sb
->sb_mb
= m
; /* only mbuf in sockbuf */
411 sb
->sb_lastrecord
= m
; /* new last record */
413 tailm
->m_next
= m
; /* tack m on following tailm */
415 sb
->sb_lastmbuf
= m
; /* update last mbuf hint */
417 tailm
= m
; /* just inserted mbuf becomes the new tail */
418 m
= m
->m_next
; /* advance to next mbuf */
419 tailm
->m_next
= NULL
; /* split inserted mbuf off from chain */
421 /* update sb counters for just added mbuf */
424 /* clear EOR on intermediate mbufs */
425 tailm
->m_flags
&= ~M_EOR
;
429 * Propagate EOR to the last mbuf
433 tailm
->m_flags
|= eor
;
435 kprintf("semi-panic: sbcompress");
439 * Clean up any deferred frees.
442 free_chain
= m_free(free_chain
);
448 * Free all mbufs in a sockbuf.
449 * Check that all resources are reclaimed.
452 sbflush(struct sockbuf
*sb
)
454 while (sb
->sb_mbcnt
) {
456 * Don't call sbdrop(sb, 0) if the leading mbuf is non-empty:
457 * we would loop forever. Panic instead.
459 if (!sb
->sb_cc
&& (sb
->sb_mb
== NULL
|| sb
->sb_mb
->m_len
))
461 sbdrop(sb
, (int)sb
->sb_cc
);
463 KASSERT(!(sb
->sb_cc
|| sb
->sb_mb
|| sb
->sb_mbcnt
|| sb
->sb_lastmbuf
),
464 ("sbflush: cc %ld || mb %p || mbcnt %ld || lastmbuf %p",
465 sb
->sb_cc
, sb
->sb_mb
, sb
->sb_mbcnt
, sb
->sb_lastmbuf
));
469 * Drop data from (the front of) a sockbuf. If the current record is
470 * exhausted this routine will move onto the next one and continue dropping
474 sbdrop(struct sockbuf
*sb
, int len
)
477 struct mbuf
*free_chain
= NULL
;
483 while (m
&& len
> 0) {
484 if (m
->m_len
> len
) {
488 atomic_subtract_long(&sb
->sb_cc_prealloc
, len
);
492 m
= sbunlinkmbuf(sb
, m
, &free_chain
);
493 if (m
== NULL
&& len
)
498 * Remove any trailing 0-length mbufs in the current record. If
499 * the last record for which data was removed is now empty, m will be
502 while (m
&& m
->m_len
== 0) {
503 m
= sbunlinkmbuf(sb
, m
, &free_chain
);
512 * Drop a record off the front of a sockbuf and move the next record
515 * Must be called while holding a critical section.
518 sbdroprecord(struct sockbuf
*sb
)
526 if ((sb
->sb_mb
= m
->m_nextpkt
) == NULL
) {
527 sb
->sb_lastrecord
= NULL
;
528 sb
->sb_lastmbuf
= NULL
;
531 for (n
= m
; n
; n
= n
->m_next
)
539 * Drop the first mbuf off the sockbuf and move the next mbuf to the front.
540 * Currently only the head mbuf of the sockbuf may be dropped this way.
542 * The next mbuf in the same record as the mbuf being removed is returned
543 * or NULL if the record is exhausted. Note that other records may remain
544 * in the sockbuf when NULL is returned.
546 * Must be called while holding a critical section.
549 sbunlinkmbuf(struct sockbuf
*sb
, struct mbuf
*m
, struct mbuf
**free_chain
)
553 KKASSERT(sb
->sb_mb
== m
);
558 if (sb
->sb_lastrecord
== m
)
559 sb
->sb_lastrecord
= n
;
560 KKASSERT(sb
->sb_lastmbuf
!= m
);
561 n
->m_nextpkt
= m
->m_nextpkt
;
563 sb
->sb_mb
= m
->m_nextpkt
;
564 if (sb
->sb_lastrecord
== m
) {
565 KKASSERT(sb
->sb_mb
== NULL
);
566 sb
->sb_lastrecord
= NULL
;
568 if (sb
->sb_mb
== NULL
)
569 sb
->sb_lastmbuf
= NULL
;
573 m
->m_next
= *free_chain
;
582 * Create a "control" mbuf containing the specified data
583 * with the specified type for presentation on a socket buffer.
586 sbcreatecontrol(caddr_t p
, int size
, int type
, int level
)
591 if (CMSG_SPACE((u_int
)size
) > MCLBYTES
)
593 m
= m_getl(CMSG_SPACE((u_int
)size
), M_NOWAIT
, MT_CONTROL
, 0, NULL
);
596 m
->m_len
= CMSG_SPACE(size
);
597 cp
= mtod(m
, struct cmsghdr
*);
599 memcpy(CMSG_DATA(cp
), p
, size
);
600 cp
->cmsg_len
= CMSG_LEN(size
);
601 cp
->cmsg_level
= level
;
602 cp
->cmsg_type
= type
;