Add tunable for each_burst.
[dragonfly.git] / sys / kern / uipc_sockbuf.c
blob490872876c66d1f97a9ef33b34e75db6b520b49d
1 /*
2 * Copyright (c) 2005 Jeffrey M. Hsu. All rights reserved.
3 * Copyright (c) 1982, 1986, 1988, 1990, 1993
4 * The Regents of the University of California. All rights reserved.
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * 3. All advertising materials mentioning features or use of this software
15 * must display the following acknowledgement:
16 * This product includes software developed by the University of
17 * California, Berkeley and its contributors.
18 * 4. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
34 * @(#)uipc_socket2.c 8.1 (Berkeley) 6/10/93
35 * $FreeBSD: src/sys/kern/uipc_socket2.c,v 1.55.2.17 2002/08/31 19:04:55 dwmalone Exp $
36 * $DragonFly: src/sys/kern/uipc_sockbuf.c,v 1.3 2007/08/09 01:10:04 dillon Exp $
39 #include "opt_param.h"
40 #include <sys/param.h>
41 #include <sys/systm.h>
42 #include <sys/domain.h>
43 #include <sys/file.h> /* for maxfiles */
44 #include <sys/kernel.h>
45 #include <sys/proc.h>
46 #include <sys/malloc.h>
47 #include <sys/mbuf.h>
48 #include <sys/protosw.h>
49 #include <sys/resourcevar.h>
50 #include <sys/stat.h>
51 #include <sys/socket.h>
52 #include <sys/socketvar.h>
54 #include <sys/thread2.h>
55 #include <sys/msgport2.h>
58 * Routines to add and remove data from an mbuf queue.
60 * The routines sbappend() or sbappendrecord() are normally called to
61 * append new mbufs to a socket buffer. sbappendrecord() differs from
62 * sbappend() in that data supplied is treated as the beginning of a new
63 * record. sbappend() only begins a new record if the last mbuf in the
64 * sockbuf is marked M_EOR.
66 * To place a sender's address, optional access rights, and data in a
67 * socket receive buffer, sbappendaddr() or sbappendcontrol() should be
68 * used. These functions also begin a new record.
70 * Reliable protocols may use the socket send buffer to hold data
71 * awaiting acknowledgement. Data is normally copied from a socket
72 * send buffer in a protocol with m_copy for output to a peer,
73 * and is then removed from the socket buffer with sbdrop()
74 * or sbdroprecord() when the data is acknowledged by the peer.
78 * Append mbuf chain m to the last record in the socket buffer sb.
79 * The additional space associated the mbuf chain is recorded in sb.
80 * Empty mbufs are discarded and mbufs are compacted where possible.
82 * If M_EOR is set in the first or last mbuf of the last record, the
83 * mbuf chain is appended as a new record. M_EOR is usually just set
84 * in the last mbuf of the last record's mbuf chain (see sbcompress()),
85 * but this may be changed in the future since there is no real need
86 * to propogate the flag any more.
88 void
89 sbappend(struct sockbuf *sb, struct mbuf *m)
91 struct mbuf *n;
93 mbuftrackid(m, 16);
95 if (m) {
96 n = sb->sb_lastrecord;
97 if (n) {
98 if (n->m_flags & M_EOR) {
99 sbappendrecord(sb, m);
100 return;
103 n = sb->sb_lastmbuf;
104 if (n) {
105 if (n->m_flags & M_EOR) {
106 sbappendrecord(sb, m);
107 return;
110 sbcompress(sb, m, n);
115 * sbappendstream() is an optimized form of sbappend() for protocols
116 * such as TCP that only have one record in the socket buffer, are
117 * not PR_ATOMIC, nor allow MT_CONTROL data. A protocol that uses
118 * sbappendstream() must use sbappendstream() exclusively.
120 void
121 sbappendstream(struct sockbuf *sb, struct mbuf *m)
123 mbuftrackid(m, 17);
124 KKASSERT(m->m_nextpkt == NULL);
125 sbcompress(sb, m, sb->sb_lastmbuf);
#ifdef SOCKBUF_DEBUG

/*
 * Debugging aid: walk every record and every mbuf of the sockbuf and
 * panic if the cached hints (sb_lastrecord, sb_lastmbuf) or the
 * accounting totals (sb_cc, sb_mbcnt) disagree with what is actually
 * linked into the buffer.
 */
void
_sbcheck(struct sockbuf *sb)
{
	struct mbuf *rec;		/* head mbuf of the current record */
	struct mbuf *nextrec = NULL;	/* head mbuf of the following record */
	struct mbuf *mb;		/* cursor within the current record */
	u_long len = 0, mbcnt = 0;

	for (rec = sb->sb_mb; rec != NULL; rec = nextrec) {
		nextrec = rec->m_nextpkt;

		/* The final record must be the one sb_lastrecord points at. */
		if (nextrec == NULL && sb->sb_lastrecord != rec) {
			kprintf("sockbuf %p mismatched lastrecord %p vs %p\n", sb, sb->sb_lastrecord, rec);
			panic("sbcheck1");
		}

		for (mb = rec; mb != NULL; mb = mb->m_next) {
			len += mb->m_len;
			mbcnt += MSIZE;
			if (mb->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */
				mbcnt += mb->m_ext.ext_size;

			/* The final mbuf of the final record must be sb_lastmbuf. */
			if (nextrec == NULL && mb->m_next == NULL &&
			    sb->sb_lastmbuf != mb) {
				kprintf("sockbuf %p mismatched lastmbuf %p vs %p\n", sb, sb->sb_lastmbuf, mb);
				panic("sbcheck2");
			}
		}
	}

	/* An empty sockbuf must not retain stale hints. */
	if (sb->sb_mb == NULL) {
		if (sb->sb_lastrecord != NULL) {
			kprintf("sockbuf %p is empty, lastrecord not NULL: %p\n",
				sb, sb->sb_lastrecord);
			panic("sbcheck3");
		}
		if (sb->sb_lastmbuf != NULL) {
			kprintf("sockbuf %p is empty, lastmbuf not NULL: %p\n",
				sb, sb->sb_lastmbuf);
			panic("sbcheck4");
		}
	}

	/* The recomputed totals must match the cached counters. */
	if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
		kprintf("sockbuf %p cc %ld != %ld || mbcnt %ld != %ld\n",
			sb, len, sb->sb_cc, mbcnt, sb->sb_mbcnt);
		panic("sbcheck5");
	}
}

#endif
179 * Same as sbappend(), except the mbuf chain begins a new record.
181 void
182 sbappendrecord(struct sockbuf *sb, struct mbuf *m0)
184 struct mbuf *firstmbuf;
185 struct mbuf *secondmbuf;
187 if (m0 == NULL)
188 return;
189 mbuftrackid(m0, 18);
191 sbcheck(sb);
194 * Break the first mbuf off from the rest of the mbuf chain.
196 firstmbuf = m0;
197 secondmbuf = m0->m_next;
198 m0->m_next = NULL;
201 * Insert the first mbuf of the m0 mbuf chain as the last record of
202 * the sockbuf. Note this permits zero length records! Keep the
203 * sockbuf state consistent.
205 if (sb->sb_mb == NULL)
206 sb->sb_mb = firstmbuf;
207 else
208 sb->sb_lastrecord->m_nextpkt = firstmbuf;
209 sb->sb_lastrecord = firstmbuf; /* update hint for new last record */
210 sb->sb_lastmbuf = firstmbuf; /* update hint for new last mbuf */
213 * propagate the EOR flag so sbcompress() can pick it up
215 if ((firstmbuf->m_flags & M_EOR) && (secondmbuf != NULL)) {
216 firstmbuf->m_flags &= ~M_EOR;
217 secondmbuf->m_flags |= M_EOR;
221 * The succeeding call to sbcompress() omits accounting for
222 * the first mbuf, so do it here.
224 sballoc(sb, firstmbuf);
226 /* Compact the rest of the mbuf chain in after the first mbuf. */
227 sbcompress(sb, secondmbuf, firstmbuf);
231 * Append address and data, and optionally, control (ancillary) data
232 * to the receive queue of a socket. If present,
233 * m0 must include a packet header with total length.
234 * Returns 0 if insufficient mbufs.
237 sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0,
238 struct mbuf *control)
240 struct mbuf *m, *n;
241 int eor;
243 mbuftrackid(m0, 19);
244 mbuftrackid(control, 20);
245 if (m0 && (m0->m_flags & M_PKTHDR) == 0)
246 panic("sbappendaddr");
247 sbcheck(sb);
249 for (n = control; n; n = n->m_next) {
250 if (n->m_next == NULL) /* keep pointer to last control buf */
251 break;
253 if (asa->sa_len > MLEN)
254 return (0);
255 MGET(m, MB_DONTWAIT, MT_SONAME);
256 if (m == NULL)
257 return (0);
258 KKASSERT(m->m_nextpkt == NULL);
259 m->m_len = asa->sa_len;
260 bcopy(asa, mtod(m, caddr_t), asa->sa_len);
261 if (n)
262 n->m_next = m0; /* concatenate data to control */
263 else
264 control = m0;
265 m->m_next = control;
266 for (n = m; n; n = n->m_next)
267 sballoc(sb, n);
269 if (sb->sb_mb == NULL)
270 sb->sb_mb = m;
271 else
272 sb->sb_lastrecord->m_nextpkt = m;
273 sb->sb_lastrecord = m;
276 * Propogate M_EOR to the last mbuf and calculate sb_lastmbuf
277 * so sbappend() can find it.
279 eor = m->m_flags;
280 while (m->m_next) {
281 m->m_flags &= ~M_EOR;
282 m = m->m_next;
283 eor |= m->m_flags;
285 m->m_flags |= eor & M_EOR;
286 sb->sb_lastmbuf = m;
288 return (1);
292 * Append control information followed by data. Both the control and data
293 * must be non-null.
296 sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control)
298 struct mbuf *n;
299 u_int length, cmbcnt, m0mbcnt;
300 int eor;
302 KASSERT(control != NULL, ("sbappendcontrol"));
303 KKASSERT(control->m_nextpkt == NULL);
304 sbcheck(sb);
306 mbuftrackid(m0, 21);
307 mbuftrackid(control, 22);
309 length = m_countm(control, &n, &cmbcnt) + m_countm(m0, NULL, &m0mbcnt);
311 KKASSERT(m0 != NULL);
313 n->m_next = m0; /* concatenate data to control */
315 if (sb->sb_mb == NULL)
316 sb->sb_mb = control;
317 else
318 sb->sb_lastrecord->m_nextpkt = control;
319 sb->sb_lastrecord = control;
322 * Propogate M_EOR to the last mbuf and calculate sb_lastmbuf
323 * so sbappend() can find it.
325 eor = m0->m_flags;
326 while (m0->m_next) {
327 m0->m_flags &= ~M_EOR;
328 m0 = m0->m_next;
329 eor |= m0->m_flags;
331 m0->m_flags |= eor & M_EOR;
332 sb->sb_lastmbuf = m0;
334 sb->sb_cc += length;
335 sb->sb_mbcnt += cmbcnt + m0mbcnt;
337 return (1);
341 * Compress mbuf chain m into the socket buffer sb following mbuf tailm.
342 * If tailm is null, the buffer is presumed empty. Also, as a side-effect,
343 * increment the sockbuf counts for each mbuf in the chain.
345 void
346 sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *tailm)
348 int eor = 0;
349 struct mbuf *free_chain = NULL;
351 mbuftrackid(m, 23);
353 sbcheck(sb);
354 while (m) {
355 struct mbuf *o;
357 eor |= m->m_flags & M_EOR;
359 * Disregard empty mbufs as long as we don't encounter
360 * an end-of-record or there is a trailing mbuf of
361 * the same type to propagate the EOR flag to.
363 * Defer the m_free() call because it can block and break
364 * the atomicy of the sockbuf.
366 if (m->m_len == 0 &&
367 (eor == 0 ||
368 (((o = m->m_next) || (o = tailm)) &&
369 o->m_type == m->m_type))) {
370 o = m->m_next;
371 m->m_next = free_chain;
372 free_chain = m;
373 m = o;
374 continue;
377 /* See if we can coalesce with preceding mbuf. */
378 if (tailm && !(tailm->m_flags & M_EOR) && M_WRITABLE(tailm) &&
379 m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */
380 m->m_len <= M_TRAILINGSPACE(tailm) &&
381 tailm->m_type == m->m_type) {
382 bcopy(mtod(m, caddr_t),
383 mtod(tailm, caddr_t) + tailm->m_len,
384 (unsigned)m->m_len);
385 tailm->m_len += m->m_len;
386 sb->sb_cc += m->m_len; /* update sb counter */
387 o = m->m_next;
388 m->m_next = free_chain;
389 free_chain = m;
390 m = o;
391 continue;
394 /* Insert whole mbuf. */
395 if (tailm == NULL) {
396 KASSERT(sb->sb_mb == NULL,
397 ("sbcompress: sb_mb not NULL"));
398 sb->sb_mb = m; /* only mbuf in sockbuf */
399 sb->sb_lastrecord = m; /* new last record */
400 } else {
401 tailm->m_next = m; /* tack m on following tailm */
403 sb->sb_lastmbuf = m; /* update last mbuf hint */
405 tailm = m; /* just inserted mbuf becomes the new tail */
406 m = m->m_next; /* advance to next mbuf */
407 tailm->m_next = NULL; /* split inserted mbuf off from chain */
409 /* update sb counters for just added mbuf */
410 sballoc(sb, tailm);
412 /* clear EOR on intermediate mbufs */
413 tailm->m_flags &= ~M_EOR;
417 * Propogate EOR to the last mbuf
419 if (eor) {
420 if (tailm)
421 tailm->m_flags |= eor;
422 else
423 kprintf("semi-panic: sbcompress");
427 * Clean up any defered frees.
429 while (free_chain)
430 free_chain = m_free(free_chain);
432 sbcheck(sb);
436 * Free all mbufs in a sockbuf.
437 * Check that all resources are reclaimed.
439 void
440 sbflush(struct sockbuf *sb)
442 while (sb->sb_mbcnt) {
444 * Don't call sbdrop(sb, 0) if the leading mbuf is non-empty:
445 * we would loop forever. Panic instead.
447 if (!sb->sb_cc && (sb->sb_mb == NULL || sb->sb_mb->m_len))
448 break;
449 sbdrop(sb, (int)sb->sb_cc);
451 KASSERT(!(sb->sb_cc || sb->sb_mb || sb->sb_mbcnt || sb->sb_lastmbuf),
452 ("sbflush: cc %ld || mb %p || mbcnt %ld || lastmbuf %p",
453 sb->sb_cc, sb->sb_mb, sb->sb_mbcnt, sb->sb_lastmbuf));
457 * Drop data from (the front of) a sockbuf. If the current record is
458 * exhausted this routine will move onto the next one and continue dropping
459 * data.
461 void
462 sbdrop(struct sockbuf *sb, int len)
464 struct mbuf *m;
465 struct mbuf *free_chain = NULL;
467 sbcheck(sb);
468 crit_enter();
470 m = sb->sb_mb;
471 while (m && len > 0) {
472 if (m->m_len > len) {
473 m->m_len -= len;
474 m->m_data += len;
475 sb->sb_cc -= len;
476 break;
478 len -= m->m_len;
479 m = sbunlinkmbuf(sb, m, &free_chain);
480 if (m == NULL && len)
481 m = sb->sb_mb;
485 * Remove any trailing 0-length mbufs in the current record. If
486 * the last record for which data was removed is now empty, m will be
487 * NULL.
489 while (m && m->m_len == 0) {
490 m = sbunlinkmbuf(sb, m, &free_chain);
492 crit_exit();
493 if (free_chain)
494 m_freem(free_chain);
495 sbcheck(sb);
499 * Drop a record off the front of a sockbuf and move the next record
500 * to the front.
502 * Must be called while holding a critical section.
504 void
505 sbdroprecord(struct sockbuf *sb)
507 struct mbuf *m;
508 struct mbuf *n;
510 sbcheck(sb);
511 m = sb->sb_mb;
512 if (m) {
513 if ((sb->sb_mb = m->m_nextpkt) == NULL) {
514 sb->sb_lastrecord = NULL;
515 sb->sb_lastmbuf = NULL;
517 m->m_nextpkt = NULL;
518 for (n = m; n; n = n->m_next)
519 sbfree(sb, n);
520 m_freem(m);
521 sbcheck(sb);
526 * Drop the first mbuf off the sockbuf and move the next mbuf to the front.
527 * Currently only the head mbuf of the sockbuf may be dropped this way.
529 * The next mbuf in the same record as the mbuf being removed is returned
530 * or NULL if the record is exhausted. Note that other records may remain
531 * in the sockbuf when NULL is returned.
533 * Must be called while holding a critical section.
535 struct mbuf *
536 sbunlinkmbuf(struct sockbuf *sb, struct mbuf *m, struct mbuf **free_chain)
538 struct mbuf *n;
540 KKASSERT(sb->sb_mb == m);
541 sbfree(sb, m);
542 n = m->m_next;
543 if (n) {
544 sb->sb_mb = n;
545 if (sb->sb_lastrecord == m)
546 sb->sb_lastrecord = n;
547 KKASSERT(sb->sb_lastmbuf != m);
548 n->m_nextpkt = m->m_nextpkt;
549 } else {
550 sb->sb_mb = m->m_nextpkt;
551 if (sb->sb_lastrecord == m) {
552 KKASSERT(sb->sb_mb == NULL);
553 sb->sb_lastrecord = NULL;
555 if (sb->sb_mb == NULL)
556 sb->sb_lastmbuf = NULL;
558 m->m_nextpkt = NULL;
559 if (free_chain) {
560 m->m_next = *free_chain;
561 *free_chain = m;
562 } else {
563 m->m_next = NULL;
565 return(n);
569 * Create a "control" mbuf containing the specified data
570 * with the specified type for presentation on a socket buffer.
572 struct mbuf *
573 sbcreatecontrol(caddr_t p, int size, int type, int level)
575 struct cmsghdr *cp;
576 struct mbuf *m;
578 if (CMSG_SPACE((u_int)size) > MCLBYTES)
579 return (NULL);
580 m = m_getl(CMSG_SPACE((u_int)size), MB_DONTWAIT, MT_CONTROL, 0, NULL);
581 if (m == NULL)
582 return (NULL);
583 m->m_len = CMSG_SPACE(size);
584 cp = mtod(m, struct cmsghdr *);
585 if (p != NULL)
586 memcpy(CMSG_DATA(cp), p, size);
587 cp->cmsg_len = CMSG_LEN(size);
588 cp->cmsg_level = level;
589 cp->cmsg_type = type;
590 mbuftrackid(m, 24);
591 return (m);