linprocfs - Introduce /proc/mounts
[dragonfly.git] / sys / net / altq / altq_red.c
blob3163ea291b435eecec79bc8c1e5732995f34114b
1 /* $KAME: altq_red.c,v 1.19 2004/04/17 10:54:49 kjc Exp $ */
2 /* $DragonFly: src/sys/net/altq/altq_red.c,v 1.4 2006/12/22 23:44:55 swildner Exp $ */
4 /*
5 * Copyright (C) 1997-2003
6 * Sony Computer Science Laboratories Inc. All rights reserved.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
17 * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
31 * Copyright (c) 1990-1994 Regents of the University of California.
32 * All rights reserved.
34 * Redistribution and use in source and binary forms, with or without
35 * modification, are permitted provided that the following conditions
36 * are met:
37 * 1. Redistributions of source code must retain the above copyright
38 * notice, this list of conditions and the following disclaimer.
39 * 2. Redistributions in binary form must reproduce the above copyright
40 * notice, this list of conditions and the following disclaimer in the
41 * documentation and/or other materials provided with the distribution.
42 * 3. All advertising materials mentioning features or use of this software
43 * must display the following acknowledgement:
44 * This product includes software developed by the Computer Systems
45 * Engineering Group at Lawrence Berkeley Laboratory.
46 * 4. Neither the name of the University nor of the Laboratory may be used
47 * to endorse or promote products derived from this software without
48 * specific prior written permission.
50 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
51 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
52 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
53 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
54 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
55 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
56 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
57 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
58 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
59 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
60 * SUCH DAMAGE.
63 #include "opt_altq.h"
64 #include "opt_inet.h"
65 #include "opt_inet6.h"
67 #ifdef ALTQ_RED /* red is enabled by ALTQ_RED option in opt_altq.h */
69 #include <sys/param.h>
70 #include <sys/malloc.h>
71 #include <sys/mbuf.h>
72 #include <sys/socket.h>
73 #include <sys/systm.h>
74 #include <sys/errno.h>
76 #include <net/if.h>
78 #include <netinet/in.h>
79 #include <netinet/in_systm.h>
80 #include <netinet/ip.h>
81 #ifdef INET6
82 #include <netinet/ip6.h>
83 #endif
85 #include <net/pf/pfvar.h>
86 #include <net/altq/altq.h>
87 #include <net/altq/altq_red.h>
90 * ALTQ/RED (Random Early Detection) implementation using 32-bit
91 * fixed-point calculation.
93 * written by kjc using the ns code as a reference.
94 * you can learn more about red and ns from Sally's home page at
95 * http://www-nrg.ee.lbl.gov/floyd/
97 * most of the red parameter values are fixed in this implementation
98 * to prevent fixed-point overflow/underflow.
99 * if you change the parameters, watch out for overflow/underflow!
101 * the parameters used are recommended values by Sally.
102 * the corresponding ns config looks:
103 * q_weight=0.00195
104 * minthresh=5 maxthresh=15 queue-size=60
105 * linterm=30
106 * dropmech=drop-tail
107 * bytes=false (can't be handled by 32-bit fixed-point)
108 * doubleq=false dqthresh=false
109 * wait=true
112 * alternative red parameters for a slow link.
114 * assume the queue length becomes from zero to L and keeps L, it takes
115 * N packets for q_avg to reach 63% of L.
116 * when q_weight is 0.002, N is about 500 packets.
117 * for a slow link like dial-up, 500 packets takes more than 1 minute!
118 * when q_weight is 0.008, N is about 127 packets.
119 * when q_weight is 0.016, N is about 63 packets.
120 * bursts of 50 packets are allowed for 0.002, bursts of 25 packets
121 * are allowed for 0.016.
122 * see Sally's paper for more details.
124 /* normal red parameters */
125 #define W_WEIGHT 512 /* inverse of weight of EWMA (511/512) */
126 /* q_weight = 0.00195 */
128 /* red parameters for a slow link */
129 #define W_WEIGHT_1 128 /* inverse of weight of EWMA (127/128) */
130 /* q_weight = 0.0078125 */
132 /* red parameters for a very slow link (e.g., dialup) */
133 #define W_WEIGHT_2 64 /* inverse of weight of EWMA (63/64) */
134 /* q_weight = 0.015625 */
136 /* fixed-point uses 12-bit decimal places */
137 #define FP_SHIFT 12 /* fixed-point shift */
139 /* red parameters for drop probability */
140 #define INV_P_MAX 10 /* inverse of max drop probability */
141 #define TH_MIN 5 /* min threshold */
142 #define TH_MAX 15 /* max threshold */
144 #define RED_LIMIT 60 /* default max queue lenght */
145 #define RED_STATS /* collect statistics */
148 * our default policy for forced-drop is drop-tail.
149 * (in altq-1.1.2 or earlier, the default was random-drop.
150 * but it makes more sense to punish the cause of the surge.)
151 * to switch to the random-drop policy, define "RED_RANDOM_DROP".
154 /* default red parameter values */
155 static int default_th_min = TH_MIN;
156 static int default_th_max = TH_MAX;
157 static int default_inv_pmax = INV_P_MAX;
160 * red support routines
162 red_t *
163 red_alloc(int weight, int inv_pmax, int th_min, int th_max, int flags, int pkttime)
165 red_t *rp;
166 int w, i;
167 int npkts_per_sec;
169 rp = kmalloc(sizeof(*rp), M_ALTQ, M_WAITOK | M_ZERO);
170 rp->red_avg = 0;
171 rp->red_idle = 1;
173 if (weight == 0)
174 rp->red_weight = W_WEIGHT;
175 else
176 rp->red_weight = weight;
177 if (inv_pmax == 0)
178 rp->red_inv_pmax = default_inv_pmax;
179 else
180 rp->red_inv_pmax = inv_pmax;
181 if (th_min == 0)
182 rp->red_thmin = default_th_min;
183 else
184 rp->red_thmin = th_min;
185 if (th_max == 0)
186 rp->red_thmax = default_th_max;
187 else
188 rp->red_thmax = th_max;
190 rp->red_flags = flags;
192 if (pkttime == 0)
193 /* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */
194 rp->red_pkttime = 800;
195 else
196 rp->red_pkttime = pkttime;
198 if (weight == 0) {
199 /* when the link is very slow, adjust red parameters */
200 npkts_per_sec = 1000000 / rp->red_pkttime;
201 if (npkts_per_sec < 50) {
202 /* up to about 400Kbps */
203 rp->red_weight = W_WEIGHT_2;
204 } else if (npkts_per_sec < 300) {
205 /* up to about 2.4Mbps */
206 rp->red_weight = W_WEIGHT_1;
210 /* calculate wshift. weight must be power of 2 */
211 w = rp->red_weight;
212 for (i = 0; w > 1; i++)
213 w = w >> 1;
214 rp->red_wshift = i;
215 w = 1 << rp->red_wshift;
216 if (w != rp->red_weight) {
217 kprintf("invalid weight value %d for red! use %d\n",
218 rp->red_weight, w);
219 rp->red_weight = w;
223 * thmin_s and thmax_s are scaled versions of th_min and th_max
224 * to be compared with avg.
226 rp->red_thmin_s = rp->red_thmin << (rp->red_wshift + FP_SHIFT);
227 rp->red_thmax_s = rp->red_thmax << (rp->red_wshift + FP_SHIFT);
230 * precompute probability denominator
231 * probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point
233 rp->red_probd = (2 * (rp->red_thmax - rp->red_thmin)
234 * rp->red_inv_pmax) << FP_SHIFT;
236 /* allocate weight table */
237 rp->red_wtab = wtab_alloc(rp->red_weight);
239 microtime(&rp->red_last);
240 return (rp);
243 void
244 red_destroy(red_t *rp)
246 wtab_destroy(rp->red_wtab);
247 kfree(rp, M_ALTQ);
250 void
251 red_getstats(red_t *rp, struct redstats *sp)
253 sp->q_avg = rp->red_avg >> rp->red_wshift;
254 sp->xmit_cnt = rp->red_stats.xmit_cnt;
255 sp->drop_cnt = rp->red_stats.drop_cnt;
256 sp->drop_forced = rp->red_stats.drop_forced;
257 sp->drop_unforced = rp->red_stats.drop_unforced;
258 sp->marked_packets = rp->red_stats.marked_packets;
262 red_addq(red_t *rp, class_queue_t *q, struct mbuf *m, struct altq_pktattr *pktattr)
264 int avg, droptype;
265 int n;
267 avg = rp->red_avg;
270 * if we were idle, we pretend that n packets arrived during
271 * the idle period.
273 if (rp->red_idle) {
274 struct timeval now;
275 int t;
277 rp->red_idle = 0;
278 microtime(&now);
279 t = (now.tv_sec - rp->red_last.tv_sec);
280 if (t > 60) {
282 * being idle for more than 1 minute, set avg to zero.
283 * this prevents t from overflow.
285 avg = 0;
286 } else {
287 t = t * 1000000 + (now.tv_usec - rp->red_last.tv_usec);
288 n = t / rp->red_pkttime - 1;
290 /* the following line does (avg = (1 - Wq)^n * avg) */
291 if (n > 0)
292 avg = (avg >> FP_SHIFT) *
293 pow_w(rp->red_wtab, n);
297 /* run estimator. (note: avg is scaled by WEIGHT in fixed-point) */
298 avg += (qlen(q) << FP_SHIFT) - (avg >> rp->red_wshift);
299 rp->red_avg = avg; /* save the new value */
302 * red_count keeps a tally of arriving traffic that has not
303 * been dropped.
305 rp->red_count++;
307 /* see if we drop early */
308 droptype = DTYPE_NODROP;
309 if (avg >= rp->red_thmin_s && qlen(q) > 1) {
310 if (avg >= rp->red_thmax_s) {
311 /* avg >= th_max: forced drop */
312 droptype = DTYPE_FORCED;
313 } else if (rp->red_old == 0) {
314 /* first exceeds th_min */
315 rp->red_count = 1;
316 rp->red_old = 1;
317 } else if (drop_early((avg - rp->red_thmin_s) >> rp->red_wshift,
318 rp->red_probd, rp->red_count)) {
319 /* mark or drop by red */
320 if ((rp->red_flags & REDF_ECN) &&
321 mark_ecn(m, pktattr, rp->red_flags)) {
322 /* successfully marked. do not drop. */
323 rp->red_count = 0;
324 #ifdef RED_STATS
325 rp->red_stats.marked_packets++;
326 #endif
327 } else {
328 /* unforced drop by red */
329 droptype = DTYPE_EARLY;
332 } else {
333 /* avg < th_min */
334 rp->red_old = 0;
338 * if the queue length hits the hard limit, it's a forced drop.
340 if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q))
341 droptype = DTYPE_FORCED;
343 #ifdef RED_RANDOM_DROP
344 /* if successful or forced drop, enqueue this packet. */
345 if (droptype != DTYPE_EARLY)
346 _addq(q, m);
347 #else
348 /* if successful, enqueue this packet. */
349 if (droptype == DTYPE_NODROP)
350 _addq(q, m);
351 #endif
352 if (droptype != DTYPE_NODROP) {
353 if (droptype == DTYPE_EARLY) {
354 /* drop the incoming packet */
355 #ifdef RED_STATS
356 rp->red_stats.drop_unforced++;
357 #endif
358 } else {
359 /* forced drop, select a victim packet in the queue. */
360 #ifdef RED_RANDOM_DROP
361 m = _getq_random(q);
362 #endif
363 #ifdef RED_STATS
364 rp->red_stats.drop_forced++;
365 #endif
367 #ifdef RED_STATS
368 PKTCNTR_ADD(&rp->red_stats.drop_cnt, m_pktlen(m));
369 #endif
370 rp->red_count = 0;
371 m_freem(m);
372 return (-1);
374 /* successfully queued */
375 #ifdef RED_STATS
376 PKTCNTR_ADD(&rp->red_stats.xmit_cnt, m_pktlen(m));
377 #endif
378 return (0);
382 * early-drop probability is calculated as follows:
383 * prob = p_max * (avg - th_min) / (th_max - th_min)
384 * prob_a = prob / (2 - count*prob)
385 * = (avg-th_min) / (2*(th_max-th_min)*inv_p_max - count*(avg-th_min))
386 * here prob_a increases as successive undrop count increases.
387 * (prob_a starts from prob/2, becomes prob when (count == (1 / prob)),
388 * becomes 1 when (count >= (2 / prob))).
391 drop_early(int fp_len, int fp_probd, int count)
393 int d; /* denominator of drop-probability */
395 d = fp_probd - count * fp_len;
396 if (d <= 0) {
397 /* count exceeds the hard limit: drop or mark */
398 return (1);
402 * now the range of d is [1..600] in fixed-point. (when
403 * th_max-th_min=10 and p_max=1/30)
404 * drop probability = (avg - TH_MIN) / d
407 if ((karc4random() % d) < fp_len) {
408 /* drop or mark */
409 return (1);
411 /* no drop/mark */
412 return (0);
416 * try to mark CE bit to the packet.
417 * returns 1 if successfully marked, 0 otherwise.
420 mark_ecn(struct mbuf *m, struct altq_pktattr *pktattr, int flags)
422 struct mbuf *m0;
423 void *hdr;
424 int af;
426 if ((m->m_pkthdr.fw_flags & ALTQ_MBUF_TAGGED) == 0)
427 return (0);
428 af = m->m_pkthdr.ecn_af;
429 hdr = m->m_pkthdr.header;
431 if (af != AF_INET && af != AF_INET6)
432 return (0);
434 /* verify that pattr_hdr is within the mbuf data */
435 for (m0 = m; m0 != NULL; m0 = m0->m_next) {
436 if (((caddr_t)hdr >= m0->m_data) &&
437 ((caddr_t)hdr < m0->m_data + m0->m_len))
438 break;
440 if (m0 == NULL) {
441 /* ick, tag info is stale */
442 return (0);
445 switch (af) {
446 case AF_INET:
447 if (flags & REDF_ECN4) {
448 struct ip *ip = hdr;
449 uint8_t otos;
450 int sum;
452 if (ip->ip_v != 4)
453 return (0); /* version mismatch! */
455 if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT)
456 return (0); /* not-ECT */
457 if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
458 return (1); /* already marked */
461 * ecn-capable but not marked,
462 * mark CE and update checksum
464 otos = ip->ip_tos;
465 ip->ip_tos |= IPTOS_ECN_CE;
467 * update checksum (from RFC1624)
468 * HC' = ~(~HC + ~m + m')
470 sum = ~ntohs(ip->ip_sum) & 0xffff;
471 sum += (~otos & 0xffff) + ip->ip_tos;
472 sum = (sum >> 16) + (sum & 0xffff);
473 sum += (sum >> 16); /* add carry */
474 ip->ip_sum = htons(~sum & 0xffff);
475 return (1);
477 break;
478 #ifdef INET6
479 case AF_INET6:
480 if (flags & REDF_ECN6) {
481 struct ip6_hdr *ip6 = hdr;
482 uint32_t flowlabel;
484 flowlabel = ntohl(ip6->ip6_flow);
485 if ((flowlabel >> 28) != 6)
486 return (0); /* version mismatch! */
487 if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
488 (IPTOS_ECN_NOTECT << 20))
489 return (0); /* not-ECT */
490 if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
491 (IPTOS_ECN_CE << 20))
492 return (1); /* already marked */
494 * ecn-capable but not marked, mark CE
496 flowlabel |= (IPTOS_ECN_CE << 20);
497 ip6->ip6_flow = htonl(flowlabel);
498 return (1);
500 break;
501 #endif /* INET6 */
504 /* not marked */
505 return (0);
508 struct mbuf *
509 red_getq(red_t *rp, class_queue_t *q)
511 struct mbuf *m;
513 if ((m = _getq(q)) == NULL) {
514 if (rp->red_idle == 0) {
515 rp->red_idle = 1;
516 microtime(&rp->red_last);
518 return NULL;
521 rp->red_idle = 0;
522 return (m);
526 * helper routine to calibrate avg during idle.
527 * pow_w(wtab, n) returns (1 - Wq)^n in fixed-point
528 * here Wq = 1/weight and the code assumes Wq is close to zero.
530 * w_tab[n] holds ((1 - Wq)^(2^n)) in fixed-point.
532 static SLIST_HEAD(, wtab) wtab_list = SLIST_HEAD_INITIALIZER(&wtab_list);
534 struct wtab *
535 wtab_alloc(int weight)
537 struct wtab *w;
538 int i;
540 SLIST_FOREACH(w, &wtab_list, w_link) {
541 if (w->w_weight == weight) {
542 w->w_refcount++;
543 return (w);
547 w = kmalloc(sizeof(*w), M_ALTQ, M_WAITOK | M_ZERO);
548 w->w_weight = weight;
549 w->w_refcount = 1;
550 SLIST_INSERT_HEAD(&wtab_list, w, w_link);
552 /* initialize the weight table */
553 w->w_tab[0] = ((weight - 1) << FP_SHIFT) / weight;
554 for (i = 1; i < 32; i++) {
555 w->w_tab[i] = (w->w_tab[i-1] * w->w_tab[i-1]) >> FP_SHIFT;
556 if (w->w_tab[i] == 0 && w->w_param_max == 0)
557 w->w_param_max = 1 << i;
560 return (w);
564 wtab_destroy(struct wtab *w)
566 if (--w->w_refcount > 0)
567 return (0);
569 SLIST_REMOVE(&wtab_list, w, wtab, w_link);
570 kfree(w, M_ALTQ);
572 return (0);
575 int32_t
576 pow_w(struct wtab *w, int n)
578 int i, bit;
579 int32_t val;
581 if (n >= w->w_param_max)
582 return (0);
584 val = 1 << FP_SHIFT;
585 if (n <= 0)
586 return (val);
588 bit = 1;
589 i = 0;
590 while (n) {
591 if (n & bit) {
592 val = (val * w->w_tab[i]) >> FP_SHIFT;
593 n &= ~bit;
595 i++;
596 bit <<= 1;
598 return (val);
601 #endif /* ALTQ_RED */