1 /* $KAME: altq_red.c,v 1.19 2004/04/17 10:54:49 kjc Exp $ */
2 /* $DragonFly: src/sys/net/altq/altq_red.c,v 1.4 2006/12/22 23:44:55 swildner Exp $ */
5 * Copyright (C) 1997-2003
6 * Sony Computer Science Laboratories Inc. All rights reserved.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
17 * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * Copyright (c) 1990-1994 Regents of the University of California.
32 * All rights reserved.
34 * Redistribution and use in source and binary forms, with or without
35 * modification, are permitted provided that the following conditions
37 * 1. Redistributions of source code must retain the above copyright
38 * notice, this list of conditions and the following disclaimer.
39 * 2. Redistributions in binary form must reproduce the above copyright
40 * notice, this list of conditions and the following disclaimer in the
41 * documentation and/or other materials provided with the distribution.
42 * 3. All advertising materials mentioning features or use of this software
43 * must display the following acknowledgement:
44 * This product includes software developed by the Computer Systems
45 * Engineering Group at Lawrence Berkeley Laboratory.
46 * 4. Neither the name of the University nor of the Laboratory may be used
47 * to endorse or promote products derived from this software without
48 * specific prior written permission.
50 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
51 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
52 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
53 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
54 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
55 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
56 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
57 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
58 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
59 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
65 #include "opt_inet6.h"
67 #ifdef ALTQ_RED /* red is enabled by ALTQ_RED option in opt_altq.h */
69 #include <sys/param.h>
70 #include <sys/malloc.h>
72 #include <sys/socket.h>
73 #include <sys/systm.h>
74 #include <sys/errno.h>
78 #include <netinet/in.h>
79 #include <netinet/in_systm.h>
80 #include <netinet/ip.h>
82 #include <netinet/ip6.h>
85 #include <net/pf/pfvar.h>
86 #include <net/altq/altq.h>
87 #include <net/altq/altq_red.h>
90 * ALTQ/RED (Random Early Detection) implementation using 32-bit
91 * fixed-point calculation.
93 * written by kjc using the ns code as a reference.
94 * you can learn more about red and ns from Sally's home page at
95 * http://www-nrg.ee.lbl.gov/floyd/
97 * most of the red parameter values are fixed in this implementation
98 * to prevent fixed-point overflow/underflow.
99 * if you change the parameters, watch out for overflow/underflow!
101 * the parameters used are recommended values by Sally.
102 * the corresponding ns config looks:
104 * minthresh=5 maxthresh=15 queue-size=60
107 * bytes=false (can't be handled by 32-bit fixed-point)
108 * doubleq=false dqthresh=false
112 * alternative red parameters for a slow link.
114 * assume the queue length becomes from zero to L and keeps L, it takes
115 * N packets for q_avg to reach 63% of L.
116 * when q_weight is 0.002, N is about 500 packets.
117 * for a slow link like dial-up, 500 packets takes more than 1 minute!
118 * when q_weight is 0.008, N is about 127 packets.
119 * when q_weight is 0.016, N is about 63 packets.
120 * bursts of 50 packets are allowed for 0.002, bursts of 25 packets
121 * are allowed for 0.016.
122 * see Sally's paper for more details.
124 /* normal red parameters */
125 #define W_WEIGHT 512 /* inverse of weight of EWMA (511/512) */
126 /* q_weight = 0.00195 */
128 /* red parameters for a slow link */
129 #define W_WEIGHT_1 128 /* inverse of weight of EWMA (127/128) */
130 /* q_weight = 0.0078125 */
132 /* red parameters for a very slow link (e.g., dialup) */
133 #define W_WEIGHT_2 64 /* inverse of weight of EWMA (63/64) */
134 /* q_weight = 0.015625 */
136 /* fixed-point uses 12-bit decimal places */
137 #define FP_SHIFT 12 /* fixed-point shift */
139 /* red parameters for drop probability */
140 #define INV_P_MAX 10 /* inverse of max drop probability */
141 #define TH_MIN 5 /* min threshold */
142 #define TH_MAX 15 /* max threshold */
144 #define RED_LIMIT 60 /* default max queue lenght */
145 #define RED_STATS /* collect statistics */
148 * our default policy for forced-drop is drop-tail.
149 * (in altq-1.1.2 or earlier, the default was random-drop.
150 * but it makes more sense to punish the cause of the surge.)
151 * to switch to the random-drop policy, define "RED_RANDOM_DROP".
154 /* default red parameter values */
155 static int default_th_min
= TH_MIN
;
156 static int default_th_max
= TH_MAX
;
157 static int default_inv_pmax
= INV_P_MAX
;
160 * red support routines
163 red_alloc(int weight
, int inv_pmax
, int th_min
, int th_max
, int flags
, int pkttime
)
169 rp
= kmalloc(sizeof(*rp
), M_ALTQ
, M_WAITOK
| M_ZERO
);
174 rp
->red_weight
= W_WEIGHT
;
176 rp
->red_weight
= weight
;
178 rp
->red_inv_pmax
= default_inv_pmax
;
180 rp
->red_inv_pmax
= inv_pmax
;
182 rp
->red_thmin
= default_th_min
;
184 rp
->red_thmin
= th_min
;
186 rp
->red_thmax
= default_th_max
;
188 rp
->red_thmax
= th_max
;
190 rp
->red_flags
= flags
;
193 /* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */
194 rp
->red_pkttime
= 800;
196 rp
->red_pkttime
= pkttime
;
199 /* when the link is very slow, adjust red parameters */
200 npkts_per_sec
= 1000000 / rp
->red_pkttime
;
201 if (npkts_per_sec
< 50) {
202 /* up to about 400Kbps */
203 rp
->red_weight
= W_WEIGHT_2
;
204 } else if (npkts_per_sec
< 300) {
205 /* up to about 2.4Mbps */
206 rp
->red_weight
= W_WEIGHT_1
;
210 /* calculate wshift. weight must be power of 2 */
212 for (i
= 0; w
> 1; i
++)
215 w
= 1 << rp
->red_wshift
;
216 if (w
!= rp
->red_weight
) {
217 kprintf("invalid weight value %d for red! use %d\n",
223 * thmin_s and thmax_s are scaled versions of th_min and th_max
224 * to be compared with avg.
226 rp
->red_thmin_s
= rp
->red_thmin
<< (rp
->red_wshift
+ FP_SHIFT
);
227 rp
->red_thmax_s
= rp
->red_thmax
<< (rp
->red_wshift
+ FP_SHIFT
);
230 * precompute probability denominator
231 * probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point
233 rp
->red_probd
= (2 * (rp
->red_thmax
- rp
->red_thmin
)
234 * rp
->red_inv_pmax
) << FP_SHIFT
;
236 /* allocate weight table */
237 rp
->red_wtab
= wtab_alloc(rp
->red_weight
);
239 microtime(&rp
->red_last
);
244 red_destroy(red_t
*rp
)
246 wtab_destroy(rp
->red_wtab
);
251 red_getstats(red_t
*rp
, struct redstats
*sp
)
253 sp
->q_avg
= rp
->red_avg
>> rp
->red_wshift
;
254 sp
->xmit_cnt
= rp
->red_stats
.xmit_cnt
;
255 sp
->drop_cnt
= rp
->red_stats
.drop_cnt
;
256 sp
->drop_forced
= rp
->red_stats
.drop_forced
;
257 sp
->drop_unforced
= rp
->red_stats
.drop_unforced
;
258 sp
->marked_packets
= rp
->red_stats
.marked_packets
;
262 red_addq(red_t
*rp
, class_queue_t
*q
, struct mbuf
*m
, struct altq_pktattr
*pktattr
)
270 * if we were idle, we pretend that n packets arrived during
279 t
= (now
.tv_sec
- rp
->red_last
.tv_sec
);
282 * being idle for more than 1 minute, set avg to zero.
283 * this prevents t from overflow.
287 t
= t
* 1000000 + (now
.tv_usec
- rp
->red_last
.tv_usec
);
288 n
= t
/ rp
->red_pkttime
- 1;
290 /* the following line does (avg = (1 - Wq)^n * avg) */
292 avg
= (avg
>> FP_SHIFT
) *
293 pow_w(rp
->red_wtab
, n
);
297 /* run estimator. (note: avg is scaled by WEIGHT in fixed-point) */
298 avg
+= (qlen(q
) << FP_SHIFT
) - (avg
>> rp
->red_wshift
);
299 rp
->red_avg
= avg
; /* save the new value */
302 * red_count keeps a tally of arriving traffic that has not
307 /* see if we drop early */
308 droptype
= DTYPE_NODROP
;
309 if (avg
>= rp
->red_thmin_s
&& qlen(q
) > 1) {
310 if (avg
>= rp
->red_thmax_s
) {
311 /* avg >= th_max: forced drop */
312 droptype
= DTYPE_FORCED
;
313 } else if (rp
->red_old
== 0) {
314 /* first exceeds th_min */
317 } else if (drop_early((avg
- rp
->red_thmin_s
) >> rp
->red_wshift
,
318 rp
->red_probd
, rp
->red_count
)) {
319 /* mark or drop by red */
320 if ((rp
->red_flags
& REDF_ECN
) &&
321 mark_ecn(m
, pktattr
, rp
->red_flags
)) {
322 /* successfully marked. do not drop. */
325 rp
->red_stats
.marked_packets
++;
328 /* unforced drop by red */
329 droptype
= DTYPE_EARLY
;
338 * if the queue length hits the hard limit, it's a forced drop.
340 if (droptype
== DTYPE_NODROP
&& qlen(q
) >= qlimit(q
))
341 droptype
= DTYPE_FORCED
;
343 #ifdef RED_RANDOM_DROP
344 /* if successful or forced drop, enqueue this packet. */
345 if (droptype
!= DTYPE_EARLY
)
348 /* if successful, enqueue this packet. */
349 if (droptype
== DTYPE_NODROP
)
352 if (droptype
!= DTYPE_NODROP
) {
353 if (droptype
== DTYPE_EARLY
) {
354 /* drop the incoming packet */
356 rp
->red_stats
.drop_unforced
++;
359 /* forced drop, select a victim packet in the queue. */
360 #ifdef RED_RANDOM_DROP
364 rp
->red_stats
.drop_forced
++;
368 PKTCNTR_ADD(&rp
->red_stats
.drop_cnt
, m_pktlen(m
));
374 /* successfully queued */
376 PKTCNTR_ADD(&rp
->red_stats
.xmit_cnt
, m_pktlen(m
));
382 * early-drop probability is calculated as follows:
383 * prob = p_max * (avg - th_min) / (th_max - th_min)
384 * prob_a = prob / (2 - count*prob)
385 * = (avg-th_min) / (2*(th_max-th_min)*inv_p_max - count*(avg-th_min))
386 * here prob_a increases as successive undrop count increases.
387 * (prob_a starts from prob/2, becomes prob when (count == (1 / prob)),
388 * becomes 1 when (count >= (2 / prob))).
391 drop_early(int fp_len
, int fp_probd
, int count
)
393 int d
; /* denominator of drop-probability */
395 d
= fp_probd
- count
* fp_len
;
397 /* count exceeds the hard limit: drop or mark */
402 * now the range of d is [1..600] in fixed-point. (when
403 * th_max-th_min=10 and p_max=1/30)
404 * drop probability = (avg - TH_MIN) / d
407 if ((karc4random() % d
) < fp_len
) {
416 * try to mark CE bit to the packet.
417 * returns 1 if successfully marked, 0 otherwise.
420 mark_ecn(struct mbuf
*m
, struct altq_pktattr
*pktattr
, int flags
)
426 if (m
->m_pkthdr
.fw_flags
& PF_MBUF_STRUCTURE
) {
427 af
= m
->m_pkthdr
.pf
.ecn_af
;
428 hdr
= m
->m_pkthdr
.pf
.hdr
;
429 } else if (pktattr
) {
430 af
= pktattr
->pattr_af
;
431 hdr
= pktattr
->pattr_hdr
;
436 if (af
!= AF_INET
&& af
!= AF_INET6
)
439 /* verify that pattr_hdr is within the mbuf data */
440 for (m0
= m
; m0
!= NULL
; m0
= m0
->m_next
) {
441 if (((caddr_t
)hdr
>= m0
->m_data
) &&
442 ((caddr_t
)hdr
< m0
->m_data
+ m0
->m_len
))
446 /* ick, tag info is stale */
452 if (flags
& REDF_ECN4
) {
457 if ((ip
->ip_tos
& IPTOS_ECN_MASK
) == IPTOS_ECN_NOTECT
)
458 return (0); /* not-ECT */
459 if ((ip
->ip_tos
& IPTOS_ECN_MASK
) == IPTOS_ECN_CE
)
460 return (1); /* already marked */
463 * ecn-capable but not marked,
464 * mark CE and update checksum
467 ip
->ip_tos
|= IPTOS_ECN_CE
;
469 * update checksum (from RFC1624)
470 * HC' = ~(~HC + ~m + m')
472 sum
= ~ntohs(ip
->ip_sum
) & 0xffff;
473 sum
+= (~otos
& 0xffff) + ip
->ip_tos
;
474 sum
= (sum
>> 16) + (sum
& 0xffff);
475 sum
+= (sum
>> 16); /* add carry */
476 ip
->ip_sum
= htons(~sum
& 0xffff);
482 if (flags
& REDF_ECN6
) {
483 struct ip6_hdr
*ip6
= hdr
;
486 flowlabel
= ntohl(ip6
->ip6_flow
);
487 if ((flowlabel
>> 28) != 6)
488 return (0); /* version mismatch! */
489 if ((flowlabel
& (IPTOS_ECN_MASK
<< 20)) ==
490 (IPTOS_ECN_NOTECT
<< 20))
491 return (0); /* not-ECT */
492 if ((flowlabel
& (IPTOS_ECN_MASK
<< 20)) ==
493 (IPTOS_ECN_CE
<< 20))
494 return (1); /* already marked */
496 * ecn-capable but not marked, mark CE
498 flowlabel
|= (IPTOS_ECN_CE
<< 20);
499 ip6
->ip6_flow
= htonl(flowlabel
);
511 red_getq(red_t
*rp
, class_queue_t
*q
)
515 if ((m
= _getq(q
)) == NULL
) {
516 if (rp
->red_idle
== 0) {
518 microtime(&rp
->red_last
);
528 * helper routine to calibrate avg during idle.
529 * pow_w(wtab, n) returns (1 - Wq)^n in fixed-point
530 * here Wq = 1/weight and the code assumes Wq is close to zero.
532 * w_tab[n] holds ((1 - Wq)^(2^n)) in fixed-point.
534 static SLIST_HEAD(, wtab
) wtab_list
= SLIST_HEAD_INITIALIZER(&wtab_list
);
537 wtab_alloc(int weight
)
542 SLIST_FOREACH(w
, &wtab_list
, w_link
) {
543 if (w
->w_weight
== weight
) {
549 w
= kmalloc(sizeof(*w
), M_ALTQ
, M_WAITOK
| M_ZERO
);
550 w
->w_weight
= weight
;
552 SLIST_INSERT_HEAD(&wtab_list
, w
, w_link
);
554 /* initialize the weight table */
555 w
->w_tab
[0] = ((weight
- 1) << FP_SHIFT
) / weight
;
556 for (i
= 1; i
< 32; i
++) {
557 w
->w_tab
[i
] = (w
->w_tab
[i
-1] * w
->w_tab
[i
-1]) >> FP_SHIFT
;
558 if (w
->w_tab
[i
] == 0 && w
->w_param_max
== 0)
559 w
->w_param_max
= 1 << i
;
566 wtab_destroy(struct wtab
*w
)
568 if (--w
->w_refcount
> 0)
571 SLIST_REMOVE(&wtab_list
, w
, wtab
, w_link
);
578 pow_w(struct wtab
*w
, int n
)
583 if (n
>= w
->w_param_max
)
594 val
= (val
* w
->w_tab
[i
]) >> FP_SHIFT
;
603 #endif /* ALTQ_RED */