2 * net/sched/sch_red.c Random Early Detection queue.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
12 * J Hadi Salim <hadi@nortel.com> 980914: computation fixes
13 * Alexey Makarenko <makar@phoenix.kharkov.ua> 990814: qave on idle link was calculated incorrectly.
14 * J Hadi Salim <hadi@nortelnetworks.com> 980816: ECN support
17 #include <linux/config.h>
18 #include <linux/module.h>
19 #include <asm/uaccess.h>
20 #include <asm/system.h>
21 #include <asm/bitops.h>
22 #include <linux/types.h>
23 #include <linux/kernel.h>
24 #include <linux/sched.h>
25 #include <linux/string.h>
27 #include <linux/socket.h>
28 #include <linux/sockios.h>
30 #include <linux/errno.h>
31 #include <linux/interrupt.h>
32 #include <linux/if_ether.h>
33 #include <linux/inet.h>
34 #include <linux/netdevice.h>
35 #include <linux/etherdevice.h>
36 #include <linux/notifier.h>
38 #include <net/route.h>
39 #include <linux/skbuff.h>
41 #include <net/pkt_sched.h>
42 #include <net/inet_ecn.h>
44 #define RED_ECN_ECT 0x02
45 #define RED_ECN_CE 0x01
48 /* Random Early Detection (RED) algorithm.
49 =======================================
51 Source: Sally Floyd and Van Jacobson, "Random Early Detection Gateways
52 for Congestion Avoidance", 1993, IEEE/ACM Transactions on Networking.
54 This file codes a "divisionless" version of RED algorithm
55 as written down in Fig.17 of the paper.
60 When a new packet arrives we calculate the average queue length:
62 avg = (1-W)*avg + W*current_queue_len,
64 W is the filter time constant (choosen as 2^(-Wlog)), it controls
65 the inertia of the algorithm. To allow larger bursts, W should be
68 if (avg > th_max) -> packet marked (dropped).
69 if (avg < th_min) -> packet passes.
70 if (th_min < avg < th_max) we calculate probability:
72 Pb = max_P * (avg - th_min)/(th_max-th_min)
74 and mark (drop) packet with this probability.
75 Pb changes from 0 (at avg==th_min) to max_P (avg==th_max).
76 max_P should be small (not 1), usually 0.01..0.02 is good value.
78 max_P is chosen as a number, so that max_P/(th_max-th_min)
79 is a negative power of two in order arithmetics to contain
83 Parameters, settable by user:
84 -----------------------------
86 limit - bytes (must be > qth_max + burst)
88 Hard limit on queue length, should be chosen >qth_max
89 to allow packet bursts. This parameter does not
90 affect the algorithms behaviour and can be chosen
91 arbitrarily high (well, less than ram size)
92 Really, this limit will never be reached
93 if RED works correctly.
95 qth_min - bytes (should be < qth_max/2)
96 qth_max - bytes (should be at least 2*qth_min and less limit)
97 Wlog - bits (<32) log(1/W).
100 Plog is related to max_P by formula:
102 max_P = (qth_max-qth_min)/2^Plog;
104 F.e. if qth_max=128K and qth_min=32K, then Plog=22
105 corresponds to max_P=0.02
110 Lookup table for log((1-W)^(t/t_ave).
118 If you want to allow bursts of L packets of size S,
121 L + 1 - th_min/S < (1-(1-W)^L)/W
123 th_min/S = 32 th_min/S = 4
138 struct red_sched_data
141 u32 limit
; /* HARD maximal queue length */
142 u32 qth_min
; /* Min average length threshold: A scaled */
143 u32 qth_max
; /* Max average length threshold: A scaled */
147 char Wlog
; /* log(W) */
148 char Plog
; /* random number bits */
153 unsigned long qave
; /* Average queue length: A scaled */
154 int qcount
; /* Packets since last random number generation */
155 u32 qR
; /* Cached random number */
157 psched_time_t qidlestart
; /* Start of idle period */
158 struct tc_red_xstats st
;
161 static int red_ecn_mark(struct sk_buff
*skb
)
163 if (skb
->nh
.raw
+ 20 > skb
->tail
)
166 switch (skb
->protocol
) {
167 case __constant_htons(ETH_P_IP
):
169 u8 tos
= skb
->nh
.iph
->tos
;
171 if (!(tos
& RED_ECN_ECT
))
174 if (!(tos
& RED_ECN_CE
))
175 IP_ECN_set_ce(skb
->nh
.iph
);
180 case __constant_htons(ETH_P_IPV6
):
182 u32 label
= *(u32
*)skb
->nh
.raw
;
184 if (!(label
& __constant_htonl(RED_ECN_ECT
<<20)))
186 label
|= __constant_htonl(RED_ECN_CE
<<20);
196 red_enqueue(struct sk_buff
*skb
, struct Qdisc
* sch
)
198 struct red_sched_data
*q
= (struct red_sched_data
*)sch
->data
;
202 if (!PSCHED_IS_PASTPERFECT(q
->qidlestart
)) {
206 PSCHED_GET_TIME(now
);
207 us_idle
= PSCHED_TDIFF_SAFE(now
, q
->qidlestart
, q
->Scell_max
, 0);
208 PSCHED_SET_PASTPERFECT(q
->qidlestart
);
211 The problem: ideally, average length queue recalcultion should
212 be done over constant clock intervals. This is too expensive, so that
213 the calculation is driven by outgoing packets.
214 When the queue is idle we have to model this clock by hand.
216 SF+VJ proposed to "generate" m = idletime/(average_pkt_size/bandwidth)
217 dummy packets as a burst after idle time, i.e.
221 This is an apparently overcomplicated solution (f.e. we have to precompute
222 a table to make this calculation in reasonable time)
223 I believe that a simpler model may be used here,
224 but it is field for experiments.
226 shift
= q
->Stab
[us_idle
>>q
->Scell_log
];
231 /* Approximate initial part of exponent
232 with linear function:
233 (1-W)^m ~= 1-mW + ...
235 Seems, it is the best solution to
236 problem of too coarce exponent tabulation.
239 us_idle
= (q
->qave
* us_idle
)>>q
->Scell_log
;
240 if (us_idle
< q
->qave
/2)
246 q
->qave
+= sch
->stats
.backlog
- (q
->qave
>> q
->Wlog
);
248 q->qave is fixed point number with point at Wlog.
249 The formulae above is equvalent to floating point
252 qave = qave*(1-W) + sch->stats.backlog*W;
257 if (q
->qave
< q
->qth_min
) {
260 if (sch
->stats
.backlog
<= q
->limit
) {
261 __skb_queue_tail(&sch
->q
, skb
);
262 sch
->stats
.backlog
+= skb
->len
;
263 sch
->stats
.bytes
+= skb
->len
;
264 sch
->stats
.packets
++;
265 return NET_XMIT_SUCCESS
;
271 return NET_XMIT_DROP
;
273 if (q
->qave
>= q
->qth_max
) {
275 sch
->stats
.overlimits
++;
277 if (!(q
->flags
&TC_RED_ECN
) || !red_ecn_mark(skb
)) {
286 /* The formula used below causes questions.
288 OK. qR is random number in the interval 0..Rmask
289 i.e. 0..(2^Plog). If we used floating point
290 arithmetics, it would be: (2^Plog)*rnd_num,
291 where rnd_num is less 1.
293 Taking into account, that qave have fixed
294 point at Wlog, and Plog is related to max_P by
295 max_P = (qth_max-qth_min)/2^Plog; two lines
296 below have the following floating point equivalent:
298 max_P*(qave - qth_min)/(qth_max-qth_min) < rnd/qcount
300 Any questions? --ANK (980924)
302 if (((q
->qave
- q
->qth_min
)>>q
->Wlog
)*q
->qcount
< q
->qR
)
305 q
->qR
= net_random()&q
->Rmask
;
306 sch
->stats
.overlimits
++;
309 q
->qR
= net_random()&q
->Rmask
;
319 red_requeue(struct sk_buff
*skb
, struct Qdisc
* sch
)
321 struct red_sched_data
*q
= (struct red_sched_data
*)sch
->data
;
323 PSCHED_SET_PASTPERFECT(q
->qidlestart
);
325 __skb_queue_head(&sch
->q
, skb
);
326 sch
->stats
.backlog
+= skb
->len
;
330 static struct sk_buff
*
331 red_dequeue(struct Qdisc
* sch
)
334 struct red_sched_data
*q
= (struct red_sched_data
*)sch
->data
;
336 skb
= __skb_dequeue(&sch
->q
);
338 sch
->stats
.backlog
-= skb
->len
;
341 PSCHED_GET_TIME(q
->qidlestart
);
346 red_drop(struct Qdisc
* sch
)
349 struct red_sched_data
*q
= (struct red_sched_data
*)sch
->data
;
351 skb
= __skb_dequeue_tail(&sch
->q
);
353 sch
->stats
.backlog
-= skb
->len
;
359 PSCHED_GET_TIME(q
->qidlestart
);
363 static void red_reset(struct Qdisc
* sch
)
365 struct red_sched_data
*q
= (struct red_sched_data
*)sch
->data
;
368 while((skb
=__skb_dequeue(&sch
->q
))!=NULL
)
370 sch
->stats
.backlog
= 0;
371 PSCHED_SET_PASTPERFECT(q
->qidlestart
);
376 static int red_change(struct Qdisc
*sch
, struct rtattr
*opt
)
378 struct red_sched_data
*q
= (struct red_sched_data
*)sch
->data
;
379 struct rtattr
*tb
[TCA_RED_STAB
];
380 struct tc_red_qopt
*ctl
;
383 rtattr_parse(tb
, TCA_RED_STAB
, RTA_DATA(opt
), RTA_PAYLOAD(opt
)) ||
384 tb
[TCA_RED_PARMS
-1] == 0 || tb
[TCA_RED_STAB
-1] == 0 ||
385 RTA_PAYLOAD(tb
[TCA_RED_PARMS
-1]) < sizeof(*ctl
) ||
386 RTA_PAYLOAD(tb
[TCA_RED_STAB
-1]) < 256)
389 ctl
= RTA_DATA(tb
[TCA_RED_PARMS
-1]);
392 q
->flags
= ctl
->flags
;
395 q
->Rmask
= ctl
->Plog
< 32 ? ((1<<ctl
->Plog
) - 1) : ~0UL;
396 q
->Scell_log
= ctl
->Scell_log
;
397 q
->Scell_max
= (255<<q
->Scell_log
);
398 q
->qth_min
= ctl
->qth_min
<<ctl
->Wlog
;
399 q
->qth_max
= ctl
->qth_max
<<ctl
->Wlog
;
400 q
->limit
= ctl
->limit
;
401 memcpy(q
->Stab
, RTA_DATA(tb
[TCA_RED_STAB
-1]), 256);
404 if (skb_queue_len(&sch
->q
) == 0)
405 PSCHED_SET_PASTPERFECT(q
->qidlestart
);
406 sch_tree_unlock(sch
);
410 static int red_init(struct Qdisc
* sch
, struct rtattr
*opt
)
416 if ((err
= red_change(sch
, opt
)) != 0) {
423 #ifdef CONFIG_RTNETLINK
424 int red_copy_xstats(struct sk_buff
*skb
, struct tc_red_xstats
*st
)
426 RTA_PUT(skb
, TCA_XSTATS
, sizeof(*st
), st
);
433 static int red_dump(struct Qdisc
*sch
, struct sk_buff
*skb
)
435 struct red_sched_data
*q
= (struct red_sched_data
*)sch
->data
;
436 unsigned char *b
= skb
->tail
;
438 struct tc_red_qopt opt
;
440 rta
= (struct rtattr
*)b
;
441 RTA_PUT(skb
, TCA_OPTIONS
, 0, NULL
);
442 opt
.limit
= q
->limit
;
443 opt
.qth_min
= q
->qth_min
>>q
->Wlog
;
444 opt
.qth_max
= q
->qth_max
>>q
->Wlog
;
447 opt
.Scell_log
= q
->Scell_log
;
448 opt
.flags
= q
->flags
;
449 RTA_PUT(skb
, TCA_RED_PARMS
, sizeof(opt
), &opt
);
450 rta
->rta_len
= skb
->tail
- b
;
452 if (red_copy_xstats(skb
, &q
->st
))
458 skb_trim(skb
, b
- skb
->data
);
463 static void red_destroy(struct Qdisc
*sch
)
468 struct Qdisc_ops red_qdisc_ops
=
473 sizeof(struct red_sched_data
),
485 #ifdef CONFIG_RTNETLINK
492 int init_module(void)
494 return register_qdisc(&red_qdisc_ops
);
497 void cleanup_module(void)
499 unregister_qdisc(&red_qdisc_ops
);