/*
 * net/sched/sch_red.c	Random Early Detection queue.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Changes:
 * J Hadi Salim <hadi@nortel.com> 980914:	computation fixes
 * Alexey Makarenko <makar@phoenix.kharkov.ua> 990814: qave on idle link was calculated incorrectly.
 * J Hadi Salim <hadi@nortelnetworks.com> 980816:  ECN support
 */
#include <linux/config.h>
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/notifier.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>
44 #define RED_ECN_ECT 0x02
45 #define RED_ECN_CE 0x01
/*	Random Early Detection (RED) algorithm.
	=======================================

	Source: Sally Floyd and Van Jacobson, "Random Early Detection Gateways
	for Congestion Avoidance", 1993, IEEE/ACM Transactions on Networking.

	This file codes a "divisionless" version of RED algorithm
	as written down in Fig.17 of the paper.

Short description.
------------------

	When a new packet arrives we calculate the average queue length:

	avg = (1-W)*avg + W*current_queue_len,

	W is the filter time constant (chosen as 2^(-Wlog)), it controls
	the inertia of the algorithm. To allow larger bursts, W should be
	decreased.

	if (avg > th_max) -> packet marked (dropped).
	if (avg < th_min) -> packet passes.
	if (th_min < avg < th_max) we calculate probability:

	Pb = max_P * (avg - th_min)/(th_max-th_min)

	and mark (drop) packet with this probability.
	Pb changes from 0 (at avg==th_min) to max_P (avg==th_max).
	max_P should be small (not 1), usually 0.01..0.02 is good value.

	max_P is chosen as a number, so that max_P/(th_max-th_min)
	is a negative power of two in order arithmetics to contain
	only shifts.

	Parameters, settable by user:
	-----------------------------

	limit		- bytes (must be > qth_max + burst)

	Hard limit on queue length, should be chosen >qth_max
	to allow packet bursts. This parameter does not
	affect the algorithms behaviour and can be chosen
	arbitrarily high (well, less than ram size)
	Really, this limit will never be reached
	if RED works correctly.

	qth_min		- bytes (should be < qth_max/2)
	qth_max		- bytes (should be at least 2*qth_min and less limit)
	Wlog		- bits (<32) log(1/W).
	Plog		- bits (<32)

	Plog is related to max_P by formula:

	max_P = (qth_max-qth_min)/2^Plog;

	F.e. if qth_max=128K and qth_min=32K, then Plog=22
	corresponds to max_P=0.02

	Scell_log
	Stab

	Lookup table for log((1-W)^(t/t_ave)), used to age the average
	queue length after an idle period.

	If you want to allow bursts of L packets of size S,
	choose Wlog so that

	L + 1 - th_min/S < (1-(1-W)^L)/W

	(sample points: th_min/S = 32, th_min/S = 4)
 */
138 struct red_sched_data
141 u32 limit
; /* HARD maximal queue length */
142 u32 qth_min
; /* Min average length threshold: A scaled */
143 u32 qth_max
; /* Max average length threshold: A scaled */
147 char Wlog
; /* log(W) */
148 char Plog
; /* random number bits */
153 unsigned long qave
; /* Average queue length: A scaled */
154 int qcount
; /* Packets since last random number generation */
155 u32 qR
; /* Cached random number */
157 psched_time_t qidlestart
; /* Start of idle period */
158 struct tc_red_xstats st
;
161 static int red_ecn_mark(struct sk_buff
*skb
)
163 if (skb
->nh
.raw
+ 20 > skb
->tail
)
166 switch (skb
->protocol
) {
167 case __constant_htons(ETH_P_IP
):
169 u8 tos
= skb
->nh
.iph
->tos
;
171 if (!(tos
& RED_ECN_ECT
))
174 if (!(tos
& RED_ECN_CE
))
175 IP_ECN_set_ce(skb
->nh
.iph
);
180 case __constant_htons(ETH_P_IPV6
):
182 u32 label
= *(u32
*)skb
->nh
.raw
;
184 if (!(label
& __constant_htonl(RED_ECN_ECT
<<20)))
186 label
|= __constant_htonl(RED_ECN_CE
<<20);
196 red_enqueue(struct sk_buff
*skb
, struct Qdisc
* sch
)
198 struct red_sched_data
*q
= (struct red_sched_data
*)sch
->data
;
202 if (!PSCHED_IS_PASTPERFECT(q
->qidlestart
)) {
206 PSCHED_GET_TIME(now
);
207 us_idle
= PSCHED_TDIFF_SAFE(now
, q
->qidlestart
, q
->Scell_max
, 0);
208 PSCHED_SET_PASTPERFECT(q
->qidlestart
);
211 The problem: ideally, average length queue recalcultion should
212 be done over constant clock intervals. This is too expensive, so that
213 the calculation is driven by outgoing packets.
214 When the queue is idle we have to model this clock by hand.
216 SF+VJ proposed to "generate" m = idletime/(average_pkt_size/bandwidth)
217 dummy packets as a burst after idle time, i.e.
221 This is an apparently overcomplicated solution (f.e. we have to precompute
222 a table to make this calculation in reasonable time)
223 I believe that a simpler model may be used here,
224 but it is field for experiments.
226 shift
= q
->Stab
[us_idle
>>q
->Scell_log
];
231 /* Approximate initial part of exponent
232 with linear function:
233 (1-W)^m ~= 1-mW + ...
235 Seems, it is the best solution to
236 problem of too coarce exponent tabulation.
239 us_idle
= (q
->qave
* us_idle
)>>q
->Scell_log
;
240 if (us_idle
< q
->qave
/2)
246 q
->qave
+= sch
->stats
.backlog
- (q
->qave
>> q
->Wlog
);
248 q->qave is fixed point number with point at Wlog.
249 The formulae above is equvalent to floating point
252 qave = qave*(1-W) + sch->stats.backlog*W;
257 if (q
->qave
< q
->qth_min
) {
260 if (sch
->stats
.backlog
<= q
->limit
) {
261 __skb_queue_tail(&sch
->q
, skb
);
262 sch
->stats
.backlog
+= skb
->len
;
263 sch
->stats
.bytes
+= skb
->len
;
264 sch
->stats
.packets
++;
265 return NET_XMIT_SUCCESS
;
271 return NET_XMIT_DROP
;
273 if (q
->qave
>= q
->qth_max
) {
275 sch
->stats
.overlimits
++;
277 if (!(q
->flags
&TC_RED_ECN
) || !red_ecn_mark(skb
)) {
286 /* The formula used below causes questions.
288 OK. qR is random number in the interval 0..Rmask
289 i.e. 0..(2^Plog). If we used floating point
290 arithmetics, it would be: (2^Plog)*rnd_num,
291 where rnd_num is less 1.
293 Taking into account, that qave have fixed
294 point at Wlog, and Plog is related to max_P by
295 max_P = (qth_max-qth_min)/2^Plog; two lines
296 below have the following floating point equivalent:
298 max_P*(qave - qth_min)/(qth_max-qth_min) < rnd/qcount
300 Any questions? --ANK (980924)
302 if (((q
->qave
- q
->qth_min
)>>q
->Wlog
)*q
->qcount
< q
->qR
)
305 q
->qR
= net_random()&q
->Rmask
;
306 sch
->stats
.overlimits
++;
309 q
->qR
= net_random()&q
->Rmask
;
319 red_requeue(struct sk_buff
*skb
, struct Qdisc
* sch
)
321 struct red_sched_data
*q
= (struct red_sched_data
*)sch
->data
;
323 PSCHED_SET_PASTPERFECT(q
->qidlestart
);
325 __skb_queue_head(&sch
->q
, skb
);
326 sch
->stats
.backlog
+= skb
->len
;
330 static struct sk_buff
*
331 red_dequeue(struct Qdisc
* sch
)
334 struct red_sched_data
*q
= (struct red_sched_data
*)sch
->data
;
336 skb
= __skb_dequeue(&sch
->q
);
338 sch
->stats
.backlog
-= skb
->len
;
341 PSCHED_GET_TIME(q
->qidlestart
);
345 static unsigned int red_drop(struct Qdisc
* sch
)
348 struct red_sched_data
*q
= (struct red_sched_data
*)sch
->data
;
350 skb
= __skb_dequeue_tail(&sch
->q
);
352 unsigned int len
= skb
->len
;
353 sch
->stats
.backlog
-= len
;
359 PSCHED_GET_TIME(q
->qidlestart
);
363 static void red_reset(struct Qdisc
* sch
)
365 struct red_sched_data
*q
= (struct red_sched_data
*)sch
->data
;
367 __skb_queue_purge(&sch
->q
);
368 sch
->stats
.backlog
= 0;
369 PSCHED_SET_PASTPERFECT(q
->qidlestart
);
374 static int red_change(struct Qdisc
*sch
, struct rtattr
*opt
)
376 struct red_sched_data
*q
= (struct red_sched_data
*)sch
->data
;
377 struct rtattr
*tb
[TCA_RED_STAB
];
378 struct tc_red_qopt
*ctl
;
381 rtattr_parse(tb
, TCA_RED_STAB
, RTA_DATA(opt
), RTA_PAYLOAD(opt
)) ||
382 tb
[TCA_RED_PARMS
-1] == 0 || tb
[TCA_RED_STAB
-1] == 0 ||
383 RTA_PAYLOAD(tb
[TCA_RED_PARMS
-1]) < sizeof(*ctl
) ||
384 RTA_PAYLOAD(tb
[TCA_RED_STAB
-1]) < 256)
387 ctl
= RTA_DATA(tb
[TCA_RED_PARMS
-1]);
390 q
->flags
= ctl
->flags
;
393 q
->Rmask
= ctl
->Plog
< 32 ? ((1<<ctl
->Plog
) - 1) : ~0UL;
394 q
->Scell_log
= ctl
->Scell_log
;
395 q
->Scell_max
= (255<<q
->Scell_log
);
396 q
->qth_min
= ctl
->qth_min
<<ctl
->Wlog
;
397 q
->qth_max
= ctl
->qth_max
<<ctl
->Wlog
;
398 q
->limit
= ctl
->limit
;
399 memcpy(q
->Stab
, RTA_DATA(tb
[TCA_RED_STAB
-1]), 256);
402 if (skb_queue_len(&sch
->q
) == 0)
403 PSCHED_SET_PASTPERFECT(q
->qidlestart
);
404 sch_tree_unlock(sch
);
408 static int red_init(struct Qdisc
* sch
, struct rtattr
*opt
)
410 return red_change(sch
, opt
);
414 int red_copy_xstats(struct sk_buff
*skb
, struct tc_red_xstats
*st
)
416 RTA_PUT(skb
, TCA_XSTATS
, sizeof(*st
), st
);
423 static int red_dump(struct Qdisc
*sch
, struct sk_buff
*skb
)
425 struct red_sched_data
*q
= (struct red_sched_data
*)sch
->data
;
426 unsigned char *b
= skb
->tail
;
428 struct tc_red_qopt opt
;
430 rta
= (struct rtattr
*)b
;
431 RTA_PUT(skb
, TCA_OPTIONS
, 0, NULL
);
432 opt
.limit
= q
->limit
;
433 opt
.qth_min
= q
->qth_min
>>q
->Wlog
;
434 opt
.qth_max
= q
->qth_max
>>q
->Wlog
;
437 opt
.Scell_log
= q
->Scell_log
;
438 opt
.flags
= q
->flags
;
439 RTA_PUT(skb
, TCA_RED_PARMS
, sizeof(opt
), &opt
);
440 rta
->rta_len
= skb
->tail
- b
;
442 if (red_copy_xstats(skb
, &q
->st
))
448 skb_trim(skb
, b
- skb
->data
);
452 static void red_destroy(struct Qdisc
*sch
)
456 struct Qdisc_ops red_qdisc_ops
= {
460 .priv_size
= sizeof(struct red_sched_data
),
461 .enqueue
= red_enqueue
,
462 .dequeue
= red_dequeue
,
463 .requeue
= red_requeue
,
467 .destroy
= red_destroy
,
468 .change
= red_change
,
470 .owner
= THIS_MODULE
,
475 int init_module(void)
477 return register_qdisc(&red_qdisc_ops
);
480 void cleanup_module(void)
482 unregister_qdisc(&red_qdisc_ops
);
485 MODULE_LICENSE("GPL");