- Linus: drop support for old-style Makefiles entirely. Big.
[davej-history.git] / net / sched / sch_red.c
blobae948ba9219bfa1757b2aeba00cc7b26a5cc70aa
1 /*
2 * net/sched/sch_red.c Random Early Detection queue.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
11 * Changes:
12 * J Hadi Salim <hadi@nortel.com> 980914: computation fixes
13 * Alexey Makarenko <makar@phoenix.kharkov.ua> 990814: qave on idle link was calculated incorrectly.
14 * J Hadi Salim <hadi@nortelnetworks.com> 980816: ECN support
17 #include <linux/config.h>
18 #include <linux/module.h>
19 #include <asm/uaccess.h>
20 #include <asm/system.h>
21 #include <asm/bitops.h>
22 #include <linux/types.h>
23 #include <linux/kernel.h>
24 #include <linux/sched.h>
25 #include <linux/string.h>
26 #include <linux/mm.h>
27 #include <linux/socket.h>
28 #include <linux/sockios.h>
29 #include <linux/in.h>
30 #include <linux/errno.h>
31 #include <linux/interrupt.h>
32 #include <linux/if_ether.h>
33 #include <linux/inet.h>
34 #include <linux/netdevice.h>
35 #include <linux/etherdevice.h>
36 #include <linux/notifier.h>
37 #include <net/ip.h>
38 #include <net/route.h>
39 #include <linux/skbuff.h>
40 #include <net/sock.h>
41 #include <net/pkt_sched.h>
42 #include <net/inet_ecn.h>
/* ECN codepoints in the low bits of the IPv4 ToS byte (RFC 2481 style;
 * for IPv6 the same two bits sit at bits 20-21 of the first header word). */
#define RED_ECN_ECT  0x02	/* ECN-Capable Transport */
#define RED_ECN_CE   0x01	/* Congestion Experienced */
48 /* Random Early Detection (RED) algorithm.
49 =======================================
51 Source: Sally Floyd and Van Jacobson, "Random Early Detection Gateways
52 for Congestion Avoidance", 1993, IEEE/ACM Transactions on Networking.
54 This file codes a "divisionless" version of RED algorithm
55 as written down in Fig.17 of the paper.
57 Short description.
58 ------------------
60 When a new packet arrives we calculate the average queue length:
62 avg = (1-W)*avg + W*current_queue_len,
   W is the filter time constant (chosen as 2^(-Wlog)); it controls
65 the inertia of the algorithm. To allow larger bursts, W should be
66 decreased.
68 if (avg > th_max) -> packet marked (dropped).
69 if (avg < th_min) -> packet passes.
70 if (th_min < avg < th_max) we calculate probability:
72 Pb = max_P * (avg - th_min)/(th_max-th_min)
74 and mark (drop) packet with this probability.
75 Pb changes from 0 (at avg==th_min) to max_P (avg==th_max).
76 max_P should be small (not 1), usually 0.01..0.02 is good value.
   max_P is chosen as a number, so that max_P/(th_max-th_min)
   is a negative power of two, so that the arithmetic involves
   only shifts.
83 Parameters, settable by user:
84 -----------------------------
86 limit - bytes (must be > qth_max + burst)
88 Hard limit on queue length, should be chosen >qth_max
89 to allow packet bursts. This parameter does not
90 affect the algorithms behaviour and can be chosen
91 arbitrarily high (well, less than ram size)
92 Really, this limit will never be reached
93 if RED works correctly.
95 qth_min - bytes (should be < qth_max/2)
   qth_max - bytes (should be at least 2*qth_min and less than limit)
97 Wlog - bits (<32) log(1/W).
98 Plog - bits (<32)
100 Plog is related to max_P by formula:
102 max_P = (qth_max-qth_min)/2^Plog;
104 F.e. if qth_max=128K and qth_min=32K, then Plog=22
105 corresponds to max_P=0.02
107 Scell_log
108 Stab
   Lookup table for log((1-W)^(t/t_ave)).
113 NOTES:
115 Upper bound on W.
116 -----------------
118 If you want to allow bursts of L packets of size S,
119 you should choose W:
121 L + 1 - th_min/S < (1-(1-W)^L)/W
123 th_min/S = 32 th_min/S = 4
125 log(W) L
126 -1 33
127 -2 35
128 -3 39
129 -4 46
130 -5 57
131 -6 75
132 -7 101
133 -8 135
134 -9 190
135 etc.
struct red_sched_data
{
/* Parameters */
	u32		limit;		/* HARD maximal queue length	*/
	u32		qth_min;	/* Min average length threshold: A scaled */
	u32		qth_max;	/* Max average length threshold: A scaled */
	u32		Rmask;		/* Mask for qR: (2^Plog)-1, see red_change() */
	u32		Scell_max;	/* Clamp on measured idle time: 255<<Scell_log */
	unsigned char	flags;		/* TC_RED_* flags (TC_RED_ECN enables marking) */
	char		Wlog;		/* log(W) */
	char		Plog;		/* random number bits */
	char		Scell_log;	/* shift applied to idle time before Stab lookup */
	u8		Stab[256];	/* idle-decay table: right-shift counts for qave */

/* Variables */
	unsigned long	qave;		/* Average queue length: A scaled (fixed point at Wlog) */
	int		qcount;		/* Packets since last random number generation */
	u32		qR;		/* Cached random number */

	psched_time_t	qidlestart;	/* Start of idle period */
	struct tc_red_xstats st;	/* extended stats, exported as TCA_XSTATS */
};
161 static int red_ecn_mark(struct sk_buff *skb)
163 if (skb->nh.raw + 20 > skb->tail)
164 return 0;
166 switch (skb->protocol) {
167 case __constant_htons(ETH_P_IP):
169 u8 tos = skb->nh.iph->tos;
171 if (!(tos & RED_ECN_ECT))
172 return 0;
174 if (!(tos & RED_ECN_CE))
175 IP_ECN_set_ce(skb->nh.iph);
177 return 1;
180 case __constant_htons(ETH_P_IPV6):
182 u32 label = *(u32*)skb->nh.raw;
184 if (!(label & __constant_htonl(RED_ECN_ECT<<20)))
185 return 0;
186 label |= __constant_htonl(RED_ECN_CE<<20);
187 return 1;
190 default:
191 return 0;
/* Enqueue one packet, applying the RED algorithm described in the
 * comment at the top of this file.  Returns NET_XMIT_SUCCESS on
 * acceptance, NET_XMIT_DROP for a hard-limit (pdrop) drop, or
 * NET_XMIT_CN for an early congestion drop.
 */
static int
red_enqueue(struct sk_buff *skb, struct Qdisc* sch)
{
	struct red_sched_data *q = (struct red_sched_data *)sch->data;
	psched_time_t now;

	if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) {
		/* Link was idle: age qave by the idle time before
		   accounting for this packet. */
		long us_idle;
		int shift;

		PSCHED_GET_TIME(now);
		us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max, 0);
		PSCHED_SET_PASTPERFECT(q->qidlestart);

/*
   The problem: ideally, average length queue recalculation should
   be done over constant clock intervals. This is too expensive, so that
   the calculation is driven by outgoing packets.
   When the queue is idle we have to model this clock by hand.

   SF+VJ proposed to "generate" m = idletime/(average_pkt_size/bandwidth)
   dummy packets as a burst after idle time, i.e.

          q->qave *= (1-W)^m

   This is an apparently overcomplicated solution (f.e. we have to precompute
   a table to make this calculation in reasonable time)
   I believe that a simpler model may be used here,
   but it is field for experiments.
*/
		shift = q->Stab[us_idle>>q->Scell_log];

		if (shift) {
			q->qave >>= shift;
		} else {
			/* Approximate initial part of exponent
			   with linear function:
			   (1-W)^m ~= 1-mW + ...

			   Seems, it is the best solution to
			   problem of too coarse exponent tabulation.
			 */

			us_idle = (q->qave * us_idle)>>q->Scell_log;
			if (us_idle < q->qave/2)
				q->qave -= us_idle;
			else
				q->qave >>= 1;
		}
	} else {
		q->qave += sch->stats.backlog - (q->qave >> q->Wlog);
		/* NOTE:
		   q->qave is fixed point number with point at Wlog.
		   The formula above is equivalent to the floating point
		   version:

		   qave = qave*(1-W) + sch->stats.backlog*W;
		                                           --ANK (980924)
		 */
	}

	if (q->qave < q->qth_min) {
		/* Below min threshold: always accept (subject to the hard
		   byte limit) and restart the inter-mark packet count. */
		q->qcount = -1;
enqueue:
		if (sch->stats.backlog <= q->limit) {
			__skb_queue_tail(&sch->q, skb);
			sch->stats.backlog += skb->len;
			sch->stats.bytes += skb->len;
			sch->stats.packets++;
			return NET_XMIT_SUCCESS;
		} else {
			q->st.pdrop++;
		}
		kfree_skb(skb);
		sch->stats.drops++;
		return NET_XMIT_DROP;
	}
	if (q->qave >= q->qth_max) {
		/* Above max threshold: mark (or drop) unconditionally. */
		q->qcount = -1;
		sch->stats.overlimits++;
mark:
		if (!(q->flags&TC_RED_ECN) || !red_ecn_mark(skb)) {
			q->st.early++;
			goto drop;
		}
		q->st.marked++;
		goto enqueue;
	}

	if (++q->qcount) {
		/* The formula used below causes questions.

		   OK. qR is random number in the interval 0..Rmask
		   i.e. 0..(2^Plog). If we used floating point
		   arithmetics, it would be: (2^Plog)*rnd_num,
		   where rnd_num is less 1.

		   Taking into account, that qave have fixed
		   point at Wlog, and Plog is related to max_P by
		   max_P = (qth_max-qth_min)/2^Plog; two lines
		   below have the following floating point equivalent:

		   max_P*(qave - qth_min)/(qth_max-qth_min) < rnd/qcount

		   Any questions? --ANK (980924)
		 */
		if (((q->qave - q->qth_min)>>q->Wlog)*q->qcount < q->qR)
			goto enqueue;
		q->qcount = 0;
		q->qR = net_random()&q->Rmask;
		sch->stats.overlimits++;
		goto mark;
	}
	q->qR = net_random()&q->Rmask;
	goto enqueue;

drop:
	kfree_skb(skb);
	sch->stats.drops++;
	return NET_XMIT_CN;
}
318 static int
319 red_requeue(struct sk_buff *skb, struct Qdisc* sch)
321 struct red_sched_data *q = (struct red_sched_data *)sch->data;
323 PSCHED_SET_PASTPERFECT(q->qidlestart);
325 __skb_queue_head(&sch->q, skb);
326 sch->stats.backlog += skb->len;
327 return 0;
330 static struct sk_buff *
331 red_dequeue(struct Qdisc* sch)
333 struct sk_buff *skb;
334 struct red_sched_data *q = (struct red_sched_data *)sch->data;
336 skb = __skb_dequeue(&sch->q);
337 if (skb) {
338 sch->stats.backlog -= skb->len;
339 return skb;
341 PSCHED_GET_TIME(q->qidlestart);
342 return NULL;
345 static int
346 red_drop(struct Qdisc* sch)
348 struct sk_buff *skb;
349 struct red_sched_data *q = (struct red_sched_data *)sch->data;
351 skb = __skb_dequeue_tail(&sch->q);
352 if (skb) {
353 sch->stats.backlog -= skb->len;
354 sch->stats.drops++;
355 q->st.other++;
356 kfree_skb(skb);
357 return 1;
359 PSCHED_GET_TIME(q->qidlestart);
360 return 0;
363 static void red_reset(struct Qdisc* sch)
365 struct red_sched_data *q = (struct red_sched_data *)sch->data;
366 struct sk_buff *skb;
368 while((skb=__skb_dequeue(&sch->q))!=NULL)
369 kfree_skb(skb);
370 sch->stats.backlog = 0;
371 PSCHED_SET_PASTPERFECT(q->qidlestart);
372 q->qave = 0;
373 q->qcount = -1;
/* Parse and apply RED parameters from a netlink TCA_OPTIONS attribute.
 * Expects nested TCA_RED_PARMS (struct tc_red_qopt) and TCA_RED_STAB
 * (256-byte idle-decay table) attributes.  Used both for initial setup
 * and runtime reconfiguration; the qdisc tree lock is held while the
 * parameters are swapped in.  Returns 0 or -EINVAL.
 */
static int red_change(struct Qdisc *sch, struct rtattr *opt)
{
	struct red_sched_data *q = (struct red_sched_data *)sch->data;
	struct rtattr *tb[TCA_RED_STAB];
	struct tc_red_qopt *ctl;

	if (opt == NULL ||
	    rtattr_parse(tb, TCA_RED_STAB, RTA_DATA(opt), RTA_PAYLOAD(opt)) ||
	    tb[TCA_RED_PARMS-1] == 0 || tb[TCA_RED_STAB-1] == 0 ||
	    RTA_PAYLOAD(tb[TCA_RED_PARMS-1]) < sizeof(*ctl) ||
	    RTA_PAYLOAD(tb[TCA_RED_STAB-1]) < 256)
		return -EINVAL;

	ctl = RTA_DATA(tb[TCA_RED_PARMS-1]);

	sch_tree_lock(sch);
	q->flags = ctl->flags;
	q->Wlog = ctl->Wlog;
	q->Plog = ctl->Plog;
	/* Mask giving a random number in 0..2^Plog-1 (all ones for Plog>=32). */
	q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL;
	q->Scell_log = ctl->Scell_log;
	q->Scell_max = (255<<q->Scell_log);
	/* Thresholds are stored scaled by 2^Wlog to match qave's fixed point. */
	q->qth_min = ctl->qth_min<<ctl->Wlog;
	q->qth_max = ctl->qth_max<<ctl->Wlog;
	q->limit = ctl->limit;
	memcpy(q->Stab, RTA_DATA(tb[TCA_RED_STAB-1]), 256);

	q->qcount = -1;
	if (skb_queue_len(&sch->q) == 0)
		PSCHED_SET_PASTPERFECT(q->qidlestart);
	sch_tree_unlock(sch);
	return 0;
}
410 static int red_init(struct Qdisc* sch, struct rtattr *opt)
412 int err;
414 MOD_INC_USE_COUNT;
416 if ((err = red_change(sch, opt)) != 0) {
417 MOD_DEC_USE_COUNT;
419 return err;
423 #ifdef CONFIG_RTNETLINK
/* Append the RED extended statistics to a netlink message as a
 * TCA_XSTATS attribute.  Returns 0 on success, 1 if the skb ran out of
 * tail room (RTA_PUT jumps to rtattr_failure in that case).
 */
int red_copy_xstats(struct sk_buff *skb, struct tc_red_xstats *st)
{
	RTA_PUT(skb, TCA_XSTATS, sizeof(*st), st);
	return 0;

rtattr_failure:
	return 1;
}
/* Dump the current configuration and extended stats into a netlink
 * message.  Returns the new message length, or -1 after trimming any
 * partially-written attributes if the skb ran out of room.
 */
static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct red_sched_data *q = (struct red_sched_data *)sch->data;
	unsigned char *b = skb->tail;	/* start mark for rta_len / trim */
	struct rtattr *rta;
	struct tc_red_qopt opt;

	rta = (struct rtattr*)b;
	RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
	/* Report thresholds unscaled (they are stored <<Wlog internally). */
	opt.limit = q->limit;
	opt.qth_min = q->qth_min>>q->Wlog;
	opt.qth_max = q->qth_max>>q->Wlog;
	opt.Wlog = q->Wlog;
	opt.Plog = q->Plog;
	opt.Scell_log = q->Scell_log;
	opt.flags = q->flags;
	RTA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt);
	rta->rta_len = skb->tail - b;	/* fix up nested attribute length */

	if (red_copy_xstats(skb, &q->st))
		goto rtattr_failure;

	return skb->len;

rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}
461 #endif
/* Qdisc teardown: release the module reference taken in red_init(). */
static void red_destroy(struct Qdisc *sch)
{
	MOD_DEC_USE_COUNT;
}
/* Qdisc operations table.  NOTE: positional initializer -- the entry
 * order must match the field order of struct Qdisc_ops exactly. */
struct Qdisc_ops red_qdisc_ops =
{
	NULL,				/* next (list link) -- TODO confirm field name */
	NULL,				/* class ops: none, RED is classless -- TODO confirm */
	"red",				/* qdisc identifier */
	sizeof(struct red_sched_data),	/* private data size */

	red_enqueue,
	red_dequeue,
	red_requeue,
	red_drop,

	red_init,
	red_reset,
	red_destroy,
	red_change,

#ifdef CONFIG_RTNETLINK
	red_dump,
#endif
};
#ifdef MODULE
/* Register the "red" qdisc with the packet scheduler core on module
 * load; remove it again on unload. */
int init_module(void)
{
	return register_qdisc(&red_qdisc_ops);
}

void cleanup_module(void)
{
	unregister_qdisc(&red_qdisc_ops);
}
#endif