1 /*
2 * net/sched/sch_cbq.c Class-Based Queueing discipline.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 #include <linux/config.h>
14 #include <linux/module.h>
15 #include <asm/uaccess.h>
16 #include <asm/system.h>
17 #include <asm/bitops.h>
18 #include <linux/types.h>
19 #include <linux/kernel.h>
20 #include <linux/sched.h>
21 #include <linux/string.h>
22 #include <linux/mm.h>
23 #include <linux/socket.h>
24 #include <linux/sockios.h>
25 #include <linux/in.h>
26 #include <linux/errno.h>
27 #include <linux/interrupt.h>
28 #include <linux/if_ether.h>
29 #include <linux/inet.h>
30 #include <linux/netdevice.h>
31 #include <linux/etherdevice.h>
32 #include <linux/notifier.h>
33 #include <net/ip.h>
34 #include <net/route.h>
35 #include <linux/skbuff.h>
36 #include <net/sock.h>
37 #include <net/pkt_sched.h>
40 /* Class-Based Queueing (CBQ) algorithm.
41 =======================================
43 Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
44 Management Models for Packet Networks",
45 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
47 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
49 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
50 Parameters", 1996
52 [4] Sally Floyd and Michael Speer, "Experimental Results
53 for Class-Based Queueing", 1998, not published.
55 -----------------------------------------------------------------------
57 Algorithm skeleton was taken from NS simulator cbq.cc.
58 If someone wants to check this code against the LBL version,
59 he should take into account that ONLY the skeleton was borrowed,
60 the implementation is different. Particularly:
62 --- The WRR algorithm is different. Our version looks more
63 reasonable (I hope) and works when quanta are allowed to be
64 less than MTU, which is always the case when real time classes
65 have small rates. Note that the statement of [3] is
66 incomplete, delay may actually be estimated even if class
67 per-round allotment is less than MTU. Namely, if per-round
68 allotment is W*r_i, and r_1+...+r_k = r < 1
70 delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
72 In the worst case we have IntServ estimate with D = W*r+k*MTU
73 and C = MTU*r. The proof (if correct at all) is trivial.
76 --- It seems that cbq-2.0 is not very accurate. At least, I cannot
77 interpret some places, which look like wrong translations
78 from NS. Anyone is advised to find these differences
79 and explain to me why I am wrong 8).
81 --- Linux has no EOI event, so that we cannot estimate true class
82 idle time. The workaround is to consider the next dequeue event
83 as a sign that the previous packet is finished. This is wrong because of
84 internal device queueing, but on a permanently loaded link it is true.
85 Moreover, combined with clock integrator, this scheme looks
86 very close to an ideal solution. */
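/* Illustration of the bound above (hypothetical numbers): with MTU = 1500
   bytes, k = 3 classes, a per-round allotment W*r_i = 500 bytes, a total
   W*r = 2000 bytes and a link bandwidth B = 125000 bytes/sec (1 Mbit/s),

   delay_i <= ([1500/500]*2000 + 2000 + 3*1500)/125000
            = (6000 + 2000 + 4500)/125000 ~= 0.1 sec

   so the delay stays bounded even though the per-round allotment is only
   a third of the MTU. */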
88 struct cbq_sched_data;
91 struct cbq_class
93 struct cbq_class *next; /* hash table link */
94 struct cbq_class *next_alive; /* next class with backlog in this priority band */
96 /* Parameters */
97 u32 classid;
98 unsigned char priority; /* class priority */
99 unsigned char priority2; /* priority to be used after overlimit */
100 unsigned char ewma_log; /* time constant for idle time calculation */
101 unsigned char ovl_strategy;
102 #ifdef CONFIG_NET_CLS_POLICE
103 unsigned char police;
104 #endif
106 u32 defmap;
108 /* Link-sharing scheduler parameters */
109 long maxidle; /* Class parameters: see below. */
110 long offtime;
111 long minidle;
112 u32 avpkt;
113 struct qdisc_rate_table *R_tab;
115 /* Overlimit strategy parameters */
116 void (*overlimit)(struct cbq_class *cl);
117 long penalty;
119 /* General scheduler (WRR) parameters */
120 long allot;
121 long quantum; /* Allotment per WRR round */
122 long weight; /* Relative allotment: see below */
124 struct Qdisc *qdisc; /* Ptr to CBQ discipline */
125 struct cbq_class *split; /* Ptr to split node */
126 struct cbq_class *share; /* Ptr to LS parent in the class tree */
127 struct cbq_class *tparent; /* Ptr to tree parent in the class tree */
128 struct cbq_class *borrow; /* NULL if class is bandwidth limited;
129 parent otherwise */
130 struct cbq_class *sibling; /* Sibling chain */
131 struct cbq_class *children; /* Pointer to children chain */
133 struct Qdisc *q; /* Elementary queueing discipline */
136 /* Variables */
137 unsigned char cpriority; /* Effective priority */
138 unsigned char delayed;
139 unsigned char level; /* level of the class in hierarchy:
140 0 for leaf classes, and maximal
141 level of children + 1 for nodes.
144 psched_time_t last; /* Last end of service */
145 psched_time_t undertime;
146 long avgidle;
147 long deficit; /* Saved deficit for WRR */
148 unsigned long penalized;
149 struct tc_stats stats;
150 struct tc_cbq_xstats xstats;
152 struct tcf_proto *filter_list;
154 int refcnt;
155 int filters;
157 struct cbq_class *defaults[TC_PRIO_MAX+1];
160 struct cbq_sched_data
162 struct cbq_class *classes[16]; /* Hash table of all classes */
163 int nclasses[TC_CBQ_MAXPRIO+1];
164 unsigned quanta[TC_CBQ_MAXPRIO+1];
166 struct cbq_class link;
168 unsigned activemask;
169 struct cbq_class *active[TC_CBQ_MAXPRIO+1]; /* List of all classes
170 with backlog */
172 #ifdef CONFIG_NET_CLS_POLICE
173 struct cbq_class *rx_class;
174 #endif
175 struct cbq_class *tx_class;
176 struct cbq_class *tx_borrowed;
177 int tx_len;
178 psched_time_t now; /* Cached timestamp */
179 psched_time_t now_rt; /* Cached real time */
180 unsigned pmask;
182 struct timer_list delay_timer;
183 struct timer_list wd_timer; /* Watchdog timer,
184 started when CBQ has
185 backlog, but cannot
186 transmit just now */
187 long wd_expires;
188 int toplevel;
189 u32 hgenerator;
193 #define L2T(cl,len) qdisc_l2t((cl)->R_tab,len)
196 static __inline__ unsigned cbq_hash(u32 h)
198 h ^= h>>8;
199 h ^= h>>4;
200 return h&0xF;
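/* Note: the returned bucket is the XOR of the four nibbles of the low
   16 bits of the class ID, so classes are spread over the 16 buckets by
   their minor ID. Hypothetical example: classid 0x00010002 hashes to
   bucket 2. */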
203 static __inline__ struct cbq_class *
204 cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
206 struct cbq_class *cl;
208 for (cl = q->classes[cbq_hash(classid)]; cl; cl = cl->next)
209 if (cl->classid == classid)
210 return cl;
211 return NULL;
214 #ifdef CONFIG_NET_CLS_POLICE
216 static struct cbq_class *
217 cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
219 struct cbq_class *cl, *new;
221 for (cl = this->tparent; cl; cl = cl->tparent)
222 if ((new = cl->defaults[TC_PRIO_BESTEFFORT]) != NULL && new != this)
223 return new;
225 return NULL;
228 #endif
230 /* Classify packet. The procedure is pretty complicated, but
231 it allows us to combine link sharing and priority scheduling
232 transparently.
234 Namely, you can put link sharing rules (e.g. route-based) at the root of CBQ,
235 so that it resolves to split nodes. Then packets are classified
236 by logical priority, or a more specific classifier may be attached
237 to the split node.
240 static struct cbq_class *
241 cbq_classify(struct sk_buff *skb, struct Qdisc *sch)
243 struct cbq_sched_data *q = (struct cbq_sched_data*)sch->data;
244 struct cbq_class *head = &q->link;
245 struct cbq_class **defmap;
246 struct cbq_class *cl = NULL;
247 u32 prio = skb->priority;
248 struct tcf_result res;
251 * Step 1. If skb->priority points to one of our classes, use it.
253 if (TC_H_MAJ(prio^sch->handle) == 0 &&
254 (cl = cbq_class_lookup(q, prio)) != NULL)
255 return cl;
257 for (;;) {
258 int result = 0;
260 defmap = head->defaults;
263 * Step 2+n. Apply classifier.
265 if (!head->filter_list || (result = tc_classify(skb, head->filter_list, &res)) < 0)
266 goto fallback;
268 if ((cl = (void*)res.class) == NULL) {
269 if (TC_H_MAJ(res.classid))
270 cl = cbq_class_lookup(q, res.classid);
271 else if ((cl = defmap[res.classid&TC_PRIO_MAX]) == NULL)
272 cl = defmap[TC_PRIO_BESTEFFORT];
274 if (cl == NULL || cl->level >= head->level)
275 goto fallback;
278 #ifdef CONFIG_NET_CLS_POLICE
279 switch (result) {
280 case TC_POLICE_RECLASSIFY:
281 return cbq_reclassify(skb, cl);
282 case TC_POLICE_SHOT:
283 return NULL;
284 default:
285 break;
287 #endif
288 if (cl->level == 0)
289 return cl;
292 * Step 3+n. If classifier selected a link sharing class,
293 * apply agency-specific classifier.
294 * Repeat this procedure until we hit a leaf node.
296 head = cl;
299 fallback:
300 cl = head;
303 * Step 4. No success...
305 if (TC_H_MAJ(prio) == 0 &&
306 !(cl = head->defaults[prio&TC_PRIO_MAX]) &&
307 !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
308 return head;
310 return cl;
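/* Illustrative walk-through (hypothetical handles): with a CBQ qdisc
   handle of 1:, an skb whose skb->priority equals the classid 1:10 is
   mapped to class 1:10 directly by step 1. Otherwise the filters of the
   root (link) class are consulted; if they return a split (non-leaf)
   class, that class's own filters are applied in turn, and whenever
   nothing matches, the defmap of the current head supplies a default
   class keyed by the packet's logical priority. */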
314 A packet has just been enqueued on the empty class.
315 cbq_activate_class adds it to the tail of active class list
316 of its priority band.
319 static __inline__ void cbq_activate_class(struct cbq_class *cl)
321 struct cbq_sched_data *q = (struct cbq_sched_data*)cl->qdisc->data;
322 int prio = cl->cpriority;
323 struct cbq_class *cl_tail;
325 cl_tail = q->active[prio];
326 q->active[prio] = cl;
328 if (cl_tail != NULL) {
329 cl->next_alive = cl_tail->next_alive;
330 cl_tail->next_alive = cl;
331 } else {
332 cl->next_alive = cl;
333 q->activemask |= (1<<prio);
338 Unlink class from active chain.
339 Note that this same procedure is done directly in cbq_dequeue*
340 during round-robin procedure.
343 static void cbq_deactivate_class(struct cbq_class *this)
345 struct cbq_sched_data *q = (struct cbq_sched_data*)this->qdisc->data;
346 int prio = this->cpriority;
347 struct cbq_class *cl;
348 struct cbq_class *cl_prev = q->active[prio];
350 do {
351 cl = cl_prev->next_alive;
352 if (cl == this) {
353 cl_prev->next_alive = cl->next_alive;
354 cl->next_alive = NULL;
356 if (cl == q->active[prio]) {
357 q->active[prio] = cl_prev;
358 if (cl == q->active[prio]) {
359 q->active[prio] = NULL;
360 q->activemask &= ~(1<<prio);
361 return;
365 cl = cl_prev->next_alive;
366 return;
368 } while ((cl_prev = cl) != q->active[prio]);
371 static void
372 cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
374 int toplevel = q->toplevel;
376 if (toplevel > cl->level && !(cl->q->flags&TCQ_F_THROTTLED)) {
377 psched_time_t now;
378 psched_tdiff_t incr;
380 PSCHED_GET_TIME(now);
381 incr = PSCHED_TDIFF(now, q->now_rt);
382 PSCHED_TADD2(q->now, incr, now);
384 do {
385 if (PSCHED_TLESS(cl->undertime, now)) {
386 q->toplevel = cl->level;
387 return;
389 } while ((cl=cl->borrow) != NULL && toplevel > cl->level);
393 static int
394 cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
396 struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
397 struct cbq_class *cl = cbq_classify(skb, sch);
398 int len = skb->len;
399 int ret = NET_XMIT_POLICED;
401 #ifdef CONFIG_NET_CLS_POLICE
402 q->rx_class = cl;
403 #endif
404 if (cl) {
405 #ifdef CONFIG_NET_CLS_POLICE
406 cl->q->__parent = sch;
407 #endif
408 if ((ret = cl->q->enqueue(skb, cl->q)) == 0) {
409 sch->q.qlen++;
410 sch->stats.packets++;
411 sch->stats.bytes+=len;
412 cbq_mark_toplevel(q, cl);
413 if (!cl->next_alive)
414 cbq_activate_class(cl);
415 return 0;
419 sch->stats.drops++;
420 if (cl == NULL)
421 kfree_skb(skb);
422 else {
423 cbq_mark_toplevel(q, cl);
424 cl->stats.drops++;
426 return ret;
429 static int
430 cbq_requeue(struct sk_buff *skb, struct Qdisc *sch)
432 struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
433 struct cbq_class *cl;
434 int ret;
436 if ((cl = q->tx_class) == NULL) {
437 kfree_skb(skb);
438 sch->stats.drops++;
439 return NET_XMIT_CN;
441 q->tx_class = NULL;
443 cbq_mark_toplevel(q, cl);
445 #ifdef CONFIG_NET_CLS_POLICE
446 q->rx_class = cl;
447 cl->q->__parent = sch;
448 #endif
449 if ((ret = cl->q->ops->requeue(skb, cl->q)) == 0) {
450 sch->q.qlen++;
451 if (!cl->next_alive)
452 cbq_activate_class(cl);
453 return 0;
455 sch->stats.drops++;
456 cl->stats.drops++;
457 return ret;
460 /* Overlimit actions */
462 /* TC_CBQ_OVL_CLASSIC: (default) penalize leaf class by adding offtime */
464 static void cbq_ovl_classic(struct cbq_class *cl)
466 struct cbq_sched_data *q = (struct cbq_sched_data *)cl->qdisc->data;
467 psched_tdiff_t delay = PSCHED_TDIFF(cl->undertime, q->now);
469 if (!cl->delayed) {
470 delay += cl->offtime;
473 The class goes to sleep, so that it will have no
474 chance to work off its avgidle. Let's forgive it 8)
476 BTW cbq-2.0 has a bug in this
477 place: apparently they forgot to shift it by cl->ewma_log.
479 if (cl->avgidle < 0)
480 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
481 if (cl->avgidle < cl->minidle)
482 cl->avgidle = cl->minidle;
483 if (delay <= 0)
484 delay = 1;
485 PSCHED_TADD2(q->now, delay, cl->undertime);
487 cl->xstats.overactions++;
488 cl->delayed = 1;
490 if (q->wd_expires == 0 || q->wd_expires > delay)
491 q->wd_expires = delay;
493 /* Dirty work! We must schedule wakeups based on
494 real available rate, rather than leaf rate,
495 which may be tiny (even zero).
497 if (q->toplevel == TC_CBQ_MAXLEVEL) {
498 struct cbq_class *b;
499 psched_tdiff_t base_delay = q->wd_expires;
501 for (b = cl->borrow; b; b = b->borrow) {
502 delay = PSCHED_TDIFF(b->undertime, q->now);
503 if (delay < base_delay) {
504 if (delay <= 0)
505 delay = 1;
506 base_delay = delay;
510 q->wd_expires = base_delay;
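/* Worked example for the delay computation above (hypothetical values,
   in psched time units): a newly delayed class with
   undertime - q->now = 100, offtime = 1000, avgidle = -3200 and
   ewma_log = 5 gets delay = 100 + 1000 - (3200 - (3200>>5)) = -2000,
   which is clamped to 1, i.e. it is woken almost immediately; with an
   avgidle closer to zero the computed delay stays near offtime. */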
514 /* TC_CBQ_OVL_RCLASSIC: penalize by offtime classes in hierarchy, when
515 they go overlimit
518 static void cbq_ovl_rclassic(struct cbq_class *cl)
520 struct cbq_sched_data *q = (struct cbq_sched_data *)cl->qdisc->data;
521 struct cbq_class *this = cl;
523 do {
524 if (cl->level > q->toplevel) {
525 cl = NULL;
526 break;
528 } while ((cl = cl->borrow) != NULL);
530 if (cl == NULL)
531 cl = this;
532 cbq_ovl_classic(cl);
535 /* TC_CBQ_OVL_DELAY: delay until it will go to underlimit */
537 static void cbq_ovl_delay(struct cbq_class *cl)
539 struct cbq_sched_data *q = (struct cbq_sched_data *)cl->qdisc->data;
540 psched_tdiff_t delay = PSCHED_TDIFF(cl->undertime, q->now);
542 if (!cl->delayed) {
543 unsigned long sched = jiffies;
545 delay += cl->offtime;
546 if (cl->avgidle < 0)
547 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
548 if (cl->avgidle < cl->minidle)
549 cl->avgidle = cl->minidle;
550 PSCHED_TADD2(q->now, delay, cl->undertime);
552 if (delay > 0) {
553 sched += PSCHED_US2JIFFIE(delay) + cl->penalty;
554 cl->penalized = sched;
555 cl->cpriority = TC_CBQ_MAXPRIO;
556 q->pmask |= (1<<TC_CBQ_MAXPRIO);
557 if (del_timer(&q->delay_timer) &&
558 (long)(q->delay_timer.expires - sched) > 0)
559 q->delay_timer.expires = sched;
560 add_timer(&q->delay_timer);
561 cl->delayed = 1;
562 cl->xstats.overactions++;
563 return;
565 delay = 1;
567 if (q->wd_expires == 0 || q->wd_expires > delay)
568 q->wd_expires = delay;
571 /* TC_CBQ_OVL_LOWPRIO: penalize class by lowering its priority band */
573 static void cbq_ovl_lowprio(struct cbq_class *cl)
575 struct cbq_sched_data *q = (struct cbq_sched_data*)cl->qdisc->data;
577 cl->penalized = jiffies + cl->penalty;
579 if (cl->cpriority != cl->priority2) {
580 cl->cpriority = cl->priority2;
581 q->pmask |= (1<<cl->cpriority);
582 cl->xstats.overactions++;
584 cbq_ovl_classic(cl);
587 /* TC_CBQ_OVL_DROP: penalize class by dropping */
589 static void cbq_ovl_drop(struct cbq_class *cl)
591 if (cl->q->ops->drop)
592 if (cl->q->ops->drop(cl->q))
593 cl->qdisc->q.qlen--;
594 cl->xstats.overactions++;
595 cbq_ovl_classic(cl);
598 static void cbq_watchdog(unsigned long arg)
600 struct Qdisc *sch = (struct Qdisc*)arg;
602 sch->flags &= ~TCQ_F_THROTTLED;
603 netif_schedule(sch->dev);
606 static unsigned long cbq_undelay_prio(struct cbq_sched_data *q, int prio)
608 struct cbq_class *cl;
609 struct cbq_class *cl_prev = q->active[prio];
610 unsigned long now = jiffies;
611 unsigned long sched = now;
613 if (cl_prev == NULL)
614 return now;
616 do {
617 cl = cl_prev->next_alive;
618 if ((long)(now - cl->penalized) > 0) {
619 cl_prev->next_alive = cl->next_alive;
620 cl->next_alive = NULL;
621 cl->cpriority = cl->priority;
622 cl->delayed = 0;
623 cbq_activate_class(cl);
625 if (cl == q->active[prio]) {
626 q->active[prio] = cl_prev;
627 if (cl == q->active[prio]) {
628 q->active[prio] = NULL;
629 return 0;
633 cl = cl_prev->next_alive;
634 } else if ((long)(sched - cl->penalized) > 0)
635 sched = cl->penalized;
636 } while ((cl_prev = cl) != q->active[prio]);
638 return (long)(sched - now);
641 static void cbq_undelay(unsigned long arg)
643 struct Qdisc *sch = (struct Qdisc*)arg;
644 struct cbq_sched_data *q = (struct cbq_sched_data*)sch->data;
645 long delay = 0;
646 unsigned pmask;
648 pmask = q->pmask;
649 q->pmask = 0;
651 while (pmask) {
652 int prio = ffz(~pmask);
653 long tmp;
655 pmask &= ~(1<<prio);
657 tmp = cbq_undelay_prio(q, prio);
658 if (tmp > 0) {
659 q->pmask |= 1<<prio;
660 if (tmp < delay || delay == 0)
661 delay = tmp;
665 if (delay) {
666 q->delay_timer.expires = jiffies + delay;
667 add_timer(&q->delay_timer);
670 sch->flags &= ~TCQ_F_THROTTLED;
671 netif_schedule(sch->dev);
675 #ifdef CONFIG_NET_CLS_POLICE
677 static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
679 int len = skb->len;
680 struct Qdisc *sch = child->__parent;
681 struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
682 struct cbq_class *cl = q->rx_class;
684 q->rx_class = NULL;
686 if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) {
688 cbq_mark_toplevel(q, cl);
690 q->rx_class = cl;
691 cl->q->__parent = sch;
693 if (cl->q->enqueue(skb, cl->q) == 0) {
694 sch->q.qlen++;
695 sch->stats.packets++;
696 sch->stats.bytes+=len;
697 if (!cl->next_alive)
698 cbq_activate_class(cl);
699 return 0;
701 sch->stats.drops++;
702 return 0;
705 sch->stats.drops++;
706 return -1;
708 #endif
711 This is a mission-critical procedure.
713 We "regenerate" the toplevel cutoff if the transmitting class
714 has backlog and is not regulated. It is not part of the
715 original CBQ description, but looks more reasonable.
716 It may be wrong; this question needs further investigation.
719 static __inline__ void
720 cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
721 struct cbq_class *borrowed)
723 if (cl && q->toplevel >= borrowed->level) {
724 if (cl->q->q.qlen > 1) {
725 do {
726 if (PSCHED_IS_PASTPERFECT(borrowed->undertime)) {
727 q->toplevel = borrowed->level;
728 return;
730 } while ((borrowed=borrowed->borrow) != NULL);
732 #if 0
733 /* It is not necessary now. Uncommenting it
734 will save CPU cycles, but decrease fairness.
736 q->toplevel = TC_CBQ_MAXLEVEL;
737 #endif
741 static void
742 cbq_update(struct cbq_sched_data *q)
744 struct cbq_class *this = q->tx_class;
745 struct cbq_class *cl = this;
746 int len = q->tx_len;
748 q->tx_class = NULL;
750 for ( ; cl; cl = cl->share) {
751 long avgidle = cl->avgidle;
752 long idle;
754 cl->stats.packets++;
755 cl->stats.bytes += len;
758 (now - last) is total time between packet right edges.
759 (last_pktlen/rate) is "virtual" busy time, so that
761 idle = (now - last) - last_pktlen/rate
764 idle = PSCHED_TDIFF(q->now, cl->last);
765 if ((unsigned long)idle > 128*1024*1024) {
766 avgidle = cl->maxidle;
767 } else {
768 idle -= L2T(cl, len);
770 /* true_avgidle := (1-W)*true_avgidle + W*idle,
771 where W=2^{-ewma_log}. But cl->avgidle is scaled:
772 cl->avgidle == true_avgidle/W,
773 hence:
775 avgidle += idle - (avgidle>>cl->ewma_log);
778 if (avgidle <= 0) {
779 /* Overlimit or at-limit */
781 if (avgidle < cl->minidle)
782 avgidle = cl->minidle;
784 cl->avgidle = avgidle;
786 /* Calculate the expected time when this class
787 will be allowed to send.
788 It will occur when:
789 (1-W)*true_avgidle + W*delay = 0, i.e.
790 idle = (1/W - 1)*(-true_avgidle)
791 or, equivalently, in terms of the scaled cl->avgidle:
792 idle = (1 - W)*(-cl->avgidle);
794 idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
797 That is not all.
798 To maintain the rate allocated to the class,
799 we add to undertime the virtual clock time
800 necessary to complete the transmitted packet.
801 (len/phys_bandwidth has already been added
802 to q->now by the moment cbq_update is called)
805 idle -= L2T(&q->link, len);
806 idle += L2T(cl, len);
808 PSCHED_AUDIT_TDIFF(idle);
810 PSCHED_TADD2(q->now, idle, cl->undertime);
811 } else {
812 /* Underlimit */
814 PSCHED_SET_PASTPERFECT(cl->undertime);
815 if (avgidle > cl->maxidle)
816 cl->avgidle = cl->maxidle;
817 else
818 cl->avgidle = avgidle;
820 cl->last = q->now;
823 cbq_update_toplevel(q, this, q->tx_borrowed);
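/* Worked example of the avgidle update above (hypothetical numbers):
   with ewma_log = 5 the EWMA weight is W = 1/32 and cl->avgidle stores
   true_avgidle/W. If the stored avgidle is 640 and the new idle sample
   is 50, the update gives 640 + 50 - (640>>5) = 670, i.e. a true
   average idle time of about 670/32 ~= 21 time units. */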
826 static __inline__ struct cbq_class *
827 cbq_under_limit(struct cbq_class *cl)
829 struct cbq_sched_data *q = (struct cbq_sched_data*)cl->qdisc->data;
830 struct cbq_class *this_cl = cl;
832 if (cl->tparent == NULL)
833 return cl;
835 if (PSCHED_IS_PASTPERFECT(cl->undertime) ||
836 !PSCHED_TLESS(q->now, cl->undertime)) {
837 cl->delayed = 0;
838 return cl;
841 do {
842 /* This is a very suspicious place. Now the overlimit
843 action is generated for non-bounded classes
844 only if the link is completely congested.
845 Though it agrees with the ancestor-only paradigm,
846 it looks very stupid. Particularly,
847 it means that this chunk of code will either
848 never be called or result in strong amplification
849 of burstiness. Dangerous, silly, and, however,
850 no other solution exists.
852 if ((cl = cl->borrow) == NULL) {
853 this_cl->stats.overlimits++;
854 this_cl->overlimit(this_cl);
855 return NULL;
857 if (cl->level > q->toplevel)
858 return NULL;
859 } while (!PSCHED_IS_PASTPERFECT(cl->undertime) &&
860 PSCHED_TLESS(q->now, cl->undertime));
862 cl->delayed = 0;
863 return cl;
866 static __inline__ struct sk_buff *
867 cbq_dequeue_prio(struct Qdisc *sch, int prio)
869 struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
870 struct cbq_class *cl_tail, *cl_prev, *cl;
871 struct sk_buff *skb;
872 int deficit;
874 cl_tail = cl_prev = q->active[prio];
875 cl = cl_prev->next_alive;
877 do {
878 deficit = 0;
880 /* Start round */
881 do {
882 struct cbq_class *borrow = cl;
884 if (cl->q->q.qlen &&
885 (borrow = cbq_under_limit(cl)) == NULL)
886 goto skip_class;
888 if (cl->deficit <= 0) {
889 /* Class exhausted its allotment per
890 this round. Switch to the next one.
892 deficit = 1;
893 cl->deficit += cl->quantum;
894 goto next_class;
897 skb = cl->q->dequeue(cl->q);
899 /* Class did not give us any skb :-(
900 It could occur even if cl->q->q.qlen != 0
901 e.g. if cl->q == "tbf"
903 if (skb == NULL)
904 goto skip_class;
906 cl->deficit -= skb->len;
907 q->tx_class = cl;
908 q->tx_borrowed = borrow;
909 if (borrow != cl) {
910 #ifndef CBQ_XSTATS_BORROWS_BYTES
911 borrow->xstats.borrows++;
912 cl->xstats.borrows++;
913 #else
914 borrow->xstats.borrows += skb->len;
915 cl->xstats.borrows += skb->len;
916 #endif
918 q->tx_len = skb->len;
920 if (cl->deficit <= 0) {
921 q->active[prio] = cl;
922 cl = cl->next_alive;
923 cl->deficit += cl->quantum;
925 return skb;
927 skip_class:
928 if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
929 /* Class is empty or penalized.
930 Unlink it from active chain.
932 cl_prev->next_alive = cl->next_alive;
933 cl->next_alive = NULL;
935 /* Did cl_tail point to it? */
936 if (cl == cl_tail) {
937 /* Repair it! */
938 cl_tail = cl_prev;
940 /* Was it the last class in this band? */
941 if (cl == cl_tail) {
942 /* Kill the band! */
943 q->active[prio] = NULL;
944 q->activemask &= ~(1<<prio);
945 if (cl->q->q.qlen)
946 cbq_activate_class(cl);
947 return NULL;
950 q->active[prio] = cl_tail;
952 if (cl->q->q.qlen)
953 cbq_activate_class(cl);
955 cl = cl_prev;
958 next_class:
959 cl_prev = cl;
960 cl = cl->next_alive;
961 } while (cl_prev != cl_tail);
962 } while (deficit);
964 q->active[prio] = cl_prev;
966 return NULL;
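/* Worked example of the WRR accounting above (hypothetical numbers):
   a class with quantum 1500 and deficit 1500 dequeues a 1000-byte
   packet (deficit -> 500) and later a 900-byte packet (deficit -> -400).
   The next time the round-robin visits it with a non-positive deficit it
   is skipped for that round and topped up to -400 + 1500 = 1100, so each
   backlogged class averages roughly `quantum' bytes per round. */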
969 static __inline__ struct sk_buff *
970 cbq_dequeue_1(struct Qdisc *sch)
972 struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
973 struct sk_buff *skb;
974 unsigned activemask;
976 activemask = q->activemask&0xFF;
977 while (activemask) {
978 int prio = ffz(~activemask);
979 activemask &= ~(1<<prio);
980 skb = cbq_dequeue_prio(sch, prio);
981 if (skb)
982 return skb;
984 return NULL;
987 static struct sk_buff *
988 cbq_dequeue(struct Qdisc *sch)
990 struct sk_buff *skb;
991 struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
992 psched_time_t now;
993 psched_tdiff_t incr;
995 PSCHED_GET_TIME(now);
996 incr = PSCHED_TDIFF(now, q->now_rt);
998 if (q->tx_class) {
999 psched_tdiff_t incr2;
1000 /* Time integrator. We calculate EOS time
1001 by adding the expected packet transmission time.
1002 If real time is greater, we warp artificial clock,
1003 so that:
1005 cbq_time = max(real_time, work);
1007 incr2 = L2T(&q->link, q->tx_len);
1008 PSCHED_TADD(q->now, incr2);
1009 cbq_update(q);
1010 if ((incr -= incr2) < 0)
1011 incr = 0;
1013 PSCHED_TADD(q->now, incr);
1014 q->now_rt = now;
1016 for (;;) {
1017 q->wd_expires = 0;
1019 skb = cbq_dequeue_1(sch);
1020 if (skb) {
1021 sch->q.qlen--;
1022 sch->flags &= ~TCQ_F_THROTTLED;
1023 return skb;
1026 /* All the classes are overlimit.
1028 This is possible if:
1030 1. The scheduler is empty.
1031 2. The toplevel cutoff inhibited borrowing.
1032 3. The root class is overlimit.
1034 Reset the 2nd and 3rd conditions and retry.
1036 Note that NS and cbq-2.0 are buggy: peeking
1037 an arbitrary class is appropriate for ancestor-only
1038 sharing, but not for the toplevel algorithm.
1040 Our version is better but slower, because it requires
1041 two passes; this is unavoidable with top-level sharing.
1044 if (q->toplevel == TC_CBQ_MAXLEVEL &&
1045 PSCHED_IS_PASTPERFECT(q->link.undertime))
1046 break;
1048 q->toplevel = TC_CBQ_MAXLEVEL;
1049 PSCHED_SET_PASTPERFECT(q->link.undertime);
1052 /* No packets in scheduler or nobody wants to give them to us :-(
1053 Sigh... start watchdog timer in the last case. */
1055 if (sch->q.qlen) {
1056 sch->stats.overlimits++;
1057 if (q->wd_expires) {
1058 long delay = PSCHED_US2JIFFIE(q->wd_expires);
1059 if (delay <= 0)
1060 delay = 1;
1061 mod_timer(&q->wd_timer, jiffies + delay);
1062 sch->flags |= TCQ_F_THROTTLED;
1065 return NULL;
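/* Interpretation of the clock handling above: q->now advances by at
   least the real elapsed time and by at least the expected transmission
   time of the last packet, so on an idle link it tracks real time and on
   a saturated link it tracks scheduled work; effectively
   cbq_time = max(real_time, work), as the comment in cbq_dequeue says. */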
1068 /* CBQ class maintenance routines */
1070 static void cbq_adjust_levels(struct cbq_class *this)
1072 if (this == NULL)
1073 return;
1075 do {
1076 int level = 0;
1077 struct cbq_class *cl;
1079 if ((cl = this->children) != NULL) {
1080 do {
1081 if (cl->level > level)
1082 level = cl->level;
1083 } while ((cl = cl->sibling) != this->children);
1085 this->level = level+1;
1086 } while ((this = this->tparent) != NULL);
1089 static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
1091 struct cbq_class *cl;
1092 unsigned h;
1094 if (q->quanta[prio] == 0)
1095 return;
1097 for (h=0; h<16; h++) {
1098 for (cl = q->classes[h]; cl; cl = cl->next) {
1099 /* BUGGGG... Beware! This expression suffers from
1100 arithmetic overflows!
1102 if (cl->priority == prio) {
1103 cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
1104 q->quanta[prio];
1106 if (cl->quantum <= 0 || cl->quantum>32*cl->qdisc->dev->mtu) {
1107 printk(KERN_WARNING "CBQ: class %08x has bad quantum==%ld, repaired.\n", cl->classid, cl->quantum);
1108 cl->quantum = cl->qdisc->dev->mtu/2 + 1;
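/* Worked example (hypothetical numbers): if one priority band holds two
   classes with weights 1 and 3 (q->quanta[prio] = 4, nclasses = 2), both
   with allot = 1500, the formula yields quanta of 1*1500*2/4 = 750 and
   3*1500*2/4 = 2250 bytes, i.e. proportional to the weights with their
   average near the allot. Since weight defaults to the class rate in
   bytes/sec (see cbq_change_class), the product really can overflow a
   32-bit long, as warned above. */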
1114 static void cbq_sync_defmap(struct cbq_class *cl)
1116 struct cbq_sched_data *q = (struct cbq_sched_data*)cl->qdisc->data;
1117 struct cbq_class *split = cl->split;
1118 unsigned h;
1119 int i;
1121 if (split == NULL)
1122 return;
1124 for (i=0; i<=TC_PRIO_MAX; i++) {
1125 if (split->defaults[i] == cl && !(cl->defmap&(1<<i)))
1126 split->defaults[i] = NULL;
1129 for (i=0; i<=TC_PRIO_MAX; i++) {
1130 int level = split->level;
1132 if (split->defaults[i])
1133 continue;
1135 for (h=0; h<16; h++) {
1136 struct cbq_class *c;
1138 for (c = q->classes[h]; c; c = c->next) {
1139 if (c->split == split && c->level < level &&
1140 c->defmap&(1<<i)) {
1141 split->defaults[i] = c;
1142 level = c->level;
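/* In other words: bit i set in cl->defmap means "cl volunteers to be the
   default class for logical priority i within its split node", and
   cbq_sync_defmap() fills each free slot with the volunteering class of
   the lowest level, so the most specific (deepest) class wins. */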
1149 static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
1151 struct cbq_class *split = NULL;
1153 if (splitid == 0) {
1154 if ((split = cl->split) == NULL)
1155 return;
1156 splitid = split->classid;
1159 if (split == NULL || split->classid != splitid) {
1160 for (split = cl->tparent; split; split = split->tparent)
1161 if (split->classid == splitid)
1162 break;
1165 if (split == NULL)
1166 return;
1168 if (cl->split != split) {
1169 cl->defmap = 0;
1170 cbq_sync_defmap(cl);
1171 cl->split = split;
1172 cl->defmap = def&mask;
1173 } else
1174 cl->defmap = (cl->defmap&~mask)|(def&mask);
1176 cbq_sync_defmap(cl);
1179 static void cbq_unlink_class(struct cbq_class *this)
1181 struct cbq_class *cl, **clp;
1182 struct cbq_sched_data *q = (struct cbq_sched_data*)this->qdisc->data;
1184 for (clp = &q->classes[cbq_hash(this->classid)]; (cl = *clp) != NULL; clp = &cl->next) {
1185 if (cl == this) {
1186 *clp = cl->next;
1187 cl->next = NULL;
1188 break;
1192 if (this->tparent) {
1193 clp=&this->sibling;
1194 cl = *clp;
1195 do {
1196 if (cl == this) {
1197 *clp = cl->sibling;
1198 break;
1200 clp = &cl->sibling;
1201 } while ((cl = *clp) != this->sibling);
1203 if (this->tparent->children == this) {
1204 this->tparent->children = this->sibling;
1205 if (this->sibling == this)
1206 this->tparent->children = NULL;
1208 } else {
1209 BUG_TRAP(this->sibling == this);
1213 static void cbq_link_class(struct cbq_class *this)
1215 struct cbq_sched_data *q = (struct cbq_sched_data*)this->qdisc->data;
1216 unsigned h = cbq_hash(this->classid);
1217 struct cbq_class *parent = this->tparent;
1219 this->sibling = this;
1220 this->next = q->classes[h];
1221 q->classes[h] = this;
1223 if (parent == NULL)
1224 return;
1226 if (parent->children == NULL) {
1227 parent->children = this;
1228 } else {
1229 this->sibling = parent->children->sibling;
1230 parent->children->sibling = this;
1234 static unsigned int cbq_drop(struct Qdisc* sch)
1236 struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
1237 struct cbq_class *cl, *cl_head;
1238 int prio;
1239 unsigned int len;
1241 for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) {
1242 if ((cl_head = q->active[prio]) == NULL)
1243 continue;
1245 cl = cl_head;
1246 do {
1247 if (cl->q->ops->drop && (len = cl->q->ops->drop(cl->q))) {
1248 sch->q.qlen--;
1249 return len;
1251 } while ((cl = cl->next_alive) != cl_head);
1253 return 0;
1256 static void
1257 cbq_reset(struct Qdisc* sch)
1259 struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
1260 struct cbq_class *cl;
1261 int prio;
1262 unsigned h;
1264 q->activemask = 0;
1265 q->pmask = 0;
1266 q->tx_class = NULL;
1267 q->tx_borrowed = NULL;
1268 del_timer(&q->wd_timer);
1269 del_timer(&q->delay_timer);
1270 q->toplevel = TC_CBQ_MAXLEVEL;
1271 PSCHED_GET_TIME(q->now);
1272 q->now_rt = q->now;
1274 for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
1275 q->active[prio] = NULL;
1277 for (h = 0; h < 16; h++) {
1278 for (cl = q->classes[h]; cl; cl = cl->next) {
1279 qdisc_reset(cl->q);
1281 cl->next_alive = NULL;
1282 PSCHED_SET_PASTPERFECT(cl->undertime);
1283 cl->avgidle = cl->maxidle;
1284 cl->deficit = cl->quantum;
1285 cl->cpriority = cl->priority;
1288 sch->q.qlen = 0;
1292 static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
1294 if (lss->change&TCF_CBQ_LSS_FLAGS) {
1295 cl->share = (lss->flags&TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
1296 cl->borrow = (lss->flags&TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
1298 if (lss->change&TCF_CBQ_LSS_EWMA)
1299 cl->ewma_log = lss->ewma_log;
1300 if (lss->change&TCF_CBQ_LSS_AVPKT)
1301 cl->avpkt = lss->avpkt;
1302 if (lss->change&TCF_CBQ_LSS_MINIDLE)
1303 cl->minidle = -(long)lss->minidle;
1304 if (lss->change&TCF_CBQ_LSS_MAXIDLE) {
1305 cl->maxidle = lss->maxidle;
1306 cl->avgidle = lss->maxidle;
1308 if (lss->change&TCF_CBQ_LSS_OFFTIME)
1309 cl->offtime = lss->offtime;
1310 return 0;
1313 static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1315 q->nclasses[cl->priority]--;
1316 q->quanta[cl->priority] -= cl->weight;
1317 cbq_normalize_quanta(q, cl->priority);
1320 static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1322 q->nclasses[cl->priority]++;
1323 q->quanta[cl->priority] += cl->weight;
1324 cbq_normalize_quanta(q, cl->priority);
1327 static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1329 struct cbq_sched_data *q = (struct cbq_sched_data *)cl->qdisc->data;
1331 if (wrr->allot)
1332 cl->allot = wrr->allot;
1333 if (wrr->weight)
1334 cl->weight = wrr->weight;
1335 if (wrr->priority) {
1336 cl->priority = wrr->priority-1;
1337 cl->cpriority = cl->priority;
1338 if (cl->priority >= cl->priority2)
1339 cl->priority2 = TC_CBQ_MAXPRIO-1;
1342 cbq_addprio(q, cl);
1343 return 0;
1346 static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl)
1348 switch (ovl->strategy) {
1349 case TC_CBQ_OVL_CLASSIC:
1350 cl->overlimit = cbq_ovl_classic;
1351 break;
1352 case TC_CBQ_OVL_DELAY:
1353 cl->overlimit = cbq_ovl_delay;
1354 break;
1355 case TC_CBQ_OVL_LOWPRIO:
1356 if (ovl->priority2-1 >= TC_CBQ_MAXPRIO ||
1357 ovl->priority2-1 <= cl->priority)
1358 return -EINVAL;
1359 cl->priority2 = ovl->priority2-1;
1360 cl->overlimit = cbq_ovl_lowprio;
1361 break;
1362 case TC_CBQ_OVL_DROP:
1363 cl->overlimit = cbq_ovl_drop;
1364 break;
1365 case TC_CBQ_OVL_RCLASSIC:
1366 cl->overlimit = cbq_ovl_rclassic;
1367 break;
1368 default:
1369 return -EINVAL;
1371 cl->penalty = (ovl->penalty*HZ)/1000;
1372 return 0;
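/* Note: ovl->penalty is supplied in milliseconds and converted to
   jiffies here; e.g. with HZ = 100 a penalty of 30 ms becomes 3 jiffies,
   which cbq_ovl_delay() and cbq_ovl_lowprio() later add to the wakeup
   time kept in cl->penalized. */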
1375 #ifdef CONFIG_NET_CLS_POLICE
1376 static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p)
1378 cl->police = p->police;
1380 if (cl->q->handle) {
1381 if (p->police == TC_POLICE_RECLASSIFY)
1382 cl->q->reshape_fail = cbq_reshape_fail;
1383 else
1384 cl->q->reshape_fail = NULL;
1386 return 0;
1388 #endif
1390 static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1392 cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1393 return 0;
1396 static int cbq_init(struct Qdisc *sch, struct rtattr *opt)
1398 struct cbq_sched_data *q = (struct cbq_sched_data*)sch->data;
1399 struct rtattr *tb[TCA_CBQ_MAX];
1400 struct tc_ratespec *r;
1402 if (rtattr_parse(tb, TCA_CBQ_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt)) < 0 ||
1403 tb[TCA_CBQ_RTAB-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL ||
1404 RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec))
1405 return -EINVAL;
1407 if (tb[TCA_CBQ_LSSOPT-1] &&
1408 RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt))
1409 return -EINVAL;
1411 r = RTA_DATA(tb[TCA_CBQ_RATE-1]);
1413 MOD_INC_USE_COUNT;
1414 if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB-1])) == NULL) {
1415 MOD_DEC_USE_COUNT;
1416 return -EINVAL;
1419 q->link.refcnt = 1;
1420 q->link.sibling = &q->link;
1421 q->link.classid = sch->handle;
1422 q->link.qdisc = sch;
1423 if (!(q->link.q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops)))
1424 q->link.q = &noop_qdisc;
1426 q->link.priority = TC_CBQ_MAXPRIO-1;
1427 q->link.priority2 = TC_CBQ_MAXPRIO-1;
1428 q->link.cpriority = TC_CBQ_MAXPRIO-1;
1429 q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC;
1430 q->link.overlimit = cbq_ovl_classic;
1431 q->link.allot = psched_mtu(sch->dev);
1432 q->link.quantum = q->link.allot;
1433 q->link.weight = q->link.R_tab->rate.rate;
1435 q->link.ewma_log = TC_CBQ_DEF_EWMA;
1436 q->link.avpkt = q->link.allot/2;
1437 q->link.minidle = -0x7FFFFFFF;
1438 q->link.stats.lock = &sch->dev->queue_lock;
1440 init_timer(&q->wd_timer);
1441 q->wd_timer.data = (unsigned long)sch;
1442 q->wd_timer.function = cbq_watchdog;
1443 init_timer(&q->delay_timer);
1444 q->delay_timer.data = (unsigned long)sch;
1445 q->delay_timer.function = cbq_undelay;
1446 q->toplevel = TC_CBQ_MAXLEVEL;
1447 PSCHED_GET_TIME(q->now);
1448 q->now_rt = q->now;
1450 cbq_link_class(&q->link);
1452 if (tb[TCA_CBQ_LSSOPT-1])
1453 cbq_set_lss(&q->link, RTA_DATA(tb[TCA_CBQ_LSSOPT-1]));
1455 cbq_addprio(q, &q->link);
1456 return 0;
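/* Usage sketch (illustrative only; exact parameters depend on the
   iproute2 tc version):

   tc qdisc add dev eth0 root handle 1: cbq bandwidth 10Mbit avpkt 1000
   tc class add dev eth0 parent 1: classid 1:1 cbq bandwidth 10Mbit \
   rate 1Mbit allot 1514 prio 5 avpkt 1000 bounded

   The first command reaches cbq_init() above with TCA_CBQ_RATE/RTAB and
   an optional TCA_CBQ_LSSOPT; the second is handled by cbq_change_class()
   further down. */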
1459 static __inline__ int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1461 unsigned char *b = skb->tail;
1463 RTA_PUT(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate);
1464 return skb->len;
1466 rtattr_failure:
1467 skb_trim(skb, b - skb->data);
1468 return -1;
1471 static __inline__ int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1473 unsigned char *b = skb->tail;
1474 struct tc_cbq_lssopt opt;
1476 opt.flags = 0;
1477 if (cl->borrow == NULL)
1478 opt.flags |= TCF_CBQ_LSS_BOUNDED;
1479 if (cl->share == NULL)
1480 opt.flags |= TCF_CBQ_LSS_ISOLATED;
1481 opt.ewma_log = cl->ewma_log;
1482 opt.level = cl->level;
1483 opt.avpkt = cl->avpkt;
1484 opt.maxidle = cl->maxidle;
1485 opt.minidle = (u32)(-cl->minidle);
1486 opt.offtime = cl->offtime;
1487 opt.change = ~0;
1488 RTA_PUT(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt);
1489 return skb->len;
1491 rtattr_failure:
1492 skb_trim(skb, b - skb->data);
1493 return -1;
1496 static __inline__ int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1498 unsigned char *b = skb->tail;
1499 struct tc_cbq_wrropt opt;
1501 opt.flags = 0;
1502 opt.allot = cl->allot;
1503 opt.priority = cl->priority+1;
1504 opt.cpriority = cl->cpriority+1;
1505 opt.weight = cl->weight;
1506 RTA_PUT(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt);
1507 return skb->len;
1509 rtattr_failure:
1510 skb_trim(skb, b - skb->data);
1511 return -1;
1514 static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl)
1516 unsigned char *b = skb->tail;
1517 struct tc_cbq_ovl opt;
1519 opt.strategy = cl->ovl_strategy;
1520 opt.priority2 = cl->priority2+1;
1521 opt.pad = 0;
1522 opt.penalty = (cl->penalty*1000)/HZ;
1523 RTA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt);
1524 return skb->len;
1526 rtattr_failure:
1527 skb_trim(skb, b - skb->data);
1528 return -1;
1531 static __inline__ int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1533 unsigned char *b = skb->tail;
1534 struct tc_cbq_fopt opt;
1536 if (cl->split || cl->defmap) {
1537 opt.split = cl->split ? cl->split->classid : 0;
1538 opt.defmap = cl->defmap;
1539 opt.defchange = ~0;
1540 RTA_PUT(skb, TCA_CBQ_FOPT, sizeof(opt), &opt);
1542 return skb->len;
1544 rtattr_failure:
1545 skb_trim(skb, b - skb->data);
1546 return -1;
1549 #ifdef CONFIG_NET_CLS_POLICE
1550 static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)
1552 unsigned char *b = skb->tail;
1553 struct tc_cbq_police opt;
1555 if (cl->police) {
1556 opt.police = cl->police;
1557 opt.__res1 = 0;
1558 opt.__res2 = 0;
1559 RTA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt);
1561 return skb->len;
1563 rtattr_failure:
1564 skb_trim(skb, b - skb->data);
1565 return -1;
1567 #endif
1569 static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
1571 if (cbq_dump_lss(skb, cl) < 0 ||
1572 cbq_dump_rate(skb, cl) < 0 ||
1573 cbq_dump_wrr(skb, cl) < 0 ||
1574 cbq_dump_ovl(skb, cl) < 0 ||
1575 #ifdef CONFIG_NET_CLS_POLICE
1576 cbq_dump_police(skb, cl) < 0 ||
1577 #endif
1578 cbq_dump_fopt(skb, cl) < 0)
1579 return -1;
1580 return 0;
1583 int cbq_copy_xstats(struct sk_buff *skb, struct tc_cbq_xstats *st)
1585 RTA_PUT(skb, TCA_XSTATS, sizeof(*st), st);
1586 return 0;
1588 rtattr_failure:
1589 return -1;
1593 static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
1595 struct cbq_sched_data *q = (struct cbq_sched_data*)sch->data;
1596 unsigned char *b = skb->tail;
1597 struct rtattr *rta;
1599 rta = (struct rtattr*)b;
1600 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
1601 if (cbq_dump_attr(skb, &q->link) < 0)
1602 goto rtattr_failure;
1603 rta->rta_len = skb->tail - b;
1604 spin_lock_bh(&sch->dev->queue_lock);
1605 q->link.xstats.avgidle = q->link.avgidle;
1606 if (cbq_copy_xstats(skb, &q->link.xstats)) {
1607 spin_unlock_bh(&sch->dev->queue_lock);
1608 goto rtattr_failure;
1610 spin_unlock_bh(&sch->dev->queue_lock);
1611 return skb->len;
1613 rtattr_failure:
1614 skb_trim(skb, b - skb->data);
1615 return -1;
1618 static int
1619 cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1620 struct sk_buff *skb, struct tcmsg *tcm)
1622 struct cbq_sched_data *q = (struct cbq_sched_data*)sch->data;
1623 struct cbq_class *cl = (struct cbq_class*)arg;
1624 unsigned char *b = skb->tail;
1625 struct rtattr *rta;
1627 if (cl->tparent)
1628 tcm->tcm_parent = cl->tparent->classid;
1629 else
1630 tcm->tcm_parent = TC_H_ROOT;
1631 tcm->tcm_handle = cl->classid;
1632 tcm->tcm_info = cl->q->handle;
1634 rta = (struct rtattr*)b;
1635 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
1636 if (cbq_dump_attr(skb, cl) < 0)
1637 goto rtattr_failure;
1638 rta->rta_len = skb->tail - b;
1639 cl->stats.qlen = cl->q->q.qlen;
1640 if (qdisc_copy_stats(skb, &cl->stats))
1641 goto rtattr_failure;
1642 spin_lock_bh(&sch->dev->queue_lock);
1643 cl->xstats.avgidle = cl->avgidle;
1644 cl->xstats.undertime = 0;
1645 if (!PSCHED_IS_PASTPERFECT(cl->undertime))
1646 cl->xstats.undertime = PSCHED_TDIFF(cl->undertime, q->now);
1647 if (cbq_copy_xstats(skb, &cl->xstats)) {
1648 spin_unlock_bh(&sch->dev->queue_lock);
1649 goto rtattr_failure;
1651 spin_unlock_bh(&sch->dev->queue_lock);
1653 return skb->len;
1655 rtattr_failure:
1656 skb_trim(skb, b - skb->data);
1657 return -1;
1660 static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1661 struct Qdisc **old)
1663 struct cbq_class *cl = (struct cbq_class*)arg;
1665 if (cl) {
1666 if (new == NULL) {
1667 if ((new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops)) == NULL)
1668 return -ENOBUFS;
1669 } else {
1670 #ifdef CONFIG_NET_CLS_POLICE
1671 if (cl->police == TC_POLICE_RECLASSIFY)
1672 new->reshape_fail = cbq_reshape_fail;
1673 #endif
1675 sch_tree_lock(sch);
1676 *old = cl->q;
1677 cl->q = new;
1678 sch->q.qlen -= (*old)->q.qlen;
1679 qdisc_reset(*old);
1680 sch_tree_unlock(sch);
1682 return 0;
1684 return -ENOENT;
1687 static struct Qdisc *
1688 cbq_leaf(struct Qdisc *sch, unsigned long arg)
1690 struct cbq_class *cl = (struct cbq_class*)arg;
1692 return cl ? cl->q : NULL;
1695 static unsigned long cbq_get(struct Qdisc *sch, u32 classid)
1697 struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
1698 struct cbq_class *cl = cbq_class_lookup(q, classid);
1700 if (cl) {
1701 cl->refcnt++;
1702 return (unsigned long)cl;
1704 return 0;
1707 static void cbq_destroy_filters(struct cbq_class *cl)
1709 struct tcf_proto *tp;
1711 while ((tp = cl->filter_list) != NULL) {
1712 cl->filter_list = tp->next;
1713 tcf_destroy(tp);
1717 static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1719 struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
1721 BUG_TRAP(!cl->filters);
1723 cbq_destroy_filters(cl);
1724 qdisc_destroy(cl->q);
1725 qdisc_put_rtab(cl->R_tab);
1726 #ifdef CONFIG_NET_ESTIMATOR
1727 qdisc_kill_estimator(&cl->stats);
1728 #endif
1729 if (cl != &q->link)
1730 kfree(cl);
1733 static void
1734 cbq_destroy(struct Qdisc* sch)
1736 struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
1737 struct cbq_class *cl;
1738 unsigned h;
1740 #ifdef CONFIG_NET_CLS_POLICE
1741 q->rx_class = NULL;
1742 #endif
1744 * Filters must be destroyed first because we don't destroy the
1745 * classes from root to leaves, which means that filters can still
1746 * be bound to classes which have been destroyed already. --TGR '04
1748 for (h = 0; h < 16; h++)
1749 for (cl = q->classes[h]; cl; cl = cl->next)
1750 cbq_destroy_filters(cl);
1752 for (h = 0; h < 16; h++) {
1753 struct cbq_class *next;
1755 for (cl = q->classes[h]; cl; cl = next) {
1756 next = cl->next;
1757 cbq_destroy_class(sch, cl);
1761 MOD_DEC_USE_COUNT;
1764 static void cbq_put(struct Qdisc *sch, unsigned long arg)
1766 struct cbq_class *cl = (struct cbq_class*)arg;
1768 if (--cl->refcnt == 0) {
1769 #ifdef CONFIG_NET_CLS_POLICE
1770 struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
1772 spin_lock_bh(&sch->dev->queue_lock);
1773 if (q->rx_class == cl)
1774 q->rx_class = NULL;
1775 spin_unlock_bh(&sch->dev->queue_lock);
1776 #endif
1778 cbq_destroy_class(sch, cl);
1782 static int
1783 cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct rtattr **tca,
1784 unsigned long *arg)
1786 int err;
1787 struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
1788 struct cbq_class *cl = (struct cbq_class*)*arg;
1789 struct rtattr *opt = tca[TCA_OPTIONS-1];
1790 struct rtattr *tb[TCA_CBQ_MAX];
1791 struct cbq_class *parent;
1792 struct qdisc_rate_table *rtab = NULL;
1794 if (opt==NULL ||
1795 rtattr_parse(tb, TCA_CBQ_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt)))
1796 return -EINVAL;
1798 if (tb[TCA_CBQ_OVL_STRATEGY-1] &&
1799 RTA_PAYLOAD(tb[TCA_CBQ_OVL_STRATEGY-1]) < sizeof(struct tc_cbq_ovl))
1800 return -EINVAL;
1802 if (tb[TCA_CBQ_FOPT-1] &&
1803 RTA_PAYLOAD(tb[TCA_CBQ_FOPT-1]) < sizeof(struct tc_cbq_fopt))
1804 return -EINVAL;
1806 if (tb[TCA_CBQ_RATE-1] &&
1807 RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec))
1808 return -EINVAL;
1810 if (tb[TCA_CBQ_LSSOPT-1] &&
1811 RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt))
1812 return -EINVAL;
1814 if (tb[TCA_CBQ_WRROPT-1] &&
1815 RTA_PAYLOAD(tb[TCA_CBQ_WRROPT-1]) < sizeof(struct tc_cbq_wrropt))
1816 return -EINVAL;
1818 #ifdef CONFIG_NET_CLS_POLICE
1819 if (tb[TCA_CBQ_POLICE-1] &&
1820 RTA_PAYLOAD(tb[TCA_CBQ_POLICE-1]) < sizeof(struct tc_cbq_police))
1821 return -EINVAL;
1822 #endif
1824 if (cl) {
1825 /* Check parent */
1826 if (parentid) {
1827 if (cl->tparent && cl->tparent->classid != parentid)
1828 return -EINVAL;
1829 if (!cl->tparent && parentid != TC_H_ROOT)
1830 return -EINVAL;
1833 if (tb[TCA_CBQ_RATE-1]) {
1834 rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]);
1835 if (rtab == NULL)
1836 return -EINVAL;
1839 /* Change class parameters */
1840 sch_tree_lock(sch);
1842 if (cl->next_alive != NULL)
1843 cbq_deactivate_class(cl);
1845 if (rtab) {
1846 rtab = xchg(&cl->R_tab, rtab);
1847 qdisc_put_rtab(rtab);
1850 if (tb[TCA_CBQ_LSSOPT-1])
1851 cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1]));
1853 if (tb[TCA_CBQ_WRROPT-1]) {
1854 cbq_rmprio(q, cl);
1855 cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1]));
1858 if (tb[TCA_CBQ_OVL_STRATEGY-1])
1859 cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1]));
1861 #ifdef CONFIG_NET_CLS_POLICE
1862 if (tb[TCA_CBQ_POLICE-1])
1863 cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1]));
1864 #endif
1866 if (tb[TCA_CBQ_FOPT-1])
1867 cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1]));
1869 if (cl->q->q.qlen)
1870 cbq_activate_class(cl);
1872 sch_tree_unlock(sch);
1874 #ifdef CONFIG_NET_ESTIMATOR
1875 if (tca[TCA_RATE-1]) {
1876 qdisc_kill_estimator(&cl->stats);
1877 qdisc_new_estimator(&cl->stats, tca[TCA_RATE-1]);
1879 #endif
1880 return 0;
1883 if (parentid == TC_H_ROOT)
1884 return -EINVAL;
1886 if (tb[TCA_CBQ_WRROPT-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL ||
1887 tb[TCA_CBQ_LSSOPT-1] == NULL)
1888 return -EINVAL;
1890 rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]);
1891 if (rtab == NULL)
1892 return -EINVAL;
1894 if (classid) {
1895 err = -EINVAL;
1896 if (TC_H_MAJ(classid^sch->handle) || cbq_class_lookup(q, classid))
1897 goto failure;
1898 } else {
1899 int i;
1900 classid = TC_H_MAKE(sch->handle,0x8000);
1902 for (i=0; i<0x8000; i++) {
1903 if (++q->hgenerator >= 0x8000)
1904 q->hgenerator = 1;
1905 if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1906 break;
1908 err = -ENOSR;
1909 if (i >= 0x8000)
1910 goto failure;
1911 classid = classid|q->hgenerator;
1914 parent = &q->link;
1915 if (parentid) {
1916 parent = cbq_class_lookup(q, parentid);
1917 err = -EINVAL;
1918 if (parent == NULL)
1919 goto failure;
1922 err = -ENOBUFS;
1923 cl = kmalloc(sizeof(*cl), GFP_KERNEL);
1924 if (cl == NULL)
1925 goto failure;
1926 memset(cl, 0, sizeof(*cl));
1927 cl->R_tab = rtab;
1928 rtab = NULL;
1929 cl->refcnt = 1;
1930 if (!(cl->q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops)))
1931 cl->q = &noop_qdisc;
1932 cl->classid = classid;
1933 cl->tparent = parent;
1934 cl->qdisc = sch;
1935 cl->allot = parent->allot;
1936 cl->quantum = cl->allot;
1937 cl->weight = cl->R_tab->rate.rate;
1938 cl->stats.lock = &sch->dev->queue_lock;
1940 sch_tree_lock(sch);
1941 cbq_link_class(cl);
1942 cl->borrow = cl->tparent;
1943 if (cl->tparent != &q->link)
1944 cl->share = cl->tparent;
1945 cbq_adjust_levels(parent);
1946 cl->minidle = -0x7FFFFFFF;
1947 cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1]));
1948 cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1]));
1949 if (cl->ewma_log==0)
1950 cl->ewma_log = q->link.ewma_log;
1951 if (cl->maxidle==0)
1952 cl->maxidle = q->link.maxidle;
1953 if (cl->avpkt==0)
1954 cl->avpkt = q->link.avpkt;
1955 cl->overlimit = cbq_ovl_classic;
1956 if (tb[TCA_CBQ_OVL_STRATEGY-1])
1957 cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1]));
1958 #ifdef CONFIG_NET_CLS_POLICE
1959 if (tb[TCA_CBQ_POLICE-1])
1960 cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1]));
1961 #endif
1962 if (tb[TCA_CBQ_FOPT-1])
1963 cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1]));
1964 sch_tree_unlock(sch);
1966 #ifdef CONFIG_NET_ESTIMATOR
1967 if (tca[TCA_RATE-1])
1968 qdisc_new_estimator(&cl->stats, tca[TCA_RATE-1]);
1969 #endif
1971 *arg = (unsigned long)cl;
1972 return 0;
1974 failure:
1975 qdisc_put_rtab(rtab);
1976 return err;
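/* Note on auto-generated class IDs (derived from the code above): when a
   class is created without an explicit classid, the minor ID is taken
   from q->hgenerator, which cycles through 1..0x7FFF and is OR-ed with
   0x8000, so such classes get minor IDs in the range 0x8001..0xFFFF
   under the qdisc's own major number. */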
1979 static int cbq_delete(struct Qdisc *sch, unsigned long arg)
1981 struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
1982 struct cbq_class *cl = (struct cbq_class*)arg;
1984 if (cl->filters || cl->children || cl == &q->link)
1985 return -EBUSY;
1987 sch_tree_lock(sch);
1989 if (cl->next_alive)
1990 cbq_deactivate_class(cl);
1992 if (q->tx_borrowed == cl)
1993 q->tx_borrowed = q->tx_class;
1994 if (q->tx_class == cl) {
1995 q->tx_class = NULL;
1996 q->tx_borrowed = NULL;
1998 #ifdef CONFIG_NET_CLS_POLICE
1999 if (q->rx_class == cl)
2000 q->rx_class = NULL;
2001 #endif
2003 cbq_unlink_class(cl);
2004 cbq_adjust_levels(cl->tparent);
2005 cl->defmap = 0;
2006 cbq_sync_defmap(cl);
2008 cbq_rmprio(q, cl);
2009 sch_tree_unlock(sch);
2011 if (--cl->refcnt == 0)
2012 cbq_destroy_class(sch, cl);
2014 return 0;
2017 static struct tcf_proto **cbq_find_tcf(struct Qdisc *sch, unsigned long arg)
2019 struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
2020 struct cbq_class *cl = (struct cbq_class *)arg;
2022 if (cl == NULL)
2023 cl = &q->link;
2025 return &cl->filter_list;
2028 static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
2029 u32 classid)
2031 struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
2032 struct cbq_class *p = (struct cbq_class*)parent;
2033 struct cbq_class *cl = cbq_class_lookup(q, classid);
2035 if (cl) {
2036 if (p && p->level <= cl->level)
2037 return 0;
2038 cl->filters++;
2039 return (unsigned long)cl;
2041 return 0;
2044 static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
2046 struct cbq_class *cl = (struct cbq_class*)arg;
2048 cl->filters--;
2051 static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
2053 struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
2054 unsigned h;
2056 if (arg->stop)
2057 return;
2059 for (h = 0; h < 16; h++) {
2060 struct cbq_class *cl;
2062 for (cl = q->classes[h]; cl; cl = cl->next) {
2063 if (arg->count < arg->skip) {
2064 arg->count++;
2065 continue;
2067 if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
2068 arg->stop = 1;
2069 return;
2071 arg->count++;
2076 static struct Qdisc_class_ops cbq_class_ops =
2078 cbq_graft,
2079 cbq_leaf,
2080 cbq_get,
2081 cbq_put,
2082 cbq_change_class,
2083 cbq_delete,
2084 cbq_walk,
2086 cbq_find_tcf,
2087 cbq_bind_filter,
2088 cbq_unbind_filter,
2090 cbq_dump_class,
2093 struct Qdisc_ops cbq_qdisc_ops =
2095 NULL,
2096 &cbq_class_ops,
2097 "cbq",
2098 sizeof(struct cbq_sched_data),
2100 cbq_enqueue,
2101 cbq_dequeue,
2102 cbq_requeue,
2103 cbq_drop,
2105 cbq_init,
2106 cbq_reset,
2107 cbq_destroy,
2108 NULL /* cbq_change */,
2110 cbq_dump,
2113 #ifdef MODULE
2114 int init_module(void)
2116 return register_qdisc(&cbq_qdisc_ops);
2119 void cleanup_module(void)
2121 unregister_qdisc(&cbq_qdisc_ops);
2123 #endif
2124 MODULE_LICENSE("GPL");