/*
 * net/sched/sch_cbq.c	Class-Based Queueing discipline.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
/*	Class-Based Queueing (CBQ) algorithm.
	=======================================

	Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
		 Management Models for Packet Networks",
		 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995

		 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995

		 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
		 Parameters", 1996

		 [4] Sally Floyd and Michael Speer, "Experimental Results
		 for Class-Based Queueing", 1998, not published.

	-----------------------------------------------------------------------

	Algorithm skeleton was taken from NS simulator cbq.cc.
	If someone wants to check this code against the LBL version,
	they should take into account that ONLY the skeleton was borrowed;
	the implementation is different. Particularly:

	--- The WRR algorithm is different. Our version looks more
	reasonable (I hope) and works when quanta are allowed to be
	less than MTU, which is always the case when real-time classes
	have small rates. Note that the statement of [3] is
	incomplete: delay may actually be estimated even if class
	per-round allotment is less than MTU. Namely, if per-round
	allotment is W*r_i, and r_1+...+r_k = r < 1,

	delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B

	In the worst case we have an IntServ estimate with D = W*r+k*MTU
	and C = MTU*r. The proof (if correct at all) is trivial.

	--- It seems that cbq-2.0 is not very accurate. At least, I cannot
	interpret some places, which look like wrong translations
	from NS. Anyone is advised to find these differences
	and explain to me why I am wrong 8).

	--- Linux has no EOI event, so that we cannot estimate true class
	idle time. The workaround is to consider the next dequeue event
	as a sign that the previous packet is finished. This is wrong because of
	internal device queueing, but on a permanently loaded link it is true.
	Moreover, combined with the clock integrator, this scheme looks
	very close to an ideal solution.  */
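
/* Illustrative check of the delay bound above (an added example with
   made-up numbers, not from the original sources): take B = 10 Mbit/s,
   MTU = 1500 B, k = 2 classes with r_1 = r_2 = 0.25 (so r = 0.5) and
   W = 2000 B, giving per-round allotment W*r_i = 500 B < MTU. Then
   [MTU/(W*r_i)] = 3 and W*r = 1000 B, so

	delay_1 <= (3*1000 + 1000 + 2*1500) bytes / B
		 = 7000*8 bits / 10^7 bit/s = 5.6 ms,

   i.e. the bound stays finite even though the per-round allotment is
   below the MTU. */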
struct cbq_sched_data;


struct cbq_class
{
	struct Qdisc_class_common common;
	struct cbq_class	*next_alive;	/* next class with backlog in this priority band */

/* Parameters */
	unsigned char		priority;	/* class priority */
	unsigned char		priority2;	/* priority to be used after overlimit */
	unsigned char		ewma_log;	/* time constant for idle time calculation */
	unsigned char		ovl_strategy;
#ifdef CONFIG_NET_CLS_ACT
	unsigned char		police;
#endif

	u32			defmap;

	/* Link-sharing scheduler parameters */
	long			maxidle;	/* Class parameters: see below. */
	long			offtime;
	long			minidle;
	u32			avpkt;
	struct qdisc_rate_table	*R_tab;

	/* Overlimit strategy parameters */
	void			(*overlimit)(struct cbq_class *cl);
	psched_tdiff_t		penalty;

	/* General scheduler (WRR) parameters */
	long			allot;
	long			quantum;	/* Allotment per WRR round */
	long			weight;		/* Relative allotment: see below */

	struct Qdisc		*qdisc;		/* Ptr to CBQ discipline */
	struct cbq_class	*split;		/* Ptr to split node */
	struct cbq_class	*share;		/* Ptr to LS parent in the class tree */
	struct cbq_class	*tparent;	/* Ptr to tree parent in the class tree */
	struct cbq_class	*borrow;	/* NULL if class is bandwidth limited;
						   parent otherwise */
	struct cbq_class	*sibling;	/* Sibling chain */
	struct cbq_class	*children;	/* Pointer to children chain */

	struct Qdisc		*q;		/* Elementary queueing discipline */


/* Variables */
	unsigned char		cpriority;	/* Effective priority */
	unsigned char		delayed;
	unsigned char		level;		/* level of the class in hierarchy:
						   0 for leaf classes, and maximal
						   level of children + 1 for nodes.
						 */

	psched_time_t		last;		/* Last end of service */
	psched_time_t		undertime;
	long			avgidle;
	long			deficit;	/* Saved deficit for WRR */
	psched_time_t		penalized;
	struct gnet_stats_basic bstats;
	struct gnet_stats_queue qstats;
	struct gnet_stats_rate_est rate_est;
	struct tc_cbq_xstats	xstats;

	struct tcf_proto	*filter_list;

	int			refcnt;
	int			filters;

	struct cbq_class	*defaults[TC_PRIO_MAX+1];
};

struct cbq_sched_data
{
	struct Qdisc_class_hash	clhash;			/* Hash table of all classes */
	int			nclasses[TC_CBQ_MAXPRIO+1];
	unsigned		quanta[TC_CBQ_MAXPRIO+1];

	struct cbq_class	link;

	unsigned		activemask;
	struct cbq_class	*active[TC_CBQ_MAXPRIO+1];	/* List of all classes
								   with backlog */

#ifdef CONFIG_NET_CLS_ACT
	struct cbq_class	*rx_class;
#endif
	struct cbq_class	*tx_class;
	struct cbq_class	*tx_borrowed;
	int			tx_len;
	psched_time_t		now;		/* Cached timestamp */
	psched_time_t		now_rt;		/* Cached real time */
	unsigned		pmask;

	struct hrtimer		delay_timer;
	struct qdisc_watchdog	watchdog;	/* Watchdog timer,
						   started when CBQ has
						   backlog, but cannot
						   transmit just now */
	psched_tdiff_t		wd_expires;
	int			toplevel;
	u32			hgenerator;
};


#define L2T(cl,len)	qdisc_l2t((cl)->R_tab,len)
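
/* The L2T ("length to time") macro above maps a packet length to its
   transmission time at the class's configured rate, by lookup in the
   rate table attached to the class (see qdisc_l2t()). */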
static __inline__ struct cbq_class *
cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
{
	struct Qdisc_class_common *clc;

	clc = qdisc_class_find(&q->clhash, classid);
	if (clc == NULL)
		return NULL;
	return container_of(clc, struct cbq_class, common);
}

#ifdef CONFIG_NET_CLS_ACT

static struct cbq_class *
cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
{
	struct cbq_class *cl, *new;

	for (cl = this->tparent; cl; cl = cl->tparent)
		if ((new = cl->defaults[TC_PRIO_BESTEFFORT]) != NULL && new != this)
			return new;

	return NULL;
}

#endif
/* Classify packet. The procedure is pretty complicated, but
   it allows us to combine link sharing and priority scheduling
   transparently.

   Namely, you can put link-sharing rules (f.e. route based) at the root
   of CBQ, so that it resolves to split nodes. Then packets are classified
   by logical priority, or a more specific classifier may be attached
   to the split node.
 */
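
/* Illustrative configuration (a sketch based on the LARTC documentation,
   not part of this file; exact option spelling may differ between tc
   versions):

	tc qdisc add dev eth0 root handle 1: cbq bandwidth 100Mbit avpkt 1000
	tc class add dev eth0 parent 1: classid 1:1 cbq bandwidth 100Mbit \
		rate 6Mbit allot 1514 prio 5 avpkt 1000 bounded
	tc class change dev eth0 classid 1:1 cbq defmap c0 split 1:0

   With such a setup, packets whose logical priority matches a set bit in
   the defmap fall into class 1:1 at split node 1:0 whenever no attached
   filter gives a verdict; this is the defaults[] path in cbq_classify()
   below. */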
static struct cbq_class *
cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *head = &q->link;
	struct cbq_class **defmap;
	struct cbq_class *cl = NULL;
	u32 prio = skb->priority;
	struct tcf_result res;

	/*
	 *  Step 1. If skb->priority points to one of our classes, use it.
	 */
	if (TC_H_MAJ(prio^sch->handle) == 0 &&
	    (cl = cbq_class_lookup(q, prio)) != NULL)
		return cl;

	*qerr = NET_XMIT_BYPASS;
	for (;;) {
		int result = 0;
		defmap = head->defaults;

		/*
		 * Step 2+n. Apply classifier.
		 */
		if (!head->filter_list ||
		    (result = tc_classify_compat(skb, head->filter_list, &res)) < 0)
			goto fallback;

		if ((cl = (void*)res.class) == NULL) {
			if (TC_H_MAJ(res.classid))
				cl = cbq_class_lookup(q, res.classid);
			else if ((cl = defmap[res.classid&TC_PRIO_MAX]) == NULL)
				cl = defmap[TC_PRIO_BESTEFFORT];

			if (cl == NULL || cl->level >= head->level)
				goto fallback;
		}

#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_QUEUED:
		case TC_ACT_STOLEN:
			*qerr = NET_XMIT_SUCCESS;
			/* fall through */
		case TC_ACT_SHOT:
			return NULL;
		case TC_ACT_RECLASSIFY:
			return cbq_reclassify(skb, cl);
		}
#endif
		if (cl->level == 0)
			return cl;

		/*
		 * Step 3+n. If classifier selected a link-sharing class,
		 *	     apply agency-specific classifier.
		 *	     Repeat this procedure until we hit a leaf node.
		 */
		head = cl;
	}

fallback:
	cl = head;

	/*
	 * Step 4. No success...
	 */
	if (TC_H_MAJ(prio) == 0 &&
	    !(cl = head->defaults[prio&TC_PRIO_MAX]) &&
	    !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
		return head;

	return cl;
}
/*
   A packet has just been enqueued on an empty class.
   cbq_activate_class adds it to the tail of the active class list
   of its priority band.
 */

static __inline__ void cbq_activate_class(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	int prio = cl->cpriority;
	struct cbq_class *cl_tail;

	cl_tail = q->active[prio];
	q->active[prio] = cl;

	if (cl_tail != NULL) {
		cl->next_alive = cl_tail->next_alive;
		cl_tail->next_alive = cl;
	} else {
		cl->next_alive = cl;
		q->activemask |= (1<<prio);
	}
}

/*
   Unlink class from the active chain.
   Note that this same procedure is done directly in cbq_dequeue*
   during the round-robin procedure.
 */

static void cbq_deactivate_class(struct cbq_class *this)
{
	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
	int prio = this->cpriority;
	struct cbq_class *cl;
	struct cbq_class *cl_prev = q->active[prio];

	do {
		cl = cl_prev->next_alive;
		if (cl == this) {
			cl_prev->next_alive = cl->next_alive;
			cl->next_alive = NULL;

			if (cl == q->active[prio]) {
				q->active[prio] = cl_prev;
				if (cl == q->active[prio]) {
					q->active[prio] = NULL;
					q->activemask &= ~(1<<prio);
					return;
				}
			}
			return;
		}
	} while ((cl_prev = cl) != q->active[prio]);
}
static void
cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
{
	int toplevel = q->toplevel;

	if (toplevel > cl->level && !(cl->q->flags&TCQ_F_THROTTLED)) {
		psched_time_t now;
		psched_tdiff_t incr;

		now = psched_get_time();
		incr = now - q->now_rt;
		now = q->now + incr;

		do {
			if (cl->undertime < now) {
				q->toplevel = cl->level;
				return;
			}
		} while ((cl=cl->borrow) != NULL && toplevel > cl->level);
	}
}

static int
cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	int uninitialized_var(ret);
	struct cbq_class *cl = cbq_classify(skb, sch, &ret);

#ifdef CONFIG_NET_CLS_ACT
	q->rx_class = cl;
#endif
	if (cl == NULL) {
		if (ret == NET_XMIT_BYPASS)
			sch->qstats.drops++;
		kfree_skb(skb);
		return ret;
	}

#ifdef CONFIG_NET_CLS_ACT
	cl->q->__parent = sch;
#endif
	ret = qdisc_enqueue(skb, cl->q);
	if (ret == NET_XMIT_SUCCESS) {
		sch->q.qlen++;
		sch->bstats.packets++;
		sch->bstats.bytes += qdisc_pkt_len(skb);
		cbq_mark_toplevel(q, cl);
		if (!cl->next_alive)
			cbq_activate_class(cl);
		return ret;
	}

	sch->qstats.drops++;
	cbq_mark_toplevel(q, cl);
	cl->qstats.drops++;
	return ret;
}

static int
cbq_requeue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl;
	int ret;

	if ((cl = q->tx_class) == NULL) {
		kfree_skb(skb);
		sch->qstats.drops++;
		return NET_XMIT_CN;
	}
	q->tx_class = NULL;

	cbq_mark_toplevel(q, cl);

#ifdef CONFIG_NET_CLS_ACT
	q->rx_class = cl;
	cl->q->__parent = sch;
#endif
	if ((ret = cl->q->ops->requeue(skb, cl->q)) == 0) {
		sch->q.qlen++;
		sch->qstats.requeues++;
		if (!cl->next_alive)
			cbq_activate_class(cl);
		return 0;
	}
	sch->qstats.drops++;
	cl->qstats.drops++;
	return ret;
}
/* Overlimit actions */

/* TC_CBQ_OVL_CLASSIC: (default) penalize leaf class by adding offtime */

static void cbq_ovl_classic(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	psched_tdiff_t delay = cl->undertime - q->now;

	if (!cl->delayed) {
		delay += cl->offtime;

		/*
		   Class goes to sleep, so that it will have no
		   chance to work avgidle. Let's forgive it 8)

		   BTW cbq-2.0 has a bug in this
		   place, apparently they forgot to shift it by cl->ewma_log.
		 */
		if (cl->avgidle < 0)
			delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
		if (cl->avgidle < cl->minidle)
			cl->avgidle = cl->minidle;
		if (delay <= 0)
			delay = 1;
		cl->undertime = q->now + delay;

		cl->xstats.overactions++;
		cl->delayed = 1;
	}
	if (q->wd_expires == 0 || q->wd_expires > delay)
		q->wd_expires = delay;

	/* Dirty work! We must schedule wakeups based on
	   real available rate, rather than leaf rate,
	   which may be tiny (even zero).
	 */
	if (q->toplevel == TC_CBQ_MAXLEVEL) {
		struct cbq_class *b;
		psched_tdiff_t base_delay = q->wd_expires;

		for (b = cl->borrow; b; b = b->borrow) {
			delay = b->undertime - q->now;
			if (delay < base_delay) {
				if (delay <= 0)
					delay = 1;
				base_delay = delay;
			}
		}

		q->wd_expires = base_delay;
	}
}
/* TC_CBQ_OVL_RCLASSIC: penalize, by adding offtime, classes in the
   hierarchy when they go overlimit
 */

static void cbq_ovl_rclassic(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	struct cbq_class *this = cl;

	do {
		if (cl->level > q->toplevel) {
			cl = NULL;
			break;
		}
	} while ((cl = cl->borrow) != NULL);

	if (cl == NULL)
		cl = this;

	cbq_ovl_classic(cl);
}

/* TC_CBQ_OVL_DELAY: delay until the class goes underlimit */

static void cbq_ovl_delay(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	psched_tdiff_t delay = cl->undertime - q->now;

	if (!cl->delayed) {
		psched_time_t sched = q->now;
		ktime_t expires;

		delay += cl->offtime;
		if (cl->avgidle < 0)
			delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
		if (cl->avgidle < cl->minidle)
			cl->avgidle = cl->minidle;
		cl->undertime = q->now + delay;

		if (delay > 0) {
			sched += delay + cl->penalty;
			cl->penalized = sched;
			cl->cpriority = TC_CBQ_MAXPRIO;
			q->pmask |= (1<<TC_CBQ_MAXPRIO);

			expires = ktime_set(0, 0);
			expires = ktime_add_ns(expires, PSCHED_US2NS(sched));
			if (hrtimer_try_to_cancel(&q->delay_timer) &&
			    ktime_to_ns(ktime_sub(q->delay_timer.expires,
						  expires)) > 0)
				q->delay_timer.expires = expires;
			hrtimer_restart(&q->delay_timer);
			cl->delayed = 1;
			cl->xstats.overactions++;
			return;
		}
		delay = 1;
	}
	if (q->wd_expires == 0 || q->wd_expires > delay)
		q->wd_expires = delay;
}

/* TC_CBQ_OVL_LOWPRIO: penalize class by lowering its priority band */

static void cbq_ovl_lowprio(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);

	cl->penalized = q->now + cl->penalty;

	if (cl->cpriority != cl->priority2) {
		cl->cpriority = cl->priority2;
		q->pmask |= (1<<cl->cpriority);
		cl->xstats.overactions++;
	}
	cbq_ovl_classic(cl);
}

/* TC_CBQ_OVL_DROP: penalize class by dropping */

static void cbq_ovl_drop(struct cbq_class *cl)
{
	if (cl->q->ops->drop)
		if (cl->q->ops->drop(cl->q))
			cl->qdisc->q.qlen--;
	cl->xstats.overactions++;
	cbq_ovl_classic(cl);
}

static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
				       psched_time_t now)
{
	struct cbq_class *cl;
	struct cbq_class *cl_prev = q->active[prio];
	psched_time_t sched = now;

	if (cl_prev == NULL)
		return 0;

	do {
		cl = cl_prev->next_alive;
		if (now - cl->penalized > 0) {
			cl_prev->next_alive = cl->next_alive;
			cl->next_alive = NULL;
			cl->cpriority = cl->priority;
			cl->delayed = 0;
			cbq_activate_class(cl);

			if (cl == q->active[prio]) {
				q->active[prio] = cl_prev;
				if (cl == q->active[prio]) {
					q->active[prio] = NULL;
					return 0;
				}
			}

			cl = cl_prev->next_alive;
		} else if (sched - cl->penalized > 0)
			sched = cl->penalized;
	} while ((cl_prev = cl) != q->active[prio]);

	return sched - now;
}

static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
{
	struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
						delay_timer);
	struct Qdisc *sch = q->watchdog.qdisc;
	psched_time_t now;
	psched_tdiff_t delay = 0;
	unsigned pmask;

	now = psched_get_time();

	pmask = q->pmask;
	q->pmask = 0;

	while (pmask) {
		int prio = ffz(~pmask);
		psched_tdiff_t tmp;

		pmask &= ~(1<<prio);

		tmp = cbq_undelay_prio(q, prio, now);
		if (tmp > 0) {
			q->pmask |= 1<<prio;
			if (tmp < delay || delay == 0)
				delay = tmp;
		}
	}

	if (delay) {
		ktime_t time;

		time = ktime_set(0, 0);
		time = ktime_add_ns(time, PSCHED_US2NS(now + delay));
		hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS);
	}

	sch->flags &= ~TCQ_F_THROTTLED;
	__netif_schedule(sch);
	return HRTIMER_NORESTART;
}
#ifdef CONFIG_NET_CLS_ACT
static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
{
	struct Qdisc *sch = child->__parent;
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl = q->rx_class;

	q->rx_class = NULL;

	if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) {

		cbq_mark_toplevel(q, cl);

		q->rx_class = cl;
		cl->q->__parent = sch;

		if (qdisc_enqueue(skb, cl->q) == 0) {
			sch->q.qlen++;
			sch->bstats.packets++;
			sch->bstats.bytes += qdisc_pkt_len(skb);
			if (!cl->next_alive)
				cbq_activate_class(cl);
			return 0;
		}
		sch->qstats.drops++;
		return 0;
	}

	sch->qstats.drops++;
	return -1;
}
#endif
/*
   This is a mission-critical procedure.

   We "regenerate" the toplevel cutoff if the transmitting class
   has backlog and is not regulated. It is not part of the
   original CBQ description, but looks more reasonable.
   Probably, it is wrong. This question needs further investigation.
 */

static __inline__ void
cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
		    struct cbq_class *borrowed)
{
	if (cl && q->toplevel >= borrowed->level) {
		if (cl->q->q.qlen > 1) {
			do {
				if (borrowed->undertime == PSCHED_PASTPERFECT) {
					q->toplevel = borrowed->level;
					return;
				}
			} while ((borrowed=borrowed->borrow) != NULL);
		}
#if 0
	/* It is not necessary now. Uncommenting it
	   will save CPU cycles, but decrease fairness.
	 */
		q->toplevel = TC_CBQ_MAXLEVEL;
#endif
	}
}
static void
cbq_update(struct cbq_sched_data *q)
{
	struct cbq_class *this = q->tx_class;
	struct cbq_class *cl = this;
	int len = q->tx_len;

	q->tx_class = NULL;

	for ( ; cl; cl = cl->share) {
		long avgidle = cl->avgidle;
		long idle;

		cl->bstats.packets++;
		cl->bstats.bytes += len;

		/*
		   (now - last) is total time between packet right edges.
		   (last_pktlen/rate) is "virtual" busy time, so that

			 idle = (now - last) - last_pktlen/rate
		 */

		idle = q->now - cl->last;
		if ((unsigned long)idle > 128*1024*1024) {
			avgidle = cl->maxidle;
		} else {
			idle -= L2T(cl, len);

		/* true_avgidle := (1-W)*true_avgidle + W*idle,
		   where W=2^{-ewma_log}. But cl->avgidle is scaled:
		   cl->avgidle == true_avgidle/W,
		   hence:
		 */
			avgidle += idle - (avgidle>>cl->ewma_log);
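
			/* Numeric illustration: with ewma_log = 5
			   (W = 1/32), the line above moves the scaled
			   average by (idle - avgidle/32), so the true
			   average moves 1/32 of the way towards the new
			   idle sample:
			   true_avgidle := (31/32)*true_avgidle + (1/32)*idle. */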
		}

		if (avgidle <= 0) {
			/* Overlimit or at-limit */

			if (avgidle < cl->minidle)
				avgidle = cl->minidle;

			cl->avgidle = avgidle;

			/* Calculate expected time, when this class
			   will be allowed to send.
			   It will occur, when:
			   (1-W)*true_avgidle + W*delay = 0, i.e.
			   idle = (1/W - 1)*(-true_avgidle)
			   or
			   idle = (1 - W)*(-cl->avgidle);
			 */
			idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);

			/*
			   That is not all.
			   To maintain the rate allocated to the class,
			   we add to undertime the virtual clock time
			   necessary to complete the transmitted packet.
			   (len/phys_bandwidth has already passed
			   by the moment of cbq_update)
			 */

			idle -= L2T(&q->link, len);
			idle += L2T(cl, len);

			cl->undertime = q->now + idle;
		} else {
			/* Underlimit */

			cl->undertime = PSCHED_PASTPERFECT;
			if (avgidle > cl->maxidle)
				cl->avgidle = cl->maxidle;
			else
				cl->avgidle = avgidle;
		}
		cl->last = q->now;
	}

	cbq_update_toplevel(q, this, q->tx_borrowed);
}
static __inline__ struct cbq_class *
cbq_under_limit(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	struct cbq_class *this_cl = cl;

	if (cl->tparent == NULL)
		return cl;

	if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
		cl->delayed = 0;
		return cl;
	}

	do {
		/* This is a very suspicious place. Now the overlimit
		   action is generated for not-bounded classes
		   only if the link is completely congested.
		   Though it agrees with the ancestor-only paradigm,
		   it looks very stupid. Particularly,
		   it means that this chunk of code will either
		   never be called or will result in strong amplification
		   of burstiness. Dangerous, silly, and, however,
		   no other solution exists.
		 */
		if ((cl = cl->borrow) == NULL) {
			this_cl->qstats.overlimits++;
			this_cl->overlimit(this_cl);
			return NULL;
		}
		if (cl->level > q->toplevel)
			return NULL;
	} while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);

	cl->delayed = 0;
	return cl;
}
static __inline__ struct sk_buff *
cbq_dequeue_prio(struct Qdisc *sch, int prio)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl_tail, *cl_prev, *cl;
	struct sk_buff *skb;
	int deficit;

	cl_tail = cl_prev = q->active[prio];
	cl = cl_prev->next_alive;

	do {
		deficit = 0;

		/* Start round */
		do {
			struct cbq_class *borrow = cl;

			if (cl->q->q.qlen &&
			    (borrow = cbq_under_limit(cl)) == NULL)
				goto skip_class;

			if (cl->deficit <= 0) {
				/* Class exhausted its allotment per
				   this round. Switch to the next one.
				 */
				deficit = 1;
				cl->deficit += cl->quantum;
				goto next_class;
			}

			skb = cl->q->dequeue(cl->q);

			/* Class did not give us any skb :-(
			   It could occur even if cl->q->q.qlen != 0,
			   f.e. if cl->q == "tbf"
			 */
			if (skb == NULL)
				goto skip_class;

			cl->deficit -= qdisc_pkt_len(skb);
			q->tx_class = cl;
			q->tx_borrowed = borrow;
			if (borrow != cl) {
#ifndef CBQ_XSTATS_BORROWS_BYTES
				borrow->xstats.borrows++;
				cl->xstats.borrows++;
#else
				borrow->xstats.borrows += qdisc_pkt_len(skb);
				cl->xstats.borrows += qdisc_pkt_len(skb);
#endif
			}
			q->tx_len = qdisc_pkt_len(skb);

			if (cl->deficit <= 0) {
				q->active[prio] = cl;
				cl = cl->next_alive;
				cl->deficit += cl->quantum;
			}
			return skb;

skip_class:
			if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
				/* Class is empty or penalized.
				   Unlink it from active chain.
				 */
				cl_prev->next_alive = cl->next_alive;
				cl->next_alive = NULL;

				/* Did cl_tail point to it? */
				if (cl == cl_tail) {
					/* Repair it! */
					cl_tail = cl_prev;

					/* Was it the last class in this band? */
					if (cl == cl_tail) {
						/* Kill the band! */
						q->active[prio] = NULL;
						q->activemask &= ~(1<<prio);
						if (cl->q->q.qlen)
							cbq_activate_class(cl);
						return NULL;
					}

					q->active[prio] = cl_tail;
				}
				if (cl->q->q.qlen)
					cbq_activate_class(cl);

				cl = cl_prev;
			}

next_class:
			cl_prev = cl;
			cl = cl->next_alive;
		} while (cl_prev != cl_tail);
	} while (deficit);

	q->active[prio] = cl_prev;

	return NULL;
}

static __inline__ struct sk_buff *
cbq_dequeue_1(struct Qdisc *sch)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	unsigned activemask;

	activemask = q->activemask&0xFF;
	while (activemask) {
		int prio = ffz(~activemask);
		activemask &= ~(1<<prio);
		skb = cbq_dequeue_prio(sch, prio);
		if (skb)
			return skb;
	}
	return NULL;
}
static struct sk_buff *
cbq_dequeue(struct Qdisc *sch)
{
	struct sk_buff *skb;
	struct cbq_sched_data *q = qdisc_priv(sch);
	psched_time_t now;
	psched_tdiff_t incr;

	now = psched_get_time();
	incr = now - q->now_rt;

	if (q->tx_class) {
		psched_tdiff_t incr2;
		/* Time integrator. We calculate EOS time
		   by adding expected packet transmission time.
		   If real time is greater, we warp artificial clock,
		   so that:

		   cbq_time = max(real_time, work);
		 */
		incr2 = L2T(&q->link, q->tx_len);
		q->now += incr2;
		cbq_update(q);
		if ((incr -= incr2) < 0)
			incr = 0;
	}
	q->now += incr;
	q->now_rt = now;
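
	/* At this point q->now has advanced by at least the virtual
	   transmission time of the last packet; if the real clock ran
	   further ahead, the residual incr warped the virtual clock
	   forward to real time, implementing the
	   cbq_time = max(real_time, work) rule from the comment above. */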
	for (;;) {
		q->wd_expires = 0;

		skb = cbq_dequeue_1(sch);
		if (skb) {
			sch->q.qlen--;
			sch->flags &= ~TCQ_F_THROTTLED;
			return skb;
		}

		/* All the classes are overlimit.

		   It is possible, if:

		   1. Scheduler is empty.
		   2. Toplevel cutoff inhibited borrowing.
		   3. Root class is overlimit.

		   Reset conditions 2 and 3 and retry.

		   Note that NS and cbq-2.0 are buggy: peeking
		   an arbitrary class is appropriate for ancestor-only
		   sharing, but not for the toplevel algorithm.

		   Our version is better but slower, because it requires
		   two passes; this is unavoidable with top-level sharing.
		 */

		if (q->toplevel == TC_CBQ_MAXLEVEL &&
		    q->link.undertime == PSCHED_PASTPERFECT)
			break;

		q->toplevel = TC_CBQ_MAXLEVEL;
		q->link.undertime = PSCHED_PASTPERFECT;
	}

	/* No packets in scheduler or nobody wants to give them to us :-(
	   Sigh... start watchdog timer in the last case. */

	if (sch->q.qlen) {
		sch->qstats.overlimits++;
		if (q->wd_expires)
			qdisc_watchdog_schedule(&q->watchdog,
						now + q->wd_expires);
	}
	return NULL;
}
/* CBQ class maintenance routines */

static void cbq_adjust_levels(struct cbq_class *this)
{
	if (this == NULL)
		return;

	do {
		int level = 0;
		struct cbq_class *cl;

		if ((cl = this->children) != NULL) {
			do {
				if (cl->level > level)
					level = cl->level;
			} while ((cl = cl->sibling) != this->children);
		}
		this->level = level+1;
	} while ((this = this->tparent) != NULL);
}
static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
{
	struct cbq_class *cl;
	struct hlist_node *n;
	unsigned int h;

	if (q->quanta[prio] == 0)
		return;

	for (h = 0; h < q->clhash.hashsize; h++) {
		hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) {
			/* BUGGGG... Beware! This expression suffers from
			   arithmetic overflows!
			 */
			if (cl->priority == prio) {
				cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
					q->quanta[prio];
			}
			if (cl->quantum <= 0 || cl->quantum>32*qdisc_dev(cl->qdisc)->mtu) {
				printk(KERN_WARNING "CBQ: class %08x has bad quantum==%ld, repaired.\n", cl->common.classid, cl->quantum);
				cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
			}
		}
	}
}
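
/* Sanity check of the normalization above: q->quanta[prio] is the sum of
   the weights of all classes in the band, so with n classes of equal
   weight w each class gets quantum = (w*allot*n)/(n*w) == allot, i.e. one
   full allotment per WRR round; unequal weights scale each class's
   quantum proportionally. */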
static void cbq_sync_defmap(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	struct cbq_class *split = cl->split;
	unsigned h;
	int i;

	if (split == NULL)
		return;

	for (i=0; i<=TC_PRIO_MAX; i++) {
		if (split->defaults[i] == cl && !(cl->defmap&(1<<i)))
			split->defaults[i] = NULL;
	}

	for (i=0; i<=TC_PRIO_MAX; i++) {
		int level = split->level;

		if (split->defaults[i])
			continue;

		for (h = 0; h < q->clhash.hashsize; h++) {
			struct hlist_node *n;
			struct cbq_class *c;

			hlist_for_each_entry(c, n, &q->clhash.hash[h],
					     common.hnode) {
				if (c->split == split && c->level < level &&
				    c->defmap&(1<<i)) {
					split->defaults[i] = c;
					level = c->level;
				}
			}
		}
	}
}

static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
{
	struct cbq_class *split = NULL;

	if (splitid == 0) {
		if ((split = cl->split) == NULL)
			return;
		splitid = split->common.classid;
	}

	if (split == NULL || split->common.classid != splitid) {
		for (split = cl->tparent; split; split = split->tparent)
			if (split->common.classid == splitid)
				break;
	}

	if (split == NULL)
		return;

	if (cl->split != split) {
		cl->defmap = 0;
		cbq_sync_defmap(cl);
		cl->split = split;
		cl->defmap = def&mask;
	} else
		cl->defmap = (cl->defmap&~mask)|(def&mask);

	cbq_sync_defmap(cl);
}

static void cbq_unlink_class(struct cbq_class *this)
{
	struct cbq_class *cl, **clp;
	struct cbq_sched_data *q = qdisc_priv(this->qdisc);

	qdisc_class_hash_remove(&q->clhash, &this->common);

	if (this->tparent) {
		clp=&this->sibling;
		cl = *clp;
		do {
			if (cl == this) {
				*clp = cl->sibling;
				break;
			}
			clp = &cl->sibling;
		} while ((cl = *clp) != this->sibling);

		if (this->tparent->children == this) {
			this->tparent->children = this->sibling;
			if (this->sibling == this)
				this->tparent->children = NULL;
		}
	} else {
		BUG_TRAP(this->sibling == this);
	}
}

static void cbq_link_class(struct cbq_class *this)
{
	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
	struct cbq_class *parent = this->tparent;

	this->sibling = this;
	qdisc_class_hash_insert(&q->clhash, &this->common);

	if (parent == NULL)
		return;

	if (parent->children == NULL) {
		parent->children = this;
	} else {
		this->sibling = parent->children->sibling;
		parent->children->sibling = this;
	}
}
static unsigned int cbq_drop(struct Qdisc* sch)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl, *cl_head;
	int prio;
	unsigned int len;

	for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) {
		if ((cl_head = q->active[prio]) == NULL)
			continue;

		cl = cl_head;
		do {
			if (cl->q->ops->drop && (len = cl->q->ops->drop(cl->q))) {
				sch->q.qlen--;
				if (!cl->q->q.qlen)
					cbq_deactivate_class(cl);
				return len;
			}
		} while ((cl = cl->next_alive) != cl_head);
	}
	return 0;
}

static void
cbq_reset(struct Qdisc* sch)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl;
	struct hlist_node *n;
	int prio;
	unsigned h;

	q->activemask = 0;
	q->pmask = 0;
	q->tx_class = NULL;
	q->tx_borrowed = NULL;
	qdisc_watchdog_cancel(&q->watchdog);
	hrtimer_cancel(&q->delay_timer);
	q->toplevel = TC_CBQ_MAXLEVEL;
	q->now = psched_get_time();
	q->now_rt = q->now;

	for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
		q->active[prio] = NULL;

	for (h = 0; h < q->clhash.hashsize; h++) {
		hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) {
			qdisc_reset(cl->q);

			cl->next_alive = NULL;
			cl->undertime = PSCHED_PASTPERFECT;
			cl->avgidle = cl->maxidle;
			cl->deficit = cl->quantum;
			cl->cpriority = cl->priority;
		}
	}
	sch->q.qlen = 0;
}


static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
{
	if (lss->change&TCF_CBQ_LSS_FLAGS) {
		cl->share = (lss->flags&TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
		cl->borrow = (lss->flags&TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
	}
	if (lss->change&TCF_CBQ_LSS_EWMA)
		cl->ewma_log = lss->ewma_log;
	if (lss->change&TCF_CBQ_LSS_AVPKT)
		cl->avpkt = lss->avpkt;
	if (lss->change&TCF_CBQ_LSS_MINIDLE)
		cl->minidle = -(long)lss->minidle;
	if (lss->change&TCF_CBQ_LSS_MAXIDLE) {
		cl->maxidle = lss->maxidle;
		cl->avgidle = lss->maxidle;
	}
	if (lss->change&TCF_CBQ_LSS_OFFTIME)
		cl->offtime = lss->offtime;
	return 0;
}

static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
{
	q->nclasses[cl->priority]--;
	q->quanta[cl->priority] -= cl->weight;
	cbq_normalize_quanta(q, cl->priority);
}

static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
{
	q->nclasses[cl->priority]++;
	q->quanta[cl->priority] += cl->weight;
	cbq_normalize_quanta(q, cl->priority);
}

static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);

	if (wrr->allot)
		cl->allot = wrr->allot;
	if (wrr->weight)
		cl->weight = wrr->weight;
	if (wrr->priority) {
		cl->priority = wrr->priority-1;
		cl->cpriority = cl->priority;
		if (cl->priority >= cl->priority2)
			cl->priority2 = TC_CBQ_MAXPRIO-1;
	}

	cbq_addprio(q, cl);
	return 0;
}

static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl)
{
	switch (ovl->strategy) {
	case TC_CBQ_OVL_CLASSIC:
		cl->overlimit = cbq_ovl_classic;
		break;
	case TC_CBQ_OVL_DELAY:
		cl->overlimit = cbq_ovl_delay;
		break;
	case TC_CBQ_OVL_LOWPRIO:
		if (ovl->priority2-1 >= TC_CBQ_MAXPRIO ||
		    ovl->priority2-1 <= cl->priority)
			return -EINVAL;
		cl->priority2 = ovl->priority2-1;
		cl->overlimit = cbq_ovl_lowprio;
		break;
	case TC_CBQ_OVL_DROP:
		cl->overlimit = cbq_ovl_drop;
		break;
	case TC_CBQ_OVL_RCLASSIC:
		cl->overlimit = cbq_ovl_rclassic;
		break;
	default:
		return -EINVAL;
	}
	cl->penalty = ovl->penalty;
	return 0;
}
#ifdef CONFIG_NET_CLS_ACT
static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p)
{
	cl->police = p->police;

	if (cl->q->handle) {
		if (p->police == TC_POLICE_RECLASSIFY)
			cl->q->reshape_fail = cbq_reshape_fail;
		else
			cl->q->reshape_fail = NULL;
	}
	return 0;
}
#endif

static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
{
	cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
	return 0;
}

static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
	[TCA_CBQ_LSSOPT]	= { .len = sizeof(struct tc_cbq_lssopt) },
	[TCA_CBQ_WRROPT]	= { .len = sizeof(struct tc_cbq_wrropt) },
	[TCA_CBQ_FOPT]		= { .len = sizeof(struct tc_cbq_fopt) },
	[TCA_CBQ_OVL_STRATEGY]	= { .len = sizeof(struct tc_cbq_ovl) },
	[TCA_CBQ_RATE]		= { .len = sizeof(struct tc_ratespec) },
	[TCA_CBQ_RTAB]		= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
	[TCA_CBQ_POLICE]	= { .len = sizeof(struct tc_cbq_police) },
};

static int cbq_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_CBQ_MAX + 1];
	struct tc_ratespec *r;
	int err;

	err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy);
	if (err < 0)
		return err;

	if (tb[TCA_CBQ_RTAB] == NULL || tb[TCA_CBQ_RATE] == NULL)
		return -EINVAL;

	r = nla_data(tb[TCA_CBQ_RATE]);

	if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB])) == NULL)
		return -EINVAL;

	err = qdisc_class_hash_init(&q->clhash);
	if (err < 0)
		goto put_rtab;

	q->link.refcnt = 1;
	q->link.sibling = &q->link;
	q->link.common.classid = sch->handle;
	q->link.qdisc = sch;
	if (!(q->link.q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
					    &pfifo_qdisc_ops,
					    sch->handle)))
		q->link.q = &noop_qdisc;

	q->link.priority = TC_CBQ_MAXPRIO-1;
	q->link.priority2 = TC_CBQ_MAXPRIO-1;
	q->link.cpriority = TC_CBQ_MAXPRIO-1;
	q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC;
	q->link.overlimit = cbq_ovl_classic;
	q->link.allot = psched_mtu(qdisc_dev(sch));
	q->link.quantum = q->link.allot;
	q->link.weight = q->link.R_tab->rate.rate;

	q->link.ewma_log = TC_CBQ_DEF_EWMA;
	q->link.avpkt = q->link.allot/2;
	q->link.minidle = -0x7FFFFFFF;

	qdisc_watchdog_init(&q->watchdog, sch);
	hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
	q->delay_timer.function = cbq_undelay;
	q->toplevel = TC_CBQ_MAXLEVEL;
	q->now = psched_get_time();
	q->now_rt = q->now;

	cbq_link_class(&q->link);

	if (tb[TCA_CBQ_LSSOPT])
		cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));

	cbq_addprio(q, &q->link);
	return 0;

put_rtab:
	qdisc_put_rtab(q->link.R_tab);
	return err;
}
static __inline__ int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
{
	unsigned char *b = skb_tail_pointer(skb);

	NLA_PUT(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate);
	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static __inline__ int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
{
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_cbq_lssopt opt;

	opt.flags = 0;
	if (cl->borrow == NULL)
		opt.flags |= TCF_CBQ_LSS_BOUNDED;
	if (cl->share == NULL)
		opt.flags |= TCF_CBQ_LSS_ISOLATED;
	opt.ewma_log = cl->ewma_log;
	opt.level = cl->level;
	opt.avpkt = cl->avpkt;
	opt.maxidle = cl->maxidle;
	opt.minidle = (u32)(-cl->minidle);
	opt.offtime = cl->offtime;
	opt.change = ~0;
	NLA_PUT(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt);
	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static __inline__ int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
{
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_cbq_wrropt opt;

	opt.flags = 0;
	opt.allot = cl->allot;
	opt.priority = cl->priority+1;
	opt.cpriority = cl->cpriority+1;
	opt.weight = cl->weight;
	NLA_PUT(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt);
	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl)
{
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_cbq_ovl opt;

	opt.strategy = cl->ovl_strategy;
	opt.priority2 = cl->priority2+1;
	opt.pad = 0;
	opt.penalty = cl->penalty;
	NLA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt);
	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static __inline__ int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
{
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_cbq_fopt opt;

	if (cl->split || cl->defmap) {
		opt.split = cl->split ? cl->split->common.classid : 0;
		opt.defmap = cl->defmap;
		opt.defchange = ~0;
		NLA_PUT(skb, TCA_CBQ_FOPT, sizeof(opt), &opt);
	}
	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

#ifdef CONFIG_NET_CLS_ACT
static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)
{
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_cbq_police opt;

	if (cl->police) {
		opt.police = cl->police;
		opt.__res1 = 0;
		opt.__res2 = 0;
		NLA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt);
	}
	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}
#endif

static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
{
	if (cbq_dump_lss(skb, cl) < 0 ||
	    cbq_dump_rate(skb, cl) < 0 ||
	    cbq_dump_wrr(skb, cl) < 0 ||
	    cbq_dump_ovl(skb, cl) < 0 ||
#ifdef CONFIG_NET_CLS_ACT
	    cbq_dump_police(skb, cl) < 0 ||
#endif
	    cbq_dump_fopt(skb, cl) < 0)
		return -1;
	return 0;
}

static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct nlattr *nest;

	nest = nla_nest_start(skb, TCA_OPTIONS);
	if (nest == NULL)
		goto nla_put_failure;
	if (cbq_dump_attr(skb, &q->link) < 0)
		goto nla_put_failure;
	nla_nest_end(skb, nest);
	return skb->len;

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}
static int
cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
	struct cbq_sched_data *q = qdisc_priv(sch);

	q->link.xstats.avgidle = q->link.avgidle;
	return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
}

static int
cbq_dump_class(struct Qdisc *sch, unsigned long arg,
	       struct sk_buff *skb, struct tcmsg *tcm)
{
	struct cbq_class *cl = (struct cbq_class*)arg;
	struct nlattr *nest;

	if (cl->tparent)
		tcm->tcm_parent = cl->tparent->common.classid;
	else
		tcm->tcm_parent = TC_H_ROOT;
	tcm->tcm_handle = cl->common.classid;
	tcm->tcm_info = cl->q->handle;

	nest = nla_nest_start(skb, TCA_OPTIONS);
	if (nest == NULL)
		goto nla_put_failure;
	if (cbq_dump_attr(skb, cl) < 0)
		goto nla_put_failure;
	nla_nest_end(skb, nest);
	return skb->len;

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}

static int
cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
		     struct gnet_dump *d)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl = (struct cbq_class*)arg;

	cl->qstats.qlen = cl->q->q.qlen;
	cl->xstats.avgidle = cl->avgidle;
	cl->xstats.undertime = 0;

	if (cl->undertime != PSCHED_PASTPERFECT)
		cl->xstats.undertime = cl->undertime - q->now;

	if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
	    gnet_stats_copy_queue(d, &cl->qstats) < 0)
		return -1;

	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
}

static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		     struct Qdisc **old)
{
	struct cbq_class *cl = (struct cbq_class*)arg;

	if (cl) {
		if (new == NULL) {
			new = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
						&pfifo_qdisc_ops,
						cl->common.classid);
			if (new == NULL)
				return -ENOBUFS;
		} else {
#ifdef CONFIG_NET_CLS_ACT
			if (cl->police == TC_POLICE_RECLASSIFY)
				new->reshape_fail = cbq_reshape_fail;
#endif
		}
		sch_tree_lock(sch);
		*old = xchg(&cl->q, new);
		qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
		qdisc_reset(*old);
		sch_tree_unlock(sch);

		return 0;
	}
	return -ENOENT;
}

static struct Qdisc *
cbq_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct cbq_class *cl = (struct cbq_class*)arg;

	return cl ? cl->q : NULL;
}

static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
{
	struct cbq_class *cl = (struct cbq_class *)arg;

	if (cl->q->q.qlen == 0)
		cbq_deactivate_class(cl);
}

static unsigned long cbq_get(struct Qdisc *sch, u32 classid)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl = cbq_class_lookup(q, classid);

	if (cl) {
		cl->refcnt++;
		return (unsigned long)cl;
	}
	return 0;
}

static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(sch);

	BUG_TRAP(!cl->filters);

	tcf_destroy_chain(&cl->filter_list);
	qdisc_destroy(cl->q);
	qdisc_put_rtab(cl->R_tab);
	gen_kill_estimator(&cl->bstats, &cl->rate_est);
	if (cl != &q->link)
		kfree(cl);
}

static void
cbq_destroy(struct Qdisc* sch)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct hlist_node *n, *next;
	struct cbq_class *cl;
	unsigned h;

#ifdef CONFIG_NET_CLS_ACT
	q->rx_class = NULL;
#endif
	/*
	 * Filters must be destroyed first because we don't destroy the
	 * classes from root to leafs which means that filters can still
	 * be bound to classes which have been destroyed already. --TGR '04
	 */
	for (h = 0; h < q->clhash.hashsize; h++) {
		hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode)
			tcf_destroy_chain(&cl->filter_list);
	}
	for (h = 0; h < q->clhash.hashsize; h++) {
		hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[h],
					  common.hnode)
			cbq_destroy_class(sch, cl);
	}
	qdisc_class_hash_destroy(&q->clhash);
}

static void cbq_put(struct Qdisc *sch, unsigned long arg)
{
	struct cbq_class *cl = (struct cbq_class*)arg;

	if (--cl->refcnt == 0) {
#ifdef CONFIG_NET_CLS_ACT
		spinlock_t *root_lock = qdisc_root_lock(sch);
		struct cbq_sched_data *q = qdisc_priv(sch);

		spin_lock_bh(root_lock);
		if (q->rx_class == cl)
			q->rx_class = NULL;
		spin_unlock_bh(root_lock);
#endif

		cbq_destroy_class(sch, cl);
	}
}
static int
cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
		 unsigned long *arg)
{
	int err;
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl = (struct cbq_class*)*arg;
	struct nlattr *opt = tca[TCA_OPTIONS];
	struct nlattr *tb[TCA_CBQ_MAX + 1];
	struct cbq_class *parent;
	struct qdisc_rate_table *rtab = NULL;

	if (opt == NULL)
		return -EINVAL;

	err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy);
	if (err < 0)
		return err;

	if (cl) {
		/* Check parent */
		if (parentid) {
			if (cl->tparent &&
			    cl->tparent->common.classid != parentid)
				return -EINVAL;
			if (!cl->tparent && parentid != TC_H_ROOT)
				return -EINVAL;
		}

		if (tb[TCA_CBQ_RATE]) {
			rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB]);
			if (rtab == NULL)
				return -EINVAL;
		}

		/* Change class parameters */
		sch_tree_lock(sch);

		if (cl->next_alive != NULL)
			cbq_deactivate_class(cl);

		if (rtab) {
			rtab = xchg(&cl->R_tab, rtab);
			qdisc_put_rtab(rtab);
		}

		if (tb[TCA_CBQ_LSSOPT])
			cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));

		if (tb[TCA_CBQ_WRROPT]) {
			cbq_rmprio(q, cl);
			cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
		}

		if (tb[TCA_CBQ_OVL_STRATEGY])
			cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY]));

#ifdef CONFIG_NET_CLS_ACT
		if (tb[TCA_CBQ_POLICE])
			cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE]));
#endif

		if (tb[TCA_CBQ_FOPT])
			cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));

		if (cl->q->q.qlen)
			cbq_activate_class(cl);

		sch_tree_unlock(sch);

		if (tca[TCA_RATE])
			gen_replace_estimator(&cl->bstats, &cl->rate_est,
					      qdisc_root_lock(sch),
					      tca[TCA_RATE]);
		return 0;
	}

	if (parentid == TC_H_ROOT)
		return -EINVAL;

	if (tb[TCA_CBQ_WRROPT] == NULL || tb[TCA_CBQ_RATE] == NULL ||
	    tb[TCA_CBQ_LSSOPT] == NULL)
		return -EINVAL;

	rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB]);
	if (rtab == NULL)
		return -EINVAL;

	if (classid) {
		err = -EINVAL;
		if (TC_H_MAJ(classid^sch->handle) || cbq_class_lookup(q, classid))
			goto failure;
	} else {
		int i;
		classid = TC_H_MAKE(sch->handle,0x8000);

		for (i=0; i<0x8000; i++) {
			if (++q->hgenerator >= 0x8000)
				q->hgenerator = 1;
			if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
				break;
		}
		err = -ENOSR;
		if (i >= 0x8000)
			goto failure;
		classid = classid|q->hgenerator;
	}

	parent = &q->link;
	if (parentid) {
		parent = cbq_class_lookup(q, parentid);
		err = -EINVAL;
		if (parent == NULL)
			goto failure;
	}

	err = -ENOBUFS;
	cl = kzalloc(sizeof(*cl), GFP_KERNEL);
	if (cl == NULL)
		goto failure;
	cl->R_tab = rtab;
	rtab = NULL;
	cl->refcnt = 1;
	if (!(cl->q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
					&pfifo_qdisc_ops, classid)))
		cl->q = &noop_qdisc;
	cl->common.classid = classid;
	cl->tparent = parent;
	cl->qdisc = sch;
	cl->allot = parent->allot;
	cl->quantum = cl->allot;
	cl->weight = cl->R_tab->rate.rate;

	sch_tree_lock(sch);
	cbq_link_class(cl);
	cl->borrow = cl->tparent;
	if (cl->tparent != &q->link)
		cl->share = cl->tparent;
	cbq_adjust_levels(parent);
	cl->minidle = -0x7FFFFFFF;
	cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
	cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
	if (cl->ewma_log==0)
		cl->ewma_log = q->link.ewma_log;
	if (cl->maxidle==0)
		cl->maxidle = q->link.maxidle;
	if (cl->avpkt==0)
		cl->avpkt = q->link.avpkt;
	cl->overlimit = cbq_ovl_classic;
	if (tb[TCA_CBQ_OVL_STRATEGY])
		cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY]));
#ifdef CONFIG_NET_CLS_ACT
	if (tb[TCA_CBQ_POLICE])
		cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE]));
#endif
	if (tb[TCA_CBQ_FOPT])
		cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
	sch_tree_unlock(sch);

	qdisc_class_hash_grow(sch, &q->clhash);

	if (tca[TCA_RATE])
		gen_new_estimator(&cl->bstats, &cl->rate_est,
				  qdisc_root_lock(sch), tca[TCA_RATE]);

	*arg = (unsigned long)cl;
	return 0;

failure:
	qdisc_put_rtab(rtab);
	return err;
}
static int cbq_delete(struct Qdisc *sch, unsigned long arg)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl = (struct cbq_class*)arg;
	unsigned int qlen;

	if (cl->filters || cl->children || cl == &q->link)
		return -EBUSY;

	sch_tree_lock(sch);

	qlen = cl->q->q.qlen;
	qdisc_reset(cl->q);
	qdisc_tree_decrease_qlen(cl->q, qlen);

	if (cl->next_alive)
		cbq_deactivate_class(cl);

	if (q->tx_borrowed == cl)
		q->tx_borrowed = q->tx_class;
	if (q->tx_class == cl) {
		q->tx_class = NULL;
		q->tx_borrowed = NULL;
	}
#ifdef CONFIG_NET_CLS_ACT
	if (q->rx_class == cl)
		q->rx_class = NULL;
#endif

	cbq_unlink_class(cl);
	cbq_adjust_levels(cl->tparent);
	cl->defmap = 0;
	cbq_sync_defmap(cl);

	cbq_rmprio(q, cl);
	sch_tree_unlock(sch);

	if (--cl->refcnt == 0)
		cbq_destroy_class(sch, cl);

	return 0;
}

static struct tcf_proto **cbq_find_tcf(struct Qdisc *sch, unsigned long arg)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl = (struct cbq_class *)arg;

	if (cl == NULL)
		cl = &q->link;

	return &cl->filter_list;
}

static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
				     u32 classid)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *p = (struct cbq_class*)parent;
	struct cbq_class *cl = cbq_class_lookup(q, classid);

	if (cl) {
		if (p && p->level <= cl->level)
			return 0;
		cl->filters++;
		return (unsigned long)cl;
	}
	return 0;
}

static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
{
	struct cbq_class *cl = (struct cbq_class*)arg;

	cl->filters--;
}

static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl;
	struct hlist_node *n;
	unsigned h;

	if (arg->stop)
		return;

	for (h = 0; h < q->clhash.hashsize; h++) {
		hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) {
			if (arg->count < arg->skip) {
				arg->count++;
				continue;
			}
			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
				arg->stop = 1;
				return;
			}
			arg->count++;
		}
	}
}

static const struct Qdisc_class_ops cbq_class_ops = {
	.graft		=	cbq_graft,
	.leaf		=	cbq_leaf,
	.qlen_notify	=	cbq_qlen_notify,
	.get		=	cbq_get,
	.put		=	cbq_put,
	.change		=	cbq_change_class,
	.delete		=	cbq_delete,
	.walk		=	cbq_walk,
	.tcf_chain	=	cbq_find_tcf,
	.bind_tcf	=	cbq_bind_filter,
	.unbind_tcf	=	cbq_unbind_filter,
	.dump		=	cbq_dump_class,
	.dump_stats	=	cbq_dump_class_stats,
};

static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
	.next		=	NULL,
	.cl_ops		=	&cbq_class_ops,
	.id		=	"cbq",
	.priv_size	=	sizeof(struct cbq_sched_data),
	.enqueue	=	cbq_enqueue,
	.dequeue	=	cbq_dequeue,
	.requeue	=	cbq_requeue,
	.drop		=	cbq_drop,
	.init		=	cbq_init,
	.reset		=	cbq_reset,
	.destroy	=	cbq_destroy,
	.change		=	NULL,
	.dump		=	cbq_dump,
	.dump_stats	=	cbq_dump_stats,
	.owner		=	THIS_MODULE,
};

static int __init cbq_module_init(void)
{
	return register_qdisc(&cbq_qdisc_ops);
}
static void __exit cbq_module_exit(void)
{
	unregister_qdisc(&cbq_qdisc_ops);
}
module_init(cbq_module_init)
module_exit(cbq_module_exit)
MODULE_LICENSE("GPL");