1 /*
2 * net/sched/sch_cbq.c Class-Based Queueing discipline.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 #include <linux/module.h>
14 #include <linux/slab.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/string.h>
18 #include <linux/errno.h>
19 #include <linux/skbuff.h>
20 #include <net/netlink.h>
21 #include <net/pkt_sched.h>
24 /* Class-Based Queueing (CBQ) algorithm.
25 =======================================
27 Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
28 Management Models for Packet Networks",
29 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
31 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
33 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
34 Parameters", 1996
36 [4] Sally Floyd and Michael Speer, "Experimental Results
37 for Class-Based Queueing", 1998, not published.
39 -----------------------------------------------------------------------
41 Algorithm skeleton was taken from NS simulator cbq.cc.
42 If someone wants to check this code against the LBL version,
43 he should take into account that ONLY the skeleton was borrowed,
44 the implementation is different. In particular:
46 --- The WRR algorithm is different. Our version looks more
47 reasonable (I hope) and works when quanta are allowed to be
48 less than MTU, which is always the case when real-time classes
49 have small rates. Note that the statement of [3] is
50 incomplete, delay may actually be estimated even if class
51 per-round allotment is less than MTU. Namely, if per-round
52 allotment is W*r_i, and r_1+...+r_k = r < 1
54 delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
56 In the worst case we have IntServ estimate with D = W*r+k*MTU
57 and C = MTU*r. The proof (if correct at all) is trivial.
60 --- It seems that cbq-2.0 is not very accurate. At least, I cannot
61 interpret some places, which look like wrong translations
62 from NS. Anyone is advised to find these differences
63 and explain to me why I am wrong 8).
65 --- Linux has no EOI event, so that we cannot estimate true class
66 idle time. The workaround is to consider the next dequeue event
67 as a sign that the previous packet is finished. This is wrong because of
68 internal device queueing, but on a permanently loaded link it is true.
69 Moreover, combined with clock integrator, this scheme looks
70 very close to an ideal solution. */
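/* Worked example of the delay bound above (illustrative numbers, not from
   the original sources): take B = 1.25 Mbyte/s (10 Mbit/s), k = 2 classes
   with r_1 = r_2 = 0.05 (so r = 0.1), MTU = 1500 bytes, and W chosen so the
   per-round allotment W*r_i = 500 bytes (W = 10000). Then

	delay_1 <= (ceil(1500/500)*1000 + 1000 + 2*1500)/1250000
		 = 7000/1250000 s ~= 5.6 ms

   even though the per-round allotment (500 bytes) is below the MTU. */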
72 struct cbq_sched_data;
75 struct cbq_class {
76 struct Qdisc_class_common common;
77 struct cbq_class *next_alive; /* next class with backlog in this priority band */
79 /* Parameters */
80 unsigned char priority; /* class priority */
81 unsigned char priority2; /* priority to be used after overlimit */
82 unsigned char ewma_log; /* time constant for idle time calculation */
83 unsigned char ovl_strategy;
84 #ifdef CONFIG_NET_CLS_ACT
85 unsigned char police;
86 #endif
88 u32 defmap;
90 /* Link-sharing scheduler parameters */
91 long maxidle; /* Class parameters: see below. */
92 long offtime;
93 long minidle;
94 u32 avpkt;
95 struct qdisc_rate_table *R_tab;
97 /* Overlimit strategy parameters */
98 void (*overlimit)(struct cbq_class *cl);
99 psched_tdiff_t penalty;
101 /* General scheduler (WRR) parameters */
102 long allot;
103 long quantum; /* Allotment per WRR round */
104 long weight; /* Relative allotment: see below */
106 struct Qdisc *qdisc; /* Ptr to CBQ discipline */
107 struct cbq_class *split; /* Ptr to split node */
108 struct cbq_class *share; /* Ptr to LS parent in the class tree */
109 struct cbq_class *tparent; /* Ptr to tree parent in the class tree */
110 struct cbq_class *borrow; /* NULL if class is bandwidth limited;
111 parent otherwise */
112 struct cbq_class *sibling; /* Sibling chain */
113 struct cbq_class *children; /* Pointer to children chain */
115 struct Qdisc *q; /* Elementary queueing discipline */
118 /* Variables */
119 unsigned char cpriority; /* Effective priority */
120 unsigned char delayed;
121 unsigned char level; /* level of the class in hierarchy:
122 0 for leaf classes, and maximal
123 level of children + 1 for nodes.
126 psched_time_t last; /* Last end of service */
127 psched_time_t undertime;
128 long avgidle;
129 long deficit; /* Saved deficit for WRR */
130 psched_time_t penalized;
131 struct gnet_stats_basic_packed bstats;
132 struct gnet_stats_queue qstats;
133 struct gnet_stats_rate_est64 rate_est;
134 struct tc_cbq_xstats xstats;
136 struct tcf_proto *filter_list;
138 int refcnt;
139 int filters;
141 struct cbq_class *defaults[TC_PRIO_MAX + 1];
144 struct cbq_sched_data {
145 struct Qdisc_class_hash clhash; /* Hash table of all classes */
146 int nclasses[TC_CBQ_MAXPRIO + 1];
147 unsigned int quanta[TC_CBQ_MAXPRIO + 1];
149 struct cbq_class link;
151 unsigned int activemask;
152 struct cbq_class *active[TC_CBQ_MAXPRIO + 1]; /* List of all classes
153 with backlog */
155 #ifdef CONFIG_NET_CLS_ACT
156 struct cbq_class *rx_class;
157 #endif
158 struct cbq_class *tx_class;
159 struct cbq_class *tx_borrowed;
160 int tx_len;
161 psched_time_t now; /* Cached timestamp */
162 unsigned int pmask;
164 struct hrtimer delay_timer;
165 struct qdisc_watchdog watchdog; /* Watchdog timer,
166 started when CBQ has
167 backlog, but cannot
168 transmit just now */
169 psched_tdiff_t wd_expires;
170 int toplevel;
171 u32 hgenerator;
175 #define L2T(cl, len) qdisc_l2t((cl)->R_tab, len)
177 static inline struct cbq_class *
178 cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
180 struct Qdisc_class_common *clc;
182 clc = qdisc_class_find(&q->clhash, classid);
183 if (clc == NULL)
184 return NULL;
185 return container_of(clc, struct cbq_class, common);
188 #ifdef CONFIG_NET_CLS_ACT
190 static struct cbq_class *
191 cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
193 struct cbq_class *cl;
195 for (cl = this->tparent; cl; cl = cl->tparent) {
196 struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT];
198 if (new != NULL && new != this)
199 return new;
201 return NULL;
204 #endif
206 /* Classify packet. The procedure is pretty complicated, but
207 * it allows us to combine link sharing and priority scheduling
208 * transparently.
210 * Namely, you can put link sharing rules (e.g. route based) at the root of CBQ,
211 * so that it resolves to split nodes. Then packets are classified
212 * by logical priority, or a more specific classifier may be attached
213 * to the split node.
216 static struct cbq_class *
217 cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
219 struct cbq_sched_data *q = qdisc_priv(sch);
220 struct cbq_class *head = &q->link;
221 struct cbq_class **defmap;
222 struct cbq_class *cl = NULL;
223 u32 prio = skb->priority;
224 struct tcf_result res;
227 * Step 1. If skb->priority points to one of our classes, use it.
229 if (TC_H_MAJ(prio ^ sch->handle) == 0 &&
230 (cl = cbq_class_lookup(q, prio)) != NULL)
231 return cl;
233 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
234 for (;;) {
235 int result = 0;
236 defmap = head->defaults;
239 * Step 2+n. Apply classifier.
241 if (!head->filter_list ||
242 (result = tc_classify_compat(skb, head->filter_list, &res)) < 0)
243 goto fallback;
245 cl = (void *)res.class;
246 if (!cl) {
247 if (TC_H_MAJ(res.classid))
248 cl = cbq_class_lookup(q, res.classid);
249 else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
250 cl = defmap[TC_PRIO_BESTEFFORT];
252 if (cl == NULL)
253 goto fallback;
255 if (cl->level >= head->level)
256 goto fallback;
257 #ifdef CONFIG_NET_CLS_ACT
258 switch (result) {
259 case TC_ACT_QUEUED:
260 case TC_ACT_STOLEN:
261 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
262 case TC_ACT_SHOT:
263 return NULL;
264 case TC_ACT_RECLASSIFY:
265 return cbq_reclassify(skb, cl);
267 #endif
268 if (cl->level == 0)
269 return cl;
272 * Step 3+n. If classifier selected a link sharing class,
273 * apply its agency-specific classifier.
274 * Repeat this procedure until we hit a leaf node.
276 head = cl;
279 fallback:
280 cl = head;
283 * Step 4. No success...
285 if (TC_H_MAJ(prio) == 0 &&
286 !(cl = head->defaults[prio & TC_PRIO_MAX]) &&
287 !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
288 return head;
290 return cl;
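/* Illustrative tc(8) configuration that exercises the split/defmap fallback
 * above (device name, rates and class ids are examples only):
 *
 *	tc qdisc add dev eth0 root handle 1: cbq bandwidth 10Mbit avpkt 1000
 *	tc class add dev eth0 parent 1: classid 1:1 cbq bandwidth 10Mbit \
 *		rate 1Mbit allot 1514 prio 3 avpkt 1000 \
 *		split 1:0 defmap c0
 *
 * Packets that no filter matches are then mapped through the defaults[] of
 * the split node according to skb->priority, as implemented in cbq_classify().
 */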
294 * A packet has just been enqueued into an empty class.
295 * cbq_activate_class adds the class to the tail of the active class
296 * list of its priority band.
299 static inline void cbq_activate_class(struct cbq_class *cl)
301 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
302 int prio = cl->cpriority;
303 struct cbq_class *cl_tail;
305 cl_tail = q->active[prio];
306 q->active[prio] = cl;
308 if (cl_tail != NULL) {
309 cl->next_alive = cl_tail->next_alive;
310 cl_tail->next_alive = cl;
311 } else {
312 cl->next_alive = cl;
313 q->activemask |= (1<<prio);
318 * Unlink the class from the active chain.
319 * Note that the same unlinking is also done directly in cbq_dequeue*
320 * during the round-robin procedure.
323 static void cbq_deactivate_class(struct cbq_class *this)
325 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
326 int prio = this->cpriority;
327 struct cbq_class *cl;
328 struct cbq_class *cl_prev = q->active[prio];
330 do {
331 cl = cl_prev->next_alive;
332 if (cl == this) {
333 cl_prev->next_alive = cl->next_alive;
334 cl->next_alive = NULL;
336 if (cl == q->active[prio]) {
337 q->active[prio] = cl_prev;
338 if (cl == q->active[prio]) {
339 q->active[prio] = NULL;
340 q->activemask &= ~(1<<prio);
341 return;
344 return;
346 } while ((cl_prev = cl) != q->active[prio]);
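/* Top-level link-sharing cutoff (cf. [1]): an ancestor may lend bandwidth
 * only if its level does not exceed q->toplevel (see cbq_under_limit()).
 * When a class receives a packet, walk its borrow chain; if some node there
 * is already underlimit (its undertime has passed), lower the cutoff to that
 * node's level so that dequeue is allowed to borrow from it.
 */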
349 static void
350 cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
352 int toplevel = q->toplevel;
354 if (toplevel > cl->level && !(qdisc_is_throttled(cl->q))) {
355 psched_time_t now = psched_get_time();
357 do {
358 if (cl->undertime < now) {
359 q->toplevel = cl->level;
360 return;
362 } while ((cl = cl->borrow) != NULL && toplevel > cl->level);
366 static int
367 cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
369 struct cbq_sched_data *q = qdisc_priv(sch);
370 int uninitialized_var(ret);
371 struct cbq_class *cl = cbq_classify(skb, sch, &ret);
373 #ifdef CONFIG_NET_CLS_ACT
374 q->rx_class = cl;
375 #endif
376 if (cl == NULL) {
377 if (ret & __NET_XMIT_BYPASS)
378 sch->qstats.drops++;
379 kfree_skb(skb);
380 return ret;
383 #ifdef CONFIG_NET_CLS_ACT
384 cl->q->__parent = sch;
385 #endif
386 ret = qdisc_enqueue(skb, cl->q);
387 if (ret == NET_XMIT_SUCCESS) {
388 sch->q.qlen++;
389 cbq_mark_toplevel(q, cl);
390 if (!cl->next_alive)
391 cbq_activate_class(cl);
392 return ret;
395 if (net_xmit_drop_count(ret)) {
396 sch->qstats.drops++;
397 cbq_mark_toplevel(q, cl);
398 cl->qstats.drops++;
400 return ret;
403 /* Overlimit actions */
405 /* TC_CBQ_OVL_CLASSIC: (default) penalize leaf class by adding offtime */
407 static void cbq_ovl_classic(struct cbq_class *cl)
409 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
410 psched_tdiff_t delay = cl->undertime - q->now;
412 if (!cl->delayed) {
413 delay += cl->offtime;
416 * Class goes to sleep, so that it will have no
417 * chance to work off its avgidle. Let's forgive it 8)
419 * BTW cbq-2.0 has a bug in this
420 * place: apparently they forgot to shift it by cl->ewma_log.
422 if (cl->avgidle < 0)
423 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
424 if (cl->avgidle < cl->minidle)
425 cl->avgidle = cl->minidle;
426 if (delay <= 0)
427 delay = 1;
428 cl->undertime = q->now + delay;
430 cl->xstats.overactions++;
431 cl->delayed = 1;
433 if (q->wd_expires == 0 || q->wd_expires > delay)
434 q->wd_expires = delay;
436 /* Dirty work! We must schedule wakeups based on
437 * real available rate, rather than leaf rate,
438 * which may be tiny (even zero).
440 if (q->toplevel == TC_CBQ_MAXLEVEL) {
441 struct cbq_class *b;
442 psched_tdiff_t base_delay = q->wd_expires;
444 for (b = cl->borrow; b; b = b->borrow) {
445 delay = b->undertime - q->now;
446 if (delay < base_delay) {
447 if (delay <= 0)
448 delay = 1;
449 base_delay = delay;
453 q->wd_expires = base_delay;
457 /* TC_CBQ_OVL_RCLASSIC: penalize classes in the hierarchy by offtime when
458 * they go overlimit
461 static void cbq_ovl_rclassic(struct cbq_class *cl)
463 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
464 struct cbq_class *this = cl;
466 do {
467 if (cl->level > q->toplevel) {
468 cl = NULL;
469 break;
471 } while ((cl = cl->borrow) != NULL);
473 if (cl == NULL)
474 cl = this;
475 cbq_ovl_classic(cl);
478 /* TC_CBQ_OVL_DELAY: delay the class until it goes underlimit */
480 static void cbq_ovl_delay(struct cbq_class *cl)
482 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
483 psched_tdiff_t delay = cl->undertime - q->now;
485 if (test_bit(__QDISC_STATE_DEACTIVATED,
486 &qdisc_root_sleeping(cl->qdisc)->state))
487 return;
489 if (!cl->delayed) {
490 psched_time_t sched = q->now;
491 ktime_t expires;
493 delay += cl->offtime;
494 if (cl->avgidle < 0)
495 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
496 if (cl->avgidle < cl->minidle)
497 cl->avgidle = cl->minidle;
498 cl->undertime = q->now + delay;
500 if (delay > 0) {
501 sched += delay + cl->penalty;
502 cl->penalized = sched;
503 cl->cpriority = TC_CBQ_MAXPRIO;
504 q->pmask |= (1<<TC_CBQ_MAXPRIO);
506 expires = ns_to_ktime(PSCHED_TICKS2NS(sched));
507 if (hrtimer_try_to_cancel(&q->delay_timer) &&
508 ktime_to_ns(ktime_sub(
509 hrtimer_get_expires(&q->delay_timer),
510 expires)) > 0)
511 hrtimer_set_expires(&q->delay_timer, expires);
512 hrtimer_restart(&q->delay_timer);
513 cl->delayed = 1;
514 cl->xstats.overactions++;
515 return;
517 delay = 1;
519 if (q->wd_expires == 0 || q->wd_expires > delay)
520 q->wd_expires = delay;
523 /* TC_CBQ_OVL_LOWPRIO: penalize class by lowering its priority band */
525 static void cbq_ovl_lowprio(struct cbq_class *cl)
527 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
529 cl->penalized = q->now + cl->penalty;
531 if (cl->cpriority != cl->priority2) {
532 cl->cpriority = cl->priority2;
533 q->pmask |= (1<<cl->cpriority);
534 cl->xstats.overactions++;
536 cbq_ovl_classic(cl);
539 /* TC_CBQ_OVL_DROP: penalize class by dropping */
541 static void cbq_ovl_drop(struct cbq_class *cl)
543 if (cl->q->ops->drop)
544 if (cl->q->ops->drop(cl->q))
545 cl->qdisc->q.qlen--;
546 cl->xstats.overactions++;
547 cbq_ovl_classic(cl);
550 static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
551 psched_time_t now)
553 struct cbq_class *cl;
554 struct cbq_class *cl_prev = q->active[prio];
555 psched_time_t sched = now;
557 if (cl_prev == NULL)
558 return 0;
560 do {
561 cl = cl_prev->next_alive;
562 if (now - cl->penalized > 0) {
563 cl_prev->next_alive = cl->next_alive;
564 cl->next_alive = NULL;
565 cl->cpriority = cl->priority;
566 cl->delayed = 0;
567 cbq_activate_class(cl);
569 if (cl == q->active[prio]) {
570 q->active[prio] = cl_prev;
571 if (cl == q->active[prio]) {
572 q->active[prio] = NULL;
573 return 0;
577 cl = cl_prev->next_alive;
578 } else if (sched - cl->penalized > 0)
579 sched = cl->penalized;
580 } while ((cl_prev = cl) != q->active[prio]);
582 return sched - now;
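/* Delay timer handler: scan the priority bands recorded in q->pmask,
 * reactivate classes whose penalty period has expired (cbq_undelay_prio()
 * above), re-arm the timer for the earliest still-pending penalty, and kick
 * the root qdisc so that dequeue runs again.
 */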
585 static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
587 struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
588 delay_timer);
589 struct Qdisc *sch = q->watchdog.qdisc;
590 psched_time_t now;
591 psched_tdiff_t delay = 0;
592 unsigned int pmask;
594 now = psched_get_time();
596 pmask = q->pmask;
597 q->pmask = 0;
599 while (pmask) {
600 int prio = ffz(~pmask);
601 psched_tdiff_t tmp;
603 pmask &= ~(1<<prio);
605 tmp = cbq_undelay_prio(q, prio, now);
606 if (tmp > 0) {
607 q->pmask |= 1<<prio;
608 if (tmp < delay || delay == 0)
609 delay = tmp;
613 if (delay) {
614 ktime_t time;
616 time = ktime_set(0, 0);
617 time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay));
618 hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS);
621 qdisc_unthrottled(sch);
622 __netif_schedule(qdisc_root(sch));
623 return HRTIMER_NORESTART;
626 #ifdef CONFIG_NET_CLS_ACT
627 static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
629 struct Qdisc *sch = child->__parent;
630 struct cbq_sched_data *q = qdisc_priv(sch);
631 struct cbq_class *cl = q->rx_class;
633 q->rx_class = NULL;
635 if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) {
636 int ret;
638 cbq_mark_toplevel(q, cl);
640 q->rx_class = cl;
641 cl->q->__parent = sch;
643 ret = qdisc_enqueue(skb, cl->q);
644 if (ret == NET_XMIT_SUCCESS) {
645 sch->q.qlen++;
646 if (!cl->next_alive)
647 cbq_activate_class(cl);
648 return 0;
650 if (net_xmit_drop_count(ret))
651 sch->qstats.drops++;
652 return 0;
655 sch->qstats.drops++;
656 return -1;
658 #endif
661 * This is a mission-critical procedure.
663 * We "regenerate" the toplevel cutoff if the transmitting class
664 * has backlog and is not regulated. This is not part of the
665 * original CBQ description, but looks more reasonable.
666 * It may well be wrong; the question needs further investigation.
669 static inline void
670 cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
671 struct cbq_class *borrowed)
673 if (cl && q->toplevel >= borrowed->level) {
674 if (cl->q->q.qlen > 1) {
675 do {
676 if (borrowed->undertime == PSCHED_PASTPERFECT) {
677 q->toplevel = borrowed->level;
678 return;
680 } while ((borrowed = borrowed->borrow) != NULL);
682 #if 0
683 /* It is not necessary now. Uncommenting it
684 will save CPU cycles, but decrease fairness.
686 q->toplevel = TC_CBQ_MAXLEVEL;
687 #endif
691 static void
692 cbq_update(struct cbq_sched_data *q)
694 struct cbq_class *this = q->tx_class;
695 struct cbq_class *cl = this;
696 int len = q->tx_len;
697 psched_time_t now;
699 q->tx_class = NULL;
700 /* Time integrator. We calculate EOS time
701 * by adding expected packet transmission time.
703 now = q->now + L2T(&q->link, len);
705 for ( ; cl; cl = cl->share) {
706 long avgidle = cl->avgidle;
707 long idle;
709 cl->bstats.packets++;
710 cl->bstats.bytes += len;
713 * (now - last) is total time between packet right edges.
714 * (last_pktlen/rate) is "virtual" busy time, so that
716 * idle = (now - last) - last_pktlen/rate
719 idle = now - cl->last;
720 if ((unsigned long)idle > 128*1024*1024) {
721 avgidle = cl->maxidle;
722 } else {
723 idle -= L2T(cl, len);
725 /* true_avgidle := (1-W)*true_avgidle + W*idle,
726 * where W=2^{-ewma_log}. But cl->avgidle is scaled:
727 * cl->avgidle == true_avgidle/W,
728 * hence:
730 avgidle += idle - (avgidle>>cl->ewma_log);
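/* The scaled update above follows from the definition: with
 * A = true_avgidle/W,
 *	A' = ((1-W)*true_avgidle + W*idle)/W = (1-W)*A + idle
 *	   = A + idle - W*A = A + idle - (A >> ewma_log),
 * since W = 2^{-ewma_log}.
 */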
733 if (avgidle <= 0) {
734 /* Overlimit or at-limit */
736 if (avgidle < cl->minidle)
737 avgidle = cl->minidle;
739 cl->avgidle = avgidle;
741 /* Calculate the expected time when this class
742 * will be allowed to send.
743 * It will occur when:
744 * (1-W)*true_avgidle + W*delay = 0, i.e.
745 * idle = (1/W - 1)*(-true_avgidle)
746 * or
747 * idle = (1 - W)*(-cl->avgidle);
749 idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
752 * That is not all.
753 * To maintain the rate allocated to the class,
754 * we add to undertime the virtual clock time
755 * necessary to complete the transmitted packet.
756 * (len/phys_bandwidth has already elapsed
757 * by the moment of cbq_update.)
760 idle -= L2T(&q->link, len);
761 idle += L2T(cl, len);
763 cl->undertime = now + idle;
764 } else {
765 /* Underlimit */
767 cl->undertime = PSCHED_PASTPERFECT;
768 if (avgidle > cl->maxidle)
769 cl->avgidle = cl->maxidle;
770 else
771 cl->avgidle = avgidle;
773 if ((s64)(now - cl->last) > 0)
774 cl->last = now;
777 cbq_update_toplevel(q, this, q->tx_borrowed);
780 static inline struct cbq_class *
781 cbq_under_limit(struct cbq_class *cl)
783 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
784 struct cbq_class *this_cl = cl;
786 if (cl->tparent == NULL)
787 return cl;
789 if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
790 cl->delayed = 0;
791 return cl;
794 do {
795 /* This is a very suspicious place. Currently the overlimit
796 * action is generated for non-bounded classes
797 * only if the link is completely congested.
798 * Though this agrees with the ancestor-only paradigm,
799 * it looks very stupid. In particular,
800 * it means that this chunk of code will either
801 * never be called or will result in strong amplification
802 * of burstiness. Dangerous, silly, and yet
803 * no other solution exists.
805 cl = cl->borrow;
806 if (!cl) {
807 this_cl->qstats.overlimits++;
808 this_cl->overlimit(this_cl);
809 return NULL;
811 if (cl->level > q->toplevel)
812 return NULL;
813 } while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);
815 cl->delayed = 0;
816 return cl;
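/* One WRR pass over the active list of a priority band: each backlogged
 * class may send while its deficit is positive; the deficit is decreased by
 * the packet length on every dequeue and replenished by cl->quantum when it
 * runs out. Classes that are overlimit (cbq_under_limit() above) or whose
 * queue turns out to be empty are skipped or unlinked on the fly.
 */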
819 static inline struct sk_buff *
820 cbq_dequeue_prio(struct Qdisc *sch, int prio)
822 struct cbq_sched_data *q = qdisc_priv(sch);
823 struct cbq_class *cl_tail, *cl_prev, *cl;
824 struct sk_buff *skb;
825 int deficit;
827 cl_tail = cl_prev = q->active[prio];
828 cl = cl_prev->next_alive;
830 do {
831 deficit = 0;
833 /* Start round */
834 do {
835 struct cbq_class *borrow = cl;
837 if (cl->q->q.qlen &&
838 (borrow = cbq_under_limit(cl)) == NULL)
839 goto skip_class;
841 if (cl->deficit <= 0) {
842 /* Class exhausted its allotment for
843 * this round. Switch to the next one.
845 deficit = 1;
846 cl->deficit += cl->quantum;
847 goto next_class;
850 skb = cl->q->dequeue(cl->q);
852 /* Class did not give us any skb :-(
853 * It can occur even if cl->q->q.qlen != 0,
854 * e.g. if cl->q == "tbf"
856 if (skb == NULL)
857 goto skip_class;
859 cl->deficit -= qdisc_pkt_len(skb);
860 q->tx_class = cl;
861 q->tx_borrowed = borrow;
862 if (borrow != cl) {
863 #ifndef CBQ_XSTATS_BORROWS_BYTES
864 borrow->xstats.borrows++;
865 cl->xstats.borrows++;
866 #else
867 borrow->xstats.borrows += qdisc_pkt_len(skb);
868 cl->xstats.borrows += qdisc_pkt_len(skb);
869 #endif
871 q->tx_len = qdisc_pkt_len(skb);
873 if (cl->deficit <= 0) {
874 q->active[prio] = cl;
875 cl = cl->next_alive;
876 cl->deficit += cl->quantum;
878 return skb;
880 skip_class:
881 if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
882 /* Class is empty or penalized.
883 * Unlink it from active chain.
885 cl_prev->next_alive = cl->next_alive;
886 cl->next_alive = NULL;
888 /* Did cl_tail point to it? */
889 if (cl == cl_tail) {
890 /* Repair it! */
891 cl_tail = cl_prev;
893 /* Was it the last class in this band? */
894 if (cl == cl_tail) {
895 /* Kill the band! */
896 q->active[prio] = NULL;
897 q->activemask &= ~(1<<prio);
898 if (cl->q->q.qlen)
899 cbq_activate_class(cl);
900 return NULL;
903 q->active[prio] = cl_tail;
905 if (cl->q->q.qlen)
906 cbq_activate_class(cl);
908 cl = cl_prev;
911 next_class:
912 cl_prev = cl;
913 cl = cl->next_alive;
914 } while (cl_prev != cl_tail);
915 } while (deficit);
917 q->active[prio] = cl_prev;
919 return NULL;
922 static inline struct sk_buff *
923 cbq_dequeue_1(struct Qdisc *sch)
925 struct cbq_sched_data *q = qdisc_priv(sch);
926 struct sk_buff *skb;
927 unsigned int activemask;
929 activemask = q->activemask & 0xFF;
930 while (activemask) {
931 int prio = ffz(~activemask);
932 activemask &= ~(1<<prio);
933 skb = cbq_dequeue_prio(sch, prio);
934 if (skb)
935 return skb;
937 return NULL;
940 static struct sk_buff *
941 cbq_dequeue(struct Qdisc *sch)
943 struct sk_buff *skb;
944 struct cbq_sched_data *q = qdisc_priv(sch);
945 psched_time_t now;
947 now = psched_get_time();
949 if (q->tx_class)
950 cbq_update(q);
952 q->now = now;
954 for (;;) {
955 q->wd_expires = 0;
957 skb = cbq_dequeue_1(sch);
958 if (skb) {
959 qdisc_bstats_update(sch, skb);
960 sch->q.qlen--;
961 qdisc_unthrottled(sch);
962 return skb;
965 /* All the classes are overlimit.
967 * This is possible if:
969 * 1. The scheduler is empty.
970 * 2. The toplevel cutoff inhibited borrowing.
971 * 3. The root class is overlimit.
973 * Reset conditions 2 and 3 and retry.
975 * Note that NS and cbq-2.0 are buggy: peeking
976 * an arbitrary class is appropriate for ancestor-only
977 * sharing, but not for the toplevel algorithm.
979 * Our version is better but slower, because it requires
980 * two passes; this is unavoidable with top-level sharing.
983 if (q->toplevel == TC_CBQ_MAXLEVEL &&
984 q->link.undertime == PSCHED_PASTPERFECT)
985 break;
987 q->toplevel = TC_CBQ_MAXLEVEL;
988 q->link.undertime = PSCHED_PASTPERFECT;
991 /* No packets in the scheduler, or nobody wants to give them to us :-(
992 * Sigh... start the watchdog timer in the latter case.
995 if (sch->q.qlen) {
996 sch->qstats.overlimits++;
997 if (q->wd_expires)
998 qdisc_watchdog_schedule(&q->watchdog,
999 now + q->wd_expires);
1001 return NULL;
1004 /* CBQ class maintenance routines */
1006 static void cbq_adjust_levels(struct cbq_class *this)
1008 if (this == NULL)
1009 return;
1011 do {
1012 int level = 0;
1013 struct cbq_class *cl;
1015 cl = this->children;
1016 if (cl) {
1017 do {
1018 if (cl->level > level)
1019 level = cl->level;
1020 } while ((cl = cl->sibling) != this->children);
1022 this->level = level + 1;
1023 } while ((this = this->tparent) != NULL);
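/* Recompute per-class WRR quanta for one priority band. q->quanta[prio] is
 * the sum of the weights in that band, so each class ends up with
 *	quantum = allot * weight * nclasses / sum_of_weights
 * i.e. its allot scaled by how its weight compares to the band average.
 */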
1026 static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
1028 struct cbq_class *cl;
1029 unsigned int h;
1031 if (q->quanta[prio] == 0)
1032 return;
1034 for (h = 0; h < q->clhash.hashsize; h++) {
1035 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1036 /* BUGGGG... Beware! This expression suffers from
1037 * arithmetic overflows!
1039 if (cl->priority == prio) {
1040 cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
1041 q->quanta[prio];
1043 if (cl->quantum <= 0 ||
1044 cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) {
1045 pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n",
1046 cl->common.classid, cl->quantum);
1047 cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
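/* A 64-bit intermediate would avoid the overflow noted above; a minimal
 * sketch of such a computation (not what this code does):
 *
 *	cl->quantum = div_u64((u64)cl->weight * cl->allot *
 *			      q->nclasses[prio], q->quanta[prio]);
 */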
1053 static void cbq_sync_defmap(struct cbq_class *cl)
1055 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1056 struct cbq_class *split = cl->split;
1057 unsigned int h;
1058 int i;
1060 if (split == NULL)
1061 return;
1063 for (i = 0; i <= TC_PRIO_MAX; i++) {
1064 if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))
1065 split->defaults[i] = NULL;
1068 for (i = 0; i <= TC_PRIO_MAX; i++) {
1069 int level = split->level;
1071 if (split->defaults[i])
1072 continue;
1074 for (h = 0; h < q->clhash.hashsize; h++) {
1075 struct cbq_class *c;
1077 hlist_for_each_entry(c, &q->clhash.hash[h],
1078 common.hnode) {
1079 if (c->split == split && c->level < level &&
1080 c->defmap & (1<<i)) {
1081 split->defaults[i] = c;
1082 level = c->level;
1089 static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
1091 struct cbq_class *split = NULL;
1093 if (splitid == 0) {
1094 split = cl->split;
1095 if (!split)
1096 return;
1097 splitid = split->common.classid;
1100 if (split == NULL || split->common.classid != splitid) {
1101 for (split = cl->tparent; split; split = split->tparent)
1102 if (split->common.classid == splitid)
1103 break;
1106 if (split == NULL)
1107 return;
1109 if (cl->split != split) {
1110 cl->defmap = 0;
1111 cbq_sync_defmap(cl);
1112 cl->split = split;
1113 cl->defmap = def & mask;
1114 } else
1115 cl->defmap = (cl->defmap & ~mask) | (def & mask);
1117 cbq_sync_defmap(cl);
1120 static void cbq_unlink_class(struct cbq_class *this)
1122 struct cbq_class *cl, **clp;
1123 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1125 qdisc_class_hash_remove(&q->clhash, &this->common);
1127 if (this->tparent) {
1128 clp = &this->sibling;
1129 cl = *clp;
1130 do {
1131 if (cl == this) {
1132 *clp = cl->sibling;
1133 break;
1135 clp = &cl->sibling;
1136 } while ((cl = *clp) != this->sibling);
1138 if (this->tparent->children == this) {
1139 this->tparent->children = this->sibling;
1140 if (this->sibling == this)
1141 this->tparent->children = NULL;
1143 } else {
1144 WARN_ON(this->sibling != this);
1148 static void cbq_link_class(struct cbq_class *this)
1150 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1151 struct cbq_class *parent = this->tparent;
1153 this->sibling = this;
1154 qdisc_class_hash_insert(&q->clhash, &this->common);
1156 if (parent == NULL)
1157 return;
1159 if (parent->children == NULL) {
1160 parent->children = this;
1161 } else {
1162 this->sibling = parent->children->sibling;
1163 parent->children->sibling = this;
1167 static unsigned int cbq_drop(struct Qdisc *sch)
1169 struct cbq_sched_data *q = qdisc_priv(sch);
1170 struct cbq_class *cl, *cl_head;
1171 int prio;
1172 unsigned int len;
1174 for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) {
1175 cl_head = q->active[prio];
1176 if (!cl_head)
1177 continue;
1179 cl = cl_head;
1180 do {
1181 if (cl->q->ops->drop && (len = cl->q->ops->drop(cl->q))) {
1182 sch->q.qlen--;
1183 if (!cl->q->q.qlen)
1184 cbq_deactivate_class(cl);
1185 return len;
1187 } while ((cl = cl->next_alive) != cl_head);
1189 return 0;
1192 static void
1193 cbq_reset(struct Qdisc *sch)
1195 struct cbq_sched_data *q = qdisc_priv(sch);
1196 struct cbq_class *cl;
1197 int prio;
1198 unsigned int h;
1200 q->activemask = 0;
1201 q->pmask = 0;
1202 q->tx_class = NULL;
1203 q->tx_borrowed = NULL;
1204 qdisc_watchdog_cancel(&q->watchdog);
1205 hrtimer_cancel(&q->delay_timer);
1206 q->toplevel = TC_CBQ_MAXLEVEL;
1207 q->now = psched_get_time();
1209 for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
1210 q->active[prio] = NULL;
1212 for (h = 0; h < q->clhash.hashsize; h++) {
1213 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1214 qdisc_reset(cl->q);
1216 cl->next_alive = NULL;
1217 cl->undertime = PSCHED_PASTPERFECT;
1218 cl->avgidle = cl->maxidle;
1219 cl->deficit = cl->quantum;
1220 cl->cpriority = cl->priority;
1223 sch->q.qlen = 0;
1227 static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
1229 if (lss->change & TCF_CBQ_LSS_FLAGS) {
1230 cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
1231 cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
1233 if (lss->change & TCF_CBQ_LSS_EWMA)
1234 cl->ewma_log = lss->ewma_log;
1235 if (lss->change & TCF_CBQ_LSS_AVPKT)
1236 cl->avpkt = lss->avpkt;
1237 if (lss->change & TCF_CBQ_LSS_MINIDLE)
1238 cl->minidle = -(long)lss->minidle;
1239 if (lss->change & TCF_CBQ_LSS_MAXIDLE) {
1240 cl->maxidle = lss->maxidle;
1241 cl->avgidle = lss->maxidle;
1243 if (lss->change & TCF_CBQ_LSS_OFFTIME)
1244 cl->offtime = lss->offtime;
1245 return 0;
1248 static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1250 q->nclasses[cl->priority]--;
1251 q->quanta[cl->priority] -= cl->weight;
1252 cbq_normalize_quanta(q, cl->priority);
1255 static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1257 q->nclasses[cl->priority]++;
1258 q->quanta[cl->priority] += cl->weight;
1259 cbq_normalize_quanta(q, cl->priority);
1262 static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1264 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1266 if (wrr->allot)
1267 cl->allot = wrr->allot;
1268 if (wrr->weight)
1269 cl->weight = wrr->weight;
1270 if (wrr->priority) {
1271 cl->priority = wrr->priority - 1;
1272 cl->cpriority = cl->priority;
1273 if (cl->priority >= cl->priority2)
1274 cl->priority2 = TC_CBQ_MAXPRIO - 1;
1277 cbq_addprio(q, cl);
1278 return 0;
1281 static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl)
1283 switch (ovl->strategy) {
1284 case TC_CBQ_OVL_CLASSIC:
1285 cl->overlimit = cbq_ovl_classic;
1286 break;
1287 case TC_CBQ_OVL_DELAY:
1288 cl->overlimit = cbq_ovl_delay;
1289 break;
1290 case TC_CBQ_OVL_LOWPRIO:
1291 if (ovl->priority2 - 1 >= TC_CBQ_MAXPRIO ||
1292 ovl->priority2 - 1 <= cl->priority)
1293 return -EINVAL;
1294 cl->priority2 = ovl->priority2 - 1;
1295 cl->overlimit = cbq_ovl_lowprio;
1296 break;
1297 case TC_CBQ_OVL_DROP:
1298 cl->overlimit = cbq_ovl_drop;
1299 break;
1300 case TC_CBQ_OVL_RCLASSIC:
1301 cl->overlimit = cbq_ovl_rclassic;
1302 break;
1303 default:
1304 return -EINVAL;
1306 cl->penalty = ovl->penalty;
1307 return 0;
1310 #ifdef CONFIG_NET_CLS_ACT
1311 static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p)
1313 cl->police = p->police;
1315 if (cl->q->handle) {
1316 if (p->police == TC_POLICE_RECLASSIFY)
1317 cl->q->reshape_fail = cbq_reshape_fail;
1318 else
1319 cl->q->reshape_fail = NULL;
1321 return 0;
1323 #endif
1325 static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1327 cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1328 return 0;
1331 static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
1332 [TCA_CBQ_LSSOPT] = { .len = sizeof(struct tc_cbq_lssopt) },
1333 [TCA_CBQ_WRROPT] = { .len = sizeof(struct tc_cbq_wrropt) },
1334 [TCA_CBQ_FOPT] = { .len = sizeof(struct tc_cbq_fopt) },
1335 [TCA_CBQ_OVL_STRATEGY] = { .len = sizeof(struct tc_cbq_ovl) },
1336 [TCA_CBQ_RATE] = { .len = sizeof(struct tc_ratespec) },
1337 [TCA_CBQ_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1338 [TCA_CBQ_POLICE] = { .len = sizeof(struct tc_cbq_police) },
1341 static int cbq_init(struct Qdisc *sch, struct nlattr *opt)
1343 struct cbq_sched_data *q = qdisc_priv(sch);
1344 struct nlattr *tb[TCA_CBQ_MAX + 1];
1345 struct tc_ratespec *r;
1346 int err;
1348 err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy);
1349 if (err < 0)
1350 return err;
1352 if (tb[TCA_CBQ_RTAB] == NULL || tb[TCA_CBQ_RATE] == NULL)
1353 return -EINVAL;
1355 r = nla_data(tb[TCA_CBQ_RATE]);
1357 if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB])) == NULL)
1358 return -EINVAL;
1360 err = qdisc_class_hash_init(&q->clhash);
1361 if (err < 0)
1362 goto put_rtab;
1364 q->link.refcnt = 1;
1365 q->link.sibling = &q->link;
1366 q->link.common.classid = sch->handle;
1367 q->link.qdisc = sch;
1368 q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1369 sch->handle);
1370 if (!q->link.q)
1371 q->link.q = &noop_qdisc;
1373 q->link.priority = TC_CBQ_MAXPRIO - 1;
1374 q->link.priority2 = TC_CBQ_MAXPRIO - 1;
1375 q->link.cpriority = TC_CBQ_MAXPRIO - 1;
1376 q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC;
1377 q->link.overlimit = cbq_ovl_classic;
1378 q->link.allot = psched_mtu(qdisc_dev(sch));
1379 q->link.quantum = q->link.allot;
1380 q->link.weight = q->link.R_tab->rate.rate;
1382 q->link.ewma_log = TC_CBQ_DEF_EWMA;
1383 q->link.avpkt = q->link.allot/2;
1384 q->link.minidle = -0x7FFFFFFF;
1386 qdisc_watchdog_init(&q->watchdog, sch);
1387 hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
1388 q->delay_timer.function = cbq_undelay;
1389 q->toplevel = TC_CBQ_MAXLEVEL;
1390 q->now = psched_get_time();
1392 cbq_link_class(&q->link);
1394 if (tb[TCA_CBQ_LSSOPT])
1395 cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));
1397 cbq_addprio(q, &q->link);
1398 return 0;
1400 put_rtab:
1401 qdisc_put_rtab(q->link.R_tab);
1402 return err;
1405 static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1407 unsigned char *b = skb_tail_pointer(skb);
1409 if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate))
1410 goto nla_put_failure;
1411 return skb->len;
1413 nla_put_failure:
1414 nlmsg_trim(skb, b);
1415 return -1;
1418 static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1420 unsigned char *b = skb_tail_pointer(skb);
1421 struct tc_cbq_lssopt opt;
1423 opt.flags = 0;
1424 if (cl->borrow == NULL)
1425 opt.flags |= TCF_CBQ_LSS_BOUNDED;
1426 if (cl->share == NULL)
1427 opt.flags |= TCF_CBQ_LSS_ISOLATED;
1428 opt.ewma_log = cl->ewma_log;
1429 opt.level = cl->level;
1430 opt.avpkt = cl->avpkt;
1431 opt.maxidle = cl->maxidle;
1432 opt.minidle = (u32)(-cl->minidle);
1433 opt.offtime = cl->offtime;
1434 opt.change = ~0;
1435 if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt))
1436 goto nla_put_failure;
1437 return skb->len;
1439 nla_put_failure:
1440 nlmsg_trim(skb, b);
1441 return -1;
1444 static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1446 unsigned char *b = skb_tail_pointer(skb);
1447 struct tc_cbq_wrropt opt;
1449 memset(&opt, 0, sizeof(opt));
1450 opt.flags = 0;
1451 opt.allot = cl->allot;
1452 opt.priority = cl->priority + 1;
1453 opt.cpriority = cl->cpriority + 1;
1454 opt.weight = cl->weight;
1455 if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt))
1456 goto nla_put_failure;
1457 return skb->len;
1459 nla_put_failure:
1460 nlmsg_trim(skb, b);
1461 return -1;
1464 static int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl)
1466 unsigned char *b = skb_tail_pointer(skb);
1467 struct tc_cbq_ovl opt;
1469 opt.strategy = cl->ovl_strategy;
1470 opt.priority2 = cl->priority2 + 1;
1471 opt.pad = 0;
1472 opt.penalty = cl->penalty;
1473 if (nla_put(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt))
1474 goto nla_put_failure;
1475 return skb->len;
1477 nla_put_failure:
1478 nlmsg_trim(skb, b);
1479 return -1;
1482 static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1484 unsigned char *b = skb_tail_pointer(skb);
1485 struct tc_cbq_fopt opt;
1487 if (cl->split || cl->defmap) {
1488 opt.split = cl->split ? cl->split->common.classid : 0;
1489 opt.defmap = cl->defmap;
1490 opt.defchange = ~0;
1491 if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt))
1492 goto nla_put_failure;
1494 return skb->len;
1496 nla_put_failure:
1497 nlmsg_trim(skb, b);
1498 return -1;
1501 #ifdef CONFIG_NET_CLS_ACT
1502 static int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)
1504 unsigned char *b = skb_tail_pointer(skb);
1505 struct tc_cbq_police opt;
1507 if (cl->police) {
1508 opt.police = cl->police;
1509 opt.__res1 = 0;
1510 opt.__res2 = 0;
1511 if (nla_put(skb, TCA_CBQ_POLICE, sizeof(opt), &opt))
1512 goto nla_put_failure;
1514 return skb->len;
1516 nla_put_failure:
1517 nlmsg_trim(skb, b);
1518 return -1;
1520 #endif
1522 static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
1524 if (cbq_dump_lss(skb, cl) < 0 ||
1525 cbq_dump_rate(skb, cl) < 0 ||
1526 cbq_dump_wrr(skb, cl) < 0 ||
1527 cbq_dump_ovl(skb, cl) < 0 ||
1528 #ifdef CONFIG_NET_CLS_ACT
1529 cbq_dump_police(skb, cl) < 0 ||
1530 #endif
1531 cbq_dump_fopt(skb, cl) < 0)
1532 return -1;
1533 return 0;
1536 static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
1538 struct cbq_sched_data *q = qdisc_priv(sch);
1539 struct nlattr *nest;
1541 nest = nla_nest_start(skb, TCA_OPTIONS);
1542 if (nest == NULL)
1543 goto nla_put_failure;
1544 if (cbq_dump_attr(skb, &q->link) < 0)
1545 goto nla_put_failure;
1546 return nla_nest_end(skb, nest);
1548 nla_put_failure:
1549 nla_nest_cancel(skb, nest);
1550 return -1;
1553 static int
1554 cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1556 struct cbq_sched_data *q = qdisc_priv(sch);
1558 q->link.xstats.avgidle = q->link.avgidle;
1559 return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
1562 static int
1563 cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1564 struct sk_buff *skb, struct tcmsg *tcm)
1566 struct cbq_class *cl = (struct cbq_class *)arg;
1567 struct nlattr *nest;
1569 if (cl->tparent)
1570 tcm->tcm_parent = cl->tparent->common.classid;
1571 else
1572 tcm->tcm_parent = TC_H_ROOT;
1573 tcm->tcm_handle = cl->common.classid;
1574 tcm->tcm_info = cl->q->handle;
1576 nest = nla_nest_start(skb, TCA_OPTIONS);
1577 if (nest == NULL)
1578 goto nla_put_failure;
1579 if (cbq_dump_attr(skb, cl) < 0)
1580 goto nla_put_failure;
1581 return nla_nest_end(skb, nest);
1583 nla_put_failure:
1584 nla_nest_cancel(skb, nest);
1585 return -1;
1588 static int
1589 cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1590 struct gnet_dump *d)
1592 struct cbq_sched_data *q = qdisc_priv(sch);
1593 struct cbq_class *cl = (struct cbq_class *)arg;
1595 cl->qstats.qlen = cl->q->q.qlen;
1596 cl->xstats.avgidle = cl->avgidle;
1597 cl->xstats.undertime = 0;
1599 if (cl->undertime != PSCHED_PASTPERFECT)
1600 cl->xstats.undertime = cl->undertime - q->now;
1602 if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
1603 gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
1604 gnet_stats_copy_queue(d, &cl->qstats) < 0)
1605 return -1;
1607 return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1610 static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1611 struct Qdisc **old)
1613 struct cbq_class *cl = (struct cbq_class *)arg;
1615 if (new == NULL) {
1616 new = qdisc_create_dflt(sch->dev_queue,
1617 &pfifo_qdisc_ops, cl->common.classid);
1618 if (new == NULL)
1619 return -ENOBUFS;
1620 } else {
1621 #ifdef CONFIG_NET_CLS_ACT
1622 if (cl->police == TC_POLICE_RECLASSIFY)
1623 new->reshape_fail = cbq_reshape_fail;
1624 #endif
1626 sch_tree_lock(sch);
1627 *old = cl->q;
1628 cl->q = new;
1629 qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
1630 qdisc_reset(*old);
1631 sch_tree_unlock(sch);
1633 return 0;
1636 static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)
1638 struct cbq_class *cl = (struct cbq_class *)arg;
1640 return cl->q;
1643 static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
1645 struct cbq_class *cl = (struct cbq_class *)arg;
1647 if (cl->q->q.qlen == 0)
1648 cbq_deactivate_class(cl);
1651 static unsigned long cbq_get(struct Qdisc *sch, u32 classid)
1653 struct cbq_sched_data *q = qdisc_priv(sch);
1654 struct cbq_class *cl = cbq_class_lookup(q, classid);
1656 if (cl) {
1657 cl->refcnt++;
1658 return (unsigned long)cl;
1660 return 0;
1663 static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1665 struct cbq_sched_data *q = qdisc_priv(sch);
1667 WARN_ON(cl->filters);
1669 tcf_destroy_chain(&cl->filter_list);
1670 qdisc_destroy(cl->q);
1671 qdisc_put_rtab(cl->R_tab);
1672 gen_kill_estimator(&cl->bstats, &cl->rate_est);
1673 if (cl != &q->link)
1674 kfree(cl);
1677 static void cbq_destroy(struct Qdisc *sch)
1679 struct cbq_sched_data *q = qdisc_priv(sch);
1680 struct hlist_node *next;
1681 struct cbq_class *cl;
1682 unsigned int h;
1684 #ifdef CONFIG_NET_CLS_ACT
1685 q->rx_class = NULL;
1686 #endif
1688 * Filters must be destroyed first because we don't destroy the
1689 * classes from root to leaves, which means that filters can still
1690 * be bound to classes which have already been destroyed. --TGR '04
1692 for (h = 0; h < q->clhash.hashsize; h++) {
1693 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode)
1694 tcf_destroy_chain(&cl->filter_list);
1696 for (h = 0; h < q->clhash.hashsize; h++) {
1697 hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h],
1698 common.hnode)
1699 cbq_destroy_class(sch, cl);
1701 qdisc_class_hash_destroy(&q->clhash);
1704 static void cbq_put(struct Qdisc *sch, unsigned long arg)
1706 struct cbq_class *cl = (struct cbq_class *)arg;
1708 if (--cl->refcnt == 0) {
1709 #ifdef CONFIG_NET_CLS_ACT
1710 spinlock_t *root_lock = qdisc_root_sleeping_lock(sch);
1711 struct cbq_sched_data *q = qdisc_priv(sch);
1713 spin_lock_bh(root_lock);
1714 if (q->rx_class == cl)
1715 q->rx_class = NULL;
1716 spin_unlock_bh(root_lock);
1717 #endif
1719 cbq_destroy_class(sch, cl);
1723 static int
1724 cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
1725 unsigned long *arg)
1727 int err;
1728 struct cbq_sched_data *q = qdisc_priv(sch);
1729 struct cbq_class *cl = (struct cbq_class *)*arg;
1730 struct nlattr *opt = tca[TCA_OPTIONS];
1731 struct nlattr *tb[TCA_CBQ_MAX + 1];
1732 struct cbq_class *parent;
1733 struct qdisc_rate_table *rtab = NULL;
1735 if (opt == NULL)
1736 return -EINVAL;
1738 err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy);
1739 if (err < 0)
1740 return err;
1742 if (cl) {
1743 /* Check parent */
1744 if (parentid) {
1745 if (cl->tparent &&
1746 cl->tparent->common.classid != parentid)
1747 return -EINVAL;
1748 if (!cl->tparent && parentid != TC_H_ROOT)
1749 return -EINVAL;
1752 if (tb[TCA_CBQ_RATE]) {
1753 rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]),
1754 tb[TCA_CBQ_RTAB]);
1755 if (rtab == NULL)
1756 return -EINVAL;
1759 if (tca[TCA_RATE]) {
1760 err = gen_replace_estimator(&cl->bstats, &cl->rate_est,
1761 qdisc_root_sleeping_lock(sch),
1762 tca[TCA_RATE]);
1763 if (err) {
1764 qdisc_put_rtab(rtab);
1765 return err;
1769 /* Change class parameters */
1770 sch_tree_lock(sch);
1772 if (cl->next_alive != NULL)
1773 cbq_deactivate_class(cl);
1775 if (rtab) {
1776 qdisc_put_rtab(cl->R_tab);
1777 cl->R_tab = rtab;
1780 if (tb[TCA_CBQ_LSSOPT])
1781 cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1783 if (tb[TCA_CBQ_WRROPT]) {
1784 cbq_rmprio(q, cl);
1785 cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1788 if (tb[TCA_CBQ_OVL_STRATEGY])
1789 cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY]));
1791 #ifdef CONFIG_NET_CLS_ACT
1792 if (tb[TCA_CBQ_POLICE])
1793 cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE]));
1794 #endif
1796 if (tb[TCA_CBQ_FOPT])
1797 cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1799 if (cl->q->q.qlen)
1800 cbq_activate_class(cl);
1802 sch_tree_unlock(sch);
1804 return 0;
1807 if (parentid == TC_H_ROOT)
1808 return -EINVAL;
1810 if (tb[TCA_CBQ_WRROPT] == NULL || tb[TCA_CBQ_RATE] == NULL ||
1811 tb[TCA_CBQ_LSSOPT] == NULL)
1812 return -EINVAL;
1814 rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB]);
1815 if (rtab == NULL)
1816 return -EINVAL;
1818 if (classid) {
1819 err = -EINVAL;
1820 if (TC_H_MAJ(classid ^ sch->handle) ||
1821 cbq_class_lookup(q, classid))
1822 goto failure;
1823 } else {
1824 int i;
1825 classid = TC_H_MAKE(sch->handle, 0x8000);
1827 for (i = 0; i < 0x8000; i++) {
1828 if (++q->hgenerator >= 0x8000)
1829 q->hgenerator = 1;
1830 if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1831 break;
1833 err = -ENOSR;
1834 if (i >= 0x8000)
1835 goto failure;
1836 classid = classid|q->hgenerator;
1839 parent = &q->link;
1840 if (parentid) {
1841 parent = cbq_class_lookup(q, parentid);
1842 err = -EINVAL;
1843 if (parent == NULL)
1844 goto failure;
1847 err = -ENOBUFS;
1848 cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1849 if (cl == NULL)
1850 goto failure;
1852 if (tca[TCA_RATE]) {
1853 err = gen_new_estimator(&cl->bstats, &cl->rate_est,
1854 qdisc_root_sleeping_lock(sch),
1855 tca[TCA_RATE]);
1856 if (err) {
1857 kfree(cl);
1858 goto failure;
1862 cl->R_tab = rtab;
1863 rtab = NULL;
1864 cl->refcnt = 1;
1865 cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid);
1866 if (!cl->q)
1867 cl->q = &noop_qdisc;
1868 cl->common.classid = classid;
1869 cl->tparent = parent;
1870 cl->qdisc = sch;
1871 cl->allot = parent->allot;
1872 cl->quantum = cl->allot;
1873 cl->weight = cl->R_tab->rate.rate;
1875 sch_tree_lock(sch);
1876 cbq_link_class(cl);
1877 cl->borrow = cl->tparent;
1878 if (cl->tparent != &q->link)
1879 cl->share = cl->tparent;
1880 cbq_adjust_levels(parent);
1881 cl->minidle = -0x7FFFFFFF;
1882 cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1883 cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1884 if (cl->ewma_log == 0)
1885 cl->ewma_log = q->link.ewma_log;
1886 if (cl->maxidle == 0)
1887 cl->maxidle = q->link.maxidle;
1888 if (cl->avpkt == 0)
1889 cl->avpkt = q->link.avpkt;
1890 cl->overlimit = cbq_ovl_classic;
1891 if (tb[TCA_CBQ_OVL_STRATEGY])
1892 cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY]));
1893 #ifdef CONFIG_NET_CLS_ACT
1894 if (tb[TCA_CBQ_POLICE])
1895 cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE]));
1896 #endif
1897 if (tb[TCA_CBQ_FOPT])
1898 cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1899 sch_tree_unlock(sch);
1901 qdisc_class_hash_grow(sch, &q->clhash);
1903 *arg = (unsigned long)cl;
1904 return 0;
1906 failure:
1907 qdisc_put_rtab(rtab);
1908 return err;
1911 static int cbq_delete(struct Qdisc *sch, unsigned long arg)
1913 struct cbq_sched_data *q = qdisc_priv(sch);
1914 struct cbq_class *cl = (struct cbq_class *)arg;
1915 unsigned int qlen;
1917 if (cl->filters || cl->children || cl == &q->link)
1918 return -EBUSY;
1920 sch_tree_lock(sch);
1922 qlen = cl->q->q.qlen;
1923 qdisc_reset(cl->q);
1924 qdisc_tree_decrease_qlen(cl->q, qlen);
1926 if (cl->next_alive)
1927 cbq_deactivate_class(cl);
1929 if (q->tx_borrowed == cl)
1930 q->tx_borrowed = q->tx_class;
1931 if (q->tx_class == cl) {
1932 q->tx_class = NULL;
1933 q->tx_borrowed = NULL;
1935 #ifdef CONFIG_NET_CLS_ACT
1936 if (q->rx_class == cl)
1937 q->rx_class = NULL;
1938 #endif
1940 cbq_unlink_class(cl);
1941 cbq_adjust_levels(cl->tparent);
1942 cl->defmap = 0;
1943 cbq_sync_defmap(cl);
1945 cbq_rmprio(q, cl);
1946 sch_tree_unlock(sch);
1948 BUG_ON(--cl->refcnt == 0);
1950 * This shouldn't happen: we "hold" one cops->get() when called
1951 * from tc_ctl_tclass; the destroy method is done from cops->put().
1954 return 0;
1957 static struct tcf_proto **cbq_find_tcf(struct Qdisc *sch, unsigned long arg)
1959 struct cbq_sched_data *q = qdisc_priv(sch);
1960 struct cbq_class *cl = (struct cbq_class *)arg;
1962 if (cl == NULL)
1963 cl = &q->link;
1965 return &cl->filter_list;
1968 static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
1969 u32 classid)
1971 struct cbq_sched_data *q = qdisc_priv(sch);
1972 struct cbq_class *p = (struct cbq_class *)parent;
1973 struct cbq_class *cl = cbq_class_lookup(q, classid);
1975 if (cl) {
1976 if (p && p->level <= cl->level)
1977 return 0;
1978 cl->filters++;
1979 return (unsigned long)cl;
1981 return 0;
1984 static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
1986 struct cbq_class *cl = (struct cbq_class *)arg;
1988 cl->filters--;
1991 static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1993 struct cbq_sched_data *q = qdisc_priv(sch);
1994 struct cbq_class *cl;
1995 unsigned int h;
1997 if (arg->stop)
1998 return;
2000 for (h = 0; h < q->clhash.hashsize; h++) {
2001 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
2002 if (arg->count < arg->skip) {
2003 arg->count++;
2004 continue;
2006 if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
2007 arg->stop = 1;
2008 return;
2010 arg->count++;
2015 static const struct Qdisc_class_ops cbq_class_ops = {
2016 .graft = cbq_graft,
2017 .leaf = cbq_leaf,
2018 .qlen_notify = cbq_qlen_notify,
2019 .get = cbq_get,
2020 .put = cbq_put,
2021 .change = cbq_change_class,
2022 .delete = cbq_delete,
2023 .walk = cbq_walk,
2024 .tcf_chain = cbq_find_tcf,
2025 .bind_tcf = cbq_bind_filter,
2026 .unbind_tcf = cbq_unbind_filter,
2027 .dump = cbq_dump_class,
2028 .dump_stats = cbq_dump_class_stats,
2031 static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
2032 .next = NULL,
2033 .cl_ops = &cbq_class_ops,
2034 .id = "cbq",
2035 .priv_size = sizeof(struct cbq_sched_data),
2036 .enqueue = cbq_enqueue,
2037 .dequeue = cbq_dequeue,
2038 .peek = qdisc_peek_dequeued,
2039 .drop = cbq_drop,
2040 .init = cbq_init,
2041 .reset = cbq_reset,
2042 .destroy = cbq_destroy,
2043 .change = NULL,
2044 .dump = cbq_dump,
2045 .dump_stats = cbq_dump_stats,
2046 .owner = THIS_MODULE,
2049 static int __init cbq_module_init(void)
2051 return register_qdisc(&cbq_qdisc_ops);
2053 static void __exit cbq_module_exit(void)
2055 unregister_qdisc(&cbq_qdisc_ops);
2057 module_init(cbq_module_init)
2058 module_exit(cbq_module_exit)
2059 MODULE_LICENSE("GPL");