1 /*
2 * net/sched/sch_cbq.c Class-Based Queueing discipline.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 #include <linux/module.h>
14 #include <linux/types.h>
15 #include <linux/kernel.h>
16 #include <linux/string.h>
17 #include <linux/errno.h>
18 #include <linux/skbuff.h>
19 #include <net/netlink.h>
20 #include <net/pkt_sched.h>
23 /* Class-Based Queueing (CBQ) algorithm.
24 =======================================
26 Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
27 Management Models for Packet Networks",
28 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
30 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
32 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
33 Parameters", 1996
35 [4] Sally Floyd and Michael Speer, "Experimental Results
36 for Class-Based Queueing", 1998, not published.
38 -----------------------------------------------------------------------
40     The algorithm skeleton was taken from the NS simulator's cbq.cc.
41     If someone wants to check this code against the LBL version,
42     they should take into account that ONLY the skeleton was borrowed;
43     the implementation is different. In particular:
45 --- The WRR algorithm is different. Our version looks more
46 reasonable (I hope) and works when quanta are allowed to be
47 less than MTU, which is always the case when real time classes
48     have small rates. Note that the statement of [3] is
49     incomplete: delay may actually be estimated even if the class
50     per-round allotment is less than the MTU. Namely, if the per-round
51     allotment is W*r_i, and r_1+...+r_k = r < 1
53 delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
55 In the worst case we have IntServ estimate with D = W*r+k*MTU
56 and C = MTU*r. The proof (if correct at all) is trivial.
59     --- It seems that cbq-2.0 is not very accurate. At least, I cannot
60     interpret some places, which look like incorrect translations
61     from NS. Anyone is welcome to find these differences
62     and explain to me why I am wrong 8).
64     --- Linux has no EOI event, so we cannot estimate the true class
65     idle time. The workaround is to treat the next dequeue event
66     as a sign that the previous packet has finished. This is wrong because of
67     internal device queueing, but on a permanently loaded link it holds.
68     Moreover, combined with the clock integrator, this scheme looks
69     very close to an ideal solution. */
71 struct cbq_sched_data;
74 struct cbq_class
76 struct Qdisc_class_common common;
77 struct cbq_class *next_alive; /* next class with backlog in this priority band */
79 /* Parameters */
80 unsigned char priority; /* class priority */
81 unsigned char priority2; /* priority to be used after overlimit */
82 unsigned char ewma_log; /* time constant for idle time calculation */
83 unsigned char ovl_strategy;
84 #ifdef CONFIG_NET_CLS_ACT
85 unsigned char police;
86 #endif
88 u32 defmap;
90 /* Link-sharing scheduler parameters */
91 long maxidle; /* Class parameters: see below. */
92 long offtime;
93 long minidle;
94 u32 avpkt;
95 struct qdisc_rate_table *R_tab;
97 /* Overlimit strategy parameters */
98 void (*overlimit)(struct cbq_class *cl);
99 psched_tdiff_t penalty;
101 /* General scheduler (WRR) parameters */
102 long allot;
103 long quantum; /* Allotment per WRR round */
104 long weight; /* Relative allotment: see below */
106 struct Qdisc *qdisc; /* Ptr to CBQ discipline */
107 struct cbq_class *split; /* Ptr to split node */
108 struct cbq_class *share; /* Ptr to LS parent in the class tree */
109 struct cbq_class *tparent; /* Ptr to tree parent in the class tree */
110 struct cbq_class *borrow; /* NULL if class is bandwidth limited;
111 parent otherwise */
112 struct cbq_class *sibling; /* Sibling chain */
113 struct cbq_class *children; /* Pointer to children chain */
115 struct Qdisc *q; /* Elementary queueing discipline */
118 /* Variables */
119 unsigned char cpriority; /* Effective priority */
120 unsigned char delayed;
121 unsigned char level; /* level of the class in hierarchy:
122 0 for leaf classes, and maximal
123 level of children + 1 for nodes.
126 psched_time_t last; /* Last end of service */
127 psched_time_t undertime;
128 long avgidle;
129 long deficit; /* Saved deficit for WRR */
130 psched_time_t penalized;
131 struct gnet_stats_basic bstats;
132 struct gnet_stats_queue qstats;
133 struct gnet_stats_rate_est rate_est;
134 struct tc_cbq_xstats xstats;
136 struct tcf_proto *filter_list;
138 int refcnt;
139 int filters;
141 struct cbq_class *defaults[TC_PRIO_MAX+1];
144 struct cbq_sched_data
146 struct Qdisc_class_hash clhash; /* Hash table of all classes */
147 int nclasses[TC_CBQ_MAXPRIO+1];
148 unsigned quanta[TC_CBQ_MAXPRIO+1];
150 struct cbq_class link;
152 unsigned activemask;
153 struct cbq_class *active[TC_CBQ_MAXPRIO+1]; /* List of all classes
154 with backlog */
156 #ifdef CONFIG_NET_CLS_ACT
157 struct cbq_class *rx_class;
158 #endif
159 struct cbq_class *tx_class;
160 struct cbq_class *tx_borrowed;
161 int tx_len;
162 psched_time_t now; /* Cached timestamp */
163 psched_time_t now_rt; /* Cached real time */
164 unsigned pmask;
166 struct hrtimer delay_timer;
167 struct qdisc_watchdog watchdog; /* Watchdog timer,
168 started when CBQ has
169 backlog, but cannot
170 transmit just now */
171 psched_tdiff_t wd_expires;
172 int toplevel;
173 u32 hgenerator;
177 #define L2T(cl,len) qdisc_l2t((cl)->R_tab,len)
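/*
 * Illustrative sketch, not part of this file: L2T() maps a packet length to
 * its transmission time via a precomputed rate table (the real helper is
 * qdisc_l2t() from the packet-scheduler headers).  The struct and function
 * names below (example_rtab, example_l2t) are made up for the example; only
 * the table-indexing idea is taken from the code above.
 */
struct example_rtab {
	unsigned char	cell_log;	/* log2 of bytes covered by one table cell */
	unsigned int	data[256];	/* transmission time (psched ticks) per cell */
};

static inline unsigned int example_l2t(const struct example_rtab *rtab,
				       unsigned int pktlen)
{
	unsigned int slot = pktlen >> rtab->cell_log;

	if (slot > 255)
		slot = 255;		/* clamp oversized packets to the last cell */
	return rtab->data[slot];
}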
179 static __inline__ struct cbq_class *
180 cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
182 struct Qdisc_class_common *clc;
184 clc = qdisc_class_find(&q->clhash, classid);
185 if (clc == NULL)
186 return NULL;
187 return container_of(clc, struct cbq_class, common);
190 #ifdef CONFIG_NET_CLS_ACT
192 static struct cbq_class *
193 cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
195 struct cbq_class *cl, *new;
197 for (cl = this->tparent; cl; cl = cl->tparent)
198 if ((new = cl->defaults[TC_PRIO_BESTEFFORT]) != NULL && new != this)
199 return new;
201 return NULL;
204 #endif
206 /* Classify packet. The procedure is pretty complicated, but
207    it allows us to combine link sharing and priority scheduling
208    transparently.
210    Namely, you can put link-sharing rules (e.g. route-based) at the root of CBQ,
211    so that classification resolves to split nodes. Then packets are classified
212    by logical priority, or a more specific classifier may be attached
213    to the split node.
216 static struct cbq_class *
217 cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
219 struct cbq_sched_data *q = qdisc_priv(sch);
220 struct cbq_class *head = &q->link;
221 struct cbq_class **defmap;
222 struct cbq_class *cl = NULL;
223 u32 prio = skb->priority;
224 struct tcf_result res;
227 * Step 1. If skb->priority points to one of our classes, use it.
229 if (TC_H_MAJ(prio^sch->handle) == 0 &&
230 (cl = cbq_class_lookup(q, prio)) != NULL)
231 return cl;
233 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
234 for (;;) {
235 int result = 0;
236 defmap = head->defaults;
239 * Step 2+n. Apply classifier.
241 if (!head->filter_list ||
242 (result = tc_classify_compat(skb, head->filter_list, &res)) < 0)
243 goto fallback;
245 if ((cl = (void*)res.class) == NULL) {
246 if (TC_H_MAJ(res.classid))
247 cl = cbq_class_lookup(q, res.classid);
248 else if ((cl = defmap[res.classid&TC_PRIO_MAX]) == NULL)
249 cl = defmap[TC_PRIO_BESTEFFORT];
251 if (cl == NULL || cl->level >= head->level)
252 goto fallback;
255 #ifdef CONFIG_NET_CLS_ACT
256 switch (result) {
257 case TC_ACT_QUEUED:
258 case TC_ACT_STOLEN:
259 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
260 case TC_ACT_SHOT:
261 return NULL;
262 case TC_ACT_RECLASSIFY:
263 return cbq_reclassify(skb, cl);
265 #endif
266 if (cl->level == 0)
267 return cl;
270          * Step 3+n. If the classifier selected a link-sharing class,
271          *      apply the agency-specific classifier.
272          *      Repeat this procedure until we hit a leaf node.
274 head = cl;
277 fallback:
278 cl = head;
281 * Step 4. No success...
283 if (TC_H_MAJ(prio) == 0 &&
284 !(cl = head->defaults[prio&TC_PRIO_MAX]) &&
285 !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
286 return head;
288 return cl;
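/*
 * Illustrative sketch, not part of this file: step 1 above accepts
 * skb->priority as a class id only when its major number equals the qdisc
 * handle; TC_H_MAJ() keeps the upper 16 bits, so identical majors XOR to
 * zero.  The helper name below is hypothetical.
 */
static inline int example_priority_matches(unsigned int prio, unsigned int handle)
{
	/* e.g. handle 0x00010000 (1:0) matches prio 0x00010003 (1:3),
	 * but not prio 0x00020003 (2:3) */
	return ((prio ^ handle) & 0xFFFF0000U) == 0;
}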
292    A packet has just been enqueued on an empty class.
293    cbq_activate_class adds the class to the tail of the active class list
294    of its priority band.
297 static __inline__ void cbq_activate_class(struct cbq_class *cl)
299 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
300 int prio = cl->cpriority;
301 struct cbq_class *cl_tail;
303 cl_tail = q->active[prio];
304 q->active[prio] = cl;
306 if (cl_tail != NULL) {
307 cl->next_alive = cl_tail->next_alive;
308 cl_tail->next_alive = cl;
309 } else {
310 cl->next_alive = cl;
311 q->activemask |= (1<<prio);
316 Unlink class from active chain.
317 Note that this same procedure is done directly in cbq_dequeue*
318 during round-robin procedure.
321 static void cbq_deactivate_class(struct cbq_class *this)
323 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
324 int prio = this->cpriority;
325 struct cbq_class *cl;
326 struct cbq_class *cl_prev = q->active[prio];
328 do {
329 cl = cl_prev->next_alive;
330 if (cl == this) {
331 cl_prev->next_alive = cl->next_alive;
332 cl->next_alive = NULL;
334 if (cl == q->active[prio]) {
335 q->active[prio] = cl_prev;
336 if (cl == q->active[prio]) {
337 q->active[prio] = NULL;
338 q->activemask &= ~(1<<prio);
339 return;
342 return;
344 } while ((cl_prev = cl) != q->active[prio]);
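/*
 * Illustrative sketch, not part of this file: the per-priority "alive" ring
 * manipulated by cbq_activate_class()/cbq_deactivate_class() is a circular
 * singly linked list in which q->active[prio] points at the TAIL, so the
 * head is tail->next_alive and appending touches only two pointers.  The
 * types and names below are hypothetical stand-ins.
 */
struct example_ring_node {
	struct example_ring_node *next_alive;
};

static void example_ring_append(struct example_ring_node **tail,
				struct example_ring_node *node)
{
	if (*tail) {
		node->next_alive = (*tail)->next_alive;	/* new node points at the head */
		(*tail)->next_alive = node;		/* old tail points at the new node */
	} else {
		node->next_alive = node;		/* single-element ring */
	}
	*tail = node;					/* the new node becomes the tail */
}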
347 static void
348 cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
350 int toplevel = q->toplevel;
352 if (toplevel > cl->level && !(cl->q->flags&TCQ_F_THROTTLED)) {
353 psched_time_t now;
354 psched_tdiff_t incr;
356 now = psched_get_time();
357 incr = now - q->now_rt;
358 now = q->now + incr;
360 do {
361 if (cl->undertime < now) {
362 q->toplevel = cl->level;
363 return;
365 } while ((cl=cl->borrow) != NULL && toplevel > cl->level);
369 static int
370 cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
372 struct cbq_sched_data *q = qdisc_priv(sch);
373 int uninitialized_var(ret);
374 struct cbq_class *cl = cbq_classify(skb, sch, &ret);
376 #ifdef CONFIG_NET_CLS_ACT
377 q->rx_class = cl;
378 #endif
379 if (cl == NULL) {
380 if (ret & __NET_XMIT_BYPASS)
381 sch->qstats.drops++;
382 kfree_skb(skb);
383 return ret;
386 #ifdef CONFIG_NET_CLS_ACT
387 cl->q->__parent = sch;
388 #endif
389 ret = qdisc_enqueue(skb, cl->q);
390 if (ret == NET_XMIT_SUCCESS) {
391 sch->q.qlen++;
392 sch->bstats.packets++;
393 sch->bstats.bytes += qdisc_pkt_len(skb);
394 cbq_mark_toplevel(q, cl);
395 if (!cl->next_alive)
396 cbq_activate_class(cl);
397 return ret;
400 if (net_xmit_drop_count(ret)) {
401 sch->qstats.drops++;
402 cbq_mark_toplevel(q, cl);
403 cl->qstats.drops++;
405 return ret;
408 static int
409 cbq_requeue(struct sk_buff *skb, struct Qdisc *sch)
411 struct cbq_sched_data *q = qdisc_priv(sch);
412 struct cbq_class *cl;
413 int ret;
415 if ((cl = q->tx_class) == NULL) {
416 kfree_skb(skb);
417 sch->qstats.drops++;
418 return NET_XMIT_CN;
420 q->tx_class = NULL;
422 cbq_mark_toplevel(q, cl);
424 #ifdef CONFIG_NET_CLS_ACT
425 q->rx_class = cl;
426 cl->q->__parent = sch;
427 #endif
428 if ((ret = cl->q->ops->requeue(skb, cl->q)) == 0) {
429 sch->q.qlen++;
430 sch->qstats.requeues++;
431 if (!cl->next_alive)
432 cbq_activate_class(cl);
433 return 0;
435 if (net_xmit_drop_count(ret)) {
436 sch->qstats.drops++;
437 cl->qstats.drops++;
439 return ret;
442 /* Overlimit actions */
444 /* TC_CBQ_OVL_CLASSIC: (default) penalize leaf class by adding offtime */
446 static void cbq_ovl_classic(struct cbq_class *cl)
448 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
449 psched_tdiff_t delay = cl->undertime - q->now;
451 if (!cl->delayed) {
452 delay += cl->offtime;
455                    Class goes to sleep, so it will have no
456                    chance to work off its avgidle. Let's forgive it 8)
458                    BTW cbq-2.0 has a bug in this
459                    place: apparently they forgot to shift it by cl->ewma_log.
461 if (cl->avgidle < 0)
462 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
463 if (cl->avgidle < cl->minidle)
464 cl->avgidle = cl->minidle;
465 if (delay <= 0)
466 delay = 1;
467 cl->undertime = q->now + delay;
469 cl->xstats.overactions++;
470 cl->delayed = 1;
472 if (q->wd_expires == 0 || q->wd_expires > delay)
473 q->wd_expires = delay;
475 /* Dirty work! We must schedule wakeups based on
476 real available rate, rather than leaf rate,
477 which may be tiny (even zero).
479 if (q->toplevel == TC_CBQ_MAXLEVEL) {
480 struct cbq_class *b;
481 psched_tdiff_t base_delay = q->wd_expires;
483 for (b = cl->borrow; b; b = b->borrow) {
484 delay = b->undertime - q->now;
485 if (delay < base_delay) {
486 if (delay <= 0)
487 delay = 1;
488 base_delay = delay;
492 q->wd_expires = base_delay;
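/*
 * Illustrative sketch, not part of this file: the forgiveness term used in
 * cbq_ovl_classic() above.  With W = 2^-ewma_log, subtracting
 * (-avgidle) - ((-avgidle) >> ewma_log) removes (1 - W) times the negative
 * avgidle from the sleep delay.  The helper name is hypothetical.
 */
static inline long example_offtime_compensation(long avgidle, unsigned char ewma_log)
{
	long neg = -avgidle;		/* caller ensures avgidle < 0 */

	return neg - (neg >> ewma_log);	/* equals (1 - W) * (-avgidle) */
}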
496 /* TC_CBQ_OVL_RCLASSIC: penalize classes in the hierarchy by offtime when
497    they go overlimit
500 static void cbq_ovl_rclassic(struct cbq_class *cl)
502 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
503 struct cbq_class *this = cl;
505 do {
506 if (cl->level > q->toplevel) {
507 cl = NULL;
508 break;
510 } while ((cl = cl->borrow) != NULL);
512 if (cl == NULL)
513 cl = this;
514 cbq_ovl_classic(cl);
517 /* TC_CBQ_OVL_DELAY: delay until the class goes underlimit */
519 static void cbq_ovl_delay(struct cbq_class *cl)
521 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
522 psched_tdiff_t delay = cl->undertime - q->now;
524 if (test_bit(__QDISC_STATE_DEACTIVATED,
525 &qdisc_root_sleeping(cl->qdisc)->state))
526 return;
528 if (!cl->delayed) {
529 psched_time_t sched = q->now;
530 ktime_t expires;
532 delay += cl->offtime;
533 if (cl->avgidle < 0)
534 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
535 if (cl->avgidle < cl->minidle)
536 cl->avgidle = cl->minidle;
537 cl->undertime = q->now + delay;
539 if (delay > 0) {
540 sched += delay + cl->penalty;
541 cl->penalized = sched;
542 cl->cpriority = TC_CBQ_MAXPRIO;
543 q->pmask |= (1<<TC_CBQ_MAXPRIO);
545 expires = ktime_set(0, 0);
546 expires = ktime_add_ns(expires, PSCHED_US2NS(sched));
547 if (hrtimer_try_to_cancel(&q->delay_timer) &&
548 ktime_to_ns(ktime_sub(
549 hrtimer_get_expires(&q->delay_timer),
550 expires)) > 0)
551 hrtimer_set_expires(&q->delay_timer, expires);
552 hrtimer_restart(&q->delay_timer);
553 cl->delayed = 1;
554 cl->xstats.overactions++;
555 return;
557 delay = 1;
559 if (q->wd_expires == 0 || q->wd_expires > delay)
560 q->wd_expires = delay;
563 /* TC_CBQ_OVL_LOWPRIO: penalize class by lowering its priority band */
565 static void cbq_ovl_lowprio(struct cbq_class *cl)
567 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
569 cl->penalized = q->now + cl->penalty;
571 if (cl->cpriority != cl->priority2) {
572 cl->cpriority = cl->priority2;
573 q->pmask |= (1<<cl->cpriority);
574 cl->xstats.overactions++;
576 cbq_ovl_classic(cl);
579 /* TC_CBQ_OVL_DROP: penalize class by dropping */
581 static void cbq_ovl_drop(struct cbq_class *cl)
583 if (cl->q->ops->drop)
584 if (cl->q->ops->drop(cl->q))
585 cl->qdisc->q.qlen--;
586 cl->xstats.overactions++;
587 cbq_ovl_classic(cl);
590 static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
591 psched_time_t now)
593 struct cbq_class *cl;
594 struct cbq_class *cl_prev = q->active[prio];
595 psched_time_t sched = now;
597 if (cl_prev == NULL)
598 return 0;
600 do {
601 cl = cl_prev->next_alive;
602 if (now - cl->penalized > 0) {
603 cl_prev->next_alive = cl->next_alive;
604 cl->next_alive = NULL;
605 cl->cpriority = cl->priority;
606 cl->delayed = 0;
607 cbq_activate_class(cl);
609 if (cl == q->active[prio]) {
610 q->active[prio] = cl_prev;
611 if (cl == q->active[prio]) {
612 q->active[prio] = NULL;
613 return 0;
617 cl = cl_prev->next_alive;
618 } else if (sched - cl->penalized > 0)
619 sched = cl->penalized;
620 } while ((cl_prev = cl) != q->active[prio]);
622 return sched - now;
625 static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
627 struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
628 delay_timer);
629 struct Qdisc *sch = q->watchdog.qdisc;
630 psched_time_t now;
631 psched_tdiff_t delay = 0;
632 unsigned pmask;
634 now = psched_get_time();
636 pmask = q->pmask;
637 q->pmask = 0;
639 while (pmask) {
640 int prio = ffz(~pmask);
641 psched_tdiff_t tmp;
643 pmask &= ~(1<<prio);
645 tmp = cbq_undelay_prio(q, prio, now);
646 if (tmp > 0) {
647 q->pmask |= 1<<prio;
648 if (tmp < delay || delay == 0)
649 delay = tmp;
653 if (delay) {
654 ktime_t time;
656 time = ktime_set(0, 0);
657 time = ktime_add_ns(time, PSCHED_US2NS(now + delay));
658 hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS);
661 sch->flags &= ~TCQ_F_THROTTLED;
662 __netif_schedule(qdisc_root(sch));
663 return HRTIMER_NORESTART;
666 #ifdef CONFIG_NET_CLS_ACT
667 static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
669 struct Qdisc *sch = child->__parent;
670 struct cbq_sched_data *q = qdisc_priv(sch);
671 struct cbq_class *cl = q->rx_class;
673 q->rx_class = NULL;
675 if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) {
676 int ret;
678 cbq_mark_toplevel(q, cl);
680 q->rx_class = cl;
681 cl->q->__parent = sch;
683 ret = qdisc_enqueue(skb, cl->q);
684 if (ret == NET_XMIT_SUCCESS) {
685 sch->q.qlen++;
686 sch->bstats.packets++;
687 sch->bstats.bytes += qdisc_pkt_len(skb);
688 if (!cl->next_alive)
689 cbq_activate_class(cl);
690 return 0;
692 if (net_xmit_drop_count(ret))
693 sch->qstats.drops++;
694 return 0;
697 sch->qstats.drops++;
698 return -1;
700 #endif
703    This is a mission-critical procedure.
705    We "regenerate" the toplevel cutoff if the transmitting class
706    has backlog and is not regulated. This is not part of the
707    original CBQ description, but it looks more reasonable.
708    It may well be wrong; this question needs further investigation.
711 static __inline__ void
712 cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
713 struct cbq_class *borrowed)
715 if (cl && q->toplevel >= borrowed->level) {
716 if (cl->q->q.qlen > 1) {
717 do {
718 if (borrowed->undertime == PSCHED_PASTPERFECT) {
719 q->toplevel = borrowed->level;
720 return;
722 } while ((borrowed=borrowed->borrow) != NULL);
724 #if 0
725 /* It is not necessary now. Uncommenting it
726 will save CPU cycles, but decrease fairness.
728 q->toplevel = TC_CBQ_MAXLEVEL;
729 #endif
733 static void
734 cbq_update(struct cbq_sched_data *q)
736 struct cbq_class *this = q->tx_class;
737 struct cbq_class *cl = this;
738 int len = q->tx_len;
740 q->tx_class = NULL;
742 for ( ; cl; cl = cl->share) {
743 long avgidle = cl->avgidle;
744 long idle;
746 cl->bstats.packets++;
747 cl->bstats.bytes += len;
750 (now - last) is total time between packet right edges.
751 (last_pktlen/rate) is "virtual" busy time, so that
753 idle = (now - last) - last_pktlen/rate
756 idle = q->now - cl->last;
757 if ((unsigned long)idle > 128*1024*1024) {
758 avgidle = cl->maxidle;
759 } else {
760 idle -= L2T(cl, len);
762 /* true_avgidle := (1-W)*true_avgidle + W*idle,
763 where W=2^{-ewma_log}. But cl->avgidle is scaled:
764 cl->avgidle == true_avgidle/W,
765 hence:
767 avgidle += idle - (avgidle>>cl->ewma_log);
770 if (avgidle <= 0) {
771 /* Overlimit or at-limit */
773 if (avgidle < cl->minidle)
774 avgidle = cl->minidle;
776 cl->avgidle = avgidle;
778 /* Calculate expected time, when this class
779 will be allowed to send.
780 It will occur, when:
781 (1-W)*true_avgidle + W*delay = 0, i.e.
782 idle = (1/W - 1)*(-true_avgidle)
784 idle = (1 - W)*(-cl->avgidle);
786 idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
789                           That is not all.
790                           To maintain the rate allocated to the class,
791                           we add to undertime the virtual clock time
792                           needed to complete the transmitted packet.
793                           (len/phys_bandwidth has already elapsed
794                           by the time cbq_update is called.)
797 idle -= L2T(&q->link, len);
798 idle += L2T(cl, len);
800 cl->undertime = q->now + idle;
801 } else {
802 /* Underlimit */
804 cl->undertime = PSCHED_PASTPERFECT;
805 if (avgidle > cl->maxidle)
806 cl->avgidle = cl->maxidle;
807 else
808 cl->avgidle = avgidle;
810 cl->last = q->now;
813 cbq_update_toplevel(q, this, q->tx_borrowed);
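/*
 * Illustrative sketch, not part of this file: the scaled EWMA used in
 * cbq_update().  With W = 2^-ewma_log the update
 *     true_avgidle = (1 - W) * true_avgidle + W * idle
 * divided through by W (since avgidle == true_avgidle / W) becomes
 *     avgidle += idle - (avgidle >> ewma_log).
 * The helper name is hypothetical.
 */
static inline long example_ewma_update(long avgidle, long idle,
				       unsigned char ewma_log)
{
	return avgidle + idle - (avgidle >> ewma_log);
}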
816 static __inline__ struct cbq_class *
817 cbq_under_limit(struct cbq_class *cl)
819 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
820 struct cbq_class *this_cl = cl;
822 if (cl->tparent == NULL)
823 return cl;
825 if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
826 cl->delayed = 0;
827 return cl;
830 do {
831            /* This is a very suspicious place. Currently the overlimit
832               action is generated for non-bounded classes
833               only if the link is completely congested.
834               Though this agrees with the ancestor-only paradigm,
835               it looks very questionable. In particular,
836               it means that this chunk of code will either
837               never be called or result in strong amplification
838               of burstiness. Dangerous and silly, but
839               no other solution exists.
841 if ((cl = cl->borrow) == NULL) {
842 this_cl->qstats.overlimits++;
843 this_cl->overlimit(this_cl);
844 return NULL;
846 if (cl->level > q->toplevel)
847 return NULL;
848 } while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);
850 cl->delayed = 0;
851 return cl;
854 static __inline__ struct sk_buff *
855 cbq_dequeue_prio(struct Qdisc *sch, int prio)
857 struct cbq_sched_data *q = qdisc_priv(sch);
858 struct cbq_class *cl_tail, *cl_prev, *cl;
859 struct sk_buff *skb;
860 int deficit;
862 cl_tail = cl_prev = q->active[prio];
863 cl = cl_prev->next_alive;
865 do {
866 deficit = 0;
868 /* Start round */
869 do {
870 struct cbq_class *borrow = cl;
872 if (cl->q->q.qlen &&
873 (borrow = cbq_under_limit(cl)) == NULL)
874 goto skip_class;
876 if (cl->deficit <= 0) {
877 /* Class exhausted its allotment per
878 this round. Switch to the next one.
880 deficit = 1;
881 cl->deficit += cl->quantum;
882 goto next_class;
885 skb = cl->q->dequeue(cl->q);
887                    /* The class did not give us any skb :-(
888                       This can occur even if cl->q->q.qlen != 0,
889                       e.g. if cl->q == "tbf"
891 if (skb == NULL)
892 goto skip_class;
894 cl->deficit -= qdisc_pkt_len(skb);
895 q->tx_class = cl;
896 q->tx_borrowed = borrow;
897 if (borrow != cl) {
898 #ifndef CBQ_XSTATS_BORROWS_BYTES
899 borrow->xstats.borrows++;
900 cl->xstats.borrows++;
901 #else
902 borrow->xstats.borrows += qdisc_pkt_len(skb);
903 cl->xstats.borrows += qdisc_pkt_len(skb);
904 #endif
906 q->tx_len = qdisc_pkt_len(skb);
908 if (cl->deficit <= 0) {
909 q->active[prio] = cl;
910 cl = cl->next_alive;
911 cl->deficit += cl->quantum;
913 return skb;
915 skip_class:
916 if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
917 /* Class is empty or penalized.
918 Unlink it from active chain.
920 cl_prev->next_alive = cl->next_alive;
921 cl->next_alive = NULL;
923 /* Did cl_tail point to it? */
924 if (cl == cl_tail) {
925 /* Repair it! */
926 cl_tail = cl_prev;
928 /* Was it the last class in this band? */
929 if (cl == cl_tail) {
930 /* Kill the band! */
931 q->active[prio] = NULL;
932 q->activemask &= ~(1<<prio);
933 if (cl->q->q.qlen)
934 cbq_activate_class(cl);
935 return NULL;
938 q->active[prio] = cl_tail;
940 if (cl->q->q.qlen)
941 cbq_activate_class(cl);
943 cl = cl_prev;
946 next_class:
947 cl_prev = cl;
948 cl = cl->next_alive;
949 } while (cl_prev != cl_tail);
950 } while (deficit);
952 q->active[prio] = cl_prev;
954 return NULL;
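/*
 * Illustrative sketch, not part of this file: the weighted round-robin rule
 * applied in cbq_dequeue_prio() above.  A class may transmit while its
 * deficit is positive; an exhausted class has its deficit topped up by its
 * quantum and waits for the next round.  Types and names are hypothetical.
 */
struct example_wrr_class {
	long deficit;
	long quantum;
};

/* Returns 1 if the class may send a packet of the given length now (and
 * charges its deficit), 0 if it must wait for the next round. */
static int example_wrr_try_send(struct example_wrr_class *cl, unsigned int pktlen)
{
	if (cl->deficit <= 0) {
		cl->deficit += cl->quantum;	/* allotment for the next round */
		return 0;
	}
	cl->deficit -= pktlen;			/* charge this transmission */
	return 1;
}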
957 static __inline__ struct sk_buff *
958 cbq_dequeue_1(struct Qdisc *sch)
960 struct cbq_sched_data *q = qdisc_priv(sch);
961 struct sk_buff *skb;
962 unsigned activemask;
964 activemask = q->activemask&0xFF;
965 while (activemask) {
966 int prio = ffz(~activemask);
967 activemask &= ~(1<<prio);
968 skb = cbq_dequeue_prio(sch, prio);
969 if (skb)
970 return skb;
972 return NULL;
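/*
 * Illustrative sketch, not part of this file: ffz(~mask) in cbq_dequeue_1()
 * (and cbq_undelay()) yields the index of the lowest set bit of 'mask',
 * i.e. the most urgent active priority band.  A standalone equivalent using
 * the GCC builtin; the helper name is hypothetical.
 */
static inline int example_lowest_set_bit(unsigned int mask)
{
	return mask ? __builtin_ctz(mask) : -1;	/* -1 when no band is active */
}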
975 static struct sk_buff *
976 cbq_dequeue(struct Qdisc *sch)
978 struct sk_buff *skb;
979 struct cbq_sched_data *q = qdisc_priv(sch);
980 psched_time_t now;
981 psched_tdiff_t incr;
983 now = psched_get_time();
984 incr = now - q->now_rt;
986 if (q->tx_class) {
987 psched_tdiff_t incr2;
988            /* Time integrator. We calculate EOS time
989               by adding the expected packet transmission time.
990               If real time is greater, we warp the artificial clock,
991               so that:
993 cbq_time = max(real_time, work);
995 incr2 = L2T(&q->link, q->tx_len);
996 q->now += incr2;
997 cbq_update(q);
998 if ((incr -= incr2) < 0)
999 incr = 0;
1001 q->now += incr;
1002 q->now_rt = now;
1004 for (;;) {
1005 q->wd_expires = 0;
1007 skb = cbq_dequeue_1(sch);
1008 if (skb) {
1009 sch->q.qlen--;
1010 sch->flags &= ~TCQ_F_THROTTLED;
1011 return skb;
1014            /* All the classes are overlimit.
1016               This is possible if:
1018               1. The scheduler is empty.
1019               2. The toplevel cutoff inhibited borrowing.
1020               3. The root class is overlimit.
1022               Reset the 2nd and 3rd conditions and retry.
1024               Note that NS and cbq-2.0 are buggy here: picking
1025               an arbitrary class is appropriate for ancestor-only
1026               sharing, but not for the toplevel algorithm.
1028               Our version is better but slower, because it requires
1029               two passes; that is unavoidable with top-level sharing.
1032 if (q->toplevel == TC_CBQ_MAXLEVEL &&
1033 q->link.undertime == PSCHED_PASTPERFECT)
1034 break;
1036 q->toplevel = TC_CBQ_MAXLEVEL;
1037 q->link.undertime = PSCHED_PASTPERFECT;
1040 /* No packets in scheduler or nobody wants to give them to us :-(
1041 Sigh... start watchdog timer in the last case. */
1043 if (sch->q.qlen) {
1044 sch->qstats.overlimits++;
1045 if (q->wd_expires)
1046 qdisc_watchdog_schedule(&q->watchdog,
1047 now + q->wd_expires);
1049 return NULL;
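/*
 * Illustrative sketch, not part of this file: the time integrator in
 * cbq_dequeue() advances the artificial clock by whichever is larger, the
 * real time elapsed or the expected transmission time of the last packet,
 * i.e. cbq_time = max(real_time, work).  Types and names are hypothetical.
 */
static void example_integrate_clock(long long *cbq_now, long long *last_rt,
				    long long real_now, long long tx_time)
{
	long long incr = real_now - *last_rt;	/* real time elapsed */

	if (incr < tx_time)
		incr = tx_time;			/* never fall behind the work done */
	*cbq_now += incr;
	*last_rt = real_now;
}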
1052 /* CBQ class maintenance routines */
1054 static void cbq_adjust_levels(struct cbq_class *this)
1056 if (this == NULL)
1057 return;
1059 do {
1060 int level = 0;
1061 struct cbq_class *cl;
1063 if ((cl = this->children) != NULL) {
1064 do {
1065 if (cl->level > level)
1066 level = cl->level;
1067 } while ((cl = cl->sibling) != this->children);
1069 this->level = level+1;
1070 } while ((this = this->tparent) != NULL);
1073 static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
1075 struct cbq_class *cl;
1076 struct hlist_node *n;
1077 unsigned int h;
1079 if (q->quanta[prio] == 0)
1080 return;
1082 for (h = 0; h < q->clhash.hashsize; h++) {
1083 hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) {
1084                    /* BUGGGG... Beware! This expression suffers from
1085 arithmetic overflows!
1087 if (cl->priority == prio) {
1088 cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
1089 q->quanta[prio];
1091 if (cl->quantum <= 0 || cl->quantum>32*qdisc_dev(cl->qdisc)->mtu) {
1092 printk(KERN_WARNING "CBQ: class %08x has bad quantum==%ld, repaired.\n", cl->common.classid, cl->quantum);
1093 cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
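/*
 * Illustrative sketch, not part of this file: cbq_normalize_quanta() computes
 *     quantum = weight * allot * nclasses / sum_of_weights
 * in 32-bit arithmetic, which the comment above warns may overflow.  A
 * hypothetical overflow-safer variant widens the intermediate product:
 */
static long example_normalized_quantum(long weight, long allot, int nclasses,
				       unsigned int sum_of_weights)
{
	unsigned long long num = (unsigned long long)weight * allot * nclasses;

	return sum_of_weights ? (long)(num / sum_of_weights) : 0;
}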
1099 static void cbq_sync_defmap(struct cbq_class *cl)
1101 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1102 struct cbq_class *split = cl->split;
1103 unsigned h;
1104 int i;
1106 if (split == NULL)
1107 return;
1109 for (i=0; i<=TC_PRIO_MAX; i++) {
1110 if (split->defaults[i] == cl && !(cl->defmap&(1<<i)))
1111 split->defaults[i] = NULL;
1114 for (i=0; i<=TC_PRIO_MAX; i++) {
1115 int level = split->level;
1117 if (split->defaults[i])
1118 continue;
1120 for (h = 0; h < q->clhash.hashsize; h++) {
1121 struct hlist_node *n;
1122 struct cbq_class *c;
1124 hlist_for_each_entry(c, n, &q->clhash.hash[h],
1125 common.hnode) {
1126 if (c->split == split && c->level < level &&
1127 c->defmap&(1<<i)) {
1128 split->defaults[i] = c;
1129 level = c->level;
1136 static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
1138 struct cbq_class *split = NULL;
1140 if (splitid == 0) {
1141 if ((split = cl->split) == NULL)
1142 return;
1143 splitid = split->common.classid;
1146 if (split == NULL || split->common.classid != splitid) {
1147 for (split = cl->tparent; split; split = split->tparent)
1148 if (split->common.classid == splitid)
1149 break;
1152 if (split == NULL)
1153 return;
1155 if (cl->split != split) {
1156 cl->defmap = 0;
1157 cbq_sync_defmap(cl);
1158 cl->split = split;
1159 cl->defmap = def&mask;
1160 } else
1161 cl->defmap = (cl->defmap&~mask)|(def&mask);
1163 cbq_sync_defmap(cl);
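/*
 * Illustrative sketch, not part of this file: the defmap update in
 * cbq_change_defmap() keeps the bits outside 'mask' and replaces the bits
 * inside it with the corresponding bits of 'def'.  The helper name is
 * hypothetical.
 */
static inline unsigned int example_masked_update(unsigned int old,
						 unsigned int def,
						 unsigned int mask)
{
	return (old & ~mask) | (def & mask);
}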
1166 static void cbq_unlink_class(struct cbq_class *this)
1168 struct cbq_class *cl, **clp;
1169 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1171 qdisc_class_hash_remove(&q->clhash, &this->common);
1173 if (this->tparent) {
1174 clp=&this->sibling;
1175 cl = *clp;
1176 do {
1177 if (cl == this) {
1178 *clp = cl->sibling;
1179 break;
1181 clp = &cl->sibling;
1182 } while ((cl = *clp) != this->sibling);
1184 if (this->tparent->children == this) {
1185 this->tparent->children = this->sibling;
1186 if (this->sibling == this)
1187 this->tparent->children = NULL;
1189 } else {
1190 WARN_ON(this->sibling != this);
1194 static void cbq_link_class(struct cbq_class *this)
1196 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1197 struct cbq_class *parent = this->tparent;
1199 this->sibling = this;
1200 qdisc_class_hash_insert(&q->clhash, &this->common);
1202 if (parent == NULL)
1203 return;
1205 if (parent->children == NULL) {
1206 parent->children = this;
1207 } else {
1208 this->sibling = parent->children->sibling;
1209 parent->children->sibling = this;
1213 static unsigned int cbq_drop(struct Qdisc* sch)
1215 struct cbq_sched_data *q = qdisc_priv(sch);
1216 struct cbq_class *cl, *cl_head;
1217 int prio;
1218 unsigned int len;
1220 for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) {
1221 if ((cl_head = q->active[prio]) == NULL)
1222 continue;
1224 cl = cl_head;
1225 do {
1226 if (cl->q->ops->drop && (len = cl->q->ops->drop(cl->q))) {
1227 sch->q.qlen--;
1228 if (!cl->q->q.qlen)
1229 cbq_deactivate_class(cl);
1230 return len;
1232 } while ((cl = cl->next_alive) != cl_head);
1234 return 0;
1237 static void
1238 cbq_reset(struct Qdisc* sch)
1240 struct cbq_sched_data *q = qdisc_priv(sch);
1241 struct cbq_class *cl;
1242 struct hlist_node *n;
1243 int prio;
1244 unsigned h;
1246 q->activemask = 0;
1247 q->pmask = 0;
1248 q->tx_class = NULL;
1249 q->tx_borrowed = NULL;
1250 qdisc_watchdog_cancel(&q->watchdog);
1251 hrtimer_cancel(&q->delay_timer);
1252 q->toplevel = TC_CBQ_MAXLEVEL;
1253 q->now = psched_get_time();
1254 q->now_rt = q->now;
1256 for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
1257 q->active[prio] = NULL;
1259 for (h = 0; h < q->clhash.hashsize; h++) {
1260 hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) {
1261 qdisc_reset(cl->q);
1263 cl->next_alive = NULL;
1264 cl->undertime = PSCHED_PASTPERFECT;
1265 cl->avgidle = cl->maxidle;
1266 cl->deficit = cl->quantum;
1267 cl->cpriority = cl->priority;
1270 sch->q.qlen = 0;
1274 static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
1276 if (lss->change&TCF_CBQ_LSS_FLAGS) {
1277 cl->share = (lss->flags&TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
1278 cl->borrow = (lss->flags&TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
1280 if (lss->change&TCF_CBQ_LSS_EWMA)
1281 cl->ewma_log = lss->ewma_log;
1282 if (lss->change&TCF_CBQ_LSS_AVPKT)
1283 cl->avpkt = lss->avpkt;
1284 if (lss->change&TCF_CBQ_LSS_MINIDLE)
1285 cl->minidle = -(long)lss->minidle;
1286 if (lss->change&TCF_CBQ_LSS_MAXIDLE) {
1287 cl->maxidle = lss->maxidle;
1288 cl->avgidle = lss->maxidle;
1290 if (lss->change&TCF_CBQ_LSS_OFFTIME)
1291 cl->offtime = lss->offtime;
1292 return 0;
1295 static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1297 q->nclasses[cl->priority]--;
1298 q->quanta[cl->priority] -= cl->weight;
1299 cbq_normalize_quanta(q, cl->priority);
1302 static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1304 q->nclasses[cl->priority]++;
1305 q->quanta[cl->priority] += cl->weight;
1306 cbq_normalize_quanta(q, cl->priority);
1309 static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1311 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1313 if (wrr->allot)
1314 cl->allot = wrr->allot;
1315 if (wrr->weight)
1316 cl->weight = wrr->weight;
1317 if (wrr->priority) {
1318 cl->priority = wrr->priority-1;
1319 cl->cpriority = cl->priority;
1320 if (cl->priority >= cl->priority2)
1321 cl->priority2 = TC_CBQ_MAXPRIO-1;
1324 cbq_addprio(q, cl);
1325 return 0;
1328 static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl)
1330 switch (ovl->strategy) {
1331 case TC_CBQ_OVL_CLASSIC:
1332 cl->overlimit = cbq_ovl_classic;
1333 break;
1334 case TC_CBQ_OVL_DELAY:
1335 cl->overlimit = cbq_ovl_delay;
1336 break;
1337 case TC_CBQ_OVL_LOWPRIO:
1338 if (ovl->priority2-1 >= TC_CBQ_MAXPRIO ||
1339 ovl->priority2-1 <= cl->priority)
1340 return -EINVAL;
1341 cl->priority2 = ovl->priority2-1;
1342 cl->overlimit = cbq_ovl_lowprio;
1343 break;
1344 case TC_CBQ_OVL_DROP:
1345 cl->overlimit = cbq_ovl_drop;
1346 break;
1347 case TC_CBQ_OVL_RCLASSIC:
1348 cl->overlimit = cbq_ovl_rclassic;
1349 break;
1350 default:
1351 return -EINVAL;
1353 cl->penalty = ovl->penalty;
1354 return 0;
1357 #ifdef CONFIG_NET_CLS_ACT
1358 static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p)
1360 cl->police = p->police;
1362 if (cl->q->handle) {
1363 if (p->police == TC_POLICE_RECLASSIFY)
1364 cl->q->reshape_fail = cbq_reshape_fail;
1365 else
1366 cl->q->reshape_fail = NULL;
1368 return 0;
1370 #endif
1372 static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1374 cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1375 return 0;
1378 static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
1379 [TCA_CBQ_LSSOPT] = { .len = sizeof(struct tc_cbq_lssopt) },
1380 [TCA_CBQ_WRROPT] = { .len = sizeof(struct tc_cbq_wrropt) },
1381 [TCA_CBQ_FOPT] = { .len = sizeof(struct tc_cbq_fopt) },
1382 [TCA_CBQ_OVL_STRATEGY] = { .len = sizeof(struct tc_cbq_ovl) },
1383 [TCA_CBQ_RATE] = { .len = sizeof(struct tc_ratespec) },
1384 [TCA_CBQ_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1385 [TCA_CBQ_POLICE] = { .len = sizeof(struct tc_cbq_police) },
1388 static int cbq_init(struct Qdisc *sch, struct nlattr *opt)
1390 struct cbq_sched_data *q = qdisc_priv(sch);
1391 struct nlattr *tb[TCA_CBQ_MAX + 1];
1392 struct tc_ratespec *r;
1393 int err;
1395 err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy);
1396 if (err < 0)
1397 return err;
1399 if (tb[TCA_CBQ_RTAB] == NULL || tb[TCA_CBQ_RATE] == NULL)
1400 return -EINVAL;
1402 r = nla_data(tb[TCA_CBQ_RATE]);
1404 if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB])) == NULL)
1405 return -EINVAL;
1407 err = qdisc_class_hash_init(&q->clhash);
1408 if (err < 0)
1409 goto put_rtab;
1411 q->link.refcnt = 1;
1412 q->link.sibling = &q->link;
1413 q->link.common.classid = sch->handle;
1414 q->link.qdisc = sch;
1415 if (!(q->link.q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
1416 &pfifo_qdisc_ops,
1417 sch->handle)))
1418 q->link.q = &noop_qdisc;
1420 q->link.priority = TC_CBQ_MAXPRIO-1;
1421 q->link.priority2 = TC_CBQ_MAXPRIO-1;
1422 q->link.cpriority = TC_CBQ_MAXPRIO-1;
1423 q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC;
1424 q->link.overlimit = cbq_ovl_classic;
1425 q->link.allot = psched_mtu(qdisc_dev(sch));
1426 q->link.quantum = q->link.allot;
1427 q->link.weight = q->link.R_tab->rate.rate;
1429 q->link.ewma_log = TC_CBQ_DEF_EWMA;
1430 q->link.avpkt = q->link.allot/2;
1431 q->link.minidle = -0x7FFFFFFF;
1433 qdisc_watchdog_init(&q->watchdog, sch);
1434 hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
1435 q->delay_timer.function = cbq_undelay;
1436 q->toplevel = TC_CBQ_MAXLEVEL;
1437 q->now = psched_get_time();
1438 q->now_rt = q->now;
1440 cbq_link_class(&q->link);
1442 if (tb[TCA_CBQ_LSSOPT])
1443 cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));
1445 cbq_addprio(q, &q->link);
1446 return 0;
1448 put_rtab:
1449 qdisc_put_rtab(q->link.R_tab);
1450 return err;
1453 static __inline__ int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1455 unsigned char *b = skb_tail_pointer(skb);
1457 NLA_PUT(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate);
1458 return skb->len;
1460 nla_put_failure:
1461 nlmsg_trim(skb, b);
1462 return -1;
1465 static __inline__ int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1467 unsigned char *b = skb_tail_pointer(skb);
1468 struct tc_cbq_lssopt opt;
1470 opt.flags = 0;
1471 if (cl->borrow == NULL)
1472 opt.flags |= TCF_CBQ_LSS_BOUNDED;
1473 if (cl->share == NULL)
1474 opt.flags |= TCF_CBQ_LSS_ISOLATED;
1475 opt.ewma_log = cl->ewma_log;
1476 opt.level = cl->level;
1477 opt.avpkt = cl->avpkt;
1478 opt.maxidle = cl->maxidle;
1479 opt.minidle = (u32)(-cl->minidle);
1480 opt.offtime = cl->offtime;
1481 opt.change = ~0;
1482 NLA_PUT(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt);
1483 return skb->len;
1485 nla_put_failure:
1486 nlmsg_trim(skb, b);
1487 return -1;
1490 static __inline__ int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1492 unsigned char *b = skb_tail_pointer(skb);
1493 struct tc_cbq_wrropt opt;
1495 opt.flags = 0;
1496 opt.allot = cl->allot;
1497 opt.priority = cl->priority+1;
1498 opt.cpriority = cl->cpriority+1;
1499 opt.weight = cl->weight;
1500 NLA_PUT(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt);
1501 return skb->len;
1503 nla_put_failure:
1504 nlmsg_trim(skb, b);
1505 return -1;
1508 static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl)
1510 unsigned char *b = skb_tail_pointer(skb);
1511 struct tc_cbq_ovl opt;
1513 opt.strategy = cl->ovl_strategy;
1514 opt.priority2 = cl->priority2+1;
1515 opt.pad = 0;
1516 opt.penalty = cl->penalty;
1517 NLA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt);
1518 return skb->len;
1520 nla_put_failure:
1521 nlmsg_trim(skb, b);
1522 return -1;
1525 static __inline__ int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1527 unsigned char *b = skb_tail_pointer(skb);
1528 struct tc_cbq_fopt opt;
1530 if (cl->split || cl->defmap) {
1531 opt.split = cl->split ? cl->split->common.classid : 0;
1532 opt.defmap = cl->defmap;
1533 opt.defchange = ~0;
1534 NLA_PUT(skb, TCA_CBQ_FOPT, sizeof(opt), &opt);
1536 return skb->len;
1538 nla_put_failure:
1539 nlmsg_trim(skb, b);
1540 return -1;
1543 #ifdef CONFIG_NET_CLS_ACT
1544 static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)
1546 unsigned char *b = skb_tail_pointer(skb);
1547 struct tc_cbq_police opt;
1549 if (cl->police) {
1550 opt.police = cl->police;
1551 opt.__res1 = 0;
1552 opt.__res2 = 0;
1553 NLA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt);
1555 return skb->len;
1557 nla_put_failure:
1558 nlmsg_trim(skb, b);
1559 return -1;
1561 #endif
1563 static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
1565 if (cbq_dump_lss(skb, cl) < 0 ||
1566 cbq_dump_rate(skb, cl) < 0 ||
1567 cbq_dump_wrr(skb, cl) < 0 ||
1568 cbq_dump_ovl(skb, cl) < 0 ||
1569 #ifdef CONFIG_NET_CLS_ACT
1570 cbq_dump_police(skb, cl) < 0 ||
1571 #endif
1572 cbq_dump_fopt(skb, cl) < 0)
1573 return -1;
1574 return 0;
1577 static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
1579 struct cbq_sched_data *q = qdisc_priv(sch);
1580 struct nlattr *nest;
1582 nest = nla_nest_start(skb, TCA_OPTIONS);
1583 if (nest == NULL)
1584 goto nla_put_failure;
1585 if (cbq_dump_attr(skb, &q->link) < 0)
1586 goto nla_put_failure;
1587 nla_nest_end(skb, nest);
1588 return skb->len;
1590 nla_put_failure:
1591 nla_nest_cancel(skb, nest);
1592 return -1;
1595 static int
1596 cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1598 struct cbq_sched_data *q = qdisc_priv(sch);
1600 q->link.xstats.avgidle = q->link.avgidle;
1601 return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
1604 static int
1605 cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1606 struct sk_buff *skb, struct tcmsg *tcm)
1608 struct cbq_class *cl = (struct cbq_class*)arg;
1609 struct nlattr *nest;
1611 if (cl->tparent)
1612 tcm->tcm_parent = cl->tparent->common.classid;
1613 else
1614 tcm->tcm_parent = TC_H_ROOT;
1615 tcm->tcm_handle = cl->common.classid;
1616 tcm->tcm_info = cl->q->handle;
1618 nest = nla_nest_start(skb, TCA_OPTIONS);
1619 if (nest == NULL)
1620 goto nla_put_failure;
1621 if (cbq_dump_attr(skb, cl) < 0)
1622 goto nla_put_failure;
1623 nla_nest_end(skb, nest);
1624 return skb->len;
1626 nla_put_failure:
1627 nla_nest_cancel(skb, nest);
1628 return -1;
1631 static int
1632 cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1633 struct gnet_dump *d)
1635 struct cbq_sched_data *q = qdisc_priv(sch);
1636 struct cbq_class *cl = (struct cbq_class*)arg;
1638 cl->qstats.qlen = cl->q->q.qlen;
1639 cl->xstats.avgidle = cl->avgidle;
1640 cl->xstats.undertime = 0;
1642 if (cl->undertime != PSCHED_PASTPERFECT)
1643 cl->xstats.undertime = cl->undertime - q->now;
1645 if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
1646 gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1647 gnet_stats_copy_queue(d, &cl->qstats) < 0)
1648 return -1;
1650 return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1653 static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1654 struct Qdisc **old)
1656 struct cbq_class *cl = (struct cbq_class*)arg;
1658 if (cl) {
1659 if (new == NULL) {
1660 new = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
1661 &pfifo_qdisc_ops,
1662 cl->common.classid);
1663 if (new == NULL)
1664 return -ENOBUFS;
1665 } else {
1666 #ifdef CONFIG_NET_CLS_ACT
1667 if (cl->police == TC_POLICE_RECLASSIFY)
1668 new->reshape_fail = cbq_reshape_fail;
1669 #endif
1671 sch_tree_lock(sch);
1672 *old = xchg(&cl->q, new);
1673 qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
1674 qdisc_reset(*old);
1675 sch_tree_unlock(sch);
1677 return 0;
1679 return -ENOENT;
1682 static struct Qdisc *
1683 cbq_leaf(struct Qdisc *sch, unsigned long arg)
1685 struct cbq_class *cl = (struct cbq_class*)arg;
1687 return cl ? cl->q : NULL;
1690 static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
1692 struct cbq_class *cl = (struct cbq_class *)arg;
1694 if (cl->q->q.qlen == 0)
1695 cbq_deactivate_class(cl);
1698 static unsigned long cbq_get(struct Qdisc *sch, u32 classid)
1700 struct cbq_sched_data *q = qdisc_priv(sch);
1701 struct cbq_class *cl = cbq_class_lookup(q, classid);
1703 if (cl) {
1704 cl->refcnt++;
1705 return (unsigned long)cl;
1707 return 0;
1710 static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1712 struct cbq_sched_data *q = qdisc_priv(sch);
1714 WARN_ON(cl->filters);
1716 tcf_destroy_chain(&cl->filter_list);
1717 qdisc_destroy(cl->q);
1718 qdisc_put_rtab(cl->R_tab);
1719 gen_kill_estimator(&cl->bstats, &cl->rate_est);
1720 if (cl != &q->link)
1721 kfree(cl);
1724 static void
1725 cbq_destroy(struct Qdisc* sch)
1727 struct cbq_sched_data *q = qdisc_priv(sch);
1728 struct hlist_node *n, *next;
1729 struct cbq_class *cl;
1730 unsigned h;
1732 #ifdef CONFIG_NET_CLS_ACT
1733 q->rx_class = NULL;
1734 #endif
1736 * Filters must be destroyed first because we don't destroy the
1737  * classes from root to leaves, which means that filters can still
1738 * be bound to classes which have been destroyed already. --TGR '04
1740 for (h = 0; h < q->clhash.hashsize; h++) {
1741 hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode)
1742 tcf_destroy_chain(&cl->filter_list);
1744 for (h = 0; h < q->clhash.hashsize; h++) {
1745 hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[h],
1746 common.hnode)
1747 cbq_destroy_class(sch, cl);
1749 qdisc_class_hash_destroy(&q->clhash);
1752 static void cbq_put(struct Qdisc *sch, unsigned long arg)
1754 struct cbq_class *cl = (struct cbq_class*)arg;
1756 if (--cl->refcnt == 0) {
1757 #ifdef CONFIG_NET_CLS_ACT
1758 spinlock_t *root_lock = qdisc_root_sleeping_lock(sch);
1759 struct cbq_sched_data *q = qdisc_priv(sch);
1761 spin_lock_bh(root_lock);
1762 if (q->rx_class == cl)
1763 q->rx_class = NULL;
1764 spin_unlock_bh(root_lock);
1765 #endif
1767 cbq_destroy_class(sch, cl);
1771 static int
1772 cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
1773 unsigned long *arg)
1775 int err;
1776 struct cbq_sched_data *q = qdisc_priv(sch);
1777 struct cbq_class *cl = (struct cbq_class*)*arg;
1778 struct nlattr *opt = tca[TCA_OPTIONS];
1779 struct nlattr *tb[TCA_CBQ_MAX + 1];
1780 struct cbq_class *parent;
1781 struct qdisc_rate_table *rtab = NULL;
1783 if (opt == NULL)
1784 return -EINVAL;
1786 err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy);
1787 if (err < 0)
1788 return err;
1790 if (cl) {
1791 /* Check parent */
1792 if (parentid) {
1793 if (cl->tparent &&
1794 cl->tparent->common.classid != parentid)
1795 return -EINVAL;
1796 if (!cl->tparent && parentid != TC_H_ROOT)
1797 return -EINVAL;
1800 if (tb[TCA_CBQ_RATE]) {
1801 rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB]);
1802 if (rtab == NULL)
1803 return -EINVAL;
1806 /* Change class parameters */
1807 sch_tree_lock(sch);
1809 if (cl->next_alive != NULL)
1810 cbq_deactivate_class(cl);
1812 if (rtab) {
1813 rtab = xchg(&cl->R_tab, rtab);
1814 qdisc_put_rtab(rtab);
1817 if (tb[TCA_CBQ_LSSOPT])
1818 cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1820 if (tb[TCA_CBQ_WRROPT]) {
1821 cbq_rmprio(q, cl);
1822 cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1825 if (tb[TCA_CBQ_OVL_STRATEGY])
1826 cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY]));
1828 #ifdef CONFIG_NET_CLS_ACT
1829 if (tb[TCA_CBQ_POLICE])
1830 cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE]));
1831 #endif
1833 if (tb[TCA_CBQ_FOPT])
1834 cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1836 if (cl->q->q.qlen)
1837 cbq_activate_class(cl);
1839 sch_tree_unlock(sch);
1841 if (tca[TCA_RATE])
1842 gen_replace_estimator(&cl->bstats, &cl->rate_est,
1843 qdisc_root_sleeping_lock(sch),
1844 tca[TCA_RATE]);
1845 return 0;
1848 if (parentid == TC_H_ROOT)
1849 return -EINVAL;
1851 if (tb[TCA_CBQ_WRROPT] == NULL || tb[TCA_CBQ_RATE] == NULL ||
1852 tb[TCA_CBQ_LSSOPT] == NULL)
1853 return -EINVAL;
1855 rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB]);
1856 if (rtab == NULL)
1857 return -EINVAL;
1859 if (classid) {
1860 err = -EINVAL;
1861 if (TC_H_MAJ(classid^sch->handle) || cbq_class_lookup(q, classid))
1862 goto failure;
1863 } else {
1864 int i;
1865 classid = TC_H_MAKE(sch->handle,0x8000);
1867 for (i=0; i<0x8000; i++) {
1868 if (++q->hgenerator >= 0x8000)
1869 q->hgenerator = 1;
1870 if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1871 break;
1873 err = -ENOSR;
1874 if (i >= 0x8000)
1875 goto failure;
1876 classid = classid|q->hgenerator;
1879 parent = &q->link;
1880 if (parentid) {
1881 parent = cbq_class_lookup(q, parentid);
1882 err = -EINVAL;
1883 if (parent == NULL)
1884 goto failure;
1887 err = -ENOBUFS;
1888 cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1889 if (cl == NULL)
1890 goto failure;
1891 cl->R_tab = rtab;
1892 rtab = NULL;
1893 cl->refcnt = 1;
1894 if (!(cl->q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
1895 &pfifo_qdisc_ops, classid)))
1896 cl->q = &noop_qdisc;
1897 cl->common.classid = classid;
1898 cl->tparent = parent;
1899 cl->qdisc = sch;
1900 cl->allot = parent->allot;
1901 cl->quantum = cl->allot;
1902 cl->weight = cl->R_tab->rate.rate;
1904 sch_tree_lock(sch);
1905 cbq_link_class(cl);
1906 cl->borrow = cl->tparent;
1907 if (cl->tparent != &q->link)
1908 cl->share = cl->tparent;
1909 cbq_adjust_levels(parent);
1910 cl->minidle = -0x7FFFFFFF;
1911 cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1912 cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1913 if (cl->ewma_log==0)
1914 cl->ewma_log = q->link.ewma_log;
1915 if (cl->maxidle==0)
1916 cl->maxidle = q->link.maxidle;
1917 if (cl->avpkt==0)
1918 cl->avpkt = q->link.avpkt;
1919 cl->overlimit = cbq_ovl_classic;
1920 if (tb[TCA_CBQ_OVL_STRATEGY])
1921 cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY]));
1922 #ifdef CONFIG_NET_CLS_ACT
1923 if (tb[TCA_CBQ_POLICE])
1924 cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE]));
1925 #endif
1926 if (tb[TCA_CBQ_FOPT])
1927 cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1928 sch_tree_unlock(sch);
1930 qdisc_class_hash_grow(sch, &q->clhash);
1932 if (tca[TCA_RATE])
1933 gen_new_estimator(&cl->bstats, &cl->rate_est,
1934 qdisc_root_sleeping_lock(sch), tca[TCA_RATE]);
1936 *arg = (unsigned long)cl;
1937 return 0;
1939 failure:
1940 qdisc_put_rtab(rtab);
1941 return err;
1944 static int cbq_delete(struct Qdisc *sch, unsigned long arg)
1946 struct cbq_sched_data *q = qdisc_priv(sch);
1947 struct cbq_class *cl = (struct cbq_class*)arg;
1948 unsigned int qlen;
1950 if (cl->filters || cl->children || cl == &q->link)
1951 return -EBUSY;
1953 sch_tree_lock(sch);
1955 qlen = cl->q->q.qlen;
1956 qdisc_reset(cl->q);
1957 qdisc_tree_decrease_qlen(cl->q, qlen);
1959 if (cl->next_alive)
1960 cbq_deactivate_class(cl);
1962 if (q->tx_borrowed == cl)
1963 q->tx_borrowed = q->tx_class;
1964 if (q->tx_class == cl) {
1965 q->tx_class = NULL;
1966 q->tx_borrowed = NULL;
1968 #ifdef CONFIG_NET_CLS_ACT
1969 if (q->rx_class == cl)
1970 q->rx_class = NULL;
1971 #endif
1973 cbq_unlink_class(cl);
1974 cbq_adjust_levels(cl->tparent);
1975 cl->defmap = 0;
1976 cbq_sync_defmap(cl);
1978 cbq_rmprio(q, cl);
1979 sch_tree_unlock(sch);
1981 if (--cl->refcnt == 0)
1982 cbq_destroy_class(sch, cl);
1984 return 0;
1987 static struct tcf_proto **cbq_find_tcf(struct Qdisc *sch, unsigned long arg)
1989 struct cbq_sched_data *q = qdisc_priv(sch);
1990 struct cbq_class *cl = (struct cbq_class *)arg;
1992 if (cl == NULL)
1993 cl = &q->link;
1995 return &cl->filter_list;
1998 static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
1999 u32 classid)
2001 struct cbq_sched_data *q = qdisc_priv(sch);
2002 struct cbq_class *p = (struct cbq_class*)parent;
2003 struct cbq_class *cl = cbq_class_lookup(q, classid);
2005 if (cl) {
2006 if (p && p->level <= cl->level)
2007 return 0;
2008 cl->filters++;
2009 return (unsigned long)cl;
2011 return 0;
2014 static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
2016 struct cbq_class *cl = (struct cbq_class*)arg;
2018 cl->filters--;
2021 static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
2023 struct cbq_sched_data *q = qdisc_priv(sch);
2024 struct cbq_class *cl;
2025 struct hlist_node *n;
2026 unsigned h;
2028 if (arg->stop)
2029 return;
2031 for (h = 0; h < q->clhash.hashsize; h++) {
2032 hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) {
2033 if (arg->count < arg->skip) {
2034 arg->count++;
2035 continue;
2037 if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
2038 arg->stop = 1;
2039 return;
2041 arg->count++;
2046 static const struct Qdisc_class_ops cbq_class_ops = {
2047 .graft = cbq_graft,
2048 .leaf = cbq_leaf,
2049 .qlen_notify = cbq_qlen_notify,
2050 .get = cbq_get,
2051 .put = cbq_put,
2052 .change = cbq_change_class,
2053 .delete = cbq_delete,
2054 .walk = cbq_walk,
2055 .tcf_chain = cbq_find_tcf,
2056 .bind_tcf = cbq_bind_filter,
2057 .unbind_tcf = cbq_unbind_filter,
2058 .dump = cbq_dump_class,
2059 .dump_stats = cbq_dump_class_stats,
2062 static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
2063 .next = NULL,
2064 .cl_ops = &cbq_class_ops,
2065 .id = "cbq",
2066 .priv_size = sizeof(struct cbq_sched_data),
2067 .enqueue = cbq_enqueue,
2068 .dequeue = cbq_dequeue,
2069 .requeue = cbq_requeue,
2070 .drop = cbq_drop,
2071 .init = cbq_init,
2072 .reset = cbq_reset,
2073 .destroy = cbq_destroy,
2074 .change = NULL,
2075 .dump = cbq_dump,
2076 .dump_stats = cbq_dump_stats,
2077 .owner = THIS_MODULE,
2080 static int __init cbq_module_init(void)
2082 return register_qdisc(&cbq_qdisc_ops);
2084 static void __exit cbq_module_exit(void)
2086 unregister_qdisc(&cbq_qdisc_ops);
2088 module_init(cbq_module_init)
2089 module_exit(cbq_module_exit)
2090 MODULE_LICENSE("GPL");