ACPI: thinkpad-acpi: add development version tag
[linux-2.6/linux-acpi-2.6/ibm-acpi-2.6.git] / net / sched / sch_cbq.c
blob8b06fa9004828c937e359df4c3c0d0c0bfa0ecf3
1 /*
2 * net/sched/sch_cbq.c Class-Based Queueing discipline.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 #include <linux/module.h>
14 #include <linux/types.h>
15 #include <linux/kernel.h>
16 #include <linux/string.h>
17 #include <linux/errno.h>
18 #include <linux/skbuff.h>
19 #include <net/netlink.h>
20 #include <net/pkt_sched.h>
23 /* Class-Based Queueing (CBQ) algorithm.
24 =======================================
26 Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
27 Management Models for Packet Networks",
28 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
30 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
32 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
33 Parameters", 1996
35 [4] Sally Floyd and Michael Speer, "Experimental Results
36 for Class-Based Queueing", 1998, not published.
38 -----------------------------------------------------------------------
40 Algorithm skeleton was taken from NS simulator cbq.cc.
41 If someone wants to check this code against the LBL version,
42 he should take into account that ONLY the skeleton was borrowed,
43 the implementation is different. Particularly:
45 --- The WRR algorithm is different. Our version looks more
46 reasonable (I hope) and works when quanta are allowed to be
47 less than MTU, which is always the case when real time classes
48 have small rates. Note, that the statement of [3] is
49 incomplete, delay may actually be estimated even if class
50 per-round allotment is less than MTU. Namely, if per-round
51 allotment is W*r_i, and r_1+...+r_k = r < 1
53 delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
55 In the worst case we have IntServ estimate with D = W*r+k*MTU
56 and C = MTU*r. The proof (if correct at all) is trivial.
59 --- It seems that cbq-2.0 is not very accurate. At least, I cannot
60 interpret some places, which look like wrong translations
61 from NS. Anyone is advised to find these differences
62 and explain to me, why I am wrong 8).
64 --- Linux has no EOI event, so that we cannot estimate true class
65 idle time. Workaround is to consider the next dequeue event
66 as sign that previous packet is finished. This is wrong because of
67 internal device queueing, but on a permanently loaded link it is true.
68 Moreover, combined with clock integrator, this scheme looks
69 very close to an ideal solution. */
71 struct cbq_sched_data;
74 struct cbq_class
76 struct Qdisc_class_common common;
77 struct cbq_class *next_alive; /* next class with backlog in this priority band */
79 /* Parameters */
80 unsigned char priority; /* class priority */
81 unsigned char priority2; /* priority to be used after overlimit */
82 unsigned char ewma_log; /* time constant for idle time calculation */
83 unsigned char ovl_strategy;
84 #ifdef CONFIG_NET_CLS_ACT
85 unsigned char police;
86 #endif
88 u32 defmap;
90 /* Link-sharing scheduler parameters */
91 long maxidle; /* Class parameters: see below. */
92 long offtime;
93 long minidle;
94 u32 avpkt;
95 struct qdisc_rate_table *R_tab;
97 /* Overlimit strategy parameters */
98 void (*overlimit)(struct cbq_class *cl);
99 psched_tdiff_t penalty;
101 /* General scheduler (WRR) parameters */
102 long allot;
103 long quantum; /* Allotment per WRR round */
104 long weight; /* Relative allotment: see below */
106 struct Qdisc *qdisc; /* Ptr to CBQ discipline */
107 struct cbq_class *split; /* Ptr to split node */
108 struct cbq_class *share; /* Ptr to LS parent in the class tree */
109 struct cbq_class *tparent; /* Ptr to tree parent in the class tree */
110 struct cbq_class *borrow; /* NULL if class is bandwidth limited;
111 parent otherwise */
112 struct cbq_class *sibling; /* Sibling chain */
113 struct cbq_class *children; /* Pointer to children chain */
115 struct Qdisc *q; /* Elementary queueing discipline */
118 /* Variables */
119 unsigned char cpriority; /* Effective priority */
120 unsigned char delayed;
121 unsigned char level; /* level of the class in hierarchy:
122 0 for leaf classes, and maximal
123 level of children + 1 for nodes.
126 psched_time_t last; /* Last end of service */
127 psched_time_t undertime;
128 long avgidle;
129 long deficit; /* Saved deficit for WRR */
130 psched_time_t penalized;
131 struct gnet_stats_basic bstats;
132 struct gnet_stats_queue qstats;
133 struct gnet_stats_rate_est rate_est;
134 struct tc_cbq_xstats xstats;
136 struct tcf_proto *filter_list;
138 int refcnt;
139 int filters;
141 struct cbq_class *defaults[TC_PRIO_MAX+1];
144 struct cbq_sched_data
146 struct Qdisc_class_hash clhash; /* Hash table of all classes */
147 int nclasses[TC_CBQ_MAXPRIO+1];
148 unsigned quanta[TC_CBQ_MAXPRIO+1];
150 struct cbq_class link;
152 unsigned activemask;
153 struct cbq_class *active[TC_CBQ_MAXPRIO+1]; /* List of all classes
154 with backlog */
156 #ifdef CONFIG_NET_CLS_ACT
157 struct cbq_class *rx_class;
158 #endif
159 struct cbq_class *tx_class;
160 struct cbq_class *tx_borrowed;
161 int tx_len;
162 psched_time_t now; /* Cached timestamp */
163 psched_time_t now_rt; /* Cached real time */
164 unsigned pmask;
166 struct hrtimer delay_timer;
167 struct qdisc_watchdog watchdog; /* Watchdog timer,
168 started when CBQ has
169 backlog, but cannot
170 transmit just now */
171 psched_tdiff_t wd_expires;
172 int toplevel;
173 u32 hgenerator;
177 #define L2T(cl,len) qdisc_l2t((cl)->R_tab,len)
179 static __inline__ struct cbq_class *
180 cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
182 struct Qdisc_class_common *clc;
184 clc = qdisc_class_find(&q->clhash, classid);
185 if (clc == NULL)
186 return NULL;
187 return container_of(clc, struct cbq_class, common);
#ifdef CONFIG_NET_CLS_ACT

/* Walk up the tree parents and return the first best-effort default class
 * that is not 'this'; NULL if none exists.
 */
static struct cbq_class *
cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
{
	struct cbq_class *cl, *new;

	for (cl = this->tparent; cl; cl = cl->tparent)
		if ((new = cl->defaults[TC_PRIO_BESTEFFORT]) != NULL && new != this)
			return new;

	return NULL;
}

#endif
206 /* Classify packet. The procedure is pretty complicated, but
207 it allows us to combine link sharing and priority scheduling
208 transparently.
210 Namely, you can put link sharing rules (f.e. route based) at root of CBQ,
211 so that it resolves to split nodes. Then packets are classified
212 by logical priority, or a more specific classifier may be attached
213 to the split node.
216 static struct cbq_class *
217 cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
219 struct cbq_sched_data *q = qdisc_priv(sch);
220 struct cbq_class *head = &q->link;
221 struct cbq_class **defmap;
222 struct cbq_class *cl = NULL;
223 u32 prio = skb->priority;
224 struct tcf_result res;
227 * Step 1. If skb->priority points to one of our classes, use it.
229 if (TC_H_MAJ(prio^sch->handle) == 0 &&
230 (cl = cbq_class_lookup(q, prio)) != NULL)
231 return cl;
233 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
234 for (;;) {
235 int result = 0;
236 defmap = head->defaults;
239 * Step 2+n. Apply classifier.
241 if (!head->filter_list ||
242 (result = tc_classify_compat(skb, head->filter_list, &res)) < 0)
243 goto fallback;
245 if ((cl = (void*)res.class) == NULL) {
246 if (TC_H_MAJ(res.classid))
247 cl = cbq_class_lookup(q, res.classid);
248 else if ((cl = defmap[res.classid&TC_PRIO_MAX]) == NULL)
249 cl = defmap[TC_PRIO_BESTEFFORT];
251 if (cl == NULL || cl->level >= head->level)
252 goto fallback;
255 #ifdef CONFIG_NET_CLS_ACT
256 switch (result) {
257 case TC_ACT_QUEUED:
258 case TC_ACT_STOLEN:
259 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
260 case TC_ACT_SHOT:
261 return NULL;
262 case TC_ACT_RECLASSIFY:
263 return cbq_reclassify(skb, cl);
265 #endif
266 if (cl->level == 0)
267 return cl;
270 * Step 3+n. If classifier selected a link sharing class,
271 * apply agency specific classifier.
272 * Repeat this procdure until we hit a leaf node.
274 head = cl;
277 fallback:
278 cl = head;
281 * Step 4. No success...
283 if (TC_H_MAJ(prio) == 0 &&
284 !(cl = head->defaults[prio&TC_PRIO_MAX]) &&
285 !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
286 return head;
288 return cl;
292 A packet has just been enqueued on the empty class.
293 cbq_activate_class adds it to the tail of active class list
294 of its priority band.
297 static __inline__ void cbq_activate_class(struct cbq_class *cl)
299 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
300 int prio = cl->cpriority;
301 struct cbq_class *cl_tail;
303 cl_tail = q->active[prio];
304 q->active[prio] = cl;
306 if (cl_tail != NULL) {
307 cl->next_alive = cl_tail->next_alive;
308 cl_tail->next_alive = cl;
309 } else {
310 cl->next_alive = cl;
311 q->activemask |= (1<<prio);
316 Unlink class from active chain.
317 Note that this same procedure is done directly in cbq_dequeue*
318 during round-robin procedure.
321 static void cbq_deactivate_class(struct cbq_class *this)
323 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
324 int prio = this->cpriority;
325 struct cbq_class *cl;
326 struct cbq_class *cl_prev = q->active[prio];
328 do {
329 cl = cl_prev->next_alive;
330 if (cl == this) {
331 cl_prev->next_alive = cl->next_alive;
332 cl->next_alive = NULL;
334 if (cl == q->active[prio]) {
335 q->active[prio] = cl_prev;
336 if (cl == q->active[prio]) {
337 q->active[prio] = NULL;
338 q->activemask &= ~(1<<prio);
339 return;
342 return;
344 } while ((cl_prev = cl) != q->active[prio]);
347 static void
348 cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
350 int toplevel = q->toplevel;
352 if (toplevel > cl->level && !(cl->q->flags&TCQ_F_THROTTLED)) {
353 psched_time_t now;
354 psched_tdiff_t incr;
356 now = psched_get_time();
357 incr = now - q->now_rt;
358 now = q->now + incr;
360 do {
361 if (cl->undertime < now) {
362 q->toplevel = cl->level;
363 return;
365 } while ((cl=cl->borrow) != NULL && toplevel > cl->level);
369 static int
370 cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
372 struct cbq_sched_data *q = qdisc_priv(sch);
373 int uninitialized_var(ret);
374 struct cbq_class *cl = cbq_classify(skb, sch, &ret);
376 #ifdef CONFIG_NET_CLS_ACT
377 q->rx_class = cl;
378 #endif
379 if (cl == NULL) {
380 if (ret & __NET_XMIT_BYPASS)
381 sch->qstats.drops++;
382 kfree_skb(skb);
383 return ret;
386 #ifdef CONFIG_NET_CLS_ACT
387 cl->q->__parent = sch;
388 #endif
389 ret = qdisc_enqueue(skb, cl->q);
390 if (ret == NET_XMIT_SUCCESS) {
391 sch->q.qlen++;
392 sch->bstats.packets++;
393 sch->bstats.bytes += qdisc_pkt_len(skb);
394 cbq_mark_toplevel(q, cl);
395 if (!cl->next_alive)
396 cbq_activate_class(cl);
397 return ret;
400 if (net_xmit_drop_count(ret)) {
401 sch->qstats.drops++;
402 cbq_mark_toplevel(q, cl);
403 cl->qstats.drops++;
405 return ret;
408 static int
409 cbq_requeue(struct sk_buff *skb, struct Qdisc *sch)
411 struct cbq_sched_data *q = qdisc_priv(sch);
412 struct cbq_class *cl;
413 int ret;
415 if ((cl = q->tx_class) == NULL) {
416 kfree_skb(skb);
417 sch->qstats.drops++;
418 return NET_XMIT_CN;
420 q->tx_class = NULL;
422 cbq_mark_toplevel(q, cl);
424 #ifdef CONFIG_NET_CLS_ACT
425 q->rx_class = cl;
426 cl->q->__parent = sch;
427 #endif
428 if ((ret = cl->q->ops->requeue(skb, cl->q)) == 0) {
429 sch->q.qlen++;
430 sch->qstats.requeues++;
431 if (!cl->next_alive)
432 cbq_activate_class(cl);
433 return 0;
435 if (net_xmit_drop_count(ret)) {
436 sch->qstats.drops++;
437 cl->qstats.drops++;
439 return ret;
442 /* Overlimit actions */
444 /* TC_CBQ_OVL_CLASSIC: (default) penalize leaf class by adding offtime */
446 static void cbq_ovl_classic(struct cbq_class *cl)
448 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
449 psched_tdiff_t delay = cl->undertime - q->now;
451 if (!cl->delayed) {
452 delay += cl->offtime;
455 Class goes to sleep, so that it will have no
456 chance to work avgidle. Let's forgive it 8)
458 BTW cbq-2.0 has a crap in this
459 place, apparently they forgot to shift it by cl->ewma_log.
461 if (cl->avgidle < 0)
462 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
463 if (cl->avgidle < cl->minidle)
464 cl->avgidle = cl->minidle;
465 if (delay <= 0)
466 delay = 1;
467 cl->undertime = q->now + delay;
469 cl->xstats.overactions++;
470 cl->delayed = 1;
472 if (q->wd_expires == 0 || q->wd_expires > delay)
473 q->wd_expires = delay;
475 /* Dirty work! We must schedule wakeups based on
476 real available rate, rather than leaf rate,
477 which may be tiny (even zero).
479 if (q->toplevel == TC_CBQ_MAXLEVEL) {
480 struct cbq_class *b;
481 psched_tdiff_t base_delay = q->wd_expires;
483 for (b = cl->borrow; b; b = b->borrow) {
484 delay = b->undertime - q->now;
485 if (delay < base_delay) {
486 if (delay <= 0)
487 delay = 1;
488 base_delay = delay;
492 q->wd_expires = base_delay;
496 /* TC_CBQ_OVL_RCLASSIC: penalize by offtime classes in hierarchy, when
497 they go overlimit
500 static void cbq_ovl_rclassic(struct cbq_class *cl)
502 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
503 struct cbq_class *this = cl;
505 do {
506 if (cl->level > q->toplevel) {
507 cl = NULL;
508 break;
510 } while ((cl = cl->borrow) != NULL);
512 if (cl == NULL)
513 cl = this;
514 cbq_ovl_classic(cl);
517 /* TC_CBQ_OVL_DELAY: delay until it will go to underlimit */
519 static void cbq_ovl_delay(struct cbq_class *cl)
521 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
522 psched_tdiff_t delay = cl->undertime - q->now;
524 if (test_bit(__QDISC_STATE_DEACTIVATED,
525 &qdisc_root_sleeping(cl->qdisc)->state))
526 return;
528 if (!cl->delayed) {
529 psched_time_t sched = q->now;
530 ktime_t expires;
532 delay += cl->offtime;
533 if (cl->avgidle < 0)
534 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
535 if (cl->avgidle < cl->minidle)
536 cl->avgidle = cl->minidle;
537 cl->undertime = q->now + delay;
539 if (delay > 0) {
540 sched += delay + cl->penalty;
541 cl->penalized = sched;
542 cl->cpriority = TC_CBQ_MAXPRIO;
543 q->pmask |= (1<<TC_CBQ_MAXPRIO);
545 expires = ktime_set(0, 0);
546 expires = ktime_add_ns(expires, PSCHED_US2NS(sched));
547 if (hrtimer_try_to_cancel(&q->delay_timer) &&
548 ktime_to_ns(ktime_sub(q->delay_timer.expires,
549 expires)) > 0)
550 q->delay_timer.expires = expires;
551 hrtimer_restart(&q->delay_timer);
552 cl->delayed = 1;
553 cl->xstats.overactions++;
554 return;
556 delay = 1;
558 if (q->wd_expires == 0 || q->wd_expires > delay)
559 q->wd_expires = delay;
562 /* TC_CBQ_OVL_LOWPRIO: penalize class by lowering its priority band */
564 static void cbq_ovl_lowprio(struct cbq_class *cl)
566 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
568 cl->penalized = q->now + cl->penalty;
570 if (cl->cpriority != cl->priority2) {
571 cl->cpriority = cl->priority2;
572 q->pmask |= (1<<cl->cpriority);
573 cl->xstats.overactions++;
575 cbq_ovl_classic(cl);
578 /* TC_CBQ_OVL_DROP: penalize class by dropping */
580 static void cbq_ovl_drop(struct cbq_class *cl)
582 if (cl->q->ops->drop)
583 if (cl->q->ops->drop(cl->q))
584 cl->qdisc->q.qlen--;
585 cl->xstats.overactions++;
586 cbq_ovl_classic(cl);
589 static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
590 psched_time_t now)
592 struct cbq_class *cl;
593 struct cbq_class *cl_prev = q->active[prio];
594 psched_time_t sched = now;
596 if (cl_prev == NULL)
597 return 0;
599 do {
600 cl = cl_prev->next_alive;
601 if (now - cl->penalized > 0) {
602 cl_prev->next_alive = cl->next_alive;
603 cl->next_alive = NULL;
604 cl->cpriority = cl->priority;
605 cl->delayed = 0;
606 cbq_activate_class(cl);
608 if (cl == q->active[prio]) {
609 q->active[prio] = cl_prev;
610 if (cl == q->active[prio]) {
611 q->active[prio] = NULL;
612 return 0;
616 cl = cl_prev->next_alive;
617 } else if (sched - cl->penalized > 0)
618 sched = cl->penalized;
619 } while ((cl_prev = cl) != q->active[prio]);
621 return sched - now;
624 static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
626 struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
627 delay_timer);
628 struct Qdisc *sch = q->watchdog.qdisc;
629 psched_time_t now;
630 psched_tdiff_t delay = 0;
631 unsigned pmask;
633 now = psched_get_time();
635 pmask = q->pmask;
636 q->pmask = 0;
638 while (pmask) {
639 int prio = ffz(~pmask);
640 psched_tdiff_t tmp;
642 pmask &= ~(1<<prio);
644 tmp = cbq_undelay_prio(q, prio, now);
645 if (tmp > 0) {
646 q->pmask |= 1<<prio;
647 if (tmp < delay || delay == 0)
648 delay = tmp;
652 if (delay) {
653 ktime_t time;
655 time = ktime_set(0, 0);
656 time = ktime_add_ns(time, PSCHED_US2NS(now + delay));
657 hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS);
660 sch->flags &= ~TCQ_F_THROTTLED;
661 __netif_schedule(qdisc_root(sch));
662 return HRTIMER_NORESTART;
#ifdef CONFIG_NET_CLS_ACT
/* Called by a child qdisc when policing demands reclassification:
 * try to re-enqueue the skb into a reclassified target; -1 on failure.
 */
static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
{
	struct Qdisc *sch = child->__parent;
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl = q->rx_class;

	q->rx_class = NULL;

	if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) {
		int ret;

		cbq_mark_toplevel(q, cl);

		q->rx_class = cl;
		cl->q->__parent = sch;

		ret = qdisc_enqueue(skb, cl->q);
		if (ret == NET_XMIT_SUCCESS) {
			sch->q.qlen++;
			sch->bstats.packets++;
			sch->bstats.bytes += qdisc_pkt_len(skb);
			if (!cl->next_alive)
				cbq_activate_class(cl);
			return 0;
		}
		if (net_xmit_drop_count(ret))
			sch->qstats.drops++;
		return 0;
	}

	sch->qstats.drops++;
	return -1;
}
#endif
702 It is mission critical procedure.
704 We "regenerate" toplevel cutoff, if transmitting class
705 has backlog and it is not regulated. It is not part of
706 original CBQ description, but looks more reasonable.
707 Probably, it is wrong. This question needs further investigation.
710 static __inline__ void
711 cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
712 struct cbq_class *borrowed)
714 if (cl && q->toplevel >= borrowed->level) {
715 if (cl->q->q.qlen > 1) {
716 do {
717 if (borrowed->undertime == PSCHED_PASTPERFECT) {
718 q->toplevel = borrowed->level;
719 return;
721 } while ((borrowed=borrowed->borrow) != NULL);
723 #if 0
724 /* It is not necessary now. Uncommenting it
725 will save CPU cycles, but decrease fairness.
727 q->toplevel = TC_CBQ_MAXLEVEL;
728 #endif
732 static void
733 cbq_update(struct cbq_sched_data *q)
735 struct cbq_class *this = q->tx_class;
736 struct cbq_class *cl = this;
737 int len = q->tx_len;
739 q->tx_class = NULL;
741 for ( ; cl; cl = cl->share) {
742 long avgidle = cl->avgidle;
743 long idle;
745 cl->bstats.packets++;
746 cl->bstats.bytes += len;
749 (now - last) is total time between packet right edges.
750 (last_pktlen/rate) is "virtual" busy time, so that
752 idle = (now - last) - last_pktlen/rate
755 idle = q->now - cl->last;
756 if ((unsigned long)idle > 128*1024*1024) {
757 avgidle = cl->maxidle;
758 } else {
759 idle -= L2T(cl, len);
761 /* true_avgidle := (1-W)*true_avgidle + W*idle,
762 where W=2^{-ewma_log}. But cl->avgidle is scaled:
763 cl->avgidle == true_avgidle/W,
764 hence:
766 avgidle += idle - (avgidle>>cl->ewma_log);
769 if (avgidle <= 0) {
770 /* Overlimit or at-limit */
772 if (avgidle < cl->minidle)
773 avgidle = cl->minidle;
775 cl->avgidle = avgidle;
777 /* Calculate expected time, when this class
778 will be allowed to send.
779 It will occur, when:
780 (1-W)*true_avgidle + W*delay = 0, i.e.
781 idle = (1/W - 1)*(-true_avgidle)
783 idle = (1 - W)*(-cl->avgidle);
785 idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
788 That is not all.
789 To maintain the rate allocated to the class,
790 we add to undertime virtual clock,
791 necessary to complete transmitted packet.
792 (len/phys_bandwidth has been already passed
793 to the moment of cbq_update)
796 idle -= L2T(&q->link, len);
797 idle += L2T(cl, len);
799 cl->undertime = q->now + idle;
800 } else {
801 /* Underlimit */
803 cl->undertime = PSCHED_PASTPERFECT;
804 if (avgidle > cl->maxidle)
805 cl->avgidle = cl->maxidle;
806 else
807 cl->avgidle = avgidle;
809 cl->last = q->now;
812 cbq_update_toplevel(q, this, q->tx_borrowed);
815 static __inline__ struct cbq_class *
816 cbq_under_limit(struct cbq_class *cl)
818 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
819 struct cbq_class *this_cl = cl;
821 if (cl->tparent == NULL)
822 return cl;
824 if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
825 cl->delayed = 0;
826 return cl;
829 do {
830 /* It is very suspicious place. Now overlimit
831 action is generated for not bounded classes
832 only if link is completely congested.
833 Though it is in agree with ancestor-only paradigm,
834 it looks very stupid. Particularly,
835 it means that this chunk of code will either
836 never be called or result in strong amplification
837 of burstiness. Dangerous, silly, and, however,
838 no another solution exists.
840 if ((cl = cl->borrow) == NULL) {
841 this_cl->qstats.overlimits++;
842 this_cl->overlimit(this_cl);
843 return NULL;
845 if (cl->level > q->toplevel)
846 return NULL;
847 } while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);
849 cl->delayed = 0;
850 return cl;
853 static __inline__ struct sk_buff *
854 cbq_dequeue_prio(struct Qdisc *sch, int prio)
856 struct cbq_sched_data *q = qdisc_priv(sch);
857 struct cbq_class *cl_tail, *cl_prev, *cl;
858 struct sk_buff *skb;
859 int deficit;
861 cl_tail = cl_prev = q->active[prio];
862 cl = cl_prev->next_alive;
864 do {
865 deficit = 0;
867 /* Start round */
868 do {
869 struct cbq_class *borrow = cl;
871 if (cl->q->q.qlen &&
872 (borrow = cbq_under_limit(cl)) == NULL)
873 goto skip_class;
875 if (cl->deficit <= 0) {
876 /* Class exhausted its allotment per
877 this round. Switch to the next one.
879 deficit = 1;
880 cl->deficit += cl->quantum;
881 goto next_class;
884 skb = cl->q->dequeue(cl->q);
886 /* Class did not give us any skb :-(
887 It could occur even if cl->q->q.qlen != 0
888 f.e. if cl->q == "tbf"
890 if (skb == NULL)
891 goto skip_class;
893 cl->deficit -= qdisc_pkt_len(skb);
894 q->tx_class = cl;
895 q->tx_borrowed = borrow;
896 if (borrow != cl) {
897 #ifndef CBQ_XSTATS_BORROWS_BYTES
898 borrow->xstats.borrows++;
899 cl->xstats.borrows++;
900 #else
901 borrow->xstats.borrows += qdisc_pkt_len(skb);
902 cl->xstats.borrows += qdisc_pkt_len(skb);
903 #endif
905 q->tx_len = qdisc_pkt_len(skb);
907 if (cl->deficit <= 0) {
908 q->active[prio] = cl;
909 cl = cl->next_alive;
910 cl->deficit += cl->quantum;
912 return skb;
914 skip_class:
915 if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
916 /* Class is empty or penalized.
917 Unlink it from active chain.
919 cl_prev->next_alive = cl->next_alive;
920 cl->next_alive = NULL;
922 /* Did cl_tail point to it? */
923 if (cl == cl_tail) {
924 /* Repair it! */
925 cl_tail = cl_prev;
927 /* Was it the last class in this band? */
928 if (cl == cl_tail) {
929 /* Kill the band! */
930 q->active[prio] = NULL;
931 q->activemask &= ~(1<<prio);
932 if (cl->q->q.qlen)
933 cbq_activate_class(cl);
934 return NULL;
937 q->active[prio] = cl_tail;
939 if (cl->q->q.qlen)
940 cbq_activate_class(cl);
942 cl = cl_prev;
945 next_class:
946 cl_prev = cl;
947 cl = cl->next_alive;
948 } while (cl_prev != cl_tail);
949 } while (deficit);
951 q->active[prio] = cl_prev;
953 return NULL;
956 static __inline__ struct sk_buff *
957 cbq_dequeue_1(struct Qdisc *sch)
959 struct cbq_sched_data *q = qdisc_priv(sch);
960 struct sk_buff *skb;
961 unsigned activemask;
963 activemask = q->activemask&0xFF;
964 while (activemask) {
965 int prio = ffz(~activemask);
966 activemask &= ~(1<<prio);
967 skb = cbq_dequeue_prio(sch, prio);
968 if (skb)
969 return skb;
971 return NULL;
974 static struct sk_buff *
975 cbq_dequeue(struct Qdisc *sch)
977 struct sk_buff *skb;
978 struct cbq_sched_data *q = qdisc_priv(sch);
979 psched_time_t now;
980 psched_tdiff_t incr;
982 now = psched_get_time();
983 incr = now - q->now_rt;
985 if (q->tx_class) {
986 psched_tdiff_t incr2;
987 /* Time integrator. We calculate EOS time
988 by adding expected packet transmission time.
989 If real time is greater, we warp artificial clock,
990 so that:
992 cbq_time = max(real_time, work);
994 incr2 = L2T(&q->link, q->tx_len);
995 q->now += incr2;
996 cbq_update(q);
997 if ((incr -= incr2) < 0)
998 incr = 0;
1000 q->now += incr;
1001 q->now_rt = now;
1003 for (;;) {
1004 q->wd_expires = 0;
1006 skb = cbq_dequeue_1(sch);
1007 if (skb) {
1008 sch->q.qlen--;
1009 sch->flags &= ~TCQ_F_THROTTLED;
1010 return skb;
1013 /* All the classes are overlimit.
1015 It is possible, if:
1017 1. Scheduler is empty.
1018 2. Toplevel cutoff inhibited borrowing.
1019 3. Root class is overlimit.
1021 Reset 2d and 3d conditions and retry.
1023 Note, that NS and cbq-2.0 are buggy, peeking
1024 an arbitrary class is appropriate for ancestor-only
1025 sharing, but not for toplevel algorithm.
1027 Our version is better, but slower, because it requires
1028 two passes, but it is unavoidable with top-level sharing.
1031 if (q->toplevel == TC_CBQ_MAXLEVEL &&
1032 q->link.undertime == PSCHED_PASTPERFECT)
1033 break;
1035 q->toplevel = TC_CBQ_MAXLEVEL;
1036 q->link.undertime = PSCHED_PASTPERFECT;
1039 /* No packets in scheduler or nobody wants to give them to us :-(
1040 Sigh... start watchdog timer in the last case. */
1042 if (sch->q.qlen) {
1043 sch->qstats.overlimits++;
1044 if (q->wd_expires)
1045 qdisc_watchdog_schedule(&q->watchdog,
1046 now + q->wd_expires);
1048 return NULL;
1051 /* CBQ class maintanance routines */
1053 static void cbq_adjust_levels(struct cbq_class *this)
1055 if (this == NULL)
1056 return;
1058 do {
1059 int level = 0;
1060 struct cbq_class *cl;
1062 if ((cl = this->children) != NULL) {
1063 do {
1064 if (cl->level > level)
1065 level = cl->level;
1066 } while ((cl = cl->sibling) != this->children);
1068 this->level = level+1;
1069 } while ((this = this->tparent) != NULL);
1072 static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
1074 struct cbq_class *cl;
1075 struct hlist_node *n;
1076 unsigned int h;
1078 if (q->quanta[prio] == 0)
1079 return;
1081 for (h = 0; h < q->clhash.hashsize; h++) {
1082 hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) {
1083 /* BUGGGG... Beware! This expression suffer of
1084 arithmetic overflows!
1086 if (cl->priority == prio) {
1087 cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
1088 q->quanta[prio];
1090 if (cl->quantum <= 0 || cl->quantum>32*qdisc_dev(cl->qdisc)->mtu) {
1091 printk(KERN_WARNING "CBQ: class %08x has bad quantum==%ld, repaired.\n", cl->common.classid, cl->quantum);
1092 cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
1098 static void cbq_sync_defmap(struct cbq_class *cl)
1100 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1101 struct cbq_class *split = cl->split;
1102 unsigned h;
1103 int i;
1105 if (split == NULL)
1106 return;
1108 for (i=0; i<=TC_PRIO_MAX; i++) {
1109 if (split->defaults[i] == cl && !(cl->defmap&(1<<i)))
1110 split->defaults[i] = NULL;
1113 for (i=0; i<=TC_PRIO_MAX; i++) {
1114 int level = split->level;
1116 if (split->defaults[i])
1117 continue;
1119 for (h = 0; h < q->clhash.hashsize; h++) {
1120 struct hlist_node *n;
1121 struct cbq_class *c;
1123 hlist_for_each_entry(c, n, &q->clhash.hash[h],
1124 common.hnode) {
1125 if (c->split == split && c->level < level &&
1126 c->defmap&(1<<i)) {
1127 split->defaults[i] = c;
1128 level = c->level;
1135 static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
1137 struct cbq_class *split = NULL;
1139 if (splitid == 0) {
1140 if ((split = cl->split) == NULL)
1141 return;
1142 splitid = split->common.classid;
1145 if (split == NULL || split->common.classid != splitid) {
1146 for (split = cl->tparent; split; split = split->tparent)
1147 if (split->common.classid == splitid)
1148 break;
1151 if (split == NULL)
1152 return;
1154 if (cl->split != split) {
1155 cl->defmap = 0;
1156 cbq_sync_defmap(cl);
1157 cl->split = split;
1158 cl->defmap = def&mask;
1159 } else
1160 cl->defmap = (cl->defmap&~mask)|(def&mask);
1162 cbq_sync_defmap(cl);
1165 static void cbq_unlink_class(struct cbq_class *this)
1167 struct cbq_class *cl, **clp;
1168 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1170 qdisc_class_hash_remove(&q->clhash, &this->common);
1172 if (this->tparent) {
1173 clp=&this->sibling;
1174 cl = *clp;
1175 do {
1176 if (cl == this) {
1177 *clp = cl->sibling;
1178 break;
1180 clp = &cl->sibling;
1181 } while ((cl = *clp) != this->sibling);
1183 if (this->tparent->children == this) {
1184 this->tparent->children = this->sibling;
1185 if (this->sibling == this)
1186 this->tparent->children = NULL;
1188 } else {
1189 WARN_ON(this->sibling != this);
1193 static void cbq_link_class(struct cbq_class *this)
1195 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1196 struct cbq_class *parent = this->tparent;
1198 this->sibling = this;
1199 qdisc_class_hash_insert(&q->clhash, &this->common);
1201 if (parent == NULL)
1202 return;
1204 if (parent->children == NULL) {
1205 parent->children = this;
1206 } else {
1207 this->sibling = parent->children->sibling;
1208 parent->children->sibling = this;
1212 static unsigned int cbq_drop(struct Qdisc* sch)
1214 struct cbq_sched_data *q = qdisc_priv(sch);
1215 struct cbq_class *cl, *cl_head;
1216 int prio;
1217 unsigned int len;
1219 for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) {
1220 if ((cl_head = q->active[prio]) == NULL)
1221 continue;
1223 cl = cl_head;
1224 do {
1225 if (cl->q->ops->drop && (len = cl->q->ops->drop(cl->q))) {
1226 sch->q.qlen--;
1227 if (!cl->q->q.qlen)
1228 cbq_deactivate_class(cl);
1229 return len;
1231 } while ((cl = cl->next_alive) != cl_head);
1233 return 0;
1236 static void
1237 cbq_reset(struct Qdisc* sch)
1239 struct cbq_sched_data *q = qdisc_priv(sch);
1240 struct cbq_class *cl;
1241 struct hlist_node *n;
1242 int prio;
1243 unsigned h;
1245 q->activemask = 0;
1246 q->pmask = 0;
1247 q->tx_class = NULL;
1248 q->tx_borrowed = NULL;
1249 qdisc_watchdog_cancel(&q->watchdog);
1250 hrtimer_cancel(&q->delay_timer);
1251 q->toplevel = TC_CBQ_MAXLEVEL;
1252 q->now = psched_get_time();
1253 q->now_rt = q->now;
1255 for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
1256 q->active[prio] = NULL;
1258 for (h = 0; h < q->clhash.hashsize; h++) {
1259 hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) {
1260 qdisc_reset(cl->q);
1262 cl->next_alive = NULL;
1263 cl->undertime = PSCHED_PASTPERFECT;
1264 cl->avgidle = cl->maxidle;
1265 cl->deficit = cl->quantum;
1266 cl->cpriority = cl->priority;
1269 sch->q.qlen = 0;
1273 static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
1275 if (lss->change&TCF_CBQ_LSS_FLAGS) {
1276 cl->share = (lss->flags&TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
1277 cl->borrow = (lss->flags&TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
1279 if (lss->change&TCF_CBQ_LSS_EWMA)
1280 cl->ewma_log = lss->ewma_log;
1281 if (lss->change&TCF_CBQ_LSS_AVPKT)
1282 cl->avpkt = lss->avpkt;
1283 if (lss->change&TCF_CBQ_LSS_MINIDLE)
1284 cl->minidle = -(long)lss->minidle;
1285 if (lss->change&TCF_CBQ_LSS_MAXIDLE) {
1286 cl->maxidle = lss->maxidle;
1287 cl->avgidle = lss->maxidle;
1289 if (lss->change&TCF_CBQ_LSS_OFFTIME)
1290 cl->offtime = lss->offtime;
1291 return 0;
1294 static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1296 q->nclasses[cl->priority]--;
1297 q->quanta[cl->priority] -= cl->weight;
1298 cbq_normalize_quanta(q, cl->priority);
1301 static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1303 q->nclasses[cl->priority]++;
1304 q->quanta[cl->priority] += cl->weight;
1305 cbq_normalize_quanta(q, cl->priority);
1308 static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1310 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1312 if (wrr->allot)
1313 cl->allot = wrr->allot;
1314 if (wrr->weight)
1315 cl->weight = wrr->weight;
1316 if (wrr->priority) {
1317 cl->priority = wrr->priority-1;
1318 cl->cpriority = cl->priority;
1319 if (cl->priority >= cl->priority2)
1320 cl->priority2 = TC_CBQ_MAXPRIO-1;
1323 cbq_addprio(q, cl);
1324 return 0;
/* Select the overlimit action for @cl from the TC_CBQ_OVL_* strategy in
 * a TCA_CBQ_OVL_STRATEGY attribute.  Returns 0 on success, -EINVAL for
 * an unknown strategy or a bad priority2 with the LOWPRIO strategy.
 * NOTE(review): cl->ovl_strategy is not updated here, so a later dump
 * may report the strategy set at class creation — verify intent. */
1327 static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl)
1329 switch (ovl->strategy) {
1330 case TC_CBQ_OVL_CLASSIC:
1331 cl->overlimit = cbq_ovl_classic;
1332 break;
1333 case TC_CBQ_OVL_DELAY:
1334 cl->overlimit = cbq_ovl_delay;
1335 break;
1336 case TC_CBQ_OVL_LOWPRIO:
/* priority2 is 1-based on the wire; the demoted priority must stay
 * in range and be strictly lower (numerically higher) than priority */
1337 if (ovl->priority2-1 >= TC_CBQ_MAXPRIO ||
1338 ovl->priority2-1 <= cl->priority)
1339 return -EINVAL;
1340 cl->priority2 = ovl->priority2-1;
1341 cl->overlimit = cbq_ovl_lowprio;
1342 break;
1343 case TC_CBQ_OVL_DROP:
1344 cl->overlimit = cbq_ovl_drop;
1345 break;
1346 case TC_CBQ_OVL_RCLASSIC:
1347 cl->overlimit = cbq_ovl_rclassic;
1348 break;
1349 default:
1350 return -EINVAL;
1352 cl->penalty = ovl->penalty;
1353 return 0;
1356 #ifdef CONFIG_NET_CLS_ACT
/* Apply a TCA_CBQ_POLICE attribute: record the policing mode and, if
 * the class already has a real child qdisc, (un)install the
 * reshape_fail hook used by TC_POLICE_RECLASSIFY.  Always returns 0. */
1357 static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p)
1359 cl->police = p->police;
/* handle == 0 means the placeholder noop_qdisc — nothing to hook */
1361 if (cl->q->handle) {
1362 if (p->police == TC_POLICE_RECLASSIFY)
1363 cl->q->reshape_fail = cbq_reshape_fail;
1364 else
1365 cl->q->reshape_fail = NULL;
1367 return 0;
1369 #endif
/* Apply a TCA_CBQ_FOPT attribute: update the class' split node and
 * defmap via cbq_change_defmap().  Always returns 0. */
1371 static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1373 cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1374 return 0;
/* Netlink validation policy for TCA_CBQ_* attributes: fixed-size
 * binary structs, except the rate table which is bounded binary data.
 * nla_parse_nested() rejects attributes shorter than the given len. */
1377 static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
1378 [TCA_CBQ_LSSOPT] = { .len = sizeof(struct tc_cbq_lssopt) },
1379 [TCA_CBQ_WRROPT] = { .len = sizeof(struct tc_cbq_wrropt) },
1380 [TCA_CBQ_FOPT] = { .len = sizeof(struct tc_cbq_fopt) },
1381 [TCA_CBQ_OVL_STRATEGY] = { .len = sizeof(struct tc_cbq_ovl) },
1382 [TCA_CBQ_RATE] = { .len = sizeof(struct tc_ratespec) },
1383 [TCA_CBQ_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1384 [TCA_CBQ_POLICE] = { .len = sizeof(struct tc_cbq_police) },
1387 static int cbq_init(struct Qdisc *sch, struct nlattr *opt)
1389 struct cbq_sched_data *q = qdisc_priv(sch);
1390 struct nlattr *tb[TCA_CBQ_MAX + 1];
1391 struct tc_ratespec *r;
1392 int err;
1394 err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy);
1395 if (err < 0)
1396 return err;
1398 if (tb[TCA_CBQ_RTAB] == NULL || tb[TCA_CBQ_RATE] == NULL)
1399 return -EINVAL;
1401 r = nla_data(tb[TCA_CBQ_RATE]);
1403 if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB])) == NULL)
1404 return -EINVAL;
1406 err = qdisc_class_hash_init(&q->clhash);
1407 if (err < 0)
1408 goto put_rtab;
1410 q->link.refcnt = 1;
1411 q->link.sibling = &q->link;
1412 q->link.common.classid = sch->handle;
1413 q->link.qdisc = sch;
1414 if (!(q->link.q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
1415 &pfifo_qdisc_ops,
1416 sch->handle)))
1417 q->link.q = &noop_qdisc;
1419 q->link.priority = TC_CBQ_MAXPRIO-1;
1420 q->link.priority2 = TC_CBQ_MAXPRIO-1;
1421 q->link.cpriority = TC_CBQ_MAXPRIO-1;
1422 q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC;
1423 q->link.overlimit = cbq_ovl_classic;
1424 q->link.allot = psched_mtu(qdisc_dev(sch));
1425 q->link.quantum = q->link.allot;
1426 q->link.weight = q->link.R_tab->rate.rate;
1428 q->link.ewma_log = TC_CBQ_DEF_EWMA;
1429 q->link.avpkt = q->link.allot/2;
1430 q->link.minidle = -0x7FFFFFFF;
1432 qdisc_watchdog_init(&q->watchdog, sch);
1433 hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
1434 q->delay_timer.function = cbq_undelay;
1435 q->toplevel = TC_CBQ_MAXLEVEL;
1436 q->now = psched_get_time();
1437 q->now_rt = q->now;
1439 cbq_link_class(&q->link);
1441 if (tb[TCA_CBQ_LSSOPT])
1442 cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));
1444 cbq_addprio(q, &q->link);
1445 return 0;
1447 put_rtab:
1448 qdisc_put_rtab(q->link.R_tab);
1449 return err;
/* Emit the class' rate spec as a TCA_CBQ_RATE attribute.  Returns the
 * new skb length, or -1 with the partial attribute trimmed back to @b
 * if the skb ran out of tail room (NLA_PUT jumps to nla_put_failure). */
1452 static __inline__ int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1454 unsigned char *b = skb_tail_pointer(skb);
1456 NLA_PUT(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate);
1457 return skb->len;
1459 nla_put_failure:
1460 nlmsg_trim(skb, b);
1461 return -1;
/* Emit the class' link-sharing parameters as a TCA_CBQ_LSSOPT
 * attribute.  Returns the new skb length, or -1 with the partial
 * attribute trimmed on overflow. */
1464 static __inline__ int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1466 unsigned char *b = skb_tail_pointer(skb);
1467 struct tc_cbq_lssopt opt;
1469 opt.flags = 0;
/* NULL borrow/share pointers encode the BOUNDED/ISOLATED flags */
1470 if (cl->borrow == NULL)
1471 opt.flags |= TCF_CBQ_LSS_BOUNDED;
1472 if (cl->share == NULL)
1473 opt.flags |= TCF_CBQ_LSS_ISOLATED;
1474 opt.ewma_log = cl->ewma_log;
1475 opt.level = cl->level;
1476 opt.avpkt = cl->avpkt;
1477 opt.maxidle = cl->maxidle;
/* minidle is stored negated internally; dump it back as positive */
1478 opt.minidle = (u32)(-cl->minidle);
1479 opt.offtime = cl->offtime;
1480 opt.change = ~0;
1481 NLA_PUT(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt);
1482 return skb->len;
1484 nla_put_failure:
1485 nlmsg_trim(skb, b);
1486 return -1;
/* Emit the class' WRR parameters as a TCA_CBQ_WRROPT attribute.
 * Priorities are 0-based internally and 1-based on the wire, hence
 * the +1.  Returns skb length, or -1 with the attribute trimmed. */
1489 static __inline__ int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1491 unsigned char *b = skb_tail_pointer(skb);
1492 struct tc_cbq_wrropt opt;
1494 opt.flags = 0;
1495 opt.allot = cl->allot;
1496 opt.priority = cl->priority+1;
1497 opt.cpriority = cl->cpriority+1;
1498 opt.weight = cl->weight;
1499 NLA_PUT(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt);
1500 return skb->len;
1502 nla_put_failure:
1503 nlmsg_trim(skb, b);
1504 return -1;
/* Emit the class' overlimit strategy as a TCA_CBQ_OVL_STRATEGY
 * attribute (priority2 converted back to 1-based).  Returns skb
 * length, or -1 with the attribute trimmed on overflow. */
1507 static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl)
1509 unsigned char *b = skb_tail_pointer(skb);
1510 struct tc_cbq_ovl opt;
1512 opt.strategy = cl->ovl_strategy;
1513 opt.priority2 = cl->priority2+1;
1514 opt.pad = 0;
1515 opt.penalty = cl->penalty;
1516 NLA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt);
1517 return skb->len;
1519 nla_put_failure:
1520 nlmsg_trim(skb, b);
1521 return -1;
/* Emit the class' filter options (split node / defmap) as a
 * TCA_CBQ_FOPT attribute — only when the class actually has a split
 * or defmap configured.  Returns skb length, or -1 on overflow. */
1524 static __inline__ int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1526 unsigned char *b = skb_tail_pointer(skb);
1527 struct tc_cbq_fopt opt;
1529 if (cl->split || cl->defmap) {
1530 opt.split = cl->split ? cl->split->common.classid : 0;
1531 opt.defmap = cl->defmap;
1532 opt.defchange = ~0;
1533 NLA_PUT(skb, TCA_CBQ_FOPT, sizeof(opt), &opt);
1535 return skb->len;
1537 nla_put_failure:
1538 nlmsg_trim(skb, b);
1539 return -1;
1542 #ifdef CONFIG_NET_CLS_ACT
/* Emit the class' policing mode as a TCA_CBQ_POLICE attribute — only
 * when policing is configured.  Returns skb length, or -1 with the
 * attribute trimmed on overflow. */
1543 static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)
1545 unsigned char *b = skb_tail_pointer(skb);
1546 struct tc_cbq_police opt;
1548 if (cl->police) {
1549 opt.police = cl->police;
1550 opt.__res1 = 0;
1551 opt.__res2 = 0;
1552 NLA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt);
1554 return skb->len;
1556 nla_put_failure:
1557 nlmsg_trim(skb, b);
1558 return -1;
1560 #endif
/* Dump every per-class attribute group into @skb; shared by the qdisc
 * dump (for the root link class) and the class dump.  Returns 0 on
 * success, -1 as soon as any helper runs out of skb room. */
1562 static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
1564 if (cbq_dump_lss(skb, cl) < 0 ||
1565 cbq_dump_rate(skb, cl) < 0 ||
1566 cbq_dump_wrr(skb, cl) < 0 ||
1567 cbq_dump_ovl(skb, cl) < 0 ||
1568 #ifdef CONFIG_NET_CLS_ACT
1569 cbq_dump_police(skb, cl) < 0 ||
1570 #endif
1571 cbq_dump_fopt(skb, cl) < 0)
1572 return -1;
1573 return 0;
/* Qdisc->dump: emit the root link class' parameters inside a nested
 * TCA_OPTIONS attribute.  Returns skb length on success, -1 with the
 * nest cancelled on overflow. */
1576 static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
1578 struct cbq_sched_data *q = qdisc_priv(sch);
1579 struct nlattr *nest;
1581 nest = nla_nest_start(skb, TCA_OPTIONS);
1582 if (nest == NULL)
1583 goto nla_put_failure;
1584 if (cbq_dump_attr(skb, &q->link) < 0)
1585 goto nla_put_failure;
1586 nla_nest_end(skb, nest);
1587 return skb->len;
1589 nla_put_failure:
1590 nla_nest_cancel(skb, nest);
1591 return -1;
/* Qdisc->dump_stats: export the root link class' CBQ-specific xstats
 * (refreshing the avgidle snapshot first) as app-specific stats. */
1594 static int
1595 cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1597 struct cbq_sched_data *q = qdisc_priv(sch);
1599 q->link.xstats.avgidle = q->link.avgidle;
1600 return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
/* Qdisc_class_ops->dump: fill the tcmsg header (parent/handle/leaf
 * qdisc) for class @arg and emit its attributes in a nested
 * TCA_OPTIONS.  Returns skb length, or -1 with the nest cancelled. */
1603 static int
1604 cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1605 struct sk_buff *skb, struct tcmsg *tcm)
1607 struct cbq_class *cl = (struct cbq_class*)arg;
1608 struct nlattr *nest;
1610 if (cl->tparent)
1611 tcm->tcm_parent = cl->tparent->common.classid;
1612 else
1613 tcm->tcm_parent = TC_H_ROOT;
1614 tcm->tcm_handle = cl->common.classid;
1615 tcm->tcm_info = cl->q->handle;
1617 nest = nla_nest_start(skb, TCA_OPTIONS);
1618 if (nest == NULL)
1619 goto nla_put_failure;
1620 if (cbq_dump_attr(skb, cl) < 0)
1621 goto nla_put_failure;
1622 nla_nest_end(skb, nest);
1623 return skb->len;
1625 nla_put_failure:
1626 nla_nest_cancel(skb, nest);
1627 return -1;
/* Qdisc_class_ops->dump_stats: refresh the class' queue length and
 * CBQ xstats snapshot, then copy the basic/rate-estimator/queue stats
 * plus the app-specific xstats.  Returns -1 if any copy fails. */
1630 static int
1631 cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1632 struct gnet_dump *d)
1634 struct cbq_sched_data *q = qdisc_priv(sch);
1635 struct cbq_class *cl = (struct cbq_class*)arg;
1637 cl->qstats.qlen = cl->q->q.qlen;
1638 cl->xstats.avgidle = cl->avgidle;
1639 cl->xstats.undertime = 0;
/* report how far into the future the class becomes eligible again */
1641 if (cl->undertime != PSCHED_PASTPERFECT)
1642 cl->xstats.undertime = cl->undertime - q->now;
1644 if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
1645 gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1646 gnet_stats_copy_queue(d, &cl->qstats) < 0)
1647 return -1;
1649 return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
/* Qdisc_class_ops->graft: replace class @arg's child qdisc with @new
 * (creating a default pfifo when @new is NULL), returning the old one
 * through @old.  The swap happens under the qdisc tree lock; the old
 * qdisc's backlog is propagated up and it is reset before release.
 * Returns 0 on success, -ENOBUFS if the default qdisc allocation
 * fails, -ENOENT if @arg is not a class. */
1652 static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1653 struct Qdisc **old)
1655 struct cbq_class *cl = (struct cbq_class*)arg;
1657 if (cl) {
1658 if (new == NULL) {
1659 new = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
1660 &pfifo_qdisc_ops,
1661 cl->common.classid);
1662 if (new == NULL)
1663 return -ENOBUFS;
1664 } else {
1665 #ifdef CONFIG_NET_CLS_ACT
/* keep the reclassify hook consistent on an externally supplied qdisc */
1666 if (cl->police == TC_POLICE_RECLASSIFY)
1667 new->reshape_fail = cbq_reshape_fail;
1668 #endif
1670 sch_tree_lock(sch);
1671 *old = xchg(&cl->q, new);
1672 qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
1673 qdisc_reset(*old);
1674 sch_tree_unlock(sch);
1676 return 0;
1678 return -ENOENT;
1681 static struct Qdisc *
1682 cbq_leaf(struct Qdisc *sch, unsigned long arg)
1684 struct cbq_class *cl = (struct cbq_class*)arg;
1686 return cl ? cl->q : NULL;
1689 static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
1691 struct cbq_class *cl = (struct cbq_class *)arg;
1693 if (cl->q->q.qlen == 0)
1694 cbq_deactivate_class(cl);
1697 static unsigned long cbq_get(struct Qdisc *sch, u32 classid)
1699 struct cbq_sched_data *q = qdisc_priv(sch);
1700 struct cbq_class *cl = cbq_class_lookup(q, classid);
1702 if (cl) {
1703 cl->refcnt++;
1704 return (unsigned long)cl;
1706 return 0;
/* Free a class and everything it owns: filter chain, child qdisc,
 * rate table, and rate estimator.  The embedded root link class is
 * not kfree'd (it lives inside the qdisc's private area).  Callers
 * must have dropped all filter references (WARN otherwise). */
1709 static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1711 struct cbq_sched_data *q = qdisc_priv(sch);
1713 WARN_ON(cl->filters);
1715 tcf_destroy_chain(&cl->filter_list);
1716 qdisc_destroy(cl->q);
1717 qdisc_put_rtab(cl->R_tab);
1718 gen_kill_estimator(&cl->bstats, &cl->rate_est);
1719 if (cl != &q->link)
1720 kfree(cl);
/* Qdisc->destroy: tear down the whole class hierarchy and the class
 * hash.  Runs in two passes over the hash (see the TGR comment). */
1723 static void
1724 cbq_destroy(struct Qdisc* sch)
1726 struct cbq_sched_data *q = qdisc_priv(sch);
1727 struct hlist_node *n, *next;
1728 struct cbq_class *cl;
1729 unsigned h;
1731 #ifdef CONFIG_NET_CLS_ACT
1732 q->rx_class = NULL;
1733 #endif
/*
1735 * Filters must be destroyed first because we don't destroy the
1736 * classes from root to leafs which means that filters can still
1737 * be bound to classes which have been destroyed already. --TGR '04
 */
1739 for (h = 0; h < q->clhash.hashsize; h++) {
1740 hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode)
1741 tcf_destroy_chain(&cl->filter_list);
/* second pass: _safe iteration because cbq_destroy_class frees cl */
1743 for (h = 0; h < q->clhash.hashsize; h++) {
1744 hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[h],
1745 common.hnode)
1746 cbq_destroy_class(sch, cl);
1748 qdisc_class_hash_destroy(&q->clhash);
/* Qdisc_class_ops->put: drop a reference taken by cbq_get()/bind and
 * destroy the class once the last reference is gone.  Before freeing,
 * clear q->rx_class under the root lock so the enqueue path cannot
 * keep a dangling pointer to this class. */
1751 static void cbq_put(struct Qdisc *sch, unsigned long arg)
1753 struct cbq_class *cl = (struct cbq_class*)arg;
1755 if (--cl->refcnt == 0) {
1756 #ifdef CONFIG_NET_CLS_ACT
1757 spinlock_t *root_lock = qdisc_root_sleeping_lock(sch);
1758 struct cbq_sched_data *q = qdisc_priv(sch);
1760 spin_lock_bh(root_lock);
1761 if (q->rx_class == cl)
1762 q->rx_class = NULL;
1763 spin_unlock_bh(root_lock);
1764 #endif
1766 cbq_destroy_class(sch, cl);
/* Qdisc_class_ops->change: create a new class or modify an existing
 * one (@*arg non-zero) from the TCA_CBQ_* attributes in TCA_OPTIONS.
 * On success *arg holds the (possibly new) class.  All tree mutations
 * happen under sch_tree_lock(). */
1770 static int
1771 cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
1772 unsigned long *arg)
1774 int err;
1775 struct cbq_sched_data *q = qdisc_priv(sch);
1776 struct cbq_class *cl = (struct cbq_class*)*arg;
1777 struct nlattr *opt = tca[TCA_OPTIONS];
1778 struct nlattr *tb[TCA_CBQ_MAX + 1];
1779 struct cbq_class *parent;
1780 struct qdisc_rate_table *rtab = NULL;
1782 if (opt == NULL)
1783 return -EINVAL;
1785 err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy);
1786 if (err < 0)
1787 return err;
/* ---- existing class: change its parameters in place ---- */
1789 if (cl) {
1790 /* Check parent */
1791 if (parentid) {
1792 if (cl->tparent &&
1793 cl->tparent->common.classid != parentid)
1794 return -EINVAL;
1795 if (!cl->tparent && parentid != TC_H_ROOT)
1796 return -EINVAL;
1799 if (tb[TCA_CBQ_RATE]) {
1800 rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB]);
1801 if (rtab == NULL)
1802 return -EINVAL;
1805 /* Change class parameters */
1806 sch_tree_lock(sch);
1808 if (cl->next_alive != NULL)
1809 cbq_deactivate_class(cl);
1811 if (rtab) {
1812 rtab = xchg(&cl->R_tab, rtab);
1813 qdisc_put_rtab(rtab);
1816 if (tb[TCA_CBQ_LSSOPT])
1817 cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
/* WRR change requires re-registering the class in its priority band */
1819 if (tb[TCA_CBQ_WRROPT]) {
1820 cbq_rmprio(q, cl);
1821 cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1824 if (tb[TCA_CBQ_OVL_STRATEGY])
1825 cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY]));
1827 #ifdef CONFIG_NET_CLS_ACT
1828 if (tb[TCA_CBQ_POLICE])
1829 cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE]));
1830 #endif
1832 if (tb[TCA_CBQ_FOPT])
1833 cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1835 if (cl->q->q.qlen)
1836 cbq_activate_class(cl);
1838 sch_tree_unlock(sch);
1840 if (tca[TCA_RATE])
1841 gen_replace_estimator(&cl->bstats, &cl->rate_est,
1842 qdisc_root_sleeping_lock(sch),
1843 tca[TCA_RATE]);
1844 return 0;
/* ---- new class: validate, pick a classid, allocate and link ---- */
1847 if (parentid == TC_H_ROOT)
1848 return -EINVAL;
1850 if (tb[TCA_CBQ_WRROPT] == NULL || tb[TCA_CBQ_RATE] == NULL ||
1851 tb[TCA_CBQ_LSSOPT] == NULL)
1852 return -EINVAL;
1854 rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB]);
1855 if (rtab == NULL)
1856 return -EINVAL;
1858 if (classid) {
1859 err = -EINVAL;
1860 if (TC_H_MAJ(classid^sch->handle) || cbq_class_lookup(q, classid))
1861 goto failure;
1862 } else {
/* no classid given: auto-allocate a minor in the 0x8000+ range */
1863 int i;
1864 classid = TC_H_MAKE(sch->handle,0x8000);
1866 for (i=0; i<0x8000; i++) {
1867 if (++q->hgenerator >= 0x8000)
1868 q->hgenerator = 1;
1869 if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1870 break;
1872 err = -ENOSR;
1873 if (i >= 0x8000)
1874 goto failure;
1875 classid = classid|q->hgenerator;
1878 parent = &q->link;
1879 if (parentid) {
1880 parent = cbq_class_lookup(q, parentid);
1881 err = -EINVAL;
1882 if (parent == NULL)
1883 goto failure;
1886 err = -ENOBUFS;
1887 cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1888 if (cl == NULL)
1889 goto failure;
/* ownership of rtab moves into the class; NULL it so the failure
 * path does not double-put it */
1890 cl->R_tab = rtab;
1891 rtab = NULL;
1892 cl->refcnt = 1;
1893 if (!(cl->q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
1894 &pfifo_qdisc_ops, classid)))
1895 cl->q = &noop_qdisc;
1896 cl->common.classid = classid;
1897 cl->tparent = parent;
1898 cl->qdisc = sch;
1899 cl->allot = parent->allot;
1900 cl->quantum = cl->allot;
1901 cl->weight = cl->R_tab->rate.rate;
1903 sch_tree_lock(sch);
1904 cbq_link_class(cl);
1905 cl->borrow = cl->tparent;
1906 if (cl->tparent != &q->link)
1907 cl->share = cl->tparent;
1908 cbq_adjust_levels(parent);
1909 cl->minidle = -0x7FFFFFFF;
1910 cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1911 cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
/* unset parameters are inherited from the root link class */
1912 if (cl->ewma_log==0)
1913 cl->ewma_log = q->link.ewma_log;
1914 if (cl->maxidle==0)
1915 cl->maxidle = q->link.maxidle;
1916 if (cl->avpkt==0)
1917 cl->avpkt = q->link.avpkt;
1918 cl->overlimit = cbq_ovl_classic;
1919 if (tb[TCA_CBQ_OVL_STRATEGY])
1920 cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY]));
1921 #ifdef CONFIG_NET_CLS_ACT
1922 if (tb[TCA_CBQ_POLICE])
1923 cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE]));
1924 #endif
1925 if (tb[TCA_CBQ_FOPT])
1926 cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1927 sch_tree_unlock(sch);
1929 qdisc_class_hash_grow(sch, &q->clhash);
1931 if (tca[TCA_RATE])
1932 gen_new_estimator(&cl->bstats, &cl->rate_est,
1933 qdisc_root_sleeping_lock(sch), tca[TCA_RATE]);
1935 *arg = (unsigned long)cl;
1936 return 0;
1938 failure:
1939 qdisc_put_rtab(rtab);
1940 return err;
/* Qdisc_class_ops->delete: remove class @arg from the hierarchy.
 * Refuses (-EBUSY) while filters are bound, children exist, or the
 * target is the built-in link class.  Under the tree lock the class'
 * queue is flushed, every scheduler pointer to it is cleared, and it
 * is unlinked; the actual free happens when the refcount drops. */
1943 static int cbq_delete(struct Qdisc *sch, unsigned long arg)
1945 struct cbq_sched_data *q = qdisc_priv(sch);
1946 struct cbq_class *cl = (struct cbq_class*)arg;
1947 unsigned int qlen;
1949 if (cl->filters || cl->children || cl == &q->link)
1950 return -EBUSY;
1952 sch_tree_lock(sch);
1954 qlen = cl->q->q.qlen;
1955 qdisc_reset(cl->q);
1956 qdisc_tree_decrease_qlen(cl->q, qlen);
1958 if (cl->next_alive)
1959 cbq_deactivate_class(cl);
/* make sure the transmit state no longer references this class */
1961 if (q->tx_borrowed == cl)
1962 q->tx_borrowed = q->tx_class;
1963 if (q->tx_class == cl) {
1964 q->tx_class = NULL;
1965 q->tx_borrowed = NULL;
1967 #ifdef CONFIG_NET_CLS_ACT
1968 if (q->rx_class == cl)
1969 q->rx_class = NULL;
1970 #endif
1972 cbq_unlink_class(cl);
1973 cbq_adjust_levels(cl->tparent);
1974 cl->defmap = 0;
1975 cbq_sync_defmap(cl);
1977 cbq_rmprio(q, cl);
1978 sch_tree_unlock(sch);
1980 if (--cl->refcnt == 0)
1981 cbq_destroy_class(sch, cl);
1983 return 0;
1986 static struct tcf_proto **cbq_find_tcf(struct Qdisc *sch, unsigned long arg)
1988 struct cbq_sched_data *q = qdisc_priv(sch);
1989 struct cbq_class *cl = (struct cbq_class *)arg;
1991 if (cl == NULL)
1992 cl = &q->link;
1994 return &cl->filter_list;
1997 static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
1998 u32 classid)
2000 struct cbq_sched_data *q = qdisc_priv(sch);
2001 struct cbq_class *p = (struct cbq_class*)parent;
2002 struct cbq_class *cl = cbq_class_lookup(q, classid);
2004 if (cl) {
2005 if (p && p->level <= cl->level)
2006 return 0;
2007 cl->filters++;
2008 return (unsigned long)cl;
2010 return 0;
2013 static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
2015 struct cbq_class *cl = (struct cbq_class*)arg;
2017 cl->filters--;
/* Qdisc_class_ops->walk: invoke arg->fn on every class, honoring the
 * skip/count cursor and stopping early when fn returns < 0. */
2020 static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
2022 struct cbq_sched_data *q = qdisc_priv(sch);
2023 struct cbq_class *cl;
2024 struct hlist_node *n;
2025 unsigned h;
2027 if (arg->stop)
2028 return;
2030 for (h = 0; h < q->clhash.hashsize; h++) {
2031 hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) {
2032 if (arg->count < arg->skip) {
2033 arg->count++;
2034 continue;
2036 if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
2037 arg->stop = 1;
2038 return;
2040 arg->count++;
/* Class-level operations table wired into cbq_qdisc_ops below. */
2045 static const struct Qdisc_class_ops cbq_class_ops = {
2046 .graft = cbq_graft,
2047 .leaf = cbq_leaf,
2048 .qlen_notify = cbq_qlen_notify,
2049 .get = cbq_get,
2050 .put = cbq_put,
2051 .change = cbq_change_class,
2052 .delete = cbq_delete,
2053 .walk = cbq_walk,
2054 .tcf_chain = cbq_find_tcf,
2055 .bind_tcf = cbq_bind_filter,
2056 .unbind_tcf = cbq_unbind_filter,
2057 .dump = cbq_dump_class,
2058 .dump_stats = cbq_dump_class_stats,
/* Qdisc operations registered under the "cbq" id; the datapath
 * handlers (enqueue/dequeue/requeue/drop) are defined earlier in
 * this file. */
2061 static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
2062 .next = NULL,
2063 .cl_ops = &cbq_class_ops,
2064 .id = "cbq",
2065 .priv_size = sizeof(struct cbq_sched_data),
2066 .enqueue = cbq_enqueue,
2067 .dequeue = cbq_dequeue,
2068 .requeue = cbq_requeue,
2069 .drop = cbq_drop,
2070 .init = cbq_init,
2071 .reset = cbq_reset,
2072 .destroy = cbq_destroy,
2073 .change = NULL,
2074 .dump = cbq_dump,
2075 .dump_stats = cbq_dump_stats,
2076 .owner = THIS_MODULE,
/* Module entry point: register the "cbq" qdisc with the tc core. */
2079 static int __init cbq_module_init(void)
2081 return register_qdisc(&cbq_qdisc_ops);
/* Module exit point: unregister the "cbq" qdisc. */
2083 static void __exit cbq_module_exit(void)
2085 unregister_qdisc(&cbq_qdisc_ops);
/* Standard module boilerplate: entry/exit hooks and GPL license tag. */
2087 module_init(cbq_module_init)
2088 module_exit(cbq_module_exit)
2089 MODULE_LICENSE("GPL");