1 /*
2 * net/sched/sch_cbq.c Class-Based Queueing discipline.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 #include <linux/module.h>
14 #include <linux/types.h>
15 #include <linux/kernel.h>
16 #include <linux/string.h>
17 #include <linux/errno.h>
18 #include <linux/skbuff.h>
19 #include <net/netlink.h>
20 #include <net/pkt_sched.h>
23 /* Class-Based Queueing (CBQ) algorithm.
24 =======================================
26 Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
27 Management Models for Packet Networks",
28 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
30 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
32 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
33 Parameters", 1996
35 [4] Sally Floyd and Michael Speer, "Experimental Results
36 for Class-Based Queueing", 1998, not published.
38 -----------------------------------------------------------------------
40 Algorithm skeleton was taken from NS simulator cbq.cc.
41 If someone wants to check this code against the LBL version,
42 he should take into account that ONLY the skeleton was borrowed,
43 the implementation is different. Particularly:
45 --- The WRR algorithm is different. Our version looks more
46 reasonable (I hope) and works when quanta are allowed to be
47 less than MTU, which is always the case when real-time classes
48 have small rates. Note that the statement of [3] is
49 incomplete; delay may actually be estimated even if class
50 per-round allotment is less than MTU. Namely, if per-round
51 allotment is W*r_i, and r_1+...+r_k = r < 1
53 delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
55 In the worst case we have IntServ estimate with D = W*r+k*MTU
56 and C = MTU*r. The proof (if correct at all) is trivial.
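	(Editorial illustration with hypothetical numbers, not from the original
	text: take MTU = 1500 bytes, per-round allotment W*r_i = 500 bytes,
	W*r = 2000 bytes summed over k = 4 real-time classes, and
	B = 1.25 Mbyte/s; then
	delay_i <= (ceil(1500/500)*2000 + 2000 + 4*1500)/B = 14000 bytes / B ~= 11 ms.)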
59 --- It seems that cbq-2.0 is not very accurate. At least, I cannot
60 interpret some places, which look like wrong translations
61 from NS. Anyone is advised to find these differences
62 and explain to me why I am wrong 8).
64 --- Linux has no EOI event, so we cannot estimate true class
65 idle time. The workaround is to consider the next dequeue event
66 as a sign that the previous packet has finished. This is wrong because of
67 internal device queueing, but on a permanently loaded link it is true.
68 Moreover, combined with the clock integrator, this scheme looks
69 very close to an ideal solution. */
71 struct cbq_sched_data;
74 struct cbq_class
76 struct Qdisc_class_common common;
77 struct cbq_class *next_alive; /* next class with backlog in this priority band */
79 /* Parameters */
80 unsigned char priority; /* class priority */
81 unsigned char priority2; /* priority to be used after overlimit */
82 unsigned char ewma_log; /* time constant for idle time calculation */
83 unsigned char ovl_strategy;
84 #ifdef CONFIG_NET_CLS_ACT
85 unsigned char police;
86 #endif
88 u32 defmap;
90 /* Link-sharing scheduler parameters */
91 long maxidle; /* Class parameters: see below. */
92 long offtime;
93 long minidle;
94 u32 avpkt;
95 struct qdisc_rate_table *R_tab;
97 /* Overlimit strategy parameters */
98 void (*overlimit)(struct cbq_class *cl);
99 psched_tdiff_t penalty;
101 /* General scheduler (WRR) parameters */
102 long allot;
103 long quantum; /* Allotment per WRR round */
104 long weight; /* Relative allotment: see below */
106 struct Qdisc *qdisc; /* Ptr to CBQ discipline */
107 struct cbq_class *split; /* Ptr to split node */
108 struct cbq_class *share; /* Ptr to LS parent in the class tree */
109 struct cbq_class *tparent; /* Ptr to tree parent in the class tree */
110 struct cbq_class *borrow; /* NULL if class is bandwidth limited;
111 parent otherwise */
112 struct cbq_class *sibling; /* Sibling chain */
113 struct cbq_class *children; /* Pointer to children chain */
115 struct Qdisc *q; /* Elementary queueing discipline */
118 /* Variables */
119 unsigned char cpriority; /* Effective priority */
120 unsigned char delayed;
121 unsigned char level; /* level of the class in hierarchy:
122 0 for leaf classes, and maximal
123 level of children + 1 for nodes.
126 psched_time_t last; /* Last end of service */
127 psched_time_t undertime;
128 long avgidle;
129 long deficit; /* Saved deficit for WRR */
130 psched_time_t penalized;
131 struct gnet_stats_basic bstats;
132 struct gnet_stats_queue qstats;
133 struct gnet_stats_rate_est rate_est;
134 struct tc_cbq_xstats xstats;
136 struct tcf_proto *filter_list;
138 int refcnt;
139 int filters;
141 struct cbq_class *defaults[TC_PRIO_MAX+1];
144 struct cbq_sched_data
146 struct Qdisc_class_hash clhash; /* Hash table of all classes */
147 int nclasses[TC_CBQ_MAXPRIO+1];
148 unsigned quanta[TC_CBQ_MAXPRIO+1];
150 struct cbq_class link;
152 unsigned activemask;
153 struct cbq_class *active[TC_CBQ_MAXPRIO+1]; /* List of all classes
154 with backlog */
156 #ifdef CONFIG_NET_CLS_ACT
157 struct cbq_class *rx_class;
158 #endif
159 struct cbq_class *tx_class;
160 struct cbq_class *tx_borrowed;
161 int tx_len;
162 psched_time_t now; /* Cached timestamp */
163 psched_time_t now_rt; /* Cached real time */
164 unsigned pmask;
166 struct hrtimer delay_timer;
167 struct qdisc_watchdog watchdog; /* Watchdog timer,
168 started when CBQ has
169 backlog, but cannot
170 transmit just now */
171 psched_tdiff_t wd_expires;
172 int toplevel;
173 u32 hgenerator;
177 #define L2T(cl,len) qdisc_l2t((cl)->R_tab,len)
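/* Editorial note: L2T ("length to time") looks up, in the class rate table
 * R_tab, the time (in psched ticks) needed to transmit a packet of the given
 * length at the class's configured rate. */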
179 static __inline__ struct cbq_class *
180 cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
182 struct Qdisc_class_common *clc;
184 clc = qdisc_class_find(&q->clhash, classid);
185 if (clc == NULL)
186 return NULL;
187 return container_of(clc, struct cbq_class, common);
190 #ifdef CONFIG_NET_CLS_ACT
192 static struct cbq_class *
193 cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
195 struct cbq_class *cl, *new;
197 for (cl = this->tparent; cl; cl = cl->tparent)
198 if ((new = cl->defaults[TC_PRIO_BESTEFFORT]) != NULL && new != this)
199 return new;
201 return NULL;
204 #endif
206 /* Classify packet. The procedure is pretty complicated, but
207 it allows us to combine link sharing and priority scheduling
208 transparently.
210 Namely, you can put link-sharing rules (e.g. route-based) at the root of CBQ,
211 so that it resolves to split nodes. Then packets are classified
212 by logical priority, or a more specific classifier may be attached
213 to the split node.
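   (Editorial example with hypothetical handles: if this qdisc is 1: and a
   packet arrives with skb->priority equal to classid 1:10, step 1 below
   returns class 1:10 directly.  Otherwise the filters of the current head
   class are consulted; if they give no class, the head's defaults[] entry
   for the packet's TC_PRIO_* band is used, falling back to
   TC_PRIO_BESTEFFORT.)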
216 static struct cbq_class *
217 cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
219 struct cbq_sched_data *q = qdisc_priv(sch);
220 struct cbq_class *head = &q->link;
221 struct cbq_class **defmap;
222 struct cbq_class *cl = NULL;
223 u32 prio = skb->priority;
224 struct tcf_result res;
227 * Step 1. If skb->priority points to one of our classes, use it.
229 if (TC_H_MAJ(prio^sch->handle) == 0 &&
230 (cl = cbq_class_lookup(q, prio)) != NULL)
231 return cl;
233 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
234 for (;;) {
235 int result = 0;
236 defmap = head->defaults;
239 * Step 2+n. Apply classifier.
241 if (!head->filter_list ||
242 (result = tc_classify_compat(skb, head->filter_list, &res)) < 0)
243 goto fallback;
245 if ((cl = (void*)res.class) == NULL) {
246 if (TC_H_MAJ(res.classid))
247 cl = cbq_class_lookup(q, res.classid);
248 else if ((cl = defmap[res.classid&TC_PRIO_MAX]) == NULL)
249 cl = defmap[TC_PRIO_BESTEFFORT];
251 if (cl == NULL || cl->level >= head->level)
252 goto fallback;
255 #ifdef CONFIG_NET_CLS_ACT
256 switch (result) {
257 case TC_ACT_QUEUED:
258 case TC_ACT_STOLEN:
259 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
260 case TC_ACT_SHOT:
261 return NULL;
262 case TC_ACT_RECLASSIFY:
263 return cbq_reclassify(skb, cl);
265 #endif
266 if (cl->level == 0)
267 return cl;
270 * Step 3+n. If classifier selected a link sharing class,
271 * apply agency-specific classifier.
272 * Repeat this procedure until we hit a leaf node.
274 head = cl;
277 fallback:
278 cl = head;
281 * Step 4. No success...
283 if (TC_H_MAJ(prio) == 0 &&
284 !(cl = head->defaults[prio&TC_PRIO_MAX]) &&
285 !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
286 return head;
288 return cl;
292 A packet has just been enqueued on the empty class.
293 cbq_activate_class adds it to the tail of active class list
294 of its priority band.
297 static __inline__ void cbq_activate_class(struct cbq_class *cl)
299 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
300 int prio = cl->cpriority;
301 struct cbq_class *cl_tail;
303 cl_tail = q->active[prio];
304 q->active[prio] = cl;
306 if (cl_tail != NULL) {
307 cl->next_alive = cl_tail->next_alive;
308 cl_tail->next_alive = cl;
309 } else {
310 cl->next_alive = cl;
311 q->activemask |= (1<<prio);
316 Unlink class from active chain.
317 Note that this same procedure is done directly in cbq_dequeue*
318 during round-robin procedure.
321 static void cbq_deactivate_class(struct cbq_class *this)
323 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
324 int prio = this->cpriority;
325 struct cbq_class *cl;
326 struct cbq_class *cl_prev = q->active[prio];
328 do {
329 cl = cl_prev->next_alive;
330 if (cl == this) {
331 cl_prev->next_alive = cl->next_alive;
332 cl->next_alive = NULL;
334 if (cl == q->active[prio]) {
335 q->active[prio] = cl_prev;
336 if (cl == q->active[prio]) {
337 q->active[prio] = NULL;
338 q->activemask &= ~(1<<prio);
339 return;
342 return;
344 } while ((cl_prev = cl) != q->active[prio]);
347 static void
348 cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
350 int toplevel = q->toplevel;
352 if (toplevel > cl->level && !(cl->q->flags&TCQ_F_THROTTLED)) {
353 psched_time_t now;
354 psched_tdiff_t incr;
356 now = psched_get_time();
357 incr = now - q->now_rt;
358 now = q->now + incr;
360 do {
361 if (cl->undertime < now) {
362 q->toplevel = cl->level;
363 return;
365 } while ((cl=cl->borrow) != NULL && toplevel > cl->level);
369 static int
370 cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
372 struct cbq_sched_data *q = qdisc_priv(sch);
373 int uninitialized_var(ret);
374 struct cbq_class *cl = cbq_classify(skb, sch, &ret);
376 #ifdef CONFIG_NET_CLS_ACT
377 q->rx_class = cl;
378 #endif
379 if (cl == NULL) {
380 if (ret & __NET_XMIT_BYPASS)
381 sch->qstats.drops++;
382 kfree_skb(skb);
383 return ret;
386 #ifdef CONFIG_NET_CLS_ACT
387 cl->q->__parent = sch;
388 #endif
389 ret = qdisc_enqueue(skb, cl->q);
390 if (ret == NET_XMIT_SUCCESS) {
391 sch->q.qlen++;
392 sch->bstats.packets++;
393 sch->bstats.bytes += qdisc_pkt_len(skb);
394 cbq_mark_toplevel(q, cl);
395 if (!cl->next_alive)
396 cbq_activate_class(cl);
397 return ret;
400 if (net_xmit_drop_count(ret)) {
401 sch->qstats.drops++;
402 cbq_mark_toplevel(q, cl);
403 cl->qstats.drops++;
405 return ret;
408 static int
409 cbq_requeue(struct sk_buff *skb, struct Qdisc *sch)
411 struct cbq_sched_data *q = qdisc_priv(sch);
412 struct cbq_class *cl;
413 int ret;
415 if ((cl = q->tx_class) == NULL) {
416 kfree_skb(skb);
417 sch->qstats.drops++;
418 return NET_XMIT_CN;
420 q->tx_class = NULL;
422 cbq_mark_toplevel(q, cl);
424 #ifdef CONFIG_NET_CLS_ACT
425 q->rx_class = cl;
426 cl->q->__parent = sch;
427 #endif
428 if ((ret = cl->q->ops->requeue(skb, cl->q)) == 0) {
429 sch->q.qlen++;
430 sch->qstats.requeues++;
431 if (!cl->next_alive)
432 cbq_activate_class(cl);
433 return 0;
435 if (net_xmit_drop_count(ret)) {
436 sch->qstats.drops++;
437 cl->qstats.drops++;
439 return ret;
442 /* Overlimit actions */
444 /* TC_CBQ_OVL_CLASSIC: (default) penalize leaf class by adding offtime */
446 static void cbq_ovl_classic(struct cbq_class *cl)
448 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
449 psched_tdiff_t delay = cl->undertime - q->now;
451 if (!cl->delayed) {
452 delay += cl->offtime;
455 Class goes to sleep, so that it will have no
456 chance to work off its avgidle. Let's forgive it 8)
458 BTW cbq-2.0 has a bug in this
459 place, apparently they forgot to shift it by cl->ewma_log.
461 if (cl->avgidle < 0)
462 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
463 if (cl->avgidle < cl->minidle)
464 cl->avgidle = cl->minidle;
465 if (delay <= 0)
466 delay = 1;
467 cl->undertime = q->now + delay;
469 cl->xstats.overactions++;
470 cl->delayed = 1;
472 if (q->wd_expires == 0 || q->wd_expires > delay)
473 q->wd_expires = delay;
475 /* Dirty work! We must schedule wakeups based on
476 real available rate, rather than leaf rate,
477 which may be tiny (even zero).
479 if (q->toplevel == TC_CBQ_MAXLEVEL) {
480 struct cbq_class *b;
481 psched_tdiff_t base_delay = q->wd_expires;
483 for (b = cl->borrow; b; b = b->borrow) {
484 delay = b->undertime - q->now;
485 if (delay < base_delay) {
486 if (delay <= 0)
487 delay = 1;
488 base_delay = delay;
492 q->wd_expires = base_delay;
496 /* TC_CBQ_OVL_RCLASSIC: penalize classes in the hierarchy by offtime
497 when they go overlimit
500 static void cbq_ovl_rclassic(struct cbq_class *cl)
502 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
503 struct cbq_class *this = cl;
505 do {
506 if (cl->level > q->toplevel) {
507 cl = NULL;
508 break;
510 } while ((cl = cl->borrow) != NULL);
512 if (cl == NULL)
513 cl = this;
514 cbq_ovl_classic(cl);
517 /* TC_CBQ_OVL_DELAY: delay until the class goes underlimit */
519 static void cbq_ovl_delay(struct cbq_class *cl)
521 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
522 psched_tdiff_t delay = cl->undertime - q->now;
524 if (!cl->delayed) {
525 psched_time_t sched = q->now;
526 ktime_t expires;
528 delay += cl->offtime;
529 if (cl->avgidle < 0)
530 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
531 if (cl->avgidle < cl->minidle)
532 cl->avgidle = cl->minidle;
533 cl->undertime = q->now + delay;
535 if (delay > 0) {
536 sched += delay + cl->penalty;
537 cl->penalized = sched;
538 cl->cpriority = TC_CBQ_MAXPRIO;
539 q->pmask |= (1<<TC_CBQ_MAXPRIO);
541 expires = ktime_set(0, 0);
542 expires = ktime_add_ns(expires, PSCHED_US2NS(sched));
543 if (hrtimer_try_to_cancel(&q->delay_timer) &&
544 ktime_to_ns(ktime_sub(q->delay_timer.expires,
545 expires)) > 0)
546 q->delay_timer.expires = expires;
547 hrtimer_restart(&q->delay_timer);
548 cl->delayed = 1;
549 cl->xstats.overactions++;
550 return;
552 delay = 1;
554 if (q->wd_expires == 0 || q->wd_expires > delay)
555 q->wd_expires = delay;
558 /* TC_CBQ_OVL_LOWPRIO: penalize class by lowering its priority band */
560 static void cbq_ovl_lowprio(struct cbq_class *cl)
562 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
564 cl->penalized = q->now + cl->penalty;
566 if (cl->cpriority != cl->priority2) {
567 cl->cpriority = cl->priority2;
568 q->pmask |= (1<<cl->cpriority);
569 cl->xstats.overactions++;
571 cbq_ovl_classic(cl);
574 /* TC_CBQ_OVL_DROP: penalize class by dropping */
576 static void cbq_ovl_drop(struct cbq_class *cl)
578 if (cl->q->ops->drop)
579 if (cl->q->ops->drop(cl->q))
580 cl->qdisc->q.qlen--;
581 cl->xstats.overactions++;
582 cbq_ovl_classic(cl);
585 static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
586 psched_time_t now)
588 struct cbq_class *cl;
589 struct cbq_class *cl_prev = q->active[prio];
590 psched_time_t sched = now;
592 if (cl_prev == NULL)
593 return 0;
595 do {
596 cl = cl_prev->next_alive;
597 if (now - cl->penalized > 0) {
598 cl_prev->next_alive = cl->next_alive;
599 cl->next_alive = NULL;
600 cl->cpriority = cl->priority;
601 cl->delayed = 0;
602 cbq_activate_class(cl);
604 if (cl == q->active[prio]) {
605 q->active[prio] = cl_prev;
606 if (cl == q->active[prio]) {
607 q->active[prio] = NULL;
608 return 0;
612 cl = cl_prev->next_alive;
613 } else if (sched - cl->penalized > 0)
614 sched = cl->penalized;
615 } while ((cl_prev = cl) != q->active[prio]);
617 return sched - now;
620 static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
622 struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
623 delay_timer);
624 struct Qdisc *sch = q->watchdog.qdisc;
625 psched_time_t now;
626 psched_tdiff_t delay = 0;
627 unsigned pmask;
629 now = psched_get_time();
631 pmask = q->pmask;
632 q->pmask = 0;
634 while (pmask) {
635 int prio = ffz(~pmask);
636 psched_tdiff_t tmp;
638 pmask &= ~(1<<prio);
640 tmp = cbq_undelay_prio(q, prio, now);
641 if (tmp > 0) {
642 q->pmask |= 1<<prio;
643 if (tmp < delay || delay == 0)
644 delay = tmp;
648 if (delay) {
649 ktime_t time;
651 time = ktime_set(0, 0);
652 time = ktime_add_ns(time, PSCHED_US2NS(now + delay));
653 hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS);
656 sch->flags &= ~TCQ_F_THROTTLED;
657 __netif_schedule(qdisc_root(sch));
658 return HRTIMER_NORESTART;
661 #ifdef CONFIG_NET_CLS_ACT
662 static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
664 struct Qdisc *sch = child->__parent;
665 struct cbq_sched_data *q = qdisc_priv(sch);
666 struct cbq_class *cl = q->rx_class;
668 q->rx_class = NULL;
670 if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) {
671 int ret;
673 cbq_mark_toplevel(q, cl);
675 q->rx_class = cl;
676 cl->q->__parent = sch;
678 ret = qdisc_enqueue(skb, cl->q);
679 if (ret == NET_XMIT_SUCCESS) {
680 sch->q.qlen++;
681 sch->bstats.packets++;
682 sch->bstats.bytes += qdisc_pkt_len(skb);
683 if (!cl->next_alive)
684 cbq_activate_class(cl);
685 return 0;
687 if (net_xmit_drop_count(ret))
688 sch->qstats.drops++;
689 return 0;
692 sch->qstats.drops++;
693 return -1;
695 #endif
698 It is a mission-critical procedure.
700 We "regenerate" the toplevel cutoff if the transmitting class
701 has backlog and is not regulated. It is not part of the
702 original CBQ description, but looks more reasonable.
703 Probably, it is wrong. This question needs further investigation.
706 static __inline__ void
707 cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
708 struct cbq_class *borrowed)
710 if (cl && q->toplevel >= borrowed->level) {
711 if (cl->q->q.qlen > 1) {
712 do {
713 if (borrowed->undertime == PSCHED_PASTPERFECT) {
714 q->toplevel = borrowed->level;
715 return;
717 } while ((borrowed=borrowed->borrow) != NULL);
719 #if 0
720 /* It is not necessary now. Uncommenting it
721 will save CPU cycles, but decrease fairness.
723 q->toplevel = TC_CBQ_MAXLEVEL;
724 #endif
728 static void
729 cbq_update(struct cbq_sched_data *q)
731 struct cbq_class *this = q->tx_class;
732 struct cbq_class *cl = this;
733 int len = q->tx_len;
735 q->tx_class = NULL;
737 for ( ; cl; cl = cl->share) {
738 long avgidle = cl->avgidle;
739 long idle;
741 cl->bstats.packets++;
742 cl->bstats.bytes += len;
745 (now - last) is total time between packet right edges.
746 (last_pktlen/rate) is "virtual" busy time, so that
748 idle = (now - last) - last_pktlen/rate
751 idle = q->now - cl->last;
752 if ((unsigned long)idle > 128*1024*1024) {
753 avgidle = cl->maxidle;
754 } else {
755 idle -= L2T(cl, len);
757 /* true_avgidle := (1-W)*true_avgidle + W*idle,
758 where W=2^{-ewma_log}. But cl->avgidle is scaled:
759 cl->avgidle == true_avgidle/W,
760 hence:
762 avgidle += idle - (avgidle>>cl->ewma_log);
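/* Editorial derivation: dividing true_avgidle' = (1-W)*true_avgidle + W*idle
   by W gives avgidle' = (1-W)*avgidle + idle = avgidle + idle - W*avgidle,
   and with W = 2^(-ewma_log) the term W*avgidle is avgidle >> ewma_log,
   which is exactly the update above. */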
765 if (avgidle <= 0) {
766 /* Overlimit or at-limit */
768 if (avgidle < cl->minidle)
769 avgidle = cl->minidle;
771 cl->avgidle = avgidle;
773 /* Calculate the expected time when this class
774 will be allowed to send.
775 It will occur when:
776 (1-W)*true_avgidle + W*delay = 0, i.e.
777 idle = (1/W - 1)*(-true_avgidle), or equivalently
779 idle = (1 - W)*(-cl->avgidle);
781 idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
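/* Editorial note: since true_avgidle = W*avgidle, solving
   (1-W)*true_avgidle + W*delay = 0 for the delay gives
   (1/W - 1)*(-true_avgidle) = (1 - W)*(-avgidle),
   and with W = 2^(-ewma_log) this is what the line above computes:
   (-avgidle) minus (-avgidle) >> ewma_log. */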
784 That is not all.
785 To maintain the rate allocated to the class,
786 we add to undertime the virtual clock time
787 necessary to complete the transmitted packet.
788 (len/phys_bandwidth has already passed
789 by the moment of cbq_update.)
792 idle -= L2T(&q->link, len);
793 idle += L2T(cl, len);
795 cl->undertime = q->now + idle;
796 } else {
797 /* Underlimit */
799 cl->undertime = PSCHED_PASTPERFECT;
800 if (avgidle > cl->maxidle)
801 cl->avgidle = cl->maxidle;
802 else
803 cl->avgidle = avgidle;
805 cl->last = q->now;
808 cbq_update_toplevel(q, this, q->tx_borrowed);
811 static __inline__ struct cbq_class *
812 cbq_under_limit(struct cbq_class *cl)
814 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
815 struct cbq_class *this_cl = cl;
817 if (cl->tparent == NULL)
818 return cl;
820 if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
821 cl->delayed = 0;
822 return cl;
825 do {
826 /* This is a very suspicious place. Currently the overlimit
827 action is generated for non-bounded classes
828 only if the link is completely congested.
829 Though it agrees with the ancestor-only paradigm,
830 it looks very stupid. In particular,
831 it means that this chunk of code will either
832 never be called or result in strong amplification
833 of burstiness. Dangerous, silly, and yet
834 no other solution exists.
836 if ((cl = cl->borrow) == NULL) {
837 this_cl->qstats.overlimits++;
838 this_cl->overlimit(this_cl);
839 return NULL;
841 if (cl->level > q->toplevel)
842 return NULL;
843 } while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);
845 cl->delayed = 0;
846 return cl;
849 static __inline__ struct sk_buff *
850 cbq_dequeue_prio(struct Qdisc *sch, int prio)
852 struct cbq_sched_data *q = qdisc_priv(sch);
853 struct cbq_class *cl_tail, *cl_prev, *cl;
854 struct sk_buff *skb;
855 int deficit;
857 cl_tail = cl_prev = q->active[prio];
858 cl = cl_prev->next_alive;
860 do {
861 deficit = 0;
863 /* Start round */
864 do {
865 struct cbq_class *borrow = cl;
867 if (cl->q->q.qlen &&
868 (borrow = cbq_under_limit(cl)) == NULL)
869 goto skip_class;
871 if (cl->deficit <= 0) {
872 /* Class exhausted its allotment for
873 this round. Switch to the next one.
875 deficit = 1;
876 cl->deficit += cl->quantum;
877 goto next_class;
880 skb = cl->q->dequeue(cl->q);
882 /* Class did not give us any skb :-(
883 It could occur even if cl->q->q.qlen != 0
884 e.g. if cl->q == "tbf"
886 if (skb == NULL)
887 goto skip_class;
889 cl->deficit -= qdisc_pkt_len(skb);
890 q->tx_class = cl;
891 q->tx_borrowed = borrow;
892 if (borrow != cl) {
893 #ifndef CBQ_XSTATS_BORROWS_BYTES
894 borrow->xstats.borrows++;
895 cl->xstats.borrows++;
896 #else
897 borrow->xstats.borrows += qdisc_pkt_len(skb);
898 cl->xstats.borrows += qdisc_pkt_len(skb);
899 #endif
901 q->tx_len = qdisc_pkt_len(skb);
903 if (cl->deficit <= 0) {
904 q->active[prio] = cl;
905 cl = cl->next_alive;
906 cl->deficit += cl->quantum;
908 return skb;
910 skip_class:
911 if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
912 /* Class is empty or penalized.
913 Unlink it from active chain.
915 cl_prev->next_alive = cl->next_alive;
916 cl->next_alive = NULL;
918 /* Did cl_tail point to it? */
919 if (cl == cl_tail) {
920 /* Repair it! */
921 cl_tail = cl_prev;
923 /* Was it the last class in this band? */
924 if (cl == cl_tail) {
925 /* Kill the band! */
926 q->active[prio] = NULL;
927 q->activemask &= ~(1<<prio);
928 if (cl->q->q.qlen)
929 cbq_activate_class(cl);
930 return NULL;
933 q->active[prio] = cl_tail;
935 if (cl->q->q.qlen)
936 cbq_activate_class(cl);
938 cl = cl_prev;
941 next_class:
942 cl_prev = cl;
943 cl = cl->next_alive;
944 } while (cl_prev != cl_tail);
945 } while (deficit);
947 q->active[prio] = cl_prev;
949 return NULL;
952 static __inline__ struct sk_buff *
953 cbq_dequeue_1(struct Qdisc *sch)
955 struct cbq_sched_data *q = qdisc_priv(sch);
956 struct sk_buff *skb;
957 unsigned activemask;
959 activemask = q->activemask&0xFF;
960 while (activemask) {
961 int prio = ffz(~activemask);
962 activemask &= ~(1<<prio);
963 skb = cbq_dequeue_prio(sch, prio);
964 if (skb)
965 return skb;
967 return NULL;
970 static struct sk_buff *
971 cbq_dequeue(struct Qdisc *sch)
973 struct sk_buff *skb;
974 struct cbq_sched_data *q = qdisc_priv(sch);
975 psched_time_t now;
976 psched_tdiff_t incr;
978 now = psched_get_time();
979 incr = now - q->now_rt;
981 if (q->tx_class) {
982 psched_tdiff_t incr2;
983 /* Time integrator. We calculate EOS time
984 by adding expected packet transmission time.
985 If real time is greater, we warp artificial clock,
986 so that:
988 cbq_time = max(real_time, work);
990 incr2 = L2T(&q->link, q->tx_len);
991 q->now += incr2;
992 cbq_update(q);
993 if ((incr -= incr2) < 0)
994 incr = 0;
996 q->now += incr;
997 q->now_rt = now;
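/* Editorial note: the two additions above advance q->now by
   max(real elapsed time, expected transmission time of the last packet),
   i.e. the "cbq_time = max(real_time, work)" rule stated in the comment. */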
999 for (;;) {
1000 q->wd_expires = 0;
1002 skb = cbq_dequeue_1(sch);
1003 if (skb) {
1004 sch->q.qlen--;
1005 sch->flags &= ~TCQ_F_THROTTLED;
1006 return skb;
1009 /* All the classes are overlimit.
1011 It is possible if:
1013 1. The scheduler is empty.
1014 2. The toplevel cutoff inhibited borrowing.
1015 3. The root class is overlimit.
1017 Reset the 2nd and 3rd conditions and retry.
1019 Note that NS and cbq-2.0 are buggy: picking
1020 an arbitrary class is appropriate for ancestor-only
1021 sharing, but not for the toplevel algorithm.
1023 Our version is better but slower, because it requires
1024 two passes; this is unavoidable with top-level sharing.
1027 if (q->toplevel == TC_CBQ_MAXLEVEL &&
1028 q->link.undertime == PSCHED_PASTPERFECT)
1029 break;
1031 q->toplevel = TC_CBQ_MAXLEVEL;
1032 q->link.undertime = PSCHED_PASTPERFECT;
1035 /* No packets in scheduler or nobody wants to give them to us :-(
1036 Sigh... start the watchdog timer in the latter case. */
1038 if (sch->q.qlen) {
1039 sch->qstats.overlimits++;
1040 if (q->wd_expires)
1041 qdisc_watchdog_schedule(&q->watchdog,
1042 now + q->wd_expires);
1044 return NULL;
1047 /* CBQ class maintenance routines */
1049 static void cbq_adjust_levels(struct cbq_class *this)
1051 if (this == NULL)
1052 return;
1054 do {
1055 int level = 0;
1056 struct cbq_class *cl;
1058 if ((cl = this->children) != NULL) {
1059 do {
1060 if (cl->level > level)
1061 level = cl->level;
1062 } while ((cl = cl->sibling) != this->children);
1064 this->level = level+1;
1065 } while ((this = this->tparent) != NULL);
1068 static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
1070 struct cbq_class *cl;
1071 struct hlist_node *n;
1072 unsigned int h;
1074 if (q->quanta[prio] == 0)
1075 return;
1077 for (h = 0; h < q->clhash.hashsize; h++) {
1078 hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) {
1079 /* BUGGGG... Beware! This expression suffers from
1080 arithmetic overflows!
1082 if (cl->priority == prio) {
1083 cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
1084 q->quanta[prio];
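/* Editorial illustration of the overflow risk, with hypothetical numbers:
   with a 32-bit long, weight = 1250000 (a 10 Mbit/s rate in bytes/sec),
   allot = 1514 and nclasses = 3 multiply to about 5.7e9, well past 2^31-1,
   so the computed quantum is garbage and the sanity check below has to
   repair it. */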
1086 if (cl->quantum <= 0 || cl->quantum>32*qdisc_dev(cl->qdisc)->mtu) {
1087 printk(KERN_WARNING "CBQ: class %08x has bad quantum==%ld, repaired.\n", cl->common.classid, cl->quantum);
1088 cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
1094 static void cbq_sync_defmap(struct cbq_class *cl)
1096 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1097 struct cbq_class *split = cl->split;
1098 unsigned h;
1099 int i;
1101 if (split == NULL)
1102 return;
1104 for (i=0; i<=TC_PRIO_MAX; i++) {
1105 if (split->defaults[i] == cl && !(cl->defmap&(1<<i)))
1106 split->defaults[i] = NULL;
1109 for (i=0; i<=TC_PRIO_MAX; i++) {
1110 int level = split->level;
1112 if (split->defaults[i])
1113 continue;
1115 for (h = 0; h < q->clhash.hashsize; h++) {
1116 struct hlist_node *n;
1117 struct cbq_class *c;
1119 hlist_for_each_entry(c, n, &q->clhash.hash[h],
1120 common.hnode) {
1121 if (c->split == split && c->level < level &&
1122 c->defmap&(1<<i)) {
1123 split->defaults[i] = c;
1124 level = c->level;
1131 static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
1133 struct cbq_class *split = NULL;
1135 if (splitid == 0) {
1136 if ((split = cl->split) == NULL)
1137 return;
1138 splitid = split->common.classid;
1141 if (split == NULL || split->common.classid != splitid) {
1142 for (split = cl->tparent; split; split = split->tparent)
1143 if (split->common.classid == splitid)
1144 break;
1147 if (split == NULL)
1148 return;
1150 if (cl->split != split) {
1151 cl->defmap = 0;
1152 cbq_sync_defmap(cl);
1153 cl->split = split;
1154 cl->defmap = def&mask;
1155 } else
1156 cl->defmap = (cl->defmap&~mask)|(def&mask);
1158 cbq_sync_defmap(cl);
1161 static void cbq_unlink_class(struct cbq_class *this)
1163 struct cbq_class *cl, **clp;
1164 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1166 qdisc_class_hash_remove(&q->clhash, &this->common);
1168 if (this->tparent) {
1169 clp=&this->sibling;
1170 cl = *clp;
1171 do {
1172 if (cl == this) {
1173 *clp = cl->sibling;
1174 break;
1176 clp = &cl->sibling;
1177 } while ((cl = *clp) != this->sibling);
1179 if (this->tparent->children == this) {
1180 this->tparent->children = this->sibling;
1181 if (this->sibling == this)
1182 this->tparent->children = NULL;
1184 } else {
1185 WARN_ON(this->sibling != this);
1189 static void cbq_link_class(struct cbq_class *this)
1191 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1192 struct cbq_class *parent = this->tparent;
1194 this->sibling = this;
1195 qdisc_class_hash_insert(&q->clhash, &this->common);
1197 if (parent == NULL)
1198 return;
1200 if (parent->children == NULL) {
1201 parent->children = this;
1202 } else {
1203 this->sibling = parent->children->sibling;
1204 parent->children->sibling = this;
1208 static unsigned int cbq_drop(struct Qdisc* sch)
1210 struct cbq_sched_data *q = qdisc_priv(sch);
1211 struct cbq_class *cl, *cl_head;
1212 int prio;
1213 unsigned int len;
1215 for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) {
1216 if ((cl_head = q->active[prio]) == NULL)
1217 continue;
1219 cl = cl_head;
1220 do {
1221 if (cl->q->ops->drop && (len = cl->q->ops->drop(cl->q))) {
1222 sch->q.qlen--;
1223 if (!cl->q->q.qlen)
1224 cbq_deactivate_class(cl);
1225 return len;
1227 } while ((cl = cl->next_alive) != cl_head);
1229 return 0;
1232 static void
1233 cbq_reset(struct Qdisc* sch)
1235 struct cbq_sched_data *q = qdisc_priv(sch);
1236 struct cbq_class *cl;
1237 struct hlist_node *n;
1238 int prio;
1239 unsigned h;
1241 q->activemask = 0;
1242 q->pmask = 0;
1243 q->tx_class = NULL;
1244 q->tx_borrowed = NULL;
1245 qdisc_watchdog_cancel(&q->watchdog);
1246 hrtimer_cancel(&q->delay_timer);
1247 q->toplevel = TC_CBQ_MAXLEVEL;
1248 q->now = psched_get_time();
1249 q->now_rt = q->now;
1251 for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
1252 q->active[prio] = NULL;
1254 for (h = 0; h < q->clhash.hashsize; h++) {
1255 hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) {
1256 qdisc_reset(cl->q);
1258 cl->next_alive = NULL;
1259 cl->undertime = PSCHED_PASTPERFECT;
1260 cl->avgidle = cl->maxidle;
1261 cl->deficit = cl->quantum;
1262 cl->cpriority = cl->priority;
1265 sch->q.qlen = 0;
1269 static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
1271 if (lss->change&TCF_CBQ_LSS_FLAGS) {
1272 cl->share = (lss->flags&TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
1273 cl->borrow = (lss->flags&TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
1275 if (lss->change&TCF_CBQ_LSS_EWMA)
1276 cl->ewma_log = lss->ewma_log;
1277 if (lss->change&TCF_CBQ_LSS_AVPKT)
1278 cl->avpkt = lss->avpkt;
1279 if (lss->change&TCF_CBQ_LSS_MINIDLE)
1280 cl->minidle = -(long)lss->minidle;
1281 if (lss->change&TCF_CBQ_LSS_MAXIDLE) {
1282 cl->maxidle = lss->maxidle;
1283 cl->avgidle = lss->maxidle;
1285 if (lss->change&TCF_CBQ_LSS_OFFTIME)
1286 cl->offtime = lss->offtime;
1287 return 0;
1290 static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1292 q->nclasses[cl->priority]--;
1293 q->quanta[cl->priority] -= cl->weight;
1294 cbq_normalize_quanta(q, cl->priority);
1297 static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1299 q->nclasses[cl->priority]++;
1300 q->quanta[cl->priority] += cl->weight;
1301 cbq_normalize_quanta(q, cl->priority);
1304 static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1306 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1308 if (wrr->allot)
1309 cl->allot = wrr->allot;
1310 if (wrr->weight)
1311 cl->weight = wrr->weight;
1312 if (wrr->priority) {
1313 cl->priority = wrr->priority-1;
1314 cl->cpriority = cl->priority;
1315 if (cl->priority >= cl->priority2)
1316 cl->priority2 = TC_CBQ_MAXPRIO-1;
1319 cbq_addprio(q, cl);
1320 return 0;
1323 static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl)
1325 switch (ovl->strategy) {
1326 case TC_CBQ_OVL_CLASSIC:
1327 cl->overlimit = cbq_ovl_classic;
1328 break;
1329 case TC_CBQ_OVL_DELAY:
1330 cl->overlimit = cbq_ovl_delay;
1331 break;
1332 case TC_CBQ_OVL_LOWPRIO:
1333 if (ovl->priority2-1 >= TC_CBQ_MAXPRIO ||
1334 ovl->priority2-1 <= cl->priority)
1335 return -EINVAL;
1336 cl->priority2 = ovl->priority2-1;
1337 cl->overlimit = cbq_ovl_lowprio;
1338 break;
1339 case TC_CBQ_OVL_DROP:
1340 cl->overlimit = cbq_ovl_drop;
1341 break;
1342 case TC_CBQ_OVL_RCLASSIC:
1343 cl->overlimit = cbq_ovl_rclassic;
1344 break;
1345 default:
1346 return -EINVAL;
1348 cl->penalty = ovl->penalty;
1349 return 0;
1352 #ifdef CONFIG_NET_CLS_ACT
1353 static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p)
1355 cl->police = p->police;
1357 if (cl->q->handle) {
1358 if (p->police == TC_POLICE_RECLASSIFY)
1359 cl->q->reshape_fail = cbq_reshape_fail;
1360 else
1361 cl->q->reshape_fail = NULL;
1363 return 0;
1365 #endif
1367 static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1369 cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1370 return 0;
1373 static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
1374 [TCA_CBQ_LSSOPT] = { .len = sizeof(struct tc_cbq_lssopt) },
1375 [TCA_CBQ_WRROPT] = { .len = sizeof(struct tc_cbq_wrropt) },
1376 [TCA_CBQ_FOPT] = { .len = sizeof(struct tc_cbq_fopt) },
1377 [TCA_CBQ_OVL_STRATEGY] = { .len = sizeof(struct tc_cbq_ovl) },
1378 [TCA_CBQ_RATE] = { .len = sizeof(struct tc_ratespec) },
1379 [TCA_CBQ_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1380 [TCA_CBQ_POLICE] = { .len = sizeof(struct tc_cbq_police) },
1383 static int cbq_init(struct Qdisc *sch, struct nlattr *opt)
1385 struct cbq_sched_data *q = qdisc_priv(sch);
1386 struct nlattr *tb[TCA_CBQ_MAX + 1];
1387 struct tc_ratespec *r;
1388 int err;
1390 err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy);
1391 if (err < 0)
1392 return err;
1394 if (tb[TCA_CBQ_RTAB] == NULL || tb[TCA_CBQ_RATE] == NULL)
1395 return -EINVAL;
1397 r = nla_data(tb[TCA_CBQ_RATE]);
1399 if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB])) == NULL)
1400 return -EINVAL;
1402 err = qdisc_class_hash_init(&q->clhash);
1403 if (err < 0)
1404 goto put_rtab;
1406 q->link.refcnt = 1;
1407 q->link.sibling = &q->link;
1408 q->link.common.classid = sch->handle;
1409 q->link.qdisc = sch;
1410 if (!(q->link.q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
1411 &pfifo_qdisc_ops,
1412 sch->handle)))
1413 q->link.q = &noop_qdisc;
1415 q->link.priority = TC_CBQ_MAXPRIO-1;
1416 q->link.priority2 = TC_CBQ_MAXPRIO-1;
1417 q->link.cpriority = TC_CBQ_MAXPRIO-1;
1418 q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC;
1419 q->link.overlimit = cbq_ovl_classic;
1420 q->link.allot = psched_mtu(qdisc_dev(sch));
1421 q->link.quantum = q->link.allot;
1422 q->link.weight = q->link.R_tab->rate.rate;
1424 q->link.ewma_log = TC_CBQ_DEF_EWMA;
1425 q->link.avpkt = q->link.allot/2;
1426 q->link.minidle = -0x7FFFFFFF;
1428 qdisc_watchdog_init(&q->watchdog, sch);
1429 hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
1430 q->delay_timer.function = cbq_undelay;
1431 q->toplevel = TC_CBQ_MAXLEVEL;
1432 q->now = psched_get_time();
1433 q->now_rt = q->now;
1435 cbq_link_class(&q->link);
1437 if (tb[TCA_CBQ_LSSOPT])
1438 cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));
1440 cbq_addprio(q, &q->link);
1441 return 0;
1443 put_rtab:
1444 qdisc_put_rtab(q->link.R_tab);
1445 return err;
1448 static __inline__ int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1450 unsigned char *b = skb_tail_pointer(skb);
1452 NLA_PUT(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate);
1453 return skb->len;
1455 nla_put_failure:
1456 nlmsg_trim(skb, b);
1457 return -1;
1460 static __inline__ int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1462 unsigned char *b = skb_tail_pointer(skb);
1463 struct tc_cbq_lssopt opt;
1465 opt.flags = 0;
1466 if (cl->borrow == NULL)
1467 opt.flags |= TCF_CBQ_LSS_BOUNDED;
1468 if (cl->share == NULL)
1469 opt.flags |= TCF_CBQ_LSS_ISOLATED;
1470 opt.ewma_log = cl->ewma_log;
1471 opt.level = cl->level;
1472 opt.avpkt = cl->avpkt;
1473 opt.maxidle = cl->maxidle;
1474 opt.minidle = (u32)(-cl->minidle);
1475 opt.offtime = cl->offtime;
1476 opt.change = ~0;
1477 NLA_PUT(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt);
1478 return skb->len;
1480 nla_put_failure:
1481 nlmsg_trim(skb, b);
1482 return -1;
1485 static __inline__ int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1487 unsigned char *b = skb_tail_pointer(skb);
1488 struct tc_cbq_wrropt opt;
1490 opt.flags = 0;
1491 opt.allot = cl->allot;
1492 opt.priority = cl->priority+1;
1493 opt.cpriority = cl->cpriority+1;
1494 opt.weight = cl->weight;
1495 NLA_PUT(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt);
1496 return skb->len;
1498 nla_put_failure:
1499 nlmsg_trim(skb, b);
1500 return -1;
1503 static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl)
1505 unsigned char *b = skb_tail_pointer(skb);
1506 struct tc_cbq_ovl opt;
1508 opt.strategy = cl->ovl_strategy;
1509 opt.priority2 = cl->priority2+1;
1510 opt.pad = 0;
1511 opt.penalty = cl->penalty;
1512 NLA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt);
1513 return skb->len;
1515 nla_put_failure:
1516 nlmsg_trim(skb, b);
1517 return -1;
1520 static __inline__ int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1522 unsigned char *b = skb_tail_pointer(skb);
1523 struct tc_cbq_fopt opt;
1525 if (cl->split || cl->defmap) {
1526 opt.split = cl->split ? cl->split->common.classid : 0;
1527 opt.defmap = cl->defmap;
1528 opt.defchange = ~0;
1529 NLA_PUT(skb, TCA_CBQ_FOPT, sizeof(opt), &opt);
1531 return skb->len;
1533 nla_put_failure:
1534 nlmsg_trim(skb, b);
1535 return -1;
1538 #ifdef CONFIG_NET_CLS_ACT
1539 static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)
1541 unsigned char *b = skb_tail_pointer(skb);
1542 struct tc_cbq_police opt;
1544 if (cl->police) {
1545 opt.police = cl->police;
1546 opt.__res1 = 0;
1547 opt.__res2 = 0;
1548 NLA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt);
1550 return skb->len;
1552 nla_put_failure:
1553 nlmsg_trim(skb, b);
1554 return -1;
1556 #endif
1558 static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
1560 if (cbq_dump_lss(skb, cl) < 0 ||
1561 cbq_dump_rate(skb, cl) < 0 ||
1562 cbq_dump_wrr(skb, cl) < 0 ||
1563 cbq_dump_ovl(skb, cl) < 0 ||
1564 #ifdef CONFIG_NET_CLS_ACT
1565 cbq_dump_police(skb, cl) < 0 ||
1566 #endif
1567 cbq_dump_fopt(skb, cl) < 0)
1568 return -1;
1569 return 0;
1572 static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
1574 struct cbq_sched_data *q = qdisc_priv(sch);
1575 struct nlattr *nest;
1577 nest = nla_nest_start(skb, TCA_OPTIONS);
1578 if (nest == NULL)
1579 goto nla_put_failure;
1580 if (cbq_dump_attr(skb, &q->link) < 0)
1581 goto nla_put_failure;
1582 nla_nest_end(skb, nest);
1583 return skb->len;
1585 nla_put_failure:
1586 nla_nest_cancel(skb, nest);
1587 return -1;
1590 static int
1591 cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1593 struct cbq_sched_data *q = qdisc_priv(sch);
1595 q->link.xstats.avgidle = q->link.avgidle;
1596 return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
1599 static int
1600 cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1601 struct sk_buff *skb, struct tcmsg *tcm)
1603 struct cbq_class *cl = (struct cbq_class*)arg;
1604 struct nlattr *nest;
1606 if (cl->tparent)
1607 tcm->tcm_parent = cl->tparent->common.classid;
1608 else
1609 tcm->tcm_parent = TC_H_ROOT;
1610 tcm->tcm_handle = cl->common.classid;
1611 tcm->tcm_info = cl->q->handle;
1613 nest = nla_nest_start(skb, TCA_OPTIONS);
1614 if (nest == NULL)
1615 goto nla_put_failure;
1616 if (cbq_dump_attr(skb, cl) < 0)
1617 goto nla_put_failure;
1618 nla_nest_end(skb, nest);
1619 return skb->len;
1621 nla_put_failure:
1622 nla_nest_cancel(skb, nest);
1623 return -1;
1626 static int
1627 cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1628 struct gnet_dump *d)
1630 struct cbq_sched_data *q = qdisc_priv(sch);
1631 struct cbq_class *cl = (struct cbq_class*)arg;
1633 cl->qstats.qlen = cl->q->q.qlen;
1634 cl->xstats.avgidle = cl->avgidle;
1635 cl->xstats.undertime = 0;
1637 if (cl->undertime != PSCHED_PASTPERFECT)
1638 cl->xstats.undertime = cl->undertime - q->now;
1640 if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
1641 gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1642 gnet_stats_copy_queue(d, &cl->qstats) < 0)
1643 return -1;
1645 return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1648 static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1649 struct Qdisc **old)
1651 struct cbq_class *cl = (struct cbq_class*)arg;
1653 if (cl) {
1654 if (new == NULL) {
1655 new = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
1656 &pfifo_qdisc_ops,
1657 cl->common.classid);
1658 if (new == NULL)
1659 return -ENOBUFS;
1660 } else {
1661 #ifdef CONFIG_NET_CLS_ACT
1662 if (cl->police == TC_POLICE_RECLASSIFY)
1663 new->reshape_fail = cbq_reshape_fail;
1664 #endif
1666 sch_tree_lock(sch);
1667 *old = xchg(&cl->q, new);
1668 qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
1669 qdisc_reset(*old);
1670 sch_tree_unlock(sch);
1672 return 0;
1674 return -ENOENT;
1677 static struct Qdisc *
1678 cbq_leaf(struct Qdisc *sch, unsigned long arg)
1680 struct cbq_class *cl = (struct cbq_class*)arg;
1682 return cl ? cl->q : NULL;
1685 static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
1687 struct cbq_class *cl = (struct cbq_class *)arg;
1689 if (cl->q->q.qlen == 0)
1690 cbq_deactivate_class(cl);
1693 static unsigned long cbq_get(struct Qdisc *sch, u32 classid)
1695 struct cbq_sched_data *q = qdisc_priv(sch);
1696 struct cbq_class *cl = cbq_class_lookup(q, classid);
1698 if (cl) {
1699 cl->refcnt++;
1700 return (unsigned long)cl;
1702 return 0;
1705 static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1707 struct cbq_sched_data *q = qdisc_priv(sch);
1709 WARN_ON(cl->filters);
1711 tcf_destroy_chain(&cl->filter_list);
1712 qdisc_destroy(cl->q);
1713 qdisc_put_rtab(cl->R_tab);
1714 gen_kill_estimator(&cl->bstats, &cl->rate_est);
1715 if (cl != &q->link)
1716 kfree(cl);
1719 static void
1720 cbq_destroy(struct Qdisc* sch)
1722 struct cbq_sched_data *q = qdisc_priv(sch);
1723 struct hlist_node *n, *next;
1724 struct cbq_class *cl;
1725 unsigned h;
1727 #ifdef CONFIG_NET_CLS_ACT
1728 q->rx_class = NULL;
1729 #endif
1731 * Filters must be destroyed first because we don't destroy the
1732 * classes from root to leaves, which means that filters can still
1733 * be bound to classes which have been destroyed already. --TGR '04
1735 for (h = 0; h < q->clhash.hashsize; h++) {
1736 hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode)
1737 tcf_destroy_chain(&cl->filter_list);
1739 for (h = 0; h < q->clhash.hashsize; h++) {
1740 hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[h],
1741 common.hnode)
1742 cbq_destroy_class(sch, cl);
1744 qdisc_class_hash_destroy(&q->clhash);
1747 static void cbq_put(struct Qdisc *sch, unsigned long arg)
1749 struct cbq_class *cl = (struct cbq_class*)arg;
1751 if (--cl->refcnt == 0) {
1752 #ifdef CONFIG_NET_CLS_ACT
1753 spinlock_t *root_lock = qdisc_root_lock(sch);
1754 struct cbq_sched_data *q = qdisc_priv(sch);
1756 spin_lock_bh(root_lock);
1757 if (q->rx_class == cl)
1758 q->rx_class = NULL;
1759 spin_unlock_bh(root_lock);
1760 #endif
1762 cbq_destroy_class(sch, cl);
1766 static int
1767 cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
1768 unsigned long *arg)
1770 int err;
1771 struct cbq_sched_data *q = qdisc_priv(sch);
1772 struct cbq_class *cl = (struct cbq_class*)*arg;
1773 struct nlattr *opt = tca[TCA_OPTIONS];
1774 struct nlattr *tb[TCA_CBQ_MAX + 1];
1775 struct cbq_class *parent;
1776 struct qdisc_rate_table *rtab = NULL;
1778 if (opt == NULL)
1779 return -EINVAL;
1781 err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy);
1782 if (err < 0)
1783 return err;
1785 if (cl) {
1786 /* Check parent */
1787 if (parentid) {
1788 if (cl->tparent &&
1789 cl->tparent->common.classid != parentid)
1790 return -EINVAL;
1791 if (!cl->tparent && parentid != TC_H_ROOT)
1792 return -EINVAL;
1795 if (tb[TCA_CBQ_RATE]) {
1796 rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB]);
1797 if (rtab == NULL)
1798 return -EINVAL;
1801 /* Change class parameters */
1802 sch_tree_lock(sch);
1804 if (cl->next_alive != NULL)
1805 cbq_deactivate_class(cl);
1807 if (rtab) {
1808 rtab = xchg(&cl->R_tab, rtab);
1809 qdisc_put_rtab(rtab);
1812 if (tb[TCA_CBQ_LSSOPT])
1813 cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1815 if (tb[TCA_CBQ_WRROPT]) {
1816 cbq_rmprio(q, cl);
1817 cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1820 if (tb[TCA_CBQ_OVL_STRATEGY])
1821 cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY]));
1823 #ifdef CONFIG_NET_CLS_ACT
1824 if (tb[TCA_CBQ_POLICE])
1825 cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE]));
1826 #endif
1828 if (tb[TCA_CBQ_FOPT])
1829 cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1831 if (cl->q->q.qlen)
1832 cbq_activate_class(cl);
1834 sch_tree_unlock(sch);
1836 if (tca[TCA_RATE])
1837 gen_replace_estimator(&cl->bstats, &cl->rate_est,
1838 qdisc_root_lock(sch),
1839 tca[TCA_RATE]);
1840 return 0;
1843 if (parentid == TC_H_ROOT)
1844 return -EINVAL;
1846 if (tb[TCA_CBQ_WRROPT] == NULL || tb[TCA_CBQ_RATE] == NULL ||
1847 tb[TCA_CBQ_LSSOPT] == NULL)
1848 return -EINVAL;
1850 rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB]);
1851 if (rtab == NULL)
1852 return -EINVAL;
1854 if (classid) {
1855 err = -EINVAL;
1856 if (TC_H_MAJ(classid^sch->handle) || cbq_class_lookup(q, classid))
1857 goto failure;
1858 } else {
1859 int i;
1860 classid = TC_H_MAKE(sch->handle,0x8000);
1862 for (i=0; i<0x8000; i++) {
1863 if (++q->hgenerator >= 0x8000)
1864 q->hgenerator = 1;
1865 if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1866 break;
1868 err = -ENOSR;
1869 if (i >= 0x8000)
1870 goto failure;
1871 classid = classid|q->hgenerator;
1874 parent = &q->link;
1875 if (parentid) {
1876 parent = cbq_class_lookup(q, parentid);
1877 err = -EINVAL;
1878 if (parent == NULL)
1879 goto failure;
1882 err = -ENOBUFS;
1883 cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1884 if (cl == NULL)
1885 goto failure;
1886 cl->R_tab = rtab;
1887 rtab = NULL;
1888 cl->refcnt = 1;
1889 if (!(cl->q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
1890 &pfifo_qdisc_ops, classid)))
1891 cl->q = &noop_qdisc;
1892 cl->common.classid = classid;
1893 cl->tparent = parent;
1894 cl->qdisc = sch;
1895 cl->allot = parent->allot;
1896 cl->quantum = cl->allot;
1897 cl->weight = cl->R_tab->rate.rate;
1899 sch_tree_lock(sch);
1900 cbq_link_class(cl);
1901 cl->borrow = cl->tparent;
1902 if (cl->tparent != &q->link)
1903 cl->share = cl->tparent;
1904 cbq_adjust_levels(parent);
1905 cl->minidle = -0x7FFFFFFF;
1906 cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1907 cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1908 if (cl->ewma_log==0)
1909 cl->ewma_log = q->link.ewma_log;
1910 if (cl->maxidle==0)
1911 cl->maxidle = q->link.maxidle;
1912 if (cl->avpkt==0)
1913 cl->avpkt = q->link.avpkt;
1914 cl->overlimit = cbq_ovl_classic;
1915 if (tb[TCA_CBQ_OVL_STRATEGY])
1916 cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY]));
1917 #ifdef CONFIG_NET_CLS_ACT
1918 if (tb[TCA_CBQ_POLICE])
1919 cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE]));
1920 #endif
1921 if (tb[TCA_CBQ_FOPT])
1922 cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1923 sch_tree_unlock(sch);
1925 qdisc_class_hash_grow(sch, &q->clhash);
1927 if (tca[TCA_RATE])
1928 gen_new_estimator(&cl->bstats, &cl->rate_est,
1929 qdisc_root_lock(sch), tca[TCA_RATE]);
1931 *arg = (unsigned long)cl;
1932 return 0;
1934 failure:
1935 qdisc_put_rtab(rtab);
1936 return err;
1939 static int cbq_delete(struct Qdisc *sch, unsigned long arg)
1941 struct cbq_sched_data *q = qdisc_priv(sch);
1942 struct cbq_class *cl = (struct cbq_class*)arg;
1943 unsigned int qlen;
1945 if (cl->filters || cl->children || cl == &q->link)
1946 return -EBUSY;
1948 sch_tree_lock(sch);
1950 qlen = cl->q->q.qlen;
1951 qdisc_reset(cl->q);
1952 qdisc_tree_decrease_qlen(cl->q, qlen);
1954 if (cl->next_alive)
1955 cbq_deactivate_class(cl);
1957 if (q->tx_borrowed == cl)
1958 q->tx_borrowed = q->tx_class;
1959 if (q->tx_class == cl) {
1960 q->tx_class = NULL;
1961 q->tx_borrowed = NULL;
1963 #ifdef CONFIG_NET_CLS_ACT
1964 if (q->rx_class == cl)
1965 q->rx_class = NULL;
1966 #endif
1968 cbq_unlink_class(cl);
1969 cbq_adjust_levels(cl->tparent);
1970 cl->defmap = 0;
1971 cbq_sync_defmap(cl);
1973 cbq_rmprio(q, cl);
1974 sch_tree_unlock(sch);
1976 if (--cl->refcnt == 0)
1977 cbq_destroy_class(sch, cl);
1979 return 0;
1982 static struct tcf_proto **cbq_find_tcf(struct Qdisc *sch, unsigned long arg)
1984 struct cbq_sched_data *q = qdisc_priv(sch);
1985 struct cbq_class *cl = (struct cbq_class *)arg;
1987 if (cl == NULL)
1988 cl = &q->link;
1990 return &cl->filter_list;
1993 static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
1994 u32 classid)
1996 struct cbq_sched_data *q = qdisc_priv(sch);
1997 struct cbq_class *p = (struct cbq_class*)parent;
1998 struct cbq_class *cl = cbq_class_lookup(q, classid);
2000 if (cl) {
2001 if (p && p->level <= cl->level)
2002 return 0;
2003 cl->filters++;
2004 return (unsigned long)cl;
2006 return 0;
2009 static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
2011 struct cbq_class *cl = (struct cbq_class*)arg;
2013 cl->filters--;
2016 static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
2018 struct cbq_sched_data *q = qdisc_priv(sch);
2019 struct cbq_class *cl;
2020 struct hlist_node *n;
2021 unsigned h;
2023 if (arg->stop)
2024 return;
2026 for (h = 0; h < q->clhash.hashsize; h++) {
2027 hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) {
2028 if (arg->count < arg->skip) {
2029 arg->count++;
2030 continue;
2032 if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
2033 arg->stop = 1;
2034 return;
2036 arg->count++;
2041 static const struct Qdisc_class_ops cbq_class_ops = {
2042 .graft = cbq_graft,
2043 .leaf = cbq_leaf,
2044 .qlen_notify = cbq_qlen_notify,
2045 .get = cbq_get,
2046 .put = cbq_put,
2047 .change = cbq_change_class,
2048 .delete = cbq_delete,
2049 .walk = cbq_walk,
2050 .tcf_chain = cbq_find_tcf,
2051 .bind_tcf = cbq_bind_filter,
2052 .unbind_tcf = cbq_unbind_filter,
2053 .dump = cbq_dump_class,
2054 .dump_stats = cbq_dump_class_stats,
2057 static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
2058 .next = NULL,
2059 .cl_ops = &cbq_class_ops,
2060 .id = "cbq",
2061 .priv_size = sizeof(struct cbq_sched_data),
2062 .enqueue = cbq_enqueue,
2063 .dequeue = cbq_dequeue,
2064 .requeue = cbq_requeue,
2065 .drop = cbq_drop,
2066 .init = cbq_init,
2067 .reset = cbq_reset,
2068 .destroy = cbq_destroy,
2069 .change = NULL,
2070 .dump = cbq_dump,
2071 .dump_stats = cbq_dump_stats,
2072 .owner = THIS_MODULE,
2075 static int __init cbq_module_init(void)
2077 return register_qdisc(&cbq_qdisc_ops);
2079 static void __exit cbq_module_exit(void)
2081 unregister_qdisc(&cbq_qdisc_ops);
2083 module_init(cbq_module_init)
2084 module_exit(cbq_module_exit)
2085 MODULE_LICENSE("GPL");