hwmon/f71882fg: Make creation of in sysfs attributes more generic
[linux-2.6/linux-acpi-2.6/ibm-acpi-2.6.git] / net / sched / sch_cbq.c
blob5f63ec58942c10ac5088c6e6fca6e40463235a0b
1 /*
2 * net/sched/sch_cbq.c Class-Based Queueing discipline.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 #include <linux/module.h>
14 #include <linux/slab.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/string.h>
18 #include <linux/errno.h>
19 #include <linux/skbuff.h>
20 #include <net/netlink.h>
21 #include <net/pkt_sched.h>
24 /* Class-Based Queueing (CBQ) algorithm.
25 =======================================
27 Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
28 Management Models for Packet Networks",
29 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
31 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
33 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
34 Parameters", 1996
36 [4] Sally Floyd and Michael Speer, "Experimental Results
37 for Class-Based Queueing", 1998, not published.
39 -----------------------------------------------------------------------
41 Algorithm skeleton was taken from NS simulator cbq.cc.
42 If someone wants to check this code against the LBL version,
43 he should take into account that ONLY the skeleton was borrowed,
44 the implementation is different. Particularly:
46 --- The WRR algorithm is different. Our version looks more
47 reasonable (I hope) and works when quanta are allowed to be
48 less than MTU, which is always the case when real time classes
49 have small rates. Note, that the statement of [3] is
50 incomplete, delay may actually be estimated even if class
51 per-round allotment is less than MTU. Namely, if per-round
52 allotment is W*r_i, and r_1+...+r_k = r < 1
54 delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
56 In the worst case we have IntServ estimate with D = W*r+k*MTU
57 and C = MTU*r. The proof (if correct at all) is trivial.
60 --- It seems that cbq-2.0 is not very accurate. At least, I cannot
61 interpret some places, which look like wrong translations
62 from NS. Anyone is advised to find these differences
63 and explain to me, why I am wrong 8).
65 --- Linux has no EOI event, so that we cannot estimate true class
66 idle time. Workaround is to consider the next dequeue event
67 as sign that previous packet is finished. This is wrong because of
68 internal device queueing, but on a permanently loaded link it is true.
69 Moreover, combined with clock integrator, this scheme looks
70 very close to an ideal solution. */
72 struct cbq_sched_data;
75 struct cbq_class
77 struct Qdisc_class_common common;
78 struct cbq_class *next_alive; /* next class with backlog in this priority band */
80 /* Parameters */
81 unsigned char priority; /* class priority */
82 unsigned char priority2; /* priority to be used after overlimit */
83 unsigned char ewma_log; /* time constant for idle time calculation */
84 unsigned char ovl_strategy;
85 #ifdef CONFIG_NET_CLS_ACT
86 unsigned char police;
87 #endif
89 u32 defmap;
91 /* Link-sharing scheduler parameters */
92 long maxidle; /* Class parameters: see below. */
93 long offtime;
94 long minidle;
95 u32 avpkt;
96 struct qdisc_rate_table *R_tab;
98 /* Overlimit strategy parameters */
99 void (*overlimit)(struct cbq_class *cl);
100 psched_tdiff_t penalty;
102 /* General scheduler (WRR) parameters */
103 long allot;
104 long quantum; /* Allotment per WRR round */
105 long weight; /* Relative allotment: see below */
107 struct Qdisc *qdisc; /* Ptr to CBQ discipline */
108 struct cbq_class *split; /* Ptr to split node */
109 struct cbq_class *share; /* Ptr to LS parent in the class tree */
110 struct cbq_class *tparent; /* Ptr to tree parent in the class tree */
111 struct cbq_class *borrow; /* NULL if class is bandwidth limited;
112 parent otherwise */
113 struct cbq_class *sibling; /* Sibling chain */
114 struct cbq_class *children; /* Pointer to children chain */
116 struct Qdisc *q; /* Elementary queueing discipline */
119 /* Variables */
120 unsigned char cpriority; /* Effective priority */
121 unsigned char delayed;
122 unsigned char level; /* level of the class in hierarchy:
123 0 for leaf classes, and maximal
124 level of children + 1 for nodes.
127 psched_time_t last; /* Last end of service */
128 psched_time_t undertime;
129 long avgidle;
130 long deficit; /* Saved deficit for WRR */
131 psched_time_t penalized;
132 struct gnet_stats_basic_packed bstats;
133 struct gnet_stats_queue qstats;
134 struct gnet_stats_rate_est rate_est;
135 struct tc_cbq_xstats xstats;
137 struct tcf_proto *filter_list;
139 int refcnt;
140 int filters;
142 struct cbq_class *defaults[TC_PRIO_MAX+1];
145 struct cbq_sched_data
147 struct Qdisc_class_hash clhash; /* Hash table of all classes */
148 int nclasses[TC_CBQ_MAXPRIO+1];
149 unsigned quanta[TC_CBQ_MAXPRIO+1];
151 struct cbq_class link;
153 unsigned activemask;
154 struct cbq_class *active[TC_CBQ_MAXPRIO+1]; /* List of all classes
155 with backlog */
157 #ifdef CONFIG_NET_CLS_ACT
158 struct cbq_class *rx_class;
159 #endif
160 struct cbq_class *tx_class;
161 struct cbq_class *tx_borrowed;
162 int tx_len;
163 psched_time_t now; /* Cached timestamp */
164 psched_time_t now_rt; /* Cached real time */
165 unsigned pmask;
167 struct hrtimer delay_timer;
168 struct qdisc_watchdog watchdog; /* Watchdog timer,
169 started when CBQ has
170 backlog, but cannot
171 transmit just now */
172 psched_tdiff_t wd_expires;
173 int toplevel;
174 u32 hgenerator;
178 #define L2T(cl,len) qdisc_l2t((cl)->R_tab,len)
180 static __inline__ struct cbq_class *
181 cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
183 struct Qdisc_class_common *clc;
185 clc = qdisc_class_find(&q->clhash, classid);
186 if (clc == NULL)
187 return NULL;
188 return container_of(clc, struct cbq_class, common);
191 #ifdef CONFIG_NET_CLS_ACT
193 static struct cbq_class *
194 cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
196 struct cbq_class *cl, *new;
198 for (cl = this->tparent; cl; cl = cl->tparent)
199 if ((new = cl->defaults[TC_PRIO_BESTEFFORT]) != NULL && new != this)
200 return new;
202 return NULL;
205 #endif
207 /* Classify packet. The procedure is pretty complicated, but
208 it allows us to combine link sharing and priority scheduling
209 transparently.
211 Namely, you can put link sharing rules (f.e. route based) at root of CBQ,
212 so that it resolves to split nodes. Then packets are classified
213 by logical priority, or a more specific classifier may be attached
214 to the split node.
217 static struct cbq_class *
218 cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
220 struct cbq_sched_data *q = qdisc_priv(sch);
221 struct cbq_class *head = &q->link;
222 struct cbq_class **defmap;
223 struct cbq_class *cl = NULL;
224 u32 prio = skb->priority;
225 struct tcf_result res;
228 * Step 1. If skb->priority points to one of our classes, use it.
230 if (TC_H_MAJ(prio^sch->handle) == 0 &&
231 (cl = cbq_class_lookup(q, prio)) != NULL)
232 return cl;
234 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
235 for (;;) {
236 int result = 0;
237 defmap = head->defaults;
240 * Step 2+n. Apply classifier.
242 if (!head->filter_list ||
243 (result = tc_classify_compat(skb, head->filter_list, &res)) < 0)
244 goto fallback;
246 if ((cl = (void*)res.class) == NULL) {
247 if (TC_H_MAJ(res.classid))
248 cl = cbq_class_lookup(q, res.classid);
249 else if ((cl = defmap[res.classid&TC_PRIO_MAX]) == NULL)
250 cl = defmap[TC_PRIO_BESTEFFORT];
252 if (cl == NULL || cl->level >= head->level)
253 goto fallback;
256 #ifdef CONFIG_NET_CLS_ACT
257 switch (result) {
258 case TC_ACT_QUEUED:
259 case TC_ACT_STOLEN:
260 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
261 case TC_ACT_SHOT:
262 return NULL;
263 case TC_ACT_RECLASSIFY:
264 return cbq_reclassify(skb, cl);
266 #endif
267 if (cl->level == 0)
268 return cl;
271 * Step 3+n. If classifier selected a link sharing class,
272 * apply agency specific classifier.
273 * Repeat this procdure until we hit a leaf node.
275 head = cl;
278 fallback:
279 cl = head;
282 * Step 4. No success...
284 if (TC_H_MAJ(prio) == 0 &&
285 !(cl = head->defaults[prio&TC_PRIO_MAX]) &&
286 !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
287 return head;
289 return cl;
293 A packet has just been enqueued on the empty class.
294 cbq_activate_class adds it to the tail of active class list
295 of its priority band.
298 static __inline__ void cbq_activate_class(struct cbq_class *cl)
300 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
301 int prio = cl->cpriority;
302 struct cbq_class *cl_tail;
304 cl_tail = q->active[prio];
305 q->active[prio] = cl;
307 if (cl_tail != NULL) {
308 cl->next_alive = cl_tail->next_alive;
309 cl_tail->next_alive = cl;
310 } else {
311 cl->next_alive = cl;
312 q->activemask |= (1<<prio);
317 Unlink class from active chain.
318 Note that this same procedure is done directly in cbq_dequeue*
319 during round-robin procedure.
322 static void cbq_deactivate_class(struct cbq_class *this)
324 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
325 int prio = this->cpriority;
326 struct cbq_class *cl;
327 struct cbq_class *cl_prev = q->active[prio];
329 do {
330 cl = cl_prev->next_alive;
331 if (cl == this) {
332 cl_prev->next_alive = cl->next_alive;
333 cl->next_alive = NULL;
335 if (cl == q->active[prio]) {
336 q->active[prio] = cl_prev;
337 if (cl == q->active[prio]) {
338 q->active[prio] = NULL;
339 q->activemask &= ~(1<<prio);
340 return;
343 return;
345 } while ((cl_prev = cl) != q->active[prio]);
348 static void
349 cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
351 int toplevel = q->toplevel;
353 if (toplevel > cl->level && !(cl->q->flags&TCQ_F_THROTTLED)) {
354 psched_time_t now;
355 psched_tdiff_t incr;
357 now = psched_get_time();
358 incr = now - q->now_rt;
359 now = q->now + incr;
361 do {
362 if (cl->undertime < now) {
363 q->toplevel = cl->level;
364 return;
366 } while ((cl=cl->borrow) != NULL && toplevel > cl->level);
370 static int
371 cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
373 struct cbq_sched_data *q = qdisc_priv(sch);
374 int uninitialized_var(ret);
375 struct cbq_class *cl = cbq_classify(skb, sch, &ret);
377 #ifdef CONFIG_NET_CLS_ACT
378 q->rx_class = cl;
379 #endif
380 if (cl == NULL) {
381 if (ret & __NET_XMIT_BYPASS)
382 sch->qstats.drops++;
383 kfree_skb(skb);
384 return ret;
387 #ifdef CONFIG_NET_CLS_ACT
388 cl->q->__parent = sch;
389 #endif
390 ret = qdisc_enqueue(skb, cl->q);
391 if (ret == NET_XMIT_SUCCESS) {
392 sch->q.qlen++;
393 cbq_mark_toplevel(q, cl);
394 if (!cl->next_alive)
395 cbq_activate_class(cl);
396 return ret;
399 if (net_xmit_drop_count(ret)) {
400 sch->qstats.drops++;
401 cbq_mark_toplevel(q, cl);
402 cl->qstats.drops++;
404 return ret;
407 /* Overlimit actions */
409 /* TC_CBQ_OVL_CLASSIC: (default) penalize leaf class by adding offtime */
411 static void cbq_ovl_classic(struct cbq_class *cl)
413 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
414 psched_tdiff_t delay = cl->undertime - q->now;
416 if (!cl->delayed) {
417 delay += cl->offtime;
420 Class goes to sleep, so that it will have no
421 chance to work avgidle. Let's forgive it 8)
423 BTW cbq-2.0 has a crap in this
424 place, apparently they forgot to shift it by cl->ewma_log.
426 if (cl->avgidle < 0)
427 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
428 if (cl->avgidle < cl->minidle)
429 cl->avgidle = cl->minidle;
430 if (delay <= 0)
431 delay = 1;
432 cl->undertime = q->now + delay;
434 cl->xstats.overactions++;
435 cl->delayed = 1;
437 if (q->wd_expires == 0 || q->wd_expires > delay)
438 q->wd_expires = delay;
440 /* Dirty work! We must schedule wakeups based on
441 real available rate, rather than leaf rate,
442 which may be tiny (even zero).
444 if (q->toplevel == TC_CBQ_MAXLEVEL) {
445 struct cbq_class *b;
446 psched_tdiff_t base_delay = q->wd_expires;
448 for (b = cl->borrow; b; b = b->borrow) {
449 delay = b->undertime - q->now;
450 if (delay < base_delay) {
451 if (delay <= 0)
452 delay = 1;
453 base_delay = delay;
457 q->wd_expires = base_delay;
461 /* TC_CBQ_OVL_RCLASSIC: penalize by offtime classes in hierarchy, when
462 they go overlimit
465 static void cbq_ovl_rclassic(struct cbq_class *cl)
467 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
468 struct cbq_class *this = cl;
470 do {
471 if (cl->level > q->toplevel) {
472 cl = NULL;
473 break;
475 } while ((cl = cl->borrow) != NULL);
477 if (cl == NULL)
478 cl = this;
479 cbq_ovl_classic(cl);
482 /* TC_CBQ_OVL_DELAY: delay until it will go to underlimit */
484 static void cbq_ovl_delay(struct cbq_class *cl)
486 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
487 psched_tdiff_t delay = cl->undertime - q->now;
489 if (test_bit(__QDISC_STATE_DEACTIVATED,
490 &qdisc_root_sleeping(cl->qdisc)->state))
491 return;
493 if (!cl->delayed) {
494 psched_time_t sched = q->now;
495 ktime_t expires;
497 delay += cl->offtime;
498 if (cl->avgidle < 0)
499 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
500 if (cl->avgidle < cl->minidle)
501 cl->avgidle = cl->minidle;
502 cl->undertime = q->now + delay;
504 if (delay > 0) {
505 sched += delay + cl->penalty;
506 cl->penalized = sched;
507 cl->cpriority = TC_CBQ_MAXPRIO;
508 q->pmask |= (1<<TC_CBQ_MAXPRIO);
510 expires = ktime_set(0, 0);
511 expires = ktime_add_ns(expires, PSCHED_TICKS2NS(sched));
512 if (hrtimer_try_to_cancel(&q->delay_timer) &&
513 ktime_to_ns(ktime_sub(
514 hrtimer_get_expires(&q->delay_timer),
515 expires)) > 0)
516 hrtimer_set_expires(&q->delay_timer, expires);
517 hrtimer_restart(&q->delay_timer);
518 cl->delayed = 1;
519 cl->xstats.overactions++;
520 return;
522 delay = 1;
524 if (q->wd_expires == 0 || q->wd_expires > delay)
525 q->wd_expires = delay;
528 /* TC_CBQ_OVL_LOWPRIO: penalize class by lowering its priority band */
530 static void cbq_ovl_lowprio(struct cbq_class *cl)
532 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
534 cl->penalized = q->now + cl->penalty;
536 if (cl->cpriority != cl->priority2) {
537 cl->cpriority = cl->priority2;
538 q->pmask |= (1<<cl->cpriority);
539 cl->xstats.overactions++;
541 cbq_ovl_classic(cl);
544 /* TC_CBQ_OVL_DROP: penalize class by dropping */
546 static void cbq_ovl_drop(struct cbq_class *cl)
548 if (cl->q->ops->drop)
549 if (cl->q->ops->drop(cl->q))
550 cl->qdisc->q.qlen--;
551 cl->xstats.overactions++;
552 cbq_ovl_classic(cl);
555 static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
556 psched_time_t now)
558 struct cbq_class *cl;
559 struct cbq_class *cl_prev = q->active[prio];
560 psched_time_t sched = now;
562 if (cl_prev == NULL)
563 return 0;
565 do {
566 cl = cl_prev->next_alive;
567 if (now - cl->penalized > 0) {
568 cl_prev->next_alive = cl->next_alive;
569 cl->next_alive = NULL;
570 cl->cpriority = cl->priority;
571 cl->delayed = 0;
572 cbq_activate_class(cl);
574 if (cl == q->active[prio]) {
575 q->active[prio] = cl_prev;
576 if (cl == q->active[prio]) {
577 q->active[prio] = NULL;
578 return 0;
582 cl = cl_prev->next_alive;
583 } else if (sched - cl->penalized > 0)
584 sched = cl->penalized;
585 } while ((cl_prev = cl) != q->active[prio]);
587 return sched - now;
590 static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
592 struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
593 delay_timer);
594 struct Qdisc *sch = q->watchdog.qdisc;
595 psched_time_t now;
596 psched_tdiff_t delay = 0;
597 unsigned pmask;
599 now = psched_get_time();
601 pmask = q->pmask;
602 q->pmask = 0;
604 while (pmask) {
605 int prio = ffz(~pmask);
606 psched_tdiff_t tmp;
608 pmask &= ~(1<<prio);
610 tmp = cbq_undelay_prio(q, prio, now);
611 if (tmp > 0) {
612 q->pmask |= 1<<prio;
613 if (tmp < delay || delay == 0)
614 delay = tmp;
618 if (delay) {
619 ktime_t time;
621 time = ktime_set(0, 0);
622 time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay));
623 hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS);
626 sch->flags &= ~TCQ_F_THROTTLED;
627 __netif_schedule(qdisc_root(sch));
628 return HRTIMER_NORESTART;
631 #ifdef CONFIG_NET_CLS_ACT
632 static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
634 struct Qdisc *sch = child->__parent;
635 struct cbq_sched_data *q = qdisc_priv(sch);
636 struct cbq_class *cl = q->rx_class;
638 q->rx_class = NULL;
640 if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) {
641 int ret;
643 cbq_mark_toplevel(q, cl);
645 q->rx_class = cl;
646 cl->q->__parent = sch;
648 ret = qdisc_enqueue(skb, cl->q);
649 if (ret == NET_XMIT_SUCCESS) {
650 sch->q.qlen++;
651 if (!cl->next_alive)
652 cbq_activate_class(cl);
653 return 0;
655 if (net_xmit_drop_count(ret))
656 sch->qstats.drops++;
657 return 0;
660 sch->qstats.drops++;
661 return -1;
663 #endif
666 It is mission critical procedure.
668 We "regenerate" toplevel cutoff, if transmitting class
669 has backlog and it is not regulated. It is not part of
670 original CBQ description, but looks more reasonable.
671 Probably, it is wrong. This question needs further investigation.
674 static __inline__ void
675 cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
676 struct cbq_class *borrowed)
678 if (cl && q->toplevel >= borrowed->level) {
679 if (cl->q->q.qlen > 1) {
680 do {
681 if (borrowed->undertime == PSCHED_PASTPERFECT) {
682 q->toplevel = borrowed->level;
683 return;
685 } while ((borrowed=borrowed->borrow) != NULL);
687 #if 0
688 /* It is not necessary now. Uncommenting it
689 will save CPU cycles, but decrease fairness.
691 q->toplevel = TC_CBQ_MAXLEVEL;
692 #endif
696 static void
697 cbq_update(struct cbq_sched_data *q)
699 struct cbq_class *this = q->tx_class;
700 struct cbq_class *cl = this;
701 int len = q->tx_len;
703 q->tx_class = NULL;
705 for ( ; cl; cl = cl->share) {
706 long avgidle = cl->avgidle;
707 long idle;
709 cl->bstats.packets++;
710 cl->bstats.bytes += len;
713 (now - last) is total time between packet right edges.
714 (last_pktlen/rate) is "virtual" busy time, so that
716 idle = (now - last) - last_pktlen/rate
719 idle = q->now - cl->last;
720 if ((unsigned long)idle > 128*1024*1024) {
721 avgidle = cl->maxidle;
722 } else {
723 idle -= L2T(cl, len);
725 /* true_avgidle := (1-W)*true_avgidle + W*idle,
726 where W=2^{-ewma_log}. But cl->avgidle is scaled:
727 cl->avgidle == true_avgidle/W,
728 hence:
730 avgidle += idle - (avgidle>>cl->ewma_log);
733 if (avgidle <= 0) {
734 /* Overlimit or at-limit */
736 if (avgidle < cl->minidle)
737 avgidle = cl->minidle;
739 cl->avgidle = avgidle;
741 /* Calculate expected time, when this class
742 will be allowed to send.
743 It will occur, when:
744 (1-W)*true_avgidle + W*delay = 0, i.e.
745 idle = (1/W - 1)*(-true_avgidle)
747 idle = (1 - W)*(-cl->avgidle);
749 idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
752 That is not all.
753 To maintain the rate allocated to the class,
754 we add to undertime virtual clock,
755 necessary to complete transmitted packet.
756 (len/phys_bandwidth has been already passed
757 to the moment of cbq_update)
760 idle -= L2T(&q->link, len);
761 idle += L2T(cl, len);
763 cl->undertime = q->now + idle;
764 } else {
765 /* Underlimit */
767 cl->undertime = PSCHED_PASTPERFECT;
768 if (avgidle > cl->maxidle)
769 cl->avgidle = cl->maxidle;
770 else
771 cl->avgidle = avgidle;
773 cl->last = q->now;
776 cbq_update_toplevel(q, this, q->tx_borrowed);
779 static __inline__ struct cbq_class *
780 cbq_under_limit(struct cbq_class *cl)
782 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
783 struct cbq_class *this_cl = cl;
785 if (cl->tparent == NULL)
786 return cl;
788 if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
789 cl->delayed = 0;
790 return cl;
793 do {
794 /* It is very suspicious place. Now overlimit
795 action is generated for not bounded classes
796 only if link is completely congested.
797 Though it is in agree with ancestor-only paradigm,
798 it looks very stupid. Particularly,
799 it means that this chunk of code will either
800 never be called or result in strong amplification
801 of burstiness. Dangerous, silly, and, however,
802 no another solution exists.
804 if ((cl = cl->borrow) == NULL) {
805 this_cl->qstats.overlimits++;
806 this_cl->overlimit(this_cl);
807 return NULL;
809 if (cl->level > q->toplevel)
810 return NULL;
811 } while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);
813 cl->delayed = 0;
814 return cl;
817 static __inline__ struct sk_buff *
818 cbq_dequeue_prio(struct Qdisc *sch, int prio)
820 struct cbq_sched_data *q = qdisc_priv(sch);
821 struct cbq_class *cl_tail, *cl_prev, *cl;
822 struct sk_buff *skb;
823 int deficit;
825 cl_tail = cl_prev = q->active[prio];
826 cl = cl_prev->next_alive;
828 do {
829 deficit = 0;
831 /* Start round */
832 do {
833 struct cbq_class *borrow = cl;
835 if (cl->q->q.qlen &&
836 (borrow = cbq_under_limit(cl)) == NULL)
837 goto skip_class;
839 if (cl->deficit <= 0) {
840 /* Class exhausted its allotment per
841 this round. Switch to the next one.
843 deficit = 1;
844 cl->deficit += cl->quantum;
845 goto next_class;
848 skb = cl->q->dequeue(cl->q);
850 /* Class did not give us any skb :-(
851 It could occur even if cl->q->q.qlen != 0
852 f.e. if cl->q == "tbf"
854 if (skb == NULL)
855 goto skip_class;
857 cl->deficit -= qdisc_pkt_len(skb);
858 q->tx_class = cl;
859 q->tx_borrowed = borrow;
860 if (borrow != cl) {
861 #ifndef CBQ_XSTATS_BORROWS_BYTES
862 borrow->xstats.borrows++;
863 cl->xstats.borrows++;
864 #else
865 borrow->xstats.borrows += qdisc_pkt_len(skb);
866 cl->xstats.borrows += qdisc_pkt_len(skb);
867 #endif
869 q->tx_len = qdisc_pkt_len(skb);
871 if (cl->deficit <= 0) {
872 q->active[prio] = cl;
873 cl = cl->next_alive;
874 cl->deficit += cl->quantum;
876 return skb;
878 skip_class:
879 if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
880 /* Class is empty or penalized.
881 Unlink it from active chain.
883 cl_prev->next_alive = cl->next_alive;
884 cl->next_alive = NULL;
886 /* Did cl_tail point to it? */
887 if (cl == cl_tail) {
888 /* Repair it! */
889 cl_tail = cl_prev;
891 /* Was it the last class in this band? */
892 if (cl == cl_tail) {
893 /* Kill the band! */
894 q->active[prio] = NULL;
895 q->activemask &= ~(1<<prio);
896 if (cl->q->q.qlen)
897 cbq_activate_class(cl);
898 return NULL;
901 q->active[prio] = cl_tail;
903 if (cl->q->q.qlen)
904 cbq_activate_class(cl);
906 cl = cl_prev;
909 next_class:
910 cl_prev = cl;
911 cl = cl->next_alive;
912 } while (cl_prev != cl_tail);
913 } while (deficit);
915 q->active[prio] = cl_prev;
917 return NULL;
920 static __inline__ struct sk_buff *
921 cbq_dequeue_1(struct Qdisc *sch)
923 struct cbq_sched_data *q = qdisc_priv(sch);
924 struct sk_buff *skb;
925 unsigned activemask;
927 activemask = q->activemask&0xFF;
928 while (activemask) {
929 int prio = ffz(~activemask);
930 activemask &= ~(1<<prio);
931 skb = cbq_dequeue_prio(sch, prio);
932 if (skb)
933 return skb;
935 return NULL;
938 static struct sk_buff *
939 cbq_dequeue(struct Qdisc *sch)
941 struct sk_buff *skb;
942 struct cbq_sched_data *q = qdisc_priv(sch);
943 psched_time_t now;
944 psched_tdiff_t incr;
946 now = psched_get_time();
947 incr = now - q->now_rt;
949 if (q->tx_class) {
950 psched_tdiff_t incr2;
951 /* Time integrator. We calculate EOS time
952 by adding expected packet transmission time.
953 If real time is greater, we warp artificial clock,
954 so that:
956 cbq_time = max(real_time, work);
958 incr2 = L2T(&q->link, q->tx_len);
959 q->now += incr2;
960 cbq_update(q);
961 if ((incr -= incr2) < 0)
962 incr = 0;
964 q->now += incr;
965 q->now_rt = now;
967 for (;;) {
968 q->wd_expires = 0;
970 skb = cbq_dequeue_1(sch);
971 if (skb) {
972 qdisc_bstats_update(sch, skb);
973 sch->q.qlen--;
974 sch->flags &= ~TCQ_F_THROTTLED;
975 return skb;
978 /* All the classes are overlimit.
980 It is possible, if:
982 1. Scheduler is empty.
983 2. Toplevel cutoff inhibited borrowing.
984 3. Root class is overlimit.
986 Reset 2d and 3d conditions and retry.
988 Note, that NS and cbq-2.0 are buggy, peeking
989 an arbitrary class is appropriate for ancestor-only
990 sharing, but not for toplevel algorithm.
992 Our version is better, but slower, because it requires
993 two passes, but it is unavoidable with top-level sharing.
996 if (q->toplevel == TC_CBQ_MAXLEVEL &&
997 q->link.undertime == PSCHED_PASTPERFECT)
998 break;
1000 q->toplevel = TC_CBQ_MAXLEVEL;
1001 q->link.undertime = PSCHED_PASTPERFECT;
1004 /* No packets in scheduler or nobody wants to give them to us :-(
1005 Sigh... start watchdog timer in the last case. */
1007 if (sch->q.qlen) {
1008 sch->qstats.overlimits++;
1009 if (q->wd_expires)
1010 qdisc_watchdog_schedule(&q->watchdog,
1011 now + q->wd_expires);
1013 return NULL;
1016 /* CBQ class maintanance routines */
1018 static void cbq_adjust_levels(struct cbq_class *this)
1020 if (this == NULL)
1021 return;
1023 do {
1024 int level = 0;
1025 struct cbq_class *cl;
1027 if ((cl = this->children) != NULL) {
1028 do {
1029 if (cl->level > level)
1030 level = cl->level;
1031 } while ((cl = cl->sibling) != this->children);
1033 this->level = level+1;
1034 } while ((this = this->tparent) != NULL);
1037 static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
1039 struct cbq_class *cl;
1040 struct hlist_node *n;
1041 unsigned int h;
1043 if (q->quanta[prio] == 0)
1044 return;
1046 for (h = 0; h < q->clhash.hashsize; h++) {
1047 hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) {
1048 /* BUGGGG... Beware! This expression suffer of
1049 arithmetic overflows!
1051 if (cl->priority == prio) {
1052 cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
1053 q->quanta[prio];
1055 if (cl->quantum <= 0 || cl->quantum>32*qdisc_dev(cl->qdisc)->mtu) {
1056 printk(KERN_WARNING "CBQ: class %08x has bad quantum==%ld, repaired.\n", cl->common.classid, cl->quantum);
1057 cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
1063 static void cbq_sync_defmap(struct cbq_class *cl)
1065 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1066 struct cbq_class *split = cl->split;
1067 unsigned h;
1068 int i;
1070 if (split == NULL)
1071 return;
1073 for (i=0; i<=TC_PRIO_MAX; i++) {
1074 if (split->defaults[i] == cl && !(cl->defmap&(1<<i)))
1075 split->defaults[i] = NULL;
1078 for (i=0; i<=TC_PRIO_MAX; i++) {
1079 int level = split->level;
1081 if (split->defaults[i])
1082 continue;
1084 for (h = 0; h < q->clhash.hashsize; h++) {
1085 struct hlist_node *n;
1086 struct cbq_class *c;
1088 hlist_for_each_entry(c, n, &q->clhash.hash[h],
1089 common.hnode) {
1090 if (c->split == split && c->level < level &&
1091 c->defmap&(1<<i)) {
1092 split->defaults[i] = c;
1093 level = c->level;
1100 static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
1102 struct cbq_class *split = NULL;
1104 if (splitid == 0) {
1105 if ((split = cl->split) == NULL)
1106 return;
1107 splitid = split->common.classid;
1110 if (split == NULL || split->common.classid != splitid) {
1111 for (split = cl->tparent; split; split = split->tparent)
1112 if (split->common.classid == splitid)
1113 break;
1116 if (split == NULL)
1117 return;
1119 if (cl->split != split) {
1120 cl->defmap = 0;
1121 cbq_sync_defmap(cl);
1122 cl->split = split;
1123 cl->defmap = def&mask;
1124 } else
1125 cl->defmap = (cl->defmap&~mask)|(def&mask);
1127 cbq_sync_defmap(cl);
1130 static void cbq_unlink_class(struct cbq_class *this)
1132 struct cbq_class *cl, **clp;
1133 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1135 qdisc_class_hash_remove(&q->clhash, &this->common);
1137 if (this->tparent) {
1138 clp=&this->sibling;
1139 cl = *clp;
1140 do {
1141 if (cl == this) {
1142 *clp = cl->sibling;
1143 break;
1145 clp = &cl->sibling;
1146 } while ((cl = *clp) != this->sibling);
1148 if (this->tparent->children == this) {
1149 this->tparent->children = this->sibling;
1150 if (this->sibling == this)
1151 this->tparent->children = NULL;
1153 } else {
1154 WARN_ON(this->sibling != this);
1158 static void cbq_link_class(struct cbq_class *this)
1160 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1161 struct cbq_class *parent = this->tparent;
1163 this->sibling = this;
1164 qdisc_class_hash_insert(&q->clhash, &this->common);
1166 if (parent == NULL)
1167 return;
1169 if (parent->children == NULL) {
1170 parent->children = this;
1171 } else {
1172 this->sibling = parent->children->sibling;
1173 parent->children->sibling = this;
1177 static unsigned int cbq_drop(struct Qdisc* sch)
1179 struct cbq_sched_data *q = qdisc_priv(sch);
1180 struct cbq_class *cl, *cl_head;
1181 int prio;
1182 unsigned int len;
1184 for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) {
1185 if ((cl_head = q->active[prio]) == NULL)
1186 continue;
1188 cl = cl_head;
1189 do {
1190 if (cl->q->ops->drop && (len = cl->q->ops->drop(cl->q))) {
1191 sch->q.qlen--;
1192 if (!cl->q->q.qlen)
1193 cbq_deactivate_class(cl);
1194 return len;
1196 } while ((cl = cl->next_alive) != cl_head);
1198 return 0;
1201 static void
1202 cbq_reset(struct Qdisc* sch)
1204 struct cbq_sched_data *q = qdisc_priv(sch);
1205 struct cbq_class *cl;
1206 struct hlist_node *n;
1207 int prio;
1208 unsigned h;
1210 q->activemask = 0;
1211 q->pmask = 0;
1212 q->tx_class = NULL;
1213 q->tx_borrowed = NULL;
1214 qdisc_watchdog_cancel(&q->watchdog);
1215 hrtimer_cancel(&q->delay_timer);
1216 q->toplevel = TC_CBQ_MAXLEVEL;
1217 q->now = psched_get_time();
1218 q->now_rt = q->now;
1220 for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
1221 q->active[prio] = NULL;
1223 for (h = 0; h < q->clhash.hashsize; h++) {
1224 hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) {
1225 qdisc_reset(cl->q);
1227 cl->next_alive = NULL;
1228 cl->undertime = PSCHED_PASTPERFECT;
1229 cl->avgidle = cl->maxidle;
1230 cl->deficit = cl->quantum;
1231 cl->cpriority = cl->priority;
1234 sch->q.qlen = 0;
1238 static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
1240 if (lss->change&TCF_CBQ_LSS_FLAGS) {
1241 cl->share = (lss->flags&TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
1242 cl->borrow = (lss->flags&TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
1244 if (lss->change&TCF_CBQ_LSS_EWMA)
1245 cl->ewma_log = lss->ewma_log;
1246 if (lss->change&TCF_CBQ_LSS_AVPKT)
1247 cl->avpkt = lss->avpkt;
1248 if (lss->change&TCF_CBQ_LSS_MINIDLE)
1249 cl->minidle = -(long)lss->minidle;
1250 if (lss->change&TCF_CBQ_LSS_MAXIDLE) {
1251 cl->maxidle = lss->maxidle;
1252 cl->avgidle = lss->maxidle;
1254 if (lss->change&TCF_CBQ_LSS_OFFTIME)
1255 cl->offtime = lss->offtime;
1256 return 0;
1259 static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1261 q->nclasses[cl->priority]--;
1262 q->quanta[cl->priority] -= cl->weight;
1263 cbq_normalize_quanta(q, cl->priority);
1266 static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1268 q->nclasses[cl->priority]++;
1269 q->quanta[cl->priority] += cl->weight;
1270 cbq_normalize_quanta(q, cl->priority);
1273 static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1275 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1277 if (wrr->allot)
1278 cl->allot = wrr->allot;
1279 if (wrr->weight)
1280 cl->weight = wrr->weight;
1281 if (wrr->priority) {
1282 cl->priority = wrr->priority-1;
1283 cl->cpriority = cl->priority;
1284 if (cl->priority >= cl->priority2)
1285 cl->priority2 = TC_CBQ_MAXPRIO-1;
1288 cbq_addprio(q, cl);
1289 return 0;
1292 static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl)
1294 switch (ovl->strategy) {
1295 case TC_CBQ_OVL_CLASSIC:
1296 cl->overlimit = cbq_ovl_classic;
1297 break;
1298 case TC_CBQ_OVL_DELAY:
1299 cl->overlimit = cbq_ovl_delay;
1300 break;
1301 case TC_CBQ_OVL_LOWPRIO:
1302 if (ovl->priority2-1 >= TC_CBQ_MAXPRIO ||
1303 ovl->priority2-1 <= cl->priority)
1304 return -EINVAL;
1305 cl->priority2 = ovl->priority2-1;
1306 cl->overlimit = cbq_ovl_lowprio;
1307 break;
1308 case TC_CBQ_OVL_DROP:
1309 cl->overlimit = cbq_ovl_drop;
1310 break;
1311 case TC_CBQ_OVL_RCLASSIC:
1312 cl->overlimit = cbq_ovl_rclassic;
1313 break;
1314 default:
1315 return -EINVAL;
1317 cl->penalty = ovl->penalty;
1318 return 0;
1321 #ifdef CONFIG_NET_CLS_ACT
1322 static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p)
1324 cl->police = p->police;
1326 if (cl->q->handle) {
1327 if (p->police == TC_POLICE_RECLASSIFY)
1328 cl->q->reshape_fail = cbq_reshape_fail;
1329 else
1330 cl->q->reshape_fail = NULL;
1332 return 0;
1334 #endif
1336 static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1338 cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1339 return 0;
1342 static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
1343 [TCA_CBQ_LSSOPT] = { .len = sizeof(struct tc_cbq_lssopt) },
1344 [TCA_CBQ_WRROPT] = { .len = sizeof(struct tc_cbq_wrropt) },
1345 [TCA_CBQ_FOPT] = { .len = sizeof(struct tc_cbq_fopt) },
1346 [TCA_CBQ_OVL_STRATEGY] = { .len = sizeof(struct tc_cbq_ovl) },
1347 [TCA_CBQ_RATE] = { .len = sizeof(struct tc_ratespec) },
1348 [TCA_CBQ_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1349 [TCA_CBQ_POLICE] = { .len = sizeof(struct tc_cbq_police) },
1352 static int cbq_init(struct Qdisc *sch, struct nlattr *opt)
1354 struct cbq_sched_data *q = qdisc_priv(sch);
1355 struct nlattr *tb[TCA_CBQ_MAX + 1];
1356 struct tc_ratespec *r;
1357 int err;
1359 err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy);
1360 if (err < 0)
1361 return err;
1363 if (tb[TCA_CBQ_RTAB] == NULL || tb[TCA_CBQ_RATE] == NULL)
1364 return -EINVAL;
1366 r = nla_data(tb[TCA_CBQ_RATE]);
1368 if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB])) == NULL)
1369 return -EINVAL;
1371 err = qdisc_class_hash_init(&q->clhash);
1372 if (err < 0)
1373 goto put_rtab;
1375 q->link.refcnt = 1;
1376 q->link.sibling = &q->link;
1377 q->link.common.classid = sch->handle;
1378 q->link.qdisc = sch;
1379 q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1380 sch->handle);
1381 if (!q->link.q)
1382 q->link.q = &noop_qdisc;
1384 q->link.priority = TC_CBQ_MAXPRIO-1;
1385 q->link.priority2 = TC_CBQ_MAXPRIO-1;
1386 q->link.cpriority = TC_CBQ_MAXPRIO-1;
1387 q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC;
1388 q->link.overlimit = cbq_ovl_classic;
1389 q->link.allot = psched_mtu(qdisc_dev(sch));
1390 q->link.quantum = q->link.allot;
1391 q->link.weight = q->link.R_tab->rate.rate;
1393 q->link.ewma_log = TC_CBQ_DEF_EWMA;
1394 q->link.avpkt = q->link.allot/2;
1395 q->link.minidle = -0x7FFFFFFF;
1397 qdisc_watchdog_init(&q->watchdog, sch);
1398 hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
1399 q->delay_timer.function = cbq_undelay;
1400 q->toplevel = TC_CBQ_MAXLEVEL;
1401 q->now = psched_get_time();
1402 q->now_rt = q->now;
1404 cbq_link_class(&q->link);
1406 if (tb[TCA_CBQ_LSSOPT])
1407 cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));
1409 cbq_addprio(q, &q->link);
1410 return 0;
1412 put_rtab:
1413 qdisc_put_rtab(q->link.R_tab);
1414 return err;
1417 static __inline__ int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1419 unsigned char *b = skb_tail_pointer(skb);
1421 NLA_PUT(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate);
1422 return skb->len;
1424 nla_put_failure:
1425 nlmsg_trim(skb, b);
1426 return -1;
1429 static __inline__ int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1431 unsigned char *b = skb_tail_pointer(skb);
1432 struct tc_cbq_lssopt opt;
1434 opt.flags = 0;
1435 if (cl->borrow == NULL)
1436 opt.flags |= TCF_CBQ_LSS_BOUNDED;
1437 if (cl->share == NULL)
1438 opt.flags |= TCF_CBQ_LSS_ISOLATED;
1439 opt.ewma_log = cl->ewma_log;
1440 opt.level = cl->level;
1441 opt.avpkt = cl->avpkt;
1442 opt.maxidle = cl->maxidle;
1443 opt.minidle = (u32)(-cl->minidle);
1444 opt.offtime = cl->offtime;
1445 opt.change = ~0;
1446 NLA_PUT(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt);
1447 return skb->len;
1449 nla_put_failure:
1450 nlmsg_trim(skb, b);
1451 return -1;
1454 static __inline__ int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1456 unsigned char *b = skb_tail_pointer(skb);
1457 struct tc_cbq_wrropt opt;
1459 opt.flags = 0;
1460 opt.allot = cl->allot;
1461 opt.priority = cl->priority+1;
1462 opt.cpriority = cl->cpriority+1;
1463 opt.weight = cl->weight;
1464 NLA_PUT(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt);
1465 return skb->len;
1467 nla_put_failure:
1468 nlmsg_trim(skb, b);
1469 return -1;
1472 static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl)
1474 unsigned char *b = skb_tail_pointer(skb);
1475 struct tc_cbq_ovl opt;
1477 opt.strategy = cl->ovl_strategy;
1478 opt.priority2 = cl->priority2+1;
1479 opt.pad = 0;
1480 opt.penalty = cl->penalty;
1481 NLA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt);
1482 return skb->len;
1484 nla_put_failure:
1485 nlmsg_trim(skb, b);
1486 return -1;
1489 static __inline__ int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1491 unsigned char *b = skb_tail_pointer(skb);
1492 struct tc_cbq_fopt opt;
1494 if (cl->split || cl->defmap) {
1495 opt.split = cl->split ? cl->split->common.classid : 0;
1496 opt.defmap = cl->defmap;
1497 opt.defchange = ~0;
1498 NLA_PUT(skb, TCA_CBQ_FOPT, sizeof(opt), &opt);
1500 return skb->len;
1502 nla_put_failure:
1503 nlmsg_trim(skb, b);
1504 return -1;
1507 #ifdef CONFIG_NET_CLS_ACT
1508 static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)
1510 unsigned char *b = skb_tail_pointer(skb);
1511 struct tc_cbq_police opt;
1513 if (cl->police) {
1514 opt.police = cl->police;
1515 opt.__res1 = 0;
1516 opt.__res2 = 0;
1517 NLA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt);
1519 return skb->len;
1521 nla_put_failure:
1522 nlmsg_trim(skb, b);
1523 return -1;
1525 #endif
1527 static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
1529 if (cbq_dump_lss(skb, cl) < 0 ||
1530 cbq_dump_rate(skb, cl) < 0 ||
1531 cbq_dump_wrr(skb, cl) < 0 ||
1532 cbq_dump_ovl(skb, cl) < 0 ||
1533 #ifdef CONFIG_NET_CLS_ACT
1534 cbq_dump_police(skb, cl) < 0 ||
1535 #endif
1536 cbq_dump_fopt(skb, cl) < 0)
1537 return -1;
1538 return 0;
1541 static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
1543 struct cbq_sched_data *q = qdisc_priv(sch);
1544 struct nlattr *nest;
1546 nest = nla_nest_start(skb, TCA_OPTIONS);
1547 if (nest == NULL)
1548 goto nla_put_failure;
1549 if (cbq_dump_attr(skb, &q->link) < 0)
1550 goto nla_put_failure;
1551 nla_nest_end(skb, nest);
1552 return skb->len;
1554 nla_put_failure:
1555 nla_nest_cancel(skb, nest);
1556 return -1;
1559 static int
1560 cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1562 struct cbq_sched_data *q = qdisc_priv(sch);
1564 q->link.xstats.avgidle = q->link.avgidle;
1565 return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
1568 static int
1569 cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1570 struct sk_buff *skb, struct tcmsg *tcm)
1572 struct cbq_class *cl = (struct cbq_class*)arg;
1573 struct nlattr *nest;
1575 if (cl->tparent)
1576 tcm->tcm_parent = cl->tparent->common.classid;
1577 else
1578 tcm->tcm_parent = TC_H_ROOT;
1579 tcm->tcm_handle = cl->common.classid;
1580 tcm->tcm_info = cl->q->handle;
1582 nest = nla_nest_start(skb, TCA_OPTIONS);
1583 if (nest == NULL)
1584 goto nla_put_failure;
1585 if (cbq_dump_attr(skb, cl) < 0)
1586 goto nla_put_failure;
1587 nla_nest_end(skb, nest);
1588 return skb->len;
1590 nla_put_failure:
1591 nla_nest_cancel(skb, nest);
1592 return -1;
1595 static int
1596 cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1597 struct gnet_dump *d)
1599 struct cbq_sched_data *q = qdisc_priv(sch);
1600 struct cbq_class *cl = (struct cbq_class*)arg;
1602 cl->qstats.qlen = cl->q->q.qlen;
1603 cl->xstats.avgidle = cl->avgidle;
1604 cl->xstats.undertime = 0;
1606 if (cl->undertime != PSCHED_PASTPERFECT)
1607 cl->xstats.undertime = cl->undertime - q->now;
1609 if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
1610 gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
1611 gnet_stats_copy_queue(d, &cl->qstats) < 0)
1612 return -1;
1614 return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1617 static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1618 struct Qdisc **old)
1620 struct cbq_class *cl = (struct cbq_class*)arg;
1622 if (new == NULL) {
1623 new = qdisc_create_dflt(sch->dev_queue,
1624 &pfifo_qdisc_ops, cl->common.classid);
1625 if (new == NULL)
1626 return -ENOBUFS;
1627 } else {
1628 #ifdef CONFIG_NET_CLS_ACT
1629 if (cl->police == TC_POLICE_RECLASSIFY)
1630 new->reshape_fail = cbq_reshape_fail;
1631 #endif
1633 sch_tree_lock(sch);
1634 *old = cl->q;
1635 cl->q = new;
1636 qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
1637 qdisc_reset(*old);
1638 sch_tree_unlock(sch);
1640 return 0;
1643 static struct Qdisc *
1644 cbq_leaf(struct Qdisc *sch, unsigned long arg)
1646 struct cbq_class *cl = (struct cbq_class*)arg;
1648 return cl->q;
1651 static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
1653 struct cbq_class *cl = (struct cbq_class *)arg;
1655 if (cl->q->q.qlen == 0)
1656 cbq_deactivate_class(cl);
1659 static unsigned long cbq_get(struct Qdisc *sch, u32 classid)
1661 struct cbq_sched_data *q = qdisc_priv(sch);
1662 struct cbq_class *cl = cbq_class_lookup(q, classid);
1664 if (cl) {
1665 cl->refcnt++;
1666 return (unsigned long)cl;
1668 return 0;
1671 static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1673 struct cbq_sched_data *q = qdisc_priv(sch);
1675 WARN_ON(cl->filters);
1677 tcf_destroy_chain(&cl->filter_list);
1678 qdisc_destroy(cl->q);
1679 qdisc_put_rtab(cl->R_tab);
1680 gen_kill_estimator(&cl->bstats, &cl->rate_est);
1681 if (cl != &q->link)
1682 kfree(cl);
1685 static void
1686 cbq_destroy(struct Qdisc* sch)
1688 struct cbq_sched_data *q = qdisc_priv(sch);
1689 struct hlist_node *n, *next;
1690 struct cbq_class *cl;
1691 unsigned h;
1693 #ifdef CONFIG_NET_CLS_ACT
1694 q->rx_class = NULL;
1695 #endif
1697 * Filters must be destroyed first because we don't destroy the
1698 * classes from root to leafs which means that filters can still
1699 * be bound to classes which have been destroyed already. --TGR '04
1701 for (h = 0; h < q->clhash.hashsize; h++) {
1702 hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode)
1703 tcf_destroy_chain(&cl->filter_list);
1705 for (h = 0; h < q->clhash.hashsize; h++) {
1706 hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[h],
1707 common.hnode)
1708 cbq_destroy_class(sch, cl);
1710 qdisc_class_hash_destroy(&q->clhash);
1713 static void cbq_put(struct Qdisc *sch, unsigned long arg)
1715 struct cbq_class *cl = (struct cbq_class*)arg;
1717 if (--cl->refcnt == 0) {
1718 #ifdef CONFIG_NET_CLS_ACT
1719 spinlock_t *root_lock = qdisc_root_sleeping_lock(sch);
1720 struct cbq_sched_data *q = qdisc_priv(sch);
1722 spin_lock_bh(root_lock);
1723 if (q->rx_class == cl)
1724 q->rx_class = NULL;
1725 spin_unlock_bh(root_lock);
1726 #endif
1728 cbq_destroy_class(sch, cl);
1732 static int
1733 cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
1734 unsigned long *arg)
1736 int err;
1737 struct cbq_sched_data *q = qdisc_priv(sch);
1738 struct cbq_class *cl = (struct cbq_class*)*arg;
1739 struct nlattr *opt = tca[TCA_OPTIONS];
1740 struct nlattr *tb[TCA_CBQ_MAX + 1];
1741 struct cbq_class *parent;
1742 struct qdisc_rate_table *rtab = NULL;
1744 if (opt == NULL)
1745 return -EINVAL;
1747 err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy);
1748 if (err < 0)
1749 return err;
1751 if (cl) {
1752 /* Check parent */
1753 if (parentid) {
1754 if (cl->tparent &&
1755 cl->tparent->common.classid != parentid)
1756 return -EINVAL;
1757 if (!cl->tparent && parentid != TC_H_ROOT)
1758 return -EINVAL;
1761 if (tb[TCA_CBQ_RATE]) {
1762 rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]),
1763 tb[TCA_CBQ_RTAB]);
1764 if (rtab == NULL)
1765 return -EINVAL;
1768 if (tca[TCA_RATE]) {
1769 err = gen_replace_estimator(&cl->bstats, &cl->rate_est,
1770 qdisc_root_sleeping_lock(sch),
1771 tca[TCA_RATE]);
1772 if (err) {
1773 if (rtab)
1774 qdisc_put_rtab(rtab);
1775 return err;
1779 /* Change class parameters */
1780 sch_tree_lock(sch);
1782 if (cl->next_alive != NULL)
1783 cbq_deactivate_class(cl);
1785 if (rtab) {
1786 qdisc_put_rtab(cl->R_tab);
1787 cl->R_tab = rtab;
1790 if (tb[TCA_CBQ_LSSOPT])
1791 cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1793 if (tb[TCA_CBQ_WRROPT]) {
1794 cbq_rmprio(q, cl);
1795 cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1798 if (tb[TCA_CBQ_OVL_STRATEGY])
1799 cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY]));
1801 #ifdef CONFIG_NET_CLS_ACT
1802 if (tb[TCA_CBQ_POLICE])
1803 cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE]));
1804 #endif
1806 if (tb[TCA_CBQ_FOPT])
1807 cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1809 if (cl->q->q.qlen)
1810 cbq_activate_class(cl);
1812 sch_tree_unlock(sch);
1814 return 0;
1817 if (parentid == TC_H_ROOT)
1818 return -EINVAL;
1820 if (tb[TCA_CBQ_WRROPT] == NULL || tb[TCA_CBQ_RATE] == NULL ||
1821 tb[TCA_CBQ_LSSOPT] == NULL)
1822 return -EINVAL;
1824 rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB]);
1825 if (rtab == NULL)
1826 return -EINVAL;
1828 if (classid) {
1829 err = -EINVAL;
1830 if (TC_H_MAJ(classid^sch->handle) || cbq_class_lookup(q, classid))
1831 goto failure;
1832 } else {
1833 int i;
1834 classid = TC_H_MAKE(sch->handle,0x8000);
1836 for (i=0; i<0x8000; i++) {
1837 if (++q->hgenerator >= 0x8000)
1838 q->hgenerator = 1;
1839 if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1840 break;
1842 err = -ENOSR;
1843 if (i >= 0x8000)
1844 goto failure;
1845 classid = classid|q->hgenerator;
1848 parent = &q->link;
1849 if (parentid) {
1850 parent = cbq_class_lookup(q, parentid);
1851 err = -EINVAL;
1852 if (parent == NULL)
1853 goto failure;
1856 err = -ENOBUFS;
1857 cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1858 if (cl == NULL)
1859 goto failure;
1861 if (tca[TCA_RATE]) {
1862 err = gen_new_estimator(&cl->bstats, &cl->rate_est,
1863 qdisc_root_sleeping_lock(sch),
1864 tca[TCA_RATE]);
1865 if (err) {
1866 kfree(cl);
1867 goto failure;
1871 cl->R_tab = rtab;
1872 rtab = NULL;
1873 cl->refcnt = 1;
1874 cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid);
1875 if (!cl->q)
1876 cl->q = &noop_qdisc;
1877 cl->common.classid = classid;
1878 cl->tparent = parent;
1879 cl->qdisc = sch;
1880 cl->allot = parent->allot;
1881 cl->quantum = cl->allot;
1882 cl->weight = cl->R_tab->rate.rate;
1884 sch_tree_lock(sch);
1885 cbq_link_class(cl);
1886 cl->borrow = cl->tparent;
1887 if (cl->tparent != &q->link)
1888 cl->share = cl->tparent;
1889 cbq_adjust_levels(parent);
1890 cl->minidle = -0x7FFFFFFF;
1891 cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1892 cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1893 if (cl->ewma_log==0)
1894 cl->ewma_log = q->link.ewma_log;
1895 if (cl->maxidle==0)
1896 cl->maxidle = q->link.maxidle;
1897 if (cl->avpkt==0)
1898 cl->avpkt = q->link.avpkt;
1899 cl->overlimit = cbq_ovl_classic;
1900 if (tb[TCA_CBQ_OVL_STRATEGY])
1901 cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY]));
1902 #ifdef CONFIG_NET_CLS_ACT
1903 if (tb[TCA_CBQ_POLICE])
1904 cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE]));
1905 #endif
1906 if (tb[TCA_CBQ_FOPT])
1907 cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1908 sch_tree_unlock(sch);
1910 qdisc_class_hash_grow(sch, &q->clhash);
1912 *arg = (unsigned long)cl;
1913 return 0;
1915 failure:
1916 qdisc_put_rtab(rtab);
1917 return err;
1920 static int cbq_delete(struct Qdisc *sch, unsigned long arg)
1922 struct cbq_sched_data *q = qdisc_priv(sch);
1923 struct cbq_class *cl = (struct cbq_class*)arg;
1924 unsigned int qlen;
1926 if (cl->filters || cl->children || cl == &q->link)
1927 return -EBUSY;
1929 sch_tree_lock(sch);
1931 qlen = cl->q->q.qlen;
1932 qdisc_reset(cl->q);
1933 qdisc_tree_decrease_qlen(cl->q, qlen);
1935 if (cl->next_alive)
1936 cbq_deactivate_class(cl);
1938 if (q->tx_borrowed == cl)
1939 q->tx_borrowed = q->tx_class;
1940 if (q->tx_class == cl) {
1941 q->tx_class = NULL;
1942 q->tx_borrowed = NULL;
1944 #ifdef CONFIG_NET_CLS_ACT
1945 if (q->rx_class == cl)
1946 q->rx_class = NULL;
1947 #endif
1949 cbq_unlink_class(cl);
1950 cbq_adjust_levels(cl->tparent);
1951 cl->defmap = 0;
1952 cbq_sync_defmap(cl);
1954 cbq_rmprio(q, cl);
1955 sch_tree_unlock(sch);
1957 BUG_ON(--cl->refcnt == 0);
1959 * This shouldn't happen: we "hold" one cops->get() when called
1960 * from tc_ctl_tclass; the destroy method is done from cops->put().
1963 return 0;
1966 static struct tcf_proto **cbq_find_tcf(struct Qdisc *sch, unsigned long arg)
1968 struct cbq_sched_data *q = qdisc_priv(sch);
1969 struct cbq_class *cl = (struct cbq_class *)arg;
1971 if (cl == NULL)
1972 cl = &q->link;
1974 return &cl->filter_list;
1977 static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
1978 u32 classid)
1980 struct cbq_sched_data *q = qdisc_priv(sch);
1981 struct cbq_class *p = (struct cbq_class*)parent;
1982 struct cbq_class *cl = cbq_class_lookup(q, classid);
1984 if (cl) {
1985 if (p && p->level <= cl->level)
1986 return 0;
1987 cl->filters++;
1988 return (unsigned long)cl;
1990 return 0;
1993 static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
1995 struct cbq_class *cl = (struct cbq_class*)arg;
1997 cl->filters--;
2000 static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
2002 struct cbq_sched_data *q = qdisc_priv(sch);
2003 struct cbq_class *cl;
2004 struct hlist_node *n;
2005 unsigned h;
2007 if (arg->stop)
2008 return;
2010 for (h = 0; h < q->clhash.hashsize; h++) {
2011 hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) {
2012 if (arg->count < arg->skip) {
2013 arg->count++;
2014 continue;
2016 if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
2017 arg->stop = 1;
2018 return;
2020 arg->count++;
2025 static const struct Qdisc_class_ops cbq_class_ops = {
2026 .graft = cbq_graft,
2027 .leaf = cbq_leaf,
2028 .qlen_notify = cbq_qlen_notify,
2029 .get = cbq_get,
2030 .put = cbq_put,
2031 .change = cbq_change_class,
2032 .delete = cbq_delete,
2033 .walk = cbq_walk,
2034 .tcf_chain = cbq_find_tcf,
2035 .bind_tcf = cbq_bind_filter,
2036 .unbind_tcf = cbq_unbind_filter,
2037 .dump = cbq_dump_class,
2038 .dump_stats = cbq_dump_class_stats,
2041 static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
2042 .next = NULL,
2043 .cl_ops = &cbq_class_ops,
2044 .id = "cbq",
2045 .priv_size = sizeof(struct cbq_sched_data),
2046 .enqueue = cbq_enqueue,
2047 .dequeue = cbq_dequeue,
2048 .peek = qdisc_peek_dequeued,
2049 .drop = cbq_drop,
2050 .init = cbq_init,
2051 .reset = cbq_reset,
2052 .destroy = cbq_destroy,
2053 .change = NULL,
2054 .dump = cbq_dump,
2055 .dump_stats = cbq_dump_stats,
2056 .owner = THIS_MODULE,
2059 static int __init cbq_module_init(void)
2061 return register_qdisc(&cbq_qdisc_ops);
2063 static void __exit cbq_module_exit(void)
2065 unregister_qdisc(&cbq_qdisc_ops);
2067 module_init(cbq_module_init)
2068 module_exit(cbq_module_exit)
2069 MODULE_LICENSE("GPL");