[NET_SCHED]: sch_cbq: use hrtimer for delay_timer
/*
 * net/sched/sch_cbq.c	Class-Based Queueing discipline.
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */

#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/notifier.h>
#include <net/ip.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/pkt_sched.h>

/*	Class-Based Queueing (CBQ) algorithm.
	=======================================

	Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
		 Management Models for Packet Networks",
		 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995

		 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995

		 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
		 Parameters", 1996

		 [4] Sally Floyd and Michael Speer, "Experimental Results
		 for Class-Based Queueing", 1998, not published.

	-----------------------------------------------------------------------

	The algorithm skeleton was taken from the NS simulator cbq.cc.
	Anyone checking this code against the LBL version should keep in
	mind that ONLY the skeleton was borrowed; the implementation is
	different.  In particular:

	--- The WRR algorithm is different.  Our version looks more
	reasonable (I hope) and works when quanta are allowed to be
	less than the MTU, which is always the case when real-time classes
	have small rates.  Note that the statement in [3] is incomplete:
	delay may actually be estimated even if the per-round class
	allotment is less than the MTU.  Namely, if the per-round
	allotment is W*r_i, and r_1+...+r_k = r < 1, then

	delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B

	In the worst case we have an IntServ estimate with D = W*r+k*MTU
	and C = MTU*r.  The proof (if correct at all) is trivial.

	--- It seems that cbq-2.0 is not very accurate.  At least, I cannot
	interpret some places, which look like wrong translations
	from NS.  Anyone is advised to find these differences
	and explain to me why I am wrong 8).

	--- Linux has no EOI event, so we cannot estimate true class
	idle time.  The workaround is to consider the next dequeue event
	as a sign that the previous packet has finished.  This is wrong
	because of internal device queueing, but on a permanently loaded
	link it is true.  Moreover, combined with the clock integrator,
	this scheme looks very close to an ideal solution.  */
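
/*
 * A purely illustrative instance of the bound above (the numbers are not
 * from the original text, and W*r_i is read as a per-round byte allotment):
 * with k = 2 classes, r_1 = r_2 = 0.05 (so r = 0.1), W = 1000 bytes and
 * MTU = 1500 bytes, the per-round allotment W*r_i = 50 bytes is well below
 * the MTU, and
 *
 *	delay_i <= (ceil(1500/50)*1000*0.1 + 1000*0.1 + 2*1500)/B
 *		 = 6100 bytes / B,
 *
 * i.e. about 49 ms on a 1 Mbit/s link (B = 125000 bytes/s).
 */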

struct cbq_sched_data;


struct cbq_class
{
	struct cbq_class	*next;		/* hash table link */
	struct cbq_class	*next_alive;	/* next class with backlog in this priority band */

/* Parameters */
	u32			classid;
	unsigned char		priority;	/* class priority */
	unsigned char		priority2;	/* priority to be used after overlimit */
	unsigned char		ewma_log;	/* time constant for idle time calculation */
	unsigned char		ovl_strategy;
#ifdef CONFIG_NET_CLS_POLICE
	unsigned char		police;
#endif

	u32			defmap;

	/* Link-sharing scheduler parameters */
	long			maxidle;	/* Class parameters: see below. */
	long			offtime;
	long			minidle;
	u32			avpkt;
	struct qdisc_rate_table	*R_tab;

	/* Overlimit strategy parameters */
	void			(*overlimit)(struct cbq_class *cl);
	psched_tdiff_t		penalty;

	/* General scheduler (WRR) parameters */
	long			allot;
	long			quantum;	/* Allotment per WRR round */
	long			weight;		/* Relative allotment: see below */

	struct Qdisc		*qdisc;		/* Ptr to CBQ discipline */
	struct cbq_class	*split;		/* Ptr to split node */
	struct cbq_class	*share;		/* Ptr to LS parent in the class tree */
	struct cbq_class	*tparent;	/* Ptr to tree parent in the class tree */
	struct cbq_class	*borrow;	/* NULL if class is bandwidth limited;
						   parent otherwise */
	struct cbq_class	*sibling;	/* Sibling chain */
	struct cbq_class	*children;	/* Pointer to children chain */

	struct Qdisc		*q;		/* Elementary queueing discipline */


/* Variables */
	unsigned char		cpriority;	/* Effective priority */
	unsigned char		delayed;
	unsigned char		level;		/* level of the class in hierarchy:
						   0 for leaf classes, and maximal
						   level of children + 1 for nodes.
						 */

	psched_time_t		last;		/* Last end of service */
	psched_time_t		undertime;
	long			avgidle;
	long			deficit;	/* Saved deficit for WRR */
	psched_time_t		penalized;
	struct gnet_stats_basic	bstats;
	struct gnet_stats_queue	qstats;
	struct gnet_stats_rate_est rate_est;
	spinlock_t		*stats_lock;
	struct tc_cbq_xstats	xstats;

	struct tcf_proto	*filter_list;

	int			refcnt;
	int			filters;

	struct cbq_class	*defaults[TC_PRIO_MAX+1];
};

struct cbq_sched_data
{
	struct cbq_class	*classes[16];		/* Hash table of all classes */
	int			nclasses[TC_CBQ_MAXPRIO+1];
	unsigned		quanta[TC_CBQ_MAXPRIO+1];

	struct cbq_class	link;

	unsigned		activemask;
	struct cbq_class	*active[TC_CBQ_MAXPRIO+1];	/* List of all classes
								   with backlog */

#ifdef CONFIG_NET_CLS_POLICE
	struct cbq_class	*rx_class;
#endif
	struct cbq_class	*tx_class;
	struct cbq_class	*tx_borrowed;
	int			tx_len;
	psched_time_t		now;			/* Cached timestamp */
	psched_time_t		now_rt;			/* Cached real time */
	unsigned		pmask;

	struct hrtimer		delay_timer;
	struct qdisc_watchdog	watchdog;		/* Watchdog timer,
							   started when CBQ has
							   backlog, but cannot
							   transmit just now */
	psched_tdiff_t		wd_expires;
	int			toplevel;
	u32			hgenerator;
};
194 #define L2T(cl,len) ((cl)->R_tab->data[(len)>>(cl)->R_tab->rate.cell_log])
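
/*
 * L2T ("length to time") looks up, in the class's precomputed rate table,
 * the time needed to transmit a len-byte packet at the class rate.  The
 * table is indexed by len >> cell_log; e.g. with cell_log == 3 a 1500-byte
 * packet uses slot 1500 >> 3 == 187.
 */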
197 static __inline__ unsigned cbq_hash(u32 h)
199 h ^= h>>8;
200 h ^= h>>4;
201 return h&0xF;
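/*
 * The xor-fold above collapses the low 16 bits of the classid into 4 bits,
 * spreading classes over the 16 buckets of cbq_sched_data::classes[].
 */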
204 static __inline__ struct cbq_class *
205 cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
207 struct cbq_class *cl;
209 for (cl = q->classes[cbq_hash(classid)]; cl; cl = cl->next)
210 if (cl->classid == classid)
211 return cl;
212 return NULL;
215 #ifdef CONFIG_NET_CLS_POLICE
217 static struct cbq_class *
218 cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
220 struct cbq_class *cl, *new;
222 for (cl = this->tparent; cl; cl = cl->tparent)
223 if ((new = cl->defaults[TC_PRIO_BESTEFFORT]) != NULL && new != this)
224 return new;
226 return NULL;
229 #endif

/*
 * Classify a packet.  The procedure is fairly involved, but it lets us
 * combine link sharing and priority scheduling transparently.
 *
 * Namely, you can put link-sharing rules (e.g. route based) at the root
 * of CBQ, so that it resolves to split nodes.  Then packets are classified
 * by logical priority, or a more specific classifier may be attached
 * to the split node.
 */
241 static struct cbq_class *
242 cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
244 struct cbq_sched_data *q = qdisc_priv(sch);
245 struct cbq_class *head = &q->link;
246 struct cbq_class **defmap;
247 struct cbq_class *cl = NULL;
248 u32 prio = skb->priority;
249 struct tcf_result res;
252 * Step 1. If skb->priority points to one of our classes, use it.
254 if (TC_H_MAJ(prio^sch->handle) == 0 &&
255 (cl = cbq_class_lookup(q, prio)) != NULL)
256 return cl;
258 *qerr = NET_XMIT_BYPASS;
259 for (;;) {
260 int result = 0;
261 defmap = head->defaults;
264 * Step 2+n. Apply classifier.
266 if (!head->filter_list || (result = tc_classify(skb, head->filter_list, &res)) < 0)
267 goto fallback;
269 if ((cl = (void*)res.class) == NULL) {
270 if (TC_H_MAJ(res.classid))
271 cl = cbq_class_lookup(q, res.classid);
272 else if ((cl = defmap[res.classid&TC_PRIO_MAX]) == NULL)
273 cl = defmap[TC_PRIO_BESTEFFORT];
275 if (cl == NULL || cl->level >= head->level)
276 goto fallback;
279 #ifdef CONFIG_NET_CLS_ACT
280 switch (result) {
281 case TC_ACT_QUEUED:
282 case TC_ACT_STOLEN:
283 *qerr = NET_XMIT_SUCCESS;
284 case TC_ACT_SHOT:
285 return NULL;
287 #elif defined(CONFIG_NET_CLS_POLICE)
288 switch (result) {
289 case TC_POLICE_RECLASSIFY:
290 return cbq_reclassify(skb, cl);
291 case TC_POLICE_SHOT:
292 return NULL;
293 default:
294 break;
296 #endif
297 if (cl->level == 0)
298 return cl;
		/*
		 * Step 3+n. If the classifier selected a link-sharing class,
		 * apply the agency-specific classifier.
		 * Repeat this procedure until we hit a leaf node.
		 */
305 head = cl;
308 fallback:
309 cl = head;
312 * Step 4. No success...
314 if (TC_H_MAJ(prio) == 0 &&
315 !(cl = head->defaults[prio&TC_PRIO_MAX]) &&
316 !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
317 return head;
319 return cl;

/*
 * A packet has just been enqueued on an empty class.
 * cbq_activate_class adds the class to the tail of the active class list
 * of its priority band.
 */
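
/*
 * Note for clarity (not in the original): q->active[prio] points at the
 * TAIL of a circular list threaded through next_alive; the head is
 * tail->next_alive.  The function below appends the new class as the new
 * tail, between the old tail and the head.
 */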
328 static __inline__ void cbq_activate_class(struct cbq_class *cl)
330 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
331 int prio = cl->cpriority;
332 struct cbq_class *cl_tail;
334 cl_tail = q->active[prio];
335 q->active[prio] = cl;
337 if (cl_tail != NULL) {
338 cl->next_alive = cl_tail->next_alive;
339 cl_tail->next_alive = cl;
340 } else {
341 cl->next_alive = cl;
342 q->activemask |= (1<<prio);

/*
 * Unlink a class from the active chain.
 * Note that the same unlinking is also done directly in cbq_dequeue*
 * during the round-robin pass.
 */
352 static void cbq_deactivate_class(struct cbq_class *this)
354 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
355 int prio = this->cpriority;
356 struct cbq_class *cl;
357 struct cbq_class *cl_prev = q->active[prio];
359 do {
360 cl = cl_prev->next_alive;
361 if (cl == this) {
362 cl_prev->next_alive = cl->next_alive;
363 cl->next_alive = NULL;
365 if (cl == q->active[prio]) {
366 q->active[prio] = cl_prev;
367 if (cl == q->active[prio]) {
368 q->active[prio] = NULL;
369 q->activemask &= ~(1<<prio);
370 return;
373 return;
375 } while ((cl_prev = cl) != q->active[prio]);
378 static void
379 cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
381 int toplevel = q->toplevel;
383 if (toplevel > cl->level && !(cl->q->flags&TCQ_F_THROTTLED)) {
384 psched_time_t now;
385 psched_tdiff_t incr;
387 PSCHED_GET_TIME(now);
388 incr = PSCHED_TDIFF(now, q->now_rt);
389 PSCHED_TADD2(q->now, incr, now);
391 do {
392 if (PSCHED_TLESS(cl->undertime, now)) {
393 q->toplevel = cl->level;
394 return;
396 } while ((cl=cl->borrow) != NULL && toplevel > cl->level);
400 static int
401 cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
403 struct cbq_sched_data *q = qdisc_priv(sch);
404 int len = skb->len;
405 int ret;
406 struct cbq_class *cl = cbq_classify(skb, sch, &ret);
408 #ifdef CONFIG_NET_CLS_POLICE
409 q->rx_class = cl;
410 #endif
411 if (cl == NULL) {
412 if (ret == NET_XMIT_BYPASS)
413 sch->qstats.drops++;
414 kfree_skb(skb);
415 return ret;
418 #ifdef CONFIG_NET_CLS_POLICE
419 cl->q->__parent = sch;
420 #endif
421 if ((ret = cl->q->enqueue(skb, cl->q)) == NET_XMIT_SUCCESS) {
422 sch->q.qlen++;
423 sch->bstats.packets++;
424 sch->bstats.bytes+=len;
425 cbq_mark_toplevel(q, cl);
426 if (!cl->next_alive)
427 cbq_activate_class(cl);
428 return ret;
431 sch->qstats.drops++;
432 cbq_mark_toplevel(q, cl);
433 cl->qstats.drops++;
434 return ret;
437 static int
438 cbq_requeue(struct sk_buff *skb, struct Qdisc *sch)
440 struct cbq_sched_data *q = qdisc_priv(sch);
441 struct cbq_class *cl;
442 int ret;
444 if ((cl = q->tx_class) == NULL) {
445 kfree_skb(skb);
446 sch->qstats.drops++;
447 return NET_XMIT_CN;
449 q->tx_class = NULL;
451 cbq_mark_toplevel(q, cl);
453 #ifdef CONFIG_NET_CLS_POLICE
454 q->rx_class = cl;
455 cl->q->__parent = sch;
456 #endif
457 if ((ret = cl->q->ops->requeue(skb, cl->q)) == 0) {
458 sch->q.qlen++;
459 sch->qstats.requeues++;
460 if (!cl->next_alive)
461 cbq_activate_class(cl);
462 return 0;
464 sch->qstats.drops++;
465 cl->qstats.drops++;
466 return ret;
469 /* Overlimit actions */
471 /* TC_CBQ_OVL_CLASSIC: (default) penalize leaf class by adding offtime */
473 static void cbq_ovl_classic(struct cbq_class *cl)
475 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
476 psched_tdiff_t delay = PSCHED_TDIFF(cl->undertime, q->now);
478 if (!cl->delayed) {
479 delay += cl->offtime;
		/*
		 * The class goes to sleep, so that it will have no
		 * chance to work off its avgidle.  Let's forgive it 8)
		 *
		 * BTW, cbq-2.0 has a bug here: apparently they forgot
		 * to shift by cl->ewma_log.
		 */
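		/*
		 * When avgidle is negative, the compensation below is
		 * (1 - 2^-ewma_log) * (-avgidle), written with a shift:
		 * x - (x >> ewma_log) is x*(1 - W), up to rounding, for
		 * x == -cl->avgidle and W == 2^-ewma_log.
		 */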
488 if (cl->avgidle < 0)
489 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
490 if (cl->avgidle < cl->minidle)
491 cl->avgidle = cl->minidle;
492 if (delay <= 0)
493 delay = 1;
494 PSCHED_TADD2(q->now, delay, cl->undertime);
496 cl->xstats.overactions++;
497 cl->delayed = 1;
499 if (q->wd_expires == 0 || q->wd_expires > delay)
500 q->wd_expires = delay;
502 /* Dirty work! We must schedule wakeups based on
503 real available rate, rather than leaf rate,
504 which may be tiny (even zero).
506 if (q->toplevel == TC_CBQ_MAXLEVEL) {
507 struct cbq_class *b;
508 psched_tdiff_t base_delay = q->wd_expires;
510 for (b = cl->borrow; b; b = b->borrow) {
511 delay = PSCHED_TDIFF(b->undertime, q->now);
512 if (delay < base_delay) {
513 if (delay <= 0)
514 delay = 1;
515 base_delay = delay;
519 q->wd_expires = base_delay;

/* TC_CBQ_OVL_RCLASSIC: penalize classes in the hierarchy by offtime
   when they go overlimit */
527 static void cbq_ovl_rclassic(struct cbq_class *cl)
529 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
530 struct cbq_class *this = cl;
532 do {
533 if (cl->level > q->toplevel) {
534 cl = NULL;
535 break;
537 } while ((cl = cl->borrow) != NULL);
539 if (cl == NULL)
540 cl = this;
541 cbq_ovl_classic(cl);
544 /* TC_CBQ_OVL_DELAY: delay until it will go to underlimit */
546 static void cbq_ovl_delay(struct cbq_class *cl)
548 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
549 psched_tdiff_t delay = PSCHED_TDIFF(cl->undertime, q->now);
551 if (!cl->delayed) {
552 psched_time_t sched = q->now;
553 ktime_t expires;
555 delay += cl->offtime;
556 if (cl->avgidle < 0)
557 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
558 if (cl->avgidle < cl->minidle)
559 cl->avgidle = cl->minidle;
560 PSCHED_TADD2(q->now, delay, cl->undertime);
562 if (delay > 0) {
563 sched += delay + cl->penalty;
564 cl->penalized = sched;
565 cl->cpriority = TC_CBQ_MAXPRIO;
566 q->pmask |= (1<<TC_CBQ_MAXPRIO);
568 expires = ktime_set(0, 0);
569 expires = ktime_add_ns(expires, PSCHED_US2NS(sched));
570 if (hrtimer_try_to_cancel(&q->delay_timer) &&
571 ktime_to_ns(ktime_sub(q->delay_timer.expires,
572 expires)) > 0)
573 q->delay_timer.expires = expires;
574 hrtimer_restart(&q->delay_timer);
575 cl->delayed = 1;
576 cl->xstats.overactions++;
577 return;
579 delay = 1;
581 if (q->wd_expires == 0 || q->wd_expires > delay)
582 q->wd_expires = delay;
585 /* TC_CBQ_OVL_LOWPRIO: penalize class by lowering its priority band */
587 static void cbq_ovl_lowprio(struct cbq_class *cl)
589 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
591 cl->penalized = q->now + cl->penalty;
593 if (cl->cpriority != cl->priority2) {
594 cl->cpriority = cl->priority2;
595 q->pmask |= (1<<cl->cpriority);
596 cl->xstats.overactions++;
598 cbq_ovl_classic(cl);
601 /* TC_CBQ_OVL_DROP: penalize class by dropping */
603 static void cbq_ovl_drop(struct cbq_class *cl)
605 if (cl->q->ops->drop)
606 if (cl->q->ops->drop(cl->q))
607 cl->qdisc->q.qlen--;
608 cl->xstats.overactions++;
609 cbq_ovl_classic(cl);
612 static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
613 psched_time_t now)
615 struct cbq_class *cl;
616 struct cbq_class *cl_prev = q->active[prio];
617 psched_time_t sched = now;
619 if (cl_prev == NULL)
620 return 0;
622 do {
623 cl = cl_prev->next_alive;
624 if (now - cl->penalized > 0) {
625 cl_prev->next_alive = cl->next_alive;
626 cl->next_alive = NULL;
627 cl->cpriority = cl->priority;
628 cl->delayed = 0;
629 cbq_activate_class(cl);
631 if (cl == q->active[prio]) {
632 q->active[prio] = cl_prev;
633 if (cl == q->active[prio]) {
634 q->active[prio] = NULL;
635 return 0;
639 cl = cl_prev->next_alive;
640 } else if (sched - cl->penalized > 0)
641 sched = cl->penalized;
642 } while ((cl_prev = cl) != q->active[prio]);
644 return sched - now;
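
/*
 * delay_timer callback: reactivate classes whose penalty has expired and,
 * if some classes are still penalized, re-arm the timer for the earliest
 * remaining expiry; finally clear the throttled flag and kick the device
 * so transmission can resume.
 */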
647 static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
649 struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
650 delay_timer);
651 struct Qdisc *sch = q->watchdog.qdisc;
652 psched_time_t now;
653 psched_tdiff_t delay = 0;
654 unsigned pmask;
656 PSCHED_GET_TIME(now);
658 pmask = q->pmask;
659 q->pmask = 0;
661 while (pmask) {
662 int prio = ffz(~pmask);
663 psched_tdiff_t tmp;
665 pmask &= ~(1<<prio);
667 tmp = cbq_undelay_prio(q, prio, now);
668 if (tmp > 0) {
669 q->pmask |= 1<<prio;
670 if (tmp < delay || delay == 0)
671 delay = tmp;
675 if (delay) {
676 ktime_t time;
678 time = ktime_set(0, 0);
679 time = ktime_add_ns(time, PSCHED_US2NS(now + delay));
680 hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS);
683 sch->flags &= ~TCQ_F_THROTTLED;
684 netif_schedule(sch->dev);
685 return HRTIMER_NORESTART;
689 #ifdef CONFIG_NET_CLS_POLICE
691 static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
693 int len = skb->len;
694 struct Qdisc *sch = child->__parent;
695 struct cbq_sched_data *q = qdisc_priv(sch);
696 struct cbq_class *cl = q->rx_class;
698 q->rx_class = NULL;
700 if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) {
702 cbq_mark_toplevel(q, cl);
704 q->rx_class = cl;
705 cl->q->__parent = sch;
707 if (cl->q->enqueue(skb, cl->q) == 0) {
708 sch->q.qlen++;
709 sch->bstats.packets++;
710 sch->bstats.bytes+=len;
711 if (!cl->next_alive)
712 cbq_activate_class(cl);
713 return 0;
715 sch->qstats.drops++;
716 return 0;
719 sch->qstats.drops++;
720 return -1;
722 #endif

/*
 * This is a mission-critical procedure.
 *
 * We "regenerate" the toplevel cutoff if the transmitting class
 * has backlog and is not regulated.  This is not part of the original
 * CBQ description, but looks more reasonable.
 * Probably, it is wrong.  This question needs further investigation.
 */
733 static __inline__ void
734 cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
735 struct cbq_class *borrowed)
737 if (cl && q->toplevel >= borrowed->level) {
738 if (cl->q->q.qlen > 1) {
739 do {
740 if (PSCHED_IS_PASTPERFECT(borrowed->undertime)) {
741 q->toplevel = borrowed->level;
742 return;
744 } while ((borrowed=borrowed->borrow) != NULL);
746 #if 0
747 /* It is not necessary now. Uncommenting it
748 will save CPU cycles, but decrease fairness.
750 q->toplevel = TC_CBQ_MAXLEVEL;
751 #endif
755 static void
756 cbq_update(struct cbq_sched_data *q)
758 struct cbq_class *this = q->tx_class;
759 struct cbq_class *cl = this;
760 int len = q->tx_len;
762 q->tx_class = NULL;
764 for ( ; cl; cl = cl->share) {
765 long avgidle = cl->avgidle;
766 long idle;
768 cl->bstats.packets++;
769 cl->bstats.bytes += len;
772 (now - last) is total time between packet right edges.
773 (last_pktlen/rate) is "virtual" busy time, so that
775 idle = (now - last) - last_pktlen/rate
778 idle = PSCHED_TDIFF(q->now, cl->last);
779 if ((unsigned long)idle > 128*1024*1024) {
780 avgidle = cl->maxidle;
781 } else {
782 idle -= L2T(cl, len);
784 /* true_avgidle := (1-W)*true_avgidle + W*idle,
785 where W=2^{-ewma_log}. But cl->avgidle is scaled:
786 cl->avgidle == true_avgidle/W,
787 hence:
789 avgidle += idle - (avgidle>>cl->ewma_log);
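		/*
		 * Illustrative check: with ewma_log == 5 (W == 1/32), the
		 * line above gives avgidle' = (31/32)*avgidle + idle in
		 * scaled units (ignoring shift rounding), i.e.
		 * true_avgidle' = (31/32)*true_avgidle + (1/32)*idle,
		 * which is exactly the EWMA described above.
		 */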
792 if (avgidle <= 0) {
793 /* Overlimit or at-limit */
795 if (avgidle < cl->minidle)
796 avgidle = cl->minidle;
798 cl->avgidle = avgidle;
800 /* Calculate expected time, when this class
801 will be allowed to send.
802 It will occur, when:
803 (1-W)*true_avgidle + W*delay = 0, i.e.
804 idle = (1/W - 1)*(-true_avgidle)
806 idle = (1 - W)*(-cl->avgidle);
808 idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
			/*
			 * That is not all.
			 * To maintain the rate allocated to the class,
			 * we add to undertime the virtual clock time
			 * needed to complete the transmitted packet.
			 * (len/phys_bandwidth has already elapsed by the
			 * time cbq_update is called.)
			 */
819 idle -= L2T(&q->link, len);
820 idle += L2T(cl, len);
822 PSCHED_AUDIT_TDIFF(idle);
824 PSCHED_TADD2(q->now, idle, cl->undertime);
825 } else {
826 /* Underlimit */
828 PSCHED_SET_PASTPERFECT(cl->undertime);
829 if (avgidle > cl->maxidle)
830 cl->avgidle = cl->maxidle;
831 else
832 cl->avgidle = avgidle;
834 cl->last = q->now;
837 cbq_update_toplevel(q, this, q->tx_borrowed);
840 static __inline__ struct cbq_class *
841 cbq_under_limit(struct cbq_class *cl)
843 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
844 struct cbq_class *this_cl = cl;
846 if (cl->tparent == NULL)
847 return cl;
849 if (PSCHED_IS_PASTPERFECT(cl->undertime) ||
850 !PSCHED_TLESS(q->now, cl->undertime)) {
851 cl->delayed = 0;
852 return cl;
855 do {
		/*
		 * This is a very suspicious place.  Currently the overlimit
		 * action is generated for non-bounded classes only if the
		 * link is completely congested.  Though this agrees with
		 * the ancestor-only paradigm, it looks very stupid.
		 * In particular, it means that this chunk of code will
		 * either never be called or will strongly amplify
		 * burstiness.  Dangerous, silly, and, however, no other
		 * solution exists.
		 */
866 if ((cl = cl->borrow) == NULL) {
867 this_cl->qstats.overlimits++;
868 this_cl->overlimit(this_cl);
869 return NULL;
871 if (cl->level > q->toplevel)
872 return NULL;
873 } while (!PSCHED_IS_PASTPERFECT(cl->undertime) &&
874 PSCHED_TLESS(q->now, cl->undertime));
876 cl->delayed = 0;
877 return cl;
880 static __inline__ struct sk_buff *
881 cbq_dequeue_prio(struct Qdisc *sch, int prio)
883 struct cbq_sched_data *q = qdisc_priv(sch);
884 struct cbq_class *cl_tail, *cl_prev, *cl;
885 struct sk_buff *skb;
886 int deficit;
888 cl_tail = cl_prev = q->active[prio];
889 cl = cl_prev->next_alive;
891 do {
892 deficit = 0;
894 /* Start round */
895 do {
896 struct cbq_class *borrow = cl;
898 if (cl->q->q.qlen &&
899 (borrow = cbq_under_limit(cl)) == NULL)
900 goto skip_class;
902 if (cl->deficit <= 0) {
903 /* Class exhausted its allotment per
904 this round. Switch to the next one.
906 deficit = 1;
907 cl->deficit += cl->quantum;
908 goto next_class;
911 skb = cl->q->dequeue(cl->q);
913 /* Class did not give us any skb :-(
914 It could occur even if cl->q->q.qlen != 0
915 f.e. if cl->q == "tbf"
917 if (skb == NULL)
918 goto skip_class;
920 cl->deficit -= skb->len;
921 q->tx_class = cl;
922 q->tx_borrowed = borrow;
923 if (borrow != cl) {
924 #ifndef CBQ_XSTATS_BORROWS_BYTES
925 borrow->xstats.borrows++;
926 cl->xstats.borrows++;
927 #else
928 borrow->xstats.borrows += skb->len;
929 cl->xstats.borrows += skb->len;
930 #endif
932 q->tx_len = skb->len;
934 if (cl->deficit <= 0) {
935 q->active[prio] = cl;
936 cl = cl->next_alive;
937 cl->deficit += cl->quantum;
939 return skb;
941 skip_class:
942 if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
943 /* Class is empty or penalized.
944 Unlink it from active chain.
946 cl_prev->next_alive = cl->next_alive;
947 cl->next_alive = NULL;
949 /* Did cl_tail point to it? */
950 if (cl == cl_tail) {
951 /* Repair it! */
952 cl_tail = cl_prev;
954 /* Was it the last class in this band? */
955 if (cl == cl_tail) {
956 /* Kill the band! */
957 q->active[prio] = NULL;
958 q->activemask &= ~(1<<prio);
959 if (cl->q->q.qlen)
960 cbq_activate_class(cl);
961 return NULL;
964 q->active[prio] = cl_tail;
966 if (cl->q->q.qlen)
967 cbq_activate_class(cl);
969 cl = cl_prev;
972 next_class:
973 cl_prev = cl;
974 cl = cl->next_alive;
975 } while (cl_prev != cl_tail);
976 } while (deficit);
978 q->active[prio] = cl_prev;
980 return NULL;
983 static __inline__ struct sk_buff *
984 cbq_dequeue_1(struct Qdisc *sch)
986 struct cbq_sched_data *q = qdisc_priv(sch);
987 struct sk_buff *skb;
988 unsigned activemask;
990 activemask = q->activemask&0xFF;
991 while (activemask) {
992 int prio = ffz(~activemask);
993 activemask &= ~(1<<prio);
994 skb = cbq_dequeue_prio(sch, prio);
995 if (skb)
996 return skb;
998 return NULL;
1001 static struct sk_buff *
1002 cbq_dequeue(struct Qdisc *sch)
1004 struct sk_buff *skb;
1005 struct cbq_sched_data *q = qdisc_priv(sch);
1006 psched_time_t now;
1007 psched_tdiff_t incr;
1009 PSCHED_GET_TIME(now);
1010 incr = PSCHED_TDIFF(now, q->now_rt);
1012 if (q->tx_class) {
1013 psched_tdiff_t incr2;
1014 /* Time integrator. We calculate EOS time
1015 by adding expected packet transmission time.
1016 If real time is greater, we warp artificial clock,
1017 so that:
1019 cbq_time = max(real_time, work);
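		/*
		 * Concretely: incr2 below is the link-rate transmission time
		 * of the last transmitted packet (L2T on q->tx_len), and
		 * q->now is advanced by whichever is larger, incr2 or the
		 * real elapsed time incr.
		 */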
1021 incr2 = L2T(&q->link, q->tx_len);
1022 PSCHED_TADD(q->now, incr2);
1023 cbq_update(q);
1024 if ((incr -= incr2) < 0)
1025 incr = 0;
1027 PSCHED_TADD(q->now, incr);
1028 q->now_rt = now;
1030 for (;;) {
1031 q->wd_expires = 0;
1033 skb = cbq_dequeue_1(sch);
1034 if (skb) {
1035 sch->q.qlen--;
1036 sch->flags &= ~TCQ_F_THROTTLED;
1037 return skb;
		/*
		 * All the classes are overlimit.
		 *
		 * This is possible if:
		 * 1. The scheduler is empty.
		 * 2. The toplevel cutoff inhibited borrowing.
		 * 3. The root class is overlimit.
		 *
		 * Reset the 2nd and 3rd conditions and retry.
		 *
		 * Note that NS and cbq-2.0 are buggy: picking an arbitrary
		 * class is appropriate for ancestor-only sharing, but not
		 * for the top-level algorithm.
		 *
		 * Our version is better, but slower, because it requires
		 * two passes; that is unavoidable with top-level sharing.
		 */
1058 if (q->toplevel == TC_CBQ_MAXLEVEL &&
1059 PSCHED_IS_PASTPERFECT(q->link.undertime))
1060 break;
1062 q->toplevel = TC_CBQ_MAXLEVEL;
1063 PSCHED_SET_PASTPERFECT(q->link.undertime);
1066 /* No packets in scheduler or nobody wants to give them to us :-(
1067 Sigh... start watchdog timer in the last case. */
1069 if (sch->q.qlen) {
1070 sch->qstats.overlimits++;
1071 if (q->wd_expires)
1072 qdisc_watchdog_schedule(&q->watchdog,
1073 q->now + q->wd_expires);
1075 return NULL;

/* CBQ class maintenance routines */
1080 static void cbq_adjust_levels(struct cbq_class *this)
1082 if (this == NULL)
1083 return;
1085 do {
1086 int level = 0;
1087 struct cbq_class *cl;
1089 if ((cl = this->children) != NULL) {
1090 do {
1091 if (cl->level > level)
1092 level = cl->level;
1093 } while ((cl = cl->sibling) != this->children);
1095 this->level = level+1;
1096 } while ((this = this->tparent) != NULL);
1099 static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
1101 struct cbq_class *cl;
1102 unsigned h;
1104 if (q->quanta[prio] == 0)
1105 return;
1107 for (h=0; h<16; h++) {
1108 for (cl = q->classes[h]; cl; cl = cl->next) {
			/*
			 * BUGGGG... Beware! This expression suffers from
			 * arithmetic overflow!
			 */
1112 if (cl->priority == prio) {
1113 cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
1114 q->quanta[prio];
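				/*
				 * Illustrative magnitude for the overflow
				 * warned about above: on a 32-bit host, a
				 * weight taken from a 1 Gbit/s rate
				 * (125000000 bytes/s) multiplied by a
				 * 1514-byte allot already exceeds LONG_MAX,
				 * before nclasses is even factored in.
				 */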
1116 if (cl->quantum <= 0 || cl->quantum>32*cl->qdisc->dev->mtu) {
1117 printk(KERN_WARNING "CBQ: class %08x has bad quantum==%ld, repaired.\n", cl->classid, cl->quantum);
1118 cl->quantum = cl->qdisc->dev->mtu/2 + 1;
1124 static void cbq_sync_defmap(struct cbq_class *cl)
1126 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1127 struct cbq_class *split = cl->split;
1128 unsigned h;
1129 int i;
1131 if (split == NULL)
1132 return;
1134 for (i=0; i<=TC_PRIO_MAX; i++) {
1135 if (split->defaults[i] == cl && !(cl->defmap&(1<<i)))
1136 split->defaults[i] = NULL;
1139 for (i=0; i<=TC_PRIO_MAX; i++) {
1140 int level = split->level;
1142 if (split->defaults[i])
1143 continue;
1145 for (h=0; h<16; h++) {
1146 struct cbq_class *c;
1148 for (c = q->classes[h]; c; c = c->next) {
1149 if (c->split == split && c->level < level &&
1150 c->defmap&(1<<i)) {
1151 split->defaults[i] = c;
1152 level = c->level;
1159 static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
1161 struct cbq_class *split = NULL;
1163 if (splitid == 0) {
1164 if ((split = cl->split) == NULL)
1165 return;
1166 splitid = split->classid;
1169 if (split == NULL || split->classid != splitid) {
1170 for (split = cl->tparent; split; split = split->tparent)
1171 if (split->classid == splitid)
1172 break;
1175 if (split == NULL)
1176 return;
1178 if (cl->split != split) {
1179 cl->defmap = 0;
1180 cbq_sync_defmap(cl);
1181 cl->split = split;
1182 cl->defmap = def&mask;
1183 } else
1184 cl->defmap = (cl->defmap&~mask)|(def&mask);
1186 cbq_sync_defmap(cl);
1189 static void cbq_unlink_class(struct cbq_class *this)
1191 struct cbq_class *cl, **clp;
1192 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1194 for (clp = &q->classes[cbq_hash(this->classid)]; (cl = *clp) != NULL; clp = &cl->next) {
1195 if (cl == this) {
1196 *clp = cl->next;
1197 cl->next = NULL;
1198 break;
1202 if (this->tparent) {
1203 clp=&this->sibling;
1204 cl = *clp;
1205 do {
1206 if (cl == this) {
1207 *clp = cl->sibling;
1208 break;
1210 clp = &cl->sibling;
1211 } while ((cl = *clp) != this->sibling);
1213 if (this->tparent->children == this) {
1214 this->tparent->children = this->sibling;
1215 if (this->sibling == this)
1216 this->tparent->children = NULL;
1218 } else {
1219 BUG_TRAP(this->sibling == this);
1223 static void cbq_link_class(struct cbq_class *this)
1225 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1226 unsigned h = cbq_hash(this->classid);
1227 struct cbq_class *parent = this->tparent;
1229 this->sibling = this;
1230 this->next = q->classes[h];
1231 q->classes[h] = this;
1233 if (parent == NULL)
1234 return;
1236 if (parent->children == NULL) {
1237 parent->children = this;
1238 } else {
1239 this->sibling = parent->children->sibling;
1240 parent->children->sibling = this;
1244 static unsigned int cbq_drop(struct Qdisc* sch)
1246 struct cbq_sched_data *q = qdisc_priv(sch);
1247 struct cbq_class *cl, *cl_head;
1248 int prio;
1249 unsigned int len;
1251 for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) {
1252 if ((cl_head = q->active[prio]) == NULL)
1253 continue;
1255 cl = cl_head;
1256 do {
1257 if (cl->q->ops->drop && (len = cl->q->ops->drop(cl->q))) {
1258 sch->q.qlen--;
1259 if (!cl->q->q.qlen)
1260 cbq_deactivate_class(cl);
1261 return len;
1263 } while ((cl = cl->next_alive) != cl_head);
1265 return 0;
1268 static void
1269 cbq_reset(struct Qdisc* sch)
1271 struct cbq_sched_data *q = qdisc_priv(sch);
1272 struct cbq_class *cl;
1273 int prio;
1274 unsigned h;
1276 q->activemask = 0;
1277 q->pmask = 0;
1278 q->tx_class = NULL;
1279 q->tx_borrowed = NULL;
1280 qdisc_watchdog_cancel(&q->watchdog);
1281 hrtimer_cancel(&q->delay_timer);
1282 q->toplevel = TC_CBQ_MAXLEVEL;
1283 PSCHED_GET_TIME(q->now);
1284 q->now_rt = q->now;
1286 for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
1287 q->active[prio] = NULL;
1289 for (h = 0; h < 16; h++) {
1290 for (cl = q->classes[h]; cl; cl = cl->next) {
1291 qdisc_reset(cl->q);
1293 cl->next_alive = NULL;
1294 PSCHED_SET_PASTPERFECT(cl->undertime);
1295 cl->avgidle = cl->maxidle;
1296 cl->deficit = cl->quantum;
1297 cl->cpriority = cl->priority;
1300 sch->q.qlen = 0;
1304 static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
1306 if (lss->change&TCF_CBQ_LSS_FLAGS) {
1307 cl->share = (lss->flags&TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
1308 cl->borrow = (lss->flags&TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
1310 if (lss->change&TCF_CBQ_LSS_EWMA)
1311 cl->ewma_log = lss->ewma_log;
1312 if (lss->change&TCF_CBQ_LSS_AVPKT)
1313 cl->avpkt = lss->avpkt;
1314 if (lss->change&TCF_CBQ_LSS_MINIDLE)
1315 cl->minidle = -(long)lss->minidle;
1316 if (lss->change&TCF_CBQ_LSS_MAXIDLE) {
1317 cl->maxidle = lss->maxidle;
1318 cl->avgidle = lss->maxidle;
1320 if (lss->change&TCF_CBQ_LSS_OFFTIME)
1321 cl->offtime = lss->offtime;
1322 return 0;
1325 static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1327 q->nclasses[cl->priority]--;
1328 q->quanta[cl->priority] -= cl->weight;
1329 cbq_normalize_quanta(q, cl->priority);
1332 static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1334 q->nclasses[cl->priority]++;
1335 q->quanta[cl->priority] += cl->weight;
1336 cbq_normalize_quanta(q, cl->priority);
1339 static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1341 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1343 if (wrr->allot)
1344 cl->allot = wrr->allot;
1345 if (wrr->weight)
1346 cl->weight = wrr->weight;
1347 if (wrr->priority) {
1348 cl->priority = wrr->priority-1;
1349 cl->cpriority = cl->priority;
1350 if (cl->priority >= cl->priority2)
1351 cl->priority2 = TC_CBQ_MAXPRIO-1;
1354 cbq_addprio(q, cl);
1355 return 0;
1358 static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl)
1360 switch (ovl->strategy) {
1361 case TC_CBQ_OVL_CLASSIC:
1362 cl->overlimit = cbq_ovl_classic;
1363 break;
1364 case TC_CBQ_OVL_DELAY:
1365 cl->overlimit = cbq_ovl_delay;
1366 break;
1367 case TC_CBQ_OVL_LOWPRIO:
1368 if (ovl->priority2-1 >= TC_CBQ_MAXPRIO ||
1369 ovl->priority2-1 <= cl->priority)
1370 return -EINVAL;
1371 cl->priority2 = ovl->priority2-1;
1372 cl->overlimit = cbq_ovl_lowprio;
1373 break;
1374 case TC_CBQ_OVL_DROP:
1375 cl->overlimit = cbq_ovl_drop;
1376 break;
1377 case TC_CBQ_OVL_RCLASSIC:
1378 cl->overlimit = cbq_ovl_rclassic;
1379 break;
1380 default:
1381 return -EINVAL;
1383 cl->penalty = ovl->penalty;
1384 return 0;
1387 #ifdef CONFIG_NET_CLS_POLICE
1388 static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p)
1390 cl->police = p->police;
1392 if (cl->q->handle) {
1393 if (p->police == TC_POLICE_RECLASSIFY)
1394 cl->q->reshape_fail = cbq_reshape_fail;
1395 else
1396 cl->q->reshape_fail = NULL;
1398 return 0;
1400 #endif
1402 static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1404 cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1405 return 0;
1408 static int cbq_init(struct Qdisc *sch, struct rtattr *opt)
1410 struct cbq_sched_data *q = qdisc_priv(sch);
1411 struct rtattr *tb[TCA_CBQ_MAX];
1412 struct tc_ratespec *r;
1414 if (rtattr_parse_nested(tb, TCA_CBQ_MAX, opt) < 0 ||
1415 tb[TCA_CBQ_RTAB-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL ||
1416 RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec))
1417 return -EINVAL;
1419 if (tb[TCA_CBQ_LSSOPT-1] &&
1420 RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt))
1421 return -EINVAL;
1423 r = RTA_DATA(tb[TCA_CBQ_RATE-1]);
1425 if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB-1])) == NULL)
1426 return -EINVAL;
1428 q->link.refcnt = 1;
1429 q->link.sibling = &q->link;
1430 q->link.classid = sch->handle;
1431 q->link.qdisc = sch;
1432 if (!(q->link.q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops,
1433 sch->handle)))
1434 q->link.q = &noop_qdisc;
1436 q->link.priority = TC_CBQ_MAXPRIO-1;
1437 q->link.priority2 = TC_CBQ_MAXPRIO-1;
1438 q->link.cpriority = TC_CBQ_MAXPRIO-1;
1439 q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC;
1440 q->link.overlimit = cbq_ovl_classic;
1441 q->link.allot = psched_mtu(sch->dev);
1442 q->link.quantum = q->link.allot;
1443 q->link.weight = q->link.R_tab->rate.rate;
1445 q->link.ewma_log = TC_CBQ_DEF_EWMA;
1446 q->link.avpkt = q->link.allot/2;
1447 q->link.minidle = -0x7FFFFFFF;
1448 q->link.stats_lock = &sch->dev->queue_lock;
1450 qdisc_watchdog_init(&q->watchdog, sch);
1451 hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
1452 q->delay_timer.function = cbq_undelay;
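	/*
	 * delay_timer is the high-resolution timer referred to in the patch
	 * title; it is armed with an absolute expiry by cbq_ovl_delay() and
	 * re-armed from cbq_undelay() while penalized classes remain.
	 */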
1453 q->toplevel = TC_CBQ_MAXLEVEL;
1454 PSCHED_GET_TIME(q->now);
1455 q->now_rt = q->now;
1457 cbq_link_class(&q->link);
1459 if (tb[TCA_CBQ_LSSOPT-1])
1460 cbq_set_lss(&q->link, RTA_DATA(tb[TCA_CBQ_LSSOPT-1]));
1462 cbq_addprio(q, &q->link);
1463 return 0;
1466 static __inline__ int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1468 unsigned char *b = skb->tail;
1470 RTA_PUT(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate);
1471 return skb->len;
1473 rtattr_failure:
1474 skb_trim(skb, b - skb->data);
1475 return -1;
1478 static __inline__ int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1480 unsigned char *b = skb->tail;
1481 struct tc_cbq_lssopt opt;
1483 opt.flags = 0;
1484 if (cl->borrow == NULL)
1485 opt.flags |= TCF_CBQ_LSS_BOUNDED;
1486 if (cl->share == NULL)
1487 opt.flags |= TCF_CBQ_LSS_ISOLATED;
1488 opt.ewma_log = cl->ewma_log;
1489 opt.level = cl->level;
1490 opt.avpkt = cl->avpkt;
1491 opt.maxidle = cl->maxidle;
1492 opt.minidle = (u32)(-cl->minidle);
1493 opt.offtime = cl->offtime;
1494 opt.change = ~0;
1495 RTA_PUT(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt);
1496 return skb->len;
1498 rtattr_failure:
1499 skb_trim(skb, b - skb->data);
1500 return -1;
1503 static __inline__ int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1505 unsigned char *b = skb->tail;
1506 struct tc_cbq_wrropt opt;
1508 opt.flags = 0;
1509 opt.allot = cl->allot;
1510 opt.priority = cl->priority+1;
1511 opt.cpriority = cl->cpriority+1;
1512 opt.weight = cl->weight;
1513 RTA_PUT(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt);
1514 return skb->len;
1516 rtattr_failure:
1517 skb_trim(skb, b - skb->data);
1518 return -1;
1521 static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl)
1523 unsigned char *b = skb->tail;
1524 struct tc_cbq_ovl opt;
1526 opt.strategy = cl->ovl_strategy;
1527 opt.priority2 = cl->priority2+1;
1528 opt.pad = 0;
1529 opt.penalty = cl->penalty;
1530 RTA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt);
1531 return skb->len;
1533 rtattr_failure:
1534 skb_trim(skb, b - skb->data);
1535 return -1;
1538 static __inline__ int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1540 unsigned char *b = skb->tail;
1541 struct tc_cbq_fopt opt;
1543 if (cl->split || cl->defmap) {
1544 opt.split = cl->split ? cl->split->classid : 0;
1545 opt.defmap = cl->defmap;
1546 opt.defchange = ~0;
1547 RTA_PUT(skb, TCA_CBQ_FOPT, sizeof(opt), &opt);
1549 return skb->len;
1551 rtattr_failure:
1552 skb_trim(skb, b - skb->data);
1553 return -1;
1556 #ifdef CONFIG_NET_CLS_POLICE
1557 static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)
1559 unsigned char *b = skb->tail;
1560 struct tc_cbq_police opt;
1562 if (cl->police) {
1563 opt.police = cl->police;
1564 opt.__res1 = 0;
1565 opt.__res2 = 0;
1566 RTA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt);
1568 return skb->len;
1570 rtattr_failure:
1571 skb_trim(skb, b - skb->data);
1572 return -1;
1574 #endif
1576 static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
1578 if (cbq_dump_lss(skb, cl) < 0 ||
1579 cbq_dump_rate(skb, cl) < 0 ||
1580 cbq_dump_wrr(skb, cl) < 0 ||
1581 cbq_dump_ovl(skb, cl) < 0 ||
1582 #ifdef CONFIG_NET_CLS_POLICE
1583 cbq_dump_police(skb, cl) < 0 ||
1584 #endif
1585 cbq_dump_fopt(skb, cl) < 0)
1586 return -1;
1587 return 0;
1590 static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
1592 struct cbq_sched_data *q = qdisc_priv(sch);
1593 unsigned char *b = skb->tail;
1594 struct rtattr *rta;
1596 rta = (struct rtattr*)b;
1597 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
1598 if (cbq_dump_attr(skb, &q->link) < 0)
1599 goto rtattr_failure;
1600 rta->rta_len = skb->tail - b;
1601 return skb->len;
1603 rtattr_failure:
1604 skb_trim(skb, b - skb->data);
1605 return -1;
1608 static int
1609 cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1611 struct cbq_sched_data *q = qdisc_priv(sch);
1613 q->link.xstats.avgidle = q->link.avgidle;
1614 return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
1617 static int
1618 cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1619 struct sk_buff *skb, struct tcmsg *tcm)
1621 struct cbq_class *cl = (struct cbq_class*)arg;
1622 unsigned char *b = skb->tail;
1623 struct rtattr *rta;
1625 if (cl->tparent)
1626 tcm->tcm_parent = cl->tparent->classid;
1627 else
1628 tcm->tcm_parent = TC_H_ROOT;
1629 tcm->tcm_handle = cl->classid;
1630 tcm->tcm_info = cl->q->handle;
1632 rta = (struct rtattr*)b;
1633 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
1634 if (cbq_dump_attr(skb, cl) < 0)
1635 goto rtattr_failure;
1636 rta->rta_len = skb->tail - b;
1637 return skb->len;
1639 rtattr_failure:
1640 skb_trim(skb, b - skb->data);
1641 return -1;
1644 static int
1645 cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1646 struct gnet_dump *d)
1648 struct cbq_sched_data *q = qdisc_priv(sch);
1649 struct cbq_class *cl = (struct cbq_class*)arg;
1651 cl->qstats.qlen = cl->q->q.qlen;
1652 cl->xstats.avgidle = cl->avgidle;
1653 cl->xstats.undertime = 0;
1655 if (!PSCHED_IS_PASTPERFECT(cl->undertime))
1656 cl->xstats.undertime = PSCHED_TDIFF(cl->undertime, q->now);
1658 if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
1659 #ifdef CONFIG_NET_ESTIMATOR
1660 gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1661 #endif
1662 gnet_stats_copy_queue(d, &cl->qstats) < 0)
1663 return -1;
1665 return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1668 static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1669 struct Qdisc **old)
1671 struct cbq_class *cl = (struct cbq_class*)arg;
1673 if (cl) {
1674 if (new == NULL) {
1675 if ((new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops,
1676 cl->classid)) == NULL)
1677 return -ENOBUFS;
1678 } else {
1679 #ifdef CONFIG_NET_CLS_POLICE
1680 if (cl->police == TC_POLICE_RECLASSIFY)
1681 new->reshape_fail = cbq_reshape_fail;
1682 #endif
1684 sch_tree_lock(sch);
1685 *old = xchg(&cl->q, new);
1686 qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
1687 qdisc_reset(*old);
1688 sch_tree_unlock(sch);
1690 return 0;
1692 return -ENOENT;
1695 static struct Qdisc *
1696 cbq_leaf(struct Qdisc *sch, unsigned long arg)
1698 struct cbq_class *cl = (struct cbq_class*)arg;
1700 return cl ? cl->q : NULL;
1703 static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
1705 struct cbq_class *cl = (struct cbq_class *)arg;
1707 if (cl->q->q.qlen == 0)
1708 cbq_deactivate_class(cl);
1711 static unsigned long cbq_get(struct Qdisc *sch, u32 classid)
1713 struct cbq_sched_data *q = qdisc_priv(sch);
1714 struct cbq_class *cl = cbq_class_lookup(q, classid);
1716 if (cl) {
1717 cl->refcnt++;
1718 return (unsigned long)cl;
1720 return 0;
1723 static void cbq_destroy_filters(struct cbq_class *cl)
1725 struct tcf_proto *tp;
1727 while ((tp = cl->filter_list) != NULL) {
1728 cl->filter_list = tp->next;
1729 tcf_destroy(tp);
1733 static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1735 struct cbq_sched_data *q = qdisc_priv(sch);
1737 BUG_TRAP(!cl->filters);
1739 cbq_destroy_filters(cl);
1740 qdisc_destroy(cl->q);
1741 qdisc_put_rtab(cl->R_tab);
1742 #ifdef CONFIG_NET_ESTIMATOR
1743 gen_kill_estimator(&cl->bstats, &cl->rate_est);
1744 #endif
1745 if (cl != &q->link)
1746 kfree(cl);
1749 static void
1750 cbq_destroy(struct Qdisc* sch)
1752 struct cbq_sched_data *q = qdisc_priv(sch);
1753 struct cbq_class *cl;
1754 unsigned h;
1756 #ifdef CONFIG_NET_CLS_POLICE
1757 q->rx_class = NULL;
1758 #endif
	/*
	 * Filters must be destroyed first because we don't destroy the
	 * classes from root to leaves, which means that filters can still
	 * be bound to classes which have been destroyed already. --TGR '04
	 */
1764 for (h = 0; h < 16; h++)
1765 for (cl = q->classes[h]; cl; cl = cl->next)
1766 cbq_destroy_filters(cl);
1768 for (h = 0; h < 16; h++) {
1769 struct cbq_class *next;
1771 for (cl = q->classes[h]; cl; cl = next) {
1772 next = cl->next;
1773 cbq_destroy_class(sch, cl);
1778 static void cbq_put(struct Qdisc *sch, unsigned long arg)
1780 struct cbq_class *cl = (struct cbq_class*)arg;
1782 if (--cl->refcnt == 0) {
1783 #ifdef CONFIG_NET_CLS_POLICE
1784 struct cbq_sched_data *q = qdisc_priv(sch);
1786 spin_lock_bh(&sch->dev->queue_lock);
1787 if (q->rx_class == cl)
1788 q->rx_class = NULL;
1789 spin_unlock_bh(&sch->dev->queue_lock);
1790 #endif
1792 cbq_destroy_class(sch, cl);
1796 static int
1797 cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct rtattr **tca,
1798 unsigned long *arg)
1800 int err;
1801 struct cbq_sched_data *q = qdisc_priv(sch);
1802 struct cbq_class *cl = (struct cbq_class*)*arg;
1803 struct rtattr *opt = tca[TCA_OPTIONS-1];
1804 struct rtattr *tb[TCA_CBQ_MAX];
1805 struct cbq_class *parent;
1806 struct qdisc_rate_table *rtab = NULL;
1808 if (opt==NULL || rtattr_parse_nested(tb, TCA_CBQ_MAX, opt))
1809 return -EINVAL;
1811 if (tb[TCA_CBQ_OVL_STRATEGY-1] &&
1812 RTA_PAYLOAD(tb[TCA_CBQ_OVL_STRATEGY-1]) < sizeof(struct tc_cbq_ovl))
1813 return -EINVAL;
1815 if (tb[TCA_CBQ_FOPT-1] &&
1816 RTA_PAYLOAD(tb[TCA_CBQ_FOPT-1]) < sizeof(struct tc_cbq_fopt))
1817 return -EINVAL;
1819 if (tb[TCA_CBQ_RATE-1] &&
1820 RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec))
1821 return -EINVAL;
1823 if (tb[TCA_CBQ_LSSOPT-1] &&
1824 RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt))
1825 return -EINVAL;
1827 if (tb[TCA_CBQ_WRROPT-1] &&
1828 RTA_PAYLOAD(tb[TCA_CBQ_WRROPT-1]) < sizeof(struct tc_cbq_wrropt))
1829 return -EINVAL;
1831 #ifdef CONFIG_NET_CLS_POLICE
1832 if (tb[TCA_CBQ_POLICE-1] &&
1833 RTA_PAYLOAD(tb[TCA_CBQ_POLICE-1]) < sizeof(struct tc_cbq_police))
1834 return -EINVAL;
1835 #endif
1837 if (cl) {
1838 /* Check parent */
1839 if (parentid) {
1840 if (cl->tparent && cl->tparent->classid != parentid)
1841 return -EINVAL;
1842 if (!cl->tparent && parentid != TC_H_ROOT)
1843 return -EINVAL;
1846 if (tb[TCA_CBQ_RATE-1]) {
1847 rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]);
1848 if (rtab == NULL)
1849 return -EINVAL;
1852 /* Change class parameters */
1853 sch_tree_lock(sch);
1855 if (cl->next_alive != NULL)
1856 cbq_deactivate_class(cl);
1858 if (rtab) {
1859 rtab = xchg(&cl->R_tab, rtab);
1860 qdisc_put_rtab(rtab);
1863 if (tb[TCA_CBQ_LSSOPT-1])
1864 cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1]));
1866 if (tb[TCA_CBQ_WRROPT-1]) {
1867 cbq_rmprio(q, cl);
1868 cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1]));
1871 if (tb[TCA_CBQ_OVL_STRATEGY-1])
1872 cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1]));
1874 #ifdef CONFIG_NET_CLS_POLICE
1875 if (tb[TCA_CBQ_POLICE-1])
1876 cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1]));
1877 #endif
1879 if (tb[TCA_CBQ_FOPT-1])
1880 cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1]));
1882 if (cl->q->q.qlen)
1883 cbq_activate_class(cl);
1885 sch_tree_unlock(sch);
1887 #ifdef CONFIG_NET_ESTIMATOR
1888 if (tca[TCA_RATE-1])
1889 gen_replace_estimator(&cl->bstats, &cl->rate_est,
1890 cl->stats_lock, tca[TCA_RATE-1]);
1891 #endif
1892 return 0;
1895 if (parentid == TC_H_ROOT)
1896 return -EINVAL;
1898 if (tb[TCA_CBQ_WRROPT-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL ||
1899 tb[TCA_CBQ_LSSOPT-1] == NULL)
1900 return -EINVAL;
1902 rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]);
1903 if (rtab == NULL)
1904 return -EINVAL;
1906 if (classid) {
1907 err = -EINVAL;
1908 if (TC_H_MAJ(classid^sch->handle) || cbq_class_lookup(q, classid))
1909 goto failure;
1910 } else {
1911 int i;
1912 classid = TC_H_MAKE(sch->handle,0x8000);
1914 for (i=0; i<0x8000; i++) {
1915 if (++q->hgenerator >= 0x8000)
1916 q->hgenerator = 1;
1917 if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1918 break;
1920 err = -ENOSR;
1921 if (i >= 0x8000)
1922 goto failure;
1923 classid = classid|q->hgenerator;
1926 parent = &q->link;
1927 if (parentid) {
1928 parent = cbq_class_lookup(q, parentid);
1929 err = -EINVAL;
1930 if (parent == NULL)
1931 goto failure;
1934 err = -ENOBUFS;
1935 cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1936 if (cl == NULL)
1937 goto failure;
1938 cl->R_tab = rtab;
1939 rtab = NULL;
1940 cl->refcnt = 1;
1941 if (!(cl->q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops, classid)))
1942 cl->q = &noop_qdisc;
1943 cl->classid = classid;
1944 cl->tparent = parent;
1945 cl->qdisc = sch;
1946 cl->allot = parent->allot;
1947 cl->quantum = cl->allot;
1948 cl->weight = cl->R_tab->rate.rate;
1949 cl->stats_lock = &sch->dev->queue_lock;
1951 sch_tree_lock(sch);
1952 cbq_link_class(cl);
1953 cl->borrow = cl->tparent;
1954 if (cl->tparent != &q->link)
1955 cl->share = cl->tparent;
1956 cbq_adjust_levels(parent);
1957 cl->minidle = -0x7FFFFFFF;
1958 cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1]));
1959 cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1]));
1960 if (cl->ewma_log==0)
1961 cl->ewma_log = q->link.ewma_log;
1962 if (cl->maxidle==0)
1963 cl->maxidle = q->link.maxidle;
1964 if (cl->avpkt==0)
1965 cl->avpkt = q->link.avpkt;
1966 cl->overlimit = cbq_ovl_classic;
1967 if (tb[TCA_CBQ_OVL_STRATEGY-1])
1968 cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1]));
1969 #ifdef CONFIG_NET_CLS_POLICE
1970 if (tb[TCA_CBQ_POLICE-1])
1971 cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1]));
1972 #endif
1973 if (tb[TCA_CBQ_FOPT-1])
1974 cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1]));
1975 sch_tree_unlock(sch);
1977 #ifdef CONFIG_NET_ESTIMATOR
1978 if (tca[TCA_RATE-1])
1979 gen_new_estimator(&cl->bstats, &cl->rate_est,
1980 cl->stats_lock, tca[TCA_RATE-1]);
1981 #endif
1983 *arg = (unsigned long)cl;
1984 return 0;
1986 failure:
1987 qdisc_put_rtab(rtab);
1988 return err;
1991 static int cbq_delete(struct Qdisc *sch, unsigned long arg)
1993 struct cbq_sched_data *q = qdisc_priv(sch);
1994 struct cbq_class *cl = (struct cbq_class*)arg;
1995 unsigned int qlen;
1997 if (cl->filters || cl->children || cl == &q->link)
1998 return -EBUSY;
2000 sch_tree_lock(sch);
2002 qlen = cl->q->q.qlen;
2003 qdisc_reset(cl->q);
2004 qdisc_tree_decrease_qlen(cl->q, qlen);
2006 if (cl->next_alive)
2007 cbq_deactivate_class(cl);
2009 if (q->tx_borrowed == cl)
2010 q->tx_borrowed = q->tx_class;
2011 if (q->tx_class == cl) {
2012 q->tx_class = NULL;
2013 q->tx_borrowed = NULL;
2015 #ifdef CONFIG_NET_CLS_POLICE
2016 if (q->rx_class == cl)
2017 q->rx_class = NULL;
2018 #endif
2020 cbq_unlink_class(cl);
2021 cbq_adjust_levels(cl->tparent);
2022 cl->defmap = 0;
2023 cbq_sync_defmap(cl);
2025 cbq_rmprio(q, cl);
2026 sch_tree_unlock(sch);
2028 if (--cl->refcnt == 0)
2029 cbq_destroy_class(sch, cl);
2031 return 0;
2034 static struct tcf_proto **cbq_find_tcf(struct Qdisc *sch, unsigned long arg)
2036 struct cbq_sched_data *q = qdisc_priv(sch);
2037 struct cbq_class *cl = (struct cbq_class *)arg;
2039 if (cl == NULL)
2040 cl = &q->link;
2042 return &cl->filter_list;
2045 static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
2046 u32 classid)
2048 struct cbq_sched_data *q = qdisc_priv(sch);
2049 struct cbq_class *p = (struct cbq_class*)parent;
2050 struct cbq_class *cl = cbq_class_lookup(q, classid);
2052 if (cl) {
2053 if (p && p->level <= cl->level)
2054 return 0;
2055 cl->filters++;
2056 return (unsigned long)cl;
2058 return 0;
2061 static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
2063 struct cbq_class *cl = (struct cbq_class*)arg;
2065 cl->filters--;
2068 static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
2070 struct cbq_sched_data *q = qdisc_priv(sch);
2071 unsigned h;
2073 if (arg->stop)
2074 return;
2076 for (h = 0; h < 16; h++) {
2077 struct cbq_class *cl;
2079 for (cl = q->classes[h]; cl; cl = cl->next) {
2080 if (arg->count < arg->skip) {
2081 arg->count++;
2082 continue;
2084 if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
2085 arg->stop = 1;
2086 return;
2088 arg->count++;
2093 static struct Qdisc_class_ops cbq_class_ops = {
2094 .graft = cbq_graft,
2095 .leaf = cbq_leaf,
2096 .qlen_notify = cbq_qlen_notify,
2097 .get = cbq_get,
2098 .put = cbq_put,
2099 .change = cbq_change_class,
2100 .delete = cbq_delete,
2101 .walk = cbq_walk,
2102 .tcf_chain = cbq_find_tcf,
2103 .bind_tcf = cbq_bind_filter,
2104 .unbind_tcf = cbq_unbind_filter,
2105 .dump = cbq_dump_class,
2106 .dump_stats = cbq_dump_class_stats,
2109 static struct Qdisc_ops cbq_qdisc_ops = {
2110 .next = NULL,
2111 .cl_ops = &cbq_class_ops,
2112 .id = "cbq",
2113 .priv_size = sizeof(struct cbq_sched_data),
2114 .enqueue = cbq_enqueue,
2115 .dequeue = cbq_dequeue,
2116 .requeue = cbq_requeue,
2117 .drop = cbq_drop,
2118 .init = cbq_init,
2119 .reset = cbq_reset,
2120 .destroy = cbq_destroy,
2121 .change = NULL,
2122 .dump = cbq_dump,
2123 .dump_stats = cbq_dump_stats,
2124 .owner = THIS_MODULE,
2127 static int __init cbq_module_init(void)
2129 return register_qdisc(&cbq_qdisc_ops);
2131 static void __exit cbq_module_exit(void)
2133 unregister_qdisc(&cbq_qdisc_ops);
2135 module_init(cbq_module_init)
2136 module_exit(cbq_module_exit)
2137 MODULE_LICENSE("GPL");