1 /*
2 * net/sched/sch_cbq.c Class-Based Queueing discipline.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 #include <linux/module.h>
14 #include <asm/uaccess.h>
15 #include <asm/system.h>
16 #include <linux/bitops.h>
17 #include <linux/types.h>
18 #include <linux/kernel.h>
19 #include <linux/string.h>
20 #include <linux/mm.h>
21 #include <linux/socket.h>
22 #include <linux/sockios.h>
23 #include <linux/in.h>
24 #include <linux/errno.h>
25 #include <linux/interrupt.h>
26 #include <linux/if_ether.h>
27 #include <linux/inet.h>
28 #include <linux/netdevice.h>
29 #include <linux/etherdevice.h>
30 #include <linux/notifier.h>
31 #include <net/ip.h>
32 #include <net/route.h>
33 #include <linux/skbuff.h>
34 #include <net/sock.h>
35 #include <net/pkt_sched.h>
38 /* Class-Based Queueing (CBQ) algorithm.
39 =======================================
41 Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
42 Management Models for Packet Networks",
43 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
45 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
47 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
48 Parameters", 1996
50 [4] Sally Floyd and Michael Speer, "Experimental Results
51 for Class-Based Queueing", 1998, not published.
53 -----------------------------------------------------------------------
55 Algorithm skeleton was taken from NS simulator cbq.cc.
56 If someone wants to check this code against the LBL version,
57 they should take into account that ONLY the skeleton was borrowed;
58 the implementation is different. In particular:
60 --- The WRR algorithm is different. Our version looks more
61 reasonable (I hope) and works when quanta are allowed to be
62 less than MTU, which is always the case when real time classes
63 have small rates. Note that the statement of [3] is
64 incomplete: the delay may actually be estimated even if the class
65 per-round allotment is less than MTU. Namely, if the per-round
66 allotment is W*r_i, and r_1+...+r_k = r < 1
68 delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
70 In the worst case we have an IntServ estimate with D = W*r+k*MTU
71 and C = MTU*r. The proof (if correct at all) is trivial. (An illustrative numeric instance follows this comment.)
74 --- It seems that cbq-2.0 is not very accurate. At least, I cannot
75 interpret some places, which look like wrong translations
76 from NS. Anyone is advised to find these differences
77 and explain to me why I am wrong 8).
79 --- Linux has no EOI event, so we cannot estimate true class
80 idle time. The workaround is to consider the next dequeue event
81 as a sign that the previous packet is finished. This is wrong because of
82 internal device queueing, but on a permanently loaded link it is true.
83 Moreover, combined with the clock integrator, this scheme looks
84 very close to an ideal solution. */
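/* Illustrative reading of the delay bound above (with assumed example
   numbers, not taken from the text): let MTU = 1500 bytes, k = 2 real-time
   classes with r_1 = r_2 = 0.25 (so r = 0.5), W = 2000 bytes, giving
   per-round allotments W*r_i = 500 bytes < MTU, and read B as the link
   bandwidth.  Then [MTU/(W*r_i)] = 3 and

	delay_i <= (3*1000 + 1000 + 2*1500)/B = 7000 bytes / B,

   i.e. about 56 ms on a 1 Mbit/s (125000 bytes/s) link.  This is only a
   sketch of how the bound is evaluated. */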
86 struct cbq_sched_data;
89 struct cbq_class
91 struct cbq_class *next; /* hash table link */
92 struct cbq_class *next_alive; /* next class with backlog in this priority band */
94 /* Parameters */
95 u32 classid;
96 unsigned char priority; /* class priority */
97 unsigned char priority2; /* priority to be used after overlimit */
98 unsigned char ewma_log; /* time constant for idle time calculation */
99 unsigned char ovl_strategy;
100 #ifdef CONFIG_NET_CLS_POLICE
101 unsigned char police;
102 #endif
104 u32 defmap;
106 /* Link-sharing scheduler parameters */
107 long maxidle; /* Class parameters: see below. */
108 long offtime;
109 long minidle;
110 u32 avpkt;
111 struct qdisc_rate_table *R_tab;
113 /* Overlimit strategy parameters */
114 void (*overlimit)(struct cbq_class *cl);
115 long penalty;
117 /* General scheduler (WRR) parameters */
118 long allot;
119 long quantum; /* Allotment per WRR round */
120 long weight; /* Relative allotment: see below */
122 struct Qdisc *qdisc; /* Ptr to CBQ discipline */
123 struct cbq_class *split; /* Ptr to split node */
124 struct cbq_class *share; /* Ptr to LS parent in the class tree */
125 struct cbq_class *tparent; /* Ptr to tree parent in the class tree */
126 struct cbq_class *borrow; /* NULL if class is bandwidth limited;
127 parent otherwise */
128 struct cbq_class *sibling; /* Sibling chain */
129 struct cbq_class *children; /* Pointer to children chain */
131 struct Qdisc *q; /* Elementary queueing discipline */
134 /* Variables */
135 unsigned char cpriority; /* Effective priority */
136 unsigned char delayed;
137 unsigned char level; /* level of the class in hierarchy:
138 0 for leaf classes, and maximal
139 level of children + 1 for nodes.
142 psched_time_t last; /* Last end of service */
143 psched_time_t undertime;
144 long avgidle;
145 long deficit; /* Saved deficit for WRR */
146 unsigned long penalized;
147 struct gnet_stats_basic bstats;
148 struct gnet_stats_queue qstats;
149 struct gnet_stats_rate_est rate_est;
150 spinlock_t *stats_lock;
151 struct tc_cbq_xstats xstats;
153 struct tcf_proto *filter_list;
155 int refcnt;
156 int filters;
158 struct cbq_class *defaults[TC_PRIO_MAX+1];
161 struct cbq_sched_data
163 struct cbq_class *classes[16]; /* Hash table of all classes */
164 int nclasses[TC_CBQ_MAXPRIO+1];
165 unsigned quanta[TC_CBQ_MAXPRIO+1];
167 struct cbq_class link;
169 unsigned activemask;
170 struct cbq_class *active[TC_CBQ_MAXPRIO+1]; /* List of all classes
171 with backlog */
173 #ifdef CONFIG_NET_CLS_POLICE
174 struct cbq_class *rx_class;
175 #endif
176 struct cbq_class *tx_class;
177 struct cbq_class *tx_borrowed;
178 int tx_len;
179 psched_time_t now; /* Cached timestamp */
180 psched_time_t now_rt; /* Cached real time */
181 unsigned pmask;
183 struct timer_list delay_timer;
184 struct timer_list wd_timer; /* Watchdog timer,
185 started when CBQ has
186 backlog, but cannot
187 transmit just now */
188 long wd_expires;
189 int toplevel;
190 u32 hgenerator;
194 #define L2T(cl,len) ((cl)->R_tab->data[(len)>>(cl)->R_tab->rate.cell_log])
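/* L2T ("length to time"): look up, in the class's rate table, the number of
   scheduler clock units needed to transmit 'len' bytes at the class rate;
   the table supplied by userspace tc is indexed by len >> cell_log. */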
197 static __inline__ unsigned cbq_hash(u32 h)
199 h ^= h>>8;
200 h ^= h>>4;
201 return h&0xF;
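/* The hash above folds a classid into one of the 16 buckets of q->classes[];
   cbq_class_lookup() below walks the per-bucket chain linked through
   cl->next. */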
204 static __inline__ struct cbq_class *
205 cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
207 struct cbq_class *cl;
209 for (cl = q->classes[cbq_hash(classid)]; cl; cl = cl->next)
210 if (cl->classid == classid)
211 return cl;
212 return NULL;
215 #ifdef CONFIG_NET_CLS_POLICE
217 static struct cbq_class *
218 cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
220 struct cbq_class *cl, *new;
222 for (cl = this->tparent; cl; cl = cl->tparent)
223 if ((new = cl->defaults[TC_PRIO_BESTEFFORT]) != NULL && new != this)
224 return new;
226 return NULL;
229 #endif
231 /* Classify packet. The procedure is pretty complicated, but
232 it allows us to combine link sharing and priority scheduling
233 transparently.
235 Namely, you can put link-sharing rules (e.g. route based) at the root of CBQ,
236 so that it resolves to split nodes. Then packets are classified
237 by logical priority, or a more specific classifier may be attached
238 to the split node.
241 static struct cbq_class *
242 cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
244 struct cbq_sched_data *q = qdisc_priv(sch);
245 struct cbq_class *head = &q->link;
246 struct cbq_class **defmap;
247 struct cbq_class *cl = NULL;
248 u32 prio = skb->priority;
249 struct tcf_result res;
252 * Step 1. If skb->priority points to one of our classes, use it.
254 if (TC_H_MAJ(prio^sch->handle) == 0 &&
255 (cl = cbq_class_lookup(q, prio)) != NULL)
256 return cl;
258 *qerr = NET_XMIT_BYPASS;
259 for (;;) {
260 int result = 0;
261 defmap = head->defaults;
264 * Step 2+n. Apply classifier.
266 if (!head->filter_list || (result = tc_classify(skb, head->filter_list, &res)) < 0)
267 goto fallback;
269 if ((cl = (void*)res.class) == NULL) {
270 if (TC_H_MAJ(res.classid))
271 cl = cbq_class_lookup(q, res.classid);
272 else if ((cl = defmap[res.classid&TC_PRIO_MAX]) == NULL)
273 cl = defmap[TC_PRIO_BESTEFFORT];
275 if (cl == NULL || cl->level >= head->level)
276 goto fallback;
279 #ifdef CONFIG_NET_CLS_ACT
280 switch (result) {
281 case TC_ACT_QUEUED:
282 case TC_ACT_STOLEN:
283 *qerr = NET_XMIT_SUCCESS;
284 case TC_ACT_SHOT:
285 return NULL;
287 #elif defined(CONFIG_NET_CLS_POLICE)
288 switch (result) {
289 case TC_POLICE_RECLASSIFY:
290 return cbq_reclassify(skb, cl);
291 case TC_POLICE_SHOT:
292 return NULL;
293 default:
294 break;
296 #endif
297 if (cl->level == 0)
298 return cl;
301 * Step 3+n. If the classifier selected a link-sharing class,
302 * apply the agency-specific classifier.
303 * Repeat this procedure until we hit a leaf node.
305 head = cl;
308 fallback:
309 cl = head;
312 * Step 4. No success...
314 if (TC_H_MAJ(prio) == 0 &&
315 !(cl = head->defaults[prio&TC_PRIO_MAX]) &&
316 !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
317 return head;
319 return cl;
323 A packet has just been enqueued on an empty class.
324 cbq_activate_class adds the class to the tail of the active class list
325 of its priority band.
328 static __inline__ void cbq_activate_class(struct cbq_class *cl)
330 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
331 int prio = cl->cpriority;
332 struct cbq_class *cl_tail;
334 cl_tail = q->active[prio];
335 q->active[prio] = cl;
337 if (cl_tail != NULL) {
338 cl->next_alive = cl_tail->next_alive;
339 cl_tail->next_alive = cl;
340 } else {
341 cl->next_alive = cl;
342 q->activemask |= (1<<prio);
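/* Note: q->active[prio] points at the current tail of the circular
   next_alive ring for this band; the round-robin scan in cbq_dequeue_prio()
   resumes at q->active[prio]->next_alive. */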
347 Unlink class from active chain.
348 Note that this same procedure is done directly in cbq_dequeue*
349 during round-robin procedure.
352 static void cbq_deactivate_class(struct cbq_class *this)
354 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
355 int prio = this->cpriority;
356 struct cbq_class *cl;
357 struct cbq_class *cl_prev = q->active[prio];
359 do {
360 cl = cl_prev->next_alive;
361 if (cl == this) {
362 cl_prev->next_alive = cl->next_alive;
363 cl->next_alive = NULL;
365 if (cl == q->active[prio]) {
366 q->active[prio] = cl_prev;
367 if (cl == q->active[prio]) {
368 q->active[prio] = NULL;
369 q->activemask &= ~(1<<prio);
370 return;
373 return;
375 } while ((cl_prev = cl) != q->active[prio]);
378 static void
379 cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
381 int toplevel = q->toplevel;
383 if (toplevel > cl->level && !(cl->q->flags&TCQ_F_THROTTLED)) {
384 psched_time_t now;
385 psched_tdiff_t incr;
387 PSCHED_GET_TIME(now);
388 incr = PSCHED_TDIFF(now, q->now_rt);
389 PSCHED_TADD2(q->now, incr, now);
391 do {
392 if (PSCHED_TLESS(cl->undertime, now)) {
393 q->toplevel = cl->level;
394 return;
396 } while ((cl=cl->borrow) != NULL && toplevel > cl->level);
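/* cbq_mark_toplevel() lowers the toplevel cutoff to the level of the first
   class on the borrow chain (starting with the class itself) whose undertime
   is already in the past, so that an arriving packet may re-enable borrowing
   that the cutoff had inhibited. */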
400 static int
401 cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
403 struct cbq_sched_data *q = qdisc_priv(sch);
404 int len = skb->len;
405 int ret;
406 struct cbq_class *cl = cbq_classify(skb, sch, &ret);
408 #ifdef CONFIG_NET_CLS_POLICE
409 q->rx_class = cl;
410 #endif
411 if (cl == NULL) {
412 if (ret == NET_XMIT_BYPASS)
413 sch->qstats.drops++;
414 kfree_skb(skb);
415 return ret;
418 #ifdef CONFIG_NET_CLS_POLICE
419 cl->q->__parent = sch;
420 #endif
421 if ((ret = cl->q->enqueue(skb, cl->q)) == NET_XMIT_SUCCESS) {
422 sch->q.qlen++;
423 sch->bstats.packets++;
424 sch->bstats.bytes+=len;
425 cbq_mark_toplevel(q, cl);
426 if (!cl->next_alive)
427 cbq_activate_class(cl);
428 return ret;
431 sch->qstats.drops++;
432 cbq_mark_toplevel(q, cl);
433 cl->qstats.drops++;
434 return ret;
437 static int
438 cbq_requeue(struct sk_buff *skb, struct Qdisc *sch)
440 struct cbq_sched_data *q = qdisc_priv(sch);
441 struct cbq_class *cl;
442 int ret;
444 if ((cl = q->tx_class) == NULL) {
445 kfree_skb(skb);
446 sch->qstats.drops++;
447 return NET_XMIT_CN;
449 q->tx_class = NULL;
451 cbq_mark_toplevel(q, cl);
453 #ifdef CONFIG_NET_CLS_POLICE
454 q->rx_class = cl;
455 cl->q->__parent = sch;
456 #endif
457 if ((ret = cl->q->ops->requeue(skb, cl->q)) == 0) {
458 sch->q.qlen++;
459 sch->qstats.requeues++;
460 if (!cl->next_alive)
461 cbq_activate_class(cl);
462 return 0;
464 sch->qstats.drops++;
465 cl->qstats.drops++;
466 return ret;
469 /* Overlimit actions */
471 /* TC_CBQ_OVL_CLASSIC: (default) penalize leaf class by adding offtime */
473 static void cbq_ovl_classic(struct cbq_class *cl)
475 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
476 psched_tdiff_t delay = PSCHED_TDIFF(cl->undertime, q->now);
478 if (!cl->delayed) {
479 delay += cl->offtime;
482 The class goes to sleep, so it will have no
483 chance to work off its avgidle. Let's forgive it 8)
485 BTW cbq-2.0 has a bug in this
486 place; apparently they forgot to shift it by cl->ewma_log.
488 if (cl->avgidle < 0)
489 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
490 if (cl->avgidle < cl->minidle)
491 cl->avgidle = cl->minidle;
492 if (delay <= 0)
493 delay = 1;
494 PSCHED_TADD2(q->now, delay, cl->undertime);
496 cl->xstats.overactions++;
497 cl->delayed = 1;
499 if (q->wd_expires == 0 || q->wd_expires > delay)
500 q->wd_expires = delay;
502 /* Dirty work! We must schedule wakeups based on
503 real available rate, rather than leaf rate,
504 which may be tiny (even zero).
506 if (q->toplevel == TC_CBQ_MAXLEVEL) {
507 struct cbq_class *b;
508 psched_tdiff_t base_delay = q->wd_expires;
510 for (b = cl->borrow; b; b = b->borrow) {
511 delay = PSCHED_TDIFF(b->undertime, q->now);
512 if (delay < base_delay) {
513 if (delay <= 0)
514 delay = 1;
515 base_delay = delay;
519 q->wd_expires = base_delay;
523 /* TC_CBQ_OVL_RCLASSIC: penalize classes in the hierarchy by offtime when
524 they go overlimit
527 static void cbq_ovl_rclassic(struct cbq_class *cl)
529 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
530 struct cbq_class *this = cl;
532 do {
533 if (cl->level > q->toplevel) {
534 cl = NULL;
535 break;
537 } while ((cl = cl->borrow) != NULL);
539 if (cl == NULL)
540 cl = this;
541 cbq_ovl_classic(cl);
544 /* TC_CBQ_OVL_DELAY: delay until the class goes underlimit */
546 static void cbq_ovl_delay(struct cbq_class *cl)
548 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
549 psched_tdiff_t delay = PSCHED_TDIFF(cl->undertime, q->now);
551 if (!cl->delayed) {
552 unsigned long sched = jiffies;
554 delay += cl->offtime;
555 if (cl->avgidle < 0)
556 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
557 if (cl->avgidle < cl->minidle)
558 cl->avgidle = cl->minidle;
559 PSCHED_TADD2(q->now, delay, cl->undertime);
561 if (delay > 0) {
562 sched += PSCHED_US2JIFFIE(delay) + cl->penalty;
563 cl->penalized = sched;
564 cl->cpriority = TC_CBQ_MAXPRIO;
565 q->pmask |= (1<<TC_CBQ_MAXPRIO);
566 if (del_timer(&q->delay_timer) &&
567 (long)(q->delay_timer.expires - sched) > 0)
568 q->delay_timer.expires = sched;
569 add_timer(&q->delay_timer);
570 cl->delayed = 1;
571 cl->xstats.overactions++;
572 return;
574 delay = 1;
576 if (q->wd_expires == 0 || q->wd_expires > delay)
577 q->wd_expires = delay;
580 /* TC_CBQ_OVL_LOWPRIO: penalize class by lowering its priority band */
582 static void cbq_ovl_lowprio(struct cbq_class *cl)
584 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
586 cl->penalized = jiffies + cl->penalty;
588 if (cl->cpriority != cl->priority2) {
589 cl->cpriority = cl->priority2;
590 q->pmask |= (1<<cl->cpriority);
591 cl->xstats.overactions++;
593 cbq_ovl_classic(cl);
596 /* TC_CBQ_OVL_DROP: penalize class by dropping */
598 static void cbq_ovl_drop(struct cbq_class *cl)
600 if (cl->q->ops->drop)
601 if (cl->q->ops->drop(cl->q))
602 cl->qdisc->q.qlen--;
603 cl->xstats.overactions++;
604 cbq_ovl_classic(cl);
607 static void cbq_watchdog(unsigned long arg)
609 struct Qdisc *sch = (struct Qdisc*)arg;
611 sch->flags &= ~TCQ_F_THROTTLED;
612 netif_schedule(sch->dev);
615 static unsigned long cbq_undelay_prio(struct cbq_sched_data *q, int prio)
617 struct cbq_class *cl;
618 struct cbq_class *cl_prev = q->active[prio];
619 unsigned long now = jiffies;
620 unsigned long sched = now;
622 if (cl_prev == NULL)
623 return now;
625 do {
626 cl = cl_prev->next_alive;
627 if ((long)(now - cl->penalized) > 0) {
628 cl_prev->next_alive = cl->next_alive;
629 cl->next_alive = NULL;
630 cl->cpriority = cl->priority;
631 cl->delayed = 0;
632 cbq_activate_class(cl);
634 if (cl == q->active[prio]) {
635 q->active[prio] = cl_prev;
636 if (cl == q->active[prio]) {
637 q->active[prio] = NULL;
638 return 0;
642 cl = cl_prev->next_alive;
643 } else if ((long)(sched - cl->penalized) > 0)
644 sched = cl->penalized;
645 } while ((cl_prev = cl) != q->active[prio]);
647 return (long)(sched - now);
650 static void cbq_undelay(unsigned long arg)
652 struct Qdisc *sch = (struct Qdisc*)arg;
653 struct cbq_sched_data *q = qdisc_priv(sch);
654 long delay = 0;
655 unsigned pmask;
657 pmask = q->pmask;
658 q->pmask = 0;
660 while (pmask) {
661 int prio = ffz(~pmask);
662 long tmp;
664 pmask &= ~(1<<prio);
666 tmp = cbq_undelay_prio(q, prio);
667 if (tmp > 0) {
668 q->pmask |= 1<<prio;
669 if (tmp < delay || delay == 0)
670 delay = tmp;
674 if (delay) {
675 q->delay_timer.expires = jiffies + delay;
676 add_timer(&q->delay_timer);
679 sch->flags &= ~TCQ_F_THROTTLED;
680 netif_schedule(sch->dev);
684 #ifdef CONFIG_NET_CLS_POLICE
686 static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
688 int len = skb->len;
689 struct Qdisc *sch = child->__parent;
690 struct cbq_sched_data *q = qdisc_priv(sch);
691 struct cbq_class *cl = q->rx_class;
693 q->rx_class = NULL;
695 if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) {
697 cbq_mark_toplevel(q, cl);
699 q->rx_class = cl;
700 cl->q->__parent = sch;
702 if (cl->q->enqueue(skb, cl->q) == 0) {
703 sch->q.qlen++;
704 sch->bstats.packets++;
705 sch->bstats.bytes+=len;
706 if (!cl->next_alive)
707 cbq_activate_class(cl);
708 return 0;
710 sch->qstats.drops++;
711 return 0;
714 sch->qstats.drops++;
715 return -1;
717 #endif
720 This is a mission-critical procedure.
722 We "regenerate" the toplevel cutoff if the transmitting class
723 has backlog and is not regulated. This is not part of the
724 original CBQ description, but it looks more reasonable.
725 Probably it is wrong; this question needs further investigation.
728 static __inline__ void
729 cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
730 struct cbq_class *borrowed)
732 if (cl && q->toplevel >= borrowed->level) {
733 if (cl->q->q.qlen > 1) {
734 do {
735 if (PSCHED_IS_PASTPERFECT(borrowed->undertime)) {
736 q->toplevel = borrowed->level;
737 return;
739 } while ((borrowed=borrowed->borrow) != NULL);
741 #if 0
742 /* It is not necessary now. Uncommenting it
743 will save CPU cycles, but decrease fairness.
745 q->toplevel = TC_CBQ_MAXLEVEL;
746 #endif
750 static void
751 cbq_update(struct cbq_sched_data *q)
753 struct cbq_class *this = q->tx_class;
754 struct cbq_class *cl = this;
755 int len = q->tx_len;
757 q->tx_class = NULL;
759 for ( ; cl; cl = cl->share) {
760 long avgidle = cl->avgidle;
761 long idle;
763 cl->bstats.packets++;
764 cl->bstats.bytes += len;
767 (now - last) is total time between packet right edges.
768 (last_pktlen/rate) is "virtual" busy time, so that
770 idle = (now - last) - last_pktlen/rate
773 idle = PSCHED_TDIFF(q->now, cl->last);
774 if ((unsigned long)idle > 128*1024*1024) {
775 avgidle = cl->maxidle;
776 } else {
777 idle -= L2T(cl, len);
779 /* true_avgidle := (1-W)*true_avgidle + W*idle,
780 where W=2^{-ewma_log}. But cl->avgidle is scaled:
781 cl->avgidle == true_avgidle/W,
782 hence:
784 avgidle += idle - (avgidle>>cl->ewma_log);
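/* e.g. with ewma_log = 5 (W = 1/32): if avgidle stores 32*true_avgidle,
   then avgidle + idle - (avgidle >> 5)
   = 32*((1 - 1/32)*true_avgidle + (1/32)*idle),
   i.e. exactly the scaled form of the EWMA stated above. */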
787 if (avgidle <= 0) {
788 /* Overlimit or at-limit */
790 if (avgidle < cl->minidle)
791 avgidle = cl->minidle;
793 cl->avgidle = avgidle;
795 /* Calculate the expected time when this class
796 will be allowed to send.
797 It will occur when:
798 (1-W)*true_avgidle + W*delay = 0, i.e.
799 idle = (1/W - 1)*(-true_avgidle)
801 idle = (1 - W)*(-cl->avgidle);
803 idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
806 That is not all.
807 To maintain the rate allocated to the class,
808 we add to undertime the virtual clock time
809 necessary to complete the transmitted packet.
810 (len/phys_bandwidth has already passed
811 by the moment of cbq_update.)
814 idle -= L2T(&q->link, len);
815 idle += L2T(cl, len);
817 PSCHED_AUDIT_TDIFF(idle);
819 PSCHED_TADD2(q->now, idle, cl->undertime);
820 } else {
821 /* Underlimit */
823 PSCHED_SET_PASTPERFECT(cl->undertime);
824 if (avgidle > cl->maxidle)
825 cl->avgidle = cl->maxidle;
826 else
827 cl->avgidle = avgidle;
829 cl->last = q->now;
832 cbq_update_toplevel(q, this, q->tx_borrowed);
835 static __inline__ struct cbq_class *
836 cbq_under_limit(struct cbq_class *cl)
838 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
839 struct cbq_class *this_cl = cl;
841 if (cl->tparent == NULL)
842 return cl;
844 if (PSCHED_IS_PASTPERFECT(cl->undertime) ||
845 !PSCHED_TLESS(q->now, cl->undertime)) {
846 cl->delayed = 0;
847 return cl;
850 do {
851 /* This is a very suspicious place. Currently the overlimit
852 action is generated for non-bounded classes
853 only if the link is completely congested.
854 Though this agrees with the ancestor-only paradigm,
855 it looks very stupid. In particular,
856 it means that this chunk of code will either
857 never be called or result in a strong amplification
858 of burstiness. Dangerous and silly, but
859 no other solution exists.
861 if ((cl = cl->borrow) == NULL) {
862 this_cl->qstats.overlimits++;
863 this_cl->overlimit(this_cl);
864 return NULL;
866 if (cl->level > q->toplevel)
867 return NULL;
868 } while (!PSCHED_IS_PASTPERFECT(cl->undertime) &&
869 PSCHED_TLESS(q->now, cl->undertime));
871 cl->delayed = 0;
872 return cl;
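/* cbq_under_limit() returns the class itself when it may send, an underlimit
   ancestor when the class has to borrow, or NULL when the borrow chain is
   exhausted (the overlimit action has fired) or blocked by the toplevel
   cutoff. */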
875 static __inline__ struct sk_buff *
876 cbq_dequeue_prio(struct Qdisc *sch, int prio)
878 struct cbq_sched_data *q = qdisc_priv(sch);
879 struct cbq_class *cl_tail, *cl_prev, *cl;
880 struct sk_buff *skb;
881 int deficit;
883 cl_tail = cl_prev = q->active[prio];
884 cl = cl_prev->next_alive;
886 do {
887 deficit = 0;
889 /* Start round */
890 do {
891 struct cbq_class *borrow = cl;
893 if (cl->q->q.qlen &&
894 (borrow = cbq_under_limit(cl)) == NULL)
895 goto skip_class;
897 if (cl->deficit <= 0) {
898 /* Class exhausted its allotment per
899 this round. Switch to the next one.
901 deficit = 1;
902 cl->deficit += cl->quantum;
903 goto next_class;
906 skb = cl->q->dequeue(cl->q);
908 /* The class did not give us any skb :-(
909 This can occur even if cl->q->q.qlen != 0,
910 e.g. if cl->q == "tbf"
912 if (skb == NULL)
913 goto skip_class;
915 cl->deficit -= skb->len;
916 q->tx_class = cl;
917 q->tx_borrowed = borrow;
918 if (borrow != cl) {
919 #ifndef CBQ_XSTATS_BORROWS_BYTES
920 borrow->xstats.borrows++;
921 cl->xstats.borrows++;
922 #else
923 borrow->xstats.borrows += skb->len;
924 cl->xstats.borrows += skb->len;
925 #endif
927 q->tx_len = skb->len;
929 if (cl->deficit <= 0) {
930 q->active[prio] = cl;
931 cl = cl->next_alive;
932 cl->deficit += cl->quantum;
934 return skb;
936 skip_class:
937 if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
938 /* Class is empty or penalized.
939 Unlink it from active chain.
941 cl_prev->next_alive = cl->next_alive;
942 cl->next_alive = NULL;
944 /* Did cl_tail point to it? */
945 if (cl == cl_tail) {
946 /* Repair it! */
947 cl_tail = cl_prev;
949 /* Was it the last class in this band? */
950 if (cl == cl_tail) {
951 /* Kill the band! */
952 q->active[prio] = NULL;
953 q->activemask &= ~(1<<prio);
954 if (cl->q->q.qlen)
955 cbq_activate_class(cl);
956 return NULL;
959 q->active[prio] = cl_tail;
961 if (cl->q->q.qlen)
962 cbq_activate_class(cl);
964 cl = cl_prev;
967 next_class:
968 cl_prev = cl;
969 cl = cl->next_alive;
970 } while (cl_prev != cl_tail);
971 } while (deficit);
973 q->active[prio] = cl_prev;
975 return NULL;
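/* The loop above is the WRR scheduler for one priority band: a class with a
   positive deficit may transmit and is charged skb->len; a class that starts
   its turn with deficit <= 0 has cl->quantum added and is skipped, and the
   outer loop repeats the round while any allotment was replenished (the
   "deficit" flag).  Empty or penalized classes are unlinked from the ring
   on the fly. */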
978 static __inline__ struct sk_buff *
979 cbq_dequeue_1(struct Qdisc *sch)
981 struct cbq_sched_data *q = qdisc_priv(sch);
982 struct sk_buff *skb;
983 unsigned activemask;
985 activemask = q->activemask&0xFF;
986 while (activemask) {
987 int prio = ffz(~activemask);
988 activemask &= ~(1<<prio);
989 skb = cbq_dequeue_prio(sch, prio);
990 if (skb)
991 return skb;
993 return NULL;
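/* cbq_dequeue_1() serves bands in strict priority order: ffz(~activemask)
   picks the lowest-numbered (highest-priority) backlogged band first, and
   lower bands are tried only when higher ones yield nothing. */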
996 static struct sk_buff *
997 cbq_dequeue(struct Qdisc *sch)
999 struct sk_buff *skb;
1000 struct cbq_sched_data *q = qdisc_priv(sch);
1001 psched_time_t now;
1002 psched_tdiff_t incr;
1004 PSCHED_GET_TIME(now);
1005 incr = PSCHED_TDIFF(now, q->now_rt);
1007 if (q->tx_class) {
1008 psched_tdiff_t incr2;
1009 /* Time integrator. We calculate EOS time
1010 by adding expected packet transmission time.
1011 If real time is greater, we warp artificial clock,
1012 so that:
1014 cbq_time = max(real_time, work);
1016 incr2 = L2T(&q->link, q->tx_len);
1017 PSCHED_TADD(q->now, incr2);
1018 cbq_update(q);
1019 if ((incr -= incr2) < 0)
1020 incr = 0;
1022 PSCHED_TADD(q->now, incr);
1023 q->now_rt = now;
1025 for (;;) {
1026 q->wd_expires = 0;
1028 skb = cbq_dequeue_1(sch);
1029 if (skb) {
1030 sch->q.qlen--;
1031 sch->flags &= ~TCQ_F_THROTTLED;
1032 return skb;
1035 /* All the classes are overlimit.
1037 This is possible if:
1039 1. The scheduler is empty.
1040 2. The toplevel cutoff inhibited borrowing.
1041 3. The root class is overlimit.
1043 Reset conditions 2 and 3 and retry.
1045 Note that NS and cbq-2.0 are buggy: peeking
1046 an arbitrary class is appropriate for ancestor-only
1047 sharing, but not for the toplevel algorithm.
1049 Our version is better but slower, because it requires
1050 two passes; this is unavoidable with top-level sharing.
1053 if (q->toplevel == TC_CBQ_MAXLEVEL &&
1054 PSCHED_IS_PASTPERFECT(q->link.undertime))
1055 break;
1057 q->toplevel = TC_CBQ_MAXLEVEL;
1058 PSCHED_SET_PASTPERFECT(q->link.undertime);
1061 /* No packets in the scheduler, or nobody wants to give them to us :-(
1062 Sigh... start the watchdog timer in the latter case. */
1064 if (sch->q.qlen) {
1065 sch->qstats.overlimits++;
1066 if (q->wd_expires) {
1067 long delay = PSCHED_US2JIFFIE(q->wd_expires);
1068 if (delay <= 0)
1069 delay = 1;
1070 mod_timer(&q->wd_timer, jiffies + delay);
1071 sch->flags |= TCQ_F_THROTTLED;
1074 return NULL;
1077 /* CBQ class maintenance routines */
1079 static void cbq_adjust_levels(struct cbq_class *this)
1081 if (this == NULL)
1082 return;
1084 do {
1085 int level = 0;
1086 struct cbq_class *cl;
1088 if ((cl = this->children) != NULL) {
1089 do {
1090 if (cl->level > level)
1091 level = cl->level;
1092 } while ((cl = cl->sibling) != this->children);
1094 this->level = level+1;
1095 } while ((this = this->tparent) != NULL);
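/* cbq_adjust_levels() walks from the given class up through every tparent,
   setting each node's level to one more than the maximal level of its
   children; it is called whenever the class tree gains or loses a node. */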
1098 static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
1100 struct cbq_class *cl;
1101 unsigned h;
1103 if (q->quanta[prio] == 0)
1104 return;
1106 for (h=0; h<16; h++) {
1107 for (cl = q->classes[h]; cl; cl = cl->next) {
1108 /* BUGGGG... Beware! This expression suffers from
1109 arithmetic overflows!
1111 if (cl->priority == prio) {
1112 cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
1113 q->quanta[prio];
1115 if (cl->quantum <= 0 || cl->quantum>32*cl->qdisc->dev->mtu) {
1116 printk(KERN_WARNING "CBQ: class %08x has bad quantum==%ld, repaired.\n", cl->classid, cl->quantum);
1117 cl->quantum = cl->qdisc->dev->mtu/2 + 1;
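/* For every class of the given priority, cbq_normalize_quanta() computes

	quantum = allot * weight * nclasses[prio] / sum_of_weights[prio]

   where q->quanta[prio] accumulates the weights via cbq_addprio()/
   cbq_rmprio(), and clamps obviously bogus results (<= 0 or > 32*MTU) to
   dev->mtu/2 + 1. */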
1123 static void cbq_sync_defmap(struct cbq_class *cl)
1125 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1126 struct cbq_class *split = cl->split;
1127 unsigned h;
1128 int i;
1130 if (split == NULL)
1131 return;
1133 for (i=0; i<=TC_PRIO_MAX; i++) {
1134 if (split->defaults[i] == cl && !(cl->defmap&(1<<i)))
1135 split->defaults[i] = NULL;
1138 for (i=0; i<=TC_PRIO_MAX; i++) {
1139 int level = split->level;
1141 if (split->defaults[i])
1142 continue;
1144 for (h=0; h<16; h++) {
1145 struct cbq_class *c;
1147 for (c = q->classes[h]; c; c = c->next) {
1148 if (c->split == split && c->level < level &&
1149 c->defmap&(1<<i)) {
1150 split->defaults[i] = c;
1151 level = c->level;
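/* cbq_sync_defmap() rebuilds split->defaults[]: for each of the
   TC_PRIO_MAX+1 logical priorities it first drops entries no longer claimed
   by cl->defmap, then picks, among all classes attached to the same split
   node, the lowest-level class whose defmap bit covers that priority. */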
1158 static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
1160 struct cbq_class *split = NULL;
1162 if (splitid == 0) {
1163 if ((split = cl->split) == NULL)
1164 return;
1165 splitid = split->classid;
1168 if (split == NULL || split->classid != splitid) {
1169 for (split = cl->tparent; split; split = split->tparent)
1170 if (split->classid == splitid)
1171 break;
1174 if (split == NULL)
1175 return;
1177 if (cl->split != split) {
1178 cl->defmap = 0;
1179 cbq_sync_defmap(cl);
1180 cl->split = split;
1181 cl->defmap = def&mask;
1182 } else
1183 cl->defmap = (cl->defmap&~mask)|(def&mask);
1185 cbq_sync_defmap(cl);
1188 static void cbq_unlink_class(struct cbq_class *this)
1190 struct cbq_class *cl, **clp;
1191 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1193 for (clp = &q->classes[cbq_hash(this->classid)]; (cl = *clp) != NULL; clp = &cl->next) {
1194 if (cl == this) {
1195 *clp = cl->next;
1196 cl->next = NULL;
1197 break;
1201 if (this->tparent) {
1202 clp=&this->sibling;
1203 cl = *clp;
1204 do {
1205 if (cl == this) {
1206 *clp = cl->sibling;
1207 break;
1209 clp = &cl->sibling;
1210 } while ((cl = *clp) != this->sibling);
1212 if (this->tparent->children == this) {
1213 this->tparent->children = this->sibling;
1214 if (this->sibling == this)
1215 this->tparent->children = NULL;
1217 } else {
1218 BUG_TRAP(this->sibling == this);
1222 static void cbq_link_class(struct cbq_class *this)
1224 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1225 unsigned h = cbq_hash(this->classid);
1226 struct cbq_class *parent = this->tparent;
1228 this->sibling = this;
1229 this->next = q->classes[h];
1230 q->classes[h] = this;
1232 if (parent == NULL)
1233 return;
1235 if (parent->children == NULL) {
1236 parent->children = this;
1237 } else {
1238 this->sibling = parent->children->sibling;
1239 parent->children->sibling = this;
1243 static unsigned int cbq_drop(struct Qdisc* sch)
1245 struct cbq_sched_data *q = qdisc_priv(sch);
1246 struct cbq_class *cl, *cl_head;
1247 int prio;
1248 unsigned int len;
1250 for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) {
1251 if ((cl_head = q->active[prio]) == NULL)
1252 continue;
1254 cl = cl_head;
1255 do {
1256 if (cl->q->ops->drop && (len = cl->q->ops->drop(cl->q))) {
1257 sch->q.qlen--;
1258 if (!cl->q->q.qlen)
1259 cbq_deactivate_class(cl);
1260 return len;
1262 } while ((cl = cl->next_alive) != cl_head);
1264 return 0;
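/* cbq_drop() scans the bands from the numerically highest (lowest-priority)
   index downwards and drops one packet from the first backlogged class whose
   inner qdisc implements ->drop, deactivating the class if that emptied it. */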
1267 static void
1268 cbq_reset(struct Qdisc* sch)
1270 struct cbq_sched_data *q = qdisc_priv(sch);
1271 struct cbq_class *cl;
1272 int prio;
1273 unsigned h;
1275 q->activemask = 0;
1276 q->pmask = 0;
1277 q->tx_class = NULL;
1278 q->tx_borrowed = NULL;
1279 del_timer(&q->wd_timer);
1280 del_timer(&q->delay_timer);
1281 q->toplevel = TC_CBQ_MAXLEVEL;
1282 PSCHED_GET_TIME(q->now);
1283 q->now_rt = q->now;
1285 for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
1286 q->active[prio] = NULL;
1288 for (h = 0; h < 16; h++) {
1289 for (cl = q->classes[h]; cl; cl = cl->next) {
1290 qdisc_reset(cl->q);
1292 cl->next_alive = NULL;
1293 PSCHED_SET_PASTPERFECT(cl->undertime);
1294 cl->avgidle = cl->maxidle;
1295 cl->deficit = cl->quantum;
1296 cl->cpriority = cl->priority;
1299 sch->q.qlen = 0;
1303 static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
1305 if (lss->change&TCF_CBQ_LSS_FLAGS) {
1306 cl->share = (lss->flags&TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
1307 cl->borrow = (lss->flags&TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
1309 if (lss->change&TCF_CBQ_LSS_EWMA)
1310 cl->ewma_log = lss->ewma_log;
1311 if (lss->change&TCF_CBQ_LSS_AVPKT)
1312 cl->avpkt = lss->avpkt;
1313 if (lss->change&TCF_CBQ_LSS_MINIDLE)
1314 cl->minidle = -(long)lss->minidle;
1315 if (lss->change&TCF_CBQ_LSS_MAXIDLE) {
1316 cl->maxidle = lss->maxidle;
1317 cl->avgidle = lss->maxidle;
1319 if (lss->change&TCF_CBQ_LSS_OFFTIME)
1320 cl->offtime = lss->offtime;
1321 return 0;
1324 static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1326 q->nclasses[cl->priority]--;
1327 q->quanta[cl->priority] -= cl->weight;
1328 cbq_normalize_quanta(q, cl->priority);
1331 static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1333 q->nclasses[cl->priority]++;
1334 q->quanta[cl->priority] += cl->weight;
1335 cbq_normalize_quanta(q, cl->priority);
1338 static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1340 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1342 if (wrr->allot)
1343 cl->allot = wrr->allot;
1344 if (wrr->weight)
1345 cl->weight = wrr->weight;
1346 if (wrr->priority) {
1347 cl->priority = wrr->priority-1;
1348 cl->cpriority = cl->priority;
1349 if (cl->priority >= cl->priority2)
1350 cl->priority2 = TC_CBQ_MAXPRIO-1;
1353 cbq_addprio(q, cl);
1354 return 0;
1357 static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl)
1359 switch (ovl->strategy) {
1360 case TC_CBQ_OVL_CLASSIC:
1361 cl->overlimit = cbq_ovl_classic;
1362 break;
1363 case TC_CBQ_OVL_DELAY:
1364 cl->overlimit = cbq_ovl_delay;
1365 break;
1366 case TC_CBQ_OVL_LOWPRIO:
1367 if (ovl->priority2-1 >= TC_CBQ_MAXPRIO ||
1368 ovl->priority2-1 <= cl->priority)
1369 return -EINVAL;
1370 cl->priority2 = ovl->priority2-1;
1371 cl->overlimit = cbq_ovl_lowprio;
1372 break;
1373 case TC_CBQ_OVL_DROP:
1374 cl->overlimit = cbq_ovl_drop;
1375 break;
1376 case TC_CBQ_OVL_RCLASSIC:
1377 cl->overlimit = cbq_ovl_rclassic;
1378 break;
1379 default:
1380 return -EINVAL;
1382 cl->penalty = (ovl->penalty*HZ)/1000;
1383 return 0;
1386 #ifdef CONFIG_NET_CLS_POLICE
1387 static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p)
1389 cl->police = p->police;
1391 if (cl->q->handle) {
1392 if (p->police == TC_POLICE_RECLASSIFY)
1393 cl->q->reshape_fail = cbq_reshape_fail;
1394 else
1395 cl->q->reshape_fail = NULL;
1397 return 0;
1399 #endif
1401 static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1403 cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1404 return 0;
1407 static int cbq_init(struct Qdisc *sch, struct rtattr *opt)
1409 struct cbq_sched_data *q = qdisc_priv(sch);
1410 struct rtattr *tb[TCA_CBQ_MAX];
1411 struct tc_ratespec *r;
1413 if (rtattr_parse_nested(tb, TCA_CBQ_MAX, opt) < 0 ||
1414 tb[TCA_CBQ_RTAB-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL ||
1415 RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec))
1416 return -EINVAL;
1418 if (tb[TCA_CBQ_LSSOPT-1] &&
1419 RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt))
1420 return -EINVAL;
1422 r = RTA_DATA(tb[TCA_CBQ_RATE-1]);
1424 if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB-1])) == NULL)
1425 return -EINVAL;
1427 q->link.refcnt = 1;
1428 q->link.sibling = &q->link;
1429 q->link.classid = sch->handle;
1430 q->link.qdisc = sch;
1431 if (!(q->link.q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops,
1432 sch->handle)))
1433 q->link.q = &noop_qdisc;
1435 q->link.priority = TC_CBQ_MAXPRIO-1;
1436 q->link.priority2 = TC_CBQ_MAXPRIO-1;
1437 q->link.cpriority = TC_CBQ_MAXPRIO-1;
1438 q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC;
1439 q->link.overlimit = cbq_ovl_classic;
1440 q->link.allot = psched_mtu(sch->dev);
1441 q->link.quantum = q->link.allot;
1442 q->link.weight = q->link.R_tab->rate.rate;
1444 q->link.ewma_log = TC_CBQ_DEF_EWMA;
1445 q->link.avpkt = q->link.allot/2;
1446 q->link.minidle = -0x7FFFFFFF;
1447 q->link.stats_lock = &sch->dev->queue_lock;
1449 init_timer(&q->wd_timer);
1450 q->wd_timer.data = (unsigned long)sch;
1451 q->wd_timer.function = cbq_watchdog;
1452 init_timer(&q->delay_timer);
1453 q->delay_timer.data = (unsigned long)sch;
1454 q->delay_timer.function = cbq_undelay;
1455 q->toplevel = TC_CBQ_MAXLEVEL;
1456 PSCHED_GET_TIME(q->now);
1457 q->now_rt = q->now;
1459 cbq_link_class(&q->link);
1461 if (tb[TCA_CBQ_LSSOPT-1])
1462 cbq_set_lss(&q->link, RTA_DATA(tb[TCA_CBQ_LSSOPT-1]));
1464 cbq_addprio(q, &q->link);
1465 return 0;
1468 static __inline__ int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1470 unsigned char *b = skb->tail;
1472 RTA_PUT(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate);
1473 return skb->len;
1475 rtattr_failure:
1476 skb_trim(skb, b - skb->data);
1477 return -1;
1480 static __inline__ int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1482 unsigned char *b = skb->tail;
1483 struct tc_cbq_lssopt opt;
1485 opt.flags = 0;
1486 if (cl->borrow == NULL)
1487 opt.flags |= TCF_CBQ_LSS_BOUNDED;
1488 if (cl->share == NULL)
1489 opt.flags |= TCF_CBQ_LSS_ISOLATED;
1490 opt.ewma_log = cl->ewma_log;
1491 opt.level = cl->level;
1492 opt.avpkt = cl->avpkt;
1493 opt.maxidle = cl->maxidle;
1494 opt.minidle = (u32)(-cl->minidle);
1495 opt.offtime = cl->offtime;
1496 opt.change = ~0;
1497 RTA_PUT(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt);
1498 return skb->len;
1500 rtattr_failure:
1501 skb_trim(skb, b - skb->data);
1502 return -1;
1505 static __inline__ int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1507 unsigned char *b = skb->tail;
1508 struct tc_cbq_wrropt opt;
1510 opt.flags = 0;
1511 opt.allot = cl->allot;
1512 opt.priority = cl->priority+1;
1513 opt.cpriority = cl->cpriority+1;
1514 opt.weight = cl->weight;
1515 RTA_PUT(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt);
1516 return skb->len;
1518 rtattr_failure:
1519 skb_trim(skb, b - skb->data);
1520 return -1;
1523 static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl)
1525 unsigned char *b = skb->tail;
1526 struct tc_cbq_ovl opt;
1528 opt.strategy = cl->ovl_strategy;
1529 opt.priority2 = cl->priority2+1;
1530 opt.pad = 0;
1531 opt.penalty = (cl->penalty*1000)/HZ;
1532 RTA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt);
1533 return skb->len;
1535 rtattr_failure:
1536 skb_trim(skb, b - skb->data);
1537 return -1;
1540 static __inline__ int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1542 unsigned char *b = skb->tail;
1543 struct tc_cbq_fopt opt;
1545 if (cl->split || cl->defmap) {
1546 opt.split = cl->split ? cl->split->classid : 0;
1547 opt.defmap = cl->defmap;
1548 opt.defchange = ~0;
1549 RTA_PUT(skb, TCA_CBQ_FOPT, sizeof(opt), &opt);
1551 return skb->len;
1553 rtattr_failure:
1554 skb_trim(skb, b - skb->data);
1555 return -1;
1558 #ifdef CONFIG_NET_CLS_POLICE
1559 static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)
1561 unsigned char *b = skb->tail;
1562 struct tc_cbq_police opt;
1564 if (cl->police) {
1565 opt.police = cl->police;
1566 opt.__res1 = 0;
1567 opt.__res2 = 0;
1568 RTA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt);
1570 return skb->len;
1572 rtattr_failure:
1573 skb_trim(skb, b - skb->data);
1574 return -1;
1576 #endif
1578 static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
1580 if (cbq_dump_lss(skb, cl) < 0 ||
1581 cbq_dump_rate(skb, cl) < 0 ||
1582 cbq_dump_wrr(skb, cl) < 0 ||
1583 cbq_dump_ovl(skb, cl) < 0 ||
1584 #ifdef CONFIG_NET_CLS_POLICE
1585 cbq_dump_police(skb, cl) < 0 ||
1586 #endif
1587 cbq_dump_fopt(skb, cl) < 0)
1588 return -1;
1589 return 0;
1592 static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
1594 struct cbq_sched_data *q = qdisc_priv(sch);
1595 unsigned char *b = skb->tail;
1596 struct rtattr *rta;
1598 rta = (struct rtattr*)b;
1599 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
1600 if (cbq_dump_attr(skb, &q->link) < 0)
1601 goto rtattr_failure;
1602 rta->rta_len = skb->tail - b;
1603 return skb->len;
1605 rtattr_failure:
1606 skb_trim(skb, b - skb->data);
1607 return -1;
1610 static int
1611 cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1613 struct cbq_sched_data *q = qdisc_priv(sch);
1615 q->link.xstats.avgidle = q->link.avgidle;
1616 return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
1619 static int
1620 cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1621 struct sk_buff *skb, struct tcmsg *tcm)
1623 struct cbq_class *cl = (struct cbq_class*)arg;
1624 unsigned char *b = skb->tail;
1625 struct rtattr *rta;
1627 if (cl->tparent)
1628 tcm->tcm_parent = cl->tparent->classid;
1629 else
1630 tcm->tcm_parent = TC_H_ROOT;
1631 tcm->tcm_handle = cl->classid;
1632 tcm->tcm_info = cl->q->handle;
1634 rta = (struct rtattr*)b;
1635 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
1636 if (cbq_dump_attr(skb, cl) < 0)
1637 goto rtattr_failure;
1638 rta->rta_len = skb->tail - b;
1639 return skb->len;
1641 rtattr_failure:
1642 skb_trim(skb, b - skb->data);
1643 return -1;
1646 static int
1647 cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1648 struct gnet_dump *d)
1650 struct cbq_sched_data *q = qdisc_priv(sch);
1651 struct cbq_class *cl = (struct cbq_class*)arg;
1653 cl->qstats.qlen = cl->q->q.qlen;
1654 cl->xstats.avgidle = cl->avgidle;
1655 cl->xstats.undertime = 0;
1657 if (!PSCHED_IS_PASTPERFECT(cl->undertime))
1658 cl->xstats.undertime = PSCHED_TDIFF(cl->undertime, q->now);
1660 if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
1661 #ifdef CONFIG_NET_ESTIMATOR
1662 gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1663 #endif
1664 gnet_stats_copy_queue(d, &cl->qstats) < 0)
1665 return -1;
1667 return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1670 static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1671 struct Qdisc **old)
1673 struct cbq_class *cl = (struct cbq_class*)arg;
1675 if (cl) {
1676 if (new == NULL) {
1677 if ((new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops,
1678 cl->classid)) == NULL)
1679 return -ENOBUFS;
1680 } else {
1681 #ifdef CONFIG_NET_CLS_POLICE
1682 if (cl->police == TC_POLICE_RECLASSIFY)
1683 new->reshape_fail = cbq_reshape_fail;
1684 #endif
1686 sch_tree_lock(sch);
1687 *old = xchg(&cl->q, new);
1688 qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
1689 qdisc_reset(*old);
1690 sch_tree_unlock(sch);
1692 return 0;
1694 return -ENOENT;
1697 static struct Qdisc *
1698 cbq_leaf(struct Qdisc *sch, unsigned long arg)
1700 struct cbq_class *cl = (struct cbq_class*)arg;
1702 return cl ? cl->q : NULL;
1705 static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
1707 struct cbq_class *cl = (struct cbq_class *)arg;
1709 if (cl->q->q.qlen == 0)
1710 cbq_deactivate_class(cl);
1713 static unsigned long cbq_get(struct Qdisc *sch, u32 classid)
1715 struct cbq_sched_data *q = qdisc_priv(sch);
1716 struct cbq_class *cl = cbq_class_lookup(q, classid);
1718 if (cl) {
1719 cl->refcnt++;
1720 return (unsigned long)cl;
1722 return 0;
1725 static void cbq_destroy_filters(struct cbq_class *cl)
1727 struct tcf_proto *tp;
1729 while ((tp = cl->filter_list) != NULL) {
1730 cl->filter_list = tp->next;
1731 tcf_destroy(tp);
1735 static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1737 struct cbq_sched_data *q = qdisc_priv(sch);
1739 BUG_TRAP(!cl->filters);
1741 cbq_destroy_filters(cl);
1742 qdisc_destroy(cl->q);
1743 qdisc_put_rtab(cl->R_tab);
1744 #ifdef CONFIG_NET_ESTIMATOR
1745 gen_kill_estimator(&cl->bstats, &cl->rate_est);
1746 #endif
1747 if (cl != &q->link)
1748 kfree(cl);
1751 static void
1752 cbq_destroy(struct Qdisc* sch)
1754 struct cbq_sched_data *q = qdisc_priv(sch);
1755 struct cbq_class *cl;
1756 unsigned h;
1758 #ifdef CONFIG_NET_CLS_POLICE
1759 q->rx_class = NULL;
1760 #endif
1762 * Filters must be destroyed first because we don't destroy the
1763 * classes from root to leaves, which means that filters can still
1764 * be bound to classes which have been destroyed already. --TGR '04
1766 for (h = 0; h < 16; h++)
1767 for (cl = q->classes[h]; cl; cl = cl->next)
1768 cbq_destroy_filters(cl);
1770 for (h = 0; h < 16; h++) {
1771 struct cbq_class *next;
1773 for (cl = q->classes[h]; cl; cl = next) {
1774 next = cl->next;
1775 cbq_destroy_class(sch, cl);
1780 static void cbq_put(struct Qdisc *sch, unsigned long arg)
1782 struct cbq_class *cl = (struct cbq_class*)arg;
1784 if (--cl->refcnt == 0) {
1785 #ifdef CONFIG_NET_CLS_POLICE
1786 struct cbq_sched_data *q = qdisc_priv(sch);
1788 spin_lock_bh(&sch->dev->queue_lock);
1789 if (q->rx_class == cl)
1790 q->rx_class = NULL;
1791 spin_unlock_bh(&sch->dev->queue_lock);
1792 #endif
1794 cbq_destroy_class(sch, cl);
1798 static int
1799 cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct rtattr **tca,
1800 unsigned long *arg)
1802 int err;
1803 struct cbq_sched_data *q = qdisc_priv(sch);
1804 struct cbq_class *cl = (struct cbq_class*)*arg;
1805 struct rtattr *opt = tca[TCA_OPTIONS-1];
1806 struct rtattr *tb[TCA_CBQ_MAX];
1807 struct cbq_class *parent;
1808 struct qdisc_rate_table *rtab = NULL;
1810 if (opt==NULL || rtattr_parse_nested(tb, TCA_CBQ_MAX, opt))
1811 return -EINVAL;
1813 if (tb[TCA_CBQ_OVL_STRATEGY-1] &&
1814 RTA_PAYLOAD(tb[TCA_CBQ_OVL_STRATEGY-1]) < sizeof(struct tc_cbq_ovl))
1815 return -EINVAL;
1817 if (tb[TCA_CBQ_FOPT-1] &&
1818 RTA_PAYLOAD(tb[TCA_CBQ_FOPT-1]) < sizeof(struct tc_cbq_fopt))
1819 return -EINVAL;
1821 if (tb[TCA_CBQ_RATE-1] &&
1822 RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec))
1823 return -EINVAL;
1825 if (tb[TCA_CBQ_LSSOPT-1] &&
1826 RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt))
1827 return -EINVAL;
1829 if (tb[TCA_CBQ_WRROPT-1] &&
1830 RTA_PAYLOAD(tb[TCA_CBQ_WRROPT-1]) < sizeof(struct tc_cbq_wrropt))
1831 return -EINVAL;
1833 #ifdef CONFIG_NET_CLS_POLICE
1834 if (tb[TCA_CBQ_POLICE-1] &&
1835 RTA_PAYLOAD(tb[TCA_CBQ_POLICE-1]) < sizeof(struct tc_cbq_police))
1836 return -EINVAL;
1837 #endif
1839 if (cl) {
1840 /* Check parent */
1841 if (parentid) {
1842 if (cl->tparent && cl->tparent->classid != parentid)
1843 return -EINVAL;
1844 if (!cl->tparent && parentid != TC_H_ROOT)
1845 return -EINVAL;
1848 if (tb[TCA_CBQ_RATE-1]) {
1849 rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]);
1850 if (rtab == NULL)
1851 return -EINVAL;
1854 /* Change class parameters */
1855 sch_tree_lock(sch);
1857 if (cl->next_alive != NULL)
1858 cbq_deactivate_class(cl);
1860 if (rtab) {
1861 rtab = xchg(&cl->R_tab, rtab);
1862 qdisc_put_rtab(rtab);
1865 if (tb[TCA_CBQ_LSSOPT-1])
1866 cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1]));
1868 if (tb[TCA_CBQ_WRROPT-1]) {
1869 cbq_rmprio(q, cl);
1870 cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1]));
1873 if (tb[TCA_CBQ_OVL_STRATEGY-1])
1874 cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1]));
1876 #ifdef CONFIG_NET_CLS_POLICE
1877 if (tb[TCA_CBQ_POLICE-1])
1878 cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1]));
1879 #endif
1881 if (tb[TCA_CBQ_FOPT-1])
1882 cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1]));
1884 if (cl->q->q.qlen)
1885 cbq_activate_class(cl);
1887 sch_tree_unlock(sch);
1889 #ifdef CONFIG_NET_ESTIMATOR
1890 if (tca[TCA_RATE-1])
1891 gen_replace_estimator(&cl->bstats, &cl->rate_est,
1892 cl->stats_lock, tca[TCA_RATE-1]);
1893 #endif
1894 return 0;
1897 if (parentid == TC_H_ROOT)
1898 return -EINVAL;
1900 if (tb[TCA_CBQ_WRROPT-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL ||
1901 tb[TCA_CBQ_LSSOPT-1] == NULL)
1902 return -EINVAL;
1904 rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]);
1905 if (rtab == NULL)
1906 return -EINVAL;
1908 if (classid) {
1909 err = -EINVAL;
1910 if (TC_H_MAJ(classid^sch->handle) || cbq_class_lookup(q, classid))
1911 goto failure;
1912 } else {
1913 int i;
1914 classid = TC_H_MAKE(sch->handle,0x8000);
1916 for (i=0; i<0x8000; i++) {
1917 if (++q->hgenerator >= 0x8000)
1918 q->hgenerator = 1;
1919 if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1920 break;
1922 err = -ENOSR;
1923 if (i >= 0x8000)
1924 goto failure;
1925 classid = classid|q->hgenerator;
1928 parent = &q->link;
1929 if (parentid) {
1930 parent = cbq_class_lookup(q, parentid);
1931 err = -EINVAL;
1932 if (parent == NULL)
1933 goto failure;
1936 err = -ENOBUFS;
1937 cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1938 if (cl == NULL)
1939 goto failure;
1940 cl->R_tab = rtab;
1941 rtab = NULL;
1942 cl->refcnt = 1;
1943 if (!(cl->q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops, classid)))
1944 cl->q = &noop_qdisc;
1945 cl->classid = classid;
1946 cl->tparent = parent;
1947 cl->qdisc = sch;
1948 cl->allot = parent->allot;
1949 cl->quantum = cl->allot;
1950 cl->weight = cl->R_tab->rate.rate;
1951 cl->stats_lock = &sch->dev->queue_lock;
1953 sch_tree_lock(sch);
1954 cbq_link_class(cl);
1955 cl->borrow = cl->tparent;
1956 if (cl->tparent != &q->link)
1957 cl->share = cl->tparent;
1958 cbq_adjust_levels(parent);
1959 cl->minidle = -0x7FFFFFFF;
1960 cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1]));
1961 cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1]));
1962 if (cl->ewma_log==0)
1963 cl->ewma_log = q->link.ewma_log;
1964 if (cl->maxidle==0)
1965 cl->maxidle = q->link.maxidle;
1966 if (cl->avpkt==0)
1967 cl->avpkt = q->link.avpkt;
1968 cl->overlimit = cbq_ovl_classic;
1969 if (tb[TCA_CBQ_OVL_STRATEGY-1])
1970 cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1]));
1971 #ifdef CONFIG_NET_CLS_POLICE
1972 if (tb[TCA_CBQ_POLICE-1])
1973 cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1]));
1974 #endif
1975 if (tb[TCA_CBQ_FOPT-1])
1976 cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1]));
1977 sch_tree_unlock(sch);
1979 #ifdef CONFIG_NET_ESTIMATOR
1980 if (tca[TCA_RATE-1])
1981 gen_new_estimator(&cl->bstats, &cl->rate_est,
1982 cl->stats_lock, tca[TCA_RATE-1]);
1983 #endif
1985 *arg = (unsigned long)cl;
1986 return 0;
1988 failure:
1989 qdisc_put_rtab(rtab);
1990 return err;
1993 static int cbq_delete(struct Qdisc *sch, unsigned long arg)
1995 struct cbq_sched_data *q = qdisc_priv(sch);
1996 struct cbq_class *cl = (struct cbq_class*)arg;
1997 unsigned int qlen;
1999 if (cl->filters || cl->children || cl == &q->link)
2000 return -EBUSY;
2002 sch_tree_lock(sch);
2004 qlen = cl->q->q.qlen;
2005 qdisc_reset(cl->q);
2006 qdisc_tree_decrease_qlen(cl->q, qlen);
2008 if (cl->next_alive)
2009 cbq_deactivate_class(cl);
2011 if (q->tx_borrowed == cl)
2012 q->tx_borrowed = q->tx_class;
2013 if (q->tx_class == cl) {
2014 q->tx_class = NULL;
2015 q->tx_borrowed = NULL;
2017 #ifdef CONFIG_NET_CLS_POLICE
2018 if (q->rx_class == cl)
2019 q->rx_class = NULL;
2020 #endif
2022 cbq_unlink_class(cl);
2023 cbq_adjust_levels(cl->tparent);
2024 cl->defmap = 0;
2025 cbq_sync_defmap(cl);
2027 cbq_rmprio(q, cl);
2028 sch_tree_unlock(sch);
2030 if (--cl->refcnt == 0)
2031 cbq_destroy_class(sch, cl);
2033 return 0;
2036 static struct tcf_proto **cbq_find_tcf(struct Qdisc *sch, unsigned long arg)
2038 struct cbq_sched_data *q = qdisc_priv(sch);
2039 struct cbq_class *cl = (struct cbq_class *)arg;
2041 if (cl == NULL)
2042 cl = &q->link;
2044 return &cl->filter_list;
2047 static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
2048 u32 classid)
2050 struct cbq_sched_data *q = qdisc_priv(sch);
2051 struct cbq_class *p = (struct cbq_class*)parent;
2052 struct cbq_class *cl = cbq_class_lookup(q, classid);
2054 if (cl) {
2055 if (p && p->level <= cl->level)
2056 return 0;
2057 cl->filters++;
2058 return (unsigned long)cl;
2060 return 0;
2063 static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
2065 struct cbq_class *cl = (struct cbq_class*)arg;
2067 cl->filters--;
2070 static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
2072 struct cbq_sched_data *q = qdisc_priv(sch);
2073 unsigned h;
2075 if (arg->stop)
2076 return;
2078 for (h = 0; h < 16; h++) {
2079 struct cbq_class *cl;
2081 for (cl = q->classes[h]; cl; cl = cl->next) {
2082 if (arg->count < arg->skip) {
2083 arg->count++;
2084 continue;
2086 if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
2087 arg->stop = 1;
2088 return;
2090 arg->count++;
2095 static struct Qdisc_class_ops cbq_class_ops = {
2096 .graft = cbq_graft,
2097 .leaf = cbq_leaf,
2098 .qlen_notify = cbq_qlen_notify,
2099 .get = cbq_get,
2100 .put = cbq_put,
2101 .change = cbq_change_class,
2102 .delete = cbq_delete,
2103 .walk = cbq_walk,
2104 .tcf_chain = cbq_find_tcf,
2105 .bind_tcf = cbq_bind_filter,
2106 .unbind_tcf = cbq_unbind_filter,
2107 .dump = cbq_dump_class,
2108 .dump_stats = cbq_dump_class_stats,
2111 static struct Qdisc_ops cbq_qdisc_ops = {
2112 .next = NULL,
2113 .cl_ops = &cbq_class_ops,
2114 .id = "cbq",
2115 .priv_size = sizeof(struct cbq_sched_data),
2116 .enqueue = cbq_enqueue,
2117 .dequeue = cbq_dequeue,
2118 .requeue = cbq_requeue,
2119 .drop = cbq_drop,
2120 .init = cbq_init,
2121 .reset = cbq_reset,
2122 .destroy = cbq_destroy,
2123 .change = NULL,
2124 .dump = cbq_dump,
2125 .dump_stats = cbq_dump_stats,
2126 .owner = THIS_MODULE,
2129 static int __init cbq_module_init(void)
2131 return register_qdisc(&cbq_qdisc_ops);
2133 static void __exit cbq_module_exit(void)
2135 unregister_qdisc(&cbq_qdisc_ops);
2137 module_init(cbq_module_init)
2138 module_exit(cbq_module_exit)
2139 MODULE_LICENSE("GPL");