/*
 * net/sched/sch_cbq.c	Class-Based Queueing discipline.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 */

#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/notifier.h>
#include <net/ip.h>
#include <net/netlink.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
/*	Class-Based Queueing (CBQ) algorithm.
	=======================================

	Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
		 Management Models for Packet Networks",
		 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995

		 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995

		 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
		 Parameters", 1996

		 [4] Sally Floyd and Michael Speer, "Experimental Results
		 for Class-Based Queueing", 1998, not published.

	-----------------------------------------------------------------------

	Algorithm skeleton was taken from NS simulator cbq.cc.
	If someone wants to check this code against the LBL version,
	he should take into account that ONLY the skeleton was borrowed,
	the implementation is different. Particularly:

	--- The WRR algorithm is different. Our version looks more
	reasonable (I hope) and works when quanta are allowed to be
	less than MTU, which is always the case when real time classes
	have small rates. Note that the statement of [3] is
	incomplete: delay may actually be estimated even if the class
	per-round allotment is less than MTU. Namely, if the per-round
	allotment is W*r_i, and r_1+...+r_k = r < 1, then

	delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B

	In the worst case we have an IntServ estimate with D = W*r + k*MTU
	and C = MTU*r. The proof (if correct at all) is trivial.

	--- It seems that cbq-2.0 is not very accurate. At least, I cannot
	interpret some places, which look like wrong translations
	from NS. Anyone is advised to find these differences
	and explain to me why I am wrong 8).

	--- Linux has no EOI event, so that we cannot estimate true class
	idle time. The workaround is to consider the next dequeue event
	as a sign that the previous packet is finished. This is wrong because of
	internal device queueing, but on a permanently loaded link it is true.
	Moreover, combined with the clock integrator, this scheme looks
	very close to an ideal solution.  */
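
/* Illustrative plug-in for the delay bound above (hypothetical numbers, not
   from the original text, assuming the r_i are rate fractions of the link
   bandwidth B and all lengths are in bytes): with MTU = 1500, per-round
   allotment W*r_i = 500 where r_i = 0.1 and r = 0.5 (so W = 5000, W*r = 2500),
   and k = 3 backlogged real-time classes,

	delay_i <= ([1500/500]*2500 + 2500 + 3*1500)/B = 14500/B,

   i.e. roughly 11.6 ms on a 10 Mbit/s (1.25 MB/s) link. */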
struct cbq_sched_data;


struct cbq_class
{
	struct cbq_class *next;		/* hash table link */
	struct cbq_class *next_alive;	/* next class with backlog in this priority band */

/* Parameters */
	u32 classid;
	unsigned char priority;		/* class priority */
	unsigned char priority2;	/* priority to be used after overlimit */
	unsigned char ewma_log;		/* time constant for idle time calculation */
	unsigned char ovl_strategy;
#ifdef CONFIG_NET_CLS_POLICE
	unsigned char police;
#endif

	u32 defmap;

	/* Link-sharing scheduler parameters */
	long maxidle;			/* Class parameters: see below. */
	long offtime;
	long minidle;
	u32 avpkt;
	struct qdisc_rate_table *R_tab;

	/* Overlimit strategy parameters */
	void (*overlimit)(struct cbq_class *cl);
	psched_tdiff_t penalty;

	/* General scheduler (WRR) parameters */
	long allot;
	long quantum;			/* Allotment per WRR round */
	long weight;			/* Relative allotment: see below */

	struct Qdisc *qdisc;		/* Ptr to CBQ discipline */
	struct cbq_class *split;	/* Ptr to split node */
	struct cbq_class *share;	/* Ptr to LS parent in the class tree */
	struct cbq_class *tparent;	/* Ptr to tree parent in the class tree */
	struct cbq_class *borrow;	/* NULL if class is bandwidth limited;
					   parent otherwise */
	struct cbq_class *sibling;	/* Sibling chain */
	struct cbq_class *children;	/* Pointer to children chain */

	struct Qdisc *q;		/* Elementary queueing discipline */


/* Variables */
	unsigned char cpriority;	/* Effective priority */
	unsigned char delayed;
	unsigned char level;		/* level of the class in hierarchy:
					   0 for leaf classes, and maximal
					   level of children + 1 for nodes.
					 */

	psched_time_t last;		/* Last end of service */
	psched_time_t undertime;
	long avgidle;
	long deficit;			/* Saved deficit for WRR */
	psched_time_t penalized;
	struct gnet_stats_basic bstats;
	struct gnet_stats_queue qstats;
	struct gnet_stats_rate_est rate_est;
	spinlock_t *stats_lock;
	struct tc_cbq_xstats xstats;

	struct tcf_proto *filter_list;

	int refcnt;
	int filters;

	struct cbq_class *defaults[TC_PRIO_MAX+1];
};

struct cbq_sched_data
{
	struct cbq_class *classes[16];		/* Hash table of all classes */
	int nclasses[TC_CBQ_MAXPRIO+1];
	unsigned quanta[TC_CBQ_MAXPRIO+1];

	struct cbq_class link;

	unsigned activemask;
	struct cbq_class *active[TC_CBQ_MAXPRIO+1];	/* List of all classes
							   with backlog */

#ifdef CONFIG_NET_CLS_POLICE
	struct cbq_class *rx_class;
#endif
	struct cbq_class *tx_class;
	struct cbq_class *tx_borrowed;
	int tx_len;
	psched_time_t now;		/* Cached timestamp */
	psched_time_t now_rt;		/* Cached real time */
	unsigned pmask;

	struct hrtimer delay_timer;
	struct qdisc_watchdog watchdog;	/* Watchdog timer,
					   started when CBQ has
					   backlog, but cannot
					   transmit just now */
	psched_tdiff_t wd_expires;
	int toplevel;
	u32 hgenerator;
};


#define L2T(cl,len)	qdisc_l2t((cl)->R_tab,len)
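
/* L2T ("length to time") looks up, in the class's qdisc_rate_table, the
   scheduler-clock time needed to transmit 'len' bytes at the class's
   configured rate; L2T(&q->link, len) gives the same quantity at the
   physical link rate. */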
static __inline__ unsigned cbq_hash(u32 h)
{
	h ^= h>>8;
	h ^= h>>4;
	return h&0xF;
}

static __inline__ struct cbq_class *
cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
{
	struct cbq_class *cl;

	for (cl = q->classes[cbq_hash(classid)]; cl; cl = cl->next)
		if (cl->classid == classid)
			return cl;
	return NULL;
}

#ifdef CONFIG_NET_CLS_POLICE

static struct cbq_class *
cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
{
	struct cbq_class *cl, *new;

	for (cl = this->tparent; cl; cl = cl->tparent)
		if ((new = cl->defaults[TC_PRIO_BESTEFFORT]) != NULL && new != this)
			return new;

	return NULL;
}

#endif

/* Classify packet. The procedure is pretty complicated, but
   it allows us to combine link sharing and priority scheduling
   transparently.

   Namely, you can put link sharing rules (f.e. route based) at the root of
   CBQ, so that it resolves to split nodes. Then packets are classified
   by logical priority, or a more specific classifier may be attached
   to the split node.
 */
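
/* For illustration (a hypothetical setup, not from the original text): a root
   filter can direct flows to an inner "split" class; leaf classes under that
   split set bits in their defmap, and cbq_sync_defmap() then fills
   split->defaults[] so that a packet of logical priority p, which no more
   specific filter claims at the split node, lands in the leaf whose defmap
   has bit p set, falling back to the TC_PRIO_BESTEFFORT default. */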
static struct cbq_class *
cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *head = &q->link;
	struct cbq_class **defmap;
	struct cbq_class *cl = NULL;
	u32 prio = skb->priority;
	struct tcf_result res;

	/*
	 *  Step 1. If skb->priority points to one of our classes, use it.
	 */
	if (TC_H_MAJ(prio^sch->handle) == 0 &&
	    (cl = cbq_class_lookup(q, prio)) != NULL)
		return cl;

	*qerr = NET_XMIT_BYPASS;
	for (;;) {
		int result = 0;
		defmap = head->defaults;

		/*
		 * Step 2+n. Apply classifier.
		 */
		if (!head->filter_list || (result = tc_classify(skb, head->filter_list, &res)) < 0)
			goto fallback;

		if ((cl = (void*)res.class) == NULL) {
			if (TC_H_MAJ(res.classid))
				cl = cbq_class_lookup(q, res.classid);
			else if ((cl = defmap[res.classid&TC_PRIO_MAX]) == NULL)
				cl = defmap[TC_PRIO_BESTEFFORT];

			if (cl == NULL || cl->level >= head->level)
				goto fallback;
		}

#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_QUEUED:
		case TC_ACT_STOLEN:
			*qerr = NET_XMIT_SUCCESS;
		case TC_ACT_SHOT:
			return NULL;
		}
#elif defined(CONFIG_NET_CLS_POLICE)
		switch (result) {
		case TC_POLICE_RECLASSIFY:
			return cbq_reclassify(skb, cl);
		case TC_POLICE_SHOT:
			return NULL;
		default:
			break;
		}
#endif
		if (cl->level == 0)
			return cl;

		/*
		 * Step 3+n. If the classifier selected a link sharing class,
		 * apply the agency specific classifier.
		 * Repeat this procedure until we hit a leaf node.
		 */
		head = cl;
	}

fallback:
	cl = head;

	/*
	 * Step 4. No success...
	 */
	if (TC_H_MAJ(prio) == 0 &&
	    !(cl = head->defaults[prio&TC_PRIO_MAX]) &&
	    !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
		return head;

	return cl;
}
/*
   A packet has just been enqueued on an empty class.
   cbq_activate_class adds it to the tail of the active class list
   of its priority band.
 */

static __inline__ void cbq_activate_class(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	int prio = cl->cpriority;
	struct cbq_class *cl_tail;

	cl_tail = q->active[prio];
	q->active[prio] = cl;

	if (cl_tail != NULL) {
		cl->next_alive = cl_tail->next_alive;
		cl_tail->next_alive = cl;
	} else {
		cl->next_alive = cl;
		q->activemask |= (1<<prio);
	}
}

/*
   Unlink a class from the active chain.
   Note that the same thing is done directly in cbq_dequeue*
   during the round-robin procedure.
 */

static void cbq_deactivate_class(struct cbq_class *this)
{
	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
	int prio = this->cpriority;
	struct cbq_class *cl;
	struct cbq_class *cl_prev = q->active[prio];

	do {
		cl = cl_prev->next_alive;
		if (cl == this) {
			cl_prev->next_alive = cl->next_alive;
			cl->next_alive = NULL;

			if (cl == q->active[prio]) {
				q->active[prio] = cl_prev;
				if (cl == q->active[prio]) {
					q->active[prio] = NULL;
					q->activemask &= ~(1<<prio);
					return;
				}
			}
			return;
		}
	} while ((cl_prev = cl) != q->active[prio]);
}

static void
cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
{
	int toplevel = q->toplevel;

	if (toplevel > cl->level && !(cl->q->flags&TCQ_F_THROTTLED)) {
		psched_time_t now;
		psched_tdiff_t incr;

		now = psched_get_time();
		incr = now - q->now_rt;
		now = q->now + incr;

		do {
			if (cl->undertime < now) {
				q->toplevel = cl->level;
				return;
			}
		} while ((cl=cl->borrow) != NULL && toplevel > cl->level);
	}
}

static int
cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	int len = skb->len;
	int ret;
	struct cbq_class *cl = cbq_classify(skb, sch, &ret);

#ifdef CONFIG_NET_CLS_POLICE
	q->rx_class = cl;
#endif
	if (cl == NULL) {
		if (ret == NET_XMIT_BYPASS)
			sch->qstats.drops++;
		kfree_skb(skb);
		return ret;
	}

#ifdef CONFIG_NET_CLS_POLICE
	cl->q->__parent = sch;
#endif
	if ((ret = cl->q->enqueue(skb, cl->q)) == NET_XMIT_SUCCESS) {
		sch->q.qlen++;
		sch->bstats.packets++;
		sch->bstats.bytes += len;
		cbq_mark_toplevel(q, cl);
		if (!cl->next_alive)
			cbq_activate_class(cl);
		return ret;
	}

	sch->qstats.drops++;
	cbq_mark_toplevel(q, cl);
	cl->qstats.drops++;
	return ret;
}

static int
cbq_requeue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl;
	int ret;

	if ((cl = q->tx_class) == NULL) {
		kfree_skb(skb);
		sch->qstats.drops++;
		return NET_XMIT_CN;
	}
	q->tx_class = NULL;

	cbq_mark_toplevel(q, cl);

#ifdef CONFIG_NET_CLS_POLICE
	q->rx_class = cl;
	cl->q->__parent = sch;
#endif
	if ((ret = cl->q->ops->requeue(skb, cl->q)) == 0) {
		sch->q.qlen++;
		sch->qstats.requeues++;
		if (!cl->next_alive)
			cbq_activate_class(cl);
		return 0;
	}
	sch->qstats.drops++;
	cl->qstats.drops++;
	return ret;
}
/* Overlimit actions */

/* TC_CBQ_OVL_CLASSIC: (default) penalize the leaf class by adding offtime */

static void cbq_ovl_classic(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	psched_tdiff_t delay = cl->undertime - q->now;

	if (!cl->delayed) {
		delay += cl->offtime;

		/*
		   The class goes to sleep, so that it will have no
		   chance to work avgidle. Let's forgive it 8)

		   BTW cbq-2.0 has a bug in this place;
		   apparently they forgot to shift it by cl->ewma_log.
		 */
		if (cl->avgidle < 0)
			delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
		if (cl->avgidle < cl->minidle)
			cl->avgidle = cl->minidle;
		if (delay <= 0)
			delay = 1;
		cl->undertime = q->now + delay;

		cl->xstats.overactions++;
		cl->delayed = 1;
	}
	if (q->wd_expires == 0 || q->wd_expires > delay)
		q->wd_expires = delay;

	/* Dirty work! We must schedule wakeups based on the
	   real available rate, rather than the leaf rate,
	   which may be tiny (even zero).
	 */
	if (q->toplevel == TC_CBQ_MAXLEVEL) {
		struct cbq_class *b;
		psched_tdiff_t base_delay = q->wd_expires;

		for (b = cl->borrow; b; b = b->borrow) {
			delay = b->undertime - q->now;
			if (delay < base_delay) {
				if (delay <= 0)
					delay = 1;
				base_delay = delay;
			}
		}

		q->wd_expires = base_delay;
	}
}

/* TC_CBQ_OVL_RCLASSIC: penalize, by offtime, classes in the hierarchy
   when they go overlimit
 */

static void cbq_ovl_rclassic(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	struct cbq_class *this = cl;

	do {
		if (cl->level > q->toplevel) {
			cl = NULL;
			break;
		}
	} while ((cl = cl->borrow) != NULL);

	if (cl == NULL)
		cl = this;
	cbq_ovl_classic(cl);
}

/* TC_CBQ_OVL_DELAY: delay until the class goes underlimit */

static void cbq_ovl_delay(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	psched_tdiff_t delay = cl->undertime - q->now;

	if (!cl->delayed) {
		psched_time_t sched = q->now;
		ktime_t expires;

		delay += cl->offtime;
		if (cl->avgidle < 0)
			delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
		if (cl->avgidle < cl->minidle)
			cl->avgidle = cl->minidle;
		cl->undertime = q->now + delay;

		if (delay > 0) {
			sched += delay + cl->penalty;
			cl->penalized = sched;
			cl->cpriority = TC_CBQ_MAXPRIO;
			q->pmask |= (1<<TC_CBQ_MAXPRIO);

			expires = ktime_set(0, 0);
			expires = ktime_add_ns(expires, PSCHED_US2NS(sched));
			if (hrtimer_try_to_cancel(&q->delay_timer) &&
			    ktime_to_ns(ktime_sub(q->delay_timer.expires,
						  expires)) > 0)
				q->delay_timer.expires = expires;
			hrtimer_restart(&q->delay_timer);
			cl->delayed = 1;
			cl->xstats.overactions++;
			return;
		}
		delay = 1;
	}
	if (q->wd_expires == 0 || q->wd_expires > delay)
		q->wd_expires = delay;
}

/* TC_CBQ_OVL_LOWPRIO: penalize the class by lowering its priority band */

static void cbq_ovl_lowprio(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);

	cl->penalized = q->now + cl->penalty;

	if (cl->cpriority != cl->priority2) {
		cl->cpriority = cl->priority2;
		q->pmask |= (1<<cl->cpriority);
		cl->xstats.overactions++;
	}
	cbq_ovl_classic(cl);
}

/* TC_CBQ_OVL_DROP: penalize the class by dropping */

static void cbq_ovl_drop(struct cbq_class *cl)
{
	if (cl->q->ops->drop)
		if (cl->q->ops->drop(cl->q))
			cl->qdisc->q.qlen--;
	cl->xstats.overactions++;
	cbq_ovl_classic(cl);
}
static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
				       psched_time_t now)
{
	struct cbq_class *cl;
	struct cbq_class *cl_prev = q->active[prio];
	psched_time_t sched = now;

	if (cl_prev == NULL)
		return 0;

	do {
		cl = cl_prev->next_alive;
		if (now - cl->penalized > 0) {
			cl_prev->next_alive = cl->next_alive;
			cl->next_alive = NULL;
			cl->cpriority = cl->priority;
			cl->delayed = 0;
			cbq_activate_class(cl);

			if (cl == q->active[prio]) {
				q->active[prio] = cl_prev;
				if (cl == q->active[prio]) {
					q->active[prio] = NULL;
					return 0;
				}
			}

			cl = cl_prev->next_alive;
		} else if (sched - cl->penalized > 0)
			sched = cl->penalized;
	} while ((cl_prev = cl) != q->active[prio]);

	return sched - now;
}

static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
{
	struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
						delay_timer);
	struct Qdisc *sch = q->watchdog.qdisc;
	psched_time_t now;
	psched_tdiff_t delay = 0;
	unsigned pmask;

	now = psched_get_time();

	pmask = q->pmask;
	q->pmask = 0;

	while (pmask) {
		int prio = ffz(~pmask);
		psched_tdiff_t tmp;

		pmask &= ~(1<<prio);

		tmp = cbq_undelay_prio(q, prio, now);
		if (tmp > 0) {
			q->pmask |= 1<<prio;
			if (tmp < delay || delay == 0)
				delay = tmp;
		}
	}

	if (delay) {
		ktime_t time;

		time = ktime_set(0, 0);
		time = ktime_add_ns(time, PSCHED_US2NS(now + delay));
		hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS);
	}

	sch->flags &= ~TCQ_F_THROTTLED;
	netif_schedule(sch->dev);
	return HRTIMER_NORESTART;
}


#ifdef CONFIG_NET_CLS_POLICE

static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
{
	int len = skb->len;
	struct Qdisc *sch = child->__parent;
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl = q->rx_class;

	q->rx_class = NULL;

	if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) {

		cbq_mark_toplevel(q, cl);

		q->rx_class = cl;
		cl->q->__parent = sch;

		if (cl->q->enqueue(skb, cl->q) == 0) {
			sch->q.qlen++;
			sch->bstats.packets++;
			sch->bstats.bytes += len;
			if (!cl->next_alive)
				cbq_activate_class(cl);
			return 0;
		}
		sch->qstats.drops++;
		return 0;
	}

	sch->qstats.drops++;
	return -1;
}
#endif
/*
   This is a mission-critical procedure.

   We "regenerate" the toplevel cutoff if the transmitting class
   has backlog and is not regulated. This is not part of the
   original CBQ description, but looks more reasonable.
   Probably it is wrong; this question needs further investigation.
 */

static __inline__ void
cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
		    struct cbq_class *borrowed)
{
	if (cl && q->toplevel >= borrowed->level) {
		if (cl->q->q.qlen > 1) {
			do {
				if (borrowed->undertime == PSCHED_PASTPERFECT) {
					q->toplevel = borrowed->level;
					return;
				}
			} while ((borrowed=borrowed->borrow) != NULL);
		}
#if 0
	/* It is not necessary now. Uncommenting it
	   will save CPU cycles, but decrease fairness.
	 */
		q->toplevel = TC_CBQ_MAXLEVEL;
#endif
	}
}

static void
cbq_update(struct cbq_sched_data *q)
{
	struct cbq_class *this = q->tx_class;
	struct cbq_class *cl = this;
	int len = q->tx_len;

	q->tx_class = NULL;

	for ( ; cl; cl = cl->share) {
		long avgidle = cl->avgidle;
		long idle;

		cl->bstats.packets++;
		cl->bstats.bytes += len;

		/*
		   (now - last) is the total time between packet right edges.
		   (last_pktlen/rate) is the "virtual" busy time, so that

			 idle = (now - last) - last_pktlen/rate
		 */

		idle = q->now - cl->last;
		if ((unsigned long)idle > 128*1024*1024) {
			avgidle = cl->maxidle;
		} else {
			idle -= L2T(cl, len);

			/* true_avgidle := (1-W)*true_avgidle + W*idle,
			   where W=2^{-ewma_log}. But cl->avgidle is scaled:
			   cl->avgidle == true_avgidle/W,
			   hence:
			 */
			avgidle += idle - (avgidle>>cl->ewma_log);
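			/* Worked check of the scaling above (illustrative):
			   with ewma_log = 5, W = 1/32 and
			   cl->avgidle = 32*true_avgidle, so
			   32*((1 - 1/32)*true_avgidle + idle/32)
			     = 32*true_avgidle - true_avgidle + idle
			     = avgidle + idle - (avgidle >> 5),
			   which is exactly the update performed here. */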
		}

		if (avgidle <= 0) {
			/* Overlimit or at-limit */

			if (avgidle < cl->minidle)
				avgidle = cl->minidle;

			cl->avgidle = avgidle;

			/* Calculate the expected time when this class
			   will be allowed to send.
			   It will occur when:
			   (1-W)*true_avgidle + W*delay = 0, i.e.
			   idle = (1/W - 1)*(-true_avgidle)
			   or
			   idle = (1 - W)*(-cl->avgidle);
			 */
			idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);

			/*
			   That is not all.
			   To maintain the rate allocated to the class,
			   we add to undertime the virtual clock time
			   necessary to complete the transmitted packet.
			   (len/phys_bandwidth has already passed
			   by the moment of cbq_update.)
			 */

			idle -= L2T(&q->link, len);
			idle += L2T(cl, len);

			cl->undertime = q->now + idle;
		} else {
			/* Underlimit */

			cl->undertime = PSCHED_PASTPERFECT;
			if (avgidle > cl->maxidle)
				cl->avgidle = cl->maxidle;
			else
				cl->avgidle = avgidle;
		}
		cl->last = q->now;
	}

	cbq_update_toplevel(q, this, q->tx_borrowed);
}

static __inline__ struct cbq_class *
cbq_under_limit(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	struct cbq_class *this_cl = cl;

	if (cl->tparent == NULL)
		return cl;

	if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
		cl->delayed = 0;
		return cl;
	}

	do {
		/* This is a very suspicious place. Now the overlimit
		   action is generated for not bounded classes
		   only if the link is completely congested.
		   Though it agrees with the ancestor-only paradigm,
		   it looks very stupid. Particularly,
		   it means that this chunk of code will either
		   never be called or result in strong amplification
		   of burstiness. Dangerous, silly, and, however,
		   no other solution exists.
		 */
		if ((cl = cl->borrow) == NULL) {
			this_cl->qstats.overlimits++;
			this_cl->overlimit(this_cl);
			return NULL;
		}
		if (cl->level > q->toplevel)
			return NULL;
	} while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);

	cl->delayed = 0;
	return cl;
}
static __inline__ struct sk_buff *
cbq_dequeue_prio(struct Qdisc *sch, int prio)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl_tail, *cl_prev, *cl;
	struct sk_buff *skb;
	int deficit;

	cl_tail = cl_prev = q->active[prio];
	cl = cl_prev->next_alive;

	do {
		deficit = 0;

		/* Start round */
		do {
			struct cbq_class *borrow = cl;

			if (cl->q->q.qlen &&
			    (borrow = cbq_under_limit(cl)) == NULL)
				goto skip_class;

			if (cl->deficit <= 0) {
				/* Class exhausted its allotment per
				   this round. Switch to the next one.
				 */
				deficit = 1;
				cl->deficit += cl->quantum;
				goto next_class;
			}

			skb = cl->q->dequeue(cl->q);

			/* Class did not give us any skb :-(
			   It could occur even if cl->q->q.qlen != 0
			   f.e. if cl->q == "tbf"
			 */
			if (skb == NULL)
				goto skip_class;

			cl->deficit -= skb->len;
			q->tx_class = cl;
			q->tx_borrowed = borrow;
			if (borrow != cl) {
#ifndef CBQ_XSTATS_BORROWS_BYTES
				borrow->xstats.borrows++;
				cl->xstats.borrows++;
#else
				borrow->xstats.borrows += skb->len;
				cl->xstats.borrows += skb->len;
#endif
			}
			q->tx_len = skb->len;

			if (cl->deficit <= 0) {
				q->active[prio] = cl;
				cl = cl->next_alive;
				cl->deficit += cl->quantum;
			}
			return skb;

skip_class:
			if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
				/* Class is empty or penalized.
				   Unlink it from the active chain.
				 */
				cl_prev->next_alive = cl->next_alive;
				cl->next_alive = NULL;

				/* Did cl_tail point to it? */
				if (cl == cl_tail) {
					/* Repair it! */
					cl_tail = cl_prev;

					/* Was it the last class in this band? */
					if (cl == cl_tail) {
						/* Kill the band! */
						q->active[prio] = NULL;
						q->activemask &= ~(1<<prio);
						if (cl->q->q.qlen)
							cbq_activate_class(cl);
						return NULL;
					}

					q->active[prio] = cl_tail;
				}
				if (cl->q->q.qlen)
					cbq_activate_class(cl);

				cl = cl_prev;
			}

next_class:
			cl_prev = cl;
			cl = cl->next_alive;
		} while (cl_prev != cl_tail);
	} while (deficit);

	q->active[prio] = cl_prev;

	return NULL;
}
static __inline__ struct sk_buff *
cbq_dequeue_1(struct Qdisc *sch)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	unsigned activemask;

	activemask = q->activemask&0xFF;
	while (activemask) {
		int prio = ffz(~activemask);
		activemask &= ~(1<<prio);
		skb = cbq_dequeue_prio(sch, prio);
		if (skb)
			return skb;
	}
	return NULL;
}
static struct sk_buff *
cbq_dequeue(struct Qdisc *sch)
{
	struct sk_buff *skb;
	struct cbq_sched_data *q = qdisc_priv(sch);
	psched_time_t now;
	psched_tdiff_t incr;

	now = psched_get_time();
	incr = now - q->now_rt;

	if (q->tx_class) {
		psched_tdiff_t incr2;
		/* Time integrator. We calculate EOS time
		   by adding the expected packet transmission time.
		   If real time is greater, we warp the artificial clock,
		   so that:

		   cbq_time = max(real_time, work);
		 */
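		/* Concretely (as implemented just below): incr2 is the ideal
		   service time of the just-sent packet at link rate, so q->now
		   always advances by at least incr2, and by the real elapsed
		   time (incr) whenever that is larger. */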
		incr2 = L2T(&q->link, q->tx_len);
		q->now += incr2;
		cbq_update(q);
		if ((incr -= incr2) < 0)
			incr = 0;
	}
	q->now += incr;
	q->now_rt = now;

	for (;;) {
		q->wd_expires = 0;

		skb = cbq_dequeue_1(sch);
		if (skb) {
			sch->q.qlen--;
			sch->flags &= ~TCQ_F_THROTTLED;
			return skb;
		}

		/* All the classes are overlimit.

		   This is possible if:

		   1. The scheduler is empty.
		   2. The toplevel cutoff inhibited borrowing.
		   3. The root class is overlimit.

		   Reset the 2nd and 3rd conditions and retry.

		   Note that NS and cbq-2.0 are buggy here: peeking
		   an arbitrary class is appropriate for ancestor-only
		   sharing, but not for the toplevel algorithm.

		   Our version is better but slower, because it requires
		   two passes; this is unavoidable with top-level sharing.
		 */

		if (q->toplevel == TC_CBQ_MAXLEVEL &&
		    q->link.undertime == PSCHED_PASTPERFECT)
			break;

		q->toplevel = TC_CBQ_MAXLEVEL;
		q->link.undertime = PSCHED_PASTPERFECT;
	}

	/* No packets in the scheduler, or nobody wants to give them to us :-(
	   Sigh... start the watchdog timer in the last case. */

	if (sch->q.qlen) {
		sch->qstats.overlimits++;
		if (q->wd_expires)
			qdisc_watchdog_schedule(&q->watchdog,
						now + q->wd_expires);
	}
	return NULL;
}
/* CBQ class maintenance routines */

static void cbq_adjust_levels(struct cbq_class *this)
{
	if (this == NULL)
		return;

	do {
		int level = 0;
		struct cbq_class *cl;

		if ((cl = this->children) != NULL) {
			do {
				if (cl->level > level)
					level = cl->level;
			} while ((cl = cl->sibling) != this->children);
		}
		this->level = level+1;
	} while ((this = this->tparent) != NULL);
}

static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
{
	struct cbq_class *cl;
	unsigned h;

	if (q->quanta[prio] == 0)
		return;

	for (h=0; h<16; h++) {
		for (cl = q->classes[h]; cl; cl = cl->next) {
			/* BUGGGG... Beware! This expression suffers from
			   arithmetic overflows!
			 */
			if (cl->priority == prio) {
				cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
					q->quanta[prio];
			}
			if (cl->quantum <= 0 || cl->quantum>32*cl->qdisc->dev->mtu) {
				printk(KERN_WARNING "CBQ: class %08x has bad quantum==%ld, repaired.\n", cl->classid, cl->quantum);
				cl->quantum = cl->qdisc->dev->mtu/2 + 1;
			}
		}
	}
}
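
/* Example of the overflow the warning above refers to (illustrative numbers
   only): with weight = 125000 (1 Mbit/s expressed in bytes/s), allot = 1514
   and nclasses = 12, the product weight*allot*nclasses is about 2.3e9 and no
   longer fits in a signed 32-bit intermediate. */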
static void cbq_sync_defmap(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	struct cbq_class *split = cl->split;
	unsigned h;
	int i;

	if (split == NULL)
		return;

	for (i=0; i<=TC_PRIO_MAX; i++) {
		if (split->defaults[i] == cl && !(cl->defmap&(1<<i)))
			split->defaults[i] = NULL;
	}

	for (i=0; i<=TC_PRIO_MAX; i++) {
		int level = split->level;

		if (split->defaults[i])
			continue;

		for (h=0; h<16; h++) {
			struct cbq_class *c;

			for (c = q->classes[h]; c; c = c->next) {
				if (c->split == split && c->level < level &&
				    c->defmap&(1<<i)) {
					split->defaults[i] = c;
					level = c->level;
				}
			}
		}
	}
}

static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
{
	struct cbq_class *split = NULL;

	if (splitid == 0) {
		if ((split = cl->split) == NULL)
			return;
		splitid = split->classid;
	}

	if (split == NULL || split->classid != splitid) {
		for (split = cl->tparent; split; split = split->tparent)
			if (split->classid == splitid)
				break;
	}

	if (split == NULL)
		return;

	if (cl->split != split) {
		cl->defmap = 0;
		cbq_sync_defmap(cl);
		cl->split = split;
		cl->defmap = def&mask;
	} else
		cl->defmap = (cl->defmap&~mask)|(def&mask);

	cbq_sync_defmap(cl);
}

static void cbq_unlink_class(struct cbq_class *this)
{
	struct cbq_class *cl, **clp;
	struct cbq_sched_data *q = qdisc_priv(this->qdisc);

	for (clp = &q->classes[cbq_hash(this->classid)]; (cl = *clp) != NULL; clp = &cl->next) {
		if (cl == this) {
			*clp = cl->next;
			cl->next = NULL;
			break;
		}
	}

	if (this->tparent) {
		clp=&this->sibling;
		cl = *clp;
		do {
			if (cl == this) {
				*clp = cl->sibling;
				break;
			}
			clp = &cl->sibling;
		} while ((cl = *clp) != this->sibling);

		if (this->tparent->children == this) {
			this->tparent->children = this->sibling;
			if (this->sibling == this)
				this->tparent->children = NULL;
		}
	} else {
		BUG_TRAP(this->sibling == this);
	}
}

static void cbq_link_class(struct cbq_class *this)
{
	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
	unsigned h = cbq_hash(this->classid);
	struct cbq_class *parent = this->tparent;

	this->sibling = this;
	this->next = q->classes[h];
	q->classes[h] = this;

	if (parent == NULL)
		return;

	if (parent->children == NULL) {
		parent->children = this;
	} else {
		this->sibling = parent->children->sibling;
		parent->children->sibling = this;
	}
}
static unsigned int cbq_drop(struct Qdisc* sch)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl, *cl_head;
	int prio;
	unsigned int len;

	for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) {
		if ((cl_head = q->active[prio]) == NULL)
			continue;

		cl = cl_head;
		do {
			if (cl->q->ops->drop && (len = cl->q->ops->drop(cl->q))) {
				sch->q.qlen--;
				if (!cl->q->q.qlen)
					cbq_deactivate_class(cl);
				return len;
			}
		} while ((cl = cl->next_alive) != cl_head);
	}
	return 0;
}

static void
cbq_reset(struct Qdisc* sch)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl;
	int prio;
	unsigned h;

	q->activemask = 0;
	q->pmask = 0;
	q->tx_class = NULL;
	q->tx_borrowed = NULL;
	qdisc_watchdog_cancel(&q->watchdog);
	hrtimer_cancel(&q->delay_timer);
	q->toplevel = TC_CBQ_MAXLEVEL;
	q->now = psched_get_time();
	q->now_rt = q->now;

	for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
		q->active[prio] = NULL;

	for (h = 0; h < 16; h++) {
		for (cl = q->classes[h]; cl; cl = cl->next) {
			qdisc_reset(cl->q);

			cl->next_alive = NULL;
			cl->undertime = PSCHED_PASTPERFECT;
			cl->avgidle = cl->maxidle;
			cl->deficit = cl->quantum;
			cl->cpriority = cl->priority;
		}
	}
	sch->q.qlen = 0;
}


static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
{
	if (lss->change&TCF_CBQ_LSS_FLAGS) {
		cl->share = (lss->flags&TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
		cl->borrow = (lss->flags&TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
	}
	if (lss->change&TCF_CBQ_LSS_EWMA)
		cl->ewma_log = lss->ewma_log;
	if (lss->change&TCF_CBQ_LSS_AVPKT)
		cl->avpkt = lss->avpkt;
	if (lss->change&TCF_CBQ_LSS_MINIDLE)
		cl->minidle = -(long)lss->minidle;
	if (lss->change&TCF_CBQ_LSS_MAXIDLE) {
		cl->maxidle = lss->maxidle;
		cl->avgidle = lss->maxidle;
	}
	if (lss->change&TCF_CBQ_LSS_OFFTIME)
		cl->offtime = lss->offtime;
	return 0;
}

static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
{
	q->nclasses[cl->priority]--;
	q->quanta[cl->priority] -= cl->weight;
	cbq_normalize_quanta(q, cl->priority);
}

static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
{
	q->nclasses[cl->priority]++;
	q->quanta[cl->priority] += cl->weight;
	cbq_normalize_quanta(q, cl->priority);
}

static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);

	if (wrr->allot)
		cl->allot = wrr->allot;
	if (wrr->weight)
		cl->weight = wrr->weight;
	if (wrr->priority) {
		cl->priority = wrr->priority-1;
		cl->cpriority = cl->priority;
		if (cl->priority >= cl->priority2)
			cl->priority2 = TC_CBQ_MAXPRIO-1;
	}

	cbq_addprio(q, cl);
	return 0;
}
static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl)
{
	switch (ovl->strategy) {
	case TC_CBQ_OVL_CLASSIC:
		cl->overlimit = cbq_ovl_classic;
		break;
	case TC_CBQ_OVL_DELAY:
		cl->overlimit = cbq_ovl_delay;
		break;
	case TC_CBQ_OVL_LOWPRIO:
		if (ovl->priority2-1 >= TC_CBQ_MAXPRIO ||
		    ovl->priority2-1 <= cl->priority)
			return -EINVAL;
		cl->priority2 = ovl->priority2-1;
		cl->overlimit = cbq_ovl_lowprio;
		break;
	case TC_CBQ_OVL_DROP:
		cl->overlimit = cbq_ovl_drop;
		break;
	case TC_CBQ_OVL_RCLASSIC:
		cl->overlimit = cbq_ovl_rclassic;
		break;
	default:
		return -EINVAL;
	}
	cl->penalty = ovl->penalty;
	return 0;
}

#ifdef CONFIG_NET_CLS_POLICE
static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p)
{
	cl->police = p->police;

	if (cl->q->handle) {
		if (p->police == TC_POLICE_RECLASSIFY)
			cl->q->reshape_fail = cbq_reshape_fail;
		else
			cl->q->reshape_fail = NULL;
	}
	return 0;
}
#endif

static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
{
	cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
	return 0;
}

static int cbq_init(struct Qdisc *sch, struct rtattr *opt)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct rtattr *tb[TCA_CBQ_MAX];
	struct tc_ratespec *r;

	if (rtattr_parse_nested(tb, TCA_CBQ_MAX, opt) < 0 ||
	    tb[TCA_CBQ_RTAB-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL ||
	    RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec))
		return -EINVAL;

	if (tb[TCA_CBQ_LSSOPT-1] &&
	    RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt))
		return -EINVAL;

	r = RTA_DATA(tb[TCA_CBQ_RATE-1]);

	if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB-1])) == NULL)
		return -EINVAL;

	q->link.refcnt = 1;
	q->link.sibling = &q->link;
	q->link.classid = sch->handle;
	q->link.qdisc = sch;
	if (!(q->link.q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops,
					    sch->handle)))
		q->link.q = &noop_qdisc;

	q->link.priority = TC_CBQ_MAXPRIO-1;
	q->link.priority2 = TC_CBQ_MAXPRIO-1;
	q->link.cpriority = TC_CBQ_MAXPRIO-1;
	q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC;
	q->link.overlimit = cbq_ovl_classic;
	q->link.allot = psched_mtu(sch->dev);
	q->link.quantum = q->link.allot;
	q->link.weight = q->link.R_tab->rate.rate;

	q->link.ewma_log = TC_CBQ_DEF_EWMA;
	q->link.avpkt = q->link.allot/2;
	q->link.minidle = -0x7FFFFFFF;
	q->link.stats_lock = &sch->dev->queue_lock;

	qdisc_watchdog_init(&q->watchdog, sch);
	hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
	q->delay_timer.function = cbq_undelay;
	q->toplevel = TC_CBQ_MAXLEVEL;
	q->now = psched_get_time();
	q->now_rt = q->now;

	cbq_link_class(&q->link);

	if (tb[TCA_CBQ_LSSOPT-1])
		cbq_set_lss(&q->link, RTA_DATA(tb[TCA_CBQ_LSSOPT-1]));

	cbq_addprio(q, &q->link);
	return 0;
}
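
/* Rough correspondence to the classic tc(8) CBQ setup (illustrative only;
   the exact option names come from the iproute2 tc utility, not this file):

     tc qdisc add dev eth0 root handle 1: cbq bandwidth 10Mbit avpkt 1000
     tc class add dev eth0 parent 1: classid 1:1 cbq bandwidth 10Mbit \
        rate 1Mbit allot 1514 prio 5 avpkt 1000 bounded

   "bandwidth"/"rate" end up in the rate tables (TCA_CBQ_RATE/TCA_CBQ_RTAB),
   "allot"/"prio"/"weight" in the WRR options handled by cbq_set_wrr(), and
   "avpkt"/"bounded"/"isolated"/"maxidle" in the LSS options handled by
   cbq_set_lss(). */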
static __inline__ int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
{
	unsigned char *b = skb_tail_pointer(skb);

	RTA_PUT(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate);
	return skb->len;

rtattr_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static __inline__ int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
{
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_cbq_lssopt opt;

	opt.flags = 0;
	if (cl->borrow == NULL)
		opt.flags |= TCF_CBQ_LSS_BOUNDED;
	if (cl->share == NULL)
		opt.flags |= TCF_CBQ_LSS_ISOLATED;
	opt.ewma_log = cl->ewma_log;
	opt.level = cl->level;
	opt.avpkt = cl->avpkt;
	opt.maxidle = cl->maxidle;
	opt.minidle = (u32)(-cl->minidle);
	opt.offtime = cl->offtime;
	opt.change = ~0;
	RTA_PUT(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt);
	return skb->len;

rtattr_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static __inline__ int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
{
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_cbq_wrropt opt;

	opt.flags = 0;
	opt.allot = cl->allot;
	opt.priority = cl->priority+1;
	opt.cpriority = cl->cpriority+1;
	opt.weight = cl->weight;
	RTA_PUT(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt);
	return skb->len;

rtattr_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl)
{
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_cbq_ovl opt;

	opt.strategy = cl->ovl_strategy;
	opt.priority2 = cl->priority2+1;
	opt.pad = 0;
	opt.penalty = cl->penalty;
	RTA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt);
	return skb->len;

rtattr_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static __inline__ int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
{
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_cbq_fopt opt;

	if (cl->split || cl->defmap) {
		opt.split = cl->split ? cl->split->classid : 0;
		opt.defmap = cl->defmap;
		opt.defchange = ~0;
		RTA_PUT(skb, TCA_CBQ_FOPT, sizeof(opt), &opt);
	}
	return skb->len;

rtattr_failure:
	nlmsg_trim(skb, b);
	return -1;
}

#ifdef CONFIG_NET_CLS_POLICE
static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)
{
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_cbq_police opt;

	if (cl->police) {
		opt.police = cl->police;
		opt.__res1 = 0;
		opt.__res2 = 0;
		RTA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt);
	}
	return skb->len;

rtattr_failure:
	nlmsg_trim(skb, b);
	return -1;
}
#endif

static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
{
	if (cbq_dump_lss(skb, cl) < 0 ||
	    cbq_dump_rate(skb, cl) < 0 ||
	    cbq_dump_wrr(skb, cl) < 0 ||
	    cbq_dump_ovl(skb, cl) < 0 ||
#ifdef CONFIG_NET_CLS_POLICE
	    cbq_dump_police(skb, cl) < 0 ||
#endif
	    cbq_dump_fopt(skb, cl) < 0)
		return -1;
	return 0;
}
static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	unsigned char *b = skb_tail_pointer(skb);
	struct rtattr *rta;

	rta = (struct rtattr*)b;
	RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
	if (cbq_dump_attr(skb, &q->link) < 0)
		goto rtattr_failure;
	rta->rta_len = skb_tail_pointer(skb) - b;
	return skb->len;

rtattr_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int
cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
	struct cbq_sched_data *q = qdisc_priv(sch);

	q->link.xstats.avgidle = q->link.avgidle;
	return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
}

static int
cbq_dump_class(struct Qdisc *sch, unsigned long arg,
	       struct sk_buff *skb, struct tcmsg *tcm)
{
	struct cbq_class *cl = (struct cbq_class*)arg;
	unsigned char *b = skb_tail_pointer(skb);
	struct rtattr *rta;

	if (cl->tparent)
		tcm->tcm_parent = cl->tparent->classid;
	else
		tcm->tcm_parent = TC_H_ROOT;
	tcm->tcm_handle = cl->classid;
	tcm->tcm_info = cl->q->handle;

	rta = (struct rtattr*)b;
	RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
	if (cbq_dump_attr(skb, cl) < 0)
		goto rtattr_failure;
	rta->rta_len = skb_tail_pointer(skb) - b;
	return skb->len;

rtattr_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int
cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
		     struct gnet_dump *d)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl = (struct cbq_class*)arg;

	cl->qstats.qlen = cl->q->q.qlen;
	cl->xstats.avgidle = cl->avgidle;
	cl->xstats.undertime = 0;

	if (cl->undertime != PSCHED_PASTPERFECT)
		cl->xstats.undertime = cl->undertime - q->now;

	if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
#ifdef CONFIG_NET_ESTIMATOR
	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
#endif
	    gnet_stats_copy_queue(d, &cl->qstats) < 0)
		return -1;

	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
}
static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		     struct Qdisc **old)
{
	struct cbq_class *cl = (struct cbq_class*)arg;

	if (cl) {
		if (new == NULL) {
			if ((new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops,
						     cl->classid)) == NULL)
				return -ENOBUFS;
		} else {
#ifdef CONFIG_NET_CLS_POLICE
			if (cl->police == TC_POLICE_RECLASSIFY)
				new->reshape_fail = cbq_reshape_fail;
#endif
		}
		sch_tree_lock(sch);
		*old = xchg(&cl->q, new);
		qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
		qdisc_reset(*old);
		sch_tree_unlock(sch);

		return 0;
	}
	return -ENOENT;
}

static struct Qdisc *
cbq_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct cbq_class *cl = (struct cbq_class*)arg;

	return cl ? cl->q : NULL;
}

static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
{
	struct cbq_class *cl = (struct cbq_class *)arg;

	if (cl->q->q.qlen == 0)
		cbq_deactivate_class(cl);
}

static unsigned long cbq_get(struct Qdisc *sch, u32 classid)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl = cbq_class_lookup(q, classid);

	if (cl) {
		cl->refcnt++;
		return (unsigned long)cl;
	}
	return 0;
}

static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(sch);

	BUG_TRAP(!cl->filters);

	tcf_destroy_chain(cl->filter_list);
	qdisc_destroy(cl->q);
	qdisc_put_rtab(cl->R_tab);
#ifdef CONFIG_NET_ESTIMATOR
	gen_kill_estimator(&cl->bstats, &cl->rate_est);
#endif
	if (cl != &q->link)
		kfree(cl);
}

static void
cbq_destroy(struct Qdisc* sch)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl;
	unsigned h;

#ifdef CONFIG_NET_CLS_POLICE
	q->rx_class = NULL;
#endif
	/*
	 * Filters must be destroyed first because we don't destroy the
	 * classes from root to leafs which means that filters can still
	 * be bound to classes which have been destroyed already. --TGR '04
	 */
	for (h = 0; h < 16; h++) {
		for (cl = q->classes[h]; cl; cl = cl->next) {
			tcf_destroy_chain(cl->filter_list);
			cl->filter_list = NULL;
		}
	}
	for (h = 0; h < 16; h++) {
		struct cbq_class *next;

		for (cl = q->classes[h]; cl; cl = next) {
			next = cl->next;
			cbq_destroy_class(sch, cl);
		}
	}
}

static void cbq_put(struct Qdisc *sch, unsigned long arg)
{
	struct cbq_class *cl = (struct cbq_class*)arg;

	if (--cl->refcnt == 0) {
#ifdef CONFIG_NET_CLS_POLICE
		struct cbq_sched_data *q = qdisc_priv(sch);

		spin_lock_bh(&sch->dev->queue_lock);
		if (q->rx_class == cl)
			q->rx_class = NULL;
		spin_unlock_bh(&sch->dev->queue_lock);
#endif

		cbq_destroy_class(sch, cl);
	}
}
static int
cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct rtattr **tca,
		 unsigned long *arg)
{
	int err;
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl = (struct cbq_class*)*arg;
	struct rtattr *opt = tca[TCA_OPTIONS-1];
	struct rtattr *tb[TCA_CBQ_MAX];
	struct cbq_class *parent;
	struct qdisc_rate_table *rtab = NULL;

	if (opt==NULL || rtattr_parse_nested(tb, TCA_CBQ_MAX, opt))
		return -EINVAL;

	if (tb[TCA_CBQ_OVL_STRATEGY-1] &&
	    RTA_PAYLOAD(tb[TCA_CBQ_OVL_STRATEGY-1]) < sizeof(struct tc_cbq_ovl))
		return -EINVAL;

	if (tb[TCA_CBQ_FOPT-1] &&
	    RTA_PAYLOAD(tb[TCA_CBQ_FOPT-1]) < sizeof(struct tc_cbq_fopt))
		return -EINVAL;

	if (tb[TCA_CBQ_RATE-1] &&
	    RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec))
		return -EINVAL;

	if (tb[TCA_CBQ_LSSOPT-1] &&
	    RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt))
		return -EINVAL;

	if (tb[TCA_CBQ_WRROPT-1] &&
	    RTA_PAYLOAD(tb[TCA_CBQ_WRROPT-1]) < sizeof(struct tc_cbq_wrropt))
		return -EINVAL;

#ifdef CONFIG_NET_CLS_POLICE
	if (tb[TCA_CBQ_POLICE-1] &&
	    RTA_PAYLOAD(tb[TCA_CBQ_POLICE-1]) < sizeof(struct tc_cbq_police))
		return -EINVAL;
#endif

	if (cl) {
		/* Check parent */
		if (parentid) {
			if (cl->tparent && cl->tparent->classid != parentid)
				return -EINVAL;
			if (!cl->tparent && parentid != TC_H_ROOT)
				return -EINVAL;
		}

		if (tb[TCA_CBQ_RATE-1]) {
			rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]);
			if (rtab == NULL)
				return -EINVAL;
		}

		/* Change class parameters */
		sch_tree_lock(sch);

		if (cl->next_alive != NULL)
			cbq_deactivate_class(cl);

		if (rtab) {
			rtab = xchg(&cl->R_tab, rtab);
			qdisc_put_rtab(rtab);
		}

		if (tb[TCA_CBQ_LSSOPT-1])
			cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1]));

		if (tb[TCA_CBQ_WRROPT-1]) {
			cbq_rmprio(q, cl);
			cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1]));
		}

		if (tb[TCA_CBQ_OVL_STRATEGY-1])
			cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1]));

#ifdef CONFIG_NET_CLS_POLICE
		if (tb[TCA_CBQ_POLICE-1])
			cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1]));
#endif

		if (tb[TCA_CBQ_FOPT-1])
			cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1]));

		if (cl->q->q.qlen)
			cbq_activate_class(cl);

		sch_tree_unlock(sch);

#ifdef CONFIG_NET_ESTIMATOR
		if (tca[TCA_RATE-1])
			gen_replace_estimator(&cl->bstats, &cl->rate_est,
					      cl->stats_lock, tca[TCA_RATE-1]);
#endif
		return 0;
	}

	if (parentid == TC_H_ROOT)
		return -EINVAL;

	if (tb[TCA_CBQ_WRROPT-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL ||
	    tb[TCA_CBQ_LSSOPT-1] == NULL)
		return -EINVAL;

	rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]);
	if (rtab == NULL)
		return -EINVAL;

	if (classid) {
		err = -EINVAL;
		if (TC_H_MAJ(classid^sch->handle) || cbq_class_lookup(q, classid))
			goto failure;
	} else {
		int i;
		classid = TC_H_MAKE(sch->handle,0x8000);

		for (i=0; i<0x8000; i++) {
			if (++q->hgenerator >= 0x8000)
				q->hgenerator = 1;
			if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
				break;
		}
		err = -ENOSR;
		if (i >= 0x8000)
			goto failure;
		classid = classid|q->hgenerator;
	}

	parent = &q->link;
	if (parentid) {
		parent = cbq_class_lookup(q, parentid);
		err = -EINVAL;
		if (parent == NULL)
			goto failure;
	}

	err = -ENOBUFS;
	cl = kzalloc(sizeof(*cl), GFP_KERNEL);
	if (cl == NULL)
		goto failure;
	cl->R_tab = rtab;
	rtab = NULL;
	cl->refcnt = 1;
	if (!(cl->q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops, classid)))
		cl->q = &noop_qdisc;
	cl->classid = classid;
	cl->tparent = parent;
	cl->qdisc = sch;
	cl->allot = parent->allot;
	cl->quantum = cl->allot;
	cl->weight = cl->R_tab->rate.rate;
	cl->stats_lock = &sch->dev->queue_lock;

	sch_tree_lock(sch);
	cbq_link_class(cl);
	cl->borrow = cl->tparent;
	if (cl->tparent != &q->link)
		cl->share = cl->tparent;
	cbq_adjust_levels(parent);
	cl->minidle = -0x7FFFFFFF;
	cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1]));
	cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1]));
	if (cl->ewma_log==0)
		cl->ewma_log = q->link.ewma_log;
	if (cl->maxidle==0)
		cl->maxidle = q->link.maxidle;
	if (cl->avpkt==0)
		cl->avpkt = q->link.avpkt;
	cl->overlimit = cbq_ovl_classic;
	if (tb[TCA_CBQ_OVL_STRATEGY-1])
		cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1]));
#ifdef CONFIG_NET_CLS_POLICE
	if (tb[TCA_CBQ_POLICE-1])
		cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1]));
#endif
	if (tb[TCA_CBQ_FOPT-1])
		cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1]));
	sch_tree_unlock(sch);

#ifdef CONFIG_NET_ESTIMATOR
	if (tca[TCA_RATE-1])
		gen_new_estimator(&cl->bstats, &cl->rate_est,
				  cl->stats_lock, tca[TCA_RATE-1]);
#endif

	*arg = (unsigned long)cl;
	return 0;

failure:
	qdisc_put_rtab(rtab);
	return err;
}
static int cbq_delete(struct Qdisc *sch, unsigned long arg)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl = (struct cbq_class*)arg;
	unsigned int qlen;

	if (cl->filters || cl->children || cl == &q->link)
		return -EBUSY;

	sch_tree_lock(sch);

	qlen = cl->q->q.qlen;
	qdisc_reset(cl->q);
	qdisc_tree_decrease_qlen(cl->q, qlen);

	if (cl->next_alive)
		cbq_deactivate_class(cl);

	if (q->tx_borrowed == cl)
		q->tx_borrowed = q->tx_class;
	if (q->tx_class == cl) {
		q->tx_class = NULL;
		q->tx_borrowed = NULL;
	}
#ifdef CONFIG_NET_CLS_POLICE
	if (q->rx_class == cl)
		q->rx_class = NULL;
#endif

	cbq_unlink_class(cl);
	cbq_adjust_levels(cl->tparent);
	cl->defmap = 0;
	cbq_sync_defmap(cl);

	cbq_rmprio(q, cl);
	sch_tree_unlock(sch);

	if (--cl->refcnt == 0)
		cbq_destroy_class(sch, cl);

	return 0;
}

static struct tcf_proto **cbq_find_tcf(struct Qdisc *sch, unsigned long arg)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl = (struct cbq_class *)arg;

	if (cl == NULL)
		cl = &q->link;

	return &cl->filter_list;
}

static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
				     u32 classid)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *p = (struct cbq_class*)parent;
	struct cbq_class *cl = cbq_class_lookup(q, classid);

	if (cl) {
		if (p && p->level <= cl->level)
			return 0;
		cl->filters++;
		return (unsigned long)cl;
	}
	return 0;
}

static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
{
	struct cbq_class *cl = (struct cbq_class*)arg;

	cl->filters--;
}

static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	unsigned h;

	if (arg->stop)
		return;

	for (h = 0; h < 16; h++) {
		struct cbq_class *cl;

		for (cl = q->classes[h]; cl; cl = cl->next) {
			if (arg->count < arg->skip) {
				arg->count++;
				continue;
			}
			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
				arg->stop = 1;
				return;
			}
			arg->count++;
		}
	}
}

static struct Qdisc_class_ops cbq_class_ops = {
	.graft		= cbq_graft,
	.leaf		= cbq_leaf,
	.qlen_notify	= cbq_qlen_notify,
	.get		= cbq_get,
	.put		= cbq_put,
	.change		= cbq_change_class,
	.delete		= cbq_delete,
	.walk		= cbq_walk,
	.tcf_chain	= cbq_find_tcf,
	.bind_tcf	= cbq_bind_filter,
	.unbind_tcf	= cbq_unbind_filter,
	.dump		= cbq_dump_class,
	.dump_stats	= cbq_dump_class_stats,
};

static struct Qdisc_ops cbq_qdisc_ops = {
	.next		= NULL,
	.cl_ops		= &cbq_class_ops,
	.id		= "cbq",
	.priv_size	= sizeof(struct cbq_sched_data),
	.enqueue	= cbq_enqueue,
	.dequeue	= cbq_dequeue,
	.requeue	= cbq_requeue,
	.drop		= cbq_drop,
	.init		= cbq_init,
	.reset		= cbq_reset,
	.destroy	= cbq_destroy,
	.change		= NULL,
	.dump		= cbq_dump,
	.dump_stats	= cbq_dump_stats,
	.owner		= THIS_MODULE,
};

static int __init cbq_module_init(void)
{
	return register_qdisc(&cbq_qdisc_ops);
}
static void __exit cbq_module_exit(void)
{
	unregister_qdisc(&cbq_qdisc_ops);
}
module_init(cbq_module_init)
module_exit(cbq_module_exit)
MODULE_LICENSE("GPL");