1 /*
2 * net/sched/sch_cbq.c Class-Based Queueing discipline.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 #include <linux/config.h>
14 #include <linux/module.h>
15 #include <asm/uaccess.h>
16 #include <asm/system.h>
17 #include <asm/bitops.h>
18 #include <linux/types.h>
19 #include <linux/kernel.h>
20 #include <linux/sched.h>
21 #include <linux/string.h>
22 #include <linux/mm.h>
23 #include <linux/socket.h>
24 #include <linux/sockios.h>
25 #include <linux/in.h>
26 #include <linux/errno.h>
27 #include <linux/interrupt.h>
28 #include <linux/if_ether.h>
29 #include <linux/inet.h>
30 #include <linux/netdevice.h>
31 #include <linux/etherdevice.h>
32 #include <linux/notifier.h>
33 #include <net/ip.h>
34 #include <net/route.h>
35 #include <linux/skbuff.h>
36 #include <net/sock.h>
37 #include <net/pkt_sched.h>
40 /* Class-Based Queueing (CBQ) algorithm.
41 =======================================
43 Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
44 Management Models for Packet Networks",
45 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
47 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
49 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
50 Parameters", 1996
52 [4] Sally Floyd and Michael Speer, "Experimental Results
53 for Class-Based Queueing", 1998, not published.
55 -----------------------------------------------------------------------
57 Algorithm skeleton was taken from NS simulator cbq.cc.
58 If someone wants to check this code against the LBL version,
59 he should take into account that ONLY the skeleton was borrowed;
60 the implementation is different. Particularly:
62 --- The WRR algorithm is different. Our version looks more
63 reasonable (I hope) and works when quanta are allowed to be
64 less than MTU, which is always the case when real time classes
65 have small rates. Note that the statement of [3] is
66 incomplete; the delay may actually be estimated even if the class
67 per-round allotment is less than the MTU. Namely, if the per-round
68 allotment is W*r_i, and r_1+...+r_k = r < 1
70 delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
72 In the worst case we have IntServ estimate with D = W*r+k*MTU
73 and C = MTU*r. The proof (if correct at all) is trivial.
76 --- It seems that cbq-2.0 is not very accurate. At least, I cannot
77 interpret some places, which look like wrong translations
78 from NS. Anyone is advised to find these differences
79 and explain to me, why I am wrong 8).
81 --- Linux has no EOI event, so we cannot estimate the true class
82 idle time. The workaround is to consider the next dequeue event
83 as a sign that the previous packet has finished. This is wrong because of
84 internal device queueing, but on a permanently loaded link it is true.
85 Moreover, combined with clock integrator, this scheme looks
86 very close to an ideal solution. */
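/* Illustrative plug-in of the bound above (assumed numbers, not taken
   from the cited papers): with k = 2 classes, r_1 = r_2 = 0.25 (so
   r = 0.5) and W chosen so that W*r_1 = MTU/2 (i.e. W = 2*MTU), each
   class's per-round allotment is only half an MTU, yet

	delay_1 <= (2*MTU + MTU + 2*MTU)/B = 5*MTU/B

   so the delay bound stays finite, which is the point made above. */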
88 struct cbq_sched_data;
91 struct cbq_class
93 struct cbq_class *next; /* hash table link */
94 struct cbq_class *next_alive; /* next class with backlog in this priority band */
96 /* Parameters */
97 u32 classid;
98 unsigned char priority; /* class priority */
99 unsigned char priority2; /* priority to be used after overlimit */
100 unsigned char ewma_log; /* time constant for idle time calculation */
101 unsigned char ovl_strategy;
102 #ifdef CONFIG_NET_CLS_POLICE
103 unsigned char police;
104 #endif
106 u32 defmap;
108 /* Link-sharing scheduler parameters */
109 long maxidle; /* Class parameters: see below. */
110 long offtime;
111 long minidle;
112 u32 avpkt;
113 struct qdisc_rate_table *R_tab;
115 /* Overlimit strategy parameters */
116 void (*overlimit)(struct cbq_class *cl);
117 long penalty;
119 /* General scheduler (WRR) parameters */
120 long allot;
121 long quantum; /* Allotment per WRR round */
122 long weight; /* Relative allotment: see below */
124 struct Qdisc *qdisc; /* Ptr to CBQ discipline */
125 struct cbq_class *split; /* Ptr to split node */
126 struct cbq_class *share; /* Ptr to LS parent in the class tree */
127 struct cbq_class *tparent; /* Ptr to tree parent in the class tree */
128 struct cbq_class *borrow; /* NULL if class is bandwidth limited;
129 parent otherwise */
130 struct cbq_class *sibling; /* Sibling chain */
131 struct cbq_class *children; /* Pointer to children chain */
133 struct Qdisc *q; /* Elementary queueing discipline */
136 /* Variables */
137 unsigned char cpriority; /* Effective priority */
138 unsigned char delayed;
139 unsigned char level; /* level of the class in hierarchy:
140 0 for leaf classes, and maximal
141 level of children + 1 for nodes.
144 psched_time_t last; /* Last end of service */
145 psched_time_t undertime;
146 long avgidle;
147 long deficit; /* Saved deficit for WRR */
148 unsigned long penalized;
149 struct tc_stats stats;
150 struct tc_cbq_xstats xstats;
152 struct tcf_proto *filter_list;
154 int refcnt;
155 int filters;
157 struct cbq_class *defaults[TC_PRIO_MAX+1];
160 struct cbq_sched_data
162 struct cbq_class *classes[16]; /* Hash table of all classes */
163 int nclasses[TC_CBQ_MAXPRIO+1];
164 unsigned quanta[TC_CBQ_MAXPRIO+1];
166 struct cbq_class link;
168 unsigned activemask;
169 struct cbq_class *active[TC_CBQ_MAXPRIO+1]; /* List of all classes
170 with backlog */
172 #ifdef CONFIG_NET_CLS_POLICE
173 struct cbq_class *rx_class;
174 #endif
175 struct cbq_class *tx_class;
176 struct cbq_class *tx_borrowed;
177 int tx_len;
178 psched_time_t now; /* Cached timestamp */
179 psched_time_t now_rt; /* Cached real time */
180 unsigned pmask;
182 struct timer_list delay_timer;
183 struct timer_list wd_timer; /* Watchdog timer,
184 started when CBQ has
185 backlog, but cannot
186 transmit just now */
187 long wd_expires;
188 int toplevel;
189 u32 hgenerator;
193 #define L2T(cl,len) ((cl)->R_tab->data[(len)>>(cl)->R_tab->rate.cell_log])
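/* Descriptive note (an assumption based on how tc builds rate tables,
   not stated in this file): R_tab->data[] holds, for each cell of
   2^cell_log bytes, the precomputed time in scheduler ticks needed to
   transmit that many bytes at the class rate.  E.g. with
   cell_log == 3, L2T(cl, 1000) reads slot 1000>>3 == 125, i.e. the
   transmission time of a packet of roughly 1000 bytes. */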
196 static __inline__ unsigned cbq_hash(u32 h)
198 h ^= h>>8;
199 h ^= h>>4;
200 return h&0xF;
203 static __inline__ struct cbq_class *
204 cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
206 struct cbq_class *cl;
208 for (cl = q->classes[cbq_hash(classid)]; cl; cl = cl->next)
209 if (cl->classid == classid)
210 return cl;
211 return NULL;
214 #ifdef CONFIG_NET_CLS_POLICE
216 static struct cbq_class *
217 cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
219 struct cbq_class *cl, *new;
221 for (cl = this->tparent; cl; cl = cl->tparent)
222 if ((new = cl->defaults[TC_PRIO_BESTEFFORT]) != NULL && new != this)
223 return new;
225 return NULL;
228 #endif
230 /* Classify packet. The procedure is pretty complicated, but
231 it allows us to combine link sharing and priority scheduling
232 transparently.
234 Namely, you can put link sharing rules (f.e. route based) at root of CBQ,
235 so that it resolves to split nodes. Then packets are classified
236 by logical priority, or a more specific classifier may be attached
237 to the split node.
240 static struct cbq_class *
241 cbq_classify(struct sk_buff *skb, struct Qdisc *sch)
243 struct cbq_sched_data *q = (struct cbq_sched_data*)sch->data;
244 struct cbq_class *head = &q->link;
245 struct cbq_class **defmap;
246 struct cbq_class *cl = NULL;
247 u32 prio = skb->priority;
248 struct tcf_result res;
251 * Step 1. If skb->priority points to one of our classes, use it.
253 if (TC_H_MAJ(prio^sch->handle) == 0 &&
254 (cl = cbq_class_lookup(q, prio)) != NULL)
255 return cl;
257 for (;;) {
258 int result = 0;
260 defmap = head->defaults;
263 * Step 2+n. Apply classifier.
265 if (!head->filter_list || (result = tc_classify(skb, head->filter_list, &res)) < 0)
266 goto fallback;
268 if ((cl = (void*)res.class) == NULL) {
269 if (TC_H_MAJ(res.classid))
270 cl = cbq_class_lookup(q, res.classid);
271 else if ((cl = defmap[res.classid&TC_PRIO_MAX]) == NULL)
272 cl = defmap[TC_PRIO_BESTEFFORT];
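/* Descriptive note: a result with no major handle is treated as a
   logical priority and looked up in the split node's default map;
   an empty slot falls back to the best-effort default. */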
274 if (cl == NULL || cl->level >= head->level)
275 goto fallback;
278 #ifdef CONFIG_NET_CLS_POLICE
279 switch (result) {
280 case TC_POLICE_RECLASSIFY:
281 return cbq_reclassify(skb, cl);
282 case TC_POLICE_SHOT:
283 return NULL;
284 default:
286 #endif
287 if (cl->level == 0)
288 return cl;
291 * Step 3+n. If classifier selected a link sharing class,
292 * apply agency specific classifier.
293 * Repeat this procedure until we hit a leaf node.
295 head = cl;
298 fallback:
299 cl = head;
302 * Step 4. No success...
304 if (TC_H_MAJ(prio) == 0 &&
305 !(cl = head->defaults[prio&TC_PRIO_MAX]) &&
306 !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
307 return head;
309 return cl;
313 A packet has just been enqueued on an empty class.
314 cbq_activate_class adds it to the tail of the active class list
315 of its priority band.
318 static __inline__ void cbq_activate_class(struct cbq_class *cl)
320 struct cbq_sched_data *q = (struct cbq_sched_data*)cl->qdisc->data;
321 int prio = cl->cpriority;
322 struct cbq_class *cl_tail;
324 cl_tail = q->active[prio];
325 q->active[prio] = cl;
327 if (cl_tail != NULL) {
328 cl->next_alive = cl_tail->next_alive;
329 cl_tail->next_alive = cl;
330 } else {
331 cl->next_alive = cl;
332 q->activemask |= (1<<prio);
337 Unlink class from active chain.
338 Note that this same procedure is done directly in cbq_dequeue*
339 during round-robin procedure.
342 static void cbq_deactivate_class(struct cbq_class *this)
344 struct cbq_sched_data *q = (struct cbq_sched_data*)this->qdisc->data;
345 int prio = this->cpriority;
346 struct cbq_class *cl;
347 struct cbq_class *cl_prev = q->active[prio];
349 do {
350 cl = cl_prev->next_alive;
351 if (cl == this) {
352 cl_prev->next_alive = cl->next_alive;
353 cl->next_alive = NULL;
355 if (cl == q->active[prio]) {
356 q->active[prio] = cl_prev;
357 if (cl == q->active[prio]) {
358 q->active[prio] = NULL;
359 q->activemask &= ~(1<<prio);
360 return;
364 cl = cl_prev->next_alive;
365 return;
367 } while ((cl_prev = cl) != q->active[prio]);
370 static void
371 cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
373 int toplevel = q->toplevel;
375 if (toplevel > cl->level && !(cl->q->flags&TCQ_F_THROTTLED)) {
376 psched_time_t now;
377 psched_tdiff_t incr;
379 PSCHED_GET_TIME(now);
380 incr = PSCHED_TDIFF(now, q->now_rt);
381 PSCHED_TADD2(q->now, incr, now);
383 do {
384 if (PSCHED_TLESS(cl->undertime, now)) {
385 q->toplevel = cl->level;
386 return;
388 } while ((cl=cl->borrow) != NULL && toplevel > cl->level);
392 static int
393 cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
395 struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
396 struct cbq_class *cl = cbq_classify(skb, sch);
397 int len = skb->len;
398 int ret = NET_XMIT_POLICED;
400 #ifdef CONFIG_NET_CLS_POLICE
401 q->rx_class = cl;
402 #endif
403 if (cl) {
404 #ifdef CONFIG_NET_CLS_POLICE
405 cl->q->__parent = sch;
406 #endif
407 if ((ret = cl->q->enqueue(skb, cl->q)) == 0) {
408 sch->q.qlen++;
409 sch->stats.packets++;
410 sch->stats.bytes+=len;
411 cbq_mark_toplevel(q, cl);
412 if (!cl->next_alive)
413 cbq_activate_class(cl);
414 return 0;
418 sch->stats.drops++;
419 if (cl == NULL)
420 kfree_skb(skb);
421 else {
422 cbq_mark_toplevel(q, cl);
423 cl->stats.drops++;
425 return ret;
428 static int
429 cbq_requeue(struct sk_buff *skb, struct Qdisc *sch)
431 struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
432 struct cbq_class *cl;
433 int ret;
435 if ((cl = q->tx_class) == NULL) {
436 kfree_skb(skb);
437 sch->stats.drops++;
438 return NET_XMIT_CN;
440 q->tx_class = NULL;
442 cbq_mark_toplevel(q, cl);
444 #ifdef CONFIG_NET_CLS_POLICE
445 q->rx_class = cl;
446 cl->q->__parent = sch;
447 #endif
448 if ((ret = cl->q->ops->requeue(skb, cl->q)) == 0) {
449 sch->q.qlen++;
450 if (!cl->next_alive)
451 cbq_activate_class(cl);
452 return 0;
454 sch->stats.drops++;
455 cl->stats.drops++;
456 return ret;
459 /* Overlimit actions */
461 /* TC_CBQ_OVL_CLASSIC: (default) penalize leaf class by adding offtime */
463 static void cbq_ovl_classic(struct cbq_class *cl)
465 struct cbq_sched_data *q = (struct cbq_sched_data *)cl->qdisc->data;
466 psched_tdiff_t delay = PSCHED_TDIFF(cl->undertime, q->now);
468 if (!cl->delayed) {
469 delay += cl->offtime;
472 Class goes to sleep, so that it will have no
473 chance to work avgidle. Let's forgive it 8)
475 BTW cbq-2.0 has a bug in this
476 place: apparently they forgot to shift it by cl->ewma_log.
478 if (cl->avgidle < 0)
479 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
480 if (cl->avgidle < cl->minidle)
481 cl->avgidle = cl->minidle;
482 if (delay <= 0)
483 delay = 1;
484 PSCHED_TADD2(q->now, delay, cl->undertime);
486 cl->xstats.overactions++;
487 cl->delayed = 1;
489 if (q->wd_expires == 0 || q->wd_expires > delay)
490 q->wd_expires = delay;
492 /* Dirty work! We must schedule wakeups based on
493 real available rate, rather than leaf rate,
494 which may be tiny (even zero).
496 if (q->toplevel == TC_CBQ_MAXLEVEL) {
497 struct cbq_class *b;
498 psched_tdiff_t base_delay = q->wd_expires;
500 for (b = cl->borrow; b; b = b->borrow) {
501 delay = PSCHED_TDIFF(b->undertime, q->now);
502 if (delay < base_delay) {
503 if (delay <= 0)
504 delay = 1;
505 base_delay = delay;
509 q->wd_expires = delay;
513 /* TC_CBQ_OVL_RCLASSIC: penalize classes in the hierarchy by offtime when
514 they go overlimit
517 static void cbq_ovl_rclassic(struct cbq_class *cl)
519 struct cbq_sched_data *q = (struct cbq_sched_data *)cl->qdisc->data;
520 struct cbq_class *this = cl;
522 do {
523 if (cl->level > q->toplevel) {
524 cl = NULL;
525 break;
527 } while ((cl = cl->borrow) != NULL);
529 if (cl == NULL)
530 cl = this;
531 cbq_ovl_classic(cl);
534 /* TC_CBQ_OVL_DELAY: delay until the class goes underlimit */
536 static void cbq_ovl_delay(struct cbq_class *cl)
538 struct cbq_sched_data *q = (struct cbq_sched_data *)cl->qdisc->data;
539 psched_tdiff_t delay = PSCHED_TDIFF(cl->undertime, q->now);
541 if (!cl->delayed) {
542 unsigned long sched = jiffies;
544 delay += cl->offtime;
545 if (cl->avgidle < 0)
546 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
547 if (cl->avgidle < cl->minidle)
548 cl->avgidle = cl->minidle;
549 PSCHED_TADD2(q->now, delay, cl->undertime);
551 if (delay > 0) {
552 sched += PSCHED_US2JIFFIE(delay) + cl->penalty;
553 cl->penalized = sched;
554 cl->cpriority = TC_CBQ_MAXPRIO;
555 q->pmask |= (1<<TC_CBQ_MAXPRIO);
556 if (del_timer(&q->delay_timer) &&
557 (long)(q->delay_timer.expires - sched) > 0)
558 q->delay_timer.expires = sched;
559 add_timer(&q->delay_timer);
560 cl->delayed = 1;
561 cl->xstats.overactions++;
562 return;
564 delay = 1;
566 if (q->wd_expires == 0 || q->wd_expires > delay)
567 q->wd_expires = delay;
570 /* TC_CBQ_OVL_LOWPRIO: penalize class by lowering its priority band */
572 static void cbq_ovl_lowprio(struct cbq_class *cl)
574 struct cbq_sched_data *q = (struct cbq_sched_data*)cl->qdisc->data;
576 cl->penalized = jiffies + cl->penalty;
578 if (cl->cpriority != cl->priority2) {
579 cl->cpriority = cl->priority2;
580 q->pmask |= (1<<cl->cpriority);
581 cl->xstats.overactions++;
583 cbq_ovl_classic(cl);
586 /* TC_CBQ_OVL_DROP: penalize class by dropping */
588 static void cbq_ovl_drop(struct cbq_class *cl)
590 if (cl->q->ops->drop)
591 if (cl->q->ops->drop(cl->q))
592 cl->qdisc->q.qlen--;
593 cl->xstats.overactions++;
594 cbq_ovl_classic(cl);
597 static void cbq_watchdog(unsigned long arg)
599 struct Qdisc *sch = (struct Qdisc*)arg;
601 sch->flags &= ~TCQ_F_THROTTLED;
602 netif_schedule(sch->dev);
605 static unsigned long cbq_undelay_prio(struct cbq_sched_data *q, int prio)
607 struct cbq_class *cl;
608 struct cbq_class *cl_prev = q->active[prio];
609 unsigned long now = jiffies;
610 unsigned long sched = now;
612 if (cl_prev == NULL)
613 return now;
615 do {
616 cl = cl_prev->next_alive;
617 if ((long)(now - cl->penalized) > 0) {
618 cl_prev->next_alive = cl->next_alive;
619 cl->next_alive = NULL;
620 cl->cpriority = cl->priority;
621 cl->delayed = 0;
622 cbq_activate_class(cl);
624 if (cl == q->active[prio]) {
625 q->active[prio] = cl_prev;
626 if (cl == q->active[prio]) {
627 q->active[prio] = NULL;
628 return 0;
632 cl = cl_prev->next_alive;
633 } else if ((long)(sched - cl->penalized) > 0)
634 sched = cl->penalized;
635 } while ((cl_prev = cl) != q->active[prio]);
637 return (long)(sched - now);
640 static void cbq_undelay(unsigned long arg)
642 struct Qdisc *sch = (struct Qdisc*)arg;
643 struct cbq_sched_data *q = (struct cbq_sched_data*)sch->data;
644 long delay = 0;
645 unsigned pmask;
647 pmask = q->pmask;
648 q->pmask = 0;
650 while (pmask) {
651 int prio = ffz(~pmask);
652 long tmp;
654 pmask &= ~(1<<prio);
656 tmp = cbq_undelay_prio(q, prio);
657 if (tmp > 0) {
658 q->pmask |= 1<<prio;
659 if (tmp < delay || delay == 0)
660 delay = tmp;
664 if (delay) {
665 q->delay_timer.expires = jiffies + delay;
666 add_timer(&q->delay_timer);
669 sch->flags &= ~TCQ_F_THROTTLED;
670 netif_schedule(sch->dev);
674 #ifdef CONFIG_NET_CLS_POLICE
676 static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
678 int len = skb->len;
679 struct Qdisc *sch = child->__parent;
680 struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
681 struct cbq_class *cl = q->rx_class;
683 q->rx_class = NULL;
685 if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) {
687 cbq_mark_toplevel(q, cl);
689 q->rx_class = cl;
690 cl->q->__parent = sch;
692 if (cl->q->enqueue(skb, cl->q) == 0) {
693 sch->q.qlen++;
694 sch->stats.packets++;
695 sch->stats.bytes+=len;
696 if (!cl->next_alive)
697 cbq_activate_class(cl);
698 return 0;
700 sch->stats.drops++;
701 return 0;
704 sch->stats.drops++;
705 return -1;
707 #endif
710 This is a mission-critical procedure.
712 We "regenerate" the toplevel cutoff if the transmitting class
713 has backlog and is not regulated. This is not part of the
714 original CBQ description, but looks more reasonable.
715 It is probably wrong. This question needs further investigation.
718 static __inline__ void
719 cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
720 struct cbq_class *borrowed)
722 if (cl && q->toplevel >= borrowed->level) {
723 if (cl->q->q.qlen > 1) {
724 do {
725 if (PSCHED_IS_PASTPERFECT(borrowed->undertime)) {
726 q->toplevel = borrowed->level;
727 return;
729 } while ((borrowed=borrowed->borrow) != NULL);
731 #if 0
732 /* It is not necessary now. Uncommenting it
733 will save CPU cycles, but decrease fairness.
735 q->toplevel = TC_CBQ_MAXLEVEL;
736 #endif
740 static void
741 cbq_update(struct cbq_sched_data *q)
743 struct cbq_class *this = q->tx_class;
744 struct cbq_class *cl = this;
745 int len = q->tx_len;
747 q->tx_class = NULL;
749 for ( ; cl; cl = cl->share) {
750 long avgidle = cl->avgidle;
751 long idle;
753 cl->stats.packets++;
754 cl->stats.bytes += len;
757 (now - last) is total time between packet right edges.
758 (last_pktlen/rate) is "virtual" busy time, so that
760 idle = (now - last) - last_pktlen/rate
763 idle = PSCHED_TDIFF(q->now, cl->last);
764 if ((unsigned long)idle > 128*1024*1024) {
765 avgidle = cl->maxidle;
766 } else {
767 idle -= L2T(cl, len);
769 /* true_avgidle := (1-W)*true_avgidle + W*idle,
770 where W=2^{-ewma_log}. But cl->avgidle is scaled:
771 cl->avgidle == true_avgidle/W,
772 hence:
774 avgidle += idle - (avgidle>>cl->ewma_log);
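/* Worked example with illustrative numbers (not from the source):
   ewma_log == 3 gives W = 1/8.  A scaled avgidle of 800
   (true_avgidle == 100) and a new idle sample of 40 yield
   avgidle += 40 - (800>>3), i.e. avgidle becomes 740, which is the
   scaled form of (1 - 1/8)*100 + (1/8)*40 == 92.5. */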
777 if (avgidle <= 0) {
778 /* Overlimit or at-limit */
780 if (avgidle < cl->minidle)
781 avgidle = cl->minidle;
783 cl->avgidle = avgidle;
785 /* Calculate the expected time when this class
786 will be allowed to send.
787 That will occur when:
788 (1-W)*true_avgidle + W*delay = 0, i.e.
789 idle = (1/W - 1)*(-true_avgidle)
791 idle = (1 - W)*(-cl->avgidle);
793 idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
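/* Continuing the illustrative numbers above: with W = 1/8 and a
   scaled avgidle of -800 (true_avgidle == -100), this gives
   idle = 800 - (800>>3) = 700 scaled ticks, i.e. (1/W - 1)*100,
   before the rate correction applied below. */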
796 That is not all.
797 To maintain the rate allocated to the class,
798 we add to undertime the virtual clock time
799 necessary to complete the transmitted packet.
800 (len/phys_bandwidth has already elapsed
801 by the moment cbq_update is called.)
804 idle -= L2T(&q->link, len);
805 idle += L2T(cl, len);
807 PSCHED_AUDIT_TDIFF(idle);
809 PSCHED_TADD2(q->now, idle, cl->undertime);
810 } else {
811 /* Underlimit */
813 PSCHED_SET_PASTPERFECT(cl->undertime);
814 if (avgidle > cl->maxidle)
815 cl->avgidle = cl->maxidle;
816 else
817 cl->avgidle = avgidle;
819 cl->last = q->now;
822 cbq_update_toplevel(q, this, q->tx_borrowed);
825 static __inline__ struct cbq_class *
826 cbq_under_limit(struct cbq_class *cl)
828 struct cbq_sched_data *q = (struct cbq_sched_data*)cl->qdisc->data;
829 struct cbq_class *this_cl = cl;
831 if (cl->tparent == NULL)
832 return cl;
834 if (PSCHED_IS_PASTPERFECT(cl->undertime) ||
835 !PSCHED_TLESS(q->now, cl->undertime)) {
836 cl->delayed = 0;
837 return cl;
840 do {
841 /* It is a very suspicious place. Currently the overlimit
842 action is generated for non-bounded classes
843 only if the link is completely congested.
844 Though this agrees with the ancestor-only paradigm,
845 it looks very stupid. In particular,
846 it means that this chunk of code will either
847 never be called or result in strong amplification
848 of burstiness. Dangerous and silly; however,
849 no other solution exists.
851 if ((cl = cl->borrow) == NULL) {
852 this_cl->stats.overlimits++;
853 this_cl->overlimit(this_cl);
854 return NULL;
856 if (cl->level > q->toplevel)
857 return NULL;
858 } while (!PSCHED_IS_PASTPERFECT(cl->undertime) &&
859 PSCHED_TLESS(q->now, cl->undertime));
861 cl->delayed = 0;
862 return cl;
865 static __inline__ struct sk_buff *
866 cbq_dequeue_prio(struct Qdisc *sch, int prio)
868 struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
869 struct cbq_class *cl_tail, *cl_prev, *cl;
870 struct sk_buff *skb;
871 int deficit;
873 cl_tail = cl_prev = q->active[prio];
874 cl = cl_prev->next_alive;
876 do {
877 deficit = 0;
879 /* Start round */
880 do {
881 struct cbq_class *borrow = cl;
883 if (cl->q->q.qlen &&
884 (borrow = cbq_under_limit(cl)) == NULL)
885 goto skip_class;
887 if (cl->deficit <= 0) {
888 /* Class exhausted its allotment per
889 this round. Switch to the next one.
891 deficit = 1;
892 cl->deficit += cl->quantum;
893 goto next_class;
896 skb = cl->q->dequeue(cl->q);
898 /* Class did not give us any skb :-(
899 It could occur even if cl->q->q.qlen != 0
900 f.e. if cl->q == "tbf"
902 if (skb == NULL)
903 goto skip_class;
905 cl->deficit -= skb->len;
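/* Descriptive note: this is deficit-round-robin style accounting.
   The class pays for the dequeued packet out of its deficit; when
   the deficit drops to zero or below it must wait for the next
   round, where cl->quantum is added back (see the branches above
   and below). */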
906 q->tx_class = cl;
907 q->tx_borrowed = borrow;
908 if (borrow != cl) {
909 #ifndef CBQ_XSTATS_BORROWS_BYTES
910 borrow->xstats.borrows++;
911 cl->xstats.borrows++;
912 #else
913 borrow->xstats.borrows += skb->len;
914 cl->xstats.borrows += skb->len;
915 #endif
917 q->tx_len = skb->len;
919 if (cl->deficit <= 0) {
920 q->active[prio] = cl;
921 cl = cl->next_alive;
922 cl->deficit += cl->quantum;
924 return skb;
926 skip_class:
927 if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
928 /* Class is empty or penalized.
929 Unlink it from active chain.
931 cl_prev->next_alive = cl->next_alive;
932 cl->next_alive = NULL;
934 /* Did cl_tail point to it? */
935 if (cl == cl_tail) {
936 /* Repair it! */
937 cl_tail = cl_prev;
939 /* Was it the last class in this band? */
940 if (cl == cl_tail) {
941 /* Kill the band! */
942 q->active[prio] = NULL;
943 q->activemask &= ~(1<<prio);
944 if (cl->q->q.qlen)
945 cbq_activate_class(cl);
946 return NULL;
949 q->active[prio] = cl_tail;
951 if (cl->q->q.qlen)
952 cbq_activate_class(cl);
954 cl = cl_prev;
957 next_class:
958 cl_prev = cl;
959 cl = cl->next_alive;
960 } while (cl_prev != cl_tail);
961 } while (deficit);
963 q->active[prio] = cl_prev;
965 return NULL;
968 static __inline__ struct sk_buff *
969 cbq_dequeue_1(struct Qdisc *sch)
971 struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
972 struct sk_buff *skb;
973 unsigned activemask;
975 activemask = q->activemask&0xFF;
976 while (activemask) {
977 int prio = ffz(~activemask);
978 activemask &= ~(1<<prio);
979 skb = cbq_dequeue_prio(sch, prio);
980 if (skb)
981 return skb;
983 return NULL;
986 static struct sk_buff *
987 cbq_dequeue(struct Qdisc *sch)
989 struct sk_buff *skb;
990 struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
991 psched_time_t now;
992 psched_tdiff_t incr;
994 PSCHED_GET_TIME(now);
995 incr = PSCHED_TDIFF(now, q->now_rt);
997 if (q->tx_class) {
998 psched_tdiff_t incr2;
999 /* Time integrator. We calculate EOS time
1000 by adding the expected packet transmission time.
1001 If real time is greater, we warp artificial clock,
1002 so that:
1004 cbq_time = max(real_time, work);
1006 incr2 = L2T(&q->link, q->tx_len);
1007 PSCHED_TADD(q->now, incr2);
1008 cbq_update(q);
1009 if ((incr -= incr2) < 0)
1010 incr = 0;
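/* Descriptive note: incr2 is the virtual service time already added
   to q->now; only the part of the real elapsed time exceeding it is
   added below, which realizes cbq_time = max(real_time, work) from
   the comment above. */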
1012 PSCHED_TADD(q->now, incr);
1013 q->now_rt = now;
1015 for (;;) {
1016 q->wd_expires = 0;
1018 skb = cbq_dequeue_1(sch);
1019 if (skb) {
1020 sch->q.qlen--;
1021 sch->flags &= ~TCQ_F_THROTTLED;
1022 return skb;
1025 /* All the classes are overlimit.
1027 This is possible if:
1029 1. The scheduler is empty.
1030 2. The toplevel cutoff inhibited borrowing.
1031 3. The root class is overlimit.
1033 Reset the 2nd and 3rd conditions and retry.
1035 Note that NS and cbq-2.0 are buggy here: peeking at
1036 an arbitrary class is appropriate for ancestor-only
1037 sharing, but not for the toplevel algorithm.
1039 Our version is better but slower, because it requires
1040 two passes; this is unavoidable with top-level sharing.
1043 if (q->toplevel == TC_CBQ_MAXLEVEL &&
1044 PSCHED_IS_PASTPERFECT(q->link.undertime))
1045 break;
1047 q->toplevel = TC_CBQ_MAXLEVEL;
1048 PSCHED_SET_PASTPERFECT(q->link.undertime);
1051 /* No packets in scheduler or nobody wants to give them to us :-(
1052 Sigh... start watchdog timer in the last case. */
1054 if (sch->q.qlen) {
1055 sch->stats.overlimits++;
1056 if (q->wd_expires && !netif_queue_stopped(sch->dev)) {
1057 long delay = PSCHED_US2JIFFIE(q->wd_expires);
1058 del_timer(&q->wd_timer);
1059 if (delay <= 0)
1060 delay = 1;
1061 q->wd_timer.expires = jiffies + delay;
1062 add_timer(&q->wd_timer);
1063 sch->flags |= TCQ_F_THROTTLED;
1066 return NULL;
1069 /* CBQ class maintenance routines */
1071 static void cbq_adjust_levels(struct cbq_class *this)
1073 if (this == NULL)
1074 return;
1076 do {
1077 int level = 0;
1078 struct cbq_class *cl;
1080 if ((cl = this->children) != NULL) {
1081 do {
1082 if (cl->level > level)
1083 level = cl->level;
1084 } while ((cl = cl->sibling) != this->children);
1086 this->level = level+1;
1087 } while ((this = this->tparent) != NULL);
1090 static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
1092 struct cbq_class *cl;
1093 unsigned h;
1095 if (q->quanta[prio] == 0)
1096 return;
1098 for (h=0; h<16; h++) {
1099 for (cl = q->classes[h]; cl; cl = cl->next) {
1100 /* BUGGGG... Beware! This expression suffers from
1101 arithmetic overflows!
1103 if (cl->priority == prio) {
1104 cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
1105 q->quanta[prio];
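/* Worked example with assumed numbers: three classes at this prio
   with weights 1, 2 and 3 (q->quanta[prio] == 6, nclasses == 3),
   each with allot == 1500, get quanta of 750, 1500 and 2250
   respectively, so the quanta average out to the allot while
   staying proportional to the weights. */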
1107 if (cl->quantum <= 0 || cl->quantum>32*cl->qdisc->dev->mtu) {
1108 printk(KERN_WARNING "CBQ: class %08x has bad quantum==%ld, repaired.\n", cl->classid, cl->quantum);
1109 cl->quantum = cl->qdisc->dev->mtu/2 + 1;
1115 static void cbq_sync_defmap(struct cbq_class *cl)
1117 struct cbq_sched_data *q = (struct cbq_sched_data*)cl->qdisc->data;
1118 struct cbq_class *split = cl->split;
1119 unsigned h;
1120 int i;
1122 if (split == NULL)
1123 return;
1125 for (i=0; i<=TC_PRIO_MAX; i++) {
1126 if (split->defaults[i] == cl && !(cl->defmap&(1<<i)))
1127 split->defaults[i] = NULL;
1130 for (i=0; i<=TC_PRIO_MAX; i++) {
1131 int level = split->level;
1133 if (split->defaults[i])
1134 continue;
1136 for (h=0; h<16; h++) {
1137 struct cbq_class *c;
1139 for (c = q->classes[h]; c; c = c->next) {
1140 if (c->split == split && c->level < level &&
1141 c->defmap&(1<<i)) {
1142 split->defaults[i] = c;
1143 level = c->level;
1150 static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
1152 struct cbq_class *split = NULL;
1154 if (splitid == 0) {
1155 if ((split = cl->split) == NULL)
1156 return;
1157 splitid = split->classid;
1160 if (split == NULL || split->classid != splitid) {
1161 for (split = cl->tparent; split; split = split->tparent)
1162 if (split->classid == splitid)
1163 break;
1166 if (split == NULL)
1167 return;
1169 if (cl->split != split) {
1170 cl->defmap = 0;
1171 cbq_sync_defmap(cl);
1172 cl->split = split;
1173 cl->defmap = def&mask;
1174 } else
1175 cl->defmap = (cl->defmap&~mask)|(def&mask);
1177 cbq_sync_defmap(cl);
1180 static void cbq_unlink_class(struct cbq_class *this)
1182 struct cbq_class *cl, **clp;
1183 struct cbq_sched_data *q = (struct cbq_sched_data*)this->qdisc->data;
1185 for (clp = &q->classes[cbq_hash(this->classid)]; (cl = *clp) != NULL; clp = &cl->next) {
1186 if (cl == this) {
1187 *clp = cl->next;
1188 cl->next = NULL;
1189 break;
1193 if (this->tparent) {
1194 clp=&this->sibling;
1195 cl = *clp;
1196 do {
1197 if (cl == this) {
1198 *clp = cl->sibling;
1199 break;
1201 clp = &cl->sibling;
1202 } while ((cl = *clp) != this->sibling);
1204 if (this->tparent->children == this) {
1205 this->tparent->children = this->sibling;
1206 if (this->sibling == this)
1207 this->tparent->children = NULL;
1209 } else {
1210 BUG_TRAP(this->sibling == this);
1214 static void cbq_link_class(struct cbq_class *this)
1216 struct cbq_sched_data *q = (struct cbq_sched_data*)this->qdisc->data;
1217 unsigned h = cbq_hash(this->classid);
1218 struct cbq_class *parent = this->tparent;
1220 this->sibling = this;
1221 this->next = q->classes[h];
1222 q->classes[h] = this;
1224 if (parent == NULL)
1225 return;
1227 if (parent->children == NULL) {
1228 parent->children = this;
1229 } else {
1230 this->sibling = parent->children->sibling;
1231 parent->children->sibling = this;
1235 static int cbq_drop(struct Qdisc* sch)
1237 struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
1238 struct cbq_class *cl, *cl_head;
1239 int prio;
1241 for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) {
1242 if ((cl_head = q->active[prio]) == NULL)
1243 continue;
1245 cl = cl_head;
1246 do {
1247 if (cl->q->ops->drop && cl->q->ops->drop(cl->q))
1248 return 1;
1249 } while ((cl = cl->next_alive) != cl_head);
1251 return 0;
1254 static void
1255 cbq_reset(struct Qdisc* sch)
1257 struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
1258 struct cbq_class *cl;
1259 int prio;
1260 unsigned h;
1262 q->activemask = 0;
1263 q->pmask = 0;
1264 q->tx_class = NULL;
1265 q->tx_borrowed = NULL;
1266 del_timer(&q->wd_timer);
1267 del_timer(&q->delay_timer);
1268 q->toplevel = TC_CBQ_MAXLEVEL;
1269 PSCHED_GET_TIME(q->now);
1270 q->now_rt = q->now;
1272 for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
1273 q->active[prio] = NULL;
1275 for (h = 0; h < 16; h++) {
1276 for (cl = q->classes[h]; cl; cl = cl->next) {
1277 qdisc_reset(cl->q);
1279 cl->next_alive = NULL;
1280 PSCHED_SET_PASTPERFECT(cl->undertime);
1281 cl->avgidle = cl->maxidle;
1282 cl->deficit = cl->quantum;
1283 cl->cpriority = cl->priority;
1286 sch->q.qlen = 0;
1290 static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
1292 if (lss->change&TCF_CBQ_LSS_FLAGS) {
1293 cl->share = (lss->flags&TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
1294 cl->borrow = (lss->flags&TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
1296 if (lss->change&TCF_CBQ_LSS_EWMA)
1297 cl->ewma_log = lss->ewma_log;
1298 if (lss->change&TCF_CBQ_LSS_AVPKT)
1299 cl->avpkt = lss->avpkt;
1300 if (lss->change&TCF_CBQ_LSS_MINIDLE)
1301 cl->minidle = -(long)lss->minidle;
1302 if (lss->change&TCF_CBQ_LSS_MAXIDLE) {
1303 cl->maxidle = lss->maxidle;
1304 cl->avgidle = lss->maxidle;
1306 if (lss->change&TCF_CBQ_LSS_OFFTIME)
1307 cl->offtime = lss->offtime;
1308 return 0;
1311 static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1313 q->nclasses[cl->priority]--;
1314 q->quanta[cl->priority] -= cl->weight;
1315 cbq_normalize_quanta(q, cl->priority);
1318 static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1320 q->nclasses[cl->priority]++;
1321 q->quanta[cl->priority] += cl->weight;
1322 cbq_normalize_quanta(q, cl->priority);
1325 static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1327 struct cbq_sched_data *q = (struct cbq_sched_data *)cl->qdisc->data;
1329 if (wrr->allot)
1330 cl->allot = wrr->allot;
1331 if (wrr->weight)
1332 cl->weight = wrr->weight;
1333 if (wrr->priority) {
1334 cl->priority = wrr->priority-1;
1335 cl->cpriority = cl->priority;
1336 if (cl->priority >= cl->priority2)
1337 cl->priority2 = TC_CBQ_MAXPRIO-1;
1340 cbq_addprio(q, cl);
1341 return 0;
1344 static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl)
1346 switch (ovl->strategy) {
1347 case TC_CBQ_OVL_CLASSIC:
1348 cl->overlimit = cbq_ovl_classic;
1349 break;
1350 case TC_CBQ_OVL_DELAY:
1351 cl->overlimit = cbq_ovl_delay;
1352 break;
1353 case TC_CBQ_OVL_LOWPRIO:
1354 if (ovl->priority2-1 >= TC_CBQ_MAXPRIO ||
1355 ovl->priority2-1 <= cl->priority)
1356 return -EINVAL;
1357 cl->priority2 = ovl->priority2-1;
1358 cl->overlimit = cbq_ovl_lowprio;
1359 break;
1360 case TC_CBQ_OVL_DROP:
1361 cl->overlimit = cbq_ovl_drop;
1362 break;
1363 case TC_CBQ_OVL_RCLASSIC:
1364 cl->overlimit = cbq_ovl_rclassic;
1365 break;
1366 default:
1367 return -EINVAL;
1369 cl->penalty = (ovl->penalty*HZ)/1000;
1370 return 0;
1373 #ifdef CONFIG_NET_CLS_POLICE
1374 static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p)
1376 cl->police = p->police;
1378 if (cl->q->handle) {
1379 if (p->police == TC_POLICE_RECLASSIFY)
1380 cl->q->reshape_fail = cbq_reshape_fail;
1381 else
1382 cl->q->reshape_fail = NULL;
1384 return 0;
1386 #endif
1388 static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1390 cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1391 return 0;
1394 static int cbq_init(struct Qdisc *sch, struct rtattr *opt)
1396 struct cbq_sched_data *q = (struct cbq_sched_data*)sch->data;
1397 struct rtattr *tb[TCA_CBQ_MAX];
1398 struct tc_ratespec *r;
1400 if (rtattr_parse(tb, TCA_CBQ_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt)) < 0 ||
1401 tb[TCA_CBQ_RTAB-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL ||
1402 RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec))
1403 return -EINVAL;
1405 if (tb[TCA_CBQ_LSSOPT-1] &&
1406 RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt))
1407 return -EINVAL;
1409 r = RTA_DATA(tb[TCA_CBQ_RATE-1]);
1411 MOD_INC_USE_COUNT;
1412 if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB-1])) == NULL) {
1413 MOD_DEC_USE_COUNT;
1414 return -EINVAL;
1417 q->link.refcnt = 1;
1418 q->link.sibling = &q->link;
1419 q->link.classid = sch->handle;
1420 q->link.qdisc = sch;
1421 if (!(q->link.q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops)))
1422 q->link.q = &noop_qdisc;
1424 q->link.priority = TC_CBQ_MAXPRIO-1;
1425 q->link.priority2 = TC_CBQ_MAXPRIO-1;
1426 q->link.cpriority = TC_CBQ_MAXPRIO-1;
1427 q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC;
1428 q->link.overlimit = cbq_ovl_classic;
1429 q->link.allot = psched_mtu(sch->dev);
1430 q->link.quantum = q->link.allot;
1431 q->link.weight = q->link.R_tab->rate.rate;
1433 q->link.ewma_log = TC_CBQ_DEF_EWMA;
1434 q->link.avpkt = q->link.allot/2;
1435 q->link.minidle = -0x7FFFFFFF;
1436 q->link.stats.lock = &sch->dev->queue_lock;
1438 init_timer(&q->wd_timer);
1439 q->wd_timer.data = (unsigned long)sch;
1440 q->wd_timer.function = cbq_watchdog;
1441 init_timer(&q->delay_timer);
1442 q->delay_timer.data = (unsigned long)sch;
1443 q->delay_timer.function = cbq_undelay;
1444 q->toplevel = TC_CBQ_MAXLEVEL;
1445 PSCHED_GET_TIME(q->now);
1446 q->now_rt = q->now;
1448 cbq_link_class(&q->link);
1450 if (tb[TCA_CBQ_LSSOPT-1])
1451 cbq_set_lss(&q->link, RTA_DATA(tb[TCA_CBQ_LSSOPT-1]));
1453 cbq_addprio(q, &q->link);
1454 return 0;
1457 #ifdef CONFIG_RTNETLINK
1459 static __inline__ int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1461 unsigned char *b = skb->tail;
1463 RTA_PUT(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate);
1464 return skb->len;
1466 rtattr_failure:
1467 skb_trim(skb, b - skb->data);
1468 return -1;
1471 static __inline__ int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1473 unsigned char *b = skb->tail;
1474 struct tc_cbq_lssopt opt;
1476 opt.flags = 0;
1477 if (cl->borrow == NULL)
1478 opt.flags |= TCF_CBQ_LSS_BOUNDED;
1479 if (cl->share == NULL)
1480 opt.flags |= TCF_CBQ_LSS_ISOLATED;
1481 opt.ewma_log = cl->ewma_log;
1482 opt.level = cl->level;
1483 opt.avpkt = cl->avpkt;
1484 opt.maxidle = cl->maxidle;
1485 opt.minidle = (u32)(-cl->minidle);
1486 opt.offtime = cl->offtime;
1487 opt.change = ~0;
1488 RTA_PUT(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt);
1489 return skb->len;
1491 rtattr_failure:
1492 skb_trim(skb, b - skb->data);
1493 return -1;
1496 static __inline__ int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1498 unsigned char *b = skb->tail;
1499 struct tc_cbq_wrropt opt;
1501 opt.flags = 0;
1502 opt.allot = cl->allot;
1503 opt.priority = cl->priority+1;
1504 opt.cpriority = cl->cpriority+1;
1505 opt.weight = cl->weight;
1506 RTA_PUT(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt);
1507 return skb->len;
1509 rtattr_failure:
1510 skb_trim(skb, b - skb->data);
1511 return -1;
1514 static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl)
1516 unsigned char *b = skb->tail;
1517 struct tc_cbq_ovl opt;
1519 opt.strategy = cl->ovl_strategy;
1520 opt.priority2 = cl->priority2+1;
1521 opt.penalty = (cl->penalty*1000)/HZ;
1522 RTA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt);
1523 return skb->len;
1525 rtattr_failure:
1526 skb_trim(skb, b - skb->data);
1527 return -1;
1530 static __inline__ int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1532 unsigned char *b = skb->tail;
1533 struct tc_cbq_fopt opt;
1535 if (cl->split || cl->defmap) {
1536 opt.split = cl->split ? cl->split->classid : 0;
1537 opt.defmap = cl->defmap;
1538 opt.defchange = ~0;
1539 RTA_PUT(skb, TCA_CBQ_FOPT, sizeof(opt), &opt);
1541 return skb->len;
1543 rtattr_failure:
1544 skb_trim(skb, b - skb->data);
1545 return -1;
1548 #ifdef CONFIG_NET_CLS_POLICE
1549 static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)
1551 unsigned char *b = skb->tail;
1552 struct tc_cbq_police opt;
1554 if (cl->police) {
1555 opt.police = cl->police;
1556 RTA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt);
1558 return skb->len;
1560 rtattr_failure:
1561 skb_trim(skb, b - skb->data);
1562 return -1;
1564 #endif
1566 static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
1568 if (cbq_dump_lss(skb, cl) < 0 ||
1569 cbq_dump_rate(skb, cl) < 0 ||
1570 cbq_dump_wrr(skb, cl) < 0 ||
1571 cbq_dump_ovl(skb, cl) < 0 ||
1572 #ifdef CONFIG_NET_CLS_POLICE
1573 cbq_dump_police(skb, cl) < 0 ||
1574 #endif
1575 cbq_dump_fopt(skb, cl) < 0)
1576 return -1;
1577 return 0;
1580 int cbq_copy_xstats(struct sk_buff *skb, struct tc_cbq_xstats *st)
1582 RTA_PUT(skb, TCA_XSTATS, sizeof(*st), st);
1583 return 0;
1585 rtattr_failure:
1586 return -1;
1590 static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
1592 struct cbq_sched_data *q = (struct cbq_sched_data*)sch->data;
1593 unsigned char *b = skb->tail;
1594 struct rtattr *rta;
1596 rta = (struct rtattr*)b;
1597 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
1598 if (cbq_dump_attr(skb, &q->link) < 0)
1599 goto rtattr_failure;
1600 rta->rta_len = skb->tail - b;
1601 spin_lock_bh(&sch->dev->queue_lock);
1602 q->link.xstats.avgidle = q->link.avgidle;
1603 if (cbq_copy_xstats(skb, &q->link.xstats)) {
1604 spin_unlock_bh(&sch->dev->queue_lock);
1605 goto rtattr_failure;
1607 spin_unlock_bh(&sch->dev->queue_lock);
1608 return skb->len;
1610 rtattr_failure:
1611 skb_trim(skb, b - skb->data);
1612 return -1;
1615 static int
1616 cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1617 struct sk_buff *skb, struct tcmsg *tcm)
1619 struct cbq_sched_data *q = (struct cbq_sched_data*)sch->data;
1620 struct cbq_class *cl = (struct cbq_class*)arg;
1621 unsigned char *b = skb->tail;
1622 struct rtattr *rta;
1624 if (cl->tparent)
1625 tcm->tcm_parent = cl->tparent->classid;
1626 else
1627 tcm->tcm_parent = TC_H_ROOT;
1628 tcm->tcm_handle = cl->classid;
1629 tcm->tcm_info = cl->q->handle;
1631 rta = (struct rtattr*)b;
1632 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
1633 if (cbq_dump_attr(skb, cl) < 0)
1634 goto rtattr_failure;
1635 rta->rta_len = skb->tail - b;
1636 cl->stats.qlen = cl->q->q.qlen;
1637 if (qdisc_copy_stats(skb, &cl->stats))
1638 goto rtattr_failure;
1639 spin_lock_bh(&sch->dev->queue_lock);
1640 cl->xstats.avgidle = cl->avgidle;
1641 cl->xstats.undertime = 0;
1642 if (!PSCHED_IS_PASTPERFECT(cl->undertime))
1643 cl->xstats.undertime = PSCHED_TDIFF(cl->undertime, q->now);
1644 q->link.xstats.avgidle = q->link.avgidle;
1645 if (cbq_copy_xstats(skb, &cl->xstats)) {
1646 spin_unlock_bh(&sch->dev->queue_lock);
1647 goto rtattr_failure;
1649 spin_unlock_bh(&sch->dev->queue_lock);
1651 return skb->len;
1653 rtattr_failure:
1654 skb_trim(skb, b - skb->data);
1655 return -1;
1658 #endif
1660 static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1661 struct Qdisc **old)
1663 struct cbq_class *cl = (struct cbq_class*)arg;
1665 if (cl) {
1666 if (new == NULL) {
1667 if ((new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops)) == NULL)
1668 return -ENOBUFS;
1669 } else {
1670 #ifdef CONFIG_NET_CLS_POLICE
1671 if (cl->police == TC_POLICE_RECLASSIFY)
1672 new->reshape_fail = cbq_reshape_fail;
1673 #endif
1675 sch_tree_lock(sch);
1676 *old = cl->q;
1677 cl->q = new;
1678 qdisc_reset(*old);
1679 sch_tree_unlock(sch);
1681 return 0;
1683 return -ENOENT;
1686 static struct Qdisc *
1687 cbq_leaf(struct Qdisc *sch, unsigned long arg)
1689 struct cbq_class *cl = (struct cbq_class*)arg;
1691 return cl ? cl->q : NULL;
1694 static unsigned long cbq_get(struct Qdisc *sch, u32 classid)
1696 struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
1697 struct cbq_class *cl = cbq_class_lookup(q, classid);
1699 if (cl) {
1700 cl->refcnt++;
1701 return (unsigned long)cl;
1703 return 0;
1706 static void cbq_destroy_filters(struct cbq_class *cl)
1708 struct tcf_proto *tp;
1710 while ((tp = cl->filter_list) != NULL) {
1711 cl->filter_list = tp->next;
1712 tp->ops->destroy(tp);
1716 static void cbq_destroy_class(struct cbq_class *cl)
1718 cbq_destroy_filters(cl);
1719 qdisc_destroy(cl->q);
1720 qdisc_put_rtab(cl->R_tab);
1721 #ifdef CONFIG_NET_ESTIMATOR
1722 qdisc_kill_estimator(&cl->stats);
1723 #endif
1724 kfree(cl);
1727 static void
1728 cbq_destroy(struct Qdisc* sch)
1730 struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
1731 struct cbq_class *cl;
1732 unsigned h;
1734 #ifdef CONFIG_NET_CLS_POLICE
1735 q->rx_class = NULL;
1736 #endif
1737 for (h = 0; h < 16; h++) {
1738 for (cl = q->classes[h]; cl; cl = cl->next)
1739 cbq_destroy_filters(cl);
1742 for (h = 0; h < 16; h++) {
1743 for (cl = q->classes[h]; cl; cl = cl->next)
1744 if (cl != &q->link)
1745 cbq_destroy_class(cl);
1748 qdisc_put_rtab(q->link.R_tab);
1749 MOD_DEC_USE_COUNT;
1752 static void cbq_put(struct Qdisc *sch, unsigned long arg)
1754 struct cbq_class *cl = (struct cbq_class*)arg;
1756 if (--cl->refcnt == 0) {
1757 #ifdef CONFIG_NET_CLS_POLICE
1758 struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
1760 spin_lock_bh(&sch->dev->queue_lock);
1761 if (q->rx_class == cl)
1762 q->rx_class = NULL;
1763 spin_unlock_bh(&sch->dev->queue_lock);
1764 #endif
1766 cbq_destroy_class(cl);
1770 static int
1771 cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct rtattr **tca,
1772 unsigned long *arg)
1774 int err;
1775 struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
1776 struct cbq_class *cl = (struct cbq_class*)*arg;
1777 struct rtattr *opt = tca[TCA_OPTIONS-1];
1778 struct rtattr *tb[TCA_CBQ_MAX];
1779 struct cbq_class *parent;
1780 struct qdisc_rate_table *rtab = NULL;
1782 if (opt==NULL ||
1783 rtattr_parse(tb, TCA_CBQ_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt)))
1784 return -EINVAL;
1786 if (tb[TCA_CBQ_OVL_STRATEGY-1] &&
1787 RTA_PAYLOAD(tb[TCA_CBQ_OVL_STRATEGY-1]) < sizeof(struct tc_cbq_ovl))
1788 return -EINVAL;
1790 if (tb[TCA_CBQ_FOPT-1] &&
1791 RTA_PAYLOAD(tb[TCA_CBQ_FOPT-1]) < sizeof(struct tc_cbq_fopt))
1792 return -EINVAL;
1794 if (tb[TCA_CBQ_RATE-1] &&
1795 RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec))
1796 return -EINVAL;
1798 if (tb[TCA_CBQ_LSSOPT-1] &&
1799 RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt))
1800 return -EINVAL;
1802 if (tb[TCA_CBQ_WRROPT-1] &&
1803 RTA_PAYLOAD(tb[TCA_CBQ_WRROPT-1]) < sizeof(struct tc_cbq_wrropt))
1804 return -EINVAL;
1806 #ifdef CONFIG_NET_CLS_POLICE
1807 if (tb[TCA_CBQ_POLICE-1] &&
1808 RTA_PAYLOAD(tb[TCA_CBQ_POLICE-1]) < sizeof(struct tc_cbq_police))
1809 return -EINVAL;
1810 #endif
1812 if (cl) {
1813 /* Check parent */
1814 if (parentid) {
1815 if (cl->tparent && cl->tparent->classid != parentid)
1816 return -EINVAL;
1817 if (!cl->tparent && parentid != TC_H_ROOT)
1818 return -EINVAL;
1821 if (tb[TCA_CBQ_RATE-1]) {
1822 rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]);
1823 if (rtab == NULL)
1824 return -EINVAL;
1827 /* Change class parameters */
1828 sch_tree_lock(sch);
1830 if (cl->next_alive != NULL)
1831 cbq_deactivate_class(cl);
1833 if (rtab) {
1834 rtab = xchg(&cl->R_tab, rtab);
1835 qdisc_put_rtab(rtab);
1838 if (tb[TCA_CBQ_LSSOPT-1])
1839 cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1]));
1841 if (tb[TCA_CBQ_WRROPT-1]) {
1842 cbq_rmprio(q, cl);
1843 cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1]));
1846 if (tb[TCA_CBQ_OVL_STRATEGY-1])
1847 cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1]));
1849 #ifdef CONFIG_NET_CLS_POLICE
1850 if (tb[TCA_CBQ_POLICE-1])
1851 cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1]));
1852 #endif
1854 if (tb[TCA_CBQ_FOPT-1])
1855 cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1]));
1857 if (cl->q->q.qlen)
1858 cbq_activate_class(cl);
1860 sch_tree_unlock(sch);
1862 #ifdef CONFIG_NET_ESTIMATOR
1863 if (tca[TCA_RATE-1]) {
1864 qdisc_kill_estimator(&cl->stats);
1865 qdisc_new_estimator(&cl->stats, tca[TCA_RATE-1]);
1867 #endif
1868 return 0;
1871 if (parentid == TC_H_ROOT)
1872 return -EINVAL;
1874 if (tb[TCA_CBQ_WRROPT-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL ||
1875 tb[TCA_CBQ_LSSOPT-1] == NULL)
1876 return -EINVAL;
1878 rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]);
1879 if (rtab == NULL)
1880 return -EINVAL;
1882 if (classid) {
1883 err = -EINVAL;
1884 if (TC_H_MAJ(classid^sch->handle) || cbq_class_lookup(q, classid))
1885 goto failure;
1886 } else {
1887 int i;
1888 classid = TC_H_MAKE(sch->handle,0x8000);
1890 for (i=0; i<0x8000; i++) {
1891 if (++q->hgenerator >= 0x8000)
1892 q->hgenerator = 1;
1893 if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1894 break;
1896 err = -ENOSR;
1897 if (i >= 0x8000)
1898 goto failure;
1899 classid = classid|q->hgenerator;
1902 parent = &q->link;
1903 if (parentid) {
1904 parent = cbq_class_lookup(q, parentid);
1905 err = -EINVAL;
1906 if (parent == NULL)
1907 goto failure;
1910 err = -ENOBUFS;
1911 cl = kmalloc(sizeof(*cl), GFP_KERNEL);
1912 if (cl == NULL)
1913 goto failure;
1914 memset(cl, 0, sizeof(*cl));
1915 cl->R_tab = rtab;
1916 rtab = NULL;
1917 cl->refcnt = 1;
1918 if (!(cl->q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops)))
1919 cl->q = &noop_qdisc;
1920 cl->classid = classid;
1921 cl->tparent = parent;
1922 cl->qdisc = sch;
1923 cl->allot = parent->allot;
1924 cl->quantum = cl->allot;
1925 cl->weight = cl->R_tab->rate.rate;
1926 cl->stats.lock = &sch->dev->queue_lock;
1928 sch_tree_lock(sch);
1929 cbq_link_class(cl);
1930 cl->borrow = cl->tparent;
1931 if (cl->tparent != &q->link)
1932 cl->share = cl->tparent;
1933 cbq_adjust_levels(parent);
1934 cl->minidle = -0x7FFFFFFF;
1935 cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1]));
1936 cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1]));
1937 if (cl->ewma_log==0)
1938 cl->ewma_log = q->link.ewma_log;
1939 if (cl->maxidle==0)
1940 cl->maxidle = q->link.maxidle;
1941 if (cl->avpkt==0)
1942 cl->avpkt = q->link.avpkt;
1943 cl->overlimit = cbq_ovl_classic;
1944 if (tb[TCA_CBQ_OVL_STRATEGY-1])
1945 cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1]));
1946 #ifdef CONFIG_NET_CLS_POLICE
1947 if (tb[TCA_CBQ_POLICE-1])
1948 cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1]));
1949 #endif
1950 if (tb[TCA_CBQ_FOPT-1])
1951 cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1]));
1952 sch_tree_unlock(sch);
1954 #ifdef CONFIG_NET_ESTIMATOR
1955 if (tca[TCA_RATE-1])
1956 qdisc_new_estimator(&cl->stats, tca[TCA_RATE-1]);
1957 #endif
1959 *arg = (unsigned long)cl;
1960 return 0;
1962 failure:
1963 qdisc_put_rtab(rtab);
1964 return err;
1967 static int cbq_delete(struct Qdisc *sch, unsigned long arg)
1969 struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
1970 struct cbq_class *cl = (struct cbq_class*)arg;
1972 if (cl->filters || cl->children || cl == &q->link)
1973 return -EBUSY;
1975 sch_tree_lock(sch);
1977 if (cl->next_alive)
1978 cbq_deactivate_class(cl);
1980 if (q->tx_borrowed == cl)
1981 q->tx_borrowed = q->tx_class;
1982 if (q->tx_class == cl) {
1983 q->tx_class = NULL;
1984 q->tx_borrowed = NULL;
1986 #ifdef CONFIG_NET_CLS_POLICE
1987 if (q->rx_class == cl)
1988 q->rx_class = NULL;
1989 #endif
1991 cbq_unlink_class(cl);
1992 cbq_adjust_levels(cl->tparent);
1993 cl->defmap = 0;
1994 cbq_sync_defmap(cl);
1996 cbq_rmprio(q, cl);
1997 sch_tree_unlock(sch);
1999 if (--cl->refcnt == 0)
2000 cbq_destroy_class(cl);
2002 return 0;
2005 static struct tcf_proto **cbq_find_tcf(struct Qdisc *sch, unsigned long arg)
2007 struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
2008 struct cbq_class *cl = (struct cbq_class *)arg;
2010 if (cl == NULL)
2011 cl = &q->link;
2013 return &cl->filter_list;
2016 static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
2017 u32 classid)
2019 struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
2020 struct cbq_class *p = (struct cbq_class*)parent;
2021 struct cbq_class *cl = cbq_class_lookup(q, classid);
2023 if (cl) {
2024 if (p && p->level <= cl->level)
2025 return 0;
2026 cl->filters++;
2027 return (unsigned long)cl;
2029 return 0;
2032 static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
2034 struct cbq_class *cl = (struct cbq_class*)arg;
2036 cl->filters--;
2039 static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
2041 struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data;
2042 unsigned h;
2044 if (arg->stop)
2045 return;
2047 for (h = 0; h < 16; h++) {
2048 struct cbq_class *cl;
2050 for (cl = q->classes[h]; cl; cl = cl->next) {
2051 if (arg->count < arg->skip) {
2052 arg->count++;
2053 continue;
2055 if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
2056 arg->stop = 1;
2057 return;
2059 arg->count++;
2064 static struct Qdisc_class_ops cbq_class_ops =
2066 cbq_graft,
2067 cbq_leaf,
2068 cbq_get,
2069 cbq_put,
2070 cbq_change_class,
2071 cbq_delete,
2072 cbq_walk,
2074 cbq_find_tcf,
2075 cbq_bind_filter,
2076 cbq_unbind_filter,
2078 #ifdef CONFIG_RTNETLINK
2079 cbq_dump_class,
2080 #endif
2083 struct Qdisc_ops cbq_qdisc_ops =
2085 NULL,
2086 &cbq_class_ops,
2087 "cbq",
2088 sizeof(struct cbq_sched_data),
2090 cbq_enqueue,
2091 cbq_dequeue,
2092 cbq_requeue,
2093 cbq_drop,
2095 cbq_init,
2096 cbq_reset,
2097 cbq_destroy,
2098 NULL /* cbq_change */,
2100 #ifdef CONFIG_RTNETLINK
2101 cbq_dump,
2102 #endif
2105 #ifdef MODULE
2106 int init_module(void)
2108 return register_qdisc(&cbq_qdisc_ops);
2111 void cleanup_module(void)
2113 unregister_qdisc(&cbq_qdisc_ops);
2115 #endif