[IPV6]: Repair IPv6 Fragments
[linux-2.6/linux-mips.git] / net / sched / sch_cbq.c
blobba82dfab6043d7379094ec0771d1bbdfcbb470f3
1 /*
2 * net/sched/sch_cbq.c Class-Based Queueing discipline.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 #include <linux/module.h>
14 #include <asm/uaccess.h>
15 #include <asm/system.h>
16 #include <linux/bitops.h>
17 #include <linux/types.h>
18 #include <linux/kernel.h>
19 #include <linux/sched.h>
20 #include <linux/string.h>
21 #include <linux/mm.h>
22 #include <linux/socket.h>
23 #include <linux/sockios.h>
24 #include <linux/in.h>
25 #include <linux/errno.h>
26 #include <linux/interrupt.h>
27 #include <linux/if_ether.h>
28 #include <linux/inet.h>
29 #include <linux/netdevice.h>
30 #include <linux/etherdevice.h>
31 #include <linux/notifier.h>
32 #include <net/ip.h>
33 #include <net/route.h>
34 #include <linux/skbuff.h>
35 #include <net/sock.h>
36 #include <net/pkt_sched.h>
39 /* Class-Based Queueing (CBQ) algorithm.
40 =======================================
42 Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
43 Management Models for Packet Networks",
44 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
46 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
48 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
49 Parameters", 1996
51 [4] Sally Floyd and Michael Speer, "Experimental Results
52 for Class-Based Queueing", 1998, not published.
54 -----------------------------------------------------------------------
56 Algorithm skeleton was taken from NS simulator cbq.cc.
57 If someone wants to check this code against the LBL version,
58 he should take into account that ONLY the skeleton was borrowed,
59 the implementation is different. Particularly:
61 --- The WRR algorithm is different. Our version looks more
62 reasonable (I hope) and works when quanta are allowed to be
63 less than MTU, which is always the case when real time classes
64 have small rates. Note, that the statement of [3] is
65 incomplete, delay may actually be estimated even if class
66 per-round allotment is less than MTU. Namely, if per-round
67 allotment is W*r_i, and r_1+...+r_k = r < 1
69 delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
71 In the worst case we have IntServ estimate with D = W*r+k*MTU
72 and C = MTU*r. The proof (if correct at all) is trivial.
75 --- It seems that cbq-2.0 is not very accurate. At least, I cannot
76 interpret some places, which look like wrong translations
77 from NS. Anyone is advised to find these differences
78 and explain to me, why I am wrong 8).
80 --- Linux has no EOI event, so that we cannot estimate true class
81 idle time. Workaround is to consider the next dequeue event
82 as sign that previous packet is finished. This is wrong because of
83 internal device queueing, but on a permanently loaded link it is true.
84 Moreover, combined with clock integrator, this scheme looks
85 very close to an ideal solution. */
87 struct cbq_sched_data;
90 struct cbq_class
92 struct cbq_class *next; /* hash table link */
93 struct cbq_class *next_alive; /* next class with backlog in this priority band */
95 /* Parameters */
96 u32 classid;
97 unsigned char priority; /* class priority */
98 unsigned char priority2; /* priority to be used after overlimit */
99 unsigned char ewma_log; /* time constant for idle time calculation */
100 unsigned char ovl_strategy;
101 #ifdef CONFIG_NET_CLS_POLICE
102 unsigned char police;
103 #endif
105 u32 defmap;
107 /* Link-sharing scheduler parameters */
108 long maxidle; /* Class parameters: see below. */
109 long offtime;
110 long minidle;
111 u32 avpkt;
112 struct qdisc_rate_table *R_tab;
114 /* Overlimit strategy parameters */
115 void (*overlimit)(struct cbq_class *cl);
116 long penalty;
118 /* General scheduler (WRR) parameters */
119 long allot;
120 long quantum; /* Allotment per WRR round */
121 long weight; /* Relative allotment: see below */
123 struct Qdisc *qdisc; /* Ptr to CBQ discipline */
124 struct cbq_class *split; /* Ptr to split node */
125 struct cbq_class *share; /* Ptr to LS parent in the class tree */
126 struct cbq_class *tparent; /* Ptr to tree parent in the class tree */
127 struct cbq_class *borrow; /* NULL if class is bandwidth limited;
128 parent otherwise */
129 struct cbq_class *sibling; /* Sibling chain */
130 struct cbq_class *children; /* Pointer to children chain */
132 struct Qdisc *q; /* Elementary queueing discipline */
135 /* Variables */
136 unsigned char cpriority; /* Effective priority */
137 unsigned char delayed;
138 unsigned char level; /* level of the class in hierarchy:
139 0 for leaf classes, and maximal
140 level of children + 1 for nodes.
143 psched_time_t last; /* Last end of service */
144 psched_time_t undertime;
145 long avgidle;
146 long deficit; /* Saved deficit for WRR */
147 unsigned long penalized;
148 struct gnet_stats_basic bstats;
149 struct gnet_stats_queue qstats;
150 struct gnet_stats_rate_est rate_est;
151 spinlock_t *stats_lock;
152 struct tc_cbq_xstats xstats;
154 struct tcf_proto *filter_list;
156 int refcnt;
157 int filters;
159 struct cbq_class *defaults[TC_PRIO_MAX+1];
162 struct cbq_sched_data
164 struct cbq_class *classes[16]; /* Hash table of all classes */
165 int nclasses[TC_CBQ_MAXPRIO+1];
166 unsigned quanta[TC_CBQ_MAXPRIO+1];
168 struct cbq_class link;
170 unsigned activemask;
171 struct cbq_class *active[TC_CBQ_MAXPRIO+1]; /* List of all classes
172 with backlog */
174 #ifdef CONFIG_NET_CLS_POLICE
175 struct cbq_class *rx_class;
176 #endif
177 struct cbq_class *tx_class;
178 struct cbq_class *tx_borrowed;
179 int tx_len;
180 psched_time_t now; /* Cached timestamp */
181 psched_time_t now_rt; /* Cached real time */
182 unsigned pmask;
184 struct timer_list delay_timer;
185 struct timer_list wd_timer; /* Watchdog timer,
186 started when CBQ has
187 backlog, but cannot
188 transmit just now */
189 long wd_expires;
190 int toplevel;
191 u32 hgenerator;
195 #define L2T(cl,len) ((cl)->R_tab->data[(len)>>(cl)->R_tab->rate.cell_log])
198 static __inline__ unsigned cbq_hash(u32 h)
200 h ^= h>>8;
201 h ^= h>>4;
202 return h&0xF;
205 static __inline__ struct cbq_class *
206 cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
208 struct cbq_class *cl;
210 for (cl = q->classes[cbq_hash(classid)]; cl; cl = cl->next)
211 if (cl->classid == classid)
212 return cl;
213 return NULL;
216 #ifdef CONFIG_NET_CLS_POLICE
218 static struct cbq_class *
219 cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
221 struct cbq_class *cl, *new;
223 for (cl = this->tparent; cl; cl = cl->tparent)
224 if ((new = cl->defaults[TC_PRIO_BESTEFFORT]) != NULL && new != this)
225 return new;
227 return NULL;
230 #endif
232 /* Classify packet. The procedure is pretty complicated, but
233 it allows us to combine link sharing and priority scheduling
234 transparently.
236 Namely, you can put link sharing rules (f.e. route based) at root of CBQ,
237 so that it resolves to split nodes. Then packets are classified
238 by logical priority, or a more specific classifier may be attached
239 to the split node.
242 static struct cbq_class *
243 cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
245 struct cbq_sched_data *q = qdisc_priv(sch);
246 struct cbq_class *head = &q->link;
247 struct cbq_class **defmap;
248 struct cbq_class *cl = NULL;
249 u32 prio = skb->priority;
250 struct tcf_result res;
253 * Step 1. If skb->priority points to one of our classes, use it.
255 if (TC_H_MAJ(prio^sch->handle) == 0 &&
256 (cl = cbq_class_lookup(q, prio)) != NULL)
257 return cl;
259 *qerr = NET_XMIT_BYPASS;
260 for (;;) {
261 int result = 0;
262 defmap = head->defaults;
265 * Step 2+n. Apply classifier.
267 if (!head->filter_list || (result = tc_classify(skb, head->filter_list, &res)) < 0)
268 goto fallback;
270 if ((cl = (void*)res.class) == NULL) {
271 if (TC_H_MAJ(res.classid))
272 cl = cbq_class_lookup(q, res.classid);
273 else if ((cl = defmap[res.classid&TC_PRIO_MAX]) == NULL)
274 cl = defmap[TC_PRIO_BESTEFFORT];
276 if (cl == NULL || cl->level >= head->level)
277 goto fallback;
280 #ifdef CONFIG_NET_CLS_ACT
281 switch (result) {
282 case TC_ACT_QUEUED:
283 case TC_ACT_STOLEN:
284 *qerr = NET_XMIT_SUCCESS;
285 case TC_ACT_SHOT:
286 return NULL;
288 #elif defined(CONFIG_NET_CLS_POLICE)
289 switch (result) {
290 case TC_POLICE_RECLASSIFY:
291 return cbq_reclassify(skb, cl);
292 case TC_POLICE_SHOT:
293 return NULL;
294 default:
295 break;
297 #endif
298 if (cl->level == 0)
299 return cl;
302 * Step 3+n. If classifier selected a link sharing class,
303 * apply agency specific classifier.
304 * Repeat this procdure until we hit a leaf node.
306 head = cl;
309 fallback:
310 cl = head;
313 * Step 4. No success...
315 if (TC_H_MAJ(prio) == 0 &&
316 !(cl = head->defaults[prio&TC_PRIO_MAX]) &&
317 !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
318 return head;
320 return cl;
324 A packet has just been enqueued on the empty class.
325 cbq_activate_class adds it to the tail of active class list
326 of its priority band.
329 static __inline__ void cbq_activate_class(struct cbq_class *cl)
331 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
332 int prio = cl->cpriority;
333 struct cbq_class *cl_tail;
335 cl_tail = q->active[prio];
336 q->active[prio] = cl;
338 if (cl_tail != NULL) {
339 cl->next_alive = cl_tail->next_alive;
340 cl_tail->next_alive = cl;
341 } else {
342 cl->next_alive = cl;
343 q->activemask |= (1<<prio);
348 Unlink class from active chain.
349 Note that this same procedure is done directly in cbq_dequeue*
350 during round-robin procedure.
353 static void cbq_deactivate_class(struct cbq_class *this)
355 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
356 int prio = this->cpriority;
357 struct cbq_class *cl;
358 struct cbq_class *cl_prev = q->active[prio];
360 do {
361 cl = cl_prev->next_alive;
362 if (cl == this) {
363 cl_prev->next_alive = cl->next_alive;
364 cl->next_alive = NULL;
366 if (cl == q->active[prio]) {
367 q->active[prio] = cl_prev;
368 if (cl == q->active[prio]) {
369 q->active[prio] = NULL;
370 q->activemask &= ~(1<<prio);
371 return;
375 cl = cl_prev->next_alive;
376 return;
378 } while ((cl_prev = cl) != q->active[prio]);
381 static void
382 cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
384 int toplevel = q->toplevel;
386 if (toplevel > cl->level && !(cl->q->flags&TCQ_F_THROTTLED)) {
387 psched_time_t now;
388 psched_tdiff_t incr;
390 PSCHED_GET_TIME(now);
391 incr = PSCHED_TDIFF(now, q->now_rt);
392 PSCHED_TADD2(q->now, incr, now);
394 do {
395 if (PSCHED_TLESS(cl->undertime, now)) {
396 q->toplevel = cl->level;
397 return;
399 } while ((cl=cl->borrow) != NULL && toplevel > cl->level);
403 static int
404 cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
406 struct cbq_sched_data *q = qdisc_priv(sch);
407 int len = skb->len;
408 int ret;
409 struct cbq_class *cl = cbq_classify(skb, sch, &ret);
411 #ifdef CONFIG_NET_CLS_POLICE
412 q->rx_class = cl;
413 #endif
414 if (cl == NULL) {
415 if (ret == NET_XMIT_BYPASS)
416 sch->qstats.drops++;
417 kfree_skb(skb);
418 return ret;
421 #ifdef CONFIG_NET_CLS_POLICE
422 cl->q->__parent = sch;
423 #endif
424 if ((ret = cl->q->enqueue(skb, cl->q)) == NET_XMIT_SUCCESS) {
425 sch->q.qlen++;
426 sch->bstats.packets++;
427 sch->bstats.bytes+=len;
428 cbq_mark_toplevel(q, cl);
429 if (!cl->next_alive)
430 cbq_activate_class(cl);
431 return ret;
434 sch->qstats.drops++;
435 cbq_mark_toplevel(q, cl);
436 cl->qstats.drops++;
437 return ret;
440 static int
441 cbq_requeue(struct sk_buff *skb, struct Qdisc *sch)
443 struct cbq_sched_data *q = qdisc_priv(sch);
444 struct cbq_class *cl;
445 int ret;
447 if ((cl = q->tx_class) == NULL) {
448 kfree_skb(skb);
449 sch->qstats.drops++;
450 return NET_XMIT_CN;
452 q->tx_class = NULL;
454 cbq_mark_toplevel(q, cl);
456 #ifdef CONFIG_NET_CLS_POLICE
457 q->rx_class = cl;
458 cl->q->__parent = sch;
459 #endif
460 if ((ret = cl->q->ops->requeue(skb, cl->q)) == 0) {
461 sch->q.qlen++;
462 sch->qstats.requeues++;
463 if (!cl->next_alive)
464 cbq_activate_class(cl);
465 return 0;
467 sch->qstats.drops++;
468 cl->qstats.drops++;
469 return ret;
472 /* Overlimit actions */
474 /* TC_CBQ_OVL_CLASSIC: (default) penalize leaf class by adding offtime */
476 static void cbq_ovl_classic(struct cbq_class *cl)
478 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
479 psched_tdiff_t delay = PSCHED_TDIFF(cl->undertime, q->now);
481 if (!cl->delayed) {
482 delay += cl->offtime;
485 Class goes to sleep, so that it will have no
486 chance to work avgidle. Let's forgive it 8)
488 BTW cbq-2.0 has a crap in this
489 place, apparently they forgot to shift it by cl->ewma_log.
491 if (cl->avgidle < 0)
492 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
493 if (cl->avgidle < cl->minidle)
494 cl->avgidle = cl->minidle;
495 if (delay <= 0)
496 delay = 1;
497 PSCHED_TADD2(q->now, delay, cl->undertime);
499 cl->xstats.overactions++;
500 cl->delayed = 1;
502 if (q->wd_expires == 0 || q->wd_expires > delay)
503 q->wd_expires = delay;
505 /* Dirty work! We must schedule wakeups based on
506 real available rate, rather than leaf rate,
507 which may be tiny (even zero).
509 if (q->toplevel == TC_CBQ_MAXLEVEL) {
510 struct cbq_class *b;
511 psched_tdiff_t base_delay = q->wd_expires;
513 for (b = cl->borrow; b; b = b->borrow) {
514 delay = PSCHED_TDIFF(b->undertime, q->now);
515 if (delay < base_delay) {
516 if (delay <= 0)
517 delay = 1;
518 base_delay = delay;
522 q->wd_expires = base_delay;
526 /* TC_CBQ_OVL_RCLASSIC: penalize by offtime classes in hierarchy, when
527 they go overlimit
530 static void cbq_ovl_rclassic(struct cbq_class *cl)
532 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
533 struct cbq_class *this = cl;
535 do {
536 if (cl->level > q->toplevel) {
537 cl = NULL;
538 break;
540 } while ((cl = cl->borrow) != NULL);
542 if (cl == NULL)
543 cl = this;
544 cbq_ovl_classic(cl);
547 /* TC_CBQ_OVL_DELAY: delay until it will go to underlimit */
549 static void cbq_ovl_delay(struct cbq_class *cl)
551 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
552 psched_tdiff_t delay = PSCHED_TDIFF(cl->undertime, q->now);
554 if (!cl->delayed) {
555 unsigned long sched = jiffies;
557 delay += cl->offtime;
558 if (cl->avgidle < 0)
559 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
560 if (cl->avgidle < cl->minidle)
561 cl->avgidle = cl->minidle;
562 PSCHED_TADD2(q->now, delay, cl->undertime);
564 if (delay > 0) {
565 sched += PSCHED_US2JIFFIE(delay) + cl->penalty;
566 cl->penalized = sched;
567 cl->cpriority = TC_CBQ_MAXPRIO;
568 q->pmask |= (1<<TC_CBQ_MAXPRIO);
569 if (del_timer(&q->delay_timer) &&
570 (long)(q->delay_timer.expires - sched) > 0)
571 q->delay_timer.expires = sched;
572 add_timer(&q->delay_timer);
573 cl->delayed = 1;
574 cl->xstats.overactions++;
575 return;
577 delay = 1;
579 if (q->wd_expires == 0 || q->wd_expires > delay)
580 q->wd_expires = delay;
583 /* TC_CBQ_OVL_LOWPRIO: penalize class by lowering its priority band */
585 static void cbq_ovl_lowprio(struct cbq_class *cl)
587 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
589 cl->penalized = jiffies + cl->penalty;
591 if (cl->cpriority != cl->priority2) {
592 cl->cpriority = cl->priority2;
593 q->pmask |= (1<<cl->cpriority);
594 cl->xstats.overactions++;
596 cbq_ovl_classic(cl);
599 /* TC_CBQ_OVL_DROP: penalize class by dropping */
601 static void cbq_ovl_drop(struct cbq_class *cl)
603 if (cl->q->ops->drop)
604 if (cl->q->ops->drop(cl->q))
605 cl->qdisc->q.qlen--;
606 cl->xstats.overactions++;
607 cbq_ovl_classic(cl);
610 static void cbq_watchdog(unsigned long arg)
612 struct Qdisc *sch = (struct Qdisc*)arg;
614 sch->flags &= ~TCQ_F_THROTTLED;
615 netif_schedule(sch->dev);
618 static unsigned long cbq_undelay_prio(struct cbq_sched_data *q, int prio)
620 struct cbq_class *cl;
621 struct cbq_class *cl_prev = q->active[prio];
622 unsigned long now = jiffies;
623 unsigned long sched = now;
625 if (cl_prev == NULL)
626 return now;
628 do {
629 cl = cl_prev->next_alive;
630 if ((long)(now - cl->penalized) > 0) {
631 cl_prev->next_alive = cl->next_alive;
632 cl->next_alive = NULL;
633 cl->cpriority = cl->priority;
634 cl->delayed = 0;
635 cbq_activate_class(cl);
637 if (cl == q->active[prio]) {
638 q->active[prio] = cl_prev;
639 if (cl == q->active[prio]) {
640 q->active[prio] = NULL;
641 return 0;
645 cl = cl_prev->next_alive;
646 } else if ((long)(sched - cl->penalized) > 0)
647 sched = cl->penalized;
648 } while ((cl_prev = cl) != q->active[prio]);
650 return (long)(sched - now);
653 static void cbq_undelay(unsigned long arg)
655 struct Qdisc *sch = (struct Qdisc*)arg;
656 struct cbq_sched_data *q = qdisc_priv(sch);
657 long delay = 0;
658 unsigned pmask;
660 pmask = q->pmask;
661 q->pmask = 0;
663 while (pmask) {
664 int prio = ffz(~pmask);
665 long tmp;
667 pmask &= ~(1<<prio);
669 tmp = cbq_undelay_prio(q, prio);
670 if (tmp > 0) {
671 q->pmask |= 1<<prio;
672 if (tmp < delay || delay == 0)
673 delay = tmp;
677 if (delay) {
678 q->delay_timer.expires = jiffies + delay;
679 add_timer(&q->delay_timer);
682 sch->flags &= ~TCQ_F_THROTTLED;
683 netif_schedule(sch->dev);
687 #ifdef CONFIG_NET_CLS_POLICE
689 static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
691 int len = skb->len;
692 struct Qdisc *sch = child->__parent;
693 struct cbq_sched_data *q = qdisc_priv(sch);
694 struct cbq_class *cl = q->rx_class;
696 q->rx_class = NULL;
698 if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) {
700 cbq_mark_toplevel(q, cl);
702 q->rx_class = cl;
703 cl->q->__parent = sch;
705 if (cl->q->enqueue(skb, cl->q) == 0) {
706 sch->q.qlen++;
707 sch->bstats.packets++;
708 sch->bstats.bytes+=len;
709 if (!cl->next_alive)
710 cbq_activate_class(cl);
711 return 0;
713 sch->qstats.drops++;
714 return 0;
717 sch->qstats.drops++;
718 return -1;
720 #endif
723 It is mission critical procedure.
725 We "regenerate" toplevel cutoff, if transmitting class
726 has backlog and it is not regulated. It is not part of
727 original CBQ description, but looks more reasonable.
728 Probably, it is wrong. This question needs further investigation.
731 static __inline__ void
732 cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
733 struct cbq_class *borrowed)
735 if (cl && q->toplevel >= borrowed->level) {
736 if (cl->q->q.qlen > 1) {
737 do {
738 if (PSCHED_IS_PASTPERFECT(borrowed->undertime)) {
739 q->toplevel = borrowed->level;
740 return;
742 } while ((borrowed=borrowed->borrow) != NULL);
744 #if 0
745 /* It is not necessary now. Uncommenting it
746 will save CPU cycles, but decrease fairness.
748 q->toplevel = TC_CBQ_MAXLEVEL;
749 #endif
753 static void
754 cbq_update(struct cbq_sched_data *q)
756 struct cbq_class *this = q->tx_class;
757 struct cbq_class *cl = this;
758 int len = q->tx_len;
760 q->tx_class = NULL;
762 for ( ; cl; cl = cl->share) {
763 long avgidle = cl->avgidle;
764 long idle;
766 cl->bstats.packets++;
767 cl->bstats.bytes += len;
770 (now - last) is total time between packet right edges.
771 (last_pktlen/rate) is "virtual" busy time, so that
773 idle = (now - last) - last_pktlen/rate
776 idle = PSCHED_TDIFF(q->now, cl->last);
777 if ((unsigned long)idle > 128*1024*1024) {
778 avgidle = cl->maxidle;
779 } else {
780 idle -= L2T(cl, len);
782 /* true_avgidle := (1-W)*true_avgidle + W*idle,
783 where W=2^{-ewma_log}. But cl->avgidle is scaled:
784 cl->avgidle == true_avgidle/W,
785 hence:
787 avgidle += idle - (avgidle>>cl->ewma_log);
790 if (avgidle <= 0) {
791 /* Overlimit or at-limit */
793 if (avgidle < cl->minidle)
794 avgidle = cl->minidle;
796 cl->avgidle = avgidle;
798 /* Calculate expected time, when this class
799 will be allowed to send.
800 It will occur, when:
801 (1-W)*true_avgidle + W*delay = 0, i.e.
802 idle = (1/W - 1)*(-true_avgidle)
804 idle = (1 - W)*(-cl->avgidle);
806 idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
809 That is not all.
810 To maintain the rate allocated to the class,
811 we add to undertime virtual clock,
812 necessary to complete transmitted packet.
813 (len/phys_bandwidth has been already passed
814 to the moment of cbq_update)
817 idle -= L2T(&q->link, len);
818 idle += L2T(cl, len);
820 PSCHED_AUDIT_TDIFF(idle);
822 PSCHED_TADD2(q->now, idle, cl->undertime);
823 } else {
824 /* Underlimit */
826 PSCHED_SET_PASTPERFECT(cl->undertime);
827 if (avgidle > cl->maxidle)
828 cl->avgidle = cl->maxidle;
829 else
830 cl->avgidle = avgidle;
832 cl->last = q->now;
835 cbq_update_toplevel(q, this, q->tx_borrowed);
838 static __inline__ struct cbq_class *
839 cbq_under_limit(struct cbq_class *cl)
841 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
842 struct cbq_class *this_cl = cl;
844 if (cl->tparent == NULL)
845 return cl;
847 if (PSCHED_IS_PASTPERFECT(cl->undertime) ||
848 !PSCHED_TLESS(q->now, cl->undertime)) {
849 cl->delayed = 0;
850 return cl;
853 do {
854 /* It is very suspicious place. Now overlimit
855 action is generated for not bounded classes
856 only if link is completely congested.
857 Though it is in agree with ancestor-only paradigm,
858 it looks very stupid. Particularly,
859 it means that this chunk of code will either
860 never be called or result in strong amplification
861 of burstiness. Dangerous, silly, and, however,
862 no another solution exists.
864 if ((cl = cl->borrow) == NULL) {
865 this_cl->qstats.overlimits++;
866 this_cl->overlimit(this_cl);
867 return NULL;
869 if (cl->level > q->toplevel)
870 return NULL;
871 } while (!PSCHED_IS_PASTPERFECT(cl->undertime) &&
872 PSCHED_TLESS(q->now, cl->undertime));
874 cl->delayed = 0;
875 return cl;
878 static __inline__ struct sk_buff *
879 cbq_dequeue_prio(struct Qdisc *sch, int prio)
881 struct cbq_sched_data *q = qdisc_priv(sch);
882 struct cbq_class *cl_tail, *cl_prev, *cl;
883 struct sk_buff *skb;
884 int deficit;
886 cl_tail = cl_prev = q->active[prio];
887 cl = cl_prev->next_alive;
889 do {
890 deficit = 0;
892 /* Start round */
893 do {
894 struct cbq_class *borrow = cl;
896 if (cl->q->q.qlen &&
897 (borrow = cbq_under_limit(cl)) == NULL)
898 goto skip_class;
900 if (cl->deficit <= 0) {
901 /* Class exhausted its allotment per
902 this round. Switch to the next one.
904 deficit = 1;
905 cl->deficit += cl->quantum;
906 goto next_class;
909 skb = cl->q->dequeue(cl->q);
911 /* Class did not give us any skb :-(
912 It could occur even if cl->q->q.qlen != 0
913 f.e. if cl->q == "tbf"
915 if (skb == NULL)
916 goto skip_class;
918 cl->deficit -= skb->len;
919 q->tx_class = cl;
920 q->tx_borrowed = borrow;
921 if (borrow != cl) {
922 #ifndef CBQ_XSTATS_BORROWS_BYTES
923 borrow->xstats.borrows++;
924 cl->xstats.borrows++;
925 #else
926 borrow->xstats.borrows += skb->len;
927 cl->xstats.borrows += skb->len;
928 #endif
930 q->tx_len = skb->len;
932 if (cl->deficit <= 0) {
933 q->active[prio] = cl;
934 cl = cl->next_alive;
935 cl->deficit += cl->quantum;
937 return skb;
939 skip_class:
940 if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
941 /* Class is empty or penalized.
942 Unlink it from active chain.
944 cl_prev->next_alive = cl->next_alive;
945 cl->next_alive = NULL;
947 /* Did cl_tail point to it? */
948 if (cl == cl_tail) {
949 /* Repair it! */
950 cl_tail = cl_prev;
952 /* Was it the last class in this band? */
953 if (cl == cl_tail) {
954 /* Kill the band! */
955 q->active[prio] = NULL;
956 q->activemask &= ~(1<<prio);
957 if (cl->q->q.qlen)
958 cbq_activate_class(cl);
959 return NULL;
962 q->active[prio] = cl_tail;
964 if (cl->q->q.qlen)
965 cbq_activate_class(cl);
967 cl = cl_prev;
970 next_class:
971 cl_prev = cl;
972 cl = cl->next_alive;
973 } while (cl_prev != cl_tail);
974 } while (deficit);
976 q->active[prio] = cl_prev;
978 return NULL;
981 static __inline__ struct sk_buff *
982 cbq_dequeue_1(struct Qdisc *sch)
984 struct cbq_sched_data *q = qdisc_priv(sch);
985 struct sk_buff *skb;
986 unsigned activemask;
988 activemask = q->activemask&0xFF;
989 while (activemask) {
990 int prio = ffz(~activemask);
991 activemask &= ~(1<<prio);
992 skb = cbq_dequeue_prio(sch, prio);
993 if (skb)
994 return skb;
996 return NULL;
999 static struct sk_buff *
1000 cbq_dequeue(struct Qdisc *sch)
1002 struct sk_buff *skb;
1003 struct cbq_sched_data *q = qdisc_priv(sch);
1004 psched_time_t now;
1005 psched_tdiff_t incr;
1007 PSCHED_GET_TIME(now);
1008 incr = PSCHED_TDIFF(now, q->now_rt);
1010 if (q->tx_class) {
1011 psched_tdiff_t incr2;
1012 /* Time integrator. We calculate EOS time
1013 by adding expected packet transmission time.
1014 If real time is greater, we warp artificial clock,
1015 so that:
1017 cbq_time = max(real_time, work);
1019 incr2 = L2T(&q->link, q->tx_len);
1020 PSCHED_TADD(q->now, incr2);
1021 cbq_update(q);
1022 if ((incr -= incr2) < 0)
1023 incr = 0;
1025 PSCHED_TADD(q->now, incr);
1026 q->now_rt = now;
1028 for (;;) {
1029 q->wd_expires = 0;
1031 skb = cbq_dequeue_1(sch);
1032 if (skb) {
1033 sch->q.qlen--;
1034 sch->flags &= ~TCQ_F_THROTTLED;
1035 return skb;
1038 /* All the classes are overlimit.
1040 It is possible, if:
1042 1. Scheduler is empty.
1043 2. Toplevel cutoff inhibited borrowing.
1044 3. Root class is overlimit.
1046 Reset 2d and 3d conditions and retry.
1048 Note, that NS and cbq-2.0 are buggy, peeking
1049 an arbitrary class is appropriate for ancestor-only
1050 sharing, but not for toplevel algorithm.
1052 Our version is better, but slower, because it requires
1053 two passes, but it is unavoidable with top-level sharing.
1056 if (q->toplevel == TC_CBQ_MAXLEVEL &&
1057 PSCHED_IS_PASTPERFECT(q->link.undertime))
1058 break;
1060 q->toplevel = TC_CBQ_MAXLEVEL;
1061 PSCHED_SET_PASTPERFECT(q->link.undertime);
1064 /* No packets in scheduler or nobody wants to give them to us :-(
1065 Sigh... start watchdog timer in the last case. */
1067 if (sch->q.qlen) {
1068 sch->qstats.overlimits++;
1069 if (q->wd_expires) {
1070 long delay = PSCHED_US2JIFFIE(q->wd_expires);
1071 if (delay <= 0)
1072 delay = 1;
1073 mod_timer(&q->wd_timer, jiffies + delay);
1074 sch->flags |= TCQ_F_THROTTLED;
1077 return NULL;
1080 /* CBQ class maintanance routines */
1082 static void cbq_adjust_levels(struct cbq_class *this)
1084 if (this == NULL)
1085 return;
1087 do {
1088 int level = 0;
1089 struct cbq_class *cl;
1091 if ((cl = this->children) != NULL) {
1092 do {
1093 if (cl->level > level)
1094 level = cl->level;
1095 } while ((cl = cl->sibling) != this->children);
1097 this->level = level+1;
1098 } while ((this = this->tparent) != NULL);
1101 static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
1103 struct cbq_class *cl;
1104 unsigned h;
1106 if (q->quanta[prio] == 0)
1107 return;
1109 for (h=0; h<16; h++) {
1110 for (cl = q->classes[h]; cl; cl = cl->next) {
1111 /* BUGGGG... Beware! This expression suffer of
1112 arithmetic overflows!
1114 if (cl->priority == prio) {
1115 cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
1116 q->quanta[prio];
1118 if (cl->quantum <= 0 || cl->quantum>32*cl->qdisc->dev->mtu) {
1119 printk(KERN_WARNING "CBQ: class %08x has bad quantum==%ld, repaired.\n", cl->classid, cl->quantum);
1120 cl->quantum = cl->qdisc->dev->mtu/2 + 1;
1126 static void cbq_sync_defmap(struct cbq_class *cl)
1128 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1129 struct cbq_class *split = cl->split;
1130 unsigned h;
1131 int i;
1133 if (split == NULL)
1134 return;
1136 for (i=0; i<=TC_PRIO_MAX; i++) {
1137 if (split->defaults[i] == cl && !(cl->defmap&(1<<i)))
1138 split->defaults[i] = NULL;
1141 for (i=0; i<=TC_PRIO_MAX; i++) {
1142 int level = split->level;
1144 if (split->defaults[i])
1145 continue;
1147 for (h=0; h<16; h++) {
1148 struct cbq_class *c;
1150 for (c = q->classes[h]; c; c = c->next) {
1151 if (c->split == split && c->level < level &&
1152 c->defmap&(1<<i)) {
1153 split->defaults[i] = c;
1154 level = c->level;
1161 static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
1163 struct cbq_class *split = NULL;
1165 if (splitid == 0) {
1166 if ((split = cl->split) == NULL)
1167 return;
1168 splitid = split->classid;
1171 if (split == NULL || split->classid != splitid) {
1172 for (split = cl->tparent; split; split = split->tparent)
1173 if (split->classid == splitid)
1174 break;
1177 if (split == NULL)
1178 return;
1180 if (cl->split != split) {
1181 cl->defmap = 0;
1182 cbq_sync_defmap(cl);
1183 cl->split = split;
1184 cl->defmap = def&mask;
1185 } else
1186 cl->defmap = (cl->defmap&~mask)|(def&mask);
1188 cbq_sync_defmap(cl);
1191 static void cbq_unlink_class(struct cbq_class *this)
1193 struct cbq_class *cl, **clp;
1194 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1196 for (clp = &q->classes[cbq_hash(this->classid)]; (cl = *clp) != NULL; clp = &cl->next) {
1197 if (cl == this) {
1198 *clp = cl->next;
1199 cl->next = NULL;
1200 break;
1204 if (this->tparent) {
1205 clp=&this->sibling;
1206 cl = *clp;
1207 do {
1208 if (cl == this) {
1209 *clp = cl->sibling;
1210 break;
1212 clp = &cl->sibling;
1213 } while ((cl = *clp) != this->sibling);
1215 if (this->tparent->children == this) {
1216 this->tparent->children = this->sibling;
1217 if (this->sibling == this)
1218 this->tparent->children = NULL;
1220 } else {
1221 BUG_TRAP(this->sibling == this);
1225 static void cbq_link_class(struct cbq_class *this)
1227 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1228 unsigned h = cbq_hash(this->classid);
1229 struct cbq_class *parent = this->tparent;
1231 this->sibling = this;
1232 this->next = q->classes[h];
1233 q->classes[h] = this;
1235 if (parent == NULL)
1236 return;
1238 if (parent->children == NULL) {
1239 parent->children = this;
1240 } else {
1241 this->sibling = parent->children->sibling;
1242 parent->children->sibling = this;
1246 static unsigned int cbq_drop(struct Qdisc* sch)
1248 struct cbq_sched_data *q = qdisc_priv(sch);
1249 struct cbq_class *cl, *cl_head;
1250 int prio;
1251 unsigned int len;
1253 for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) {
1254 if ((cl_head = q->active[prio]) == NULL)
1255 continue;
1257 cl = cl_head;
1258 do {
1259 if (cl->q->ops->drop && (len = cl->q->ops->drop(cl->q))) {
1260 sch->q.qlen--;
1261 return len;
1263 } while ((cl = cl->next_alive) != cl_head);
1265 return 0;
1268 static void
1269 cbq_reset(struct Qdisc* sch)
1271 struct cbq_sched_data *q = qdisc_priv(sch);
1272 struct cbq_class *cl;
1273 int prio;
1274 unsigned h;
1276 q->activemask = 0;
1277 q->pmask = 0;
1278 q->tx_class = NULL;
1279 q->tx_borrowed = NULL;
1280 del_timer(&q->wd_timer);
1281 del_timer(&q->delay_timer);
1282 q->toplevel = TC_CBQ_MAXLEVEL;
1283 PSCHED_GET_TIME(q->now);
1284 q->now_rt = q->now;
1286 for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
1287 q->active[prio] = NULL;
1289 for (h = 0; h < 16; h++) {
1290 for (cl = q->classes[h]; cl; cl = cl->next) {
1291 qdisc_reset(cl->q);
1293 cl->next_alive = NULL;
1294 PSCHED_SET_PASTPERFECT(cl->undertime);
1295 cl->avgidle = cl->maxidle;
1296 cl->deficit = cl->quantum;
1297 cl->cpriority = cl->priority;
1300 sch->q.qlen = 0;
1304 static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
1306 if (lss->change&TCF_CBQ_LSS_FLAGS) {
1307 cl->share = (lss->flags&TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
1308 cl->borrow = (lss->flags&TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
1310 if (lss->change&TCF_CBQ_LSS_EWMA)
1311 cl->ewma_log = lss->ewma_log;
1312 if (lss->change&TCF_CBQ_LSS_AVPKT)
1313 cl->avpkt = lss->avpkt;
1314 if (lss->change&TCF_CBQ_LSS_MINIDLE)
1315 cl->minidle = -(long)lss->minidle;
1316 if (lss->change&TCF_CBQ_LSS_MAXIDLE) {
1317 cl->maxidle = lss->maxidle;
1318 cl->avgidle = lss->maxidle;
1320 if (lss->change&TCF_CBQ_LSS_OFFTIME)
1321 cl->offtime = lss->offtime;
1322 return 0;
1325 static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1327 q->nclasses[cl->priority]--;
1328 q->quanta[cl->priority] -= cl->weight;
1329 cbq_normalize_quanta(q, cl->priority);
1332 static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1334 q->nclasses[cl->priority]++;
1335 q->quanta[cl->priority] += cl->weight;
1336 cbq_normalize_quanta(q, cl->priority);
1339 static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1341 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1343 if (wrr->allot)
1344 cl->allot = wrr->allot;
1345 if (wrr->weight)
1346 cl->weight = wrr->weight;
1347 if (wrr->priority) {
1348 cl->priority = wrr->priority-1;
1349 cl->cpriority = cl->priority;
1350 if (cl->priority >= cl->priority2)
1351 cl->priority2 = TC_CBQ_MAXPRIO-1;
1354 cbq_addprio(q, cl);
1355 return 0;
1358 static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl)
1360 switch (ovl->strategy) {
1361 case TC_CBQ_OVL_CLASSIC:
1362 cl->overlimit = cbq_ovl_classic;
1363 break;
1364 case TC_CBQ_OVL_DELAY:
1365 cl->overlimit = cbq_ovl_delay;
1366 break;
1367 case TC_CBQ_OVL_LOWPRIO:
1368 if (ovl->priority2-1 >= TC_CBQ_MAXPRIO ||
1369 ovl->priority2-1 <= cl->priority)
1370 return -EINVAL;
1371 cl->priority2 = ovl->priority2-1;
1372 cl->overlimit = cbq_ovl_lowprio;
1373 break;
1374 case TC_CBQ_OVL_DROP:
1375 cl->overlimit = cbq_ovl_drop;
1376 break;
1377 case TC_CBQ_OVL_RCLASSIC:
1378 cl->overlimit = cbq_ovl_rclassic;
1379 break;
1380 default:
1381 return -EINVAL;
1383 cl->penalty = (ovl->penalty*HZ)/1000;
1384 return 0;
1387 #ifdef CONFIG_NET_CLS_POLICE
1388 static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p)
1390 cl->police = p->police;
1392 if (cl->q->handle) {
1393 if (p->police == TC_POLICE_RECLASSIFY)
1394 cl->q->reshape_fail = cbq_reshape_fail;
1395 else
1396 cl->q->reshape_fail = NULL;
1398 return 0;
1400 #endif
1402 static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1404 cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1405 return 0;
1408 static int cbq_init(struct Qdisc *sch, struct rtattr *opt)
1410 struct cbq_sched_data *q = qdisc_priv(sch);
1411 struct rtattr *tb[TCA_CBQ_MAX];
1412 struct tc_ratespec *r;
1414 if (rtattr_parse_nested(tb, TCA_CBQ_MAX, opt) < 0 ||
1415 tb[TCA_CBQ_RTAB-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL ||
1416 RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec))
1417 return -EINVAL;
1419 if (tb[TCA_CBQ_LSSOPT-1] &&
1420 RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt))
1421 return -EINVAL;
1423 r = RTA_DATA(tb[TCA_CBQ_RATE-1]);
1425 if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB-1])) == NULL)
1426 return -EINVAL;
1428 q->link.refcnt = 1;
1429 q->link.sibling = &q->link;
1430 q->link.classid = sch->handle;
1431 q->link.qdisc = sch;
1432 if (!(q->link.q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops,
1433 sch->handle)))
1434 q->link.q = &noop_qdisc;
1436 q->link.priority = TC_CBQ_MAXPRIO-1;
1437 q->link.priority2 = TC_CBQ_MAXPRIO-1;
1438 q->link.cpriority = TC_CBQ_MAXPRIO-1;
1439 q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC;
1440 q->link.overlimit = cbq_ovl_classic;
1441 q->link.allot = psched_mtu(sch->dev);
1442 q->link.quantum = q->link.allot;
1443 q->link.weight = q->link.R_tab->rate.rate;
1445 q->link.ewma_log = TC_CBQ_DEF_EWMA;
1446 q->link.avpkt = q->link.allot/2;
1447 q->link.minidle = -0x7FFFFFFF;
1448 q->link.stats_lock = &sch->dev->queue_lock;
1450 init_timer(&q->wd_timer);
1451 q->wd_timer.data = (unsigned long)sch;
1452 q->wd_timer.function = cbq_watchdog;
1453 init_timer(&q->delay_timer);
1454 q->delay_timer.data = (unsigned long)sch;
1455 q->delay_timer.function = cbq_undelay;
1456 q->toplevel = TC_CBQ_MAXLEVEL;
1457 PSCHED_GET_TIME(q->now);
1458 q->now_rt = q->now;
1460 cbq_link_class(&q->link);
1462 if (tb[TCA_CBQ_LSSOPT-1])
1463 cbq_set_lss(&q->link, RTA_DATA(tb[TCA_CBQ_LSSOPT-1]));
1465 cbq_addprio(q, &q->link);
1466 return 0;
1469 static __inline__ int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1471 unsigned char *b = skb->tail;
1473 RTA_PUT(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate);
1474 return skb->len;
1476 rtattr_failure:
1477 skb_trim(skb, b - skb->data);
1478 return -1;
1481 static __inline__ int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1483 unsigned char *b = skb->tail;
1484 struct tc_cbq_lssopt opt;
1486 opt.flags = 0;
1487 if (cl->borrow == NULL)
1488 opt.flags |= TCF_CBQ_LSS_BOUNDED;
1489 if (cl->share == NULL)
1490 opt.flags |= TCF_CBQ_LSS_ISOLATED;
1491 opt.ewma_log = cl->ewma_log;
1492 opt.level = cl->level;
1493 opt.avpkt = cl->avpkt;
1494 opt.maxidle = cl->maxidle;
1495 opt.minidle = (u32)(-cl->minidle);
1496 opt.offtime = cl->offtime;
1497 opt.change = ~0;
1498 RTA_PUT(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt);
1499 return skb->len;
1501 rtattr_failure:
1502 skb_trim(skb, b - skb->data);
1503 return -1;
1506 static __inline__ int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1508 unsigned char *b = skb->tail;
1509 struct tc_cbq_wrropt opt;
1511 opt.flags = 0;
1512 opt.allot = cl->allot;
1513 opt.priority = cl->priority+1;
1514 opt.cpriority = cl->cpriority+1;
1515 opt.weight = cl->weight;
1516 RTA_PUT(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt);
1517 return skb->len;
1519 rtattr_failure:
1520 skb_trim(skb, b - skb->data);
1521 return -1;
1524 static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl)
1526 unsigned char *b = skb->tail;
1527 struct tc_cbq_ovl opt;
1529 opt.strategy = cl->ovl_strategy;
1530 opt.priority2 = cl->priority2+1;
1531 opt.pad = 0;
1532 opt.penalty = (cl->penalty*1000)/HZ;
1533 RTA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt);
1534 return skb->len;
1536 rtattr_failure:
1537 skb_trim(skb, b - skb->data);
1538 return -1;
1541 static __inline__ int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1543 unsigned char *b = skb->tail;
1544 struct tc_cbq_fopt opt;
1546 if (cl->split || cl->defmap) {
1547 opt.split = cl->split ? cl->split->classid : 0;
1548 opt.defmap = cl->defmap;
1549 opt.defchange = ~0;
1550 RTA_PUT(skb, TCA_CBQ_FOPT, sizeof(opt), &opt);
1552 return skb->len;
1554 rtattr_failure:
1555 skb_trim(skb, b - skb->data);
1556 return -1;
1559 #ifdef CONFIG_NET_CLS_POLICE
1560 static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)
1562 unsigned char *b = skb->tail;
1563 struct tc_cbq_police opt;
1565 if (cl->police) {
1566 opt.police = cl->police;
1567 opt.__res1 = 0;
1568 opt.__res2 = 0;
1569 RTA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt);
1571 return skb->len;
1573 rtattr_failure:
1574 skb_trim(skb, b - skb->data);
1575 return -1;
1577 #endif
1579 static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
1581 if (cbq_dump_lss(skb, cl) < 0 ||
1582 cbq_dump_rate(skb, cl) < 0 ||
1583 cbq_dump_wrr(skb, cl) < 0 ||
1584 cbq_dump_ovl(skb, cl) < 0 ||
1585 #ifdef CONFIG_NET_CLS_POLICE
1586 cbq_dump_police(skb, cl) < 0 ||
1587 #endif
1588 cbq_dump_fopt(skb, cl) < 0)
1589 return -1;
1590 return 0;
1593 static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
1595 struct cbq_sched_data *q = qdisc_priv(sch);
1596 unsigned char *b = skb->tail;
1597 struct rtattr *rta;
1599 rta = (struct rtattr*)b;
1600 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
1601 if (cbq_dump_attr(skb, &q->link) < 0)
1602 goto rtattr_failure;
1603 rta->rta_len = skb->tail - b;
1604 return skb->len;
1606 rtattr_failure:
1607 skb_trim(skb, b - skb->data);
1608 return -1;
1611 static int
1612 cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1614 struct cbq_sched_data *q = qdisc_priv(sch);
1616 q->link.xstats.avgidle = q->link.avgidle;
1617 return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
1620 static int
1621 cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1622 struct sk_buff *skb, struct tcmsg *tcm)
1624 struct cbq_class *cl = (struct cbq_class*)arg;
1625 unsigned char *b = skb->tail;
1626 struct rtattr *rta;
1628 if (cl->tparent)
1629 tcm->tcm_parent = cl->tparent->classid;
1630 else
1631 tcm->tcm_parent = TC_H_ROOT;
1632 tcm->tcm_handle = cl->classid;
1633 tcm->tcm_info = cl->q->handle;
1635 rta = (struct rtattr*)b;
1636 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
1637 if (cbq_dump_attr(skb, cl) < 0)
1638 goto rtattr_failure;
1639 rta->rta_len = skb->tail - b;
1640 return skb->len;
1642 rtattr_failure:
1643 skb_trim(skb, b - skb->data);
1644 return -1;
1647 static int
1648 cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1649 struct gnet_dump *d)
1651 struct cbq_sched_data *q = qdisc_priv(sch);
1652 struct cbq_class *cl = (struct cbq_class*)arg;
1654 cl->qstats.qlen = cl->q->q.qlen;
1655 cl->xstats.avgidle = cl->avgidle;
1656 cl->xstats.undertime = 0;
1658 if (!PSCHED_IS_PASTPERFECT(cl->undertime))
1659 cl->xstats.undertime = PSCHED_TDIFF(cl->undertime, q->now);
1661 if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
1662 #ifdef CONFIG_NET_ESTIMATOR
1663 gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1664 #endif
1665 gnet_stats_copy_queue(d, &cl->qstats) < 0)
1666 return -1;
1668 return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1671 static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1672 struct Qdisc **old)
1674 struct cbq_class *cl = (struct cbq_class*)arg;
1676 if (cl) {
1677 if (new == NULL) {
1678 if ((new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops,
1679 cl->classid)) == NULL)
1680 return -ENOBUFS;
1681 } else {
1682 #ifdef CONFIG_NET_CLS_POLICE
1683 if (cl->police == TC_POLICE_RECLASSIFY)
1684 new->reshape_fail = cbq_reshape_fail;
1685 #endif
1687 sch_tree_lock(sch);
1688 *old = cl->q;
1689 cl->q = new;
1690 qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
1691 qdisc_reset(*old);
1692 sch_tree_unlock(sch);
1694 return 0;
1696 return -ENOENT;
1699 static struct Qdisc *
1700 cbq_leaf(struct Qdisc *sch, unsigned long arg)
1702 struct cbq_class *cl = (struct cbq_class*)arg;
1704 return cl ? cl->q : NULL;
1707 static unsigned long cbq_get(struct Qdisc *sch, u32 classid)
1709 struct cbq_sched_data *q = qdisc_priv(sch);
1710 struct cbq_class *cl = cbq_class_lookup(q, classid);
1712 if (cl) {
1713 cl->refcnt++;
1714 return (unsigned long)cl;
1716 return 0;
1719 static void cbq_destroy_filters(struct cbq_class *cl)
1721 struct tcf_proto *tp;
1723 while ((tp = cl->filter_list) != NULL) {
1724 cl->filter_list = tp->next;
1725 tcf_destroy(tp);
1729 static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1731 struct cbq_sched_data *q = qdisc_priv(sch);
1733 BUG_TRAP(!cl->filters);
1735 cbq_destroy_filters(cl);
1736 qdisc_destroy(cl->q);
1737 qdisc_put_rtab(cl->R_tab);
1738 #ifdef CONFIG_NET_ESTIMATOR
1739 gen_kill_estimator(&cl->bstats, &cl->rate_est);
1740 #endif
1741 if (cl != &q->link)
1742 kfree(cl);
1745 static void
1746 cbq_destroy(struct Qdisc* sch)
1748 struct cbq_sched_data *q = qdisc_priv(sch);
1749 struct cbq_class *cl;
1750 unsigned h;
1752 #ifdef CONFIG_NET_CLS_POLICE
1753 q->rx_class = NULL;
1754 #endif
1756 * Filters must be destroyed first because we don't destroy the
1757 * classes from root to leafs which means that filters can still
1758 * be bound to classes which have been destroyed already. --TGR '04
1760 for (h = 0; h < 16; h++)
1761 for (cl = q->classes[h]; cl; cl = cl->next)
1762 cbq_destroy_filters(cl);
1764 for (h = 0; h < 16; h++) {
1765 struct cbq_class *next;
1767 for (cl = q->classes[h]; cl; cl = next) {
1768 next = cl->next;
1769 cbq_destroy_class(sch, cl);
1774 static void cbq_put(struct Qdisc *sch, unsigned long arg)
1776 struct cbq_class *cl = (struct cbq_class*)arg;
1778 if (--cl->refcnt == 0) {
1779 #ifdef CONFIG_NET_CLS_POLICE
1780 struct cbq_sched_data *q = qdisc_priv(sch);
1782 spin_lock_bh(&sch->dev->queue_lock);
1783 if (q->rx_class == cl)
1784 q->rx_class = NULL;
1785 spin_unlock_bh(&sch->dev->queue_lock);
1786 #endif
1788 cbq_destroy_class(sch, cl);
1792 static int
1793 cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct rtattr **tca,
1794 unsigned long *arg)
1796 int err;
1797 struct cbq_sched_data *q = qdisc_priv(sch);
1798 struct cbq_class *cl = (struct cbq_class*)*arg;
1799 struct rtattr *opt = tca[TCA_OPTIONS-1];
1800 struct rtattr *tb[TCA_CBQ_MAX];
1801 struct cbq_class *parent;
1802 struct qdisc_rate_table *rtab = NULL;
1804 if (opt==NULL || rtattr_parse_nested(tb, TCA_CBQ_MAX, opt))
1805 return -EINVAL;
1807 if (tb[TCA_CBQ_OVL_STRATEGY-1] &&
1808 RTA_PAYLOAD(tb[TCA_CBQ_OVL_STRATEGY-1]) < sizeof(struct tc_cbq_ovl))
1809 return -EINVAL;
1811 if (tb[TCA_CBQ_FOPT-1] &&
1812 RTA_PAYLOAD(tb[TCA_CBQ_FOPT-1]) < sizeof(struct tc_cbq_fopt))
1813 return -EINVAL;
1815 if (tb[TCA_CBQ_RATE-1] &&
1816 RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec))
1817 return -EINVAL;
1819 if (tb[TCA_CBQ_LSSOPT-1] &&
1820 RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt))
1821 return -EINVAL;
1823 if (tb[TCA_CBQ_WRROPT-1] &&
1824 RTA_PAYLOAD(tb[TCA_CBQ_WRROPT-1]) < sizeof(struct tc_cbq_wrropt))
1825 return -EINVAL;
1827 #ifdef CONFIG_NET_CLS_POLICE
1828 if (tb[TCA_CBQ_POLICE-1] &&
1829 RTA_PAYLOAD(tb[TCA_CBQ_POLICE-1]) < sizeof(struct tc_cbq_police))
1830 return -EINVAL;
1831 #endif
1833 if (cl) {
1834 /* Check parent */
1835 if (parentid) {
1836 if (cl->tparent && cl->tparent->classid != parentid)
1837 return -EINVAL;
1838 if (!cl->tparent && parentid != TC_H_ROOT)
1839 return -EINVAL;
1842 if (tb[TCA_CBQ_RATE-1]) {
1843 rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]);
1844 if (rtab == NULL)
1845 return -EINVAL;
1848 /* Change class parameters */
1849 sch_tree_lock(sch);
1851 if (cl->next_alive != NULL)
1852 cbq_deactivate_class(cl);
1854 if (rtab) {
1855 rtab = xchg(&cl->R_tab, rtab);
1856 qdisc_put_rtab(rtab);
1859 if (tb[TCA_CBQ_LSSOPT-1])
1860 cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1]));
1862 if (tb[TCA_CBQ_WRROPT-1]) {
1863 cbq_rmprio(q, cl);
1864 cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1]));
1867 if (tb[TCA_CBQ_OVL_STRATEGY-1])
1868 cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1]));
1870 #ifdef CONFIG_NET_CLS_POLICE
1871 if (tb[TCA_CBQ_POLICE-1])
1872 cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1]));
1873 #endif
1875 if (tb[TCA_CBQ_FOPT-1])
1876 cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1]));
1878 if (cl->q->q.qlen)
1879 cbq_activate_class(cl);
1881 sch_tree_unlock(sch);
1883 #ifdef CONFIG_NET_ESTIMATOR
1884 if (tca[TCA_RATE-1])
1885 gen_replace_estimator(&cl->bstats, &cl->rate_est,
1886 cl->stats_lock, tca[TCA_RATE-1]);
1887 #endif
1888 return 0;
1891 if (parentid == TC_H_ROOT)
1892 return -EINVAL;
1894 if (tb[TCA_CBQ_WRROPT-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL ||
1895 tb[TCA_CBQ_LSSOPT-1] == NULL)
1896 return -EINVAL;
1898 rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]);
1899 if (rtab == NULL)
1900 return -EINVAL;
1902 if (classid) {
1903 err = -EINVAL;
1904 if (TC_H_MAJ(classid^sch->handle) || cbq_class_lookup(q, classid))
1905 goto failure;
1906 } else {
1907 int i;
1908 classid = TC_H_MAKE(sch->handle,0x8000);
1910 for (i=0; i<0x8000; i++) {
1911 if (++q->hgenerator >= 0x8000)
1912 q->hgenerator = 1;
1913 if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1914 break;
1916 err = -ENOSR;
1917 if (i >= 0x8000)
1918 goto failure;
1919 classid = classid|q->hgenerator;
1922 parent = &q->link;
1923 if (parentid) {
1924 parent = cbq_class_lookup(q, parentid);
1925 err = -EINVAL;
1926 if (parent == NULL)
1927 goto failure;
1930 err = -ENOBUFS;
1931 cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1932 if (cl == NULL)
1933 goto failure;
1934 cl->R_tab = rtab;
1935 rtab = NULL;
1936 cl->refcnt = 1;
1937 if (!(cl->q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops, classid)))
1938 cl->q = &noop_qdisc;
1939 cl->classid = classid;
1940 cl->tparent = parent;
1941 cl->qdisc = sch;
1942 cl->allot = parent->allot;
1943 cl->quantum = cl->allot;
1944 cl->weight = cl->R_tab->rate.rate;
1945 cl->stats_lock = &sch->dev->queue_lock;
1947 sch_tree_lock(sch);
1948 cbq_link_class(cl);
1949 cl->borrow = cl->tparent;
1950 if (cl->tparent != &q->link)
1951 cl->share = cl->tparent;
1952 cbq_adjust_levels(parent);
1953 cl->minidle = -0x7FFFFFFF;
1954 cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1]));
1955 cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1]));
1956 if (cl->ewma_log==0)
1957 cl->ewma_log = q->link.ewma_log;
1958 if (cl->maxidle==0)
1959 cl->maxidle = q->link.maxidle;
1960 if (cl->avpkt==0)
1961 cl->avpkt = q->link.avpkt;
1962 cl->overlimit = cbq_ovl_classic;
1963 if (tb[TCA_CBQ_OVL_STRATEGY-1])
1964 cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1]));
1965 #ifdef CONFIG_NET_CLS_POLICE
1966 if (tb[TCA_CBQ_POLICE-1])
1967 cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1]));
1968 #endif
1969 if (tb[TCA_CBQ_FOPT-1])
1970 cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1]));
1971 sch_tree_unlock(sch);
1973 #ifdef CONFIG_NET_ESTIMATOR
1974 if (tca[TCA_RATE-1])
1975 gen_new_estimator(&cl->bstats, &cl->rate_est,
1976 cl->stats_lock, tca[TCA_RATE-1]);
1977 #endif
1979 *arg = (unsigned long)cl;
1980 return 0;
1982 failure:
1983 qdisc_put_rtab(rtab);
1984 return err;
1987 static int cbq_delete(struct Qdisc *sch, unsigned long arg)
1989 struct cbq_sched_data *q = qdisc_priv(sch);
1990 struct cbq_class *cl = (struct cbq_class*)arg;
1992 if (cl->filters || cl->children || cl == &q->link)
1993 return -EBUSY;
1995 sch_tree_lock(sch);
1997 if (cl->next_alive)
1998 cbq_deactivate_class(cl);
2000 if (q->tx_borrowed == cl)
2001 q->tx_borrowed = q->tx_class;
2002 if (q->tx_class == cl) {
2003 q->tx_class = NULL;
2004 q->tx_borrowed = NULL;
2006 #ifdef CONFIG_NET_CLS_POLICE
2007 if (q->rx_class == cl)
2008 q->rx_class = NULL;
2009 #endif
2011 cbq_unlink_class(cl);
2012 cbq_adjust_levels(cl->tparent);
2013 cl->defmap = 0;
2014 cbq_sync_defmap(cl);
2016 cbq_rmprio(q, cl);
2017 sch_tree_unlock(sch);
2019 if (--cl->refcnt == 0)
2020 cbq_destroy_class(sch, cl);
2022 return 0;
2025 static struct tcf_proto **cbq_find_tcf(struct Qdisc *sch, unsigned long arg)
2027 struct cbq_sched_data *q = qdisc_priv(sch);
2028 struct cbq_class *cl = (struct cbq_class *)arg;
2030 if (cl == NULL)
2031 cl = &q->link;
2033 return &cl->filter_list;
2036 static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
2037 u32 classid)
2039 struct cbq_sched_data *q = qdisc_priv(sch);
2040 struct cbq_class *p = (struct cbq_class*)parent;
2041 struct cbq_class *cl = cbq_class_lookup(q, classid);
2043 if (cl) {
2044 if (p && p->level <= cl->level)
2045 return 0;
2046 cl->filters++;
2047 return (unsigned long)cl;
2049 return 0;
2052 static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
2054 struct cbq_class *cl = (struct cbq_class*)arg;
2056 cl->filters--;
2059 static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
2061 struct cbq_sched_data *q = qdisc_priv(sch);
2062 unsigned h;
2064 if (arg->stop)
2065 return;
2067 for (h = 0; h < 16; h++) {
2068 struct cbq_class *cl;
2070 for (cl = q->classes[h]; cl; cl = cl->next) {
2071 if (arg->count < arg->skip) {
2072 arg->count++;
2073 continue;
2075 if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
2076 arg->stop = 1;
2077 return;
2079 arg->count++;
2084 static struct Qdisc_class_ops cbq_class_ops = {
2085 .graft = cbq_graft,
2086 .leaf = cbq_leaf,
2087 .get = cbq_get,
2088 .put = cbq_put,
2089 .change = cbq_change_class,
2090 .delete = cbq_delete,
2091 .walk = cbq_walk,
2092 .tcf_chain = cbq_find_tcf,
2093 .bind_tcf = cbq_bind_filter,
2094 .unbind_tcf = cbq_unbind_filter,
2095 .dump = cbq_dump_class,
2096 .dump_stats = cbq_dump_class_stats,
2099 static struct Qdisc_ops cbq_qdisc_ops = {
2100 .next = NULL,
2101 .cl_ops = &cbq_class_ops,
2102 .id = "cbq",
2103 .priv_size = sizeof(struct cbq_sched_data),
2104 .enqueue = cbq_enqueue,
2105 .dequeue = cbq_dequeue,
2106 .requeue = cbq_requeue,
2107 .drop = cbq_drop,
2108 .init = cbq_init,
2109 .reset = cbq_reset,
2110 .destroy = cbq_destroy,
2111 .change = NULL,
2112 .dump = cbq_dump,
2113 .dump_stats = cbq_dump_stats,
2114 .owner = THIS_MODULE,
2117 static int __init cbq_module_init(void)
2119 return register_qdisc(&cbq_qdisc_ops);
2121 static void __exit cbq_module_exit(void)
2123 unregister_qdisc(&cbq_qdisc_ops);
2125 module_init(cbq_module_init)
2126 module_exit(cbq_module_exit)
2127 MODULE_LICENSE("GPL");