/*
 * net/sched/sch_csz.c	Clark-Shenker-Zhang scheduler.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */
#include <linux/config.h>
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/notifier.h>
#include <net/ip.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
/* Clark-Shenker-Zhang algorithm.
   =======================================

   SOURCE.

   David D. Clark, Scott Shenker and Lixia Zhang
   "Supporting Real-Time Applications in an Integrated Services Packet
   Network: Architecture and Mechanism".

   CBQ presents a flexible universal algorithm for packet scheduling,
   but it has pretty poor delay characteristics.
   Round-robin scheduling and link-sharing goals
   apparently contradict minimization of network delay and jitter.
   Moreover, correct handling of predictive flows seems to be
   impossible in CBQ.

   CSZ presents a more precise but less flexible and less efficient
   approach. As I understand it, the main idea is to create
   a WFQ flow for each guaranteed service and to allocate
   the rest of the bandwidth to the dummy flow-0. Flow-0 comprises
   the predictive services and the best effort traffic;
   it is handled by a priority scheduler with the highest
   priority band allocated to predictive services and the rest
   to the best effort packets.

   Note that in CSZ flows are NOT limited to their bandwidth. It
   is supposed that the flow passed admission control at the edge
   of the QoS network and does not need further shaping. Any
   attempt to improve the flow or to shape it to a token bucket
   at intermediate hops will introduce undesired delays and raise
   jitter.

   At the moment CSZ is the only scheduler that provides
   true guaranteed service. Other schemes (including CBQ)
   do not provide guaranteed delay and randomize jitter.
   There is a proof (Sally Floyd) that delay
   can be estimated by an IntServ-compliant formula.
   This result is true formally, but it is wrong in principle.
   It takes into account only round-robin delays,
   ignoring delays introduced by link sharing, i.e. overlimiting.
   Note that temporary overlimits are inevitable because
   real links are not ideal, and the real algorithm must take this
   into account.

   ALGORITHM.

   --- Notations.

   $B$ is the link bandwidth (bits/sec).

   $I$ is the set of all flows, including flow $0$.
   Every flow $a \in I$ has an associated bandwidth slice $r_a < 1$ and
   $\sum_{a \in I} r_a = 1$.

   --- Flow model.

   Let $m_a$ be the number of backlogged bits in flow $a$.
   The flow is {\em active} if $m_a > 0$.
   This number is a discontinuous function of time;
   when a packet $i$ arrives:

	m_a(t_i+0) - m_a(t_i-0) = L^i,

   where $L^i$ is the length of the arrived packet.
   The flow queue is drained continuously until $m_a == 0$:

	{d m_a \over dt} = - { B r_a \over \sum_{b \in A} r_b}.

   I.e. flow rates are their allocated rates proportionally
   scaled to take all the available link bandwidth. Obviously,
   this is not the only possible policy. For example, CBQ classes
   without borrowing would be modelled by:

	{d m_a \over dt} = - B r_a .

   More complicated hierarchical bandwidth allocation
   policies are possible, but unfortunately the basic
   flow equations have a simple solution only for proportional
   scaling.

   --- Departure times.

   We calculate the time until the last bit of the packet is sent:

	E_a^i(t) = { m_a(t_i) - \delta_a(t) \over r_a },

   where $\delta_a(t)$ is the number of bits drained since $t_i$.
   We have to evaluate $E_a^i$ for all queued packets,
   then find the packet with minimal $E_a^i$ and send it.

   This sounds good, but direct implementation of the algorithm
   is absolutely infeasible. Luckily, if flow rates
   are scaled proportionally, the equations have a simple solution.

   The differential equation for $E_a^i$ is

	{d E_a^i (t) \over dt } = - { d \delta_a(t) \over dt} { 1 \over r_a} =
	- { B \over \sum_{b \in A} r_b}

   with initial condition

	E_a^i (t_i) = { m_a(t_i) \over r_a } .

   Let's introduce an auxiliary function $R(t)$:

   --- Round number.

   Consider the following model: we rotate over the active flows,
   sending $r_a B$ bits from every flow, so that we send
   $B \sum_{a \in A} r_a$ bits per round, which takes
   $\sum_{a \in A} r_a$ seconds.

   Hence, $R(t)$ (the round number) is a monotonically increasing
   linear function of time when $A$ does not change:

	{ d R(t) \over dt } = { 1 \over \sum_{a \in A} r_a }

   and it is continuous when $A$ changes.

   The central observation is that the quantity
   $F_a^i = R(t) + E_a^i(t)/B$ does not depend on time at all!
   $R(t)$ does not depend on the flow, so that $F_a^i$ can be
   calculated only once on packet arrival, and we need not
   recalculate the $E$ numbers or re-sort the queues.
   The number $F_a^i$ is called the finish number of the packet.
   It is just the value of $R(t)$ when the last bit of the packet
   is sent out.

   The maximal finish number on a flow is called the finish number
   of the flow, and the minimal one is the start number of the flow.
   Apparently, a flow is active if and only if $F_a \geq R$.

   When a packet of length $L_i$ bits arrives on flow $a$ at time $t_i$,
   we calculate $F_a^i$ as:

   If the flow was inactive ($F_a < R$):
	$F_a^i = R(t) + {L_i \over B r_a}$
   otherwise
	$F_a^i = F_a + {L_i \over B r_a}$

   These equations complete the algorithm specification.

   It looks pretty hairy, but there is a simple
   procedure for solving these equations.
   See procedure csz_update(), which is a generalization of
   the algorithm from S. Keshav's thesis Chapter 3
   "Efficient Implementation of Fair Queueing".

   NOTES.

   * We implement only the simplest variant of CSZ,
   where flow-0 is an explicit 4-band priority FIFO.
   This is bad, but we need a "peek" operation in addition
   to "dequeue" to implement complete CSZ.
   I do not want to do that, unless it is absolutely
   necessary.

   * Primitive support for token bucket filtering
   is present as well. It directly contradicts CSZ, but
   even though the Internet is on the globe ... :-)
   "the edges of the network" really exist.

   BUGS.

   * Fixed point arithmetic is overcomplicated, suboptimal and even
   wrong. Check it later.  */
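/* A worked example of the finish-number bookkeeping above, for
   illustration only (the numbers are not from this code): suppose
   B = 1 Mbit/sec, r_a = 0.1, and the current round number is R = 5.00,
   with R measured in the same units as L/(B*r_a).  If flow a is idle
   and a 1000-bit packet arrives, the flow becomes active with

	F_a = R + L/(B*r_a) = 5.00 + 1000/(10^6 * 0.1) = 5.01.

   If a second 1000-bit packet arrives while the flow is still active
   (F_a >= R), the update chains off the flow's finish number instead:

	F_a = 5.01 + 0.01 = 5.02.

   The flow stays on the "finish" list until R(t), advancing at rate
   1/\sum_{b \in A} r_b, passes 5.02; csz_update() below performs this
   bookkeeping in fixed point. */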
/* This number is arbitrary */

#define CSZ_GUARANTEED		16
#define CSZ_FLOWS		(CSZ_GUARANTEED+4)
struct csz_head
{
	struct csz_head		*snext;
	struct csz_head		*sprev;
	struct csz_head		*fnext;
	struct csz_head		*fprev;
};
struct csz_flow
{
	struct csz_head		*snext;
	struct csz_head		*sprev;
	struct csz_head		*fnext;
	struct csz_head		*fprev;

/* Parameters */
	struct tc_ratespec	rate;
	struct tc_ratespec	slice;
	u32			*L_tab;	/* Lookup table for L/(B*r_a) values */
	unsigned long		limit;	/* Maximal length of queue */
#ifdef CSZ_PLUS_TBF
	struct tc_ratespec	peakrate;
	__u32			buffer;	/* Depth of token bucket, normalized
					   as L/(B*r_a) */
	__u32			mtu;
#endif

/* Variables */
#ifdef CSZ_PLUS_TBF
	unsigned long		tokens;	/* Tokens number: usecs */
	psched_time_t		t_tbf;
	unsigned long		R_tbf;
	int			throttled;
#endif
	unsigned		peeked;
	unsigned long		start;	/* Finish number of the first skb */
	unsigned long		finish;	/* Finish number of the flow */

	struct sk_buff_head	q;	/* FIFO queue */
};
#define L2R(f,L)	((f)->L_tab[(L)>>(f)->slice.cell_log])
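/* Illustration of the lookup (the cell_log value here is an assumed
   example, not taken from this file): the rate table maps a packet
   length to its precomputed cost L/(B*r_a).  With slice.cell_log == 3
   lengths are quantized into 8-byte cells, so a 100-byte packet indexes
   slot 100 >> 3 == 12 and L2R() returns L_tab[12].  The 1024-byte table
   (256 u32 entries) is copied from user space in csz_change() via the
   TCA_CSZ_RTAB attribute. */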
struct csz_sched_data
{
/* Parameters */
	unsigned char	rate_log;	/* fixed point position for rate;
					 * we do not really need it */
	unsigned char	R_log;		/* fixed point position for round number */
	unsigned char	delta_log;	/* 1<<delta_log is maximal timeout in usecs;
					 * 21 <-> 2.1sec is MAXIMAL value */

/* Variables */
	struct tcf_proto *filter_list;
	u8	prio2band[TC_PRIO_MAX+1];
#ifdef CSZ_PLUS_TBF
	struct timer_list wd_timer;
	long		wd_expires;
#endif
	psched_time_t	t_c;		/* Time check-point */
	unsigned long	R_c;		/* R-number check-point */
	unsigned long	rate;		/* Current sum of rates of active flows */
	struct csz_head	s;		/* Flows sorted by "start" */
	struct csz_head	f;		/* Flows sorted by "finish" */

	struct sk_buff_head	other[4];	/* Predicted (0) and best effort
						   classes (1,2,3) */
	struct csz_flow	flow[CSZ_GUARANTEED];	/* Array of flows */
};
/* These routines (csz_insert_finish and csz_insert_start) are
   the most time consuming part of the whole algorithm.

   We insert into a sorted list, so that the time is linear
   with respect to the number of active flows in the worst case.
   Note that the number of guaranteed flows is not very large,
   so that logarithmic algorithms (heaps etc.) are useless:
   they are slower than the linear one when the list length is <= 32.

   A heap would make sense if we used WFQ for the best effort
   flows, but SFQ is a better choice in that case.
 */
/* Insert flow "this" to the list "b" before
   flow with greater finish number.
 */

#if 0
/* Scan forward */
extern __inline__ void csz_insert_finish(struct csz_head *b,
					 struct csz_flow *this)
{
	struct csz_head *f = b->fnext;
	unsigned long finish = this->finish;

	while (f != b) {
		if (((struct csz_flow*)f)->finish - finish > 0)
			break;
		f = f->fnext;
	}
	this->fnext = f;
	this->fprev = f->fprev;
	this->fnext->fprev = this->fprev->fnext = (struct csz_head*)this;
}
#else
/* Scan backward */
extern __inline__ void csz_insert_finish(struct csz_head *b,
					 struct csz_flow *this)
{
	struct csz_head *f = b->fprev;
	unsigned long finish = this->finish;

	while (f != b) {
		if (((struct csz_flow*)f)->finish - finish <= 0)
			break;
		f = f->fprev;
	}
	this->fnext = f->fnext;
	this->fprev = f;
	this->fnext->fprev = this->fprev->fnext = (struct csz_head*)this;
}
#endif
/* Insert flow "this" to the list "b" before
   flow with greater start number.
 */

extern __inline__ void csz_insert_start(struct csz_head *b,
					struct csz_flow *this)
{
	struct csz_head *f = b->snext;
	unsigned long start = this->start;

	while (f != b) {
		if (((struct csz_flow*)f)->start - start > 0)
			break;
		f = f->snext;
	}
	this->snext = f;
	this->sprev = f->sprev;
	this->snext->sprev = this->sprev->snext = (struct csz_head*)this;
}
/* Calculate and return the current round number.
   This is another time consuming part, but
   it is impossible to avoid it.

   It costs O(N), which makes the whole algorithm useful only
   for playing with a model closest to the ideal fluid model.

   There exist less academic, but more practical modifications,
   which might have even better characteristics (WF2Q+, HPFQ, HFSC).
 */

static unsigned long csz_update(struct Qdisc *sch)
{
	struct csz_sched_data	*q = (struct csz_sched_data*)sch->data;
	struct csz_flow		*a;
	unsigned long		F;
	unsigned long		tmp;
	psched_time_t		now;
	unsigned long		delay;
	unsigned long		R_c;

	PSCHED_GET_TIME(now);
	delay = PSCHED_TDIFF_SAFE(now, q->t_c, 0, goto do_reset);

	if (delay>>q->delta_log) {
do_reset:
		/* Delta is too large.
		   It is possible if MTU/BW > 1<<q->delta_log
		   (i.e. configuration error) or because of hardware
		   fault. We have no choice...
		 */
		qdisc_reset(sch);
		return 0;
	}

	q->t_c = now;

	for (;;) {
		a = (struct csz_flow*)q->f.fnext;

		/* No more active flows. Reset R and exit. */
		if (a == (struct csz_flow*)&q->f) {
#ifdef CSZ_DEBUG
			if (q->rate) {
				printk("csz_update: rate!=0 on inactive csz\n");
				q->rate = 0;
			}
#endif
			q->R_c = 0;
			return 0;
		}

		F = a->finish;

#ifdef CSZ_DEBUG
		if (q->rate == 0) {
			printk("csz_update: rate=0 on active csz\n");
			goto do_reset;
		}
#endif

		/*
		 *	   tmp = (t - q->t_c)/q->rate;
		 */

		tmp = ((delay<<(31-q->delta_log))/q->rate)>>(31-q->delta_log+q->R_log);

		tmp += q->R_c;

		/* OK, this flow (and all flows with greater
		   finish numbers) is still active */
		if (F - tmp > 0)
			break;

		/* It is no longer active */

		a->fprev->fnext = a->fnext;
		a->fnext->fprev = a->fprev;

		/*
		 *	   q->t_c += (F - q->R_c)*q->rate
		 */

		tmp = ((F-q->R_c)*q->rate)<<q->R_log;
		R_c = F;
		q->rate -= a->slice.rate;

		if ((long)(delay - tmp) >= 0) {
			delay -= tmp;
			continue;
		}
		delay = 0;
	}

	q->R_c = tmp;
	return tmp;
}
unsigned csz_classify(struct sk_buff *skb, struct csz_sched_data *q)
{
	return CSZ_GUARANTEED;
}
static int
csz_enqueue(struct sk_buff *skb, struct Qdisc* sch)
{
	struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
	unsigned flow_id = csz_classify(skb, q);
	unsigned long R;
	int prio = 0;
	struct csz_flow *this;

	if (flow_id >= CSZ_GUARANTEED) {
		prio = flow_id - CSZ_GUARANTEED;
		flow_id = 0;
	}

	this = &q->flow[flow_id];
	if (this->q.qlen >= this->limit || this->L_tab == NULL) {
		sch->stats.drops++;
		kfree_skb(skb);
		return 0;
	}

	R = csz_update(sch);

	if ((long)(this->finish - R) >= 0) {
		/* It was active */
		this->finish += L2R(this,skb->len);
	} else {
		/* It is inactive; activate it */
		this->finish = R + L2R(this,skb->len);
		q->rate += this->slice.rate;
		csz_insert_finish(&q->f, this);
	}

	/* If this flow was empty, remember start number
	   and insert it into start queue */
	if (this->q.qlen == 0) {
		this->start = this->finish;
		csz_insert_start(&q->s, this);
	}
	if (flow_id)
		skb_queue_tail(&this->q, skb);
	else
		skb_queue_tail(&q->other[prio], skb);
	sch->q.qlen++;
	sch->stats.bytes += skb->len;
	sch->stats.packets++;
	return 1;
}
static __inline__ struct sk_buff *
skb_dequeue_best(struct csz_sched_data * q)
{
	int i;
	struct sk_buff *skb;

	for (i=0; i<4; i++) {
		skb = skb_dequeue(&q->other[i]);
		if (skb) {
			q->flow[0].q.qlen--;
			return skb;
		}
	}
	return NULL;
}
static __inline__ struct sk_buff *
skb_peek_best(struct csz_sched_data * q)
{
	int i;
	struct sk_buff *skb;

	for (i=0; i<4; i++) {
		skb = skb_peek(&q->other[i]);
		if (skb)
			return skb;
	}
	return NULL;
}
#ifdef CSZ_PLUS_TBF

static void csz_watchdog(unsigned long arg)
{
	struct Qdisc *sch = (struct Qdisc*)arg;

	qdisc_wakeup(sch->dev);
}

static __inline__ void
csz_move_queue(struct csz_flow *this, long delta)
{
	this->fprev->fnext = this->fnext;
	this->fnext->fprev = this->fprev;

	this->start += delta;
	this->finish += delta;

	csz_insert_finish(this);
}

static __inline__ int csz_enough_tokens(struct csz_sched_data *q,
					struct csz_flow *this,
					struct sk_buff *skb)
{
	long toks;
	long shift;
	psched_time_t now;

	PSCHED_GET_TIME(now);

	toks = PSCHED_TDIFF(now, t_tbf) + this->tokens - L2R(q,this,skb->len);

	shift = 0;
	if (this->throttled) {
		/* Remember the a posteriori delay */

		unsigned long R = csz_update(q);
		shift = R - this->R_tbf;
		this->R_tbf = R;
	}

	if (toks >= 0) {
		/* Now we have enough tokens to proceed */

		this->tokens = toks <= this->depth ? toks : this->depth;
		this->t_tbf = now;

		if (!this->throttled)
			return 1;

		/* The flow was throttled. Update its start & finish numbers
		   with the delay calculated a posteriori.
		 */

		this->throttled = 0;
		if (shift > 0)
			csz_move_queue(this, shift);
		return 1;
	}

	if (!this->throttled) {
		/* The flow has just been throttled; remember the
		   current round number to calculate the a posteriori delay
		 */
		this->throttled = 1;
		this->R_tbf = csz_update(q);
	}

	/* Move the whole queue to the time when it will be allowed to send.
	   We should translate time to a round number, but that is impossible,
	   so we make the most conservative estimate, i.e. we suppose
	   that only this flow is active and, hence, R = t.
	   Really, toks <= R <= toks/r_a.

	   This a priori shift in R will be adjusted later to reflect
	   the real delay. We cannot avoid it because:
	   - a throttled flow continues to be active from the viewpoint
	     of CSZ, so that it would acquire the highest priority
	     if start numbers were not adjusted.
	   - Eventually, the finish number would become less than the round
	     number and the flow would be declared inactive.
	 */

	toks = -toks;

	/* Remember that we should start the watchdog */
	if (toks < q->wd_expires)
		q->wd_expires = toks;

	toks >>= q->R_log;
	shift += toks;
	if (shift > 0) {
		this->R_tbf += toks;
		csz_move_queue(this, shift);
	}
	csz_insert_start(this);
	return 0;
}
#endif
static struct sk_buff *
csz_dequeue(struct Qdisc* sch)
{
	struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
	struct sk_buff *skb;
	struct csz_flow *this;

#ifdef CSZ_PLUS_TBF
	q->wd_expires = 0;
#endif
	this = (struct csz_flow*)q->s.snext;

	while (this != (struct csz_flow*)&q->s) {

		/* First of all: unlink from start list */
		this->sprev->snext = this->snext;
		this->snext->sprev = this->sprev;

		if (this != &q->flow[0]) {	/* Guaranteed flow */
			skb = __skb_dequeue(&this->q);
			if (skb) {
#ifdef CSZ_PLUS_TBF
				if (this->depth) {
					if (!csz_enough_tokens(q, this, skb))
						continue;
				}
#endif
				if (this->q.qlen) {
					struct sk_buff *nskb = skb_peek(&this->q);
					this->start += L2R(this,nskb->len);
					csz_insert_start(&q->s, this);
				}
				sch->q.qlen--;
				return skb;
			}
		} else {	/* Predicted or best effort flow */
			skb = skb_dequeue_best(q);
			if (skb) {
				unsigned peeked = this->peeked;
				this->peeked = 0;

				if (--this->q.qlen) {
					struct sk_buff *nskb;
					unsigned dequeued = L2R(this,skb->len);

					/* We did not get the same thing that we
					   peeked earlier; adjust the start number
					 */
					if (peeked != dequeued && peeked)
						this->start += dequeued - peeked;

					nskb = skb_peek_best(q);
					peeked = L2R(this,nskb->len);
					this->start += peeked;
					this->peeked = peeked;
					csz_insert_start(&q->s, this);
				}
				sch->q.qlen--;
				return skb;
			}
		}
	}
#ifdef CSZ_PLUS_TBF
	/* We are about to return no skb.
	   Schedule the watchdog timer if this happened because of shaping.
	 */
	if (q->wd_expires) {
		unsigned long delay = PSCHED_US2JIFFIE(q->wd_expires);
		del_timer(&q->wd_timer);
		if (delay == 0)
			delay = 1;
		q->wd_timer.expires = jiffies + delay;
		add_timer(&q->wd_timer);
		sch->stats.overlimits++;
	}
#endif
	return NULL;
}
static void
csz_reset(struct Qdisc* sch)
{
	struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
	int i;

	for (i=0; i<4; i++)
		skb_queue_purge(&q->other[i]);

	for (i=0; i<CSZ_GUARANTEED; i++) {
		struct csz_flow *this = q->flow + i;
		skb_queue_purge(&this->q);
		this->snext = this->sprev =
		this->fnext = this->fprev = (struct csz_head*)this;
		this->start = this->finish = 0;
	}
	q->s.snext = q->s.sprev = &q->s;
	q->f.fnext = q->f.fprev = &q->f;
	q->R_c = 0;
#ifdef CSZ_PLUS_TBF
	PSCHED_GET_TIME(&q->t_tbf);
	q->tokens = q->depth;
	del_timer(&q->wd_timer);
#endif
	sch->q.qlen = 0;
}
static void
csz_destroy(struct Qdisc* sch)
{
	MOD_DEC_USE_COUNT;
}
static int csz_init(struct Qdisc *sch, struct rtattr *opt)
{
	struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
	struct rtattr *tb[TCA_CSZ_PTAB];
	struct tc_csz_qopt *qopt;
	int i;

	rtattr_parse(tb, TCA_CSZ_PTAB, RTA_DATA(opt), RTA_PAYLOAD(opt));
	if (tb[TCA_CSZ_PARMS-1] == NULL ||
	    RTA_PAYLOAD(tb[TCA_CSZ_PARMS-1]) < sizeof(*qopt))
		return -EINVAL;
	qopt = RTA_DATA(tb[TCA_CSZ_PARMS-1]);

	q->R_log = qopt->R_log;
	q->delta_log = qopt->delta_log;
	for (i=0; i<=TC_PRIO_MAX; i++) {
		if (qopt->priomap[i] >= CSZ_FLOWS)
			return -EINVAL;
		q->prio2band[i] = qopt->priomap[i];
	}

	for (i=0; i<4; i++)
		skb_queue_head_init(&q->other[i]);

	for (i=0; i<CSZ_GUARANTEED; i++) {
		struct csz_flow *this = q->flow + i;
		skb_queue_head_init(&this->q);
		this->snext = this->sprev =
		this->fnext = this->fprev = (struct csz_head*)this;
		this->start = this->finish = 0;
	}
	q->s.snext = q->s.sprev = &q->s;
	q->f.fnext = q->f.fprev = &q->f;
	q->R_c = 0;
#ifdef CSZ_PLUS_TBF
	init_timer(&q->wd_timer);
	q->wd_timer.data = (unsigned long)sch;
	q->wd_timer.function = csz_watchdog;
#endif
	MOD_INC_USE_COUNT;
	return 0;
}
#ifdef CONFIG_RTNETLINK
static int csz_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
	unsigned char *b = skb->tail;
	struct rtattr *rta;
	struct tc_csz_qopt opt;

	rta = (struct rtattr*)b;
	RTA_PUT(skb, TCA_OPTIONS, 0, NULL);

	opt.flows = CSZ_FLOWS;
	memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX+1);
	RTA_PUT(skb, TCA_CSZ_PARMS, sizeof(opt), &opt);
	rta->rta_len = skb->tail - b;

	return skb->len;

rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}
#endif
static int csz_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
		     struct Qdisc **old)
{
	return -EINVAL;
}
static unsigned long csz_get(struct Qdisc *sch, u32 classid)
{
	struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
	unsigned long band = TC_H_MIN(classid) - 1;

	if (band >= CSZ_FLOWS)
		return 0;

	if (band < CSZ_GUARANTEED && q->flow[band].L_tab == NULL)
		return 0;

	return band+1;
}
static void csz_put(struct Qdisc *sch, unsigned long cl)
{
	return;
}
static int csz_change(struct Qdisc *sch, u32 handle, u32 parent, struct rtattr **tca, unsigned long *arg)
{
	unsigned long cl = *arg;
	struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
	struct rtattr *opt = tca[TCA_OPTIONS-1];
	struct rtattr *tb[TCA_CSZ_PTAB];
	struct tc_csz_copt *copt;

	rtattr_parse(tb, TCA_CSZ_PTAB, RTA_DATA(opt), RTA_PAYLOAD(opt));
	if (tb[TCA_CSZ_PARMS-1] == NULL ||
	    RTA_PAYLOAD(tb[TCA_CSZ_PARMS-1]) < sizeof(*copt))
		return -EINVAL;
	copt = RTA_DATA(tb[TCA_CSZ_PARMS-1]);

	if (tb[TCA_CSZ_RTAB-1] &&
	    RTA_PAYLOAD(tb[TCA_CSZ_RTAB-1]) < 1024)
		return -EINVAL;

	if (cl) {
		struct csz_flow *a;
		cl--;
		if (cl >= CSZ_FLOWS)
			return -ENOENT;
		if (cl >= CSZ_GUARANTEED || q->flow[cl].L_tab == NULL)
			return -EINVAL;

		a = &q->flow[cl];

		start_bh_atomic();
#if 0
		a->rate_log = copt->rate_log;
#endif
#ifdef CSZ_PLUS_TBF
		a->limit = copt->limit;
		a->rate = copt->rate;
		a->buffer = copt->buffer;
		a->mtu = copt->mtu;
#endif

		if (tb[TCA_CSZ_RTAB-1])
			memcpy(a->L_tab, RTA_DATA(tb[TCA_CSZ_RTAB-1]), 1024);

		end_bh_atomic();
		return 0;
	}
	/* NI */
	return 0;
}
static int csz_delete(struct Qdisc *sch, unsigned long cl)
{
	struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
	struct csz_flow *a;

	cl--;

	if (cl >= CSZ_FLOWS)
		return -ENOENT;
	if (cl >= CSZ_GUARANTEED || q->flow[cl].L_tab == NULL)
		return -EINVAL;

	a = &q->flow[cl];

	start_bh_atomic();
	a->fprev->fnext = a->fnext;
	a->fnext->fprev = a->fprev;
	a->sprev->snext = a->snext;
	a->snext->sprev = a->sprev;
	a->start = a->finish = 0;
	kfree(xchg(&q->flow[cl].L_tab, NULL));
	end_bh_atomic();

	return 0;
}
#ifdef CONFIG_RTNETLINK
static int csz_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm)
{
	struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
	unsigned char *b = skb->tail;
	struct rtattr *rta;
	struct tc_csz_copt opt;

	tcm->tcm_handle = sch->handle|cl;

	cl--;

	if (cl > CSZ_FLOWS)
		goto rtattr_failure;

	if (cl < CSZ_GUARANTEED) {
		struct csz_flow *f = &q->flow[cl];

		if (f->L_tab == NULL)
			goto rtattr_failure;

		rta = (struct rtattr*)b;
		RTA_PUT(skb, TCA_OPTIONS, 0, NULL);

		opt.limit = f->limit;
		opt.rate = f->rate;
		opt.slice = f->slice;
		memset(&opt.peakrate, 0, sizeof(opt.peakrate));
#ifdef CSZ_PLUS_TBF
		opt.buffer = f->buffer;
		opt.mtu = f->mtu;
#else
		opt.buffer = 0;
		opt.mtu = 0;
#endif

		RTA_PUT(skb, TCA_CSZ_PARMS, sizeof(opt), &opt);
		rta->rta_len = skb->tail - b;
	}

	return skb->len;

rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}
#endif
static void csz_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
	int prio = 0;

	if (arg->stop)
		return;

	for (prio = 0; prio < CSZ_FLOWS; prio++) {
		if (arg->count < arg->skip) {
			arg->count++;
			continue;
		}
		if (prio < CSZ_GUARANTEED && q->flow[prio].L_tab == NULL) {
			arg->count++;
			continue;
		}
		if (arg->fn(sch, prio+1, arg) < 0) {
			arg->stop = 1;
			break;
		}
		arg->count++;
	}
}
static struct tcf_proto ** csz_find_tcf(struct Qdisc *sch, unsigned long cl)
{
	struct csz_sched_data *q = (struct csz_sched_data *)sch->data;

	if (cl)
		return NULL;

	return &q->filter_list;
}
struct Qdisc_class_ops csz_class_ops =
{
	csz_graft,
	csz_get,
	csz_put,
	csz_change,
	csz_delete,
	csz_walk,

	csz_find_tcf,
	csz_get,
	csz_put,

#ifdef CONFIG_RTNETLINK
	csz_dump_class,
#endif
};
struct Qdisc_ops csz_qdisc_ops =
{
	NULL,
	&csz_class_ops,
	"csz",
	sizeof(struct csz_sched_data),

	csz_enqueue,
	csz_dequeue,
	NULL,
	NULL,

	csz_init,
	csz_reset,
	csz_destroy,

#ifdef CONFIG_RTNETLINK
	csz_dump,
#endif
};
#ifdef MODULE
int init_module(void)
{
	return register_qdisc(&csz_qdisc_ops);
}

void cleanup_module(void)
{
	unregister_qdisc(&csz_qdisc_ops);
}
#endif