Ok. I didn't make 2.4.0 in 2000. Tough. I tried, but we had some
[davej-history.git] / net / sched / sch_api.c
blob8a046514f0f7a3013f2880eb076dc05f340419c1
1 /*
2 * net/sched/sch_api.c Packet scheduler API.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
11 * Fixes:
13 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
18 #include <linux/config.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/sched.h>
22 #include <linux/string.h>
23 #include <linux/mm.h>
24 #include <linux/socket.h>
25 #include <linux/sockios.h>
26 #include <linux/in.h>
27 #include <linux/errno.h>
28 #include <linux/interrupt.h>
29 #include <linux/netdevice.h>
30 #include <linux/skbuff.h>
31 #include <linux/rtnetlink.h>
32 #include <linux/init.h>
33 #include <linux/proc_fs.h>
34 #include <linux/kmod.h>
36 #include <net/sock.h>
37 #include <net/pkt_sched.h>
39 #include <asm/processor.h>
40 #include <asm/uaccess.h>
41 #include <asm/system.h>
42 #include <asm/bitops.h>
44 #ifdef CONFIG_RTNETLINK
45 static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
46 struct Qdisc *old, struct Qdisc *new);
47 static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
48 struct Qdisc *q, unsigned long cl, int event);
49 #endif
53 Short review.
54 -------------
56 This file consists of two interrelated parts:
58 1. queueing disciplines manager frontend.
59 2. traffic classes manager frontend.
61 Generally, queueing discipline ("qdisc") is a black box,
62 which is able to enqueue packets and to dequeue them (when
63 device is ready to send something) in order and at times
64 determined by algorithm hidden in it.
   qdiscs are divided into two categories:
67 - "queues", which have no internal structure visible from outside.
68 - "schedulers", which split all the packets to "traffic classes",
69 using "packet classifiers" (look at cls_api.c)
71 In turn, classes may have child qdiscs (as rule, queues)
72 attached to them etc. etc. etc.
74 The goal of the routines in this file is to translate
75 information supplied by user in the form of handles
76 to more intelligible for kernel form, to make some sanity
77 checks and part of work, which is common to all qdiscs
78 and to provide rtnetlink notifications.
80 All real intelligent work is done inside qdisc modules.
84 Every discipline has two major routines: enqueue and dequeue.
86 ---dequeue
88 dequeue usually returns a skb to send. It is allowed to return NULL,
89 but it does not mean that queue is empty, it just means that
90 discipline does not want to send anything this time.
91 Queue is really empty if q->q.qlen == 0.
92 For complicated disciplines with multiple queues q->q is not
93 real packet queue, but however q->q.qlen must be valid.
95 ---enqueue
97 enqueue returns 0, if packet was enqueued successfully.
98 If packet (this one or another one) was dropped, it returns
99 not zero error code.
100 NET_XMIT_DROP - this packet dropped
101 Expected action: do not backoff, but wait until queue will clear.
102 NET_XMIT_CN - probably this packet enqueued, but another one dropped.
103 Expected action: backoff or ignore
104 NET_XMIT_POLICED - dropped by police.
105 Expected action: backoff or error to real-time apps.
107 Auxiliary routines:
109 ---requeue
111 requeues once dequeued packet. It is used for non-standard or
112 just buggy devices, which can defer output even if dev->tbusy=0.
114 ---reset
116 returns qdisc to initial state: purge all buffers, clear all
117 timers, counters (except for statistics) etc.
119 ---init
121 initializes newly created qdisc.
123 ---destroy
125 destroys resources allocated by init and during lifetime of qdisc.
127 ---change
129 changes qdisc parameters.
132 /* Protects list of registered TC modules. It is pure SMP lock. */
133 static rwlock_t qdisc_mod_lock = RW_LOCK_UNLOCKED;
136 /************************************************
137 * Queueing disciplines manipulation. *
138 ************************************************/
141 /* The list of all installed queueing disciplines. */
143 static struct Qdisc_ops *qdisc_base = NULL;
145 /* Register/uregister queueing discipline */
147 int register_qdisc(struct Qdisc_ops *qops)
149 struct Qdisc_ops *q, **qp;
151 write_lock(&qdisc_mod_lock);
152 for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next) {
153 if (strcmp(qops->id, q->id) == 0) {
154 write_unlock(&qdisc_mod_lock);
155 return -EEXIST;
159 if (qops->enqueue == NULL)
160 qops->enqueue = noop_qdisc_ops.enqueue;
161 if (qops->requeue == NULL)
162 qops->requeue = noop_qdisc_ops.requeue;
163 if (qops->dequeue == NULL)
164 qops->dequeue = noop_qdisc_ops.dequeue;
166 qops->next = NULL;
167 *qp = qops;
168 write_unlock(&qdisc_mod_lock);
169 return 0;
172 int unregister_qdisc(struct Qdisc_ops *qops)
174 struct Qdisc_ops *q, **qp;
175 int err = -ENOENT;
177 write_lock(&qdisc_mod_lock);
178 for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next)
179 if (q == qops)
180 break;
181 if (q) {
182 *qp = q->next;
183 q->next = NULL;
184 err = 0;
186 write_unlock(&qdisc_mod_lock);
187 return err;
190 /* We know handle. Find qdisc among all qdisc's attached to device
191 (root qdisc, all its children, children of children etc.)
194 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
196 struct Qdisc *q;
198 for (q = dev->qdisc_list; q; q = q->next) {
199 if (q->handle == handle)
200 return q;
202 return NULL;
205 struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
207 unsigned long cl;
208 struct Qdisc *leaf;
209 struct Qdisc_class_ops *cops = p->ops->cl_ops;
211 if (cops == NULL)
212 return NULL;
213 cl = cops->get(p, classid);
215 if (cl == 0)
216 return NULL;
217 leaf = cops->leaf(p, cl);
218 cops->put(p, cl);
219 return leaf;
222 /* Find queueing discipline by name */
224 struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind)
226 struct Qdisc_ops *q = NULL;
228 if (kind) {
229 read_lock(&qdisc_mod_lock);
230 for (q = qdisc_base; q; q = q->next) {
231 if (rtattr_strcmp(kind, q->id) == 0)
232 break;
234 read_unlock(&qdisc_mod_lock);
236 return q;
239 static struct qdisc_rate_table *qdisc_rtab_list;
241 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab)
243 struct qdisc_rate_table *rtab;
245 for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
246 if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
247 rtab->refcnt++;
248 return rtab;
252 if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024)
253 return NULL;
255 rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
256 if (rtab) {
257 rtab->rate = *r;
258 rtab->refcnt = 1;
259 memcpy(rtab->data, RTA_DATA(tab), 1024);
260 rtab->next = qdisc_rtab_list;
261 qdisc_rtab_list = rtab;
263 return rtab;
266 void qdisc_put_rtab(struct qdisc_rate_table *tab)
268 struct qdisc_rate_table *rtab, **rtabp;
270 if (!tab || --tab->refcnt)
271 return;
273 for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) {
274 if (rtab == tab) {
275 *rtabp = rtab->next;
276 kfree(rtab);
277 return;
283 /* Allocate an unique handle from space managed by kernel */
285 u32 qdisc_alloc_handle(struct net_device *dev)
287 int i = 0x10000;
288 static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
290 do {
291 autohandle += TC_H_MAKE(0x10000U, 0);
292 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
293 autohandle = TC_H_MAKE(0x80000000U, 0);
294 } while (qdisc_lookup(dev, autohandle) && --i > 0);
296 return i>0 ? autohandle : 0;
299 /* Attach toplevel qdisc to device dev */
301 static struct Qdisc *
302 dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc)
304 struct Qdisc *oqdisc;
306 if (dev->flags & IFF_UP)
307 dev_deactivate(dev);
309 write_lock(&qdisc_tree_lock);
310 spin_lock_bh(&dev->queue_lock);
311 if (qdisc && qdisc->flags&TCQ_F_INGRES) {
312 oqdisc = dev->qdisc_ingress;
313 /* Prune old scheduler */
314 if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) {
315 /* delete */
316 qdisc_reset(oqdisc);
317 dev->qdisc_ingress = NULL;
318 } else { /* new */
319 dev->qdisc_ingress = qdisc;
322 } else {
324 oqdisc = dev->qdisc_sleeping;
326 /* Prune old scheduler */
327 if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
328 qdisc_reset(oqdisc);
330 /* ... and graft new one */
331 if (qdisc == NULL)
332 qdisc = &noop_qdisc;
333 dev->qdisc_sleeping = qdisc;
334 dev->qdisc = &noop_qdisc;
337 spin_unlock_bh(&dev->queue_lock);
338 write_unlock(&qdisc_tree_lock);
340 if (dev->flags & IFF_UP)
341 dev_activate(dev);
343 return oqdisc;
347 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
348 to device "dev".
350 Old qdisc is not destroyed but returned in *old.
353 int qdisc_graft(struct net_device *dev, struct Qdisc *parent, u32 classid,
354 struct Qdisc *new, struct Qdisc **old)
356 int err = 0;
357 struct Qdisc *q = *old;
360 if (parent == NULL) {
361 if (q && q->flags&TCQ_F_INGRES) {
362 *old = dev_graft_qdisc(dev, q);
363 } else {
364 *old = dev_graft_qdisc(dev, new);
366 } else {
367 struct Qdisc_class_ops *cops = parent->ops->cl_ops;
369 err = -EINVAL;
371 if (cops) {
372 unsigned long cl = cops->get(parent, classid);
373 if (cl) {
374 err = cops->graft(parent, cl, new, old);
375 cops->put(parent, cl);
379 return err;
382 #ifdef CONFIG_RTNETLINK
385 Allocate and initialize new qdisc.
387 Parameters are passed via opt.
390 static struct Qdisc *
391 qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
393 int err;
394 struct rtattr *kind = tca[TCA_KIND-1];
395 struct Qdisc *sch = NULL;
396 struct Qdisc_ops *ops;
397 int size;
399 ops = qdisc_lookup_ops(kind);
400 #ifdef CONFIG_KMOD
401 if (ops==NULL && tca[TCA_KIND-1] != NULL) {
402 char module_name[4 + IFNAMSIZ + 1];
404 if (RTA_PAYLOAD(kind) <= IFNAMSIZ) {
405 sprintf(module_name, "sch_%s", (char*)RTA_DATA(kind));
406 request_module (module_name);
407 ops = qdisc_lookup_ops(kind);
410 #endif
412 err = -EINVAL;
413 if (ops == NULL)
414 goto err_out;
416 size = sizeof(*sch) + ops->priv_size;
418 sch = kmalloc(size, GFP_KERNEL);
419 err = -ENOBUFS;
420 if (!sch)
421 goto err_out;
423 /* Grrr... Resolve race condition with module unload */
425 err = -EINVAL;
426 if (ops != qdisc_lookup_ops(kind))
427 goto err_out;
429 memset(sch, 0, size);
431 skb_queue_head_init(&sch->q);
433 if (handle == TC_H_INGRESS)
434 sch->flags |= TCQ_F_INGRES;
436 sch->ops = ops;
437 sch->enqueue = ops->enqueue;
438 sch->dequeue = ops->dequeue;
439 sch->dev = dev;
440 atomic_set(&sch->refcnt, 1);
441 sch->stats.lock = &dev->queue_lock;
442 if (handle == 0) {
443 handle = qdisc_alloc_handle(dev);
444 err = -ENOMEM;
445 if (handle == 0)
446 goto err_out;
449 if (handle == TC_H_INGRESS)
450 sch->handle =TC_H_MAKE(TC_H_INGRESS, 0);
451 else
452 sch->handle = handle;
454 if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
455 write_lock(&qdisc_tree_lock);
456 sch->next = dev->qdisc_list;
457 dev->qdisc_list = sch;
458 write_unlock(&qdisc_tree_lock);
459 #ifdef CONFIG_NET_ESTIMATOR
460 if (tca[TCA_RATE-1])
461 qdisc_new_estimator(&sch->stats, tca[TCA_RATE-1]);
462 #endif
463 return sch;
466 err_out:
467 *errp = err;
468 if (sch)
469 kfree(sch);
470 return NULL;
473 static int qdisc_change(struct Qdisc *sch, struct rtattr **tca)
475 if (tca[TCA_OPTIONS-1]) {
476 int err;
478 if (sch->ops->change == NULL)
479 return -EINVAL;
480 err = sch->ops->change(sch, tca[TCA_OPTIONS-1]);
481 if (err)
482 return err;
484 #ifdef CONFIG_NET_ESTIMATOR
485 if (tca[TCA_RATE-1]) {
486 qdisc_kill_estimator(&sch->stats);
487 qdisc_new_estimator(&sch->stats, tca[TCA_RATE-1]);
489 #endif
490 return 0;
493 struct check_loop_arg
495 struct qdisc_walker w;
496 struct Qdisc *p;
497 int depth;
500 static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
502 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
504 struct check_loop_arg arg;
506 if (q->ops->cl_ops == NULL)
507 return 0;
509 arg.w.stop = arg.w.skip = arg.w.count = 0;
510 arg.w.fn = check_loop_fn;
511 arg.depth = depth;
512 arg.p = p;
513 q->ops->cl_ops->walk(q, &arg.w);
514 return arg.w.stop ? -ELOOP : 0;
517 static int
518 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
520 struct Qdisc *leaf;
521 struct Qdisc_class_ops *cops = q->ops->cl_ops;
522 struct check_loop_arg *arg = (struct check_loop_arg *)w;
524 leaf = cops->leaf(q, cl);
525 if (leaf) {
526 if (leaf == arg->p || arg->depth > 7)
527 return -ELOOP;
528 return check_loop(leaf, arg->p, arg->depth + 1);
530 return 0;
534 * Delete/get qdisc.
537 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
539 struct tcmsg *tcm = NLMSG_DATA(n);
540 struct rtattr **tca = arg;
541 struct net_device *dev;
542 u32 clid = tcm->tcm_parent;
543 struct Qdisc *q = NULL;
544 struct Qdisc *p = NULL;
545 int err;
547 if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
548 return -ENODEV;
550 if (clid) {
551 if (clid != TC_H_ROOT) {
552 if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
553 if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
554 return -ENOENT;
555 q = qdisc_leaf(p, clid);
556 } else { /* ingress */
557 q = dev->qdisc_ingress;
559 } else {
560 q = dev->qdisc_sleeping;
562 if (!q)
563 return -ENOENT;
565 if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
566 return -EINVAL;
567 } else {
568 if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
569 return -ENOENT;
572 if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
573 return -EINVAL;
575 if (n->nlmsg_type == RTM_DELQDISC) {
576 if (!clid)
577 return -EINVAL;
578 if (q->handle == 0)
579 return -ENOENT;
580 if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0)
581 return err;
582 if (q) {
583 qdisc_notify(skb, n, clid, q, NULL);
584 spin_lock_bh(&dev->queue_lock);
585 qdisc_destroy(q);
586 spin_unlock_bh(&dev->queue_lock);
588 } else {
589 qdisc_notify(skb, n, clid, NULL, q);
591 return 0;
595 Create/change qdisc.
598 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
600 struct tcmsg *tcm = NLMSG_DATA(n);
601 struct rtattr **tca = arg;
602 struct net_device *dev;
603 u32 clid = tcm->tcm_parent;
604 struct Qdisc *q = NULL;
605 struct Qdisc *p = NULL;
606 int err;
608 if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
609 return -ENODEV;
611 if (clid) {
612 if (clid != TC_H_ROOT) {
613 if (clid != TC_H_INGRESS) {
614 if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
615 return -ENOENT;
616 q = qdisc_leaf(p, clid);
617 } else { /*ingress */
618 q = dev->qdisc_ingress;
620 } else {
621 q = dev->qdisc_sleeping;
624 /* It may be default qdisc, ignore it */
625 if (q && q->handle == 0)
626 q = NULL;
628 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
629 if (tcm->tcm_handle) {
630 if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
631 return -EEXIST;
632 if (TC_H_MIN(tcm->tcm_handle))
633 return -EINVAL;
634 if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
635 goto create_n_graft;
636 if (n->nlmsg_flags&NLM_F_EXCL)
637 return -EEXIST;
638 if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
639 return -EINVAL;
640 if (q == p ||
641 (p && check_loop(q, p, 0)))
642 return -ELOOP;
643 atomic_inc(&q->refcnt);
644 goto graft;
645 } else {
646 if (q == NULL)
647 goto create_n_graft;
649 /* This magic test requires explanation.
651 * We know, that some child q is already
652 * attached to this parent and have choice:
653 * either to change it or to create/graft new one.
655 * 1. We are allowed to create/graft only
656 * if CREATE and REPLACE flags are set.
658 * 2. If EXCL is set, requestor wanted to say,
659 * that qdisc tcm_handle is not expected
660 * to exist, so that we choose create/graft too.
662 * 3. The last case is when no flags are set.
663 * Alas, it is sort of hole in API, we
664 * cannot decide what to do unambiguously.
665 * For now we select create/graft, if
666 * user gave KIND, which does not match existing.
668 if ((n->nlmsg_flags&NLM_F_CREATE) &&
669 (n->nlmsg_flags&NLM_F_REPLACE) &&
670 ((n->nlmsg_flags&NLM_F_EXCL) ||
671 (tca[TCA_KIND-1] &&
672 rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))))
673 goto create_n_graft;
676 } else {
677 if (!tcm->tcm_handle)
678 return -EINVAL;
679 q = qdisc_lookup(dev, tcm->tcm_handle);
682 /* Change qdisc parameters */
683 if (q == NULL)
684 return -ENOENT;
685 if (n->nlmsg_flags&NLM_F_EXCL)
686 return -EEXIST;
687 if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
688 return -EINVAL;
689 err = qdisc_change(q, tca);
690 if (err == 0)
691 qdisc_notify(skb, n, clid, NULL, q);
692 return err;
694 create_n_graft:
695 if (!(n->nlmsg_flags&NLM_F_CREATE))
696 return -ENOENT;
697 if (clid == TC_H_INGRESS)
698 q = qdisc_create(dev, tcm->tcm_parent, tca, &err);
699 else
700 q = qdisc_create(dev, tcm->tcm_handle, tca, &err);
701 if (q == NULL)
702 return err;
704 graft:
705 if (1) {
706 struct Qdisc *old_q = NULL;
707 err = qdisc_graft(dev, p, clid, q, &old_q);
708 if (err) {
709 if (q) {
710 spin_lock_bh(&dev->queue_lock);
711 qdisc_destroy(q);
712 spin_unlock_bh(&dev->queue_lock);
714 return err;
716 qdisc_notify(skb, n, clid, old_q, q);
717 if (old_q) {
718 spin_lock_bh(&dev->queue_lock);
719 qdisc_destroy(old_q);
720 spin_unlock_bh(&dev->queue_lock);
723 return 0;
726 int qdisc_copy_stats(struct sk_buff *skb, struct tc_stats *st)
728 spin_lock_bh(st->lock);
729 RTA_PUT(skb, TCA_STATS, (char*)&st->lock - (char*)st, st);
730 spin_unlock_bh(st->lock);
731 return 0;
733 rtattr_failure:
734 spin_unlock_bh(st->lock);
735 return -1;
739 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
740 u32 pid, u32 seq, unsigned flags, int event)
742 struct tcmsg *tcm;
743 struct nlmsghdr *nlh;
744 unsigned char *b = skb->tail;
746 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm));
747 nlh->nlmsg_flags = flags;
748 tcm = NLMSG_DATA(nlh);
749 tcm->tcm_family = AF_UNSPEC;
750 tcm->tcm_ifindex = q->dev ? q->dev->ifindex : 0;
751 tcm->tcm_parent = clid;
752 tcm->tcm_handle = q->handle;
753 tcm->tcm_info = atomic_read(&q->refcnt);
754 RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
755 if (q->ops->dump && q->ops->dump(q, skb) < 0)
756 goto rtattr_failure;
757 q->stats.qlen = q->q.qlen;
758 if (qdisc_copy_stats(skb, &q->stats))
759 goto rtattr_failure;
760 nlh->nlmsg_len = skb->tail - b;
761 return skb->len;
763 nlmsg_failure:
764 rtattr_failure:
765 skb_trim(skb, b - skb->data);
766 return -1;
769 static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
770 u32 clid, struct Qdisc *old, struct Qdisc *new)
772 struct sk_buff *skb;
773 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
775 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
776 if (!skb)
777 return -ENOBUFS;
779 if (old && old->handle) {
780 if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
781 goto err_out;
783 if (new) {
784 if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
785 goto err_out;
788 if (skb->len)
789 return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
791 err_out:
792 kfree_skb(skb);
793 return -EINVAL;
796 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
798 int idx, q_idx;
799 int s_idx, s_q_idx;
800 struct net_device *dev;
801 struct Qdisc *q;
803 s_idx = cb->args[0];
804 s_q_idx = q_idx = cb->args[1];
805 read_lock(&dev_base_lock);
806 for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) {
807 if (idx < s_idx)
808 continue;
809 if (idx > s_idx)
810 s_q_idx = 0;
811 read_lock(&qdisc_tree_lock);
812 for (q = dev->qdisc_list, q_idx = 0; q;
813 q = q->next, q_idx++) {
814 if (q_idx < s_q_idx)
815 continue;
816 if (tc_fill_qdisc(skb, q, 0, NETLINK_CB(cb->skb).pid,
817 cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) {
818 read_unlock(&qdisc_tree_lock);
819 goto done;
822 read_unlock(&qdisc_tree_lock);
825 done:
826 read_unlock(&dev_base_lock);
828 cb->args[0] = idx;
829 cb->args[1] = q_idx;
831 return skb->len;
836 /************************************************
837 * Traffic classes manipulation. *
838 ************************************************/
842 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
844 struct tcmsg *tcm = NLMSG_DATA(n);
845 struct rtattr **tca = arg;
846 struct net_device *dev;
847 struct Qdisc *q = NULL;
848 struct Qdisc_class_ops *cops;
849 unsigned long cl = 0;
850 unsigned long new_cl;
851 u32 pid = tcm->tcm_parent;
852 u32 clid = tcm->tcm_handle;
853 u32 qid = TC_H_MAJ(clid);
854 int err;
856 if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
857 return -ENODEV;
860 parent == TC_H_UNSPEC - unspecified parent.
861 parent == TC_H_ROOT - class is root, which has no parent.
862 parent == X:0 - parent is root class.
863 parent == X:Y - parent is a node in hierarchy.
864 parent == 0:Y - parent is X:Y, where X:0 is qdisc.
866 handle == 0:0 - generate handle from kernel pool.
867 handle == 0:Y - class is X:Y, where X:0 is qdisc.
868 handle == X:Y - clear.
869 handle == X:0 - root class.
872 /* Step 1. Determine qdisc handle X:0 */
874 if (pid != TC_H_ROOT) {
875 u32 qid1 = TC_H_MAJ(pid);
877 if (qid && qid1) {
878 /* If both majors are known, they must be identical. */
879 if (qid != qid1)
880 return -EINVAL;
881 } else if (qid1) {
882 qid = qid1;
883 } else if (qid == 0)
884 qid = dev->qdisc_sleeping->handle;
886 /* Now qid is genuine qdisc handle consistent
887 both with parent and child.
889 TC_H_MAJ(pid) still may be unspecified, complete it now.
891 if (pid)
892 pid = TC_H_MAKE(qid, pid);
893 } else {
894 if (qid == 0)
895 qid = dev->qdisc_sleeping->handle;
898 /* OK. Locate qdisc */
899 if ((q = qdisc_lookup(dev, qid)) == NULL)
900 return -ENOENT;
902 /* An check that it supports classes */
903 cops = q->ops->cl_ops;
904 if (cops == NULL)
905 return -EINVAL;
907 /* Now try to get class */
908 if (clid == 0) {
909 if (pid == TC_H_ROOT)
910 clid = qid;
911 } else
912 clid = TC_H_MAKE(qid, clid);
914 if (clid)
915 cl = cops->get(q, clid);
917 if (cl == 0) {
918 err = -ENOENT;
919 if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
920 goto out;
921 } else {
922 switch (n->nlmsg_type) {
923 case RTM_NEWTCLASS:
924 err = -EEXIST;
925 if (n->nlmsg_flags&NLM_F_EXCL)
926 goto out;
927 break;
928 case RTM_DELTCLASS:
929 err = cops->delete(q, cl);
930 if (err == 0)
931 tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
932 goto out;
933 case RTM_GETTCLASS:
934 err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
935 goto out;
936 default:
937 err = -EINVAL;
938 goto out;
942 new_cl = cl;
943 err = cops->change(q, clid, pid, tca, &new_cl);
944 if (err == 0)
945 tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);
947 out:
948 if (cl)
949 cops->put(q, cl);
951 return err;
955 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
956 unsigned long cl,
957 u32 pid, u32 seq, unsigned flags, int event)
959 struct tcmsg *tcm;
960 struct nlmsghdr *nlh;
961 unsigned char *b = skb->tail;
963 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm));
964 nlh->nlmsg_flags = flags;
965 tcm = NLMSG_DATA(nlh);
966 tcm->tcm_family = AF_UNSPEC;
967 tcm->tcm_ifindex = q->dev ? q->dev->ifindex : 0;
968 tcm->tcm_parent = q->handle;
969 tcm->tcm_handle = q->handle;
970 tcm->tcm_info = 0;
971 RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
972 if (q->ops->cl_ops->dump && q->ops->cl_ops->dump(q, cl, skb, tcm) < 0)
973 goto rtattr_failure;
974 nlh->nlmsg_len = skb->tail - b;
975 return skb->len;
977 nlmsg_failure:
978 rtattr_failure:
979 skb_trim(skb, b - skb->data);
980 return -1;
983 static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
984 struct Qdisc *q, unsigned long cl, int event)
986 struct sk_buff *skb;
987 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
989 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
990 if (!skb)
991 return -ENOBUFS;
993 if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
994 kfree_skb(skb);
995 return -EINVAL;
998 return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1001 struct qdisc_dump_args
1003 struct qdisc_walker w;
1004 struct sk_buff *skb;
1005 struct netlink_callback *cb;
1008 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1010 struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1012 return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
1013 a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1016 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1018 int t;
1019 int s_t;
1020 struct net_device *dev;
1021 struct Qdisc *q;
1022 struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
1023 struct qdisc_dump_args arg;
1025 if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
1026 return 0;
1027 if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
1028 return 0;
1030 s_t = cb->args[0];
1032 read_lock(&qdisc_tree_lock);
1033 for (q=dev->qdisc_list, t=0; q; q = q->next, t++) {
1034 if (t < s_t) continue;
1035 if (!q->ops->cl_ops) continue;
1036 if (tcm->tcm_parent && TC_H_MAJ(tcm->tcm_parent) != q->handle)
1037 continue;
1038 if (t > s_t)
1039 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1040 arg.w.fn = qdisc_class_dump;
1041 arg.skb = skb;
1042 arg.cb = cb;
1043 arg.w.stop = 0;
1044 arg.w.skip = cb->args[1];
1045 arg.w.count = 0;
1046 q->ops->cl_ops->walk(q, &arg.w);
1047 cb->args[1] = arg.w.count;
1048 if (arg.w.stop)
1049 break;
1051 read_unlock(&qdisc_tree_lock);
1053 cb->args[0] = t;
1055 dev_put(dev);
1056 return skb->len;
1058 #endif
1060 int psched_us_per_tick = 1;
1061 int psched_tick_per_us = 1;
1063 #ifdef CONFIG_PROC_FS
1064 static int psched_read_proc(char *buffer, char **start, off_t offset,
1065 int length, int *eof, void *data)
1067 int len;
1069 len = sprintf(buffer, "%08x %08x %08x %08x\n",
1070 psched_tick_per_us, psched_us_per_tick,
1071 1000000, HZ);
1073 len -= offset;
1075 if (len > length)
1076 len = length;
1077 if(len < 0)
1078 len = 0;
1080 *start = buffer + offset;
1081 *eof = 1;
1083 return len;
1085 #endif
1087 #if PSCHED_CLOCK_SOURCE == PSCHED_GETTIMEOFDAY
1088 int psched_tod_diff(int delta_sec, int bound)
1090 int delta;
1092 if (bound <= 1000000 || delta_sec > (0x7FFFFFFF/1000000)-1)
1093 return bound;
1094 delta = delta_sec * 1000000;
1095 if (delta > bound)
1096 delta = bound;
1097 return delta;
1099 #endif
1101 psched_time_t psched_time_base;
1103 #if PSCHED_CLOCK_SOURCE == PSCHED_CPU
1104 psched_tdiff_t psched_clock_per_hz;
1105 int psched_clock_scale;
1106 #endif
1108 #ifdef PSCHED_WATCHER
1109 PSCHED_WATCHER psched_time_mark;
1111 static void psched_tick(unsigned long);
1113 static struct timer_list psched_timer =
1114 { function: psched_tick };
1116 static void psched_tick(unsigned long dummy)
1118 #if PSCHED_CLOCK_SOURCE == PSCHED_CPU
1119 psched_time_t dummy_stamp;
1120 PSCHED_GET_TIME(dummy_stamp);
1121 /* It is OK up to 4GHz cpu */
1122 psched_timer.expires = jiffies + 1*HZ;
1123 #else
1124 unsigned long now = jiffies;
1125 psched_time_base = ((u64)now)<<PSCHED_JSCALE;
1126 psched_time_mark = now;
1127 psched_timer.expires = now + 60*60*HZ;
1128 #endif
1129 add_timer(&psched_timer);
1131 #endif
1133 #if PSCHED_CLOCK_SOURCE == PSCHED_CPU
1134 int __init psched_calibrate_clock(void)
1136 psched_time_t stamp, stamp1;
1137 struct timeval tv, tv1;
1138 psched_tdiff_t delay;
1139 long rdelay;
1140 unsigned long stop;
1142 #ifdef PSCHED_WATCHER
1143 psched_tick(0);
1144 #endif
1145 stop = jiffies + HZ/10;
1146 PSCHED_GET_TIME(stamp);
1147 do_gettimeofday(&tv);
1148 while (time_before(jiffies, stop))
1149 barrier();
1150 PSCHED_GET_TIME(stamp1);
1151 do_gettimeofday(&tv1);
1153 delay = PSCHED_TDIFF(stamp1, stamp);
1154 rdelay = tv1.tv_usec - tv.tv_usec;
1155 rdelay += (tv1.tv_sec - tv.tv_sec)*1000000;
1156 if (rdelay > delay)
1157 return -1;
1158 delay /= rdelay;
1159 psched_tick_per_us = delay;
1160 while ((delay>>=1) != 0)
1161 psched_clock_scale++;
1162 psched_us_per_tick = 1<<psched_clock_scale;
1163 psched_clock_per_hz = (psched_tick_per_us*(1000000/HZ))>>psched_clock_scale;
1164 return 0;
1166 #endif
1168 int __init pktsched_init(void)
1170 #ifdef CONFIG_RTNETLINK
1171 struct rtnetlink_link *link_p;
1172 #endif
1174 #if PSCHED_CLOCK_SOURCE == PSCHED_CPU
1175 if (psched_calibrate_clock() < 0)
1176 return -1;
1177 #elif PSCHED_CLOCK_SOURCE == PSCHED_JIFFIES
1178 psched_tick_per_us = HZ<<PSCHED_JSCALE;
1179 psched_us_per_tick = 1000000;
1180 #ifdef PSCHED_WATCHER
1181 psched_tick(0);
1182 #endif
1183 #endif
1185 #ifdef CONFIG_RTNETLINK
1186 link_p = rtnetlink_links[PF_UNSPEC];
1188 /* Setup rtnetlink links. It is made here to avoid
1189 exporting large number of public symbols.
1192 if (link_p) {
1193 link_p[RTM_NEWQDISC-RTM_BASE].doit = tc_modify_qdisc;
1194 link_p[RTM_DELQDISC-RTM_BASE].doit = tc_get_qdisc;
1195 link_p[RTM_GETQDISC-RTM_BASE].doit = tc_get_qdisc;
1196 link_p[RTM_GETQDISC-RTM_BASE].dumpit = tc_dump_qdisc;
1197 link_p[RTM_NEWTCLASS-RTM_BASE].doit = tc_ctl_tclass;
1198 link_p[RTM_DELTCLASS-RTM_BASE].doit = tc_ctl_tclass;
1199 link_p[RTM_GETTCLASS-RTM_BASE].doit = tc_ctl_tclass;
1200 link_p[RTM_GETTCLASS-RTM_BASE].dumpit = tc_dump_tclass;
1202 #endif
1204 #define INIT_QDISC(name) { \
1205 extern struct Qdisc_ops name##_qdisc_ops; \
1206 register_qdisc(&##name##_qdisc_ops); \
1209 INIT_QDISC(pfifo);
1210 INIT_QDISC(bfifo);
1212 #ifdef CONFIG_NET_SCH_CBQ
1213 INIT_QDISC(cbq);
1214 #endif
1215 #ifdef CONFIG_NET_SCH_CSZ
1216 INIT_QDISC(csz);
1217 #endif
1218 #ifdef CONFIG_NET_SCH_HPFQ
1219 INIT_QDISC(hpfq);
1220 #endif
1221 #ifdef CONFIG_NET_SCH_HFSC
1222 INIT_QDISC(hfsc);
1223 #endif
1224 #ifdef CONFIG_NET_SCH_RED
1225 INIT_QDISC(red);
1226 #endif
1227 #ifdef CONFIG_NET_SCH_GRED
1228 INIT_QDISC(gred);
1229 #endif
1230 #ifdef CONFIG_NET_SCH_INGRESS
1231 INIT_QDISC(ingress);
1232 #endif
1233 #ifdef CONFIG_NET_SCH_DSMARK
1234 INIT_QDISC(dsmark);
1235 #endif
1236 #ifdef CONFIG_NET_SCH_SFQ
1237 INIT_QDISC(sfq);
1238 #endif
1239 #ifdef CONFIG_NET_SCH_TBF
1240 INIT_QDISC(tbf);
1241 #endif
1242 #ifdef CONFIG_NET_SCH_TEQL
1243 teql_init();
1244 #endif
1245 #ifdef CONFIG_NET_SCH_PRIO
1246 INIT_QDISC(prio);
1247 #endif
1248 #ifdef CONFIG_NET_SCH_ATM
1249 INIT_QDISC(atm);
1250 #endif
1251 #ifdef CONFIG_NET_CLS
1252 tc_filter_init();
1253 #endif
1255 #ifdef CONFIG_PROC_FS
1256 create_proc_read_entry("net/psched", 0, 0, psched_read_proc, NULL);
1257 #endif
1259 return 0;