/*
 * net/sched/sch_api.c  Packet scheduler API.
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 */
#include <linux/config.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/list.h>

#include <net/sock.h>
#include <net/pkt_sched.h>

#include <asm/processor.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
                        struct Qdisc *old, struct Qdisc *new);
static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
                         struct Qdisc *q, unsigned long cl, int event);
/*

   Short review.
   -------------

   This file consists of two interrelated parts:

   1. The queueing discipline manager frontend.
   2. The traffic class manager frontend.

   Generally, a queueing discipline ("qdisc") is a black box which is
   able to enqueue packets and to dequeue them (when the device is
   ready to send something) in the order and at the times determined
   by the algorithm hidden inside it.

   qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all the packets into "traffic classes",
     using "packet classifiers" (see cls_api.c)

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them etc. etc. etc.

   The goal of the routines in this file is to translate the
   information supplied by the user in the form of handles into a
   form more intelligible to the kernel, to do the sanity checks and
   the parts of the work that are common to all qdiscs, and to
   provide rtnetlink notifications.

   All the real intelligent work is done inside the qdisc modules.



   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns a skb to send. It is allowed to return NULL,
   but that does not mean the queue is empty; it just means that the
   discipline does not want to send anything at this time.
   The queue is really empty only if q->q.qlen == 0.
   For complicated disciplines with multiple queues, q->q is not the
   real packet queue, but q->q.qlen must nevertheless be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code:
   NET_XMIT_DROP     - this packet was dropped.
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN       - this packet was probably enqueued, but another one was dropped.
     Expected action: back off or ignore.
   NET_XMIT_POLICED  - dropped by the policer.
     Expected action: back off or report an error to real-time apps.

   Auxiliary routines:

   ---requeue

   requeues a packet that was once dequeued. It is used for non-standard
   or just buggy devices, which can defer output even if dev->tbusy == 0.

   ---reset

   returns the qdisc to its initial state: purge all buffers, clear all
   timers, counters (except statistics), etc.

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys resources allocated by init and during the lifetime of the qdisc.

   ---change

   changes qdisc parameters.
 */
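/* Editor's illustration (not part of the original file): a minimal,
 * hypothetical discipline showing how the hooks described above map
 * onto struct Qdisc_ops in the 2.6.9 API.  Guarded with #if 0 so it
 * is never compiled; names prefixed "example_" are invented.
 */
#if 0
static int example_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
        __skb_queue_tail(&sch->q, skb);         /* keeps q->q.qlen valid */
        sch->stats.bytes += skb->len;
        sch->stats.packets++;
        return NET_XMIT_SUCCESS;                /* 0: enqueued OK */
}

static struct sk_buff *example_dequeue(struct Qdisc *sch)
{
        return __skb_dequeue(&sch->q);          /* NULL only when qlen == 0 */
}

static int example_requeue(struct sk_buff *skb, struct Qdisc *sch)
{
        __skb_queue_head(&sch->q, skb);         /* put a dequeued packet back */
        return 0;
}

static struct Qdisc_ops example_qdisc_ops = {
        .id             = "example",
        .priv_size      = 0,
        .enqueue        = example_enqueue,
        .dequeue        = example_dequeue,
        .requeue        = example_requeue,
        .owner          = THIS_MODULE,
};
#endif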
/* Protects the list of registered TC modules. It is a pure SMP lock. */
static rwlock_t qdisc_mod_lock = RW_LOCK_UNLOCKED;
/************************************************
 *      Queueing disciplines manipulation.      *
 ************************************************/


/* The list of all installed queueing disciplines. */

static struct Qdisc_ops *qdisc_base;
/* Register/unregister queueing discipline */

int register_qdisc(struct Qdisc_ops *qops)
{
        struct Qdisc_ops *q, **qp;
        int rc = -EEXIST;

        write_lock(&qdisc_mod_lock);
        for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
                if (!strcmp(qops->id, q->id))
                        goto out;

        if (qops->enqueue == NULL)
                qops->enqueue = noop_qdisc_ops.enqueue;
        if (qops->requeue == NULL)
                qops->requeue = noop_qdisc_ops.requeue;
        if (qops->dequeue == NULL)
                qops->dequeue = noop_qdisc_ops.dequeue;

        qops->next = NULL;
        *qp = qops;
        rc = 0;
out:
        write_unlock(&qdisc_mod_lock);
        return rc;
}

int unregister_qdisc(struct Qdisc_ops *qops)
{
        struct Qdisc_ops *q, **qp;
        int err = -ENOENT;

        write_lock(&qdisc_mod_lock);
        for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
                if (q == qops)
                        break;
        if (q) {
                *qp = q->next;
                q->next = NULL;
                err = 0;
        }
        write_unlock(&qdisc_mod_lock);
        return err;
}
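/* Editor's illustration (hypothetical): a scheduler module would
 * typically pair these calls in its init/exit hooks, e.g. with the
 * example_qdisc_ops sketched above.
 */
#if 0
static int __init example_module_init(void)
{
        return register_qdisc(&example_qdisc_ops);
}

static void __exit example_module_exit(void)
{
        unregister_qdisc(&example_qdisc_ops);
}

module_init(example_module_init);
module_exit(example_module_exit);
#endif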
/* We know the handle. Find the qdisc among all the qdiscs attached to
   the device (the root qdisc, all its children, children of children etc.)
 */

struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
        struct Qdisc *q;

        list_for_each_entry(q, &dev->qdisc_list, list) {
                if (q->handle == handle)
                        return q;
        }
        return NULL;
}
struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
        unsigned long cl;
        struct Qdisc *leaf;
        struct Qdisc_class_ops *cops = p->ops->cl_ops;

        if (cops == NULL)
                return NULL;
        cl = cops->get(p, classid);

        if (cl == 0)
                return NULL;
        leaf = cops->leaf(p, cl);
        cops->put(p, cl);
        return leaf;
}
/* Find a queueing discipline by name */

struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind)
{
        struct Qdisc_ops *q = NULL;

        if (kind) {
                read_lock(&qdisc_mod_lock);
                for (q = qdisc_base; q; q = q->next) {
                        if (rtattr_strcmp(kind, q->id) == 0)
                                break;
                }
                read_unlock(&qdisc_mod_lock);
        }
        return q;
}
static struct qdisc_rate_table *qdisc_rtab_list;

struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab)
{
        struct qdisc_rate_table *rtab;

        for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
                if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
                        rtab->refcnt++;
                        return rtab;
                }
        }

        if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024)
                return NULL;

        rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
        if (rtab) {
                rtab->rate = *r;
                rtab->refcnt = 1;
                memcpy(rtab->data, RTA_DATA(tab), 1024);
                rtab->next = qdisc_rtab_list;
                qdisc_rtab_list = rtab;
        }
        return rtab;
}

void qdisc_put_rtab(struct qdisc_rate_table *tab)
{
        struct qdisc_rate_table *rtab, **rtabp;

        if (!tab || --tab->refcnt)
                return;

        for (rtabp = &qdisc_rtab_list; (rtab = *rtabp) != NULL; rtabp = &rtab->next) {
                if (rtab == tab) {
                        *rtabp = rtab->next;
                        kfree(rtab);
                        return;
                }
        }
}
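/* Editor's note (illustrative): the 1024-byte table supplied by
 * userspace is an array of 256 u32 slots; rate-based schedulers look
 * up rtab->data[pktlen >> cell_log] as the precomputed time needed to
 * transmit a packet of that size.
 */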
/* Allocate a unique handle from the space managed by the kernel */

u32 qdisc_alloc_handle(struct net_device *dev)
{
        int i = 0x10000;
        static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

        do {
                autohandle += TC_H_MAKE(0x10000U, 0);
                if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
                        autohandle = TC_H_MAKE(0x80000000U, 0);
        } while (qdisc_lookup(dev, autohandle) && --i > 0);

        return i > 0 ? autohandle : 0;
}
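/* Editor's worked example: a handle packs major:minor into a u32 with
 * the major in the upper 16 bits.  The pool above therefore yields
 * 8001:0 (0x80010000), 8002:0 (0x80020000), ... and wraps around
 * before reaching ffff:, which would collide with TC_H_ROOT.
 */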
/* Attach the toplevel qdisc to the device dev */

static struct Qdisc *
dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc)
{
        struct Qdisc *oqdisc;

        if (dev->flags & IFF_UP)
                dev_deactivate(dev);

        qdisc_lock_tree(dev);
        if (qdisc && qdisc->flags&TCQ_F_INGRES) {
                oqdisc = dev->qdisc_ingress;
                /* Prune old scheduler */
                if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) {
                        /* delete */
                        qdisc_reset(oqdisc);
                        dev->qdisc_ingress = NULL;
                } else {  /* new */
                        dev->qdisc_ingress = qdisc;
                }

        } else {

                oqdisc = dev->qdisc_sleeping;

                /* Prune old scheduler */
                if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
                        qdisc_reset(oqdisc);

                /* ... and graft new one */
                if (qdisc == NULL)
                        qdisc = &noop_qdisc;
                dev->qdisc_sleeping = qdisc;
                dev->qdisc = &noop_qdisc;
        }

        qdisc_unlock_tree(dev);

        if (dev->flags & IFF_UP)
                dev_activate(dev);

        return oqdisc;
}
/* Graft qdisc "new" to class "classid" of qdisc "parent" or
   to device "dev".

   The old qdisc is not destroyed but returned in *old.
 */

int qdisc_graft(struct net_device *dev, struct Qdisc *parent, u32 classid,
                struct Qdisc *new, struct Qdisc **old)
{
        int err = 0;
        struct Qdisc *q = *old;


        if (parent == NULL) {
                if (q && q->flags&TCQ_F_INGRES) {
                        *old = dev_graft_qdisc(dev, q);
                } else {
                        *old = dev_graft_qdisc(dev, new);
                }
        } else {
                struct Qdisc_class_ops *cops = parent->ops->cl_ops;

                err = -EINVAL;

                if (cops) {
                        unsigned long cl = cops->get(parent, classid);
                        if (cl) {
                                err = cops->graft(parent, cl, new, old);
                                if (new)
                                        new->parent = classid;
                                cops->put(parent, cl);
                        }
                }
        }
        return err;
}
/*
   Allocate and initialize a new qdisc.

   Parameters are passed via opt.
 */

static struct Qdisc *
qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
{
        int err;
        struct rtattr *kind = tca[TCA_KIND-1];
        void *p = NULL;
        struct Qdisc *sch;
        struct Qdisc_ops *ops;
        int size;

        ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_KMOD
        if (ops == NULL && tca[TCA_KIND-1] != NULL) {
                if (RTA_PAYLOAD(kind) <= IFNAMSIZ) {
                        request_module("sch_%s", (char*)RTA_DATA(kind));
                        ops = qdisc_lookup_ops(kind);
                }
        }
#endif

        err = -EINVAL;
        if (ops == NULL)
                goto err_out;
        err = -EBUSY;
        if (!try_module_get(ops->owner))
                goto err_out;

        /* ensure that the Qdisc and the private data are 32-byte aligned */
        size = ((sizeof(*sch) + QDISC_ALIGN_CONST) & ~QDISC_ALIGN_CONST);
        size += ops->priv_size + QDISC_ALIGN_CONST;

        p = kmalloc(size, GFP_KERNEL);
        err = -ENOBUFS;
        if (!p)
                goto err_out2;
        memset(p, 0, size);
        sch = (struct Qdisc *)(((unsigned long)p + QDISC_ALIGN_CONST)
                               & ~QDISC_ALIGN_CONST);
        sch->padded = (char *)sch - (char *)p;

        INIT_LIST_HEAD(&sch->list);
        skb_queue_head_init(&sch->q);

        if (handle == TC_H_INGRESS)
                sch->flags |= TCQ_F_INGRES;

        sch->ops = ops;
        sch->enqueue = ops->enqueue;
        sch->dequeue = ops->dequeue;
        sch->dev = dev;
        dev_hold(dev);
        atomic_set(&sch->refcnt, 1);
        sch->stats_lock = &dev->queue_lock;
        if (handle == 0) {
                handle = qdisc_alloc_handle(dev);
                err = -ENOMEM;
                if (handle == 0)
                        goto err_out3;
        }

        if (handle == TC_H_INGRESS)
                sch->handle = TC_H_MAKE(TC_H_INGRESS, 0);
        else
                sch->handle = handle;

        /* enqueue is accessed locklessly - make sure it's visible
         * before we set a netdevice's qdisc pointer to sch */
        smp_wmb();
        if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
                qdisc_lock_tree(dev);
                list_add_tail(&sch->list, &dev->qdisc_list);
                qdisc_unlock_tree(dev);

#ifdef CONFIG_NET_ESTIMATOR
                if (tca[TCA_RATE-1])
                        qdisc_new_estimator(&sch->stats, sch->stats_lock,
                                            tca[TCA_RATE-1]);
#endif
                return sch;
        }

err_out3:
        dev_put(dev);
err_out2:
        module_put(ops->owner);
err_out:
        *errp = err;
        if (p)
                kfree(p);
        return NULL;
}
static int qdisc_change(struct Qdisc *sch, struct rtattr **tca)
{
        if (tca[TCA_OPTIONS-1]) {
                int err;

                if (sch->ops->change == NULL)
                        return -EINVAL;
                err = sch->ops->change(sch, tca[TCA_OPTIONS-1]);
                if (err)
                        return err;
        }
#ifdef CONFIG_NET_ESTIMATOR
        if (tca[TCA_RATE-1]) {
                qdisc_kill_estimator(&sch->stats);
                qdisc_new_estimator(&sch->stats, sch->stats_lock,
                                    tca[TCA_RATE-1]);
        }
#endif
        return 0;
}
struct check_loop_arg
{
        struct qdisc_walker     w;
        struct Qdisc            *p;
        int                     depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);

static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
        struct check_loop_arg arg;

        if (q->ops->cl_ops == NULL)
                return 0;

        arg.w.stop = arg.w.skip = arg.w.count = 0;
        arg.w.fn = check_loop_fn;
        arg.depth = depth;
        arg.p = p;
        q->ops->cl_ops->walk(q, &arg.w);
        return arg.w.stop ? -ELOOP : 0;
}

static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
        struct Qdisc *leaf;
        struct Qdisc_class_ops *cops = q->ops->cl_ops;
        struct check_loop_arg *arg = (struct check_loop_arg *)w;

        leaf = cops->leaf(q, cl);
        if (leaf) {
                if (leaf == arg->p || arg->depth > 7)
                        return -ELOOP;
                return check_loop(leaf, arg->p, arg->depth + 1);
        }
        return 0;
}
/*
 * Delete/get qdisc.
 */

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
        struct tcmsg *tcm = NLMSG_DATA(n);
        struct rtattr **tca = arg;
        struct net_device *dev;
        u32 clid = tcm->tcm_parent;
        struct Qdisc *q = NULL;
        struct Qdisc *p = NULL;
        int err;

        if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
                return -ENODEV;

        if (clid) {
                if (clid != TC_H_ROOT) {
                        if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
                                if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
                                        return -ENOENT;
                                q = qdisc_leaf(p, clid);
                        } else { /* ingress */
                                q = dev->qdisc_ingress;
                        }
                } else {
                        q = dev->qdisc_sleeping;
                }
                if (!q)
                        return -ENOENT;

                if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
                        return -EINVAL;
        } else {
                if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
                        return -ENOENT;
        }

        if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
                return -EINVAL;

        if (n->nlmsg_type == RTM_DELQDISC) {
                if (!clid)
                        return -EINVAL;
                if (q->handle == 0)
                        return -ENOENT;
                if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0)
                        return err;
                if (q) {
                        qdisc_notify(skb, n, clid, q, NULL);
                        spin_lock_bh(&dev->queue_lock);
                        qdisc_destroy(q);
                        spin_unlock_bh(&dev->queue_lock);
                }
        } else {
                qdisc_notify(skb, n, clid, NULL, q);
        }
        return 0;
}
/*
   Create/change qdisc.
 */

static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
        struct tcmsg *tcm = NLMSG_DATA(n);
        struct rtattr **tca = arg;
        struct net_device *dev;
        u32 clid = tcm->tcm_parent;
        struct Qdisc *q = NULL;
        struct Qdisc *p = NULL;
        int err;

        if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
                return -ENODEV;

        if (clid) {
                if (clid != TC_H_ROOT) {
                        if (clid != TC_H_INGRESS) {
                                if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
                                        return -ENOENT;
                                q = qdisc_leaf(p, clid);
                        } else { /* ingress */
                                q = dev->qdisc_ingress;
                        }
                } else {
                        q = dev->qdisc_sleeping;
                }

                /* It may be the default qdisc; ignore it */
                if (q && q->handle == 0)
                        q = NULL;

                if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
                        if (tcm->tcm_handle) {
                                if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
                                        return -EEXIST;
                                if (TC_H_MIN(tcm->tcm_handle))
                                        return -EINVAL;
                                if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
                                        goto create_n_graft;
                                if (n->nlmsg_flags&NLM_F_EXCL)
                                        return -EEXIST;
                                if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
                                        return -EINVAL;
                                if (q == p ||
                                    (p && check_loop(q, p, 0)))
                                        return -ELOOP;
                                atomic_inc(&q->refcnt);
                                goto graft;
                        } else {
                                if (q == NULL)
                                        goto create_n_graft;

                                /* This magic test requires explanation.
                                 *
                                 * We know that some child q is already
                                 * attached to this parent and have a choice:
                                 * either to change it or to create/graft a
                                 * new one.
                                 *
                                 * 1. We are allowed to create/graft only
                                 * if both the CREATE and REPLACE flags are set.
                                 *
                                 * 2. If EXCL is set, the requestor meant that
                                 * the qdisc tcm_handle is not expected
                                 * to exist, so we choose create/graft too.
                                 *
                                 * 3. The last case is when no flags are set.
                                 * Alas, it is sort of a hole in the API; we
                                 * cannot decide what to do unambiguously.
                                 * For now we select create/graft if the
                                 * user gave a KIND which does not match the
                                 * existing one.
                                 */
                                if ((n->nlmsg_flags&NLM_F_CREATE) &&
                                    (n->nlmsg_flags&NLM_F_REPLACE) &&
                                    ((n->nlmsg_flags&NLM_F_EXCL) ||
                                     (tca[TCA_KIND-1] &&
                                      rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))))
                                        goto create_n_graft;
                        }
                }
        } else {
                if (!tcm->tcm_handle)
                        return -EINVAL;
                q = qdisc_lookup(dev, tcm->tcm_handle);
        }

        /* Change qdisc parameters */
        if (q == NULL)
                return -ENOENT;
        if (n->nlmsg_flags&NLM_F_EXCL)
                return -EEXIST;
        if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
                return -EINVAL;
        err = qdisc_change(q, tca);
        if (err == 0)
                qdisc_notify(skb, n, clid, NULL, q);
        return err;

create_n_graft:
        if (!(n->nlmsg_flags&NLM_F_CREATE))
                return -ENOENT;
        if (clid == TC_H_INGRESS)
                q = qdisc_create(dev, tcm->tcm_parent, tca, &err);
        else
                q = qdisc_create(dev, tcm->tcm_handle, tca, &err);
        if (q == NULL)
                return err;

graft:
        if (1) {
                struct Qdisc *old_q = NULL;
                err = qdisc_graft(dev, p, clid, q, &old_q);
                if (err) {
                        if (q) {
                                spin_lock_bh(&dev->queue_lock);
                                qdisc_destroy(q);
                                spin_unlock_bh(&dev->queue_lock);
                        }
                        return err;
                }
                qdisc_notify(skb, n, clid, old_q, q);
                if (old_q) {
                        spin_lock_bh(&dev->queue_lock);
                        qdisc_destroy(old_q);
                        spin_unlock_bh(&dev->queue_lock);
                }
        }
        return 0;
}
int qdisc_copy_stats(struct sk_buff *skb, struct tc_stats *st, spinlock_t *lock)
{
        spin_lock_bh(lock);
        RTA_PUT(skb, TCA_STATS, sizeof(struct tc_stats), st);
        spin_unlock_bh(lock);
        return 0;

rtattr_failure:
        spin_unlock_bh(lock);
        return -1;
}
static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
                         u32 pid, u32 seq, unsigned flags, int event)
{
        struct tcmsg *tcm;
        struct nlmsghdr *nlh;
        unsigned char *b = skb->tail;

        nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm));
        nlh->nlmsg_flags = flags;
        tcm = NLMSG_DATA(nlh);
        tcm->tcm_family = AF_UNSPEC;
        tcm->tcm_ifindex = q->dev->ifindex;
        tcm->tcm_parent = clid;
        tcm->tcm_handle = q->handle;
        tcm->tcm_info = atomic_read(&q->refcnt);
        RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
        if (q->ops->dump && q->ops->dump(q, skb) < 0)
                goto rtattr_failure;
        q->stats.qlen = q->q.qlen;
        if (qdisc_copy_stats(skb, &q->stats, q->stats_lock))
                goto rtattr_failure;
        nlh->nlmsg_len = skb->tail - b;
        return skb->len;

nlmsg_failure:
rtattr_failure:
        skb_trim(skb, b - skb->data);
        return -1;
}
static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
                        u32 clid, struct Qdisc *old, struct Qdisc *new)
{
        struct sk_buff *skb;
        u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
        if (!skb)
                return -ENOBUFS;

        if (old && old->handle) {
                if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
                        goto err_out;
        }
        if (new) {
                if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
                        goto err_out;
        }

        if (skb->len)
                return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO);

err_out:
        kfree_skb(skb);
        return -EINVAL;
}
static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
        int idx, q_idx;
        int s_idx, s_q_idx;
        struct net_device *dev;
        struct Qdisc *q;

        s_idx = cb->args[0];
        s_q_idx = q_idx = cb->args[1];
        read_lock(&dev_base_lock);
        for (dev = dev_base, idx = 0; dev; dev = dev->next, idx++) {
                if (idx < s_idx)
                        continue;
                if (idx > s_idx)
                        s_q_idx = 0;
                read_lock_bh(&qdisc_tree_lock);
                q_idx = 0;
                list_for_each_entry(q, &dev->qdisc_list, list) {
                        if (q_idx < s_q_idx) {
                                q_idx++;
                                continue;
                        }
                        if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
                                          cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) {
                                read_unlock_bh(&qdisc_tree_lock);
                                goto done;
                        }
                        q_idx++;
                }
                read_unlock_bh(&qdisc_tree_lock);
        }

done:
        read_unlock(&dev_base_lock);

        cb->args[0] = idx;
        cb->args[1] = q_idx;

        return skb->len;
}
/************************************************
 *      Traffic classes manipulation.           *
 ************************************************/
static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
        struct tcmsg *tcm = NLMSG_DATA(n);
        struct rtattr **tca = arg;
        struct net_device *dev;
        struct Qdisc *q = NULL;
        struct Qdisc_class_ops *cops;
        unsigned long cl = 0;
        unsigned long new_cl;
        u32 pid = tcm->tcm_parent;
        u32 clid = tcm->tcm_handle;
        u32 qid = TC_H_MAJ(clid);
        int err;

        if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
                return -ENODEV;

        /*
           parent == TC_H_UNSPEC - unspecified parent.
           parent == TC_H_ROOT   - class is root, which has no parent.
           parent == X:0         - parent is the root class.
           parent == X:Y         - parent is a node in the hierarchy.
           parent == 0:Y         - parent is X:Y, where X:0 is the qdisc.

           handle == 0:0         - generate a handle from the kernel pool.
           handle == 0:Y         - class is X:Y, where X:0 is the qdisc.
           handle == X:Y         - clear: class is X:Y.
           handle == X:0         - root class.
         */
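        /*
           Editor's worked example: with the root qdisc at 1:0
           (qid == 0x00010000), a request carrying parent == 0:2 and
           handle == 0:3 is completed below to pid == 0x00010002 and
           clid == 0x00010003, i.e. class 1:3 whose parent is class 1:2.
         */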
        /* Step 1. Determine qdisc handle X:0 */

        if (pid != TC_H_ROOT) {
                u32 qid1 = TC_H_MAJ(pid);

                if (qid && qid1) {
                        /* If both majors are known, they must be identical. */
                        if (qid != qid1)
                                return -EINVAL;
                } else if (qid1) {
                        qid = qid1;
                } else if (qid == 0)
                        qid = dev->qdisc_sleeping->handle;

                /* Now qid is a genuine qdisc handle consistent with
                   both parent and child.

                   TC_H_MAJ(pid) still may be unspecified; complete it now.
                 */
                if (pid)
                        pid = TC_H_MAKE(qid, pid);
        } else {
                if (qid == 0)
                        qid = dev->qdisc_sleeping->handle;
        }

        /* OK. Locate qdisc */
        if ((q = qdisc_lookup(dev, qid)) == NULL)
                return -ENOENT;

        /* And check that it supports classes */
        cops = q->ops->cl_ops;
        if (cops == NULL)
                return -EINVAL;

        /* Now try to get class */
        if (clid == 0) {
                if (pid == TC_H_ROOT)
                        clid = qid;
        } else
                clid = TC_H_MAKE(qid, clid);

        if (clid)
                cl = cops->get(q, clid);

        if (cl == 0) {
                err = -ENOENT;
                if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
                        goto out;
        } else {
                switch (n->nlmsg_type) {
                case RTM_NEWTCLASS:
                        err = -EEXIST;
                        if (n->nlmsg_flags&NLM_F_EXCL)
                                goto out;
                        break;
                case RTM_DELTCLASS:
                        err = cops->delete(q, cl);
                        if (err == 0)
                                tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
                        goto out;
                case RTM_GETTCLASS:
                        err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
                        goto out;
                default:
                        err = -EINVAL;
                        goto out;
                }
        }

        new_cl = cl;
        err = cops->change(q, clid, pid, tca, &new_cl);
        if (err == 0)
                tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);

out:
        if (cl)
                cops->put(q, cl);

        return err;
}
static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
                          unsigned long cl,
                          u32 pid, u32 seq, unsigned flags, int event)
{
        struct tcmsg *tcm;
        struct nlmsghdr *nlh;
        unsigned char *b = skb->tail;

        nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm));
        nlh->nlmsg_flags = flags;
        tcm = NLMSG_DATA(nlh);
        tcm->tcm_family = AF_UNSPEC;
        tcm->tcm_ifindex = q->dev->ifindex;
        tcm->tcm_parent = q->handle;
        tcm->tcm_handle = q->handle;
        tcm->tcm_info = 0;
        RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
        if (q->ops->cl_ops->dump && q->ops->cl_ops->dump(q, cl, skb, tcm) < 0)
                goto rtattr_failure;
        nlh->nlmsg_len = skb->tail - b;
        return skb->len;

nlmsg_failure:
rtattr_failure:
        skb_trim(skb, b - skb->data);
        return -1;
}
static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
                         struct Qdisc *q, unsigned long cl, int event)
{
        struct sk_buff *skb;
        u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
        if (!skb)
                return -ENOBUFS;

        if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
                kfree_skb(skb);
                return -EINVAL;
        }

        return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
}
struct qdisc_dump_args
{
        struct qdisc_walker w;
        struct sk_buff *skb;
        struct netlink_callback *cb;
};

static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
{
        struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;

        return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
                              a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
}
static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
        int t;
        int s_t;
        struct net_device *dev;
        struct Qdisc *q;
        struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
        struct qdisc_dump_args arg;

        if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
                return 0;
        if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
                return 0;

        s_t = cb->args[0];
        t = 0;

        read_lock_bh(&qdisc_tree_lock);
        list_for_each_entry(q, &dev->qdisc_list, list) {
                if (t < s_t || !q->ops->cl_ops ||
                    (tcm->tcm_parent &&
                     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
                        t++;
                        continue;
                }
                if (t > s_t)
                        memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
                arg.w.fn = qdisc_class_dump;
                arg.skb = skb;
                arg.cb = cb;
                arg.w.stop = 0;
                arg.w.skip = cb->args[1];
                arg.w.count = 0;
                q->ops->cl_ops->walk(q, &arg.w);
                cb->args[1] = arg.w.count;
                if (arg.w.stop)
                        break;
                t++;
        }
        read_unlock_bh(&qdisc_tree_lock);

        cb->args[0] = t;

        dev_put(dev);
        return skb->len;
}
int psched_us_per_tick = 1;
int psched_tick_per_us = 1;

#ifdef CONFIG_PROC_FS
static int psched_show(struct seq_file *seq, void *v)
{
        seq_printf(seq, "%08x %08x %08x %08x\n",
                   psched_tick_per_us, psched_us_per_tick,
                   1000000, HZ);

        return 0;
}

static int psched_open(struct inode *inode, struct file *file)
{
        return single_open(file, psched_show, PDE(inode)->data);
}

static struct file_operations psched_fops = {
        .owner   = THIS_MODULE,
        .open    = psched_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = single_release,
};
#endif
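/* Editor's note (illustrative figures): /proc/net/psched prints four
 * hex words - tick_per_us, us_per_tick, 1000000 and HZ.  With the
 * defaults above and HZ=1000, a read would yield
 * "00000001 00000001 000f4240 000003e8".
 */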
#ifdef CONFIG_NET_SCH_CLK_GETTIMEOFDAY
int psched_tod_diff(int delta_sec, int bound)
{
        int delta;

        if (bound <= 1000000 || delta_sec > (0x7FFFFFFF/1000000)-1)
                return bound;
        delta = delta_sec * 1000000;
        if (delta > bound)
                delta = bound;
        return delta;
}
EXPORT_SYMBOL(psched_tod_diff);
#endif
#ifdef CONFIG_NET_SCH_CLK_CPU
psched_tdiff_t psched_clock_per_hz;
int psched_clock_scale;
EXPORT_SYMBOL(psched_clock_per_hz);
EXPORT_SYMBOL(psched_clock_scale);

psched_time_t psched_time_base;
cycles_t psched_time_mark;
EXPORT_SYMBOL(psched_time_mark);
EXPORT_SYMBOL(psched_time_base);

/*
 * Periodically adjust psched_time_base to avoid overflow
 * with 32-bit get_cycles(). Safe up to 4GHz CPU.
 */
static void psched_tick(unsigned long);
static struct timer_list psched_timer = TIMER_INITIALIZER(psched_tick, 0, 0);

static void psched_tick(unsigned long dummy)
{
        if (sizeof(cycles_t) == sizeof(u32)) {
                psched_time_t dummy_stamp;
                PSCHED_GET_TIME(dummy_stamp);
                psched_timer.expires = jiffies + 1*HZ;
                add_timer(&psched_timer);
        }
}
int __init psched_calibrate_clock(void)
{
        psched_time_t stamp, stamp1;
        struct timeval tv, tv1;
        psched_tdiff_t delay;
        long rdelay;
        unsigned long stop;

        psched_tick(0);
        stop = jiffies + HZ/10;
        PSCHED_GET_TIME(stamp);
        do_gettimeofday(&tv);
        while (time_before(jiffies, stop)) {
                barrier();
                cpu_relax();
        }
        PSCHED_GET_TIME(stamp1);
        do_gettimeofday(&tv1);

        delay = PSCHED_TDIFF(stamp1, stamp);
        rdelay = tv1.tv_usec - tv.tv_usec;
        rdelay += (tv1.tv_sec - tv.tv_sec)*1000000;
        if (rdelay > delay)
                return -1;
        delay /= rdelay;
        psched_tick_per_us = delay;
        while ((delay >>= 1) != 0)
                psched_clock_scale++;
        psched_us_per_tick = 1 << psched_clock_scale;
        psched_clock_per_hz = (psched_tick_per_us * (1000000/HZ)) >> psched_clock_scale;
        return 0;
}
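/* Editor's worked example (illustrative figures): on a 1 GHz TSC with
 * HZ=1000, the ~100 ms busy-wait above sees delay ~= 10^8 cycles and
 * rdelay ~= 10^5 us, so psched_tick_per_us = 1000.  Halving 1000 down
 * to zero takes 9 shifts, giving psched_clock_scale = 9,
 * psched_us_per_tick = 512 and
 * psched_clock_per_hz = (1000 * 1000) >> 9 = 1953.
 */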
#endif
static int __init pktsched_init(void)
{
        struct rtnetlink_link *link_p;

#ifdef CONFIG_NET_SCH_CLK_CPU
        if (psched_calibrate_clock() < 0)
                return -1;
#elif defined(CONFIG_NET_SCH_CLK_JIFFIES)
        psched_tick_per_us = HZ<<PSCHED_JSCALE;
        psched_us_per_tick = 1000000;
#endif

        link_p = rtnetlink_links[PF_UNSPEC];

        /* Set up the rtnetlink links. It is done here to avoid
           exporting a large number of public symbols.
         */

        if (link_p) {
                link_p[RTM_NEWQDISC-RTM_BASE].doit = tc_modify_qdisc;
                link_p[RTM_DELQDISC-RTM_BASE].doit = tc_get_qdisc;
                link_p[RTM_GETQDISC-RTM_BASE].doit = tc_get_qdisc;
                link_p[RTM_GETQDISC-RTM_BASE].dumpit = tc_dump_qdisc;
                link_p[RTM_NEWTCLASS-RTM_BASE].doit = tc_ctl_tclass;
                link_p[RTM_DELTCLASS-RTM_BASE].doit = tc_ctl_tclass;
                link_p[RTM_GETTCLASS-RTM_BASE].doit = tc_ctl_tclass;
                link_p[RTM_GETTCLASS-RTM_BASE].dumpit = tc_dump_tclass;
        }

        register_qdisc(&pfifo_qdisc_ops);
        register_qdisc(&bfifo_qdisc_ops);
        proc_net_fops_create("psched", 0, &psched_fops);

        return 0;
}

subsys_initcall(pktsched_init);
EXPORT_SYMBOL(qdisc_copy_stats);
EXPORT_SYMBOL(qdisc_get_rtab);
EXPORT_SYMBOL(qdisc_put_rtab);
EXPORT_SYMBOL(register_qdisc);
EXPORT_SYMBOL(unregister_qdisc);