libata: convert the remaining PATA drivers to new init model
[linux-2.6/mini2440.git] / net / sched / sch_api.c
blob8699e7006d8041f8d77e3241a97e9e608686b95f
1 /*
2 * net/sched/sch_api.c Packet scheduler API.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
11 * Fixes:
13 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
18 #include <linux/module.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/string.h>
22 #include <linux/mm.h>
23 #include <linux/socket.h>
24 #include <linux/sockios.h>
25 #include <linux/in.h>
26 #include <linux/errno.h>
27 #include <linux/interrupt.h>
28 #include <linux/netdevice.h>
29 #include <linux/skbuff.h>
30 #include <linux/init.h>
31 #include <linux/proc_fs.h>
32 #include <linux/seq_file.h>
33 #include <linux/kmod.h>
34 #include <linux/list.h>
35 #include <linux/bitops.h>
36 #include <linux/hrtimer.h>
38 #include <net/netlink.h>
39 #include <net/sock.h>
40 #include <net/pkt_sched.h>
42 #include <asm/processor.h>
43 #include <asm/uaccess.h>
44 #include <asm/system.h>
/*
 * Forward declarations of the two rtnetlink notification helpers.
 * Both are defined further down in this file but are called earlier
 * (from the qdisc and class request handlers).
 */
46 static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
47 struct Qdisc *old, struct Qdisc *new);
48 static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
49 struct Qdisc *q, unsigned long cl, int event);
53 Short review.
54 -------------
56 This file consists of two interrelated parts:
58 1. queueing disciplines manager frontend.
59 2. traffic classes manager frontend.
61 Generally, queueing discipline ("qdisc") is a black box,
62 which is able to enqueue packets and to dequeue them (when
63 device is ready to send something) in order and at times
64 determined by algorithm hidden in it.
66 Qdiscs are divided into two categories:
67 - "queues", which have no internal structure visible from outside.
68 - "schedulers", which split all the packets to "traffic classes",
69 using "packet classifiers" (look at cls_api.c)
71 In turn, classes may have child qdiscs (as rule, queues)
72 attached to them etc. etc. etc.
74 The goal of the routines in this file is to translate
75 information supplied by the user in the form of handles
76 into a form more intelligible to the kernel, to perform some sanity
77 checks and the part of the work which is common to all qdiscs,
78 and to provide rtnetlink notifications.
80 All real intelligent work is done inside qdisc modules.
84 Every discipline has two major routines: enqueue and dequeue.
86 ---dequeue
88 dequeue usually returns a skb to send. It is allowed to return NULL,
89 but it does not mean that queue is empty, it just means that
90 discipline does not want to send anything this time.
91 Queue is really empty if q->q.qlen == 0.
92 For complicated disciplines with multiple queues q->q is not
93 real packet queue, but however q->q.qlen must be valid.
95 ---enqueue
97 enqueue returns 0 if the packet was enqueued successfully.
98 If a packet (this one or another one) was dropped, it returns
99 a non-zero error code.
100 NET_XMIT_DROP - this packet dropped
101 Expected action: do not backoff, but wait until queue will clear.
102 NET_XMIT_CN - probably this packet enqueued, but another one dropped.
103 Expected action: backoff or ignore
104 NET_XMIT_POLICED - dropped by police.
105 Expected action: backoff or error to real-time apps.
107 Auxiliary routines:
109 ---requeue
111 requeues a packet that has already been dequeued. It is used for
112 non-standard or just buggy devices, which can defer output even if dev->tbusy=0.
114 ---reset
116 returns qdisc to initial state: purge all buffers, clear all
117 timers, counters (except for statistics) etc.
119 ---init
121 initializes newly created qdisc.
123 ---destroy
125 destroys resources allocated by init and during lifetime of qdisc.
127 ---change
129 changes qdisc parameters.
132 /* Protects the list of registered qdisc modules (qdisc_base). It is a pure SMP lock. */
133 static DEFINE_RWLOCK(qdisc_mod_lock);
136 /************************************************
137 * Queueing disciplines manipulation. *
138 ************************************************/
141 /* Head of the singly linked list (via ->next) of all registered queueing disciplines. */
143 static struct Qdisc_ops *qdisc_base;
145 /* Register/uregister queueing discipline */
147 int register_qdisc(struct Qdisc_ops *qops)
149 struct Qdisc_ops *q, **qp;
150 int rc = -EEXIST;
152 write_lock(&qdisc_mod_lock);
153 for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
154 if (!strcmp(qops->id, q->id))
155 goto out;
157 if (qops->enqueue == NULL)
158 qops->enqueue = noop_qdisc_ops.enqueue;
159 if (qops->requeue == NULL)
160 qops->requeue = noop_qdisc_ops.requeue;
161 if (qops->dequeue == NULL)
162 qops->dequeue = noop_qdisc_ops.dequeue;
164 qops->next = NULL;
165 *qp = qops;
166 rc = 0;
167 out:
168 write_unlock(&qdisc_mod_lock);
169 return rc;
172 int unregister_qdisc(struct Qdisc_ops *qops)
174 struct Qdisc_ops *q, **qp;
175 int err = -ENOENT;
177 write_lock(&qdisc_mod_lock);
178 for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next)
179 if (q == qops)
180 break;
181 if (q) {
182 *qp = q->next;
183 q->next = NULL;
184 err = 0;
186 write_unlock(&qdisc_mod_lock);
187 return err;
190 /* We know handle. Find qdisc among all qdisc's attached to device
191 (root qdisc, all its children, children of children etc.)
194 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
196 struct Qdisc *q;
198 list_for_each_entry(q, &dev->qdisc_list, list) {
199 if (q->handle == handle)
200 return q;
202 return NULL;
205 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
207 unsigned long cl;
208 struct Qdisc *leaf;
209 struct Qdisc_class_ops *cops = p->ops->cl_ops;
211 if (cops == NULL)
212 return NULL;
213 cl = cops->get(p, classid);
215 if (cl == 0)
216 return NULL;
217 leaf = cops->leaf(p, cl);
218 cops->put(p, cl);
219 return leaf;
222 /* Find queueing discipline by name */
224 static struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind)
226 struct Qdisc_ops *q = NULL;
228 if (kind) {
229 read_lock(&qdisc_mod_lock);
230 for (q = qdisc_base; q; q = q->next) {
231 if (rtattr_strcmp(kind, q->id) == 0) {
232 if (!try_module_get(q->owner))
233 q = NULL;
234 break;
237 read_unlock(&qdisc_mod_lock);
239 return q;
/*
 * Head of the singly linked list (via ->next) of rate tables handed out
 * by qdisc_get_rtab() and released by qdisc_put_rtab().
 */
242 static struct qdisc_rate_table *qdisc_rtab_list;
244 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab)
246 struct qdisc_rate_table *rtab;
248 for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
249 if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
250 rtab->refcnt++;
251 return rtab;
255 if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024)
256 return NULL;
258 rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
259 if (rtab) {
260 rtab->rate = *r;
261 rtab->refcnt = 1;
262 memcpy(rtab->data, RTA_DATA(tab), 1024);
263 rtab->next = qdisc_rtab_list;
264 qdisc_rtab_list = rtab;
266 return rtab;
269 void qdisc_put_rtab(struct qdisc_rate_table *tab)
271 struct qdisc_rate_table *rtab, **rtabp;
273 if (!tab || --tab->refcnt)
274 return;
276 for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) {
277 if (rtab == tab) {
278 *rtabp = rtab->next;
279 kfree(rtab);
280 return;
285 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
287 struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
288 timer);
289 struct net_device *dev = wd->qdisc->dev;
291 wd->qdisc->flags &= ~TCQ_F_THROTTLED;
292 smp_wmb();
293 if (spin_trylock(&dev->queue_lock)) {
294 qdisc_run(dev);
295 spin_unlock(&dev->queue_lock);
296 } else
297 netif_schedule(dev);
299 return HRTIMER_NORESTART;
302 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
304 hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
305 wd->timer.function = qdisc_watchdog;
306 wd->qdisc = qdisc;
308 EXPORT_SYMBOL(qdisc_watchdog_init);
310 void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
312 ktime_t time;
314 wd->qdisc->flags |= TCQ_F_THROTTLED;
315 time = ktime_set(0, 0);
316 time = ktime_add_ns(time, PSCHED_US2NS(expires));
317 hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
319 EXPORT_SYMBOL(qdisc_watchdog_schedule);
321 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
323 hrtimer_cancel(&wd->timer);
324 wd->qdisc->flags &= ~TCQ_F_THROTTLED;
326 EXPORT_SYMBOL(qdisc_watchdog_cancel);
328 /* Allocate an unique handle from space managed by kernel */
330 static u32 qdisc_alloc_handle(struct net_device *dev)
332 int i = 0x10000;
333 static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
335 do {
336 autohandle += TC_H_MAKE(0x10000U, 0);
337 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
338 autohandle = TC_H_MAKE(0x80000000U, 0);
339 } while (qdisc_lookup(dev, autohandle) && --i > 0);
341 return i>0 ? autohandle : 0;
344 /* Attach toplevel qdisc to device dev */
346 static struct Qdisc *
347 dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc)
349 struct Qdisc *oqdisc;
351 if (dev->flags & IFF_UP)
352 dev_deactivate(dev);
354 qdisc_lock_tree(dev);
355 if (qdisc && qdisc->flags&TCQ_F_INGRESS) {
356 oqdisc = dev->qdisc_ingress;
357 /* Prune old scheduler */
358 if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) {
359 /* delete */
360 qdisc_reset(oqdisc);
361 dev->qdisc_ingress = NULL;
362 } else { /* new */
363 dev->qdisc_ingress = qdisc;
366 } else {
368 oqdisc = dev->qdisc_sleeping;
370 /* Prune old scheduler */
371 if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
372 qdisc_reset(oqdisc);
374 /* ... and graft new one */
375 if (qdisc == NULL)
376 qdisc = &noop_qdisc;
377 dev->qdisc_sleeping = qdisc;
378 dev->qdisc = &noop_qdisc;
381 qdisc_unlock_tree(dev);
383 if (dev->flags & IFF_UP)
384 dev_activate(dev);
386 return oqdisc;
389 void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
391 struct Qdisc_class_ops *cops;
392 unsigned long cl;
393 u32 parentid;
395 if (n == 0)
396 return;
397 while ((parentid = sch->parent)) {
398 sch = qdisc_lookup(sch->dev, TC_H_MAJ(parentid));
399 cops = sch->ops->cl_ops;
400 if (cops->qlen_notify) {
401 cl = cops->get(sch, parentid);
402 cops->qlen_notify(sch, cl);
403 cops->put(sch, cl);
405 sch->q.qlen -= n;
408 EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
410 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
411 to device "dev".
413 Old qdisc is not destroyed but returned in *old.
416 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
417 u32 classid,
418 struct Qdisc *new, struct Qdisc **old)
420 int err = 0;
421 struct Qdisc *q = *old;
424 if (parent == NULL) {
425 if (q && q->flags&TCQ_F_INGRESS) {
426 *old = dev_graft_qdisc(dev, q);
427 } else {
428 *old = dev_graft_qdisc(dev, new);
430 } else {
431 struct Qdisc_class_ops *cops = parent->ops->cl_ops;
433 err = -EINVAL;
435 if (cops) {
436 unsigned long cl = cops->get(parent, classid);
437 if (cl) {
438 err = cops->graft(parent, cl, new, old);
439 if (new)
440 new->parent = classid;
441 cops->put(parent, cl);
445 return err;
449 Allocate and initialize new qdisc.
451 Parameters are passed via opt.
454 static struct Qdisc *
455 qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
457 int err;
458 struct rtattr *kind = tca[TCA_KIND-1];
459 struct Qdisc *sch;
460 struct Qdisc_ops *ops;
462 ops = qdisc_lookup_ops(kind);
463 #ifdef CONFIG_KMOD
464 if (ops == NULL && kind != NULL) {
465 char name[IFNAMSIZ];
466 if (rtattr_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
467 /* We dropped the RTNL semaphore in order to
468 * perform the module load. So, even if we
469 * succeeded in loading the module we have to
470 * tell the caller to replay the request. We
471 * indicate this using -EAGAIN.
472 * We replay the request because the device may
473 * go away in the mean time.
475 rtnl_unlock();
476 request_module("sch_%s", name);
477 rtnl_lock();
478 ops = qdisc_lookup_ops(kind);
479 if (ops != NULL) {
480 /* We will try again qdisc_lookup_ops,
481 * so don't keep a reference.
483 module_put(ops->owner);
484 err = -EAGAIN;
485 goto err_out;
489 #endif
491 err = -ENOENT;
492 if (ops == NULL)
493 goto err_out;
495 sch = qdisc_alloc(dev, ops);
496 if (IS_ERR(sch)) {
497 err = PTR_ERR(sch);
498 goto err_out2;
501 if (handle == TC_H_INGRESS) {
502 sch->flags |= TCQ_F_INGRESS;
503 sch->stats_lock = &dev->ingress_lock;
504 handle = TC_H_MAKE(TC_H_INGRESS, 0);
505 } else {
506 sch->stats_lock = &dev->queue_lock;
507 if (handle == 0) {
508 handle = qdisc_alloc_handle(dev);
509 err = -ENOMEM;
510 if (handle == 0)
511 goto err_out3;
515 sch->handle = handle;
517 if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
518 #ifdef CONFIG_NET_ESTIMATOR
519 if (tca[TCA_RATE-1]) {
520 err = gen_new_estimator(&sch->bstats, &sch->rate_est,
521 sch->stats_lock,
522 tca[TCA_RATE-1]);
523 if (err) {
525 * Any broken qdiscs that would require
526 * a ops->reset() here? The qdisc was never
527 * in action so it shouldn't be necessary.
529 if (ops->destroy)
530 ops->destroy(sch);
531 goto err_out3;
534 #endif
535 qdisc_lock_tree(dev);
536 list_add_tail(&sch->list, &dev->qdisc_list);
537 qdisc_unlock_tree(dev);
539 return sch;
541 err_out3:
542 dev_put(dev);
543 kfree((char *) sch - sch->padded);
544 err_out2:
545 module_put(ops->owner);
546 err_out:
547 *errp = err;
548 return NULL;
551 static int qdisc_change(struct Qdisc *sch, struct rtattr **tca)
553 if (tca[TCA_OPTIONS-1]) {
554 int err;
556 if (sch->ops->change == NULL)
557 return -EINVAL;
558 err = sch->ops->change(sch, tca[TCA_OPTIONS-1]);
559 if (err)
560 return err;
562 #ifdef CONFIG_NET_ESTIMATOR
563 if (tca[TCA_RATE-1])
564 gen_replace_estimator(&sch->bstats, &sch->rate_est,
565 sch->stats_lock, tca[TCA_RATE-1]);
566 #endif
567 return 0;
570 struct check_loop_arg
572 struct qdisc_walker w;
573 struct Qdisc *p;
574 int depth;
577 static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
579 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
581 struct check_loop_arg arg;
583 if (q->ops->cl_ops == NULL)
584 return 0;
586 arg.w.stop = arg.w.skip = arg.w.count = 0;
587 arg.w.fn = check_loop_fn;
588 arg.depth = depth;
589 arg.p = p;
590 q->ops->cl_ops->walk(q, &arg.w);
591 return arg.w.stop ? -ELOOP : 0;
594 static int
595 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
597 struct Qdisc *leaf;
598 struct Qdisc_class_ops *cops = q->ops->cl_ops;
599 struct check_loop_arg *arg = (struct check_loop_arg *)w;
601 leaf = cops->leaf(q, cl);
602 if (leaf) {
603 if (leaf == arg->p || arg->depth > 7)
604 return -ELOOP;
605 return check_loop(leaf, arg->p, arg->depth + 1);
607 return 0;
611 * Delete/get qdisc.
614 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
616 struct tcmsg *tcm = NLMSG_DATA(n);
617 struct rtattr **tca = arg;
618 struct net_device *dev;
619 u32 clid = tcm->tcm_parent;
620 struct Qdisc *q = NULL;
621 struct Qdisc *p = NULL;
622 int err;
624 if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
625 return -ENODEV;
627 if (clid) {
628 if (clid != TC_H_ROOT) {
629 if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
630 if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
631 return -ENOENT;
632 q = qdisc_leaf(p, clid);
633 } else { /* ingress */
634 q = dev->qdisc_ingress;
636 } else {
637 q = dev->qdisc_sleeping;
639 if (!q)
640 return -ENOENT;
642 if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
643 return -EINVAL;
644 } else {
645 if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
646 return -ENOENT;
649 if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
650 return -EINVAL;
652 if (n->nlmsg_type == RTM_DELQDISC) {
653 if (!clid)
654 return -EINVAL;
655 if (q->handle == 0)
656 return -ENOENT;
657 if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0)
658 return err;
659 if (q) {
660 qdisc_notify(skb, n, clid, q, NULL);
661 qdisc_lock_tree(dev);
662 qdisc_destroy(q);
663 qdisc_unlock_tree(dev);
665 } else {
666 qdisc_notify(skb, n, clid, NULL, q);
668 return 0;
672 Create/change qdisc.
675 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
677 struct tcmsg *tcm;
678 struct rtattr **tca;
679 struct net_device *dev;
680 u32 clid;
681 struct Qdisc *q, *p;
682 int err;
684 replay:
685 /* Reinit, just in case something touches this. */
686 tcm = NLMSG_DATA(n);
687 tca = arg;
688 clid = tcm->tcm_parent;
689 q = p = NULL;
691 if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
692 return -ENODEV;
694 if (clid) {
695 if (clid != TC_H_ROOT) {
696 if (clid != TC_H_INGRESS) {
697 if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
698 return -ENOENT;
699 q = qdisc_leaf(p, clid);
700 } else { /*ingress */
701 q = dev->qdisc_ingress;
703 } else {
704 q = dev->qdisc_sleeping;
707 /* It may be default qdisc, ignore it */
708 if (q && q->handle == 0)
709 q = NULL;
711 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
712 if (tcm->tcm_handle) {
713 if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
714 return -EEXIST;
715 if (TC_H_MIN(tcm->tcm_handle))
716 return -EINVAL;
717 if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
718 goto create_n_graft;
719 if (n->nlmsg_flags&NLM_F_EXCL)
720 return -EEXIST;
721 if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
722 return -EINVAL;
723 if (q == p ||
724 (p && check_loop(q, p, 0)))
725 return -ELOOP;
726 atomic_inc(&q->refcnt);
727 goto graft;
728 } else {
729 if (q == NULL)
730 goto create_n_graft;
732 /* This magic test requires explanation.
734 * We know, that some child q is already
735 * attached to this parent and have choice:
736 * either to change it or to create/graft new one.
738 * 1. We are allowed to create/graft only
739 * if CREATE and REPLACE flags are set.
741 * 2. If EXCL is set, requestor wanted to say,
742 * that qdisc tcm_handle is not expected
743 * to exist, so that we choose create/graft too.
745 * 3. The last case is when no flags are set.
746 * Alas, it is sort of hole in API, we
747 * cannot decide what to do unambiguously.
748 * For now we select create/graft, if
749 * user gave KIND, which does not match existing.
751 if ((n->nlmsg_flags&NLM_F_CREATE) &&
752 (n->nlmsg_flags&NLM_F_REPLACE) &&
753 ((n->nlmsg_flags&NLM_F_EXCL) ||
754 (tca[TCA_KIND-1] &&
755 rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))))
756 goto create_n_graft;
759 } else {
760 if (!tcm->tcm_handle)
761 return -EINVAL;
762 q = qdisc_lookup(dev, tcm->tcm_handle);
765 /* Change qdisc parameters */
766 if (q == NULL)
767 return -ENOENT;
768 if (n->nlmsg_flags&NLM_F_EXCL)
769 return -EEXIST;
770 if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
771 return -EINVAL;
772 err = qdisc_change(q, tca);
773 if (err == 0)
774 qdisc_notify(skb, n, clid, NULL, q);
775 return err;
777 create_n_graft:
778 if (!(n->nlmsg_flags&NLM_F_CREATE))
779 return -ENOENT;
780 if (clid == TC_H_INGRESS)
781 q = qdisc_create(dev, tcm->tcm_parent, tca, &err);
782 else
783 q = qdisc_create(dev, tcm->tcm_handle, tca, &err);
784 if (q == NULL) {
785 if (err == -EAGAIN)
786 goto replay;
787 return err;
790 graft:
791 if (1) {
792 struct Qdisc *old_q = NULL;
793 err = qdisc_graft(dev, p, clid, q, &old_q);
794 if (err) {
795 if (q) {
796 qdisc_lock_tree(dev);
797 qdisc_destroy(q);
798 qdisc_unlock_tree(dev);
800 return err;
802 qdisc_notify(skb, n, clid, old_q, q);
803 if (old_q) {
804 qdisc_lock_tree(dev);
805 qdisc_destroy(old_q);
806 qdisc_unlock_tree(dev);
809 return 0;
812 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
813 u32 pid, u32 seq, u16 flags, int event)
815 struct tcmsg *tcm;
816 struct nlmsghdr *nlh;
817 unsigned char *b = skb_tail_pointer(skb);
818 struct gnet_dump d;
820 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
821 tcm = NLMSG_DATA(nlh);
822 tcm->tcm_family = AF_UNSPEC;
823 tcm->tcm__pad1 = 0;
824 tcm->tcm__pad2 = 0;
825 tcm->tcm_ifindex = q->dev->ifindex;
826 tcm->tcm_parent = clid;
827 tcm->tcm_handle = q->handle;
828 tcm->tcm_info = atomic_read(&q->refcnt);
829 RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
830 if (q->ops->dump && q->ops->dump(q, skb) < 0)
831 goto rtattr_failure;
832 q->qstats.qlen = q->q.qlen;
834 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
835 TCA_XSTATS, q->stats_lock, &d) < 0)
836 goto rtattr_failure;
838 if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
839 goto rtattr_failure;
841 if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
842 #ifdef CONFIG_NET_ESTIMATOR
843 gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
844 #endif
845 gnet_stats_copy_queue(&d, &q->qstats) < 0)
846 goto rtattr_failure;
848 if (gnet_stats_finish_copy(&d) < 0)
849 goto rtattr_failure;
851 nlh->nlmsg_len = skb_tail_pointer(skb) - b;
852 return skb->len;
854 nlmsg_failure:
855 rtattr_failure:
856 nlmsg_trim(skb, b);
857 return -1;
860 static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
861 u32 clid, struct Qdisc *old, struct Qdisc *new)
863 struct sk_buff *skb;
864 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
866 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
867 if (!skb)
868 return -ENOBUFS;
870 if (old && old->handle) {
871 if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
872 goto err_out;
874 if (new) {
875 if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
876 goto err_out;
879 if (skb->len)
880 return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
882 err_out:
883 kfree_skb(skb);
884 return -EINVAL;
887 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
889 int idx, q_idx;
890 int s_idx, s_q_idx;
891 struct net_device *dev;
892 struct Qdisc *q;
894 s_idx = cb->args[0];
895 s_q_idx = q_idx = cb->args[1];
896 read_lock(&dev_base_lock);
897 for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) {
898 if (idx < s_idx)
899 continue;
900 if (idx > s_idx)
901 s_q_idx = 0;
902 q_idx = 0;
903 list_for_each_entry(q, &dev->qdisc_list, list) {
904 if (q_idx < s_q_idx) {
905 q_idx++;
906 continue;
908 if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
909 cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
910 goto done;
911 q_idx++;
915 done:
916 read_unlock(&dev_base_lock);
918 cb->args[0] = idx;
919 cb->args[1] = q_idx;
921 return skb->len;
926 /************************************************
927 * Traffic classes manipulation. *
928 ************************************************/
932 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
934 struct tcmsg *tcm = NLMSG_DATA(n);
935 struct rtattr **tca = arg;
936 struct net_device *dev;
937 struct Qdisc *q = NULL;
938 struct Qdisc_class_ops *cops;
939 unsigned long cl = 0;
940 unsigned long new_cl;
941 u32 pid = tcm->tcm_parent;
942 u32 clid = tcm->tcm_handle;
943 u32 qid = TC_H_MAJ(clid);
944 int err;
946 if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
947 return -ENODEV;
950 parent == TC_H_UNSPEC - unspecified parent.
951 parent == TC_H_ROOT - class is root, which has no parent.
952 parent == X:0 - parent is root class.
953 parent == X:Y - parent is a node in hierarchy.
954 parent == 0:Y - parent is X:Y, where X:0 is qdisc.
956 handle == 0:0 - generate handle from kernel pool.
957 handle == 0:Y - class is X:Y, where X:0 is qdisc.
958 handle == X:Y - clear.
959 handle == X:0 - root class.
962 /* Step 1. Determine qdisc handle X:0 */
964 if (pid != TC_H_ROOT) {
965 u32 qid1 = TC_H_MAJ(pid);
967 if (qid && qid1) {
968 /* If both majors are known, they must be identical. */
969 if (qid != qid1)
970 return -EINVAL;
971 } else if (qid1) {
972 qid = qid1;
973 } else if (qid == 0)
974 qid = dev->qdisc_sleeping->handle;
976 /* Now qid is genuine qdisc handle consistent
977 both with parent and child.
979 TC_H_MAJ(pid) still may be unspecified, complete it now.
981 if (pid)
982 pid = TC_H_MAKE(qid, pid);
983 } else {
984 if (qid == 0)
985 qid = dev->qdisc_sleeping->handle;
988 /* OK. Locate qdisc */
989 if ((q = qdisc_lookup(dev, qid)) == NULL)
990 return -ENOENT;
992 /* An check that it supports classes */
993 cops = q->ops->cl_ops;
994 if (cops == NULL)
995 return -EINVAL;
997 /* Now try to get class */
998 if (clid == 0) {
999 if (pid == TC_H_ROOT)
1000 clid = qid;
1001 } else
1002 clid = TC_H_MAKE(qid, clid);
1004 if (clid)
1005 cl = cops->get(q, clid);
1007 if (cl == 0) {
1008 err = -ENOENT;
1009 if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
1010 goto out;
1011 } else {
1012 switch (n->nlmsg_type) {
1013 case RTM_NEWTCLASS:
1014 err = -EEXIST;
1015 if (n->nlmsg_flags&NLM_F_EXCL)
1016 goto out;
1017 break;
1018 case RTM_DELTCLASS:
1019 err = cops->delete(q, cl);
1020 if (err == 0)
1021 tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
1022 goto out;
1023 case RTM_GETTCLASS:
1024 err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
1025 goto out;
1026 default:
1027 err = -EINVAL;
1028 goto out;
1032 new_cl = cl;
1033 err = cops->change(q, clid, pid, tca, &new_cl);
1034 if (err == 0)
1035 tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);
1037 out:
1038 if (cl)
1039 cops->put(q, cl);
1041 return err;
1045 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1046 unsigned long cl,
1047 u32 pid, u32 seq, u16 flags, int event)
1049 struct tcmsg *tcm;
1050 struct nlmsghdr *nlh;
1051 unsigned char *b = skb_tail_pointer(skb);
1052 struct gnet_dump d;
1053 struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1055 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
1056 tcm = NLMSG_DATA(nlh);
1057 tcm->tcm_family = AF_UNSPEC;
1058 tcm->tcm_ifindex = q->dev->ifindex;
1059 tcm->tcm_parent = q->handle;
1060 tcm->tcm_handle = q->handle;
1061 tcm->tcm_info = 0;
1062 RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
1063 if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1064 goto rtattr_failure;
1066 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
1067 TCA_XSTATS, q->stats_lock, &d) < 0)
1068 goto rtattr_failure;
1070 if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1071 goto rtattr_failure;
1073 if (gnet_stats_finish_copy(&d) < 0)
1074 goto rtattr_failure;
1076 nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1077 return skb->len;
1079 nlmsg_failure:
1080 rtattr_failure:
1081 nlmsg_trim(skb, b);
1082 return -1;
1085 static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
1086 struct Qdisc *q, unsigned long cl, int event)
1088 struct sk_buff *skb;
1089 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1091 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1092 if (!skb)
1093 return -ENOBUFS;
1095 if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
1096 kfree_skb(skb);
1097 return -EINVAL;
1100 return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1103 struct qdisc_dump_args
1105 struct qdisc_walker w;
1106 struct sk_buff *skb;
1107 struct netlink_callback *cb;
1110 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1112 struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1114 return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
1115 a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1118 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1120 int t;
1121 int s_t;
1122 struct net_device *dev;
1123 struct Qdisc *q;
1124 struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
1125 struct qdisc_dump_args arg;
1127 if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
1128 return 0;
1129 if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
1130 return 0;
1132 s_t = cb->args[0];
1133 t = 0;
1135 list_for_each_entry(q, &dev->qdisc_list, list) {
1136 if (t < s_t || !q->ops->cl_ops ||
1137 (tcm->tcm_parent &&
1138 TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1139 t++;
1140 continue;
1142 if (t > s_t)
1143 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1144 arg.w.fn = qdisc_class_dump;
1145 arg.skb = skb;
1146 arg.cb = cb;
1147 arg.w.stop = 0;
1148 arg.w.skip = cb->args[1];
1149 arg.w.count = 0;
1150 q->ops->cl_ops->walk(q, &arg.w);
1151 cb->args[1] = arg.w.count;
1152 if (arg.w.stop)
1153 break;
1154 t++;
1157 cb->args[0] = t;
1159 dev_put(dev);
1160 return skb->len;
1163 /* Main classifier routine: scans classifier chain attached
1164 to this qdisc, (optionally) tests for protocol and asks
1165 specific classifiers.
1167 int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
1168 struct tcf_result *res)
1170 int err = 0;
1171 __be16 protocol = skb->protocol;
1172 #ifdef CONFIG_NET_CLS_ACT
1173 struct tcf_proto *otp = tp;
1174 reclassify:
1175 #endif
1176 protocol = skb->protocol;
1178 for ( ; tp; tp = tp->next) {
1179 if ((tp->protocol == protocol ||
1180 tp->protocol == htons(ETH_P_ALL)) &&
1181 (err = tp->classify(skb, tp, res)) >= 0) {
1182 #ifdef CONFIG_NET_CLS_ACT
1183 if ( TC_ACT_RECLASSIFY == err) {
1184 __u32 verd = (__u32) G_TC_VERD(skb->tc_verd);
1185 tp = otp;
1187 if (MAX_REC_LOOP < verd++) {
1188 printk("rule prio %d protocol %02x reclassify is buggy packet dropped\n",
1189 tp->prio&0xffff, ntohs(tp->protocol));
1190 return TC_ACT_SHOT;
1192 skb->tc_verd = SET_TC_VERD(skb->tc_verd,verd);
1193 goto reclassify;
1194 } else {
1195 if (skb->tc_verd)
1196 skb->tc_verd = SET_TC_VERD(skb->tc_verd,0);
1197 return err;
1199 #else
1201 return err;
1202 #endif
1206 return -1;
1209 void tcf_destroy(struct tcf_proto *tp)
1211 tp->ops->destroy(tp);
1212 module_put(tp->ops->owner);
1213 kfree(tp);
1216 void tcf_destroy_chain(struct tcf_proto *fl)
1218 struct tcf_proto *tp;
1220 while ((tp = fl) != NULL) {
1221 fl = tp->next;
1222 tcf_destroy(tp);
1225 EXPORT_SYMBOL(tcf_destroy_chain);
1227 #ifdef CONFIG_PROC_FS
1228 static int psched_show(struct seq_file *seq, void *v)
1230 seq_printf(seq, "%08x %08x %08x %08x\n",
1231 (u32)NSEC_PER_USEC, (u32)PSCHED_US2NS(1),
1232 1000000,
1233 (u32)NSEC_PER_SEC/(u32)ktime_to_ns(KTIME_MONOTONIC_RES));
1235 return 0;
1238 static int psched_open(struct inode *inode, struct file *file)
1240 return single_open(file, psched_show, PDE(inode)->data);
1243 static const struct file_operations psched_fops = {
1244 .owner = THIS_MODULE,
1245 .open = psched_open,
1246 .read = seq_read,
1247 .llseek = seq_lseek,
1248 .release = single_release,
1250 #endif
1252 static int __init pktsched_init(void)
1254 register_qdisc(&pfifo_qdisc_ops);
1255 register_qdisc(&bfifo_qdisc_ops);
1256 proc_net_fops_create("psched", 0, &psched_fops);
1258 rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
1259 rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL);
1260 rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc);
1261 rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL);
1262 rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL);
1263 rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass);
1265 return 0;
/* Run pktsched_init() at subsystem initcall level. */
1268 subsys_initcall(pktsched_init);
/* Symbols exported to other kernel code (the remaining exports sit next to their definitions). */
1270 EXPORT_SYMBOL(qdisc_get_rtab);
1271 EXPORT_SYMBOL(qdisc_put_rtab);
1272 EXPORT_SYMBOL(register_qdisc);
1273 EXPORT_SYMBOL(unregister_qdisc);
1274 EXPORT_SYMBOL(tc_classify);