/*
 * net/sched/sch_api.c  Packet scheduler API.
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 */
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/hrtimer.h>

#include <net/net_namespace.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
                        struct Qdisc *old, struct Qdisc *new);
static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
                         struct Qdisc *q, unsigned long cl, int event);
/*

   This file consists of two interrelated parts:

   1. queueing disciplines manager frontend.
   2. traffic classes manager frontend.

   Generally, a queueing discipline ("qdisc") is a black box,
   which is able to enqueue packets and to dequeue them (when
   the device is ready to send something) in the order and at the times
   determined by the algorithm hidden inside it.

   qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all packets into "traffic classes",
     using "packet classifiers" (see cls_api.c)

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them etc. etc. etc.

   The goal of the routines in this file is to translate
   the information supplied by the user in the form of handles
   into a form more intelligible to the kernel, to make some sanity
   checks and the part of the work which is common to all qdiscs,
   and to provide rtnetlink notifications.

   All real intelligent work is done inside qdisc modules.



   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns a skb to send. It is allowed to return NULL,
   but it does not mean that the queue is empty, it just means that
   the discipline does not want to send anything this time.
   The queue is really empty if q->q.qlen == 0.
   For complicated disciplines with multiple queues q->q is not
   the real packet queue, but q->q.qlen must nevertheless be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code:
   NET_XMIT_DROP        - this packet was dropped.
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN          - this packet was probably enqueued, but another one was dropped.
     Expected action: back off or ignore.
   NET_XMIT_POLICED     - dropped by police.
     Expected action: back off or report an error to real-time apps.

   Auxiliary routines:

   ---requeue

   requeues a once-dequeued packet. It is used for non-standard or
   just buggy devices, which can defer output even when dev->tbusy=0.

   ---reset

   returns the qdisc to its initial state: purges all buffers, clears all
   timers, counters (except for statistics) etc.

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys resources allocated by init and during the lifetime of the qdisc.

   ---change

   changes qdisc parameters.
 */
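/*
 * Illustrative only (not part of the original file): the skeleton a
 * minimal classless qdisc module would register against this API.
 * "foo" and its callbacks are hypothetical names; real examples live
 * in sch_fifo.c and friends.
 */
#if 0   /* example sketch, never compiled */
static int foo_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
        /* Must return 0 on success or NET_XMIT_DROP/CN/POLICED. */
        return qdisc_enqueue_tail(skb, sch);
}

static struct sk_buff *foo_dequeue(struct Qdisc *sch)
{
        /* May return NULL without the queue being empty (throttling);
         * emptiness is judged by sch->q.qlen == 0. */
        return qdisc_dequeue_head(sch);
}

static struct Qdisc_ops foo_qdisc_ops = {
        .id             = "foo",
        .priv_size      = 0,
        .enqueue        = foo_enqueue,
        .dequeue        = foo_dequeue,
        .owner          = THIS_MODULE,
};
/* A module init function would then call register_qdisc(&foo_qdisc_ops). */
#endif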
/* Protects list of registered TC modules. It is a pure SMP lock. */
static DEFINE_RWLOCK(qdisc_mod_lock);
/************************************************
 *      Queueing disciplines manipulation.      *
 ************************************************/


/* The list of all installed queueing disciplines. */

static struct Qdisc_ops *qdisc_base;
/* Register/unregister queueing discipline */

int register_qdisc(struct Qdisc_ops *qops)
{
        struct Qdisc_ops *q, **qp;
        int rc = -EEXIST;

        write_lock(&qdisc_mod_lock);
        for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
                if (!strcmp(qops->id, q->id))
                        goto out;

        if (qops->enqueue == NULL)
                qops->enqueue = noop_qdisc_ops.enqueue;
        if (qops->requeue == NULL)
                qops->requeue = noop_qdisc_ops.requeue;
        if (qops->dequeue == NULL)
                qops->dequeue = noop_qdisc_ops.dequeue;

        qops->next = NULL;
        *qp = qops;
        rc = 0;
out:
        write_unlock(&qdisc_mod_lock);
        return rc;
}
int unregister_qdisc(struct Qdisc_ops *qops)
{
        struct Qdisc_ops *q, **qp;
        int err = -ENOENT;

        write_lock(&qdisc_mod_lock);
        for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
                if (q == qops)
                        break;
        if (q) {
                *qp = q->next;
                q->next = NULL;
                err = 0;
        }
        write_unlock(&qdisc_mod_lock);
        return err;
}
/* We know the handle. Find the qdisc among all qdiscs attached to the device
   (root qdisc, all its children, children of children etc.)
 */

struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
        struct Qdisc *q;

        list_for_each_entry(q, &dev->qdisc_list, list) {
                if (q->handle == handle)
                        return q;
        }
        return NULL;
}
static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
        unsigned long cl;
        struct Qdisc *leaf;
        const struct Qdisc_class_ops *cops = p->ops->cl_ops;

        if (cops == NULL)
                return NULL;
        cl = cops->get(p, classid);

        if (cl == 0)
                return NULL;
        leaf = cops->leaf(p, cl);
        cops->put(p, cl);
        return leaf;
}
/* Find queueing discipline by name */

static struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind)
{
        struct Qdisc_ops *q = NULL;

        if (kind) {
                read_lock(&qdisc_mod_lock);
                for (q = qdisc_base; q; q = q->next) {
                        if (rtattr_strcmp(kind, q->id) == 0) {
                                if (!try_module_get(q->owner))
                                        q = NULL;
                                break;
                        }
                }
                read_unlock(&qdisc_mod_lock);
        }
        return q;
}
static struct qdisc_rate_table *qdisc_rtab_list;

struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab)
{
        struct qdisc_rate_table *rtab;

        for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
                if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
                        rtab->refcnt++;
                        return rtab;
                }
        }

        if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024)
                return NULL;

        rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
        if (rtab) {
                rtab->rate = *r;
                rtab->refcnt = 1;
                memcpy(rtab->data, RTA_DATA(tab), 1024);
                rtab->next = qdisc_rtab_list;
                qdisc_rtab_list = rtab;
        }
        return rtab;
}
void qdisc_put_rtab(struct qdisc_rate_table *tab)
{
        struct qdisc_rate_table *rtab, **rtabp;

        if (!tab || --tab->refcnt)
                return;

        for (rtabp = &qdisc_rtab_list; (rtab = *rtabp) != NULL; rtabp = &rtab->next) {
                if (rtab == tab) {
                        *rtabp = rtab->next;
                        kfree(rtab);
                        return;
                }
        }
}
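/*
 * Illustrative only (not part of the original file): a typical caller
 * pairs qdisc_get_rtab() with qdisc_put_rtab() around the lifetime of
 * its rate table.  TCA_FOO_RTAB below is a hypothetical stand-in for a
 * real qdisc's rate-table attribute (TBF and HTB pass one alongside
 * their tc_ratespec).
 */
#if 0   /* example sketch, never compiled */
static int foo_set_rate(struct Qdisc *sch, struct rtattr **tb,
                        struct tc_ratespec *rate)
{
        struct qdisc_rate_table *rtab;

        /* Look up (or build) the shared size->transmission-time table. */
        rtab = qdisc_get_rtab(rate, tb[TCA_FOO_RTAB - 1]);
        if (rtab == NULL)
                return -EINVAL;

        /* ... use rtab->data[] to convert packet sizes to times ... */

        /* Drop the reference; the table is freed once unused. */
        qdisc_put_rtab(rtab);
        return 0;
}
#endif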
static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
{
        struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
                                                 timer);
        struct net_device *dev = wd->qdisc->dev;

        wd->qdisc->flags &= ~TCQ_F_THROTTLED;
        smp_wmb();
        netif_schedule(dev);

        return HRTIMER_NORESTART;
}
void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{
        hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
        wd->timer.function = qdisc_watchdog;
        wd->qdisc = qdisc;
}
EXPORT_SYMBOL(qdisc_watchdog_init);
void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
{
        ktime_t time;

        wd->qdisc->flags |= TCQ_F_THROTTLED;
        time = ktime_set(0, 0);
        time = ktime_add_ns(time, PSCHED_US2NS(expires));
        hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
}
EXPORT_SYMBOL(qdisc_watchdog_schedule);
void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
        hrtimer_cancel(&wd->timer);
        wd->qdisc->flags &= ~TCQ_F_THROTTLED;
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
/* Allocate a unique handle from the space managed by the kernel */

static u32 qdisc_alloc_handle(struct net_device *dev)
{
        int i = 0x10000;
        static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

        do {
                autohandle += TC_H_MAKE(0x10000U, 0);
                if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
                        autohandle = TC_H_MAKE(0x80000000U, 0);
        } while (qdisc_lookup(dev, autohandle) && --i > 0);

        return i > 0 ? autohandle : 0;
}
/* Attach toplevel qdisc to device dev */

static struct Qdisc *
dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc)
{
        struct Qdisc *oqdisc;

        if (dev->flags & IFF_UP)
                dev_deactivate(dev);

        qdisc_lock_tree(dev);
        if (qdisc && qdisc->flags & TCQ_F_INGRESS) {
                oqdisc = dev->qdisc_ingress;
                /* Prune old scheduler */
                if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) {
                        /* delete */
                        qdisc_reset(oqdisc);
                        dev->qdisc_ingress = NULL;
                } else {  /* new */
                        dev->qdisc_ingress = qdisc;
                }
        } else {
                oqdisc = dev->qdisc_sleeping;

                /* Prune old scheduler */
                if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
                        qdisc_reset(oqdisc);

                /* ... and graft new one */
                if (qdisc == NULL)
                        qdisc = &noop_qdisc;
                dev->qdisc_sleeping = qdisc;
                dev->qdisc = &noop_qdisc;
        }

        qdisc_unlock_tree(dev);

        if (dev->flags & IFF_UP)
                dev_activate(dev);

        return oqdisc;
}
void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
{
        const struct Qdisc_class_ops *cops;
        unsigned long cl;
        u32 parentid;

        if (n == 0)
                return;
        while ((parentid = sch->parent)) {
                sch = qdisc_lookup(sch->dev, TC_H_MAJ(parentid));
                if (sch == NULL) {
                        WARN_ON(parentid != TC_H_ROOT);
                        return;
                }
                cops = sch->ops->cl_ops;
                if (cops->qlen_notify) {
                        cl = cops->get(sch, parentid);
                        cops->qlen_notify(sch, cl);
                        cops->put(sch, cl);
                }
                sch->q.qlen -= n;
        }
}
EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
/* Graft qdisc "new" to class "classid" of qdisc "parent" or
   to device "dev".

   The old qdisc is not destroyed but returned in *old.
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
                       u32 classid,
                       struct Qdisc *new, struct Qdisc **old)
{
        int err = 0;
        struct Qdisc *q = *old;

        if (parent == NULL) {
                if (q && q->flags & TCQ_F_INGRESS) {
                        *old = dev_graft_qdisc(dev, q);
                } else {
                        *old = dev_graft_qdisc(dev, new);
                }
        } else {
                const struct Qdisc_class_ops *cops = parent->ops->cl_ops;

                err = -EINVAL;

                if (cops) {
                        unsigned long cl = cops->get(parent, classid);
                        if (cl) {
                                err = cops->graft(parent, cl, new, old);
                                cops->put(parent, cl);
                        }
                }
        }
        return err;
}
/*
   Allocate and initialize a new qdisc.

   Parameters are passed via opt.
 */

static struct Qdisc *
qdisc_create(struct net_device *dev, u32 parent, u32 handle,
             struct rtattr **tca, int *errp)
{
        int err;
        struct rtattr *kind = tca[TCA_KIND-1];
        struct Qdisc *sch;
        struct Qdisc_ops *ops;

        ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_KMOD
        if (ops == NULL && kind != NULL) {
                char name[IFNAMSIZ];
                if (rtattr_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
                        /* We dropped the RTNL semaphore in order to
                         * perform the module load.  So, even if we
                         * succeeded in loading the module we have to
                         * tell the caller to replay the request.  We
                         * indicate this using -EAGAIN.
                         * We replay the request because the device may
                         * go away in the mean time.
                         */
                        rtnl_unlock();
                        request_module("sch_%s", name);
                        rtnl_lock();
                        ops = qdisc_lookup_ops(kind);
                        if (ops != NULL) {
                                /* We will try again qdisc_lookup_ops,
                                 * so don't keep a reference.
                                 */
                                module_put(ops->owner);
                                err = -EAGAIN;
                                goto err_out;
                        }
                }
        }
#endif

        err = -ENOENT;
        if (ops == NULL)
                goto err_out;

        sch = qdisc_alloc(dev, ops);
        if (IS_ERR(sch)) {
                err = PTR_ERR(sch);
                goto err_out2;
        }

        sch->parent = parent;

        if (handle == TC_H_INGRESS) {
                sch->flags |= TCQ_F_INGRESS;
                sch->stats_lock = &dev->ingress_lock;
                handle = TC_H_MAKE(TC_H_INGRESS, 0);
        } else {
                sch->stats_lock = &dev->queue_lock;
                if (handle == 0) {
                        handle = qdisc_alloc_handle(dev);
                        err = -ENOMEM;
                        if (handle == 0)
                                goto err_out3;
                }
        }

        sch->handle = handle;

        if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
                if (tca[TCA_RATE-1]) {
                        err = gen_new_estimator(&sch->bstats, &sch->rate_est,
                                                sch->stats_lock,
                                                tca[TCA_RATE-1]);
                        if (err) {
                                /*
                                 * Any broken qdiscs that would require
                                 * a ops->reset() here? The qdisc was never
                                 * in action so it shouldn't be necessary.
                                 */
                                if (ops->destroy)
                                        ops->destroy(sch);
                                goto err_out3;
                        }
                }
                qdisc_lock_tree(dev);
                list_add_tail(&sch->list, &dev->qdisc_list);
                qdisc_unlock_tree(dev);

                return sch;
        }
err_out3:
        dev_put(dev);
        kfree((char *) sch - sch->padded);
err_out2:
        module_put(ops->owner);
err_out:
        *errp = err;
        return NULL;
}
static int qdisc_change(struct Qdisc *sch, struct rtattr **tca)
{
        if (tca[TCA_OPTIONS-1]) {
                int err;

                if (sch->ops->change == NULL)
                        return -EINVAL;
                err = sch->ops->change(sch, tca[TCA_OPTIONS-1]);
                if (err)
                        return err;
        }
        if (tca[TCA_RATE-1])
                gen_replace_estimator(&sch->bstats, &sch->rate_est,
                                      sch->stats_lock, tca[TCA_RATE-1]);
        return 0;
}
struct check_loop_arg
{
        struct qdisc_walker     w;
        struct Qdisc            *p;
        int                     depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);

static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
        struct check_loop_arg arg;

        if (q->ops->cl_ops == NULL)
                return 0;

        arg.w.stop = arg.w.skip = arg.w.count = 0;
        arg.w.fn = check_loop_fn;
        arg.depth = depth;
        arg.p = p;
        q->ops->cl_ops->walk(q, &arg.w);
        return arg.w.stop ? -ELOOP : 0;
}
static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
        struct Qdisc *leaf;
        const struct Qdisc_class_ops *cops = q->ops->cl_ops;
        struct check_loop_arg *arg = (struct check_loop_arg *)w;

        leaf = cops->leaf(q, cl);
        if (leaf) {
                if (leaf == arg->p || arg->depth > 7)
                        return -ELOOP;
                return check_loop(leaf, arg->p, arg->depth + 1);
        }
        return 0;
}
static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
        struct net *net = skb->sk->sk_net;
        struct tcmsg *tcm = NLMSG_DATA(n);
        struct rtattr **tca = arg;
        struct net_device *dev;
        u32 clid = tcm->tcm_parent;
        struct Qdisc *q = NULL;
        struct Qdisc *p = NULL;
        int err;

        if (net != &init_net)
                return -EINVAL;

        if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
                return -ENODEV;

        if (clid) {
                if (clid != TC_H_ROOT) {
                        if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
                                if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
                                        return -ENOENT;
                                q = qdisc_leaf(p, clid);
                        } else { /* ingress */
                                q = dev->qdisc_ingress;
                        }
                } else {
                        q = dev->qdisc_sleeping;
                }
                if (!q)
                        return -ENOENT;

                if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
                        return -EINVAL;
        } else {
                if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
                        return -ENOENT;
        }

        if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
                return -EINVAL;

        if (n->nlmsg_type == RTM_DELQDISC) {
                if (!clid)
                        return -EINVAL;
                if (q->handle == 0)
                        return -ENOENT;
                if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0)
                        return err;
                if (q) {
                        qdisc_notify(skb, n, clid, q, NULL);
                        qdisc_lock_tree(dev);
                        qdisc_destroy(q);
                        qdisc_unlock_tree(dev);
                }
        } else {
                qdisc_notify(skb, n, clid, NULL, q);
        }
        return 0;
}
static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
        struct net *net = skb->sk->sk_net;
        struct tcmsg *tcm;
        struct rtattr **tca;
        struct net_device *dev;
        u32 clid;
        struct Qdisc *q, *p;
        int err;

        if (net != &init_net)
                return -EINVAL;

replay:
        /* Reinit, just in case something touches this. */
        tcm = NLMSG_DATA(n);
        tca = arg;
        clid = tcm->tcm_parent;
        q = p = NULL;

        if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
                return -ENODEV;

        if (clid) {
                if (clid != TC_H_ROOT) {
                        if (clid != TC_H_INGRESS) {
                                if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
                                        return -ENOENT;
                                q = qdisc_leaf(p, clid);
                        } else { /* ingress */
                                q = dev->qdisc_ingress;
                        }
                } else {
                        q = dev->qdisc_sleeping;
                }

                /* It may be default qdisc, ignore it */
                if (q && q->handle == 0)
                        q = NULL;

                if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
                        if (tcm->tcm_handle) {
                                if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
                                        return -EEXIST;
                                if (TC_H_MIN(tcm->tcm_handle))
                                        return -EINVAL;
                                if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
                                        goto create_n_graft;
                                if (n->nlmsg_flags&NLM_F_EXCL)
                                        return -EEXIST;
                                if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
                                        return -EINVAL;
                                if (q == p ||
                                    (p && check_loop(q, p, 0)))
                                        return -ELOOP;
                                atomic_inc(&q->refcnt);
                                goto graft;
                        } else {
                                if (q == NULL)
                                        goto create_n_graft;

                                /* This magic test requires explanation.
                                 *
                                 *   We know that some child q is already
                                 *   attached to this parent and have a choice:
                                 *   either to change it or to create/graft a new one.
                                 *
                                 *   1. We are allowed to create/graft only
                                 *   if both the CREATE and REPLACE flags are set.
                                 *
                                 *   2. If EXCL is set, the requestor wanted to say
                                 *   that qdisc tcm_handle is not expected
                                 *   to exist, so we choose create/graft too.
                                 *
                                 *   3. The last case is when no flags are set.
                                 *   Alas, it is sort of a hole in the API: we
                                 *   cannot decide what to do unambiguously.
                                 *   For now we select create/graft if the
                                 *   user gave a KIND which does not match the existing one.
                                 */
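                                /*
                                 * Illustrative summary (not in the original
                                 * source) of what the test below selects for
                                 * an existing child q:
                                 *   CREATE|REPLACE|EXCL           -> create/graft
                                 *   CREATE|REPLACE, KIND differs  -> create/graft
                                 *   anything else                 -> fall through and
                                 *                                    change q in place
                                 */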
                                if ((n->nlmsg_flags&NLM_F_CREATE) &&
                                    (n->nlmsg_flags&NLM_F_REPLACE) &&
                                    ((n->nlmsg_flags&NLM_F_EXCL) ||
                                     (tca[TCA_KIND-1] &&
                                      rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))))
                                        goto create_n_graft;
                        }
                }
        } else {
                if (!tcm->tcm_handle)
                        return -EINVAL;
                q = qdisc_lookup(dev, tcm->tcm_handle);
        }

        /* Change qdisc parameters */
        if (q == NULL)
                return -ENOENT;
        if (n->nlmsg_flags&NLM_F_EXCL)
                return -EEXIST;
        if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
                return -EINVAL;
        err = qdisc_change(q, tca);
        if (err == 0)
                qdisc_notify(skb, n, clid, NULL, q);
        return err;

create_n_graft:
        if (!(n->nlmsg_flags&NLM_F_CREATE))
                return -ENOENT;
        if (clid == TC_H_INGRESS)
                q = qdisc_create(dev, tcm->tcm_parent, tcm->tcm_parent,
                                 tca, &err);
        else
                q = qdisc_create(dev, tcm->tcm_parent, tcm->tcm_handle,
                                 tca, &err);
        if (q == NULL) {
                if (err == -EAGAIN)
                        goto replay;
                return err;
        }

graft:
        if (1) {
                struct Qdisc *old_q = NULL;

                err = qdisc_graft(dev, p, clid, q, &old_q);
                if (err) {
                        if (q) {
                                qdisc_lock_tree(dev);
                                qdisc_destroy(q);
                                qdisc_unlock_tree(dev);
                        }
                        return err;
                }
                qdisc_notify(skb, n, clid, old_q, q);
                if (old_q) {
                        qdisc_lock_tree(dev);
                        qdisc_destroy(old_q);
                        qdisc_unlock_tree(dev);
                }
        }
        return 0;
}
static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
                         u32 pid, u32 seq, u16 flags, int event)
{
        struct tcmsg *tcm;
        struct nlmsghdr *nlh;
        unsigned char *b = skb_tail_pointer(skb);
        struct gnet_dump d;

        nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
        tcm = NLMSG_DATA(nlh);
        tcm->tcm_family = AF_UNSPEC;
        tcm->tcm__pad1 = 0;
        tcm->tcm__pad2 = 0;
        tcm->tcm_ifindex = q->dev->ifindex;
        tcm->tcm_parent = clid;
        tcm->tcm_handle = q->handle;
        tcm->tcm_info = atomic_read(&q->refcnt);
        RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
        if (q->ops->dump && q->ops->dump(q, skb) < 0)
                goto rtattr_failure;
        q->qstats.qlen = q->q.qlen;

        if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
                                         TCA_XSTATS, q->stats_lock, &d) < 0)
                goto rtattr_failure;

        if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
                goto rtattr_failure;

        if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
            gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
            gnet_stats_copy_queue(&d, &q->qstats) < 0)
                goto rtattr_failure;

        if (gnet_stats_finish_copy(&d) < 0)
                goto rtattr_failure;

        nlh->nlmsg_len = skb_tail_pointer(skb) - b;
        return skb->len;

nlmsg_failure:
rtattr_failure:
        nlmsg_trim(skb, b);
        return -1;
}
static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
                        u32 clid, struct Qdisc *old, struct Qdisc *new)
{
        struct sk_buff *skb;
        u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
        if (!skb)
                return -ENOBUFS;

        if (old && old->handle) {
                if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
                        goto err_out;
        }
        if (new) {
                if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
                        goto err_out;
        }

        if (skb->len)
                return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);

err_out:
        kfree_skb(skb);
        return -EINVAL;
}
static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct net *net = skb->sk->sk_net;
        int idx, q_idx;
        int s_idx, s_q_idx;
        struct net_device *dev;
        struct Qdisc *q;

        if (net != &init_net)
                return 0;

        s_idx = cb->args[0];
        s_q_idx = q_idx = cb->args[1];
        read_lock(&dev_base_lock);
        idx = 0;
        for_each_netdev(&init_net, dev) {
                if (idx < s_idx)
                        goto cont;
                if (idx > s_idx)
                        s_q_idx = 0;
                q_idx = 0;
                list_for_each_entry(q, &dev->qdisc_list, list) {
                        if (q_idx < s_q_idx) {
                                q_idx++;
                                continue;
                        }
                        if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
                                          cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
                                goto done;
                        q_idx++;
                }
cont:
                idx++;
        }

done:
        read_unlock(&dev_base_lock);

        cb->args[0] = idx;
        cb->args[1] = q_idx;

        return skb->len;
}
/************************************************
 *      Traffic classes manipulation.           *
 ************************************************/
static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
        struct net *net = skb->sk->sk_net;
        struct tcmsg *tcm = NLMSG_DATA(n);
        struct rtattr **tca = arg;
        struct net_device *dev;
        struct Qdisc *q = NULL;
        const struct Qdisc_class_ops *cops;
        unsigned long cl = 0;
        unsigned long new_cl;
        u32 pid = tcm->tcm_parent;
        u32 clid = tcm->tcm_handle;
        u32 qid = TC_H_MAJ(clid);
        int err;

        if (net != &init_net)
                return -EINVAL;

        if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
                return -ENODEV;

        /*
           parent == TC_H_UNSPEC - unspecified parent.
           parent == TC_H_ROOT   - class is root, which has no parent.
           parent == X:0         - parent is root class.
           parent == X:Y         - parent is a node in hierarchy.
           parent == 0:Y         - parent is X:Y, where X:0 is qdisc.

           handle == 0:0         - generate handle from kernel pool.
           handle == 0:Y         - class is X:Y, where X:0 is qdisc.
           handle == X:Y         - clear.
           handle == X:0         - root class.
         */
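        /*
         * Illustrative only (not in the original source): handles are
         * 32-bit "major:minor" pairs manipulated with the TC_H_* macros,
         * e.g.:
         *
         *      u32 h = TC_H_MAKE(0x10000, 2);  // "1:2"
         *      TC_H_MAJ(h) == 0x10000          // qdisc part, "1:0"
         *      TC_H_MIN(h) == 2                // class part
         */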
        /* Step 1. Determine qdisc handle X:0 */

        if (pid != TC_H_ROOT) {
                u32 qid1 = TC_H_MAJ(pid);

                if (qid && qid1) {
                        /* If both majors are known, they must be identical. */
                        if (qid != qid1)
                                return -EINVAL;
                } else if (qid1) {
                        qid = qid1;
                } else if (qid == 0)
                        qid = dev->qdisc_sleeping->handle;

                /* Now qid is a genuine qdisc handle consistent
                   both with parent and child.

                   TC_H_MAJ(pid) may still be unspecified, complete it now.
                 */
                if (pid)
                        pid = TC_H_MAKE(qid, pid);
        } else {
                if (qid == 0)
                        qid = dev->qdisc_sleeping->handle;
        }

        /* OK. Locate qdisc */
        if ((q = qdisc_lookup(dev, qid)) == NULL)
                return -ENOENT;

        /* And check that it supports classes */
        cops = q->ops->cl_ops;
        if (cops == NULL)
                return -EINVAL;

        /* Now try to get class */
        if (clid == 0) {
                if (pid == TC_H_ROOT)
                        clid = qid;
        } else
                clid = TC_H_MAKE(qid, clid);

        if (clid)
                cl = cops->get(q, clid);

        if (cl == 0) {
                err = -ENOENT;
                if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
                        goto out;
        } else {
                switch (n->nlmsg_type) {
                case RTM_NEWTCLASS:
                        err = -EEXIST;
                        if (n->nlmsg_flags&NLM_F_EXCL)
                                goto out;
                        break;
                case RTM_DELTCLASS:
                        err = cops->delete(q, cl);
                        if (err == 0)
                                tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
                        goto out;
                case RTM_GETTCLASS:
                        err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
                        goto out;
                default:
                        err = -EINVAL;
                        goto out;
                }
        }

        new_cl = cl;
        err = cops->change(q, clid, pid, tca, &new_cl);
        if (err == 0)
                tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);

out:
        if (cl)
                cops->put(q, cl);

        return err;
}
static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
                          unsigned long cl,
                          u32 pid, u32 seq, u16 flags, int event)
{
        struct tcmsg *tcm;
        struct nlmsghdr *nlh;
        unsigned char *b = skb_tail_pointer(skb);
        struct gnet_dump d;
        const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

        nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
        tcm = NLMSG_DATA(nlh);
        tcm->tcm_family = AF_UNSPEC;
        tcm->tcm_ifindex = q->dev->ifindex;
        tcm->tcm_parent = q->handle;
        tcm->tcm_handle = q->handle;
        tcm->tcm_info = 0;
        RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
        if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
                goto rtattr_failure;

        if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
                                         TCA_XSTATS, q->stats_lock, &d) < 0)
                goto rtattr_failure;

        if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
                goto rtattr_failure;

        if (gnet_stats_finish_copy(&d) < 0)
                goto rtattr_failure;

        nlh->nlmsg_len = skb_tail_pointer(skb) - b;
        return skb->len;

nlmsg_failure:
rtattr_failure:
        nlmsg_trim(skb, b);
        return -1;
}
static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
                         struct Qdisc *q, unsigned long cl, int event)
{
        struct sk_buff *skb;
        u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
        if (!skb)
                return -ENOBUFS;

        if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
                kfree_skb(skb);
                return -EINVAL;
        }

        return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
}
struct qdisc_dump_args
{
        struct qdisc_walker     w;
        struct sk_buff          *skb;
        struct netlink_callback *cb;
};

static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
{
        struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;

        return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
                              a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
}
static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct net *net = skb->sk->sk_net;
        int t;
        int s_t;
        struct net_device *dev;
        struct Qdisc *q;
        struct tcmsg *tcm = (struct tcmsg *)NLMSG_DATA(cb->nlh);
        struct qdisc_dump_args arg;

        if (net != &init_net)
                return 0;

        if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
                return 0;
        if ((dev = dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
                return 0;

        s_t = cb->args[0];
        t = 0;

        list_for_each_entry(q, &dev->qdisc_list, list) {
                if (t < s_t || !q->ops->cl_ops ||
                    (tcm->tcm_parent &&
                     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
                        t++;
                        continue;
                }
                if (t > s_t)
                        memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
                arg.w.fn = qdisc_class_dump;
                arg.skb = skb;
                arg.cb = cb;
                arg.w.stop = 0;
                arg.w.skip = cb->args[1];
                arg.w.count = 0;
                q->ops->cl_ops->walk(q, &arg.w);
                cb->args[1] = arg.w.count;
                if (arg.w.stop)
                        break;
                t++;
        }

        cb->args[0] = t;

        dev_put(dev);
        return skb->len;
}
/* Main classifier routine: scans the classifier chain attached
   to this qdisc, (optionally) tests for protocol, and asks
   specific classifiers.
 */
int tc_classify_compat(struct sk_buff *skb, struct tcf_proto *tp,
                       struct tcf_result *res)
{
        __be16 protocol = skb->protocol;
        int err = 0;

        for (; tp; tp = tp->next) {
                if ((tp->protocol == protocol ||
                     tp->protocol == htons(ETH_P_ALL)) &&
                    (err = tp->classify(skb, tp, res)) >= 0) {
#ifdef CONFIG_NET_CLS_ACT
                        if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
                                skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
#endif
                        return err;
                }
        }
        return -1;
}
EXPORT_SYMBOL(tc_classify_compat);
int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
                struct tcf_result *res)
{
        int err = 0;
        __be16 protocol;
#ifdef CONFIG_NET_CLS_ACT
        struct tcf_proto *otp = tp;
reclassify:
#endif
        protocol = skb->protocol;

        err = tc_classify_compat(skb, tp, res);
#ifdef CONFIG_NET_CLS_ACT
        if (err == TC_ACT_RECLASSIFY) {
                u32 verd = G_TC_VERD(skb->tc_verd);
                tp = otp;

                if (verd++ >= MAX_REC_LOOP) {
                        printk("rule prio %u protocol %02x reclassify loop, "
                               "packet dropped\n",
                               tp->prio&0xffff, ntohs(tp->protocol));
                        return TC_ACT_SHOT;
                }
                skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
                goto reclassify;
        }
#endif
        return err;
}
EXPORT_SYMBOL(tc_classify);
void tcf_destroy(struct tcf_proto *tp)
{
        tp->ops->destroy(tp);
        module_put(tp->ops->owner);
        kfree(tp);
}

void tcf_destroy_chain(struct tcf_proto *fl)
{
        struct tcf_proto *tp;

        while ((tp = fl) != NULL) {
                fl = tp->next;
                tcf_destroy(tp);
        }
}
EXPORT_SYMBOL(tcf_destroy_chain);
#ifdef CONFIG_PROC_FS
static int psched_show(struct seq_file *seq, void *v)
{
        struct timespec ts;

        hrtimer_get_res(CLOCK_MONOTONIC, &ts);
        seq_printf(seq, "%08x %08x %08x %08x\n",
                   (u32)NSEC_PER_USEC, (u32)PSCHED_US2NS(1),
                   1000000,
                   (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));

        return 0;
}

static int psched_open(struct inode *inode, struct file *file)
{
        return single_open(file, psched_show, PDE(inode)->data);
}

static const struct file_operations psched_fops = {
        .owner   = THIS_MODULE,
        .open    = psched_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = single_release,
};
#endif
static int __init pktsched_init(void)
{
        register_qdisc(&pfifo_qdisc_ops);
        register_qdisc(&bfifo_qdisc_ops);
        proc_net_fops_create(&init_net, "psched", 0, &psched_fops);

        rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
        rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL);
        rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc);
        rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL);
        rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL);
        rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass);

        return 0;
}

subsys_initcall(pktsched_init);
EXPORT_SYMBOL(qdisc_get_rtab);
EXPORT_SYMBOL(qdisc_put_rtab);
EXPORT_SYMBOL(register_qdisc);
EXPORT_SYMBOL(unregister_qdisc);