/*
 * net/sched/sch_api.c	Packet scheduler API.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 */
18 #include <linux/config.h>
19 #include <linux/module.h>
20 #include <linux/types.h>
21 #include <linux/kernel.h>
22 #include <linux/sched.h>
23 #include <linux/string.h>
25 #include <linux/socket.h>
26 #include <linux/sockios.h>
28 #include <linux/errno.h>
29 #include <linux/interrupt.h>
30 #include <linux/netdevice.h>
31 #include <linux/skbuff.h>
32 #include <linux/rtnetlink.h>
33 #include <linux/init.h>
34 #include <linux/proc_fs.h>
35 #include <linux/seq_file.h>
36 #include <linux/kmod.h>
37 #include <linux/list.h>
38 #include <linux/bitops.h>
41 #include <net/pkt_sched.h>
43 #include <asm/processor.h>
44 #include <asm/uaccess.h>
45 #include <asm/system.h>
47 static int qdisc_notify(struct sk_buff
*oskb
, struct nlmsghdr
*n
, u32 clid
,
48 struct Qdisc
*old
, struct Qdisc
*new);
/*
 * Forward declaration: the definition appears further down in this file,
 * after the traffic-class control handler that calls it.
 */
static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			 struct Qdisc *q, unsigned long cl, int event);
/*
 * This file consists of two interrelated parts:
 *
 * 1. The queueing discipline manager frontend.
 * 2. The traffic class manager frontend.
 */
/*
 * Generally, a queueing discipline ("qdisc") is a black box that is able
 * to enqueue packets and to dequeue them (when the device is ready to send
 * something), in an order and at times determined by the algorithm hidden
 * inside it.
 *
 * Qdiscs are divided into two categories:
 * - "queues", which have no internal structure visible from outside;
 * - "schedulers", which split all packets into "traffic classes" using
 *   "packet classifiers" (see cls_api.c).
 *
 * In turn, classes may have child qdiscs (as a rule, queues) attached to
 * them, and so on.
 */
/*
 * The goal of the routines in this file is to translate the information
 * supplied by the user in the form of handles into a form more intelligible
 * to the kernel, to perform some sanity checks and the part of the work
 * that is common to all qdiscs, and to provide rtnetlink notifications.
 *
 * All the real, intelligent work is done inside the qdisc modules.
 */
/*
 * Every discipline has two major routines: enqueue and dequeue.
 *
 * dequeue usually returns an skb to send. It is allowed to return NULL,
 * but that does not mean the queue is empty; it only means that the
 * discipline does not want to send anything this time. The queue is
 * really empty only if q->q.qlen == 0. For complicated disciplines with
 * multiple queues, q->q is not a real packet queue, but q->q.qlen must
 * still be valid.
 */
/*
 * enqueue returns 0 if the packet was enqueued successfully. If a packet
 * (this one or another one) was dropped, it returns one of:
 *
 *   NET_XMIT_DROP    - this packet was dropped.
 *                      Expected action: do not back off, but wait until
 *                      the queue clears.
 *   NET_XMIT_CN      - this packet was probably enqueued, but another one
 *                      was dropped.
 *                      Expected action: back off or ignore.
 *   NET_XMIT_POLICED - dropped by policing.
 *                      Expected action: back off, or report an error to
 *                      real-time applications.
 */
/*
 * Other routines:
 *
 *   requeue - requeues a packet that has already been dequeued. It is used
 *             for non-standard or simply buggy devices, which can defer
 *             output even if dev->tbusy == 0.
 *   reset   - returns the qdisc to its initial state: purges all buffers,
 *             clears all timers and counters (except for statistics), etc.
 *   init    - initializes a newly created qdisc.
 *   destroy - destroys the resources allocated by init and during the
 *             lifetime of the qdisc.
 *   change  - changes the qdisc parameters.
 */
/*
 * Protects the list of registered TC modules (qdisc_base). It is a pure
 * SMP lock: register_qdisc()/unregister_qdisc() take it for writing and
 * qdisc_lookup_ops() takes it for reading.
 */
static DEFINE_RWLOCK(qdisc_mod_lock);
137 /************************************************
138 * Queueing disciplines manipulation. *
139 ************************************************/
/*
 * The list of all installed queueing disciplines: head of a singly linked
 * list chained through Qdisc_ops->next.
 */
static struct Qdisc_ops *qdisc_base;
/* Register/unregister a queueing discipline */
148 int register_qdisc(struct Qdisc_ops
*qops
)
150 struct Qdisc_ops
*q
, **qp
;
153 write_lock(&qdisc_mod_lock
);
154 for (qp
= &qdisc_base
; (q
= *qp
) != NULL
; qp
= &q
->next
)
155 if (!strcmp(qops
->id
, q
->id
))
158 if (qops
->enqueue
== NULL
)
159 qops
->enqueue
= noop_qdisc_ops
.enqueue
;
160 if (qops
->requeue
== NULL
)
161 qops
->requeue
= noop_qdisc_ops
.requeue
;
162 if (qops
->dequeue
== NULL
)
163 qops
->dequeue
= noop_qdisc_ops
.dequeue
;
169 write_unlock(&qdisc_mod_lock
);
173 int unregister_qdisc(struct Qdisc_ops
*qops
)
175 struct Qdisc_ops
*q
, **qp
;
178 write_lock(&qdisc_mod_lock
);
179 for (qp
= &qdisc_base
; (q
=*qp
)!=NULL
; qp
= &q
->next
)
187 write_unlock(&qdisc_mod_lock
);
/* Given a handle, find the matching qdisc among all qdiscs attached to the
   device (the root qdisc, its children, children of children, etc.). */
195 struct Qdisc
*qdisc_lookup(struct net_device
*dev
, u32 handle
)
199 read_lock_bh(&qdisc_tree_lock
);
200 list_for_each_entry(q
, &dev
->qdisc_list
, list
) {
201 if (q
->handle
== handle
) {
202 read_unlock_bh(&qdisc_tree_lock
);
206 read_unlock_bh(&qdisc_tree_lock
);
210 static struct Qdisc
*qdisc_leaf(struct Qdisc
*p
, u32 classid
)
214 struct Qdisc_class_ops
*cops
= p
->ops
->cl_ops
;
218 cl
= cops
->get(p
, classid
);
222 leaf
= cops
->leaf(p
, cl
);
/* Find a queueing discipline by name */
229 static struct Qdisc_ops
*qdisc_lookup_ops(struct rtattr
*kind
)
231 struct Qdisc_ops
*q
= NULL
;
234 read_lock(&qdisc_mod_lock
);
235 for (q
= qdisc_base
; q
; q
= q
->next
) {
236 if (rtattr_strcmp(kind
, q
->id
) == 0) {
237 if (!try_module_get(q
->owner
))
242 read_unlock(&qdisc_mod_lock
);
/*
 * Head of the list of rate tables handed out by qdisc_get_rtab(); entries
 * are chained through ->next.
 */
static struct qdisc_rate_table *qdisc_rtab_list;
249 struct qdisc_rate_table
*qdisc_get_rtab(struct tc_ratespec
*r
, struct rtattr
*tab
)
251 struct qdisc_rate_table
*rtab
;
253 for (rtab
= qdisc_rtab_list
; rtab
; rtab
= rtab
->next
) {
254 if (memcmp(&rtab
->rate
, r
, sizeof(struct tc_ratespec
)) == 0) {
260 if (tab
== NULL
|| r
->rate
== 0 || r
->cell_log
== 0 || RTA_PAYLOAD(tab
) != 1024)
263 rtab
= kmalloc(sizeof(*rtab
), GFP_KERNEL
);
267 memcpy(rtab
->data
, RTA_DATA(tab
), 1024);
268 rtab
->next
= qdisc_rtab_list
;
269 qdisc_rtab_list
= rtab
;
274 void qdisc_put_rtab(struct qdisc_rate_table
*tab
)
276 struct qdisc_rate_table
*rtab
, **rtabp
;
278 if (!tab
|| --tab
->refcnt
)
281 for (rtabp
= &qdisc_rtab_list
; (rtab
=*rtabp
) != NULL
; rtabp
= &rtab
->next
) {
/* Allocate a unique handle from the space managed by the kernel */
293 static u32
qdisc_alloc_handle(struct net_device
*dev
)
296 static u32 autohandle
= TC_H_MAKE(0x80000000U
, 0);
299 autohandle
+= TC_H_MAKE(0x10000U
, 0);
300 if (autohandle
== TC_H_MAKE(TC_H_ROOT
, 0))
301 autohandle
= TC_H_MAKE(0x80000000U
, 0);
302 } while (qdisc_lookup(dev
, autohandle
) && --i
> 0);
304 return i
>0 ? autohandle
: 0;
/* Attach a top-level qdisc to device dev */
309 static struct Qdisc
*
310 dev_graft_qdisc(struct net_device
*dev
, struct Qdisc
*qdisc
)
312 struct Qdisc
*oqdisc
;
314 if (dev
->flags
& IFF_UP
)
317 qdisc_lock_tree(dev
);
318 if (qdisc
&& qdisc
->flags
&TCQ_F_INGRESS
) {
319 oqdisc
= dev
->qdisc_ingress
;
320 /* Prune old scheduler */
321 if (oqdisc
&& atomic_read(&oqdisc
->refcnt
) <= 1) {
324 dev
->qdisc_ingress
= NULL
;
326 dev
->qdisc_ingress
= qdisc
;
331 oqdisc
= dev
->qdisc_sleeping
;
333 /* Prune old scheduler */
334 if (oqdisc
&& atomic_read(&oqdisc
->refcnt
) <= 1)
337 /* ... and graft new one */
340 dev
->qdisc_sleeping
= qdisc
;
341 dev
->qdisc
= &noop_qdisc
;
344 qdisc_unlock_tree(dev
);
346 if (dev
->flags
& IFF_UP
)
353 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
356 Old qdisc is not destroyed but returned in *old.
359 static int qdisc_graft(struct net_device
*dev
, struct Qdisc
*parent
,
361 struct Qdisc
*new, struct Qdisc
**old
)
364 struct Qdisc
*q
= *old
;
367 if (parent
== NULL
) {
368 if (q
&& q
->flags
&TCQ_F_INGRESS
) {
369 *old
= dev_graft_qdisc(dev
, q
);
371 *old
= dev_graft_qdisc(dev
, new);
374 struct Qdisc_class_ops
*cops
= parent
->ops
->cl_ops
;
379 unsigned long cl
= cops
->get(parent
, classid
);
381 err
= cops
->graft(parent
, cl
, new, old
);
383 new->parent
= classid
;
384 cops
->put(parent
, cl
);
392 Allocate and initialize new qdisc.
394 Parameters are passed via opt.
397 static struct Qdisc
*
398 qdisc_create(struct net_device
*dev
, u32 handle
, struct rtattr
**tca
, int *errp
)
401 struct rtattr
*kind
= tca
[TCA_KIND
-1];
403 struct Qdisc_ops
*ops
;
405 ops
= qdisc_lookup_ops(kind
);
407 if (ops
== NULL
&& kind
!= NULL
) {
409 if (rtattr_strlcpy(name
, kind
, IFNAMSIZ
) < IFNAMSIZ
) {
410 /* We dropped the RTNL semaphore in order to
411 * perform the module load. So, even if we
412 * succeeded in loading the module we have to
413 * tell the caller to replay the request. We
414 * indicate this using -EAGAIN.
415 * We replay the request because the device may
416 * go away in the mean time.
419 request_module("sch_%s", name
);
421 ops
= qdisc_lookup_ops(kind
);
423 /* We will try again qdisc_lookup_ops,
424 * so don't keep a reference.
426 module_put(ops
->owner
);
438 sch
= qdisc_alloc(dev
, ops
);
444 if (handle
== TC_H_INGRESS
) {
445 sch
->flags
|= TCQ_F_INGRESS
;
446 handle
= TC_H_MAKE(TC_H_INGRESS
, 0);
447 } else if (handle
== 0) {
448 handle
= qdisc_alloc_handle(dev
);
454 sch
->handle
= handle
;
456 if (!ops
->init
|| (err
= ops
->init(sch
, tca
[TCA_OPTIONS
-1])) == 0) {
457 #ifdef CONFIG_NET_ESTIMATOR
458 if (tca
[TCA_RATE
-1]) {
459 err
= gen_new_estimator(&sch
->bstats
, &sch
->rate_est
,
464 * Any broken qdiscs that would require
465 * a ops->reset() here? The qdisc was never
466 * in action so it shouldn't be necessary.
474 qdisc_lock_tree(dev
);
475 list_add_tail(&sch
->list
, &dev
->qdisc_list
);
476 qdisc_unlock_tree(dev
);
482 kfree((char *) sch
- sch
->padded
);
484 module_put(ops
->owner
);
490 static int qdisc_change(struct Qdisc
*sch
, struct rtattr
**tca
)
492 if (tca
[TCA_OPTIONS
-1]) {
495 if (sch
->ops
->change
== NULL
)
497 err
= sch
->ops
->change(sch
, tca
[TCA_OPTIONS
-1]);
501 #ifdef CONFIG_NET_ESTIMATOR
503 gen_replace_estimator(&sch
->bstats
, &sch
->rate_est
,
504 sch
->stats_lock
, tca
[TCA_RATE
-1]);
509 struct check_loop_arg
511 struct qdisc_walker w
;
/*
 * Forward declaration: check_loop() installs this as its walker callback
 * before the definition is seen.
 */
static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
518 static int check_loop(struct Qdisc
*q
, struct Qdisc
*p
, int depth
)
520 struct check_loop_arg arg
;
522 if (q
->ops
->cl_ops
== NULL
)
525 arg
.w
.stop
= arg
.w
.skip
= arg
.w
.count
= 0;
526 arg
.w
.fn
= check_loop_fn
;
529 q
->ops
->cl_ops
->walk(q
, &arg
.w
);
530 return arg
.w
.stop
? -ELOOP
: 0;
534 check_loop_fn(struct Qdisc
*q
, unsigned long cl
, struct qdisc_walker
*w
)
537 struct Qdisc_class_ops
*cops
= q
->ops
->cl_ops
;
538 struct check_loop_arg
*arg
= (struct check_loop_arg
*)w
;
540 leaf
= cops
->leaf(q
, cl
);
542 if (leaf
== arg
->p
|| arg
->depth
> 7)
544 return check_loop(leaf
, arg
->p
, arg
->depth
+ 1);
553 static int tc_get_qdisc(struct sk_buff
*skb
, struct nlmsghdr
*n
, void *arg
)
555 struct tcmsg
*tcm
= NLMSG_DATA(n
);
556 struct rtattr
**tca
= arg
;
557 struct net_device
*dev
;
558 u32 clid
= tcm
->tcm_parent
;
559 struct Qdisc
*q
= NULL
;
560 struct Qdisc
*p
= NULL
;
563 if ((dev
= __dev_get_by_index(tcm
->tcm_ifindex
)) == NULL
)
567 if (clid
!= TC_H_ROOT
) {
568 if (TC_H_MAJ(clid
) != TC_H_MAJ(TC_H_INGRESS
)) {
569 if ((p
= qdisc_lookup(dev
, TC_H_MAJ(clid
))) == NULL
)
571 q
= qdisc_leaf(p
, clid
);
572 } else { /* ingress */
573 q
= dev
->qdisc_ingress
;
576 q
= dev
->qdisc_sleeping
;
581 if (tcm
->tcm_handle
&& q
->handle
!= tcm
->tcm_handle
)
584 if ((q
= qdisc_lookup(dev
, tcm
->tcm_handle
)) == NULL
)
588 if (tca
[TCA_KIND
-1] && rtattr_strcmp(tca
[TCA_KIND
-1], q
->ops
->id
))
591 if (n
->nlmsg_type
== RTM_DELQDISC
) {
596 if ((err
= qdisc_graft(dev
, p
, clid
, NULL
, &q
)) != 0)
599 qdisc_notify(skb
, n
, clid
, q
, NULL
);
600 spin_lock_bh(&dev
->queue_lock
);
602 spin_unlock_bh(&dev
->queue_lock
);
605 qdisc_notify(skb
, n
, clid
, NULL
, q
);
614 static int tc_modify_qdisc(struct sk_buff
*skb
, struct nlmsghdr
*n
, void *arg
)
618 struct net_device
*dev
;
624 /* Reinit, just in case something touches this. */
627 clid
= tcm
->tcm_parent
;
630 if ((dev
= __dev_get_by_index(tcm
->tcm_ifindex
)) == NULL
)
634 if (clid
!= TC_H_ROOT
) {
635 if (clid
!= TC_H_INGRESS
) {
636 if ((p
= qdisc_lookup(dev
, TC_H_MAJ(clid
))) == NULL
)
638 q
= qdisc_leaf(p
, clid
);
639 } else { /*ingress */
640 q
= dev
->qdisc_ingress
;
643 q
= dev
->qdisc_sleeping
;
646 /* It may be default qdisc, ignore it */
647 if (q
&& q
->handle
== 0)
650 if (!q
|| !tcm
->tcm_handle
|| q
->handle
!= tcm
->tcm_handle
) {
651 if (tcm
->tcm_handle
) {
652 if (q
&& !(n
->nlmsg_flags
&NLM_F_REPLACE
))
654 if (TC_H_MIN(tcm
->tcm_handle
))
656 if ((q
= qdisc_lookup(dev
, tcm
->tcm_handle
)) == NULL
)
658 if (n
->nlmsg_flags
&NLM_F_EXCL
)
660 if (tca
[TCA_KIND
-1] && rtattr_strcmp(tca
[TCA_KIND
-1], q
->ops
->id
))
663 (p
&& check_loop(q
, p
, 0)))
665 atomic_inc(&q
->refcnt
);
671 /* This magic test requires explanation.
673 * We know, that some child q is already
674 * attached to this parent and have choice:
675 * either to change it or to create/graft new one.
677 * 1. We are allowed to create/graft only
678 * if CREATE and REPLACE flags are set.
680 * 2. If EXCL is set, requestor wanted to say,
681 * that qdisc tcm_handle is not expected
682 * to exist, so that we choose create/graft too.
684 * 3. The last case is when no flags are set.
685 * Alas, it is sort of hole in API, we
686 * cannot decide what to do unambiguously.
687 * For now we select create/graft, if
688 * user gave KIND, which does not match existing.
690 if ((n
->nlmsg_flags
&NLM_F_CREATE
) &&
691 (n
->nlmsg_flags
&NLM_F_REPLACE
) &&
692 ((n
->nlmsg_flags
&NLM_F_EXCL
) ||
694 rtattr_strcmp(tca
[TCA_KIND
-1], q
->ops
->id
))))
699 if (!tcm
->tcm_handle
)
701 q
= qdisc_lookup(dev
, tcm
->tcm_handle
);
704 /* Change qdisc parameters */
707 if (n
->nlmsg_flags
&NLM_F_EXCL
)
709 if (tca
[TCA_KIND
-1] && rtattr_strcmp(tca
[TCA_KIND
-1], q
->ops
->id
))
711 err
= qdisc_change(q
, tca
);
713 qdisc_notify(skb
, n
, clid
, NULL
, q
);
717 if (!(n
->nlmsg_flags
&NLM_F_CREATE
))
719 if (clid
== TC_H_INGRESS
)
720 q
= qdisc_create(dev
, tcm
->tcm_parent
, tca
, &err
);
722 q
= qdisc_create(dev
, tcm
->tcm_handle
, tca
, &err
);
731 struct Qdisc
*old_q
= NULL
;
732 err
= qdisc_graft(dev
, p
, clid
, q
, &old_q
);
735 spin_lock_bh(&dev
->queue_lock
);
737 spin_unlock_bh(&dev
->queue_lock
);
741 qdisc_notify(skb
, n
, clid
, old_q
, q
);
743 spin_lock_bh(&dev
->queue_lock
);
744 qdisc_destroy(old_q
);
745 spin_unlock_bh(&dev
->queue_lock
);
751 static int tc_fill_qdisc(struct sk_buff
*skb
, struct Qdisc
*q
, u32 clid
,
752 u32 pid
, u32 seq
, u16 flags
, int event
)
755 struct nlmsghdr
*nlh
;
756 unsigned char *b
= skb
->tail
;
759 nlh
= NLMSG_NEW(skb
, pid
, seq
, event
, sizeof(*tcm
), flags
);
760 tcm
= NLMSG_DATA(nlh
);
761 tcm
->tcm_family
= AF_UNSPEC
;
764 tcm
->tcm_ifindex
= q
->dev
->ifindex
;
765 tcm
->tcm_parent
= clid
;
766 tcm
->tcm_handle
= q
->handle
;
767 tcm
->tcm_info
= atomic_read(&q
->refcnt
);
768 RTA_PUT(skb
, TCA_KIND
, IFNAMSIZ
, q
->ops
->id
);
769 if (q
->ops
->dump
&& q
->ops
->dump(q
, skb
) < 0)
771 q
->qstats
.qlen
= q
->q
.qlen
;
773 if (gnet_stats_start_copy_compat(skb
, TCA_STATS2
, TCA_STATS
,
774 TCA_XSTATS
, q
->stats_lock
, &d
) < 0)
777 if (q
->ops
->dump_stats
&& q
->ops
->dump_stats(q
, &d
) < 0)
780 if (gnet_stats_copy_basic(&d
, &q
->bstats
) < 0 ||
781 #ifdef CONFIG_NET_ESTIMATOR
782 gnet_stats_copy_rate_est(&d
, &q
->rate_est
) < 0 ||
784 gnet_stats_copy_queue(&d
, &q
->qstats
) < 0)
787 if (gnet_stats_finish_copy(&d
) < 0)
790 nlh
->nlmsg_len
= skb
->tail
- b
;
795 skb_trim(skb
, b
- skb
->data
);
799 static int qdisc_notify(struct sk_buff
*oskb
, struct nlmsghdr
*n
,
800 u32 clid
, struct Qdisc
*old
, struct Qdisc
*new)
803 u32 pid
= oskb
? NETLINK_CB(oskb
).pid
: 0;
805 skb
= alloc_skb(NLMSG_GOODSIZE
, GFP_KERNEL
);
809 if (old
&& old
->handle
) {
810 if (tc_fill_qdisc(skb
, old
, clid
, pid
, n
->nlmsg_seq
, 0, RTM_DELQDISC
) < 0)
814 if (tc_fill_qdisc(skb
, new, clid
, pid
, n
->nlmsg_seq
, old
? NLM_F_REPLACE
: 0, RTM_NEWQDISC
) < 0)
819 return rtnetlink_send(skb
, pid
, RTNLGRP_TC
, n
->nlmsg_flags
&NLM_F_ECHO
);
826 static int tc_dump_qdisc(struct sk_buff
*skb
, struct netlink_callback
*cb
)
830 struct net_device
*dev
;
834 s_q_idx
= q_idx
= cb
->args
[1];
835 read_lock(&dev_base_lock
);
836 for (dev
=dev_base
, idx
=0; dev
; dev
= dev
->next
, idx
++) {
841 read_lock_bh(&qdisc_tree_lock
);
843 list_for_each_entry(q
, &dev
->qdisc_list
, list
) {
844 if (q_idx
< s_q_idx
) {
848 if (tc_fill_qdisc(skb
, q
, q
->parent
, NETLINK_CB(cb
->skb
).pid
,
849 cb
->nlh
->nlmsg_seq
, NLM_F_MULTI
, RTM_NEWQDISC
) <= 0) {
850 read_unlock_bh(&qdisc_tree_lock
);
855 read_unlock_bh(&qdisc_tree_lock
);
859 read_unlock(&dev_base_lock
);
869 /************************************************
870 * Traffic classes manipulation. *
871 ************************************************/
875 static int tc_ctl_tclass(struct sk_buff
*skb
, struct nlmsghdr
*n
, void *arg
)
877 struct tcmsg
*tcm
= NLMSG_DATA(n
);
878 struct rtattr
**tca
= arg
;
879 struct net_device
*dev
;
880 struct Qdisc
*q
= NULL
;
881 struct Qdisc_class_ops
*cops
;
882 unsigned long cl
= 0;
883 unsigned long new_cl
;
884 u32 pid
= tcm
->tcm_parent
;
885 u32 clid
= tcm
->tcm_handle
;
886 u32 qid
= TC_H_MAJ(clid
);
889 if ((dev
= __dev_get_by_index(tcm
->tcm_ifindex
)) == NULL
)
893 parent == TC_H_UNSPEC - unspecified parent.
894 parent == TC_H_ROOT - class is root, which has no parent.
895 parent == X:0 - parent is root class.
896 parent == X:Y - parent is a node in hierarchy.
897 parent == 0:Y - parent is X:Y, where X:0 is qdisc.
899 handle == 0:0 - generate handle from kernel pool.
900 handle == 0:Y - class is X:Y, where X:0 is qdisc.
901 handle == X:Y - clear.
902 handle == X:0 - root class.
905 /* Step 1. Determine qdisc handle X:0 */
907 if (pid
!= TC_H_ROOT
) {
908 u32 qid1
= TC_H_MAJ(pid
);
911 /* If both majors are known, they must be identical. */
917 qid
= dev
->qdisc_sleeping
->handle
;
919 /* Now qid is genuine qdisc handle consistent
920 both with parent and child.
922 TC_H_MAJ(pid) still may be unspecified, complete it now.
925 pid
= TC_H_MAKE(qid
, pid
);
928 qid
= dev
->qdisc_sleeping
->handle
;
931 /* OK. Locate qdisc */
932 if ((q
= qdisc_lookup(dev
, qid
)) == NULL
)
/* And check that it supports classes */
936 cops
= q
->ops
->cl_ops
;
940 /* Now try to get class */
942 if (pid
== TC_H_ROOT
)
945 clid
= TC_H_MAKE(qid
, clid
);
948 cl
= cops
->get(q
, clid
);
952 if (n
->nlmsg_type
!= RTM_NEWTCLASS
|| !(n
->nlmsg_flags
&NLM_F_CREATE
))
955 switch (n
->nlmsg_type
) {
958 if (n
->nlmsg_flags
&NLM_F_EXCL
)
962 err
= cops
->delete(q
, cl
);
964 tclass_notify(skb
, n
, q
, cl
, RTM_DELTCLASS
);
967 err
= tclass_notify(skb
, n
, q
, cl
, RTM_NEWTCLASS
);
976 err
= cops
->change(q
, clid
, pid
, tca
, &new_cl
);
978 tclass_notify(skb
, n
, q
, new_cl
, RTM_NEWTCLASS
);
988 static int tc_fill_tclass(struct sk_buff
*skb
, struct Qdisc
*q
,
990 u32 pid
, u32 seq
, u16 flags
, int event
)
993 struct nlmsghdr
*nlh
;
994 unsigned char *b
= skb
->tail
;
996 struct Qdisc_class_ops
*cl_ops
= q
->ops
->cl_ops
;
998 nlh
= NLMSG_NEW(skb
, pid
, seq
, event
, sizeof(*tcm
), flags
);
999 tcm
= NLMSG_DATA(nlh
);
1000 tcm
->tcm_family
= AF_UNSPEC
;
1001 tcm
->tcm_ifindex
= q
->dev
->ifindex
;
1002 tcm
->tcm_parent
= q
->handle
;
1003 tcm
->tcm_handle
= q
->handle
;
1005 RTA_PUT(skb
, TCA_KIND
, IFNAMSIZ
, q
->ops
->id
);
1006 if (cl_ops
->dump
&& cl_ops
->dump(q
, cl
, skb
, tcm
) < 0)
1007 goto rtattr_failure
;
1009 if (gnet_stats_start_copy_compat(skb
, TCA_STATS2
, TCA_STATS
,
1010 TCA_XSTATS
, q
->stats_lock
, &d
) < 0)
1011 goto rtattr_failure
;
1013 if (cl_ops
->dump_stats
&& cl_ops
->dump_stats(q
, cl
, &d
) < 0)
1014 goto rtattr_failure
;
1016 if (gnet_stats_finish_copy(&d
) < 0)
1017 goto rtattr_failure
;
1019 nlh
->nlmsg_len
= skb
->tail
- b
;
1024 skb_trim(skb
, b
- skb
->data
);
1028 static int tclass_notify(struct sk_buff
*oskb
, struct nlmsghdr
*n
,
1029 struct Qdisc
*q
, unsigned long cl
, int event
)
1031 struct sk_buff
*skb
;
1032 u32 pid
= oskb
? NETLINK_CB(oskb
).pid
: 0;
1034 skb
= alloc_skb(NLMSG_GOODSIZE
, GFP_KERNEL
);
1038 if (tc_fill_tclass(skb
, q
, cl
, pid
, n
->nlmsg_seq
, 0, event
) < 0) {
1043 return rtnetlink_send(skb
, pid
, RTNLGRP_TC
, n
->nlmsg_flags
&NLM_F_ECHO
);
1046 struct qdisc_dump_args
1048 struct qdisc_walker w
;
1049 struct sk_buff
*skb
;
1050 struct netlink_callback
*cb
;
1053 static int qdisc_class_dump(struct Qdisc
*q
, unsigned long cl
, struct qdisc_walker
*arg
)
1055 struct qdisc_dump_args
*a
= (struct qdisc_dump_args
*)arg
;
1057 return tc_fill_tclass(a
->skb
, q
, cl
, NETLINK_CB(a
->cb
->skb
).pid
,
1058 a
->cb
->nlh
->nlmsg_seq
, NLM_F_MULTI
, RTM_NEWTCLASS
);
1061 static int tc_dump_tclass(struct sk_buff
*skb
, struct netlink_callback
*cb
)
1065 struct net_device
*dev
;
1067 struct tcmsg
*tcm
= (struct tcmsg
*)NLMSG_DATA(cb
->nlh
);
1068 struct qdisc_dump_args arg
;
1070 if (cb
->nlh
->nlmsg_len
< NLMSG_LENGTH(sizeof(*tcm
)))
1072 if ((dev
= dev_get_by_index(tcm
->tcm_ifindex
)) == NULL
)
1078 read_lock_bh(&qdisc_tree_lock
);
1079 list_for_each_entry(q
, &dev
->qdisc_list
, list
) {
1080 if (t
< s_t
|| !q
->ops
->cl_ops
||
1082 TC_H_MAJ(tcm
->tcm_parent
) != q
->handle
)) {
1087 memset(&cb
->args
[1], 0, sizeof(cb
->args
)-sizeof(cb
->args
[0]));
1088 arg
.w
.fn
= qdisc_class_dump
;
1092 arg
.w
.skip
= cb
->args
[1];
1094 q
->ops
->cl_ops
->walk(q
, &arg
.w
);
1095 cb
->args
[1] = arg
.w
.count
;
1100 read_unlock_bh(&qdisc_tree_lock
);
1108 /* Main classifier routine: scans classifier chain attached
1109 to this qdisc, (optionally) tests for protocol and asks
1110 specific classifiers.
1112 int tc_classify(struct sk_buff
*skb
, struct tcf_proto
*tp
,
1113 struct tcf_result
*res
)
1116 u32 protocol
= skb
->protocol
;
1117 #ifdef CONFIG_NET_CLS_ACT
1118 struct tcf_proto
*otp
= tp
;
1121 protocol
= skb
->protocol
;
1123 for ( ; tp
; tp
= tp
->next
) {
1124 if ((tp
->protocol
== protocol
||
1125 tp
->protocol
== __constant_htons(ETH_P_ALL
)) &&
1126 (err
= tp
->classify(skb
, tp
, res
)) >= 0) {
1127 #ifdef CONFIG_NET_CLS_ACT
1128 if ( TC_ACT_RECLASSIFY
== err
) {
1129 __u32 verd
= (__u32
) G_TC_VERD(skb
->tc_verd
);
1132 if (MAX_REC_LOOP
< verd
++) {
1133 printk("rule prio %d protocol %02x reclassify is buggy packet dropped\n",
1134 tp
->prio
&0xffff, ntohs(tp
->protocol
));
1137 skb
->tc_verd
= SET_TC_VERD(skb
->tc_verd
,verd
);
1141 skb
->tc_verd
= SET_TC_VERD(skb
->tc_verd
,0);
/*
 * Clock conversion factors reported by psched_show(). Both default to 1
 * and are overwritten at init time according to the configured clock
 * source.
 */
static int psched_us_per_tick = 1;
static int psched_tick_per_us = 1;
1157 #ifdef CONFIG_PROC_FS
1158 static int psched_show(struct seq_file
*seq
, void *v
)
1160 seq_printf(seq
, "%08x %08x %08x %08x\n",
1161 psched_tick_per_us
, psched_us_per_tick
,
1167 static int psched_open(struct inode
*inode
, struct file
*file
)
1169 return single_open(file
, psched_show
, PDE(inode
)->data
);
1172 static struct file_operations psched_fops
= {
1173 .owner
= THIS_MODULE
,
1174 .open
= psched_open
,
1176 .llseek
= seq_lseek
,
1177 .release
= single_release
,
1181 #ifdef CONFIG_NET_SCH_CLK_CPU
1182 psched_tdiff_t psched_clock_per_hz
;
1183 int psched_clock_scale
;
1184 EXPORT_SYMBOL(psched_clock_per_hz
);
1185 EXPORT_SYMBOL(psched_clock_scale
);
1187 psched_time_t psched_time_base
;
1188 cycles_t psched_time_mark
;
1189 EXPORT_SYMBOL(psched_time_mark
);
1190 EXPORT_SYMBOL(psched_time_base
);
1193 * Periodically adjust psched_time_base to avoid overflow
1194 * with 32-bit get_cycles(). Safe up to 4GHz CPU.
1196 static void psched_tick(unsigned long);
1197 static DEFINE_TIMER(psched_timer
, psched_tick
, 0, 0);
1199 static void psched_tick(unsigned long dummy
)
1201 if (sizeof(cycles_t
) == sizeof(u32
)) {
1202 psched_time_t dummy_stamp
;
1203 PSCHED_GET_TIME(dummy_stamp
);
1204 psched_timer
.expires
= jiffies
+ 1*HZ
;
1205 add_timer(&psched_timer
);
1209 int __init
psched_calibrate_clock(void)
1211 psched_time_t stamp
, stamp1
;
1212 struct timeval tv
, tv1
;
1213 psched_tdiff_t delay
;
1218 stop
= jiffies
+ HZ
/10;
1219 PSCHED_GET_TIME(stamp
);
1220 do_gettimeofday(&tv
);
1221 while (time_before(jiffies
, stop
)) {
1225 PSCHED_GET_TIME(stamp1
);
1226 do_gettimeofday(&tv1
);
1228 delay
= PSCHED_TDIFF(stamp1
, stamp
);
1229 rdelay
= tv1
.tv_usec
- tv
.tv_usec
;
1230 rdelay
+= (tv1
.tv_sec
- tv
.tv_sec
)*1000000;
1234 psched_tick_per_us
= delay
;
1235 while ((delay
>>=1) != 0)
1236 psched_clock_scale
++;
1237 psched_us_per_tick
= 1<<psched_clock_scale
;
1238 psched_clock_per_hz
= (psched_tick_per_us
*(1000000/HZ
))>>psched_clock_scale
;
1243 static int __init
pktsched_init(void)
1245 struct rtnetlink_link
*link_p
;
1247 #ifdef CONFIG_NET_SCH_CLK_CPU
1248 if (psched_calibrate_clock() < 0)
1250 #elif defined(CONFIG_NET_SCH_CLK_JIFFIES)
1251 psched_tick_per_us
= HZ
<<PSCHED_JSCALE
;
1252 psched_us_per_tick
= 1000000;
1255 link_p
= rtnetlink_links
[PF_UNSPEC
];
1257 /* Setup rtnetlink links. It is made here to avoid
1258 exporting large number of public symbols.
1262 link_p
[RTM_NEWQDISC
-RTM_BASE
].doit
= tc_modify_qdisc
;
1263 link_p
[RTM_DELQDISC
-RTM_BASE
].doit
= tc_get_qdisc
;
1264 link_p
[RTM_GETQDISC
-RTM_BASE
].doit
= tc_get_qdisc
;
1265 link_p
[RTM_GETQDISC
-RTM_BASE
].dumpit
= tc_dump_qdisc
;
1266 link_p
[RTM_NEWTCLASS
-RTM_BASE
].doit
= tc_ctl_tclass
;
1267 link_p
[RTM_DELTCLASS
-RTM_BASE
].doit
= tc_ctl_tclass
;
1268 link_p
[RTM_GETTCLASS
-RTM_BASE
].doit
= tc_ctl_tclass
;
1269 link_p
[RTM_GETTCLASS
-RTM_BASE
].dumpit
= tc_dump_tclass
;
1272 register_qdisc(&pfifo_qdisc_ops
);
1273 register_qdisc(&bfifo_qdisc_ops
);
1274 proc_net_fops_create("psched", 0, &psched_fops
);
/* Run pktsched_init() at subsystem-initcall time. */
subsys_initcall(pktsched_init);

/* Symbols exported for use outside this file. */
EXPORT_SYMBOL(qdisc_lookup);
EXPORT_SYMBOL(qdisc_get_rtab);
EXPORT_SYMBOL(qdisc_put_rtab);
EXPORT_SYMBOL(register_qdisc);
EXPORT_SYMBOL(unregister_qdisc);
EXPORT_SYMBOL(tc_classify);