/*
 * net/sched/sch_api.c	Packet scheduler API.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/bitops.h>

#include <net/pkt_sched.h>

#include <asm/processor.h>
#include <asm/uaccess.h>
#include <asm/system.h>

static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new);
static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			 struct Qdisc *q, unsigned long cl, int event);

/*

   Short review.
   -------------

   This file consists of two interrelated parts:

   1. The queueing discipline manager frontend.
   2. The traffic class manager frontend.

   Generally, a queueing discipline ("qdisc") is a black box
   that can enqueue packets and dequeue them (when the device
   is ready to send something), in an order and at times
   determined by the algorithm hidden inside it.

   qdiscs fall into two categories:
   - "queues", which have no internal structure visible from the outside.
   - "schedulers", which split all packets into "traffic classes"
     using "packet classifiers" (see cls_api.c).

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them, and so on recursively.

   The goal of the routines in this file is to translate
   the information supplied by the user in the form of handles
   into a form more intelligible to the kernel, to perform some
   sanity checks and the part of the work common to all qdiscs,
   and to provide rtnetlink notifications.

   All the real intelligence lives inside the qdisc modules.



   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns a skb to send. It is allowed to return NULL,
   but that does not mean the queue is empty; it only means that the
   discipline does not want to send anything at this time.
   The queue is really empty only if q->q.qlen == 0.
   For complicated disciplines with multiple queues, q->q is not the
   real packet queue, but q->q.qlen must nevertheless be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If the packet (this one or another one) was dropped, it returns
   a non-zero error code:
   NET_XMIT_DROP	- this packet was dropped.
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN		- this packet was probably enqueued, but another one was dropped.
     Expected action: back off or ignore.
   NET_XMIT_POLICED	- dropped by the policer.
     Expected action: back off or report an error to real-time apps.

   Auxiliary routines:

   ---requeue

   requeues a once-dequeued packet. It is used for non-standard or
   just buggy devices that can defer output even when dev->tbusy=0.

   ---reset

   returns the qdisc to its initial state: purges all buffers and clears
   all timers and counters (except statistics).

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys the resources allocated by init and during the qdisc's lifetime.

   ---change

   changes qdisc parameters.
 */

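/*
 * Illustrative sketch (not part of this file): the shape of a minimal
 * FIFO-style discipline honouring the contract described above. All
 * "example_*" names are hypothetical; a real discipline lives in its
 * own sch_*.c module.
 *
 *	static int example_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 *	{
 *		if (skb_queue_len(&sch->q) < sch->dev->tx_queue_len) {
 *			__skb_queue_tail(&sch->q, skb);
 *			sch->qstats.backlog += skb->len;
 *			sch->bstats.bytes += skb->len;
 *			sch->bstats.packets++;
 *			return NET_XMIT_SUCCESS;
 *		}
 *		sch->qstats.drops++;
 *		kfree_skb(skb);
 *		return NET_XMIT_DROP;	// caller should not back off
 *	}
 *
 *	static struct sk_buff *example_dequeue(struct Qdisc *sch)
 *	{
 *		struct sk_buff *skb = __skb_dequeue(&sch->q);	// keeps q->q.qlen valid
 *		if (skb)
 *			sch->qstats.backlog -= skb->len;
 *		return skb;	// NULL does not necessarily mean "empty"
 *	}
 *
 *	static struct Qdisc_ops example_qdisc_ops = {
 *		.id		= "example",
 *		.enqueue	= example_enqueue,
 *		.dequeue	= example_dequeue,
 *		.owner		= THIS_MODULE,
 *	};
 */
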
/* Protects the list of registered TC modules. It is a pure SMP lock. */
static DEFINE_RWLOCK(qdisc_mod_lock);

/************************************************
 *	Queueing disciplines manipulation.	*
 ************************************************/


/* The list of all installed queueing disciplines. */

static struct Qdisc_ops *qdisc_base;

/* Register/unregister queueing discipline */

int register_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int rc = -EEXIST;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (!strcmp(qops->id, q->id))
			goto out;

	if (qops->enqueue == NULL)
		qops->enqueue = noop_qdisc_ops.enqueue;
	if (qops->requeue == NULL)
		qops->requeue = noop_qdisc_ops.requeue;
	if (qops->dequeue == NULL)
		qops->dequeue = noop_qdisc_ops.dequeue;

	qops->next = NULL;
	*qp = qops;
	rc = 0;
out:
	write_unlock(&qdisc_mod_lock);
	return rc;
}

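/*
 * Usage sketch (hypothetical module, not part of this file): a scheduler
 * module normally registers its Qdisc_ops on load and unregisters them on
 * unload, e.g. for the example_qdisc_ops sketched earlier:
 *
 *	static int __init example_module_init(void)
 *	{
 *		return register_qdisc(&example_qdisc_ops);
 *	}
 *
 *	static void __exit example_module_exit(void)
 *	{
 *		unregister_qdisc(&example_qdisc_ops);
 *	}
 *
 *	module_init(example_module_init)
 *	module_exit(example_module_exit)
 */
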
int unregister_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int err = -ENOENT;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (q == qops)
			break;
	if (q) {
		*qp = q->next;
		q->next = NULL;
		err = 0;
	}
	write_unlock(&qdisc_mod_lock);
	return err;
}

/* We know the handle. Find the qdisc among all qdiscs attached to the
   device (the root qdisc, all its children, children of children, etc.)
 */

struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
	struct Qdisc *q;

	read_lock_bh(&qdisc_tree_lock);
	list_for_each_entry(q, &dev->qdisc_list, list) {
		if (q->handle == handle) {
			read_unlock_bh(&qdisc_tree_lock);
			return q;
		}
	}
	read_unlock_bh(&qdisc_tree_lock);
	return NULL;
}

static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
	unsigned long cl;
	struct Qdisc *leaf;
	struct Qdisc_class_ops *cops = p->ops->cl_ops;

	if (cops == NULL)
		return NULL;
	cl = cops->get(p, classid);

	if (cl == 0)
		return NULL;
	leaf = cops->leaf(p, cl);
	cops->put(p, cl);
	return leaf;
}

/* Find queueing discipline by name */

static struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind)
{
	struct Qdisc_ops *q = NULL;

	if (kind) {
		read_lock(&qdisc_mod_lock);
		for (q = qdisc_base; q; q = q->next) {
			if (rtattr_strcmp(kind, q->id) == 0) {
				if (!try_module_get(q->owner))
					q = NULL;
				break;
			}
		}
		read_unlock(&qdisc_mod_lock);
	}
	return q;
}

static struct qdisc_rate_table *qdisc_rtab_list;

struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab)
{
	struct qdisc_rate_table *rtab;

	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
		if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
			rtab->refcnt++;
			return rtab;
		}
	}

	if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024)
		return NULL;

	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
	if (rtab) {
		rtab->rate = *r;
		rtab->refcnt = 1;
		memcpy(rtab->data, RTA_DATA(tab), 1024);
		rtab->next = qdisc_rtab_list;
		qdisc_rtab_list = rtab;
	}
	return rtab;
}

void qdisc_put_rtab(struct qdisc_rate_table *tab)
{
	struct qdisc_rate_table *rtab, **rtabp;

	if (!tab || --tab->refcnt)
		return;

	for (rtabp = &qdisc_rtab_list; (rtab = *rtabp) != NULL; rtabp = &rtab->next) {
		if (rtab == tab) {
			*rtabp = rtab->next;
			kfree(rtab);
			return;
		}
	}
}

/* Allocate a unique handle from the space managed by the kernel */

static u32 qdisc_alloc_handle(struct net_device *dev)
{
	int i = 0x10000;
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
	} while (qdisc_lookup(dev, autohandle) && --i > 0);

	return i > 0 ? autohandle : 0;
}

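/*
 * Worked example (illustrative): a qdisc handle keeps its 16-bit major
 * number in the upper half of the u32 and the minor number in the lower
 * half; TC_H_MAKE() just merges the two halves. The allocator above
 * therefore steps through the majors 8001:0, 8002:0, ...:
 *
 *	TC_H_MAKE(0x80000000U, 0) + TC_H_MAKE(0x10000U, 0) == 0x80010000
 *	TC_H_MAJ(0x80010000) == 0x80010000, TC_H_MIN(0x80010000) == 0
 *
 * i.e. the first automatically allocated handle is 8001:0 in tc notation.
 */
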
/* Attach toplevel qdisc to device dev */

static struct Qdisc *
dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc)
{
	struct Qdisc *oqdisc;

	if (dev->flags & IFF_UP)
		dev_deactivate(dev);

	qdisc_lock_tree(dev);
	if (qdisc && qdisc->flags&TCQ_F_INGRESS) {
		oqdisc = dev->qdisc_ingress;
		/* Prune old scheduler */
		if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) {
			/* delete */
			qdisc_reset(oqdisc);
			dev->qdisc_ingress = NULL;
		} else {  /* new */
			dev->qdisc_ingress = qdisc;
		}

	} else {

		oqdisc = dev->qdisc_sleeping;

		/* Prune old scheduler */
		if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
			qdisc_reset(oqdisc);

		/* ... and graft new one */
		if (qdisc == NULL)
			qdisc = &noop_qdisc;
		dev->qdisc_sleeping = qdisc;
		dev->qdisc = &noop_qdisc;
	}

	qdisc_unlock_tree(dev);

	if (dev->flags & IFF_UP)
		dev_activate(dev);

	return oqdisc;
}

/* Graft qdisc "new" to class "classid" of qdisc "parent" or
   to device "dev".

   Old qdisc is not destroyed but returned in *old.
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       u32 classid,
		       struct Qdisc *new, struct Qdisc **old)
{
	int err = 0;
	struct Qdisc *q = *old;


	if (parent == NULL) {
		if (q && q->flags&TCQ_F_INGRESS) {
			*old = dev_graft_qdisc(dev, q);
		} else {
			*old = dev_graft_qdisc(dev, new);
		}
	} else {
		struct Qdisc_class_ops *cops = parent->ops->cl_ops;

		err = -EINVAL;

		if (cops) {
			unsigned long cl = cops->get(parent, classid);
			if (cl) {
				err = cops->graft(parent, cl, new, old);
				if (new)
					new->parent = classid;
				cops->put(parent, cl);
			}
		}
	}
	return err;
}

/*
   Allocate and initialize new qdisc.

   Parameters are passed via opt.
 */

static struct Qdisc *
qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
{
	int err;
	struct rtattr *kind = tca[TCA_KIND-1];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_KMOD
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (rtattr_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load. So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request. We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the mean time.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* We will try again qdisc_lookup_ops,
				 * so don't keep a reference.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -EINVAL;
	if (ops == NULL)
		goto err_out;

	sch = qdisc_alloc(dev, ops);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	if (handle == TC_H_INGRESS) {
		sch->flags |= TCQ_F_INGRESS;
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
	} else if (handle == 0) {
		handle = qdisc_alloc_handle(dev);
		err = -ENOMEM;
		if (handle == 0)
			goto err_out3;
	}

	sch->handle = handle;

	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
#ifdef CONFIG_NET_ESTIMATOR
		if (tca[TCA_RATE-1]) {
			err = gen_new_estimator(&sch->bstats, &sch->rate_est,
						sch->stats_lock,
						tca[TCA_RATE-1]);
			if (err) {
				/*
				 * Any broken qdiscs that would require
				 * a ops->reset() here? The qdisc was never
				 * in action so it shouldn't be necessary.
				 */
				if (ops->destroy)
					ops->destroy(sch);
				goto err_out3;
			}
		}
#endif
		qdisc_lock_tree(dev);
		list_add_tail(&sch->list, &dev->qdisc_list);
		qdisc_unlock_tree(dev);

		return sch;
	}
err_out3:
	dev_put(dev);
	kfree((char *) sch - sch->padded);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;
}

static int qdisc_change(struct Qdisc *sch, struct rtattr **tca)
{
	if (tca[TCA_OPTIONS-1]) {
		int err;

		if (sch->ops->change == NULL)
			return -EINVAL;
		err = sch->ops->change(sch, tca[TCA_OPTIONS-1]);
		if (err)
			return err;
	}
#ifdef CONFIG_NET_ESTIMATOR
	if (tca[TCA_RATE-1])
		gen_replace_estimator(&sch->bstats, &sch->rate_est,
				      sch->stats_lock, tca[TCA_RATE-1]);
#endif
	return 0;
}

struct check_loop_arg
{
	struct qdisc_walker	w;
	struct Qdisc		*p;
	int			depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);

static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
	struct check_loop_arg	arg;

	if (q->ops->cl_ops == NULL)
		return 0;

	arg.w.stop = arg.w.skip = arg.w.count = 0;
	arg.w.fn = check_loop_fn;
	arg.depth = depth;
	arg.p = p;
	q->ops->cl_ops->walk(q, &arg.w);
	return arg.w.stop ? -ELOOP : 0;
}

static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
	struct Qdisc *leaf;
	struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct check_loop_arg *arg = (struct check_loop_arg *)w;

	leaf = cops->leaf(q, cl);
	if (leaf) {
		if (leaf == arg->p || arg->depth > 7)
			return -ELOOP;
		return check_loop(leaf, arg->p, arg->depth + 1);
	}
	return 0;
}

/*
 * Delete/get qdisc.
 */

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct rtattr **tca = arg;
	struct net_device *dev;
	u32 clid = tcm->tcm_parent;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->qdisc_ingress;
			}
		} else {
			q = dev->qdisc_sleeping;
		}
		if (!q)
			return -ENOENT;

		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
			return -EINVAL;
	} else {
		if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
			return -ENOENT;
	}

	if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
		return -EINVAL;

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid)
			return -EINVAL;
		if (q->handle == 0)
			return -ENOENT;
		if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0)
			return err;
		if (q) {
			qdisc_notify(skb, n, clid, q, NULL);
			spin_lock_bh(&dev->queue_lock);
			qdisc_destroy(q);
			spin_unlock_bh(&dev->queue_lock);
		}
	} else {
		qdisc_notify(skb, n, clid, NULL, q);
	}
	return 0;
}

/*
 * Create/change qdisc.
 */

static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct tcmsg *tcm;
	struct rtattr **tca;
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

replay:
	/* Reinit, just in case something touches this. */
	tcm = NLMSG_DATA(n);
	tca = arg;
	clid = tcm->tcm_parent;
	q = p = NULL;

	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->qdisc_ingress;
			}
		} else {
			q = dev->qdisc_sleeping;
		}

		/* It may be default qdisc, ignore it */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
					return -EEXIST;
				if (TC_H_MIN(tcm->tcm_handle))
					return -EINVAL;
				if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
					goto create_n_graft;
				if (n->nlmsg_flags&NLM_F_EXCL)
					return -EEXIST;
				if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
					return -EINVAL;
				if (q == p ||
				    (p && check_loop(q, p, 0)))
					return -ELOOP;
				atomic_inc(&q->refcnt);
				goto graft;
			} else {
				if (q == NULL)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 *   We know that some child q is already
				 *   attached to this parent and have a choice:
				 *   either to change it or to create/graft a
				 *   new one.
				 *
				 *   1. We are allowed to create/graft only
				 *   if CREATE and REPLACE flags are set.
				 *
				 *   2. If EXCL is set, the requestor wanted to
				 *   say that qdisc tcm_handle is not expected
				 *   to exist, so we choose create/graft too.
				 *
				 *   3. The last case is when no flags are set.
				 *   Alas, it is a sort of hole in the API; we
				 *   cannot decide what to do unambiguously.
				 *   For now we select create/graft if the
				 *   user gave a KIND which does not match the
				 *   existing one. (A summary table follows
				 *   this function.)
				 */
				if ((n->nlmsg_flags&NLM_F_CREATE) &&
				    (n->nlmsg_flags&NLM_F_REPLACE) &&
				    ((n->nlmsg_flags&NLM_F_EXCL) ||
				     (tca[TCA_KIND-1] &&
				      rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		if (!tcm->tcm_handle)
			return -EINVAL;
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (q == NULL)
		return -ENOENT;
	if (n->nlmsg_flags&NLM_F_EXCL)
		return -EEXIST;
	if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
		return -EINVAL;
	err = qdisc_change(q, tca);
	if (err == 0)
		qdisc_notify(skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags&NLM_F_CREATE))
		return -ENOENT;
	if (clid == TC_H_INGRESS)
		q = qdisc_create(dev, tcm->tcm_parent, tca, &err);
	else
		q = qdisc_create(dev, tcm->tcm_handle, tca, &err);
	if (q == NULL) {
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	if (1) {
		struct Qdisc *old_q = NULL;
		err = qdisc_graft(dev, p, clid, q, &old_q);
		if (err) {
			if (q) {
				spin_lock_bh(&dev->queue_lock);
				qdisc_destroy(q);
				spin_unlock_bh(&dev->queue_lock);
			}
			return err;
		}
		qdisc_notify(skb, n, clid, old_q, q);
		if (old_q) {
			spin_lock_bh(&dev->queue_lock);
			qdisc_destroy(old_q);
			spin_unlock_bh(&dev->queue_lock);
		}
	}
	return 0;
}

static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char	 *b = skb->tail;
	struct gnet_dump d;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm_ifindex = q->dev->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = atomic_read(&q->refcnt);
	RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto rtattr_failure;
	q->qstats.qlen = q->q.qlen;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
			TCA_XSTATS, q->stats_lock, &d) < 0)
		goto rtattr_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto rtattr_failure;

	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
#ifdef CONFIG_NET_ESTIMATOR
	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
#endif
	    gnet_stats_copy_queue(&d, &q->qstats) < 0)
		goto rtattr_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto rtattr_failure;

	nlh->nlmsg_len = skb->tail - b;
	return skb->len;

nlmsg_failure:
rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}

static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			u32 clid, struct Qdisc *old, struct Qdisc *new)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && old->handle) {
		if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
			goto err_out;
	}
	if (new) {
		if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
			goto err_out;
	}

	if (skb->len)
		return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}

static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;
	struct Qdisc *q;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];
	read_lock(&dev_base_lock);
	for (dev = dev_base, idx = 0; dev; dev = dev->next, idx++) {
		if (idx < s_idx)
			continue;
		if (idx > s_idx)
			s_q_idx = 0;
		read_lock_bh(&qdisc_tree_lock);
		q_idx = 0;
		list_for_each_entry(q, &dev->qdisc_list, list) {
			if (q_idx < s_q_idx) {
				q_idx++;
				continue;
			}
			if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
					  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) {
				read_unlock_bh(&qdisc_tree_lock);
				goto done;
			}
			q_idx++;
		}
		read_unlock_bh(&qdisc_tree_lock);
	}

done:
	read_unlock(&dev_base_lock);

	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}

/************************************************
 *	Traffic classes manipulation.		*
 ************************************************/

static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct rtattr **tca = arg;
	struct net_device *dev;
	struct Qdisc *q = NULL;
	struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 pid = tcm->tcm_parent;
	u32 clid = tcm->tcm_handle;
	u32 qid = TC_H_MAJ(clid);
	int err;

	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is root class.
	   parent == X:Y	 - parent is a node in hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0	 - generate handle from kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
	   handle == X:Y	 - clear.
	   handle == X:0	 - root class.

	   (A worked example follows this function.)
	 */

	/* Step 1. Determine qdisc handle X:0 */

	if (pid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(pid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev->qdisc_sleeping->handle;

		/* Now qid is a genuine qdisc handle consistent
		   with both parent and child.

		   TC_H_MAJ(pid) still may be unspecified, complete it now.
		 */
		if (pid)
			pid = TC_H_MAKE(qid, pid);
	} else {
		if (qid == 0)
			qid = dev->qdisc_sleeping->handle;
	}

	/* OK. Locate qdisc */
	if ((q = qdisc_lookup(dev, qid)) == NULL)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (pid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->get(q, clid);

	if (cl == 0) {
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags&NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = cops->delete(q, cl);
			if (err == 0)
				tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	new_cl = cl;
	err = -EOPNOTSUPP;
	if (cops->change)
		err = cops->change(q, clid, pid, tca, &new_cl);
	if (err == 0)
		tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);

out:
	if (cl)
		cops->put(q, cl);

	return err;
}

static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char	 *b = skb->tail;
	struct gnet_dump d;
	struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm_ifindex = q->dev->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto rtattr_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
			TCA_XSTATS, q->stats_lock, &d) < 0)
		goto rtattr_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto rtattr_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto rtattr_failure;

	nlh->nlmsg_len = skb->tail - b;
	return skb->len;

nlmsg_failure:
rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}

static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			 struct Qdisc *q, unsigned long cl, int event)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
}

struct qdisc_dump_args
{
	struct qdisc_walker	w;
	struct sk_buff		*skb;
	struct netlink_callback	*cb;
};

static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
{
	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;

	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
}

static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	int t;
	int s_t;
	struct net_device *dev;
	struct Qdisc *q;
	struct tcmsg *tcm = (struct tcmsg *)NLMSG_DATA(cb->nlh);
	struct qdisc_dump_args arg;

	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
		return 0;
	if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return 0;

	s_t = cb->args[0];
	t = 0;

	read_lock_bh(&qdisc_tree_lock);
	list_for_each_entry(q, &dev->qdisc_list, list) {
		if (t < s_t || !q->ops->cl_ops ||
		    (tcm->tcm_parent &&
		     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
			t++;
			continue;
		}
		if (t > s_t)
			memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
		arg.w.fn = qdisc_class_dump;
		arg.skb = skb;
		arg.cb = cb;
		arg.w.stop  = 0;
		arg.w.skip = cb->args[1];
		arg.w.count = 0;
		q->ops->cl_ops->walk(q, &arg.w);
		cb->args[1] = arg.w.count;
		if (arg.w.stop)
			break;
		t++;
	}
	read_unlock_bh(&qdisc_tree_lock);

	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}

/* Main classifier routine: scans the classifier chain attached
   to this qdisc, (optionally) tests for protocol and asks
   specific classifiers.
 */
int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
		struct tcf_result *res)
{
	int err = 0;
	u32 protocol = skb->protocol;
#ifdef CONFIG_NET_CLS_ACT
	struct tcf_proto *otp = tp;
reclassify:
#endif
	protocol = skb->protocol;

	for ( ; tp; tp = tp->next) {
		if ((tp->protocol == protocol ||
		     tp->protocol == __constant_htons(ETH_P_ALL)) &&
		    (err = tp->classify(skb, tp, res)) >= 0) {
#ifdef CONFIG_NET_CLS_ACT
			if (TC_ACT_RECLASSIFY == err) {
				__u32 verd = (__u32) G_TC_VERD(skb->tc_verd);

				tp = otp;

				if (MAX_REC_LOOP < verd++) {
					printk("rule prio %d protocol %02x reclassify is buggy packet dropped\n",
					       tp->prio&0xffff, ntohs(tp->protocol));
					return TC_ACT_SHOT;
				}
				skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
				goto reclassify;
			} else {
				if (skb->tc_verd)
					skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
				return err;
			}
#else
			return err;
#endif
		}
	}
	return -1;
}

static int psched_us_per_tick = 1;
static int psched_tick_per_us = 1;

#ifdef CONFIG_PROC_FS
static int psched_show(struct seq_file *seq, void *v)
{
	seq_printf(seq, "%08x %08x %08x %08x\n",
		   psched_tick_per_us, psched_us_per_tick,
		   1000000, HZ);

	return 0;
}

static int psched_open(struct inode *inode, struct file *file)
{
	return single_open(file, psched_show, PDE(inode)->data);
}

static struct file_operations psched_fops = {
	.owner		= THIS_MODULE,
	.open		= psched_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
#endif

#ifdef CONFIG_NET_SCH_CLK_CPU
psched_tdiff_t psched_clock_per_hz;
int psched_clock_scale;
EXPORT_SYMBOL(psched_clock_per_hz);
EXPORT_SYMBOL(psched_clock_scale);

psched_time_t psched_time_base;
cycles_t psched_time_mark;
EXPORT_SYMBOL(psched_time_mark);
EXPORT_SYMBOL(psched_time_base);

/*
 * Periodically adjust psched_time_base to avoid overflow
 * with 32-bit get_cycles(). Safe up to 4GHz CPU.
 */
static void psched_tick(unsigned long);
static DEFINE_TIMER(psched_timer, psched_tick, 0, 0);

static void psched_tick(unsigned long dummy)
{
	if (sizeof(cycles_t) == sizeof(u32)) {
		psched_time_t dummy_stamp;
		PSCHED_GET_TIME(dummy_stamp);
		psched_timer.expires = jiffies + 1*HZ;
		add_timer(&psched_timer);
	}
}

int __init psched_calibrate_clock(void)
{
	psched_time_t stamp, stamp1;
	struct timeval tv, tv1;
	psched_tdiff_t delay;
	long rdelay;
	unsigned long stop;

	psched_tick(0);
	stop = jiffies + HZ/10;
	PSCHED_GET_TIME(stamp);
	do_gettimeofday(&tv);
	while (time_before(jiffies, stop)) {
		barrier();
		cpu_relax();
	}
	PSCHED_GET_TIME(stamp1);
	do_gettimeofday(&tv1);

	delay = PSCHED_TDIFF(stamp1, stamp);
	rdelay = tv1.tv_usec - tv.tv_usec;
	rdelay += (tv1.tv_sec - tv.tv_sec)*1000000;
	if (rdelay > delay)
		return -1;
	delay /= rdelay;
	psched_tick_per_us = delay;
	while ((delay >>= 1) != 0)
		psched_clock_scale++;
	psched_us_per_tick = 1<<psched_clock_scale;
	psched_clock_per_hz = (psched_tick_per_us*(1000000/HZ))>>psched_clock_scale;
	return 0;
}
#endif

static int __init pktsched_init(void)
{
	struct rtnetlink_link *link_p;

#ifdef CONFIG_NET_SCH_CLK_CPU
	if (psched_calibrate_clock() < 0)
		return -1;
#elif defined(CONFIG_NET_SCH_CLK_JIFFIES)
	psched_tick_per_us = HZ<<PSCHED_JSCALE;
	psched_us_per_tick = 1000000;
#endif

	link_p = rtnetlink_links[PF_UNSPEC];

	/* Setup rtnetlink links. It is made here to avoid
	   exporting a large number of public symbols.
	 */

	if (link_p) {
		link_p[RTM_NEWQDISC-RTM_BASE].doit = tc_modify_qdisc;
		link_p[RTM_DELQDISC-RTM_BASE].doit = tc_get_qdisc;
		link_p[RTM_GETQDISC-RTM_BASE].doit = tc_get_qdisc;
		link_p[RTM_GETQDISC-RTM_BASE].dumpit = tc_dump_qdisc;
		link_p[RTM_NEWTCLASS-RTM_BASE].doit = tc_ctl_tclass;
		link_p[RTM_DELTCLASS-RTM_BASE].doit = tc_ctl_tclass;
		link_p[RTM_GETTCLASS-RTM_BASE].doit = tc_ctl_tclass;
		link_p[RTM_GETTCLASS-RTM_BASE].dumpit = tc_dump_tclass;
	}

	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	proc_net_fops_create("psched", 0, &psched_fops);

	return 0;
}

subsys_initcall(pktsched_init);

EXPORT_SYMBOL(qdisc_lookup);
EXPORT_SYMBOL(qdisc_get_rtab);
EXPORT_SYMBOL(qdisc_put_rtab);
EXPORT_SYMBOL(register_qdisc);
EXPORT_SYMBOL(unregister_qdisc);
EXPORT_SYMBOL(tc_classify);