/*
 * net/sched/sch_generic.c	Generic packet scheduler routines.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *		Jamal Hadi Salim, <hadi@cyberus.ca> 990601
 *		- Ingress support
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <net/sock.h>
#include <net/pkt_sched.h>

#define SCHED_TX_DROP -2
#define SCHED_TX_QUEUE -3

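/* Internal return codes for the transmit helpers below: tx_islocked() frees
 * the packet on SCHED_TX_DROP and requeues it on SCHED_TX_QUEUE (see
 * handle_dev_cpu_collision()).
 */
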
/* Main transmission queue. */

/* Modifications to data participating in scheduling must be protected with
 * dev->queue_lock spinlock.
 *
 * The idea is the following:
 * - enqueue, dequeue are serialized via top level device
 *   spinlock dev->queue_lock.
 * - ingress filtering is serialized via top level device
 *   spinlock dev->ingress_lock.
 * - updates to tree and tree walking are only done under the rtnl mutex.
 */

void qdisc_lock_tree(struct net_device *dev)
{
	spin_lock_bh(&dev->queue_lock);
	spin_lock(&dev->ingress_lock);
}

void qdisc_unlock_tree(struct net_device *dev)
{
	spin_unlock(&dev->ingress_lock);
	spin_unlock_bh(&dev->queue_lock);
}

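/* Illustrative sketch (not from this file): a tree update done under the
 * rtnl mutex would typically bracket the pointer manipulation with the
 * helpers above, so enqueue/dequeue and ingress filtering are held off:
 *
 *	qdisc_lock_tree(dev);	   takes queue_lock (BH off) + ingress_lock
 *	... graft/replace qdisc pointers ...
 *	qdisc_unlock_tree(dev);
 */
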
static inline int qdisc_qlen(struct Qdisc *q)
{
	BUG_ON((int) q->q.qlen < 0);
	return q->q.qlen;
}

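/* If the CPU that already owns this device's xmit lock ends up here, the
 * driver has looped back into the qdisc layer (e.g. by re-entering
 * dev_queue_xmit() for the same device): drop the packet to break the dead
 * loop. Contention with another CPU is an ordinary collision: count it and
 * ask for a requeue.
 */
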
static inline int handle_dev_cpu_collision(struct net_device *dev)
{
	if (unlikely(dev->xmit_lock_owner == smp_processor_id())) {
		if (net_ratelimit())
			printk(KERN_WARNING
			       "Dead loop on netdevice %s, fix it urgently!\n",
			       dev->name);
		return SCHED_TX_DROP;
	}

	__get_cpu_var(netdev_rx_stat).cpu_collision++;
	return SCHED_TX_QUEUE;
}

static inline int
do_dev_requeue(struct sk_buff *skb, struct net_device *dev, struct Qdisc *q)
{
	if (unlikely(skb->next))
		dev->gso_skb = skb;
	else
		q->ops->requeue(skb, q);
	/* XXX: Could netif_schedule fail? Or does the fact that we are
	 * requeueing imply the hardware path is closed, so that even if
	 * we fail, some interrupt will eventually wake us up again?
	 */
	netif_schedule(dev);
	return 0;
}

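/* A deferred GSO segment left over from a previous transmit attempt is sent
 * before anything new is pulled off the qdisc.
 */
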
static inline struct sk_buff *
try_get_tx_pkt(struct net_device *dev, struct Qdisc *q)
{
	struct sk_buff *skb = dev->gso_skb;

	if (skb)
		dev->gso_skb = NULL;
	else
		skb = q->dequeue(q);

	return skb;
}

static inline int
tx_islocked(struct sk_buff *skb, struct net_device *dev, struct Qdisc *q)
{
	int ret = handle_dev_cpu_collision(dev);

	if (ret == SCHED_TX_DROP) {
		kfree_skb(skb);
		return qdisc_qlen(q);
	}

	return do_dev_requeue(skb, dev, q);
}

/*
 * NOTE: Called under dev->queue_lock with locally disabled BH.
 *
 * __LINK_STATE_QDISC_RUNNING guarantees only one CPU
 * can enter this region at a time.
 *
 * dev->queue_lock serializes queue accesses for this device
 * AND dev->qdisc pointer itself.
 *
 * netif_tx_lock serializes accesses to the device driver.
 *
 * dev->queue_lock and netif_tx_lock are mutually exclusive:
 * if one is grabbed, the other must be free.
 *
 * Multiple CPUs may contend for the two locks.
 *
 * Note that this procedure can be called by a watchdog timer.
 *
 * Returns to the caller:
 *	0  - queue is empty or throttled.
 *	>0 - queue is not empty.
 */

static inline int qdisc_restart(struct net_device *dev)
{
	struct Qdisc *q = dev->qdisc;
	unsigned lockless = (dev->features & NETIF_F_LLTX);
	struct sk_buff *skb;
	int ret;

	skb = try_get_tx_pkt(dev, q);
	if (skb == NULL)
		return 0;

	/* we have a packet to send */
	if (!lockless) {
		if (!netif_tx_trylock(dev))
			return tx_islocked(skb, dev, q);
	}

	/* all clear .. */
	spin_unlock(&dev->queue_lock);

	ret = NETDEV_TX_BUSY;
	if (!netif_queue_stopped(dev))
		/* churn baby churn .. */
		ret = dev_hard_start_xmit(skb, dev);

	if (!lockless)
		netif_tx_unlock(dev);

	spin_lock(&dev->queue_lock);

	/* we need to refresh q because it may be invalid since
	 * we dropped dev->queue_lock earlier ...
	 * so don't try to be clever, grasshopper
	 */
	q = dev->qdisc;

	/* most likely result, packet went ok */
	if (ret == NETDEV_TX_OK)
		return qdisc_qlen(q);

	/* only for lockless drivers .. */
	if (ret == NETDEV_TX_LOCKED && lockless)
		return tx_islocked(skb, dev, q);

	if (unlikely(ret != NETDEV_TX_BUSY && net_ratelimit()))
		printk(KERN_WARNING "BUG %s code %d qlen %d\n",
		       dev->name, ret, q->q.qlen);

	return do_dev_requeue(skb, dev, q);
}

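/* Keep pushing packets out until the qdisc is empty (qdisc_restart()
 * returns 0) or the driver stops the queue, then clear the
 * __LINK_STATE_QDISC_RUNNING bit that serializes entry into this path
 * (see the comment above qdisc_restart()).
 */
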
void __qdisc_run(struct net_device *dev)
{
	do {
		if (!qdisc_restart(dev))
			break;
	} while (!netif_queue_stopped(dev));

	clear_bit(__LINK_STATE_QDISC_RUNNING, &dev->state);
}

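/* Per-device transmit watchdog: if the queue has been stopped for longer
 * than dev->watchdog_timeo jiffies since the last transmission, call the
 * driver's tx_timeout() handler; re-arm while the device is present,
 * running and has carrier.
 */
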
static void dev_watchdog(unsigned long arg)
{
	struct net_device *dev = (struct net_device *)arg;

	netif_tx_lock(dev);
	if (dev->qdisc != &noop_qdisc) {
		if (netif_device_present(dev) &&
		    netif_running(dev) &&
		    netif_carrier_ok(dev)) {
			if (netif_queue_stopped(dev) &&
			    time_after(jiffies, dev->trans_start + dev->watchdog_timeo)) {
				printk(KERN_INFO "NETDEV WATCHDOG: %s: transmit timed out\n",
				       dev->name);
				dev->tx_timeout(dev);
			}
			if (!mod_timer(&dev->watchdog_timer,
				       round_jiffies(jiffies + dev->watchdog_timeo)))
				dev_hold(dev);
		}
	}
	netif_tx_unlock(dev);

	dev_put(dev);
}

static void dev_watchdog_init(struct net_device *dev)
{
	init_timer(&dev->watchdog_timer);
	dev->watchdog_timer.data = (unsigned long)dev;
	dev->watchdog_timer.function = dev_watchdog;
}

void __netdev_watchdog_up(struct net_device *dev)
{
	if (dev->tx_timeout) {
		if (dev->watchdog_timeo <= 0)
			dev->watchdog_timeo = 5*HZ;
		if (!mod_timer(&dev->watchdog_timer,
			       round_jiffies(jiffies + dev->watchdog_timeo)))
			dev_hold(dev);
	}
}

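/* Illustrative driver-side sketch (mydrv_tx_timeout is hypothetical): in
 * this kernel a driver opts into the watchdog by setting the handler before
 * register_netdev(); watchdog_timeo defaults to 5*HZ above if left at zero.
 *
 *	dev->tx_timeout = mydrv_tx_timeout;	called from dev_watchdog()
 *	dev->watchdog_timeo = 2 * HZ;		optional, else 5*HZ is used
 */
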
static void dev_watchdog_up(struct net_device *dev)
{
	__netdev_watchdog_up(dev);
}

static void dev_watchdog_down(struct net_device *dev)
{
	netif_tx_lock_bh(dev);
	if (del_timer(&dev->watchdog_timer))
		dev_put(dev);
	netif_tx_unlock_bh(dev);
}

void netif_carrier_on(struct net_device *dev)
{
	if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state))
		linkwatch_fire_event(dev);
	if (netif_running(dev))
		__netdev_watchdog_up(dev);
}

void netif_carrier_off(struct net_device *dev)
{
	if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state))
		linkwatch_fire_event(dev);
}

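/* Illustrative driver-side sketch: link-state interrupt handlers typically
 * report carrier transitions through these helpers, which also restart the
 * watchdog on carrier gain ("link_up" below is a placeholder):
 *
 *	if (link_up)
 *		netif_carrier_on(dev);
 *	else
 *		netif_carrier_off(dev);
 */
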
/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
   under all circumstances. It is difficult to invent anything faster or
   cheaper.
 */

static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
{
	kfree_skb(skb);
	return NET_XMIT_CN;
}

static struct sk_buff *noop_dequeue(struct Qdisc *qdisc)
{
	return NULL;
}

static int noop_requeue(struct sk_buff *skb, struct Qdisc *qdisc)
{
	if (net_ratelimit())
		printk(KERN_DEBUG "%s deferred output. It is buggy.\n",
		       skb->dev->name);
	kfree_skb(skb);
	return NET_XMIT_CN;
}

struct Qdisc_ops noop_qdisc_ops = {
	.id		=	"noop",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.requeue	=	noop_requeue,
	.owner		=	THIS_MODULE,
};

struct Qdisc noop_qdisc = {
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noop_qdisc_ops,
	.list		=	LIST_HEAD_INIT(noop_qdisc.list),
};

static struct Qdisc_ops noqueue_qdisc_ops = {
	.id		=	"noqueue",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.requeue	=	noop_requeue,
	.owner		=	THIS_MODULE,
};

static struct Qdisc noqueue_qdisc = {
	.enqueue	=	NULL,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noqueue_qdisc_ops,
	.list		=	LIST_HEAD_INIT(noqueue_qdisc.list),
};

static const u8 prio2band[TC_PRIO_MAX+1] =
	{ 1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 };

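/* Example of the mapping above: band 0 is dequeued first, so
 * TC_PRIO_INTERACTIVE (6) and TC_PRIO_CONTROL (7) land in band 0,
 * TC_PRIO_BESTEFFORT (0) in band 1, and TC_PRIO_BULK (2) in band 2.
 */
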
/* 3-band FIFO queue: old style, but should be a bit faster than
   generic prio+fifo combination.
 */

#define PFIFO_FAST_BANDS 3

static inline struct sk_buff_head *prio2list(struct sk_buff *skb,
					     struct Qdisc *qdisc)
{
	struct sk_buff_head *list = qdisc_priv(qdisc);
	return list + prio2band[skb->priority & TC_PRIO_MAX];
}

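/* Each band is capped at dev->tx_queue_len packets; once a band is full,
 * further packets for it are dropped via qdisc_drop().
 */
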
static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
{
	struct sk_buff_head *list = prio2list(skb, qdisc);

	if (skb_queue_len(list) < qdisc->dev->tx_queue_len) {
		qdisc->q.qlen++;
		return __qdisc_enqueue_tail(skb, qdisc, list);
	}

	return qdisc_drop(skb, qdisc);
}

static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
{
	int prio;
	struct sk_buff_head *list = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
		if (!skb_queue_empty(list + prio)) {
			qdisc->q.qlen--;
			return __qdisc_dequeue_head(qdisc, list + prio);
		}
	}

	return NULL;
}

static int pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc *qdisc)
{
	qdisc->q.qlen++;
	return __qdisc_requeue(skb, qdisc, prio2list(skb, qdisc));
}

static void pfifo_fast_reset(struct Qdisc *qdisc)
{
	int prio;
	struct sk_buff_head *list = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
		__qdisc_reset_queue(qdisc, list + prio);

	qdisc->qstats.backlog = 0;
	qdisc->q.qlen = 0;
}

static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
	struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };

	memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
	RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
	return skb->len;

rtattr_failure:
	return -1;
}

static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt)
{
	int prio;
	struct sk_buff_head *list = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
		skb_queue_head_init(list + prio);

	return 0;
}

static struct Qdisc_ops pfifo_fast_ops = {
	.id		=	"pfifo_fast",
	.priv_size	=	PFIFO_FAST_BANDS * sizeof(struct sk_buff_head),
	.enqueue	=	pfifo_fast_enqueue,
	.dequeue	=	pfifo_fast_dequeue,
	.requeue	=	pfifo_fast_requeue,
	.init		=	pfifo_fast_init,
	.reset		=	pfifo_fast_reset,
	.dump		=	pfifo_fast_dump,
	.owner		=	THIS_MODULE,
};

struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops)
{
	void *p;
	struct Qdisc *sch;
	unsigned int size;
	int err = -ENOBUFS;

	/* ensure that the Qdisc and the private data are 32-byte aligned */
	size = QDISC_ALIGN(sizeof(*sch));
	size += ops->priv_size + (QDISC_ALIGNTO - 1);

	p = kzalloc(size, GFP_KERNEL);
	if (!p)
		goto errout;
	sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
	sch->padded = (char *) sch - (char *) p;

	INIT_LIST_HEAD(&sch->list);
	skb_queue_head_init(&sch->q);
	sch->ops = ops;
	sch->enqueue = ops->enqueue;
	sch->dequeue = ops->dequeue;
	sch->dev = dev;
	dev_hold(dev);
	atomic_set(&sch->refcnt, 1);

	return sch;
errout:
	return ERR_PTR(err);	/* err is already negative (-ENOBUFS) */
}

struct Qdisc *qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops,
				unsigned int parentid)
{
	struct Qdisc *sch;

	sch = qdisc_alloc(dev, ops);
	if (IS_ERR(sch))
		goto errout;
	sch->stats_lock = &dev->queue_lock;
	sch->parent = parentid;

	if (!ops->init || ops->init(sch, NULL) == 0)
		return sch;

	qdisc_destroy(sch);
errout:
	return NULL;
}

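/* Illustrative sketch (based on how classful qdiscs such as sch_prio use
 * this API; details may differ): a parent creates default children with
 * qdisc_create_dflt() and keeps noop_qdisc in the slot on failure:
 *
 *	child = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops,
 *				  TC_H_MAKE(sch->handle, band + 1));
 *	if (child == NULL)
 *		child = &noop_qdisc;
 */
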
/* Under dev->queue_lock and BH! */

void qdisc_reset(struct Qdisc *qdisc)
{
	struct Qdisc_ops *ops = qdisc->ops;

	if (ops->reset)
		ops->reset(qdisc);
}

/* this is the rcu callback function to clean up a qdisc when there
 * are no further references to it */

static void __qdisc_destroy(struct rcu_head *head)
{
	struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu);
	kfree((char *) qdisc - qdisc->padded);
}

/* Under dev->queue_lock and BH! */

void qdisc_destroy(struct Qdisc *qdisc)
{
	struct Qdisc_ops *ops = qdisc->ops;

	if (qdisc->flags & TCQ_F_BUILTIN ||
	    !atomic_dec_and_test(&qdisc->refcnt))
		return;

	list_del(&qdisc->list);
#ifdef CONFIG_NET_ESTIMATOR
	gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
#endif
	if (ops->reset)
		ops->reset(qdisc);
	if (ops->destroy)
		ops->destroy(qdisc);

	module_put(ops->owner);
	dev_put(qdisc->dev);
	call_rcu(&qdisc->q_rcu, __qdisc_destroy);
}

void dev_activate(struct net_device *dev)
{
	/* No queueing discipline is attached to the device;
	 * create a default one, i.e. pfifo_fast for devices
	 * which need queueing and noqueue_qdisc for virtual
	 * interfaces.
	 */

	if (dev->qdisc_sleeping == &noop_qdisc) {
		struct Qdisc *qdisc;
		if (dev->tx_queue_len) {
			qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops,
						  TC_H_ROOT);
			if (qdisc == NULL) {
				printk(KERN_INFO "%s: activation failed\n",
				       dev->name);
				return;
			}
			list_add_tail(&qdisc->list, &dev->qdisc_list);
		} else {
			qdisc = &noqueue_qdisc;
		}
		dev->qdisc_sleeping = qdisc;
	}

	if (!netif_carrier_ok(dev))
		/* Delay activation until next carrier-on event */
		return;

	spin_lock_bh(&dev->queue_lock);
	rcu_assign_pointer(dev->qdisc, dev->qdisc_sleeping);
	if (dev->qdisc != &noqueue_qdisc) {
		dev->trans_start = jiffies;
		dev_watchdog_up(dev);
	}
	spin_unlock_bh(&dev->queue_lock);
}

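/* Deactivation swaps in noop_qdisc (so packets submitted from now on are
 * silently dropped), resets the old qdisc, and then waits for any
 * dev_queue_xmit() or qdisc_run() still executing on other CPUs to finish
 * before returning.
 */
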
void dev_deactivate(struct net_device *dev)
{
	struct Qdisc *qdisc;
	struct sk_buff *skb;

	spin_lock_bh(&dev->queue_lock);
	qdisc = dev->qdisc;
	dev->qdisc = &noop_qdisc;

	qdisc_reset(qdisc);

	skb = dev->gso_skb;
	dev->gso_skb = NULL;
	spin_unlock_bh(&dev->queue_lock);

	kfree_skb(skb);

	dev_watchdog_down(dev);

	/* Wait for outstanding dev_queue_xmit calls. */
	synchronize_rcu();

	/* Wait for outstanding qdisc_run calls. */
	while (test_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
		yield();
}

void dev_init_scheduler(struct net_device *dev)
{
	qdisc_lock_tree(dev);
	dev->qdisc = &noop_qdisc;
	dev->qdisc_sleeping = &noop_qdisc;
	INIT_LIST_HEAD(&dev->qdisc_list);
	qdisc_unlock_tree(dev);

	dev_watchdog_init(dev);
}

void dev_shutdown(struct net_device *dev)
{
	struct Qdisc *qdisc;

	qdisc_lock_tree(dev);
	qdisc = dev->qdisc_sleeping;
	dev->qdisc = &noop_qdisc;
	dev->qdisc_sleeping = &noop_qdisc;
	qdisc_destroy(qdisc);
#if defined(CONFIG_NET_SCH_INGRESS) || defined(CONFIG_NET_SCH_INGRESS_MODULE)
	if ((qdisc = dev->qdisc_ingress) != NULL) {
		dev->qdisc_ingress = NULL;
		qdisc_destroy(qdisc);
	}
#endif
	BUG_TRAP(!timer_pending(&dev->watchdog_timer));
	qdisc_unlock_tree(dev);
}

EXPORT_SYMBOL(netif_carrier_on);
EXPORT_SYMBOL(netif_carrier_off);
EXPORT_SYMBOL(noop_qdisc);
EXPORT_SYMBOL(qdisc_create_dflt);
EXPORT_SYMBOL(qdisc_destroy);
EXPORT_SYMBOL(qdisc_reset);
EXPORT_SYMBOL(qdisc_lock_tree);
EXPORT_SYMBOL(qdisc_unlock_tree);