[PKT_SCHED] netem: Orphan SKB when adding to queue.
/*
 * net/sched/sch_netem.c	Network emulator
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 *		Many of the algorithms and ideas for this came from
 *		NIST Net which is not copyrighted.
 *
 * Authors:	Stephen Hemminger <shemminger@osdl.org>
 *		Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
 */

#include <linux/module.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>

#include <net/pkt_sched.h>

#define VERSION "1.2"

/*	Network Emulation Queuing algorithm.
	====================================

	Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
		 Network Emulation Tool"
		 [2] Luigi Rizzo, DummyNet for FreeBSD

	 ----------------------------------------------------------------

	 This started out as a simple way to delay outgoing packets to
	 test TCP but has grown to include most of the functionality
	 of a full-blown network emulator like NISTnet. It can delay
	 packets and add random jitter (and correlation). The random
	 distribution can also be loaded from a table to provide
	 normal, Pareto, or experimental curves. Packet loss,
	 duplication, and reordering can also be emulated.

	 This qdisc does not do classification; that can be handled by
	 layering other disciplines. It does not need to do bandwidth
	 control either, since that can be handled by using token
	 bucket or other rate control.

	 The simulator is limited by the Linux timer resolution
	 and will create packet bursts on the HZ boundary (1ms).
*/

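/*
 * Example configuration (iproute2 "tc" syntax; the interface name and
 * parameter values below are illustrative only):
 *
 *	# 100ms delay, 10ms jitter, 25% correlation between samples
 *	tc qdisc add dev eth0 root netem delay 100ms 10ms 25%
 *
 *	# add 0.1% loss and 1% duplication on top of the delay
 *	tc qdisc change dev eth0 root netem delay 100ms 10ms 25% \
 *		loss 0.1% duplicate 1%
 */
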
struct netem_sched_data {
	struct Qdisc	*qdisc;
	struct timer_list timer;

	u32 latency;
	u32 loss;
	u32 limit;
	u32 counter;
	u32 gap;
	u32 jitter;
	u32 duplicate;
	u32 reorder;
	u32 corrupt;

	struct crndstate {
		unsigned long last;
		unsigned long rho;
	} delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;

	struct disttable {
		u32 size;
		s16 table[0];
	} *delay_dist;
};

/* Time stamp put into socket buffer control block */
struct netem_skb_cb {
	psched_time_t	time_to_send;
};

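/*
 * Note: skb->cb is scratch space owned by whichever layer currently
 * holds the skb, so the computed departure time can be stashed there
 * while the packet sits in the internal time-sorted queue.
 */
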
/* init_crandom - initialize correlated random number generator
 * Use entropy source for initial seed.
 */
static void init_crandom(struct crndstate *state, unsigned long rho)
{
	state->rho = rho;
	state->last = net_random();
}

/* get_crandom - correlated random number generator
 * Next number depends on last value.
 * rho is scaled to avoid floating point.
 */
static unsigned long get_crandom(struct crndstate *state)
{
	u64 value, rho;
	unsigned long answer;

	if (state->rho == 0)	/* no correlation */
		return net_random();

	value = net_random();
	rho = (u64)state->rho + 1;
	answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
	state->last = answer;
	return answer;
}

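/*
 * The blend above is fixed point with 2^32 as full scale:
 *
 *	answer = (1 - p) * U + p * last,   where p ~= rho / 2^32
 *
 * i.e. a first-order autoregressive filter over uniform samples, which
 * is what makes consecutive values correlated with strength rho.
 */
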
/* tabledist - return a pseudo-randomly distributed value with mean mu and
 * std deviation sigma.  Uses table lookup to approximate the desired
 * distribution, and a uniformly-distributed pseudo-random source.
 */
static long tabledist(unsigned long mu, long sigma,
		      struct crndstate *state, const struct disttable *dist)
{
	long t, x;
	unsigned long rnd;

	if (sigma == 0)
		return mu;

	rnd = get_crandom(state);

	/* default uniform distribution */
	if (dist == NULL)
		return (rnd % (2*sigma)) - sigma + mu;

	t = dist->table[rnd % dist->size];
	x = (sigma % NETEM_DIST_SCALE) * t;
	if (x >= 0)
		x += NETEM_DIST_SCALE/2;
	else
		x -= NETEM_DIST_SCALE/2;

	return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
}

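/*
 * The return value computes mu + t * sigma / NETEM_DIST_SCALE without
 * overflowing: sigma is split into its quotient and remainder modulo
 * NETEM_DIST_SCALE, each part is scaled by the table entry t, and the
 * remainder term is rounded to the nearest integer before the divide.
 */
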
/*
 * Insert one skb into qdisc.
 * Note: parent depends on return value to account for queue length.
 *	NET_XMIT_DROP: queue length didn't change.
 *	NET_XMIT_SUCCESS: one skb was queued.
 */
static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	/* We don't fill cb now as skb_unshare() may invalidate it */
	struct netem_skb_cb *cb;
	struct sk_buff *skb2;
	int ret;
	int count = 1;

	pr_debug("netem_enqueue skb=%p\n", skb);

	/* Random duplication */
	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
		++count;

	/* Random packet drop 0 => none, ~0 => all */
	if (q->loss && q->loss >= get_crandom(&q->loss_cor))
		--count;

	if (count == 0) {
		sch->qstats.drops++;
		kfree_skb(skb);
		return NET_XMIT_BYPASS;
	}

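	/*
	 * Orphan the skb before it sits in the delay queue: this runs the
	 * destructor and detaches the skb from the sending socket, so
	 * socket send-buffer accounting is released immediately instead
	 * of being held for the full emulated latency.
	 */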
	skb_orphan(skb);

	/*
	 * If we need to duplicate packet, then re-insert at top of the
	 * qdisc tree, since parent queuer expects that only one
	 * skb will be queued.
	 */
	if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
		struct Qdisc *rootq = sch->dev->qdisc;
		u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
		q->duplicate = 0;

		rootq->enqueue(skb2, rootq);
		q->duplicate = dupsave;
	}

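	/*
	 * Going through the root qdisc means the clone traverses the whole
	 * tree again, including this netem instance, so it is subject to an
	 * independently drawn delay/loss; only duplication itself is
	 * suppressed (via dupsave above) while the clone is re-enqueued.
	 */
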
	/*
	 * Randomized packet corruption.
	 * Make a copy if needed since we are modifying the data.
	 * If the packet is going to be hardware checksummed, then
	 * do it now in software before we mangle it.
	 */
	if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
		if (!(skb = skb_unshare(skb, GFP_ATOMIC))
		    || (skb->ip_summed == CHECKSUM_PARTIAL
			&& skb_checksum_help(skb))) {
			sch->qstats.drops++;
			return NET_XMIT_DROP;
		}

		skb->data[net_random() % skb_headlen(skb)] ^= 1<<(net_random() % 8);
	}

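	/*
	 * The corruption above flips exactly one randomly chosen bit in a
	 * randomly chosen byte of the linear (non-paged) packet data.
	 */
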
	cb = (struct netem_skb_cb *)skb->cb;

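	/*
	 * Reordering model: normally every packet gets a delay stamp and
	 * goes into the time-sorted tfifo.  Once q->gap packets have
	 * passed, the next packet becomes a reorder candidate: with
	 * probability q->reorder it is sent with no added delay and jumps
	 * to the head of the queue via requeue.
	 */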
	if (q->gap == 0			/* not doing reordering */
	    || q->counter < q->gap	/* inside last reordering gap */
	    || q->reorder < get_crandom(&q->reorder_cor)) {
		psched_time_t now;
		psched_tdiff_t delay;

		delay = tabledist(q->latency, q->jitter,
				  &q->delay_cor, q->delay_dist);

		PSCHED_GET_TIME(now);
		PSCHED_TADD2(now, delay, cb->time_to_send);
		++q->counter;
		ret = q->qdisc->enqueue(skb, q->qdisc);
	} else {
		/*
		 * Do re-ordering by putting one out of N packets at the front
		 * of the queue.
		 */
		PSCHED_GET_TIME(cb->time_to_send);
		q->counter = 0;
		ret = q->qdisc->ops->requeue(skb, q->qdisc);
	}

	if (likely(ret == NET_XMIT_SUCCESS)) {
		sch->q.qlen++;
		sch->bstats.bytes += skb->len;
		sch->bstats.packets++;
	} else
		sch->qstats.drops++;

	pr_debug("netem: enqueue ret %d\n", ret);
	return ret;
}

/* Requeue packets but don't change time stamp */
static int netem_requeue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	int ret;

	if ((ret = q->qdisc->ops->requeue(skb, q->qdisc)) == 0) {
		sch->q.qlen++;
		sch->qstats.requeues++;
	}

	return ret;
}

static unsigned int netem_drop(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	unsigned int len = 0;

	if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) {
		sch->q.qlen--;
		sch->qstats.drops++;
	}
	return len;
}

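/*
 * Dequeue: peel the earliest packet off the time-sorted inner queue.
 * If its departure time has not arrived yet, put it back, arm the
 * watchdog timer for the remaining interval, and mark the qdisc as
 * throttled so the device does not keep polling it.
 */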
static struct sk_buff *netem_dequeue(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	skb = q->qdisc->dequeue(q->qdisc);
	if (skb) {
		const struct netem_skb_cb *cb
			= (const struct netem_skb_cb *)skb->cb;
		psched_time_t now;

		/* is it time to send it yet? */
		PSCHED_GET_TIME(now);

		if (PSCHED_TLESS(cb->time_to_send, now)) {
			pr_debug("netem_dequeue: return skb=%p\n", skb);
			sch->q.qlen--;
			sch->flags &= ~TCQ_F_THROTTLED;
			return skb;
		} else {
			psched_tdiff_t delay = PSCHED_TDIFF(cb->time_to_send, now);

			if (q->qdisc->ops->requeue(skb, q->qdisc) != NET_XMIT_SUCCESS) {
				sch->qstats.drops++;

				/* After this, qlen is confused */
				printk(KERN_ERR "netem: queue discipline %s could not requeue\n",
				       q->qdisc->ops->id);

				sch->q.qlen--;
			}

			mod_timer(&q->timer, jiffies + PSCHED_US2JIFFIE(delay));
			sch->flags |= TCQ_F_THROTTLED;
		}
	}

	return NULL;
}

static void netem_watchdog(unsigned long arg)
{
	struct Qdisc *sch = (struct Qdisc *)arg;

	pr_debug("netem_watchdog qlen=%d\n", sch->q.qlen);
	sch->flags &= ~TCQ_F_THROTTLED;
	netif_schedule(sch->dev);
}

static void netem_reset(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	qdisc_reset(q->qdisc);
	sch->q.qlen = 0;
	sch->flags &= ~TCQ_F_THROTTLED;
	del_timer_sync(&q->timer);
}

/* Pass size change message down to embedded FIFO */
static int set_fifo_limit(struct Qdisc *q, int limit)
{
	struct rtattr *rta;
	int ret = -ENOMEM;

	/* Hack to avoid sending change message to non-FIFO */
	if (strncmp(q->ops->id + 1, "fifo", 4) != 0)
		return 0;

	rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL);
	if (rta) {
		rta->rta_type = RTM_NEWQDISC;
		rta->rta_len = RTA_LENGTH(sizeof(struct tc_fifo_qopt));
		((struct tc_fifo_qopt *)RTA_DATA(rta))->limit = limit;

		ret = q->ops->change(q, rta);
		kfree(rta);
	}
	return ret;
}

/*
 * Distribution data is a variable size payload containing
 * signed 16 bit values.
 */
static int get_dist_table(struct Qdisc *sch, const struct rtattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	unsigned long n = RTA_PAYLOAD(attr)/sizeof(__s16);
	const __s16 *data = RTA_DATA(attr);
	struct disttable *d;
	int i;

	if (n > 65536)
		return -EINVAL;

	d = kmalloc(sizeof(*d) + n*sizeof(d->table[0]), GFP_KERNEL);
	if (!d)
		return -ENOMEM;

	d->size = n;
	for (i = 0; i < n; i++)
		d->table[i] = data[i];

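	/*
	 * Swap the new table in under dev->queue_lock so the enqueue path
	 * never observes a half-installed table; xchg() hands back the old
	 * table, which is freed after the lock is dropped.
	 */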
	spin_lock_bh(&sch->dev->queue_lock);
	d = xchg(&q->delay_dist, d);
	spin_unlock_bh(&sch->dev->queue_lock);

	kfree(d);
	return 0;
}

static int get_correlation(struct Qdisc *sch, const struct rtattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	const struct tc_netem_corr *c = RTA_DATA(attr);

	if (RTA_PAYLOAD(attr) != sizeof(*c))
		return -EINVAL;

	init_crandom(&q->delay_cor, c->delay_corr);
	init_crandom(&q->loss_cor, c->loss_corr);
	init_crandom(&q->dup_cor, c->dup_corr);
	return 0;
}

static int get_reorder(struct Qdisc *sch, const struct rtattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	const struct tc_netem_reorder *r = RTA_DATA(attr);

	if (RTA_PAYLOAD(attr) != sizeof(*r))
		return -EINVAL;

	q->reorder = r->probability;
	init_crandom(&q->reorder_cor, r->correlation);
	return 0;
}

static int get_corrupt(struct Qdisc *sch, const struct rtattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	const struct tc_netem_corrupt *r = RTA_DATA(attr);

	if (RTA_PAYLOAD(attr) != sizeof(*r))
		return -EINVAL;

	q->corrupt = r->probability;
	init_crandom(&q->corrupt_cor, r->correlation);
	return 0;
}

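/*
 * All probability/correlation fields from userspace are 32-bit fixed
 * point: 0 means never and ~0 (0xffffffff) means always, so they can be
 * compared directly against the 32-bit output of get_crandom().
 */
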
/* Parse netlink message to set options */
static int netem_change(struct Qdisc *sch, struct rtattr *opt)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct tc_netem_qopt *qopt;
	int ret;

	if (opt == NULL || RTA_PAYLOAD(opt) < sizeof(*qopt))
		return -EINVAL;

	qopt = RTA_DATA(opt);
	ret = set_fifo_limit(q->qdisc, qopt->limit);
	if (ret) {
		pr_debug("netem: can't set fifo limit\n");
		return ret;
	}

	q->latency = qopt->latency;
	q->jitter = qopt->jitter;
	q->limit = qopt->limit;
	q->gap = qopt->gap;
	q->counter = 0;
	q->loss = qopt->loss;
	q->duplicate = qopt->duplicate;

	/* For compatibility with earlier versions:
	 * if gap is set, assume 100% reorder probability.
	 */
	q->reorder = ~0;

	/* Handle nested options after initial queue options.
	 * Should have put all options in nested format but too late now.
	 */
	if (RTA_PAYLOAD(opt) > sizeof(*qopt)) {
		struct rtattr *tb[TCA_NETEM_MAX];
		if (rtattr_parse(tb, TCA_NETEM_MAX,
				 RTA_DATA(opt) + sizeof(*qopt),
				 RTA_PAYLOAD(opt) - sizeof(*qopt)))
			return -EINVAL;

		if (tb[TCA_NETEM_CORR-1]) {
			ret = get_correlation(sch, tb[TCA_NETEM_CORR-1]);
			if (ret)
				return ret;
		}

		if (tb[TCA_NETEM_DELAY_DIST-1]) {
			ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST-1]);
			if (ret)
				return ret;
		}

		if (tb[TCA_NETEM_REORDER-1]) {
			ret = get_reorder(sch, tb[TCA_NETEM_REORDER-1]);
			if (ret)
				return ret;
		}

		if (tb[TCA_NETEM_CORRUPT-1]) {
			ret = get_corrupt(sch, tb[TCA_NETEM_CORRUPT-1]);
			if (ret)
				return ret;
		}
	}

	return 0;
}

/*
 * Special case version of FIFO queue for use by netem.
 * It queues packets in order of the timestamps in their control blocks.
 */
struct fifo_sched_data {
	u32 limit;
};

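/*
 * Jitter can make time_to_send non-monotonic, so tfifo_enqueue walks
 * backwards from the tail to find the insertion point; with little or
 * no jitter the new packet belongs at the tail and the walk terminates
 * after a single comparison.
 */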
static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
{
	struct fifo_sched_data *q = qdisc_priv(sch);
	struct sk_buff_head *list = &sch->q;
	const struct netem_skb_cb *ncb
		= (const struct netem_skb_cb *)nskb->cb;
	struct sk_buff *skb;

	if (likely(skb_queue_len(list) < q->limit)) {
		skb_queue_reverse_walk(list, skb) {
			const struct netem_skb_cb *cb
				= (const struct netem_skb_cb *)skb->cb;

			if (!PSCHED_TLESS(ncb->time_to_send, cb->time_to_send))
				break;
		}

		__skb_queue_after(list, skb, nskb);

		sch->qstats.backlog += nskb->len;
		sch->bstats.bytes += nskb->len;
		sch->bstats.packets++;

		return NET_XMIT_SUCCESS;
	}

	return qdisc_drop(nskb, sch);
}

static int tfifo_init(struct Qdisc *sch, struct rtattr *opt)
{
	struct fifo_sched_data *q = qdisc_priv(sch);

	if (opt) {
		struct tc_fifo_qopt *ctl = RTA_DATA(opt);
		if (RTA_PAYLOAD(opt) < sizeof(*ctl))
			return -EINVAL;

		q->limit = ctl->limit;
	} else
		q->limit = max_t(u32, sch->dev->tx_queue_len, 1);

	return 0;
}

static int tfifo_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct fifo_sched_data *q = qdisc_priv(sch);
	struct tc_fifo_qopt opt = { .limit = q->limit };

	RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
	return skb->len;

rtattr_failure:
	return -1;
}

static struct Qdisc_ops tfifo_qdisc_ops = {
	.id		=	"tfifo",
	.priv_size	=	sizeof(struct fifo_sched_data),
	.enqueue	=	tfifo_enqueue,
	.dequeue	=	qdisc_dequeue_head,
	.requeue	=	qdisc_requeue,
	.drop		=	qdisc_queue_drop,
	.init		=	tfifo_init,
	.reset		=	qdisc_reset_queue,
	.change		=	tfifo_init,
	.dump		=	tfifo_dump,
};

static int netem_init(struct Qdisc *sch, struct rtattr *opt)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	int ret;

	if (!opt)
		return -EINVAL;

	init_timer(&q->timer);
	q->timer.function = netem_watchdog;
	q->timer.data = (unsigned long) sch;

	q->qdisc = qdisc_create_dflt(sch->dev, &tfifo_qdisc_ops);
	if (!q->qdisc) {
		pr_debug("netem: qdisc create failed\n");
		return -ENOMEM;
	}

	ret = netem_change(sch, opt);
	if (ret) {
		pr_debug("netem: change failed\n");
		qdisc_destroy(q->qdisc);
	}
	return ret;
}

static void netem_destroy(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	del_timer_sync(&q->timer);
	qdisc_destroy(q->qdisc);
	kfree(q->delay_dist);
}

static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	const struct netem_sched_data *q = qdisc_priv(sch);
	unsigned char *b = skb->tail;
	struct rtattr *rta = (struct rtattr *) b;
	struct tc_netem_qopt qopt;
	struct tc_netem_corr cor;
	struct tc_netem_reorder reorder;
	struct tc_netem_corrupt corrupt;

	qopt.latency = q->latency;
	qopt.jitter = q->jitter;
	qopt.limit = q->limit;
	qopt.loss = q->loss;
	qopt.gap = q->gap;
	qopt.duplicate = q->duplicate;
	RTA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt);

	cor.delay_corr = q->delay_cor.rho;
	cor.loss_corr = q->loss_cor.rho;
	cor.dup_corr = q->dup_cor.rho;
	RTA_PUT(skb, TCA_NETEM_CORR, sizeof(cor), &cor);

	reorder.probability = q->reorder;
	reorder.correlation = q->reorder_cor.rho;
	RTA_PUT(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder);

	corrupt.probability = q->corrupt;
	corrupt.correlation = q->corrupt_cor.rho;
	RTA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt);

	rta->rta_len = skb->tail - b;

	return skb->len;

rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}

static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
			    struct sk_buff *skb, struct tcmsg *tcm)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	if (cl != 1)	/* only one class */
		return -ENOENT;

	tcm->tcm_handle |= TC_H_MIN(1);
	tcm->tcm_info = q->qdisc->handle;

	return 0;
}

static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		       struct Qdisc **old)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	if (new == NULL)
		new = &noop_qdisc;

	sch_tree_lock(sch);
	*old = xchg(&q->qdisc, new);
	qdisc_reset(*old);
	sch->q.qlen = 0;
	sch_tree_unlock(sch);

	return 0;
}

static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	return q->qdisc;
}

static unsigned long netem_get(struct Qdisc *sch, u32 classid)
{
	return 1;
}

static void netem_put(struct Qdisc *sch, unsigned long arg)
{
}

static int netem_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
			      struct rtattr **tca, unsigned long *arg)
{
	return -ENOSYS;
}

static int netem_delete(struct Qdisc *sch, unsigned long arg)
{
	return -ENOSYS;
}

static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	if (!walker->stop) {
		if (walker->count >= walker->skip)
			if (walker->fn(sch, 1, walker) < 0) {
				walker->stop = 1;
				return;
			}
		walker->count++;
	}
}

static struct tcf_proto **netem_find_tcf(struct Qdisc *sch, unsigned long cl)
{
	return NULL;
}

static struct Qdisc_class_ops netem_class_ops = {
	.graft		=	netem_graft,
	.leaf		=	netem_leaf,
	.get		=	netem_get,
	.put		=	netem_put,
	.change		=	netem_change_class,
	.delete		=	netem_delete,
	.walk		=	netem_walk,
	.tcf_chain	=	netem_find_tcf,
	.dump		=	netem_dump_class,
};

static struct Qdisc_ops netem_qdisc_ops = {
	.id		=	"netem",
	.cl_ops		=	&netem_class_ops,
	.priv_size	=	sizeof(struct netem_sched_data),
	.enqueue	=	netem_enqueue,
	.dequeue	=	netem_dequeue,
	.requeue	=	netem_requeue,
	.drop		=	netem_drop,
	.init		=	netem_init,
	.reset		=	netem_reset,
	.destroy	=	netem_destroy,
	.change		=	netem_change,
	.dump		=	netem_dump,
	.owner		=	THIS_MODULE,
};

static int __init netem_module_init(void)
{
	pr_info("netem: version " VERSION "\n");
	return register_qdisc(&netem_qdisc_ops);
}

static void __exit netem_module_exit(void)
{
	unregister_qdisc(&netem_qdisc_ops);
}

module_init(netem_module_init)
module_exit(netem_module_exit)
MODULE_LICENSE("GPL");