ESFQ scheduler for kernel 2.6
[tomato.git] / release / src-rt / linux / linux-2.6 / net / sched / sch_esfq.c
blob71c471f1581e12d0ed0113040de4927b7ef16cf5
1 /*
2 * net/sched/sch_esfq.c Extended Stochastic Fairness Queueing discipline.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
11 * Changes: Alexander Atanasov, <alex@ssi.bg>
12 * Added dynamic depth,limit,divisor,hash_kind options.
13 * Added dst and src hashes.
15 * Alexander Clouter, <alex@digriz.org.uk>
16 * Ported ESFQ to Linux 2.6.
18 * Corey Hickey, <bugfood-c@fatooh.org>
19 * Maintenance of the Linux 2.6 port.
20 * Added fwmark hash (thanks to Robert Kurjata).
21 * Added usage of jhash.
22 * Added conntrack support.
23 * Added ctnatchg hash (thanks to Ben Pfountz).
26 #include <linux/module.h>
27 #include <asm/uaccess.h>
28 #include <asm/system.h>
29 #include <linux/bitops.h>
30 #include <linux/types.h>
31 #include <linux/kernel.h>
32 #include <linux/jiffies.h>
33 #include <linux/string.h>
34 #include <linux/mm.h>
35 #include <linux/socket.h>
36 #include <linux/sockios.h>
37 #include <linux/in.h>
38 #include <linux/errno.h>
39 #include <linux/interrupt.h>
40 #include <linux/if_ether.h>
41 #include <linux/inet.h>
42 #include <linux/netdevice.h>
43 #include <linux/etherdevice.h>
44 #include <linux/notifier.h>
45 #include <linux/init.h>
46 #include <net/ip.h>
47 #include <net/netlink.h>
48 #include <linux/ipv6.h>
49 #include <net/route.h>
50 #include <linux/skbuff.h>
51 #include <net/sock.h>
52 #include <net/pkt_sched.h>
53 #include <linux/jhash.h>
54 #include <net/netfilter/nf_conntrack.h>
56 /* Stochastic Fairness Queuing algorithm.
57 For more comments look at sch_sfq.c.
58 The difference is that you can change limit, depth,
59 hash table size and choose alternate hash types.
61 classic: same as in sch_sfq.c
62 dst: destination IP address
63 src: source IP address
64 fwmark: netfilter mark value
65 ctorigdst: original destination IP address
66 ctorigsrc: original source IP address
67 ctrepldst: reply destination IP address
68 ctreplsrc: reply source IP
/* Queue-end selectors passed to esfq_q_enqueue(): append (normal enqueue)
 * or prepend (requeue of a packet the driver could not send). */
#define ESFQ_HEAD 0
#define ESFQ_TAIL 1
/* This type should contain at least SFQ_DEPTH*2 values */
typedef unsigned int esfq_index;
78 struct esfq_head
80 esfq_index next;
81 esfq_index prev;
84 struct esfq_sched_data
86 /* Parameters */
87 int perturb_period;
88 unsigned quantum; /* Allotment per round: MUST BE >= MTU */
89 int limit;
90 unsigned depth;
91 unsigned hash_divisor;
92 unsigned hash_kind;
93 /* Variables */
94 struct timer_list perturb_timer;
95 int perturbation;
96 esfq_index tail; /* Index of current slot in round */
97 esfq_index max_depth; /* Maximal depth */
99 esfq_index *ht; /* Hash table */
100 esfq_index *next; /* Active slots link */
101 short *allot; /* Current allotment per slot */
102 unsigned short *hash; /* Hash value indexed by slots */
103 struct sk_buff_head *qs; /* Slot queue */
104 struct esfq_head *dep; /* Linked list of slots, indexed by depth */
107 /* This contains the info we will hash. */
108 struct esfq_packet_info
110 u32 proto; /* protocol or port */
111 u32 src; /* source from packet header */
112 u32 dst; /* destination from packet header */
113 u32 ctorigsrc; /* original source from conntrack */
114 u32 ctorigdst; /* original destination from conntrack */
115 u32 ctreplsrc; /* reply source from conntrack */
116 u32 ctrepldst; /* reply destination from conntrack */
117 u32 mark; /* netfilter mark (fwmark) */
120 static __inline__ unsigned esfq_jhash_1word(struct esfq_sched_data *q,u32 a)
122 return jhash_1word(a, q->perturbation) & (q->hash_divisor-1);
125 static __inline__ unsigned esfq_jhash_2words(struct esfq_sched_data *q, u32 a, u32 b)
127 return jhash_2words(a, b, q->perturbation) & (q->hash_divisor-1);
130 static __inline__ unsigned esfq_jhash_3words(struct esfq_sched_data *q, u32 a, u32 b, u32 c)
132 return jhash_3words(a, b, c, q->perturbation) & (q->hash_divisor-1);
135 static unsigned esfq_hash(struct esfq_sched_data *q, struct sk_buff *skb)
137 struct esfq_packet_info info;
138 #ifdef CONFIG_NET_SCH_ESFQ_NFCT
139 enum ip_conntrack_info ctinfo;
140 struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
141 #endif
143 switch (skb->protocol) {
144 case __constant_htons(ETH_P_IP):
146 struct iphdr *iph = ip_hdr(skb);
147 info.dst = iph->daddr;
148 info.src = iph->saddr;
149 if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) &&
150 (iph->protocol == IPPROTO_TCP ||
151 iph->protocol == IPPROTO_UDP ||
152 iph->protocol == IPPROTO_SCTP ||
153 iph->protocol == IPPROTO_DCCP ||
154 iph->protocol == IPPROTO_ESP))
155 info.proto = *(((u32*)iph) + iph->ihl);
156 else
157 info.proto = iph->protocol;
158 break;
160 case __constant_htons(ETH_P_IPV6):
162 struct ipv6hdr *iph = ipv6_hdr(skb);
163 /* Hash ipv6 addresses into a u32. This isn't ideal,
164 * but the code is simple. */
165 info.dst = jhash2(iph->daddr.s6_addr32, 4, q->perturbation);
166 info.src = jhash2(iph->saddr.s6_addr32, 4, q->perturbation);
167 if (iph->nexthdr == IPPROTO_TCP ||
168 iph->nexthdr == IPPROTO_UDP ||
169 iph->nexthdr == IPPROTO_SCTP ||
170 iph->nexthdr == IPPROTO_DCCP ||
171 iph->nexthdr == IPPROTO_ESP)
172 info.proto = *(u32*)&iph[1];
173 else
174 info.proto = iph->nexthdr;
175 break;
177 default:
178 info.dst = (u32)(unsigned long)skb->dst;
179 info.src = (u32)(unsigned long)skb->sk;
180 info.proto = skb->protocol;
183 info.mark = skb->mark;
185 #ifdef CONFIG_NET_SCH_ESFQ_NFCT
186 /* defaults if there is no conntrack info */
187 info.ctorigsrc = info.src;
188 info.ctorigdst = info.dst;
189 info.ctreplsrc = info.dst;
190 info.ctrepldst = info.src;
191 /* collect conntrack info */
192 if (ct && ct != &nf_conntrack_untracked) {
193 if (skb->protocol == __constant_htons(ETH_P_IP)) {
194 info.ctorigsrc = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip;
195 info.ctorigdst = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.ip;
196 info.ctreplsrc = ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip;
197 info.ctrepldst = ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip;
199 else if (skb->protocol == __constant_htons(ETH_P_IPV6)) {
200 /* Again, hash ipv6 addresses into a single u32. */
201 info.ctorigsrc = jhash2(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip6, 4, q->perturbation);
202 info.ctorigdst = jhash2(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.ip6, 4, q->perturbation);
203 info.ctreplsrc = jhash2(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip6, 4, q->perturbation);
204 info.ctrepldst = jhash2(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip6, 4, q->perturbation);
208 #endif
210 switch(q->hash_kind) {
211 case TCA_SFQ_HASH_CLASSIC:
212 return esfq_jhash_3words(q, info.dst, info.src, info.proto);
213 case TCA_SFQ_HASH_DST:
214 return esfq_jhash_1word(q, info.dst);
215 case TCA_SFQ_HASH_SRC:
216 return esfq_jhash_1word(q, info.src);
217 case TCA_SFQ_HASH_FWMARK:
218 return esfq_jhash_1word(q, info.mark);
219 #ifdef CONFIG_NET_SCH_ESFQ_NFCT
220 case TCA_SFQ_HASH_CTORIGDST:
221 return esfq_jhash_1word(q, info.ctorigdst);
222 case TCA_SFQ_HASH_CTORIGSRC:
223 return esfq_jhash_1word(q, info.ctorigsrc);
224 case TCA_SFQ_HASH_CTREPLDST:
225 return esfq_jhash_1word(q, info.ctrepldst);
226 case TCA_SFQ_HASH_CTREPLSRC:
227 return esfq_jhash_1word(q, info.ctreplsrc);
228 case TCA_SFQ_HASH_CTNATCHG:
230 if (info.ctorigdst == info.ctreplsrc)
231 return esfq_jhash_1word(q, info.ctorigsrc);
232 return esfq_jhash_1word(q, info.ctreplsrc);
234 #endif
235 default:
236 if (net_ratelimit())
237 printk(KERN_WARNING "ESFQ: Unknown hash method. Falling back to classic.\n");
239 return esfq_jhash_3words(q, info.dst, info.src, info.proto);
242 static inline void esfq_link(struct esfq_sched_data *q, esfq_index x)
244 esfq_index p, n;
245 int d = q->qs[x].qlen + q->depth;
247 p = d;
248 n = q->dep[d].next;
249 q->dep[x].next = n;
250 q->dep[x].prev = p;
251 q->dep[p].next = q->dep[n].prev = x;
254 static inline void esfq_dec(struct esfq_sched_data *q, esfq_index x)
256 esfq_index p, n;
258 n = q->dep[x].next;
259 p = q->dep[x].prev;
260 q->dep[p].next = n;
261 q->dep[n].prev = p;
263 if (n == p && q->max_depth == q->qs[x].qlen + 1)
264 q->max_depth--;
266 esfq_link(q, x);
269 static inline void esfq_inc(struct esfq_sched_data *q, esfq_index x)
271 esfq_index p, n;
272 int d;
274 n = q->dep[x].next;
275 p = q->dep[x].prev;
276 q->dep[p].next = n;
277 q->dep[n].prev = p;
278 d = q->qs[x].qlen;
279 if (q->max_depth < d)
280 q->max_depth = d;
282 esfq_link(q, x);
285 static unsigned int esfq_drop(struct Qdisc *sch)
287 struct esfq_sched_data *q = qdisc_priv(sch);
288 esfq_index d = q->max_depth;
289 struct sk_buff *skb;
290 unsigned int len;
292 /* Queue is full! Find the longest slot and
293 drop a packet from it */
295 if (d > 1) {
296 esfq_index x = q->dep[d+q->depth].next;
297 skb = q->qs[x].prev;
298 len = skb->len;
299 __skb_unlink(skb, &q->qs[x]);
300 kfree_skb(skb);
301 esfq_dec(q, x);
302 sch->q.qlen--;
303 sch->qstats.drops++;
304 sch->qstats.backlog -= len;
305 return len;
308 if (d == 1) {
309 /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
310 d = q->next[q->tail];
311 q->next[q->tail] = q->next[d];
312 q->allot[q->next[d]] += q->quantum;
313 skb = q->qs[d].prev;
314 len = skb->len;
315 __skb_unlink(skb, &q->qs[d]);
316 kfree_skb(skb);
317 esfq_dec(q, d);
318 sch->q.qlen--;
319 q->ht[q->hash[d]] = q->depth;
320 sch->qstats.drops++;
321 sch->qstats.backlog -= len;
322 return len;
325 return 0;
328 static void esfq_q_enqueue(struct sk_buff *skb, struct esfq_sched_data *q, unsigned int end)
330 unsigned hash = esfq_hash(q, skb);
331 unsigned depth = q->depth;
332 esfq_index x;
334 x = q->ht[hash];
335 if (x == depth) {
336 q->ht[hash] = x = q->dep[depth].next;
337 q->hash[x] = hash;
340 if (end == ESFQ_TAIL)
341 __skb_queue_tail(&q->qs[x], skb);
342 else
343 __skb_queue_head(&q->qs[x], skb);
345 esfq_inc(q, x);
346 if (q->qs[x].qlen == 1) { /* The flow is new */
347 if (q->tail == depth) { /* It is the first flow */
348 q->tail = x;
349 q->next[x] = x;
350 q->allot[x] = q->quantum;
351 } else {
352 q->next[x] = q->next[q->tail];
353 q->next[q->tail] = x;
354 q->tail = x;
359 static int esfq_enqueue(struct sk_buff *skb, struct Qdisc* sch)
361 struct esfq_sched_data *q = qdisc_priv(sch);
362 esfq_q_enqueue(skb, q, ESFQ_TAIL);
363 sch->qstats.backlog += skb->len;
364 if (++sch->q.qlen < q->limit-1) {
365 sch->bstats.bytes += skb->len;
366 sch->bstats.packets++;
367 return 0;
370 sch->qstats.drops++;
371 esfq_drop(sch);
372 return NET_XMIT_CN;
376 static int esfq_requeue(struct sk_buff *skb, struct Qdisc* sch)
378 struct esfq_sched_data *q = qdisc_priv(sch);
379 esfq_q_enqueue(skb, q, ESFQ_HEAD);
380 sch->qstats.backlog += skb->len;
381 if (++sch->q.qlen < q->limit - 1) {
382 sch->qstats.requeues++;
383 return 0;
386 sch->qstats.drops++;
387 esfq_drop(sch);
388 return NET_XMIT_CN;
391 static struct sk_buff *esfq_q_dequeue(struct esfq_sched_data *q)
393 struct sk_buff *skb;
394 unsigned depth = q->depth;
395 esfq_index a, old_a;
397 /* No active slots */
398 if (q->tail == depth)
399 return NULL;
401 a = old_a = q->next[q->tail];
403 /* Grab packet */
404 skb = __skb_dequeue(&q->qs[a]);
405 esfq_dec(q, a);
407 /* Is the slot empty? */
408 if (q->qs[a].qlen == 0) {
409 q->ht[q->hash[a]] = depth;
410 a = q->next[a];
411 if (a == old_a) {
412 q->tail = depth;
413 return skb;
415 q->next[q->tail] = a;
416 q->allot[a] += q->quantum;
417 } else if ((q->allot[a] -= skb->len) <= 0) {
418 q->tail = a;
419 a = q->next[a];
420 q->allot[a] += q->quantum;
423 return skb;
426 static struct sk_buff *esfq_dequeue(struct Qdisc* sch)
428 struct esfq_sched_data *q = qdisc_priv(sch);
429 struct sk_buff *skb;
431 skb = esfq_q_dequeue(q);
432 if (skb == NULL)
433 return NULL;
434 sch->q.qlen--;
435 sch->qstats.backlog -= skb->len;
436 return skb;
439 static void esfq_q_destroy(struct esfq_sched_data *q)
441 del_timer(&q->perturb_timer);
442 if(q->ht)
443 kfree(q->ht);
444 if(q->dep)
445 kfree(q->dep);
446 if(q->next)
447 kfree(q->next);
448 if(q->allot)
449 kfree(q->allot);
450 if(q->hash)
451 kfree(q->hash);
452 if(q->qs)
453 kfree(q->qs);
/* Qdisc destroy entry point: tear down the private state. */
static void esfq_destroy(struct Qdisc *sch)
{
	esfq_q_destroy(qdisc_priv(sch));
}
463 static void esfq_reset(struct Qdisc* sch)
465 struct sk_buff *skb;
467 while ((skb = esfq_dequeue(sch)) != NULL)
468 kfree_skb(skb);
471 static void esfq_perturbation(unsigned long arg)
473 struct Qdisc *sch = (struct Qdisc*)arg;
474 struct esfq_sched_data *q = qdisc_priv(sch);
476 q->perturbation = net_random()&0x1F;
478 if (q->perturb_period) {
479 q->perturb_timer.expires = jiffies + q->perturb_period;
480 add_timer(&q->perturb_timer);
484 static unsigned int esfq_check_hash(unsigned int kind)
486 switch (kind) {
487 case TCA_SFQ_HASH_CTORIGDST:
488 case TCA_SFQ_HASH_CTORIGSRC:
489 case TCA_SFQ_HASH_CTREPLDST:
490 case TCA_SFQ_HASH_CTREPLSRC:
491 case TCA_SFQ_HASH_CTNATCHG:
492 #ifndef CONFIG_NET_SCH_ESFQ_NFCT
494 if (net_ratelimit())
495 printk(KERN_WARNING "ESFQ: Conntrack hash types disabled in kernel config. Falling back to classic.\n");
496 return TCA_SFQ_HASH_CLASSIC;
498 #endif
499 case TCA_SFQ_HASH_CLASSIC:
500 case TCA_SFQ_HASH_DST:
501 case TCA_SFQ_HASH_SRC:
502 case TCA_SFQ_HASH_FWMARK:
503 return kind;
504 default:
506 if (net_ratelimit())
507 printk(KERN_WARNING "ESFQ: Unknown hash type. Falling back to classic.\n");
508 return TCA_SFQ_HASH_CLASSIC;
513 static int esfq_q_init(struct esfq_sched_data *q, struct rtattr *opt)
515 struct tc_esfq_qopt *ctl = RTA_DATA(opt);
516 esfq_index p = ~0U/2;
517 int i;
519 if (opt && opt->rta_len < RTA_LENGTH(sizeof(*ctl)))
520 return -EINVAL;
522 q->perturbation = 0;
523 q->hash_kind = TCA_SFQ_HASH_CLASSIC;
524 q->max_depth = 0;
525 if (opt == NULL) {
526 q->perturb_period = 0;
527 q->hash_divisor = 1024;
528 q->tail = q->limit = q->depth = 128;
530 } else {
531 struct tc_esfq_qopt *ctl = RTA_DATA(opt);
532 if (ctl->quantum)
533 q->quantum = ctl->quantum;
534 q->perturb_period = ctl->perturb_period*HZ;
535 q->hash_divisor = ctl->divisor ? : 1024;
536 q->tail = q->limit = q->depth = ctl->flows ? : 128;
538 if ( q->depth > p - 1 )
539 return -EINVAL;
541 if (ctl->limit)
542 q->limit = min_t(u32, ctl->limit, q->depth);
544 if (ctl->hash_kind) {
545 q->hash_kind = esfq_check_hash(ctl->hash_kind);
549 q->ht = kmalloc(q->hash_divisor*sizeof(esfq_index), GFP_KERNEL);
550 if (!q->ht)
551 goto err_case;
552 q->dep = kmalloc((1+q->depth*2)*sizeof(struct esfq_head), GFP_KERNEL);
553 if (!q->dep)
554 goto err_case;
555 q->next = kmalloc(q->depth*sizeof(esfq_index), GFP_KERNEL);
556 if (!q->next)
557 goto err_case;
558 q->allot = kmalloc(q->depth*sizeof(short), GFP_KERNEL);
559 if (!q->allot)
560 goto err_case;
561 q->hash = kmalloc(q->depth*sizeof(unsigned short), GFP_KERNEL);
562 if (!q->hash)
563 goto err_case;
564 q->qs = kmalloc(q->depth*sizeof(struct sk_buff_head), GFP_KERNEL);
565 if (!q->qs)
566 goto err_case;
568 for (i=0; i< q->hash_divisor; i++)
569 q->ht[i] = q->depth;
570 for (i=0; i<q->depth; i++) {
571 skb_queue_head_init(&q->qs[i]);
572 q->dep[i+q->depth].next = i+q->depth;
573 q->dep[i+q->depth].prev = i+q->depth;
576 for (i=0; i<q->depth; i++)
577 esfq_link(q, i);
578 return 0;
579 err_case:
580 esfq_q_destroy(q);
581 return -ENOBUFS;
584 static int esfq_init(struct Qdisc *sch, struct rtattr *opt)
586 struct esfq_sched_data *q = qdisc_priv(sch);
587 int err;
589 q->quantum = psched_mtu(sch->dev); /* default */
590 if ((err = esfq_q_init(q, opt)))
591 return err;
593 init_timer(&q->perturb_timer);
594 q->perturb_timer.data = (unsigned long)sch;
595 q->perturb_timer.function = esfq_perturbation;
596 if (q->perturb_period) {
597 q->perturb_timer.expires = jiffies + q->perturb_period;
598 add_timer(&q->perturb_timer);
601 return 0;
604 static int esfq_change(struct Qdisc *sch, struct rtattr *opt)
606 struct esfq_sched_data *q = qdisc_priv(sch);
607 struct esfq_sched_data new;
608 struct sk_buff *skb;
609 int err;
611 /* set up new queue */
612 memset(&new, 0, sizeof(struct esfq_sched_data));
613 new.quantum = psched_mtu(sch->dev); /* default */
614 if ((err = esfq_q_init(&new, opt)))
615 return err;
617 /* copy all packets from the old queue to the new queue */
618 sch_tree_lock(sch);
619 while ((skb = esfq_q_dequeue(q)) != NULL)
620 esfq_q_enqueue(skb, &new, ESFQ_TAIL);
622 /* clean up the old queue */
623 esfq_q_destroy(q);
625 /* copy elements of the new queue into the old queue */
626 q->perturb_period = new.perturb_period;
627 q->quantum = new.quantum;
628 q->limit = new.limit;
629 q->depth = new.depth;
630 q->hash_divisor = new.hash_divisor;
631 q->hash_kind = new.hash_kind;
632 q->tail = new.tail;
633 q->max_depth = new.max_depth;
634 q->ht = new.ht;
635 q->dep = new.dep;
636 q->next = new.next;
637 q->allot = new.allot;
638 q->hash = new.hash;
639 q->qs = new.qs;
641 /* finish up */
642 if (q->perturb_period) {
643 q->perturb_timer.expires = jiffies + q->perturb_period;
644 add_timer(&q->perturb_timer);
645 } else {
646 q->perturbation = 0;
648 sch_tree_unlock(sch);
649 return 0;
652 static int esfq_dump(struct Qdisc *sch, struct sk_buff *skb)
654 struct esfq_sched_data *q = qdisc_priv(sch);
655 unsigned char *b = skb_tail_pointer(skb);
656 struct tc_esfq_qopt opt;
658 opt.quantum = q->quantum;
659 opt.perturb_period = q->perturb_period/HZ;
661 opt.limit = q->limit;
662 opt.divisor = q->hash_divisor;
663 opt.flows = q->depth;
664 opt.hash_kind = q->hash_kind;
666 RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
668 return skb->len;
670 rtattr_failure:
671 nlmsg_trim(skb, b);
672 return -1;
675 static struct Qdisc_ops esfq_qdisc_ops =
677 .next = NULL,
678 .cl_ops = NULL,
679 .id = "esfq",
680 .priv_size = sizeof(struct esfq_sched_data),
681 .enqueue = esfq_enqueue,
682 .dequeue = esfq_dequeue,
683 .requeue = esfq_requeue,
684 .drop = esfq_drop,
685 .init = esfq_init,
686 .reset = esfq_reset,
687 .destroy = esfq_destroy,
688 .change = esfq_change,
689 .dump = esfq_dump,
690 .owner = THIS_MODULE,
693 static int __init esfq_module_init(void)
695 return register_qdisc(&esfq_qdisc_ops);
697 static void __exit esfq_module_exit(void)
699 unregister_qdisc(&esfq_qdisc_ops);
/* Module bookkeeping: entry/exit hooks and license declaration. */
module_init(esfq_module_init)
module_exit(esfq_module_exit)
MODULE_LICENSE("GPL");