2 * net/sched/sch_esfq.c Extended Stochastic Fairness Queueing discipline.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
11 * Changes: Alexander Atanasov, <alex@ssi.bg>
12 * Added dynamic depth,limit,divisor,hash_kind options.
13 * Added dst and src hashes.
15 * Alexander Clouter, <alex@digriz.org.uk>
16 * Ported ESFQ to Linux 2.6.
18 * Corey Hickey, <bugfood-c@fatooh.org>
19 * Maintenance of the Linux 2.6 port.
20 * Added fwmark hash (thanks to Robert Kurjata).
21 * Added usage of jhash.
22 * Added conntrack support.
23 * Added ctnatchg hash (thanks to Ben Pfountz).
26 #include <linux/module.h>
27 #include <asm/uaccess.h>
28 #include <asm/system.h>
29 #include <linux/bitops.h>
30 #include <linux/types.h>
31 #include <linux/kernel.h>
32 #include <linux/jiffies.h>
33 #include <linux/string.h>
35 #include <linux/socket.h>
36 #include <linux/sockios.h>
38 #include <linux/errno.h>
39 #include <linux/interrupt.h>
40 #include <linux/if_ether.h>
41 #include <linux/inet.h>
42 #include <linux/netdevice.h>
43 #include <linux/etherdevice.h>
44 #include <linux/notifier.h>
45 #include <linux/init.h>
47 #include <net/netlink.h>
48 #include <linux/ipv6.h>
49 #include <net/route.h>
50 #include <linux/skbuff.h>
52 #include <net/pkt_sched.h>
53 #include <linux/jhash.h>
54 #include <net/netfilter/nf_conntrack.h>
56 /* Stochastic Fairness Queuing algorithm.
57 For more comments look at sch_sfq.c.
58 The difference is that you can change limit, depth,
59 hash table size and choose alternate hash types.
61 classic: same as in sch_sfq.c
62 dst: destination IP address
63 src: source IP address
64 fwmark: netfilter mark value
65 ctorigdst: original destination IP address
66 ctorigsrc: original source IP address
67 ctrepldst: reply destination IP address
68 ctreplsrc: reply source IP address
75 /* This type should contain at least SFQ_DEPTH*2 values */
/* Index type used for slots, hash buckets and the dep[] depth-list links. */
76 typedef unsigned int esfq_index
;
/*
 * Per-qdisc private state for ESFQ.
 * NOTE(review): this extraction is missing several members that the rest of
 * the file clearly reads and writes (at least perturbation, perturb_period,
 * depth, limit and hash_kind) -- confirm against the complete source before
 * relying on this layout.
 */
84 struct esfq_sched_data
88 unsigned quantum
; /* Allotment per round: MUST BE >= MTU */
91 unsigned hash_divisor
;
94 struct timer_list perturb_timer
;
96 esfq_index tail
; /* Index of current slot in round */
97 esfq_index max_depth
; /* Maximal depth */
/* The following arrays are kmalloc'd in esfq_q_init(), sized by
 * hash_divisor or depth. */
99 esfq_index
*ht
; /* Hash table */
100 esfq_index
*next
; /* Active slots link */
101 short *allot
; /* Current allotment per slot */
102 unsigned short *hash
; /* Hash value indexed by slots */
103 struct sk_buff_head
*qs
; /* Slot queue */
104 struct esfq_head
*dep
; /* Linked list of slots, indexed by depth */
107 /* This contains the info we will hash. */
108 struct esfq_packet_info
110 u32 proto
; /* protocol or port */
111 u32 src
; /* source from packet header */
112 u32 dst
; /* destination from packet header */
113 u32 ctorigsrc
; /* original source from conntrack */
114 u32 ctorigdst
; /* original destination from conntrack */
115 u32 ctreplsrc
; /* reply source from conntrack */
116 u32 ctrepldst
; /* reply destination from conntrack */
117 u32 mark
; /* netfilter mark (fwmark) */
120 static __inline__
unsigned esfq_jhash_1word(struct esfq_sched_data
*q
,u32 a
)
122 return jhash_1word(a
, q
->perturbation
) & (q
->hash_divisor
-1);
125 static __inline__
unsigned esfq_jhash_2words(struct esfq_sched_data
*q
, u32 a
, u32 b
)
127 return jhash_2words(a
, b
, q
->perturbation
) & (q
->hash_divisor
-1);
130 static __inline__
unsigned esfq_jhash_3words(struct esfq_sched_data
*q
, u32 a
, u32 b
, u32 c
)
132 return jhash_3words(a
, b
, c
, q
->perturbation
) & (q
->hash_divisor
-1);
135 static unsigned esfq_hash(struct esfq_sched_data
*q
, struct sk_buff
*skb
)
137 struct esfq_packet_info info
;
138 #ifdef CONFIG_NET_SCH_ESFQ_NFCT
139 enum ip_conntrack_info ctinfo
;
140 struct nf_conn
*ct
= nf_ct_get(skb
, &ctinfo
);
143 switch (skb
->protocol
) {
144 case __constant_htons(ETH_P_IP
):
146 struct iphdr
*iph
= ip_hdr(skb
);
147 info
.dst
= iph
->daddr
;
148 info
.src
= iph
->saddr
;
149 if (!(iph
->frag_off
&htons(IP_MF
|IP_OFFSET
)) &&
150 (iph
->protocol
== IPPROTO_TCP
||
151 iph
->protocol
== IPPROTO_UDP
||
152 iph
->protocol
== IPPROTO_SCTP
||
153 iph
->protocol
== IPPROTO_DCCP
||
154 iph
->protocol
== IPPROTO_ESP
))
155 info
.proto
= *(((u32
*)iph
) + iph
->ihl
);
157 info
.proto
= iph
->protocol
;
160 case __constant_htons(ETH_P_IPV6
):
162 struct ipv6hdr
*iph
= ipv6_hdr(skb
);
163 /* Hash ipv6 addresses into a u32. This isn't ideal,
164 * but the code is simple. */
165 info
.dst
= jhash2(iph
->daddr
.s6_addr32
, 4, q
->perturbation
);
166 info
.src
= jhash2(iph
->saddr
.s6_addr32
, 4, q
->perturbation
);
167 if (iph
->nexthdr
== IPPROTO_TCP
||
168 iph
->nexthdr
== IPPROTO_UDP
||
169 iph
->nexthdr
== IPPROTO_SCTP
||
170 iph
->nexthdr
== IPPROTO_DCCP
||
171 iph
->nexthdr
== IPPROTO_ESP
)
172 info
.proto
= *(u32
*)&iph
[1];
174 info
.proto
= iph
->nexthdr
;
178 info
.dst
= (u32
)(unsigned long)skb
->dst
;
179 info
.src
= (u32
)(unsigned long)skb
->sk
;
180 info
.proto
= skb
->protocol
;
183 info
.mark
= skb
->mark
;
185 #ifdef CONFIG_NET_SCH_ESFQ_NFCT
186 /* defaults if there is no conntrack info */
187 info
.ctorigsrc
= info
.src
;
188 info
.ctorigdst
= info
.dst
;
189 info
.ctreplsrc
= info
.dst
;
190 info
.ctrepldst
= info
.src
;
191 /* collect conntrack info */
192 if (ct
&& ct
!= &nf_conntrack_untracked
) {
193 if (skb
->protocol
== __constant_htons(ETH_P_IP
)) {
194 info
.ctorigsrc
= ct
->tuplehash
[IP_CT_DIR_ORIGINAL
].tuple
.src
.u3
.ip
;
195 info
.ctorigdst
= ct
->tuplehash
[IP_CT_DIR_ORIGINAL
].tuple
.dst
.u3
.ip
;
196 info
.ctreplsrc
= ct
->tuplehash
[IP_CT_DIR_REPLY
].tuple
.src
.u3
.ip
;
197 info
.ctrepldst
= ct
->tuplehash
[IP_CT_DIR_REPLY
].tuple
.dst
.u3
.ip
;
199 else if (skb
->protocol
== __constant_htons(ETH_P_IPV6
)) {
200 /* Again, hash ipv6 addresses into a single u32. */
201 info
.ctorigsrc
= jhash2(ct
->tuplehash
[IP_CT_DIR_ORIGINAL
].tuple
.src
.u3
.ip6
, 4, q
->perturbation
);
202 info
.ctorigdst
= jhash2(ct
->tuplehash
[IP_CT_DIR_ORIGINAL
].tuple
.dst
.u3
.ip6
, 4, q
->perturbation
);
203 info
.ctreplsrc
= jhash2(ct
->tuplehash
[IP_CT_DIR_REPLY
].tuple
.src
.u3
.ip6
, 4, q
->perturbation
);
204 info
.ctrepldst
= jhash2(ct
->tuplehash
[IP_CT_DIR_REPLY
].tuple
.dst
.u3
.ip6
, 4, q
->perturbation
);
210 switch(q
->hash_kind
) {
211 case TCA_SFQ_HASH_CLASSIC
:
212 return esfq_jhash_3words(q
, info
.dst
, info
.src
, info
.proto
);
213 case TCA_SFQ_HASH_DST
:
214 return esfq_jhash_1word(q
, info
.dst
);
215 case TCA_SFQ_HASH_SRC
:
216 return esfq_jhash_1word(q
, info
.src
);
217 case TCA_SFQ_HASH_FWMARK
:
218 return esfq_jhash_1word(q
, info
.mark
);
219 #ifdef CONFIG_NET_SCH_ESFQ_NFCT
220 case TCA_SFQ_HASH_CTORIGDST
:
221 return esfq_jhash_1word(q
, info
.ctorigdst
);
222 case TCA_SFQ_HASH_CTORIGSRC
:
223 return esfq_jhash_1word(q
, info
.ctorigsrc
);
224 case TCA_SFQ_HASH_CTREPLDST
:
225 return esfq_jhash_1word(q
, info
.ctrepldst
);
226 case TCA_SFQ_HASH_CTREPLSRC
:
227 return esfq_jhash_1word(q
, info
.ctreplsrc
);
228 case TCA_SFQ_HASH_CTNATCHG
:
230 if (info
.ctorigdst
== info
.ctreplsrc
)
231 return esfq_jhash_1word(q
, info
.ctorigsrc
);
232 return esfq_jhash_1word(q
, info
.ctreplsrc
);
237 printk(KERN_WARNING
"ESFQ: Unknown hash method. Falling back to classic.\n");
239 return esfq_jhash_3words(q
, info
.dst
, info
.src
, info
.proto
);
242 static inline void esfq_link(struct esfq_sched_data
*q
, esfq_index x
)
245 int d
= q
->qs
[x
].qlen
+ q
->depth
;
251 q
->dep
[p
].next
= q
->dep
[n
].prev
= x
;
/*
 * Bookkeeping after slot x's queue shrank by one packet.
 * NOTE(review): orig. lines 255-262 and 264-268 (presumably the depth-list
 * unlink and the esfq_link() re-insertion, as in sch_sfq.c) are missing from
 * this extraction.
 */
254 static inline void esfq_dec(struct esfq_sched_data
*q
, esfq_index x
)
/* If x was the only slot at the old maximum depth, max_depth must drop. */
263 if (n
== p
&& q
->max_depth
== q
->qs
[x
].qlen
+ 1)
/*
 * Bookkeeping after slot x's queue grew by one packet; raises q->max_depth
 * when the slot's new depth d exceeds it.
 * NOTE(review): most of the body (orig. lines 270-278, 280-283) is missing
 * from this extraction.
 */
269 static inline void esfq_inc(struct esfq_sched_data
*q
, esfq_index x
)
279 if (q
->max_depth
< d
)
/*
 * Drop one packet to relieve pressure: prefer the deepest (longest) slot;
 * if every active slot holds a single packet, drop from the slot at the
 * round-robin tail instead.  Adjusts backlog accounting either way.
 * NOTE(review): many interior lines (returns, esfq_dec calls, len
 * computation, braces) are missing from this extraction.
 */
285 static unsigned int esfq_drop(struct Qdisc
*sch
)
287 struct esfq_sched_data
*q
= qdisc_priv(sch
);
288 esfq_index d
= q
->max_depth
;
292 /* Queue is full! Find the longest slot and
293 drop a packet from it */
/* dep[d + depth] heads the list of slots whose queue length is d. */
296 esfq_index x
= q
->dep
[d
+q
->depth
].next
;
299 __skb_unlink(skb
, &q
->qs
[x
]);
304 sch
->qstats
.backlog
-= len
;
309 /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
/* Unlink the tail's successor from the active ring and recycle its
 * allotment to the next slot in the round. */
310 d
= q
->next
[q
->tail
];
311 q
->next
[q
->tail
] = q
->next
[d
];
312 q
->allot
[q
->next
[d
]] += q
->quantum
;
315 __skb_unlink(skb
, &q
->qs
[d
]);
/* Point the hash bucket at the sentinel (depth == "empty"). */
319 q
->ht
[q
->hash
[d
]] = q
->depth
;
321 sch
->qstats
.backlog
-= len
;
/*
 * Core enqueue: hash the skb to a bucket, allocate/lookup its slot, and
 * queue the skb at the tail (normal enqueue) or head (requeue) depending on
 * 'end' (ESFQ_TAIL / ESFQ_HEAD).  A slot whose qlen becomes 1 is a new flow
 * and is spliced into the active round-robin ring with a fresh quantum.
 * NOTE(review): several interior lines (slot lookup, esfq_inc call, braces)
 * are missing from this extraction.
 */
328 static void esfq_q_enqueue(struct sk_buff
*skb
, struct esfq_sched_data
*q
, unsigned int end
)
330 unsigned hash
= esfq_hash(q
, skb
);
331 unsigned depth
= q
->depth
;
/* Take a free slot from the empty list and bind the bucket to it. */
336 q
->ht
[hash
] = x
= q
->dep
[depth
].next
;
340 if (end
== ESFQ_TAIL
)
341 __skb_queue_tail(&q
->qs
[x
], skb
);
343 __skb_queue_head(&q
->qs
[x
], skb
);
346 if (q
->qs
[x
].qlen
== 1) { /* The flow is new */
347 if (q
->tail
== depth
) { /* It is the first flow */
350 q
->allot
[x
] = q
->quantum
;
/* Splice the new slot into the round-robin ring after the tail. */
352 q
->next
[x
] = q
->next
[q
->tail
];
353 q
->next
[q
->tail
] = x
;
/*
 * Qdisc enqueue entry point: always queues at the tail, then updates byte/
 * packet stats while under the limit.
 * NOTE(review): the over-limit path (orig. lines 367-374 -- presumably
 * drops/congestion return, as in sch_sfq.c) is missing from this
 * extraction.  Note the guard is "< q->limit-1", not "< q->limit".
 */
359 static int esfq_enqueue(struct sk_buff
*skb
, struct Qdisc
* sch
)
361 struct esfq_sched_data
*q
= qdisc_priv(sch
);
362 esfq_q_enqueue(skb
, q
, ESFQ_TAIL
);
363 sch
->qstats
.backlog
+= skb
->len
;
364 if (++sch
->q
.qlen
< q
->limit
-1) {
365 sch
->bstats
.bytes
+= skb
->len
;
366 sch
->bstats
.packets
++;
/*
 * Qdisc requeue entry point: like esfq_enqueue() but puts the skb back at
 * the HEAD of its slot and bumps the requeues counter instead of bstats.
 * NOTE(review): the over-limit tail of the function (orig. lines 383-389)
 * is missing from this extraction.
 */
376 static int esfq_requeue(struct sk_buff
*skb
, struct Qdisc
* sch
)
378 struct esfq_sched_data
*q
= qdisc_priv(sch
);
379 esfq_q_enqueue(skb
, q
, ESFQ_HEAD
);
380 sch
->qstats
.backlog
+= skb
->len
;
381 if (++sch
->q
.qlen
< q
->limit
- 1) {
382 sch
->qstats
.requeues
++;
/*
 * Core dequeue: take the next packet from the slot at the front of the
 * round-robin ring.  An emptied slot is unbound from its hash bucket and
 * removed from the ring; a slot that exhausted its byte allotment is
 * rotated to the back with a fresh quantum (deficit round robin).
 * NOTE(review): several interior lines (declarations, the empty-ring
 * handling between orig. 409 and 415, the final return) are missing from
 * this extraction.
 */
391 static struct sk_buff
*esfq_q_dequeue(struct esfq_sched_data
*q
)
394 unsigned depth
= q
->depth
;
397 /* No active slots */
398 if (q
->tail
== depth
)
401 a
= old_a
= q
->next
[q
->tail
];
404 skb
= __skb_dequeue(&q
->qs
[a
]);
407 /* Is the slot empty? */
408 if (q
->qs
[a
].qlen
== 0) {
/* Unbind the hash bucket (depth acts as the "empty" sentinel). */
409 q
->ht
[q
->hash
[a
]] = depth
;
415 q
->next
[q
->tail
] = a
;
416 q
->allot
[a
] += q
->quantum
;
/* Allotment exhausted: advance the round and top the slot back up. */
417 } else if ((q
->allot
[a
] -= skb
->len
) <= 0) {
420 q
->allot
[a
] += q
->quantum
;
/*
 * Qdisc dequeue entry point: thin wrapper over esfq_q_dequeue() that keeps
 * sch->q.qlen / backlog accounting in sync.
 * NOTE(review): the NULL check and qlen decrement lines (orig. 432-434,
 * 436-437) are missing from this extraction.
 */
426 static struct sk_buff
*esfq_dequeue(struct Qdisc
* sch
)
428 struct esfq_sched_data
*q
= qdisc_priv(sch
);
431 skb
= esfq_q_dequeue(q
);
435 sch
->qstats
.backlog
-= skb
->len
;
/*
 * Tear down one esfq_sched_data: stop the perturbation timer first so it
 * cannot fire into freed state.
 * NOTE(review): the rest of the body (orig. lines 442-455 -- presumably the
 * kfree()s of ht/dep/next/allot/hash/qs) is missing from this extraction.
 */
439 static void esfq_q_destroy(struct esfq_sched_data
*q
)
441 del_timer(&q
->perturb_timer
);
/*
 * Qdisc destroy entry point.
 * NOTE(review): the body after fetching q (orig. lines 459-461 -- presumably
 * a call into esfq_q_destroy()) is missing from this extraction.
 */
456 static void esfq_destroy(struct Qdisc
*sch
)
458 struct esfq_sched_data
*q
= qdisc_priv(sch
);
/*
 * Qdisc reset: drain every queued packet through the normal dequeue path.
 * NOTE(review): the loop body (orig. line 468 -- presumably kfree_skb(skb))
 * and the skb declaration are missing from this extraction.
 */
463 static void esfq_reset(struct Qdisc
* sch
)
467 while ((skb
= esfq_dequeue(sch
)) != NULL
)
471 static void esfq_perturbation(unsigned long arg
)
473 struct Qdisc
*sch
= (struct Qdisc
*)arg
;
474 struct esfq_sched_data
*q
= qdisc_priv(sch
);
476 q
->perturbation
= net_random()&0x1F;
478 if (q
->perturb_period
) {
479 q
->perturb_timer
.expires
= jiffies
+ q
->perturb_period
;
480 add_timer(&q
->perturb_timer
);
/*
 * Validate a user-requested hash kind.  Conntrack-based kinds fall back to
 * classic (with a warning) when CONFIG_NET_SCH_ESFQ_NFCT is not set, as do
 * unrecognised values; known non-conntrack kinds pass through.
 * NOTE(review): the switch braces, the #endif, the "return kind;" for the
 * valid cases and the default label (orig. lines 485-486, 493-494, 497-498,
 * 503-506) are missing from this extraction.
 */
484 static unsigned int esfq_check_hash(unsigned int kind
)
487 case TCA_SFQ_HASH_CTORIGDST
:
488 case TCA_SFQ_HASH_CTORIGSRC
:
489 case TCA_SFQ_HASH_CTREPLDST
:
490 case TCA_SFQ_HASH_CTREPLSRC
:
491 case TCA_SFQ_HASH_CTNATCHG
:
492 #ifndef CONFIG_NET_SCH_ESFQ_NFCT
495 printk(KERN_WARNING
"ESFQ: Conntrack hash types disabled in kernel config. Falling back to classic.\n");
496 return TCA_SFQ_HASH_CLASSIC
;
499 case TCA_SFQ_HASH_CLASSIC
:
500 case TCA_SFQ_HASH_DST
:
501 case TCA_SFQ_HASH_SRC
:
502 case TCA_SFQ_HASH_FWMARK
:
507 printk(KERN_WARNING
"ESFQ: Unknown hash type. Falling back to classic.\n");
508 return TCA_SFQ_HASH_CLASSIC
;
/*
 * Initialise one esfq_sched_data from a netlink option blob (opt may be
 * NULL, in which case compiled-in defaults apply: divisor 1024, 128 flows).
 * Allocates the per-slot arrays and wires up the empty depth lists.
 * NOTE(review): many interior lines are missing from this extraction --
 * the -EINVAL returns, the "if (!q->ht) goto err_case;" checks after each
 * kmalloc, the err_case cleanup label, the ht[] sentinel-fill body, and the
 * esfq_link() loop body.  Confirm against the complete source.
 */
513 static int esfq_q_init(struct esfq_sched_data
*q
, struct rtattr
*opt
)
515 struct tc_esfq_qopt
*ctl
= RTA_DATA(opt
);
/* Upper bound for depth: indices up to 2*depth+1 must fit in esfq_index. */
516 esfq_index p
= ~0U/2;
519 if (opt
&& opt
->rta_len
< RTA_LENGTH(sizeof(*ctl
)))
523 q
->hash_kind
= TCA_SFQ_HASH_CLASSIC
;
/* Defaults used when no options were supplied. */
526 q
->perturb_period
= 0;
527 q
->hash_divisor
= 1024;
528 q
->tail
= q
->limit
= q
->depth
= 128;
531 struct tc_esfq_qopt
*ctl
= RTA_DATA(opt
);
533 q
->quantum
= ctl
->quantum
;
534 q
->perturb_period
= ctl
->perturb_period
*HZ
;
/* "x ? : d" -- GNU elvis operator: use x when non-zero, else default d. */
535 q
->hash_divisor
= ctl
->divisor
? : 1024;
536 q
->tail
= q
->limit
= q
->depth
= ctl
->flows
? : 128;
538 if ( q
->depth
> p
- 1 )
542 q
->limit
= min_t(u32
, ctl
->limit
, q
->depth
);
544 if (ctl
->hash_kind
) {
545 q
->hash_kind
= esfq_check_hash(ctl
->hash_kind
);
/* Per-bucket and per-slot arrays; sized by divisor or depth. */
549 q
->ht
= kmalloc(q
->hash_divisor
*sizeof(esfq_index
), GFP_KERNEL
);
552 q
->dep
= kmalloc((1+q
->depth
*2)*sizeof(struct esfq_head
), GFP_KERNEL
);
555 q
->next
= kmalloc(q
->depth
*sizeof(esfq_index
), GFP_KERNEL
);
558 q
->allot
= kmalloc(q
->depth
*sizeof(short), GFP_KERNEL
);
561 q
->hash
= kmalloc(q
->depth
*sizeof(unsigned short), GFP_KERNEL
);
564 q
->qs
= kmalloc(q
->depth
*sizeof(struct sk_buff_head
), GFP_KERNEL
);
568 for (i
=0; i
< q
->hash_divisor
; i
++)
/* Init each slot queue and make every depth-list head self-linked. */
570 for (i
=0; i
<q
->depth
; i
++) {
571 skb_queue_head_init(&q
->qs
[i
]);
572 q
->dep
[i
+q
->depth
].next
= i
+q
->depth
;
573 q
->dep
[i
+q
->depth
].prev
= i
+q
->depth
;
576 for (i
=0; i
<q
->depth
; i
++)
/*
 * Qdisc init entry point: default the quantum to the device MTU, delegate
 * the real work to esfq_q_init(), then set up (and, if a perturbation
 * period was requested, start) the perturbation timer.
 * NOTE(review): the error-return lines after esfq_q_init() failure and the
 * final "return 0;" (orig. 585, 587-588, 591-592, 599-602) are missing
 * from this extraction.
 */
584 static int esfq_init(struct Qdisc
*sch
, struct rtattr
*opt
)
586 struct esfq_sched_data
*q
= qdisc_priv(sch
);
589 q
->quantum
= psched_mtu(sch
->dev
); /* default */
590 if ((err
= esfq_q_init(q
, opt
)))
593 init_timer(&q
->perturb_timer
);
594 q
->perturb_timer
.data
= (unsigned long)sch
;
595 q
->perturb_timer
.function
= esfq_perturbation
;
596 if (q
->perturb_period
) {
597 q
->perturb_timer
.expires
= jiffies
+ q
->perturb_period
;
598 add_timer(&q
->perturb_timer
);
/*
 * Runtime reconfiguration: build a complete replacement esfq_sched_data
 * ("new" -- a legal C identifier, though it clashes with C++), migrate all
 * queued packets into it, destroy the old arrays, then copy the new state's
 * fields and array pointers into the live qdisc.  Re-arms the perturbation
 * timer when a period is configured.
 * NOTE(review): several lines are missing from this extraction -- the
 * sch_tree_lock() presumably paired with the visible sch_tree_unlock(),
 * the esfq_q_destroy(q) call behind the "clean up" comment, the tail
 * pointer and remaining array-pointer copies (ht/next/hash/qs/dep) around
 * orig. 632-641, and the final return.
 */
604 static int esfq_change(struct Qdisc
*sch
, struct rtattr
*opt
)
606 struct esfq_sched_data
*q
= qdisc_priv(sch
);
607 struct esfq_sched_data
new;
611 /* set up new queue */
612 memset(&new, 0, sizeof(struct esfq_sched_data
));
613 new.quantum
= psched_mtu(sch
->dev
); /* default */
614 if ((err
= esfq_q_init(&new, opt
)))
617 /* copy all packets from the old queue to the new queue */
619 while ((skb
= esfq_q_dequeue(q
)) != NULL
)
620 esfq_q_enqueue(skb
, &new, ESFQ_TAIL
);
622 /* clean up the old queue */
625 /* copy elements of the new queue into the old queue */
626 q
->perturb_period
= new.perturb_period
;
627 q
->quantum
= new.quantum
;
628 q
->limit
= new.limit
;
629 q
->depth
= new.depth
;
630 q
->hash_divisor
= new.hash_divisor
;
631 q
->hash_kind
= new.hash_kind
;
633 q
->max_depth
= new.max_depth
;
/* Array ownership transfers from 'new' to the live qdisc. */
637 q
->allot
= new.allot
;
642 if (q
->perturb_period
) {
643 q
->perturb_timer
.expires
= jiffies
+ q
->perturb_period
;
644 add_timer(&q
->perturb_timer
);
648 sch_tree_unlock(sch
);
/*
 * Netlink dump: report the current configuration back to userspace as a
 * tc_esfq_qopt inside TCA_OPTIONS.  'b' snapshots the tail pointer so the
 * standard rtattr_failure path can trim a partial attribute.
 * NOTE(review): the rtattr_failure label, nlmsg_trim/return lines and the
 * success return (orig. 660, 665, 667-673) are missing from this
 * extraction.
 */
652 static int esfq_dump(struct Qdisc
*sch
, struct sk_buff
*skb
)
654 struct esfq_sched_data
*q
= qdisc_priv(sch
);
655 unsigned char *b
= skb_tail_pointer(skb
);
656 struct tc_esfq_qopt opt
;
658 opt
.quantum
= q
->quantum
;
/* Convert back from jiffies to seconds for userspace. */
659 opt
.perturb_period
= q
->perturb_period
/HZ
;
661 opt
.limit
= q
->limit
;
662 opt
.divisor
= q
->hash_divisor
;
663 opt
.flows
= q
->depth
;
664 opt
.hash_kind
= q
->hash_kind
;
666 RTA_PUT(skb
, TCA_OPTIONS
, sizeof(opt
), &opt
);
/*
 * Qdisc operations table registered with the packet scheduler core.
 * NOTE(review): this extraction is missing several initialisers present in
 * the gaps (orig. 676-679, 684-686, 689 -- presumably .next, .cl_ops, .id,
 * .drop, .init, .reset and .dump); confirm against the complete source.
 */
675 static struct Qdisc_ops esfq_qdisc_ops
=
680 .priv_size
= sizeof(struct esfq_sched_data
),
681 .enqueue
= esfq_enqueue
,
682 .dequeue
= esfq_dequeue
,
683 .requeue
= esfq_requeue
,
687 .destroy
= esfq_destroy
,
688 .change
= esfq_change
,
690 .owner
= THIS_MODULE
,
693 static int __init
esfq_module_init(void)
695 return register_qdisc(&esfq_qdisc_ops
);
697 static void __exit
esfq_module_exit(void)
699 unregister_qdisc(&esfq_qdisc_ops
);
701 module_init(esfq_module_init
)
702 module_exit(esfq_module_exit
)
703 MODULE_LICENSE("GPL");