/*
 * net/sched/sch_sfq.c	Stochastic Fairness Queueing discipline.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/ipv6.h>
#include <linux/skbuff.h>
#include <linux/jhash.h>
#include <linux/slab.h>
#include <net/ip.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
/*	Stochastic Fairness Queuing algorithm.
	=======================================

	Source:
	Paul E. McKenney "Stochastic Fairness Queuing",
	IEEE INFOCOM'90 Proceedings, San Francisco, 1990.

	Paul E. McKenney "Stochastic Fairness Queuing",
	"Interworking: Research and Experience", v.2, 1991, p.113-131.

	See also:
	M. Shreedhar and George Varghese "Efficient Fair
	Queuing using Deficit Round Robin", Proc. SIGCOMM 95.

	This is not the thing that is usually called (W)FQ nowadays.
	It does not use any timestamp mechanism, but instead
	processes queues in round-robin order.

	ADVANTAGE:

	- It is very cheap. Both CPU and memory requirements are minimal.

	DRAWBACKS:

	- "Stochastic" -> It is not 100% fair.
	When hash collisions occur, several flows are considered as one.

	- "Round-robin" -> It introduces larger delays than virtual clock
	based schemes, and should not be used for isolating interactive
	traffic from non-interactive traffic. This means that this
	scheduler should be used as a leaf of CBQ or P3, which put
	interactive traffic into a higher priority band.

	We still need true WFQ for the top-level CSZ, but using WFQ
	for best-effort traffic is pointless: SFQ is superior
	for this purpose.

	IMPLEMENTATION:
	This implementation limits the maximal queue length to 128;
	the maximal mtu to 2^15-1; and the number of hash buckets to 1024.
	The only goal of these restrictions was to make all the data
	fit into one 4K page :-). Struct sfq_sched_data is
	organized in an anti-cache manner: all the data for a bucket
	are scattered over different locations. This is not good,
	but it allowed me to put it into 4K.

	It is easy to increase these values, but not at run time. */
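
/* A rough feel for the "stochastic" caveat above (an illustrative
 * estimate, not from the original text): with 1024 buckets and N
 * concurrent flows, the expected number of colliding flow pairs is
 * about N*(N-1)/2048 (birthday bound), e.g. roughly half a pair at
 * N = 32. Periodic perturbation (see sfq_perturbation() below)
 * re-seeds the hash so any given collision does not persist.
 */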
#define SFQ_DEPTH		128
#define SFQ_HASH_DIVISOR	1024

/* This type should contain at least SFQ_DEPTH*2 values */
typedef unsigned char sfq_index;
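
/* With SFQ_DEPTH = 128 an sfq_index must cover the 128 slots (0..127)
 * plus the 128 depth-list heads (128..255), i.e. exactly the 256 values
 * of an unsigned char; the value SFQ_DEPTH itself doubles as the
 * "empty" marker in ht[] and tail.
 */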
struct sfq_head
{
	sfq_index	next;
	sfq_index	prev;
};

struct sfq_sched_data
{
/* Parameters */
	int		perturb_period;
	unsigned	quantum;	/* Allotment per round: MUST BE >= MTU */
	int		limit;

/* Variables */
	struct tcf_proto *filter_list;
	struct timer_list perturb_timer;
	u32		perturbation;
	sfq_index	tail;		/* Index of current slot in round */
	sfq_index	max_depth;	/* Maximal depth */

	sfq_index	ht[SFQ_HASH_DIVISOR];	/* Hash table */
	sfq_index	next[SFQ_DEPTH];	/* Active slots link */
	short		allot[SFQ_DEPTH];	/* Current allotment per slot */
	unsigned short	hash[SFQ_DEPTH];	/* Hash value indexed by slots */
	struct sk_buff_head	qs[SFQ_DEPTH];	/* Slot queue */
	struct sfq_head	dep[SFQ_DEPTH * 2];	/* Linked list of slots, indexed by depth */
};
static __inline__ unsigned sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1)
{
	return jhash_2words(h, h1, q->perturbation) & (SFQ_HASH_DIVISOR - 1);
}
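
/* jhash_2words() mixes the two flow halves with the random perturbation
 * seed; masking with (SFQ_HASH_DIVISOR - 1) keeps the low 10 bits, which
 * is a valid bucket index precisely because SFQ_HASH_DIVISOR is a power
 * of two.
 */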
static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
{
	u32 h, h2;

	switch (skb->protocol) {
	case htons(ETH_P_IP):
	{
		const struct iphdr *iph;

		if (!pskb_network_may_pull(skb, sizeof(*iph)))
			goto err;
		iph = ip_hdr(skb);
		h = (__force u32)iph->daddr;
		h2 = (__force u32)iph->saddr ^ iph->protocol;
		if (!(iph->frag_off & htons(IP_MF|IP_OFFSET)) &&
		    (iph->protocol == IPPROTO_TCP ||
		     iph->protocol == IPPROTO_UDP ||
		     iph->protocol == IPPROTO_UDPLITE ||
		     iph->protocol == IPPROTO_SCTP ||
		     iph->protocol == IPPROTO_DCCP ||
		     iph->protocol == IPPROTO_ESP) &&
		    pskb_network_may_pull(skb, iph->ihl * 4 + 4))
			h2 ^= *(((u32 *)iph) + iph->ihl);
		break;
	}
	case htons(ETH_P_IPV6):
	{
		struct ipv6hdr *iph;

		if (!pskb_network_may_pull(skb, sizeof(*iph)))
			goto err;
		iph = ipv6_hdr(skb);
		h = (__force u32)iph->daddr.s6_addr32[3];
		h2 = (__force u32)iph->saddr.s6_addr32[3] ^ iph->nexthdr;
		if ((iph->nexthdr == IPPROTO_TCP ||
		     iph->nexthdr == IPPROTO_UDP ||
		     iph->nexthdr == IPPROTO_UDPLITE ||
		     iph->nexthdr == IPPROTO_SCTP ||
		     iph->nexthdr == IPPROTO_DCCP ||
		     iph->nexthdr == IPPROTO_ESP) &&
		    pskb_network_may_pull(skb, sizeof(*iph) + 4))
			h2 ^= *(u32 *)&iph[1];
		break;
	}
	default:
err:
		h = (unsigned long)skb_dst(skb) ^ (__force u32)skb->protocol;
		h2 = (unsigned long)skb->sk;
	}
	return sfq_fold_hash(q, h, h2);
}
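
/* For the transport protocols listed above, h2 additionally mixes in the
 * first 4 bytes past the IP header -- the source/destination port pair --
 * so unfragmented TCP/UDP/SCTP/DCCP flows hash per connection. Fragments
 * and unrecognized protocols fall back to the address pair only, and
 * packets whose headers cannot be pulled hash on the route and socket.
 */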
static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
				 int *qerr)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tcf_result res;
	int result;

	if (TC_H_MAJ(skb->priority) == sch->handle &&
	    TC_H_MIN(skb->priority) > 0 &&
	    TC_H_MIN(skb->priority) <= SFQ_HASH_DIVISOR)
		return TC_H_MIN(skb->priority);

	if (!q->filter_list)
		return sfq_hash(q, skb) + 1;

	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	result = tc_classify(skb, q->filter_list, &res);
	if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_STOLEN:
		case TC_ACT_QUEUED:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
		case TC_ACT_SHOT:
			return 0;
		}
#endif
		if (TC_H_MIN(res.classid) <= SFQ_HASH_DIVISOR)
			return TC_H_MIN(res.classid);
	}
	return 0;
}
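
/* Note that sfq_classify() returns the bucket number plus one: 0 is
 * reserved for "dropped/stolen", and a TC_H_MIN() in the range
 * 1..SFQ_HASH_DIVISOR selects a bucket directly, which is what lets
 * skb->priority or a tc filter pin a flow to a particular bucket.
 */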
static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d = q->qs[x].qlen + SFQ_DEPTH;

	p = d;
	n = q->dep[d].next;
	q->dep[x].next = n;
	q->dep[x].prev = p;
	q->dep[p].next = q->dep[n].prev = x;
}
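
/* Worked example: a slot x holding n packets lives on the circular
 * doubly-linked list headed at dep[n + SFQ_DEPTH]; empty slots sit on
 * dep[SFQ_DEPTH]. So if slot 5 holds 3 packets, sfq_link() splices it
 * in right after the head dep[131], making it the first slot found by
 * anyone scanning for depth-3 queues (e.g. sfq_drop()).
 */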
static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;

	n = q->dep[x].next;
	p = q->dep[x].prev;
	q->dep[p].next = n;
	q->dep[n].prev = p;

	if (n == p && q->max_depth == q->qs[x].qlen + 1)
		q->max_depth--;

	sfq_link(q, x);
}
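
/* n == p after unlinking means slot x was the only entry on its old
 * depth list (next and prev both pointed at the head); if that list was
 * the deepest one, max_depth shrinks by one. sfq_link() then re-files
 * the slot one level down, since qlen was already decremented.
 */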
static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d;

	n = q->dep[x].next;
	p = q->dep[x].prev;
	q->dep[p].next = n;
	q->dep[n].prev = p;
	d = q->qs[x].qlen;
	if (q->max_depth < d)
		q->max_depth = d;

	sfq_link(q, x);
}
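
/* sfq_inc() runs after the packet has been queued, so q->qs[x].qlen is
 * already the new depth d; max_depth only grows here and only shrinks
 * in sfq_dec(), keeping it the exact maximum depth over all slots,
 * ready for sfq_drop() to find the longest queue in O(1).
 */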
static unsigned int sfq_drop(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index d = q->max_depth;
	struct sk_buff *skb;
	unsigned int len;

	/* Queue is full! Find the longest slot and
	   drop a packet from it */

	if (d > 1) {
		sfq_index x = q->dep[d + SFQ_DEPTH].next;

		skb = q->qs[x].prev;
		len = qdisc_pkt_len(skb);
		__skb_unlink(skb, &q->qs[x]);
		kfree_skb(skb);
		sfq_dec(q, x);
		sch->q.qlen--;
		sch->qstats.drops++;
		sch->qstats.backlog -= len;
		return len;
	}

	if (d == 1) {
		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
		d = q->next[q->tail];
		q->next[q->tail] = q->next[d];
		q->allot[q->next[d]] += q->quantum;
		skb = q->qs[d].prev;
		len = qdisc_pkt_len(skb);
		__skb_unlink(skb, &q->qs[d]);
		kfree_skb(skb);
		sfq_dec(q, d);
		sch->q.qlen--;
		q->ht[q->hash[d]] = SFQ_DEPTH;
		sch->qstats.drops++;
		sch->qstats.backlog -= len;
		return len;
	}

	return 0;
}
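
/* sfq_drop() implements "drop from longest queue": the d > 1 branch
 * evicts the most recently queued packet of the deepest slot, punishing
 * the most aggressive flow. Only when every active slot holds exactly
 * one packet (d == 1) does it fall back to dropping from the next slot
 * in the round-robin ring and unhashing that slot entirely.
 */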
static int
sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int hash;
	sfq_index x;
	int uninitialized_var(ret);

	hash = sfq_classify(skb, sch, &ret);
	if (hash == 0) {
		if (ret & __NET_XMIT_BYPASS)
			sch->qstats.drops++;
		kfree_skb(skb);
		return ret;
	}
	hash--;

	x = q->ht[hash];
	if (x == SFQ_DEPTH) {
		q->ht[hash] = x = q->dep[SFQ_DEPTH].next;
		q->hash[x] = hash;
	}

	/* If the selected queue already has length q->limit, all the
	 * other queues are empty and we do a simple tail drop,
	 * i.e. drop _this_ packet.
	 */
	if (q->qs[x].qlen >= q->limit)
		return qdisc_drop(skb, sch);

	sch->qstats.backlog += qdisc_pkt_len(skb);
	__skb_queue_tail(&q->qs[x], skb);
	sfq_inc(q, x);
	if (q->qs[x].qlen == 1) {		/* The flow is new */
		if (q->tail == SFQ_DEPTH) {	/* It is the first flow */
			q->tail = x;
			q->next[x] = x;
			q->allot[x] = q->quantum;
		} else {
			q->next[x] = q->next[q->tail];
			q->next[q->tail] = x;
			q->tail = x;
		}
	}
	if (++sch->q.qlen <= q->limit) {
		sch->bstats.bytes += qdisc_pkt_len(skb);
		sch->bstats.packets++;
		return NET_XMIT_SUCCESS;
	}

	sfq_drop(sch);
	return NET_XMIT_CN;
}
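
/* Note the drop subtlety above: enqueueing into a slot that is already
 * at q->limit drops the arriving packet, while exceeding the overall
 * limit after a successful enqueue makes sfq_drop() evict from the
 * longest slot and report NET_XMIT_CN -- the new packet itself may
 * survive at the expense of a competing flow's packet.
 */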
static struct sk_buff *
sfq_peek(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index a;

	/* No active slots */
	if (q->tail == SFQ_DEPTH)
		return NULL;

	a = q->next[q->tail];
	return skb_peek(&q->qs[a]);
}
static struct sk_buff *
sfq_dequeue(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	sfq_index a, old_a;

	/* No active slots */
	if (q->tail == SFQ_DEPTH)
		return NULL;

	a = old_a = q->next[q->tail];

	/* Grab packet */
	skb = __skb_dequeue(&q->qs[a]);
	sfq_dec(q, a);
	sch->q.qlen--;
	sch->qstats.backlog -= qdisc_pkt_len(skb);

	/* Is the slot empty? */
	if (q->qs[a].qlen == 0) {
		q->ht[q->hash[a]] = SFQ_DEPTH;
		a = q->next[a];
		if (a == old_a) {
			q->tail = SFQ_DEPTH;
			return skb;
		}
		q->next[q->tail] = a;
		q->allot[a] += q->quantum;
	} else if ((q->allot[a] -= qdisc_pkt_len(skb)) <= 0) {
		q->tail = a;
		a = q->next[a];
		q->allot[a] += q->quantum;
	}
	return skb;
}
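
/* Deficit accounting example (numbers hypothetical): with quantum = 1514
 * a slot holding allot = 1514 pays 1000 for a 1000-byte packet, leaving
 * 514; a second packet drives it to -486, so the slot is moved to the
 * tail of the ring and the next slot is topped up by one quantum.
 * Larger packets thus cost a flow proportionally more turns -- the
 * deficit round-robin idea from the Shreedhar/Varghese paper cited above.
 */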
static void
sfq_reset(struct Qdisc *sch)
{
	struct sk_buff *skb;

	while ((skb = sfq_dequeue(sch)) != NULL)
		kfree_skb(skb);
}
static void sfq_perturbation(unsigned long arg)
{
	struct Qdisc *sch = (struct Qdisc *)arg;
	struct sfq_sched_data *q = qdisc_priv(sch);

	q->perturbation = net_random();

	if (q->perturb_period)
		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
}
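
/* Re-seeding the perturbation makes sfq_fold_hash() map flows to
 * different buckets each period, so two colliding flows are separated
 * again after at most one perturb period. Packets already queued keep
 * their old slot, so a rehash can briefly split one flow across two
 * slots and reorder it -- a known trade-off of perturbation.
 */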
static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tc_sfq_qopt *ctl = nla_data(opt);
	unsigned int qlen;

	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
		return -EINVAL;

	sch_tree_lock(sch);
	q->quantum = ctl->quantum ? : psched_mtu(qdisc_dev(sch));
	q->perturb_period = ctl->perturb_period * HZ;
	if (ctl->limit)
		q->limit = min_t(u32, ctl->limit, SFQ_DEPTH - 1);

	qlen = sch->q.qlen;
	while (sch->q.qlen > q->limit)
		sfq_drop(sch);
	qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);

	del_timer(&q->perturb_timer);
	if (q->perturb_period) {
		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
		q->perturbation = net_random();
	}
	sch_tree_unlock(sch);
	return 0;
}
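
/* From user space this corresponds to the classic tc invocation, e.g.
 *	tc qdisc add dev eth0 root sfq perturb 10 quantum 1514
 * where a zero quantum falls back to the device MTU via psched_mtu()
 * and a zero perturb period disables rehashing entirely.
 */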
static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	int i;

	q->perturb_timer.function = sfq_perturbation;
	q->perturb_timer.data = (unsigned long)sch;
	init_timer_deferrable(&q->perturb_timer);

	for (i = 0; i < SFQ_HASH_DIVISOR; i++)
		q->ht[i] = SFQ_DEPTH;

	for (i = 0; i < SFQ_DEPTH; i++) {
		skb_queue_head_init(&q->qs[i]);
		q->dep[i + SFQ_DEPTH].next = i + SFQ_DEPTH;
		q->dep[i + SFQ_DEPTH].prev = i + SFQ_DEPTH;
	}

	q->limit = SFQ_DEPTH - 1;
	q->max_depth = 0;
	q->tail = SFQ_DEPTH;
	if (opt == NULL) {
		q->quantum = psched_mtu(qdisc_dev(sch));
		q->perturb_period = 0;
		q->perturbation = net_random();
	} else {
		int err = sfq_change(sch, opt);
		if (err)
			return err;
	}

	for (i = 0; i < SFQ_DEPTH; i++)
		sfq_link(q, i);
	return 0;
}
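
/* After sfq_init() every invariant the fast path relies on holds: each
 * ht[] entry is SFQ_DEPTH (empty), each depth-list head dep[i+SFQ_DEPTH]
 * is self-linked, every slot sits on the depth-0 list via sfq_link(),
 * and tail == SFQ_DEPTH marks the round-robin ring as empty.
 */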
static void sfq_destroy(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	tcf_destroy_chain(&q->filter_list);
	q->perturb_period = 0;
	del_timer_sync(&q->perturb_timer);
}
static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_sfq_qopt opt;

	opt.quantum = q->quantum;
	opt.perturb_period = q->perturb_period / HZ;

	opt.limit = q->limit;
	opt.divisor = SFQ_HASH_DIVISOR;
	opt.flows = q->limit;

	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);

	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}
static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
{
	return NULL;
}

static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
{
	return 0;
}

static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
			      u32 classid)
{
	return 0;
}

static void sfq_put(struct Qdisc *q, unsigned long cl)
{
}

static struct tcf_proto **sfq_find_tcf(struct Qdisc *sch, unsigned long cl)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	if (cl)
		return NULL;
	return &q->filter_list;
}
static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	tcm->tcm_handle |= TC_H_MIN(cl);
	return 0;
}
static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
				struct gnet_dump *d)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index idx = q->ht[cl - 1];
	struct gnet_stats_queue qs = { .qlen = q->qs[idx].qlen };
	struct tc_sfq_xstats xstats = { .allot = q->allot[idx] };

	if (gnet_stats_copy_queue(d, &qs) < 0)
		return -1;
	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
}
static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int i;

	if (arg->stop)
		return;

	for (i = 0; i < SFQ_HASH_DIVISOR; i++) {
		if (q->ht[i] == SFQ_DEPTH ||
		    arg->count < arg->skip) {
			arg->count++;
			continue;
		}
		if (arg->fn(sch, i + 1, arg) < 0) {
			arg->stop = 1;
			break;
		}
		arg->count++;
	}
}
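
/* SFQ is formally classless, but the class ops below expose each hash
 * bucket as a pseudo-class numbered 1..SFQ_HASH_DIVISOR so that filters
 * can be attached and per-bucket statistics dumped; sfq_walk() simply
 * skips buckets whose ht[] entry is the SFQ_DEPTH "empty" marker.
 */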
static const struct Qdisc_class_ops sfq_class_ops = {
	.leaf		=	sfq_leaf,
	.get		=	sfq_get,
	.put		=	sfq_put,
	.tcf_chain	=	sfq_find_tcf,
	.bind_tcf	=	sfq_bind,
	.unbind_tcf	=	sfq_put,
	.dump		=	sfq_dump_class,
	.dump_stats	=	sfq_dump_class_stats,
	.walk		=	sfq_walk,
};
static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
	.cl_ops		=	&sfq_class_ops,
	.id		=	"sfq",
	.priv_size	=	sizeof(struct sfq_sched_data),
	.enqueue	=	sfq_enqueue,
	.dequeue	=	sfq_dequeue,
	.peek		=	sfq_peek,
	.drop		=	sfq_drop,
	.init		=	sfq_init,
	.reset		=	sfq_reset,
	.destroy	=	sfq_destroy,
	.change		=	NULL,
	.dump		=	sfq_dump,
	.owner		=	THIS_MODULE,
};
static int __init sfq_module_init(void)
{
	return register_qdisc(&sfq_qdisc_ops);
}
static void __exit sfq_module_exit(void)
{
	unregister_qdisc(&sfq_qdisc_ops);
}
module_init(sfq_module_init)
module_exit(sfq_module_exit)
MODULE_LICENSE("GPL");