/*
 * net/sched/sch_sfq.c	Stochastic Fairness Queueing discipline.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/ipv6.h>
#include <linux/skbuff.h>
#include <linux/jhash.h>
#include <linux/slab.h>
#include <net/ip.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>

/*	Stochastic Fairness Queuing algorithm.
	=======================================

	Source:
	Paul E. McKenney "Stochastic Fairness Queuing",
	IEEE INFOCOM'90 Proceedings, San Francisco, 1990.

	Paul E. McKenney "Stochastic Fairness Queuing",
	"Interworking: Research and Experience", v.2, 1991, p.113-131.

	See also:
	M. Shreedhar and George Varghese "Efficient Fair
	Queuing using Deficit Round Robin", Proc. SIGCOMM 95.

	This is not the thing that is usually called (W)FQ nowadays.
	It does not use any timestamp mechanism, but instead
	processes queues in round-robin order.

	ADVANTAGE:

	- It is very cheap. Both CPU and memory requirements are minimal.

	DRAWBACKS:

	- "Stochastic" -> It is not 100% fair.
	  When hash collisions occur, several flows are considered as one.

	- "Round-robin" -> It introduces larger delays than virtual clock
	  based schemes, and should not be used for isolating interactive
	  traffic from non-interactive. This means that this scheduler
	  should be used as a leaf of CBQ or P3, which put interactive
	  traffic into a higher priority band.

	We still need true WFQ for the top level CSZ, but using WFQ
	for the best effort traffic is absolutely pointless:
	SFQ is superior for this purpose.

	IMPLEMENTATION:
	This implementation limits maximal queue length to 128;
	maximal mtu to 2^15-1; number of hash buckets to 1024.
	The only goal of these restrictions was that all data
	fit into one 4K page :-). Struct sfq_sched_data is
	organized in anti-cache manner: all the data for a bucket
	are scattered over different locations. This is not good,
	but it allowed me to put it into 4K.

	It is easy to increase these values, but not in flight.  */

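/* For illustration, a typical configuration from user space, assuming the
 * iproute2 "tc" utility (parameter names follow tc-sfq(8)):
 *
 *	tc qdisc add dev eth0 root sfq perturb 10
 *
 * "perturb 10" asks sfq_perturbation() below to re-seed the hash every
 * 10 seconds; the quantum defaults to the device MTU when not given.
 */
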
#define SFQ_DEPTH		128
#define SFQ_HASH_DIVISOR	1024

/* This type should contain at least SFQ_DEPTH*2 values */
typedef unsigned char sfq_index;

struct sfq_head
{
	sfq_index	next;
	sfq_index	prev;
};

struct sfq_sched_data
{
/* Parameters */
	int		perturb_period;
	unsigned	quantum;	/* Allotment per round: MUST BE >= MTU */
	int		limit;

/* Variables */
	struct tcf_proto *filter_list;
	struct timer_list perturb_timer;
	u32		perturbation;
	sfq_index	tail;		/* Index of current slot in round */
	sfq_index	max_depth;	/* Maximal depth */

	sfq_index	ht[SFQ_HASH_DIVISOR];	/* Hash table */
	sfq_index	next[SFQ_DEPTH];	/* Active slots link */
	short		allot[SFQ_DEPTH];	/* Current allotment per slot */
	unsigned short	hash[SFQ_DEPTH];	/* Hash value indexed by slots */
	struct sk_buff_head	qs[SFQ_DEPTH];	/* Slot queue */
	struct sfq_head	dep[SFQ_DEPTH*2];	/* Linked list of slots, indexed by depth */
};

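/* Layout of dep[], as maintained by sfq_link()/sfq_inc()/sfq_dec() below:
 * entries 0..SFQ_DEPTH-1 are the link nodes of the slots themselves, while
 * entry SFQ_DEPTH+d is the sentinel head of a circular doubly-linked list
 * of all slots currently holding exactly d packets. Thus
 * dep[SFQ_DEPTH].next yields a free slot for a new flow, and
 * dep[q->max_depth + SFQ_DEPTH].next yields the longest slot when
 * sfq_drop() needs a victim.
 */
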
static __inline__ unsigned sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1)
{
	return jhash_2words(h, h1, q->perturbation) & (SFQ_HASH_DIVISOR - 1);
}

static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
{
	u32 h, h2;

	switch (skb->protocol) {
	case htons(ETH_P_IP):
	{
		const struct iphdr *iph;
		int poff;

		if (!pskb_network_may_pull(skb, sizeof(*iph)))
			goto err;
		iph = ip_hdr(skb);
		h = (__force u32)iph->daddr;
		h2 = (__force u32)iph->saddr ^ iph->protocol;
		if (iph->frag_off & htons(IP_MF|IP_OFFSET))
			break;
		poff = proto_ports_offset(iph->protocol);
		if (poff >= 0 &&
		    pskb_network_may_pull(skb, iph->ihl * 4 + 4 + poff)) {
			iph = ip_hdr(skb);
			h2 ^= *(u32*)((void *)iph + iph->ihl * 4 + poff);
		}
		break;
	}
	case htons(ETH_P_IPV6):
	{
		struct ipv6hdr *iph;
		int poff;

		if (!pskb_network_may_pull(skb, sizeof(*iph)))
			goto err;
		iph = ipv6_hdr(skb);
		h = (__force u32)iph->daddr.s6_addr32[3];
		h2 = (__force u32)iph->saddr.s6_addr32[3] ^ iph->nexthdr;
		poff = proto_ports_offset(iph->nexthdr);
		if (poff >= 0 &&
		    pskb_network_may_pull(skb, sizeof(*iph) + 4 + poff)) {
			iph = ipv6_hdr(skb);
			h2 ^= *(u32*)((void *)iph + sizeof(*iph) + poff);
		}
		break;
	}
	default:
err:
		h = (unsigned long)skb_dst(skb) ^ (__force u32)skb->protocol;
		h2 = (unsigned long)skb->sk;
	}
	return sfq_fold_hash(q, h, h2);
}

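/* Rough illustration of the fold above: for a TCP/IPv4 packet the inputs
 * are h = daddr and h2 = saddr ^ protocol ^ ports, so two distinct flows
 * share a bucket only if jhash_2words() maps their keys to the same value
 * modulo SFQ_HASH_DIVISOR, i.e. with probability of roughly 1/1024 for any
 * given perturbation seed.
 */
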
static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
				 int *qerr)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tcf_result res;
	int result;

	if (TC_H_MAJ(skb->priority) == sch->handle &&
	    TC_H_MIN(skb->priority) > 0 &&
	    TC_H_MIN(skb->priority) <= SFQ_HASH_DIVISOR)
		return TC_H_MIN(skb->priority);

	if (!q->filter_list)
		return sfq_hash(q, skb) + 1;

	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	result = tc_classify(skb, q->filter_list, &res);
	if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_STOLEN:
		case TC_ACT_QUEUED:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
		case TC_ACT_SHOT:
			return 0;
		}
#endif
		if (TC_H_MIN(res.classid) <= SFQ_HASH_DIVISOR)
			return TC_H_MIN(res.classid);
	}
	return 0;
}

static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d = q->qs[x].qlen + SFQ_DEPTH;

	p = d;
	n = q->dep[d].next;
	q->dep[x].next = n;
	q->dep[x].prev = p;
	q->dep[p].next = q->dep[n].prev = x;
}

static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;

	n = q->dep[x].next;
	p = q->dep[x].prev;
	q->dep[p].next = n;
	q->dep[n].prev = p;

	if (n == p && q->max_depth == q->qs[x].qlen + 1)
		q->max_depth--;

	sfq_link(q, x);
}

static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d;

	n = q->dep[x].next;
	p = q->dep[x].prev;
	q->dep[p].next = n;
	q->dep[n].prev = p;
	d = q->qs[x].qlen;
	if (q->max_depth < d)
		q->max_depth = d;

	sfq_link(q, x);
}

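/* Net effect of sfq_inc()/sfq_dec(): a slot is unlinked from the list for
 * its old queue length and relinked (via sfq_link()) into the list for the
 * new one, while q->max_depth tracks the longest backlog. This is what
 * lets sfq_drop() find the fullest slot in O(1) instead of scanning all
 * SFQ_DEPTH slots.
 */
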
static unsigned int sfq_drop(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index d = q->max_depth;
	struct sk_buff *skb;
	unsigned int len;

	/* Queue is full! Find the longest slot and
	   drop a packet from it */

	if (d > 1) {
		sfq_index x = q->dep[d + SFQ_DEPTH].next;
		skb = q->qs[x].prev;
		len = qdisc_pkt_len(skb);
		__skb_unlink(skb, &q->qs[x]);
		kfree_skb(skb);
		sfq_dec(q, x);
		sch->q.qlen--;
		sch->qstats.drops++;
		sch->qstats.backlog -= len;
		return len;
	}

	if (d == 1) {
		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
		d = q->next[q->tail];
		q->next[q->tail] = q->next[d];
		q->allot[q->next[d]] += q->quantum;
		skb = q->qs[d].prev;
		len = qdisc_pkt_len(skb);
		__skb_unlink(skb, &q->qs[d]);
		kfree_skb(skb);
		sfq_dec(q, d);
		sch->q.qlen--;
		q->ht[q->hash[d]] = SFQ_DEPTH;
		sch->qstats.drops++;
		sch->qstats.backlog -= len;
		return len;
	}

	return 0;
}

static int
sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int hash;
	sfq_index x;
	int uninitialized_var(ret);

	hash = sfq_classify(skb, sch, &ret);
	if (hash == 0) {
		if (ret & __NET_XMIT_BYPASS)
			sch->qstats.drops++;
		kfree_skb(skb);
		return ret;
	}
	hash--;

	x = q->ht[hash];
	if (x == SFQ_DEPTH) {
		q->ht[hash] = x = q->dep[SFQ_DEPTH].next;
		q->hash[x] = hash;
	}

	/* If selected queue has length q->limit, this means that
	 * all other queues are empty and that we do simple tail drop,
	 * i.e. drop _this_ packet.
	 */
	if (q->qs[x].qlen >= q->limit)
		return qdisc_drop(skb, sch);

	sch->qstats.backlog += qdisc_pkt_len(skb);
	__skb_queue_tail(&q->qs[x], skb);
	sfq_inc(q, x);
	if (q->qs[x].qlen == 1) {		/* The flow is new */
		if (q->tail == SFQ_DEPTH) {	/* It is the first flow */
			q->tail = x;
			q->next[x] = x;
			q->allot[x] = q->quantum;
		} else {
			q->next[x] = q->next[q->tail];
			q->next[q->tail] = x;
			q->tail = x;
		}
	}
	if (++sch->q.qlen <= q->limit) {
		sch->bstats.bytes += qdisc_pkt_len(skb);
		sch->bstats.packets++;
		return NET_XMIT_SUCCESS;
	}

	sfq_drop(sch);
	return NET_XMIT_CN;
}

static struct sk_buff *
sfq_peek(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index a;

	/* No active slots */
	if (q->tail == SFQ_DEPTH)
		return NULL;

	a = q->next[q->tail];
	return skb_peek(&q->qs[a]);
}

static struct sk_buff *
sfq_dequeue(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	sfq_index a, old_a;

	/* No active slots */
	if (q->tail == SFQ_DEPTH)
		return NULL;

	a = old_a = q->next[q->tail];

	/* Grab packet */
	skb = __skb_dequeue(&q->qs[a]);
	sfq_dec(q, a);
	sch->q.qlen--;
	sch->qstats.backlog -= qdisc_pkt_len(skb);

	/* Is the slot empty? */
	if (q->qs[a].qlen == 0) {
		q->ht[q->hash[a]] = SFQ_DEPTH;
		a = q->next[a];
		if (a == old_a) {
			q->tail = SFQ_DEPTH;
			return skb;
		}
		q->next[q->tail] = a;
		q->allot[a] += q->quantum;
	} else if ((q->allot[a] -= qdisc_pkt_len(skb)) <= 0) {
		q->tail = a;
		a = q->next[a];
		q->allot[a] += q->quantum;
	}
	return skb;
}

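/* Worked example of the deficit round-robin accounting above, assuming the
 * default Ethernet quantum of 1514 bytes: a slot dequeues a 1000-byte
 * packet (allot 1514 -> 514), then a 600-byte packet (allot 514 -> -86,
 * i.e. <= 0), at which point the round moves on and the next active slot
 * has its allotment topped up by one quantum. The negative deficit carries
 * over until the round returns, so heavy flows gain nothing from sending
 * large packets.
 */
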
static void
sfq_reset(struct Qdisc *sch)
{
	struct sk_buff *skb;

	while ((skb = sfq_dequeue(sch)) != NULL)
		kfree_skb(skb);
}

static void sfq_perturbation(unsigned long arg)
{
	struct Qdisc *sch = (struct Qdisc *)arg;
	struct sfq_sched_data *q = qdisc_priv(sch);

	q->perturbation = net_random();

	if (q->perturb_period)
		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
}

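/* Without this periodic re-seed, two flows that happen to collide in the
 * hash would share one slot (and one fair share) indefinitely. Re-keying
 * jhash_2words() with a fresh net_random() value redistributes flows
 * across buckets, so the unfairness from any given collision lasts at
 * most one perturb_period.
 */
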
static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tc_sfq_qopt *ctl = nla_data(opt);
	unsigned int qlen;

	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
		return -EINVAL;

	sch_tree_lock(sch);
	q->quantum = ctl->quantum ? : psched_mtu(qdisc_dev(sch));
	q->perturb_period = ctl->perturb_period * HZ;
	if (ctl->limit)
		q->limit = min_t(u32, ctl->limit, SFQ_DEPTH - 1);

	qlen = sch->q.qlen;
	while (sch->q.qlen > q->limit)
		sfq_drop(sch);
	qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);

	del_timer(&q->perturb_timer);
	if (q->perturb_period) {
		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
		q->perturbation = net_random();
	}
	sch_tree_unlock(sch);
	return 0;
}

static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	int i;

	q->perturb_timer.function = sfq_perturbation;
	q->perturb_timer.data = (unsigned long)sch;
	init_timer_deferrable(&q->perturb_timer);

	for (i = 0; i < SFQ_HASH_DIVISOR; i++)
		q->ht[i] = SFQ_DEPTH;

	for (i = 0; i < SFQ_DEPTH; i++) {
		skb_queue_head_init(&q->qs[i]);
		q->dep[i + SFQ_DEPTH].next = i + SFQ_DEPTH;
		q->dep[i + SFQ_DEPTH].prev = i + SFQ_DEPTH;
	}

	q->limit = SFQ_DEPTH - 1;
	q->max_depth = 0;
	q->tail = SFQ_DEPTH;
	if (opt == NULL) {
		q->quantum = psched_mtu(qdisc_dev(sch));
		q->perturb_period = 0;
		q->perturbation = net_random();
	} else {
		int err = sfq_change(sch, opt);
		if (err)
			return err;
	}
	for (i = 0; i < SFQ_DEPTH; i++)
		sfq_link(q, i);
	return 0;
}

static void sfq_destroy(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	tcf_destroy_chain(&q->filter_list);
	q->perturb_period = 0;
	del_timer_sync(&q->perturb_timer);
}

static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_sfq_qopt opt;

	opt.quantum = q->quantum;
	opt.perturb_period = q->perturb_period / HZ;

	opt.limit = q->limit;
	opt.divisor = SFQ_HASH_DIVISOR;
	opt.flows = q->limit;

	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);

	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
{
	return NULL;
}

static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
{
	return 0;
}

static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
			      u32 classid)
{
	return 0;
}

static void sfq_put(struct Qdisc *q, unsigned long cl)
{
}

static struct tcf_proto **sfq_find_tcf(struct Qdisc *sch, unsigned long cl)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	if (cl)
		return NULL;
	return &q->filter_list;
}

static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	tcm->tcm_handle |= TC_H_MIN(cl);
	return 0;
}

static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
				struct gnet_dump *d)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index idx = q->ht[cl - 1];
	struct gnet_stats_queue qs = { .qlen = q->qs[idx].qlen };
	struct tc_sfq_xstats xstats = { .allot = q->allot[idx] };

	if (gnet_stats_copy_queue(d, &qs) < 0)
		return -1;
	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
}

static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int i;

	if (arg->stop)
		return;

	for (i = 0; i < SFQ_HASH_DIVISOR; i++) {
		if (q->ht[i] == SFQ_DEPTH ||
		    arg->count < arg->skip) {
			arg->count++;
			continue;
		}
		if (arg->fn(sch, i + 1, arg) < 0) {
			arg->stop = 1;
			break;
		}
		arg->count++;
	}
}

static const struct Qdisc_class_ops sfq_class_ops = {
	.leaf		=	sfq_leaf,
	.get		=	sfq_get,
	.put		=	sfq_put,
	.tcf_chain	=	sfq_find_tcf,
	.bind_tcf	=	sfq_bind,
	.unbind_tcf	=	sfq_put,
	.dump		=	sfq_dump_class,
	.dump_stats	=	sfq_dump_class_stats,
	.walk		=	sfq_walk,
};

static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
	.cl_ops		=	&sfq_class_ops,
	.id		=	"sfq",
	.priv_size	=	sizeof(struct sfq_sched_data),
	.enqueue	=	sfq_enqueue,
	.dequeue	=	sfq_dequeue,
	.peek		=	sfq_peek,
	.drop		=	sfq_drop,
	.init		=	sfq_init,
	.reset		=	sfq_reset,
	.destroy	=	sfq_destroy,
	.change		=	NULL,
	.dump		=	sfq_dump,
	.owner		=	THIS_MODULE,
};

static int __init sfq_module_init(void)
{
	return register_qdisc(&sfq_qdisc_ops);
}

static void __exit sfq_module_exit(void)
{
	unregister_qdisc(&sfq_qdisc_ops);
}

module_init(sfq_module_init)
module_exit(sfq_module_exit)
MODULE_LICENSE("GPL");