/*
 * net/sched/sch_sfq.c	Stochastic Fairness Queueing discipline.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/ipv6.h>
#include <linux/skbuff.h>
#include <linux/jhash.h>
#include <linux/slab.h>
#include <net/ip.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>

/*	Stochastic Fairness Queuing algorithm.
	=======================================

	Source:
	Paul E. McKenney "Stochastic Fairness Queuing",
	IEEE INFOCOM'90 Proceedings, San Francisco, 1990.

	Paul E. McKenney "Stochastic Fairness Queuing",
	"Interworking: Research and Experience", v.2, 1991, p.113-131.

	See also:
	M. Shreedhar and George Varghese "Efficient Fair
	Queuing using Deficit Round Robin", Proc. SIGCOMM 95.

	This is not the thing that is usually called (W)FQ nowadays.
	It does not use any timestamp mechanism, but instead
	processes queues in round-robin order.

	ADVANTAGE:

	- It is very cheap. Both CPU and memory requirements are minimal.

	DRAWBACKS:

	- "Stochastic" -> It is not 100% fair.
	  When hash collisions occur, several flows are considered as one.

	- "Round-robin" -> It introduces larger delays than virtual clock
	  based schemes, and should not be used for isolating interactive
	  traffic from non-interactive. This means that this scheduler
	  should be used as a leaf of CBQ or P3, which put interactive
	  traffic to a higher priority band.

	We still need true WFQ for top level CSZ, but using WFQ
	for the best effort traffic is absolutely pointless:
	SFQ is superior for this purpose.

	IMPLEMENTATION:
	This implementation limits maximal queue length to 128;
	maximal mtu to 2^15-1; number of hash buckets to 1024.
	The only goal of these restrictions was that all data
	fit into one 4K page :-). Struct sfq_sched_data is
	organized in anti-cache manner: all the data for a bucket
	are scattered over different locations. This is not good,
	but it allowed me to put it into 4K.

	It is easy to increase these values, but not in flight.  */

#define SFQ_DEPTH		128
#define SFQ_HASH_DIVISOR	1024

/* This type should contain at least SFQ_DEPTH*2 values */
typedef unsigned char sfq_index;

struct sfq_head
{
	sfq_index	next;
	sfq_index	prev;
};

struct sfq_sched_data
{
/* Parameters */
	int		perturb_period;
	unsigned	quantum;	/* Allotment per round: MUST BE >= MTU */
	int		limit;

/* Variables */
	struct tcf_proto *filter_list;
	struct timer_list perturb_timer;
	u32		perturbation;
	sfq_index	tail;		/* Index of current slot in round */
	sfq_index	max_depth;	/* Maximal depth */

	sfq_index	ht[SFQ_HASH_DIVISOR];	/* Hash table */
	sfq_index	next[SFQ_DEPTH];	/* Active slots link */
	short		allot[SFQ_DEPTH];	/* Current allotment per slot */
	unsigned short	hash[SFQ_DEPTH];	/* Hash value indexed by slots */
	struct sk_buff_head	qs[SFQ_DEPTH];	/* Slot queue */
	struct sfq_head	dep[SFQ_DEPTH*2];	/* Linked list of slots, indexed by depth */
};

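/*
 * Flow hashing: fold the packet's addresses and protocol, plus (for
 * unfragmented TCP/UDP/UDPLITE/SCTP/DCCP/ESP) the first 32-bit word of
 * the transport header (the port pair for TCP/UDP), through jhash,
 * salted with the current perturbation so collisions change over time.
 */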
static __inline__ unsigned sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1)
{
	return jhash_2words(h, h1, q->perturbation) & (SFQ_HASH_DIVISOR - 1);
}

static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
{
	u32 h, h2;

	switch (skb->protocol) {
	case htons(ETH_P_IP):
	{
		const struct iphdr *iph = ip_hdr(skb);
		h = iph->daddr;
		h2 = iph->saddr ^ iph->protocol;
		if (!(iph->frag_off & htons(IP_MF|IP_OFFSET)) &&
		    (iph->protocol == IPPROTO_TCP ||
		     iph->protocol == IPPROTO_UDP ||
		     iph->protocol == IPPROTO_UDPLITE ||
		     iph->protocol == IPPROTO_SCTP ||
		     iph->protocol == IPPROTO_DCCP ||
		     iph->protocol == IPPROTO_ESP))
			h2 ^= *(((u32 *)iph) + iph->ihl);
		break;
	}
	case htons(ETH_P_IPV6):
	{
		struct ipv6hdr *iph = ipv6_hdr(skb);
		h = iph->daddr.s6_addr32[3];
		h2 = iph->saddr.s6_addr32[3] ^ iph->nexthdr;
		if (iph->nexthdr == IPPROTO_TCP ||
		    iph->nexthdr == IPPROTO_UDP ||
		    iph->nexthdr == IPPROTO_UDPLITE ||
		    iph->nexthdr == IPPROTO_SCTP ||
		    iph->nexthdr == IPPROTO_DCCP ||
		    iph->nexthdr == IPPROTO_ESP)
			h2 ^= *(u32 *)&iph[1];
		break;
	}
	default:
		h = (unsigned long)skb_dst(skb) ^ skb->protocol;
		h2 = (unsigned long)skb->sk;
	}
	return sfq_fold_hash(q, h, h2);
}

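/*
 * Map an skb to a bucket number in 1..SFQ_HASH_DIVISOR, or return 0 to
 * signal drop/steal.  skb->priority may pick a bucket directly; a
 * configured filter chain is consulted next, and the default hash is
 * the fallback.  Note the deliberate fall-through below: TC_ACT_STOLEN
 * and TC_ACT_QUEUED set *qerr and then share TC_ACT_SHOT's return 0.
 */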
static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
				 int *qerr)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tcf_result res;
	int result;

	if (TC_H_MAJ(skb->priority) == sch->handle &&
	    TC_H_MIN(skb->priority) > 0 &&
	    TC_H_MIN(skb->priority) <= SFQ_HASH_DIVISOR)
		return TC_H_MIN(skb->priority);

	if (!q->filter_list)
		return sfq_hash(q, skb) + 1;

	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	result = tc_classify(skb, q->filter_list, &res);
	if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_STOLEN:
		case TC_ACT_QUEUED:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
		case TC_ACT_SHOT:
			return 0;
		}
#endif
		if (TC_H_MIN(res.classid) <= SFQ_HASH_DIVISOR)
			return TC_H_MIN(res.classid);
	}
	return 0;
}

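/*
 * q->dep[] holds doubly-linked lists of slots keyed by queue length:
 * the head of the list for slots of length n lives at dep[n + SFQ_DEPTH],
 * so dep[SFQ_DEPTH] always chains the free (empty) slots.  sfq_link()
 * inserts slot x at the head of the list matching its current length;
 * sfq_dec()/sfq_inc() re-link a slot after a dequeue/enqueue and keep
 * q->max_depth in step.
 */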
static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d = q->qs[x].qlen + SFQ_DEPTH;

	p = d;
	n = q->dep[d].next;
	q->dep[x].next = n;
	q->dep[x].prev = p;
	q->dep[p].next = q->dep[n].prev = x;
}

static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;

	n = q->dep[x].next;
	p = q->dep[x].prev;
	q->dep[p].next = n;
	q->dep[n].prev = p;

	if (n == p && q->max_depth == q->qs[x].qlen + 1)
		q->max_depth--;

	sfq_link(q, x);
}

static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d;

	n = q->dep[x].next;
	p = q->dep[x].prev;
	q->dep[p].next = n;
	q->dep[n].prev = p;
	d = q->qs[x].qlen;
	if (q->max_depth < d)
		q->max_depth = d;

	sfq_link(q, x);
}

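/*
 * Drop one packet: tail-drop from the deepest slot when one exists
 * (d > 1); when every active slot holds exactly one packet, sacrifice
 * the slot that would have been served next and unlink it from the
 * round-robin ring.
 */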
static unsigned int sfq_drop(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index d = q->max_depth;
	struct sk_buff *skb;
	unsigned int len;

	/* Queue is full! Find the longest slot and
	   drop a packet from it */

	if (d > 1) {
		sfq_index x = q->dep[d + SFQ_DEPTH].next;
		skb = q->qs[x].prev;
		len = qdisc_pkt_len(skb);
		__skb_unlink(skb, &q->qs[x]);
		kfree_skb(skb);
		sfq_dec(q, x);
		sch->q.qlen--;
		sch->qstats.drops++;
		sch->qstats.backlog -= len;
		return len;
	}

	if (d == 1) {
		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
		d = q->next[q->tail];
		q->next[q->tail] = q->next[d];
		q->allot[q->next[d]] += q->quantum;
		skb = q->qs[d].prev;
		len = qdisc_pkt_len(skb);
		__skb_unlink(skb, &q->qs[d]);
		kfree_skb(skb);
		sfq_dec(q, d);
		sch->q.qlen--;
		q->ht[q->hash[d]] = SFQ_DEPTH;
		sch->qstats.drops++;
		sch->qstats.backlog -= len;
		return len;
	}

	return 0;
}

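/*
 * Enqueue: hash the packet to a slot (allocating a free slot from
 * dep[SFQ_DEPTH] on first use), tail-drop when that slot already holds
 * q->limit packets, and splice a newly active slot into the round-robin
 * ring as the new q->tail.
 */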
static int
sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int hash;
	sfq_index x;
	int uninitialized_var(ret);

	hash = sfq_classify(skb, sch, &ret);
	if (hash == 0) {
		if (ret & __NET_XMIT_BYPASS)
			sch->qstats.drops++;
		kfree_skb(skb);
		return ret;
	}
	hash--;

	x = q->ht[hash];
	if (x == SFQ_DEPTH) {
		q->ht[hash] = x = q->dep[SFQ_DEPTH].next;
		q->hash[x] = hash;
	}

	/* If the selected queue has length q->limit, this means that
	 * all other queues are empty and that we do simple tail drop,
	 * i.e. drop _this_ packet.
	 */
	if (q->qs[x].qlen >= q->limit)
		return qdisc_drop(skb, sch);

	sch->qstats.backlog += qdisc_pkt_len(skb);
	__skb_queue_tail(&q->qs[x], skb);
	sfq_inc(q, x);
	if (q->qs[x].qlen == 1) {		/* The flow is new */
		if (q->tail == SFQ_DEPTH) {	/* It is the first flow */
			q->tail = x;
			q->next[x] = x;
			q->allot[x] = q->quantum;
		} else {
			q->next[x] = q->next[q->tail];
			q->next[q->tail] = x;
			q->tail = x;
		}
	}
	if (++sch->q.qlen <= q->limit) {
		sch->bstats.bytes += qdisc_pkt_len(skb);
		sch->bstats.packets++;
		return 0;
	}

	sfq_drop(sch);
	return NET_XMIT_CN;
}

static struct sk_buff *
sfq_peek(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index a;

	/* No active slots */
	if (q->tail == SFQ_DEPTH)
		return NULL;

	a = q->next[q->tail];
	return skb_peek(&q->qs[a]);
}

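/*
 * Dequeue in deficit-round-robin fashion: serve the slot after q->tail,
 * charge the packet length against its allotment, and rotate to the
 * next slot (topping its allotment up by one quantum) once the current
 * slot is empty or its allotment is exhausted.
 */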
static struct sk_buff *
sfq_dequeue(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	sfq_index a, old_a;

	/* No active slots */
	if (q->tail == SFQ_DEPTH)
		return NULL;

	a = old_a = q->next[q->tail];

	/* Grab packet */
	skb = __skb_dequeue(&q->qs[a]);
	sfq_dec(q, a);
	sch->q.qlen--;
	sch->qstats.backlog -= qdisc_pkt_len(skb);

	/* Is the slot empty? */
	if (q->qs[a].qlen == 0) {
		q->ht[q->hash[a]] = SFQ_DEPTH;
		a = q->next[a];
		if (a == old_a) {
			q->tail = SFQ_DEPTH;
			return skb;
		}
		q->next[q->tail] = a;
		q->allot[a] += q->quantum;
	} else if ((q->allot[a] -= qdisc_pkt_len(skb)) <= 0) {
		q->tail = a;
		a = q->next[a];
		q->allot[a] += q->quantum;
	}
	return skb;
}

static void
sfq_reset(struct Qdisc *sch)
{
	struct sk_buff *skb;

	while ((skb = sfq_dequeue(sch)) != NULL)
		kfree_skb(skb);
}

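/*
 * Timer callback: pick a new random perturbation so that flows which
 * collided in the hash get a chance to separate, and re-arm the timer
 * while perturb_period is non-zero.
 */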
static void sfq_perturbation(unsigned long arg)
{
	struct Qdisc *sch = (struct Qdisc *)arg;
	struct sfq_sched_data *q = qdisc_priv(sch);

	q->perturbation = net_random();

	if (q->perturb_period)
		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
}

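/*
 * Apply a TCA_OPTIONS update: quantum defaults to the device MTU,
 * limit is clamped to at most SFQ_DEPTH - 1, excess packets are
 * dropped under the tree lock, and the perturbation timer is re-armed.
 */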
static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tc_sfq_qopt *ctl = nla_data(opt);
	unsigned int qlen;

	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
		return -EINVAL;

	sch_tree_lock(sch);
	q->quantum = ctl->quantum ? : psched_mtu(qdisc_dev(sch));
	q->perturb_period = ctl->perturb_period * HZ;
	if (ctl->limit)
		q->limit = min_t(u32, ctl->limit, SFQ_DEPTH - 1);

	qlen = sch->q.qlen;
	while (sch->q.qlen > q->limit)
		sfq_drop(sch);
	qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);

	del_timer(&q->perturb_timer);
	if (q->perturb_period) {
		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
		q->perturbation = net_random();
	}
	sch_tree_unlock(sch);
	return 0;
}

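/*
 * Initialise an SFQ instance: an empty hash table (every bucket points
 * at the sentinel value SFQ_DEPTH), self-linked depth-list heads, and
 * defaults taken from the device MTU unless options were supplied.
 */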
static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	int i;

	q->perturb_timer.function = sfq_perturbation;
	q->perturb_timer.data = (unsigned long)sch;
	init_timer_deferrable(&q->perturb_timer);

	for (i = 0; i < SFQ_HASH_DIVISOR; i++)
		q->ht[i] = SFQ_DEPTH;

	for (i = 0; i < SFQ_DEPTH; i++) {
		skb_queue_head_init(&q->qs[i]);
		q->dep[i + SFQ_DEPTH].next = i + SFQ_DEPTH;
		q->dep[i + SFQ_DEPTH].prev = i + SFQ_DEPTH;
	}

	q->limit = SFQ_DEPTH - 1;
	q->max_depth = 0;
	q->tail = SFQ_DEPTH;
	if (opt == NULL) {
		q->quantum = psched_mtu(qdisc_dev(sch));
		q->perturb_period = 0;
		q->perturbation = net_random();
	} else {
		int err = sfq_change(sch, opt);
		if (err)
			return err;
	}

	for (i = 0; i < SFQ_DEPTH; i++)
		sfq_link(q, i);
	return 0;
}

static void sfq_destroy(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	tcf_destroy_chain(&q->filter_list);
	q->perturb_period = 0;
	del_timer_sync(&q->perturb_timer);
}

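/*
 * Fill a tc_sfq_qopt for netlink: the divisor is the compile-time
 * constant, and the number of flows is reported as the queue limit.
 */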
static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_sfq_qopt opt;

	opt.quantum = q->quantum;
	opt.perturb_period = q->perturb_period / HZ;

	opt.limit = q->limit;
	opt.divisor = SFQ_HASH_DIVISOR;
	opt.flows = q->limit;

	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);

	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

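/*
 * Class-ops boilerplate: SFQ has no real child classes, so the hash
 * buckets are exposed as read-only pseudo-classes for filter binding
 * and per-bucket statistics.
 */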
static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
{
	return NULL;
}

static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
{
	return 0;
}

static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
			      u32 classid)
{
	return 0;
}

static struct tcf_proto **sfq_find_tcf(struct Qdisc *sch, unsigned long cl)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	if (cl)
		return NULL;
	return &q->filter_list;
}

static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	tcm->tcm_handle |= TC_H_MIN(cl);
	return 0;
}

static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
				struct gnet_dump *d)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index idx = q->ht[cl - 1];
	struct gnet_stats_queue qs = { .qlen = q->qs[idx].qlen };
	struct tc_sfq_xstats xstats = { .allot = q->allot[idx] };

	if (gnet_stats_copy_queue(d, &qs) < 0)
		return -1;
	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
}

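/*
 * Walk the pseudo-classes: invoke the walker callback for every bucket
 * that currently has an active slot, honouring the walker's
 * skip/count/stop protocol.
 */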
static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int i;

	if (arg->stop)
		return;

	for (i = 0; i < SFQ_HASH_DIVISOR; i++) {
		if (q->ht[i] == SFQ_DEPTH ||
		    arg->count < arg->skip) {
			arg->count++;
			continue;
		}
		if (arg->fn(sch, i + 1, arg) < 0) {
			arg->stop = 1;
			break;
		}
		arg->count++;
	}
}

static const struct Qdisc_class_ops sfq_class_ops = {
	.leaf		=	sfq_leaf,
	.get		=	sfq_get,
	.tcf_chain	=	sfq_find_tcf,
	.bind_tcf	=	sfq_bind,
	.dump		=	sfq_dump_class,
	.dump_stats	=	sfq_dump_class_stats,
	.walk		=	sfq_walk,
};

static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
	.cl_ops		=	&sfq_class_ops,
	.id		=	"sfq",
	.priv_size	=	sizeof(struct sfq_sched_data),
	.enqueue	=	sfq_enqueue,
	.dequeue	=	sfq_dequeue,
	.peek		=	sfq_peek,
	.drop		=	sfq_drop,
	.init		=	sfq_init,
	.reset		=	sfq_reset,
	.destroy	=	sfq_destroy,
	.change		=	NULL,
	.dump		=	sfq_dump,
	.owner		=	THIS_MODULE,
};

static int __init sfq_module_init(void)
{
	return register_qdisc(&sfq_qdisc_ops);
}

static void __exit sfq_module_exit(void)
{
	unregister_qdisc(&sfq_qdisc_ops);
}

module_init(sfq_module_init)
module_exit(sfq_module_exit)
MODULE_LICENSE("GPL");