net/sched/sch_sfq.c

   1 /*
   2  * net/sched/sch_sfq.c  Stochastic Fairness Queueing discipline.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  10  */
  11
  12 #include <linux/module.h>
  13 #include <linux/types.h>
  14 #include <linux/kernel.h>
  15 #include <linux/jiffies.h>
  16 #include <linux/string.h>
  17 #include <linux/in.h>
  18 #include <linux/errno.h>
  19 #include <linux/init.h>
  20 #include <linux/skbuff.h>
  21 #include <linux/jhash.h>
  22 #include <linux/slab.h>
  23 #include <linux/vmalloc.h>
  24 #include <net/netlink.h>
  25 #include <net/pkt_sched.h>
  26 #include <net/flow_keys.h>
  27 #include <net/red.h>
  28
  29
  30 /*      Stochastic Fairness Queuing algorithm.
  31         =======================================
  32
  33         Source:
  34         Paul E. McKenney "Stochastic Fairness Queuing",
  35         IEEE INFOCOMM'90 Proceedings, San Francisco, 1990.
  36
  37         Paul E. McKenney "Stochastic Fairness Queuing",
  38         "Interworking: Research and Experience", v.2, 1991, p.113-131.
  39
  40
  41         See also:
  42         M. Shreedhar and George Varghese "Efficient Fair
  43         Queuing using Deficit Round Robin", Proc. SIGCOMM 95.
  44
  45
  46         This is not the thing that is usually called (W)FQ nowadays.
  47         It does not use any timestamp mechanism, but instead
  48         processes queues in round-robin order.
  49
  50         ADVANTAGE:
  51
  52         - It is very cheap. Both CPU and memory requirements are minimal.
  53
  54         DRAWBACKS:
  55
  56         - "Stochastic" -> It is not 100% fair.
  57         When hash collisions occur, several flows are considered as one.
  58
  59         - "Round-robin" -> It introduces larger delays than virtual clock
  60         based schemes, and should not be used for isolating interactive
  61         traffic from non-interactive. It means, that this scheduler
  62         should be used as leaf of CBQ or P3, which put interactive traffic
  63         to higher priority band.
  64
  65         We still need true WFQ for top level CSZ, but using WFQ
  66         for the best effort traffic is absolutely pointless:
  67         SFQ is superior for this purpose.
  68
  69         IMPLEMENTATION:
  70         This implementation limits :
  71         - maximal queue length per flow to 127 packets.
  72         - max mtu to 2^18-1;
  73         - max 65408 flows,
  74         - number of hash buckets to 65536.
  75
  76         It is easy to increase these values, but not in flight.  */
  77
  78 #define SFQ_MAX_DEPTH           127 /* max number of packets per flow */
  79 #define SFQ_DEFAULT_FLOWS       128
  80 #define SFQ_MAX_FLOWS           (0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
  81 #define SFQ_EMPTY_SLOT          0xffff
  82 #define SFQ_DEFAULT_HASH_DIVISOR 1024
  83
  84 /* We use 16 bits to store allot, and want to handle packets up to 64K
  85  * Scale allot by 8 (1<<3) so that no overflow occurs.
  86  */
  87 #define SFQ_ALLOT_SHIFT         3
  88 #define SFQ_ALLOT_SIZE(X)       DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT)
  89
  90 /* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
  91 typedef u16 sfq_index;
  92
  93 /*
  94  * We dont use pointers to save space.
  95  * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array
  96  * while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
  97  * are 'pointers' to dep[] array
  98  */
  99 struct sfq_head {
 100         sfq_index       next;
 101         sfq_index       prev;
 102 };
 103
 104 struct sfq_slot {
 105         struct sk_buff  *skblist_next;
 106         struct sk_buff  *skblist_prev;
 107         sfq_index       qlen; /* number of skbs in skblist */
 108         sfq_index       next; /* next slot in sfq RR chain */
 109         struct sfq_head dep; /* anchor in dep[] chains */
 110         unsigned short  hash; /* hash value (index in ht[]) */
 111         short           allot; /* credit for this slot */
 112
 113         unsigned int    backlog;
 114         struct red_vars vars;
 115 };
 116
 117 struct sfq_sched_data {
 118 /* frequently used fields */
 119         int             limit;          /* limit of total number of packets in this qdisc */
 120         unsigned int    divisor;        /* number of slots in hash table */
 121         u8              headdrop;
 122         u8              maxdepth;       /* limit of packets per flow */
 123
 124         u32             perturbation;
 125         u8              cur_depth;      /* depth of longest slot */
 126         u8              flags;
 127         unsigned short  scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
 128         struct tcf_proto *filter_list;
 129         sfq_index       *ht;            /* Hash table ('divisor' slots) */
 130         struct sfq_slot *slots;         /* Flows table ('maxflows' entries) */
 131
 132         struct red_parms *red_parms;
 133         struct tc_sfqred_stats stats;
 134         struct sfq_slot *tail;          /* current slot in round */
 135
 136         struct sfq_head dep[SFQ_MAX_DEPTH + 1];
 137                                         /* Linked lists of slots, indexed by depth
 138                                          * dep[0] : list of unused flows
 139                                          * dep[1] : list of flows with 1 packet
 140                                          * dep[X] : list of flows with X packets
 141                                          */
 142
 143         unsigned int    maxflows;       /* number of flows in flows array */
 144         int             perturb_period;
 145         unsigned int    quantum;        /* Allotment per round: MUST BE >= MTU */
 146         struct timer_list perturb_timer;
 147 };
 148
 149 /*
 150  * sfq_head are either in a sfq_slot or in dep[] array
 151  */
 152 static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
 153 {
 154         if (val < SFQ_MAX_FLOWS)
 155                 return &q->slots[val].dep;
 156         return &q->dep[val - SFQ_MAX_FLOWS];
 157 }
 158
 159 /*
 160  * In order to be able to quickly rehash our queue when timer changes
 161  * q->perturbation, we store flow_keys in skb->cb[]
 162  */
 163 struct sfq_skb_cb {
 164        struct flow_keys        keys;
 165 };
 166
 167 static inline struct sfq_skb_cb *sfq_skb_cb(const struct sk_buff *skb)
 168 {
 169         qdisc_cb_private_validate(skb, sizeof(struct sfq_skb_cb));
 170         return (struct sfq_skb_cb *)qdisc_skb_cb(skb)->data;
 171 }
 172
 173 static unsigned int sfq_hash(const struct sfq_sched_data *q,
 174                              const struct sk_buff *skb)
 175 {
 176         const struct flow_keys *keys = &sfq_skb_cb(skb)->keys;
 177         unsigned int hash;
 178
 179         hash = jhash_3words((__force u32)keys->dst,
 180                             (__force u32)keys->src ^ keys->ip_proto,
 181                             (__force u32)keys->ports, q->perturbation);
 182         return hash & (q->divisor - 1);
 183 }
 184
 185 static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
 186                                  int *qerr)
 187 {
 188         struct sfq_sched_data *q = qdisc_priv(sch);
 189         struct tcf_result res;
 190         int result;
 191
 192         if (TC_H_MAJ(skb->priority) == sch->handle &&
 193             TC_H_MIN(skb->priority) > 0 &&
 194             TC_H_MIN(skb->priority) <= q->divisor)
 195                 return TC_H_MIN(skb->priority);
 196
 197         if (!q->filter_list) {
 198                 skb_flow_dissect(skb, &sfq_skb_cb(skb)->keys);
 199                 return sfq_hash(q, skb) + 1;
 200         }
 201
 202         *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 203         result = tc_classify(skb, q->filter_list, &res);
 204         if (result >= 0) {
 205 #ifdef CONFIG_NET_CLS_ACT
 206                 switch (result) {
 207                 case TC_ACT_STOLEN:
 208                 case TC_ACT_QUEUED:
 209                         *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
 210                 case TC_ACT_SHOT:
 211                         return 0;
 212                 }
 213 #endif
 214                 if (TC_H_MIN(res.classid) <= q->divisor)
 215                         return TC_H_MIN(res.classid);
 216         }
 217         return 0;
 218 }
 219
 220 /*
 221  * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
 222  */
 223 static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
 224 {
 225         sfq_index p, n;
 226         struct sfq_slot *slot = &q->slots[x];
 227         int qlen = slot->qlen;
 228
 229         p = qlen + SFQ_MAX_FLOWS;
 230         n = q->dep[qlen].next;
 231
 232         slot->dep.next = n;
 233         slot->dep.prev = p;
 234
 235         q->dep[qlen].next = x;          /* sfq_dep_head(q, p)->next = x */
 236         sfq_dep_head(q, n)->prev = x;
 237 }
 238
 239 #define sfq_unlink(q, x, n, p)                  \
 240         n = q->slots[x].dep.next;               \
 241         p = q->slots[x].dep.prev;               \
 242         sfq_dep_head(q, p)->next = n;           \
 243         sfq_dep_head(q, n)->prev = p
 244
 245
 246 static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
 247 {
 248         sfq_index p, n;
 249         int d;
 250
 251         sfq_unlink(q, x, n, p);
 252
 253         d = q->slots[x].qlen--;
 254         if (n == p && q->cur_depth == d)
 255                 q->cur_depth--;
 256         sfq_link(q, x);
 257 }
 258
 259 static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
 260 {
 261         sfq_index p, n;
 262         int d;
 263
 264         sfq_unlink(q, x, n, p);
 265
 266         d = ++q->slots[x].qlen;
 267         if (q->cur_depth < d)
 268                 q->cur_depth = d;
 269         sfq_link(q, x);
 270 }
 271
 272 /* helper functions : might be changed when/if skb use a standard list_head */
 273
 274 /* remove one skb from tail of slot queue */
 275 static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
 276 {
 277         struct sk_buff *skb = slot->skblist_prev;
 278
 279         slot->skblist_prev = skb->prev;
 280         skb->prev->next = (struct sk_buff *)slot;
 281         skb->next = skb->prev = NULL;
 282         return skb;
 283 }
 284
 285 /* remove one skb from head of slot queue */
 286 static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
 287 {
 288         struct sk_buff *skb = slot->skblist_next;
 289
 290         slot->skblist_next = skb->next;
 291         skb->next->prev = (struct sk_buff *)slot;
 292         skb->next = skb->prev = NULL;
 293         return skb;
 294 }
 295
 296 static inline void slot_queue_init(struct sfq_slot *slot)
 297 {
 298         memset(slot, 0, sizeof(*slot));
 299         slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
 300 }
 301
 302 /* add skb to slot queue (tail add) */
 303 static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
 304 {
 305         skb->prev = slot->skblist_prev;
 306         skb->next = (struct sk_buff *)slot;
 307         slot->skblist_prev->next = skb;
 308         slot->skblist_prev = skb;
 309 }
 310
 311 #define slot_queue_walk(slot, skb)              \
 312         for (skb = slot->skblist_next;          \
 313              skb != (struct sk_buff *)slot;     \
 314              skb = skb->next)
 315
 316 static unsigned int sfq_drop(struct Qdisc *sch)
 317 {
 318         struct sfq_sched_data *q = qdisc_priv(sch);
 319         sfq_index x, d = q->cur_depth;
 320         struct sk_buff *skb;
 321         unsigned int len;
 322         struct sfq_slot *slot;
 323
 324         /* Queue is full! Find the longest slot and drop tail packet from it */
 325         if (d > 1) {
 326                 x = q->dep[d].next;
 327                 slot = &q->slots[x];
 328 drop:
 329                 skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
 330                 len = qdisc_pkt_len(skb);
 331                 slot->backlog -= len;
 332                 sfq_dec(q, x);
 333                 kfree_skb(skb);
 334                 sch->q.qlen--;
 335                 sch->qstats.drops++;
 336                 sch->qstats.backlog -= len;
 337                 return len;
 338         }
 339
 340         if (d == 1) {
 341                 /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
 342                 x = q->tail->next;
 343                 slot = &q->slots[x];
 344                 q->tail->next = slot->next;
 345                 q->ht[slot->hash] = SFQ_EMPTY_SLOT;
 346                 goto drop;
 347         }
 348
 349         return 0;
 350 }
 351
 352 /* Is ECN parameter configured */
 353 static int sfq_prob_mark(const struct sfq_sched_data *q)
 354 {
 355         return q->flags & TC_RED_ECN;
 356 }
 357
 358 /* Should packets over max threshold just be marked */
 359 static int sfq_hard_mark(const struct sfq_sched_data *q)
 360 {
 361         return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
 362 }
 363
 364 static int sfq_headdrop(const struct sfq_sched_data *q)
 365 {
 366         return q->headdrop;
 367 }
 368
 369 static int
 370 sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 371 {
 372         struct sfq_sched_data *q = qdisc_priv(sch);
 373         unsigned int hash;
 374         sfq_index x, qlen;
 375         struct sfq_slot *slot;
 376         int uninitialized_var(ret);
 377         struct sk_buff *head;
 378         int delta;
 379
 380         hash = sfq_classify(skb, sch, &ret);
 381         if (hash == 0) {
 382                 if (ret & __NET_XMIT_BYPASS)
 383                         sch->qstats.drops++;
 384                 kfree_skb(skb);
 385                 return ret;
 386         }
 387         hash--;
 388
 389         x = q->ht[hash];
 390         slot = &q->slots[x];
 391         if (x == SFQ_EMPTY_SLOT) {
 392                 x = q->dep[0].next; /* get a free slot */
 393                 if (x >= SFQ_MAX_FLOWS)
 394                         return qdisc_drop(skb, sch);
 395                 q->ht[hash] = x;
 396                 slot = &q->slots[x];
 397                 slot->hash = hash;
 398                 slot->backlog = 0; /* should already be 0 anyway... */
 399                 red_set_vars(&slot->vars);
 400                 goto enqueue;
 401         }
 402         if (q->red_parms) {
 403                 slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
 404                                                         &slot->vars,
 405                                                         slot->backlog);
 406                 switch (red_action(q->red_parms,
 407                                    &slot->vars,
 408                                    slot->vars.qavg)) {
 409                 case RED_DONT_MARK:
 410                         break;
 411
 412                 case RED_PROB_MARK:
 413                         sch->qstats.overlimits++;
 414                         if (sfq_prob_mark(q)) {
 415                                 /* We know we have at least one packet in queue */
 416                                 if (sfq_headdrop(q) &&
 417                                     INET_ECN_set_ce(slot->skblist_next)) {
 418                                         q->stats.prob_mark_head++;
 419                                         break;
 420                                 }
 421                                 if (INET_ECN_set_ce(skb)) {
 422                                         q->stats.prob_mark++;
 423                                         break;
 424                                 }
 425                         }
 426                         q->stats.prob_drop++;
 427                         goto congestion_drop;
 428
 429                 case RED_HARD_MARK:
 430                         sch->qstats.overlimits++;
 431                         if (sfq_hard_mark(q)) {
 432                                 /* We know we have at least one packet in queue */
 433                                 if (sfq_headdrop(q) &&
 434                                     INET_ECN_set_ce(slot->skblist_next)) {
 435                                         q->stats.forced_mark_head++;
 436                                         break;
 437                                 }
 438                                 if (INET_ECN_set_ce(skb)) {
 439                                         q->stats.forced_mark++;
 440                                         break;
 441                                 }
 442                         }
 443                         q->stats.forced_drop++;
 444                         goto congestion_drop;
 445                 }
 446         }
 447
 448         if (slot->qlen >= q->maxdepth) {
 449 congestion_drop:
 450                 if (!sfq_headdrop(q))
 451                         return qdisc_drop(skb, sch);
 452
 453                 /* We know we have at least one packet in queue */
 454                 head = slot_dequeue_head(slot);
 455                 delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
 456                 sch->qstats.backlog -= delta;
 457                 slot->backlog -= delta;
 458                 qdisc_drop(head, sch);
 459
 460                 slot_queue_add(slot, skb);
 461                 return NET_XMIT_CN;
 462         }
 463
 464 enqueue:
 465         sch->qstats.backlog += qdisc_pkt_len(skb);
 466         slot->backlog += qdisc_pkt_len(skb);
 467         slot_queue_add(slot, skb);
 468         sfq_inc(q, x);
 469         if (slot->qlen == 1) {          /* The flow is new */
 470                 if (q->tail == NULL) {  /* It is the first flow */
 471                         slot->next = x;
 472                 } else {
 473                         slot->next = q->tail->next;
 474                         q->tail->next = x;
 475                 }
 476                 /* We put this flow at the end of our flow list.
 477                  * This might sound unfair for a new flow to wait after old ones,
 478                  * but we could endup servicing new flows only, and freeze old ones.
 479                  */
 480                 q->tail = slot;
 481                 /* We could use a bigger initial quantum for new flows */
 482                 slot->allot = q->scaled_quantum;
 483         }
 484         if (++sch->q.qlen <= q->limit)
 485                 return NET_XMIT_SUCCESS;
 486
 487         qlen = slot->qlen;
 488         sfq_drop(sch);
 489         /* Return Congestion Notification only if we dropped a packet
 490          * from this flow.
 491          */
 492         if (qlen != slot->qlen)
 493                 return NET_XMIT_CN;
 494
 495         /* As we dropped a packet, better let upper stack know this */
 496         qdisc_tree_decrease_qlen(sch, 1);
 497         return NET_XMIT_SUCCESS;
 498 }
 499
 500 static struct sk_buff *
 501 sfq_dequeue(struct Qdisc *sch)
 502 {
 503         struct sfq_sched_data *q = qdisc_priv(sch);
 504         struct sk_buff *skb;
 505         sfq_index a, next_a;
 506         struct sfq_slot *slot;
 507
 508         /* No active slots */
 509         if (q->tail == NULL)
 510                 return NULL;
 511
 512 next_slot:
 513         a = q->tail->next;
 514         slot = &q->slots[a];
 515         if (slot->allot <= 0) {
 516                 q->tail = slot;
 517                 slot->allot += q->scaled_quantum;
 518                 goto next_slot;
 519         }
 520         skb = slot_dequeue_head(slot);
 521         sfq_dec(q, a);
 522         qdisc_bstats_update(sch, skb);
 523         sch->q.qlen--;
 524         sch->qstats.backlog -= qdisc_pkt_len(skb);
 525         slot->backlog -= qdisc_pkt_len(skb);
 526         /* Is the slot empty? */
 527         if (slot->qlen == 0) {
 528                 q->ht[slot->hash] = SFQ_EMPTY_SLOT;
 529                 next_a = slot->next;
 530                 if (a == next_a) {
 531                         q->tail = NULL; /* no more active slots */
 532                         return skb;
 533                 }
 534                 q->tail->next = next_a;
 535         } else {
 536                 slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb));
 537         }
 538         return skb;
 539 }
 540
 541 static void
 542 sfq_reset(struct Qdisc *sch)
 543 {
 544         struct sk_buff *skb;
 545
 546         while ((skb = sfq_dequeue(sch)) != NULL)
 547                 kfree_skb(skb);
 548 }
 549
 550 /*
 551  * When q->perturbation is changed, we rehash all queued skbs
 552  * to avoid OOO (Out Of Order) effects.
 553  * We dont use sfq_dequeue()/sfq_enqueue() because we dont want to change
 554  * counters.
 555  */
 556 static void sfq_rehash(struct Qdisc *sch)
 557 {
 558         struct sfq_sched_data *q = qdisc_priv(sch);
 559         struct sk_buff *skb;
 560         int i;
 561         struct sfq_slot *slot;
 562         struct sk_buff_head list;
 563         int dropped = 0;
 564
 565         __skb_queue_head_init(&list);
 566
 567         for (i = 0; i < q->maxflows; i++) {
 568                 slot = &q->slots[i];
 569                 if (!slot->qlen)
 570                         continue;
 571                 while (slot->qlen) {
 572                         skb = slot_dequeue_head(slot);
 573                         sfq_dec(q, i);
 574                         __skb_queue_tail(&list, skb);
 575                 }
 576                 slot->backlog = 0;
 577                 red_set_vars(&slot->vars);
 578                 q->ht[slot->hash] = SFQ_EMPTY_SLOT;
 579         }
 580         q->tail = NULL;
 581
 582         while ((skb = __skb_dequeue(&list)) != NULL) {
 583                 unsigned int hash = sfq_hash(q, skb);
 584                 sfq_index x = q->ht[hash];
 585
 586                 slot = &q->slots[x];
 587                 if (x == SFQ_EMPTY_SLOT) {
 588                         x = q->dep[0].next; /* get a free slot */
 589                         if (x >= SFQ_MAX_FLOWS) {
 590 drop:                           sch->qstats.backlog -= qdisc_pkt_len(skb);
 591                                 kfree_skb(skb);
 592                                 dropped++;
 593                                 continue;
 594                         }
 595                         q->ht[hash] = x;
 596                         slot = &q->slots[x];
 597                         slot->hash = hash;
 598                 }
 599                 if (slot->qlen >= q->maxdepth)
 600                         goto drop;
 601                 slot_queue_add(slot, skb);
 602                 if (q->red_parms)
 603                         slot->vars.qavg = red_calc_qavg(q->red_parms,
 604                                                         &slot->vars,
 605                                                         slot->backlog);
 606                 slot->backlog += qdisc_pkt_len(skb);
 607                 sfq_inc(q, x);
 608                 if (slot->qlen == 1) {          /* The flow is new */
 609                         if (q->tail == NULL) {  /* It is the first flow */
 610                                 slot->next = x;
 611                         } else {
 612                                 slot->next = q->tail->next;
 613                                 q->tail->next = x;
 614                         }
 615                         q->tail = slot;
 616                         slot->allot = q->scaled_quantum;
 617                 }
 618         }
 619         sch->q.qlen -= dropped;
 620         qdisc_tree_decrease_qlen(sch, dropped);
 621 }
 622
 623 static void sfq_perturbation(unsigned long arg)
 624 {
 625         struct Qdisc *sch = (struct Qdisc *)arg;
 626         struct sfq_sched_data *q = qdisc_priv(sch);
 627         spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
 628
 629         spin_lock(root_lock);
 630         q->perturbation = net_random();
 631         if (!q->filter_list && q->tail)
 632                 sfq_rehash(sch);
 633         spin_unlock(root_lock);
 634
 635         if (q->perturb_period)
 636                 mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
 637 }
 638
 639 static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
 640 {
 641         struct sfq_sched_data *q = qdisc_priv(sch);
 642         struct tc_sfq_qopt *ctl = nla_data(opt);
 643         struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
 644         unsigned int qlen;
 645         struct red_parms *p = NULL;
 646
 647         if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
 648                 return -EINVAL;
 649         if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
 650                 ctl_v1 = nla_data(opt);
 651         if (ctl->divisor &&
 652             (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
 653                 return -EINVAL;
 654         if (ctl_v1 && ctl_v1->qth_min) {
 655                 p = kmalloc(sizeof(*p), GFP_KERNEL);
 656                 if (!p)
 657                         return -ENOMEM;
 658         }
 659         sch_tree_lock(sch);
 660         if (ctl->quantum) {
 661                 q->quantum = ctl->quantum;
 662                 q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
 663         }
 664         q->perturb_period = ctl->perturb_period * HZ;
 665         if (ctl->flows)
 666                 q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
 667         if (ctl->divisor) {
 668                 q->divisor = ctl->divisor;
 669                 q->maxflows = min_t(u32, q->maxflows, q->divisor);
 670         }
 671         if (ctl_v1) {
 672                 if (ctl_v1->depth)
 673                         q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
 674                 if (p) {
 675                         swap(q->red_parms, p);
 676                         red_set_parms(q->red_parms,
 677                                       ctl_v1->qth_min, ctl_v1->qth_max,
 678                                       ctl_v1->Wlog,
 679                                       ctl_v1->Plog, ctl_v1->Scell_log,
 680                                       NULL,
 681                                       ctl_v1->max_P);
 682                 }
 683                 q->flags = ctl_v1->flags;
 684                 q->headdrop = ctl_v1->headdrop;
 685         }
 686         if (ctl->limit) {
 687                 q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
 688                 q->maxflows = min_t(u32, q->maxflows, q->limit);
 689         }
 690
 691         qlen = sch->q.qlen;
 692         while (sch->q.qlen > q->limit)
 693                 sfq_drop(sch);
 694         qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
 695
 696         del_timer(&q->perturb_timer);
 697         if (q->perturb_period) {
 698                 mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
 699                 q->perturbation = net_random();
 700         }
 701         sch_tree_unlock(sch);
 702         kfree(p);
 703         return 0;
 704 }
 705
 706 static void *sfq_alloc(size_t sz)
 707 {
 708         void *ptr = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN);
 709
 710         if (!ptr)
 711                 ptr = vmalloc(sz);
 712         return ptr;
 713 }
 714
 715 static void sfq_free(void *addr)
 716 {
 717         if (addr) {
 718                 if (is_vmalloc_addr(addr))
 719                         vfree(addr);
 720                 else
 721                         kfree(addr);
 722         }
 723 }
 724
 725 static void sfq_destroy(struct Qdisc *sch)
 726 {
 727         struct sfq_sched_data *q = qdisc_priv(sch);
 728
 729         tcf_destroy_chain(&q->filter_list);
 730         q->perturb_period = 0;
 731         del_timer_sync(&q->perturb_timer);
 732         sfq_free(q->ht);
 733         sfq_free(q->slots);
 734         kfree(q->red_parms);
 735 }
 736
 737 static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
 738 {
 739         struct sfq_sched_data *q = qdisc_priv(sch);
 740         int i;
 741
 742         q->perturb_timer.function = sfq_perturbation;
 743         q->perturb_timer.data = (unsigned long)sch;
 744         init_timer_deferrable(&q->perturb_timer);
 745
 746         for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
 747                 q->dep[i].next = i + SFQ_MAX_FLOWS;
 748                 q->dep[i].prev = i + SFQ_MAX_FLOWS;
 749         }
 750
 751         q->limit = SFQ_MAX_DEPTH;
 752         q->maxdepth = SFQ_MAX_DEPTH;
 753         q->cur_depth = 0;
 754         q->tail = NULL;
 755         q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
 756         q->maxflows = SFQ_DEFAULT_FLOWS;
 757         q->quantum = psched_mtu(qdisc_dev(sch));
 758         q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
 759         q->perturb_period = 0;
 760         q->perturbation = net_random();
 761
 762         if (opt) {
 763                 int err = sfq_change(sch, opt);
 764                 if (err)
 765                         return err;
 766         }
 767
 768         q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
 769         q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
 770         if (!q->ht || !q->slots) {
 771                 sfq_destroy(sch);
 772                 return -ENOMEM;
 773         }
 774         for (i = 0; i < q->divisor; i++)
 775                 q->ht[i] = SFQ_EMPTY_SLOT;
 776
 777         for (i = 0; i < q->maxflows; i++) {
 778                 slot_queue_init(&q->slots[i]);
 779                 sfq_link(q, i);
 780         }
 781         if (q->limit >= 1)
 782                 sch->flags |= TCQ_F_CAN_BYPASS;
 783         else
 784                 sch->flags &= ~TCQ_F_CAN_BYPASS;
 785         return 0;
 786 }
 787
 788 static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
 789 {
 790         struct sfq_sched_data *q = qdisc_priv(sch);
 791         unsigned char *b = skb_tail_pointer(skb);
 792         struct tc_sfq_qopt_v1 opt;
 793         struct red_parms *p = q->red_parms;
 794
 795         memset(&opt, 0, sizeof(opt));
 796         opt.v0.quantum  = q->quantum;
 797         opt.v0.perturb_period = q->perturb_period / HZ;
 798         opt.v0.limit    = q->limit;
 799         opt.v0.divisor  = q->divisor;
 800         opt.v0.flows    = q->maxflows;
 801         opt.depth       = q->maxdepth;
 802         opt.headdrop    = q->headdrop;
 803
 804         if (p) {
 805                 opt.qth_min     = p->qth_min >> p->Wlog;
 806                 opt.qth_max     = p->qth_max >> p->Wlog;
 807                 opt.Wlog        = p->Wlog;
 808                 opt.Plog        = p->Plog;
 809                 opt.Scell_log   = p->Scell_log;
 810                 opt.max_P       = p->max_P;
 811         }
 812         memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
 813         opt.flags       = q->flags;
 814
 815         if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
 816                 goto nla_put_failure;
 817
 818         return skb->len;
 819
 820 nla_put_failure:
 821         nlmsg_trim(skb, b);
 822         return -1;
 823 }
 824
 825 static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
 826 {
 827         return NULL;
 828 }
 829
 830 static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
 831 {
 832         return 0;
 833 }
 834
 835 static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
 836                               u32 classid)
 837 {
 838         /* we cannot bypass queue discipline anymore */
 839         sch->flags &= ~TCQ_F_CAN_BYPASS;
 840         return 0;
 841 }
 842
 843 static void sfq_put(struct Qdisc *q, unsigned long cl)
 844 {
 845 }
 846
 847 static struct tcf_proto **sfq_find_tcf(struct Qdisc *sch, unsigned long cl)
 848 {
 849         struct sfq_sched_data *q = qdisc_priv(sch);
 850
 851         if (cl)
 852                 return NULL;
 853         return &q->filter_list;
 854 }
 855
 856 static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
 857                           struct sk_buff *skb, struct tcmsg *tcm)
 858 {
 859         tcm->tcm_handle |= TC_H_MIN(cl);
 860         return 0;
 861 }
 862
 863 static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
 864                                 struct gnet_dump *d)
 865 {
 866         struct sfq_sched_data *q = qdisc_priv(sch);
 867         sfq_index idx = q->ht[cl - 1];
 868         struct gnet_stats_queue qs = { 0 };
 869         struct tc_sfq_xstats xstats = { 0 };
 870
 871         if (idx != SFQ_EMPTY_SLOT) {
 872                 const struct sfq_slot *slot = &q->slots[idx];
 873
 874                 xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
 875                 qs.qlen = slot->qlen;
 876                 qs.backlog = slot->backlog;
 877         }
 878         if (gnet_stats_copy_queue(d, &qs) < 0)
 879                 return -1;
 880         return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
 881 }
 882
 883 static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
 884 {
 885         struct sfq_sched_data *q = qdisc_priv(sch);
 886         unsigned int i;
 887
 888         if (arg->stop)
 889                 return;
 890
 891         for (i = 0; i < q->divisor; i++) {
 892                 if (q->ht[i] == SFQ_EMPTY_SLOT ||
 893                     arg->count < arg->skip) {
 894                         arg->count++;
 895                         continue;
 896                 }
 897                 if (arg->fn(sch, i + 1, arg) < 0) {
 898                         arg->stop = 1;
 899                         break;
 900                 }
 901                 arg->count++;
 902         }
 903 }
 904
 905 static const struct Qdisc_class_ops sfq_class_ops = {
 906         .leaf           =       sfq_leaf,
 907         .get            =       sfq_get,
 908         .put            =       sfq_put,
 909         .tcf_chain      =       sfq_find_tcf,
 910         .bind_tcf       =       sfq_bind,
 911         .unbind_tcf     =       sfq_put,
 912         .dump           =       sfq_dump_class,
 913         .dump_stats     =       sfq_dump_class_stats,
 914         .walk           =       sfq_walk,
 915 };
 916
 917 static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
 918         .cl_ops         =       &sfq_class_ops,
 919         .id             =       "sfq",
 920         .priv_size      =       sizeof(struct sfq_sched_data),
 921         .enqueue        =       sfq_enqueue,
 922         .dequeue        =       sfq_dequeue,
 923         .peek           =       qdisc_peek_dequeued,
 924         .drop           =       sfq_drop,
 925         .init           =       sfq_init,
 926         .reset          =       sfq_reset,
 927         .destroy        =       sfq_destroy,
 928         .change         =       NULL,
 929         .dump           =       sfq_dump,
 930         .owner          =       THIS_MODULE,
 931 };
 932
 933 static int __init sfq_module_init(void)
 934 {
 935         return register_qdisc(&sfq_qdisc_ops);
 936 }
 937 static void __exit sfq_module_exit(void)
 938 {
 939         unregister_qdisc(&sfq_qdisc_ops);
 940 }
 941 module_init(sfq_module_init)
 942 module_exit(sfq_module_exit)
 943 MODULE_LICENSE("GPL");