/*
 * net/sched/sch_sfq.c	Stochastic Fairness Queueing discipline.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/jhash.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/flow_keys.h>
#include <net/red.h>
/*	Stochastic Fairness Queuing algorithm.
	=======================================

	Source:
	Paul E. McKenney "Stochastic Fairness Queuing",
	IEEE INFOCOM'90 Proceedings, San Francisco, 1990.

	Paul E. McKenney "Stochastic Fairness Queuing",
	"Interworking: Research and Experience", v.2, 1991, p.113-131.

	See also:
	M. Shreedhar and George Varghese "Efficient Fair
	Queuing using Deficit Round Robin", Proc. SIGCOMM 95.

	This is not the thing that is usually called (W)FQ nowadays.
	It does not use any timestamp mechanism, but instead
	processes queues in round-robin order.

	ADVANTAGE:

	- It is very cheap. Both CPU and memory requirements are minimal.

	DRAWBACKS:

	- "Stochastic" -> It is not 100% fair.
	When hash collisions occur, several flows are considered as one.

	- "Round-robin" -> It introduces larger delays than virtual clock
	based schemes, and should not be used for isolating interactive
	traffic from non-interactive. This means that this scheduler
	should be used as a leaf of CBQ or P3, which put interactive
	traffic into the higher priority band.

	We still need true WFQ for the top level CSZ, but using WFQ
	for best effort traffic is absolutely pointless:
	SFQ is superior for this purpose.

	IMPLEMENTATION:
	This implementation limits:
	- maximal queue length per flow to 127 packets;
	- max mtu to 2^18-1;
	- max 65408 flows;
	- number of hash buckets to 65536.

	It is easy to increase these values, but not in flight.  */
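
/* A note on the limits above (illustration, derived from the constants
 * below): SFQ_MAX_FLOWS = 0x10000 - 127 - 1 = 65408, so the 65408 flow
 * slots plus the 128 dep[] anchors use exactly 65536 = 2^16 values and
 * everything fits into a 16-bit sfq_index. (SFQ_EMPTY_SLOT = 0xffff
 * overlaps the last dep[] anchor value, but it only ever appears in
 * ht[] entries, never inside the dep chains.)
 */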
#define SFQ_MAX_DEPTH		127 /* max number of packets per flow */
#define SFQ_DEFAULT_FLOWS	128
#define SFQ_MAX_FLOWS		(0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
#define SFQ_EMPTY_SLOT		0xffff
#define SFQ_DEFAULT_HASH_DIVISOR 1024

/* We use 16 bits to store allot, and want to handle packets up to 64K.
 * Scale allot by 8 (1<<3) so that no overflow occurs.
 */
#define SFQ_ALLOT_SHIFT		3
#define SFQ_ALLOT_SIZE(X)	DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT)
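
/* Worked example of the allot scaling (illustration only): for a
 * full-size Ethernet frame, SFQ_ALLOT_SIZE(1514) = DIV_ROUND_UP(1514, 8)
 * = 190, and even a maximal 64K packet scales to 65536 / 8 = 8192,
 * which fits comfortably in the 16-bit signed allot field.
 */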

/* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
typedef u16 sfq_index;

/*
 * We don't use pointers, to save space.
 * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array,
 * while the following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
 * are 'pointers' to dep[] array.
 */
struct sfq_head {
	sfq_index	next;
	sfq_index	prev;
};

struct sfq_slot {
	struct sk_buff	*skblist_next;
	struct sk_buff	*skblist_prev;
	sfq_index	qlen;		/* number of skbs in skblist */
	sfq_index	next;		/* next slot in sfq RR chain */
	struct sfq_head	dep;		/* anchor in dep[] chains */
	unsigned short	hash;		/* hash value (index in ht[]) */
	short		allot;		/* credit for this slot */

	unsigned int	backlog;
	struct red_vars vars;
};

struct sfq_sched_data {
/* frequently used fields */
	int		limit;		/* limit of total number of packets in this qdisc */
	unsigned int	divisor;	/* number of slots in hash table */
	u8		headdrop;
	u8		maxdepth;	/* limit of packets per flow */

	u32		perturbation;
	u8		cur_depth;	/* depth of longest slot */
	u8		flags;
	unsigned short	scaled_quantum;	/* SFQ_ALLOT_SIZE(quantum) */
	struct tcf_proto *filter_list;
	sfq_index	*ht;		/* Hash table ('divisor' slots) */
	struct sfq_slot	*slots;		/* Flows table ('maxflows' entries) */

	struct red_parms *red_parms;
	struct tc_sfqred_stats stats;
	struct sfq_slot *tail;		/* current slot in round */

	struct sfq_head	dep[SFQ_MAX_DEPTH + 1];
					/* Linked lists of slots, indexed by depth
					 * dep[0] : list of unused flows
					 * dep[1] : list of flows with 1 packet
					 * dep[X] : list of flows with X packets
					 */

	unsigned int	maxflows;	/* number of flows in flows array */
	int		perturb_period;
	unsigned int	quantum;	/* Allotment per round: MUST BE >= MTU */
	struct timer_list perturb_timer;
};

/*
 * sfq_head are either in a sfq_slot or in dep[] array
 */
static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
{
	if (val < SFQ_MAX_FLOWS)
		return &q->slots[val].dep;
	return &q->dep[val - SFQ_MAX_FLOWS];
}
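
/* For illustration: sfq_dep_head(q, 2) yields &q->slots[2].dep (index 2
 * is a flow slot), while sfq_dep_head(q, SFQ_MAX_FLOWS + 2) yields
 * &q->dep[2], the anchor of the list of flows holding two packets.
 */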

/*
 * In order to be able to quickly rehash our queue when the timer changes
 * q->perturbation, we store flow_keys in skb->cb[]
 */
struct sfq_skb_cb {
	struct flow_keys	keys;
};

static inline struct sfq_skb_cb *sfq_skb_cb(const struct sk_buff *skb)
{
	qdisc_cb_private_validate(skb, sizeof(struct sfq_skb_cb));
	return (struct sfq_skb_cb *)qdisc_skb_cb(skb)->data;
}

static unsigned int sfq_hash(const struct sfq_sched_data *q,
			     const struct sk_buff *skb)
{
	const struct flow_keys *keys = &sfq_skb_cb(skb)->keys;
	unsigned int hash;

	hash = jhash_3words((__force u32)keys->dst,
			    (__force u32)keys->src ^ keys->ip_proto,
			    (__force u32)keys->ports, q->perturbation);
	return hash & (q->divisor - 1);
}
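
/* Example (illustration only): with the default divisor of 1024, the
 * jhash result is masked with 1023, i.e. the low 10 bits select one of
 * 1024 buckets; this is why sfq_change() requires the divisor to be a
 * power of two. A new perturbation seed re-randomizes which flows
 * collide in a bucket.
 */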

static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
				 int *qerr)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tcf_result res;
	int result;

	if (TC_H_MAJ(skb->priority) == sch->handle &&
	    TC_H_MIN(skb->priority) > 0 &&
	    TC_H_MIN(skb->priority) <= q->divisor)
		return TC_H_MIN(skb->priority);

	if (!q->filter_list) {
		skb_flow_dissect(skb, &sfq_skb_cb(skb)->keys);
		return sfq_hash(q, skb) + 1;
	}

	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	result = tc_classify(skb, q->filter_list, &res);
	if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_STOLEN:
		case TC_ACT_QUEUED:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
			/* fall through: stolen/queued packets return 0 too */
		case TC_ACT_SHOT:
			return 0;
		}
#endif
		if (TC_H_MIN(res.classid) <= q->divisor)
			return TC_H_MIN(res.classid);
	}
	return 0;
}

/*
 * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
 */
static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	struct sfq_slot *slot = &q->slots[x];
	int qlen = slot->qlen;

	p = qlen + SFQ_MAX_FLOWS;
	n = q->dep[qlen].next;

	slot->dep.next = n;
	slot->dep.prev = p;

	q->dep[qlen].next = x;		/* sfq_dep_head(q, p)->next = x */
	sfq_dep_head(q, n)->prev = x;
}

#define sfq_unlink(q, x, n, p)			\
	n = q->slots[x].dep.next;		\
	p = q->slots[x].dep.prev;		\
	sfq_dep_head(q, p)->next = n;		\
	sfq_dep_head(q, n)->prev = p

static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d;

	sfq_unlink(q, x, n, p);

	d = q->slots[x].qlen--;
	if (n == p && q->cur_depth == d)
		q->cur_depth--;
	sfq_link(q, x);
}

static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d;

	sfq_unlink(q, x, n, p);

	d = ++q->slots[x].qlen;
	if (q->cur_depth < d)
		q->cur_depth = d;
	sfq_link(q, x);
}
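
/* A sketch of how the depth chains evolve (illustration only): when
 * sfq_inc() takes a slot's qlen from 2 to 3, sfq_unlink() removes it
 * from the dep[2] chain and sfq_link() reinserts it at the head of the
 * dep[3] chain. Because q->dep[d].next is always a slot of depth d,
 * sfq_drop() can find one of the longest flows in O(1) as
 * q->dep[q->cur_depth].next.
 */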

/* helper functions : might be changed when/if skb use a standard list_head */

/* remove one skb from tail of slot queue */
static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
{
	struct sk_buff *skb = slot->skblist_prev;

	slot->skblist_prev = skb->prev;
	skb->prev->next = (struct sk_buff *)slot;
	skb->next = skb->prev = NULL;
	return skb;
}

/* remove one skb from head of slot queue */
static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
{
	struct sk_buff *skb = slot->skblist_next;

	slot->skblist_next = skb->next;
	skb->next->prev = (struct sk_buff *)slot;
	skb->next = skb->prev = NULL;
	return skb;
}

static inline void slot_queue_init(struct sfq_slot *slot)
{
	memset(slot, 0, sizeof(*slot));
	slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
}

/* add skb to slot queue (tail add) */
static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
{
	skb->prev = slot->skblist_prev;
	skb->next = (struct sk_buff *)slot;
	slot->skblist_prev->next = skb;
	slot->skblist_prev = skb;
}

#define	slot_queue_walk(slot, skb)		\
	for (skb = slot->skblist_next;		\
	     skb != (struct sk_buff *)slot;	\
	     skb = skb->next)
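
/* Hypothetical use of slot_queue_walk() (a minimal sketch of the
 * iterator; this qdisc never actually walks a slot this way):
 *
 *	struct sk_buff *skb;
 *	unsigned int bytes = 0;
 *
 *	slot_queue_walk(slot, skb)
 *		bytes += qdisc_pkt_len(skb);
 *
 * which recomputes what the slot's backlog field tracks incrementally.
 */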

static unsigned int sfq_drop(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index x, d = q->cur_depth;
	struct sk_buff *skb;
	unsigned int len;
	struct sfq_slot *slot;

	/* Queue is full! Find the longest slot and drop tail packet from it */
	if (d > 1) {
		x = q->dep[d].next;
		slot = &q->slots[x];
drop:
		skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
		len = qdisc_pkt_len(skb);
		slot->backlog -= len;
		sfq_dec(q, x);
		kfree_skb(skb);
		sch->q.qlen--;
		sch->qstats.drops++;
		sch->qstats.backlog -= len;
		return len;
	}

	if (d == 1) {
		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
		x = q->tail->next;
		slot = &q->slots[x];
		q->tail->next = slot->next;
		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
		goto drop;
	}

	return 0;
}

/* Is the ECN parameter configured? */
static int sfq_prob_mark(const struct sfq_sched_data *q)
{
	return q->flags & TC_RED_ECN;
}

/* Should packets over the max threshold just be marked? */
static int sfq_hard_mark(const struct sfq_sched_data *q)
{
	return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
}

static int sfq_headdrop(const struct sfq_sched_data *q)
{
	return q->headdrop;
}
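
/* Summary of the RED marking modes encoded in q->flags (derived from the
 * two helpers above):
 *   neither flag set             -> over-threshold packets are dropped;
 *   TC_RED_ECN                   -> packets are ECN-marked both at the
 *                                   probabilistic and the hard threshold;
 *   TC_RED_ECN | TC_RED_HARDDROP -> probabilistic marking only; packets
 *                                   over the max threshold are dropped.
 */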

static int
sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int hash;
	sfq_index x, qlen;
	struct sfq_slot *slot;
	int uninitialized_var(ret);
	struct sk_buff *head;
	int delta;

	hash = sfq_classify(skb, sch, &ret);
	if (hash == 0) {
		if (ret & __NET_XMIT_BYPASS)
			sch->qstats.drops++;
		kfree_skb(skb);
		return ret;
	}
	hash--;

	x = q->ht[hash];
	slot = &q->slots[x];
	if (x == SFQ_EMPTY_SLOT) {
		x = q->dep[0].next; /* get a free slot */
		if (x >= SFQ_MAX_FLOWS)
			return qdisc_drop(skb, sch);
		q->ht[hash] = x;
		slot = &q->slots[x];
		slot->hash = hash;
		slot->backlog = 0; /* should already be 0 anyway... */
		red_set_vars(&slot->vars);
		goto enqueue;
	}
	if (q->red_parms) {
		slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
							     &slot->vars,
							     slot->backlog);
		switch (red_action(q->red_parms,
				   &slot->vars,
				   slot->vars.qavg)) {
		case RED_DONT_MARK:
			break;

		case RED_PROB_MARK:
			sch->qstats.overlimits++;
			if (sfq_prob_mark(q)) {
				/* We know we have at least one packet in queue */
				if (sfq_headdrop(q) &&
				    INET_ECN_set_ce(slot->skblist_next)) {
					q->stats.prob_mark_head++;
					break;
				}
				if (INET_ECN_set_ce(skb)) {
					q->stats.prob_mark++;
					break;
				}
			}
			q->stats.prob_drop++;
			goto congestion_drop;

		case RED_HARD_MARK:
			sch->qstats.overlimits++;
			if (sfq_hard_mark(q)) {
				/* We know we have at least one packet in queue */
				if (sfq_headdrop(q) &&
				    INET_ECN_set_ce(slot->skblist_next)) {
					q->stats.forced_mark_head++;
					break;
				}
				if (INET_ECN_set_ce(skb)) {
					q->stats.forced_mark++;
					break;
				}
			}
			q->stats.forced_drop++;
			goto congestion_drop;
		}
	}

	if (slot->qlen >= q->maxdepth) {
congestion_drop:
		if (!sfq_headdrop(q))
			return qdisc_drop(skb, sch);

		/* We know we have at least one packet in queue */
		head = slot_dequeue_head(slot);
		delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
		sch->qstats.backlog -= delta;
		slot->backlog -= delta;
		qdisc_drop(head, sch);

		slot_queue_add(slot, skb);
		return NET_XMIT_CN;
	}

enqueue:
	sch->qstats.backlog += qdisc_pkt_len(skb);
	slot->backlog += qdisc_pkt_len(skb);
	slot_queue_add(slot, skb);
	sfq_inc(q, x);
	if (slot->qlen == 1) {		/* The flow is new */
		if (q->tail == NULL) {	/* It is the first flow */
			slot->next = x;
			q->tail = slot;
		} else {
			slot->next = q->tail->next;
			q->tail->next = x;
		}
		/* We could use a bigger initial quantum for new flows */
		slot->allot = q->scaled_quantum;
	}
	if (++sch->q.qlen <= q->limit)
		return NET_XMIT_SUCCESS;

	qlen = slot->qlen;
	sfq_drop(sch);
	/* Return Congestion Notification only if we dropped a packet
	 * from this flow.
	 */
	if (qlen != slot->qlen)
		return NET_XMIT_CN;

	/* As we dropped a packet, better let upper stack know this */
	qdisc_tree_decrease_qlen(sch, 1);
	return NET_XMIT_SUCCESS;
}

static struct sk_buff *
sfq_dequeue(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	sfq_index a, next_a;
	struct sfq_slot *slot;

	/* No active slots */
	if (q->tail == NULL)
		return NULL;

next_slot:
	a = q->tail->next;
	slot = &q->slots[a];
	if (slot->allot <= 0) {
		q->tail = slot;
		slot->allot += q->scaled_quantum;
		goto next_slot;
	}
	skb = slot_dequeue_head(slot);
	sfq_dec(q, a);
	qdisc_bstats_update(sch, skb);
	sch->q.qlen--;
	sch->qstats.backlog -= qdisc_pkt_len(skb);
	slot->backlog -= qdisc_pkt_len(skb);
	/* Is the slot empty? */
	if (slot->qlen == 0) {
		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
		next_a = slot->next;
		if (a == next_a) {
			q->tail = NULL; /* no more active slots */
			return skb;
		}
		q->tail->next = next_a;
	} else {
		slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb));
	}
	return skb;
}
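
/* Worked example of the deficit round-robin accounting above
 * (illustration only): with a 1514-byte quantum, scaled_quantum =
 * SFQ_ALLOT_SIZE(1514) = 190. Dequeuing a 1514-byte packet costs the
 * slot 190 allot units, exhausting one round's credit exactly, while a
 * 64K GSO packet costs about 8192 units, driving allot negative so the
 * flow is skipped (and replenished) for roughly 8192/190 ~= 43 rounds.
 * This is how byte-level fairness emerges from packet-level scheduling.
 */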

static void
sfq_reset(struct Qdisc *sch)
{
	struct sk_buff *skb;

	while ((skb = sfq_dequeue(sch)) != NULL)
		kfree_skb(skb);
}

/*
 * When q->perturbation is changed, we rehash all queued skbs
 * to avoid OOO (Out Of Order) effects.
 * We don't use sfq_dequeue()/sfq_enqueue() because we don't want to change
 * counters.
 */
static void sfq_rehash(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	int i;
	struct sfq_slot *slot;
	struct sk_buff_head list;
	int dropped = 0;

	__skb_queue_head_init(&list);

	for (i = 0; i < q->maxflows; i++) {
		slot = &q->slots[i];
		if (!slot->qlen)
			continue;
		while (slot->qlen) {
			skb = slot_dequeue_head(slot);
			sfq_dec(q, i);
			__skb_queue_tail(&list, skb);
		}
		slot->backlog = 0;
		red_set_vars(&slot->vars);
		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
	}
	q->tail = NULL;

	while ((skb = __skb_dequeue(&list)) != NULL) {
		unsigned int hash = sfq_hash(q, skb);
		sfq_index x = q->ht[hash];

		slot = &q->slots[x];
		if (x == SFQ_EMPTY_SLOT) {
			x = q->dep[0].next; /* get a free slot */
			if (x >= SFQ_MAX_FLOWS) {
drop:
				sch->qstats.backlog -= qdisc_pkt_len(skb);
				kfree_skb(skb);
				dropped++;
				continue;
			}
			q->ht[hash] = x;
			slot = &q->slots[x];
			slot->hash = hash;
		}
		if (slot->qlen >= q->maxdepth)
			goto drop;
		slot_queue_add(slot, skb);
		if (q->red_parms)
			slot->vars.qavg = red_calc_qavg(q->red_parms,
							&slot->vars,
							slot->backlog);
		slot->backlog += qdisc_pkt_len(skb);
		sfq_inc(q, x);
		if (slot->qlen == 1) { /* The flow is new */
			if (q->tail == NULL) { /* It is the first flow */
				slot->next = x;
			} else {
				slot->next = q->tail->next;
				q->tail->next = x;
			}
			q->tail = slot;
			slot->allot = q->scaled_quantum;
		}
	}
	sch->q.qlen -= dropped;
	qdisc_tree_decrease_qlen(sch, dropped);
}

static void sfq_perturbation(unsigned long arg)
{
	struct Qdisc *sch = (struct Qdisc *)arg;
	struct sfq_sched_data *q = qdisc_priv(sch);
	spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));

	spin_lock(root_lock);
	q->perturbation = net_random();
	if (!q->filter_list && q->tail)
		sfq_rehash(sch);
	spin_unlock(root_lock);

	if (q->perturb_period)
		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
}

static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tc_sfq_qopt *ctl = nla_data(opt);
	struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
	unsigned int qlen;
	struct red_parms *p = NULL;

	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
		return -EINVAL;
	if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
		ctl_v1 = nla_data(opt);
	if (ctl->divisor &&
	    (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
		return -EINVAL;
	if (ctl_v1 && ctl_v1->qth_min) {
		p = kmalloc(sizeof(*p), GFP_KERNEL);
		if (!p)
			return -ENOMEM;
	}
	sch_tree_lock(sch);
	if (ctl->quantum) {
		q->quantum = ctl->quantum;
		q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
	}
	q->perturb_period = ctl->perturb_period * HZ;
	if (ctl->flows)
		q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
	if (ctl->divisor) {
		q->divisor = ctl->divisor;
		q->maxflows = min_t(u32, q->maxflows, q->divisor);
	}
	if (ctl_v1) {
		if (ctl_v1->depth)
			q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
		if (p) {
			swap(q->red_parms, p);
			red_set_parms(q->red_parms,
				      ctl_v1->qth_min, ctl_v1->qth_max,
				      ctl_v1->Wlog,
				      ctl_v1->Plog, ctl_v1->Scell_log,
				      NULL,
				      ctl_v1->max_P);
		}
		q->flags = ctl_v1->flags;
		q->headdrop = ctl_v1->headdrop;
	}
	if (ctl->limit) {
		q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
		q->maxflows = min_t(u32, q->maxflows, q->limit);
	}

	qlen = sch->q.qlen;
	while (sch->q.qlen > q->limit)
		sfq_drop(sch);
	qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);

	del_timer(&q->perturb_timer);
	if (q->perturb_period) {
		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
		q->perturbation = net_random();
	}
	sch_tree_unlock(sch);
	kfree(p);
	return 0;
}
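
/* Example configuration exercising sfq_change() (an illustration,
 * assuming the tc utility from a matching iproute2; see tc-sfq(8)):
 *
 *	tc qdisc add dev eth0 root sfq limit 3000 flows 512 \
 *		divisor 16384 perturb 10 headdrop
 *
 * divisor must be a power of two not larger than 65536, and limit is
 * clamped to maxdepth * maxflows, as enforced above.
 */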

static void *sfq_alloc(size_t sz)
{
	void *ptr = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN);

	if (!ptr)
		ptr = vmalloc(sz);
	return ptr;
}

static void sfq_free(void *addr)
{
	if (addr) {
		if (is_vmalloc_addr(addr))
			vfree(addr);
		else
			kfree(addr);
	}
}
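
/* Why the vmalloc() fallback (illustration): with the maximum divisor of
 * 65536, ht[] alone needs 65536 * sizeof(sfq_index) = 128K of contiguous
 * memory, and slots[] can be far larger; such high-order kmalloc()
 * allocations may fail on a fragmented system, so we fall back to
 * vmalloc(). sfq_free() then picks the matching release path via
 * is_vmalloc_addr().
 */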

static void sfq_destroy(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	tcf_destroy_chain(&q->filter_list);
	q->perturb_period = 0;
	del_timer_sync(&q->perturb_timer);
	sfq_free(q->ht);
	sfq_free(q->slots);
	kfree(q->red_parms);
}

static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	int i;

	q->perturb_timer.function = sfq_perturbation;
	q->perturb_timer.data = (unsigned long)sch;
	init_timer_deferrable(&q->perturb_timer);

	for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
		q->dep[i].next = i + SFQ_MAX_FLOWS;
		q->dep[i].prev = i + SFQ_MAX_FLOWS;
	}

	q->limit = SFQ_MAX_DEPTH;
	q->maxdepth = SFQ_MAX_DEPTH;
	q->cur_depth = 0;
	q->tail = NULL;
	q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
	q->maxflows = SFQ_DEFAULT_FLOWS;
	q->quantum = psched_mtu(qdisc_dev(sch));
	q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
	q->perturb_period = 0;
	q->perturbation = net_random();

	if (opt) {
		int err = sfq_change(sch, opt);
		if (err)
			return err;
	}

	q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
	q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
	if (!q->ht || !q->slots) {
		sfq_destroy(sch);
		return -ENOMEM;
	}
	for (i = 0; i < q->divisor; i++)
		q->ht[i] = SFQ_EMPTY_SLOT;

	for (i = 0; i < q->maxflows; i++) {
		slot_queue_init(&q->slots[i]);
		sfq_link(q, i);
	}
	if (q->limit >= 1)
		sch->flags |= TCQ_F_CAN_BYPASS;
	else
		sch->flags &= ~TCQ_F_CAN_BYPASS;
	return 0;
}

static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_sfq_qopt_v1 opt;
	struct red_parms *p = q->red_parms;

	memset(&opt, 0, sizeof(opt));
	opt.v0.quantum	= q->quantum;
	opt.v0.perturb_period = q->perturb_period / HZ;
	opt.v0.limit	= q->limit;
	opt.v0.divisor	= q->divisor;
	opt.v0.flows	= q->maxflows;
	opt.depth	= q->maxdepth;
	opt.headdrop	= q->headdrop;

	if (p) {
		opt.qth_min	= p->qth_min >> p->Wlog;
		opt.qth_max	= p->qth_max >> p->Wlog;
		opt.Wlog	= p->Wlog;
		opt.Plog	= p->Plog;
		opt.Scell_log	= p->Scell_log;
		opt.max_P	= p->max_P;
	}
	memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
	opt.flags	= q->flags;

	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);

	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
{
	return NULL;
}

static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
{
	return 0;
}

static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
			      u32 classid)
{
	/* we cannot bypass queue discipline anymore */
	sch->flags &= ~TCQ_F_CAN_BYPASS;
	return 0;
}

static void sfq_put(struct Qdisc *q, unsigned long cl)
{
}

static struct tcf_proto **sfq_find_tcf(struct Qdisc *sch, unsigned long cl)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	if (cl)
		return NULL;
	return &q->filter_list;
}

static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	tcm->tcm_handle |= TC_H_MIN(cl);
	return 0;
}

static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
				struct gnet_dump *d)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index idx = q->ht[cl - 1];
	struct gnet_stats_queue qs = { 0 };
	struct tc_sfq_xstats xstats = { 0 };

	if (idx != SFQ_EMPTY_SLOT) {
		const struct sfq_slot *slot = &q->slots[idx];

		xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
		qs.qlen = slot->qlen;
		qs.backlog = slot->backlog;
	}
	if (gnet_stats_copy_queue(d, &qs) < 0)
		return -1;
	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
}

static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int i;

	if (arg->stop)
		return;

	for (i = 0; i < q->divisor; i++) {
		if (q->ht[i] == SFQ_EMPTY_SLOT ||
		    arg->count < arg->skip) {
			arg->count++;
			continue;
		}
		if (arg->fn(sch, i + 1, arg) < 0) {
			arg->stop = 1;
			break;
		}
		arg->count++;
	}
}

static const struct Qdisc_class_ops sfq_class_ops = {
	.leaf		=	sfq_leaf,
	.get		=	sfq_get,
	.put		=	sfq_put,
	.tcf_chain	=	sfq_find_tcf,
	.bind_tcf	=	sfq_bind,
	.unbind_tcf	=	sfq_put,
	.dump		=	sfq_dump_class,
	.dump_stats	=	sfq_dump_class_stats,
	.walk		=	sfq_walk,
};

static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
	.cl_ops		=	&sfq_class_ops,
	.id		=	"sfq",
	.priv_size	=	sizeof(struct sfq_sched_data),
	.enqueue	=	sfq_enqueue,
	.dequeue	=	sfq_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.drop		=	sfq_drop,
	.init		=	sfq_init,
	.reset		=	sfq_reset,
	.destroy	=	sfq_destroy,
	.change		=	NULL,
	.dump		=	sfq_dump,
	.owner		=	THIS_MODULE,
};

static int __init sfq_module_init(void)
{
	return register_qdisc(&sfq_qdisc_ops);
}

static void __exit sfq_module_exit(void)
{
	unregister_qdisc(&sfq_qdisc_ops);
}

module_init(sfq_module_init)
module_exit(sfq_module_exit)
MODULE_LICENSE("GPL");