net/sched/sch_tbf.c

   1 /*
   2  * net/sched/sch_tbf.c  Token Bucket Filter queue.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  10  *              Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs -
  11  *                                               original idea by Martin Devera
  12  *
  13  */
  14
  15 #include <linux/module.h>
  16 #include <asm/uaccess.h>
  17 #include <asm/system.h>
  18 #include <linux/bitops.h>
  19 #include <linux/types.h>
  20 #include <linux/kernel.h>
  21 #include <linux/jiffies.h>
  22 #include <linux/string.h>
  23 #include <linux/mm.h>
  24 #include <linux/socket.h>
  25 #include <linux/sockios.h>
  26 #include <linux/in.h>
  27 #include <linux/errno.h>
  28 #include <linux/interrupt.h>
  29 #include <linux/if_ether.h>
  30 #include <linux/inet.h>
  31 #include <linux/netdevice.h>
  32 #include <linux/etherdevice.h>
  33 #include <linux/notifier.h>
  34 #include <net/ip.h>
  35 #include <net/netlink.h>
  36 #include <net/route.h>
  37 #include <linux/skbuff.h>
  38 #include <net/sock.h>
  39 #include <net/pkt_sched.h>
  40
  41
  42 /*      Simple Token Bucket Filter.
  43         =======================================
  44
  45         SOURCE.
  46         -------
  47
  48         None.
  49
  50         Description.
  51         ------------
  52
  53         A data flow obeys TBF with rate R and depth B, if for any
  54         time interval t_i...t_f the number of transmitted bits
  55         does not exceed B + R*(t_f-t_i).
  56
  57         Packetized version of this definition:
  58         The sequence of packets of sizes s_i served at moments t_i
  59         obeys TBF, if for any i<=k:
  60
  61         s_i+....+s_k <= B + R*(t_k - t_i)
  62
  63         Algorithm.
  64         ----------
  65
  66         Let N(t_i) be B/R initially and N(t) grow continuously with time as:
  67
  68         N(t+delta) = min{B/R, N(t) + delta}
  69
  70         If the first packet in queue has length S, it may be
  71         transmitted only at the time t_* when S/R <= N(t_*),
  72         and in this case N(t) jumps:
  73
  74         N(t_* + 0) = N(t_* - 0) - S/R.
  75
  76
  77
  78         Actually, QoS requires two TBF to be applied to a data stream.
  79         One of them controls steady state burst size, another
  80         one with rate P (peak rate) and depth M (equal to link MTU)
  81         limits bursts at a smaller time scale.
  82
  83         It is easy to see that P>R, and B>M. If P is infinity, this double
  84         TBF is equivalent to a single one.
  85
  86         When TBF works in reshaping mode, latency is estimated as:
  87
  88         lat = max ((L-B)/R, (L-M)/P)
  89
  90
  91         NOTES.
  92         ------
  93
  94         If TBF throttles, it starts a watchdog timer, which will wake it up
  95         when it is ready to transmit.
  96         Note that the minimal timer resolution is 1/HZ.
  97         If no new packets arrive during this period,
  98         or if the device is not awaken by EOI for some previous packet,
  99         TBF can stop its activity for 1/HZ.
 100
 101
 102         This means, that with depth B, the maximal rate is
 103
 104         R_crit = B*HZ
 105
 106         F.e. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes.
 107
 108         Note that the peak rate TBF is much more tough: with MTU 1500
 109         P_crit = 150Kbytes/sec. So, if you need greater peak
 110         rates, use alpha with HZ=1000 :-)
 111
 112         With classful TBF, limit is just kept for backwards compatibility.
 113         It is passed to the default bfifo qdisc - if the inner qdisc is
 114         changed the limit is not effective anymore.
 115 */
 116
 117 struct tbf_sched_data
 118 {
 119 /* Parameters */
 120         u32             limit;          /* Maximal length of backlog: bytes */
 121         u32             buffer;         /* Token bucket depth/rate: MUST BE >= MTU/B */
 122         u32             mtu;
 123         u32             max_size;
 124         struct qdisc_rate_table *R_tab;
 125         struct qdisc_rate_table *P_tab;
 126
 127 /* Variables */
 128         long    tokens;                 /* Current number of B tokens */
 129         long    ptokens;                /* Current number of P tokens */
 130         psched_time_t   t_c;            /* Time check-point */
 131         struct Qdisc    *qdisc;         /* Inner qdisc, default - bfifo queue */
 132         struct qdisc_watchdog watchdog; /* Watchdog timer */
 133 };
 134
 135 #define L2T(q,L)   ((q)->R_tab->data[(L)>>(q)->R_tab->rate.cell_log])
 136 #define L2T_P(q,L) ((q)->P_tab->data[(L)>>(q)->P_tab->rate.cell_log])
 137
 138 static int tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch)
 139 {
 140         struct tbf_sched_data *q = qdisc_priv(sch);
 141         int ret;
 142
 143         if (skb->len > q->max_size) {
 144                 sch->qstats.drops++;
 145 #ifdef CONFIG_NET_CLS_POLICE
 146                 if (sch->reshape_fail == NULL || sch->reshape_fail(skb, sch))
 147 #endif
 148                         kfree_skb(skb);
 149
 150                 return NET_XMIT_DROP;
 151         }
 152
 153         if ((ret = q->qdisc->enqueue(skb, q->qdisc)) != 0) {
 154                 sch->qstats.drops++;
 155                 return ret;
 156         }
 157
 158         sch->q.qlen++;
 159         sch->bstats.bytes += skb->len;
 160         sch->bstats.packets++;
 161         return 0;
 162 }
 163
 164 static int tbf_requeue(struct sk_buff *skb, struct Qdisc* sch)
 165 {
 166         struct tbf_sched_data *q = qdisc_priv(sch);
 167         int ret;
 168
 169         if ((ret = q->qdisc->ops->requeue(skb, q->qdisc)) == 0) {
 170                 sch->q.qlen++;
 171                 sch->qstats.requeues++;
 172         }
 173
 174         return ret;
 175 }
 176
 177 static unsigned int tbf_drop(struct Qdisc* sch)
 178 {
 179         struct tbf_sched_data *q = qdisc_priv(sch);
 180         unsigned int len = 0;
 181
 182         if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) {
 183                 sch->q.qlen--;
 184                 sch->qstats.drops++;
 185         }
 186         return len;
 187 }
 188
 189 static struct sk_buff *tbf_dequeue(struct Qdisc* sch)
 190 {
 191         struct tbf_sched_data *q = qdisc_priv(sch);
 192         struct sk_buff *skb;
 193
 194         skb = q->qdisc->dequeue(q->qdisc);
 195
 196         if (skb) {
 197                 psched_time_t now;
 198                 long toks;
 199                 long ptoks = 0;
 200                 unsigned int len = skb->len;
 201
 202                 now = psched_get_time();
 203                 toks = psched_tdiff_bounded(now, q->t_c, q->buffer);
 204
 205                 if (q->P_tab) {
 206                         ptoks = toks + q->ptokens;
 207                         if (ptoks > (long)q->mtu)
 208                                 ptoks = q->mtu;
 209                         ptoks -= L2T_P(q, len);
 210                 }
 211                 toks += q->tokens;
 212                 if (toks > (long)q->buffer)
 213                         toks = q->buffer;
 214                 toks -= L2T(q, len);
 215
 216                 if ((toks|ptoks) >= 0) {
 217                         q->t_c = now;
 218                         q->tokens = toks;
 219                         q->ptokens = ptoks;
 220                         sch->q.qlen--;
 221                         sch->flags &= ~TCQ_F_THROTTLED;
 222                         return skb;
 223                 }
 224
 225                 qdisc_watchdog_schedule(&q->watchdog,
 226                                         now + max_t(long, -toks, -ptoks));
 227
 228                 /* Maybe we have a shorter packet in the queue,
 229                    which can be sent now. It sounds cool,
 230                    but, however, this is wrong in principle.
 231                    We MUST NOT reorder packets under these circumstances.
 232
 233                    Really, if we split the flow into independent
 234                    subflows, it would be a very good solution.
 235                    This is the main idea of all FQ algorithms
 236                    (cf. CSZ, HPFQ, HFSC)
 237                  */
 238
 239                 if (q->qdisc->ops->requeue(skb, q->qdisc) != NET_XMIT_SUCCESS) {
 240                         /* When requeue fails skb is dropped */
 241                         qdisc_tree_decrease_qlen(q->qdisc, 1);
 242                         sch->qstats.drops++;
 243                 }
 244
 245                 sch->qstats.overlimits++;
 246         }
 247         return NULL;
 248 }
 249
 250 static void tbf_reset(struct Qdisc* sch)
 251 {
 252         struct tbf_sched_data *q = qdisc_priv(sch);
 253
 254         qdisc_reset(q->qdisc);
 255         sch->q.qlen = 0;
 256         q->t_c = psched_get_time();
 257         q->tokens = q->buffer;
 258         q->ptokens = q->mtu;
 259         qdisc_watchdog_cancel(&q->watchdog);
 260 }
 261
 262 static struct Qdisc *tbf_create_dflt_qdisc(struct Qdisc *sch, u32 limit)
 263 {
 264         struct Qdisc *q;
 265         struct rtattr *rta;
 266         int ret;
 267
 268         q = qdisc_create_dflt(sch->dev, &bfifo_qdisc_ops,
 269                               TC_H_MAKE(sch->handle, 1));
 270         if (q) {
 271                 rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL);
 272                 if (rta) {
 273                         rta->rta_type = RTM_NEWQDISC;
 274                         rta->rta_len = RTA_LENGTH(sizeof(struct tc_fifo_qopt));
 275                         ((struct tc_fifo_qopt *)RTA_DATA(rta))->limit = limit;
 276
 277                         ret = q->ops->change(q, rta);
 278                         kfree(rta);
 279
 280                         if (ret == 0)
 281                                 return q;
 282                 }
 283                 qdisc_destroy(q);
 284         }
 285
 286         return NULL;
 287 }
 288
 289 static int tbf_change(struct Qdisc* sch, struct rtattr *opt)
 290 {
 291         int err = -EINVAL;
 292         struct tbf_sched_data *q = qdisc_priv(sch);
 293         struct rtattr *tb[TCA_TBF_PTAB];
 294         struct tc_tbf_qopt *qopt;
 295         struct qdisc_rate_table *rtab = NULL;
 296         struct qdisc_rate_table *ptab = NULL;
 297         struct Qdisc *child = NULL;
 298         int max_size,n;
 299
 300         if (rtattr_parse_nested(tb, TCA_TBF_PTAB, opt) ||
 301             tb[TCA_TBF_PARMS-1] == NULL ||
 302             RTA_PAYLOAD(tb[TCA_TBF_PARMS-1]) < sizeof(*qopt))
 303                 goto done;
 304
 305         qopt = RTA_DATA(tb[TCA_TBF_PARMS-1]);
 306         rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB-1]);
 307         if (rtab == NULL)
 308                 goto done;
 309
 310         if (qopt->peakrate.rate) {
 311                 if (qopt->peakrate.rate > qopt->rate.rate)
 312                         ptab = qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB-1]);
 313                 if (ptab == NULL)
 314                         goto done;
 315         }
 316
 317         for (n = 0; n < 256; n++)
 318                 if (rtab->data[n] > qopt->buffer) break;
 319         max_size = (n << qopt->rate.cell_log)-1;
 320         if (ptab) {
 321                 int size;
 322
 323                 for (n = 0; n < 256; n++)
 324                         if (ptab->data[n] > qopt->mtu) break;
 325                 size = (n << qopt->peakrate.cell_log)-1;
 326                 if (size < max_size) max_size = size;
 327         }
 328         if (max_size < 0)
 329                 goto done;
 330
 331         if (qopt->limit > 0) {
 332                 if ((child = tbf_create_dflt_qdisc(sch, qopt->limit)) == NULL)
 333                         goto done;
 334         }
 335
 336         sch_tree_lock(sch);
 337         if (child) {
 338                 qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen);
 339                 qdisc_destroy(xchg(&q->qdisc, child));
 340         }
 341         q->limit = qopt->limit;
 342         q->mtu = qopt->mtu;
 343         q->max_size = max_size;
 344         q->buffer = qopt->buffer;
 345         q->tokens = q->buffer;
 346         q->ptokens = q->mtu;
 347         rtab = xchg(&q->R_tab, rtab);
 348         ptab = xchg(&q->P_tab, ptab);
 349         sch_tree_unlock(sch);
 350         err = 0;
 351 done:
 352         if (rtab)
 353                 qdisc_put_rtab(rtab);
 354         if (ptab)
 355                 qdisc_put_rtab(ptab);
 356         return err;
 357 }
 358
 359 static int tbf_init(struct Qdisc* sch, struct rtattr *opt)
 360 {
 361         struct tbf_sched_data *q = qdisc_priv(sch);
 362
 363         if (opt == NULL)
 364                 return -EINVAL;
 365
 366         q->t_c = psched_get_time();
 367         qdisc_watchdog_init(&q->watchdog, sch);
 368         q->qdisc = &noop_qdisc;
 369
 370         return tbf_change(sch, opt);
 371 }
 372
 373 static void tbf_destroy(struct Qdisc *sch)
 374 {
 375         struct tbf_sched_data *q = qdisc_priv(sch);
 376
 377         qdisc_watchdog_cancel(&q->watchdog);
 378
 379         if (q->P_tab)
 380                 qdisc_put_rtab(q->P_tab);
 381         if (q->R_tab)
 382                 qdisc_put_rtab(q->R_tab);
 383
 384         qdisc_destroy(q->qdisc);
 385 }
 386
 387 static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
 388 {
 389         struct tbf_sched_data *q = qdisc_priv(sch);
 390         unsigned char *b = skb_tail_pointer(skb);
 391         struct rtattr *rta;
 392         struct tc_tbf_qopt opt;
 393
 394         rta = (struct rtattr*)b;
 395         RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
 396
 397         opt.limit = q->limit;
 398         opt.rate = q->R_tab->rate;
 399         if (q->P_tab)
 400                 opt.peakrate = q->P_tab->rate;
 401         else
 402                 memset(&opt.peakrate, 0, sizeof(opt.peakrate));
 403         opt.mtu = q->mtu;
 404         opt.buffer = q->buffer;
 405         RTA_PUT(skb, TCA_TBF_PARMS, sizeof(opt), &opt);
 406         rta->rta_len = skb_tail_pointer(skb) - b;
 407
 408         return skb->len;
 409
 410 rtattr_failure:
 411         nlmsg_trim(skb, b);
 412         return -1;
 413 }
 414
 415 static int tbf_dump_class(struct Qdisc *sch, unsigned long cl,
 416                           struct sk_buff *skb, struct tcmsg *tcm)
 417 {
 418         struct tbf_sched_data *q = qdisc_priv(sch);
 419
 420         if (cl != 1)    /* only one class */
 421                 return -ENOENT;
 422
 423         tcm->tcm_handle |= TC_H_MIN(1);
 424         tcm->tcm_info = q->qdisc->handle;
 425
 426         return 0;
 427 }
 428
 429 static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
 430                      struct Qdisc **old)
 431 {
 432         struct tbf_sched_data *q = qdisc_priv(sch);
 433
 434         if (new == NULL)
 435                 new = &noop_qdisc;
 436
 437         sch_tree_lock(sch);
 438         *old = xchg(&q->qdisc, new);
 439         qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
 440         qdisc_reset(*old);
 441         sch_tree_unlock(sch);
 442
 443         return 0;
 444 }
 445
 446 static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg)
 447 {
 448         struct tbf_sched_data *q = qdisc_priv(sch);
 449         return q->qdisc;
 450 }
 451
 452 static unsigned long tbf_get(struct Qdisc *sch, u32 classid)
 453 {
 454         return 1;
 455 }
 456
 457 static void tbf_put(struct Qdisc *sch, unsigned long arg)
 458 {
 459 }
 460
 461 static int tbf_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 462                             struct rtattr **tca, unsigned long *arg)
 463 {
 464         return -ENOSYS;
 465 }
 466
 467 static int tbf_delete(struct Qdisc *sch, unsigned long arg)
 468 {
 469         return -ENOSYS;
 470 }
 471
 472 static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
 473 {
 474         if (!walker->stop) {
 475                 if (walker->count >= walker->skip)
 476                         if (walker->fn(sch, 1, walker) < 0) {
 477                                 walker->stop = 1;
 478                                 return;
 479                         }
 480                 walker->count++;
 481         }
 482 }
 483
 484 static struct tcf_proto **tbf_find_tcf(struct Qdisc *sch, unsigned long cl)
 485 {
 486         return NULL;
 487 }
 488
 489 static struct Qdisc_class_ops tbf_class_ops =
 490 {
 491         .graft          =       tbf_graft,
 492         .leaf           =       tbf_leaf,
 493         .get            =       tbf_get,
 494         .put            =       tbf_put,
 495         .change         =       tbf_change_class,
 496         .delete         =       tbf_delete,
 497         .walk           =       tbf_walk,
 498         .tcf_chain      =       tbf_find_tcf,
 499         .dump           =       tbf_dump_class,
 500 };
 501
 502 static struct Qdisc_ops tbf_qdisc_ops = {
 503         .next           =       NULL,
 504         .cl_ops         =       &tbf_class_ops,
 505         .id             =       "tbf",
 506         .priv_size      =       sizeof(struct tbf_sched_data),
 507         .enqueue        =       tbf_enqueue,
 508         .dequeue        =       tbf_dequeue,
 509         .requeue        =       tbf_requeue,
 510         .drop           =       tbf_drop,
 511         .init           =       tbf_init,
 512         .reset          =       tbf_reset,
 513         .destroy        =       tbf_destroy,
 514         .change         =       tbf_change,
 515         .dump           =       tbf_dump,
 516         .owner          =       THIS_MODULE,
 517 };
 518
 519 static int __init tbf_module_init(void)
 520 {
 521         return register_qdisc(&tbf_qdisc_ops);
 522 }
 523
 524 static void __exit tbf_module_exit(void)
 525 {
 526         unregister_qdisc(&tbf_qdisc_ops);
 527 }
 528 module_init(tbf_module_init)
 529 module_exit(tbf_module_exit)
 530 MODULE_LICENSE("GPL");