/*
 * net/sched/sch_tbf.c	Token Bucket Filter queue.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *		Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs -
 *						 original idea by Martin Devera
 */
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/notifier.h>
#include <net/netlink.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <net/pkt_sched.h>
/*	Simple Token Bucket Filter.
	=======================================

	Description.
	------------

	A data flow obeys TBF with rate R and depth B, if for any
	time interval t_i...t_f the number of transmitted bits
	does not exceed B + R*(t_f-t_i).

	Packetized version of this definition:
	The sequence of packets of sizes s_i served at moments t_i
	obeys TBF, if for any i<=k:

	s_i+....+s_k <= B + R*(t_k - t_i)

	Algorithm.
	----------

	Let N(t_i) be B/R initially and N(t) grow continuously with time as:

	N(t+delta) = min{B/R, N(t) + delta}

	If the first packet in the queue has length S, it may be
	transmitted only at the time t_* when S/R <= N(t_*),
	and in this case N(t) jumps:

	N(t_* + 0) = N(t_* - 0) - S/R.

	Actually, QoS requires two TBFs to be applied to a data stream.
	One of them controls the steady-state burst size, the other,
	with rate P (peak rate) and depth M (equal to the link MTU),
	limits bursts at a smaller time scale.

	It is easy to see that P > R and B > M. If P is infinity, this
	double TBF is equivalent to a single one.

	When TBF works in reshaping mode, latency is estimated as:

	lat = max ((L-B)/R, (L-M)/P)

	NOTES.
	------

	If TBF throttles, it starts a watchdog timer, which will wake it up
	when it is ready to transmit.
	Note that the minimal timer resolution is 1/HZ.
	If no new packets arrive during this period,
	or if the device is not awakened by EOI for some previous packet,
	TBF can stop its activity for 1/HZ.

	This means that with depth B, the maximal rate is

	R_crit = B*HZ

	E.g. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes.

	Note that the peak rate TBF is much tougher: with MTU 1500 bytes,
	P_crit = 150Kbytes/sec. So, if you need greater peak
	rates, use alpha with HZ=1000 :-)

	With classful TBF, limit is just kept for backwards compatibility.
	It is passed to the default bfifo qdisc - if the inner qdisc is
	changed the limit is not effective anymore.
*/
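/* Illustration (not part of the original file): a minimal user-space
 * sketch of the N(t) accounting described above, with tokens measured
 * in transmission-time units. The struct and function names and the
 * abstract 'now' clock are assumptions made for this example; the
 * kernel code below does the same arithmetic in psched ticks.
 */
#if 0	/* example only, excluded from the build */
#include <stdint.h>

struct tb_state {
	int64_t	tokens;	/* available tokens, i.e. N(t), in time units */
	int64_t	buffer;	/* bucket depth B/R, in time units */
	int64_t	t_c;	/* last checkpoint */
};

/* Try to send a packet costing 'cost' (= S/R) time units at 'now'.
 * Returns 1 and charges the bucket if the flow conforms, else 0.
 */
static int tb_conform(struct tb_state *tb, int64_t now, int64_t cost)
{
	int64_t toks = tb->tokens + (now - tb->t_c);

	if (toks > tb->buffer)
		toks = tb->buffer;	/* N(t) saturates at B/R */
	toks -= cost;			/* N jumps down by S/R on send */
	if (toks < 0)
		return 0;		/* throttle for -toks time units */
	tb->t_c = now;
	tb->tokens = toks;
	return 1;
}
#endif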
struct tbf_sched_data
{
/* Parameters */
	u32		limit;		/* Maximal length of backlog: bytes */
	u32		buffer;		/* Token bucket depth/rate: MUST BE >= MTU/B */
	u32		mtu;
	u32		max_size;
	struct qdisc_rate_table	*R_tab;
	struct qdisc_rate_table	*P_tab;

/* Variables */
	long		tokens;		/* Current number of B tokens */
	long		ptokens;	/* Current number of P tokens */
	psched_time_t	t_c;		/* Time check-point */
	struct Qdisc	*qdisc;		/* Inner qdisc, default - bfifo queue */
	struct qdisc_watchdog watchdog;	/* Watchdog timer */
};
/* Convert a packet length L into its token cost (transmission time)
 * via the rate and peak-rate tables. */
#define L2T(q,L)   qdisc_l2t((q)->R_tab,L)
#define L2T_P(q,L) qdisc_l2t((q)->P_tab,L)
static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	int ret;

	/* Oversized packets can never conform; reshape (drop) them */
	if (skb->len > q->max_size)
		return qdisc_reshape_fail(skb, sch);

	if ((ret = q->qdisc->enqueue(skb, q->qdisc)) != 0) {
		sch->qstats.drops++;
		return ret;
	}

	sch->q.qlen++;
	sch->bstats.bytes += skb->len;
	sch->bstats.packets++;
	return 0;
}
static int tbf_requeue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	int ret;

	if ((ret = q->qdisc->ops->requeue(skb, q->qdisc)) == 0) {
		sch->q.qlen++;
		sch->qstats.requeues++;
	}

	return ret;
}
static unsigned int tbf_drop(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	unsigned int len = 0;

	if (q->qdisc->ops->drop &&
	    (len = q->qdisc->ops->drop(q->qdisc)) != 0) {
		sch->q.qlen--;
		sch->qstats.drops++;
	}
	return len;
}
static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	skb = q->qdisc->dequeue(q->qdisc);

	if (skb) {
		psched_time_t now;
		long toks;
		long ptoks = 0;
		unsigned int len = skb->len;

		now = psched_get_time();
		/* Tokens earned since the last checkpoint, capped at the
		 * bucket depth */
		toks = psched_tdiff_bounded(now, q->t_c, q->buffer);

		if (q->P_tab) {
			ptoks = toks + q->ptokens;
			if (ptoks > (long)q->mtu)
				ptoks = q->mtu;
			ptoks -= L2T_P(q, len);
		}
		toks += q->tokens;
		if (toks > (long)q->buffer)
			toks = q->buffer;
		toks -= L2T(q, len);

		if ((toks|ptoks) >= 0) {
			q->t_c = now;
			q->tokens = toks;
			q->ptokens = ptoks;
			sch->q.qlen--;
			sch->flags &= ~TCQ_F_THROTTLED;
			return skb;
		}

		qdisc_watchdog_schedule(&q->watchdog,
					now + max_t(long, -toks, -ptoks));

		/* Maybe we have a shorter packet in the queue,
		   which can be sent now. It sounds cool,
		   but it is wrong in principle:
		   we MUST NOT reorder packets under these circumstances.

		   Really, if we split the flow into independent
		   subflows, this would be a very good solution.
		   That is the main idea of all FQ algorithms
		   (cf. CSZ, HPFQ, HFSC).
		 */

		if (q->qdisc->ops->requeue(skb, q->qdisc) != NET_XMIT_SUCCESS) {
			/* When requeue fails skb is dropped */
			qdisc_tree_decrease_qlen(q->qdisc, 1);
			sch->qstats.drops++;
		}

		sch->flags |= TCQ_F_THROTTLED;
		sch->qstats.overlimits++;
	}
	return NULL;
}
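/* Illustration (not kernel code; the function name is an assumption):
 * the (toks|ptoks) test above admits a packet only when the sign bit
 * of both token counts is clear, i.e. both the rate and the peak-rate
 * buckets can cover it; otherwise the watchdog sleeps until the more
 * depleted bucket refills. The same decision in isolation:
 */
#if 0	/* example only, excluded from the build */
static long tbf_next_delay(long toks, long ptoks)
{
	if ((toks | ptoks) >= 0)
		return 0;	/* both buckets non-negative: send now */
	/* -toks and -ptoks are the refill times still owed by the
	 * rate and peak-rate buckets; wait for the larger one.
	 */
	return (-toks > -ptoks) ? -toks : -ptoks;
}
#endif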
static void tbf_reset(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_reset(q->qdisc);
	sch->q.qlen = 0;
	q->t_c = psched_get_time();
	q->tokens = q->buffer;
	q->ptokens = q->mtu;
	qdisc_watchdog_cancel(&q->watchdog);
}
static struct Qdisc *tbf_create_dflt_qdisc(struct Qdisc *sch, u32 limit)
{
	struct Qdisc *q;
	struct rtattr *rta;
	int ret;

	q = qdisc_create_dflt(sch->dev, &bfifo_qdisc_ops,
			      TC_H_MAKE(sch->handle, 1));
	if (q) {
		rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)),
			      GFP_KERNEL);
		if (rta) {
			rta->rta_type = RTM_NEWQDISC;
			rta->rta_len = RTA_LENGTH(sizeof(struct tc_fifo_qopt));
			((struct tc_fifo_qopt *)RTA_DATA(rta))->limit = limit;

			ret = q->ops->change(q, rta);
			kfree(rta);

			if (ret == 0)
				return q;
		}
		qdisc_destroy(q);
	}

	return NULL;
}
static int tbf_change(struct Qdisc *sch, struct rtattr *opt)
{
	int err = -EINVAL;
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct rtattr *tb[TCA_TBF_PTAB];
	struct tc_tbf_qopt *qopt;
	struct qdisc_rate_table *rtab = NULL;
	struct qdisc_rate_table *ptab = NULL;
	struct Qdisc *child = NULL;
	int max_size, n;

	if (rtattr_parse_nested(tb, TCA_TBF_PTAB, opt) ||
	    tb[TCA_TBF_PARMS-1] == NULL ||
	    RTA_PAYLOAD(tb[TCA_TBF_PARMS-1]) < sizeof(*qopt))
		goto done;

	qopt = RTA_DATA(tb[TCA_TBF_PARMS-1]);
	rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB-1]);
	if (rtab == NULL)
		goto done;

	if (qopt->peakrate.rate) {
		if (qopt->peakrate.rate > qopt->rate.rate)
			ptab = qdisc_get_rtab(&qopt->peakrate,
					      tb[TCA_TBF_PTAB-1]);
		if (ptab == NULL)
			goto done;
	}

	/* Derive the largest conforming packet size from the rate tables */
	for (n = 0; n < 256; n++)
		if (rtab->data[n] > qopt->buffer) break;
	max_size = (n << qopt->rate.cell_log)-1;
	if (ptab) {
		int size;

		for (n = 0; n < 256; n++)
			if (ptab->data[n] > qopt->mtu) break;
		size = (n << qopt->peakrate.cell_log)-1;
		if (size < max_size) max_size = size;
	}
	if (max_size < 0)
		goto done;

	if (qopt->limit > 0) {
		if ((child = tbf_create_dflt_qdisc(sch, qopt->limit)) == NULL)
			goto done;
	}

	sch_tree_lock(sch);
	if (child) {
		qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen);
		qdisc_destroy(xchg(&q->qdisc, child));
	}
	q->limit = qopt->limit;
	q->mtu = qopt->mtu;
	q->max_size = max_size;
	q->buffer = qopt->buffer;
	q->tokens = q->buffer;
	q->ptokens = q->mtu;
	rtab = xchg(&q->R_tab, rtab);
	ptab = xchg(&q->P_tab, ptab);
	sch_tree_unlock(sch);
	err = 0;
done:
	if (rtab)
		qdisc_put_rtab(rtab);
	if (ptab)
		qdisc_put_rtab(ptab);
	return err;
}
static int tbf_init(struct Qdisc *sch, struct rtattr *opt)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	if (opt == NULL)
		return -EINVAL;

	q->t_c = psched_get_time();
	qdisc_watchdog_init(&q->watchdog, sch);
	q->qdisc = &noop_qdisc;

	return tbf_change(sch, opt);
}
static void tbf_destroy(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_watchdog_cancel(&q->watchdog);

	if (q->P_tab)
		qdisc_put_rtab(q->P_tab);
	if (q->R_tab)
		qdisc_put_rtab(q->R_tab);

	qdisc_destroy(q->qdisc);
}
static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	unsigned char *b = skb_tail_pointer(skb);
	struct rtattr *rta;
	struct tc_tbf_qopt opt;

	rta = (struct rtattr *)b;
	RTA_PUT(skb, TCA_OPTIONS, 0, NULL);

	opt.limit = q->limit;
	opt.rate = q->R_tab->rate;
	if (q->P_tab)
		opt.peakrate = q->P_tab->rate;
	else
		memset(&opt.peakrate, 0, sizeof(opt.peakrate));
	opt.mtu = q->mtu;
	opt.buffer = q->buffer;
	RTA_PUT(skb, TCA_TBF_PARMS, sizeof(opt), &opt);
	rta->rta_len = skb_tail_pointer(skb) - b;

	return skb->len;

rtattr_failure:
	nlmsg_trim(skb, b);
	return -1;
}
static int tbf_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	if (cl != 1)	/* only one class */
		return -ENOENT;

	tcm->tcm_handle |= TC_H_MIN(1);
	tcm->tcm_info = q->qdisc->handle;

	return 0;
}
static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		     struct Qdisc **old)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	if (new == NULL)
		new = &noop_qdisc;

	sch_tree_lock(sch);
	*old = xchg(&q->qdisc, new);
	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
	qdisc_reset(*old);
	sch_tree_unlock(sch);

	return 0;
}
static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	return q->qdisc;
}
static unsigned long tbf_get(struct Qdisc *sch, u32 classid)
{
	return 1;	/* the single built-in class */
}
static void tbf_put(struct Qdisc *sch, unsigned long arg)
{
}
static int tbf_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
			    struct rtattr **tca, unsigned long *arg)
{
	return -ENOSYS;	/* the built-in class cannot be changed */
}
static int tbf_delete(struct Qdisc *sch, unsigned long arg)
{
	return -ENOSYS;	/* the built-in class cannot be deleted */
}
static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	if (!walker->stop) {
		if (walker->count >= walker->skip)
			if (walker->fn(sch, 1, walker) < 0) {
				walker->stop = 1;
				return;
			}
		walker->count++;
	}
}
static struct tcf_proto **tbf_find_tcf(struct Qdisc *sch, unsigned long cl)
{
	return NULL;	/* TBF attaches no classifiers */
}
static struct Qdisc_class_ops tbf_class_ops =
{
	.graft		=	tbf_graft,
	.leaf		=	tbf_leaf,
	.get		=	tbf_get,
	.put		=	tbf_put,
	.change		=	tbf_change_class,
	.delete		=	tbf_delete,
	.walk		=	tbf_walk,
	.tcf_chain	=	tbf_find_tcf,
	.dump		=	tbf_dump_class,
};
static struct Qdisc_ops tbf_qdisc_ops = {
	.next		=	NULL,
	.cl_ops		=	&tbf_class_ops,
	.id		=	"tbf",
	.priv_size	=	sizeof(struct tbf_sched_data),
	.enqueue	=	tbf_enqueue,
	.dequeue	=	tbf_dequeue,
	.requeue	=	tbf_requeue,
	.drop		=	tbf_drop,
	.init		=	tbf_init,
	.reset		=	tbf_reset,
	.destroy	=	tbf_destroy,
	.change		=	tbf_change,
	.dump		=	tbf_dump,
	.owner		=	THIS_MODULE,
};
static int __init tbf_module_init(void)
{
	return register_qdisc(&tbf_qdisc_ops);
}

static void __exit tbf_module_exit(void)
{
	unregister_qdisc(&tbf_qdisc_ops);
}

module_init(tbf_module_init)
module_exit(tbf_module_exit)
MODULE_LICENSE("GPL");
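/* Example usage (a sketch; the device name and numbers are arbitrary):
 *
 *	tc qdisc add dev eth0 root tbf rate 1mbit burst 10kb latency 70ms
 *
 * This bounds the flow to 1 Mbit/s with a 10 Kbyte bucket; with no peak
 * rate configured, the latency parameter sizes the backlog limit via
 * lat = (L-B)/R from the header comment.
 */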