net/sched/sch_red.c

   1 /*
   2  * net/sched/sch_red.c  Random Early Detection queue.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  10  *
  11  * Changes:
  12  * J Hadi Salim <hadi@nortel.com> 980914:       computation fixes
  13  * Alexey Makarenko <makar@phoenix.kharkov.ua> 990814: qave on idle link was calculated incorrectly.
  14  * J Hadi Salim <hadi@nortelnetworks.com> 980816:  ECN support
  15  */
  16
  17 #include <linux/config.h>
  18 #include <linux/module.h>
  19 #include <asm/uaccess.h>
  20 #include <asm/system.h>
  21 #include <linux/bitops.h>
  22 #include <linux/types.h>
  23 #include <linux/kernel.h>
  24 #include <linux/sched.h>
  25 #include <linux/string.h>
  26 #include <linux/mm.h>
  27 #include <linux/socket.h>
  28 #include <linux/sockios.h>
  29 #include <linux/in.h>
  30 #include <linux/errno.h>
  31 #include <linux/interrupt.h>
  32 #include <linux/if_ether.h>
  33 #include <linux/inet.h>
  34 #include <linux/netdevice.h>
  35 #include <linux/etherdevice.h>
  36 #include <linux/notifier.h>
  37 #include <net/ip.h>
  38 #include <net/route.h>
  39 #include <linux/skbuff.h>
  40 #include <net/sock.h>
  41 #include <net/pkt_sched.h>
  42 #include <net/inet_ecn.h>
  43 #include <net/dsfield.h>
  44
  45
  46 /*      Random Early Detection (RED) algorithm.
  47         =======================================
  48
  49         Source: Sally Floyd and Van Jacobson, "Random Early Detection Gateways
  50         for Congestion Avoidance", 1993, IEEE/ACM Transactions on Networking.
  51
  52         This file codes a "divisionless" version of RED algorithm
  53         as written down in Fig.17 of the paper.
  54
  55 Short description.
  56 ------------------
  57
  58         When a new packet arrives we calculate the average queue length:
  59
  60         avg = (1-W)*avg + W*current_queue_len,
  61
  62         W is the filter time constant (chosen as 2^(-Wlog)), it controls
  63         the inertia of the algorithm. To allow larger bursts, W should be
  64         decreased.
  65
  66         if (avg > th_max) -> packet marked (dropped).
  67         if (avg < th_min) -> packet passes.
  68         if (th_min < avg < th_max) we calculate probability:
  69
  70         Pb = max_P * (avg - th_min)/(th_max-th_min)
  71
  72         and mark (drop) packet with this probability.
  73         Pb changes from 0 (at avg==th_min) to max_P (avg==th_max).
  74         max_P should be small (not 1), usually 0.01..0.02 is good value.
  75
  76         max_P is chosen as a number, so that max_P/(th_max-th_min)
  77         is a negative power of two in order arithmetics to contain
  78         only shifts.
  79
  80
  81         Parameters, settable by user:
  82         -----------------------------
  83
  84         limit           - bytes (must be > qth_max + burst)
  85
  86         Hard limit on queue length, should be chosen >qth_max
  87         to allow packet bursts. This parameter does not
  88         affect the algorithms behaviour and can be chosen
  89         arbitrarily high (well, less than ram size)
  90         Really, this limit will never be reached
  91         if RED works correctly.
  92
  93         qth_min         - bytes (should be < qth_max/2)
  94         qth_max         - bytes (should be at least 2*qth_min and less limit)
  95         Wlog            - bits (<32) log(1/W).
  96         Plog            - bits (<32)
  97
  98         Plog is related to max_P by formula:
  99
 100         max_P = (qth_max-qth_min)/2^Plog;
 101
 102         F.e. if qth_max=128K and qth_min=32K, then Plog=22
 103         corresponds to max_P=0.02
 104
 105         Scell_log
 106         Stab
 107
 108         Lookup table for log((1-W)^(t/t_ave).
 109
 110
 111 NOTES:
 112
 113 Upper bound on W.
 114 -----------------
 115
 116         If you want to allow bursts of L packets of size S,
 117         you should choose W:
 118
 119         L + 1 - th_min/S < (1-(1-W)^L)/W
 120
 121         th_min/S = 32         th_min/S = 4
 122
 123         log(W)  L
 124         -1      33
 125         -2      35
 126         -3      39
 127         -4      46
 128         -5      57
 129         -6      75
 130         -7      101
 131         -8      135
 132         -9      190
 133         etc.
 134  */
 135
 136 struct red_sched_data
 137 {
 138 /* Parameters */
 139         u32             limit;          /* HARD maximal queue length    */
 140         u32             qth_min;        /* Min average length threshold: A scaled */
 141         u32             qth_max;        /* Max average length threshold: A scaled */
 142         u32             Rmask;
 143         u32             Scell_max;
 144         unsigned char   flags;
 145         char            Wlog;           /* log(W)               */
 146         char            Plog;           /* random number bits   */
 147         char            Scell_log;
 148         u8              Stab[256];
 149
 150 /* Variables */
 151         unsigned long   qave;           /* Average queue length: A scaled */
 152         int             qcount;         /* Packets since last random number generation */
 153         u32             qR;             /* Cached random number */
 154
 155         psched_time_t   qidlestart;     /* Start of idle period         */
 156         struct tc_red_xstats st;
 157 };
 158
 159 static int red_ecn_mark(struct sk_buff *skb)
 160 {
 161         if (skb->nh.raw + 20 > skb->tail)
 162                 return 0;
 163
 164         switch (skb->protocol) {
 165         case __constant_htons(ETH_P_IP):
 166                 if (INET_ECN_is_not_ect(skb->nh.iph->tos))
 167                         return 0;
 168                 IP_ECN_set_ce(skb->nh.iph);
 169                 return 1;
 170         case __constant_htons(ETH_P_IPV6):
 171                 if (INET_ECN_is_not_ect(ipv6_get_dsfield(skb->nh.ipv6h)))
 172                         return 0;
 173                 IP6_ECN_set_ce(skb->nh.ipv6h);
 174                 return 1;
 175         default:
 176                 return 0;
 177         }
 178 }
 179
 180 static int
 181 red_enqueue(struct sk_buff *skb, struct Qdisc* sch)
 182 {
 183         struct red_sched_data *q = qdisc_priv(sch);
 184
 185         psched_time_t now;
 186
 187         if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) {
 188                 long us_idle;
 189                 int  shift;
 190
 191                 PSCHED_GET_TIME(now);
 192                 us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max);
 193                 PSCHED_SET_PASTPERFECT(q->qidlestart);
 194
 195 /*
 196    The problem: ideally, average length queue recalcultion should
 197    be done over constant clock intervals. This is too expensive, so that
 198    the calculation is driven by outgoing packets.
 199    When the queue is idle we have to model this clock by hand.
 200
 201    SF+VJ proposed to "generate" m = idletime/(average_pkt_size/bandwidth)
 202    dummy packets as a burst after idle time, i.e.
 203
 204           q->qave *= (1-W)^m
 205
 206    This is an apparently overcomplicated solution (f.e. we have to precompute
 207    a table to make this calculation in reasonable time)
 208    I believe that a simpler model may be used here,
 209    but it is field for experiments.
 210 */
 211                 shift = q->Stab[us_idle>>q->Scell_log];
 212
 213                 if (shift) {
 214                         q->qave >>= shift;
 215                 } else {
 216                         /* Approximate initial part of exponent
 217                            with linear function:
 218                            (1-W)^m ~= 1-mW + ...
 219
 220                            Seems, it is the best solution to
 221                            problem of too coarce exponent tabulation.
 222                          */
 223
 224                         us_idle = (q->qave * us_idle)>>q->Scell_log;
 225                         if (us_idle < q->qave/2)
 226                                 q->qave -= us_idle;
 227                         else
 228                                 q->qave >>= 1;
 229                 }
 230         } else {
 231                 q->qave += sch->qstats.backlog - (q->qave >> q->Wlog);
 232                 /* NOTE:
 233                    q->qave is fixed point number with point at Wlog.
 234                    The formulae above is equvalent to floating point
 235                    version:
 236
 237                    qave = qave*(1-W) + sch->qstats.backlog*W;
 238                                                            --ANK (980924)
 239                  */
 240         }
 241
 242         if (q->qave < q->qth_min) {
 243                 q->qcount = -1;
 244 enqueue:
 245                 if (sch->qstats.backlog + skb->len <= q->limit) {
 246                         __skb_queue_tail(&sch->q, skb);
 247                         sch->qstats.backlog += skb->len;
 248                         sch->bstats.bytes += skb->len;
 249                         sch->bstats.packets++;
 250                         return NET_XMIT_SUCCESS;
 251                 } else {
 252                         q->st.pdrop++;
 253                 }
 254                 kfree_skb(skb);
 255                 sch->qstats.drops++;
 256                 return NET_XMIT_DROP;
 257         }
 258         if (q->qave >= q->qth_max) {
 259                 q->qcount = -1;
 260                 sch->qstats.overlimits++;
 261 mark:
 262                 if  (!(q->flags&TC_RED_ECN) || !red_ecn_mark(skb)) {
 263                         q->st.early++;
 264                         goto drop;
 265                 }
 266                 q->st.marked++;
 267                 goto enqueue;
 268         }
 269
 270         if (++q->qcount) {
 271                 /* The formula used below causes questions.
 272
 273                    OK. qR is random number in the interval 0..Rmask
 274                    i.e. 0..(2^Plog). If we used floating point
 275                    arithmetics, it would be: (2^Plog)*rnd_num,
 276                    where rnd_num is less 1.
 277
 278                    Taking into account, that qave have fixed
 279                    point at Wlog, and Plog is related to max_P by
 280                    max_P = (qth_max-qth_min)/2^Plog; two lines
 281                    below have the following floating point equivalent:
 282
 283                    max_P*(qave - qth_min)/(qth_max-qth_min) < rnd/qcount
 284
 285                    Any questions? --ANK (980924)
 286                  */
 287                 if (((q->qave - q->qth_min)>>q->Wlog)*q->qcount < q->qR)
 288                         goto enqueue;
 289                 q->qcount = 0;
 290                 q->qR = net_random()&q->Rmask;
 291                 sch->qstats.overlimits++;
 292                 goto mark;
 293         }
 294         q->qR = net_random()&q->Rmask;
 295         goto enqueue;
 296
 297 drop:
 298         kfree_skb(skb);
 299         sch->qstats.drops++;
 300         return NET_XMIT_CN;
 301 }
 302
 303 static int
 304 red_requeue(struct sk_buff *skb, struct Qdisc* sch)
 305 {
 306         struct red_sched_data *q = qdisc_priv(sch);
 307
 308         PSCHED_SET_PASTPERFECT(q->qidlestart);
 309
 310         __skb_queue_head(&sch->q, skb);
 311         sch->qstats.backlog += skb->len;
 312         sch->qstats.requeues++;
 313         return 0;
 314 }
 315
 316 static struct sk_buff *
 317 red_dequeue(struct Qdisc* sch)
 318 {
 319         struct sk_buff *skb;
 320         struct red_sched_data *q = qdisc_priv(sch);
 321
 322         skb = __skb_dequeue(&sch->q);
 323         if (skb) {
 324                 sch->qstats.backlog -= skb->len;
 325                 return skb;
 326         }
 327         PSCHED_GET_TIME(q->qidlestart);
 328         return NULL;
 329 }
 330
 331 static unsigned int red_drop(struct Qdisc* sch)
 332 {
 333         struct sk_buff *skb;
 334         struct red_sched_data *q = qdisc_priv(sch);
 335
 336         skb = __skb_dequeue_tail(&sch->q);
 337         if (skb) {
 338                 unsigned int len = skb->len;
 339                 sch->qstats.backlog -= len;
 340                 sch->qstats.drops++;
 341                 q->st.other++;
 342                 kfree_skb(skb);
 343                 return len;
 344         }
 345         PSCHED_GET_TIME(q->qidlestart);
 346         return 0;
 347 }
 348
 349 static void red_reset(struct Qdisc* sch)
 350 {
 351         struct red_sched_data *q = qdisc_priv(sch);
 352
 353         __skb_queue_purge(&sch->q);
 354         sch->qstats.backlog = 0;
 355         PSCHED_SET_PASTPERFECT(q->qidlestart);
 356         q->qave = 0;
 357         q->qcount = -1;
 358 }
 359
 360 static int red_change(struct Qdisc *sch, struct rtattr *opt)
 361 {
 362         struct red_sched_data *q = qdisc_priv(sch);
 363         struct rtattr *tb[TCA_RED_STAB];
 364         struct tc_red_qopt *ctl;
 365
 366         if (opt == NULL ||
 367             rtattr_parse_nested(tb, TCA_RED_STAB, opt) ||
 368             tb[TCA_RED_PARMS-1] == 0 || tb[TCA_RED_STAB-1] == 0 ||
 369             RTA_PAYLOAD(tb[TCA_RED_PARMS-1]) < sizeof(*ctl) ||
 370             RTA_PAYLOAD(tb[TCA_RED_STAB-1]) < 256)
 371                 return -EINVAL;
 372
 373         ctl = RTA_DATA(tb[TCA_RED_PARMS-1]);
 374
 375         sch_tree_lock(sch);
 376         q->flags = ctl->flags;
 377         q->Wlog = ctl->Wlog;
 378         q->Plog = ctl->Plog;
 379         q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL;
 380         q->Scell_log = ctl->Scell_log;
 381         q->Scell_max = (255<<q->Scell_log);
 382         q->qth_min = ctl->qth_min<<ctl->Wlog;
 383         q->qth_max = ctl->qth_max<<ctl->Wlog;
 384         q->limit = ctl->limit;
 385         memcpy(q->Stab, RTA_DATA(tb[TCA_RED_STAB-1]), 256);
 386
 387         q->qcount = -1;
 388         if (skb_queue_len(&sch->q) == 0)
 389                 PSCHED_SET_PASTPERFECT(q->qidlestart);
 390         sch_tree_unlock(sch);
 391         return 0;
 392 }
 393
 394 static int red_init(struct Qdisc* sch, struct rtattr *opt)
 395 {
 396         return red_change(sch, opt);
 397 }
 398
 399 static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
 400 {
 401         struct red_sched_data *q = qdisc_priv(sch);
 402         unsigned char    *b = skb->tail;
 403         struct rtattr *rta;
 404         struct tc_red_qopt opt;
 405
 406         rta = (struct rtattr*)b;
 407         RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
 408         opt.limit = q->limit;
 409         opt.qth_min = q->qth_min>>q->Wlog;
 410         opt.qth_max = q->qth_max>>q->Wlog;
 411         opt.Wlog = q->Wlog;
 412         opt.Plog = q->Plog;
 413         opt.Scell_log = q->Scell_log;
 414         opt.flags = q->flags;
 415         RTA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt);
 416         rta->rta_len = skb->tail - b;
 417
 418         return skb->len;
 419
 420 rtattr_failure:
 421         skb_trim(skb, b - skb->data);
 422         return -1;
 423 }
 424
 425 static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
 426 {
 427         struct red_sched_data *q = qdisc_priv(sch);
 428
 429         return gnet_stats_copy_app(d, &q->st, sizeof(q->st));
 430 }
 431
 432 static struct Qdisc_ops red_qdisc_ops = {
 433         .next           =       NULL,
 434         .cl_ops         =       NULL,
 435         .id             =       "red",
 436         .priv_size      =       sizeof(struct red_sched_data),
 437         .enqueue        =       red_enqueue,
 438         .dequeue        =       red_dequeue,
 439         .requeue        =       red_requeue,
 440         .drop           =       red_drop,
 441         .init           =       red_init,
 442         .reset          =       red_reset,
 443         .change         =       red_change,
 444         .dump           =       red_dump,
 445         .dump_stats     =       red_dump_stats,
 446         .owner          =       THIS_MODULE,
 447 };
 448
 449 static int __init red_module_init(void)
 450 {
 451         return register_qdisc(&red_qdisc_ops);
 452 }
 453 static void __exit red_module_exit(void)
 454 {
 455         unregister_qdisc(&red_qdisc_ops);
 456 }
 457 module_init(red_module_init)
 458 module_exit(red_module_exit)
 459 MODULE_LICENSE("GPL");