/*
 * Historical snapshot: net/sched/estimator.c from the davej-history tree,
 * blob e70066f9c1bbf31f722c1cdfd0a8b2e2d4431dc2.
 */
/*
 * net/sched/estimator.c	Simple rate estimator.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */
12 #include <asm/uaccess.h>
13 #include <asm/system.h>
14 #include <asm/bitops.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/sched.h>
18 #include <linux/string.h>
19 #include <linux/mm.h>
20 #include <linux/socket.h>
21 #include <linux/sockios.h>
22 #include <linux/in.h>
23 #include <linux/errno.h>
24 #include <linux/interrupt.h>
25 #include <linux/netdevice.h>
26 #include <linux/skbuff.h>
27 #include <linux/rtnetlink.h>
28 #include <linux/init.h>
29 #include <linux/proc_fs.h>
30 #include <net/sock.h>
31 #include <net/pkt_sched.h>
/*
   This code is NOT intended to be used for statistics collection,
   its purpose is to provide a base for statistical multiplexing
   for controlled load service.
   If you need only statistics, run a user level daemon which
   periodically reads byte counters.

   Unfortunately, rate estimation is not a very easy task.
   F.e. I did not find a simple way to estimate the current peak rate
   and even failed to formulate the problem 8)8)

   So I preferred not to build an estimator into the scheduler,
   but run this task separately.
   Ideally, it should be kernel thread(s), but for now it runs
   from timers, which puts apparent top bounds on the number of rated
   flows, has minimal overhead on small, but is enough
   to handle controlled load service, sets of aggregates.

   We measure rate over A=(1<<interval) seconds and evaluate EWMA:

   avrate = avrate*(1-W) + rate*W

   where W is chosen as negative power of 2: W = 2^(-ewma_log)

   The resulting time constant is:

   T = A/(-ln(1-W))

   NOTES.

   * The stored value for avbps is scaled by 2^5, so that maximal
     rate is ~1Gbit, avpps is scaled by 2^10.

   * Minimal interval is HZ/4=250msec (it is the greatest common divisor
     for HZ=100 and HZ=1024 8)), maximal interval
     is (HZ/4)*2^EST_MAX_INTERVAL = 8sec. Shorter intervals
     are too expensive, longer ones can be implemented
     at user level painlessly.
 */
/* The base sampling period is HZ/4 jiffies, so HZ must be divisible by 4. */
#if (HZ%4) != 0
#error Bad HZ value.
#endif

/* Largest interval index: period (HZ/4)<<5 jiffies, i.e. 8 seconds. */
#define EST_MAX_INTERVAL 5
80 struct qdisc_estimator
82 struct qdisc_estimator *next;
83 struct tc_stats *stats;
84 unsigned interval;
85 int ewma_log;
86 u64 last_bytes;
87 u32 last_packets;
88 u32 avpps;
89 u32 avbps;
92 struct qdisc_estimator_head
94 struct timer_list timer;
95 struct qdisc_estimator *list;
98 static struct qdisc_estimator_head elist[EST_MAX_INTERVAL+1];
100 /* Estimator array lock */
101 static rwlock_t est_lock = RW_LOCK_UNLOCKED;
103 static void est_timer(unsigned long arg)
105 int idx = (int)arg;
106 struct qdisc_estimator *e;
108 read_lock(&est_lock);
109 for (e = elist[idx].list; e; e = e->next) {
110 struct tc_stats *st = e->stats;
111 u64 nbytes;
112 u32 npackets;
113 u32 rate;
115 spin_lock(st->lock);
116 nbytes = st->bytes;
117 npackets = st->packets;
118 rate = (nbytes - e->last_bytes)<<(7 - idx);
119 e->last_bytes = nbytes;
120 e->avbps += ((long)rate - (long)e->avbps) >> e->ewma_log;
121 st->bps = (e->avbps+0xF)>>5;
123 rate = (npackets - e->last_packets)<<(12 - idx);
124 e->last_packets = npackets;
125 e->avpps += ((long)rate - (long)e->avpps) >> e->ewma_log;
126 e->stats->pps = (e->avpps+0x1FF)>>10;
127 spin_unlock(st->lock);
130 mod_timer(&elist[idx].timer, jiffies + ((HZ/4)<<idx));
131 read_unlock(&est_lock);
134 int qdisc_new_estimator(struct tc_stats *stats, struct rtattr *opt)
136 struct qdisc_estimator *est;
137 struct tc_estimator *parm = RTA_DATA(opt);
139 if (RTA_PAYLOAD(opt) < sizeof(*parm))
140 return -EINVAL;
142 if (parm->interval < -2 || parm->interval > 3)
143 return -EINVAL;
145 est = kmalloc(sizeof(*est), GFP_KERNEL);
146 if (est == NULL)
147 return -ENOBUFS;
149 memset(est, 0, sizeof(*est));
150 est->interval = parm->interval + 2;
151 est->stats = stats;
152 est->ewma_log = parm->ewma_log;
153 est->last_bytes = stats->bytes;
154 est->avbps = stats->bps<<5;
155 est->last_packets = stats->packets;
156 est->avpps = stats->pps<<10;
158 est->next = elist[est->interval].list;
159 if (est->next == NULL) {
160 init_timer(&elist[est->interval].timer);
161 elist[est->interval].timer.data = est->interval;
162 elist[est->interval].timer.expires = jiffies + ((HZ/4)<<est->interval);
163 elist[est->interval].timer.function = est_timer;
164 add_timer(&elist[est->interval].timer);
166 write_lock_bh(&est_lock);
167 elist[est->interval].list = est;
168 write_unlock_bh(&est_lock);
169 return 0;
172 void qdisc_kill_estimator(struct tc_stats *stats)
174 int idx;
175 struct qdisc_estimator *est, **pest;
177 for (idx=0; idx <= EST_MAX_INTERVAL; idx++) {
178 int killed = 0;
179 pest = &elist[idx].list;
180 while ((est=*pest) != NULL) {
181 if (est->stats != stats) {
182 pest = &est->next;
183 continue;
186 write_lock_bh(&est_lock);
187 *pest = est->next;
188 write_unlock_bh(&est_lock);
190 kfree(est);
191 killed++;
193 if (killed && elist[idx].list == NULL)
194 del_timer(&elist[idx].timer);