2 * Copyright (c) 2001-2002 Luigi Rizzo
4 * Supported by: the Xorp Project (www.xorp.org)
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * $FreeBSD: src/sys/kern/kern_poll.c,v 1.2.2.4 2002/06/27 23:26:33 luigi Exp $
28 * $DragonFly: src/sys/kern/kern_poll.c,v 1.28 2007/08/04 08:25:37 sephe Exp $
31 #include "opt_polling.h"
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/kernel.h>
36 #include <sys/socket.h> /* needed by net/if.h */
37 #include <sys/sysctl.h>
39 #include <sys/thread2.h>
40 #include <sys/msgport2.h>
42 #include <net/if.h> /* for IFF_* flags */
43 #include <net/netisr.h> /* for NETISR_POLL */
45 /* the two netisr handlers */
46 static int sysctl_pollhz(SYSCTL_HANDLER_ARGS
);
47 static int sysctl_polling(SYSCTL_HANDLER_ARGS
);
48 static void netisr_poll(struct netmsg
*);
49 static void netisr_pollmore(struct netmsg
*);
50 static void pollclock(systimer_t
, struct intrframe
*);
52 void init_device_poll(void); /* init routine */
55 * Polling support for [network] device drivers.
57 * Drivers which support this feature try to register with the polling code.
60 * If registration is successful, the driver must disable interrupts,
61 * and further I/O is performed through the handler, which is invoked
62 * (at least once per clock tick) with 3 arguments: the "arg" passed at
63 * register time (a struct ifnet pointer), a command, and a "count" limit.
65 * The command can be one of the following:
66 * POLL_ONLY: quick move of "count" packets from input/output queues.
67 * POLL_AND_CHECK_STATUS: as above, plus check status registers or do
68 * other more expensive operations. This command is issued periodically
69 * but less frequently than POLL_ONLY.
70 * POLL_DEREGISTER: deregister and return to interrupt mode.
71 * POLL_REGISTER: register and disable interrupts
73 * The first two commands are only issued if the interface is marked as
74 * 'IFF_UP and IFF_RUNNING'; POLL_DEREGISTER only if IFF_RUNNING is set.
76 * The count limit specifies how much work the handler can do during the
77 * call -- typically this is the number of packets to be received, or
78 * transmitted, etc. (drivers are free to interpret this number, as long
79 * as the max time spent in the function grows roughly linearly with the count).
82 * Deregistration can be requested by the driver itself (typically in the
83 * *_stop() routine), or by the polling code, by invoking the handler.
85 * Polling can be globally enabled or disabled with the sysctl variable
86 * kern.polling.enable (default is 0, disabled)
88 * A second variable controls the sharing of CPU between polling/kernel
89 * network processing, and other activities (typically userlevel tasks):
90 * kern.polling.user_frac (between 0 and 100, default 50) sets the share
91 * of CPU allocated to user tasks. CPU is allocated proportionally to the
92 * shares, by dynamically adjusting the "count" (poll_burst).
94 * Other parameters should be left at their default values.
95 * The following constraints hold
97 * 1 <= poll_each_burst <= poll_burst <= poll_burst_max
98 * MIN_POLL_BURST_MAX <= poll_burst_max <= MAX_POLL_BURST_MAX
/*
 * Bounds enforced on the kern.polling.burst_max sysctl value.
 */
#define MIN_POLL_BURST_MAX		10
#define MAX_POLL_BURST_MAX		1000

/*
 * Upper bound for the polling frequency.  The value may be supplied by
 * the build; this is only the fallback.  The guard must be closed right
 * here so that the default frequency below is always defined.
 */
#ifndef DEVICE_POLLING_FREQ_MAX
#define DEVICE_POLLING_FREQ_MAX		30000
#endif

/* Initial value of the polling frequency (pollhz). */
#define DEVICE_POLLING_FREQ_DEFAULT	2000
109 SYSCTL_NODE(_kern
, OID_AUTO
, polling
, CTLFLAG_RW
, 0,
110 "Device polling parameters");
112 static u_int32_t poll_burst
= 5;
113 SYSCTL_UINT(_kern_polling
, OID_AUTO
, burst
, CTLFLAG_RW
,
114 &poll_burst
, 0, "Current polling burst size");
116 static u_int32_t poll_each_burst
= 5;
117 SYSCTL_UINT(_kern_polling
, OID_AUTO
, each_burst
, CTLFLAG_RW
,
118 &poll_each_burst
, 0, "Max size of each burst");
120 static u_int32_t poll_burst_max
= 150; /* good for 100Mbit net and HZ=1000 */
121 SYSCTL_UINT(_kern_polling
, OID_AUTO
, burst_max
, CTLFLAG_RW
,
122 &poll_burst_max
, 0, "Max Polling burst size");
124 static u_int32_t user_frac
= 50;
125 SYSCTL_UINT(_kern_polling
, OID_AUTO
, user_frac
, CTLFLAG_RW
,
126 &user_frac
, 0, "Desired user fraction of cpu time");
128 static u_int32_t reg_frac
= 20 ;
129 SYSCTL_UINT(_kern_polling
, OID_AUTO
, reg_frac
, CTLFLAG_RW
,
130 ®_frac
, 0, "Every this many cycles poll register");
132 static u_int32_t short_ticks
;
133 SYSCTL_UINT(_kern_polling
, OID_AUTO
, short_ticks
, CTLFLAG_RW
,
134 &short_ticks
, 0, "Hardclock ticks shorter than they should be");
136 static u_int32_t lost_polls
;
137 SYSCTL_UINT(_kern_polling
, OID_AUTO
, lost_polls
, CTLFLAG_RW
,
138 &lost_polls
, 0, "How many times we would have lost a poll tick");
140 static u_int32_t pending_polls
;
141 SYSCTL_UINT(_kern_polling
, OID_AUTO
, pending_polls
, CTLFLAG_RD
,
142 &pending_polls
, 0, "Do we need to poll again");
144 static int residual_burst
= 0;
145 SYSCTL_INT(_kern_polling
, OID_AUTO
, residual_burst
, CTLFLAG_RW
,
146 &residual_burst
, 0, "# of residual cycles in burst");
148 static u_int32_t poll_handlers
; /* next free entry in pr[]. */
149 SYSCTL_UINT(_kern_polling
, OID_AUTO
, handlers
, CTLFLAG_RD
,
150 &poll_handlers
, 0, "Number of registered poll handlers");
152 static int polling_enabled
= 0; /* global polling enable */
153 TUNABLE_INT("kern.polling.enable", &polling_enabled
);
154 SYSCTL_PROC(_kern_polling
, OID_AUTO
, enable
, CTLTYPE_INT
| CTLFLAG_RW
,
155 0, 0, sysctl_polling
, "I", "Polling enabled");
157 static u_int32_t phase
;
158 SYSCTL_UINT(_kern_polling
, OID_AUTO
, phase
, CTLFLAG_RD
,
159 &phase
, 0, "Polling phase");
161 static u_int32_t suspect
;
162 SYSCTL_UINT(_kern_polling
, OID_AUTO
, suspect
, CTLFLAG_RW
,
163 &suspect
, 0, "suspect event");
165 static u_int32_t stalled
;
166 SYSCTL_UINT(_kern_polling
, OID_AUTO
, stalled
, CTLFLAG_RW
,
167 &stalled
, 0, "potential stalls");
169 static int pollhz
= DEVICE_POLLING_FREQ_DEFAULT
;
170 TUNABLE_INT("kern.polling.pollhz", &pollhz
);
171 SYSCTL_PROC(_kern_polling
, OID_AUTO
, pollhz
, CTLTYPE_INT
| CTLFLAG_RW
,
172 0, 0, sysctl_pollhz
, "I", "Device polling frequency");
174 #define POLL_LIST_LEN 128
179 static struct pollrec pr
[POLL_LIST_LEN
];
180 static struct systimer gd0_pollclock
;
183 * register relevant netisr. Called from kern_clock.c:
186 init_device_poll(void)
188 netisr_register(NETISR_POLL
, cpu0_portfn
, netisr_poll
);
189 netisr_register(NETISR_POLLMORE
, cpu0_portfn
, netisr_pollmore
);
190 systimer_init_periodic_nq(&gd0_pollclock
, pollclock
, NULL
,
191 polling_enabled
? pollhz
: 1);
195 * Set the polling frequency
198 sysctl_pollhz(SYSCTL_HANDLER_ARGS
)
203 error
= sysctl_handle_int(oidp
, &phz
, 0, req
);
204 if (error
|| req
->newptr
== NULL
)
208 else if (phz
> DEVICE_POLLING_FREQ_MAX
)
209 phz
= DEVICE_POLLING_FREQ_MAX
;
214 systimer_adjust_periodic(&gd0_pollclock
, phz
);
220 * Master enable. If polling is disabled, cut the polling systimer frequency to 1.
224 sysctl_polling(SYSCTL_HANDLER_ARGS
)
228 enabled
= polling_enabled
;
229 error
= sysctl_handle_int(oidp
, &enabled
, 0, req
);
230 if (error
|| req
->newptr
== NULL
)
232 polling_enabled
= enabled
;
234 systimer_adjust_periodic(&gd0_pollclock
, pollhz
);
236 systimer_adjust_periodic(&gd0_pollclock
, 1);
241 * Hook from hardclock. Tries to schedule a netisr, but keeps track
242 * of lost ticks due to the previous handler taking too long.
243 * Normally, this should not happen, because polling handler should
244 * run for a short time. However, in some cases (e.g. when there are
245 * changes in link status etc.) the drivers take a very long time
246 * (even in the order of milliseconds) to reset and reconfigure the
247 * device, causing apparent lost polls.
249 * The first part of the code is just for debugging purposes, and tries
250 * to count how often hardclock ticks are shorter than they should,
251 * meaning either stray interrupts or delayed events.
253 * WARNING! called from fastint or IPI, the MP lock might not be held.
256 pollclock(systimer_t info __unused
, struct intrframe
*frame __unused
)
258 static struct timeval prev_t
, t
;
261 if (poll_handlers
== 0)
265 delta
= (t
.tv_usec
- prev_t
.tv_usec
) +
266 (t
.tv_sec
- prev_t
.tv_sec
)*1000000;
267 if (delta
* hz
< 500000)
272 if (pending_polls
> 100) {
274 * Too much, assume it has stalled (not always true
275 * see comment above).
286 schednetisr(NETISR_POLL
);
289 if (pending_polls
++ > 0)
294 * netisr_pollmore is called after other netisr's, possibly scheduling
295 * another NETISR_POLL call, or adapting the burst size for the next cycle.
297 * It is very bad to fetch large bursts of packets from a single card at once,
298 * because the burst could take a long time to be completely processed, or
299 * could saturate the intermediate queue (ipintrq or similar) leading to
300 * losses or unfairness. To reduce the problem, and also to account better for
301 * time spent in network-related processing, we split the burst in smaller
302 * chunks of fixed size, giving control to the other netisr's between chunks.
303 * This helps in improving the fairness, reducing livelock (because we
304 * emulate more closely the "process to completion" that we have with
305 * fastforwarding) and accounting for the work performed in low level
306 * handling and forwarding.
309 static struct timeval poll_start_t
;
313 netisr_pollmore(struct netmsg
*msg
)
319 lwkt_replymsg(&msg
->nm_lmsg
, 0);
321 if (residual_burst
> 0) {
322 schednetisr(NETISR_POLL
);
323 /* will run immediately on return, followed by netisrs */
326 /* here we can account time spent in netisr's in this tick */
328 kern_load
= (t
.tv_usec
- poll_start_t
.tv_usec
) +
329 (t
.tv_sec
- poll_start_t
.tv_sec
)*1000000; /* us */
330 kern_load
= (kern_load
* hz
) / 10000; /* 0..100 */
331 if (kern_load
> (100 - user_frac
)) { /* try decrease ticks */
335 if (poll_burst
< poll_burst_max
)
340 if (pending_polls
== 0) { /* we are done */
344 * Last cycle was long and caused us to miss one or more
345 * hardclock ticks. Restart processing again, but slightly
346 * reduce the burst size to prevent that this happens again.
348 poll_burst
-= (poll_burst
/ 8);
351 schednetisr(NETISR_POLL
);
359 * netisr_poll is scheduled by schednetisr when appropriate, typically once per tick.
362 * Note that the message is replied immediately in order to allow a new
363 * ISR to be scheduled in the handler.
365 * XXX each registration should indicate whether it needs a critical
366 * section to operate.
370 netisr_poll(struct netmsg
*msg
)
372 static int reg_frac_count
;
374 enum poll_cmd arg
= POLL_ONLY
;
376 lwkt_replymsg(&msg
->nm_lmsg
, 0);
379 if (residual_burst
== 0) { /* first call in this tick */
380 microuptime(&poll_start_t
);
382 * Check that parameters are consistent with runtime
383 * variables. Some of these tests could be done at sysctl
384 * time, but the savings would be very limited because we
385 * still have to check against reg_frac_count and
386 * poll_each_burst. So, instead of writing separate sysctl
387 * handlers, we do all here.
392 else if (reg_frac
< 1)
394 if (reg_frac_count
> reg_frac
)
395 reg_frac_count
= reg_frac
- 1;
396 if (reg_frac_count
-- == 0) {
397 arg
= POLL_AND_CHECK_STATUS
;
398 reg_frac_count
= reg_frac
- 1;
400 if (poll_burst_max
< MIN_POLL_BURST_MAX
)
401 poll_burst_max
= MIN_POLL_BURST_MAX
;
402 else if (poll_burst_max
> MAX_POLL_BURST_MAX
)
403 poll_burst_max
= MAX_POLL_BURST_MAX
;
405 if (poll_each_burst
< 1)
407 else if (poll_each_burst
> poll_burst_max
)
408 poll_each_burst
= poll_burst_max
;
410 residual_burst
= poll_burst
;
412 cycles
= (residual_burst
< poll_each_burst
) ?
413 residual_burst
: poll_each_burst
;
414 residual_burst
-= cycles
;
416 if (polling_enabled
) {
417 for (i
= 0 ; i
< poll_handlers
; i
++) {
418 struct pollrec
*p
= &pr
[i
];
419 if ((p
->ifp
->if_flags
& (IFF_UP
|IFF_RUNNING
|IFF_POLLING
)) == (IFF_UP
|IFF_RUNNING
|IFF_POLLING
)) {
420 if (lwkt_serialize_try(p
->ifp
->if_serializer
)) {
421 p
->ifp
->if_poll(p
->ifp
, arg
, cycles
);
422 lwkt_serialize_exit(p
->ifp
->if_serializer
);
426 } else { /* unregister */
427 for (i
= 0 ; i
< poll_handlers
; i
++) {
428 struct pollrec
*p
= &pr
[i
];
429 if ((p
->ifp
->if_flags
& IFF_POLLING
) == 0)
432 * Only call the interface deregistration
433 * function if the interface is still running.
436 lwkt_serialize_enter(p
->ifp
->if_serializer
);
437 p
->ifp
->if_flags
&= ~IFF_POLLING
;
438 if (p
->ifp
->if_flags
& IFF_RUNNING
)
439 p
->ifp
->if_poll(p
->ifp
, POLL_DEREGISTER
, 1);
440 lwkt_serialize_exit(p
->ifp
->if_serializer
);
445 schednetisr(NETISR_POLLMORE
);
451 * Try to register routine for polling. Returns 1 if successful
452 * (and polling should be enabled), 0 otherwise.
454 * Called from mainline code only, not called from an interrupt.
457 ether_poll_register(struct ifnet
*ifp
)
461 if (polling_enabled
== 0) /* polling disabled, cannot register */
463 if ((ifp
->if_flags
& IFF_UP
) == 0) /* must be up */
465 if (ifp
->if_flags
& IFF_POLLING
) /* already polling */
467 if (ifp
->if_poll
== NULL
) /* no polling support */
471 * Attempt to register. Interlock with IFF_POLLING.
473 crit_enter(); /* XXX MP - not mp safe */
474 lwkt_serialize_enter(ifp
->if_serializer
);
475 ifp
->if_flags
|= IFF_POLLING
;
476 ifp
->if_poll(ifp
, POLL_REGISTER
, 0);
477 lwkt_serialize_exit(ifp
->if_serializer
);
478 if ((ifp
->if_flags
& IFF_POLLING
) == 0) {
484 * Check if there is room. If there isn't, deregister.
486 if (poll_handlers
>= POLL_LIST_LEN
) {
488 * List full, cannot register more entries.
489 * This should never happen; if it does, it is probably a
490 * broken driver trying to register multiple times. Checking
491 * this at runtime is expensive, and won't solve the problem
492 * anyways, so just report a few times and then give up.
494 static int verbose
= 10 ;
496 kprintf("poll handlers list full, "
497 "maybe a broken driver ?\n");
500 ifp
->if_flags
&= ~IFF_POLLING
;
501 lwkt_serialize_enter(ifp
->if_serializer
);
502 ifp
->if_poll(ifp
, POLL_DEREGISTER
, 0);
503 lwkt_serialize_exit(ifp
->if_serializer
);
506 pr
[poll_handlers
].ifp
= ifp
;
515 * Remove interface from the polling list. Occurs when polling is turned
516 * off. Called from mainline code only, not called from an interrupt.
519 ether_poll_deregister(struct ifnet
*ifp
)
524 if (ifp
== NULL
|| (ifp
->if_flags
& IFF_POLLING
) == 0) {
528 for (i
= 0 ; i
< poll_handlers
; i
++) {
529 if (pr
[i
].ifp
== ifp
) /* found it */
532 ifp
->if_flags
&= ~IFF_POLLING
; /* found or not... */
533 if (i
== poll_handlers
) {
535 kprintf("ether_poll_deregister: ifp not found!!!\n");
539 if (i
< poll_handlers
) { /* Last entry replaces this one. */
540 pr
[i
].ifp
= pr
[poll_handlers
].ifp
;
545 * Only call the deregistration function if the interface is still running.
548 if (ifp
->if_flags
& IFF_RUNNING
) {
549 lwkt_serialize_enter(ifp
->if_serializer
);
550 ifp
->if_poll(ifp
, POLL_DEREGISTER
, 1);
551 lwkt_serialize_exit(ifp
->if_serializer
);
557 emergency_poll_enable(const char *name
)
559 if (polling_enabled
== 0) {
561 kprintf("%s forced polling on\n", name
);