2 * Copyright (c) 2011 Your File System Inc. All rights reserved.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
13 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR `AS IS'' AND ANY EXPRESS OR
14 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
15 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
16 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
17 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
18 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
19 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
20 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
21 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
22 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 /* A reimplementation of the rx_event handler using red/black trees
27 * The first rx_event implementation used a simple sorted queue of all
28 * events, which lead to O(n^2) performance, where n is the number of
29 * outstanding events. This was found to scale poorly, so was replaced.
31 * The second implementation used a set of per-second buckets to store
32 * events. Each bucket (referred to as an epoch in the code) stored all
33 * of the events which expired in that second. However, on modern networks
34 * where RTT times are in the millisecond, most connections will have events
35 * expiring within the next second, so the problem reoccurs.
37 * This new implementation uses Red-Black trees to store a sorted list of
38 * events. Red Black trees are guaranteed to have no worse than O(log N)
39 * insertion, and are commonly used in timer applications
42 #include <afsconfig.h>
43 #include <afs/param.h>
46 # include "afs/sysincludes.h"
47 # include "afsincludes.h"
53 #include <opr/queue.h>
54 #include <opr/rbtree.h>
57 #include "rx_atomic.h"
59 #include "rx_globals.h"
63 struct opr_rbtree_node node
;
64 struct clock eventTime
;
68 void (*func
)(struct rxevent
*, void *, void *, int);
77 struct malloclist
*next
;
82 struct opr_queue list
;
83 struct malloclist
*mallocs
;
88 struct opr_rbtree head
;
89 struct rxevent
*first
;
100 static int allocUnit
= 10;
102 static struct rxevent
*
103 rxevent_alloc(void) {
104 struct rxevent
*evlist
;
106 struct malloclist
*mrec
;
109 MUTEX_ENTER(&freeEvents
.lock
);
110 if (opr_queue_IsEmpty(&freeEvents
.list
)) {
111 MUTEX_EXIT(&freeEvents
.lock
);
113 #if defined(AFS_AIX32_ENV) && defined(KERNEL)
114 ev
= rxi_Alloc(sizeof(struct rxevent
));
116 evlist
= osi_Alloc(sizeof(struct rxevent
) * allocUnit
);
117 mrec
= osi_Alloc(sizeof(struct malloclist
));
120 mrec
->size
= sizeof(struct rxevent
) * allocUnit
;
122 MUTEX_ENTER(&freeEvents
.lock
);
123 for (i
= 1; i
< allocUnit
; i
++) {
124 opr_queue_Append(&freeEvents
.list
, &evlist
[i
].q
);
126 mrec
->next
= freeEvents
.mallocs
;
127 freeEvents
.mallocs
= mrec
;
128 MUTEX_EXIT(&freeEvents
.lock
);
132 ev
= opr_queue_First(&freeEvents
.list
, struct rxevent
, q
);
133 opr_queue_Remove(&ev
->q
);
134 MUTEX_EXIT(&freeEvents
.lock
);
137 memset(ev
, 0, sizeof(struct rxevent
));
138 rx_atomic_set(&ev
->refcnt
, 1);
144 rxevent_free(struct rxevent
*ev
) {
145 MUTEX_ENTER(&freeEvents
.lock
);
146 opr_queue_Prepend(&freeEvents
.list
, &ev
->q
);
147 MUTEX_EXIT(&freeEvents
.lock
);
151 rxevent_put(struct rxevent
*ev
) {
152 if (rx_atomic_dec_and_read(&ev
->refcnt
) == 0) {
158 rxevent_Put(struct rxevent
**ev
)
164 static_inline
struct rxevent
*
165 rxevent_get(struct rxevent
*ev
) {
166 rx_atomic_inc(&ev
->refcnt
);
171 rxevent_Get(struct rxevent
*ev
) {
172 return rxevent_get(ev
);
175 /* Called if the time now is older than the last time we recorded running an
176 * event. This test catches machines where the system time has been set
177 * backwards, and avoids RX completely stalling when timers fail to fire.
179 * Take the different between now and the last event time, and subtract that
180 * from the timing of every event on the system. This does a relatively slow
181 * walk of the completely eventTree, but time-travel will hopefully be a pretty
184 * This can only safely be called from the event thread, as it plays with the
191 struct opr_rbtree_node
*node
;
192 struct clock adjTime
, now
;
194 MUTEX_ENTER(&eventTree
.lock
);
195 /* Time adjustment is expensive, make absolutely certain that we have
196 * to do it, by getting an up to date time to base our decision on
197 * once we've acquired the relevant locks.
200 if (!clock_Lt(&now
, &eventSchedule
.last
))
203 adjTime
= eventSchedule
.last
;
204 clock_Zero(&eventSchedule
.last
);
206 clock_Sub(&adjTime
, &now
);
208 /* If there are no events in the tree, then there's nothing to adjust */
209 if (eventTree
.first
== NULL
)
212 node
= opr_rbtree_first(&eventTree
.head
);
214 struct rxevent
*event
= opr_containerof(node
, struct rxevent
, node
);
216 clock_Sub(&event
->eventTime
, &adjTime
);
217 node
= opr_rbtree_next(node
);
219 eventSchedule
.next
= eventTree
.first
->eventTime
;
222 MUTEX_EXIT(&eventTree
.lock
);
225 static int initialised
= 0;
227 rxevent_Init(int nEvents
, void (*scheduler
)(void))
235 MUTEX_INIT(&eventTree
.lock
, "event tree lock", MUTEX_DEFAULT
, 0);
236 opr_rbtree_init(&eventTree
.head
);
238 MUTEX_INIT(&freeEvents
.lock
, "free events lock", MUTEX_DEFAULT
, 0);
239 opr_queue_Init(&freeEvents
.list
);
240 freeEvents
.mallocs
= NULL
;
245 clock_Zero(&eventSchedule
.next
);
246 clock_Zero(&eventSchedule
.last
);
247 eventSchedule
.raised
= 0;
248 eventSchedule
.func
= scheduler
;
252 rxevent_Post(struct clock
*when
, struct clock
*now
,
253 void (*func
) (struct rxevent
*, void *, void *, int),
254 void *arg
, void *arg1
, int arg2
)
256 struct rxevent
*ev
, *event
;
257 struct opr_rbtree_node
**childptr
, *parent
= NULL
;
259 ev
= rxevent_alloc();
260 ev
->eventTime
= *when
;
266 if (clock_Lt(now
, &eventSchedule
.last
))
269 MUTEX_ENTER(&eventTree
.lock
);
271 /* Work out where in the tree we'll be storing this */
272 childptr
= &eventTree
.head
.root
;
275 event
= opr_containerof((*childptr
), struct rxevent
, node
);
278 if (clock_Lt(when
, &event
->eventTime
))
279 childptr
= &(*childptr
)->left
;
280 else if (clock_Gt(when
, &event
->eventTime
))
281 childptr
= &(*childptr
)->right
;
283 opr_queue_Append(&event
->q
, &ev
->q
);
287 opr_queue_Init(&ev
->q
);
288 opr_rbtree_insert(&eventTree
.head
, parent
, childptr
, &ev
->node
);
290 if (eventTree
.first
== NULL
||
291 clock_Lt(when
, &(eventTree
.first
->eventTime
))) {
292 eventTree
.first
= ev
;
293 eventSchedule
.raised
= 1;
294 clock_Zero(&eventSchedule
.next
);
295 MUTEX_EXIT(&eventTree
.lock
);
296 if (eventSchedule
.func
!= NULL
)
297 (*eventSchedule
.func
)();
298 return rxevent_get(ev
);
302 MUTEX_EXIT(&eventTree
.lock
);
303 return rxevent_get(ev
);
306 /* We're going to remove ev from the tree, so set the first pointer to the
307 * next event after it */
309 resetFirst(struct rxevent
*ev
)
311 struct opr_rbtree_node
*next
= opr_rbtree_next(&ev
->node
);
313 eventTree
.first
= opr_containerof(next
, struct rxevent
, node
);
315 eventTree
.first
= NULL
;
321 * Cancels the event pointed to by evp. Returns true if the event has
322 * been succesfully cancelled, or false if the event has already fired.
326 rxevent_Cancel(struct rxevent
**evp
)
328 struct rxevent
*event
;
336 MUTEX_ENTER(&eventTree
.lock
);
338 if (!event
->handled
) {
339 /* We're a node on the red/black tree. If our list is non-empty,
340 * then swap the first element in the list in in our place,
341 * promoting it to the list head */
342 if (event
->node
.parent
== NULL
343 && eventTree
.head
.root
!= &event
->node
) {
344 /* Not in the rbtree, therefore must be a list element */
345 opr_queue_Remove(&event
->q
);
347 if (!opr_queue_IsEmpty(&event
->q
)) {
348 struct rxevent
*next
;
350 next
= opr_queue_First(&event
->q
, struct rxevent
, q
);
351 opr_queue_Remove(&next
->q
); /* Remove ourselves from list */
352 if (event
->q
.prev
== &event
->q
) {
353 next
->q
.prev
= next
->q
.next
= &next
->q
;
356 next
->q
.prev
->next
= &next
->q
;
357 next
->q
.next
->prev
= &next
->q
;
360 opr_rbtree_replace(&eventTree
.head
, &event
->node
,
363 if (eventTree
.first
== event
)
364 eventTree
.first
= next
;
367 if (eventTree
.first
== event
)
370 opr_rbtree_remove(&eventTree
.head
, &event
->node
);
374 rxevent_put(event
); /* Dispose of eventTree reference */
378 MUTEX_EXIT(&eventTree
.lock
);
381 rxevent_put(event
); /* Dispose of caller's reference */
386 /* Process all events which have expired. If events remain, then the relative
387 * time until the next event is returned in the parameter 'wait', and the
388 * function returns 1. If no events currently remain, the function returns 0
390 * If the current time is older than that of the last event processed, then we
391 * assume that time has gone backwards (for example, due to a system time reset)
392 * When this happens, all events in the current queue are rescheduled, using
393 * the difference between the current time and the last event time as a delta
397 rxevent_RaiseEvents(struct clock
*wait
)
400 struct rxevent
*event
;
405 /* Check for time going backwards */
406 if (clock_Lt(&now
, &eventSchedule
.last
))
408 eventSchedule
.last
= now
;
410 MUTEX_ENTER(&eventTree
.lock
);
411 /* Lock our event tree */
412 while (eventTree
.first
!= NULL
413 && clock_Lt(&eventTree
.first
->eventTime
, &now
)) {
415 /* Grab the next node, either in the event's list, or in the tree node
416 * itself, and remove it from the event tree */
417 event
= eventTree
.first
;
418 if (!opr_queue_IsEmpty(&event
->q
)) {
419 event
= opr_queue_Last(&event
->q
, struct rxevent
, q
);
420 opr_queue_Remove(&event
->q
);
423 opr_rbtree_remove(&eventTree
.head
, &event
->node
);
426 MUTEX_EXIT(&eventTree
.lock
);
428 /* Fire the event, then free the structure */
429 event
->func(event
, event
->arg
, event
->arg1
, event
->arg2
);
432 MUTEX_ENTER(&eventTree
.lock
);
435 /* Figure out when we next need to be scheduled */
436 if (eventTree
.first
!= NULL
) {
437 *wait
= eventSchedule
.next
= eventTree
.first
->eventTime
;
438 ret
= eventSchedule
.raised
= 1;
439 clock_Sub(wait
, &now
);
441 ret
= eventSchedule
.raised
= 0;
444 MUTEX_EXIT(&eventTree
.lock
);
450 shutdown_rxevent(void)
452 struct malloclist
*mrec
, *nmrec
;
457 MUTEX_DESTROY(&eventTree
.lock
);
459 #if !defined(AFS_AIX32_ENV) || !defined(KERNEL)
460 MUTEX_DESTROY(&freeEvents
.lock
);
461 mrec
= freeEvents
.mallocs
;
464 osi_Free(mrec
->mem
, mrec
->size
);
465 osi_Free(mrec
, sizeof(struct malloclist
));