/* Copyright (c) 2013-2016, The Tor Project, Inc. */
/* See LICENSE for licensing information */

/**
 * \file scheduler.c
 * \brief Relay scheduling system
 **/

#include "or.h"

#define TOR_CHANNEL_INTERNAL_ /* For channel_flush_some_cells() */
#include "channel.h"

#include "compat_libevent.h"
#define SCHEDULER_PRIVATE_
#include "scheduler.h"

#include <event2/event.h>

/*
 * Scheduler high/low watermarks
 */

static uint32_t sched_q_low_water = 16384;
static uint32_t sched_q_high_water = 32768;

/*
 * Maximum cells to flush in a single call to channel_flush_some_cells();
 * setting this low means more calls, but too high and we could overshoot
 * sched_q_high_water.
 */

static uint32_t sched_max_flush_cells = 16;
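
/*
 * Illustrative note (derived from the loop in scheduler_run() below):
 * the queue heuristic is re-checked against sched_q_high_water before
 * each call to channel_flush_some_cells(), so the worst-case overshoot
 * past the high watermark is bounded by what a single call can add,
 * since at most sched_max_flush_cells (16 by default) cells are flushed
 * per call.
 */
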
/*
 * Write scheduling works by keeping track of which channels can
 * accept cells, and have cells to write.  From the scheduler's perspective,
 * a channel can be in four possible states:
 *
 * 1.) Not open for writes, no cells to send
 *   - Not much to do here, and the channel will have scheduler_state ==
 *     SCHED_CHAN_IDLE
 *   - Transitions from:
 *     - Open for writes/has cells by simultaneously draining all circuit
 *       queues and filling the output buffer.
 *   - Transitions to:
 *     - Not open for writes/has cells by arrival of cells on an attached
 *       circuit (this would be driven from append_cell_to_circuit_queue())
 *     - Open for writes/no cells by a channel type specific path;
 *       driven from connection_or_flushed_some() for channel_tls_t.
 *
 * 2.) Open for writes, no cells to send
 *   - Not much here either; this will be the state an idle but open channel
 *     can be expected to settle in.  It will have scheduler_state ==
 *     SCHED_CHAN_WAITING_FOR_CELLS
 *   - Transitions from:
 *     - Not open for writes/no cells by flushing some of the output
 *       buffer.
 *     - Open for writes/has cells by the scheduler moving cells from
 *       circuit queues to channel output queue, but not having enough
 *       to fill the output queue.
 *   - Transitions to:
 *     - Open for writes/has cells by arrival of new cells on an attached
 *       circuit, in append_cell_to_circuit_queue()
 *
 * 3.) Not open for writes, cells to send
 *   - This is the state of a busy circuit limited by output bandwidth;
 *     cells have piled up in the circuit queues waiting to be relayed.
 *     The channel will have scheduler_state == SCHED_CHAN_WAITING_TO_WRITE.
 *   - Transitions from:
 *     - Not open for writes/no cells by arrival of cells on an attached
 *       circuit
 *     - Open for writes/has cells by filling an output buffer without
 *       draining all cells from attached circuits
 *   - Transitions to:
 *     - Open for writes/has cells by draining some of the output buffer
 *       via the connection_or_flushed_some() path (for channel_tls_t).
 *
 * 4.) Open for writes, cells to send
 *   - This connection is ready to relay some cells and waiting for
 *     the scheduler to choose it.  The channel will have scheduler_state ==
 *     SCHED_CHAN_PENDING.
 *   - Transitions from:
 *     - Not open for writes/has cells by the connection_or_flushed_some()
 *       path
 *     - Open for writes/no cells by the append_cell_to_circuit_queue()
 *       path
 *   - Transitions to:
 *     - Not open for writes/no cells by draining all circuit queues and
 *       simultaneously filling the output buffer.
 *     - Not open for writes/has cells by writing enough cells to fill the
 *       output buffer
 *     - Open for writes/no cells by draining all attached circuit queues
 *       without also filling the output buffer
 *
 * Other event-driven parts of the code move channels between these scheduling
 * states by calling scheduler functions; the scheduler only runs on
 * open-for-writes/has-cells channels and is the only path for those to
 * transition to other states.  The scheduler_run() function gives us the
 * opportunity to do scheduling work, and is called from other scheduler
 * functions whenever a state transition occurs, and periodically from the
 * main event loop.
 */
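
/*
 * Example walk-through of the state machine above: an idle channel
 * (SCHED_CHAN_IDLE) receives a cell via append_cell_to_circuit_queue()
 * and moves to waiting_to_write; once the connection_or_flushed_some()
 * path reports room in the output buffer, it becomes pending;
 * scheduler_run() then flushes its cells, and the channel settles back
 * into waiting_for_cells (or idle, if it exactly filled the output
 * buffer with all available cells).
 */
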
/* Scheduler global data structures */

/*
 * We keep a list of channels that are pending - i.e., have cells to write
 * and can accept them to send.  The enum scheduler_state in channel_t
 * is reserved for our use.
 */

/* Pqueue of channels that can write and have cells (pending work) */
STATIC smartlist_t *channels_pending = NULL;

/*
 * This event runs the scheduler from its callback, and is manually
 * activated whenever a channel enters open for writes/cells to send.
 */

STATIC struct event *run_sched_ev = NULL;

/*
 * Queue heuristic; this is not the queue size, but an 'effective queue size'
 * that ages out contributions from stalled channels.
 */

STATIC uint64_t queue_heuristic = 0;

/*
 * Timestamp for last queue heuristic update
 */

STATIC time_t queue_heuristic_timestamp = 0;

/* Scheduler static function declarations */

static void scheduler_evt_callback(evutil_socket_t fd,
                                   short events, void *arg);
static int scheduler_more_work(void);
static void scheduler_retrigger(void);
#if 0
static void scheduler_trigger(void);
#endif

/* Scheduler function implementations */

/** Free everything and shut down the scheduling system */

void
scheduler_free_all(void)
{
  log_debug(LD_SCHED, "Shutting down scheduler");

  if (run_sched_ev) {
    if (event_del(run_sched_ev) < 0) {
      log_warn(LD_BUG, "Problem deleting run_sched_ev");
    }
    tor_event_free(run_sched_ev);
    run_sched_ev = NULL;
  }

  if (channels_pending) {
    smartlist_free(channels_pending);
    channels_pending = NULL;
  }
}

/**
 * Comparison function to use when sorting pending channels
 */

MOCK_IMPL(STATIC int,
scheduler_compare_channels, (const void *c1_v, const void *c2_v))
{
  channel_t *c1 = NULL, *c2 = NULL;
  /* These are a workaround for -Wbad-function-cast throwing a fit */
  const circuitmux_policy_t *p1, *p2;
  uintptr_t p1_i, p2_i;

  tor_assert(c1_v);
  tor_assert(c2_v);

  c1 = (channel_t *)(c1_v);
  c2 = (channel_t *)(c2_v);

  tor_assert(c1);
  tor_assert(c2);

  if (c1 != c2) {
    if (circuitmux_get_policy(c1->cmux) ==
        circuitmux_get_policy(c2->cmux)) {
      /* Same cmux policy, so use the mux comparison */
      return circuitmux_compare_muxes(c1->cmux, c2->cmux);
    } else {
      /*
       * Different policies; not important to get this edge case perfect
       * because the current code never actually gives different channels
       * different cmux policies anyway.  Just use this arbitrary but
       * definite choice.
       */
      p1 = circuitmux_get_policy(c1->cmux);
      p2 = circuitmux_get_policy(c2->cmux);
      p1_i = (uintptr_t)p1;
      p2_i = (uintptr_t)p2;

      return (p1_i < p2_i) ? -1 : 1;
    }
  } else {
    /* c1 == c2, so always equal */
    return 0;
  }
}
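
/*
 * Note: the smartlist_pqueue_* functions keep channels_pending ordered as
 * a heap under this comparator, so the channel that compares lowest (i.e.
 * the one circuitmux_compare_muxes() ranks first) is what
 * smartlist_pqueue_pop() hands to scheduler_run() first.
 */
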
/**
 * Scheduler event callback; this should get triggered once per event loop
 * if any scheduling work was created during the event loop.
 */

static void
scheduler_evt_callback(evutil_socket_t fd, short events, void *arg)
{
  (void)fd;
  (void)events;
  (void)arg;
  log_debug(LD_SCHED, "Scheduler event callback called");

  tor_assert(run_sched_ev);

  /* Run the scheduler */
  scheduler_run();

  /* Do we have more work to do? */
  if (scheduler_more_work()) scheduler_retrigger();
}

/** Mark a channel as no longer ready to accept writes */

MOCK_IMPL(void,
scheduler_channel_doesnt_want_writes,(channel_t *chan))
{
  tor_assert(chan);

  tor_assert(channels_pending);

  /* If it's already in pending, we can put it in waiting_to_write */
  if (chan->scheduler_state == SCHED_CHAN_PENDING) {
    /*
     * It's in channels_pending, so it shouldn't be in any of
     * the other lists.  It can't write any more, so it goes to
     * channels_waiting_to_write.
     */
    smartlist_pqueue_remove(channels_pending,
                            scheduler_compare_channels,
                            STRUCT_OFFSET(channel_t, sched_heap_idx),
                            chan);
    chan->scheduler_state = SCHED_CHAN_WAITING_TO_WRITE;
    log_debug(LD_SCHED,
              "Channel " U64_FORMAT " at %p went from pending "
              "to waiting_to_write",
              U64_PRINTF_ARG(chan->global_identifier), chan);
  } else {
    /*
     * It's not in pending, so it can't become waiting_to_write; it's
     * either not in any of the lists (nothing to do) or it's already in
     * waiting_for_cells (remove it, can't write any more).
     */
    if (chan->scheduler_state == SCHED_CHAN_WAITING_FOR_CELLS) {
      chan->scheduler_state = SCHED_CHAN_IDLE;
      log_debug(LD_SCHED,
                "Channel " U64_FORMAT " at %p left waiting_for_cells",
                U64_PRINTF_ARG(chan->global_identifier), chan);
    }
  }
}

/** Mark a channel as having waiting cells */

MOCK_IMPL(void,
scheduler_channel_has_waiting_cells,(channel_t *chan))
{
  int became_pending = 0;

  tor_assert(chan);
  tor_assert(channels_pending);

  /* First, check if this one is also writeable */
  if (chan->scheduler_state == SCHED_CHAN_WAITING_FOR_CELLS) {
    /*
     * It's in channels_waiting_for_cells, so it shouldn't be in any of
     * the other lists.  It has waiting cells now, so it goes to
     * channels_pending.
     */
    chan->scheduler_state = SCHED_CHAN_PENDING;
    smartlist_pqueue_add(channels_pending,
                         scheduler_compare_channels,
                         STRUCT_OFFSET(channel_t, sched_heap_idx),
                         chan);
    log_debug(LD_SCHED,
              "Channel " U64_FORMAT " at %p went from waiting_for_cells "
              "to pending",
              U64_PRINTF_ARG(chan->global_identifier), chan);
    became_pending = 1;
  } else {
    /*
     * It's not in waiting_for_cells, so it can't become pending; it's
     * either not in any of the lists (we add it to waiting_to_write)
     * or it's already in waiting_to_write or pending (we do nothing)
     */
    if (!(chan->scheduler_state == SCHED_CHAN_WAITING_TO_WRITE ||
          chan->scheduler_state == SCHED_CHAN_PENDING)) {
      chan->scheduler_state = SCHED_CHAN_WAITING_TO_WRITE;
      log_debug(LD_SCHED,
                "Channel " U64_FORMAT " at %p entered waiting_to_write",
                U64_PRINTF_ARG(chan->global_identifier), chan);
    }
  }

  /*
   * If we made a channel pending, we potentially have scheduling work
   * to do.
   */
  if (became_pending) scheduler_retrigger();
}

/** Set up the scheduling system */

void
scheduler_init(void)
{
  log_debug(LD_SCHED, "Initting scheduler");

  tor_assert(!run_sched_ev);
  run_sched_ev = tor_event_new(tor_libevent_get_base(), -1,
                               0, scheduler_evt_callback, NULL);

  channels_pending = smartlist_new();
  queue_heuristic = 0;
  queue_heuristic_timestamp = approx_time();
}
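
/*
 * Note on the event setup above: the event is created with fd -1 and no
 * flags (no I/O, not EV_PERSIST), so it never fires on its own; it only
 * runs when something marks it active with event_active(), as
 * scheduler_retrigger() does.
 */
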
/** Check if there's more scheduling work */

static int
scheduler_more_work(void)
{
  tor_assert(channels_pending);

  return ((scheduler_get_queue_heuristic() < sched_q_low_water) &&
          ((smartlist_len(channels_pending) > 0))) ? 1 : 0;
}
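
/*
 * In other words: there is more work only while the queue heuristic is
 * below the low watermark (16384 by default) AND at least one channel is
 * still pending; once either condition fails, the event callback stops
 * retriggering itself.
 */
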
/** Retrigger the scheduler in a way safe to use from the callback */

static void
scheduler_retrigger(void)
{
  tor_assert(run_sched_ev);
  event_active(run_sched_ev, EV_TIMEOUT, 1);
}
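
/*
 * event_active() marks the event as active directly (the final argument
 * is libevent's ncalls count), so the callback runs from the event loop
 * rather than reentrantly; that is what makes this safe to call from
 * inside scheduler_evt_callback() itself.
 */
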
/** Notify the scheduler of a channel being closed */

MOCK_IMPL(void,
scheduler_release_channel,(channel_t *chan))
{
  tor_assert(chan);
  tor_assert(channels_pending);

  if (chan->scheduler_state == SCHED_CHAN_PENDING) {
    smartlist_pqueue_remove(channels_pending,
                            scheduler_compare_channels,
                            STRUCT_OFFSET(channel_t, sched_heap_idx),
                            chan);
  }

  chan->scheduler_state = SCHED_CHAN_IDLE;
}
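
/*
 * Note that this unconditionally resets the state to SCHED_CHAN_IDLE:
 * a channel in waiting_for_cells or waiting_to_write is not in the
 * pqueue, so only the pending case needs an explicit removal first.
 */
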
/** Run the scheduling algorithm if necessary */

MOCK_IMPL(void,
scheduler_run, (void))
{
  int n_cells, n_chans_before, n_chans_after;
  uint64_t q_len_before, q_heur_before, q_len_after, q_heur_after;
  ssize_t flushed, flushed_this_time;
  smartlist_t *to_readd = NULL;
  channel_t *chan = NULL;

  log_debug(LD_SCHED, "We have a chance to run the scheduler");

  if (scheduler_get_queue_heuristic() < sched_q_low_water) {
    n_chans_before = smartlist_len(channels_pending);
    q_len_before = channel_get_global_queue_estimate();
    q_heur_before = scheduler_get_queue_heuristic();

    while (scheduler_get_queue_heuristic() <= sched_q_high_water &&
           smartlist_len(channels_pending) > 0) {
      /* Pop off a channel */
      chan = smartlist_pqueue_pop(channels_pending,
                                  scheduler_compare_channels,
                                  STRUCT_OFFSET(channel_t, sched_heap_idx));
      tor_assert(chan);

      /* Figure out how many cells we can write */
      n_cells = channel_num_cells_writeable(chan);
      if (n_cells > 0) {
        log_debug(LD_SCHED,
                  "Scheduler saw pending channel " U64_FORMAT " at %p with "
                  "%d cells writeable",
                  U64_PRINTF_ARG(chan->global_identifier), chan, n_cells);

        flushed = 0;
        while (flushed < n_cells &&
               scheduler_get_queue_heuristic() <= sched_q_high_water) {
          flushed_this_time =
            channel_flush_some_cells(chan,
                                     MIN(sched_max_flush_cells,
                                         (size_t) n_cells - flushed));
          if (flushed_this_time <= 0) break;
          flushed += flushed_this_time;
        }

        if (flushed < n_cells) {
          /* We ran out of cells to flush */
          chan->scheduler_state = SCHED_CHAN_WAITING_FOR_CELLS;
          log_debug(LD_SCHED,
                    "Channel " U64_FORMAT " at %p "
                    "entered waiting_for_cells from pending",
                    U64_PRINTF_ARG(chan->global_identifier),
                    chan);
        } else {
          /* The channel may still have some cells */
          if (channel_more_to_flush(chan)) {
            /* The channel goes to either pending or waiting_to_write */
            if (channel_num_cells_writeable(chan) > 0) {
              /* Add it back to pending later */
              if (!to_readd) to_readd = smartlist_new();
              smartlist_add(to_readd, chan);
              log_debug(LD_SCHED,
                        "Channel " U64_FORMAT " at %p "
                        "is still pending",
                        U64_PRINTF_ARG(chan->global_identifier),
                        chan);
            } else {
              /* It's waiting to be able to write more */
              chan->scheduler_state = SCHED_CHAN_WAITING_TO_WRITE;
              log_debug(LD_SCHED,
                        "Channel " U64_FORMAT " at %p "
                        "entered waiting_to_write from pending",
                        U64_PRINTF_ARG(chan->global_identifier),
                        chan);
            }
          } else {
            /* No cells left; it can go to idle or waiting_for_cells */
            if (channel_num_cells_writeable(chan) > 0) {
              /*
               * It can still accept writes, so it goes to
               * waiting_for_cells
               */
              chan->scheduler_state = SCHED_CHAN_WAITING_FOR_CELLS;
              log_debug(LD_SCHED,
                        "Channel " U64_FORMAT " at %p "
                        "entered waiting_for_cells from pending",
                        U64_PRINTF_ARG(chan->global_identifier),
                        chan);
            } else {
              /*
               * We exactly filled up the output queue with all available
               * cells; go to idle.
               */
              chan->scheduler_state = SCHED_CHAN_IDLE;
              log_debug(LD_SCHED,
                        "Channel " U64_FORMAT " at %p "
                        "became idle from pending",
                        U64_PRINTF_ARG(chan->global_identifier),
                        chan);
            }
          }
        }

        log_debug(LD_SCHED,
                  "Scheduler flushed %d cells onto pending channel "
                  U64_FORMAT " at %p",
                  (int)flushed, U64_PRINTF_ARG(chan->global_identifier),
                  chan);
      } else {
        log_info(LD_SCHED,
                 "Scheduler saw pending channel " U64_FORMAT " at %p with "
                 "no cells writeable",
                 U64_PRINTF_ARG(chan->global_identifier), chan);
        /* Put it back to WAITING_TO_WRITE */
        chan->scheduler_state = SCHED_CHAN_WAITING_TO_WRITE;
      }
    }

    /* Readd any channels we need to */
    if (to_readd) {
      SMARTLIST_FOREACH_BEGIN(to_readd, channel_t *, readd_chan) {
        readd_chan->scheduler_state = SCHED_CHAN_PENDING;
        smartlist_pqueue_add(channels_pending,
                             scheduler_compare_channels,
                             STRUCT_OFFSET(channel_t, sched_heap_idx),
                             readd_chan);
      } SMARTLIST_FOREACH_END(readd_chan);
      smartlist_free(to_readd);
    }

    n_chans_after = smartlist_len(channels_pending);
    q_len_after = channel_get_global_queue_estimate();
    q_heur_after = scheduler_get_queue_heuristic();
    log_debug(LD_SCHED,
              "Scheduler handled %d of %d pending channels, queue size from "
              U64_FORMAT " to " U64_FORMAT ", queue heuristic from "
              U64_FORMAT " to " U64_FORMAT,
              n_chans_before - n_chans_after, n_chans_before,
              U64_PRINTF_ARG(q_len_before), U64_PRINTF_ARG(q_len_after),
              U64_PRINTF_ARG(q_heur_before), U64_PRINTF_ARG(q_heur_after));
  }
}
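
/*
 * Worked example of the flush loop above: if a popped channel reports
 * n_cells = 100 writeable and the defaults are in effect, the inner loop
 * makes up to 7 calls to channel_flush_some_cells() (6 * 16 = 96 cells,
 * then 4 more), stopping early if a call flushes nothing or the queue
 * heuristic crosses sched_q_high_water.
 */
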
/** Trigger the scheduling event so we run the scheduler later */

#if 0
static void
scheduler_trigger(void)
{
  log_debug(LD_SCHED, "Triggering scheduler event");

  tor_assert(run_sched_ev);

  event_add(run_sched_ev, EV_TIMEOUT, 1);
}
#endif
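
/*
 * Note: the block above is compiled out, and as written it would not
 * compile anyway; libevent's event_add() takes (struct event *,
 * const struct timeval *), not the three arguments passed here.
 * scheduler_retrigger() with event_active() is the live code path.
 */
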
/** Mark a channel as ready to accept writes */

void
scheduler_channel_wants_writes(channel_t *chan)
{
  int became_pending = 0;

  tor_assert(chan);
  tor_assert(channels_pending);

  /* If it's already in waiting_to_write, we can put it in pending */
  if (chan->scheduler_state == SCHED_CHAN_WAITING_TO_WRITE) {
    /*
     * It can write now, so it goes to channels_pending.
     */
    smartlist_pqueue_add(channels_pending,
                         scheduler_compare_channels,
                         STRUCT_OFFSET(channel_t, sched_heap_idx),
                         chan);
    chan->scheduler_state = SCHED_CHAN_PENDING;
    log_debug(LD_SCHED,
              "Channel " U64_FORMAT " at %p went from waiting_to_write "
              "to pending",
              U64_PRINTF_ARG(chan->global_identifier), chan);
    became_pending = 1;
  } else {
    /*
     * It's not in SCHED_CHAN_WAITING_TO_WRITE, so it can't become pending;
     * it's either idle and goes to WAITING_FOR_CELLS, or it's a no-op.
     */
    if (!(chan->scheduler_state == SCHED_CHAN_WAITING_FOR_CELLS ||
          chan->scheduler_state == SCHED_CHAN_PENDING)) {
      chan->scheduler_state = SCHED_CHAN_WAITING_FOR_CELLS;
      log_debug(LD_SCHED,
                "Channel " U64_FORMAT " at %p entered waiting_for_cells",
                U64_PRINTF_ARG(chan->global_identifier), chan);
    }
  }

  /*
   * If we made a channel pending, we potentially have scheduling work
   * to do.
   */
  if (became_pending) scheduler_retrigger();
}

/**
 * Notify the scheduler that a channel's position in the pqueue may have
 * changed
 */

void
scheduler_touch_channel(channel_t *chan)
{
  tor_assert(chan);

  if (chan->scheduler_state == SCHED_CHAN_PENDING) {
    /* Remove and re-add it */
    smartlist_pqueue_remove(channels_pending,
                            scheduler_compare_channels,
                            STRUCT_OFFSET(channel_t, sched_heap_idx),
                            chan);
    smartlist_pqueue_add(channels_pending,
                         scheduler_compare_channels,
                         STRUCT_OFFSET(channel_t, sched_heap_idx),
                         chan);
  }
  /* else no-op, since it isn't in the queue */
}
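
/*
 * The remove/re-add pair restores the heap invariant after this channel's
 * sort order changes, e.g. when activity on its circuits alters what
 * circuitmux_compare_muxes() would return for it.
 */
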
/**
 * Notify the scheduler of a queue size adjustment, to recalculate the
 * queue heuristic.
 */

void
scheduler_adjust_queue_size(channel_t *chan, int dir, uint64_t adj)
{
  time_t now = approx_time();

  log_debug(LD_SCHED,
            "Queue size adjustment by %s" U64_FORMAT " for channel "
            U64_FORMAT,
            (dir >= 0) ? "+" : "-",
            U64_PRINTF_ARG(adj),
            U64_PRINTF_ARG(chan->global_identifier));

  /* Get the queue heuristic up to date */
  scheduler_update_queue_heuristic(now);

  /* Adjust as appropriate */
  if (dir >= 0) {
    /* Increasing it */
    queue_heuristic += adj;
  } else {
    /* Decreasing it */
    if (queue_heuristic > adj) queue_heuristic -= adj;
    else queue_heuristic = 0;
  }

  log_debug(LD_SCHED,
            "Queue heuristic is now " U64_FORMAT,
            U64_PRINTF_ARG(queue_heuristic));
}
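
/*
 * The clamp in the decrease branch matters because the age-out below can
 * shrink the heuristic between an increase and the matching decrease;
 * without it, subtracting the original adjustment could underflow the
 * unsigned counter.
 */
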
/**
 * Query the current value of the queue heuristic
 */

STATIC uint64_t
scheduler_get_queue_heuristic(void)
{
  time_t now = approx_time();

  scheduler_update_queue_heuristic(now);

  return queue_heuristic;
}

/**
 * Adjust the queue heuristic value to the present time
 */

STATIC void
scheduler_update_queue_heuristic(time_t now)
{
  time_t diff;

  if (queue_heuristic_timestamp == 0) {
    /*
     * Nothing we can sensibly do; must not have been initted properly.
     * Oh well.
     */
    queue_heuristic_timestamp = now;
  } else if (queue_heuristic_timestamp < now) {
    diff = now - queue_heuristic_timestamp;
    /*
     * This is a simple exponential age-out; the other proposed alternative
     * was a linear age-out using the bandwidth history in rephist.c; I'm
     * going with this out of concern that if an adversary can jam the
     * scheduler long enough, it would cause the bandwidth to drop to
     * zero and render the aging mechanism ineffective thereafter.
     */
    if (0 <= diff && diff < 64) queue_heuristic >>= diff;
    else queue_heuristic = 0;

    queue_heuristic_timestamp = now;

    log_debug(LD_SCHED,
              "Queue heuristic is now " U64_FORMAT,
              U64_PRINTF_ARG(queue_heuristic));
  }
  /* else no update needed, or time went backward */
}
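
/*
 * Example of the age-out arithmetic: with diff = 1 the heuristic halves,
 * with diff = 10 it drops by a factor of 2^10 = 1024, and at diff >= 64
 * (where the shift would be undefined for a 64-bit value) it is simply
 * zeroed.
 */
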
/**
 * Set scheduler watermarks and flush size
 */

void
scheduler_set_watermarks(uint32_t lo, uint32_t hi, uint32_t max_flush)
{
  /* Sanity assertions - caller should ensure these are true */
  tor_assert(lo > 0);
  tor_assert(hi > lo);
  tor_assert(max_flush > 0);

  sched_q_low_water = lo;
  sched_q_high_water = hi;
  sched_max_flush_cells = max_flush;
}
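
/*
 * A call mirroring the compile-time defaults at the top of this file
 * would look like:
 *
 *   scheduler_set_watermarks(16384, 32768, 16);
 *
 * i.e. low watermark 16384, high watermark 32768, and at most 16 cells
 * per channel_flush_some_cells() call.
 */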