/* Copyright (c) 2013-2016, The Tor Project, Inc. */
/* See LICENSE for licensing information */

#include "or.h"

#define TOR_CHANNEL_INTERNAL_ /* For channel_flush_some_cells() */
#include "channel.h"

#include "compat_libevent.h"
#define SCHEDULER_PRIVATE_
#include "scheduler.h"

#include <event2/event.h>

/*
 * Scheduler high/low watermarks
 */

static uint32_t sched_q_low_water = 16384;
static uint32_t sched_q_high_water = 32768;

/*
 * Maximum cells to flush in a single call to channel_flush_some_cells();
 * setting this low means more calls, but too high and we could overshoot
 * sched_q_high_water.
 */

static uint32_t sched_max_flush_cells = 16;

/**
 * \file scheduler.c
 * \brief Channel scheduling system: decides which channels should send and
 * receive when.
 *
 * This module implements a scheduler algorithm to decide which channels
 * should send/receive when.
 *
 * The earliest versions of Tor approximated a kind of round-robin system
 * among active connections, but only approximated it.
 *
 * Now, write scheduling works by keeping track of which channels can
 * accept cells, and have cells to write.  From the scheduler's perspective,
 * a channel can be in four possible states:
 *
 * <ol>
 * <li>
 *   Not open for writes, no cells to send.
 *   <ul>
 *     <li> Not much to do here, and the channel will have scheduler_state
 *       == SCHED_CHAN_IDLE
 *     <li> Transitions from:
 *       <ul>
 *       <li> Open for writes/has cells by simultaneously draining all circuit
 *         queues and filling the output buffer.
 *       </ul>
 *     <li> Transitions to:
 *       <ul>
 *       <li> Not open for writes/has cells by arrival of cells on an attached
 *         circuit (this would be driven from append_cell_to_circuit_queue())
 *       <li> Open for writes/no cells by a channel type specific path;
 *         driven from connection_or_flushed_some() for channel_tls_t.
 *       </ul>
 *   </ul>
 *
 * <li> Open for writes, no cells to send
 *   <ul>
 *     <li> Not much here either; this will be the state an idle but open
 *       channel can be expected to settle in.  It will have scheduler_state
 *       == SCHED_CHAN_WAITING_FOR_CELLS
 *     <li> Transitions from:
 *       <ul>
 *       <li> Not open for writes/no cells by flushing some of the output
 *         buffer.
 *       <li> Open for writes/has cells by the scheduler moving cells from
 *         circuit queues to channel output queue, but not having enough
 *         to fill the output queue.
 *       </ul>
 *     <li> Transitions to:
 *       <ul>
 *       <li> Open for writes/has cells by arrival of new cells on an attached
 *         circuit, in append_cell_to_circuit_queue()
 *       </ul>
 *   </ul>
 *
 * <li> Not open for writes, cells to send
 *   <ul>
 *     <li> This is the state of a busy circuit limited by output bandwidth;
 *       cells have piled up in the circuit queues waiting to be relayed.
 *       The channel will have scheduler_state == SCHED_CHAN_WAITING_TO_WRITE.
 *     <li> Transitions from:
 *       <ul>
 *       <li> Not open for writes/no cells by arrival of cells on an attached
 *         circuit
 *       <li> Open for writes/has cells by filling an output buffer without
 *         draining all cells from attached circuits
 *       </ul>
 *     <li> Transitions to:
 *       <ul>
 *       <li> Open for writes/has cells by draining some of the output buffer
 *         via the connection_or_flushed_some() path (for channel_tls_t).
 *       </ul>
 *   </ul>
 *
 * <li> Open for writes, cells to send
 *   <ul>
 *     <li> This connection is ready to relay some cells and waiting for
 *       the scheduler to choose it.  The channel will have scheduler_state ==
 *       SCHED_CHAN_PENDING.
 *     <li> Transitions from:
 *       <ul>
 *       <li> Not open for writes/has cells by the connection_or_flushed_some()
 *         path
 *       <li> Open for writes/no cells by the append_cell_to_circuit_queue()
 *         path
 *       </ul>
 *     <li> Transitions to:
 *       <ul>
 *       <li> Not open for writes/no cells by draining all circuit queues and
 *         simultaneously filling the output buffer.
 *       <li> Not open for writes/has cells by writing enough cells to fill the
 *         output buffer
 *       <li> Open for writes/no cells by draining all attached circuit queues
 *         without also filling the output buffer
 *       </ul>
 *   </ul>
 * </ol>
 *
 * Other event-driven parts of the code move channels between these scheduling
 * states by calling scheduler functions; the scheduler only runs on
 * open-for-writes/has-cells channels and is the only path for those to
 * transition to other states.  The scheduler_run() function gives us the
 * opportunity to do scheduling work, and is called from other scheduler
 * functions whenever a state transition occurs, and periodically from the
 * main event loop.  An illustrative sketch of how callers drive these
 * transitions follows this comment.
 */
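
/*
 * Illustrative only: a hypothetical sketch (not part of Tor) of how other
 * modules drive the state machine described above.  The helper function
 * below is an assumption for illustration; in the real code these calls are
 * made from places such as append_cell_to_circuit_queue() and
 * connection_or_flushed_some(), as noted in the comment above.
 */
#if 0
static void
example_drive_scheduler_states(channel_t *chan)
{
  /* New cells arrived on an attached circuit: the channel now has cells to
   * send, so tell the scheduler (idle -> waiting_to_write, or
   * waiting_for_cells -> pending). */
  scheduler_channel_has_waiting_cells(chan);

  /* The lower layer flushed some of its output buffer and can accept writes
   * again (idle -> waiting_for_cells, or waiting_to_write -> pending). */
  scheduler_channel_wants_writes(chan);

  /* The output buffer filled up; the channel can no longer accept writes
   * (pending -> waiting_to_write, or waiting_for_cells -> idle). */
  scheduler_channel_doesnt_want_writes(chan);

  /* The channel is closing; drop it from the scheduler's pending queue. */
  scheduler_release_channel(chan);
}
#endif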

/* Scheduler global data structures */

/*
 * We keep a list of channels that are pending - i.e., have cells to write
 * and can accept them to send.  The enum scheduler_state in channel_t
 * is reserved for our use.
 */

/* Pqueue of channels that can write and have cells (pending work) */
STATIC smartlist_t *channels_pending = NULL;

/*
 * This event runs the scheduler from its callback, and is manually
 * activated whenever a channel enters open for writes/cells to send.
 */
STATIC struct event *run_sched_ev = NULL;

/*
 * Queue heuristic; this is not the queue size, but an 'effective queuesize'
 * that ages out contributions from stalled channels.
 */
STATIC uint64_t queue_heuristic = 0;

/*
 * Timestamp for last queue heuristic update
 */
STATIC time_t queue_heuristic_timestamp = 0;

/* Scheduler static function declarations */

static void scheduler_evt_callback(evutil_socket_t fd,
                                   short events, void *arg);
static int scheduler_more_work(void);
static void scheduler_retrigger(void);
#if 0
static void scheduler_trigger(void);
#endif

/* Scheduler function implementations */

/** Free everything and shut down the scheduling system */

void
scheduler_free_all(void)
{
  log_debug(LD_SCHED, "Shutting down scheduler");

  if (run_sched_ev) {
    if (event_del(run_sched_ev) < 0) {
      log_warn(LD_BUG, "Problem deleting run_sched_ev");
    }
    tor_event_free(run_sched_ev);
    run_sched_ev = NULL;
  }

  if (channels_pending) {
    smartlist_free(channels_pending);
    channels_pending = NULL;
  }
}

/**
 * Comparison function to use when sorting pending channels
 */

MOCK_IMPL(STATIC int,
scheduler_compare_channels, (const void *c1_v, const void *c2_v))
{
  channel_t *c1 = NULL, *c2 = NULL;
  /* These are a workaround for -Wbad-function-cast throwing a fit */
  const circuitmux_policy_t *p1, *p2;
  uintptr_t p1_i, p2_i;

  tor_assert(c1_v);
  tor_assert(c2_v);

  c1 = (channel_t *)(c1_v);
  c2 = (channel_t *)(c2_v);

  tor_assert(c1);
  tor_assert(c2);

  if (c1 != c2) {
    if (circuitmux_get_policy(c1->cmux) ==
        circuitmux_get_policy(c2->cmux)) {
      /* Same cmux policy, so use the mux comparison */
      return circuitmux_compare_muxes(c1->cmux, c2->cmux);
    } else {
      /*
       * Different policies; not important to get this edge case perfect
       * because the current code never actually gives different channels
       * different cmux policies anyway.  Just use this arbitrary but
       * definite choice.
       */
      p1 = circuitmux_get_policy(c1->cmux);
      p2 = circuitmux_get_policy(c2->cmux);
      p1_i = (uintptr_t)p1;
      p2_i = (uintptr_t)p2;

      return (p1_i < p2_i) ? -1 : 1;
    }
  } else {
    /* c1 == c2, so always equal */
    return 0;
  }
}

/**
 * Scheduler event callback; this should get triggered once per event loop
 * if any scheduling work was created during the event loop.
 */

static void
scheduler_evt_callback(evutil_socket_t fd, short events, void *arg)
{
  (void)fd;
  (void)events;
  (void)arg;
  log_debug(LD_SCHED, "Scheduler event callback called");

  tor_assert(run_sched_ev);

  /* Run the scheduler */
  scheduler_run();

  /* Do we have more work to do? */
  if (scheduler_more_work()) scheduler_retrigger();
}

/** Mark a channel as no longer ready to accept writes */

MOCK_IMPL(void,
scheduler_channel_doesnt_want_writes,(channel_t *chan))
{
  tor_assert(chan);
  tor_assert(channels_pending);

  /* If it's already in pending, we can put it in waiting_to_write */
  if (chan->scheduler_state == SCHED_CHAN_PENDING) {
    /*
     * It's in channels_pending, so it shouldn't be in any of
     * the other lists.  It can't write any more, so it goes to
     * channels_waiting_to_write.
     */
    smartlist_pqueue_remove(channels_pending,
                            scheduler_compare_channels,
                            STRUCT_OFFSET(channel_t, sched_heap_idx),
                            chan);
    chan->scheduler_state = SCHED_CHAN_WAITING_TO_WRITE;
    log_debug(LD_SCHED,
              "Channel " U64_FORMAT " at %p went from pending "
              "to waiting_to_write",
              U64_PRINTF_ARG(chan->global_identifier), chan);
  } else {
    /*
     * It's not in pending, so it can't become waiting_to_write; it's
     * either not in any of the lists (nothing to do) or it's already in
     * waiting_for_cells (remove it, can't write any more).
     */
    if (chan->scheduler_state == SCHED_CHAN_WAITING_FOR_CELLS) {
      chan->scheduler_state = SCHED_CHAN_IDLE;
      log_debug(LD_SCHED,
                "Channel " U64_FORMAT " at %p left waiting_for_cells",
                U64_PRINTF_ARG(chan->global_identifier), chan);
    }
  }
}

/** Mark a channel as having waiting cells */

MOCK_IMPL(void,
scheduler_channel_has_waiting_cells,(channel_t *chan))
{
  int became_pending = 0;

  tor_assert(chan);
  tor_assert(channels_pending);

  /* First, check whether this one is also writeable */
  if (chan->scheduler_state == SCHED_CHAN_WAITING_FOR_CELLS) {
    /*
     * It's in channels_waiting_for_cells, so it shouldn't be in any of
     * the other lists.  It has waiting cells now, so it goes to
     * channels_pending.
     */
    chan->scheduler_state = SCHED_CHAN_PENDING;
    smartlist_pqueue_add(channels_pending,
                         scheduler_compare_channels,
                         STRUCT_OFFSET(channel_t, sched_heap_idx),
                         chan);
    log_debug(LD_SCHED,
              "Channel " U64_FORMAT " at %p went from waiting_for_cells "
              "to pending",
              U64_PRINTF_ARG(chan->global_identifier), chan);
    became_pending = 1;
  } else {
    /*
     * It's not in waiting_for_cells, so it can't become pending; it's
     * either not in any of the lists (we add it to waiting_to_write)
     * or it's already in waiting_to_write or pending (we do nothing)
     */
    if (!(chan->scheduler_state == SCHED_CHAN_WAITING_TO_WRITE ||
          chan->scheduler_state == SCHED_CHAN_PENDING)) {
      chan->scheduler_state = SCHED_CHAN_WAITING_TO_WRITE;
      log_debug(LD_SCHED,
                "Channel " U64_FORMAT " at %p entered waiting_to_write",
                U64_PRINTF_ARG(chan->global_identifier), chan);
    }
  }

  /*
   * If we made a channel pending, we potentially have scheduling work
   * to do.
   */
  if (became_pending) scheduler_retrigger();
}

/** Set up the scheduling system */

void
scheduler_init(void)
{
  log_debug(LD_SCHED, "Initting scheduler");

  tor_assert(!run_sched_ev);
  run_sched_ev = tor_event_new(tor_libevent_get_base(), -1,
                               0, scheduler_evt_callback, NULL);

  channels_pending = smartlist_new();
  queue_heuristic = 0;
  queue_heuristic_timestamp = approx_time();
}

/** Check if there's more scheduling work */

static int
scheduler_more_work(void)
{
  tor_assert(channels_pending);

  return ((scheduler_get_queue_heuristic() < sched_q_low_water) &&
          ((smartlist_len(channels_pending) > 0))) ? 1 : 0;
}

/** Retrigger the scheduler in a way safe to use from the callback */

static void
scheduler_retrigger(void)
{
  tor_assert(run_sched_ev);
  event_active(run_sched_ev, EV_TIMEOUT, 1);
}

/** Notify the scheduler of a channel being closed */

MOCK_IMPL(void,
scheduler_release_channel,(channel_t *chan))
{
  tor_assert(chan);
  tor_assert(channels_pending);

  if (chan->scheduler_state == SCHED_CHAN_PENDING) {
    smartlist_pqueue_remove(channels_pending,
                            scheduler_compare_channels,
                            STRUCT_OFFSET(channel_t, sched_heap_idx),
                            chan);
  }

  chan->scheduler_state = SCHED_CHAN_IDLE;
}

/** Run the scheduling algorithm if necessary */

MOCK_IMPL(void,
scheduler_run, (void))
{
  int n_cells, n_chans_before, n_chans_after;
  uint64_t q_len_before, q_heur_before, q_len_after, q_heur_after;
  ssize_t flushed, flushed_this_time;
  smartlist_t *to_readd = NULL;
  channel_t *chan = NULL;

  log_debug(LD_SCHED, "We have a chance to run the scheduler");

  if (scheduler_get_queue_heuristic() < sched_q_low_water) {
    n_chans_before = smartlist_len(channels_pending);
    q_len_before = channel_get_global_queue_estimate();
    q_heur_before = scheduler_get_queue_heuristic();

    while (scheduler_get_queue_heuristic() <= sched_q_high_water &&
           smartlist_len(channels_pending) > 0) {
      /* Pop off a channel */
      chan = smartlist_pqueue_pop(channels_pending,
                                  scheduler_compare_channels,
                                  STRUCT_OFFSET(channel_t, sched_heap_idx));
      tor_assert(chan);

      /* Figure out how many cells we can write */
      n_cells = channel_num_cells_writeable(chan);
      if (n_cells > 0) {
        log_debug(LD_SCHED,
                  "Scheduler saw pending channel " U64_FORMAT " at %p with "
                  "%d cells writeable",
                  U64_PRINTF_ARG(chan->global_identifier), chan, n_cells);

        flushed = 0;
        while (flushed < n_cells &&
               scheduler_get_queue_heuristic() <= sched_q_high_water) {
          flushed_this_time =
            channel_flush_some_cells(chan,
                                     MIN(sched_max_flush_cells,
                                         (size_t) n_cells - flushed));
          if (flushed_this_time <= 0) break;
          flushed += flushed_this_time;
        }

        if (flushed < n_cells) {
          /* We ran out of cells to flush */
          chan->scheduler_state = SCHED_CHAN_WAITING_FOR_CELLS;
          log_debug(LD_SCHED,
                    "Channel " U64_FORMAT " at %p "
                    "entered waiting_for_cells from pending",
                    U64_PRINTF_ARG(chan->global_identifier),
                    chan);
        } else {
          /* The channel may still have some cells */
          if (channel_more_to_flush(chan)) {
            /* The channel goes to either pending or waiting_to_write */
            if (channel_num_cells_writeable(chan) > 0) {
              /* Add it back to pending later */
              if (!to_readd) to_readd = smartlist_new();
              smartlist_add(to_readd, chan);
              log_debug(LD_SCHED,
                        "Channel " U64_FORMAT " at %p "
                        "is still pending",
                        U64_PRINTF_ARG(chan->global_identifier),
                        chan);
            } else {
              /* It's waiting to be able to write more */
              chan->scheduler_state = SCHED_CHAN_WAITING_TO_WRITE;
              log_debug(LD_SCHED,
                        "Channel " U64_FORMAT " at %p "
                        "entered waiting_to_write from pending",
                        U64_PRINTF_ARG(chan->global_identifier),
                        chan);
            }
          } else {
            /* No cells left; it can go to idle or waiting_for_cells */
            if (channel_num_cells_writeable(chan) > 0) {
              /*
               * It can still accept writes, so it goes to
               * waiting_for_cells
               */
              chan->scheduler_state = SCHED_CHAN_WAITING_FOR_CELLS;
              log_debug(LD_SCHED,
                        "Channel " U64_FORMAT " at %p "
                        "entered waiting_for_cells from pending",
                        U64_PRINTF_ARG(chan->global_identifier),
                        chan);
            } else {
              /*
               * We exactly filled up the output queue with all available
               * cells; go to idle.
               */
              chan->scheduler_state = SCHED_CHAN_IDLE;
              log_debug(LD_SCHED,
                        "Channel " U64_FORMAT " at %p "
                        "became idle from pending",
                        U64_PRINTF_ARG(chan->global_identifier),
                        chan);
            }
          }
        }

        log_debug(LD_SCHED,
                  "Scheduler flushed %d cells onto pending channel "
                  U64_FORMAT " at %p",
                  (int)flushed, U64_PRINTF_ARG(chan->global_identifier),
                  chan);
      } else {
        log_info(LD_SCHED,
                 "Scheduler saw pending channel " U64_FORMAT " at %p with "
                 "no cells writeable",
                 U64_PRINTF_ARG(chan->global_identifier), chan);
        /* Put it back to WAITING_TO_WRITE */
        chan->scheduler_state = SCHED_CHAN_WAITING_TO_WRITE;
      }
    }

    /* Readd any channels we need to */
    if (to_readd) {
      SMARTLIST_FOREACH_BEGIN(to_readd, channel_t *, readd_chan) {
        readd_chan->scheduler_state = SCHED_CHAN_PENDING;
        smartlist_pqueue_add(channels_pending,
                             scheduler_compare_channels,
                             STRUCT_OFFSET(channel_t, sched_heap_idx),
                             readd_chan);
      } SMARTLIST_FOREACH_END(readd_chan);
      smartlist_free(to_readd);
    }

    n_chans_after = smartlist_len(channels_pending);
    q_len_after = channel_get_global_queue_estimate();
    q_heur_after = scheduler_get_queue_heuristic();
    log_debug(LD_SCHED,
              "Scheduler handled %d of %d pending channels, queue size from "
              U64_FORMAT " to " U64_FORMAT ", queue heuristic from "
              U64_FORMAT " to " U64_FORMAT,
              n_chans_before - n_chans_after, n_chans_before,
              U64_PRINTF_ARG(q_len_before), U64_PRINTF_ARG(q_len_after),
              U64_PRINTF_ARG(q_heur_before), U64_PRINTF_ARG(q_heur_after));
  }
}

/** Trigger the scheduling event so we run the scheduler later */

#if 0
static void
scheduler_trigger(void)
{
  log_debug(LD_SCHED, "Triggering scheduler event");

  tor_assert(run_sched_ev);

  event_add(run_sched_ev, EV_TIMEOUT, 1);
}
#endif

/** Mark a channel as ready to accept writes */

void
scheduler_channel_wants_writes(channel_t *chan)
{
  int became_pending = 0;

  tor_assert(chan);
  tor_assert(channels_pending);

  /* If it's already in waiting_to_write, we can put it in pending */
  if (chan->scheduler_state == SCHED_CHAN_WAITING_TO_WRITE) {
    /*
     * It can write now, so it goes to channels_pending.
     */
    smartlist_pqueue_add(channels_pending,
                         scheduler_compare_channels,
                         STRUCT_OFFSET(channel_t, sched_heap_idx),
                         chan);
    chan->scheduler_state = SCHED_CHAN_PENDING;
    log_debug(LD_SCHED,
              "Channel " U64_FORMAT " at %p went from waiting_to_write "
              "to pending",
              U64_PRINTF_ARG(chan->global_identifier), chan);
    became_pending = 1;
  } else {
    /*
     * It's not in SCHED_CHAN_WAITING_TO_WRITE, so it can't become pending;
     * it's either idle and goes to WAITING_FOR_CELLS, or it's a no-op.
     */
    if (!(chan->scheduler_state == SCHED_CHAN_WAITING_FOR_CELLS ||
          chan->scheduler_state == SCHED_CHAN_PENDING)) {
      chan->scheduler_state = SCHED_CHAN_WAITING_FOR_CELLS;
      log_debug(LD_SCHED,
                "Channel " U64_FORMAT " at %p entered waiting_for_cells",
                U64_PRINTF_ARG(chan->global_identifier), chan);
    }
  }

  /*
   * If we made a channel pending, we potentially have scheduling work
   * to do.
   */
  if (became_pending) scheduler_retrigger();
}

/**
 * Notify the scheduler that a channel's position in the pqueue may have
 * changed
 */

void
scheduler_touch_channel(channel_t *chan)
{
  tor_assert(chan);

  if (chan->scheduler_state == SCHED_CHAN_PENDING) {
    /* Remove and re-add it */
    smartlist_pqueue_remove(channels_pending,
                            scheduler_compare_channels,
                            STRUCT_OFFSET(channel_t, sched_heap_idx),
                            chan);
    smartlist_pqueue_add(channels_pending,
                         scheduler_compare_channels,
                         STRUCT_OFFSET(channel_t, sched_heap_idx),
                         chan);
  }
  /* else no-op, since it isn't in the queue */
}
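
/*
 * Illustrative only: a hypothetical sketch (not part of Tor) of when a
 * caller might use scheduler_touch_channel().  The helper name is an
 * assumption; the point is that anything which changes how
 * scheduler_compare_channels() would order a channel (e.g. the contents or
 * policy of its circuitmux) should re-heap it while it is pending.
 */
#if 0
static void
example_after_cmux_change(channel_t *chan)
{
  /* The channel's circuitmux just changed in a way that can affect its
   * priority; ask the scheduler to remove and re-add it so the pqueue
   * ordering stays consistent. */
  scheduler_touch_channel(chan);
}
#endif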

/**
 * Notify the scheduler of a queue size adjustment, to recalculate the
 * queue heuristic.
 */

void
scheduler_adjust_queue_size(channel_t *chan, int dir, uint64_t adj)
{
  time_t now = approx_time();

  log_debug(LD_SCHED,
            "Queue size adjustment by %s" U64_FORMAT " for channel "
            U64_FORMAT,
            (dir >= 0) ? "+" : "-",
            U64_PRINTF_ARG(adj),
            U64_PRINTF_ARG(chan->global_identifier));

  /* Get the queue heuristic up to date */
  scheduler_update_queue_heuristic(now);

  /* Adjust as appropriate */
  if (dir >= 0) {
    /* Increasing it */
    queue_heuristic += adj;
  } else {
    /* Decreasing it */
    if (queue_heuristic > adj) queue_heuristic -= adj;
    else queue_heuristic = 0;
  }

  log_debug(LD_SCHED,
            "Queue heuristic is now " U64_FORMAT,
            U64_PRINTF_ARG(queue_heuristic));
}

/**
 * Query the current value of the queue heuristic
 */

STATIC uint64_t
scheduler_get_queue_heuristic(void)
{
  time_t now = approx_time();

  scheduler_update_queue_heuristic(now);

  return queue_heuristic;
}

/**
 * Adjust the queue heuristic value to the present time
 */

STATIC void
scheduler_update_queue_heuristic(time_t now)
{
  time_t diff;

  if (queue_heuristic_timestamp == 0) {
    /*
     * Nothing we can sensibly do; must not have been initted properly.
     * Oh well.
     */
    queue_heuristic_timestamp = now;
  } else if (queue_heuristic_timestamp < now) {
    diff = now - queue_heuristic_timestamp;
    /*
     * This is a simple exponential age-out; the other proposed alternative
     * was a linear age-out using the bandwidth history in rephist.c; I'm
     * going with this out of concern that if an adversary can jam the
     * scheduler long enough, it would cause the bandwidth to drop to
     * zero and render the aging mechanism ineffective thereafter.
     */
    if (0 <= diff && diff < 64) queue_heuristic >>= diff;
    else queue_heuristic = 0;

    queue_heuristic_timestamp = now;

    log_debug(LD_SCHED,
              "Queue heuristic is now " U64_FORMAT,
              U64_PRINTF_ARG(queue_heuristic));
  }
  /* else no update needed, or time went backward */
}
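
/*
 * Illustrative only: a hypothetical sketch (not part of Tor) of the
 * exponential age-out above.  Because the heuristic is right-shifted by the
 * number of elapsed seconds, it halves once per second: for example, a value
 * of 32768 decays to 16384 after 1 second, to 1024 after 5 seconds, and is
 * treated as 0 once 64 or more seconds have passed.  The helper name is an
 * assumption for illustration.
 */
#if 0
static uint64_t
example_decayed_heuristic(uint64_t value, time_t seconds_elapsed)
{
  /* Mirrors the shift in scheduler_update_queue_heuristic(); for
   * illustration only. */
  if (seconds_elapsed >= 0 && seconds_elapsed < 64)
    return value >> seconds_elapsed;
  else
    return 0;
}
#endif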

/**
 * Set scheduler watermarks and flush size
 */

void
scheduler_set_watermarks(uint32_t lo, uint32_t hi, uint32_t max_flush)
{
  /* Sanity assertions - caller should ensure these are true */
  tor_assert(lo > 0);
  tor_assert(hi > lo);
  tor_assert(max_flush > 0);

  sched_q_low_water = lo;
  sched_q_high_water = hi;
  sched_max_flush_cells = max_flush;
}
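
/*
 * Illustrative only: a hypothetical sketch (not part of Tor) of how a caller
 * might configure the scheduler.  The values shown simply restate the
 * defaults declared at the top of this file; real callers and option names
 * are not assumed here.
 */
#if 0
static void
example_configure_scheduler(void)
{
  /* Low water 16384 cells, high water 32768 cells, flush at most 16 cells
   * per channel_flush_some_cells() call. */
  scheduler_set_watermarks(16384, 32768, 16);
}
#endif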