1 /* Copyright (c) 2021, The Tor Project, Inc. */
2 /* See LICENSE for licensing information */
5 * \file congestion_control_common.c
6 * \brief Common code used by all congestion control algorithms.
9 #define TOR_CONGESTION_CONTROL_COMMON_PRIVATE
11 #include "core/or/or.h"
13 #include "core/crypto/onion_crypto.h"
14 #include "core/or/circuitlist.h"
15 #include "core/or/crypt_path.h"
16 #include "core/or/or_circuit_st.h"
17 #include "core/or/origin_circuit_st.h"
18 #include "core/or/channel.h"
19 #include "core/mainloop/connection.h"
20 #include "core/or/sendme.h"
21 #include "core/or/congestion_control_common.h"
22 #include "core/or/congestion_control_vegas.h"
23 #include "core/or/congestion_control_nola.h"
24 #include "core/or/congestion_control_westwood.h"
25 #include "core/or/congestion_control_st.h"
26 #include "core/or/trace_probes_cc.h"
27 #include "lib/time/compat_time.h"
28 #include "feature/nodelist/networkstatus.h"
29 #include "app/config/config.h"
31 #include "trunnel/congestion_control.h"
32 #include "trunnel/extension.h"
34 /* Consensus parameter defaults.
36 * More details for each of the parameters can be found in proposal 324,
37 * section 6.5 including tuning notes. */
38 #define CIRCWINDOW_INIT (500)
39 #define SENDME_INC_DFLT (50)
40 #define CC_ALG_DFLT (CC_ALG_SENDME)
41 #define CC_ALG_DFLT_ALWAYS (CC_ALG_VEGAS)
43 #define CWND_INC_DFLT (50)
44 #define CWND_INC_PCT_SS_DFLT (100)
45 #define CWND_INC_RATE_DFLT (1)
46 #define CWND_MAX_DFLT (INT32_MAX)
47 #define CWND_MIN_DFLT (MAX(100, SENDME_INC_DFLT))
49 #define BWE_SENDME_MIN_DFLT (5)
50 #define EWMA_CWND_COUNT_DFLT (2)
52 /* The BDP algorithm for each congestion control algorithm uses the piecewise
53 * estimator. See section 3.1.4 of proposal 324. */
54 #define WESTWOOD_BDP_ALG BDP_ALG_PIECEWISE
55 #define VEGAS_BDP_MIX_ALG BDP_ALG_PIECEWISE
56 #define NOLA_BDP_ALG BDP_ALG_PIECEWISE
58 /* Indicate OR connection buffer limitations used to stop or start accepting
59 * cells in its outbuf.
61 * These watermarks are historical to tor in a sense that they've been used
62 * almost from the genesis point. And were likely defined to fit the bounds of
63 * TLS records of 16KB which would be around 32 cells.
65 * These are defaults of the consensus parameter "orconn_high" and "orconn_low"
67 #define OR_CONN_HIGHWATER_DFLT (32*1024)
68 #define OR_CONN_LOWWATER_DFLT (16*1024)
70 /* Low and high values of circuit cell queue sizes. They are used to tell when
71 * to start or stop reading on the streams attached on the circuit.
73 * These are defaults of the consensus parameters "cellq_high" and "cellq_low".
75 #define CELL_QUEUE_LOW_DFLT (10)
76 #define CELL_QUEUE_HIGH_DFLT (256)
/* Forward declarations of the static helpers below.
 * NOTE(review): this extracted text appears to have lost interior lines
 * (the trailing parameters of these prototypes are missing); verify
 * against the canonical file before compiling. */
78 static uint64_t congestion_control_update_circuit_rtt(congestion_control_t
*,
80 static bool congestion_control_update_circuit_bdp(congestion_control_t
*,
85 void congestion_control_set_cc_enabled(void);
87 /* Consensus parameters cached. The non static ones are extern. */
/* Hard cap applied to every cwnd in congestion_control_dispatch_cc_alg(). */
88 static uint32_t cwnd_max
= CWND_MAX_DFLT
;
/* Circuit cell queue thresholds used to stop/start reading on streams. */
89 int32_t cell_queue_high
= CELL_QUEUE_HIGH_DFLT
;
90 int32_t cell_queue_low
= CELL_QUEUE_LOW_DFLT
;
/* OR connection outbuf watermarks ("orconn_high"/"orconn_low"). */
91 uint32_t or_conn_highwater
= OR_CONN_HIGHWATER_DFLT
;
92 uint32_t or_conn_lowwater
= OR_CONN_LOWWATER_DFLT
;
/* Cells acked per SENDME, from the "cc_sendme_inc" consensus param. */
93 uint8_t cc_sendme_inc
= SENDME_INC_DFLT
;
/* Cached consensus choice of congestion control algorithm ("cc_alg"). */
94 static cc_alg_t cc_alg
= CC_ALG_DFLT
;
97 * Update global congestion control related consensus parameter values,
98 * every consensus update.
/* NOTE(review): several lines of this function were lost in extraction —
 * most networkstatus_get_param() calls below are missing their minimum
 * bound and some are missing the assignment target; restore from the
 * canonical file before building. */
101 congestion_control_new_consensus_params(const networkstatus_t
*ns
)
103 #define CELL_QUEUE_HIGH_MIN (1)
104 #define CELL_QUEUE_HIGH_MAX (1000)
/* Cache "cellq_high": cell queue level at which we stop reading streams. */
105 cell_queue_high
= networkstatus_get_param(ns
, "cellq_high",
106 CELL_QUEUE_HIGH_DFLT
,
108 CELL_QUEUE_HIGH_MAX
);
110 #define CELL_QUEUE_LOW_MIN (1)
111 #define CELL_QUEUE_LOW_MAX (1000)
/* Cache "cellq_low": level at which we resume reading streams. */
112 cell_queue_low
= networkstatus_get_param(ns
, "cellq_low",
117 #define OR_CONN_HIGHWATER_MIN (CELL_PAYLOAD_SIZE)
118 #define OR_CONN_HIGHWATER_MAX (INT32_MAX)
/* Cache "orconn_high": OR conn outbuf highwater mark. */
120 networkstatus_get_param(ns
, "orconn_high",
121 OR_CONN_HIGHWATER_DFLT
,
122 OR_CONN_HIGHWATER_MIN
,
123 OR_CONN_HIGHWATER_MAX
);
125 #define OR_CONN_LOWWATER_MIN (CELL_PAYLOAD_SIZE)
126 #define OR_CONN_LOWWATER_MAX (INT32_MAX)
/* Cache "orconn_low": OR conn outbuf lowwater mark. */
128 networkstatus_get_param(ns
, "orconn_low",
129 OR_CONN_LOWWATER_DFLT
,
130 OR_CONN_LOWWATER_MIN
,
131 OR_CONN_LOWWATER_MAX
);
133 #define CWND_MAX_MIN 500
134 #define CWND_MAX_MAX (INT32_MAX)
/* Cache "cc_cwnd_max": global cap on congestion windows. */
136 networkstatus_get_param(NULL
, "cc_cwnd_max",
141 #define SENDME_INC_MIN 10
142 #define SENDME_INC_MAX (1000)
/* Cache "cc_sendme_inc": cells acked per SENDME. */
144 networkstatus_get_param(NULL
, "cc_sendme_inc",
150 #define CC_ALG_MAX (NUM_CC_ALGS-1)
/* Cache "cc_alg": which congestion control algorithm to negotiate. */
152 networkstatus_get_param(NULL
, "cc_alg",
159 * Set congestion control parameters on a circuit's congestion
160 * control object based on values from the consensus.
162 * cc_alg is the negotiated congestion control algorithm.
164 * sendme_inc is the number of packaged cells that a sendme cell
165 * acks. This parameter will come from circuit negotiation.
/* NOTE(review): interior lines are missing from this extracted copy —
 * several networkstatus_get_param() calls lack their assignment target
 * and min argument, and the switch below is missing case labels and
 * break statements. Restore from the canonical file. */
168 congestion_control_init_params(congestion_control_t
*cc
,
169 const circuit_params_t
*params
)
171 const or_options_t
*opts
= get_options();
/* sendme_inc comes from circuit negotiation, not the consensus. */
172 cc
->sendme_inc
= params
->sendme_inc_cells
;
174 #define CWND_INIT_MIN 100
175 #define CWND_INIT_MAX (10000)
/* "cc_cwnd_init": initial congestion window. */
177 networkstatus_get_param(NULL
, "cc_cwnd_init",
182 #define CWND_INC_PCT_SS_MIN 1
183 #define CWND_INC_PCT_SS_MAX (500)
/* "cc_cwnd_inc_pct_ss": percent cwnd growth per ack during slow start. */
184 cc
->cwnd_inc_pct_ss
=
185 networkstatus_get_param(NULL
, "cc_cwnd_inc_pct_ss",
186 CWND_INC_PCT_SS_DFLT
,
188 CWND_INC_PCT_SS_MAX
);
190 #define CWND_INC_MIN 1
191 #define CWND_INC_MAX (1000)
/* "cc_cwnd_inc": steady-state cwnd increment. */
193 networkstatus_get_param(NULL
, "cc_cwnd_inc",
198 #define CWND_INC_RATE_MIN 1
199 #define CWND_INC_RATE_MAX (250)
/* "cc_cwnd_inc_rate": how often (in SENDMEs) the cwnd is updated. */
201 networkstatus_get_param(NULL
, "cc_cwnd_inc_rate",
206 #define CWND_MIN_MIN 20
207 #define CWND_MIN_MAX (1000)
/* "cc_cwnd_min": floor for the congestion window. */
209 networkstatus_get_param(NULL
, "cc_cwnd_min",
214 #define EWMA_CWND_COUNT_MIN 1
215 #define EWMA_CWND_COUNT_MAX (100)
/* "cc_ewma_cwnd_cnt": number of cwnds worth of acks in the RTT EWMA. */
217 networkstatus_get_param(NULL
, "cc_ewma_cwnd_cnt",
218 EWMA_CWND_COUNT_DFLT
,
220 EWMA_CWND_COUNT_MAX
);
222 #define BWE_SENDME_MIN_MIN 2
223 #define BWE_SENDME_MIN_MAX (20)
/* "cc_bwe_min": minimum SENDMEs needed for a bandwidth estimate. */
225 networkstatus_get_param(NULL
, "cc_bwe_min",
230 /* If the consensus says to use OG sendme, but torrc has
231 * always-enabled, use the default "always" alg (vegas),
232 * else use cached consensus alg. */
233 if (cc_alg
== CC_ALG_SENDME
&& opts
->AlwaysCongestionControl
) {
234 cc
->cc_alg
= CC_ALG_DFLT_ALWAYS
;
/* Pick the default BDP estimator for the negotiated algorithm. */
239 bdp_alg_t default_bdp_alg
= 0;
241 switch (cc
->cc_alg
) {
242 case CC_ALG_WESTWOOD
:
243 default_bdp_alg
= WESTWOOD_BDP_ALG
;
246 default_bdp_alg
= VEGAS_BDP_MIX_ALG
;
249 default_bdp_alg
= NOLA_BDP_ALG
;
/* Unknown algorithm: soft-assert and skip alg-specific params. */
253 tor_fragile_assert();
254 return; // No alg-specific params
/* "cc_bdp_alg": consensus override of the default BDP estimator. */
258 networkstatus_get_param(NULL
, "cc_bdp_alg",
263 /* Algorithm-specific parameters */
264 if (cc
->cc_alg
== CC_ALG_WESTWOOD
) {
265 congestion_control_westwood_set_params(cc
);
266 } else if (cc
->cc_alg
== CC_ALG_VEGAS
) {
267 congestion_control_vegas_set_params(cc
);
268 } else if (cc
->cc_alg
== CC_ALG_NOLA
) {
269 congestion_control_nola_set_params(cc
);
273 /** Returns true if congestion control is enabled in the most recent
274 * consensus, or if __AlwaysCongestionControl is set to true.
276 * Note that this function (and many many other functions) should not
277 * be called from the CPU worker threads when handling congestion
278 * control negotiation. Relevant values are marshaled into the
279 * `circuit_params_t` struct, in order to be used in worker threads
280 * without touching global state. Use those values in CPU worker
281 * threads, instead of calling this function.
283 * The danger is still present, in your time, as it was in ours.
/* NOTE(review): the early-return for the AlwaysCongestionControl branch
 * appears to have been lost in extraction; the if below has no body here. */
286 congestion_control_enabled(void)
288 const or_options_t
*opts
= NULL
;
/* Reads cached global state; must only run on the main thread. */
290 tor_assert_nonfatal_once(in_main_thread());
292 opts
= get_options();
294 /* If the user has set "__AlwaysCongestionControl",
295 * then always try to negotiate congestion control, regardless
296 * of consensus param. This is to be used for testing and sbws.
298 * Note that we do *not* allow disabling congestion control
299 * if the consensus says to use it, as this is bad for queueing
301 if (opts
->AlwaysCongestionControl
)
/* Otherwise: enabled iff the consensus picked anything but legacy SENDME. */
304 return cc_alg
!= CC_ALG_SENDME
;
308 * For unit tests only: set the cached consensus cc alg to
/* Forces the cached consensus algorithm to Vegas so that
 * congestion_control_enabled() reports true in tests. */
312 congestion_control_set_cc_enabled(void)
314 cc_alg
= CC_ALG_VEGAS
;
318 * Allocate and initialize fields in congestion control object.
320 * cc_alg is the negotiated congestion control algorithm.
322 * sendme_inc is the number of packaged cells that a sendme cell
323 * acks. This parameter will come from circuit negotiation.
326 congestion_control_init(congestion_control_t
*cc
,
327 const circuit_params_t
*params
)
/* Timestamp queues used for RTT (pending) and BDP (arrival) estimation. */
329 cc
->sendme_pending_timestamps
= smartlist_new();
330 cc
->sendme_arrival_timestamps
= smartlist_new();
/* New circuits always begin in slow start. */
332 cc
->in_slow_start
= 1;
333 congestion_control_init_params(cc
, params
);
/* Must run after init_params(): CWND_UPDATE_RATE reads fields set there. */
335 cc
->next_cc_event
= CWND_UPDATE_RATE(cc
);
338 /** Allocate and initialize a new congestion control object */
/* NOTE(review): the final `return cc;` appears to have been lost in
 * extraction; verify against the canonical file. */
339 congestion_control_t
*
340 congestion_control_new(const circuit_params_t
*params
)
342 congestion_control_t
*cc
= tor_malloc_zero(sizeof(congestion_control_t
));
344 congestion_control_init(cc
, params
);
350 * Free a congestion control object and its associated state.
/* NOTE(review): the extracted text is missing this function's usual
 * NULL-guard and the final free of the object itself; only the timestamp
 * list teardown is visible here. */
353 congestion_control_free_(congestion_control_t
*cc
)
/* Free each heap-allocated timestamp, then the lists that held them. */
358 SMARTLIST_FOREACH(cc
->sendme_pending_timestamps
, uint64_t *, t
, tor_free(t
));
359 SMARTLIST_FOREACH(cc
->sendme_arrival_timestamps
, uint64_t *, t
, tor_free(t
));
360 smartlist_free(cc
->sendme_pending_timestamps
);
361 smartlist_free(cc
->sendme_arrival_timestamps
);
367 * Enqueue a u64 timestamp to the end of a queue of timestamps.
370 enqueue_timestamp(smartlist_t
*timestamps_u64
, uint64_t timestamp_usec
)
372 uint64_t *timestamp_ptr
= tor_malloc(sizeof(uint64_t));
373 *timestamp_ptr
= timestamp_usec
;
375 smartlist_add(timestamps_u64
, timestamp_ptr
);
379 * Peek at the head of a smartlist queue of u64 timestamps.
/* NOTE(review): the `return 0;` inside the BUG branch appears to have
 * been lost in extraction; as shown, the branch falls through. */
381 static inline uint64_t
382 peek_timestamp(const smartlist_t
*timestamps_u64_usecs
)
/* Head of the queue is index 0; entries are heap-allocated uint64_t. */
384 uint64_t *timestamp_ptr
= smartlist_get(timestamps_u64_usecs
, 0);
386 if (BUG(!timestamp_ptr
)) {
387 log_err(LD_CIRC
, "Congestion control timestamp list became empty!");
/* Return the value without removing it from the queue. */
391 return *timestamp_ptr
;
395 * Dequeue a u64 monotime usec timestamp from the front of a
396 * smartlist of pointers to 64.
/* NOTE(review): the `return 0;` inside the BUG branch appears to have
 * been lost in extraction. */
398 static inline uint64_t
399 dequeue_timestamp(smartlist_t
*timestamps_u64_usecs
)
401 uint64_t *timestamp_ptr
= smartlist_get(timestamps_u64_usecs
, 0);
402 uint64_t timestamp_u64
;
404 if (BUG(!timestamp_ptr
)) {
405 log_err(LD_CIRC
, "Congestion control timestamp list became empty!");
/* Copy the value out, drop the head entry (preserving order), and free
 * the heap allocation made by enqueue_timestamp(). */
409 timestamp_u64
= *timestamp_ptr
;
410 smartlist_del_keeporder(timestamps_u64_usecs
, 0);
411 tor_free(timestamp_ptr
);
413 return timestamp_u64
;
417 * Returns the number of sendme acks that will be recieved in the
418 * current congestion window size, rounded to nearest int.
420 static inline uint64_t
421 sendme_acks_per_cwnd(const congestion_control_t
*cc
)
423 /* We add half a sendme_inc to cwnd to round to the nearest int */
424 return ((cc
->cwnd
+ cc
->sendme_inc
/2)/cc
->sendme_inc
);
428 * Get a package window from either old sendme logic, or congestion control.
430 * A package window is how many cells you can still send.
/* NOTE(review): this extracted copy is missing the declaration of
 * `package_window`, the if/else scaffolding that selects the cpath vs
 * circ fields, and the return statements of the clamping branches. */
433 congestion_control_get_package_window(const circuit_t
*circ
,
434 const crypt_path_t
*cpath
)
437 congestion_control_t
*cc
;
/* Origin-side: read window and cc object from the cpath hop. */
442 package_window
= cpath
->package_window
;
443 cc
= cpath
->ccontrol
;
/* Otherwise: use the circuit-level window. */
445 package_window
= circ
->package_window
;
/* Legacy SENDME flow control: the stored window is the answer. */
450 return package_window
;
452 /* Inflight can be above cwnd if cwnd was just reduced */
453 if (cc
->inflight
> cc
->cwnd
)
455 /* In the extremely unlikely event that cwnd-inflight is larger than
456 * INT32_MAX, just return that cap, so old code doesn't explode. */
457 else if (cc
->cwnd
- cc
->inflight
> INT32_MAX
)
/* Congestion control: remaining budget is cwnd minus inflight. */
460 return (int)(cc
->cwnd
- cc
->inflight
);
465 * Returns the number of cells that are acked by every sendme.
/* NOTE(review): the conditionals guarding the two assignments below were
 * lost in extraction (only the assignments remain visible). */
468 sendme_get_inc_count(const circuit_t
*circ
, const crypt_path_t
*layer_hint
)
/* Default to the legacy fixed increment when no cc object is present. */
470 int sendme_inc
= CIRCWINDOW_INCREMENT
;
471 congestion_control_t
*cc
= NULL
;
474 cc
= layer_hint
->ccontrol
;
/* With congestion control, use the negotiated per-circuit increment. */
480 sendme_inc
= cc
->sendme_inc
;
486 /** Return true iff the next cell we send will result in the other endpoint
489 * We are able to know that because the package or inflight window value minus
490 * one cell (the possible SENDME cell) should be a multiple of the
491 * cells-per-sendme increment value (set via consensus parameter, negotiated
492 * for the circuit, and passed in as sendme_inc).
494 * This function is used when recording a cell digest and this is done quite
495 * low in the stack when decrypting or encrypting a cell. The window is only
496 * updated once the cell is actually put in the outbuf.
/* NOTE(review): this extracted copy is missing the `window` declaration,
 * the branch scaffolding, and the return statements. */
499 circuit_sent_cell_for_sendme(const circuit_t
*circ
,
500 const crypt_path_t
*layer_hint
)
502 congestion_control_t
*cc
;
/* Origin-side: take window and cc from the cpath hop... */
508 window
= layer_hint
->package_window
;
509 cc
= layer_hint
->ccontrol
;
/* ...otherwise from the circuit itself. */
511 window
= circ
->package_window
;
515 /* If we are using congestion control and the alg is not
516 * old-school 'fixed', then use cc->inflight to determine
517 * when sendmes will be sent */
522 /* This check must be +1 because this function is called *before*
523 * inflight is incremented for the sent cell */
524 if ((cc
->inflight
+1) % cc
->sendme_inc
!= 0)
530 /* At the start of the window, no SENDME will be expected. */
531 if (window
== CIRCWINDOW_START
) {
535 /* Are we at the limit of the increment and if not, we don't expect next
538 * We test against the window minus 1 because when we are looking if the
539 * next cell is a SENDME, the window (either package or deliver) hasn't been
540 * decremented just yet so when this is called, we are currently processing
541 * the "window - 1" cell.
543 if (((window
- 1) % CIRCWINDOW_INCREMENT
) != 0) {
547 /* Next cell is expected to be a SENDME. */
552 * Call-in to tell congestion control code that this circuit sent a cell.
554 * This updates the 'inflight' counter, and if this is a cell that will
555 * cause the other end to send a SENDME, record the current time in a list
556 * of pending timestamps, so that we can later compute the circuit RTT when
557 * the SENDME comes back. */
/* NOTE(review): the inflight increment and the early-return body of the
 * if below were lost in extraction. */
559 congestion_control_note_cell_sent(congestion_control_t
*cc
,
560 const circuit_t
*circ
,
561 const crypt_path_t
*cpath
)
566 /* Is this the last cell before a SENDME? The idea is that if the
567 * package_window reaches a multiple of the increment, after this cell, we
568 * should expect a SENDME. Note that this function must be called *before*
569 * we account for the sent cell. */
570 if (!circuit_sent_cell_for_sendme(circ
, cpath
)) {
577 /* Record this cell time for RTT computation when SENDME arrives */
578 enqueue_timestamp(cc
->sendme_pending_timestamps
,
579 monotime_absolute_usec());
583 * Returns true if any edge connections are active.
585 * We need to know this so that we can stop computing BDP if the
586 * edges are not sending on the circuit.
/* NOTE(review): the extracted copy is missing the `return 1`/`continue`
 * statements inside the loop and the final `return 0`. */
589 circuit_has_active_streams(const circuit_t
*circ
,
590 const crypt_path_t
*layer_hint
)
592 const edge_connection_t
*streams
;
/* Pick the stream list that matches the circuit's direction. */
594 if (CIRCUIT_IS_ORIGIN(circ
)) {
595 streams
= CONST_TO_ORIGIN_CIRCUIT(circ
)->p_streams
;
597 streams
= CONST_TO_OR_CIRCUIT(circ
)->n_streams
;
600 /* Check linked list of streams */
601 for (const edge_connection_t
*conn
= streams
; conn
!= NULL
;
602 conn
= conn
->next_stream
) {
/* Closed streams cannot supply more data. */
603 if (conn
->base_
.marked_for_close
)
/* Only consider streams attached to the hop we care about (if any). */
606 if (!layer_hint
|| conn
->cpath_layer
== layer_hint
) {
607 if (connection_get_inbuf_len(TO_CONN(conn
)) > 0) {
608 log_info(LD_CIRC
, "CC: More in edge inbuf...");
612 /* If we did not reach EOF on this read, there's more */
613 if (!TO_CONN(conn
)->inbuf_reached_eof
) {
614 log_info(LD_CIRC
, "CC: More on edge conn...");
618 if (TO_CONN(conn
)->linked_conn
) {
619 if (connection_get_inbuf_len(TO_CONN(conn
)->linked_conn
) > 0) {
620 log_info(LD_CIRC
, "CC: More in linked inbuf...");
624 /* If there is a linked conn, and *it* did not reach EOF,
626 if (!TO_CONN(conn
)->linked_conn
->inbuf_reached_eof
) {
627 log_info(LD_CIRC
, "CC: More on linked conn...");
638 * Upon receipt of a SENDME, pop the oldest timestamp off the timestamp
639 * list, and use this to update RTT.
641 * Returns true if circuit estimates were successfully updated, false
/* NOTE(review): the trailing argument of the bdp call (curr_rtt_usec)
 * was lost in extraction. */
645 congestion_control_update_circuit_estimates(congestion_control_t
*cc
,
646 const circuit_t
*circ
,
647 const crypt_path_t
*layer_hint
)
649 uint64_t now_usec
= monotime_absolute_usec();
651 /* Update RTT first, then BDP. BDP needs fresh RTT */
652 uint64_t curr_rtt_usec
= congestion_control_update_circuit_rtt(cc
, now_usec
);
653 return congestion_control_update_circuit_bdp(cc
, circ
, layer_hint
, now_usec
,
658 * Returns true if we have enough time data to use heuristics
659 * to compare RTT to a baseline.
/* NOTE(review): the `return true`/`return false` statements of each
 * branch were lost in extraction. */
662 time_delta_should_use_heuristics(const congestion_control_t
*cc
)
665 /* If we have exited slow start, we should have processed at least
666 * a cwnd worth of RTTs */
667 if (!cc
->in_slow_start
) {
671 /* If we managed to get enough acks to estimate a SENDME BDP, then
672 * we have enough to estimate clock jumps relative to a baseline,
673 * too. (This is at least 'cc_bwe_min' acks). */
674 if (cc
->bdp
[BDP_ALG_SENDME_RATE
]) {
678 /* Not enough data to estimate clock jumps */
/* Cached global clock state: set by time_delta_stalled_or_jumped() and
 * read by is_monotime_clock_reliable(). */
682 static bool is_monotime_clock_broken
= false;
685 * Returns true if the monotime delta is 0, or is significantly
686 * different than the previous delta. Either case indicates
687 * that the monotime time source stalled or jumped.
689 * Also caches the clock state in the is_monotime_clock_broken flag,
690 * so we can also provide a is_monotime_clock_reliable() function,
691 * used by flow control rate timing.
694 time_delta_stalled_or_jumped(const congestion_control_t
*cc
,
695 uint64_t old_delta
, uint64_t new_delta
)
697 #define DELTA_DISCREPENCY_RATIO_MAX 100
698 /* If we have a 0 new_delta, that is definitely a monotime stall */
699 if (new_delta
== 0) {
700 static ratelim_t stall_info_limit
= RATELIM_INIT(60);
701 log_fn_ratelim(&stall_info_limit
, LOG_INFO
, LD_CIRC
,
702 "Congestion control cannot measure RTT due to monotime stall.");
704 /* If delta is ever 0, the monotime clock has stalled, and we should
705 * not use it anywhere. */
706 is_monotime_clock_broken
= true;
708 return is_monotime_clock_broken
;
711 /* If the old_delta is 0, we have no previous values on this circuit.
713 * So, return the global monotime status from other circuits, and
716 if (old_delta
== 0) {
717 return is_monotime_clock_broken
;
721 * For the heuristic cases, we need at least a few timestamps,
722 * to average out any previous partial stalls or jumps. So until
723 * that point, let's just use the cached status from other circuits.
725 if (!time_delta_should_use_heuristics(cc
)) {
726 return is_monotime_clock_broken
;
729 /* If old_delta is significantly larger than new_delta, then
730 * this means that the monotime clock recently stopped moving
732 if (old_delta
> new_delta
* DELTA_DISCREPENCY_RATIO_MAX
) {
733 static ratelim_t dec_notice_limit
= RATELIM_INIT(300);
734 log_fn_ratelim(&dec_notice_limit
, LOG_NOTICE
, LD_CIRC
,
735 "Sudden decrease in circuit RTT (%"PRIu64
" vs %"PRIu64
736 "), likely due to clock jump.",
737 new_delta
/1000, old_delta
/1000);
739 is_monotime_clock_broken
= true;
741 return is_monotime_clock_broken
;
744 /* If new_delta is significantly larger than old_delta, then
745 * this means that the monotime clock suddenly jumped forward. */
746 if (new_delta
> old_delta
* DELTA_DISCREPENCY_RATIO_MAX
) {
747 static ratelim_t dec_notice_limit
= RATELIM_INIT(300);
748 log_fn_ratelim(&dec_notice_limit
, LOG_NOTICE
, LD_CIRC
,
749 "Sudden increase in circuit RTT (%"PRIu64
" vs %"PRIu64
750 "), likely due to clock jump.",
751 new_delta
/1000, old_delta
/1000);
753 is_monotime_clock_broken
= true;
755 return is_monotime_clock_broken
;
758 /* All good! Update cached status, too */
759 is_monotime_clock_broken
= false;
761 return is_monotime_clock_broken
;
765 * Is the monotime clock stalled according to any circuits?
/* Inverse of the cached is_monotime_clock_broken flag. */
768 is_monotime_clock_reliable(void)
770 return !is_monotime_clock_broken
;
774 * Called when we get a SENDME. Updates circuit RTT by pulling off a
775 * timestamp of when we sent the CIRCWINDOW_INCREMENT-th cell from
776 * the queue of such timestamps, and comparing that to current time.
778 * Also updates min, max, and EWMA of RTT.
780 * Returns the current circuit RTT in usecs, or 0 if it could not be
781 * measured (due to clock jump, stall, etc).
/* NOTE(review): the `now_usec` parameter, the `return 0` for the
 * stalled-clock branch, and the final return were lost in extraction. */
784 congestion_control_update_circuit_rtt(congestion_control_t
*cc
,
787 uint64_t rtt
, ewma_cnt
;
788 uint64_t sent_at_timestamp
;
792 /* Get the time that we sent the cell that resulted in the other
793 * end sending this sendme. Use this to calculate RTT */
794 sent_at_timestamp
= dequeue_timestamp(cc
->sendme_pending_timestamps
);
796 rtt
= now_usec
- sent_at_timestamp
;
798 /* Do not update RTT at all if it looks fishy */
799 if (time_delta_stalled_or_jumped(cc
, cc
->ewma_rtt_usec
, rtt
)) {
/* EWMA window: ewma_cwnd_cnt cwnds' worth of acks, at least 2. */
803 ewma_cnt
= cc
->ewma_cwnd_cnt
*sendme_acks_per_cwnd(cc
);
804 ewma_cnt
= MAX(ewma_cnt
, 2); // Use at least 2
806 cc
->ewma_rtt_usec
= n_count_ewma(rtt
, cc
->ewma_rtt_usec
, ewma_cnt
);
/* Track the observed extremes as well. */
808 if (rtt
> cc
->max_rtt_usec
) {
809 cc
->max_rtt_usec
= rtt
;
812 if (cc
->min_rtt_usec
== 0 || rtt
< cc
->min_rtt_usec
) {
813 cc
->min_rtt_usec
= rtt
;
820 * Called when we get a SENDME. Updates the bandwidth-delay-product (BDP)
821 * estimates of a circuit. Several methods of computing BDP are used,
822 * depending on scenario. While some congestion control algorithms only
823 * use one of these methods, we update them all because it's quick and easy.
825 * - now_usec is the current monotime in usecs.
826 * - curr_rtt_usec is the current circuit RTT in usecs. It may be 0 if no
827 * RTT could be measured.
829 * Returns true if we were able to update BDP, false otherwise.
/* NOTE(review): this extracted copy has lost many interior lines
 * (declarations such as chan_q, several branch/else scaffolds, log call
 * arguments, and the final return). Treat the structure below as partial
 * and restore from the canonical file before building. */
832 congestion_control_update_circuit_bdp(congestion_control_t
*cc
,
833 const circuit_t
*circ
,
834 const crypt_path_t
*layer_hint
,
836 uint64_t curr_rtt_usec
)
839 unsigned int blocked_on_chan
= 0;
840 uint64_t timestamp_usec
;
841 uint64_t sendme_rate_bdp
= 0;
/* Read channel queue length and blocked status for this direction. */
845 if (CIRCUIT_IS_ORIGIN(circ
)) {
846 /* origin circs use n_chan */
847 chan_q
= circ
->n_chan_cells
.n
;
848 blocked_on_chan
= circ
->streams_blocked_on_n_chan
;
850 /* Both onion services and exits use or_circuit and p_chan */
851 chan_q
= CONST_TO_OR_CIRCUIT(circ
)->p_chan_cells
.n
;
852 blocked_on_chan
= circ
->streams_blocked_on_p_chan
;
855 /* If we have no EWMA RTT, it is because monotime has been stalled
856 * or messed up the entire time so far. Set our BDP estimates directly
858 if (!cc
->ewma_rtt_usec
) {
859 uint64_t cwnd
= cc
->cwnd
;
861 /* If the channel is blocked, keep subtracting off the chan_q
862 * until we hit the min cwnd. */
863 if (blocked_on_chan
) {
864 cwnd
= MAX(cwnd
- chan_q
, cc
->cwnd_min
);
865 cc
->blocked_chan
= 1;
867 cc
->blocked_chan
= 0;
/* With no usable RTT, every estimator falls back to cwnd. */
870 cc
->bdp
[BDP_ALG_CWND_RTT
] = cwnd
;
871 cc
->bdp
[BDP_ALG_INFLIGHT_RTT
] = cwnd
;
872 cc
->bdp
[BDP_ALG_SENDME_RATE
] = cwnd
;
873 cc
->bdp
[BDP_ALG_PIECEWISE
] = cwnd
;
875 static ratelim_t dec_notice_limit
= RATELIM_INIT(300);
876 log_fn_ratelim(&dec_notice_limit
, LOG_NOTICE
, LD_CIRC
,
877 "Our clock has been stalled for the entire lifetime of a circuit. "
878 "Performance may be sub-optimal.");
880 return blocked_on_chan
;
883 /* Congestion window based BDP will respond to changes in RTT only, and is
884 * relative to cwnd growth. It is useful for correcting for BDP
885 * overestimation, but if BDP is higher than the current cwnd, it will
888 * We multiply here first to avoid precision issues from min_RTT being
889 * close to ewma RTT. Since all fields are u64, there is plenty of
890 * room here to multiply first.
892 cc
->bdp
[BDP_ALG_CWND_RTT
] = cc
->cwnd
*cc
->min_rtt_usec
/cc
->ewma_rtt_usec
;
895 * If we have no pending streams, we do not have enough data to fill
896 * the BDP, so preserve our old estimates but do not make any more.
898 if (!blocked_on_chan
&& !circuit_has_active_streams(circ
, layer_hint
)) {
900 "CC: Streams drained. Spare package window: %"PRIu64
901 ", no BDP update", cc
->cwnd
- cc
->inflight
);
903 /* Clear SENDME timestamps; they will be wrong with intermittent data */
904 SMARTLIST_FOREACH(cc
->sendme_arrival_timestamps
, uint64_t *, t
,
906 smartlist_clear(cc
->sendme_arrival_timestamps
);
907 } else if (curr_rtt_usec
&& is_monotime_clock_reliable()) {
908 /* Sendme-based BDP will quickly measure BDP in much less than
909 * a cwnd worth of data when in use (in 2-10 SENDMEs).
911 * But if the link goes idle, it will be vastly lower than true BDP. Hence
912 * we only compute it if we have either pending stream data, or streams
913 * are still blocked on the channel queued data.
915 * We also do not compute it if we do not have a current RTT passed in,
916 * because that means that monotime is currently stalled or just jumped.
918 enqueue_timestamp(cc
->sendme_arrival_timestamps
, now_usec
);
920 if (smartlist_len(cc
->sendme_arrival_timestamps
) >= cc
->bwe_sendme_min
) {
921 /* If we have more sendmes than fit in a cwnd, trim the list.
922 * Those are not accurately measuring throughput, if cwnd is
923 * currently smaller than BDP */
924 while (smartlist_len(cc
->sendme_arrival_timestamps
) >
925 cc
->bwe_sendme_min
&&
926 (uint64_t)smartlist_len(cc
->sendme_arrival_timestamps
) >
927 sendme_acks_per_cwnd(cc
)) {
928 (void)dequeue_timestamp(cc
->sendme_arrival_timestamps
);
930 int sendme_cnt
= smartlist_len(cc
->sendme_arrival_timestamps
);
932 /* Calculate SENDME_BWE_COUNT pure average */
933 timestamp_usec
= peek_timestamp(cc
->sendme_arrival_timestamps
);
934 uint64_t delta
= now_usec
- timestamp_usec
;
936 /* The acked data is in sendme_cnt-1 chunks, because we are counting the
937 * data that is processed by the other endpoint *between* all of these
938 * sendmes. There's one less gap between the sendmes than the number
940 uint64_t cells
= (sendme_cnt
-1)*cc
->sendme_inc
;
942 /* The bandwidth estimate is cells/delta, which when multiplied
943 * by min RTT obtains the BDP. However, we multiply first to
944 * avoid precision issues with the RTT being close to delta in size. */
945 sendme_rate_bdp
= cells
*cc
->min_rtt_usec
/delta
;
947 /* Calculate BDP_EWMA_COUNT N-EWMA */
948 cc
->bdp
[BDP_ALG_SENDME_RATE
] =
949 n_count_ewma(sendme_rate_bdp
, cc
->bdp
[BDP_ALG_SENDME_RATE
],
950 cc
->ewma_cwnd_cnt
*sendme_acks_per_cwnd(cc
));
953 /* In-flight BDP will cause the cwnd to drift down when underutilized.
954 * It is most useful when the local OR conn is blocked, so we only
955 * compute it if we're utilized. */
956 cc
->bdp
[BDP_ALG_INFLIGHT_RTT
] =
957 (cc
->inflight
- chan_q
)*cc
->min_rtt_usec
/
958 MAX(cc
->ewma_rtt_usec
, curr_rtt_usec
);
960 /* We can still update inflight with just an EWMA RTT, but only
961 * if there is data flowing */
962 cc
->bdp
[BDP_ALG_INFLIGHT_RTT
] =
963 (cc
->inflight
- chan_q
)*cc
->min_rtt_usec
/cc
->ewma_rtt_usec
;
966 /* The orconn is blocked; use smaller of inflight vs SENDME */
967 if (blocked_on_chan
) {
968 log_info(LD_CIRC
, "CC: Streams blocked on circ channel. Chanq: %d",
971 /* A blocked channel is an immediate congestion signal, but it still
972 * happens only once per cwnd */
973 if (!cc
->blocked_chan
) {
974 cc
->next_cc_event
= 0;
975 cc
->blocked_chan
= 1;
978 if (cc
->bdp
[BDP_ALG_SENDME_RATE
]) {
979 cc
->bdp
[BDP_ALG_PIECEWISE
] = MIN(cc
->bdp
[BDP_ALG_INFLIGHT_RTT
],
980 cc
->bdp
[BDP_ALG_SENDME_RATE
]);
982 cc
->bdp
[BDP_ALG_PIECEWISE
] = cc
->bdp
[BDP_ALG_INFLIGHT_RTT
];
985 /* If we were previously blocked, emit a new congestion event
986 * now that we are unblocked, to re-evaluate cwnd */
987 if (cc
->blocked_chan
) {
988 cc
->blocked_chan
= 0;
989 cc
->next_cc_event
= 0;
990 log_info(LD_CIRC
, "CC: Streams un-blocked on circ channel. Chanq: %d",
994 cc
->bdp
[BDP_ALG_PIECEWISE
] = MAX(cc
->bdp
[BDP_ALG_SENDME_RATE
],
995 cc
->bdp
[BDP_ALG_CWND_RTT
]);
998 /* We can end up with no piecewise value if we didn't have either
999 * a SENDME estimate or enough data for an inflight estimate.
1000 * It also happens on the very first sendme, since we need two
1001 * to get a BDP. In these cases, use the cwnd method. */
1002 if (!cc
->bdp
[BDP_ALG_PIECEWISE
]) {
1003 cc
->bdp
[BDP_ALG_PIECEWISE
] = cc
->bdp
[BDP_ALG_CWND_RTT
];
1004 log_info(LD_CIRC
, "CC: No piecewise BDP. Using %"PRIu64
,
1005 cc
->bdp
[BDP_ALG_PIECEWISE
]);
/* Debug logging of all estimates when a cc event is due. */
1008 if (cc
->next_cc_event
== 0) {
1009 if (CIRCUIT_IS_ORIGIN(circ
)) {
1012 "SENDME RTT: %"PRIu64
", %"PRIu64
", %"PRIu64
", %"PRIu64
", "
1019 CONST_TO_ORIGIN_CIRCUIT(circ
)->global_identifier
,
1020 cc
->min_rtt_usec
/1000,
1022 cc
->ewma_rtt_usec
/1000,
1023 cc
->max_rtt_usec
/1000,
1024 cc
->bdp
[BDP_ALG_INFLIGHT_RTT
],
1025 cc
->bdp
[BDP_ALG_CWND_RTT
],
1027 cc
->bdp
[BDP_ALG_SENDME_RATE
],
1028 cc
->bdp
[BDP_ALG_PIECEWISE
]
1032 "CC: Circuit %"PRIu64
":%d "
1033 "SENDME RTT: %"PRIu64
", %"PRIu64
", %"PRIu64
", %"PRIu64
", "
1039 // XXX: actually, is this p_chan here? This is
1040 // an or_circuit (exit or onion)
1041 circ
->n_chan
->global_identifier
, circ
->n_circ_id
,
1042 cc
->min_rtt_usec
/1000,
1044 cc
->ewma_rtt_usec
/1000,
1045 cc
->max_rtt_usec
/1000,
1046 cc
->bdp
[BDP_ALG_INFLIGHT_RTT
],
1047 cc
->bdp
[BDP_ALG_CWND_RTT
],
1049 cc
->bdp
[BDP_ALG_SENDME_RATE
],
1050 cc
->bdp
[BDP_ALG_PIECEWISE
]
1055 /* We updated BDP this round if either we had a blocked channel, or
1056 * the curr_rtt_usec was not 0. */
1057 bool ret
= (blocked_on_chan
|| curr_rtt_usec
!= 0);
1059 tor_trace(TR_SUBSYS(cc
), TR_EV(bdp_update
), circ
, cc
, curr_rtt_usec
,
1066 * Dispatch the sendme to the appropriate congestion control algorithm.
/* NOTE(review): the extracted copy is missing the VEGAS/NOLA case labels,
 * the break statements, the default case, and the final `return ret;`. */
1069 congestion_control_dispatch_cc_alg(congestion_control_t
*cc
,
1070 const circuit_t
*circ
,
1071 const crypt_path_t
*layer_hint
)
/* Default to an internal-error close reason if no case updates it. */
1073 int ret
= -END_CIRC_REASON_INTERNAL
;
1074 switch (cc
->cc_alg
) {
1075 case CC_ALG_WESTWOOD
:
1076 ret
= congestion_control_westwood_process_sendme(cc
, circ
, layer_hint
);
1080 ret
= congestion_control_vegas_process_sendme(cc
, circ
, layer_hint
);
1084 ret
= congestion_control_nola_process_sendme(cc
, circ
, layer_hint
);
/* Regardless of algorithm, clamp cwnd to the consensus maximum. */
1092 if (cc
->cwnd
> cwnd_max
) {
1093 static ratelim_t cwnd_limit
= RATELIM_INIT(60);
1094 log_fn_ratelim(&cwnd_limit
, LOG_NOTICE
, LD_CIRC
,
1095 "Congestion control cwnd %"PRIu64
" exceeds max %d, clamping.",
1096 cc
->cwnd
, cwnd_max
);
1097 cc
->cwnd
= cwnd_max
;
1104 * Build an extension field request to negotiate congestion control.
1106 * If congestion control is enabled, field TRUNNEL_EXT_TYPE_CC_FIELD_REQUEST
1107 * is created in msg_out. It is a single 0-length field that signifies that we
1108 * want to use congestion control. The length of msg_out is provided via
1111 * If congestion control is not enabled, a payload with 0 extensions is created
1114 * If there is a failure building the request, -1 is returned, else 0.
1116 * *msg_out must be freed if the return value is 0.
/* NOTE(review): the error-checks on the two trunnel calls, the
 * assignment of *msg_out, and the final return were lost in extraction. */
1119 congestion_control_build_ext_request(uint8_t **msg_out
, size_t *msg_len_out
)
1121 uint8_t *request
= NULL
;
1122 trn_extension_t
*ext
= NULL
;
1123 trn_extension_field_t
*field
= NULL
;
1125 ext
= trn_extension_new();
1127 /* With congestion control enabled, add the request, else it is an empty
1128 * request in the payload. */
1130 if (congestion_control_enabled()) {
1131 /* Build the extension field that will hold the CC field. */
1132 field
= trn_extension_field_new();
1133 trn_extension_field_set_field_type(field
,
1134 TRUNNEL_EXT_TYPE_CC_FIELD_REQUEST
);
1136 /* No payload indicating a request to use congestion control. */
1137 trn_extension_field_set_field_len(field
, 0);
1139 /* Build final extension. */
1140 trn_extension_add_fields(ext
, field
);
1141 trn_extension_set_num(ext
, 1);
1144 /* Encode extension. */
1145 ssize_t ret
= trn_extension_encoded_len(ext
);
1149 size_t request_len
= ret
;
1150 request
= tor_malloc_zero(request_len
);
1151 ret
= trn_extension_encode(request
, request_len
, ext
);
1157 *msg_len_out
= request_len
;
1159 /* Free everything, we've encoded the request now. */
1163 trn_extension_free(ext
);
1168 * Parse a congestion control ntorv3 request payload for extensions.
1170 * On parsing failure, -1 is returned.
1172 * If congestion control request is present, return 1. If it is not present,
1175 * WARNING: Called from CPU worker! Must not access any global state.
1178 congestion_control_parse_ext_request(const uint8_t *msg
, const size_t msg_len
)
1181 trn_extension_t
*ext
= NULL
;
1182 size_t num_fields
= 0;
1184 /* Parse extension from payload. */
1185 ret
= trn_extension_parse(&ext
, msg
, msg_len
);
1190 /* No extension implies no support for congestion control. In this case, we
1191 * simply return 0 to indicate CC is disabled. */
1192 if ((num_fields
= trn_extension_get_num(ext
)) == 0) {
1197 /* Go over all fields. If any field is TRUNNEL_EXT_TYPE_CC_FIELD_REQUEST,
1198 * then congestion control is enabled. Ignore unknown fields. */
1199 for (size_t f
= 0; f
< num_fields
; f
++) {
1200 const trn_extension_field_t
*field
= trn_extension_get_fields(ext
, f
);
1201 if (field
== NULL
) {
1206 /* For congestion control to be enabled, we only need the field type. */
1207 if (trn_extension_field_get_field_type(field
) ==
1208 TRUNNEL_EXT_TYPE_CC_FIELD_REQUEST
) {
1215 trn_extension_free(ext
);
1220 * Given our observed parameters for circuits and congestion control,
1221 * as well as the parameters for the resulting circuit, build a response
1222 * payload using extension fields into *msg_out, with length specified in
1225 * If congestion control will be enabled, the extension field for
1226 * TRUNNEL_EXT_TYPE_CC_FIELD_RESPONSE will contain the sendme_inc value.
1228 * If congestion control won't be enabled, an extension payload with 0
1229 * fields will be created.
1231 * Return 0 if an extension payload was created in *msg_out, and -1 on
1234 * *msg_out must be freed if the return value is 0.
1236 * WARNING: Called from CPU worker! Must not access any global state.
1239 congestion_control_build_ext_response(const circuit_params_t
*our_params
,
1240 const circuit_params_t
*circ_params
,
1241 uint8_t **msg_out
, size_t *msg_len_out
)
1244 uint8_t *request
= NULL
;
1245 trn_extension_t
*ext
= NULL
;
1246 trn_extension_field_t
*field
= NULL
;
1247 trn_extension_field_cc_t
*cc_field
= NULL
;
1249 tor_assert(our_params
);
1250 tor_assert(circ_params
);
1251 tor_assert(msg_out
);
1252 tor_assert(msg_len_out
);
1254 ext
= trn_extension_new();
1256 if (circ_params
->cc_enabled
) {
1257 /* Build the extension field that will hold the CC field. */
1258 field
= trn_extension_field_new();
1259 trn_extension_field_set_field_type(field
,
1260 TRUNNEL_EXT_TYPE_CC_FIELD_RESPONSE
);
1262 /* Build the congestion control field response. */
1263 cc_field
= trn_extension_field_cc_new();
1264 trn_extension_field_cc_set_sendme_inc(cc_field
,
1265 our_params
->sendme_inc_cells
);
1267 ret
= trn_extension_field_cc_encoded_len(cc_field
);
1268 if (BUG(ret
<= 0)) {
1271 size_t field_len
= ret
;
1272 trn_extension_field_set_field_len(field
, field_len
);
1273 trn_extension_field_setlen_field(field
, field_len
);
1275 uint8_t *field_array
= trn_extension_field_getarray_field(field
);
1276 ret
= trn_extension_field_cc_encode(field_array
,
1277 trn_extension_field_getlen_field(field
), cc_field
);
1278 if (BUG(ret
<= 0)) {
1282 /* Build final extension. */
1283 trn_extension_add_fields(ext
, field
);
1284 trn_extension_set_num(ext
, 1);
1287 /* Encode extension. */
1288 ret
= trn_extension_encoded_len(ext
);
1292 size_t request_len
= ret
;
1293 request
= tor_malloc_zero(request_len
);
1294 ret
= trn_extension_encode(request
, request_len
, ext
);
1300 *msg_len_out
= request_len
;
1302 /* We've just encoded the extension, clean everything. */
1307 trn_extension_free(ext
);
1309 trn_extension_field_free(field
);
1311 trn_extension_field_cc_free(cc_field
);
/** Return true iff the given sendme increment is within the acceptable
 * margins of our current consensus parameter. */
bool
congestion_control_validate_sendme_increment(uint8_t sendme_inc)
{
  /* We will only accept this response (and this circuit) if sendme_inc
   * is within a factor of 2 of our consensus value. We should not need
   * to change cc_sendme_inc much, and if we do, we can spread out those
   * changes over smaller increments once every 4 hours. Exits that
   * violate this range should just not be used. */
#define MAX_SENDME_INC_NEGOTIATE_FACTOR 2

  /* Zero can never be a valid increment. */
  if (sendme_inc == 0)
    return false;

  const unsigned consensus_inc = congestion_control_sendme_inc();

  /* Too large: more than FACTOR times the consensus value. */
  if (sendme_inc > MAX_SENDME_INC_NEGOTIATE_FACTOR * consensus_inc)
    return false;

  /* Too small: less than the consensus value divided by FACTOR. */
  if (sendme_inc < consensus_inc / MAX_SENDME_INC_NEGOTIATE_FACTOR)
    return false;

  return true;
}
1339 /** Return 1 if CC is enabled which also will set the SENDME increment into our
1340 * params_out. Return 0 if CC is disabled. Else, return -1 on error. */
1342 congestion_control_parse_ext_response(const uint8_t *msg
,
1343 const size_t msg_len
,
1344 circuit_params_t
*params_out
)
1347 size_t num_fields
= 0;
1348 trn_extension_t
*ext
= NULL
;
1349 trn_extension_field_cc_t
*cc_field
= NULL
;
1351 /* We will only accept this response (and this circuit) if sendme_inc
1352 * is within a factor of 2 of our consensus value. We should not need
1353 * to change cc_sendme_inc much, and if we do, we can spread out those
1354 * changes over smaller increments once every 4 hours. Exits that
1355 * violate this range should just not be used. */
1356 #define MAX_SENDME_INC_NEGOTIATE_FACTOR 2
1358 /* Parse extension from payload. */
1359 ret
= trn_extension_parse(&ext
, msg
, msg_len
);
1364 if ((num_fields
= trn_extension_get_num(ext
)) == 0) {
1369 /* Go over all fields. If any field is TRUNNEL_EXT_TYPE_CC_FIELD_RESPONSE,
1370 * then congestion control is enabled. Ignore unknown fields. */
1371 for (size_t f
= 0; f
< num_fields
; f
++) {
1372 const trn_extension_field_t
*field
= trn_extension_get_fields(ext
, f
);
1373 if (field
== NULL
) {
1378 /* Only examine TRUNNEL_EXT_TYPE_CC_FIELD_RESPONSE; ignore other fields */
1379 if (trn_extension_field_get_field_type(field
) ==
1380 TRUNNEL_EXT_TYPE_CC_FIELD_RESPONSE
) {
1382 /* Parse the field into the congestion control field. */
1383 ret
= trn_extension_field_cc_parse(&cc_field
,
1384 trn_extension_field_getconstarray_field(field
),
1385 trn_extension_field_getlen_field(field
));
1390 uint8_t sendme_inc_cells
=
1391 trn_extension_field_cc_get_sendme_inc(cc_field
);
1392 if (!congestion_control_validate_sendme_increment(sendme_inc_cells
)) {
1397 /* All good. Get value and break */
1398 params_out
->sendme_inc_cells
= sendme_inc_cells
;
1405 trn_extension_free(ext
);
1406 trn_extension_field_cc_free(cc_field
);