1 /* Copyright (c) 2021, The Tor Project, Inc. */
2 /* See LICENSE for licensing information */
5 * \file congestion_control_common.c
6 * \brief Common code used by all congestion control algorithms.
9 #define TOR_CONGESTION_CONTROL_COMMON_PRIVATE
11 #include "core/or/or.h"
13 #include "core/crypto/onion_crypto.h"
14 #include "core/or/circuitlist.h"
15 #include "core/or/crypt_path.h"
16 #include "core/or/or_circuit_st.h"
17 #include "core/or/origin_circuit_st.h"
18 #include "core/or/channel.h"
19 #include "core/mainloop/connection.h"
20 #include "core/or/sendme.h"
21 #include "core/or/congestion_control_common.h"
22 #include "core/or/congestion_control_vegas.h"
23 #include "core/or/congestion_control_nola.h"
24 #include "core/or/congestion_control_westwood.h"
25 #include "core/or/congestion_control_st.h"
26 #include "core/or/trace_probes_cc.h"
27 #include "lib/time/compat_time.h"
28 #include "feature/nodelist/networkstatus.h"
29 #include "app/config/config.h"
31 #include "trunnel/congestion_control.h"
32 #include "trunnel/extension.h"
34 /* Consensus parameter defaults.
36 * More details for each of the parameters can be found in proposal 324,
37 * section 6.5 including tuning notes. */
38 #define CIRCWINDOW_INIT (500)
39 #define SENDME_INC_DFLT (50)
40 #define CC_ALG_DFLT (CC_ALG_SENDME)
41 #define CC_ALG_DFLT_ALWAYS (CC_ALG_VEGAS)
43 #define CWND_INC_DFLT (50)
44 #define CWND_INC_PCT_SS_DFLT (100)
45 #define CWND_INC_RATE_DFLT (1)
46 #define CWND_MAX_DFLT (INT32_MAX)
47 #define CWND_MIN_DFLT (MAX(100, SENDME_INC_DFLT))
49 #define BWE_SENDME_MIN_DFLT (5)
50 #define EWMA_CWND_COUNT_DFLT (2)
52 /* The BDP algorithm for each congestion control algorithm uses the piecewise
53 * estimator. See section 3.1.4 of proposal 324. */
54 #define WESTWOOD_BDP_ALG BDP_ALG_PIECEWISE
55 #define VEGAS_BDP_MIX_ALG BDP_ALG_PIECEWISE
56 #define NOLA_BDP_ALG BDP_ALG_PIECEWISE
58 /* Indicate OR connection buffer limitations used to stop or start accepting
59 * cells in its outbuf.
61 * These watermarks are historical to tor in a sense that they've been used
62 * almost from the genesis point. And were likely defined to fit the bounds of
63 * TLS records of 16KB which would be around 32 cells.
65 * These are defaults of the consensus parameter "orconn_high" and "orconn_low"
67 #define OR_CONN_HIGHWATER_DFLT (32*1024)
68 #define OR_CONN_LOWWATER_DFLT (16*1024)
70 /* Low and high values of circuit cell queue sizes. They are used to tell when
71 * to start or stop reading on the streams attached on the circuit.
73 * These are defaults of the consensus parameters "cellq_high" and "cellq_low".
75 #define CELL_QUEUE_LOW_DFLT (10)
76 #define CELL_QUEUE_HIGH_DFLT (256)
/* Forward declarations of the static helpers below.
 * NOTE(review): this extracted text appears to have lost interior lines
 * (the trailing parameters of these prototypes are missing); verify
 * against the canonical file before compiling. */
78 static uint64_t congestion_control_update_circuit_rtt(congestion_control_t
*,
80 static bool congestion_control_update_circuit_bdp(congestion_control_t
*,
85 void congestion_control_set_cc_enabled(void);
87 /* Consensus parameters cached. The non static ones are extern. */
/* Hard cap applied to every cwnd in congestion_control_dispatch_cc_alg(). */
88 static uint32_t cwnd_max
= CWND_MAX_DFLT
;
/* Circuit cell queue thresholds used to stop/start reading on streams. */
89 int32_t cell_queue_high
= CELL_QUEUE_HIGH_DFLT
;
90 int32_t cell_queue_low
= CELL_QUEUE_LOW_DFLT
;
/* OR connection outbuf watermarks ("orconn_high"/"orconn_low"). */
91 uint32_t or_conn_highwater
= OR_CONN_HIGHWATER_DFLT
;
92 uint32_t or_conn_lowwater
= OR_CONN_LOWWATER_DFLT
;
/* Cells acked per SENDME, from the "cc_sendme_inc" consensus param. */
93 uint8_t cc_sendme_inc
= SENDME_INC_DFLT
;
/* Cached consensus choice of congestion control algorithm ("cc_alg"). */
94 static cc_alg_t cc_alg
= CC_ALG_DFLT
;
97 * Update global congestion control related consensus parameter values,
98 * every consensus update.
/* NOTE(review): several lines of this function were lost in extraction —
 * most networkstatus_get_param() calls below are missing their minimum
 * bound and some are missing the assignment target; restore from the
 * canonical file before building. */
101 congestion_control_new_consensus_params(const networkstatus_t
*ns
)
103 #define CELL_QUEUE_HIGH_MIN (1)
104 #define CELL_QUEUE_HIGH_MAX (1000)
/* Cache "cellq_high": cell queue level at which we stop reading streams. */
105 cell_queue_high
= networkstatus_get_param(ns
, "cellq_high",
106 CELL_QUEUE_HIGH_DFLT
,
108 CELL_QUEUE_HIGH_MAX
);
110 #define CELL_QUEUE_LOW_MIN (1)
111 #define CELL_QUEUE_LOW_MAX (1000)
/* Cache "cellq_low": level at which we resume reading streams. */
112 cell_queue_low
= networkstatus_get_param(ns
, "cellq_low",
117 #define OR_CONN_HIGHWATER_MIN (CELL_PAYLOAD_SIZE)
118 #define OR_CONN_HIGHWATER_MAX (INT32_MAX)
/* Cache "orconn_high": OR conn outbuf highwater mark. */
120 networkstatus_get_param(ns
, "orconn_high",
121 OR_CONN_HIGHWATER_DFLT
,
122 OR_CONN_HIGHWATER_MIN
,
123 OR_CONN_HIGHWATER_MAX
);
125 #define OR_CONN_LOWWATER_MIN (CELL_PAYLOAD_SIZE)
126 #define OR_CONN_LOWWATER_MAX (INT32_MAX)
/* Cache "orconn_low": OR conn outbuf lowwater mark. */
128 networkstatus_get_param(ns
, "orconn_low",
129 OR_CONN_LOWWATER_DFLT
,
130 OR_CONN_LOWWATER_MIN
,
131 OR_CONN_LOWWATER_MAX
);
133 #define CWND_MAX_MIN 500
134 #define CWND_MAX_MAX (INT32_MAX)
/* Cache "cc_cwnd_max": global cap on congestion windows. */
136 networkstatus_get_param(NULL
, "cc_cwnd_max",
141 #define SENDME_INC_MIN 10
142 #define SENDME_INC_MAX (1000)
/* Cache "cc_sendme_inc": cells acked per SENDME. */
144 networkstatus_get_param(NULL
, "cc_sendme_inc",
150 #define CC_ALG_MAX (NUM_CC_ALGS-1)
/* Cache "cc_alg": which congestion control algorithm to negotiate. */
152 networkstatus_get_param(NULL
, "cc_alg",
159 * Set congestion control parameters on a circuit's congestion
160 * control object based on values from the consensus.
162 * cc_alg is the negotiated congestion control algorithm.
164 * sendme_inc is the number of packaged cells that a sendme cell
165 * acks. This parameter will come from circuit negotiation.
/* NOTE(review): interior lines are missing from this extracted copy —
 * several networkstatus_get_param() calls lack their assignment target
 * and min argument, and the switch below is missing case labels and
 * break statements. Restore from the canonical file. */
168 congestion_control_init_params(congestion_control_t
*cc
,
169 const circuit_params_t
*params
)
171 const or_options_t
*opts
= get_options();
/* sendme_inc comes from circuit negotiation, not the consensus. */
172 cc
->sendme_inc
= params
->sendme_inc_cells
;
174 #define CWND_INIT_MIN 100
175 #define CWND_INIT_MAX (10000)
/* "cc_cwnd_init": initial congestion window. */
177 networkstatus_get_param(NULL
, "cc_cwnd_init",
182 #define CWND_INC_PCT_SS_MIN 1
183 #define CWND_INC_PCT_SS_MAX (500)
/* "cc_cwnd_inc_pct_ss": percent cwnd growth per ack during slow start. */
184 cc
->cwnd_inc_pct_ss
=
185 networkstatus_get_param(NULL
, "cc_cwnd_inc_pct_ss",
186 CWND_INC_PCT_SS_DFLT
,
188 CWND_INC_PCT_SS_MAX
);
190 #define CWND_INC_MIN 1
191 #define CWND_INC_MAX (1000)
/* "cc_cwnd_inc": steady-state cwnd increment. */
193 networkstatus_get_param(NULL
, "cc_cwnd_inc",
198 #define CWND_INC_RATE_MIN 1
199 #define CWND_INC_RATE_MAX (250)
/* "cc_cwnd_inc_rate": how often (in SENDMEs) the cwnd is updated. */
201 networkstatus_get_param(NULL
, "cc_cwnd_inc_rate",
206 #define CWND_MIN_MIN 20
207 #define CWND_MIN_MAX (1000)
/* "cc_cwnd_min": floor for the congestion window. */
209 networkstatus_get_param(NULL
, "cc_cwnd_min",
214 #define EWMA_CWND_COUNT_MIN 1
215 #define EWMA_CWND_COUNT_MAX (100)
/* "cc_ewma_cwnd_cnt": number of cwnds worth of acks in the RTT EWMA. */
217 networkstatus_get_param(NULL
, "cc_ewma_cwnd_cnt",
218 EWMA_CWND_COUNT_DFLT
,
220 EWMA_CWND_COUNT_MAX
);
222 #define BWE_SENDME_MIN_MIN 2
223 #define BWE_SENDME_MIN_MAX (20)
/* "cc_bwe_min": minimum SENDMEs needed for a bandwidth estimate. */
225 networkstatus_get_param(NULL
, "cc_bwe_min",
230 /* If the consensus says to use OG sendme, but torrc has
231 * always-enabled, use the default "always" alg (vegas),
232 * else use cached consensus alg. */
233 if (cc_alg
== CC_ALG_SENDME
&& opts
->AlwaysCongestionControl
) {
234 cc
->cc_alg
= CC_ALG_DFLT_ALWAYS
;
/* Pick the default BDP estimator for the negotiated algorithm. */
239 bdp_alg_t default_bdp_alg
= 0;
241 switch (cc
->cc_alg
) {
242 case CC_ALG_WESTWOOD
:
243 default_bdp_alg
= WESTWOOD_BDP_ALG
;
246 default_bdp_alg
= VEGAS_BDP_MIX_ALG
;
249 default_bdp_alg
= NOLA_BDP_ALG
;
/* Unknown algorithm: soft-assert and skip alg-specific params. */
253 tor_fragile_assert();
254 return; // No alg-specific params
/* "cc_bdp_alg": consensus override of the default BDP estimator. */
258 networkstatus_get_param(NULL
, "cc_bdp_alg",
263 /* Algorithm-specific parameters */
264 if (cc
->cc_alg
== CC_ALG_WESTWOOD
) {
265 congestion_control_westwood_set_params(cc
);
266 } else if (cc
->cc_alg
== CC_ALG_VEGAS
) {
267 congestion_control_vegas_set_params(cc
);
268 } else if (cc
->cc_alg
== CC_ALG_NOLA
) {
269 congestion_control_nola_set_params(cc
);
273 /** Returns true if congestion control is enabled in the most recent
274 * consensus, or if __AlwaysCongestionControl is set to true.
276 * Note that this function (and many many other functions) should not
277 * be called from the CPU worker threads when handling congestion
278 * control negotiation. Relevant values are marshaled into the
279 * `circuit_params_t` struct, in order to be used in worker threads
280 * without touching global state. Use those values in CPU worker
281 * threads, instead of calling this function.
283 * The danger is still present, in your time, as it was in ours.
/* NOTE(review): the early-return for the AlwaysCongestionControl branch
 * appears to have been lost in extraction; the if below has no body here. */
286 congestion_control_enabled(void)
288 const or_options_t
*opts
= NULL
;
/* Reads cached global state; must only run on the main thread. */
290 tor_assert_nonfatal_once(in_main_thread());
292 opts
= get_options();
294 /* If the user has set "__AlwaysCongestionControl",
295 * then always try to negotiate congestion control, regardless
296 * of consensus param. This is to be used for testing and sbws.
298 * Note that we do *not* allow disabling congestion control
299 * if the consensus says to use it, as this is bad for queueing
301 if (opts
->AlwaysCongestionControl
)
/* Otherwise: enabled iff the consensus picked anything but legacy SENDME. */
304 return cc_alg
!= CC_ALG_SENDME
;
308 * For unit tests only: set the cached consensus cc alg to
/* Forces the cached consensus algorithm to Vegas so that
 * congestion_control_enabled() reports true in tests. */
312 congestion_control_set_cc_enabled(void)
314 cc_alg
= CC_ALG_VEGAS
;
318 * Allocate and initialize fields in congestion control object.
320 * cc_alg is the negotiated congestion control algorithm.
322 * sendme_inc is the number of packaged cells that a sendme cell
323 * acks. This parameter will come from circuit negotiation.
326 congestion_control_init(congestion_control_t
*cc
,
327 const circuit_params_t
*params
)
/* Timestamp queues used for RTT (pending) and BDP (arrival) estimation. */
329 cc
->sendme_pending_timestamps
= smartlist_new();
330 cc
->sendme_arrival_timestamps
= smartlist_new();
/* New circuits always begin in slow start. */
332 cc
->in_slow_start
= 1;
333 congestion_control_init_params(cc
, params
);
/* Must run after init_params(): CWND_UPDATE_RATE reads fields set there. */
335 cc
->next_cc_event
= CWND_UPDATE_RATE(cc
);
338 /** Allocate and initialize a new congestion control object */
/* NOTE(review): the final `return cc;` appears to have been lost in
 * extraction; verify against the canonical file. */
339 congestion_control_t
*
340 congestion_control_new(const circuit_params_t
*params
)
342 congestion_control_t
*cc
= tor_malloc_zero(sizeof(congestion_control_t
));
344 congestion_control_init(cc
, params
);
350 * Free a congestion control object and its associated state.
/* NOTE(review): the extracted text is missing this function's usual
 * NULL-guard and the final free of the object itself; only the timestamp
 * list teardown is visible here. */
353 congestion_control_free_(congestion_control_t
*cc
)
/* Free each heap-allocated timestamp, then the lists that held them. */
358 SMARTLIST_FOREACH(cc
->sendme_pending_timestamps
, uint64_t *, t
, tor_free(t
));
359 SMARTLIST_FOREACH(cc
->sendme_arrival_timestamps
, uint64_t *, t
, tor_free(t
));
360 smartlist_free(cc
->sendme_pending_timestamps
);
361 smartlist_free(cc
->sendme_arrival_timestamps
);
367 * Enqueue a u64 timestamp to the end of a queue of timestamps.
370 enqueue_timestamp(smartlist_t
*timestamps_u64
, uint64_t timestamp_usec
)
372 uint64_t *timestamp_ptr
= tor_malloc(sizeof(uint64_t));
373 *timestamp_ptr
= timestamp_usec
;
375 smartlist_add(timestamps_u64
, timestamp_ptr
);
379 * Peek at the head of a smartlist queue of u64 timestamps.
/* NOTE(review): the `return 0;` inside the BUG branch appears to have
 * been lost in extraction; as shown, the branch falls through. */
381 static inline uint64_t
382 peek_timestamp(const smartlist_t
*timestamps_u64_usecs
)
/* Head of the queue is index 0; entries are heap-allocated uint64_t. */
384 uint64_t *timestamp_ptr
= smartlist_get(timestamps_u64_usecs
, 0);
386 if (BUG(!timestamp_ptr
)) {
387 log_err(LD_CIRC
, "Congestion control timestamp list became empty!");
/* Return the value without removing it from the queue. */
391 return *timestamp_ptr
;
395 * Dequeue a u64 monotime usec timestamp from the front of a
396 * smartlist of pointers to 64.
/* NOTE(review): the `return 0;` inside the BUG branch appears to have
 * been lost in extraction. */
398 static inline uint64_t
399 dequeue_timestamp(smartlist_t
*timestamps_u64_usecs
)
401 uint64_t *timestamp_ptr
= smartlist_get(timestamps_u64_usecs
, 0);
402 uint64_t timestamp_u64
;
404 if (BUG(!timestamp_ptr
)) {
405 log_err(LD_CIRC
, "Congestion control timestamp list became empty!");
/* Copy the value out, drop the head entry (preserving order), and free
 * the heap allocation made by enqueue_timestamp(). */
409 timestamp_u64
= *timestamp_ptr
;
410 smartlist_del_keeporder(timestamps_u64_usecs
, 0);
411 tor_free(timestamp_ptr
);
413 return timestamp_u64
;
417 * Returns the number of sendme acks that will be recieved in the
418 * current congestion window size, rounded to nearest int.
420 static inline uint64_t
421 sendme_acks_per_cwnd(const congestion_control_t
*cc
)
423 /* We add half a sendme_inc to cwnd to round to the nearest int */
424 return ((cc
->cwnd
+ cc
->sendme_inc
/2)/cc
->sendme_inc
);
428 * Get a package window from either old sendme logic, or congestion control.
430 * A package window is how many cells you can still send.
/* NOTE(review): this extracted copy is missing the declaration of
 * `package_window`, the if/else scaffolding that selects the cpath vs
 * circ fields, and the return statements of the clamping branches. */
433 congestion_control_get_package_window(const circuit_t
*circ
,
434 const crypt_path_t
*cpath
)
437 congestion_control_t
*cc
;
/* Origin-side: read window and cc object from the cpath hop. */
442 package_window
= cpath
->package_window
;
443 cc
= cpath
->ccontrol
;
/* Otherwise: use the circuit-level window. */
445 package_window
= circ
->package_window
;
/* Legacy SENDME flow control: the stored window is the answer. */
450 return package_window
;
452 /* Inflight can be above cwnd if cwnd was just reduced */
453 if (cc
->inflight
> cc
->cwnd
)
455 /* In the extremely unlikely event that cwnd-inflight is larger than
456 * INT32_MAX, just return that cap, so old code doesn't explode. */
457 else if (cc
->cwnd
- cc
->inflight
> INT32_MAX
)
/* Congestion control: remaining budget is cwnd minus inflight. */
460 return (int)(cc
->cwnd
- cc
->inflight
);
465 * Returns the number of cells that are acked by every sendme.
/* NOTE(review): the conditionals guarding the two assignments below were
 * lost in extraction (only the assignments remain visible). */
468 sendme_get_inc_count(const circuit_t
*circ
, const crypt_path_t
*layer_hint
)
/* Default to the legacy fixed increment when no cc object is present. */
470 int sendme_inc
= CIRCWINDOW_INCREMENT
;
471 congestion_control_t
*cc
= NULL
;
474 cc
= layer_hint
->ccontrol
;
/* With congestion control, use the negotiated per-circuit increment. */
480 sendme_inc
= cc
->sendme_inc
;
486 /** Return true iff the next cell we send will result in the other endpoint
489 * We are able to know that because the package or inflight window value minus
490 * one cell (the possible SENDME cell) should be a multiple of the
491 * cells-per-sendme increment value (set via consensus parameter, negotiated
492 * for the circuit, and passed in as sendme_inc).
494 * This function is used when recording a cell digest and this is done quite
495 * low in the stack when decrypting or encrypting a cell. The window is only
496 * updated once the cell is actually put in the outbuf.
/* NOTE(review): this extracted copy is missing the `window` declaration,
 * the branch scaffolding, and the return statements. */
499 circuit_sent_cell_for_sendme(const circuit_t
*circ
,
500 const crypt_path_t
*layer_hint
)
502 congestion_control_t
*cc
;
/* Origin-side: take window and cc from the cpath hop... */
508 window
= layer_hint
->package_window
;
509 cc
= layer_hint
->ccontrol
;
/* ...otherwise from the circuit itself. */
511 window
= circ
->package_window
;
515 /* If we are using congestion control and the alg is not
516 * old-school 'fixed', then use cc->inflight to determine
517 * when sendmes will be sent */
522 /* This check must be +1 because this function is called *before*
523 * inflight is incremented for the sent cell */
524 if ((cc
->inflight
+1) % cc
->sendme_inc
!= 0)
530 /* At the start of the window, no SENDME will be expected. */
531 if (window
== CIRCWINDOW_START
) {
535 /* Are we at the limit of the increment and if not, we don't expect next
538 * We test against the window minus 1 because when we are looking if the
539 * next cell is a SENDME, the window (either package or deliver) hasn't been
540 * decremented just yet so when this is called, we are currently processing
541 * the "window - 1" cell.
543 if (((window
- 1) % CIRCWINDOW_INCREMENT
) != 0) {
547 /* Next cell is expected to be a SENDME. */
552 * Call-in to tell congestion control code that this circuit sent a cell.
554 * This updates the 'inflight' counter, and if this is a cell that will
555 * cause the other end to send a SENDME, record the current time in a list
556 * of pending timestamps, so that we can later compute the circuit RTT when
557 * the SENDME comes back. */
/* NOTE(review): the inflight increment and the early-return body of the
 * if below were lost in extraction. */
559 congestion_control_note_cell_sent(congestion_control_t
*cc
,
560 const circuit_t
*circ
,
561 const crypt_path_t
*cpath
)
566 /* Is this the last cell before a SENDME? The idea is that if the
567 * package_window reaches a multiple of the increment, after this cell, we
568 * should expect a SENDME. Note that this function must be called *before*
569 * we account for the sent cell. */
570 if (!circuit_sent_cell_for_sendme(circ
, cpath
)) {
577 /* Record this cell time for RTT computation when SENDME arrives */
578 enqueue_timestamp(cc
->sendme_pending_timestamps
,
579 monotime_absolute_usec());
583 * Returns true if any edge connections are active.
585 * We need to know this so that we can stop computing BDP if the
586 * edges are not sending on the circuit.
/* NOTE(review): the extracted copy is missing the `return 1`/`continue`
 * statements inside the loop and the final `return 0`. */
589 circuit_has_active_streams(const circuit_t
*circ
,
590 const crypt_path_t
*layer_hint
)
592 const edge_connection_t
*streams
;
/* Pick the stream list that matches the circuit's direction. */
594 if (CIRCUIT_IS_ORIGIN(circ
)) {
595 streams
= CONST_TO_ORIGIN_CIRCUIT(circ
)->p_streams
;
597 streams
= CONST_TO_OR_CIRCUIT(circ
)->n_streams
;
600 /* Check linked list of streams */
601 for (const edge_connection_t
*conn
= streams
; conn
!= NULL
;
602 conn
= conn
->next_stream
) {
/* Closed streams cannot supply more data. */
603 if (conn
->base_
.marked_for_close
)
/* Only consider streams attached to the hop we care about (if any). */
606 if (!layer_hint
|| conn
->cpath_layer
== layer_hint
) {
607 if (connection_get_inbuf_len(TO_CONN(conn
)) > 0) {
608 log_info(LD_CIRC
, "CC: More in edge inbuf...");
612 /* If we did not reach EOF on this read, there's more */
613 if (!TO_CONN(conn
)->inbuf_reached_eof
) {
614 log_info(LD_CIRC
, "CC: More on edge conn...");
618 if (TO_CONN(conn
)->linked_conn
) {
619 if (connection_get_inbuf_len(TO_CONN(conn
)->linked_conn
) > 0) {
620 log_info(LD_CIRC
, "CC: More in linked inbuf...");
624 /* If there is a linked conn, and *it* did not reach EOF,
626 if (!TO_CONN(conn
)->linked_conn
->inbuf_reached_eof
) {
627 log_info(LD_CIRC
, "CC: More on linked conn...");
638 * Upon receipt of a SENDME, pop the oldest timestamp off the timestamp
639 * list, and use this to update RTT.
641 * Returns true if circuit estimates were successfully updated, false
/* NOTE(review): the trailing argument of the bdp call (curr_rtt_usec)
 * was lost in extraction. */
645 congestion_control_update_circuit_estimates(congestion_control_t
*cc
,
646 const circuit_t
*circ
,
647 const crypt_path_t
*layer_hint
)
649 uint64_t now_usec
= monotime_absolute_usec();
651 /* Update RTT first, then BDP. BDP needs fresh RTT */
652 uint64_t curr_rtt_usec
= congestion_control_update_circuit_rtt(cc
, now_usec
);
653 return congestion_control_update_circuit_bdp(cc
, circ
, layer_hint
, now_usec
,
658 * Returns true if we have enough time data to use heuristics
659 * to compare RTT to a baseline.
/* NOTE(review): the `return true`/`return false` statements of each
 * branch were lost in extraction. */
662 time_delta_should_use_heuristics(const congestion_control_t
*cc
)
665 /* If we have exited slow start, we should have processed at least
666 * a cwnd worth of RTTs */
667 if (!cc
->in_slow_start
) {
671 /* If we managed to get enough acks to estimate a SENDME BDP, then
672 * we have enough to estimate clock jumps relative to a baseline,
673 * too. (This is at least 'cc_bwe_min' acks). */
674 if (cc
->bdp
[BDP_ALG_SENDME_RATE
]) {
678 /* Not enough data to estimate clock jumps */
/* Cached global clock state: set by time_delta_stalled_or_jumped() and
 * read by is_monotime_clock_reliable(). */
682 static bool is_monotime_clock_broken
= false;
685 * Returns true if the monotime delta is 0, or is significantly
686 * different than the previous delta. Either case indicates
687 * that the monotime time source stalled or jumped.
689 * Also caches the clock state in the is_monotime_clock_broken flag,
690 * so we can also provide a is_monotime_clock_reliable() function,
691 * used by flow control rate timing.
694 time_delta_stalled_or_jumped(const congestion_control_t
*cc
,
695 uint64_t old_delta
, uint64_t new_delta
)
697 #define DELTA_DISCREPENCY_RATIO_MAX 100
698 /* If we have a 0 new_delta, that is definitely a monotime stall */
699 if (new_delta
== 0) {
700 static ratelim_t stall_info_limit
= RATELIM_INIT(60);
701 log_fn_ratelim(&stall_info_limit
, LOG_INFO
, LD_CIRC
,
702 "Congestion control cannot measure RTT due to monotime stall.");
704 /* If delta is ever 0, the monotime clock has stalled, and we should
705 * not use it anywhere. */
706 is_monotime_clock_broken
= true;
708 return is_monotime_clock_broken
;
711 /* If the old_delta is 0, we have no previous values on this circuit.
713 * So, return the global monotime status from other circuits, and
716 if (old_delta
== 0) {
717 return is_monotime_clock_broken
;
721 * For the heuristic cases, we need at least a few timestamps,
722 * to average out any previous partial stalls or jumps. So until
723 * that point, let's just use the cached status from other circuits.
725 if (!time_delta_should_use_heuristics(cc
)) {
726 return is_monotime_clock_broken
;
729 /* If old_delta is significantly larger than new_delta, then
730 * this means that the monotime clock recently stopped moving
732 if (old_delta
> new_delta
* DELTA_DISCREPENCY_RATIO_MAX
) {
733 static ratelim_t dec_notice_limit
= RATELIM_INIT(300);
734 log_fn_ratelim(&dec_notice_limit
, LOG_NOTICE
, LD_CIRC
,
735 "Sudden decrease in circuit RTT (%"PRIu64
" vs %"PRIu64
736 "), likely due to clock jump.",
737 new_delta
/1000, old_delta
/1000);
739 is_monotime_clock_broken
= true;
741 return is_monotime_clock_broken
;
744 /* If new_delta is significantly larger than old_delta, then
745 * this means that the monotime clock suddenly jumped forward. */
746 if (new_delta
> old_delta
* DELTA_DISCREPENCY_RATIO_MAX
) {
747 static ratelim_t dec_notice_limit
= RATELIM_INIT(300);
748 log_fn_ratelim(&dec_notice_limit
, LOG_NOTICE
, LD_CIRC
,
749 "Sudden increase in circuit RTT (%"PRIu64
" vs %"PRIu64
750 "), likely due to clock jump.",
751 new_delta
/1000, old_delta
/1000);
753 is_monotime_clock_broken
= true;
755 return is_monotime_clock_broken
;
758 /* All good! Update cached status, too */
759 is_monotime_clock_broken
= false;
761 return is_monotime_clock_broken
;
765 * Is the monotime clock stalled according to any circuits?
/* Inverse of the cached is_monotime_clock_broken flag. */
768 is_monotime_clock_reliable(void)
770 return !is_monotime_clock_broken
;
774 * Called when we get a SENDME. Updates circuit RTT by pulling off a
775 * timestamp of when we sent the CIRCWINDOW_INCREMENT-th cell from
776 * the queue of such timestamps, and comparing that to current time.
778 * Also updates min, max, and EWMA of RTT.
780 * Returns the current circuit RTT in usecs, or 0 if it could not be
781 * measured (due to clock jump, stall, etc).
/* NOTE(review): the `now_usec` parameter, the `return 0` for the
 * stalled-clock branch, and the final return were lost in extraction. */
784 congestion_control_update_circuit_rtt(congestion_control_t
*cc
,
787 uint64_t rtt
, ewma_cnt
;
788 uint64_t sent_at_timestamp
;
792 /* Get the time that we sent the cell that resulted in the other
793 * end sending this sendme. Use this to calculate RTT */
794 sent_at_timestamp
= dequeue_timestamp(cc
->sendme_pending_timestamps
);
796 rtt
= now_usec
- sent_at_timestamp
;
798 /* Do not update RTT at all if it looks fishy */
799 if (time_delta_stalled_or_jumped(cc
, cc
->ewma_rtt_usec
, rtt
)) {
/* EWMA window: ewma_cwnd_cnt cwnds' worth of acks, at least 2. */
803 ewma_cnt
= cc
->ewma_cwnd_cnt
*sendme_acks_per_cwnd(cc
);
804 ewma_cnt
= MAX(ewma_cnt
, 2); // Use at least 2
806 cc
->ewma_rtt_usec
= n_count_ewma(rtt
, cc
->ewma_rtt_usec
, ewma_cnt
);
/* Track the observed extremes as well. */
808 if (rtt
> cc
->max_rtt_usec
) {
809 cc
->max_rtt_usec
= rtt
;
812 if (cc
->min_rtt_usec
== 0 || rtt
< cc
->min_rtt_usec
) {
813 cc
->min_rtt_usec
= rtt
;
820 * Called when we get a SENDME. Updates the bandwidth-delay-product (BDP)
821 * estimates of a circuit. Several methods of computing BDP are used,
822 * depending on scenario. While some congestion control algorithms only
823 * use one of these methods, we update them all because it's quick and easy.
825 * - now_usec is the current monotime in usecs.
826 * - curr_rtt_usec is the current circuit RTT in usecs. It may be 0 if no
827 * RTT could be measured.
829 * Returns true if we were able to update BDP, false otherwise.
/* NOTE(review): this extracted copy has lost many interior lines
 * (declarations such as chan_q, several branch/else scaffolds, log call
 * arguments, and the final return). Treat the structure below as partial
 * and restore from the canonical file before building. */
832 congestion_control_update_circuit_bdp(congestion_control_t
*cc
,
833 const circuit_t
*circ
,
834 const crypt_path_t
*layer_hint
,
836 uint64_t curr_rtt_usec
)
839 unsigned int blocked_on_chan
= 0;
840 uint64_t timestamp_usec
;
841 uint64_t sendme_rate_bdp
= 0;
/* Read channel queue length and blocked status for this direction. */
845 if (CIRCUIT_IS_ORIGIN(circ
)) {
846 /* origin circs use n_chan */
847 chan_q
= circ
->n_chan_cells
.n
;
848 blocked_on_chan
= circ
->streams_blocked_on_n_chan
;
850 /* Both onion services and exits use or_circuit and p_chan */
851 chan_q
= CONST_TO_OR_CIRCUIT(circ
)->p_chan_cells
.n
;
852 blocked_on_chan
= circ
->streams_blocked_on_p_chan
;
855 /* If we have no EWMA RTT, it is because monotime has been stalled
856 * or messed up the entire time so far. Set our BDP estimates directly
858 if (!cc
->ewma_rtt_usec
) {
859 uint64_t cwnd
= cc
->cwnd
;
861 /* If the channel is blocked, keep subtracting off the chan_q
862 * until we hit the min cwnd. */
863 if (blocked_on_chan
) {
864 cwnd
= MAX(cwnd
- chan_q
, cc
->cwnd_min
);
865 cc
->blocked_chan
= 1;
867 cc
->blocked_chan
= 0;
/* With no usable RTT, every estimator falls back to cwnd. */
870 cc
->bdp
[BDP_ALG_CWND_RTT
] = cwnd
;
871 cc
->bdp
[BDP_ALG_INFLIGHT_RTT
] = cwnd
;
872 cc
->bdp
[BDP_ALG_SENDME_RATE
] = cwnd
;
873 cc
->bdp
[BDP_ALG_PIECEWISE
] = cwnd
;
875 static ratelim_t dec_notice_limit
= RATELIM_INIT(300);
876 log_fn_ratelim(&dec_notice_limit
, LOG_NOTICE
, LD_CIRC
,
877 "Our clock has been stalled for the entire lifetime of a circuit. "
878 "Performance may be sub-optimal.");
880 return blocked_on_chan
;
883 /* Congestion window based BDP will respond to changes in RTT only, and is
884 * relative to cwnd growth. It is useful for correcting for BDP
885 * overestimation, but if BDP is higher than the current cwnd, it will
888 * We multiply here first to avoid precision issues from min_RTT being
889 * close to ewma RTT. Since all fields are u64, there is plenty of
890 * room here to multiply first.
892 cc
->bdp
[BDP_ALG_CWND_RTT
] = cc
->cwnd
*cc
->min_rtt_usec
/cc
->ewma_rtt_usec
;
895 * If we have no pending streams, we do not have enough data to fill
896 * the BDP, so preserve our old estimates but do not make any more.
898 if (!blocked_on_chan
&& !circuit_has_active_streams(circ
, layer_hint
)) {
900 "CC: Streams drained. Spare package window: %"PRIu64
901 ", no BDP update", cc
->cwnd
- cc
->inflight
);
903 /* Clear SENDME timestamps; they will be wrong with intermittent data */
904 SMARTLIST_FOREACH(cc
->sendme_arrival_timestamps
, uint64_t *, t
,
906 smartlist_clear(cc
->sendme_arrival_timestamps
);
907 } else if (curr_rtt_usec
&& is_monotime_clock_reliable()) {
908 /* Sendme-based BDP will quickly measure BDP in much less than
909 * a cwnd worth of data when in use (in 2-10 SENDMEs).
911 * But if the link goes idle, it will be vastly lower than true BDP. Hence
912 * we only compute it if we have either pending stream data, or streams
913 * are still blocked on the channel queued data.
915 * We also do not compute it if we do not have a current RTT passed in,
916 * because that means that monotime is currently stalled or just jumped.
918 enqueue_timestamp(cc
->sendme_arrival_timestamps
, now_usec
);
920 if (smartlist_len(cc
->sendme_arrival_timestamps
) >= cc
->bwe_sendme_min
) {
921 /* If we have more sendmes than fit in a cwnd, trim the list.
922 * Those are not accurately measuring throughput, if cwnd is
923 * currently smaller than BDP */
924 while (smartlist_len(cc
->sendme_arrival_timestamps
) >
925 cc
->bwe_sendme_min
&&
926 (uint64_t)smartlist_len(cc
->sendme_arrival_timestamps
) >
927 sendme_acks_per_cwnd(cc
)) {
928 (void)dequeue_timestamp(cc
->sendme_arrival_timestamps
);
930 int sendme_cnt
= smartlist_len(cc
->sendme_arrival_timestamps
);
932 /* Calculate SENDME_BWE_COUNT pure average */
933 timestamp_usec
= peek_timestamp(cc
->sendme_arrival_timestamps
);
934 uint64_t delta
= now_usec
- timestamp_usec
;
936 /* The acked data is in sendme_cnt-1 chunks, because we are counting the
937 * data that is processed by the other endpoint *between* all of these
938 * sendmes. There's one less gap between the sendmes than the number
940 uint64_t cells
= (sendme_cnt
-1)*cc
->sendme_inc
;
942 /* The bandwidth estimate is cells/delta, which when multiplied
943 * by min RTT obtains the BDP. However, we multiply first to
944 * avoid precision issues with the RTT being close to delta in size. */
945 sendme_rate_bdp
= cells
*cc
->min_rtt_usec
/delta
;
947 /* Calculate BDP_EWMA_COUNT N-EWMA */
948 cc
->bdp
[BDP_ALG_SENDME_RATE
] =
949 n_count_ewma(sendme_rate_bdp
, cc
->bdp
[BDP_ALG_SENDME_RATE
],
950 cc
->ewma_cwnd_cnt
*sendme_acks_per_cwnd(cc
));
953 /* In-flight BDP will cause the cwnd to drift down when underutilized.
954 * It is most useful when the local OR conn is blocked, so we only
955 * compute it if we're utilized. */
956 cc
->bdp
[BDP_ALG_INFLIGHT_RTT
] =
957 (cc
->inflight
- chan_q
)*cc
->min_rtt_usec
/
958 MAX(cc
->ewma_rtt_usec
, curr_rtt_usec
);
960 /* We can still update inflight with just an EWMA RTT, but only
961 * if there is data flowing */
962 cc
->bdp
[BDP_ALG_INFLIGHT_RTT
] =
963 (cc
->inflight
- chan_q
)*cc
->min_rtt_usec
/cc
->ewma_rtt_usec
;
966 /* The orconn is blocked; use smaller of inflight vs SENDME */
967 if (blocked_on_chan
) {
968 log_info(LD_CIRC
, "CC: Streams blocked on circ channel. Chanq: %d",
971 /* A blocked channel is an immediate congestion signal, but it still
972 * happens only once per cwnd */
973 if (!cc
->blocked_chan
) {
974 cc
->next_cc_event
= 0;
975 cc
->blocked_chan
= 1;
978 if (cc
->bdp
[BDP_ALG_SENDME_RATE
]) {
979 cc
->bdp
[BDP_ALG_PIECEWISE
] = MIN(cc
->bdp
[BDP_ALG_INFLIGHT_RTT
],
980 cc
->bdp
[BDP_ALG_SENDME_RATE
]);
982 cc
->bdp
[BDP_ALG_PIECEWISE
] = cc
->bdp
[BDP_ALG_INFLIGHT_RTT
];
985 /* If we were previously blocked, emit a new congestion event
986 * now that we are unblocked, to re-evaluate cwnd */
987 if (cc
->blocked_chan
) {
988 cc
->blocked_chan
= 0;
989 cc
->next_cc_event
= 0;
990 log_info(LD_CIRC
, "CC: Streams un-blocked on circ channel. Chanq: %d",
994 cc
->bdp
[BDP_ALG_PIECEWISE
] = MAX(cc
->bdp
[BDP_ALG_SENDME_RATE
],
995 cc
->bdp
[BDP_ALG_CWND_RTT
]);
998 /* We can end up with no piecewise value if we didn't have either
999 * a SENDME estimate or enough data for an inflight estimate.
1000 * It also happens on the very first sendme, since we need two
1001 * to get a BDP. In these cases, use the cwnd method. */
1002 if (!cc
->bdp
[BDP_ALG_PIECEWISE
]) {
1003 cc
->bdp
[BDP_ALG_PIECEWISE
] = cc
->bdp
[BDP_ALG_CWND_RTT
];
1004 log_info(LD_CIRC
, "CC: No piecewise BDP. Using %"PRIu64
,
1005 cc
->bdp
[BDP_ALG_PIECEWISE
]);
/* Debug logging of all estimates when a cc event is due. */
1008 if (cc
->next_cc_event
== 0) {
1009 if (CIRCUIT_IS_ORIGIN(circ
)) {
1012 "SENDME RTT: %"PRIu64
", %"PRIu64
", %"PRIu64
", %"PRIu64
", "
1019 CONST_TO_ORIGIN_CIRCUIT(circ
)->global_identifier
,
1020 cc
->min_rtt_usec
/1000,
1022 cc
->ewma_rtt_usec
/1000,
1023 cc
->max_rtt_usec
/1000,
1024 cc
->bdp
[BDP_ALG_INFLIGHT_RTT
],
1025 cc
->bdp
[BDP_ALG_CWND_RTT
],
1027 cc
->bdp
[BDP_ALG_SENDME_RATE
],
1028 cc
->bdp
[BDP_ALG_PIECEWISE
]
1032 "CC: Circuit %"PRIu64
":%d "
1033 "SENDME RTT: %"PRIu64
", %"PRIu64
", %"PRIu64
", %"PRIu64
", "
1039 // XXX: actually, is this p_chan here? This is
1040 // an or_circuit (exit or onion)
1041 circ
->n_chan
->global_identifier
, circ
->n_circ_id
,
1042 cc
->min_rtt_usec
/1000,
1044 cc
->ewma_rtt_usec
/1000,
1045 cc
->max_rtt_usec
/1000,
1046 cc
->bdp
[BDP_ALG_INFLIGHT_RTT
],
1047 cc
->bdp
[BDP_ALG_CWND_RTT
],
1049 cc
->bdp
[BDP_ALG_SENDME_RATE
],
1050 cc
->bdp
[BDP_ALG_PIECEWISE
]
1055 /* We updated BDP this round if either we had a blocked channel, or
1056 * the curr_rtt_usec was not 0. */
1057 bool ret
= (blocked_on_chan
|| curr_rtt_usec
!= 0);
1059 tor_trace(TR_SUBSYS(cc
), TR_EV(bdp_update
), circ
, cc
, curr_rtt_usec
,
1066 * Dispatch the sendme to the appropriate congestion control algorithm.
/* NOTE(review): the extracted copy is missing the VEGAS/NOLA case labels,
 * the break statements, the default case, and the final `return ret;`. */
1069 congestion_control_dispatch_cc_alg(congestion_control_t
*cc
,
1070 const circuit_t
*circ
,
1071 const crypt_path_t
*layer_hint
)
/* Default to an internal-error close reason if no case updates it. */
1073 int ret
= -END_CIRC_REASON_INTERNAL
;
1074 switch (cc
->cc_alg
) {
1075 case CC_ALG_WESTWOOD
:
1076 ret
= congestion_control_westwood_process_sendme(cc
, circ
, layer_hint
);
1080 ret
= congestion_control_vegas_process_sendme(cc
, circ
, layer_hint
);
1084 ret
= congestion_control_nola_process_sendme(cc
, circ
, layer_hint
);
/* Regardless of algorithm, clamp cwnd to the consensus maximum. */
1092 if (cc
->cwnd
> cwnd_max
) {
1093 static ratelim_t cwnd_limit
= RATELIM_INIT(60);
1094 log_fn_ratelim(&cwnd_limit
, LOG_NOTICE
, LD_CIRC
,
1095 "Congestion control cwnd %"PRIu64
" exceeds max %d, clamping.",
1096 cc
->cwnd
, cwnd_max
);
1097 cc
->cwnd
= cwnd_max
;
1104 * Build an extension field request to negotiate congestion control.
1106 * If congestion control is enabled, field TRUNNEL_EXT_TYPE_CC_FIELD_REQUEST
1107 * is created in msg_out. It is a single 0-length field that signifies that we
1108 * want to use congestion control. The length of msg_out is provided via
1111 * If congestion control is not enabled, a payload with 0 extensions is created
1114 * If there is a failure building the request, -1 is returned, else 0.
1116 * *msg_out must be freed if the return value is 0.
/* NOTE(review): the error-checks on the two trunnel calls, the
 * assignment of *msg_out, and the final return were lost in extraction. */
1119 congestion_control_build_ext_request(uint8_t **msg_out
, size_t *msg_len_out
)
1121 uint8_t *request
= NULL
;
1122 trn_extension_t
*ext
= NULL
;
1123 trn_extension_field_t
*field
= NULL
;
1125 ext
= trn_extension_new();
1127 /* With congestion control enabled, add the request, else it is an empty
1128 * request in the payload. */
1130 if (congestion_control_enabled()) {
1131 /* Build the extension field that will hold the CC field. */
1132 field
= trn_extension_field_new();
1133 trn_extension_field_set_field_type(field
,
1134 TRUNNEL_EXT_TYPE_CC_FIELD_REQUEST
);
1136 /* No payload indicating a request to use congestion control. */
1137 trn_extension_field_set_field_len(field
, 0);
1139 /* Build final extension. */
1140 trn_extension_add_fields(ext
, field
);
1141 trn_extension_set_num(ext
, 1);
1144 /* Encode extension. */
1145 ssize_t ret
= trn_extension_encoded_len(ext
);
1149 size_t request_len
= ret
;
1150 request
= tor_malloc_zero(request_len
);
1151 ret
= trn_extension_encode(request
, request_len
, ext
);
1157 *msg_len_out
= request_len
;
1159 /* Free everything, we've encoded the request now. */
1163 trn_extension_free(ext
);
1168 * Parse a congestion control ntorv3 request payload for extensions.
1170 * On parsing failure, -1 is returned.
1172 * If congestion control request is present, return 1. If it is not present,
1175 * WARNING: Called from CPU worker! Must not access any global state.
1178 congestion_control_parse_ext_request(const uint8_t *msg
, const size_t msg_len
)
1181 trn_extension_t
*ext
= NULL
;
1182 size_t num_fields
= 0;
1184 /* Parse extension from payload. */
1185 ret
= trn_extension_parse(&ext
, msg
, msg_len
);
1190 /* No extension implies no support for congestion control. In this case, we
1191 * simply return 0 to indicate CC is disabled. */
1192 if ((num_fields
= trn_extension_get_num(ext
)) == 0) {
1197 /* Go over all fields. If any field is TRUNNEL_EXT_TYPE_CC_FIELD_REQUEST,
1198 * then congestion control is enabled. Ignore unknown fields. */
1199 for (size_t f
= 0; f
< num_fields
; f
++) {
1200 const trn_extension_field_t
*field
= trn_extension_get_fields(ext
, f
);
1201 if (field
== NULL
) {
1206 /* For congestion control to be enabled, we only need the field type. */
1207 if (trn_extension_field_get_field_type(field
) ==
1208 TRUNNEL_EXT_TYPE_CC_FIELD_REQUEST
) {
1215 trn_extension_free(ext
);
1220 * Given our observed parameters for circuits and congestion control,
1221 * as well as the parameters for the resulting circuit, build a response
1222 * payload using extension fields into *msg_out, with length specified in
1225 * If congestion control will be enabled, the extension field for
1226 * TRUNNEL_EXT_TYPE_CC_FIELD_RESPONSE will contain the sendme_inc value.
1228 * If congestion control won't be enabled, an extension payload with 0
1229 * fields will be created.
1231 * Return 0 if an extension payload was created in *msg_out, and -1 on
1234 * *msg_out must be freed if the return value is 0.
1236 * WARNING: Called from CPU worker! Must not access any global state.
1239 congestion_control_build_ext_response(const circuit_params_t
*our_params
,
1240 const circuit_params_t
*circ_params
,
1241 uint8_t **msg_out
, size_t *msg_len_out
)
1244 uint8_t *request
= NULL
;
1245 trn_extension_t
*ext
= NULL
;
1246 trn_extension_field_t
*field
= NULL
;
1247 trn_extension_field_cc_t
*cc_field
= NULL
;
1249 tor_assert(our_params
);
1250 tor_assert(circ_params
);
1251 tor_assert(msg_out
);
1252 tor_assert(msg_len_out
);
1254 ext
= trn_extension_new();
1256 if (circ_params
->cc_enabled
) {
1257 /* Build the extension field that will hold the CC field. */
1258 field
= trn_extension_field_new();
1259 trn_extension_field_set_field_type(field
,
1260 TRUNNEL_EXT_TYPE_CC_FIELD_RESPONSE
);
1262 /* Build the congestion control field response. */
1263 cc_field
= trn_extension_field_cc_new();
1264 trn_extension_field_cc_set_sendme_inc(cc_field
,
1265 our_params
->sendme_inc_cells
);
1267 ret
= trn_extension_field_cc_encoded_len(cc_field
);
1268 if (BUG(ret
<= 0)) {
1271 size_t field_len
= ret
;
1272 trn_extension_field_set_field_len(field
, field_len
);
1273 trn_extension_field_setlen_field(field
, field_len
);
1275 uint8_t *field_array
= trn_extension_field_getarray_field(field
);
1276 ret
= trn_extension_field_cc_encode(field_array
,
1277 trn_extension_field_getlen_field(field
), cc_field
);
1278 if (BUG(ret
<= 0)) {
1282 /* Build final extension. */
1283 trn_extension_add_fields(ext
, field
);
1284 trn_extension_set_num(ext
, 1);
1287 /* Encode extension. */
1288 ret
= trn_extension_encoded_len(ext
);
1292 size_t request_len
= ret
;
1293 request
= tor_malloc_zero(request_len
);
1294 ret
= trn_extension_encode(request
, request_len
, ext
);
1300 *msg_len_out
= request_len
;
1302 /* We've just encoded the extension, clean everything. */
1307 trn_extension_free(ext
);
1309 trn_extension_field_free(field
);
1311 trn_extension_field_cc_free(cc_field
);
/** Return true iff the given sendme increment is within the acceptable
 * margins of our current consensus parameter. */
bool
congestion_control_validate_sendme_increment(uint8_t sendme_inc)
{
  /* We will only accept this response (and this circuit) if sendme_inc
   * is within a factor of 2 of our consensus value. We should not need
   * to change cc_sendme_inc much, and if we do, we can spread out those
   * changes over smaller increments once every 4 hours. Exits that
   * violate this range should just not be used. */
#define MAX_SENDME_INC_NEGOTIATE_FACTOR 2

  /* Zero can never be a valid increment. */
  if (sendme_inc == 0)
    return false;

  const unsigned consensus_inc = congestion_control_sendme_inc();

  /* Too large: more than FACTOR times the consensus value. */
  if (sendme_inc > MAX_SENDME_INC_NEGOTIATE_FACTOR * consensus_inc)
    return false;

  /* Too small: less than the consensus value divided by FACTOR. */
  if (sendme_inc < consensus_inc / MAX_SENDME_INC_NEGOTIATE_FACTOR)
    return false;

  return true;
}
1339 /** Return 1 if CC is enabled which also will set the SENDME increment into our
1340 * params_out. Return 0 if CC is disabled. Else, return -1 on error. */
1342 congestion_control_parse_ext_response(const uint8_t *msg
,
1343 const size_t msg_len
,
1344 circuit_params_t
*params_out
)
1347 size_t num_fields
= 0;
1348 trn_extension_t
*ext
= NULL
;
1349 trn_extension_field_cc_t
*cc_field
= NULL
;
1351 /* We will only accept this response (and this circuit) if sendme_inc
1352 * is within a factor of 2 of our consensus value. We should not need
1353 * to change cc_sendme_inc much, and if we do, we can spread out those
1354 * changes over smaller increments once every 4 hours. Exits that
1355 * violate this range should just not be used. */
1356 #define MAX_SENDME_INC_NEGOTIATE_FACTOR 2
1358 /* Parse extension from payload. */
1359 ret
= trn_extension_parse(&ext
, msg
, msg_len
);
1364 if ((num_fields
= trn_extension_get_num(ext
)) == 0) {
1369 /* Go over all fields. If any field is TRUNNEL_EXT_TYPE_CC_FIELD_RESPONSE,
1370 * then congestion control is enabled. Ignore unknown fields. */
1371 for (size_t f
= 0; f
< num_fields
; f
++) {
1372 const trn_extension_field_t
*field
= trn_extension_get_fields(ext
, f
);
1373 if (field
== NULL
) {
1378 /* Only examine TRUNNEL_EXT_TYPE_CC_FIELD_RESPONSE; ignore other fields */
1379 if (trn_extension_field_get_field_type(field
) ==
1380 TRUNNEL_EXT_TYPE_CC_FIELD_RESPONSE
) {
1382 /* Parse the field into the congestion control field. */
1383 ret
= trn_extension_field_cc_parse(&cc_field
,
1384 trn_extension_field_getconstarray_field(field
),
1385 trn_extension_field_getlen_field(field
));
1390 uint8_t sendme_inc_cells
=
1391 trn_extension_field_cc_get_sendme_inc(cc_field
);
1392 if (!congestion_control_validate_sendme_increment(sendme_inc_cells
)) {
1397 /* All good. Get value and break */
1398 params_out
->sendme_inc_cells
= sendme_inc_cells
;
1405 trn_extension_free(ext
);
1406 trn_extension_field_cc_free(cc_field
);