/* Copyright (c) 2003-2004, Roger Dingledine.
 * Copyright (c) 2004-2006, Roger Dingledine, Nick Mathewson.
 * Copyright (c) 2007-2013, The Tor Project, Inc. */
/* See LICENSE for licensing information */

/**
 * \file cpuworker.c
 * \brief Implements a farm of 'CPU worker' processes to perform
 * CPU-intensive tasks in another thread or process, so as not to
 * interrupt the main thread.
 *
 * Right now, we only use this for processing onionskins.
 **/
#include "or.h"
#include "buffers.h"
#include "channel.h"
#include "channeltls.h"
#include "circuitbuild.h"
#include "circuitlist.h"
#include "config.h"
#include "connection.h"
#include "connection_or.h"
#include "cpuworker.h"
#include "main.h"
#include "onion.h"
#include "rephist.h"
#include "router.h"

/** The maximum number of cpuworker processes we will keep around. */
#define MAX_CPUWORKERS 16
/** The minimum number of cpuworker processes we will keep around. */
#define MIN_CPUWORKERS 1

/** The tag specifies which circuit this onionskin was from. */
#define TAG_LEN 12
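/* The tag layout is 8 bytes of channel global_identifier followed by
 * 4 bytes of circuit ID; see tag_pack() and tag_unpack() below. */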

/** How many cpuworkers we have running right now. */
static int num_cpuworkers=0;
/** How many of the running cpuworkers have an assigned task right now. */
static int num_cpuworkers_busy=0;
/** We need to spawn new cpuworkers whenever we rotate the onion keys
 * on platforms where execution contexts==processes. This variable stores
 * the last time we got a key rotation event. */
static time_t last_rotation_time=0;

static void cpuworker_main(void *data) ATTR_NORETURN;
static int spawn_cpuworker(void);
static void spawn_enough_cpuworkers(void);
static void process_pending_task(connection_t *cpuworker);

/** Initialize the cpuworker subsystem.
 */
void
cpu_init(void)
{
  cpuworkers_rotate();
}

/** Called when we're done sending a request to a cpuworker. */
int
connection_cpu_finished_flushing(connection_t *conn)
{
  tor_assert(conn);
  tor_assert(conn->type == CONN_TYPE_CPUWORKER);
  return 0;
}

/** Pack <b>chan_id</b> and <b>circ_id</b>; set *tag to the result. (See note
 * on cpuworker_main for wire format.) */
static void
tag_pack(uint8_t *tag, uint64_t chan_id, circid_t circ_id)
{
  /*XXXX RETHINK THIS WHOLE MESS !!!! !NM NM NM NM*/
  /*XXXX DOUBLEPLUSTHIS!!!! AS AS AS AS*/
  set_uint64(tag, chan_id);
  set_uint32(tag+8, circ_id);
}

/** Unpack <b>tag</b> into <b>chan_id</b> and <b>circ_id</b>.
 */
static void
tag_unpack(const uint8_t *tag, uint64_t *chan_id, circid_t *circ_id)
{
  *chan_id = get_uint64(tag);
  *circ_id = get_uint32(tag+8);
}

/** Magic numbers to make sure our cpuworker_requests don't grow any
 * mis-framing bugs. */
#define CPUWORKER_REQUEST_MAGIC 0xda4afeed
#define CPUWORKER_REPLY_MAGIC 0x5eedf00d

/** A request sent to a cpuworker. */
typedef struct cpuworker_request_t {
  /** Magic number; must be CPUWORKER_REQUEST_MAGIC. */
  uint32_t magic;
  /** Opaque tag to identify the job */
  uint8_t tag[TAG_LEN];
  /** Task code. Must be one of CPUWORKER_TASK_* */
  uint8_t task;

  /** Flag: Are we timing this request? */
  unsigned timed : 1;
  /** If we're timing this request, when was it sent to the cpuworker? */
  struct timeval started_at;

  /** A create cell for the cpuworker to process. */
  create_cell_t create_cell;

  /* Turn the above into a tagged union if needed. */
} cpuworker_request_t;

/** A reply sent by a cpuworker. */
typedef struct cpuworker_reply_t {
  /** Magic number; must be CPUWORKER_REPLY_MAGIC. */
  uint32_t magic;
  /** Opaque tag to identify the job; matches the request's tag. */
  uint8_t tag[TAG_LEN];
  /** True iff we got a successful request. */
  uint8_t success;

  /** Are we timing this request? */
  unsigned int timed : 1;
  /** What handshake type was the request? (Used for timing.) */
  uint16_t handshake_type;
  /** When did we send the request to the cpuworker? */
  struct timeval started_at;
  /** Once the cpuworker received the request, how many microseconds did it
   * take? (This shouldn't overflow; 4 billion microseconds is over an hour,
   * and we'll never have an onion handshake that takes so long.) */
  uint32_t n_usec;

  /** Output of processing a create cell
   *
   * @{
   */
  /** The created cell to send back. */
  created_cell_t created_cell;
  /** The keys to use on this circuit. */
  uint8_t keys[CPATH_KEY_MATERIAL_LEN];
  /** Input to use for authenticating introduce1 cells. */
  uint8_t rend_auth_material[DIGEST_LEN];
  /**@}*/
} cpuworker_reply_t;
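
/* Requests and replies travel over the worker socketpair as raw struct
 * bytes: the main thread writes a cpuworker_request_t, and the worker
 * answers with a cpuworker_reply_t (see cpuworker_main()). Both ends run
 * the same binary, so struct layout and byte order always agree; the magic
 * fields guard against framing bugs, not hostile peers. */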

/** Called when the onion key has changed and we need to spawn new
 * cpuworkers. Close all currently idle cpuworkers, and mark the last
 * rotation time as now.
 */
void
cpuworkers_rotate(void)
{
  connection_t *cpuworker;
  while ((cpuworker = connection_get_by_type_state(CONN_TYPE_CPUWORKER,
                                                   CPUWORKER_STATE_IDLE))) {
    connection_mark_for_close(cpuworker);
    --num_cpuworkers;
  }
  last_rotation_time = time(NULL);
  if (server_mode(get_options()))
    spawn_enough_cpuworkers();
}

/** If the cpuworker closes the connection,
 * mark it as closed and spawn a new one as needed. */
int
connection_cpu_reached_eof(connection_t *conn)
{
  log_warn(LD_GENERAL,"Read eof. CPU worker died unexpectedly.");
  if (conn->state != CPUWORKER_STATE_IDLE) {
    /* the circ associated with this cpuworker will have to wait until
     * it gets culled in run_connection_housekeeping(), since we have
     * no way to find out which circ it was. */
    log_warn(LD_GENERAL,"...and it left a circuit queued; abandoning circ.");
    num_cpuworkers_busy--;
  }
  num_cpuworkers--;
  spawn_enough_cpuworkers(); /* try to regrow. hope we don't end up
                                spinning. */
  connection_mark_for_close(conn);
  return 0;
}

/** Indexed by handshake type: how many onionskins have we processed and
 * counted of that type? */
static uint64_t onionskins_n_processed[MAX_ONION_HANDSHAKE_TYPE+1];
/** Indexed by handshake type, corresponding to the onionskins counted in
 * onionskins_n_processed: how many microseconds have we spent in cpuworkers
 * processing that kind of onionskin? */
static uint64_t onionskins_usec_internal[MAX_ONION_HANDSHAKE_TYPE+1];
/** Indexed by handshake type, corresponding to onionskins counted in
 * onionskins_n_processed: how many microseconds have we spent waiting for
 * cpuworkers to give us answers for that kind of onionskin?
 */
static uint64_t onionskins_usec_roundtrip[MAX_ONION_HANDSHAKE_TYPE+1];

/** If any onionskin takes longer than this, we clip them to this
 * time. (microseconds) */
#define MAX_BELIEVABLE_ONIONSKIN_DELAY (2*1000*1000)
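/* (That is, two seconds. The worker clips its own computation time to this
 * value; the main thread discards any roundtrip measurement that reaches
 * it, so a wedged or swapped-out worker can't skew the stats.) */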

static tor_weak_rng_t request_sample_rng = TOR_WEAK_RNG_INIT;

/** Return true iff we'd like to measure a handshake of type
 * <b>onionskin_type</b>. Call only from the main thread. */
static int
should_time_request(uint16_t onionskin_type)
{
  /* If we've never heard of this type, we shouldn't even be here. */
  if (onionskin_type > MAX_ONION_HANDSHAKE_TYPE)
    return 0;
  /* Measure the first N handshakes of each type, to ensure we have a
   * sample */
  if (onionskins_n_processed[onionskin_type] < 4096)
    return 1;
  /* Otherwise, measure with P=1/128. We avoid doing this for every
   * handshake, since the measurement itself can take a little time. */
  return tor_weak_random_one_in_n(&request_sample_rng, 128);
}
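
/* Net effect: the first 4096 handshakes of each type are all timed; after
 * that, roughly 0.8% (1 in 128) of them are. */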

/** Return an estimate of how many microseconds we will need for a single
 * cpuworker to process <b>n_requests</b> onionskins of type
 * <b>onionskin_type</b>. */
uint64_t
estimated_usec_for_onionskins(uint32_t n_requests, uint16_t onionskin_type)
{
  if (onionskin_type > MAX_ONION_HANDSHAKE_TYPE) /* should be impossible */
    return 1000 * (uint64_t)n_requests;
  if (PREDICT_UNLIKELY(onionskins_n_processed[onionskin_type] < 100)) {
    /* Until we have 100 data points, just assume everything takes 1 msec. */
    return 1000 * (uint64_t)n_requests;
  } else {
    /* This can't overflow: we'll never have more than 500000 onionskins
     * measured in onionskin_usec_internal, and they won't take anything near
     * 1 sec each, and we won't have anything like 1 million queued
     * onionskins. But that's 5e5 * 1e6 * 1e6, which is still less than
     * UINT64_MAX. */
    return (onionskins_usec_internal[onionskin_type] * n_requests) /
      onionskins_n_processed[onionskin_type];
  }
}
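
/* Worked example (hypothetical numbers): if 300000 onionskins of some type
 * have consumed 240000000 usec of worker time (800 usec each), the estimate
 * for 50 queued requests is (240000000 * 50) / 300000 = 40000 usec. */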

/** Compute the absolute and relative overhead of using the cpuworker
 * framework for onionskins of type <b>onionskin_type</b>.*/
static int
get_overhead_for_onionskins(uint32_t *usec_out, double *frac_out,
                            uint16_t onionskin_type)
{
  uint64_t overhead;

  *usec_out = 0;
  *frac_out = 0.0;

  if (onionskin_type > MAX_ONION_HANDSHAKE_TYPE) /* should be impossible */
    return -1;
  if (onionskins_n_processed[onionskin_type] == 0 ||
      onionskins_usec_internal[onionskin_type] == 0 ||
      onionskins_usec_roundtrip[onionskin_type] == 0)
    return -1;

  overhead = onionskins_usec_roundtrip[onionskin_type] -
    onionskins_usec_internal[onionskin_type];

  *usec_out = (uint32_t)(overhead / onionskins_n_processed[onionskin_type]);
  *frac_out = U64_TO_DBL(overhead) / onionskins_usec_internal[onionskin_type];

  return 0;
}
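
/* "Overhead" here is queueing plus IPC time: total roundtrip microseconds
 * minus the microseconds the workers spent computing. Hypothetically, 120
 * sec of roundtrip against 100 sec internal over 100000 onionskins would
 * yield *usec_out == 200 and *frac_out == 0.20. */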

/** If we've measured overhead for onionskins of type <b>onionskin_type</b>,
 * log it. */
void
cpuworker_log_onionskin_overhead(int severity, int onionskin_type,
                                 const char *onionskin_type_name)
{
  uint32_t overhead;
  double relative_overhead;
  int r;

  r = get_overhead_for_onionskins(&overhead, &relative_overhead,
                                  onionskin_type);
  if (!overhead || r<0)
    return;

  log_fn(severity, LD_OR,
         "%s onionskins have averaged %u usec overhead (%.2f%%) in "
         "cpuworker code",
         onionskin_type_name, (unsigned)overhead, relative_overhead*100);
}

/** Called when we get data from a cpuworker. If the answer is not complete,
 * wait for a complete answer. If the answer is complete,
 * process it as appropriate.
 */
int
connection_cpu_process_inbuf(connection_t *conn)
{
  uint64_t chan_id;
  circid_t circ_id;
  channel_t *p_chan = NULL;
  circuit_t *circ;

  tor_assert(conn);
  tor_assert(conn->type == CONN_TYPE_CPUWORKER);

  if (!connection_get_inbuf_len(conn))
    return 0;

  if (conn->state == CPUWORKER_STATE_BUSY_ONION) {
    cpuworker_reply_t rpl;
    if (connection_get_inbuf_len(conn) < sizeof(cpuworker_reply_t))
      return 0; /* not yet */
    tor_assert(connection_get_inbuf_len(conn) == sizeof(cpuworker_reply_t));

    connection_fetch_from_buf((void*)&rpl,sizeof(cpuworker_reply_t),conn);

    tor_assert(rpl.magic == CPUWORKER_REPLY_MAGIC);

    if (rpl.timed && rpl.success &&
        rpl.handshake_type <= MAX_ONION_HANDSHAKE_TYPE) {
      /* Time how long this request took. The handshake_type check should be
         needless, but let's leave it in to be safe. */
      struct timeval tv_end, tv_diff;
      int64_t usec_roundtrip;
      tor_gettimeofday(&tv_end);
      timersub(&tv_end, &rpl.started_at, &tv_diff);
      usec_roundtrip = ((int64_t)tv_diff.tv_sec)*1000000 + tv_diff.tv_usec;
      if (usec_roundtrip >= 0 &&
          usec_roundtrip < MAX_BELIEVABLE_ONIONSKIN_DELAY) {
        ++onionskins_n_processed[rpl.handshake_type];
        onionskins_usec_internal[rpl.handshake_type] += rpl.n_usec;
        onionskins_usec_roundtrip[rpl.handshake_type] += usec_roundtrip;
        if (onionskins_n_processed[rpl.handshake_type] >= 500000) {
          /* Scale down every 500000 handshakes. On a busy server, that's
           * less impressive than it sounds. */
          onionskins_n_processed[rpl.handshake_type] /= 2;
          onionskins_usec_internal[rpl.handshake_type] /= 2;
          onionskins_usec_roundtrip[rpl.handshake_type] /= 2;
        }
      }
    }
    /* parse out the circ it was talking about */
    tag_unpack(rpl.tag, &chan_id, &circ_id);
    circ = NULL;
    log_debug(LD_OR,
              "Unpacking cpuworker reply, chan_id is " U64_FORMAT
              ", circ_id is %u",
              U64_PRINTF_ARG(chan_id), (unsigned)circ_id);
    p_chan = channel_find_by_global_id(chan_id);

    if (p_chan)
      circ = circuit_get_by_circid_channel(circ_id, p_chan);

    if (rpl.success == 0) {
      log_debug(LD_OR,
                "decoding onionskin failed. "
                "(Old key or bad software.) Closing.");
      if (circ)
        circuit_mark_for_close(circ, END_CIRC_REASON_TORPROTOCOL);
      goto done_processing;
    }
    if (!circ) {
      /* This happens because somebody sends us a destroy cell and the
       * circuit goes away, while the cpuworker is working. This is also
       * why our tag doesn't include a pointer to the circ, because we'd
       * never know if it's still valid.
       */
      log_debug(LD_OR,"processed onion for a circ that's gone. Dropping.");
      goto done_processing;
    }
    tor_assert(! CIRCUIT_IS_ORIGIN(circ));
    if (onionskin_answer(TO_OR_CIRCUIT(circ),
                         &rpl.created_cell,
                         (const char*)rpl.keys,
                         rpl.rend_auth_material) < 0) {
      log_warn(LD_OR,"onionskin_answer failed. Closing.");
      circuit_mark_for_close(circ, END_CIRC_REASON_INTERNAL);
      goto done_processing;
    }
    log_debug(LD_OR,"onionskin_answer succeeded. Yay.");
  } else {
    tor_assert(0); /* don't ask me to do handshakes yet */
  }

 done_processing:
  conn->state = CPUWORKER_STATE_IDLE;
  num_cpuworkers_busy--;
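
  /* If this worker was spawned before the last onion-key rotation, it is
   * still answering with the old keys: retire it now that it's idle, and
   * let spawn_enough_cpuworkers() start a replacement. */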
  if (conn->timestamp_created < last_rotation_time) {
    connection_mark_for_close(conn);
    num_cpuworkers--;
    spawn_enough_cpuworkers();
  } else {
    process_pending_task(conn);
  }
  return 0;
}

/** Implement a cpuworker. 'data' is an fdarray as returned by socketpair.
 * Reads and writes from fdarray[1]. Reads requests, writes answers.
 *
 *   Request format:
 *          cpuworker_request_t.
 *   Response format:
 *          cpuworker_reply_t
 */
static void
cpuworker_main(void *data)
{
  /* For talking to the parent thread/process */
  tor_socket_t *fdarray = data;
  tor_socket_t fd;

  /* variables for onion processing */
  server_onion_keys_t onion_keys;
  cpuworker_request_t req;
  cpuworker_reply_t rpl;

  fd = fdarray[1]; /* this side is ours */
#ifndef TOR_IS_MULTITHREADED
  tor_close_socket(fdarray[0]); /* this is the side of the socketpair the
                                 * parent uses */
  tor_free_all(1); /* so the child doesn't hold the parent's fd's open */
  handle_signals(0); /* ignore interrupts from the keyboard, etc */
#endif
  tor_free(data);

  setup_server_onion_keys(&onion_keys);
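
  /* Service loop: block until a whole fixed-size request arrives, handle
   * it, and write back one fixed-size reply. Exit on
   * CPUWORKER_TASK_SHUTDOWN or on any read/write failure. */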
  for (;;) {
    if (read_all(fd, (void *)&req, sizeof(req), 1) != sizeof(req)) {
      log_info(LD_OR, "read request failed. Exiting.");
      goto end;
    }
    tor_assert(req.magic == CPUWORKER_REQUEST_MAGIC);

    memset(&rpl, 0, sizeof(rpl));

    if (req.task == CPUWORKER_TASK_ONION) {
      const create_cell_t *cc = &req.create_cell;
      created_cell_t *cell_out = &rpl.created_cell;
      struct timeval tv_start = {0,0}, tv_end;
      int n;
      rpl.timed = req.timed;
      rpl.started_at = req.started_at;
      rpl.handshake_type = cc->handshake_type;
      if (req.timed)
        tor_gettimeofday(&tv_start);
      n = onion_skin_server_handshake(cc->handshake_type,
                                      cc->onionskin, cc->handshake_len,
                                      &onion_keys,
                                      cell_out->reply,
                                      rpl.keys, CPATH_KEY_MATERIAL_LEN,
                                      rpl.rend_auth_material);
      if (n < 0) {
        /* failure */
        log_debug(LD_OR,"onion_skin_server_handshake failed.");
        memset(&rpl, 0, sizeof(rpl));
        memcpy(rpl.tag, req.tag, TAG_LEN);
        rpl.success = 0;
      } else {
        /* success */
        log_debug(LD_OR,"onion_skin_server_handshake succeeded.");
        memcpy(rpl.tag, req.tag, TAG_LEN);
        cell_out->handshake_len = n;
        switch (cc->cell_type) {
        case CELL_CREATE:
          cell_out->cell_type = CELL_CREATED; break;
        case CELL_CREATE2:
          cell_out->cell_type = CELL_CREATED2; break;
        case CELL_CREATE_FAST:
          cell_out->cell_type = CELL_CREATED_FAST; break;
        default:
          tor_assert(0);
          goto end;
        }
        rpl.success = 1;
      }
      rpl.magic = CPUWORKER_REPLY_MAGIC;
      if (req.timed) {
        struct timeval tv_diff;
        int64_t usec;
        tor_gettimeofday(&tv_end);
        timersub(&tv_end, &tv_start, &tv_diff);
        usec = ((int64_t)tv_diff.tv_sec)*1000000 + tv_diff.tv_usec;
        if (usec < 0 || usec > MAX_BELIEVABLE_ONIONSKIN_DELAY)
          rpl.n_usec = MAX_BELIEVABLE_ONIONSKIN_DELAY;
        else
          rpl.n_usec = (uint32_t) usec;
      }
      if (write_all(fd, (void*)&rpl, sizeof(rpl), 1) != sizeof(rpl)) {
        log_err(LD_BUG,"writing response buf failed. Exiting.");
        goto end;
      }
      log_debug(LD_OR,"finished writing response.");
    } else if (req.task == CPUWORKER_TASK_SHUTDOWN) {
      log_info(LD_OR,"Clean shutdown: exiting");
      goto end;
    }
    memwipe(&req, 0, sizeof(req));
    memwipe(&rpl, 0, sizeof(rpl));
  }
 end:
  memwipe(&req, 0, sizeof(req));
  memwipe(&rpl, 0, sizeof(rpl));
  release_server_onion_keys(&onion_keys);
  tor_close_socket(fd);
  crypto_thread_cleanup();
  spawn_exit();
}

/** Launch a new cpuworker. Return 0 if we're happy, -1 if we failed.
 */
static int
spawn_cpuworker(void)
{
  tor_socket_t *fdarray;
  tor_socket_t fd;
  connection_t *conn;
  int err;

  fdarray = tor_malloc(sizeof(tor_socket_t)*2);
  if ((err = tor_socketpair(AF_UNIX, SOCK_STREAM, 0, fdarray)) < 0) {
    log_warn(LD_NET, "Couldn't construct socketpair for cpuworker: %s",
             tor_socket_strerror(-err));
    tor_free(fdarray);
    return -1;
  }

  tor_assert(SOCKET_OK(fdarray[0]));
  tor_assert(SOCKET_OK(fdarray[1]));
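
  /* The parent keeps fdarray[0]; cpuworker_main() reads and writes on
   * fdarray[1]. In process-based builds, each side closes the end it
   * doesn't own. */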
  fd = fdarray[0];
  if (spawn_func(cpuworker_main, (void*)fdarray) < 0) {
    tor_close_socket(fdarray[0]);
    tor_close_socket(fdarray[1]);
    tor_free(fdarray);
    return -1;
  }
  log_debug(LD_OR,"just spawned a cpu worker.");
#ifndef TOR_IS_MULTITHREADED
  tor_close_socket(fdarray[1]); /* don't need the worker's side of the pipe */
  tor_free(fdarray);
#endif

  conn = connection_new(CONN_TYPE_CPUWORKER, AF_UNIX);

  /* set up conn so it's got all the data we need to remember */
  conn->s = fd;
  conn->address = tor_strdup("localhost");
  tor_addr_make_unspec(&conn->addr);

  if (set_socket_nonblocking(fd) == -1) {
    connection_free(conn); /* this closes fd */
    return -1;
  }

  if (connection_add(conn) < 0) { /* no space, forget it */
    log_warn(LD_NET,"connection_add for cpuworker failed. Giving up.");
    connection_free(conn); /* this closes fd */
    return -1;
  }

  conn->state = CPUWORKER_STATE_IDLE;
  connection_start_reading(conn);

  return 0; /* success */
}

/** If we have too few or too many active cpuworkers, try to spawn new ones
 * or kill idle ones.
 */
static void
spawn_enough_cpuworkers(void)
{
  int num_cpuworkers_needed = get_num_cpus(get_options());
  int reseed = 0;

  if (num_cpuworkers_needed < MIN_CPUWORKERS)
    num_cpuworkers_needed = MIN_CPUWORKERS;
  if (num_cpuworkers_needed > MAX_CPUWORKERS)
    num_cpuworkers_needed = MAX_CPUWORKERS;

  while (num_cpuworkers < num_cpuworkers_needed) {
    if (spawn_cpuworker() < 0) {
      log_warn(LD_GENERAL,"Cpuworker spawn failed. Will try again later.");
      return;
    }
    num_cpuworkers++;
    reseed++;
  }

  if (reseed)
    crypto_seed_weak_rng(&request_sample_rng);
}
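
/* request_sample_rng is (re)seeded whenever new workers were spawned. It
 * only drives the 1-in-128 sampling in should_time_request(), so a weak,
 * non-cryptographic RNG is sufficient. */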

/** Take a pending task from the queue and assign it to 'cpuworker'. */
static void
process_pending_task(connection_t *cpuworker)
{
  or_circuit_t *circ;
  create_cell_t *onionskin = NULL;

  tor_assert(cpuworker);

  /* for now only process onion tasks */

  circ = onion_next_task(&onionskin);
  if (!circ)
    return;
  if (assign_onionskin_to_cpuworker(cpuworker, circ, onionskin))
    log_warn(LD_OR,"assign_to_cpuworker failed. Ignoring.");
}

/** How long should we let a cpuworker stay busy before we give
 * up on it and decide that we have a bug or infinite loop?
 * This value is high because some servers with low memory/cpu
 * sometimes spend an hour or more swapping, and Tor starves. */
#define CPUWORKER_BUSY_TIMEOUT (60*60*12)

/** We have a bug that I can't find. Sometimes, very rarely, cpuworkers get
 * stuck in the 'busy' state, even though the cpuworker process thinks of
 * itself as idle. I don't know why. But here's a workaround to kill any
 * cpuworker that's been busy for more than CPUWORKER_BUSY_TIMEOUT.
 */
static void
cull_wedged_cpuworkers(void)
{
  time_t now = time(NULL);
  smartlist_t *conns = get_connection_array();
  SMARTLIST_FOREACH_BEGIN(conns, connection_t *, conn) {
    if (!conn->marked_for_close &&
        conn->type == CONN_TYPE_CPUWORKER &&
        conn->state == CPUWORKER_STATE_BUSY_ONION &&
        conn->timestamp_lastwritten + CPUWORKER_BUSY_TIMEOUT < now) {
      log_notice(LD_BUG,
                 "closing wedged cpuworker. Can somebody find the bug?");
      num_cpuworkers_busy--;
      num_cpuworkers--;
      connection_mark_for_close(conn);
    }
  } SMARTLIST_FOREACH_END(conn);
}
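
/* Two paths lead into assign_onionskin_to_cpuworker(): fresh onionskins
 * typically arrive with cpuworker==NULL, while process_pending_task()
 * re-dispatches queued work by passing in the worker that just went idle. */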

/** Try to tell a cpuworker to perform the public key operations necessary to
 * respond to <b>onionskin</b> for the circuit <b>circ</b>.
 *
 * If <b>cpuworker</b> is defined, assert that he's idle, and use him. Else,
 * look for an idle cpuworker and use him. If none idle, queue task onto the
 * pending onion list and return. Return 0 if we successfully assign the
 * task, or -1 on failure.
 */
int
assign_onionskin_to_cpuworker(connection_t *cpuworker,
                              or_circuit_t *circ,
                              create_cell_t *onionskin)
{
  cpuworker_request_t req;
  time_t now = approx_time();
  static time_t last_culled_cpuworkers = 0;
  int should_time;

  /* Checking for wedged cpuworkers requires a linear search over all
   * connections, so let's do it only once a minute.
   */
#define CULL_CPUWORKERS_INTERVAL 60

  if (last_culled_cpuworkers + CULL_CPUWORKERS_INTERVAL <= now) {
    cull_wedged_cpuworkers();
    spawn_enough_cpuworkers();
    last_culled_cpuworkers = now;
  }

  if (num_cpuworkers_busy == num_cpuworkers) {
    log_debug(LD_OR,"No idle cpuworkers. Queuing.");
    if (onion_pending_add(circ, onionskin) < 0) {
      tor_free(onionskin);
      return -1;
    }
    return 0;
  }

  if (!cpuworker)
    cpuworker = connection_get_by_type_state(CONN_TYPE_CPUWORKER,
                                             CPUWORKER_STATE_IDLE);

  tor_assert(cpuworker);

  if (!circ->p_chan) {
    log_info(LD_OR,"circ->p_chan gone. Failing circ.");
    tor_free(onionskin);
    return -1;
  }

  if (connection_or_digest_is_known_relay(circ->p_chan->identity_digest))
    rep_hist_note_circuit_handshake_assigned(onionskin->handshake_type);

  should_time = should_time_request(onionskin->handshake_type);
  memset(&req, 0, sizeof(req));
  req.magic = CPUWORKER_REQUEST_MAGIC;
  tag_pack(req.tag, circ->p_chan->global_identifier,
           circ->p_circ_id);
  req.timed = should_time;

  cpuworker->state = CPUWORKER_STATE_BUSY_ONION;
  /* touch the lastwritten timestamp, since that's how we check to
   * see how long it's been since we asked the question, and sometimes
   * we check before the first call to connection_handle_write(). */
  cpuworker->timestamp_lastwritten = now;
  num_cpuworkers_busy++;

  req.task = CPUWORKER_TASK_ONION;
  memcpy(&req.create_cell, onionskin, sizeof(create_cell_t));

  tor_free(onionskin);

  if (should_time)
    tor_gettimeofday(&req.started_at);

  connection_write_to_buf((void*)&req, sizeof(req), cpuworker);
  memwipe(&req, 0, sizeof(req));

  return 0;
}