#!/usr/bin/env python

# Usage:
#
# Regenerate the list:
# scripts/maint/updateFallbackDirs.py > src/app/config/fallback_dirs.inc 2> fallback_dirs.log
#
# Check the existing list:
# scripts/maint/updateFallbackDirs.py check_existing > fallback_dirs.inc.ok 2> fallback_dirs.log
# mv fallback_dirs.inc.ok src/app/config/fallback_dirs.inc
#
# This script should be run from a stable, reliable network connection,
# with no other network activity (and not over tor).
# If this is not possible, please disable:
# PERFORM_IPV4_DIRPORT_CHECKS and PERFORM_IPV6_DIRPORT_CHECKS
#
# Needs dateutil, stem, and potentially other python packages.
# Optionally uses ipaddress (python 3 builtin) or py2-ipaddress (package)
# for netblock analysis.
#
# Then read the logs to make sure the fallbacks aren't dominated by a single
# netblock or port.
#
# Script by weasel, April 2015
# Portions by gsathya & karsten, 2013
# https://trac.torproject.org/projects/tor/attachment/ticket/8374/dir_list.2.py
# Modifications by teor, 2015
import StringIO
import string
import re
import datetime
import gzip
import os.path
import json
import math
import sys
import urllib
import urllib2
import hashlib
import dateutil.parser
# bson_lazy provides bson
#from bson import json_util
import copy

from stem.descriptor import DocumentHandler
from stem.descriptor.remote import get_consensus, get_server_descriptors, MAX_FINGERPRINTS

import logging
logging.root.name = ''
HAVE_IPADDRESS = False
try:
  # python 3 builtin, or install package py2-ipaddress
  # there are several ipaddress implementations for python 2
  # with slightly different semantics with str typed text
  # fortunately, all our IP addresses are in unicode
  import ipaddress
  HAVE_IPADDRESS = True
except ImportError:
  # if this happens, we avoid doing netblock analysis
  logging.warning('Unable to import ipaddress, please install py2-ipaddress.' +
                  ' A fallback list will be created, but optional netblock' +
                  ' analysis will not be performed.')
## Top-Level Configuration

# We use semantic versioning: https://semver.org
# In particular:
# * major changes include removing a mandatory field, or anything else that
#   would break an appropriately tolerant parser,
# * minor changes include adding a field,
# * patch changes include changing header comments or other unstructured
#   content
FALLBACK_FORMAT_VERSION = '2.0.0'
SECTION_SEPARATOR_BASE = '====='
SECTION_SEPARATOR_COMMENT = '/* ' + SECTION_SEPARATOR_BASE + ' */'

# Output all candidate fallbacks, or only output selected fallbacks?
OUTPUT_CANDIDATES = False

# Perform DirPort checks over IPv4?
# Change this to False if IPv4 doesn't work for you, or if you don't want to
# download a consensus for each fallback
# Don't check ~1000 candidates when OUTPUT_CANDIDATES is True
PERFORM_IPV4_DIRPORT_CHECKS = False if OUTPUT_CANDIDATES else True

# Perform DirPort checks over IPv6?
# If you know IPv6 works for you, set this to True
# This will exclude IPv6 relays without an IPv6 DirPort configured
# So it's best left at False until #18394 is implemented
# Don't check ~1000 candidates when OUTPUT_CANDIDATES is True
PERFORM_IPV6_DIRPORT_CHECKS = False if OUTPUT_CANDIDATES else False

# Must relays be running now?
MUST_BE_RUNNING_NOW = (PERFORM_IPV4_DIRPORT_CHECKS
                       or PERFORM_IPV6_DIRPORT_CHECKS)

# Clients have been using microdesc consensuses by default for a while now
DOWNLOAD_MICRODESC_CONSENSUS = True

# If a relay delivers an expired consensus, if it expired less than this many
# seconds ago, we still allow the relay. This should never be less than -90,
# as all directory mirrors should have downloaded a consensus 90 minutes
# before it expires. It should never be more than 24 hours, because clients
# reject consensuses that are older than REASONABLY_LIVE_TIME.
# For the consensus expiry check to be accurate, the machine running this
# script needs an accurate clock.
#
# Relays on 0.3.0 and later return a 404 when they are about to serve an
# expired consensus. This makes them fail the download check.
# We use a tolerance of 0, so that 0.2.x series relays also fail the download
# check if they serve an expired consensus.
CONSENSUS_EXPIRY_TOLERANCE = 0

# Output fallback name, flags, bandwidth, and ContactInfo in a C comment?
OUTPUT_COMMENTS = True if OUTPUT_CANDIDATES else False

# Output matching ContactInfo in fallbacks list?
# Useful if you're trying to contact operators
CONTACT_COUNT = True if OUTPUT_CANDIDATES else False

# How the list should be sorted:
# fingerprint: is useful for stable diffs of fallback lists
# measured_bandwidth: is useful when pruning the list based on bandwidth
# contact: is useful for contacting operators once the list has been pruned
OUTPUT_SORT_FIELD = 'contact' if OUTPUT_CANDIDATES else 'fingerprint'

## OnionOO Settings

ONIONOO = 'https://onionoo.torproject.org/'
#ONIONOO = 'https://onionoo.thecthulhu.com/'

# Don't bother going out to the Internet, just use the files available locally,
# even if they're very old
LOCAL_FILES_ONLY = False

## Whitelist / Blacklist Filter Settings

# The whitelist contains entries that are included if all attributes match
# (IPv4, dirport, orport, id, and optionally IPv6 and IPv6 orport)
#
# What happens to entries not in whitelist?
# When True, they are included, when False, they are excluded
INCLUDE_UNLISTED_ENTRIES = True if OUTPUT_CANDIDATES else False

WHITELIST_FILE_NAME = 'scripts/maint/fallback.whitelist'
FALLBACK_FILE_NAME = 'src/app/config/fallback_dirs.inc'

# The number of bytes we'll read from a filter file before giving up
MAX_LIST_FILE_SIZE = 1024 * 1024

## Eligibility Settings

# Require fallbacks to have the same address and port for a set amount of time
# We used to have this at 1 week, but that caused many fallback failures, which
# meant that we had to rebuild the list more often. We want fallbacks to be
# stable for 2 years, so we set it to a few months.
#
# If a relay changes address or port, that's it, it's not useful any more,
# because clients can't find it
ADDRESS_AND_PORT_STABLE_DAYS = 90
# We ignore relays that have been down for more than this period
MAX_DOWNTIME_DAYS = 0 if MUST_BE_RUNNING_NOW else 7
# FallbackDirs must have a time-weighted-fraction that is greater than or
# equal to:
# Mirrors that are down half the time are still useful half the time
CUTOFF_RUNNING = .50
CUTOFF_V2DIR = .50
# Guard flags are removed for some time after a relay restarts, so we ignore
# the guard flag.
CUTOFF_GUARD = .00
# FallbackDirs must have a time-weighted-fraction that is less than or equal
# to:
# .00 means no bad exits
PERMITTED_BADEXIT = .00

# older entries' weights are adjusted with ALPHA^(age in days)
AGE_ALPHA = 0.99

# this factor is used to scale OnionOO entries to [0,1]
ONIONOO_SCALE_ONE = 999.

## Fallback Count Limits

# The target for these parameters is 20% of the guards in the network
# This is around 200 as of October 2015
_FB_POG = 0.2
FALLBACK_PROPORTION_OF_GUARDS = None if OUTPUT_CANDIDATES else _FB_POG

# Limit the number of fallbacks (eliminating lowest by advertised bandwidth)
MAX_FALLBACK_COUNT = None if OUTPUT_CANDIDATES else 200
# Emit a C #error if the number of fallbacks is less than expected
MIN_FALLBACK_COUNT = 0 if OUTPUT_CANDIDATES else MAX_FALLBACK_COUNT*0.5

# The maximum number of fallbacks on the same address, contact, or family
#
# With 150 fallbacks, this means each operator sees 5% of client bootstraps.
# For comparison:
# - We try to limit guard and exit operators to 5% of the network
# - The directory authorities used to see 11% of client bootstraps each
#
# We also don't want too much of the list to go down if a single operator
# has to move all their relays.
MAX_FALLBACKS_PER_IP = 1
MAX_FALLBACKS_PER_IPV4 = MAX_FALLBACKS_PER_IP
MAX_FALLBACKS_PER_IPV6 = MAX_FALLBACKS_PER_IP
MAX_FALLBACKS_PER_CONTACT = 7
MAX_FALLBACKS_PER_FAMILY = 7

## Fallback Bandwidth Requirements

# Any fallback with the Exit flag has its bandwidth multiplied by this fraction
# to make sure we aren't further overloading exits
# (Set to 1.0, because we asked that only lightly loaded exits opt-in,
# and the extra load really isn't that much for large relays.)
EXIT_BANDWIDTH_FRACTION = 1.0

# If a single fallback's bandwidth is too low, it's pointless adding it
# We expect fallbacks to handle an extra 10 kilobytes per second of traffic
# Make sure they can support fifty times the expected extra load
#
# We convert this to a consensus weight before applying the filter,
# because all the bandwidth amounts are specified by the relay
MIN_BANDWIDTH = 50.0 * 10.0 * 1024.0
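# A quick sanity check on the arithmetic above (not used by the script):
# 10 KByte/s of expected extra load * 50 safety factor * 1024 bytes/KByte
# = 512000 bytes/s, so a relay must sustain roughly 500 KByte/s to be listed.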
# Clients will time out after 30 seconds trying to download a consensus
# So allow fallback directories half that to deliver a consensus
# The exact download times might change based on the network connection
# running this script, but only by a few seconds
# There is also about a second of python overhead
CONSENSUS_DOWNLOAD_SPEED_MAX = 15.0
# If the relay fails a consensus check, retry the download
# This avoids delisting a relay due to transient network conditions
CONSENSUS_DOWNLOAD_RETRY = True
## Parsing Functions

def parse_ts(t):
  return datetime.datetime.strptime(t, "%Y-%m-%d %H:%M:%S")
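# For example (illustrative, not executed by the script):
#   parse_ts("2015-03-30 06:00:00") == datetime.datetime(2015, 3, 30, 6, 0)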
def remove_bad_chars(raw_string, bad_char_list):
  # Remove each character in the bad_char_list
  cleansed_string = raw_string
  for c in bad_char_list:
    cleansed_string = cleansed_string.replace(c, '')
  return cleansed_string

def cleanse_unprintable(raw_string):
  # Remove all unprintable characters
  cleansed_string = ''
  for c in raw_string:
    if c in string.printable:
      cleansed_string += c
  return cleansed_string

def cleanse_whitespace(raw_string):
  # Replace all whitespace characters with a space
  cleansed_string = raw_string
  for c in string.whitespace:
    cleansed_string = cleansed_string.replace(c, ' ')
  return cleansed_string
def cleanse_c_multiline_comment(raw_string):
  cleansed_string = raw_string
  # Embedded newlines should be removed by tor/onionoo, but let's be paranoid
  cleansed_string = cleanse_whitespace(cleansed_string)
  # ContactInfo and Version can be arbitrary binary data
  cleansed_string = cleanse_unprintable(cleansed_string)
  # Prevent a malicious / unanticipated string from breaking out
  # of a C-style multiline comment
  # This removes '/*' and '*/' and '//'
  bad_char_list = '*/'
  # Prevent a malicious string from using C nulls
  bad_char_list += '\0'
  # Avoid confusing parsers by making sure there is only one comma per fallback
  bad_char_list += ','
  # Avoid confusing parsers by making sure there is only one equals per field
  bad_char_list += '='
  # Be safer by removing bad characters entirely
  cleansed_string = remove_bad_chars(cleansed_string, bad_char_list)
  # Some compilers may further process the content of comments
  # There isn't much we can do to cover every possible case
  # But comment-based directives are typically only advisory
  return cleansed_string
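# A sketch of the sanitisation above (illustrative input):
#   cleanse_c_multiline_comment('evil */ contact, x=1\0')
# strips the unprintable NUL and the '*', '/', ',' and '=' characters,
# leaving 'evil  contact x1'.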
def cleanse_c_string(raw_string):
  cleansed_string = raw_string
  # Embedded newlines should be removed by tor/onionoo, but let's be paranoid
  cleansed_string = cleanse_whitespace(cleansed_string)
  # ContactInfo and Version can be arbitrary binary data
  cleansed_string = cleanse_unprintable(cleansed_string)
  # Prevent a malicious address/fingerprint string from breaking out
  # of a C-style string
  bad_char_list = '"'
  # Prevent a malicious string from using escapes
  bad_char_list += '\\'
  # Prevent a malicious string from using C nulls
  bad_char_list += '\0'
  # Avoid confusing parsers by making sure there is only one comma per fallback
  bad_char_list += ','
  # Avoid confusing parsers by making sure there is only one equals per field
  bad_char_list += '='
  # Be safer by removing bad characters entirely
  cleansed_string = remove_bad_chars(cleansed_string, bad_char_list)
  # Some compilers may further process the content of strings
  # There isn't much we can do to cover every possible case
  # But this typically only results in changes to the string data
  return cleansed_string
## OnionOO Source Functions

# a dictionary of source metadata for each onionoo query we've made
fetch_source = {}

# register source metadata for 'what'
# assumes we only retrieve one document for each 'what'
def register_fetch_source(what, url, relays_published, version):
  fetch_source[what] = {}
  fetch_source[what]['url'] = url
  fetch_source[what]['relays_published'] = relays_published
  fetch_source[what]['version'] = version

# list each registered source's 'what'
def fetch_source_list():
  return sorted(fetch_source.keys())

# given 'what', provide a multiline C comment describing the source
def describe_fetch_source(what):
  desc = '/*'
  desc += '\n'
  desc += 'Onionoo Source: '
  desc += cleanse_c_multiline_comment(what)
  desc += ' Date: '
  desc += cleanse_c_multiline_comment(fetch_source[what]['relays_published'])
  desc += ' Version: '
  desc += cleanse_c_multiline_comment(fetch_source[what]['version'])
  desc += '\n'
  desc += 'URL: '
  desc += cleanse_c_multiline_comment(fetch_source[what]['url'])
  desc += '\n'
  desc += '*/'
  return desc
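# The resulting comment looks roughly like this (illustrative values; note
# that cleanse_c_multiline_comment strips '/' characters, so the URL loses
# its slashes):
# /*
# Onionoo Source: details Date: 2015-10-02 13:34:14 Version: 4.0
# URL: https:onionoo.torproject.orgdetails?type=relay...
# */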
## File Processing Functions

def write_to_file(str, file_name, max_len):
  try:
    with open(file_name, 'w') as f:
      f.write(str[0:max_len])
  except EnvironmentError, error:
    logging.error('Writing file %s failed: %d: %s'%
                  (file_name,
                   error.errno,
                   error.strerror))

def read_from_file(file_name, max_len):
  try:
    if os.path.isfile(file_name):
      with open(file_name, 'r') as f:
        return f.read(max_len)
  except EnvironmentError, error:
    logging.info('Loading file %s failed: %d: %s'%
                 (file_name,
                  error.errno,
                  error.strerror))
  return None
def parse_fallback_file(file_name):
  file_data = read_from_file(file_name, MAX_LIST_FILE_SIZE)
  file_data = cleanse_unprintable(file_data)
  file_data = remove_bad_chars(file_data, '\n"\0')
  file_data = re.sub('/\*.*?\*/', '', file_data)
  file_data = file_data.replace(',', '\n')
  file_data = file_data.replace(' weight=10', '')
  return file_data
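# Sketch of the transformation, assuming input in fallback_dirs.inc format:
# quotes, newlines and NULs are removed, C comments are dropped, and each
# trailing comma becomes a line break, so an entry like
#   "1.2.3.4:80 orport=443 id=<fingerprint>"
#   /* nickname=example */
#   ,
# collapses to a single line:
#   1.2.3.4:80 orport=443 id=<fingerprint>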
def load_possibly_compressed_response_json(response):
  if response.info().get('Content-Encoding') == 'gzip':
    buf = StringIO.StringIO( response.read() )
    f = gzip.GzipFile(fileobj=buf)
    return json.load(f)
  else:
    return json.load(response)
def load_json_from_file(json_file_name):
  # An exception here may be resolved by deleting the .last_modified
  # and .json files, and re-running the script
  try:
    with open(json_file_name, 'r') as f:
      return json.load(f)
  except EnvironmentError, error:
    raise Exception('Reading not-modified json file %s failed: %d: %s'%
                    (json_file_name,
                     error.errno,
                     error.strerror))
## OnionOO Functions

def datestr_to_datetime(datestr):
  # Parse datetimes like: Fri, 02 Oct 2015 13:34:14 GMT
  if datestr is not None:
    dt = dateutil.parser.parse(datestr)
  else:
    # Never modified - use start of epoch
    dt = datetime.datetime.utcfromtimestamp(0)
  # strip any timezone out (in case they're supported in future)
  dt = dt.replace(tzinfo=None)
  return dt
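# For example (illustrative):
#   datestr_to_datetime('Fri, 02 Oct 2015 13:34:14 GMT')
# yields the naive datetime.datetime(2015, 10, 2, 13, 34, 14), and
# datestr_to_datetime(None) yields the Unix epoch.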
def onionoo_fetch(what, **kwargs):
  params = kwargs
  params['type'] = 'relay'
  #params['limit'] = 10
  params['first_seen_days'] = '%d-'%(ADDRESS_AND_PORT_STABLE_DAYS)
  params['last_seen_days'] = '-%d'%(MAX_DOWNTIME_DAYS)
  params['flag'] = 'V2Dir'
  url = ONIONOO + what + '?' + urllib.urlencode(params)

  # Unfortunately, the URL is too long for some OS filenames,
  # but we still don't want to get files from different URLs mixed up
  base_file_name = what + '-' + hashlib.sha1(url).hexdigest()

  full_url_file_name = base_file_name + '.full_url'
  MAX_FULL_URL_LENGTH = 1024

  last_modified_file_name = base_file_name + '.last_modified'
  MAX_LAST_MODIFIED_LENGTH = 64

  json_file_name = base_file_name + '.json'

  if LOCAL_FILES_ONLY:
    # Read from the local file, don't write to anything
    response_json = load_json_from_file(json_file_name)
  else:
    # store the full URL to a file for debugging
    # no need to compare as long as you trust SHA-1
    write_to_file(url, full_url_file_name, MAX_FULL_URL_LENGTH)

    request = urllib2.Request(url)
    request.add_header('Accept-encoding', 'gzip')

    # load the last modified date from the file, if it exists
    last_mod_date = read_from_file(last_modified_file_name,
                                   MAX_LAST_MODIFIED_LENGTH)
    if last_mod_date is not None:
      request.add_header('If-modified-since', last_mod_date)

    # Parse last modified date
    last_mod = datestr_to_datetime(last_mod_date)

    # Not Modified and still recent enough to be useful
    # Onionoo / Globe used to use 6 hours, but we can afford a day
    required_freshness = datetime.datetime.utcnow()
    # strip any timezone out (to match dateutil.parser)
    required_freshness = required_freshness.replace(tzinfo=None)
    required_freshness -= datetime.timedelta(hours=24)

    # Make the OnionOO request
    response_code = 0
    try:
      response = urllib2.urlopen(request)
      response_code = response.getcode()
    except urllib2.HTTPError, error:
      response_code = error.code
      if response_code == 304: # not modified
        pass
      else:
        raise Exception("Could not get " + url + ": "
                        + str(error.code) + ": " + error.reason)

    if response_code == 200: # OK
      last_mod = datestr_to_datetime(response.info().get('Last-Modified'))

    # Check for freshness
    if last_mod < required_freshness:
      if last_mod_date is not None:
        # This check sometimes fails transiently, retry the script if it does
        date_message = "Outdated data: last updated " + last_mod_date
      else:
        date_message = "No data: never downloaded "
      raise Exception(date_message + " from " + url)

    # Process the data
    if response_code == 200: # OK

      response_json = load_possibly_compressed_response_json(response)

      with open(json_file_name, 'w') as f:
        # use the most compact json representation to save space
        json.dump(response_json, f, separators=(',',':'))

      # store the last modified date in its own file
      if response.info().get('Last-modified') is not None:
        write_to_file(response.info().get('Last-Modified'),
                      last_modified_file_name,
                      MAX_LAST_MODIFIED_LENGTH)

    elif response_code == 304: # Not Modified

      response_json = load_json_from_file(json_file_name)

    else: # Unexpected HTTP response code not covered in the HTTPError above
      raise Exception("Unexpected HTTP response code to " + url + ": "
                      + str(response_code))

  register_fetch_source(what,
                        url,
                        response_json['relays_published'],
                        response_json['version'])

  return response_json
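# With the defaults above, a 'details' request URL looks roughly like
# (illustrative; parameter order depends on urllib.urlencode and the dict,
# and callers may add extra parameters such as 'fields'):
# https://onionoo.torproject.org/details?type=relay&first_seen_days=90-&last_seen_days=-0&flag=V2Dir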
def fetch(what, **kwargs):
  #x = onionoo_fetch(what, **kwargs)
  # don't use sort_keys, as the order of or_addresses is significant
  #print json.dumps(x, indent=4, separators=(',', ': '))
  #sys.exit(0)

  return onionoo_fetch(what, **kwargs)
## Fallback Candidate Class

class Candidate(object):
  CUTOFF_ADDRESS_AND_PORT_STABLE = (datetime.datetime.utcnow()
                            - datetime.timedelta(ADDRESS_AND_PORT_STABLE_DAYS))

  def __init__(self, details):
    for f in ['fingerprint', 'nickname', 'last_changed_address_or_port',
              'consensus_weight', 'or_addresses', 'dir_address']:
      if not f in details: raise Exception("Document has no %s field."%(f,))

    if not 'contact' in details:
      details['contact'] = None
    if not 'flags' in details or details['flags'] is None:
      details['flags'] = []
    if (not 'advertised_bandwidth' in details
        or details['advertised_bandwidth'] is None):
      # relays without advertised bandwidth have it calculated from their
      # consensus weight
      details['advertised_bandwidth'] = 0
    if (not 'effective_family' in details
        or details['effective_family'] is None):
      details['effective_family'] = []
    if not 'platform' in details:
      details['platform'] = None
    details['last_changed_address_or_port'] = parse_ts(
                                      details['last_changed_address_or_port'])
    self._data = details
    self._stable_sort_or_addresses()

    self._fpr = self._data['fingerprint']
    self._running = self._guard = self._v2dir = 0.
    self._split_dirport()
    self._compute_orport()
    if self.orport is None:
      raise Exception("Failed to get an orport for %s."%(self._fpr,))
    self._compute_ipv6addr()
    if not self.has_ipv6():
      logging.debug("Failed to get an ipv6 address for %s."%(self._fpr,))
    self._compute_version()
    self._extra_info_cache = None
  def _stable_sort_or_addresses(self):
    # replace self._data['or_addresses'] with a stable ordering,
    # sorting the secondary addresses in string order
    # leave the received order in self._data['or_addresses_raw']
    self._data['or_addresses_raw'] = self._data['or_addresses']
    or_address_primary = self._data['or_addresses'][:1]
    # subsequent entries in the or_addresses array are in an arbitrary order
    # so we stabilise the addresses by sorting them in string order
    or_addresses_secondaries_stable = sorted(self._data['or_addresses'][1:])
    or_addresses_stable = or_address_primary + or_addresses_secondaries_stable
    self._data['or_addresses'] = or_addresses_stable

  def get_fingerprint(self):
    return self._fpr
  # is_valid_ipv[46]_address by gsathya, karsten, 2013
  @staticmethod
  def is_valid_ipv4_address(address):
    if not isinstance(address, (str, unicode)):
      return False

    # check if there are four period separated values
    if address.count(".") != 3:
      return False

    # checks that each value in the octet are decimal values between 0-255
    for entry in address.split("."):
      if not entry.isdigit() or int(entry) < 0 or int(entry) > 255:
        return False
      elif entry[0] == "0" and len(entry) > 1:
        return False # leading zeros, for instance in "1.2.3.001"

    return True
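  # Illustrative behaviour (not executed by the script):
  #   is_valid_ipv4_address('1.2.3.4')   -> True
  #   is_valid_ipv4_address('1.2.3.256') -> False (octet out of range)
  #   is_valid_ipv4_address('1.2.3.001') -> False (leading zeros)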
  @staticmethod
  def is_valid_ipv6_address(address):
    if not isinstance(address, (str, unicode)):
      return False

    # remove brackets
    address = address[1:-1]

    # addresses are made up of eight colon separated groups of four hex digits
    # with leading zeros being optional
    # https://en.wikipedia.org/wiki/IPv6#Address_format

    colon_count = address.count(":")

    if colon_count > 7:
      return False # too many groups
    elif colon_count != 7 and not "::" in address:
      return False # not enough groups and none are collapsed
    elif address.count("::") > 1 or ":::" in address:
      return False # multiple groupings of zeros can't be collapsed

    found_ipv4_on_previous_entry = False
    for entry in address.split(":"):
      # If an IPv6 address has an embedded IPv4 address,
      # it must be the last entry
      if found_ipv4_on_previous_entry:
        return False
      if not re.match("^[0-9a-fA-F]{0,4}$", entry):
        if not Candidate.is_valid_ipv4_address(entry):
          return False
        else:
          found_ipv4_on_previous_entry = True

    return True
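  # Illustrative behaviour, assuming bracketed input as used by onionoo:
  #   is_valid_ipv6_address('[2001:db8::1]')      -> True
  #   is_valid_ipv6_address('[2001:db8::1::2]')   -> False (two '::')
  #   is_valid_ipv6_address('[::ffff:1.2.3.4]')   -> True  (embedded IPv4)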
  def _split_dirport(self):
    # Split the dir_address into dirip and dirport
    (self.dirip, _dirport) = self._data['dir_address'].split(':', 1)
    self.dirport = int(_dirport)

  def _compute_orport(self):
    # Choose the first ORPort that's on the same IPv4 address as the DirPort.
    # In rare circumstances, this might not be the primary ORPort address.
    # However, _stable_sort_or_addresses() ensures we choose the same one
    # every time, even if onionoo changes the order of the secondaries.
    self._split_dirport()
    self.orport = None
    for i in self._data['or_addresses']:
      if i != self._data['or_addresses'][0]:
        logging.debug('Secondary IPv4 Address Used for %s: %s'%(self._fpr, i))
      (ipaddr, port) = i.rsplit(':', 1)
      if (ipaddr == self.dirip) and Candidate.is_valid_ipv4_address(ipaddr):
        self.orport = int(port)
        return
  def _compute_ipv6addr(self):
    # Choose the first IPv6 address that uses the same port as the ORPort
    # Or, choose the first IPv6 address in the list
    # _stable_sort_or_addresses() ensures we choose the same IPv6 address
    # every time, even if onionoo changes the order of the secondaries.
    self.ipv6addr = None
    self.ipv6orport = None
    # Choose the first IPv6 address that uses the same port as the ORPort
    for i in self._data['or_addresses']:
      (ipaddr, port) = i.rsplit(':', 1)
      # compare as integers: the port from onionoo is a string
      if (int(port) == self.orport) and Candidate.is_valid_ipv6_address(ipaddr):
        self.ipv6addr = ipaddr
        self.ipv6orport = int(port)
        return
    # Choose the first IPv6 address in the list
    for i in self._data['or_addresses']:
      (ipaddr, port) = i.rsplit(':', 1)
      if Candidate.is_valid_ipv6_address(ipaddr):
        self.ipv6addr = ipaddr
        self.ipv6orport = int(port)
        return
  def _compute_version(self):
    # parse the version out of the platform string
    # The platform looks like: "Tor 0.2.7.6 on Linux"
    self._data['version'] = None
    if self._data['platform'] is None:
      return
    # be tolerant of weird whitespacing, use a whitespace split
    tokens = self._data['platform'].split()
    for token in tokens:
      vnums = token.split('.')
      # if it's at least a.b.c.d, with potentially an -alpha-dev, -alpha, -rc
      if (len(vnums) >= 4 and vnums[0].isdigit() and vnums[1].isdigit() and
          vnums[2].isdigit()):
        self._data['version'] = token
        return
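  # For example (illustrative): a platform of "Tor 0.2.7.6 on Linux" yields
  # version '0.2.7.6', and "Tor 0.3.0.0-alpha-dev on FreeBSD" yields
  # '0.3.0.0-alpha-dev' (the fourth component may carry a -alpha/-rc suffix).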
  # From #20509
  # bug #20499 affects versions from 0.2.9.1-alpha-dev to 0.2.9.4-alpha-dev
  # and version 0.3.0.0-alpha-dev
  # Exhaustive lists are hard to get wrong
  STALE_CONSENSUS_VERSIONS = ['0.2.9.1-alpha-dev',
                              '0.2.9.2-alpha',
                              '0.2.9.2-alpha-dev',
                              '0.2.9.3-alpha',
                              '0.2.9.3-alpha-dev',
                              '0.2.9.4-alpha',
                              '0.2.9.4-alpha-dev',
                              '0.3.0.0-alpha-dev'
                              ]
  def is_valid_version(self):
    # call _compute_version before calling this
    # is the version of the relay a version we want as a fallback?
    # checks both recommended versions and bug #20499 / #20509
    #
    # if the relay doesn't have a recommended version field, exclude the relay
    if not self._data.has_key('recommended_version'):
      log_excluded('%s not a candidate: no recommended_version field',
                   self._fpr)
      return False
    if not self._data['recommended_version']:
      log_excluded('%s not a candidate: version not recommended', self._fpr)
      return False
    # if the relay doesn't have version field, exclude the relay
    if not self._data.has_key('version'):
      log_excluded('%s not a candidate: no version field', self._fpr)
      return False
    if self._data['version'] in Candidate.STALE_CONSENSUS_VERSIONS:
      logging.warning('%s not a candidate: version delivers stale consensuses',
                      self._fpr)
      return False
    return True
  @staticmethod
  def _extract_generic_history(history, which='unknown'):
    # given a tree like this:
    #   "1_month": {
    #     "count": 187,
    #     "factor": 0.001001001001001001,
    #     "first": "2015-02-27 06:00:00",
    #     "interval": 14400,
    #     "last": "2015-03-30 06:00:00",
    #     "values": [
    #       999,
    #       999
    #     ]
    #   },
    #   "1_week": {
    #     "count": 169,
    #     "factor": 0.001001001001001001,
    #     "first": "2015-03-23 07:30:00",
    #     "interval": 3600,
    #     "last": "2015-03-30 07:30:00",
    #     "values": [ ... ]
    #   },
    #   "1_year": {
    #     "count": 177,
    #     "factor": 0.001001001001001001,
    #     "first": "2014-04-11 00:00:00",
    #     "interval": 172800,
    #     "last": "2015-03-29 00:00:00",
    #     "values": [ ... ]
    #   },
    #   "3_months": {
    #     "count": 185,
    #     "factor": 0.001001001001001001,
    #     "first": "2014-12-28 06:00:00",
    #     "interval": 43200,
    #     "last": "2015-03-30 06:00:00",
    #     "values": [ ... ]
    #   },
    # extract exactly one piece of data per time interval,
    # using smaller intervals where available.
    #
    # returns list of (age, length, value) dictionaries.

    generic_history = []

    periods = history.keys()
    periods.sort(key = lambda x: history[x]['interval'])
    now = datetime.datetime.utcnow()
    newest = now
    for p in periods:
      h = history[p]
      interval = datetime.timedelta(seconds = h['interval'])
      this_ts = parse_ts(h['last'])

      if (len(h['values']) != h['count']):
        logging.warning('Inconsistent value count in %s document for %s'
                        %(p, which))
      for v in reversed(h['values']):
        if (this_ts <= newest):
          agt1 = now - this_ts
          agt2 = interval
          agetmp1 = (agt1.microseconds + (agt1.seconds + agt1.days * 24 * 3600)
                     * 10**6) / 10**6
          agetmp2 = (agt2.microseconds + (agt2.seconds + agt2.days * 24 * 3600)
                     * 10**6) / 10**6
          generic_history.append(
            { 'age': agetmp1,
              'length': agetmp2,
              'value': v
            })
          newest = this_ts
        this_ts -= interval

      if (this_ts + interval != parse_ts(h['first'])):
        logging.warning('Inconsistent time information in %s document for %s'
                        %(p, which))

    #print json.dumps(generic_history, sort_keys=True,
    #                 indent=4, separators=(',', ': '))
    return generic_history
  @staticmethod
  def _avg_generic_history(generic_history):
    a = []
    for i in generic_history:
      if i['age'] > (ADDRESS_AND_PORT_STABLE_DAYS * 24 * 3600):
        continue
      if (i['length'] is not None
          and i['age'] is not None
          and i['value'] is not None):
        w = i['length'] * math.pow(AGE_ALPHA, i['age']/(3600*24))
        a.append( (i['value'] * w, w) )

    sv = math.fsum(map(lambda x: x[0], a))
    sw = math.fsum(map(lambda x: x[1], a))

    if sw == 0.0:
      svw = 0.0
    else:
      svw = sv/sw
    return svw
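  # Worked example of the weighting (illustrative): a 3600s sample that is
  # 10 days old gets weight 3600 * 0.99**10, roughly 3256, so a value of 999
  # contributes 999 * 3256 to sv and 3256 to sw; the result is the
  # age-discounted, length-weighted average, still on the 0-999 scale.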
  def _add_generic_history(self, history):
    periods = history.keys()
    periods.sort(key = lambda x: history[x]['interval'] )

    print periods

  def add_running_history(self, history):
    pass
  def add_uptime(self, uptime):
    logging.debug('Adding uptime %s.'%(self._fpr,))

    # flags we care about: Running, V2Dir, Guard
    if not 'flags' in uptime:
      logging.debug('No flags in document for %s.'%(self._fpr,))
      return

    for f in ['Running', 'Guard', 'V2Dir']:
      if not f in uptime['flags']:
        logging.debug('No %s in flags for %s.'%(f, self._fpr,))
        return

    running = self._extract_generic_history(uptime['flags']['Running'],
                                            '%s-Running'%(self._fpr))
    guard = self._extract_generic_history(uptime['flags']['Guard'],
                                          '%s-Guard'%(self._fpr))
    v2dir = self._extract_generic_history(uptime['flags']['V2Dir'],
                                          '%s-V2Dir'%(self._fpr))
    if 'BadExit' in uptime['flags']:
      badexit = self._extract_generic_history(uptime['flags']['BadExit'],
                                              '%s-BadExit'%(self._fpr))

    self._running = self._avg_generic_history(running) / ONIONOO_SCALE_ONE
    self._guard = self._avg_generic_history(guard) / ONIONOO_SCALE_ONE
    self._v2dir = self._avg_generic_history(v2dir) / ONIONOO_SCALE_ONE
    self._badexit = None
    if 'BadExit' in uptime['flags']:
      self._badexit = self._avg_generic_history(badexit) / ONIONOO_SCALE_ONE
  def is_candidate(self):
    try:
      if (MUST_BE_RUNNING_NOW and not self.is_running()):
        log_excluded('%s not a candidate: not running now, unable to check ' +
                     'DirPort consensus download', self._fpr)
        return False
      if (self._data['last_changed_address_or_port'] >
          self.CUTOFF_ADDRESS_AND_PORT_STABLE):
        log_excluded('%s not a candidate: changed address/port recently (%s)',
                     self._fpr, self._data['last_changed_address_or_port'])
        return False
      if self._running < CUTOFF_RUNNING:
        log_excluded('%s not a candidate: running avg too low (%lf)',
                     self._fpr, self._running)
        return False
      if self._v2dir < CUTOFF_V2DIR:
        log_excluded('%s not a candidate: v2dir avg too low (%lf)',
                     self._fpr, self._v2dir)
        return False
      if self._badexit is not None and self._badexit > PERMITTED_BADEXIT:
        log_excluded('%s not a candidate: badexit avg too high (%lf)',
                     self._fpr, self._badexit)
        return False
      # this function logs a message depending on which check fails
      if not self.is_valid_version():
        return False
      if self._guard < CUTOFF_GUARD:
        log_excluded('%s not a candidate: guard avg too low (%lf)',
                     self._fpr, self._guard)
        return False
      if (not self._data.has_key('consensus_weight')
          or self._data['consensus_weight'] < 1):
        log_excluded('%s not a candidate: consensus weight invalid', self._fpr)
        return False
    except BaseException as e:
      logging.warning("Exception %s when checking if fallback is a candidate",
                      str(e))
      return False
    return True
  def is_in_whitelist(self, relaylist):
    """ A fallback matches if each key in the whitelist line matches:
          ipv4
          dirport
          orport
          id
          ipv6 address and port (if present)
        If the fallback has an ipv6 key, the whitelist line must also have
        it, and vice versa, otherwise they don't match. """
    ipv6 = None
    if self.has_ipv6():
      ipv6 = '%s:%d'%(self.ipv6addr, self.ipv6orport)
    for entry in relaylist:
      if entry['id'] != self._fpr:
        # can't log here unless we match an IP and port, because every relay's
        # fingerprint is compared to every entry's fingerprint
        if entry['ipv4'] == self.dirip and int(entry['orport']) == self.orport:
          logging.warning('%s excluded: has OR %s:%d changed fingerprint to ' +
                          '%s?', entry['id'], self.dirip, self.orport,
                          self._fpr)
        if self.has_ipv6() and entry.has_key('ipv6') and entry['ipv6'] == ipv6:
          logging.warning('%s excluded: has OR %s changed fingerprint to ' +
                          '%s?', entry['id'], ipv6, self._fpr)
        continue
      if entry['ipv4'] != self.dirip:
        logging.warning('%s excluded: has it changed IPv4 from %s to %s?',
                        self._fpr, entry['ipv4'], self.dirip)
        continue
      if int(entry['dirport']) != self.dirport:
        logging.warning('%s excluded: has it changed DirPort from %s:%d to ' +
                        '%s:%d?', self._fpr, self.dirip, int(entry['dirport']),
                        self.dirip, self.dirport)
        continue
      if int(entry['orport']) != self.orport:
        logging.warning('%s excluded: has it changed ORPort from %s:%d to ' +
                        '%s:%d?', self._fpr, self.dirip, int(entry['orport']),
                        self.dirip, self.orport)
        continue
      if entry.has_key('ipv6') and self.has_ipv6():
        # if both entry and fallback have an ipv6 address, compare them
        if entry['ipv6'] != ipv6:
          logging.warning('%s excluded: has it changed IPv6 ORPort from %s ' +
                          'to %s?', self._fpr, entry['ipv6'], ipv6)
          continue
      # if the fallback has an IPv6 address but the whitelist entry
      # doesn't, or vice versa, the whitelist entry doesn't match
      elif entry.has_key('ipv6') and not self.has_ipv6():
        logging.warning('%s excluded: has it lost its former IPv6 address %s?',
                        self._fpr, entry['ipv6'])
        continue
      elif not entry.has_key('ipv6') and self.has_ipv6():
        logging.warning('%s excluded: has it gained an IPv6 address %s?',
                        self._fpr, ipv6)
        continue
      return True
    return False
  def cw_to_bw_factor(self):
    # any relays with a missing or zero consensus weight are not candidates
    # any relays with a missing advertised bandwidth have it set to zero
    return self._data['advertised_bandwidth'] / self._data['consensus_weight']

  # since advertised_bandwidth is reported by the relay, it can be gamed
  # to avoid this, use the median consensus weight to bandwidth factor to
  # estimate this relay's measured bandwidth, and make that the upper limit
  def measured_bandwidth(self, median_cw_to_bw_factor):
    cw_to_bw = median_cw_to_bw_factor
    # Reduce exit bandwidth to make sure we're not overloading them
    if self.is_exit():
      cw_to_bw *= EXIT_BANDWIDTH_FRACTION
    measured_bandwidth = self._data['consensus_weight'] * cw_to_bw
    if self._data['advertised_bandwidth'] != 0:
      # limit advertised bandwidth (if available) to measured bandwidth
      return min(measured_bandwidth, self._data['advertised_bandwidth'])
    else:
      return measured_bandwidth

  def set_measured_bandwidth(self, median_cw_to_bw_factor):
    self._data['measured_bandwidth'] = self.measured_bandwidth(
        median_cw_to_bw_factor)
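  # Worked example of the estimate (illustrative numbers): with a median
  # factor of 100 bytes/s per consensus weight unit, a non-exit relay with
  # consensus_weight=20000 is estimated at 2000000 bytes/s; if it advertises
  # 9999999 bytes/s, min() caps the result at 2000000, which limits gamed
  # advertised values.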
  def is_exit(self):
    return 'Exit' in self._data['flags']

  def is_guard(self):
    return 'Guard' in self._data['flags']

  def is_running(self):
    return 'Running' in self._data['flags']

  # does this fallback have an IPv6 address and orport?
  def has_ipv6(self):
    return self.ipv6addr is not None and self.ipv6orport is not None

  # strip leading and trailing brackets from an IPv6 address
  # safe to use on non-bracketed IPv6 and on IPv4 addresses
  # also convert to unicode, and make None appear as ''
  @staticmethod
  def strip_ipv6_brackets(ip):
    if ip is None:
      return unicode('')
    if len(ip) < 2:
      return unicode(ip)
    if ip[0] == '[' and ip[-1] == ']':
      return unicode(ip[1:-1])
    return unicode(ip)
  # are ip_a and ip_b in the same netblock?
  # mask_bits is the size of the netblock
  # takes both IPv4 and IPv6 addresses
  # the versions of ip_a and ip_b must be the same
  # the mask must be valid for the IP version
  @staticmethod
  def netblocks_equal(ip_a, ip_b, mask_bits):
    if ip_a is None or ip_b is None:
      return False
    ip_a = Candidate.strip_ipv6_brackets(ip_a)
    ip_b = Candidate.strip_ipv6_brackets(ip_b)
    a = ipaddress.ip_address(ip_a)
    b = ipaddress.ip_address(ip_b)
    if a.version != b.version:
      raise Exception('Mismatching IP versions in %s and %s'%(ip_a, ip_b))
    if mask_bits > a.max_prefixlen:
      logging.error('Bad IP mask %d for %s and %s'%(mask_bits, ip_a, ip_b))
      mask_bits = a.max_prefixlen
    if mask_bits < 0:
      logging.error('Bad IP mask %d for %s and %s'%(mask_bits, ip_a, ip_b))
      mask_bits = 0
    a_net = ipaddress.ip_network('%s/%d'%(ip_a, mask_bits), strict=False)
    return b in a_net
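  # Illustrative behaviour (assuming the ipaddress module is available):
  #   netblocks_equal(u'1.2.3.4', u'1.2.3.200', 24)            -> True
  #   netblocks_equal(u'1.2.3.4', u'1.2.4.4', 24)              -> False
  #   netblocks_equal(u'[2001:db8::1]', u'[2001:db8::2]', 64)  -> True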
  # is this fallback's IPv4 address (dirip) in the same netblock as other's
  # IPv4 address?
  # mask_bits is the size of the netblock
  def ipv4_netblocks_equal(self, other, mask_bits):
    return Candidate.netblocks_equal(self.dirip, other.dirip, mask_bits)

  # is this fallback's IPv6 address (ipv6addr) in the same netblock as
  # other's IPv6 address?
  # Returns False if either fallback has no IPv6 address
  # mask_bits is the size of the netblock
  def ipv6_netblocks_equal(self, other, mask_bits):
    if not self.has_ipv6() or not other.has_ipv6():
      return False
    return Candidate.netblocks_equal(self.ipv6addr, other.ipv6addr, mask_bits)

  # is this fallback's IPv4 DirPort the same as other's IPv4 DirPort?
  def dirport_equal(self, other):
    return self.dirport == other.dirport

  # is this fallback's IPv4 ORPort the same as other's IPv4 ORPort?
  def ipv4_orport_equal(self, other):
    return self.orport == other.orport

  # is this fallback's IPv6 ORPort the same as other's IPv6 ORPort?
  # Returns False if either fallback has no IPv6 address
  def ipv6_orport_equal(self, other):
    if not self.has_ipv6() or not other.has_ipv6():
      return False
    return self.ipv6orport == other.ipv6orport

  # does this fallback have the same DirPort, IPv4 ORPort, or
  # IPv6 ORPort as other?
  # Ignores IPv6 ORPort if either fallback has no IPv6 address
  def port_equal(self, other):
    return (self.dirport_equal(other) or self.ipv4_orport_equal(other)
            or self.ipv6_orport_equal(other))

  # return a list containing IPv4 ORPort, DirPort, and IPv6 ORPort (if present)
  def port_list(self):
    ports = [self.dirport, self.orport]
    if self.has_ipv6() and not self.ipv6orport in ports:
      ports.append(self.ipv6orport)
    return ports

  # does this fallback share a port with other, regardless of whether the
  # port types match?
  # For example, if self's IPv4 ORPort is 80 and other's DirPort is 80,
  # return True
  def port_shared(self, other):
    for p in self.port_list():
      if p in other.port_list():
        return True
    return False
  # log how long it takes to download a consensus from dirip:dirport
  # returns True if the download failed, False if it succeeded within max_time
  @staticmethod
  def fallback_consensus_download_speed(dirip, dirport, nickname, fingerprint,
                                        max_time):
    download_failed = False
    # some directory mirrors respond to requests in ways that hang python
    # sockets, which is why we log this line here
    logging.info('Initiating %sconsensus download from %s (%s:%d) %s.',
                 'microdesc ' if DOWNLOAD_MICRODESC_CONSENSUS else '',
                 nickname, dirip, dirport, fingerprint)
    # there appears to be about 1 second of overhead when comparing stem's
    # internal trace time and the elapsed time calculated here
    TIMEOUT_SLOP = 1.0
    start = datetime.datetime.utcnow()
    try:
      consensus = get_consensus(
                    endpoints = [(dirip, dirport)],
                    timeout = (max_time + TIMEOUT_SLOP),
                    validate = True,
                    retries = 0,
                    fall_back_to_authority = False,
                    document_handler = DocumentHandler.BARE_DOCUMENT,
                    microdescriptor = DOWNLOAD_MICRODESC_CONSENSUS
                  ).run()[0]
      end = datetime.datetime.utcnow()
      time_since_expiry = (end - consensus.valid_until).total_seconds()
    except Exception, stem_error:
      end = datetime.datetime.utcnow()
      log_excluded('Unable to retrieve a consensus from %s: %s', nickname,
                   stem_error)
      status = 'error: "%s"' % (stem_error)
      level = logging.WARNING
      download_failed = True
    elapsed = (end - start).total_seconds()
    if download_failed:
      # keep the error failure status, and avoid using the variables
      pass
    elif elapsed > max_time:
      status = 'too slow'
      level = logging.WARNING
      download_failed = True
    elif (time_since_expiry > 0):
      status = 'outdated consensus, expired %ds ago'%(int(time_since_expiry))
      if time_since_expiry <= CONSENSUS_EXPIRY_TOLERANCE:
        status += ', tolerating up to %ds'%(CONSENSUS_EXPIRY_TOLERANCE)
        level = logging.INFO
      else:
        status += ', invalid'
        level = logging.WARNING
        download_failed = True
    else:
      status = 'ok'
      level = logging.DEBUG
    logging.log(level, 'Consensus download: %0.1fs %s from %s (%s:%d) %s, ' +
                'max download time %0.1fs.', elapsed, status, nickname,
                dirip, dirport, fingerprint, max_time)
    return download_failed
  # does this fallback download the consensus fast enough?
  def check_fallback_download_consensus(self):
    # include the relay if we're not doing a check, or we can't check (IPv6)
    ipv4_failed = False
    ipv6_failed = False
    if PERFORM_IPV4_DIRPORT_CHECKS:
      ipv4_failed = Candidate.fallback_consensus_download_speed(self.dirip,
                                                self.dirport,
                                                self._data['nickname'],
                                                self._fpr,
                                                CONSENSUS_DOWNLOAD_SPEED_MAX)
    if self.has_ipv6() and PERFORM_IPV6_DIRPORT_CHECKS:
      # Clients assume the IPv6 DirPort is the same as the IPv4 DirPort
      ipv6_failed = Candidate.fallback_consensus_download_speed(self.ipv6addr,
                                                self.dirport,
                                                self._data['nickname'],
                                                self._fpr,
                                                CONSENSUS_DOWNLOAD_SPEED_MAX)
    return ((not ipv4_failed) and (not ipv6_failed))

  # if this fallback has not passed a download check, try it again,
  # and record the result, available in get_fallback_download_consensus
  def try_fallback_download_consensus(self):
    if not self.get_fallback_download_consensus():
      self._data['download_check'] = self.check_fallback_download_consensus()

  # did this fallback pass the download check?
  def get_fallback_download_consensus(self):
    # if we're not performing checks, return True
    if not PERFORM_IPV4_DIRPORT_CHECKS and not PERFORM_IPV6_DIRPORT_CHECKS:
      return True
    # if we are performing checks, but haven't done one, return False
    if not self._data.has_key('download_check'):
      return False
    return self._data['download_check']
  # output an optional header comment and info for this fallback
  # try_fallback_download_consensus before calling this
  def fallbackdir_line(self, fallbacks, prefilter_fallbacks):
    s = ''
    if OUTPUT_COMMENTS:
      s += self.fallbackdir_comment(fallbacks, prefilter_fallbacks)
    # if the download speed is ok, output a C string
    # if it's not, but we OUTPUT_COMMENTS, output a commented-out C string
    if self.get_fallback_download_consensus() or OUTPUT_COMMENTS:
      s += self.fallbackdir_info(self.get_fallback_download_consensus())
    return s

  # output a header comment for this fallback
  def fallbackdir_comment(self, fallbacks, prefilter_fallbacks):
    # /*
    # nickname
    # flags
    # adjusted bandwidth, consensus weight
    # [contact]
    # [identical contact counts]
    # */
    # Multiline C comment
    s = '/*'
    s += '\n'
    s += cleanse_c_multiline_comment(self._data['nickname'])
    s += '\n'
    s += 'Flags: '
    s += cleanse_c_multiline_comment(' '.join(sorted(self._data['flags'])))
    s += '\n'
    # this is an adjusted bandwidth, see calculate_measured_bandwidth()
    bandwidth = self._data['measured_bandwidth']
    weight = self._data['consensus_weight']
    s += 'Bandwidth: %.1f MByte/s, Consensus Weight: %d'%(
           bandwidth/(1024.0*1024.0),
           weight)
    s += '\n'
    if self._data['contact'] is not None:
      s += cleanse_c_multiline_comment(self._data['contact'])
      if CONTACT_COUNT:
        fallback_count = len([f for f in fallbacks
                              if f._data['contact'] == self._data['contact']])
        if fallback_count > 1:
          s += '\n'
          s += '%d identical contacts listed' % (fallback_count)
    # close the multiline C comment and return the header
    s += '\n'
    s += '*/'
    s += '\n'
    return s
  # output the fallback info C string for this fallback
  # this is the text that would go after FallbackDir in a torrc
  # if this relay failed the download test and we OUTPUT_COMMENTS,
  # comment-out the returned string
  def fallbackdir_info(self, dl_speed_ok):
    # "address:dirport orport=port id=fingerprint"
    # (insert additional mandatory fields here)
    # "[ipv6=addr:orport]"
    # (insert additional optional fields here)
    # /* nickname=name */
    # /* extrainfo={0,1} */
    # (insert additional comment fields here)
    # /* ===== */
    # ,
    #
    # Do we want a C string, or a commented-out string?
    c_string = dl_speed_ok
    comment_string = not dl_speed_ok and OUTPUT_COMMENTS
    # If we don't want either kind of string, bail
    if not c_string and not comment_string:
      return ''
    s = ''
    # Comment out the fallback directory entry if it's too slow
    # See the debug output for which address and port is failing
    if comment_string:
      s += '/* Consensus download failed or was too slow:\n'
    # Multi-Line C string with trailing comma (part of a string list)
    # This makes it easier to diff the file, and remove IPv6 lines using grep
    # Integers don't need escaping
    s += '"%s orport=%d id=%s"'%(
            cleanse_c_string(self._data['dir_address']),
            self.orport,
            cleanse_c_string(self._fpr))
    s += '\n'
    # (insert additional mandatory fields here)
    if self.has_ipv6():
      s += '" ipv6=%s:%d"'%(cleanse_c_string(self.ipv6addr), self.ipv6orport)
      s += '\n'
    # (insert additional optional fields here)
    if not comment_string:
      s += '/* '
    s += 'nickname=%s'%(cleanse_c_string(self._data['nickname']))
    if not comment_string:
      s += ' */'
    s += '\n'
    # if we know that the fallback is an extrainfo cache, flag it
    # and if we don't know, assume it is not
    if not comment_string:
      s += '/* '
    s += 'extrainfo=%d'%(1 if self._extra_info_cache else 0)
    if not comment_string:
      s += ' */'
    s += '\n'
    # (insert additional comment fields here)
    # The terminator and comma must be the last line in each fallback entry
    if not comment_string:
      s += '/* '
    s += SECTION_SEPARATOR_BASE
    if not comment_string:
      s += ' */'
    s += '\n'
    s += ','
    if comment_string:
      s += '\n'
      s += '*/'
    return s
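  # In the default (non-comment) output, a passing entry renders roughly as
  # (illustrative address and fingerprint):
  #   "1.2.3.4:80 orport=443 id=0123456789ABCDEF0123456789ABCDEF01234567"
  #   " ipv6=[2001:db8::1]:443"
  #   /* nickname=example */
  #   /* extrainfo=0 */
  #   /* ===== */
  #   ,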
## Fallback Candidate List Class

class CandidateList(dict):
  def __init__(self):
    pass

  def _add_relay(self, details):
    if not 'dir_address' in details: return
    c = Candidate(details)
    self[ c.get_fingerprint() ] = c

  def _add_uptime(self, uptime):
    try:
      fpr = uptime['fingerprint']
    except KeyError:
      raise Exception("Document has no fingerprint field.")

    try:
      c = self[fpr]
    except KeyError:
      logging.debug('Got unknown relay %s in uptime document.'%(fpr,))
      return

    c.add_uptime(uptime)
  def _add_details(self):
    logging.debug('Loading details document.')
    d = fetch('details',
        fields=('fingerprint,nickname,contact,last_changed_address_or_port,' +
                'consensus_weight,advertised_bandwidth,or_addresses,' +
                'dir_address,recommended_version,flags,effective_family,' +
                'platform'))
    logging.debug('Loading details document done.')

    if not 'relays' in d: raise Exception("No relays found in document.")

    for r in d['relays']: self._add_relay(r)

  def _add_uptimes(self):
    logging.debug('Loading uptime document.')
    d = fetch('uptime')
    logging.debug('Loading uptime document done.')

    if not 'relays' in d: raise Exception("No relays found in document.")
    for r in d['relays']: self._add_uptime(r)

  def add_relays(self):
    self._add_details()
    self._add_uptimes()

  def count_guards(self):
    guard_count = 0
    for fpr in self.keys():
      if self[fpr].is_guard():
        guard_count += 1
    return guard_count
  # Find fallbacks that fit the uptime, stability, and flags criteria,
  # and make an array of them in self.fallbacks
  def compute_fallbacks(self):
    self.fallbacks = map(lambda x: self[x],
                         filter(lambda x: self[x].is_candidate(),
                                self.keys()))

  # sort fallbacks by their consensus weight to advertised bandwidth factor,
  # lowest to highest
  # used to find the median cw_to_bw_factor()
  def sort_fallbacks_by_cw_to_bw_factor(self):
    self.fallbacks.sort(key=lambda f: f.cw_to_bw_factor())

  # sort fallbacks by their measured bandwidth, highest to lowest
  # calculate_measured_bandwidth before calling this
  # this is useful for reviewing candidates in priority order
  def sort_fallbacks_by_measured_bandwidth(self):
    self.fallbacks.sort(key=lambda f: f._data['measured_bandwidth'],
                        reverse=True)

  # sort fallbacks by the data field data_field, lowest to highest
  def sort_fallbacks_by(self, data_field):
    self.fallbacks.sort(key=lambda f: f._data[data_field])
  @staticmethod
  def load_relaylist(file_obj):
    """ Read each line in the file, and parse it like a FallbackDir line:
        an IPv4 address and optional port:
          <IPv4 address>:<port>
        which are parsed into dictionary entries:
          ipv4=<IPv4 address>
          dirport=<port>
        followed by a series of key=value entries:
          orport=<port>
          id=<fingerprint>
          ipv6=<IPv6 address>:<IPv6 orport>
        each line's key/value pairs are placed in a dictionary,
        (of string -> string key/value pairs),
        and these dictionaries are placed in an array.
        comments start with # and are ignored """
    file_data = file_obj['data']
    file_name = file_obj['name']
    relaylist = []
    if file_data is None:
      return relaylist
    for line in file_data.split('\n'):
      relay_entry = {}
      # ignore comments
      line_comment_split = line.split('#')
      line = line_comment_split[0]
      # cleanup whitespace
      line = cleanse_whitespace(line)
      line = line.strip()
      if len(line) == 0:
        continue
      for item in line.split(' '):
        item = item.strip()
        if len(item) == 0:
          continue
        key_value_split = item.split('=')
        kvl = len(key_value_split)
        if kvl < 1 or kvl > 2:
          print '#error Bad %s item: %s, format is key=value.'%(
            file_name, item)
        if kvl == 1:
          # assume that entries without a key are the ipv4 address,
          # perhaps with a dirport
          ipv4_maybe_dirport = key_value_split[0]
          ipv4_maybe_dirport_split = ipv4_maybe_dirport.split(':')
          dirl = len(ipv4_maybe_dirport_split)
          if dirl < 1 or dirl > 2:
            print '#error Bad %s IPv4 item: %s, format is ipv4:port.'%(
              file_name, item)
          if dirl >= 1:
            relay_entry['ipv4'] = ipv4_maybe_dirport_split[0]
          if dirl == 2:
            relay_entry['dirport'] = ipv4_maybe_dirport_split[1]
        elif kvl == 2:
          relay_entry[key_value_split[0]] = key_value_split[1]
      relaylist.append(relay_entry)
    return relaylist
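  # A whitelist line like (illustrative values):
  #   1.2.3.4:80 orport=443 id=0123456789ABCDEF0123456789ABCDEF01234567
  # parses to:
  #   {'ipv4': '1.2.3.4', 'dirport': '80', 'orport': '443',
  #    'id': '0123456789ABCDEF0123456789ABCDEF01234567'}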
  # apply the fallback whitelist
  def apply_filter_lists(self, whitelist_obj):
    excluded_count = 0
    logging.debug('Applying whitelist')
    # parse the whitelist
    whitelist = self.load_relaylist(whitelist_obj)
    filtered_fallbacks = []
    for f in self.fallbacks:
      in_whitelist = f.is_in_whitelist(whitelist)
      if in_whitelist:
        # include
        filtered_fallbacks.append(f)
      elif INCLUDE_UNLISTED_ENTRIES:
        # include
        filtered_fallbacks.append(f)
      else:
        # exclude
        excluded_count += 1
        log_excluded('Excluding %s: not in whitelist.',
                     f._fpr)
    self.fallbacks = filtered_fallbacks
    return excluded_count

  @staticmethod
  def summarise_filters(initial_count, excluded_count):
    return '/* Whitelist excluded %d of %d candidates. */'%(
            excluded_count, initial_count)
  # calculate each fallback's measured bandwidth based on the median
  # consensus weight to advertised bandwidth ratio
  def calculate_measured_bandwidth(self):
    self.sort_fallbacks_by_cw_to_bw_factor()
    median_fallback = self.fallback_median(True)
    if median_fallback is not None:
      median_cw_to_bw_factor = median_fallback.cw_to_bw_factor()
    else:
      # this will never be used, because there are no fallbacks
      median_cw_to_bw_factor = None
    for f in self.fallbacks:
      f.set_measured_bandwidth(median_cw_to_bw_factor)
  # remove relays with low measured bandwidth from the fallback list
  # calculate_measured_bandwidth for each relay before calling this
  def remove_low_bandwidth_relays(self):
    if MIN_BANDWIDTH is None:
      return
    above_min_bw_fallbacks = []
    for f in self.fallbacks:
      if f._data['measured_bandwidth'] >= MIN_BANDWIDTH:
        above_min_bw_fallbacks.append(f)
      else:
        # the bandwidth we log here is limited by the relay's consensus weight
        # as well as its advertised bandwidth. See set_measured_bandwidth
        # for details
        log_excluded('%s not a candidate: bandwidth %.1fMByte/s too low, ' +
                     'must be at least %.1fMByte/s', f._fpr,
                     f._data['measured_bandwidth']/(1024.0*1024.0),
                     MIN_BANDWIDTH/(1024.0*1024.0))
    self.fallbacks = above_min_bw_fallbacks
  # the minimum fallback in the list
  # call one of the sort_fallbacks_* functions before calling this
  def fallback_min(self):
    if len(self.fallbacks) > 0:
      return self.fallbacks[-1]
    else:
      return None

  # the median fallback in the list
  # call one of the sort_fallbacks_* functions before calling this
  def fallback_median(self, require_advertised_bandwidth):
    # use the low-median when there are an even number of fallbacks,
    # for consistency with the bandwidth authorities
    if len(self.fallbacks) > 0:
      median_position = (len(self.fallbacks) - 1) / 2
      if not require_advertised_bandwidth:
        return self.fallbacks[median_position]
      # if we need advertised_bandwidth but this relay doesn't have it,
      # move to a fallback with greater consensus weight until we find one
      while not self.fallbacks[median_position]._data['advertised_bandwidth']:
        median_position += 1
        if median_position >= len(self.fallbacks):
          return None
      return self.fallbacks[median_position]
    else:
      return None
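  # For example, with 6 fallbacks, (6 - 1) / 2 is 2 under python 2 integer
  # division, selecting the lower of the two middle elements (indices 2 and
  # 3), which matches the bandwidth authorities' low-median convention.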
1533 # the maximum fallback in the list
1534 # call one of the sort_fallbacks_* functions before calling this
1535 def fallback_max(self):
1536 if len(self.fallbacks) > 0:
1537 return self.fallbacks[0]
1538 else:
1539 return None
1541 # return a new bag suitable for storing attributes
1542 @staticmethod
1543 def attribute_new():
1544 return dict()
1546 # get the count of attribute in attribute_bag
1547 # if attribute is None or the empty string, return 0
1548 @staticmethod
1549 def attribute_count(attribute, attribute_bag):
1550 if attribute is None or attribute == '':
1551 return 0
1552 if attribute not in attribute_bag:
1553 return 0
1554 return attribute_bag[attribute]
1556 # does attribute_bag contain max_count or more instances of attribute?
1557 # if so, return False
1558 # if not, return True
1559 # if attribute is None or the empty string, or max_count is invalid,
1560 # always return True
1561 @staticmethod
1562 def attribute_allow(attribute, attribute_bag, max_count=1):
1563 if attribute is None or attribute == '' or max_count <= 0:
1564 return True
1565 elif CandidateList.attribute_count(attribute, attribute_bag) >= max_count:
1566 return False
1567 else:
1568 return True
1570 # add attribute to attribute_bag, incrementing the count if it is already
1571 # present
1572 # if attribute is None or the empty string, or count is invalid,
1573 # do nothing
1574 @staticmethod
1575 def attribute_add(attribute, attribute_bag, count=1):
1576 if attribute is None or attribute == '' or count <= 0:
1577 return
1578 attribute_bag.setdefault(attribute, 0)
1579 attribute_bag[attribute] += count
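# a usage sketch of the attribute bag helpers above, with a hypothetical
# address and the default max_count of 1:
#   bag = CandidateList.attribute_new()               # {}
#   CandidateList.attribute_allow('203.0.113.1', bag) # True: count 0 < 1
#   CandidateList.attribute_add('203.0.113.1', bag)   # {'203.0.113.1': 1}
#   CandidateList.attribute_allow('203.0.113.1', bag) # False: count 1 >= 1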
1581 # make sure there are only MAX_FALLBACKS_PER_IP fallbacks per IPv4 address,
1582 # and per IPv6 address
1583 # there is only one IPv4 address on each fallback: the IPv4 DirPort address
1584 # (we choose the IPv4 ORPort which is on the same IPv4 as the DirPort)
1585 # there is at most one IPv6 address on each fallback: the IPv6 ORPort address
1586 # we try to match the IPv4 ORPort, but will use any IPv6 address if needed
1587 # (clients only use the IPv6 ORPort)
1588 # if there is no IPv6 address, only the IPv4 address is checked
1589 # return the number of candidates we excluded
1590 def limit_fallbacks_same_ip(self):
1591 ip_limit_fallbacks = []
1592 ip_list = CandidateList.attribute_new()
1593 for f in self.fallbacks:
1594 if (CandidateList.attribute_allow(f.dirip, ip_list,
1595 MAX_FALLBACKS_PER_IPV4)
1596 and CandidateList.attribute_allow(f.ipv6addr, ip_list,
1597 MAX_FALLBACKS_PER_IPV6)):
1598 ip_limit_fallbacks.append(f)
1599 CandidateList.attribute_add(f.dirip, ip_list)
1600 if f.has_ipv6():
1601 CandidateList.attribute_add(f.ipv6addr, ip_list)
1602 elif not CandidateList.attribute_allow(f.dirip, ip_list,
1603 MAX_FALLBACKS_PER_IPV4):
1604 log_excluded('Eliminated %s: already have %d fallback(s) on IPv4 %s'
1605 %(f._fpr, CandidateList.attribute_count(f.dirip, ip_list),
1606 f.dirip))
1607 elif (f.has_ipv6() and
1608 not CandidateList.attribute_allow(f.ipv6addr, ip_list,
1609 MAX_FALLBACKS_PER_IPV6)):
1610 log_excluded('Eliminated %s: already have %d fallback(s) on IPv6 %s'
1611 %(f._fpr, CandidateList.attribute_count(f.ipv6addr,
1612 ip_list),
1613 f.ipv6addr))
1614 original_count = len(self.fallbacks)
1615 self.fallbacks = ip_limit_fallbacks
1616 return original_count - len(self.fallbacks)
1618 # make sure there are only MAX_FALLBACKS_PER_CONTACT fallbacks for each
1619 # ContactInfo
1620 # if there is no ContactInfo, allow the fallback
1621 # this check can be gamed by providing no ContactInfo, or by setting the
1622 # ContactInfo to match another fallback
1623 # However, given the likelihood that relays with the same ContactInfo will
1624 # go down at similar times, its usefulness outweighs the risk
1625 def limit_fallbacks_same_contact(self):
1626 contact_limit_fallbacks = []
1627 contact_list = CandidateList.attribute_new()
1628 for f in self.fallbacks:
1629 if CandidateList.attribute_allow(f._data['contact'], contact_list,
1630 MAX_FALLBACKS_PER_CONTACT):
1631 contact_limit_fallbacks.append(f)
1632 CandidateList.attribute_add(f._data['contact'], contact_list)
1633 else:
1634 log_excluded(
1635 'Eliminated %s: already have %d fallback(s) on ContactInfo %s'
1636 %(f._fpr, CandidateList.attribute_count(f._data['contact'],
1637 contact_list),
1638 f._data['contact']))
1639 original_count = len(self.fallbacks)
1640 self.fallbacks = contact_limit_fallbacks
1641 return original_count - len(self.fallbacks)
1643 # make sure there are only MAX_FALLBACKS_PER_FAMILY fallbacks per effective
1644 # family
1645 # if there is no family, allow the fallback
1646 # we use effective family, which ensures mutual family declarations
1647 # but the check can be gamed by not declaring a family at all
1648 # if any indirect families exist, the result depends on the order in which
1649 # fallbacks are sorted in the list
1650 def limit_fallbacks_same_family(self):
1651 family_limit_fallbacks = []
1652 fingerprint_list = CandidateList.attribute_new()
1653 for f in self.fallbacks:
1654 if CandidateList.attribute_allow(f._fpr, fingerprint_list,
1655 MAX_FALLBACKS_PER_FAMILY):
1656 family_limit_fallbacks.append(f)
1657 CandidateList.attribute_add(f._fpr, fingerprint_list)
1658 for family_fingerprint in f._data['effective_family']:
1659 CandidateList.attribute_add(family_fingerprint, fingerprint_list)
1660 else:
1661 # we already have an accepted fallback that lists this relay in its
1662 # effective family
1663 log_excluded(
1664 'Eliminated %s: already have %d fallback(s) in effective family'
1665 %(f._fpr, CandidateList.attribute_count(f._fpr, fingerprint_list)))
1666 original_count = len(self.fallbacks)
1667 self.fallbacks = family_limit_fallbacks
1668 return original_count - len(self.fallbacks)
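# for example, assuming MAX_FALLBACKS_PER_FAMILY is 1: if A's effective
# family lists B, and B's lists A and C, but A's does not list C, then
# processing A first accepts A, blocks B, and still accepts C; processing
# B first accepts B and blocks both A and C, hence the order-dependence
# noted above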
1670 # try once to get the descriptors for fingerprint_list using stem
1671 # returns an empty list on exception
1672 @staticmethod
1673 def get_fallback_descriptors_once(fingerprint_list):
1674 desc_list = get_server_descriptors(fingerprints=fingerprint_list).run(suppress=True)
1675 return desc_list
1677 # try up to max_retries times to get the descriptors for fingerprint_list
1678 # using stem. Stops retrying when all descriptors have been retrieved.
1679 # returns a list containing the descriptors that were retrieved
1680 @staticmethod
1681 def get_fallback_descriptors(fingerprint_list, max_retries=5):
1682 # we can't use stem's retries=, because we want to support more than 96
1683 # descriptors
1685 # add an attempt for every MAX_FINGERPRINTS (or part thereof) in the list
1686 max_retries += (len(fingerprint_list) + MAX_FINGERPRINTS - 1) / MAX_FINGERPRINTS
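# e.g. assuming stem's MAX_FINGERPRINTS is 96, a 200-fingerprint list adds
# (200 + 96 - 1) / 96 = 3 attempts, for 5 + 3 = 8 attempts in total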
1687 remaining_list = list(fingerprint_list) # copy, so we don't mutate the caller's list
1688 desc_list = []
1689 for _ in xrange(max_retries):
1690 if len(remaining_list) == 0:
1691 break
1692 new_desc_list = CandidateList.get_fallback_descriptors_once(remaining_list[0:MAX_FINGERPRINTS])
1693 for d in new_desc_list:
1694 try:
1695 remaining_list.remove(d.fingerprint)
1696 except ValueError:
1697 # warn and ignore if a directory mirror returned a bad descriptor
1698 logging.warning("Directory mirror returned unwanted descriptor %s, ignoring",
1699 d.fingerprint)
1700 continue
1701 desc_list.append(d)
1702 return desc_list
1704 # find the fallbacks that cache extra-info documents
1705 # Onionoo doesn't know this, so we have to use stem
1706 def mark_extra_info_caches(self):
1707 fingerprint_list = [ f._fpr for f in self.fallbacks ]
1708 logging.info("Downloading fallback descriptors to find extra-info caches")
1709 desc_list = CandidateList.get_fallback_descriptors(fingerprint_list)
1710 for d in desc_list:
1711 self[d.fingerprint]._extra_info_cache = d.extra_info_cache
1712 missing_descriptor_list = [ f._fpr for f in self.fallbacks
1713 if f._extra_info_cache is None ]
1714 for f in missing_descriptor_list:
1715 logging.warning("No descriptor for {}. Assuming extrainfo=0.".format(f))
1717 # try a download check on each fallback candidate in order
1718 # stop after max_count successful downloads
1719 # but don't remove any candidates from the array
1720 def try_download_consensus_checks(self, max_count):
1721 dl_ok_count = 0
1722 for f in self.fallbacks:
1723 f.try_fallback_download_consensus()
1724 if f.get_fallback_download_consensus():
1725 # this fallback downloaded a consensus ok
1726 dl_ok_count += 1
1727 if dl_ok_count >= max_count:
1728 # we have enough fallbacks
1729 return
1731 # put max_count successful candidates in the fallbacks array:
1732 # - perform download checks on each fallback candidate
1733 # - retry failed candidates if CONSENSUS_DOWNLOAD_RETRY is set
1734 # - eliminate failed candidates
1735 # - if there are more than max_count candidates, eliminate lowest bandwidth
1736 # - if there are fewer than max_count candidates, leave only successful
1737 # Return the number of fallbacks that failed the consensus check
1738 def perform_download_consensus_checks(self, max_count):
1739 self.sort_fallbacks_by_measured_bandwidth()
1740 self.try_download_consensus_checks(max_count)
1741 if CONSENSUS_DOWNLOAD_RETRY:
1742 # try unsuccessful candidates again
1743 # we could end up with more than max_count successful candidates here
1744 self.try_download_consensus_checks(max_count)
1745 # now we have at least max_count successful candidates,
1746 # or we've tried them all
1747 original_count = len(self.fallbacks)
1748 self.fallbacks = filter(lambda x: x.get_fallback_download_consensus(),
1749 self.fallbacks)
1750 # some of these failed the check; others skipped the check because
1751 # we already had enough successful downloads
1752 failed_count = original_count - len(self.fallbacks)
1753 self.fallbacks = self.fallbacks[:max_count]
1754 return failed_count
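# for example, with 210 candidates and max_count = 200: if 205 end up with
# a successful consensus download, failed_count = 210 - 205 = 5 (failures,
# plus any candidates skipped once enough had succeeded), and the list is
# truncated to the 200 highest-bandwidth successes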
1756 # return a string that describes a/b as a percentage
1757 @staticmethod
1758 def describe_percentage(a, b):
1759 if b != 0:
1760 return '%d/%d = %.0f%%'%(a, b, (a*100.0)/b)
1761 else:
1762 # technically, 0/0 is undefined, but 0.0% is a sensible result
1763 return '%d/%d = %.0f%%'%(a, b, 0.0)
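# e.g. describe_percentage(25, 100) returns '25/100 = 25%', and
# describe_percentage(0, 0) returns '0/0 = 0%'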
1765 # return a dictionary of lists of fallbacks by IPv4 netblock
1766 # the dictionary is keyed by the fingerprint of an arbitrary fallback
1767 # in each netblock
1768 # mask_bits is the size of the netblock
1769 def fallbacks_by_ipv4_netblock(self, mask_bits):
1770 netblocks = {}
1771 for f in self.fallbacks:
1772 found_netblock = False
1773 for b in netblocks.keys():
1774 # we found an existing netblock containing this fallback
1775 if f.ipv4_netblocks_equal(self[b], mask_bits):
1776 # add it to the list
1777 netblocks[b].append(f)
1778 found_netblock = True
1779 break
1780 # make a new netblock based on this fallback's fingerprint
1781 if not found_netblock:
1782 netblocks[f._fpr] = [f]
1783 return netblocks
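# a sketch, assuming ipv4_netblocks_equal compares both addresses masked to
# mask_bits: with mask_bits = 24, fallbacks at 192.0.2.10 and 192.0.2.200
# share a /24, so both land in one list, keyed by the fingerprint of
# whichever fallback was seen first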
1785 # return a dictionary of lists of fallbacks by IPv6 netblock
1786 # where mask_bits is the size of the netblock
1787 def fallbacks_by_ipv6_netblock(self, mask_bits):
1788 netblocks = {}
1789 for f in self.fallbacks:
1790 # skip fallbacks without IPv6 addresses
1791 if not f.has_ipv6():
1792 continue
1793 found_netblock = False
1794 for b in netblocks.keys():
1795 # we found an existing netblock containing this fallback
1796 if f.ipv6_netblocks_equal(self[b], mask_bits):
1797 # add it to the list
1798 netblocks[b].append(f)
1799 found_netblock = True
1800 break
1801 # make a new netblock based on this fallback's fingerprint
1802 if not found_netblock:
1803 netblocks[f._fpr] = [f]
1804 return netblocks
1806 # log a message about the proportion of fallbacks in each IPv4 netblock,
1807 # where mask_bits is the size of the netblock
1808 def describe_fallback_ipv4_netblock_mask(self, mask_bits):
1809 fallback_count = len(self.fallbacks)
1810 shared_netblock_fallback_count = 0
1811 most_frequent_netblock = None
1812 netblocks = self.fallbacks_by_ipv4_netblock(mask_bits)
1813 for b in netblocks.keys():
1814 if len(netblocks[b]) > 1:
1815 # how many fallbacks are in a netblock with other fallbacks?
1816 shared_netblock_fallback_count += len(netblocks[b])
1817 # what's the netblock with the most fallbacks?
1818 if (most_frequent_netblock is None
1819 or len(netblocks[b]) > len(netblocks[most_frequent_netblock])):
1820 most_frequent_netblock = b
1821 logging.debug('Fallback IPv4 addresses in the same /%d:'%(mask_bits))
1822 for f in netblocks[b]:
1823 logging.debug('%s - %s', f.dirip, f._fpr)
1824 if most_frequent_netblock is not None:
1825 logging.warning('There are %s fallbacks in the IPv4 /%d containing %s'%(
1826 CandidateList.describe_percentage(
1827 len(netblocks[most_frequent_netblock]),
1828 fallback_count),
1829 mask_bits,
1830 self[most_frequent_netblock].dirip))
1831 if shared_netblock_fallback_count > 0:
1832 logging.warning(('%s of fallbacks are in an IPv4 /%d with other ' +
1833 'fallbacks')%(CandidateList.describe_percentage(
1834 shared_netblock_fallback_count,
1835 fallback_count),
1836 mask_bits))
1838 # log a message about the proportion of fallbacks in each IPv6 netblock,
1839 # where mask_bits is the size of the netblock
1840 def describe_fallback_ipv6_netblock_mask(self, mask_bits):
1841 fallback_count = len(self.fallbacks_with_ipv6())
1842 shared_netblock_fallback_count = 0
1843 most_frequent_netblock = None
1844 netblocks = self.fallbacks_by_ipv6_netblock(mask_bits)
1845 for b in netblocks.keys():
1846 if len(netblocks[b]) > 1:
1847 # how many fallbacks are in a netblock with other fallbacks?
1848 shared_netblock_fallback_count += len(netblocks[b])
1849 # what's the netblock with the most fallbacks?
1850 if (most_frequent_netblock is None
1851 or len(netblocks[b]) > len(netblocks[most_frequent_netblock])):
1852 most_frequent_netblock = b
1853 logging.debug('Fallback IPv6 addresses in the same /%d:'%(mask_bits))
1854 for f in netblocks[b]:
1855 logging.debug('%s - %s', f.ipv6addr, f._fpr)
1856 if most_frequent_netblock is not None:
1857 logging.warning('There are %s fallbacks in the IPv6 /%d containing %s'%(
1858 CandidateList.describe_percentage(
1859 len(netblocks[most_frequent_netblock]),
1860 fallback_count),
1861 mask_bits,
1862 self[most_frequent_netblock].ipv6addr))
1863 if shared_netblock_fallback_count > 0:
1864 logging.warning(('%s of fallbacks are in an IPv6 /%d with other ' +
1865 'fallbacks')%(CandidateList.describe_percentage(
1866 shared_netblock_fallback_count,
1867 fallback_count),
1868 mask_bits))
1870 # log a message about the proportion of fallbacks in each IPv4 /8, /16,
1871 # and /24
1872 def describe_fallback_ipv4_netblocks(self):
1873 # this doesn't actually tell us anything useful
1874 #self.describe_fallback_ipv4_netblock_mask(8)
1875 self.describe_fallback_ipv4_netblock_mask(16)
1876 #self.describe_fallback_ipv4_netblock_mask(24)
1878 # log a message about the proportion of fallbacks in each IPv6 /12 (RIR),
1879 # /23 (smaller RIR blocks), /32 (LIR), /48 (Customer), and /64 (Host)
1880 # https://www.iana.org/assignments/ipv6-unicast-address-assignments/
1881 def describe_fallback_ipv6_netblocks(self):
1882 # these don't actually tell us anything useful
1883 #self.describe_fallback_ipv6_netblock_mask(12)
1884 #self.describe_fallback_ipv6_netblock_mask(23)
1885 self.describe_fallback_ipv6_netblock_mask(32)
1886 #self.describe_fallback_ipv6_netblock_mask(48)
1887 self.describe_fallback_ipv6_netblock_mask(64)
1889 # log a message about the proportion of fallbacks in each IPv4 and IPv6
1890 # netblock
1891 def describe_fallback_netblocks(self):
1892 self.describe_fallback_ipv4_netblocks()
1893 self.describe_fallback_ipv6_netblocks()
1895 # return a list of fallbacks which are on the IPv4 ORPort port
1896 def fallbacks_on_ipv4_orport(self, port):
1897 return filter(lambda x: x.orport == port, self.fallbacks)
1899 # return a list of fallbacks which are on the IPv6 ORPort port
1900 def fallbacks_on_ipv6_orport(self, port):
1901 return filter(lambda x: x.ipv6orport == port, self.fallbacks_with_ipv6())
1903 # return a list of fallbacks which are on the DirPort port
1904 def fallbacks_on_dirport(self, port):
1905 return filter(lambda x: x.dirport == port, self.fallbacks)
1907 # log a message about the proportion of fallbacks on IPv4 ORPort port
1908 # and return that count
1909 def describe_fallback_ipv4_orport(self, port):
1910 port_count = len(self.fallbacks_on_ipv4_orport(port))
1911 fallback_count = len(self.fallbacks)
1912 logging.warning('%s of fallbacks are on IPv4 ORPort %d'%(
1913 CandidateList.describe_percentage(port_count,
1914 fallback_count),
1915 port))
1916 return port_count
1918 # log a message about the proportion of IPv6 fallbacks on IPv6 ORPort port
1919 # and return that count
1920 def describe_fallback_ipv6_orport(self, port):
1921 port_count = len(self.fallbacks_on_ipv6_orport(port))
1922 fallback_count = len(self.fallbacks_with_ipv6())
1923 logging.warning('%s of IPv6 fallbacks are on IPv6 ORPort %d'%(
1924 CandidateList.describe_percentage(port_count,
1925 fallback_count),
1926 port))
1927 return port_count
1929 # log a message about the proportion of fallbacks on DirPort port
1930 # and return that count
1931 def describe_fallback_dirport(self, port):
1932 port_count = len(self.fallbacks_on_dirport(port))
1933 fallback_count = len(self.fallbacks)
1934 logging.warning('%s of fallbacks are on DirPort %d'%(
1935 CandidateList.describe_percentage(port_count,
1936 fallback_count),
1937 port))
1938 return port_count
1940 # log a message about the proportion of fallbacks on each dirport,
1941 # each IPv4 orport, and each IPv6 orport
1942 def describe_fallback_ports(self):
1943 fallback_count = len(self.fallbacks)
1944 ipv4_or_count = fallback_count
1945 ipv4_or_count -= self.describe_fallback_ipv4_orport(443)
1946 ipv4_or_count -= self.describe_fallback_ipv4_orport(9001)
1947 logging.warning('%s of fallbacks are on other IPv4 ORPorts'%(
1948 CandidateList.describe_percentage(ipv4_or_count,
1949 fallback_count)))
1950 ipv6_fallback_count = len(self.fallbacks_with_ipv6())
1951 ipv6_or_count = ipv6_fallback_count
1952 ipv6_or_count -= self.describe_fallback_ipv6_orport(443)
1953 ipv6_or_count -= self.describe_fallback_ipv6_orport(9001)
1954 logging.warning('%s of IPv6 fallbacks are on other IPv6 ORPorts'%(
1955 CandidateList.describe_percentage(ipv6_or_count,
1956 ipv6_fallback_count)))
1957 dir_count = fallback_count
1958 dir_count -= self.describe_fallback_dirport(80)
1959 dir_count -= self.describe_fallback_dirport(9030)
1960 logging.warning('%s of fallbacks are on other DirPorts'%(
1961 CandidateList.describe_percentage(dir_count,
1962 fallback_count)))
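# for example, with 100 fallbacks, 60 on IPv4 ORPort 443 and 30 on IPv4
# ORPort 9001, the remaining 100 - 60 - 30 = 10 are reported as
# "10/100 = 10% of fallbacks are on other IPv4 ORPorts"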
1964 # return a list of fallbacks which cache extra-info documents
1965 def fallbacks_with_extra_info_cache(self):
1966 return filter(lambda x: x._extra_info_cache, self.fallbacks)
1968 # log a message about the proportion of fallbacks that cache extra-info docs
1969 def describe_fallback_extra_info_caches(self):
1970 extra_info_fallback_count = len(self.fallbacks_with_extra_info_cache())
1971 fallback_count = len(self.fallbacks)
1972 logging.warning('%s of fallbacks cache extra-info documents'%(
1973 CandidateList.describe_percentage(extra_info_fallback_count,
1974 fallback_count)))
1976 # return a list of fallbacks which have the Exit flag
1977 def fallbacks_with_exit(self):
1978 return filter(lambda x: x.is_exit(), self.fallbacks)
1980 # log a message about the proportion of fallbacks with an Exit flag
1981 def describe_fallback_exit_flag(self):
1982 exit_fallback_count = len(self.fallbacks_with_exit())
1983 fallback_count = len(self.fallbacks)
1984 logging.warning('%s of fallbacks have the Exit flag'%(
1985 CandidateList.describe_percentage(exit_fallback_count,
1986 fallback_count)))
1988 # return a list of fallbacks which have an IPv6 address
1989 def fallbacks_with_ipv6(self):
1990 return filter(lambda x: x.has_ipv6(), self.fallbacks)
1992 # log a message about the proportion of fallbacks on IPv6
1993 def describe_fallback_ip_family(self):
1994 ipv6_fallback_count = len(self.fallbacks_with_ipv6())
1995 fallback_count = len(self.fallbacks)
1996 logging.warning('%s of fallbacks are on IPv6'%(
1997 CandidateList.describe_percentage(ipv6_fallback_count,
1998 fallback_count)))
2000 def summarise_fallbacks(self, eligible_count, operator_count, failed_count,
2001 guard_count, target_count):
2002 s = ''
2003 # Report:
2004 # whether we checked consensus download times
2005 # the number of fallback directories (and limits/exclusions, if relevant)
2006 # min & max fallback bandwidths
2007 # #error if below minimum count
2008 if PERFORM_IPV4_DIRPORT_CHECKS or PERFORM_IPV6_DIRPORT_CHECKS:
2009 s += '/* Checked %s%s%s DirPorts served a consensus within %.1fs. */'%(
2010 'IPv4' if PERFORM_IPV4_DIRPORT_CHECKS else '',
2011 ' and ' if (PERFORM_IPV4_DIRPORT_CHECKS
2012 and PERFORM_IPV6_DIRPORT_CHECKS) else '',
2013 'IPv6' if PERFORM_IPV6_DIRPORT_CHECKS else '',
2014 CONSENSUS_DOWNLOAD_SPEED_MAX)
2015 else:
2016 s += '/* Did not check IPv4 or IPv6 DirPort consensus downloads. */'
2017 s += '\n'
2018 # Multiline C comment with #error if things go bad
2019 s += '/*'
2020 s += '\n'
2021 # Integers don't need escaping in C comments
2022 fallback_count = len(self.fallbacks)
2023 if FALLBACK_PROPORTION_OF_GUARDS is None:
2024 fallback_proportion = ''
2025 else:
2026 fallback_proportion = ', Target %d (%d * %.2f)'%(target_count,
2027 guard_count,
2028 FALLBACK_PROPORTION_OF_GUARDS)
2029 s += 'Final Count: %d (Eligible %d%s'%(fallback_count, eligible_count,
2030 fallback_proportion)
2031 if MAX_FALLBACK_COUNT is not None:
2032 s += ', Max %d'%(MAX_FALLBACK_COUNT)
2033 s += ')\n'
2034 if eligible_count != fallback_count:
2035 removed_count = eligible_count - fallback_count
2036 excess_to_target_or_max = (eligible_count - operator_count - failed_count
2037 - fallback_count)
2038 # some 'Failed' failed the check; others 'Skipped' the check because
2039 # we already had enough successful downloads
2040 s += ('Excluded: %d (Same Operator %d, Failed/Skipped Download %d, ' +
2041 'Excess %d)')%(removed_count, operator_count, failed_count,
2042 excess_to_target_or_max)
2043 s += '\n'
2044 min_fb = self.fallback_min()
2045 min_bw = min_fb._data['measured_bandwidth']
2046 max_fb = self.fallback_max()
2047 max_bw = max_fb._data['measured_bandwidth']
2048 s += 'Bandwidth Range: %.1f - %.1f MByte/s'%(min_bw/(1024.0*1024.0),
2049 max_bw/(1024.0*1024.0))
2050 s += '\n'
2051 s += '*/'
2052 if fallback_count < MIN_FALLBACK_COUNT:
2053 # We must have a minimum number of fallbacks so they are always
2054 # reachable, and are in diverse locations
2055 s += '\n'
2056 s += '#error Fallback Count %d is too low. '%(fallback_count)
2057 s += 'Must be at least %d for diversity. '%(MIN_FALLBACK_COUNT)
2058 s += 'Try adding entries to the whitelist, '
2059 s += 'or setting INCLUDE_UNLISTED_ENTRIES = True.'
2060 return s
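# a sketch of the summary this returns, with hypothetical counts and
# constants:
#   /* Checked IPv4 DirPorts served a consensus within 15.0s. */
#   /*
#   Final Count: 150 (Eligible 200, Target 160 (800 * 0.20), Max 200)
#   Excluded: 50 (Same Operator 20, Failed/Skipped Download 25, Excess 5)
#   Bandwidth Range: 1.2 - 107.3 MByte/s
#   */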
2062 def process_existing():
2063 logging.basicConfig(level=logging.INFO)
2064 logging.getLogger('stem').setLevel(logging.INFO)
2065 whitelist = {'data': parse_fallback_file(FALLBACK_FILE_NAME),
2066 'name': FALLBACK_FILE_NAME}
2067 list_fallbacks(whitelist)
2069 def process_default():
2070 logging.basicConfig(level=logging.WARNING)
2071 logging.getLogger('stem').setLevel(logging.WARNING)
2072 whitelist = {'data': read_from_file(WHITELIST_FILE_NAME, MAX_LIST_FILE_SIZE),
2073 'name': WHITELIST_FILE_NAME}
2074 list_fallbacks(whitelist)
2076 ## Main Function
2077 def main():
2078 if get_command() == 'check_existing':
2079 process_existing()
2080 else:
2081 process_default()
2083 def get_command():
2084 if len(sys.argv) == 2:
2085 return sys.argv[1]
2086 else:
2087 return None
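# e.g. running "updateFallbackDirs.py check_existing" makes get_command()
# return 'check_existing', so process_existing() runs; with no argument it
# returns None and process_default() runs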
2089 def log_excluded(msg, *args):
2090 if get_command() == 'check_existing':
2091 logging.warning(msg, *args)
2092 else:
2093 logging.info(msg, *args)
2095 def list_fallbacks(whitelist):
2096 """ Fetches required onionoo documents and evaluates the
2097 fallback directory criteria for each of the relays """
2099 print "/* type=fallback */"
2100 print ("/* version={} */"
2101 .format(cleanse_c_multiline_comment(FALLBACK_FORMAT_VERSION)))
2102 now = datetime.datetime.utcnow()
2103 timestamp = now.strftime('%Y%m%d%H%M%S')
2104 print ("/* timestamp={} */"
2105 .format(cleanse_c_multiline_comment(timestamp)))
2106 # end the header with a separator, to make it easier for parsers
2107 print SECTION_SEPARATOR_COMMENT
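# with the constants above, the header printed so far looks like this
# (timestamp hypothetical):
#   /* type=fallback */
#   /* version=2.0.0 */
#   /* timestamp=20190226120000 */
#   /* ===== */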
2109 logging.warning('Downloading and parsing Onionoo data. ' +
2110 'This may take some time.')
2111 # find relays that could be fallbacks
2112 candidates = CandidateList()
2113 candidates.add_relays()
2115 # work out how many fallbacks we want
2116 guard_count = candidates.count_guards()
2117 if FALLBACK_PROPORTION_OF_GUARDS is None:
2118 target_count = guard_count
2119 else:
2120 target_count = int(guard_count * FALLBACK_PROPORTION_OF_GUARDS)
2121 # the maximum number of fallbacks is the lesser of:
2122 # - the target fallback count (FALLBACK_PROPORTION_OF_GUARDS * guard count)
2123 # - the maximum fallback count (MAX_FALLBACK_COUNT)
2124 if MAX_FALLBACK_COUNT is None:
2125 max_count = target_count
2126 else:
2127 max_count = min(target_count, MAX_FALLBACK_COUNT)
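# e.g. with hypothetical values guard_count = 800,
# FALLBACK_PROPORTION_OF_GUARDS = 0.2 and MAX_FALLBACK_COUNT = 200:
# target_count = int(800 * 0.2) = 160, max_count = min(160, 200) = 160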
2129 candidates.compute_fallbacks()
2130 prefilter_fallbacks = copy.copy(candidates.fallbacks)
2132 # filter with the whitelist
2133 # if a relay has changed IPv4 address or ports recently, it will be excluded
2134 # as ineligible before we call apply_filter_lists, and so there will be no
2135 # warning that the details have changed from those in the whitelist.
2136 # instead, there will be an info-level log during the eligibility check.
2137 initial_count = len(candidates.fallbacks)
2138 excluded_count = candidates.apply_filter_lists(whitelist)
2139 print candidates.summarise_filters(initial_count, excluded_count)
2140 eligible_count = len(candidates.fallbacks)
2142 # calculate the measured bandwidth of each relay,
2143 # then remove low-bandwidth relays
2144 candidates.calculate_measured_bandwidth()
2145 candidates.remove_low_bandwidth_relays()
2147 # print the raw fallback list
2148 #for x in candidates.fallbacks:
2149 # print x.fallbackdir_line(True)
2150 # print json.dumps(candidates[x]._data, sort_keys=True, indent=4,
2151 # separators=(',', ': '), default=json_util.default)
2153 # impose mandatory conditions here, like one per contact, family, IP
2154 # in measured bandwidth order
2155 candidates.sort_fallbacks_by_measured_bandwidth()
2156 operator_count = 0
2157 # only impose these limits on the final list - operators can nominate
2158 # multiple candidate fallbacks, and then we choose the best set
2159 if not OUTPUT_CANDIDATES:
2160 operator_count += candidates.limit_fallbacks_same_ip()
2161 operator_count += candidates.limit_fallbacks_same_contact()
2162 operator_count += candidates.limit_fallbacks_same_family()
2164 # check if each candidate can serve a consensus
2165 # there's a small risk we've eliminated relays from the same operator that
2166 # can serve a consensus, in favour of one that can't
2167 # but given it takes up to 15 seconds to check each consensus download,
2168 # the risk is worth it
2169 if PERFORM_IPV4_DIRPORT_CHECKS or PERFORM_IPV6_DIRPORT_CHECKS:
2170 logging.warning('Checking consensus download speeds. ' +
2171 'This may take some time.')
2172 failed_count = candidates.perform_download_consensus_checks(max_count)
2174 # work out which fallbacks cache extra-infos
2175 candidates.mark_extra_info_caches()
2177 # analyse and log interesting diversity metrics
2178 # like netblock, ports, exit, IPv4-only
2179 # (we can't easily analyse AS, and it's hard to accurately analyse country)
2180 candidates.describe_fallback_ip_family()
2181 # if we can't import the ipaddress module, we can't do netblock analysis
2182 if HAVE_IPADDRESS:
2183 candidates.describe_fallback_netblocks()
2184 candidates.describe_fallback_ports()
2185 candidates.describe_fallback_extra_info_caches()
2186 candidates.describe_fallback_exit_flag()
2188 # output C comments summarising the fallback selection process
2189 if len(candidates.fallbacks) > 0:
2190 print candidates.summarise_fallbacks(eligible_count, operator_count,
2191 failed_count, guard_count,
2192 target_count)
2193 else:
2194 print '/* No Fallbacks met criteria */'
2196 # output C comments specifying the Onionoo data used to create the list
2197 for s in fetch_source_list():
2198 print describe_fetch_source(s)
2200 # start the list with a separator, to make it easy for parsers
2201 print SECTION_SEPARATOR_COMMENT
2203 # sort the list differently depending on why we've created it:
2204 # if we're outputting the final fallback list, sort by fingerprint
2205 # this makes diffs much more stable
2206 # otherwise, if we're trying to find a bandwidth cutoff, or we want to
2207 # contact operators in priority order, sort by bandwidth (not yet
2208 # implemented)
2209 # otherwise, if we're contacting operators, sort by contact
2210 candidates.sort_fallbacks_by(OUTPUT_SORT_FIELD)
2212 for x in candidates.fallbacks:
2213 print x.fallbackdir_line(candidates.fallbacks, prefilter_fallbacks)
2215 if __name__ == "__main__":
2216 main()