6 # scripts/maint/updateFallbackDirs.py > src/or/fallback_dirs.inc 2> fallback_dirs.log
8 # Check the existing list:
9 # scripts/maint/updateFallbackDirs.py check_existing > fallback_dirs.inc.ok 2> fallback_dirs.log
10 # mv fallback_dirs.inc.ok src/or/fallback_dirs.inc
12 # This script should be run from a stable, reliable network connection,
13 # with no other network activity (and not over tor).
14 # If this is not possible, please disable:
15 # PERFORM_IPV4_DIRPORT_CHECKS and PERFORM_IPV6_DIRPORT_CHECKS
17 # Needs dateutil, stem, and potentially other python packages.
18 # Optionally uses ipaddress (python 3 builtin) or py2-ipaddress (package)
19 # for netblock analysis.
21 # Then read the logs to make sure the fallbacks aren't dominated by a single
24 # Script by weasel, April 2015
25 # Portions by gsathya & karsten, 2013
26 # https://trac.torproject.org/projects/tor/attachment/ticket/8374/dir_list.2.py
27 # Modifications by teor, 2015
41 import dateutil
.parser
42 # bson_lazy provides bson
43 #from bson import json_util
47 from stem
.descriptor
import DocumentHandler
48 from stem
.descriptor
.remote
import get_consensus
, get_server_descriptors
, MAX_FINGERPRINTS
51 logging
.root
.name
= ''
53 HAVE_IPADDRESS
= False
55 # python 3 builtin, or install package py2-ipaddress
56 # there are several ipaddress implementations for python 2
57 # with slightly different semantics with str typed text
58 # fortunately, all our IP addresses are in unicode
62 # if this happens, we avoid doing netblock analysis
63 logging
.warning('Unable to import ipaddress, please install py2-ipaddress.' +
64 ' A fallback list will be created, but optional netblock' +
65 ' analysis will not be performed.')
67 ## Top-Level Configuration
69 # We use semantic versioning: https://semver.org
71 # * major changes include removing a mandatory field, or anything else that
72 # would break an appropriately tolerant parser,
73 # * minor changes include adding a field,
74 # * patch changes include changing header comments or other unstructured
76 FALLBACK_FORMAT_VERSION
= '2.0.0'
77 SECTION_SEPARATOR_BASE
= '====='
78 SECTION_SEPARATOR_COMMENT
= '/* ' + SECTION_SEPARATOR_BASE
+ ' */'
80 # Output all candidate fallbacks, or only output selected fallbacks?
81 OUTPUT_CANDIDATES
= False
83 # Perform DirPort checks over IPv4?
84 # Change this to False if IPv4 doesn't work for you, or if you don't want to
85 # download a consensus for each fallback
86 # Don't check ~1000 candidates when OUTPUT_CANDIDATES is True
87 PERFORM_IPV4_DIRPORT_CHECKS
= False if OUTPUT_CANDIDATES
else True
89 # Perform DirPort checks over IPv6?
90 # If you know IPv6 works for you, set this to True
91 # This will exclude IPv6 relays without an IPv6 DirPort configured
92 # So it's best left at False until #18394 is implemented
93 # Don't check ~1000 candidates when OUTPUT_CANDIDATES is True
94 PERFORM_IPV6_DIRPORT_CHECKS
= False if OUTPUT_CANDIDATES
else False
96 # Must relays be running now?
97 MUST_BE_RUNNING_NOW
= (PERFORM_IPV4_DIRPORT_CHECKS
98 or PERFORM_IPV6_DIRPORT_CHECKS
)
100 # Clients have been using microdesc consensuses by default for a while now
101 DOWNLOAD_MICRODESC_CONSENSUS
= True
103 # If a relay delivers an expired consensus, if it expired less than this many
104 # seconds ago, we still allow the relay. This should never be less than -90,
105 # as all directory mirrors should have downloaded a consensus 90 minutes
106 # before it expires. It should never be more than 24 hours, because clients
107 # reject consensuses that are older than REASONABLY_LIVE_TIME.
108 # For the consensus expiry check to be accurate, the machine running this
109 # script needs an accurate clock.
111 # Relays on 0.3.0 and later return a 404 when they are about to serve an
112 # expired consensus. This makes them fail the download check.
113 # We use a tolerance of 0, so that 0.2.x series relays also fail the download
114 # check if they serve an expired consensus.
115 CONSENSUS_EXPIRY_TOLERANCE
= 0
117 # Output fallback name, flags, bandwidth, and ContactInfo in a C comment?
118 OUTPUT_COMMENTS
= True if OUTPUT_CANDIDATES
else False
120 # Output matching ContactInfo in fallbacks list or the blacklist?
121 # Useful if you're trying to contact operators
122 CONTACT_COUNT
= True if OUTPUT_CANDIDATES
else False
123 CONTACT_BLACKLIST_COUNT
= True if OUTPUT_CANDIDATES
else False
125 # How the list should be sorted:
126 # fingerprint: is useful for stable diffs of fallback lists
127 # measured_bandwidth: is useful when pruning the list based on bandwidth
128 # contact: is useful for contacting operators once the list has been pruned
129 OUTPUT_SORT_FIELD
= 'contact' if OUTPUT_CANDIDATES
else 'fingerprint'
133 ONIONOO
= 'https://onionoo.torproject.org/'
134 #ONIONOO = 'https://onionoo.thecthulhu.com/'
136 # Don't bother going out to the Internet, just use the files available locally,
137 # even if they're very old
138 LOCAL_FILES_ONLY
= False
140 ## Whitelist / Blacklist Filter Settings
142 # The whitelist contains entries that are included if all attributes match
143 # (IPv4, dirport, orport, id, and optionally IPv6 and IPv6 orport)
144 # The blacklist contains (partial) entries that are excluded if any
145 # sufficiently specific group of attributes matches:
151 # If neither port is included in the blacklist, the entire IP address is
154 # What happens to entries in neither list?
155 # When True, they are included, when False, they are excluded
156 INCLUDE_UNLISTED_ENTRIES
= True if OUTPUT_CANDIDATES
else False
158 # If an entry is in both lists, what happens?
159 # When True, it is excluded, when False, it is included
160 BLACKLIST_EXCLUDES_WHITELIST_ENTRIES
= True
162 WHITELIST_FILE_NAME
= 'scripts/maint/fallback.whitelist'
163 BLACKLIST_FILE_NAME
= 'scripts/maint/fallback.blacklist'
164 FALLBACK_FILE_NAME
= 'src/or/fallback_dirs.inc'
166 # The number of bytes we'll read from a filter file before giving up
167 MAX_LIST_FILE_SIZE
= 1024 * 1024
169 ## Eligibility Settings
171 # Require fallbacks to have the same address and port for a set amount of time
172 # We used to have this at 1 week, but that caused many fallback failures, which
173 # meant that we had to rebuild the list more often. We want fallbacks to be
174 # stable for 2 years, so we set it to a few months.
176 # If a relay changes address or port, that's it, it's not useful any more,
177 # because clients can't find it
178 ADDRESS_AND_PORT_STABLE_DAYS
= 90
179 # We ignore relays that have been down for more than this period
180 MAX_DOWNTIME_DAYS
= 0 if MUST_BE_RUNNING_NOW
else 7
181 # FallbackDirs must have a time-weighted-fraction that is greater than or
183 # Mirrors that are down half the time are still useful half the time
186 # Guard flags are removed for some time after a relay restarts, so we ignore
189 # FallbackDirs must have a time-weighted-fraction that is less than or equal
191 # .00 means no bad exits
192 PERMITTED_BADEXIT
= .00
194 # older entries' weights are adjusted with ALPHA^(age in days)
197 # this factor is used to scale OnionOO entries to [0,1]
198 ONIONOO_SCALE_ONE
= 999.
200 ## Fallback Count Limits
202 # The target for these parameters is 20% of the guards in the network
203 # This is around 200 as of October 2015
205 FALLBACK_PROPORTION_OF_GUARDS
= None if OUTPUT_CANDIDATES
else _FB_POG
207 # Limit the number of fallbacks (eliminating lowest by advertised bandwidth)
208 MAX_FALLBACK_COUNT
= None if OUTPUT_CANDIDATES
else 200
209 # Emit a C #error if the number of fallbacks is less than expected
210 MIN_FALLBACK_COUNT
= 0 if OUTPUT_CANDIDATES
else MAX_FALLBACK_COUNT
*0.5
212 # The maximum number of fallbacks on the same address, contact, or family
214 # With 150 fallbacks, this means each operator sees 5% of client bootstraps.
216 # - We try to limit guard and exit operators to 5% of the network
217 # - The directory authorities used to see 11% of client bootstraps each
219 # We also don't want too much of the list to go down if a single operator
220 # has to move all their relays.
221 MAX_FALLBACKS_PER_IP
= 1
222 MAX_FALLBACKS_PER_IPV4
= MAX_FALLBACKS_PER_IP
223 MAX_FALLBACKS_PER_IPV6
= MAX_FALLBACKS_PER_IP
224 MAX_FALLBACKS_PER_CONTACT
= 7
225 MAX_FALLBACKS_PER_FAMILY
= 7
227 ## Fallback Bandwidth Requirements
229 # Any fallback with the Exit flag has its bandwidth multiplied by this fraction
230 # to make sure we aren't further overloading exits
231 # (Set to 1.0, because we asked that only lightly loaded exits opt-in,
232 # and the extra load really isn't that much for large relays.)
233 EXIT_BANDWIDTH_FRACTION
= 1.0
235 # If a single fallback's bandwidth is too low, it's pointless adding it
236 # We expect fallbacks to handle an extra 10 kilobytes per second of traffic
237 # Make sure they can support fifty times the expected extra load
239 # We convert this to a consensus weight before applying the filter,
240 # because all the bandwidth amounts are specified by the relay
241 MIN_BANDWIDTH
= 50.0 * 10.0 * 1024.0
243 # Clients will time out after 30 seconds trying to download a consensus
244 # So allow fallback directories half that to deliver a consensus
245 # The exact download times might change based on the network connection
246 # running this script, but only by a few seconds
247 # There is also about a second of python overhead
248 CONSENSUS_DOWNLOAD_SPEED_MAX
= 15.0
249 # If the relay fails a consensus check, retry the download
250 # This avoids delisting a relay due to transient network conditions
251 CONSENSUS_DOWNLOAD_RETRY
= True
256 return datetime
.datetime
.strptime(t
, "%Y-%m-%d %H:%M:%S")
def remove_bad_chars(raw_string, bad_char_list):
  """Return raw_string with every entry of bad_char_list stripped out.

  Entries may be single characters or multi-character substrings
  (callers pass sequences like '/*'), so each one is removed with
  str.replace rather than per-character filtering."""
  result = raw_string
  for bad in bad_char_list:
    result = result.replace(bad, '')
  return result
265 def cleanse_unprintable(raw_string
):
266 # Remove all unprintable characters
269 if c
in string
.printable
:
271 return cleansed_string
def cleanse_whitespace(raw_string):
  """Return raw_string with every whitespace character (as defined by
  string.whitespace) replaced by a single space.

  Note this maps character-for-character, so e.g. '\r\n' becomes two
  spaces, matching the original replace-per-character behaviour."""
  return ''.join(' ' if ch in string.whitespace else ch
                 for ch in raw_string)
280 def cleanse_c_multiline_comment(raw_string
):
281 cleansed_string
= raw_string
282 # Embedded newlines should be removed by tor/onionoo, but let's be paranoid
283 cleansed_string
= cleanse_whitespace(cleansed_string
)
284 # ContactInfo and Version can be arbitrary binary data
285 cleansed_string
= cleanse_unprintable(cleansed_string
)
286 # Prevent a malicious / unanticipated string from breaking out
287 # of a C-style multiline comment
288 # This removes '/*' and '*/' and '//'
290 # Prevent a malicious string from using C nulls
291 bad_char_list
+= '\0'
292 # Avoid confusing parsers by making sure there is only one comma per fallback
294 # Avoid confusing parsers by making sure there is only one equals per field
296 # Be safer by removing bad characters entirely
297 cleansed_string
= remove_bad_chars(cleansed_string
, bad_char_list
)
298 # Some compilers may further process the content of comments
299 # There isn't much we can do to cover every possible case
300 # But comment-based directives are typically only advisory
301 return cleansed_string
303 def cleanse_c_string(raw_string
):
304 cleansed_string
= raw_string
305 # Embedded newlines should be removed by tor/onionoo, but let's be paranoid
306 cleansed_string
= cleanse_whitespace(cleansed_string
)
307 # ContactInfo and Version can be arbitrary binary data
308 cleansed_string
= cleanse_unprintable(cleansed_string
)
309 # Prevent a malicious address/fingerprint string from breaking out
310 # of a C-style string
312 # Prevent a malicious string from using escapes
313 bad_char_list
+= '\\'
314 # Prevent a malicious string from using C nulls
315 bad_char_list
+= '\0'
316 # Avoid confusing parsers by making sure there is only one comma per fallback
318 # Avoid confusing parsers by making sure there is only one equals per field
320 # Be safer by removing bad characters entirely
321 cleansed_string
= remove_bad_chars(cleansed_string
, bad_char_list
)
322 # Some compilers may further process the content of strings
323 # There isn't much we can do to cover every possible case
324 # But this typically only results in changes to the string data
325 return cleansed_string
327 ## OnionOO Source Functions
329 # a dictionary of source metadata for each onionoo query we've made
332 # register source metadata for 'what'
333 # assumes we only retrieve one document for each 'what'
def register_fetch_source(what, url, relays_published, version):
  """Record source metadata for the onionoo query 'what'.

  Stores the query URL, the consensus publication time reported by
  onionoo, and the onionoo protocol version in the module-level
  fetch_source dictionary. Assumes one document per 'what'; a repeat
  registration for the same 'what' overwrites the previous entry."""
  fetch_source[what] = {
      'url': url,
      'relays_published': relays_published,
      'version': version,
  }
340 # list each registered source's 'what'
def fetch_source_list():
  """Return every registered source's 'what' key, in sorted order."""
  # iterating the dict yields its keys, same as .keys()
  return sorted(fetch_source)
344 # given 'what', provide a multiline C comment describing the source
345 def describe_fetch_source(what
):
348 desc
+= 'Onionoo Source: '
349 desc
+= cleanse_c_multiline_comment(what
)
351 desc
+= cleanse_c_multiline_comment(fetch_source
[what
]['relays_published'])
353 desc
+= cleanse_c_multiline_comment(fetch_source
[what
]['version'])
356 desc
+= cleanse_c_multiline_comment(fetch_source
[what
]['url'])
361 ## File Processing Functions
363 def write_to_file(str, file_name
, max_len
):
365 with
open(file_name
, 'w') as f
:
366 f
.write(str[0:max_len
])
367 except EnvironmentError, error
:
368 logging
.error('Writing file %s failed: %d: %s'%
374 def read_from_file(file_name
, max_len
):
376 if os
.path
.isfile(file_name
):
377 with
open(file_name
, 'r') as f
:
378 return f
.read(max_len
)
379 except EnvironmentError, error
:
380 logging
.info('Loading file %s failed: %d: %s'%
387 def parse_fallback_file(file_name
):
388 file_data
= read_from_file(file_name
, MAX_LIST_FILE_SIZE
)
389 file_data
= cleanse_unprintable(file_data
)
390 file_data
= remove_bad_chars(file_data
, '\n"\0')
391 file_data
= re
.sub('/\*.*?\*/', '', file_data
)
392 file_data
= file_data
.replace(',', '\n')
393 file_data
= file_data
.replace(' weight=10', '')
396 def load_possibly_compressed_response_json(response
):
397 if response
.info().get('Content-Encoding') == 'gzip':
398 buf
= StringIO
.StringIO( response
.read() )
399 f
= gzip
.GzipFile(fileobj
=buf
)
402 return json
.load(response
)
404 def load_json_from_file(json_file_name
):
405 # An exception here may be resolved by deleting the .last_modified
406 # and .json files, and re-running the script
408 with
open(json_file_name
, 'r') as f
:
410 except EnvironmentError, error
:
411 raise Exception('Reading not-modified json file %s failed: %d: %s'%
419 def datestr_to_datetime(datestr
):
420 # Parse datetimes like: Fri, 02 Oct 2015 13:34:14 GMT
421 if datestr
is not None:
422 dt
= dateutil
.parser
.parse(datestr
)
424 # Never modified - use start of epoch
425 dt
= datetime
.datetime
.utcfromtimestamp(0)
426 # strip any timezone out (in case they're supported in future)
427 dt
= dt
.replace(tzinfo
=None)
430 def onionoo_fetch(what
, **kwargs
):
432 params
['type'] = 'relay'
433 #params['limit'] = 10
434 params
['first_seen_days'] = '%d-'%(ADDRESS_AND_PORT_STABLE_DAYS)
435 params
['last_seen_days'] = '-%d'%(MAX_DOWNTIME_DAYS)
436 params
['flag'] = 'V2Dir'
437 url
= ONIONOO
+ what
+ '?' + urllib
.urlencode(params
)
439 # Unfortunately, the URL is too long for some OS filenames,
440 # but we still don't want to get files from different URLs mixed up
441 base_file_name
= what
+ '-' + hashlib
.sha1(url
).hexdigest()
443 full_url_file_name
= base_file_name
+ '.full_url'
444 MAX_FULL_URL_LENGTH
= 1024
446 last_modified_file_name
= base_file_name
+ '.last_modified'
447 MAX_LAST_MODIFIED_LENGTH
= 64
449 json_file_name
= base_file_name
+ '.json'
452 # Read from the local file, don't write to anything
453 response_json
= load_json_from_file(json_file_name
)
455 # store the full URL to a file for debugging
456 # no need to compare as long as you trust SHA-1
457 write_to_file(url
, full_url_file_name
, MAX_FULL_URL_LENGTH
)
459 request
= urllib2
.Request(url
)
460 request
.add_header('Accept-encoding', 'gzip')
462 # load the last modified date from the file, if it exists
463 last_mod_date
= read_from_file(last_modified_file_name
,
464 MAX_LAST_MODIFIED_LENGTH
)
465 if last_mod_date
is not None:
466 request
.add_header('If-modified-since', last_mod_date
)
468 # Parse last modified date
469 last_mod
= datestr_to_datetime(last_mod_date
)
471 # Not Modified and still recent enough to be useful
472 # Onionoo / Globe used to use 6 hours, but we can afford a day
473 required_freshness
= datetime
.datetime
.utcnow()
474 # strip any timezone out (to match dateutil.parser)
475 required_freshness
= required_freshness
.replace(tzinfo
=None)
476 required_freshness
-= datetime
.timedelta(hours
=24)
478 # Make the OnionOO request
481 response
= urllib2
.urlopen(request
)
482 response_code
= response
.getcode()
483 except urllib2
.HTTPError
, error
:
484 response_code
= error
.code
485 if response_code
== 304: # not modified
488 raise Exception("Could not get " + url
+ ": "
489 + str(error
.code
) + ": " + error
.reason
)
491 if response_code
== 200: # OK
492 last_mod
= datestr_to_datetime(response
.info().get('Last-Modified'))
494 # Check for freshness
495 if last_mod
< required_freshness
:
496 if last_mod_date
is not None:
497 # This check sometimes fails transiently, retry the script if it does
498 date_message
= "Outdated data: last updated " + last_mod_date
500 date_message
= "No data: never downloaded "
501 raise Exception(date_message
+ " from " + url
)
504 if response_code
== 200: # OK
506 response_json
= load_possibly_compressed_response_json(response
)
508 with
open(json_file_name
, 'w') as f
:
509 # use the most compact json representation to save space
510 json
.dump(response_json
, f
, separators
=(',',':'))
512 # store the last modified date in its own file
513 if response
.info().get('Last-modified') is not None:
514 write_to_file(response
.info().get('Last-Modified'),
515 last_modified_file_name
,
516 MAX_LAST_MODIFIED_LENGTH
)
518 elif response_code
== 304: # Not Modified
520 response_json
= load_json_from_file(json_file_name
)
522 else: # Unexpected HTTP response code not covered in the HTTPError above
523 raise Exception("Unexpected HTTP response code to " + url
+ ": "
524 + str(response_code
))
526 register_fetch_source(what
,
528 response_json
['relays_published'],
529 response_json
['version'])
533 def fetch(what
, **kwargs
):
534 #x = onionoo_fetch(what, **kwargs)
535 # don't use sort_keys, as the order of or_addresses is significant
536 #print json.dumps(x, indent=4, separators=(',', ': '))
539 return onionoo_fetch(what
, **kwargs
)
541 ## Fallback Candidate Class
543 class Candidate(object):
544 CUTOFF_ADDRESS_AND_PORT_STABLE
= (datetime
.datetime
.utcnow()
545 - datetime
.timedelta(ADDRESS_AND_PORT_STABLE_DAYS
))
547 def __init__(self
, details
):
548 for f
in ['fingerprint', 'nickname', 'last_changed_address_or_port',
549 'consensus_weight', 'or_addresses', 'dir_address']:
550 if not f
in details
: raise Exception("Document has no %s field."%(f
,))
552 if not 'contact' in details
:
553 details
['contact'] = None
554 if not 'flags' in details
or details
['flags'] is None:
555 details
['flags'] = []
556 if (not 'advertised_bandwidth' in details
557 or details
['advertised_bandwidth'] is None):
558 # relays without advertised bandwidth have it calculated from their
560 details
['advertised_bandwidth'] = 0
561 if (not 'effective_family' in details
562 or details
['effective_family'] is None):
563 details
['effective_family'] = []
564 if not 'platform' in details
:
565 details
['platform'] = None
566 details
['last_changed_address_or_port'] = parse_ts(
567 details
['last_changed_address_or_port'])
569 self
._stable
_sort
_or
_addresses
()
571 self
._fpr
= self
._data
['fingerprint']
572 self
._running
= self
._guard
= self
._v
2dir
= 0.
573 self
._split
_dirport
()
574 self
._compute
_orport
()
575 if self
.orport
is None:
576 raise Exception("Failed to get an orport for %s."%(self
._fpr
,))
577 self
._compute
_ipv
6addr
()
578 if not self
.has_ipv6():
579 logging
.debug("Failed to get an ipv6 address for %s."%(self
._fpr
,))
580 self
._compute
_version
()
581 self
._extra
_info
_cache
= None
583 def _stable_sort_or_addresses(self
):
584 # replace self._data['or_addresses'] with a stable ordering,
585 # sorting the secondary addresses in string order
586 # leave the received order in self._data['or_addresses_raw']
587 self
._data
['or_addresses_raw'] = self
._data
['or_addresses']
588 or_address_primary
= self
._data
['or_addresses'][:1]
589 # subsequent entries in the or_addresses array are in an arbitrary order
590 # so we stabilise the addresses by sorting them in string order
591 or_addresses_secondaries_stable
= sorted(self
._data
['or_addresses'][1:])
592 or_addresses_stable
= or_address_primary
+ or_addresses_secondaries_stable
593 self
._data
['or_addresses'] = or_addresses_stable
595 def get_fingerprint(self
):
598 # is_valid_ipv[46]_address by gsathya, karsten, 2013
600 def is_valid_ipv4_address(address
):
601 if not isinstance(address
, (str, unicode)):
604 # check if there are four period separated values
605 if address
.count(".") != 3:
608 # checks that each value in the octet are decimal values between 0-255
609 for entry
in address
.split("."):
610 if not entry
.isdigit() or int(entry
) < 0 or int(entry
) > 255:
612 elif entry
[0] == "0" and len(entry
) > 1:
613 return False # leading zeros, for instance in "1.2.3.001"
618 def is_valid_ipv6_address(address
):
619 if not isinstance(address
, (str, unicode)):
623 address
= address
[1:-1]
625 # addresses are made up of eight colon separated groups of four hex digits
626 # with leading zeros being optional
627 # https://en.wikipedia.org/wiki/IPv6#Address_format
629 colon_count
= address
.count(":")
632 return False # too many groups
633 elif colon_count
!= 7 and not "::" in address
:
634 return False # not enough groups and none are collapsed
635 elif address
.count("::") > 1 or ":::" in address
:
636 return False # multiple groupings of zeros can't be collapsed
638 found_ipv4_on_previous_entry
= False
639 for entry
in address
.split(":"):
640 # If an IPv6 address has an embedded IPv4 address,
641 # it must be the last entry
642 if found_ipv4_on_previous_entry
:
644 if not re
.match("^[0-9a-fA-f]{0,4}$", entry
):
645 if not Candidate
.is_valid_ipv4_address(entry
):
648 found_ipv4_on_previous_entry
= True
652 def _split_dirport(self
):
653 # Split the dir_address into dirip and dirport
654 (self
.dirip
, _dirport
) = self
._data
['dir_address'].split(':', 2)
655 self
.dirport
= int(_dirport
)
657 def _compute_orport(self
):
658 # Choose the first ORPort that's on the same IPv4 address as the DirPort.
659 # In rare circumstances, this might not be the primary ORPort address.
660 # However, _stable_sort_or_addresses() ensures we choose the same one
661 # every time, even if onionoo changes the order of the secondaries.
662 self
._split
_dirport
()
664 for i
in self
._data
['or_addresses']:
665 if i
!= self
._data
['or_addresses'][0]:
666 logging
.debug('Secondary IPv4 Address Used for %s: %s'%(self
._fpr
, i
))
667 (ipaddr
, port
) = i
.rsplit(':', 1)
668 if (ipaddr
== self
.dirip
) and Candidate
.is_valid_ipv4_address(ipaddr
):
669 self
.orport
= int(port
)
672 def _compute_ipv6addr(self
):
673 # Choose the first IPv6 address that uses the same port as the ORPort
674 # Or, choose the first IPv6 address in the list
675 # _stable_sort_or_addresses() ensures we choose the same IPv6 address
676 # every time, even if onionoo changes the order of the secondaries.
678 self
.ipv6orport
= None
679 # Choose the first IPv6 address that uses the same port as the ORPort
680 for i
in self
._data
['or_addresses']:
681 (ipaddr
, port
) = i
.rsplit(':', 1)
682 if (port
== self
.orport
) and Candidate
.is_valid_ipv6_address(ipaddr
):
683 self
.ipv6addr
= ipaddr
684 self
.ipv6orport
= int(port
)
686 # Choose the first IPv6 address in the list
687 for i
in self
._data
['or_addresses']:
688 (ipaddr
, port
) = i
.rsplit(':', 1)
689 if Candidate
.is_valid_ipv6_address(ipaddr
):
690 self
.ipv6addr
= ipaddr
691 self
.ipv6orport
= int(port
)
694 def _compute_version(self
):
695 # parse the version out of the platform string
696 # The platform looks like: "Tor 0.2.7.6 on Linux"
697 self
._data
['version'] = None
698 if self
._data
['platform'] is None:
700 # be tolerant of weird whitespacing, use a whitespace split
701 tokens
= self
._data
['platform'].split()
703 vnums
= token
.split('.')
704 # if it's at least a.b.c.d, with potentially an -alpha-dev, -alpha, -rc
705 if (len(vnums
) >= 4 and vnums
[0].isdigit() and vnums
[1].isdigit() and
707 self
._data
['version'] = token
711 # bug #20499 affects versions from 0.2.9.1-alpha-dev to 0.2.9.4-alpha-dev
712 # and version 0.3.0.0-alpha-dev
713 # Exhaustive lists are hard to get wrong
714 STALE_CONSENSUS_VERSIONS
= ['0.2.9.1-alpha-dev',
724 def is_valid_version(self
):
725 # call _compute_version before calling this
726 # is the version of the relay a version we want as a fallback?
727 # checks both recommended versions and bug #20499 / #20509
729 # if the relay doesn't have a recommended version field, exclude the relay
730 if not self
._data
.has_key('recommended_version'):
731 log_excluded('%s not a candidate: no recommended_version field',
734 if not self
._data
['recommended_version']:
735 log_excluded('%s not a candidate: version not recommended', self
._fpr
)
737 # if the relay doesn't have version field, exclude the relay
738 if not self
._data
.has_key('version'):
739 log_excluded('%s not a candidate: no version field', self
._fpr
)
741 if self
._data
['version'] in Candidate
.STALE_CONSENSUS_VERSIONS
:
742 logging
.warning('%s not a candidate: version delivers stale consensuses',
748 def _extract_generic_history(history
, which
='unknown'):
749 # given a tree like this:
753 # "factor": 0.001001001001001001,
754 # "first": "2015-02-27 06:00:00",
756 # "last": "2015-03-30 06:00:00",
764 # "factor": 0.001001001001001001,
765 # "first": "2015-03-23 07:30:00",
767 # "last": "2015-03-30 07:30:00",
772 # "factor": 0.001001001001001001,
773 # "first": "2014-04-11 00:00:00",
774 # "interval": 172800,
775 # "last": "2015-03-29 00:00:00",
780 # "factor": 0.001001001001001001,
781 # "first": "2014-12-28 06:00:00",
783 # "last": "2015-03-30 06:00:00",
787 # extract exactly one piece of data per time interval,
788 # using smaller intervals where available.
790 # returns list of (age, length, value) dictionaries.
794 periods
= history
.keys()
795 periods
.sort(key
= lambda x
: history
[x
]['interval'])
796 now
= datetime
.datetime
.utcnow()
800 interval
= datetime
.timedelta(seconds
= h
['interval'])
801 this_ts
= parse_ts(h
['last'])
803 if (len(h
['values']) != h
['count']):
804 logging
.warning('Inconsistent value count in %s document for %s'
806 for v
in reversed(h
['values']):
807 if (this_ts
<= newest
):
810 agetmp1
= (agt1
.microseconds
+ (agt1
.seconds
+ agt1
.days
* 24 * 3600)
812 agetmp2
= (agt2
.microseconds
+ (agt2
.seconds
+ agt2
.days
* 24 * 3600)
814 generic_history
.append(
822 if (this_ts
+ interval
!= parse_ts(h
['first'])):
823 logging
.warning('Inconsistent time information in %s document for %s'
826 #print json.dumps(generic_history, sort_keys=True,
827 # indent=4, separators=(',', ': '))
828 return generic_history
831 def _avg_generic_history(generic_history
):
833 for i
in generic_history
:
834 if i
['age'] > (ADDRESS_AND_PORT_STABLE_DAYS
* 24 * 3600):
836 if (i
['length'] is not None
837 and i
['age'] is not None
838 and i
['value'] is not None):
839 w
= i
['length'] * math
.pow(AGE_ALPHA
, i
['age']/(3600*24))
840 a
.append( (i
['value'] * w
, w
) )
842 sv
= math
.fsum(map(lambda x
: x
[0], a
))
843 sw
= math
.fsum(map(lambda x
: x
[1], a
))
851 def _add_generic_history(self
, history
):
852 periods
= r
['read_history'].keys()
853 periods
.sort(key
= lambda x
: r
['read_history'][x
]['interval'] )
857 def add_running_history(self
, history
):
860 def add_uptime(self
, uptime
):
861 logging
.debug('Adding uptime %s.'%(self
._fpr
,))
863 # flags we care about: Running, V2Dir, Guard
864 if not 'flags' in uptime
:
865 logging
.debug('No flags in document for %s.'%(self
._fpr
,))
868 for f
in ['Running', 'Guard', 'V2Dir']:
869 if not f
in uptime
['flags']:
870 logging
.debug('No %s in flags for %s.'%(f
, self
._fpr
,))
873 running
= self
._extract
_generic
_history
(uptime
['flags']['Running'],
874 '%s-Running'%(self
._fpr
))
875 guard
= self
._extract
_generic
_history
(uptime
['flags']['Guard'],
876 '%s-Guard'%(self
._fpr
))
877 v2dir
= self
._extract
_generic
_history
(uptime
['flags']['V2Dir'],
878 '%s-V2Dir'%(self
._fpr
))
879 if 'BadExit' in uptime
['flags']:
880 badexit
= self
._extract
_generic
_history
(uptime
['flags']['BadExit'],
881 '%s-BadExit'%(self
._fpr
))
883 self
._running
= self
._avg
_generic
_history
(running
) / ONIONOO_SCALE_ONE
884 self
._guard
= self
._avg
_generic
_history
(guard
) / ONIONOO_SCALE_ONE
885 self
._v
2dir
= self
._avg
_generic
_history
(v2dir
) / ONIONOO_SCALE_ONE
887 if 'BadExit' in uptime
['flags']:
888 self
._badexit
= self
._avg
_generic
_history
(badexit
) / ONIONOO_SCALE_ONE
890 def is_candidate(self
):
892 if (MUST_BE_RUNNING_NOW
and not self
.is_running()):
893 log_excluded('%s not a candidate: not running now, unable to check ' +
894 'DirPort consensus download', self
._fpr
)
896 if (self
._data
['last_changed_address_or_port'] >
897 self
.CUTOFF_ADDRESS_AND_PORT_STABLE
):
898 log_excluded('%s not a candidate: changed address/port recently (%s)',
899 self
._fpr
, self
._data
['last_changed_address_or_port'])
901 if self
._running
< CUTOFF_RUNNING
:
902 log_excluded('%s not a candidate: running avg too low (%lf)',
903 self
._fpr
, self
._running
)
905 if self
._v
2dir
< CUTOFF_V2DIR
:
906 log_excluded('%s not a candidate: v2dir avg too low (%lf)',
907 self
._fpr
, self
._v
2dir
)
909 if self
._badexit
is not None and self
._badexit
> PERMITTED_BADEXIT
:
910 log_excluded('%s not a candidate: badexit avg too high (%lf)',
911 self
._fpr
, self
._badexit
)
913 # this function logs a message depending on which check fails
914 if not self
.is_valid_version():
916 if self
._guard
< CUTOFF_GUARD
:
917 log_excluded('%s not a candidate: guard avg too low (%lf)',
918 self
._fpr
, self
._guard
)
920 if (not self
._data
.has_key('consensus_weight')
921 or self
._data
['consensus_weight'] < 1):
922 log_excluded('%s not a candidate: consensus weight invalid', self
._fpr
)
924 except BaseException
as e
:
925 logging
.warning("Exception %s when checking if fallback is a candidate",
930 def is_in_whitelist(self
, relaylist
):
931 """ A fallback matches if each key in the whitelist line matches:
936 ipv6 address and port (if present)
937 If the fallback has an ipv6 key, the whitelist line must also have
938 it, and vice versa, otherwise they don't match. """
941 ipv6
= '%s:%d'%(self
.ipv6addr
, self
.ipv6orport
)
942 for entry
in relaylist
:
943 if entry
['id'] != self
._fpr
:
944 # can't log here unless we match an IP and port, because every relay's
945 # fingerprint is compared to every entry's fingerprint
946 if entry
['ipv4'] == self
.dirip
and int(entry
['orport']) == self
.orport
:
947 logging
.warning('%s excluded: has OR %s:%d changed fingerprint to ' +
948 '%s?', entry
['id'], self
.dirip
, self
.orport
,
950 if self
.has_ipv6() and entry
.has_key('ipv6') and entry
['ipv6'] == ipv6
:
951 logging
.warning('%s excluded: has OR %s changed fingerprint to ' +
952 '%s?', entry
['id'], ipv6
, self
._fpr
)
954 if entry
['ipv4'] != self
.dirip
:
955 logging
.warning('%s excluded: has it changed IPv4 from %s to %s?',
956 self
._fpr
, entry
['ipv4'], self
.dirip
)
958 if int(entry
['dirport']) != self
.dirport
:
959 logging
.warning('%s excluded: has it changed DirPort from %s:%d to ' +
960 '%s:%d?', self
._fpr
, self
.dirip
, int(entry
['dirport']),
961 self
.dirip
, self
.dirport
)
963 if int(entry
['orport']) != self
.orport
:
964 logging
.warning('%s excluded: has it changed ORPort from %s:%d to ' +
965 '%s:%d?', self
._fpr
, self
.dirip
, int(entry
['orport']),
966 self
.dirip
, self
.orport
)
968 if entry
.has_key('ipv6') and self
.has_ipv6():
969 # if both entry and fallback have an ipv6 address, compare them
970 if entry
['ipv6'] != ipv6
:
971 logging
.warning('%s excluded: has it changed IPv6 ORPort from %s ' +
972 'to %s?', self
._fpr
, entry
['ipv6'], ipv6
)
974 # if the fallback has an IPv6 address but the whitelist entry
975 # doesn't, or vice versa, the whitelist entry doesn't match
976 elif entry
.has_key('ipv6') and not self
.has_ipv6():
977 logging
.warning('%s excluded: has it lost its former IPv6 address %s?',
978 self
._fpr
, entry
['ipv6'])
980 elif not entry
.has_key('ipv6') and self
.has_ipv6():
981 logging
.warning('%s excluded: has it gained an IPv6 address %s?',
987 def is_in_blacklist(self
, relaylist
):
988 """ A fallback matches a blacklist line if a sufficiently specific group
989 of attributes matches:
995 If the fallback and the blacklist line both have an ipv6 key,
996 their values will be compared, otherwise, they will be ignored.
997 If there is no dirport and no orport, the entry matches all relays on
999 for entry
in relaylist
:
1002 if key
== 'id' and value
== self
._fpr
:
1003 log_excluded('%s is in the blacklist: fingerprint matches',
1006 if key
== 'ipv4' and value
== self
.dirip
:
1007 # if the dirport is present, check it too
1008 if entry
.has_key('dirport'):
1009 if int(entry
['dirport']) == self
.dirport
:
1010 log_excluded('%s is in the blacklist: IPv4 (%s) and ' +
1011 'DirPort (%d) match', self
._fpr
, self
.dirip
,
1014 # if the orport is present, check it too
1015 elif entry
.has_key('orport'):
1016 if int(entry
['orport']) == self
.orport
:
1017 log_excluded('%s is in the blacklist: IPv4 (%s) and ' +
1018 'ORPort (%d) match', self
._fpr
, self
.dirip
,
1022 log_excluded('%s is in the blacklist: IPv4 (%s) matches, and ' +
1023 'entry has no DirPort or ORPort', self
._fpr
,
1028 ipv6
= '%s:%d'%(self
.ipv6addr
, self
.ipv6orport
)
1029 if (key
== 'ipv6' and self
.has_ipv6()):
1030 # if both entry and fallback have an ipv6 address, compare them,
1031 # otherwise, disregard ipv6 addresses
1033 # if the dirport is present, check it too
1034 if entry
.has_key('dirport'):
1035 if int(entry
['dirport']) == self
.dirport
:
1036 log_excluded('%s is in the blacklist: IPv6 (%s) and ' +
1037 'DirPort (%d) match', self
._fpr
, ipv6
,
1040 # we've already checked the ORPort, it's part of entry['ipv6']
1042 log_excluded('%s is in the blacklist: IPv6 (%s) matches, and' +
1043 'entry has no DirPort', self
._fpr
, ipv6
)
1045 elif (key
== 'ipv6' or self
.has_ipv6()):
1046 # only log if the fingerprint matches but the IPv6 doesn't
1047 if entry
.has_key('id') and entry
['id'] == self
._fpr
:
1048 log_excluded('%s skipping IPv6 blacklist comparison: relay ' +
1049 'has%s IPv6%s, but entry has%s IPv6%s', self
._fpr
,
1050 '' if self
.has_ipv6() else ' no',
1051 (' (' + ipv6
+ ')') if self
.has_ipv6() else '',
1052 '' if key
== 'ipv6' else ' no',
1053 (' (' + value
+ ')') if key
== 'ipv6' else '')
1054 logging
.warning('Has %s %s IPv6 address %s?', self
._fpr
,
1055 'gained an' if self
.has_ipv6() else 'lost its former',
1056 ipv6
if self
.has_ipv6() else value
)
def cw_to_bw_factor(self):
    """Return advertised bandwidth divided by consensus weight.

    Relays with a missing or zero consensus weight are never candidates,
    and a missing advertised bandwidth has already been normalised to
    zero, so this division is safe for candidate relays.
    """
    data = self._data
    return data['advertised_bandwidth'] / data['consensus_weight']
1064 # since advertised_bandwidth is reported by the relay, it can be gamed
1065 # to avoid this, use the median consensus weight to bandwidth factor to
1066 # estimate this relay's measured bandwidth, and make that the upper limit
1067 def measured_bandwidth(self
, median_cw_to_bw_factor
):
1068 cw_to_bw
= median_cw_to_bw_factor
1069 # Reduce exit bandwidth to make sure we're not overloading them
1071 cw_to_bw
*= EXIT_BANDWIDTH_FRACTION
1072 measured_bandwidth
= self
._data
['consensus_weight'] * cw_to_bw
1073 if self
._data
['advertised_bandwidth'] != 0:
1074 # limit advertised bandwidth (if available) to measured bandwidth
1075 return min(measured_bandwidth
, self
._data
['advertised_bandwidth'])
1077 return measured_bandwidth
def set_measured_bandwidth(self, median_cw_to_bw_factor):
    """Compute this relay's capped measured bandwidth and cache it in _data."""
    estimate = self.measured_bandwidth(median_cw_to_bw_factor)
    self._data['measured_bandwidth'] = estimate
1084 return 'Exit' in self
._data
['flags']
1087 return 'Guard' in self
._data
['flags']
def is_running(self):
    """Does the consensus give this relay the Running flag?"""
    flags = self._data['flags']
    return 'Running' in flags
1092 # does this fallback have an IPv6 address and orport?
1094 return self
.ipv6addr
is not None and self
.ipv6orport
is not None
1096 # strip leading and trailing brackets from an IPv6 address
1097 # safe to use on non-bracketed IPv6 and on IPv4 addresses
1098 # also convert to unicode, and make None appear as ''
1100 def strip_ipv6_brackets(ip
):
1105 if ip
[0] == '[' and ip
[-1] == ']':
1106 return unicode(ip
[1:-1])
1109 # are ip_a and ip_b in the same netblock?
1110 # mask_bits is the size of the netblock
1111 # takes both IPv4 and IPv6 addresses
1112 # the versions of ip_a and ip_b must be the same
1113 # the mask must be valid for the IP version
1115 def netblocks_equal(ip_a
, ip_b
, mask_bits
):
1116 if ip_a
is None or ip_b
is None:
1118 ip_a
= Candidate
.strip_ipv6_brackets(ip_a
)
1119 ip_b
= Candidate
.strip_ipv6_brackets(ip_b
)
1120 a
= ipaddress
.ip_address(ip_a
)
1121 b
= ipaddress
.ip_address(ip_b
)
1122 if a
.version
!= b
.version
:
1123 raise Exception('Mismatching IP versions in %s and %s'%(ip_a
, ip_b
))
1124 if mask_bits
> a
.max_prefixlen
:
1125 logging
.error('Bad IP mask %d for %s and %s'%(mask_bits
, ip_a
, ip_b
))
1126 mask_bits
= a
.max_prefixlen
1128 logging
.error('Bad IP mask %d for %s and %s'%(mask_bits
, ip_a
, ip_b
))
1130 a_net
= ipaddress
.ip_network('%s/%d'%(ip_a
, mask_bits
), strict
=False)
# is this fallback's IPv4 address (dirip) in the same netblock as other's
# IPv4 address?
# mask_bits is the size of the netblock
def ipv4_netblocks_equal(self, other, mask_bits):
    """True when self.dirip and other.dirip share a /mask_bits netblock."""
    ours = self.dirip
    theirs = other.dirip
    return Candidate.netblocks_equal(ours, theirs, mask_bits)
1139 # is this fallback's IPv6 address (ipv6addr) in the same netblock as
1140 # other's IPv6 address?
1141 # Returns False if either fallback has no IPv6 address
1142 # mask_bits is the size of the netblock
1143 def ipv6_netblocks_equal(self
, other
, mask_bits
):
1144 if not self
.has_ipv6() or not other
.has_ipv6():
1146 return Candidate
.netblocks_equal(self
.ipv6addr
, other
.ipv6addr
, mask_bits
)
# is this fallback's IPv4 DirPort the same as other's IPv4 DirPort?
def dirport_equal(self, other):
    """True when both fallbacks advertise the same IPv4 DirPort."""
    return other.dirport == self.dirport
# is this fallback's IPv4 ORPort the same as other's IPv4 ORPort?
def ipv4_orport_equal(self, other):
    """True when both fallbacks advertise the same IPv4 ORPort."""
    return other.orport == self.orport
1156 # is this fallback's IPv6 ORPort the same as other's IPv6 ORPort?
1157 # Returns False if either fallback has no IPv6 address
1158 def ipv6_orport_equal(self
, other
):
1159 if not self
.has_ipv6() or not other
.has_ipv6():
1161 return self
.ipv6orport
== other
.ipv6orport
# does this fallback have the same DirPort, IPv4 ORPort, or
# IPv6 ORPort as other?
# Ignores IPv6 ORPort if either fallback has no IPv6 address
def port_equal(self, other):
    """True when self and other share any of their advertised ports."""
    if self.dirport_equal(other):
        return True
    if self.ipv4_orport_equal(other):
        return True
    return self.ipv6_orport_equal(other)
1170 # return a list containing IPv4 ORPort, DirPort, and IPv6 ORPort (if present)
1171 def port_list(self
):
1172 ports
= [self
.dirport
, self
.orport
]
1173 if self
.has_ipv6() and not self
.ipv6orport
in ports
:
1174 ports
.append(self
.ipv6orport
)
1177 # does this fallback share a port with other, regardless of whether the
1179 # For example, if self's IPv4 ORPort is 80 and other's DirPort is 80,
1181 def port_shared(self
, other
):
1182 for p
in self
.port_list():
1183 if p
in other
.port_list():
1187 # log how long it takes to download a consensus from dirip:dirport
1188 # returns True if the download failed, False if it succeeded within max_time
1190 def fallback_consensus_download_speed(dirip
, dirport
, nickname
, fingerprint
,
1192 download_failed
= False
1193 # some directory mirrors respond to requests in ways that hang python
1194 # sockets, which is why we log this line here
1195 logging
.info('Initiating %sconsensus download from %s (%s:%d) %s.',
1196 'microdesc ' if DOWNLOAD_MICRODESC_CONSENSUS
else '',
1197 nickname
, dirip
, dirport
, fingerprint
)
1198 # there appears to be about 1 second of overhead when comparing stem's
1199 # internal trace time and the elapsed time calculated here
1201 start
= datetime
.datetime
.utcnow()
1203 consensus
= get_consensus(
1204 endpoints
= [(dirip
, dirport
)],
1205 timeout
= (max_time
+ TIMEOUT_SLOP
),
1208 fall_back_to_authority
= False,
1209 document_handler
= DocumentHandler
.BARE_DOCUMENT
,
1210 microdescriptor
= DOWNLOAD_MICRODESC_CONSENSUS
1212 end
= datetime
.datetime
.utcnow()
1213 time_since_expiry
= (end
- consensus
.valid_until
).total_seconds()
1214 except Exception, stem_error
:
1215 end
= datetime
.datetime
.utcnow()
1216 log_excluded('Unable to retrieve a consensus from %s: %s', nickname
,
1218 status
= 'error: "%s"' % (stem_error
)
1219 level
= logging
.WARNING
1220 download_failed
= True
1221 elapsed
= (end
- start
).total_seconds()
1223 # keep the error failure status, and avoid using the variables
1225 elif elapsed
> max_time
:
1227 level
= logging
.WARNING
1228 download_failed
= True
1229 elif (time_since_expiry
> 0):
1230 status
= 'outdated consensus, expired %ds ago'%(int(time_since_expiry
))
1231 if time_since_expiry
<= CONSENSUS_EXPIRY_TOLERANCE
:
1232 status
+= ', tolerating up to %ds'%(CONSENSUS_EXPIRY_TOLERANCE)
1233 level
= logging
.INFO
1235 status
+= ', invalid'
1236 level
= logging
.WARNING
1237 download_failed
= True
1240 level
= logging
.DEBUG
1241 logging
.log(level
, 'Consensus download: %0.1fs %s from %s (%s:%d) %s, ' +
1242 'max download time %0.1fs.', elapsed
, status
, nickname
,
1243 dirip
, dirport
, fingerprint
, max_time
)
1244 return download_failed
1246 # does this fallback download the consensus fast enough?
1247 def check_fallback_download_consensus(self
):
1248 # include the relay if we're not doing a check, or we can't check (IPv6)
1251 if PERFORM_IPV4_DIRPORT_CHECKS
:
1252 ipv4_failed
= Candidate
.fallback_consensus_download_speed(self
.dirip
,
1254 self
._data
['nickname'],
1256 CONSENSUS_DOWNLOAD_SPEED_MAX
)
1257 if self
.has_ipv6() and PERFORM_IPV6_DIRPORT_CHECKS
:
1258 # Clients assume the IPv6 DirPort is the same as the IPv4 DirPort
1259 ipv6_failed
= Candidate
.fallback_consensus_download_speed(self
.ipv6addr
,
1261 self
._data
['nickname'],
1263 CONSENSUS_DOWNLOAD_SPEED_MAX
)
1264 return ((not ipv4_failed
) and (not ipv6_failed
))
# if this fallback has not passed a download check, try it again,
# and record the result, available in get_fallback_download_consensus
def try_fallback_download_consensus(self):
    """Re-run the consensus download check unless it has already passed."""
    already_passed = self.get_fallback_download_consensus()
    if not already_passed:
        self._data['download_check'] = self.check_fallback_download_consensus()
1272 # did this fallback pass the download check?
1273 def get_fallback_download_consensus(self
):
1274 # if we're not performing checks, return True
1275 if not PERFORM_IPV4_DIRPORT_CHECKS
and not PERFORM_IPV6_DIRPORT_CHECKS
:
1277 # if we are performing checks, but haven't done one, return False
1278 if not self
._data
.has_key('download_check'):
1280 return self
._data
['download_check']
1282 # output an optional header comment and info for this fallback
1283 # try_fallback_download_consensus before calling this
1284 def fallbackdir_line(self
, fallbacks
, prefilter_fallbacks
):
1287 s
+= self
.fallbackdir_comment(fallbacks
, prefilter_fallbacks
)
1288 # if the download speed is ok, output a C string
1289 # if it's not, but we OUTPUT_COMMENTS, output a commented-out C string
1290 if self
.get_fallback_download_consensus() or OUTPUT_COMMENTS
:
1291 s
+= self
.fallbackdir_info(self
.get_fallback_download_consensus())
1294 # output a header comment for this fallback
1295 def fallbackdir_comment(self
, fallbacks
, prefilter_fallbacks
):
1299 # adjusted bandwidth, consensus weight
1301 # [identical contact counts]
1303 # Multiline C comment
1306 s
+= cleanse_c_multiline_comment(self
._data
['nickname'])
1309 s
+= cleanse_c_multiline_comment(' '.join(sorted(self
._data
['flags'])))
1311 # this is an adjusted bandwidth, see calculate_measured_bandwidth()
1312 bandwidth
= self
._data
['measured_bandwidth']
1313 weight
= self
._data
['consensus_weight']
1314 s
+= 'Bandwidth: %.1f MByte/s, Consensus Weight: %d'%(
1315 bandwidth
/(1024.0*1024.0),
1318 if self
._data
['contact'] is not None:
1319 s
+= cleanse_c_multiline_comment(self
._data
['contact'])
1320 if CONTACT_COUNT
or CONTACT_BLACKLIST_COUNT
:
1321 fallback_count
= len([f
for f
in fallbacks
1322 if f
._data
['contact'] == self
._data
['contact']])
1323 if fallback_count
> 1:
1325 s
+= '%d identical contacts listed' % (fallback_count
)
1326 if CONTACT_BLACKLIST_COUNT
:
1327 prefilter_count
= len([f
for f
in prefilter_fallbacks
1328 if f
._data
['contact'] == self
._data
['contact']])
1329 filter_count
= prefilter_count
- fallback_count
1330 if filter_count
> 0:
1331 if fallback_count
> 1:
1335 s
+= '%d blacklisted' % (filter_count
)
1341 # output the fallback info C string for this fallback
1342 # this is the text that would go after FallbackDir in a torrc
1343 # if this relay failed the download test and we OUTPUT_COMMENTS,
1344 # comment-out the returned string
1345 def fallbackdir_info(self
, dl_speed_ok
):
1346 # "address:dirport orport=port id=fingerprint"
1347 # (insert additional madatory fields here)
1348 # "[ipv6=addr:orport]"
1349 # (insert additional optional fields here)
1350 # /* nickname=name */
1351 # /* extrainfo={0,1} */
1352 # (insert additional comment fields here)
1356 # Do we want a C string, or a commented-out string?
1357 c_string
= dl_speed_ok
1358 comment_string
= not dl_speed_ok
and OUTPUT_COMMENTS
1359 # If we don't want either kind of string, bail
1360 if not c_string
and not comment_string
:
1363 # Comment out the fallback directory entry if it's too slow
1364 # See the debug output for which address and port is failing
1366 s
+= '/* Consensus download failed or was too slow:\n'
1367 # Multi-Line C string with trailing comma (part of a string list)
1368 # This makes it easier to diff the file, and remove IPv6 lines using grep
1369 # Integers don't need escaping
1370 s
+= '"%s orport=%d id=%s"'%(
1371 cleanse_c_string(self
._data
['dir_address']),
1373 cleanse_c_string(self
._fpr
))
1375 # (insert additional madatory fields here)
1377 s
+= '" ipv6=%s:%d"'%(cleanse_c_string(self
.ipv6addr
), self
.ipv6orport
)
1379 # (insert additional optional fields here)
1380 if not comment_string
:
1382 s
+= 'nickname=%s'%(cleanse_c_string(self
._data
['nickname']))
1383 if not comment_string
:
1386 # if we know that the fallback is an extrainfo cache, flag it
1387 # and if we don't know, assume it is not
1388 if not comment_string
:
1390 s
+= 'extrainfo=%d'%(1 if self
._extra
_info
_cache
else 0)
1391 if not comment_string
:
1394 # (insert additional comment fields here)
1395 # The terminator and comma must be the last line in each fallback entry
1396 if not comment_string
:
1398 s
+= SECTION_SEPARATOR_BASE
1399 if not comment_string
:
1408 ## Fallback Candidate List Class
1410 class CandidateList(dict):
def _add_relay(self, details):
    """Wrap a relay's details document in a Candidate and index it by
    fingerprint.

    Relays whose details document has no 'dir_address' key are skipped.
    """
    if 'dir_address' not in details:
        return
    candidate = Candidate(details)
    self[candidate.get_fingerprint()] = candidate
1419 def _add_uptime(self
, uptime
):
1421 fpr
= uptime
['fingerprint']
1423 raise Exception("Document has no fingerprint field.")
1428 logging
.debug('Got unknown relay %s in uptime document.'%(fpr
,))
1431 c
.add_uptime(uptime
)
1433 def _add_details(self
):
1434 logging
.debug('Loading details document.')
1435 d
= fetch('details',
1436 fields
=('fingerprint,nickname,contact,last_changed_address_or_port,' +
1437 'consensus_weight,advertised_bandwidth,or_addresses,' +
1438 'dir_address,recommended_version,flags,effective_family,' +
1440 logging
.debug('Loading details document done.')
1442 if not 'relays' in d
: raise Exception("No relays found in document.")
1444 for r
in d
['relays']: self
._add
_relay
(r
)
1446 def _add_uptimes(self
):
1447 logging
.debug('Loading uptime document.')
1449 logging
.debug('Loading uptime document done.')
1451 if not 'relays' in d
: raise Exception("No relays found in document.")
1452 for r
in d
['relays']: self
._add
_uptime
(r
)
1454 def add_relays(self
):
1458 def count_guards(self
):
1460 for fpr
in self
.keys():
1461 if self
[fpr
].is_guard():
1465 # Find fallbacks that fit the uptime, stability, and flags criteria,
1466 # and make an array of them in self.fallbacks
1467 def compute_fallbacks(self
):
1468 self
.fallbacks
= map(lambda x
: self
[x
],
1469 filter(lambda x
: self
[x
].is_candidate(),
# sort fallbacks by their consensus weight to advertised bandwidth factor,
# lowest to highest
# used to find the median cw_to_bw_factor()
def sort_fallbacks_by_cw_to_bw_factor(self):
    """Order self.fallbacks ascending by each fallback's cw_to_bw_factor()."""
    def factor(fallback):
        return fallback.cw_to_bw_factor()
    self.fallbacks.sort(key=factor)
1478 # sort fallbacks by their measured bandwidth, highest to lowest
1479 # calculate_measured_bandwidth before calling this
1480 # this is useful for reviewing candidates in priority order
1481 def sort_fallbacks_by_measured_bandwidth(self
):
1482 self
.fallbacks
.sort(key
=lambda f
: f
._data
['measured_bandwidth'],
# sort fallbacks by the data field data_field, lowest to highest
def sort_fallbacks_by(self, data_field):
    """Order self.fallbacks ascending by each fallback's _data[data_field]."""
    def field_value(fallback):
        return fallback._data[data_field]
    self.fallbacks.sort(key=field_value)
1490 def load_relaylist(file_obj
):
1491 """ Read each line in the file, and parse it like a FallbackDir line:
1492 an IPv4 address and optional port:
1493 <IPv4 address>:<port>
1494 which are parsed into dictionary entries:
1497 followed by a series of key=value entries:
1500 ipv6=<IPv6 address>:<IPv6 orport>
1501 each line's key/value pairs are placed in a dictonary,
1502 (of string -> string key/value pairs),
1503 and these dictionaries are placed in an array.
1504 comments start with # and are ignored """
1505 file_data
= file_obj
['data']
1506 file_name
= file_obj
['name']
1508 if file_data
is None:
1510 for line
in file_data
.split('\n'):
1513 line_comment_split
= line
.split('#')
1514 line
= line_comment_split
[0]
1515 # cleanup whitespace
1516 line
= cleanse_whitespace(line
)
1520 for item
in line
.split(' '):
1524 key_value_split
= item
.split('=')
1525 kvl
= len(key_value_split
)
1526 if kvl
< 1 or kvl
> 2:
1527 print '#error Bad %s item: %s, format is key=value.'%(
1530 # assume that entries without a key are the ipv4 address,
1531 # perhaps with a dirport
1532 ipv4_maybe_dirport
= key_value_split
[0]
1533 ipv4_maybe_dirport_split
= ipv4_maybe_dirport
.split(':')
1534 dirl
= len(ipv4_maybe_dirport_split
)
1535 if dirl
< 1 or dirl
> 2:
1536 print '#error Bad %s IPv4 item: %s, format is ipv4:port.'%(
1539 relay_entry
['ipv4'] = ipv4_maybe_dirport_split
[0]
1541 relay_entry
['dirport'] = ipv4_maybe_dirport_split
[1]
1543 relay_entry
[key_value_split
[0]] = key_value_split
[1]
1544 relaylist
.append(relay_entry
)
1547 # apply the fallback whitelist and blacklist
1548 def apply_filter_lists(self
, whitelist_obj
, blacklist_obj
):
1550 logging
.debug('Applying whitelist and blacklist.')
1551 # parse the whitelist and blacklist
1552 whitelist
= self
.load_relaylist(whitelist_obj
)
1553 blacklist
= self
.load_relaylist(blacklist_obj
)
1554 filtered_fallbacks
= []
1555 for f
in self
.fallbacks
:
1556 in_whitelist
= f
.is_in_whitelist(whitelist
)
1557 in_blacklist
= f
.is_in_blacklist(blacklist
)
1558 if in_whitelist
and in_blacklist
:
1559 if BLACKLIST_EXCLUDES_WHITELIST_ENTRIES
:
1562 logging
.warning('Excluding %s: in both blacklist and whitelist.',
1566 filtered_fallbacks
.append(f
)
1569 filtered_fallbacks
.append(f
)
1573 log_excluded('Excluding %s: in blacklist.', f
._fpr
)
1575 if INCLUDE_UNLISTED_ENTRIES
:
1577 filtered_fallbacks
.append(f
)
1581 log_excluded('Excluding %s: in neither blacklist nor whitelist.',
1583 self
.fallbacks
= filtered_fallbacks
1584 return excluded_count
def summarise_filters(initial_count, excluded_count):
    """Return a C comment summarising whitelist/blacklist filtering."""
    template = '/* Whitelist & blacklist excluded %d of %d candidates. */'
    return template % (excluded_count, initial_count)
1591 # calculate each fallback's measured bandwidth based on the median
1592 # consensus weight to advertised bandwidth ratio
1593 def calculate_measured_bandwidth(self
):
1594 self
.sort_fallbacks_by_cw_to_bw_factor()
1595 median_fallback
= self
.fallback_median(True)
1596 if median_fallback
is not None:
1597 median_cw_to_bw_factor
= median_fallback
.cw_to_bw_factor()
1599 # this will never be used, because there are no fallbacks
1600 median_cw_to_bw_factor
= None
1601 for f
in self
.fallbacks
:
1602 f
.set_measured_bandwidth(median_cw_to_bw_factor
)
1604 # remove relays with low measured bandwidth from the fallback list
1605 # calculate_measured_bandwidth for each relay before calling this
1606 def remove_low_bandwidth_relays(self
):
1607 if MIN_BANDWIDTH
is None:
1609 above_min_bw_fallbacks
= []
1610 for f
in self
.fallbacks
:
1611 if f
._data
['measured_bandwidth'] >= MIN_BANDWIDTH
:
1612 above_min_bw_fallbacks
.append(f
)
1614 # the bandwidth we log here is limited by the relay's consensus weight
1615 # as well as its adverttised bandwidth. See set_measured_bandwidth
1617 log_excluded('%s not a candidate: bandwidth %.1fMByte/s too low, ' +
1618 'must be at least %.1fMByte/s', f
._fpr
,
1619 f
._data
['measured_bandwidth']/(1024.0*1024.0),
1620 MIN_BANDWIDTH
/(1024.0*1024.0))
1621 self
.fallbacks
= above_min_bw_fallbacks
1623 # the minimum fallback in the list
1624 # call one of the sort_fallbacks_* functions before calling this
1625 def fallback_min(self
):
1626 if len(self
.fallbacks
) > 0:
1627 return self
.fallbacks
[-1]
1631 # the median fallback in the list
1632 # call one of the sort_fallbacks_* functions before calling this
1633 def fallback_median(self
, require_advertised_bandwidth
):
1634 # use the low-median when there are an evan number of fallbacks,
1635 # for consistency with the bandwidth authorities
1636 if len(self
.fallbacks
) > 0:
1637 median_position
= (len(self
.fallbacks
) - 1) / 2
1638 if not require_advertised_bandwidth
:
1639 return self
.fallbacks
[median_position
]
1640 # if we need advertised_bandwidth but this relay doesn't have it,
1641 # move to a fallback with greater consensus weight until we find one
1642 while not self
.fallbacks
[median_position
]._data
['advertised_bandwidth']:
1643 median_position
+= 1
1644 if median_position
>= len(self
.fallbacks
):
1646 return self
.fallbacks
[median_position
]
1650 # the maximum fallback in the list
1651 # call one of the sort_fallbacks_* functions before calling this
1652 def fallback_max(self
):
1653 if len(self
.fallbacks
) > 0:
1654 return self
.fallbacks
[0]
1658 # return a new bag suitable for storing attributes
1660 def attribute_new():
1663 # get the count of attribute in attribute_bag
1664 # if attribute is None or the empty string, return 0
1666 def attribute_count(attribute
, attribute_bag
):
1667 if attribute
is None or attribute
== '':
1669 if attribute
not in attribute_bag
:
1671 return attribute_bag
[attribute
]
1673 # does attribute_bag contain more than max_count instances of attribute?
1674 # if so, return False
1675 # if not, return True
1676 # if attribute is None or the empty string, or max_count is invalid,
1677 # always return True
1679 def attribute_allow(attribute
, attribute_bag
, max_count
=1):
1680 if attribute
is None or attribute
== '' or max_count
<= 0:
1682 elif CandidateList
.attribute_count(attribute
, attribute_bag
) >= max_count
:
1687 # add attribute to attribute_bag, incrementing the count if it is already
1689 # if attribute is None or the empty string, or count is invalid,
1692 def attribute_add(attribute
, attribute_bag
, count
=1):
1693 if attribute
is None or attribute
== '' or count
<= 0:
1695 attribute_bag
.setdefault(attribute
, 0)
1696 attribute_bag
[attribute
] += count
1698 # make sure there are only MAX_FALLBACKS_PER_IP fallbacks per IPv4 address,
1699 # and per IPv6 address
1700 # there is only one IPv4 address on each fallback: the IPv4 DirPort address
1701 # (we choose the IPv4 ORPort which is on the same IPv4 as the DirPort)
1702 # there is at most one IPv6 address on each fallback: the IPv6 ORPort address
1703 # we try to match the IPv4 ORPort, but will use any IPv6 address if needed
1704 # (clients only use the IPv6 ORPort)
1705 # if there is no IPv6 address, only the IPv4 address is checked
1706 # return the number of candidates we excluded
1707 def limit_fallbacks_same_ip(self
):
1708 ip_limit_fallbacks
= []
1709 ip_list
= CandidateList
.attribute_new()
1710 for f
in self
.fallbacks
:
1711 if (CandidateList
.attribute_allow(f
.dirip
, ip_list
,
1712 MAX_FALLBACKS_PER_IPV4
)
1713 and CandidateList
.attribute_allow(f
.ipv6addr
, ip_list
,
1714 MAX_FALLBACKS_PER_IPV6
)):
1715 ip_limit_fallbacks
.append(f
)
1716 CandidateList
.attribute_add(f
.dirip
, ip_list
)
1718 CandidateList
.attribute_add(f
.ipv6addr
, ip_list
)
1719 elif not CandidateList
.attribute_allow(f
.dirip
, ip_list
,
1720 MAX_FALLBACKS_PER_IPV4
):
1721 log_excluded('Eliminated %s: already have %d fallback(s) on IPv4 %s'
1722 %(f
._fpr
, CandidateList
.attribute_count(f
.dirip
, ip_list
),
1724 elif (f
.has_ipv6() and
1725 not CandidateList
.attribute_allow(f
.ipv6addr
, ip_list
,
1726 MAX_FALLBACKS_PER_IPV6
)):
1727 log_excluded('Eliminated %s: already have %d fallback(s) on IPv6 %s'
1728 %(f
._fpr
, CandidateList
.attribute_count(f
.ipv6addr
,
1731 original_count
= len(self
.fallbacks
)
1732 self
.fallbacks
= ip_limit_fallbacks
1733 return original_count
- len(self
.fallbacks
)
1735 # make sure there are only MAX_FALLBACKS_PER_CONTACT fallbacks for each
1737 # if there is no ContactInfo, allow the fallback
1738 # this check can be gamed by providing no ContactInfo, or by setting the
1739 # ContactInfo to match another fallback
1740 # However, given the likelihood that relays with the same ContactInfo will
1741 # go down at similar times, its usefulness outweighs the risk
1742 def limit_fallbacks_same_contact(self
):
1743 contact_limit_fallbacks
= []
1744 contact_list
= CandidateList
.attribute_new()
1745 for f
in self
.fallbacks
:
1746 if CandidateList
.attribute_allow(f
._data
['contact'], contact_list
,
1747 MAX_FALLBACKS_PER_CONTACT
):
1748 contact_limit_fallbacks
.append(f
)
1749 CandidateList
.attribute_add(f
._data
['contact'], contact_list
)
1752 'Eliminated %s: already have %d fallback(s) on ContactInfo %s'
1753 %(f
._fpr
, CandidateList
.attribute_count(f
._data
['contact'],
1755 f
._data
['contact']))
1756 original_count
= len(self
.fallbacks
)
1757 self
.fallbacks
= contact_limit_fallbacks
1758 return original_count
- len(self
.fallbacks
)
1760 # make sure there are only MAX_FALLBACKS_PER_FAMILY fallbacks per effective
1762 # if there is no family, allow the fallback
1763 # we use effective family, which ensures mutual family declarations
1764 # but the check can be gamed by not declaring a family at all
1765 # if any indirect families exist, the result depends on the order in which
1766 # fallbacks are sorted in the list
1767 def limit_fallbacks_same_family(self
):
1768 family_limit_fallbacks
= []
1769 fingerprint_list
= CandidateList
.attribute_new()
1770 for f
in self
.fallbacks
:
1771 if CandidateList
.attribute_allow(f
._fpr
, fingerprint_list
,
1772 MAX_FALLBACKS_PER_FAMILY
):
1773 family_limit_fallbacks
.append(f
)
1774 CandidateList
.attribute_add(f
._fpr
, fingerprint_list
)
1775 for family_fingerprint
in f
._data
['effective_family']:
1776 CandidateList
.attribute_add(family_fingerprint
, fingerprint_list
)
1778 # we already have a fallback with this fallback in its effective
1781 'Eliminated %s: already have %d fallback(s) in effective family'
1782 %(f
._fpr
, CandidateList
.attribute_count(f
._fpr
, fingerprint_list
)))
1783 original_count
= len(self
.fallbacks
)
1784 self
.fallbacks
= family_limit_fallbacks
1785 return original_count
- len(self
.fallbacks
)
1787 # try once to get the descriptors for fingerprint_list using stem
1788 # returns an empty list on exception
1790 def get_fallback_descriptors_once(fingerprint_list
):
1791 desc_list
= get_server_descriptors(fingerprints
=fingerprint_list
).run(suppress
=True)
1794 # try up to max_retries times to get the descriptors for fingerprint_list
1795 # using stem. Stops retrying when all descriptors have been retrieved.
1796 # returns a list containing the descriptors that were retrieved
1798 def get_fallback_descriptors(fingerprint_list
, max_retries
=5):
1799 # we can't use stem's retries=, because we want to support more than 96
1802 # add an attempt for every MAX_FINGERPRINTS (or part thereof) in the list
1803 max_retries
+= (len(fingerprint_list
) + MAX_FINGERPRINTS
- 1) / MAX_FINGERPRINTS
1804 remaining_list
= fingerprint_list
1806 for _
in xrange(max_retries
):
1807 if len(remaining_list
) == 0:
1809 new_desc_list
= CandidateList
.get_fallback_descriptors_once(remaining_list
[0:MAX_FINGERPRINTS
])
1810 for d
in new_desc_list
:
1812 remaining_list
.remove(d
.fingerprint
)
1814 # warn and ignore if a directory mirror returned a bad descriptor
1815 logging
.warning("Directory mirror returned unwanted descriptor %s, ignoring",
1821 # find the fallbacks that cache extra-info documents
1822 # Onionoo doesn't know this, so we have to use stem
1823 def mark_extra_info_caches(self
):
1824 fingerprint_list
= [ f
._fpr
for f
in self
.fallbacks
]
1825 logging
.info("Downloading fallback descriptors to find extra-info caches")
1826 desc_list
= CandidateList
.get_fallback_descriptors(fingerprint_list
)
1828 self
[d
.fingerprint
]._extra
_info
_cache
= d
.extra_info_cache
1829 missing_descriptor_list
= [ f
._fpr
for f
in self
.fallbacks
1830 if f
._extra
_info
_cache
is None ]
1831 for f
in missing_descriptor_list
:
1832 logging
.warning("No descriptor for {}. Assuming extrainfo=0.".format(f
))
1834 # try a download check on each fallback candidate in order
1835 # stop after max_count successful downloads
1836 # but don't remove any candidates from the array
1837 def try_download_consensus_checks(self
, max_count
):
1839 for f
in self
.fallbacks
:
1840 f
.try_fallback_download_consensus()
1841 if f
.get_fallback_download_consensus():
1842 # this fallback downloaded a consensus ok
1844 if dl_ok_count
>= max_count
:
1845 # we have enough fallbacks
# put max_count successful candidates in the fallbacks array:
# - perform download checks on each fallback candidate
# - retry failed candidates if CONSENSUS_DOWNLOAD_RETRY is set
# - eliminate failed candidates
# - if there are more than max_count candidates, eliminate lowest bandwidth
# - if there are fewer than max_count candidates, leave only successful
# Return the number of fallbacks that failed the consensus check
def perform_download_consensus_checks(self, max_count):
  self.sort_fallbacks_by_measured_bandwidth()
  self.try_download_consensus_checks(max_count)
  if CONSENSUS_DOWNLOAD_RETRY:
    # try unsuccessful candidates again
    # we could end up with more than max_count successful candidates here
    self.try_download_consensus_checks(max_count)
  # now we have at least max_count successful candidates,
  # or we've tried them all
  original_count = len(self.fallbacks)
  # keep only the candidates that served us a consensus
  # (a list comprehension, so len() and slicing below work on any python)
  self.fallbacks = [f for f in self.fallbacks
                    if f.get_fallback_download_consensus()]
  # some of these failed the check, others skipped the check,
  # if we already had enough successful downloads
  failed_count = original_count - len(self.fallbacks)
  self.fallbacks = self.fallbacks[:max_count]
  return failed_count
# return a string that describes a/b as a percentage
@staticmethod
def describe_percentage(a, b):
  # guard against b == 0: %-division would raise ZeroDivisionError
  if b != 0:
    return '%d/%d = %.0f%%'%(a, b, (a*100.0)/b)
  else:
    # technically, 0/0 is undefined, but 0.0% is a sensible result
    return '%d/%d = %.0f%%'%(a, b, 0.0)
# return a dictionary of lists of fallbacks by IPv4 netblock
# the dictionary is keyed by the fingerprint of an arbitrary fallback
# in each netblock
# mask_bits is the size of the netblock
def fallbacks_by_ipv4_netblock(self, mask_bits):
  netblocks = {}
  for f in self.fallbacks:
    found_netblock = False
    for b in netblocks.keys():
      # we found an existing netblock containing this fallback
      if f.ipv4_netblocks_equal(self[b], mask_bits):
        # add it to the list
        netblocks[b].append(f)
        found_netblock = True
        # each fallback belongs to at most one netblock list
        break
    # make a new netblock based on this fallback's fingerprint
    if not found_netblock:
      netblocks[f._fpr] = [f]
  return netblocks
# return a dictionary of lists of fallbacks by IPv6 netblock
# where mask_bits is the size of the netblock
def fallbacks_by_ipv6_netblock(self, mask_bits):
  netblocks = {}
  for f in self.fallbacks:
    # skip fallbacks without IPv6 addresses
    if not f.has_ipv6():
      continue
    found_netblock = False
    for b in netblocks.keys():
      # we found an existing netblock containing this fallback
      if f.ipv6_netblocks_equal(self[b], mask_bits):
        # add it to the list
        netblocks[b].append(f)
        found_netblock = True
        # each fallback belongs to at most one netblock list
        break
    # make a new netblock based on this fallback's fingerprint
    if not found_netblock:
      netblocks[f._fpr] = [f]
  return netblocks
# log a message about the proportion of fallbacks in each IPv4 netblock,
# where mask_bits is the size of the netblock
def describe_fallback_ipv4_netblock_mask(self, mask_bits):
  fallback_count = len(self.fallbacks)
  shared_netblock_fallback_count = 0
  most_frequent_netblock = None
  netblocks = self.fallbacks_by_ipv4_netblock(mask_bits)
  for b in netblocks.keys():
    if len(netblocks[b]) > 1:
      # how many fallbacks are in a netblock with other fallbacks?
      shared_netblock_fallback_count += len(netblocks[b])
      # what's the netblock with the most fallbacks?
      if (most_frequent_netblock is None
          or len(netblocks[b]) > len(netblocks[most_frequent_netblock])):
        most_frequent_netblock = b
      logging.debug('Fallback IPv4 addresses in the same /%d:'%(mask_bits))
      for f in netblocks[b]:
        logging.debug('%s - %s', f.dirip, f._fpr)
  if most_frequent_netblock is not None:
    logging.warning('There are %s fallbacks in the IPv4 /%d containing %s'%(
                    CandidateList.describe_percentage(
                                      len(netblocks[most_frequent_netblock]),
                                      fallback_count),
                    mask_bits,
                    self[most_frequent_netblock].dirip))
  if shared_netblock_fallback_count > 0:
    logging.warning(('%s of fallbacks are in an IPv4 /%d with other ' +
                     'fallbacks')%(CandidateList.describe_percentage(
                                            shared_netblock_fallback_count,
                                            fallback_count),
                                   mask_bits))
# log a message about the proportion of fallbacks in each IPv6 netblock,
# where mask_bits is the size of the netblock
def describe_fallback_ipv6_netblock_mask(self, mask_bits):
  # proportions are out of the IPv6-capable fallbacks only
  fallback_count = len(self.fallbacks_with_ipv6())
  shared_netblock_fallback_count = 0
  most_frequent_netblock = None
  netblocks = self.fallbacks_by_ipv6_netblock(mask_bits)
  for b in netblocks.keys():
    if len(netblocks[b]) > 1:
      # how many fallbacks are in a netblock with other fallbacks?
      shared_netblock_fallback_count += len(netblocks[b])
      # what's the netblock with the most fallbacks?
      if (most_frequent_netblock is None
          or len(netblocks[b]) > len(netblocks[most_frequent_netblock])):
        most_frequent_netblock = b
      logging.debug('Fallback IPv6 addresses in the same /%d:'%(mask_bits))
      for f in netblocks[b]:
        logging.debug('%s - %s', f.ipv6addr, f._fpr)
  if most_frequent_netblock is not None:
    logging.warning('There are %s fallbacks in the IPv6 /%d containing %s'%(
                    CandidateList.describe_percentage(
                                      len(netblocks[most_frequent_netblock]),
                                      fallback_count),
                    mask_bits,
                    self[most_frequent_netblock].ipv6addr))
  if shared_netblock_fallback_count > 0:
    logging.warning(('%s of fallbacks are in an IPv6 /%d with other ' +
                     'fallbacks')%(CandidateList.describe_percentage(
                                            shared_netblock_fallback_count,
                                            fallback_count),
                                   mask_bits))
# log a message about the proportion of fallbacks in each IPv4 /8, /16,
# and /24 netblock (only the /16 analysis is currently enabled)
def describe_fallback_ipv4_netblocks(self):
  # this doesn't actually tell us anything useful
  #self.describe_fallback_ipv4_netblock_mask(8)
  self.describe_fallback_ipv4_netblock_mask(16)
  #self.describe_fallback_ipv4_netblock_mask(24)
# log a message about the proportion of fallbacks in each IPv6 /12 (RIR),
# /23 (smaller RIR blocks), /32 (LIR), /48 (Customer), and /64 (Host)
# https://www.iana.org/assignments/ipv6-unicast-address-assignments/
# (only the /32 and /64 analyses are currently enabled)
def describe_fallback_ipv6_netblocks(self):
  # these don't actually tell us anything useful
  #self.describe_fallback_ipv6_netblock_mask(12)
  #self.describe_fallback_ipv6_netblock_mask(23)
  self.describe_fallback_ipv6_netblock_mask(32)
  #self.describe_fallback_ipv6_netblock_mask(48)
  self.describe_fallback_ipv6_netblock_mask(64)
# log a message about the proportion of fallbacks in each IPv4 and IPv6
# netblock of interest
def describe_fallback_netblocks(self):
  self.describe_fallback_ipv4_netblocks()
  self.describe_fallback_ipv6_netblocks()
# return a list of fallbacks whose IPv4 ORPort is port
def fallbacks_on_ipv4_orport(self, port):
  def uses_port(fallback):
    return fallback.orport == port
  return filter(uses_port, self.fallbacks)
# return a list of IPv6-capable fallbacks whose IPv6 ORPort is port
def fallbacks_on_ipv6_orport(self, port):
  def uses_port(fallback):
    return fallback.ipv6orport == port
  return filter(uses_port, self.fallbacks_with_ipv6())
# return a list of fallbacks whose DirPort is port
def fallbacks_on_dirport(self, port):
  def uses_port(fallback):
    return fallback.dirport == port
  return filter(uses_port, self.fallbacks)
# log a message about the proportion of fallbacks on IPv4 ORPort port
# and return that count
def describe_fallback_ipv4_orport(self, port):
  port_count = len(self.fallbacks_on_ipv4_orport(port))
  fallback_count = len(self.fallbacks)
  logging.warning('%s of fallbacks are on IPv4 ORPort %d'%(
                  CandidateList.describe_percentage(port_count,
                                                    fallback_count),
                  port))
  return port_count
# log a message about the proportion of IPv6 fallbacks on IPv6 ORPort port
# and return that count
def describe_fallback_ipv6_orport(self, port):
  port_count = len(self.fallbacks_on_ipv6_orport(port))
  # proportions are out of the IPv6-capable fallbacks only
  fallback_count = len(self.fallbacks_with_ipv6())
  logging.warning('%s of IPv6 fallbacks are on IPv6 ORPort %d'%(
                  CandidateList.describe_percentage(port_count,
                                                    fallback_count),
                  port))
  return port_count
# log a message about the proportion of fallbacks on DirPort port
# and return that count
def describe_fallback_dirport(self, port):
  port_count = len(self.fallbacks_on_dirport(port))
  fallback_count = len(self.fallbacks)
  logging.warning('%s of fallbacks are on DirPort %d'%(
                  CandidateList.describe_percentage(port_count,
                                                    fallback_count),
                  port))
  return port_count
# log a message about the proportion of fallbacks on each dirport,
# each IPv4 orport, and each IPv6 orport
def describe_fallback_ports(self):
  fallback_count = len(self.fallbacks)
  # IPv4 ORPorts: report the common ports, then the remainder
  ipv4_or_count = fallback_count
  ipv4_or_count -= self.describe_fallback_ipv4_orport(443)
  ipv4_or_count -= self.describe_fallback_ipv4_orport(9001)
  logging.warning('%s of fallbacks are on other IPv4 ORPorts'%(
                  CandidateList.describe_percentage(ipv4_or_count,
                                                    fallback_count)))
  # IPv6 ORPorts: proportions are out of the IPv6-capable fallbacks only
  ipv6_fallback_count = len(self.fallbacks_with_ipv6())
  ipv6_or_count = ipv6_fallback_count
  ipv6_or_count -= self.describe_fallback_ipv6_orport(443)
  ipv6_or_count -= self.describe_fallback_ipv6_orport(9001)
  logging.warning('%s of IPv6 fallbacks are on other IPv6 ORPorts'%(
                  CandidateList.describe_percentage(ipv6_or_count,
                                                    ipv6_fallback_count)))
  # DirPorts: report the common ports, then the remainder
  dir_count = fallback_count
  dir_count -= self.describe_fallback_dirport(80)
  dir_count -= self.describe_fallback_dirport(9030)
  logging.warning('%s of fallbacks are on other DirPorts'%(
                  CandidateList.describe_percentage(dir_count,
                                                    fallback_count)))
# return a list of fallbacks which cache extra-info documents
def fallbacks_with_extra_info_cache(self):
  def caches_extra_info(fallback):
    return fallback._extra_info_cache
  return filter(caches_extra_info, self.fallbacks)
# log a message about the proportion of fallbacks that cache extra-info docs
def describe_fallback_extra_info_caches(self):
  extra_info_fallback_count = len(self.fallbacks_with_extra_info_cache())
  fallback_count = len(self.fallbacks)
  logging.warning('%s of fallbacks cache extra-info documents'%(
                  CandidateList.describe_percentage(extra_info_fallback_count,
                                                    fallback_count)))
# return a list of fallbacks which have the Exit flag
def fallbacks_with_exit(self):
  def has_exit_flag(fallback):
    return fallback.is_exit()
  return filter(has_exit_flag, self.fallbacks)
# log a message about the proportion of fallbacks with an Exit flag
def describe_fallback_exit_flag(self):
  exit_fallback_count = len(self.fallbacks_with_exit())
  fallback_count = len(self.fallbacks)
  logging.warning('%s of fallbacks have the Exit flag'%(
                  CandidateList.describe_percentage(exit_fallback_count,
                                                    fallback_count)))
# return a list of fallbacks which have an IPv6 address
def fallbacks_with_ipv6(self):
  def supports_ipv6(fallback):
    return fallback.has_ipv6()
  return filter(supports_ipv6, self.fallbacks)
# log a message about the proportion of fallbacks on IPv6
def describe_fallback_ip_family(self):
  ipv6_fallback_count = len(self.fallbacks_with_ipv6())
  fallback_count = len(self.fallbacks)
  logging.warning('%s of fallbacks are on IPv6'%(
                  CandidateList.describe_percentage(ipv6_fallback_count,
                                                    fallback_count)))
# return a C-comment summary of the fallback selection process,
# ending with an #error directive if the list is too small to use
def summarise_fallbacks(self, eligible_count, operator_count, failed_count,
                        guard_count, target_count):
  s = ''
  # Report:
  # whether we checked consensus download times
  # the number of fallback directories (and limits/exclusions, if relevant)
  # min & max fallback bandwidths
  # #error if below minimum count
  if PERFORM_IPV4_DIRPORT_CHECKS or PERFORM_IPV6_DIRPORT_CHECKS:
    s += '/* Checked %s%s%s DirPorts served a consensus within %.1fs. */'%(
          'IPv4' if PERFORM_IPV4_DIRPORT_CHECKS else '',
          ' and ' if (PERFORM_IPV4_DIRPORT_CHECKS
                      and PERFORM_IPV6_DIRPORT_CHECKS) else '',
          'IPv6' if PERFORM_IPV6_DIRPORT_CHECKS else '',
          CONSENSUS_DOWNLOAD_SPEED_MAX)
  else:
    s += '/* Did not check IPv4 or IPv6 DirPort consensus downloads. */'
  s += '\n'
  # Multiline C comment with #error if things go bad
  s += '/*'
  s += '\n'
  # Integers don't need escaping in C comments
  fallback_count = len(self.fallbacks)
  if FALLBACK_PROPORTION_OF_GUARDS is None:
    fallback_proportion = ''
  else:
    fallback_proportion = ', Target %d (%d * %.2f)'%(target_count,
                                                     guard_count,
                                              FALLBACK_PROPORTION_OF_GUARDS)
  s += 'Final Count: %d (Eligible %d%s'%(fallback_count, eligible_count,
                                         fallback_proportion)
  if MAX_FALLBACK_COUNT is not None:
    s += ', Max %d'%(MAX_FALLBACK_COUNT)
  s += ')\n'
  if eligible_count != fallback_count:
    removed_count = eligible_count - fallback_count
    excess_to_target_or_max = (eligible_count - operator_count - failed_count
                               - fallback_count)
    # some 'Failed' failed the check, others 'Skipped' the check,
    # if we already had enough successful downloads
    s += ('Excluded: %d (Same Operator %d, Failed/Skipped Download %d, ' +
          'Excess %d)')%(removed_count, operator_count, failed_count,
                         excess_to_target_or_max)
    s += '\n'
  min_fb = self.fallback_min()
  min_bw = min_fb._data['measured_bandwidth']
  max_fb = self.fallback_max()
  max_bw = max_fb._data['measured_bandwidth']
  s += 'Bandwidth Range: %.1f - %.1f MByte/s'%(min_bw/(1024.0*1024.0),
                                               max_bw/(1024.0*1024.0))
  s += '\n'
  s += '*/'
  if fallback_count < MIN_FALLBACK_COUNT:
    # We must have a minimum number of fallbacks so they are always
    # reachable, and are in diverse locations
    s += '\n'
    s += '#error Fallback Count %d is too low. '%(fallback_count)
    s += 'Must be at least %d for diversity. '%(MIN_FALLBACK_COUNT)
    s += 'Try adding entries to the whitelist, '
    s += 'or setting INCLUDE_UNLISTED_ENTRIES = True.'
  return s
# re-check the current fallback list: parse it as the whitelist,
# and log eligibility details at INFO level for review
def process_existing():
  logging.basicConfig(level=logging.INFO)
  logging.getLogger('stem').setLevel(logging.INFO)
  whitelist = dict(data=parse_fallback_file(FALLBACK_FILE_NAME),
                   name=FALLBACK_FILE_NAME)
  blacklist = dict(data=read_from_file(BLACKLIST_FILE_NAME,
                                       MAX_LIST_FILE_SIZE),
                   name=BLACKLIST_FILE_NAME)
  list_fallbacks(whitelist, blacklist)
# build a fresh fallback list from the whitelist file,
# logging only warnings and above
def process_default():
  logging.basicConfig(level=logging.WARNING)
  logging.getLogger('stem').setLevel(logging.WARNING)
  whitelist = dict(data=read_from_file(WHITELIST_FILE_NAME,
                                       MAX_LIST_FILE_SIZE),
                   name=WHITELIST_FILE_NAME)
  blacklist = dict(data=read_from_file(BLACKLIST_FILE_NAME,
                                       MAX_LIST_FILE_SIZE),
                   name=BLACKLIST_FILE_NAME)
  list_fallbacks(whitelist, blacklist)
## Main Function

# dispatch on the optional command-line command
def main():
  if get_command() == 'check_existing':
    process_existing()
  else:
    process_default()

# return the single optional command-line argument, or None
def get_command():
  if len(sys.argv) == 2:
    return sys.argv[1]
  else:
    return None
# log relay exclusions at WARNING level when reviewing an existing list
# (check_existing), and at INFO level when building a final list
def log_excluded(msg, *args):
  if get_command() == 'check_existing':
    logging.warning(msg, *args)
  else:
    # without this else, the INFO log would fire on every call
    logging.info(msg, *args)
2216 def list_fallbacks(whitelist
, blacklist
):
2217 """ Fetches required onionoo documents and evaluates the
2218 fallback directory criteria for each of the relays """
2220 print "/* type=fallback */"
2221 print ("/* version={} */"
2222 .format(cleanse_c_multiline_comment(FALLBACK_FORMAT_VERSION
)))
2223 now
= datetime
.datetime
.utcnow()
2224 timestamp
= now
.strftime('%Y%m%d%H%M%S')
2225 print ("/* timestamp={} */"
2226 .format(cleanse_c_multiline_comment(timestamp
)))
2227 # end the header with a separator, to make it easier for parsers
2228 print SECTION_SEPARATOR_COMMENT
2230 logging
.warning('Downloading and parsing Onionoo data. ' +
2231 'This may take some time.')
2232 # find relays that could be fallbacks
2233 candidates
= CandidateList()
2234 candidates
.add_relays()
2236 # work out how many fallbacks we want
2237 guard_count
= candidates
.count_guards()
2238 if FALLBACK_PROPORTION_OF_GUARDS
is None:
2239 target_count
= guard_count
2241 target_count
= int(guard_count
* FALLBACK_PROPORTION_OF_GUARDS
)
2242 # the maximum number of fallbacks is the least of:
2243 # - the target fallback count (FALLBACK_PROPORTION_OF_GUARDS * guard count)
2244 # - the maximum fallback count (MAX_FALLBACK_COUNT)
2245 if MAX_FALLBACK_COUNT
is None:
2246 max_count
= target_count
2248 max_count
= min(target_count
, MAX_FALLBACK_COUNT
)
2250 candidates
.compute_fallbacks()
2251 prefilter_fallbacks
= copy
.copy(candidates
.fallbacks
)
2253 # filter with the whitelist and blacklist
2254 # if a relay has changed IPv4 address or ports recently, it will be excluded
2255 # as ineligible before we call apply_filter_lists, and so there will be no
2256 # warning that the details have changed from those in the whitelist.
2257 # instead, there will be an info-level log during the eligibility check.
2258 initial_count
= len(candidates
.fallbacks
)
2259 excluded_count
= candidates
.apply_filter_lists(whitelist
, blacklist
)
2260 print candidates
.summarise_filters(initial_count
, excluded_count
)
2261 eligible_count
= len(candidates
.fallbacks
)
2263 # calculate the measured bandwidth of each relay,
2264 # then remove low-bandwidth relays
2265 candidates
.calculate_measured_bandwidth()
2266 candidates
.remove_low_bandwidth_relays()
2268 # print the raw fallback list
2269 #for x in candidates.fallbacks:
2270 # print x.fallbackdir_line(True)
2271 # print json.dumps(candidates[x]._data, sort_keys=True, indent=4,
2272 # separators=(',', ': '), default=json_util.default)
2274 # impose mandatory conditions here, like one per contact, family, IP
2275 # in measured bandwidth order
2276 candidates
.sort_fallbacks_by_measured_bandwidth()
2278 # only impose these limits on the final list - operators can nominate
2279 # multiple candidate fallbacks, and then we choose the best set
2280 if not OUTPUT_CANDIDATES
:
2281 operator_count
+= candidates
.limit_fallbacks_same_ip()
2282 operator_count
+= candidates
.limit_fallbacks_same_contact()
2283 operator_count
+= candidates
.limit_fallbacks_same_family()
2285 # check if each candidate can serve a consensus
2286 # there's a small risk we've eliminated relays from the same operator that
2287 # can serve a consensus, in favour of one that can't
2288 # but given it takes up to 15 seconds to check each consensus download,
2289 # the risk is worth it
2290 if PERFORM_IPV4_DIRPORT_CHECKS
or PERFORM_IPV6_DIRPORT_CHECKS
:
2291 logging
.warning('Checking consensus download speeds. ' +
2292 'This may take some time.')
2293 failed_count
= candidates
.perform_download_consensus_checks(max_count
)
2295 # work out which fallbacks cache extra-infos
2296 candidates
.mark_extra_info_caches()
2298 # analyse and log interesting diversity metrics
2299 # like netblock, ports, exit, IPv4-only
2300 # (we can't easily analyse AS, and it's hard to accurately analyse country)
2301 candidates
.describe_fallback_ip_family()
2302 # if we can't import the ipaddress module, we can't do netblock analysis
2304 candidates
.describe_fallback_netblocks()
2305 candidates
.describe_fallback_ports()
2306 candidates
.describe_fallback_extra_info_caches()
2307 candidates
.describe_fallback_exit_flag()
2309 # output C comments summarising the fallback selection process
2310 if len(candidates
.fallbacks
) > 0:
2311 print candidates
.summarise_fallbacks(eligible_count
, operator_count
,
2312 failed_count
, guard_count
,
2315 print '/* No Fallbacks met criteria */'
2317 # output C comments specifying the OnionOO data used to create the list
2318 for s
in fetch_source_list():
2319 print describe_fetch_source(s
)
2321 # start the list with a separator, to make it easy for parsers
2322 print SECTION_SEPARATOR_COMMENT
2324 # sort the list differently depending on why we've created it:
2325 # if we're outputting the final fallback list, sort by fingerprint
2326 # this makes diffs much more stable
2327 # otherwise, if we're trying to find a bandwidth cutoff, or we want to
2328 # contact operators in priority order, sort by bandwidth (not yet
2330 # otherwise, if we're contacting operators, sort by contact
2331 candidates
.sort_fallbacks_by(OUTPUT_SORT_FIELD
)
2333 for x
in candidates
.fallbacks
:
2334 print x
.fallbackdir_line(candidates
.fallbacks
, prefilter_fallbacks
)
2336 if __name__
== "__main__":