#!/usr/bin/python

# Usage: scripts/maint/updateFallbackDirs.py > src/or/fallback_dirs.inc

# This script should be run from a stable, reliable network connection,
# with no other network activity (and not over tor).
# If this is not possible, please disable:
# PERFORM_IPV4_DIRPORT_CHECKS and PERFORM_IPV6_DIRPORT_CHECKS

# Needs dateutil (and potentially other python packages)
# Needs stem available in your PYTHONPATH, or just ln -s ../stem/stem .
# Optionally uses ipaddress (python 3 builtin) or py2-ipaddress (package)
# for netblock analysis, in PYTHONPATH, or just
# ln -s ../py2-ipaddress-3.4.1/ipaddress.py .

# Then read the logs to make sure the fallbacks aren't dominated by a single
# netblock or port

# Script by weasel, April 2015
# Portions by gsathya & karsten, 2013
# https://trac.torproject.org/projects/tor/attachment/ticket/8374/dir_list.2.py
# Modifications by teor, 2015

import StringIO
import string
import re
import datetime
import gzip
import os.path
import json
import math
import sys
import urllib
import urllib2
import hashlib
import dateutil.parser
# bson_lazy provides bson
#from bson import json_util
import copy

from stem.descriptor.remote import DescriptorDownloader

import logging
# INFO tells you why each relay was included or excluded
# WARN tells you about potential misconfigurations and relay detail changes
logging.basicConfig(level=logging.WARNING)
logging.root.name = ''
# INFO tells you about each consensus download attempt
logging.getLogger('stem').setLevel(logging.WARNING)

HAVE_IPADDRESS = False
try:
  # python 3 builtin, or install package py2-ipaddress
  # there are several ipaddress implementations for python 2,
  # with slightly different semantics for str typed text
  # fortunately, all our IP addresses are in unicode
  import ipaddress
  HAVE_IPADDRESS = True
except ImportError:
  # if this happens, we avoid doing netblock analysis
  logging.warning('Unable to import ipaddress, please install py2-ipaddress.' +
                  ' A fallback list will be created, but optional netblock' +
                  ' analysis will not be performed.')

## Top-Level Configuration

# Output all candidate fallbacks, or only output selected fallbacks?
OUTPUT_CANDIDATES = False

# Perform DirPort checks over IPv4?
# Change this to False if IPv4 doesn't work for you, or if you don't want to
# download a consensus for each fallback
# Don't check ~1000 candidates when OUTPUT_CANDIDATES is True
PERFORM_IPV4_DIRPORT_CHECKS = False if OUTPUT_CANDIDATES else True

# Perform DirPort checks over IPv6?
# If you know IPv6 works for you, set this to True
# This will exclude IPv6 relays without an IPv6 DirPort configured
# So it's best left at False until #18394 is implemented
# Don't check ~1000 candidates when OUTPUT_CANDIDATES is True
PERFORM_IPV6_DIRPORT_CHECKS = False if OUTPUT_CANDIDATES else False

# Output fallback name, flags, and ContactInfo in a C comment?
OUTPUT_COMMENTS = True if OUTPUT_CANDIDATES else False

# Output the number of matching ContactInfos in the fallbacks list or the
# blacklist?
# Useful if you're trying to contact operators
CONTACT_COUNT = True if OUTPUT_CANDIDATES else False
CONTACT_BLACKLIST_COUNT = True if OUTPUT_CANDIDATES else False

## OnionOO Settings

ONIONOO = 'https://onionoo.torproject.org/'
#ONIONOO = 'https://onionoo.thecthulhu.com/'

# Don't bother going out to the Internet, just use the files available locally,
# even if they're very old
LOCAL_FILES_ONLY = False

## Whitelist / Blacklist Filter Settings

# The whitelist contains entries that are included if all attributes match
# (IPv4, dirport, orport, id, and optionally IPv6 and IPv6 orport)
# The blacklist contains (partial) entries that are excluded if any
# sufficiently specific group of attributes matches:
#   IPv4 & DirPort
#   IPv4 & ORPort
#   ID
#   IPv6 & DirPort
#   IPv6 & IPv6 ORPort
# If neither port is included in the blacklist, the entire IP address is
# blacklisted.
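# For example, using illustrative values in the format parsed by
# load_relaylist() below, a blacklist line like:
#   1.2.3.4:80 orport=443 id=0123456789ABCDEF0123456789ABCDEF01234567
# excludes that specific relay, while a bare address line like:
#   1.2.3.4
# excludes every relay on that IP address.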

# What happens to entries in neither list?
# When True, they are included, when False, they are excluded
INCLUDE_UNLISTED_ENTRIES = True if OUTPUT_CANDIDATES else False

# If an entry is in both lists, what happens?
# When True, it is excluded, when False, it is included
BLACKLIST_EXCLUDES_WHITELIST_ENTRIES = True

WHITELIST_FILE_NAME = 'scripts/maint/fallback.whitelist'
BLACKLIST_FILE_NAME = 'scripts/maint/fallback.blacklist'

# The number of bytes we'll read from a filter file before giving up
MAX_LIST_FILE_SIZE = 1024 * 1024

## Eligibility Settings

# Reduced due to a bug in tor where a relay submits a 0 DirPort when restarted
# This causes OnionOO to (correctly) reset its stability timer
# This issue will be fixed in 0.2.7.7 and 0.2.8.2
# Until then, the CUTOFFs below ensure a decent level of stability.
ADDRESS_AND_PORT_STABLE_DAYS = 7
# What time-weighted-fraction of these flags must FallbackDirs
# equal or exceed?
CUTOFF_RUNNING = .95
CUTOFF_V2DIR = .95
CUTOFF_GUARD = .95
# What time-weighted-fraction of these flags must FallbackDirs
# equal or fall under?
# .00 means no bad exits
PERMITTED_BADEXIT = .00

# older entries' weights are adjusted with ALPHA^(age in days)
AGE_ALPHA = 0.99

# this factor is used to scale OnionOO entries to [0,1]
ONIONOO_SCALE_ONE = 999.

## Fallback Count Limits

# The target for these parameters is 20% of the guards in the network
# This is around 200 as of October 2015
_FB_POG = 0.2
FALLBACK_PROPORTION_OF_GUARDS = None if OUTPUT_CANDIDATES else _FB_POG

# We want exactly 100 fallbacks for the initial release
# This gives us scope to add extra fallbacks to the list as needed
# Limit the number of fallbacks (eliminating lowest by advertised bandwidth)
MAX_FALLBACK_COUNT = None if OUTPUT_CANDIDATES else 100
# Emit a C #error if the number of fallbacks is below this minimum
MIN_FALLBACK_COUNT = 100

## Fallback Bandwidth Requirements

# Any fallback with the Exit flag has its bandwidth multiplied by this fraction
# to make sure we aren't further overloading exits
# (Set to 1.0, because we asked that only lightly loaded exits opt-in,
# and the extra load really isn't that much for large relays.)
EXIT_BANDWIDTH_FRACTION = 1.0

# If a single fallback's bandwidth is too low, it's pointless adding it
# We expect fallbacks to handle an extra 30 kilobytes per second of traffic
# Make sure they can support a hundred times the expected extra load
# (Use 102.4 to make it come out nicely in MB/s)
# We convert this to a consensus weight before applying the filter,
# because all the bandwidth amounts are specified by the relay
MIN_BANDWIDTH = 102.4 * 30.0 * 1024.0
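# (102.4 * 30.0 * 1024.0 is 3145728 bytes per second, which the logs below
# report as exactly 3.0 MB/s)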

# Clients will time out after 30 seconds trying to download a consensus
# So allow fallback directories half that time to deliver a consensus
# The exact download times might change based on the network connection
# running this script, but only by a few seconds
# There is also about a second of python overhead
CONSENSUS_DOWNLOAD_SPEED_MAX = 15.0
# If the relay fails a consensus check, retry the download
# This avoids delisting a relay due to transient network conditions
CONSENSUS_DOWNLOAD_RETRY = True

## Fallback Weights for Client Selection

# All fallback weights are equal, and set to the value below
# Authorities are weighted 1.0 by default
# Clients use these weights to select fallbacks and authorities at random
# If there are 100 fallbacks and 9 authorities:
#  - each fallback is chosen with probability 10.0/(10.0*100 + 1.0*9) ~= 0.99%
#  - each authority is chosen with probability 1.0/(10.0*100 + 1.0*9) ~= 0.1%
# A client choosing a bootstrap directory server will choose a fallback for
# 10.0/(10.0*100 + 1.0*9) * 100 = 99.1% of attempts, and an authority for
# 1.0/(10.0*100 + 1.0*9) * 9 = 0.9% of attempts.
# (This disregards the bootstrap schedules, where clients start by choosing
# from fallbacks & authorities, then later choose from only authorities.)
FALLBACK_OUTPUT_WEIGHT = 10.0

## Parsing Functions

def parse_ts(t):
  return datetime.datetime.strptime(t, "%Y-%m-%d %H:%M:%S")

def remove_bad_chars(raw_string, bad_char_list):
  # Remove each character in the bad_char_list
  cleansed_string = raw_string
  for c in bad_char_list:
    cleansed_string = cleansed_string.replace(c, '')
  return cleansed_string

def cleanse_unprintable(raw_string):
  # Remove all unprintable characters
  cleansed_string = ''
  for c in raw_string:
    if c in string.printable:
      cleansed_string += c
  return cleansed_string

def cleanse_whitespace(raw_string):
  # Replace all whitespace characters with a space
  cleansed_string = raw_string
  for c in string.whitespace:
    cleansed_string = cleansed_string.replace(c, ' ')
  return cleansed_string

def cleanse_c_multiline_comment(raw_string):
  cleansed_string = raw_string
  # Embedded newlines should be removed by tor/onionoo, but let's be paranoid
  cleansed_string = cleanse_whitespace(cleansed_string)
  # ContactInfo and Version can be arbitrary binary data
  cleansed_string = cleanse_unprintable(cleansed_string)
  # Prevent a malicious / unanticipated string from breaking out
  # of a C-style multiline comment
  # This removes '/*' and '*/' and '//'
  bad_char_list = '*/'
  # Prevent a malicious string from using C nulls
  bad_char_list += '\0'
  # Be safer by removing bad characters entirely
  cleansed_string = remove_bad_chars(cleansed_string, bad_char_list)
  # Some compilers may further process the content of comments
  # There isn't much we can do to cover every possible case
  # But comment-based directives are typically only advisory
  return cleansed_string
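
# For example, with illustrative input:
#   cleanse_c_multiline_comment('evil */ comment') returns 'evil  comment',
# because each of the characters '*', '/', and NUL is removed entirely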

def cleanse_c_string(raw_string):
  cleansed_string = raw_string
  # Embedded newlines should be removed by tor/onionoo, but let's be paranoid
  cleansed_string = cleanse_whitespace(cleansed_string)
  # ContactInfo and Version can be arbitrary binary data
  cleansed_string = cleanse_unprintable(cleansed_string)
  # Prevent a malicious address/fingerprint string from breaking out
  # of a C-style string
  bad_char_list = '"'
  # Prevent a malicious string from using escapes
  bad_char_list += '\\'
  # Prevent a malicious string from using C nulls
  bad_char_list += '\0'
  # Be safer by removing bad characters entirely
  cleansed_string = remove_bad_chars(cleansed_string, bad_char_list)
  # Some compilers may further process the content of strings
  # There isn't much we can do to cover every possible case
  # But this typically only results in changes to the string data
  return cleansed_string
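
# For example, with illustrative input:
#   cleanse_c_string('name" evil') returns 'name evil',
# which can no longer terminate the C string it is embedded in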

## OnionOO Source Functions

# a dictionary of source metadata for each onionoo query we've made
fetch_source = {}

# register source metadata for 'what'
# assumes we only retrieve one document for each 'what'
def register_fetch_source(what, url, relays_published, version):
  fetch_source[what] = {}
  fetch_source[what]['url'] = url
  fetch_source[what]['relays_published'] = relays_published
  fetch_source[what]['version'] = version

# list each registered source's 'what'
def fetch_source_list():
  return sorted(fetch_source.keys())

# given 'what', provide a multiline C comment describing the source
def describe_fetch_source(what):
  desc = '/*'
  desc += '\n'
  desc += 'Onionoo Source: '
  desc += cleanse_c_multiline_comment(what)
  desc += ' Date: '
  desc += cleanse_c_multiline_comment(fetch_source[what]['relays_published'])
  desc += ' Version: '
  desc += cleanse_c_multiline_comment(fetch_source[what]['version'])
  desc += '\n'
  desc += 'URL: '
  desc += cleanse_c_multiline_comment(fetch_source[what]['url'])
  desc += '\n'
  desc += '*/'
  return desc
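
# The comment built above looks like this (illustrative values):
# /*
# Onionoo Source: details Date: 2015-10-02 13:34:14 Version: 2.0
# URL: https://onionoo.torproject.org/details?type=relay
# */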

## File Processing Functions

def write_to_file(str, file_name, max_len):
  try:
    with open(file_name, 'w') as f:
      f.write(str[0:max_len])
  except EnvironmentError, error:
    logging.error('Writing file %s failed: %d: %s'%
                  (file_name,
                   error.errno,
                   error.strerror))

def read_from_file(file_name, max_len):
  try:
    if os.path.isfile(file_name):
      with open(file_name, 'r') as f:
        return f.read(max_len)
  except EnvironmentError, error:
    logging.info('Loading file %s failed: %d: %s'%
                 (file_name,
                  error.errno,
                  error.strerror))
  return None

def load_possibly_compressed_response_json(response):
  if response.info().get('Content-Encoding') == 'gzip':
    buf = StringIO.StringIO( response.read() )
    f = gzip.GzipFile(fileobj=buf)
    return json.load(f)
  else:
    return json.load(response)

def load_json_from_file(json_file_name):
  # An exception here may be resolved by deleting the .last_modified
  # and .json files, and re-running the script
  try:
    with open(json_file_name, 'r') as f:
      return json.load(f)
  except EnvironmentError, error:
    raise Exception('Reading not-modified json file %s failed: %d: %s'%
                    (json_file_name,
                     error.errno,
                     error.strerror))

## OnionOO Functions

def datestr_to_datetime(datestr):
  # Parse datetimes like: Fri, 02 Oct 2015 13:34:14 GMT
  if datestr is not None:
    dt = dateutil.parser.parse(datestr)
  else:
    # Never modified - use start of epoch
    dt = datetime.datetime.utcfromtimestamp(0)
  # strip any timezone out (in case they're supported in future)
  dt = dt.replace(tzinfo=None)
  return dt

def onionoo_fetch(what, **kwargs):
  params = kwargs
  params['type'] = 'relay'
  #params['limit'] = 10
  params['first_seen_days'] = '%d-'%(ADDRESS_AND_PORT_STABLE_DAYS,)
  params['last_seen_days'] = '-7'
  params['flag'] = 'V2Dir'
  url = ONIONOO + what + '?' + urllib.urlencode(params)

  # Unfortunately, the URL is too long for some OS filenames,
  # but we still don't want to get files from different URLs mixed up
  base_file_name = what + '-' + hashlib.sha1(url).hexdigest()

  full_url_file_name = base_file_name + '.full_url'
  MAX_FULL_URL_LENGTH = 1024

  last_modified_file_name = base_file_name + '.last_modified'
  MAX_LAST_MODIFIED_LENGTH = 64

  json_file_name = base_file_name + '.json'

  if LOCAL_FILES_ONLY:
    # Read from the local file, don't write to anything
    response_json = load_json_from_file(json_file_name)
  else:
    # store the full URL to a file for debugging
    # no need to compare as long as you trust SHA-1
    write_to_file(url, full_url_file_name, MAX_FULL_URL_LENGTH)

    request = urllib2.Request(url)
    request.add_header('Accept-encoding', 'gzip')

    # load the last modified date from the file, if it exists
    last_mod_date = read_from_file(last_modified_file_name,
                                   MAX_LAST_MODIFIED_LENGTH)
    if last_mod_date is not None:
      request.add_header('If-modified-since', last_mod_date)

    # Parse last modified date
    last_mod = datestr_to_datetime(last_mod_date)

    # Not Modified and still recent enough to be useful
    # Onionoo / Globe used to use 6 hours, but we can afford a day
    required_freshness = datetime.datetime.utcnow()
    # strip any timezone out (to match dateutil.parser)
    required_freshness = required_freshness.replace(tzinfo=None)
    required_freshness -= datetime.timedelta(hours=24)

    # Make the OnionOO request
    response_code = 0
    try:
      response = urllib2.urlopen(request)
      response_code = response.getcode()
    except urllib2.HTTPError, error:
      response_code = error.code
      if response_code == 304: # not modified
        pass
      else:
        raise Exception("Could not get " + url + ": "
                        + str(error.code) + ": " + error.reason)

    if response_code == 200: # OK
      last_mod = datestr_to_datetime(response.info().get('Last-Modified'))

    # Check for freshness
    if last_mod < required_freshness:
      if last_mod_date is not None:
        # This check sometimes fails transiently, retry the script if it does
        date_message = "Outdated data: last updated " + last_mod_date
      else:
        date_message = "No data: never downloaded "
      raise Exception(date_message + " from " + url)

    # Process the data
    if response_code == 200: # OK

      response_json = load_possibly_compressed_response_json(response)

      with open(json_file_name, 'w') as f:
        # use the most compact json representation to save space
        json.dump(response_json, f, separators=(',',':'))

      # store the last modified date in its own file
      if response.info().get('Last-modified') is not None:
        write_to_file(response.info().get('Last-Modified'),
                      last_modified_file_name,
                      MAX_LAST_MODIFIED_LENGTH)

    elif response_code == 304: # Not Modified

      response_json = load_json_from_file(json_file_name)

    else: # Unexpected HTTP response code not covered in the HTTPError above
      raise Exception("Unexpected HTTP response code to " + url + ": "
                      + str(response_code))

  register_fetch_source(what,
                        url,
                        response_json['relays_published'],
                        response_json['version'])

  return response_json

def fetch(what, **kwargs):
  #x = onionoo_fetch(what, **kwargs)
  # don't use sort_keys, as the order of or_addresses is significant
  #print json.dumps(x, indent=4, separators=(',', ': '))
  #sys.exit(0)

  return onionoo_fetch(what, **kwargs)

## Fallback Candidate Class

class Candidate(object):
  CUTOFF_ADDRESS_AND_PORT_STABLE = (datetime.datetime.utcnow()
                        - datetime.timedelta(ADDRESS_AND_PORT_STABLE_DAYS))

  def __init__(self, details):
    for f in ['fingerprint', 'nickname', 'last_changed_address_or_port',
              'consensus_weight', 'or_addresses', 'dir_address']:
      if not f in details: raise Exception("Document has no %s field."%(f,))

    if not 'contact' in details:
      details['contact'] = None
    if not 'flags' in details or details['flags'] is None:
      details['flags'] = []
    if (not 'advertised_bandwidth' in details
        or details['advertised_bandwidth'] is None):
      # relays without advertised bandwidth have it calculated from their
      # consensus weight
      details['advertised_bandwidth'] = 0
    if (not 'effective_family' in details
        or details['effective_family'] is None):
      details['effective_family'] = []
    details['last_changed_address_or_port'] = parse_ts(
                                      details['last_changed_address_or_port'])
    self._data = details
    self._stable_sort_or_addresses()

    self._fpr = self._data['fingerprint']
    self._running = self._guard = self._v2dir = 0.
    self._split_dirport()
    self._compute_orport()
    if self.orport is None:
      raise Exception("Failed to get an orport for %s."%(self._fpr,))
    self._compute_ipv6addr()
    if not self.has_ipv6():
      logging.debug("Failed to get an ipv6 address for %s."%(self._fpr,))

  def _stable_sort_or_addresses(self):
    # replace self._data['or_addresses'] with a stable ordering,
    # sorting the secondary addresses in string order
    # leave the received order in self._data['or_addresses_raw']
    self._data['or_addresses_raw'] = self._data['or_addresses']
    or_address_primary = self._data['or_addresses'][:1]
    # subsequent entries in the or_addresses array are in an arbitrary order
    # so we stabilise the addresses by sorting them in string order
    or_addresses_secondaries_stable = sorted(self._data['or_addresses'][1:])
    or_addresses_stable = or_address_primary + or_addresses_secondaries_stable
    self._data['or_addresses'] = or_addresses_stable

  def get_fingerprint(self):
    return self._fpr

  # is_valid_ipv[46]_address by gsathya, karsten, 2013
  @staticmethod
  def is_valid_ipv4_address(address):
    if not isinstance(address, (str, unicode)):
      return False

    # check if there are four period separated values
    if address.count(".") != 3:
      return False

    # check that each octet is a decimal value between 0 and 255
    for entry in address.split("."):
      if not entry.isdigit() or int(entry) < 0 or int(entry) > 255:
        return False
      elif entry[0] == "0" and len(entry) > 1:
        return False # leading zeros, for instance in "1.2.3.001"

    return True
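
  # For example:
  #   is_valid_ipv4_address('1.2.3.4')   returns True
  #   is_valid_ipv4_address('1.2.3.001') returns False (leading zeros)
  #   is_valid_ipv4_address('1.2.3')     returns False (too few octets)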

  @staticmethod
  def is_valid_ipv6_address(address):
    if not isinstance(address, (str, unicode)):
      return False

    # remove brackets
    address = address[1:-1]

    # addresses are made up of eight colon separated groups of four hex digits
    # with leading zeros being optional
    # https://en.wikipedia.org/wiki/IPv6#Address_format

    colon_count = address.count(":")

    if colon_count > 7:
      return False # too many groups
    elif colon_count != 7 and not "::" in address:
      return False # not enough groups and none are collapsed
    elif address.count("::") > 1 or ":::" in address:
      return False # multiple groupings of zeros can't be collapsed

    found_ipv4_on_previous_entry = False
    for entry in address.split(":"):
      # If an IPv6 address has an embedded IPv4 address,
      # it must be the last entry
      if found_ipv4_on_previous_entry:
        return False
      if not re.match("^[0-9a-fA-F]{0,4}$", entry):
        if not Candidate.is_valid_ipv4_address(entry):
          return False
        else:
          found_ipv4_on_previous_entry = True

    return True
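
  # For example:
  #   is_valid_ipv6_address('[2001:db8::1]')    returns True
  #   is_valid_ipv6_address('[::ffff:1.2.3.4]') returns True (embedded IPv4)
  #   is_valid_ipv6_address('[1:2:3]')          returns False (too few groups)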

  def _split_dirport(self):
    # Split the dir_address into dirip and dirport
    (self.dirip, _dirport) = self._data['dir_address'].split(':', 2)
    self.dirport = int(_dirport)

  def _compute_orport(self):
    # Choose the first ORPort that's on the same IPv4 address as the DirPort.
    # In rare circumstances, this might not be the primary ORPort address.
    # However, _stable_sort_or_addresses() ensures we choose the same one
    # every time, even if onionoo changes the order of the secondaries.
    self._split_dirport()
    self.orport = None
    for i in self._data['or_addresses']:
      if i != self._data['or_addresses'][0]:
        logging.debug('Secondary IPv4 Address Used for %s: %s'%(self._fpr, i))
      (ipaddr, port) = i.rsplit(':', 1)
      if (ipaddr == self.dirip) and Candidate.is_valid_ipv4_address(ipaddr):
        self.orport = int(port)
        return

  def _compute_ipv6addr(self):
    # Choose the first IPv6 address that uses the same port as the ORPort
    # Or, choose the first IPv6 address in the list
    # _stable_sort_or_addresses() ensures we choose the same IPv6 address
    # every time, even if onionoo changes the order of the secondaries.
    self.ipv6addr = None
    self.ipv6orport = None
    # Choose the first IPv6 address that uses the same port as the ORPort
    for i in self._data['or_addresses']:
      (ipaddr, port) = i.rsplit(':', 1)
      # port is a string here, so convert it before comparing with orport
      if (int(port) == self.orport) and Candidate.is_valid_ipv6_address(ipaddr):
        self.ipv6addr = ipaddr
        self.ipv6orport = int(port)
        return
    # Choose the first IPv6 address in the list
    for i in self._data['or_addresses']:
      (ipaddr, port) = i.rsplit(':', 1)
      if Candidate.is_valid_ipv6_address(ipaddr):
        self.ipv6addr = ipaddr
        self.ipv6orport = int(port)
        return

  @staticmethod
  def _extract_generic_history(history, which='unknown'):
    # given a tree like this:
    #   {
    #     "1_month": {
    #       "count": 187,
    #       "factor": 0.001001001001001001,
    #       "first": "2015-02-27 06:00:00",
    #       "interval": 14400,
    #       "last": "2015-03-30 06:00:00",
    #       "values": [
    #         999,
    #         999
    #       ]
    #     },
    #     "1_week": {
    #       "count": 169,
    #       "factor": 0.001001001001001001,
    #       "first": "2015-03-23 07:30:00",
    #       "interval": 3600,
    #       "last": "2015-03-30 07:30:00",
    #       "values": [ ...]
    #     },
    #     "1_year": {
    #       "count": 177,
    #       "factor": 0.001001001001001001,
    #       "first": "2014-04-11 00:00:00",
    #       "interval": 172800,
    #       "last": "2015-03-29 00:00:00",
    #       "values": [ ...]
    #     },
    #     "3_months": {
    #       "count": 185,
    #       "factor": 0.001001001001001001,
    #       "first": "2014-12-28 06:00:00",
    #       "interval": 43200,
    #       "last": "2015-03-30 06:00:00",
    #       "values": [ ...]
    #     }
    #   },
    # extract exactly one piece of data per time interval,
    # using smaller intervals where available.
    #
    # returns list of (age, length, value) dictionaries.

    generic_history = []

    periods = history.keys()
    periods.sort(key = lambda x: history[x]['interval'])
    now = datetime.datetime.utcnow()
    newest = now
    for p in periods:
      h = history[p]
      interval = datetime.timedelta(seconds = h['interval'])
      this_ts = parse_ts(h['last'])

      if (len(h['values']) != h['count']):
        logging.warning('Inconsistent value count in %s document for %s'
                        %(p, which))
      for v in reversed(h['values']):
        if (this_ts <= newest):
          agt1 = now - this_ts
          agt2 = interval
          agetmp1 = (agt1.microseconds + (agt1.seconds + agt1.days * 24 * 3600)
                     * 10**6) / 10**6
          agetmp2 = (agt2.microseconds + (agt2.seconds + agt2.days * 24 * 3600)
                     * 10**6) / 10**6
          generic_history.append(
            { 'age': agetmp1,
              'length': agetmp2,
              'value': v
            })
          newest = this_ts
        this_ts -= interval

      if (this_ts + interval != parse_ts(h['first'])):
        logging.warning('Inconsistent time information in %s document for %s'
                        %(p, which))

    #print json.dumps(generic_history, sort_keys=True,
    #                 indent=4, separators=(',', ': '))
    return generic_history

  @staticmethod
  def _avg_generic_history(generic_history):
    a = []
    for i in generic_history:
      if i['age'] > (ADDRESS_AND_PORT_STABLE_DAYS * 24 * 3600):
        continue
      if (i['length'] is not None
          and i['age'] is not None
          and i['value'] is not None):
        w = i['length'] * math.pow(AGE_ALPHA, i['age']/(3600*24))
        a.append( (i['value'] * w, w) )

    sv = math.fsum(map(lambda x: x[0], a))
    sw = math.fsum(map(lambda x: x[1], a))

    if sw == 0.0:
      svw = 0.0
    else:
      svw = sv/sw
    return svw
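
  # In other words, this computes a weighted average of the history values,
  # where each entry's weight is its interval length scaled by
  # AGE_ALPHA ** (age in days). For example, with AGE_ALPHA = 0.99, a
  # 3600-second entry that is 30 days old has weight 3600 * 0.99**30,
  # about 74% of the weight of an equally long entry from right now.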

  # unused debugging code: print the sorted periods in a history document
  def _add_generic_history(self, history):
    periods = history.keys()
    periods.sort(key = lambda x: history[x]['interval'] )

    print periods

  def add_running_history(self, history):
    pass

  def add_uptime(self, uptime):
    logging.debug('Adding uptime for %s.'%(self._fpr,))

    # flags we care about: Running, V2Dir, Guard
    if not 'flags' in uptime:
      logging.debug('No flags in document for %s.'%(self._fpr,))
      return

    for f in ['Running', 'Guard', 'V2Dir']:
      if not f in uptime['flags']:
        logging.debug('No %s in flags for %s.'%(f, self._fpr,))
        return

    running = self._extract_generic_history(uptime['flags']['Running'],
                                            '%s-Running'%(self._fpr))
    guard = self._extract_generic_history(uptime['flags']['Guard'],
                                          '%s-Guard'%(self._fpr))
    v2dir = self._extract_generic_history(uptime['flags']['V2Dir'],
                                          '%s-V2Dir'%(self._fpr))
    if 'BadExit' in uptime['flags']:
      badexit = self._extract_generic_history(uptime['flags']['BadExit'],
                                              '%s-BadExit'%(self._fpr))

    self._running = self._avg_generic_history(running) / ONIONOO_SCALE_ONE
    self._guard = self._avg_generic_history(guard) / ONIONOO_SCALE_ONE
    self._v2dir = self._avg_generic_history(v2dir) / ONIONOO_SCALE_ONE
    self._badexit = None
    if 'BadExit' in uptime['flags']:
      self._badexit = self._avg_generic_history(badexit) / ONIONOO_SCALE_ONE

  def is_candidate(self):
    must_be_running_now = (PERFORM_IPV4_DIRPORT_CHECKS
                           or PERFORM_IPV6_DIRPORT_CHECKS)
    if (must_be_running_now and not self.is_running()):
      logging.info('%s not a candidate: not running now, unable to check ' +
                   'DirPort consensus download', self._fpr)
      return False
    if (self._data['last_changed_address_or_port'] >
        self.CUTOFF_ADDRESS_AND_PORT_STABLE):
      logging.info('%s not a candidate: changed address/port recently (%s)',
                   self._fpr, self._data['last_changed_address_or_port'])
      return False
    if self._running < CUTOFF_RUNNING:
      logging.info('%s not a candidate: running avg too low (%lf)',
                   self._fpr, self._running)
      return False
    if self._v2dir < CUTOFF_V2DIR:
      logging.info('%s not a candidate: v2dir avg too low (%lf)',
                   self._fpr, self._v2dir)
      return False
    if self._badexit is not None and self._badexit > PERMITTED_BADEXIT:
      logging.info('%s not a candidate: badexit avg too high (%lf)',
                   self._fpr, self._badexit)
      return False
    # if the relay doesn't report a version, also exclude the relay
    if (not self._data.has_key('recommended_version')
        or not self._data['recommended_version']):
      logging.info('%s not a candidate: version not recommended', self._fpr)
      return False
    if self._guard < CUTOFF_GUARD:
      logging.info('%s not a candidate: guard avg too low (%lf)',
                   self._fpr, self._guard)
      return False
    if (not self._data.has_key('consensus_weight')
        or self._data['consensus_weight'] < 1):
      logging.info('%s not a candidate: consensus weight invalid', self._fpr)
      return False
    return True

  def is_in_whitelist(self, relaylist):
    """ A fallback matches if each key in the whitelist line matches:
          ipv4
          dirport
          orport
          id
          ipv6 address and port (if present)
        If the fallback has an ipv6 key, the whitelist line must also have
        it, and vice versa, otherwise they don't match. """
    ipv6 = None
    if self.has_ipv6():
      ipv6 = '%s:%d'%(self.ipv6addr, self.ipv6orport)
    for entry in relaylist:
      if entry['id'] != self._fpr:
        # can't log here unless we match an IP and port, because every relay's
        # fingerprint is compared to every entry's fingerprint
        if entry['ipv4'] == self.dirip and int(entry['orport']) == self.orport:
          logging.warning('%s excluded: has OR %s:%d changed fingerprint to ' +
                          '%s?', entry['id'], self.dirip, self.orport,
                          self._fpr)
        if self.has_ipv6() and entry.has_key('ipv6') and entry['ipv6'] == ipv6:
          logging.warning('%s excluded: has OR %s changed fingerprint to ' +
                          '%s?', entry['id'], ipv6, self._fpr)
        continue
      if entry['ipv4'] != self.dirip:
        logging.warning('%s excluded: has it changed IPv4 from %s to %s?',
                        self._fpr, entry['ipv4'], self.dirip)
        continue
      if int(entry['dirport']) != self.dirport:
        logging.warning('%s excluded: has it changed DirPort from %s:%d to ' +
                        '%s:%d?', self._fpr, self.dirip, int(entry['dirport']),
                        self.dirip, self.dirport)
        continue
      if int(entry['orport']) != self.orport:
        logging.warning('%s excluded: has it changed ORPort from %s:%d to ' +
                        '%s:%d?', self._fpr, self.dirip, int(entry['orport']),
                        self.dirip, self.orport)
        continue
      if entry.has_key('ipv6') and self.has_ipv6():
        # if both entry and fallback have an ipv6 address, compare them
        if entry['ipv6'] != ipv6:
          logging.warning('%s excluded: has it changed IPv6 ORPort from %s ' +
                          'to %s?', self._fpr, entry['ipv6'], ipv6)
          continue
      # if the fallback has an IPv6 address but the whitelist entry
      # doesn't, or vice versa, the whitelist entry doesn't match
      elif entry.has_key('ipv6') and not self.has_ipv6():
        logging.warning('%s excluded: has it lost its former IPv6 address %s?',
                        self._fpr, entry['ipv6'])
        continue
      elif not entry.has_key('ipv6') and self.has_ipv6():
        logging.warning('%s excluded: has it gained an IPv6 address %s?',
                        self._fpr, ipv6)
        continue
      return True
    return False

  def is_in_blacklist(self, relaylist):
    """ A fallback matches a blacklist line if a sufficiently specific group
        of attributes matches:
          ipv4 & dirport
          ipv4 & orport
          id
          ipv6 & dirport
          ipv6 & ipv6 orport
        If the fallback and the blacklist line both have an ipv6 key,
        their values will be compared, otherwise, they will be ignored.
        If there is no dirport and no orport, the entry matches all relays on
        that ip. """
    for entry in relaylist:
      for key in entry:
        value = entry[key]
        if key == 'id' and value == self._fpr:
          logging.info('%s is in the blacklist: fingerprint matches',
                       self._fpr)
          return True
        if key == 'ipv4' and value == self.dirip:
          # if the dirport is present, check it too
          if entry.has_key('dirport'):
            if int(entry['dirport']) == self.dirport:
              logging.info('%s is in the blacklist: IPv4 (%s) and ' +
                           'DirPort (%d) match', self._fpr, self.dirip,
                           self.dirport)
              return True
          # if the orport is present, check it too
          elif entry.has_key('orport'):
            if int(entry['orport']) == self.orport:
              logging.info('%s is in the blacklist: IPv4 (%s) and ' +
                           'ORPort (%d) match', self._fpr, self.dirip,
                           self.orport)
              return True
          else:
            logging.info('%s is in the blacklist: IPv4 (%s) matches, and ' +
                         'entry has no DirPort or ORPort', self._fpr,
                         self.dirip)
            return True
        ipv6 = None
        if self.has_ipv6():
          ipv6 = '%s:%d'%(self.ipv6addr, self.ipv6orport)
        if (key == 'ipv6' and self.has_ipv6()):
          # if both entry and fallback have an ipv6 address, compare them,
          # otherwise, disregard ipv6 addresses
          if value == ipv6:
            # if the dirport is present, check it too
            if entry.has_key('dirport'):
              if int(entry['dirport']) == self.dirport:
                logging.info('%s is in the blacklist: IPv6 (%s) and ' +
                             'DirPort (%d) match', self._fpr, ipv6,
                             self.dirport)
                return True
            # we've already checked the ORPort, it's part of entry['ipv6']
            else:
              logging.info('%s is in the blacklist: IPv6 (%s) matches, and ' +
                           'entry has no DirPort', self._fpr, ipv6)
              return True
        elif (key == 'ipv6' or self.has_ipv6()):
          # only log if the fingerprint matches but the IPv6 doesn't
          if entry.has_key('id') and entry['id'] == self._fpr:
            logging.info('%s skipping IPv6 blacklist comparison: relay ' +
                         'has%s IPv6%s, but entry has%s IPv6%s', self._fpr,
                         '' if self.has_ipv6() else ' no',
                         (' (' + ipv6 + ')') if self.has_ipv6() else '',
                         '' if key == 'ipv6' else ' no',
                         (' (' + value + ')') if key == 'ipv6' else '')
            logging.warning('Has %s %s IPv6 address %s?', self._fpr,
                            'gained an' if self.has_ipv6()
                            else 'lost its former',
                            ipv6 if self.has_ipv6() else value)
    return False

  def cw_to_bw_factor(self):
    # any relays with a missing or zero consensus weight are not candidates
    # any relays with a missing advertised bandwidth have it set to zero
    return self._data['advertised_bandwidth'] / self._data['consensus_weight']

  # since advertised_bandwidth is reported by the relay, it can be gamed
  # to avoid this, use the median consensus weight to bandwidth factor to
  # estimate this relay's measured bandwidth, and make that the upper limit
  def measured_bandwidth(self, median_cw_to_bw_factor):
    cw_to_bw = median_cw_to_bw_factor
    # Reduce exit bandwidth to make sure we're not overloading them
    if self.is_exit():
      cw_to_bw *= EXIT_BANDWIDTH_FRACTION
    measured_bandwidth = self._data['consensus_weight'] * cw_to_bw
    if self._data['advertised_bandwidth'] != 0:
      # limit advertised bandwidth (if available) to measured bandwidth
      return min(measured_bandwidth, self._data['advertised_bandwidth'])
    else:
      return measured_bandwidth

  def set_measured_bandwidth(self, median_cw_to_bw_factor):
    self._data['measured_bandwidth'] = self.measured_bandwidth(
        median_cw_to_bw_factor)

  def is_exit(self):
    return 'Exit' in self._data['flags']

  def is_guard(self):
    return 'Guard' in self._data['flags']

  def is_running(self):
    return 'Running' in self._data['flags']

  # does this fallback have an IPv6 address and orport?
  def has_ipv6(self):
    return self.ipv6addr is not None and self.ipv6orport is not None

  # strip leading and trailing brackets from an IPv6 address
  # safe to use on non-bracketed IPv6 and on IPv4 addresses
  # also convert to unicode, and make None appear as ''
  @staticmethod
  def strip_ipv6_brackets(ip):
    if ip is None:
      return unicode('')
    if len(ip) < 2:
      return unicode(ip)
    if ip[0] == '[' and ip[-1] == ']':
      return unicode(ip[1:-1])
    return unicode(ip)
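
  # For example:
  #   strip_ipv6_brackets('[2001:db8::1]') returns u'2001:db8::1'
  #   strip_ipv6_brackets('1.2.3.4')       returns u'1.2.3.4' (unchanged)
  #   strip_ipv6_brackets(None)            returns u''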

  # are ip_a and ip_b in the same netblock?
  # mask_bits is the size of the netblock
  # takes both IPv4 and IPv6 addresses
  # the versions of ip_a and ip_b must be the same
  # the mask must be valid for the IP version
  @staticmethod
  def netblocks_equal(ip_a, ip_b, mask_bits):
    if ip_a is None or ip_b is None:
      return False
    ip_a = Candidate.strip_ipv6_brackets(ip_a)
    ip_b = Candidate.strip_ipv6_brackets(ip_b)
    a = ipaddress.ip_address(ip_a)
    b = ipaddress.ip_address(ip_b)
    if a.version != b.version:
      raise Exception('Mismatching IP versions in %s and %s'%(ip_a, ip_b))
    if mask_bits > a.max_prefixlen:
      logging.error('Bad IP mask %d for %s and %s'%(mask_bits, ip_a, ip_b))
      mask_bits = a.max_prefixlen
    if mask_bits < 0:
      logging.error('Bad IP mask %d for %s and %s'%(mask_bits, ip_a, ip_b))
      mask_bits = 0
    a_net = ipaddress.ip_network('%s/%d'%(ip_a, mask_bits), strict=False)
    return b in a_net
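
  # For example, netblocks_equal('1.2.3.4', '1.2.255.255', 16) returns True,
  # because both addresses are in 1.2.0.0/16, while a /24 mask returns False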

  # is this fallback's IPv4 address (dirip) in the same netblock as other's
  # IPv4 address?
  # mask_bits is the size of the netblock
  def ipv4_netblocks_equal(self, other, mask_bits):
    return Candidate.netblocks_equal(self.dirip, other.dirip, mask_bits)

  # is this fallback's IPv6 address (ipv6addr) in the same netblock as
  # other's IPv6 address?
  # Returns False if either fallback has no IPv6 address
  # mask_bits is the size of the netblock
  def ipv6_netblocks_equal(self, other, mask_bits):
    if not self.has_ipv6() or not other.has_ipv6():
      return False
    return Candidate.netblocks_equal(self.ipv6addr, other.ipv6addr, mask_bits)

  # is this fallback's IPv4 DirPort the same as other's IPv4 DirPort?
  def dirport_equal(self, other):
    return self.dirport == other.dirport

  # is this fallback's IPv4 ORPort the same as other's IPv4 ORPort?
  def ipv4_orport_equal(self, other):
    return self.orport == other.orport

  # is this fallback's IPv6 ORPort the same as other's IPv6 ORPort?
  # Returns False if either fallback has no IPv6 address
  def ipv6_orport_equal(self, other):
    if not self.has_ipv6() or not other.has_ipv6():
      return False
    return self.ipv6orport == other.ipv6orport

  # does this fallback have the same DirPort, IPv4 ORPort, or
  # IPv6 ORPort as other?
  # Ignores IPv6 ORPort if either fallback has no IPv6 address
  def port_equal(self, other):
    return (self.dirport_equal(other) or self.ipv4_orport_equal(other)
            or self.ipv6_orport_equal(other))

  # return a list containing IPv4 ORPort, DirPort, and IPv6 ORPort (if present)
  def port_list(self):
    ports = [self.dirport, self.orport]
    if self.has_ipv6() and not self.ipv6orport in ports:
      ports.append(self.ipv6orport)
    return ports

  # does this fallback share a port with other, regardless of whether the
  # port types match?
  # For example, if self's IPv4 ORPort is 80 and other's DirPort is 80,
  # return True
  def port_shared(self, other):
    for p in self.port_list():
      if p in other.port_list():
        return True
    return False

  # report how long it takes to download a consensus from dirip:dirport
  @staticmethod
  def fallback_consensus_download_speed(dirip, dirport, nickname, max_time):
    download_failed = False
    downloader = DescriptorDownloader()
    start = datetime.datetime.utcnow()
    # some directory mirrors respond to requests in ways that hang python
    # sockets, which is why we log this line here
    logging.info('Initiating consensus download from %s (%s:%d).', nickname,
                 dirip, dirport)
    # there appears to be about 1 second of overhead when comparing stem's
    # internal trace time and the elapsed time calculated here
    TIMEOUT_SLOP = 1.0
    try:
      downloader.get_consensus(endpoints = [(dirip, dirport)],
                               timeout = (max_time + TIMEOUT_SLOP),
                               validate = True,
                               retries = 0,
                               fall_back_to_authority = False).run()
    except Exception, stem_error:
      logging.info('Unable to retrieve a consensus from %s: %s', nickname,
                   stem_error)
      status = 'error: "%s"' % (stem_error)
      level = logging.WARNING
      download_failed = True
    elapsed = (datetime.datetime.utcnow() - start).total_seconds()
    if elapsed > max_time:
      status = 'too slow'
      level = logging.WARNING
      download_failed = True
    elif not download_failed:
      # only report 'ok' if the download actually succeeded,
      # otherwise keep the error status set above
      status = 'ok'
      level = logging.DEBUG
    logging.log(level, 'Consensus download: %0.1fs %s from %s (%s:%d), ' +
                'max download time %0.1fs.', elapsed, status, nickname,
                dirip, dirport, max_time)
    return download_failed

  # does this fallback download the consensus fast enough?
  def check_fallback_download_consensus(self):
    # include the relay if we're not doing a check, or we can't check (IPv6)
    ipv4_failed = False
    ipv6_failed = False
    if PERFORM_IPV4_DIRPORT_CHECKS:
      ipv4_failed = Candidate.fallback_consensus_download_speed(self.dirip,
                                                self.dirport,
                                                self._data['nickname'],
                                                CONSENSUS_DOWNLOAD_SPEED_MAX)
    if self.has_ipv6() and PERFORM_IPV6_DIRPORT_CHECKS:
      # Clients assume the IPv6 DirPort is the same as the IPv4 DirPort
      ipv6_failed = Candidate.fallback_consensus_download_speed(self.ipv6addr,
                                                self.dirport,
                                                self._data['nickname'],
                                                CONSENSUS_DOWNLOAD_SPEED_MAX)
    return ((not ipv4_failed) and (not ipv6_failed))

  # if this fallback has not passed a download check, try it again,
  # and record the result, available in get_fallback_download_consensus
  def try_fallback_download_consensus(self):
    if not self.get_fallback_download_consensus():
      self._data['download_check'] = self.check_fallback_download_consensus()

  # did this fallback pass the download check?
  def get_fallback_download_consensus(self):
    # if we're not performing checks, return True
    if not PERFORM_IPV4_DIRPORT_CHECKS and not PERFORM_IPV6_DIRPORT_CHECKS:
      return True
    # if we are performing checks, but haven't done one, return False
    if not self._data.has_key('download_check'):
      return False
    return self._data['download_check']

  # output an optional header comment and info for this fallback
  # try_fallback_download_consensus before calling this
  def fallbackdir_line(self, fallbacks, prefilter_fallbacks):
    s = ''
    if OUTPUT_COMMENTS:
      s += self.fallbackdir_comment(fallbacks, prefilter_fallbacks)
    # if the download speed is ok, output a C string
    # if it's not, but we OUTPUT_COMMENTS, output a commented-out C string
    if self.get_fallback_download_consensus() or OUTPUT_COMMENTS:
      s += self.fallbackdir_info(self.get_fallback_download_consensus())
    return s

  # output a header comment for this fallback
  def fallbackdir_comment(self, fallbacks, prefilter_fallbacks):
    # /*
    # nickname
    # flags
    # [contact]
    # [identical contact counts]
    # */
    # Multiline C comment
    s = '/*'
    s += '\n'
    s += cleanse_c_multiline_comment(self._data['nickname'])
    s += '\n'
    s += 'Flags: '
    s += cleanse_c_multiline_comment(' '.join(sorted(self._data['flags'])))
    s += '\n'
    if self._data['contact'] is not None:
      s += cleanse_c_multiline_comment(self._data['contact'])
      if CONTACT_COUNT or CONTACT_BLACKLIST_COUNT:
        fallback_count = len([f for f in fallbacks
                              if f._data['contact'] == self._data['contact']])
        if fallback_count > 1:
          s += '\n'
          s += '%d identical contacts listed' % (fallback_count)
        if CONTACT_BLACKLIST_COUNT:
          prefilter_count = len([f for f in prefilter_fallbacks
                             if f._data['contact'] == self._data['contact']])
          filter_count = prefilter_count - fallback_count
          if filter_count > 0:
            if fallback_count > 1:
              s += ' '
            else:
              s += '\n'
            s += '%d blacklisted' % (filter_count)
      s += '\n'
    s += '*/'
    s += '\n'
    return s

  # output the fallback info C string for this fallback
  # this is the text that would go after FallbackDir in a torrc
  # if this relay failed the download test and we OUTPUT_COMMENTS,
  # comment-out the returned string
  def fallbackdir_info(self, dl_speed_ok):
    # "address:dirport orport=port id=fingerprint"
    # "[ipv6=addr:orport]"
    # "weight=FALLBACK_OUTPUT_WEIGHT",

    # Do we want a C string, or a commented-out string?
    c_string = dl_speed_ok
    comment_string = not dl_speed_ok and OUTPUT_COMMENTS
    # If we don't want either kind of string, bail
    if not c_string and not comment_string:
      return ''
    s = ''
    # Comment out the fallback directory entry if it's too slow
    # See the debug output for which address and port is failing
    if comment_string:
      s += '/* Consensus download failed or was too slow:\n'
    # Multi-Line C string with trailing comma (part of a string list)
    # This makes it easier to diff the file, and remove IPv6 lines using grep
    # Integers don't need escaping
    s += '"%s orport=%d id=%s"'%(
            cleanse_c_string(self._data['dir_address']),
            self.orport,
            cleanse_c_string(self._fpr))
    s += '\n'
    if self.has_ipv6():
      s += '" ipv6=%s:%d"'%(cleanse_c_string(self.ipv6addr), self.ipv6orport)
      s += '\n'
    s += '" weight=%d",'%(FALLBACK_OUTPUT_WEIGHT)
    if comment_string:
      s += '\n'
      s += '*/'
    return s
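
  # For a relay with an IPv6 ORPort, the output looks like this
  # (illustrative values):
  #   "1.2.3.4:80 orport=443 id=0123456789ABCDEF0123456789ABCDEF01234567"
  #   " ipv6=[2001:db8::1]:443"
  #   " weight=10",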

## Fallback Candidate List Class

class CandidateList(dict):
  def __init__(self):
    pass

  def _add_relay(self, details):
    if not 'dir_address' in details: return
    c = Candidate(details)
    self[ c.get_fingerprint() ] = c

  def _add_uptime(self, uptime):
    try:
      fpr = uptime['fingerprint']
    except KeyError:
      raise Exception("Document has no fingerprint field.")

    try:
      c = self[fpr]
    except KeyError:
      logging.debug('Got unknown relay %s in uptime document.'%(fpr,))
      return

    c.add_uptime(uptime)

  def _add_details(self):
    logging.debug('Loading details document.')
    d = fetch('details',
        fields=('fingerprint,nickname,contact,last_changed_address_or_port,' +
                'consensus_weight,advertised_bandwidth,or_addresses,' +
                'dir_address,recommended_version,flags,effective_family'))
    logging.debug('Loading details document done.')

    if not 'relays' in d: raise Exception("No relays found in document.")

    for r in d['relays']: self._add_relay(r)

  def _add_uptimes(self):
    logging.debug('Loading uptime document.')
    d = fetch('uptime')
    logging.debug('Loading uptime document done.')

    if not 'relays' in d: raise Exception("No relays found in document.")

    for r in d['relays']: self._add_uptime(r)

  def add_relays(self):
    self._add_details()
    self._add_uptimes()

  def count_guards(self):
    guard_count = 0
    for fpr in self.keys():
      if self[fpr].is_guard():
        guard_count += 1
    return guard_count

  # Find fallbacks that fit the uptime, stability, and flags criteria,
  # and make an array of them in self.fallbacks
  def compute_fallbacks(self):
    self.fallbacks = map(lambda x: self[x],
                         filter(lambda x: self[x].is_candidate(),
                                self.keys()))

  # sort fallbacks by their consensus weight to advertised bandwidth factor,
  # lowest to highest
  # used to find the median cw_to_bw_factor()
  def sort_fallbacks_by_cw_to_bw_factor(self):
    self.fallbacks.sort(key=lambda f: f.cw_to_bw_factor())

  # sort fallbacks by their measured bandwidth, highest to lowest
  # calculate_measured_bandwidth before calling this
  # this is useful for reviewing candidates in priority order
  def sort_fallbacks_by_measured_bandwidth(self):
    self.fallbacks.sort(key=lambda f: f._data['measured_bandwidth'],
                        reverse=True)

  # sort fallbacks by their fingerprint, lowest to highest
  # this is useful for stable diffs of fallback lists
  def sort_fallbacks_by_fingerprint(self):
    self.fallbacks.sort(key=lambda f: f._fpr)

  @staticmethod
  def load_relaylist(file_name):
    """ Read each line in the file, and parse it like a FallbackDir line:
        an IPv4 address and optional port:
          <IPv4 address>:<port>
        which are parsed into dictionary entries:
          ipv4=<IPv4 address>
          dirport=<port>
        followed by a series of key=value entries:
          orport=<port>
          id=<fingerprint>
          ipv6=<IPv6 address>:<IPv6 orport>
        each line's key/value pairs are placed in a dictionary
        (of string -> string key/value pairs),
        and these dictionaries are placed in an array.
        comments start with # and are ignored """
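    # For example (illustrative values), the line:
    #   1.2.3.4:80 orport=443 id=0123456789ABCDEF0123456789ABCDEF01234567
    # is parsed into:
    #   {'ipv4': '1.2.3.4', 'dirport': '80', 'orport': '443',
    #    'id': '0123456789ABCDEF0123456789ABCDEF01234567'}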
    relaylist = []
    file_data = read_from_file(file_name, MAX_LIST_FILE_SIZE)
    if file_data is None:
      return relaylist
    for line in file_data.split('\n'):
      relay_entry = {}
      # ignore comments
      line_comment_split = line.split('#')
      line = line_comment_split[0]
      # cleanup whitespace
      line = cleanse_whitespace(line)
      line = line.strip()
      if len(line) == 0:
        continue
      for item in line.split(' '):
        item = item.strip()
        if len(item) == 0:
          continue
        key_value_split = item.split('=')
        kvl = len(key_value_split)
        if kvl < 1 or kvl > 2:
          print '#error Bad %s item: %s, format is key=value.'%(
                                                          file_name, item)
        if kvl == 1:
          # assume that entries without a key are the ipv4 address,
          # perhaps with a dirport
          ipv4_maybe_dirport = key_value_split[0]
          ipv4_maybe_dirport_split = ipv4_maybe_dirport.split(':')
          dirl = len(ipv4_maybe_dirport_split)
          if dirl < 1 or dirl > 2:
            print '#error Bad %s IPv4 item: %s, format is ipv4:port.'%(
                                                          file_name, item)
          if dirl >= 1:
            relay_entry['ipv4'] = ipv4_maybe_dirport_split[0]
          if dirl == 2:
            relay_entry['dirport'] = ipv4_maybe_dirport_split[1]
        elif kvl == 2:
          relay_entry[key_value_split[0]] = key_value_split[1]
      relaylist.append(relay_entry)
    return relaylist

  # apply the fallback whitelist and blacklist
  def apply_filter_lists(self):
    excluded_count = 0
    logging.debug('Applying whitelist and blacklist.')
    # parse the whitelist and blacklist
    whitelist = self.load_relaylist(WHITELIST_FILE_NAME)
    blacklist = self.load_relaylist(BLACKLIST_FILE_NAME)
    filtered_fallbacks = []
    for f in self.fallbacks:
      in_whitelist = f.is_in_whitelist(whitelist)
      in_blacklist = f.is_in_blacklist(blacklist)
      if in_whitelist and in_blacklist:
        if BLACKLIST_EXCLUDES_WHITELIST_ENTRIES:
          # exclude
          excluded_count += 1
          logging.warning('Excluding %s: in both blacklist and whitelist.',
                          f._fpr)
        else:
          # include
          filtered_fallbacks.append(f)
      elif in_whitelist:
        # include
        filtered_fallbacks.append(f)
      elif in_blacklist:
        # exclude
        excluded_count += 1
        logging.info('Excluding %s: in blacklist.', f._fpr)
      else:
        if INCLUDE_UNLISTED_ENTRIES:
          # include
          filtered_fallbacks.append(f)
        else:
          # exclude
          excluded_count += 1
          logging.info('Excluding %s: in neither blacklist nor whitelist.',
                       f._fpr)
    self.fallbacks = filtered_fallbacks
    return excluded_count

  @staticmethod
  def summarise_filters(initial_count, excluded_count):
    return '/* Whitelist & blacklist excluded %d of %d candidates. */'%(
              excluded_count, initial_count)

  # calculate each fallback's measured bandwidth based on the median
  # consensus weight to advertised bandwidth ratio
  def calculate_measured_bandwidth(self):
    self.sort_fallbacks_by_cw_to_bw_factor()
    median_fallback = self.fallback_median(True)
    if median_fallback is not None:
      median_cw_to_bw_factor = median_fallback.cw_to_bw_factor()
    else:
      # this will never be used, because there are no fallbacks
      median_cw_to_bw_factor = None
    for f in self.fallbacks:
      f.set_measured_bandwidth(median_cw_to_bw_factor)

  # remove relays with low measured bandwidth from the fallback list
  # calculate_measured_bandwidth for each relay before calling this
  def remove_low_bandwidth_relays(self):
    if MIN_BANDWIDTH is None:
      return
    above_min_bw_fallbacks = []
    for f in self.fallbacks:
      if f._data['measured_bandwidth'] >= MIN_BANDWIDTH:
        above_min_bw_fallbacks.append(f)
      else:
        # the bandwidth we log here is limited by the relay's consensus weight
        # as well as its advertised bandwidth. See set_measured_bandwidth
        # for details
        logging.info('%s not a candidate: bandwidth %.1fMB/s too low, must ' +
                     'be at least %.1fMB/s', f._fpr,
                     f._data['measured_bandwidth']/(1024.0*1024.0),
                     MIN_BANDWIDTH/(1024.0*1024.0))
    self.fallbacks = above_min_bw_fallbacks

  # the minimum fallback in the list
  # call one of the sort_fallbacks_* functions before calling this
  def fallback_min(self):
    if len(self.fallbacks) > 0:
      return self.fallbacks[-1]
    else:
      return None

  # the median fallback in the list
  # call one of the sort_fallbacks_* functions before calling this
  def fallback_median(self, require_advertised_bandwidth):
    # use the low-median when there are an even number of fallbacks,
    # for consistency with the bandwidth authorities
    if len(self.fallbacks) > 0:
      median_position = (len(self.fallbacks) - 1) / 2
      if not require_advertised_bandwidth:
        return self.fallbacks[median_position]
      # if we need advertised_bandwidth but this relay doesn't have it,
      # move to a fallback with greater consensus weight until we find one
      while not self.fallbacks[median_position]._data['advertised_bandwidth']:
        median_position += 1
        if median_position >= len(self.fallbacks):
          return None
      return self.fallbacks[median_position]
    else:
      return None

  # the maximum fallback in the list
  # call one of the sort_fallbacks_* functions before calling this
  def fallback_max(self):
    if len(self.fallbacks) > 0:
      return self.fallbacks[0]
    else:
      return None

  # does exclusion_list contain attribute?
  # if so, return False
  # if not, return True
  # if attribute is None or the empty string, always return True
  @staticmethod
  def allow(attribute, exclusion_list):
    if attribute is None or attribute == '':
      return True
    elif attribute in exclusion_list:
      return False
    else:
      return True

  # make sure there is only one fallback per IPv4 address, and per IPv6 address
  # there is only one IPv4 address on each fallback: the IPv4 DirPort address
  # (we choose the IPv4 ORPort which is on the same IPv4 as the DirPort)
  # there is at most one IPv6 address on each fallback: the IPv6 ORPort address
  # we try to match the IPv4 ORPort, but will use any IPv6 address if needed
  # (clients assume the IPv6 DirPort is the same as the IPv4 DirPort, but
  # typically only use the IPv6 ORPort)
  # if there is no IPv6 address, only the IPv4 address is checked
  # return the number of candidates we excluded
  def limit_fallbacks_same_ip(self):
    ip_limit_fallbacks = []
    ip_list = []
    for f in self.fallbacks:
      if (CandidateList.allow(f.dirip, ip_list)
          and CandidateList.allow(f.ipv6addr, ip_list)):
        ip_limit_fallbacks.append(f)
        ip_list.append(f.dirip)
        if f.has_ipv6():
          ip_list.append(f.ipv6addr)
      elif not CandidateList.allow(f.dirip, ip_list):
        logging.info('Eliminated %s: already have fallback on IPv4 %s'%(
                                                        f._fpr, f.dirip))
      elif f.has_ipv6() and not CandidateList.allow(f.ipv6addr, ip_list):
        logging.info('Eliminated %s: already have fallback on IPv6 %s'%(
                                                        f._fpr, f.ipv6addr))
    original_count = len(self.fallbacks)
    self.fallbacks = ip_limit_fallbacks
    return original_count - len(self.fallbacks)

  # make sure there is only one fallback per ContactInfo
  # if there is no ContactInfo, allow the fallback
  # this check can be gamed by providing no ContactInfo, or by setting the
  # ContactInfo to match another fallback
  # However, given the likelihood that relays with the same ContactInfo will
  # go down at similar times, its usefulness outweighs the risk
  def limit_fallbacks_same_contact(self):
    contact_limit_fallbacks = []
    contact_list = []
    for f in self.fallbacks:
      if CandidateList.allow(f._data['contact'], contact_list):
        contact_limit_fallbacks.append(f)
        contact_list.append(f._data['contact'])
      else:
        logging.info(('Eliminated %s: already have fallback on ' +
                      'ContactInfo %s')%(f._fpr, f._data['contact']))
    original_count = len(self.fallbacks)
    self.fallbacks = contact_limit_fallbacks
    return original_count - len(self.fallbacks)
1535 # make sure there is only one fallback per effective family
1536 # if there is no family, allow the fallback
1537 # this check can't be gamed, because we use effective family, which ensures
1538 # mutual family declarations
1539 # if any indirect families exist, the result depends on the order in which
1540 # fallbacks are sorted in the list
1541 def limit_fallbacks_same_family(self):
1542 family_limit_fallbacks = []
1543 fingerprint_list = []
1544 for f in self.fallbacks:
1545 if CandidateList.allow(f._fpr, fingerprint_list):
1546 family_limit_fallbacks.append(f)
1547 fingerprint_list.append(f._fpr)
1548 fingerprint_list.extend(f._data['effective_family'])
1549 else:
1550 # technically, we already have a fallback that has this relay in its
1551 # effective family
1552 logging.info(('Eliminated %s: already have fallback in effective ' +
1553 'family')%(f._fpr))
1554 original_count = len(self.fallbacks)
1555 self.fallbacks = family_limit_fallbacks
1556 return original_count - len(self.fallbacks)
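# Illustrative example for limit_fallbacks_same_family() above
# (hypothetical fingerprints): if fallback A declares B in its effective
# family, keeping A adds both A's fingerprint and B's to
# fingerprint_list, so B is eliminated when it is reached later in the
# list.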
1558 # try a download check on each fallback candidate in order
1559 # stop after max_count successful downloads
1560 # but don't remove any candidates from the array
1561 def try_download_consensus_checks(self, max_count):
1562 dl_ok_count = 0
1563 for f in self.fallbacks:
1564 f.try_fallback_download_consensus()
1565 if f.get_fallback_download_consensus():
1566 # this fallback downloaded a consensus ok
1567 dl_ok_count += 1
1568 if dl_ok_count >= max_count:
1569 # we have enough fallbacks
1570 return
1572 # put max_count successful candidates in the fallbacks array:
1573 # - perform download checks on each fallback candidate
1574 # - retry failed candidates if CONSENSUS_DOWNLOAD_RETRY is set
1575 # - eliminate failed candidates
1576 # - if there are more than max_count candidates, eliminate lowest bandwidth
1577 # - if there are fewer than max_count candidates, keep only the successful ones
1578 # Return the number of fallbacks that failed the consensus check
1579 def perform_download_consensus_checks(self, max_count):
1580 self.sort_fallbacks_by_measured_bandwidth()
1581 self.try_download_consensus_checks(max_count)
1582 if CONSENSUS_DOWNLOAD_RETRY:
1583 # try unsuccessful candidates again
1584 # we could end up with more than max_count successful candidates here
1585 self.try_download_consensus_checks(max_count)
1586 # now we have at least max_count successful candidates,
1587 # or we've tried them all
1588 original_count = len(self.fallbacks)
1589 self.fallbacks = filter(lambda x: x.get_fallback_download_consensus(),
1590 self.fallbacks)
1591 # some of these failed the check; others skipped the check because
1592 # we already had enough successful downloads
1593 failed_count = original_count - len(self.fallbacks)
1594 self.fallbacks = self.fallbacks[:max_count]
1595 return failed_count
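# Illustrative walk-through of perform_download_consensus_checks() above
# (hypothetical counts): with 10 candidates and max_count = 5, the first
# pass stops as soon as 5 downloads succeed; if CONSENSUS_DOWNLOAD_RETRY
# is set, unsuccessful candidates are tried again; candidates that failed
# or were never checked are then filtered out, and the surviving list is
# truncated to at most 5 entries.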
1597 # return a string that describes a/b as a percentage
1598 @staticmethod
1599 def describe_percentage(a, b):
1600 if b != 0:
1601 return '%d/%d = %.0f%%'%(a, b, (a*100.0)/b)
1602 else:
1603 # technically, 0/0 is undefined, but 0.0% is a sensible result
1604 return '%d/%d = %.0f%%'%(a, b, 0.0)
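# Illustrative outputs of describe_percentage() above:
#   describe_percentage(1, 4) -> '1/4 = 25%'
#   describe_percentage(0, 0) -> '0/0 = 0%'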
1606 # return a dictionary of lists of fallbacks by IPv4 netblock
1607 # the dictionary is keyed by the fingerprint of an arbitrary fallback
1608 # in each netblock
1609 # mask_bits is the size of the netblock
1610 def fallbacks_by_ipv4_netblock(self, mask_bits):
1611 netblocks = {}
1612 for f in self.fallbacks:
1613 found_netblock = False
1614 for b in netblocks.keys():
1615 # check whether an existing netblock contains this fallback
1616 if f.ipv4_netblocks_equal(self[b], mask_bits):
1617 # add it to the list
1618 netblocks[b].append(f)
1619 found_netblock = True
1620 break
1621 # make a new netblock based on this fallback's fingerprint
1622 if not found_netblock:
1623 netblocks[f._fpr] = [f]
1624 return netblocks
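# Illustrative shape of the dictionary returned by
# fallbacks_by_ipv4_netblock() above (hypothetical fingerprints):
#   { 'FPR_A': [fallback_a, fallback_b],  # a and b share a netblock
#     'FPR_C': [fallback_c] }             # c is alone in its netblock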
1626 # return a dictionary of lists of fallbacks by IPv6 netblock
1627 # where mask_bits is the size of the netblock
1628 def fallbacks_by_ipv6_netblock(self, mask_bits):
1629 netblocks = {}
1630 for f in self.fallbacks:
1631 # skip fallbacks without IPv6 addresses
1632 if not f.has_ipv6():
1633 continue
1634 found_netblock = False
1635 for b in netblocks.keys():
1636 # check whether an existing netblock contains this fallback
1637 if f.ipv6_netblocks_equal(self[b], mask_bits):
1638 # add it to the list
1639 netblocks[b].append(f)
1640 found_netblock = True
1641 break
1642 # make a new netblock based on this fallback's fingerprint
1643 if not found_netblock:
1644 netblocks[f._fpr] = [f]
1645 return netblocks
1647 # log a message about the proportion of fallbacks in each IPv4 netblock,
1648 # where mask_bits is the size of the netblock
1649 def describe_fallback_ipv4_netblock_mask(self, mask_bits):
1650 fallback_count = len(self.fallbacks)
1651 shared_netblock_fallback_count = 0
1652 most_frequent_netblock = None
1653 netblocks = self.fallbacks_by_ipv4_netblock(mask_bits)
1654 for b in netblocks.keys():
1655 if len(netblocks[b]) > 1:
1656 # how many fallbacks are in a netblock with other fallbacks?
1657 shared_netblock_fallback_count += len(netblocks[b])
1658 # what's the netblock with the most fallbacks?
1659 if (most_frequent_netblock is None
1660 or len(netblocks[b]) > len(netblocks[most_frequent_netblock])):
1661 most_frequent_netblock = b
1662 logging.debug('Fallback IPv4 addresses in the same /%d:'%(mask_bits))
1663 for f in netblocks[b]:
1664 logging.debug('%s - %s', f.dirip, f._fpr)
1665 if most_frequent_netblock is not None:
1666 logging.warning('There are %s fallbacks in the IPv4 /%d containing %s'%(
1667 CandidateList.describe_percentage(
1668 len(netblocks[most_frequent_netblock]),
1669 fallback_count),
1670 mask_bits,
1671 self[most_frequent_netblock].dirip))
1672 if shared_netblock_fallback_count > 0:
1673 logging.warning(('%s of fallbacks are in an IPv4 /%d with other ' +
1674 'fallbacks')%(CandidateList.describe_percentage(
1675 shared_netblock_fallback_count,
1676 fallback_count),
1677 mask_bits))
1679 # log a message about the proportion of fallbacks in each IPv6 netblock,
1680 # where mask_bits is the size of the netblock
1681 def describe_fallback_ipv6_netblock_mask(self, mask_bits):
1682 fallback_count = len(self.fallbacks_with_ipv6())
1683 shared_netblock_fallback_count = 0
1684 most_frequent_netblock = None
1685 netblocks = self.fallbacks_by_ipv6_netblock(mask_bits)
1686 for b in netblocks.keys():
1687 if len(netblocks[b]) > 1:
1688 # how many fallbacks are in a netblock with other fallbacks?
1689 shared_netblock_fallback_count += len(netblocks[b])
1690 # what's the netblock with the most fallbacks?
1691 if (most_frequent_netblock is None
1692 or len(netblocks[b]) > len(netblocks[most_frequent_netblock])):
1693 most_frequent_netblock = b
1694 logging.debug('Fallback IPv6 addresses in the same /%d:'%(mask_bits))
1695 for f in netblocks[b]:
1696 logging.debug('%s - %s', f.ipv6addr, f._fpr)
1697 if most_frequent_netblock is not None:
1698 logging.warning('There are %s fallbacks in the IPv6 /%d containing %s'%(
1699 CandidateList.describe_percentage(
1700 len(netblocks[most_frequent_netblock]),
1701 fallback_count),
1702 mask_bits,
1703 self[most_frequent_netblock].ipv6addr))
1704 if shared_netblock_fallback_count > 0:
1705 logging.warning(('%s of fallbacks are in an IPv6 /%d with other ' +
1706 'fallbacks')%(CandidateList.describe_percentage(
1707 shared_netblock_fallback_count,
1708 fallback_count),
1709 mask_bits))
1711 # log a message about the proportion of fallbacks in each IPv4 /8, /16,
1712 # and /24
1713 def describe_fallback_ipv4_netblocks(self):
1714 # this doesn't actually tell us anything useful
1715 #self.describe_fallback_ipv4_netblock_mask(8)
1716 self.describe_fallback_ipv4_netblock_mask(16)
1717 self.describe_fallback_ipv4_netblock_mask(24)
1719 # log a message about the proportion of fallbacks in each IPv6 /12 (RIR),
1720 # /23 (smaller RIR blocks), /32 (LIR), /48 (Customer), and /64 (Host)
1721 # https://www.iana.org/assignments/ipv6-unicast-address-assignments/
1722 def describe_fallback_ipv6_netblocks(self):
1723 # these don't actually tell us anything useful
1724 #self.describe_fallback_ipv6_netblock_mask(12)
1725 #self.describe_fallback_ipv6_netblock_mask(23)
1726 self.describe_fallback_ipv6_netblock_mask(32)
1727 self.describe_fallback_ipv6_netblock_mask(48)
1728 self.describe_fallback_ipv6_netblock_mask(64)
1730 # log a message about the proportion of fallbacks in each IPv4 and IPv6
1731 # netblock
1732 def describe_fallback_netblocks(self):
1733 self.describe_fallback_ipv4_netblocks()
1734 self.describe_fallback_ipv6_netblocks()
1736 # return a list of fallbacks which are on the IPv4 ORPort port
1737 def fallbacks_on_ipv4_orport(self, port):
1738 return filter(lambda x: x.orport == port, self.fallbacks)
1740 # return a list of fallbacks which are on the IPv6 ORPort port
1741 def fallbacks_on_ipv6_orport(self, port):
1742 return filter(lambda x: x.ipv6orport == port, self.fallbacks_with_ipv6())
1744 # return a list of fallbacks which are on the DirPort port
1745 def fallbacks_on_dirport(self, port):
1746 return filter(lambda x: x.dirport == port, self.fallbacks)
1748 # log a message about the proportion of fallbacks on IPv4 ORPort port
1749 # and return that count
1750 def describe_fallback_ipv4_orport(self, port):
1751 port_count = len(self.fallbacks_on_ipv4_orport(port))
1752 fallback_count = len(self.fallbacks)
1753 logging.warning('%s of fallbacks are on IPv4 ORPort %d'%(
1754 CandidateList.describe_percentage(port_count,
1755 fallback_count),
1756 port))
1757 return port_count
1759 # log a message about the proportion of IPv6 fallbacks on IPv6 ORPort port
1760 # and return that count
1761 def describe_fallback_ipv6_orport(self, port):
1762 port_count = len(self.fallbacks_on_ipv6_orport(port))
1763 fallback_count = len(self.fallbacks_with_ipv6())
1764 logging.warning('%s of IPv6 fallbacks are on IPv6 ORPort %d'%(
1765 CandidateList.describe_percentage(port_count,
1766 fallback_count),
1767 port))
1768 return port_count
1770 # log a message about the proportion of fallbacks on DirPort port
1771 # and return that count
1772 def describe_fallback_dirport(self, port):
1773 port_count = len(self.fallbacks_on_dirport(port))
1774 fallback_count = len(self.fallbacks)
1775 logging.warning('%s of fallbacks are on DirPort %d'%(
1776 CandidateList.describe_percentage(port_count,
1777 fallback_count),
1778 port))
1779 return port_count
1781 # log a message about the proportion of fallbacks on each dirport,
1782 # each IPv4 orport, and each IPv6 orport
1783 def describe_fallback_ports(self):
1784 fallback_count = len(self.fallbacks)
1785 ipv4_or_count = fallback_count
1786 ipv4_or_count -= self.describe_fallback_ipv4_orport(443)
1787 ipv4_or_count -= self.describe_fallback_ipv4_orport(9001)
1788 logging.warning('%s of fallbacks are on other IPv4 ORPorts'%(
1789 CandidateList.describe_percentage(ipv4_or_count,
1790 fallback_count)))
1791 ipv6_fallback_count = len(self.fallbacks_with_ipv6())
1792 ipv6_or_count = ipv6_fallback_count
1793 ipv6_or_count -= self.describe_fallback_ipv6_orport(443)
1794 ipv6_or_count -= self.describe_fallback_ipv6_orport(9001)
1795 logging.warning('%s of IPv6 fallbacks are on other IPv6 ORPorts'%(
1796 CandidateList.describe_percentage(ipv6_or_count,
1797 ipv6_fallback_count)))
1798 dir_count = fallback_count
1799 dir_count -= self.describe_fallback_dirport(80)
1800 dir_count -= self.describe_fallback_dirport(9030)
1801 logging.warning('%s of fallbacks are on other DirPorts'%(
1802 CandidateList.describe_percentage(dir_count,
1803 fallback_count)))
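# Illustrative log output from describe_fallback_ports() above
# (hypothetical counts for a 100-fallback list):
#   60/100 = 60% of fallbacks are on IPv4 ORPort 443
#   30/100 = 30% of fallbacks are on IPv4 ORPort 9001
#   10/100 = 10% of fallbacks are on other IPv4 ORPorts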
1805 # return a list of fallbacks which have the Exit flag
1806 def fallbacks_with_exit(self):
1807 return filter(lambda x: x.is_exit(), self.fallbacks)
1809 # log a message about the proportion of fallbacks with an Exit flag
1810 def describe_fallback_exit_flag(self):
1811 exit_fallback_count = len(self.fallbacks_with_exit())
1812 fallback_count = len(self.fallbacks)
1813 logging.warning('%s of fallbacks have the Exit flag'%(
1814 CandidateList.describe_percentage(exit_fallback_count,
1815 fallback_count)))
1817 # return a list of fallbacks which have an IPv6 address
1818 def fallbacks_with_ipv6(self):
1819 return filter(lambda x: x.has_ipv6(), self.fallbacks)
1821 # log a message about the proportion of fallbacks on IPv6
1822 def describe_fallback_ip_family(self):
1823 ipv6_fallback_count = len(self.fallbacks_with_ipv6())
1824 fallback_count = len(self.fallbacks)
1825 logging.warning('%s of fallbacks are on IPv6'%(
1826 CandidateList.describe_percentage(ipv6_fallback_count,
1827 fallback_count)))
1829 def summarise_fallbacks(self, eligible_count, operator_count, failed_count,
1830 guard_count, target_count):
1831 s = ''
1832 s += '/* To comment-out entries in this file, use C comments, and add *'
1833 s += ' to the start of each line. (stem finds fallback entries using "'
1834 s += ' at the start of a line.) */'
1835 s += '\n'
1836 # Report:
1837 # whether we checked consensus download times
1838 # the number of fallback directories (and limits/exclusions, if relevant)
1839 # min & max fallback bandwidths
1840 # #error if below minimum count
1841 if PERFORM_IPV4_DIRPORT_CHECKS or PERFORM_IPV6_DIRPORT_CHECKS:
1842 s += '/* Checked %s%s%s DirPorts served a consensus within %.1fs. */'%(
1843 'IPv4' if PERFORM_IPV4_DIRPORT_CHECKS else '',
1844 ' and ' if (PERFORM_IPV4_DIRPORT_CHECKS
1845 and PERFORM_IPV6_DIRPORT_CHECKS) else '',
1846 'IPv6' if PERFORM_IPV6_DIRPORT_CHECKS else '',
1847 CONSENSUS_DOWNLOAD_SPEED_MAX)
1848 else:
1849 s += '/* Did not check IPv4 or IPv6 DirPort consensus downloads. */'
1850 s += '\n'
1851 # Multiline C comment with #error if things go bad
1852 s += '/*'
1853 s += '\n'
1854 # Integers don't need escaping in C comments
1855 fallback_count = len(self.fallbacks)
1856 if FALLBACK_PROPORTION_OF_GUARDS is None:
1857 fallback_proportion = ''
1858 else:
1859 fallback_proportion = ', Target %d (%d * %.2f)'%(target_count,
1860 guard_count,
1861 FALLBACK_PROPORTION_OF_GUARDS)
1862 s += 'Final Count: %d (Eligible %d%s'%(fallback_count, eligible_count,
1863 fallback_proportion)
1864 if MAX_FALLBACK_COUNT is not None:
1865 s += ', Max %d'%(MAX_FALLBACK_COUNT)
1866 s += ')\n'
1867 if eligible_count != fallback_count:
1868 removed_count = eligible_count - fallback_count
1869 excess_to_target_or_max = (eligible_count - operator_count - failed_count
1870 - fallback_count)
1871 # some candidates 'Failed' the check; others 'Skipped' it because
1872 # we already had enough successful downloads
1873 s += ('Excluded: %d (Same Operator %d, Failed/Skipped Download %d, ' +
1874 'Excess %d)')%(removed_count, operator_count, failed_count,
1875 excess_to_target_or_max)
1876 s += '\n'
1877 min_fb = self.fallback_min()
1878 min_bw = min_fb._data['measured_bandwidth']
1879 max_fb = self.fallback_max()
1880 max_bw = max_fb._data['measured_bandwidth']
1881 s += 'Bandwidth Range: %.1f - %.1f MB/s'%(min_bw/(1024.0*1024.0),
1882 max_bw/(1024.0*1024.0))
1883 s += '\n'
1884 s += '*/'
1885 if fallback_count < MIN_FALLBACK_COUNT:
1886 # We must have a minimum number of fallbacks so they are always
1887 # reachable, and are in diverse locations
1888 s += '\n'
1889 s += '#error Fallback Count %d is too low. '%(fallback_count)
1890 s += 'Must be at least %d for diversity. '%(MIN_FALLBACK_COUNT)
1891 s += 'Try adding entries to the whitelist, '
1892 s += 'or setting INCLUDE_UNLISTED_ENTRIES = True.'
1893 return s
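# Illustrative output of summarise_fallbacks() above (hypothetical
# counts, assuming FALLBACK_PROPORTION_OF_GUARDS = 0.20 and
# MAX_FALLBACK_COUNT = 500):
# /*
# Final Count: 100 (Eligible 160, Target 400 (2000 * 0.20), Max 500)
# Excluded: 60 (Same Operator 30, Failed/Skipped Download 20, Excess 10)
# Bandwidth Range: 1.0 - 100.0 MB/s
# */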
1895 ## Main Function
1897 def list_fallbacks():
1898 """ Fetches required onionoo documents and evaluates the
1899 fallback directory criteria for each of the relays """
1901 logging.warning('Downloading and parsing OnionOO data. ' +
1902 'This may take some time.')
1903 # find relays that could be fallbacks
1904 candidates = CandidateList()
1905 candidates.add_relays()
1907 # work out how many fallbacks we want
1908 guard_count = candidates.count_guards()
1909 if FALLBACK_PROPORTION_OF_GUARDS is None:
1910 target_count = guard_count
1911 else:
1912 target_count = int(guard_count * FALLBACK_PROPORTION_OF_GUARDS)
1913 # the maximum number of fallbacks is the least of:
1914 # - the target fallback count (FALLBACK_PROPORTION_OF_GUARDS * guard count)
1915 # - the maximum fallback count (MAX_FALLBACK_COUNT)
1916 if MAX_FALLBACK_COUNT is None:
1917 max_count = target_count
1918 else:
1919 max_count = min(target_count, MAX_FALLBACK_COUNT)
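# Illustrative example (hypothetical counts): with guard_count = 2000,
# FALLBACK_PROPORTION_OF_GUARDS = 0.1 and MAX_FALLBACK_COUNT = 500,
# target_count = int(2000 * 0.1) = 200 and max_count = min(200, 500) = 200.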
1921 candidates.compute_fallbacks()
1922 prefilter_fallbacks = copy.copy(candidates.fallbacks)
1924 # filter with the whitelist and blacklist
1925 # if a relay has changed IPv4 address or ports recently, it will be excluded
1926 # as ineligible before we call apply_filter_lists, and so there will be no
1927 # warning that the details have changed from those in the whitelist.
1928 # instead, there will be an info-level log during the eligibility check.
1929 initial_count = len(candidates.fallbacks)
1930 excluded_count = candidates.apply_filter_lists()
1931 print candidates.summarise_filters(initial_count, excluded_count)
1932 eligible_count = len(candidates.fallbacks)
1934 # calculate the measured bandwidth of each relay,
1935 # then remove low-bandwidth relays
1936 candidates.calculate_measured_bandwidth()
1937 candidates.remove_low_bandwidth_relays()
1939 # print the raw fallback list
1940 #for x in candidates.fallbacks:
1941 # print x.fallbackdir_line(True)
1942 # print json.dumps(candidates[x]._data, sort_keys=True, indent=4,
1943 # separators=(',', ': '), default=json_util.default)
1945 # impose mandatory conditions here, like one per contact, family, IP
1946 # in measured bandwidth order
1947 candidates.sort_fallbacks_by_measured_bandwidth()
1948 operator_count = 0
1949 # only impose these limits on the final list - operators can nominate
1950 # multiple candidate fallbacks, and then we choose the best set
1951 if not OUTPUT_CANDIDATES:
1952 operator_count += candidates.limit_fallbacks_same_ip()
1953 operator_count += candidates.limit_fallbacks_same_contact()
1954 operator_count += candidates.limit_fallbacks_same_family()
1956 # check if each candidate can serve a consensus
1957 # there's a small risk we've eliminated relays from the same operator that
1958 # can serve a consensus, in favour of one that can't
1959 # but since each consensus download check can take up to 15 seconds,
1960 # the risk is worth it
1961 if PERFORM_IPV4_DIRPORT_CHECKS or PERFORM_IPV6_DIRPORT_CHECKS:
1962 logging.warning('Checking consensus download speeds. ' +
1963 'This may take some time.')
1964 failed_count = candidates.perform_download_consensus_checks(max_count)
1966 # analyse and log interesting diversity metrics
1967 # like netblock, ports, exit, IPv4-only
1968 # (we can't easily analyse AS, and it's hard to accurately analyse country)
1969 candidates.describe_fallback_ip_family()
1970 # if we can't import the ipaddress module, we can't do netblock analysis
1971 if HAVE_IPADDRESS:
1972 candidates.describe_fallback_netblocks()
1973 candidates.describe_fallback_ports()
1974 candidates.describe_fallback_exit_flag()
1976 # output C comments summarising the fallback selection process
1977 if len(candidates.fallbacks) > 0:
1978 print candidates.summarise_fallbacks(eligible_count, operator_count,
1979 failed_count, guard_count,
1980 target_count)
1981 else:
1982 print '/* No Fallbacks met criteria */'
1984 # output C comments specifying the OnionOO data used to create the list
1985 for s in fetch_source_list():
1986 print describe_fetch_source(s)
1988 # if we're outputting the final fallback list, sort by fingerprint
1989 # this makes diffs much more stable
1990 # otherwise, leave sorted by bandwidth, which allows operators to be
1991 # contacted in priority order
1992 if not OUTPUT_CANDIDATES:
1993 candidates.sort_fallbacks_by_fingerprint()
1995 for x in candidates.fallbacks:
1996 print x.fallbackdir_line(candidates.fallbacks, prefilter_fallbacks)
1998 if __name__ == "__main__":
1999 list_fallbacks()