Improve fallback selection and output
tor.git: scripts/maint/updateFallbackDirs.py
1 #!/usr/bin/python
3 # Usage: scripts/maint/updateFallbackDirs.py > src/or/fallback_dirs.inc
4 # Needs stem available in your PYTHONPATH, or just ln -s ../stem/stem .
6 # Then read the generated list to ensure no-one slipped anything funny into
7 # their name or contactinfo
9 # Script by weasel, April 2015
10 # Portions by gsathya & karsten, 2013
11 # https://trac.torproject.org/projects/tor/attachment/ticket/8374/dir_list.2.py
12 # Modifications by teor, 2015
14 import StringIO
15 import string
16 import re
17 import datetime
18 import gzip
19 import os.path
20 import json
21 import math
22 import sys
23 import urllib
24 import urllib2
25 import hashlib
26 import dateutil.parser
27 # bson_lazy provides bson
28 #from bson import json_util
30 from stem.descriptor.remote import DescriptorDownloader
32 import logging
33 # INFO tells you why each relay was included or excluded
34 # WARN tells you about potential misconfigurations
35 logging.basicConfig(level=logging.WARNING)
37 ## Top-Level Configuration
39 # Output all candidate fallbacks, or only output selected fallbacks?
40 OUTPUT_CANDIDATES = False
42 # Perform DirPort checks over IPv4?
43 # Change this to False if IPv4 doesn't work for you, or if you don't want to
44 # download a consensus for each fallback
45 # Don't check ~1000 candidates when OUTPUT_CANDIDATES is True
46 PERFORM_IPV4_DIRPORT_CHECKS = False if OUTPUT_CANDIDATES else True
48 # Perform DirPort checks over IPv6?
49 # If you know IPv6 works for you, set this to True
50 # This will exclude IPv6 relays without an IPv6 DirPort configured
51 # So it's best left at False until #18394 is implemented
52 # Don't check ~1000 candidates when OUTPUT_CANDIDATES is True
53 PERFORM_IPV6_DIRPORT_CHECKS = False if OUTPUT_CANDIDATES else False
55 ## OnionOO Settings
57 ONIONOO = 'https://onionoo.torproject.org/'
58 #ONIONOO = 'https://onionoo.thecthulhu.com/'
60 # Don't bother going out to the Internet, just use the files available locally,
61 # even if they're very old
62 LOCAL_FILES_ONLY = False
64 ## Whitelist / Blacklist Filter Settings
66 # The whitelist contains entries that are included if all attributes match
67 # (IPv4, dirport, orport, id, and optionally IPv6 and IPv6 orport)
68 # The blacklist contains (partial) entries that are excluded if any
69 # sufficiently specific group of attributes matches:
70 # IPv4 & DirPort
71 # IPv4 & ORPort
72 # ID
73 # IPv6 & DirPort
74 # IPv6 & IPv6 ORPort
75 # If neither port is included in the blacklist, the entire IP address is
76 # blacklisted.
78 # What happens to entries in neither list?
79 # When True, they are included, when False, they are excluded
80 INCLUDE_UNLISTED_ENTRIES = True if OUTPUT_CANDIDATES else False
82 # If an entry is in both lists, what happens?
83 # When True, it is excluded, when False, it is included
84 BLACKLIST_EXCLUDES_WHITELIST_ENTRIES = True
86 WHITELIST_FILE_NAME = 'scripts/maint/fallback.whitelist'
87 BLACKLIST_FILE_NAME = 'scripts/maint/fallback.blacklist'
89 # The number of bytes we'll read from a filter file before giving up
90 MAX_LIST_FILE_SIZE = 1024 * 1024
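# Illustrative (hypothetical) entries, in the format load_relaylist() parses;
# the addresses and fingerprints below are documentation placeholders, not real relays:
#   fallback.whitelist:
#     203.0.113.1:80 orport=443 id=0123456789ABCDEF0123456789ABCDEF01234567 ipv6=[2001:db8::1]:443
#   fallback.blacklist:
#     203.0.113.2:80                                # exclude this IPv4 address and DirPort
#     id=FEDCBA9876543210FEDCBA9876543210FEDCBA98   # exclude by fingerprint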
92 ## Eligibility Settings
94 # Reduced due to a bug in tor where a relay submits a 0 DirPort when restarted
95 # This causes OnionOO to (correctly) reset its stability timer
96 # This issue will be fixed in 0.2.7.7 and 0.2.8.2
97 # Until then, the CUTOFFs below ensure a decent level of stability.
98 ADDRESS_AND_PORT_STABLE_DAYS = 7
99 # What time-weighted fraction of these flags must FallbackDirs
100 # Equal or Exceed?
101 CUTOFF_RUNNING = .95
102 CUTOFF_V2DIR = .95
103 CUTOFF_GUARD = .95
104 # What time-weighted fraction of these flags must FallbackDirs
105 # Equal or Fall Under?
106 # .00 means no bad exits
107 PERMITTED_BADEXIT = .00
109 # Clients will time out after 30 seconds trying to download a consensus
110 # So allow fallback directories half that to deliver a consensus
111 # The exact download times might change based on the network connection
112 # running this script, but only by a few seconds
113 # There is also about a second of python overhead
114 CONSENSUS_DOWNLOAD_SPEED_MAX = 15.0
115 # If the relay fails a consensus check, retry the download
116 # This avoids delisting a relay due to transient network conditions
117 CONSENSUS_DOWNLOAD_RETRY = True
119 ## List Length Limits
121 # The target for these parameters is 20% of the guards in the network
122 # This is around 200 as of October 2015
123 FALLBACK_PROPORTION_OF_GUARDS = None if OUTPUT_CANDIDATES else 0.2
125 # Limit the number of fallbacks (eliminating lowest by weight)
126 MAX_FALLBACK_COUNT = None if OUTPUT_CANDIDATES else 500
127 # Emit a C #error if the number of fallbacks is below this minimum
128 MIN_FALLBACK_COUNT = 50
130 ## Fallback Weight Settings
132 # Any fallback with the Exit flag has its weight multiplied by this fraction
133 EXIT_WEIGHT_FRACTION = 1.0
135 # If True, emit a C #error if we can't satisfy various constraints
136 # If False, emit a C comment instead
137 STRICT_FALLBACK_WEIGHTS = False
139 # Limit the proportional weight
140 # If a single fallback's weight is too high, it will see too many clients
141 # We reweight using a lower threshold to provide some leeway for:
142 # * elimination of low weight relays
143 # * consensus weight changes
144 # * fallback directory losses over time
145 # A relay weighted at 1 in 10 fallbacks will see about 10% of clients that
146 # use the fallback directories. (The 9 directory authorities see a similar
147 # proportion of clients.)
148 TARGET_MAX_WEIGHT_FRACTION = 1/10.0
149 REWEIGHTING_FUDGE_FACTOR = 0.8
150 MAX_WEIGHT_FRACTION = TARGET_MAX_WEIGHT_FRACTION * REWEIGHTING_FUDGE_FACTOR
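# For example, with the defaults above: 1/10.0 * 0.8 = 0.08, so after reweighting
# no single fallback should keep more than roughly 8% of the total fallback weight.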
151 # If a single fallback's weight is too low, it's pointless adding it.
152 # (Final weights may be slightly higher than this, due to low weight relays
153 # being excluded.)
154 # A relay weighted at 1 in 1000 fallbacks will see about 0.1% of clients.
155 MIN_WEIGHT_FRACTION = 0.0 if OUTPUT_CANDIDATES else 1/1000.0
157 ## Other Configuration Parameters
159 # older entries' weights are adjusted with ALPHA^(age in days)
160 AGE_ALPHA = 0.99
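# For example, a measurement 7 days old is weighted by 0.99^7 (about 0.93),
# and one 30 days old by 0.99^30 (about 0.74).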
162 # this factor is used to scale OnionOO entries to [0,1]
163 ONIONOO_SCALE_ONE = 999.
165 ## Parsing Functions
167 def parse_ts(t):
168 return datetime.datetime.strptime(t, "%Y-%m-%d %H:%M:%S")
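# Illustrative usage (not executed):
#   parse_ts("2015-03-30 06:00:00") == datetime.datetime(2015, 3, 30, 6, 0, 0)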
170 def remove_bad_chars(raw_string, bad_char_list):
171 # Remove each character in the bad_char_list
172 cleansed_string = raw_string
173 for c in bad_char_list:
174 cleansed_string = cleansed_string.replace(c, '')
175 return cleansed_string
177 def cleanse_unprintable(raw_string):
178 # Remove all unprintable characters
179 cleansed_string = ''
180 for c in raw_string:
181 if (c in string.ascii_letters or c in string.digits
182 or c in string.punctuation or c in string.whitespace):
183 cleansed_string += c
184 return cleansed_string
186 def cleanse_whitespace(raw_string):
187 # Replace all whitespace characters with a space
188 cleansed_string = raw_string
189 for c in string.whitespace:
190 cleansed_string = cleansed_string.replace(c, ' ')
191 return cleansed_string
193 def cleanse_c_multiline_comment(raw_string):
194 cleansed_string = raw_string
195 # Embedded newlines should be removed by tor/onionoo, but let's be paranoid
196 cleansed_string = cleanse_whitespace(cleansed_string)
197 # ContactInfo and Version can be arbitrary binary data
198 cleansed_string = cleanse_unprintable(cleansed_string)
199 # Prevent a malicious / unanticipated string from breaking out
200 # of a C-style multiline comment
201 # This removes '/*' and '*/' and '//'
202 bad_char_list = '*/'
203 # Prevent a malicious string from using C nulls
204 bad_char_list += '\0'
205 # Be safer by removing bad characters entirely
206 cleansed_string = remove_bad_chars(cleansed_string, bad_char_list)
207 # Some compilers may further process the content of comments
208 # There isn't much we can do to cover every possible case
209 # But comment-based directives are typically only advisory
210 return cleansed_string
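# Illustrative usage (not executed): all '*' and '/' characters are dropped, so
#   cleanse_c_multiline_comment('evil */ contact') == 'evil  contact'
# and the result can no longer terminate the surrounding C comment.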
212 def cleanse_c_string(raw_string):
213 cleansed_string = raw_string
214 # Embedded newlines should be removed by tor/onionoo, but let's be paranoid
215 cleansed_string = cleanse_whitespace(cleansed_string)
216 # ContactInfo and Version can be arbitrary binary data
217 cleansed_string = cleanse_unprintable(cleansed_string)
218 # Prevent a malicious address/fingerprint string from breaking out
219 # of a C-style string
220 bad_char_list = '"'
221 # Prevent a malicious string from using escapes
222 bad_char_list += '\\'
223 # Prevent a malicious string from using C nulls
224 bad_char_list += '\0'
225 # Be safer by removing bad characters entirely
226 cleansed_string = remove_bad_chars(cleansed_string, bad_char_list)
227 # Some compilers may further process the content of strings
228 # There isn't much we can do to cover every possible case
229 # But this typically only results in changes to the string data
230 return cleansed_string
232 ## OnionOO Source Functions
234 # a dictionary of source metadata for each onionoo query we've made
235 fetch_source = {}
237 # register source metadata for 'what'
238 # assumes we only retrieve one document for each 'what'
239 def register_fetch_source(what, url, relays_published, version):
240 fetch_source[what] = {}
241 fetch_source[what]['url'] = url
242 fetch_source[what]['relays_published'] = relays_published
243 fetch_source[what]['version'] = version
245 # list each registered source's 'what'
246 def fetch_source_list():
247 return sorted(fetch_source.keys())
249 # given 'what', provide a multiline C comment describing the source
250 def describe_fetch_source(what):
251 desc = '/*'
252 desc += '\n'
253 desc += 'Onionoo Source: '
254 desc += cleanse_c_multiline_comment(what)
255 desc += ' Date: '
256 desc += cleanse_c_multiline_comment(fetch_source[what]['relays_published'])
257 desc += ' Version: '
258 desc += cleanse_c_multiline_comment(fetch_source[what]['version'])
259 desc += '\n'
260 desc += 'URL: '
261 desc += cleanse_c_multiline_comment(fetch_source[what]['url'])
262 desc += '\n'
263 desc += '*/'
264 return desc
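# The comment this produces looks roughly like (values depend on the query):
#   /*
#   Onionoo Source: details Date: <relays_published> Version: <version>
#   URL: <query URL>
#   */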
266 ## File Processing Functions
268 def write_to_file(str, file_name, max_len):
269 try:
270 with open(file_name, 'w') as f:
271 f.write(str[0:max_len])
272 except EnvironmentError, error:
273 logging.warning('Writing file %s failed: %d: %s'%
274 (file_name,
275 error.errno,
276 error.strerror))
279 def read_from_file(file_name, max_len):
280 try:
281 if os.path.isfile(file_name):
282 with open(file_name, 'r') as f:
283 return f.read(max_len)
284 except EnvironmentError, error:
285 logging.info('Loading file %s failed: %d: %s'%
286 (file_name,
287 error.errno,
288 error.strerror))
290 return None
292 def load_possibly_compressed_response_json(response):
293 if response.info().get('Content-Encoding') == 'gzip':
294 buf = StringIO.StringIO( response.read() )
295 f = gzip.GzipFile(fileobj=buf)
296 return json.load(f)
297 else:
298 return json.load(response)
300 def load_json_from_file(json_file_name):
301 # An exception here may be resolved by deleting the .last_modified
302 # and .json files, and re-running the script
303 try:
304 with open(json_file_name, 'r') as f:
305 return json.load(f)
306 except EnvironmentError, error:
307 raise Exception('Reading not-modified json file %s failed: %d: %s'%
308 (json_file_name,
309 error.errno,
310 error.strerror))
313 ## OnionOO Functions
315 def datestr_to_datetime(datestr):
316 # Parse datetimes like: Fri, 02 Oct 2015 13:34:14 GMT
317 if datestr is not None:
318 dt = dateutil.parser.parse(datestr)
319 else:
320 # Never modified - use start of epoch
321 dt = datetime.datetime.utcfromtimestamp(0)
322 # strip any timezone out (in case they're supported in future)
323 dt = dt.replace(tzinfo=None)
324 return dt
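# Illustrative usage (not executed):
#   datestr_to_datetime("Fri, 02 Oct 2015 13:34:14 GMT")
#     == datetime.datetime(2015, 10, 2, 13, 34, 14)
#   datestr_to_datetime(None) == datetime.datetime(1970, 1, 1, 0, 0)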
326 def onionoo_fetch(what, **kwargs):
327 params = kwargs
328 params['type'] = 'relay'
329 #params['limit'] = 10
330 params['first_seen_days'] = '%d-'%(ADDRESS_AND_PORT_STABLE_DAYS,)
331 params['last_seen_days'] = '-7'
332 params['flag'] = 'V2Dir'
333 url = ONIONOO + what + '?' + urllib.urlencode(params)
335 # Unfortunately, the URL is too long for some OS filenames,
336 # but we still don't want to get files from different URLs mixed up
337 base_file_name = what + '-' + hashlib.sha1(url).hexdigest()
339 full_url_file_name = base_file_name + '.full_url'
340 MAX_FULL_URL_LENGTH = 1024
342 last_modified_file_name = base_file_name + '.last_modified'
343 MAX_LAST_MODIFIED_LENGTH = 64
345 json_file_name = base_file_name + '.json'
347 if LOCAL_FILES_ONLY:
348 # Read from the local file, don't write to anything
349 response_json = load_json_from_file(json_file_name)
350 else:
351 # store the full URL to a file for debugging
352 # no need to compare as long as you trust SHA-1
353 write_to_file(url, full_url_file_name, MAX_FULL_URL_LENGTH)
355 request = urllib2.Request(url)
356 request.add_header('Accept-encoding', 'gzip')
358 # load the last modified date from the file, if it exists
359 last_mod_date = read_from_file(last_modified_file_name,
360 MAX_LAST_MODIFIED_LENGTH)
361 if last_mod_date is not None:
362 request.add_header('If-modified-since', last_mod_date)
364 # Parse last modified date
365 last_mod = datestr_to_datetime(last_mod_date)
367 # Not Modified and still recent enough to be useful
368 # Onionoo / Globe used to use 6 hours, but we can afford a day
369 required_freshness = datetime.datetime.utcnow()
370 # strip any timezone out (to match dateutil.parser)
371 required_freshness = required_freshness.replace(tzinfo=None)
372 required_freshness -= datetime.timedelta(hours=24)
374 # Make the OnionOO request
375 response_code = 0
376 try:
377 response = urllib2.urlopen(request)
378 response_code = response.getcode()
379 except urllib2.HTTPError, error:
380 response_code = error.code
381 if response_code == 304: # not modified
382 pass
383 else:
384 raise Exception("Could not get " + url + ": "
385 + str(error.code) + ": " + error.reason)
387 if response_code == 200: # OK
388 last_mod = datestr_to_datetime(response.info().get('Last-Modified'))
390 # Check for freshness
391 if last_mod < required_freshness:
392 if last_mod_date is not None:
393 # This check sometimes fails transiently, retry the script if it does
394 date_message = "Outdated data: last updated " + last_mod_date
395 else:
396 date_message = "No data: never downloaded "
397 raise Exception(date_message + " from " + url)
399 # Process the data
400 if response_code == 200: # OK
402 response_json = load_possibly_compressed_response_json(response)
404 with open(json_file_name, 'w') as f:
405 # use the most compact json representation to save space
406 json.dump(response_json, f, separators=(',',':'))
408 # store the last modified date in its own file
409 if response.info().get('Last-modified') is not None:
410 write_to_file(response.info().get('Last-Modified'),
411 last_modified_file_name,
412 MAX_LAST_MODIFIED_LENGTH)
414 elif response_code == 304: # Not Modified
416 response_json = load_json_from_file(json_file_name)
418 else: # Unexpected HTTP response code not covered in the HTTPError above
419 raise Exception("Unexpected HTTP response code to " + url + ": "
420 + str(response_code))
422 register_fetch_source(what,
423 url,
424 response_json['relays_published'],
425 response_json['version'])
427 return response_json
429 def fetch(what, **kwargs):
430 #x = onionoo_fetch(what, **kwargs)
431 # don't use sort_keys, as the order of or_addresses is significant
432 #print json.dumps(x, indent=4, separators=(',', ': '))
433 #sys.exit(0)
435 return onionoo_fetch(what, **kwargs)
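# For reference, fetch('details', fields='fingerprint,nickname') builds a query
# roughly like the following (parameter order depends on urllib.urlencode):
#   https://onionoo.torproject.org/details?fields=fingerprint%2Cnickname&type=relay&first_seen_days=7-&last_seen_days=-7&flag=V2Dir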
437 ## Fallback Candidate Class
439 class Candidate(object):
440 CUTOFF_ADDRESS_AND_PORT_STABLE = (datetime.datetime.utcnow()
441 - datetime.timedelta(ADDRESS_AND_PORT_STABLE_DAYS))
443 def __init__(self, details):
444 for f in ['fingerprint', 'nickname', 'last_changed_address_or_port',
445 'consensus_weight', 'or_addresses', 'dir_address']:
446 if not f in details: raise Exception("Document has no %s field."%(f,))
448 if not 'contact' in details:
449 details['contact'] = None
450 if not 'flags' in details or details['flags'] is None:
451 details['flags'] = []
452 details['last_changed_address_or_port'] = parse_ts(
453 details['last_changed_address_or_port'])
454 self._data = details
455 self._stable_sort_or_addresses()
457 self._fpr = self._data['fingerprint']
458 self._running = self._guard = self._v2dir = 0.
459 self._split_dirport()
460 self._compute_orport()
461 if self.orport is None:
462 raise Exception("Failed to get an orport for %s."%(self._fpr,))
463 self._compute_ipv6addr()
464 if self.ipv6addr is None:
465 logging.debug("Failed to get an ipv6 address for %s."%(self._fpr,))
466 # Reduce the weight of exits to EXIT_WEIGHT_FRACTION * consensus_weight
467 if self.is_exit():
468 current_weight = self._data['consensus_weight']
469 exit_weight = current_weight * EXIT_WEIGHT_FRACTION
470 self._data['original_consensus_weight'] = current_weight
471 self._data['consensus_weight'] = exit_weight
473 def _stable_sort_or_addresses(self):
474 # replace self._data['or_addresses'] with a stable ordering,
475 # sorting the secondary addresses in string order
476 # leave the received order in self._data['or_addresses_raw']
477 self._data['or_addresses_raw'] = self._data['or_addresses']
478 or_address_primary = self._data['or_addresses'][:1]
479 # subsequent entries in the or_addresses array are in an arbitrary order
480 # so we stabilise the addresses by sorting them in string order
481 or_addresses_secondaries_stable = sorted(self._data['or_addresses'][1:])
482 or_addresses_stable = or_address_primary + or_addresses_secondaries_stable
483 self._data['or_addresses'] = or_addresses_stable
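# For example (hypothetical or_addresses), the primary address stays first and
# the secondaries are sorted as strings:
#   ['203.0.113.1:9001', '[2001:db8::1]:9001', '203.0.113.1:9030']
# becomes
#   ['203.0.113.1:9001', '203.0.113.1:9030', '[2001:db8::1]:9001']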
485 def get_fingerprint(self):
486 return self._fpr
488 # is_valid_ipv[46]_address by gsathya, karsten, 2013
489 @staticmethod
490 def is_valid_ipv4_address(address):
491 if not isinstance(address, (str, unicode)):
492 return False
494 # check if there are four period separated values
495 if address.count(".") != 3:
496 return False
498 # check that each octet is a decimal value between 0 and 255
499 for entry in address.split("."):
500 if not entry.isdigit() or int(entry) < 0 or int(entry) > 255:
501 return False
502 elif entry[0] == "0" and len(entry) > 1:
503 return False # leading zeros, for instance in "1.2.3.001"
505 return True
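# Illustrative results (not executed):
#   is_valid_ipv4_address('203.0.113.1') == True
#   is_valid_ipv4_address('1.2.3.001')   == False  (leading zero in an octet)
#   is_valid_ipv4_address('1.2.3')       == False  (only three octets)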
507 @staticmethod
508 def is_valid_ipv6_address(address):
509 if not isinstance(address, (str, unicode)):
510 return False
512 # remove brackets
513 address = address[1:-1]
515 # addresses are made up of eight colon separated groups of four hex digits
516 # with leading zeros being optional
517 # https://en.wikipedia.org/wiki/IPv6#Address_format
519 colon_count = address.count(":")
521 if colon_count > 7:
522 return False # too many groups
523 elif colon_count != 7 and not "::" in address:
524 return False # not enough groups and none are collapsed
525 elif address.count("::") > 1 or ":::" in address:
526 return False # multiple groupings of zeros can't be collapsed
528 found_ipv4_on_previous_entry = False
529 for entry in address.split(":"):
530 # If an IPv6 address has an embedded IPv4 address,
531 # it must be the last entry
532 if found_ipv4_on_previous_entry:
533 return False
534 if not re.match("^[0-9a-fA-F]{0,4}$", entry):
535 if not Candidate.is_valid_ipv4_address(entry):
536 return False
537 else:
538 found_ipv4_on_previous_entry = True
540 return True
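# Illustrative results (not executed); addresses are expected in brackets,
# as onionoo lists them in or_addresses:
#   is_valid_ipv6_address('[2001:db8::1]')    == True
#   is_valid_ipv6_address('[2001:db8::1::2]') == False  (more than one '::')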
542 def _split_dirport(self):
543 # Split the dir_address into dirip and dirport
544 (self.dirip, _dirport) = self._data['dir_address'].split(':', 2)
545 self.dirport = int(_dirport)
547 def _compute_orport(self):
548 # Choose the first ORPort that's on the same IPv4 address as the DirPort.
549 # In rare circumstances, this might not be the primary ORPort address.
550 # However, _stable_sort_or_addresses() ensures we choose the same one
551 # every time, even if onionoo changes the order of the secondaries.
552 self._split_dirport()
553 self.orport = None
554 for i in self._data['or_addresses']:
555 if i != self._data['or_addresses'][0]:
556 logging.debug('Secondary IPv4 Address Used for %s: %s'%(self._fpr, i))
557 (ipaddr, port) = i.rsplit(':', 1)
558 if (ipaddr == self.dirip) and Candidate.is_valid_ipv4_address(ipaddr):
559 self.orport = int(port)
560 return
562 def _compute_ipv6addr(self):
563 # Choose the first IPv6 address that uses the same port as the ORPort
564 # Or, choose the first IPv6 address in the list
565 # _stable_sort_or_addresses() ensures we choose the same IPv6 address
566 # every time, even if onionoo changes the order of the secondaries.
567 self.ipv6addr = None
568 self.ipv6orport = None
569 # Choose the first IPv6 address that uses the same port as the ORPort
570 for i in self._data['or_addresses']:
571 (ipaddr, port) = i.rsplit(':', 1)
572 if (int(port) == self.orport) and Candidate.is_valid_ipv6_address(ipaddr):
573 self.ipv6addr = ipaddr
574 self.ipv6orport = port
575 return
576 # Choose the first IPv6 address in the list
577 for i in self._data['or_addresses']:
578 (ipaddr, port) = i.rsplit(':', 1)
579 if Candidate.is_valid_ipv6_address(ipaddr):
580 self.ipv6addr = ipaddr
581 self.ipv6orport = port
582 return
584 @staticmethod
585 def _extract_generic_history(history, which='unknown'):
586 # given a tree like this:
588 # "1_month": {
589 # "count": 187,
590 # "factor": 0.001001001001001001,
591 # "first": "2015-02-27 06:00:00",
592 # "interval": 14400,
593 # "last": "2015-03-30 06:00:00",
594 # "values": [
595 # 999,
596 # 999 ]
598 # },
599 # "1_week": {
600 # "count": 169,
601 # "factor": 0.001001001001001001,
602 # "first": "2015-03-23 07:30:00",
603 # "interval": 3600,
604 # "last": "2015-03-30 07:30:00",
605 # "values": [ ...]
606 # },
607 # "1_year": {
608 # "count": 177,
609 # "factor": 0.001001001001001001,
610 # "first": "2014-04-11 00:00:00",
611 # "interval": 172800,
612 # "last": "2015-03-29 00:00:00",
613 # "values": [ ...]
614 # },
615 # "3_months": {
616 # "count": 185,
617 # "factor": 0.001001001001001001,
618 # "first": "2014-12-28 06:00:00",
619 # "interval": 43200,
620 # "last": "2015-03-30 06:00:00",
621 # "values": [ ...]
623 # },
624 # extract exactly one piece of data per time interval,
625 # using smaller intervals where available.
627 # returns list of (age, length, value) dictionaries.
629 generic_history = []
631 periods = history.keys()
632 periods.sort(key = lambda x: history[x]['interval'])
633 now = datetime.datetime.utcnow()
634 newest = now
635 for p in periods:
636 h = history[p]
637 interval = datetime.timedelta(seconds = h['interval'])
638 this_ts = parse_ts(h['last'])
640 if (len(h['values']) != h['count']):
641 logging.warn('Inconsistent value count in %s document for %s'
642 %(p, which))
643 for v in reversed(h['values']):
644 if (this_ts <= newest):
645 agt1 = now - this_ts
646 agt2 = interval
647 agetmp1 = (agt1.microseconds + (agt1.seconds + agt1.days * 24 * 3600)
648 * 10**6) / 10**6
649 agetmp2 = (agt2.microseconds + (agt2.seconds + agt2.days * 24 * 3600)
650 * 10**6) / 10**6
651 generic_history.append(
652 { 'age': agetmp1,
653 'length': agetmp2,
654 'value': v })
656 newest = this_ts
657 this_ts -= interval
659 if (this_ts + interval != parse_ts(h['first'])):
660 logging.warn('Inconsistent time information in %s document for %s'
661 %(p, which))
663 #print json.dumps(generic_history, sort_keys=True,
664 # indent=4, separators=(',', ': '))
665 return generic_history
667 @staticmethod
668 def _avg_generic_history(generic_history):
669 a = []
670 for i in generic_history:
671 if i['age'] > (ADDRESS_AND_PORT_STABLE_DAYS * 24 * 3600):
672 continue
673 if (i['length'] is not None
674 and i['age'] is not None
675 and i['value'] is not None):
676 w = i['length'] * math.pow(AGE_ALPHA, i['age']/(3600*24))
677 a.append( (i['value'] * w, w) )
679 sv = math.fsum(map(lambda x: x[0], a))
680 sw = math.fsum(map(lambda x: x[1], a))
682 if sw == 0.0:
683 svw = 0.0
684 else:
685 svw = sv/sw
686 return svw
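# Worked example (hypothetical): two intervals of equal length, one fully
# Running (value 999) today and one not Running (value 0) 30 days ago:
# the weights are 1.0 and 0.99^30 (about 0.74), so the average is
# 999 * 1.0 / (1.0 + 0.74), about 574, i.e. about 0.57 once the caller
# divides by ONIONOO_SCALE_ONE.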
688 def _add_generic_history(self, history):
689 periods = history.keys()
690 periods.sort(key = lambda x: history[x]['interval'] )
692 print periods
694 def add_running_history(self, history):
695 pass
697 def add_uptime(self, uptime):
698 logging.debug('Adding uptime %s.'%(self._fpr,))
700 # flags we care about: Running, V2Dir, Guard
701 if not 'flags' in uptime:
702 logging.debug('No flags in document for %s.'%(self._fpr,))
703 return
705 for f in ['Running', 'Guard', 'V2Dir']:
706 if not f in uptime['flags']:
707 logging.debug('No %s in flags for %s.'%(f, self._fpr,))
708 return
710 running = self._extract_generic_history(uptime['flags']['Running'],
711 '%s-Running'%(self._fpr))
712 guard = self._extract_generic_history(uptime['flags']['Guard'],
713 '%s-Guard'%(self._fpr))
714 v2dir = self._extract_generic_history(uptime['flags']['V2Dir'],
715 '%s-V2Dir'%(self._fpr))
716 if 'BadExit' in uptime['flags']:
717 badexit = self._extract_generic_history(uptime['flags']['BadExit'],
718 '%s-BadExit'%(self._fpr))
720 self._running = self._avg_generic_history(running) / ONIONOO_SCALE_ONE
721 self._guard = self._avg_generic_history(guard) / ONIONOO_SCALE_ONE
722 self._v2dir = self._avg_generic_history(v2dir) / ONIONOO_SCALE_ONE
723 self._badexit = None
724 if 'BadExit' in uptime['flags']:
725 self._badexit = self._avg_generic_history(badexit) / ONIONOO_SCALE_ONE
727 def is_candidate(self):
728 must_be_running_now = (PERFORM_IPV4_DIRPORT_CHECKS
729 or PERFORM_IPV6_DIRPORT_CHECKS)
730 if (must_be_running_now and not self.is_running()):
731 logging.info('%s not a candidate: not running now, unable to check ' +
732 'DirPort consensus download', self._fpr)
733 return False
734 if (self._data['last_changed_address_or_port'] >
735 self.CUTOFF_ADDRESS_AND_PORT_STABLE):
736 logging.info('%s not a candidate: changed address/port recently (%s)',
737 self._fpr, self._data['last_changed_address_or_port'])
738 return False
739 if self._running < CUTOFF_RUNNING:
740 logging.info('%s not a candidate: running avg too low (%lf)',
741 self._fpr, self._running)
742 return False
743 if self._v2dir < CUTOFF_V2DIR:
744 logging.info('%s not a candidate: v2dir avg too low (%lf)',
745 self._fpr, self._v2dir)
746 return False
747 if self._badexit is not None and self._badexit > PERMITTED_BADEXIT:
748 logging.info('%s not a candidate: badexit avg too high (%lf)',
749 self._fpr, self._badexit)
750 return False
751 # if the relay's version is not recommended (or not reported), exclude it
752 if (not self._data.has_key('recommended_version')
753 or not self._data['recommended_version']):
754 logging.info('%s not a candidate: version not recommended', self._fpr)
755 return False
756 if self._guard < CUTOFF_GUARD:
757 logging.info('%s not a candidate: guard avg too low (%lf)',
758 self._fpr, self._guard)
759 return False
760 return True
762 def is_in_whitelist(self, relaylist):
763 """ A fallback matches if each key in the whitelist line matches:
764 ipv4
765 dirport
766 orport
768 ipv6 address and port (if present)
769 If the fallback has an ipv6 key, the whitelist line must also have
770 it, and vice versa, otherwise they don't match. """
771 for entry in relaylist:
772 if entry['id'] != self._fpr:
773 # can't log here, every relay's fingerprint is compared to the entry
774 continue
775 if entry['ipv4'] != self.dirip:
776 logging.info('%s is not in the whitelist: fingerprint matches, but ' +
777 'IPv4 (%s) does not match entry IPv4 (%s)',
778 self._fpr, self.dirip, entry['ipv4'])
779 continue
780 if int(entry['dirport']) != self.dirport:
781 logging.info('%s is not in the whitelist: fingerprint matches, but ' +
782 'DirPort (%d) does not match entry DirPort (%d)',
783 self._fpr, self.dirport, int(entry['dirport']))
784 continue
785 if int(entry['orport']) != self.orport:
786 logging.info('%s is not in the whitelist: fingerprint matches, but ' +
787 'ORPort (%d) does not match entry ORPort (%d)',
788 self._fpr, self.orport, int(entry['orport']))
789 continue
790 has_ipv6 = self.ipv6addr is not None and self.ipv6orport is not None
791 if (entry.has_key('ipv6') and has_ipv6):
792 ipv6 = self.ipv6addr + ':' + self.ipv6orport
793 # if both entry and fallback have an ipv6 address, compare them
794 if entry['ipv6'] != ipv6:
795 logging.info('%s is not in the whitelist: fingerprint matches, ' +
796 'but IPv6 (%s) does not match entry IPv6 (%s)',
797 self._fpr, ipv6, entry['ipv6'])
798 continue
799 # if the fallback has an IPv6 address but the whitelist entry
800 # doesn't, or vice versa, the whitelist entry doesn't match
801 elif entry.has_key('ipv6') and not has_ipv6:
802 logging.info('%s is not in the whitelist: fingerprint matches, but ' +
803 'it has no IPv6, and entry has IPv6 (%s)', self._fpr,
804 entry['ipv6'])
805 logging.warning('%s excluded: has it lost its former IPv6 address %s?',
806 self._fpr, entry['ipv6'])
807 continue
808 elif not entry.has_key('ipv6') and has_ipv6:
809 logging.info('%s is not in the whitelist: fingerprint matches, but ' +
810 'it has IPv6 (%s), and entry has no IPv6', self._fpr,
811 ipv6)
812 logging.warning('%s excluded: has it gained an IPv6 address %s?',
813 self._fpr, ipv6)
814 continue
815 return True
816 return False
818 def is_in_blacklist(self, relaylist):
819 """ A fallback matches a blacklist line if a sufficiently specific group
820 of attributes matches:
821 ipv4 & dirport
822 ipv4 & orport
824 ipv6 & dirport
825 ipv6 & ipv6 orport
826 If the fallback and the blacklist line both have an ipv6 key,
827 their values will be compared, otherwise, they will be ignored.
828 If there is no dirport and no orport, the entry matches all relays on
829 that ip. """
830 for entry in relaylist:
831 for key in entry:
832 value = entry[key]
833 if key == 'id' and value == self._fpr:
834 logging.info('%s is in the blacklist: fingerprint matches',
835 self._fpr)
836 return True
837 if key == 'ipv4' and value == self.dirip:
838 # if the dirport is present, check it too
839 if entry.has_key('dirport'):
840 if int(entry['dirport']) == self.dirport:
841 logging.info('%s is in the blacklist: IPv4 (%s) and ' +
842 'DirPort (%d) match', self._fpr, self.dirip,
843 self.dirport)
844 return True
845 # if the orport is present, check it too
846 elif entry.has_key('orport'):
847 if int(entry['orport']) == self.orport:
848 logging.info('%s is in the blacklist: IPv4 (%s) and ' +
849 'ORPort (%d) match', self._fpr, self.dirip,
850 self.orport)
851 return True
852 else:
853 logging.info('%s is in the blacklist: IPv4 (%s) matches, and ' +
854 'entry has no DirPort or ORPort', self._fpr,
855 self.dirip)
856 return True
857 has_ipv6 = self.ipv6addr is not None and self.ipv6orport is not None
858 ipv6 = (self.ipv6addr + ':' + self.ipv6orport) if has_ipv6 else None
859 if (key == 'ipv6' and has_ipv6):
860 # if both entry and fallback have an ipv6 address, compare them,
861 # otherwise, disregard ipv6 addresses
862 if value == ipv6:
863 # if the dirport is present, check it too
864 if entry.has_key('dirport'):
865 if int(entry['dirport']) == self.dirport:
866 logging.info('%s is in the blacklist: IPv6 (%s) and ' +
867 'DirPort (%d) match', self._fpr, ipv6,
868 self.dirport)
869 return True
870 # we've already checked the ORPort, it's part of entry['ipv6']
871 else:
872 logging.info('%s is in the blacklist: IPv6 (%s) matches, and ' +
873 'entry has no DirPort', self._fpr, ipv6)
874 return True
875 elif (key == 'ipv6' or has_ipv6):
876 # only log if the fingerprint matches but the IPv6 doesn't
877 if entry.has_key('id') and entry['id'] == self._fpr:
878 logging.info('%s skipping IPv6 blacklist comparison: relay ' +
879 'has%s IPv6%s, but entry has%s IPv6%s', self._fpr,
880 '' if has_ipv6 else ' no',
881 (' (' + ipv6 + ')') if has_ipv6 else '',
882 '' if key == 'ipv6' else ' no',
883 (' (' + value + ')') if key == 'ipv6' else '')
884 logging.warning('Has %s %s IPv6 address %s?', self._fpr,
885 'gained an' if has_ipv6 else 'lost its former',
886 ipv6 if has_ipv6 else value)
887 return False
889 def is_exit(self):
890 return 'Exit' in self._data['flags']
892 def is_guard(self):
893 return 'Guard' in self._data['flags']
895 def is_running(self):
896 return 'Running' in self._data['flags']
898 def fallback_weight_fraction(self, total_weight):
899 return float(self._data['consensus_weight']) / total_weight
901 # return the original consensus weight, if it exists,
902 # or, if not, return the consensus weight
903 def original_consensus_weight(self):
904 if self._data.has_key('original_consensus_weight'):
905 return self._data['original_consensus_weight']
906 else:
907 return self._data['consensus_weight']
909 def original_fallback_weight_fraction(self, total_weight):
910 return float(self.original_consensus_weight()) / total_weight
912 @staticmethod
913 def fallback_consensus_dl_speed(dirip, dirport, nickname, max_time):
914 download_failed = False
915 downloader = DescriptorDownloader()
916 start = datetime.datetime.utcnow()
917 # some directory mirrors respond to requests in ways that hang python
918 # sockets, which is why we log this line here
919 logging.info('Initiating consensus download from %s (%s:%d).', nickname,
920 dirip, dirport)
921 # there appears to be about 1 second of overhead when comparing stem's
922 # internal trace time and the elapsed time calculated here
923 TIMEOUT_SLOP = 1.0
924 try:
925 downloader.get_consensus(endpoints = [(dirip, dirport)],
926 timeout = (max_time + TIMEOUT_SLOP),
927 validate = True,
928 retries = 0,
929 fall_back_to_authority = False).run()
930 except Exception, stem_error:
931 logging.debug('Unable to retrieve a consensus from %s: %s', nickname,
932 stem_error)
933 status = 'error: "%s"' % (stem_error)
934 level = logging.WARNING
935 download_failed = True
936 elapsed = (datetime.datetime.utcnow() - start).total_seconds()
937 if elapsed > max_time:
938 status = 'too slow'
939 level = logging.WARNING
940 download_failed = True
941 elif not download_failed:
942 status = 'ok'
943 level = logging.DEBUG
944 logging.log(level, 'Consensus download: %0.1fs %s from %s (%s:%d), ' +
945 'max download time %0.1fs.', elapsed, status, nickname,
946 dirip, dirport, max_time)
947 return download_failed
949 def fallback_consensus_dl_check(self):
950 # include the relay if we're not doing a check, or we can't check (IPv6)
951 ipv4_failed = False
952 ipv6_failed = False
953 if PERFORM_IPV4_DIRPORT_CHECKS:
954 ipv4_failed = Candidate.fallback_consensus_dl_speed(self.dirip,
955 self.dirport,
956 self._data['nickname'],
957 CONSENSUS_DOWNLOAD_SPEED_MAX)
958 if self.ipv6addr is not None and PERFORM_IPV6_DIRPORT_CHECKS:
959 # Clients assume the IPv6 DirPort is the same as the IPv4 DirPort
960 ipv6_failed = Candidate.fallback_consensus_dl_speed(self.ipv6addr,
961 self.dirport,
962 self._data['nickname'],
963 CONSENSUS_DOWNLOAD_SPEED_MAX)
964 # Now retry the relay if it took too long the first time
965 if (PERFORM_IPV4_DIRPORT_CHECKS and ipv4_failed
966 and CONSENSUS_DOWNLOAD_RETRY):
967 ipv4_failed = Candidate.fallback_consensus_dl_speed(self.dirip,
968 self.dirport,
969 self._data['nickname'],
970 CONSENSUS_DOWNLOAD_SPEED_MAX)
971 if (self.ipv6addr is not None and PERFORM_IPV6_DIRPORT_CHECKS
972 and ipv6_failed and CONSENSUS_DOWNLOAD_RETRY):
973 ipv6_failed = Candidate.fallback_consensus_dl_speed(self.ipv6addr,
974 self.dirport,
975 self._data['nickname'],
976 CONSENSUS_DOWNLOAD_SPEED_MAX)
977 return ((not ipv4_failed) and (not ipv6_failed))
979 def fallbackdir_line(self, total_weight, original_total_weight, dl_speed_ok):
980 # /*
981 # nickname
982 # flags
983 # weight / total (percentage)
984 # [original weight / original total (original percentage)]
985 # [contact]
986 # */
987 # "address:dirport orport=port id=fingerprint"
988 # "[ipv6=addr:orport]"
989 # "weight=num",
991 # Multiline C comment
992 s = '/*'
993 s += '\n'
994 s += cleanse_c_multiline_comment(self._data['nickname'])
995 s += '\n'
996 s += 'Flags: '
997 s += cleanse_c_multiline_comment(' '.join(sorted(self._data['flags'])))
998 s += '\n'
999 weight = self._data['consensus_weight']
1000 percent_weight = self.fallback_weight_fraction(total_weight)*100
1001 s += 'Fallback Weight: %d / %d (%.3f%%)'%(weight, total_weight,
1002 percent_weight)
1003 s += '\n'
1004 o_weight = self.original_consensus_weight()
1005 if o_weight != weight:
1006 o_percent_weight = self.original_fallback_weight_fraction(
1007 original_total_weight)*100
1008 s += 'Consensus Weight: %d / %d (%.3f%%)'%(o_weight,
1009 original_total_weight,
1010 o_percent_weight)
1011 s += '\n'
1012 if self._data['contact'] is not None:
1013 s += cleanse_c_multiline_comment(self._data['contact'])
1014 s += '\n'
1015 s += '*/'
1016 s += '\n'
1017 # Comment out the fallback directory entry if it's too slow
1018 # See the debug output for which address and port is failing
1019 if not dl_speed_ok:
1020 s += '/* Consensus download failed or was too slow:\n'
1021 # Multi-Line C string with trailing comma (part of a string list)
1022 # This makes it easier to diff the file, and remove IPv6 lines using grep
1023 # Integers don't need escaping
1024 s += '"%s orport=%d id=%s"'%(
1025 cleanse_c_string(self._data['dir_address']),
1026 self.orport,
1027 cleanse_c_string(self._fpr))
1028 s += '\n'
1029 if self.ipv6addr is not None:
1030 s += '" ipv6=%s:%s"'%(
1031 cleanse_c_string(self.ipv6addr), cleanse_c_string(self.ipv6orport))
1032 s += '\n'
1033 s += '" weight=%d",'%(weight)
1034 if not dl_speed_ok:
1035 s += '\n'
1036 s += '*/'
1037 return s
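# A generated entry looks roughly like this (hypothetical relay, shown here
# as a comment since the real list is built at run time):
#   /*
#   ExampleNickname
#   Flags: Fast Guard Running Stable V2Dir Valid
#   Fallback Weight: 20000 / 1000000 (2.000%)
#   contact person <tor AT example dot org>
#   */
#   "203.0.113.1:80 orport=443 id=0123456789ABCDEF0123456789ABCDEF01234567"
#   " ipv6=[2001:db8::1]:443"
#   " weight=20000",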
1039 ## Fallback Candidate List Class
1041 class CandidateList(dict):
1042 def __init__(self):
1043 pass
1045 def _add_relay(self, details):
1046 if not 'dir_address' in details: return
1047 c = Candidate(details)
1048 self[ c.get_fingerprint() ] = c
1050 def _add_uptime(self, uptime):
1051 try:
1052 fpr = uptime['fingerprint']
1053 except KeyError:
1054 raise Exception("Document has no fingerprint field.")
1056 try:
1057 c = self[fpr]
1058 except KeyError:
1059 logging.debug('Got unknown relay %s in uptime document.'%(fpr,))
1060 return
1062 c.add_uptime(uptime)
1064 def _add_details(self):
1065 logging.debug('Loading details document.')
1066 d = fetch('details',
1067 fields=('fingerprint,nickname,contact,last_changed_address_or_port,' +
1068 'consensus_weight,or_addresses,dir_address,' +
1069 'recommended_version,flags'))
1070 logging.debug('Loading details document done.')
1072 if not 'relays' in d: raise Exception("No relays found in document.")
1074 for r in d['relays']: self._add_relay(r)
1076 def _add_uptimes(self):
1077 logging.debug('Loading uptime document.')
1078 d = fetch('uptime')
1079 logging.debug('Loading uptime document done.')
1081 if not 'relays' in d: raise Exception("No relays found in document.")
1082 for r in d['relays']: self._add_uptime(r)
1084 def add_relays(self):
1085 self._add_details()
1086 self._add_uptimes()
1088 def count_guards(self):
1089 guard_count = 0
1090 for fpr in self.keys():
1091 if self[fpr].is_guard():
1092 guard_count += 1
1093 return guard_count
1095 # Find fallbacks that fit the uptime, stability, and flags criteria
1096 def compute_fallbacks(self):
1097 self.fallbacks = map(lambda x: self[x],
1098 sorted(
1099 filter(lambda x: self[x].is_candidate(),
1100 self.keys()),
1101 key=lambda x: self[x]._data['consensus_weight'],
1102 reverse=True))
1105 @staticmethod
1106 def load_relaylist(file_name):
1107 """ Read each line in the file, and parse it like a FallbackDir line:
1108 an IPv4 address and optional port:
1109 <IPv4 address>:<port>
1110 which are parsed into dictionary entries:
1111 ipv4=<IPv4 address>
1112 dirport=<port>
1113 followed by a series of key=value entries:
1114 orport=<port>
1115 id=<fingerprint>
1116 ipv6=<IPv6 address>:<IPv6 orport>
1117 each line's key/value pairs are placed in a dictionary,
1118 (of string -> string key/value pairs),
1119 and these dictionaries are placed in an array.
1120 comments start with # and are ignored """
1121 relaylist = []
1122 file_data = read_from_file(file_name, MAX_LIST_FILE_SIZE)
1123 if file_data is None:
1124 return relaylist
1125 for line in file_data.split('\n'):
1126 relay_entry = {}
1127 # ignore comments
1128 line_comment_split = line.split('#')
1129 line = line_comment_split[0]
1130 # cleanup whitespace
1131 line = cleanse_whitespace(line)
1132 line = line.strip()
1133 if len(line) == 0:
1134 continue
1135 for item in line.split(' '):
1136 item = item.strip()
1137 if len(item) == 0:
1138 continue
1139 key_value_split = item.split('=')
1140 kvl = len(key_value_split)
1141 if kvl < 1 or kvl > 2:
1142 print '#error Bad %s item: %s, format is key=value.'%(
1143 file_name, item)
1144 if kvl == 1:
1145 # assume that entries without a key are the ipv4 address,
1146 # perhaps with a dirport
1147 ipv4_maybe_dirport = key_value_split[0]
1148 ipv4_maybe_dirport_split = ipv4_maybe_dirport.split(':')
1149 dirl = len(ipv4_maybe_dirport_split)
1150 if dirl < 1 or dirl > 2:
1151 print '#error Bad %s IPv4 item: %s, format is ipv4:port.'%(
1152 file_name, item)
1153 if dirl >= 1:
1154 relay_entry['ipv4'] = ipv4_maybe_dirport_split[0]
1155 if dirl == 2:
1156 relay_entry['dirport'] = ipv4_maybe_dirport_split[1]
1157 elif kvl == 2:
1158 relay_entry[key_value_split[0]] = key_value_split[1]
1159 relaylist.append(relay_entry)
1160 return relaylist
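# For example (hypothetical line), load_relaylist() turns
#   203.0.113.1:80 orport=443 id=0123456789ABCDEF0123456789ABCDEF01234567
# into
#   {'ipv4': '203.0.113.1', 'dirport': '80', 'orport': '443',
#    'id': '0123456789ABCDEF0123456789ABCDEF01234567'}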
1162 # apply the fallback whitelist and blacklist
1163 def apply_filter_lists(self):
1164 excluded_count = 0
1165 logging.debug('Applying whitelist and blacklist.')
1166 # parse the whitelist and blacklist
1167 whitelist = self.load_relaylist(WHITELIST_FILE_NAME)
1168 blacklist = self.load_relaylist(BLACKLIST_FILE_NAME)
1169 filtered_fallbacks = []
1170 for f in self.fallbacks:
1171 in_whitelist = f.is_in_whitelist(whitelist)
1172 in_blacklist = f.is_in_blacklist(blacklist)
1173 if in_whitelist and in_blacklist:
1174 if BLACKLIST_EXCLUDES_WHITELIST_ENTRIES:
1175 # exclude
1176 excluded_count += 1
1177 logging.warning('Excluding %s: in both blacklist and whitelist.',
1178 f._fpr)
1179 else:
1180 # include
1181 filtered_fallbacks.append(f)
1182 elif in_whitelist:
1183 # include
1184 filtered_fallbacks.append(f)
1185 elif in_blacklist:
1186 # exclude
1187 excluded_count += 1
1188 logging.debug('Excluding %s: in blacklist.', f._fpr)
1189 else:
1190 if INCLUDE_UNLISTED_ENTRIES:
1191 # include
1192 filtered_fallbacks.append(f)
1193 else:
1194 # exclude
1195 excluded_count += 1
1196 logging.info('Excluding %s: in neither blacklist nor whitelist.',
1197 f._fpr)
1198 self.fallbacks = filtered_fallbacks
1199 return excluded_count
1201 @staticmethod
1202 def summarise_filters(initial_count, excluded_count):
1203 return '/* Whitelist & blacklist excluded %d of %d candidates. */'%(
1204 excluded_count, initial_count)
1206 # Remove any fallbacks in excess of MAX_FALLBACK_COUNT,
1207 # starting with the lowest-weighted fallbacks
1208 # total_weight should be recalculated after calling this
1209 def exclude_excess_fallbacks(self):
1210 if MAX_FALLBACK_COUNT is not None:
1211 self.fallbacks = self.fallbacks[:MAX_FALLBACK_COUNT]
1213 # Clamp the weight of all fallbacks to MAX_WEIGHT_FRACTION * total_weight
1214 # fallbacks are kept sorted, but since excessive weights are reduced to
1215 # the maximum acceptable weight, these relays end up with equal weights
1216 def clamp_high_weight_fallbacks(self, total_weight):
1217 if MAX_WEIGHT_FRACTION * len(self.fallbacks) < 1.0:
1218 error_str = 'Max Fallback Weight %.3f%% is unachievable'%(
1219 MAX_WEIGHT_FRACTION*100)
1220 error_str += ' with Current Fallback Count %d.'%(len(self.fallbacks))
1221 if STRICT_FALLBACK_WEIGHTS:
1222 print '#error ' + error_str
1223 else:
1224 print '/* ' + error_str + ' */'
1225 relays_clamped = 0
1226 max_acceptable_weight = total_weight * MAX_WEIGHT_FRACTION
1227 for f in self.fallbacks:
1228 frac_weight = f.fallback_weight_fraction(total_weight)
1229 if frac_weight > MAX_WEIGHT_FRACTION:
1230 relays_clamped += 1
1231 current_weight = f._data['consensus_weight']
1232 # if we already have an original weight, keep it
1233 if (not f._data.has_key('original_consensus_weight')
1234 or f._data['original_consensus_weight'] == current_weight):
1235 f._data['original_consensus_weight'] = current_weight
1236 f._data['consensus_weight'] = max_acceptable_weight
1237 return relays_clamped
1239 # Remove any fallbacks with weights lower than MIN_WEIGHT_FRACTION
1240 # total_weight should be recalculated after calling this
1241 def exclude_low_weight_fallbacks(self, total_weight):
1242 self.fallbacks = filter(
1243 lambda x:
1244 x.fallback_weight_fraction(total_weight) >= MIN_WEIGHT_FRACTION,
1245 self.fallbacks)
1247 def fallback_weight_total(self):
1248 return sum(f._data['consensus_weight'] for f in self.fallbacks)
1250 def fallback_min_weight(self):
1251 if len(self.fallbacks) > 0:
1252 return self.fallbacks[-1]
1253 else:
1254 return None
1256 def fallback_max_weight(self):
1257 if len(self.fallbacks) > 0:
1258 return self.fallbacks[0]
1259 else:
1260 return None
1262 def summarise_fallbacks(self, eligible_count, eligible_weight,
1263 relays_clamped, clamped_weight,
1264 guard_count, target_count, max_count):
1265 # Report:
1266 # the number of fallback directories (with min & max limits);
1267 # #error if below minimum count
1268 # the total weight, min & max fallback proportions
1269 # #error if outside max weight proportion
1270 # Multiline C comment with #error if things go bad
1271 s = '/*'
1272 s += '\n'
1273 s += 'Fallback Directory Summary'
1274 s += '\n'
1275 # Integers don't need escaping in C comments
1276 fallback_count = len(self.fallbacks)
1277 if FALLBACK_PROPORTION_OF_GUARDS is None:
1278 fallback_proportion = ' (none)'
1279 else:
1280 fallback_proportion = '%d (%d * %f)'%(target_count, guard_count,
1281 FALLBACK_PROPORTION_OF_GUARDS)
1282 s += 'Final Count: %d (Eligible %d, Usable %d, Target %s'%(
1283 min(max_count, fallback_count),
1284 eligible_count,
1285 fallback_count,
1286 fallback_proportion)
1287 if MAX_FALLBACK_COUNT is not None:
1288 s += ', Clamped to %d'%(MAX_FALLBACK_COUNT)
1289 s += ')\n'
1290 if fallback_count < MIN_FALLBACK_COUNT:
1291 s += '*/'
1292 s += '\n'
1293 # We must have a minimum number of fallbacks so they are always
1294 # reachable, and are in diverse locations
1295 s += '#error Fallback Count %d is too low. '%(fallback_count)
1296 s += 'Must be at least %d for diversity. '%(MIN_FALLBACK_COUNT)
1297 s += 'Try adding entries to the whitelist, '
1298 s += 'or setting INCLUDE_UNLISTED_ENTRIES = True.'
1299 s += '\n'
1300 s += '/*'
1301 s += '\n'
1302 total_weight = self.fallback_weight_total()
1303 min_fb = self.fallback_min_weight()
1304 min_weight = min_fb._data['consensus_weight']
1305 min_percent = min_fb.fallback_weight_fraction(total_weight)*100.0
1306 max_fb = self.fallback_max_weight()
1307 max_weight = max_fb._data['consensus_weight']
1308 max_frac = max_fb.fallback_weight_fraction(total_weight)
1309 max_percent = max_frac*100.0
1310 s += 'Final Weight: %d (Eligible %d)'%(total_weight, eligible_weight)
1311 s += '\n'
1312 s += 'Max Weight: %d (%.3f%%) (Clamped to %.3f%%)'%(
1313 max_weight,
1314 max_percent,
1315 TARGET_MAX_WEIGHT_FRACTION*100)
1316 s += '\n'
1317 s += 'Min Weight: %d (%.3f%%) (Clamped to %.3f%%)'%(
1318 min_weight,
1319 min_percent,
1320 MIN_WEIGHT_FRACTION*100)
1321 s += '\n'
1322 if eligible_count != fallback_count:
1323 s += 'Excluded: %d (Clamped, Below Target, or Low Weight)'%(
1324 eligible_count - fallback_count)
1325 s += '\n'
1326 if relays_clamped > 0:
1327 s += 'Clamped: %d (%.3f%%) Excess Weight, '%(
1328 clamped_weight,
1329 (100.0 * clamped_weight) / total_weight)
1330 s += '%d High Weight Fallbacks (%.1f%%)'%(
1331 relays_clamped,
1332 (100.0 * relays_clamped) / fallback_count)
1333 s += '\n'
1334 s += '*/'
1335 if max_frac > TARGET_MAX_WEIGHT_FRACTION:
1336 s += '\n'
1337 # We must restrict the maximum fallback weight, so an adversary
1338 # at or near the fallback doesn't see too many clients
1339 error_str = 'Max Fallback Weight %.3f%% is too high. '%(max_frac*100)
1340 error_str += 'Must be at most %.3f%% for client anonymity.'%(
1341 TARGET_MAX_WEIGHT_FRACTION*100)
1342 if STRICT_FALLBACK_WEIGHTS:
1343 s += '#error ' + error_str
1344 else:
1345 s += '/* ' + error_str + ' */'
1346 s += '\n'
1347 if PERFORM_IPV4_DIRPORT_CHECKS or PERFORM_IPV6_DIRPORT_CHECKS:
1348 s += '/* Checked %s%s%s DirPorts served a consensus within %.1fs. */'%(
1349 'IPv4' if PERFORM_IPV4_DIRPORT_CHECKS else '',
1350 ' and ' if (PERFORM_IPV4_DIRPORT_CHECKS
1351 and PERFORM_IPV6_DIRPORT_CHECKS) else '',
1352 'IPv6' if PERFORM_IPV6_DIRPORT_CHECKS else '',
1353 CONSENSUS_DOWNLOAD_SPEED_MAX)
1354 else:
1355 s += '/* Did not check IPv4 or IPv6 DirPort consensus downloads. */'
1356 return s
1358 ## Main Function
1360 def list_fallbacks():
1361 """ Fetches required onionoo documents and evaluates the
1362 fallback directory criteria for each of the relays """
1364 # find relays that could be fallbacks
1365 candidates = CandidateList()
1366 candidates.add_relays()
1368 # work out how many fallbacks we want
1369 guard_count = candidates.count_guards()
1370 if FALLBACK_PROPORTION_OF_GUARDS is None:
1371 target_count = guard_count
1372 else:
1373 target_count = int(guard_count * FALLBACK_PROPORTION_OF_GUARDS)
1374 # the maximum number of fallbacks is the least of:
1375 # - the target fallback count (FALLBACK_PROPORTION_OF_GUARDS * guard count)
1376 # - the maximum fallback count (MAX_FALLBACK_COUNT)
1377 if MAX_FALLBACK_COUNT is None:
1378 max_count = guard_count
1379 else:
1380 max_count = min(target_count, MAX_FALLBACK_COUNT)
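# For example (illustrative numbers): with 1000 guards and the default
# FALLBACK_PROPORTION_OF_GUARDS of 0.2, target_count is 200, and
# max_count = min(200, MAX_FALLBACK_COUNT) = min(200, 500) = 200.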
1382 candidates.compute_fallbacks()
1384 # filter with the whitelist and blacklist
1385 initial_count = len(candidates.fallbacks)
1386 excluded_count = candidates.apply_filter_lists()
1387 print candidates.summarise_filters(initial_count, excluded_count)
1388 eligible_count = len(candidates.fallbacks)
1389 eligible_weight = candidates.fallback_weight_total()
1391 # print the raw fallback list
1392 #total_weight = candidates.fallback_weight_total()
1393 #for x in candidates.fallbacks:
1394 # print x.fallbackdir_line(total_weight, total_weight)
1396 # When candidates are excluded, total_weight decreases, and
1397 # the proportional weight of other candidates increases.
1398 candidates.exclude_excess_fallbacks()
1399 total_weight = candidates.fallback_weight_total()
1401 # When candidates are reweighted, total_weight decreases, and
1402 # the proportional weight of other candidates increases.
1403 # Previously low-weight candidates might obtain sufficient proportional
1404 # weights to be included.
1405 # Save the weight at which we reweighted fallbacks for the summary.
1406 pre_clamp_total_weight = total_weight
1407 relays_clamped = candidates.clamp_high_weight_fallbacks(total_weight)
1409 # When candidates are excluded, total_weight decreases, and
1410 # the proportional weight of other candidates increases.
1411 # No new low weight candidates will be created during exclusions.
1412 # However, high weight candidates may increase over the maximum proportion.
1413 # This should not be an issue, except in pathological cases.
1414 candidates.exclude_low_weight_fallbacks(total_weight)
1415 total_weight = candidates.fallback_weight_total()
1417 # check we haven't exceeded TARGET_MAX_WEIGHT_FRACTION
1418 # since reweighting preserves the original sort order,
1419 # the maximum weights will be at the head of the list
1420 if len(candidates.fallbacks) > 0:
1421 max_weight_fb = candidates.fallback_max_weight()
1422 max_weight = max_weight_fb.fallback_weight_fraction(total_weight)
1423 if max_weight > TARGET_MAX_WEIGHT_FRACTION:
1424 error_str = 'Maximum fallback weight: %.3f%% exceeds target %.3f%%. '%(
1425 max_weight*100.0,
1426 TARGET_MAX_WEIGHT_FRACTION*100.0)
1427 error_str += 'Try decreasing REWEIGHTING_FUDGE_FACTOR.'
1428 if STRICT_FALLBACK_WEIGHTS:
1429 print '#error ' + error_str
1430 else:
1431 print '/* ' + error_str + ' */'
1433 print candidates.summarise_fallbacks(eligible_count, eligible_weight,
1434 relays_clamped,
1435 pre_clamp_total_weight - total_weight,
1436 guard_count, target_count, max_count)
1437 else:
1438 print '/* No Fallbacks met criteria */'
1440 for s in fetch_source_list():
1441 print describe_fetch_source(s)
1443 for x in candidates.fallbacks[:max_count]:
1444 dl_speed_ok = x.fallback_consensus_dl_check()
1445 print x.fallbackdir_line(total_weight, pre_clamp_total_weight, dl_speed_ok)
1446 #print json.dumps(candidates[x]._data, sort_keys=True, indent=4,
1447 # separators=(',', ': '), default=json_util.default)
1449 if __name__ == "__main__":
1450 list_fallbacks()