#!/usr/bin/python

# Usage: scripts/maint/updateFallbackDirs.py > src/or/fallback_dirs.inc

# This script should be run from a stable, reliable network connection,
# with no other network activity (and not over tor).
# If this is not possible, please disable:
# PERFORM_IPV4_DIRPORT_CHECKS and PERFORM_IPV6_DIRPORT_CHECKS

# Needs dateutil (and potentially other python packages)
# Needs stem available in your PYTHONPATH, or just ln -s ../stem/stem .
# Optionally uses ipaddress (python 3 builtin) or py2-ipaddress (package)
# for netblock analysis, in PYTHONPATH, or just
# ln -s ../py2-ipaddress-3.4.1/ipaddress.py .

# Then read the logs to make sure the fallbacks aren't dominated by a single
# netblock or port

# Script by weasel, April 2015
# Portions by gsathya & karsten, 2013
# https://trac.torproject.org/projects/tor/attachment/ticket/8374/dir_list.2.py
# Modifications by teor, 2015

import StringIO
import string
import re
import datetime
import gzip
import os.path
import json
import math
import sys
import urllib
import urllib2
import hashlib
import dateutil.parser
# bson_lazy provides bson
#from bson import json_util
import copy

from stem.descriptor.remote import DescriptorDownloader

import logging
# INFO tells you why each relay was included or excluded
# WARN tells you about potential misconfigurations and relay detail changes
logging.basicConfig(level=logging.WARNING)
logging.root.name = ''
# INFO tells you about each consensus download attempt
logging.getLogger('stem').setLevel(logging.WARNING)

HAVE_IPADDRESS = False
try:
  # python 3 builtin, or install package py2-ipaddress
  # there are several ipaddress implementations for python 2,
  # with slightly different semantics for str typed text
  # fortunately, all our IP addresses are in unicode
  import ipaddress
  HAVE_IPADDRESS = True
except ImportError:
  # if this happens, we avoid doing netblock analysis
  logging.warning('Unable to import ipaddress, please install py2-ipaddress.' +
                  ' A fallback list will be created, but optional netblock' +
                  ' analysis will not be performed.')

## Top-Level Configuration

# Output all candidate fallbacks, or only output selected fallbacks?
OUTPUT_CANDIDATES = False

# Perform DirPort checks over IPv4?
# Change this to False if IPv4 doesn't work for you, or if you don't want to
# download a consensus for each fallback
# Don't check ~1000 candidates when OUTPUT_CANDIDATES is True
PERFORM_IPV4_DIRPORT_CHECKS = False if OUTPUT_CANDIDATES else True

# Perform DirPort checks over IPv6?
# If you know IPv6 works for you, set this to True
# This will exclude IPv6 relays without an IPv6 DirPort configured
# So it's best left at False until #18394 is implemented
# Don't check ~1000 candidates when OUTPUT_CANDIDATES is True
PERFORM_IPV6_DIRPORT_CHECKS = False if OUTPUT_CANDIDATES else False

# Output fallback name, flags, and ContactInfo in a C comment?
OUTPUT_COMMENTS = True if OUTPUT_CANDIDATES else False

# Output the number of matching ContactInfos in the fallbacks list or the
# blacklist?
# Useful if you're trying to contact operators
CONTACT_COUNT = True if OUTPUT_CANDIDATES else False
CONTACT_BLACKLIST_COUNT = True if OUTPUT_CANDIDATES else False

## OnionOO Settings

ONIONOO = 'https://onionoo.torproject.org/'
#ONIONOO = 'https://onionoo.thecthulhu.com/'

# Don't bother going out to the Internet, just use the files available locally,
# even if they're very old
LOCAL_FILES_ONLY = False

## Whitelist / Blacklist Filter Settings

# The whitelist contains entries that are included if all attributes match
# (IPv4, dirport, orport, id, and optionally IPv6 and IPv6 orport)
# The blacklist contains (partial) entries that are excluded if any
# sufficiently specific group of attributes matches:
#   IPv4 & DirPort
#   IPv4 & ORPort
#   ID
#   IPv6 & DirPort
#   IPv6 & IPv6 ORPort
# If neither port is included in the blacklist, the entire IP address is
# blacklisted.
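# For example, using illustrative values in the format parsed by
# load_relaylist() below, a blacklist line like:
#   1.2.3.4:80 orport=443 id=0123456789ABCDEF0123456789ABCDEF01234567
# excludes that specific relay, while a bare address line like:
#   1.2.3.4
# excludes every relay on that IP address.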

# What happens to entries in neither list?
# When True, they are included, when False, they are excluded
INCLUDE_UNLISTED_ENTRIES = True if OUTPUT_CANDIDATES else False

# If an entry is in both lists, what happens?
# When True, it is excluded, when False, it is included
BLACKLIST_EXCLUDES_WHITELIST_ENTRIES = True

WHITELIST_FILE_NAME = 'scripts/maint/fallback.whitelist'
BLACKLIST_FILE_NAME = 'scripts/maint/fallback.blacklist'

# The number of bytes we'll read from a filter file before giving up
MAX_LIST_FILE_SIZE = 1024 * 1024

## Eligibility Settings

# Reduced due to a bug in tor where a relay submits a 0 DirPort when restarted
# This causes OnionOO to (correctly) reset its stability timer
# This issue will be fixed in 0.2.7.7 and 0.2.8.2
# Until then, the CUTOFFs below ensure a decent level of stability.
ADDRESS_AND_PORT_STABLE_DAYS = 7
# What time-weighted-fraction of these flags must FallbackDirs
# equal or exceed?
CUTOFF_RUNNING = .95
CUTOFF_V2DIR = .95
CUTOFF_GUARD = .95
# What time-weighted-fraction of these flags must FallbackDirs
# equal or fall under?
# .00 means no bad exits
PERMITTED_BADEXIT = .00

# older entries' weights are adjusted with ALPHA^(age in days)
AGE_ALPHA = 0.99

# this factor is used to scale OnionOO entries to [0,1]
ONIONOO_SCALE_ONE = 999.

## Fallback Count Limits

# The target for these parameters is 20% of the guards in the network
# This is around 200 as of October 2015
_FB_POG = 0.2
FALLBACK_PROPORTION_OF_GUARDS = None if OUTPUT_CANDIDATES else _FB_POG

# We want exactly 100 fallbacks for the initial release
# This gives us scope to add extra fallbacks to the list as needed
# Limit the number of fallbacks (eliminating lowest by advertised bandwidth)
MAX_FALLBACK_COUNT = None if OUTPUT_CANDIDATES else 100
# Emit a C #error if the number of fallbacks is below this minimum
MIN_FALLBACK_COUNT = 100

## Fallback Bandwidth Requirements

# Any fallback with the Exit flag has its bandwidth multiplied by this fraction
# to make sure we aren't further overloading exits
# (Set to 1.0, because we asked that only lightly loaded exits opt-in,
# and the extra load really isn't that much for large relays.)
EXIT_BANDWIDTH_FRACTION = 1.0

# If a single fallback's bandwidth is too low, it's pointless adding it
# We expect fallbacks to handle an extra 30 kilobytes per second of traffic
# Make sure they can support a hundred times the expected extra load
# (Use 102.4 to make it come out nicely in MB/s)
# We convert this to a consensus weight before applying the filter,
# because all the bandwidth amounts are specified by the relay
MIN_BANDWIDTH = 102.4 * 30.0 * 1024.0
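# (102.4 * 30.0 * 1024.0 is 3145728 bytes per second, which the logs below
# report as exactly 3.0 MB/s)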

# Clients will time out after 30 seconds trying to download a consensus
# So allow fallback directories half that time to deliver a consensus
# The exact download times might change based on the network connection
# running this script, but only by a few seconds
# There is also about a second of python overhead
CONSENSUS_DOWNLOAD_SPEED_MAX = 15.0
# If the relay fails a consensus check, retry the download
# This avoids delisting a relay due to transient network conditions
CONSENSUS_DOWNLOAD_RETRY = True

## Fallback Weights for Client Selection

# All fallback weights are equal, and set to the value below
# Authorities are weighted 1.0 by default
# Clients use these weights to select fallbacks and authorities at random
# If there are 100 fallbacks and 9 authorities:
#  - each fallback is chosen with probability 10.0/(10.0*100 + 1.0*9) ~= 0.99%
#  - each authority is chosen with probability 1.0/(10.0*100 + 1.0*9) ~= 0.1%
# A client choosing a bootstrap directory server will choose a fallback for
# 10.0/(10.0*100 + 1.0*9) * 100 = 99.1% of attempts, and an authority for
# 1.0/(10.0*100 + 1.0*9) * 9 = 0.9% of attempts.
# (This disregards the bootstrap schedules, where clients start by choosing
# from fallbacks & authorities, then later choose from only authorities.)
FALLBACK_OUTPUT_WEIGHT = 10.0

## Parsing Functions

def parse_ts(t):
  return datetime.datetime.strptime(t, "%Y-%m-%d %H:%M:%S")

def remove_bad_chars(raw_string, bad_char_list):
  # Remove each character in the bad_char_list
  cleansed_string = raw_string
  for c in bad_char_list:
    cleansed_string = cleansed_string.replace(c, '')
  return cleansed_string

def cleanse_unprintable(raw_string):
  # Remove all unprintable characters
  cleansed_string = ''
  for c in raw_string:
    if c in string.printable:
      cleansed_string += c
  return cleansed_string

def cleanse_whitespace(raw_string):
  # Replace all whitespace characters with a space
  cleansed_string = raw_string
  for c in string.whitespace:
    cleansed_string = cleansed_string.replace(c, ' ')
  return cleansed_string

def cleanse_c_multiline_comment(raw_string):
  cleansed_string = raw_string
  # Embedded newlines should be removed by tor/onionoo, but let's be paranoid
  cleansed_string = cleanse_whitespace(cleansed_string)
  # ContactInfo and Version can be arbitrary binary data
  cleansed_string = cleanse_unprintable(cleansed_string)
  # Prevent a malicious / unanticipated string from breaking out
  # of a C-style multiline comment
  # This removes '/*' and '*/' and '//'
  bad_char_list = '*/'
  # Prevent a malicious string from using C nulls
  bad_char_list += '\0'
  # Be safer by removing bad characters entirely
  cleansed_string = remove_bad_chars(cleansed_string, bad_char_list)
  # Some compilers may further process the content of comments
  # There isn't much we can do to cover every possible case
  # But comment-based directives are typically only advisory
  return cleansed_string
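
# For example, with illustrative input:
#   cleanse_c_multiline_comment('evil */ comment') returns 'evil  comment',
# because each of the characters '*', '/', and NUL is removed entirely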

def cleanse_c_string(raw_string):
  cleansed_string = raw_string
  # Embedded newlines should be removed by tor/onionoo, but let's be paranoid
  cleansed_string = cleanse_whitespace(cleansed_string)
  # ContactInfo and Version can be arbitrary binary data
  cleansed_string = cleanse_unprintable(cleansed_string)
  # Prevent a malicious address/fingerprint string from breaking out
  # of a C-style string
  bad_char_list = '"'
  # Prevent a malicious string from using escapes
  bad_char_list += '\\'
  # Prevent a malicious string from using C nulls
  bad_char_list += '\0'
  # Be safer by removing bad characters entirely
  cleansed_string = remove_bad_chars(cleansed_string, bad_char_list)
  # Some compilers may further process the content of strings
  # There isn't much we can do to cover every possible case
  # But this typically only results in changes to the string data
  return cleansed_string
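
# For example, with illustrative input:
#   cleanse_c_string('name" evil') returns 'name evil',
# which can no longer terminate the C string it is embedded in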

## OnionOO Source Functions

# a dictionary of source metadata for each onionoo query we've made
fetch_source = {}

# register source metadata for 'what'
# assumes we only retrieve one document for each 'what'
def register_fetch_source(what, url, relays_published, version):
  fetch_source[what] = {}
  fetch_source[what]['url'] = url
  fetch_source[what]['relays_published'] = relays_published
  fetch_source[what]['version'] = version

# list each registered source's 'what'
def fetch_source_list():
  return sorted(fetch_source.keys())

# given 'what', provide a multiline C comment describing the source
def describe_fetch_source(what):
  desc = '/*'
  desc += '\n'
  desc += 'Onionoo Source: '
  desc += cleanse_c_multiline_comment(what)
  desc += ' Date: '
  desc += cleanse_c_multiline_comment(fetch_source[what]['relays_published'])
  desc += ' Version: '
  desc += cleanse_c_multiline_comment(fetch_source[what]['version'])
  desc += '\n'
  desc += 'URL: '
  desc += cleanse_c_multiline_comment(fetch_source[what]['url'])
  desc += '\n'
  desc += '*/'
  return desc
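
# The comment built above looks like this (illustrative values):
# /*
# Onionoo Source: details Date: 2015-10-02 13:34:14 Version: 2.0
# URL: https://onionoo.torproject.org/details?type=relay
# */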

## File Processing Functions

def write_to_file(str, file_name, max_len):
  try:
    with open(file_name, 'w') as f:
      f.write(str[0:max_len])
  except EnvironmentError, error:
    logging.error('Writing file %s failed: %d: %s'%
                  (file_name,
                   error.errno,
                   error.strerror))

def read_from_file(file_name, max_len):
  try:
    if os.path.isfile(file_name):
      with open(file_name, 'r') as f:
        return f.read(max_len)
  except EnvironmentError, error:
    logging.info('Loading file %s failed: %d: %s'%
                 (file_name,
                  error.errno,
                  error.strerror))
  return None

def load_possibly_compressed_response_json(response):
  if response.info().get('Content-Encoding') == 'gzip':
    buf = StringIO.StringIO( response.read() )
    f = gzip.GzipFile(fileobj=buf)
    return json.load(f)
  else:
    return json.load(response)

def load_json_from_file(json_file_name):
  # An exception here may be resolved by deleting the .last_modified
  # and .json files, and re-running the script
  try:
    with open(json_file_name, 'r') as f:
      return json.load(f)
  except EnvironmentError, error:
    raise Exception('Reading not-modified json file %s failed: %d: %s'%
                    (json_file_name,
                     error.errno,
                     error.strerror))

## OnionOO Functions

def datestr_to_datetime(datestr):
  # Parse datetimes like: Fri, 02 Oct 2015 13:34:14 GMT
  if datestr is not None:
    dt = dateutil.parser.parse(datestr)
  else:
    # Never modified - use start of epoch
    dt = datetime.datetime.utcfromtimestamp(0)
  # strip any timezone out (in case they're supported in future)
  dt = dt.replace(tzinfo=None)
  return dt

def onionoo_fetch(what, **kwargs):
  params = kwargs
  params['type'] = 'relay'
  #params['limit'] = 10
  params['first_seen_days'] = '%d-'%(ADDRESS_AND_PORT_STABLE_DAYS,)
  params['last_seen_days'] = '-7'
  params['flag'] = 'V2Dir'
  url = ONIONOO + what + '?' + urllib.urlencode(params)

  # Unfortunately, the URL is too long for some OS filenames,
  # but we still don't want to get files from different URLs mixed up
  base_file_name = what + '-' + hashlib.sha1(url).hexdigest()

  full_url_file_name = base_file_name + '.full_url'
  MAX_FULL_URL_LENGTH = 1024

  last_modified_file_name = base_file_name + '.last_modified'
  MAX_LAST_MODIFIED_LENGTH = 64

  json_file_name = base_file_name + '.json'

  if LOCAL_FILES_ONLY:
    # Read from the local file, don't write to anything
    response_json = load_json_from_file(json_file_name)
  else:
    # store the full URL to a file for debugging
    # no need to compare as long as you trust SHA-1
    write_to_file(url, full_url_file_name, MAX_FULL_URL_LENGTH)

    request = urllib2.Request(url)
    request.add_header('Accept-encoding', 'gzip')

    # load the last modified date from the file, if it exists
    last_mod_date = read_from_file(last_modified_file_name,
                                   MAX_LAST_MODIFIED_LENGTH)
    if last_mod_date is not None:
      request.add_header('If-modified-since', last_mod_date)

    # Parse last modified date
    last_mod = datestr_to_datetime(last_mod_date)

    # Not Modified and still recent enough to be useful
    # Onionoo / Globe used to use 6 hours, but we can afford a day
    required_freshness = datetime.datetime.utcnow()
    # strip any timezone out (to match dateutil.parser)
    required_freshness = required_freshness.replace(tzinfo=None)
    required_freshness -= datetime.timedelta(hours=24)

    # Make the OnionOO request
    response_code = 0
    try:
      response = urllib2.urlopen(request)
      response_code = response.getcode()
    except urllib2.HTTPError, error:
      response_code = error.code
      if response_code == 304: # not modified
        pass
      else:
        raise Exception("Could not get " + url + ": "
                        + str(error.code) + ": " + error.reason)

    if response_code == 200: # OK
      last_mod = datestr_to_datetime(response.info().get('Last-Modified'))

    # Check for freshness
    if last_mod < required_freshness:
      if last_mod_date is not None:
        # This check sometimes fails transiently, retry the script if it does
        date_message = "Outdated data: last updated " + last_mod_date
      else:
        date_message = "No data: never downloaded "
      raise Exception(date_message + " from " + url)

    # Process the data
    if response_code == 200: # OK

      response_json = load_possibly_compressed_response_json(response)

      with open(json_file_name, 'w') as f:
        # use the most compact json representation to save space
        json.dump(response_json, f, separators=(',',':'))

      # store the last modified date in its own file
      if response.info().get('Last-modified') is not None:
        write_to_file(response.info().get('Last-Modified'),
                      last_modified_file_name,
                      MAX_LAST_MODIFIED_LENGTH)

    elif response_code == 304: # Not Modified

      response_json = load_json_from_file(json_file_name)

    else: # Unexpected HTTP response code not covered in the HTTPError above
      raise Exception("Unexpected HTTP response code to " + url + ": "
                      + str(response_code))

  register_fetch_source(what,
                        url,
                        response_json['relays_published'],
                        response_json['version'])

  return response_json

def fetch(what, **kwargs):
  #x = onionoo_fetch(what, **kwargs)
  # don't use sort_keys, as the order of or_addresses is significant
  #print json.dumps(x, indent=4, separators=(',', ': '))
  #sys.exit(0)

  return onionoo_fetch(what, **kwargs)

## Fallback Candidate Class

class Candidate(object):
  CUTOFF_ADDRESS_AND_PORT_STABLE = (datetime.datetime.utcnow()
                        - datetime.timedelta(ADDRESS_AND_PORT_STABLE_DAYS))

  def __init__(self, details):
    for f in ['fingerprint', 'nickname', 'last_changed_address_or_port',
              'consensus_weight', 'or_addresses', 'dir_address']:
      if not f in details: raise Exception("Document has no %s field."%(f,))

    if not 'contact' in details:
      details['contact'] = None
    if not 'flags' in details or details['flags'] is None:
      details['flags'] = []
    if (not 'advertised_bandwidth' in details
        or details['advertised_bandwidth'] is None):
      # relays without advertised bandwidth have it calculated from their
      # consensus weight
      details['advertised_bandwidth'] = 0
    if (not 'effective_family' in details
        or details['effective_family'] is None):
      details['effective_family'] = []
    details['last_changed_address_or_port'] = parse_ts(
                                      details['last_changed_address_or_port'])
    self._data = details
    self._stable_sort_or_addresses()

    self._fpr = self._data['fingerprint']
    self._running = self._guard = self._v2dir = 0.
    self._split_dirport()
    self._compute_orport()
    if self.orport is None:
      raise Exception("Failed to get an orport for %s."%(self._fpr,))
    self._compute_ipv6addr()
    if not self.has_ipv6():
      logging.debug("Failed to get an ipv6 address for %s."%(self._fpr,))

  def _stable_sort_or_addresses(self):
    # replace self._data['or_addresses'] with a stable ordering,
    # sorting the secondary addresses in string order
    # leave the received order in self._data['or_addresses_raw']
    self._data['or_addresses_raw'] = self._data['or_addresses']
    or_address_primary = self._data['or_addresses'][:1]
    # subsequent entries in the or_addresses array are in an arbitrary order
    # so we stabilise the addresses by sorting them in string order
    or_addresses_secondaries_stable = sorted(self._data['or_addresses'][1:])
    or_addresses_stable = or_address_primary + or_addresses_secondaries_stable
    self._data['or_addresses'] = or_addresses_stable

  def get_fingerprint(self):
    return self._fpr

  # is_valid_ipv[46]_address by gsathya, karsten, 2013
  @staticmethod
  def is_valid_ipv4_address(address):
    if not isinstance(address, (str, unicode)):
      return False

    # check if there are four period separated values
    if address.count(".") != 3:
      return False

    # check that each octet is a decimal value between 0 and 255
    for entry in address.split("."):
      if not entry.isdigit() or int(entry) < 0 or int(entry) > 255:
        return False
      elif entry[0] == "0" and len(entry) > 1:
        return False # leading zeros, for instance in "1.2.3.001"

    return True
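
  # For example:
  #   is_valid_ipv4_address('1.2.3.4')   returns True
  #   is_valid_ipv4_address('1.2.3.001') returns False (leading zeros)
  #   is_valid_ipv4_address('1.2.3')     returns False (too few octets)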

  @staticmethod
  def is_valid_ipv6_address(address):
    if not isinstance(address, (str, unicode)):
      return False

    # remove brackets
    address = address[1:-1]

    # addresses are made up of eight colon separated groups of four hex digits
    # with leading zeros being optional
    # https://en.wikipedia.org/wiki/IPv6#Address_format

    colon_count = address.count(":")

    if colon_count > 7:
      return False # too many groups
    elif colon_count != 7 and not "::" in address:
      return False # not enough groups and none are collapsed
    elif address.count("::") > 1 or ":::" in address:
      return False # multiple groupings of zeros can't be collapsed

    found_ipv4_on_previous_entry = False
    for entry in address.split(":"):
      # If an IPv6 address has an embedded IPv4 address,
      # it must be the last entry
      if found_ipv4_on_previous_entry:
        return False
      if not re.match("^[0-9a-fA-F]{0,4}$", entry):
        if not Candidate.is_valid_ipv4_address(entry):
          return False
        else:
          found_ipv4_on_previous_entry = True

    return True
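
  # For example:
  #   is_valid_ipv6_address('[2001:db8::1]')    returns True
  #   is_valid_ipv6_address('[::ffff:1.2.3.4]') returns True (embedded IPv4)
  #   is_valid_ipv6_address('[1:2:3]')          returns False (too few groups)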

  def _split_dirport(self):
    # Split the dir_address into dirip and dirport
    (self.dirip, _dirport) = self._data['dir_address'].split(':', 2)
    self.dirport = int(_dirport)

  def _compute_orport(self):
    # Choose the first ORPort that's on the same IPv4 address as the DirPort.
    # In rare circumstances, this might not be the primary ORPort address.
    # However, _stable_sort_or_addresses() ensures we choose the same one
    # every time, even if onionoo changes the order of the secondaries.
    self._split_dirport()
    self.orport = None
    for i in self._data['or_addresses']:
      if i != self._data['or_addresses'][0]:
        logging.debug('Secondary IPv4 Address Used for %s: %s'%(self._fpr, i))
      (ipaddr, port) = i.rsplit(':', 1)
      if (ipaddr == self.dirip) and Candidate.is_valid_ipv4_address(ipaddr):
        self.orport = int(port)
        return

  def _compute_ipv6addr(self):
    # Choose the first IPv6 address that uses the same port as the ORPort
    # Or, choose the first IPv6 address in the list
    # _stable_sort_or_addresses() ensures we choose the same IPv6 address
    # every time, even if onionoo changes the order of the secondaries.
    self.ipv6addr = None
    self.ipv6orport = None
    # Choose the first IPv6 address that uses the same port as the ORPort
    for i in self._data['or_addresses']:
      (ipaddr, port) = i.rsplit(':', 1)
      # port is a string here, so convert it before comparing with orport
      if (int(port) == self.orport) and Candidate.is_valid_ipv6_address(ipaddr):
        self.ipv6addr = ipaddr
        self.ipv6orport = int(port)
        return
    # Choose the first IPv6 address in the list
    for i in self._data['or_addresses']:
      (ipaddr, port) = i.rsplit(':', 1)
      if Candidate.is_valid_ipv6_address(ipaddr):
        self.ipv6addr = ipaddr
        self.ipv6orport = int(port)
        return

  @staticmethod
  def _extract_generic_history(history, which='unknown'):
    # given a tree like this:
    #   {
    #     "1_month": {
    #       "count": 187,
    #       "factor": 0.001001001001001001,
    #       "first": "2015-02-27 06:00:00",
    #       "interval": 14400,
    #       "last": "2015-03-30 06:00:00",
    #       "values": [
    #         999,
    #         999
    #       ]
    #     },
    #     "1_week": {
    #       "count": 169,
    #       "factor": 0.001001001001001001,
    #       "first": "2015-03-23 07:30:00",
    #       "interval": 3600,
    #       "last": "2015-03-30 07:30:00",
    #       "values": [ ...]
    #     },
    #     "1_year": {
    #       "count": 177,
    #       "factor": 0.001001001001001001,
    #       "first": "2014-04-11 00:00:00",
    #       "interval": 172800,
    #       "last": "2015-03-29 00:00:00",
    #       "values": [ ...]
    #     },
    #     "3_months": {
    #       "count": 185,
    #       "factor": 0.001001001001001001,
    #       "first": "2014-12-28 06:00:00",
    #       "interval": 43200,
    #       "last": "2015-03-30 06:00:00",
    #       "values": [ ...]
    #     }
    #   },
    # extract exactly one piece of data per time interval,
    # using smaller intervals where available.
    #
    # returns list of (age, length, value) dictionaries.

    generic_history = []

    periods = history.keys()
    periods.sort(key = lambda x: history[x]['interval'])
    now = datetime.datetime.utcnow()
    newest = now
    for p in periods:
      h = history[p]
      interval = datetime.timedelta(seconds = h['interval'])
      this_ts = parse_ts(h['last'])

      if (len(h['values']) != h['count']):
        logging.warning('Inconsistent value count in %s document for %s'
                        %(p, which))
      for v in reversed(h['values']):
        if (this_ts <= newest):
          agt1 = now - this_ts
          agt2 = interval
          agetmp1 = (agt1.microseconds + (agt1.seconds + agt1.days * 24 * 3600)
                     * 10**6) / 10**6
          agetmp2 = (agt2.microseconds + (agt2.seconds + agt2.days * 24 * 3600)
                     * 10**6) / 10**6
          generic_history.append(
            { 'age': agetmp1,
              'length': agetmp2,
              'value': v
            })
          newest = this_ts
        this_ts -= interval

      if (this_ts + interval != parse_ts(h['first'])):
        logging.warning('Inconsistent time information in %s document for %s'
                        %(p, which))

    #print json.dumps(generic_history, sort_keys=True,
    #                 indent=4, separators=(',', ': '))
    return generic_history

  @staticmethod
  def _avg_generic_history(generic_history):
    a = []
    for i in generic_history:
      if i['age'] > (ADDRESS_AND_PORT_STABLE_DAYS * 24 * 3600):
        continue
      if (i['length'] is not None
          and i['age'] is not None
          and i['value'] is not None):
        w = i['length'] * math.pow(AGE_ALPHA, i['age']/(3600*24))
        a.append( (i['value'] * w, w) )

    sv = math.fsum(map(lambda x: x[0], a))
    sw = math.fsum(map(lambda x: x[1], a))

    if sw == 0.0:
      svw = 0.0
    else:
      svw = sv/sw
    return svw
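
  # In other words, this computes a weighted average of the history values,
  # where each entry's weight is its interval length scaled by
  # AGE_ALPHA ** (age in days). For example, with AGE_ALPHA = 0.99, a
  # 3600-second entry that is 30 days old has weight 3600 * 0.99**30,
  # about 74% of the weight of an equally long entry from right now.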

  # unused debugging code: print the sorted periods in a history document
  def _add_generic_history(self, history):
    periods = history.keys()
    periods.sort(key = lambda x: history[x]['interval'] )

    print periods

  def add_running_history(self, history):
    pass

  def add_uptime(self, uptime):
    logging.debug('Adding uptime for %s.'%(self._fpr,))

    # flags we care about: Running, V2Dir, Guard
    if not 'flags' in uptime:
      logging.debug('No flags in document for %s.'%(self._fpr,))
      return

    for f in ['Running', 'Guard', 'V2Dir']:
      if not f in uptime['flags']:
        logging.debug('No %s in flags for %s.'%(f, self._fpr,))
        return

    running = self._extract_generic_history(uptime['flags']['Running'],
                                            '%s-Running'%(self._fpr))
    guard = self._extract_generic_history(uptime['flags']['Guard'],
                                          '%s-Guard'%(self._fpr))
    v2dir = self._extract_generic_history(uptime['flags']['V2Dir'],
                                          '%s-V2Dir'%(self._fpr))
    if 'BadExit' in uptime['flags']:
      badexit = self._extract_generic_history(uptime['flags']['BadExit'],
                                              '%s-BadExit'%(self._fpr))

    self._running = self._avg_generic_history(running) / ONIONOO_SCALE_ONE
    self._guard = self._avg_generic_history(guard) / ONIONOO_SCALE_ONE
    self._v2dir = self._avg_generic_history(v2dir) / ONIONOO_SCALE_ONE
    self._badexit = None
    if 'BadExit' in uptime['flags']:
      self._badexit = self._avg_generic_history(badexit) / ONIONOO_SCALE_ONE

  def is_candidate(self):
    must_be_running_now = (PERFORM_IPV4_DIRPORT_CHECKS
                           or PERFORM_IPV6_DIRPORT_CHECKS)
    if (must_be_running_now and not self.is_running()):
      logging.info('%s not a candidate: not running now, unable to check ' +
                   'DirPort consensus download', self._fpr)
      return False
    if (self._data['last_changed_address_or_port'] >
        self.CUTOFF_ADDRESS_AND_PORT_STABLE):
      logging.info('%s not a candidate: changed address/port recently (%s)',
                   self._fpr, self._data['last_changed_address_or_port'])
      return False
    if self._running < CUTOFF_RUNNING:
      logging.info('%s not a candidate: running avg too low (%lf)',
                   self._fpr, self._running)
      return False
    if self._v2dir < CUTOFF_V2DIR:
      logging.info('%s not a candidate: v2dir avg too low (%lf)',
                   self._fpr, self._v2dir)
      return False
    if self._badexit is not None and self._badexit > PERMITTED_BADEXIT:
      logging.info('%s not a candidate: badexit avg too high (%lf)',
                   self._fpr, self._badexit)
      return False
    # if the relay doesn't report a version, also exclude the relay
    if (not self._data.has_key('recommended_version')
        or not self._data['recommended_version']):
      logging.info('%s not a candidate: version not recommended', self._fpr)
      return False
    if self._guard < CUTOFF_GUARD:
      logging.info('%s not a candidate: guard avg too low (%lf)',
                   self._fpr, self._guard)
      return False
    if (not self._data.has_key('consensus_weight')
        or self._data['consensus_weight'] < 1):
      logging.info('%s not a candidate: consensus weight invalid', self._fpr)
      return False
    return True

  def is_in_whitelist(self, relaylist):
    """ A fallback matches if each key in the whitelist line matches:
          ipv4
          dirport
          orport
          id
          ipv6 address and port (if present)
        If the fallback has an ipv6 key, the whitelist line must also have
        it, and vice versa, otherwise they don't match. """
    ipv6 = None
    if self.has_ipv6():
      ipv6 = '%s:%d'%(self.ipv6addr, self.ipv6orport)
    for entry in relaylist:
      if entry['id'] != self._fpr:
        # can't log here unless we match an IP and port, because every relay's
        # fingerprint is compared to every entry's fingerprint
        if entry['ipv4'] == self.dirip and int(entry['orport']) == self.orport:
          logging.warning('%s excluded: has OR %s:%d changed fingerprint to ' +
                          '%s?', entry['id'], self.dirip, self.orport,
                          self._fpr)
        if self.has_ipv6() and entry.has_key('ipv6') and entry['ipv6'] == ipv6:
          logging.warning('%s excluded: has OR %s changed fingerprint to ' +
                          '%s?', entry['id'], ipv6, self._fpr)
        continue
      if entry['ipv4'] != self.dirip:
        logging.warning('%s excluded: has it changed IPv4 from %s to %s?',
                        self._fpr, entry['ipv4'], self.dirip)
        continue
      if int(entry['dirport']) != self.dirport:
        logging.warning('%s excluded: has it changed DirPort from %s:%d to ' +
                        '%s:%d?', self._fpr, self.dirip, int(entry['dirport']),
                        self.dirip, self.dirport)
        continue
      if int(entry['orport']) != self.orport:
        logging.warning('%s excluded: has it changed ORPort from %s:%d to ' +
                        '%s:%d?', self._fpr, self.dirip, int(entry['orport']),
                        self.dirip, self.orport)
        continue
      if entry.has_key('ipv6') and self.has_ipv6():
        # if both entry and fallback have an ipv6 address, compare them
        if entry['ipv6'] != ipv6:
          logging.warning('%s excluded: has it changed IPv6 ORPort from %s ' +
                          'to %s?', self._fpr, entry['ipv6'], ipv6)
          continue
      # if the fallback has an IPv6 address but the whitelist entry
      # doesn't, or vice versa, the whitelist entry doesn't match
      elif entry.has_key('ipv6') and not self.has_ipv6():
        logging.warning('%s excluded: has it lost its former IPv6 address %s?',
                        self._fpr, entry['ipv6'])
        continue
      elif not entry.has_key('ipv6') and self.has_ipv6():
        logging.warning('%s excluded: has it gained an IPv6 address %s?',
                        self._fpr, ipv6)
        continue
      return True
    return False

  def is_in_blacklist(self, relaylist):
    """ A fallback matches a blacklist line if a sufficiently specific group
        of attributes matches:
          ipv4 & dirport
          ipv4 & orport
          id
          ipv6 & dirport
          ipv6 & ipv6 orport
        If the fallback and the blacklist line both have an ipv6 key,
        their values will be compared, otherwise, they will be ignored.
        If there is no dirport and no orport, the entry matches all relays on
        that ip. """
    for entry in relaylist:
      for key in entry:
        value = entry[key]
        if key == 'id' and value == self._fpr:
          logging.info('%s is in the blacklist: fingerprint matches',
                       self._fpr)
          return True
        if key == 'ipv4' and value == self.dirip:
          # if the dirport is present, check it too
          if entry.has_key('dirport'):
            if int(entry['dirport']) == self.dirport:
              logging.info('%s is in the blacklist: IPv4 (%s) and ' +
                           'DirPort (%d) match', self._fpr, self.dirip,
                           self.dirport)
              return True
          # if the orport is present, check it too
          elif entry.has_key('orport'):
            if int(entry['orport']) == self.orport:
              logging.info('%s is in the blacklist: IPv4 (%s) and ' +
                           'ORPort (%d) match', self._fpr, self.dirip,
                           self.orport)
              return True
          else:
            logging.info('%s is in the blacklist: IPv4 (%s) matches, and ' +
                         'entry has no DirPort or ORPort', self._fpr,
                         self.dirip)
            return True
        ipv6 = None
        if self.has_ipv6():
          ipv6 = '%s:%d'%(self.ipv6addr, self.ipv6orport)
        if (key == 'ipv6' and self.has_ipv6()):
          # if both entry and fallback have an ipv6 address, compare them,
          # otherwise, disregard ipv6 addresses
          if value == ipv6:
            # if the dirport is present, check it too
            if entry.has_key('dirport'):
              if int(entry['dirport']) == self.dirport:
                logging.info('%s is in the blacklist: IPv6 (%s) and ' +
                             'DirPort (%d) match', self._fpr, ipv6,
                             self.dirport)
                return True
            # we've already checked the ORPort, it's part of entry['ipv6']
            else:
              logging.info('%s is in the blacklist: IPv6 (%s) matches, and ' +
                           'entry has no DirPort', self._fpr, ipv6)
              return True
        elif (key == 'ipv6' or self.has_ipv6()):
          # only log if the fingerprint matches but the IPv6 doesn't
          if entry.has_key('id') and entry['id'] == self._fpr:
            logging.info('%s skipping IPv6 blacklist comparison: relay ' +
                         'has%s IPv6%s, but entry has%s IPv6%s', self._fpr,
                         '' if self.has_ipv6() else ' no',
                         (' (' + ipv6 + ')') if self.has_ipv6() else '',
                         '' if key == 'ipv6' else ' no',
                         (' (' + value + ')') if key == 'ipv6' else '')
            logging.warning('Has %s %s IPv6 address %s?', self._fpr,
                            'gained an' if self.has_ipv6()
                            else 'lost its former',
                            ipv6 if self.has_ipv6() else value)
    return False

  def cw_to_bw_factor(self):
    # any relays with a missing or zero consensus weight are not candidates
    # any relays with a missing advertised bandwidth have it set to zero
    return self._data['advertised_bandwidth'] / self._data['consensus_weight']

  # since advertised_bandwidth is reported by the relay, it can be gamed
  # to avoid this, use the median consensus weight to bandwidth factor to
  # estimate this relay's measured bandwidth, and make that the upper limit
  def measured_bandwidth(self, median_cw_to_bw_factor):
    cw_to_bw = median_cw_to_bw_factor
    # Reduce exit bandwidth to make sure we're not overloading them
    if self.is_exit():
      cw_to_bw *= EXIT_BANDWIDTH_FRACTION
    measured_bandwidth = self._data['consensus_weight'] * cw_to_bw
    if self._data['advertised_bandwidth'] != 0:
      # limit advertised bandwidth (if available) to measured bandwidth
      return min(measured_bandwidth, self._data['advertised_bandwidth'])
    else:
      return measured_bandwidth

  def set_measured_bandwidth(self, median_cw_to_bw_factor):
    self._data['measured_bandwidth'] = self.measured_bandwidth(
        median_cw_to_bw_factor)

  def is_exit(self):
    return 'Exit' in self._data['flags']

  def is_guard(self):
    return 'Guard' in self._data['flags']

  def is_running(self):
    return 'Running' in self._data['flags']

  # does this fallback have an IPv6 address and orport?
  def has_ipv6(self):
    return self.ipv6addr is not None and self.ipv6orport is not None

  # strip leading and trailing brackets from an IPv6 address
  # safe to use on non-bracketed IPv6 and on IPv4 addresses
  # also convert to unicode, and make None appear as ''
  @staticmethod
  def strip_ipv6_brackets(ip):
    if ip is None:
      return unicode('')
    if len(ip) < 2:
      return unicode(ip)
    if ip[0] == '[' and ip[-1] == ']':
      return unicode(ip[1:-1])
    return unicode(ip)
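
  # For example:
  #   strip_ipv6_brackets('[2001:db8::1]') returns u'2001:db8::1'
  #   strip_ipv6_brackets('1.2.3.4')       returns u'1.2.3.4' (unchanged)
  #   strip_ipv6_brackets(None)            returns u''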

  # are ip_a and ip_b in the same netblock?
  # mask_bits is the size of the netblock
  # takes both IPv4 and IPv6 addresses
  # the versions of ip_a and ip_b must be the same
  # the mask must be valid for the IP version
  @staticmethod
  def netblocks_equal(ip_a, ip_b, mask_bits):
    if ip_a is None or ip_b is None:
      return False
    ip_a = Candidate.strip_ipv6_brackets(ip_a)
    ip_b = Candidate.strip_ipv6_brackets(ip_b)
    a = ipaddress.ip_address(ip_a)
    b = ipaddress.ip_address(ip_b)
    if a.version != b.version:
      raise Exception('Mismatching IP versions in %s and %s'%(ip_a, ip_b))
    if mask_bits > a.max_prefixlen:
      logging.error('Bad IP mask %d for %s and %s'%(mask_bits, ip_a, ip_b))
      mask_bits = a.max_prefixlen
    if mask_bits < 0:
      logging.error('Bad IP mask %d for %s and %s'%(mask_bits, ip_a, ip_b))
      mask_bits = 0
    a_net = ipaddress.ip_network('%s/%d'%(ip_a, mask_bits), strict=False)
    return b in a_net
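
  # For example, netblocks_equal('1.2.3.4', '1.2.255.255', 16) returns True,
  # because both addresses are in 1.2.0.0/16, while a /24 mask returns False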

  # is this fallback's IPv4 address (dirip) in the same netblock as other's
  # IPv4 address?
  # mask_bits is the size of the netblock
  def ipv4_netblocks_equal(self, other, mask_bits):
    return Candidate.netblocks_equal(self.dirip, other.dirip, mask_bits)

  # is this fallback's IPv6 address (ipv6addr) in the same netblock as
  # other's IPv6 address?
  # Returns False if either fallback has no IPv6 address
  # mask_bits is the size of the netblock
  def ipv6_netblocks_equal(self, other, mask_bits):
    if not self.has_ipv6() or not other.has_ipv6():
      return False
    return Candidate.netblocks_equal(self.ipv6addr, other.ipv6addr, mask_bits)

  # is this fallback's IPv4 DirPort the same as other's IPv4 DirPort?
  def dirport_equal(self, other):
    return self.dirport == other.dirport

  # is this fallback's IPv4 ORPort the same as other's IPv4 ORPort?
  def ipv4_orport_equal(self, other):
    return self.orport == other.orport

  # is this fallback's IPv6 ORPort the same as other's IPv6 ORPort?
  # Returns False if either fallback has no IPv6 address
  def ipv6_orport_equal(self, other):
    if not self.has_ipv6() or not other.has_ipv6():
      return False
    return self.ipv6orport == other.ipv6orport

  # does this fallback have the same DirPort, IPv4 ORPort, or
  # IPv6 ORPort as other?
  # Ignores IPv6 ORPort if either fallback has no IPv6 address
  def port_equal(self, other):
    return (self.dirport_equal(other) or self.ipv4_orport_equal(other)
            or self.ipv6_orport_equal(other))

  # return a list containing IPv4 ORPort, DirPort, and IPv6 ORPort (if present)
  def port_list(self):
    ports = [self.dirport, self.orport]
    if self.has_ipv6() and not self.ipv6orport in ports:
      ports.append(self.ipv6orport)
    return ports

  # does this fallback share a port with other, regardless of whether the
  # port types match?
  # For example, if self's IPv4 ORPort is 80 and other's DirPort is 80,
  # return True
  def port_shared(self, other):
    for p in self.port_list():
      if p in other.port_list():
        return True
    return False

  # report how long it takes to download a consensus from dirip:dirport
  @staticmethod
  def fallback_consensus_download_speed(dirip, dirport, nickname, max_time):
    download_failed = False
    downloader = DescriptorDownloader()
    start = datetime.datetime.utcnow()
    # some directory mirrors respond to requests in ways that hang python
    # sockets, which is why we log this line here
    logging.info('Initiating consensus download from %s (%s:%d).', nickname,
                 dirip, dirport)
    # there appears to be about 1 second of overhead when comparing stem's
    # internal trace time and the elapsed time calculated here
    TIMEOUT_SLOP = 1.0
    try:
      downloader.get_consensus(endpoints = [(dirip, dirport)],
                               timeout = (max_time + TIMEOUT_SLOP),
                               validate = True,
                               retries = 0,
                               fall_back_to_authority = False).run()
    except Exception, stem_error:
      logging.info('Unable to retrieve a consensus from %s: %s', nickname,
                   stem_error)
      status = 'error: "%s"' % (stem_error)
      level = logging.WARNING
      download_failed = True
    elapsed = (datetime.datetime.utcnow() - start).total_seconds()
    if elapsed > max_time:
      status = 'too slow'
      level = logging.WARNING
      download_failed = True
    elif not download_failed:
      # only report 'ok' if the download actually succeeded,
      # otherwise keep the error status set above
      status = 'ok'
      level = logging.DEBUG
    logging.log(level, 'Consensus download: %0.1fs %s from %s (%s:%d), ' +
                'max download time %0.1fs.', elapsed, status, nickname,
                dirip, dirport, max_time)
    return download_failed

  # does this fallback download the consensus fast enough?
  def check_fallback_download_consensus(self):
    # include the relay if we're not doing a check, or we can't check (IPv6)
    ipv4_failed = False
    ipv6_failed = False
    if PERFORM_IPV4_DIRPORT_CHECKS:
      ipv4_failed = Candidate.fallback_consensus_download_speed(self.dirip,
                                                self.dirport,
                                                self._data['nickname'],
                                                CONSENSUS_DOWNLOAD_SPEED_MAX)
    if self.has_ipv6() and PERFORM_IPV6_DIRPORT_CHECKS:
      # Clients assume the IPv6 DirPort is the same as the IPv4 DirPort
      ipv6_failed = Candidate.fallback_consensus_download_speed(self.ipv6addr,
                                                self.dirport,
                                                self._data['nickname'],
                                                CONSENSUS_DOWNLOAD_SPEED_MAX)
    return ((not ipv4_failed) and (not ipv6_failed))

  # if this fallback has not passed a download check, try it again,
  # and record the result, available in get_fallback_download_consensus
  def try_fallback_download_consensus(self):
    if not self.get_fallback_download_consensus():
      self._data['download_check'] = self.check_fallback_download_consensus()

  # did this fallback pass the download check?
  def get_fallback_download_consensus(self):
    # if we're not performing checks, return True
    if not PERFORM_IPV4_DIRPORT_CHECKS and not PERFORM_IPV6_DIRPORT_CHECKS:
      return True
    # if we are performing checks, but haven't done one, return False
    if not self._data.has_key('download_check'):
      return False
    return self._data['download_check']

  # output an optional header comment and info for this fallback
  # try_fallback_download_consensus before calling this
  def fallbackdir_line(self, fallbacks, prefilter_fallbacks):
    s = ''
    if OUTPUT_COMMENTS:
      s += self.fallbackdir_comment(fallbacks, prefilter_fallbacks)
    # if the download speed is ok, output a C string
    # if it's not, but we OUTPUT_COMMENTS, output a commented-out C string
    if self.get_fallback_download_consensus() or OUTPUT_COMMENTS:
      s += self.fallbackdir_info(self.get_fallback_download_consensus())
    return s

  # output a header comment for this fallback
  def fallbackdir_comment(self, fallbacks, prefilter_fallbacks):
    # /*
    # nickname
    # flags
    # [contact]
    # [identical contact counts]
    # */
    # Multiline C comment
    s = '/*'
    s += '\n'
    s += cleanse_c_multiline_comment(self._data['nickname'])
    s += '\n'
    s += 'Flags: '
    s += cleanse_c_multiline_comment(' '.join(sorted(self._data['flags'])))
    s += '\n'
    if self._data['contact'] is not None:
      s += cleanse_c_multiline_comment(self._data['contact'])
      if CONTACT_COUNT or CONTACT_BLACKLIST_COUNT:
        fallback_count = len([f for f in fallbacks
                              if f._data['contact'] == self._data['contact']])
        if fallback_count > 1:
          s += '\n'
          s += '%d identical contacts listed' % (fallback_count)
        if CONTACT_BLACKLIST_COUNT:
          prefilter_count = len([f for f in prefilter_fallbacks
                             if f._data['contact'] == self._data['contact']])
          filter_count = prefilter_count - fallback_count
          if filter_count > 0:
            if fallback_count > 1:
              s += ' '
            else:
              s += '\n'
            s += '%d blacklisted' % (filter_count)
      s += '\n'
    s += '*/'
    s += '\n'
    return s

  # output the fallback info C string for this fallback
  # this is the text that would go after FallbackDir in a torrc
  # if this relay failed the download test and we OUTPUT_COMMENTS,
  # comment-out the returned string
  def fallbackdir_info(self, dl_speed_ok):
    # "address:dirport orport=port id=fingerprint"
    # "[ipv6=addr:orport]"
    # "weight=FALLBACK_OUTPUT_WEIGHT",

    # Do we want a C string, or a commented-out string?
    c_string = dl_speed_ok
    comment_string = not dl_speed_ok and OUTPUT_COMMENTS
    # If we don't want either kind of string, bail
    if not c_string and not comment_string:
      return ''
    s = ''
    # Comment out the fallback directory entry if it's too slow
    # See the debug output for which address and port is failing
    if comment_string:
      s += '/* Consensus download failed or was too slow:\n'
    # Multi-Line C string with trailing comma (part of a string list)
    # This makes it easier to diff the file, and remove IPv6 lines using grep
    # Integers don't need escaping
    s += '"%s orport=%d id=%s"'%(
            cleanse_c_string(self._data['dir_address']),
            self.orport,
            cleanse_c_string(self._fpr))
    s += '\n'
    if self.has_ipv6():
      s += '" ipv6=%s:%d"'%(cleanse_c_string(self.ipv6addr), self.ipv6orport)
      s += '\n'
    s += '" weight=%d",'%(FALLBACK_OUTPUT_WEIGHT)
    if comment_string:
      s += '\n'
      s += '*/'
    return s
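
  # For a relay with an IPv6 ORPort, the output looks like this
  # (illustrative values):
  #   "1.2.3.4:80 orport=443 id=0123456789ABCDEF0123456789ABCDEF01234567"
  #   " ipv6=[2001:db8::1]:443"
  #   " weight=10",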

## Fallback Candidate List Class

class CandidateList(dict):
  def __init__(self):
    pass

  def _add_relay(self, details):
    if not 'dir_address' in details: return
    c = Candidate(details)
    self[ c.get_fingerprint() ] = c

  def _add_uptime(self, uptime):
    try:
      fpr = uptime['fingerprint']
    except KeyError:
      raise Exception("Document has no fingerprint field.")

    try:
      c = self[fpr]
    except KeyError:
      logging.debug('Got unknown relay %s in uptime document.'%(fpr,))
      return

    c.add_uptime(uptime)

  def _add_details(self):
    logging.debug('Loading details document.')
    d = fetch('details',
        fields=('fingerprint,nickname,contact,last_changed_address_or_port,' +
                'consensus_weight,advertised_bandwidth,or_addresses,' +
                'dir_address,recommended_version,flags,effective_family'))
    logging.debug('Loading details document done.')

    if not 'relays' in d: raise Exception("No relays found in document.")

    for r in d['relays']: self._add_relay(r)

  def _add_uptimes(self):
    logging.debug('Loading uptime document.')
    d = fetch('uptime')
    logging.debug('Loading uptime document done.')

    if not 'relays' in d: raise Exception("No relays found in document.")

    for r in d['relays']: self._add_uptime(r)

  def add_relays(self):
    self._add_details()
    self._add_uptimes()

  def count_guards(self):
    guard_count = 0
    for fpr in self.keys():
      if self[fpr].is_guard():
        guard_count += 1
    return guard_count

  # Find fallbacks that fit the uptime, stability, and flags criteria,
  # and make an array of them in self.fallbacks
  def compute_fallbacks(self):
    self.fallbacks = map(lambda x: self[x],
                         filter(lambda x: self[x].is_candidate(),
                                self.keys()))

  # sort fallbacks by their consensus weight to advertised bandwidth factor,
  # lowest to highest
  # used to find the median cw_to_bw_factor()
  def sort_fallbacks_by_cw_to_bw_factor(self):
    self.fallbacks.sort(key=lambda f: f.cw_to_bw_factor())

  # sort fallbacks by their measured bandwidth, highest to lowest
  # calculate_measured_bandwidth before calling this
  # this is useful for reviewing candidates in priority order
  def sort_fallbacks_by_measured_bandwidth(self):
    self.fallbacks.sort(key=lambda f: f._data['measured_bandwidth'],
                        reverse=True)

  # sort fallbacks by their fingerprint, lowest to highest
  # this is useful for stable diffs of fallback lists
  def sort_fallbacks_by_fingerprint(self):
    self.fallbacks.sort(key=lambda f: f._fpr)

  @staticmethod
  def load_relaylist(file_name):
    """ Read each line in the file, and parse it like a FallbackDir line:
        an IPv4 address and optional port:
          <IPv4 address>:<port>
        which are parsed into dictionary entries:
          ipv4=<IPv4 address>
          dirport=<port>
        followed by a series of key=value entries:
          orport=<port>
          id=<fingerprint>
          ipv6=<IPv6 address>:<IPv6 orport>
        each line's key/value pairs are placed in a dictionary
        (of string -> string key/value pairs),
        and these dictionaries are placed in an array.
        comments start with # and are ignored """
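    # For example (illustrative values), the line:
    #   1.2.3.4:80 orport=443 id=0123456789ABCDEF0123456789ABCDEF01234567
    # is parsed into:
    #   {'ipv4': '1.2.3.4', 'dirport': '80', 'orport': '443',
    #    'id': '0123456789ABCDEF0123456789ABCDEF01234567'}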
    relaylist = []
    file_data = read_from_file(file_name, MAX_LIST_FILE_SIZE)
    if file_data is None:
      return relaylist
    for line in file_data.split('\n'):
      relay_entry = {}
      # ignore comments
      line_comment_split = line.split('#')
      line = line_comment_split[0]
      # cleanup whitespace
      line = cleanse_whitespace(line)
      line = line.strip()
      if len(line) == 0:
        continue
      for item in line.split(' '):
        item = item.strip()
        if len(item) == 0:
          continue
        key_value_split = item.split('=')
        kvl = len(key_value_split)
        if kvl < 1 or kvl > 2:
          print '#error Bad %s item: %s, format is key=value.'%(
                                                          file_name, item)
        if kvl == 1:
          # assume that entries without a key are the ipv4 address,
          # perhaps with a dirport
          ipv4_maybe_dirport = key_value_split[0]
          ipv4_maybe_dirport_split = ipv4_maybe_dirport.split(':')
          dirl = len(ipv4_maybe_dirport_split)
          if dirl < 1 or dirl > 2:
            print '#error Bad %s IPv4 item: %s, format is ipv4:port.'%(
                                                          file_name, item)
          if dirl >= 1:
            relay_entry['ipv4'] = ipv4_maybe_dirport_split[0]
          if dirl == 2:
            relay_entry['dirport'] = ipv4_maybe_dirport_split[1]
        elif kvl == 2:
          relay_entry[key_value_split[0]] = key_value_split[1]
      relaylist.append(relay_entry)
    return relaylist

  # apply the fallback whitelist and blacklist
  def apply_filter_lists(self):
    excluded_count = 0
    logging.debug('Applying whitelist and blacklist.')
    # parse the whitelist and blacklist
    whitelist = self.load_relaylist(WHITELIST_FILE_NAME)
    blacklist = self.load_relaylist(BLACKLIST_FILE_NAME)
    filtered_fallbacks = []
    for f in self.fallbacks:
      in_whitelist = f.is_in_whitelist(whitelist)
      in_blacklist = f.is_in_blacklist(blacklist)
      if in_whitelist and in_blacklist:
        if BLACKLIST_EXCLUDES_WHITELIST_ENTRIES:
          # exclude
          excluded_count += 1
          logging.warning('Excluding %s: in both blacklist and whitelist.',
                          f._fpr)
        else:
          # include
          filtered_fallbacks.append(f)
      elif in_whitelist:
        # include
        filtered_fallbacks.append(f)
      elif in_blacklist:
        # exclude
        excluded_count += 1
        logging.info('Excluding %s: in blacklist.', f._fpr)
      else:
        if INCLUDE_UNLISTED_ENTRIES:
          # include
          filtered_fallbacks.append(f)
        else:
          # exclude
          excluded_count += 1
          logging.info('Excluding %s: in neither blacklist nor whitelist.',
                       f._fpr)
    self.fallbacks = filtered_fallbacks
    return excluded_count

  @staticmethod
  def summarise_filters(initial_count, excluded_count):
    return '/* Whitelist & blacklist excluded %d of %d candidates. */'%(
              excluded_count, initial_count)

  # calculate each fallback's measured bandwidth based on the median
  # consensus weight to advertised bandwidth ratio
  def calculate_measured_bandwidth(self):
    self.sort_fallbacks_by_cw_to_bw_factor()
    median_fallback = self.fallback_median(True)
    if median_fallback is not None:
      median_cw_to_bw_factor = median_fallback.cw_to_bw_factor()
    else:
      # this will never be used, because there are no fallbacks
      median_cw_to_bw_factor = None
    for f in self.fallbacks:
      f.set_measured_bandwidth(median_cw_to_bw_factor)

  # remove relays with low measured bandwidth from the fallback list
  # calculate_measured_bandwidth for each relay before calling this
  def remove_low_bandwidth_relays(self):
    if MIN_BANDWIDTH is None:
      return
    above_min_bw_fallbacks = []
    for f in self.fallbacks:
      if f._data['measured_bandwidth'] >= MIN_BANDWIDTH:
        above_min_bw_fallbacks.append(f)
      else:
        # the bandwidth we log here is limited by the relay's consensus weight
        # as well as its advertised bandwidth. See set_measured_bandwidth
        # for details
        logging.info('%s not a candidate: bandwidth %.1fMB/s too low, must ' +
                     'be at least %.1fMB/s', f._fpr,
                     f._data['measured_bandwidth']/(1024.0*1024.0),
                     MIN_BANDWIDTH/(1024.0*1024.0))
    self.fallbacks = above_min_bw_fallbacks

  # the minimum fallback in the list
  # call one of the sort_fallbacks_* functions before calling this
  def fallback_min(self):
    if len(self.fallbacks) > 0:
      return self.fallbacks[-1]
    else:
      return None

  # the median fallback in the list
  # call one of the sort_fallbacks_* functions before calling this
  def fallback_median(self, require_advertised_bandwidth):
    # use the low-median when there are an even number of fallbacks,
    # for consistency with the bandwidth authorities
    if len(self.fallbacks) > 0:
      median_position = (len(self.fallbacks) - 1) / 2
      if not require_advertised_bandwidth:
        return self.fallbacks[median_position]
      # if we need advertised_bandwidth but this relay doesn't have it,
      # move to a fallback with greater consensus weight until we find one
      while not self.fallbacks[median_position]._data['advertised_bandwidth']:
        median_position += 1
        if median_position >= len(self.fallbacks):
          return None
      return self.fallbacks[median_position]
    else:
      return None

  # the maximum fallback in the list
  # call one of the sort_fallbacks_* functions before calling this
  def fallback_max(self):
    if len(self.fallbacks) > 0:
      return self.fallbacks[0]
    else:
      return None

  # does exclusion_list contain attribute?
  # if so, return False
  # if not, return True
  # if attribute is None or the empty string, always return True
  @staticmethod
  def allow(attribute, exclusion_list):
    if attribute is None or attribute == '':
      return True
    elif attribute in exclusion_list:
      return False
    else:
      return True

  # make sure there is only one fallback per IPv4 address, and per IPv6 address
  # there is only one IPv4 address on each fallback: the IPv4 DirPort address
  # (we choose the IPv4 ORPort which is on the same IPv4 as the DirPort)
  # there is at most one IPv6 address on each fallback: the IPv6 ORPort address
  # we try to match the IPv4 ORPort, but will use any IPv6 address if needed
  # (clients assume the IPv6 DirPort is the same as the IPv4 DirPort, but
  # typically only use the IPv6 ORPort)
  # if there is no IPv6 address, only the IPv4 address is checked
  # return the number of candidates we excluded
  def limit_fallbacks_same_ip(self):
    ip_limit_fallbacks = []
    ip_list = []
    for f in self.fallbacks:
      if (CandidateList.allow(f.dirip, ip_list)
          and CandidateList.allow(f.ipv6addr, ip_list)):
        ip_limit_fallbacks.append(f)
        ip_list.append(f.dirip)
        if f.has_ipv6():
          ip_list.append(f.ipv6addr)
      elif not CandidateList.allow(f.dirip, ip_list):
        logging.info('Eliminated %s: already have fallback on IPv4 %s'%(
                                                        f._fpr, f.dirip))
      elif f.has_ipv6() and not CandidateList.allow(f.ipv6addr, ip_list):
        logging.info('Eliminated %s: already have fallback on IPv6 %s'%(
                                                        f._fpr, f.ipv6addr))
    original_count = len(self.fallbacks)
    self.fallbacks = ip_limit_fallbacks
    return original_count - len(self.fallbacks)

  # make sure there is only one fallback per ContactInfo
  # if there is no ContactInfo, allow the fallback
  # this check can be gamed by providing no ContactInfo, or by setting the
  # ContactInfo to match another fallback
  # However, given the likelihood that relays with the same ContactInfo will
  # go down at similar times, its usefulness outweighs the risk
  def limit_fallbacks_same_contact(self):
    contact_limit_fallbacks = []
    contact_list = []
    for f in self.fallbacks:
      if CandidateList.allow(f._data['contact'], contact_list):
        contact_limit_fallbacks.append(f)
        contact_list.append(f._data['contact'])
      else:
        logging.info(('Eliminated %s: already have fallback on ' +
                      'ContactInfo %s')%(f._fpr, f._data['contact']))
    original_count = len(self.fallbacks)
    self.fallbacks = contact_limit_fallbacks
    return original_count - len(self.fallbacks)
1535 # make sure there is only one fallback per effective family
1536 # if there is no family, allow the fallback
1537 # this check can't be gamed, because we use effective family, which ensures
1538 # mutual family declarations
1539 # if any indirect families exist, the result depends on the order in which
1540 # fallbacks are sorted in the list
1541 def limit_fallbacks_same_family(self):
1542 family_limit_fallbacks = []
1543 fingerprint_list = []
1544 for f in self.fallbacks:
1545 if CandidateList.allow(f._fpr, fingerprint_list):
1546 family_limit_fallbacks.append(f)
1547 fingerprint_list.append(f._fpr)
1548 fingerprint_list.extend(f._data['effective_family'])
1549 else:
1550 # technically, we already have a fallback that has this relay in its
1551 # effective family
1552 logging.info(('Eliminated %s: already have fallback in effective ' +
1553 'family')%(f._fpr))
1554 original_count = len(self.fallbacks)
1555 self.fallbacks = family_limit_fallbacks
1556 return original_count - len(self.fallbacks)
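# Illustrative example for limit_fallbacks_same_family() above
# (hypothetical fingerprints): if fallback A declares B in its effective
# family, keeping A adds both A's fingerprint and B's to
# fingerprint_list, so B is eliminated when it is reached later in the
# list.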
1558 # try a download check on each fallback candidate in order
1559 # stop after max_count successful downloads
1560 # but don't remove any candidates from the array
1561 def try_download_consensus_checks(self, max_count):
1562 dl_ok_count = 0
1563 for f in self.fallbacks:
1564 f.try_fallback_download_consensus()
1565 if f.get_fallback_download_consensus():
1566 # this fallback downloaded a consensus ok
1567 dl_ok_count += 1
1568 if dl_ok_count >= max_count:
1569 # we have enough fallbacks
1570 return
1572 # put max_count successful candidates in the fallbacks array:
1573 # - perform download checks on each fallback candidate
1574 # - retry failed candidates if CONSENSUS_DOWNLOAD_RETRY is set
1575 # - eliminate failed candidates
1576 # - if there are more than max_count candidates, eliminate lowest bandwidth
1577 # - if there are fewer than max_count candidates, keep only the successful ones
1578 # Return the number of fallbacks that failed the consensus check
1579 def perform_download_consensus_checks(self, max_count):
1580 self.sort_fallbacks_by_measured_bandwidth()
1581 self.try_download_consensus_checks(max_count)
1582 if CONSENSUS_DOWNLOAD_RETRY:
1583 # try unsuccessful candidates again
1584 # we could end up with more than max_count successful candidates here
1585 self.try_download_consensus_checks(max_count)
1586 # now we have at least max_count successful candidates,
1587 # or we've tried them all
1588 original_count = len(self.fallbacks)
1589 self.fallbacks = filter(lambda x: x.get_fallback_download_consensus(),
1590 self.fallbacks)
1591 # some of these failed the check; others skipped the check because
1592 # we already had enough successful downloads
1593 failed_count = original_count - len(self.fallbacks)
1594 self.fallbacks = self.fallbacks[:max_count]
1595 return failed_count
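# Illustrative walk-through of perform_download_consensus_checks() above
# (hypothetical counts): with 10 candidates and max_count = 5, the first
# pass stops as soon as 5 downloads succeed; if CONSENSUS_DOWNLOAD_RETRY
# is set, unsuccessful candidates are tried again; candidates that failed
# or were never checked are then filtered out, and the surviving list is
# truncated to at most 5 entries.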
1597 # return a string that describes a/b as a percentage
1598 @staticmethod
1599 def describe_percentage(a, b):
1600 if b != 0:
1601 return '%d/%d = %.0f%%'%(a, b, (a*100.0)/b)
1602 else:
1603 # technically, 0/0 is undefined, but 0.0% is a sensible result
1604 return '%d/%d = %.0f%%'%(a, b, 0.0)
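# Illustrative outputs of describe_percentage() above:
#   describe_percentage(1, 4) -> '1/4 = 25%'
#   describe_percentage(0, 0) -> '0/0 = 0%'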
1606 # return a dictionary of lists of fallbacks by IPv4 netblock
1607 # the dictionary is keyed by the fingerprint of an arbitrary fallback
1608 # in each netblock
1609 # mask_bits is the size of the netblock
1610 def fallbacks_by_ipv4_netblock(self, mask_bits):
1611 netblocks = {}
1612 for f in self.fallbacks:
1613 found_netblock = False
1614 for b in netblocks.keys():
1615 # check whether an existing netblock contains this fallback
1616 if f.ipv4_netblocks_equal(self[b], mask_bits):
1617 # add it to the list
1618 netblocks[b].append(f)
1619 found_netblock = True
1620 break
1621 # make a new netblock based on this fallback's fingerprint
1622 if not found_netblock:
1623 netblocks[f._fpr] = [f]
1624 return netblocks
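# Illustrative shape of the dictionary returned by
# fallbacks_by_ipv4_netblock() above (hypothetical fingerprints):
#   { 'FPR_A': [fallback_a, fallback_b],  # a and b share a netblock
#     'FPR_C': [fallback_c] }             # c is alone in its netblock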
1626 # return a dictionary of lists of fallbacks by IPv6 netblock
1627 # where mask_bits is the size of the netblock
1628 def fallbacks_by_ipv6_netblock(self, mask_bits):
1629 netblocks = {}
1630 for f in self.fallbacks:
1631 # skip fallbacks without IPv6 addresses
1632 if not f.has_ipv6():
1633 continue
1634 found_netblock = False
1635 for b in netblocks.keys():
1636 # check whether an existing netblock contains this fallback
1637 if f.ipv6_netblocks_equal(self[b], mask_bits):
1638 # add it to the list
1639 netblocks[b].append(f)
1640 found_netblock = True
1641 break
1642 # make a new netblock based on this fallback's fingerprint
1643 if not found_netblock:
1644 netblocks[f._fpr] = [f]
1645 return netblocks
1647 # log a message about the proportion of fallbacks in each IPv4 netblock,
1648 # where mask_bits is the size of the netblock
1649 def describe_fallback_ipv4_netblock_mask(self, mask_bits):
1650 fallback_count = len(self.fallbacks)
1651 shared_netblock_fallback_count = 0
1652 most_frequent_netblock = None
1653 netblocks = self.fallbacks_by_ipv4_netblock(mask_bits)
1654 for b in netblocks.keys():
1655 if len(netblocks[b]) > 1:
1656 # how many fallbacks are in a netblock with other fallbacks?
1657 shared_netblock_fallback_count += len(netblocks[b])
1658 # what's the netblock with the most fallbacks?
1659 if (most_frequent_netblock is None
1660 or len(netblocks[b]) > len(netblocks[most_frequent_netblock])):
1661 most_frequent_netblock = b
1662 logging.debug('Fallback IPv4 addresses in the same /%d:'%(mask_bits))
1663 for f in netblocks[b]:
1664 logging.debug('%s - %s', f.dirip, f._fpr)
1665 if most_frequent_netblock is not None:
1666 logging.warning('There are %s fallbacks in the IPv4 /%d containing %s'%(
1667 CandidateList.describe_percentage(
1668 len(netblocks[most_frequent_netblock]),
1669 fallback_count),
1670 mask_bits,
1671 self[most_frequent_netblock].dirip))
1672 if shared_netblock_fallback_count > 0:
1673 logging.warning(('%s of fallbacks are in an IPv4 /%d with other ' +
1674 'fallbacks')%(CandidateList.describe_percentage(
1675 shared_netblock_fallback_count,
1676 fallback_count),
1677 mask_bits))
1679 # log a message about the proportion of fallbacks in each IPv6 netblock,
1680 # where mask_bits is the size of the netblock
1681 def describe_fallback_ipv6_netblock_mask(self, mask_bits):
1682 fallback_count = len(self.fallbacks_with_ipv6())
1683 shared_netblock_fallback_count = 0
1684 most_frequent_netblock = None
1685 netblocks = self.fallbacks_by_ipv6_netblock(mask_bits)
1686 for b in netblocks.keys():
1687 if len(netblocks[b]) > 1:
1688 # how many fallbacks are in a netblock with other fallbacks?
1689 shared_netblock_fallback_count += len(netblocks[b])
1690 # what's the netblock with the most fallbacks?
1691 if (most_frequent_netblock is None
1692 or len(netblocks[b]) > len(netblocks[most_frequent_netblock])):
1693 most_frequent_netblock = b
1694 logging.debug('Fallback IPv6 addresses in the same /%d:'%(mask_bits))
1695 for f in netblocks[b]:
1696 logging.debug('%s - %s', f.ipv6addr, f._fpr)
1697 if most_frequent_netblock is not None:
1698 logging.warning('There are %s fallbacks in the IPv6 /%d containing %s'%(
1699 CandidateList.describe_percentage(
1700 len(netblocks[most_frequent_netblock]),
1701 fallback_count),
1702 mask_bits,
1703 self[most_frequent_netblock].ipv6addr))
1704 if shared_netblock_fallback_count > 0:
1705 logging.warning(('%s of fallbacks are in an IPv6 /%d with other ' +
1706 'fallbacks')%(CandidateList.describe_percentage(
1707 shared_netblock_fallback_count,
1708 fallback_count),
1709 mask_bits))
1711 # log a message about the proportion of fallbacks in each IPv4 /8, /16,
1712 # and /24
1713 def describe_fallback_ipv4_netblocks(self):
1714 # this doesn't actually tell us anything useful
1715 #self.describe_fallback_ipv4_netblock_mask(8)
1716 self.describe_fallback_ipv4_netblock_mask(16)
1717 self.describe_fallback_ipv4_netblock_mask(24)
1719 # log a message about the proportion of fallbacks in each IPv6 /12 (RIR),
1720 # /23 (smaller RIR blocks), /32 (LIR), /48 (Customer), and /64 (Host)
1721 # https://www.iana.org/assignments/ipv6-unicast-address-assignments/
1722 def describe_fallback_ipv6_netblocks(self):
1723 # these don't actually tell us anything useful
1724 #self.describe_fallback_ipv6_netblock_mask(12)
1725 #self.describe_fallback_ipv6_netblock_mask(23)
1726 self.describe_fallback_ipv6_netblock_mask(32)
1727 self.describe_fallback_ipv6_netblock_mask(48)
1728 self.describe_fallback_ipv6_netblock_mask(64)
1730 # log a message about the proportion of fallbacks in each IPv4 and IPv6
1731 # netblock
1732 def describe_fallback_netblocks(self):
1733 self.describe_fallback_ipv4_netblocks()
1734 self.describe_fallback_ipv6_netblocks()
1736 # return a list of fallbacks which are on the IPv4 ORPort port
1737 def fallbacks_on_ipv4_orport(self, port):
1738 return filter(lambda x: x.orport == port, self.fallbacks)
1740 # return a list of fallbacks which are on the IPv6 ORPort port
1741 def fallbacks_on_ipv6_orport(self, port):
1742 return filter(lambda x: x.ipv6orport == port, self.fallbacks_with_ipv6())
1744 # return a list of fallbacks which are on the DirPort port
1745 def fallbacks_on_dirport(self, port):
1746 return filter(lambda x: x.dirport == port, self.fallbacks)
1748 # log a message about the proportion of fallbacks on IPv4 ORPort port
1749 # and return that count
1750 def describe_fallback_ipv4_orport(self, port):
1751 port_count = len(self.fallbacks_on_ipv4_orport(port))
1752 fallback_count = len(self.fallbacks)
1753 logging.warning('%s of fallbacks are on IPv4 ORPort %d'%(
1754 CandidateList.describe_percentage(port_count,
1755 fallback_count),
1756 port))
1757 return port_count
1759 # log a message about the proportion of IPv6 fallbacks on IPv6 ORPort port
1760 # and return that count
1761 def describe_fallback_ipv6_orport(self, port):
1762 port_count = len(self.fallbacks_on_ipv6_orport(port))
1763 fallback_count = len(self.fallbacks_with_ipv6())
1764 logging.warning('%s of IPv6 fallbacks are on IPv6 ORPort %d'%(
1765 CandidateList.describe_percentage(port_count,
1766 fallback_count),
1767 port))
1768 return port_count
1770 # log a message about the proportion of fallbacks on DirPort port
1771 # and return that count
1772 def describe_fallback_dirport(self, port):
1773 port_count = len(self.fallbacks_on_dirport(port))
1774 fallback_count = len(self.fallbacks)
1775 logging.warning('%s of fallbacks are on DirPort %d'%(
1776 CandidateList.describe_percentage(port_count,
1777 fallback_count),
1778 port))
1779 return port_count
1781 # log a message about the proportion of fallbacks on each dirport,
1782 # each IPv4 orport, and each IPv6 orport
1783 def describe_fallback_ports(self):
1784 fallback_count = len(self.fallbacks)
1785 ipv4_or_count = fallback_count
1786 ipv4_or_count -= self.describe_fallback_ipv4_orport(443)
1787 ipv4_or_count -= self.describe_fallback_ipv4_orport(9001)
1788 logging.warning('%s of fallbacks are on other IPv4 ORPorts'%(
1789 CandidateList.describe_percentage(ipv4_or_count,
1790 fallback_count)))
1791 ipv6_fallback_count = len(self.fallbacks_with_ipv6())
1792 ipv6_or_count = ipv6_fallback_count
1793 ipv6_or_count -= self.describe_fallback_ipv6_orport(443)
1794 ipv6_or_count -= self.describe_fallback_ipv6_orport(9001)
1795 logging.warning('%s of IPv6 fallbacks are on other IPv6 ORPorts'%(
1796 CandidateList.describe_percentage(ipv6_or_count,
1797 ipv6_fallback_count)))
1798 dir_count = fallback_count
1799 dir_count -= self.describe_fallback_dirport(80)
1800 dir_count -= self.describe_fallback_dirport(9030)
1801 logging.warning('%s of fallbacks are on other DirPorts'%(
1802 CandidateList.describe_percentage(dir_count,
1803 fallback_count)))
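# Illustrative log output from describe_fallback_ports() above
# (hypothetical counts for a 100-fallback list):
#   60/100 = 60% of fallbacks are on IPv4 ORPort 443
#   30/100 = 30% of fallbacks are on IPv4 ORPort 9001
#   10/100 = 10% of fallbacks are on other IPv4 ORPorts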
1805 # return a list of fallbacks which have the Exit flag
1806 def fallbacks_with_exit(self):
1807 return filter(lambda x: x.is_exit(), self.fallbacks)
1809 # log a message about the proportion of fallbacks with an Exit flag
1810 def describe_fallback_exit_flag(self):
1811 exit_fallback_count = len(self.fallbacks_with_exit())
1812 fallback_count = len(self.fallbacks)
1813 logging.warning('%s of fallbacks have the Exit flag'%(
1814 CandidateList.describe_percentage(exit_fallback_count,
1815 fallback_count)))
1817 # return a list of fallbacks which have an IPv6 address
1818 def fallbacks_with_ipv6(self):
1819 return filter(lambda x: x.has_ipv6(), self.fallbacks)
1821 # log a message about the proportion of fallbacks on IPv6
1822 def describe_fallback_ip_family(self):
1823 ipv6_fallback_count = len(self.fallbacks_with_ipv6())
1824 fallback_count = len(self.fallbacks)
1825 logging.warning('%s of fallbacks are on IPv6'%(
1826 CandidateList.describe_percentage(ipv6_fallback_count,
1827 fallback_count)))
1829 def summarise_fallbacks(self, eligible_count, operator_count, failed_count,
1830 guard_count, target_count):
1831 s = ''
1832 s += '/* To comment-out entries in this file, use C comments, and add *'
1833 s += ' to the start of each line. (stem finds fallback entries using "'
1834 s += ' at the start of a line.) */'
1835 s += '\n'
1836 # Report:
1837 # whether we checked consensus download times
1838 # the number of fallback directories (and limits/exclusions, if relevant)
1839 # min & max fallback bandwidths
1840 # #error if below minimum count
1841 if PERFORM_IPV4_DIRPORT_CHECKS or PERFORM_IPV6_DIRPORT_CHECKS:
1842 s += '/* Checked %s%s%s DirPorts served a consensus within %.1fs. */'%(
1843 'IPv4' if PERFORM_IPV4_DIRPORT_CHECKS else '',
1844 ' and ' if (PERFORM_IPV4_DIRPORT_CHECKS
1845 and PERFORM_IPV6_DIRPORT_CHECKS) else '',
1846 'IPv6' if PERFORM_IPV6_DIRPORT_CHECKS else '',
1847 CONSENSUS_DOWNLOAD_SPEED_MAX)
1848 else:
1849 s += '/* Did not check IPv4 or IPv6 DirPort consensus downloads. */'
1850 s += '\n'
1851 # Multiline C comment with #error if things go bad
1852 s += '/*'
1853 s += '\n'
1854 # Integers don't need escaping in C comments
1855 fallback_count = len(self.fallbacks)
1856 if FALLBACK_PROPORTION_OF_GUARDS is None:
1857 fallback_proportion = ''
1858 else:
1859 fallback_proportion = ', Target %d (%d * %.2f)'%(target_count,
1860 guard_count,
1861 FALLBACK_PROPORTION_OF_GUARDS)
1862 s += 'Final Count: %d (Eligible %d%s'%(fallback_count, eligible_count,
1863 fallback_proportion)
1864 if MAX_FALLBACK_COUNT is not None:
1865 s += ', Max %d'%(MAX_FALLBACK_COUNT)
1866 s += ')\n'
1867 if eligible_count != fallback_count:
1868 removed_count = eligible_count - fallback_count
1869 excess_to_target_or_max = (eligible_count - operator_count - failed_count
1870 - fallback_count)
1871 # some candidates 'Failed' the check; others 'Skipped' it because
1872 # we already had enough successful downloads
1873 s += ('Excluded: %d (Same Operator %d, Failed/Skipped Download %d, ' +
1874 'Excess %d)')%(removed_count, operator_count, failed_count,
1875 excess_to_target_or_max)
1876 s += '\n'
1877 min_fb = self.fallback_min()
1878 min_bw = min_fb._data['measured_bandwidth']
1879 max_fb = self.fallback_max()
1880 max_bw = max_fb._data['measured_bandwidth']
1881 s += 'Bandwidth Range: %.1f - %.1f MB/s'%(min_bw/(1024.0*1024.0),
1882 max_bw/(1024.0*1024.0))
1883 s += '\n'
1884 s += '*/'
1885 if fallback_count < MIN_FALLBACK_COUNT:
1886 # We must have a minimum number of fallbacks so they are always
1887 # reachable, and are in diverse locations
1888 s += '\n'
1889 s += '#error Fallback Count %d is too low. '%(fallback_count)
1890 s += 'Must be at least %d for diversity. '%(MIN_FALLBACK_COUNT)
1891 s += 'Try adding entries to the whitelist, '
1892 s += 'or setting INCLUDE_UNLISTED_ENTRIES = True.'
1893 return s
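# Illustrative output of summarise_fallbacks() above (hypothetical
# counts, assuming FALLBACK_PROPORTION_OF_GUARDS = 0.20 and
# MAX_FALLBACK_COUNT = 500):
# /*
# Final Count: 100 (Eligible 160, Target 400 (2000 * 0.20), Max 500)
# Excluded: 60 (Same Operator 30, Failed/Skipped Download 20, Excess 10)
# Bandwidth Range: 1.0 - 100.0 MB/s
# */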
1895 ## Main Function
1897 def list_fallbacks():
1898 """ Fetches required onionoo documents and evaluates the
1899 fallback directory criteria for each of the relays """
1901 logging.warning('Downloading and parsing OnionOO data. ' +
1902 'This may take some time.')
1903 # find relays that could be fallbacks
1904 candidates = CandidateList()
1905 candidates.add_relays()
1907 # work out how many fallbacks we want
1908 guard_count = candidates.count_guards()
1909 if FALLBACK_PROPORTION_OF_GUARDS is None:
1910 target_count = guard_count
1911 else:
1912 target_count = int(guard_count * FALLBACK_PROPORTION_OF_GUARDS)
1913 # the maximum number of fallbacks is the least of:
1914 # - the target fallback count (FALLBACK_PROPORTION_OF_GUARDS * guard count)
1915 # - the maximum fallback count (MAX_FALLBACK_COUNT)
1916 if MAX_FALLBACK_COUNT is None:
1917 max_count = target_count
1918 else:
1919 max_count = min(target_count, MAX_FALLBACK_COUNT)
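# Illustrative example (hypothetical counts): with guard_count = 2000,
# FALLBACK_PROPORTION_OF_GUARDS = 0.1 and MAX_FALLBACK_COUNT = 500,
# target_count = int(2000 * 0.1) = 200 and max_count = min(200, 500) = 200.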
1921 candidates.compute_fallbacks()
1922 prefilter_fallbacks = copy.copy(candidates.fallbacks)
1924 # filter with the whitelist and blacklist
1925 # if a relay has changed IPv4 address or ports recently, it will be excluded
1926 # as ineligible before we call apply_filter_lists, and so there will be no
1927 # warning that the details have changed from those in the whitelist.
1928 # instead, there will be an info-level log during the eligibility check.
1929 initial_count = len(candidates.fallbacks)
1930 excluded_count = candidates.apply_filter_lists()
1931 print candidates.summarise_filters(initial_count, excluded_count)
1932 eligible_count = len(candidates.fallbacks)
1934 # calculate the measured bandwidth of each relay,
1935 # then remove low-bandwidth relays
1936 candidates.calculate_measured_bandwidth()
1937 candidates.remove_low_bandwidth_relays()
1939 # print the raw fallback list
1940 #for x in candidates.fallbacks:
1941 # print x.fallbackdir_line(True)
1942 # print json.dumps(candidates[x]._data, sort_keys=True, indent=4,
1943 # separators=(',', ': '), default=json_util.default)
1945 # impose mandatory conditions here, like one per contact, family, IP
1946 # in measured bandwidth order
1947 candidates.sort_fallbacks_by_measured_bandwidth()
1948 operator_count = 0
1949 # only impose these limits on the final list - operators can nominate
1950 # multiple candidate fallbacks, and then we choose the best set
1951 if not OUTPUT_CANDIDATES:
1952 operator_count += candidates.limit_fallbacks_same_ip()
1953 operator_count += candidates.limit_fallbacks_same_contact()
1954 operator_count += candidates.limit_fallbacks_same_family()
1956 # check if each candidate can serve a consensus
1957 # there's a small risk we've eliminated relays from the same operator that
1958 # can serve a consensus, in favour of one that can't
1959 # but since each consensus download check can take up to 15 seconds,
1960 # the risk is worth it
1961 if PERFORM_IPV4_DIRPORT_CHECKS or PERFORM_IPV6_DIRPORT_CHECKS:
1962 logging.warning('Checking consensus download speeds. ' +
1963 'This may take some time.')
1964 failed_count = candidates.perform_download_consensus_checks(max_count)
1966 # analyse and log interesting diversity metrics
1967 # like netblock, ports, exit, IPv4-only
1968 # (we can't easily analyse AS, and it's hard to accurately analyse country)
1969 candidates.describe_fallback_ip_family()
1970 # if we can't import the ipaddress module, we can't do netblock analysis
1971 if HAVE_IPADDRESS:
1972 candidates.describe_fallback_netblocks()
1973 candidates.describe_fallback_ports()
1974 candidates.describe_fallback_exit_flag()
1976 # output C comments summarising the fallback selection process
1977 if len(candidates.fallbacks) > 0:
1978 print candidates.summarise_fallbacks(eligible_count, operator_count,
1979 failed_count, guard_count,
1980 target_count)
1981 else:
1982 print '/* No Fallbacks met criteria */'
1984 # output C comments specifying the OnionOO data used to create the list
1985 for s in fetch_source_list():
1986 print describe_fetch_source(s)
1988 # if we're outputting the final fallback list, sort by fingerprint
1989 # this makes diffs much more stable
1990 # otherwise, leave sorted by bandwidth, which allows operators to be
1991 # contacted in priority order
1992 if not OUTPUT_CANDIDATES:
1993 candidates.sort_fallbacks_by_fingerprint()
1995 for x in candidates.fallbacks:
1996 print x.fallbackdir_line(candidates.fallbacks, prefilter_fallbacks)
1998 if __name__ == "__main__":
1999 list_fallbacks()