#!/usr/bin/env python

# Usage:
#
# Regenerate the list:
# scripts/maint/updateFallbackDirs.py > src/app/config/fallback_dirs.inc 2> fallback_dirs.log
#
# Check the existing list:
# scripts/maint/updateFallbackDirs.py check_existing > fallback_dirs.inc.ok 2> fallback_dirs.log
# mv fallback_dirs.inc.ok src/app/config/fallback_dirs.inc
#
# This script should be run from a stable, reliable network connection,
# with no other network activity (and not over tor).
# If this is not possible, please disable:
# PERFORM_IPV4_DIRPORT_CHECKS and PERFORM_IPV6_DIRPORT_CHECKS
#
# Needs dateutil, stem, and potentially other python packages.
# Optionally uses ipaddress (python 3 builtin) or py2-ipaddress (package)
# for netblock analysis.
#
# Then read the logs to make sure the fallbacks aren't dominated by a single
# netblock or port.
#
# Script by weasel, April 2015
# Portions by gsathya & karsten, 2013
# https://trac.torproject.org/projects/tor/attachment/ticket/8374/dir_list.2.py
# Modifications by teor, 2015
import StringIO
import string
import re
import datetime
import gzip
import os.path
import json
import math
import sys
import urllib
import urllib2
import hashlib
import dateutil.parser
# bson_lazy provides bson
#from bson import json_util
import copy

from stem.descriptor import DocumentHandler
from stem.descriptor.remote import get_consensus, get_server_descriptors, MAX_FINGERPRINTS

import logging
logging.root.name = ''
HAVE_IPADDRESS = False
try:
  # python 3 builtin, or install package py2-ipaddress
  # there are several ipaddress implementations for python 2
  # with slightly different semantics with str typed text
  # fortunately, all our IP addresses are in unicode
  import ipaddress
  HAVE_IPADDRESS = True
except ImportError:
  # if this happens, we avoid doing netblock analysis
  logging.warning('Unable to import ipaddress, please install py2-ipaddress.' +
                  ' A fallback list will be created, but optional netblock' +
                  ' analysis will not be performed.')
## Top-Level Configuration

# We use semantic versioning: https://semver.org
# In particular:
# * major changes include removing a mandatory field, or anything else that
#   would break an appropriately tolerant parser,
# * minor changes include adding a field,
# * patch changes include changing header comments or other unstructured
#   content
FALLBACK_FORMAT_VERSION = '2.0.0'
SECTION_SEPARATOR_BASE = '====='
SECTION_SEPARATOR_COMMENT = '/* ' + SECTION_SEPARATOR_BASE + ' */'

# Output all candidate fallbacks, or only output selected fallbacks?
OUTPUT_CANDIDATES = False

# Perform DirPort checks over IPv4?
# Change this to False if IPv4 doesn't work for you, or if you don't want to
# download a consensus for each fallback
# Don't check ~1000 candidates when OUTPUT_CANDIDATES is True
PERFORM_IPV4_DIRPORT_CHECKS = False if OUTPUT_CANDIDATES else True

# Perform DirPort checks over IPv6?
# If you know IPv6 works for you, set this to True
# This will exclude IPv6 relays without an IPv6 DirPort configured
# So it's best left at False until #18394 is implemented
# Don't check ~1000 candidates when OUTPUT_CANDIDATES is True
PERFORM_IPV6_DIRPORT_CHECKS = False if OUTPUT_CANDIDATES else False

# Must relays be running now?
MUST_BE_RUNNING_NOW = (PERFORM_IPV4_DIRPORT_CHECKS
                       or PERFORM_IPV6_DIRPORT_CHECKS)

# Clients have been using microdesc consensuses by default for a while now
DOWNLOAD_MICRODESC_CONSENSUS = True

# If a relay delivers an expired consensus, if it expired less than this many
# seconds ago, we still allow the relay. This should never be less than -90,
# as all directory mirrors should have downloaded a consensus 90 minutes
# before it expires. It should never be more than 24 hours, because clients
# reject consensuses that are older than REASONABLY_LIVE_TIME.
# For the consensus expiry check to be accurate, the machine running this
# script needs an accurate clock.
#
# Relays on 0.3.0 and later return a 404 when they are about to serve an
# expired consensus. This makes them fail the download check.
# We use a tolerance of 0, so that 0.2.x series relays also fail the download
# check if they serve an expired consensus.
CONSENSUS_EXPIRY_TOLERANCE = 0

# Output fallback name, flags, bandwidth, and ContactInfo in a C comment?
OUTPUT_COMMENTS = True if OUTPUT_CANDIDATES else False

# Output matching ContactInfo in fallbacks list?
# Useful if you're trying to contact operators
CONTACT_COUNT = True if OUTPUT_CANDIDATES else False

# How the list should be sorted:
# fingerprint: is useful for stable diffs of fallback lists
# measured_bandwidth: is useful when pruning the list based on bandwidth
# contact: is useful for contacting operators once the list has been pruned
OUTPUT_SORT_FIELD = 'contact' if OUTPUT_CANDIDATES else 'fingerprint'

## OnionOO Settings

ONIONOO = 'https://onionoo.torproject.org/'
#ONIONOO = 'https://onionoo.thecthulhu.com/'

# Don't bother going out to the Internet, just use the files available locally,
# even if they're very old
LOCAL_FILES_ONLY = False

## Whitelist / Blacklist Filter Settings

# The whitelist contains entries that are included if all attributes match
# (IPv4, dirport, orport, id, and optionally IPv6 and IPv6 orport)
#
# What happens to entries not in whitelist?
# When True, they are included, when False, they are excluded
INCLUDE_UNLISTED_ENTRIES = True if OUTPUT_CANDIDATES else False

WHITELIST_FILE_NAME = 'scripts/maint/fallback.whitelist'
FALLBACK_FILE_NAME = 'src/app/config/fallback_dirs.inc'

# The number of bytes we'll read from a filter file before giving up
MAX_LIST_FILE_SIZE = 1024 * 1024

## Eligibility Settings

# Require fallbacks to have the same address and port for a set amount of time
# We used to have this at 1 week, but that caused many fallback failures, which
# meant that we had to rebuild the list more often. We want fallbacks to be
# stable for 2 years, so we set it to a few months.
#
# If a relay changes address or port, that's it, it's not useful any more,
# because clients can't find it
ADDRESS_AND_PORT_STABLE_DAYS = 90
# We ignore relays that have been down for more than this period
MAX_DOWNTIME_DAYS = 0 if MUST_BE_RUNNING_NOW else 7
# FallbackDirs must have a time-weighted-fraction that is greater than or
# equal to:
# Mirrors that are down half the time are still useful half the time
CUTOFF_RUNNING = .50
CUTOFF_V2DIR = .50
# Guard flags are removed for some time after a relay restarts, so we ignore
# the guard flag.
CUTOFF_GUARD = .00
# FallbackDirs must have a time-weighted-fraction that is less than or equal
# to:
# .00 means no bad exits
PERMITTED_BADEXIT = .00

# older entries' weights are adjusted with ALPHA^(age in days)
AGE_ALPHA = 0.99

# this factor is used to scale OnionOO entries to [0,1]
ONIONOO_SCALE_ONE = 999.

## Fallback Count Limits

# The target for these parameters is 20% of the guards in the network
# This is around 200 as of October 2015
_FB_POG = 0.2
FALLBACK_PROPORTION_OF_GUARDS = None if OUTPUT_CANDIDATES else _FB_POG

# Limit the number of fallbacks (eliminating lowest by advertised bandwidth)
MAX_FALLBACK_COUNT = None if OUTPUT_CANDIDATES else 200
# Emit a C #error if the number of fallbacks is less than expected
MIN_FALLBACK_COUNT = 0 if OUTPUT_CANDIDATES else MAX_FALLBACK_COUNT*0.5

# The maximum number of fallbacks on the same address, contact, or family
#
# With 150 fallbacks, this means each operator sees 5% of client bootstraps.
# For comparison:
# - We try to limit guard and exit operators to 5% of the network
# - The directory authorities used to see 11% of client bootstraps each
#
# We also don't want too much of the list to go down if a single operator
# has to move all their relays.
MAX_FALLBACKS_PER_IP = 1
MAX_FALLBACKS_PER_IPV4 = MAX_FALLBACKS_PER_IP
MAX_FALLBACKS_PER_IPV6 = MAX_FALLBACKS_PER_IP
MAX_FALLBACKS_PER_CONTACT = 7
MAX_FALLBACKS_PER_FAMILY = 7

## Fallback Bandwidth Requirements

# Any fallback with the Exit flag has its bandwidth multiplied by this fraction
# to make sure we aren't further overloading exits
# (Set to 1.0, because we asked that only lightly loaded exits opt-in,
# and the extra load really isn't that much for large relays.)
EXIT_BANDWIDTH_FRACTION = 1.0

# If a single fallback's bandwidth is too low, it's pointless adding it
# We expect fallbacks to handle an extra 10 kilobytes per second of traffic
# Make sure they can support fifty times the expected extra load
#
# We convert this to a consensus weight before applying the filter,
# because all the bandwidth amounts are specified by the relay
MIN_BANDWIDTH = 50.0 * 10.0 * 1024.0
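# A quick sanity check on the arithmetic above (not used by the script):
# 10 KByte/s of expected extra load * 50 safety factor * 1024 bytes/KByte
# = 512000 bytes/s, so a relay must sustain roughly 500 KByte/s to be listed.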
# Clients will time out after 30 seconds trying to download a consensus
# So allow fallback directories half that to deliver a consensus
# The exact download times might change based on the network connection
# running this script, but only by a few seconds
# There is also about a second of python overhead
CONSENSUS_DOWNLOAD_SPEED_MAX = 15.0
# If the relay fails a consensus check, retry the download
# This avoids delisting a relay due to transient network conditions
CONSENSUS_DOWNLOAD_RETRY = True
## Parsing Functions

def parse_ts(t):
  return datetime.datetime.strptime(t, "%Y-%m-%d %H:%M:%S")
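# For example (illustrative, not executed by the script):
#   parse_ts("2015-03-30 06:00:00") == datetime.datetime(2015, 3, 30, 6, 0)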
def remove_bad_chars(raw_string, bad_char_list):
  # Remove each character in the bad_char_list
  cleansed_string = raw_string
  for c in bad_char_list:
    cleansed_string = cleansed_string.replace(c, '')
  return cleansed_string

def cleanse_unprintable(raw_string):
  # Remove all unprintable characters
  cleansed_string = ''
  for c in raw_string:
    if c in string.printable:
      cleansed_string += c
  return cleansed_string

def cleanse_whitespace(raw_string):
  # Replace all whitespace characters with a space
  cleansed_string = raw_string
  for c in string.whitespace:
    cleansed_string = cleansed_string.replace(c, ' ')
  return cleansed_string
def cleanse_c_multiline_comment(raw_string):
  cleansed_string = raw_string
  # Embedded newlines should be removed by tor/onionoo, but let's be paranoid
  cleansed_string = cleanse_whitespace(cleansed_string)
  # ContactInfo and Version can be arbitrary binary data
  cleansed_string = cleanse_unprintable(cleansed_string)
  # Prevent a malicious / unanticipated string from breaking out
  # of a C-style multiline comment
  # This removes '/*' and '*/' and '//'
  bad_char_list = '*/'
  # Prevent a malicious string from using C nulls
  bad_char_list += '\0'
  # Avoid confusing parsers by making sure there is only one comma per fallback
  bad_char_list += ','
  # Avoid confusing parsers by making sure there is only one equals per field
  bad_char_list += '='
  # Be safer by removing bad characters entirely
  cleansed_string = remove_bad_chars(cleansed_string, bad_char_list)
  # Some compilers may further process the content of comments
  # There isn't much we can do to cover every possible case
  # But comment-based directives are typically only advisory
  return cleansed_string
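# A sketch of the sanitisation above (illustrative input):
#   cleanse_c_multiline_comment('evil */ contact, x=1\0')
# strips the unprintable NUL and the '*', '/', ',' and '=' characters,
# leaving 'evil  contact x1'.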
def cleanse_c_string(raw_string):
  cleansed_string = raw_string
  # Embedded newlines should be removed by tor/onionoo, but let's be paranoid
  cleansed_string = cleanse_whitespace(cleansed_string)
  # ContactInfo and Version can be arbitrary binary data
  cleansed_string = cleanse_unprintable(cleansed_string)
  # Prevent a malicious address/fingerprint string from breaking out
  # of a C-style string
  bad_char_list = '"'
  # Prevent a malicious string from using escapes
  bad_char_list += '\\'
  # Prevent a malicious string from using C nulls
  bad_char_list += '\0'
  # Avoid confusing parsers by making sure there is only one comma per fallback
  bad_char_list += ','
  # Avoid confusing parsers by making sure there is only one equals per field
  bad_char_list += '='
  # Be safer by removing bad characters entirely
  cleansed_string = remove_bad_chars(cleansed_string, bad_char_list)
  # Some compilers may further process the content of strings
  # There isn't much we can do to cover every possible case
  # But this typically only results in changes to the string data
  return cleansed_string
## OnionOO Source Functions

# a dictionary of source metadata for each onionoo query we've made
fetch_source = {}

# register source metadata for 'what'
# assumes we only retrieve one document for each 'what'
def register_fetch_source(what, url, relays_published, version):
  fetch_source[what] = {}
  fetch_source[what]['url'] = url
  fetch_source[what]['relays_published'] = relays_published
  fetch_source[what]['version'] = version

# list each registered source's 'what'
def fetch_source_list():
  return sorted(fetch_source.keys())

# given 'what', provide a multiline C comment describing the source
def describe_fetch_source(what):
  desc = '/*'
  desc += '\n'
  desc += 'Onionoo Source: '
  desc += cleanse_c_multiline_comment(what)
  desc += ' Date: '
  desc += cleanse_c_multiline_comment(fetch_source[what]['relays_published'])
  desc += ' Version: '
  desc += cleanse_c_multiline_comment(fetch_source[what]['version'])
  desc += '\n'
  desc += 'URL: '
  desc += cleanse_c_multiline_comment(fetch_source[what]['url'])
  desc += '\n'
  desc += '*/'
  return desc
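# The resulting comment looks roughly like this (illustrative values; note
# that cleanse_c_multiline_comment strips '/' characters, so the URL loses
# its slashes):
# /*
# Onionoo Source: details Date: 2015-10-02 13:34:14 Version: 4.0
# URL: https:onionoo.torproject.orgdetails?type=relay...
# */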
## File Processing Functions

def write_to_file(str, file_name, max_len):
  try:
    with open(file_name, 'w') as f:
      f.write(str[0:max_len])
  except EnvironmentError, error:
    logging.error('Writing file %s failed: %d: %s'%
                  (file_name,
                   error.errno,
                   error.strerror))

def read_from_file(file_name, max_len):
  try:
    if os.path.isfile(file_name):
      with open(file_name, 'r') as f:
        return f.read(max_len)
  except EnvironmentError, error:
    logging.info('Loading file %s failed: %d: %s'%
                 (file_name,
                  error.errno,
                  error.strerror))
  return None
def parse_fallback_file(file_name):
  file_data = read_from_file(file_name, MAX_LIST_FILE_SIZE)
  file_data = cleanse_unprintable(file_data)
  file_data = remove_bad_chars(file_data, '\n"\0')
  file_data = re.sub('/\*.*?\*/', '', file_data)
  file_data = file_data.replace(',', '\n')
  file_data = file_data.replace(' weight=10', '')
  return file_data
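# Sketch of the transformation, assuming input in fallback_dirs.inc format:
# quotes, newlines and NULs are removed, C comments are dropped, and each
# trailing comma becomes a line break, so an entry like
#   "1.2.3.4:80 orport=443 id=<fingerprint>"
#   /* nickname=example */
#   ,
# collapses to a single line:
#   1.2.3.4:80 orport=443 id=<fingerprint>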
def load_possibly_compressed_response_json(response):
  if response.info().get('Content-Encoding') == 'gzip':
    buf = StringIO.StringIO( response.read() )
    f = gzip.GzipFile(fileobj=buf)
    return json.load(f)
  else:
    return json.load(response)
def load_json_from_file(json_file_name):
  # An exception here may be resolved by deleting the .last_modified
  # and .json files, and re-running the script
  try:
    with open(json_file_name, 'r') as f:
      return json.load(f)
  except EnvironmentError, error:
    raise Exception('Reading not-modified json file %s failed: %d: %s'%
                    (json_file_name,
                     error.errno,
                     error.strerror))
## OnionOO Functions

def datestr_to_datetime(datestr):
  # Parse datetimes like: Fri, 02 Oct 2015 13:34:14 GMT
  if datestr is not None:
    dt = dateutil.parser.parse(datestr)
  else:
    # Never modified - use start of epoch
    dt = datetime.datetime.utcfromtimestamp(0)
  # strip any timezone out (in case they're supported in future)
  dt = dt.replace(tzinfo=None)
  return dt
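# For example (illustrative):
#   datestr_to_datetime('Fri, 02 Oct 2015 13:34:14 GMT')
# yields the naive datetime.datetime(2015, 10, 2, 13, 34, 14), and
# datestr_to_datetime(None) yields the Unix epoch.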
def onionoo_fetch(what, **kwargs):
  params = kwargs
  params['type'] = 'relay'
  #params['limit'] = 10
  params['first_seen_days'] = '%d-'%(ADDRESS_AND_PORT_STABLE_DAYS)
  params['last_seen_days'] = '-%d'%(MAX_DOWNTIME_DAYS)
  params['flag'] = 'V2Dir'
  url = ONIONOO + what + '?' + urllib.urlencode(params)

  # Unfortunately, the URL is too long for some OS filenames,
  # but we still don't want to get files from different URLs mixed up
  base_file_name = what + '-' + hashlib.sha1(url).hexdigest()

  full_url_file_name = base_file_name + '.full_url'
  MAX_FULL_URL_LENGTH = 1024

  last_modified_file_name = base_file_name + '.last_modified'
  MAX_LAST_MODIFIED_LENGTH = 64

  json_file_name = base_file_name + '.json'

  if LOCAL_FILES_ONLY:
    # Read from the local file, don't write to anything
    response_json = load_json_from_file(json_file_name)
  else:
    # store the full URL to a file for debugging
    # no need to compare as long as you trust SHA-1
    write_to_file(url, full_url_file_name, MAX_FULL_URL_LENGTH)

    request = urllib2.Request(url)
    request.add_header('Accept-encoding', 'gzip')

    # load the last modified date from the file, if it exists
    last_mod_date = read_from_file(last_modified_file_name,
                                   MAX_LAST_MODIFIED_LENGTH)
    if last_mod_date is not None:
      request.add_header('If-modified-since', last_mod_date)

    # Parse last modified date
    last_mod = datestr_to_datetime(last_mod_date)

    # Not Modified and still recent enough to be useful
    # Onionoo / Globe used to use 6 hours, but we can afford a day
    required_freshness = datetime.datetime.utcnow()
    # strip any timezone out (to match dateutil.parser)
    required_freshness = required_freshness.replace(tzinfo=None)
    required_freshness -= datetime.timedelta(hours=24)

    # Make the OnionOO request
    response_code = 0
    try:
      response = urllib2.urlopen(request)
      response_code = response.getcode()
    except urllib2.HTTPError, error:
      response_code = error.code
      if response_code == 304: # not modified
        pass
      else:
        raise Exception("Could not get " + url + ": "
                        + str(error.code) + ": " + error.reason)

    if response_code == 200: # OK
      last_mod = datestr_to_datetime(response.info().get('Last-Modified'))

    # Check for freshness
    if last_mod < required_freshness:
      if last_mod_date is not None:
        # This check sometimes fails transiently, retry the script if it does
        date_message = "Outdated data: last updated " + last_mod_date
      else:
        date_message = "No data: never downloaded "
      raise Exception(date_message + " from " + url)

    # Process the data
    if response_code == 200: # OK

      response_json = load_possibly_compressed_response_json(response)

      with open(json_file_name, 'w') as f:
        # use the most compact json representation to save space
        json.dump(response_json, f, separators=(',',':'))

      # store the last modified date in its own file
      if response.info().get('Last-modified') is not None:
        write_to_file(response.info().get('Last-Modified'),
                      last_modified_file_name,
                      MAX_LAST_MODIFIED_LENGTH)

    elif response_code == 304: # Not Modified

      response_json = load_json_from_file(json_file_name)

    else: # Unexpected HTTP response code not covered in the HTTPError above
      raise Exception("Unexpected HTTP response code to " + url + ": "
                      + str(response_code))

  register_fetch_source(what,
                        url,
                        response_json['relays_published'],
                        response_json['version'])

  return response_json
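# With the defaults above, a 'details' request URL looks roughly like
# (illustrative; parameter order depends on urllib.urlencode and the dict,
# and callers may add extra parameters such as 'fields'):
# https://onionoo.torproject.org/details?type=relay&first_seen_days=90-&last_seen_days=-0&flag=V2Dir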
def fetch(what, **kwargs):
  #x = onionoo_fetch(what, **kwargs)
  # don't use sort_keys, as the order of or_addresses is significant
  #print json.dumps(x, indent=4, separators=(',', ': '))
  #sys.exit(0)

  return onionoo_fetch(what, **kwargs)
## Fallback Candidate Class

class Candidate(object):
  CUTOFF_ADDRESS_AND_PORT_STABLE = (datetime.datetime.utcnow()
                            - datetime.timedelta(ADDRESS_AND_PORT_STABLE_DAYS))

  def __init__(self, details):
    for f in ['fingerprint', 'nickname', 'last_changed_address_or_port',
              'consensus_weight', 'or_addresses', 'dir_address']:
      if not f in details: raise Exception("Document has no %s field."%(f,))

    if not 'contact' in details:
      details['contact'] = None
    if not 'flags' in details or details['flags'] is None:
      details['flags'] = []
    if (not 'advertised_bandwidth' in details
        or details['advertised_bandwidth'] is None):
      # relays without advertised bandwidth have it calculated from their
      # consensus weight
      details['advertised_bandwidth'] = 0
    if (not 'effective_family' in details
        or details['effective_family'] is None):
      details['effective_family'] = []
    if not 'platform' in details:
      details['platform'] = None
    details['last_changed_address_or_port'] = parse_ts(
                                      details['last_changed_address_or_port'])
    self._data = details
    self._stable_sort_or_addresses()

    self._fpr = self._data['fingerprint']
    self._running = self._guard = self._v2dir = 0.
    self._split_dirport()
    self._compute_orport()
    if self.orport is None:
      raise Exception("Failed to get an orport for %s."%(self._fpr,))
    self._compute_ipv6addr()
    if not self.has_ipv6():
      logging.debug("Failed to get an ipv6 address for %s."%(self._fpr,))
    self._compute_version()
    self._extra_info_cache = None
  def _stable_sort_or_addresses(self):
    # replace self._data['or_addresses'] with a stable ordering,
    # sorting the secondary addresses in string order
    # leave the received order in self._data['or_addresses_raw']
    self._data['or_addresses_raw'] = self._data['or_addresses']
    or_address_primary = self._data['or_addresses'][:1]
    # subsequent entries in the or_addresses array are in an arbitrary order
    # so we stabilise the addresses by sorting them in string order
    or_addresses_secondaries_stable = sorted(self._data['or_addresses'][1:])
    or_addresses_stable = or_address_primary + or_addresses_secondaries_stable
    self._data['or_addresses'] = or_addresses_stable

  def get_fingerprint(self):
    return self._fpr
  # is_valid_ipv[46]_address by gsathya, karsten, 2013
  @staticmethod
  def is_valid_ipv4_address(address):
    if not isinstance(address, (str, unicode)):
      return False

    # check if there are four period separated values
    if address.count(".") != 3:
      return False

    # checks that each value in the octet are decimal values between 0-255
    for entry in address.split("."):
      if not entry.isdigit() or int(entry) < 0 or int(entry) > 255:
        return False
      elif entry[0] == "0" and len(entry) > 1:
        return False # leading zeros, for instance in "1.2.3.001"

    return True
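  # Illustrative behaviour (not executed by the script):
  #   is_valid_ipv4_address('1.2.3.4')   -> True
  #   is_valid_ipv4_address('1.2.3.256') -> False (octet out of range)
  #   is_valid_ipv4_address('1.2.3.001') -> False (leading zeros)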
  @staticmethod
  def is_valid_ipv6_address(address):
    if not isinstance(address, (str, unicode)):
      return False

    # remove brackets
    address = address[1:-1]

    # addresses are made up of eight colon separated groups of four hex digits
    # with leading zeros being optional
    # https://en.wikipedia.org/wiki/IPv6#Address_format

    colon_count = address.count(":")

    if colon_count > 7:
      return False # too many groups
    elif colon_count != 7 and not "::" in address:
      return False # not enough groups and none are collapsed
    elif address.count("::") > 1 or ":::" in address:
      return False # multiple groupings of zeros can't be collapsed

    found_ipv4_on_previous_entry = False
    for entry in address.split(":"):
      # If an IPv6 address has an embedded IPv4 address,
      # it must be the last entry
      if found_ipv4_on_previous_entry:
        return False
      if not re.match("^[0-9a-fA-F]{0,4}$", entry):
        if not Candidate.is_valid_ipv4_address(entry):
          return False
        else:
          found_ipv4_on_previous_entry = True

    return True
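  # Illustrative behaviour, assuming bracketed input as used by onionoo:
  #   is_valid_ipv6_address('[2001:db8::1]')      -> True
  #   is_valid_ipv6_address('[2001:db8::1::2]')   -> False (two '::')
  #   is_valid_ipv6_address('[::ffff:1.2.3.4]')   -> True  (embedded IPv4)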
  def _split_dirport(self):
    # Split the dir_address into dirip and dirport
    (self.dirip, _dirport) = self._data['dir_address'].split(':', 1)
    self.dirport = int(_dirport)

  def _compute_orport(self):
    # Choose the first ORPort that's on the same IPv4 address as the DirPort.
    # In rare circumstances, this might not be the primary ORPort address.
    # However, _stable_sort_or_addresses() ensures we choose the same one
    # every time, even if onionoo changes the order of the secondaries.
    self._split_dirport()
    self.orport = None
    for i in self._data['or_addresses']:
      if i != self._data['or_addresses'][0]:
        logging.debug('Secondary IPv4 Address Used for %s: %s'%(self._fpr, i))
      (ipaddr, port) = i.rsplit(':', 1)
      if (ipaddr == self.dirip) and Candidate.is_valid_ipv4_address(ipaddr):
        self.orport = int(port)
        return
  def _compute_ipv6addr(self):
    # Choose the first IPv6 address that uses the same port as the ORPort
    # Or, choose the first IPv6 address in the list
    # _stable_sort_or_addresses() ensures we choose the same IPv6 address
    # every time, even if onionoo changes the order of the secondaries.
    self.ipv6addr = None
    self.ipv6orport = None
    # Choose the first IPv6 address that uses the same port as the ORPort
    for i in self._data['or_addresses']:
      (ipaddr, port) = i.rsplit(':', 1)
      # compare as integers: the port from onionoo is a string
      if (int(port) == self.orport) and Candidate.is_valid_ipv6_address(ipaddr):
        self.ipv6addr = ipaddr
        self.ipv6orport = int(port)
        return
    # Choose the first IPv6 address in the list
    for i in self._data['or_addresses']:
      (ipaddr, port) = i.rsplit(':', 1)
      if Candidate.is_valid_ipv6_address(ipaddr):
        self.ipv6addr = ipaddr
        self.ipv6orport = int(port)
        return
  def _compute_version(self):
    # parse the version out of the platform string
    # The platform looks like: "Tor 0.2.7.6 on Linux"
    self._data['version'] = None
    if self._data['platform'] is None:
      return
    # be tolerant of weird whitespacing, use a whitespace split
    tokens = self._data['platform'].split()
    for token in tokens:
      vnums = token.split('.')
      # if it's at least a.b.c.d, with potentially an -alpha-dev, -alpha, -rc
      if (len(vnums) >= 4 and vnums[0].isdigit() and vnums[1].isdigit() and
          vnums[2].isdigit()):
        self._data['version'] = token
        return
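  # For example (illustrative): a platform of "Tor 0.2.7.6 on Linux" yields
  # version '0.2.7.6', and "Tor 0.3.0.0-alpha-dev on FreeBSD" yields
  # '0.3.0.0-alpha-dev' (the fourth component may carry a -alpha/-rc suffix).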
  # From #20509
  # bug #20499 affects versions from 0.2.9.1-alpha-dev to 0.2.9.4-alpha-dev
  # and version 0.3.0.0-alpha-dev
  # Exhaustive lists are hard to get wrong
  STALE_CONSENSUS_VERSIONS = ['0.2.9.1-alpha-dev',
                              '0.2.9.2-alpha',
                              '0.2.9.2-alpha-dev',
                              '0.2.9.3-alpha',
                              '0.2.9.3-alpha-dev',
                              '0.2.9.4-alpha',
                              '0.2.9.4-alpha-dev',
                              '0.3.0.0-alpha-dev'
                              ]
  def is_valid_version(self):
    # call _compute_version before calling this
    # is the version of the relay a version we want as a fallback?
    # checks both recommended versions and bug #20499 / #20509
    #
    # if the relay doesn't have a recommended version field, exclude the relay
    if not self._data.has_key('recommended_version'):
      log_excluded('%s not a candidate: no recommended_version field',
                   self._fpr)
      return False
    if not self._data['recommended_version']:
      log_excluded('%s not a candidate: version not recommended', self._fpr)
      return False
    # if the relay doesn't have version field, exclude the relay
    if not self._data.has_key('version'):
      log_excluded('%s not a candidate: no version field', self._fpr)
      return False
    if self._data['version'] in Candidate.STALE_CONSENSUS_VERSIONS:
      logging.warning('%s not a candidate: version delivers stale consensuses',
                      self._fpr)
      return False
    return True
  @staticmethod
  def _extract_generic_history(history, which='unknown'):
    # given a tree like this:
    #   "1_month": {
    #     "count": 187,
    #     "factor": 0.001001001001001001,
    #     "first": "2015-02-27 06:00:00",
    #     "interval": 14400,
    #     "last": "2015-03-30 06:00:00",
    #     "values": [
    #       999,
    #       999
    #     ]
    #   },
    #   "1_week": {
    #     "count": 169,
    #     "factor": 0.001001001001001001,
    #     "first": "2015-03-23 07:30:00",
    #     "interval": 3600,
    #     "last": "2015-03-30 07:30:00",
    #     "values": [ ... ]
    #   },
    #   "1_year": {
    #     "count": 177,
    #     "factor": 0.001001001001001001,
    #     "first": "2014-04-11 00:00:00",
    #     "interval": 172800,
    #     "last": "2015-03-29 00:00:00",
    #     "values": [ ... ]
    #   },
    #   "3_months": {
    #     "count": 185,
    #     "factor": 0.001001001001001001,
    #     "first": "2014-12-28 06:00:00",
    #     "interval": 43200,
    #     "last": "2015-03-30 06:00:00",
    #     "values": [ ... ]
    #   },
    # extract exactly one piece of data per time interval,
    # using smaller intervals where available.
    #
    # returns list of (age, length, value) dictionaries.

    generic_history = []

    periods = history.keys()
    periods.sort(key = lambda x: history[x]['interval'])
    now = datetime.datetime.utcnow()
    newest = now
    for p in periods:
      h = history[p]
      interval = datetime.timedelta(seconds = h['interval'])
      this_ts = parse_ts(h['last'])

      if (len(h['values']) != h['count']):
        logging.warning('Inconsistent value count in %s document for %s'
                        %(p, which))
      for v in reversed(h['values']):
        if (this_ts <= newest):
          agt1 = now - this_ts
          agt2 = interval
          agetmp1 = (agt1.microseconds + (agt1.seconds + agt1.days * 24 * 3600)
                     * 10**6) / 10**6
          agetmp2 = (agt2.microseconds + (agt2.seconds + agt2.days * 24 * 3600)
                     * 10**6) / 10**6
          generic_history.append(
            { 'age': agetmp1,
              'length': agetmp2,
              'value': v
            })
          newest = this_ts
        this_ts -= interval

      if (this_ts + interval != parse_ts(h['first'])):
        logging.warning('Inconsistent time information in %s document for %s'
                        %(p, which))

    #print json.dumps(generic_history, sort_keys=True,
    #                 indent=4, separators=(',', ': '))
    return generic_history
  @staticmethod
  def _avg_generic_history(generic_history):
    a = []
    for i in generic_history:
      if i['age'] > (ADDRESS_AND_PORT_STABLE_DAYS * 24 * 3600):
        continue
      if (i['length'] is not None
          and i['age'] is not None
          and i['value'] is not None):
        w = i['length'] * math.pow(AGE_ALPHA, i['age']/(3600*24))
        a.append( (i['value'] * w, w) )

    sv = math.fsum(map(lambda x: x[0], a))
    sw = math.fsum(map(lambda x: x[1], a))

    if sw == 0.0:
      svw = 0.0
    else:
      svw = sv/sw
    return svw
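  # Worked example of the weighting (illustrative): a 3600s sample that is
  # 10 days old gets weight 3600 * 0.99**10, roughly 3256, so a value of 999
  # contributes 999 * 3256 to sv and 3256 to sw; the result is the
  # age-discounted, length-weighted average, still on the 0-999 scale.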
  def _add_generic_history(self, history):
    periods = history.keys()
    periods.sort(key = lambda x: history[x]['interval'] )

    print periods

  def add_running_history(self, history):
    pass
  def add_uptime(self, uptime):
    logging.debug('Adding uptime %s.'%(self._fpr,))

    # flags we care about: Running, V2Dir, Guard
    if not 'flags' in uptime:
      logging.debug('No flags in document for %s.'%(self._fpr,))
      return

    for f in ['Running', 'Guard', 'V2Dir']:
      if not f in uptime['flags']:
        logging.debug('No %s in flags for %s.'%(f, self._fpr,))
        return

    running = self._extract_generic_history(uptime['flags']['Running'],
                                            '%s-Running'%(self._fpr))
    guard = self._extract_generic_history(uptime['flags']['Guard'],
                                          '%s-Guard'%(self._fpr))
    v2dir = self._extract_generic_history(uptime['flags']['V2Dir'],
                                          '%s-V2Dir'%(self._fpr))
    if 'BadExit' in uptime['flags']:
      badexit = self._extract_generic_history(uptime['flags']['BadExit'],
                                              '%s-BadExit'%(self._fpr))

    self._running = self._avg_generic_history(running) / ONIONOO_SCALE_ONE
    self._guard = self._avg_generic_history(guard) / ONIONOO_SCALE_ONE
    self._v2dir = self._avg_generic_history(v2dir) / ONIONOO_SCALE_ONE
    self._badexit = None
    if 'BadExit' in uptime['flags']:
      self._badexit = self._avg_generic_history(badexit) / ONIONOO_SCALE_ONE
  def is_candidate(self):
    try:
      if (MUST_BE_RUNNING_NOW and not self.is_running()):
        log_excluded('%s not a candidate: not running now, unable to check ' +
                     'DirPort consensus download', self._fpr)
        return False
      if (self._data['last_changed_address_or_port'] >
          self.CUTOFF_ADDRESS_AND_PORT_STABLE):
        log_excluded('%s not a candidate: changed address/port recently (%s)',
                     self._fpr, self._data['last_changed_address_or_port'])
        return False
      if self._running < CUTOFF_RUNNING:
        log_excluded('%s not a candidate: running avg too low (%lf)',
                     self._fpr, self._running)
        return False
      if self._v2dir < CUTOFF_V2DIR:
        log_excluded('%s not a candidate: v2dir avg too low (%lf)',
                     self._fpr, self._v2dir)
        return False
      if self._badexit is not None and self._badexit > PERMITTED_BADEXIT:
        log_excluded('%s not a candidate: badexit avg too high (%lf)',
                     self._fpr, self._badexit)
        return False
      # this function logs a message depending on which check fails
      if not self.is_valid_version():
        return False
      if self._guard < CUTOFF_GUARD:
        log_excluded('%s not a candidate: guard avg too low (%lf)',
                     self._fpr, self._guard)
        return False
      if (not self._data.has_key('consensus_weight')
          or self._data['consensus_weight'] < 1):
        log_excluded('%s not a candidate: consensus weight invalid', self._fpr)
        return False
    except BaseException as e:
      logging.warning("Exception %s when checking if fallback is a candidate",
                      str(e))
      return False
    return True
  def is_in_whitelist(self, relaylist):
    """ A fallback matches if each key in the whitelist line matches:
          ipv4
          dirport
          orport
          id
          ipv6 address and port (if present)
        If the fallback has an ipv6 key, the whitelist line must also have
        it, and vice versa, otherwise they don't match. """
    ipv6 = None
    if self.has_ipv6():
      ipv6 = '%s:%d'%(self.ipv6addr, self.ipv6orport)
    for entry in relaylist:
      if entry['id'] != self._fpr:
        # can't log here unless we match an IP and port, because every relay's
        # fingerprint is compared to every entry's fingerprint
        if entry['ipv4'] == self.dirip and int(entry['orport']) == self.orport:
          logging.warning('%s excluded: has OR %s:%d changed fingerprint to ' +
                          '%s?', entry['id'], self.dirip, self.orport,
                          self._fpr)
        if self.has_ipv6() and entry.has_key('ipv6') and entry['ipv6'] == ipv6:
          logging.warning('%s excluded: has OR %s changed fingerprint to ' +
                          '%s?', entry['id'], ipv6, self._fpr)
        continue
      if entry['ipv4'] != self.dirip:
        logging.warning('%s excluded: has it changed IPv4 from %s to %s?',
                        self._fpr, entry['ipv4'], self.dirip)
        continue
      if int(entry['dirport']) != self.dirport:
        logging.warning('%s excluded: has it changed DirPort from %s:%d to ' +
                        '%s:%d?', self._fpr, self.dirip, int(entry['dirport']),
                        self.dirip, self.dirport)
        continue
      if int(entry['orport']) != self.orport:
        logging.warning('%s excluded: has it changed ORPort from %s:%d to ' +
                        '%s:%d?', self._fpr, self.dirip, int(entry['orport']),
                        self.dirip, self.orport)
        continue
      if entry.has_key('ipv6') and self.has_ipv6():
        # if both entry and fallback have an ipv6 address, compare them
        if entry['ipv6'] != ipv6:
          logging.warning('%s excluded: has it changed IPv6 ORPort from %s ' +
                          'to %s?', self._fpr, entry['ipv6'], ipv6)
          continue
      # if the fallback has an IPv6 address but the whitelist entry
      # doesn't, or vice versa, the whitelist entry doesn't match
      elif entry.has_key('ipv6') and not self.has_ipv6():
        logging.warning('%s excluded: has it lost its former IPv6 address %s?',
                        self._fpr, entry['ipv6'])
        continue
      elif not entry.has_key('ipv6') and self.has_ipv6():
        logging.warning('%s excluded: has it gained an IPv6 address %s?',
                        self._fpr, ipv6)
        continue
      return True
    return False
  def cw_to_bw_factor(self):
    # any relays with a missing or zero consensus weight are not candidates
    # any relays with a missing advertised bandwidth have it set to zero
    return self._data['advertised_bandwidth'] / self._data['consensus_weight']

  # since advertised_bandwidth is reported by the relay, it can be gamed
  # to avoid this, use the median consensus weight to bandwidth factor to
  # estimate this relay's measured bandwidth, and make that the upper limit
  def measured_bandwidth(self, median_cw_to_bw_factor):
    cw_to_bw = median_cw_to_bw_factor
    # Reduce exit bandwidth to make sure we're not overloading them
    if self.is_exit():
      cw_to_bw *= EXIT_BANDWIDTH_FRACTION
    measured_bandwidth = self._data['consensus_weight'] * cw_to_bw
    if self._data['advertised_bandwidth'] != 0:
      # limit advertised bandwidth (if available) to measured bandwidth
      return min(measured_bandwidth, self._data['advertised_bandwidth'])
    else:
      return measured_bandwidth

  def set_measured_bandwidth(self, median_cw_to_bw_factor):
    self._data['measured_bandwidth'] = self.measured_bandwidth(
        median_cw_to_bw_factor)
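  # Worked example of the estimate (illustrative numbers): with a median
  # factor of 100 bytes/s per consensus weight unit, a non-exit relay with
  # consensus_weight=20000 is estimated at 2000000 bytes/s; if it advertises
  # 9999999 bytes/s, min() caps the result at 2000000, which limits gamed
  # advertised values.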
  def is_exit(self):
    return 'Exit' in self._data['flags']

  def is_guard(self):
    return 'Guard' in self._data['flags']

  def is_running(self):
    return 'Running' in self._data['flags']

  # does this fallback have an IPv6 address and orport?
  def has_ipv6(self):
    return self.ipv6addr is not None and self.ipv6orport is not None

  # strip leading and trailing brackets from an IPv6 address
  # safe to use on non-bracketed IPv6 and on IPv4 addresses
  # also convert to unicode, and make None appear as ''
  @staticmethod
  def strip_ipv6_brackets(ip):
    if ip is None:
      return unicode('')
    if len(ip) < 2:
      return unicode(ip)
    if ip[0] == '[' and ip[-1] == ']':
      return unicode(ip[1:-1])
    return unicode(ip)
  # are ip_a and ip_b in the same netblock?
  # mask_bits is the size of the netblock
  # takes both IPv4 and IPv6 addresses
  # the versions of ip_a and ip_b must be the same
  # the mask must be valid for the IP version
  @staticmethod
  def netblocks_equal(ip_a, ip_b, mask_bits):
    if ip_a is None or ip_b is None:
      return False
    ip_a = Candidate.strip_ipv6_brackets(ip_a)
    ip_b = Candidate.strip_ipv6_brackets(ip_b)
    a = ipaddress.ip_address(ip_a)
    b = ipaddress.ip_address(ip_b)
    if a.version != b.version:
      raise Exception('Mismatching IP versions in %s and %s'%(ip_a, ip_b))
    if mask_bits > a.max_prefixlen:
      logging.error('Bad IP mask %d for %s and %s'%(mask_bits, ip_a, ip_b))
      mask_bits = a.max_prefixlen
    if mask_bits < 0:
      logging.error('Bad IP mask %d for %s and %s'%(mask_bits, ip_a, ip_b))
      mask_bits = 0
    a_net = ipaddress.ip_network('%s/%d'%(ip_a, mask_bits), strict=False)
    return b in a_net
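  # Illustrative behaviour (assuming the ipaddress module is available):
  #   netblocks_equal(u'1.2.3.4', u'1.2.3.200', 24)            -> True
  #   netblocks_equal(u'1.2.3.4', u'1.2.4.4', 24)              -> False
  #   netblocks_equal(u'[2001:db8::1]', u'[2001:db8::2]', 64)  -> True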
  # is this fallback's IPv4 address (dirip) in the same netblock as other's
  # IPv4 address?
  # mask_bits is the size of the netblock
  def ipv4_netblocks_equal(self, other, mask_bits):
    return Candidate.netblocks_equal(self.dirip, other.dirip, mask_bits)

  # is this fallback's IPv6 address (ipv6addr) in the same netblock as
  # other's IPv6 address?
  # Returns False if either fallback has no IPv6 address
  # mask_bits is the size of the netblock
  def ipv6_netblocks_equal(self, other, mask_bits):
    if not self.has_ipv6() or not other.has_ipv6():
      return False
    return Candidate.netblocks_equal(self.ipv6addr, other.ipv6addr, mask_bits)

  # is this fallback's IPv4 DirPort the same as other's IPv4 DirPort?
  def dirport_equal(self, other):
    return self.dirport == other.dirport

  # is this fallback's IPv4 ORPort the same as other's IPv4 ORPort?
  def ipv4_orport_equal(self, other):
    return self.orport == other.orport

  # is this fallback's IPv6 ORPort the same as other's IPv6 ORPort?
  # Returns False if either fallback has no IPv6 address
  def ipv6_orport_equal(self, other):
    if not self.has_ipv6() or not other.has_ipv6():
      return False
    return self.ipv6orport == other.ipv6orport

  # does this fallback have the same DirPort, IPv4 ORPort, or
  # IPv6 ORPort as other?
  # Ignores IPv6 ORPort if either fallback has no IPv6 address
  def port_equal(self, other):
    return (self.dirport_equal(other) or self.ipv4_orport_equal(other)
            or self.ipv6_orport_equal(other))

  # return a list containing IPv4 ORPort, DirPort, and IPv6 ORPort (if present)
  def port_list(self):
    ports = [self.dirport, self.orport]
    if self.has_ipv6() and not self.ipv6orport in ports:
      ports.append(self.ipv6orport)
    return ports

  # does this fallback share a port with other, regardless of whether the
  # port types match?
  # For example, if self's IPv4 ORPort is 80 and other's DirPort is 80,
  # return True
  def port_shared(self, other):
    for p in self.port_list():
      if p in other.port_list():
        return True
    return False
  # log how long it takes to download a consensus from dirip:dirport
  # returns True if the download failed, False if it succeeded within max_time
  @staticmethod
  def fallback_consensus_download_speed(dirip, dirport, nickname, fingerprint,
                                        max_time):
    download_failed = False
    # some directory mirrors respond to requests in ways that hang python
    # sockets, which is why we log this line here
    logging.info('Initiating %sconsensus download from %s (%s:%d) %s.',
                 'microdesc ' if DOWNLOAD_MICRODESC_CONSENSUS else '',
                 nickname, dirip, dirport, fingerprint)
    # there appears to be about 1 second of overhead when comparing stem's
    # internal trace time and the elapsed time calculated here
    TIMEOUT_SLOP = 1.0
    start = datetime.datetime.utcnow()
    try:
      consensus = get_consensus(
                    endpoints = [(dirip, dirport)],
                    timeout = (max_time + TIMEOUT_SLOP),
                    validate = True,
                    retries = 0,
                    fall_back_to_authority = False,
                    document_handler = DocumentHandler.BARE_DOCUMENT,
                    microdescriptor = DOWNLOAD_MICRODESC_CONSENSUS
                  ).run()[0]
      end = datetime.datetime.utcnow()
      time_since_expiry = (end - consensus.valid_until).total_seconds()
    except Exception, stem_error:
      end = datetime.datetime.utcnow()
      log_excluded('Unable to retrieve a consensus from %s: %s', nickname,
                   stem_error)
      status = 'error: "%s"' % (stem_error)
      level = logging.WARNING
      download_failed = True
    elapsed = (end - start).total_seconds()
    if download_failed:
      # keep the error failure status, and avoid using the variables
      pass
    elif elapsed > max_time:
      status = 'too slow'
      level = logging.WARNING
      download_failed = True
    elif (time_since_expiry > 0):
      status = 'outdated consensus, expired %ds ago'%(int(time_since_expiry))
      if time_since_expiry <= CONSENSUS_EXPIRY_TOLERANCE:
        status += ', tolerating up to %ds'%(CONSENSUS_EXPIRY_TOLERANCE)
        level = logging.INFO
      else:
        status += ', invalid'
        level = logging.WARNING
        download_failed = True
    else:
      status = 'ok'
      level = logging.DEBUG
    logging.log(level, 'Consensus download: %0.1fs %s from %s (%s:%d) %s, ' +
                'max download time %0.1fs.', elapsed, status, nickname,
                dirip, dirport, fingerprint, max_time)
    return download_failed
  # does this fallback download the consensus fast enough?
  def check_fallback_download_consensus(self):
    # include the relay if we're not doing a check, or we can't check (IPv6)
    ipv4_failed = False
    ipv6_failed = False
    if PERFORM_IPV4_DIRPORT_CHECKS:
      ipv4_failed = Candidate.fallback_consensus_download_speed(self.dirip,
                                                self.dirport,
                                                self._data['nickname'],
                                                self._fpr,
                                                CONSENSUS_DOWNLOAD_SPEED_MAX)
    if self.has_ipv6() and PERFORM_IPV6_DIRPORT_CHECKS:
      # Clients assume the IPv6 DirPort is the same as the IPv4 DirPort
      ipv6_failed = Candidate.fallback_consensus_download_speed(self.ipv6addr,
                                                self.dirport,
                                                self._data['nickname'],
                                                self._fpr,
                                                CONSENSUS_DOWNLOAD_SPEED_MAX)
    return ((not ipv4_failed) and (not ipv6_failed))

  # if this fallback has not passed a download check, try it again,
  # and record the result, available in get_fallback_download_consensus
  def try_fallback_download_consensus(self):
    if not self.get_fallback_download_consensus():
      self._data['download_check'] = self.check_fallback_download_consensus()

  # did this fallback pass the download check?
  def get_fallback_download_consensus(self):
    # if we're not performing checks, return True
    if not PERFORM_IPV4_DIRPORT_CHECKS and not PERFORM_IPV6_DIRPORT_CHECKS:
      return True
    # if we are performing checks, but haven't done one, return False
    if not self._data.has_key('download_check'):
      return False
    return self._data['download_check']
  # output an optional header comment and info for this fallback
  # try_fallback_download_consensus before calling this
  def fallbackdir_line(self, fallbacks, prefilter_fallbacks):
    s = ''
    if OUTPUT_COMMENTS:
      s += self.fallbackdir_comment(fallbacks, prefilter_fallbacks)
    # if the download speed is ok, output a C string
    # if it's not, but we OUTPUT_COMMENTS, output a commented-out C string
    if self.get_fallback_download_consensus() or OUTPUT_COMMENTS:
      s += self.fallbackdir_info(self.get_fallback_download_consensus())
    return s

  # output a header comment for this fallback
  def fallbackdir_comment(self, fallbacks, prefilter_fallbacks):
    # /*
    # nickname
    # flags
    # adjusted bandwidth, consensus weight
    # [contact]
    # [identical contact counts]
    # */
    # Multiline C comment
    s = '/*'
    s += '\n'
    s += cleanse_c_multiline_comment(self._data['nickname'])
    s += '\n'
    s += 'Flags: '
    s += cleanse_c_multiline_comment(' '.join(sorted(self._data['flags'])))
    s += '\n'
    # this is an adjusted bandwidth, see calculate_measured_bandwidth()
    bandwidth = self._data['measured_bandwidth']
    weight = self._data['consensus_weight']
    s += 'Bandwidth: %.1f MByte/s, Consensus Weight: %d'%(
           bandwidth/(1024.0*1024.0),
           weight)
    s += '\n'
    if self._data['contact'] is not None:
      s += cleanse_c_multiline_comment(self._data['contact'])
      if CONTACT_COUNT:
        fallback_count = len([f for f in fallbacks
                              if f._data['contact'] == self._data['contact']])
        if fallback_count > 1:
          s += '\n'
          s += '%d identical contacts listed' % (fallback_count)
    # close the multiline C comment and return the header
    s += '\n'
    s += '*/'
    s += '\n'
    return s
  # output the fallback info C string for this fallback
  # this is the text that would go after FallbackDir in a torrc
  # if this relay failed the download test and we OUTPUT_COMMENTS,
  # comment-out the returned string
  def fallbackdir_info(self, dl_speed_ok):
    # "address:dirport orport=port id=fingerprint"
    # (insert additional mandatory fields here)
    # "[ipv6=addr:orport]"
    # (insert additional optional fields here)
    # /* nickname=name */
    # /* extrainfo={0,1} */
    # (insert additional comment fields here)
    # /* ===== */
    # ,
    #
    # Do we want a C string, or a commented-out string?
    c_string = dl_speed_ok
    comment_string = not dl_speed_ok and OUTPUT_COMMENTS
    # If we don't want either kind of string, bail
    if not c_string and not comment_string:
      return ''
    s = ''
    # Comment out the fallback directory entry if it's too slow
    # See the debug output for which address and port is failing
    if comment_string:
      s += '/* Consensus download failed or was too slow:\n'
    # Multi-Line C string with trailing comma (part of a string list)
    # This makes it easier to diff the file, and remove IPv6 lines using grep
    # Integers don't need escaping
    s += '"%s orport=%d id=%s"'%(
            cleanse_c_string(self._data['dir_address']),
            self.orport,
            cleanse_c_string(self._fpr))
    s += '\n'
    # (insert additional mandatory fields here)
    if self.has_ipv6():
      s += '" ipv6=%s:%d"'%(cleanse_c_string(self.ipv6addr), self.ipv6orport)
      s += '\n'
    # (insert additional optional fields here)
    if not comment_string:
      s += '/* '
    s += 'nickname=%s'%(cleanse_c_string(self._data['nickname']))
    if not comment_string:
      s += ' */'
    s += '\n'
    # if we know that the fallback is an extrainfo cache, flag it
    # and if we don't know, assume it is not
    if not comment_string:
      s += '/* '
    s += 'extrainfo=%d'%(1 if self._extra_info_cache else 0)
    if not comment_string:
      s += ' */'
    s += '\n'
    # (insert additional comment fields here)
    # The terminator and comma must be the last line in each fallback entry
    if not comment_string:
      s += '/* '
    s += SECTION_SEPARATOR_BASE
    if not comment_string:
      s += ' */'
    s += '\n'
    s += ','
    if comment_string:
      s += '\n'
      s += '*/'
    return s
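  # In the default (non-comment) output, a passing entry renders roughly as
  # (illustrative address and fingerprint):
  #   "1.2.3.4:80 orport=443 id=0123456789ABCDEF0123456789ABCDEF01234567"
  #   " ipv6=[2001:db8::1]:443"
  #   /* nickname=example */
  #   /* extrainfo=0 */
  #   /* ===== */
  #   ,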
## Fallback Candidate List Class

class CandidateList(dict):
  def __init__(self):
    pass

  def _add_relay(self, details):
    if not 'dir_address' in details: return
    c = Candidate(details)
    self[ c.get_fingerprint() ] = c

  def _add_uptime(self, uptime):
    try:
      fpr = uptime['fingerprint']
    except KeyError:
      raise Exception("Document has no fingerprint field.")

    try:
      c = self[fpr]
    except KeyError:
      logging.debug('Got unknown relay %s in uptime document.'%(fpr,))
      return

    c.add_uptime(uptime)
  def _add_details(self):
    logging.debug('Loading details document.')
    d = fetch('details',
        fields=('fingerprint,nickname,contact,last_changed_address_or_port,' +
                'consensus_weight,advertised_bandwidth,or_addresses,' +
                'dir_address,recommended_version,flags,effective_family,' +
                'platform'))
    logging.debug('Loading details document done.')

    if not 'relays' in d: raise Exception("No relays found in document.")

    for r in d['relays']: self._add_relay(r)

  def _add_uptimes(self):
    logging.debug('Loading uptime document.')
    d = fetch('uptime')
    logging.debug('Loading uptime document done.')

    if not 'relays' in d: raise Exception("No relays found in document.")
    for r in d['relays']: self._add_uptime(r)

  def add_relays(self):
    self._add_details()
    self._add_uptimes()

  def count_guards(self):
    guard_count = 0
    for fpr in self.keys():
      if self[fpr].is_guard():
        guard_count += 1
    return guard_count
  # Find fallbacks that fit the uptime, stability, and flags criteria,
  # and make an array of them in self.fallbacks
  def compute_fallbacks(self):
    self.fallbacks = map(lambda x: self[x],
                         filter(lambda x: self[x].is_candidate(),
                                self.keys()))

  # sort fallbacks by their consensus weight to advertised bandwidth factor,
  # lowest to highest
  # used to find the median cw_to_bw_factor()
  def sort_fallbacks_by_cw_to_bw_factor(self):
    self.fallbacks.sort(key=lambda f: f.cw_to_bw_factor())

  # sort fallbacks by their measured bandwidth, highest to lowest
  # calculate_measured_bandwidth before calling this
  # this is useful for reviewing candidates in priority order
  def sort_fallbacks_by_measured_bandwidth(self):
    self.fallbacks.sort(key=lambda f: f._data['measured_bandwidth'],
                        reverse=True)

  # sort fallbacks by the data field data_field, lowest to highest
  def sort_fallbacks_by(self, data_field):
    self.fallbacks.sort(key=lambda f: f._data[data_field])
  @staticmethod
  def load_relaylist(file_obj):
    """ Read each line in the file, and parse it like a FallbackDir line:
        an IPv4 address and optional port:
          <IPv4 address>:<port>
        which are parsed into dictionary entries:
          ipv4=<IPv4 address>
          dirport=<port>
        followed by a series of key=value entries:
          orport=<port>
          id=<fingerprint>
          ipv6=<IPv6 address>:<IPv6 orport>
        each line's key/value pairs are placed in a dictionary,
        (of string -> string key/value pairs),
        and these dictionaries are placed in an array.
        comments start with # and are ignored """
    file_data = file_obj['data']
    file_name = file_obj['name']
    relaylist = []
    if file_data is None:
      return relaylist
    for line in file_data.split('\n'):
      relay_entry = {}
      # ignore comments
      line_comment_split = line.split('#')
      line = line_comment_split[0]
      # cleanup whitespace
      line = cleanse_whitespace(line)
      line = line.strip()
      if len(line) == 0:
        continue
      for item in line.split(' '):
        item = item.strip()
        if len(item) == 0:
          continue
        key_value_split = item.split('=')
        kvl = len(key_value_split)
        if kvl < 1 or kvl > 2:
          print '#error Bad %s item: %s, format is key=value.'%(
            file_name, item)
        if kvl == 1:
          # assume that entries without a key are the ipv4 address,
          # perhaps with a dirport
          ipv4_maybe_dirport = key_value_split[0]
          ipv4_maybe_dirport_split = ipv4_maybe_dirport.split(':')
          dirl = len(ipv4_maybe_dirport_split)
          if dirl < 1 or dirl > 2:
            print '#error Bad %s IPv4 item: %s, format is ipv4:port.'%(
              file_name, item)
          if dirl >= 1:
            relay_entry['ipv4'] = ipv4_maybe_dirport_split[0]
          if dirl == 2:
            relay_entry['dirport'] = ipv4_maybe_dirport_split[1]
        elif kvl == 2:
          relay_entry[key_value_split[0]] = key_value_split[1]
      relaylist.append(relay_entry)
    return relaylist
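  # A whitelist line like (illustrative values):
  #   1.2.3.4:80 orport=443 id=0123456789ABCDEF0123456789ABCDEF01234567
  # parses to:
  #   {'ipv4': '1.2.3.4', 'dirport': '80', 'orport': '443',
  #    'id': '0123456789ABCDEF0123456789ABCDEF01234567'}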
  # apply the fallback whitelist
  def apply_filter_lists(self, whitelist_obj):
    excluded_count = 0
    logging.debug('Applying whitelist')
    # parse the whitelist
    whitelist = self.load_relaylist(whitelist_obj)
    filtered_fallbacks = []
    for f in self.fallbacks:
      in_whitelist = f.is_in_whitelist(whitelist)
      if in_whitelist:
        # include
        filtered_fallbacks.append(f)
      elif INCLUDE_UNLISTED_ENTRIES:
        # include
        filtered_fallbacks.append(f)
      else:
        # exclude
        excluded_count += 1
        log_excluded('Excluding %s: not in whitelist.',
                     f._fpr)
    self.fallbacks = filtered_fallbacks
    return excluded_count

  @staticmethod
  def summarise_filters(initial_count, excluded_count):
    return '/* Whitelist excluded %d of %d candidates. */'%(
            excluded_count, initial_count)
  # calculate each fallback's measured bandwidth based on the median
  # consensus weight to advertised bandwidth ratio
  def calculate_measured_bandwidth(self):
    self.sort_fallbacks_by_cw_to_bw_factor()
    median_fallback = self.fallback_median(True)
    if median_fallback is not None:
      median_cw_to_bw_factor = median_fallback.cw_to_bw_factor()
    else:
      # this will never be used, because there are no fallbacks
      median_cw_to_bw_factor = None
    for f in self.fallbacks:
      f.set_measured_bandwidth(median_cw_to_bw_factor)
  # remove relays with low measured bandwidth from the fallback list
  # calculate_measured_bandwidth for each relay before calling this
  def remove_low_bandwidth_relays(self):
    if MIN_BANDWIDTH is None:
      return
    above_min_bw_fallbacks = []
    for f in self.fallbacks:
      if f._data['measured_bandwidth'] >= MIN_BANDWIDTH:
        above_min_bw_fallbacks.append(f)
      else:
        # the bandwidth we log here is limited by the relay's consensus weight
        # as well as its advertised bandwidth. See set_measured_bandwidth
        # for details
        log_excluded('%s not a candidate: bandwidth %.1fMByte/s too low, ' +
                     'must be at least %.1fMByte/s', f._fpr,
                     f._data['measured_bandwidth']/(1024.0*1024.0),
                     MIN_BANDWIDTH/(1024.0*1024.0))
    self.fallbacks = above_min_bw_fallbacks
  # the minimum fallback in the list
  # call one of the sort_fallbacks_* functions before calling this
  def fallback_min(self):
    if len(self.fallbacks) > 0:
      return self.fallbacks[-1]
    else:
      return None

  # the median fallback in the list
  # call one of the sort_fallbacks_* functions before calling this
  def fallback_median(self, require_advertised_bandwidth):
    # use the low-median when there are an even number of fallbacks,
    # for consistency with the bandwidth authorities
    if len(self.fallbacks) > 0:
      median_position = (len(self.fallbacks) - 1) / 2
      if not require_advertised_bandwidth:
        return self.fallbacks[median_position]
      # if we need advertised_bandwidth but this relay doesn't have it,
      # move to a fallback with greater consensus weight until we find one
      while not self.fallbacks[median_position]._data['advertised_bandwidth']:
        median_position += 1
        if median_position >= len(self.fallbacks):
          return None
      return self.fallbacks[median_position]
    else:
      return None
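  # For example, with 6 fallbacks, (6 - 1) / 2 is 2 under python 2 integer
  # division, selecting the lower of the two middle elements (indices 2 and
  # 3), which matches the bandwidth authorities' low-median convention.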
1533 # the maximum fallback in the list
1534 # call one of the sort_fallbacks_* functions before calling this
1535 def fallback_max(self):
1536 if len(self.fallbacks) > 0:
1537 return self.fallbacks[0]
1538 else:
1539 return None
1541 # return a new bag suitable for storing attributes
1542 @staticmethod
1543 def attribute_new():
1544 return dict()
1546 # get the count of attribute in attribute_bag
1547 # if attribute is None or the empty string, return 0
1548 @staticmethod
1549 def attribute_count(attribute, attribute_bag):
1550 if attribute is None or attribute == '':
1551 return 0
1552 if attribute not in attribute_bag:
1553 return 0
1554 return attribute_bag[attribute]
1556 # does attribute_bag contain max_count or more instances of attribute?
1557 # if so, return False
1558 # if not, return True
1559 # if attribute is None or the empty string, or max_count is invalid,
1560 # always return True
1561 @staticmethod
1562 def attribute_allow(attribute, attribute_bag, max_count=1):
1563 if attribute is None or attribute == '' or max_count <= 0:
1564 return True
1565 elif CandidateList.attribute_count(attribute, attribute_bag) >= max_count:
1566 return False
1567 else:
1568 return True
1570 # add attribute to attribute_bag, incrementing the count if it is already
1571 # present
1572 # if attribute is None or the empty string, or count is invalid,
1573 # do nothing
1574 @staticmethod
1575 def attribute_add(attribute, attribute_bag, count=1):
1576 if attribute is None or attribute == '' or count <= 0:
1577 return
1578 attribute_bag.setdefault(attribute, 0)
1579 attribute_bag[attribute] += count
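# a usage sketch of the attribute bag helpers above, with a hypothetical
# address and the default max_count of 1:
#   bag = CandidateList.attribute_new()               # {}
#   CandidateList.attribute_allow('203.0.113.1', bag) # True: count 0 < 1
#   CandidateList.attribute_add('203.0.113.1', bag)   # {'203.0.113.1': 1}
#   CandidateList.attribute_allow('203.0.113.1', bag) # False: count 1 >= 1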
1581 # make sure there are only MAX_FALLBACKS_PER_IP fallbacks per IPv4 address,
1582 # and per IPv6 address
1583 # there is only one IPv4 address on each fallback: the IPv4 DirPort address
1584 # (we choose the IPv4 ORPort which is on the same IPv4 as the DirPort)
1585 # there is at most one IPv6 address on each fallback: the IPv6 ORPort address
1586 # we try to match the IPv4 ORPort, but will use any IPv6 address if needed
1587 # (clients only use the IPv6 ORPort)
1588 # if there is no IPv6 address, only the IPv4 address is checked
1589 # return the number of candidates we excluded
1590 def limit_fallbacks_same_ip(self):
1591 ip_limit_fallbacks = []
1592 ip_list = CandidateList.attribute_new()
1593 for f in self.fallbacks:
1594 if (CandidateList.attribute_allow(f.dirip, ip_list,
1595 MAX_FALLBACKS_PER_IPV4)
1596 and CandidateList.attribute_allow(f.ipv6addr, ip_list,
1597 MAX_FALLBACKS_PER_IPV6)):
1598 ip_limit_fallbacks.append(f)
1599 CandidateList.attribute_add(f.dirip, ip_list)
1600 if f.has_ipv6():
1601 CandidateList.attribute_add(f.ipv6addr, ip_list)
1602 elif not CandidateList.attribute_allow(f.dirip, ip_list,
1603 MAX_FALLBACKS_PER_IPV4):
1604 log_excluded('Eliminated %s: already have %d fallback(s) on IPv4 %s'
1605 %(f._fpr, CandidateList.attribute_count(f.dirip, ip_list),
1606 f.dirip))
1607 elif (f.has_ipv6() and
1608 not CandidateList.attribute_allow(f.ipv6addr, ip_list,
1609 MAX_FALLBACKS_PER_IPV6)):
1610 log_excluded('Eliminated %s: already have %d fallback(s) on IPv6 %s'
1611 %(f._fpr, CandidateList.attribute_count(f.ipv6addr,
1612 ip_list),
1613 f.ipv6addr))
1614 original_count = len(self.fallbacks)
1615 self.fallbacks = ip_limit_fallbacks
1616 return original_count - len(self.fallbacks)
1618 # make sure there are only MAX_FALLBACKS_PER_CONTACT fallbacks for each
1619 # ContactInfo
1620 # if there is no ContactInfo, allow the fallback
1621 # this check can be gamed by providing no ContactInfo, or by setting the
1622 # ContactInfo to match another fallback
1623 # However, given the likelihood that relays with the same ContactInfo will
1624 # go down at similar times, its usefulness outweighs the risk
1625 def limit_fallbacks_same_contact(self):
1626 contact_limit_fallbacks = []
1627 contact_list = CandidateList.attribute_new()
1628 for f in self.fallbacks:
1629 if CandidateList.attribute_allow(f._data['contact'], contact_list,
1630 MAX_FALLBACKS_PER_CONTACT):
1631 contact_limit_fallbacks.append(f)
1632 CandidateList.attribute_add(f._data['contact'], contact_list)
1633 else:
1634 log_excluded(
1635 'Eliminated %s: already have %d fallback(s) on ContactInfo %s'
1636 %(f._fpr, CandidateList.attribute_count(f._data['contact'],
1637 contact_list),
1638 f._data['contact']))
1639 original_count = len(self.fallbacks)
1640 self.fallbacks = contact_limit_fallbacks
1641 return original_count - len(self.fallbacks)
1643 # make sure there are only MAX_FALLBACKS_PER_FAMILY fallbacks per effective
1644 # family
1645 # if there is no family, allow the fallback
1646 # we use effective family, which ensures mutual family declarations
1647 # but the check can be gamed by not declaring a family at all
1648 # if any indirect families exist, the result depends on the order in which
1649 # fallbacks are sorted in the list
1650 def limit_fallbacks_same_family(self):
1651 family_limit_fallbacks = []
1652 fingerprint_list = CandidateList.attribute_new()
1653 for f in self.fallbacks:
1654 if CandidateList.attribute_allow(f._fpr, fingerprint_list,
1655 MAX_FALLBACKS_PER_FAMILY):
1656 family_limit_fallbacks.append(f)
1657 CandidateList.attribute_add(f._fpr, fingerprint_list)
1658 for family_fingerprint in f._data['effective_family']:
1659 CandidateList.attribute_add(family_fingerprint, fingerprint_list)
1660 else:
1661 # we already have an accepted fallback that lists this relay in its
1662 # effective family
1663 log_excluded(
1664 'Eliminated %s: already have %d fallback(s) in effective family'
1665 %(f._fpr, CandidateList.attribute_count(f._fpr, fingerprint_list)))
1666 original_count = len(self.fallbacks)
1667 self.fallbacks = family_limit_fallbacks
1668 return original_count - len(self.fallbacks)
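# for example, assuming MAX_FALLBACKS_PER_FAMILY is 1: if A's effective
# family lists B, and B's lists A and C, but A's does not list C, then
# processing A first accepts A, blocks B, and still accepts C; processing
# B first accepts B and blocks both A and C, hence the order-dependence
# noted above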
1670 # try once to get the descriptors for fingerprint_list using stem
1671 # returns an empty list on exception
1672 @staticmethod
1673 def get_fallback_descriptors_once(fingerprint_list):
1674 desc_list = get_server_descriptors(fingerprints=fingerprint_list).run(suppress=True)
1675 return desc_list
1677 # try up to max_retries times to get the descriptors for fingerprint_list
1678 # using stem. Stops retrying when all descriptors have been retrieved.
1679 # returns a list containing the descriptors that were retrieved
1680 @staticmethod
1681 def get_fallback_descriptors(fingerprint_list, max_retries=5):
1682 # we can't use stem's retries=, because we want to support more than 96
1683 # descriptors
1685 # add an attempt for every MAX_FINGERPRINTS (or part thereof) in the list
1686 max_retries += (len(fingerprint_list) + MAX_FINGERPRINTS - 1) / MAX_FINGERPRINTS
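# e.g. assuming stem's MAX_FINGERPRINTS is 96, a 200-fingerprint list adds
# (200 + 96 - 1) / 96 = 3 attempts, for 5 + 3 = 8 attempts in total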
1687 remaining_list = list(fingerprint_list) # copy, so we don't mutate the caller's list
1688 desc_list = []
1689 for _ in xrange(max_retries):
1690 if len(remaining_list) == 0:
1691 break
1692 new_desc_list = CandidateList.get_fallback_descriptors_once(remaining_list[0:MAX_FINGERPRINTS])
1693 for d in new_desc_list:
1694 try:
1695 remaining_list.remove(d.fingerprint)
1696 except ValueError:
1697 # warn and ignore if a directory mirror returned a bad descriptor
1698 logging.warning("Directory mirror returned unwanted descriptor %s, ignoring",
1699 d.fingerprint)
1700 continue
1701 desc_list.append(d)
1702 return desc_list
1704 # find the fallbacks that cache extra-info documents
1705 # Onionoo doesn't know this, so we have to use stem
1706 def mark_extra_info_caches(self):
1707 fingerprint_list = [ f._fpr for f in self.fallbacks ]
1708 logging.info("Downloading fallback descriptors to find extra-info caches")
1709 desc_list = CandidateList.get_fallback_descriptors(fingerprint_list)
1710 for d in desc_list:
1711 self[d.fingerprint]._extra_info_cache = d.extra_info_cache
1712 missing_descriptor_list = [ f._fpr for f in self.fallbacks
1713 if f._extra_info_cache is None ]
1714 for f in missing_descriptor_list:
1715 logging.warning("No descriptor for {}. Assuming extrainfo=0.".format(f))
1717 # try a download check on each fallback candidate in order
1718 # stop after max_count successful downloads
1719 # but don't remove any candidates from the array
1720 def try_download_consensus_checks(self, max_count):
1721 dl_ok_count = 0
1722 for f in self.fallbacks:
1723 f.try_fallback_download_consensus()
1724 if f.get_fallback_download_consensus():
1725 # this fallback downloaded a consensus ok
1726 dl_ok_count += 1
1727 if dl_ok_count >= max_count:
1728 # we have enough fallbacks
1729 return
1731 # put max_count successful candidates in the fallbacks array:
1732 # - perform download checks on each fallback candidate
1733 # - retry failed candidates if CONSENSUS_DOWNLOAD_RETRY is set
1734 # - eliminate failed candidates
1735 # - if there are more than max_count candidates, eliminate lowest bandwidth
1736 # - if there are fewer than max_count candidates, leave only successful
1737 # Return the number of fallbacks that failed the consensus check
1738 def perform_download_consensus_checks(self, max_count):
1739 self.sort_fallbacks_by_measured_bandwidth()
1740 self.try_download_consensus_checks(max_count)
1741 if CONSENSUS_DOWNLOAD_RETRY:
1742 # try unsuccessful candidates again
1743 # we could end up with more than max_count successful candidates here
1744 self.try_download_consensus_checks(max_count)
1745 # now we have at least max_count successful candidates,
1746 # or we've tried them all
1747 original_count = len(self.fallbacks)
1748 self.fallbacks = filter(lambda x: x.get_fallback_download_consensus(),
1749 self.fallbacks)
1750 # some of these failed the check; others skipped the check because
1751 # we already had enough successful downloads
1752 failed_count = original_count - len(self.fallbacks)
1753 self.fallbacks = self.fallbacks[:max_count]
1754 return failed_count
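# for example, with 210 candidates and max_count = 200: if 205 end up with
# a successful consensus download, failed_count = 210 - 205 = 5 (failures,
# plus any candidates skipped once enough had succeeded), and the list is
# truncated to the 200 highest-bandwidth successes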
1756 # return a string that describes a/b as a percentage
1757 @staticmethod
1758 def describe_percentage(a, b):
1759 if b != 0:
1760 return '%d/%d = %.0f%%'%(a, b, (a*100.0)/b)
1761 else:
1762 # technically, 0/0 is undefined, but 0.0% is a sensible result
1763 return '%d/%d = %.0f%%'%(a, b, 0.0)
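# e.g. describe_percentage(25, 100) returns '25/100 = 25%', and
# describe_percentage(0, 0) returns '0/0 = 0%'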
1765 # return a dictionary of lists of fallbacks by IPv4 netblock
1766 # the dictionary is keyed by the fingerprint of an arbitrary fallback
1767 # in each netblock
1768 # mask_bits is the size of the netblock
1769 def fallbacks_by_ipv4_netblock(self, mask_bits):
1770 netblocks = {}
1771 for f in self.fallbacks:
1772 found_netblock = False
1773 for b in netblocks.keys():
1774 # we found an existing netblock containing this fallback
1775 if f.ipv4_netblocks_equal(self[b], mask_bits):
1776 # add it to the list
1777 netblocks[b].append(f)
1778 found_netblock = True
1779 break
1780 # make a new netblock based on this fallback's fingerprint
1781 if not found_netblock:
1782 netblocks[f._fpr] = [f]
1783 return netblocks
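# a sketch, assuming ipv4_netblocks_equal compares both addresses masked to
# mask_bits: with mask_bits = 24, fallbacks at 192.0.2.10 and 192.0.2.200
# share a /24, so both land in one list, keyed by the fingerprint of
# whichever fallback was seen first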
1785 # return a dictionary of lists of fallbacks by IPv6 netblock
1786 # where mask_bits is the size of the netblock
1787 def fallbacks_by_ipv6_netblock(self, mask_bits):
1788 netblocks = {}
1789 for f in self.fallbacks:
1790 # skip fallbacks without IPv6 addresses
1791 if not f.has_ipv6():
1792 continue
1793 found_netblock = False
1794 for b in netblocks.keys():
1795 # we found an existing netblock containing this fallback
1796 if f.ipv6_netblocks_equal(self[b], mask_bits):
1797 # add it to the list
1798 netblocks[b].append(f)
1799 found_netblock = True
1800 break
1801 # make a new netblock based on this fallback's fingerprint
1802 if not found_netblock:
1803 netblocks[f._fpr] = [f]
1804 return netblocks
1806 # log a message about the proportion of fallbacks in each IPv4 netblock,
1807 # where mask_bits is the size of the netblock
1808 def describe_fallback_ipv4_netblock_mask(self, mask_bits):
1809 fallback_count = len(self.fallbacks)
1810 shared_netblock_fallback_count = 0
1811 most_frequent_netblock = None
1812 netblocks = self.fallbacks_by_ipv4_netblock(mask_bits)
1813 for b in netblocks.keys():
1814 if len(netblocks[b]) > 1:
1815 # how many fallbacks are in a netblock with other fallbacks?
1816 shared_netblock_fallback_count += len(netblocks[b])
1817 # what's the netblock with the most fallbacks?
1818 if (most_frequent_netblock is None
1819 or len(netblocks[b]) > len(netblocks[most_frequent_netblock])):
1820 most_frequent_netblock = b
1821 logging.debug('Fallback IPv4 addresses in the same /%d:'%(mask_bits))
1822 for f in netblocks[b]:
1823 logging.debug('%s - %s', f.dirip, f._fpr)
1824 if most_frequent_netblock is not None:
1825 logging.warning('There are %s fallbacks in the IPv4 /%d containing %s'%(
1826 CandidateList.describe_percentage(
1827 len(netblocks[most_frequent_netblock]),
1828 fallback_count),
1829 mask_bits,
1830 self[most_frequent_netblock].dirip))
1831 if shared_netblock_fallback_count > 0:
1832 logging.warning(('%s of fallbacks are in an IPv4 /%d with other ' +
1833 'fallbacks')%(CandidateList.describe_percentage(
1834 shared_netblock_fallback_count,
1835 fallback_count),
1836 mask_bits))
1838 # log a message about the proportion of fallbacks in each IPv6 netblock,
1839 # where mask_bits is the size of the netblock
1840 def describe_fallback_ipv6_netblock_mask(self, mask_bits):
1841 fallback_count = len(self.fallbacks_with_ipv6())
1842 shared_netblock_fallback_count = 0
1843 most_frequent_netblock = None
1844 netblocks = self.fallbacks_by_ipv6_netblock(mask_bits)
1845 for b in netblocks.keys():
1846 if len(netblocks[b]) > 1:
1847 # how many fallbacks are in a netblock with other fallbacks?
1848 shared_netblock_fallback_count += len(netblocks[b])
1849 # what's the netblock with the most fallbacks?
1850 if (most_frequent_netblock is None
1851 or len(netblocks[b]) > len(netblocks[most_frequent_netblock])):
1852 most_frequent_netblock = b
1853 logging.debug('Fallback IPv6 addresses in the same /%d:'%(mask_bits))
1854 for f in netblocks[b]:
1855 logging.debug('%s - %s', f.ipv6addr, f._fpr)
1856 if most_frequent_netblock is not None:
1857 logging.warning('There are %s fallbacks in the IPv6 /%d containing %s'%(
1858 CandidateList.describe_percentage(
1859 len(netblocks[most_frequent_netblock]),
1860 fallback_count),
1861 mask_bits,
1862 self[most_frequent_netblock].ipv6addr))
1863 if shared_netblock_fallback_count > 0:
1864 logging.warning(('%s of fallbacks are in an IPv6 /%d with other ' +
1865 'fallbacks')%(CandidateList.describe_percentage(
1866 shared_netblock_fallback_count,
1867 fallback_count),
1868 mask_bits))
1870 # log a message about the proportion of fallbacks in each IPv4 /8, /16,
1871 # and /24
1872 def describe_fallback_ipv4_netblocks(self):
1873 # this doesn't actually tell us anything useful
1874 #self.describe_fallback_ipv4_netblock_mask(8)
1875 self.describe_fallback_ipv4_netblock_mask(16)
1876 #self.describe_fallback_ipv4_netblock_mask(24)
1878 # log a message about the proportion of fallbacks in each IPv6 /12 (RIR),
1879 # /23 (smaller RIR blocks), /32 (LIR), /48 (Customer), and /64 (Host)
1880 # https://www.iana.org/assignments/ipv6-unicast-address-assignments/
1881 def describe_fallback_ipv6_netblocks(self):
1882 # these don't actually tell us anything useful
1883 #self.describe_fallback_ipv6_netblock_mask(12)
1884 #self.describe_fallback_ipv6_netblock_mask(23)
1885 self.describe_fallback_ipv6_netblock_mask(32)
1886 #self.describe_fallback_ipv6_netblock_mask(48)
1887 self.describe_fallback_ipv6_netblock_mask(64)
1889 # log a message about the proportion of fallbacks in each IPv4 and IPv6
1890 # netblock
1891 def describe_fallback_netblocks(self):
1892 self.describe_fallback_ipv4_netblocks()
1893 self.describe_fallback_ipv6_netblocks()
1895 # return a list of fallbacks which are on the IPv4 ORPort port
1896 def fallbacks_on_ipv4_orport(self, port):
1897 return filter(lambda x: x.orport == port, self.fallbacks)
1899 # return a list of fallbacks which are on the IPv6 ORPort port
1900 def fallbacks_on_ipv6_orport(self, port):
1901 return filter(lambda x: x.ipv6orport == port, self.fallbacks_with_ipv6())
1903 # return a list of fallbacks which are on the DirPort port
1904 def fallbacks_on_dirport(self, port):
1905 return filter(lambda x: x.dirport == port, self.fallbacks)
1907 # log a message about the proportion of fallbacks on IPv4 ORPort port
1908 # and return that count
1909 def describe_fallback_ipv4_orport(self, port):
1910 port_count = len(self.fallbacks_on_ipv4_orport(port))
1911 fallback_count = len(self.fallbacks)
1912 logging.warning('%s of fallbacks are on IPv4 ORPort %d'%(
1913 CandidateList.describe_percentage(port_count,
1914 fallback_count),
1915 port))
1916 return port_count
1918 # log a message about the proportion of IPv6 fallbacks on IPv6 ORPort port
1919 # and return that count
1920 def describe_fallback_ipv6_orport(self, port):
1921 port_count = len(self.fallbacks_on_ipv6_orport(port))
1922 fallback_count = len(self.fallbacks_with_ipv6())
1923 logging.warning('%s of IPv6 fallbacks are on IPv6 ORPort %d'%(
1924 CandidateList.describe_percentage(port_count,
1925 fallback_count),
1926 port))
1927 return port_count
1929 # log a message about the proportion of fallbacks on DirPort port
1930 # and return that count
1931 def describe_fallback_dirport(self, port):
1932 port_count = len(self.fallbacks_on_dirport(port))
1933 fallback_count = len(self.fallbacks)
1934 logging.warning('%s of fallbacks are on DirPort %d'%(
1935 CandidateList.describe_percentage(port_count,
1936 fallback_count),
1937 port))
1938 return port_count
1940 # log a message about the proportion of fallbacks on each dirport,
1941 # each IPv4 orport, and each IPv6 orport
1942 def describe_fallback_ports(self):
1943 fallback_count = len(self.fallbacks)
1944 ipv4_or_count = fallback_count
1945 ipv4_or_count -= self.describe_fallback_ipv4_orport(443)
1946 ipv4_or_count -= self.describe_fallback_ipv4_orport(9001)
1947 logging.warning('%s of fallbacks are on other IPv4 ORPorts'%(
1948 CandidateList.describe_percentage(ipv4_or_count,
1949 fallback_count)))
1950 ipv6_fallback_count = len(self.fallbacks_with_ipv6())
1951 ipv6_or_count = ipv6_fallback_count
1952 ipv6_or_count -= self.describe_fallback_ipv6_orport(443)
1953 ipv6_or_count -= self.describe_fallback_ipv6_orport(9001)
1954 logging.warning('%s of IPv6 fallbacks are on other IPv6 ORPorts'%(
1955 CandidateList.describe_percentage(ipv6_or_count,
1956 ipv6_fallback_count)))
1957 dir_count = fallback_count
1958 dir_count -= self.describe_fallback_dirport(80)
1959 dir_count -= self.describe_fallback_dirport(9030)
1960 logging.warning('%s of fallbacks are on other DirPorts'%(
1961 CandidateList.describe_percentage(dir_count,
1962 fallback_count)))
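# for example, with 100 fallbacks, 60 on IPv4 ORPort 443 and 30 on IPv4
# ORPort 9001, the remaining 100 - 60 - 30 = 10 are reported as
# "10/100 = 10% of fallbacks are on other IPv4 ORPorts"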
1964 # return a list of fallbacks which cache extra-info documents
1965 def fallbacks_with_extra_info_cache(self):
1966 return filter(lambda x: x._extra_info_cache, self.fallbacks)
1968 # log a message about the proportion of fallbacks that cache extra-info docs
1969 def describe_fallback_extra_info_caches(self):
1970 extra_info_fallback_count = len(self.fallbacks_with_extra_info_cache())
1971 fallback_count = len(self.fallbacks)
1972 logging.warning('%s of fallbacks cache extra-info documents'%(
1973 CandidateList.describe_percentage(extra_info_fallback_count,
1974 fallback_count)))
1976 # return a list of fallbacks which have the Exit flag
1977 def fallbacks_with_exit(self):
1978 return filter(lambda x: x.is_exit(), self.fallbacks)
1980 # log a message about the proportion of fallbacks with an Exit flag
1981 def describe_fallback_exit_flag(self):
1982 exit_fallback_count = len(self.fallbacks_with_exit())
1983 fallback_count = len(self.fallbacks)
1984 logging.warning('%s of fallbacks have the Exit flag'%(
1985 CandidateList.describe_percentage(exit_fallback_count,
1986 fallback_count)))
1988 # return a list of fallbacks which have an IPv6 address
1989 def fallbacks_with_ipv6(self):
1990 return filter(lambda x: x.has_ipv6(), self.fallbacks)
1992 # log a message about the proportion of fallbacks on IPv6
1993 def describe_fallback_ip_family(self):
1994 ipv6_fallback_count = len(self.fallbacks_with_ipv6())
1995 fallback_count = len(self.fallbacks)
1996 logging.warning('%s of fallbacks are on IPv6'%(
1997 CandidateList.describe_percentage(ipv6_fallback_count,
1998 fallback_count)))
2000 def summarise_fallbacks(self, eligible_count, operator_count, failed_count,
2001 guard_count, target_count):
2002 s = ''
2003 # Report:
2004 # whether we checked consensus download times
2005 # the number of fallback directories (and limits/exclusions, if relevant)
2006 # min & max fallback bandwidths
2007 # #error if below minimum count
2008 if PERFORM_IPV4_DIRPORT_CHECKS or PERFORM_IPV6_DIRPORT_CHECKS:
2009 s += '/* Checked %s%s%s DirPorts served a consensus within %.1fs. */'%(
2010 'IPv4' if PERFORM_IPV4_DIRPORT_CHECKS else '',
2011 ' and ' if (PERFORM_IPV4_DIRPORT_CHECKS
2012 and PERFORM_IPV6_DIRPORT_CHECKS) else '',
2013 'IPv6' if PERFORM_IPV6_DIRPORT_CHECKS else '',
2014 CONSENSUS_DOWNLOAD_SPEED_MAX)
2015 else:
2016 s += '/* Did not check IPv4 or IPv6 DirPort consensus downloads. */'
2017 s += '\n'
2018 # Multiline C comment with #error if things go bad
2019 s += '/*'
2020 s += '\n'
2021 # Integers don't need escaping in C comments
2022 fallback_count = len(self.fallbacks)
2023 if FALLBACK_PROPORTION_OF_GUARDS is None:
2024 fallback_proportion = ''
2025 else:
2026 fallback_proportion = ', Target %d (%d * %.2f)'%(target_count,
2027 guard_count,
2028 FALLBACK_PROPORTION_OF_GUARDS)
2029 s += 'Final Count: %d (Eligible %d%s'%(fallback_count, eligible_count,
2030 fallback_proportion)
2031 if MAX_FALLBACK_COUNT is not None:
2032 s += ', Max %d'%(MAX_FALLBACK_COUNT)
2033 s += ')\n'
2034 if eligible_count != fallback_count:
2035 removed_count = eligible_count - fallback_count
2036 excess_to_target_or_max = (eligible_count - operator_count - failed_count
2037 - fallback_count)
2038 # some 'Failed' failed the check; others 'Skipped' the check because
2039 # we already had enough successful downloads
2040 s += ('Excluded: %d (Same Operator %d, Failed/Skipped Download %d, ' +
2041 'Excess %d)')%(removed_count, operator_count, failed_count,
2042 excess_to_target_or_max)
2043 s += '\n'
2044 min_fb = self.fallback_min()
2045 min_bw = min_fb._data['measured_bandwidth']
2046 max_fb = self.fallback_max()
2047 max_bw = max_fb._data['measured_bandwidth']
2048 s += 'Bandwidth Range: %.1f - %.1f MByte/s'%(min_bw/(1024.0*1024.0),
2049 max_bw/(1024.0*1024.0))
2050 s += '\n'
2051 s += '*/'
2052 if fallback_count < MIN_FALLBACK_COUNT:
2053 # We must have a minimum number of fallbacks so they are always
2054 # reachable, and are in diverse locations
2055 s += '\n'
2056 s += '#error Fallback Count %d is too low. '%(fallback_count)
2057 s += 'Must be at least %d for diversity. '%(MIN_FALLBACK_COUNT)
2058 s += 'Try adding entries to the whitelist, '
2059 s += 'or setting INCLUDE_UNLISTED_ENTRIES = True.'
2060 return s
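# a sketch of the summary this returns, with hypothetical counts and
# constants:
#   /* Checked IPv4 DirPorts served a consensus within 15.0s. */
#   /*
#   Final Count: 150 (Eligible 200, Target 160 (800 * 0.20), Max 200)
#   Excluded: 50 (Same Operator 20, Failed/Skipped Download 25, Excess 5)
#   Bandwidth Range: 1.2 - 107.3 MByte/s
#   */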
2062 def process_existing():
2063 logging.basicConfig(level=logging.INFO)
2064 logging.getLogger('stem').setLevel(logging.INFO)
2065 whitelist = {'data': parse_fallback_file(FALLBACK_FILE_NAME),
2066 'name': FALLBACK_FILE_NAME}
2067 list_fallbacks(whitelist)
2069 def process_default():
2070 logging.basicConfig(level=logging.WARNING)
2071 logging.getLogger('stem').setLevel(logging.WARNING)
2072 whitelist = {'data': read_from_file(WHITELIST_FILE_NAME, MAX_LIST_FILE_SIZE),
2073 'name': WHITELIST_FILE_NAME}
2074 list_fallbacks(whitelist)
2076 ## Main Function
2077 def main():
2078 if get_command() == 'check_existing':
2079 process_existing()
2080 else:
2081 process_default()
2083 def get_command():
2084 if len(sys.argv) == 2:
2085 return sys.argv[1]
2086 else:
2087 return None
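# e.g. running "updateFallbackDirs.py check_existing" makes get_command()
# return 'check_existing', so process_existing() runs; with no argument it
# returns None and process_default() runs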
2089 def log_excluded(msg, *args):
2090 if get_command() == 'check_existing':
2091 logging.warning(msg, *args)
2092 else:
2093 logging.info(msg, *args)
2095 def list_fallbacks(whitelist):
2096 """ Fetches required onionoo documents and evaluates the
2097 fallback directory criteria for each of the relays """
2099 print "/* type=fallback */"
2100 print ("/* version={} */"
2101 .format(cleanse_c_multiline_comment(FALLBACK_FORMAT_VERSION)))
2102 now = datetime.datetime.utcnow()
2103 timestamp = now.strftime('%Y%m%d%H%M%S')
2104 print ("/* timestamp={} */"
2105 .format(cleanse_c_multiline_comment(timestamp)))
2106 # end the header with a separator, to make it easier for parsers
2107 print SECTION_SEPARATOR_COMMENT
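# with the constants above, the header printed so far looks like this
# (timestamp hypothetical):
#   /* type=fallback */
#   /* version=2.0.0 */
#   /* timestamp=20190226120000 */
#   /* ===== */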
2109 logging.warning('Downloading and parsing Onionoo data. ' +
2110 'This may take some time.')
2111 # find relays that could be fallbacks
2112 candidates = CandidateList()
2113 candidates.add_relays()
2115 # work out how many fallbacks we want
2116 guard_count = candidates.count_guards()
2117 if FALLBACK_PROPORTION_OF_GUARDS is None:
2118 target_count = guard_count
2119 else:
2120 target_count = int(guard_count * FALLBACK_PROPORTION_OF_GUARDS)
2121 # the maximum number of fallbacks is the lesser of:
2122 # - the target fallback count (FALLBACK_PROPORTION_OF_GUARDS * guard count)
2123 # - the maximum fallback count (MAX_FALLBACK_COUNT)
2124 if MAX_FALLBACK_COUNT is None:
2125 max_count = target_count
2126 else:
2127 max_count = min(target_count, MAX_FALLBACK_COUNT)
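# e.g. with hypothetical values guard_count = 800,
# FALLBACK_PROPORTION_OF_GUARDS = 0.2 and MAX_FALLBACK_COUNT = 200:
# target_count = int(800 * 0.2) = 160, max_count = min(160, 200) = 160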
2129 candidates.compute_fallbacks()
2130 prefilter_fallbacks = copy.copy(candidates.fallbacks)
2132 # filter with the whitelist
2133 # if a relay has changed IPv4 address or ports recently, it will be excluded
2134 # as ineligible before we call apply_filter_lists, and so there will be no
2135 # warning that the details have changed from those in the whitelist.
2136 # instead, there will be an info-level log during the eligibility check.
2137 initial_count = len(candidates.fallbacks)
2138 excluded_count = candidates.apply_filter_lists(whitelist)
2139 print candidates.summarise_filters(initial_count, excluded_count)
2140 eligible_count = len(candidates.fallbacks)
2142 # calculate the measured bandwidth of each relay,
2143 # then remove low-bandwidth relays
2144 candidates.calculate_measured_bandwidth()
2145 candidates.remove_low_bandwidth_relays()
2147 # print the raw fallback list
2148 #for x in candidates.fallbacks:
2149 # print x.fallbackdir_line(True)
2150 # print json.dumps(candidates[x]._data, sort_keys=True, indent=4,
2151 # separators=(',', ': '), default=json_util.default)
2153 # impose mandatory conditions here, like one per contact, family, IP
2154 # in measured bandwidth order
2155 candidates.sort_fallbacks_by_measured_bandwidth()
2156 operator_count = 0
2157 # only impose these limits on the final list - operators can nominate
2158 # multiple candidate fallbacks, and then we choose the best set
2159 if not OUTPUT_CANDIDATES:
2160 operator_count += candidates.limit_fallbacks_same_ip()
2161 operator_count += candidates.limit_fallbacks_same_contact()
2162 operator_count += candidates.limit_fallbacks_same_family()
2164 # check if each candidate can serve a consensus
2165 # there's a small risk we've eliminated relays from the same operator that
2166 # can serve a consensus, in favour of one that can't
2167 # but given it takes up to 15 seconds to check each consensus download,
2168 # the risk is worth it
2169 if PERFORM_IPV4_DIRPORT_CHECKS or PERFORM_IPV6_DIRPORT_CHECKS:
2170 logging.warning('Checking consensus download speeds. ' +
2171 'This may take some time.')
2172 failed_count = candidates.perform_download_consensus_checks(max_count)
2174 # work out which fallbacks cache extra-infos
2175 candidates.mark_extra_info_caches()
2177 # analyse and log interesting diversity metrics
2178 # like netblock, ports, exit, IPv4-only
2179 # (we can't easily analyse AS, and it's hard to accurately analyse country)
2180 candidates.describe_fallback_ip_family()
2181 # if we can't import the ipaddress module, we can't do netblock analysis
2182 if HAVE_IPADDRESS:
2183 candidates.describe_fallback_netblocks()
2184 candidates.describe_fallback_ports()
2185 candidates.describe_fallback_extra_info_caches()
2186 candidates.describe_fallback_exit_flag()
2188 # output C comments summarising the fallback selection process
2189 if len(candidates.fallbacks) > 0:
2190 print candidates.summarise_fallbacks(eligible_count, operator_count,
2191 failed_count, guard_count,
2192 target_count)
2193 else:
2194 print '/* No Fallbacks met criteria */'
2196 # output C comments specifying the Onionoo data used to create the list
2197 for s in fetch_source_list():
2198 print describe_fetch_source(s)
2200 # start the list with a separator, to make it easy for parsers
2201 print SECTION_SEPARATOR_COMMENT
2203 # sort the list differently depending on why we've created it:
2204 # if we're outputting the final fallback list, sort by fingerprint
2205 # this makes diffs much more stable
2206 # otherwise, if we're trying to find a bandwidth cutoff, or we want to
2207 # contact operators in priority order, sort by bandwidth (not yet
2208 # implemented)
2209 # otherwise, if we're contacting operators, sort by contact
2210 candidates.sort_fallbacks_by(OUTPUT_SORT_FIELD)
2212 for x in candidates.fallbacks:
2213 print x.fallbackdir_line(candidates.fallbacks, prefilter_fallbacks)
2215 if __name__ == "__main__":
2216 main()