modified: nfig1.py
[GalaxyCodeBases.git] / python / cmdbtools.py
blob860f1bacb88b9d6f7aa69c3f1c9399379280ff8c
1 #!/usr/bin/env python
2 """A command line tools for CMDB variants browser
4 Author: Shujia Huang
5 Date: 2018-11-22
7 """
9 import argparse
10 import os
11 import sys
12 import gzip
13 import json
14 import yaml
16 from datetime import datetime
17 from urllib import urlencode
18 from urllib2 import Request, urlopen, HTTPError
20 if sys.version_info.major != 2:
21 raise Exception('This tool supports only python2')
23 argparser = argparse.ArgumentParser(description='Manage authentication for CMDB API and do querying from command line.')
24 commands = argparser.add_subparsers(dest='command', title='Commands')
26 login_command = commands.add_parser('login', help='Authorize access to CMDB API.')
27 logout_command = commands.add_parser('logout', help='Logout CMDB.')
28 token_command = commands.add_parser('print-access-token', help='Display access token for CMDB API.')
30 login_command.add_argument('-k', '--token', type=str, required=True, dest='token',
31 help='CMDB API access key(Token).')
33 annotate_command = commands.add_parser('annotate', help='Annotate input VCF.',
34 description='Input VCF file. Multi-allelic variant records in input VCF must be '
35 'split into multiple bi-allelic variant records.')
36 annotate_command.add_argument('-i', '--vcffile', metavar='VCF_FILE', type=str, required=True, dest='in_vcffile',
37 help='input VCF file.')
39 query_rs_command = commands.add_parser('query-rs',
40 help='Query variant by variant identifier or by dbSNP rsID.',
41 description='')
42 query_rs_command.add_argument('-r', '--rsid', metavar='rsid', type=str, dest='rsid',
43 help='dbSNP rsID.', default=None)
44 query_rs_command.add_argument('-l', '--list', metavar='File-contain-a-list-of-dbSNP-rsIDs',
45 type=str, dest='list',
46 help='dbSNP rsIDs list in a file. One for each line.'
47 'could be .gz file',
48 default=None)
50 query_variant_command = commands.add_parser('query-variant',
51 help='Query variant by variant identifier or by chromosome name and '
52 'chromosomal position.',
53 description='Query variant by identifier chromosome name and chromosomal '
54 'position.')
55 query_variant_command.add_argument('-c', '--chromosome', metavar='name', type=str, dest='chromosome',
56 help='Chromosome name.', default=None)
57 query_variant_command.add_argument('-p', '--position', metavar='genome-position', type=int, dest='position',
58 help='Genome position.', default=None)
59 query_variant_command.add_argument('-l', '--positions', metavar='File-contain-a-list-of-genome-positions',
60 type=str, dest='positions',
61 help='Genome positions list in a file. One for each line. You can input single '
62 'position by -c and -p or using -l for multiple poisitions in a single file, '
63 'could be .gz file',
64 default=None)
65 # query_variant_command.add_argument('-v', '--variant', metavar='chrom-pos-ref-alt/rs#', type=str, dest='variant_id',
66 # help='Variant identifier CHROM-POS-REF-ALT or rs#.')
67 # query_variant_command.add_argument('-o', '--output', required=False, choices=['json', 'vcf'], default='json',
68 # dest='format', help='Output format.')
70 USER_HOME = os.path.expanduser("~")
71 CMDB_DIR = '.cmdb'
72 CMDB_TOKENSTORE = 'authaccess.yaml'
74 CMDB_DATASET_VERSION = 'CMDB_hg19_v1.0'
75 CMDB_API_VERSION = 'v1.0'
76 CMDB_API_MAIN_URL = 'https://db.cngb.org/cmdb/api/{}'.format(CMDB_API_VERSION)
78 CMDB_VCF_HEADER = [
79 '##fileformat=VCFv4.2',
80 '##FILTER=<ID=LowQual,Description="Low quality">',
81 '##INFO=<ID=CMDB_AN,Number=1,Type=Integer,Description="Number of Alleles in Samples with Coverage from {}">'.format(
82 CMDB_DATASET_VERSION),
83 '##INFO=<ID=CMDB_AC,Number=A,Type=Integer,Description="Alternate Allele Counts in Samples with Coverage from {}">'.format(
84 CMDB_DATASET_VERSION),
85 '##INFO=<ID=CMDB_AF,Number=A,Type=Float,Description="Alternate Allele Frequencies from {}">'.format(CMDB_DATASET_VERSION),
86 '##INFO=<ID=CMDB_FILTER,Number=A,Type=Float,Description="Filter from {}">'.format(CMDB_DATASET_VERSION),
87 '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO'
91 class CMDBException(Exception):
92 def __init__(self, message):
93 self.message = message
95 def __str__(self):
96 return self.message
99 def load_version():
100 if not authaccess_exists():
101 print "No access tokens found. Please login first.\n"
102 return
104 tokenstore = read_tokenstore()
105 return tokenstore["version"]
108 class Requests(object):
109 # this implements the parts we need of the real `Requests` module
110 @staticmethod
111 def get(url, headers={}, params=None):
112 if params:
113 url += '?' + urlencode(params)
115 r = Request(url=url, headers=headers)
116 try:
117 response = urlopen(r)
118 except HTTPError:
119 response = None
121 return _RequestsResponse(response)
123 @staticmethod
124 def post(url, headers={}, data=None):
125 if data is not None and isinstance(data, dict):
126 data = urlencode(data)
128 r = Request(url, headers=headers, data=data)
129 return _RequestsResponse(urlopen(r))
132 class _RequestsResponse(object):
133 def __init__(self, response):
135 if response:
136 self.status_code = response.getcode()
137 self._json = json.load(response.fp)
138 else:
139 self.status_code = 404
140 self._json = None
142 def json(self):
143 return self._json
146 class _RequestsExceptions(object):
147 pass
150 Requests.exceptions = _RequestsExceptions
151 Requests.exceptions.RequestException = HTTPError
154 def authaccess_exists():
155 return os.path.isfile(os.path.join(USER_HOME, CMDB_DIR, CMDB_TOKENSTORE))
158 def create_tokenstore():
159 p = os.path.join(USER_HOME, CMDB_DIR)
160 if not os.path.isdir(p):
161 os.mkdir(p, 0700)
163 p = os.path.join(p, CMDB_TOKENSTORE)
164 if not os.path.isfile(p):
165 # create file
166 open(p, 'a').close()
167 os.chmod(p, 0600)
170 def read_tokenstore():
171 token_path = os.path.join(USER_HOME, CMDB_DIR, CMDB_TOKENSTORE)
172 with open(token_path, 'r') as I:
173 tokenstore = yaml.load(I)
175 access_token = tokenstore.get('access_token', None)
176 if access_token is None or not isinstance(access_token, basestring):
177 raise CMDBException('Invalid or outdated access token. You may need to run login.')
179 return tokenstore
182 def write_tokenstore(token):
183 file_path = os.path.join(USER_HOME, CMDB_DIR, CMDB_TOKENSTORE)
184 with open(file_path, 'w') as tokenstore:
185 token_obj = {
186 "access_token": token,
187 "version": CMDB_DATASET_VERSION
189 yaml.dump(token_obj, tokenstore)
191 os.chmod(file_path, 0600)
194 def login(token):
195 # Test the token is available or not
196 test_url = "https://db.cngb.org/cmdb/api/v1.0/variant?token={}&type=position&query=chr17-41234470".format(token)
197 cmdb_response = Requests.get(test_url)
199 if cmdb_response.status_code != 201:
200 raise CMDBException('Error while obtaining your token with CMDB API authentication server.'
201 'You may do not have the API access or the token is wrong.\n')
203 if not authaccess_exists():
204 create_tokenstore()
206 write_tokenstore(token)
207 sys.stdout.write("Done.\nYou are signed in now.\n")
209 return
212 def logout():
213 # logout by delete the tokenstore file
214 if not authaccess_exists():
215 sys.stderr.write("Don't find any your access token, no need to logout.\n")
216 return
218 file_path = os.path.join(USER_HOME, CMDB_DIR, CMDB_TOKENSTORE)
219 os.remove(file_path)
220 sys.stdout.write("Done.\nLogout successful.\n")
222 return
225 def print_access_token():
227 if not authaccess_exists():
228 sys.stderr.write('No access tokens found. Print nothing.\n')
229 return
231 tokenstore = read_tokenstore()
232 print (tokenstore['access_token'])
235 def _query_paged(headers, url):
236 page_no = 1
237 while url:
238 cmdb_response = Requests.get(url, headers=headers)
239 if cmdb_response.status_code != 200:
240 if cmdb_response.status_code == 400:
241 raise CMDBException(cmdb_response.json().get('error', 'Failed to query data.'))
242 else:
243 cmdb_response.raise_for_status()
245 cmdb_response_data = cmdb_response.json()
246 if cmdb_response_data['format'] == 'vcf' and page_no == 1:
247 for line in cmdb_response_data['meta']:
248 yield line
250 yield cmdb_response_data['header']
252 for item in cmdb_response_data['data']:
253 yield item
255 url = cmdb_response_data['next']
256 page_no += 1
259 def _query_nonpaged(token, url):
260 cmdb_response = Requests.get("{}&token={}".format(url, token))
261 if cmdb_response.status_code != 201:
262 if cmdb_response.status_code == 403:
263 raise CMDBException(cmdb_response.json().get('error', 'Failed to query data.'))
265 elif cmdb_response.status_code == 404:
266 pass
267 else:
268 cmdb_response.raise_for_status()
270 return cmdb_response.json()
273 def query_rs(rsid):
274 if not authaccess_exists():
275 sys.stderr.write('No access tokens found. Please login first.\n')
276 return
278 tokenstore = read_tokenstore()
279 if rsid is None:
280 raise CMDBException('Provide "-r rsid".')
282 query_url = '{}/variant?&type=rs&query={}'.format(CMDB_API_MAIN_URL, rsid)
283 return _query_nonpaged(tokenstore["access_token"], query_url)
286 def query_variant(chromosome, position):
287 if not authaccess_exists():
288 sys.stderr.write('No access tokens found. Please login first.\n')
289 return
291 tokenstore = read_tokenstore()
292 if chromosome is None or position is None:
293 raise CMDBException('Provide both "-c,--chromosome" and "-p,--position".')
295 query_url = '{}/variant?&type=position&query={}-{}'.format(CMDB_API_MAIN_URL, chromosome, position)
296 return _query_nonpaged(tokenstore["access_token"], query_url)
299 def annotate(infile, filter=None):
300 if not authaccess_exists():
301 raise CMDBException('[ERROR] No access tokens found. Please login first.\n')
303 data_version = load_version()
304 with gzip.open(infile) if infile.endswith('.gz') else open(infile) as I:
306 for in_line in I:
308 if in_line.startswith('#'):
309 if in_line.startswith('##'):
310 sys.stdout.write('{}\n'.format(in_line.rstrip()))
312 elif in_line.startswith('#CHROM'):
314 sys.stdout.write(
315 '##INFO=<ID=CMDB_AN,Number=1,Type=Integer,Description="Number of Alleles in Samples with Coverage from {}">\n'.format(
316 data_version))
317 sys.stdout.write(
318 '##INFO=<ID=CMDB_AC,Number=A,Type=Integer,Description="Alternate Allele Counts in Samples with Coverage from {}">\n'.format(
319 data_version))
320 sys.stdout.write(
321 '##INFO=<ID=CMDB_AF,Number=A,Type=Float,Description="Alternate Allele Frequencies from {}">\n'.format(
322 data_version))
323 sys.stdout.write(
324 '##INFO=<ID=CMDB_FILTER,Number=A,Type=Float,Description="Filter from {}">\n'.format(
325 data_version))
326 sys.stdout.write('{}\n'.format(in_line.rstrip()))
328 continue
330 in_fields = in_line.rstrip().split()
331 if 'chr' not in in_fields[0].lower():
332 in_fields[0] = 'chr' + in_fields[0].lower()
334 chromosome = in_fields[0]
335 position = int(in_fields[1])
336 ref = in_fields[3]
337 alt = in_fields[4] # assume bi-allelic
339 cmdb_variant = query_variant(chromosome, position)
340 # Must just be one element in the `list`
341 if cmdb_variant:
342 cmdb_variant = cmdb_variant[0]
344 # ignore `chr` in `variant` which could be compared with variant-id in CMDB
345 variants = ['-'.join([chromosome.split('chr')[-1], str(position), ref, a]).upper()
346 for a in alt.split(',')]
348 if cmdb_variant is None or (cmdb_variant and (cmdb_variant['variant_id'] not in variants)):
349 sys.stdout.write('{}\n'.format('\t'.join(in_fields)))
351 else:
352 new_info = {
353 'CMDB_AN': 'CMDB_AN={}'.format(cmdb_variant['allele_num']),
354 'CMDB_AC': 'CMDB_AC={}'.format(cmdb_variant['allele_count']),
355 'CMDB_AF': 'CMDB_AF={}'.format(cmdb_variant['allele_freq']),
356 'CMDB_FILTER': 'CMDB_FILTER={}'.format(cmdb_variant['filter_status'])
359 info = in_fields[7]
360 if info != '.':
361 for c in info.split(';'):
362 k = c.split('=')[0]
363 if k not in new_info:
364 new_info[k] = c
366 info = ';'.join([new_info[k] for k in sorted(new_info.keys())])
367 if len(in_fields) > 8:
369 sys.stdout.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
370 chromosome, position, in_fields[2], ref, alt, in_fields[5], in_fields[6], info,
371 '\t'.join(in_fields[8:]))
373 else:
374 sys.stdout.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
375 chromosome, position, in_fields[2], ref, alt, in_fields[5], in_fields[6], info)
377 return
380 def run_rs_variant(rsids):
381 if not authaccess_exists():
382 raise CMDBException('[ERROR] No access tokens found. Please login first.\n')
383 sys.stdout.write('%s\n' % '\n'.join(CMDB_VCF_HEADER))
384 for rsid in rsids:
385 variants = query_rs(rsid)
386 if variants is None:
387 continue
388 for cmdb_variant in variants:
389 vcf_line = [
390 cmdb_variant['chrom'],
391 cmdb_variant['pos'],
392 cmdb_variant['rsid'],
393 cmdb_variant['ref'],
394 cmdb_variant['alt'],
395 cmdb_variant['site_quality'],
396 cmdb_variant['filter_status'],
397 'CMDB_AF={},CMDB_AC={},CMDB_AN={}'.format(
398 cmdb_variant['allele_freq'],
399 cmdb_variant['allele_count'],
400 cmdb_variant['allele_num']
403 sys.stdout.write('%s\n' % '\t'.join(map(str, vcf_line)))
406 def run_query_variant(positions):
408 if not authaccess_exists():
409 raise CMDBException('[ERROR] No access tokens found. Please login first.\n')
411 sys.stdout.write('%s\n' % '\n'.join(CMDB_VCF_HEADER))
412 for chromosome, position in positions:
414 variants = query_variant(chromosome, position)
415 if variants is None:
416 continue
418 for cmdb_variant in variants:
419 vcf_line = [
420 cmdb_variant['chrom'],
421 cmdb_variant['pos'],
422 cmdb_variant['rsid'],
423 cmdb_variant['ref'],
424 cmdb_variant['alt'],
425 cmdb_variant['site_quality'],
426 cmdb_variant['filter_status'],
427 'CMDB_AF={},CMDB_AC={},CMDB_AN={}'.format(
428 cmdb_variant['allele_freq'],
429 cmdb_variant['allele_count'],
430 cmdb_variant['allele_num']
433 sys.stdout.write('%s\n' % '\t'.join(map(str, vcf_line)))
436 def main():
437 # entry function
439 START_TIME = datetime.now()
441 args = argparser.parse_args()
442 try:
443 if args.command == 'login':
444 login(args.token)
446 elif args.command == 'logout':
447 logout()
449 elif args.command == 'print-access-token':
450 print_access_token()
452 elif args.command == 'query-rs':
453 if not (args.rsid or args.list):
454 sys.stderr.write("[Error] Couldn't find any rsIDs. You must input single rsID by '-r' "
455 "or multiple rsIDs in one single file by '-l'.\n")
456 sys.exit(1)
458 rsIDs = []
459 if args.rsid:
460 rsIDs.append(args.rsid.lower())
461 if args.list:
462 # Fetching positions from a single file
463 with gzip.open(args.list) if args.list.endswith('.gz') else open(args.list) as P:
464 for line in P:
465 # skip header information
466 if line.startswith("#"):
467 continue
468 col = line.strip()
469 rsIDs.append(col.lower())
470 run_rs_variant(rsIDs)
472 elif args.command == 'query-variant':
474 if not ((args.chromosome and args.position) or args.positions):
475 sys.stderr.write("[Error] Couldn't find any positions. You must input single position by '-c and -p' "
476 "or multiple positions in one single file by '-l'.\n")
477 sys.exit(1)
479 positions = []
480 if args.chromosome and args.position:
481 if 'chr' not in args.chromosome.lower():
482 args.chromosome = 'chr' + args.chromosome.lower()
484 positions.append([args.chromosome.lower(), args.position])
486 if args.positions:
487 # Fetching positions from a single file
488 with gzip.open(args.positions) if args.positions.endswith('.gz') else open(args.positions) as P:
489 for line in P:
491 # skip header information
492 if line.startswith("#"):
493 continue
495 col = line.strip().split()
497 if 'chr' not in col[0].lower():
498 col[0] = 'chr' + col[0].lower()
500 if len(col) == 2:
501 positions.append([col[0], int(col[1])])
503 elif len(col) == 3:
504 start, end = map(int, col[1:])
505 for position in range(start, end+1):
506 positions.append([col[0], position])
507 else:
508 sys.stderr.write("[Error] Unexpected format hit %s in %s.\n" % (line, args.positions))
510 # sorted and query positions
511 positions.sort(key=lambda A:(A[0], A[1]))
512 run_query_variant(positions)
514 elif args.command == 'query-variants':
515 pass
517 elif args.command == 'annotate':
518 annotate(args.in_vcffile, filter=None)
520 except CMDBException as e:
521 print (e)
524 elasped_time = datetime.now() -START_TIME
525 sys.stderr.write("** Query CMDB done, %d seconds elapsed **\n" % (elasped_time.seconds))
528 if __name__ == '__main__':
529 main()