added iteration test to graph_test.py
[pygr.git] / pygr / parse_blast.py
blob17e647e6e3e165d8d359aedaa35cdc5ab1d6f95e
1 from __future__ import generators
2 import math
3 from nlmsa_utils import CoordsGroupStart,CoordsGroupEnd
5 # AUTHORS: zfierstadt, leec
7 def is_line_start(token,line):
8 "check whether line begins with token"
9 return token==line[:len(token)]
11 def get_ori_letterunit(start,end,seq,gapchar='-'):
12 """try to determine orientation (1 or -1) based on whether start>end,
13 and letterunit (1 or 3) depending on the ratio of end-start difference
14 vs the actual non-gap letter count. Returns tuple (ori,letterunit)"""
15 if end>start:
16 ori=1
17 else:
18 ori= -1
19 ngap=0
20 for l in seq:
21 if l==gapchar:
22 ngap+=1
23 seqlen=len(seq)-ngap
24 if ori*float(end-start)/seqlen >2.0:
25 letterunit=3
26 else:
27 letterunit=1
28 return ori,letterunit
30 class BlastIval(object):
31 def __repr__(self):
32 return '<BLAST-IVAL: ' + repr(self.__dict__) + '>'
34 class BlastHitParser(object):
35 """reads alignment info from blastall standard output.
36 Method parse_file(fo) reads file object fo, and generates tuples
37 suitable for BlastIval."""
38 gapchar='-'
39 def __init__(self):
40 self.hit_id=0
41 self.nline = 0
42 self.reset()
43 def reset(self):
44 "flush any alignment info, so we can start reading new alignment"
45 self.query_seq=""
46 self.subject_seq=""
47 self.hit_id+=1
48 def save_query(self,line):
49 self.query_id=line.split()[1]
50 def save_subject(self,line):
51 self.subject_id=line.split()[0][1:]
52 def save_score(self,line):
53 "save a Score: line"
54 self.blast_score=float(line.split()[2])
55 s=line.split()[7]
56 if s[0]=='e':
57 s='1'+s
58 try:
59 self.e_value= -math.log(float(s))/math.log(10.0)
60 except (ValueError,OverflowError):
61 self.e_value=300.
62 def save_identity(self,line):
63 "save Identities line"
64 s=line.split()[3][1:]
65 self.identity_percent=int(s[:s.find('%')])
66 def save_query_line(self,line):
67 "save a Query: line"
68 c=line.split()
69 self.query_end=int(c[3])
70 if not self.query_seq:
71 self.query_start=int(c[1])
72 if self.query_start < self.query_end: # handles forward orientation
73 self.query_start -= 1
74 self.query_seq+=c[2]
75 self.seq_start_char=line.find(c[2], 5) # IN CASE BLAST SCREWS UP Sbjct:
76 def save_subject_line(self,line):
77 "save a Sbjct: line, attempt to handle various BLAST insanities"
78 c=line.split()
79 if len(c)<4: # OOPS, BLAST FORGOT TO PUT SPACE BEFORE 1ST NUMBER
80 # THIS HAPPENS IN TBLASTN... WHEN THE SUBJECT SEQUENCE
81 # COVERS RANGE 1-1200, THE FOUR DIGIT NUMBER WILL RUN INTO
82 # THE SEQUENCE, WITH NO SPACE!!
83 c=['Sbjct:',line[6:self.seq_start_char]] \
84 +line[self.seq_start_char:].split() # FIX BLAST SCREW-UP
85 self.subject_end=int(c[3])
86 if not self.subject_seq:
87 self.subject_start=int(c[1])
88 if self.subject_start < self.subject_end: # handles forward orientation
89 self.subject_start -= 1
90 self.subject_seq+=c[2]
91 lendiff=len(self.query_seq)-len(self.subject_seq)
92 if lendiff>0: # HANDLE TBLASTN SCREWINESS: Sbjct SEQ OFTEN TOO SHORT!!
93 # THIS APPEARS TO BE ASSOCIATED ESPECIALLY WITH STOP CODONS *
94 self.subject_seq+=lendiff*'A' # EXTEND TO SAME LENGTH AS QUERY...
95 elif lendiff<0 and not hasattr(self,'ignore_query_truncation'):
96 # WHAT THE HECK?!?! WARN THE USER: BLAST RESULTS ARE SCREWY...
97 raise ValueError(
98 """BLAST appears to have truncated the Query: sequence
99 to be shorter than the Sbjct: sequence:
100 Query: %s
101 Sbjct: %s
102 This should not happen! To ignore this error, please
103 create an attribute ignore_query_truncation on the
104 BlastHitParser object.""" % (self.query_seq,self.subject_seq))
105 def get_interval_obj(self, q_start, q_end, s_start, s_end,
106 query_ori, query_factor, subject_ori, subject_factor):
107 "return interval result as an object with attributes"
108 o = BlastIval()
109 o.hit_id = self.hit_id
110 o.src_id = self.query_id
111 o.dest_id = self.subject_id
112 o.blast_score = self.blast_score
113 o.e_value = self.e_value
114 o.percent_id = self.identity_percent
115 o.src_ori = query_ori
116 o.dest_ori = subject_ori
117 query_start = self.query_start+q_start*query_ori*query_factor
118 query_end = self.query_start+q_end*query_ori*query_factor
119 subject_start = self.subject_start+s_start*subject_ori*subject_factor
120 subject_end = self.subject_start+s_end*subject_ori*subject_factor
121 if query_start<query_end:
122 o.src_start = query_start
123 o.src_end = query_end
124 else:
125 o.src_start = query_end
126 o.src_end = query_start
127 if subject_start<subject_end:
128 o.dest_start = subject_start
129 o.dest_end = subject_end
130 else:
131 o.dest_start = subject_end
132 o.dest_end = subject_start
133 return o
134 def is_valid_hit(self):
135 return self.query_seq and self.subject_seq
136 def generate_intervals(self):
137 "generate interval tuples for the current alignment"
138 yield CoordsGroupStart() # bracket with grouping markers
140 query_ori,query_factor=get_ori_letterunit(self.query_start,\
141 self.query_end,self.query_seq,self.gapchar)
142 subject_ori,subject_factor=get_ori_letterunit(self.subject_start,\
143 self.subject_end,self.subject_seq,self.gapchar)
144 q_start= -1
145 s_start= -1
146 i_query=0
147 i_subject=0
148 for i in range(len(self.query_seq)): # SCAN ALIGNMENT FOR GAPS
149 if self.query_seq[i]==self.gapchar or self.subject_seq[i]==self.gapchar:
150 if q_start>=0: # END OF AN UNGAPPED INTERVAL
151 yield self.get_interval_obj(q_start, i_query,
152 s_start, i_subject,
153 query_ori, query_factor,
154 subject_ori, subject_factor)
155 q_start= -1
156 elif q_start<0: # START OF AN UNGAPPED INTERVAL
157 q_start=i_query
158 s_start=i_subject
159 if self.query_seq[i]!=self.gapchar: # COUNT QUERY LETTERS
160 i_query+=1
161 if self.subject_seq[i]!=self.gapchar: # COUNT SUBJECT LETTERS
162 i_subject+=1
163 if q_start>=0: # REPORT THE LAST INTERVAL
164 yield self.get_interval_obj(q_start, i_query,
165 s_start, i_subject,
166 query_ori, query_factor,
167 subject_ori, subject_factor)
169 yield CoordsGroupEnd()
171 def parse_file(self,myfile):
172 "generate interval tuples by parsing BLAST output from myfile"
173 for line in myfile:
174 self.nline += 1
175 if self.is_valid_hit() and \
176 (is_line_start('>',line) or is_line_start(' Score =',line) \
177 or is_line_start(' Database:',line) \
178 or is_line_start('Query=',line)):
179 for t in self.generate_intervals(): # REPORT THIS ALIGNMENT
180 yield t # GENERATE ALL ITS INTERVAL MATCHES
181 self.reset() # RESET TO START A NEW ALIGNMENT
182 if is_line_start('Query=',line):
183 self.save_query(line)
184 elif is_line_start('>',line):
185 self.save_subject(line)
186 elif is_line_start(' Score =',line):
187 self.save_score(line)
188 elif 'Identities =' in line:
189 self.save_identity(line)
190 elif is_line_start('Query:',line):
191 self.save_query_line(line)
192 elif is_line_start('Sbjct:',line):
193 self.save_subject_line(line)
194 if self.nline == 0: # no blast output??
195 raise IOError('no BLAST output. Check that blastall is in your PATH')
197 if __name__=='__main__':
198 import sys
199 p=BlastHitParser()
200 for t in p.parse_file(sys.stdin):
201 print t