1 from __future__
import generators
3 from nlmsa_utils
import CoordsGroupStart
,CoordsGroupEnd
5 # AUTHORS: zfierstadt, leec
7 def is_line_start(token
,line
):
8 "check whether line begins with token"
9 return token
==line
[:len(token
)]
11 def get_ori_letterunit(start
,end
,seq
,gapchar
='-'):
12 """try to determine orientation (1 or -1) based on whether start>end,
13 and letterunit (1 or 3) depending on the ratio of end-start difference
14 vs the actual non-gap letter count. Returns tuple (ori,letterunit)"""
24 if ori
*float(end
-start
)/seqlen
>2.0:
30 class BlastIval(object):
32 return '<BLAST-IVAL: ' + repr(self
.__dict
__) + '>'
34 class BlastHitParser(object):
35 """reads alignment info from blastall standard output.
36 Method parse_file(fo) reads file object fo, and generates tuples
37 suitable for BlastIval."""
44 "flush any alignment info, so we can start reading new alignment"
48 def save_query(self
,line
):
49 self
.query_id
=line
.split()[1]
50 def save_subject(self
,line
):
51 self
.subject_id
=line
.split()[0][1:]
52 def save_score(self
,line
):
54 self
.blast_score
=float(line
.split()[2])
59 self
.e_value
= -math
.log(float(s
))/math
.log(10.0)
60 except (ValueError,OverflowError):
62 def save_identity(self
,line
):
63 "save Identities line"
65 self
.identity_percent
=int(s
[:s
.find('%')])
66 def save_query_line(self
,line
):
69 self
.query_end
=int(c
[3])
70 if not self
.query_seq
:
71 self
.query_start
=int(c
[1])
72 if self
.query_start
< self
.query_end
: # handles forward orientation
75 self
.seq_start_char
=line
.find(c
[2], 5) # IN CASE BLAST SCREWS UP Sbjct:
76 def save_subject_line(self
,line
):
77 "save a Sbjct: line, attempt to handle various BLAST insanities"
79 if len(c
)<4: # OOPS, BLAST FORGOT TO PUT SPACE BEFORE 1ST NUMBER
80 # THIS HAPPENS IN TBLASTN... WHEN THE SUBJECT SEQUENCE
81 # COVERS RANGE 1-1200, THE FOUR DIGIT NUMBER WILL RUN INTO
82 # THE SEQUENCE, WITH NO SPACE!!
83 c
=['Sbjct:',line
[6:self
.seq_start_char
]] \
84 +line
[self
.seq_start_char
:].split() # FIX BLAST SCREW-UP
85 self
.subject_end
=int(c
[3])
86 if not self
.subject_seq
:
87 self
.subject_start
=int(c
[1])
88 if self
.subject_start
< self
.subject_end
: # handles forward orientation
89 self
.subject_start
-= 1
90 self
.subject_seq
+=c
[2]
91 lendiff
=len(self
.query_seq
)-len(self
.subject_seq
)
92 if lendiff
>0: # HANDLE TBLASTN SCREWINESS: Sbjct SEQ OFTEN TOO SHORT!!
93 # THIS APPEARS TO BE ASSOCIATED ESPECIALLY WITH STOP CODONS *
94 self
.subject_seq
+=lendiff
*'A' # EXTEND TO SAME LENGTH AS QUERY...
95 elif lendiff
<0 and not hasattr(self
,'ignore_query_truncation'):
96 # WHAT THE HECK?!?! WARN THE USER: BLAST RESULTS ARE SCREWY...
98 """BLAST appears to have truncated the Query: sequence
99 to be shorter than the Sbjct: sequence:
102 This should not happen! To ignore this error, please
103 create an attribute ignore_query_truncation on the
104 BlastHitParser object.""" % (self
.query_seq
,self
.subject_seq
))
105 def get_interval_obj(self
, q_start
, q_end
, s_start
, s_end
,
106 query_ori
, query_factor
, subject_ori
, subject_factor
):
107 "return interval result as an object with attributes"
109 o
.hit_id
= self
.hit_id
110 o
.src_id
= self
.query_id
111 o
.dest_id
= self
.subject_id
112 o
.blast_score
= self
.blast_score
113 o
.e_value
= self
.e_value
114 o
.percent_id
= self
.identity_percent
115 o
.src_ori
= query_ori
116 o
.dest_ori
= subject_ori
117 query_start
= self
.query_start
+q_start
*query_ori
*query_factor
118 query_end
= self
.query_start
+q_end
*query_ori
*query_factor
119 subject_start
= self
.subject_start
+s_start
*subject_ori
*subject_factor
120 subject_end
= self
.subject_start
+s_end
*subject_ori
*subject_factor
121 if query_start
<query_end
:
122 o
.src_start
= query_start
123 o
.src_end
= query_end
125 o
.src_start
= query_end
126 o
.src_end
= query_start
127 if subject_start
<subject_end
:
128 o
.dest_start
= subject_start
129 o
.dest_end
= subject_end
131 o
.dest_start
= subject_end
132 o
.dest_end
= subject_start
134 def is_valid_hit(self
):
135 return self
.query_seq
and self
.subject_seq
136 def generate_intervals(self
):
137 "generate interval tuples for the current alignment"
138 yield CoordsGroupStart() # bracket with grouping markers
140 query_ori
,query_factor
=get_ori_letterunit(self
.query_start
,\
141 self
.query_end
,self
.query_seq
,self
.gapchar
)
142 subject_ori
,subject_factor
=get_ori_letterunit(self
.subject_start
,\
143 self
.subject_end
,self
.subject_seq
,self
.gapchar
)
148 for i
in range(len(self
.query_seq
)): # SCAN ALIGNMENT FOR GAPS
149 if self
.query_seq
[i
]==self
.gapchar
or self
.subject_seq
[i
]==self
.gapchar
:
150 if q_start
>=0: # END OF AN UNGAPPED INTERVAL
151 yield self
.get_interval_obj(q_start
, i_query
,
153 query_ori
, query_factor
,
154 subject_ori
, subject_factor
)
156 elif q_start
<0: # START OF AN UNGAPPED INTERVAL
159 if self
.query_seq
[i
]!=self
.gapchar
: # COUNT QUERY LETTERS
161 if self
.subject_seq
[i
]!=self
.gapchar
: # COUNT SUBJECT LETTERS
163 if q_start
>=0: # REPORT THE LAST INTERVAL
164 yield self
.get_interval_obj(q_start
, i_query
,
166 query_ori
, query_factor
,
167 subject_ori
, subject_factor
)
169 yield CoordsGroupEnd()
171 def parse_file(self
,myfile
):
172 "generate interval tuples by parsing BLAST output from myfile"
175 if self
.is_valid_hit() and \
176 (is_line_start('>',line
) or is_line_start(' Score =',line
) \
177 or is_line_start(' Database:',line
) \
178 or is_line_start('Query=',line
)):
179 for t
in self
.generate_intervals(): # REPORT THIS ALIGNMENT
180 yield t
# GENERATE ALL ITS INTERVAL MATCHES
181 self
.reset() # RESET TO START A NEW ALIGNMENT
182 if is_line_start('Query=',line
):
183 self
.save_query(line
)
184 elif is_line_start('>',line
):
185 self
.save_subject(line
)
186 elif is_line_start(' Score =',line
):
187 self
.save_score(line
)
188 elif 'Identities =' in line
:
189 self
.save_identity(line
)
190 elif is_line_start('Query:',line
):
191 self
.save_query_line(line
)
192 elif is_line_start('Sbjct:',line
):
193 self
.save_subject_line(line
)
194 if self
.nline
== 0: # no blast output??
195 raise IOError('no BLAST output. Check that blastall is in your PATH')
197 if __name__
=='__main__':
200 for t
in p
.parse_file(sys
.stdin
):