3 # BioPerl module for Bio::SearchIO::megablast
5 # Cared for by Jason Stajich <jason@bioperl.org>
7 # Copyright Jason Stajich
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
15 Bio::SearchIO::megablast - a driver module for Bio::SearchIO to parse
16 megablast reports (format 0)
20 # do not use this module directly
23 # for default format output from megablast
24 my $in = Bio::SearchIO->new(-file => 'file.mbl',
25 -format => 'megablast',
28 while( my $r = $in->next_result ) {
29 while( my $hit = $r->next_hit ) {
30 while( my $hsp = $hit->next_hsp ) {
39 Because of the way megablast report format 0 is coded, realize that score
40 means # gap characters + # mismatches for a HSP.
42 The docs from NCBI regarding FORMAT 0
43 # 0: Produce one-line output for each alignment, in the form
45 # 'subject-id'=='[+-]query-id' (s_off q_off s_end q_end) score
47 # Here subject(query)-id is a gi number, an accession or some other type of
48 # identifier found in the FASTA definition line of the respective sequence.
50 # + or - corresponds to same or different strand alignment.
52 # Score for non-affine gapping parameters means the total number of
53 # differences (mismatches + gap characters). For affine case it is the
54 # actual (raw) score of the alignment.
56 FORMAT 1 parsing has not been implemented
57 FORMAT 2 parsing should work with the SearchIO 'blast' parser
63 User feedback is an integral part of the evolution of this and other
64 Bioperl modules. Send your comments and suggestions preferably to
65 the Bioperl mailing list. Your participation is much appreciated.
67 bioperl-l@bioperl.org - General discussion
68 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
72 Report bugs to the Bioperl bug tracking system to help us keep track
73 of the bugs and their resolution. Bug reports can be submitted via the
76 http://bugzilla.open-bio.org/
78 =head1 AUTHOR - Jason Stajich
80 Email jason-at-bioperl.org
84 The rest of the documentation details each of the object methods.
85 Internal methods are usually preceded with a _
90 # Let the code begin...
93 package Bio
::SearchIO
::megablast
;
95 use vars
qw(%MAPPING %MODEMAP $DEFAULT_BLAST_WRITER_CLASS);
97 use base qw(Bio::SearchIO);
100 # mapping of MegaBlast terms to Bioperl hash keys
101 %MODEMAP = ('MegaBlastOutput' => 'result',
106 # This should really be done more intelligently, like with
111 'Hsp_query-from' => 'HSP-query_start',
112 'Hsp_query-to' => 'HSP-query_end',
113 'Hsp_hit-from' => 'HSP-hit_start',
114 'Hsp_hit-to' => 'HSP-hit_end',
115 'Hit_score' => 'HIT-score',
116 'Hsp_score' => 'HSP-score',
118 'Hsp_identity' => 'HSP-identical',
119 'Hsp_positive' => 'HSP-conserved',
121 'Hit_id' => 'HIT-name',
123 'MegaBlastOutput_program' => 'RESULT-algorithm_name',
124 'MegaBlastOutput_query-def'=> 'RESULT-query_name',
128 $DEFAULT_BLAST_WRITER_CLASS = 'Bio::Search::Writer::HitTableWriter';
135 Usage : my $obj = Bio::SearchIO::blast->new();
136 Function: Builds a new Bio::SearchIO::blast object
137 Returns : Bio::SearchIO::blast
138 Args : -fh/-file => filehandle/filename to BLAST file
144 my ($self,@args) = @_;
145 $self->SUPER::_initialize
(@args);
146 my ($fmt) = $self->_rearrange([qw(REPORT_FORMAT)], @args);
148 $self->throw("Must provide a value for -report_format when initializing a megablast parser") unless defined $fmt ;
149 $self->report_format($fmt);
156 Usage : my $hit = $searchio->next_result;
157 Function: Returns the next Result from a search
158 Returns : Bio::Search::Result::ResultI object
169 my $fmt = $self->report_format;
170 my ($lastquery,$lasthit);
171 while( defined($_ = $self->_readline) ) {
173 if( /^\'(\S
+)\'\
=\
=\'(\
+|\
-)(\S
+)\'\s
+
174 \
((\d
+)\s
+(\d
+)\s
+(\d
+)\s
+(\d
+)\
)\s
+
177 my ($hit,$strand,$query,
178 $h_start,$q_start,$h_end,$q_end,
179 $score) = ($1,$2,$3,$4,$5,$6,$7,$8);
180 if( ! defined $lastquery ) {
181 $self->start_element({'Name' => 'MegaBlastOutput'});
182 $self->element({'Name' => 'MegaBlastOutput_program',
183 'Data' => 'MEGABLAST'});
184 $self->element({'Name' => 'MegaBlastOutput_query-def',
186 } elsif( $lastquery ne $query ) {
187 $self->_pushback($_);
188 $self->end_element({'Name' => 'Hit'}) if( defined $lasthit);
189 $self->end_element({ 'Name' => 'MegaBlastOutput'});
192 return $self->end_document();
195 if( ! defined $lasthit || $lasthit ne $hit ) {
196 $self->end_element({'Name' => 'Hit'}) if( defined $lasthit);
197 $self->start_element({'Name' => 'Hit'});
198 $self->element({'Name' => 'Hit_id',
201 $self->start_element({'Name' => 'Hsp'});
202 $self->element({'Name' => 'Hsp_score',
205 # flip flop start/end if strand is < 0
206 # since strandedness is inferred from the query
207 # because of the way it is coded all queries will
208 # be on the forward strand and hits will be either
211 # also the NCBI docs state:
212 # 0: Produce one-line output for each alignment, in the form
214 # 'subject-id'=='[+-]query-id' (s_off q_off s_end q_end) score
216 # Here subject(query)-id is a gi number, an accession or some other type of
217 # identifier found in the FASTA definition line of the respective sequence.
219 # + or - corresponds to same or different strand alignment.
221 # Score for non-affine gapping parameters means the total number of
222 # differences (mismatches + gap characters). For affine case it is the
223 # actual (raw) score of the alignment.
225 # and yet when rev strand hits are made I see
227 # 'Contig634'=='-503384' (1 7941 321 7620) 19
228 # so the query is on the rev strand and the
229 # subject is on the fwd strand
230 # so I am flip-flopping everything when I see a '-'
231 if( $strand eq '-' ) {
232 ($h_start,$h_end) = ( $h_end,$h_start);
233 ($q_start,$q_end) = ( $q_end,$q_start);
235 $self->element({'Name' => 'Hsp_hit-from',
236 'Data' => $h_start});
237 $self->element({'Name' => 'Hsp_hit-to',
239 $self->element({'Name' => 'Hsp_query-from',
240 'Data' => $q_start});
241 $self->element({'Name' => 'Hsp_query-to',
243 # might not be quite right -- need to know length of the HSP
244 my $numid = (abs($q_end - $q_start) - $score);
246 $self->element({'Name' => 'Hsp_identity',
248 $self->element({'Name' => 'Hsp_positive',
251 $self->end_element({'Name' => 'Hsp'});
255 $self->debug("Unknown line in fmt0 parsing: $_");
259 if( defined $lastquery && $fmt == 0 ) {
260 $self->end_element({'Name' => 'Hit'}) if( defined $lasthit);
261 $self->end_element({ 'Name' => 'MegaBlastOutput'});
262 return $self->end_document();
269 Title : report_format
270 Usage : $obj->report_format($newval)
271 Function: Get/Set the report_format value
272 Returns : value of report_format (a scalar)
273 Args : on set, new value (a scalar or undef, optional)
280 return $self->{'_report_format'} = shift if @_;
281 return $self->{'_report_format'};
287 Title : start_element
288 Usage : $eventgenerator->start_element
289 Function: Handles a start element event
291 Args : hashref with at least 2 keys 'Data' and 'Name'
297 my ($self,$data) = @_;
298 # we currently do not care about attributes
299 my $nm = $data->{'Name'};
300 if( my $type = $MODEMAP{$nm} ) {
302 if( $self->_eventHandler->will_handle($type) ) {
303 my $func = sprintf("start_%s",lc $type);
304 $self->_eventHandler->$func($data->{'Attributes'});
306 unshift @
{$self->{'_elements'}}, $type;
309 if($nm eq 'MegaBlastOutput') {
310 $self->{'_values'} = {};
311 $self->{'_result'}= undef;
312 $self->{'_mode'} = '';
319 Title : start_element
320 Usage : $eventgenerator->end_element
321 Function: Handles an end element event
323 Args : hashref with at least 2 keys 'Data' and 'Name'
329 my ($self,$data) = @_;
330 my $nm = $data->{'Name'};
333 if( my $type = $MODEMAP{$nm} ) {
334 if( $self->_eventHandler->will_handle($type) ) {
335 my $func = sprintf("end_%s",lc $type);
336 $rc = $self->_eventHandler->$func($self->{'_reporttype'},
339 shift @
{$self->{'_elements'}};
341 } elsif( $MAPPING{$nm} ) {
342 if ( ref($MAPPING{$nm}) =~ /hash/i ) {
343 my $key = (keys %{$MAPPING{$nm}})[0];
344 $self->{'_values'}->{$key}->{$MAPPING{$nm}->{$key}} = $self->{'_last_data'};
346 $self->{'_values'}->{$MAPPING{$nm}} = $self->{'_last_data'};
349 $self->warn( "unknown nm $nm ignoring\n");
351 $self->{'_last_data'} = ''; # remove read data if we are at
353 $self->{'_result'} = $rc if( $nm eq 'MegaBlastOutput' );
361 Usage : $eventhandler->element({'Name' => $name, 'Data' => $str});
362 Function: Convience method that calls start_element, characters, end_element
364 Args : Hash ref with the keys 'Name' and 'Data'
370 my ($self,$data) = @_;
371 $self->start_element($data);
372 $self->characters($data);
373 $self->end_element($data);
380 Usage : $eventgenerator->characters($str)
381 Function: Send a character events
389 my ($self,$data) = @_;
390 return unless defined $data->{'Data'};
391 $self->{'_last_data'} = $data->{'Data'};
397 Usage : $obj->_mode($newval)
400 Returns : value of _mode
401 Args : newvalue (optional)
407 my ($self,$value) = @_;
408 if( defined $value) {
409 $self->{'_mode'} = $value;
411 return $self->{'_mode'};
414 =head2 within_element
416 Title : within_element
417 Usage : if( $eventgenerator->within_element($element) ) {}
418 Function: Test if we are within a particular element
419 This is different than 'in' because within can be tested
422 Args : string element name
428 my ($self,$name) = @_;
429 return 0 if ( ! defined $name &&
430 ! defined $self->{'_elements'} ||
431 scalar @
{$self->{'_elements'}} == 0) ;
432 foreach ( @
{$self->{'_elements'}} ) {
443 Usage : if( $eventgenerator->in_element($element) ) {}
444 Function: Test if we are in a particular element
445 This is different than 'in' because within can be tested
448 Args : string element name
454 my ($self,$name) = @_;
455 return 0 if ! defined $self->{'_elements'}->[0];
456 return ( $self->{'_elements'}->[0] eq $name)
460 =head2 start_document
462 Title : start_document
463 Usage : $eventgenerator->start_document
464 Function: Handles a start document event
473 $self->{'_lasttype'} = '';
474 $self->{'_values'} = {};
475 $self->{'_result'}= undef;
476 $self->{'_mode'} = '';
477 $self->{'_elements'} = [];
484 Usage : $eventgenerator->end_document
485 Function: Handles an end document event
486 Returns : Bio::Search::Result::ResultI object
493 my ($self,@args) = @_;
494 return $self->{'_result'};
500 Usage : my $count = $searchio->result_count
501 Function: Returns the number of results we have processed
510 return $self->{'_result_count'};
513 sub report_count
{ shift->result_count }