3 # BioPerl module for Bio::SearchIO::megablast
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Jason Stajich <jason@bioperl.org>
9 # Copyright Jason Stajich
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
17 Bio::SearchIO::megablast - a driver module for Bio::SearchIO to parse
18 megablast reports (format 0)
22 # do not use this module directly
25 # for default format output from megablast
26 my $in = Bio::SearchIO->new(-file => 'file.mbl',
27 -format => 'megablast',
30 while( my $r = $in->next_result ) {
31 while( my $hit = $r->next_hit ) {
32 while( my $hsp = $hit->next_hsp ) {
41 Because of the way megablast report format 0 is coded, realize that score
42 means # gap characters + # mismatches for a HSP.
44 The docs from NCBI regarding FORMAT 0
45 # 0: Produce one-line output for each alignment, in the form
47 # 'subject-id'=='[+-]query-id' (s_off q_off s_end q_end) score
49 # Here subject(query)-id is a gi number, an accession or some other type of
50 # identifier found in the FASTA definition line of the respective sequence.
52 # + or - corresponds to same or different strand alignment.
54 # Score for non-affine gapping parameters means the total number of
55 # differences (mismatches + gap characters). For affine case it is the
56 # actual (raw) score of the alignment.
58 FORMAT 1 parsing has not been implemented
59 FORMAT 2 parsing should work with the SearchIO 'blast' parser
65 User feedback is an integral part of the evolution of this and other
66 Bioperl modules. Send your comments and suggestions preferably to
67 the Bioperl mailing list. Your participation is much appreciated.
69 bioperl-l@bioperl.org - General discussion
70 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
74 Please direct usage questions or support issues to the mailing list:
76 I<bioperl-l@bioperl.org>
78 rather than to the module maintainer directly. Many experienced and
79 reponsive experts will be able look at the problem and quickly
80 address it. Please include a thorough description of the problem
81 with code and data examples if at all possible.
85 Report bugs to the Bioperl bug tracking system to help us keep track
86 of the bugs and their resolution. Bug reports can be submitted via the
89 http://bugzilla.open-bio.org/
91 =head1 AUTHOR - Jason Stajich
93 Email jason-at-bioperl.org
97 The rest of the documentation details each of the object methods.
98 Internal methods are usually preceded with a _
103 # Let the code begin...
106 package Bio
::SearchIO
::megablast
;
108 use vars
qw(%MAPPING %MODEMAP $DEFAULT_BLAST_WRITER_CLASS);
110 use base qw(Bio::SearchIO);
113 # mapping of MegaBlast terms to Bioperl hash keys
114 %MODEMAP = ('MegaBlastOutput' => 'result',
119 # This should really be done more intelligently, like with
124 'Hsp_query-from' => 'HSP-query_start',
125 'Hsp_query-to' => 'HSP-query_end',
126 'Hsp_hit-from' => 'HSP-hit_start',
127 'Hsp_hit-to' => 'HSP-hit_end',
128 'Hit_score' => 'HIT-score',
129 'Hsp_score' => 'HSP-score',
131 'Hsp_identity' => 'HSP-identical',
132 'Hsp_positive' => 'HSP-conserved',
134 'Hit_id' => 'HIT-name',
136 'MegaBlastOutput_program' => 'RESULT-algorithm_name',
137 'MegaBlastOutput_query-def'=> 'RESULT-query_name',
141 $DEFAULT_BLAST_WRITER_CLASS = 'Bio::Search::Writer::HitTableWriter';
148 Usage : my $obj = Bio::SearchIO::blast->new();
149 Function: Builds a new Bio::SearchIO::blast object
150 Returns : Bio::SearchIO::blast
151 Args : -fh/-file => filehandle/filename to BLAST file
157 my ($self,@args) = @_;
158 $self->SUPER::_initialize
(@args);
159 my ($fmt) = $self->_rearrange([qw(REPORT_FORMAT)], @args);
161 $self->throw("Must provide a value for -report_format when initializing a megablast parser") unless defined $fmt ;
162 $self->report_format($fmt);
169 Usage : my $hit = $searchio->next_result;
170 Function: Returns the next Result from a search
171 Returns : Bio::Search::Result::ResultI object
182 my $fmt = $self->report_format;
183 my ($lastquery,$lasthit);
184 while( defined($_ = $self->_readline) ) {
186 if( /^\'(\S
+)\'\
=\
=\'(\
+|\
-)(\S
+)\'\s
+
187 \
((\d
+)\s
+(\d
+)\s
+(\d
+)\s
+(\d
+)\
)\s
+
190 my ($hit,$strand,$query,
191 $h_start,$q_start,$h_end,$q_end,
192 $score) = ($1,$2,$3,$4,$5,$6,$7,$8);
193 if( ! defined $lastquery ) {
194 $self->start_element({'Name' => 'MegaBlastOutput'});
195 $self->element({'Name' => 'MegaBlastOutput_program',
196 'Data' => 'MEGABLAST'});
197 $self->element({'Name' => 'MegaBlastOutput_query-def',
199 } elsif( $lastquery ne $query ) {
200 $self->_pushback($_);
201 $self->end_element({'Name' => 'Hit'}) if( defined $lasthit);
202 $self->end_element({ 'Name' => 'MegaBlastOutput'});
205 return $self->end_document();
208 if( ! defined $lasthit || $lasthit ne $hit ) {
209 $self->end_element({'Name' => 'Hit'}) if( defined $lasthit);
210 $self->start_element({'Name' => 'Hit'});
211 $self->element({'Name' => 'Hit_id',
214 $self->start_element({'Name' => 'Hsp'});
215 $self->element({'Name' => 'Hsp_score',
218 # flip flop start/end if strand is < 0
219 # since strandedness is inferred from the query
220 # because of the way it is coded all queries will
221 # be on the forward strand and hits will be either
224 # also the NCBI docs state:
225 # 0: Produce one-line output for each alignment, in the form
227 # 'subject-id'=='[+-]query-id' (s_off q_off s_end q_end) score
229 # Here subject(query)-id is a gi number, an accession or some other type of
230 # identifier found in the FASTA definition line of the respective sequence.
232 # + or - corresponds to same or different strand alignment.
234 # Score for non-affine gapping parameters means the total number of
235 # differences (mismatches + gap characters). For affine case it is the
236 # actual (raw) score of the alignment.
238 # and yet when rev strand hits are made I see
240 # 'Contig634'=='-503384' (1 7941 321 7620) 19
241 # so the query is on the rev strand and the
242 # subject is on the fwd strand
243 # so I am flip-flopping everything when I see a '-'
244 if( $strand eq '-' ) {
245 ($h_start,$h_end) = ( $h_end,$h_start);
246 ($q_start,$q_end) = ( $q_end,$q_start);
248 $self->element({'Name' => 'Hsp_hit-from',
249 'Data' => $h_start});
250 $self->element({'Name' => 'Hsp_hit-to',
252 $self->element({'Name' => 'Hsp_query-from',
253 'Data' => $q_start});
254 $self->element({'Name' => 'Hsp_query-to',
256 # might not be quite right -- need to know length of the HSP
257 my $numid = (abs($q_end - $q_start) - $score);
259 $self->element({'Name' => 'Hsp_identity',
261 $self->element({'Name' => 'Hsp_positive',
264 $self->end_element({'Name' => 'Hsp'});
268 $self->debug("Unknown line in fmt0 parsing: $_");
272 if( defined $lastquery && $fmt == 0 ) {
273 $self->end_element({'Name' => 'Hit'}) if( defined $lasthit);
274 $self->end_element({ 'Name' => 'MegaBlastOutput'});
275 return $self->end_document();
282 Title : report_format
283 Usage : $obj->report_format($newval)
284 Function: Get/Set the report_format value
285 Returns : value of report_format (a scalar)
286 Args : on set, new value (a scalar or undef, optional)
293 return $self->{'_report_format'} = shift if @_;
294 return $self->{'_report_format'};
300 Title : start_element
301 Usage : $eventgenerator->start_element
302 Function: Handles a start element event
304 Args : hashref with at least 2 keys 'Data' and 'Name'
310 my ($self,$data) = @_;
311 # we currently do not care about attributes
312 my $nm = $data->{'Name'};
313 if( my $type = $MODEMAP{$nm} ) {
315 if( $self->_eventHandler->will_handle($type) ) {
316 my $func = sprintf("start_%s",lc $type);
317 $self->_eventHandler->$func($data->{'Attributes'});
319 unshift @
{$self->{'_elements'}}, $type;
322 if($nm eq 'MegaBlastOutput') {
323 $self->{'_values'} = {};
324 $self->{'_result'}= undef;
325 $self->{'_mode'} = '';
332 Title : start_element
333 Usage : $eventgenerator->end_element
334 Function: Handles an end element event
336 Args : hashref with at least 2 keys 'Data' and 'Name'
342 my ($self,$data) = @_;
343 my $nm = $data->{'Name'};
346 if( my $type = $MODEMAP{$nm} ) {
347 if( $self->_eventHandler->will_handle($type) ) {
348 my $func = sprintf("end_%s",lc $type);
349 $rc = $self->_eventHandler->$func($self->{'_reporttype'},
352 shift @
{$self->{'_elements'}};
354 } elsif( $MAPPING{$nm} ) {
355 if ( ref($MAPPING{$nm}) =~ /hash/i ) {
356 my $key = (keys %{$MAPPING{$nm}})[0];
357 $self->{'_values'}->{$key}->{$MAPPING{$nm}->{$key}} = $self->{'_last_data'};
359 $self->{'_values'}->{$MAPPING{$nm}} = $self->{'_last_data'};
362 $self->warn( "unknown nm $nm ignoring\n");
364 $self->{'_last_data'} = ''; # remove read data if we are at
366 $self->{'_result'} = $rc if( $nm eq 'MegaBlastOutput' );
374 Usage : $eventhandler->element({'Name' => $name, 'Data' => $str});
375 Function: Convience method that calls start_element, characters, end_element
377 Args : Hash ref with the keys 'Name' and 'Data'
383 my ($self,$data) = @_;
384 $self->start_element($data);
385 $self->characters($data);
386 $self->end_element($data);
393 Usage : $eventgenerator->characters($str)
394 Function: Send a character events
402 my ($self,$data) = @_;
403 return unless defined $data->{'Data'};
404 $self->{'_last_data'} = $data->{'Data'};
410 Usage : $obj->_mode($newval)
413 Returns : value of _mode
414 Args : newvalue (optional)
420 my ($self,$value) = @_;
421 if( defined $value) {
422 $self->{'_mode'} = $value;
424 return $self->{'_mode'};
427 =head2 within_element
429 Title : within_element
430 Usage : if( $eventgenerator->within_element($element) ) {}
431 Function: Test if we are within a particular element
432 This is different than 'in' because within can be tested
435 Args : string element name
441 my ($self,$name) = @_;
442 return 0 if ( ! defined $name &&
443 ! defined $self->{'_elements'} ||
444 scalar @
{$self->{'_elements'}} == 0) ;
445 foreach ( @
{$self->{'_elements'}} ) {
456 Usage : if( $eventgenerator->in_element($element) ) {}
457 Function: Test if we are in a particular element
458 This is different than 'in' because within can be tested
461 Args : string element name
467 my ($self,$name) = @_;
468 return 0 if ! defined $self->{'_elements'}->[0];
469 return ( $self->{'_elements'}->[0] eq $name)
473 =head2 start_document
475 Title : start_document
476 Usage : $eventgenerator->start_document
477 Function: Handles a start document event
486 $self->{'_lasttype'} = '';
487 $self->{'_values'} = {};
488 $self->{'_result'}= undef;
489 $self->{'_mode'} = '';
490 $self->{'_elements'} = [];
497 Usage : $eventgenerator->end_document
498 Function: Handles an end document event
499 Returns : Bio::Search::Result::ResultI object
506 my ($self,@args) = @_;
507 return $self->{'_result'};
513 Usage : my $count = $searchio->result_count
514 Function: Returns the number of results we have processed
523 return $self->{'_result_count'};
526 sub report_count
{ shift->result_count }