bug 2549; fixed small bug in Bio::Taxon which doesn't catch -common_name
[bioperl-live.git] / Bio / SearchIO / megablast.pm
blob819d8d33cbe71a342ec2d583ba983b463abf085a
1 # $Id$
3 # BioPerl module for Bio::SearchIO::megablast
5 # Cared for by Jason Stajich <jason@bioperl.org>
7 # Copyright Jason Stajich
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 Bio::SearchIO::megablast - a driver module for Bio::SearchIO to parse
16 megablast reports (format 0)
18 =head1 SYNOPSIS
20 # do not use this module directly
22 use Bio::SearchIO;
23 # for default format output from megablast
24 my $in = Bio::SearchIO->new(-file => 'file.mbl',
25 -format => 'megablast',
26 -report_format => 0);
28 while( my $r = $in->next_result ) {
29 while( my $hit = $r->next_hit ) {
30 while( my $hsp = $hit->next_hsp ) {
35 =head1 DESCRIPTION
37 Beware!
39 Because of the way megablast report format 0 is coded, realize that score
40 means # gap characters + # mismatches for a HSP.
42 The docs from NCBI regarding FORMAT 0
43 # 0: Produce one-line output for each alignment, in the form
45 # 'subject-id'=='[+-]query-id' (s_off q_off s_end q_end) score
47 # Here subject(query)-id is a gi number, an accession or some other type of
48 # identifier found in the FASTA definition line of the respective sequence.
50 # + or - corresponds to same or different strand alignment.
52 # Score for non-affine gapping parameters means the total number of
53 # differences (mismatches + gap characters). For affine case it is the
54 # actual (raw) score of the alignment.
56 FORMAT 1 parsing has not been implemented
57 FORMAT 2 parsing should work with the SearchIO 'blast' parser
59 =head1 FEEDBACK
61 =head2 Mailing Lists
63 User feedback is an integral part of the evolution of this and other
64 Bioperl modules. Send your comments and suggestions preferably to
65 the Bioperl mailing list. Your participation is much appreciated.
67 bioperl-l@bioperl.org - General discussion
68 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
70 =head2 Reporting Bugs
72 Report bugs to the Bioperl bug tracking system to help us keep track
73 of the bugs and their resolution. Bug reports can be submitted via the
74 web:
76 http://bugzilla.open-bio.org/
78 =head1 AUTHOR - Jason Stajich
80 Email jason-at-bioperl.org
82 =head1 APPENDIX
84 The rest of the documentation details each of the object methods.
85 Internal methods are usually preceded with a _
87 =cut
90 # Let the code begin...
93 package Bio::SearchIO::megablast;
94 use strict;
95 use vars qw(%MAPPING %MODEMAP $DEFAULT_BLAST_WRITER_CLASS);
97 use base qw(Bio::SearchIO);
99 BEGIN {
100 # mapping of MegaBlast terms to Bioperl hash keys
101 %MODEMAP = ('MegaBlastOutput' => 'result',
102 'Hit' => 'hit',
103 'Hsp' => 'hsp'
106 # This should really be done more intelligently, like with
107 # XSLT
109 %MAPPING =
111 'Hsp_query-from' => 'HSP-query_start',
112 'Hsp_query-to' => 'HSP-query_end',
113 'Hsp_hit-from' => 'HSP-hit_start',
114 'Hsp_hit-to' => 'HSP-hit_end',
115 'Hit_score' => 'HIT-score',
116 'Hsp_score' => 'HSP-score',
118 'Hsp_identity' => 'HSP-identical',
119 'Hsp_positive' => 'HSP-conserved',
121 'Hit_id' => 'HIT-name',
123 'MegaBlastOutput_program' => 'RESULT-algorithm_name',
124 'MegaBlastOutput_query-def'=> 'RESULT-query_name',
128 $DEFAULT_BLAST_WRITER_CLASS = 'Bio::Search::Writer::HitTableWriter';
132 =head2 new
134 Title : new
135 Usage : my $obj = Bio::SearchIO::blast->new();
136 Function: Builds a new Bio::SearchIO::blast object
137 Returns : Bio::SearchIO::blast
138 Args : -fh/-file => filehandle/filename to BLAST file
139 -format => 'blast'
141 =cut
143 sub _initialize {
144 my ($self,@args) = @_;
145 $self->SUPER::_initialize(@args);
146 my ($fmt) = $self->_rearrange([qw(REPORT_FORMAT)], @args);
148 $self->throw("Must provide a value for -report_format when initializing a megablast parser") unless defined $fmt ;
149 $self->report_format($fmt);
150 return 1;
153 =head2 next_result
155 Title : next_result
156 Usage : my $hit = $searchio->next_result;
157 Function: Returns the next Result from a search
158 Returns : Bio::Search::Result::ResultI object
159 Args : none
161 =cut
163 sub next_result{
164 my ($self) = @_;
166 local $/ = "\n";
167 local $_;
169 my $fmt = $self->report_format;
170 my ($lastquery,$lasthit);
171 while( defined($_ = $self->_readline) ) {
172 if( $fmt == 0 ) {
173 if( /^\'(\S+)\'\=\=\'(\+|\-)(\S+)\'\s+
174 \((\d+)\s+(\d+)\s+(\d+)\s+(\d+)\)\s+
175 (\d+)/ox )
177 my ($hit,$strand,$query,
178 $h_start,$q_start,$h_end,$q_end,
179 $score) = ($1,$2,$3,$4,$5,$6,$7,$8);
180 if( ! defined $lastquery ) {
181 $self->start_element({'Name' => 'MegaBlastOutput'});
182 $self->element({'Name' => 'MegaBlastOutput_program',
183 'Data' => 'MEGABLAST'});
184 $self->element({'Name' => 'MegaBlastOutput_query-def',
185 'Data' => $query});
186 } elsif( $lastquery ne $query ) {
187 $self->_pushback($_);
188 $self->end_element({'Name' => 'Hit'}) if( defined $lasthit);
189 $self->end_element({ 'Name' => 'MegaBlastOutput'});
190 $lasthit = undef;
191 $lastquery = undef;
192 return $self->end_document();
195 if( ! defined $lasthit || $lasthit ne $hit ) {
196 $self->end_element({'Name' => 'Hit'}) if( defined $lasthit);
197 $self->start_element({'Name' => 'Hit'});
198 $self->element({'Name' => 'Hit_id',
199 'Data' => $hit});
201 $self->start_element({'Name' => 'Hsp'});
202 $self->element({'Name' => 'Hsp_score',
203 'Data' => $score});
205 # flip flop start/end if strand is < 0
206 # since strandedness is inferred from the query
207 # because of the way it is coded all queries will
208 # be on the forward strand and hits will be either
209 # +/-
211 # also the NCBI docs state:
212 # 0: Produce one-line output for each alignment, in the form
214 # 'subject-id'=='[+-]query-id' (s_off q_off s_end q_end) score
216 # Here subject(query)-id is a gi number, an accession or some other type of
217 # identifier found in the FASTA definition line of the respective sequence.
219 # + or - corresponds to same or different strand alignment.
221 # Score for non-affine gapping parameters means the total number of
222 # differences (mismatches + gap characters). For affine case it is the
223 # actual (raw) score of the alignment.
225 # and yet when rev strand hits are made I see
226 # (MBL 2.2.4)
227 # 'Contig634'=='-503384' (1 7941 321 7620) 19
228 # so the query is on the rev strand and the
229 # subject is on the fwd strand
230 # so I am flip-flopping everything when I see a '-'
231 if( $strand eq '-' ) {
232 ($h_start,$h_end) = ( $h_end,$h_start);
233 ($q_start,$q_end) = ( $q_end,$q_start);
235 $self->element({'Name' => 'Hsp_hit-from',
236 'Data' => $h_start});
237 $self->element({'Name' => 'Hsp_hit-to',
238 'Data' => $h_end});
239 $self->element({'Name' => 'Hsp_query-from',
240 'Data' => $q_start});
241 $self->element({'Name' => 'Hsp_query-to',
242 'Data' => $q_end});
243 # might not be quite right -- need to know length of the HSP
244 my $numid = (abs($q_end - $q_start) - $score);
246 $self->element({'Name' => 'Hsp_identity',
247 'Data' => $numid});
248 $self->element({'Name' => 'Hsp_positive',
249 'Data' => $numid});
251 $self->end_element({'Name' => 'Hsp'});
252 $lasthit = $hit;
253 $lastquery = $query;
254 } else {
255 $self->debug("Unknown line in fmt0 parsing: $_");
259 if( defined $lastquery && $fmt == 0 ) {
260 $self->end_element({'Name' => 'Hit'}) if( defined $lasthit);
261 $self->end_element({ 'Name' => 'MegaBlastOutput'});
262 return $self->end_document();
264 return 0;
267 =head2 report_format
269 Title : report_format
270 Usage : $obj->report_format($newval)
271 Function: Get/Set the report_format value
272 Returns : value of report_format (a scalar)
273 Args : on set, new value (a scalar or undef, optional)
276 =cut
278 sub report_format{
279 my $self = shift;
280 return $self->{'_report_format'} = shift if @_;
281 return $self->{'_report_format'};
285 =head2 start_element
287 Title : start_element
288 Usage : $eventgenerator->start_element
289 Function: Handles a start element event
290 Returns : none
291 Args : hashref with at least 2 keys 'Data' and 'Name'
294 =cut
296 sub start_element{
297 my ($self,$data) = @_;
298 # we currently do not care about attributes
299 my $nm = $data->{'Name'};
300 if( my $type = $MODEMAP{$nm} ) {
301 $self->_mode($type);
302 if( $self->_eventHandler->will_handle($type) ) {
303 my $func = sprintf("start_%s",lc $type);
304 $self->_eventHandler->$func($data->{'Attributes'});
306 unshift @{$self->{'_elements'}}, $type;
309 if($nm eq 'MegaBlastOutput') {
310 $self->{'_values'} = {};
311 $self->{'_result'}= undef;
312 $self->{'_mode'} = '';
317 =head2 end_element
319 Title : start_element
320 Usage : $eventgenerator->end_element
321 Function: Handles an end element event
322 Returns : none
323 Args : hashref with at least 2 keys 'Data' and 'Name'
326 =cut
328 sub end_element {
329 my ($self,$data) = @_;
330 my $nm = $data->{'Name'};
331 my $rc;
333 if( my $type = $MODEMAP{$nm} ) {
334 if( $self->_eventHandler->will_handle($type) ) {
335 my $func = sprintf("end_%s",lc $type);
336 $rc = $self->_eventHandler->$func($self->{'_reporttype'},
337 $self->{'_values'});
339 shift @{$self->{'_elements'}};
341 } elsif( $MAPPING{$nm} ) {
342 if ( ref($MAPPING{$nm}) =~ /hash/i ) {
343 my $key = (keys %{$MAPPING{$nm}})[0];
344 $self->{'_values'}->{$key}->{$MAPPING{$nm}->{$key}} = $self->{'_last_data'};
345 } else {
346 $self->{'_values'}->{$MAPPING{$nm}} = $self->{'_last_data'};
348 } else {
349 $self->warn( "unknown nm $nm ignoring\n");
351 $self->{'_last_data'} = ''; # remove read data if we are at
352 # end of an element
353 $self->{'_result'} = $rc if( $nm eq 'MegaBlastOutput' );
354 return $rc;
358 =head2 element
360 Title : element
361 Usage : $eventhandler->element({'Name' => $name, 'Data' => $str});
362 Function: Convience method that calls start_element, characters, end_element
363 Returns : none
364 Args : Hash ref with the keys 'Name' and 'Data'
367 =cut
369 sub element{
370 my ($self,$data) = @_;
371 $self->start_element($data);
372 $self->characters($data);
373 $self->end_element($data);
377 =head2 characters
379 Title : characters
380 Usage : $eventgenerator->characters($str)
381 Function: Send a character events
382 Returns : none
383 Args : string
386 =cut
388 sub characters{
389 my ($self,$data) = @_;
390 return unless defined $data->{'Data'};
391 $self->{'_last_data'} = $data->{'Data'};
394 =head2 _mode
396 Title : _mode
397 Usage : $obj->_mode($newval)
398 Function:
399 Example :
400 Returns : value of _mode
401 Args : newvalue (optional)
404 =cut
406 sub _mode{
407 my ($self,$value) = @_;
408 if( defined $value) {
409 $self->{'_mode'} = $value;
411 return $self->{'_mode'};
414 =head2 within_element
416 Title : within_element
417 Usage : if( $eventgenerator->within_element($element) ) {}
418 Function: Test if we are within a particular element
419 This is different than 'in' because within can be tested
420 for a whole block.
421 Returns : boolean
422 Args : string element name
425 =cut
427 sub within_element{
428 my ($self,$name) = @_;
429 return 0 if ( ! defined $name &&
430 ! defined $self->{'_elements'} ||
431 scalar @{$self->{'_elements'}} == 0) ;
432 foreach ( @{$self->{'_elements'}} ) {
433 if( $_ eq $name ) {
434 return 1;
437 return 0;
440 =head2 in_element
442 Title : in_element
443 Usage : if( $eventgenerator->in_element($element) ) {}
444 Function: Test if we are in a particular element
445 This is different than 'in' because within can be tested
446 for a whole block.
447 Returns : boolean
448 Args : string element name
451 =cut
453 sub in_element{
454 my ($self,$name) = @_;
455 return 0 if ! defined $self->{'_elements'}->[0];
456 return ( $self->{'_elements'}->[0] eq $name)
460 =head2 start_document
462 Title : start_document
463 Usage : $eventgenerator->start_document
464 Function: Handles a start document event
465 Returns : none
466 Args : none
469 =cut
471 sub start_document{
472 my ($self) = @_;
473 $self->{'_lasttype'} = '';
474 $self->{'_values'} = {};
475 $self->{'_result'}= undef;
476 $self->{'_mode'} = '';
477 $self->{'_elements'} = [];
481 =head2 end_document
483 Title : end_document
484 Usage : $eventgenerator->end_document
485 Function: Handles an end document event
486 Returns : Bio::Search::Result::ResultI object
487 Args : none
490 =cut
492 sub end_document{
493 my ($self,@args) = @_;
494 return $self->{'_result'};
497 =head2 result_count
499 Title : result_count
500 Usage : my $count = $searchio->result_count
501 Function: Returns the number of results we have processed
502 Returns : integer
503 Args : none
506 =cut
508 sub result_count {
509 my $self = shift;
510 return $self->{'_result_count'};
513 sub report_count { shift->result_count }