tag fourth (and hopefully last) alpha
[bioperl-live.git] / branch-1-6 / Bio / SearchIO / megablast.pm
blob1ffe024cca057b833bcb30767c2eb24297fe6f96
1 # $Id$
3 # BioPerl module for Bio::SearchIO::megablast
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Jason Stajich <jason@bioperl.org>
9 # Copyright Jason Stajich
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
15 =head1 NAME
17 Bio::SearchIO::megablast - a driver module for Bio::SearchIO to parse
18 megablast reports (format 0)
20 =head1 SYNOPSIS
22 # do not use this module directly
24 use Bio::SearchIO;
25 # for default format output from megablast
26 my $in = Bio::SearchIO->new(-file => 'file.mbl',
27 -format => 'megablast',
28 -report_format => 0);
30 while( my $r = $in->next_result ) {
31 while( my $hit = $r->next_hit ) {
32 while( my $hsp = $hit->next_hsp ) {
37 =head1 DESCRIPTION
39 Beware!
41 Because of the way megablast report format 0 is coded, realize that score
42 means # gap characters + # mismatches for a HSP.
44 The docs from NCBI regarding FORMAT 0
45 # 0: Produce one-line output for each alignment, in the form
47 # 'subject-id'=='[+-]query-id' (s_off q_off s_end q_end) score
49 # Here subject(query)-id is a gi number, an accession or some other type of
50 # identifier found in the FASTA definition line of the respective sequence.
52 # + or - corresponds to same or different strand alignment.
54 # Score for non-affine gapping parameters means the total number of
55 # differences (mismatches + gap characters). For affine case it is the
56 # actual (raw) score of the alignment.
58 FORMAT 1 parsing has not been implemented
59 FORMAT 2 parsing should work with the SearchIO 'blast' parser
61 =head1 FEEDBACK
63 =head2 Mailing Lists
65 User feedback is an integral part of the evolution of this and other
66 Bioperl modules. Send your comments and suggestions preferably to
67 the Bioperl mailing list. Your participation is much appreciated.
69 bioperl-l@bioperl.org - General discussion
70 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
72 =head2 Support
74 Please direct usage questions or support issues to the mailing list:
76 I<bioperl-l@bioperl.org>
78 rather than to the module maintainer directly. Many experienced and
79 reponsive experts will be able look at the problem and quickly
80 address it. Please include a thorough description of the problem
81 with code and data examples if at all possible.
83 =head2 Reporting Bugs
85 Report bugs to the Bioperl bug tracking system to help us keep track
86 of the bugs and their resolution. Bug reports can be submitted via the
87 web:
89 http://bugzilla.open-bio.org/
91 =head1 AUTHOR - Jason Stajich
93 Email jason-at-bioperl.org
95 =head1 APPENDIX
97 The rest of the documentation details each of the object methods.
98 Internal methods are usually preceded with a _
100 =cut
103 # Let the code begin...
106 package Bio::SearchIO::megablast;
107 use strict;
108 use vars qw(%MAPPING %MODEMAP $DEFAULT_BLAST_WRITER_CLASS);
110 use base qw(Bio::SearchIO);
112 BEGIN {
113 # mapping of MegaBlast terms to Bioperl hash keys
114 %MODEMAP = ('MegaBlastOutput' => 'result',
115 'Hit' => 'hit',
116 'Hsp' => 'hsp'
119 # This should really be done more intelligently, like with
120 # XSLT
122 %MAPPING =
124 'Hsp_query-from' => 'HSP-query_start',
125 'Hsp_query-to' => 'HSP-query_end',
126 'Hsp_hit-from' => 'HSP-hit_start',
127 'Hsp_hit-to' => 'HSP-hit_end',
128 'Hit_score' => 'HIT-score',
129 'Hsp_score' => 'HSP-score',
131 'Hsp_identity' => 'HSP-identical',
132 'Hsp_positive' => 'HSP-conserved',
134 'Hit_id' => 'HIT-name',
136 'MegaBlastOutput_program' => 'RESULT-algorithm_name',
137 'MegaBlastOutput_query-def'=> 'RESULT-query_name',
141 $DEFAULT_BLAST_WRITER_CLASS = 'Bio::Search::Writer::HitTableWriter';
145 =head2 new
147 Title : new
148 Usage : my $obj = Bio::SearchIO::blast->new();
149 Function: Builds a new Bio::SearchIO::blast object
150 Returns : Bio::SearchIO::blast
151 Args : -fh/-file => filehandle/filename to BLAST file
152 -format => 'blast'
154 =cut
156 sub _initialize {
157 my ($self,@args) = @_;
158 $self->SUPER::_initialize(@args);
159 my ($fmt) = $self->_rearrange([qw(REPORT_FORMAT)], @args);
161 $self->throw("Must provide a value for -report_format when initializing a megablast parser") unless defined $fmt ;
162 $self->report_format($fmt);
163 return 1;
166 =head2 next_result
168 Title : next_result
169 Usage : my $hit = $searchio->next_result;
170 Function: Returns the next Result from a search
171 Returns : Bio::Search::Result::ResultI object
172 Args : none
174 =cut
176 sub next_result{
177 my ($self) = @_;
179 local $/ = "\n";
180 local $_;
182 my $fmt = $self->report_format;
183 my ($lastquery,$lasthit);
184 while( defined($_ = $self->_readline) ) {
185 if( $fmt == 0 ) {
186 if( /^\'(\S+)\'\=\=\'(\+|\-)(\S+)\'\s+
187 \((\d+)\s+(\d+)\s+(\d+)\s+(\d+)\)\s+
188 (\d+)/ox )
190 my ($hit,$strand,$query,
191 $h_start,$q_start,$h_end,$q_end,
192 $score) = ($1,$2,$3,$4,$5,$6,$7,$8);
193 if( ! defined $lastquery ) {
194 $self->start_element({'Name' => 'MegaBlastOutput'});
195 $self->element({'Name' => 'MegaBlastOutput_program',
196 'Data' => 'MEGABLAST'});
197 $self->element({'Name' => 'MegaBlastOutput_query-def',
198 'Data' => $query});
199 } elsif( $lastquery ne $query ) {
200 $self->_pushback($_);
201 $self->end_element({'Name' => 'Hit'}) if( defined $lasthit);
202 $self->end_element({ 'Name' => 'MegaBlastOutput'});
203 $lasthit = undef;
204 $lastquery = undef;
205 return $self->end_document();
208 if( ! defined $lasthit || $lasthit ne $hit ) {
209 $self->end_element({'Name' => 'Hit'}) if( defined $lasthit);
210 $self->start_element({'Name' => 'Hit'});
211 $self->element({'Name' => 'Hit_id',
212 'Data' => $hit});
214 $self->start_element({'Name' => 'Hsp'});
215 $self->element({'Name' => 'Hsp_score',
216 'Data' => $score});
218 # flip flop start/end if strand is < 0
219 # since strandedness is inferred from the query
220 # because of the way it is coded all queries will
221 # be on the forward strand and hits will be either
222 # +/-
224 # also the NCBI docs state:
225 # 0: Produce one-line output for each alignment, in the form
227 # 'subject-id'=='[+-]query-id' (s_off q_off s_end q_end) score
229 # Here subject(query)-id is a gi number, an accession or some other type of
230 # identifier found in the FASTA definition line of the respective sequence.
232 # + or - corresponds to same or different strand alignment.
234 # Score for non-affine gapping parameters means the total number of
235 # differences (mismatches + gap characters). For affine case it is the
236 # actual (raw) score of the alignment.
238 # and yet when rev strand hits are made I see
239 # (MBL 2.2.4)
240 # 'Contig634'=='-503384' (1 7941 321 7620) 19
241 # so the query is on the rev strand and the
242 # subject is on the fwd strand
243 # so I am flip-flopping everything when I see a '-'
244 if( $strand eq '-' ) {
245 ($h_start,$h_end) = ( $h_end,$h_start);
246 ($q_start,$q_end) = ( $q_end,$q_start);
248 $self->element({'Name' => 'Hsp_hit-from',
249 'Data' => $h_start});
250 $self->element({'Name' => 'Hsp_hit-to',
251 'Data' => $h_end});
252 $self->element({'Name' => 'Hsp_query-from',
253 'Data' => $q_start});
254 $self->element({'Name' => 'Hsp_query-to',
255 'Data' => $q_end});
256 # might not be quite right -- need to know length of the HSP
257 my $numid = (abs($q_end - $q_start) - $score);
259 $self->element({'Name' => 'Hsp_identity',
260 'Data' => $numid});
261 $self->element({'Name' => 'Hsp_positive',
262 'Data' => $numid});
264 $self->end_element({'Name' => 'Hsp'});
265 $lasthit = $hit;
266 $lastquery = $query;
267 } else {
268 $self->debug("Unknown line in fmt0 parsing: $_");
272 if( defined $lastquery && $fmt == 0 ) {
273 $self->end_element({'Name' => 'Hit'}) if( defined $lasthit);
274 $self->end_element({ 'Name' => 'MegaBlastOutput'});
275 return $self->end_document();
277 return 0;
280 =head2 report_format
282 Title : report_format
283 Usage : $obj->report_format($newval)
284 Function: Get/Set the report_format value
285 Returns : value of report_format (a scalar)
286 Args : on set, new value (a scalar or undef, optional)
289 =cut
291 sub report_format{
292 my $self = shift;
293 return $self->{'_report_format'} = shift if @_;
294 return $self->{'_report_format'};
298 =head2 start_element
300 Title : start_element
301 Usage : $eventgenerator->start_element
302 Function: Handles a start element event
303 Returns : none
304 Args : hashref with at least 2 keys 'Data' and 'Name'
307 =cut
309 sub start_element{
310 my ($self,$data) = @_;
311 # we currently do not care about attributes
312 my $nm = $data->{'Name'};
313 if( my $type = $MODEMAP{$nm} ) {
314 $self->_mode($type);
315 if( $self->_eventHandler->will_handle($type) ) {
316 my $func = sprintf("start_%s",lc $type);
317 $self->_eventHandler->$func($data->{'Attributes'});
319 unshift @{$self->{'_elements'}}, $type;
322 if($nm eq 'MegaBlastOutput') {
323 $self->{'_values'} = {};
324 $self->{'_result'}= undef;
325 $self->{'_mode'} = '';
330 =head2 end_element
332 Title : start_element
333 Usage : $eventgenerator->end_element
334 Function: Handles an end element event
335 Returns : none
336 Args : hashref with at least 2 keys 'Data' and 'Name'
339 =cut
341 sub end_element {
342 my ($self,$data) = @_;
343 my $nm = $data->{'Name'};
344 my $rc;
346 if( my $type = $MODEMAP{$nm} ) {
347 if( $self->_eventHandler->will_handle($type) ) {
348 my $func = sprintf("end_%s",lc $type);
349 $rc = $self->_eventHandler->$func($self->{'_reporttype'},
350 $self->{'_values'});
352 shift @{$self->{'_elements'}};
354 } elsif( $MAPPING{$nm} ) {
355 if ( ref($MAPPING{$nm}) =~ /hash/i ) {
356 my $key = (keys %{$MAPPING{$nm}})[0];
357 $self->{'_values'}->{$key}->{$MAPPING{$nm}->{$key}} = $self->{'_last_data'};
358 } else {
359 $self->{'_values'}->{$MAPPING{$nm}} = $self->{'_last_data'};
361 } else {
362 $self->warn( "unknown nm $nm ignoring\n");
364 $self->{'_last_data'} = ''; # remove read data if we are at
365 # end of an element
366 $self->{'_result'} = $rc if( $nm eq 'MegaBlastOutput' );
367 return $rc;
371 =head2 element
373 Title : element
374 Usage : $eventhandler->element({'Name' => $name, 'Data' => $str});
375 Function: Convience method that calls start_element, characters, end_element
376 Returns : none
377 Args : Hash ref with the keys 'Name' and 'Data'
380 =cut
382 sub element{
383 my ($self,$data) = @_;
384 $self->start_element($data);
385 $self->characters($data);
386 $self->end_element($data);
390 =head2 characters
392 Title : characters
393 Usage : $eventgenerator->characters($str)
394 Function: Send a character events
395 Returns : none
396 Args : string
399 =cut
401 sub characters{
402 my ($self,$data) = @_;
403 return unless defined $data->{'Data'};
404 $self->{'_last_data'} = $data->{'Data'};
407 =head2 _mode
409 Title : _mode
410 Usage : $obj->_mode($newval)
411 Function:
412 Example :
413 Returns : value of _mode
414 Args : newvalue (optional)
417 =cut
419 sub _mode{
420 my ($self,$value) = @_;
421 if( defined $value) {
422 $self->{'_mode'} = $value;
424 return $self->{'_mode'};
427 =head2 within_element
429 Title : within_element
430 Usage : if( $eventgenerator->within_element($element) ) {}
431 Function: Test if we are within a particular element
432 This is different than 'in' because within can be tested
433 for a whole block.
434 Returns : boolean
435 Args : string element name
438 =cut
440 sub within_element{
441 my ($self,$name) = @_;
442 return 0 if ( ! defined $name &&
443 ! defined $self->{'_elements'} ||
444 scalar @{$self->{'_elements'}} == 0) ;
445 foreach ( @{$self->{'_elements'}} ) {
446 if( $_ eq $name ) {
447 return 1;
450 return 0;
453 =head2 in_element
455 Title : in_element
456 Usage : if( $eventgenerator->in_element($element) ) {}
457 Function: Test if we are in a particular element
458 This is different than 'in' because within can be tested
459 for a whole block.
460 Returns : boolean
461 Args : string element name
464 =cut
466 sub in_element{
467 my ($self,$name) = @_;
468 return 0 if ! defined $self->{'_elements'}->[0];
469 return ( $self->{'_elements'}->[0] eq $name)
473 =head2 start_document
475 Title : start_document
476 Usage : $eventgenerator->start_document
477 Function: Handles a start document event
478 Returns : none
479 Args : none
482 =cut
484 sub start_document{
485 my ($self) = @_;
486 $self->{'_lasttype'} = '';
487 $self->{'_values'} = {};
488 $self->{'_result'}= undef;
489 $self->{'_mode'} = '';
490 $self->{'_elements'} = [];
494 =head2 end_document
496 Title : end_document
497 Usage : $eventgenerator->end_document
498 Function: Handles an end document event
499 Returns : Bio::Search::Result::ResultI object
500 Args : none
503 =cut
505 sub end_document{
506 my ($self,@args) = @_;
507 return $self->{'_result'};
510 =head2 result_count
512 Title : result_count
513 Usage : my $count = $searchio->result_count
514 Function: Returns the number of results we have processed
515 Returns : integer
516 Args : none
519 =cut
521 sub result_count {
522 my $self = shift;
523 return $self->{'_result_count'};
526 sub report_count { shift->result_count }