bug 2549; fixed small bug in Bio::Taxon which doesn't catch -common_name
[bioperl-live.git] / Bio / SearchIO / waba.pm
blobfdefee587101206b36bb759b362a3a39492b4a9d
1 # $Id$
3 # BioPerl module for Bio::SearchIO::waba
5 # Cared for by Jason Stajich <jason@bioperl.org>
7 # Copyright Jason Stajich
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 Bio::SearchIO::waba - SearchIO parser for Jim Kent WABA program
16 alignment output
18 =head1 SYNOPSIS
20 # do not use this object directly, rather through Bio::SearchIO
22 use Bio::SearchIO;
23 my $in = Bio::SearchIO->new(-format => 'waba',
24 -file => 'output.wab');
25 while( my $result = $in->next_result ) {
26 while( my $hit = $result->next_hit ) {
27 while( my $hsp = $result->next_hsp ) {
33 =head1 DESCRIPTION
35 This parser will process the waba output (NOT the human readable format).
37 =head1 FEEDBACK
39 =head2 Mailing Lists
41 User feedback is an integral part of the evolution of this and other
42 Bioperl modules. Send your comments and suggestions preferably to
43 the Bioperl mailing list. Your participation is much appreciated.
45 bioperl-l@bioperl.org - General discussion
46 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
48 =head2 Reporting Bugs
50 Report bugs to the Bioperl bug tracking system to help us keep track
51 of the bugs and their resolution. Bug reports can be submitted via the
52 web:
54 http://bugzilla.open-bio.org/
56 =head1 AUTHOR - Jason Stajich
58 Email jason-at-bioperl.org
60 =head1 APPENDIX
62 The rest of the documentation details each of the object methods.
63 Internal methods are usually preceded with a _
65 =cut
68 # Let the code begin...
71 package Bio::SearchIO::waba;
72 use vars qw(%MODEMAP %MAPPING @STATES);
73 use strict;
75 # Object preamble - inherits from Bio::Root::Root
77 use Bio::Search::Result::ResultFactory;
78 use Bio::Search::HSP::HSPFactory;
80 use POSIX;
82 BEGIN {
83 # mapping of NCBI Blast terms to Bioperl hash keys
84 %MODEMAP = ('WABAOutput' => 'result',
85 'Hit' => 'hit',
86 'Hsp' => 'hsp'
88 @STATES = qw(Hsp_qseq Hsp_hseq Hsp_stateseq);
89 %MAPPING =
91 'Hsp_query-from'=> 'HSP-query_start',
92 'Hsp_query-to' => 'HSP-query_end',
93 'Hsp_hit-from' => 'HSP-hit_start',
94 'Hsp_hit-to' => 'HSP-hit_end',
95 'Hsp_qseq' => 'HSP-query_seq',
96 'Hsp_hseq' => 'HSP-hit_seq',
97 'Hsp_midline' => 'HSP-homology_seq',
98 'Hsp_stateseq' => 'HSP-hmmstate_seq',
99 'Hsp_align-len' => 'HSP-hsp_length',
101 'Hit_id' => 'HIT-name',
102 'Hit_accession' => 'HIT-accession',
104 'WABAOutput_program' => 'RESULT-algorithm_name',
105 'WABAOutput_version' => 'RESULT-algorithm_version',
106 'WABAOutput_query-def'=> 'RESULT-query_name',
107 'WABAOutput_query-db' => 'RESULT-query_database',
108 'WABAOutput_db' => 'RESULT-database_name',
113 use base qw(Bio::SearchIO);
115 =head2 new
117 Title : new
118 Usage : my $obj = Bio::SearchIO::waba->new();
119 Function: Builds a new Bio::SearchIO::waba object
120 Returns : Bio::SearchIO::waba
121 Args : see Bio::SearchIO
123 =cut
125 sub _initialize {
126 my ($self,@args) = @_;
127 $self->SUPER::_initialize(@args);
128 $self->_eventHandler->register_factory('result', Bio::Search::Result::ResultFactory->new(-type => 'Bio::Search::Result::WABAResult'));
130 $self->_eventHandler->register_factory('hsp', Bio::Search::HSP::HSPFactory->new(-type => 'Bio::Search::HSP::WABAHSP'));
134 =head2 next_result
136 Title : next_result
137 Usage : my $hit = $searchio->next_result;
138 Function: Returns the next Result from a search
139 Returns : Bio::Search::Result::ResultI object
140 Args : none
142 =cut
144 sub next_result{
145 my ($self) = @_;
146 local $/ = "\n";
147 local $_;
149 my ($curquery,$curhit);
150 my $state = -1;
151 $self->start_document();
152 my @hit_signifs;
153 while( defined ($_ = $self->_readline )) {
155 if( $state == -1 ) {
156 my ($qid, $qhspid,$qpercent, $junk,
157 $alnlen,$qdb,$qacc,$qstart,$qend,$qstrand,
158 $hitdb,$hacc,$hstart,$hend,
159 $hstrand) =
160 ( /^(\S+)\.(\S+)\s+align\s+ # get the queryid
161 (\d+(\.\d+)?)\%\s+ # get the percentage
162 of\s+(\d+)\s+ # get the length of the alignment
163 (\S+)\s+ # this is the query database
164 (\S+):(\-?\d+)\-(\-?\d+) # The accession:start-end for query
165 \s+([\-\+]) # query strand
166 \s+(\S+)\. # hit db
167 (\S+):(\-?\d+)\-(\-?\d+) # The accession:start-end for hit
168 \s+([\-\+])\s*$ # hit strand
169 /ox );
171 # Curses. Jim's code is 0 based, the following is to readjust
172 if( $hstart < 0 ) { $hstart *= -1}
173 if( $hend < 0 ) { $hend *= -1}
174 if( $qstart < 0 ) { $qstart *= -1}
175 if( $qend < 0 ) { $qend *= -1}
176 $hstart++; $hend++; $qstart++; $qend++;
177 if( ! defined $alnlen ) {
178 $self->warn("Unable to parse the rest of the WABA alignment info for: '$_'");
179 last;
181 $self->{'_reporttype'} = 'WABA'; # hardcoded - only
182 # one type of WABA AFAIK
183 if( defined $curquery &&
184 $curquery ne $qid ) {
185 $self->end_element({'Name' => 'Hit'});
186 $self->_pushback($_);
187 $self->end_element({'Name' => 'WABAOutput'});
188 return $self->end_document();
191 if( defined $curhit &&
192 $curhit ne $hacc) {
193 # slight duplication here -- keep these in SYNC
194 $self->end_element({'Name' => 'Hit'});
195 $self->start_element({'Name' => 'Hit'});
196 $self->element({'Name' => 'Hit_id',
197 'Data' => $hacc});
198 $self->element({'Name' => 'Hit_accession',
199 'Data' => $hacc});
201 } elsif ( ! defined $curquery ) {
202 $self->start_element({'Name' => 'WABAOutput'});
203 $self->{'_result_count'}++;
204 $self->element({'Name' => 'WABAOutput_query-def',
205 'Data' => $qid });
206 $self->element({'Name' => 'WABAOutput_program',
207 'Data' => 'WABA'});
208 $self->element({'Name' => 'WABAOutput_query-db',
209 'Data' => $qdb});
210 $self->element({'Name' => 'WABAOutput_db',
211 'Data' => $hitdb});
213 # slight duplication here -- keep these N'SYNC ;-)
214 $self->start_element({'Name' => 'Hit'});
215 $self->element({'Name' => 'Hit_id',
216 'Data' => $hacc});
217 $self->element({'Name' => 'Hit_accession',
218 'Data' => $hacc});
222 # strand is inferred by start,end values
223 # in the Result Builder
224 if( $qstrand eq '-' ) {
225 ($qstart,$qend) = ($qend,$qstart);
227 if( $hstrand eq '-' ) {
228 ($hstart,$hend) = ($hend,$hstart);
231 $self->start_element({'Name' => 'Hsp'});
232 $self->element({'Name' => 'Hsp_query-from',
233 'Data' => $qstart});
234 $self->element({'Name' => 'Hsp_query-to',
235 'Data' => $qend});
236 $self->element({'Name' => 'Hsp_hit-from',
237 'Data' => $hstart});
238 $self->element({'Name' => 'Hsp_hit-to',
239 'Data' => $hend});
240 $self->element({'Name' => 'Hsp_align-len',
241 'Data' => $alnlen});
243 $curquery = $qid;
244 $curhit = $hacc;
245 $state = 0;
246 } elsif( ! defined $curquery ) {
247 $self->warn("skipping because no Hit begin line was recognized\n$_") if( $_ !~ /^\s+$/ );
248 next;
249 } else {
250 chomp;
251 $self->element({'Name' => $STATES[$state++],
252 'Data' => $_});
253 if( $state >= scalar @STATES ) {
254 $state = -1;
255 $self->end_element({'Name' => 'Hsp'});
259 if( defined $curquery ) {
260 $self->end_element({'Name' => 'Hit'});
261 $self->end_element({'Name' => 'WABAOutput'});
262 return $self->end_document();
264 return;
267 =head2 start_element
269 Title : start_element
270 Usage : $eventgenerator->start_element
271 Function: Handles a start element event
272 Returns : none
273 Args : hashref with at least 2 keys 'Data' and 'Name'
276 =cut
278 sub start_element{
279 my ($self,$data) = @_;
280 # we currently don't care about attributes
281 my $nm = $data->{'Name'};
282 if( my $type = $MODEMAP{$nm} ) {
283 $self->_mode($type);
284 if( $self->_eventHandler->will_handle($type) ) {
285 my $func = sprintf("start_%s",lc $type);
286 $self->_eventHandler->$func($data->{'Attributes'});
288 unshift @{$self->{'_elements'}}, $type;
290 if($nm eq 'WABAOutput') {
291 $self->{'_values'} = {};
292 $self->{'_result'}= undef;
293 $self->{'_mode'} = '';
298 =head2 end_element
300 Title : start_element
301 Usage : $eventgenerator->end_element
302 Function: Handles an end element event
303 Returns : none
304 Args : hashref with at least 2 keys 'Data' and 'Name'
307 =cut
309 sub end_element {
310 my ($self,$data) = @_;
311 my $nm = $data->{'Name'};
312 my $rc;
313 # Hsp are sort of weird, in that they end when another
314 # object begins so have to detect this in end_element for now
315 if( $nm eq 'Hsp' ) {
316 foreach ( qw(Hsp_qseq Hsp_midline Hsp_hseq) ) {
317 $self->element({'Name' => $_,
318 'Data' => $self->{'_last_hspdata'}->{$_}});
320 $self->{'_last_hspdata'} = {}
323 if( my $type = $MODEMAP{$nm} ) {
324 if( $self->_eventHandler->will_handle($type) ) {
325 my $func = sprintf("end_%s",lc $type);
326 $rc = $self->_eventHandler->$func($self->{'_reporttype'},
327 $self->{'_values'});
329 shift @{$self->{'_elements'}};
331 } elsif( $MAPPING{$nm} ) {
332 if ( ref($MAPPING{$nm}) =~ /hash/i ) {
333 my $key = (keys %{$MAPPING{$nm}})[0];
334 $self->{'_values'}->{$key}->{$MAPPING{$nm}->{$key}} = $self->{'_last_data'};
335 } else {
336 $self->{'_values'}->{$MAPPING{$nm}} = $self->{'_last_data'};
338 } else {
339 $self->warn( "unknown nm $nm ignoring\n");
341 $self->{'_last_data'} = ''; # remove read data if we are at
342 # end of an element
343 $self->{'_result'} = $rc if( $nm eq 'WABAOutput' );
344 return $rc;
348 =head2 element
350 Title : element
351 Usage : $eventhandler->element({'Name' => $name, 'Data' => $str});
352 Function: Convience method that calls start_element, characters, end_element
353 Returns : none
354 Args : Hash ref with the keys 'Name' and 'Data'
357 =cut
359 sub element{
360 my ($self,$data) = @_;
361 $self->start_element($data);
362 $self->characters($data);
363 $self->end_element($data);
367 =head2 characters
369 Title : characters
370 Usage : $eventgenerator->characters($str)
371 Function: Send a character events
372 Returns : none
373 Args : string
376 =cut
378 sub characters{
379 my ($self,$data) = @_;
381 return unless ( defined $data->{'Data'} );
382 if( $data->{'Data'} =~ /^\s+$/ ) {
383 return unless $data->{'Name'} =~ /Hsp\_(midline|qseq|hseq)/;
386 if( $self->in_element('hsp') &&
387 $data->{'Name'} =~ /Hsp\_(qseq|hseq|midline)/ ) {
389 $self->{'_last_hspdata'}->{$data->{'Name'}} .= $data->{'Data'};
392 $self->{'_last_data'} = $data->{'Data'};
395 =head2 _mode
397 Title : _mode
398 Usage : $obj->_mode($newval)
399 Function:
400 Example :
401 Returns : value of _mode
402 Args : newvalue (optional)
405 =cut
407 sub _mode{
408 my ($self,$value) = @_;
409 if( defined $value) {
410 $self->{'_mode'} = $value;
412 return $self->{'_mode'};
415 =head2 within_element
417 Title : within_element
418 Usage : if( $eventgenerator->within_element($element) ) {}
419 Function: Test if we are within a particular element
420 This is different than 'in' because within can be tested
421 for a whole block.
422 Returns : boolean
423 Args : string element name
426 =cut
428 sub within_element{
429 my ($self,$name) = @_;
430 return 0 if ( ! defined $name &&
431 ! defined $self->{'_elements'} ||
432 scalar @{$self->{'_elements'}} == 0) ;
433 foreach ( @{$self->{'_elements'}} ) {
434 if( $_ eq $name ) {
435 return 1;
438 return 0;
441 =head2 in_element
443 Title : in_element
444 Usage : if( $eventgenerator->in_element($element) ) {}
445 Function: Test if we are in a particular element
446 This is different than 'in' because within can be tested
447 for a whole block.
448 Returns : boolean
449 Args : string element name
452 =cut
454 sub in_element{
455 my ($self,$name) = @_;
456 return 0 if ! defined $self->{'_elements'}->[0];
457 return ( $self->{'_elements'}->[0] eq $name)
461 =head2 start_document
463 Title : start_document
464 Usage : $eventgenerator->start_document
465 Function: Handles a start document event
466 Returns : none
467 Args : none
470 =cut
472 sub start_document{
473 my ($self) = @_;
474 $self->{'_lasttype'} = '';
475 $self->{'_values'} = {};
476 $self->{'_result'}= undef;
477 $self->{'_mode'} = '';
478 $self->{'_elements'} = [];
482 =head2 end_document
484 Title : end_document
485 Usage : $eventgenerator->end_document
486 Function: Handles an end document event
487 Returns : Bio::Search::Result::ResultI object
488 Args : none
491 =cut
493 sub end_document{
494 my ($self,@args) = @_;
495 return $self->{'_result'};
498 =head2 result_count
500 Title : result_count
501 Usage : my $count = $searchio->result_count
502 Function: Returns the number of results we have processed
503 Returns : integer
504 Args : none
507 =cut
509 sub result_count {
510 my $self = shift;
511 return $self->{'_result_count'};
514 sub report_count { shift->result_count }