sync w/ main trunk
[bioperl-live.git] / Bio / SearchIO / waba.pm
bloba437b9cd3465e0f0268c7706e144914534b98b87
1 # $Id$
3 # BioPerl module for Bio::SearchIO::waba
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Jason Stajich <jason@bioperl.org>
9 # Copyright Jason Stajich
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
15 =head1 NAME
17 Bio::SearchIO::waba - SearchIO parser for Jim Kent WABA program
18 alignment output
20 =head1 SYNOPSIS
22 # do not use this object directly, rather through Bio::SearchIO
24 use Bio::SearchIO;
25 my $in = Bio::SearchIO->new(-format => 'waba',
26 -file => 'output.wab');
27 while( my $result = $in->next_result ) {
28 while( my $hit = $result->next_hit ) {
29 while( my $hsp = $result->next_hsp ) {
35 =head1 DESCRIPTION
37 This parser will process the waba output (NOT the human readable format).
39 =head1 FEEDBACK
41 =head2 Mailing Lists
43 User feedback is an integral part of the evolution of this and other
44 Bioperl modules. Send your comments and suggestions preferably to
45 the Bioperl mailing list. Your participation is much appreciated.
47 bioperl-l@bioperl.org - General discussion
48 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
50 =head2 Support
52 Please direct usage questions or support issues to the mailing list:
54 L<bioperl-l@bioperl.org>
56 rather than to the module maintainer directly. Many experienced and
57 reponsive experts will be able look at the problem and quickly
58 address it. Please include a thorough description of the problem
59 with code and data examples if at all possible.
61 =head2 Reporting Bugs
63 Report bugs to the Bioperl bug tracking system to help us keep track
64 of the bugs and their resolution. Bug reports can be submitted via the
65 web:
67 http://bugzilla.open-bio.org/
69 =head1 AUTHOR - Jason Stajich
71 Email jason-at-bioperl.org
73 =head1 APPENDIX
75 The rest of the documentation details each of the object methods.
76 Internal methods are usually preceded with a _
78 =cut
81 # Let the code begin...
84 package Bio::SearchIO::waba;
85 use vars qw(%MODEMAP %MAPPING @STATES);
86 use strict;
88 # Object preamble - inherits from Bio::Root::Root
90 use Bio::Search::Result::ResultFactory;
91 use Bio::Search::HSP::HSPFactory;
93 use POSIX;
95 BEGIN {
96 # mapping of NCBI Blast terms to Bioperl hash keys
97 %MODEMAP = ('WABAOutput' => 'result',
98 'Hit' => 'hit',
99 'Hsp' => 'hsp'
101 @STATES = qw(Hsp_qseq Hsp_hseq Hsp_stateseq);
102 %MAPPING =
104 'Hsp_query-from'=> 'HSP-query_start',
105 'Hsp_query-to' => 'HSP-query_end',
106 'Hsp_hit-from' => 'HSP-hit_start',
107 'Hsp_hit-to' => 'HSP-hit_end',
108 'Hsp_qseq' => 'HSP-query_seq',
109 'Hsp_hseq' => 'HSP-hit_seq',
110 'Hsp_midline' => 'HSP-homology_seq',
111 'Hsp_stateseq' => 'HSP-hmmstate_seq',
112 'Hsp_align-len' => 'HSP-hsp_length',
114 'Hit_id' => 'HIT-name',
115 'Hit_accession' => 'HIT-accession',
117 'WABAOutput_program' => 'RESULT-algorithm_name',
118 'WABAOutput_version' => 'RESULT-algorithm_version',
119 'WABAOutput_query-def'=> 'RESULT-query_name',
120 'WABAOutput_query-db' => 'RESULT-query_database',
121 'WABAOutput_db' => 'RESULT-database_name',
126 use base qw(Bio::SearchIO);
128 =head2 new
130 Title : new
131 Usage : my $obj = Bio::SearchIO::waba->new();
132 Function: Builds a new Bio::SearchIO::waba object
133 Returns : Bio::SearchIO::waba
134 Args : see Bio::SearchIO
136 =cut
138 sub _initialize {
139 my ($self,@args) = @_;
140 $self->SUPER::_initialize(@args);
141 $self->_eventHandler->register_factory('result', Bio::Search::Result::ResultFactory->new(-type => 'Bio::Search::Result::WABAResult'));
143 $self->_eventHandler->register_factory('hsp', Bio::Search::HSP::HSPFactory->new(-type => 'Bio::Search::HSP::WABAHSP'));
147 =head2 next_result
149 Title : next_result
150 Usage : my $hit = $searchio->next_result;
151 Function: Returns the next Result from a search
152 Returns : Bio::Search::Result::ResultI object
153 Args : none
155 =cut
157 sub next_result{
158 my ($self) = @_;
159 local $/ = "\n";
160 local $_;
162 my ($curquery,$curhit);
163 my $state = -1;
164 $self->start_document();
165 my @hit_signifs;
166 while( defined ($_ = $self->_readline )) {
168 if( $state == -1 ) {
169 my ($qid, $qhspid,$qpercent, $junk,
170 $alnlen,$qdb,$qacc,$qstart,$qend,$qstrand,
171 $hitdb,$hacc,$hstart,$hend,
172 $hstrand) =
173 ( /^(\S+)\.(\S+)\s+align\s+ # get the queryid
174 (\d+(\.\d+)?)\%\s+ # get the percentage
175 of\s+(\d+)\s+ # get the length of the alignment
176 (\S+)\s+ # this is the query database
177 (\S+):(\-?\d+)\-(\-?\d+) # The accession:start-end for query
178 \s+([\-\+]) # query strand
179 \s+(\S+)\. # hit db
180 (\S+):(\-?\d+)\-(\-?\d+) # The accession:start-end for hit
181 \s+([\-\+])\s*$ # hit strand
182 /ox );
184 # Curses. Jim's code is 0 based, the following is to readjust
185 if( $hstart < 0 ) { $hstart *= -1}
186 if( $hend < 0 ) { $hend *= -1}
187 if( $qstart < 0 ) { $qstart *= -1}
188 if( $qend < 0 ) { $qend *= -1}
189 $hstart++; $hend++; $qstart++; $qend++;
190 if( ! defined $alnlen ) {
191 $self->warn("Unable to parse the rest of the WABA alignment info for: '$_'");
192 last;
194 $self->{'_reporttype'} = 'WABA'; # hardcoded - only
195 # one type of WABA AFAIK
196 if( defined $curquery &&
197 $curquery ne $qid ) {
198 $self->end_element({'Name' => 'Hit'});
199 $self->_pushback($_);
200 $self->end_element({'Name' => 'WABAOutput'});
201 return $self->end_document();
204 if( defined $curhit &&
205 $curhit ne $hacc) {
206 # slight duplication here -- keep these in SYNC
207 $self->end_element({'Name' => 'Hit'});
208 $self->start_element({'Name' => 'Hit'});
209 $self->element({'Name' => 'Hit_id',
210 'Data' => $hacc});
211 $self->element({'Name' => 'Hit_accession',
212 'Data' => $hacc});
214 } elsif ( ! defined $curquery ) {
215 $self->start_element({'Name' => 'WABAOutput'});
216 $self->{'_result_count'}++;
217 $self->element({'Name' => 'WABAOutput_query-def',
218 'Data' => $qid });
219 $self->element({'Name' => 'WABAOutput_program',
220 'Data' => 'WABA'});
221 $self->element({'Name' => 'WABAOutput_query-db',
222 'Data' => $qdb});
223 $self->element({'Name' => 'WABAOutput_db',
224 'Data' => $hitdb});
226 # slight duplication here -- keep these N'SYNC ;-)
227 $self->start_element({'Name' => 'Hit'});
228 $self->element({'Name' => 'Hit_id',
229 'Data' => $hacc});
230 $self->element({'Name' => 'Hit_accession',
231 'Data' => $hacc});
235 # strand is inferred by start,end values
236 # in the Result Builder
237 if( $qstrand eq '-' ) {
238 ($qstart,$qend) = ($qend,$qstart);
240 if( $hstrand eq '-' ) {
241 ($hstart,$hend) = ($hend,$hstart);
244 $self->start_element({'Name' => 'Hsp'});
245 $self->element({'Name' => 'Hsp_query-from',
246 'Data' => $qstart});
247 $self->element({'Name' => 'Hsp_query-to',
248 'Data' => $qend});
249 $self->element({'Name' => 'Hsp_hit-from',
250 'Data' => $hstart});
251 $self->element({'Name' => 'Hsp_hit-to',
252 'Data' => $hend});
253 $self->element({'Name' => 'Hsp_align-len',
254 'Data' => $alnlen});
256 $curquery = $qid;
257 $curhit = $hacc;
258 $state = 0;
259 } elsif( ! defined $curquery ) {
260 $self->warn("skipping because no Hit begin line was recognized\n$_") if( $_ !~ /^\s+$/ );
261 next;
262 } else {
263 chomp;
264 $self->element({'Name' => $STATES[$state++],
265 'Data' => $_});
266 if( $state >= scalar @STATES ) {
267 $state = -1;
268 $self->end_element({'Name' => 'Hsp'});
272 if( defined $curquery ) {
273 $self->end_element({'Name' => 'Hit'});
274 $self->end_element({'Name' => 'WABAOutput'});
275 return $self->end_document();
277 return;
280 =head2 start_element
282 Title : start_element
283 Usage : $eventgenerator->start_element
284 Function: Handles a start element event
285 Returns : none
286 Args : hashref with at least 2 keys 'Data' and 'Name'
289 =cut
291 sub start_element{
292 my ($self,$data) = @_;
293 # we currently don't care about attributes
294 my $nm = $data->{'Name'};
295 if( my $type = $MODEMAP{$nm} ) {
296 $self->_mode($type);
297 if( $self->_eventHandler->will_handle($type) ) {
298 my $func = sprintf("start_%s",lc $type);
299 $self->_eventHandler->$func($data->{'Attributes'});
301 unshift @{$self->{'_elements'}}, $type;
303 if($nm eq 'WABAOutput') {
304 $self->{'_values'} = {};
305 $self->{'_result'}= undef;
306 $self->{'_mode'} = '';
311 =head2 end_element
313 Title : start_element
314 Usage : $eventgenerator->end_element
315 Function: Handles an end element event
316 Returns : none
317 Args : hashref with at least 2 keys 'Data' and 'Name'
320 =cut
322 sub end_element {
323 my ($self,$data) = @_;
324 my $nm = $data->{'Name'};
325 my $rc;
326 # Hsp are sort of weird, in that they end when another
327 # object begins so have to detect this in end_element for now
328 if( $nm eq 'Hsp' ) {
329 foreach ( qw(Hsp_qseq Hsp_midline Hsp_hseq) ) {
330 $self->element({'Name' => $_,
331 'Data' => $self->{'_last_hspdata'}->{$_}});
333 $self->{'_last_hspdata'} = {}
336 if( my $type = $MODEMAP{$nm} ) {
337 if( $self->_eventHandler->will_handle($type) ) {
338 my $func = sprintf("end_%s",lc $type);
339 $rc = $self->_eventHandler->$func($self->{'_reporttype'},
340 $self->{'_values'});
342 shift @{$self->{'_elements'}};
344 } elsif( $MAPPING{$nm} ) {
345 if ( ref($MAPPING{$nm}) =~ /hash/i ) {
346 my $key = (keys %{$MAPPING{$nm}})[0];
347 $self->{'_values'}->{$key}->{$MAPPING{$nm}->{$key}} = $self->{'_last_data'};
348 } else {
349 $self->{'_values'}->{$MAPPING{$nm}} = $self->{'_last_data'};
351 } else {
352 $self->warn( "unknown nm $nm ignoring\n");
354 $self->{'_last_data'} = ''; # remove read data if we are at
355 # end of an element
356 $self->{'_result'} = $rc if( $nm eq 'WABAOutput' );
357 return $rc;
361 =head2 element
363 Title : element
364 Usage : $eventhandler->element({'Name' => $name, 'Data' => $str});
365 Function: Convience method that calls start_element, characters, end_element
366 Returns : none
367 Args : Hash ref with the keys 'Name' and 'Data'
370 =cut
372 sub element{
373 my ($self,$data) = @_;
374 $self->start_element($data);
375 $self->characters($data);
376 $self->end_element($data);
380 =head2 characters
382 Title : characters
383 Usage : $eventgenerator->characters($str)
384 Function: Send a character events
385 Returns : none
386 Args : string
389 =cut
391 sub characters{
392 my ($self,$data) = @_;
394 return unless ( defined $data->{'Data'} );
395 if( $data->{'Data'} =~ /^\s+$/ ) {
396 return unless $data->{'Name'} =~ /Hsp\_(midline|qseq|hseq)/;
399 if( $self->in_element('hsp') &&
400 $data->{'Name'} =~ /Hsp\_(qseq|hseq|midline)/ ) {
402 $self->{'_last_hspdata'}->{$data->{'Name'}} .= $data->{'Data'};
405 $self->{'_last_data'} = $data->{'Data'};
408 =head2 _mode
410 Title : _mode
411 Usage : $obj->_mode($newval)
412 Function:
413 Example :
414 Returns : value of _mode
415 Args : newvalue (optional)
418 =cut
420 sub _mode{
421 my ($self,$value) = @_;
422 if( defined $value) {
423 $self->{'_mode'} = $value;
425 return $self->{'_mode'};
428 =head2 within_element
430 Title : within_element
431 Usage : if( $eventgenerator->within_element($element) ) {}
432 Function: Test if we are within a particular element
433 This is different than 'in' because within can be tested
434 for a whole block.
435 Returns : boolean
436 Args : string element name
439 =cut
441 sub within_element{
442 my ($self,$name) = @_;
443 return 0 if ( ! defined $name &&
444 ! defined $self->{'_elements'} ||
445 scalar @{$self->{'_elements'}} == 0) ;
446 foreach ( @{$self->{'_elements'}} ) {
447 if( $_ eq $name ) {
448 return 1;
451 return 0;
454 =head2 in_element
456 Title : in_element
457 Usage : if( $eventgenerator->in_element($element) ) {}
458 Function: Test if we are in a particular element
459 This is different than 'in' because within can be tested
460 for a whole block.
461 Returns : boolean
462 Args : string element name
465 =cut
467 sub in_element{
468 my ($self,$name) = @_;
469 return 0 if ! defined $self->{'_elements'}->[0];
470 return ( $self->{'_elements'}->[0] eq $name)
474 =head2 start_document
476 Title : start_document
477 Usage : $eventgenerator->start_document
478 Function: Handles a start document event
479 Returns : none
480 Args : none
483 =cut
485 sub start_document{
486 my ($self) = @_;
487 $self->{'_lasttype'} = '';
488 $self->{'_values'} = {};
489 $self->{'_result'}= undef;
490 $self->{'_mode'} = '';
491 $self->{'_elements'} = [];
495 =head2 end_document
497 Title : end_document
498 Usage : $eventgenerator->end_document
499 Function: Handles an end document event
500 Returns : Bio::Search::Result::ResultI object
501 Args : none
504 =cut
506 sub end_document{
507 my ($self,@args) = @_;
508 return $self->{'_result'};
511 =head2 result_count
513 Title : result_count
514 Usage : my $count = $searchio->result_count
515 Function: Returns the number of results we have processed
516 Returns : integer
517 Args : none
520 =cut
522 sub result_count {
523 my $self = shift;
524 return $self->{'_result_count'};
527 sub report_count { shift->result_count }