bug 2549; fixed small bug in Bio::Taxon which doesn't catch -common_name
[bioperl-live.git] / Bio / SearchIO / axt.pm
blob43d599fb43ac4ef009e82e1c957f259d90dcc675
1 # $Id$
3 # BioPerl module for Bio::SearchIO::axt
5 # Cared for by Jason Stajich <jason-at-bioperl.org>
7 # Copyright Jason Stajich
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 Bio::SearchIO::axt - a parser for axt format reports
17 =head1 SYNOPSIS
19 use Bio::SearchIO;
20 my $parser = Bio::SearchIO->new(-format => 'axt',
21 -file => 't/data/report.blastz');
22 while( my $result = $parser->next_result ) {
23 while( my $hit = $result->next_hit ) {
24 while( my $hsp = $hit->next_hsp ) {
29 =head1 DESCRIPTION
31 This is a parser and event-generator for AXT format reports. BLASTZ
32 reports (Schwartz et al,(2003) Genome Research, 13:103-107) are normally
33 in LAV format but are commonly post-processed to AXT format; many precomputed
34 BLASTZ reports, such as those found in the UCSC Genome
35 Browser, are in AXT format. This parser will also parse any
36 AXT format produced from any lav report and directly out of BLAT.
38 =head1 FEEDBACK
40 =head2 Mailing Lists
42 User feedback is an integral part of the evolution of this and other
43 Bioperl modules. Send your comments and suggestions preferably to
44 the Bioperl mailing list. Your participation is much appreciated.
46 bioperl-l@bioperl.org - General discussion
47 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
49 =head2 Reporting Bugs
51 Report bugs to the Bioperl bug tracking system to help us keep track
52 of the bugs and their resolution. Bug reports can be submitted via the
53 web:
55 http://bugzilla.open-bio.org/
57 =head1 AUTHOR - Jason Stajich
59 Email jason-at-bioperl.org
61 =head1 APPENDIX
63 The rest of the documentation details each of the object methods.
64 Internal methods are usually preceded with a _
66 =cut
69 # Let the code begin...
72 package Bio::SearchIO::axt;
73 use vars qw(%MODEMAP %MAPPING @STATES $GAPCHAR);
74 use strict;
76 use Bio::Search::Result::ResultFactory;
77 use Bio::Search::HSP::HSPFactory;
78 use base qw(Bio::SearchIO);
80 use POSIX;
# Lookup tables used by the event generator, filled in at compile time.
BEGIN {
    # Character treated as a gap when counting gaps in alignment lines.
    $GAPCHAR = '-';

    # Structural elements of an AXT report => event-handler mode.
    # start_element/end_element derive "start_<mode>"/"end_<mode>" from these.
    %MODEMAP = (
        'AXTOutput' => 'result',
        'Hit'       => 'hit',
        'Hsp'       => 'hsp',
    );

    # Data element names => key under which end_element stores the last
    # character data in $self->{'_values'}.
    %MAPPING = (
        # HSP coordinates and score
        'Hsp_score'      => 'HSP-score',
        'Hsp_query-from' => 'HSP-query_start',
        'Hsp_query-to'   => 'HSP-query_end',
        'Hsp_hit-from'   => 'HSP-hit_start',
        'Hsp_hit-to'     => 'HSP-hit_end',

        # HSP statistics
        'Hsp_positive'  => 'HSP-conserved',
        'Hsp_identity'  => 'HSP-identical',
        'Hsp_gaps'      => 'HSP-hsp_gaps',
        'Hsp_hitgaps'   => 'HSP-hit_gaps',
        'Hsp_querygaps' => 'HSP-query_gaps',

        # HSP alignment strings
        'Hsp_qseq'      => 'HSP-query_seq',
        'Hsp_hseq'      => 'HSP-hit_seq',
        'Hsp_midline'   => 'HSP-homology_seq',    # ignoring this for now
        'Hsp_align-len' => 'HSP-hsp_length',

        # Hit- and result-level names
        'Hit_id'              => 'HIT-name',
        'AXTOutput_query-def' => 'RESULT-query_name',
    );
}
111 =head2 new
113 Title : new
114 Usage : my $obj = Bio::SearchIO::axt->new();
115 Function: Builds a new Bio::SearchIO::axt object
116 Returns : an instance of Bio::SearchIO::axt
117 Args :
120 =cut
# Constructor: forwards every argument to the parent class constructor and
# returns the object it builds.
sub new {
    my $class = shift;
    my $parser = $class->SUPER::new(@_);
    return $parser;
}
128 =head2 next_result
130 Title : next_result
131 Usage : my $result = $searchio->next_result;
132 Function: Returns the next Result from a search
133 Returns : Bio::Search::Result::ResultI object
134 Args : none
136 =cut
# Read AXT records until the query name changes (or input ends) and return
# the result object produced by the event handler for that query.
#
# Each record is a header line
#   <aln#> <query> <qstart> <qend> <hit> <hstart> <hend> <strand> <score>
# followed by the query alignment line and the hit alignment line.
# Lines that do not match the header pattern are skipped.
#
# Returns whatever end_document() returns once at least one record was
# seen, or nothing (undef / empty list) when no record could be read.
sub next_result {
    my ($self) = @_;
    local $/ = "\n";
    local $_;

    my ( $curquery, $curhit );
    $self->start_document();
    while ( defined( $_ = $self->_readline ) ) {
        next if (/^\s+$/);
        if (
            m/^(\d+)\s+          # alignment number - we'll throw this away anyways
              (\S+)\s+           # Query name
              (\d+)\s+(\d+)\s+   # Query start Query end (always + strand, 0 based)
              (\S+)\s+           # Hit name
              (\d+)\s+(\d+)\s+   # Hit start Hit end (0 based)
              ([\-\+])\s+        # Hit strand
              ([\d\.\-]+)\s+     # Score
             /x
          )
        {
            my (
                undef,   $qname, $qstart,  $qend, $hname,
                $hstart, $hend,  $hstrand, $score
            ) = ( $1, $2, $3, $4, $5, $6, $7, $8, $9 );
            $self->{'_reporttype'} = 'AXT';

            # Jim's code is 0 based
            $qstart++;
            $qend++;
            $hstart++;
            $hend++;

            # A new query name ends the current result; push the header
            # back so the next call starts with this record.
            if ( defined $curquery
                && $curquery ne $qname )
            {
                $self->end_element( { 'Name' => 'Hit' } );
                $self->_pushback($_);
                $self->end_element( { 'Name' => 'AXTOutput' } );
                return $self->end_document();
            }

            if ( defined $curhit
                && $curhit ne $hname )
            {
                # slight duplication here -- keep these in SYNC
                $self->end_element( { 'Name' => 'Hit' } );
                $self->start_element( { 'Name' => 'Hit' } );
                $self->element(
                    {
                        'Name' => 'Hit_id',
                        'Data' => $hname
                    }
                );
            }
            elsif ( !defined $curquery ) {
                # First record of this result: open result and first hit.
                $self->start_element( { 'Name' => 'AXTOutput' } );
                $self->{'_result_count'}++;
                $self->element(
                    {
                        'Name' => 'AXTOutput_query-def',
                        'Data' => $qname
                    }
                );

                $self->start_element( { 'Name' => 'Hit' } );
                $self->element(
                    {
                        'Name' => 'Hit_id',
                        'Data' => $hname
                    }
                );
            }

            $self->start_element( { 'Name' => 'Hsp' } );
            my $queryalign = $self->_readline;
            my $hitalign   = $self->_readline;

            # A header at the very end of the input has no alignment lines;
            # report it and fall back to empty strings instead of running
            # chomp/length on undef.
            if ( !defined $queryalign || !defined $hitalign ) {
                $self->warn( "truncated AXT record for query '$qname' "
                      . "and hit '$hname': missing alignment line(s)\n" );
                $queryalign = '' unless defined $queryalign;
                $hitalign   = '' unless defined $hitalign;
            }
            chomp($queryalign);
            chomp($hitalign);
            my $alnlen = length($queryalign);

            # Count gap characters without rewriting the strings; the
            # list-assignment count is a plain 0 when there are no gaps.
            my $qgapnum   = () = $queryalign =~ /\Q$GAPCHAR\E/g;
            my $hgapnum   = () = $hitalign   =~ /\Q$GAPCHAR\E/g;
            my $totalgaps = $qgapnum + $hgapnum;

            if ( $hstrand eq '-' ) {    # strand gets inferred by start/end
                ( $hstart, $hend ) = ( $hend, $hstart );
            }
            $self->element( { 'Name' => 'Hsp_score', 'Data' => $score } );
            $self->element( { 'Name' => 'Hsp_query-from', 'Data' => $qstart } );
            $self->element( { 'Name' => 'Hsp_query-to', 'Data' => $qend } );
            $self->element( { 'Name' => 'Hsp_hit-from', 'Data' => $hstart } );
            $self->element( { 'Name' => 'Hsp_hit-to', 'Data' => $hend } );
            $self->element( { 'Name' => 'Hsp_gaps', 'Data' => $totalgaps } );
            $self->element( { 'Name' => 'Hsp_querygaps', 'Data' => $qgapnum } );
            $self->element( { 'Name' => 'Hsp_hitgaps', 'Data' => $hgapnum } );

            # NOTE(review): identity and positives are both reported as the
            # alignment length minus all gaps; mismatching columns are not
            # examined here.
            $self->element(
                { 'Name' => 'Hsp_identity', 'Data' => $alnlen - $totalgaps } );
            $self->element(
                { 'Name' => 'Hsp_positive', 'Data' => $alnlen - $totalgaps } );
            $self->element( { 'Name' => 'Hsp_qseq', 'Data' => $queryalign } );
            $self->element( { 'Name' => 'Hsp_hseq', 'Data' => $hitalign } );

            $self->end_element( { 'Name' => 'Hsp' } );
            $curquery = $qname;
            $curhit   = $hname;
        }
    }

    # fence post: close whatever is still open at end of input
    if ( defined $curquery ) {
        $self->end_element( { 'Name' => 'Hit' } );
        $self->end_element( { 'Name' => 'AXTOutput' } );
        return $self->end_document();
    }
    return;
}
# Run the parent initializer, then register the factories the event handler
# uses to build result and HSP objects.
sub _initialize {
    my $self = shift;
    my @args = @_;
    $self->SUPER::_initialize(@args);

    my $result_factory = Bio::Search::Result::ResultFactory->new(
        -type => 'Bio::Search::Result::GenericResult' );
    $self->_eventHandler->register_factory( 'result', $result_factory );

    my $hsp_factory = Bio::Search::HSP::HSPFactory->new(
        -type => 'Bio::Search::HSP::GenericHSP' );
    $self->_eventHandler->register_factory( 'hsp', $hsp_factory );
}
249 =head2 start_element
251 Title : start_element
252 Usage : $eventgenerator->start_element
253 Function: Handles a start element event
254 Returns : none
255 Args : hashref with at least 2 keys 'Data' and 'Name'
258 =cut
# Handle the opening of an element.  For names listed in %MODEMAP the mode
# is recorded, the event handler's start_<mode> method is invoked (when the
# handler accepts that mode) and the mode is pushed on the front of the
# open-element stack.  Opening 'AXTOutput' also resets per-result state.
sub start_element {
    my ( $self, $event ) = @_;

    # we currently don't care about attributes
    my $name = $event->{'Name'};

    my $mode = $MODEMAP{$name};
    if ($mode) {
        $self->_mode($mode);
        if ( $self->_eventHandler->will_handle($mode) ) {
            my $method = 'start_' . lc($mode);
            $self->_eventHandler->$method( $event->{'Attributes'} );
        }
        unshift @{ $self->{'_elements'} }, $mode;
    }

    if ( $name eq 'AXTOutput' ) {
        $self->{'_values'} = {};
        $self->{'_result'} = undef;
        $self->{'_mode'}   = '';
    }
}
280 =head2 end_element
282 Title : end_element
283 Usage : $eventgenerator->end_element
284 Function: Handles an end element event
285 Returns : none
286 Args : hashref with at least 2 keys 'Data' and 'Name'
289 =cut
# Handle the closing of an element.
#  - Names in %MODEMAP: call the event handler's end_<mode> method (when the
#    handler accepts that mode) with the report type and collected values,
#    then pop the open-element stack.
#  - Names in %MAPPING: store the last character data in $self->{'_values'}
#    under the mapped key (one level deeper when the mapping is a hash ref).
#  - Anything else: emit a warning.
# Returns the handler's return value, or undef when no handler was called.
sub end_element {
    my ( $self, $event ) = @_;
    my $nm = $event->{'Name'};
    my $handled;

    # Hsp are sort of weird, in that they end when another
    # object begins so have to detect this in end_element for now
    if ( my $mode = $MODEMAP{$nm} ) {
        if ( $self->_eventHandler->will_handle($mode) ) {
            my $method = 'end_' . lc($mode);
            $handled =
              $self->_eventHandler->$method( $self->{'_reporttype'},
                $self->{'_values'} );
        }
        shift @{ $self->{'_elements'} };
    }
    elsif ( $MAPPING{$nm} ) {
        my $target = $MAPPING{$nm};
        if ( ref($target) =~ /hash/i ) {
            my ($outer) = keys %{$target};
            $self->{'_values'}->{$outer}->{ $target->{$outer} } =
              $self->{'_last_data'};
        }
        else {
            $self->{'_values'}->{$target} = $self->{'_last_data'};
        }
    }
    else {
        $self->warn( "unknown nm $nm ignoring\n");
    }

    # remove read data if we are at end of an element
    $self->{'_last_data'} = '';
    if ( $nm eq 'AXTOutput' ) {
        $self->{'_result'} = $handled;
    }
    return $handled;
}
323 =head2 element
325 Title : element
326 Usage : $eventhandler->element({'Name' => $name, 'Data' => $str});
327 Function: Convenience method that calls start_element, characters, end_element
328 Returns : none
329 Args : Hash ref with the keys 'Name' and 'Data'
332 =cut
# Fire a complete element in one call: start_element, characters and
# end_element, each given the same hash ref.  The value of end_element
# is what the sub evaluates to.
sub element {
    my $self  = shift;
    my $event = shift;
    $self->start_element($event);
    $self->characters($event);
    return $self->end_element($event);
}
342 =head2 characters
344 Title : characters
345 Usage : $eventgenerator->characters($str)
346 Function: Sends a character event
347 Returns : none
348 Args : string
351 =cut
# Record the character data of an element as $self->{'_last_data'}.
# Undefined data is ignored; whitespace-only data is ignored unless the
# element is one of the Hsp sequence/midline elements.  While the innermost
# open element is 'hsp', sequence/midline data is also appended to
# $self->{'_last_hspdata'}.
sub characters {
    my ( $self, $event ) = @_;

    my $text = $event->{'Data'};
    return unless defined $text;

    if (   $text =~ /^\s+$/
        && $event->{'Name'} !~ /Hsp\_(midline|qseq|hseq)/ )
    {
        return;
    }

    # NOTE(review): nothing in this file reads or clears '_last_hspdata',
    # so it looks like it keeps growing across HSPs -- confirm whether any
    # other code depends on it.
    if ( $self->in_element('hsp')
        && $event->{'Name'} =~ /Hsp\_(qseq|hseq|midline)/ )
    {
        $self->{'_last_hspdata'}->{ $event->{'Name'} } .= $text;
    }

    $self->{'_last_data'} = $text;
}
370 =head2 _mode
372 Title : _mode
373 Usage : $obj->_mode($newval)
374 Function:
375 Example :
376 Returns : value of _mode
377 Args : newvalue (optional)
380 =cut
# Combined getter/setter for the current mode.  A defined argument
# (including the empty string) replaces the stored value; the stored
# value is returned either way.
sub _mode {
    my $self = shift;
    my ($new_mode) = @_;
    $self->{'_mode'} = $new_mode if defined $new_mode;
    return $self->{'_mode'};
}
390 =head2 within_element
392 Title : within_element
393 Usage : if( $eventgenerator->within_element($element) ) {}
394 Function: Test if we are within a particular element
395 This is different than 'in' because within can be tested
396 for a whole block.
397 Returns : boolean
398 Args : string element name
401 =cut
# Report whether an element with the given mode name is open at any depth
# of the open-element stack ($self->{'_elements'}).
#
# Args    : $name - mode name to look for (e.g. 'hit', 'hsp')
# Returns : 1 if $name is somewhere on the stack, 0 otherwise.  Also 0 when
#           $name is undefined or the stack is missing or empty.
#
# Every guard condition is OR-ed: a missing stack must short-circuit before
# it is dereferenced, and an undefined name must never reach the string
# comparison.
sub within_element {
    my ( $self, $name ) = @_;
    my $stack = $self->{'_elements'};
    return 0
      if !defined $name
      || !defined $stack
      || scalar @{$stack} == 0;

    foreach my $open ( @{$stack} ) {
        return 1 if $open eq $name;
    }
    return 0;
}
416 =head2 in_element
418 Title : in_element
419 Usage : if( $eventgenerator->in_element($element) ) {}
420 Function: Test if we are directly in a particular element, i.e. whether
421 it is the most recently opened element that is still open.
422 Unlike within_element, enclosing elements are not checked.
423 Returns : boolean
424 Args : string element name
427 =cut
# Report whether the innermost open element (the front of
# $self->{'_elements'}) is the one named $name.  Returns 0 when nothing is
# open, otherwise the result of the string comparison.
sub in_element {
    my $self = shift;
    my $name = shift;
    my $innermost = $self->{'_elements'}->[0];
    return 0 unless defined $innermost;
    return ( $innermost eq $name );
}
436 =head2 start_document
438 Title : start_document
439 Usage : $eventgenerator->start_document
440 Function: Handles a start document event
441 Returns : none
442 Args : none
445 =cut
# Reset the parser's per-document state: last type and mode become empty
# strings, the stored result is cleared, and the value hash and the
# open-element stack are replaced with fresh empty containers.
sub start_document {
    my $self = shift;
    @{$self}{ '_lasttype', '_mode' } = ( '', '' );
    $self->{'_result'}   = undef;
    $self->{'_values'}   = {};
    $self->{'_elements'} = [];
}
457 =head2 end_document
459 Title : end_document
460 Usage : $eventgenerator->end_document
461 Function: Handles an end document event
462 Returns : Bio::Search::Result::ResultI object
463 Args : none
466 =cut
# Finish the document: hand back whatever is stored in $self->{'_result'}
# (set by end_element when 'AXTOutput' closes).  Extra arguments are ignored.
sub end_document {
    my $self = shift;
    return $self->{'_result'};
}
473 =head2 result_count
475 Title : result_count
476 Usage : my $count = $searchio->result_count
477 Function: Returns the number of results we have processed
478 Returns : integer
479 Args : none
482 =cut
# Number of results started so far, as counted in $self->{'_result_count'}
# by next_result.  Undefined until the first result has been started.
sub result_count {
    my ($self) = @_;
    return $self->{'_result_count'};
}
# Alias for result_count.
sub report_count {
    my ($self) = @_;
    return $self->result_count;
}