tag fourth (and hopefully last) alpha
[bioperl-live.git] / branch-1-6 / Bio / SearchIO / axt.pm
blobf0b8629248801c673ccdde59a63438fa4511ea80
1 # $Id$
3 # BioPerl module for Bio::SearchIO::axt
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Jason Stajich <jason-at-bioperl.org>
9 # Copyright Jason Stajich
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
15 =head1 NAME
17 Bio::SearchIO::axt - a parser for axt format reports
19 =head1 SYNOPSIS
21 use Bio::SearchIO;
22 my $parser = Bio::SearchIO->new(-format => 'axt',
23 -file => 't/data/report.blastz');
24 while( my $result = $parser->next_result ) {
25 while( my $hit = $result->next_hit ) {
26 while( my $hsp = $hit->next_hsp ) {
31 =head1 DESCRIPTION
33 This is a parser and event-generator for AXT format reports. BLASTZ
34 reports (Schwartz et al,(2003) Genome Research, 13:103-107) are normally
35 in LAV format but are commonly post-processed to AXT format; many precomputed
36 BLASTZ reports, such as those found in the UCSC Genome
37 Browser, are in AXT format. This parser will also parse any
38 AXT format produced from any lav report and directly out of BLAT.
40 =head1 FEEDBACK
42 =head2 Mailing Lists
44 User feedback is an integral part of the evolution of this and other
45 Bioperl modules. Send your comments and suggestions preferably to
46 the Bioperl mailing list. Your participation is much appreciated.
48 bioperl-l@bioperl.org - General discussion
49 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
51 =head2 Support
53 Please direct usage questions or support issues to the mailing list:
55 I<bioperl-l@bioperl.org>
57 rather than to the module maintainer directly. Many experienced and
58 reponsive experts will be able look at the problem and quickly
59 address it. Please include a thorough description of the problem
60 with code and data examples if at all possible.
62 =head2 Reporting Bugs
64 Report bugs to the Bioperl bug tracking system to help us keep track
65 of the bugs and their resolution. Bug reports can be submitted via the
66 web:
68 http://bugzilla.open-bio.org/
70 =head1 AUTHOR - Jason Stajich
72 Email jason-at-bioperl.org
74 =head1 APPENDIX
76 The rest of the documentation details each of the object methods.
77 Internal methods are usually preceded with a _
79 =cut
82 # Let the code begin...
85 package Bio::SearchIO::axt;
86 use vars qw(%MODEMAP %MAPPING @STATES $GAPCHAR);
87 use strict;
89 use Bio::Search::Result::ResultFactory;
90 use Bio::Search::HSP::HSPFactory;
91 use base qw(Bio::SearchIO);
93 use POSIX;
95 BEGIN {
96 # mapping of NCBI Blast terms to Bioperl hash keys
97 %MODEMAP = ('AXTOutput' => 'result',
98 'Hit' => 'hit',
99 'Hsp' => 'hsp'
101 $GAPCHAR = '-';
102 %MAPPING =
104 'Hsp_score' => 'HSP-score',
105 'Hsp_query-from' => 'HSP-query_start',
106 'Hsp_query-to' => 'HSP-query_end',
107 'Hsp_hit-from' => 'HSP-hit_start',
108 'Hsp_hit-to' => 'HSP-hit_end',
109 'Hsp_positive' => 'HSP-conserved',
110 'Hsp_identity' => 'HSP-identical',
111 'Hsp_gaps' => 'HSP-hsp_gaps',
112 'Hsp_hitgaps' => 'HSP-hit_gaps',
113 'Hsp_querygaps' => 'HSP-query_gaps',
114 'Hsp_qseq' => 'HSP-query_seq',
115 'Hsp_hseq' => 'HSP-hit_seq',
116 'Hsp_midline' => 'HSP-homology_seq', # ignoring this for now
117 'Hsp_align-len' => 'HSP-hsp_length',
119 'Hit_id' => 'HIT-name',
120 'AXTOutput_query-def'=> 'RESULT-query_name',
124 =head2 new
126 Title : new
127 Usage : my $obj = Bio::SearchIO::axt->new();
128 Function: Builds a new Bio::SearchIO::axt object
129 Returns : an instance of Bio::SearchIO::axt
130 Args :
133 =cut
135 sub new {
136 my($class,@args) = @_;
137 my $self = $class->SUPER::new(@args);
138 return $self;
141 =head2 next_result
143 Title : next_result
144 Usage : my $hit = $searchio->next_result;
145 Function: Returns the next Result from a search
146 Returns : Bio::Search::Result::ResultI object
147 Args : none
149 =cut
151 sub next_result{
152 my ($self) = @_;
153 local $/ = "\n";
154 local $_;
156 my ($curquery,$curhit);
157 $self->start_document();
158 my @hit_signifs;
159 while( defined ($_ = $self->_readline )) {
160 next if (/^\s+$/);
161 if( m/^(\d+)\s+ # alignment number - we'll throw this away anyways
162 (\S+)\s+ # Query name
163 (\d+)\s+(\d+)\s+ # Query start Query end (always + strand, 0 based)
164 (\S+)\s+ # Hit name
165 (\d+)\s+(\d+)\s+ # Hit start Hit end (0 based)
166 ([\-\+])\s+ # Hit strand
167 ([\d\.\-]+)\s+ # Score
168 /ox ) {
169 my ($alnnum, $qname,$qstart,$qend, $hname,
170 $hstart,$hend,$hstrand, $score) = ($1,$2,$3,$4,$5,
171 $6,$7,$8,$9);
172 $self->{'_reporttype'} = 'AXT';
173 # Jim's code is 0 based
174 $qstart++; $qend++; $hstart++; $hend++;
175 if( defined $curquery &&
176 $curquery ne $qname ) {
177 $self->end_element({'Name' => 'Hit'});
178 $self->_pushback($_);
179 $self->end_element({'Name' => 'AXTOutput'});
180 return $self->end_document();
183 if( defined $curhit &&
184 $curhit ne $hname) {
185 # slight duplication here -- keep these in SYNC
186 $self->end_element({'Name' => 'Hit'});
187 $self->start_element({'Name' => 'Hit'});
188 $self->element({'Name' => 'Hit_id',
189 'Data' => $hname});
190 } elsif ( ! defined $curquery ) {
191 $self->start_element({'Name' => 'AXTOutput'});
192 $self->{'_result_count'}++;
193 $self->element({'Name' => 'AXTOutput_query-def',
194 'Data' => $qname });
196 $self->start_element({'Name' => 'Hit'});
197 $self->element({'Name' => 'Hit_id',
198 'Data' => $hname});
200 $self->start_element({'Name' => 'Hsp'});
201 my $queryalign = $self->_readline;
202 my $hitalign = $self->_readline;
203 chomp($queryalign);
204 chomp($hitalign);
205 my $alnlen = length($queryalign);
206 my $qgapnum = ( $queryalign =~ s/\Q$GAPCHAR/$GAPCHAR/g);
207 my $hgapnum = ( $hitalign =~ s/\Q$GAPCHAR/$GAPCHAR/g);
208 my $totalgaps = ($qgapnum + $hgapnum);
210 if( $hstrand eq '-' ) { # strand gets inferred by start/end
211 ($hstart,$hend) = ($hend,$hstart);
213 $self->element({'Name' => 'Hsp_score',
214 'Data' => $score});
215 $self->element({'Name' => 'Hsp_query-from',
216 'Data' => $qstart});
217 $self->element({'Name' => 'Hsp_query-to',
218 'Data' => $qend});
219 $self->element({'Name' => 'Hsp_hit-from',
220 'Data' => $hstart});
221 $self->element({'Name' => 'Hsp_hit-to',
222 'Data' => $hend});
223 $self->element({'Name' => 'Hsp_gaps',
224 'Data' => $qgapnum + $hgapnum});
225 $self->element({'Name' => 'Hsp_querygaps',
226 'Data' => $qgapnum});
227 $self->element({'Name' => 'Hsp_hitgaps',
228 'Data' => $hgapnum});
230 $self->element({'Name' => 'Hsp_identity',
231 'Data' => $alnlen - $totalgaps});
232 $self->element({'Name' => 'Hsp_positive',
233 'Data' => $alnlen - $totalgaps});
234 $self->element({'Name' => 'Hsp_qseq',
235 'Data' => $queryalign});
236 $self->element({'Name' => 'Hsp_hseq',
237 'Data' => $hitalign});
239 $self->end_element({'Name' => 'Hsp'});
240 $curquery = $qname;
241 $curhit = $hname;
244 # fence post
245 if( defined $curquery ) {
246 $self->end_element({'Name' => 'Hit'});
247 $self->end_element({'Name' => 'AXTOutput'});
248 return $self->end_document();
250 return;
253 sub _initialize {
254 my ($self,@args) = @_;
255 $self->SUPER::_initialize(@args);
256 $self->_eventHandler->register_factory('result', Bio::Search::Result::ResultFactory->new(-type => 'Bio::Search::Result::GenericResult'));
258 $self->_eventHandler->register_factory('hsp', Bio::Search::HSP::HSPFactory->new(-type => 'Bio::Search::HSP::GenericHSP'));
262 =head2 start_element
264 Title : start_element
265 Usage : $eventgenerator->start_element
266 Function: Handles a start element event
267 Returns : none
268 Args : hashref with at least 2 keys 'Data' and 'Name'
271 =cut
273 sub start_element{
274 my ($self,$data) = @_;
275 # we currently don't care about attributes
276 my $nm = $data->{'Name'};
277 if( my $type = $MODEMAP{$nm} ) {
278 $self->_mode($type);
279 if( $self->_eventHandler->will_handle($type) ) {
280 my $func = sprintf("start_%s",lc $type);
281 $self->_eventHandler->$func($data->{'Attributes'});
283 unshift @{$self->{'_elements'}}, $type;
285 if($nm eq 'AXTOutput') {
286 $self->{'_values'} = {};
287 $self->{'_result'}= undef;
288 $self->{'_mode'} = '';
293 =head2 end_element
295 Title : start_element
296 Usage : $eventgenerator->end_element
297 Function: Handles an end element event
298 Returns : none
299 Args : hashref with at least 2 keys 'Data' and 'Name'
302 =cut
304 sub end_element {
305 my ($self,$data) = @_;
306 my $nm = $data->{'Name'};
307 my $rc;
308 # Hsp are sort of weird, in that they end when another
309 # object begins so have to detect this in end_element for now
311 if( my $type = $MODEMAP{$nm} ) {
312 if( $self->_eventHandler->will_handle($type) ) {
313 my $func = sprintf("end_%s",lc $type);
314 $rc = $self->_eventHandler->$func($self->{'_reporttype'},
315 $self->{'_values'});
317 shift @{$self->{'_elements'}};
319 } elsif( $MAPPING{$nm} ) {
320 if ( ref($MAPPING{$nm}) =~ /hash/i ) {
321 my $key = (keys %{$MAPPING{$nm}})[0];
322 $self->{'_values'}->{$key}->{$MAPPING{$nm}->{$key}} = $self->{'_last_data'};
323 } else {
324 $self->{'_values'}->{$MAPPING{$nm}} = $self->{'_last_data'};
326 } else {
327 $self->warn( "unknown nm $nm ignoring\n");
329 $self->{'_last_data'} = ''; # remove read data if we are at
330 # end of an element
331 $self->{'_result'} = $rc if( $nm eq 'AXTOutput' );
332 return $rc;
336 =head2 element
338 Title : element
339 Usage : $eventhandler->element({'Name' => $name, 'Data' => $str});
340 Function: Convience method that calls start_element, characters, end_element
341 Returns : none
342 Args : Hash ref with the keys 'Name' and 'Data'
345 =cut
347 sub element{
348 my ($self,$data) = @_;
349 $self->start_element($data);
350 $self->characters($data);
351 $self->end_element($data);
355 =head2 characters
357 Title : characters
358 Usage : $eventgenerator->characters($str)
359 Function: Send a character events
360 Returns : none
361 Args : string
364 =cut
366 sub characters{
367 my ($self,$data) = @_;
369 return unless ( defined $data->{'Data'} );
370 if( $data->{'Data'} =~ /^\s+$/ ) {
371 return unless $data->{'Name'} =~ /Hsp\_(midline|qseq|hseq)/;
374 if( $self->in_element('hsp') &&
375 $data->{'Name'} =~ /Hsp\_(qseq|hseq|midline)/ ) {
377 $self->{'_last_hspdata'}->{$data->{'Name'}} .= $data->{'Data'};
380 $self->{'_last_data'} = $data->{'Data'};
383 =head2 _mode
385 Title : _mode
386 Usage : $obj->_mode($newval)
387 Function:
388 Example :
389 Returns : value of _mode
390 Args : newvalue (optional)
393 =cut
395 sub _mode{
396 my ($self,$value) = @_;
397 if( defined $value) {
398 $self->{'_mode'} = $value;
400 return $self->{'_mode'};
403 =head2 within_element
405 Title : within_element
406 Usage : if( $eventgenerator->within_element($element) ) {}
407 Function: Test if we are within a particular element
408 This is different than 'in' because within can be tested
409 for a whole block.
410 Returns : boolean
411 Args : string element name
414 =cut
416 sub within_element{
417 my ($self,$name) = @_;
418 return 0 if ( ! defined $name &&
419 ! defined $self->{'_elements'} ||
420 scalar @{$self->{'_elements'}} == 0) ;
421 foreach ( @{$self->{'_elements'}} ) {
422 if( $_ eq $name ) {
423 return 1;
426 return 0;
429 =head2 in_element
431 Title : in_element
432 Usage : if( $eventgenerator->in_element($element) ) {}
433 Function: Test if we are in a particular element
434 This is different than 'in' because within can be tested
435 for a whole block.
436 Returns : boolean
437 Args : string element name
440 =cut
442 sub in_element{
443 my ($self,$name) = @_;
444 return 0 if ! defined $self->{'_elements'}->[0];
445 return ( $self->{'_elements'}->[0] eq $name)
449 =head2 start_document
451 Title : start_document
452 Usage : $eventgenerator->start_document
453 Function: Handles a start document event
454 Returns : none
455 Args : none
458 =cut
460 sub start_document{
461 my ($self) = @_;
462 $self->{'_lasttype'} = '';
463 $self->{'_values'} = {};
464 $self->{'_result'}= undef;
465 $self->{'_mode'} = '';
466 $self->{'_elements'} = [];
470 =head2 end_document
472 Title : end_document
473 Usage : $eventgenerator->end_document
474 Function: Handles an end document event
475 Returns : Bio::Search::Result::ResultI object
476 Args : none
479 =cut
481 sub end_document{
482 my ($self,@args) = @_;
483 return $self->{'_result'};
486 =head2 result_count
488 Title : result_count
489 Usage : my $count = $searchio->result_count
490 Function: Returns the number of results we have processed
491 Returns : integer
492 Args : none
495 =cut
497 sub result_count {
498 my $self = shift;
499 return $self->{'_result_count'};
502 sub report_count { shift->result_count }