3 # BioPerl module for Bio::SearchIO::exonerate
5 # Cared for by Jason Stajich <jason-at-bioperl.org>
7 # Copyright Jason Stajich
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
15 Bio::SearchIO::exonerate - parser for Exonerate
19 # do not use this module directly, it is a driver for SearchIO
22 my $searchio = Bio::SearchIO->new(-file => 'file.exonerate',
23 -format => 'exonerate');
26 while( my $r = $searchio->next_result ) {
27 print $r->query_name, "\n";
32 This is a driver for the SearchIO system for parsing Exonerate (Guy
33 Slater) output. You can get Exonerate at
34 http://cvsweb.sanger.ac.uk/cgi-bin/cvsweb.cgi/exonerate/?cvsroot=Ensembl
35 [until Guy puts up a Web reference,publication for it.]).
37 An optional parameter -min_intron is supported by the L<new>
38 initialization method. This is if you run Exonerate with a different
39 minimum intron length (default is 30) the parser will be able to
40 detect the difference between standard deletions and an intron. Still
41 some room to play with there that might cause this to get
42 misinterpreted that has not been fully tested or explored.
44 The VULGAR and CIGAR formats should be parsed okay now creating HSPs
45 where appropriate (so merging match states where appropriate rather
46 than breaking an HSP at each indel as it may have done in the past).
47 The GFF that comes from exonerate is still probably a better way to go
48 if you are doing protein2genome or est2genome mapping.
49 For example you can see this script:
51 http://fungal.genome.duke.edu/~jes12/software/scripts/process_exonerate_gff3.perl.txt
53 If your report contains both CIGAR and VULGAR lines only the first one
54 will processed for a given Query/Target pair. If you preferentially
55 want to use VULGAR or CIGAR add one of these options when initializing
62 Or set them via these methods.
74 User feedback is an integral part of the evolution of this and other
75 Bioperl modules. Send your comments and suggestions preferably to
76 the Bioperl mailing list. Your participation is much appreciated.
78 bioperl-l@bioperl.org - General discussion
79 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
83 Report bugs to the Bioperl bug tracking system to help us keep track
84 of the bugs and their resolution. Bug reports can be submitted via the
87 http://bugzilla.open-bio.org/
89 =head1 AUTHOR - Jason Stajich
91 Email jason-at-bioperl.org
95 The rest of the documentation details each of the object methods.
96 Internal methods are usually preceded with a _
101 # Let the code begin...
104 package Bio
::SearchIO
::exonerate
;
106 use vars
qw(@STATES %MAPPING %MODEMAP $DEFAULT_WRITER_CLASS $MIN_INTRON);
108 use base qw(Bio::SearchIO);
110 %MODEMAP = ( 'ExonerateOutput' => 'result',
117 'Hsp_query-from'=> 'HSP-query_start',
118 'Hsp_query-to' => 'HSP-query_end',
119 'Hsp_hit-from' => 'HSP-hit_start',
120 'Hsp_hit-to' => 'HSP-hit_end',
121 'Hsp_qseq' => 'HSP-query_seq',
122 'Hsp_hseq' => 'HSP-hit_seq',
123 'Hsp_midline' => 'HSP-homology_seq',
124 'Hsp_score' => 'HSP-score',
125 'Hsp_qlength' => 'HSP-query_length',
126 'Hsp_hlength' => 'HSP-hit_length',
127 'Hsp_align-len' => 'HSP-hsp_length',
128 'Hsp_identity' => 'HSP-identical',
129 'Hsp_gaps' => 'HSP-hsp_gaps',
130 'Hsp_hitgaps' => 'HSP-hit_gaps',
131 'Hsp_querygaps' => 'HSP-query_gaps',
133 'Hit_id' => 'HIT-name',
134 'Hit_desc' => 'HIT-description',
135 'Hit_len' => 'HIT-length',
136 'Hit_score' => 'HIT-score',
138 'ExonerateOutput_program' => 'RESULT-algorithm_name',
139 'ExonerateOutput_query-def' => 'RESULT-query_name',
140 'ExonerateOutput_query-desc'=> 'RESULT-query_description',
141 'ExonerateOutput_query-len' => 'RESULT-query_length',
144 $DEFAULT_WRITER_CLASS = 'Bio::Search::Writer::HitTableWriter';
146 $MIN_INTRON=30; # This is the minimum intron size
151 Usage : my $obj = Bio::SearchIO::exonerate->new();
152 Function: Builds a new Bio::SearchIO::exonerate object
153 Returns : an instance of Bio::SearchIO::exonerate
154 Args : -min_intron => somewhat obselete option, how to determine if a
155 an indel is an intron or a local gap. Use VULGAR
156 rather than CIGAR to avoid this heuristic,default 30.
157 -cigar => 1 set this to 1 if you want to parse
159 -vulgar => 1 set this to 1 if you want to parse VULGAR
160 exclusively, setting both to 1 will revert
161 to the default behavior of just parsing the
162 first line that it sees.
168 my $self = $class->SUPER::new
(@_);
170 my ($min_intron,$cigar,
171 $vulgar) = $self->_rearrange([qw(MIN_INTRON
175 $MIN_INTRON = $min_intron;
177 if( $cigar && $vulgar ) {
178 $self->warn("cannot get HSPs from both CIGAR and VULGAR lines, will just choose whichever comes first (same as if you had chosen neither");
179 $cigar = 0; $vulgar=0;
181 $self->cigar($cigar);
182 $self->vulgar($vulgar);
189 Usage : my $hit = $searchio->next_result;
190 Function: Returns the next Result from a search
191 Returns : Bio::Search::Result::ResultI object
201 $self->{'_last_data'} = '';
202 my ($reporttype,$seenquery,$reportline);
203 $self->start_document();
206 my (@q_ex, @m_ex, @h_ex); ## gc addition
207 while( defined($_ = $self->_readline) ) {
208 # warn( "Reading $_");
209 if( /^Query:\s+(\S+)\s*(.+)?/ ) {
211 $self->end_element({'Name' => 'ExonerateOutput'});
212 $self->_pushback($_);
213 return $self->end_document();
216 my ($nm,$desc) = ($1,$2);
217 chomp($desc) if defined $desc;
218 $self->{'_result_count'}++;
219 $self->start_element({'Name' => 'ExonerateOutput'});
220 $self->element({'Name' => 'ExonerateOutput_query-def',
222 $self->element({'Name' => 'ExonerateOutput_query-desc',
224 $self->element({'Name' => 'ExonerateOutput_program',
225 'Data' => 'Exonerate' });
226 $self->{'_seencigar'} = 0;
227 $self->{'_vulgar'} = 0;
229 } elsif ( /^Target:\s+(\S+)\s*(.+)?/ ) {
230 my ($nm,$desc) = ($1,$2);
231 chomp($desc) if defined $desc;
232 $self->start_element({'Name' => 'Hit'});
233 $self->element({'Name' => 'Hit_id',
235 $self->element({'Name' => 'Hit_desc',
237 $self->{'_seencigar'} = 0;
238 $self->{'_vulgar'} = 0;
239 } elsif( s
/^vulgar
:\s
+(\S
+)\s
+ # query sequence id
240 (\d
+)\s
+(\d
+)\s
+([\
-\
+])\s
+ # query start-end-strand
241 (\S
+)\s
+ # target sequence id
242 (\d
+)\s
+(\d
+)\s
+([\
-\
+])\s
+ # target start-end-strand
245 next if( $self->cigar || $self->{'_seencigar'});
246 $self->{'_vulgar'}++;
248 # Note from Ewan. This is ugly - copy and paste from
249 # cigar line parsing. Should unify somehow...
251 if( ! $self->within_element('result') ) {
252 $self->start_element({'Name' => 'ExonerateOutput'});
253 $self->element({'Name' => 'ExonerateOutput_query-def',
256 if( ! $self->within_element('hit') ) {
257 $self->start_element({'Name' => 'Hit'});
258 $self->element({'Name' => 'Hit_id',
263 ## $qe and $he are no longer used for calculating the ends,
264 ## just the $qs and $hs values and the alignment and insert lenghts
265 my ($qs,$qe,$qstrand) = ($2,$3,$4);
266 my ($hs,$he,$hstrand) = ($6,$7,$8);
268 # $self->element({'Name' => 'ExonerateOutput_query-len',
270 # $self->element({'Name' => 'Hit_len',
274 ## add one because these values are zero-based
275 ## this calculation was originally done lower in the code,
276 ## but it's clearer to do it just once at the start
278 my ($qbegin,$qend) = ('query-from', 'query-to');
280 if( $qstrand eq '-' ) {
281 $qstrand = -1; $qe++;
286 my ($hbegin,$hend) = ('hit-from', 'hit-to');
288 if( $hstrand eq '-' ) {
295 # okay let's do this right and generate a set of HSPs
296 # from the cigar line/home/bio1/jes12/bin/exonerate --model est2genome --bestn 1 t/data/exonerate_cdna.fa t/data/exonerate_genomic_rev.fa
298 my ($aln_len,$inserts,$deletes) = (0,0,0);
299 my ($laststate,@events,$gaps) =( '' );
300 while( @rest >= 3 ) {
301 my ($state,$len1,$len2) = (shift @rest, shift @rest, shift @rest);
303 # HSPs are only the Match cases; otherwise we just
304 # move the coordinates on by the correct amount
307 if( $state eq 'M' ) {
308 if( $laststate eq 'G' ) {
309 # merge gaps across Match states so the HSP
311 $events[-1]->{$qend} = $qs + $len1*$qstrand - $qstrand;
312 $events[-1]->{$hend} = $hs + $len2*$hstrand - $hstrand;
313 $events[-1]->{'gaps'} = $gaps;
317 'align-len' => $len1,
319 $qend => ($qs + $len1*$qstrand - $qstrand),
321 $hend => ($hs + $len2*$hstrand - $hstrand),
326 $gaps = $len1 + $len2 if $state eq 'G';
328 $qs += $len1*$qstrand;
329 $hs += $len2*$hstrand;
332 for my $event ( @events ) {
333 $self->start_element({'Name' => 'Hsp'});
334 while( my ($key,$val) = each %$event ) {
335 $self->element({'Name' => "Hsp_$key",
338 $self->element({'Name' => 'Hsp_identity',
340 $self->end_element({'Name' => 'Hsp'});
344 $self->element({'Name' => 'Hit_score',
347 $self->end_element({'Name' => 'Hit'});
348 $self->end_element({'Name' => 'ExonerateOutput'});
350 return $self->end_document();
352 } elsif( s
/^cigar
:\s
+(\S
+)\s
+ # query sequence id
353 (\d
+)\s
+(\d
+)\s
+([\
-\
+])\s
+ # query start-end-strand
354 (\S
+)\s
+ # target sequence id
355 (\d
+)\s
+(\d
+)\s
+([\
-\
+])\s
+ # target start-end-strand
358 next if( $self->vulgar || $self->{'_seenvulgar'});
361 if( ! $self->within_element('result') ) {
362 $self->start_element({'Name' => 'ExonerateOutput'});
363 $self->element({'Name' => 'ExonerateOutput_query-def',
366 if( ! $self->within_element('hit') ) {
367 $self->start_element({'Name' => 'Hit'});
368 $self->element({'Name' => 'Hit_id',
372 ## $qe and $he are no longer used for calculating the ends,
373 ## just the $qs and $hs values and the alignment and insert lenghts
374 my ($qs,$qe,$qstrand) = ($2,$3,$4);
375 my ($hs,$he,$hstrand) = ($6,$7,$8);
377 # $self->element({'Name' => 'ExonerateOutput_query-len',
379 # $self->element({'Name' => 'Hit_len',
383 if( $qstrand eq '-' ) {
385 ($qs,$qe) = ($qe,$qs); # flip-flop if we're on opp strand
387 } else { $qstrand = 1; }
388 if( $hstrand eq '-' ) {
390 ($hs,$he) = ($he,$hs); # flip-flop if we're on opp strand
392 } else { $hstrand = 1; }
393 # okay let's do this right and generate a set of HSPs
394 # from the cigar line
397 ## add one because these values are zero-based
398 ## this calculation was originally done lower in the code,
399 ## but it's clearer to do it just once at the start
402 my ($aln_len,$inserts,$deletes) = (0,0,0);
403 while( @rest >= 2 ) {
404 my ($state,$len) = (shift @rest, shift @rest);
405 if( $state eq 'I' ) {
407 } elsif( $state eq 'D' ) {
408 if( $len >= $MIN_INTRON ) {
409 $self->start_element({'Name' => 'Hsp'});
411 $self->element({'Name' => 'Hsp_score',
413 $self->element({'Name' => 'Hsp_align-len',
414 'Data' => $aln_len});
415 $self->element({'Name' => 'Hsp_identity',
417 ($inserts + $deletes)});
419 # HSP ends where the other begins
420 $self->element({'Name' => 'Hsp_query-from',
423 ## $qs is now the start of the next hsp
424 ## the end of this hsp is 1 before this position
425 ## (or 1 after in case of reverse strand)
426 $qs += $aln_len*$qstrand;
427 $self->element({'Name' => 'Hsp_query-to',
428 'Data' => $qs - ($qstrand*1)});
430 $hs += $deletes*$hstrand;
431 $self->element({'Name' => 'Hsp_hit-from',
433 $hs += $aln_len*$hstrand;
434 $self->element({'Name' => 'Hsp_hit-to',
435 'Data' => $hs-($hstrand*1)});
437 $self->element({'Name' => 'Hsp_align-len',
438 'Data' => $aln_len + $inserts
440 $self->element({'Name' => 'Hsp_identity',
441 'Data' => $aln_len });
443 $self->element({'Name' => 'Hsp_gaps',
444 'Data' => $inserts + $deletes});
445 $self->element({'Name' => 'Hsp_querygaps',
446 'Data' => $inserts});
447 $self->element({'Name' => 'Hsp_hitgaps',
448 'Data' => $deletes});
452 $self->element({'Name' => 'Hsp_qseq',
453 'Data' => shift @q_ex,
455 $self->element({'Name' => 'Hsp_hseq',
456 'Data' => shift @h_ex,
458 $self->element({'Name' => 'Hsp_midline',
459 'Data' => shift @m_ex,
462 $self->end_element({'Name' => 'Hsp'});
464 $aln_len = $inserts = $deletes = 0;
471 $self->start_element({'Name' => 'Hsp'});
475 $self->element({'Name' => 'Hsp_qseq',
476 'Data' => shift @q_ex,
478 $self->element({'Name' => 'Hsp_hseq',
479 'Data' => shift @h_ex,
481 $self->element({'Name' => 'Hsp_midline',
482 'Data' => shift @m_ex,
486 $self->element({'Name' => 'Hsp_score',
489 $self->element({'Name' => 'Hsp_query-from',
492 $qs += $aln_len*$qstrand;
493 $self->element({'Name' => 'Hsp_query-to',
494 'Data' => $qs - ($qstrand*1)});
496 $hs += $deletes*$hstrand;
497 $self->element({'Name' => 'Hsp_hit-from',
499 $hs += $aln_len*$hstrand;
500 $self->element({'Name' => 'Hsp_hit-to',
501 'Data' => $hs -($hstrand*1)});
503 $self->element({'Name' => 'Hsp_align-len',
504 'Data' => $aln_len});
506 $self->element({'Name' => 'Hsp_identity',
507 'Data' => $aln_len - ($inserts + $deletes)});
509 $self->element({'Name' => 'Hsp_gaps',
510 'Data' => $inserts + $deletes});
512 $self->element({'Name' => 'Hsp_querygaps',
513 'Data' => $inserts});
514 $self->element({'Name' => 'Hsp_hitgaps',
515 'Data' => $deletes});
516 $self->end_element({'Name' => 'Hsp'});
518 $self->element({'Name' => 'Hit_score',
521 $self->end_element({'Name' => 'Hit'});
522 $self->end_element({'Name' => 'ExonerateOutput'});
524 return $self->end_document();
529 return $self->end_document() if( $seentop );
534 Title : start_element
535 Usage : $eventgenerator->start_element
536 Function: Handles a start element event
538 Args : hashref with at least 2 keys 'Data' and 'Name'
544 my ($self,$data) = @_;
545 # we currently don't care about attributes
546 my $nm = $data->{'Name'};
547 my $type = $MODEMAP{$nm};
550 if( $self->_eventHandler->will_handle($type) ) {
551 my $func = sprintf("start_%s",lc $type);
552 $self->_eventHandler->$func($data->{'Attributes'});
554 unshift @
{$self->{'_elements'}}, $type;
555 if($type eq 'result') {
556 $self->{'_values'} = {};
557 $self->{'_result'}= undef;
565 Title : start_element
566 Usage : $eventgenerator->end_element
567 Function: Handles an end element event
569 Args : hashref with at least 2 keys 'Data' and 'Name'
575 my ($self,$data) = @_;
576 my $nm = $data->{'Name'};
577 my $type = $MODEMAP{$nm};
580 if( $type = $MODEMAP{$nm} ) {
581 if( $self->_eventHandler->will_handle($type) ) {
582 my $func = sprintf("end_%s",lc $type);
583 $rc = $self->_eventHandler->$func($self->{'_reporttype'},
586 shift @
{$self->{'_elements'}};
588 } elsif( $MAPPING{$nm} ) {
590 if ( ref($MAPPING{$nm}) =~ /hash/i ) {
591 my $key = (keys %{$MAPPING{$nm}})[0];
592 $self->{'_values'}->{$key}->{$MAPPING{$nm}->{$key}} = $self->{'_last_data'};
594 $self->{'_values'}->{$MAPPING{$nm}} = $self->{'_last_data'};
597 $self->debug( "unknown nm $nm, ignoring\n");
599 $self->{'_last_data'} = ''; # remove read data if we are at
601 $self->{'_result'} = $rc if( defined $type && $type eq 'result' );
608 Usage : $eventhandler->element({'Name' => $name, 'Data' => $str});
609 Function: Convience method that calls start_element, characters, end_element
611 Args : Hash ref with the keys 'Name' and 'Data'
617 my ($self,$data) = @_;
618 $self->start_element($data);
619 $self->characters($data);
620 $self->end_element($data);
626 Usage : $eventgenerator->characters($str)
627 Function: Send a character events
635 my ($self,$data) = @_;
637 return unless ( defined $data->{'Data'} && $data->{'Data'} !~ /^\s+$/ );
639 $self->{'_last_data'} = $data->{'Data'};
642 =head2 within_element
644 Title : within_element
645 Usage : if( $eventgenerator->within_element($element) ) {}
646 Function: Test if we are within a particular element
647 This is different than 'in' because within can be tested
650 Args : string element name
656 my ($self,$name) = @_;
657 return 0 if ( ! defined $name &&
658 ! defined $self->{'_elements'} ||
659 scalar @
{$self->{'_elements'}} == 0) ;
660 foreach ( @
{$self->{'_elements'}} ) {
672 Usage : if( $eventgenerator->in_element($element) ) {}
673 Function: Test if we are in a particular element
674 This is different than 'in' because within can be tested
677 Args : string element name
683 my ($self,$name) = @_;
684 return 0 if ! defined $self->{'_elements'}->[0];
685 return ( $self->{'_elements'}->[0] eq $name)
688 =head2 start_document
690 Title : start_document
691 Usage : $eventgenerator->start_document
692 Function: Handle a start document event
701 $self->{'_lasttype'} = '';
702 $self->{'_values'} = {};
703 $self->{'_result'}= undef;
704 $self->{'_elements'} = [];
705 $self->{'_reporttype'} = 'exonerate';
712 Usage : $eventgenerator->end_document
713 Function: Handles an end document event
714 Returns : Bio::Search::Result::ResultI object
721 my ($self,@args) = @_;
722 return $self->{'_result'};
727 my ($self, $blast, @args) = @_;
729 if( not defined($self->writer) ) {
730 $self->warn("Writer not defined. Using a $DEFAULT_WRITER_CLASS");
731 $self->writer( $DEFAULT_WRITER_CLASS->new() );
733 $self->SUPER::write_result
( $blast, @args );
738 return $self->{'_result_count'};
741 sub report_count
{ shift->result_count }
746 Usage : $obj->vulgar($newval)
747 Function: Get/Set flag, do you want to build HSPs from VULGAR string?
748 Returns : value of vulgar (a scalar)
749 Args : on set, new value (a scalar or undef, optional)
758 if( $_[0] && $self->{'_cigar'} ) {
759 $self->warn("Trying to set vulgar and cigar both to 1, must be either or");
760 $self->{'_cigar'} = 0;
761 return $self->{'_vulgar'} = 0;
764 return $self->{'_vulgar'};
770 Usage : $obj->cigar($newval)
771 Function: Get/Set boolean flag do you want to build HSPs from CIGAR strings?
772 Returns : value of cigar (a scalar)
773 Args : on set, new value (a scalar or undef, optional)
782 if( $_[0] && $self->{'_vulgar'} ) {
783 $self->warn("Trying to set vulgar and cigar both to 1, must be either or");
784 $self->{'_vulgar'} = 0;
785 return $self->{'_cigar'} = 0;
788 return $self->{'_cigar'};