bug 2549; fixed small bug in Bio::Taxon which doesn't catch -common_name
[bioperl-live.git] / Bio / SearchIO / rnamotif.pm
blob84a3256074fa65ab47c097b9bb7862fd043f6fbe
1 # $Id$
3 # BioPerl module for Bio::SearchIO::rnamotif
5 # Cared for by Chris Fields <cjfields-at-uiuc-dot-edu>
7 # Copyright Chris Fields
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 Bio::SearchIO::rnamotif - SearchIO-based RNAMotif parser
17 =head1 SYNOPSIS
19 # do not call this module directly. Use Bio::SearchIO.
21 =head1 DESCRIPTION
23 This is a highly experimental SearchIO-based parser for output from the rnamotif
24 program (one of the programs in the RNAMotif suite). It currently parses only
25 raw rnamotif output for RNAMotif versions 3.0 and above; older versions may work
26 but will not be supported. rmfmt output will not be supported at this time.
28 =head1 FEEDBACK
30 =head2 Mailing Lists
32 User feedback is an integral part of the evolution of this and other
33 Bioperl modules. Send your comments and suggestions preferably to
34 the Bioperl mailing list. Your participation is much appreciated.
36 bioperl-l@bioperl.org - General discussion
37 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
39 =head2 Reporting Bugs
41 Report bugs to the Bioperl bug tracking system to help us keep track
42 of the bugs and their resolution. Bug reports can be submitted via the
43 web:
45 http://bugzilla.open-bio.org/
47 =head1 AUTHOR - Chris Fields
49 Email cjfields-at-uiuc-dot-edu
51 =head1 APPENDIX
53 The rest of the documentation details each of the object methods.
54 Internal methods are usually preceded with a _
56 =cut
58 # Let the code begin...
60 package Bio::SearchIO::rnamotif;
61 use strict;
63 use base qw(Bio::SearchIO);
65 my %MODEMAP = (
66 'Result' => 'result',
67 'Hit' => 'hit',
68 'Hsp' => 'hsp'
71 my %MAPPING = (
72 # commented out tags have not been assigned
74 'Hsp_score' => 'HSP-score',
75 'Hsp_custom-data' => 'HSP-custom_score',
77 # rnamotif has no evalue
79 # descriptor has no start, end; same as hit start, end
80 'Hsp_query-from' => 'HSP-query_start',
81 'Hsp_query-to' => 'HSP-query_end',
82 'Hsp_hit-from' => 'HSP-hit_start',
83 'Hsp_hit-to' => 'HSP-hit_end',
85 # descriptor has no start, end
87 'Hsp_hseq' => 'HSP-hit_seq',
88 'Hsp_align-len' => 'HSP-hsp_length',
90 # build this from scratch, simple WUSS-format
91 'Hsp_structure' => 'HSP-meta',
92 'Hsp_stranded' => 'HSP-stranded',
94 # not supported for RNAMotif
96 'Hit_id' => 'HIT-name',
97 'Hit_accession' => 'HIT-accession',
98 'Hit_gi' => 'HIT-ncbi_gi',
99 'Hit_def' => 'HIT-description',
100 'Hit_score' => 'HIT-score', # best HSP score
102 'RNAMotif_program' => 'RESULT-algorithm_name', # get/set
103 'RNAMotif_version' => 'RESULT-algorithm_version', # get/set
104 'RNAMotif_query-def'=> 'RESULT-query_name', # get/set
105 # No length (query is a descriptor)
106 'RNAMotif_query-acc'=> 'RESULT-query_accession', # get/set
107 'RNAMotif_querydesc'=> 'RESULT-query_description', # get/set
108 'RNAMotif_db' => 'RESULT-database_name', # get/set
111 # use structure_delimiters to set custom delimiters
113 my @VALID_SYMBOLS = qw(5-prime 3-prime single-strand unknown);
114 my %STRUCTURE_SYMBOLS = (
115 '5-prime' => '<',
116 '3-prime' => '>',
117 'single-strand' => '.',
118 'unknown' => '?'
119 # may add more for quartets, triplets
122 my $DEFAULT_VERSION = '3.0.3';
124 =head2 new
126 Title : new
127 Usage : my $obj = Bio::SearchIO->new();
128 Function: Builds a new Bio::SearchIO::rnamotif object
129 Returns : Bio::SearchIO::rnamotif parser
130 Args : -fh/-file => RNAMotif filename
131 -format => 'rnamotif'
132 -model => query model (or descriptor, in this case)
133 -database => database name (default undef)
134 -query_acc => query accession (default undef)
135 -hsp_minscore => minimum HSP score cutoff
136 -hsp_maxscore => maximum HSP score cutoff
137 -symbols => hash ref of structure symbols to use
138 (default symbols in %STRUCTURE_SYMBOLS hash)
140 =cut
142 sub _initialize {
143 my ( $self, @args ) = @_;
144 $self->SUPER::_initialize(@args);
145 my ($version, $model, $database, $maxcutoff, $mincutoff, $seqdistance,
146 $accession, $symbols) =
147 $self->_rearrange([qw(VERSION MODEL DATABASE HSP_MAXSCORE
148 HSP_MINSCORE SEQ_DISTANCE QUERY_ACC SYMBOLS)],@args);
149 my $handler = $self->_eventHandler;
150 $handler->register_factory(
151 'result',
152 Bio::Factory::ObjectFactory->new(
153 -type => 'Bio::Search::Result::GenericResult',
154 -interface => 'Bio::Search::Result::ResultI',
155 -verbose => $self->verbose
159 $handler->register_factory(
160 'hit',
161 Bio::Factory::ObjectFactory->new(
162 -type => 'Bio::Search::Hit::ModelHit',
163 -interface => 'Bio::Search::Hit::HitI',
164 -verbose => $self->verbose
168 $handler->register_factory(
169 'hsp',
170 Bio::Factory::ObjectFactory->new(
171 -type => 'Bio::Search::HSP::ModelHSP',
172 -interface => 'Bio::Search::HSP::HSPI',
173 -verbose => $self->verbose
176 $model && $self->model($model);
177 $database && $self->database($database);
178 $accession && $self->query_accession($accession);
179 $version ||= $DEFAULT_VERSION;
180 $self->algorithm_version($version);
181 $self->throw("Cannot define both a minimal and maximal cutoff")
182 if (defined($mincutoff) && defined($maxcutoff));
183 defined($mincutoff) && $self->hsp_minscore($mincutoff);
184 defined($maxcutoff) && $self->hsp_maxscore($maxcutoff);
185 $symbols ||= \%STRUCTURE_SYMBOLS;
186 $self->structure_symbols($symbols);
189 =head2 next_result
191 Title : next_result
192 Usage : my $hit = $searchio->next_result;
193 Function: Returns the next Result from a search
194 Returns : Bio::Search::Result::ResultI object
195 Args : none
197 =cut
199 sub next_result {
200 my ($self) = @_;
201 my $seentop = 0;
202 local $/ = "\n";
203 local $_;
204 my ($rm, $d, $descriptor, $file, $oktobuild);
205 my ($hitid, $hitdesc, $hspid, $lastid, $lastscore);
206 my $sprintf;
208 # user-determined Result data
209 my ($accession, $db, $model) =
210 ($self->query_accession, $self->database, $self->model);
211 # HSP building options
212 my $hsp_min = $self->hsp_minscore;
213 my $hsp_max = $self->hsp_maxscore;
214 my $version = $self->algorithm_version;
215 my $laststart;
217 my $verbose = $self->verbose; # cache for speed?
218 $self->start_document();
219 PARSER:
220 while ( defined( my $line = $self->_readline ) ) {
221 # start of report
222 next if $line =~ m{^\s+$};
223 if (index($line,'#RM') == 0) {
224 if (index($line,'#RM scored') == 0 ) {
225 if ($seentop) {
226 $self->_pushback($line);
227 last PARSER;
229 $self->start_element({'Name' => 'Result'});
230 $self->element_hash({
231 'RNAMotif_program' => 'rnamotif',
232 'RNAMotif_version' => $version,
233 'RNAMotif_query-acc' => $accession,
234 'RNAMotif_db' => $db
235 });
236 $seentop = 1;
237 #$self->debug("Start result\n");
238 } elsif (index($line,'#RM descr') == 0) {
239 ($rm, $d, $descriptor) = split ' ', $line, 3;
240 # toss $rm, $d; keep $descr
241 chomp $descriptor;
242 $self->{'_descriptor'} = $descriptor;
243 $self->element(
244 {'Name' => 'RNAMotif_querydesc',
245 'Data' => $descriptor}
247 } elsif(index($line,'#RM dfile') == 0) {
248 ($rm, $d, $file) = split ' ', $line, 3;
249 # toss $rm, $d; keep $file
250 chomp $file;
251 $self->element(
252 {'Name' => 'RNAMotif_query-def',
253 'Data' => $file}
255 } else {
256 $self->debug("Unrecognized line: $line");
258 } elsif ($line =~ s{^>}{}) {
259 chomp $line;
260 ($hitid, $hitdesc) = split ' ',$line,2;
262 if ($self->within_element('hit') && ($hitid ne $lastid)) {
263 $self->element(
264 {'Name' => 'Hit_score',
265 'Data' => $lastscore}
266 ) if $lastscore;
267 $self->end_element({'Name' => 'Hit'});
268 $self->start_element({'Name' => 'Hit'});
269 } elsif (!$self->within_element('hit')) {
270 $self->start_element({'Name' => 'Hit'});
272 my ($gi, $acc, $ver) = $self->_get_seq_identifiers($hitid);
274 $self->element_hash({
275 'Hit_id' => $hitid,
276 'Hit_gi' => $gi,
277 'Hit_accession' => $ver ? "$acc.$ver" :
278 $acc ? $acc : $hitid,
279 'Hit_def' => $hitdesc}
281 $lastid = $hitid;
282 } elsif ($line =~ m{^(\S+)\s+(.+?)\s+(\d+)\s+(\d+)\s+(\d+)\s(.*)$}xm) {
283 chomp $line;
284 my $hspid = $1;
285 my ($score, $strand, $start, $length , $seq) = ($2, $3, $4, $5, $6);
286 $score *= 1; # implicitly cast any odd '0.000' to float
287 # sanity check ids
288 unless ($hitid eq $hspid) {
289 $self->throw("IDs do not match!");
291 # check score for possible sprintf data, mark as such, cache result
292 if (!defined($sprintf)) {
293 if ($score =~ m{[^0-9.-]+}gxms) {
294 if (defined $hsp_min || defined $hsp_max ) {
295 $self->warn("HSP data likely contains custom score; ".
296 "ignoring min/maxscore");
298 $sprintf = $oktobuild = 1;
299 } else {
300 $sprintf = 0;
304 if (!$sprintf) {
305 if (($hsp_min && $score <= $hsp_min)
306 || ($hsp_max && ($score >= $hsp_max)) ) {
307 # do not build HSP
308 $oktobuild = 0;
309 } else {
310 $oktobuild = 1;
312 # store best hit score based on the hsp min/maxscore only
313 if (defined $hsp_min && $score > $hsp_min) {
314 $lastscore = $score if !$lastscore || $score > $lastscore;
315 } elsif (defined $hsp_max && $score < $hsp_max) {
316 $lastscore = $score if !$lastscore || $score < $lastscore;
321 # build HSP
322 if ($oktobuild) {
323 my $end;
324 # calculate start/end
325 if( $strand==0 ) {
326 $end = $start + $length -1;
327 } else {
328 $end = $start - $length + 1;
331 my ($rna, $meta) = $self->_motif2meta($seq, $descriptor);
333 $self->start_element({'Name' => 'Hsp'});
334 my $rnalen = $rna =~ tr{ATGCatgc}{ATGCatgc};
335 $self->element_hash({
336 'Hsp_stranded' => 'HIT',
337 'Hsp_hseq' => $rna,
338 'Hsp_query-from' => 1,
339 'Hsp_query-to' =>length($rna),
340 'Hsp_hit-from' => $start,
341 'Hsp_hit-to' => $end,
342 'Hsp_structure' => $meta,
343 'Hsp_align-len' => length($rna),
344 'Hsp_score' => $sprintf ? undef : $score,
345 'Hsp_custom-data' => $sprintf ? $score : undef,
347 $self->end_element({'Name' => 'Hsp'});
348 $oktobuild = 0 if (!$sprintf);
352 if ($self->within_element('hit')) {
353 $self->element(
354 {'Name' => 'Hit_score',
355 'Data' => $lastscore}
356 ) if $lastscore;
357 $self->end_element( { 'Name' => 'Hit' } );
359 if ($seentop) {
360 $self->end_element( { 'Name' => 'Result' } );
362 return $self->end_document();
365 =head2 start_element
367 Title : start_element
368 Usage : $eventgenerator->start_element
369 Function: Handles a start element event
370 Returns : none
371 Args : hashref with at least 2 keys 'Data' and 'Name'
374 =cut
376 sub start_element {
377 my ( $self, $data ) = @_;
379 # we currently don't care about attributes
380 my $nm = $data->{'Name'};
381 my $type = $MODEMAP{$nm};
382 if ($type) {
383 if ( $self->_eventHandler->will_handle($type) ) {
384 my $func = sprintf( "start_%s", lc $type );
385 $self->_eventHandler->$func( $data->{'Attributes'} );
387 unshift @{ $self->{'_elements'} }, $type;
389 if ( defined $type
390 && $type eq 'result' )
392 $self->{'_values'} = {};
393 $self->{'_result'} = undef;
397 =head2 end_element
399 Title : start_element
400 Usage : $eventgenerator->end_element
401 Function: Handles an end element event
402 Returns : none
403 Args : hashref with at least 2 keys, 'Data' and 'Name'
406 =cut
408 sub end_element {
409 my ( $self, $data ) = @_;
410 my $nm = $data->{'Name'};
411 my $type = $MODEMAP{$nm};
412 my $rc;
414 if ($type) {
415 if ( $self->_eventHandler->will_handle($type) ) {
416 my $func = sprintf( "end_%s", lc $type );
417 $rc = $self->_eventHandler->$func( $self->{'_reporttype'},
418 $self->{'_values'} );
420 my $lastelem = shift @{ $self->{'_elements'} };
422 elsif ( $MAPPING{$nm} ) {
423 if ( ref( $MAPPING{$nm} ) =~ /hash/i ) {
424 my $key = ( keys %{ $MAPPING{$nm} } )[0];
425 $self->{'_values'}->{$key}->{ $MAPPING{$nm}->{$key} } =
426 $self->{'_last_data'};
428 else {
429 $self->{'_values'}->{ $MAPPING{$nm} } = $self->{'_last_data'};
432 else {
433 $self->debug("unknown nm $nm, ignoring\n");
435 $self->{'_last_data'} = ''; # remove read data if we are at
436 # end of an element
437 $self->{'_result'} = $rc if ( defined $type && $type eq 'result' );
438 return $rc;
441 =head2 element
443 Title : element
444 Usage : $eventhandler->element({'Name' => $name, 'Data' => $str});
445 Function: Convenience method that calls start_element, characters, end_element
446 Returns : none
447 Args : Hash ref with the keys 'Name' and 'Data'
450 =cut
452 sub element {
453 my ( $self, $data ) = @_;
454 # simple data calls (%MAPPING) do not need start_element
455 $self->characters($data);
456 $self->end_element($data);
460 =head2 element_hash
462 Title : element
463 Usage : $eventhandler->element_hash({'Hsp_hit-from' => $start,
464 'Hsp_hit-to' => $end,
465 'Hsp_score' => $lastscore});
466 Function: Convenience method that takes multiple simple data elements and
467 maps to appropriate parameters
468 Returns : none
469 Args : Hash ref with the mapped key (in %MAPPING) and value
471 =cut
473 sub element_hash {
474 my ($self, $data) = @_;
475 $self->throw("Must provide data hash ref") if !$data || !ref($data);
476 for my $nm (sort keys %{$data}) {
477 next if $data->{$nm} && $data->{$nm} =~ m{^\s*$}o;
478 if ( $MAPPING{$nm} ) {
479 if ( ref( $MAPPING{$nm} ) =~ /hash/i ) {
480 my $key = ( keys %{ $MAPPING{$nm} } )[0];
481 $self->{'_values'}->{$key}->{ $MAPPING{$nm}->{$key} } =
482 $data->{$nm};
484 else {
485 $self->{'_values'}->{ $MAPPING{$nm} } = $data->{$nm};
491 =head2 characters
493 Title : characters
494 Usage : $eventgenerator->characters($str)
495 Function: Send a character events
496 Returns : none
497 Args : string
500 =cut
502 sub characters {
503 my ( $self, $data ) = @_;
504 return unless ( defined $data->{'Data'} && $data->{'Data'} !~ /^\s+$/o );
505 $self->{'_last_data'} = $data->{'Data'};
508 =head2 within_element
510 Title : within_element
511 Usage : if( $eventgenerator->within_element($element) ) {}
512 Function: Test if we are within a particular element
513 This is different than 'in' because within can be tested
514 for a whole block.
515 Returns : boolean
516 Args : string element name
518 =cut
520 sub within_element {
521 my ( $self, $name ) = @_;
522 return 0
523 if ( !defined $name
524 || !defined $self->{'_elements'}
525 || scalar @{ $self->{'_elements'} } == 0 );
526 foreach ( @{ $self->{'_elements'} } ) {
527 return 1 if ( $_ eq $name );
529 return 0;
532 =head2 in_element
534 Title : in_element
535 Usage : if( $eventgenerator->in_element($element) ) {}
536 Function: Test if we are in a particular element
537 This is different than 'within' because 'in' only
538 tests its immediate parent.
539 Returns : boolean
540 Args : string element name
542 =cut
544 sub in_element {
545 my ( $self, $name ) = @_;
546 return 0 if !defined $self->{'_elements'}->[0];
547 return ( $self->{'_elements'}->[0] eq $name );
550 =head2 start_document
552 Title : start_document
553 Usage : $eventgenerator->start_document
554 Function: Handle a start document event
555 Returns : none
556 Args : none
558 =cut
560 sub start_document {
561 my ($self) = @_;
562 $self->{'_lasttype'} = '';
563 $self->{'_values'} = {};
564 $self->{'_result'} = undef;
565 $self->{'_elements'} = [];
568 =head2 end_document
570 Title : end_document
571 Usage : $eventgenerator->end_document
572 Function: Handles an end document event
573 Returns : Bio::Search::Result::ResultI object
574 Args : none
576 =cut
578 sub end_document {
579 my ($self) = @_;
580 return $self->{'_result'};
583 =head2 result_count
585 Title : result_count
586 Usage : my $count = $searchio->result_count
587 Function: Returns the number of results we have processed
588 Returns : integer
589 Args : none
591 =cut
593 sub result_count {
594 my $self = shift;
595 return $self->{'_result_count'};
598 =head2 descriptor
600 Title : descriptor
601 Usage : my $descr = $parser->descriptor();
602 Function: Get/Set descriptor name. Some versions of RNAMotif do not add the
603 descriptor name to the output. This also overrides any name found
604 while parsing.
605 Returns : String (name of model)
606 Args : [optional] String (name of model)
608 =cut
610 sub descriptor {
611 my $self = shift;
612 return $self->{'_descriptor'} = shift if @_;
613 return $self->{'_descriptor'};
616 =head2 model
618 Title : model
619 Usage : my $model = $parser->model();
620 Function: Get/Set model; Infernal currently does not output
621 the model name (Rfam ID)
622 Returns : String (name of model)
623 Args : [optional] String (name of model)
624 Note : this is a synonym for descriptor()
626 =cut
628 sub model { shift->descriptor(@_) }
630 =head2 database
632 Title : database
633 Usage : my $database = $parser->database();
634 Function: Get/Set database; Infernal currently does not output
635 the database name
636 Returns : String (database name)
637 Args : [optional] String (database name)
639 =cut
641 sub database {
642 my $self = shift;
643 return $self->{'_database'} = shift if @_;
644 return $self->{'_database'};
647 =head2 query_accession
649 Title : query_accession
650 Usage : my $acc = $parser->query_accession();
651 Function: Get/Set query (model) accession; RNAMotif currently does not output
652 the accession number
653 Returns : String (accession)
654 Args : [optional] String (accession)
656 =cut
658 sub query_accession {
659 my $self = shift;
660 return $self->{'_query_accession'} = shift if @_;
661 return $self->{'_query_accession'};
664 =head2 algorithm_version
666 Title : algorithm_version
667 Usage : my $ver = $parser->algorithm_version();
668 Function: Get/Set algorithm version (not defined in RNAMotif output)
669 Returns : String (accession)
670 Args : [optional] String (accession)
672 =cut
674 sub algorithm_version {
675 my $self = shift;
676 return $self->{'_algorithm'} = shift if @_;
677 return $self->{'_algorithm'};
680 =head2 hsp_minscore
682 Title : hsp_minscore
683 Usage : my $cutoff = $parser->hsp_minscore();
684 Function: Get/Set min score cutoff (for generating Hits/HSPs).
685 Returns : score (number)
686 Args : [optional] score (number)
687 Note : Cannot be set along with hsp_maxscore()
689 =cut
691 sub hsp_minscore {
692 my ($self, $score) = shift;
693 $self->throw('Minscore not set to a number') if
694 ($score && $score !~ m{[0-9.]+});
695 return $self->{'_hsp_minscore'} = shift if @_;
696 return $self->{'_hsp_minscore'};
699 =head2 hsp_maxscore
701 Title : hsp_maxscore
702 Usage : my $cutoff = $parser->hsp_maxscore();
703 Function: Get/Set max score cutoff (for generating Hits/HSPs).
704 Returns : score (number)
705 Args : [optional] score (number)
706 Note : Cannot be set along with hsp_minscore()
708 =cut
710 sub hsp_maxscore {
711 my ($self, $score) = shift;
712 $self->throw('Maxscore not set to a number') if
713 ($score && $score !~ m{[0-9.]+});
714 return $self->{'_hsp_maxscore'} = shift if @_;
715 return $self->{'_hsp_maxscore'};
718 =head2 structure_symbols
720 Title : structure_symbols
721 Usage : my $hashref = $parser->structure_symbols();
722 Function: Get/Set RNA structure symbols
723 Returns : Hash ref of delimiters (5' stem, 3' stem, single-strand, etc)
724 : default = < (5-prime)
725 > (3-prime)
726 . (single-strand)
727 ? (unknown)
728 Args : Hash ref of substitute delimiters, using above keys.
730 =cut
732 sub structure_symbols {
733 my ($self, $delim) = @_;
734 if ($delim) {
735 if (ref($delim) =~ m{HASH}) {
736 my %data = %{ $delim };
737 for my $d (@VALID_SYMBOLS) {
738 if ( exists $data{$d} ) {
739 $self->{'_delimiter'}->{$d} = $data{$d};
742 } else {
743 $self->throw("Args to helix_delimiters() should be in a hash reference");
746 return $self->{'_delimiter'};
749 #Private methods
751 =head2 _motif2meta
753 Title : _motif2meta
754 Usage : my ($rna, $meta) = $parser->_motif2meta($str, $descr);
755 Function: Creates meta string from sequence and descriptor
756 Returns : array of sequence, meta strings
757 Args : Array of string data and descriptor data
759 Note: This is currently a quick and simple way of making simple
760 RNA structures (stem-loops, helices, etc) from RNAMotif descriptor
761 data in the output file. It does not currently work with pseudoknots,
762 triplets, G-quartets, or other more complex RNA structural motifs.
764 =cut
766 sub _motif2meta {
767 my ($self, $str, $descriptor) = @_;
768 my ($rna, $meta);
769 my @desc_el = split ' ',$descriptor;
770 my @seq_el = split ' ',$str;
771 my $symbol = $self->structure_symbols();
772 if ($#desc_el != $#seq_el) {
773 $self->throw("Descriptor elements and seq elements do not match");
775 while (@desc_el) {
776 my $struct;
777 my ($seq, $motif) = (shift @seq_el, shift @desc_el);
778 $struct = (index($motif,'h5') == 0) ? $symbol->{'5-prime'} :
779 (index($motif,'h3') == 0) ? $symbol->{'3-prime'} :
780 (index($motif,'ss') == 0) ? $symbol->{'single-strand'} :
781 (index($motif,'ctx')== 0) ? $symbol->{'single-strand'} :
782 $symbol->{'unknown'};
783 $meta .= $struct x (length($seq));
784 $rna .= $seq;
786 return ($rna, $meta);