sync w/ main trunk
[bioperl-live.git] / Bio / SearchIO / rnamotif.pm
blob04addcf53c1a2794bc0cc4d4def23af4820e50af
1 # $Id$
3 # BioPerl module for Bio::SearchIO::rnamotif
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Chris Fields <cjfields-at-uiuc-dot-edu>
9 # Copyright Chris Fields
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
15 =head1 NAME
17 Bio::SearchIO::rnamotif - SearchIO-based RNAMotif parser
19 =head1 SYNOPSIS
21 # do not call this module directly. Use Bio::SearchIO.
23 =head1 DESCRIPTION
25 This is a highly experimental SearchIO-based parser for output from the rnamotif
26 program (one of the programs in the RNAMotif suite). It currently parses only
27 raw rnamotif output for RNAMotif versions 3.0 and above; older versions may work
28 but will not be supported. rmfmt output will not be supported at this time.
30 =head1 FEEDBACK
32 =head2 Mailing Lists
34 User feedback is an integral part of the evolution of this and other
35 Bioperl modules. Send your comments and suggestions preferably to
36 the Bioperl mailing list. Your participation is much appreciated.
38 bioperl-l@bioperl.org - General discussion
39 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
41 =head2 Support
43 Please direct usage questions or support issues to the mailing list:
45 L<bioperl-l@bioperl.org>
47 rather than to the module maintainer directly. Many experienced and
48 reponsive experts will be able look at the problem and quickly
49 address it. Please include a thorough description of the problem
50 with code and data examples if at all possible.
52 =head2 Reporting Bugs
54 Report bugs to the Bioperl bug tracking system to help us keep track
55 of the bugs and their resolution. Bug reports can be submitted via the
56 web:
58 http://bugzilla.open-bio.org/
60 =head1 AUTHOR - Chris Fields
62 Email cjfields-at-uiuc-dot-edu
64 =head1 APPENDIX
66 The rest of the documentation details each of the object methods.
67 Internal methods are usually preceded with a _
69 =cut
71 # Let the code begin...
73 package Bio::SearchIO::rnamotif;
74 use strict;
76 use base qw(Bio::SearchIO);
78 my %MODEMAP = (
79 'Result' => 'result',
80 'Hit' => 'hit',
81 'Hsp' => 'hsp'
84 my %MAPPING = (
85 # commented out tags have not been assigned
87 'Hsp_score' => 'HSP-score',
88 'Hsp_custom-data' => 'HSP-custom_score',
90 # rnamotif has no evalue
92 # descriptor has no start, end; same as hit start, end
93 'Hsp_query-from' => 'HSP-query_start',
94 'Hsp_query-to' => 'HSP-query_end',
95 'Hsp_hit-from' => 'HSP-hit_start',
96 'Hsp_hit-to' => 'HSP-hit_end',
98 # descriptor has no start, end
100 'Hsp_hseq' => 'HSP-hit_seq',
101 'Hsp_align-len' => 'HSP-hsp_length',
103 # build this from scratch, simple WUSS-format
104 'Hsp_structure' => 'HSP-meta',
105 'Hsp_stranded' => 'HSP-stranded',
107 # not supported for RNAMotif
109 'Hit_id' => 'HIT-name',
110 'Hit_accession' => 'HIT-accession',
111 'Hit_gi' => 'HIT-ncbi_gi',
112 'Hit_def' => 'HIT-description',
113 'Hit_score' => 'HIT-score', # best HSP score
115 'RNAMotif_program' => 'RESULT-algorithm_name', # get/set
116 'RNAMotif_version' => 'RESULT-algorithm_version', # get/set
117 'RNAMotif_query-def'=> 'RESULT-query_name', # get/set
118 # No length (query is a descriptor)
119 'RNAMotif_query-acc'=> 'RESULT-query_accession', # get/set
120 'RNAMotif_querydesc'=> 'RESULT-query_description', # get/set
121 'RNAMotif_db' => 'RESULT-database_name', # get/set
124 # use structure_delimiters to set custom delimiters
126 my @VALID_SYMBOLS = qw(5-prime 3-prime single-strand unknown);
127 my %STRUCTURE_SYMBOLS = (
128 '5-prime' => '<',
129 '3-prime' => '>',
130 'single-strand' => '.',
131 'unknown' => '?'
132 # may add more for quartets, triplets
135 my $DEFAULT_VERSION = '3.0.3';
137 =head2 new
139 Title : new
140 Usage : my $obj = Bio::SearchIO->new();
141 Function: Builds a new Bio::SearchIO::rnamotif object
142 Returns : Bio::SearchIO::rnamotif parser
143 Args : -fh/-file => RNAMotif filename
144 -format => 'rnamotif'
145 -model => query model (or descriptor, in this case)
146 -database => database name (default undef)
147 -query_acc => query accession (default undef)
148 -hsp_minscore => minimum HSP score cutoff
149 -hsp_maxscore => maximum HSP score cutoff
150 -symbols => hash ref of structure symbols to use
151 (default symbols in %STRUCTURE_SYMBOLS hash)
153 =cut
155 sub _initialize {
156 my ( $self, @args ) = @_;
157 $self->SUPER::_initialize(@args);
158 my ($version, $model, $database, $maxcutoff, $mincutoff, $seqdistance,
159 $accession, $symbols) =
160 $self->_rearrange([qw(VERSION MODEL DATABASE HSP_MAXSCORE
161 HSP_MINSCORE SEQ_DISTANCE QUERY_ACC SYMBOLS)],@args);
162 my $handler = $self->_eventHandler;
163 $handler->register_factory(
164 'result',
165 Bio::Factory::ObjectFactory->new(
166 -type => 'Bio::Search::Result::GenericResult',
167 -interface => 'Bio::Search::Result::ResultI',
168 -verbose => $self->verbose
172 $handler->register_factory(
173 'hit',
174 Bio::Factory::ObjectFactory->new(
175 -type => 'Bio::Search::Hit::ModelHit',
176 -interface => 'Bio::Search::Hit::HitI',
177 -verbose => $self->verbose
181 $handler->register_factory(
182 'hsp',
183 Bio::Factory::ObjectFactory->new(
184 -type => 'Bio::Search::HSP::ModelHSP',
185 -interface => 'Bio::Search::HSP::HSPI',
186 -verbose => $self->verbose
189 $model && $self->model($model);
190 $database && $self->database($database);
191 $accession && $self->query_accession($accession);
192 $version ||= $DEFAULT_VERSION;
193 $self->algorithm_version($version);
194 $self->throw("Cannot define both a minimal and maximal cutoff")
195 if (defined($mincutoff) && defined($maxcutoff));
196 defined($mincutoff) && $self->hsp_minscore($mincutoff);
197 defined($maxcutoff) && $self->hsp_maxscore($maxcutoff);
198 $symbols ||= \%STRUCTURE_SYMBOLS;
199 $self->structure_symbols($symbols);
202 =head2 next_result
204 Title : next_result
205 Usage : my $hit = $searchio->next_result;
206 Function: Returns the next Result from a search
207 Returns : Bio::Search::Result::ResultI object
208 Args : none
210 =cut
212 sub next_result {
213 my ($self) = @_;
214 my $seentop = 0;
215 local $/ = "\n";
216 local $_;
217 my ($rm, $d, $descriptor, $file, $oktobuild);
218 my ($hitid, $hitdesc, $hspid, $lastid, $lastscore);
219 my $sprintf;
221 # user-determined Result data
222 my ($accession, $db, $model) =
223 ($self->query_accession, $self->database, $self->model);
224 # HSP building options
225 my $hsp_min = $self->hsp_minscore;
226 my $hsp_max = $self->hsp_maxscore;
227 my $version = $self->algorithm_version;
228 my $laststart;
230 my $verbose = $self->verbose; # cache for speed?
231 $self->start_document();
232 PARSER:
233 while ( defined( my $line = $self->_readline ) ) {
234 # start of report
235 next if $line =~ m{^\s+$};
236 if (index($line,'#RM') == 0) {
237 if (index($line,'#RM scored') == 0 ) {
238 if ($seentop) {
239 $self->_pushback($line);
240 last PARSER;
242 $self->start_element({'Name' => 'Result'});
243 $self->element_hash({
244 'RNAMotif_program' => 'rnamotif',
245 'RNAMotif_version' => $version,
246 'RNAMotif_query-acc' => $accession,
247 'RNAMotif_db' => $db
248 });
249 $seentop = 1;
250 #$self->debug("Start result\n");
251 } elsif (index($line,'#RM descr') == 0) {
252 ($rm, $d, $descriptor) = split ' ', $line, 3;
253 # toss $rm, $d; keep $descr
254 chomp $descriptor;
255 $self->{'_descriptor'} = $descriptor;
256 $self->element(
257 {'Name' => 'RNAMotif_querydesc',
258 'Data' => $descriptor}
260 } elsif(index($line,'#RM dfile') == 0) {
261 ($rm, $d, $file) = split ' ', $line, 3;
262 # toss $rm, $d; keep $file
263 chomp $file;
264 $self->element(
265 {'Name' => 'RNAMotif_query-def',
266 'Data' => $file}
268 } else {
269 $self->debug("Unrecognized line: $line");
271 } elsif ($line =~ s{^>}{}) {
272 chomp $line;
273 ($hitid, $hitdesc) = split ' ',$line,2;
275 if ($self->within_element('hit') && ($hitid ne $lastid)) {
276 $self->element(
277 {'Name' => 'Hit_score',
278 'Data' => $lastscore}
279 ) if $lastscore;
280 $self->end_element({'Name' => 'Hit'});
281 $self->start_element({'Name' => 'Hit'});
282 } elsif (!$self->within_element('hit')) {
283 $self->start_element({'Name' => 'Hit'});
285 my ($gi, $acc, $ver) = $self->_get_seq_identifiers($hitid);
287 $self->element_hash({
288 'Hit_id' => $hitid,
289 'Hit_gi' => $gi,
290 'Hit_accession' => $ver ? "$acc.$ver" :
291 $acc ? $acc : $hitid,
292 'Hit_def' => $hitdesc}
294 $lastid = $hitid;
295 } elsif ($line =~ m{^(\S+)\s+(.+?)\s+(\d+)\s+(\d+)\s+(\d+)\s(.*)$}xm) {
296 chomp $line;
297 my $hspid = $1;
298 my ($score, $strand, $start, $length , $seq) = ($2, $3, $4, $5, $6);
299 $score *= 1; # implicitly cast any odd '0.000' to float
300 # sanity check ids
301 unless ($hitid eq $hspid) {
302 $self->throw("IDs do not match!");
304 # check score for possible sprintf data, mark as such, cache result
305 if (!defined($sprintf)) {
306 if ($score =~ m{[^0-9.-]+}gxms) {
307 if (defined $hsp_min || defined $hsp_max ) {
308 $self->warn("HSP data likely contains custom score; ".
309 "ignoring min/maxscore");
311 $sprintf = $oktobuild = 1;
312 } else {
313 $sprintf = 0;
317 if (!$sprintf) {
318 if (($hsp_min && $score <= $hsp_min)
319 || ($hsp_max && ($score >= $hsp_max)) ) {
320 # do not build HSP
321 $oktobuild = 0;
322 } else {
323 $oktobuild = 1;
325 # store best hit score based on the hsp min/maxscore only
326 if (defined $hsp_min && $score > $hsp_min) {
327 $lastscore = $score if !$lastscore || $score > $lastscore;
328 } elsif (defined $hsp_max && $score < $hsp_max) {
329 $lastscore = $score if !$lastscore || $score < $lastscore;
334 # build HSP
335 if ($oktobuild) {
336 my $end;
337 # calculate start/end
338 if( $strand==0 ) {
339 $end = $start + $length -1;
340 } else {
341 $end = $start - $length + 1;
344 my ($rna, $meta) = $self->_motif2meta($seq, $descriptor);
346 $self->start_element({'Name' => 'Hsp'});
347 my $rnalen = $rna =~ tr{ATGCatgc}{ATGCatgc};
348 $self->element_hash({
349 'Hsp_stranded' => 'HIT',
350 'Hsp_hseq' => $rna,
351 'Hsp_query-from' => 1,
352 'Hsp_query-to' =>length($rna),
353 'Hsp_hit-from' => $start,
354 'Hsp_hit-to' => $end,
355 'Hsp_structure' => $meta,
356 'Hsp_align-len' => length($rna),
357 'Hsp_score' => $sprintf ? undef : $score,
358 'Hsp_custom-data' => $sprintf ? $score : undef,
360 $self->end_element({'Name' => 'Hsp'});
361 $oktobuild = 0 if (!$sprintf);
365 if ($self->within_element('hit')) {
366 $self->element(
367 {'Name' => 'Hit_score',
368 'Data' => $lastscore}
369 ) if $lastscore;
370 $self->end_element( { 'Name' => 'Hit' } );
372 if ($seentop) {
373 $self->end_element( { 'Name' => 'Result' } );
375 return $self->end_document();
378 =head2 start_element
380 Title : start_element
381 Usage : $eventgenerator->start_element
382 Function: Handles a start element event
383 Returns : none
384 Args : hashref with at least 2 keys 'Data' and 'Name'
387 =cut
389 sub start_element {
390 my ( $self, $data ) = @_;
392 # we currently don't care about attributes
393 my $nm = $data->{'Name'};
394 my $type = $MODEMAP{$nm};
395 if ($type) {
396 if ( $self->_eventHandler->will_handle($type) ) {
397 my $func = sprintf( "start_%s", lc $type );
398 $self->_eventHandler->$func( $data->{'Attributes'} );
400 unshift @{ $self->{'_elements'} }, $type;
402 if ( defined $type
403 && $type eq 'result' )
405 $self->{'_values'} = {};
406 $self->{'_result'} = undef;
410 =head2 end_element
412 Title : start_element
413 Usage : $eventgenerator->end_element
414 Function: Handles an end element event
415 Returns : none
416 Args : hashref with at least 2 keys, 'Data' and 'Name'
419 =cut
421 sub end_element {
422 my ( $self, $data ) = @_;
423 my $nm = $data->{'Name'};
424 my $type = $MODEMAP{$nm};
425 my $rc;
427 if ($type) {
428 if ( $self->_eventHandler->will_handle($type) ) {
429 my $func = sprintf( "end_%s", lc $type );
430 $rc = $self->_eventHandler->$func( $self->{'_reporttype'},
431 $self->{'_values'} );
433 my $lastelem = shift @{ $self->{'_elements'} };
435 elsif ( $MAPPING{$nm} ) {
436 if ( ref( $MAPPING{$nm} ) =~ /hash/i ) {
437 my $key = ( keys %{ $MAPPING{$nm} } )[0];
438 $self->{'_values'}->{$key}->{ $MAPPING{$nm}->{$key} } =
439 $self->{'_last_data'};
441 else {
442 $self->{'_values'}->{ $MAPPING{$nm} } = $self->{'_last_data'};
445 else {
446 $self->debug("unknown nm $nm, ignoring\n");
448 $self->{'_last_data'} = ''; # remove read data if we are at
449 # end of an element
450 $self->{'_result'} = $rc if ( defined $type && $type eq 'result' );
451 return $rc;
454 =head2 element
456 Title : element
457 Usage : $eventhandler->element({'Name' => $name, 'Data' => $str});
458 Function: Convenience method that calls start_element, characters, end_element
459 Returns : none
460 Args : Hash ref with the keys 'Name' and 'Data'
463 =cut
465 sub element {
466 my ( $self, $data ) = @_;
467 # simple data calls (%MAPPING) do not need start_element
468 $self->characters($data);
469 $self->end_element($data);
473 =head2 element_hash
475 Title : element
476 Usage : $eventhandler->element_hash({'Hsp_hit-from' => $start,
477 'Hsp_hit-to' => $end,
478 'Hsp_score' => $lastscore});
479 Function: Convenience method that takes multiple simple data elements and
480 maps to appropriate parameters
481 Returns : none
482 Args : Hash ref with the mapped key (in %MAPPING) and value
484 =cut
486 sub element_hash {
487 my ($self, $data) = @_;
488 $self->throw("Must provide data hash ref") if !$data || !ref($data);
489 for my $nm (sort keys %{$data}) {
490 next if $data->{$nm} && $data->{$nm} =~ m{^\s*$}o;
491 if ( $MAPPING{$nm} ) {
492 if ( ref( $MAPPING{$nm} ) =~ /hash/i ) {
493 my $key = ( keys %{ $MAPPING{$nm} } )[0];
494 $self->{'_values'}->{$key}->{ $MAPPING{$nm}->{$key} } =
495 $data->{$nm};
497 else {
498 $self->{'_values'}->{ $MAPPING{$nm} } = $data->{$nm};
504 =head2 characters
506 Title : characters
507 Usage : $eventgenerator->characters($str)
508 Function: Send a character events
509 Returns : none
510 Args : string
513 =cut
515 sub characters {
516 my ( $self, $data ) = @_;
517 return unless ( defined $data->{'Data'} && $data->{'Data'} !~ /^\s+$/o );
518 $self->{'_last_data'} = $data->{'Data'};
521 =head2 within_element
523 Title : within_element
524 Usage : if( $eventgenerator->within_element($element) ) {}
525 Function: Test if we are within a particular element
526 This is different than 'in' because within can be tested
527 for a whole block.
528 Returns : boolean
529 Args : string element name
531 =cut
533 sub within_element {
534 my ( $self, $name ) = @_;
535 return 0
536 if ( !defined $name
537 || !defined $self->{'_elements'}
538 || scalar @{ $self->{'_elements'} } == 0 );
539 foreach ( @{ $self->{'_elements'} } ) {
540 return 1 if ( $_ eq $name );
542 return 0;
545 =head2 in_element
547 Title : in_element
548 Usage : if( $eventgenerator->in_element($element) ) {}
549 Function: Test if we are in a particular element
550 This is different than 'within' because 'in' only
551 tests its immediate parent.
552 Returns : boolean
553 Args : string element name
555 =cut
557 sub in_element {
558 my ( $self, $name ) = @_;
559 return 0 if !defined $self->{'_elements'}->[0];
560 return ( $self->{'_elements'}->[0] eq $name );
563 =head2 start_document
565 Title : start_document
566 Usage : $eventgenerator->start_document
567 Function: Handle a start document event
568 Returns : none
569 Args : none
571 =cut
573 sub start_document {
574 my ($self) = @_;
575 $self->{'_lasttype'} = '';
576 $self->{'_values'} = {};
577 $self->{'_result'} = undef;
578 $self->{'_elements'} = [];
581 =head2 end_document
583 Title : end_document
584 Usage : $eventgenerator->end_document
585 Function: Handles an end document event
586 Returns : Bio::Search::Result::ResultI object
587 Args : none
589 =cut
591 sub end_document {
592 my ($self) = @_;
593 return $self->{'_result'};
596 =head2 result_count
598 Title : result_count
599 Usage : my $count = $searchio->result_count
600 Function: Returns the number of results we have processed
601 Returns : integer
602 Args : none
604 =cut
606 sub result_count {
607 my $self = shift;
608 return $self->{'_result_count'};
611 =head2 descriptor
613 Title : descriptor
614 Usage : my $descr = $parser->descriptor();
615 Function: Get/Set descriptor name. Some versions of RNAMotif do not add the
616 descriptor name to the output. This also overrides any name found
617 while parsing.
618 Returns : String (name of model)
619 Args : [optional] String (name of model)
621 =cut
623 sub descriptor {
624 my $self = shift;
625 return $self->{'_descriptor'} = shift if @_;
626 return $self->{'_descriptor'};
629 =head2 model
631 Title : model
632 Usage : my $model = $parser->model();
633 Function: Get/Set model; Infernal currently does not output
634 the model name (Rfam ID)
635 Returns : String (name of model)
636 Args : [optional] String (name of model)
637 Note : this is a synonym for descriptor()
639 =cut
641 sub model { shift->descriptor(@_) }
643 =head2 database
645 Title : database
646 Usage : my $database = $parser->database();
647 Function: Get/Set database; Infernal currently does not output
648 the database name
649 Returns : String (database name)
650 Args : [optional] String (database name)
652 =cut
654 sub database {
655 my $self = shift;
656 return $self->{'_database'} = shift if @_;
657 return $self->{'_database'};
660 =head2 query_accession
662 Title : query_accession
663 Usage : my $acc = $parser->query_accession();
664 Function: Get/Set query (model) accession; RNAMotif currently does not output
665 the accession number
666 Returns : String (accession)
667 Args : [optional] String (accession)
669 =cut
671 sub query_accession {
672 my $self = shift;
673 return $self->{'_query_accession'} = shift if @_;
674 return $self->{'_query_accession'};
677 =head2 algorithm_version
679 Title : algorithm_version
680 Usage : my $ver = $parser->algorithm_version();
681 Function: Get/Set algorithm version (not defined in RNAMotif output)
682 Returns : String (accession)
683 Args : [optional] String (accession)
685 =cut
687 sub algorithm_version {
688 my $self = shift;
689 return $self->{'_algorithm'} = shift if @_;
690 return $self->{'_algorithm'};
693 =head2 hsp_minscore
695 Title : hsp_minscore
696 Usage : my $cutoff = $parser->hsp_minscore();
697 Function: Get/Set min score cutoff (for generating Hits/HSPs).
698 Returns : score (number)
699 Args : [optional] score (number)
700 Note : Cannot be set along with hsp_maxscore()
702 =cut
704 sub hsp_minscore {
705 my ($self, $score) = shift;
706 $self->throw('Minscore not set to a number') if
707 ($score && $score !~ m{[0-9.]+});
708 return $self->{'_hsp_minscore'} = shift if @_;
709 return $self->{'_hsp_minscore'};
712 =head2 hsp_maxscore
714 Title : hsp_maxscore
715 Usage : my $cutoff = $parser->hsp_maxscore();
716 Function: Get/Set max score cutoff (for generating Hits/HSPs).
717 Returns : score (number)
718 Args : [optional] score (number)
719 Note : Cannot be set along with hsp_minscore()
721 =cut
723 sub hsp_maxscore {
724 my ($self, $score) = shift;
725 $self->throw('Maxscore not set to a number') if
726 ($score && $score !~ m{[0-9.]+});
727 return $self->{'_hsp_maxscore'} = shift if @_;
728 return $self->{'_hsp_maxscore'};
731 =head2 structure_symbols
733 Title : structure_symbols
734 Usage : my $hashref = $parser->structure_symbols();
735 Function: Get/Set RNA structure symbols
736 Returns : Hash ref of delimiters (5' stem, 3' stem, single-strand, etc)
737 : default = < (5-prime)
738 > (3-prime)
739 . (single-strand)
740 ? (unknown)
741 Args : Hash ref of substitute delimiters, using above keys.
743 =cut
745 sub structure_symbols {
746 my ($self, $delim) = @_;
747 if ($delim) {
748 if (ref($delim) =~ m{HASH}) {
749 my %data = %{ $delim };
750 for my $d (@VALID_SYMBOLS) {
751 if ( exists $data{$d} ) {
752 $self->{'_delimiter'}->{$d} = $data{$d};
755 } else {
756 $self->throw("Args to helix_delimiters() should be in a hash reference");
759 return $self->{'_delimiter'};
762 #Private methods
764 =head2 _motif2meta
766 Title : _motif2meta
767 Usage : my ($rna, $meta) = $parser->_motif2meta($str, $descr);
768 Function: Creates meta string from sequence and descriptor
769 Returns : array of sequence, meta strings
770 Args : Array of string data and descriptor data
772 Note: This is currently a quick and simple way of making simple
773 RNA structures (stem-loops, helices, etc) from RNAMotif descriptor
774 data in the output file. It does not currently work with pseudoknots,
775 triplets, G-quartets, or other more complex RNA structural motifs.
777 =cut
779 sub _motif2meta {
780 my ($self, $str, $descriptor) = @_;
781 my ($rna, $meta);
782 my @desc_el = split ' ',$descriptor;
783 my @seq_el = split ' ',$str;
784 my $symbol = $self->structure_symbols();
785 if ($#desc_el != $#seq_el) {
786 $self->throw("Descriptor elements and seq elements do not match");
788 while (@desc_el) {
789 my $struct;
790 my ($seq, $motif) = (shift @seq_el, shift @desc_el);
791 $struct = (index($motif,'h5') == 0) ? $symbol->{'5-prime'} :
792 (index($motif,'h3') == 0) ? $symbol->{'3-prime'} :
793 (index($motif,'ss') == 0) ? $symbol->{'single-strand'} :
794 (index($motif,'ctx')== 0) ? $symbol->{'single-strand'} :
795 $symbol->{'unknown'};
796 $meta .= $struct x (length($seq));
797 $rna .= $seq;
799 return ($rna, $meta);