3 # BioPerl module for Bio::SearchIO::rnamotif
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Chris Fields <cjfields-at-uiuc-dot-edu>
9 # Copyright Chris Fields
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
17 Bio::SearchIO::rnamotif - SearchIO-based RNAMotif parser
21 # do not call this module directly. Use Bio::SearchIO.
25 This is a highly experimental SearchIO-based parser for output from the rnamotif
26 program (one of the programs in the RNAMotif suite). It currently parses only
27 raw rnamotif output for RNAMotif versions 3.0 and above; older versions may work
28 but will not be supported. rmfmt output will not be supported at this time.
34 User feedback is an integral part of the evolution of this and other
35 Bioperl modules. Send your comments and suggestions preferably to
36 the Bioperl mailing list. Your participation is much appreciated.
38 bioperl-l@bioperl.org - General discussion
39 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
43 Please direct usage questions or support issues to the mailing list:
45 L<bioperl-l@bioperl.org>
47 rather than to the module maintainer directly. Many experienced and
48 reponsive experts will be able look at the problem and quickly
49 address it. Please include a thorough description of the problem
50 with code and data examples if at all possible.
54 Report bugs to the Bioperl bug tracking system to help us keep track
55 of the bugs and their resolution. Bug reports can be submitted via the
58 http://bugzilla.open-bio.org/
60 =head1 AUTHOR - Chris Fields
62 Email cjfields-at-uiuc-dot-edu
66 The rest of the documentation details each of the object methods.
67 Internal methods are usually preceded with a _
71 # Let the code begin...
73 package Bio
::SearchIO
::rnamotif
;
76 use base
qw(Bio::SearchIO);
85 # commented out tags have not been assigned
87 'Hsp_score' => 'HSP-score',
88 'Hsp_custom-data' => 'HSP-custom_score',
90 # rnamotif has no evalue
92 # descriptor has no start, end; same as hit start, end
93 'Hsp_query-from' => 'HSP-query_start',
94 'Hsp_query-to' => 'HSP-query_end',
95 'Hsp_hit-from' => 'HSP-hit_start',
96 'Hsp_hit-to' => 'HSP-hit_end',
98 # descriptor has no start, end
100 'Hsp_hseq' => 'HSP-hit_seq',
101 'Hsp_align-len' => 'HSP-hsp_length',
103 # build this from scratch, simple WUSS-format
104 'Hsp_structure' => 'HSP-meta',
105 'Hsp_stranded' => 'HSP-stranded',
107 # not supported for RNAMotif
109 'Hit_id' => 'HIT-name',
110 'Hit_accession' => 'HIT-accession',
111 'Hit_gi' => 'HIT-ncbi_gi',
112 'Hit_def' => 'HIT-description',
113 'Hit_score' => 'HIT-score', # best HSP score
115 'RNAMotif_program' => 'RESULT-algorithm_name', # get/set
116 'RNAMotif_version' => 'RESULT-algorithm_version', # get/set
117 'RNAMotif_query-def'=> 'RESULT-query_name', # get/set
118 # No length (query is a descriptor)
119 'RNAMotif_query-acc'=> 'RESULT-query_accession', # get/set
120 'RNAMotif_querydesc'=> 'RESULT-query_description', # get/set
121 'RNAMotif_db' => 'RESULT-database_name', # get/set
124 # use structure_delimiters to set custom delimiters
126 my @VALID_SYMBOLS = qw(5-prime 3-prime single-strand unknown);
127 my %STRUCTURE_SYMBOLS = (
130 'single-strand' => '.',
132 # may add more for quartets, triplets
135 my $DEFAULT_VERSION = '3.0.3';
140 Usage : my $obj = Bio::SearchIO->new();
141 Function: Builds a new Bio::SearchIO::rnamotif object
142 Returns : Bio::SearchIO::rnamotif parser
143 Args : -fh/-file => RNAMotif filename
144 -format => 'rnamotif'
145 -model => query model (or descriptor, in this case)
146 -database => database name (default undef)
147 -query_acc => query accession (default undef)
148 -hsp_minscore => minimum HSP score cutoff
149 -hsp_maxscore => maximum HSP score cutoff
150 -symbols => hash ref of structure symbols to use
151 (default symbols in %STRUCTURE_SYMBOLS hash)
156 my ( $self, @args ) = @_;
157 $self->SUPER::_initialize
(@args);
158 my ($version, $model, $database, $maxcutoff, $mincutoff, $seqdistance,
159 $accession, $symbols) =
160 $self->_rearrange([qw(VERSION MODEL DATABASE HSP_MAXSCORE
161 HSP_MINSCORE SEQ_DISTANCE QUERY_ACC SYMBOLS)],@args);
162 my $handler = $self->_eventHandler;
163 $handler->register_factory(
165 Bio
::Factory
::ObjectFactory
->new(
166 -type
=> 'Bio::Search::Result::GenericResult',
167 -interface
=> 'Bio::Search::Result::ResultI',
168 -verbose
=> $self->verbose
172 $handler->register_factory(
174 Bio
::Factory
::ObjectFactory
->new(
175 -type
=> 'Bio::Search::Hit::ModelHit',
176 -interface
=> 'Bio::Search::Hit::HitI',
177 -verbose
=> $self->verbose
181 $handler->register_factory(
183 Bio
::Factory
::ObjectFactory
->new(
184 -type
=> 'Bio::Search::HSP::ModelHSP',
185 -interface
=> 'Bio::Search::HSP::HSPI',
186 -verbose
=> $self->verbose
189 $model && $self->model($model);
190 $database && $self->database($database);
191 $accession && $self->query_accession($accession);
192 $version ||= $DEFAULT_VERSION;
193 $self->algorithm_version($version);
194 $self->throw("Cannot define both a minimal and maximal cutoff")
195 if (defined($mincutoff) && defined($maxcutoff));
196 defined($mincutoff) && $self->hsp_minscore($mincutoff);
197 defined($maxcutoff) && $self->hsp_maxscore($maxcutoff);
198 $symbols ||= \
%STRUCTURE_SYMBOLS;
199 $self->structure_symbols($symbols);
205 Usage : my $hit = $searchio->next_result;
206 Function: Returns the next Result from a search
207 Returns : Bio::Search::Result::ResultI object
217 my ($rm, $d, $descriptor, $file, $oktobuild);
218 my ($hitid, $hitdesc, $hspid, $lastid, $lastscore);
221 # user-determined Result data
222 my ($accession, $db, $model) =
223 ($self->query_accession, $self->database, $self->model);
224 # HSP building options
225 my $hsp_min = $self->hsp_minscore;
226 my $hsp_max = $self->hsp_maxscore;
227 my $version = $self->algorithm_version;
230 my $verbose = $self->verbose; # cache for speed?
231 $self->start_document();
233 while ( defined( my $line = $self->_readline ) ) {
235 next if $line =~ m{^\s+$};
236 if (index($line,'#RM') == 0) {
237 if (index($line,'#RM scored') == 0 ) {
239 $self->_pushback($line);
242 $self->start_element({'Name' => 'Result'});
243 $self->element_hash({
244 'RNAMotif_program' => 'rnamotif',
245 'RNAMotif_version' => $version,
246 'RNAMotif_query-acc' => $accession,
250 #$self->debug("Start result\n");
251 } elsif (index($line,'#RM descr') == 0) {
252 ($rm, $d, $descriptor) = split ' ', $line, 3;
253 # toss $rm, $d; keep $descr
255 $self->{'_descriptor'} = $descriptor;
257 {'Name' => 'RNAMotif_querydesc',
258 'Data' => $descriptor}
260 } elsif(index($line,'#RM dfile') == 0) {
261 ($rm, $d, $file) = split ' ', $line, 3;
262 # toss $rm, $d; keep $file
265 {'Name' => 'RNAMotif_query-def',
269 $self->debug("Unrecognized line: $line");
271 } elsif ($line =~ s{^>}{}) {
273 ($hitid, $hitdesc) = split ' ',$line,2;
275 if ($self->within_element('hit') && ($hitid ne $lastid)) {
277 {'Name' => 'Hit_score',
278 'Data' => $lastscore}
280 $self->end_element({'Name' => 'Hit'});
281 $self->start_element({'Name' => 'Hit'});
282 } elsif (!$self->within_element('hit')) {
283 $self->start_element({'Name' => 'Hit'});
285 my ($gi, $acc, $ver) = $self->_get_seq_identifiers($hitid);
287 $self->element_hash({
290 'Hit_accession' => $ver ?
"$acc.$ver" :
291 $acc ?
$acc : $hitid,
292 'Hit_def' => $hitdesc}
295 } elsif ($line =~ m{^(\S+)\s+(.+?)\s+(\d+)\s+(\d+)\s+(\d+)\s(.*)$}xm) {
298 my ($score, $strand, $start, $length , $seq) = ($2, $3, $4, $5, $6);
299 $score *= 1; # implicitly cast any odd '0.000' to float
301 unless ($hitid eq $hspid) {
302 $self->throw("IDs do not match!");
304 # check score for possible sprintf data, mark as such, cache result
305 if (!defined($sprintf)) {
306 if ($score =~ m{[^0-9.-]+}gxms) {
307 if (defined $hsp_min || defined $hsp_max ) {
308 $self->warn("HSP data likely contains custom score; ".
309 "ignoring min/maxscore");
311 $sprintf = $oktobuild = 1;
318 if (($hsp_min && $score <= $hsp_min)
319 || ($hsp_max && ($score >= $hsp_max)) ) {
325 # store best hit score based on the hsp min/maxscore only
326 if (defined $hsp_min && $score > $hsp_min) {
327 $lastscore = $score if !$lastscore || $score > $lastscore;
328 } elsif (defined $hsp_max && $score < $hsp_max) {
329 $lastscore = $score if !$lastscore || $score < $lastscore;
337 # calculate start/end
339 $end = $start + $length -1;
341 $end = $start - $length + 1;
344 my ($rna, $meta) = $self->_motif2meta($seq, $descriptor);
346 $self->start_element({'Name' => 'Hsp'});
347 my $rnalen = $rna =~ tr{ATGCatgc}{ATGCatgc};
348 $self->element_hash({
349 'Hsp_stranded' => 'HIT',
351 'Hsp_query-from' => 1,
352 'Hsp_query-to' =>length($rna),
353 'Hsp_hit-from' => $start,
354 'Hsp_hit-to' => $end,
355 'Hsp_structure' => $meta,
356 'Hsp_align-len' => length($rna),
357 'Hsp_score' => $sprintf ?
undef : $score,
358 'Hsp_custom-data' => $sprintf ?
$score : undef,
360 $self->end_element({'Name' => 'Hsp'});
361 $oktobuild = 0 if (!$sprintf);
365 if ($self->within_element('hit')) {
367 {'Name' => 'Hit_score',
368 'Data' => $lastscore}
370 $self->end_element( { 'Name' => 'Hit' } );
373 $self->end_element( { 'Name' => 'Result' } );
375 return $self->end_document();
380 Title : start_element
381 Usage : $eventgenerator->start_element
382 Function: Handles a start element event
384 Args : hashref with at least 2 keys 'Data' and 'Name'
390 my ( $self, $data ) = @_;
392 # we currently don't care about attributes
393 my $nm = $data->{'Name'};
394 my $type = $MODEMAP{$nm};
396 if ( $self->_eventHandler->will_handle($type) ) {
397 my $func = sprintf( "start_%s", lc $type );
398 $self->_eventHandler->$func( $data->{'Attributes'} );
400 unshift @
{ $self->{'_elements'} }, $type;
403 && $type eq 'result' )
405 $self->{'_values'} = {};
406 $self->{'_result'} = undef;
412 Title : start_element
413 Usage : $eventgenerator->end_element
414 Function: Handles an end element event
416 Args : hashref with at least 2 keys, 'Data' and 'Name'
422 my ( $self, $data ) = @_;
423 my $nm = $data->{'Name'};
424 my $type = $MODEMAP{$nm};
428 if ( $self->_eventHandler->will_handle($type) ) {
429 my $func = sprintf( "end_%s", lc $type );
430 $rc = $self->_eventHandler->$func( $self->{'_reporttype'},
431 $self->{'_values'} );
433 my $lastelem = shift @
{ $self->{'_elements'} };
435 elsif ( $MAPPING{$nm} ) {
436 if ( ref( $MAPPING{$nm} ) =~ /hash/i ) {
437 my $key = ( keys %{ $MAPPING{$nm} } )[0];
438 $self->{'_values'}->{$key}->{ $MAPPING{$nm}->{$key} } =
439 $self->{'_last_data'};
442 $self->{'_values'}->{ $MAPPING{$nm} } = $self->{'_last_data'};
446 $self->debug("unknown nm $nm, ignoring\n");
448 $self->{'_last_data'} = ''; # remove read data if we are at
450 $self->{'_result'} = $rc if ( defined $type && $type eq 'result' );
457 Usage : $eventhandler->element({'Name' => $name, 'Data' => $str});
458 Function: Convenience method that calls start_element, characters, end_element
460 Args : Hash ref with the keys 'Name' and 'Data'
466 my ( $self, $data ) = @_;
467 # simple data calls (%MAPPING) do not need start_element
468 $self->characters($data);
469 $self->end_element($data);
476 Usage : $eventhandler->element_hash({'Hsp_hit-from' => $start,
477 'Hsp_hit-to' => $end,
478 'Hsp_score' => $lastscore});
479 Function: Convenience method that takes multiple simple data elements and
480 maps to appropriate parameters
482 Args : Hash ref with the mapped key (in %MAPPING) and value
487 my ($self, $data) = @_;
488 $self->throw("Must provide data hash ref") if !$data || !ref($data);
489 for my $nm (sort keys %{$data}) {
490 next if $data->{$nm} && $data->{$nm} =~ m{^\s*$}o;
491 if ( $MAPPING{$nm} ) {
492 if ( ref( $MAPPING{$nm} ) =~ /hash/i ) {
493 my $key = ( keys %{ $MAPPING{$nm} } )[0];
494 $self->{'_values'}->{$key}->{ $MAPPING{$nm}->{$key} } =
498 $self->{'_values'}->{ $MAPPING{$nm} } = $data->{$nm};
507 Usage : $eventgenerator->characters($str)
508 Function: Send a character events
516 my ( $self, $data ) = @_;
517 return unless ( defined $data->{'Data'} && $data->{'Data'} !~ /^\s+$/o );
518 $self->{'_last_data'} = $data->{'Data'};
521 =head2 within_element
523 Title : within_element
524 Usage : if( $eventgenerator->within_element($element) ) {}
525 Function: Test if we are within a particular element
526 This is different than 'in' because within can be tested
529 Args : string element name
534 my ( $self, $name ) = @_;
537 || !defined $self->{'_elements'}
538 || scalar @
{ $self->{'_elements'} } == 0 );
539 foreach ( @
{ $self->{'_elements'} } ) {
540 return 1 if ( $_ eq $name );
548 Usage : if( $eventgenerator->in_element($element) ) {}
549 Function: Test if we are in a particular element
550 This is different than 'within' because 'in' only
551 tests its immediate parent.
553 Args : string element name
558 my ( $self, $name ) = @_;
559 return 0 if !defined $self->{'_elements'}->[0];
560 return ( $self->{'_elements'}->[0] eq $name );
563 =head2 start_document
565 Title : start_document
566 Usage : $eventgenerator->start_document
567 Function: Handle a start document event
575 $self->{'_lasttype'} = '';
576 $self->{'_values'} = {};
577 $self->{'_result'} = undef;
578 $self->{'_elements'} = [];
584 Usage : $eventgenerator->end_document
585 Function: Handles an end document event
586 Returns : Bio::Search::Result::ResultI object
593 return $self->{'_result'};
599 Usage : my $count = $searchio->result_count
600 Function: Returns the number of results we have processed
608 return $self->{'_result_count'};
614 Usage : my $descr = $parser->descriptor();
615 Function: Get/Set descriptor name. Some versions of RNAMotif do not add the
616 descriptor name to the output. This also overrides any name found
618 Returns : String (name of model)
619 Args : [optional] String (name of model)
625 return $self->{'_descriptor'} = shift if @_;
626 return $self->{'_descriptor'};
632 Usage : my $model = $parser->model();
633 Function: Get/Set model; Infernal currently does not output
634 the model name (Rfam ID)
635 Returns : String (name of model)
636 Args : [optional] String (name of model)
637 Note : this is a synonym for descriptor()
641 sub model
{ shift->descriptor(@_) }
646 Usage : my $database = $parser->database();
647 Function: Get/Set database; Infernal currently does not output
649 Returns : String (database name)
650 Args : [optional] String (database name)
656 return $self->{'_database'} = shift if @_;
657 return $self->{'_database'};
660 =head2 query_accession
662 Title : query_accession
663 Usage : my $acc = $parser->query_accession();
664 Function: Get/Set query (model) accession; RNAMotif currently does not output
666 Returns : String (accession)
667 Args : [optional] String (accession)
671 sub query_accession
{
673 return $self->{'_query_accession'} = shift if @_;
674 return $self->{'_query_accession'};
677 =head2 algorithm_version
679 Title : algorithm_version
680 Usage : my $ver = $parser->algorithm_version();
681 Function: Get/Set algorithm version (not defined in RNAMotif output)
682 Returns : String (accession)
683 Args : [optional] String (accession)
687 sub algorithm_version
{
689 return $self->{'_algorithm'} = shift if @_;
690 return $self->{'_algorithm'};
696 Usage : my $cutoff = $parser->hsp_minscore();
697 Function: Get/Set min score cutoff (for generating Hits/HSPs).
698 Returns : score (number)
699 Args : [optional] score (number)
700 Note : Cannot be set along with hsp_maxscore()
705 my ($self, $score) = shift;
706 $self->throw('Minscore not set to a number') if
707 ($score && $score !~ m{[0-9.]+});
708 return $self->{'_hsp_minscore'} = shift if @_;
709 return $self->{'_hsp_minscore'};
715 Usage : my $cutoff = $parser->hsp_maxscore();
716 Function: Get/Set max score cutoff (for generating Hits/HSPs).
717 Returns : score (number)
718 Args : [optional] score (number)
719 Note : Cannot be set along with hsp_minscore()
724 my ($self, $score) = shift;
725 $self->throw('Maxscore not set to a number') if
726 ($score && $score !~ m{[0-9.]+});
727 return $self->{'_hsp_maxscore'} = shift if @_;
728 return $self->{'_hsp_maxscore'};
731 =head2 structure_symbols
733 Title : structure_symbols
734 Usage : my $hashref = $parser->structure_symbols();
735 Function: Get/Set RNA structure symbols
736 Returns : Hash ref of delimiters (5' stem, 3' stem, single-strand, etc)
737 : default = < (5-prime)
741 Args : Hash ref of substitute delimiters, using above keys.
745 sub structure_symbols
{
746 my ($self, $delim) = @_;
748 if (ref($delim) =~ m{HASH}) {
749 my %data = %{ $delim };
750 for my $d (@VALID_SYMBOLS) {
751 if ( exists $data{$d} ) {
752 $self->{'_delimiter'}->{$d} = $data{$d};
756 $self->throw("Args to helix_delimiters() should be in a hash reference");
759 return $self->{'_delimiter'};
767 Usage : my ($rna, $meta) = $parser->_motif2meta($str, $descr);
768 Function: Creates meta string from sequence and descriptor
769 Returns : array of sequence, meta strings
770 Args : Array of string data and descriptor data
772 Note: This is currently a quick and simple way of making simple
773 RNA structures (stem-loops, helices, etc) from RNAMotif descriptor
774 data in the output file. It does not currently work with pseudoknots,
775 triplets, G-quartets, or other more complex RNA structural motifs.
780 my ($self, $str, $descriptor) = @_;
782 my @desc_el = split ' ',$descriptor;
783 my @seq_el = split ' ',$str;
784 my $symbol = $self->structure_symbols();
785 if ($#desc_el != $#seq_el) {
786 $self->throw("Descriptor elements and seq elements do not match");
790 my ($seq, $motif) = (shift @seq_el, shift @desc_el);
791 $struct = (index($motif,'h5') == 0) ?
$symbol->{'5-prime'} :
792 (index($motif,'h3') == 0) ?
$symbol->{'3-prime'} :
793 (index($motif,'ss') == 0) ?
$symbol->{'single-strand'} :
794 (index($motif,'ctx')== 0) ?
$symbol->{'single-strand'} :
795 $symbol->{'unknown'};
796 $meta .= $struct x
(length($seq));
799 return ($rna, $meta);