3 # BioPerl module for Bio::SearchIO::rnamotif
5 # Cared for by Chris Fields <cjfields-at-uiuc-dot-edu>
7 # Copyright Chris Fields
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
15 Bio::SearchIO::rnamotif - SearchIO-based RNAMotif parser
19 # do not call this module directly. Use Bio::SearchIO.
23 This is a highly experimental SearchIO-based parser for output from the rnamotif
24 program (one of the programs in the RNAMotif suite). It currently parses only
25 raw rnamotif output for RNAMotif versions 3.0 and above; older versions may work
26 but will not be supported. rmfmt output will not be supported at this time.
32 User feedback is an integral part of the evolution of this and other
33 Bioperl modules. Send your comments and suggestions preferably to
34 the Bioperl mailing list. Your participation is much appreciated.
36 bioperl-l@bioperl.org - General discussion
37 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
41 Report bugs to the Bioperl bug tracking system to help us keep track
42 of the bugs and their resolution. Bug reports can be submitted via the
45 http://bugzilla.open-bio.org/
47 =head1 AUTHOR - Chris Fields
49 Email cjfields-at-uiuc-dot-edu
53 The rest of the documentation details each of the object methods.
54 Internal methods are usually preceded with a _
58 # Let the code begin...
60 package Bio
::SearchIO
::rnamotif
;
63 use base
qw(Bio::SearchIO);
72 # commented out tags have not been assigned
74 'Hsp_score' => 'HSP-score',
75 'Hsp_custom-data' => 'HSP-custom_score',
77 # rnamotif has no evalue
79 # descriptor has no start, end; same as hit start, end
80 'Hsp_query-from' => 'HSP-query_start',
81 'Hsp_query-to' => 'HSP-query_end',
82 'Hsp_hit-from' => 'HSP-hit_start',
83 'Hsp_hit-to' => 'HSP-hit_end',
85 # descriptor has no start, end
87 'Hsp_hseq' => 'HSP-hit_seq',
88 'Hsp_align-len' => 'HSP-hsp_length',
90 # build this from scratch, simple WUSS-format
91 'Hsp_structure' => 'HSP-meta',
92 'Hsp_stranded' => 'HSP-stranded',
94 # not supported for RNAMotif
96 'Hit_id' => 'HIT-name',
97 'Hit_accession' => 'HIT-accession',
98 'Hit_gi' => 'HIT-ncbi_gi',
99 'Hit_def' => 'HIT-description',
100 'Hit_score' => 'HIT-score', # best HSP score
102 'RNAMotif_program' => 'RESULT-algorithm_name', # get/set
103 'RNAMotif_version' => 'RESULT-algorithm_version', # get/set
104 'RNAMotif_query-def'=> 'RESULT-query_name', # get/set
105 # No length (query is a descriptor)
106 'RNAMotif_query-acc'=> 'RESULT-query_accession', # get/set
107 'RNAMotif_querydesc'=> 'RESULT-query_description', # get/set
108 'RNAMotif_db' => 'RESULT-database_name', # get/set
111 # use structure_delimiters to set custom delimiters
113 my @VALID_SYMBOLS = qw(5-prime 3-prime single-strand unknown);
114 my %STRUCTURE_SYMBOLS = (
117 'single-strand' => '.',
119 # may add more for quartets, triplets
122 my $DEFAULT_VERSION = '3.0.3';
127 Usage : my $obj = Bio::SearchIO->new();
128 Function: Builds a new Bio::SearchIO::rnamotif object
129 Returns : Bio::SearchIO::rnamotif parser
130 Args : -fh/-file => RNAMotif filename
131 -format => 'rnamotif'
132 -model => query model (or descriptor, in this case)
133 -database => database name (default undef)
134 -query_acc => query accession (default undef)
135 -hsp_minscore => minimum HSP score cutoff
136 -hsp_maxscore => maximum HSP score cutoff
137 -symbols => hash ref of structure symbols to use
138 (default symbols in %STRUCTURE_SYMBOLS hash)
143 my ( $self, @args ) = @_;
144 $self->SUPER::_initialize
(@args);
145 my ($version, $model, $database, $maxcutoff, $mincutoff, $seqdistance,
146 $accession, $symbols) =
147 $self->_rearrange([qw(VERSION MODEL DATABASE HSP_MAXSCORE
148 HSP_MINSCORE SEQ_DISTANCE QUERY_ACC SYMBOLS)],@args);
149 my $handler = $self->_eventHandler;
150 $handler->register_factory(
152 Bio
::Factory
::ObjectFactory
->new(
153 -type
=> 'Bio::Search::Result::GenericResult',
154 -interface
=> 'Bio::Search::Result::ResultI',
155 -verbose
=> $self->verbose
159 $handler->register_factory(
161 Bio
::Factory
::ObjectFactory
->new(
162 -type
=> 'Bio::Search::Hit::ModelHit',
163 -interface
=> 'Bio::Search::Hit::HitI',
164 -verbose
=> $self->verbose
168 $handler->register_factory(
170 Bio
::Factory
::ObjectFactory
->new(
171 -type
=> 'Bio::Search::HSP::ModelHSP',
172 -interface
=> 'Bio::Search::HSP::HSPI',
173 -verbose
=> $self->verbose
176 $model && $self->model($model);
177 $database && $self->database($database);
178 $accession && $self->query_accession($accession);
179 $version ||= $DEFAULT_VERSION;
180 $self->algorithm_version($version);
181 $self->throw("Cannot define both a minimal and maximal cutoff")
182 if (defined($mincutoff) && defined($maxcutoff));
183 defined($mincutoff) && $self->hsp_minscore($mincutoff);
184 defined($maxcutoff) && $self->hsp_maxscore($maxcutoff);
185 $symbols ||= \
%STRUCTURE_SYMBOLS;
186 $self->structure_symbols($symbols);
192 Usage : my $hit = $searchio->next_result;
193 Function: Returns the next Result from a search
194 Returns : Bio::Search::Result::ResultI object
204 my ($rm, $d, $descriptor, $file, $oktobuild);
205 my ($hitid, $hitdesc, $hspid, $lastid, $lastscore);
208 # user-determined Result data
209 my ($accession, $db, $model) =
210 ($self->query_accession, $self->database, $self->model);
211 # HSP building options
212 my $hsp_min = $self->hsp_minscore;
213 my $hsp_max = $self->hsp_maxscore;
214 my $version = $self->algorithm_version;
217 my $verbose = $self->verbose; # cache for speed?
218 $self->start_document();
220 while ( defined( my $line = $self->_readline ) ) {
222 next if $line =~ m{^\s+$};
223 if (index($line,'#RM') == 0) {
224 if (index($line,'#RM scored') == 0 ) {
226 $self->_pushback($line);
229 $self->start_element({'Name' => 'Result'});
230 $self->element_hash({
231 'RNAMotif_program' => 'rnamotif',
232 'RNAMotif_version' => $version,
233 'RNAMotif_query-acc' => $accession,
237 #$self->debug("Start result\n");
238 } elsif (index($line,'#RM descr') == 0) {
239 ($rm, $d, $descriptor) = split ' ', $line, 3;
240 # toss $rm, $d; keep $descr
242 $self->{'_descriptor'} = $descriptor;
244 {'Name' => 'RNAMotif_querydesc',
245 'Data' => $descriptor}
247 } elsif(index($line,'#RM dfile') == 0) {
248 ($rm, $d, $file) = split ' ', $line, 3;
249 # toss $rm, $d; keep $file
252 {'Name' => 'RNAMotif_query-def',
256 $self->debug("Unrecognized line: $line");
258 } elsif ($line =~ s{^>}{}) {
260 ($hitid, $hitdesc) = split ' ',$line,2;
262 if ($self->within_element('hit') && ($hitid ne $lastid)) {
264 {'Name' => 'Hit_score',
265 'Data' => $lastscore}
267 $self->end_element({'Name' => 'Hit'});
268 $self->start_element({'Name' => 'Hit'});
269 } elsif (!$self->within_element('hit')) {
270 $self->start_element({'Name' => 'Hit'});
272 my ($gi, $acc, $ver) = $self->_get_seq_identifiers($hitid);
274 $self->element_hash({
277 'Hit_accession' => $ver ?
"$acc.$ver" :
278 $acc ?
$acc : $hitid,
279 'Hit_def' => $hitdesc}
282 } elsif ($line =~ m{^(\S+)\s+(.+?)\s+(\d+)\s+(\d+)\s+(\d+)\s(.*)$}xm) {
285 my ($score, $strand, $start, $length , $seq) = ($2, $3, $4, $5, $6);
286 $score *= 1; # implicitly cast any odd '0.000' to float
288 unless ($hitid eq $hspid) {
289 $self->throw("IDs do not match!");
291 # check score for possible sprintf data, mark as such, cache result
292 if (!defined($sprintf)) {
293 if ($score =~ m{[^0-9.-]+}gxms) {
294 if (defined $hsp_min || defined $hsp_max ) {
295 $self->warn("HSP data likely contains custom score; ".
296 "ignoring min/maxscore");
298 $sprintf = $oktobuild = 1;
305 if (($hsp_min && $score <= $hsp_min)
306 || ($hsp_max && ($score >= $hsp_max)) ) {
312 # store best hit score based on the hsp min/maxscore only
313 if (defined $hsp_min && $score > $hsp_min) {
314 $lastscore = $score if !$lastscore || $score > $lastscore;
315 } elsif (defined $hsp_max && $score < $hsp_max) {
316 $lastscore = $score if !$lastscore || $score < $lastscore;
324 # calculate start/end
326 $end = $start + $length -1;
328 $end = $start - $length + 1;
331 my ($rna, $meta) = $self->_motif2meta($seq, $descriptor);
333 $self->start_element({'Name' => 'Hsp'});
334 my $rnalen = $rna =~ tr{ATGCatgc}{ATGCatgc};
335 $self->element_hash({
336 'Hsp_stranded' => 'HIT',
338 'Hsp_query-from' => 1,
339 'Hsp_query-to' =>length($rna),
340 'Hsp_hit-from' => $start,
341 'Hsp_hit-to' => $end,
342 'Hsp_structure' => $meta,
343 'Hsp_align-len' => length($rna),
344 'Hsp_score' => $sprintf ?
undef : $score,
345 'Hsp_custom-data' => $sprintf ?
$score : undef,
347 $self->end_element({'Name' => 'Hsp'});
348 $oktobuild = 0 if (!$sprintf);
352 if ($self->within_element('hit')) {
354 {'Name' => 'Hit_score',
355 'Data' => $lastscore}
357 $self->end_element( { 'Name' => 'Hit' } );
360 $self->end_element( { 'Name' => 'Result' } );
362 return $self->end_document();
367 Title : start_element
368 Usage : $eventgenerator->start_element
369 Function: Handles a start element event
371 Args : hashref with at least 2 keys 'Data' and 'Name'
377 my ( $self, $data ) = @_;
379 # we currently don't care about attributes
380 my $nm = $data->{'Name'};
381 my $type = $MODEMAP{$nm};
383 if ( $self->_eventHandler->will_handle($type) ) {
384 my $func = sprintf( "start_%s", lc $type );
385 $self->_eventHandler->$func( $data->{'Attributes'} );
387 unshift @
{ $self->{'_elements'} }, $type;
390 && $type eq 'result' )
392 $self->{'_values'} = {};
393 $self->{'_result'} = undef;
399 Title : start_element
400 Usage : $eventgenerator->end_element
401 Function: Handles an end element event
403 Args : hashref with at least 2 keys, 'Data' and 'Name'
409 my ( $self, $data ) = @_;
410 my $nm = $data->{'Name'};
411 my $type = $MODEMAP{$nm};
415 if ( $self->_eventHandler->will_handle($type) ) {
416 my $func = sprintf( "end_%s", lc $type );
417 $rc = $self->_eventHandler->$func( $self->{'_reporttype'},
418 $self->{'_values'} );
420 my $lastelem = shift @
{ $self->{'_elements'} };
422 elsif ( $MAPPING{$nm} ) {
423 if ( ref( $MAPPING{$nm} ) =~ /hash/i ) {
424 my $key = ( keys %{ $MAPPING{$nm} } )[0];
425 $self->{'_values'}->{$key}->{ $MAPPING{$nm}->{$key} } =
426 $self->{'_last_data'};
429 $self->{'_values'}->{ $MAPPING{$nm} } = $self->{'_last_data'};
433 $self->debug("unknown nm $nm, ignoring\n");
435 $self->{'_last_data'} = ''; # remove read data if we are at
437 $self->{'_result'} = $rc if ( defined $type && $type eq 'result' );
444 Usage : $eventhandler->element({'Name' => $name, 'Data' => $str});
445 Function: Convenience method that calls start_element, characters, end_element
447 Args : Hash ref with the keys 'Name' and 'Data'
453 my ( $self, $data ) = @_;
454 # simple data calls (%MAPPING) do not need start_element
455 $self->characters($data);
456 $self->end_element($data);
463 Usage : $eventhandler->element_hash({'Hsp_hit-from' => $start,
464 'Hsp_hit-to' => $end,
465 'Hsp_score' => $lastscore});
466 Function: Convenience method that takes multiple simple data elements and
467 maps to appropriate parameters
469 Args : Hash ref with the mapped key (in %MAPPING) and value
474 my ($self, $data) = @_;
475 $self->throw("Must provide data hash ref") if !$data || !ref($data);
476 for my $nm (sort keys %{$data}) {
477 next if $data->{$nm} && $data->{$nm} =~ m{^\s*$}o;
478 if ( $MAPPING{$nm} ) {
479 if ( ref( $MAPPING{$nm} ) =~ /hash/i ) {
480 my $key = ( keys %{ $MAPPING{$nm} } )[0];
481 $self->{'_values'}->{$key}->{ $MAPPING{$nm}->{$key} } =
485 $self->{'_values'}->{ $MAPPING{$nm} } = $data->{$nm};
494 Usage : $eventgenerator->characters($str)
495 Function: Send a character events
503 my ( $self, $data ) = @_;
504 return unless ( defined $data->{'Data'} && $data->{'Data'} !~ /^\s+$/o );
505 $self->{'_last_data'} = $data->{'Data'};
508 =head2 within_element
510 Title : within_element
511 Usage : if( $eventgenerator->within_element($element) ) {}
512 Function: Test if we are within a particular element
513 This is different than 'in' because within can be tested
516 Args : string element name
521 my ( $self, $name ) = @_;
524 || !defined $self->{'_elements'}
525 || scalar @
{ $self->{'_elements'} } == 0 );
526 foreach ( @
{ $self->{'_elements'} } ) {
527 return 1 if ( $_ eq $name );
535 Usage : if( $eventgenerator->in_element($element) ) {}
536 Function: Test if we are in a particular element
537 This is different than 'within' because 'in' only
538 tests its immediate parent.
540 Args : string element name
545 my ( $self, $name ) = @_;
546 return 0 if !defined $self->{'_elements'}->[0];
547 return ( $self->{'_elements'}->[0] eq $name );
550 =head2 start_document
552 Title : start_document
553 Usage : $eventgenerator->start_document
554 Function: Handle a start document event
562 $self->{'_lasttype'} = '';
563 $self->{'_values'} = {};
564 $self->{'_result'} = undef;
565 $self->{'_elements'} = [];
571 Usage : $eventgenerator->end_document
572 Function: Handles an end document event
573 Returns : Bio::Search::Result::ResultI object
580 return $self->{'_result'};
586 Usage : my $count = $searchio->result_count
587 Function: Returns the number of results we have processed
595 return $self->{'_result_count'};
601 Usage : my $descr = $parser->descriptor();
602 Function: Get/Set descriptor name. Some versions of RNAMotif do not add the
603 descriptor name to the output. This also overrides any name found
605 Returns : String (name of model)
606 Args : [optional] String (name of model)
612 return $self->{'_descriptor'} = shift if @_;
613 return $self->{'_descriptor'};
619 Usage : my $model = $parser->model();
620 Function: Get/Set model; Infernal currently does not output
621 the model name (Rfam ID)
622 Returns : String (name of model)
623 Args : [optional] String (name of model)
624 Note : this is a synonym for descriptor()
628 sub model
{ shift->descriptor(@_) }
633 Usage : my $database = $parser->database();
634 Function: Get/Set database; Infernal currently does not output
636 Returns : String (database name)
637 Args : [optional] String (database name)
643 return $self->{'_database'} = shift if @_;
644 return $self->{'_database'};
647 =head2 query_accession
649 Title : query_accession
650 Usage : my $acc = $parser->query_accession();
651 Function: Get/Set query (model) accession; RNAMotif currently does not output
653 Returns : String (accession)
654 Args : [optional] String (accession)
658 sub query_accession
{
660 return $self->{'_query_accession'} = shift if @_;
661 return $self->{'_query_accession'};
664 =head2 algorithm_version
666 Title : algorithm_version
667 Usage : my $ver = $parser->algorithm_version();
668 Function: Get/Set algorithm version (not defined in RNAMotif output)
669 Returns : String (accession)
670 Args : [optional] String (accession)
674 sub algorithm_version
{
676 return $self->{'_algorithm'} = shift if @_;
677 return $self->{'_algorithm'};
683 Usage : my $cutoff = $parser->hsp_minscore();
684 Function: Get/Set min score cutoff (for generating Hits/HSPs).
685 Returns : score (number)
686 Args : [optional] score (number)
687 Note : Cannot be set along with hsp_maxscore()
692 my ($self, $score) = shift;
693 $self->throw('Minscore not set to a number') if
694 ($score && $score !~ m{[0-9.]+});
695 return $self->{'_hsp_minscore'} = shift if @_;
696 return $self->{'_hsp_minscore'};
702 Usage : my $cutoff = $parser->hsp_maxscore();
703 Function: Get/Set max score cutoff (for generating Hits/HSPs).
704 Returns : score (number)
705 Args : [optional] score (number)
706 Note : Cannot be set along with hsp_minscore()
711 my ($self, $score) = shift;
712 $self->throw('Maxscore not set to a number') if
713 ($score && $score !~ m{[0-9.]+});
714 return $self->{'_hsp_maxscore'} = shift if @_;
715 return $self->{'_hsp_maxscore'};
718 =head2 structure_symbols
720 Title : structure_symbols
721 Usage : my $hashref = $parser->structure_symbols();
722 Function: Get/Set RNA structure symbols
723 Returns : Hash ref of delimiters (5' stem, 3' stem, single-strand, etc)
724 : default = < (5-prime)
728 Args : Hash ref of substitute delimiters, using above keys.
732 sub structure_symbols
{
733 my ($self, $delim) = @_;
735 if (ref($delim) =~ m{HASH}) {
736 my %data = %{ $delim };
737 for my $d (@VALID_SYMBOLS) {
738 if ( exists $data{$d} ) {
739 $self->{'_delimiter'}->{$d} = $data{$d};
743 $self->throw("Args to helix_delimiters() should be in a hash reference");
746 return $self->{'_delimiter'};
754 Usage : my ($rna, $meta) = $parser->_motif2meta($str, $descr);
755 Function: Creates meta string from sequence and descriptor
756 Returns : array of sequence, meta strings
757 Args : Array of string data and descriptor data
759 Note: This is currently a quick and simple way of making simple
760 RNA structures (stem-loops, helices, etc) from RNAMotif descriptor
761 data in the output file. It does not currently work with pseudoknots,
762 triplets, G-quartets, or other more complex RNA structural motifs.
767 my ($self, $str, $descriptor) = @_;
769 my @desc_el = split ' ',$descriptor;
770 my @seq_el = split ' ',$str;
771 my $symbol = $self->structure_symbols();
772 if ($#desc_el != $#seq_el) {
773 $self->throw("Descriptor elements and seq elements do not match");
777 my ($seq, $motif) = (shift @seq_el, shift @desc_el);
778 $struct = (index($motif,'h5') == 0) ?
$symbol->{'5-prime'} :
779 (index($motif,'h3') == 0) ?
$symbol->{'3-prime'} :
780 (index($motif,'ss') == 0) ?
$symbol->{'single-strand'} :
781 (index($motif,'ctx')== 0) ?
$symbol->{'single-strand'} :
782 $symbol->{'unknown'};
783 $meta .= $struct x
(length($seq));
786 return ($rna, $meta);