3 # BioPerl module for Bio::SearchIO::rnamotif
5 # Cared for by Chris Fields <cjfields-at-uiuc-dot-edu>
7 # Copyright Chris Fields
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
15 Bio::SearchIO::rnamotif - SearchIO-based RNAMotif parser
21 This is a highly experimental SearchIO-based parser for output from the rnamotif
22 program (one of the programs in the RNAMotif suite). It currently parses only
23 raw rnamotif output for RNAMotif versions 3.0 and above; older versions may work
24 but will not be supported. rmfmt output will not be supported at this time.
30 User feedback is an integral part of the evolution of this and other
31 Bioperl modules. Send your comments and suggestions preferably to
32 the Bioperl mailing list. Your participation is much appreciated.
34 bioperl-l@bioperl.org - General discussion
35 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
39 Report bugs to the Bioperl bug tracking system to help us keep track
40 of the bugs and their resolution. Bug reports can be submitted via the
43 http://bugzilla.open-bio.org/
45 =head1 AUTHOR - Chris Fields
47 Email cjfields-at-uiuc-dot-edu
51 The rest of the documentation details each of the object methods.
52 Internal methods are usually preceded with a _
56 # Let the code begin...
58 package Bio
::SearchIO
::rnamotif
;
61 use base
qw(Bio::SearchIO);
70 # commented out tags have not been assigned
72 'Hsp_score' => 'HSP-score',
73 'Hsp_custom-data' => 'HSP-custom_score',
75 # rnamotif has no evalue
77 # descriptor has no start, end; same as hit start, end
78 'Hsp_query-from' => 'HSP-query_start',
79 'Hsp_query-to' => 'HSP-query_end',
80 'Hsp_hit-from' => 'HSP-hit_start',
81 'Hsp_hit-to' => 'HSP-hit_end',
83 # descriptor has no start, end
85 'Hsp_hseq' => 'HSP-hit_seq',
86 'Hsp_align-len' => 'HSP-hsp_length',
88 # build this from scratch, simple WUSS-format
89 'Hsp_structure' => 'HSP-meta',
90 'Hsp_stranded' => 'HSP-stranded',
92 # not supported for RNAMotif
94 'Hit_id' => 'HIT-name',
95 'Hit_accession' => 'HIT-accession',
96 'Hit_gi' => 'HIT-ncbi_gi',
97 'Hit_def' => 'HIT-description',
98 'Hit_score' => 'HIT-score', # best HSP score
100 'RNAMotif_program' => 'RESULT-algorithm_name', # get/set
101 'RNAMotif_version' => 'RESULT-algorithm_version', # get/set
102 'RNAMotif_query-def'=> 'RESULT-query_name', # get/set
103 # No length (query is a descriptor)
104 'RNAMotif_query-acc'=> 'RESULT-query_accession', # get/set
105 'RNAMotif_querydesc'=> 'RESULT-query_description', # get/set
106 'RNAMotif_db' => 'RESULT-database_name', # get/set
109 # use structure_delimiters to set custom delimiters
111 my @VALID_SYMBOLS = qw(5-prime 3-prime single-strand unknown);
112 my %STRUCTURE_SYMBOLS = (
115 'single-strand' => '.',
117 # may add more for quartets, triplets
120 my $DEFAULT_VERSION = '3.0.3';
125 Usage : my $obj = Bio::SearchIO->new();
126 Function: Builds a new Bio::SearchIO::rnamotif object
127 Returns : Bio::SearchIO::rnamotif parser
128 Args : -fh/-file => RNAMotif filename
129 -format => 'rnamotif'
130 -model => query model (or descriptor, in this case)
131 -database => database name (default undef)
132 -query_acc => query accession (default undef)
133 -hsp_minscore => minimum HSP score cutoff
134 -hsp_maxscore => maximum HSP score cutoff
135 -symbols => hash ref of structure symbols to use
136 (default symbols in %STRUCTURE_SYMBOLS hash)
141 my ( $self, @args ) = @_;
142 $self->SUPER::_initialize
(@args);
143 my ($version, $model, $database, $maxcutoff, $mincutoff, $seqdistance,
144 $accession, $symbols) =
145 $self->_rearrange([qw(VERSION MODEL DATABASE HSP_MAXSCORE
146 HSP_MINSCORE SEQ_DISTANCE QUERY_ACC SYMBOLS)],@args);
147 my $handler = $self->_eventHandler;
148 $handler->register_factory(
150 Bio
::Factory
::ObjectFactory
->new(
151 -type
=> 'Bio::Search::Result::GenericResult',
152 -interface
=> 'Bio::Search::Result::ResultI',
153 -verbose
=> $self->verbose
157 $handler->register_factory(
159 Bio
::Factory
::ObjectFactory
->new(
160 -type
=> 'Bio::Search::Hit::ModelHit',
161 -interface
=> 'Bio::Search::Hit::HitI',
162 -verbose
=> $self->verbose
166 $handler->register_factory(
168 Bio
::Factory
::ObjectFactory
->new(
169 -type
=> 'Bio::Search::HSP::ModelHSP',
170 -interface
=> 'Bio::Search::HSP::HSPI',
171 -verbose
=> $self->verbose
174 $model && $self->model($model);
175 $database && $self->database($database);
176 $accession && $self->query_accession($accession);
177 $version ||= $DEFAULT_VERSION;
178 $self->algorithm_version($version);
179 $self->throw("Cannot define both a minimal and maximal cutoff")
180 if (defined($mincutoff) && defined($maxcutoff));
181 defined($mincutoff) && $self->hsp_minscore($mincutoff);
182 defined($maxcutoff) && $self->hsp_maxscore($maxcutoff);
183 $symbols ||= \
%STRUCTURE_SYMBOLS;
184 $self->structure_symbols($symbols);
190 Usage : my $hit = $searchio->next_result;
191 Function: Returns the next Result from a search
192 Returns : Bio::Search::Result::ResultI object
202 my ($rm, $d, $descriptor, $file, $oktobuild);
203 my ($hitid, $hitdesc, $hspid, $lastid, $lastscore);
206 # user-determined Result data
207 my ($accession, $db, $model) =
208 ($self->query_accession, $self->database, $self->model);
209 # HSP building options
210 my $hsp_min = $self->hsp_minscore;
211 my $hsp_max = $self->hsp_maxscore;
212 my $version = $self->algorithm_version;
215 my $verbose = $self->verbose; # cache for speed?
216 $self->start_document();
218 while ( defined( my $line = $self->_readline ) ) {
220 next if $line =~ m{^\s+$};
221 if (index($line,'#RM') == 0) {
222 if (index($line,'#RM scored') == 0 ) {
224 $self->_pushback($line);
227 $self->start_element({'Name' => 'Result'});
228 $self->element_hash({
229 'RNAMotif_program' => 'rnamotif',
230 'RNAMotif_version' => $version,
231 'RNAMotif_query-acc' => $accession,
235 #$self->debug("Start result\n");
236 } elsif (index($line,'#RM descr') == 0) {
237 ($rm, $d, $descriptor) = split ' ', $line, 3;
238 # toss $rm, $d; keep $descr
240 $self->{'_descriptor'} = $descriptor;
242 {'Name' => 'RNAMotif_querydesc',
243 'Data' => $descriptor}
245 } elsif(index($line,'#RM dfile') == 0) {
246 ($rm, $d, $file) = split ' ', $line, 3;
247 # toss $rm, $d; keep $file
250 {'Name' => 'RNAMotif_query-def',
254 $self->debug("Unrecognized line: $line");
256 } elsif ($line =~ s{^>}{}) {
258 ($hitid, $hitdesc) = split ' ',$line,2;
260 if ($self->within_element('hit') && ($hitid ne $lastid)) {
262 {'Name' => 'Hit_score',
263 'Data' => $lastscore}
265 $self->end_element({'Name' => 'Hit'});
266 $self->start_element({'Name' => 'Hit'});
267 } elsif (!$self->within_element('hit')) {
268 $self->start_element({'Name' => 'Hit'});
270 my ($gi, $acc, $ver) = $self->_get_seq_identifiers($hitid);
272 $self->element_hash({
275 'Hit_accession' => $ver ?
"$acc.$ver" :
276 $acc ?
$acc : $hitid,
277 'Hit_def' => $hitdesc}
280 } elsif ($line =~ m{^(\S+)\s+(.+?)\s+(\d+)\s+(\d+)\s+(\d+)\s(.*)$}xm) {
283 my ($score, $strand, $start, $length , $seq) = ($2, $3, $4, $5, $6);
284 $score *= 1; # implicitly cast any odd '0.000' to float
286 unless ($hitid eq $hspid) {
287 $self->throw("IDs do not match!");
289 # check score for possible sprintf data, mark as such, cache result
290 if (!defined($sprintf)) {
291 if ($score =~ m{[^0-9.-]+}gxms) {
292 if (defined $hsp_min || defined $hsp_max ) {
293 $self->warn("HSP data likely contains custom score; ".
294 "ignoring min/maxscore");
296 $sprintf = $oktobuild = 1;
303 if (($hsp_min && $score <= $hsp_min)
304 || ($hsp_max && ($score >= $hsp_max)) ) {
310 # store best hit score based on the hsp min/maxscore only
311 if (defined $hsp_min && $score > $hsp_min) {
312 $lastscore = $score if !$lastscore || $score > $lastscore;
313 } elsif (defined $hsp_max && $score < $hsp_max) {
314 $lastscore = $score if !$lastscore || $score < $lastscore;
322 # calculate start/end
324 $end = $start + $length -1;
326 $end = $start - $length + 1;
329 my ($rna, $meta) = $self->_motif2meta($seq, $descriptor);
331 $self->start_element({'Name' => 'Hsp'});
332 my $rnalen = $rna =~ tr{ATGCatgc}{ATGCatgc};
333 $self->element_hash({
334 'Hsp_stranded' => 'HIT',
336 'Hsp_query-from' => 1,
337 'Hsp_query-to' =>length($rna),
338 'Hsp_hit-from' => $start,
339 'Hsp_hit-to' => $end,
340 'Hsp_structure' => $meta,
341 'Hsp_align-len' => length($rna),
342 'Hsp_score' => $sprintf ?
undef : $score,
343 'Hsp_custom-data' => $sprintf ?
$score : undef,
345 $self->end_element({'Name' => 'Hsp'});
346 $oktobuild = 0 if (!$sprintf);
350 if ($self->within_element('hit')) {
352 {'Name' => 'Hit_score',
353 'Data' => $lastscore}
355 $self->end_element( { 'Name' => 'Hit' } );
358 $self->end_element( { 'Name' => 'Result' } );
360 return $self->end_document();
365 Title : start_element
366 Usage : $eventgenerator->start_element
367 Function: Handles a start element event
369 Args : hashref with at least 2 keys 'Data' and 'Name'
375 my ( $self, $data ) = @_;
377 # we currently don't care about attributes
378 my $nm = $data->{'Name'};
379 my $type = $MODEMAP{$nm};
381 if ( $self->_eventHandler->will_handle($type) ) {
382 my $func = sprintf( "start_%s", lc $type );
383 $self->_eventHandler->$func( $data->{'Attributes'} );
385 unshift @
{ $self->{'_elements'} }, $type;
388 && $type eq 'result' )
390 $self->{'_values'} = {};
391 $self->{'_result'} = undef;
397 Title : start_element
398 Usage : $eventgenerator->end_element
399 Function: Handles an end element event
401 Args : hashref with at least 2 keys, 'Data' and 'Name'
407 my ( $self, $data ) = @_;
408 my $nm = $data->{'Name'};
409 my $type = $MODEMAP{$nm};
413 if ( $self->_eventHandler->will_handle($type) ) {
414 my $func = sprintf( "end_%s", lc $type );
415 $rc = $self->_eventHandler->$func( $self->{'_reporttype'},
416 $self->{'_values'} );
418 my $lastelem = shift @
{ $self->{'_elements'} };
420 elsif ( $MAPPING{$nm} ) {
421 if ( ref( $MAPPING{$nm} ) =~ /hash/i ) {
422 my $key = ( keys %{ $MAPPING{$nm} } )[0];
423 $self->{'_values'}->{$key}->{ $MAPPING{$nm}->{$key} } =
424 $self->{'_last_data'};
427 $self->{'_values'}->{ $MAPPING{$nm} } = $self->{'_last_data'};
431 $self->debug("unknown nm $nm, ignoring\n");
433 $self->{'_last_data'} = ''; # remove read data if we are at
435 $self->{'_result'} = $rc if ( defined $type && $type eq 'result' );
442 Usage : $eventhandler->element({'Name' => $name, 'Data' => $str});
443 Function: Convenience method that calls start_element, characters, end_element
445 Args : Hash ref with the keys 'Name' and 'Data'
451 my ( $self, $data ) = @_;
452 # simple data calls (%MAPPING) do not need start_element
453 $self->characters($data);
454 $self->end_element($data);
461 Usage : $eventhandler->element_hash({'Hsp_hit-from' => $start,
462 'Hsp_hit-to' => $end,
463 'Hsp_score' => $lastscore});
464 Function: Convenience method that takes multiple simple data elements and
465 maps to appropriate parameters
467 Args : Hash ref with the mapped key (in %MAPPING) and value
472 my ($self, $data) = @_;
473 $self->throw("Must provide data hash ref") if !$data || !ref($data);
474 for my $nm (sort keys %{$data}) {
475 next if $data->{$nm} && $data->{$nm} =~ m{^\s*$}o;
476 if ( $MAPPING{$nm} ) {
477 if ( ref( $MAPPING{$nm} ) =~ /hash/i ) {
478 my $key = ( keys %{ $MAPPING{$nm} } )[0];
479 $self->{'_values'}->{$key}->{ $MAPPING{$nm}->{$key} } =
483 $self->{'_values'}->{ $MAPPING{$nm} } = $data->{$nm};
492 Usage : $eventgenerator->characters($str)
493 Function: Send a character events
501 my ( $self, $data ) = @_;
502 return unless ( defined $data->{'Data'} && $data->{'Data'} !~ /^\s+$/o );
503 $self->{'_last_data'} = $data->{'Data'};
506 =head2 within_element
508 Title : within_element
509 Usage : if( $eventgenerator->within_element($element) ) {}
510 Function: Test if we are within a particular element
511 This is different than 'in' because within can be tested
514 Args : string element name
519 my ( $self, $name ) = @_;
522 || !defined $self->{'_elements'}
523 || scalar @
{ $self->{'_elements'} } == 0 );
524 foreach ( @
{ $self->{'_elements'} } ) {
525 return 1 if ( $_ eq $name );
533 Usage : if( $eventgenerator->in_element($element) ) {}
534 Function: Test if we are in a particular element
535 This is different than 'within' because 'in' only
536 tests its immediate parent.
538 Args : string element name
543 my ( $self, $name ) = @_;
544 return 0 if !defined $self->{'_elements'}->[0];
545 return ( $self->{'_elements'}->[0] eq $name );
548 =head2 start_document
550 Title : start_document
551 Usage : $eventgenerator->start_document
552 Function: Handle a start document event
560 $self->{'_lasttype'} = '';
561 $self->{'_values'} = {};
562 $self->{'_result'} = undef;
563 $self->{'_elements'} = [];
569 Usage : $eventgenerator->end_document
570 Function: Handles an end document event
571 Returns : Bio::Search::Result::ResultI object
578 return $self->{'_result'};
584 Usage : my $count = $searchio->result_count
585 Function: Returns the number of results we have processed
593 return $self->{'_result_count'};
599 Usage : my $descr = $parser->descriptor();
600 Function: Get/Set descriptor name. Some versions of RNAMotif do not add the
601 descriptor name to the output. This also overrides any name found
603 Returns : String (name of model)
604 Args : [optional] String (name of model)
610 return $self->{'_descriptor'} = shift if @_;
611 return $self->{'_descriptor'};
617 Usage : my $model = $parser->model();
618 Function: Get/Set model; Infernal currently does not output
619 the model name (Rfam ID)
620 Returns : String (name of model)
621 Args : [optional] String (name of model)
622 Note : this is a synonym for descriptor()
626 sub model
{ shift->descriptor(@_) }
631 Usage : my $database = $parser->database();
632 Function: Get/Set database; Infernal currently does not output
634 Returns : String (database name)
635 Args : [optional] String (database name)
641 return $self->{'_database'} = shift if @_;
642 return $self->{'_database'};
645 =head2 query_accession
647 Title : query_accession
648 Usage : my $acc = $parser->query_accession();
649 Function: Get/Set query (model) accession; RNAMotif currently does not output
651 Returns : String (accession)
652 Args : [optional] String (accession)
656 sub query_accession
{
658 return $self->{'_query_accession'} = shift if @_;
659 return $self->{'_query_accession'};
662 =head2 algorithm_version
664 Title : algorithm_version
665 Usage : my $ver = $parser->algorithm_version();
666 Function: Get/Set algorithm version (not defined in RNAMotif output)
667 Returns : String (accession)
668 Args : [optional] String (accession)
672 sub algorithm_version
{
674 return $self->{'_algorithm'} = shift if @_;
675 return $self->{'_algorithm'};
681 Usage : my $cutoff = $parser->hsp_minscore();
682 Function: Get/Set min score cutoff (for generating Hits/HSPs).
683 Returns : score (number)
684 Args : [optional] score (number)
685 Note : Cannot be set along with hsp_maxscore()
690 my ($self, $score) = shift;
691 $self->throw('Minscore not set to a number') if
692 ($score && $score !~ m{[0-9.]+});
693 return $self->{'_hsp_minscore'} = shift if @_;
694 return $self->{'_hsp_minscore'};
700 Usage : my $cutoff = $parser->hsp_maxscore();
701 Function: Get/Set max score cutoff (for generating Hits/HSPs).
702 Returns : score (number)
703 Args : [optional] score (number)
704 Note : Cannot be set along with hsp_minscore()
709 my ($self, $score) = shift;
710 $self->throw('Maxscore not set to a number') if
711 ($score && $score !~ m{[0-9.]+});
712 return $self->{'_hsp_maxscore'} = shift if @_;
713 return $self->{'_hsp_maxscore'};
716 =head2 structure_symbols
718 Title : structure_symbols
719 Usage : my $hashref = $parser->structure_symbols();
720 Function: Get/Set RNA structure symbols
721 Returns : Hash ref of delimiters (5' stem, 3' stem, single-strand, etc)
722 : default = < (5-prime)
726 Args : Hash ref of substitute delimiters, using above keys.
730 sub structure_symbols
{
731 my ($self, $delim) = @_;
733 if (ref($delim) =~ m{HASH}) {
734 my %data = %{ $delim };
735 for my $d (@VALID_SYMBOLS) {
736 if ( exists $data{$d} ) {
737 $self->{'_delimiter'}->{$d} = $data{$d};
741 $self->throw("Args to helix_delimiters() should be in a hash reference");
744 return $self->{'_delimiter'};
752 Usage : my ($rna, $meta) = $parser->_motif2meta($str, $descr);
753 Function: Creates meta string from sequence and descriptor
754 Returns : array of sequence, meta strings
755 Args : Array of string data and descriptor data
757 Note: This is currently a quick and simple way of making simple
758 RNA structures (stem-loops, helices, etc) from RNAMotif descriptor
759 data in the output file. It does not currently work with pseudoknots,
760 triplets, G-quartets, or other more complex RNA structural motifs.
765 my ($self, $str, $descriptor) = @_;
767 my @desc_el = split ' ',$descriptor;
768 my @seq_el = split ' ',$str;
769 my $symbol = $self->structure_symbols();
770 if ($#desc_el != $#seq_el) {
771 $self->throw("Descriptor elements and seq elements do not match");
775 my ($seq, $motif) = (shift @seq_el, shift @desc_el);
776 $struct = (index($motif,'h5') == 0) ?
$symbol->{'5-prime'} :
777 (index($motif,'h3') == 0) ?
$symbol->{'3-prime'} :
778 (index($motif,'ss') == 0) ?
$symbol->{'single-strand'} :
779 (index($motif,'ctx')== 0) ?
$symbol->{'single-strand'} :
780 $symbol->{'unknown'};
781 $meta .= $struct x
(length($seq));
784 return ($rna, $meta);