bug 2485 and related fixes
[bioperl-live.git] / Bio / SearchIO / rnamotif.pm
blobd7acf058398c1edf893fbad1e34f9c5d414b5941
1 # $Id$
3 # BioPerl module for Bio::SearchIO::rnamotif
5 # Cared for by Chris Fields <cjfields-at-uiuc-dot-edu>
7 # Copyright Chris Fields
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 Bio::SearchIO::rnamotif - SearchIO-based RNAMotif parser
17 =head1 SYNOPSIS
19 =head1 DESCRIPTION
21 This is a highly experimental SearchIO-based parser for output from the rnamotif
22 program (one of the programs in the RNAMotif suite). It currently parses only
23 raw rnamotif output for RNAMotif versions 3.0 and above; older versions may work
24 but will not be supported. rmfmt output will not be supported at this time.
26 =head1 FEEDBACK
28 =head2 Mailing Lists
30 User feedback is an integral part of the evolution of this and other
31 Bioperl modules. Send your comments and suggestions preferably to
32 the Bioperl mailing list. Your participation is much appreciated.
34 bioperl-l@bioperl.org - General discussion
35 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
37 =head2 Reporting Bugs
39 Report bugs to the Bioperl bug tracking system to help us keep track
40 of the bugs and their resolution. Bug reports can be submitted via the
41 web:
43 http://bugzilla.open-bio.org/
45 =head1 AUTHOR - Chris Fields
47 Email cjfields-at-uiuc-dot-edu
49 =head1 APPENDIX
51 The rest of the documentation details each of the object methods.
52 Internal methods are usually preceded with a _
54 =cut
56 # Let the code begin...
58 package Bio::SearchIO::rnamotif;
59 use strict;
61 use base qw(Bio::SearchIO);
63 my %MODEMAP = (
64 'Result' => 'result',
65 'Hit' => 'hit',
66 'Hsp' => 'hsp'
69 my %MAPPING = (
70 # commented out tags have not been assigned
72 'Hsp_score' => 'HSP-score',
73 'Hsp_custom-data' => 'HSP-custom_score',
75 # rnamotif has no evalue
77 # descriptor has no start, end; same as hit start, end
78 'Hsp_query-from' => 'HSP-query_start',
79 'Hsp_query-to' => 'HSP-query_end',
80 'Hsp_hit-from' => 'HSP-hit_start',
81 'Hsp_hit-to' => 'HSP-hit_end',
83 # descriptor has no start, end
85 'Hsp_hseq' => 'HSP-hit_seq',
86 'Hsp_align-len' => 'HSP-hsp_length',
88 # build this from scratch, simple WUSS-format
89 'Hsp_structure' => 'HSP-meta',
90 'Hsp_stranded' => 'HSP-stranded',
92 # not supported for RNAMotif
94 'Hit_id' => 'HIT-name',
95 'Hit_accession' => 'HIT-accession',
96 'Hit_gi' => 'HIT-ncbi_gi',
97 'Hit_def' => 'HIT-description',
98 'Hit_score' => 'HIT-score', # best HSP score
100 'RNAMotif_program' => 'RESULT-algorithm_name', # get/set
101 'RNAMotif_version' => 'RESULT-algorithm_version', # get/set
102 'RNAMotif_query-def'=> 'RESULT-query_name', # get/set
103 # No length (query is a descriptor)
104 'RNAMotif_query-acc'=> 'RESULT-query_accession', # get/set
105 'RNAMotif_querydesc'=> 'RESULT-query_description', # get/set
106 'RNAMotif_db' => 'RESULT-database_name', # get/set
109 # use structure_delimiters to set custom delimiters
111 my @VALID_SYMBOLS = qw(5-prime 3-prime single-strand unknown);
112 my %STRUCTURE_SYMBOLS = (
113 '5-prime' => '<',
114 '3-prime' => '>',
115 'single-strand' => '.',
116 'unknown' => '?'
117 # may add more for quartets, triplets
120 my $DEFAULT_VERSION = '3.0.3';
122 =head2 new
124 Title : new
125 Usage : my $obj = Bio::SearchIO->new();
126 Function: Builds a new Bio::SearchIO::rnamotif object
127 Returns : Bio::SearchIO::rnamotif parser
128 Args : -fh/-file => RNAMotif filename
129 -format => 'rnamotif'
130 -model => query model (or descriptor, in this case)
131 -database => database name (default undef)
132 -query_acc => query accession (default undef)
133 -hsp_minscore => minimum HSP score cutoff
134 -hsp_maxscore => maximum HSP score cutoff
135 -symbols => hash ref of structure symbols to use
136 (default symbols in %STRUCTURE_SYMBOLS hash)
138 =cut
140 sub _initialize {
141 my ( $self, @args ) = @_;
142 $self->SUPER::_initialize(@args);
143 my ($version, $model, $database, $maxcutoff, $mincutoff, $seqdistance,
144 $accession, $symbols) =
145 $self->_rearrange([qw(VERSION MODEL DATABASE HSP_MAXSCORE
146 HSP_MINSCORE SEQ_DISTANCE QUERY_ACC SYMBOLS)],@args);
147 my $handler = $self->_eventHandler;
148 $handler->register_factory(
149 'result',
150 Bio::Factory::ObjectFactory->new(
151 -type => 'Bio::Search::Result::GenericResult',
152 -interface => 'Bio::Search::Result::ResultI',
153 -verbose => $self->verbose
157 $handler->register_factory(
158 'hit',
159 Bio::Factory::ObjectFactory->new(
160 -type => 'Bio::Search::Hit::ModelHit',
161 -interface => 'Bio::Search::Hit::HitI',
162 -verbose => $self->verbose
166 $handler->register_factory(
167 'hsp',
168 Bio::Factory::ObjectFactory->new(
169 -type => 'Bio::Search::HSP::ModelHSP',
170 -interface => 'Bio::Search::HSP::HSPI',
171 -verbose => $self->verbose
174 $model && $self->model($model);
175 $database && $self->database($database);
176 $accession && $self->query_accession($accession);
177 $version ||= $DEFAULT_VERSION;
178 $self->algorithm_version($version);
179 $self->throw("Cannot define both a minimal and maximal cutoff")
180 if (defined($mincutoff) && defined($maxcutoff));
181 defined($mincutoff) && $self->hsp_minscore($mincutoff);
182 defined($maxcutoff) && $self->hsp_maxscore($maxcutoff);
183 $symbols ||= \%STRUCTURE_SYMBOLS;
184 $self->structure_symbols($symbols);
187 =head2 next_result
189 Title : next_result
190 Usage : my $hit = $searchio->next_result;
191 Function: Returns the next Result from a search
192 Returns : Bio::Search::Result::ResultI object
193 Args : none
195 =cut
197 sub next_result {
198 my ($self) = @_;
199 my $seentop = 0;
200 local $/ = "\n";
201 local $_;
202 my ($rm, $d, $descriptor, $file, $oktobuild);
203 my ($hitid, $hitdesc, $hspid, $lastid, $lastscore);
204 my $sprintf;
206 # user-determined Result data
207 my ($accession, $db, $model) =
208 ($self->query_accession, $self->database, $self->model);
209 # HSP building options
210 my $hsp_min = $self->hsp_minscore;
211 my $hsp_max = $self->hsp_maxscore;
212 my $version = $self->algorithm_version;
213 my $laststart;
215 my $verbose = $self->verbose; # cache for speed?
216 $self->start_document();
217 PARSER:
218 while ( defined( my $line = $self->_readline ) ) {
219 # start of report
220 next if $line =~ m{^\s+$};
221 if (index($line,'#RM') == 0) {
222 if (index($line,'#RM scored') == 0 ) {
223 if ($seentop) {
224 $self->_pushback($line);
225 last PARSER;
227 $self->start_element({'Name' => 'Result'});
228 $self->element_hash({
229 'RNAMotif_program' => 'rnamotif',
230 'RNAMotif_version' => $version,
231 'RNAMotif_query-acc' => $accession,
232 'RNAMotif_db' => $db
233 });
234 $seentop = 1;
235 #$self->debug("Start result\n");
236 } elsif (index($line,'#RM descr') == 0) {
237 ($rm, $d, $descriptor) = split ' ', $line, 3;
238 # toss $rm, $d; keep $descr
239 chomp $descriptor;
240 $self->{'_descriptor'} = $descriptor;
241 $self->element(
242 {'Name' => 'RNAMotif_querydesc',
243 'Data' => $descriptor}
245 } elsif(index($line,'#RM dfile') == 0) {
246 ($rm, $d, $file) = split ' ', $line, 3;
247 # toss $rm, $d; keep $file
248 chomp $file;
249 $self->element(
250 {'Name' => 'RNAMotif_query-def',
251 'Data' => $file}
253 } else {
254 $self->debug("Unrecognized line: $line");
256 } elsif ($line =~ s{^>}{}) {
257 chomp $line;
258 ($hitid, $hitdesc) = split ' ',$line,2;
260 if ($self->within_element('hit') && ($hitid ne $lastid)) {
261 $self->element(
262 {'Name' => 'Hit_score',
263 'Data' => $lastscore}
264 ) if $lastscore;
265 $self->end_element({'Name' => 'Hit'});
266 $self->start_element({'Name' => 'Hit'});
267 } elsif (!$self->within_element('hit')) {
268 $self->start_element({'Name' => 'Hit'});
270 my ($gi, $acc, $ver) = $self->_get_seq_identifiers($hitid);
272 $self->element_hash({
273 'Hit_id' => $hitid,
274 'Hit_gi' => $gi,
275 'Hit_accession' => $ver ? "$acc.$ver" :
276 $acc ? $acc : $hitid,
277 'Hit_def' => $hitdesc}
279 $lastid = $hitid;
280 } elsif ($line =~ m{^(\S+)\s+(.+?)\s+(\d+)\s+(\d+)\s+(\d+)\s(.*)$}xm) {
281 chomp $line;
282 my $hspid = $1;
283 my ($score, $strand, $start, $length , $seq) = ($2, $3, $4, $5, $6);
284 $score *= 1; # implicitly cast any odd '0.000' to float
285 # sanity check ids
286 unless ($hitid eq $hspid) {
287 $self->throw("IDs do not match!");
289 # check score for possible sprintf data, mark as such, cache result
290 if (!defined($sprintf)) {
291 if ($score =~ m{[^0-9.-]+}gxms) {
292 if (defined $hsp_min || defined $hsp_max ) {
293 $self->warn("HSP data likely contains custom score; ".
294 "ignoring min/maxscore");
296 $sprintf = $oktobuild = 1;
297 } else {
298 $sprintf = 0;
302 if (!$sprintf) {
303 if (($hsp_min && $score <= $hsp_min)
304 || ($hsp_max && ($score >= $hsp_max)) ) {
305 # do not build HSP
306 $oktobuild = 0;
307 } else {
308 $oktobuild = 1;
310 # store best hit score based on the hsp min/maxscore only
311 if (defined $hsp_min && $score > $hsp_min) {
312 $lastscore = $score if !$lastscore || $score > $lastscore;
313 } elsif (defined $hsp_max && $score < $hsp_max) {
314 $lastscore = $score if !$lastscore || $score < $lastscore;
319 # build HSP
320 if ($oktobuild) {
321 my $end;
322 # calculate start/end
323 if( $strand==0 ) {
324 $end = $start + $length -1;
325 } else {
326 $end = $start - $length + 1;
329 my ($rna, $meta) = $self->_motif2meta($seq, $descriptor);
331 $self->start_element({'Name' => 'Hsp'});
332 my $rnalen = $rna =~ tr{ATGCatgc}{ATGCatgc};
333 $self->element_hash({
334 'Hsp_stranded' => 'HIT',
335 'Hsp_hseq' => $rna,
336 'Hsp_query-from' => 1,
337 'Hsp_query-to' =>length($rna),
338 'Hsp_hit-from' => $start,
339 'Hsp_hit-to' => $end,
340 'Hsp_structure' => $meta,
341 'Hsp_align-len' => length($rna),
342 'Hsp_score' => $sprintf ? undef : $score,
343 'Hsp_custom-data' => $sprintf ? $score : undef,
345 $self->end_element({'Name' => 'Hsp'});
346 $oktobuild = 0 if (!$sprintf);
350 if ($self->within_element('hit')) {
351 $self->element(
352 {'Name' => 'Hit_score',
353 'Data' => $lastscore}
354 ) if $lastscore;
355 $self->end_element( { 'Name' => 'Hit' } );
357 if ($seentop) {
358 $self->end_element( { 'Name' => 'Result' } );
360 return $self->end_document();
363 =head2 start_element
365 Title : start_element
366 Usage : $eventgenerator->start_element
367 Function: Handles a start element event
368 Returns : none
369 Args : hashref with at least 2 keys 'Data' and 'Name'
372 =cut
374 sub start_element {
375 my ( $self, $data ) = @_;
377 # we currently don't care about attributes
378 my $nm = $data->{'Name'};
379 my $type = $MODEMAP{$nm};
380 if ($type) {
381 if ( $self->_eventHandler->will_handle($type) ) {
382 my $func = sprintf( "start_%s", lc $type );
383 $self->_eventHandler->$func( $data->{'Attributes'} );
385 unshift @{ $self->{'_elements'} }, $type;
387 if ( defined $type
388 && $type eq 'result' )
390 $self->{'_values'} = {};
391 $self->{'_result'} = undef;
395 =head2 end_element
397 Title : start_element
398 Usage : $eventgenerator->end_element
399 Function: Handles an end element event
400 Returns : none
401 Args : hashref with at least 2 keys, 'Data' and 'Name'
404 =cut
406 sub end_element {
407 my ( $self, $data ) = @_;
408 my $nm = $data->{'Name'};
409 my $type = $MODEMAP{$nm};
410 my $rc;
412 if ($type) {
413 if ( $self->_eventHandler->will_handle($type) ) {
414 my $func = sprintf( "end_%s", lc $type );
415 $rc = $self->_eventHandler->$func( $self->{'_reporttype'},
416 $self->{'_values'} );
418 my $lastelem = shift @{ $self->{'_elements'} };
420 elsif ( $MAPPING{$nm} ) {
421 if ( ref( $MAPPING{$nm} ) =~ /hash/i ) {
422 my $key = ( keys %{ $MAPPING{$nm} } )[0];
423 $self->{'_values'}->{$key}->{ $MAPPING{$nm}->{$key} } =
424 $self->{'_last_data'};
426 else {
427 $self->{'_values'}->{ $MAPPING{$nm} } = $self->{'_last_data'};
430 else {
431 $self->debug("unknown nm $nm, ignoring\n");
433 $self->{'_last_data'} = ''; # remove read data if we are at
434 # end of an element
435 $self->{'_result'} = $rc if ( defined $type && $type eq 'result' );
436 return $rc;
439 =head2 element
441 Title : element
442 Usage : $eventhandler->element({'Name' => $name, 'Data' => $str});
443 Function: Convenience method that calls start_element, characters, end_element
444 Returns : none
445 Args : Hash ref with the keys 'Name' and 'Data'
448 =cut
450 sub element {
451 my ( $self, $data ) = @_;
452 # simple data calls (%MAPPING) do not need start_element
453 $self->characters($data);
454 $self->end_element($data);
458 =head2 element_hash
460 Title : element
461 Usage : $eventhandler->element_hash({'Hsp_hit-from' => $start,
462 'Hsp_hit-to' => $end,
463 'Hsp_score' => $lastscore});
464 Function: Convenience method that takes multiple simple data elements and
465 maps to appropriate parameters
466 Returns : none
467 Args : Hash ref with the mapped key (in %MAPPING) and value
469 =cut
471 sub element_hash {
472 my ($self, $data) = @_;
473 $self->throw("Must provide data hash ref") if !$data || !ref($data);
474 for my $nm (sort keys %{$data}) {
475 next if $data->{$nm} && $data->{$nm} =~ m{^\s*$}o;
476 if ( $MAPPING{$nm} ) {
477 if ( ref( $MAPPING{$nm} ) =~ /hash/i ) {
478 my $key = ( keys %{ $MAPPING{$nm} } )[0];
479 $self->{'_values'}->{$key}->{ $MAPPING{$nm}->{$key} } =
480 $data->{$nm};
482 else {
483 $self->{'_values'}->{ $MAPPING{$nm} } = $data->{$nm};
489 =head2 characters
491 Title : characters
492 Usage : $eventgenerator->characters($str)
493 Function: Send a character events
494 Returns : none
495 Args : string
498 =cut
500 sub characters {
501 my ( $self, $data ) = @_;
502 return unless ( defined $data->{'Data'} && $data->{'Data'} !~ /^\s+$/o );
503 $self->{'_last_data'} = $data->{'Data'};
506 =head2 within_element
508 Title : within_element
509 Usage : if( $eventgenerator->within_element($element) ) {}
510 Function: Test if we are within a particular element
511 This is different than 'in' because within can be tested
512 for a whole block.
513 Returns : boolean
514 Args : string element name
516 =cut
518 sub within_element {
519 my ( $self, $name ) = @_;
520 return 0
521 if ( !defined $name
522 || !defined $self->{'_elements'}
523 || scalar @{ $self->{'_elements'} } == 0 );
524 foreach ( @{ $self->{'_elements'} } ) {
525 return 1 if ( $_ eq $name );
527 return 0;
530 =head2 in_element
532 Title : in_element
533 Usage : if( $eventgenerator->in_element($element) ) {}
534 Function: Test if we are in a particular element
535 This is different than 'within' because 'in' only
536 tests its immediate parent.
537 Returns : boolean
538 Args : string element name
540 =cut
542 sub in_element {
543 my ( $self, $name ) = @_;
544 return 0 if !defined $self->{'_elements'}->[0];
545 return ( $self->{'_elements'}->[0] eq $name );
548 =head2 start_document
550 Title : start_document
551 Usage : $eventgenerator->start_document
552 Function: Handle a start document event
553 Returns : none
554 Args : none
556 =cut
558 sub start_document {
559 my ($self) = @_;
560 $self->{'_lasttype'} = '';
561 $self->{'_values'} = {};
562 $self->{'_result'} = undef;
563 $self->{'_elements'} = [];
566 =head2 end_document
568 Title : end_document
569 Usage : $eventgenerator->end_document
570 Function: Handles an end document event
571 Returns : Bio::Search::Result::ResultI object
572 Args : none
574 =cut
576 sub end_document {
577 my ($self) = @_;
578 return $self->{'_result'};
581 =head2 result_count
583 Title : result_count
584 Usage : my $count = $searchio->result_count
585 Function: Returns the number of results we have processed
586 Returns : integer
587 Args : none
589 =cut
591 sub result_count {
592 my $self = shift;
593 return $self->{'_result_count'};
596 =head2 descriptor
598 Title : descriptor
599 Usage : my $descr = $parser->descriptor();
600 Function: Get/Set descriptor name. Some versions of RNAMotif do not add the
601 descriptor name to the output. This also overrides any name found
602 while parsing.
603 Returns : String (name of model)
604 Args : [optional] String (name of model)
606 =cut
608 sub descriptor {
609 my $self = shift;
610 return $self->{'_descriptor'} = shift if @_;
611 return $self->{'_descriptor'};
614 =head2 model
616 Title : model
617 Usage : my $model = $parser->model();
618 Function: Get/Set model; Infernal currently does not output
619 the model name (Rfam ID)
620 Returns : String (name of model)
621 Args : [optional] String (name of model)
622 Note : this is a synonym for descriptor()
624 =cut
626 sub model { shift->descriptor(@_) }
628 =head2 database
630 Title : database
631 Usage : my $database = $parser->database();
632 Function: Get/Set database; Infernal currently does not output
633 the database name
634 Returns : String (database name)
635 Args : [optional] String (database name)
637 =cut
639 sub database {
640 my $self = shift;
641 return $self->{'_database'} = shift if @_;
642 return $self->{'_database'};
645 =head2 query_accession
647 Title : query_accession
648 Usage : my $acc = $parser->query_accession();
649 Function: Get/Set query (model) accession; RNAMotif currently does not output
650 the accession number
651 Returns : String (accession)
652 Args : [optional] String (accession)
654 =cut
656 sub query_accession {
657 my $self = shift;
658 return $self->{'_query_accession'} = shift if @_;
659 return $self->{'_query_accession'};
662 =head2 algorithm_version
664 Title : algorithm_version
665 Usage : my $ver = $parser->algorithm_version();
666 Function: Get/Set algorithm version (not defined in RNAMotif output)
667 Returns : String (accession)
668 Args : [optional] String (accession)
670 =cut
672 sub algorithm_version {
673 my $self = shift;
674 return $self->{'_algorithm'} = shift if @_;
675 return $self->{'_algorithm'};
678 =head2 hsp_minscore
680 Title : hsp_minscore
681 Usage : my $cutoff = $parser->hsp_minscore();
682 Function: Get/Set min score cutoff (for generating Hits/HSPs).
683 Returns : score (number)
684 Args : [optional] score (number)
685 Note : Cannot be set along with hsp_maxscore()
687 =cut
689 sub hsp_minscore {
690 my ($self, $score) = shift;
691 $self->throw('Minscore not set to a number') if
692 ($score && $score !~ m{[0-9.]+});
693 return $self->{'_hsp_minscore'} = shift if @_;
694 return $self->{'_hsp_minscore'};
697 =head2 hsp_maxscore
699 Title : hsp_maxscore
700 Usage : my $cutoff = $parser->hsp_maxscore();
701 Function: Get/Set max score cutoff (for generating Hits/HSPs).
702 Returns : score (number)
703 Args : [optional] score (number)
704 Note : Cannot be set along with hsp_minscore()
706 =cut
708 sub hsp_maxscore {
709 my ($self, $score) = shift;
710 $self->throw('Maxscore not set to a number') if
711 ($score && $score !~ m{[0-9.]+});
712 return $self->{'_hsp_maxscore'} = shift if @_;
713 return $self->{'_hsp_maxscore'};
716 =head2 structure_symbols
718 Title : structure_symbols
719 Usage : my $hashref = $parser->structure_symbols();
720 Function: Get/Set RNA structure symbols
721 Returns : Hash ref of delimiters (5' stem, 3' stem, single-strand, etc)
722 : default = < (5-prime)
723 > (3-prime)
724 . (single-strand)
725 ? (unknown)
726 Args : Hash ref of substitute delimiters, using above keys.
728 =cut
730 sub structure_symbols {
731 my ($self, $delim) = @_;
732 if ($delim) {
733 if (ref($delim) =~ m{HASH}) {
734 my %data = %{ $delim };
735 for my $d (@VALID_SYMBOLS) {
736 if ( exists $data{$d} ) {
737 $self->{'_delimiter'}->{$d} = $data{$d};
740 } else {
741 $self->throw("Args to helix_delimiters() should be in a hash reference");
744 return $self->{'_delimiter'};
747 #Private methods
749 =head2 _motif2meta
751 Title : _motif2meta
752 Usage : my ($rna, $meta) = $parser->_motif2meta($str, $descr);
753 Function: Creates meta string from sequence and descriptor
754 Returns : array of sequence, meta strings
755 Args : Array of string data and descriptor data
757 Note: This is currently a quick and simple way of making simple
758 RNA structures (stem-loops, helices, etc) from RNAMotif descriptor
759 data in the output file. It does not currently work with pseudoknots,
760 triplets, G-quartets, or other more complex RNA structural motifs.
762 =cut
764 sub _motif2meta {
765 my ($self, $str, $descriptor) = @_;
766 my ($rna, $meta);
767 my @desc_el = split ' ',$descriptor;
768 my @seq_el = split ' ',$str;
769 my $symbol = $self->structure_symbols();
770 if ($#desc_el != $#seq_el) {
771 $self->throw("Descriptor elements and seq elements do not match");
773 while (@desc_el) {
774 my $struct;
775 my ($seq, $motif) = (shift @seq_el, shift @desc_el);
776 $struct = (index($motif,'h5') == 0) ? $symbol->{'5-prime'} :
777 (index($motif,'h3') == 0) ? $symbol->{'3-prime'} :
778 (index($motif,'ss') == 0) ? $symbol->{'single-strand'} :
779 (index($motif,'ctx')== 0) ? $symbol->{'single-strand'} :
780 $symbol->{'unknown'};
781 $meta .= $struct x (length($seq));
782 $rna .= $seq;
784 return ($rna, $meta);