see if Bio::DB::Sam package will work for tests
[bioperl-live.git] / Bio / SearchIO / rnamotif.pm
blob7785c4189b8ab4cee6f24abc3842cae0b760c737
2 # BioPerl module for Bio::SearchIO::rnamotif
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Chris Fields <cjfields-at-uiuc-dot-edu>
8 # Copyright Chris Fields
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::SearchIO::rnamotif - SearchIO-based RNAMotif parser
18 =head1 SYNOPSIS
20 # do not call this module directly. Use Bio::SearchIO.
22 =head1 DESCRIPTION
24 This is a highly experimental SearchIO-based parser for output from the rnamotif
25 program (one of the programs in the RNAMotif suite). It currently parses only
26 raw rnamotif output for RNAMotif versions 3.0 and above; older versions may work
27 but will not be supported. rmfmt output will not be supported at this time.
29 =head1 FEEDBACK
31 =head2 Mailing Lists
33 User feedback is an integral part of the evolution of this and other
34 Bioperl modules. Send your comments and suggestions preferably to
35 the Bioperl mailing list. Your participation is much appreciated.
37 bioperl-l@bioperl.org - General discussion
38 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
40 =head2 Support
42 Please direct usage questions or support issues to the mailing list:
44 I<bioperl-l@bioperl.org>
46 rather than to the module maintainer directly. Many experienced and
47 reponsive experts will be able look at the problem and quickly
48 address it. Please include a thorough description of the problem
49 with code and data examples if at all possible.
51 =head2 Reporting Bugs
53 Report bugs to the Bioperl bug tracking system to help us keep track
54 of the bugs and their resolution. Bug reports can be submitted via the
55 web:
57 https://redmine.open-bio.org/projects/bioperl/
59 =head1 AUTHOR - Chris Fields
61 Email cjfields-at-uiuc-dot-edu
63 =head1 APPENDIX
65 The rest of the documentation details each of the object methods.
66 Internal methods are usually preceded with a _
68 =cut
70 # Let the code begin...
72 package Bio::SearchIO::rnamotif;
73 use strict;
75 use base qw(Bio::SearchIO);
77 my %MODEMAP = (
78 'Result' => 'result',
79 'Hit' => 'hit',
80 'Hsp' => 'hsp'
83 my %MAPPING = (
84 # commented out tags have not been assigned
86 'Hsp_score' => 'HSP-score',
87 'Hsp_custom-data' => 'HSP-custom_score',
89 # rnamotif has no evalue
91 # descriptor has no start, end; same as hit start, end
92 'Hsp_query-from' => 'HSP-query_start',
93 'Hsp_query-to' => 'HSP-query_end',
94 'Hsp_hit-from' => 'HSP-hit_start',
95 'Hsp_hit-to' => 'HSP-hit_end',
97 # descriptor has no start, end
99 'Hsp_hseq' => 'HSP-hit_seq',
100 'Hsp_align-len' => 'HSP-hsp_length',
102 # build this from scratch, simple WUSS-format
103 'Hsp_structure' => 'HSP-meta',
104 'Hsp_stranded' => 'HSP-stranded',
106 # not supported for RNAMotif
108 'Hit_id' => 'HIT-name',
109 'Hit_accession' => 'HIT-accession',
110 'Hit_gi' => 'HIT-ncbi_gi',
111 'Hit_def' => 'HIT-description',
112 'Hit_score' => 'HIT-score', # best HSP score
114 'RNAMotif_program' => 'RESULT-algorithm_name', # get/set
115 'RNAMotif_version' => 'RESULT-algorithm_version', # get/set
116 'RNAMotif_query-def'=> 'RESULT-query_name', # get/set
117 # No length (query is a descriptor)
118 'RNAMotif_query-acc'=> 'RESULT-query_accession', # get/set
119 'RNAMotif_querydesc'=> 'RESULT-query_description', # get/set
120 'RNAMotif_db' => 'RESULT-database_name', # get/set
123 # use structure_delimiters to set custom delimiters
125 my @VALID_SYMBOLS = qw(5-prime 3-prime single-strand unknown);
126 my %STRUCTURE_SYMBOLS = (
127 '5-prime' => '<',
128 '3-prime' => '>',
129 'single-strand' => '.',
130 'unknown' => '?'
131 # may add more for quartets, triplets
134 my $DEFAULT_VERSION = '3.0.3';
136 =head2 new
138 Title : new
139 Usage : my $obj = Bio::SearchIO->new();
140 Function: Builds a new Bio::SearchIO::rnamotif object
141 Returns : Bio::SearchIO::rnamotif parser
142 Args : -fh/-file => RNAMotif filename
143 -format => 'rnamotif'
144 -model => query model (or descriptor, in this case)
145 -database => database name (default undef)
146 -query_acc => query accession (default undef)
147 -hsp_minscore => minimum HSP score cutoff
148 -hsp_maxscore => maximum HSP score cutoff
149 -symbols => hash ref of structure symbols to use
150 (default symbols in %STRUCTURE_SYMBOLS hash)
152 =cut
154 sub _initialize {
155 my ( $self, @args ) = @_;
156 $self->SUPER::_initialize(@args);
157 my ($version, $model, $database, $maxcutoff, $mincutoff, $seqdistance,
158 $accession, $symbols) =
159 $self->_rearrange([qw(VERSION MODEL DATABASE HSP_MAXSCORE
160 HSP_MINSCORE SEQ_DISTANCE QUERY_ACC SYMBOLS)],@args);
161 my $handler = $self->_eventHandler;
162 $handler->register_factory(
163 'result',
164 Bio::Factory::ObjectFactory->new(
165 -type => 'Bio::Search::Result::GenericResult',
166 -interface => 'Bio::Search::Result::ResultI',
167 -verbose => $self->verbose
171 $handler->register_factory(
172 'hit',
173 Bio::Factory::ObjectFactory->new(
174 -type => 'Bio::Search::Hit::ModelHit',
175 -interface => 'Bio::Search::Hit::HitI',
176 -verbose => $self->verbose
180 $handler->register_factory(
181 'hsp',
182 Bio::Factory::ObjectFactory->new(
183 -type => 'Bio::Search::HSP::ModelHSP',
184 -interface => 'Bio::Search::HSP::HSPI',
185 -verbose => $self->verbose
188 $model && $self->model($model);
189 $database && $self->database($database);
190 $accession && $self->query_accession($accession);
191 $version ||= $DEFAULT_VERSION;
192 $self->algorithm_version($version);
193 $self->throw("Cannot define both a minimal and maximal cutoff")
194 if (defined($mincutoff) && defined($maxcutoff));
195 defined($mincutoff) && $self->hsp_minscore($mincutoff);
196 defined($maxcutoff) && $self->hsp_maxscore($maxcutoff);
197 $symbols ||= \%STRUCTURE_SYMBOLS;
198 $self->structure_symbols($symbols);
201 =head2 next_result
203 Title : next_result
204 Usage : my $hit = $searchio->next_result;
205 Function: Returns the next Result from a search
206 Returns : Bio::Search::Result::ResultI object
207 Args : none
209 =cut
211 sub next_result {
212 my ($self) = @_;
213 my $seentop = 0;
214 local $/ = "\n";
215 local $_;
216 my ($rm, $d, $descriptor, $file, $oktobuild);
217 my ($hitid, $hitdesc, $hspid, $lastid, $lastscore);
218 my $sprintf;
220 # user-determined Result data
221 my ($accession, $db, $model) =
222 ($self->query_accession, $self->database, $self->model);
223 # HSP building options
224 my $hsp_min = $self->hsp_minscore;
225 my $hsp_max = $self->hsp_maxscore;
226 my $version = $self->algorithm_version;
227 my $laststart;
229 my $verbose = $self->verbose; # cache for speed?
230 $self->start_document();
231 PARSER:
232 while ( defined( my $line = $self->_readline ) ) {
233 # start of report
234 next if $line =~ m{^\s+$};
235 if (index($line,'#RM') == 0) {
236 if (index($line,'#RM scored') == 0 ) {
237 if ($seentop) {
238 $self->_pushback($line);
239 last PARSER;
241 $self->start_element({'Name' => 'Result'});
242 $self->element_hash({
243 'RNAMotif_program' => 'rnamotif',
244 'RNAMotif_version' => $version,
245 'RNAMotif_query-acc' => $accession,
246 'RNAMotif_db' => $db
247 });
248 $seentop = 1;
249 #$self->debug("Start result\n");
250 } elsif (index($line,'#RM descr') == 0) {
251 ($rm, $d, $descriptor) = split ' ', $line, 3;
252 # toss $rm, $d; keep $descr
253 chomp $descriptor;
254 $self->{'_descriptor'} = $descriptor;
255 $self->element(
256 {'Name' => 'RNAMotif_querydesc',
257 'Data' => $descriptor}
259 } elsif(index($line,'#RM dfile') == 0) {
260 ($rm, $d, $file) = split ' ', $line, 3;
261 # toss $rm, $d; keep $file
262 chomp $file;
263 $self->element(
264 {'Name' => 'RNAMotif_query-def',
265 'Data' => $file}
267 } else {
268 $self->debug("Unrecognized line: $line");
270 } elsif ($line =~ s{^>}{}) {
271 chomp $line;
272 ($hitid, $hitdesc) = split ' ',$line,2;
274 if ($self->within_element('hit') && ($hitid ne $lastid)) {
275 $self->element(
276 {'Name' => 'Hit_score',
277 'Data' => $lastscore}
278 ) if $lastscore;
279 $self->end_element({'Name' => 'Hit'});
280 $self->start_element({'Name' => 'Hit'});
281 } elsif (!$self->within_element('hit')) {
282 $self->start_element({'Name' => 'Hit'});
284 my ($gi, $acc, $ver) = $self->_get_seq_identifiers($hitid);
286 $self->element_hash({
287 'Hit_id' => $hitid,
288 'Hit_gi' => $gi,
289 'Hit_accession' => $ver ? "$acc.$ver" :
290 $acc ? $acc : $hitid,
291 'Hit_def' => $hitdesc}
293 $lastid = $hitid;
294 } elsif ($line =~ m{^(\S+)\s+(.+?)\s+(\d+)\s+(\d+)\s+(\d+)\s(.*)$}xm) {
295 chomp $line;
296 my $hspid = $1;
297 my ($score, $strand, $start, $length , $seq) = ($2, $3, $4, $5, $6);
298 $score *= 1; # implicitly cast any odd '0.000' to float
299 # sanity check ids
300 unless ($hitid eq $hspid) {
301 $self->throw("IDs do not match!");
303 # check score for possible sprintf data, mark as such, cache result
304 if (!defined($sprintf)) {
305 if ($score =~ m{[^0-9.-]+}gxms) {
306 if (defined $hsp_min || defined $hsp_max ) {
307 $self->warn("HSP data likely contains custom score; ".
308 "ignoring min/maxscore");
310 $sprintf = $oktobuild = 1;
311 } else {
312 $sprintf = 0;
316 if (!$sprintf) {
317 if (($hsp_min && $score <= $hsp_min)
318 || ($hsp_max && ($score >= $hsp_max)) ) {
319 # do not build HSP
320 $oktobuild = 0;
321 } else {
322 $oktobuild = 1;
324 # store best hit score based on the hsp min/maxscore only
325 if (defined $hsp_min && $score > $hsp_min) {
326 $lastscore = $score if !$lastscore || $score > $lastscore;
327 } elsif (defined $hsp_max && $score < $hsp_max) {
328 $lastscore = $score if !$lastscore || $score < $lastscore;
333 # build HSP
334 if ($oktobuild) {
335 my $end;
336 # calculate start/end
337 if( $strand==0 ) {
338 $end = $start + $length -1;
339 } else {
340 $end = $start - $length + 1;
343 my ($rna, $meta) = $self->_motif2meta($seq, $descriptor);
345 $self->start_element({'Name' => 'Hsp'});
346 my $rnalen = $rna =~ tr{ATGCatgc}{ATGCatgc};
347 $self->element_hash({
348 'Hsp_stranded' => 'HIT',
349 'Hsp_hseq' => $rna,
350 'Hsp_query-from' => 1,
351 'Hsp_query-to' =>length($rna),
352 'Hsp_hit-from' => $start,
353 'Hsp_hit-to' => $end,
354 'Hsp_structure' => $meta,
355 'Hsp_align-len' => length($rna),
356 'Hsp_score' => $sprintf ? undef : $score,
357 'Hsp_custom-data' => $sprintf ? $score : undef,
359 $self->end_element({'Name' => 'Hsp'});
360 $oktobuild = 0 if (!$sprintf);
364 if ($self->within_element('hit')) {
365 $self->element(
366 {'Name' => 'Hit_score',
367 'Data' => $lastscore}
368 ) if $lastscore;
369 $self->end_element( { 'Name' => 'Hit' } );
371 if ($seentop) {
372 $self->end_element( { 'Name' => 'Result' } );
374 return $self->end_document();
377 =head2 start_element
379 Title : start_element
380 Usage : $eventgenerator->start_element
381 Function: Handles a start element event
382 Returns : none
383 Args : hashref with at least 2 keys 'Data' and 'Name'
386 =cut
388 sub start_element {
389 my ( $self, $data ) = @_;
391 # we currently don't care about attributes
392 my $nm = $data->{'Name'};
393 my $type = $MODEMAP{$nm};
394 if ($type) {
395 if ( $self->_eventHandler->will_handle($type) ) {
396 my $func = sprintf( "start_%s", lc $type );
397 $self->_eventHandler->$func( $data->{'Attributes'} );
399 unshift @{ $self->{'_elements'} }, $type;
401 if ( defined $type
402 && $type eq 'result' )
404 $self->{'_values'} = {};
405 $self->{'_result'} = undef;
409 =head2 end_element
411 Title : start_element
412 Usage : $eventgenerator->end_element
413 Function: Handles an end element event
414 Returns : none
415 Args : hashref with at least 2 keys, 'Data' and 'Name'
418 =cut
420 sub end_element {
421 my ( $self, $data ) = @_;
422 my $nm = $data->{'Name'};
423 my $type = $MODEMAP{$nm};
424 my $rc;
426 if ($type) {
427 if ( $self->_eventHandler->will_handle($type) ) {
428 my $func = sprintf( "end_%s", lc $type );
429 $rc = $self->_eventHandler->$func( $self->{'_reporttype'},
430 $self->{'_values'} );
432 my $lastelem = shift @{ $self->{'_elements'} };
434 elsif ( $MAPPING{$nm} ) {
435 if ( ref( $MAPPING{$nm} ) =~ /hash/i ) {
436 my $key = ( keys %{ $MAPPING{$nm} } )[0];
437 $self->{'_values'}->{$key}->{ $MAPPING{$nm}->{$key} } =
438 $self->{'_last_data'};
440 else {
441 $self->{'_values'}->{ $MAPPING{$nm} } = $self->{'_last_data'};
444 else {
445 $self->debug("unknown nm $nm, ignoring\n");
447 $self->{'_last_data'} = ''; # remove read data if we are at
448 # end of an element
449 $self->{'_result'} = $rc if ( defined $type && $type eq 'result' );
450 return $rc;
453 =head2 element
455 Title : element
456 Usage : $eventhandler->element({'Name' => $name, 'Data' => $str});
457 Function: Convenience method that calls start_element, characters, end_element
458 Returns : none
459 Args : Hash ref with the keys 'Name' and 'Data'
462 =cut
464 sub element {
465 my ( $self, $data ) = @_;
466 # simple data calls (%MAPPING) do not need start_element
467 $self->characters($data);
468 $self->end_element($data);
472 =head2 element_hash
474 Title : element
475 Usage : $eventhandler->element_hash({'Hsp_hit-from' => $start,
476 'Hsp_hit-to' => $end,
477 'Hsp_score' => $lastscore});
478 Function: Convenience method that takes multiple simple data elements and
479 maps to appropriate parameters
480 Returns : none
481 Args : Hash ref with the mapped key (in %MAPPING) and value
483 =cut
485 sub element_hash {
486 my ($self, $data) = @_;
487 $self->throw("Must provide data hash ref") if !$data || !ref($data);
488 for my $nm (sort keys %{$data}) {
489 next if $data->{$nm} && $data->{$nm} =~ m{^\s*$}o;
490 if ( $MAPPING{$nm} ) {
491 if ( ref( $MAPPING{$nm} ) =~ /hash/i ) {
492 my $key = ( keys %{ $MAPPING{$nm} } )[0];
493 $self->{'_values'}->{$key}->{ $MAPPING{$nm}->{$key} } =
494 $data->{$nm};
496 else {
497 $self->{'_values'}->{ $MAPPING{$nm} } = $data->{$nm};
503 =head2 characters
505 Title : characters
506 Usage : $eventgenerator->characters($str)
507 Function: Send a character events
508 Returns : none
509 Args : string
512 =cut
514 sub characters {
515 my ( $self, $data ) = @_;
516 return unless ( defined $data->{'Data'} && $data->{'Data'} !~ /^\s+$/o );
517 $self->{'_last_data'} = $data->{'Data'};
520 =head2 within_element
522 Title : within_element
523 Usage : if( $eventgenerator->within_element($element) ) {}
524 Function: Test if we are within a particular element
525 This is different than 'in' because within can be tested
526 for a whole block.
527 Returns : boolean
528 Args : string element name
530 =cut
532 sub within_element {
533 my ( $self, $name ) = @_;
534 return 0
535 if ( !defined $name
536 || !defined $self->{'_elements'}
537 || scalar @{ $self->{'_elements'} } == 0 );
538 foreach ( @{ $self->{'_elements'} } ) {
539 return 1 if ( $_ eq $name );
541 return 0;
544 =head2 in_element
546 Title : in_element
547 Usage : if( $eventgenerator->in_element($element) ) {}
548 Function: Test if we are in a particular element
549 This is different than 'within' because 'in' only
550 tests its immediate parent.
551 Returns : boolean
552 Args : string element name
554 =cut
556 sub in_element {
557 my ( $self, $name ) = @_;
558 return 0 if !defined $self->{'_elements'}->[0];
559 return ( $self->{'_elements'}->[0] eq $name );
562 =head2 start_document
564 Title : start_document
565 Usage : $eventgenerator->start_document
566 Function: Handle a start document event
567 Returns : none
568 Args : none
570 =cut
572 sub start_document {
573 my ($self) = @_;
574 $self->{'_lasttype'} = '';
575 $self->{'_values'} = {};
576 $self->{'_result'} = undef;
577 $self->{'_elements'} = [];
580 =head2 end_document
582 Title : end_document
583 Usage : $eventgenerator->end_document
584 Function: Handles an end document event
585 Returns : Bio::Search::Result::ResultI object
586 Args : none
588 =cut
590 sub end_document {
591 my ($self) = @_;
592 return $self->{'_result'};
595 =head2 result_count
597 Title : result_count
598 Usage : my $count = $searchio->result_count
599 Function: Returns the number of results we have processed
600 Returns : integer
601 Args : none
603 =cut
605 sub result_count {
606 my $self = shift;
607 return $self->{'_result_count'};
610 =head2 descriptor
612 Title : descriptor
613 Usage : my $descr = $parser->descriptor();
614 Function: Get/Set descriptor name. Some versions of RNAMotif do not add the
615 descriptor name to the output. This also overrides any name found
616 while parsing.
617 Returns : String (name of model)
618 Args : [optional] String (name of model)
620 =cut
622 sub descriptor {
623 my $self = shift;
624 return $self->{'_descriptor'} = shift if @_;
625 return $self->{'_descriptor'};
628 =head2 model
630 Title : model
631 Usage : my $model = $parser->model();
632 Function: Get/Set model; Infernal currently does not output
633 the model name (Rfam ID)
634 Returns : String (name of model)
635 Args : [optional] String (name of model)
636 Note : this is a synonym for descriptor()
638 =cut
640 sub model { shift->descriptor(@_) }
642 =head2 database
644 Title : database
645 Usage : my $database = $parser->database();
646 Function: Get/Set database; Infernal currently does not output
647 the database name
648 Returns : String (database name)
649 Args : [optional] String (database name)
651 =cut
653 sub database {
654 my $self = shift;
655 return $self->{'_database'} = shift if @_;
656 return $self->{'_database'};
659 =head2 query_accession
661 Title : query_accession
662 Usage : my $acc = $parser->query_accession();
663 Function: Get/Set query (model) accession; RNAMotif currently does not output
664 the accession number
665 Returns : String (accession)
666 Args : [optional] String (accession)
668 =cut
670 sub query_accession {
671 my $self = shift;
672 return $self->{'_query_accession'} = shift if @_;
673 return $self->{'_query_accession'};
676 =head2 algorithm_version
678 Title : algorithm_version
679 Usage : my $ver = $parser->algorithm_version();
680 Function: Get/Set algorithm version (not defined in RNAMotif output)
681 Returns : String (accession)
682 Args : [optional] String (accession)
684 =cut
686 sub algorithm_version {
687 my $self = shift;
688 return $self->{'_algorithm'} = shift if @_;
689 return $self->{'_algorithm'};
692 =head2 hsp_minscore
694 Title : hsp_minscore
695 Usage : my $cutoff = $parser->hsp_minscore();
696 Function: Get/Set min score cutoff (for generating Hits/HSPs).
697 Returns : score (number)
698 Args : [optional] score (number)
699 Note : Cannot be set along with hsp_maxscore()
701 =cut
703 sub hsp_minscore {
704 my ($self, $score) = shift;
705 $self->throw('Minscore not set to a number') if
706 ($score && $score !~ m{[0-9.]+});
707 return $self->{'_hsp_minscore'} = shift if @_;
708 return $self->{'_hsp_minscore'};
711 =head2 hsp_maxscore
713 Title : hsp_maxscore
714 Usage : my $cutoff = $parser->hsp_maxscore();
715 Function: Get/Set max score cutoff (for generating Hits/HSPs).
716 Returns : score (number)
717 Args : [optional] score (number)
718 Note : Cannot be set along with hsp_minscore()
720 =cut
722 sub hsp_maxscore {
723 my ($self, $score) = shift;
724 $self->throw('Maxscore not set to a number') if
725 ($score && $score !~ m{[0-9.]+});
726 return $self->{'_hsp_maxscore'} = shift if @_;
727 return $self->{'_hsp_maxscore'};
730 =head2 structure_symbols
732 Title : structure_symbols
733 Usage : my $hashref = $parser->structure_symbols();
734 Function: Get/Set RNA structure symbols
735 Returns : Hash ref of delimiters (5' stem, 3' stem, single-strand, etc)
736 : default = < (5-prime)
737 > (3-prime)
738 . (single-strand)
739 ? (unknown)
740 Args : Hash ref of substitute delimiters, using above keys.
742 =cut
744 sub structure_symbols {
745 my ($self, $delim) = @_;
746 if ($delim) {
747 if (ref($delim) =~ m{HASH}) {
748 my %data = %{ $delim };
749 for my $d (@VALID_SYMBOLS) {
750 if ( exists $data{$d} ) {
751 $self->{'_delimiter'}->{$d} = $data{$d};
754 } else {
755 $self->throw("Args to helix_delimiters() should be in a hash reference");
758 return $self->{'_delimiter'};
761 #Private methods
763 =head2 _motif2meta
765 Title : _motif2meta
766 Usage : my ($rna, $meta) = $parser->_motif2meta($str, $descr);
767 Function: Creates meta string from sequence and descriptor
768 Returns : array of sequence, meta strings
769 Args : Array of string data and descriptor data
771 Note: This is currently a quick and simple way of making simple
772 RNA structures (stem-loops, helices, etc) from RNAMotif descriptor
773 data in the output file. It does not currently work with pseudoknots,
774 triplets, G-quartets, or other more complex RNA structural motifs.
776 =cut
778 sub _motif2meta {
779 my ($self, $str, $descriptor) = @_;
780 my ($rna, $meta);
781 my @desc_el = split ' ',$descriptor;
782 my @seq_el = split ' ',$str;
783 my $symbol = $self->structure_symbols();
784 if ($#desc_el != $#seq_el) {
785 $self->throw("Descriptor elements and seq elements do not match");
787 while (@desc_el) {
788 my $struct;
789 my ($seq, $motif) = (shift @seq_el, shift @desc_el);
790 $struct = (index($motif,'h5') == 0) ? $symbol->{'5-prime'} :
791 (index($motif,'h3') == 0) ? $symbol->{'3-prime'} :
792 (index($motif,'ss') == 0) ? $symbol->{'single-strand'} :
793 (index($motif,'ctx')== 0) ? $symbol->{'single-strand'} :
794 $symbol->{'unknown'};
795 $meta .= $struct x (length($seq));
796 $rna .= $seq;
798 return ($rna, $meta);