bug 2485 and related fixes
[bioperl-live.git] / Bio / SearchIO / erpin.pm
blob8734b3a15e51e41525c973b4279dbbcfbf344c76
1 # $Id$
3 # BioPerl module for Bio::SearchIO::erpin
5 # Cared for by Chris Fields <cjfields-at-uiuc-dot-edu>
7 # Copyright Chris Fields
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 Bio::SearchIO::erpin - SearchIO-based ERPIN parser
17 =head1 SYNOPSIS
19 =head1 DESCRIPTION
21 This is an experimental SearchIO-based parser for output from
22 the erpin program. It currently parses erpin output for ERPIN
23 versions 4.2.5 and above; older versions may work but will not be supported.
25 =head1 FEEDBACK
27 =head2 Mailing Lists
29 User feedback is an integral part of the evolution of this and other
30 Bioperl modules. Send your comments and suggestions preferably to
31 the Bioperl mailing list. Your participation is much appreciated.
33 bioperl-l@bioperl.org - General discussion
34 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
36 =head2 Reporting Bugs
38 Report bugs to the Bioperl bug tracking system to help us keep track
39 of the bugs and their resolution. Bug reports can be submitted via the
40 web:
42 http://bugzilla.open-bio.org/
44 =head1 AUTHOR - Chris Fields
46 Email cjfields-at-uiuc-dot-edu
48 =head1 APPENDIX
50 The rest of the documentation details each of the object methods.
51 Internal methods are usually preceded with a _
53 =cut
55 # Let the code begin...
57 package Bio::SearchIO::erpin;
58 use strict;
60 use Data::Dumper;
61 use base qw(Bio::SearchIO);
63 my %MODEMAP = (
64 'Result' => 'result',
65 'Hit' => 'hit',
66 'Hsp' => 'hsp'
69 my %MAPPING = (
70 'Hsp_bit-score' => 'HSP-bits',
71 'Hsp_score' => 'HSP-score',
72 'Hsp_evalue' => 'HSP-evalue', # no evalues yet
73 'Hsp_query-from' => 'HSP-query_start',
74 'Hsp_query-to' => 'HSP-query_end',
75 'Hsp_hit-from' => 'HSP-hit_start', #
76 'Hsp_hit-to' => 'HSP-hit_end', #
77 'Hsp_gaps' => 'HSP-hsp_gaps',
78 'Hsp_hitgaps' => 'HSP-hit_gaps',
79 'Hsp_querygaps' => 'HSP-query_gaps',
80 'Hsp_qseq' => 'HSP-query_seq',
81 'Hsp_hseq' => 'HSP-hit_seq',
82 'Hsp_midline' => 'HSP-homology_seq',
83 'Hsp_structure' => 'HSP-meta',
84 'Hsp_align-len' => 'HSP-hsp_length',
85 'Hsp_stranded' => 'HSP-stranded',
87 # not supported yet
88 'Hsp_positive' => 'HSP-conserved',
89 'Hsp_identity' => 'HSP-identical',
91 'Hit_id' => 'HIT-name',
92 'Hit_len' => 'HIT-length',
93 'Hit_gi' => 'HIT-ncbi_gi',
94 'Hit_accession' => 'HIT-accession',
95 'Hit_def' => 'HIT-description',
96 'Hit_signif' => 'HIT-significance', # none yet
97 'Hit_score' => 'HIT-score', # best HSP bit score
98 'Hit_bits' => 'HIT-bits', # best HSP bit score
100 'ERPIN_program' => 'RESULT-algorithm_name', # get/set
101 'ERPIN_version' => 'RESULT-algorithm_version', # get/set
102 'ERPIN_query-def'=> 'RESULT-query_name', # get/set
103 'ERPIN_query-len'=> 'RESULT-query_length',
104 'ERPIN_query-acc'=> 'RESULT-query_accession', # get/set
105 'ERPIN_querydesc'=> 'RESULT-query_description', # get/set
106 'ERPIN_db' => 'RESULT-database_name', # get/set
107 'ERPIN_db-len' => 'RESULT-database_entries', # none yet
108 'ERPIN_db-let' => 'RESULT-database_letters', # none yet
110 'Parameters_cutoff' => { 'RESULT-parameters' => 'cutoff' },
111 'Parameters_expect' => { 'RESULT-parameters' => 'expect' },
113 'Parameters_include' => { 'RESULT-parameters' => 'include' },
114 'Parameters_sc-match' => { 'RESULT-parameters' => 'match' },
115 'Parameters_sc-mismatch' => { 'RESULT-parameters' => 'mismatch' },
116 'Parameters_gap-open' => { 'RESULT-parameters' => 'gapopen' },
117 'Parameters_gap-extend' => { 'RESULT-parameters' => 'gapext' },
118 'Parameters_filter' => { 'RESULT-parameters' => 'filter' },
119 'Parameters_allowgaps' => { 'RESULT-parameters' => 'allowgaps' },
120 'Parameters_full_dbpath' => { 'RESULT-parameters' => 'full_dbpath' },
121 'Statistics_db-let' => { 'RESULT-statistics' => 'dbletters' },
124 my $MINSCORE = 0;
125 my $DEFAULT_VERSION = '4.2.5';
126 my $DEFAULT_ALGORITHM = 'erpin';
128 =head2 new
130 Title : new
131 Usage : my $obj = Bio::SearchIO::infernal->new();
132 Function: Builds a new Bio::SearchIO::infernal object
133 Returns : Bio::SearchIO::infernal
134 Args : -fh/-file => cmsearch (infernal) filename
135 -format => 'erpin'
136 -algorithm => algorithm (default 'Infernal')
137 -query_acc => query accession, eg. Rfam accession (default undef)
138 -hsp_minscore => minimum HSP score cutoff
139 -version => ERPIN version (not reported in output)
141 =cut
143 sub _initialize {
144 my ( $self, @args ) = @_;
145 $self->SUPER::_initialize(@args);
146 my ($cutoff, $accession, $version) =
147 $self->_rearrange([qw(HSP_MINSCORE QUERY_ACC VERSION)],@args);
148 my $handler = $self->_eventHandler;
149 $handler->register_factory(
150 'result',
151 Bio::Factory::ObjectFactory->new(
152 -type => 'Bio::Search::Result::GenericResult',
153 -interface => 'Bio::Search::Result::ResultI',
154 -verbose => $self->verbose()
158 $handler->register_factory(
159 'hit',
160 Bio::Factory::ObjectFactory->new(
161 -type => 'Bio::Search::Hit::ModelHit',
162 -interface => 'Bio::Search::Hit::HitI',
163 -verbose => $self->verbose()
167 $handler->register_factory(
168 'hsp',
169 Bio::Factory::ObjectFactory->new(
170 -type => 'Bio::Search::HSP::ModelHSP',
171 -interface => 'Bio::Search::HSP::HSPI',
172 -verbose => $self->verbose()
175 $accession && $self->query_accession($accession);
176 $cutoff ||= $MINSCORE;
177 $self->hsp_minscore($cutoff);
178 $version ||= $DEFAULT_VERSION;
179 $self->algorithm_version($version);
182 =head2 next_result
184 Title : next_result
185 Usage : my $hit = $searchio->next_result;
186 Function: Returns the next Result from a search
187 Returns : Bio::Search::Result::ResultI object
188 Args : none
190 =cut
192 sub next_result {
193 my ($self) = @_;
194 my $seentop = 0;
195 local $/ = "\n";
196 local $_;
197 my $accession = $self->query_accession;
198 my $minscore = $self->hsp_minscore;
199 my $version = $self->algorithm_version;
200 my $verbose = $self->verbose; # cache for speed?
201 $self->start_document();
202 my ($lasthit, $lastscore, $lastlen, $lasteval);
203 #my $hitline;
204 PARSER:
205 while ( defined( my $line = $self->_readline ) ) {
206 next if $line =~ m{^\s*$};
207 if ($line =~ m{^Training\sset:\s+"(.*)"}xmso) {
208 if ($seentop) {
209 $self->_pushback($line);
210 last PARSER;
212 $self->start_element({'Name' => 'Result'});
213 $self->element_hash( {
214 'ERPIN_query-def' => $1,
215 'ERPIN_program' =>'erpin',
216 'ERPIN_version' => $version,
217 'ERPIN_query-acc' => $accession,
219 $seentop = 1;
220 # parse rest of header here
221 HEADER:
222 while (defined ($line = $self->_readline) ) {
223 next if $line =~ m{^\s*$};
224 if (index($line, '>') == 0 ||
225 index($line, '-------- at level 1 --------') == 0) {
226 $self->_pushback($line);
227 last HEADER;
229 if ($line =~ m{^\s+(\d+\ssequences\sof\slength\s\d+)}xmso) {
230 $self->element(
231 {'Name' => 'ERPIN_querydesc',
232 'Data' => $1}
234 } elsif ($line =~ m{^Cutoff:\s+(\S+)}xmso) {
235 $self->element(
236 {'Name' => 'Parameters_cutoff',
237 'Data' => $1}
239 } elsif ($line =~ m{^Database:\s+"(.*)"}xmso) {
240 $self->element(
241 {'Name' => 'ERPIN_db',
242 'Data' => $1}
244 } elsif ($line =~ m{^\s+(\d+)\snucleotides\sto\sbe\sprocessed\sin\s(\d+)\ssequences}xmso) {
245 $self->element_hash(
246 {'ERPIN_db-len' => $2,
247 'ERPIN_db-let' => $1}
249 } elsif ($line =~ m{^E-value\sat\scutoff\s\S+\sfor\s\S+\sdouble\sstrand\sdata:\s+(\S+)}xmso) {
250 $self->element(
251 {'Name' => 'Parameters_expect',
252 'Data' => $1}
254 } elsif ($line =~ m{^\s+(ATGC\sratios:\s+(?:\S+\s+\S+\s+\S+\s+\S+))}) {
255 $self->element(
256 {'Name' => 'Statistics_db-let',
257 'Data' => $1}
261 } elsif ($line =~ m{^>(\S+)\s+(.*)}xmso ) {
262 my ($id, $desc) = ($1, $2);
263 chomp $desc;
264 # desc line is repeated for each strand, so must check
265 # prior to starting a new hit
266 if (!$lasthit || $id ne $lasthit) {
267 if ($self->within_element('hit') ) {
268 $self->element_hash({
269 'Hit_signif' => $lasteval,
270 'Hit_score' => $lastscore,
271 'Hit_bits' => $lastscore
273 $self->end_element({'Name' => 'Hit'});
275 $self->start_element({'Name' => 'Hit'});
276 my ($gi, $acc, $ver) = $self->_get_seq_identifiers($id);
278 $self->element_hash({
279 'Hit_id' => $id,
280 'Hit_gi' => $gi,
281 'Hit_accession' => $ver ? "$acc.$ver" :
282 $acc ? $acc : $id,
283 'Hit_def' => $desc
286 $lasthit = $id;
287 } elsif ( (index($line, 'FW') == 0) || (index($line, 'RC') == 0)) {
288 my ($str, $hn, $pos, $score, $eval) = split ' ', $line;
289 if ($minscore < $score) {
290 $self->start_element({'Name' => 'Hsp'});
292 my ($start, $end) = split m{\.\.}, $pos, 2;
293 ($start, $end) = ($end, $start) if ($str eq 'RC');
294 $line = $self->_readline;
295 chomp $line;
296 $self->element_hash({
297 'Hsp_stranded' => 'HIT',
298 'Hsp_hit-from' => $start,
299 'Hsp_hit-to' => $end,
300 'Hsp_score' => $score,
301 'Hsp_bit-score' => $score,
302 'Hsp_evalue' => $eval,
303 'Hsp_query-from' => 1,
304 'Hsp_query-to' => length($line),
305 'Hsp_align-len' => length($line),
306 'Hsp_hseq' =>$line
308 $self->end_element({'Name' => 'Hsp'});
309 $lastscore = $score if (!$lastscore || $lastscore < $score);
310 $lasteval = $eval if (!$lasteval || $lasteval > $eval);
312 } else {
313 #$self->debug("Dropped data: $line");
316 if ($seentop) {
317 if ($self->within_element('hit')) {
318 $self->element_hash({
319 'Hit_signif' => $lasteval,
320 'Hit_score' => $lastscore,
321 'Hit_bits' => $lastscore
323 $self->end_element({'Name' => 'Hit'});
325 $self->end_element({'Name' => 'Result'});
327 return $self->end_document();
330 =head2 start_element
332 Title : start_element
333 Usage : $eventgenerator->start_element
334 Function: Handles a start element event
335 Returns : none
336 Args : hashref with at least 2 keys 'Data' and 'Name'
338 =cut
340 sub start_element {
341 my ( $self, $data ) = @_;
343 # we currently don't care about attributes
344 my $nm = $data->{'Name'};
345 my $type = $MODEMAP{$nm};
346 if ($type) {
347 if ( $self->_eventHandler->will_handle($type) ) {
348 my $func = sprintf( "start_%s", lc $type );
349 $self->_eventHandler->$func( $data->{'Attributes'} );
351 unshift @{ $self->{'_elements'} }, $type;
353 if ( defined $type
354 && $type eq 'result' )
356 $self->{'_values'} = {};
357 $self->{'_result'} = undef;
361 =head2 end_element
363 Title : start_element
364 Usage : $eventgenerator->end_element
365 Function: Handles an end element event
366 Returns : none
367 Args : hashref with at least 2 keys, 'Data' and 'Name'
370 =cut
372 sub end_element {
373 my ( $self, $data ) = @_;
374 my $nm = $data->{'Name'};
375 my $type = $MODEMAP{$nm};
376 my $rc;
378 if ($type) {
379 if ( $self->_eventHandler->will_handle($type) ) {
380 my $func = sprintf( "end_%s", lc $type );
381 $rc = $self->_eventHandler->$func( $self->{'_reporttype'},
382 $self->{'_values'} );
384 my $lastelem = shift @{ $self->{'_elements'} };
386 elsif ( $MAPPING{$nm} ) {
387 if ( ref( $MAPPING{$nm} ) =~ /hash/i ) {
388 my $key = ( keys %{ $MAPPING{$nm} } )[0];
389 $self->{'_values'}->{$key}->{ $MAPPING{$nm}->{$key} } =
390 $self->{'_last_data'};
392 else {
393 $self->{'_values'}->{ $MAPPING{$nm} } = $self->{'_last_data'};
396 else {
397 $self->debug("unknown nm $nm, ignoring\n");
399 $self->{'_last_data'} = ''; # remove read data if we are at
400 # end of an element
401 $self->{'_result'} = $rc if ( defined $type && $type eq 'result' );
402 return $rc;
405 =head2 element
407 Title : element
408 Usage : $eventhandler->element({'Name' => $name, 'Data' => $str});
409 Function: Convenience method that calls start_element, characters, end_element
410 Returns : none
411 Args : Hash ref with the keys 'Name' and 'Data'
414 =cut
416 sub element {
417 my ( $self, $data ) = @_;
418 # simple data calls (%MAPPING) do not need start_element
419 $self->characters($data);
420 $self->end_element($data);
423 =head2 element_hash
425 Title : element
426 Usage : $eventhandler->element_hash({'Hsp_hit-from' => $start,
427 'Hsp_hit-to' => $end,
428 'Hsp_score' => $lastscore});
429 Function: Convenience method that takes multiple simple data elements and
430 maps to appropriate parameters
431 Returns : none
432 Args : Hash ref with the mapped key (in %MAPPING) and value
434 =cut
436 sub element_hash {
437 my ($self, $data) = @_;
438 $self->throw("Must provide data hash ref") if !$data || !ref($data);
439 for my $nm (sort keys %{$data}) {
440 next if $data->{$nm} && $data->{$nm} =~ m{^\s*$}o;
441 if ( $MAPPING{$nm} ) {
442 if ( ref( $MAPPING{$nm} ) =~ /hash/i ) {
443 my $key = ( keys %{ $MAPPING{$nm} } )[0];
444 $self->{'_values'}->{$key}->{ $MAPPING{$nm}->{$key} } =
445 $data->{$nm};
447 else {
448 $self->{'_values'}->{ $MAPPING{$nm} } = $data->{$nm};
454 =head2 characters
456 Title : characters
457 Usage : $eventgenerator->characters($str)
458 Function: Send a character events
459 Returns : none
460 Args : string
463 =cut
465 sub characters {
466 my ( $self, $data ) = @_;
467 return unless ( defined $data->{'Data'} && $data->{'Data'} !~ /^\s+$/o );
468 $self->{'_last_data'} = $data->{'Data'};
471 =head2 within_element
473 Title : within_element
474 Usage : if( $eventgenerator->within_element($element) ) {}
475 Function: Test if we are within a particular element
476 This is different than 'in' because within can be tested
477 for a whole block.
478 Returns : boolean
479 Args : string element name
481 =cut
483 sub within_element {
484 my ( $self, $name ) = @_;
485 return 0
486 if ( !defined $name
487 || !defined $self->{'_elements'}
488 || scalar @{ $self->{'_elements'} } == 0 );
489 foreach ( @{ $self->{'_elements'} } ) {
490 return 1 if ( $_ eq $name );
492 return 0;
495 =head2 in_element
497 Title : in_element
498 Usage : if( $eventgenerator->in_element($element) ) {}
499 Function: Test if we are in a particular element
500 This is different than 'within' because 'in' only
501 tests its immediate parent.
502 Returns : boolean
503 Args : string element name
505 =cut
507 sub in_element {
508 my ( $self, $name ) = @_;
509 return 0 if !defined $self->{'_elements'}->[0];
510 return ( $self->{'_elements'}->[0] eq $name );
513 =head2 start_document
515 Title : start_document
516 Usage : $eventgenerator->start_document
517 Function: Handle a start document event
518 Returns : none
519 Args : none
521 =cut
523 sub start_document {
524 my ($self) = @_;
525 $self->{'_lasttype'} = '';
526 $self->{'_values'} = {};
527 $self->{'_result'} = undef;
528 $self->{'_elements'} = [];
531 =head2 end_document
533 Title : end_document
534 Usage : $eventgenerator->end_document
535 Function: Handles an end document event
536 Returns : Bio::Search::Result::ResultI object
537 Args : none
539 =cut
541 sub end_document {
542 my ($self) = @_;
543 return $self->{'_result'};
546 =head2 result_count
548 Title : result_count
549 Usage : my $count = $searchio->result_count
550 Function: Returns the number of results we have processed
551 Returns : integer
552 Args : none
554 =cut
556 sub result_count {
557 my $self = shift;
558 return $self->{'_result_count'};
561 =head2 query_accession
563 Title : query_accession
564 Usage : my $acc = $parser->query_accession();
565 Function: Get/Set query (model) accession; Infernal currently does not output
566 the accession number (Rfam accession #)
567 Returns : String (accession)
568 Args : [optional] String (accession)
570 =cut
572 sub query_accession {
573 my $self = shift;
574 return $self->{'_query_accession'} = shift if @_;
575 return $self->{'_query_accession'};
578 =head2 hsp_minscore
580 Title : hsp_minscore
581 Usage : my $cutoff = $parser->hsp_minscore();
582 Function: Get/Set min bit score cutoff (for generating Hits/HSPs)
583 Returns : score (number)
584 Args : [optional] score (number)
586 =cut
588 sub hsp_minscore {
589 my $self = shift;
590 return $self->{'_hsp_minscore'} = shift if @_;
591 return $self->{'_hsp_minscore'};
594 =head2 algorithm_version
596 Title : algorithm_version
597 Usage : my $ver = $parser->algorithm_version();
598 Function: Get/Set algorithm version (not defined in RNAMotif output)
599 Returns : String (accession)
600 Args : [optional] String (accession)
602 =cut
604 sub algorithm_version {
605 my $self = shift;
606 return $self->{'_algorithm'} = shift if @_;
607 return $self->{'_algorithm'};