bug 2549; fixed small bug in Bio::Taxon which doesn't catch -common_name
[bioperl-live.git] / Bio / SearchIO / erpin.pm
blobe5ab165a1b70d835df36393dc9b236b868ffef76
1 # $Id$
3 # BioPerl module for Bio::SearchIO::erpin
5 # Cared for by Chris Fields <cjfields-at-uiuc-dot-edu>
7 # Copyright Chris Fields
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 Bio::SearchIO::erpin - SearchIO-based ERPIN parser
17 =head1 SYNOPSIS
19 # do not call this module directly. Use Bio::SearchIO.
21 =head1 DESCRIPTION
23 This is an experimental SearchIO-based parser for output from
24 the erpin program. It currently parses erpin output for ERPIN
25 versions 4.2.5 and above; older versions may work but will not be supported.
27 =head1 FEEDBACK
29 =head2 Mailing Lists
31 User feedback is an integral part of the evolution of this and other
32 Bioperl modules. Send your comments and suggestions preferably to
33 the Bioperl mailing list. Your participation is much appreciated.
35 bioperl-l@bioperl.org - General discussion
36 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
38 =head2 Reporting Bugs
40 Report bugs to the Bioperl bug tracking system to help us keep track
41 of the bugs and their resolution. Bug reports can be submitted via the
42 web:
44 http://bugzilla.open-bio.org/
46 =head1 AUTHOR - Chris Fields
48 Email cjfields-at-uiuc-dot-edu
50 =head1 APPENDIX
52 The rest of the documentation details each of the object methods.
53 Internal methods are usually preceded with a _
55 =cut
57 # Let the code begin...
59 package Bio::SearchIO::erpin;
60 use strict;
62 use Data::Dumper;
63 use base qw(Bio::SearchIO);
65 my %MODEMAP = (
66 'Result' => 'result',
67 'Hit' => 'hit',
68 'Hsp' => 'hsp'
71 my %MAPPING = (
72 'Hsp_bit-score' => 'HSP-bits',
73 'Hsp_score' => 'HSP-score',
74 'Hsp_evalue' => 'HSP-evalue', # no evalues yet
75 'Hsp_query-from' => 'HSP-query_start',
76 'Hsp_query-to' => 'HSP-query_end',
77 'Hsp_hit-from' => 'HSP-hit_start', #
78 'Hsp_hit-to' => 'HSP-hit_end', #
79 'Hsp_gaps' => 'HSP-hsp_gaps',
80 'Hsp_hitgaps' => 'HSP-hit_gaps',
81 'Hsp_querygaps' => 'HSP-query_gaps',
82 'Hsp_qseq' => 'HSP-query_seq',
83 'Hsp_hseq' => 'HSP-hit_seq',
84 'Hsp_midline' => 'HSP-homology_seq',
85 'Hsp_structure' => 'HSP-meta',
86 'Hsp_align-len' => 'HSP-hsp_length',
87 'Hsp_stranded' => 'HSP-stranded',
89 # not supported yet
90 'Hsp_positive' => 'HSP-conserved',
91 'Hsp_identity' => 'HSP-identical',
93 'Hit_id' => 'HIT-name',
94 'Hit_len' => 'HIT-length',
95 'Hit_gi' => 'HIT-ncbi_gi',
96 'Hit_accession' => 'HIT-accession',
97 'Hit_def' => 'HIT-description',
98 'Hit_signif' => 'HIT-significance', # none yet
99 'Hit_score' => 'HIT-score', # best HSP bit score
100 'Hit_bits' => 'HIT-bits', # best HSP bit score
102 'ERPIN_program' => 'RESULT-algorithm_name', # get/set
103 'ERPIN_version' => 'RESULT-algorithm_version', # get/set
104 'ERPIN_query-def'=> 'RESULT-query_name', # get/set
105 'ERPIN_query-len'=> 'RESULT-query_length',
106 'ERPIN_query-acc'=> 'RESULT-query_accession', # get/set
107 'ERPIN_querydesc'=> 'RESULT-query_description', # get/set
108 'ERPIN_db' => 'RESULT-database_name', # get/set
109 'ERPIN_db-len' => 'RESULT-database_entries', # none yet
110 'ERPIN_db-let' => 'RESULT-database_letters', # none yet
112 'Parameters_cutoff' => { 'RESULT-parameters' => 'cutoff' },
113 'Parameters_expect' => { 'RESULT-parameters' => 'expect' },
115 'Parameters_include' => { 'RESULT-parameters' => 'include' },
116 'Parameters_sc-match' => { 'RESULT-parameters' => 'match' },
117 'Parameters_sc-mismatch' => { 'RESULT-parameters' => 'mismatch' },
118 'Parameters_gap-open' => { 'RESULT-parameters' => 'gapopen' },
119 'Parameters_gap-extend' => { 'RESULT-parameters' => 'gapext' },
120 'Parameters_filter' => { 'RESULT-parameters' => 'filter' },
121 'Parameters_allowgaps' => { 'RESULT-parameters' => 'allowgaps' },
122 'Parameters_full_dbpath' => { 'RESULT-parameters' => 'full_dbpath' },
123 'Statistics_db-let' => { 'RESULT-statistics' => 'dbletters' },
126 my $MINSCORE = 0;
127 my $DEFAULT_VERSION = '4.2.5';
128 my $DEFAULT_ALGORITHM = 'erpin';
130 =head2 new
132 Title : new
133 Usage : my $obj = Bio::SearchIO::infernal->new();
134 Function: Builds a new Bio::SearchIO::infernal object
135 Returns : Bio::SearchIO::infernal
136 Args : -fh/-file => cmsearch (infernal) filename
137 -format => 'erpin'
138 -algorithm => algorithm (default 'Infernal')
139 -query_acc => query accession, eg. Rfam accession (default undef)
140 -hsp_minscore => minimum HSP score cutoff
141 -version => ERPIN version (not reported in output)
143 =cut
145 sub _initialize {
146 my ( $self, @args ) = @_;
147 $self->SUPER::_initialize(@args);
148 my ($cutoff, $accession, $version) =
149 $self->_rearrange([qw(HSP_MINSCORE QUERY_ACC VERSION)],@args);
150 my $handler = $self->_eventHandler;
151 $handler->register_factory(
152 'result',
153 Bio::Factory::ObjectFactory->new(
154 -type => 'Bio::Search::Result::GenericResult',
155 -interface => 'Bio::Search::Result::ResultI',
156 -verbose => $self->verbose()
160 $handler->register_factory(
161 'hit',
162 Bio::Factory::ObjectFactory->new(
163 -type => 'Bio::Search::Hit::ModelHit',
164 -interface => 'Bio::Search::Hit::HitI',
165 -verbose => $self->verbose()
169 $handler->register_factory(
170 'hsp',
171 Bio::Factory::ObjectFactory->new(
172 -type => 'Bio::Search::HSP::ModelHSP',
173 -interface => 'Bio::Search::HSP::HSPI',
174 -verbose => $self->verbose()
177 $accession && $self->query_accession($accession);
178 $cutoff ||= $MINSCORE;
179 $self->hsp_minscore($cutoff);
180 $version ||= $DEFAULT_VERSION;
181 $self->algorithm_version($version);
184 =head2 next_result
186 Title : next_result
187 Usage : my $hit = $searchio->next_result;
188 Function: Returns the next Result from a search
189 Returns : Bio::Search::Result::ResultI object
190 Args : none
192 =cut
194 sub next_result {
195 my ($self) = @_;
196 my $seentop = 0;
197 local $/ = "\n";
198 local $_;
199 my $accession = $self->query_accession;
200 my $minscore = $self->hsp_minscore;
201 my $version = $self->algorithm_version;
202 my $verbose = $self->verbose; # cache for speed?
203 $self->start_document();
204 my ($lasthit, $lastscore, $lastlen, $lasteval);
205 #my $hitline;
206 PARSER:
207 while ( defined( my $line = $self->_readline ) ) {
208 next if $line =~ m{^\s*$};
209 if ($line =~ m{^Training\sset:\s+"(.*)"}xmso) {
210 if ($seentop) {
211 $self->_pushback($line);
212 last PARSER;
214 $self->start_element({'Name' => 'Result'});
215 $self->element_hash( {
216 'ERPIN_query-def' => $1,
217 'ERPIN_program' =>'erpin',
218 'ERPIN_version' => $version,
219 'ERPIN_query-acc' => $accession,
221 $seentop = 1;
222 # parse rest of header here
223 HEADER:
224 while (defined ($line = $self->_readline) ) {
225 next if $line =~ m{^\s*$};
226 if (index($line, '>') == 0 ||
227 index($line, '-------- at level 1 --------') == 0) {
228 $self->_pushback($line);
229 last HEADER;
231 if ($line =~ m{^\s+(\d+\ssequences\sof\slength\s\d+)}xmso) {
232 $self->element(
233 {'Name' => 'ERPIN_querydesc',
234 'Data' => $1}
236 } elsif ($line =~ m{^Cutoff:\s+(\S+)}xmso) {
237 $self->element(
238 {'Name' => 'Parameters_cutoff',
239 'Data' => $1}
241 } elsif ($line =~ m{^Database:\s+"(.*)"}xmso) {
242 $self->element(
243 {'Name' => 'ERPIN_db',
244 'Data' => $1}
246 } elsif ($line =~ m{^\s+(\d+)\snucleotides\sto\sbe\sprocessed\sin\s(\d+)\ssequences}xmso) {
247 $self->element_hash(
248 {'ERPIN_db-len' => $2,
249 'ERPIN_db-let' => $1}
251 } elsif ($line =~ m{^E-value\sat\scutoff\s\S+\sfor\s\S+\sdouble\sstrand\sdata:\s+(\S+)}xmso) {
252 $self->element(
253 {'Name' => 'Parameters_expect',
254 'Data' => $1}
256 } elsif ($line =~ m{^\s+(ATGC\sratios:\s+(?:\S+\s+\S+\s+\S+\s+\S+))}) {
257 $self->element(
258 {'Name' => 'Statistics_db-let',
259 'Data' => $1}
263 } elsif ($line =~ m{^>(\S+)\s+(.*)}xmso ) {
264 my ($id, $desc) = ($1, $2);
265 chomp $desc;
266 # desc line is repeated for each strand, so must check
267 # prior to starting a new hit
268 if (!$lasthit || $id ne $lasthit) {
269 if ($self->within_element('hit') ) {
270 $self->element_hash({
271 'Hit_signif' => $lasteval,
272 'Hit_score' => $lastscore,
273 'Hit_bits' => $lastscore
275 $self->end_element({'Name' => 'Hit'});
277 $self->start_element({'Name' => 'Hit'});
278 my ($gi, $acc, $ver) = $self->_get_seq_identifiers($id);
280 $self->element_hash({
281 'Hit_id' => $id,
282 'Hit_gi' => $gi,
283 'Hit_accession' => $ver ? "$acc.$ver" :
284 $acc ? $acc : $id,
285 'Hit_def' => $desc
288 $lasthit = $id;
289 } elsif ( (index($line, 'FW') == 0) || (index($line, 'RC') == 0)) {
290 my ($str, $hn, $pos, $score, $eval) = split ' ', $line;
291 if ($minscore < $score) {
292 $self->start_element({'Name' => 'Hsp'});
294 my ($start, $end) = split m{\.\.}, $pos, 2;
295 ($start, $end) = ($end, $start) if ($str eq 'RC');
296 $line = $self->_readline;
297 chomp $line;
298 $self->element_hash({
299 'Hsp_stranded' => 'HIT',
300 'Hsp_hit-from' => $start,
301 'Hsp_hit-to' => $end,
302 'Hsp_score' => $score,
303 'Hsp_bit-score' => $score,
304 'Hsp_evalue' => $eval,
305 'Hsp_query-from' => 1,
306 'Hsp_query-to' => length($line),
307 'Hsp_align-len' => length($line),
308 'Hsp_hseq' =>$line
310 $self->end_element({'Name' => 'Hsp'});
311 $lastscore = $score if (!$lastscore || $lastscore < $score);
312 $lasteval = $eval if (!$lasteval || $lasteval > $eval);
314 } else {
315 #$self->debug("Dropped data: $line");
318 if ($seentop) {
319 if ($self->within_element('hit')) {
320 $self->element_hash({
321 'Hit_signif' => $lasteval,
322 'Hit_score' => $lastscore,
323 'Hit_bits' => $lastscore
325 $self->end_element({'Name' => 'Hit'});
327 $self->end_element({'Name' => 'Result'});
329 return $self->end_document();
332 =head2 start_element
334 Title : start_element
335 Usage : $eventgenerator->start_element
336 Function: Handles a start element event
337 Returns : none
338 Args : hashref with at least 2 keys 'Data' and 'Name'
340 =cut
342 sub start_element {
343 my ( $self, $data ) = @_;
345 # we currently don't care about attributes
346 my $nm = $data->{'Name'};
347 my $type = $MODEMAP{$nm};
348 if ($type) {
349 if ( $self->_eventHandler->will_handle($type) ) {
350 my $func = sprintf( "start_%s", lc $type );
351 $self->_eventHandler->$func( $data->{'Attributes'} );
353 unshift @{ $self->{'_elements'} }, $type;
355 if ( defined $type
356 && $type eq 'result' )
358 $self->{'_values'} = {};
359 $self->{'_result'} = undef;
363 =head2 end_element
365 Title : start_element
366 Usage : $eventgenerator->end_element
367 Function: Handles an end element event
368 Returns : none
369 Args : hashref with at least 2 keys, 'Data' and 'Name'
372 =cut
374 sub end_element {
375 my ( $self, $data ) = @_;
376 my $nm = $data->{'Name'};
377 my $type = $MODEMAP{$nm};
378 my $rc;
380 if ($type) {
381 if ( $self->_eventHandler->will_handle($type) ) {
382 my $func = sprintf( "end_%s", lc $type );
383 $rc = $self->_eventHandler->$func( $self->{'_reporttype'},
384 $self->{'_values'} );
386 my $lastelem = shift @{ $self->{'_elements'} };
388 elsif ( $MAPPING{$nm} ) {
389 if ( ref( $MAPPING{$nm} ) =~ /hash/i ) {
390 my $key = ( keys %{ $MAPPING{$nm} } )[0];
391 $self->{'_values'}->{$key}->{ $MAPPING{$nm}->{$key} } =
392 $self->{'_last_data'};
394 else {
395 $self->{'_values'}->{ $MAPPING{$nm} } = $self->{'_last_data'};
398 else {
399 $self->debug("unknown nm $nm, ignoring\n");
401 $self->{'_last_data'} = ''; # remove read data if we are at
402 # end of an element
403 $self->{'_result'} = $rc if ( defined $type && $type eq 'result' );
404 return $rc;
407 =head2 element
409 Title : element
410 Usage : $eventhandler->element({'Name' => $name, 'Data' => $str});
411 Function: Convenience method that calls start_element, characters, end_element
412 Returns : none
413 Args : Hash ref with the keys 'Name' and 'Data'
416 =cut
418 sub element {
419 my ( $self, $data ) = @_;
420 # simple data calls (%MAPPING) do not need start_element
421 $self->characters($data);
422 $self->end_element($data);
425 =head2 element_hash
427 Title : element
428 Usage : $eventhandler->element_hash({'Hsp_hit-from' => $start,
429 'Hsp_hit-to' => $end,
430 'Hsp_score' => $lastscore});
431 Function: Convenience method that takes multiple simple data elements and
432 maps to appropriate parameters
433 Returns : none
434 Args : Hash ref with the mapped key (in %MAPPING) and value
436 =cut
438 sub element_hash {
439 my ($self, $data) = @_;
440 $self->throw("Must provide data hash ref") if !$data || !ref($data);
441 for my $nm (sort keys %{$data}) {
442 next if $data->{$nm} && $data->{$nm} =~ m{^\s*$}o;
443 if ( $MAPPING{$nm} ) {
444 if ( ref( $MAPPING{$nm} ) =~ /hash/i ) {
445 my $key = ( keys %{ $MAPPING{$nm} } )[0];
446 $self->{'_values'}->{$key}->{ $MAPPING{$nm}->{$key} } =
447 $data->{$nm};
449 else {
450 $self->{'_values'}->{ $MAPPING{$nm} } = $data->{$nm};
456 =head2 characters
458 Title : characters
459 Usage : $eventgenerator->characters($str)
460 Function: Send a character events
461 Returns : none
462 Args : string
465 =cut
467 sub characters {
468 my ( $self, $data ) = @_;
469 return unless ( defined $data->{'Data'} && $data->{'Data'} !~ /^\s+$/o );
470 $self->{'_last_data'} = $data->{'Data'};
473 =head2 within_element
475 Title : within_element
476 Usage : if( $eventgenerator->within_element($element) ) {}
477 Function: Test if we are within a particular element
478 This is different than 'in' because within can be tested
479 for a whole block.
480 Returns : boolean
481 Args : string element name
483 =cut
485 sub within_element {
486 my ( $self, $name ) = @_;
487 return 0
488 if ( !defined $name
489 || !defined $self->{'_elements'}
490 || scalar @{ $self->{'_elements'} } == 0 );
491 foreach ( @{ $self->{'_elements'} } ) {
492 return 1 if ( $_ eq $name );
494 return 0;
497 =head2 in_element
499 Title : in_element
500 Usage : if( $eventgenerator->in_element($element) ) {}
501 Function: Test if we are in a particular element
502 This is different than 'within' because 'in' only
503 tests its immediate parent.
504 Returns : boolean
505 Args : string element name
507 =cut
509 sub in_element {
510 my ( $self, $name ) = @_;
511 return 0 if !defined $self->{'_elements'}->[0];
512 return ( $self->{'_elements'}->[0] eq $name );
515 =head2 start_document
517 Title : start_document
518 Usage : $eventgenerator->start_document
519 Function: Handle a start document event
520 Returns : none
521 Args : none
523 =cut
525 sub start_document {
526 my ($self) = @_;
527 $self->{'_lasttype'} = '';
528 $self->{'_values'} = {};
529 $self->{'_result'} = undef;
530 $self->{'_elements'} = [];
533 =head2 end_document
535 Title : end_document
536 Usage : $eventgenerator->end_document
537 Function: Handles an end document event
538 Returns : Bio::Search::Result::ResultI object
539 Args : none
541 =cut
543 sub end_document {
544 my ($self) = @_;
545 return $self->{'_result'};
548 =head2 result_count
550 Title : result_count
551 Usage : my $count = $searchio->result_count
552 Function: Returns the number of results we have processed
553 Returns : integer
554 Args : none
556 =cut
558 sub result_count {
559 my $self = shift;
560 return $self->{'_result_count'};
563 =head2 query_accession
565 Title : query_accession
566 Usage : my $acc = $parser->query_accession();
567 Function: Get/Set query (model) accession; Infernal currently does not output
568 the accession number (Rfam accession #)
569 Returns : String (accession)
570 Args : [optional] String (accession)
572 =cut
574 sub query_accession {
575 my $self = shift;
576 return $self->{'_query_accession'} = shift if @_;
577 return $self->{'_query_accession'};
580 =head2 hsp_minscore
582 Title : hsp_minscore
583 Usage : my $cutoff = $parser->hsp_minscore();
584 Function: Get/Set min bit score cutoff (for generating Hits/HSPs)
585 Returns : score (number)
586 Args : [optional] score (number)
588 =cut
590 sub hsp_minscore {
591 my $self = shift;
592 return $self->{'_hsp_minscore'} = shift if @_;
593 return $self->{'_hsp_minscore'};
596 =head2 algorithm_version
598 Title : algorithm_version
599 Usage : my $ver = $parser->algorithm_version();
600 Function: Get/Set algorithm version (not defined in RNAMotif output)
601 Returns : String (accession)
602 Args : [optional] String (accession)
604 =cut
606 sub algorithm_version {
607 my $self = shift;
608 return $self->{'_algorithm'} = shift if @_;
609 return $self->{'_algorithm'};