3 # BioPerl module for Bio::SearchIO::erpin
5 # Cared for by Chris Fields <cjfields-at-uiuc-dot-edu>
7 # Copyright Chris Fields
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
15 Bio::SearchIO::erpin - SearchIO-based ERPIN parser
19 # do not call this module directly. Use Bio::SearchIO.
23 This is an experimental SearchIO-based parser for output from
24 the erpin program. It currently parses erpin output for ERPIN
25 versions 4.2.5 and above; older versions may work but will not be supported.
31 User feedback is an integral part of the evolution of this and other
32 Bioperl modules. Send your comments and suggestions preferably to
33 the Bioperl mailing list. Your participation is much appreciated.
35 bioperl-l@bioperl.org - General discussion
36 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
40 Report bugs to the Bioperl bug tracking system to help us keep track
41 of the bugs and their resolution. Bug reports can be submitted via the
44 http://bugzilla.open-bio.org/
46 =head1 AUTHOR - Chris Fields
48 Email cjfields-at-uiuc-dot-edu
52 The rest of the documentation details each of the object methods.
53 Internal methods are usually preceded with a _
57 # Let the code begin...
59 package Bio
::SearchIO
::erpin
;
63 use base
qw(Bio::SearchIO);
72 'Hsp_bit-score' => 'HSP-bits',
73 'Hsp_score' => 'HSP-score',
74 'Hsp_evalue' => 'HSP-evalue', # no evalues yet
75 'Hsp_query-from' => 'HSP-query_start',
76 'Hsp_query-to' => 'HSP-query_end',
77 'Hsp_hit-from' => 'HSP-hit_start', #
78 'Hsp_hit-to' => 'HSP-hit_end', #
79 'Hsp_gaps' => 'HSP-hsp_gaps',
80 'Hsp_hitgaps' => 'HSP-hit_gaps',
81 'Hsp_querygaps' => 'HSP-query_gaps',
82 'Hsp_qseq' => 'HSP-query_seq',
83 'Hsp_hseq' => 'HSP-hit_seq',
84 'Hsp_midline' => 'HSP-homology_seq',
85 'Hsp_structure' => 'HSP-meta',
86 'Hsp_align-len' => 'HSP-hsp_length',
87 'Hsp_stranded' => 'HSP-stranded',
90 'Hsp_positive' => 'HSP-conserved',
91 'Hsp_identity' => 'HSP-identical',
93 'Hit_id' => 'HIT-name',
94 'Hit_len' => 'HIT-length',
95 'Hit_gi' => 'HIT-ncbi_gi',
96 'Hit_accession' => 'HIT-accession',
97 'Hit_def' => 'HIT-description',
98 'Hit_signif' => 'HIT-significance', # none yet
99 'Hit_score' => 'HIT-score', # best HSP bit score
100 'Hit_bits' => 'HIT-bits', # best HSP bit score
102 'ERPIN_program' => 'RESULT-algorithm_name', # get/set
103 'ERPIN_version' => 'RESULT-algorithm_version', # get/set
104 'ERPIN_query-def'=> 'RESULT-query_name', # get/set
105 'ERPIN_query-len'=> 'RESULT-query_length',
106 'ERPIN_query-acc'=> 'RESULT-query_accession', # get/set
107 'ERPIN_querydesc'=> 'RESULT-query_description', # get/set
108 'ERPIN_db' => 'RESULT-database_name', # get/set
109 'ERPIN_db-len' => 'RESULT-database_entries', # none yet
110 'ERPIN_db-let' => 'RESULT-database_letters', # none yet
112 'Parameters_cutoff' => { 'RESULT-parameters' => 'cutoff' },
113 'Parameters_expect' => { 'RESULT-parameters' => 'expect' },
115 'Parameters_include' => { 'RESULT-parameters' => 'include' },
116 'Parameters_sc-match' => { 'RESULT-parameters' => 'match' },
117 'Parameters_sc-mismatch' => { 'RESULT-parameters' => 'mismatch' },
118 'Parameters_gap-open' => { 'RESULT-parameters' => 'gapopen' },
119 'Parameters_gap-extend' => { 'RESULT-parameters' => 'gapext' },
120 'Parameters_filter' => { 'RESULT-parameters' => 'filter' },
121 'Parameters_allowgaps' => { 'RESULT-parameters' => 'allowgaps' },
122 'Parameters_full_dbpath' => { 'RESULT-parameters' => 'full_dbpath' },
123 'Statistics_db-let' => { 'RESULT-statistics' => 'dbletters' },
127 my $DEFAULT_VERSION = '4.2.5';
128 my $DEFAULT_ALGORITHM = 'erpin';
133 Usage : my $obj = Bio::SearchIO::infernal->new();
134 Function: Builds a new Bio::SearchIO::infernal object
135 Returns : Bio::SearchIO::infernal
136 Args : -fh/-file => cmsearch (infernal) filename
138 -algorithm => algorithm (default 'Infernal')
139 -query_acc => query accession, eg. Rfam accession (default undef)
140 -hsp_minscore => minimum HSP score cutoff
141 -version => ERPIN version (not reported in output)
146 my ( $self, @args ) = @_;
147 $self->SUPER::_initialize
(@args);
148 my ($cutoff, $accession, $version) =
149 $self->_rearrange([qw(HSP_MINSCORE QUERY_ACC VERSION)],@args);
150 my $handler = $self->_eventHandler;
151 $handler->register_factory(
153 Bio
::Factory
::ObjectFactory
->new(
154 -type
=> 'Bio::Search::Result::GenericResult',
155 -interface
=> 'Bio::Search::Result::ResultI',
156 -verbose
=> $self->verbose()
160 $handler->register_factory(
162 Bio
::Factory
::ObjectFactory
->new(
163 -type
=> 'Bio::Search::Hit::ModelHit',
164 -interface
=> 'Bio::Search::Hit::HitI',
165 -verbose
=> $self->verbose()
169 $handler->register_factory(
171 Bio
::Factory
::ObjectFactory
->new(
172 -type
=> 'Bio::Search::HSP::ModelHSP',
173 -interface
=> 'Bio::Search::HSP::HSPI',
174 -verbose
=> $self->verbose()
177 $accession && $self->query_accession($accession);
178 $cutoff ||= $MINSCORE;
179 $self->hsp_minscore($cutoff);
180 $version ||= $DEFAULT_VERSION;
181 $self->algorithm_version($version);
187 Usage : my $hit = $searchio->next_result;
188 Function: Returns the next Result from a search
189 Returns : Bio::Search::Result::ResultI object
199 my $accession = $self->query_accession;
200 my $minscore = $self->hsp_minscore;
201 my $version = $self->algorithm_version;
202 my $verbose = $self->verbose; # cache for speed?
203 $self->start_document();
204 my ($lasthit, $lastscore, $lastlen, $lasteval);
207 while ( defined( my $line = $self->_readline ) ) {
208 next if $line =~ m{^\s*$};
209 if ($line =~ m{^Training\sset:\s+"(.*)"}xmso) {
211 $self->_pushback($line);
214 $self->start_element({'Name' => 'Result'});
215 $self->element_hash( {
216 'ERPIN_query-def' => $1,
217 'ERPIN_program' =>'erpin',
218 'ERPIN_version' => $version,
219 'ERPIN_query-acc' => $accession,
222 # parse rest of header here
224 while (defined ($line = $self->_readline) ) {
225 next if $line =~ m{^\s*$};
226 if (index($line, '>') == 0 ||
227 index($line, '-------- at level 1 --------') == 0) {
228 $self->_pushback($line);
231 if ($line =~ m{^\s+(\d+\ssequences\sof\slength\s\d+)}xmso) {
233 {'Name' => 'ERPIN_querydesc',
236 } elsif ($line =~ m{^Cutoff:\s+(\S+)}xmso) {
238 {'Name' => 'Parameters_cutoff',
241 } elsif ($line =~ m{^Database:\s+"(.*)"}xmso) {
243 {'Name' => 'ERPIN_db',
246 } elsif ($line =~ m{^\s+(\d+)\snucleotides\sto\sbe\sprocessed\sin\s(\d+)\ssequences}xmso) {
248 {'ERPIN_db-len' => $2,
249 'ERPIN_db-let' => $1}
251 } elsif ($line =~ m{^E-value\sat\scutoff\s\S+\sfor\s\S+\sdouble\sstrand\sdata:\s+(\S+)}xmso) {
253 {'Name' => 'Parameters_expect',
256 } elsif ($line =~ m{^\s+(ATGC\sratios:\s+(?:\S+\s+\S+\s+\S+\s+\S+))}) {
258 {'Name' => 'Statistics_db-let',
263 } elsif ($line =~ m{^>(\S+)\s+(.*)}xmso ) {
264 my ($id, $desc) = ($1, $2);
266 # desc line is repeated for each strand, so must check
267 # prior to starting a new hit
268 if (!$lasthit || $id ne $lasthit) {
269 if ($self->within_element('hit') ) {
270 $self->element_hash({
271 'Hit_signif' => $lasteval,
272 'Hit_score' => $lastscore,
273 'Hit_bits' => $lastscore
275 $self->end_element({'Name' => 'Hit'});
277 $self->start_element({'Name' => 'Hit'});
278 my ($gi, $acc, $ver) = $self->_get_seq_identifiers($id);
280 $self->element_hash({
283 'Hit_accession' => $ver ?
"$acc.$ver" :
289 } elsif ( (index($line, 'FW') == 0) || (index($line, 'RC') == 0)) {
290 my ($str, $hn, $pos, $score, $eval) = split ' ', $line;
291 if ($minscore < $score) {
292 $self->start_element({'Name' => 'Hsp'});
294 my ($start, $end) = split m{\.\.}, $pos, 2;
295 ($start, $end) = ($end, $start) if ($str eq 'RC');
296 $line = $self->_readline;
298 $self->element_hash({
299 'Hsp_stranded' => 'HIT',
300 'Hsp_hit-from' => $start,
301 'Hsp_hit-to' => $end,
302 'Hsp_score' => $score,
303 'Hsp_bit-score' => $score,
304 'Hsp_evalue' => $eval,
305 'Hsp_query-from' => 1,
306 'Hsp_query-to' => length($line),
307 'Hsp_align-len' => length($line),
310 $self->end_element({'Name' => 'Hsp'});
311 $lastscore = $score if (!$lastscore || $lastscore < $score);
312 $lasteval = $eval if (!$lasteval || $lasteval > $eval);
315 #$self->debug("Dropped data: $line");
319 if ($self->within_element('hit')) {
320 $self->element_hash({
321 'Hit_signif' => $lasteval,
322 'Hit_score' => $lastscore,
323 'Hit_bits' => $lastscore
325 $self->end_element({'Name' => 'Hit'});
327 $self->end_element({'Name' => 'Result'});
329 return $self->end_document();
334 Title : start_element
335 Usage : $eventgenerator->start_element
336 Function: Handles a start element event
338 Args : hashref with at least 2 keys 'Data' and 'Name'
343 my ( $self, $data ) = @_;
345 # we currently don't care about attributes
346 my $nm = $data->{'Name'};
347 my $type = $MODEMAP{$nm};
349 if ( $self->_eventHandler->will_handle($type) ) {
350 my $func = sprintf( "start_%s", lc $type );
351 $self->_eventHandler->$func( $data->{'Attributes'} );
353 unshift @
{ $self->{'_elements'} }, $type;
356 && $type eq 'result' )
358 $self->{'_values'} = {};
359 $self->{'_result'} = undef;
365 Title : start_element
366 Usage : $eventgenerator->end_element
367 Function: Handles an end element event
369 Args : hashref with at least 2 keys, 'Data' and 'Name'
375 my ( $self, $data ) = @_;
376 my $nm = $data->{'Name'};
377 my $type = $MODEMAP{$nm};
381 if ( $self->_eventHandler->will_handle($type) ) {
382 my $func = sprintf( "end_%s", lc $type );
383 $rc = $self->_eventHandler->$func( $self->{'_reporttype'},
384 $self->{'_values'} );
386 my $lastelem = shift @
{ $self->{'_elements'} };
388 elsif ( $MAPPING{$nm} ) {
389 if ( ref( $MAPPING{$nm} ) =~ /hash/i ) {
390 my $key = ( keys %{ $MAPPING{$nm} } )[0];
391 $self->{'_values'}->{$key}->{ $MAPPING{$nm}->{$key} } =
392 $self->{'_last_data'};
395 $self->{'_values'}->{ $MAPPING{$nm} } = $self->{'_last_data'};
399 $self->debug("unknown nm $nm, ignoring\n");
401 $self->{'_last_data'} = ''; # remove read data if we are at
403 $self->{'_result'} = $rc if ( defined $type && $type eq 'result' );
410 Usage : $eventhandler->element({'Name' => $name, 'Data' => $str});
411 Function: Convenience method that calls start_element, characters, end_element
413 Args : Hash ref with the keys 'Name' and 'Data'
419 my ( $self, $data ) = @_;
420 # simple data calls (%MAPPING) do not need start_element
421 $self->characters($data);
422 $self->end_element($data);
428 Usage : $eventhandler->element_hash({'Hsp_hit-from' => $start,
429 'Hsp_hit-to' => $end,
430 'Hsp_score' => $lastscore});
431 Function: Convenience method that takes multiple simple data elements and
432 maps to appropriate parameters
434 Args : Hash ref with the mapped key (in %MAPPING) and value
439 my ($self, $data) = @_;
440 $self->throw("Must provide data hash ref") if !$data || !ref($data);
441 for my $nm (sort keys %{$data}) {
442 next if $data->{$nm} && $data->{$nm} =~ m{^\s*$}o;
443 if ( $MAPPING{$nm} ) {
444 if ( ref( $MAPPING{$nm} ) =~ /hash/i ) {
445 my $key = ( keys %{ $MAPPING{$nm} } )[0];
446 $self->{'_values'}->{$key}->{ $MAPPING{$nm}->{$key} } =
450 $self->{'_values'}->{ $MAPPING{$nm} } = $data->{$nm};
459 Usage : $eventgenerator->characters($str)
460 Function: Send a character events
468 my ( $self, $data ) = @_;
469 return unless ( defined $data->{'Data'} && $data->{'Data'} !~ /^\s+$/o );
470 $self->{'_last_data'} = $data->{'Data'};
473 =head2 within_element
475 Title : within_element
476 Usage : if( $eventgenerator->within_element($element) ) {}
477 Function: Test if we are within a particular element
478 This is different than 'in' because within can be tested
481 Args : string element name
486 my ( $self, $name ) = @_;
489 || !defined $self->{'_elements'}
490 || scalar @
{ $self->{'_elements'} } == 0 );
491 foreach ( @
{ $self->{'_elements'} } ) {
492 return 1 if ( $_ eq $name );
500 Usage : if( $eventgenerator->in_element($element) ) {}
501 Function: Test if we are in a particular element
502 This is different than 'within' because 'in' only
503 tests its immediate parent.
505 Args : string element name
510 my ( $self, $name ) = @_;
511 return 0 if !defined $self->{'_elements'}->[0];
512 return ( $self->{'_elements'}->[0] eq $name );
515 =head2 start_document
517 Title : start_document
518 Usage : $eventgenerator->start_document
519 Function: Handle a start document event
527 $self->{'_lasttype'} = '';
528 $self->{'_values'} = {};
529 $self->{'_result'} = undef;
530 $self->{'_elements'} = [];
536 Usage : $eventgenerator->end_document
537 Function: Handles an end document event
538 Returns : Bio::Search::Result::ResultI object
545 return $self->{'_result'};
551 Usage : my $count = $searchio->result_count
552 Function: Returns the number of results we have processed
560 return $self->{'_result_count'};
563 =head2 query_accession
565 Title : query_accession
566 Usage : my $acc = $parser->query_accession();
567 Function: Get/Set query (model) accession; Infernal currently does not output
568 the accession number (Rfam accession #)
569 Returns : String (accession)
570 Args : [optional] String (accession)
574 sub query_accession
{
576 return $self->{'_query_accession'} = shift if @_;
577 return $self->{'_query_accession'};
583 Usage : my $cutoff = $parser->hsp_minscore();
584 Function: Get/Set min bit score cutoff (for generating Hits/HSPs)
585 Returns : score (number)
586 Args : [optional] score (number)
592 return $self->{'_hsp_minscore'} = shift if @_;
593 return $self->{'_hsp_minscore'};
596 =head2 algorithm_version
598 Title : algorithm_version
599 Usage : my $ver = $parser->algorithm_version();
600 Function: Get/Set algorithm version (not defined in RNAMotif output)
601 Returns : String (accession)
602 Args : [optional] String (accession)
606 sub algorithm_version
{
608 return $self->{'_algorithm'} = shift if @_;
609 return $self->{'_algorithm'};