3 # BioPerl module for Bio::SearchIO::erpin
5 # Cared for by Chris Fields <cjfields-at-uiuc-dot-edu>
7 # Copyright Chris Fields
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
15 Bio::SearchIO::erpin - SearchIO-based ERPIN parser
21 This is an experimental SearchIO-based parser for output from
22 the erpin program. It currently parses erpin output for ERPIN
23 versions 4.2.5 and above; older versions may work but will not be supported.
29 User feedback is an integral part of the evolution of this and other
30 Bioperl modules. Send your comments and suggestions preferably to
31 the Bioperl mailing list. Your participation is much appreciated.
33 bioperl-l@bioperl.org - General discussion
34 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
38 Report bugs to the Bioperl bug tracking system to help us keep track
39 of the bugs and their resolution. Bug reports can be submitted via the
42 http://bugzilla.open-bio.org/
44 =head1 AUTHOR - Chris Fields
46 Email cjfields-at-uiuc-dot-edu
50 The rest of the documentation details each of the object methods.
51 Internal methods are usually preceded with a _
55 # Let the code begin...
57 package Bio
::SearchIO
::erpin
;
61 use base
qw(Bio::SearchIO);
70 'Hsp_bit-score' => 'HSP-bits',
71 'Hsp_score' => 'HSP-score',
72 'Hsp_evalue' => 'HSP-evalue', # no evalues yet
73 'Hsp_query-from' => 'HSP-query_start',
74 'Hsp_query-to' => 'HSP-query_end',
75 'Hsp_hit-from' => 'HSP-hit_start', #
76 'Hsp_hit-to' => 'HSP-hit_end', #
77 'Hsp_gaps' => 'HSP-hsp_gaps',
78 'Hsp_hitgaps' => 'HSP-hit_gaps',
79 'Hsp_querygaps' => 'HSP-query_gaps',
80 'Hsp_qseq' => 'HSP-query_seq',
81 'Hsp_hseq' => 'HSP-hit_seq',
82 'Hsp_midline' => 'HSP-homology_seq',
83 'Hsp_structure' => 'HSP-meta',
84 'Hsp_align-len' => 'HSP-hsp_length',
85 'Hsp_stranded' => 'HSP-stranded',
88 'Hsp_positive' => 'HSP-conserved',
89 'Hsp_identity' => 'HSP-identical',
91 'Hit_id' => 'HIT-name',
92 'Hit_len' => 'HIT-length',
93 'Hit_gi' => 'HIT-ncbi_gi',
94 'Hit_accession' => 'HIT-accession',
95 'Hit_def' => 'HIT-description',
96 'Hit_signif' => 'HIT-significance', # none yet
97 'Hit_score' => 'HIT-score', # best HSP bit score
98 'Hit_bits' => 'HIT-bits', # best HSP bit score
100 'ERPIN_program' => 'RESULT-algorithm_name', # get/set
101 'ERPIN_version' => 'RESULT-algorithm_version', # get/set
102 'ERPIN_query-def'=> 'RESULT-query_name', # get/set
103 'ERPIN_query-len'=> 'RESULT-query_length',
104 'ERPIN_query-acc'=> 'RESULT-query_accession', # get/set
105 'ERPIN_querydesc'=> 'RESULT-query_description', # get/set
106 'ERPIN_db' => 'RESULT-database_name', # get/set
107 'ERPIN_db-len' => 'RESULT-database_entries', # none yet
108 'ERPIN_db-let' => 'RESULT-database_letters', # none yet
110 'Parameters_cutoff' => { 'RESULT-parameters' => 'cutoff' },
111 'Parameters_expect' => { 'RESULT-parameters' => 'expect' },
113 'Parameters_include' => { 'RESULT-parameters' => 'include' },
114 'Parameters_sc-match' => { 'RESULT-parameters' => 'match' },
115 'Parameters_sc-mismatch' => { 'RESULT-parameters' => 'mismatch' },
116 'Parameters_gap-open' => { 'RESULT-parameters' => 'gapopen' },
117 'Parameters_gap-extend' => { 'RESULT-parameters' => 'gapext' },
118 'Parameters_filter' => { 'RESULT-parameters' => 'filter' },
119 'Parameters_allowgaps' => { 'RESULT-parameters' => 'allowgaps' },
120 'Parameters_full_dbpath' => { 'RESULT-parameters' => 'full_dbpath' },
121 'Statistics_db-let' => { 'RESULT-statistics' => 'dbletters' },
125 my $DEFAULT_VERSION = '4.2.5';
126 my $DEFAULT_ALGORITHM = 'erpin';
131 Usage : my $obj = Bio::SearchIO::infernal->new();
132 Function: Builds a new Bio::SearchIO::infernal object
133 Returns : Bio::SearchIO::infernal
134 Args : -fh/-file => cmsearch (infernal) filename
136 -algorithm => algorithm (default 'Infernal')
137 -query_acc => query accession, eg. Rfam accession (default undef)
138 -hsp_minscore => minimum HSP score cutoff
139 -version => ERPIN version (not reported in output)
144 my ( $self, @args ) = @_;
145 $self->SUPER::_initialize
(@args);
146 my ($cutoff, $accession, $version) =
147 $self->_rearrange([qw(HSP_MINSCORE QUERY_ACC VERSION)],@args);
148 my $handler = $self->_eventHandler;
149 $handler->register_factory(
151 Bio
::Factory
::ObjectFactory
->new(
152 -type
=> 'Bio::Search::Result::GenericResult',
153 -interface
=> 'Bio::Search::Result::ResultI',
154 -verbose
=> $self->verbose()
158 $handler->register_factory(
160 Bio
::Factory
::ObjectFactory
->new(
161 -type
=> 'Bio::Search::Hit::ModelHit',
162 -interface
=> 'Bio::Search::Hit::HitI',
163 -verbose
=> $self->verbose()
167 $handler->register_factory(
169 Bio
::Factory
::ObjectFactory
->new(
170 -type
=> 'Bio::Search::HSP::ModelHSP',
171 -interface
=> 'Bio::Search::HSP::HSPI',
172 -verbose
=> $self->verbose()
175 $accession && $self->query_accession($accession);
176 $cutoff ||= $MINSCORE;
177 $self->hsp_minscore($cutoff);
178 $version ||= $DEFAULT_VERSION;
179 $self->algorithm_version($version);
185 Usage : my $hit = $searchio->next_result;
186 Function: Returns the next Result from a search
187 Returns : Bio::Search::Result::ResultI object
197 my $accession = $self->query_accession;
198 my $minscore = $self->hsp_minscore;
199 my $version = $self->algorithm_version;
200 my $verbose = $self->verbose; # cache for speed?
201 $self->start_document();
202 my ($lasthit, $lastscore, $lastlen, $lasteval);
205 while ( defined( my $line = $self->_readline ) ) {
206 next if $line =~ m{^\s*$};
207 if ($line =~ m{^Training\sset:\s+"(.*)"}xmso) {
209 $self->_pushback($line);
212 $self->start_element({'Name' => 'Result'});
213 $self->element_hash( {
214 'ERPIN_query-def' => $1,
215 'ERPIN_program' =>'erpin',
216 'ERPIN_version' => $version,
217 'ERPIN_query-acc' => $accession,
220 # parse rest of header here
222 while (defined ($line = $self->_readline) ) {
223 next if $line =~ m{^\s*$};
224 if (index($line, '>') == 0 ||
225 index($line, '-------- at level 1 --------') == 0) {
226 $self->_pushback($line);
229 if ($line =~ m{^\s+(\d+\ssequences\sof\slength\s\d+)}xmso) {
231 {'Name' => 'ERPIN_querydesc',
234 } elsif ($line =~ m{^Cutoff:\s+(\S+)}xmso) {
236 {'Name' => 'Parameters_cutoff',
239 } elsif ($line =~ m{^Database:\s+"(.*)"}xmso) {
241 {'Name' => 'ERPIN_db',
244 } elsif ($line =~ m{^\s+(\d+)\snucleotides\sto\sbe\sprocessed\sin\s(\d+)\ssequences}xmso) {
246 {'ERPIN_db-len' => $2,
247 'ERPIN_db-let' => $1}
249 } elsif ($line =~ m{^E-value\sat\scutoff\s\S+\sfor\s\S+\sdouble\sstrand\sdata:\s+(\S+)}xmso) {
251 {'Name' => 'Parameters_expect',
254 } elsif ($line =~ m{^\s+(ATGC\sratios:\s+(?:\S+\s+\S+\s+\S+\s+\S+))}) {
256 {'Name' => 'Statistics_db-let',
261 } elsif ($line =~ m{^>(\S+)\s+(.*)}xmso ) {
262 my ($id, $desc) = ($1, $2);
264 # desc line is repeated for each strand, so must check
265 # prior to starting a new hit
266 if (!$lasthit || $id ne $lasthit) {
267 if ($self->within_element('hit') ) {
268 $self->element_hash({
269 'Hit_signif' => $lasteval,
270 'Hit_score' => $lastscore,
271 'Hit_bits' => $lastscore
273 $self->end_element({'Name' => 'Hit'});
275 $self->start_element({'Name' => 'Hit'});
276 my ($gi, $acc, $ver) = $self->_get_seq_identifiers($id);
278 $self->element_hash({
281 'Hit_accession' => $ver ?
"$acc.$ver" :
287 } elsif ( (index($line, 'FW') == 0) || (index($line, 'RC') == 0)) {
288 my ($str, $hn, $pos, $score, $eval) = split ' ', $line;
289 if ($minscore < $score) {
290 $self->start_element({'Name' => 'Hsp'});
292 my ($start, $end) = split m{\.\.}, $pos, 2;
293 ($start, $end) = ($end, $start) if ($str eq 'RC');
294 $line = $self->_readline;
296 $self->element_hash({
297 'Hsp_stranded' => 'HIT',
298 'Hsp_hit-from' => $start,
299 'Hsp_hit-to' => $end,
300 'Hsp_score' => $score,
301 'Hsp_bit-score' => $score,
302 'Hsp_evalue' => $eval,
303 'Hsp_query-from' => 1,
304 'Hsp_query-to' => length($line),
305 'Hsp_align-len' => length($line),
308 $self->end_element({'Name' => 'Hsp'});
309 $lastscore = $score if (!$lastscore || $lastscore < $score);
310 $lasteval = $eval if (!$lasteval || $lasteval > $eval);
313 #$self->debug("Dropped data: $line");
317 if ($self->within_element('hit')) {
318 $self->element_hash({
319 'Hit_signif' => $lasteval,
320 'Hit_score' => $lastscore,
321 'Hit_bits' => $lastscore
323 $self->end_element({'Name' => 'Hit'});
325 $self->end_element({'Name' => 'Result'});
327 return $self->end_document();
332 Title : start_element
333 Usage : $eventgenerator->start_element
334 Function: Handles a start element event
336 Args : hashref with at least 2 keys 'Data' and 'Name'
341 my ( $self, $data ) = @_;
343 # we currently don't care about attributes
344 my $nm = $data->{'Name'};
345 my $type = $MODEMAP{$nm};
347 if ( $self->_eventHandler->will_handle($type) ) {
348 my $func = sprintf( "start_%s", lc $type );
349 $self->_eventHandler->$func( $data->{'Attributes'} );
351 unshift @
{ $self->{'_elements'} }, $type;
354 && $type eq 'result' )
356 $self->{'_values'} = {};
357 $self->{'_result'} = undef;
363 Title : start_element
364 Usage : $eventgenerator->end_element
365 Function: Handles an end element event
367 Args : hashref with at least 2 keys, 'Data' and 'Name'
373 my ( $self, $data ) = @_;
374 my $nm = $data->{'Name'};
375 my $type = $MODEMAP{$nm};
379 if ( $self->_eventHandler->will_handle($type) ) {
380 my $func = sprintf( "end_%s", lc $type );
381 $rc = $self->_eventHandler->$func( $self->{'_reporttype'},
382 $self->{'_values'} );
384 my $lastelem = shift @
{ $self->{'_elements'} };
386 elsif ( $MAPPING{$nm} ) {
387 if ( ref( $MAPPING{$nm} ) =~ /hash/i ) {
388 my $key = ( keys %{ $MAPPING{$nm} } )[0];
389 $self->{'_values'}->{$key}->{ $MAPPING{$nm}->{$key} } =
390 $self->{'_last_data'};
393 $self->{'_values'}->{ $MAPPING{$nm} } = $self->{'_last_data'};
397 $self->debug("unknown nm $nm, ignoring\n");
399 $self->{'_last_data'} = ''; # remove read data if we are at
401 $self->{'_result'} = $rc if ( defined $type && $type eq 'result' );
408 Usage : $eventhandler->element({'Name' => $name, 'Data' => $str});
409 Function: Convenience method that calls start_element, characters, end_element
411 Args : Hash ref with the keys 'Name' and 'Data'
417 my ( $self, $data ) = @_;
418 # simple data calls (%MAPPING) do not need start_element
419 $self->characters($data);
420 $self->end_element($data);
426 Usage : $eventhandler->element_hash({'Hsp_hit-from' => $start,
427 'Hsp_hit-to' => $end,
428 'Hsp_score' => $lastscore});
429 Function: Convenience method that takes multiple simple data elements and
430 maps to appropriate parameters
432 Args : Hash ref with the mapped key (in %MAPPING) and value
437 my ($self, $data) = @_;
438 $self->throw("Must provide data hash ref") if !$data || !ref($data);
439 for my $nm (sort keys %{$data}) {
440 next if $data->{$nm} && $data->{$nm} =~ m{^\s*$}o;
441 if ( $MAPPING{$nm} ) {
442 if ( ref( $MAPPING{$nm} ) =~ /hash/i ) {
443 my $key = ( keys %{ $MAPPING{$nm} } )[0];
444 $self->{'_values'}->{$key}->{ $MAPPING{$nm}->{$key} } =
448 $self->{'_values'}->{ $MAPPING{$nm} } = $data->{$nm};
457 Usage : $eventgenerator->characters($str)
458 Function: Send a character events
466 my ( $self, $data ) = @_;
467 return unless ( defined $data->{'Data'} && $data->{'Data'} !~ /^\s+$/o );
468 $self->{'_last_data'} = $data->{'Data'};
471 =head2 within_element
473 Title : within_element
474 Usage : if( $eventgenerator->within_element($element) ) {}
475 Function: Test if we are within a particular element
476 This is different than 'in' because within can be tested
479 Args : string element name
484 my ( $self, $name ) = @_;
487 || !defined $self->{'_elements'}
488 || scalar @
{ $self->{'_elements'} } == 0 );
489 foreach ( @
{ $self->{'_elements'} } ) {
490 return 1 if ( $_ eq $name );
498 Usage : if( $eventgenerator->in_element($element) ) {}
499 Function: Test if we are in a particular element
500 This is different than 'within' because 'in' only
501 tests its immediate parent.
503 Args : string element name
508 my ( $self, $name ) = @_;
509 return 0 if !defined $self->{'_elements'}->[0];
510 return ( $self->{'_elements'}->[0] eq $name );
513 =head2 start_document
515 Title : start_document
516 Usage : $eventgenerator->start_document
517 Function: Handle a start document event
525 $self->{'_lasttype'} = '';
526 $self->{'_values'} = {};
527 $self->{'_result'} = undef;
528 $self->{'_elements'} = [];
534 Usage : $eventgenerator->end_document
535 Function: Handles an end document event
536 Returns : Bio::Search::Result::ResultI object
543 return $self->{'_result'};
549 Usage : my $count = $searchio->result_count
550 Function: Returns the number of results we have processed
558 return $self->{'_result_count'};
561 =head2 query_accession
563 Title : query_accession
564 Usage : my $acc = $parser->query_accession();
565 Function: Get/Set query (model) accession; Infernal currently does not output
566 the accession number (Rfam accession #)
567 Returns : String (accession)
568 Args : [optional] String (accession)
572 sub query_accession
{
574 return $self->{'_query_accession'} = shift if @_;
575 return $self->{'_query_accession'};
581 Usage : my $cutoff = $parser->hsp_minscore();
582 Function: Get/Set min bit score cutoff (for generating Hits/HSPs)
583 Returns : score (number)
584 Args : [optional] score (number)
590 return $self->{'_hsp_minscore'} = shift if @_;
591 return $self->{'_hsp_minscore'};
594 =head2 algorithm_version
596 Title : algorithm_version
597 Usage : my $ver = $parser->algorithm_version();
598 Function: Get/Set algorithm version (not defined in RNAMotif output)
599 Returns : String (accession)
600 Args : [optional] String (accession)
604 sub algorithm_version
{
606 return $self->{'_algorithm'} = shift if @_;
607 return $self->{'_algorithm'};