Merge pull request #181 from bioperl/limit-dockerhub-trigger
[bioperl-live.git] / Bio / SearchIO / hmmer3.pm
blob35498177b76ae260af8fa77e133ca90f2e32c5a2
2 # BioPerl module for Bio::SearchIO::hmmer3
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Thomas Sharpton <thomas.sharpton@gmail.com>
8 # Copyright Thomas Sharpton
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::SearchIO::hmmer3
18 =head1 SYNOPSIS
20 use Bio::SearchIO;
22 my $searchio = Bio::SearchIO->new(
23 -format => 'hmmer',
24 -version => 3,
25 -file => 'hmmsearch.out'
28 my $result = $searchio->next_result;
29 my $hit = $result->next_hit;
30 print $hit->name, $hit->description, $hit->significance,
31 $hit->score, "\n";
33 my $hsp = $hit->next_hsp;
34 print $hsp->start('hit'), $hsp->end('hit'), $hsp->start('query'),
35 $hsp->end('query'), "\n";
37 =head1 DESCRIPTION
39 Code to parse output from hmmsearch, hmmscan, phmmer and nhmmer, compatible with
40 both version 2 and version 3 of the HMMER package from L<http://hmmer.org>.
42 =head1 FEEDBACK
44 =head2 Mailing Lists
46 User feedback is an integral part of the evolution of this and other
47 Bioperl modules. Send your comments and suggestions preferably to
48 the Bioperl mailing list. Your participation is much appreciated.
50 bioperl-l@bioperl.org - General discussion
51 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
53 =head2 Support
55 Please direct usage questions or support issues to the mailing list:
57 L<bioperl-l@bioperl.org>
59 rather than to the module maintainer directly. Many experienced and
60 reponsive experts will be able look at the problem and quickly
61 address it. Please include a thorough description of the problem
62 with code and data examples if at all possible.
64 =head2 Reporting Bugs
66 Report bugs to the Bioperl bug tracking system to help us keep track
67 of the bugs and their resolution. Bug reports can be submitted via
68 the web:
70 https://github.com/bioperl/bioperl-live/issues
72 =head1 AUTHOR - Thomas Sharpton
74 Email thomas.sharpton@gmail.com
76 Describe contact details here
78 =head1 CONTRIBUTORS
80 Additional contributors names and emails here
82 briano at bioteam.net
84 =head1 APPENDIX
86 The rest of the documentation details each of the object methods.
87 Internal methods are usually preceded with a _
89 =cut
91 # Let the code begin...
93 package Bio::SearchIO::hmmer3;
95 use strict;
96 use Data::Dumper;
97 use Bio::Factory::ObjectFactory;
98 use Bio::Tools::IUPAC;
99 use vars qw(%MAPPING %MODEMAP);
100 use base qw(Bio::SearchIO::hmmer);
102 BEGIN {
104 # mapping of HMMER items to Bioperl hash keys
105 %MODEMAP = (
106 'HMMER_Output' => 'result',
107 'Hit' => 'hit',
108 'Hsp' => 'hsp'
111 %MAPPING = (
112 'Hsp_bit-score' => 'HSP-bits',
113 'Hsp_score' => 'HSP-score',
114 'Hsp_evalue' => 'HSP-evalue',
115 'Hsp_query-from' => 'HSP-query_start',
116 'Hsp_query-to' => 'HSP-query_end',
117 'Hsp_query-strand' => 'HSP-query_strand',
118 'Hsp_hit-from' => 'HSP-hit_start',
119 'Hsp_hit-to' => 'HSP-hit_end',
120 'Hsp_hit-strand' => 'HSP-hit_strand',
121 'Hsp_positive' => 'HSP-conserved',
122 'Hsp_identity' => 'HSP-identical',
123 'Hsp_gaps' => 'HSP-hsp_gaps',
124 'Hsp_hitgaps' => 'HSP-hit_gaps',
125 'Hsp_querygaps' => 'HSP-query_gaps',
126 'Hsp_qseq' => 'HSP-query_seq',
127 'Hsp_csline' => 'HSP-cs_seq',
128 'Hsp_hseq' => 'HSP-hit_seq',
129 'Hsp_midline' => 'HSP-homology_seq',
130 'Hsp_pline' => 'HSP-pp_seq',
131 'Hsp_align-len' => 'HSP-hsp_length',
132 'Hsp_query-frame' => 'HSP-query_frame',
133 'Hsp_hit-frame' => 'HSP-hit_frame',
135 'Hit_id' => 'HIT-name',
136 'Hit_len' => 'HIT-length',
137 'Hit_accession' => 'HIT-accession',
138 'Hit_desc' => 'HIT-description',
139 'Hit_signif' => 'HIT-significance',
140 'Hit_score' => 'HIT-score',
142 'HMMER_program' => 'RESULT-algorithm_name',
143 'HMMER_version' => 'RESULT-algorithm_version',
144 'HMMER_query-def' => 'RESULT-query_name',
145 'HMMER_query-len' => 'RESULT-query_length',
146 'HMMER_query-acc' => 'RESULT-query_accession',
147 'HMMER_querydesc' => 'RESULT-query_description',
148 'HMMER_hmm' => 'RESULT-hmm_name',
149 'HMMER_seqfile' => 'RESULT-sequence_file',
150 'HMMER_db' => 'RESULT-database_name',
154 =head2 next_result
156 Title : next_result
157 Usage : my $hit = $searchio->next_result;
158 Function: Returns the next Result from a search
159 Returns : Bio::Search::Result::ResultI object
160 Args : none
162 =cut
164 sub next_result {
165 my ($self) = @_;
166 my ( $buffer, $last, @hit_list, @hsp_list, %hspinfo, %hitinfo, %domaincounter );
167 local $/ = "\n";
169 my @ambiguous_nt = keys %Bio::Tools::IUPAC::IUB;
170 my $ambiguous_nt = join '', @ambiguous_nt;
172 my $verbose = $self->verbose; # cache for speed? JES's idea in hmmer.pm
173 $self->start_document();
175 # This is here to ensure that next_result doesn't produce infinite loop
176 if ( !defined( $buffer = $self->_readline ) ) {
177 return undef;
179 else {
180 $self->_pushback($buffer);
183 my $hit_counter = 0; # helper variable for non-unique hit IDs
185 # Regex goes here for HMMER3
186 # Start with hmmsearch processing
187 while ( defined( $buffer = $self->_readline ) ) {
188 my $lineorig = $buffer;
189 chomp $buffer;
191 # Grab the program name
192 if ( $buffer =~ m/^\#\s(\S+)\s\:\:\s/ ) {
193 my $prog = $1;
195 # TO DO: customize the above regex to adapt to other
196 # program types (hmmscan, etc)
197 $self->start_element( { 'Name' => 'HMMER_Output' } );
198 $self->{'_result_count'}++; #Might need to move to another block
199 $self->element(
200 { 'Name' => 'HMMER_program',
201 'Data' => uc($prog)
206 # Get the HMMER package version and release date
207 elsif ( $buffer =~ m/^\#\sHMMER\s+(\S+)\s+\((.+)\)/ ) {
208 my $version = $1;
209 my $versiondate = $2;
210 $self->{'_hmmidline'} = $buffer;
211 $self->element(
212 { 'Name' => 'HMMER_version',
213 'Data' => $version
218 # Get the query info
219 elsif ( $buffer =~ /^\#\squery (?:\w+ )?file\:\s+(\S+)/ ) {
220 if ( $self->{'_reporttype'} eq 'HMMSEARCH'
221 || $self->{'_reporttype'} eq 'PHMMER'
222 || $self->{'_reporttype'} eq 'NHMMER' )
224 $self->{'_hmmfileline'} = $lineorig;
225 $self->element(
226 { 'Name' => 'HMMER_hmm',
227 'Data' => $1
231 elsif ( $self->{'_reporttype'} eq 'HMMSCAN' ) {
232 $self->{'_hmmseqline'} = $lineorig;
233 $self->element(
234 { 'Name' => 'HMMER_seqfile',
235 'Data' => $1
241 # If this is a report without alignments
242 elsif ( $buffer =~ m/^\#\sshow\salignments\sin\soutput/ ) {
243 $self->{'_alnreport'} = 0;
246 # Get the database info
247 elsif ( $buffer =~ m/^\#\starget\s\S+\sdatabase\:\s+(\S+)/ ) {
249 if ( $self->{'_reporttype'} eq 'HMMSEARCH'
250 || $self->{'_reporttype'} eq 'PHMMER'
251 || $self->{'_reporttype'} eq 'NHMMER' )
253 $self->{'_hmmseqline'} = $lineorig;
254 $self->element(
255 { 'Name' => 'HMMER_seqfile',
256 'Data' => $1
260 elsif ( $self->{'_reporttype'} eq 'HMMSCAN' ) {
261 $self->{'_hmmfileline'} = $lineorig;
262 $self->element(
263 { 'Name' => 'HMMER_hmm',
264 'Data' => $1
270 # Get query data
271 elsif ( $buffer =~ s/^Query:\s+// ) {
272 # For multi-query reports
273 if ( ( not exists $self->{_values}->{"RESULT-algorithm_name"}
274 or not exists $self->{_values}->{"RESULT-algorithm_version"}
276 and exists $self->{_hmmidline}
278 my ($version, $versiondate) = $self->{_hmmidline} =~ m/^\#\sHMMER\s+(\S+)\s+\((.+)\)/;
279 $self->element(
280 { 'Name' => 'HMMER_program',
281 'Data' => $self->{_reporttype}
284 $self->element(
285 { 'Name' => 'HMMER_version',
286 'Data' => $version
290 if ( ( not exists $self->{_values}->{"RESULT-hmm_name"}
291 or not exists $self->{_values}->{"RESULT-sequence_file"}
293 and ( exists $self->{_hmmfileline}
294 or exists $self->{_hmmseqline}
297 if ( $self->{'_reporttype'} eq 'HMMSEARCH'
298 or $self->{'_reporttype'} eq 'PHMMER'
299 or $self->{'_reporttype'} eq 'NHMMER'
301 my ($qry_file) = $self->{_hmmfileline} =~ m/^\#\squery (?:\w+ )?file\:\s+(\S+)/;
302 my ($target_file) = $self->{_hmmseqline} =~ m/^\#\starget\s\S+\sdatabase\:\s+(\S+)/;
303 $self->element(
304 { 'Name' => 'HMMER_hmm',
305 'Data' => $qry_file
308 $self->element(
309 { 'Name' => 'HMMER_seqfile',
310 'Data' => $target_file
314 elsif ( $self->{'_reporttype'} eq 'HMMSCAN' ) {
315 my ($qry_file) = $self->{_hmmseqline} =~ m/^\#\squery \w+ file\:\s+(\S+)/;
316 my ($target_file) = $self->{_hmmfileline} =~ m/^\#\starget\s\S+\sdatabase\:\s+(\S+)/;
317 $self->element(
318 { 'Name' => 'HMMER_seqfile',
319 'Data' => $qry_file
322 $self->element(
323 { 'Name' => 'HMMER_hmm',
324 'Data' => $target_file
330 unless ($buffer =~ s/\s+\[[L|M]\=(\d+)\]$//) {
331 warn "Error parsing length for query, offending line $buffer\n";
332 exit(0);
334 my $querylen = $1;
335 $self->element(
336 { 'Name' => 'HMMER_query-len',
337 'Data' => $querylen
340 $self->element(
341 { 'Name' => 'HMMER_query-def',
342 'Data' => $buffer
347 # Get Accession data
348 elsif ( $buffer =~ s/^Accession:\s+// ) {
349 $buffer =~ s/\s+$//;
350 $self->element(
351 { 'Name' => 'HMMER_query-acc',
352 'Data' => $buffer
357 # Get description data
358 elsif ( $buffer =~ s/^Description:\s+// ) {
359 $buffer =~ s/\s+$//;
360 $self->element(
361 { 'Name' => 'HMMER_querydesc',
362 'Data' => $buffer
367 # hmmsearch, nhmmer, and hmmscan-specific formatting here
368 elsif (
369 defined $self->{'_reporttype'}
370 && ( $self->{'_reporttype'} eq 'HMMSEARCH'
371 || $self->{'_reporttype'} eq 'HMMSCAN'
372 || $self->{'_reporttype'} eq 'PHMMER'
373 || $self->{'_reporttype'} eq 'NHMMER' )
376 # Complete sequence table data above inclusion threshold,
377 # hmmsearch or hmmscan
378 if ( $buffer =~ m/Scores for complete sequence/ ) {
379 while ( defined( $buffer = $self->_readline ) ) {
380 if ( $buffer =~ m/inclusion threshold/
381 || $buffer =~ m/Domain( and alignment)? annotation for each/
382 || $buffer =~ m/\[No hits detected/
383 || $buffer =~ m!^//! )
385 $self->_pushback($buffer);
386 last;
388 elsif ( $buffer =~ m/^\s+E-value\s+score/
389 || $buffer =~ m/\-\-\-/
390 || $buffer =~ m/^$/
393 next;
396 # Grab table data
397 $hit_counter++;
398 my ($eval_full, $score_full, $bias_full, $eval_best,
399 $score_best, $bias_best, $exp, $n,
400 $hitid, $desc, @hitline
402 @hitline = split( " ", $buffer );
403 $eval_full = shift @hitline;
404 $score_full = shift @hitline;
405 $bias_full = shift @hitline;
406 $eval_best = shift @hitline;
407 $score_best = shift @hitline;
408 $bias_best = shift @hitline;
409 $exp = shift @hitline;
410 $n = shift @hitline;
411 $hitid = shift @hitline;
412 $desc = join " ", @hitline;
414 $desc = '' if ( !defined($desc) );
416 push @hit_list,
417 [ $hitid, $desc, $eval_full, $score_full ];
418 $hitinfo{"$hitid.$hit_counter"} = $#hit_list;
422 # nhmmer
423 elsif ( $buffer =~ /Scores for complete hits/ ) {
424 while ( defined( $buffer = $self->_readline ) ) {
425 if ( $buffer =~ /inclusion threshold/
426 || $buffer =~ /Annotation for each hit/
427 || $buffer =~ /\[No hits detected/
428 || $buffer =~ m!^//! )
430 $self->_pushback($buffer);
431 last;
433 elsif ( $buffer =~ m/^\s+E-value\s+score/
434 || $buffer =~ m/\-\-\-/
435 || $buffer =~ m/^$/
438 next;
441 # Grab table data
442 $hit_counter++;
443 my ($eval, $score, $bias, $hitid,
444 $start, $end, $desc, @hitline
446 @hitline = split( " ", $buffer );
447 $eval = shift @hitline;
448 $score = shift @hitline;
449 $bias = shift @hitline;
450 $hitid = shift @hitline;
451 $start = shift @hitline;
452 $end = shift @hitline;
453 $desc = join ' ', @hitline;
455 $desc = '' if ( !defined($desc) );
457 push @hit_list, [ $hitid, $desc, $eval, $score ];
458 $hitinfo{"$hitid.$hit_counter"} = $#hit_list;
462 # Complete sequence table data below inclusion threshold
463 elsif ( $buffer =~ /inclusion threshold/ ) {
464 while ( defined( $buffer = $self->_readline ) ) {
465 if ( $buffer =~ /Domain( and alignment)? annotation for each/
466 || $buffer =~ /Internal pipeline statistics summary/
467 || $buffer =~ /Annotation for each hit\s+\(and alignments\)/
470 $self->_pushback($buffer);
471 last;
473 elsif ( $buffer =~ m/inclusion threshold/
474 || $buffer =~ m/^$/
477 next;
480 # Grab table data
481 $hit_counter++;
482 my ($eval_full, $score_full, $bias_full, $eval_best,
483 $score_best, $bias_best, $exp, $n,
484 $hitid, $desc, @hitline
486 @hitline = split( " ", $buffer );
487 $eval_full = shift @hitline;
488 $score_full = shift @hitline;
489 $bias_full = shift @hitline;
490 $eval_best = shift @hitline;
491 $score_best = shift @hitline;
492 $bias_best = shift @hitline;
493 $exp = shift @hitline;
494 $n = shift @hitline;
495 $hitid = shift @hitline;
496 $desc = join " ", @hitline;
498 $desc = '' if ( !defined($desc) );
500 push @hit_list,
501 [ $hitid, $desc, $eval_full, $score_full ];
502 $hitinfo{"$hitid.$hit_counter"} = $#hit_list;
506 # Domain annotation for each sequence table data,
507 # for hmmscan, hmmsearch & nhmmer
508 elsif ( $buffer =~ /Domain( and alignment)? annotation for each/
509 or $buffer =~ /Annotation for each hit\s+\(and alignments\)/
511 @hsp_list = (); # Here for multi-query reports
512 my $name;
513 my $annot_counter = 0;
515 while ( defined( $buffer = $self->_readline ) ) {
516 if ( $buffer =~ /\[No targets detected/
517 || $buffer =~ /Internal pipeline statistics/ )
519 $self->_pushback($buffer);
520 last;
523 if ( $buffer =~ m/^\>\>\s(\S*)\s+(.*)/ ) {
524 $name = $1;
525 my $desc = $2;
526 $annot_counter++;
527 $domaincounter{"$name.$annot_counter"} = 0;
529 # The Hit Description from the Scores table can be truncated if
530 # its too long, so use the '>>' line description when its longer
531 if (length $hit_list[
532 $hitinfo{"$name.$annot_counter"}
534 [1] < length $desc
536 $hit_list[ $hitinfo{"$name.$annot_counter"} ][1] = $desc;
539 while ( defined( $buffer = $self->_readline ) ) {
540 if ( $buffer =~ m/Internal pipeline statistics/
541 || $buffer =~ m/Alignments for each domain/
542 || $buffer =~ m/^\s+Alignment:/
543 || $buffer =~ m/^\>\>/ )
545 $self->_pushback($buffer);
546 last;
548 elsif ( $buffer =~ m/^\s+score\s+bias/
549 || $buffer =~ m/^\s+\#\s+score/
550 || $buffer =~ m/^\s+------\s+/
551 || $buffer =~ m/^\s\-\-\-\s+/
552 || $buffer =~ m/^$/
555 next;
558 # Grab hsp data from table, push into @hsp;
559 if ($self->{'_reporttype'} =~ m/(?:HMMSCAN|HMMSEARCH|PHMMER|NHMMER)/) {
560 my ( $domain_num, $score, $bias,
561 $ceval, $ieval,
562 $hmm_start, $hmm_stop, $hmm_cov,
563 $seq_start, $seq_stop, $seq_cov,
564 $env_start, $env_stop, $env_cov,
565 $hitlength, $acc );
566 my @vals;
568 if ( # HMMSCAN & HMMSEARCH
569 ( $domain_num, $score, $bias,
570 $ceval, $ieval,
571 $hmm_start, $hmm_stop, $hmm_cov,
572 $seq_start, $seq_stop, $seq_cov,
573 $env_start, $env_stop, $env_cov,
574 $acc )
575 = ( $buffer =~
576 m|^\s+(\d+)\s\!*\?*\s+ # domain number
577 (\S+)\s+(\S+)\s+ # score, bias
578 (\S+)\s+(\S+)\s+ # c-eval, i-eval
579 (\d+)\s+(\d+)\s+(\S+)\s+ # hmm start, stop, coverage
580 (\d+)\s+(\d+)\s+(\S+)\s+ # seq start, stop, coverage
581 (\d+)\s+(\d+)\s+(\S+)\s+ # env start, stop, coverage
582 (\S+) # posterior probability accuracy
583 \s*$|ox
586 # Values assigned when IF succeeded
588 # Try to get the Hit length from the alignment information
589 $hitlength = 0;
590 if ($self->{'_reporttype'} eq 'HMMSEARCH' || $self->{'_reporttype'} eq 'PHMMER') {
591 # For Hmmsearch, if seq coverage ends in ']' it means that the alignment
592 # runs until the end. In that case add the END coordinate to @hitinfo
593 # to use it as Hit Length
594 if ( $seq_cov =~ m/\]$/ ) {
595 $hitlength = $seq_stop;
598 elsif ($self->{'_reporttype'} eq 'HMMSCAN') {
599 # For Hmmscan, if hmm coverage ends in ']' it means that the alignment
600 # runs until the end. In that case add the END coordinate to @hitinfo
601 # to use it as Hit Length
602 if ( $hmm_cov =~ m/\]$/ ) {
603 $hitlength = $hmm_stop;
607 elsif ( # NHMMER
608 ( $score, $bias, $ceval,
609 $hmm_start, $hmm_stop, $hmm_cov,
610 $seq_start, $seq_stop, $seq_cov,
611 $env_start, $env_stop, $env_cov,
612 $hitlength, $acc )
613 = ( $buffer =~
614 m|^\s+[!?]\s+
615 (\S+)\s+(\S+)\s+(\S+)\s+ # score, bias, evalue
616 (\d+)\s+(\d+)\s+(\S+)\s+ # hmm start, stop, coverage
617 (\d+)\s+(\d+)\s+(\S+)\s+ # seq start, stop, coverage
618 (\d+)\s+(\d+)\s+(\S+)\s+ # env start, stop, coverage
619 (\d+)\s+(\S+) # target length, pp accuracy
620 .*$|ox
623 # Values assigned when IF succeeded
625 else {
626 print STDERR "Missed this line: $buffer\n";
627 next;
630 my $info = $hit_list[ $hitinfo{"$name.$annot_counter"} ];
631 if ( !defined $info ) {
632 $self->warn(
633 "Incomplete information: can't find HSP $name in list of hits\n"
635 next;
638 $domaincounter{"$name.$annot_counter"}++;
639 my $hsp_key
640 = $name . "_" . $domaincounter{"$name.$annot_counter"};
642 # Keep it simple for now. let's customize later
643 @vals = (
644 $hmm_start, $hmm_stop,
645 $seq_start, $seq_stop,
646 $score, $ceval,
647 $hitlength, '',
648 '', '',
649 '', ''
651 push @hsp_list, [ $name, @vals ];
652 $hspinfo{"$hsp_key.$annot_counter"} = $#hsp_list;
656 elsif ( $buffer =~ /Alignment(?:s for each domain)?:/ ) {
657 #line counter
658 my $count = 0;
660 # There's an optional block, so we sometimes need to
661 # count to 3, and sometimes to 4.
662 my $max_count = 3;
663 my $lastdomain;
664 my $hsp;
665 my ( $csline, $hline, $midline, $qline, $pline );
667 # To avoid deleting whitespaces from the homology line,
668 # keep track of the position and length of the alignment
669 # in each individual hline/qline, to take them as reference
670 # and use them in the homology line
671 my $align_offset = 0;
672 my $align_length = 0;
674 while ( defined( $buffer = $self->_readline ) ) {
675 if ( $buffer =~ m/^\>\>/
676 || $buffer =~ m/Internal pipeline statistics/ )
678 $self->_pushback($buffer);
679 last;
681 elsif ($buffer =~ m/^$/ )
683 # Reset these scalars on empty lines to help
684 # distinguish between the consensus structure/reference
685 # tracks (CS|RF lines) and homology lines ending in
686 # CS or RF aminoacids
687 $align_offset = 0;
688 $align_length = 0;
689 next;
692 if ( $buffer =~ /\s\s\=\=\sdomain\s(\d+)\s+/
693 or $buffer =~ /\s\sscore:\s\S+\s+/
695 my $domainnum = $1 || 1;
696 $count = 0;
697 my $key = $name . "_" . $domainnum;
698 $hsp = $hsp_list[ $hspinfo{"$key.$annot_counter"} ];
699 $csline = $$hsp[-5];
700 $hline = $$hsp[-4];
701 $midline = $$hsp[-3];
702 $qline = $$hsp[-2];
703 $pline = $$hsp[-1];
704 $lastdomain = $name;
706 # Consensus Structure or Reference track, some reports
707 # don't have it. Since it appears on top of the alignment,
708 # the reset of $align_length to 0 between alignment blocks
709 # avoid confusing homology lines with it.
710 elsif ( $buffer =~ m/\s+\S+\s(?:CS|RF)$/ and $align_length == 0 ) {
711 my @data = split( " ", $buffer );
712 $csline .= $data[-2];
713 $max_count++;
714 $count++;
715 next;
717 # Query line and Hit line swaps positions
718 # depending of the program
719 elsif ( $count == $max_count - 3
720 or $count == $max_count - 1
722 my @data = split( " ", $buffer );
724 my $line_offset = 0;
725 # Use \Q\E on match to avoid errors on alignments
726 # that include stop codons (*)
727 while ($buffer =~ m/\Q$data[-2]\E/g) {
728 $line_offset = pos $buffer;
730 if ($line_offset != 0) {
731 $align_length = length $data[-2];
732 $align_offset = $line_offset - $align_length;
735 if ($self->{'_reporttype'} eq 'HMMSCAN') {
736 # hit sequence
737 $hline .= $data[-2] if ($count == $max_count - 3);
738 # query sequence
739 $qline .= $data[-2] if ($count == $max_count - 1);
741 else { # hmmsearch & nhmmer
742 # hit sequence
743 $hline .= $data[-2] if ($count == $max_count - 1);
744 # query sequence
745 $qline .= $data[-2] if ($count == $max_count - 3);
748 $count++;
749 next;
751 # conservation track
752 # storage isn't quite right - need to remove
753 # leading/lagging whitespace while preserving
754 # gap data (latter isn't done, former is)
755 elsif ( $count == $max_count - 2 ) {
756 $midline .= substr $buffer, $align_offset, $align_length;
757 $count++;
758 next;
760 # posterior probability track
761 elsif ( $count == $max_count ) {
762 my @data = split(" ", $buffer);
763 $pline .= $data[-2];
764 $count = 0;
765 $max_count = 3;
766 $$hsp[-5] = $csline;
767 $$hsp[-4] = $hline;
768 $$hsp[-3] = $midline;
769 $$hsp[-2] = $qline;
770 $$hsp[-1] = $pline;
771 next;
773 else {
774 print STDERR "Missed this line: $buffer\n";
781 # End of report
782 elsif ( $buffer =~ m/Internal pipeline statistics/ || $buffer =~ m!^//! ) {
783 # If within hit, hsp close;
784 if ( $self->within_element('hit') ) {
785 if ( $self->within_element('hsp') ) {
786 $self->end_element( { 'Name' => 'Hsp' } );
788 $self->end_element( { 'Name' => 'Hit' } );
791 # Grab summary statistics of run
792 while ( defined( $buffer = $self->_readline ) ) {
793 last if ( $buffer =~ m/^\/\/$/ );
796 # Do a lot of processing of hits and hsps here
797 my $index = 0;
798 while ( my $hit = shift @hit_list ) {
799 $index++;
800 my $hit_name = shift @$hit;
801 my $hit_desc = shift @$hit;
802 my $hit_signif = shift @$hit;
803 my $hit_score = shift @$hit;
804 my $num_domains = $domaincounter{"$hit_name.$index"} || 0;
806 $self->start_element( { 'Name' => 'Hit' } );
807 $self->element(
808 { 'Name' => 'Hit_id',
809 'Data' => $hit_name
812 $self->element(
813 { 'Name' => 'Hit_desc',
814 'Data' => $hit_desc
817 $self->element(
818 { 'Name' => 'Hit_signif',
819 'Data' => $hit_signif
822 $self->element(
823 { 'Name' => 'Hit_score',
824 'Data' => $hit_score
828 for my $i ( 1 .. $num_domains ) {
829 my $key = $hit_name . "_" . $i;
830 my $hsp = $hsp_list[ $hspinfo{"$key.$index"} ];
831 if ( defined $hsp ) {
832 my $hsp_name = shift @$hsp;
833 $self->start_element( { 'Name' => 'Hsp' } );
834 # Since HMMER doesn't print some data explicitly,
835 # calculate it from the homology line (midline)
836 if ($$hsp[-3] ne '') {
837 my $length = length $$hsp[-3];
838 my $identical = ($$hsp[-3] =~ tr/a-zA-Z//);
839 my $positive = ($$hsp[-3] =~ tr/+//) + $identical;
840 $self->element(
842 'Name' => 'Hsp_align-len',
843 'Data' => $length
846 $self->element(
847 { 'Name' => 'Hsp_identity',
848 'Data' => $identical
851 $self->element(
852 { 'Name' => 'Hsp_positive',
853 'Data' => $positive
857 else {
858 $self->element(
859 { 'Name' => 'Hsp_identity',
860 'Data' => 0
863 $self->element(
864 { 'Name' => 'Hsp_positive',
865 'Data' => 0
869 if ( $self->{'_reporttype'} eq 'HMMSCAN' ) {
870 $self->element(
871 { 'Name' => 'Hsp_hit-from',
872 'Data' => shift @$hsp
875 $self->element(
876 { 'Name' => 'Hsp_hit-to',
877 'Data' => shift @$hsp
880 $self->element(
881 { 'Name' => 'Hsp_query-from',
882 'Data' => shift @$hsp
885 $self->element(
886 { 'Name' => 'Hsp_query-to',
887 'Data' => shift @$hsp
891 elsif ( $self->{'_reporttype'} eq 'HMMSEARCH'
892 or $self->{'_reporttype'} eq 'NHMMER'
894 $self->element(
895 { 'Name' => 'Hsp_query-from',
896 'Data' => shift @$hsp
899 $self->element(
900 { 'Name' => 'Hsp_query-to',
901 'Data' => shift @$hsp
904 $self->element(
905 { 'Name' => 'Hsp_hit-from',
906 'Data' => shift @$hsp
909 $self->element(
910 { 'Name' => 'Hsp_hit-to',
911 'Data' => shift @$hsp
915 $self->element(
916 { 'Name' => 'Hsp_score',
917 'Data' => shift @$hsp
920 $self->element(
921 { 'Name' => 'Hsp_evalue',
922 'Data' => shift @$hsp
925 my $hitlength = shift @$hsp;
926 if ( $hitlength != 0 ) {
927 $self->element(
928 { 'Name' => 'Hit_len',
929 'Data' => $hitlength
933 $self->element(
934 { 'Name' => 'Hsp_csline',
935 'Data' => shift @$hsp
938 $self->element(
939 { 'Name' => 'Hsp_hseq',
940 'Data' => shift @$hsp
943 $self->element(
944 { 'Name' => 'Hsp_midline',
945 'Data' => shift @$hsp
948 $self->element(
949 { 'Name' => 'Hsp_qseq',
950 'Data' => shift @$hsp
953 $self->element(
954 { 'Name' => 'Hsp_pline',
955 'Data' => shift @$hsp
959 # Only nhmmer output has strand information
960 if ( $self->{'_reporttype'} eq 'NHMMER' ) {
961 my $hstart = $self->get_from_element('HSP-hit_start');
962 my $hend = $self->get_from_element('HSP-hit_end');
963 my $hstrand = ( $hstart < $hend ) ? 1 : -1;
965 my $qstart = $self->get_from_element('HSP-query_start');
966 my $qend = $self->get_from_element('HSP-query_end');
967 my $qstrand = ( $qstart < $qend ) ? 1 : -1;
969 $self->element(
970 { 'Name' => 'Hsp_query-strand',
971 'Data' => $qstrand
974 $self->element(
975 { 'Name' => 'Hsp_hit-strand',
976 'Data' => $hstrand
981 $self->end_element( { 'Name' => 'Hsp' } );
984 $self->end_element( { 'Name' => 'Hit' } );
986 @hit_list = ();
987 %hitinfo = ();
988 last;
991 else {
992 print STDERR "Missed this line: $buffer\n";
993 $self->debug($buffer);
995 $last = $buffer;
997 $self->end_element( { 'Name' => 'HMMER_Output' } );
998 my $result = $self->end_document();
999 return $result;
1002 =head2 start_element
1004 Title : start_element
1005 Usage : $eventgenerator->start_element
1006 Function: Handles a start event
1007 Returns : none
1008 Args : hashref with at least 2 keys 'Data' and 'Name'
1010 =cut
1012 sub start_element {
1014 my ( $self, $data ) = @_;
1016 # we currently don't care about attributes
1017 my $nm = $data->{'Name'};
1018 my $type = $MODEMAP{$nm};
1019 if ($type) {
1020 if ( $self->_eventHandler->will_handle($type) ) {
1021 my $func = sprintf( "start_%s", lc $type );
1022 $self->_eventHandler->$func( $data->{'Attributes'} );
1024 unshift @{ $self->{'_elements'} }, $type;
1026 if ( defined $type
1027 && $type eq 'result' )
1029 $self->{'_values'} = {};
1030 $self->{'_result'} = undef;
1034 =head2 end_element
1036 Title : end_element
1037 Usage : $eventgeneartor->end_element
1038 Function: Handles and end element event
1039 Returns : none
1040 Args : hashref with at least 2 keys 'Data' and 'Name'
1042 =cut
1044 sub end_element {
1046 my ( $self, $data ) = @_;
1047 my $nm = $data->{'Name'};
1048 my $type = $MODEMAP{$nm};
1049 my $rc;
1051 if ( $nm eq 'HMMER_program' ) {
1052 if ( $self->{'_last_data'} =~ /([NP]?HMM\S+)/i ) {
1053 $self->{'_reporttype'} = uc $1;
1057 # Hsp are sort of weird, in that they end when another
1058 # object begins so have to detect this in end_element for now
1059 if ( $nm eq 'Hsp' ) {
1060 foreach my $line (qw(Hsp_csline Hsp_qseq Hsp_midline Hsp_hseq Hsp_pline)) {
1061 my $data = $self->{'_last_hspdata'}->{$line};
1062 if ( $data && $line eq 'Hsp_hseq' ) {
1064 # replace hmm '.' gap symbol by '-'
1065 $data =~ s/\./-/g;
1067 $self->element(
1068 { 'Name' => $line,
1069 'Data' => $data
1073 $self->{'_last_hspdata'} = {};
1075 if ($type) {
1076 if ( $self->_eventHandler->will_handle($type) ) {
1077 my $func = sprintf( "end_%s", lc $type );
1078 $rc = $self->_eventHandler->$func( $self->{'_reporttype'},
1079 $self->{'_values'} );
1081 my $lastelem = shift @{ $self->{'_elements'} };
1083 # Flush corresponding values from the {_values} buffer
1084 my $name = uc $type;
1085 foreach my $key (keys %{ $self->{_values} }) {
1086 delete $self->{_values}->{$key} if ($key =~ m/^$name-/);
1089 elsif ( $MAPPING{$nm} ) {
1090 if ( ref( $MAPPING{$nm} ) =~ /hash/i ) {
1091 my $key = ( keys %{ $MAPPING{$nm} } )[0];
1092 $self->{'_values'}->{$key}->{ $MAPPING{$nm}->{$key} }
1093 = $self->{'_last_data'};
1095 else {
1096 $self->{'_values'}->{ $MAPPING{$nm} } = $self->{'_last_data'};
1098 # print "lastdata is " . $self->{'_last_data'} . "\n";
1101 else {
1102 $self->debug("unknown nm $nm, ignoring\n");
1104 $self->{'_last_data'} = ''; # remove read data if we are at
1105 # end of an element
1106 $self->{'_result'} = $rc if ( defined $type && $type eq 'result' );
1107 return $rc;
1110 =head2 element
1112 Title : element
1113 Usage : $eventhandler->element({'Name' => $name, 'Data' => $str});
1114 Function: Convenience method that calls start_element, characters, end_element
1115 Returns : none
1116 Args : Hash ref with the keys 'Name' and 'Data'
1118 =cut
1120 sub element {
1121 my ( $self, $data ) = @_;
1122 $self->start_element($data);
1123 $self->characters($data);
1124 $self->end_element($data);
1127 =head2 get_from_element
1129 Title : get_from_element
1130 Usage : $self->get_from_element('HSP-hit_start');
1131 Function: Convenience method to retrieve data from '_values' hash
1132 Returns : string
1133 Args : key
1135 =cut
1137 sub get_from_element {
1138 my ($self,$key) = @_;
1139 my $values = $self->{_values};
1140 $values->{$key};
1143 =head2 characters
1145 Title : characters
1146 Usage : $eventgenerator->characters($str)
1147 Function: Send a character events
1148 Returns : none
1149 Args : string
1151 =cut
1153 sub characters {
1154 my ( $self, $data ) = @_;
1156 if ( $self->in_element('hsp')
1157 && $data->{'Name'} =~ /Hsp\_(?:qseq|hseq|csline|pline|midline)/o
1158 && defined $data->{'Data'} )
1160 $self->{'_last_hspdata'}->{ $data->{'Name'} } .= $data->{'Data'};
1162 return unless ( defined $data->{'Data'} && $data->{'Data'} !~ /^\s+$/o );
1164 $self->{'_last_data'} = $data->{'Data'};
1167 =head2 within_element
1169 Title : within_element
1170 Usage : if( $eventgenerator->within_element( $element ) ) {}
1171 Function: Test if we are within a particular element
1172 This is different than 'in' because within can be tested for
1173 a whole block
1174 Returns : boolean
1175 Args : string element name
1177 =cut
1179 sub within_element {
1180 my ( $self, $name ) = @_;
1181 return 0
1182 if ( !defined $name
1183 || !defined $self->{'_elements'}
1184 || scalar @{ $self->{'_elements'} } == 0 );
1185 foreach my $element ( @{ $self->{'_elements'} } ) {
1186 return 1 if ( $element eq $name );
1188 return 0;
1191 =head2 in_element
1193 Title : in_element
1194 Usage : if( $eventgenerator->in_element( $element ) ) {}
1195 Function: Test if we are in a particular element
1196 This is different than 'within' because 'in' only
1197 tests its immediate parent
1198 Returns : boolean
1199 Args : string element name
1201 =cut
1203 sub in_element {
1204 my ( $self, $name ) = @_;
1205 return 0 if !defined $self->{'_elements'}->[0];
1206 return ( $self->{'_elements'}->[0] eq $name );
1209 =head2 start_document
1211 Title : start_document
1212 Usage : $eventgenerator->start_document
1213 Function: Handle a start document event
1214 Returns : none
1215 Args : none
1217 =cut
1219 sub start_document {
1220 my ($self) = @_;
1221 $self->{'_lasttype'} = '';
1222 $self->{'_values'} = {};
1223 $self->{'_result'} = undef;
1224 $self->{'_elements'} = [];
1227 =head2 end_document
1229 Title : end_document
1230 Usage : $eventgenerator->end_document
1231 Function: Handles an end document event
1232 Returns : Bio::Search::Result::ResultI object
1233 Args : none
1235 =cut
1237 sub end_document {
1238 my ($self) = @_;
1239 return $self->{'_result'};
1242 =head2 result_count
1244 Title : result_count
1245 Usage : my $count = $searchio->result_count
1246 Function: Returns the number of results processed
1247 Returns : interger
1248 Args : none
1250 =cut
1252 sub result_count {
1253 my $self = shift;
1254 return $self->{'_result_count'};