2 # BioPerl module for Bio::SearchIO::hmmer3
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Thomas Sharpton <thomas.sharpton@gmail.com>
8 # Copyright Thomas Sharpton
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
22 my $searchio = Bio::SearchIO->new(
25 -file => 'hmmsearch.out'
28 my $result = $searchio->next_result;
29 my $hit = $result->next_hit;
30 print $hit->name, $hit->description, $hit->significance,
33 my $hsp = $hit->next_hsp;
34 print $hsp->start('hit'), $hsp->end('hit'), $hsp->start('query'),
35 $hsp->end('query'), "\n";
39 Code to parse output from hmmsearch, hmmscan, and nhmmer, compatible with
40 both version 2 and version 3 of the HMMER package from L<http://hmmer.org>.
46 User feedback is an integral part of the evolution of this and other
47 Bioperl modules. Send your comments and suggestions preferably to
48 the Bioperl mailing list. Your participation is much appreciated.
50 bioperl-l@bioperl.org - General discussion
51 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
55 Please direct usage questions or support issues to the mailing list:
57 L<bioperl-l@bioperl.org>
59 rather than to the module maintainer directly. Many experienced and
60 reponsive experts will be able look at the problem and quickly
61 address it. Please include a thorough description of the problem
62 with code and data examples if at all possible.
66 Report bugs to the Bioperl bug tracking system to help us keep track
67 of the bugs and their resolution. Bug reports can be submitted via
70 https://redmine.open-bio.org/projects/bioperl/
72 =head1 AUTHOR - Thomas Sharpton
74 Email thomas.sharpton@gmail.com
76 Describe contact details here
80 Additional contributors names and emails here
86 The rest of the documentation details each of the object methods.
87 Internal methods are usually preceded with a _
91 # Let the code begin...
93 package Bio
::SearchIO
::hmmer3
;
97 use Bio
::Factory
::ObjectFactory
;
98 use vars
qw(%MAPPING %MODEMAP);
99 use base qw(Bio::SearchIO::hmmer);
103 # mapping of HMMER items to Bioperl hash keys
105 'HMMER_Output' => 'result',
111 'Hsp_bit-score' => 'HSP-bits',
112 'Hsp_score' => 'HSP-score',
113 'Hsp_evalue' => 'HSP-evalue',
114 'Hsp_query-from' => 'HSP-query_start',
115 'Hsp_query-to' => 'HSP-query_end',
116 'Hsp_query-strand' => 'HSP-query_strand',
117 'Hsp_hit-from' => 'HSP-hit_start',
118 'Hsp_hit-to' => 'HSP-hit_end',
119 'Hsp_hit-strand' => 'HSP-hit_strand',
120 'Hsp_positive' => 'HSP-conserved',
121 'Hsp_identity' => 'HSP-identical',
122 'Hsp_gaps' => 'HSP-hsp_gaps',
123 'Hsp_hitgaps' => 'HSP-hit_gaps',
124 'Hsp_querygaps' => 'HSP-query_gaps',
125 'Hsp_qseq' => 'HSP-query_seq',
126 'Hsp_hseq' => 'HSP-hit_seq',
127 'Hsp_midline' => 'HSP-homology_seq',
128 'Hsp_align-len' => 'HSP-hsp_length',
129 'Hsp_query-frame' => 'HSP-query_frame',
130 'Hsp_hit-frame' => 'HSP-hit_frame',
132 'Hit_id' => 'HIT-name',
133 'Hit_len' => 'HIT-length',
134 'Hit_accession' => 'HIT-accession',
135 'Hit_desc' => 'HIT-description',
136 'Hit_signif' => 'HIT-significance',
137 'Hit_score' => 'HIT-score',
139 'HMMER_program' => 'RESULT-algorithm_name',
140 'HMMER_version' => 'RESULT-algorithm_version',
141 'HMMER_query-def' => 'RESULT-query_name',
142 'HMMER_query-len' => 'RESULT-query_length',
143 'HMMER_query-acc' => 'RESULT-query_accession',
144 'HMMER_querydesc' => 'RESULT-query_description',
145 'HMMER_hmm' => 'RESULT-hmm_name',
146 'HMMER_seqfile' => 'RESULT-sequence_file',
147 'HMMER_db' => 'RESULT-database_name',
154 Usage : my $hit = $searchio->next_result;
155 Function: Returns the next Result from a search
156 Returns : Bio::Search::Result::ResultI object
163 my $seentop = 0; # Placeholder for when we deal with multi-query reports
165 my ( $last, @hit_list, @hsp_list, %hspinfo, %hitinfo, %domaincounter );
169 my $verbose = $self->verbose; # cache for speed? JES's idea in hmmer.pm
170 $self->start_document();
173 # This is here to ensure that next_result doesn't produce infinite loop
174 if ( !defined( $_ = $self->_readline ) ) {
178 $self->_pushback($_);
181 # Regex goes here for HMMER3
182 # Start with hmmsearch processing
183 while ( defined( $_ = $self->_readline ) ) {
187 # Grab the program name
188 if ( $_ =~ m/^\#\s(\S+)\s\:\:\s/ ) {
191 # TO DO: customize the above regex to adapt to other
192 # program types (hmmscan, etc)
193 $self->start_element( { 'Name' => 'HMMER_Output' } );
194 $self->{'_result_count'}++; #Might need to move to another block
196 { 'Name' => 'HMMER_program',
202 # Get the HMMER package version and release date
203 elsif ( $_ =~ m/^\#\sHMMER\s+(\S+)\s+\((.+)\)/ ) {
205 my $versiondate = $2;
206 $self->{'_hmmidline'} = $_;
208 { 'Name' => 'HMMER_version',
215 elsif ( $_ =~ /^\#\squery \w+ file\:\s+(\S+)/ ) {
216 if ( $self->{'_reporttype'} eq 'HMMSEARCH'
217 || $self->{'_reporttype'} eq 'NHMMER' )
219 $self->{'_hmmfileline'} = $lineorig;
221 { 'Name' => 'HMMER_hmm',
226 elsif ( $self->{'_reporttype'} eq 'HMMSCAN' ) {
227 $self->{'_hmmseqline'} = $lineorig;
229 { 'Name' => 'HMMER_seqfile',
236 # If this is a report without alignments
237 elsif ( $_ =~ m/^\#\sshow\salignments\sin\soutput/ ) {
238 $self->{'_alnreport'} = 0;
241 # Get the database info
242 elsif ( $_ =~ m/^\#\starget\s\S+\sdatabase\:\s+(\S+)/ ) {
244 if ( $self->{'_reporttype'} eq 'HMMSEARCH'
245 || $self->{'_reporttype'} eq 'NHMMER' )
247 $self->{'_hmmseqline'} = $lineorig;
249 { 'Name' => 'HMMER_seqfile',
254 elsif ( $self->{'_reporttype'} eq 'HMMSCAN' ) {
255 $self->{'_hmmfileline'} = $lineorig;
257 { 'Name' => 'HMMER_hmm',
265 elsif ( $_ =~ s/^Query:\s+// ) {
267 # TO DO: code to deal with multi-query report
268 unless (s/\s+\[[L|M]\=(\d+)\]$//) {
269 warn "Error parsing length for query, offending line $_\n";
274 { 'Name' => 'HMMER_query-len',
279 { 'Name' => 'HMMER_query-def',
286 elsif ( $_ =~ s/^Accession:\s+// ) {
289 { 'Name' => 'HMMER_query-acc',
295 # Get description data
296 elsif ( $_ =~ s/^Description:\s+// ) {
299 { 'Name' => 'HMMER_querydesc',
305 # hmmsearch, nhmmer, and hmmscan-specific formatting here
307 defined $self->{'_reporttype'}
308 && ( $self->{'_reporttype'} eq 'HMMSEARCH'
309 || $self->{'_reporttype'} eq 'HMMSCAN'
310 || $self->{'_reporttype'} eq 'NHMMER' )
313 # Complete sequence table data above inclusion threshold,
314 # hmmsearch or hmmscan
315 if ( $_ =~ m/Scores for complete sequence/ ) {
316 while ( defined( $_ = $self->_readline ) ) {
317 if ( $_ =~ m/inclusion threshold/
318 || m/Domain( and alignment)? annotation for each/
319 || m/\[No hits detected/
322 $self->_pushback($_);
327 next if ( m/\-\-\-/ || m/^\s+E-value\s+score/ || m/^$/ );
328 my ($eval_full, $score_full, $bias_full, $eval_best,
329 $score_best, $bias_best, $exp, $n,
330 $hitid, $desc, @hitline
332 @hitline = split( " ", $_ );
333 $eval_full = shift @hitline;
334 $score_full = shift @hitline;
335 $bias_full = shift @hitline;
336 $eval_best = shift @hitline;
337 $score_best = shift @hitline;
338 $bias_best = shift @hitline;
339 $exp = shift @hitline;
341 $hitid = shift @hitline;
342 $desc = join " ", @hitline;
344 if ( !defined($desc) ) {
348 [ $hitid, $desc, $eval_full, $score_full ];
349 $hitinfo{$hitid} = $#hit_list;
354 if ( $_ =~ m/Scores for complete hits/ ) {
355 while ( defined( $_ = $self->_readline ) ) {
356 if ( $_ =~ m/inclusion threshold/
357 || m/Annotation for each hit/
358 || m/\[No hits detected/
361 $self->_pushback($_);
366 next if ( m/\-\-\-/ || m/^\s+E-value\s+score/ || m/^$/ );
367 my ($eval, $score, $bias, $hitid,
368 $start, $end, $desc, @hitline
370 @hitline = split( " ", $_ );
371 $eval = shift @hitline;
372 $score = shift @hitline;
373 $bias = shift @hitline;
374 $hitid = shift @hitline;
375 $start = shift @hitline;
376 $end = shift @hitline;
377 $desc = join ' ', @hitline;
379 $desc = '' if ( !defined($desc) );
381 push @hit_list, [ $hitid, $desc, $eval, $score ];
382 $hitinfo{$hitid} = $#hit_list;
386 # Complete sequence table data below inclusion threshold
387 # not currently fully implemented
388 elsif ( $_ =~ m/inclusion threshold/ ) {
389 while ( defined( $_ = $self->_readline ) ) {
390 if ( $_ =~ m/Domain( and alignment)? annotation for each/
391 || m/Internal pipeline statistics summary/ )
393 $self->_pushback($_);
396 next if ( $_ =~ m/^$/ );
397 my ($eval_full, $score_full, $bias_full, $eval_best,
398 $score_best, $bias_best, $exp, $n,
399 $hitid, $desc, @hitline
401 @hitline = split( " ", $_ );
402 $eval_full = shift @hitline;
403 $score_full = shift @hitline;
404 $bias_full = shift @hitline;
405 $eval_best = shift @hitline;
406 $score_best = shift @hitline;
407 $bias_best = shift @hitline;
408 $exp = shift @hitline;
410 $hitid = shift @hitline;
411 $desc = join " ", @hitline;
413 $hitinfo{$hitid} = "below_inclusion";
417 # Domain annotation for each sequence table data, hmmscan
418 elsif ( $_ =~ m/Domain( and alignment)? annotation for each/ ) {
419 @hsp_list = (); # Here for multi-query reports
422 while ( defined( $_ = $self->_readline ) ) {
423 if ( $_ =~ m/Internal pipeline statistics/
424 || m/\[No targets detected/ )
426 $self->_pushback($_);
429 if ( $_ =~ m/^\>\>\s(.*?)\s+/ ) {
432 # Skip hits below inclusion threshold
433 next if ( $hitinfo{$name} eq "below_inclusion" );
434 $domaincounter{$name} = 0;
436 while ( defined( $_ = $self->_readline ) ) {
438 # Grab table data for sequence
439 if ( $_ =~ m/Internal pipeline statistics/
442 $self->_pushback($_);
445 if ( $_ =~ m/Alignments for each domain/ ) {
446 $self->_pushback($_);
449 if ( $_ =~ m/^\s+\#\s+score/
450 || $_ =~ m/^\s\-\-\-\s+/
460 # Grab hsp data from table, push into @hsp;
461 if (my ($domain_num, $score, $bias,
462 $ceval, $ieval, $hmmstart,
463 $hmmstop, $qalistart, $qalistop,
464 $envstart, $envstop, $envbound,
467 = m
|^\s
+(\d
+)\s\
!*\?*\s
+ # domain number
468 (\S
+)\s
+(\S
+)\s
+ # score, bias
469 (\S
+)\s
+(\S
+)\s
+ # c-eval, i-eval
470 (\d
+)\s
+(\d
+).+?
# hmm start, stop
471 (\d
+)\s
+(\d
+).+?
# query start, stop
472 (\d
+)\s
+(\d
+).+?
# env start, stop
477 # Keep it simple for now. let's customize later
480 $qalistart, $qalistop,
485 my $info = $hit_list[ $hitinfo{$name} ];
486 if ( !defined $info ) {
488 "Incomplete sequence information; can't find $name, hitinfo says $hitinfo{$name}\n"
492 $domaincounter{$name}++;
494 = $name . "_" . $domaincounter{$name};
495 push @hsp_list, [ $name, @vals ];
496 $hspinfo{$hsp_key} = $#hsp_list;
499 print "missed this line: $_\n";
503 elsif ( $_ =~ m/Alignments for each domain/ ) {
504 my $domain_count = 0;
509 # There's an optional block, so we sometimes need to
510 # count to 3, and sometimes to 4.
514 my ( $hline, $midline, $qline );
516 while ( defined( $_ = $self->_readline ) ) {
518 || $_ =~ m/Internal pipeline statistics/ )
520 $self->_pushback($_);
523 elsif ($hitinfo{$name} eq "below_inclusion"
528 elsif ( $_ =~ /\s\s\=\=\sdomain\s(\d+)\s+/ ) {
531 my $key = $name . "_" . $domainnum;
532 $hsp = $hsp_list[ $hspinfo{$key} ];
534 $midline = $$hsp[-2];
539 # model data track, some reports don't have
540 elsif ( $_ =~ m/\s+\S+\sCS$/ ) {
546 elsif ( $count == $max_count - 3 ) {
549 my @data = split( " ", $_ );
555 elsif ( $count == $max_count - 2 ) {
558 # storage isn't quite right - need to remove
559 # leading/lagging whitespace while preserving
560 # gap data (latter isn't done, former is)
567 elsif ( $count == $max_count - 1 ) {
570 my @data = split( " ", $_ );
576 elsif ( $count == $max_count ) {
583 $$hsp[-2] = $midline;
595 # Annotation for each hit, nhmmer
596 # This code is currently incomplete, the alignment strings
597 # are not being captured
598 elsif ( $_ =~ m/Annotation for each hit\s+\(and alignments\)/ ) {
602 while ( defined( $_ = $self->_readline ) ) {
603 if ( $_ =~ m/Internal pipeline statistics/
604 || m/\[No targets detected/ )
606 $self->_pushback($_);
609 if ( $_ =~ m/^>>\s(.*?)\s+/ ) {
612 while ( defined( $_ = $self->_readline ) ) {
614 if ( $_ =~ m/Internal pipeline statistics/
617 $self->_pushback($_);
621 $_ =~ /^\s+#\s+score/
622 || $_ =~ /^\s+------\s+/
625 || $_ =~ /^\s+Alignment:/
626 || $_ =~ /^\s+score:/
627 || $_ =~ /^\s+score\s+bias/
628 || $_ =~ /^\s+\S+\s+\d+\s+([\s+gatc-]+)/ # Alignment, line 1
629 || $_ =~ /^\s{20,}([\s+gatc-]+)/ # Alignment, line 2
630 || $_ =~ /^\s+$name\s+\d+\s+([\s+GATC-]+)/ # Alignment, line 3
631 || $_ =~ /^\s+[\d.\*]+/ # Alignment, line 4
639 (\d
+)\s
+(\d
+)\s
+[.\
[\
]]*\s
+
640 (\d
+)\s
+(\d
+)\s
+[.\
[\
]]*\s
+
641 (\d
+)\s
+(\d
+)\s
+[.\
[\
]]*\s
+
645 my ($score, $bias, $eval,
646 $hmmstart, $hmmstop, $hitstart,
647 $hitstop, $envstart, $envstop,
651 $1, $2, $3, $4, $5, $6,
662 my $info = $hit_list[ $hitinfo{$name} ];
663 if ( !defined $info ) {
665 "Incomplete information: can't find HSP $name in list of hits\n"
669 $domaincounter{$name}++;
670 my $hsp_key = $name . "_" . $domaincounter{$name};
671 push @hsp_list, [ $name, @vals ];
672 $hspinfo{$hsp_key} = $#hsp_list;
675 print "Missed this line: $_\n";
682 elsif ( m/Internal pipeline statistics/ || m!^//! ) {
684 # If within hit, hsp close;
685 if ( $self->within_element('hit') ) {
686 if ( $self->within_element('hsp') ) {
687 $self->end_element( { 'Name' => 'Hsp' } );
689 $self->end_element( { 'Name' => 'Hit' } );
692 # Grab summary statistics of run
693 while ( defined( $_ = $self->_readline ) ) {
694 last if ( $_ =~ m/^\/\
/$/ );
697 # Do a lot of processing of hits and hsps here
698 while ( my $hit = shift @hit_list ) {
699 my $hit_name = shift @
$hit;
700 my $hit_desc = shift @
$hit;
701 my $hit_signif = shift @
$hit;
702 my $hit_score = shift @
$hit;
703 my $num_domains = $domaincounter{$hit_name} || 0;
705 $self->start_element( { 'Name' => 'Hit' } );
707 { 'Name' => 'Hit_id',
712 { 'Name' => 'Hit_desc',
717 { 'Name' => 'Hit_signif',
718 'Data' => $hit_signif
722 { 'Name' => 'Hit_score',
727 for my $i ( 1 .. $num_domains ) {
728 my $key = $hit_name . "_" . $i;
729 my $hsp = $hsp_list[ $hspinfo{$key} ];
730 if ( defined $hsp ) {
731 my $hsp_name = shift @
$hsp;
732 $self->start_element( { 'Name' => 'Hsp' } );
734 { 'Name' => 'Hsp_identity',
739 { 'Name' => 'Hsp_positive',
744 { 'Name' => 'Hsp_hit-from',
745 'Data' => shift @
$hsp
749 { 'Name' => 'Hsp_hit-to',
750 'Data' => shift @
$hsp
754 { 'Name' => 'Hsp_query-from',
755 'Data' => shift @
$hsp
759 { 'Name' => 'Hsp_query-to',
760 'Data' => shift @
$hsp
764 { 'Name' => 'Hsp_score',
765 'Data' => shift @
$hsp
769 { 'Name' => 'Hsp_evalue',
770 'Data' => shift @
$hsp
774 { 'Name' => 'Hsp_hseq',
775 'Data' => shift @
$hsp
779 { 'Name' => 'Hsp_midline',
780 'Data' => shift @
$hsp
784 { 'Name' => 'Hsp_qseq',
785 'Data' => shift @
$hsp
788 $self->end_element( { 'Name' => 'Hsp' } );
791 $self->end_element( { 'Name' => 'Hit' } );
799 print "Missed line: $_\n";
804 $self->end_element( { 'Name' => 'HMMER_Output' } );
805 my $result = $self->end_document();
811 Title : start_element
812 Usage : $eventgenerator->start_element
813 Function: Handles a start event
815 Args : hashref with at least 2 keys 'Data' and 'Name'
821 my ( $self, $data ) = @_;
823 # we currently don't care about attributes
824 my $nm = $data->{'Name'};
825 my $type = $MODEMAP{$nm};
827 if ( $self->_eventHandler->will_handle($type) ) {
828 my $func = sprintf( "start_%s", lc $type );
829 $self->_eventHandler->$func( $data->{'Attributes'} );
831 unshift @
{ $self->{'_elements'} }, $type;
834 && $type eq 'result' )
836 $self->{'_values'} = {};
837 $self->{'_result'} = undef;
844 Usage : $eventgeneartor->end_element
845 Function: Handles and end element event
847 Args : hashref with at least 2 keys 'Data' and 'Name'
853 my ( $self, $data ) = @_;
854 my $nm = $data->{'Name'};
855 my $type = $MODEMAP{$nm};
858 if ( $nm eq 'HMMER_program' ) {
859 if ( $self->{'_last_data'} =~ /(N?HMM\S+)/i ) {
860 $self->{'_reporttype'} = uc $1;
864 # Hsp are sort of weird, in that they end when another
865 # object begins so have to detect this in end_element for now
866 if ( $nm eq 'Hsp' ) {
867 foreach (qw(Hsp_qseq Hsp_midline Hsp_hseq)) {
868 my $data = $self->{'_last_hspdata'}->{$_};
869 if ( $data && $_ eq 'Hsp_hseq' ) {
871 # replace hmm '.' gap symbol by '-'
880 $self->{'_last_hspdata'} = {};
883 if ( $self->_eventHandler->will_handle($type) ) {
884 my $func = sprintf( "end_%s", lc $type );
885 $rc = $self->_eventHandler->$func( $self->{'_reporttype'},
886 $self->{'_values'} );
888 my $lastelem = shift @
{ $self->{'_elements'} };
890 elsif ( $MAPPING{$nm} ) {
891 if ( ref( $MAPPING{$nm} ) =~ /hash/i ) {
892 my $key = ( keys %{ $MAPPING{$nm} } )[0];
893 $self->{'_values'}->{$key}->{ $MAPPING{$nm}->{$key} }
894 = $self->{'_last_data'};
897 $self->{'_values'}->{ $MAPPING{$nm} } = $self->{'_last_data'};
899 # print "lastdata is " . $self->{'_last_data'} . "\n";
903 $self->debug("unknown nm $nm, ignoring\n");
905 $self->{'_last_data'} = ''; # remove read data if we are at
907 $self->{'_result'} = $rc if ( defined $type && $type eq 'result' );
914 Usage : $eventhandler->element({'Name' => $name, 'Data' => $str});
915 Function: Convienence method that calls start_element, characters, end_element
917 Args : Hash ref with the keys 'Name' and 'Data'
922 my ( $self, $data ) = @_;
923 $self->start_element($data);
924 $self->characters($data);
925 $self->end_element($data);
931 Usage : $eventgenerator->characters($str)
932 Function: Send a character events
939 my ( $self, $data ) = @_;
941 if ( $self->in_element('hsp')
942 && $data->{'Name'} =~ /Hsp\_(qseq|hseq|midline)/o
943 && defined $data->{'Data'} )
945 $self->{'_last_hspdata'}->{ $data->{'Name'} } .= $data->{'Data'};
947 return unless ( defined $data->{'Data'} && $data->{'Data'} !~ /^\s+$/o );
949 $self->{'_last_data'} = $data->{'Data'};
952 =head2 within_element
954 Title : within_element
955 Usage : if( $eventgenerator->within_element( $element ) ) {}
956 Function: Test if we are within a particular element
957 This is different than 'in' because within can be tested for
960 Args : string element name
965 my ( $self, $name ) = @_;
968 || !defined $self->{'_elements'}
969 || scalar @
{ $self->{'_elements'} } == 0 );
970 foreach ( @
{ $self->{'_elements'} } ) {
971 return 1 if ( $_ eq $name );
979 Usage : if( $eventgenerator->in_element( $element ) ) {}
980 Function: Test if we are in a particular element
981 This is different than 'within' because 'in' only
982 tests its immediate parent
984 Args : string element name
989 my ( $self, $name ) = @_;
990 return 0 if !defined $self->{'_elements'}->[0];
991 return ( $self->{'_elements'}->[0] eq $name );
994 =head2 start_document
996 Title : start_document
997 Usage : $eventgenerator->start_document
998 Function: Handle a start document event
1004 sub start_document
{
1006 $self->{'_lasttype'} = '';
1007 $self->{'_values'} = {};
1008 $self->{'_result'} = undef;
1009 $self->{'_elements'} = [];
1014 Title : end_document
1015 Usage : $eventgenerator->end_document
1016 Function: Handles an end document event
1017 Returns : Bio::Search::Result::ResultI object
1024 return $self->{'_result'};
1029 Title : result_count
1030 Usage : my $count = $searchio->result_count
1031 Function: Returns the number of results processed
1039 return $self->{'_result_count'};