2 # Perl Module for HMMResults
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Ewan Birney <birney@sanger.ac.uk>
8 #Copyright Genome Research Limited (1997).
12 Bio::Tools::HMMER::Results - Object representing HMMER output results
16 # parse a hmmsearch file (can also parse a hmmpfam file)
17 $res = Bio::Tools::HMMER::Results->new( -file => 'output.hmm' ,
18 -type => 'hmmsearch');
20 # print out the results for each sequence
21 foreach $seq ( $res->each_Set ) {
22 print "Sequence bit score is",$seq->bits,"\n";
23 foreach $domain ( $seq->each_Domain ) {
24 print " Domain start ",$domain->start," end ",$domain->end,
25 " score ",$domain->bits,"\n";
29 # new result object on a sequence/domain cutoff of
30 # 25 bits sequence, 15 bits domain
31 $newresult = $res->filter_on_cutoff(25,15);
33 # alternative way of getting out all domains directly
34 foreach $domain ( $res->each_Domain ) {
35 print "Domain on ",$domain->seq_id," with score ",
36 $domain->bits," evalue ",$domain->evalue,"\n";
41 This object represents HMMER output, either from hmmsearch or
42 hmmpfam. For hmmsearch, a series of HMMER::Set objects are made, one
43 for each sequence, which have the the bits score for the object. For
44 hmmpfam searches, only one Set object is made.
47 These objects come from the original HMMResults modules used
48 internally in Pfam, written by Ewan Birney. Ewan then converted them to
49 BioPerl objects in 1999. That conversion is meant to be backwardly
50 compatible, but may not be (caveat emptor).
56 User feedback is an integral part of the evolution of this and other
57 Bioperl modules. Send your comments and suggestions preferably to one
58 of the Bioperl mailing lists. Your participation is much appreciated.
60 bioperl-l@bioperl.org - General discussion
61 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
65 Please direct usage questions or support issues to the mailing list:
67 I<bioperl-l@bioperl.org>
69 rather than to the module maintainer directly. Many experienced and
70 reponsive experts will be able look at the problem and quickly
71 address it. Please include a thorough description of the problem
72 with code and data examples if at all possible.
76 Report bugs to the Bioperl bug tracking system to help us keep track
77 the bugs and their resolution. Bug reports can be submitted via the
80 https://github.com/bioperl/bioperl-live/issues
82 =head1 AUTHOR - Ewan Birney
84 Email birney@ebi.ac.uk
88 Jason Stajich, jason-at-bioperl.org
92 The rest of the documentation details each of the object
93 methods. Internal methods are usually preceded with a _
97 package Bio
::Tools
::HMMER
::Results
;
101 use Bio
::Tools
::HMMER
::Domain
;
102 use Bio
::Tools
::HMMER
::Set
;
105 use base
qw(Bio::Root::Root Bio::Root::IO Bio::SeqAnalysisParserI);
108 my($class,@args) = @_;
110 my $self = $class->SUPER::new
(@args);
112 $self->{'domain'} = []; # array of HMMUnits
115 my ($parsetype) = $self->_rearrange([qw(TYPE)],@args);
116 $self->_initialize_io(@args);
117 if( !defined $parsetype ) {
118 $self->throw("No parse type provided. should be hmmsearch or hmmpfam");
120 $self->parsetype($parsetype);
121 if( defined $self->_fh() ) {
122 if( $parsetype eq 'hmmsearch' ) {
123 $self->_parse_hmmsearch($self->_fh());
124 } elsif ( $parsetype eq 'hmmpfam' ) {
125 $self->_parse_hmmpfam($self->_fh());
127 $self->throw("Did not recoginise type $parsetype");
131 return $self; # success - we hope!
138 Usage : while( my $feat = $res->next_feature ) { # do something }
139 Function: SeqAnalysisParserI implementing function
141 Returns : A Bio::SeqFeatureI compliant object, in this case,
142 each DomainUnit object, ie, flattening the Sequence
152 if( $self->{'_started_next_feature'} == 1 ) {
153 return shift @
{$self->{'_next_feature_array'}};
155 $self->{'_started_next_feature'} = 1;
157 foreach my $seq ( $self->each_Set() ) {
158 foreach my $unit ( $seq->each_Domain() ) {
162 my $res = shift @array;
163 $self->{'_next_feature_array'} = \
@array;
167 $self->throw("Should not reach here! Error!");
174 Usage : print "There are ",$res->number," domains hit\n";
175 Function: provides the number of domains in the HMMER report
183 $ref = $self->{'domain'};
186 @val = @
{$self->{'domain'}};
193 Usage : $obj->seqfile($newval)
196 Returns : value of seqfile
197 Args : newvalue (optional)
203 my ($self,$value) = @_;
204 if( defined $value) {
205 $self->{'seqfile'} = $value;
207 return $self->{'seqfile'};
214 Usage : $obj->hmmfile($newval)
217 Returns : value of hmmfile
218 Args : newvalue (optional)
224 my ($self,$value) = @_;
225 if( defined $value) {
226 $self->{'hmmfile'} = $value;
228 return $self->{'hmmfile'};
235 Usage : $res->add_Domain($unit)
236 Function: adds a domain to the results array. Mainly used internally.
237 Args : A Bio::Tools::HMMER::Domain
247 $name = $unit->seq_id();
249 if( ! exists $self->{'seq'}->{$name} ) {
250 $self->warn("Adding a domain of $name but with no HMMSequence. Will be kept in domain array but not added to a HMMSequence");
252 $self->{'seq'}->{$name}->add_Domain($unit);
254 push(@
{$self->{'domain'}},$unit);
261 Usage : foreach $domain ( $res->each_Domain() )
262 Function: array of Domain units which are held in this report
273 foreach $u ( @
{$self->{'domain'}} ) {
281 =head2 domain_bits_cutoff_from_evalue
283 Title : domain_bits_cutoff_from_evalue
284 Usage : $cutoff = domain_bits_cutoff_from_evalue(0.01);
285 Function: return a bits cutoff from an evalue using the
286 scores here. Somewhat interesting logic:
287 Find the two bit score which straddle the evalue
288 if( 25 is between these two points) return 25
289 else return the midpoint.
291 This logic tries to ensure that with large signal to
292 noise separation one still has sensible 25 bit cutoff
298 sub domain_bits_cutoff_from_evalue
{
301 my ($dom,$prev,@doms,$cutoff,$sep,$seen);
303 @doms = $self->each_Domain;
306 @doms = map { $_->[0] }
307 sort { $b->[1] <=> $a->[1] }
308 map { [ $_, $_->bits] } @doms;
310 foreach $_ ( @doms ) {
311 if( $_->evalue > $eval ) {
319 if( ! defined $prev || $seen == 0) {
320 $self->throw("Evalue is either above or below the list...");
324 $sep = $prev->bits - $dom->bits ;
327 return $prev->bits();
329 if( $dom->bits < 25 && $prev->bits > 25 ) {
333 return int( $dom->bits + $sep/2 ) ;
338 sub dictate_hmm_acc
{
344 foreach $unit ( $self->eachHMMUnit() ) {
349 =head2 write_FT_output
351 Title : write_FT_output
352 Usage : $res->write_FT_output(\*STDOUT,'DOMAIN')
353 Function: writes feature table output ala swissprot
360 sub write_FT_output
{
366 if( !defined $idt ) {
370 foreach $seq ( $self->each_Set() ) {
371 print $file sprintf("ID %s\n",$seq->name());
372 foreach $unit ( $seq->each_Domain() ) {
373 print $file sprintf("FT %s %d %d %s\n",$idt,
374 $unit->start,$unit->end,$unit->hmmname);
380 =head2 filter_on_cutoff
382 Title : filter_on_cutoff
383 Usage : $newresults = $results->filter_on_cutoff(25,15);
384 Function: Produces a new HMMER::Results module which has
385 been trimmed at the cutoff.
386 Returns : a Bio::Tools::HMMER::Results module
387 Args : sequence cutoff and domain cutoff. in bits score
388 if you want one cutoff, simply use same number both places
392 sub filter_on_cutoff
{
396 my ($new,$seq,$unit,@array,@narray);
398 if( !defined $domthr ) {
399 $self->throw("hmmresults filter on cutoff needs two arguments");
402 $new = Bio
::Tools
::HMMER
::Results
->new(-type
=> $self->parsetype);
404 foreach $seq ( $self->each_Set()) {
405 next if( $seq->bits() < $seqthr );
407 foreach $unit ( $seq->each_Domain() ) {
408 next if( $unit->bits() < $domthr );
409 $new->add_Domain($unit);
415 =head2 write_ascii_out
417 Title : write_ascii_out
418 Usage : $res->write_ascii_out(\*STDOUT)
420 seq seq_start seq_end model-acc model_start model_end model_name
424 FIXME: Now that we have no modelacc, this is probably a bad thing.
428 # writes as seq sstart send modelacc hstart hend modelname
430 sub write_ascii_out
{
440 foreach $seq ( $self->each_Set()) {
441 foreach $unit ( $seq->each_Domain()) {
442 print $fh sprintf("%s %4d %4d %s %4d %4d %4.2f %4.2g %s\n",
443 $unit->seq_id(),$unit->start(),$unit->end(),
444 $unit->hmmacc,$unit->hstart,$unit->hend,
445 $unit->bits,$unit->evalue,$unit->hmmname);
451 =head2 write_GDF_bits
453 Title : write_GDF_bits
454 Usage : $res->write_GDF_bits(25,15,\*STDOUT)
455 Function: writes GDF format with a sequence,domain threshold
470 if( !defined $file ) {
471 $self->throw("Attempting to use write_GDF_bits without passing in correct arguments!");
475 foreach $seq ( $self->each_Set()) {
477 if( $seq->bits() < $seqt ) {
481 foreach $unit ( $seq->each_Domain() ) {
482 if( $unit->bits() < $domt ) {
490 @narray = sort { my ($aa,$bb,$st_a,$st_b);
496 return $st_a <=> $st_b;
502 foreach $unit ( @narray ) {
503 print $file sprintf("%-24s\t%6d\t%6d\t%15s\t%.1f\t%g\n",$unit->get_nse(),$unit->start(),$unit->end(),$unit->seq_id(),$unit->bits(),$unit->evalue);
508 sub write_scores_bits
{
517 if( !defined $file ) {
518 $self->warn("Attempting to use write_scores_bits without passing in correct arguments!");
522 foreach $seq ( $self->eachHMMSequence()) {
524 if( $seq->bits() < $seqt ) {
528 foreach $unit ( $seq->eachHMMUnit() ) {
529 if( $unit->bits() < $domt ) {
537 @narray = sort { my ($aa,$bb,$st_a,$st_b);
543 foreach $unit ( @narray ) {
544 print $file sprintf("%4.2f %s\n",$unit->bits(),$unit->get_nse());
554 if( !defined $file ) {
559 foreach $unit ( $self->eachHMMUnit() ) {
560 print $file sprintf("%-24s\t%6d\t%6d\t%15s\t%.1f\t%g\n",$unit->get_nse(),$unit->start(),$unit->end(),$unit->seq_id(),$unit->bits(),$unit->evalue);
569 my ($seq,$unit,$hseq,$hdom,$noiseseq,$noisedom);
571 $hseq = $hdom = -100000;
573 foreach $seq ( $self->eachHMMSequence()) {
574 if( $seq->bits() < $seqt && $seq->bits() > $hseq ) {
575 $hseq = $seq->bits();
578 foreach $unit ( $seq->eachHMMUnit() ) {
579 if( (($seq->bits() < $seqt) || ($seq->bits() > $seqt && $unit->bits < $domt)) && $unit->bits() > $hdom ) {
580 $hdom = $unit->bits();
587 return ($noiseseq,$noisedom);
596 my ($seq,$unit,$lowseq,$lowdom,$trueseq,$truedom);
598 if( ! defined $domt ) {
599 $self->warn("lowest true needs at least a domain threshold cut-off");
603 $lowseq = $lowdom = 100000;
605 foreach $seq ( $self->eachHMMSequence()) {
607 if( $seq->bits() >= $seqt && $seq->bits() < $lowseq ) {
608 $lowseq = $seq->bits();
611 if( $seq->bits() < $seqt ) {
615 foreach $unit ( $seq->eachHMMUnit() ) {
616 if( $unit->bits() >= $domt && $unit->bits() < $lowdom ) {
617 $lowdom = $unit->bits();
624 return ($trueseq,$truedom);
633 Usage : Mainly internal function
646 $name = $seq->name();
648 if( exists $self->{'seq'}->{$name} ) {
649 $self->throw("You alredy have $name in HMMResults!");
651 $self->{'seq'}->{$name} = $seq;
671 foreach $name ( keys %{$self->{'seq'}} ) {
672 push(@array,$self->{'seq'}->{$name});
681 Usage : $set = $res->get_Set('sequence-name');
682 Function: returns the Set for a particular sequence
683 Returns : a HMMER::Set object
684 Args : name of the sequence
693 return $self->{'seq'}->{$name};
697 =head2 _parse_hmmpfam
699 Title : _parse_hmmpfam
700 Usage : $res->_parse_hmmpfam($filehandle)
712 my ($id,$sqfrom,$sqto,$hmmf,$hmmt,$sc,$ev,
713 $unit,$nd,$seq,$name,$seqname,$from,
714 $to,%hash,%acc,$acc);
718 if( /^HMM file:\s+(\S+)/ ) { $self->hmmfile($1); next; }
719 elsif( /^Sequence file:\s+(\S+)/ ) { $self->seqfile($1); next }
720 elsif( /^Query(\s+sequence)?:\s+(\S+)/ ) {
724 $seq = Bio
::Tools
::HMMER
::Set
->new();
726 $seq ->name($seqname);
727 $self->add_Set($seq);
732 if( /Accession:\s+(\S+)/ ) { $seq->accession($1); next }
733 elsif( s/^Description:\s+// ) { chomp; $seq->desc($_); next }
734 /^Parsed for domains/ && last;
736 # This is to parse out the accession numbers in old Pfam format.
737 # now not support due to changes in HMMER.
739 if( (($id,$acc, $sc, $ev, $nd) = /^\s*(\S+)\s+(\S+).+?\s(\S+)\s+(\S+)\s+(\d+)\s*$/)) {
740 $hash{$id} = $sc; # we need this for the sequence
741 # core of the domains below!
744 # this is the more common parsing routine
745 } elsif ( (($id,$sc, $ev, $nd) =
746 /^\s*(\S+).+?\s(\S+)\s+(\S+)\s+(\d+)\s*$/) ) {
748 $hash{$id} = $sc; # we need this for the
749 # sequence score of hte domains below!
757 # this is meant to match
759 #Sequence Domain seq-f seq-t hmm-f hmm-t score E-value
760 #-------- ------- ----- ----- ----- ----- ----- -------
761 #PF00621 1/1 198 372 .. 1 207 [] 281.6 1e-80
763 if( (($id, $sqfrom, $sqto, $hmmf,$hmmt,$sc, $ev) =
764 /(\S+)\s+\S+\s+(\d+)\s+(\d+).+?(\d+)\s+(\d+)\s+\S+\s+(\S+)\s+(\S+)\s*$/)) {
765 $unit = Bio
::Tools
::HMMER
::Domain
->new();
766 $unit->seq_id ($seqname);
767 $unit->hmmname ($id);
768 $unit->start ($sqfrom);
770 $unit->hstart($hmmf);
775 if( !exists($hash{$id}) ) {
776 $self->throw("HMMResults parsing error in hmmpfam for $id - can't find sequecne score");
779 $unit->seqbits($hash{$id});
781 if( defined $acc{$id} ) {
782 $unit->hmmacc($acc{$id});
785 # this should find it's own sequence!
786 $self->add_Domain($unit);
789 if( m{^//} ) { next; }
792 # parses alignment lines. Icky as we have to break on the same line
793 # that we need to read to place the alignment lines with the unit.
796 (!defined $_ || m{^//}) && last;
799 # PF00621: domain 1 of 1, from 198 to 372
800 if( /^\s*(\S+):.*from\s+(\d+)\s+to\s+(\d+)/ ) {
806 # find the HMMUnit which this alignment is from
808 $unit = $self->get_unit_nse($seqname,$name,$from,$to);
809 if( !defined $unit ) {
810 $self->warn("Could not find $name $from $to unit even though I am reading it in. ugh!");
816 /^\s*\S+:.*from\s+\d+\s+to\s+\d+/ && last;
817 $unit->add_alignment_line($_);
824 # back to main 'Query:' loop
829 # mainly internal function
840 $seq = $self->get_Set($seqname);
842 if( !defined $seq ) {
843 $self->throw("Could not get sequence name $seqname - so can't get its unit");
846 foreach $unit ( $seq->each_Domain() ) {
847 if( $unit->hmmname() eq $domname && $unit->start() == $start && $unit->end() == $end ) {
856 =head2 _parse_hmmsearch
858 Title : _parse_hmmsearch
859 Usage : $res->_parse_hmmsearch($filehandle)
867 sub _parse_hmmsearch
{
870 my ($id,$sqfrom,$sqto,$sc,$ev,$unit,$nd,$seq,$hmmf,$hmmt,
871 $hmmfname,$hmmacc, $hmmid, %seqh);
875 /^HMM file:\s+(\S+)/ and do { $self->hmmfile($1); $hmmfname = $1 };
876 /^Accession:\s+(\S+)/ and do { $hmmacc = $1 };
877 /^Query HMM:\s+(\S+)/ and do { $hmmid = $1 };
878 /^Sequence database:\s+(\S+)/ and do { $self->seqfile($1) };
879 /^Scores for complete sequences/ && last;
882 $hmmfname = "given" if not $hmmfname;
885 /^Parsed for domains/ && last;
886 if( (($id, $sc, $ev, $nd) = /(\S+).+?\s(\S+)\s+(\S+)\s+(\d+)\s*$/)) {
887 $seq = Bio
::Tools
::HMMER
::Set
->new();
892 $self->add_Set($seq);
893 $seq->accession($hmmacc);
898 /^Alignments of top-scoring domains/ && last;
899 if( (($id, $sqfrom, $sqto, $hmmf, $hmmt, $sc, $ev) = /(\S+)\s+\S+\s+(\d+)\s+(\d+).+?(\d+)\s+(\d+)\s+\S+\s+(\S+)\s+(\S+)\s*$/)) {
900 $unit = Bio
::Tools
::HMMER
::Domain
->new();
903 $unit->hmmname($hmmfname);
904 $unit->start($sqfrom);
907 $unit->hstart($hmmf);
910 $unit->seqbits($seqh{$id});
911 $self->add_Domain($unit);
918 ## Recognize and store domain alignments
924 /^Histogram of all scores/ && last;
927 # PF00621: domain 1 of 1, from 198 to 372
928 if( /^\s*(\S+):.*from\s+(\d+)\s+to\s+(\d+)/ ) {
933 # find the HMMUnit which this alignment is from
934 $unit = $self->get_unit_nse($name,$hmmfname,$from,$to);
936 if( !defined $unit ) {
937 $self->warn("Could not find $name $from $to unit even though I am reading it in. ugh!");
941 /^Histogram of all scores/ && last;
942 /^\s*\S+:.*from\s+\d+\s+to\s+\d+/ && last;
943 $unit->add_alignment_line($_);
957 Usage : $obj->parsetype($newval)
959 Returns : value of parsetype
960 Args : newvalue (optional)
966 my ($self,$value) = @_;
967 if( defined $value) {
968 $self->{'_parsetype'} = $value;
970 return $self->{'_parsetype'};