3 # Perl Module for HMMResults
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Ewan Birney <birney@sanger.ac.uk>
9 #Copyright Genome Research Limited (1997).
13 Bio::Tools::HMMER::Results - Object representing HMMER output results
17 # parse a hmmsearch file (can also parse a hmmpfam file)
18 $res = Bio::Tools::HMMER::Results->new( -file => 'output.hmm' ,
19 -type => 'hmmsearch');
21 # print out the results for each sequence
22 foreach $seq ( $res->each_Set ) {
23 print "Sequence bit score is",$seq->bits,"\n";
24 foreach $domain ( $seq->each_Domain ) {
25 print " Domain start ",$domain->start," end ",$domain->end,
26 " score ",$domain->bits,"\n";
30 # new result object on a sequence/domain cutoff of
31 # 25 bits sequence, 15 bits domain
32 $newresult = $res->filter_on_cutoff(25,15);
34 # alternative way of getting out all domains directly
35 foreach $domain ( $res->each_Domain ) {
36 print "Domain on ",$domain->seq_id," with score ",
37 $domain->bits," evalue ",$domain->evalue,"\n";
42 This object represents HMMER output, either from hmmsearch or
43 hmmpfam. For hmmsearch, a series of HMMER::Set objects are made, one
44 for each sequence, which have the the bits score for the object. For
45 hmmpfam searches, only one Set object is made.
48 These objects come from the original HMMResults modules used
49 internally in Pfam, written by Ewan Birney. Ewan then converted them to
50 BioPerl objects in 1999. That conversion is meant to be backwardly
51 compatible, but may not be (caveat emptor).
57 User feedback is an integral part of the evolution of this and other
58 Bioperl modules. Send your comments and suggestions preferably to one
59 of the Bioperl mailing lists. Your participation is much appreciated.
61 bioperl-l@bioperl.org - General discussion
62 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
66 Please direct usage questions or support issues to the mailing list:
68 L<bioperl-l@bioperl.org>
70 rather than to the module maintainer directly. Many experienced and
71 reponsive experts will be able look at the problem and quickly
72 address it. Please include a thorough description of the problem
73 with code and data examples if at all possible.
77 Report bugs to the Bioperl bug tracking system to help us keep track
78 the bugs and their resolution. Bug reports can be submitted via the
81 http://bugzilla.open-bio.org/
83 =head1 AUTHOR - Ewan Birney
85 Email birney@ebi.ac.uk
89 Jason Stajich, jason-at-bioperl.org
93 The rest of the documentation details each of the object
94 methods. Internal methods are usually preceded with a _
98 package Bio
::Tools
::HMMER
::Results
;
102 use Bio
::Tools
::HMMER
::Domain
;
103 use Bio
::Tools
::HMMER
::Set
;
106 use base
qw(Bio::Root::Root Bio::Root::IO Bio::SeqAnalysisParserI);
109 my($class,@args) = @_;
111 my $self = $class->SUPER::new
(@args);
113 $self->{'domain'} = []; # array of HMMUnits
116 my ($parsetype) = $self->_rearrange([qw(TYPE)],@args);
117 $self->_initialize_io(@args);
118 if( !defined $parsetype ) {
119 $self->throw("No parse type provided. should be hmmsearch or hmmpfam");
121 $self->parsetype($parsetype);
122 if( defined $self->_fh() ) {
123 if( $parsetype eq 'hmmsearch' ) {
124 $self->_parse_hmmsearch($self->_fh());
125 } elsif ( $parsetype eq 'hmmpfam' ) {
126 $self->_parse_hmmpfam($self->_fh());
128 $self->throw("Did not recoginise type $parsetype");
132 return $self; # success - we hope!
139 Usage : while( my $feat = $res->next_feature ) { # do something }
140 Function: SeqAnalysisParserI implementing function
142 Returns : A Bio::SeqFeatureI compliant object, in this case,
143 each DomainUnit object, ie, flattening the Sequence
153 if( $self->{'_started_next_feature'} == 1 ) {
154 return shift @
{$self->{'_next_feature_array'}};
156 $self->{'_started_next_feature'} = 1;
158 foreach my $seq ( $self->each_Set() ) {
159 foreach my $unit ( $seq->each_Domain() ) {
163 my $res = shift @array;
164 $self->{'_next_feature_array'} = \
@array;
168 $self->throw("Should not reach here! Error!");
175 Usage : print "There are ",$res->number," domains hit\n";
176 Function: provides the number of domains in the HMMER report
184 $ref = $self->{'domain'};
187 @val = @
{$self->{'domain'}};
194 Usage : $obj->seqfile($newval)
197 Returns : value of seqfile
198 Args : newvalue (optional)
204 my ($self,$value) = @_;
205 if( defined $value) {
206 $self->{'seqfile'} = $value;
208 return $self->{'seqfile'};
215 Usage : $obj->hmmfile($newval)
218 Returns : value of hmmfile
219 Args : newvalue (optional)
225 my ($self,$value) = @_;
226 if( defined $value) {
227 $self->{'hmmfile'} = $value;
229 return $self->{'hmmfile'};
236 Usage : $res->add_Domain($unit)
237 Function: adds a domain to the results array. Mainly used internally.
238 Args : A Bio::Tools::HMMER::Domain
248 $name = $unit->seq_id();
250 if( ! exists $self->{'seq'}->{$name} ) {
251 $self->warn("Adding a domain of $name but with no HMMSequence. Will be kept in domain array but not added to a HMMSequence");
253 $self->{'seq'}->{$name}->add_Domain($unit);
255 push(@
{$self->{'domain'}},$unit);
262 Usage : foreach $domain ( $res->each_Domain() )
263 Function: array of Domain units which are held in this report
274 foreach $u ( @
{$self->{'domain'}} ) {
282 =head2 domain_bits_cutoff_from_evalue
284 Title : domain_bits_cutoff_from_evalue
285 Usage : $cutoff = domain_bits_cutoff_from_evalue(0.01);
286 Function: return a bits cutoff from an evalue using the
287 scores here. Somewhat interesting logic:
288 Find the two bit score which straddle the evalue
289 if( 25 is between these two points) return 25
290 else return the midpoint.
292 This logic tries to ensure that with large signal to
293 noise separation one still has sensible 25 bit cutoff
299 sub domain_bits_cutoff_from_evalue
{
302 my ($dom,$prev,@doms,$cutoff,$sep,$seen);
304 @doms = $self->each_Domain;
307 @doms = map { $_->[0] }
308 sort { $b->[1] <=> $a->[1] }
309 map { [ $_, $_->bits] } @doms;
311 foreach $_ ( @doms ) {
312 if( $_->evalue > $eval ) {
320 if( ! defined $prev || $seen == 0) {
321 $self->throw("Evalue is either above or below the list...");
325 $sep = $prev->bits - $dom->bits ;
328 return $prev->bits();
330 if( $dom->bits < 25 && $prev->bits > 25 ) {
334 return int( $dom->bits + $sep/2 ) ;
339 sub dictate_hmm_acc
{
345 foreach $unit ( $self->eachHMMUnit() ) {
350 =head2 write_FT_output
352 Title : write_FT_output
353 Usage : $res->write_FT_output(\*STDOUT,'DOMAIN')
354 Function: writes feature table output ala swissprot
361 sub write_FT_output
{
367 if( !defined $idt ) {
371 foreach $seq ( $self->each_Set() ) {
372 print $file sprintf("ID %s\n",$seq->name());
373 foreach $unit ( $seq->each_Domain() ) {
374 print $file sprintf("FT %s %d %d %s\n",$idt,
375 $unit->start,$unit->end,$unit->hmmname);
381 =head2 filter_on_cutoff
383 Title : filter_on_cutoff
384 Usage : $newresults = $results->filter_on_cutoff(25,15);
385 Function: Produces a new HMMER::Results module which has
386 been trimmed at the cutoff.
387 Returns : a Bio::Tools::HMMER::Results module
388 Args : sequence cutoff and domain cutoff. in bits score
389 if you want one cutoff, simply use same number both places
393 sub filter_on_cutoff
{
397 my ($new,$seq,$unit,@array,@narray);
399 if( !defined $domthr ) {
400 $self->throw("hmmresults filter on cutoff needs two arguments");
403 $new = Bio
::Tools
::HMMER
::Results
->new(-type
=> $self->parsetype);
405 foreach $seq ( $self->each_Set()) {
406 next if( $seq->bits() < $seqthr );
408 foreach $unit ( $seq->each_Domain() ) {
409 next if( $unit->bits() < $domthr );
410 $new->add_Domain($unit);
416 =head2 write_ascii_out
418 Title : write_ascii_out
419 Usage : $res->write_ascii_out(\*STDOUT)
421 seq seq_start seq_end model-acc model_start model_end model_name
425 FIXME: Now that we have no modelacc, this is probably a bad thing.
429 # writes as seq sstart send modelacc hstart hend modelname
431 sub write_ascii_out
{
441 foreach $seq ( $self->each_Set()) {
442 foreach $unit ( $seq->each_Domain()) {
443 print $fh sprintf("%s %4d %4d %s %4d %4d %4.2f %4.2g %s\n",
444 $unit->seq_id(),$unit->start(),$unit->end(),
445 $unit->hmmacc,$unit->hstart,$unit->hend,
446 $unit->bits,$unit->evalue,$unit->hmmname);
452 =head2 write_GDF_bits
454 Title : write_GDF_bits
455 Usage : $res->write_GDF_bits(25,15,\*STDOUT)
456 Function: writes GDF format with a sequence,domain threshold
471 if( !defined $file ) {
472 $self->throw("Attempting to use write_GDF_bits without passing in correct arguments!");
476 foreach $seq ( $self->each_Set()) {
478 if( $seq->bits() < $seqt ) {
482 foreach $unit ( $seq->each_Domain() ) {
483 if( $unit->bits() < $domt ) {
491 @narray = sort { my ($aa,$bb,$st_a,$st_b);
497 return $st_a <=> $st_b;
503 foreach $unit ( @narray ) {
504 print $file sprintf("%-24s\t%6d\t%6d\t%15s\t%.1f\t%g\n",$unit->get_nse(),$unit->start(),$unit->end(),$unit->seq_id(),$unit->bits(),$unit->evalue);
509 sub write_scores_bits
{
518 if( !defined $file ) {
519 $self->warn("Attempting to use write_scores_bits without passing in correct arguments!");
523 foreach $seq ( $self->eachHMMSequence()) {
525 if( $seq->bits() < $seqt ) {
529 foreach $unit ( $seq->eachHMMUnit() ) {
530 if( $unit->bits() < $domt ) {
538 @narray = sort { my ($aa,$bb,$st_a,$st_b);
544 foreach $unit ( @narray ) {
545 print $file sprintf("%4.2f %s\n",$unit->bits(),$unit->get_nse());
555 if( !defined $file ) {
560 foreach $unit ( $self->eachHMMUnit() ) {
561 print $file sprintf("%-24s\t%6d\t%6d\t%15s\t%.1f\t%g\n",$unit->get_nse(),$unit->start(),$unit->end(),$unit->seq_id(),$unit->bits(),$unit->evalue);
570 my ($seq,$unit,$hseq,$hdom,$noiseseq,$noisedom);
572 $hseq = $hdom = -100000;
574 foreach $seq ( $self->eachHMMSequence()) {
575 if( $seq->bits() < $seqt && $seq->bits() > $hseq ) {
576 $hseq = $seq->bits();
579 foreach $unit ( $seq->eachHMMUnit() ) {
580 if( (($seq->bits() < $seqt) || ($seq->bits() > $seqt && $unit->bits < $domt)) && $unit->bits() > $hdom ) {
581 $hdom = $unit->bits();
588 return ($noiseseq,$noisedom);
597 my ($seq,$unit,$lowseq,$lowdom,$trueseq,$truedom);
599 if( ! defined $domt ) {
600 $self->warn("lowest true needs at least a domain threshold cut-off");
604 $lowseq = $lowdom = 100000;
606 foreach $seq ( $self->eachHMMSequence()) {
608 if( $seq->bits() >= $seqt && $seq->bits() < $lowseq ) {
609 $lowseq = $seq->bits();
612 if( $seq->bits() < $seqt ) {
616 foreach $unit ( $seq->eachHMMUnit() ) {
617 if( $unit->bits() >= $domt && $unit->bits() < $lowdom ) {
618 $lowdom = $unit->bits();
625 return ($trueseq,$truedom);
634 Usage : Mainly internal function
647 $name = $seq->name();
649 if( exists $self->{'seq'}->{$name} ) {
650 $self->throw("You alredy have $name in HMMResults!");
652 $self->{'seq'}->{$name} = $seq;
672 foreach $name ( keys %{$self->{'seq'}} ) {
673 push(@array,$self->{'seq'}->{$name});
682 Usage : $set = $res->get_Set('sequence-name');
683 Function: returns the Set for a particular sequence
684 Returns : a HMMER::Set object
685 Args : name of the sequence
694 return $self->{'seq'}->{$name};
698 =head2 _parse_hmmpfam
700 Title : _parse_hmmpfam
701 Usage : $res->_parse_hmmpfam($filehandle)
713 my ($id,$sqfrom,$sqto,$hmmf,$hmmt,$sc,$ev,
714 $unit,$nd,$seq,$name,$seqname,$from,
715 $to,%hash,%acc,$acc);
719 if( /^HMM file:\s+(\S+)/ ) { $self->hmmfile($1); next; }
720 elsif( /^Sequence file:\s+(\S+)/ ) { $self->seqfile($1); next }
721 elsif( /^Query(\s+sequence)?:\s+(\S+)/ ) {
725 $seq = Bio
::Tools
::HMMER
::Set
->new();
727 $seq ->name($seqname);
728 $self->add_Set($seq);
733 if( /Accession:\s+(\S+)/ ) { $seq->accession($1); next }
734 elsif( s/^Description:\s+// ) { chomp; $seq->desc($_); next }
735 /^Parsed for domains/ && last;
737 # This is to parse out the accession numbers in old Pfam format.
738 # now not support due to changes in HMMER.
740 if( (($id,$acc, $sc, $ev, $nd) = /^\s*(\S+)\s+(\S+).+?\s(\S+)\s+(\S+)\s+(\d+)\s*$/)) {
741 $hash{$id} = $sc; # we need this for the sequence
742 # core of the domains below!
745 # this is the more common parsing routine
746 } elsif ( (($id,$sc, $ev, $nd) =
747 /^\s*(\S+).+?\s(\S+)\s+(\S+)\s+(\d+)\s*$/) ) {
749 $hash{$id} = $sc; # we need this for the
750 # sequence score of hte domains below!
758 # this is meant to match
760 #Sequence Domain seq-f seq-t hmm-f hmm-t score E-value
761 #-------- ------- ----- ----- ----- ----- ----- -------
762 #PF00621 1/1 198 372 .. 1 207 [] 281.6 1e-80
764 if( (($id, $sqfrom, $sqto, $hmmf,$hmmt,$sc, $ev) =
765 /(\S+)\s+\S+\s+(\d+)\s+(\d+).+?(\d+)\s+(\d+)\s+\S+\s+(\S+)\s+(\S+)\s*$/)) {
766 $unit = Bio
::Tools
::HMMER
::Domain
->new();
767 $unit->seq_id ($seqname);
768 $unit->hmmname ($id);
769 $unit->start ($sqfrom);
771 $unit->hstart($hmmf);
776 if( !exists($hash{$id}) ) {
777 $self->throw("HMMResults parsing error in hmmpfam for $id - can't find sequecne score");
780 $unit->seqbits($hash{$id});
782 if( defined $acc{$id} ) {
783 $unit->hmmacc($acc{$id});
786 # this should find it's own sequence!
787 $self->add_Domain($unit);
790 if( m{^//} ) { next; }
793 # parses alignment lines. Icky as we have to break on the same line
794 # that we need to read to place the alignment lines with the unit.
797 (!defined $_ || m{^//}) && last;
800 # PF00621: domain 1 of 1, from 198 to 372
801 if( /^\s*(\S+):.*from\s+(\d+)\s+to\s+(\d+)/ ) {
807 # find the HMMUnit which this alignment is from
809 $unit = $self->get_unit_nse($seqname,$name,$from,$to);
810 if( !defined $unit ) {
811 $self->warn("Could not find $name $from $to unit even though I am reading it in. ugh!");
817 /^\s*\S+:.*from\s+\d+\s+to\s+\d+/ && last;
818 $unit->add_alignment_line($_);
825 # back to main 'Query:' loop
830 # mainly internal function
841 $seq = $self->get_Set($seqname);
843 if( !defined $seq ) {
844 $self->throw("Could not get sequence name $seqname - so can't get its unit");
847 foreach $unit ( $seq->each_Domain() ) {
848 if( $unit->hmmname() eq $domname && $unit->start() == $start && $unit->end() == $end ) {
857 =head2 _parse_hmmsearch
859 Title : _parse_hmmsearch
860 Usage : $res->_parse_hmmsearch($filehandle)
868 sub _parse_hmmsearch
{
871 my ($id,$sqfrom,$sqto,$sc,$ev,$unit,$nd,$seq,$hmmf,$hmmt,
872 $hmmfname,$hmmacc, $hmmid, %seqh);
876 /^HMM file:\s+(\S+)/ and do { $self->hmmfile($1); $hmmfname = $1 };
877 /^Accession:\s+(\S+)/ and do { $hmmacc = $1 };
878 /^Query HMM:\s+(\S+)/ and do { $hmmid = $1 };
879 /^Sequence database:\s+(\S+)/ and do { $self->seqfile($1) };
880 /^Scores for complete sequences/ && last;
883 $hmmfname = "given" if not $hmmfname;
886 /^Parsed for domains/ && last;
887 if( (($id, $sc, $ev, $nd) = /(\S+).+?\s(\S+)\s+(\S+)\s+(\d+)\s*$/)) {
888 $seq = Bio
::Tools
::HMMER
::Set
->new();
893 $self->add_Set($seq);
894 $seq->accession($hmmacc);
899 /^Alignments of top-scoring domains/ && last;
900 if( (($id, $sqfrom, $sqto, $hmmf, $hmmt, $sc, $ev) = /(\S+)\s+\S+\s+(\d+)\s+(\d+).+?(\d+)\s+(\d+)\s+\S+\s+(\S+)\s+(\S+)\s*$/)) {
901 $unit = Bio
::Tools
::HMMER
::Domain
->new();
904 $unit->hmmname($hmmfname);
905 $unit->start($sqfrom);
908 $unit->hstart($hmmf);
911 $unit->seqbits($seqh{$id});
912 $self->add_Domain($unit);
919 ## Recognize and store domain alignments
925 /^Histogram of all scores/ && last;
928 # PF00621: domain 1 of 1, from 198 to 372
929 if( /^\s*(\S+):.*from\s+(\d+)\s+to\s+(\d+)/ ) {
934 # find the HMMUnit which this alignment is from
935 $unit = $self->get_unit_nse($name,$hmmfname,$from,$to);
937 if( !defined $unit ) {
938 $self->warn("Could not find $name $from $to unit even though I am reading it in. ugh!");
942 /^Histogram of all scores/ && last;
943 /^\s*\S+:.*from\s+\d+\s+to\s+\d+/ && last;
944 $unit->add_alignment_line($_);
958 Usage : $obj->parsetype($newval)
960 Returns : value of parsetype
961 Args : newvalue (optional)
967 my ($self,$value) = @_;
968 if( defined $value) {
969 $self->{'_parsetype'} = $value;
971 return $self->{'_parsetype'};