From 41b4de2894f7711f24b89ba23bc0fcfefa90e76a Mon Sep 17 00:00:00 2001 From: "Francisco J. Ossandon" Date: Wed, 6 Aug 2014 13:02:38 -0400 Subject: [PATCH] hmmer3.pm: Fixed a bug where an homology line ending in CS or RF aminoacids made "next_seq" think it was the consensus structure track, braking the parser. Added a test that show this --- Bio/SearchIO/hmmer3.pm | 15 ++++- t/SearchIO/hmmer.t | 127 +++++++++++++++++++++++++++++------------- t/data/hmmscan_sec_struct.out | 47 ++++++++++++++++ 3 files changed, 146 insertions(+), 43 deletions(-) diff --git a/Bio/SearchIO/hmmer3.pm b/Bio/SearchIO/hmmer3.pm index 6b4210ab9..ce0a10986 100644 --- a/Bio/SearchIO/hmmer3.pm +++ b/Bio/SearchIO/hmmer3.pm @@ -461,7 +461,7 @@ sub next_result { elsif ( /inclusion threshold/ ) { while ( defined( $_ = $self->_readline ) ) { if ( /Domain( and alignment)? annotation for each/ - || /Internal pipeline statistics summary/ + || /Internal pipeline statistics summary/ || /Annotation for each hit\s+\(and alignments\)/ ) { @@ -676,6 +676,12 @@ sub next_result { } elsif ($_ =~ m/^$/ ) { + # Reset these scalars on empty lines to help + # distinguish between the consensus structure/reference + # tracks (CS|RF lines) and homology lines ending in + # CS or RF aminoacids + $align_offset = 0; + $align_length = 0; next; } @@ -693,8 +699,11 @@ sub next_result { $pline = $$hsp[-1]; $lastdomain = $name; } - # model data track, some reports don't have - elsif ( $_ =~ m/\s+\S+\s(?:CS|RF)$/ ) { + # Consensus Structure or Reference track, some reports + # don't have it. Since it appears on top of the alignment, + # the reset of $align_length to 0 between alignment blocks + # avoid confusing homology lines with it. + elsif ( $_ =~ m/\s+\S+\s(?:CS|RF)$/ and $align_length == 0 ) { my @data = split( " ", $_ ); $csline .= $data[-2]; $max_count++; diff --git a/t/SearchIO/hmmer.t b/t/SearchIO/hmmer.t index 60e011289..a0ae4704a 100644 --- a/t/SearchIO/hmmer.t +++ b/t/SearchIO/hmmer.t @@ -7,7 +7,7 @@ BEGIN { use lib '.'; use Bio::Root::Test; - test_begin( -tests => 798 ); + test_begin( -tests => 816 ); use_ok('Bio::SearchIO'); } @@ -1173,50 +1173,97 @@ $searchio = Bio::SearchIO->new( ] ); +my $result_counter = 0; while ( $result = $searchio->next_result ) { - is( ref($result), - 'Bio::Search::Result::HMMERResult', - 'Check for the correct result reference type' - ); - is( $result->algorithm, 'HMMSCAN', 'Check algorithm' ); - is( $result->algorithm_version, '3.0', 'Check algorithm version' ); - is( $result->hmm_name, 'Pfam-A.hmm', 'Check hmm_name' ); - is( $result->sequence_file, 'BA000019.orf8.fasta', - 'Check sequence_file' ); - is( $result->query_name, 'BA000019.orf8', 'Check query_name' ); - is( $result->query_length, '348', 'Check query_length' ); - is( $result->query_description, '', 'Check query_description' ); - is( $result->num_hits(), 3, 'Check num_hits' ); - my ( $hsp, $hit ); + $result_counter++; + if ($result_counter == 1) { + is( ref($result), + 'Bio::Search::Result::HMMERResult', + 'Check for the correct result reference type' + ); + is( $result->algorithm, 'HMMSCAN', 'Check algorithm' ); + is( $result->algorithm_version, '3.0', 'Check algorithm version' ); + is( $result->hmm_name, 'Pfam-A.hmm', 'Check hmm_name' ); + is( $result->sequence_file, 'BA000019.orf8.fasta', 'Check sequence_file' ); + is( $result->query_name, 'BA000019.orf8', 'Check query_name' ); + is( $result->query_length, 348, 'Check query_length' ); + is( $result->query_description, '', 'Check query_description' ); + is( $result->num_hits(), 3, 'Check num_hits' ); + my ( $hsp, $hit ); - while ( $hit = $result->next_model ) { - if ($hit->name eq 'PKSI-KS_m3') { - # Hit length is usually unknown for HMMSCAN and HMMSEARCH but not for NHMMER. - # When is not known, sometimes it can be deduced from domain data '[]' - is( $hit->length, 16, 'Check hit length' ); - is( $hit->frac_aligned_query, 0.09 ); - is( $hit->frac_aligned_hit, '1.00' ); + while ( $hit = $result->next_model ) { + if ($hit->name eq 'PKSI-KS_m3') { + # Hit length is usually unknown for HMMSCAN and HMMSEARCH but not for NHMMER. + # When is not known, sometimes it can be deduced from domain data '[]' + is( $hit->length, 16, 'Check hit length' ); + is( $hit->frac_aligned_query, 0.09 ); + is( $hit->frac_aligned_hit, '1.00' ); + } + my @expected = @{ shift @multi_hits }; + is( ref($hit), 'Bio::Search::Hit::HMMERHit', + 'Check for the correct hit reference type' ); + is( $hit->name, shift @expected, 'Check hit name' ); + is( $hit->description, shift @expected, 'Check for hit description' ); + is( $hit->raw_score, shift @expected, 'Check hit raw_score' ); + float_is( + $hit->significance, + shift @expected, + 'Check hit significance' + ); + is( $hit->num_hsps, shift @expected, 'Check num_hsps' ); + my @hsp_list = @{ shift @expected }; + + while ( defined( $hsp = $hit->next_domain ) ) { + my @hsp_exp = @{ shift @hsp_list }; + is( ref($hsp), 'Bio::Search::HSP::HMMERHSP', + 'Check for correct hsp reference type' ); + is( $hsp->hit_string, shift @hsp_exp, 'Check hit sequence' ); + is( $hsp->query_string, shift @hsp_exp, 'Check query sequence' ); + } } - my @expected = @{ shift @multi_hits }; - is( ref($hit), 'Bio::Search::Hit::HMMERHit', - 'Check for the correct hit reference type' ); - is( $hit->name, shift @expected, 'Check hit name' ); - is( $hit->description, shift @expected, 'Check for hit description' ); - is( $hit->raw_score, shift @expected, 'Check hit raw_score' ); - float_is( - $hit->significance, - shift @expected, - 'Check hit significance' + } + elsif ($result_counter == 2) { + is( ref($result), + 'Bio::Search::Result::HMMERResult', + 'Check for the correct result reference type' ); - is( $hit->num_hsps, shift @expected, 'Check num_hsps' ); - my @hsp_list = @{ shift @expected }; + is( $result->algorithm, 'HMMSCAN', 'Check algorithm' ); + is( $result->algorithm_version, '3.0', 'Check algorithm version' ); + is( $result->query_name, 'lcl|aorf_00010|P1', 'Check query_name' ); + is( $result->query_length, 132, 'Check query_length' ); + is( $result->query_description, 'IS481.original transposase', 'Check query_description' ); + is( $result->num_hits(), 1, 'Check num_hits' ); + my ( $hsp, $hit ); - while ( defined( $hsp = $hit->next_domain ) ) { - my @hsp_exp = @{ shift @hsp_list }; - is( ref($hsp), 'Bio::Search::HSP::HMMERHSP', - 'Check for correct hsp reference type' ); - is( $hsp->hit_string, shift @hsp_exp, 'Check hit sequence' ); - is( $hsp->query_string, shift @hsp_exp, 'Check query sequence' ); + while ( $hit = $result->next_model ) { + is( ref($hit), 'Bio::Search::Hit::HMMERHit', + 'Check for the correct hit reference type' ); + is( $hit->name, 'IS481.original.hmm', 'Check hit name' ); + is( $hit->description, '', 'Check for hit description' ); + is( $hit->raw_score, '130.0', 'Check hit raw_score' ); + float_is( $hit->significance, 3.4e-040, 'Check hit significance' ); + is( $hit->num_hsps, 1, 'Check num_hsps' ); + + while ( defined( $hsp = $hit->next_domain ) ) { + is( ref($hsp), 'Bio::Search::HSP::HMMERHSP', + 'Check for correct hsp reference type' ); + is( $hsp->query_string, + 'GEIETAHPSYLGSQDTFYVGNITGAGR----------------------------IYQQTFVDTYSKWDSTKLYTTKTPITAADLLNDRVLSFFA-EQGMGIIRLLTDRSTEYCSKA--ETQDYELCLALNDIEHTKTKVYHPQTNDICRRFHKA', + 'Check for query string' + ); + is( $hsp->hit_string, + 'kRYErdhPgeLvhmDvkklgripdgGgvkighRwrgrtrgrgkrtnqsrnrglgkayvitaiDDhSRfayaeilsdettttaadfllraaayfygkigeeiitrvlTDnGaayrskkrsakhdFqealaelGIkhilTrprsPqTNGKiERFhrT', + 'Check for hit string' + ); + is( $hsp->homology_string, + '+++E++hP +L+++D++++g+i + G+ +y++t++D++S+ +++++++t++taad l++ ++ f+ ++++i r lTD+ ++y+sk ++ d+ +la ++I+h++T++++PqTN ++ RFh+ ', + 'Check for homology string' + ); + is( $hsp->posterior_string, + '579*******************88888............................****************************************.********************8..**********************************95', + 'Check for posterior probability string' + ); + } } } } diff --git a/t/data/hmmscan_sec_struct.out b/t/data/hmmscan_sec_struct.out index 6be7893e7..d922f7ff2 100644 --- a/t/data/hmmscan_sec_struct.out +++ b/t/data/hmmscan_sec_struct.out @@ -91,3 +91,50 @@ Domain search space (domZ): 2 [number of targets reported over t # CPU time: 0.24u 0.15s 00:00:00.39 Elapsed: 00:00:00.27 # Mc/sec: 2782.58 // +Query: lcl|aorf_00010|P1 [L=132] +Description: IS481.original transposase +Scores for complete sequence (score includes all domains): + --- full sequence --- --- best 1 domain --- -#dom- + E-value score bias E-value score bias exp N Model Description + ------- ------ ----- ------- ------ ----- ---- -- -------- ----------- + 3.4e-40 130.0 0.4 3.8e-40 129.9 0.3 1.0 1 IS481.original.hmm + + +Domain annotation for each model (and alignments): +>> IS481.original.hmm + # score bias c-Evalue i-Evalue hmmfrom hmm to alifrom ali to envfrom env to acc + --- ------ ----- --------- --------- ------- ------- ------- ------- ------- ------- ---- + 1 ! 129.9 0.3 6.6e-42 3.8e-40 127 281 .. 7 130 .. 2 132 .] 0.97 + + Alignments for each domain: + == domain 1 score: 129.9 bits; conditional E-value: 6.6e-42 + IS481.original.hmm 127 kRYErdhPgeLvhmDvkklgripdgGgvkighRwrgrtrgrgkrtnqsrnrglgkayvitaiDDhSRfayaeilsd 202 + +++E++hP +L+++D++++g+i + G+ +y++t++D++S+ ++++++ + lcl|aorf_00010|P1 7 GEIETAHPSYLGSQDTFYVGNITGAGR----------------------------IYQQTFVDTYSKWDSTKLYTT 54 + 579*******************88888............................********************* PP + + IS481.original.hmm 203 ettttaadfllraaayfygkigeeiitrvlTDnGaayrskkrsakhdFqealaelGIkhilTrprsPqTNGKiERF 278 + +t++taad l++ ++ f+ ++++i r lTD+ ++y+sk ++ d+ +la ++I+h++T++++PqTN ++ RF + lcl|aorf_00010|P1 55 KTPITAADLLNDRVLSFFA-EQGMGIIRLLTDRSTEYCSKA--ETQDYELCLALNDIEHTKTKVYHPQTNDICRRF 127 + *******************.********************8..********************************* PP + + IS481.original.hmm 279 hrT 281 + h+ + lcl|aorf_00010|P1 128 HKA 130 + *95 PP + + + +Internal pipeline statistics summary: +------------------------------------- +Query sequence(s): 1 (132 residues) +Target model(s): 116 (57162 nodes) +Passed MSV filter: 4 (0.0344828); expected 2.3 (0.02) +Passed bias filter: 4 (0.0344828); expected 2.3 (0.02) +Passed Vit filter: 3 (0.0258621); expected 0.1 (0.001) +Passed Fwd filter: 1 (0.0172414); expected 0.0 (1e-05) +Initial search space (Z): 116 [actual number of targets] +Domain search space (domZ): 1 [number of targets reported over threshold] +# CPU time: 0.06u 0.00s 00:00:00.06 Elapsed: 00:00:00.06 +# Mc/sec: 117.90 +// -- 2.11.4.GIT