3 # BioPerl module for Bio::Tools::Signalp::ExtendedSignalp
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Emmanuel Quevillon <emmanuel.quevillon@versailles.inra.fr>
9 # Copyright Emmanuel Quevillon
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
17 Bio::Tools::Signalp::ExtendedSignalp - enhanced parser for Signalp output
21 use Bio::Tools::Signalp::ExtendedSignalp;
22 my $params = [qw(maxC maxY maxS meanS D)];
23 my $parser = new Bio::Tools::Signalp::ExtendedSignalp(
28 $parser->factors($params);
29 while( my $sp_feat = $parser->next_feature ) {
32 push @sp_feat, $sp_feat;
37 # Please direct questions and support issues to I<bioperl-l@bioperl.org>
39 Parser module for Signalp.
41 Based on the EnsEMBL module Bio::EnsEMBL::Pipeline::Runnable::Protein::Signalp
42 originally written by Marc Sohrmann (ms2 a sanger.ac.uk) Written in BioPipe by
43 Balamurugan Kumarasamy (savikalpa a fugu-sg.org) Cared for by the Fugu
44 Informatics team (fuguteam@fugu-sg.org)
46 You may distribute this module under the same terms as perl itself
48 Compared to the original SignalP, this method allow the user to filter results
49 out based on maxC maxY maxS meanS and D factor cutoff for the Neural Network (NN)
50 method only. The HMM method does not give any filters with 'YES' or 'NO' as result.
52 The user must be aware that the filters can only by applied on NN method.
53 Also, to ensure the compability with original Signalp parsing module, the user
54 must know that by default, if filters are empty, max Y and mean S filters are
55 automatically used to filter results.
57 If the used gives a list, then the parser will only report protein having 'YES'
60 This module supports parsing for full, summary and short output form signalp.
61 Actually, full and summary are equivalent in terms of filtering results.
67 User feedback is an integral part of the evolution of this and other
68 Bioperl modules. Send your comments and suggestions preferably to
69 the Bioperl mailing list. Your participation is much appreciated.
71 bioperl-l@bioperl.org - General discussion
72 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
76 Please direct usage questions or support issues to the mailing list:
78 I<bioperl-l@bioperl.org>
80 rather than to the module maintainer directly. Many experienced and
81 reponsive experts will be able look at the problem and quickly
82 address it. Please include a thorough description of the problem
83 with code and data examples if at all possible.
87 Report bugs to the Bioperl bug tracking system to help us keep track
88 of the bugs and their resolution. Bug reports can be submitted via
91 http://bugzilla.open-bio.org/
95 Based on the Bio::Tools::Signalp module
96 Emmanuel Quevillon <emmanuel.quevillon@versailles.inra.fr>
100 The rest of the documentation details each of the object methods.
101 Internal methods are usually preceded with a _
105 package Bio
::Tools
::Signalp
::ExtendedSignalp
;
109 use Bio
::SeqFeature
::Generic
;
110 # don't need Bio::Root::Root/IO (already in inheritance tree)
111 use base
qw(Bio::Tools::Signalp Bio::Tools::AnalysisResult);
125 Usage : my $obj = new Bio::Tools::Signalp::ExtendedSignalp();
126 Function: Builds a new Bio::Tools::Signalp::ExtendedSignalp object
127 Returns : Bio::Tools::Signalp::ExtendedSignalp
128 Args : -fh/-file => $val, # for initing input, see Bio::Root::IO
134 my($class,@args) = @_;
136 my $self = $class->SUPER::new
(@args);
137 $self->_initialize_io(@args);
139 my $factors = $self->_rearrange([qw(FACTORS)], @args);
140 #To behave like the parent module (Bio::Tools::Signalp) we default factors to these two factors
141 if($factors && scalar(@
$factors)){
145 $factors = [qw(maxY meanS)];
147 $factors && $self->factors($factors);
155 Usage : my $feat = $signalp->next_feature
156 Function: Get the next result feature from parser data
157 Returns : Bio::SeqFeature::Generic
167 if(!$self->_parsed()){
171 return shift @
{$self->{_features
}} || undef;
178 Usage : my $feat = $signalp->_filterok
179 Function: Check if the factors required by the user are all ok.
181 Args : hash reference
188 my($self, $hash) = @_;
190 #We hope everything will be fine ;)
193 #If the user did not give any filter, we keep eveything
194 return $bool unless keys %{$self->{_factors
}};
196 #If only one of the factors parsed is equal to NO based on the user factors cutoff
197 #Then the filter is not ok.
198 foreach my $fact (keys %{$self->factors()}){
199 if(exists($hash->{$fact}) && $hash->{$fact} =~ /^N/){
211 Usage : my $feat = $signalp->factors
212 Function: Get/Set the filters required from the user
214 Args : array reference
221 my($self, $array) = @_;
224 $self->{_factors
} = { };
225 foreach my $f (@
$array){
226 if(exists($FACTS->{$f})){
227 $self->{_factors
}->{$f} = 1;
230 $self->throw("[$f] incorrect factor. Supported:\n- ".join("\n- ", keys %$FACTS)."\n");
235 return $self->{_factors
};
242 Usage : obj->_parsed()
243 Function: Get/Set if the result is parsed or not
252 my($self, $parsed) = @_;
254 if(defined($parsed)){
255 $self->{_parsed
} = $parsed;
258 return $self->{_parsed
};
266 Function: Parse the SignalP result
277 #Let's read the file...
278 while (my $line = $self->_readline()) {
281 #We want to be sure to catch the first non empty line to be ablte to determine
282 #which format we are working with...
283 next unless ($line =~ /^>(\S+)|^# SignalP-[NHM]+ \S+ predictions/);
285 if($line =~ /^>(\S+)/){
286 $self->_pushback($line);
287 $self->_parse_summary_format();
290 elsif($line =~ /^# SignalP-[NHM]+ \S+ predictions/){
291 $self->_pushback($line);
292 $self->_parse_short_format();
296 $self->throw("Unable to determine the format type.");
303 =head2 _parse_summary_format
305 Title : _parse_summary_format
306 Usage : $self->_parse_summary_format
307 Function: Method to parse summary/full format from signalp output
308 It automatically fills filtered features.
314 sub _parse_summary_format
{
321 while(my $line = $self->_readline()){
323 if($line =~ /^SignalP-NN result:/){
324 $self->_pushback($line);
325 $feature = $self->_parse_nn_result($feature);
327 if($line =~ /^SignalP-HMM result:/){
328 $self->_pushback($line);
329 $feature = $self->_parse_hmm_result($feature);
332 if($line =~ /^---------/ && $feature){
333 my $new_feature = $self->create_feature($feature);
334 push @
{$self->{_features
}}, $new_feature if $new_feature;
343 =head2 _parse_nn_result
345 Title : _parse_nn_result
346 Usage : obj->_parse_nn_result
347 Function: Parses the Neuronal Network (NN) part of the result
348 Returns : Hash reference
354 sub _parse_nn_result
{
356 my($self, $feature) = @_;
362 #>MGG_11635.5 length = 100
363 ## Measure Position Value Cutoff signal peptide?
364 # max. C 37 0.087 0.32 NO
365 # max. Y 37 0.042 0.33 NO
366 # max. S 3 0.062 0.87 NO
367 # mean S 1-36 0.024 0.48 NO
368 # D 1-36 0.033 0.43 NO
370 while(my $line = $self->_readline()){
374 if($line =~ /^SignalP-NN result:/){
379 $self->throw("Wrong line for parsing NN results.") unless $ok;
381 if ($line=~/^\>(\S+)\s+length/) {
386 elsif($line =~ /max\.\s+C\s+(\S+)\s+\S+\s+\S+\s+(\S+)/) {
387 $feature->{maxCprob
} = $1;
391 elsif ($line =~ /max\.\s+Y\s+(\S+)\s+\S+\s+\S+\s+(\S+)/) {
392 $feature->{maxYprob
} = $1;
396 elsif($line =~ /max\.\s+S\s+(\S+)\s+\S+\s+\S+\s+(\S+)/) {
397 $feature->{maxSprob
} = $1;
401 elsif ($line=~/mean\s+S\s+(\S+)\s+\S+\s+\S+\s+(\S+)/) {
402 $feature->{meanSprob
} = $1;
406 elsif ($line=~/\s+D\s+(\S+)\s+\S+\s+\S+\s+(\S+)/) {
407 $feature->{Dprob
} = $1;
411 #If we don't have this line it means that all the factors cutoff are equal to 'NO'
412 elsif ($line =~ /Most likely cleavage site between pos\.\s+(\d+)/) {
413 #if($self->_filterok(\%facts)){
414 #$feature->{name} = $self->seqname();
415 #$feature->{start} = 1;
416 $feature->{end
} = $1 + 1; #To be consistent with end given in short format
420 elsif($line =~ /^\s*$/){
425 if($self->_filterok(\
%facts)){
426 $feature->{name
} = $self->seqname();
427 $feature->{start
} = 1;
428 $feature->{nnPrediction
} = 'signal-peptide';
435 =head2 _parse_hmm_result
437 Title : _parse_hmm_result
438 Usage : obj->_parse_hmm_result
439 Function: Parses the Hiden Markov Model (HMM) part of the result
440 Returns : Hash reference
445 sub _parse_hmm_result
{
447 my ($self, $feature_hash) = @_;
453 #Prediction: Non-secretory protein
454 #Signal peptide probability: 0.000
455 #Signal anchor probability: 0.000
456 #Max cleavage site probability: 0.000 between pos. -1 and 0
458 while(my $line = $self->_readline()){
461 next if $line =~ /^\s*$/o;
463 if($line =~ /^SignalP-HMM result:/){
468 $self->throw("Wrong line for parsing HMM result.") unless $ok;
470 if($line =~ /^>(\S+)/){
471 #In case we already seen a name with NN results
472 $feature_hash->{name
} = $1 unless $self->seqname();
474 elsif($line =~ /Prediction: (.+)$/){
475 $feature_hash->{hmmPrediction
} = $1;
477 elsif($line =~ /Signal peptide probability: ([0-9\.]+)/){
478 $feature_hash->{peptideProb
} = $1;
480 elsif($line =~ /Signal anchor probability: ([0-9\.]+)/){
481 $feature_hash->{anchorProb
} = $1;
483 elsif($line =~ /Max cleavage site probability: (\S+) between pos. \S+ and (\S+)/){
484 $feature_hash->{cleavageSiteProb
} = $1;
485 #Strange case, if we don't have an end value in NN result (no nn method launched)
486 #We try anyway to get an end value, unless this value is lower than 1 which is
488 $feature_hash->{end
} = $2 if($2 > 1 && !$feature_hash->{end
});
489 $feature_hash->{start
} = 1 unless $feature_hash->{start
};
494 return $feature_hash;
497 =head2 _parse_short_format
499 Title : _parse_short_format
500 Usage : $self->_parse_short_format
501 Function: Method to parse short format from signalp output
502 It automatically fills filtered features.
508 sub _parse_short_format
{
514 $self->{_oformat
} = 'short';
517 # SignalP-NN euk predictions # SignalP-HMM euk predictions
518 # name Cmax pos ? Ymax pos ? Smax pos ? Smean ? D ? # name ! Cmax pos ? Sprob ?
519 #Q5A8M1_CANAL 0.085 27 N 0.190 35 N 0.936 27 Y 0.418 N 0.304 N Q5A8M1_CANAL Q 0.001 35 N 0.002 N
520 #O74127_YARLI 0.121 21 N 0.284 21 N 0.953 11 Y 0.826 Y 0.555 Y O74127_YARLI S 0.485 23 N 0.668 Y
521 #Q5VJ86_9PEZI 0.355 24 Y 0.375 24 Y 0.798 12 N 0.447 N 0.411 N Q5VJ86_9PEZI Q 0.180 23 N 0.339 N
522 #Q5A8U5_CANAL 0.085 27 N 0.190 35 N 0.936 27 Y 0.418 N 0.304 N Q5A8U5_CANAL Q 0.001 35 N 0.002 N
524 while(my $line = $self->_readline()){
527 next if $line =~ /^\s*$|^# name/;
530 $method = $line =~ /SignalP-NN .+ SignalP-HMM/ ?
531 'both' : $line =~ /SignalP-NN/ ?
536 #$self->throw("It looks like the format is not 'short' format.") unless($ok);
538 my @data = split(/\s+/, $line);
539 $self->seqname($data[0]);
544 #NN results gives more fields than HMM
545 if($method eq 'both' || $method eq 'nn'){
547 $feature->{maxCprob
} = $data[1];
548 $factors->{maxC
} = $data[3];
549 $feature->{maxYprob
} = $data[4];
550 $factors->{maxY
} = $data[6];
551 $feature->{maxSprob
} = $data[7];
552 $factors->{maxS
} = $data[9];
553 $feature->{meanSprob
}= $data[10];
554 $factors->{meanS
} = $data[11];
555 $feature->{Dprob
} = $data[12];
556 $factors->{D
} = $data[13];
557 #It looks like the max Y position is reported as the most likely cleavage position
558 $feature->{end
} = $data[5];
559 $feature->{nnPrediction
} = 'signal-peptide';
561 if($method eq 'both'){
562 $feature->{hmmPrediction
} = $data[15] eq 'Q' ?
'Non-secretory protein' : 'Signal peptide';
563 $feature->{cleavageSiteProb
} = $data[16];
564 $feature->{peptideProb
} = $data[19];
567 elsif($method eq 'hmm'){
568 #In short output anchor probability is not given
569 $feature->{hmmPrediction
} = $data[1] eq 'Q' ?
'Non-secretory protein' : 'Signal peptide';
570 $feature->{cleavageSiteProb
} = $data[2];
571 $feature->{peptideProb
} = $data[5];
572 #It looks like the max cleavage probability position is given by the Cmax proability
573 $feature->{end
} = $data[3];
576 #Unfortunately, we cannot parse the filters for hmm method.
577 if($self->_filterok($factors)){
578 $feature->{name
} = $self->seqname();
579 $feature->{start
} = 1;
580 $feature->{source
} = 'Signalp';
581 $feature->{primary
} = 'signal_peptide';
582 $feature->{program
} = 'Signalp';
583 $feature->{logic_name
} = 'signal_peptide';
585 my $new_feat = $self->create_feature($feature);
586 push @
{$self->{_features
}}, $new_feat if $new_feat;
593 =head2 create_feature
595 Title : create_feature
596 Usage : obj->create_feature(\%feature)
597 Function: Internal(not to be used directly)
606 my ($self, $feat) = @_;
608 #If we don't have neither start nor end, we return.
609 unless($feat->{name
} && $feat->{start
} && $feat->{end
}){
613 # create feature object
614 my $feature = Bio
::SeqFeature
::Generic
->new(
615 -seq_id
=> $feat->{name
},
616 -start
=> $feat->{start
},
617 -end
=> $feat->{end
},
618 -score
=> defined($feat->{peptideProb
}) ?
$feat->{peptideProb
} : '',
619 -source
=> 'Signalp',
620 -primary
=> 'signal_peptide',
621 -logic_name
=> 'signal_peptide',
624 $feature->add_tag_value('peptideProb', $feat->{peptideProb
});
625 $feature->add_tag_value('anchorProb', $feat->{anchorProb
});
626 $feature->add_tag_value('evalue',$feat->{anchorProb
});
627 $feature->add_tag_value('percent_id','NULL');
628 $feature->add_tag_value("hid",$feat->{primary
});
629 $feature->add_tag_value('signalpPrediction', $feat->{hmmPrediction
});
630 $feature->add_tag_value('cleavageSiteProb', $feat->{cleavageSiteProb
}) if($feat->{cleavageSiteProb
});
631 $feature->add_tag_value('nnPrediction', $feat->{nnPrediction
}) if($feat->{nnPrediction
});
632 $feature->add_tag_value('maxCprob', $feat->{maxCprob
}) if(defined($feat->{maxCprob
}));
633 $feature->add_tag_value('maxSprob', $feat->{maxSprob
}) if(defined($feat->{maxSprob
}));
634 $feature->add_tag_value('maxYprob', $feat->{maxYprob
}) if(defined($feat->{maxYprob
}));
635 $feature->add_tag_value('meanSprob', $feat->{meanSprob
}) if(defined($feat->{meanSprob
}));
636 $feature->add_tag_value('Dprob', $feat->{Dprob
}) if(defined($feat->{Dprob
}));
645 Usage : obj->seqname($name)
646 Function: Internal(not to be used directly)
654 my ($self,$seqname)=@_;
656 if (defined($seqname)){
657 $self->{'seqname'} = $seqname;
660 return $self->{'seqname'};