tag fourth (and hopefully last) alpha
[bioperl-live.git] / branch-1-6 / Bio / Tools / Signalp / ExtendedSignalp.pm
blobca17bced6196e461991f9956ca7851c4c53be5e9
1 # $Id$
3 # BioPerl module for Bio::Tools::Signalp::ExtendedSignalp
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Emmanuel Quevillon <emmanuel.quevillon@versailles.inra.fr>
9 # Copyright Emmanuel Quevillon
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
15 =head1 NAME
17 Bio::Tools::Signalp::ExtendedSignalp - enhanced parser for Signalp output
19 =head1 SYNOPSIS
21 use Bio::Tools::Signalp::ExtendedSignalp;
22 my $params = [qw(maxC maxY maxS meanS D)];
23 my $parser = new Bio::Tools::Signalp::ExtendedSignalp(
24 -fh => $filehandle
25 -factors => $params
28 $parser->factors($params);
29 while( my $sp_feat = $parser->next_feature ) {
30 #do something
31 #eg
32 push @sp_feat, $sp_feat;
35 =head1 DESCRIPTION
37 # Please direct questions and support issues to I<bioperl-l@bioperl.org>
39 Parser module for Signalp.
41 Based on the EnsEMBL module Bio::EnsEMBL::Pipeline::Runnable::Protein::Signalp
42 originally written by Marc Sohrmann (ms2 a sanger.ac.uk) Written in BioPipe by
43 Balamurugan Kumarasamy (savikalpa a fugu-sg.org) Cared for by the Fugu
44 Informatics team (fuguteam@fugu-sg.org)
46 You may distribute this module under the same terms as perl itself
48 Compared to the original SignalP, this method allow the user to filter results
49 out based on maxC maxY maxS meanS and D factor cutoff for the Neural Network (NN)
50 method only. The HMM method does not give any filters with 'YES' or 'NO' as result.
52 The user must be aware that the filters can only by applied on NN method.
53 Also, to ensure the compability with original Signalp parsing module, the user
54 must know that by default, if filters are empty, max Y and mean S filters are
55 automatically used to filter results.
57 If the used gives a list, then the parser will only report protein having 'YES'
58 for each factor.
60 This module supports parsing for full, summary and short output form signalp.
61 Actually, full and summary are equivalent in terms of filtering results.
63 =head1 FEEDBACK
65 =head2 Mailing Lists
67 User feedback is an integral part of the evolution of this and other
68 Bioperl modules. Send your comments and suggestions preferably to
69 the Bioperl mailing list. Your participation is much appreciated.
71 bioperl-l@bioperl.org - General discussion
72 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
74 =head2 Support
76 Please direct usage questions or support issues to the mailing list:
78 I<bioperl-l@bioperl.org>
80 rather than to the module maintainer directly. Many experienced and
81 reponsive experts will be able look at the problem and quickly
82 address it. Please include a thorough description of the problem
83 with code and data examples if at all possible.
85 =head2 Reporting Bugs
87 Report bugs to the Bioperl bug tracking system to help us keep track
88 of the bugs and their resolution. Bug reports can be submitted via
89 the web:
91 http://bugzilla.open-bio.org/
93 =head1 AUTHOR
95 Based on the Bio::Tools::Signalp module
96 Emmanuel Quevillon <emmanuel.quevillon@versailles.inra.fr>
98 =head1 APPENDIX
100 The rest of the documentation details each of the object methods.
101 Internal methods are usually preceded with a _
103 =cut
105 package Bio::Tools::Signalp::ExtendedSignalp;
107 use strict;
108 use Data::Dumper;
109 use Bio::SeqFeature::Generic;
110 # don't need Bio::Root::Root/IO (already in inheritance tree)
111 use base qw(Bio::Tools::Signalp Bio::Tools::AnalysisResult);
113 #Supported arguments
114 my $FACTS = {
115 'maxC' => 1,
116 'maxS' => 1,
117 'maxY' => 1,
118 'meanS' => 1,
119 'D' => 1,
122 =head2 new
124 Title : new
125 Usage : my $obj = new Bio::Tools::Signalp::ExtendedSignalp();
126 Function: Builds a new Bio::Tools::Signalp::ExtendedSignalp object
127 Returns : Bio::Tools::Signalp::ExtendedSignalp
128 Args : -fh/-file => $val, # for initing input, see Bio::Root::IO
131 =cut
133 sub new {
134 my($class,@args) = @_;
136 my $self = $class->SUPER::new(@args);
137 $self->_initialize_io(@args);
139 my $factors = $self->_rearrange([qw(FACTORS)], @args);
140 #To behave like the parent module (Bio::Tools::Signalp) we default factors to these two factors
141 if($factors && scalar(@$factors)){
142 $factors = $factors;
144 else{
145 $factors = [qw(maxY meanS)];
147 $factors && $self->factors($factors);
149 return $self;
152 =head2 next_feature
154 Title : next_feature
155 Usage : my $feat = $signalp->next_feature
156 Function: Get the next result feature from parser data
157 Returns : Bio::SeqFeature::Generic
158 Args : none
161 =cut
163 sub next_feature {
165 my ($self) = @_;
167 if(!$self->_parsed()){
168 $self->_parse();
171 return shift @{$self->{_features}} || undef;
175 =head2 _filterok
177 Title : _filterok
178 Usage : my $feat = $signalp->_filterok
179 Function: Check if the factors required by the user are all ok.
180 Returns : 1/0
181 Args : hash reference
184 =cut
186 sub _filterok {
188 my($self, $hash) = @_;
190 #We hope everything will be fine ;)
191 my $bool = 1;
193 #If the user did not give any filter, we keep eveything
194 return $bool unless keys %{$self->{_factors}};
196 #If only one of the factors parsed is equal to NO based on the user factors cutoff
197 #Then the filter is not ok.
198 foreach my $fact (keys %{$self->factors()}){
199 if(exists($hash->{$fact}) && $hash->{$fact} =~ /^N/){
200 $bool = 0;
204 return $bool;
208 =head2 factors
210 Title : factors
211 Usage : my $feat = $signalp->factors
212 Function: Get/Set the filters required from the user
213 Returns : hash
214 Args : array reference
217 =cut
219 sub factors {
221 my($self, $array) = @_;
223 if($array){
224 $self->{_factors} = { };
225 foreach my $f (@$array){
226 if(exists($FACTS->{$f})){
227 $self->{_factors}->{$f} = 1;
229 else{
230 $self->throw("[$f] incorrect factor. Supported:\n- ".join("\n- ", keys %$FACTS)."\n");
235 return $self->{_factors};
239 =head2 _parsed
241 Title : _parsed
242 Usage : obj->_parsed()
243 Function: Get/Set if the result is parsed or not
244 Returns : 1/0 scalar
245 Args : On set 1
248 =cut
250 sub _parsed {
252 my($self, $parsed) = @_;
254 if(defined($parsed)){
255 $self->{_parsed} = $parsed;
258 return $self->{_parsed};
262 =head2 _parse
264 Title : _parse
265 Usage : obj->_parse
266 Function: Parse the SignalP result
267 Returns :
268 Args :
271 =cut
273 sub _parse {
275 my($self) = @_;
277 #Let's read the file...
278 while (my $line = $self->_readline()) {
280 chomp $line;
281 #We want to be sure to catch the first non empty line to be ablte to determine
282 #which format we are working with...
283 next unless ($line =~ /^>(\S+)|^# SignalP-[NHM]+ \S+ predictions/);
285 if($line =~ /^>(\S+)/){
286 $self->_pushback($line);
287 $self->_parse_summary_format();
288 last;
290 elsif($line =~ /^# SignalP-[NHM]+ \S+ predictions/){
291 $self->_pushback($line);
292 $self->_parse_short_format();
293 last;
295 else{
296 $self->throw("Unable to determine the format type.");
300 return;
303 =head2 _parse_summary_format
305 Title : _parse_summary_format
306 Usage : $self->_parse_summary_format
307 Function: Method to parse summary/full format from signalp output
308 It automatically fills filtered features.
309 Returns :
310 Args :
312 =cut
314 sub _parse_summary_format {
316 my($self) = @_;
318 my $feature = undef;
319 my $ok = 0;
321 while(my $line = $self->_readline()){
323 if($line =~ /^SignalP-NN result:/){
324 $self->_pushback($line);
325 $feature = $self->_parse_nn_result($feature);
327 if($line =~ /^SignalP-HMM result:/){
328 $self->_pushback($line);
329 $feature = $self->_parse_hmm_result($feature);
332 if($line =~ /^---------/ && $feature){
333 my $new_feature = $self->create_feature($feature);
334 push @{$self->{_features}}, $new_feature if $new_feature;
335 $feature = undef;
339 return;
343 =head2 _parse_nn_result
345 Title : _parse_nn_result
346 Usage : obj->_parse_nn_result
347 Function: Parses the Neuronal Network (NN) part of the result
348 Returns : Hash reference
349 Args :
352 =cut
354 sub _parse_nn_result {
356 my($self, $feature) = @_;
358 my $ok = 0;
359 my %facts;
361 #SignalP-NN result:
362 #>MGG_11635.5 length = 100
363 ## Measure Position Value Cutoff signal peptide?
364 # max. C 37 0.087 0.32 NO
365 # max. Y 37 0.042 0.33 NO
366 # max. S 3 0.062 0.87 NO
367 # mean S 1-36 0.024 0.48 NO
368 # D 1-36 0.033 0.43 NO
370 while(my $line = $self->_readline()){
372 chomp $line;
374 if($line =~ /^SignalP-NN result:/){
375 $ok = 1;
376 next;
379 $self->throw("Wrong line for parsing NN results.") unless $ok;
381 if ($line=~/^\>(\S+)\s+length/) {
382 $self->seqname($1);
383 %facts = ();
384 next;
386 elsif($line =~ /max\.\s+C\s+(\S+)\s+\S+\s+\S+\s+(\S+)/) {
387 $feature->{maxCprob} = $1;
388 $facts{maxC} = $2;
389 next;
391 elsif ($line =~ /max\.\s+Y\s+(\S+)\s+\S+\s+\S+\s+(\S+)/) {
392 $feature->{maxYprob} = $1;
393 $facts{maxY} = $2;
394 next;
396 elsif($line =~ /max\.\s+S\s+(\S+)\s+\S+\s+\S+\s+(\S+)/) {
397 $feature->{maxSprob} = $1;
398 $facts{maxS} = $2;
399 next;
401 elsif ($line=~/mean\s+S\s+(\S+)\s+\S+\s+\S+\s+(\S+)/) {
402 $feature->{meanSprob} = $1;
403 $facts{meanS} = $2;
404 next;
406 elsif ($line=~/\s+D\s+(\S+)\s+\S+\s+\S+\s+(\S+)/) {
407 $feature->{Dprob} = $1;
408 $facts{D} = $2;
409 next;
411 #If we don't have this line it means that all the factors cutoff are equal to 'NO'
412 elsif ($line =~ /Most likely cleavage site between pos\.\s+(\d+)/) {
413 #if($self->_filterok(\%facts)){
414 #$feature->{name} = $self->seqname();
415 #$feature->{start} = 1;
416 $feature->{end} = $1 + 1; #To be consistent with end given in short format
418 #return $feature;
420 elsif($line =~ /^\s*$/){
421 last;
425 if($self->_filterok(\%facts)){
426 $feature->{name} = $self->seqname();
427 $feature->{start} = 1;
428 $feature->{nnPrediction} = 'signal-peptide';
431 return $feature;
435 =head2 _parse_hmm_result
437 Title : _parse_hmm_result
438 Usage : obj->_parse_hmm_result
439 Function: Parses the Hiden Markov Model (HMM) part of the result
440 Returns : Hash reference
441 Args :
443 =cut
445 sub _parse_hmm_result {
447 my ($self, $feature_hash) = @_;
449 my $ok = 0;
451 #SignalP-HMM result:
452 #>MGG_11635.5
453 #Prediction: Non-secretory protein
454 #Signal peptide probability: 0.000
455 #Signal anchor probability: 0.000
456 #Max cleavage site probability: 0.000 between pos. -1 and 0
458 while(my $line = $self->_readline()){
460 chomp $line;
461 next if $line =~ /^\s*$/o;
463 if($line =~ /^SignalP-HMM result:/){
464 $ok = 1;
465 next;
468 $self->throw("Wrong line for parsing HMM result.") unless $ok;
470 if($line =~ /^>(\S+)/){
471 #In case we already seen a name with NN results
472 $feature_hash->{name} = $1 unless $self->seqname();
474 elsif($line =~ /Prediction: (.+)$/){
475 $feature_hash->{hmmPrediction} = $1;
477 elsif($line =~ /Signal peptide probability: ([0-9\.]+)/){
478 $feature_hash->{peptideProb} = $1;
480 elsif($line =~ /Signal anchor probability: ([0-9\.]+)/){
481 $feature_hash->{anchorProb} = $1;
483 elsif($line =~ /Max cleavage site probability: (\S+) between pos. \S+ and (\S+)/){
484 $feature_hash->{cleavageSiteProb} = $1;
485 #Strange case, if we don't have an end value in NN result (no nn method launched)
486 #We try anyway to get an end value, unless this value is lower than 1 which is
487 #the start
488 $feature_hash->{end} = $2 if($2 > 1 && !$feature_hash->{end});
489 $feature_hash->{start} = 1 unless $feature_hash->{start};
490 last;
494 return $feature_hash;
497 =head2 _parse_short_format
499 Title : _parse_short_format
500 Usage : $self->_parse_short_format
501 Function: Method to parse short format from signalp output
502 It automatically fills filtered features.
503 Returns :
504 Args :
506 =cut
508 sub _parse_short_format {
510 my($self) = @_;
512 my $ok = 0;
513 my $method = undef;
514 $self->{_oformat} = 'short';
516 #Output example
517 # SignalP-NN euk predictions # SignalP-HMM euk predictions
518 # name Cmax pos ? Ymax pos ? Smax pos ? Smean ? D ? # name ! Cmax pos ? Sprob ?
519 #Q5A8M1_CANAL 0.085 27 N 0.190 35 N 0.936 27 Y 0.418 N 0.304 N Q5A8M1_CANAL Q 0.001 35 N 0.002 N
520 #O74127_YARLI 0.121 21 N 0.284 21 N 0.953 11 Y 0.826 Y 0.555 Y O74127_YARLI S 0.485 23 N 0.668 Y
521 #Q5VJ86_9PEZI 0.355 24 Y 0.375 24 Y 0.798 12 N 0.447 N 0.411 N Q5VJ86_9PEZI Q 0.180 23 N 0.339 N
522 #Q5A8U5_CANAL 0.085 27 N 0.190 35 N 0.936 27 Y 0.418 N 0.304 N Q5A8U5_CANAL Q 0.001 35 N 0.002 N
524 while(my $line = $self->_readline()){
526 chomp $line;
527 next if $line =~ /^\s*$|^# name/;
529 if($line =~ /^#/){
530 $method = $line =~ /SignalP-NN .+ SignalP-HMM/ ?
531 'both' : $line =~ /SignalP-NN/ ?
532 'nn' : 'hmm';
533 next;
536 #$self->throw("It looks like the format is not 'short' format.") unless($ok);
538 my @data = split(/\s+/, $line);
539 $self->seqname($data[0]);
541 my $factors = { };
542 my $feature = { };
544 #NN results gives more fields than HMM
545 if($method eq 'both' || $method eq 'nn'){
547 $feature->{maxCprob} = $data[1];
548 $factors->{maxC} = $data[3];
549 $feature->{maxYprob} = $data[4];
550 $factors->{maxY} = $data[6];
551 $feature->{maxSprob} = $data[7];
552 $factors->{maxS} = $data[9];
553 $feature->{meanSprob}= $data[10];
554 $factors->{meanS} = $data[11];
555 $feature->{Dprob} = $data[12];
556 $factors->{D} = $data[13];
557 #It looks like the max Y position is reported as the most likely cleavage position
558 $feature->{end} = $data[5];
559 $feature->{nnPrediction} = 'signal-peptide';
561 if($method eq 'both'){
562 $feature->{hmmPrediction} = $data[15] eq 'Q' ? 'Non-secretory protein' : 'Signal peptide';
563 $feature->{cleavageSiteProb} = $data[16];
564 $feature->{peptideProb} = $data[19];
567 elsif($method eq 'hmm'){
568 #In short output anchor probability is not given
569 $feature->{hmmPrediction} = $data[1] eq 'Q' ? 'Non-secretory protein' : 'Signal peptide';
570 $feature->{cleavageSiteProb} = $data[2];
571 $feature->{peptideProb} = $data[5];
572 #It looks like the max cleavage probability position is given by the Cmax proability
573 $feature->{end} = $data[3];
576 #Unfortunately, we cannot parse the filters for hmm method.
577 if($self->_filterok($factors)){
578 $feature->{name} = $self->seqname();
579 $feature->{start} = 1;
580 $feature->{source} = 'Signalp';
581 $feature->{primary} = 'signal_peptide';
582 $feature->{program} = 'Signalp';
583 $feature->{logic_name} = 'signal_peptide';
585 my $new_feat = $self->create_feature($feature);
586 push @{$self->{_features}}, $new_feat if $new_feat;
590 return;
593 =head2 create_feature
595 Title : create_feature
596 Usage : obj->create_feature(\%feature)
597 Function: Internal(not to be used directly)
598 Returns :
599 Args :
602 =cut
604 sub create_feature {
606 my ($self, $feat) = @_;
608 #If we don't have neither start nor end, we return.
609 unless($feat->{name} && $feat->{start} && $feat->{end}){
610 return;
613 # create feature object
614 my $feature = Bio::SeqFeature::Generic->new(
615 -seq_id => $feat->{name},
616 -start => $feat->{start},
617 -end => $feat->{end},
618 -score => defined($feat->{peptideProb}) ? $feat->{peptideProb} : '',
619 -source => 'Signalp',
620 -primary => 'signal_peptide',
621 -logic_name => 'signal_peptide',
624 $feature->add_tag_value('peptideProb', $feat->{peptideProb});
625 $feature->add_tag_value('anchorProb', $feat->{anchorProb});
626 $feature->add_tag_value('evalue',$feat->{anchorProb});
627 $feature->add_tag_value('percent_id','NULL');
628 $feature->add_tag_value("hid",$feat->{primary});
629 $feature->add_tag_value('signalpPrediction', $feat->{hmmPrediction});
630 $feature->add_tag_value('cleavageSiteProb', $feat->{cleavageSiteProb}) if($feat->{cleavageSiteProb});
631 $feature->add_tag_value('nnPrediction', $feat->{nnPrediction}) if($feat->{nnPrediction});
632 $feature->add_tag_value('maxCprob', $feat->{maxCprob}) if(defined($feat->{maxCprob}));
633 $feature->add_tag_value('maxSprob', $feat->{maxSprob}) if(defined($feat->{maxSprob}));
634 $feature->add_tag_value('maxYprob', $feat->{maxYprob}) if(defined($feat->{maxYprob}));
635 $feature->add_tag_value('meanSprob', $feat->{meanSprob}) if(defined($feat->{meanSprob}));
636 $feature->add_tag_value('Dprob', $feat->{Dprob}) if(defined($feat->{Dprob}));
638 return $feature;
642 =head2 seqname
644 Title : seqname
645 Usage : obj->seqname($name)
646 Function: Internal(not to be used directly)
647 Returns :
648 Args :
651 =cut
653 sub seqname{
654 my ($self,$seqname)=@_;
656 if (defined($seqname)){
657 $self->{'seqname'} = $seqname;
660 return $self->{'seqname'};