1 #-----------------------------------------------------------------------------
2 # PACKAGE : Bio::Tools::Sigcleave
3 # AUTHOR : Chris Dagdigian, dag@sonsorol.org
4 # CREATED : Jan 28 1999
7 # Copyright (c) 1997-9 bioperl, Chris Dagdigian and others. All Rights Reserved.
8 # This module is free software; you can redistribute it and/or
9 # modify it under the same terms as Perl itself.
13 # Object framework ripped from Steve Chervits's SeqPattern.pm
15 # Core EGCG Sigcleave emulation from perl code developed by
16 # Danh Nguyen & Kamalakar Gulukota which itself was based
17 # loosely on Colgrove's signal.c program.
19 # The overall idea is to replicate the output of the sigcleave
20 # program which was distributed with the EGCG extension to the GCG sequence
21 # analysis package. There is also an accessor method for just getting at
24 #-----------------------------------------------------------------------------
28 Bio::Tools::Sigcleave - Bioperl object for sigcleave analysis
32 =head2 Object Creation
34 use Bio::Tools::Sigcleave ();
36 # to keep the module backwar compatible, you can pass it a sequence string, but
37 # there recommended say is to pass it a Seq object
40 $seq = "MVLLLILSVLLLKEDVRGSAQSSERRVVAHMPGDIIIGALFSVHHQPTVDKVHERKCGAVREQYGI";
41 $sig = Bio::Tools::Sigcleave->new(-seq => $seq,
46 $seqobj = Bio::PrimarySeq->new(-seq => $seq);
48 $sig = Bio::Tools::Sigcleave->new(-seq => $seqobj,
52 # now you can detect procaryotic signal sequences as well as eucaryotic
53 $sig->matrix('eucaryotic'); # or 'procaryotic'
56 =head2 Object Methods & Accessors
58 # you can use this method to fine tune the threshod before printing out the results
61 %raw_results = $sig->signals;
62 $formatted_output = $sig->pretty_print;
66 "Sigcleave" was a program distributed as part of the free EGCG add-on
67 to earlier versions of the GCG Sequence Analysis package. A new
68 implementation of the algorithm is now part of EMBOSS package.
70 From the EGCG documentation:
72 SigCleave uses the von Heijne method to locate signal sequences, and
73 to identify the cleavage site. The method is 95% accurate in
74 resolving signal sequences from non-signal sequences with a cutoff
75 score of 3.5, and 75-80% accurate in identifying the cleavage
76 site. The program reports all hits above a minimum value.
78 The EGCG Sigcleave program was written by Peter Rice (E-mail:
79 pmr@sanger.ac.uk Post: Informatics Division, The Sanger Centre,
80 Wellcome Trust Genome Campus, Hinxton, Cambs, CB10 1SA, UK).
82 Since EGCG is no longer distributed for the latest versions of GCG,
83 this code was developed to emulate the output of the original program
84 as much as possible for those who lost access to sigcleave when
85 upgrading to newer versions of GCG.
87 There are 2 accessor methods for this object. "signals" will return a
88 perl associative array containing the sigcleave scores keyed by amino
89 acid position. "pretty_print" returns a formatted string similar to
90 the output of the original sigcleave utility.
92 In both cases, the "threshold" setting controls the score reporting
93 level. If no value for threshold is passed in by the user, the code
94 defaults to a reporting value of 3.5.
96 In this implemntation the accessor will never return any
97 score/position pair which does not meet the threshold limit. This is
98 the slightly different from the behaviour of the 8.1 EGCG sigcleave
99 program which will report the highest of the under-threshold results
100 if nothing else is found.
103 Example of pretty_print output:
105 SIGCLEAVE of sigtest from: 1 to 146
107 Report scores over 3.5
108 Maximum score 4.9 at residue 131
110 Sequence: FVILAAMSIQGSA-NLQTQWKSTASLALET
111 | (signal) | (mature peptide)
114 Other entries above 3.5
116 Maximum score 3.7 at residue 112
118 Sequence: CSRQLFGWLFCKV-HPGAIVFVILAAMSIQGSANLQTQWKSTASLALET
119 | (signal) | (mature peptide)
125 When updating and maintaining a module, it helps to know that people
126 are actually using it. Let us know if you find a bug, think this code
127 is useful or have any improvements/features to suggest.
129 =head2 Reporting Bugs
131 Report bugs to the Bioperl bug tracking system to help us keep track
132 the bugs and their resolution. Bug reports can be submitted via the
135 http://bugzilla.open-bio.org/
139 Chris Dagdigian, dag-at-sonsorol.org & others
143 Heikki Lehvaslaiho, heikki-at-bioperl-dot-org
147 Bio::Tools::Sigcleave, $Id$
151 Copyright (c) 1999 Chris Dagdigian & others. All Rights Reserved.
152 This module is free software; you can redistribute it and/or modify it
153 under the same terms as Perl itself.
155 =head1 REFERENCES / SEE ALSO
157 von Heijne G. (1986) "A new method for predicting signal sequences
158 cleavage sites." Nucleic Acids Res. 14, 4683-4690.
160 von Heijne G. (1987) in "Sequence Analysis in Molecular Biology:
161 Treasure Trove or Trivial Pursuit" (Acad. Press, (1987), 113-117).
166 The following documentation describes the various functions
167 contained in this module. Some functions are for internal
168 use and are not meant to be called by the user; they are
169 preceded by an underscore ("_").
176 #### END of main POD documentation.
181 package Bio
::Tools
::Sigcleave
;
185 use base
qw(Bio::Root::Root);
187 use vars qw
($ID %WeightTable_euc %WeightTable_pro );
188 $ID = 'Bio::Tools::Sigcleave';
191 #Sample: 161 aligned sequences
192 # R -13 -12 -11 -10 -9 -8 -7 -6 -5 -4 -3 -2 -1 +1 +2 Expect
193 'A' => [16, 13, 14, 15, 20, 18, 18, 17, 25, 15, 47, 6, 80, 18, 6, 14.5],
194 'C' => [ 3, 6, 9, 7, 9, 14, 6, 8, 5, 6, 19, 3, 9, 8, 3, 4.5],
195 'D' => [ 0, 0, 0, 0, 0, 0, 0, 0, 5, 3, 0, 5, 0, 10, 11, 8.9],
196 'E' => [ 0, 0, 0, 1, 0, 0, 0, 0, 3, 7, 0, 7, 0, 13, 14, 10.0],
197 'F' => [13, 9, 11, 11, 6, 7, 18, 13, 4, 5, 0, 13, 0, 6, 4, 5.6],
198 'G' => [ 4, 4, 3, 6, 3, 13, 3, 2, 19, 34, 5, 7, 39, 10, 7, 12.1],
199 'H' => [ 0, 0, 0, 0, 0, 1, 1, 0, 5, 0, 0, 6, 0, 4, 2, 3.4],
200 'I' => [15, 15, 8, 6, 11, 5, 4, 8, 5, 1, 10, 5, 0, 8, 7, 7.4],
201 'K' => [ 0, 0, 0, 1, 0, 0, 1, 0, 0, 4, 0, 2, 0, 11, 9, 11.3],
202 'L' => [71, 68, 72, 79, 78, 45, 64, 49, 10, 23, 8, 20, 1, 8, 4, 12.1],
203 'M' => [ 0, 3, 7, 4, 1, 6, 2, 2, 0, 0, 0, 1, 0, 1, 2, 2.7],
204 'N' => [ 0, 1, 0, 1, 1, 0, 0, 0, 3, 3, 0, 10, 0, 4, 7, 7.1],
205 'P' => [ 2, 0, 2, 0, 0, 4, 1, 8, 20, 14, 0, 1, 3, 0, 22, 7.4],
206 'Q' => [ 0, 0, 0, 1, 0, 6, 1, 0, 10, 8, 0, 18, 3, 19, 10, 6.3],
207 'R' => [ 2, 0, 0, 0, 0, 1, 0, 0, 7, 4, 0, 15, 0, 12, 9, 7.6],
208 'S' => [ 9, 3, 8, 6, 13, 10, 15, 16, 26, 11, 23, 17, 20, 15, 10, 11.4],
209 'T' => [ 2, 10, 5, 4, 5, 13, 7, 7, 12, 6, 17, 8, 6, 3, 10, 9.7],
210 'V' => [20, 25, 15, 18, 13, 15, 11, 27, 0, 12, 32, 3, 0, 8, 17, 11.1],
211 'W' => [ 4, 3, 3, 1, 1, 2, 6, 3, 1, 3, 0, 9, 0, 2, 0, 1.8],
212 'Y' => [ 0, 1, 4, 0, 0, 1, 3, 1, 1, 2, 0, 5, 0, 1, 7, 5.6]
216 #Sample: 36 aligned sequences
217 # R -13 -12 -11 -10 -9 -8 -7 -6 -5 -4 -3 -2 -1 +1 +2 Expect
218 'A' => [0, 8, 8, 9, 6, 7, 5, 6, 7, 7, 24, 2, 31, 18, 4, 3.2],
219 'C' => [1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1.0],
220 'D' => [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 8, 2.0],
221 'E' => [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 4, 8, 2.2],
222 'F' => [2, 4, 3, 4, 1, 1, 8, 0, 4, 1, 0, 7, 0, 1, 0, 1.3],
223 'G' => [4, 2, 2, 2, 3, 5, 2, 4, 2, 2, 0, 2, 2, 1, 0, 2.7],
224 'H' => [0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 7, 0, 1, 0, 0.8],
225 'I' => [3, 1, 5, 1, 5, 0, 1, 3, 0, 0, 0, 0, 0, 0, 2, 1.7],
226 'K' => [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 3, 0, 2.5],
227 'L' => [8, 11, 9, 8, 9, 13, 1, 0, 2, 2, 1, 2, 0, 0, 1, 2.7],
228 'M' => [0, 2, 1, 1, 3, 2, 3, 0, 1, 2, 0, 4, 0, 0, 1, 0.6],
229 'N' => [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 3, 0, 1, 4, 1.6],
230 'P' => [0, 1, 1, 1, 1, 1, 2, 3, 5, 2, 0, 0, 0, 0, 5, 1.7],
231 'Q' => [0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 3, 0, 0, 1, 1.4],
232 'R' => [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1.7],
233 'S' => [1, 0, 1, 4, 4, 1, 5, 15, 5, 8, 5, 2, 2, 0, 0, 2.6],
234 'T' => [2, 0, 4, 2, 2, 2, 2, 2, 5, 1, 3, 0, 1, 1, 2, 2.2],
235 'V' => [5, 7, 1, 3, 1, 4, 7, 0, 0, 4, 3, 0, 0, 2, 0, 2.5],
236 'W' => [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0.4],
237 'Y' => [0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 1, 0, 0, 0, 1.3]
242 ## Now we calculate the _real_ values for the weight tables
245 ## yeah yeah yeah there is lots of math here that gets repeated
246 ## every single time a sigcleave object gets created. This is
247 ## a quick hack to make sure that we get the scores as accurate as
248 ## possible. Need all those significant digits....
250 ## suggestions for speedup aproaches welcome
254 foreach my $i (keys %WeightTable_euc) {
255 my $expected = $WeightTable_euc{$i}[15];
257 for (my $j=0; $j<16; $j++) {
258 if ($WeightTable_euc{$i}[$j] == 0) {
259 $WeightTable_euc{$i}[$j] = 1;
260 if ($j == 10 || $j == 12) {
261 $WeightTable_euc{$i}[$j] = 1.e
-10;
264 $WeightTable_euc{$i}[$j] = log($WeightTable_euc{$i}[$j]/$expected);
270 foreach my $i (keys %WeightTable_pro) {
271 my $expected = $WeightTable_pro{$i}[15];
273 for (my $j=0; $j<16; $j++) {
274 if ($WeightTable_pro{$i}[$j] == 0) {
275 $WeightTable_pro{$i}[$j] = 1;
276 if ($j == 10 || $j == 12) {
277 $WeightTable_pro{$i}[$j] = 1.e
-10;
280 $WeightTable_pro{$i}[$j] = log($WeightTable_pro{$i}[$j]/$expected);
285 #####################################################################################
287 #####################################################################################
291 my ($class, @args) = @_;
293 my $self = $class->SUPER::new
(@args);
294 #my $self = Bio::Seq->new(@args);
296 my ($seq, $threshold, $matrix) = $self->_rearrange([qw(SEQ THRESHOLD MATRIX)],@args);
298 defined $threshold && $self->threshold($threshold);
299 $matrix && $self->matrix($matrix);
300 $seq && $self->seq($seq);
310 Usage : $value = $self->threshold
311 Purpose : Read/write method sigcleave score reporting threshold.
313 Argument : new value, float
314 Throws : on non-number argument
315 Comments : defaults to 3.5
323 my ($self, $value) = @_;
324 if( defined $value) {
325 $self->throw("I need a number, not [$value]")
326 if $value !~ /^[+-]?[\d\.]+$/;
327 $self->{'_threshold'} = $value;
329 return $self->{'_threshold'} || 3.5 ;
335 Usage : $value = $self->matrix('procaryotic')
336 Purpose : Read/write method sigcleave matrix.
338 Argument : new value: 'eucaryotic' or 'procaryotic'
339 Throws : on non-number argument
340 Comments : defaults to 3.5
348 my ($self, $value) = @_;
349 if( defined $value) {
350 $self->throw("I need 'eucaryotic' or 'procaryotic', not [$value]")
351 unless $value eq 'eucaryotic' or $value eq 'procaryotic';
352 $self->{'_matrix'} = $value;
354 return $self->{'_matrix'} || 'eucaryotic' ;
360 Usage : $value = $self->seq($seq_object)
361 Purpose : set the Seq object to be used
363 Argument : protein sequence or Seq object
371 my ($self, $value) = @_;
372 if( defined $value) {
373 if ($value->isa('Bio::PrimarySeqI')) {
374 $self->{'_seq'} = $value;
376 $self->{'_seq'} = Bio
::PrimarySeq
->new(-seq
=> $value,
377 -alphabet
=> 'protein');
380 return $self->{'_seq'};
386 Usage : N/A This is an internal method. Not meant to be called from outside
389 Purpose : calculates sigcleave score and amino acid position for the
390 : given protein sequence. The score reporting threshold can
391 : be adjusted by passing in the "threshold" parameter during
392 : object construction. If no threshold is passed in, the code
393 : defaults to reporting any scores equal to or above 3.5
395 Returns : nothing. results are added to the object
425 my $seqEnd = $self->seq->length;
426 my $pep = $self->seq->seq;
427 my $minWeight = $self->threshold;
428 my $matrix = $self->matrix;
430 ## The weight table is keyed by UPPERCASE letters so we uppercase
431 ## the pep string because we don't want to alter the actual object
436 for ($seqPos = $seqBegin; $seqPos < $seqEnd; $seqPos++) {
437 $istart = (0 > $seqPos + $pVal)?
0 : $seqPos + $pVal;
438 $iend = ($seqPos + $nVal - 1 < $seqEnd)?
$seqPos + $nVal - 1 : $seqEnd;
439 $icol= $iend - $istart + 1;
441 for ($k=0; $k<$icol; $k++) {
442 $c = substr($pep, $istart + $k, 1);
444 ## CD: The if(defined) stuff was put in here because Sigcleave.pm
445 ## CD: kept getting warnings about undefined vals during 'make test' ...
446 if ($matrix eq 'eucaryotic') {
447 $weight += $WeightTable_euc{$c}[$k] if defined $WeightTable_euc{$c}[$k];
449 $weight += $WeightTable_pro{$c}[$k] if defined $WeightTable_pro{$c}[$k];
452 $signals{$seqPos+1} = sprintf ("%.1f", $weight) if $weight >= $minWeight;
454 $self->{"_signal_scores"} = { %signals };
461 Usage : %sigcleave_results = $sig->signals;
463 Purpose : Accessor method for sigcleave results
465 Returns : Associative array. The key value represents the amino acid position
466 : and the value represents the score. Only scores that
467 : are greater than or equal to the THRESHOLD value are reported.
483 # do the calculations
486 foreach $position ( sort keys %{ $self->{'_signal_scores'} } ) {
487 $results{$position} = $self->{'_signal_scores'}{$position};
496 Usage : $count = $sig->result_count;
498 Purpose : Accessor method for sigcleave results
500 Returns : Integer, number of results above the threshold
515 return keys %{ $self->{'_signal_scores'} };
522 Usage : $output = $sig->pretty_print;
523 : print $sig->pretty_print;
525 Purpose : Emulates the output of the EGCG Sigcleave
528 Returns : A formatted string.
543 my %results = $self->signals;
544 my @hits = keys %results;
545 my $hitcount = $#hits; $hitcount++;
546 my $thresh = $self->threshold;
547 my $seqlen = $self->seq->length || 0;
548 my $name = $self->seq->id || 'NONAME';
549 my $pep = $self->seq->seq;
552 $output = "SIGCLEAVE of $name from: 1 to $seqlen\n\n";
555 $output .= "Report scores over $thresh\n";
556 foreach $pos ((sort { $results{$b} cmp $results{$a} } keys %results)) {
557 my $start = $pos - 15;
558 $start = 1 if $start < 1;
559 my $sig = substr($pep,$start -1,$pos-$start );
561 $output .= sprintf ("Maximum score %1.1f at residue %3d\n",$results{$pos},$pos);
563 $output .= " Sequence: ";
565 $output .= "-" x
(15- length($sig));
567 $output .= substr($pep,$pos-1,50);
570 $output .= "| \(signal\) | \(mature peptide\)\n";
571 $output .= sprintf(" %3d %3d\n\n",$start,$pos);
573 if (($hitcount > 1) && ($cnt == 1)) {
574 $output .= " Other entries above $thresh\n\n";
587 #########################################################################
589 #########################################################################