* sync with trunk
[bioperl-live.git] / Bio / Tools / Sigcleave.pm
blob02c5623ac52006995bede2a196d31bae4e3ea3d4
1 #-----------------------------------------------------------------------------
2 # PACKAGE : Bio::Tools::Sigcleave
3 # AUTHOR : Chris Dagdigian, dag@sonsorol.org
4 # CREATED : Jan 28 1999
5 # REVISION: $Id$
7 # Copyright (c) 1997-9 bioperl, Chris Dagdigian and others. All Rights Reserved.
8 # This module is free software; you can redistribute it and/or
9 # modify it under the same terms as Perl itself.
11 # _History_
13 # Object framework ripped from Steve Chervits's SeqPattern.pm
15 # Core EGCG Sigcleave emulation from perl code developed by
16 # Danh Nguyen & Kamalakar Gulukota which itself was based
17 # loosely on Colgrove's signal.c program.
19 # The overall idea is to replicate the output of the sigcleave
20 # program which was distributed with the EGCG extension to the GCG sequence
21 # analysis package. There is also an accessor method for just getting at
22 # the raw results.
24 #-----------------------------------------------------------------------------
26 =head1 NAME
28 Bio::Tools::Sigcleave - Bioperl object for sigcleave analysis
30 =head1 SYNOPSIS
32 =head2 Object Creation
34 use Bio::Tools::Sigcleave ();
36 # to keep the module backwar compatible, you can pass it a sequence string, but
37 # there recommended say is to pass it a Seq object
39 # this works
40 $seq = "MVLLLILSVLLLKEDVRGSAQSSERRVVAHMPGDIIIGALFSVHHQPTVDKVHERKCGAVREQYGI";
41 $sig = Bio::Tools::Sigcleave->new(-seq => $seq,
42 -type => 'protein',
43 -threshold=>'3.5',
45 # but you do:
46 $seqobj = Bio::PrimarySeq->new(-seq => $seq);
48 $sig = Bio::Tools::Sigcleave->new(-seq => $seqobj,
49 -threshold=>'3.5',
52 # now you can detect procaryotic signal sequences as well as eucaryotic
53 $sig->matrix('eucaryotic'); # or 'procaryotic'
56 =head2 Object Methods & Accessors
58 # you can use this method to fine tune the threshod before printing out the results
59 $sig->result_count:
61 %raw_results = $sig->signals;
62 $formatted_output = $sig->pretty_print;
64 =head1 DESCRIPTION
66 "Sigcleave" was a program distributed as part of the free EGCG add-on
67 to earlier versions of the GCG Sequence Analysis package. A new
68 implementation of the algorithm is now part of EMBOSS package.
70 From the EGCG documentation:
72 SigCleave uses the von Heijne method to locate signal sequences, and
73 to identify the cleavage site. The method is 95% accurate in
74 resolving signal sequences from non-signal sequences with a cutoff
75 score of 3.5, and 75-80% accurate in identifying the cleavage
76 site. The program reports all hits above a minimum value.
78 The EGCG Sigcleave program was written by Peter Rice (E-mail:
79 pmr@sanger.ac.uk Post: Informatics Division, The Sanger Centre,
80 Wellcome Trust Genome Campus, Hinxton, Cambs, CB10 1SA, UK).
82 Since EGCG is no longer distributed for the latest versions of GCG,
83 this code was developed to emulate the output of the original program
84 as much as possible for those who lost access to sigcleave when
85 upgrading to newer versions of GCG.
87 There are 2 accessor methods for this object. "signals" will return a
88 perl associative array containing the sigcleave scores keyed by amino
89 acid position. "pretty_print" returns a formatted string similar to
90 the output of the original sigcleave utility.
92 In both cases, the "threshold" setting controls the score reporting
93 level. If no value for threshold is passed in by the user, the code
94 defaults to a reporting value of 3.5.
96 In this implemntation the accessor will never return any
97 score/position pair which does not meet the threshold limit. This is
98 the slightly different from the behaviour of the 8.1 EGCG sigcleave
99 program which will report the highest of the under-threshold results
100 if nothing else is found.
103 Example of pretty_print output:
105 SIGCLEAVE of sigtest from: 1 to 146
107 Report scores over 3.5
108 Maximum score 4.9 at residue 131
110 Sequence: FVILAAMSIQGSA-NLQTQWKSTASLALET
111 | (signal) | (mature peptide)
112 118 131
114 Other entries above 3.5
116 Maximum score 3.7 at residue 112
118 Sequence: CSRQLFGWLFCKV-HPGAIVFVILAAMSIQGSANLQTQWKSTASLALET
119 | (signal) | (mature peptide)
120 99 112
123 =head1 FEEDBACK
125 When updating and maintaining a module, it helps to know that people
126 are actually using it. Let us know if you find a bug, think this code
127 is useful or have any improvements/features to suggest.
129 =head2 Reporting Bugs
131 Report bugs to the Bioperl bug tracking system to help us keep track
132 the bugs and their resolution. Bug reports can be submitted via the
133 web:
135 http://bugzilla.open-bio.org/
137 =head1 AUTHOR
139 Chris Dagdigian, dag-at-sonsorol.org & others
141 =head1 CONTRIBUTORS
143 Heikki Lehvaslaiho, heikki-at-bioperl-dot-org
145 =head1 VERSION
147 Bio::Tools::Sigcleave, $Id$
149 =head1 COPYRIGHT
151 Copyright (c) 1999 Chris Dagdigian & others. All Rights Reserved.
152 This module is free software; you can redistribute it and/or modify it
153 under the same terms as Perl itself.
155 =head1 REFERENCES / SEE ALSO
157 von Heijne G. (1986) "A new method for predicting signal sequences
158 cleavage sites." Nucleic Acids Res. 14, 4683-4690.
160 von Heijne G. (1987) in "Sequence Analysis in Molecular Biology:
161 Treasure Trove or Trivial Pursuit" (Acad. Press, (1987), 113-117).
164 =head1 APPENDIX
166 The following documentation describes the various functions
167 contained in this module. Some functions are for internal
168 use and are not meant to be called by the user; they are
169 preceded by an underscore ("_").
171 =cut
176 #### END of main POD documentation.
181 package Bio::Tools::Sigcleave;
183 use Bio::PrimarySeq;
185 use base qw(Bio::Root::Root);
186 use strict;
187 use vars qw ($ID %WeightTable_euc %WeightTable_pro );
188 $ID = 'Bio::Tools::Sigcleave';
190 %WeightTable_euc = (
191 #Sample: 161 aligned sequences
192 # R -13 -12 -11 -10 -9 -8 -7 -6 -5 -4 -3 -2 -1 +1 +2 Expect
193 'A' => [16, 13, 14, 15, 20, 18, 18, 17, 25, 15, 47, 6, 80, 18, 6, 14.5],
194 'C' => [ 3, 6, 9, 7, 9, 14, 6, 8, 5, 6, 19, 3, 9, 8, 3, 4.5],
195 'D' => [ 0, 0, 0, 0, 0, 0, 0, 0, 5, 3, 0, 5, 0, 10, 11, 8.9],
196 'E' => [ 0, 0, 0, 1, 0, 0, 0, 0, 3, 7, 0, 7, 0, 13, 14, 10.0],
197 'F' => [13, 9, 11, 11, 6, 7, 18, 13, 4, 5, 0, 13, 0, 6, 4, 5.6],
198 'G' => [ 4, 4, 3, 6, 3, 13, 3, 2, 19, 34, 5, 7, 39, 10, 7, 12.1],
199 'H' => [ 0, 0, 0, 0, 0, 1, 1, 0, 5, 0, 0, 6, 0, 4, 2, 3.4],
200 'I' => [15, 15, 8, 6, 11, 5, 4, 8, 5, 1, 10, 5, 0, 8, 7, 7.4],
201 'K' => [ 0, 0, 0, 1, 0, 0, 1, 0, 0, 4, 0, 2, 0, 11, 9, 11.3],
202 'L' => [71, 68, 72, 79, 78, 45, 64, 49, 10, 23, 8, 20, 1, 8, 4, 12.1],
203 'M' => [ 0, 3, 7, 4, 1, 6, 2, 2, 0, 0, 0, 1, 0, 1, 2, 2.7],
204 'N' => [ 0, 1, 0, 1, 1, 0, 0, 0, 3, 3, 0, 10, 0, 4, 7, 7.1],
205 'P' => [ 2, 0, 2, 0, 0, 4, 1, 8, 20, 14, 0, 1, 3, 0, 22, 7.4],
206 'Q' => [ 0, 0, 0, 1, 0, 6, 1, 0, 10, 8, 0, 18, 3, 19, 10, 6.3],
207 'R' => [ 2, 0, 0, 0, 0, 1, 0, 0, 7, 4, 0, 15, 0, 12, 9, 7.6],
208 'S' => [ 9, 3, 8, 6, 13, 10, 15, 16, 26, 11, 23, 17, 20, 15, 10, 11.4],
209 'T' => [ 2, 10, 5, 4, 5, 13, 7, 7, 12, 6, 17, 8, 6, 3, 10, 9.7],
210 'V' => [20, 25, 15, 18, 13, 15, 11, 27, 0, 12, 32, 3, 0, 8, 17, 11.1],
211 'W' => [ 4, 3, 3, 1, 1, 2, 6, 3, 1, 3, 0, 9, 0, 2, 0, 1.8],
212 'Y' => [ 0, 1, 4, 0, 0, 1, 3, 1, 1, 2, 0, 5, 0, 1, 7, 5.6]
215 %WeightTable_pro = (
216 #Sample: 36 aligned sequences
217 # R -13 -12 -11 -10 -9 -8 -7 -6 -5 -4 -3 -2 -1 +1 +2 Expect
218 'A' => [0, 8, 8, 9, 6, 7, 5, 6, 7, 7, 24, 2, 31, 18, 4, 3.2],
219 'C' => [1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1.0],
220 'D' => [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 8, 2.0],
221 'E' => [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 4, 8, 2.2],
222 'F' => [2, 4, 3, 4, 1, 1, 8, 0, 4, 1, 0, 7, 0, 1, 0, 1.3],
223 'G' => [4, 2, 2, 2, 3, 5, 2, 4, 2, 2, 0, 2, 2, 1, 0, 2.7],
224 'H' => [0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 7, 0, 1, 0, 0.8],
225 'I' => [3, 1, 5, 1, 5, 0, 1, 3, 0, 0, 0, 0, 0, 0, 2, 1.7],
226 'K' => [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 3, 0, 2.5],
227 'L' => [8, 11, 9, 8, 9, 13, 1, 0, 2, 2, 1, 2, 0, 0, 1, 2.7],
228 'M' => [0, 2, 1, 1, 3, 2, 3, 0, 1, 2, 0, 4, 0, 0, 1, 0.6],
229 'N' => [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 3, 0, 1, 4, 1.6],
230 'P' => [0, 1, 1, 1, 1, 1, 2, 3, 5, 2, 0, 0, 0, 0, 5, 1.7],
231 'Q' => [0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 3, 0, 0, 1, 1.4],
232 'R' => [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1.7],
233 'S' => [1, 0, 1, 4, 4, 1, 5, 15, 5, 8, 5, 2, 2, 0, 0, 2.6],
234 'T' => [2, 0, 4, 2, 2, 2, 2, 2, 5, 1, 3, 0, 1, 1, 2, 2.2],
235 'V' => [5, 7, 1, 3, 1, 4, 7, 0, 0, 4, 3, 0, 0, 2, 0, 2.5],
236 'W' => [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0.4],
237 'Y' => [0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 1, 0, 0, 0, 1.3]
242 ## Now we calculate the _real_ values for the weight tables
245 ## yeah yeah yeah there is lots of math here that gets repeated
246 ## every single time a sigcleave object gets created. This is
247 ## a quick hack to make sure that we get the scores as accurate as
248 ## possible. Need all those significant digits....
250 ## suggestions for speedup aproaches welcome
254 foreach my $i (keys %WeightTable_euc) {
255 my $expected = $WeightTable_euc{$i}[15];
256 if ($expected > 0) {
257 for (my $j=0; $j<16; $j++) {
258 if ($WeightTable_euc{$i}[$j] == 0) {
259 $WeightTable_euc{$i}[$j] = 1;
260 if ($j == 10 || $j == 12) {
261 $WeightTable_euc{$i}[$j] = 1.e-10;
264 $WeightTable_euc{$i}[$j] = log($WeightTable_euc{$i}[$j]/$expected);
270 foreach my $i (keys %WeightTable_pro) {
271 my $expected = $WeightTable_pro{$i}[15];
272 if ($expected > 0) {
273 for (my $j=0; $j<16; $j++) {
274 if ($WeightTable_pro{$i}[$j] == 0) {
275 $WeightTable_pro{$i}[$j] = 1;
276 if ($j == 10 || $j == 12) {
277 $WeightTable_pro{$i}[$j] = 1.e-10;
280 $WeightTable_pro{$i}[$j] = log($WeightTable_pro{$i}[$j]/$expected);
285 #####################################################################################
286 ## CONSTRUCTOR ##
287 #####################################################################################
290 sub new {
291 my ($class, @args) = @_;
293 my $self = $class->SUPER::new(@args);
294 #my $self = Bio::Seq->new(@args);
296 my ($seq, $threshold, $matrix) = $self->_rearrange([qw(SEQ THRESHOLD MATRIX)],@args);
298 defined $threshold && $self->threshold($threshold);
299 $matrix && $self->matrix($matrix);
300 $seq && $self->seq($seq);
302 return $self;
307 =head1 threshold
309 Title : threshold
310 Usage : $value = $self->threshold
311 Purpose : Read/write method sigcleave score reporting threshold.
312 Returns : float.
313 Argument : new value, float
314 Throws : on non-number argument
315 Comments : defaults to 3.5
316 See Also : n/a
318 =cut
320 #----------------
321 sub threshold {
322 #----------------
323 my ($self, $value) = @_;
324 if( defined $value) {
325 $self->throw("I need a number, not [$value]")
326 if $value !~ /^[+-]?[\d\.]+$/;
327 $self->{'_threshold'} = $value;
329 return $self->{'_threshold'} || 3.5 ;
332 =head1 matrix
334 Title : matrix
335 Usage : $value = $self->matrix('procaryotic')
336 Purpose : Read/write method sigcleave matrix.
337 Returns : float.
338 Argument : new value: 'eucaryotic' or 'procaryotic'
339 Throws : on non-number argument
340 Comments : defaults to 3.5
341 See Also : n/a
343 =cut
345 #----------------
346 sub matrix {
347 #----------------
348 my ($self, $value) = @_;
349 if( defined $value) {
350 $self->throw("I need 'eucaryotic' or 'procaryotic', not [$value]")
351 unless $value eq 'eucaryotic' or $value eq 'procaryotic';
352 $self->{'_matrix'} = $value;
354 return $self->{'_matrix'} || 'eucaryotic' ;
357 =head1 seq
359 Title : seq
360 Usage : $value = $self->seq($seq_object)
361 Purpose : set the Seq object to be used
362 Returns : Seq object
363 Argument : protein sequence or Seq object
364 See Also : n/a
366 =cut
368 #----------------
369 sub seq {
370 #----------------
371 my ($self, $value) = @_;
372 if( defined $value) {
373 if ($value->isa('Bio::PrimarySeqI')) {
374 $self->{'_seq'} = $value;
375 } else {
376 $self->{'_seq'} = Bio::PrimarySeq->new(-seq => $value,
377 -alphabet => 'protein');
380 return $self->{'_seq'};
383 =head1 _Analyze
385 Title : _Analyze
386 Usage : N/A This is an internal method. Not meant to be called from outside
387 : the package
389 Purpose : calculates sigcleave score and amino acid position for the
390 : given protein sequence. The score reporting threshold can
391 : be adjusted by passing in the "threshold" parameter during
392 : object construction. If no threshold is passed in, the code
393 : defaults to reporting any scores equal to or above 3.5
395 Returns : nothing. results are added to the object
396 Argument : none.
397 Throws : nothing.
398 Comments : nothing.
399 See Also : n/a
401 =cut
403 #----------------
404 sub _Analyze {
405 #----------------
406 my($self) = @_;
408 my %signals;
409 my @hitWeight = ();
410 my @hitsort = ();
411 my @hitpos = ();
412 my $maxSite = "";
413 my $seqPos = "";
414 my $istart = "";
415 my $iend = "";
416 my $icol = "";
417 my $i = "";
418 my $weight = "";
419 my $k = 0;
420 my $c = 0;
421 my $seqBegin = 0;
422 my $pVal = -13;
423 my $nVal = 2;
424 my $nHits = 0;
425 my $seqEnd = $self->seq->length;
426 my $pep = $self->seq->seq;
427 my $minWeight = $self->threshold;
428 my $matrix = $self->matrix;
430 ## The weight table is keyed by UPPERCASE letters so we uppercase
431 ## the pep string because we don't want to alter the actual object
432 ## sequence.
434 $pep =~ tr/a-z/A-Z/;
436 for ($seqPos = $seqBegin; $seqPos < $seqEnd; $seqPos++) {
437 $istart = (0 > $seqPos + $pVal)? 0 : $seqPos + $pVal;
438 $iend = ($seqPos + $nVal - 1 < $seqEnd)? $seqPos + $nVal - 1 : $seqEnd;
439 $icol= $iend - $istart + 1;
440 $weight = 0.00;
441 for ($k=0; $k<$icol; $k++) {
442 $c = substr($pep, $istart + $k, 1);
444 ## CD: The if(defined) stuff was put in here because Sigcleave.pm
445 ## CD: kept getting warnings about undefined vals during 'make test' ...
446 if ($matrix eq 'eucaryotic') {
447 $weight += $WeightTable_euc{$c}[$k] if defined $WeightTable_euc{$c}[$k];
448 } else {
449 $weight += $WeightTable_pro{$c}[$k] if defined $WeightTable_pro{$c}[$k];
452 $signals{$seqPos+1} = sprintf ("%.1f", $weight) if $weight >= $minWeight;
454 $self->{"_signal_scores"} = { %signals };
458 =head1 signals
460 Title : signals
461 Usage : %sigcleave_results = $sig->signals;
463 Purpose : Accessor method for sigcleave results
465 Returns : Associative array. The key value represents the amino acid position
466 : and the value represents the score. Only scores that
467 : are greater than or equal to the THRESHOLD value are reported.
469 Argument : none.
470 Throws : none.
471 Comments : none.
472 See Also : THRESHOLD
474 =cut
476 #----------------
477 sub signals {
478 #----------------
479 my $self = shift;
480 my %results;
481 my $position;
483 # do the calculations
484 $self->_Analyze;
486 foreach $position ( sort keys %{ $self->{'_signal_scores'} } ) {
487 $results{$position} = $self->{'_signal_scores'}{$position};
489 return %results;
493 =head1 result_count
495 Title : result_count
496 Usage : $count = $sig->result_count;
498 Purpose : Accessor method for sigcleave results
500 Returns : Integer, number of results above the threshold
502 Argument : none.
503 Throws : none.
504 Comments : none.
506 See Also : THRESHOLD
508 =cut
510 #----------------
511 sub result_count {
512 #----------------
513 my $self = shift;
514 $self->_Analyze;
515 return keys %{ $self->{'_signal_scores'} };
519 =head1 pretty_print
521 Title : pretty_print
522 Usage : $output = $sig->pretty_print;
523 : print $sig->pretty_print;
525 Purpose : Emulates the output of the EGCG Sigcleave
526 : utility.
528 Returns : A formatted string.
529 Argument : none.
530 Throws : none.
531 Comments : none.
532 See Also : n/a
534 =cut
536 #----------------
537 sub pretty_print {
538 #----------------
539 my $self = shift;
540 my $pos;
541 my $output;
542 my $cnt = 1;
543 my %results = $self->signals;
544 my @hits = keys %results;
545 my $hitcount = $#hits; $hitcount++;
546 my $thresh = $self->threshold;
547 my $seqlen = $self->seq->length || 0;
548 my $name = $self->seq->id || 'NONAME';
549 my $pep = $self->seq->seq;
550 $pep =~ tr/a-z/A-Z/;
552 $output = "SIGCLEAVE of $name from: 1 to $seqlen\n\n";
554 if ($hitcount > 0) {
555 $output .= "Report scores over $thresh\n";
556 foreach $pos ((sort { $results{$b} cmp $results{$a} } keys %results)) {
557 my $start = $pos - 15;
558 $start = 1 if $start < 1;
559 my $sig = substr($pep,$start -1,$pos-$start );
561 $output .= sprintf ("Maximum score %1.1f at residue %3d\n",$results{$pos},$pos);
562 $output .= "\n";
563 $output .= " Sequence: ";
564 $output .= $sig;
565 $output .= "-" x (15- length($sig));
566 $output .= "-";
567 $output .= substr($pep,$pos-1,50);
568 $output .= "\n";
569 $output .= " " x 12;
570 $output .= "| \(signal\) | \(mature peptide\)\n";
571 $output .= sprintf(" %3d %3d\n\n",$start,$pos);
573 if (($hitcount > 1) && ($cnt == 1)) {
574 $output .= " Other entries above $thresh\n\n";
576 $cnt++;
579 $output;
584 __END__
587 #########################################################################
588 # End of class
589 #########################################################################