maint: restructure to use Dist::Zilla
[bioperl-live.git] / lib / Bio / Tools / OddCodes.pm
blob0aaaa9b60b507359d2f0b47306ca1beacf87ba97
1 #$Id$
2 #-----------------------------------------------------------------------------
3 # PACKAGE : OddCodes.pm
4 # PURPOSE : To write amino acid sequences in alternative alphabets
5 # AUTHOR : Derek Gatherer (D.Gatherer@organon.nhe.akzonobel.nl)
6 # SOURCE :
7 # CREATED : 8th July 2000
8 # MODIFIED :
9 # DISCLAIMER : I am employed in the pharmaceutical industry but my
10 # : employers do not endorse or sponsor this module
11 # : in any way whatsoever. The above email address is
12 # : given purely for the purpose of easy communication
13 # : with the author, and does not imply any connection
14 # : between my employers and anything written below.
15 # LICENCE : You may distribute this module under the same terms
16 # : as the rest of BioPerl.
17 #----------------------------------------------------------------------------
19 =head1 NAME
21 Bio::Tools::OddCodes - Object holding alternative alphabet coding for
22 one protein sequence
24 =head1 SYNOPSIS
26 # Take a sequence object from eg, an inputstream, and creates an
27 # object for the purposes of rewriting that sequence in another
28 # alphabet. These are abbreviated amino acid sequence alphabets,
29 # designed to simplify the statistical aspects of analysing protein
30 # sequences, by reducing the combinatorial explosion of the
31 # 20-letter alphabet. These abbreviated alphabets range in size
32 # from 2 to 8.
34 # Creating the OddCodes object, eg:
36 my $inputstream = Bio::SeqIO->new( '-file' => "seqfile",
37 '-format' => 'Fasta');
38 my $seqobj = $inputstream->next_seq();
39 my $oddcode_obj = Bio::Tools::OddCodes->new(-seq => $seqobj);
41 # or:
43 my $seqobj = Bio::PrimarySeq->new
44 (-seq=>'[cut and paste a sequence here]',
45 -alphabet => 'protein',
46 -id => 'test');
47 my $oddcode_obj = Bio::Tools::OddCodes->new(-seq => $seqobj);
49 # do the alternative coding, returning the answer as a reference to
50 # a string
52 my $output = $oddcode_obj->structural();
53 my $output = $oddcode_obj->chemical();
54 my $output = $oddcode_obj->functional();
55 my $output = $oddcode_obj->charge();
56 my $output = $oddcode_obj->hydrophobic();
57 my $output = $oddcode_obj->Dayhoff();
58 my $output = $oddcode_obj->Sneath();
59 my $output = $oddcode_obj->Stanfel();
62 # display sequence in new form, eg:
64 my $new_coding = $$output;
65 print "\n$new_coding";
67 =head1 DESCRIPTION
69 Bio::Tools::OddCodes is a welterweight object for rewriting a protein
70 sequence in an alternative alphabet. Eight of these are provided, ranging
71 from the 2-letter hydrophobic alphabet, to the 8-letter chemical
72 alphabet. These are useful for the statistical analysis of protein
73 sequences since they can partially avoid the combinatorial explosion
74 produced by the full 20-letter alphabet (eg. 400 dimers, 8000 trimers
75 etc.)
77 The objects will print out a warning if the input sequence is not a
78 protein. If you know what you are doing, you can silence the warning
79 by setting verbose() to a negative value.
81 See SYNOPSIS above for object creation code.
83 =head1 REFERENCES
85 Stanfel LE (1996) A new approach to clustering the amino acids. J. theor.
86 Biol. 183, 195-205.
88 Karlin S, Ost F and Blaisdell BE (1989) Patterns in DNA and amino acid
89 sequences and their statistical significance. Chapter 6 of: Mathematical
90 Methods for DNA Sequences. Waterman MS (ed.) CRC Press, Boca Raton , FL.
92 =head1 FEEDBACK
94 =head2 Mailing Lists
96 User feedback is an integral part of the evolution of this and other
97 Bioperl modules. Send your comments and suggestions preferably to one
98 of the Bioperl mailing lists. Your participation is much appreciated.
100 bioperl-l@bioperl.org - General discussion
101 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
103 =head2 Support
105 Please direct usage questions or support issues to the mailing list:
107 I<bioperl-l@bioperl.org>
109 rather than to the module maintainer directly. Many experienced and
110 responsive experts will be able to look at the problem and quickly
111 address it. Please include a thorough description of the problem
112 with code and data examples if at all possible.
114 =head2 Reporting Bugs
116 Report bugs to the Bioperl bug tracking system to help us keep track
117 of the bugs and their resolution. Bug reports can be submitted via the
118 web:
120 https://github.com/bioperl/bioperl-live/issues
122 =head1 AUTHOR
124 Derek Gatherer
126 =head1 APPENDIX
128 The rest of the documentation details each of the object methods.
129 Internal methods are usually preceded with a _
131 =cut
133 package Bio::Tools::OddCodes;
134 use strict;
137 use base qw(Bio::Root::Root);
# Constructor.
#
# Args   : the sequence object, either as the named parameter -seq or, when
#          no named parameter yields one, as a bare first argument that is
#          a reference.
# Returns: the new Bio::Tools::OddCodes object, with the sequence object
#          stored under the '_seqref' key.
# Throws : if the supplied value is not an object that isa Bio::PrimarySeqI
#          (including when nothing was supplied at all).
sub new {
    my ($class, @args) = @_;

    my $self = $class->SUPER::new(@args);

    my ($seqobj) = $self->_rearrange([qw(SEQ)], @args);
    if (!defined($seqobj) && @args && ref($args[0])) {
        # parameter not passed as named parameter?
        $seqobj = $args[0];
    }

    # blessed() guards the isa() call: calling a method on undef or on an
    # unblessed reference would die with a raw Perl error before we could
    # raise our own exception.
    require Scalar::Util;
    unless (Scalar::Util::blessed($seqobj)
            && $seqobj->isa("Bio::PrimarySeqI")) {
        $self->throw("Bio::Tools::OddCodes only works on PrimarySeqI objects");
    }

    $self->{'_seqref'} = $seqobj;

    return $self;
}
160 =head2 structural
162 Title : structural
163 Usage : $output = $oddcode_obj->structural();
164 Function: turns amino acid sequence into 3-letter structural alphabet
165 : A (ambivalent), E (external), I (internal)
166 Example : a sequence ACDEFGH will become AAEEIAE
167 Returns : Reference to the new sequence string
168 Args : none
170 =cut
# Recode the sequence into the 3-letter structural alphabet:
#   A (ambivalent) = ACGPSTWY
#   E (external)   = RNDQEHK
#   I (internal)   = ILMFV
# Any other character is passed through unchanged.
#
# Args   : none
# Returns: reference to the recoded sequence string
sub structural {
    my ($self) = @_;
    my $seqstring = $self->_pullseq();    # validated, upper-cased sequence

    # One simultaneous pass. tr/// takes literal character lists, not regex
    # classes, so no [] (they would be translated as well), and mapping
    # directly avoids digit placeholders that collide with digits in the input.
    $seqstring =~ tr{ACGPSTWYRNDQEHKILMFV}
                    {AAAAAAAAEEEEEEEIIIII};

    return \$seqstring;
}
188 # and that's that one
191 =head2 functional
193 Title : functional
194 Usage : $output = $oddcode_obj->functional();
195 Function: turns amino acid sequence into 4-letter functional alphabet
196 : A (acidic), C (basic), H (hydrophobic), P (polar)
197 Example : a sequence ACDEFGH will become HPAAHPC
198 Returns : Reference to the new sequence string
199 Args : none
201 =cut
# Recode the sequence into the 4-letter functional alphabet:
#   A (acidic)      = DE
#   C (basic)       = HKR
#   H (hydrophobic) = AFILMPVW
#   P (polar)       = CGNQSTY
# Any other character is passed through unchanged.
#
# Args   : none
# Returns: reference to the recoded sequence string
sub functional {
    my ($self) = @_;
    my $seqstring = $self->_pullseq();    # validated, upper-cased sequence

    # One simultaneous pass. tr/// takes literal character lists, not regex
    # classes, so no [] (they would be translated as well), and mapping
    # directly avoids digit placeholders that collide with digits in the input.
    $seqstring =~ tr{DEHKRAFILMPVWCGNQSTY}
                    {AACCCHHHHHHHHPPPPPPP};

    return \$seqstring;
}
221 # and that's that one
224 =head2 hydrophobic
226 Title : hydrophobic
227 Usage : $output = $oddcode_obj->hydrophobic();
228 Function: turns amino acid sequence into 2-letter hydrophobicity alphabet
229 : I (hydrophobic), O (hydrophilic)
230 Example : a sequence ACDEFGH will become IOOOIOO
231 Returns : Reference to the new sequence string
232 Args : none
234 =cut
# Recode the sequence into the 2-letter hydrophobicity alphabet:
#   I = AFILMPVW      (the hydrophobic residues)
#   O = CDEGHKNQRSTY  (the hydrophilic residues)
# Any other character is passed through unchanged.
#
# Args   : none
# Returns: reference to the recoded sequence string
sub hydrophobic {
    my ($self) = @_;
    my $seqstring = $self->_pullseq();    # validated, upper-cased sequence

    # One simultaneous pass. tr/// takes literal character lists, not regex
    # classes, so no [] (they would be translated as well), and mapping
    # directly avoids digit placeholders that collide with digits in the input.
    $seqstring =~ tr{AFILMPVWCDEGHKNQRSTY}
                    {IIIIIIIIOOOOOOOOOOOO};

    return \$seqstring;
}
250 # and that's that one
253 =head2 Dayhoff
255 Title : Dayhoff
256 Usage : $output = $oddcode_obj->Dayhoff();
257 Function: turns amino acid sequence into 6-letter Dayhoff alphabet
258 Example : a sequence ACDEFGH will become CADDGCE
259 : A (=C), C (=AGPST), D (=DENQ),
260 : E (=HKR), F (=ILMV), G (=FWY)
261 Returns : Reference to the new sequence string
262 Args : none
264 =cut
# Recode the sequence into the 6-letter Dayhoff alphabet:
#   A = C,   C = AGPST, D = DENQ,
#   E = HKR, F = ILMV,  G = FWY
# Any other character is passed through unchanged.
#
# Args   : none
# Returns: reference to the recoded sequence string
sub Dayhoff {
    my ($self) = @_;
    my $seqstring = $self->_pullseq();    # validated, upper-cased sequence

    # One simultaneous pass: tr/// translates each character exactly once,
    # so C->A and A->C cannot interfere with each other. tr/// takes literal
    # character lists, not regex classes, so no [] (they would be translated
    # as well), and no digit placeholders that collide with digits in the input.
    $seqstring =~ tr{CAGPSTDENQHKRILMVFWY}
                    {ACCCCCDDDDEEEFFFFGGG};

    return \$seqstring;
}
288 # and that's that one
291 =head2 Sneath
293 Title : Sneath
294 Usage : $output = $oddcode_obj->Sneath();
295 Function: turns amino acid sequence into 7-letter Sneath alphabet
296 Example : a sequence ACDEFGH will become CEFFHCH
297 : A (=ILV), C (=AGP), D (=MNQ), E (=CST),
298 : F (=DE), G (=KR), H (=FHWY)
299 Returns : Reference to the new sequence string
300 Args : none
302 =cut
# Recode the sequence into the 7-letter Sneath alphabet:
#   A = ILV, C = AGP, D = MNQ, E = CST,
#   F = DE,  G = KR,  H = FHWY
# Any other character is passed through unchanged.
#
# Args   : none
# Returns: reference to the recoded sequence string
sub Sneath {
    my ($self) = @_;
    my $seqstring = $self->_pullseq();    # validated, upper-cased sequence

    # One simultaneous pass: tr/// translates each character exactly once,
    # so overlapping source/target letters cannot cascade. tr/// takes
    # literal character lists, not regex classes, so no [] (they would be
    # translated as well), and no digit placeholders that collide with
    # digits in the input.
    $seqstring =~ tr{ILVAGPMNQCSTDEKRFHWY}
                    {AAACCCDDDEEEFFGGHHHH};

    return \$seqstring;
}
328 # and that's that one
331 =head2 Stanfel
333 Title : Stanfel
334 Usage : $output = $oddcode_obj->Stanfel();
335 Function: turns amino acid sequence into 4-letter Stanfel alphabet
336 Example : a sequence ACDEFGH will become AACCDAE
337 : A (=ACGILMPSTV), C (=DENQ), D (=FWY), E (=HKR)
338 Returns : Reference to the new sequence string
339 Args : none
341 =cut
# Recode the sequence into the 4-letter Stanfel alphabet:
#   A = ACGILMPSTV, C = DENQ, D = FWY, E = HKR
# Any other character is passed through unchanged.
#
# Args   : none
# Returns: reference to the recoded sequence string
sub Stanfel {
    my ($self) = @_;
    my $seqstring = $self->_pullseq();    # validated, upper-cased sequence

    # One simultaneous pass: tr/// translates each character exactly once,
    # so overlapping source/target letters cannot cascade. tr/// takes
    # literal character lists, not regex classes, so no [] (they would be
    # translated as well), and no digit placeholders that collide with
    # digits in the input.
    $seqstring =~ tr{ACGILMPSTVDENQFWYHKR}
                    {AAAAAAAAAACCCCDDDEEE};

    return \$seqstring;
}
361 # and that's that one
364 =head2 chemical
366 Title : chemical
367 Usage : $output = $oddcode_obj->chemical();
368 Function: turns amino acid sequence into 8-letter chemical alphabet
369 : A (acidic), L (aliphatic), M (amide), R (aromatic)
370 : C (basic), H (hydroxyl), I (imino), S (sulphur)
371 Example : a sequence ACDEFGH will become LSAARLC
372 Returns : Reference to the new sequence string
373 Args : none
375 =cut
# Recode the sequence into the 8-letter chemical alphabet:
#   A (acidic)   = DE    L (aliphatic) = AGILV
#   M (amide)    = NQ    R (aromatic)  = FWY
#   C (basic)    = RHK   H (hydroxyl)  = ST
#   I (imino)    = P     S (sulphur)   = CM
# Any other character is passed through unchanged.
#
# Args   : none
# Returns: reference to the recoded sequence string
sub chemical {
    my ($self) = @_;
    my $seqstring = $self->_pullseq();    # validated, upper-cased sequence

    # One simultaneous pass: tr/// translates each character exactly once,
    # so overlapping source/target letters cannot cascade. tr/// takes
    # literal character lists, not regex classes, so no [] (they would be
    # translated as well), and no digit placeholders that collide with
    # digits in the input.
    $seqstring =~ tr{DEAGILVNQFWYRHKSTPCM}
                    {AALLLLLMMRRRCCCHHISS};

    return \$seqstring;
}
403 # and that's that one
406 =head2 charge
408 Title : charge
409 Usage : $output = $oddcode_obj->charge();
410 Function: turns amino acid sequence into 3-letter charge alphabet
411 Example : a sequence ACDEFGH will become NNAANNC
412 : A (negative; NOT anode), C (positive; NOT cathode), N (neutral)
413 Returns : Reference to the new sequence string
414 Args : none
416 =cut
# Recode the sequence into the 3-letter charge alphabet:
#   A (negative) = DE
#   C (positive) = HKR
#   N (neutral)  = ACFGILMNPQSTVWY
# Any other character is passed through unchanged.
#
# Args   : none
# Returns: reference to the recoded sequence string
sub charge {
    my ($self) = @_;
    my $seqstring = $self->_pullseq();    # validated, upper-cased sequence

    # One simultaneous pass: tr/// translates each character exactly once,
    # so overlapping source/target letters cannot cascade. tr/// takes
    # literal character lists, not regex classes, so no [] (they would be
    # translated as well), and no digit placeholders that collide with
    # digits in the input.
    $seqstring =~ tr{DEHKRACFGILMNPQSTVWY}
                    {AACCCNNNNNNNNNNNNNNN};

    return \$seqstring;
}
434 # and that's that one
437 # _pullseq is called within each of the subroutines
438 # it just checks a few things and returns the sequence
# Shared front end for the recoding methods: checks the stored sequence
# object and hands back its sequence as an upper-cased string.
#
# Returns: the upper-cased sequence string
# Throws : if the stored object is not a Bio::PrimarySeqI, or if the
#          sequence string is empty
# Warns  : when the alphabet is not 'protein', unless verbose() is negative
sub _pullseq {
    my ($self) = @_;
    my $stored = $self->{'_seqref'};

    if (!$stored->isa("Bio::PrimarySeqI")) {
        $self->throw("die, OddCodes works only on PrimarySeqI objects\n");
    }

    # verbose() is only consulted when the alphabet check has already failed.
    if ($stored->alphabet() ne 'protein' and not $self->verbose() < 0) {
        $self->warn("\tAll OddCode alphabets need a protein sequence,\n".
                    "\tbut BioPerl thinks this is not: [". $stored->id() . "]");
    }

    my $seqstring = uc $stored->seq();

    $self->throw("$seqstring: die, sequence has zero length\n")
        if length($seqstring) < 1;

    return $seqstring;
}