sync w/ main trunk
[bioperl-live.git] / Bio / Tools / IUPAC.pm
blob70ba4ac4d91923e14b8e4556bdd78de02a681e58
1 # $Id$
3 # BioPerl module for IUPAC
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Aaron Mackey <amackey@virginia.edu>
9 # Copyright Aaron Mackey
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
15 =head1 NAME
17 Bio::Tools::IUPAC - Generates unique Seq objects from an ambiguous Seq object
19 =head1 SYNOPSIS
21 use Bio::Seq;
22 use Bio::Tools::IUPAC;
24 my $ambiseq = Bio::Seq->new(-seq => 'ARTCGUTGR', -alphabet => 'dna');
25 my $stream = Bio::Tools::IUPAC->new(-seq => $ambiseq);
27 while ($uniqueseq = $stream->next_seq()) {
28 # process the unique Seq object.
31 =head1 DESCRIPTION
33 IUPAC is a tool that produces a stream of unique, "strict"-satisfying Seq
34 objects from an ambiquous Seq object (containing non-standard characters given
35 the meaning shown below)
37 Extended DNA / RNA alphabet :
38 (includes symbols for nucleotide ambiguity)
39 ------------------------------------------
40 Symbol Meaning Nucleic Acid
41 ------------------------------------------
42 A A Adenine
43 C C Cytosine
44 G G Guanine
45 T T Thymine
46 U U Uracil
47 M A or C
48 R A or G
49 W A or T
50 S C or G
51 Y C or T
52 K G or T
53 V A or C or G
54 H A or C or T
55 D A or G or T
56 B C or G or T
57 X G or A or T or C
58 N G or A or T or C
60 IUPAC-IUB SYMBOLS FOR NUCLEOTIDE NOMENCLATURE:
61 Cornish-Bowden (1985) Nucl. Acids Res. 13: 3021-3030.
63 -----------------------------------
65 Amino Acid alphabet:
66 ------------------------------------------
67 Symbol Meaning
68 ------------------------------------------
69 A Alanine
70 B Aspartic Acid, Asparagine
71 C Cystine
72 D Aspartic Acid
73 E Glutamic Acid
74 F Phenylalanine
75 G Glycine
76 H Histidine
77 I Isoleucine
78 J Isoleucine/Leucine
79 K Lysine
80 L Leucine
81 M Methionine
82 N Asparagine
83 O Pyrrolysine
84 P Proline
85 Q Glutamine
86 R Arginine
87 S Serine
88 T Threonine
89 U Selenocysteine
90 V Valine
91 W Tryptophan
92 X Unknown
93 Y Tyrosine
94 Z Glutamic Acid, Glutamine
95 * Terminator
98 IUPAC-IUP AMINO ACID SYMBOLS:
99 Biochem J. 1984 Apr 15; 219(2): 345-373
100 Eur J Biochem. 1993 Apr 1; 213(1): 2
102 =head1 FEEDBACK
104 =head2 Mailing Lists
106 User feedback is an integral part of the evolution of this and other
107 Bioperl modules. Send your comments and suggestions preferably to one
108 of the Bioperl mailing lists. Your participation is much appreciated.
110 bioperl-l@bioperl.org - General discussion
111 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
113 =head2 Support
115 Please direct usage questions or support issues to the mailing list:
117 L<bioperl-l@bioperl.org>
119 rather than to the module maintainer directly. Many experienced and
120 reponsive experts will be able look at the problem and quickly
121 address it. Please include a thorough description of the problem
122 with code and data examples if at all possible.
124 =head2 Reporting Bugs
126 Report bugs to the Bioperl bug tracking system to help us keep track
127 the bugs and their resolution. Bug reports can be submitted via the
128 web:
130 http://bugzilla.open-bio.org/
132 =head1 AUTHOR - Aaron Mackey
134 Email amackey-at-virginia.edu
136 =head1 APPENDIX
138 The rest of the documentation details each of the object
139 methods. Internal methods are usually preceded with a _
141 =cut
144 # Let the code begin...
146 package Bio::Tools::IUPAC;
148 use strict;
149 use vars qw(%IUP %IUB %REV_IUB $AUTOLOAD);
151 BEGIN {
152 %IUB = ( A => [qw(A)],
153 C => [qw(C)],
154 G => [qw(G)],
155 T => [qw(T)],
156 U => [qw(U)],
157 M => [qw(A C)],
158 R => [qw(A G)],
159 W => [qw(A T)],
160 S => [qw(C G)],
161 Y => [qw(C T)],
162 K => [qw(G T)],
163 V => [qw(A C G)],
164 H => [qw(A C T)],
165 D => [qw(A G T)],
166 B => [qw(C G T)],
167 X => [qw(G A T C)],
168 N => [qw(G A T C)]
170 %REV_IUB = (A => 'A',
171 T => 'T',
172 C => 'C',
173 G => 'G',
174 AC => 'M',
175 AG => 'R',
176 AT => 'W',
177 CG => 'S',
178 CT => 'Y',
179 'GT' => 'K',
180 ACG => 'V',
181 ACT => 'H',
182 AGT => 'D',
183 CGT => 'B',
184 ACGT=> 'N',
185 N => 'N'
189 %IUP = (A => [qw(A)],
190 B => [qw(D N)],
191 C => [qw(C)],
192 D => [qw(D)],
193 E => [qw(E)],
194 F => [qw(F)],
195 G => [qw(G)],
196 H => [qw(H)],
197 I => [qw(I)],
198 J => [qw(I L)],
199 K => [qw(K)],
200 L => [qw(L)],
201 M => [qw(M)],
202 N => [qw(N)],
203 O => [qw(O)],
204 P => [qw(P)],
205 Q => [qw(Q)],
206 R => [qw(R)],
207 S => [qw(S)],
208 T => [qw(T)],
209 U => [qw(U)],
210 V => [qw(V)],
211 W => [qw(W)],
212 X => [qw(X)],
213 Y => [qw(Y)],
214 Z => [qw(E Q)],
215 '*' => ['*']
219 use base qw(Bio::Root::Root);
221 =head2 new
223 Title : new
224 Usage : Bio::Tools::IUPAC->new( $seq)
225 Function: returns a new seq stream (akin to SeqIO)
226 Returns : a Bio::Tools::IUPAC stream object that will produce unique
227 Seq objects on demand.
228 Args : an ambiguously coded Seq.pm object that has a specified 'alphabet'
231 =cut
234 sub new {
235 my($class,@args) = @_;
236 my $self = $class->SUPER::new(@args);
238 my ($seq) = $self->_rearrange([qw(SEQ)],@args);
239 if((! defined($seq)) && @args && ref($args[0])) {
240 # parameter not passed as named parameter?
241 $seq = $args[0];
243 $seq->isa('Bio::Seq') or
244 $self->throw("Must supply a Seq.pm object to IUPAC!");
245 $self->{'_SeqObj'} = $seq;
246 if ($self->{'_SeqObj'}->alphabet() =~ m/^[dr]na$/i ) {
247 # nucleotide seq object
248 $self->{'_alpha'} = [ map { $IUB{uc($_)} }
249 split('', $self->{'_SeqObj'}->seq()) ];
250 } elsif ($self->{'_SeqObj'}->alphabet() =~ m/^protein$/i ) {
251 # amino acid seq object
252 $self->{'_alpha'} = [ map { $IUP{uc($_)} }
253 split('', $self->{'_SeqObj'}->seq()) ];
254 } else { # unknown type: we could make a guess, but let's not.
255 $self->throw("You must specify the 'type' of sequence provided to IUPAC");
257 $self->{'_string'} = [(0) x length($self->{'_SeqObj'}->seq())];
258 scalar @{$self->{'_string'}} or $self->throw("Sequence has zero-length!");
259 $self->{'_string'}->[0] = -1;
260 return $self;
263 =head2 next_seq
265 Title : next_seq
266 Usage : $iupac->next_seq()
267 Function: returns the next unique Seq object
268 Returns : a Seq.pm object
269 Args : none.
272 =cut
274 sub next_seq{
275 my ($self) = @_;
277 for my $i ( 0 .. $#{$self->{'_string'}} ) {
278 next unless $self->{'_string'}->[$i] || @{$self->{'_alpha'}->[$i]} > 1;
279 if ( $self->{'_string'}->[$i] == $#{$self->{'_alpha'}->[$i]} ) { # rollover
280 if ( $i == $#{$self->{'_string'}} ) { # end of possibilities
281 return;
282 } else {
283 $self->{'_string'}->[$i] = 0;
284 next;
286 } else {
287 $self->{'_string'}->[$i]++;
288 my $j = -1;
289 $self->{'_SeqObj'}->seq(join('', map { $j++; $self->{'_alpha'}->[$j]->[$_]; } @{$self->{'_string'}}));
290 my $desc = $self->{'_SeqObj'}->desc();
291 if ( !defined $desc ) { $desc = ""; }
293 $self->{'_num'}++;
294 1 while $self->{'_num'} =~ s/(\d)(\d\d\d)(?!\d)/$1,$2/;
295 $desc =~ s/( \[Bio::Tools::IUPAC-generated\sunique sequence # [^\]]*\])|$/ \[Bio::Tools::IUPAC-generated unique sequence # $self->{'_num'}\]/;
296 $self->{'_SeqObj'}->desc($desc);
297 $self->{'_num'} =~ s/,//g;
298 return $self->{'_SeqObj'};
303 =head2 iupac_iup
305 Title : iupac_iup
306 Usage : my %aasymbols = $iupac->iupac_iup
307 Function: Returns a hash of PROTEIN symbols -> symbol components
308 Returns : Hash
309 Args : none
311 =cut
313 sub iupac_iup{
314 return %IUP;
318 =head2 iupac_iub
320 Title : iupac_iub
321 Usage : my %dnasymbols = $iupac->iupac_iub
322 Function: Returns a hash of DNA symbols -> symbol components
323 Returns : Hash
324 Args : none
326 =cut
328 sub iupac_iub{
329 return %IUB;
332 =head2 iupac_rev_iub
334 Title : iupac_rev_iub
335 Usage : my %dnasymbols = $iupac->iupac_rev_iub
336 Function: Returns a hash of nucleotide combinations -> IUPAC code
337 (a reverse of the iupac_iub hash).
338 Returns : Hash
339 Args : none
341 =cut
343 sub iupac_rev_iub{
344 return %REV_IUB;
347 =head2 count
349 Title : count
350 Usage : my $total = $iupac->count();
351 Function: Calculates the number of unique, unambiguous sequences that
352 this ambiguous sequence could generate
353 Return : int
354 Args : none
356 =cut
358 sub count {
359 my ($self) = @_;
361 my $count = 1;
362 $count *= scalar(@$_) for (@{$self->{'_alpha'}});
363 return $count;
367 sub AUTOLOAD {
369 my $self = shift @_;
370 my $method = $AUTOLOAD;
371 $method =~ s/.*:://;
372 return $self->{'_SeqObj'}->$method(@_)
373 unless $method eq 'DESTROY';