small update
[bioperl-live.git] / Bio / Tools / IUPAC.pm
blob8e6aa43e75ab43f07bb43ad7c68e5b302796aac3
1 # $Id$
3 # BioPerl module for IUPAC
5 # Cared for by Aaron Mackey <amackey@virginia.edu>
7 # Copyright Aaron Mackey
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 Bio::Tools::IUPAC - Generates unique Seq objects from an ambiguous Seq object
17 =head1 SYNOPSIS
19 use Bio::Seq;
20 use Bio::Tools::IUPAC;
22 my $ambiseq = Bio::Seq->new(-seq => 'ARTCGUTGR', -alphabet => 'dna');
23 my $stream = Bio::Tools::IUPAC->new(-seq => $ambiseq);
25 while ($uniqueseq = $stream->next_seq()) {
26 # process the unique Seq object.
29 =head1 DESCRIPTION
31 IUPAC is a tool that produces a stream of unique, "strict"-satisfying Seq
32 objects from an ambiquous Seq object (containing non-standard characters given
33 the meaning shown below)
35 Extended DNA / RNA alphabet :
36 (includes symbols for nucleotide ambiguity)
37 ------------------------------------------
38 Symbol Meaning Nucleic Acid
39 ------------------------------------------
40 A A Adenine
41 C C Cytosine
42 G G Guanine
43 T T Thymine
44 U U Uracil
45 M A or C
46 R A or G
47 W A or T
48 S C or G
49 Y C or T
50 K G or T
51 V A or C or G
52 H A or C or T
53 D A or G or T
54 B C or G or T
55 X G or A or T or C
56 N G or A or T or C
58 IUPAC-IUB SYMBOLS FOR NUCLEOTIDE NOMENCLATURE:
59 Cornish-Bowden (1985) Nucl. Acids Res. 13: 3021-3030.
61 -----------------------------------
63 Amino Acid alphabet:
64 ------------------------------------------
65 Symbol Meaning
66 ------------------------------------------
67 A Alanine
68 B Aspartic Acid, Asparagine
69 C Cystine
70 D Aspartic Acid
71 E Glutamic Acid
72 F Phenylalanine
73 G Glycine
74 H Histidine
75 I Isoleucine
76 J Isoleucine/Leucine
77 K Lysine
78 L Leucine
79 M Methionine
80 N Asparagine
81 O Pyrrolysine
82 P Proline
83 Q Glutamine
84 R Arginine
85 S Serine
86 T Threonine
87 U Selenocysteine
88 V Valine
89 W Tryptophan
90 X Unknown
91 Y Tyrosine
92 Z Glutamic Acid, Glutamine
93 * Terminator
96 IUPAC-IUP AMINO ACID SYMBOLS:
97 Biochem J. 1984 Apr 15; 219(2): 345-373
98 Eur J Biochem. 1993 Apr 1; 213(1): 2
100 =head1 FEEDBACK
102 =head2 Mailing Lists
104 User feedback is an integral part of the evolution of this and other
105 Bioperl modules. Send your comments and suggestions preferably to one
106 of the Bioperl mailing lists. Your participation is much appreciated.
108 bioperl-l@bioperl.org - General discussion
109 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
111 =head2 Reporting Bugs
113 Report bugs to the Bioperl bug tracking system to help us keep track
114 the bugs and their resolution. Bug reports can be submitted via the
115 web:
117 http://bugzilla.open-bio.org/
119 =head1 AUTHOR - Aaron Mackey
121 Email amackey-at-virginia.edu
123 =head1 APPENDIX
125 The rest of the documentation details each of the object
126 methods. Internal methods are usually preceded with a _
128 =cut
131 # Let the code begin...
133 package Bio::Tools::IUPAC;
135 use strict;
136 use vars qw(%IUP %IUB %REV_IUB $AUTOLOAD);
138 BEGIN {
139 %IUB = ( A => [qw(A)],
140 C => [qw(C)],
141 G => [qw(G)],
142 T => [qw(T)],
143 U => [qw(U)],
144 M => [qw(A C)],
145 R => [qw(A G)],
146 W => [qw(A T)],
147 S => [qw(C G)],
148 Y => [qw(C T)],
149 K => [qw(G T)],
150 V => [qw(A C G)],
151 H => [qw(A C T)],
152 D => [qw(A G T)],
153 B => [qw(C G T)],
154 X => [qw(G A T C)],
155 N => [qw(G A T C)]
157 %REV_IUB = (A => 'A',
158 T => 'T',
159 C => 'C',
160 G => 'G',
161 AC => 'M',
162 AG => 'R',
163 AT => 'W',
164 CG => 'S',
165 CT => 'Y',
166 'GT' => 'K',
167 ACG => 'V',
168 ACT => 'H',
169 AGT => 'D',
170 CGT => 'B',
171 ACGT=> 'N',
172 N => 'N'
176 %IUP = (A => [qw(A)],
177 B => [qw(D N)],
178 C => [qw(C)],
179 D => [qw(D)],
180 E => [qw(E)],
181 F => [qw(F)],
182 G => [qw(G)],
183 H => [qw(H)],
184 I => [qw(I)],
185 J => [qw(I L)],
186 K => [qw(K)],
187 L => [qw(L)],
188 M => [qw(M)],
189 N => [qw(N)],
190 O => [qw(O)],
191 P => [qw(P)],
192 Q => [qw(Q)],
193 R => [qw(R)],
194 S => [qw(S)],
195 T => [qw(T)],
196 U => [qw(U)],
197 V => [qw(V)],
198 W => [qw(W)],
199 X => [qw(X)],
200 Y => [qw(Y)],
201 Z => [qw(E Q)],
202 '*' => ['*']
206 use base qw(Bio::Root::Root);
208 =head2 new
210 Title : new
211 Usage : Bio::Tools::IUPAC->new( $seq)
212 Function: returns a new seq stream (akin to SeqIO)
213 Returns : a Bio::Tools::IUPAC stream object that will produce unique
214 Seq objects on demand.
215 Args : an ambiguously coded Seq.pm object that has a specified 'alphabet'
218 =cut
221 sub new {
222 my($class,@args) = @_;
223 my $self = $class->SUPER::new(@args);
225 my ($seq) = $self->_rearrange([qw(SEQ)],@args);
226 if((! defined($seq)) && @args && ref($args[0])) {
227 # parameter not passed as named parameter?
228 $seq = $args[0];
230 $seq->isa('Bio::Seq') or
231 $self->throw("Must supply a Seq.pm object to IUPAC!");
232 $self->{'_SeqObj'} = $seq;
233 if ($self->{'_SeqObj'}->alphabet() =~ m/^[dr]na$/i ) {
234 # nucleotide seq object
235 $self->{'_alpha'} = [ map { $IUB{uc($_)} }
236 split('', $self->{'_SeqObj'}->seq()) ];
237 } elsif ($self->{'_SeqObj'}->alphabet() =~ m/^protein$/i ) {
238 # amino acid seq object
239 $self->{'_alpha'} = [ map { $IUP{uc($_)} }
240 split('', $self->{'_SeqObj'}->seq()) ];
241 } else { # unknown type: we could make a guess, but let's not.
242 $self->throw("You must specify the 'type' of sequence provided to IUPAC");
244 $self->{'_string'} = [(0) x length($self->{'_SeqObj'}->seq())];
245 scalar @{$self->{'_string'}} or $self->throw("Sequence has zero-length!");
246 $self->{'_string'}->[0] = -1;
247 return $self;
250 =head2 next_seq
252 Title : next_seq
253 Usage : $iupac->next_seq()
254 Function: returns the next unique Seq object
255 Returns : a Seq.pm object
256 Args : none.
259 =cut
261 sub next_seq{
262 my ($self) = @_;
264 for my $i ( 0 .. $#{$self->{'_string'}} ) {
265 next unless $self->{'_string'}->[$i] || @{$self->{'_alpha'}->[$i]} > 1;
266 if ( $self->{'_string'}->[$i] == $#{$self->{'_alpha'}->[$i]} ) { # rollover
267 if ( $i == $#{$self->{'_string'}} ) { # end of possibilities
268 return;
269 } else {
270 $self->{'_string'}->[$i] = 0;
271 next;
273 } else {
274 $self->{'_string'}->[$i]++;
275 my $j = -1;
276 $self->{'_SeqObj'}->seq(join('', map { $j++; $self->{'_alpha'}->[$j]->[$_]; } @{$self->{'_string'}}));
277 my $desc = $self->{'_SeqObj'}->desc();
278 if ( !defined $desc ) { $desc = ""; }
280 $self->{'_num'}++;
281 1 while $self->{'_num'} =~ s/(\d)(\d\d\d)(?!\d)/$1,$2/;
282 $desc =~ s/( \[Bio::Tools::IUPAC-generated\sunique sequence # [^\]]*\])|$/ \[Bio::Tools::IUPAC-generated unique sequence # $self->{'_num'}\]/;
283 $self->{'_SeqObj'}->desc($desc);
284 $self->{'_num'} =~ s/,//g;
285 return $self->{'_SeqObj'};
290 =head2 iupac_iup
292 Title : iupac_iup
293 Usage : my %aasymbols = $iupac->iupac_iup
294 Function: Returns a hash of PROTEIN symbols -> symbol components
295 Returns : Hash
296 Args : none
298 =cut
300 sub iupac_iup{
301 return %IUP;
305 =head2 iupac_iub
307 Title : iupac_iub
308 Usage : my %dnasymbols = $iupac->iupac_iub
309 Function: Returns a hash of DNA symbols -> symbol components
310 Returns : Hash
311 Args : none
313 =cut
315 sub iupac_iub{
316 return %IUB;
319 =head2 iupac_rev_iub
321 Title : iupac_rev_iub
322 Usage : my %dnasymbols = $iupac->iupac_rev_iub
323 Function: Returns a hash of nucleotide combinations -> IUPAC code
324 (a reverse of the iupac_iub hash).
325 Returns : Hash
326 Args : none
328 =cut
330 sub iupac_rev_iub{
331 return %REV_IUB;
334 =head2 count
336 Title : count
337 Usage : my $total = $iupac->count();
338 Function: Calculates the number of unique, unambiguous sequences that
339 this ambiguous sequence could generate
340 Return : int
341 Args : none
343 =cut
345 sub count {
346 my ($self) = @_;
348 my $count = 1;
349 $count *= scalar(@$_) for (@{$self->{'_alpha'}});
350 return $count;
354 sub AUTOLOAD {
356 my $self = shift @_;
357 my $method = $AUTOLOAD;
358 $method =~ s/.*:://;
359 return $self->{'_SeqObj'}->$method(@_)
360 unless $method eq 'DESTROY';