3 # BioPerl module for IUPAC
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Aaron Mackey <amackey@virginia.edu>
9 # Copyright Aaron Mackey
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
17 Bio::Tools::IUPAC - Generates unique Seq objects from an ambiguous Seq object
22 use Bio::Tools::IUPAC;
24 my $ambiseq = Bio::Seq->new(-seq => 'ARTCGUTGR', -alphabet => 'dna');
25 my $stream = Bio::Tools::IUPAC->new(-seq => $ambiseq);
27 while ($uniqueseq = $stream->next_seq()) {
28 # process the unique Seq object.
33 IUPAC is a tool that produces a stream of unique, "strict"-satisfying Seq
34 objects from an ambiquous Seq object (containing non-standard characters given
35 the meaning shown below)
37 Extended DNA / RNA alphabet :
38 (includes symbols for nucleotide ambiguity)
39 ------------------------------------------
40 Symbol Meaning Nucleic Acid
41 ------------------------------------------
60 IUPAC-IUB SYMBOLS FOR NUCLEOTIDE NOMENCLATURE:
61 Cornish-Bowden (1985) Nucl. Acids Res. 13: 3021-3030.
63 -----------------------------------
66 ------------------------------------------
68 ------------------------------------------
70 B Aspartic Acid, Asparagine
94 Z Glutamic Acid, Glutamine
98 IUPAC-IUP AMINO ACID SYMBOLS:
99 Biochem J. 1984 Apr 15; 219(2): 345-373
100 Eur J Biochem. 1993 Apr 1; 213(1): 2
106 User feedback is an integral part of the evolution of this and other
107 Bioperl modules. Send your comments and suggestions preferably to one
108 of the Bioperl mailing lists. Your participation is much appreciated.
110 bioperl-l@bioperl.org - General discussion
111 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
115 Please direct usage questions or support issues to the mailing list:
117 L<bioperl-l@bioperl.org>
119 rather than to the module maintainer directly. Many experienced and
120 reponsive experts will be able look at the problem and quickly
121 address it. Please include a thorough description of the problem
122 with code and data examples if at all possible.
124 =head2 Reporting Bugs
126 Report bugs to the Bioperl bug tracking system to help us keep track
127 the bugs and their resolution. Bug reports can be submitted via the
130 http://bugzilla.open-bio.org/
132 =head1 AUTHOR - Aaron Mackey
134 Email amackey-at-virginia.edu
138 The rest of the documentation details each of the object
139 methods. Internal methods are usually preceded with a _
144 # Let the code begin...
146 package Bio
::Tools
::IUPAC
;
149 use vars
qw(%IUP %IUB %REV_IUB $AUTOLOAD);
152 %IUB = ( A => [qw(A)],
170 %REV_IUB = (A
=> 'A',
189 %IUP = (A
=> [qw(A)],
219 use base
qw(Bio::Root::Root);
224 Usage : Bio::Tools::IUPAC->new( $seq)
225 Function: returns a new seq stream (akin to SeqIO)
226 Returns : a Bio::Tools::IUPAC stream object that will produce unique
227 Seq objects on demand.
228 Args : an ambiguously coded Seq.pm object that has a specified 'alphabet'
235 my($class,@args) = @_;
236 my $self = $class->SUPER::new
(@args);
238 my ($seq) = $self->_rearrange([qw(SEQ)],@args);
239 if((! defined($seq)) && @args && ref($args[0])) {
240 # parameter not passed as named parameter?
243 $seq->isa('Bio::Seq') or
244 $self->throw("Must supply a Seq.pm object to IUPAC!");
245 $self->{'_SeqObj'} = $seq;
246 if ($self->{'_SeqObj'}->alphabet() =~ m/^[dr]na$/i ) {
247 # nucleotide seq object
248 $self->{'_alpha'} = [ map { $IUB{uc($_)} }
249 split('', $self->{'_SeqObj'}->seq()) ];
250 } elsif ($self->{'_SeqObj'}->alphabet() =~ m/^protein$/i ) {
251 # amino acid seq object
252 $self->{'_alpha'} = [ map { $IUP{uc($_)} }
253 split('', $self->{'_SeqObj'}->seq()) ];
254 } else { # unknown type: we could make a guess, but let's not.
255 $self->throw("You must specify the 'type' of sequence provided to IUPAC");
257 $self->{'_string'} = [(0) x
length($self->{'_SeqObj'}->seq())];
258 scalar @
{$self->{'_string'}} or $self->throw("Sequence has zero-length!");
259 $self->{'_string'}->[0] = -1;
266 Usage : $iupac->next_seq()
267 Function: returns the next unique Seq object
268 Returns : a Seq.pm object
277 for my $i ( 0 .. $#{$self->{'_string'}} ) {
278 next unless $self->{'_string'}->[$i] || @
{$self->{'_alpha'}->[$i]} > 1;
279 if ( $self->{'_string'}->[$i] == $#{$self->{'_alpha'}->[$i]} ) { # rollover
280 if ( $i == $#{$self->{'_string'}} ) { # end of possibilities
283 $self->{'_string'}->[$i] = 0;
287 $self->{'_string'}->[$i]++;
289 $self->{'_SeqObj'}->seq(join('', map { $j++; $self->{'_alpha'}->[$j]->[$_]; } @
{$self->{'_string'}}));
290 my $desc = $self->{'_SeqObj'}->desc();
291 if ( !defined $desc ) { $desc = ""; }
294 1 while $self->{'_num'} =~ s/(\d)(\d\d\d)(?!\d)/$1,$2/;
295 $desc =~ s/( \[Bio::Tools::IUPAC-generated\sunique sequence # [^\]]*\])|$/ \[Bio::Tools::IUPAC-generated unique sequence # $self->{'_num'}\]/;
296 $self->{'_SeqObj'}->desc($desc);
297 $self->{'_num'} =~ s/,//g;
298 return $self->{'_SeqObj'};
306 Usage : my %aasymbols = $iupac->iupac_iup
307 Function: Returns a hash of PROTEIN symbols -> symbol components
321 Usage : my %dnasymbols = $iupac->iupac_iub
322 Function: Returns a hash of DNA symbols -> symbol components
334 Title : iupac_rev_iub
335 Usage : my %dnasymbols = $iupac->iupac_rev_iub
336 Function: Returns a hash of nucleotide combinations -> IUPAC code
337 (a reverse of the iupac_iub hash).
350 Usage : my $total = $iupac->count();
351 Function: Calculates the number of unique, unambiguous sequences that
352 this ambiguous sequence could generate
362 $count *= scalar(@
$_) for (@
{$self->{'_alpha'}});
370 my $method = $AUTOLOAD;
372 return $self->{'_SeqObj'}->$method(@_)
373 unless $method eq 'DESTROY';