2 # BioPerl module for IUPAC
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Aaron Mackey <amackey@virginia.edu>
8 # Copyright Aaron Mackey
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::Tools::IUPAC - Generates unique sequence objects or regular expressions from
17 an ambiguous IUPAC sequence
22 use Bio::Tools::IUPAC;
24 # Get the IUPAC code for proteins
25 my %iupac_prot = Bio::Tools::IUPAC->new->iupac_iup;
27 # Create a sequence with degenerate residues
28 my $ambiseq = Bio::PrimarySeq->new(-seq => 'ARTCGUTGN', -alphabet => 'dna');
30 # Create all possible non-degenerate sequences
31 my $iupac = Bio::Tools::IUPAC->new(-seq => $ambiseq);
32 while ($uniqueseq = $iupac->next_seq()) {
33 # process the unique Bio::Seq object.
36 # Get a regular expression that matches all possible sequences
37 my $regexp = $iupac->regexp();
41 Bio::Tools::IUPAC is a tool that manipulates sequences with ambiguous residues
42 following the IUPAC conventions. Non-standard characters have the meaning
45 IUPAC-IUB SYMBOLS FOR NUCLEOTIDE (DNA OR RNA) NOMENCLATURE:
46 Cornish-Bowden (1985) Nucl. Acids Res. 13: 3021-3030
48 ---------------------------------------------------------------
49 Symbol Meaning Nucleic Acid
50 ---------------------------------------------------------------
62 V A or C or G not T (closest unused char after T)
63 H A or C or T not G (closest unused char after G)
64 D A or G or T not C (closest unused char after C)
65 B C or G or T not A (closest unused char after A)
66 X G or A or T or C Unknown (very rarely used)
67 N G or A or T or C Unknown (commonly used)
70 IUPAC-IUP AMINO ACID SYMBOLS:
71 Biochem J. 1984 Apr 15; 219(2): 345-373
72 Eur J Biochem. 1993 Apr 1; 213(1): 2
74 ------------------------------------------
76 ------------------------------------------
78 B Aspartic Acid, Asparagine
102 Z Glutamic Acid, Glutamine
105 There are a few things Bio::Tools::IUPAC can do for you:
111 report the IUPAC mapping between ambiguous and non-ambiguous residues
115 produce a stream of all possible corresponding unambiguous Bio::Seq objects given
116 an ambiguous sequence object
120 convert an ambiguous sequence object to a corresponding regular expression
128 User feedback is an integral part of the evolution of this and other
129 Bioperl modules. Send your comments and suggestions preferably to one
130 of the Bioperl mailing lists. Your participation is much appreciated.
132 bioperl-l@bioperl.org - General discussion
133 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
137 Please direct usage questions or support issues to the mailing list:
139 I<bioperl-l@bioperl.org>
141 rather than to the module maintainer directly. Many experienced and
142 reponsive experts will be able look at the problem and quickly
143 address it. Please include a thorough description of the problem
144 with code and data examples if at all possible.
146 =head2 Reporting Bugs
148 Report bugs to the Bioperl bug tracking system to help us keep track
149 the bugs and their resolution. Bug reports can be submitted via the
152 https://github.com/bioperl/bioperl-live/issues
154 =head1 AUTHOR - Aaron Mackey
156 Email amackey-at-virginia.edu
160 The rest of the documentation details each of the object
161 methods. Internal methods are usually preceded with a _
166 package Bio
::Tools
::IUPAC
;
169 use base
qw(Bio::Root::Root);
171 # Ambiguous nucleic residues are matched to unambiguous residues
192 # Same as %IUB but ambiguous residues are matched to ambiguous residues only
204 N
=> [qw(B D H K M N R S V W Y)],
207 # The inverse of %IUB
228 # Same thing with proteins now
269 Usage : Bio::Tools::IUPAC->new($seq);
270 Function: Create a new IUPAC object, which acts as a sequence stream (akin to
272 Args : an ambiguously coded sequence object that has a specified 'alphabet'
273 Returns : a Bio::Tools::IUPAC object.
278 my ($class,@args) = @_;
279 my $self = $class->SUPER::new
(@args);
280 my ($seq) = $self->_rearrange([qw(SEQ)],@args);
282 if ( (not defined $seq) && @args && ref($args[0]) ) {
283 # parameter not passed as named parameter?
288 if (not $seq->isa('Bio::PrimarySeqI')) {
289 $self->throw('Must supply a sequence object');
291 if (length $seq->seq == 0) {
292 $self->throw('Sequence had zero-length');
294 $self->{'_seq'} = $seq;
303 my %iupac = $self->iupac;
304 $self->{'_alpha'} = [ map { $iupac{uc $_} } split('', $self->{'_seq'}->seq) ];
305 $self->{'_string'} = [(0) x
length($self->{'_seq'}->seq())];
306 $self->{'_string'}->[0] = -1;
313 Usage : $iupac->next_seq();
314 Function: returns the next unique sequence object
316 Returns : a Bio::Seq object
323 if (not exists $self->{'_string'}) {
324 $self->_initialize();
327 for my $i ( 0 .. $#{$self->{'_string'}} ) {
328 next unless $self->{'_string'}->[$i] || @
{$self->{'_alpha'}->[$i]} > 1;
329 if ( $self->{'_string'}->[$i] == $#{$self->{'_alpha'}->[$i]} ) { # rollover
330 if ( $i == $#{$self->{'_string'}} ) { # end of possibilities
333 $self->{'_string'}->[$i] = 0;
337 $self->{'_string'}->[$i]++;
339 my $seqstr = join('', map { $j++; $self->{'_alpha'}->[$j]->[$_]; } @
{$self->{'_string'}});
340 my $desc = $self->{'_seq'}->desc() || '';
342 1 while $self->{'_num'} =~ s/(\d)(\d\d\d)(?!\d)/$1,$2/;
343 $desc =~ s/( \[Bio::Tools::IUPAC-generated\sunique sequence # [^\]]*\])|$/ \[Bio::Tools::IUPAC-generated unique sequence # $self->{'_num'}\]/;
344 $self->{'_num'} =~ s/,//g;
346 # Return a fresh sequence object
347 return Bio
::PrimarySeq
->new(-seq
=> $seqstr, -desc
=> $desc);
356 Usage : my %symbols = $iupac->iupac;
357 Function: Returns a hash of symbols -> symbol components of the right type
358 for the given sequence, i.e. it is the same as iupac_iup() if
359 Bio::Tools::IUPAC was given a proteic sequence, or iupac_iub() if the
360 sequence was nucleic. For example, the key 'M' has the value ['A', 'C'].
368 my $alphabet = lc( $self->{'_seq'}->alphabet() );
369 if ( ($alphabet eq 'dna') or ($alphabet eq 'rna') ) {
370 return %IUB; # nucleic
371 } elsif ( $alphabet eq 'protein' ) {
372 return %IUP; # proteic
374 $self->throw("The input sequence had the unknown alphabet '$alphabet'\n");
383 Usage : my %symbols = $iupac->iupac_amb;
384 Function: Same as iupac() but only contains a mapping between ambiguous residues
385 and the ambiguous residues they map to. For example, the key 'N' has
386 the value ['R', 'Y', 'K', 'M', 'S', 'W', 'B', 'D', 'H', 'V', 'N'],
387 i.e. it matches all other ambiguous residues.
395 my $alphabet = lc( $self->{'_seq'}->alphabet() );
396 if ( ($alphabet eq 'dna') or ($alphabet eq 'rna') ) {
397 return %IUB_AMB; # nucleic
398 } elsif ( $alphabet eq 'protein' ) {
399 return %IUP_AMB; # proteic
401 $self->throw("The input sequence had the unknown alphabet '$alphabet'\n");
409 Usage : my %aasymbols = $iupac->iupac_iup;
410 Function: Returns a hash of PROTEIN symbols -> non-ambiguous symbol components
423 Title : iupac_iup_amb
424 Usage : my %aasymbols = $iupac->iupac_iup_amb;
425 Function: Returns a hash of PROTEIN symbols -> ambiguous symbol components
439 Usage : my %dnasymbols = $iupac->iupac_iub;
440 Function: Returns a hash of DNA symbols -> non-ambiguous symbol components
453 Title : iupac_iub_amb
454 Usage : my %dnasymbols = $iupac->iupac_iub;
455 Function: Returns a hash of DNA symbols -> ambiguous symbol components
468 Title : iupac_rev_iub
469 Usage : my %dnasymbols = $iupac->iupac_rev_iub;
470 Function: Returns a hash of nucleotide combinations -> IUPAC code
471 (a reverse of the iupac_iub hash).
485 Usage : my $total = $iupac->count();
486 Function: Calculates the number of unique, unambiguous sequences that
487 this ambiguous sequence could generate
495 if (not exists $self->{'_string'}) {
496 $self->_initialize();
499 $count *= scalar(@
$_) for (@
{$self->{'_alpha'}});
507 Usage : my $re = $iupac->regexp();
508 Function: Converts the ambiguous sequence into a regular expression that
509 matches all of the corresponding ambiguous and non-ambiguous sequences.
510 You can further manipulate the resulting regular expression with the
511 Bio::Tools::SeqPattern module. After you are done building your
512 regular expression, you might want to compile it and make it case-
515 Args : 1 to match RNA: T and U characters will match interchangeably
516 Return : regular expression
521 my ($self, $match_rna) = @_;
523 my $seq = $self->{'_seq'}->seq;
524 my %iupac = $self->iupac;
525 my %iupac_amb = $self->iupac_amb;
526 for my $pos (0 .. length($seq)-1) {
527 my $res = substr $seq, $pos, 1;
528 my $iupacs = $iupac{$res};
529 my $iupacs_amb = $iupac_amb{$res} || [];
530 if (not defined $iupacs) {
531 $self->throw("Primer sequence '$seq' is not a valid IUPAC sequence.".
532 " Offending character was '$res'.\n");
534 my $part = join '', (@
$iupacs, @
$iupacs_amb);
536 $part =~ s/T/TU/i || $part =~ s/U/TU/i;
538 if (length $part > 1) {
539 $part = '['.$part.']';
550 my $method = $AUTOLOAD;
552 return $self->{'_seq'}->$method(@_)
553 unless $method eq 'DESTROY';