t/AlignIO/AlignIO.t: fix number of tests in plan (fixup c523e6bed866)
[bioperl-live.git] / Bio / Tools / IUPAC.pm
blob1d33986ca7bdba041f2458e5ad26256d8191f056
2 # BioPerl module for IUPAC
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Aaron Mackey <amackey@virginia.edu>
8 # Copyright Aaron Mackey
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::Tools::IUPAC - Generates unique sequence objects or regular expressions from
17 an ambiguous IUPAC sequence
19 =head1 SYNOPSIS
21 use Bio::PrimarySeq;
22 use Bio::Tools::IUPAC;
24 # Get the IUPAC code for proteins
25 my %iupac_prot = Bio::Tools::IUPAC->new->iupac_iup;
27 # Create a sequence with degenerate residues
28 my $ambiseq = Bio::PrimarySeq->new(-seq => 'ARTCGUTGN', -alphabet => 'dna');
30 # Create all possible non-degenerate sequences
31 my $iupac = Bio::Tools::IUPAC->new(-seq => $ambiseq);
32 while ($uniqueseq = $iupac->next_seq()) {
33 # process the unique Bio::Seq object.
36 # Get a regular expression that matches all possible sequences
37 my $regexp = $iupac->regexp();
39 =head1 DESCRIPTION
41 Bio::Tools::IUPAC is a tool that manipulates sequences with ambiguous residues
42 following the IUPAC conventions. Non-standard characters have the meaning
43 described below:
45 IUPAC-IUB SYMBOLS FOR NUCLEOTIDE (DNA OR RNA) NOMENCLATURE:
46 Cornish-Bowden (1985) Nucl. Acids Res. 13: 3021-3030
48 ---------------------------------------------------------------
49 Symbol Meaning Nucleic Acid
50 ---------------------------------------------------------------
51 A A Adenine
52 C C Cytosine
53 G G Guanine
54 T T Thymine
55 U U Uracil
56 M A or C aMino
57 R A or G puRine
58 W A or T Weak
59 S C or G Strong
60 Y C or T pYrimidine
61 K G or T Keto
62 V A or C or G not T (closest unused char after T)
63 H A or C or T not G (closest unused char after G)
64 D A or G or T not C (closest unused char after C)
65 B C or G or T not A (closest unused char after A)
66 X G or A or T or C Unknown (very rarely used)
67 N G or A or T or C Unknown (commonly used)
70 IUPAC-IUP AMINO ACID SYMBOLS:
71 Biochem J. 1984 Apr 15; 219(2): 345-373
72 Eur J Biochem. 1993 Apr 1; 213(1): 2
74 ------------------------------------------
75 Symbol Meaning
76 ------------------------------------------
77 A Alanine
78 B Aspartic Acid, Asparagine
79 C Cysteine
80 D Aspartic Acid
81 E Glutamic Acid
82 F Phenylalanine
83 G Glycine
84 H Histidine
85 I Isoleucine
86 J Isoleucine/Leucine
87 K Lysine
88 L Leucine
89 M Methionine
90 N Asparagine
91 O Pyrrolysine
92 P Proline
93 Q Glutamine
94 R Arginine
95 S Serine
96 T Threonine
97 U Selenocysteine
98 V Valine
99 W Tryptophan
100 X Unknown
101 Y Tyrosine
102 Z Glutamic Acid, Glutamine
103 * Terminator
105 There are a few things Bio::Tools::IUPAC can do for you:
107 =over
109 =item *
111 report the IUPAC mapping between ambiguous and non-ambiguous residues
113 =item *
115 produce a stream of all possible corresponding unambiguous Bio::Seq objects given
116 an ambiguous sequence object
118 =item *
120 convert an ambiguous sequence object to a corresponding regular expression
122 =back
124 =head1 FEEDBACK
126 =head2 Mailing Lists
128 User feedback is an integral part of the evolution of this and other
129 Bioperl modules. Send your comments and suggestions preferably to one
130 of the Bioperl mailing lists. Your participation is much appreciated.
132 bioperl-l@bioperl.org - General discussion
133 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
135 =head2 Support
137 Please direct usage questions or support issues to the mailing list:
139 I<bioperl-l@bioperl.org>
141 rather than to the module maintainer directly. Many experienced and
142 reponsive experts will be able look at the problem and quickly
143 address it. Please include a thorough description of the problem
144 with code and data examples if at all possible.
146 =head2 Reporting Bugs
148 Report bugs to the Bioperl bug tracking system to help us keep track
149 the bugs and their resolution. Bug reports can be submitted via the
150 web:
152 https://github.com/bioperl/bioperl-live/issues
154 =head1 AUTHOR - Aaron Mackey
156 Email amackey-at-virginia.edu
158 =head1 APPENDIX
160 The rest of the documentation details each of the object
161 methods. Internal methods are usually preceded with a _
163 =cut
166 package Bio::Tools::IUPAC;
168 use strict;
169 use base qw(Bio::Root::Root);
170 use vars qw(%IUB %IUB_AMB %REV_IUB %IUP %IUP_AMB $AUTOLOAD);
172 BEGIN {
173 # Ambiguous nucleic residues are matched to unambiguous residues
174 %IUB = (
175 A => [qw(A)],
176 C => [qw(C)],
177 G => [qw(G)],
178 T => [qw(T)],
179 U => [qw(U)],
180 M => [qw(A C)],
181 R => [qw(A G)],
182 S => [qw(C G)],
183 W => [qw(A T)],
184 Y => [qw(C T)],
185 K => [qw(G T)],
186 V => [qw(A C G)],
187 H => [qw(A C T)],
188 D => [qw(A G T)],
189 B => [qw(C G T)],
190 N => [qw(A C G T)],
191 X => [qw(A C G T)],
194 # Same as %IUB but ambiguous residues are matched to ambiguous residues only
195 %IUB_AMB = (
196 M => [qw(M)],
197 R => [qw(R)],
198 W => [qw(W)],
199 S => [qw(S)],
200 Y => [qw(Y)],
201 K => [qw(K)],
202 V => [qw(M R S V)],
203 H => [qw(H M W Y)],
204 D => [qw(D K R W)],
205 B => [qw(B K S Y)],
206 N => [qw(B D H K M N R S V W Y)],
209 # The inverse of %IUB
210 %REV_IUB = (
211 A => 'A',
212 T => 'T',
213 U => 'U',
214 C => 'C',
215 G => 'G',
216 AC => 'M',
217 AG => 'R',
218 AT => 'W',
219 CG => 'S',
220 CT => 'Y',
221 GT => 'K',
222 ACG => 'V',
223 ACT => 'H',
224 AGT => 'D',
225 CGT => 'B',
226 ACGT => 'N',
227 N => 'N'
230 # Same thing with proteins now
231 %IUP = (
232 A => [qw(A)],
233 B => [qw(D N)],
234 C => [qw(C)],
235 D => [qw(D)],
236 E => [qw(E)],
237 F => [qw(F)],
238 G => [qw(G)],
239 H => [qw(H)],
240 I => [qw(I)],
241 J => [qw(I L)],
242 K => [qw(K)],
243 L => [qw(L)],
244 M => [qw(M)],
245 N => [qw(N)],
246 O => [qw(O)],
247 P => [qw(P)],
248 Q => [qw(Q)],
249 R => [qw(R)],
250 S => [qw(S)],
251 T => [qw(T)],
252 U => [qw(U)],
253 V => [qw(V)],
254 W => [qw(W)],
255 X => [qw(X)],
256 Y => [qw(Y)],
257 Z => [qw(E Q)],
258 '*' => [qw(*)],
261 %IUP_AMB = (
262 B => [qw(B)],
263 J => [qw(J)],
264 Z => [qw(Z)],
270 =head2 new
272 Title : new
273 Usage : Bio::Tools::IUPAC->new($seq);
274 Function: Create a new IUPAC object, which acts as a sequence stream (akin to
275 SeqIO)
276 Args : an ambiguously coded sequence object that has a specified 'alphabet'
277 Returns : a Bio::Tools::IUPAC object.
279 =cut
281 sub new {
282 my ($class,@args) = @_;
283 my $self = $class->SUPER::new(@args);
284 my ($seq) = $self->_rearrange([qw(SEQ)],@args);
286 if ( (not defined $seq) && @args && ref($args[0]) ) {
287 # parameter not passed as named parameter?
288 $seq = $args[0];
291 if (defined $seq) {
292 if (not $seq->isa('Bio::PrimarySeqI')) {
293 $self->throw('Must supply a sequence object');
295 if (length $seq->seq == 0) {
296 $self->throw('Sequence had zero-length');
298 $self->{'_seq'} = $seq;
301 return $self;
305 sub _initialize {
306 my ($self) = @_;
307 my %iupac = $self->iupac;
308 $self->{'_alpha'} = [ map { $iupac{uc $_} } split('', $self->{'_seq'}->seq) ];
309 $self->{'_string'} = [(0) x length($self->{'_seq'}->seq())];
310 $self->{'_string'}->[0] = -1;
314 =head2 next_seq
316 Title : next_seq
317 Usage : $iupac->next_seq();
318 Function: returns the next unique sequence object
319 Args : none.
320 Returns : a Bio::Seq object
322 =cut
324 sub next_seq {
325 my ($self) = @_;
327 if (not exists $self->{'_string'}) {
328 $self->_initialize();
331 for my $i ( 0 .. $#{$self->{'_string'}} ) {
332 next unless $self->{'_string'}->[$i] || @{$self->{'_alpha'}->[$i]} > 1;
333 if ( $self->{'_string'}->[$i] == $#{$self->{'_alpha'}->[$i]} ) { # rollover
334 if ( $i == $#{$self->{'_string'}} ) { # end of possibilities
335 return;
336 } else {
337 $self->{'_string'}->[$i] = 0;
338 next;
340 } else {
341 $self->{'_string'}->[$i]++;
342 my $j = -1;
343 my $seqstr = join('', map { $j++; $self->{'_alpha'}->[$j]->[$_]; } @{$self->{'_string'}});
344 my $desc = $self->{'_seq'}->desc() || '';
345 $self->{'_num'}++;
346 1 while $self->{'_num'} =~ s/(\d)(\d\d\d)(?!\d)/$1,$2/;
347 $desc =~ s/( \[Bio::Tools::IUPAC-generated\sunique sequence # [^\]]*\])|$/ \[Bio::Tools::IUPAC-generated unique sequence # $self->{'_num'}\]/;
348 $self->{'_num'} =~ s/,//g;
350 # Return a fresh sequence object
351 return Bio::PrimarySeq->new(-seq => $seqstr, -desc => $desc);
357 =head2 iupac
359 Title : iupac
360 Usage : my %symbols = $iupac->iupac;
361 Function: Returns a hash of symbols -> symbol components of the right type
362 for the given sequence, i.e. it is the same as iupac_iup() if
363 Bio::Tools::IUPAC was given a proteic sequence, or iupac_iub() if the
364 sequence was nucleic. For example, the key 'M' has the value ['A', 'C'].
365 Args : none
366 Returns : Hash
368 =cut
370 sub iupac {
371 my ($self) = @_;
372 my $alphabet = lc( $self->{'_seq'}->alphabet() );
373 if ( ($alphabet eq 'dna') or ($alphabet eq 'rna') ) {
374 return %IUB; # nucleic
375 } elsif ( $alphabet eq 'protein' ) {
376 return %IUP; # proteic
377 } else {
378 $self->throw("The input sequence had the unknown alphabet '$alphabet'\n");
384 =head2 iupac_amb
386 Title : iupac_amb
387 Usage : my %symbols = $iupac->iupac_amb;
388 Function: Same as iupac() but only contains a mapping between ambiguous residues
389 and the ambiguous residues they map to. For example, the key 'N' has
390 the value ['R', 'Y', 'K', 'M', 'S', 'W', 'B', 'D', 'H', 'V', 'N'],
391 i.e. it matches all other ambiguous residues.
392 Args : none
393 Returns : Hash
395 =cut
397 sub iupac_amb {
398 my ($self) = @_;
399 my $alphabet = lc( $self->{'_seq'}->alphabet() );
400 if ( ($alphabet eq 'dna') or ($alphabet eq 'rna') ) {
401 return %IUB_AMB; # nucleic
402 } elsif ( $alphabet eq 'protein' ) {
403 return %IUP_AMB; # proteic
404 } else {
405 $self->throw("The input sequence had the unknown alphabet '$alphabet'\n");
410 =head2 iupac_iup
412 Title : iupac_iup
413 Usage : my %aasymbols = $iupac->iupac_iup;
414 Function: Returns a hash of PROTEIN symbols -> non-ambiguous symbol components
415 Args : none
416 Returns : Hash
418 =cut
420 sub iupac_iup {
421 return %IUP;
425 =head2 iupac_iup_amb
427 Title : iupac_iup_amb
428 Usage : my %aasymbols = $iupac->iupac_iup_amb;
429 Function: Returns a hash of PROTEIN symbols -> ambiguous symbol components
430 Args : none
431 Returns : Hash
433 =cut
435 sub iupac_iup_amb {
436 return %IUP_AMB;
440 =head2 iupac_iub
442 Title : iupac_iub
443 Usage : my %dnasymbols = $iupac->iupac_iub;
444 Function: Returns a hash of DNA symbols -> non-ambiguous symbol components
445 Args : none
446 Returns : Hash
448 =cut
450 sub iupac_iub {
451 return %IUB;
455 =head2 iupac_iub_amb
457 Title : iupac_iub_amb
458 Usage : my %dnasymbols = $iupac->iupac_iub;
459 Function: Returns a hash of DNA symbols -> ambiguous symbol components
460 Args : none
461 Returns : Hash
463 =cut
465 sub iupac_iub_amb {
466 return %IUB_AMB;
470 =head2 iupac_rev_iub
472 Title : iupac_rev_iub
473 Usage : my %dnasymbols = $iupac->iupac_rev_iub;
474 Function: Returns a hash of nucleotide combinations -> IUPAC code
475 (a reverse of the iupac_iub hash).
476 Args : none
477 Returns : Hash
479 =cut
481 sub iupac_rev_iub {
482 return %REV_IUB;
486 =head2 count
488 Title : count
489 Usage : my $total = $iupac->count();
490 Function: Calculates the number of unique, unambiguous sequences that
491 this ambiguous sequence could generate
492 Args : none
493 Return : int
495 =cut
497 sub count {
498 my ($self) = @_;
499 if (not exists $self->{'_string'}) {
500 $self->_initialize();
502 my $count = 1;
503 $count *= scalar(@$_) for (@{$self->{'_alpha'}});
504 return $count;
508 =head2 regexp
510 Title : regexp
511 Usage : my $re = $iupac->regexp();
512 Function: Converts the ambiguous sequence into a regular expression that
513 matches all of the corresponding ambiguous and non-ambiguous sequences.
514 You can further manipulate the resulting regular expression with the
515 Bio::Tools::SeqPattern module. After you are done building your
516 regular expression, you might want to compile it and make it case-
517 insensitive:
518 $re = qr/$re/i;
519 Args : 1 to match RNA: T and U characters will match interchangeably
520 Return : regular expression
522 =cut
524 sub regexp {
525 my ($self, $match_rna) = @_;
526 my $re;
527 my $seq = $self->{'_seq'}->seq;
528 my %iupac = $self->iupac;
529 my %iupac_amb = $self->iupac_amb;
530 for my $pos (0 .. length($seq)-1) {
531 my $res = substr $seq, $pos, 1;
532 my $iupacs = $iupac{$res};
533 my $iupacs_amb = $iupac_amb{$res} || [];
534 if (not defined $iupacs) {
535 $self->throw("Primer sequence '$seq' is not a valid IUPAC sequence.".
536 " Offending character was '$res'.\n");
538 my $part = join '', (@$iupacs, @$iupacs_amb);
539 if ($match_rna) {
540 $part =~ s/T/TU/i || $part =~ s/U/TU/i;
542 if (length $part > 1) {
543 $part = '['.$part.']';
545 $re .= $part;
547 return $re;
551 sub AUTOLOAD {
552 my $self = shift @_;
553 my $method = $AUTOLOAD;
554 $method =~ s/.*:://;
555 return $self->{'_seq'}->$method(@_)
556 unless $method eq 'DESTROY';