Bio::Tools::CodonTable and Bio::Tools::IUPAC: use our and drop BEGIN blocks.
[bioperl-live.git] / lib / Bio / Tools / IUPAC.pm
blob5f2b5b9003011fca292f8012b8e047599b83b65e
2 # BioPerl module for IUPAC
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Aaron Mackey <amackey@virginia.edu>
8 # Copyright Aaron Mackey
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::Tools::IUPAC - Generates unique sequence objects or regular expressions from
17 an ambiguous IUPAC sequence
19 =head1 SYNOPSIS
21 use Bio::PrimarySeq;
22 use Bio::Tools::IUPAC;
24 # Get the IUPAC code for proteins
25 my %iupac_prot = Bio::Tools::IUPAC->new->iupac_iup;
27 # Create a sequence with degenerate residues
28 my $ambiseq = Bio::PrimarySeq->new(-seq => 'ARTCGUTGN', -alphabet => 'dna');
30 # Create all possible non-degenerate sequences
31 my $iupac = Bio::Tools::IUPAC->new(-seq => $ambiseq);
32 while ($uniqueseq = $iupac->next_seq()) {
33 # process the unique Bio::Seq object.
36 # Get a regular expression that matches all possible sequences
37 my $regexp = $iupac->regexp();
39 =head1 DESCRIPTION
41 Bio::Tools::IUPAC is a tool that manipulates sequences with ambiguous residues
42 following the IUPAC conventions. Non-standard characters have the meaning
43 described below:
45 IUPAC-IUB SYMBOLS FOR NUCLEOTIDE (DNA OR RNA) NOMENCLATURE:
46 Cornish-Bowden (1985) Nucl. Acids Res. 13: 3021-3030
48 ---------------------------------------------------------------
49 Symbol Meaning Nucleic Acid
50 ---------------------------------------------------------------
51 A A Adenine
52 C C Cytosine
53 G G Guanine
54 T T Thymine
55 U U Uracil
56 M A or C aMino
57 R A or G puRine
58 W A or T Weak
59 S C or G Strong
60 Y C or T pYrimidine
61 K G or T Keto
62 V A or C or G not T (closest unused char after T)
63 H A or C or T not G (closest unused char after G)
64 D A or G or T not C (closest unused char after C)
65 B C or G or T not A (closest unused char after A)
66 X G or A or T or C Unknown (very rarely used)
67 N G or A or T or C Unknown (commonly used)
70 IUPAC-IUP AMINO ACID SYMBOLS:
71 Biochem J. 1984 Apr 15; 219(2): 345-373
72 Eur J Biochem. 1993 Apr 1; 213(1): 2
74 ------------------------------------------
75 Symbol Meaning
76 ------------------------------------------
77 A Alanine
78 B Aspartic Acid, Asparagine
79 C Cysteine
80 D Aspartic Acid
81 E Glutamic Acid
82 F Phenylalanine
83 G Glycine
84 H Histidine
85 I Isoleucine
86 J Isoleucine/Leucine
87 K Lysine
88 L Leucine
89 M Methionine
90 N Asparagine
91 O Pyrrolysine
92 P Proline
93 Q Glutamine
94 R Arginine
95 S Serine
96 T Threonine
97 U Selenocysteine
98 V Valine
99 W Tryptophan
100 X Unknown
101 Y Tyrosine
102 Z Glutamic Acid, Glutamine
103 * Terminator
105 There are a few things Bio::Tools::IUPAC can do for you:
107 =over
109 =item *
111 report the IUPAC mapping between ambiguous and non-ambiguous residues
113 =item *
115 produce a stream of all possible corresponding unambiguous Bio::Seq objects given
116 an ambiguous sequence object
118 =item *
120 convert an ambiguous sequence object to a corresponding regular expression
122 =back
124 =head1 FEEDBACK
126 =head2 Mailing Lists
128 User feedback is an integral part of the evolution of this and other
129 Bioperl modules. Send your comments and suggestions preferably to one
130 of the Bioperl mailing lists. Your participation is much appreciated.
132 bioperl-l@bioperl.org - General discussion
133 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
135 =head2 Support
137 Please direct usage questions or support issues to the mailing list:
139 I<bioperl-l@bioperl.org>
141 rather than to the module maintainer directly. Many experienced and
142 reponsive experts will be able look at the problem and quickly
143 address it. Please include a thorough description of the problem
144 with code and data examples if at all possible.
146 =head2 Reporting Bugs
148 Report bugs to the Bioperl bug tracking system to help us keep track
149 the bugs and their resolution. Bug reports can be submitted via the
150 web:
152 https://github.com/bioperl/bioperl-live/issues
154 =head1 AUTHOR - Aaron Mackey
156 Email amackey-at-virginia.edu
158 =head1 APPENDIX
160 The rest of the documentation details each of the object
161 methods. Internal methods are usually preceded with a _
163 =cut
166 package Bio::Tools::IUPAC;
168 use strict;
169 use base qw(Bio::Root::Root);
171 # Ambiguous nucleic residues are matched to unambiguous residues
172 our %IUB = (
173 A => [qw(A)],
174 C => [qw(C)],
175 G => [qw(G)],
176 T => [qw(T)],
177 U => [qw(U)],
178 M => [qw(A C)],
179 R => [qw(A G)],
180 S => [qw(C G)],
181 W => [qw(A T)],
182 Y => [qw(C T)],
183 K => [qw(G T)],
184 V => [qw(A C G)],
185 H => [qw(A C T)],
186 D => [qw(A G T)],
187 B => [qw(C G T)],
188 N => [qw(A C G T)],
189 X => [qw(A C G T)],
192 # Same as %IUB but ambiguous residues are matched to ambiguous residues only
193 our %IUB_AMB = (
194 M => [qw(M)],
195 R => [qw(R)],
196 W => [qw(W)],
197 S => [qw(S)],
198 Y => [qw(Y)],
199 K => [qw(K)],
200 V => [qw(M R S V)],
201 H => [qw(H M W Y)],
202 D => [qw(D K R W)],
203 B => [qw(B K S Y)],
204 N => [qw(B D H K M N R S V W Y)],
207 # The inverse of %IUB
208 our %REV_IUB = (
209 A => 'A',
210 T => 'T',
211 U => 'U',
212 C => 'C',
213 G => 'G',
214 AC => 'M',
215 AG => 'R',
216 AT => 'W',
217 CG => 'S',
218 CT => 'Y',
219 GT => 'K',
220 ACG => 'V',
221 ACT => 'H',
222 AGT => 'D',
223 CGT => 'B',
224 ACGT => 'N',
225 N => 'N'
228 # Same thing with proteins now
229 our %IUP = (
230 A => [qw(A)],
231 B => [qw(D N)],
232 C => [qw(C)],
233 D => [qw(D)],
234 E => [qw(E)],
235 F => [qw(F)],
236 G => [qw(G)],
237 H => [qw(H)],
238 I => [qw(I)],
239 J => [qw(I L)],
240 K => [qw(K)],
241 L => [qw(L)],
242 M => [qw(M)],
243 N => [qw(N)],
244 O => [qw(O)],
245 P => [qw(P)],
246 Q => [qw(Q)],
247 R => [qw(R)],
248 S => [qw(S)],
249 T => [qw(T)],
250 U => [qw(U)],
251 V => [qw(V)],
252 W => [qw(W)],
253 X => [qw(X)],
254 Y => [qw(Y)],
255 Z => [qw(E Q)],
256 '*' => [qw(*)],
259 our %IUP_AMB = (
260 B => [qw(B)],
261 J => [qw(J)],
262 Z => [qw(Z)],
266 =head2 new
268 Title : new
269 Usage : Bio::Tools::IUPAC->new($seq);
270 Function: Create a new IUPAC object, which acts as a sequence stream (akin to
271 SeqIO)
272 Args : an ambiguously coded sequence object that has a specified 'alphabet'
273 Returns : a Bio::Tools::IUPAC object.
275 =cut
277 sub new {
278 my ($class,@args) = @_;
279 my $self = $class->SUPER::new(@args);
280 my ($seq) = $self->_rearrange([qw(SEQ)],@args);
282 if ( (not defined $seq) && @args && ref($args[0]) ) {
283 # parameter not passed as named parameter?
284 $seq = $args[0];
287 if (defined $seq) {
288 if (not $seq->isa('Bio::PrimarySeqI')) {
289 $self->throw('Must supply a sequence object');
291 if (length $seq->seq == 0) {
292 $self->throw('Sequence had zero-length');
294 $self->{'_seq'} = $seq;
297 return $self;
301 sub _initialize {
302 my ($self) = @_;
303 my %iupac = $self->iupac;
304 $self->{'_alpha'} = [ map { $iupac{uc $_} } split('', $self->{'_seq'}->seq) ];
305 $self->{'_string'} = [(0) x length($self->{'_seq'}->seq())];
306 $self->{'_string'}->[0] = -1;
310 =head2 next_seq
312 Title : next_seq
313 Usage : $iupac->next_seq();
314 Function: returns the next unique sequence object
315 Args : none.
316 Returns : a Bio::Seq object
318 =cut
320 sub next_seq {
321 my ($self) = @_;
323 if (not exists $self->{'_string'}) {
324 $self->_initialize();
327 for my $i ( 0 .. $#{$self->{'_string'}} ) {
328 next unless $self->{'_string'}->[$i] || @{$self->{'_alpha'}->[$i]} > 1;
329 if ( $self->{'_string'}->[$i] == $#{$self->{'_alpha'}->[$i]} ) { # rollover
330 if ( $i == $#{$self->{'_string'}} ) { # end of possibilities
331 return;
332 } else {
333 $self->{'_string'}->[$i] = 0;
334 next;
336 } else {
337 $self->{'_string'}->[$i]++;
338 my $j = -1;
339 my $seqstr = join('', map { $j++; $self->{'_alpha'}->[$j]->[$_]; } @{$self->{'_string'}});
340 my $desc = $self->{'_seq'}->desc() || '';
341 $self->{'_num'}++;
342 1 while $self->{'_num'} =~ s/(\d)(\d\d\d)(?!\d)/$1,$2/;
343 $desc =~ s/( \[Bio::Tools::IUPAC-generated\sunique sequence # [^\]]*\])|$/ \[Bio::Tools::IUPAC-generated unique sequence # $self->{'_num'}\]/;
344 $self->{'_num'} =~ s/,//g;
346 # Return a fresh sequence object
347 return Bio::PrimarySeq->new(-seq => $seqstr, -desc => $desc);
353 =head2 iupac
355 Title : iupac
356 Usage : my %symbols = $iupac->iupac;
357 Function: Returns a hash of symbols -> symbol components of the right type
358 for the given sequence, i.e. it is the same as iupac_iup() if
359 Bio::Tools::IUPAC was given a proteic sequence, or iupac_iub() if the
360 sequence was nucleic. For example, the key 'M' has the value ['A', 'C'].
361 Args : none
362 Returns : Hash
364 =cut
366 sub iupac {
367 my ($self) = @_;
368 my $alphabet = lc( $self->{'_seq'}->alphabet() );
369 if ( ($alphabet eq 'dna') or ($alphabet eq 'rna') ) {
370 return %IUB; # nucleic
371 } elsif ( $alphabet eq 'protein' ) {
372 return %IUP; # proteic
373 } else {
374 $self->throw("The input sequence had the unknown alphabet '$alphabet'\n");
380 =head2 iupac_amb
382 Title : iupac_amb
383 Usage : my %symbols = $iupac->iupac_amb;
384 Function: Same as iupac() but only contains a mapping between ambiguous residues
385 and the ambiguous residues they map to. For example, the key 'N' has
386 the value ['R', 'Y', 'K', 'M', 'S', 'W', 'B', 'D', 'H', 'V', 'N'],
387 i.e. it matches all other ambiguous residues.
388 Args : none
389 Returns : Hash
391 =cut
393 sub iupac_amb {
394 my ($self) = @_;
395 my $alphabet = lc( $self->{'_seq'}->alphabet() );
396 if ( ($alphabet eq 'dna') or ($alphabet eq 'rna') ) {
397 return %IUB_AMB; # nucleic
398 } elsif ( $alphabet eq 'protein' ) {
399 return %IUP_AMB; # proteic
400 } else {
401 $self->throw("The input sequence had the unknown alphabet '$alphabet'\n");
406 =head2 iupac_iup
408 Title : iupac_iup
409 Usage : my %aasymbols = $iupac->iupac_iup;
410 Function: Returns a hash of PROTEIN symbols -> non-ambiguous symbol components
411 Args : none
412 Returns : Hash
414 =cut
416 sub iupac_iup {
417 return %IUP;
421 =head2 iupac_iup_amb
423 Title : iupac_iup_amb
424 Usage : my %aasymbols = $iupac->iupac_iup_amb;
425 Function: Returns a hash of PROTEIN symbols -> ambiguous symbol components
426 Args : none
427 Returns : Hash
429 =cut
431 sub iupac_iup_amb {
432 return %IUP_AMB;
436 =head2 iupac_iub
438 Title : iupac_iub
439 Usage : my %dnasymbols = $iupac->iupac_iub;
440 Function: Returns a hash of DNA symbols -> non-ambiguous symbol components
441 Args : none
442 Returns : Hash
444 =cut
446 sub iupac_iub {
447 return %IUB;
451 =head2 iupac_iub_amb
453 Title : iupac_iub_amb
454 Usage : my %dnasymbols = $iupac->iupac_iub;
455 Function: Returns a hash of DNA symbols -> ambiguous symbol components
456 Args : none
457 Returns : Hash
459 =cut
461 sub iupac_iub_amb {
462 return %IUB_AMB;
466 =head2 iupac_rev_iub
468 Title : iupac_rev_iub
469 Usage : my %dnasymbols = $iupac->iupac_rev_iub;
470 Function: Returns a hash of nucleotide combinations -> IUPAC code
471 (a reverse of the iupac_iub hash).
472 Args : none
473 Returns : Hash
475 =cut
477 sub iupac_rev_iub {
478 return %REV_IUB;
482 =head2 count
484 Title : count
485 Usage : my $total = $iupac->count();
486 Function: Calculates the number of unique, unambiguous sequences that
487 this ambiguous sequence could generate
488 Args : none
489 Return : int
491 =cut
493 sub count {
494 my ($self) = @_;
495 if (not exists $self->{'_string'}) {
496 $self->_initialize();
498 my $count = 1;
499 $count *= scalar(@$_) for (@{$self->{'_alpha'}});
500 return $count;
504 =head2 regexp
506 Title : regexp
507 Usage : my $re = $iupac->regexp();
508 Function: Converts the ambiguous sequence into a regular expression that
509 matches all of the corresponding ambiguous and non-ambiguous sequences.
510 You can further manipulate the resulting regular expression with the
511 Bio::Tools::SeqPattern module. After you are done building your
512 regular expression, you might want to compile it and make it case-
513 insensitive:
514 $re = qr/$re/i;
515 Args : 1 to match RNA: T and U characters will match interchangeably
516 Return : regular expression
518 =cut
520 sub regexp {
521 my ($self, $match_rna) = @_;
522 my $re;
523 my $seq = $self->{'_seq'}->seq;
524 my %iupac = $self->iupac;
525 my %iupac_amb = $self->iupac_amb;
526 for my $pos (0 .. length($seq)-1) {
527 my $res = substr $seq, $pos, 1;
528 my $iupacs = $iupac{$res};
529 my $iupacs_amb = $iupac_amb{$res} || [];
530 if (not defined $iupacs) {
531 $self->throw("Primer sequence '$seq' is not a valid IUPAC sequence.".
532 " Offending character was '$res'.\n");
534 my $part = join '', (@$iupacs, @$iupacs_amb);
535 if ($match_rna) {
536 $part =~ s/T/TU/i || $part =~ s/U/TU/i;
538 if (length $part > 1) {
539 $part = '['.$part.']';
541 $re .= $part;
543 return $re;
547 sub AUTOLOAD {
548 my $self = shift @_;
549 our $AUTOLOAD;
550 my $method = $AUTOLOAD;
551 $method =~ s/.*:://;
552 return $self->{'_seq'}->$method(@_)
553 unless $method eq 'DESTROY';