Bio/Tools/IUPAC.pm

   1 #
   2 # BioPerl module for IUPAC
   3 #
   4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   5 #
   6 # Cared for by Aaron Mackey <amackey@virginia.edu>
   7 #
   8 # Copyright Aaron Mackey
   9 #
  10 # You may distribute this module under the same terms as perl itself
  11
  12 # POD documentation - main docs before the code
  13
  14 =head1 NAME
  15
  16 Bio::Tools::IUPAC - Generates unique sequence objects or regular expressions from
  17 an ambiguous IUPAC sequence
  18
  19 =head1 SYNOPSIS
  20
  21  use Bio::PrimarySeq;
  22  use Bio::Tools::IUPAC;
  23
  24  # Get the IUPAC code for proteins
  25  my %iupac_prot = Bio::Tools::IUPAC->new->iupac_iup;
  26
  27  # Create a sequence with degenerate residues
  28  my $ambiseq = Bio::PrimarySeq->new(-seq => 'ARTCGUTGN', -alphabet => 'dna');
  29
  30  # Create all possible non-degenerate sequences
  31  my $iupac = Bio::Tools::IUPAC->new(-seq => $ambiseq);
  32  while ($uniqueseq = $iupac->next_seq()) {
  33      # process the unique Bio::Seq object.
  34  }
  35
  36  # Get a regular expression that matches all possible sequences
  37  my $regexp = $iupac->regexp();
  38
  39 =head1 DESCRIPTION
  40
  41 Bio::Tools::IUPAC is a tool that manipulates sequences with ambiguous residues
  42 following the IUPAC conventions. Non-standard characters have the meaning
  43 described below:
  44
  45     IUPAC-IUB SYMBOLS FOR NUCLEOTIDE (DNA OR RNA) NOMENCLATURE:
  46       Cornish-Bowden (1985) Nucl. Acids Res. 13: 3021-3030
  47
  48     ---------------------------------------------------------------
  49     Symbol       Meaning      Nucleic Acid
  50     ---------------------------------------------------------------
  51      A            A           Adenine
  52      C            C           Cytosine
  53      G            G           Guanine
  54      T            T           Thymine
  55      U            U           Uracil
  56      M          A or C        aMino
  57      R          A or G        puRine
  58      W          A or T        Weak
  59      S          C or G        Strong
  60      Y          C or T        pYrimidine
  61      K          G or T        Keto
  62      V        A or C or G     not T (closest unused char after T)
  63      H        A or C or T     not G (closest unused char after G)
  64      D        A or G or T     not C (closest unused char after C)
  65      B        C or G or T     not A (closest unused char after A)
  66      X      G or A or T or C  Unknown (very rarely used)
  67      N      G or A or T or C  Unknown (commonly used)
  68
  69
  70     IUPAC-IUP AMINO ACID SYMBOLS:
  71       Biochem J. 1984 Apr 15; 219(2): 345-373
  72       Eur J Biochem. 1993 Apr 1; 213(1): 2
  73
  74     ------------------------------------------
  75     Symbol           Meaning
  76     ------------------------------------------
  77     A        Alanine
  78     B        Aspartic Acid, Asparagine
  79     C        Cysteine
  80     D        Aspartic Acid
  81     E        Glutamic Acid
  82     F        Phenylalanine
  83     G        Glycine
  84     H        Histidine
  85     I        Isoleucine
  86     J        Isoleucine/Leucine
  87     K        Lysine
  88     L        Leucine
  89     M        Methionine
  90     N        Asparagine
  91     O        Pyrrolysine
  92     P        Proline
  93     Q        Glutamine
  94     R        Arginine
  95     S        Serine
  96     T        Threonine
  97     U        Selenocysteine
  98     V        Valine
  99     W        Tryptophan
 100     X        Unknown
 101     Y        Tyrosine
 102     Z        Glutamic Acid, Glutamine
 103     *        Terminator
 104
 105 There are a few things Bio::Tools::IUPAC can do for you:
 106
 107 =over
 108
 109 =item *
 110
 111 report the IUPAC mapping between ambiguous and non-ambiguous residues
 112
 113 =item *
 114
 115 produce a stream of all possible corresponding unambiguous Bio::Seq objects given
 116 an ambiguous sequence object
 117
 118 =item *
 119
 120 convert an ambiguous sequence object to a corresponding regular expression
 121
 122 =back
 123
 124 =head1 FEEDBACK
 125
 126 =head2 Mailing Lists
 127
 128 User feedback is an integral part of the evolution of this and other
 129 Bioperl modules. Send your comments and suggestions preferably to one
 130 of the Bioperl mailing lists.  Your participation is much appreciated.
 131
 132   bioperl-l@bioperl.org                  - General discussion
 133   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
 134
 135 =head2 Support
 136
 137 Please direct usage questions or support issues to the mailing list:
 138
 139 I<bioperl-l@bioperl.org>
 140
 141 rather than to the module maintainer directly. Many experienced and
 142 reponsive experts will be able look at the problem and quickly
 143 address it. Please include a thorough description of the problem
 144 with code and data examples if at all possible.
 145
 146 =head2 Reporting Bugs
 147
 148 Report bugs to the Bioperl bug tracking system to help us keep track
 149 the bugs and their resolution.  Bug reports can be submitted via the
 150 web:
 151
 152   https://github.com/bioperl/bioperl-live/issues
 153
 154 =head1 AUTHOR - Aaron Mackey
 155
 156 Email amackey-at-virginia.edu
 157
 158 =head1 APPENDIX
 159
 160 The rest of the documentation details each of the object
 161 methods. Internal methods are usually preceded with a _
 162
 163 =cut
 164
 165
 166 package Bio::Tools::IUPAC;
 167
 168 use strict;
 169 use base qw(Bio::Root::Root);
 170 use vars qw(%IUB %IUB_AMB %REV_IUB %IUP %IUP_AMB $AUTOLOAD);
 171
 172 BEGIN {
 173     # Ambiguous nucleic residues are matched to unambiguous residues
 174     %IUB = (
 175         A => [qw(A)],
 176         C => [qw(C)],
 177         G => [qw(G)],
 178         T => [qw(T)],
 179         U => [qw(U)],
 180         M => [qw(A C)],
 181         R => [qw(A G)],
 182         S => [qw(C G)],
 183         W => [qw(A T)],
 184         Y => [qw(C T)],
 185         K => [qw(G T)],
 186         V => [qw(A C G)],
 187         H => [qw(A C T)],
 188         D => [qw(A G T)],
 189         B => [qw(C G T)],
 190         N => [qw(A C G T)],
 191         X => [qw(A C G T)],
 192     );
 193
 194     # Same as %IUB but ambiguous residues are matched to ambiguous residues only
 195     %IUB_AMB = (
 196         M => [qw(M)],
 197         R => [qw(R)],
 198         W => [qw(W)],
 199         S => [qw(S)],
 200         Y => [qw(Y)],
 201         K => [qw(K)],
 202         V => [qw(M R S V)],
 203         H => [qw(H M W Y)],
 204         D => [qw(D K R W)],
 205         B => [qw(B K S Y)],
 206         N => [qw(B D H K M N R S V W Y)],
 207     );
 208
 209     # The inverse of %IUB
 210     %REV_IUB = (
 211         A    => 'A',
 212         T    => 'T',
 213         U    => 'U',
 214         C    => 'C',
 215         G    => 'G',
 216         AC   => 'M',
 217         AG   => 'R',
 218         AT   => 'W',
 219         CG   => 'S',
 220         CT   => 'Y',
 221         GT   => 'K',
 222         ACG  => 'V',
 223         ACT  => 'H',
 224         AGT  => 'D',
 225         CGT  => 'B',
 226         ACGT => 'N',
 227         N    => 'N'
 228     );
 229
 230     # Same thing with proteins now
 231     %IUP = (
 232         A => [qw(A)],
 233         B => [qw(D N)],
 234         C => [qw(C)],
 235         D => [qw(D)],
 236         E => [qw(E)],
 237         F => [qw(F)],
 238         G => [qw(G)],
 239         H => [qw(H)],
 240         I => [qw(I)],
 241         J => [qw(I L)],
 242         K => [qw(K)],
 243         L => [qw(L)],
 244         M => [qw(M)],
 245         N => [qw(N)],
 246         O => [qw(O)],
 247         P => [qw(P)],
 248         Q => [qw(Q)],
 249         R => [qw(R)],
 250         S => [qw(S)],
 251         T => [qw(T)],
 252         U => [qw(U)],
 253         V => [qw(V)],
 254         W => [qw(W)],
 255         X => [qw(X)],
 256         Y => [qw(Y)],
 257         Z => [qw(E Q)],
 258         '*' => [qw(*)],
 259     );
 260
 261     %IUP_AMB = (
 262         B => [qw(B)],
 263         J => [qw(J)],
 264         Z => [qw(Z)],
 265     );
 266
 267 }
 268
 269
 270 =head2 new
 271
 272  Title   : new
 273  Usage   : Bio::Tools::IUPAC->new($seq);
 274  Function: Create a new IUPAC object, which acts as a sequence stream (akin to
 275            SeqIO)
 276  Args    : an ambiguously coded sequence object that has a specified 'alphabet'
 277  Returns : a Bio::Tools::IUPAC object.
 278
 279 =cut
 280
 281 sub new {
 282     my ($class,@args) = @_;
 283     my $self = $class->SUPER::new(@args);
 284     my ($seq) = $self->_rearrange([qw(SEQ)],@args);
 285
 286     if ( (not defined $seq) && @args && ref($args[0]) ) {
 287         # parameter not passed as named parameter?
 288         $seq = $args[0];
 289     }
 290
 291     if (defined $seq) {
 292         if (not $seq->isa('Bio::PrimarySeqI')) {
 293             $self->throw('Must supply a sequence object');
 294         }
 295         if (length $seq->seq == 0) {
 296             $self->throw('Sequence had zero-length');
 297         }
 298         $self->{'_seq'} = $seq;
 299     }
 300
 301     return $self;
 302 }
 303
 304
 305 sub _initialize {
 306     my ($self) = @_;
 307     my %iupac = $self->iupac;
 308     $self->{'_alpha'} = [ map { $iupac{uc $_} } split('', $self->{'_seq'}->seq) ];
 309     $self->{'_string'} = [(0) x length($self->{'_seq'}->seq())];
 310     $self->{'_string'}->[0] = -1;
 311 }
 312
 313
 314 =head2 next_seq
 315
 316  Title   : next_seq
 317  Usage   : $iupac->next_seq();
 318  Function: returns the next unique sequence object
 319  Args    : none.
 320  Returns : a Bio::Seq object
 321
 322 =cut
 323
 324 sub next_seq {
 325     my ($self) = @_;
 326
 327     if (not exists $self->{'_string'}) {
 328         $self->_initialize();
 329     }
 330
 331     for my $i ( 0 .. $#{$self->{'_string'}} ) {
 332         next unless $self->{'_string'}->[$i] || @{$self->{'_alpha'}->[$i]} > 1;
 333         if ( $self->{'_string'}->[$i] == $#{$self->{'_alpha'}->[$i]} ) { # rollover
 334             if ( $i == $#{$self->{'_string'}} ) { # end of possibilities
 335                 return;
 336             } else {
 337                 $self->{'_string'}->[$i] = 0;
 338                 next;
 339             }
 340         } else {
 341             $self->{'_string'}->[$i]++;
 342             my $j = -1;
 343             my $seqstr = join('', map { $j++; $self->{'_alpha'}->[$j]->[$_]; } @{$self->{'_string'}});
 344             my $desc   = $self->{'_seq'}->desc() || '';
 345             $self->{'_num'}++;
 346             1 while $self->{'_num'} =~ s/(\d)(\d\d\d)(?!\d)/$1,$2/;
 347             $desc =~ s/( \[Bio::Tools::IUPAC-generated\sunique sequence # [^\]]*\])|$/ \[Bio::Tools::IUPAC-generated unique sequence # $self->{'_num'}\]/;
 348             $self->{'_num'} =~ s/,//g;
 349
 350             # Return a fresh sequence object
 351             return Bio::PrimarySeq->new(-seq  => $seqstr, -desc => $desc);
 352         }
 353     }
 354 }
 355
 356
 357 =head2 iupac
 358
 359  Title   : iupac
 360  Usage   : my %symbols = $iupac->iupac;
 361  Function: Returns a hash of symbols -> symbol components of the right type
 362            for the given sequence, i.e. it is the same as iupac_iup() if
 363            Bio::Tools::IUPAC was given a proteic sequence, or iupac_iub() if the
 364            sequence was nucleic. For example, the key 'M' has the value ['A', 'C'].
 365  Args    : none
 366  Returns : Hash
 367
 368 =cut
 369
 370 sub iupac {
 371     my ($self) = @_;
 372     my $alphabet = lc( $self->{'_seq'}->alphabet() );
 373     if ( ($alphabet eq 'dna') or ($alphabet eq 'rna') ) {
 374         return %IUB; # nucleic
 375     } elsif ( $alphabet eq 'protein' ) {
 376         return %IUP; # proteic
 377     } else {
 378         $self->throw("The input sequence had the unknown alphabet '$alphabet'\n");
 379     }
 380 }
 381
 382
 383
 384 =head2 iupac_amb
 385
 386  Title   : iupac_amb
 387  Usage   : my %symbols = $iupac->iupac_amb;
 388  Function: Same as iupac() but only contains a mapping between ambiguous residues
 389            and the ambiguous residues they map to. For example, the key 'N' has
 390            the value ['R', 'Y', 'K', 'M', 'S', 'W', 'B', 'D', 'H', 'V', 'N'],
 391            i.e. it matches all other ambiguous residues.
 392  Args    : none
 393  Returns : Hash
 394
 395 =cut
 396
 397 sub iupac_amb {
 398     my ($self) = @_;
 399     my $alphabet = lc( $self->{'_seq'}->alphabet() );
 400     if ( ($alphabet eq 'dna') or ($alphabet eq 'rna') ) {
 401         return %IUB_AMB; # nucleic
 402     } elsif ( $alphabet eq 'protein' ) {
 403         return %IUP_AMB; # proteic
 404     } else {
 405         $self->throw("The input sequence had the unknown alphabet '$alphabet'\n");
 406     }
 407 }
 408
 409
 410 =head2 iupac_iup
 411
 412  Title   : iupac_iup
 413  Usage   : my %aasymbols = $iupac->iupac_iup;
 414  Function: Returns a hash of PROTEIN symbols -> non-ambiguous symbol components
 415  Args    : none
 416  Returns : Hash
 417
 418 =cut
 419
 420 sub iupac_iup {
 421    return %IUP;
 422 }
 423
 424
 425 =head2 iupac_iup_amb
 426
 427  Title   : iupac_iup_amb
 428  Usage   : my %aasymbols = $iupac->iupac_iup_amb;
 429  Function: Returns a hash of PROTEIN symbols -> ambiguous symbol components
 430  Args    : none
 431  Returns : Hash
 432
 433 =cut
 434
 435 sub iupac_iup_amb {
 436    return %IUP_AMB;
 437 }
 438
 439
 440 =head2 iupac_iub
 441
 442  Title   : iupac_iub
 443  Usage   : my %dnasymbols = $iupac->iupac_iub;
 444  Function: Returns a hash of DNA symbols -> non-ambiguous symbol components
 445  Args    : none
 446  Returns : Hash
 447
 448 =cut
 449
 450 sub iupac_iub {
 451    return %IUB;
 452 }
 453
 454
 455 =head2 iupac_iub_amb
 456
 457  Title   : iupac_iub_amb
 458  Usage   : my %dnasymbols = $iupac->iupac_iub;
 459  Function: Returns a hash of DNA symbols -> ambiguous symbol components
 460  Args    : none
 461  Returns : Hash
 462
 463 =cut
 464
 465 sub iupac_iub_amb {
 466    return %IUB_AMB;
 467 }
 468
 469
 470 =head2 iupac_rev_iub
 471
 472  Title   : iupac_rev_iub
 473  Usage   : my %dnasymbols = $iupac->iupac_rev_iub;
 474  Function: Returns a hash of nucleotide combinations -> IUPAC code
 475            (a reverse of the iupac_iub hash).
 476  Args    : none
 477  Returns : Hash
 478
 479 =cut
 480
 481 sub iupac_rev_iub {
 482    return %REV_IUB;
 483 }
 484
 485
 486 =head2 count
 487
 488  Title   : count
 489  Usage   : my $total = $iupac->count();
 490  Function: Calculates the number of unique, unambiguous sequences that
 491            this ambiguous sequence could generate
 492  Args    : none
 493  Return  : int
 494
 495 =cut
 496
 497 sub count {
 498     my ($self) = @_;
 499     if (not exists $self->{'_string'}) {
 500         $self->_initialize();
 501     }
 502     my $count = 1;
 503     $count *= scalar(@$_) for (@{$self->{'_alpha'}});
 504     return $count;
 505 }
 506
 507
 508 =head2 regexp
 509
 510  Title   : regexp
 511  Usage   : my $re = $iupac->regexp();
 512  Function: Converts the ambiguous sequence into a regular expression that
 513            matches all of the corresponding ambiguous and non-ambiguous sequences.
 514            You can further manipulate the resulting regular expression with the
 515            Bio::Tools::SeqPattern module. After you are done building your
 516            regular expression, you might want to compile it and make it case-
 517            insensitive:
 518               $re = qr/$re/i;
 519  Args    : 1 to match RNA: T and U characters will match interchangeably
 520  Return  : regular expression
 521
 522 =cut
 523
 524 sub regexp {
 525     my ($self, $match_rna) = @_;
 526     my $re;
 527     my $seq = $self->{'_seq'}->seq;
 528     my %iupac = $self->iupac;
 529     my %iupac_amb = $self->iupac_amb;
 530     for my $pos (0 .. length($seq)-1) {
 531         my $res = substr $seq, $pos, 1;
 532         my $iupacs = $iupac{$res};
 533         my $iupacs_amb = $iupac_amb{$res} || [];
 534         if (not defined $iupacs) {
 535             $self->throw("Primer sequence '$seq' is not a valid IUPAC sequence.".
 536                          " Offending character was '$res'.\n");
 537         }
 538         my $part = join '', (@$iupacs, @$iupacs_amb);
 539         if ($match_rna) {
 540             $part =~ s/T/TU/i || $part =~ s/U/TU/i;
 541         }
 542         if (length $part > 1) {
 543            $part = '['.$part.']';
 544         }
 545         $re .= $part;
 546     }
 547     return $re;
 548 }
 549
 550
 551 sub AUTOLOAD {
 552     my $self = shift @_;
 553     my $method = $AUTOLOAD;
 554     $method =~ s/.*:://;
 555     return $self->{'_seq'}->$method(@_)
 556         unless $method eq 'DESTROY';
 557 }
 558
 559 1;
 560