lib/Bio/Tools/IUPAC.pm

   1 #
   2 # BioPerl module for IUPAC
   3 #
   4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   5 #
   6 # Cared for by Aaron Mackey <amackey@virginia.edu>
   7 #
   8 # Copyright Aaron Mackey
   9 #
  10 # You may distribute this module under the same terms as perl itself
  11
  12 # POD documentation - main docs before the code
  13
  14 =head1 NAME
  15
  16 Bio::Tools::IUPAC - Generates unique sequence objects or regular expressions from
  17 an ambiguous IUPAC sequence
  18
  19 =head1 SYNOPSIS
  20
  21  use Bio::PrimarySeq;
  22  use Bio::Tools::IUPAC;
  23
  24  # Get the IUPAC code for proteins
  25  my %iupac_prot = Bio::Tools::IUPAC->new->iupac_iup;
  26
  27  # Create a sequence with degenerate residues
  28  my $ambiseq = Bio::PrimarySeq->new(-seq => 'ARTCGUTGN', -alphabet => 'dna');
  29
  30  # Create all possible non-degenerate sequences
  31  my $iupac = Bio::Tools::IUPAC->new(-seq => $ambiseq);
  32  while ($uniqueseq = $iupac->next_seq()) {
  33      # process the unique Bio::Seq object.
  34  }
  35
  36  # Get a regular expression that matches all possible sequences
  37  my $regexp = $iupac->regexp();
  38
  39 =head1 DESCRIPTION
  40
  41 Bio::Tools::IUPAC is a tool that manipulates sequences with ambiguous residues
  42 following the IUPAC conventions. Non-standard characters have the meaning
  43 described below:
  44
  45     IUPAC-IUB SYMBOLS FOR NUCLEOTIDE (DNA OR RNA) NOMENCLATURE:
  46       Cornish-Bowden (1985) Nucl. Acids Res. 13: 3021-3030
  47
  48     ---------------------------------------------------------------
  49     Symbol       Meaning      Nucleic Acid
  50     ---------------------------------------------------------------
  51      A            A           Adenine
  52      C            C           Cytosine
  53      G            G           Guanine
  54      T            T           Thymine
  55      U            U           Uracil
  56      M          A or C        aMino
  57      R          A or G        puRine
  58      W          A or T        Weak
  59      S          C or G        Strong
  60      Y          C or T        pYrimidine
  61      K          G or T        Keto
  62      V        A or C or G     not T (closest unused char after T)
  63      H        A or C or T     not G (closest unused char after G)
  64      D        A or G or T     not C (closest unused char after C)
  65      B        C or G or T     not A (closest unused char after A)
  66      X      G or A or T or C  Unknown (very rarely used)
  67      N      G or A or T or C  Unknown (commonly used)
  68
  69
  70     IUPAC-IUP AMINO ACID SYMBOLS:
  71       Biochem J. 1984 Apr 15; 219(2): 345-373
  72       Eur J Biochem. 1993 Apr 1; 213(1): 2
  73
  74     ------------------------------------------
  75     Symbol           Meaning
  76     ------------------------------------------
  77     A        Alanine
  78     B        Aspartic Acid, Asparagine
  79     C        Cysteine
  80     D        Aspartic Acid
  81     E        Glutamic Acid
  82     F        Phenylalanine
  83     G        Glycine
  84     H        Histidine
  85     I        Isoleucine
  86     J        Isoleucine/Leucine
  87     K        Lysine
  88     L        Leucine
  89     M        Methionine
  90     N        Asparagine
  91     O        Pyrrolysine
  92     P        Proline
  93     Q        Glutamine
  94     R        Arginine
  95     S        Serine
  96     T        Threonine
  97     U        Selenocysteine
  98     V        Valine
  99     W        Tryptophan
 100     X        Unknown
 101     Y        Tyrosine
 102     Z        Glutamic Acid, Glutamine
 103     *        Terminator
 104
 105 There are a few things Bio::Tools::IUPAC can do for you:
 106
 107 =over
 108
 109 =item *
 110
 111 report the IUPAC mapping between ambiguous and non-ambiguous residues
 112
 113 =item *
 114
 115 produce a stream of all possible corresponding unambiguous Bio::Seq objects given
 116 an ambiguous sequence object
 117
 118 =item *
 119
 120 convert an ambiguous sequence object to a corresponding regular expression
 121
 122 =back
 123
 124 =head1 FEEDBACK
 125
 126 =head2 Mailing Lists
 127
 128 User feedback is an integral part of the evolution of this and other
 129 Bioperl modules. Send your comments and suggestions preferably to one
 130 of the Bioperl mailing lists.  Your participation is much appreciated.
 131
 132   bioperl-l@bioperl.org                  - General discussion
 133   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
 134
 135 =head2 Support
 136
 137 Please direct usage questions or support issues to the mailing list:
 138
 139 I<bioperl-l@bioperl.org>
 140
 141 rather than to the module maintainer directly. Many experienced and
 142 reponsive experts will be able look at the problem and quickly
 143 address it. Please include a thorough description of the problem
 144 with code and data examples if at all possible.
 145
 146 =head2 Reporting Bugs
 147
 148 Report bugs to the Bioperl bug tracking system to help us keep track
 149 the bugs and their resolution.  Bug reports can be submitted via the
 150 web:
 151
 152   https://github.com/bioperl/bioperl-live/issues
 153
 154 =head1 AUTHOR - Aaron Mackey
 155
 156 Email amackey-at-virginia.edu
 157
 158 =head1 APPENDIX
 159
 160 The rest of the documentation details each of the object
 161 methods. Internal methods are usually preceded with a _
 162
 163 =cut
 164
 165
 166 package Bio::Tools::IUPAC;
 167
 168 use strict;
 169 use base qw(Bio::Root::Root);
 170
 171 # Ambiguous nucleic residues are matched to unambiguous residues
 172 our %IUB = (
 173     A => [qw(A)],
 174     C => [qw(C)],
 175     G => [qw(G)],
 176     T => [qw(T)],
 177     U => [qw(U)],
 178     M => [qw(A C)],
 179     R => [qw(A G)],
 180     S => [qw(C G)],
 181     W => [qw(A T)],
 182     Y => [qw(C T)],
 183     K => [qw(G T)],
 184     V => [qw(A C G)],
 185     H => [qw(A C T)],
 186     D => [qw(A G T)],
 187     B => [qw(C G T)],
 188     N => [qw(A C G T)],
 189     X => [qw(A C G T)],
 190 );
 191
 192 # Same as %IUB but ambiguous residues are matched to ambiguous residues only
 193 our %IUB_AMB = (
 194     M => [qw(M)],
 195     R => [qw(R)],
 196     W => [qw(W)],
 197     S => [qw(S)],
 198     Y => [qw(Y)],
 199     K => [qw(K)],
 200     V => [qw(M R S V)],
 201     H => [qw(H M W Y)],
 202     D => [qw(D K R W)],
 203     B => [qw(B K S Y)],
 204     N => [qw(B D H K M N R S V W Y)],
 205     );
 206
 207 # The inverse of %IUB
 208 our %REV_IUB = (
 209     A    => 'A',
 210     T    => 'T',
 211     U    => 'U',
 212     C    => 'C',
 213     G    => 'G',
 214     AC   => 'M',
 215     AG   => 'R',
 216     AT   => 'W',
 217     CG   => 'S',
 218     CT   => 'Y',
 219     GT   => 'K',
 220     ACG  => 'V',
 221     ACT  => 'H',
 222     AGT  => 'D',
 223     CGT  => 'B',
 224     ACGT => 'N',
 225     N    => 'N'
 226 );
 227
 228 # Same thing with proteins now
 229 our %IUP = (
 230     A => [qw(A)],
 231     B => [qw(D N)],
 232     C => [qw(C)],
 233     D => [qw(D)],
 234     E => [qw(E)],
 235     F => [qw(F)],
 236     G => [qw(G)],
 237     H => [qw(H)],
 238     I => [qw(I)],
 239     J => [qw(I L)],
 240     K => [qw(K)],
 241     L => [qw(L)],
 242     M => [qw(M)],
 243     N => [qw(N)],
 244     O => [qw(O)],
 245     P => [qw(P)],
 246     Q => [qw(Q)],
 247     R => [qw(R)],
 248     S => [qw(S)],
 249     T => [qw(T)],
 250     U => [qw(U)],
 251     V => [qw(V)],
 252     W => [qw(W)],
 253     X => [qw(X)],
 254     Y => [qw(Y)],
 255     Z => [qw(E Q)],
 256     '*' => [qw(*)],
 257 );
 258
 259 our %IUP_AMB = (
 260     B => [qw(B)],
 261     J => [qw(J)],
 262     Z => [qw(Z)],
 263 );
 264
 265
 266 =head2 new
 267
 268  Title   : new
 269  Usage   : Bio::Tools::IUPAC->new($seq);
 270  Function: Create a new IUPAC object, which acts as a sequence stream (akin to
 271            SeqIO)
 272  Args    : an ambiguously coded sequence object that has a specified 'alphabet'
 273  Returns : a Bio::Tools::IUPAC object.
 274
 275 =cut
 276
 277 sub new {
 278     my ($class,@args) = @_;
 279     my $self = $class->SUPER::new(@args);
 280     my ($seq) = $self->_rearrange([qw(SEQ)],@args);
 281
 282     if ( (not defined $seq) && @args && ref($args[0]) ) {
 283         # parameter not passed as named parameter?
 284         $seq = $args[0];
 285     }
 286
 287     if (defined $seq) {
 288         if (not $seq->isa('Bio::PrimarySeqI')) {
 289             $self->throw('Must supply a sequence object');
 290         }
 291         if (length $seq->seq == 0) {
 292             $self->throw('Sequence had zero-length');
 293         }
 294         $self->{'_seq'} = $seq;
 295     }
 296
 297     return $self;
 298 }
 299
 300
 301 sub _initialize {
 302     my ($self) = @_;
 303     my %iupac = $self->iupac;
 304     $self->{'_alpha'} = [ map { $iupac{uc $_} } split('', $self->{'_seq'}->seq) ];
 305     $self->{'_string'} = [(0) x length($self->{'_seq'}->seq())];
 306     $self->{'_string'}->[0] = -1;
 307 }
 308
 309
 310 =head2 next_seq
 311
 312  Title   : next_seq
 313  Usage   : $iupac->next_seq();
 314  Function: returns the next unique sequence object
 315  Args    : none.
 316  Returns : a Bio::Seq object
 317
 318 =cut
 319
 320 sub next_seq {
 321     my ($self) = @_;
 322
 323     if (not exists $self->{'_string'}) {
 324         $self->_initialize();
 325     }
 326
 327     for my $i ( 0 .. $#{$self->{'_string'}} ) {
 328         next unless $self->{'_string'}->[$i] || @{$self->{'_alpha'}->[$i]} > 1;
 329         if ( $self->{'_string'}->[$i] == $#{$self->{'_alpha'}->[$i]} ) { # rollover
 330             if ( $i == $#{$self->{'_string'}} ) { # end of possibilities
 331                 return;
 332             } else {
 333                 $self->{'_string'}->[$i] = 0;
 334                 next;
 335             }
 336         } else {
 337             $self->{'_string'}->[$i]++;
 338             my $j = -1;
 339             my $seqstr = join('', map { $j++; $self->{'_alpha'}->[$j]->[$_]; } @{$self->{'_string'}});
 340             my $desc   = $self->{'_seq'}->desc() || '';
 341             $self->{'_num'}++;
 342             1 while $self->{'_num'} =~ s/(\d)(\d\d\d)(?!\d)/$1,$2/;
 343             $desc =~ s/( \[Bio::Tools::IUPAC-generated\sunique sequence # [^\]]*\])|$/ \[Bio::Tools::IUPAC-generated unique sequence # $self->{'_num'}\]/;
 344             $self->{'_num'} =~ s/,//g;
 345
 346             # Return a fresh sequence object
 347             return Bio::PrimarySeq->new(-seq  => $seqstr, -desc => $desc);
 348         }
 349     }
 350 }
 351
 352
 353 =head2 iupac
 354
 355  Title   : iupac
 356  Usage   : my %symbols = $iupac->iupac;
 357  Function: Returns a hash of symbols -> symbol components of the right type
 358            for the given sequence, i.e. it is the same as iupac_iup() if
 359            Bio::Tools::IUPAC was given a proteic sequence, or iupac_iub() if the
 360            sequence was nucleic. For example, the key 'M' has the value ['A', 'C'].
 361  Args    : none
 362  Returns : Hash
 363
 364 =cut
 365
 366 sub iupac {
 367     my ($self) = @_;
 368     my $alphabet = lc( $self->{'_seq'}->alphabet() );
 369     if ( ($alphabet eq 'dna') or ($alphabet eq 'rna') ) {
 370         return %IUB; # nucleic
 371     } elsif ( $alphabet eq 'protein' ) {
 372         return %IUP; # proteic
 373     } else {
 374         $self->throw("The input sequence had the unknown alphabet '$alphabet'\n");
 375     }
 376 }
 377
 378
 379
 380 =head2 iupac_amb
 381
 382  Title   : iupac_amb
 383  Usage   : my %symbols = $iupac->iupac_amb;
 384  Function: Same as iupac() but only contains a mapping between ambiguous residues
 385            and the ambiguous residues they map to. For example, the key 'N' has
 386            the value ['R', 'Y', 'K', 'M', 'S', 'W', 'B', 'D', 'H', 'V', 'N'],
 387            i.e. it matches all other ambiguous residues.
 388  Args    : none
 389  Returns : Hash
 390
 391 =cut
 392
 393 sub iupac_amb {
 394     my ($self) = @_;
 395     my $alphabet = lc( $self->{'_seq'}->alphabet() );
 396     if ( ($alphabet eq 'dna') or ($alphabet eq 'rna') ) {
 397         return %IUB_AMB; # nucleic
 398     } elsif ( $alphabet eq 'protein' ) {
 399         return %IUP_AMB; # proteic
 400     } else {
 401         $self->throw("The input sequence had the unknown alphabet '$alphabet'\n");
 402     }
 403 }
 404
 405
 406 =head2 iupac_iup
 407
 408  Title   : iupac_iup
 409  Usage   : my %aasymbols = $iupac->iupac_iup;
 410  Function: Returns a hash of PROTEIN symbols -> non-ambiguous symbol components
 411  Args    : none
 412  Returns : Hash
 413
 414 =cut
 415
 416 sub iupac_iup {
 417    return %IUP;
 418 }
 419
 420
 421 =head2 iupac_iup_amb
 422
 423  Title   : iupac_iup_amb
 424  Usage   : my %aasymbols = $iupac->iupac_iup_amb;
 425  Function: Returns a hash of PROTEIN symbols -> ambiguous symbol components
 426  Args    : none
 427  Returns : Hash
 428
 429 =cut
 430
 431 sub iupac_iup_amb {
 432    return %IUP_AMB;
 433 }
 434
 435
 436 =head2 iupac_iub
 437
 438  Title   : iupac_iub
 439  Usage   : my %dnasymbols = $iupac->iupac_iub;
 440  Function: Returns a hash of DNA symbols -> non-ambiguous symbol components
 441  Args    : none
 442  Returns : Hash
 443
 444 =cut
 445
 446 sub iupac_iub {
 447    return %IUB;
 448 }
 449
 450
 451 =head2 iupac_iub_amb
 452
 453  Title   : iupac_iub_amb
 454  Usage   : my %dnasymbols = $iupac->iupac_iub;
 455  Function: Returns a hash of DNA symbols -> ambiguous symbol components
 456  Args    : none
 457  Returns : Hash
 458
 459 =cut
 460
 461 sub iupac_iub_amb {
 462    return %IUB_AMB;
 463 }
 464
 465
 466 =head2 iupac_rev_iub
 467
 468  Title   : iupac_rev_iub
 469  Usage   : my %dnasymbols = $iupac->iupac_rev_iub;
 470  Function: Returns a hash of nucleotide combinations -> IUPAC code
 471            (a reverse of the iupac_iub hash).
 472  Args    : none
 473  Returns : Hash
 474
 475 =cut
 476
 477 sub iupac_rev_iub {
 478    return %REV_IUB;
 479 }
 480
 481
 482 =head2 count
 483
 484  Title   : count
 485  Usage   : my $total = $iupac->count();
 486  Function: Calculates the number of unique, unambiguous sequences that
 487            this ambiguous sequence could generate
 488  Args    : none
 489  Return  : int
 490
 491 =cut
 492
 493 sub count {
 494     my ($self) = @_;
 495     if (not exists $self->{'_string'}) {
 496         $self->_initialize();
 497     }
 498     my $count = 1;
 499     $count *= scalar(@$_) for (@{$self->{'_alpha'}});
 500     return $count;
 501 }
 502
 503
 504 =head2 regexp
 505
 506  Title   : regexp
 507  Usage   : my $re = $iupac->regexp();
 508  Function: Converts the ambiguous sequence into a regular expression that
 509            matches all of the corresponding ambiguous and non-ambiguous sequences.
 510            You can further manipulate the resulting regular expression with the
 511            Bio::Tools::SeqPattern module. After you are done building your
 512            regular expression, you might want to compile it and make it case-
 513            insensitive:
 514               $re = qr/$re/i;
 515  Args    : 1 to match RNA: T and U characters will match interchangeably
 516  Return  : regular expression
 517
 518 =cut
 519
 520 sub regexp {
 521     my ($self, $match_rna) = @_;
 522     my $re;
 523     my $seq = $self->{'_seq'}->seq;
 524     my %iupac = $self->iupac;
 525     my %iupac_amb = $self->iupac_amb;
 526     for my $pos (0 .. length($seq)-1) {
 527         my $res = substr $seq, $pos, 1;
 528         my $iupacs = $iupac{$res};
 529         my $iupacs_amb = $iupac_amb{$res} || [];
 530         if (not defined $iupacs) {
 531             $self->throw("Primer sequence '$seq' is not a valid IUPAC sequence.".
 532                          " Offending character was '$res'.\n");
 533         }
 534         my $part = join '', (@$iupacs, @$iupacs_amb);
 535         if ($match_rna) {
 536             $part =~ s/T/TU/i || $part =~ s/U/TU/i;
 537         }
 538         if (length $part > 1) {
 539            $part = '['.$part.']';
 540         }
 541         $re .= $part;
 542     }
 543     return $re;
 544 }
 545
 546
 547 sub AUTOLOAD {
 548     my $self = shift @_;
 549     our $AUTOLOAD;
 550     my $method = $AUTOLOAD;
 551     $method =~ s/.*:://;
 552     return $self->{'_seq'}->$method(@_)
 553         unless $method eq 'DESTROY';
 554 }
 555
 556 1;