Bio/Tools/IUPAC.pm

   1 # $Id$
   2 #
   3 # BioPerl module for IUPAC
   4 #
   5 # Cared for by Aaron Mackey <amackey@virginia.edu>
   6 #
   7 # Copyright Aaron Mackey
   8 #
   9 # You may distribute this module under the same terms as perl itself
  10
  11 # POD documentation - main docs before the code
  12
  13 =head1 NAME
  14
  15 Bio::Tools::IUPAC - Generates unique Seq objects from an ambiguous Seq object
  16
  17 =head1 SYNOPSIS
  18
  19  use Bio::Seq;
  20  use Bio::Tools::IUPAC;
  21
  22  my $ambiseq = Bio::Seq->new(-seq => 'ARTCGUTGR', -alphabet => 'dna');
  23  my $stream  = Bio::Tools::IUPAC->new(-seq => $ambiseq);
  24
  25  while ($uniqueseq = $stream->next_seq()) {
  26      # process the unique Seq object.
  27  }
  28
  29 =head1 DESCRIPTION
  30
  31 IUPAC is a tool that produces a stream of unique, "strict"-satisfying Seq
  32 objects from an ambiquous Seq object (containing non-standard characters given
  33 the meaning shown below)
  34
  35         Extended DNA / RNA alphabet :
  36         (includes symbols for nucleotide ambiguity)
  37         ------------------------------------------
  38         Symbol       Meaning      Nucleic Acid
  39         ------------------------------------------
  40          A            A           Adenine
  41          C            C           Cytosine
  42          G            G           Guanine
  43          T            T           Thymine
  44          U            U           Uracil
  45          M          A or C
  46          R          A or G
  47          W          A or T
  48          S          C or G
  49          Y          C or T
  50          K          G or T
  51          V        A or C or G
  52          H        A or C or T
  53          D        A or G or T
  54          B        C or G or T
  55          X      G or A or T or C
  56          N      G or A or T or C
  57
  58         IUPAC-IUB SYMBOLS FOR NUCLEOTIDE NOMENCLATURE:
  59           Cornish-Bowden (1985) Nucl. Acids Res. 13: 3021-3030.
  60
  61 -----------------------------------
  62
  63        Amino Acid alphabet:
  64         ------------------------------------------
  65         Symbol           Meaning
  66         ------------------------------------------
  67         A        Alanine
  68         B        Aspartic Acid, Asparagine
  69         C        Cystine
  70         D        Aspartic Acid
  71         E        Glutamic Acid
  72         F        Phenylalanine
  73         G        Glycine
  74         H        Histidine
  75         I        Isoleucine
  76         J        Isoleucine/Leucine
  77         K        Lysine
  78         L        Leucine
  79         M        Methionine
  80         N        Asparagine
  81         O        Pyrrolysine
  82         P        Proline
  83         Q        Glutamine
  84         R        Arginine
  85         S        Serine
  86         T        Threonine
  87         U        Selenocysteine
  88         V        Valine
  89         W        Tryptophan
  90         X        Unknown
  91         Y        Tyrosine
  92         Z        Glutamic Acid, Glutamine
  93         *        Terminator
  94
  95
  96         IUPAC-IUP AMINO ACID SYMBOLS:
  97           Biochem J. 1984 Apr 15; 219(2): 345-373
  98           Eur J Biochem. 1993 Apr 1; 213(1): 2
  99
 100 =head1 FEEDBACK
 101
 102 =head2 Mailing Lists
 103
 104 User feedback is an integral part of the evolution of this and other
 105 Bioperl modules. Send your comments and suggestions preferably to one
 106 of the Bioperl mailing lists.  Your participation is much appreciated.
 107
 108   bioperl-l@bioperl.org                  - General discussion
 109   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
 110
 111 =head2 Reporting Bugs
 112
 113 Report bugs to the Bioperl bug tracking system to help us keep track
 114 the bugs and their resolution.  Bug reports can be submitted via the
 115 web:
 116
 117   http://bugzilla.open-bio.org/
 118
 119 =head1 AUTHOR - Aaron Mackey
 120
 121 Email amackey-at-virginia.edu
 122
 123 =head1 APPENDIX
 124
 125 The rest of the documentation details each of the object
 126 methods. Internal methods are usually preceded with a _
 127
 128 =cut
 129
 130
 131 # Let the code begin...
 132
 133 package Bio::Tools::IUPAC;
 134
 135 use strict;
 136 use vars qw(%IUP %IUB %REV_IUB $AUTOLOAD);
 137
 138 BEGIN {
 139     %IUB = ( A => [qw(A)],
 140              C => [qw(C)],
 141              G => [qw(G)],
 142              T => [qw(T)],
 143              U => [qw(U)],
 144              M => [qw(A C)],
 145              R => [qw(A G)],
 146              W => [qw(A T)],
 147              S => [qw(C G)],
 148              Y => [qw(C T)],
 149              K => [qw(G T)],
 150              V => [qw(A C G)],
 151              H => [qw(A C T)],
 152              D => [qw(A G T)],
 153              B => [qw(C G T)],
 154              X => [qw(G A T C)],
 155              N => [qw(G A T C)]
 156              );
 157         %REV_IUB = (A   => 'A',
 158                                 T       => 'T',
 159                                 C       => 'C',
 160                                 G       => 'G',
 161                                 AC      => 'M',
 162                                 AG      => 'R',
 163                                 AT      => 'W',
 164                                 CG      => 'S',
 165                                 CT      => 'Y',
 166                                 'GT'    => 'K',
 167                                 ACG     => 'V',
 168                                 ACT     => 'H',
 169                                 AGT     => 'D',
 170                                 CGT     => 'B',
 171                                 ACGT=> 'N',
 172                                 N       => 'N'
 173                                 );
 174
 175
 176     %IUP = (A => [qw(A)],
 177             B => [qw(D N)],
 178             C => [qw(C)],
 179             D => [qw(D)],
 180             E => [qw(E)],
 181             F => [qw(F)],
 182             G => [qw(G)],
 183             H => [qw(H)],
 184             I => [qw(I)],
 185         J => [qw(I L)],
 186             K => [qw(K)],
 187             L => [qw(L)],
 188             M => [qw(M)],
 189             N => [qw(N)],
 190         O => [qw(O)],
 191             P => [qw(P)],
 192             Q => [qw(Q)],
 193             R => [qw(R)],
 194             S => [qw(S)],
 195             T => [qw(T)],
 196             U => [qw(U)],
 197             V => [qw(V)],
 198             W => [qw(W)],
 199             X => [qw(X)],
 200             Y => [qw(Y)],
 201             Z => [qw(E Q)],
 202             '*' => ['*']
 203             );
 204
 205 }
 206 use base qw(Bio::Root::Root);
 207
 208 =head2 new
 209
 210  Title   : new
 211  Usage   : Bio::Tools::IUPAC->new( $seq)
 212  Function: returns a new seq stream (akin to SeqIO)
 213  Returns : a Bio::Tools::IUPAC stream object that will produce unique
 214            Seq objects on demand.
 215  Args    : an ambiguously coded Seq.pm object that has a specified 'alphabet'
 216
 217
 218 =cut
 219
 220
 221 sub new {
 222     my($class,@args) = @_;
 223     my $self = $class->SUPER::new(@args);
 224
 225     my ($seq) = $self->_rearrange([qw(SEQ)],@args);
 226     if((! defined($seq)) && @args && ref($args[0])) {
 227         # parameter not passed as named parameter?
 228         $seq = $args[0];
 229     }
 230     $seq->isa('Bio::Seq') or
 231         $self->throw("Must supply a Seq.pm object to IUPAC!");
 232     $self->{'_SeqObj'} = $seq;
 233     if ($self->{'_SeqObj'}->alphabet() =~ m/^[dr]na$/i ) {
 234         # nucleotide seq object
 235         $self->{'_alpha'} = [ map { $IUB{uc($_)} }
 236                               split('', $self->{'_SeqObj'}->seq()) ];
 237     } elsif ($self->{'_SeqObj'}->alphabet() =~ m/^protein$/i ) {
 238         # amino acid seq object
 239         $self->{'_alpha'} = [ map { $IUP{uc($_)} }
 240                                split('', $self->{'_SeqObj'}->seq()) ];
 241     } else { # unknown type: we could make a guess, but let's not.
 242         $self->throw("You must specify the 'type' of sequence provided to IUPAC");
 243     }
 244     $self->{'_string'} = [(0) x length($self->{'_SeqObj'}->seq())];
 245     scalar @{$self->{'_string'}} or $self->throw("Sequence has zero-length!");
 246     $self->{'_string'}->[0] = -1;
 247     return $self;
 248 }
 249
 250 =head2 next_seq
 251
 252  Title   : next_seq
 253  Usage   : $iupac->next_seq()
 254  Function: returns the next unique Seq object
 255  Returns : a Seq.pm object
 256  Args    : none.
 257
 258
 259 =cut
 260
 261 sub next_seq{
 262     my ($self) = @_;
 263
 264     for my $i ( 0 .. $#{$self->{'_string'}} ) {
 265         next unless $self->{'_string'}->[$i] || @{$self->{'_alpha'}->[$i]} > 1;
 266         if ( $self->{'_string'}->[$i] == $#{$self->{'_alpha'}->[$i]} ) { # rollover
 267             if ( $i == $#{$self->{'_string'}} ) { # end of possibilities
 268                 return;
 269             } else {
 270                 $self->{'_string'}->[$i] = 0;
 271                 next;
 272             }
 273         } else {
 274             $self->{'_string'}->[$i]++;
 275             my $j = -1;
 276             $self->{'_SeqObj'}->seq(join('', map { $j++; $self->{'_alpha'}->[$j]->[$_]; } @{$self->{'_string'}}));
 277             my $desc = $self->{'_SeqObj'}->desc();
 278             if ( !defined $desc ) { $desc = ""; }
 279
 280             $self->{'_num'}++;
 281             1 while $self->{'_num'} =~ s/(\d)(\d\d\d)(?!\d)/$1,$2/;
 282             $desc =~ s/( \[Bio::Tools::IUPAC-generated\sunique sequence # [^\]]*\])|$/ \[Bio::Tools::IUPAC-generated unique sequence # $self->{'_num'}\]/;
 283             $self->{'_SeqObj'}->desc($desc);
 284             $self->{'_num'} =~ s/,//g;
 285             return $self->{'_SeqObj'};
 286         }
 287     }
 288 }
 289
 290 =head2 iupac_iup
 291
 292  Title   : iupac_iup
 293  Usage   : my %aasymbols = $iupac->iupac_iup
 294  Function: Returns a hash of PROTEIN symbols -> symbol components
 295  Returns : Hash
 296  Args    : none
 297
 298 =cut
 299
 300 sub iupac_iup{
 301    return %IUP;
 302
 303 }
 304
 305 =head2 iupac_iub
 306
 307  Title   : iupac_iub
 308  Usage   : my %dnasymbols = $iupac->iupac_iub
 309  Function: Returns a hash of DNA symbols -> symbol components
 310  Returns : Hash
 311  Args    : none
 312
 313 =cut
 314
 315 sub iupac_iub{
 316    return %IUB;
 317 }
 318
 319 =head2 iupac_rev_iub
 320
 321  Title   : iupac_rev_iub
 322  Usage   : my %dnasymbols = $iupac->iupac_rev_iub
 323  Function: Returns a hash of nucleotide combinations -> IUPAC code
 324            (a reverse of the iupac_iub hash).
 325  Returns : Hash
 326  Args    : none
 327
 328 =cut
 329
 330 sub iupac_rev_iub{
 331    return %REV_IUB;
 332 }
 333
 334 =head2 count
 335
 336  Title   : count
 337  Usage   : my $total = $iupac->count();
 338  Function: Calculates the number of unique, unambiguous sequences that
 339            this ambiguous sequence could generate
 340  Return  : int
 341  Args    : none
 342
 343 =cut
 344
 345 sub count {
 346     my ($self) = @_;
 347
 348     my $count = 1;
 349     $count *= scalar(@$_) for (@{$self->{'_alpha'}});
 350     return $count;
 351 }
 352
 353
 354 sub AUTOLOAD {
 355
 356     my $self = shift @_;
 357     my $method = $AUTOLOAD;
 358     $method =~ s/.*:://;
 359     return $self->{'_SeqObj'}->$method(@_)
 360         unless $method eq 'DESTROY';
 361 }
 362
 363 1;
 364