Bio/Align/Utilities.pm

   1 # $Id$
   2 #
   3 # BioPerl module for Bio::Align::Utilities
   4 #
   5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   6 #
   7 # Cared for by Jason Stajich <jason-at-bioperl.org>
   8 #
   9 # Copyright Jason Stajich
  10 #
  11 # You may distribute this module under the same terms as perl itself
  12
  13 # POD documentation - main docs before the code
  14
  15 =head1 NAME
  16
  17 Bio::Align::Utilities - A collection of utilities regarding converting
  18 and manipulating alignment objects
  19
  20 =head1 SYNOPSIS
  21
  22   use Bio::Align::Utilities qw(:all);
  23   # %dnaseqs is a hash of CDS sequences (spliced)
  24
  25
  26   # Even if the protein alignments are local make sure the start/end
  27   # stored in the LocatableSeq objects are to the full length protein.
  28   # The CoDing Sequence that is passed in should still be the full
  29   # length CDS as the nt alignment will be generated.
  30   #
  31   my $dna_aln = &aa_to_dna_aln($aa_aln,\%dnaseqs);
  32
  33
  34   # generate bootstraps
  35   my $replicates = &bootstrap_replicates($aln,$count);
  36
  37
  38 =head1 DESCRIPTION
  39
  40 This module contains utility methods for manipulating sequence
  41 alignments ( L<Bio::Align::AlignI>) objects.
  42
  43 The B<aa_to_dna_aln> utility is essentially the same as the B<mrtrans>
  44 program by Bill Pearson available at
  45 ftp://ftp.virginia.edu/pub/fasta/other/mrtrans.shar.  Of course this
  46 is a pure-perl implementation, but just to mention that if anything
  47 seems odd you can check the alignments generated against Bill's
  48 program.
  49
  50 =head1 FEEDBACK
  51
  52 =head2 Mailing Lists
  53
  54 User feedback is an integral part of the evolution of this and other
  55 Bioperl modules. Send your comments and suggestions preferably to
  56 the Bioperl mailing list.  Your participation is much appreciated.
  57
  58   bioperl-l@bioperl.org                  - General discussion
  59   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  60
  61 =head2 Support
  62
  63 Please direct usage questions or support issues to the mailing list:
  64
  65 L<bioperl-l@bioperl.org>
  66
  67 rather than to the module maintainer directly. Many experienced and
  68 reponsive experts will be able look at the problem and quickly
  69 address it. Please include a thorough description of the problem
  70 with code and data examples if at all possible.
  71
  72 =head2 Reporting Bugs
  73
  74 Report bugs to the Bioperl bug tracking system to help us keep track
  75 of the bugs and their resolution. Bug reports can be submitted via the
  76 web:
  77
  78   http://bugzilla.open-bio.org/
  79
  80 =head1 AUTHOR - Jason Stajich
  81
  82 Email jason-at-bioperl-dot-org
  83
  84 =head1 APPENDIX
  85
  86 The rest of the documentation details each of the object methods.
  87 Internal methods are usually preceded with a _
  88
  89 =cut
  90
  91 #' keep my emacs happy
  92 # Let the code begin...
  93
  94
  95 package Bio::Align::Utilities;
  96 use vars qw(@EXPORT @EXPORT_OK $GAP $CODONGAP %EXPORT_TAGS);
  97 use strict;
  98 use Carp;
  99 use Bio::Root::Version;
 100 require Exporter;
 101
 102 use base qw(Exporter);
 103
 104 @EXPORT = qw();
 105 @EXPORT_OK = qw(aa_to_dna_aln bootstrap_replicates cat);
 106 %EXPORT_TAGS = (all =>[@EXPORT, @EXPORT_OK]);
 107 BEGIN {
 108     use constant CODONSIZE => 3;
 109     $GAP = '-';
 110     $CODONGAP = $GAP x CODONSIZE;
 111 }
 112
 113 =head2 aa_to_dna_aln
 114
 115  Title   : aa_to_dna_aln
 116  Usage   : my $dnaaln = aa_to_dna_aln($aa_aln, \%seqs);
 117  Function: Will convert an AA alignment to DNA space given the
 118            corresponding DNA sequences.  Note that this method expects
 119            the DNA sequences to be in frame +1 (GFF frame 0) as it will
 120            start to project into coordinates starting at the first base of
 121            the DNA sequence, if this alignment represents a different
 122            frame for the cDNA you will need to edit the DNA sequences
 123            to remove the 1st or 2nd bases (and revcom if things should be).
 124  Returns : Bio::Align::AlignI object
 125  Args    : 2 arguments, the alignment and a hashref.
 126            Alignment is a Bio::Align::AlignI of amino acid sequences.
 127            The hash reference should have keys which are
 128            the display_ids for the aa
 129            sequences in the alignment and the values are a
 130            Bio::PrimarySeqI object for the corresponding
 131            spliced cDNA sequence.
 132
 133 See also: L<Bio::Align::AlignI>, L<Bio::SimpleAlign>, L<Bio::PrimarySeq>
 134
 135 =cut
 136
 137 sub aa_to_dna_aln {
 138     my ($aln,$dnaseqs) = @_;
 139     unless( defined $aln &&
 140             ref($aln) &&
 141             $aln->isa('Bio::Align::AlignI') ) {
 142         croak('Must provide a valid Bio::Align::AlignI object as the first argument to aa_to_dna_aln, see the documentation for proper usage and the method signature');
 143     }
 144     my $alnlen = $aln->length;
 145     my $dnaalign = Bio::SimpleAlign->new();
 146     $aln->map_chars('\.',$GAP);
 147
 148     foreach my $seq ( $aln->each_seq ) {
 149         my $aa_seqstr = $seq->seq();
 150         my $id = $seq->display_id;
 151         my $dnaseq = $dnaseqs->{$id} || $aln->throw("cannot find ".
 152                                                      $seq->display_id);
 153         my $start_offset = ($seq->start - 1) * CODONSIZE;
 154
 155         $dnaseq = $dnaseq->seq();
 156         my $dnalen = $dnaseqs->{$id}->length;
 157         my $nt_seqstr;
 158         my $j = 0;
 159         for( my $i = 0; $i < $alnlen; $i++ ) {
 160             my $char = substr($aa_seqstr,$i + $start_offset,1);
 161             if ( $char eq $GAP || $j >= $dnalen )  {
 162                 $nt_seqstr .= $CODONGAP;
 163             } else {
 164                 $nt_seqstr .= substr($dnaseq,$j,CODONSIZE);
 165                 $j += CODONSIZE;
 166             }
 167         }
 168         $nt_seqstr .= $GAP x (($alnlen * 3) - length($nt_seqstr));
 169
 170         my $newdna = Bio::LocatableSeq->new(-display_id  => $id,
 171                                            -alphabet    => 'dna',
 172                                            -start       => $start_offset+1,
 173                                            -end         => ($seq->end *
 174                                                             CODONSIZE),
 175                                            -strand      => 1,
 176                                            -seq         => $nt_seqstr);
 177         $dnaalign->add_seq($newdna);
 178     }
 179     return $dnaalign;
 180 }
 181
 182 =head2 bootstrap_replicates
 183
 184  Title   : bootstrap_replicates
 185  Usage   : my $alns = &bootstrap_replicates($aln,100);
 186  Function: Generate a pseudo-replicate of the data by randomly
 187            sampling, with replacement, the columns from an alignment for
 188            the non-parametric bootstrap.
 189  Returns : Arrayref of L<Bio::SimpleAlign> objects
 190  Args    : L<Bio::SimpleAlign> object
 191            Number of replicates to generate
 192
 193 =cut
 194
 195 sub bootstrap_replicates {
 196    my ($aln,$count) = @_;
 197    $count ||= 1;
 198    my $alen = $aln->length;
 199    my (@seqs,@nm);
 200    $aln->set_displayname_flat(1);
 201    for my $s ( $aln->each_seq ) {
 202        push @seqs, $s->seq();
 203        push @nm, $s->id;
 204    }
 205    my (@alns,$i);
 206    while( $count-- > 0 ) {
 207        my @newseqs;
 208        for($i =0; $i < $alen; $i++ ) {
 209            my $index = int(rand($alen));
 210            my $c = 0;
 211            for ( @seqs ) {
 212                $newseqs[$c++] .= substr($_,$index,1);
 213            }
 214        }
 215        my $newaln = Bio::SimpleAlign->new();
 216        my $i = 0;
 217        for my $s ( @newseqs ) {
 218        (my $tmp = $s) =~ s{[$Bio::LocatableSeq::GAP_SYMBOLS]+}{}g;
 219            $newaln->add_seq( Bio::LocatableSeq->new
 220                              (-start         => 1,
 221                               -end           => length($tmp),
 222                               -display_id    => $nm[$i++],
 223                               -seq           => $s));
 224        }
 225        push @alns, $newaln;
 226    }
 227    return \@alns;
 228 }
 229
 230 =head2 cat
 231
 232  Title     : cat
 233  Usage     : $aln123 = cat($aln1, $aln2, $aln3)
 234  Function  : Concatenates alignment objects. Sequences are identified by id.
 235              An error will be thrown if the sequence ids are not unique in the
 236              first alignment. If any ids are not present or not unique in any
 237              of the additional alignments then those sequences are omitted from
 238              the concatenated alignment, and a warning is issued. An error will
 239              be thrown if any of the alignments are not flush, since
 240              concatenating such alignments is unlikely to make biological
 241              sense.
 242  Returns   : A new Bio::SimpleAlign object
 243  Args      : A list of Bio::SimpleAlign objects
 244
 245 =cut
 246
 247 sub cat {
 248     my ($self, @aln) = @_;
 249     $self->throw("cat method called with no arguments") unless $self;
 250     for ($self,@aln) {
 251         $self->throw($_->id. " not a Bio::Align::AlignI object") unless $_->isa('Bio::Align::AlignI');
 252         $self->throw($_->id. " is not flush") unless $_->is_flush;
 253     }
 254     my $aln = $self->new;
 255     $aln->id($self->id);
 256     $aln->annotation($self->annotation);
 257     my %unique;
 258   SEQ: foreach my $seq ( $self->each_seq() ) {
 259         throw("ID: ", $seq->id, " is not unique in initial alignment.") if exists $unique{$seq->id};
 260         $unique{$seq->id}=1;
 261
 262         # Can be Bio::LocatableSeq, Bio::Seq::Meta or Bio::Seq::Meta::Array
 263         my $new_seq = $seq->new(-id=> $seq->id,
 264                                 -strand  => $seq->strand,
 265                                 -verbose => $self->verbose);
 266         $new_seq->seq($seq->seq);
 267         $new_seq->start($seq->start);
 268         $new_seq->end($seq->end);
 269         if ($new_seq->isa('Bio::Seq::MetaI')) {
 270             for my $meta_name ($seq->meta_names) {
 271                 $new_seq->named_submeta($meta_name, $new_seq->start, $new_seq->end, $seq->named_meta($meta_name));
 272             }
 273         }
 274         for my $cat_aln (@aln) {
 275             my @cat_seq=$cat_aln->each_seq_with_id($seq->id);
 276             if (@cat_seq==0) {
 277                 $self->warn($seq->id. " not found in alignment ". $cat_aln->id. ", skipping this sequence.");
 278                 next SEQ;
 279             }
 280             if (@cat_seq>1) {
 281                 $self->warn($seq->id. " found multiple times in alignment ".  $cat_aln->id. ", skipping this sequence.");
 282                 next SEQ;
 283             }
 284             my $cat_seq=$cat_seq[0];
 285             my $old_end=$new_seq->end;
 286             $new_seq->seq($new_seq->seq.$cat_seq->seq);
 287
 288             # Not sure if this is a sensible way to deal with end coordinates
 289             $new_seq->end($new_seq->end+$cat_seq->end+1-$cat_seq->start);
 290
 291             if ($cat_seq->isa('Bio::Seq::Meta::Array')) {
 292                 unless ($new_seq->isa('Bio::Seq::Meta::Array')) {
 293                     my $meta_seq=Bio::Seq::Meta::Array->new;
 294                     $meta_seq->seq($new_seq->seq);
 295                     $meta_seq->start($new_seq->start);
 296                     $meta_seq->end($new_seq->end);
 297                     if ($new_seq->isa('Bio::Seq::Meta')) {
 298                         for my $meta_name ($new_seq->meta_names) {
 299                             $meta_seq->named_submeta($meta_name,
 300                                                      $new_seq->start,
 301                                                      $old_end,
 302                                                      [split(//, $new_seq->named_meta($meta_name))]
 303                                                     );
 304                         }
 305                     }
 306                     $new_seq=$meta_seq;
 307                 }
 308                 for my $meta_name ($cat_seq->meta_names) {
 309                     $new_seq->named_submeta($meta_name,
 310                                             $old_end+1,
 311                                             $new_seq->end,
 312                                             $cat_seq->named_meta($meta_name)
 313                                            );
 314                 }
 315             } elsif ($cat_seq->isa('Bio::Seq::Meta')) {
 316                 if ($new_seq->isa('Bio::Seq::Meta::Array')) {
 317                     for my $meta_name ($cat_seq->meta_names) {
 318                         $new_seq->named_submeta($meta_name,
 319                                                 $old_end+1,
 320                                                 $new_seq->end,
 321                                                 [split(//,$cat_seq->named_meta($meta_name))]
 322                                                );
 323                     }
 324                 } else {
 325                     unless ($new_seq->isa('Bio::Seq::Meta')) {
 326                         my $meta_seq=Bio::Seq::Meta::Array->new;
 327                         $meta_seq->seq($new_seq->seq);
 328                         $meta_seq->start($new_seq->start);
 329                         $meta_seq->end($new_seq->end);
 330                         $new_seq=$meta_seq;
 331                     }
 332                     for my $meta_name ($cat_seq->meta_names) {
 333                         $new_seq->named_submeta($meta_name,
 334                                                 $old_end+1,
 335                                                 $new_seq->end,
 336                                                 $cat_seq->named_meta($meta_name)
 337                                                );
 338                     }
 339                 }
 340             }
 341         }
 342         $aln->add_seq($new_seq);
 343     }
 344     my $cons_meta = $self->consensus_meta;
 345     my $new_cons_meta;
 346     if ($cons_meta) {
 347         $new_cons_meta = Bio::Seq::Meta->new();
 348         for my $meta_name ($cons_meta->meta_names) {
 349             $new_cons_meta->named_submeta($meta_name, 1, $self->length, $cons_meta->$meta_name);
 350         }
 351     }
 352     my $end=$self->length;
 353     for my $cat_aln (@aln) {
 354         my $cat_cons_meta=$cat_aln->consensus_meta;
 355         if ($cat_cons_meta) {
 356             $new_cons_meta = Bio::Seq::Meta->new() if !$new_cons_meta;
 357             for my $meta_name ($cat_cons_meta->meta_names) {
 358                 $new_cons_meta->named_submeta($meta_name, $end+1, $end+$cat_aln->length, $cat_cons_meta->$meta_name);
 359             }
 360         }
 361         $end+=$cat_aln->length;
 362     }
 363     $aln->consensus_meta($new_cons_meta) if $new_cons_meta;
 364     return $aln;
 365 }
 366
 367
 368 1;