Bio/Align/Utilities.pm

   1 #
   2 # BioPerl module for Bio::Align::Utilities
   3 #
   4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   5 #
   6 # Cared for by Jason Stajich <jason-at-bioperl.org> and Brian Osborne
   7 #
   8 # Copyright Jason Stajich
   9 #
  10 # You may distribute this module under the same terms as perl itself
  11
  12 # POD documentation - main docs before the code
  13
  14 =head1 NAME
  15
  16 Bio::Align::Utilities - A collection of utilities regarding converting
  17 and manipulating alignment objects
  18
  19 =head1 SYNOPSIS
  20
  21   use Bio::Align::Utilities qw(:all);
  22
  23   # Even if the protein alignments are local make sure the start/end
  24   # stored in the LocatableSeq objects are to the full length protein.
  25   # The coding sequence that is passed in should still be the full
  26   # length CDS as the nt alignment will be generated.
  27   # %dnaseqs is a hash of CDS sequences (spliced)
  28   my $dna_aln = aa_to_dna_aln($aa_aln,\%dnaseqs);
  29
  30   # The reverse, which is simpler. The input alignment has to be
  31   # translate-able, with gap lengths and an overall length divisible by 3
  32   my $aa_aln = dna_to_aa_aln($dna_al);
  33
  34   # Generate bootstraps
  35   my $replicates = bootstrap_replicates($aln,$count);
  36
  37 =head1 DESCRIPTION
  38
  39 This module contains utility methods for manipulating sequence
  40 alignments (L<Bio::Align::AlignI>) objects.
  41
  42 The B<aa_to_dna_aln> utility is essentially the same as the B<mrtrans>
  43 program by Bill Pearson available at
  44 ftp://ftp.virginia.edu/pub/fasta/other/mrtrans.shar.  Of course this
  45 is a pure-Perl implementation, but just to mention that if anything
  46 seems odd you can check the alignments generated against Bill's
  47 program.
  48
  49 =head1 FEEDBACK
  50
  51 =head2 Mailing Lists
  52
  53 User feedback is an integral part of the evolution of this and other
  54 Bioperl modules. Send your comments and suggestions preferably to
  55 the Bioperl mailing list.  Your participation is much appreciated.
  56
  57   bioperl-l@bioperl.org                  - General discussion
  58   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  59
  60 =head2 Support
  61
  62 Please direct usage questions or support issues to the mailing list:
  63
  64 I<bioperl-l@bioperl.org>
  65
  66 rather than to the module maintainer directly. Many experienced and
  67 reponsive experts will be able look at the problem and quickly
  68 address it. Please include a thorough description of the problem
  69 with code and data examples if at all possible.
  70
  71 =head2 Reporting Bugs
  72
  73 Report bugs to the Bioperl bug tracking system to help us keep track
  74 of the bugs and their resolution. Bug reports can be submitted via the
  75 web:
  76
  77   https://redmine.open-bio.org/projects/bioperl/
  78
  79 =head1 AUTHOR - Jason Stajich
  80
  81 Email jason-at-bioperl-dot-org
  82
  83 =head1 APPENDIX
  84
  85 The rest of the documentation details each of the object methods.
  86 Internal methods are usually preceded with a _
  87
  88 =cut
  89
  90 #' keep my emacs happy
  91 # Let the code begin...
  92
  93 package Bio::Align::Utilities;
  94 use vars qw(@EXPORT @EXPORT_OK $GAP $CODONGAP %EXPORT_TAGS );
  95 use strict;
  96 use Carp;
  97 use Bio::Root::Version;
  98 require Exporter;
  99
 100 use base qw(Exporter);
 101
 102 @EXPORT = qw();
 103 @EXPORT_OK =
 104   qw(aa_to_dna_aln bootstrap_replicates cat bootstrap_replicates_codons dna_to_aa_aln);
 105 %EXPORT_TAGS = ( all => [ @EXPORT, @EXPORT_OK ] );
 106
 107 BEGIN {
 108     use constant CODONSIZE => 3;
 109     $GAP       = '-';
 110     $CODONGAP  = $GAP x CODONSIZE;
 111 }
 112
 113 =head2 aa_to_dna_aln
 114
 115  Title   : aa_to_dna_aln
 116  Usage   : my $dnaaln = aa_to_dna_aln($aa_aln, \%seqs);
 117  Function: Will convert an AA alignment to DNA space given the
 118            corresponding DNA sequences.  Note that this method expects
 119            the DNA sequences to be in frame +1 (GFF frame 0) as it will
 120            start to project into coordinates starting at the first base of
 121            the DNA sequence, if this alignment represents a different
 122            frame for the cDNA you will need to edit the DNA sequences
 123            to remove the 1st or 2nd bases (and revcom if things should be).
 124  Returns : Bio::Align::AlignI object
 125  Args    : 2 arguments, the alignment and a hashref.
 126            Alignment is a Bio::Align::AlignI of amino acid sequences.
 127            The hash reference should have keys which are
 128            the display_ids for the aa
 129            sequences in the alignment and the values are a
 130            Bio::PrimarySeqI object for the corresponding
 131            spliced cDNA sequence.
 132
 133 See also: L<Bio::Align::AlignI>, L<Bio::SimpleAlign>, L<Bio::PrimarySeq>
 134
 135 =cut
 136
 137 sub aa_to_dna_aln {
 138     my ( $aln, $dnaseqs ) = @_;
 139     unless ( defined $aln
 140         && ref($aln)
 141         && $aln->isa('Bio::Align::AlignI') )
 142     {
 143         croak(
 144 'Must provide a valid Bio::Align::AlignI object as the first argument to aa_to_dna_aln, see the documentation for proper usage and the method signature'
 145         );
 146     }
 147     my $alnlen   = $aln->length;
 148     my $dnaalign = Bio::SimpleAlign->new();
 149     $aln->map_chars( '\.', $GAP );
 150
 151     foreach my $seq ( $aln->each_seq ) {
 152         my $aa_seqstr = $seq->seq();
 153         my $id        = $seq->display_id;
 154         my $dnaseq =
 155           $dnaseqs->{$id} || $aln->throw( "cannot find " . $seq->display_id );
 156         my $start_offset = ( $seq->start - 1 ) * CODONSIZE;
 157
 158         $dnaseq = $dnaseq->seq();
 159         my $dnalen = $dnaseqs->{$id}->length;
 160         my $nt_seqstr;
 161         my $j = 0;
 162         for ( my $i = 0 ; $i < $alnlen ; $i++ ) {
 163             my $char = substr( $aa_seqstr, $i + $start_offset, 1 );
 164             if ( $char eq $GAP || $j >= $dnalen ) {
 165                 $nt_seqstr .= $CODONGAP;
 166             }
 167             else {
 168                 $nt_seqstr .= substr( $dnaseq, $j, CODONSIZE );
 169                 $j += CODONSIZE;
 170             }
 171         }
 172         $nt_seqstr .= $GAP x ( ( $alnlen * 3 ) - length($nt_seqstr) );
 173
 174         my $newdna = Bio::LocatableSeq->new(
 175             -display_id => $id,
 176             -alphabet   => 'dna',
 177             -start      => $start_offset + 1,
 178             -end        => ( $seq->end * CODONSIZE ),
 179             -strand     => 1,
 180             -seq        => $nt_seqstr
 181         );
 182         $dnaalign->add_seq($newdna);
 183     }
 184     return $dnaalign;
 185 }
 186
 187 =head2 dna_to_aa_aln
 188
 189  Title   : dna_to_aa_aln
 190  Usage   : my $aa_aln = dna_to_aa_aln($dna_aln);
 191  Function: Convert a DNA alignment to an amino acid alignment where
 192            the length of all alignment strings and the lengths of any
 193            gaps must be divisible by 3
 194  Returns : Bio::Align::AlignI object
 195  Args    : the DNA alignment, a Bio::Align::AlignI of DNA sequences
 196
 197 See also: L<Bio::Align::AlignI>, L<Bio::SimpleAlign>, L<Bio::PrimarySeq>
 198
 199 =cut
 200
 201 sub dna_to_aa_aln {
 202     my $dna_aln = shift;
 203     unless ( defined $dna_aln
 204         && ref($dna_aln)
 205         && $dna_aln->isa('Bio::Align::AlignI') ) {
 206         croak(
 207 'Must provide a valid Bio::Align::AlignI object as the argument to dna_to_aa_aln'
 208         );
 209     }
 210     my $codon_table = Bio::Tools::CodonTable->new;
 211     my $aa_aln      = Bio::SimpleAlign->new;
 212
 213     for my $seq ( $dna_aln->each_seq ) {
 214         my ($aa_str, $aa_len);
 215         my @aln_str = split '', $seq->seq;
 216         croak("All lines in the alignment must have lengths divisible by 3")
 217           if ( scalar(@aln_str) % CODONSIZE );
 218
 219         while ( @aln_str ) {
 220             my $triplet = join '', (splice( @aln_str, 0, CODONSIZE ));
 221
 222             if ( $triplet =~ /^[GATC]+$/i ) {
 223                 $aa_str .= $codon_table->translate($triplet);
 224                 $aa_len++;
 225             }
 226             elsif ( $triplet =~ /^[$Bio::LocatableSeq::GAP_SYMBOLS]+$/ ) {
 227                 $aa_str .= $GAP;
 228             }
 229             else {
 230                 croak("The triplet '$triplet' is neither a valid codon nor all gaps");
 231             }
 232         }
 233         my $new_aa = Bio::LocatableSeq->new(
 234             -display_id => $seq->display_id,
 235             -alphabet   => 'protein',
 236             -start      => 1,
 237             -end        => $aa_len,
 238             -strand     => 1,
 239             -seq        => $aa_str
 240         );
 241
 242         $aa_aln->add_seq($new_aa);
 243     }
 244
 245     $aa_aln;
 246 }
 247
 248 =head2 bootstrap_replicates
 249
 250  Title   : bootstrap_replicates
 251  Usage   : my $alns = &bootstrap_replicates($aln,100);
 252  Function: Generate a pseudo-replicate of the data by randomly
 253            sampling, with replacement, the columns from an alignment for
 254            the non-parametric bootstrap.
 255  Returns : Arrayref of L<Bio::SimpleAlign> objects
 256  Args    : L<Bio::SimpleAlign> object
 257            Number of replicates to generate
 258
 259 =cut
 260
 261 sub bootstrap_replicates {
 262     my ( $aln, $count ) = @_;
 263     $count ||= 1;
 264     my $alen = $aln->length;
 265     my ( @seqs, @nm );
 266     $aln->set_displayname_flat(1);
 267     for my $s ( $aln->each_seq ) {
 268         push @seqs, $s->seq();
 269         push @nm,   $s->id;
 270     }
 271     my ( @alns, $i );
 272     while ( $count-- > 0 ) {
 273         my @newseqs;
 274         for ( $i = 0 ; $i < $alen ; $i++ ) {
 275             my $index = int( rand($alen) );
 276             my $c     = 0;
 277             for (@seqs) {
 278                 $newseqs[ $c++ ] .= substr( $_, $index, 1 );
 279             }
 280         }
 281         my $newaln = Bio::SimpleAlign->new();
 282         my $i      = 0;
 283         for my $s (@newseqs) {
 284             ( my $tmp = $s ) =~ s/[$Bio::LocatableSeq::GAP_SYMBOLS]+//g;
 285             $newaln->add_seq(
 286                 Bio::LocatableSeq->new(
 287                     -start      => 1,
 288                     -end        => length($tmp),
 289                     -display_id => $nm[ $i++ ],
 290                     -seq        => $s
 291                 )
 292             );
 293         }
 294         push @alns, $newaln;
 295     }
 296     return \@alns;
 297 }
 298
 299 =head2 bootstrap_replicates_codons
 300
 301  Title   : bootstrap_replicates_codons
 302  Usage   : my $alns = &bootstrap_replicates_codons($aln,100);
 303  Function: Generate a pseudo-replicate of the data by randomly
 304            sampling, with replacement, the columns from a codon alignment for
 305            the non-parametric bootstrap. The alignment is assumed to start on
 306            the first position of a codon.
 307  Returns : Arrayref of L<Bio::SimpleAlign> objects
 308  Args    : L<Bio::SimpleAlign> object
 309            Number of replicates to generate
 310
 311 =cut
 312
 313 sub bootstrap_replicates_codons {
 314     my ( $aln, $count ) = @_;
 315     $count ||= 1;
 316     my $alen   = $aln->length;
 317     my $ncodon = int( $alen / 3 );
 318     my ( @seqs, @nm );
 319     $aln->set_displayname_flat(1);
 320     for my $s ( $aln->each_seq ) {
 321         push @seqs, $s->seq();
 322         push @nm,   $s->id;
 323     }
 324     my ( @alns, $i );
 325     while ( $count-- > 0 ) {
 326         my @newseqs;
 327         for ( $i = 0 ; $i < $ncodon ; $i++ ) {
 328             my $index  = int( rand($ncodon) );
 329             my $seqpos = $index * 3;
 330             my $c      = 0;
 331             for (@seqs) {
 332                 $newseqs[ $c++ ] .= substr( $_, $seqpos, 3 );
 333             }
 334         }
 335         my $newaln = Bio::SimpleAlign->new();
 336         my $i      = 0;
 337         for my $s (@newseqs) {
 338             ( my $tmp = $s ) =~ s{[$Bio::LocatableSeq::GAP_SYMBOLS]+}{}g;
 339             $newaln->add_seq(
 340                 Bio::LocatableSeq->new(
 341                     -start      => 1,
 342                     -end        => length($tmp),
 343                     -display_id => $nm[ $i++ ],
 344                     -seq        => $s
 345                 )
 346             );
 347         }
 348         push @alns, $newaln;
 349     }
 350     return \@alns;
 351 }
 352
 353 =head2 cat
 354
 355  Title     : cat
 356  Usage     : $aln123 = cat($aln1, $aln2, $aln3)
 357  Function  : Concatenates alignment objects. Sequences are identified by id.
 358              An error will be thrown if the sequence ids are not unique in the
 359              first alignment. If any ids are not present or not unique in any
 360              of the additional alignments then those sequences are omitted from
 361              the concatenated alignment, and a warning is issued. An error will
 362              be thrown if any of the alignments are not flush, since
 363              concatenating such alignments is unlikely to make biological
 364              sense.
 365  Returns   : A new Bio::SimpleAlign object
 366  Args      : A list of Bio::SimpleAlign objects
 367
 368 =cut
 369
 370 sub cat {
 371     my ( $self, @aln ) = @_;
 372     $self->throw("cat method called with no arguments") unless $self;
 373     for ( $self, @aln ) {
 374         $self->throw( $_->id . " is not a Bio::Align::AlignI object" )
 375           unless $_->isa('Bio::Align::AlignI');
 376         $self->throw( $_->id . " is not flush" ) unless $_->is_flush;
 377     }
 378     my $aln = $self->new;
 379     $aln->id( $self->id );
 380     $aln->annotation( $self->annotation );
 381     my %unique;
 382   SEQ: foreach my $seq ( $self->each_seq() ) {
 383         throw( "ID: ", $seq->id, " is not unique in initial alignment." )
 384           if exists $unique{ $seq->id };
 385         $unique{ $seq->id } = 1;
 386
 387         # Can be Bio::LocatableSeq, Bio::Seq::Meta or Bio::Seq::Meta::Array
 388         my $new_seq = $seq->new(
 389             -id      => $seq->id,
 390             -strand  => $seq->strand,
 391             -verbose => $self->verbose
 392         );
 393         $new_seq->seq( $seq->seq );
 394         $new_seq->start( $seq->start );
 395         $new_seq->end( $seq->end );
 396         if ( $new_seq->isa('Bio::Seq::MetaI') ) {
 397             for my $meta_name ( $seq->meta_names ) {
 398                 $new_seq->named_submeta( $meta_name, $new_seq->start,
 399                     $new_seq->end, $seq->named_meta($meta_name) );
 400             }
 401         }
 402         for my $cat_aln (@aln) {
 403             my @cat_seq = $cat_aln->each_seq_with_id( $seq->id );
 404             if ( @cat_seq == 0 ) {
 405                 $self->warn( $seq->id
 406                       . " not found in alignment "
 407                       . $cat_aln->id
 408                       . ", skipping this sequence." );
 409                 next SEQ;
 410             }
 411             if ( @cat_seq > 1 ) {
 412                 $self->warn( $seq->id
 413                       . " found multiple times in alignment "
 414                       . $cat_aln->id
 415                       . ", skipping this sequence." );
 416                 next SEQ;
 417             }
 418             my $cat_seq = $cat_seq[0];
 419             my $old_end = $new_seq->end;
 420             $new_seq->seq( $new_seq->seq . $cat_seq->seq );
 421
 422             # Not sure if this is a sensible way to deal with end coordinates
 423             $new_seq->end(
 424                 $new_seq->end + $cat_seq->end + 1 - $cat_seq->start );
 425
 426             if ( $cat_seq->isa('Bio::Seq::Meta::Array') ) {
 427                 unless ( $new_seq->isa('Bio::Seq::Meta::Array') ) {
 428                     my $meta_seq = Bio::Seq::Meta::Array->new;
 429                     $meta_seq->seq( $new_seq->seq );
 430                     $meta_seq->start( $new_seq->start );
 431                     $meta_seq->end( $new_seq->end );
 432                     if ( $new_seq->isa('Bio::Seq::Meta') ) {
 433                         for my $meta_name ( $new_seq->meta_names ) {
 434                             $meta_seq->named_submeta(
 435                                 $meta_name,
 436                                 $new_seq->start,
 437                                 $old_end,
 438                                 [
 439                                     split(
 440                                         //, $new_seq->named_meta($meta_name)
 441                                     )
 442                                 ]
 443                             );
 444                         }
 445                     }
 446                     $new_seq = $meta_seq;
 447                 }
 448                 for my $meta_name ( $cat_seq->meta_names ) {
 449                     $new_seq->named_submeta( $meta_name, $old_end + 1,
 450                         $new_seq->end, $cat_seq->named_meta($meta_name) );
 451                 }
 452             }
 453             elsif ( $cat_seq->isa('Bio::Seq::Meta') ) {
 454                 if ( $new_seq->isa('Bio::Seq::Meta::Array') ) {
 455                     for my $meta_name ( $cat_seq->meta_names ) {
 456                         $new_seq->named_submeta( $meta_name, $old_end + 1,
 457                             $new_seq->end,
 458                             [ split( //, $cat_seq->named_meta($meta_name) ) ] );
 459                     }
 460                 }
 461                 else {
 462                     unless ( $new_seq->isa('Bio::Seq::Meta') ) {
 463                         my $meta_seq = Bio::Seq::Meta::Array->new;
 464                         $meta_seq->seq( $new_seq->seq );
 465                         $meta_seq->start( $new_seq->start );
 466                         $meta_seq->end( $new_seq->end );
 467                         $new_seq = $meta_seq;
 468                     }
 469                     for my $meta_name ( $cat_seq->meta_names ) {
 470                         $new_seq->named_submeta( $meta_name, $old_end + 1,
 471                             $new_seq->end, $cat_seq->named_meta($meta_name) );
 472                     }
 473                 }
 474             }
 475         }
 476         $aln->add_seq($new_seq);
 477     }
 478     my $cons_meta = $self->consensus_meta;
 479     my $new_cons_meta;
 480     if ($cons_meta) {
 481         $new_cons_meta = Bio::Seq::Meta->new();
 482         for my $meta_name ( $cons_meta->meta_names ) {
 483             $new_cons_meta->named_submeta( $meta_name, 1, $self->length,
 484                 $cons_meta->$meta_name );
 485         }
 486     }
 487     my $end = $self->length;
 488     for my $cat_aln (@aln) {
 489         my $cat_cons_meta = $cat_aln->consensus_meta;
 490         if ($cat_cons_meta) {
 491             $new_cons_meta = Bio::Seq::Meta->new() if !$new_cons_meta;
 492             for my $meta_name ( $cat_cons_meta->meta_names ) {
 493                 $new_cons_meta->named_submeta(
 494                     $meta_name, $end + 1,
 495                     $end + $cat_aln->length,
 496                     $cat_cons_meta->$meta_name
 497                 );
 498             }
 499         }
 500         $end += $cat_aln->length;
 501     }
 502     $aln->consensus_meta($new_cons_meta) if $new_cons_meta;
 503     return $aln;
 504 }
 505
 506 1;