Bio/Align/Utilities.pm

   1 package Bio::Align::Utilities;
   2 use strict;
   3 use warnings;
   4 use Carp;
   5 use Bio::Root::Version;
   6
   7 use Exporter 'import';
   8 our @EXPORT_OK = qw(
   9   aa_to_dna_aln
  10   bootstrap_replicates
  11   cat
  12   bootstrap_replicates_codons
  13   dna_to_aa_aln
  14   most_common_sequences
  15 );
  16 our %EXPORT_TAGS = (all => \@EXPORT_OK);
  17
  18 #
  19 # BioPerl module for Bio::Align::Utilities
  20 #
  21 # Please direct questions and support issues to <bioperl-l@bioperl.org>
  22 #
  23 # Cared for by Jason Stajich <jason-at-bioperl.org> and Brian Osborne
  24 #
  25 # Copyright Jason Stajich
  26 #
  27 # You may distribute this module under the same terms as perl itself
  28
  29 # POD documentation - main docs before the code
  30
  31 =head1 NAME
  32
  33 Bio::Align::Utilities - A collection of utilities regarding converting
  34 and manipulating alignment objects
  35
  36 =head1 SYNOPSIS
  37
  38   use Bio::Align::Utilities qw(:all);
  39
  40   # Even if the protein alignments are local make sure the start/end
  41   # stored in the LocatableSeq objects are to the full length protein.
  42   # The coding sequence that is passed in should still be the full
  43   # length CDS as the nt alignment will be generated.
  44   # %dnaseqs is a hash of CDS sequences (spliced)
  45   my $dna_aln = aa_to_dna_aln($aa_aln,\%dnaseqs);
  46
  47   # The reverse, which is simpler. The input alignment has to be
  48   # translate-able, with gap lengths and an overall length divisible by 3
  49   my $aa_aln = dna_to_aa_aln($dna_al);
  50
  51   # Generate bootstraps
  52   my $replicates = bootstrap_replicates($aln,$count);
  53
  54 =head1 DESCRIPTION
  55
  56 This module contains utility methods for manipulating sequence
  57 alignments (L<Bio::Align::AlignI>) objects.
  58
  59 The B<aa_to_dna_aln> utility is essentially the same as the B<mrtrans>
  60 program by Bill Pearson available at
  61 ftp://ftp.virginia.edu/pub/fasta/other/mrtrans.shar.  Of course this
  62 is a pure-Perl implementation, but just to mention that if anything
  63 seems odd you can check the alignments generated against Bill's
  64 program.
  65
  66 =head1 FEEDBACK
  67
  68 =head2 Mailing Lists
  69
  70 User feedback is an integral part of the evolution of this and other
  71 Bioperl modules. Send your comments and suggestions preferably to
  72 the Bioperl mailing list.  Your participation is much appreciated.
  73
  74   bioperl-l@bioperl.org                  - General discussion
  75   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  76
  77 =head2 Support
  78
  79 Please direct usage questions or support issues to the mailing list:
  80
  81 I<bioperl-l@bioperl.org>
  82
  83 rather than to the module maintainer directly. Many experienced and
  84 reponsive experts will be able look at the problem and quickly
  85 address it. Please include a thorough description of the problem
  86 with code and data examples if at all possible.
  87
  88 =head2 Reporting Bugs
  89
  90 Report bugs to the Bioperl bug tracking system to help us keep track
  91 of the bugs and their resolution. Bug reports can be submitted via the
  92 web:
  93
  94   https://github.com/bioperl/bioperl-live/issues
  95
  96 =head1 AUTHOR - Jason Stajich
  97
  98 Email jason-at-bioperl-dot-org
  99
 100 =head1 APPENDIX
 101
 102 The rest of the documentation details each of the object methods.
 103 Internal methods are usually preceded with a _
 104
 105 =cut
 106
 107 use constant CODONSIZE => 3;
 108 our $GAP       = '-';
 109 our $CODONGAP  = $GAP x CODONSIZE;
 110
 111 =head2 aa_to_dna_aln
 112
 113  Title   : aa_to_dna_aln
 114  Usage   : my $dnaaln = aa_to_dna_aln($aa_aln, \%seqs);
 115  Function: Will convert an AA alignment to DNA space given the
 116            corresponding DNA sequences.  Note that this method expects
 117            the DNA sequences to be in frame +1 (GFF frame 0) as it will
 118            start to project into coordinates starting at the first base of
 119            the DNA sequence, if this alignment represents a different
 120            frame for the cDNA you will need to edit the DNA sequences
 121            to remove the 1st or 2nd bases (and revcom if things should be).
 122  Returns : Bio::Align::AlignI object
 123  Args    : 2 arguments, the alignment and a hashref.
 124            Alignment is a Bio::Align::AlignI of amino acid sequences.
 125            The hash reference should have keys which are
 126            the display_ids for the aa
 127            sequences in the alignment and the values are a
 128            Bio::PrimarySeqI object for the corresponding
 129            spliced cDNA sequence.
 130
 131 See also: L<Bio::Align::AlignI>, L<Bio::SimpleAlign>, L<Bio::PrimarySeq>
 132
 133 =cut
 134
 135 sub aa_to_dna_aln {
 136     my ( $aln, $dnaseqs ) = @_;
 137     unless ( defined $aln
 138         && ref($aln)
 139         && $aln->isa('Bio::Align::AlignI') )
 140     {
 141         croak(
 142 'Must provide a valid Bio::Align::AlignI object as the first argument to aa_to_dna_aln, see the documentation for proper usage and the method signature'
 143         );
 144     }
 145     my $alnlen   = $aln->length;
 146     my $dnaalign = Bio::SimpleAlign->new();
 147     $aln->map_chars( '\.', $GAP );
 148
 149     foreach my $seq ( $aln->each_seq ) {
 150         my $aa_seqstr = $seq->seq();
 151         my $pepid     = $seq->display_id;
 152         my $dnaseq    = $dnaseqs->{$pepid} || $aln->throw( "cannot find " . $seq->display_id );
 153         my $start_offset = ( $seq->start - 1 ) * CODONSIZE;
 154         $dnaseq = $dnaseq->seq();
 155         my $dnalen = $dnaseqs->{$pepid}->length;
 156         my $dnaid = $dnaseqs->{$pepid}->display_id || $pepid; # try to use DNAseq obj ID (issue #137)
 157         my $nt_seqstr;
 158         my $j = 0;
 159         for ( my $i = 0 ; $i < $alnlen ; $i++ ) {
 160             my $char = substr( $aa_seqstr, $i + $start_offset, 1 );
 161             if ( $char eq $GAP || $j >= $dnalen ) {
 162                 $nt_seqstr .= $CODONGAP;
 163             }
 164             else {
 165                 $nt_seqstr .= substr( $dnaseq, $j, CODONSIZE );
 166                 $j += CODONSIZE;
 167             }
 168         }
 169         $nt_seqstr .= $GAP x ( ( $alnlen * 3 ) - length($nt_seqstr) );
 170
 171         my $newdna = Bio::LocatableSeq->new(
 172             -display_id => $dnaid,
 173             -alphabet   => 'dna',
 174             -start      => $start_offset + 1,
 175             -end        => ( $seq->end * CODONSIZE ),
 176             -strand     => 1,
 177             -seq        => $nt_seqstr
 178         );
 179         $dnaalign->add_seq($newdna);
 180     }
 181     return $dnaalign;
 182 }
 183
 184 =head2 dna_to_aa_aln
 185
 186  Title   : dna_to_aa_aln
 187  Usage   : my $aa_aln = dna_to_aa_aln($dna_aln);
 188  Function: Convert a DNA alignment to an amino acid alignment where
 189            the length of all alignment strings and the lengths of any
 190            gaps must be divisible by 3
 191  Returns : Bio::Align::AlignI object
 192  Args    : the DNA alignment, a Bio::Align::AlignI of DNA sequences
 193
 194 See also: L<Bio::Align::AlignI>, L<Bio::SimpleAlign>, L<Bio::PrimarySeq>
 195
 196 =cut
 197
 198 sub dna_to_aa_aln {
 199     my $dna_aln = shift;
 200     unless ( defined $dna_aln
 201         && ref($dna_aln)
 202         && $dna_aln->isa('Bio::Align::AlignI') ) {
 203         croak(
 204 'Must provide a valid Bio::Align::AlignI object as the argument to dna_to_aa_aln'
 205         );
 206     }
 207     my $codon_table = Bio::Tools::CodonTable->new;
 208     my $aa_aln      = Bio::SimpleAlign->new;
 209
 210     for my $seq ( $dna_aln->each_seq ) {
 211         my ($aa_str, $aa_len);
 212         my @aln_str = split '', $seq->seq;
 213         croak("All lines in the alignment must have lengths divisible by 3")
 214           if ( scalar(@aln_str) % CODONSIZE );
 215
 216         while ( @aln_str ) {
 217             my $triplet = join '', (splice( @aln_str, 0, CODONSIZE ));
 218
 219             if ( $triplet =~ /^[GATC]+$/i ) {
 220                 $aa_str .= $codon_table->translate($triplet);
 221                 $aa_len++;
 222             }
 223             elsif ( $triplet =~ /^[$Bio::LocatableSeq::GAP_SYMBOLS]+$/ ) {
 224                 $aa_str .= $GAP;
 225             }
 226             else {
 227                 croak("The triplet '$triplet' is neither a valid codon nor all gaps");
 228             }
 229         }
 230         my $new_aa = Bio::LocatableSeq->new(
 231             -display_id => $seq->display_id,
 232             -alphabet   => 'protein',
 233             -start      => 1,
 234             -end        => $aa_len,
 235             -strand     => 1,
 236             -seq        => $aa_str
 237         );
 238
 239         $aa_aln->add_seq($new_aa);
 240     }
 241
 242     $aa_aln;
 243 }
 244
 245 =head2 bootstrap_replicates
 246
 247  Title   : bootstrap_replicates
 248  Usage   : my $alns = &bootstrap_replicates($aln,100);
 249  Function: Generate a pseudo-replicate of the data by randomly
 250            sampling, with replacement, the columns from an alignment for
 251            the non-parametric bootstrap.
 252  Returns : Arrayref of L<Bio::SimpleAlign> objects
 253  Args    : L<Bio::SimpleAlign> object
 254            Number of replicates to generate
 255
 256 =cut
 257
 258 sub bootstrap_replicates {
 259     my ( $aln, $count ) = @_;
 260     $count ||= 1;
 261     my $alen = $aln->length;
 262     my ( @seqs, @nm );
 263     $aln->set_displayname_flat(1);
 264     for my $s ( $aln->each_seq ) {
 265         push @seqs, $s->seq();
 266         push @nm,   $s->id;
 267     }
 268     my ( @alns, $i );
 269     while ( $count-- > 0 ) {
 270         my @newseqs;
 271         for ( $i = 0 ; $i < $alen ; $i++ ) {
 272             my $index = int( rand($alen) );
 273             my $c     = 0;
 274             for (@seqs) {
 275                 $newseqs[ $c++ ] .= substr( $_, $index, 1 );
 276             }
 277         }
 278         my $newaln = Bio::SimpleAlign->new();
 279         my $i      = 0;
 280         for my $s (@newseqs) {
 281             ( my $tmp = $s ) =~ s/[$Bio::LocatableSeq::GAP_SYMBOLS]+//g;
 282             $newaln->add_seq(
 283                 Bio::LocatableSeq->new(
 284                     -start      => 1,
 285                     -end        => length($tmp),
 286                     -display_id => $nm[ $i++ ],
 287                     -seq        => $s
 288                 )
 289             );
 290         }
 291         push @alns, $newaln;
 292     }
 293     return \@alns;
 294 }
 295
 296 =head2 bootstrap_replicates_codons
 297
 298  Title   : bootstrap_replicates_codons
 299  Usage   : my $alns = &bootstrap_replicates_codons($aln,100);
 300  Function: Generate a pseudo-replicate of the data by randomly
 301            sampling, with replacement, the columns from a codon alignment for
 302            the non-parametric bootstrap. The alignment is assumed to start on
 303            the first position of a codon.
 304  Returns : Arrayref of L<Bio::SimpleAlign> objects
 305  Args    : L<Bio::SimpleAlign> object
 306            Number of replicates to generate
 307
 308 =cut
 309
 310 sub bootstrap_replicates_codons {
 311     my ( $aln, $count ) = @_;
 312     $count ||= 1;
 313     my $alen   = $aln->length;
 314     my $ncodon = int( $alen / 3 );
 315     my ( @seqs, @nm );
 316     $aln->set_displayname_flat(1);
 317     for my $s ( $aln->each_seq ) {
 318         push @seqs, $s->seq();
 319         push @nm,   $s->id;
 320     }
 321     my ( @alns, $i );
 322     while ( $count-- > 0 ) {
 323         my @newseqs;
 324         for ( $i = 0 ; $i < $ncodon ; $i++ ) {
 325             my $index  = int( rand($ncodon) );
 326             my $seqpos = $index * 3;
 327             my $c      = 0;
 328             for (@seqs) {
 329                 $newseqs[ $c++ ] .= substr( $_, $seqpos, 3 );
 330             }
 331         }
 332         my $newaln = Bio::SimpleAlign->new();
 333         my $i      = 0;
 334         for my $s (@newseqs) {
 335             ( my $tmp = $s ) =~ s{[$Bio::LocatableSeq::GAP_SYMBOLS]+}{}g;
 336             $newaln->add_seq(
 337                 Bio::LocatableSeq->new(
 338                     -start      => 1,
 339                     -end        => length($tmp),
 340                     -display_id => $nm[ $i++ ],
 341                     -seq        => $s
 342                 )
 343             );
 344         }
 345         push @alns, $newaln;
 346     }
 347     return \@alns;
 348 }
 349
 350 =head2 cat
 351
 352  Title     : cat
 353  Usage     : $aln123 = cat($aln1, $aln2, $aln3)
 354  Function  : Concatenates alignment objects. Sequences are identified by id.
 355              An error will be thrown if the sequence ids are not unique in the
 356              first alignment. If any ids are not present or not unique in any
 357              of the additional alignments then those sequences are omitted from
 358              the concatenated alignment, and a warning is issued. An error will
 359              be thrown if any of the alignments are not flush, since
 360              concatenating such alignments is unlikely to make biological
 361              sense.
 362  Returns   : A new Bio::SimpleAlign object
 363  Args      : A list of Bio::SimpleAlign objects
 364
 365 =cut
 366
 367 sub cat {
 368     my ( $self, @aln ) = @_;
 369     $self->throw("cat method called with no arguments") unless $self;
 370     for ( $self, @aln ) {
 371         $self->throw( $_->id . " is not a Bio::Align::AlignI object" )
 372           unless $_->isa('Bio::Align::AlignI');
 373         $self->throw( $_->id . " is not flush" ) unless $_->is_flush;
 374     }
 375     my $aln = $self->new;
 376     $aln->id( $self->id );
 377     $aln->annotation( $self->annotation );
 378     my %unique;
 379   SEQ: foreach my $seq ( $self->each_seq() ) {
 380         throw( "ID: ", $seq->id, " is not unique in initial alignment." )
 381           if exists $unique{ $seq->id };
 382         $unique{ $seq->id } = 1;
 383
 384         # Can be Bio::LocatableSeq, Bio::Seq::Meta or Bio::Seq::Meta::Array
 385         my $new_seq = $seq->new(
 386             -id      => $seq->id,
 387             -strand  => $seq->strand,
 388             -verbose => $self->verbose
 389         );
 390         $new_seq->seq( $seq->seq );
 391         $new_seq->start( $seq->start );
 392         $new_seq->end( $seq->end );
 393         if ( $new_seq->isa('Bio::Seq::MetaI') ) {
 394             for my $meta_name ( $seq->meta_names ) {
 395                 $new_seq->named_submeta( $meta_name, $new_seq->start,
 396                     $new_seq->end, $seq->named_meta($meta_name) );
 397             }
 398         }
 399         for my $cat_aln (@aln) {
 400             my @cat_seq = $cat_aln->each_seq_with_id( $seq->id );
 401             if ( @cat_seq == 0 ) {
 402                 $self->warn( $seq->id
 403                       . " not found in alignment "
 404                       . $cat_aln->id
 405                       . ", skipping this sequence." );
 406                 next SEQ;
 407             }
 408             if ( @cat_seq > 1 ) {
 409                 $self->warn( $seq->id
 410                       . " found multiple times in alignment "
 411                       . $cat_aln->id
 412                       . ", skipping this sequence." );
 413                 next SEQ;
 414             }
 415             my $cat_seq = $cat_seq[0];
 416             my $old_end = $new_seq->end;
 417             $new_seq->seq( $new_seq->seq . $cat_seq->seq );
 418
 419             # Not sure if this is a sensible way to deal with end coordinates
 420             $new_seq->end(
 421                 $new_seq->end + $cat_seq->end + 1 - $cat_seq->start );
 422
 423             if ( $cat_seq->isa('Bio::Seq::Meta::Array') ) {
 424                 unless ( $new_seq->isa('Bio::Seq::Meta::Array') ) {
 425                     my $meta_seq = Bio::Seq::Meta::Array->new;
 426                     $meta_seq->seq( $new_seq->seq );
 427                     $meta_seq->start( $new_seq->start );
 428                     $meta_seq->end( $new_seq->end );
 429                     if ( $new_seq->isa('Bio::Seq::Meta') ) {
 430                         for my $meta_name ( $new_seq->meta_names ) {
 431                             $meta_seq->named_submeta(
 432                                 $meta_name,
 433                                 $new_seq->start,
 434                                 $old_end,
 435                                 [
 436                                     split(
 437                                         //, $new_seq->named_meta($meta_name)
 438                                     )
 439                                 ]
 440                             );
 441                         }
 442                     }
 443                     $new_seq = $meta_seq;
 444                 }
 445                 for my $meta_name ( $cat_seq->meta_names ) {
 446                     $new_seq->named_submeta( $meta_name, $old_end + 1,
 447                         $new_seq->end, $cat_seq->named_meta($meta_name) );
 448                 }
 449             }
 450             elsif ( $cat_seq->isa('Bio::Seq::Meta') ) {
 451                 if ( $new_seq->isa('Bio::Seq::Meta::Array') ) {
 452                     for my $meta_name ( $cat_seq->meta_names ) {
 453                         $new_seq->named_submeta( $meta_name, $old_end + 1,
 454                             $new_seq->end,
 455                             [ split( //, $cat_seq->named_meta($meta_name) ) ] );
 456                     }
 457                 }
 458                 else {
 459                     unless ( $new_seq->isa('Bio::Seq::Meta') ) {
 460                         my $meta_seq = Bio::Seq::Meta::Array->new;
 461                         $meta_seq->seq( $new_seq->seq );
 462                         $meta_seq->start( $new_seq->start );
 463                         $meta_seq->end( $new_seq->end );
 464                         $new_seq = $meta_seq;
 465                     }
 466                     for my $meta_name ( $cat_seq->meta_names ) {
 467                         $new_seq->named_submeta( $meta_name, $old_end + 1,
 468                             $new_seq->end, $cat_seq->named_meta($meta_name) );
 469                     }
 470                 }
 471             }
 472         }
 473         $aln->add_seq($new_seq);
 474     }
 475     my $cons_meta = $self->consensus_meta;
 476     my $new_cons_meta;
 477     if ($cons_meta) {
 478         $new_cons_meta = Bio::Seq::Meta->new();
 479         for my $meta_name ( $cons_meta->meta_names ) {
 480             $new_cons_meta->named_submeta( $meta_name, 1, $self->length,
 481                 $cons_meta->$meta_name );
 482         }
 483     }
 484     my $end = $self->length;
 485     for my $cat_aln (@aln) {
 486         my $cat_cons_meta = $cat_aln->consensus_meta;
 487         if ($cat_cons_meta) {
 488             $new_cons_meta = Bio::Seq::Meta->new() if !$new_cons_meta;
 489             for my $meta_name ( $cat_cons_meta->meta_names ) {
 490                 $new_cons_meta->named_submeta(
 491                     $meta_name, $end + 1,
 492                     $end + $cat_aln->length,
 493                     $cat_cons_meta->$meta_name
 494                 );
 495             }
 496         }
 497         $end += $cat_aln->length;
 498     }
 499     $aln->consensus_meta($new_cons_meta) if $new_cons_meta;
 500     return $aln;
 501 }
 502
 503
 504 =head2 most_common_sequences
 505
 506  Title     : most_common_sequences
 507  Usage     : @common = most_common_sequences ($align, $case_sensitivity)
 508  Function  : Returns an array of the sequences that appear most often in the
 509              alignment (although this probably makes more sense when there is
 510              only a single most common sequence).  Sequences are compared after
 511              removing any "-" (gap characters), and ambiguous units (e.g., R
 512              for purines) are only compared to themselves.  The returned
 513              sequence is also missing the "-" since they don't actually make
 514              part of the sequence.
 515  Returns   : Array of text strings.
 516  Arguments : Optional argument defining whether the comparison between sequences
 517              to find the most common should be case sensitive. Defaults to
 518              false, i.e, not case sensitive.
 519
 520 =cut
 521
 522 sub most_common_sequences {
 523     my $align = shift
 524       or croak ("Must provide Bio::AlignI object to Bio::Align::Utilities::most_common_sequences");
 525     my $case_sensitive = shift; # defaults to false (we get undef if nothing)
 526
 527     ## We keep track of the max on this loop. Saves us having to
 528     ## transverse the hash table later to find the maximum value.
 529     my $max = 0;
 530     my %counts;
 531     foreach ($align->each_seq) {
 532         (my $seq = $_->seq) =~ tr/-//d;
 533         $seq = uc ($seq) unless $case_sensitive;
 534         $max++ if (++$counts{$seq} > $max);
 535     }
 536     my @common = grep ($counts{$_} == $max, keys %counts);
 537     return @common;
 538 }
 539
 540 1;