From 8a95e51cbb864d6185ad9d059cd1fc228c5eca26 Mon Sep 17 00:00:00 2001 From: Brian Osborne Date: Thu, 28 Mar 2013 10:51:50 -0400 Subject: [PATCH] Add dna_to_aa_aln method and tests --- Bio/Align/Utilities.pm | 402 ++++++++++++++++++++++++++++++------------------- t/Align/AlignUtil.t | 123 ++++++++------- t/AlignIO/fasta.t | 1 - 3 files changed, 313 insertions(+), 213 deletions(-) rewrite t/Align/AlignUtil.t (88%) diff --git a/Bio/Align/Utilities.pm b/Bio/Align/Utilities.pm index 62dc47b8b..90bc80c54 100644 --- a/Bio/Align/Utilities.pm +++ b/Bio/Align/Utilities.pm @@ -1,9 +1,9 @@ # # BioPerl module for Bio::Align::Utilities # -# Please direct questions and support issues to +# Please direct questions and support issues to # -# Cared for by Jason Stajich +# Cared for by Jason Stajich and Brian Osborne # # Copyright Jason Stajich # @@ -25,11 +25,14 @@ and manipulating alignment objects # The coding sequence that is passed in should still be the full # length CDS as the nt alignment will be generated. # %dnaseqs is a hash of CDS sequences (spliced) + my $dna_aln = aa_to_dna_aln($aa_aln,\%dnaseqs); - my $dna_aln = &aa_to_dna_aln($aa_aln,\%dnaseqs); + # The reverse, which is simpler. The input alignment has to be + # translate-able, with gap lengths and an overall length divisible by 3 + my $aa_aln = dna_to_aa_aln($dna_al); - # generate bootstraps - my $replicates = &bootstrap_replicates($aln,$count); + # Generate bootstraps + my $replicates = bootstrap_replicates($aln,$count); =head1 DESCRIPTION @@ -39,7 +42,7 @@ alignments (L) objects. The B utility is essentially the same as the B program by Bill Pearson available at ftp://ftp.virginia.edu/pub/fasta/other/mrtrans.shar. Of course this -is a pure-perl implementation, but just to mention that if anything +is a pure-Perl implementation, but just to mention that if anything seems odd you can check the alignments generated against Bill's program. @@ -87,9 +90,8 @@ Internal methods are usually preceded with a _ #' keep my emacs happy # Let the code begin... - package Bio::Align::Utilities; -use vars qw(@EXPORT @EXPORT_OK $GAP $CODONGAP %EXPORT_TAGS); +use vars qw(@EXPORT @EXPORT_OK $GAP $CODONGAP %EXPORT_TAGS ); use strict; use Carp; use Bio::Root::Version; @@ -98,13 +100,14 @@ require Exporter; use base qw(Exporter); @EXPORT = qw(); -@EXPORT_OK = qw(aa_to_dna_aln bootstrap_replicates cat bootstrap_replicates_codons); -%EXPORT_TAGS = (all =>[@EXPORT, @EXPORT_OK]); +@EXPORT_OK = + qw(aa_to_dna_aln bootstrap_replicates cat bootstrap_replicates_codons dna_to_aa_aln); +%EXPORT_TAGS = ( all => [ @EXPORT, @EXPORT_OK ] ); BEGIN { use constant CODONSIZE => 3; - $GAP = '-'; - $CODONGAP = $GAP x CODONSIZE; + $GAP = '-'; + $CODONGAP = $GAP x CODONSIZE; } =head2 aa_to_dna_aln @@ -181,6 +184,67 @@ sub aa_to_dna_aln { return $dnaalign; } +=head2 dna_to_aa_aln + + Title : dna_to_aa_aln + Usage : my $aa_aln = dna_to_aa_aln($dna_aln); + Function: Convert a DNA alignment to an amino acid alignment where + the length of all alignment strings and the lengths of any + gaps must be divisible by 3 + Returns : Bio::Align::AlignI object + Args : the DNA alignment, a Bio::Align::AlignI of DNA sequences + +See also: L, L, L + +=cut + +sub dna_to_aa_aln { + my $dna_aln = shift; + unless ( defined $dna_aln + && ref($dna_aln) + && $dna_aln->isa('Bio::Align::AlignI') ) { + croak( +'Must provide a valid Bio::Align::AlignI object as the argument to dna_to_aa_aln' + ); + } + my $codon_table = Bio::Tools::CodonTable->new; + my $aa_aln = Bio::SimpleAlign->new; + + for my $seq ( $dna_aln->each_seq ) { + my ($aa_str, $aa_len); + my @aln_str = split '', $seq->seq; + croak("All lines in the alignment must have lengths divisible by 3") + if ( scalar(@aln_str) % CODONSIZE ); + + while ( @aln_str ) { + my $triplet = join '', (splice( @aln_str, 0, CODONSIZE )); + + if ( $triplet =~ /^[GATC]+$/i ) { + $aa_str .= $codon_table->translate($triplet); + $aa_len++; + } + elsif ( $triplet =~ /^[$Bio::LocatableSeq::GAP_SYMBOLS]+$/ ) { + $aa_str .= $GAP; + } + else { + croak("The triplet '$triplet' is neither a valid codon nor all gaps"); + } + } + my $new_aa = Bio::LocatableSeq->new( + -display_id => $seq->display_id, + -alphabet => 'protein', + -start => 1, + -end => $aa_len, + -strand => 1, + -seq => $aa_str + ); + + $aa_aln->add_seq($new_aa); + } + + $aa_aln; +} + =head2 bootstrap_replicates Title : bootstrap_replicates @@ -195,38 +259,41 @@ sub aa_to_dna_aln { =cut sub bootstrap_replicates { - my ($aln,$count) = @_; - $count ||= 1; - my $alen = $aln->length; - my (@seqs,@nm); - $aln->set_displayname_flat(1); - for my $s ( $aln->each_seq ) { - push @seqs, $s->seq(); - push @nm, $s->id; - } - my (@alns,$i); - while( $count-- > 0 ) { - my @newseqs; - for($i =0; $i < $alen; $i++ ) { - my $index = int(rand($alen)); - my $c = 0; - for ( @seqs ) { - $newseqs[$c++] .= substr($_,$index,1); - } - } - my $newaln = Bio::SimpleAlign->new(); - my $i = 0; - for my $s ( @newseqs ) { - (my $tmp = $s) =~ s{[$Bio::LocatableSeq::GAP_SYMBOLS]+}{}g; - $newaln->add_seq( Bio::LocatableSeq->new - (-start => 1, - -end => length($tmp), - -display_id => $nm[$i++], - -seq => $s)); - } - push @alns, $newaln; - } - return \@alns; + my ( $aln, $count ) = @_; + $count ||= 1; + my $alen = $aln->length; + my ( @seqs, @nm ); + $aln->set_displayname_flat(1); + for my $s ( $aln->each_seq ) { + push @seqs, $s->seq(); + push @nm, $s->id; + } + my ( @alns, $i ); + while ( $count-- > 0 ) { + my @newseqs; + for ( $i = 0 ; $i < $alen ; $i++ ) { + my $index = int( rand($alen) ); + my $c = 0; + for (@seqs) { + $newseqs[ $c++ ] .= substr( $_, $index, 1 ); + } + } + my $newaln = Bio::SimpleAlign->new(); + my $i = 0; + for my $s (@newseqs) { + ( my $tmp = $s ) =~ s/[$Bio::LocatableSeq::GAP_SYMBOLS]+//g; + $newaln->add_seq( + Bio::LocatableSeq->new( + -start => 1, + -end => length($tmp), + -display_id => $nm[ $i++ ], + -seq => $s + ) + ); + } + push @alns, $newaln; + } + return \@alns; } =head2 bootstrap_replicates_codons @@ -236,7 +303,7 @@ sub bootstrap_replicates { Function: Generate a pseudo-replicate of the data by randomly sampling, with replacement, the columns from a codon alignment for the non-parametric bootstrap. The alignment is assumed to start on - the first position of a codon. + the first position of a codon. Returns : Arrayref of L objects Args : L object Number of replicates to generate @@ -244,43 +311,45 @@ sub bootstrap_replicates { =cut sub bootstrap_replicates_codons { - my ($aln,$count) = @_; - $count ||= 1; - my $alen = $aln->length; - my $ncodon = int($alen/3); - my (@seqs,@nm); - $aln->set_displayname_flat(1); - for my $s ( $aln->each_seq ) { - push @seqs, $s->seq(); - push @nm, $s->id; - } - my (@alns,$i); - while( $count-- > 0 ) { - my @newseqs; - for($i =0; $i < $ncodon; $i++ ) { - my $index = int(rand($ncodon)); - my $seqpos = $index * 3; - my $c = 0; - for ( @seqs ) { - $newseqs[$c++] .= substr($_,$seqpos,3); - } - } - my $newaln = Bio::SimpleAlign->new(); - my $i = 0; - for my $s ( @newseqs ) { - (my $tmp = $s) =~ s{[$Bio::LocatableSeq::GAP_SYMBOLS]+}{}g; - $newaln->add_seq( Bio::LocatableSeq->new - (-start => 1, - -end => length($tmp), - -display_id => $nm[$i++], - -seq => $s)); - } - push @alns, $newaln; - } - return \@alns; + my ( $aln, $count ) = @_; + $count ||= 1; + my $alen = $aln->length; + my $ncodon = int( $alen / 3 ); + my ( @seqs, @nm ); + $aln->set_displayname_flat(1); + for my $s ( $aln->each_seq ) { + push @seqs, $s->seq(); + push @nm, $s->id; + } + my ( @alns, $i ); + while ( $count-- > 0 ) { + my @newseqs; + for ( $i = 0 ; $i < $ncodon ; $i++ ) { + my $index = int( rand($ncodon) ); + my $seqpos = $index * 3; + my $c = 0; + for (@seqs) { + $newseqs[ $c++ ] .= substr( $_, $seqpos, 3 ); + } + } + my $newaln = Bio::SimpleAlign->new(); + my $i = 0; + for my $s (@newseqs) { + ( my $tmp = $s ) =~ s{[$Bio::LocatableSeq::GAP_SYMBOLS]+}{}g; + $newaln->add_seq( + Bio::LocatableSeq->new( + -start => 1, + -end => length($tmp), + -display_id => $nm[ $i++ ], + -seq => $s + ) + ); + } + push @alns, $newaln; + } + return \@alns; } - =head2 cat Title : cat @@ -299,96 +368,107 @@ sub bootstrap_replicates_codons { =cut sub cat { - my ($self, @aln) = @_; + my ( $self, @aln ) = @_; $self->throw("cat method called with no arguments") unless $self; - for ($self,@aln) { - $self->throw($_->id. " not a Bio::Align::AlignI object") unless $_->isa('Bio::Align::AlignI'); - $self->throw($_->id. " is not flush") unless $_->is_flush; + for ( $self, @aln ) { + $self->throw( $_->id . " is not a Bio::Align::AlignI object" ) + unless $_->isa('Bio::Align::AlignI'); + $self->throw( $_->id . " is not flush" ) unless $_->is_flush; } my $aln = $self->new; - $aln->id($self->id); - $aln->annotation($self->annotation); + $aln->id( $self->id ); + $aln->annotation( $self->annotation ); my %unique; SEQ: foreach my $seq ( $self->each_seq() ) { - throw("ID: ", $seq->id, " is not unique in initial alignment.") if exists $unique{$seq->id}; - $unique{$seq->id}=1; + throw( "ID: ", $seq->id, " is not unique in initial alignment." ) + if exists $unique{ $seq->id }; + $unique{ $seq->id } = 1; # Can be Bio::LocatableSeq, Bio::Seq::Meta or Bio::Seq::Meta::Array - my $new_seq = $seq->new(-id=> $seq->id, - -strand => $seq->strand, - -verbose => $self->verbose); - $new_seq->seq($seq->seq); - $new_seq->start($seq->start); - $new_seq->end($seq->end); - if ($new_seq->isa('Bio::Seq::MetaI')) { - for my $meta_name ($seq->meta_names) { - $new_seq->named_submeta($meta_name, $new_seq->start, $new_seq->end, $seq->named_meta($meta_name)); + my $new_seq = $seq->new( + -id => $seq->id, + -strand => $seq->strand, + -verbose => $self->verbose + ); + $new_seq->seq( $seq->seq ); + $new_seq->start( $seq->start ); + $new_seq->end( $seq->end ); + if ( $new_seq->isa('Bio::Seq::MetaI') ) { + for my $meta_name ( $seq->meta_names ) { + $new_seq->named_submeta( $meta_name, $new_seq->start, + $new_seq->end, $seq->named_meta($meta_name) ); } } - for my $cat_aln (@aln) { - my @cat_seq=$cat_aln->each_seq_with_id($seq->id); - if (@cat_seq==0) { - $self->warn($seq->id. " not found in alignment ". $cat_aln->id. ", skipping this sequence."); + for my $cat_aln (@aln) { + my @cat_seq = $cat_aln->each_seq_with_id( $seq->id ); + if ( @cat_seq == 0 ) { + $self->warn( $seq->id + . " not found in alignment " + . $cat_aln->id + . ", skipping this sequence." ); next SEQ; } - if (@cat_seq>1) { - $self->warn($seq->id. " found multiple times in alignment ". $cat_aln->id. ", skipping this sequence."); + if ( @cat_seq > 1 ) { + $self->warn( $seq->id + . " found multiple times in alignment " + . $cat_aln->id + . ", skipping this sequence." ); next SEQ; } - my $cat_seq=$cat_seq[0]; - my $old_end=$new_seq->end; - $new_seq->seq($new_seq->seq.$cat_seq->seq); - + my $cat_seq = $cat_seq[0]; + my $old_end = $new_seq->end; + $new_seq->seq( $new_seq->seq . $cat_seq->seq ); + # Not sure if this is a sensible way to deal with end coordinates - $new_seq->end($new_seq->end+$cat_seq->end+1-$cat_seq->start); - - if ($cat_seq->isa('Bio::Seq::Meta::Array')) { - unless ($new_seq->isa('Bio::Seq::Meta::Array')) { - my $meta_seq=Bio::Seq::Meta::Array->new; - $meta_seq->seq($new_seq->seq); - $meta_seq->start($new_seq->start); - $meta_seq->end($new_seq->end); - if ($new_seq->isa('Bio::Seq::Meta')) { - for my $meta_name ($new_seq->meta_names) { - $meta_seq->named_submeta($meta_name, - $new_seq->start, - $old_end, - [split(//, $new_seq->named_meta($meta_name))] - ); + $new_seq->end( + $new_seq->end + $cat_seq->end + 1 - $cat_seq->start ); + + if ( $cat_seq->isa('Bio::Seq::Meta::Array') ) { + unless ( $new_seq->isa('Bio::Seq::Meta::Array') ) { + my $meta_seq = Bio::Seq::Meta::Array->new; + $meta_seq->seq( $new_seq->seq ); + $meta_seq->start( $new_seq->start ); + $meta_seq->end( $new_seq->end ); + if ( $new_seq->isa('Bio::Seq::Meta') ) { + for my $meta_name ( $new_seq->meta_names ) { + $meta_seq->named_submeta( + $meta_name, + $new_seq->start, + $old_end, + [ + split( + //, $new_seq->named_meta($meta_name) + ) + ] + ); } } - $new_seq=$meta_seq; + $new_seq = $meta_seq; } - for my $meta_name ($cat_seq->meta_names) { - $new_seq->named_submeta($meta_name, - $old_end+1, - $new_seq->end, - $cat_seq->named_meta($meta_name) - ); + for my $meta_name ( $cat_seq->meta_names ) { + $new_seq->named_submeta( $meta_name, $old_end + 1, + $new_seq->end, $cat_seq->named_meta($meta_name) ); } - } elsif ($cat_seq->isa('Bio::Seq::Meta')) { - if ($new_seq->isa('Bio::Seq::Meta::Array')) { - for my $meta_name ($cat_seq->meta_names) { - $new_seq->named_submeta($meta_name, - $old_end+1, - $new_seq->end, - [split(//,$cat_seq->named_meta($meta_name))] - ); + } + elsif ( $cat_seq->isa('Bio::Seq::Meta') ) { + if ( $new_seq->isa('Bio::Seq::Meta::Array') ) { + for my $meta_name ( $cat_seq->meta_names ) { + $new_seq->named_submeta( $meta_name, $old_end + 1, + $new_seq->end, + [ split( //, $cat_seq->named_meta($meta_name) ) ] ); } - } else { - unless ($new_seq->isa('Bio::Seq::Meta')) { - my $meta_seq=Bio::Seq::Meta::Array->new; - $meta_seq->seq($new_seq->seq); - $meta_seq->start($new_seq->start); - $meta_seq->end($new_seq->end); - $new_seq=$meta_seq; + } + else { + unless ( $new_seq->isa('Bio::Seq::Meta') ) { + my $meta_seq = Bio::Seq::Meta::Array->new; + $meta_seq->seq( $new_seq->seq ); + $meta_seq->start( $new_seq->start ); + $meta_seq->end( $new_seq->end ); + $new_seq = $meta_seq; } - for my $meta_name ($cat_seq->meta_names) { - $new_seq->named_submeta($meta_name, - $old_end+1, - $new_seq->end, - $cat_seq->named_meta($meta_name) - ); + for my $meta_name ( $cat_seq->meta_names ) { + $new_seq->named_submeta( $meta_name, $old_end + 1, + $new_seq->end, $cat_seq->named_meta($meta_name) ); } } } @@ -399,24 +479,28 @@ sub cat { my $new_cons_meta; if ($cons_meta) { $new_cons_meta = Bio::Seq::Meta->new(); - for my $meta_name ($cons_meta->meta_names) { - $new_cons_meta->named_submeta($meta_name, 1, $self->length, $cons_meta->$meta_name); + for my $meta_name ( $cons_meta->meta_names ) { + $new_cons_meta->named_submeta( $meta_name, 1, $self->length, + $cons_meta->$meta_name ); } } - my $end=$self->length; + my $end = $self->length; for my $cat_aln (@aln) { - my $cat_cons_meta=$cat_aln->consensus_meta; + my $cat_cons_meta = $cat_aln->consensus_meta; if ($cat_cons_meta) { $new_cons_meta = Bio::Seq::Meta->new() if !$new_cons_meta; - for my $meta_name ($cat_cons_meta->meta_names) { - $new_cons_meta->named_submeta($meta_name, $end+1, $end+$cat_aln->length, $cat_cons_meta->$meta_name); + for my $meta_name ( $cat_cons_meta->meta_names ) { + $new_cons_meta->named_submeta( + $meta_name, $end + 1, + $end + $cat_aln->length, + $cat_cons_meta->$meta_name + ); } } - $end+=$cat_aln->length; - } + $end += $cat_aln->length; + } $aln->consensus_meta($new_cons_meta) if $new_cons_meta; return $aln; } - 1; diff --git a/t/Align/AlignUtil.t b/t/Align/AlignUtil.t dissimilarity index 88% index db7d998ec..9f6263af0 100644 --- a/t/Align/AlignUtil.t +++ b/t/Align/AlignUtil.t @@ -1,53 +1,70 @@ -# -*-Perl-*- Test Harness script for Bioperl -# $Id$ - -use strict; - -BEGIN { - use lib '.'; - use Bio::Root::Test; - - test_begin(-tests => 33); - - use_ok('Bio::Align::Utilities',qw(aa_to_dna_aln bootstrap_replicates cat) ); - use_ok('Bio::AlignIO'); - use_ok('Bio::SeqIO'); -} - -my $in = Bio::AlignIO->new(-format => 'clustalw', - -file => test_input_file('pep-266.aln')); -my $aln = $in->next_aln(); -isa_ok($aln, 'Bio::Align::AlignI'); -$in->close(); - -my $seqin = Bio::SeqIO->new(-format => 'fasta', - -file => test_input_file('cds-266.fas')); -# get the cds sequences -my %cds_seq; -while( my $seq = $seqin->next_seq ) { - $cds_seq{$seq->display_id} = $seq; -} - -my $cds_aln = &aa_to_dna_aln($aln,\%cds_seq); - -my @aa_seqs = $aln->each_seq; - -for my $cdsseq ( $cds_aln->each_seq ) { - my $peptrans = $cdsseq->translate(); - my $aaseq = shift @aa_seqs; - is($peptrans->seq(),$aaseq->seq()); -} - -my $bootstraps = &bootstrap_replicates($aln,10); - -is(scalar @$bootstraps, 10); - -my $sub_aln1=$aln->slice(1,100); -my $sub_aln2=$aln->slice(101,200); -my $sub_aln3=$aln->slice(1,200); -my $cat_aln=cat($sub_aln1, $sub_aln2); -my @seq=$sub_aln3->each_seq; -for my $seq ($cat_aln->each_seq) { - my $refseq=shift @seq; - is($seq->seq, $refseq->seq); -} +# -*-Perl-*- Test Harness script for Bioperl + +use strict; + +BEGIN { + use lib '.'; + use Bio::Root::Test; + + test_begin( -tests => 47 ); + + use_ok( 'Bio::Align::Utilities', + qw( aa_to_dna_aln bootstrap_replicates cat dna_to_aa_aln ) ); + use_ok('Bio::AlignIO'); + use_ok('Bio::SeqIO'); +} + +my $in = Bio::AlignIO->new( + -format => 'clustalw', + -file => test_input_file('pep-266.aln') +); +my $pep_aln = $in->next_aln(); +isa_ok( $pep_aln, 'Bio::Align::AlignI' ); +$in->close(); + +# aa_to_dna_aln +my $seqin = Bio::SeqIO->new( + -format => 'fasta', + -file => test_input_file('cds-266.fas') +); + +my %dna_seq; +while ( my $seq = $seqin->next_seq ) { + $dna_seq{ $seq->display_id } = $seq; +} + +my $dna_aln = aa_to_dna_aln( $pep_aln, \%dna_seq ); + +my @aa_seqs = $pep_aln->each_seq; + +for my $dna_seq ( $dna_aln->each_seq ) { + my $peptrans = $dna_seq->translate(); + my $aaseq = shift @aa_seqs; + is( $peptrans->seq(), $aaseq->seq() ); +} + +# dna_to_aa_aln +my $aa_aln = dna_to_aa_aln($dna_aln); + +my @pep_seqs = $aa_aln->each_seq; + +for my $dna_seq ( $dna_aln->each_seq ) { + my $peptrans = $dna_seq->translate(); + my $aaseq = shift @pep_seqs; + is( $peptrans->seq, $aaseq->seq ); +} + +# bootstrap_replicates +my $bootstraps = bootstrap_replicates( $pep_aln, 10 ); +is( scalar @$bootstraps, 10 ); + +# cat +my $sub_aln1 = $pep_aln->slice( 1, 100 ); +my $sub_aln2 = $pep_aln->slice( 101, 200 ); +my $sub_aln3 = $pep_aln->slice( 1, 200 ); +my $cat_aln = cat( $sub_aln1, $sub_aln2 ); +my @seq = $sub_aln3->each_seq; +for my $seq ( $cat_aln->each_seq ) { + my $refseq = shift @seq; + is( $seq->seq, $refseq->seq ); +} diff --git a/t/AlignIO/fasta.t b/t/AlignIO/fasta.t index 843c05e44..8ad9d452e 100644 --- a/t/AlignIO/fasta.t +++ b/t/AlignIO/fasta.t @@ -1,5 +1,4 @@ # -*-Perl-*- Test Harness script for Bioperl -# $Id: fasta.t 14971 2008-10-28 16:08:52Z cjfields $ use strict; -- 2.11.4.GIT