t/AlignIO/AlignIO.t: fix number of tests in plan (fixup c523e6bed866)
[bioperl-live.git] / Bio / Align / Utilities.pm
blob649ec637b9c0a77e9e03a39e84cf611ebb51633c
1 package Bio::Align::Utilities;
2 use strict;
3 use warnings;
4 use Carp;
5 use Bio::Root::Version;
7 use Exporter 'import';
8 our @EXPORT_OK = qw(
9 aa_to_dna_aln
10 bootstrap_replicates
11 cat
12 bootstrap_replicates_codons
13 dna_to_aa_aln
14 most_common_sequences
16 our %EXPORT_TAGS = (all => \@EXPORT_OK);
19 # BioPerl module for Bio::Align::Utilities
21 # Please direct questions and support issues to <bioperl-l@bioperl.org>
23 # Cared for by Jason Stajich <jason-at-bioperl.org> and Brian Osborne
25 # Copyright Jason Stajich
27 # You may distribute this module under the same terms as perl itself
29 # POD documentation - main docs before the code
31 =head1 NAME
33 Bio::Align::Utilities - A collection of utilities regarding converting
34 and manipulating alignment objects
36 =head1 SYNOPSIS
38 use Bio::Align::Utilities qw(:all);
40 # Even if the protein alignments are local make sure the start/end
41 # stored in the LocatableSeq objects are to the full length protein.
42 # The coding sequence that is passed in should still be the full
43 # length CDS as the nt alignment will be generated.
44 # %dnaseqs is a hash of CDS sequences (spliced)
45 my $dna_aln = aa_to_dna_aln($aa_aln,\%dnaseqs);
47 # The reverse, which is simpler. The input alignment has to be
48 # translate-able, with gap lengths and an overall length divisible by 3
49 my $aa_aln = dna_to_aa_aln($dna_al);
51 # Generate bootstraps
52 my $replicates = bootstrap_replicates($aln,$count);
54 =head1 DESCRIPTION
56 This module contains utility methods for manipulating sequence
57 alignments (L<Bio::Align::AlignI>) objects.
59 The B<aa_to_dna_aln> utility is essentially the same as the B<mrtrans>
60 program by Bill Pearson available at
61 ftp://ftp.virginia.edu/pub/fasta/other/mrtrans.shar. Of course this
62 is a pure-Perl implementation, but just to mention that if anything
63 seems odd you can check the alignments generated against Bill's
64 program.
66 =head1 FEEDBACK
68 =head2 Mailing Lists
70 User feedback is an integral part of the evolution of this and other
71 Bioperl modules. Send your comments and suggestions preferably to
72 the Bioperl mailing list. Your participation is much appreciated.
74 bioperl-l@bioperl.org - General discussion
75 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
77 =head2 Support
79 Please direct usage questions or support issues to the mailing list:
81 I<bioperl-l@bioperl.org>
83 rather than to the module maintainer directly. Many experienced and
84 reponsive experts will be able look at the problem and quickly
85 address it. Please include a thorough description of the problem
86 with code and data examples if at all possible.
88 =head2 Reporting Bugs
90 Report bugs to the Bioperl bug tracking system to help us keep track
91 of the bugs and their resolution. Bug reports can be submitted via the
92 web:
94 https://github.com/bioperl/bioperl-live/issues
96 =head1 AUTHOR - Jason Stajich
98 Email jason-at-bioperl-dot-org
100 =head1 APPENDIX
102 The rest of the documentation details each of the object methods.
103 Internal methods are usually preceded with a _
105 =cut
107 use constant CODONSIZE => 3;
108 our $GAP = '-';
109 our $CODONGAP = $GAP x CODONSIZE;
111 =head2 aa_to_dna_aln
113 Title : aa_to_dna_aln
114 Usage : my $dnaaln = aa_to_dna_aln($aa_aln, \%seqs);
115 Function: Will convert an AA alignment to DNA space given the
116 corresponding DNA sequences. Note that this method expects
117 the DNA sequences to be in frame +1 (GFF frame 0) as it will
118 start to project into coordinates starting at the first base of
119 the DNA sequence, if this alignment represents a different
120 frame for the cDNA you will need to edit the DNA sequences
121 to remove the 1st or 2nd bases (and revcom if things should be).
122 Returns : Bio::Align::AlignI object
123 Args : 2 arguments, the alignment and a hashref.
124 Alignment is a Bio::Align::AlignI of amino acid sequences.
125 The hash reference should have keys which are
126 the display_ids for the aa
127 sequences in the alignment and the values are a
128 Bio::PrimarySeqI object for the corresponding
129 spliced cDNA sequence.
131 See also: L<Bio::Align::AlignI>, L<Bio::SimpleAlign>, L<Bio::PrimarySeq>
133 =cut
135 sub aa_to_dna_aln {
136 my ( $aln, $dnaseqs ) = @_;
137 unless ( defined $aln
138 && ref($aln)
139 && $aln->isa('Bio::Align::AlignI') )
141 croak(
142 'Must provide a valid Bio::Align::AlignI object as the first argument to aa_to_dna_aln, see the documentation for proper usage and the method signature'
145 my $alnlen = $aln->length;
146 my $dnaalign = Bio::SimpleAlign->new();
147 $aln->map_chars( '\.', $GAP );
149 foreach my $seq ( $aln->each_seq ) {
150 my $aa_seqstr = $seq->seq();
151 my $pepid = $seq->display_id;
152 my $dnaseq = $dnaseqs->{$pepid} || $aln->throw( "cannot find " . $seq->display_id );
153 my $start_offset = ( $seq->start - 1 ) * CODONSIZE;
154 $dnaseq = $dnaseq->seq();
155 my $dnalen = $dnaseqs->{$pepid}->length;
156 my $dnaid = $dnaseqs->{$pepid}->display_id || $pepid; # try to use DNAseq obj ID (issue #137)
157 my $nt_seqstr;
158 my $j = 0;
159 for ( my $i = 0 ; $i < $alnlen ; $i++ ) {
160 my $char = substr( $aa_seqstr, $i + $start_offset, 1 );
161 if ( $char eq $GAP || $j >= $dnalen ) {
162 $nt_seqstr .= $CODONGAP;
164 else {
165 $nt_seqstr .= substr( $dnaseq, $j, CODONSIZE );
166 $j += CODONSIZE;
169 $nt_seqstr .= $GAP x ( ( $alnlen * 3 ) - length($nt_seqstr) );
171 my $newdna = Bio::LocatableSeq->new(
172 -display_id => $dnaid,
173 -alphabet => 'dna',
174 -start => $start_offset + 1,
175 -end => ( $seq->end * CODONSIZE ),
176 -strand => 1,
177 -seq => $nt_seqstr
179 $dnaalign->add_seq($newdna);
181 return $dnaalign;
184 =head2 dna_to_aa_aln
186 Title : dna_to_aa_aln
187 Usage : my $aa_aln = dna_to_aa_aln($dna_aln);
188 Function: Convert a DNA alignment to an amino acid alignment where
189 the length of all alignment strings and the lengths of any
190 gaps must be divisible by 3
191 Returns : Bio::Align::AlignI object
192 Args : the DNA alignment, a Bio::Align::AlignI of DNA sequences
194 See also: L<Bio::Align::AlignI>, L<Bio::SimpleAlign>, L<Bio::PrimarySeq>
196 =cut
198 sub dna_to_aa_aln {
199 my $dna_aln = shift;
200 unless ( defined $dna_aln
201 && ref($dna_aln)
202 && $dna_aln->isa('Bio::Align::AlignI') ) {
203 croak(
204 'Must provide a valid Bio::Align::AlignI object as the argument to dna_to_aa_aln'
207 my $codon_table = Bio::Tools::CodonTable->new;
208 my $aa_aln = Bio::SimpleAlign->new;
210 for my $seq ( $dna_aln->each_seq ) {
211 my ($aa_str, $aa_len);
212 my @aln_str = split '', $seq->seq;
213 croak("All lines in the alignment must have lengths divisible by 3")
214 if ( scalar(@aln_str) % CODONSIZE );
216 while ( @aln_str ) {
217 my $triplet = join '', (splice( @aln_str, 0, CODONSIZE ));
219 if ( $triplet =~ /^[GATC]+$/i ) {
220 $aa_str .= $codon_table->translate($triplet);
221 $aa_len++;
223 elsif ( $triplet =~ /^[$Bio::LocatableSeq::GAP_SYMBOLS]+$/ ) {
224 $aa_str .= $GAP;
226 else {
227 croak("The triplet '$triplet' is neither a valid codon nor all gaps");
230 my $new_aa = Bio::LocatableSeq->new(
231 -display_id => $seq->display_id,
232 -alphabet => 'protein',
233 -start => 1,
234 -end => $aa_len,
235 -strand => 1,
236 -seq => $aa_str
239 $aa_aln->add_seq($new_aa);
242 $aa_aln;
245 =head2 bootstrap_replicates
247 Title : bootstrap_replicates
248 Usage : my $alns = &bootstrap_replicates($aln,100);
249 Function: Generate a pseudo-replicate of the data by randomly
250 sampling, with replacement, the columns from an alignment for
251 the non-parametric bootstrap.
252 Returns : Arrayref of L<Bio::SimpleAlign> objects
253 Args : L<Bio::SimpleAlign> object
254 Number of replicates to generate
256 =cut
258 sub bootstrap_replicates {
259 my ( $aln, $count ) = @_;
260 $count ||= 1;
261 my $alen = $aln->length;
262 my ( @seqs, @nm );
263 $aln->set_displayname_flat(1);
264 for my $s ( $aln->each_seq ) {
265 push @seqs, $s->seq();
266 push @nm, $s->id;
268 my ( @alns, $i );
269 while ( $count-- > 0 ) {
270 my @newseqs;
271 for ( $i = 0 ; $i < $alen ; $i++ ) {
272 my $index = int( rand($alen) );
273 my $c = 0;
274 for (@seqs) {
275 $newseqs[ $c++ ] .= substr( $_, $index, 1 );
278 my $newaln = Bio::SimpleAlign->new();
279 my $i = 0;
280 for my $s (@newseqs) {
281 ( my $tmp = $s ) =~ s/[$Bio::LocatableSeq::GAP_SYMBOLS]+//g;
282 $newaln->add_seq(
283 Bio::LocatableSeq->new(
284 -start => 1,
285 -end => length($tmp),
286 -display_id => $nm[ $i++ ],
287 -seq => $s
291 push @alns, $newaln;
293 return \@alns;
296 =head2 bootstrap_replicates_codons
298 Title : bootstrap_replicates_codons
299 Usage : my $alns = &bootstrap_replicates_codons($aln,100);
300 Function: Generate a pseudo-replicate of the data by randomly
301 sampling, with replacement, the columns from a codon alignment for
302 the non-parametric bootstrap. The alignment is assumed to start on
303 the first position of a codon.
304 Returns : Arrayref of L<Bio::SimpleAlign> objects
305 Args : L<Bio::SimpleAlign> object
306 Number of replicates to generate
308 =cut
310 sub bootstrap_replicates_codons {
311 my ( $aln, $count ) = @_;
312 $count ||= 1;
313 my $alen = $aln->length;
314 my $ncodon = int( $alen / 3 );
315 my ( @seqs, @nm );
316 $aln->set_displayname_flat(1);
317 for my $s ( $aln->each_seq ) {
318 push @seqs, $s->seq();
319 push @nm, $s->id;
321 my ( @alns, $i );
322 while ( $count-- > 0 ) {
323 my @newseqs;
324 for ( $i = 0 ; $i < $ncodon ; $i++ ) {
325 my $index = int( rand($ncodon) );
326 my $seqpos = $index * 3;
327 my $c = 0;
328 for (@seqs) {
329 $newseqs[ $c++ ] .= substr( $_, $seqpos, 3 );
332 my $newaln = Bio::SimpleAlign->new();
333 my $i = 0;
334 for my $s (@newseqs) {
335 ( my $tmp = $s ) =~ s{[$Bio::LocatableSeq::GAP_SYMBOLS]+}{}g;
336 $newaln->add_seq(
337 Bio::LocatableSeq->new(
338 -start => 1,
339 -end => length($tmp),
340 -display_id => $nm[ $i++ ],
341 -seq => $s
345 push @alns, $newaln;
347 return \@alns;
350 =head2 cat
352 Title : cat
353 Usage : $aln123 = cat($aln1, $aln2, $aln3)
354 Function : Concatenates alignment objects. Sequences are identified by id.
355 An error will be thrown if the sequence ids are not unique in the
356 first alignment. If any ids are not present or not unique in any
357 of the additional alignments then those sequences are omitted from
358 the concatenated alignment, and a warning is issued. An error will
359 be thrown if any of the alignments are not flush, since
360 concatenating such alignments is unlikely to make biological
361 sense.
362 Returns : A new Bio::SimpleAlign object
363 Args : A list of Bio::SimpleAlign objects
365 =cut
367 sub cat {
368 my ( $self, @aln ) = @_;
369 $self->throw("cat method called with no arguments") unless $self;
370 for ( $self, @aln ) {
371 $self->throw( $_->id . " is not a Bio::Align::AlignI object" )
372 unless $_->isa('Bio::Align::AlignI');
373 $self->throw( $_->id . " is not flush" ) unless $_->is_flush;
375 my $aln = $self->new;
376 $aln->id( $self->id );
377 $aln->annotation( $self->annotation );
378 my %unique;
379 SEQ: foreach my $seq ( $self->each_seq() ) {
380 throw( "ID: ", $seq->id, " is not unique in initial alignment." )
381 if exists $unique{ $seq->id };
382 $unique{ $seq->id } = 1;
384 # Can be Bio::LocatableSeq, Bio::Seq::Meta or Bio::Seq::Meta::Array
385 my $new_seq = $seq->new(
386 -id => $seq->id,
387 -strand => $seq->strand,
388 -verbose => $self->verbose
390 $new_seq->seq( $seq->seq );
391 $new_seq->start( $seq->start );
392 $new_seq->end( $seq->end );
393 if ( $new_seq->isa('Bio::Seq::MetaI') ) {
394 for my $meta_name ( $seq->meta_names ) {
395 $new_seq->named_submeta( $meta_name, $new_seq->start,
396 $new_seq->end, $seq->named_meta($meta_name) );
399 for my $cat_aln (@aln) {
400 my @cat_seq = $cat_aln->each_seq_with_id( $seq->id );
401 if ( @cat_seq == 0 ) {
402 $self->warn( $seq->id
403 . " not found in alignment "
404 . $cat_aln->id
405 . ", skipping this sequence." );
406 next SEQ;
408 if ( @cat_seq > 1 ) {
409 $self->warn( $seq->id
410 . " found multiple times in alignment "
411 . $cat_aln->id
412 . ", skipping this sequence." );
413 next SEQ;
415 my $cat_seq = $cat_seq[0];
416 my $old_end = $new_seq->end;
417 $new_seq->seq( $new_seq->seq . $cat_seq->seq );
419 # Not sure if this is a sensible way to deal with end coordinates
420 $new_seq->end(
421 $new_seq->end + $cat_seq->end + 1 - $cat_seq->start );
423 if ( $cat_seq->isa('Bio::Seq::Meta::Array') ) {
424 unless ( $new_seq->isa('Bio::Seq::Meta::Array') ) {
425 my $meta_seq = Bio::Seq::Meta::Array->new;
426 $meta_seq->seq( $new_seq->seq );
427 $meta_seq->start( $new_seq->start );
428 $meta_seq->end( $new_seq->end );
429 if ( $new_seq->isa('Bio::Seq::Meta') ) {
430 for my $meta_name ( $new_seq->meta_names ) {
431 $meta_seq->named_submeta(
432 $meta_name,
433 $new_seq->start,
434 $old_end,
436 split(
437 //, $new_seq->named_meta($meta_name)
443 $new_seq = $meta_seq;
445 for my $meta_name ( $cat_seq->meta_names ) {
446 $new_seq->named_submeta( $meta_name, $old_end + 1,
447 $new_seq->end, $cat_seq->named_meta($meta_name) );
450 elsif ( $cat_seq->isa('Bio::Seq::Meta') ) {
451 if ( $new_seq->isa('Bio::Seq::Meta::Array') ) {
452 for my $meta_name ( $cat_seq->meta_names ) {
453 $new_seq->named_submeta( $meta_name, $old_end + 1,
454 $new_seq->end,
455 [ split( //, $cat_seq->named_meta($meta_name) ) ] );
458 else {
459 unless ( $new_seq->isa('Bio::Seq::Meta') ) {
460 my $meta_seq = Bio::Seq::Meta::Array->new;
461 $meta_seq->seq( $new_seq->seq );
462 $meta_seq->start( $new_seq->start );
463 $meta_seq->end( $new_seq->end );
464 $new_seq = $meta_seq;
466 for my $meta_name ( $cat_seq->meta_names ) {
467 $new_seq->named_submeta( $meta_name, $old_end + 1,
468 $new_seq->end, $cat_seq->named_meta($meta_name) );
473 $aln->add_seq($new_seq);
475 my $cons_meta = $self->consensus_meta;
476 my $new_cons_meta;
477 if ($cons_meta) {
478 $new_cons_meta = Bio::Seq::Meta->new();
479 for my $meta_name ( $cons_meta->meta_names ) {
480 $new_cons_meta->named_submeta( $meta_name, 1, $self->length,
481 $cons_meta->$meta_name );
484 my $end = $self->length;
485 for my $cat_aln (@aln) {
486 my $cat_cons_meta = $cat_aln->consensus_meta;
487 if ($cat_cons_meta) {
488 $new_cons_meta = Bio::Seq::Meta->new() if !$new_cons_meta;
489 for my $meta_name ( $cat_cons_meta->meta_names ) {
490 $new_cons_meta->named_submeta(
491 $meta_name, $end + 1,
492 $end + $cat_aln->length,
493 $cat_cons_meta->$meta_name
497 $end += $cat_aln->length;
499 $aln->consensus_meta($new_cons_meta) if $new_cons_meta;
500 return $aln;
504 =head2 most_common_sequences
506 Title : most_common_sequences
507 Usage : @common = most_common_sequences ($align, $case_sensitivity)
508 Function : Returns an array of the sequences that appear most often in the
509 alignment (although this probably makes more sense when there is
510 only a single most common sequence). Sequences are compared after
511 removing any "-" (gap characters), and ambiguous units (e.g., R
512 for purines) are only compared to themselves. The returned
513 sequence is also missing the "-" since they don't actually make
514 part of the sequence.
515 Returns : Array of text strings.
516 Arguments : Optional argument defining whether the comparison between sequences
517 to find the most common should be case sensitive. Defaults to
518 false, i.e, not case sensitive.
520 =cut
522 sub most_common_sequences {
523 my $align = shift
524 or croak ("Must provide Bio::AlignI object to Bio::Align::Utilities::most_common_sequences");
525 my $case_sensitive = shift; # defaults to false (we get undef if nothing)
527 ## We keep track of the max on this loop. Saves us having to
528 ## transverse the hash table later to find the maximum value.
529 my $max = 0;
530 my %counts;
531 foreach ($align->each_seq) {
532 (my $seq = $_->seq) =~ tr/-//d;
533 $seq = uc ($seq) unless $case_sensitive;
534 $max++ if (++$counts{$seq} > $max);
536 my @common = grep ($counts{$_} == $max, keys %counts);
537 return @common;