comment out FeatureIO, let Annotated tests fail until they are fixed
[bioperl-live.git] / Bio / Align / Utilities.pm
blob90bc80c54762c13f44e49d95a3bc0429cff2bb5c
2 # BioPerl module for Bio::Align::Utilities
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich <jason-at-bioperl.org> and Brian Osborne
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::Align::Utilities - A collection of utilities regarding converting
17 and manipulating alignment objects
19 =head1 SYNOPSIS
21 use Bio::Align::Utilities qw(:all);
23 # Even if the protein alignments are local make sure the start/end
24 # stored in the LocatableSeq objects are to the full length protein.
25 # The coding sequence that is passed in should still be the full
26 # length CDS as the nt alignment will be generated.
27 # %dnaseqs is a hash of CDS sequences (spliced)
28 my $dna_aln = aa_to_dna_aln($aa_aln,\%dnaseqs);
30 # The reverse, which is simpler. The input alignment has to be
31 # translate-able, with gap lengths and an overall length divisible by 3
32 my $aa_aln = dna_to_aa_aln($dna_al);
34 # Generate bootstraps
35 my $replicates = bootstrap_replicates($aln,$count);
37 =head1 DESCRIPTION
39 This module contains utility methods for manipulating sequence
40 alignments (L<Bio::Align::AlignI>) objects.
42 The B<aa_to_dna_aln> utility is essentially the same as the B<mrtrans>
43 program by Bill Pearson available at
44 ftp://ftp.virginia.edu/pub/fasta/other/mrtrans.shar. Of course this
45 is a pure-Perl implementation, but just to mention that if anything
46 seems odd you can check the alignments generated against Bill's
47 program.
49 =head1 FEEDBACK
51 =head2 Mailing Lists
53 User feedback is an integral part of the evolution of this and other
54 Bioperl modules. Send your comments and suggestions preferably to
55 the Bioperl mailing list. Your participation is much appreciated.
57 bioperl-l@bioperl.org - General discussion
58 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
60 =head2 Support
62 Please direct usage questions or support issues to the mailing list:
64 I<bioperl-l@bioperl.org>
66 rather than to the module maintainer directly. Many experienced and
67 reponsive experts will be able look at the problem and quickly
68 address it. Please include a thorough description of the problem
69 with code and data examples if at all possible.
71 =head2 Reporting Bugs
73 Report bugs to the Bioperl bug tracking system to help us keep track
74 of the bugs and their resolution. Bug reports can be submitted via the
75 web:
77 https://redmine.open-bio.org/projects/bioperl/
79 =head1 AUTHOR - Jason Stajich
81 Email jason-at-bioperl-dot-org
83 =head1 APPENDIX
85 The rest of the documentation details each of the object methods.
86 Internal methods are usually preceded with a _
88 =cut
90 #' keep my emacs happy
91 # Let the code begin...
93 package Bio::Align::Utilities;
94 use vars qw(@EXPORT @EXPORT_OK $GAP $CODONGAP %EXPORT_TAGS );
95 use strict;
96 use Carp;
97 use Bio::Root::Version;
98 require Exporter;
100 use base qw(Exporter);
102 @EXPORT = qw();
103 @EXPORT_OK =
104 qw(aa_to_dna_aln bootstrap_replicates cat bootstrap_replicates_codons dna_to_aa_aln);
105 %EXPORT_TAGS = ( all => [ @EXPORT, @EXPORT_OK ] );
107 BEGIN {
108 use constant CODONSIZE => 3;
109 $GAP = '-';
110 $CODONGAP = $GAP x CODONSIZE;
113 =head2 aa_to_dna_aln
115 Title : aa_to_dna_aln
116 Usage : my $dnaaln = aa_to_dna_aln($aa_aln, \%seqs);
117 Function: Will convert an AA alignment to DNA space given the
118 corresponding DNA sequences. Note that this method expects
119 the DNA sequences to be in frame +1 (GFF frame 0) as it will
120 start to project into coordinates starting at the first base of
121 the DNA sequence, if this alignment represents a different
122 frame for the cDNA you will need to edit the DNA sequences
123 to remove the 1st or 2nd bases (and revcom if things should be).
124 Returns : Bio::Align::AlignI object
125 Args : 2 arguments, the alignment and a hashref.
126 Alignment is a Bio::Align::AlignI of amino acid sequences.
127 The hash reference should have keys which are
128 the display_ids for the aa
129 sequences in the alignment and the values are a
130 Bio::PrimarySeqI object for the corresponding
131 spliced cDNA sequence.
133 See also: L<Bio::Align::AlignI>, L<Bio::SimpleAlign>, L<Bio::PrimarySeq>
135 =cut
137 sub aa_to_dna_aln {
138 my ( $aln, $dnaseqs ) = @_;
139 unless ( defined $aln
140 && ref($aln)
141 && $aln->isa('Bio::Align::AlignI') )
143 croak(
144 'Must provide a valid Bio::Align::AlignI object as the first argument to aa_to_dna_aln, see the documentation for proper usage and the method signature'
147 my $alnlen = $aln->length;
148 my $dnaalign = Bio::SimpleAlign->new();
149 $aln->map_chars( '\.', $GAP );
151 foreach my $seq ( $aln->each_seq ) {
152 my $aa_seqstr = $seq->seq();
153 my $id = $seq->display_id;
154 my $dnaseq =
155 $dnaseqs->{$id} || $aln->throw( "cannot find " . $seq->display_id );
156 my $start_offset = ( $seq->start - 1 ) * CODONSIZE;
158 $dnaseq = $dnaseq->seq();
159 my $dnalen = $dnaseqs->{$id}->length;
160 my $nt_seqstr;
161 my $j = 0;
162 for ( my $i = 0 ; $i < $alnlen ; $i++ ) {
163 my $char = substr( $aa_seqstr, $i + $start_offset, 1 );
164 if ( $char eq $GAP || $j >= $dnalen ) {
165 $nt_seqstr .= $CODONGAP;
167 else {
168 $nt_seqstr .= substr( $dnaseq, $j, CODONSIZE );
169 $j += CODONSIZE;
172 $nt_seqstr .= $GAP x ( ( $alnlen * 3 ) - length($nt_seqstr) );
174 my $newdna = Bio::LocatableSeq->new(
175 -display_id => $id,
176 -alphabet => 'dna',
177 -start => $start_offset + 1,
178 -end => ( $seq->end * CODONSIZE ),
179 -strand => 1,
180 -seq => $nt_seqstr
182 $dnaalign->add_seq($newdna);
184 return $dnaalign;
187 =head2 dna_to_aa_aln
189 Title : dna_to_aa_aln
190 Usage : my $aa_aln = dna_to_aa_aln($dna_aln);
191 Function: Convert a DNA alignment to an amino acid alignment where
192 the length of all alignment strings and the lengths of any
193 gaps must be divisible by 3
194 Returns : Bio::Align::AlignI object
195 Args : the DNA alignment, a Bio::Align::AlignI of DNA sequences
197 See also: L<Bio::Align::AlignI>, L<Bio::SimpleAlign>, L<Bio::PrimarySeq>
199 =cut
201 sub dna_to_aa_aln {
202 my $dna_aln = shift;
203 unless ( defined $dna_aln
204 && ref($dna_aln)
205 && $dna_aln->isa('Bio::Align::AlignI') ) {
206 croak(
207 'Must provide a valid Bio::Align::AlignI object as the argument to dna_to_aa_aln'
210 my $codon_table = Bio::Tools::CodonTable->new;
211 my $aa_aln = Bio::SimpleAlign->new;
213 for my $seq ( $dna_aln->each_seq ) {
214 my ($aa_str, $aa_len);
215 my @aln_str = split '', $seq->seq;
216 croak("All lines in the alignment must have lengths divisible by 3")
217 if ( scalar(@aln_str) % CODONSIZE );
219 while ( @aln_str ) {
220 my $triplet = join '', (splice( @aln_str, 0, CODONSIZE ));
222 if ( $triplet =~ /^[GATC]+$/i ) {
223 $aa_str .= $codon_table->translate($triplet);
224 $aa_len++;
226 elsif ( $triplet =~ /^[$Bio::LocatableSeq::GAP_SYMBOLS]+$/ ) {
227 $aa_str .= $GAP;
229 else {
230 croak("The triplet '$triplet' is neither a valid codon nor all gaps");
233 my $new_aa = Bio::LocatableSeq->new(
234 -display_id => $seq->display_id,
235 -alphabet => 'protein',
236 -start => 1,
237 -end => $aa_len,
238 -strand => 1,
239 -seq => $aa_str
242 $aa_aln->add_seq($new_aa);
245 $aa_aln;
248 =head2 bootstrap_replicates
250 Title : bootstrap_replicates
251 Usage : my $alns = &bootstrap_replicates($aln,100);
252 Function: Generate a pseudo-replicate of the data by randomly
253 sampling, with replacement, the columns from an alignment for
254 the non-parametric bootstrap.
255 Returns : Arrayref of L<Bio::SimpleAlign> objects
256 Args : L<Bio::SimpleAlign> object
257 Number of replicates to generate
259 =cut
261 sub bootstrap_replicates {
262 my ( $aln, $count ) = @_;
263 $count ||= 1;
264 my $alen = $aln->length;
265 my ( @seqs, @nm );
266 $aln->set_displayname_flat(1);
267 for my $s ( $aln->each_seq ) {
268 push @seqs, $s->seq();
269 push @nm, $s->id;
271 my ( @alns, $i );
272 while ( $count-- > 0 ) {
273 my @newseqs;
274 for ( $i = 0 ; $i < $alen ; $i++ ) {
275 my $index = int( rand($alen) );
276 my $c = 0;
277 for (@seqs) {
278 $newseqs[ $c++ ] .= substr( $_, $index, 1 );
281 my $newaln = Bio::SimpleAlign->new();
282 my $i = 0;
283 for my $s (@newseqs) {
284 ( my $tmp = $s ) =~ s/[$Bio::LocatableSeq::GAP_SYMBOLS]+//g;
285 $newaln->add_seq(
286 Bio::LocatableSeq->new(
287 -start => 1,
288 -end => length($tmp),
289 -display_id => $nm[ $i++ ],
290 -seq => $s
294 push @alns, $newaln;
296 return \@alns;
299 =head2 bootstrap_replicates_codons
301 Title : bootstrap_replicates_codons
302 Usage : my $alns = &bootstrap_replicates_codons($aln,100);
303 Function: Generate a pseudo-replicate of the data by randomly
304 sampling, with replacement, the columns from a codon alignment for
305 the non-parametric bootstrap. The alignment is assumed to start on
306 the first position of a codon.
307 Returns : Arrayref of L<Bio::SimpleAlign> objects
308 Args : L<Bio::SimpleAlign> object
309 Number of replicates to generate
311 =cut
313 sub bootstrap_replicates_codons {
314 my ( $aln, $count ) = @_;
315 $count ||= 1;
316 my $alen = $aln->length;
317 my $ncodon = int( $alen / 3 );
318 my ( @seqs, @nm );
319 $aln->set_displayname_flat(1);
320 for my $s ( $aln->each_seq ) {
321 push @seqs, $s->seq();
322 push @nm, $s->id;
324 my ( @alns, $i );
325 while ( $count-- > 0 ) {
326 my @newseqs;
327 for ( $i = 0 ; $i < $ncodon ; $i++ ) {
328 my $index = int( rand($ncodon) );
329 my $seqpos = $index * 3;
330 my $c = 0;
331 for (@seqs) {
332 $newseqs[ $c++ ] .= substr( $_, $seqpos, 3 );
335 my $newaln = Bio::SimpleAlign->new();
336 my $i = 0;
337 for my $s (@newseqs) {
338 ( my $tmp = $s ) =~ s{[$Bio::LocatableSeq::GAP_SYMBOLS]+}{}g;
339 $newaln->add_seq(
340 Bio::LocatableSeq->new(
341 -start => 1,
342 -end => length($tmp),
343 -display_id => $nm[ $i++ ],
344 -seq => $s
348 push @alns, $newaln;
350 return \@alns;
353 =head2 cat
355 Title : cat
356 Usage : $aln123 = cat($aln1, $aln2, $aln3)
357 Function : Concatenates alignment objects. Sequences are identified by id.
358 An error will be thrown if the sequence ids are not unique in the
359 first alignment. If any ids are not present or not unique in any
360 of the additional alignments then those sequences are omitted from
361 the concatenated alignment, and a warning is issued. An error will
362 be thrown if any of the alignments are not flush, since
363 concatenating such alignments is unlikely to make biological
364 sense.
365 Returns : A new Bio::SimpleAlign object
366 Args : A list of Bio::SimpleAlign objects
368 =cut
370 sub cat {
371 my ( $self, @aln ) = @_;
372 $self->throw("cat method called with no arguments") unless $self;
373 for ( $self, @aln ) {
374 $self->throw( $_->id . " is not a Bio::Align::AlignI object" )
375 unless $_->isa('Bio::Align::AlignI');
376 $self->throw( $_->id . " is not flush" ) unless $_->is_flush;
378 my $aln = $self->new;
379 $aln->id( $self->id );
380 $aln->annotation( $self->annotation );
381 my %unique;
382 SEQ: foreach my $seq ( $self->each_seq() ) {
383 throw( "ID: ", $seq->id, " is not unique in initial alignment." )
384 if exists $unique{ $seq->id };
385 $unique{ $seq->id } = 1;
387 # Can be Bio::LocatableSeq, Bio::Seq::Meta or Bio::Seq::Meta::Array
388 my $new_seq = $seq->new(
389 -id => $seq->id,
390 -strand => $seq->strand,
391 -verbose => $self->verbose
393 $new_seq->seq( $seq->seq );
394 $new_seq->start( $seq->start );
395 $new_seq->end( $seq->end );
396 if ( $new_seq->isa('Bio::Seq::MetaI') ) {
397 for my $meta_name ( $seq->meta_names ) {
398 $new_seq->named_submeta( $meta_name, $new_seq->start,
399 $new_seq->end, $seq->named_meta($meta_name) );
402 for my $cat_aln (@aln) {
403 my @cat_seq = $cat_aln->each_seq_with_id( $seq->id );
404 if ( @cat_seq == 0 ) {
405 $self->warn( $seq->id
406 . " not found in alignment "
407 . $cat_aln->id
408 . ", skipping this sequence." );
409 next SEQ;
411 if ( @cat_seq > 1 ) {
412 $self->warn( $seq->id
413 . " found multiple times in alignment "
414 . $cat_aln->id
415 . ", skipping this sequence." );
416 next SEQ;
418 my $cat_seq = $cat_seq[0];
419 my $old_end = $new_seq->end;
420 $new_seq->seq( $new_seq->seq . $cat_seq->seq );
422 # Not sure if this is a sensible way to deal with end coordinates
423 $new_seq->end(
424 $new_seq->end + $cat_seq->end + 1 - $cat_seq->start );
426 if ( $cat_seq->isa('Bio::Seq::Meta::Array') ) {
427 unless ( $new_seq->isa('Bio::Seq::Meta::Array') ) {
428 my $meta_seq = Bio::Seq::Meta::Array->new;
429 $meta_seq->seq( $new_seq->seq );
430 $meta_seq->start( $new_seq->start );
431 $meta_seq->end( $new_seq->end );
432 if ( $new_seq->isa('Bio::Seq::Meta') ) {
433 for my $meta_name ( $new_seq->meta_names ) {
434 $meta_seq->named_submeta(
435 $meta_name,
436 $new_seq->start,
437 $old_end,
439 split(
440 //, $new_seq->named_meta($meta_name)
446 $new_seq = $meta_seq;
448 for my $meta_name ( $cat_seq->meta_names ) {
449 $new_seq->named_submeta( $meta_name, $old_end + 1,
450 $new_seq->end, $cat_seq->named_meta($meta_name) );
453 elsif ( $cat_seq->isa('Bio::Seq::Meta') ) {
454 if ( $new_seq->isa('Bio::Seq::Meta::Array') ) {
455 for my $meta_name ( $cat_seq->meta_names ) {
456 $new_seq->named_submeta( $meta_name, $old_end + 1,
457 $new_seq->end,
458 [ split( //, $cat_seq->named_meta($meta_name) ) ] );
461 else {
462 unless ( $new_seq->isa('Bio::Seq::Meta') ) {
463 my $meta_seq = Bio::Seq::Meta::Array->new;
464 $meta_seq->seq( $new_seq->seq );
465 $meta_seq->start( $new_seq->start );
466 $meta_seq->end( $new_seq->end );
467 $new_seq = $meta_seq;
469 for my $meta_name ( $cat_seq->meta_names ) {
470 $new_seq->named_submeta( $meta_name, $old_end + 1,
471 $new_seq->end, $cat_seq->named_meta($meta_name) );
476 $aln->add_seq($new_seq);
478 my $cons_meta = $self->consensus_meta;
479 my $new_cons_meta;
480 if ($cons_meta) {
481 $new_cons_meta = Bio::Seq::Meta->new();
482 for my $meta_name ( $cons_meta->meta_names ) {
483 $new_cons_meta->named_submeta( $meta_name, 1, $self->length,
484 $cons_meta->$meta_name );
487 my $end = $self->length;
488 for my $cat_aln (@aln) {
489 my $cat_cons_meta = $cat_aln->consensus_meta;
490 if ($cat_cons_meta) {
491 $new_cons_meta = Bio::Seq::Meta->new() if !$new_cons_meta;
492 for my $meta_name ( $cat_cons_meta->meta_names ) {
493 $new_cons_meta->named_submeta(
494 $meta_name, $end + 1,
495 $end + $cat_aln->length,
496 $cat_cons_meta->$meta_name
500 $end += $cat_aln->length;
502 $aln->consensus_meta($new_cons_meta) if $new_cons_meta;
503 return $aln;