sync w/ main trunk
[bioperl-live.git] / Bio / Align / Utilities.pm
blob44dc0c166793cc32e5633ec24e3666fa0cfe539a
1 # $Id$
3 # BioPerl module for Bio::Align::Utilities
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Jason Stajich <jason-at-bioperl.org>
9 # Copyright Jason Stajich
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
15 =head1 NAME
17 Bio::Align::Utilities - A collection of utilities regarding converting
18 and manipulating alignment objects
20 =head1 SYNOPSIS
22 use Bio::Align::Utilities qw(:all);
23 # %dnaseqs is a hash of CDS sequences (spliced)
26 # Even if the protein alignments are local make sure the start/end
27 # stored in the LocatableSeq objects are to the full length protein.
28 # The CoDing Sequence that is passed in should still be the full
29 # length CDS as the nt alignment will be generated.
31 my $dna_aln = &aa_to_dna_aln($aa_aln,\%dnaseqs);
34 # generate bootstraps
35 my $replicates = &bootstrap_replicates($aln,$count);
38 =head1 DESCRIPTION
40 This module contains utility methods for manipulating sequence
41 alignments ( L<Bio::Align::AlignI>) objects.
43 The B<aa_to_dna_aln> utility is essentially the same as the B<mrtrans>
44 program by Bill Pearson available at
45 ftp://ftp.virginia.edu/pub/fasta/other/mrtrans.shar. Of course this
46 is a pure-perl implementation, but just to mention that if anything
47 seems odd you can check the alignments generated against Bill's
48 program.
50 =head1 FEEDBACK
52 =head2 Mailing Lists
54 User feedback is an integral part of the evolution of this and other
55 Bioperl modules. Send your comments and suggestions preferably to
56 the Bioperl mailing list. Your participation is much appreciated.
58 bioperl-l@bioperl.org - General discussion
59 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
61 =head2 Support
63 Please direct usage questions or support issues to the mailing list:
65 L<bioperl-l@bioperl.org>
67 rather than to the module maintainer directly. Many experienced and
68 reponsive experts will be able look at the problem and quickly
69 address it. Please include a thorough description of the problem
70 with code and data examples if at all possible.
72 =head2 Reporting Bugs
74 Report bugs to the Bioperl bug tracking system to help us keep track
75 of the bugs and their resolution. Bug reports can be submitted via the
76 web:
78 http://bugzilla.open-bio.org/
80 =head1 AUTHOR - Jason Stajich
82 Email jason-at-bioperl-dot-org
84 =head1 APPENDIX
86 The rest of the documentation details each of the object methods.
87 Internal methods are usually preceded with a _
89 =cut
91 #' keep my emacs happy
92 # Let the code begin...
95 package Bio::Align::Utilities;
96 use vars qw(@EXPORT @EXPORT_OK $GAP $CODONGAP %EXPORT_TAGS);
97 use strict;
98 use Carp;
99 use Bio::Root::Version;
100 require Exporter;
102 use base qw(Exporter);
104 @EXPORT = qw();
105 @EXPORT_OK = qw(aa_to_dna_aln bootstrap_replicates cat);
106 %EXPORT_TAGS = (all =>[@EXPORT, @EXPORT_OK]);
107 BEGIN {
108 use constant CODONSIZE => 3;
109 $GAP = '-';
110 $CODONGAP = $GAP x CODONSIZE;
113 =head2 aa_to_dna_aln
115 Title : aa_to_dna_aln
116 Usage : my $dnaaln = aa_to_dna_aln($aa_aln, \%seqs);
117 Function: Will convert an AA alignment to DNA space given the
118 corresponding DNA sequences. Note that this method expects
119 the DNA sequences to be in frame +1 (GFF frame 0) as it will
120 start to project into coordinates starting at the first base of
121 the DNA sequence, if this alignment represents a different
122 frame for the cDNA you will need to edit the DNA sequences
123 to remove the 1st or 2nd bases (and revcom if things should be).
124 Returns : Bio::Align::AlignI object
125 Args : 2 arguments, the alignment and a hashref.
126 Alignment is a Bio::Align::AlignI of amino acid sequences.
127 The hash reference should have keys which are
128 the display_ids for the aa
129 sequences in the alignment and the values are a
130 Bio::PrimarySeqI object for the corresponding
131 spliced cDNA sequence.
133 See also: L<Bio::Align::AlignI>, L<Bio::SimpleAlign>, L<Bio::PrimarySeq>
135 =cut
137 sub aa_to_dna_aln {
138 my ($aln,$dnaseqs) = @_;
139 unless( defined $aln &&
140 ref($aln) &&
141 $aln->isa('Bio::Align::AlignI') ) {
142 croak('Must provide a valid Bio::Align::AlignI object as the first argument to aa_to_dna_aln, see the documentation for proper usage and the method signature');
144 my $alnlen = $aln->length;
145 my $dnaalign = Bio::SimpleAlign->new();
146 $aln->map_chars('\.',$GAP);
148 foreach my $seq ( $aln->each_seq ) {
149 my $aa_seqstr = $seq->seq();
150 my $id = $seq->display_id;
151 my $dnaseq = $dnaseqs->{$id} || $aln->throw("cannot find ".
152 $seq->display_id);
153 my $start_offset = ($seq->start - 1) * CODONSIZE;
155 $dnaseq = $dnaseq->seq();
156 my $dnalen = $dnaseqs->{$id}->length;
157 my $nt_seqstr;
158 my $j = 0;
159 for( my $i = 0; $i < $alnlen; $i++ ) {
160 my $char = substr($aa_seqstr,$i + $start_offset,1);
161 if ( $char eq $GAP || $j >= $dnalen ) {
162 $nt_seqstr .= $CODONGAP;
163 } else {
164 $nt_seqstr .= substr($dnaseq,$j,CODONSIZE);
165 $j += CODONSIZE;
168 $nt_seqstr .= $GAP x (($alnlen * 3) - length($nt_seqstr));
170 my $newdna = Bio::LocatableSeq->new(-display_id => $id,
171 -alphabet => 'dna',
172 -start => $start_offset+1,
173 -end => ($seq->end *
174 CODONSIZE),
175 -strand => 1,
176 -seq => $nt_seqstr);
177 $dnaalign->add_seq($newdna);
179 return $dnaalign;
182 =head2 bootstrap_replicates
184 Title : bootstrap_replicates
185 Usage : my $alns = &bootstrap_replicates($aln,100);
186 Function: Generate a pseudo-replicate of the data by randomly
187 sampling, with replacement, the columns from an alignment for
188 the non-parametric bootstrap.
189 Returns : Arrayref of L<Bio::SimpleAlign> objects
190 Args : L<Bio::SimpleAlign> object
191 Number of replicates to generate
193 =cut
195 sub bootstrap_replicates {
196 my ($aln,$count) = @_;
197 $count ||= 1;
198 my $alen = $aln->length;
199 my (@seqs,@nm);
200 $aln->set_displayname_flat(1);
201 for my $s ( $aln->each_seq ) {
202 push @seqs, $s->seq();
203 push @nm, $s->id;
205 my (@alns,$i);
206 while( $count-- > 0 ) {
207 my @newseqs;
208 for($i =0; $i < $alen; $i++ ) {
209 my $index = int(rand($alen));
210 my $c = 0;
211 for ( @seqs ) {
212 $newseqs[$c++] .= substr($_,$index,1);
215 my $newaln = Bio::SimpleAlign->new();
216 my $i = 0;
217 for my $s ( @newseqs ) {
218 (my $tmp = $s) =~ s{[$Bio::LocatableSeq::GAP_SYMBOLS]+}{}g;
219 $newaln->add_seq( Bio::LocatableSeq->new
220 (-start => 1,
221 -end => length($tmp),
222 -display_id => $nm[$i++],
223 -seq => $s));
225 push @alns, $newaln;
227 return \@alns;
230 =head2 cat
232 Title : cat
233 Usage : $aln123 = cat($aln1, $aln2, $aln3)
234 Function : Concatenates alignment objects. Sequences are identified by id.
235 An error will be thrown if the sequence ids are not unique in the
236 first alignment. If any ids are not present or not unique in any
237 of the additional alignments then those sequences are omitted from
238 the concatenated alignment, and a warning is issued. An error will
239 be thrown if any of the alignments are not flush, since
240 concatenating such alignments is unlikely to make biological
241 sense.
242 Returns : A new Bio::SimpleAlign object
243 Args : A list of Bio::SimpleAlign objects
245 =cut
247 sub cat {
248 my ($self, @aln) = @_;
249 $self->throw("cat method called with no arguments") unless $self;
250 for ($self,@aln) {
251 $self->throw($_->id. " not a Bio::Align::AlignI object") unless $_->isa('Bio::Align::AlignI');
252 $self->throw($_->id. " is not flush") unless $_->is_flush;
254 my $aln = $self->new;
255 $aln->id($self->id);
256 $aln->annotation($self->annotation);
257 my %unique;
258 SEQ: foreach my $seq ( $self->each_seq() ) {
259 throw("ID: ", $seq->id, " is not unique in initial alignment.") if exists $unique{$seq->id};
260 $unique{$seq->id}=1;
262 # Can be Bio::LocatableSeq, Bio::Seq::Meta or Bio::Seq::Meta::Array
263 my $new_seq = $seq->new(-id=> $seq->id,
264 -strand => $seq->strand,
265 -verbose => $self->verbose);
266 $new_seq->seq($seq->seq);
267 $new_seq->start($seq->start);
268 $new_seq->end($seq->end);
269 if ($new_seq->isa('Bio::Seq::MetaI')) {
270 for my $meta_name ($seq->meta_names) {
271 $new_seq->named_submeta($meta_name, $new_seq->start, $new_seq->end, $seq->named_meta($meta_name));
274 for my $cat_aln (@aln) {
275 my @cat_seq=$cat_aln->each_seq_with_id($seq->id);
276 if (@cat_seq==0) {
277 $self->warn($seq->id. " not found in alignment ". $cat_aln->id. ", skipping this sequence.");
278 next SEQ;
280 if (@cat_seq>1) {
281 $self->warn($seq->id. " found multiple times in alignment ". $cat_aln->id. ", skipping this sequence.");
282 next SEQ;
284 my $cat_seq=$cat_seq[0];
285 my $old_end=$new_seq->end;
286 $new_seq->seq($new_seq->seq.$cat_seq->seq);
288 # Not sure if this is a sensible way to deal with end coordinates
289 $new_seq->end($new_seq->end+$cat_seq->end+1-$cat_seq->start);
291 if ($cat_seq->isa('Bio::Seq::Meta::Array')) {
292 unless ($new_seq->isa('Bio::Seq::Meta::Array')) {
293 my $meta_seq=Bio::Seq::Meta::Array->new;
294 $meta_seq->seq($new_seq->seq);
295 $meta_seq->start($new_seq->start);
296 $meta_seq->end($new_seq->end);
297 if ($new_seq->isa('Bio::Seq::Meta')) {
298 for my $meta_name ($new_seq->meta_names) {
299 $meta_seq->named_submeta($meta_name,
300 $new_seq->start,
301 $old_end,
302 [split(//, $new_seq->named_meta($meta_name))]
306 $new_seq=$meta_seq;
308 for my $meta_name ($cat_seq->meta_names) {
309 $new_seq->named_submeta($meta_name,
310 $old_end+1,
311 $new_seq->end,
312 $cat_seq->named_meta($meta_name)
315 } elsif ($cat_seq->isa('Bio::Seq::Meta')) {
316 if ($new_seq->isa('Bio::Seq::Meta::Array')) {
317 for my $meta_name ($cat_seq->meta_names) {
318 $new_seq->named_submeta($meta_name,
319 $old_end+1,
320 $new_seq->end,
321 [split(//,$cat_seq->named_meta($meta_name))]
324 } else {
325 unless ($new_seq->isa('Bio::Seq::Meta')) {
326 my $meta_seq=Bio::Seq::Meta::Array->new;
327 $meta_seq->seq($new_seq->seq);
328 $meta_seq->start($new_seq->start);
329 $meta_seq->end($new_seq->end);
330 $new_seq=$meta_seq;
332 for my $meta_name ($cat_seq->meta_names) {
333 $new_seq->named_submeta($meta_name,
334 $old_end+1,
335 $new_seq->end,
336 $cat_seq->named_meta($meta_name)
342 $aln->add_seq($new_seq);
344 my $cons_meta = $self->consensus_meta;
345 my $new_cons_meta;
346 if ($cons_meta) {
347 $new_cons_meta = Bio::Seq::Meta->new();
348 for my $meta_name ($cons_meta->meta_names) {
349 $new_cons_meta->named_submeta($meta_name, 1, $self->length, $cons_meta->$meta_name);
352 my $end=$self->length;
353 for my $cat_aln (@aln) {
354 my $cat_cons_meta=$cat_aln->consensus_meta;
355 if ($cat_cons_meta) {
356 $new_cons_meta = Bio::Seq::Meta->new() if !$new_cons_meta;
357 for my $meta_name ($cat_cons_meta->meta_names) {
358 $new_cons_meta->named_submeta($meta_name, $end+1, $end+$cat_aln->length, $cat_cons_meta->$meta_name);
361 $end+=$cat_aln->length;
363 $aln->consensus_meta($new_cons_meta) if $new_cons_meta;
364 return $aln;