Add tests for memory leaks and weaken for Issue #81
[bioperl-live.git] / Bio / SimpleAlign.pm
blob98bb25d6fbe54b255d47ff02c301a6e6efa8f5c4
1 package Bio::SimpleAlign;
2 use strict;
3 use warnings;
4 use Bio::LocatableSeq; # uses Seq's as list
5 use Bio::Seq;
6 use Bio::SeqFeature::Generic;
8 use parent qw(Bio::Root::Root Bio::Align::AlignI Bio::AnnotatableI Bio::FeatureHolderI);
10 # BioPerl module for SimpleAlign
12 # Please direct questions and support issues to <bioperl-l@bioperl.org>
14 # Cared for by Heikki Lehvaslaiho <heikki-at-bioperl-dot-org>
16 # Copyright Ewan Birney
18 # You may distribute this module under the same terms as perl itself
20 # POD documentation - main docs before the code
22 # History:
23 # 11/3/00 Added threshold feature to consensus and consensus_aa - PS
24 # May 2001 major rewrite - Heikki Lehvaslaiho
26 =head1 NAME
28 Bio::SimpleAlign - Multiple alignments held as a set of sequences
30 =head1 SYNOPSIS
32 # Use Bio::AlignIO to read in the alignment
33 $str = Bio::AlignIO->new(-file => 't/data/testaln.pfam');
34 $aln = $str->next_aln();
36 # Describe
37 print $aln->length;
38 print $aln->num_residues;
39 print $aln->is_flush;
40 print $aln->num_sequences;
41 print $aln->score;
42 print $aln->percentage_identity;
43 print $aln->consensus_string(50);
45 # Find the position in the alignment for a sequence location
46 $pos = $aln->column_from_residue_number('1433_LYCES', 14); # = 6;
48 # Extract sequences and check values for the alignment column $pos
49 foreach $seq ($aln->each_seq) {
50 $res = $seq->subseq($pos, $pos);
51 $count{$res}++;
53 foreach $res (keys %count) {
54 printf "Res: %s Count: %2d\n", $res, $count{$res};
57 # Manipulate
58 $aln->remove_seq($seq);
59 $mini_aln = $aln->slice(20,30); # get a block of columns
60 $mini_aln = $aln->select_noncont(1,3,5,7,11); # select certain sequences
61 $new_aln = $aln->remove_columns([20,30]); # remove by position
62 $new_aln = $aln->remove_columns(['mismatch']); # remove by property
64 # Analyze
65 $str = $aln->consensus_string($threshold_percent);
66 $str = $aln->match_line();
67 $str = $aln->cigar_line();
68 $id = $aln->percentage_identity;
70 # See the module documentation for details and more methods.
72 =head1 DESCRIPTION
74 SimpleAlign is an object that handles a multiple sequence alignment
75 (MSA). It is very permissive of types (it does not insist on sequences
76 being all same length, for example). Think of it as a set of sequences
77 with a whole series of built-in manipulations and methods for reading and
78 writing alignments.
80 SimpleAlign uses L<Bio::LocatableSeq>, a subclass of L<Bio::PrimarySeq>,
81 to store its sequences. These are subsequences with a start and end
82 positions in the parent reference sequence. Each sequence in the
83 SimpleAlign object is a Bio::LocatableSeq.
85 SimpleAlign expects the combination of name, start, and end for a
86 given sequence to be unique in the alignment, and this is the key for the
87 internal hashes (name, start, end are abbreviated C<nse> in the code).
88 However, in some cases people do not want the name/start-end to be displayed:
89 either multiple names in an alignment or names specific to the alignment
90 (ROA1_HUMAN_1, ROA1_HUMAN_2 etc). These names are called
91 C<displayname>, and generally is what is used to print out the
92 alignment. They default to name/start-end.
94 The SimpleAlign Module is derived from the Align module by Ewan Birney.
96 =head1 FEEDBACK
98 =head2 Mailing Lists
100 User feedback is an integral part of the evolution of this and other
101 Bioperl modules. Send your comments and suggestions preferably to one
102 of the Bioperl mailing lists. Your participation is much appreciated.
104 bioperl-l@bioperl.org - General discussion
105 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
107 =head2 Support
109 Please direct usage questions or support issues to the mailing list:
111 I<bioperl-l@bioperl.org>
113 rather than to the module maintainer directly. Many experienced and
114 reponsive experts will be able look at the problem and quickly
115 address it. Please include a thorough description of the problem
116 with code and data examples if at all possible.
118 =head2 Reporting Bugs
120 Report bugs to the Bioperl bug tracking system to help us keep track
121 the bugs and their resolution. Bug reports can be submitted via the
122 web:
124 https://github.com/bioperl/bioperl-live/issues
126 =head1 AUTHOR
128 Ewan Birney, birney@ebi.ac.uk
130 =head1 CONTRIBUTORS
132 Allen Day, allenday-at-ucla.edu,
133 Richard Adams, Richard.Adams-at-ed.ac.uk,
134 David J. Evans, David.Evans-at-vir.gla.ac.uk,
135 Heikki Lehvaslaiho, heikki-at-bioperl-dot-org,
136 Allen Smith, allens-at-cpan.org,
137 Jason Stajich, jason-at-bioperl.org,
138 Anthony Underwood, aunderwood-at-phls.org.uk,
139 Xintao Wei & Giri Narasimhan, giri-at-cs.fiu.edu
140 Brian Osborne, bosborne at alum.mit.edu
141 Weigang Qiu, Weigang at GENECTR-HUNTER-CUNY-EDU
142 Hongyu Zhang, forward at hongyu.org
143 Jay Hannah, jay at jays.net
144 Alexandr Bezginov, albezg at gmail.com
146 =head1 SEE ALSO
148 L<Bio::LocatableSeq>
150 =head1 APPENDIX
152 The rest of the documentation details each of the object
153 methods. Internal methods are usually preceded with a _
155 =cut
157 ## This data should probably be in a more centralized module...
158 ## it is taken from Clustalw documentation.
159 ## These are all the positively scoring groups that occur in the
160 ## Gonnet Pam250 matrix. The strong and weak groups are
161 ## defined as strong score >0.5 and weak score =<0.5 respectively.
162 our %CONSERVATION_GROUPS = (
163 'strong' => [qw(STA NEQK NHQK NDEQ QHRK MILV MILF HY FYW )],
164 'weak' => [qw(CSA ATV SAG STNK STPA SGND SNDEQK NDEQHK NEQHRK FVLIM HFY)],
168 =head2 new
170 Title : new
171 Usage : my $aln = Bio::SimpleAlign->new();
172 Function : Creates a new simple align object
173 Returns : Bio::SimpleAlign
174 Args : -source => string representing the source program
175 where this alignment came from
176 -annotation => Bio::AnnotationCollectionI
177 -seq_annotation => Bio::AnnotationCollectionI for sequences (requires -annotation also be set)
178 -seqs => array ref containing Bio::LocatableSeq or Bio::Seq::Meta
179 -consensus => consensus string
180 -consensus_meta => Bio::Seq::Meta object containing consensus met information (kludge)
182 =cut
185 sub new {
186 my($class,@args) = @_;
188 my $self = $class->SUPER::new(@args);
190 my ($src, $score, $id, $acc, $desc, $seqs, $feats, $coll, $sa, $con, $cmeta) = $self->_rearrange([qw(
191 SOURCE
192 SCORE
194 ACCESSION
195 DESCRIPTION
196 SEQS
197 FEATURES
198 ANNOTATION
199 SEQ_ANNOTATION
200 CONSENSUS
201 CONSENSUS_META
202 )], @args);
203 $src && $self->source($src);
204 defined $score && $self->score($score);
205 # we need to set up internal hashs first!
207 $self->{'_seq'} = {};
208 $self->{'_order'} = {};
209 $self->{'_start_end_lists'} = {};
210 $self->{'_dis_name'} = {};
211 $self->{'_id'} = 'NoName';
212 # maybe we should automatically read in from args. Hmmm...
213 $id && $self->id($id);
214 $acc && $self->accession($acc);
215 $desc && $self->description($desc);
216 $coll && $self->annotation($coll);
217 # sequence annotation is layered into a provided annotation collection (or dies)
218 if ($sa) {
219 $self->throw("Must supply an alignment-based annotation collection (-annotation) ".
220 "with a sequence annotation collection")
221 if !$coll;
222 $coll->add_Annotation('seq_annotation', $sa);
224 if ($feats && ref $feats eq 'ARRAY') {
225 for my $feat (@$feats) {
226 $self->add_SeqFeature($feat);
229 $con && $self->consensus($con);
230 $cmeta && $self->consensus_meta($cmeta);
231 # assumes these are in correct alignment order
232 if ($seqs && ref($seqs) eq 'ARRAY') {
233 for my $seq (@$seqs) {
234 $self->add_seq($seq);
238 return $self; # success - we hope!
241 =head1 Modifier methods
243 These methods modify the MSA by adding, removing or shuffling complete
244 sequences.
246 =head2 add_seq
248 Title : add_seq
249 Usage : $myalign->add_seq($newseq);
250 $myalign->add_seq(-SEQ=>$newseq, -ORDER=>5);
251 Function : Adds another sequence to the alignment. *Does not* align
252 it - just adds it to the hashes.
253 If -ORDER is specified, the sequence is inserted at the
254 the position spec'd by -ORDER, and existing sequences
255 are pushed down the storage array.
256 Returns : nothing
257 Args : A Bio::LocatableSeq object
258 Positive integer for the sequence position (optional)
260 See L<Bio::LocatableSeq> for more information
262 =cut
264 sub addSeq {
265 my $self = shift;
266 $self->deprecated("addSeq - deprecated method. Use add_seq() instead.");
267 $self->add_seq(@_);
270 sub add_seq {
271 my $self = shift;
272 my @args = @_;
273 my ($seq, $order) = $self->_rearrange([qw(SEQ ORDER)], @args);
274 my ($name,$id,$start,$end);
276 unless ($seq) {
277 $self->throw("LocatableSeq argument required");
279 if( ! ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
280 $self->throw("Unable to process non locatable sequences [". ref($seq). "]");
282 !defined($order) and $order = 1 + keys %{$self->{'_seq'}}; # default
283 $order--; # jay's patch (user-specified order is 1-origin)
285 if ($order < 0) {
286 $self->throw("User-specified value for ORDER must be >= 1");
289 $id = $seq->id() ||$seq->display_id || $seq->primary_id;
291 # build the symbol list for this sequence,
292 # will prune out the gap and missing/match chars
293 # when actually asked for the symbol list in the
294 # symbol_chars
295 # map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq) if $seq->seq;
297 $name = $seq->get_nse;
299 if( $self->{'_seq'}->{$name} ) {
300 $self->warn("Replacing one sequence [$name]\n") unless $self->verbose < 0;
302 else {
303 $self->debug( "Assigning $name to $order\n");
305 my $ordh = $self->{'_order'};
306 if ($ordh->{$order}) {
307 # make space to insert
308 # $c->() returns (in reverse order) the first subsequence
309 # of consecutive integers; i.e., $c->(1,2,3,5,6,7) returns
310 # (3,2,1), and $c->(2,4,5) returns (2).
311 my $c;
312 $c = sub { return (($_[1]-$_[0] == 1) ? ($c->(@_[1..$#_]),$_[0]) : $_[0]); };
313 map {
314 $ordh->{$_+1} = $ordh->{$_}
315 } $c->(sort {$a <=> $b} grep {$_ >= $order} keys %{$ordh});
318 $ordh->{$order} = $name;
320 unless( exists( $self->{'_start_end_lists'}->{$id})) {
321 $self->{'_start_end_lists'}->{$id} = [];
323 push @{$self->{'_start_end_lists'}->{$id}}, $seq;
326 $self->{'_seq'}->{$name} = $seq;
331 =head2 remove_seq
333 Title : remove_seq
334 Usage : $aln->remove_seq($seq);
335 Function : Removes a single sequence from an alignment
336 Returns :
337 Argument : a Bio::LocatableSeq object
339 =cut
341 sub removeSeq {
342 my $self = shift;
343 $self->deprecated("removeSeq - deprecated method. Use remove_seq() instead.");
344 $self->remove_seq(@_);
347 sub remove_seq {
348 my $self = shift;
349 my $seq = shift;
350 my ($name,$id);
352 $self->throw("Need Bio::Locatable seq argument ")
353 unless ref $seq && $seq->isa( 'Bio::LocatableSeq');
355 $id = $seq->id();
356 $name = $seq->get_nse;
358 if( !exists $self->{'_seq'}->{$name} ) {
359 $self->throw("Sequence $name does not exist in the alignment to remove!");
362 delete $self->{'_seq'}->{$name};
364 # we need to remove this seq from the start_end_lists hash
366 if (exists $self->{'_start_end_lists'}->{$id}) {
367 # we need to find the sequence in the array.
369 my ($i, $found);;
370 for ($i=0; $i < @{$self->{'_start_end_lists'}->{$id}}; $i++) {
371 if (${$self->{'_start_end_lists'}->{$id}}[$i] eq $seq) {
372 $found = 1;
373 last;
376 if ($found) {
377 splice @{$self->{'_start_end_lists'}->{$id}}, $i, 1;
379 else {
380 $self->throw("Could not find the sequence to remoce from the start-end list");
383 else {
384 $self->throw("There is no seq list for the name $id");
386 # we need to shift order hash
387 my %rev_order = reverse %{$self->{'_order'}};
388 my $no = $rev_order{$name};
389 my $num_sequences = $self->num_sequences;
390 for (; $no < $num_sequences; $no++) {
391 $self->{'_order'}->{$no} = $self->{'_order'}->{$no+1};
393 delete $self->{'_order'}->{$no};
394 return 1;
398 =head2 purge
400 Title : purge
401 Usage : $aln->purge(0.7);
402 Function: Removes sequences above given sequence similarity
403 This function will grind on large alignments. Beware!
404 Example :
405 Returns : An array of the removed sequences
406 Args : float, threshold for similarity
408 =cut
410 sub purge {
411 my ($self,$perc) = @_;
412 my (%duplicate, @dups);
414 my @seqs = $self->each_seq();
416 for (my $i=0;$i< @seqs - 1;$i++ ) { #for each seq in alignment
417 my $seq = $seqs[$i];
419 #skip if already in duplicate hash
420 next if exists $duplicate{$seq->display_id} ;
421 my $one = $seq->seq();
423 my @one = split '', $one; #split to get 1aa per array element
425 for (my $j=$i+1;$j < @seqs;$j++) {
426 my $seq2 = $seqs[$j];
428 #skip if already in duplicate hash
429 next if exists $duplicate{$seq2->display_id} ;
431 my $two = $seq2->seq();
432 my @two = split '', $two;
434 my $count = 0;
435 my $res = 0;
436 for (my $k=0;$k<@one;$k++) {
437 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
438 $one[$k] eq $two[$k]) {
439 $count++;
441 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
442 $two[$k] ne '.' && $two[$k] ne '-' ) {
443 $res++;
447 my $ratio = 0;
448 $ratio = $count/$res unless $res == 0;
450 # if above threshold put in duplicate hash and push onto
451 # duplicate array for returning to get_unique
452 if ( $ratio > $perc ) {
453 $self->warn("duplicate: ", $seq2->display_id) if $self->verbose > 0;
454 $duplicate{$seq2->display_id} = 1;
455 push @dups, $seq2;
459 foreach my $seq (@dups) {
460 $self->remove_seq($seq);
462 return @dups;
465 =head2 sort_alphabetically
467 Title : sort_alphabetically
468 Usage : $ali->sort_alphabetically
469 Function : Changes the order of the alignment to alphabetical on name
470 followed by numerical by number.
471 Returns :
472 Argument :
474 =cut
476 sub sort_alphabetically {
477 my $self = shift;
478 my ($seq,$nse,@arr,%hash,$count);
480 foreach $seq ( $self->each_seq() ) {
481 $nse = $seq->get_nse;
482 $hash{$nse} = $seq;
485 $count = 0;
487 %{$self->{'_order'}} = (); # reset the hash;
489 foreach $nse ( sort _alpha_startend keys %hash) {
490 $self->{'_order'}->{$count} = $nse;
492 $count++;
497 =head2 sort_by_list
499 Title : sort_by_list
500 Usage : $aln_ordered=$aln->sort_by_list($list_file)
501 Function : Arbitrarily order sequences in an alignment
502 Returns : A new Bio::SimpleAlign object
503 Argument : a file listing sequence names in intended order (one name per line)
505 =cut
507 sub sort_by_list {
508 my ($self, $list) = @_;
509 my (@seq, @ids, %order);
511 foreach my $seq ( $self->each_seq() ) {
512 push @seq, $seq;
513 push @ids, $seq->display_id;
516 my $ct=1;
517 open my $listfh, '<', $list or $self->throw("Could not read file '$list': $!");
518 while (<$listfh>) {
519 chomp;
520 my $name=$_;
521 $self->throw("Not found in alignment: $name") unless &_in_aln($name, \@ids);
522 $order{$name}=$ct++;
524 close($listfh);
526 # use the map-sort-map idiom:
527 my @sorted= map { $_->[1] } sort { $a->[0] <=> $b->[0] } map { [$order{$_->id()}, $_] } @seq;
528 my $aln = $self->new;
529 foreach (@sorted) { $aln->add_seq($_) }
530 return $aln;
533 =head2 set_new_reference
535 Title : set_new_reference
536 Usage : $aln->set_new_reference(3 or 'B31'): Select the 3rd sequence, or
537 the sequence whoes name is "B31" (full, exact, and case-sensitive),
538 as the reference (1st) sequence
539 Function : Change/Set a new reference (i.e., the first) sequence
540 Returns : a new Bio::SimpleAlign object.
541 Throws an exception if designated sequence not found
542 Argument : a positive integer of sequence order, or a sequence name
543 in the original alignment
545 =cut
547 sub set_new_reference {
548 my ($self, $seqid) = @_;
549 my $aln = $self->new;
550 my (@seq, @ids, @new_seq);
551 my $is_num=0;
552 foreach my $seq ( $self->each_seq() ) {
553 push @seq, $seq;
554 push @ids, $seq->display_id;
557 if ($seqid =~ /^\d+$/) { # argument is seq position
558 $is_num=1;
559 $self->throw("The new reference sequence number has to be a positive integer >1 and <= num_sequences ") if ($seqid <= 1 || $seqid > $self->num_sequences);
560 } else { # argument is a seq name
561 $self->throw("The new reference sequence not in alignment ") unless &_in_aln($seqid, \@ids);
564 for (my $i=0; $i<=$#seq; $i++) {
565 my $pos=$i+1;
566 if ( ($is_num && $pos == $seqid) || ($seqid eq $seq[$i]->display_id) ) {
567 unshift @new_seq, $seq[$i];
568 } else {
569 push @new_seq, $seq[$i];
572 foreach (@new_seq) { $aln->add_seq($_); }
573 return $aln;
576 sub _in_aln { # check if input name exists in the alignment
577 my ($str, $ref) = @_;
578 foreach (@$ref) {
579 return 1 if $str eq $_;
581 return 0;
585 =head2 uniq_seq
587 Title : uniq_seq
588 Usage : $aln->uniq_seq(): Remove identical sequences in
589 in the alignment. Ambiguous base ("N", "n") and
590 leading and ending gaps ("-") are NOT counted as
591 differences.
592 Function : Make a new alignment of unique sequence types (STs)
593 Returns : 1a. if called in a scalar context,
594 a new Bio::SimpleAlign object (all sequences renamed as "ST")
595 1b. if called in an array context,
596 a new Bio::SimpleAlign object, and a hashref whose keys
597 are sequence types, and whose values are arrayrefs to
598 lists of sequence ids within the corresponding sequence type
599 2. if $aln->verbose > 0, ST of each sequence is sent to
600 STDERR (in a tabular format)
601 Argument : None
603 =cut
605 sub uniq_seq {
606 my ($self, $seqid) = @_;
607 my $aln = $self->new;
608 my (%member, %order, @seq, @uniq_str, $st);
609 my $order=0;
610 my $len = $self->length();
611 $st = {};
612 foreach my $seq ( $self->each_seq() ) {
613 my $str = $seq->seq();
615 # it's necessary to ignore "n", "N", leading gaps and ending gaps in
616 # comparing two sequence strings
618 # 1st, convert "n", "N" to "?" (for DNA sequence only):
619 $str =~ s/n/\?/gi if $str =~ /^[atcgn-]+$/i;
620 # 2nd, convert leading and ending gaps to "?":
621 $str = &_convert_leading_ending_gaps($str, '-', '?');
622 # Note that '?' also can mean unknown residue.
623 # I don't like making global class member changes like this, too
624 # prone to errors... -- cjfields 08-11-18
625 local $Bio::LocatableSeq::GAP_SYMBOLS = '-\?';
626 my $new = Bio::LocatableSeq->new(
627 -id => $seq->id(),
628 -alphabet=> $seq->alphabet,
629 -seq => $str,
630 -start => $seq->start,
631 -end => $seq->end
633 push @seq, $new;
636 foreach my $seq (@seq) {
637 my $str = $seq->seq();
638 my ($seen, $key) = &_check_uniq($str, \@uniq_str, $len);
639 if ($seen) { # seen before
640 my @memb = @{$member{$key}};
641 push @memb, $seq;
642 $member{$key} = \@memb;
643 } else { # not seen
644 push @uniq_str, $key;
645 $order++;
646 $member{$key} = [ ($seq) ];
647 $order{$key} = $order;
651 foreach my $str (sort {$order{$a} <=> $order{$b}} keys %order) { # sort by input order
652 # convert leading/ending "?" back into "-" ("?" throws errors by SimpleAlign):
653 my $str2 = &_convert_leading_ending_gaps($str, '?', '-');
654 # convert middle "?" back into "N" ("?" throws errors by SimpleAlign):
655 $str2 =~ s/\?/N/g if $str2 =~ /^[atcg\-\?]+$/i;
656 my $gap='-';
657 my $end= CORE::length($str2);
658 $end -= CORE::length($1) while $str2 =~ m/($gap+)/g;
659 my $new = Bio::LocatableSeq->new(-id =>"ST".$order{$str},
660 -seq =>$str2,
661 -start=>1,
662 -end =>$end
664 $aln->add_seq($new);
665 foreach (@{$member{$str}}) {
666 push @{$$st{$order{$str}}}, $_->id(); # per Tristan's patch/Bug #2805
667 $self->debug($_->id(), "\t", "ST", $order{$str}, "\n");
670 return wantarray ? ($aln, $st) : $aln;
673 sub _check_uniq { # check if same seq exists in the alignment
674 my ($str1, $ref, $length) = @_;
675 my @char1=split //, $str1;
676 my @array=@$ref;
678 return (0, $str1) if @array==0; # not seen (1st sequence)
680 foreach my $str2 (@array) {
681 my $diff=0;
682 my @char2=split //, $str2;
683 for (my $i=0; $i<=$length-1; $i++) {
684 next if $char1[$i] eq '?';
685 next if $char2[$i] eq '?';
686 $diff++ if $char1[$i] ne $char2[$i];
688 return (1, $str2) if $diff == 0; # seen before
691 return (0, $str1); # not seen
694 sub _convert_leading_ending_gaps {
695 my $s=shift;
696 my $sym1=shift;
697 my $sym2=shift;
698 my @array=split //, $s;
699 # convert leading char:
700 for (my $i=0; $i<=$#array; $i++) {
701 ($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
703 # convert ending char:
704 for (my $i = $#array; $i>= 0; $i--) {
705 ($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
707 my $s_new=join '', @array;
708 return $s_new;
711 =head1 Sequence selection methods
713 Methods returning one or more sequences objects.
715 =head2 each_seq
717 Title : each_seq
718 Usage : foreach $seq ( $align->each_seq() )
719 Function : Gets a Seq object from the alignment
720 Returns : Seq object
721 Argument :
723 =cut
725 sub eachSeq {
726 my $self = shift;
727 $self->deprecated("eachSeq - deprecated method. Use each_seq() instead.");
728 $self->each_seq();
731 sub each_seq {
732 my $self = shift;
733 my (@arr,$order);
735 foreach $order ( sort { $a <=> $b } keys %{$self->{'_order'}} ) {
736 if( exists $self->{'_seq'}->{$self->{'_order'}->{$order}} ) {
737 push(@arr,$self->{'_seq'}->{$self->{'_order'}->{$order}});
740 return @arr;
744 =head2 each_alphabetically
746 Title : each_alphabetically
747 Usage : foreach $seq ( $ali->each_alphabetically() )
748 Function : Returns a sequence object, but the objects are returned
749 in alphabetically sorted order.
750 Does not change the order of the alignment.
751 Returns : Seq object
752 Argument :
754 =cut
756 sub each_alphabetically {
757 my $self = shift;
758 my ($seq,$nse,@arr,%hash,$count);
760 foreach $seq ( $self->each_seq() ) {
761 $nse = $seq->get_nse;
762 $hash{$nse} = $seq;
765 foreach $nse ( sort _alpha_startend keys %hash) {
766 push(@arr,$hash{$nse});
768 return @arr;
771 sub _alpha_startend {
772 my ($aname,$astart,$bname,$bstart);
773 ($aname,$astart) = split (/-/,$a);
774 ($bname,$bstart) = split (/-/,$b);
776 if( $aname eq $bname ) {
777 return $astart <=> $bstart;
779 else {
780 return $aname cmp $bname;
784 =head2 each_seq_with_id
786 Title : each_seq_with_id
787 Usage : foreach $seq ( $align->each_seq_with_id() )
788 Function : Gets a Seq objects from the alignment, the contents
789 being those sequences with the given name (there may be
790 more than one)
791 Returns : Seq object
792 Argument : a seq name
794 =cut
796 sub eachSeqWithId {
797 my $self = shift;
798 $self->deprecated("eachSeqWithId - deprecated method. Use each_seq_with_id() instead.");
799 $self->each_seq_with_id(@_);
802 sub each_seq_with_id {
803 my $self = shift;
804 my $id = shift;
806 $self->throw("Method each_seq_with_id needs a sequence name argument")
807 unless defined $id;
809 my (@arr, $seq);
811 if (exists($self->{'_start_end_lists'}->{$id})) {
812 @arr = @{$self->{'_start_end_lists'}->{$id}};
814 return @arr;
817 =head2 get_seq_by_pos
819 Title : get_seq_by_pos
820 Usage : $seq = $aln->get_seq_by_pos(3) # third sequence from the alignment
821 Function : Gets a sequence based on its position in the alignment.
822 Numbering starts from 1. Sequence positions larger than
823 num_sequences() will throw an error.
824 Returns : a Bio::LocatableSeq object
825 Args : positive integer for the sequence position
827 =cut
829 sub get_seq_by_pos {
831 my $self = shift;
832 my ($pos) = @_;
834 $self->throw("Sequence position has to be a positive integer, not [$pos]")
835 unless $pos =~ /^\d+$/ and $pos > 0;
836 $self->throw("No sequence at position [$pos]")
837 unless $pos <= $self->num_sequences ;
839 my $nse = $self->{'_order'}->{--$pos};
840 return $self->{'_seq'}->{$nse};
843 =head2 get_seq_by_id
845 Title : get_seq_by_id
846 Usage : $seq = $aln->get_seq_by_id($name) # seq named $name
847 Function : Gets a sequence based on its name.
848 Sequences that do not exist will warn and return undef
849 Returns : a Bio::LocatableSeq object
850 Args : string for sequence name
852 =cut
854 sub get_seq_by_id {
855 my ($self,$name) = @_;
856 unless( defined $name ) {
857 $self->warn("Must provide a sequence name");
858 return;
860 for my $seq ( values %{$self->{'_seq'}} ) {
861 if ( $seq->id eq $name) {
862 return $seq;
865 return;
868 =head2 seq_with_features
870 Title : seq_with_features
871 Usage : $seq = $aln->seq_with_features(-pos => 1,
872 -consensus => 60
873 -mask =>
874 sub { my $consensus = shift;
876 for my $i (1..5){
877 my $n = 'N' x $i;
878 my $q = '\?' x $i;
879 while($consensus =~ /[^?]$q[^?]/){
880 $consensus =~ s/([^?])$q([^?])/$1$n$2/;
883 return $consensus;
886 Function: produces a Bio::Seq object by first splicing gaps from -pos
887 (by means of a splice_by_seq_pos() call), then creating
888 features using non-? chars (by means of a consensus_string()
889 call with stringency -consensus).
890 Returns : a Bio::Seq object
891 Args : -pos : required. sequence from which to build the Bio::Seq
892 object
893 -consensus : optional, defaults to consensus_string()'s
894 default cutoff value
895 -mask : optional, a coderef to apply to consensus_string()'s
896 output before building features. this may be useful for
897 closing gaps of 1 bp by masking over them with N, for
898 instance
900 =cut
902 sub seq_with_features{
903 my ($self,%arg) = @_;
905 #first do the preparatory splice
906 $self->throw("must provide a -pos argument") unless $arg{-pos};
907 $self->splice_by_seq_pos($arg{-pos});
909 my $consensus_string = $self->consensus_string($arg{-consensus});
910 $consensus_string = $arg{-mask}->($consensus_string)
911 if defined($arg{-mask});
913 my(@bs,@es);
915 push @bs, 1 if $consensus_string =~ /^[^?]/;
917 while($consensus_string =~ /\?[^?]/g){
918 push @bs, pos($consensus_string);
920 while($consensus_string =~ /[^?]\?/g){
921 push @es, pos($consensus_string);
924 push @es, CORE::length($consensus_string) if $consensus_string =~ /[^?]$/;
926 my $seq = Bio::Seq->new();
928 # my $rootfeature = Bio::SeqFeature::Generic->new(
929 # -source_tag => 'location',
930 # -start => $self->get_seq_by_pos($arg{-pos})->start,
931 # -end => $self->get_seq_by_pos($arg{-pos})->end,
932 # );
933 # $seq->add_SeqFeature($rootfeature);
935 while(my $b = shift @bs){
936 my $e = shift @es;
937 $seq->add_SeqFeature(
938 Bio::SeqFeature::Generic->new(
939 -start => $b - 1 + $self->get_seq_by_pos($arg{-pos})->start,
940 -end => $e - 1 + $self->get_seq_by_pos($arg{-pos})->start,
941 -source_tag => $self->source || 'MSA',
946 return $seq;
950 =head1 Create new alignments
952 The result of these methods are horizontal or vertical subsets of the
953 current MSA.
955 =head2 select
957 Title : select
958 Usage : $aln2 = $aln->select(1, 3) # three first sequences
959 Function : Creates a new alignment from a continuous subset of
960 sequences. Numbering starts from 1. Sequence positions
961 larger than num_sequences() will throw an error.
962 Returns : a Bio::SimpleAlign object
963 Args : positive integer for the first sequence
964 positive integer for the last sequence to include (optional)
966 =cut
968 sub select {
969 my $self = shift;
970 my ($start, $end) = @_;
972 $self->throw("Select start has to be a positive integer, not [$start]")
973 unless $start =~ /^\d+$/ and $start > 0;
974 $self->throw("Select end has to be a positive integer, not [$end]")
975 unless $end =~ /^\d+$/ and $end > 0;
976 $self->throw("Select $start [$start] has to be smaller than or equal to end [$end]")
977 unless $start <= $end;
979 my $aln = $self->new;
980 foreach my $pos ($start .. $end) {
981 $aln->add_seq($self->get_seq_by_pos($pos));
983 $aln->id($self->id);
984 # fix for meta, sf, ann
985 return $aln;
988 =head2 select_noncont
990 Title : select_noncont
991 Usage : # 1st and 3rd sequences, sorted
992 $aln2 = $aln->select_noncont(1, 3)
994 # 1st and 3rd sequences, sorted (same as first)
995 $aln2 = $aln->select_noncont(3, 1)
997 # 1st and 3rd sequences, unsorted
998 $aln2 = $aln->select_noncont('nosort',3, 1)
1000 Function : Creates a new alignment from a subset of sequences. Numbering
1001 starts from 1. Sequence positions larger than num_sequences() will
1002 throw an error. Sorts the order added to new alignment by default,
1003 to prevent sorting pass 'nosort' as the first argument in the list.
1004 Returns : a Bio::SimpleAlign object
1005 Args : array of integers for the sequences. If the string 'nosort' is
1006 passed as the first argument, the sequences will not be sorted
1007 in the new alignment but will appear in the order listed.
1009 =cut
1011 sub select_noncont {
1012 my $self = shift;
1013 my $nosort = 0;
1014 my (@pos) = @_;
1015 if ($pos[0] !~ m{^\d+$}) {
1016 my $sortcmd = shift @pos;
1017 if ($sortcmd eq 'nosort') {
1018 $nosort = 1;
1019 } else {
1020 $self->throw("Command not recognized: $sortcmd. Only 'nosort' implemented at this time.");
1024 my $end = $self->num_sequences;
1025 foreach ( @pos ) {
1026 $self->throw("position must be a positive integer, > 0 and <= $end not [$_]")
1027 unless( /^\d+$/ && $_ > 0 && $_ <= $end );
1030 @pos = sort {$a <=> $b} @pos unless $nosort;
1032 my $aln = $self->new;
1033 foreach my $p (@pos) {
1034 $aln->add_seq($self->get_seq_by_pos($p));
1036 $aln->id($self->id);
1037 # fix for meta, sf, ann
1038 return $aln;
1041 =head2 select_noncont_by_name
1043 Title : select_noncont_by_name
1044 Usage : my $aln2 = $aln->select_noncont_by_name('A123', 'B456');
1045 Function : Creates a new alignment from a subset of sequences which are
1046 selected by name (sequence ID).
1047 Returns : a Bio::SimpleAlign object
1048 Args : array of names (i.e., identifiers) for the sequences.
1050 =cut
1052 sub select_noncont_by_name {
1053 my ($self, @names) = @_;
1055 my $aln = $self->new;
1056 foreach my $name (@names) {
1057 $aln->add_seq($self->get_seq_by_id($name));
1059 $aln->id($self->id);
1061 return $aln;
1064 =head2 slice
1066 Title : slice
1067 Usage : $aln2 = $aln->slice(20,30)
1068 Function : Creates a slice from the alignment inclusive of start and
1069 end columns, and the first column in the alignment is denoted 1.
1070 Sequences with no residues in the slice are excluded from the
1071 new alignment and a warning is printed. Slice beyond the length of
1072 the sequence does not do padding.
1073 Returns : A Bio::SimpleAlign object
1074 Args : Positive integer for start column, positive integer for end column,
1075 optional boolean which if true will keep gap-only columns in the newly
1076 created slice. Example:
1078 $aln2 = $aln->slice(20,30,1)
1080 =cut
1082 sub slice {
1083 my $self = shift;
1084 my ($start, $end, $keep_gap_only) = @_;
1086 $self->throw("Slice start has to be a positive integer, not [$start]")
1087 unless $start =~ /^\d+$/ and $start > 0;
1088 $self->throw("Slice end has to be a positive integer, not [$end]")
1089 unless $end =~ /^\d+$/ and $end > 0;
1090 $self->throw("Slice start [$start] has to be smaller than or equal to end [$end]")
1091 unless $start <= $end;
1092 $self->throw("This alignment has only ". $self->length . " residues. Slice start " .
1093 "[$start] is too big.") if $start > $self->length;
1094 my $cons_meta = $self->consensus_meta;
1095 my $aln = $self->new;
1096 $aln->id($self->id);
1097 foreach my $seq ( $self->each_seq() ) {
1098 my $new_seq = $seq->isa('Bio::Seq::MetaI') ?
1099 Bio::Seq::Meta->new
1100 (-id => $seq->id,
1101 -alphabet => $seq->alphabet,
1102 -strand => $seq->strand,
1103 -verbose => $self->verbose) :
1104 Bio::LocatableSeq->new
1105 (-id => $seq->id,
1106 -alphabet => $seq->alphabet,
1107 -strand => $seq->strand,
1108 -verbose => $self->verbose);
1110 # seq
1111 my $seq_end = $end;
1112 $seq_end = $seq->length if( $end > $seq->length );
1114 my $slice_seq = $seq->subseq($start, $seq_end);
1115 $new_seq->seq( $slice_seq );
1117 # Allowed extra characters in string
1118 my $allowed_chars = '';
1119 if (exists $self->{_mask_char}) {
1120 $allowed_chars = $self->{_mask_char};
1121 $allowed_chars = quotemeta $allowed_chars;
1123 $slice_seq =~ s/[^\w$allowed_chars]//g;
1125 if ($start > 1) {
1126 my $pre_start_seq = $seq->subseq(1, $start - 1);
1127 $pre_start_seq =~ s/[^\w$allowed_chars]//g;
1128 if (!defined($seq->strand)) {
1129 $new_seq->start( $seq->start + CORE::length($pre_start_seq) );
1130 } elsif ($seq->strand < 0){
1131 $new_seq->start( $seq->end - CORE::length($pre_start_seq) - CORE::length($slice_seq) + 1);
1132 } else {
1133 $new_seq->start( $seq->start + CORE::length($pre_start_seq) );
1135 } else {
1136 if ((defined $seq->strand)&&($seq->strand < 0)){
1137 $new_seq->start( $seq->end - CORE::length($slice_seq) + 1);
1138 } else {
1139 $new_seq->start( $seq->start);
1142 if ($new_seq->isa('Bio::Seq::MetaI')) {
1143 for my $meta_name ($seq->meta_names) {
1144 $new_seq->named_meta($meta_name, $seq->named_submeta($meta_name, $start, $end));
1147 $new_seq->end( $new_seq->start + CORE::length($slice_seq) - 1 );
1149 if ($new_seq->start and $new_seq->end >= $new_seq->start) {
1150 $aln->add_seq($new_seq);
1151 } else {
1152 if( $keep_gap_only ) {
1153 $aln->add_seq($new_seq);
1154 } else {
1155 my $nse = $seq->get_nse();
1156 $self->warn("Slice [$start-$end] of sequence [$nse] contains no residues.".
1157 " Sequence excluded from the new alignment.");
1161 if ($cons_meta) {
1162 my $new = Bio::Seq::Meta->new();
1163 for my $meta_name ($cons_meta->meta_names) {
1164 $new->named_meta($meta_name, $cons_meta->named_submeta($meta_name, $start, $end));
1166 $aln->consensus_meta($new);
1168 $aln->annotation($self->annotation);
1169 # fix for meta, sf, ann
1170 return $aln;
1173 =head2 remove_columns
1175 Title : remove_columns
1176 Usage : $aln2 = $aln->remove_columns(['mismatch','weak']) or
1177 $aln2 = $aln->remove_columns([0,0],[6,8])
1178 Function : Creates an aligment with columns removed corresponding to
1179 the specified type or by specifying the columns by number.
1180 Returns : Bio::SimpleAlign object
1181 Args : Array ref of types ('match'|'weak'|'strong'|'mismatch'|'gaps'|
1182 'all_gaps_columns') or array ref where the referenced array
1183 contains a pair of integers that specify a range.
1184 The first column is 0
1186 =cut
1188 sub remove_columns {
1189 my ($self,@args) = @_;
1190 @args || $self->throw("Must supply column ranges or column types");
1191 my $aln;
1193 if ($args[0][0] =~ /^[a-z_]+$/i) {
1194 $aln = $self->_remove_columns_by_type($args[0]);
1195 } elsif ($args[0][0] =~ /^\d+$/) {
1196 $aln = $self->_remove_columns_by_num(\@args);
1197 } else {
1198 $self->throw("You must pass array references to remove_columns(), not @args");
1200 # fix for meta, sf, ann
1201 $aln;
1205 =head2 remove_gaps
1207 Title : remove_gaps
1208 Usage : $aln2 = $aln->remove_gaps
1209 Function : Creates an aligment with gaps removed
1210 Returns : a Bio::SimpleAlign object
1211 Args : a gap character(optional) if none specified taken
1212 from $self->gap_char,
1213 [optional] $all_gaps_columns flag (1 or 0, default is 0)
1214 indicates that only all-gaps columns should be deleted
1216 Used from method L<remove_columns> in most cases. Set gap character
1217 using L<gap_char()|gap_char>.
1219 =cut
1221 sub remove_gaps {
1222 my ($self,$gapchar,$all_gaps_columns) = @_;
1223 my $gap_line;
1224 if ($all_gaps_columns) {
1225 $gap_line = $self->all_gap_line($gapchar);
1226 } else {
1227 $gap_line = $self->gap_line($gapchar);
1229 my $aln = $self->new;
1231 my @remove;
1232 my $length = 0;
1233 my $del_char = $gapchar || $self->gap_char;
1234 # Do the matching to get the segments to remove
1235 while ($gap_line =~ m/[$del_char]/g) {
1236 my $start = pos($gap_line)-1;
1237 $gap_line =~ m/\G[$del_char]+/gc;
1238 my $end = pos($gap_line)-1;
1240 #have to offset the start and end for subsequent removes
1241 $start-=$length;
1242 $end -=$length;
1243 $length += ($end-$start+1);
1244 push @remove, [$start,$end];
1247 #remove the segments
1248 $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1249 # fix for meta, sf, ann
1250 return $aln;
1254 sub _remove_col {
1255 my ($self,$aln,$remove) = @_;
1256 my @new;
1258 my $gap = $self->gap_char;
1260 # splice out the segments and create new seq
1261 foreach my $seq($self->each_seq){
1262 my $new_seq = Bio::LocatableSeq->new(
1263 -id => $seq->id,
1264 -alphabet=> $seq->alphabet,
1265 -strand => $seq->strand,
1266 -verbose => $self->verbose);
1267 my $sequence = $seq->seq;
1268 foreach my $pair(@{$remove}){
1269 my $start = $pair->[0];
1270 my $end = $pair->[1];
1271 $sequence = $seq->seq unless $sequence;
1272 my $orig = $sequence;
1273 my $head = $start > 0 ? substr($sequence, 0, $start) : '';
1274 my $tail = ($end + 1) >= CORE::length($sequence) ? '' : substr($sequence, $end + 1);
1275 $sequence = $head.$tail;
1276 # start
1277 unless (defined $new_seq->start) {
1278 if ($start == 0) {
1279 my $start_adjust = () = substr($orig, 0, $end + 1) =~ /$gap/g;
1280 $new_seq->start($seq->start + $end + 1 - $start_adjust);
1282 else {
1283 my $start_adjust = $orig =~ /^$gap+/;
1284 if ($start_adjust) {
1285 $start_adjust = $+[0] == $start;
1287 $new_seq->start($seq->start + $start_adjust);
1290 # end
1291 if (($end + 1) >= CORE::length($orig)) {
1292 my $end_adjust = () = substr($orig, $start) =~ /$gap/g;
1293 $new_seq->end($seq->end - (CORE::length($orig) - $start) + $end_adjust);
1295 else {
1296 $new_seq->end($seq->end);
1300 if ($new_seq->end < $new_seq->start) {
1301 # we removed all columns except for gaps: set to 0 to indicate no
1302 # sequence
1303 $new_seq->start(0);
1304 $new_seq->end(0);
1307 $new_seq->seq($sequence) if $sequence;
1308 push @new, $new_seq;
1310 # add the new seqs to the alignment
1311 foreach my $new(@new){
1312 $aln->add_seq($new);
1314 # fix for meta, sf, ann
1315 return $aln;
1318 sub _remove_columns_by_type {
1319 my ($self,$type) = @_;
1320 my $aln = $self->new;
1321 my @remove;
1323 my $gap = $self->gap_char if (grep { $_ eq 'gaps'} @{$type});
1324 my $all_gaps_columns = $self->gap_char if (grep /all_gaps_columns/,@{$type});
1325 my %matchchars = ( 'match' => '\*',
1326 'weak' => '\.',
1327 'strong' => ':',
1328 'mismatch' => ' ',
1329 'gaps' => '',
1330 'all_gaps_columns' => ''
1332 # get the characters to delete against
1333 my $del_char;
1334 foreach my $type (@{$type}){
1335 $del_char.= $matchchars{$type};
1338 my $length = 0;
1339 my $match_line = $self->match_line;
1340 # do the matching to get the segments to remove
1341 if($del_char){
1342 while($match_line =~ m/[$del_char]/g ){
1343 my $start = pos($match_line)-1;
1344 $match_line=~/\G[$del_char]+/gc;
1345 my $end = pos($match_line)-1;
1347 #have to offset the start and end for subsequent removes
1348 $start-=$length;
1349 $end -=$length;
1350 $length += ($end-$start+1);
1351 push @remove, [$start,$end];
1355 # remove the segments
1356 $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1357 $aln = $aln->remove_gaps() if $gap;
1358 $aln = $aln->remove_gaps('', 1) if $all_gaps_columns;
1359 # fix for meta, sf, ann
1360 $aln;
1364 sub _remove_columns_by_num {
1365 my ($self,$positions) = @_;
1366 my $aln = $self->new;
1368 # sort the positions
1369 @$positions = sort { $a->[0] <=> $b->[0] } @$positions;
1371 my @remove;
1372 my $length = 0;
1373 foreach my $pos (@{$positions}) {
1374 my ($start, $end) = @{$pos};
1376 #have to offset the start and end for subsequent removes
1377 $start-=$length;
1378 $end -=$length;
1379 $length += ($end-$start+1);
1380 push @remove, [$start,$end];
1383 #remove the segments
1384 $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1385 # fix for meta, sf, ann
1386 $aln;
1390 =head1 Change sequences within the MSA
1392 These methods affect characters in all sequences without changing the
1393 alignment.
1395 =head2 splice_by_seq_pos
1397 Title : splice_by_seq_pos
1398 Usage : $status = splice_by_seq_pos(1);
1399 Function: splices all aligned sequences where the specified sequence
1400 has gaps.
1401 Example :
1402 Returns : 1 on success
1403 Args : position of sequence to splice by
1406 =cut
1408 sub splice_by_seq_pos{
1409 my ($self,$pos) = @_;
1411 my $guide = $self->get_seq_by_pos($pos);
1412 my $guide_seq = $guide->seq;
1414 $guide_seq =~ s/\./\-/g;
1416 my @gaps = ();
1417 $pos = -1;
1418 while(($pos = index($guide_seq, '-', $pos)) > -1 ){
1419 unshift @gaps, $pos;
1420 $pos++;
1423 foreach my $seq ($self->each_seq){
1424 my @bases = split '', $seq->seq;
1426 splice(@bases, $_, 1) foreach @gaps;
1427 $seq->seq(join('', @bases));
1433 =head2 map_chars
1435 Title : map_chars
1436 Usage : $ali->map_chars('\.','-')
1437 Function : Does a s/$arg1/$arg2/ on the sequences. Useful for gap
1438 characters.
1440 Note that the first argument is interpreted as a regexp
1441 so be careful and escape any wild card characters (e.g.
1442 do $ali->map_chars('\.','-') to replace periods with dashes.
1443 Returns : 1 on success
1444 Argument : A regexp and a string
1446 =cut
1448 sub map_chars {
1449 my $self = shift;
1450 my $from = shift;
1451 my $to = shift;
1452 my ( $seq, $temp );
1454 $self->throw("Need two arguments: a regexp and a string")
1455 unless defined $from and defined $to;
1457 foreach $seq ( $self->each_seq() ) {
1458 $temp = $seq->seq();
1459 $temp =~ s/$from/$to/g;
1460 $seq->seq($temp);
1462 return 1;
1466 =head2 uppercase
1468 Title : uppercase()
1469 Usage : $ali->uppercase()
1470 Function : Sets all the sequences to uppercase
1471 Returns : 1 on success
1472 Argument :
1474 =cut
1476 sub uppercase {
1477 my $self = shift;
1478 my $seq;
1479 my $temp;
1481 foreach $seq ( $self->each_seq() ) {
1482 $temp = $seq->seq();
1483 $temp =~ tr/[a-z]/[A-Z]/;
1485 $seq->seq($temp);
1487 return 1;
1490 =head2 cigar_line
1492 Title : cigar_line()
1493 Usage : %cigars = $align->cigar_line()
1494 Function : Generates a "cigar" (Compact Idiosyncratic Gapped Alignment
1495 Report) line for each sequence in the alignment. Examples are
1496 "1,60" or "5,10:12,58", where the numbers refer to conserved
1497 positions within the alignment. The keys of the hash are the
1498 NSEs (name/start/end) assigned to each sequence.
1499 Args : threshold (optional, defaults to 100)
1500 Returns : Hash of strings (cigar lines)
1502 =cut
1504 sub cigar_line {
1505 my $self = shift;
1506 my $thr=shift||100;
1507 my %cigars;
1509 my @consensus = split "",($self->consensus_string($thr));
1510 my $len = $self->length;
1511 my $gapchar = $self->gap_char;
1513 # create a precursor, something like (1,4,5,6,7,33,45),
1514 # where each number corresponds to a conserved position
1515 foreach my $seq ( $self->each_seq ) {
1516 my @seq = split "", uc ($seq->seq);
1517 my $pos = 1;
1518 for (my $x = 0 ; $x < $len ; $x++ ) {
1519 if ($seq[$x] eq $consensus[$x]) {
1520 push @{$cigars{$seq->get_nse}},$pos;
1521 $pos++;
1522 } elsif ($seq[$x] ne $gapchar) {
1523 $pos++;
1527 # duplicate numbers - (1,4,5,6,7,33,45) becomes (1,1,4,5,6,7,33,33,45,45)
1528 for my $name (keys %cigars) {
1529 splice @{$cigars{$name}}, 1, 0, ${$cigars{$name}}[0] if
1530 ( ${$cigars{$name}}[0] + 1 < ${$cigars{$name}}[1] );
1531 push @{$cigars{$name}}, ${$cigars{$name}}[$#{$cigars{$name}}] if
1532 ( ${$cigars{$name}}[($#{$cigars{$name}} - 1)] + 1 <
1533 ${$cigars{$name}}[$#{$cigars{$name}}] );
1534 for ( my $x = 1 ; $x < $#{$cigars{$name}} - 1 ; $x++) {
1535 if (${$cigars{$name}}[$x - 1] + 1 < ${$cigars{$name}}[$x] &&
1536 ${$cigars{$name}}[$x + 1] > ${$cigars{$name}}[$x] + 1) {
1537 splice @{$cigars{$name}}, $x, 0, ${$cigars{$name}}[$x];
1541 # collapse series - (1,1,4,5,6,7,33,33,45,45) becomes (1,1,4,7,33,33,45,45)
1542 for my $name (keys %cigars) {
1543 my @remove;
1544 for ( my $x = 0 ; $x < $#{$cigars{$name}} ; $x++) {
1545 if ( ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x - 1)] + 1 &&
1546 ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x + 1)] - 1 ) {
1547 unshift @remove,$x;
1550 for my $pos (@remove) {
1551 splice @{$cigars{$name}}, $pos, 1;
1554 # join and punctuate
1555 for my $name (keys %cigars) {
1556 my ($start,$end,$str) = "";
1557 while ( ($start,$end) = splice @{$cigars{$name}}, 0, 2 ) {
1558 $str .= ($start . "," . $end . ":");
1560 $str =~ s/:$//;
1561 $cigars{$name} = $str;
1563 %cigars;
1567 =head2 match_line
1569 Title : match_line()
1570 Usage : $line = $align->match_line()
1571 Function : Generates a match line - much like consensus string
1572 except that a line indicating the '*' for a match.
1573 Args : (optional) Match line characters ('*' by default)
1574 (optional) Strong match char (':' by default)
1575 (optional) Weak match char ('.' by default)
1576 Returns : String
1578 =cut
1580 sub match_line {
1581 my ($self,$matchlinechar, $strong, $weak) = @_;
1582 my %matchchars = ('match' => $matchlinechar || '*',
1583 'weak' => $weak || '.',
1584 'strong' => $strong || ':',
1585 'mismatch' => ' ',
1588 my @seqchars;
1589 my $alphabet;
1590 foreach my $seq ( $self->each_seq ) {
1591 push @seqchars, [ split(//, uc ($seq->seq)) ];
1592 $alphabet = $seq->alphabet unless defined $alphabet;
1594 my $refseq = shift @seqchars;
1595 # let's just march down the columns
1596 my $matchline;
1597 POS:
1598 foreach my $pos ( 0..$self->length ) {
1599 my $refchar = $refseq->[$pos];
1600 my $char = $matchchars{'mismatch'};
1601 unless( defined $refchar ) {
1602 last if $pos == $self->length; # short circuit on last residue
1603 # this in place to handle jason's soon-to-be-committed
1604 # intron mapping code
1605 goto bottom;
1607 my %col = ($refchar => 1);
1608 my $dash = ($refchar eq '-' || $refchar eq '.' || $refchar eq ' ');
1609 foreach my $seq ( @seqchars ) {
1610 next if $pos >= scalar @$seq;
1611 $dash = 1 if( $seq->[$pos] eq '-' || $seq->[$pos] eq '.' ||
1612 $seq->[$pos] eq ' ' );
1613 $col{$seq->[$pos]}++ if defined $seq->[$pos];
1615 my @colresidues = sort keys %col;
1617 # if all the values are the same
1618 if( $dash ) { $char = $matchchars{'mismatch'} }
1619 elsif( @colresidues == 1 ) { $char = $matchchars{'match'} }
1620 elsif( $alphabet eq 'protein' ) { # only try to do weak/strong
1621 # matches for protein seqs
1622 TYPE:
1623 foreach my $type ( qw(strong weak) ) {
1624 # iterate through categories
1625 my %groups;
1626 # iterate through each of the aa in the col
1627 # look to see which groups it is in
1628 foreach my $c ( @colresidues ) {
1629 foreach my $f ( grep { index($_,$c) >= 0 } @{$CONSERVATION_GROUPS{$type}} ) {
1630 push @{$groups{$f}},$c;
1633 GRP:
1634 foreach my $cols ( values %groups ) {
1635 @$cols = sort @$cols;
1636 # now we are just testing to see if two arrays
1637 # are identical w/o changing either one
1638 # have to be same len
1639 next if( scalar @$cols != scalar @colresidues );
1640 # walk down the length and check each slot
1641 for($_=0;$_ < (scalar @$cols);$_++ ) {
1642 next GRP if( $cols->[$_] ne $colresidues[$_] );
1644 $char = $matchchars{$type};
1645 last TYPE;
1649 bottom:
1650 $matchline .= $char;
1652 return $matchline;
1656 =head2 gap_line
1658 Title : gap_line()
1659 Usage : $line = $align->gap_line()
1660 Function : Generates a gap line - much like consensus string
1661 except that a line where '-' represents gap
1662 Args : (optional) gap line characters ('-' by default)
1663 Returns : string
1665 =cut
1667 sub gap_line {
1668 my ($self,$gapchar) = @_;
1669 $gapchar = $gapchar || $self->gap_char;
1670 my %gap_hsh; # column gaps vector
1671 foreach my $seq ( $self->each_seq ) {
1672 my $i = 0;
1673 map {$gap_hsh{$_->[0]} = undef} grep {$_->[1] =~ m/[$gapchar]/}
1674 map {[$i++, $_]} split(//, uc ($seq->seq));
1676 my $gap_line;
1677 foreach my $pos ( 0..$self->length-1 ) {
1678 $gap_line .= (exists $gap_hsh{$pos}) ? $self->gap_char:'.';
1680 return $gap_line;
1683 =head2 all_gap_line
1685 Title : all_gap_line()
1686 Usage : $line = $align->all_gap_line()
1687 Function : Generates a gap line - much like consensus string
1688 except that a line where '-' represents all-gap column
1689 Args : (optional) gap line characters ('-' by default)
1690 Returns : string
1692 =cut
1694 sub all_gap_line {
1695 my ($self,$gapchar) = @_;
1696 $gapchar = $gapchar || $self->gap_char;
1697 my %gap_hsh; # column gaps counter hash
1698 my @seqs = $self->each_seq;
1699 foreach my $seq ( @seqs ) {
1700 my $i = 0;
1701 map {$gap_hsh{$_->[0]}++} grep {$_->[1] =~ m/[$gapchar]/}
1702 map {[$i++, $_]} split(//, uc ($seq->seq));
1704 my $gap_line;
1705 foreach my $pos ( 0..$self->length-1 ) {
1706 if (exists $gap_hsh{$pos} && $gap_hsh{$pos} == scalar @seqs) {
1707 # gaps column
1708 $gap_line .= $self->gap_char;
1709 } else {
1710 $gap_line .= '.';
1713 return $gap_line;
1716 =head2 gap_col_matrix
1718 Title : gap_col_matrix()
1719 Usage : my $cols = $align->gap_col_matrix()
1720 Function : Generates an array where each element in the array is a
1721 hash reference with a key of the sequence name and a
1722 value of 1 if the sequence has a gap at that column
1723 Returns : Reference to an array
1724 Args : Optional: gap line character ($aln->gap_char or '-' by default)
1726 =cut
1728 sub gap_col_matrix {
1729 my ( $self, $gapchar ) = @_;
1730 $gapchar = $gapchar || $self->gap_char;
1731 my %gap_hsh; # column gaps vector
1732 my @cols;
1733 foreach my $seq ( $self->each_seq ) {
1734 my $i = 0;
1735 my $str = $seq->seq;
1736 my $len = $seq->length;
1737 my $ch;
1738 my $id = $seq->display_id;
1739 while ( $i < $len ) {
1740 $ch = substr( $str, $i, 1 );
1741 $cols[ $i++ ]->{$id} = ( $ch =~ m/[$gapchar]/ );
1744 return \@cols;
1747 =head2 match
1749 Title : match()
1750 Usage : $ali->match()
1751 Function : Goes through all columns and changes residues that are
1752 identical to residue in first sequence to match '.'
1753 character. Sets match_char.
1755 USE WITH CARE: Most MSA formats do not support match
1756 characters in sequences, so this is mostly for output
1757 only. NEXUS format (Bio::AlignIO::nexus) can handle
1759 Returns : 1 on success
1760 Argument : a match character, optional, defaults to '.'
1762 =cut
1764 sub match {
1765 my ( $self, $match ) = @_;
1767 $match ||= '.';
1768 my ($matching_char) = $match;
1769 $matching_char = "\\$match" if $match =~ /[\^.$|()\[\]]/; #';
1770 $self->map_chars( $matching_char, '-' );
1772 my @seqs = $self->each_seq();
1773 return 1 unless scalar @seqs > 1;
1775 my $refseq = shift @seqs;
1776 my @refseq = split //, $refseq->seq;
1777 my $gapchar = $self->gap_char;
1779 foreach my $seq (@seqs) {
1780 my @varseq = split //, $seq->seq();
1781 for ( my $i = 0; $i < scalar @varseq; $i++ ) {
1782 $varseq[$i] = $match
1783 if defined $refseq[$i]
1784 && ( $refseq[$i] =~ /[A-Za-z\*]/
1785 || $refseq[$i] =~ /$gapchar/ )
1786 && $refseq[$i] eq $varseq[$i];
1788 $seq->seq( join '', @varseq );
1790 $self->match_char($match);
1791 return 1;
1796 =head2 unmatch
1798 Title : unmatch()
1799 Usage : $ali->unmatch()
1800 Function : Undoes the effect of method match. Unsets match_char.
1801 Returns : 1 on success
1802 Argument : a match character, optional, defaults to '.'
1804 See L<match> and L<match_char>
1806 =cut
1808 sub unmatch {
1809 my ( $self, $match ) = @_;
1811 $match ||= '.';
1813 my @seqs = $self->each_seq();
1814 return 1 unless scalar @seqs > 1;
1816 my $refseq = shift @seqs;
1817 my @refseq = split //, $refseq->seq;
1818 my $gapchar = $self->gap_char;
1819 foreach my $seq (@seqs) {
1820 my @varseq = split //, $seq->seq();
1821 for ( my $i = 0; $i < scalar @varseq; $i++ ) {
1822 $varseq[$i] = $refseq[$i]
1823 if defined $refseq[$i]
1824 && ( $refseq[$i] =~ /[A-Za-z\*]/
1825 || $refseq[$i] =~ /$gapchar/ )
1826 && $varseq[$i] eq $match;
1828 $seq->seq( join '', @varseq );
1830 $self->match_char('');
1831 return 1;
1835 =head1 MSA attributes
1837 Methods for setting and reading the MSA attributes.
1839 Note that the methods defining character semantics depend on the user
1840 to set them sensibly. They are needed only by certain input/output
1841 methods. Unset them by setting to an empty string ('').
1843 =head2 id
1845 Title : id
1846 Usage : $myalign->id("Ig")
1847 Function : Gets/sets the id field of the alignment
1848 Returns : An id string
1849 Argument : An id string (optional)
1851 =cut
1853 sub id {
1854 my ( $self, $name ) = @_;
1856 if ( defined($name) ) {
1857 $self->{'_id'} = $name;
1860 return $self->{'_id'};
1863 =head2 accession
1865 Title : accession
1866 Usage : $myalign->accession("PF00244")
1867 Function : Gets/sets the accession field of the alignment
1868 Returns : An acc string
1869 Argument : An acc string (optional)
1871 =cut
1873 sub accession {
1874 my ( $self, $acc ) = @_;
1876 if ( defined($acc) ) {
1877 $self->{'_accession'} = $acc;
1880 return $self->{'_accession'};
1883 =head2 description
1885 Title : description
1886 Usage : $myalign->description("14-3-3 proteins")
1887 Function : Gets/sets the description field of the alignment
1888 Returns : An description string
1889 Argument : An description string (optional)
1891 =cut
1893 sub description {
1894 my ( $self, $name ) = @_;
1896 if ( defined($name) ) {
1897 $self->{'_description'} = $name;
1900 return $self->{'_description'};
1903 =head2 missing_char
1905 Title : missing_char
1906 Usage : $myalign->missing_char("?")
1907 Function : Gets/sets the missing_char attribute of the alignment
1908 It is generally recommended to set it to 'n' or 'N'
1909 for nucleotides and to 'X' for protein.
1910 Returns : An missing_char string,
1911 Argument : An missing_char string (optional)
1913 =cut
1915 sub missing_char {
1916 my ( $self, $char ) = @_;
1918 if ( defined $char ) {
1919 $self->throw("Single missing character, not [$char]!")
1920 if CORE::length($char) > 1;
1921 $self->{'_missing_char'} = $char;
1924 return $self->{'_missing_char'};
1928 =head2 match_char
1930 Title : match_char
1931 Usage : $myalign->match_char('.')
1932 Function : Gets/sets the match_char attribute of the alignment
1933 Returns : An match_char string,
1934 Argument : An match_char string (optional)
1936 =cut
1938 sub match_char {
1939 my ( $self, $char ) = @_;
1941 if ( defined $char ) {
1942 $self->throw("Single match character, not [$char]!")
1943 if CORE::length($char) > 1;
1944 $self->{'_match_char'} = $char;
1947 return $self->{'_match_char'};
1950 =head2 gap_char
1952 Title : gap_char
1953 Usage : $myalign->gap_char('-')
1954 Function : Gets/sets the gap_char attribute of the alignment
1955 Returns : An gap_char string, defaults to '-'
1956 Argument : An gap_char string (optional)
1958 =cut
1960 sub gap_char {
1961 my ( $self, $char ) = @_;
1963 if ( defined $char || !defined $self->{'_gap_char'} ) {
1964 $char = '-' unless defined $char;
1965 $self->throw("Single gap character, not [$char]!")
1966 if CORE::length($char) > 1;
1967 $self->{'_gap_char'} = $char;
1969 return $self->{'_gap_char'};
1973 =head2 symbol_chars
1975 Title : symbol_chars
1976 Usage : my @symbolchars = $aln->symbol_chars;
1977 Function: Returns all the seen symbols (other than gaps)
1978 Returns : array of characters that are the seen symbols
1979 Args : boolean to include the gap/missing/match characters
1981 =cut
1983 sub symbol_chars{
1984 my ($self,$includeextra) = @_;
1986 unless ($self->{'_symbols'}) {
1987 foreach my $seq ($self->each_seq) {
1988 map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq);
1991 my %copy = %{$self->{'_symbols'}};
1992 if( ! $includeextra ) {
1993 foreach my $char ( $self->gap_char, $self->match_char,
1994 $self->missing_char) {
1995 delete $copy{$char} if( defined $char );
1998 return keys %copy;
2001 =head1 Alignment descriptors
2003 These read only methods describe the MSA in various ways.
2006 =head2 score
2008 Title : score
2009 Usage : $str = $ali->score()
2010 Function : get/set a score of the alignment
2011 Returns : a score for the alignment
2012 Argument : an optional score to set
2014 =cut
2016 sub score {
2017 my $self = shift;
2018 $self->{score} = shift if @_;
2019 return $self->{score};
2022 =head2 consensus_string
2024 Title : consensus_string
2025 Usage : $str = $ali->consensus_string($threshold_percent)
2026 Function : Makes a strict consensus
2027 Returns : Consensus string
2028 Argument : Optional threshold ranging from 0 to 100.
2029 The consensus residue has to appear at least threshold %
2030 of the sequences at a given location, otherwise a '?'
2031 character will be placed at that location.
2032 (Default value = 0%)
2034 =cut
2036 sub consensus_string {
2037 my $self = shift;
2038 my $threshold = shift;
2040 my $out = "";
2041 my $len = $self->length - 1;
2043 foreach ( 0 .. $len ) {
2044 $out .= $self->_consensus_aa( $_, $threshold );
2046 return $out;
2050 =head2 consensus_conservation
2052 Title : consensus_conservation
2053 Usage : @conservation = $ali->consensus_conservation();
2054 Function : Conservation (as a percent) of each position of alignment
2055 Returns : Array of percentages [0-100]. Gap columns are 0% conserved.
2056 Argument :
2058 =cut
2060 sub consensus_conservation {
2061 my $self = shift;
2062 my @cons;
2063 my $num_sequences = $self->num_sequences;
2064 foreach my $point (0..$self->length-1) {
2065 my %hash = $self->_consensus_counts($point);
2066 # max frequency of a non-gap letter
2067 my $max = (sort {$b<=>$a} values %hash )[0];
2068 push @cons, 100 * $max / $num_sequences;
2070 return @cons;
2073 sub _consensus_aa {
2074 my $self = shift;
2075 my $point = shift;
2076 my $threshold_percent = shift || -1 ;
2077 my ($seq,%hash,$count,$letter,$key);
2078 my $gapchar = $self->gap_char;
2079 %hash = $self->_consensus_counts($point);
2080 my $number_of_sequences = $self->num_sequences();
2081 my $threshold = $number_of_sequences * $threshold_percent / 100. ;
2082 $count = -1;
2083 $letter = '?';
2085 foreach $key ( sort keys %hash ) {
2086 # print "Now at $key $hash{$key}\n";
2087 if( $hash{$key} > $count && $hash{$key} >= $threshold) {
2088 $letter = $key;
2089 $count = $hash{$key};
2092 return $letter;
2095 # Frequency of each letter in one column
2096 sub _consensus_counts {
2097 my $self = shift;
2098 my $point = shift;
2099 my %hash;
2100 my $gapchar = $self->gap_char;
2101 foreach my $seq ( $self->each_seq() ) {
2102 my $letter = substr($seq->seq,$point,1);
2103 $self->throw("--$point-----------") if $letter eq '';
2104 ($letter eq $gapchar || $letter =~ /\./) && next;
2105 $hash{$letter}++;
2107 return %hash;
2111 =head2 consensus_iupac
2113 Title : consensus_iupac
2114 Usage : $str = $ali->consensus_iupac()
2115 Function : Makes a consensus using IUPAC ambiguity codes from DNA
2116 and RNA. The output is in upper case except when gaps in
2117 a column force output to be in lower case.
2119 Note that if your alignment sequences contain a lot of
2120 IUPAC ambiquity codes you often have to manually set
2121 alphabet. Bio::PrimarySeq::_guess_type thinks they
2122 indicate a protein sequence.
2123 Returns : consensus string
2124 Argument : none
2125 Throws : on protein sequences
2127 =cut
2129 sub consensus_iupac {
2130 my $self = shift;
2131 my $out = "";
2132 my $len = $self->length - 1;
2134 # only DNA and RNA sequences are valid
2135 foreach my $seq ( $self->each_seq() ) {
2136 $self->throw( "Seq [" . $seq->get_nse . "] is a protein" )
2137 if $seq->alphabet eq 'protein';
2140 # loop over the alignment columns
2141 foreach my $count ( 0 .. $len ) {
2142 $out .= $self->_consensus_iupac($count);
2144 return $out;
2147 sub _consensus_iupac {
2148 my ($self, $column) = @_;
2149 my ($string, $char, $rna);
2151 #determine all residues in a column
2152 foreach my $seq ( $self->each_seq() ) {
2153 $string .= substr($seq->seq, $column, 1);
2155 $string = uc $string;
2157 # quick exit if there's an N in the string
2158 if ($string =~ /N/) {
2159 $string =~ /\W/ ? return 'n' : return 'N';
2161 # ... or if there are only gap characters
2162 return '-' if $string =~ /^\W+$/;
2164 # treat RNA as DNA in regexps
2165 if ($string =~ /U/) {
2166 $string =~ s/U/T/;
2167 $rna = 1;
2170 # the following s///'s only need to be done to the _first_ ambiguity code
2171 # as we only need to see the _range_ of characters in $string
2173 if ($string =~ /[VDHB]/) {
2174 $string =~ s/V/AGC/;
2175 $string =~ s/D/AGT/;
2176 $string =~ s/H/ACT/;
2177 $string =~ s/B/CTG/;
2180 if ($string =~ /[SKYRWM]/) {
2181 $string =~ s/S/GC/;
2182 $string =~ s/K/GT/;
2183 $string =~ s/Y/CT/;
2184 $string =~ s/R/AG/;
2185 $string =~ s/W/AT/;
2186 $string =~ s/M/AC/;
2189 # and now the guts of the thing
2191 if ($string =~ /A/) {
2192 $char = 'A'; # A A
2193 if ($string =~ /G/) {
2194 $char = 'R'; # A and G (purines) R
2195 if ($string =~ /C/) {
2196 $char = 'V'; # A and G and C V
2197 if ($string =~ /T/) {
2198 $char = 'N'; # A and G and C and T N
2200 } elsif ($string =~ /T/) {
2201 $char = 'D'; # A and G and T D
2203 } elsif ($string =~ /C/) {
2204 $char = 'M'; # A and C M
2205 if ($string =~ /T/) {
2206 $char = 'H'; # A and C and T H
2208 } elsif ($string =~ /T/) {
2209 $char = 'W'; # A and T W
2211 } elsif ($string =~ /C/) {
2212 $char = 'C'; # C C
2213 if ($string =~ /T/) {
2214 $char = 'Y'; # C and T (pyrimidines) Y
2215 if ($string =~ /G/) {
2216 $char = 'B'; # C and T and G B
2218 } elsif ($string =~ /G/) {
2219 $char = 'S'; # C and G S
2221 } elsif ($string =~ /G/) {
2222 $char = 'G'; # G G
2223 if ($string =~ /C/) {
2224 $char = 'S'; # G and C S
2225 } elsif ($string =~ /T/) {
2226 $char = 'K'; # G and T K
2228 } elsif ($string =~ /T/) {
2229 $char = 'T'; # T T
2232 $char = 'U' if $rna and $char eq 'T';
2233 $char = lc $char if $string =~ /\W/;
2235 return $char;
2239 =head2 consensus_meta
2241 Title : consensus_meta
2242 Usage : $seqmeta = $ali->consensus_meta()
2243 Function : Returns a Bio::Seq::Meta object containing the consensus
2244 strings derived from meta data analysis.
2245 Returns : Bio::Seq::Meta
2246 Argument : Bio::Seq::Meta
2247 Throws : non-MetaI object
2249 =cut
2251 sub consensus_meta {
2252 my ($self, $meta) = @_;
2253 if ($meta && (!ref $meta || !$meta->isa('Bio::Seq::MetaI'))) {
2254 $self->throw('Not a Bio::Seq::MetaI object');
2256 return $self->{'_aln_meta'} = $meta if $meta;
2257 return $self->{'_aln_meta'}
2260 =head2 is_flush
2262 Title : is_flush
2263 Usage : if ( $ali->is_flush() )
2264 Function : Tells you whether the alignment
2265 : is flush, i.e. all of the same length
2266 Returns : 1 or 0
2267 Argument :
2269 =cut
2271 sub is_flush {
2272 my ( $self, $report ) = @_;
2273 my $seq;
2274 my $length = (-1);
2275 my $temp;
2277 foreach $seq ( $self->each_seq() ) {
2278 if ( $length == (-1) ) {
2279 $length = CORE::length( $seq->seq() );
2280 next;
2283 $temp = CORE::length( $seq->seq() );
2284 if ( $temp != $length ) {
2285 $self->warn(
2286 "expecting $length not $temp from " . $seq->display_id )
2287 if ($report);
2288 $self->debug(
2289 "expecting $length not $temp from " . $seq->display_id );
2290 $self->debug( $seq->seq() . "\n" );
2291 return 0;
2295 return 1;
2299 =head2 length
2301 Title : length()
2302 Usage : $len = $ali->length()
2303 Function : Returns the maximum length of the alignment.
2304 To be sure the alignment is a block, use is_flush
2305 Returns : Integer
2306 Argument :
2308 =cut
2310 sub length_aln {
2311 my $self = shift;
2312 $self->deprecated("length_aln - deprecated method. Use length() instead.");
2313 $self->length(@_);
2316 sub length {
2317 my $self = shift;
2318 my $seq;
2319 my $length = -1;
2320 my $temp;
2322 foreach $seq ( $self->each_seq() ) {
2323 $temp = $seq->length();
2324 if( $temp > $length ) {
2325 $length = $temp;
2329 return $length;
2333 =head2 maxdisplayname_length
2335 Title : maxdisplayname_length
2336 Usage : $ali->maxdisplayname_length()
2337 Function : Gets the maximum length of the displayname in the
2338 alignment. Used in writing out various MSA formats.
2339 Returns : integer
2340 Argument :
2342 =cut
2344 sub maxname_length {
2345 my $self = shift;
2346 $self->deprecated("maxname_length - deprecated method.".
2347 " Use maxdisplayname_length() instead.");
2348 $self->maxdisplayname_length();
2351 sub maxnse_length {
2352 my $self = shift;
2353 $self->deprecated("maxnse_length - deprecated method.".
2354 " Use maxnse_length() instead.");
2355 $self->maxdisplayname_length();
2358 sub maxdisplayname_length {
2359 my $self = shift;
2360 my $maxname = (-1);
2361 my ( $seq, $len );
2363 foreach $seq ( $self->each_seq() ) {
2364 $len = CORE::length $self->displayname( $seq->get_nse() );
2366 if ( $len > $maxname ) {
2367 $maxname = $len;
2371 return $maxname;
2374 =head2 max_metaname_length
2376 Title : max_metaname_length
2377 Usage : $ali->max_metaname_length()
2378 Function : Gets the maximum length of the meta name tags in the
2379 alignment for the sequences and for the alignment.
2380 Used in writing out various MSA formats.
2381 Returns : integer
2382 Argument : None
2384 =cut
2386 sub max_metaname_length {
2387 my $self = shift;
2388 my $maxname = (-1);
2389 my ($seq,$len);
2391 # check seq meta first
2392 for $seq ( $self->each_seq() ) {
2393 next if !$seq->isa('Bio::Seq::MetaI' || !$seq->meta_names);
2394 for my $mtag ($seq->meta_names) {
2395 $len = CORE::length $mtag;
2396 if( $len > $maxname ) {
2397 $maxname = $len;
2402 # alignment meta
2403 for my $meta ($self->consensus_meta) {
2404 next unless $meta;
2405 for my $name ($meta->meta_names) {
2406 $len = CORE::length $name;
2407 if( $len > $maxname ) {
2408 $maxname = $len;
2413 return $maxname;
2416 =head2 num_residues
2418 Title : num_residues
2419 Usage : $no = $ali->num_residues
2420 Function : number of residues in total in the alignment
2421 Returns : integer
2422 Argument :
2423 Note : replaces no_residues()
2425 =cut
2427 sub num_residues {
2428 my $self = shift;
2429 my $count = 0;
2431 foreach my $seq ( $self->each_seq ) {
2432 my $str = $seq->seq();
2434 $count += ( $str =~ s/[A-Za-z]//g );
2437 return $count;
2440 =head2 num_sequences
2442 Title : num_sequences
2443 Usage : $depth = $ali->num_sequences
2444 Function : number of sequence in the sequence alignment
2445 Returns : integer
2446 Argument : none
2447 Note : replaces no_sequences()
2449 =cut
2451 sub num_sequences {
2452 my $self = shift;
2453 return scalar($self->each_seq);
2456 =head2 average_percentage_identity
2458 Title : average_percentage_identity
2459 Usage : $id = $align->average_percentage_identity
2460 Function: The function uses a fast method to calculate the average
2461 percentage identity of the alignment
2462 Returns : The average percentage identity of the alignment
2463 Args : None
2464 Notes : This method implemented by Kevin Howe calculates a figure that is
2465 designed to be similar to the average pairwise identity of the
2466 alignment (identical in the absence of gaps), without having to
2467 explicitly calculate pairwise identities proposed by Richard Durbin.
2468 Validated by Ewan Birney ad Alex Bateman.
2470 =cut
2472 sub average_percentage_identity{
2473 my ($self,@args) = @_;
2475 my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
2476 'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
2478 my ($len, $total, $subtotal, $divisor, $subdivisor, @seqs, @countHashes);
2480 if (! $self->is_flush()) {
2481 $self->throw("All sequences in the alignment must be the same length");
2484 @seqs = $self->each_seq();
2485 $len = $self->length();
2487 # load the each hash with correct keys for existence checks
2489 for( my $index=0; $index < $len; $index++) {
2490 foreach my $letter (@alphabet) {
2491 $countHashes[$index]->{$letter} = 0;
2494 foreach my $seq (@seqs) {
2495 my @seqChars = split //, $seq->seq();
2496 for( my $column=0; $column < @seqChars; $column++ ) {
2497 my $char = uc($seqChars[$column]);
2498 if (exists $countHashes[$column]->{$char}) {
2499 $countHashes[$column]->{$char}++;
2504 $total = 0;
2505 $divisor = 0;
2506 for(my $column =0; $column < $len; $column++) {
2507 my %hash = %{$countHashes[$column]};
2508 $subdivisor = 0;
2509 foreach my $res (keys %hash) {
2510 $total += $hash{$res}*($hash{$res} - 1);
2511 $subdivisor += $hash{$res};
2513 $divisor += $subdivisor * ($subdivisor - 1);
2515 return $divisor > 0 ? ($total / $divisor )*100.0 : 0;
2518 =head2 percentage_identity
2520 Title : percentage_identity
2521 Usage : $id = $align->percentage_identity
2522 Function: The function calculates the average percentage identity
2523 (aliased to average_percentage_identity)
2524 Returns : The average percentage identity
2525 Args : None
2527 =cut
2529 sub percentage_identity {
2530 my $self = shift;
2531 return $self->average_percentage_identity();
2534 =head2 overall_percentage_identity
2536 Title : overall_percentage_identity
2537 Usage : $id = $align->overall_percentage_identity
2538 $id = $align->overall_percentage_identity('short')
2539 Function: The function calculates the percentage identity of
2540 the conserved columns
2541 Returns : The percentage identity of the conserved columns
2542 Args : length value to use, optional defaults to alignment length
2543 possible values: 'align', 'short', 'long'
2545 The argument values 'short' and 'long' refer to shortest and longest
2546 sequence in the alignment. Method modification code by Hongyu Zhang.
2548 =cut
2550 sub overall_percentage_identity{
2551 my ($self, $length_measure) = @_;
2553 my %alphabet = map {$_ => undef} qw (A C G T U B D E F H I J K L M N O P Q R S V W X Y Z);
2555 my %enum = map {$_ => undef} qw (align short long);
2557 $self->throw("Unknown argument [$length_measure]")
2558 if $length_measure and not exists $enum{$length_measure};
2559 $length_measure ||= 'align';
2561 if (! $self->is_flush()) {
2562 $self->throw("All sequences in the alignment must be the same length");
2565 # Count the residues seen at each position
2566 my $len;
2567 my $total = 0; # number of positions with identical residues
2568 my @countHashes;
2569 my @seqs = $self->each_seq;
2570 my $nof_seqs = scalar @seqs;
2571 my $aln_len = $self->length();
2572 for my $seq (@seqs) {
2573 my $seqstr = $seq->seq;
2575 # Count residues for given sequence
2576 for my $column (0 .. $aln_len-1) {
2577 my $char = uc( substr($seqstr, $column, 1) );
2578 if ( exists $alphabet{$char} ) {
2580 # This is a valid char
2581 if ( defined $countHashes[$column]->{$char} ) {
2582 $countHashes[$column]->{$char}++;
2583 } else {
2584 $countHashes[$column]->{$char} = 1;
2587 if ( $countHashes[$column]->{$char} == $nof_seqs ) {
2588 # All sequences have this same residue
2589 $total++;
2595 # Sequence length
2596 if ($length_measure eq 'short' || $length_measure eq 'long') {
2597 my $seq_len = $seqstr =~ tr/[A-Za-z]//;
2598 if ($length_measure eq 'short') {
2599 if ( (not defined $len) || ($seq_len < $len) ) {
2600 $len = $seq_len;
2602 } elsif ($length_measure eq 'long') {
2603 if ( (not defined $len) || ($seq_len > $len) ) {
2604 $len = $seq_len;
2611 if ($length_measure eq 'align') {
2612 $len = $aln_len;
2615 return ($total / $len ) * 100.0;
2620 =head1 Alignment positions
2622 Methods to map a sequence position into an alignment column and back.
2623 column_from_residue_number() does the former. The latter is really a
2624 property of the sequence object and can done using
2625 L<Bio::LocatableSeq::location_from_column>:
2627 # select somehow a sequence from the alignment, e.g.
2628 my $seq = $aln->get_seq_by_pos(1);
2629 #$loc is undef or Bio::LocationI object
2630 my $loc = $seq->location_from_column(5);
2632 =head2 column_from_residue_number
2634 Title : column_from_residue_number
2635 Usage : $col = $ali->column_from_residue_number( $seqname, $resnumber)
2636 Function: This function gives the position in the alignment
2637 (i.e. column number) of the given residue number in the
2638 sequence with the given name. For example, for the
2639 alignment
2641 Seq1/91-97 AC..DEF.GH.
2642 Seq2/24-30 ACGG.RTY...
2643 Seq3/43-51 AC.DDEF.GHI
2645 column_from_residue_number( "Seq1", 94 ) returns 6.
2646 column_from_residue_number( "Seq2", 25 ) returns 2.
2647 column_from_residue_number( "Seq3", 50 ) returns 10.
2649 An exception is thrown if the residue number would lie
2650 outside the length of the aligment
2651 (e.g. column_from_residue_number( "Seq2", 22 )
2653 Note: If the the parent sequence is represented by more than
2654 one alignment sequence and the residue number is present in
2655 them, this method finds only the first one.
2657 Returns : A column number for the position in the alignment of the
2658 given residue in the given sequence (1 = first column)
2659 Args : A sequence id/name (not a name/start-end)
2660 A residue number in the whole sequence (not just that
2661 segment of it in the alignment)
2663 =cut
2665 sub column_from_residue_number {
2666 my ( $self, $name, $resnumber ) = @_;
2668 $self->throw("No sequence with name [$name]")
2669 unless $self->{'_start_end_lists'}->{$name};
2670 $self->throw("Second argument residue number missing") unless $resnumber;
2672 foreach my $seq ( $self->each_seq_with_id($name) ) {
2673 my $col;
2674 eval { $col = $seq->column_from_residue_number($resnumber); };
2675 next if $@;
2676 return $col;
2679 $self->throw( "Could not find a sequence segment in $name "
2680 . "containing residue number $resnumber" );
2684 =head1 Sequence names
2686 Methods to manipulate the display name. The default name based on the
2687 sequence id and subsequence positions can be overridden in various
2688 ways.
2690 =head2 displayname
2692 Title : displayname
2693 Usage : $myalign->displayname("Ig", "IgA")
2694 Function : Gets/sets the display name of a sequence in the alignment
2695 Returns : A display name string
2696 Argument : name of the sequence
2697 displayname of the sequence (optional)
2699 =cut
2701 sub displayname {
2702 my ( $self, $name, $disname ) = @_;
2704 $self->throw("No sequence with name [$name]")
2705 unless defined $self->{'_seq'}->{$name};
2707 if ( $disname and $name ) {
2708 $self->{'_dis_name'}->{$name} = $disname;
2709 return $disname;
2711 elsif ( defined $self->{'_dis_name'}->{$name} ) {
2712 return $self->{'_dis_name'}->{$name};
2714 else {
2715 return $name;
2719 sub get_displayname {
2720 my $self = shift;
2721 $self->deprecated("get_displayname - deprecated method. Use displayname() instead.");
2722 $self->displayname(@_);
2725 sub set_displayname {
2726 my $self = shift;
2727 $self->deprecated("set_displayname - deprecated method. Use displayname() instead.");
2728 $self->displayname(@_);
2732 =head2 set_displayname_count
2734 Title : set_displayname_count
2735 Usage : $ali->set_displayname_count
2736 Function : Sets the names to be name_# where # is the number of
2737 times this name has been used.
2738 Returns : 1, on success
2739 Argument :
2741 =cut
2743 sub set_displayname_count {
2744 my $self= shift;
2745 my (@arr,$name,$seq,$count,$temp,$nse);
2747 foreach $seq ( $self->each_alphabetically() ) {
2748 $nse = $seq->get_nse();
2750 #name will be set when this is the second
2751 #time (or greater) is has been seen
2753 if( defined $name and $name eq ($seq->id()) ) {
2754 $temp = sprintf("%s_%s",$name,$count);
2755 $self->displayname($nse,$temp);
2756 $count++;
2757 } else {
2758 $count = 1;
2759 $name = $seq->id();
2760 $temp = sprintf("%s_%s",$name,$count);
2761 $self->displayname($nse,$temp);
2762 $count++;
2765 return 1;
2768 =head2 set_displayname_flat
2770 Title : set_displayname_flat
2771 Usage : $ali->set_displayname_flat()
2772 Function : Makes all the sequences be displayed as just their name,
2773 not name/start-end (NSE)
2774 Returns : 1
2775 Argument :
2777 =cut
2779 sub set_displayname_flat {
2780 my $self = shift;
2781 my ( $nse, $seq );
2783 foreach $seq ( $self->each_seq() ) {
2784 $nse = $seq->get_nse();
2785 $self->displayname( $nse, $seq->id() );
2787 return 1;
2791 =head2 set_displayname_normal
2793 Title : set_displayname_normal
2794 Usage : $ali->set_displayname_normal()
2795 Function : Makes all the sequences be displayed as name/start-end (NSE)
2796 Returns : 1, on success
2797 Argument :
2799 =cut
2801 sub set_displayname_normal {
2802 my $self = shift;
2803 my ( $nse, $seq );
2805 foreach $seq ( $self->each_seq() ) {
2806 $nse = $seq->get_nse();
2807 $self->displayname( $nse, $nse );
2809 return 1;
2812 =head2 source
2814 Title : source
2815 Usage : $obj->source($newval)
2816 Function: sets the Alignment source program
2817 Example :
2818 Returns : value of source
2819 Args : newvalue (optional)
2821 =cut
2823 sub source {
2824 my ( $self, $value ) = @_;
2825 if ( defined $value ) {
2826 $self->{'_source'} = $value;
2828 return $self->{'_source'};
2832 =head2 set_displayname_safe
2834 Title : set_displayname_safe
2835 Usage : ($new_aln, $ref_name)=$ali->set_displayname_safe(4)
2836 Function : Assign machine-generated serial names to sequences in input order.
2837 Designed to protect names during PHYLIP runs. Assign 10-char string
2838 in the form of "S000000001" to "S999999999". Restore the original
2839 names using "restore_displayname".
2840 Returns : 1. a new $aln with system names;
2841 2. a hash ref for restoring names
2842 Argument : Number for id length (default 10)
2844 =cut
2846 sub set_displayname_safe {
2847 my $self = shift;
2848 my $idlength = shift || 10;
2849 my ( $seq, %phylip_name );
2850 my $ct = 0;
2851 my $new = Bio::SimpleAlign->new();
2852 foreach $seq ( $self->each_seq() ) {
2853 $ct++;
2854 my $pname = "S" . sprintf "%0" . ( $idlength - 1 ) . "s", $ct;
2855 $phylip_name{$pname} = $seq->id();
2856 my $new_seq = Bio::LocatableSeq->new(
2857 -id => $pname,
2858 -seq => $seq->seq(),
2859 -alphabet => $seq->alphabet,
2860 -start => $seq->{_start},
2861 -end => $seq->{_end}
2863 $new->add_seq($new_seq);
2866 $self->debug(
2867 "$ct seq names changed. Restore names by using restore_displayname.");
2868 return ( $new, \%phylip_name );
2872 =head2 restore_displayname
2874 Title : restore_displayname
2875 Usage : $aln_name_restored=$ali->restore_displayname($hash_ref)
2876 Function : Restore original sequence names (after running
2877 $ali->set_displayname_safe)
2878 Returns : a new $aln with names restored.
2879 Argument : a hash reference of names from "set_displayname_safe".
2881 =cut
2883 sub restore_displayname {
2884 my $self = shift;
2885 my $ref=shift;
2886 my %name=%$ref;
2887 my $new=Bio::SimpleAlign->new();
2888 foreach my $seq ( $self->each_seq() ) {
2889 $self->throw("No sequence with name") unless defined $name{$seq->id()};
2890 my $new_seq= Bio::LocatableSeq->new(-id => $name{$seq->id()},
2891 -seq => $seq->seq(),
2892 -alphabet => $seq->alphabet,
2893 -start => $seq->{_start},
2894 -end => $seq->{_end}
2896 $new->add_seq($new_seq);
2898 return $new;
2901 =head2 sort_by_start
2903 Title : sort_by_start
2904 Usage : $ali->sort_by_start
2905 Function : Changes the order of the alignment to the start position of each
2906 subalignment
2907 Returns : 1 on success
2908 Argument :
2910 =cut
2912 sub sort_by_start {
2913 my $self = shift;
2914 my ($seq,$nse,@arr,%hash,$count);
2915 foreach $seq ( $self->each_seq() ) {
2916 $nse = $seq->get_nse;
2917 $hash{$nse} = $seq;
2919 $count = 0;
2920 %{$self->{'_order'}} = (); # reset the hash;
2921 foreach $nse ( sort _startend keys %hash) {
2922 $self->{'_order'}->{$count} = $nse;
2923 $count++;
2928 sub _startend {
2929 my ($aname,$arange) = split (/[\/]/,$a);
2930 my ($bname,$brange) = split (/[\/]/,$b);
2931 my ($astart,$aend) = split(/\-/,$arange);
2932 my ($bstart,$bend) = split(/\-/,$brange);
2933 return $astart <=> $bstart;
2936 =head2 bracket_string
2938 Title : bracket_string
2939 Usage : my @params = (-refseq => 'testseq',
2940 -allele1 => 'allele1',
2941 -allele2 => 'allele2',
2942 -delimiters => '{}',
2943 -separator => '/');
2944 $str = $aln->bracket_string(@params)
2946 Function : When supplied with a list of parameters (see below), returns a
2947 string in BIC format. This is used for allelic comparisons.
2948 Briefly, if either allele contains a base change when compared to
2949 the refseq, the base or gap for each allele is represented in
2950 brackets in the order present in the 'alleles' parameter.
2952 For the following data:
2954 >testseq
2955 GGATCCATTGCTACT
2956 >allele1
2957 GGATCCATTCCTACT
2958 >allele2
2959 GGAT--ATTCCTCCT
2961 the returned string with parameters 'refseq => testseq' and
2962 'alleles => [qw(allele1 allele2)]' would be:
2964 GGAT[C/-][C/-]ATT[C/C]CT[A/C]CT
2965 Returns : BIC-formatted string
2966 Argument : Required args
2967 refseq : string (ID) of the reference sequence used
2968 as basis for comparison
2969 allele1 : string (ID) of the first allele
2970 allele2 : string (ID) of the second allele
2971 Optional args
2972 delimiters: two symbol string of left and right delimiters.
2973 Only the first two symbols are used
2974 default = '[]'
2975 separator : string used as a separator. Only the first
2976 symbol is used
2977 default = '/'
2978 Throws : On no refseq/alleles, or invalid refseq/alleles.
2980 =cut
2982 sub bracket_string {
2983 my ($self, @args) = @_;
2984 my ($ref, $a1, $a2, $delim, $sep) =
2985 $self->_rearrange([qw(refseq allele1 allele2 delimiters separator)], @args);
2986 $self->throw('Missing refseq/allele1/allele2') if (!$a1 || !$a2 || !$ref);
2987 my ($ld, $rd);
2988 ($ld, $rd) = split('', $delim, 2) if $delim;
2989 $ld ||= '[';
2990 $rd ||= ']';
2991 $sep ||= '/';
2992 my ($refseq, $allele1, $allele2) =
2993 map {( $self->each_seq_with_id($_) )} ($ref, $a1, $a2);
2994 if (!$refseq || !$allele1 || !$allele2) {
2995 $self->throw("One of your refseq/allele IDs is invalid!");
2997 my $len = $self->length-1;
2998 my $bic = '';
2999 # loop over the alignment columns
3000 for my $column ( 0 .. $len ) {
3001 my $string;
3002 my ($compres, $res1, $res2) =
3003 map{substr($_->seq, $column, 1)} ($refseq, $allele1, $allele2);
3004 # are any of the allele symbols different from the refseq?
3005 $string = ($compres eq $res1 && $compres eq $res2) ? $compres :
3006 $ld.$res1.$sep.$res2.$rd;
3007 $bic .= $string;
3009 return $bic;
3013 =head2 methods implementing Bio::FeatureHolderI
3015 FeatureHolderI implementation to support labeled character sets like one
3016 would get from NEXUS represented data.
3018 =head2 get_SeqFeatures
3020 Usage : @features = $aln->get_SeqFeatures
3021 Function: Get the feature objects held by this feature holder.
3022 Example :
3023 Returns : an array of Bio::SeqFeatureI implementing objects
3024 Args : optional filter coderef, taking a Bio::SeqFeatureI
3025 : as argument, returning TRUE if wanted, FALSE if
3026 : unwanted
3028 =cut
3030 sub get_SeqFeatures {
3031 my $self = shift;
3032 my $filter_cb = shift;
3033 $self->throw("Arg (filter callback) must be a coderef")
3034 unless !defined($filter_cb)
3035 or ref($filter_cb) eq 'CODE';
3036 if ( !defined $self->{'_as_feat'} ) {
3037 $self->{'_as_feat'} = [];
3039 if ($filter_cb) {
3040 return grep { $filter_cb->($_) } @{ $self->{'_as_feat'} };
3042 return @{ $self->{'_as_feat'} };
3046 =head2 add_SeqFeature
3048 Usage : $aln->add_SeqFeature($subfeat);
3049 Function: Adds a SeqFeature into the SeqFeature array. The 'EXPAND' qualifier
3050 (see L<Bio::FeatureHolderI>) is supported, but has no effect.
3051 Example :
3052 Returns : 1 on success
3053 Args : a Bio::SeqFeatureI object
3055 =cut
3057 sub add_SeqFeature {
3058 my ($self, @feat) = @_;
3060 $self->{'_as_feat'} = [] unless $self->{'_as_feat'};
3062 if (scalar @feat > 1) {
3063 $self->deprecated(
3064 -message => 'Providing an array of features to Bio::SimpleAlign add_SeqFeature()'.
3065 ' is deprecated and will be removed in a future version. '.
3066 'Add a single feature at a time instead.',
3067 -warn_version => 1.007,
3068 -throw_version => 1.009,
3072 for my $feat ( @feat ) {
3074 next if $feat eq 'EXPAND'; # Need to support it for FeatureHolderI compliance
3076 if( !$feat->isa("Bio::SeqFeatureI") ) {
3077 $self->throw("Expected a Bio::SeqFeatureI object, but got a $feat.");
3080 push @{$self->{'_as_feat'}}, $feat;
3082 return 1;
3086 =head2 remove_SeqFeatures
3088 Usage : $obj->remove_SeqFeatures
3089 Function: Removes all SeqFeatures. If you want to remove only a subset,
3090 remove that subset from the returned array, and add back the rest.
3091 Returns : The array of Bio::SeqFeatureI features that was
3092 deleted from this alignment.
3093 Args : none
3095 =cut
3097 sub remove_SeqFeatures {
3098 my $self = shift;
3100 return () unless $self->{'_as_feat'};
3101 my @feats = @{$self->{'_as_feat'}};
3102 $self->{'_as_feat'} = [];
3103 return @feats;
3106 =head2 feature_count
3108 Title : feature_count
3109 Usage : $obj->feature_count()
3110 Function: Return the number of SeqFeatures attached to the alignment
3111 Returns : integer representing the number of SeqFeatures
3112 Args : None
3114 =cut
3116 sub feature_count {
3117 my ($self) = @_;
3119 if (defined($self->{'_as_feat'})) {
3120 return ($#{$self->{'_as_feat'}} + 1);
3121 } else {
3122 return 0;
3126 =head2 get_all_SeqFeatures
3128 Title : get_all_SeqFeatures
3129 Usage :
3130 Function: Get all SeqFeatures.
3131 Example :
3132 Returns : an array of Bio::SeqFeatureI implementing objects
3133 Args : none
3134 Note : Falls through to Bio::FeatureHolderI implementation.
3136 =cut
3138 =head2 methods for Bio::AnnotatableI
3140 AnnotatableI implementation to support sequence alignments which
3141 contain annotation (NEXUS, Stockholm).
3143 =head2 annotation
3145 Title : annotation
3146 Usage : $ann = $aln->annotation or
3147 $aln->annotation($ann)
3148 Function: Gets or sets the annotation
3149 Returns : Bio::AnnotationCollectionI object
3150 Args : None or Bio::AnnotationCollectionI object
3152 See L<Bio::AnnotationCollectionI> and L<Bio::Annotation::Collection>
3153 for more information
3155 =cut
3157 sub annotation {
3158 my ($obj,$value) = @_;
3159 if( defined $value ) {
3160 $obj->throw("object of class ".ref($value)." does not implement ".
3161 "Bio::AnnotationCollectionI. Too bad.")
3162 unless $value->isa("Bio::AnnotationCollectionI");
3163 $obj->{'_annotation'} = $value;
3164 } elsif( ! defined $obj->{'_annotation'}) {
3165 $obj->{'_annotation'} = Bio::Annotation::Collection->new();
3167 return $obj->{'_annotation'};
3170 =head1 Deprecated methods
3172 =cut
3174 =head2 no_residues
3176 Title : no_residues
3177 Usage : $no = $ali->no_residues
3178 Function : number of residues in total in the alignment
3179 Returns : integer
3180 Argument :
3181 Note : deprecated in favor of num_residues()
3183 =cut
3185 sub no_residues {
3186 my $self = shift;
3187 $self->deprecated(-warn_version => 1.0069,
3188 -throw_version => 1.0075,
3189 -message => 'Use of method no_residues() is deprecated, use num_residues() instead');
3190 $self->num_residues(@_);
3193 =head2 no_sequences
3195 Title : no_sequences
3196 Usage : $depth = $ali->no_sequences
3197 Function : number of sequence in the sequence alignment
3198 Returns : integer
3199 Argument :
3200 Note : deprecated in favor of num_sequences()
3202 =cut
3204 sub no_sequences {
3205 my $self = shift;
3206 $self->deprecated(-warn_version => 1.0069,
3207 -throw_version => 1.0075,
3208 -message => 'Use of method no_sequences() is deprecated, use num_sequences() instead');
3209 $self->num_sequences(@_);
3212 =head2 mask_columns
3214 Title : mask_columns
3215 Usage : $aln2 = $aln->mask_columns(20,30)
3216 Function : Masks a slice of the alignment inclusive of start and
3217 end columns, and the first column in the alignment is denoted 1.
3218 Mask beyond the length of the sequence does not do padding.
3219 Returns : A Bio::SimpleAlign object
3220 Args : Positive integer for start column, positive integer for end column,
3221 optional string value use for the mask. Example:
3223 $aln2 = $aln->mask_columns(20,30,'?')
3224 Note : Masking must use a character that is not used for gaps or
3225 frameshifts. These can be adjusted using the relevant global
3226 variables, but be aware these may be (uncontrollably) modified
3227 elsewhere within BioPerl (see bug 2715)
3229 =cut
3231 sub mask_columns {
3232 #based on slice(), but did not include the Bio::Seq::Meta sections as I was not sure what it is doing
3233 my $self = shift;
3235 my $nonres = $Bio::LocatableSeq::GAP_SYMBOLS.
3236 $Bio::LocatableSeq::FRAMESHIFT_SYMBOLS;
3238 # coordinates are alignment-based, not sequence-based
3239 my ($start, $end, $mask_char) = @_;
3240 unless (defined $mask_char) { $mask_char = 'N' }
3242 $self->throw("Mask start has to be a positive integer and less than ".
3243 "alignment length, not [$start]")
3244 unless $start =~ /^\d+$/ && $start > 0 && $start <= $self->length;
3245 $self->throw("Mask end has to be a positive integer and less than ".
3246 "alignment length, not [$end]")
3247 unless $end =~ /^\d+$/ && $end > 0 && $end <= $self->length;
3248 $self->throw("Mask start [$start] has to be smaller than or equal to ".
3249 "end [$end]") unless $start <= $end;
3250 $self->throw("Mask character $mask_char has to be a single character ".
3251 "and not a gap or frameshift symbol")
3252 unless CORE::length($mask_char) == 1 && $mask_char !~ m{$nonres};
3254 my $aln = $self->new;
3255 $aln->id($self->id);
3256 foreach my $seq ( $self->each_seq() ) {
3257 my $new_seq = Bio::LocatableSeq->new(-id => $seq->id,
3258 -alphabet => $seq->alphabet,
3259 -strand => $seq->strand,
3260 -verbose => $self->verbose);
3262 # convert from 1-based alignment coords!
3263 my $masked_string = substr($seq->seq, $start - 1, $end - $start + 1);
3264 $masked_string =~ s{[^$nonres]}{$mask_char}g;
3265 my $new_dna_string = substr($seq->seq,0,$start-1) . $masked_string . substr($seq->seq,$end);
3266 $new_seq->seq($new_dna_string);
3267 $aln->add_seq($new_seq);
3269 # Preserve chosen mask character, it may be need later (like in 'slice')
3270 $aln->{_mask_char} = $mask_char;
3271 return $aln;