* sync with trunk
[bioperl-live.git] / Bio / SimpleAlign.pm
blobd8ff4543294e4f305b05c40d366ceb08a278d0a6
1 # $Id$
2 # BioPerl module for SimpleAlign
4 # Cared for by Heikki Lehvaslaiho <heikki-at-bioperl-dot-org>
6 # Copyright Ewan Birney
8 # You may distribute this module under the same terms as perl itself
10 # POD documentation - main docs before the code
12 # History:
13 # 11/3/00 Added threshold feature to consensus and consensus_aa - PS
14 # May 2001 major rewrite - Heikki Lehvaslaiho
16 =head1 NAME
18 Bio::SimpleAlign - Multiple alignments held as a set of sequences
20 =head1 SYNOPSIS
22 # Use Bio::AlignIO to read in the alignment
23 $str = Bio::AlignIO->new(-file => 't/data/testaln.pfam');
24 $aln = $str->next_aln();
26 # Describe
27 print $aln->length;
28 print $aln->no_residues;
29 print $aln->is_flush;
30 print $aln->no_sequences;
31 print $aln->score;
32 print $aln->percentage_identity;
33 print $aln->consensus_string(50);
35 # Find the position in the alignment for a sequence location
36 $pos = $aln->column_from_residue_number('1433_LYCES', 14); # = 6;
38 # Extract sequences and check values for the alignment column $pos
39 foreach $seq ($aln->each_seq) {
40 $res = $seq->subseq($pos, $pos);
41 $count{$res}++;
43 foreach $res (keys %count) {
44 printf "Res: %s Count: %2d\n", $res, $count{$res};
47 # Manipulate
48 $aln->remove_seq($seq);
49 $mini_aln = $aln->slice(20,30); # get a block of columns
50 $mini_aln = $aln->select_noncont(1,3,5,7,11); # get single columns
51 $new_aln = $aln->remove_columns([20,30]); # remove by position
52 $new_aln = $aln->remove_columns(['mismatch']); # remove by property
54 # Analyze
55 $str = $aln->consensus_string($threshold_percent);
56 $str = $aln->match_line();
57 $str = $aln->cigar_line();
58 $id = $aln->percentage_identity;
60 # See the module documentation for details and more methods.
62 =head1 DESCRIPTION
64 SimpleAlign is an object that handles a multiple sequence alignment
65 (MSA). It is very permissive of types (it does not insist on sequences
66 being all same length, for example). Think of it as a set of sequences
67 with a whole series of built-in manipulations and methods for reading and
68 writing alignments.
70 SimpleAlign uses L<Bio::LocatableSeq>, a subclass of L<Bio::PrimarySeq>,
71 to store its sequences. These are subsequences with a start and end
72 positions in the parent reference sequence. Each sequence in the
73 SimpleAlign object is a Bio::LocatableSeq.
75 SimpleAlign expects the combination of name, start, and end for a
76 given sequence to be unique in the alignment, and this is the key for the
77 internal hashes (name, start, end are abbreviated C<nse> in the code).
78 However, in some cases people do not want the name/start-end to be displayed:
79 either multiple names in an alignment or names specific to the alignment
80 (ROA1_HUMAN_1, ROA1_HUMAN_2 etc). These names are called
81 C<displayname>, and generally is what is used to print out the
82 alignment. They default to name/start-end.
84 The SimpleAlign Module is derived from the Align module by Ewan Birney.
86 =head1 FEEDBACK
88 =head2 Mailing Lists
90 User feedback is an integral part of the evolution of this and other
91 Bioperl modules. Send your comments and suggestions preferably to one
92 of the Bioperl mailing lists. Your participation is much appreciated.
94 bioperl-l@bioperl.org - General discussion
95 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
97 =head2 Reporting Bugs
99 Report bugs to the Bioperl bug tracking system to help us keep track
100 the bugs and their resolution. Bug reports can be submitted via the
101 web:
103 http://bugzilla.open-bio.org/
105 =head1 AUTHOR
107 Ewan Birney, birney@ebi.ac.uk
109 =head1 CONTRIBUTORS
111 Allen Day, allenday-at-ucla.edu,
112 Richard Adams, Richard.Adams-at-ed.ac.uk,
113 David J. Evans, David.Evans-at-vir.gla.ac.uk,
114 Heikki Lehvaslaiho, heikki-at-bioperl-dot-org,
115 Allen Smith, allens-at-cpan.org,
116 Jason Stajich, jason-at-bioperl.org,
117 Anthony Underwood, aunderwood-at-phls.org.uk,
118 Xintao Wei & Giri Narasimhan, giri-at-cs.fiu.edu
119 Brian Osborne, bosborne at alum.mit.edu
120 Weigang Qiu, Weigang at GENECTR-HUNTER-CUNY-EDU
121 Hongyu Zhang, forward at hongyu.org
123 =head1 SEE ALSO
125 L<Bio::LocatableSeq>
127 =head1 APPENDIX
129 The rest of the documentation details each of the object
130 methods. Internal methods are usually preceded with a _
132 =cut
134 # 'Let the code begin...
136 package Bio::SimpleAlign;
137 use vars qw(%CONSERVATION_GROUPS);
138 use strict;
140 use Bio::LocatableSeq; # uses Seq's as list
142 use Bio::Seq;
143 use Bio::SeqFeature::Generic;
145 BEGIN {
146 # This data should probably be in a more centralized module...
147 # it is taken from Clustalw documentation.
148 # These are all the positively scoring groups that occur in the
149 # Gonnet Pam250 matrix. The strong and weak groups are
150 # defined as strong score >0.5 and weak score =<0.5 respectively.
152 %CONSERVATION_GROUPS = (
153 'strong' => [ qw(
155 NEQK
156 NHQK
157 NDEQ
158 QHRK
159 MILV
160 MILF
162 FYW )],
163 'weak' => [ qw(
167 STNK
168 STPA
169 SGND
170 SNDEQK
171 NDEQHK
172 NEQHRK
173 FVLIM
174 HFY )],);
177 use base qw(Bio::Root::Root Bio::Align::AlignI Bio::AnnotatableI
178 Bio::FeatureHolderI);
180 =head2 new
182 Title : new
183 Usage : my $aln = Bio::SimpleAlign->new();
184 Function : Creates a new simple align object
185 Returns : Bio::SimpleAlign
186 Args : -source => string representing the source program
187 where this alignment came from
188 -annotation => Bio::AnnotationCollectionI
189 -seq_annotation => Bio::AnnotationCollectionI for sequences (requires -annotation also be set)
190 -seqs => array ref containing Bio::LocatableSeq or Bio::Seq::Meta
191 -consensus => consensus string
192 -consensus_meta => Bio::Seq::Meta object containing consensus met information (kludge)
194 =cut
197 sub new {
198 my($class,@args) = @_;
200 my $self = $class->SUPER::new(@args);
202 my ($src, $score, $id, $acc, $desc, $seqs, $feats, $coll, $sa, $con, $cmeta) = $self->_rearrange([qw(
203 SOURCE
204 SCORE
206 ACCESSION
207 DESCRIPTION
208 SEQS
209 FEATURES
210 ANNOTATION
211 SEQ_ANNOTATION
212 CONSENSUS
213 CONSENSUS_META
214 )], @args);
215 $src && $self->source($src);
216 defined $score && $self->score($score);
217 # we need to set up internal hashs first!
219 $self->{'_seq'} = {};
220 $self->{'_order'} = {};
221 $self->{'_start_end_lists'} = {};
222 $self->{'_dis_name'} = {};
223 $self->{'_id'} = 'NoName';
224 # maybe we should automatically read in from args. Hmmm...
225 $id && $self->id($id);
226 $acc && $self->accession($acc);
227 $desc && $self->description($desc);
228 $coll && $self->annotation($coll);
229 # sequence annotation is layered into a provided annotation collection (or dies)
230 if ($sa) {
231 $self->throw("Must supply an alignment-based annotation collection (-annotation) ".
232 "with a sequence annotation collection")
233 if !$coll;
234 $coll->add_Annotation('seq_annotation', $sa);
236 if ($feats && ref $feats eq 'ARRAY') {
237 for my $feat (@$feats) {
238 $self->add_SeqFeature($feat);
241 $con && $self->consensus($con);
242 $cmeta && $self->consensus_meta($cmeta);
243 # assumes these are in correct alignment order
244 if ($seqs && ref($seqs) eq 'ARRAY') {
245 for my $seq (@$seqs) {
246 $self->add_seq($seq);
250 return $self; # success - we hope!
253 =head1 Modifier methods
255 These methods modify the MSA by adding, removing or shuffling complete
256 sequences.
258 =head2 add_seq
260 Title : add_seq
261 Usage : $myalign->add_seq($newseq);
262 Function : Adds another sequence to the alignment. *Does not* align
263 it - just adds it to the hashes.
264 Returns : nothing
265 Args : a Bio::LocatableSeq object
266 order (optional)
268 See L<Bio::LocatableSeq> for more information
270 =cut
272 sub addSeq {
273 my $self = shift;
274 $self->deprecated("addSeq - deprecated method. Use add_seq() instead.");
275 $self->add_seq(@_);
278 sub add_seq {
279 my $self = shift;
280 my $seq = shift;
281 my $order = shift;
282 my ($name,$id,$start,$end);
284 if( ! ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
285 $self->throw("Unable to process non locatable sequences [". ref($seq). "]");
288 $id = $seq->id() ||$seq->display_id || $seq->primary_id;
290 # build the symbol list for this sequence,
291 # will prune out the gap and missing/match chars
292 # when actually asked for the symbol list in the
293 # symbol_chars
294 # map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq) if $seq->seq;
296 if( !defined $order ) {
297 $order = keys %{$self->{'_seq'}};
299 $name = $seq->get_nse;
301 if( $self->{'_seq'}->{$name} ) {
302 $self->warn("Replacing one sequence [$name]\n") unless $self->verbose < 0;
304 else {
305 $self->debug( "Assigning $name to $order\n");
307 $self->{'_order'}->{$order} = $name;
309 unless( exists( $self->{'_start_end_lists'}->{$id})) {
310 $self->{'_start_end_lists'}->{$id} = [];
312 push @{$self->{'_start_end_lists'}->{$id}}, $seq;
315 $self->{'_seq'}->{$name} = $seq;
320 =head2 remove_seq
322 Title : remove_seq
323 Usage : $aln->remove_seq($seq);
324 Function : Removes a single sequence from an alignment
325 Returns :
326 Argument : a Bio::LocatableSeq object
328 =cut
330 sub removeSeq {
331 my $self = shift;
332 $self->deprecated("removeSeq - deprecated method. Use remove_seq() instead.");
333 $self->remove_seq(@_);
336 sub remove_seq {
337 my $self = shift;
338 my $seq = shift;
339 my ($name,$id,$start,$end);
341 $self->throw("Need Bio::Locatable seq argument ")
342 unless ref $seq && $seq->isa( 'Bio::LocatableSeq');
344 $id = $seq->id();
345 $start = $seq->start();
346 $end = $seq->end();
347 $name = sprintf("%s/%d-%d",$id,$start,$end);
349 if( !exists $self->{'_seq'}->{$name} ) {
350 $self->throw("Sequence $name does not exist in the alignment to remove!");
353 delete $self->{'_seq'}->{$name};
355 # we need to remove this seq from the start_end_lists hash
357 if (exists $self->{'_start_end_lists'}->{$id}) {
358 # we need to find the sequence in the array.
360 my ($i, $found);;
361 for ($i=0; $i < @{$self->{'_start_end_lists'}->{$id}}; $i++) {
362 if (${$self->{'_start_end_lists'}->{$id}}[$i] eq $seq) {
363 $found = 1;
364 last;
367 if ($found) {
368 splice @{$self->{'_start_end_lists'}->{$id}}, $i, 1;
370 else {
371 $self->throw("Could not find the sequence to remoce from the start-end list");
374 else {
375 $self->throw("There is no seq list for the name $id");
377 # we need to shift order hash
378 my %rev_order = reverse %{$self->{'_order'}};
379 my $no = $rev_order{$name};
380 my $no_sequences = $self->no_sequences;
381 for (; $no < $no_sequences; $no++) {
382 $self->{'_order'}->{$no} = $self->{'_order'}->{$no+1};
384 delete $self->{'_order'}->{$no};
385 return 1;
389 =head2 purge
391 Title : purge
392 Usage : $aln->purge(0.7);
393 Function: Removes sequences above given sequence similarity
394 This function will grind on large alignments. Beware!
395 Example :
396 Returns : An array of the removed sequences
397 Args : float, threshold for similarity
399 =cut
401 sub purge {
402 my ($self,$perc) = @_;
403 my (%duplicate, @dups);
405 my @seqs = $self->each_seq();
407 for (my $i=0;$i< @seqs - 1;$i++ ) { #for each seq in alignment
408 my $seq = $seqs[$i];
410 #skip if already in duplicate hash
411 next if exists $duplicate{$seq->display_id} ;
412 my $one = $seq->seq();
414 my @one = split '', $one; #split to get 1aa per array element
416 for (my $j=$i+1;$j < @seqs;$j++) {
417 my $seq2 = $seqs[$j];
419 #skip if already in duplicate hash
420 next if exists $duplicate{$seq2->display_id} ;
422 my $two = $seq2->seq();
423 my @two = split '', $two;
425 my $count = 0;
426 my $res = 0;
427 for (my $k=0;$k<@one;$k++) {
428 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
429 $one[$k] eq $two[$k]) {
430 $count++;
432 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
433 $two[$k] ne '.' && $two[$k] ne '-' ) {
434 $res++;
438 my $ratio = 0;
439 $ratio = $count/$res unless $res == 0;
441 # if above threshold put in duplicate hash and push onto
442 # duplicate array for returning to get_unique
443 if ( $ratio > $perc ) {
444 $self->warn("duplicate: ", $seq2->display_id) if $self->verbose > 0;
445 $duplicate{$seq2->display_id} = 1;
446 push @dups, $seq2;
450 foreach my $seq (@dups) {
451 $self->remove_seq($seq);
453 return @dups;
456 =head2 sort_alphabetically
458 Title : sort_alphabetically
459 Usage : $ali->sort_alphabetically
460 Function : Changes the order of the alignment to alphabetical on name
461 followed by numerical by number.
462 Returns :
463 Argument :
465 =cut
467 sub sort_alphabetically {
468 my $self = shift;
469 my ($seq,$nse,@arr,%hash,$count);
471 foreach $seq ( $self->each_seq() ) {
472 $nse = $seq->get_nse;
473 $hash{$nse} = $seq;
476 $count = 0;
478 %{$self->{'_order'}} = (); # reset the hash;
480 foreach $nse ( sort _alpha_startend keys %hash) {
481 $self->{'_order'}->{$count} = $nse;
483 $count++;
488 =head2 sort_by_list
490 Title : sort_by_list
491 Usage : $aln_ordered=$aln->sort_by_list($list_file)
492 Function : Arbitrarily order sequences in an alignment
493 Returns : A new Bio::SimpleAlign object
494 Argument : a file listing sequence names in intended order (one name per line)
496 =cut
498 sub sort_by_list {
499 my ($self, $list) = @_;
500 my (@seq, @ids, %order);
502 foreach my $seq ( $self->each_seq() ) {
503 push @seq, $seq;
504 push @ids, $seq->display_id;
507 my $ct=1;
508 open(my $listfh, '<', $list) || $self->throw("can't open file for reading: $list");
509 while (<$listfh>) {
510 chomp;
511 my $name=$_;
512 $self->throw("Not found in alignment: $name") unless &_in_aln($name, \@ids);
513 $order{$name}=$ct++;
515 close($listfh);
517 # use the map-sort-map idiom:
518 my @sorted= map { $_->[1] } sort { $a->[0] <=> $b->[0] } map { [$order{$_->id()}, $_] } @seq;
519 my $aln = $self->new;
520 foreach (@sorted) { $aln->add_seq($_) }
521 return $aln;
524 =head2 set_new_reference
526 Title : set_new_reference
527 Usage : $aln->set_new_reference(3 or 'B31'): Select the 3rd sequence, or
528 the sequence whoes name is "B31" (full, exact, and case-sensitive),
529 as the reference (1st) sequence
530 Function : Change/Set a new reference (i.e., the first) sequence
531 Returns : a new Bio::SimpleAlign object.
532 Throws an exception if designated sequence not found
533 Argument : a positive integer of sequence order, or a sequence name
534 in the original alignment
536 =cut
538 sub set_new_reference {
539 my ($self, $seqid) = @_;
540 my $aln = $self->new;
541 my (@seq, @ids, @new_seq);
542 my $is_num=0;
543 foreach my $seq ( $self->each_seq() ) {
544 push @seq, $seq;
545 push @ids, $seq->display_id;
548 if ($seqid =~ /^\d+$/) { # argument is seq position
549 $is_num=1;
550 $self->throw("The new reference sequence number has to be a positive integer >1 and <= no_sequences ") if ($seqid <= 1 || $seqid > $self->no_sequences);
551 } else { # argument is a seq name
552 $self->throw("The new reference sequence not in alignment ") unless &_in_aln($seqid, \@ids);
555 for (my $i=0; $i<=$#seq; $i++) {
556 my $pos=$i+1;
557 if ( ($is_num && $pos == $seqid) || ($seqid eq $seq[$i]->display_id) ) {
558 unshift @new_seq, $seq[$i];
559 } else {
560 push @new_seq, $seq[$i];
563 foreach (@new_seq) { $aln->add_seq($_); }
564 return $aln;
567 sub _in_aln { # check if input name exists in the alignment
568 my ($str, $ref) = @_;
569 foreach (@$ref) {
570 return 1 if $str eq $_;
572 return 0;
576 =head2 uniq_seq
578 Title : uniq_seq
579 Usage : $aln->uniq_seq(): Remove identical sequences in
580 in the alignment. Ambiguous base ("N", "n") and
581 leading and ending gaps ("-") are NOT counted as
582 differences.
583 Function : Make a new alignment of unique sequence types (STs)
584 Returns : 1. a new Bio::SimpleAlign object (all sequences renamed as "ST")
585 2. ST of each sequence in STDERR
586 Argument : None
588 =cut
590 sub uniq_seq {
591 my ($self, $seqid) = @_;
592 my $aln = $self->new;
593 my (%member, %order, @seq, @uniq_str);
594 my $order=0;
595 my $len = $self->length();
596 foreach my $seq ( $self->each_seq() ) {
597 my $str = $seq->seq();
599 # it's necessary to ignore "n", "N", leading gaps and ending gaps in
600 # comparing two sequence strings
602 # 1st, convert "n", "N" to "?" (for DNA sequence only):
603 $str =~ s/n/\?/gi if $str =~ /^[atcgn-]+$/i;
604 # 2nd, convert leading and ending gaps to "?":
605 $str = &_convert_leading_ending_gaps($str, '-', '?');
606 # Note that '?' also can mean unknown residue.
607 # I don't like making global class member changes like this, too
608 # prone to errors... -- cjfields 08-11-18
609 local $Bio::LocatableSeq::GAP_SYMBOLS = '-\?';
610 my $new = Bio::LocatableSeq->new(
611 -id => $seq->id(),
612 -alphabet=> $seq->alphabet,
613 -seq => $str,
614 -start => $seq->start,
615 -end => $seq->end
617 push @seq, $new;
620 foreach my $seq (@seq) {
621 my $str = $seq->seq();
622 my ($seen, $key) = &_check_uniq($str, \@uniq_str, $len);
623 if ($seen) { # seen before
624 my @memb = @{$member{$key}};
625 push @memb, $seq;
626 $member{$key} = \@memb;
627 } else { # not seen
628 push @uniq_str, $key;
629 $order++;
630 $member{$key} = [ ($seq) ];
631 $order{$key} = $order;
635 foreach my $str (sort {$order{$a} <=> $order{$b}} keys %order) { # sort by input order
636 # convert leading/ending "?" back into "-" ("?" throws errors by SimpleAlign):
637 my $str2 = &_convert_leading_ending_gaps($str, '?', '-');
638 # convert middle "?" back into "N" ("?" throws errors by SimpleAlign):
639 $str2 =~ s/\?/N/g if $str2 =~ /^[atcg\-\?]+$/i;
640 my $gap='-';
641 my $end=length($str2);
642 $end -= length($1) while $str2 =~ m/($gap+)/g;
643 my $new = Bio::LocatableSeq->new(-id =>"ST".$order{$str},
644 -seq =>$str2,
645 -start=>1,
646 -end =>$end
648 $aln->add_seq($new);
649 # print STDERR "ST".$order{$str}, "\t=>";
650 foreach (@{$member{$str}}) {
651 $self->debug($_->id(), "\t", "ST", $order{$str}, "\n");
653 # print STDERR "\n";
655 return $aln;
658 sub _check_uniq { # check if same seq exists in the alignment
659 my ($str1, $ref, $length) = @_;
660 my @char1=split //, $str1;
661 my @array=@$ref;
663 return (0, $str1) if @array==0; # not seen (1st sequence)
665 foreach my $str2 (@array) {
666 my $diff=0;
667 my @char2=split //, $str2;
668 for (my $i=0; $i<=$length-1; $i++) {
669 next if $char1[$i] eq '?';
670 next if $char2[$i] eq '?';
671 $diff++ if $char1[$i] ne $char2[$i];
673 return (1, $str2) if $diff == 0; # seen before
676 return (0, $str1); # not seen
679 sub _convert_leading_ending_gaps {
680 my $s=shift;
681 my $sym1=shift;
682 my $sym2=shift;
683 my @array=split //, $s;
684 # convert leading char:
685 for (my $i=0; $i<=$#array; $i++) {
686 ($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
688 # convert ending char:
689 for (my $i = $#array; $i>= 0; $i--) {
690 ($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
692 my $s_new=join '', @array;
693 return $s_new;
696 =head1 Sequence selection methods
698 Methods returning one or more sequences objects.
700 =head2 each_seq
702 Title : each_seq
703 Usage : foreach $seq ( $align->each_seq() )
704 Function : Gets a Seq object from the alignment
705 Returns : Seq object
706 Argument :
708 =cut
710 sub eachSeq {
711 my $self = shift;
712 $self->deprecated("eachSeq - deprecated method. Use each_seq() instead.");
713 $self->each_seq();
716 sub each_seq {
717 my $self = shift;
718 my (@arr,$order);
720 foreach $order ( sort { $a <=> $b } keys %{$self->{'_order'}} ) {
721 if( exists $self->{'_seq'}->{$self->{'_order'}->{$order}} ) {
722 push(@arr,$self->{'_seq'}->{$self->{'_order'}->{$order}});
725 return @arr;
729 =head2 each_alphabetically
731 Title : each_alphabetically
732 Usage : foreach $seq ( $ali->each_alphabetically() )
733 Function : Returns a sequence object, but the objects are returned
734 in alphabetically sorted order.
735 Does not change the order of the alignment.
736 Returns : Seq object
737 Argument :
739 =cut
741 sub each_alphabetically {
742 my $self = shift;
743 my ($seq,$nse,@arr,%hash,$count);
745 foreach $seq ( $self->each_seq() ) {
746 $nse = $seq->get_nse;
747 $hash{$nse} = $seq;
750 foreach $nse ( sort _alpha_startend keys %hash) {
751 push(@arr,$hash{$nse});
753 return @arr;
756 sub _alpha_startend {
757 my ($aname,$astart,$bname,$bstart);
758 ($aname,$astart) = split (/-/,$a);
759 ($bname,$bstart) = split (/-/,$b);
761 if( $aname eq $bname ) {
762 return $astart <=> $bstart;
764 else {
765 return $aname cmp $bname;
769 =head2 each_seq_with_id
771 Title : each_seq_with_id
772 Usage : foreach $seq ( $align->each_seq_with_id() )
773 Function : Gets a Seq objects from the alignment, the contents
774 being those sequences with the given name (there may be
775 more than one)
776 Returns : Seq object
777 Argument : a seq name
779 =cut
781 sub eachSeqWithId {
782 my $self = shift;
783 $self->deprecated("eachSeqWithId - deprecated method. Use each_seq_with_id() instead.");
784 $self->each_seq_with_id(@_);
787 sub each_seq_with_id {
788 my $self = shift;
789 my $id = shift;
791 $self->throw("Method each_seq_with_id needs a sequence name argument")
792 unless defined $id;
794 my (@arr, $seq);
796 if (exists($self->{'_start_end_lists'}->{$id})) {
797 @arr = @{$self->{'_start_end_lists'}->{$id}};
799 return @arr;
802 =head2 get_seq_by_pos
804 Title : get_seq_by_pos
805 Usage : $seq = $aln->get_seq_by_pos(3) # third sequence from the alignment
806 Function : Gets a sequence based on its position in the alignment.
807 Numbering starts from 1. Sequence positions larger than
808 no_sequences() will thow an error.
809 Returns : a Bio::LocatableSeq object
810 Args : positive integer for the sequence position
812 =cut
814 sub get_seq_by_pos {
816 my $self = shift;
817 my ($pos) = @_;
819 $self->throw("Sequence position has to be a positive integer, not [$pos]")
820 unless $pos =~ /^\d+$/ and $pos > 0;
821 $self->throw("No sequence at position [$pos]")
822 unless $pos <= $self->no_sequences ;
824 my $nse = $self->{'_order'}->{--$pos};
825 return $self->{'_seq'}->{$nse};
828 =head2 get_seq_by_id
830 Title : get_seq_by_id
831 Usage : $seq = $aln->get_seq_by_id($name) # seq named $name
832 Function : Gets a sequence based on its name.
833 Sequences that do not exist will warn and return undef
834 Returns : a Bio::LocatableSeq object
835 Args : string for sequence name
837 =cut
839 sub get_seq_by_id {
840 my ($self,$name) = @_;
841 unless( defined $name ) {
842 $self->warn("Must provide a sequence name");
843 return;
845 for my $seq ( values %{$self->{'_seq'}} ) {
846 if ( $seq->id eq $name) {
847 return $seq;
850 return;
853 =head2 seq_with_features
855 Title : seq_with_features
856 Usage : $seq = $aln->seq_with_features(-pos => 1,
857 -consensus => 60
858 -mask =>
859 sub { my $consensus = shift;
861 for my $i (1..5){
862 my $n = 'N' x $i;
863 my $q = '\?' x $i;
864 while($consensus =~ /[^?]$q[^?]/){
865 $consensus =~ s/([^?])$q([^?])/$1$n$2/;
868 return $consensus;
871 Function: produces a Bio::Seq object by first splicing gaps from -pos
872 (by means of a splice_by_seq_pos() call), then creating
873 features using non-? chars (by means of a consensus_string()
874 call with stringency -consensus).
875 Returns : a Bio::Seq object
876 Args : -pos : required. sequence from which to build the Bio::Seq
877 object
878 -consensus : optional, defaults to consensus_string()'s
879 default cutoff value
880 -mask : optional, a coderef to apply to consensus_string()'s
881 output before building features. this may be useful for
882 closing gaps of 1 bp by masking over them with N, for
883 instance
885 =cut
887 sub seq_with_features{
888 my ($self,%arg) = @_;
890 #first do the preparatory splice
891 $self->throw("must provide a -pos argument") unless $arg{-pos};
892 $self->splice_by_seq_pos($arg{-pos});
894 my $consensus_string = $self->consensus_string($arg{-consensus});
895 $consensus_string = $arg{-mask}->($consensus_string)
896 if defined($arg{-mask});
898 my(@bs,@es);
900 push @bs, 1 if $consensus_string =~ /^[^?]/;
902 while($consensus_string =~ /\?[^?]/g){
903 push @bs, pos($consensus_string);
905 while($consensus_string =~ /[^?]\?/g){
906 push @es, pos($consensus_string);
909 push @es, length($consensus_string) if $consensus_string =~ /[^?]$/;
911 my $seq = Bio::Seq->new();
913 # my $rootfeature = Bio::SeqFeature::Generic->new(
914 # -source_tag => 'location',
915 # -start => $self->get_seq_by_pos($arg{-pos})->start,
916 # -end => $self->get_seq_by_pos($arg{-pos})->end,
917 # );
918 # $seq->add_SeqFeature($rootfeature);
920 while(my $b = shift @bs){
921 my $e = shift @es;
922 $seq->add_SeqFeature(
923 Bio::SeqFeature::Generic->new(
924 -start => $b - 1 + $self->get_seq_by_pos($arg{-pos})->start,
925 -end => $e - 1 + $self->get_seq_by_pos($arg{-pos})->start,
926 -source_tag => $self->source || 'MSA',
931 return $seq;
935 =head1 Create new alignments
937 The result of these methods are horizontal or vertical subsets of the
938 current MSA.
940 =head2 select
942 Title : select
943 Usage : $aln2 = $aln->select(1, 3) # three first sequences
944 Function : Creates a new alignment from a continuous subset of
945 sequences. Numbering starts from 1. Sequence positions
946 larger than no_sequences() will thow an error.
947 Returns : a Bio::SimpleAlign object
948 Args : positive integer for the first sequence
949 positive integer for the last sequence to include (optional)
951 =cut
953 sub select {
954 my $self = shift;
955 my ($start, $end) = @_;
957 $self->throw("Select start has to be a positive integer, not [$start]")
958 unless $start =~ /^\d+$/ and $start > 0;
959 $self->throw("Select end has to be a positive integer, not [$end]")
960 unless $end =~ /^\d+$/ and $end > 0;
961 $self->throw("Select $start [$start] has to be smaller than or equal to end [$end]")
962 unless $start <= $end;
964 my $aln = $self->new;
965 foreach my $pos ($start .. $end) {
966 $aln->add_seq($self->get_seq_by_pos($pos));
968 $aln->id($self->id);
969 # fix for meta, sf, ann
970 return $aln;
973 =head2 select_noncont
975 Title : select_noncont
976 Usage : # 1st and 3rd sequences, sorted
977 $aln2 = $aln->select_noncont(1, 3)
979 # 1st and 3rd sequences, sorted (same as first)
980 $aln2 = $aln->select_noncont(3, 1)
982 # 1st and 3rd sequences, unsorted
983 $aln2 = $aln->select_noncont('nosort',3, 1)
985 Function : Creates a new alignment from a subset of sequences. Numbering
986 starts from 1. Sequence positions larger than no_sequences() will
987 throw an error. Sorts the order added to new alignment by default,
988 to prevent sorting pass 'nosort' as the first argument in the list.
989 Returns : a Bio::SimpleAlign object
990 Args : array of integers for the sequences. If the string 'nosort' is
991 passed as the first argument, the sequences will not be sorted
992 in the new alignment but will appear in the order listed.
994 =cut
996 sub select_noncont {
997 my $self = shift;
998 my $nosort = 0;
999 my (@pos) = @_;
1000 if ($pos[0] !~ m{^\d+$}) {
1001 my $sortcmd = shift @pos;
1002 if ($sortcmd eq 'nosort') {
1003 $nosort = 1;
1004 } else {
1005 $self->throw("Command not recognized: $sortcmd. Only 'nosort' implemented at this time.");
1009 my $end = $self->no_sequences;
1010 foreach ( @pos ) {
1011 $self->throw("position must be a positive integer, > 0 and <= $end not [$_]")
1012 unless( /^\d+$/ && $_ > 0 && $_ <= $end );
1015 @pos = sort {$a <=> $b} @pos unless $nosort;
1017 my $aln = $self->new;
1018 foreach my $p (@pos) {
1019 $aln->add_seq($self->get_seq_by_pos($p));
1021 $aln->id($self->id);
1022 # fix for meta, sf, ann
1023 return $aln;
1026 =head2 slice
1028 Title : slice
1029 Usage : $aln2 = $aln->slice(20,30)
1030 Function : Creates a slice from the alignment inclusive of start and
1031 end columns, and the first column in the alignment is denoted 1.
1032 Sequences with no residues in the slice are excluded from the
1033 new alignment and a warning is printed. Slice beyond the length of
1034 the sequence does not do padding.
1035 Returns : A Bio::SimpleAlign object
1036 Args : Positive integer for start column, positive integer for end column,
1037 optional boolean which if true will keep gap-only columns in the newly
1038 created slice. Example:
1040 $aln2 = $aln->slice(20,30,1)
1042 =cut
1044 sub slice {
1045 my $self = shift;
1046 my ($start, $end, $keep_gap_only) = @_;
1048 $self->throw("Slice start has to be a positive integer, not [$start]")
1049 unless $start =~ /^\d+$/ and $start > 0;
1050 $self->throw("Slice end has to be a positive integer, not [$end]")
1051 unless $end =~ /^\d+$/ and $end > 0;
1052 $self->throw("Slice start [$start] has to be smaller than or equal to end [$end]")
1053 unless $start <= $end;
1054 $self->throw("This alignment has only ". $self->length . " residues. Slice start " .
1055 "[$start] is too big.") if $start > $self->length;
1056 my $cons_meta = $self->consensus_meta;
1057 my $aln = $self->new;
1058 $aln->id($self->id);
1059 foreach my $seq ( $self->each_seq() ) {
1060 my $new_seq = $seq->isa('Bio::Seq::MetaI') ?
1061 Bio::Seq::Meta->new
1062 (-id => $seq->id,
1063 -alphabet => $seq->alphabet,
1064 -strand => $seq->strand,
1065 -verbose => $self->verbose) :
1066 Bio::LocatableSeq->new
1067 (-id => $seq->id,
1068 -alphabet => $seq->alphabet,
1069 -strand => $seq->strand,
1070 -verbose => $self->verbose);
1072 # seq
1073 my $seq_end = $end;
1074 $seq_end = $seq->length if( $end > $seq->length );
1076 my $slice_seq = $seq->subseq($start, $seq_end);
1077 $new_seq->seq( $slice_seq );
1079 $slice_seq =~ s/\W//g;
1081 if ($start > 1) {
1082 my $pre_start_seq = $seq->subseq(1, $start - 1);
1083 $pre_start_seq =~ s/\W//g;
1084 if (!defined($seq->strand)) {
1085 $new_seq->start( $seq->start + CORE::length($pre_start_seq) );
1086 } elsif ($seq->strand < 0){
1087 $new_seq->start( $seq->end - CORE::length($pre_start_seq) - CORE::length($slice_seq) + 1);
1088 } else {
1089 $new_seq->start( $seq->start + CORE::length($pre_start_seq) );
1091 } else {
1092 $new_seq->start( $seq->start);
1094 if ($new_seq->isa('Bio::Seq::MetaI')) {
1095 for my $meta_name ($seq->meta_names) {
1096 $new_seq->named_meta($meta_name, $seq->named_submeta($meta_name, $start, $end));
1099 $new_seq->end( $new_seq->start + CORE::length($slice_seq) - 1 );
1101 if ($new_seq->start and $new_seq->end >= $new_seq->start) {
1102 $aln->add_seq($new_seq);
1103 } else {
1104 if( $keep_gap_only ) {
1105 $aln->add_seq($new_seq);
1106 } else {
1107 my $nse = $seq->get_nse();
1108 $self->warn("Slice [$start-$end] of sequence [$nse] contains no residues.".
1109 " Sequence excluded from the new alignment.");
1113 if ($cons_meta) {
1114 my $new = Bio::Seq::Meta->new();
1115 for my $meta_name ($cons_meta->meta_names) {
1116 $new->named_meta($meta_name, $cons_meta->named_submeta($meta_name, $start, $end));
1118 $aln->consensus_meta($new);
1120 $aln->annotation($self->annotation);
1121 # fix for meta, sf, ann
1122 return $aln;
1125 =head2 remove_columns
1127 Title : remove_columns
1128 Usage : $aln2 = $aln->remove_columns(['mismatch','weak']) or
1129 $aln2 = $aln->remove_columns([0,0],[6,8])
1130 Function : Creates an aligment with columns removed corresponding to
1131 the specified type or by specifying the columns by number.
1132 Returns : Bio::SimpleAlign object
1133 Args : Array ref of types ('match'|'weak'|'strong'|'mismatch'|'gaps'|
1134 'all_gaps_columns') or array ref where the referenced array
1135 contains a pair of integers that specify a range.
1136 The first column is 0,
1138 =cut
1140 sub remove_columns {
1141 my ($self,@args) = @_;
1142 @args || return $self;
1143 my $aln;
1145 if ($args[0][0] =~ /^[a-z_]+$/i) {
1146 $aln = $self->_remove_columns_by_type($args[0]);
1147 } elsif ($args[0][0] =~ /^\d+$/) {
1148 $aln = $self->_remove_columns_by_num(\@args);
1149 } else {
1150 $self->throw("You must pass array references to remove_columns(), not @args");
1152 # fix for meta, sf, ann
1153 $aln;
1157 =head2 remove_gaps
1159 Title : remove_gaps
1160 Usage : $aln2 = $aln->remove_gaps
1161 Function : Creates an aligment with gaps removed
1162 Returns : a Bio::SimpleAlign object
1163 Args : a gap character(optional) if none specified taken
1164 from $self->gap_char,
1165 [optional] $all_gaps_columns flag (1 or 0, default is 0)
1166 indicates that only all-gaps columns should be deleted
1168 Used from method L<remove_columns> in most cases. Set gap character
1169 using L<gap_char()|gap_char>.
1171 =cut
1173 sub remove_gaps {
1174 my ($self,$gapchar,$all_gaps_columns) = @_;
1175 my $gap_line;
1176 if ($all_gaps_columns) {
1177 $gap_line = $self->all_gap_line($gapchar);
1178 } else {
1179 $gap_line = $self->gap_line($gapchar);
1181 my $aln = $self->new;
1183 my @remove;
1184 my $length = 0;
1185 my $del_char = $gapchar || $self->gap_char;
1186 # Do the matching to get the segments to remove
1187 while ($gap_line =~ m/[$del_char]/g) {
1188 my $start = pos($gap_line)-1;
1189 $gap_line=~/\G[$del_char]+/gc;
1190 my $end = pos($gap_line)-1;
1192 #have to offset the start and end for subsequent removes
1193 $start-=$length;
1194 $end -=$length;
1195 $length += ($end-$start+1);
1196 push @remove, [$start,$end];
1199 #remove the segments
1200 $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1201 # fix for meta, sf, ann
1202 return $aln;
1206 sub _remove_col {
1207 my ($self,$aln,$remove) = @_;
1208 my @new;
1210 my $gap = $self->gap_char;
1212 # splice out the segments and create new seq
1213 foreach my $seq($self->each_seq){
1214 my $new_seq = Bio::LocatableSeq->new(
1215 -id => $seq->id,
1216 -alphabet=> $seq->alphabet,
1217 -strand => $seq->strand,
1218 -verbose => $self->verbose);
1219 my $sequence = $seq->seq;
1220 foreach my $pair(@{$remove}){
1221 my $start = $pair->[0];
1222 my $end = $pair->[1];
1223 $sequence = $seq->seq unless $sequence;
1224 my $orig = $sequence;
1225 my $head = $start > 0 ? substr($sequence, 0, $start) : '';
1226 my $tail = ($end + 1) >= length($sequence) ? '' : substr($sequence, $end + 1);
1227 $sequence = $head.$tail;
1228 # start
1229 unless (defined $new_seq->start) {
1230 if ($start == 0) {
1231 my $start_adjust = () = substr($orig, 0, $end + 1) =~ /$gap/g;
1232 $new_seq->start($seq->start + $end + 1 - $start_adjust);
1234 else {
1235 my $start_adjust = $orig =~ /^$gap+/;
1236 if ($start_adjust) {
1237 $start_adjust = $+[0] == $start;
1239 $new_seq->start($seq->start + $start_adjust);
1242 # end
1243 if (($end + 1) >= length($orig)) {
1244 my $end_adjust = () = substr($orig, $start) =~ /$gap/g;
1245 $new_seq->end($seq->end - (length($orig) - $start) + $end_adjust);
1247 else {
1248 $new_seq->end($seq->end);
1252 if ($new_seq->end < $new_seq->start) {
1253 # we removed all columns except for gaps: set to 0 to indicate no
1254 # sequence
1255 $new_seq->start(0);
1256 $new_seq->end(0);
1259 $new_seq->seq($sequence) if $sequence;
1260 push @new, $new_seq;
1262 # add the new seqs to the alignment
1263 foreach my $new(@new){
1264 $aln->add_seq($new);
1266 # fix for meta, sf, ann
1267 return $aln;
1270 sub _remove_columns_by_type {
1271 my ($self,$type) = @_;
1272 my $aln = $self->new;
1273 my @remove;
1275 my $gap = $self->gap_char if (grep { $_ eq 'gaps'} @{$type});
1276 my $all_gaps_columns = $self->gap_char if (grep /all_gaps_columns/,@{$type});
1277 my %matchchars = ( 'match' => '\*',
1278 'weak' => '\.',
1279 'strong' => ':',
1280 'mismatch' => ' ',
1281 'gaps' => '',
1282 'all_gaps_columns' => ''
1284 # get the characters to delete against
1285 my $del_char;
1286 foreach my $type (@{$type}){
1287 $del_char.= $matchchars{$type};
1290 my $length = 0;
1291 my $match_line = $self->match_line;
1292 # do the matching to get the segments to remove
1293 if($del_char){
1294 while($match_line =~ m/[$del_char]/g ){
1295 my $start = pos($match_line)-1;
1296 $match_line=~/\G[$del_char]+/gc;
1297 my $end = pos($match_line)-1;
1299 #have to offset the start and end for subsequent removes
1300 $start-=$length;
1301 $end -=$length;
1302 $length += ($end-$start+1);
1303 push @remove, [$start,$end];
1307 # remove the segments
1308 $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1309 $aln = $aln->remove_gaps() if $gap;
1310 $aln = $aln->remove_gaps('', 1) if $all_gaps_columns;
1311 # fix for meta, sf, ann
1312 $aln;
1316 sub _remove_columns_by_num {
1317 my ($self,$positions) = @_;
1318 my $aln = $self->new;
1320 # sort the positions
1321 @$positions = sort { $a->[0] <=> $b->[0] } @$positions;
1323 my @remove;
1324 my $length = 0;
1325 foreach my $pos (@{$positions}) {
1326 my ($start, $end) = @{$pos};
1328 #have to offset the start and end for subsequent removes
1329 $start-=$length;
1330 $end -=$length;
1331 $length += ($end-$start+1);
1332 push @remove, [$start,$end];
1335 #remove the segments
1336 $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1337 # fix for meta, sf, ann
1338 $aln;
1342 =head1 Change sequences within the MSA
1344 These methods affect characters in all sequences without changing the
1345 alignment.
1347 =head2 splice_by_seq_pos
1349 Title : splice_by_seq_pos
1350 Usage : $status = splice_by_seq_pos(1);
1351 Function: splices all aligned sequences where the specified sequence
1352 has gaps.
1353 Example :
1354 Returns : 1 on success
1355 Args : position of sequence to splice by
1358 =cut
1360 sub splice_by_seq_pos{
1361 my ($self,$pos) = @_;
1363 my $guide = $self->get_seq_by_pos($pos);
1364 my $guide_seq = $guide->seq;
1366 $guide_seq =~ s/\./\-/g;
1368 my @gaps = ();
1369 $pos = -1;
1370 while(($pos = index($guide_seq, '-', $pos)) > -1 ){
1371 unshift @gaps, $pos;
1372 $pos++;
1375 foreach my $seq ($self->each_seq){
1376 my @bases = split '', $seq->seq;
1378 splice(@bases, $_, 1) foreach @gaps;
1379 $seq->seq(join('', @bases));
1385 =head2 map_chars
1387 Title : map_chars
1388 Usage : $ali->map_chars('\.','-')
1389 Function : Does a s/$arg1/$arg2/ on the sequences. Useful for gap
1390 characters
1392 Notice that the from (arg1) is interpretted as a regex,
1393 so be careful about quoting meta characters (eg
1394 $ali->map_chars('.','-') wont do what you want)
1395 Returns :
1396 Argument : 'from' rexexp
1397 'to' string
1399 =cut
1401 sub map_chars {
1402 my $self = shift;
1403 my $from = shift;
1404 my $to = shift;
1405 my ($seq,$temp);
1407 $self->throw("Need exactly two arguments")
1408 unless defined $from and defined $to;
1410 foreach $seq ( $self->each_seq() ) {
1411 $temp = $seq->seq();
1412 $temp =~ s/$from/$to/g;
1413 $seq->seq($temp);
1415 return 1;
1419 =head2 uppercase
1421 Title : uppercase()
1422 Usage : $ali->uppercase()
1423 Function : Sets all the sequences to uppercase
1424 Returns :
1425 Argument :
1427 =cut
1429 sub uppercase {
1430 my $self = shift;
1431 my $seq;
1432 my $temp;
1434 foreach $seq ( $self->each_seq() ) {
1435 $temp = $seq->seq();
1436 $temp =~ tr/[a-z]/[A-Z]/;
1438 $seq->seq($temp);
1440 return 1;
1443 =head2 cigar_line
1445 Title : cigar_line()
1446 Usage : %cigars = $align->cigar_line()
1447 Function : Generates a "cigar" (Compact Idiosyncratic Gapped Alignment
1448 Report) line for each sequence in the alignment. Examples are
1449 "1,60" or "5,10:12,58", where the numbers refer to conserved
1450 positions within the alignment. The keys of the hash are the
1451 NSEs (name/start/end) assigned to each sequence.
1452 Args : threshold (optional, defaults to 100)
1453 Returns : Hash of strings (cigar lines)
1455 =cut
1457 sub cigar_line {
1458 my $self = shift;
1459 my $thr=shift||100;
1460 my %cigars;
1462 my @consensus = split "",($self->consensus_string($thr));
1463 my $len = $self->length;
1464 my $gapchar = $self->gap_char;
1466 # create a precursor, something like (1,4,5,6,7,33,45),
1467 # where each number corresponds to a conserved position
1468 foreach my $seq ( $self->each_seq ) {
1469 my @seq = split "", uc ($seq->seq);
1470 my $pos = 1;
1471 for (my $x = 0 ; $x < $len ; $x++ ) {
1472 if ($seq[$x] eq $consensus[$x]) {
1473 push @{$cigars{$seq->get_nse}},$pos;
1474 $pos++;
1475 } elsif ($seq[$x] ne $gapchar) {
1476 $pos++;
1480 # duplicate numbers - (1,4,5,6,7,33,45) becomes (1,1,4,5,6,7,33,33,45,45)
1481 for my $name (keys %cigars) {
1482 splice @{$cigars{$name}}, 1, 0, ${$cigars{$name}}[0] if
1483 ( ${$cigars{$name}}[0] + 1 < ${$cigars{$name}}[1] );
1484 push @{$cigars{$name}}, ${$cigars{$name}}[$#{$cigars{$name}}] if
1485 ( ${$cigars{$name}}[($#{$cigars{$name}} - 1)] + 1 <
1486 ${$cigars{$name}}[$#{$cigars{$name}}] );
1487 for ( my $x = 1 ; $x < $#{$cigars{$name}} - 1 ; $x++) {
1488 if (${$cigars{$name}}[$x - 1] + 1 < ${$cigars{$name}}[$x] &&
1489 ${$cigars{$name}}[$x + 1] > ${$cigars{$name}}[$x] + 1) {
1490 splice @{$cigars{$name}}, $x, 0, ${$cigars{$name}}[$x];
1494 # collapse series - (1,1,4,5,6,7,33,33,45,45) becomes (1,1,4,7,33,33,45,45)
1495 for my $name (keys %cigars) {
1496 my @remove;
1497 for ( my $x = 0 ; $x < $#{$cigars{$name}} ; $x++) {
1498 if ( ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x - 1)] + 1 &&
1499 ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x + 1)] - 1 ) {
1500 unshift @remove,$x;
1503 for my $pos (@remove) {
1504 splice @{$cigars{$name}}, $pos, 1;
1507 # join and punctuate
1508 for my $name (keys %cigars) {
1509 my ($start,$end,$str) = "";
1510 while ( ($start,$end) = splice @{$cigars{$name}}, 0, 2 ) {
1511 $str .= ($start . "," . $end . ":");
1513 $str =~ s/:$//;
1514 $cigars{$name} = $str;
1516 %cigars;
1520 =head2 match_line
1522 Title : match_line()
1523 Usage : $line = $align->match_line()
1524 Function : Generates a match line - much like consensus string
1525 except that a line indicating the '*' for a match.
1526 Args : (optional) Match line characters ('*' by default)
1527 (optional) Strong match char (':' by default)
1528 (optional) Weak match char ('.' by default)
1529 Returns : String
1531 =cut
1533 sub match_line {
1534 my ($self,$matchlinechar, $strong, $weak) = @_;
1535 my %matchchars = ('match' => $matchlinechar || '*',
1536 'weak' => $weak || '.',
1537 'strong' => $strong || ':',
1538 'mismatch' => ' ',
1541 my @seqchars;
1542 my $alphabet;
1543 foreach my $seq ( $self->each_seq ) {
1544 push @seqchars, [ split(//, uc ($seq->seq)) ];
1545 $alphabet = $seq->alphabet unless defined $alphabet;
1547 my $refseq = shift @seqchars;
1548 # let's just march down the columns
1549 my $matchline;
1550 POS:
1551 foreach my $pos ( 0..$self->length ) {
1552 my $refchar = $refseq->[$pos];
1553 my $char = $matchchars{'mismatch'};
1554 unless( defined $refchar ) {
1555 last if $pos == $self->length; # short circuit on last residue
1556 # this in place to handle jason's soon-to-be-committed
1557 # intron mapping code
1558 goto bottom;
1560 my %col = ($refchar => 1);
1561 my $dash = ($refchar eq '-' || $refchar eq '.' || $refchar eq ' ');
1562 foreach my $seq ( @seqchars ) {
1563 next if $pos >= scalar @$seq;
1564 $dash = 1 if( $seq->[$pos] eq '-' || $seq->[$pos] eq '.' ||
1565 $seq->[$pos] eq ' ' );
1566 $col{$seq->[$pos]}++ if defined $seq->[$pos];
1568 my @colresidues = sort keys %col;
1570 # if all the values are the same
1571 if( $dash ) { $char = $matchchars{'mismatch'} }
1572 elsif( @colresidues == 1 ) { $char = $matchchars{'match'} }
1573 elsif( $alphabet eq 'protein' ) { # only try to do weak/strong
1574 # matches for protein seqs
1575 TYPE:
1576 foreach my $type ( qw(strong weak) ) {
1577 # iterate through categories
1578 my %groups;
1579 # iterate through each of the aa in the col
1580 # look to see which groups it is in
1581 foreach my $c ( @colresidues ) {
1582 foreach my $f ( grep { index($_,$c) >= 0 } @{$CONSERVATION_GROUPS{$type}} ) {
1583 push @{$groups{$f}},$c;
1586 GRP:
1587 foreach my $cols ( values %groups ) {
1588 @$cols = sort @$cols;
1589 # now we are just testing to see if two arrays
1590 # are identical w/o changing either one
1591 # have to be same len
1592 next if( scalar @$cols != scalar @colresidues );
1593 # walk down the length and check each slot
1594 for($_=0;$_ < (scalar @$cols);$_++ ) {
1595 next GRP if( $cols->[$_] ne $colresidues[$_] );
1597 $char = $matchchars{$type};
1598 last TYPE;
1602 bottom:
1603 $matchline .= $char;
1605 return $matchline;
1609 =head2 gap_line
1611 Title : gap_line()
1612 Usage : $line = $align->gap_line()
1613 Function : Generates a gap line - much like consensus string
1614 except that a line where '-' represents gap
1615 Args : (optional) gap line characters ('-' by default)
1616 Returns : string
1618 =cut
1620 sub gap_line {
1621 my ($self,$gapchar) = @_;
1622 $gapchar = $gapchar || $self->gap_char;
1623 my %gap_hsh; # column gaps vector
1624 foreach my $seq ( $self->each_seq ) {
1625 my $i = 0;
1626 map {$gap_hsh{$_->[0]} = undef} grep {$_->[1] eq $gapchar}
1627 map {[$i++, $_]} split(//, uc ($seq->seq));
1629 my $gap_line;
1630 foreach my $pos ( 0..$self->length-1 ) {
1631 $gap_line .= (exists $gap_hsh{$pos}) ? $gapchar:'.';
1633 return $gap_line;
1636 =head2 all_gap_line
1638 Title : all_gap_line()
1639 Usage : $line = $align->all_gap_line()
1640 Function : Generates a gap line - much like consensus string
1641 except that a line where '-' represents all-gap column
1642 Args : (optional) gap line characters ('-' by default)
1643 Returns : string
1645 =cut
1647 sub all_gap_line {
1648 my ($self,$gapchar) = @_;
1649 $gapchar = $gapchar || $self->gap_char;
1650 my %gap_hsh; # column gaps counter hash
1651 my @seqs = $self->each_seq;
1652 foreach my $seq ( @seqs ) {
1653 my $i = 0;
1654 map {$gap_hsh{$_->[0]}++} grep {$_->[1] eq $gapchar}
1655 map {[$i++, $_]} split(//, uc ($seq->seq));
1657 my $gap_line;
1658 foreach my $pos ( 0..$self->length-1 ) {
1659 if (exists $gap_hsh{$pos} && $gap_hsh{$pos} == scalar @seqs) {
1660 # gaps column
1661 $gap_line .= $gapchar;
1662 } else {
1663 $gap_line .= '.';
1666 return $gap_line;
1669 =head2 gap_col_matrix
1671 Title : gap_col_matrix()
1672 Usage : my $cols = $align->gap_col_matrix()
1673 Function : Generates an array of hashes where
1674 each entry in the array is a hash reference
1675 with keys of all the sequence names and
1676 and value of 1 or 0 if the sequence has a gap at that column
1677 Args : (optional) gap line characters ($aln->gap_char or '-' by default)
1679 =cut
1681 sub gap_col_matrix {
1682 my ($self,$gapchar) = @_;
1683 $gapchar = $gapchar || $self->gap_char;
1684 my %gap_hsh; # column gaps vector
1685 my @cols;
1686 foreach my $seq ( $self->each_seq ) {
1687 my $i = 0;
1688 my $str = $seq->seq;
1689 my $len = $seq->length;
1690 my $ch;
1691 my $id = $seq->display_id;
1692 while( $i < $len ) {
1693 $ch = substr($str, $i, 1);
1694 $cols[$i++]->{$id} = ($ch eq $gapchar);
1697 return \@cols;
1700 =head2 match
1702 Title : match()
1703 Usage : $ali->match()
1704 Function : Goes through all columns and changes residues that are
1705 identical to residue in first sequence to match '.'
1706 character. Sets match_char.
1708 USE WITH CARE: Most MSA formats do not support match
1709 characters in sequences, so this is mostly for output
1710 only. NEXUS format (Bio::AlignIO::nexus) can handle
1712 Returns : 1
1713 Argument : a match character, optional, defaults to '.'
1715 =cut
1717 sub match {
1718 my ($self, $match) = @_;
1720 $match ||= '.';
1721 my ($matching_char) = $match;
1722 $matching_char = "\\$match" if $match =~ /[\^.$|()\[\]]/ ; #';
1723 $self->map_chars($matching_char, '-');
1725 my @seqs = $self->each_seq();
1726 return 1 unless scalar @seqs > 1;
1728 my $refseq = shift @seqs ;
1729 my @refseq = split //, $refseq->seq;
1730 my $gapchar = $self->gap_char;
1732 foreach my $seq ( @seqs ) {
1733 my @varseq = split //, $seq->seq();
1734 for ( my $i=0; $i < scalar @varseq; $i++) {
1735 $varseq[$i] = $match if defined $refseq[$i] &&
1736 ( $refseq[$i] =~ /[A-Za-z\*]/ ||
1737 $refseq[$i] =~ /$gapchar/ )
1738 && $refseq[$i] eq $varseq[$i];
1740 $seq->seq(join '', @varseq);
1742 $self->match_char($match);
1743 return 1;
1747 =head2 unmatch
1749 Title : unmatch()
1750 Usage : $ali->unmatch()
1751 Function : Undoes the effect of method match. Unsets match_char.
1752 Returns : 1
1753 Argument : a match character, optional, defaults to '.'
1755 See L<match> and L<match_char>
1757 =cut
1759 sub unmatch {
1760 my ($self, $match) = @_;
1762 $match ||= '.';
1764 my @seqs = $self->each_seq();
1765 return 1 unless scalar @seqs > 1;
1767 my $refseq = shift @seqs ;
1768 my @refseq = split //, $refseq->seq;
1769 my $gapchar = $self->gap_char;
1770 foreach my $seq ( @seqs ) {
1771 my @varseq = split //, $seq->seq();
1772 for ( my $i=0; $i < scalar @varseq; $i++) {
1773 $varseq[$i] = $refseq[$i] if defined $refseq[$i] &&
1774 ( $refseq[$i] =~ /[A-Za-z\*]/ ||
1775 $refseq[$i] =~ /$gapchar/ ) &&
1776 $varseq[$i] eq $match;
1778 $seq->seq(join '', @varseq);
1780 $self->match_char('');
1781 return 1;
1784 =head1 MSA attributes
1786 Methods for setting and reading the MSA attributes.
1788 Note that the methods defining character semantics depend on the user
1789 to set them sensibly. They are needed only by certain input/output
1790 methods. Unset them by setting to an empty string ('').
1792 =head2 id
1794 Title : id
1795 Usage : $myalign->id("Ig")
1796 Function : Gets/sets the id field of the alignment
1797 Returns : An id string
1798 Argument : An id string (optional)
1800 =cut
1802 sub id {
1803 my ($self, $name) = @_;
1805 if (defined( $name )) {
1806 $self->{'_id'} = $name;
1809 return $self->{'_id'};
1812 =head2 accession
1814 Title : accession
1815 Usage : $myalign->accession("PF00244")
1816 Function : Gets/sets the accession field of the alignment
1817 Returns : An acc string
1818 Argument : An acc string (optional)
1820 =cut
1822 sub accession {
1823 my ($self, $acc) = @_;
1825 if (defined( $acc )) {
1826 $self->{'_accession'} = $acc;
1829 return $self->{'_accession'};
1832 =head2 description
1834 Title : description
1835 Usage : $myalign->description("14-3-3 proteins")
1836 Function : Gets/sets the description field of the alignment
1837 Returns : An description string
1838 Argument : An description string (optional)
1840 =cut
1842 sub description {
1843 my ($self, $name) = @_;
1845 if (defined( $name )) {
1846 $self->{'_description'} = $name;
1849 return $self->{'_description'};
1852 =head2 missing_char
1854 Title : missing_char
1855 Usage : $myalign->missing_char("?")
1856 Function : Gets/sets the missing_char attribute of the alignment
1857 It is generally recommended to set it to 'n' or 'N'
1858 for nucleotides and to 'X' for protein.
1859 Returns : An missing_char string,
1860 Argument : An missing_char string (optional)
1862 =cut
1864 sub missing_char {
1865 my ($self, $char) = @_;
1867 if (defined $char ) {
1868 $self->throw("Single missing character, not [$char]!") if CORE::length($char) > 1;
1869 $self->{'_missing_char'} = $char;
1872 return $self->{'_missing_char'};
1875 =head2 match_char
1877 Title : match_char
1878 Usage : $myalign->match_char('.')
1879 Function : Gets/sets the match_char attribute of the alignment
1880 Returns : An match_char string,
1881 Argument : An match_char string (optional)
1883 =cut
1885 sub match_char {
1886 my ($self, $char) = @_;
1888 if (defined $char ) {
1889 $self->throw("Single match character, not [$char]!") if CORE::length($char) > 1;
1890 $self->{'_match_char'} = $char;
1893 return $self->{'_match_char'};
1896 =head2 gap_char
1898 Title : gap_char
1899 Usage : $myalign->gap_char('-')
1900 Function : Gets/sets the gap_char attribute of the alignment
1901 Returns : An gap_char string, defaults to '-'
1902 Argument : An gap_char string (optional)
1904 =cut
1906 sub gap_char {
1907 my ($self, $char) = @_;
1909 if (defined $char || ! defined $self->{'_gap_char'} ) {
1910 $char= '-' unless defined $char;
1911 $self->throw("Single gap character, not [$char]!") if CORE::length($char) > 1;
1912 $self->{'_gap_char'} = $char;
1914 return $self->{'_gap_char'};
1917 =head2 symbol_chars
1919 Title : symbol_chars
1920 Usage : my @symbolchars = $aln->symbol_chars;
1921 Function: Returns all the seen symbols (other than gaps)
1922 Returns : array of characters that are the seen symbols
1923 Args : boolean to include the gap/missing/match characters
1925 =cut
1927 sub symbol_chars{
1928 my ($self,$includeextra) = @_;
1930 unless ($self->{'_symbols'}) {
1931 foreach my $seq ($self->each_seq) {
1932 map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq);
1935 my %copy = %{$self->{'_symbols'}};
1936 if( ! $includeextra ) {
1937 foreach my $char ( $self->gap_char, $self->match_char,
1938 $self->missing_char) {
1939 delete $copy{$char} if( defined $char );
1942 return keys %copy;
1945 =head1 Alignment descriptors
1947 These read only methods describe the MSA in various ways.
1950 =head2 score
1952 Title : score
1953 Usage : $str = $ali->score()
1954 Function : get/set a score of the alignment
1955 Returns : a score for the alignment
1956 Argument : an optional score to set
1958 =cut
1960 sub score {
1961 my $self = shift;
1962 $self->{score} = shift if @_;
1963 return $self->{score};
1966 =head2 consensus_string
1968 Title : consensus_string
1969 Usage : $str = $ali->consensus_string($threshold_percent)
1970 Function : Makes a strict consensus
1971 Returns : Consensus string
1972 Argument : Optional treshold ranging from 0 to 100.
1973 The consensus residue has to appear at least threshold %
1974 of the sequences at a given location, otherwise a '?'
1975 character will be placed at that location.
1976 (Default value = 0%)
1978 =cut
1980 sub consensus_string {
1981 my $self = shift;
1982 my $threshold = shift;
1984 my $out = "";
1985 my $len = $self->length - 1;
1987 foreach ( 0 .. $len ) {
1988 $out .= $self->_consensus_aa($_,$threshold);
1990 return $out;
1993 sub _consensus_aa {
1994 my $self = shift;
1995 my $point = shift;
1996 my $threshold_percent = shift || -1 ;
1997 my ($seq,%hash,$count,$letter,$key);
1998 my $gapchar = $self->gap_char;
1999 foreach $seq ( $self->each_seq() ) {
2000 $letter = substr($seq->seq,$point,1);
2001 $self->throw("--$point-----------") if $letter eq '';
2002 ($letter eq $gapchar || $letter =~ /\./) && next;
2003 # print "Looking at $letter\n";
2004 $hash{$letter}++;
2006 my $number_of_sequences = $self->no_sequences();
2007 my $threshold = $number_of_sequences * $threshold_percent / 100. ;
2008 $count = -1;
2009 $letter = '?';
2011 foreach $key ( sort keys %hash ) {
2012 # print "Now at $key $hash{$key}\n";
2013 if( $hash{$key} > $count && $hash{$key} >= $threshold) {
2014 $letter = $key;
2015 $count = $hash{$key};
2018 return $letter;
2022 =head2 consensus_iupac
2024 Title : consensus_iupac
2025 Usage : $str = $ali->consensus_iupac()
2026 Function : Makes a consensus using IUPAC ambiguity codes from DNA
2027 and RNA. The output is in upper case except when gaps in
2028 a column force output to be in lower case.
2030 Note that if your alignment sequences contain a lot of
2031 IUPAC ambiquity codes you often have to manually set
2032 alphabet. Bio::PrimarySeq::_guess_type thinks they
2033 indicate a protein sequence.
2034 Returns : consensus string
2035 Argument : none
2036 Throws : on protein sequences
2038 =cut
2040 sub consensus_iupac {
2041 my $self = shift;
2042 my $out = "";
2043 my $len = $self->length-1;
2045 # only DNA and RNA sequences are valid
2046 foreach my $seq ( $self->each_seq() ) {
2047 $self->throw("Seq [". $seq->get_nse. "] is a protein")
2048 if $seq->alphabet eq 'protein';
2050 # loop over the alignment columns
2051 foreach my $count ( 0 .. $len ) {
2052 $out .= $self->_consensus_iupac($count);
2054 return $out;
2057 sub _consensus_iupac {
2058 my ($self, $column) = @_;
2059 my ($string, $char, $rna);
2061 #determine all residues in a column
2062 foreach my $seq ( $self->each_seq() ) {
2063 $string .= substr($seq->seq, $column, 1);
2065 $string = uc $string;
2067 # quick exit if there's an N in the string
2068 if ($string =~ /N/) {
2069 $string =~ /\W/ ? return 'n' : return 'N';
2071 # ... or if there are only gap characters
2072 return '-' if $string =~ /^\W+$/;
2074 # treat RNA as DNA in regexps
2075 if ($string =~ /U/) {
2076 $string =~ s/U/T/;
2077 $rna = 1;
2080 # the following s///'s only need to be done to the _first_ ambiguity code
2081 # as we only need to see the _range_ of characters in $string
2083 if ($string =~ /[VDHB]/) {
2084 $string =~ s/V/AGC/;
2085 $string =~ s/D/AGT/;
2086 $string =~ s/H/ACT/;
2087 $string =~ s/B/CTG/;
2090 if ($string =~ /[SKYRWM]/) {
2091 $string =~ s/S/GC/;
2092 $string =~ s/K/GT/;
2093 $string =~ s/Y/CT/;
2094 $string =~ s/R/AG/;
2095 $string =~ s/W/AT/;
2096 $string =~ s/M/AC/;
2099 # and now the guts of the thing
2101 if ($string =~ /A/) {
2102 $char = 'A'; # A A
2103 if ($string =~ /G/) {
2104 $char = 'R'; # A and G (purines) R
2105 if ($string =~ /C/) {
2106 $char = 'V'; # A and G and C V
2107 if ($string =~ /T/) {
2108 $char = 'N'; # A and G and C and T N
2110 } elsif ($string =~ /T/) {
2111 $char = 'D'; # A and G and T D
2113 } elsif ($string =~ /C/) {
2114 $char = 'M'; # A and C M
2115 if ($string =~ /T/) {
2116 $char = 'H'; # A and C and T H
2118 } elsif ($string =~ /T/) {
2119 $char = 'W'; # A and T W
2121 } elsif ($string =~ /C/) {
2122 $char = 'C'; # C C
2123 if ($string =~ /T/) {
2124 $char = 'Y'; # C and T (pyrimidines) Y
2125 if ($string =~ /G/) {
2126 $char = 'B'; # C and T and G B
2128 } elsif ($string =~ /G/) {
2129 $char = 'S'; # C and G S
2131 } elsif ($string =~ /G/) {
2132 $char = 'G'; # G G
2133 if ($string =~ /C/) {
2134 $char = 'S'; # G and C S
2135 } elsif ($string =~ /T/) {
2136 $char = 'K'; # G and T K
2138 } elsif ($string =~ /T/) {
2139 $char = 'T'; # T T
2142 $char = 'U' if $rna and $char eq 'T';
2143 $char = lc $char if $string =~ /\W/;
2145 return $char;
2149 =head2 consensus_meta
2151 Title : consensus_meta
2152 Usage : $seqmeta = $ali->consensus_meta()
2153 Function : Returns a Bio::Seq::Meta object containing the consensus
2154 strings derived from meta data analysis.
2155 Returns : Bio::Seq::Meta
2156 Argument : Bio::Seq::Meta
2157 Throws : non-MetaI object
2159 =cut
2161 sub consensus_meta {
2162 my ($self, $meta) = @_;
2163 if ($meta && (!ref $meta || !$meta->isa('Bio::Seq::MetaI'))) {
2164 $self->throw('Not a Bio::Seq::MetaI object');
2166 return $self->{'_aln_meta'} = $meta if $meta;
2167 return $self->{'_aln_meta'}
2170 =head2 is_flush
2172 Title : is_flush
2173 Usage : if ( $ali->is_flush() )
2174 Function : Tells you whether the alignment
2175 : is flush, i.e. all of the same length
2176 Returns : 1 or 0
2177 Argument :
2179 =cut
2181 sub is_flush {
2182 my ($self,$report) = @_;
2183 my $seq;
2184 my $length = (-1);
2185 my $temp;
2187 foreach $seq ( $self->each_seq() ) {
2188 if( $length == (-1) ) {
2189 $length = CORE::length($seq->seq());
2190 next;
2193 $temp = CORE::length($seq->seq());
2194 if( $temp != $length ) {
2195 $self->warn("expecting $length not $temp from ".
2196 $seq->display_id) if( $report );
2197 $self->debug("expecting $length not $temp from ".
2198 $seq->display_id);
2199 $self->debug($seq->seq(). "\n");
2200 return 0;
2204 return 1;
2208 =head2 length
2210 Title : length()
2211 Usage : $len = $ali->length()
2212 Function : Returns the maximum length of the alignment.
2213 To be sure the alignment is a block, use is_flush
2214 Returns : Integer
2215 Argument :
2217 =cut
2219 sub length_aln {
2220 my $self = shift;
2221 $self->deprecated("length_aln - deprecated method. Use length() instead.");
2222 $self->length(@_);
2225 sub length {
2226 my $self = shift;
2227 my $seq;
2228 my $length = -1;
2229 my $temp;
2231 foreach $seq ( $self->each_seq() ) {
2232 $temp = $seq->length();
2233 if( $temp > $length ) {
2234 $length = $temp;
2238 return $length;
2242 =head2 maxdisplayname_length
2244 Title : maxdisplayname_length
2245 Usage : $ali->maxdisplayname_length()
2246 Function : Gets the maximum length of the displayname in the
2247 alignment. Used in writing out various MSA formats.
2248 Returns : integer
2249 Argument :
2251 =cut
2253 sub maxname_length {
2254 my $self = shift;
2255 $self->deprecated("maxname_length - deprecated method.".
2256 " Use maxdisplayname_length() instead.");
2257 $self->maxdisplayname_length();
2260 sub maxnse_length {
2261 my $self = shift;
2262 $self->deprecated("maxnse_length - deprecated method.".
2263 " Use maxnse_length() instead.");
2264 $self->maxdisplayname_length();
2267 sub maxdisplayname_length {
2268 my $self = shift;
2269 my $maxname = (-1);
2270 my ($seq,$len);
2272 foreach $seq ( $self->each_seq() ) {
2273 $len = CORE::length $self->displayname($seq->get_nse());
2275 if( $len > $maxname ) {
2276 $maxname = $len;
2280 return $maxname;
2283 =head2 max_metaname_length
2285 Title : max_metaname_length
2286 Usage : $ali->max_metaname_length()
2287 Function : Gets the maximum length of the meta name tags in the
2288 alignment for the sequences and for the alignment.
2289 Used in writing out various MSA formats.
2290 Returns : integer
2291 Argument : None
2293 =cut
2295 sub max_metaname_length {
2296 my $self = shift;
2297 my $maxname = (-1);
2298 my ($seq,$len);
2300 # check seq meta first
2301 for $seq ( $self->each_seq() ) {
2302 next if !$seq->isa('Bio::Seq::MetaI' || !$seq->meta_names);
2303 for my $mtag ($seq->meta_names) {
2304 $len = CORE::length $mtag;
2305 if( $len > $maxname ) {
2306 $maxname = $len;
2311 # alignment meta
2312 for my $meta ($self->consensus_meta) {
2313 next unless $meta;
2314 for my $name ($meta->meta_names) {
2315 $len = CORE::length $name;
2316 if( $len > $maxname ) {
2317 $maxname = $len;
2322 return $maxname;
2325 =head2 no_residues
2327 Title : no_residues
2328 Usage : $no = $ali->no_residues
2329 Function : number of residues in total in the alignment
2330 Returns : integer
2331 Argument :
2333 =cut
2335 sub no_residues {
2336 my $self = shift;
2337 my $count = 0;
2339 foreach my $seq ($self->each_seq) {
2340 my $str = $seq->seq();
2342 $count += ($str =~ s/[A-Za-z]//g);
2345 return $count;
2348 =head2 no_sequences
2350 Title : no_sequences
2351 Usage : $depth = $ali->no_sequences
2352 Function : number of sequence in the sequence alignment
2353 Returns : integer
2354 Argument :
2356 =cut
2358 sub no_sequences {
2359 my $self = shift;
2361 return scalar($self->each_seq);
2365 =head2 average_percentage_identity
2367 Title : average_percentage_identity
2368 Usage : $id = $align->average_percentage_identity
2369 Function: The function uses a fast method to calculate the average
2370 percentage identity of the alignment
2371 Returns : The average percentage identity of the alignment
2372 Args : None
2373 Notes : This method implemented by Kevin Howe calculates a figure that is
2374 designed to be similar to the average pairwise identity of the
2375 alignment (identical in the absence of gaps), without having to
2376 explicitly calculate pairwise identities proposed by Richard Durbin.
2377 Validated by Ewan Birney ad Alex Bateman.
2379 =cut
2381 sub average_percentage_identity{
2382 my ($self,@args) = @_;
2384 my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
2385 'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
2387 my ($len, $total, $subtotal, $divisor, $subdivisor, @seqs, @countHashes);
2389 if (! $self->is_flush()) {
2390 $self->throw("All sequences in the alignment must be the same length");
2393 @seqs = $self->each_seq();
2394 $len = $self->length();
2396 # load the each hash with correct keys for existence checks
2398 for( my $index=0; $index < $len; $index++) {
2399 foreach my $letter (@alphabet) {
2400 $countHashes[$index]->{$letter} = 0;
2403 foreach my $seq (@seqs) {
2404 my @seqChars = split //, $seq->seq();
2405 for( my $column=0; $column < @seqChars; $column++ ) {
2406 my $char = uc($seqChars[$column]);
2407 if (exists $countHashes[$column]->{$char}) {
2408 $countHashes[$column]->{$char}++;
2413 $total = 0;
2414 $divisor = 0;
2415 for(my $column =0; $column < $len; $column++) {
2416 my %hash = %{$countHashes[$column]};
2417 $subdivisor = 0;
2418 foreach my $res (keys %hash) {
2419 $total += $hash{$res}*($hash{$res} - 1);
2420 $subdivisor += $hash{$res};
2422 $divisor += $subdivisor * ($subdivisor - 1);
2424 return $divisor > 0 ? ($total / $divisor )*100.0 : 0;
2427 =head2 percentage_identity
2429 Title : percentage_identity
2430 Usage : $id = $align->percentage_identity
2431 Function: The function calculates the average percentage identity
2432 (aliased to average_percentage_identity)
2433 Returns : The average percentage identity
2434 Args : None
2436 =cut
2438 sub percentage_identity {
2439 my $self = shift;
2440 return $self->average_percentage_identity();
2443 =head2 overall_percentage_identity
2445 Title : overall_percentage_identity
2446 Usage : $id = $align->overall_percentage_identity
2447 $id = $align->overall_percentage_identity('short')
2448 Function: The function calculates the percentage identity of
2449 the conserved columns
2450 Returns : The percentage identity of the conserved columns
2451 Args : length value to use, optional defaults to alignment length
2452 possible values: 'align', 'short', 'long'
2454 The argument values 'short' and 'long' refer to shortest and longest
2455 sequence in the alignment. Method modification code by Hongyu Zhang.
2457 =cut
2459 sub overall_percentage_identity{
2460 my ($self, $length_measure) = @_;
2462 my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
2463 'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
2465 my ($len, $total, @seqs, @countHashes);
2467 my %enum = map {$_ => 1} qw (align short long);
2469 $self->throw("Unknown argument [$length_measure]")
2470 if $length_measure and not $enum{$length_measure};
2471 $length_measure ||= 'align';
2473 if (! $self->is_flush()) {
2474 $self->throw("All sequences in the alignment must be the same length");
2477 @seqs = $self->each_seq();
2478 $len = $self->length();
2480 # load the each hash with correct keys for existence checks
2481 for( my $index=0; $index < $len; $index++) {
2482 foreach my $letter (@alphabet) {
2483 $countHashes[$index]->{$letter} = 0;
2486 foreach my $seq (@seqs) {
2487 my @seqChars = split //, $seq->seq();
2488 for( my $column=0; $column < @seqChars; $column++ ) {
2489 my $char = uc($seqChars[$column]);
2490 if (exists $countHashes[$column]->{$char}) {
2491 $countHashes[$column]->{$char}++;
2496 $total = 0;
2497 for(my $column =0; $column < $len; $column++) {
2498 my %hash = %{$countHashes[$column]};
2499 foreach ( values %hash ) {
2500 next if( $_ == 0 );
2501 $total++ if( $_ == scalar @seqs );
2502 last;
2506 if ($length_measure eq 'short') {
2507 ## find the shortest length
2508 $len = 0;
2509 foreach my $seq ($self->each_seq) {
2510 my $count = $seq->seq =~ tr/[A-Za-z]//;
2511 if ($len) {
2512 $len = $count if $count < $len;
2513 } else {
2514 $len = $count;
2518 elsif ($length_measure eq 'long') {
2519 ## find the longest length
2520 $len = 0;
2521 foreach my $seq ($self->each_seq) {
2522 my $count = $seq->seq =~ tr/[A-Za-z]//;
2523 if ($len) {
2524 $len = $count if $count > $len;
2525 } else {
2526 $len = $count;
2531 return ($total / $len ) * 100.0;
2536 =head1 Alignment positions
2538 Methods to map a sequence position into an alignment column and back.
2539 column_from_residue_number() does the former. The latter is really a
2540 property of the sequence object and can done using
2541 L<Bio::LocatableSeq::location_from_column>:
2543 # select somehow a sequence from the alignment, e.g.
2544 my $seq = $aln->get_seq_by_pos(1);
2545 #$loc is undef or Bio::LocationI object
2546 my $loc = $seq->location_from_column(5);
2548 =head2 column_from_residue_number
2550 Title : column_from_residue_number
2551 Usage : $col = $ali->column_from_residue_number( $seqname, $resnumber)
2552 Function: This function gives the position in the alignment
2553 (i.e. column number) of the given residue number in the
2554 sequence with the given name. For example, for the
2555 alignment
2557 Seq1/91-97 AC..DEF.GH.
2558 Seq2/24-30 ACGG.RTY...
2559 Seq3/43-51 AC.DDEF.GHI
2561 column_from_residue_number( "Seq1", 94 ) returns 6.
2562 column_from_residue_number( "Seq2", 25 ) returns 2.
2563 column_from_residue_number( "Seq3", 50 ) returns 10.
2565 An exception is thrown if the residue number would lie
2566 outside the length of the aligment
2567 (e.g. column_from_residue_number( "Seq2", 22 )
2569 Note: If the the parent sequence is represented by more than
2570 one alignment sequence and the residue number is present in
2571 them, this method finds only the first one.
2573 Returns : A column number for the position in the alignment of the
2574 given residue in the given sequence (1 = first column)
2575 Args : A sequence id/name (not a name/start-end)
2576 A residue number in the whole sequence (not just that
2577 segment of it in the alignment)
2579 =cut
2581 sub column_from_residue_number {
2582 my ($self, $name, $resnumber) = @_;
2584 $self->throw("No sequence with name [$name]") unless $self->{'_start_end_lists'}->{$name};
2585 $self->throw("Second argument residue number missing") unless $resnumber;
2587 foreach my $seq ($self->each_seq_with_id($name)) {
2588 my $col;
2589 eval {
2590 $col = $seq->column_from_residue_number($resnumber);
2592 next if $@;
2593 return $col;
2596 $self->throw("Could not find a sequence segment in $name ".
2597 "containing residue number $resnumber");
2601 =head1 Sequence names
2603 Methods to manipulate the display name. The default name based on the
2604 sequence id and subsequence positions can be overridden in various
2605 ways.
2607 =head2 displayname
2609 Title : displayname
2610 Usage : $myalign->displayname("Ig", "IgA")
2611 Function : Gets/sets the display name of a sequence in the alignment
2612 Returns : A display name string
2613 Argument : name of the sequence
2614 displayname of the sequence (optional)
2616 =cut
2618 sub get_displayname {
2619 my $self = shift;
2620 $self->deprecated("get_displayname - deprecated method. Use displayname() instead.");
2621 $self->displayname(@_);
2624 sub set_displayname {
2625 my $self = shift;
2626 $self->deprecated("set_displayname - deprecated method. Use displayname() instead.");
2627 $self->displayname(@_);
2630 sub displayname {
2631 my ($self, $name, $disname) = @_;
2633 $self->throw("No sequence with name [$name]")
2634 unless defined $self->{'_seq'}->{$name};
2636 if( $disname and $name) {
2637 $self->{'_dis_name'}->{$name} = $disname;
2638 return $disname;
2640 elsif( defined $self->{'_dis_name'}->{$name} ) {
2641 return $self->{'_dis_name'}->{$name};
2642 } else {
2643 return $name;
2647 =head2 set_displayname_count
2649 Title : set_displayname_count
2650 Usage : $ali->set_displayname_count
2651 Function : Sets the names to be name_# where # is the number of
2652 times this name has been used.
2653 Returns : 1, on success
2654 Argument :
2656 =cut
2658 sub set_displayname_count {
2659 my $self= shift;
2660 my (@arr,$name,$seq,$count,$temp,$nse);
2662 foreach $seq ( $self->each_alphabetically() ) {
2663 $nse = $seq->get_nse();
2665 #name will be set when this is the second
2666 #time (or greater) is has been seen
2668 if( defined $name and $name eq ($seq->id()) ) {
2669 $temp = sprintf("%s_%s",$name,$count);
2670 $self->displayname($nse,$temp);
2671 $count++;
2672 } else {
2673 $count = 1;
2674 $name = $seq->id();
2675 $temp = sprintf("%s_%s",$name,$count);
2676 $self->displayname($nse,$temp);
2677 $count++;
2680 return 1;
2683 =head2 set_displayname_flat
2685 Title : set_displayname_flat
2686 Usage : $ali->set_displayname_flat()
2687 Function : Makes all the sequences be displayed as just their name,
2688 not name/start-end
2689 Returns : 1
2690 Argument :
2692 =cut
2694 sub set_displayname_flat {
2695 my $self = shift;
2696 my ($nse,$seq);
2698 foreach $seq ( $self->each_seq() ) {
2699 $nse = $seq->get_nse();
2700 $self->displayname($nse,$seq->id());
2702 return 1;
2705 =head2 set_displayname_normal
2707 Title : set_displayname_normal
2708 Usage : $ali->set_displayname_normal()
2709 Function : Makes all the sequences be displayed as name/start-end
2710 Returns : 1, on success
2711 Argument :
2713 =cut
2715 sub set_displayname_normal {
2716 my $self = shift;
2717 my ($nse,$seq);
2719 foreach $seq ( $self->each_seq() ) {
2720 $nse = $seq->get_nse();
2721 $self->displayname($nse,$nse);
2723 return 1;
2726 =head2 source
2728 Title : source
2729 Usage : $obj->source($newval)
2730 Function: sets the Alignment source program
2731 Example :
2732 Returns : value of source
2733 Args : newvalue (optional)
2736 =cut
2738 sub source{
2739 my ($self,$value) = @_;
2740 if( defined $value) {
2741 $self->{'_source'} = $value;
2743 return $self->{'_source'};
2746 =head2 set_displayname_safe
2748 Title : set_displayname_safe
2749 Usage : ($new_aln, $ref_name)=$ali->set_displayname_safe(4)
2750 Function : Assign machine-generated serial names to sequences in input order.
2751 Designed to protect names during PHYLIP runs. Assign 10-char string
2752 in the form of "S000000001" to "S999999999". Restore the original
2753 names using "restore_displayname".
2754 Returns : 1. a new $aln with system names;
2755 2. a hash ref for restoring names
2756 Argument : Number for id length (default 10)
2758 =cut
2760 sub set_displayname_safe {
2761 my $self = shift;
2762 my $idlength = shift || 10;
2763 my ($seq, %phylip_name);
2764 my $ct=0;
2765 my $new=Bio::SimpleAlign->new();
2766 foreach $seq ( $self->each_seq() ) {
2767 $ct++;
2768 my $pname="S". sprintf "%0" . ($idlength-1) . "s", $ct;
2769 $phylip_name{$pname}=$seq->id();
2770 my $new_seq= Bio::LocatableSeq->new(-id => $pname,
2771 -seq => $seq->seq(),
2772 -alphabet => $seq->alphabet,
2773 -start => $seq->{_start},
2774 -end => $seq->{_end}
2776 $new->add_seq($new_seq);
2779 $self->debug("$ct seq names changed. Restore names by using restore_displayname.");
2780 return ($new, \%phylip_name);
2783 =head2 restore_displayname
2785 Title : restore_displayname
2786 Usage : $aln_name_restored=$ali->restore_displayname($hash_ref)
2787 Function : Restore original sequence names (after running
2788 $ali->set_displayname_safe)
2789 Returns : a new $aln with names restored.
2790 Argument : a hash reference of names from "set_displayname_safe".
2792 =cut
2794 sub restore_displayname {
2795 my $self = shift;
2796 my $ref=shift;
2797 my %name=%$ref;
2798 my $new=Bio::SimpleAlign->new();
2799 foreach my $seq ( $self->each_seq() ) {
2800 $self->throw("No sequence with name") unless defined $name{$seq->id()};
2801 my $new_seq= Bio::LocatableSeq->new(-id => $name{$seq->id()},
2802 -seq => $seq->seq(),
2803 -alphabet => $seq->alphabet,
2804 -start => $seq->{_start},
2805 -end => $seq->{_end}
2807 $new->add_seq($new_seq);
2809 return $new;
2812 =head2 sort_by_start
2814 Title : sort_by_start
2815 Usage : $ali->sort_by_start
2816 Function : Changes the order of the alignment to the start position of each
2817 subalignment
2818 Returns :
2819 Argument :
2821 =cut
2823 sub sort_by_start {
2824 my $self = shift;
2825 my ($seq,$nse,@arr,%hash,$count);
2826 foreach $seq ( $self->each_seq() ) {
2827 $nse = $seq->get_nse;
2828 $hash{$nse} = $seq;
2830 $count = 0;
2831 %{$self->{'_order'}} = (); # reset the hash;
2832 foreach $nse ( sort _startend keys %hash) {
2833 $self->{'_order'}->{$count} = $nse;
2834 $count++;
2839 sub _startend
2841 my ($aname,$arange) = split (/[\/]/,$a);
2842 my ($bname,$brange) = split (/[\/]/,$b);
2843 my ($astart,$aend) = split(/\-/,$arange);
2844 my ($bstart,$bend) = split(/\-/,$brange);
2845 return $astart <=> $bstart;
2848 =head2 bracket_string
2850 Title : bracket_string
2851 Usage : my @params = (-refseq => 'testseq',
2852 -allele1 => 'allele1',
2853 -allele2 => 'allele2',
2854 -delimiters => '{}',
2855 -separator => '/');
2856 $str = $aln->bracket_string(@params)
2858 Function : When supplied with a list of parameters (see below), returns a
2859 string in BIC format. This is used for allelic comparisons.
2860 Briefly, if either allele contains a base change when compared to
2861 the refseq, the base or gap for each allele is represented in
2862 brackets in the order present in the 'alleles' parameter.
2864 For the following data:
2866 >testseq
2867 GGATCCATTGCTACT
2868 >allele1
2869 GGATCCATTCCTACT
2870 >allele2
2871 GGAT--ATTCCTCCT
2873 the returned string with parameters 'refseq => testseq' and
2874 'alleles => [qw(allele1 allele2)]' would be:
2876 GGAT[C/-][C/-]ATT[C/C]CT[A/C]CT
2877 Returns : BIC-formatted string
2878 Argument : Required args
2879 refseq : string (ID) of the reference sequence used
2880 as basis for comparison
2881 allele1 : string (ID) of the first allele
2882 allele2 : string (ID) of the second allele
2883 Optional args
2884 delimiters: two symbol string of left and right delimiters.
2885 Only the first two symbols are used
2886 default = '[]'
2887 separator : string used as a separator. Only the first
2888 symbol is used
2889 default = '/'
2890 Throws : On no refseq/alleles, or invalid refseq/alleles.
2892 =cut
2894 sub bracket_string {
2895 my ($self, @args) = @_;
2896 my ($ref, $a1, $a2, $delim, $sep) =
2897 $self->_rearrange([qw(refseq allele1 allele2 delimiters separator)], @args);
2898 $self->throw('Missing refseq/allele1/allele2') if (!$a1 || !$a2 || !$ref);
2899 my ($ld, $rd);
2900 ($ld, $rd) = split('', $delim, 2) if $delim;
2901 $ld ||= '[';
2902 $rd ||= ']';
2903 $sep ||= '/';
2904 my ($refseq, $allele1, $allele2) =
2905 map {( $self->each_seq_with_id($_) )} ($ref, $a1, $a2);
2906 if (!$refseq || !$allele1 || !$allele2) {
2907 $self->throw("One of your refseq/allele IDs is invalid!");
2909 my $len = $self->length-1;
2910 my $bic = '';
2911 # loop over the alignment columns
2912 for my $column ( 0 .. $len ) {
2913 my $string;
2914 my ($compres, $res1, $res2) =
2915 map{substr($_->seq, $column, 1)} ($refseq, $allele1, $allele2);
2916 # are any of the allele symbols different from the refseq?
2917 $string = ($compres eq $res1 && $compres eq $res2) ? $compres :
2918 $ld.$res1.$sep.$res2.$rd;
2919 $bic .= $string;
2921 return $bic;
2925 =head2 methods for Bio::FeatureHolder
2927 FeatureHolder implementation to support labeled character sets like one
2928 would get from NEXUS represented data.
2930 =head2 get_SeqFeatures
2932 Usage :
2933 Function: Get the feature objects held by this feature holder.
2934 Example :
2935 Returns : an array of Bio::SeqFeatureI implementing objects
2936 Args : none
2938 At some day we may want to expand this method to allow for a feature
2939 filter to be passed in.
2941 =cut
2943 sub get_SeqFeatures {
2944 my $self = shift;
2946 if( !defined $self->{'_as_feat'} ) {
2947 $self->{'_as_feat'} = [];
2949 return @{$self->{'_as_feat'}};
2952 =head2 add_SeqFeature
2954 Usage : $feat->add_SeqFeature($subfeat);
2955 $feat->add_SeqFeature($subfeat,'EXPAND')
2956 Function: adds a SeqFeature into the subSeqFeature array.
2957 with no 'EXPAND' qualifer, subfeat will be tested
2958 as to whether it lies inside the parent, and throw
2959 an exception if not.
2961 If EXPAND is used, the parent''s start/end/strand will
2962 be adjusted so that it grows to accommodate the new
2963 subFeature
2964 Example :
2965 Returns : nothing
2966 Args : a Bio::SeqFeatureI object
2968 =cut
2970 sub add_SeqFeature {
2971 my ($self,@feat) = @_;
2973 $self->{'_as_feat'} = [] unless $self->{'_as_feat'};
2975 foreach my $feat ( @feat ) {
2976 if( !$feat->isa("Bio::SeqFeatureI") ) {
2977 $self->throw("$feat is not a SeqFeatureI and that's what we expect...");
2980 push(@{$self->{'_as_feat'}},$feat);
2982 return 1;
2986 =head2 remove_SeqFeatures
2988 Usage : $obj->remove_SeqFeatures
2989 Function: Removes all sub SeqFeatures. If you want to remove only a subset,
2990 remove that subset from the returned array, and add back the rest.
2991 Returns : The array of Bio::SeqFeatureI implementing sub-features that was
2992 deleted from this feature.
2993 Args : none
2995 =cut
2997 sub remove_SeqFeatures {
2998 my $self = shift;
3000 return () unless $self->{'_as_feat'};
3001 my @feats = @{$self->{'_as_feat'}};
3002 $self->{'_as_feat'} = [];
3003 return @feats;
3006 =head2 feature_count
3008 Title : feature_count
3009 Usage : $obj->feature_count()
3010 Function: Return the number of SeqFeatures attached to a feature holder.
3012 This is before flattening a possible sub-feature tree.
3014 We provide a default implementation here that just counts
3015 the number of objects returned by get_SeqFeatures().
3016 Implementors may want to override this with a more
3017 efficient implementation.
3019 Returns : integer representing the number of SeqFeatures
3020 Args : None
3022 At some day we may want to expand this method to allow for a feature
3023 filter to be passed in.
3025 Our default implementation allows for any number of additional
3026 arguments and will pass them on to get_SeqFeatures(). I.e., in order to
3027 support filter arguments, just support them in get_SeqFeatures().
3029 =cut
3031 sub feature_count {
3032 my ($self) = @_;
3034 if (defined($self->{'_as_feat'})) {
3035 return ($#{$self->{'_as_feat'}} + 1);
3036 } else {
3037 return 0;
3041 =head2 get_all_SeqFeatures
3043 Title : get_all_SeqFeatures
3044 Usage :
3045 Function: Get the flattened tree of feature objects held by this
3046 feature holder. The difference to get_SeqFeatures is that
3047 the entire tree of sub-features will be flattened out.
3049 We provide a default implementation here, so implementors
3050 don''t necessarily need to implement this method.
3052 Example :
3053 Returns : an array of Bio::SeqFeatureI implementing objects
3054 Args : none
3056 At some day we may want to expand this method to allow for a feature
3057 filter to be passed in.
3059 Our default implementation allows for any number of additional
3060 arguments and will pass them on to any invocation of
3061 get_SeqFeatures(), wherever a component of the tree implements
3062 FeatureHolderI. I.e., in order to support filter arguments, just
3063 support them in get_SeqFeatures().
3065 =cut
3067 =head2 methods for Bio::AnnotatableI
3069 AnnotatableI implementation to support sequence alignments which
3070 contain annotation (NEXUS, Stockholm).
3072 =head2 annotation
3074 Title : annotation
3075 Usage : $ann = $aln->annotation or
3076 $aln->annotation($ann)
3077 Function: Gets or sets the annotation
3078 Returns : Bio::AnnotationCollectionI object
3079 Args : None or Bio::AnnotationCollectionI object
3081 See L<Bio::AnnotationCollectionI> and L<Bio::Annotation::Collection>
3082 for more information
3084 =cut
3086 sub annotation {
3087 my ($obj,$value) = @_;
3088 if( defined $value ) {
3089 $obj->throw("object of class ".ref($value)." does not implement ".
3090 "Bio::AnnotationCollectionI. Too bad.")
3091 unless $value->isa("Bio::AnnotationCollectionI");
3092 $obj->{'_annotation'} = $value;
3093 } elsif( ! defined $obj->{'_annotation'}) {
3094 $obj->{'_annotation'} = Bio::Annotation::Collection->new();
3096 return $obj->{'_annotation'};