* seq_inds is not defined for Model-based HSPs
[bioperl-live.git] / Bio / SimpleAlign.pm
blobc7a7eaeaf3cd202be32610333a2218303c291ad5
1 # $Id$
2 # BioPerl module for SimpleAlign
4 # Cared for by Heikki Lehvaslaiho <heikki-at-bioperl-dot-org>
6 # Copyright Ewan Birney
8 # You may distribute this module under the same terms as perl itself
10 # POD documentation - main docs before the code
12 # History:
13 # 11/3/00 Added threshold feature to consensus and consensus_aa - PS
14 # May 2001 major rewrite - Heikki Lehvaslaiho
16 =head1 NAME
18 Bio::SimpleAlign - Multiple alignments held as a set of sequences
20 =head1 SYNOPSIS
22 # Use Bio::AlignIO to read in the alignment
23 $str = Bio::AlignIO->new(-file => 't/data/testaln.pfam');
24 $aln = $str->next_aln();
26 # Describe
27 print $aln->length;
28 print $aln->no_residues;
29 print $aln->is_flush;
30 print $aln->no_sequences;
31 print $aln->score;
32 print $aln->percentage_identity;
33 print $aln->consensus_string(50);
35 # Find the position in the alignment for a sequence location
36 $pos = $aln->column_from_residue_number('1433_LYCES', 14); # = 6;
38 # Extract sequences and check values for the alignment column $pos
39 foreach $seq ($aln->each_seq) {
40 $res = $seq->subseq($pos, $pos);
41 $count{$res}++;
43 foreach $res (keys %count) {
44 printf "Res: %s Count: %2d\n", $res, $count{$res};
47 # Manipulate
48 $aln->remove_seq($seq);
49 $mini_aln = $aln->slice(20,30); # get a block of columns
50 $mini_aln = $aln->select_noncont(1,3,5,7,11); # get single columns
51 $new_aln = $aln->remove_columns([20,30]); # remove by position
52 $new_aln = $aln->remove_columns(['mismatch']); # remove by property
54 # Analyze
55 $str = $aln->consensus_string($threshold_percent);
56 $str = $aln->match_line();
57 $str = $aln->cigar_line()
58 $id = $aln->percentage_identity;
60 See the module documentation for details and more methods.
62 =head1 DESCRIPTION
64 SimpleAlign is an object that handles a multiple sequence alignment
65 (MSA). It is very permissive of types (it does not insist on sequences
66 being all same length, for example). Think of it as a set of sequences
67 with a whole series of built-in manipulations and methods for reading and
68 writing alignments.
70 SimpleAlign uses L<Bio::LocatableSeq>, a subclass of L<Bio::PrimarySeq>,
71 to store its sequences. These are subsequences with a start and end
72 positions in the parent reference sequence. Each sequence in the
73 SimpleAlign object is a Bio::LocatableSeq.
75 SimpleAlign expects the combination of name, start, and end for a
76 given sequence to be unique in the alignment, and this is the key for the
77 internal hashes (name, start, end are abbreviated C<nse> in the code).
78 However, in some cases people do not want the name/start-end to be displayed:
79 either multiple names in an alignment or names specific to the alignment
80 (ROA1_HUMAN_1, ROA1_HUMAN_2 etc). These names are called
81 C<displayname>, and generally is what is used to print out the
82 alignment. They default to name/start-end.
84 The SimpleAlign Module is derived from the Align module by Ewan Birney.
86 =head1 FEEDBACK
88 =head2 Mailing Lists
90 User feedback is an integral part of the evolution of this and other
91 Bioperl modules. Send your comments and suggestions preferably to one
92 of the Bioperl mailing lists. Your participation is much appreciated.
94 bioperl-l@bioperl.org - General discussion
95 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
97 =head2 Reporting Bugs
99 Report bugs to the Bioperl bug tracking system to help us keep track
100 the bugs and their resolution. Bug reports can be submitted via the
101 web:
103 http://bugzilla.open-bio.org/
105 =head1 AUTHOR
107 Ewan Birney, birney@ebi.ac.uk
109 =head1 CONTRIBUTORS
111 Allen Day, allenday-at-ucla.edu,
112 Richard Adams, Richard.Adams-at-ed.ac.uk,
113 David J. Evans, David.Evans-at-vir.gla.ac.uk,
114 Heikki Lehvaslaiho, heikki-at-bioperl-dot-org,
115 Allen Smith, allens-at-cpan.org,
116 Jason Stajich, jason-at-bioperl.org,
117 Anthony Underwood, aunderwood-at-phls.org.uk,
118 Xintao Wei & Giri Narasimhan, giri-at-cs.fiu.edu
119 Brian Osborne, bosborne at alum.mit.edu
120 Weigang Qiu, Weigang at GENECTR-HUNTER-CUNY-EDU
121 Hongyu Zhang, forward at hongyu.org
123 =head1 SEE ALSO
125 L<Bio::LocatableSeq>
127 =head1 APPENDIX
129 The rest of the documentation details each of the object
130 methods. Internal methods are usually preceded with a _
132 =cut
134 # 'Let the code begin...
136 package Bio::SimpleAlign;
137 use vars qw(%CONSERVATION_GROUPS);
138 use strict;
140 use Bio::LocatableSeq; # uses Seq's as list
142 use Bio::Seq;
143 use Bio::SeqFeature::Generic;
145 BEGIN {
146 # This data should probably be in a more centralized module...
147 # it is taken from Clustalw documentation.
148 # These are all the positively scoring groups that occur in the
149 # Gonnet Pam250 matrix. The strong and weak groups are
150 # defined as strong score >0.5 and weak score =<0.5 respectively.
152 %CONSERVATION_GROUPS = (
153 'strong' => [ qw(
155 NEQK
156 NHQK
157 NDEQ
158 QHRK
159 MILV
160 MILF
162 FYW )],
163 'weak' => [ qw(
167 STNK
168 STPA
169 SGND
170 SNDEQK
171 NDEQHK
172 NEQHRK
173 FVLIM
174 HFY )],);
177 use base qw(Bio::Root::Root Bio::Align::AlignI Bio::AnnotatableI
178 Bio::FeatureHolderI);
180 =head2 new
182 Title : new
183 Usage : my $aln = Bio::SimpleAlign->new();
184 Function : Creates a new simple align object
185 Returns : Bio::SimpleAlign
186 Args : -source => string representing the source program
187 where this alignment came from
189 =cut
192 sub new {
193 my($class,@args) = @_;
195 my $self = $class->SUPER::new(@args);
197 my ($src,$score,$id) = $self->_rearrange([qw(SOURCE SCORE ID)], @args);
198 $src && $self->source($src);
199 defined $score && $self->score($score);
200 # we need to set up internal hashs first!
202 $self->{'_seq'} = {};
203 $self->{'_order'} = {};
204 $self->{'_start_end_lists'} = {};
205 $self->{'_dis_name'} = {};
206 $self->{'_id'} = 'NoName';
207 # maybe we should automatically read in from args. Hmmm...
208 $id && $self->id($id);
210 return $self; # success - we hope!
213 =head1 Modifier methods
215 These methods modify the MSA by adding, removing or shuffling complete
216 sequences.
218 =head2 add_seq
220 Title : add_seq
221 Usage : $myalign->add_seq($newseq);
222 Function : Adds another sequence to the alignment. *Does not* align
223 it - just adds it to the hashes.
224 Returns : nothing
225 Args : a Bio::LocatableSeq object
226 order (optional)
228 See L<Bio::LocatableSeq> for more information
230 =cut
232 sub addSeq {
233 my $self = shift;
234 $self->deprecated("addSeq - deprecated method. Use add_seq() instead.");
235 $self->add_seq(@_);
238 sub add_seq {
239 my $self = shift;
240 my $seq = shift;
241 my $order = shift;
242 my ($name,$id,$start,$end);
244 if( ! ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
245 $self->throw("Unable to process non locatable sequences [". ref($seq). "]");
248 $id = $seq->id() ||$seq->display_id || $seq->primary_id;
249 $start = $seq->start();
250 $end = $seq->end();
252 # build the symbol list for this sequence,
253 # will prune out the gap and missing/match chars
254 # when actually asked for the symbol list in the
255 # symbol_chars
256 # map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq) if $seq->seq;
258 if( !defined $order ) {
259 $order = keys %{$self->{'_seq'}};
261 $name = sprintf("%s/%d-%d",$id,$start,$end);
263 if( $self->{'_seq'}->{$name} ) {
264 $self->warn("Replacing one sequence [$name]\n") unless $self->verbose < 0;
266 else {
267 $self->debug( "Assigning $name to $order\n");
269 $self->{'_order'}->{$order} = $name;
271 unless( exists( $self->{'_start_end_lists'}->{$id})) {
272 $self->{'_start_end_lists'}->{$id} = [];
274 push @{$self->{'_start_end_lists'}->{$id}}, $seq;
277 $self->{'_seq'}->{$name} = $seq;
282 =head2 remove_seq
284 Title : remove_seq
285 Usage : $aln->remove_seq($seq);
286 Function : Removes a single sequence from an alignment
287 Returns :
288 Argument : a Bio::LocatableSeq object
290 =cut
292 sub removeSeq {
293 my $self = shift;
294 $self->deprecated("removeSeq - deprecated method. Use remove_seq() instead.");
295 $self->remove_seq(@_);
298 sub remove_seq {
299 my $self = shift;
300 my $seq = shift;
301 my ($name,$id,$start,$end);
303 $self->throw("Need Bio::Locatable seq argument ")
304 unless ref $seq && $seq->isa( 'Bio::LocatableSeq');
306 $id = $seq->id();
307 $start = $seq->start();
308 $end = $seq->end();
309 $name = sprintf("%s/%d-%d",$id,$start,$end);
311 if( !exists $self->{'_seq'}->{$name} ) {
312 $self->throw("Sequence $name does not exist in the alignment to remove!");
315 delete $self->{'_seq'}->{$name};
317 # we need to remove this seq from the start_end_lists hash
319 if (exists $self->{'_start_end_lists'}->{$id}) {
320 # we need to find the sequence in the array.
322 my ($i, $found);;
323 for ($i=0; $i < @{$self->{'_start_end_lists'}->{$id}}; $i++) {
324 if (${$self->{'_start_end_lists'}->{$id}}[$i] eq $seq) {
325 $found = 1;
326 last;
329 if ($found) {
330 splice @{$self->{'_start_end_lists'}->{$id}}, $i, 1;
332 else {
333 $self->throw("Could not find the sequence to remoce from the start-end list");
336 else {
337 $self->throw("There is no seq list for the name $id");
339 # we need to shift order hash
340 my %rev_order = reverse %{$self->{'_order'}};
341 my $no = $rev_order{$name};
342 my $no_sequences = $self->no_sequences;
343 for (; $no < $no_sequences; $no++) {
344 $self->{'_order'}->{$no} = $self->{'_order'}->{$no+1};
346 delete $self->{'_order'}->{$no};
347 return 1;
351 =head2 purge
353 Title : purge
354 Usage : $aln->purge(0.7);
355 Function: Removes sequences above given sequence similarity
356 This function will grind on large alignments. Beware!
357 Example :
358 Returns : An array of the removed sequences
359 Args : float, threshold for similarity
361 =cut
363 sub purge {
364 my ($self,$perc) = @_;
365 my (%duplicate, @dups);
367 my @seqs = $self->each_seq();
369 for (my $i=0;$i< @seqs - 1;$i++ ) { #for each seq in alignment
370 my $seq = $seqs[$i];
372 #skip if already in duplicate hash
373 next if exists $duplicate{$seq->display_id} ;
374 my $one = $seq->seq();
376 my @one = split '', $one; #split to get 1aa per array element
378 for (my $j=$i+1;$j < @seqs;$j++) {
379 my $seq2 = $seqs[$j];
381 #skip if already in duplicate hash
382 next if exists $duplicate{$seq2->display_id} ;
384 my $two = $seq2->seq();
385 my @two = split '', $two;
387 my $count = 0;
388 my $res = 0;
389 for (my $k=0;$k<@one;$k++) {
390 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
391 $one[$k] eq $two[$k]) {
392 $count++;
394 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
395 $two[$k] ne '.' && $two[$k] ne '-' ) {
396 $res++;
400 my $ratio = 0;
401 $ratio = $count/$res unless $res == 0;
403 # if above threshold put in duplicate hash and push onto
404 # duplicate array for returning to get_unique
405 if ( $ratio > $perc ) {
406 $self->warn("duplicate: ", $seq2->display_id) if $self->verbose > 0;
407 $duplicate{$seq2->display_id} = 1;
408 push @dups, $seq2;
412 foreach my $seq (@dups) {
413 $self->remove_seq($seq);
415 return @dups;
418 =head2 sort_alphabetically
420 Title : sort_alphabetically
421 Usage : $ali->sort_alphabetically
422 Function : Changes the order of the alignment to alphabetical on name
423 followed by numerical by number.
424 Returns :
425 Argument :
427 =cut
429 sub sort_alphabetically {
430 my $self = shift;
431 my ($seq,$nse,@arr,%hash,$count);
433 foreach $seq ( $self->each_seq() ) {
434 $nse = $seq->get_nse;
435 $hash{$nse} = $seq;
438 $count = 0;
440 %{$self->{'_order'}} = (); # reset the hash;
442 foreach $nse ( sort _alpha_startend keys %hash) {
443 $self->{'_order'}->{$count} = $nse;
445 $count++;
450 =head2 sort_by_list
452 Title : sort_by_list
453 Usage : $aln_ordered=$aln->sort_by_list($list_file)
454 Function : Arbitrarily order sequences in an alignment
455 Returns : A new Bio::SimpleAlign object
456 Argument : a file listing sequence names in intended order (one name per line)
458 =cut
460 sub sort_by_list {
461 my ($self, $list) = @_;
462 my (@seq, @ids, %order);
464 foreach my $seq ( $self->each_seq() ) {
465 push @seq, $seq;
466 push @ids, $seq->display_id;
469 my $ct=1;
470 open(my $listfh, '<', $list) || $self->throw("can't open file for reading: $list");
471 while (<$listfh>) {
472 chomp;
473 my $name=$_;
474 $self->throw("Not found in alignment: $name") unless &_in_aln($name, \@ids);
475 $order{$name}=$ct++;
477 close($listfh);
479 # use the map-sort-map idiom:
480 my @sorted= map { $_->[1] } sort { $a->[0] <=> $b->[0] } map { [$order{$_->id()}, $_] } @seq;
481 my $aln = $self->new;
482 foreach (@sorted) { $aln->add_seq($_) }
483 return $aln;
486 =head2 set_new_reference
488 Title : set_new_reference
489 Usage : $aln->set_new_reference(3 or 'B31'): Select the 3rd sequence, or
490 the sequence whoes name is "B31" (full, exact, and case-sensitive),
491 as the reference (1st) sequence
492 Function : Change/Set a new reference (i.e., the first) sequence
493 Returns : a new Bio::SimpleAlign object.
494 Throws an exception if designated sequence not found
495 Argument : a positive integer of sequence order, or a sequence name
496 in the original alignment
498 =cut
500 sub set_new_reference {
501 my ($self, $seqid) = @_;
502 my $aln = $self->new;
503 my (@seq, @ids, @new_seq);
504 my $is_num=0;
505 foreach my $seq ( $self->each_seq() ) {
506 push @seq, $seq;
507 push @ids, $seq->display_id;
510 if ($seqid =~ /^\d+$/) { # argument is seq position
511 $is_num=1;
512 $self->throw("The new reference sequence number has to be a positive integer >1 and <= no_sequences ") if ($seqid <= 1 || $seqid > $self->no_sequences);
513 } else { # argument is a seq name
514 $self->throw("The new reference sequence not in alignment ") unless &_in_aln($seqid, \@ids);
517 for (my $i=0; $i<=$#seq; $i++) {
518 my $pos=$i+1;
519 if ( ($is_num && $pos == $seqid) || ($seqid eq $seq[$i]->display_id) ) {
520 unshift @new_seq, $seq[$i];
521 } else {
522 push @new_seq, $seq[$i];
525 foreach (@new_seq) { $aln->add_seq($_); }
526 return $aln;
529 sub _in_aln { # check if input name exists in the alignment
530 my ($str, $ref) = @_;
531 foreach (@$ref) {
532 return 1 if $str eq $_;
534 return 0;
538 =head2 uniq_seq
540 Title : uniq_seq
541 Usage : $aln->uniq_seq(): Remove identical sequences in
542 in the alignment. Ambiguous base ("N", "n") and
543 leading and ending gaps ("-") are NOT counted as
544 differences.
545 Function : Make a new alignment of unique sequence types (STs)
546 Returns : 1. a new Bio::SimpleAlign object (all sequences renamed as "ST")
547 2. ST of each sequence in STDERR
548 Argument : None
550 =cut
552 sub uniq_seq {
553 my ($self, $seqid) = @_;
554 my $aln = $self->new;
555 my (%member, %order, @seq, @uniq_str);
556 my $order=0;
557 my $len = $self->length();
558 foreach my $seq ( $self->each_seq() ) {
559 my $str = $seq->seq();
561 # it's necessary to ignore "n", "N", leading gaps and ending gaps in
562 # comparing two sequence strings
564 # 1st, convert "n", "N" to "?" (for DNA sequence only):
565 $str =~ s/n/\?/gi if $str =~ /^[atcgn-]+$/i;
566 # 2nd, convert leading and ending gaps to "?":
567 $str = &_convert_leading_ending_gaps($str, '-', '?');
568 my $new = Bio::LocatableSeq->new(-id => $seq->id(),
569 -alphabet=> $seq->alphabet,
570 -seq => $str,
571 -start => $seq->start,
572 -end => $seq->end
574 push @seq, $new;
577 foreach my $seq (@seq) {
578 my $str = $seq->seq();
579 my ($seen, $key) = &_check_uniq($str, \@uniq_str, $len);
580 if ($seen) { # seen before
581 my @memb = @{$member{$key}};
582 push @memb, $seq;
583 $member{$key} = \@memb;
584 } else { # not seen
585 push @uniq_str, $key;
586 $order++;
587 $member{$key} = [ ($seq) ];
588 $order{$key} = $order;
592 foreach my $str (sort {$order{$a} <=> $order{$b}} keys %order) { # sort by input order
593 # convert leading/ending "?" back into "-" ("?" throws errors by SimpleAlign):
594 my $str2 = &_convert_leading_ending_gaps($str, '?', '-');
595 # convert middle "?" back into "N" ("?" throws errors by SimpleAlign):
596 $str2 =~ s/\?/N/g if $str2 =~ /^[atcg\-\?]+$/i;
597 my $gap='-';
598 my $end=length($str2);
599 $end -= length($1) while $str2 =~ m/($gap+)/g;
600 my $new = Bio::LocatableSeq->new(-id =>"ST".$order{$str},
601 -seq =>$str2,
602 -start=>1,
603 -end =>$end
605 $aln->add_seq($new);
606 # print STDERR "ST".$order{$str}, "\t=>";
607 foreach (@{$member{$str}}) {
608 $self->debug($_->id(), "\t", "ST", $order{$str}, "\n");
610 # print STDERR "\n";
612 return $aln;
615 sub _check_uniq { # check if same seq exists in the alignment
616 my ($str1, $ref, $length) = @_;
617 my @char1=split //, $str1;
618 my @array=@$ref;
620 return (0, $str1) if @array==0; # not seen (1st sequence)
622 foreach my $str2 (@array) {
623 my $diff=0;
624 my @char2=split //, $str2;
625 for (my $i=0; $i<=$length-1; $i++) {
626 next if $char1[$i] eq '?';
627 next if $char2[$i] eq '?';
628 $diff++ if $char1[$i] ne $char2[$i];
630 return (1, $str2) if $diff == 0; # seen before
633 return (0, $str1); # not seen
636 sub _convert_leading_ending_gaps {
637 my $s=shift;
638 my $sym1=shift;
639 my $sym2=shift;
640 my @array=split //, $s;
641 # convert leading char:
642 for (my $i=0; $i<=$#array; $i++) {
643 ($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
645 # convert ending char:
646 for (my $i = $#array; $i>= 0; $i--) {
647 ($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
649 my $s_new=join '', @array;
650 return $s_new;
653 =head1 Sequence selection methods
655 Methods returning one or more sequences objects.
657 =head2 each_seq
659 Title : each_seq
660 Usage : foreach $seq ( $align->each_seq() )
661 Function : Gets a Seq object from the alignment
662 Returns : Seq object
663 Argument :
665 =cut
667 sub eachSeq {
668 my $self = shift;
669 $self->deprecated("eachSeq - deprecated method. Use each_seq() instead.");
670 $self->each_seq();
673 sub each_seq {
674 my $self = shift;
675 my (@arr,$order);
677 foreach $order ( sort { $a <=> $b } keys %{$self->{'_order'}} ) {
678 if( exists $self->{'_seq'}->{$self->{'_order'}->{$order}} ) {
679 push(@arr,$self->{'_seq'}->{$self->{'_order'}->{$order}});
682 return @arr;
686 =head2 each_alphabetically
688 Title : each_alphabetically
689 Usage : foreach $seq ( $ali->each_alphabetically() )
690 Function : Returns a sequence object, but the objects are returned
691 in alphabetically sorted order.
692 Does not change the order of the alignment.
693 Returns : Seq object
694 Argument :
696 =cut
698 sub each_alphabetically {
699 my $self = shift;
700 my ($seq,$nse,@arr,%hash,$count);
702 foreach $seq ( $self->each_seq() ) {
703 $nse = $seq->get_nse;
704 $hash{$nse} = $seq;
707 foreach $nse ( sort _alpha_startend keys %hash) {
708 push(@arr,$hash{$nse});
710 return @arr;
713 sub _alpha_startend {
714 my ($aname,$astart,$bname,$bstart);
715 ($aname,$astart) = split (/-/,$a);
716 ($bname,$bstart) = split (/-/,$b);
718 if( $aname eq $bname ) {
719 return $astart <=> $bstart;
721 else {
722 return $aname cmp $bname;
726 =head2 each_seq_with_id
728 Title : each_seq_with_id
729 Usage : foreach $seq ( $align->each_seq_with_id() )
730 Function : Gets a Seq objects from the alignment, the contents
731 being those sequences with the given name (there may be
732 more than one)
733 Returns : Seq object
734 Argument : a seq name
736 =cut
738 sub eachSeqWithId {
739 my $self = shift;
740 $self->deprecated("eachSeqWithId - deprecated method. Use each_seq_with_id() instead.");
741 $self->each_seq_with_id(@_);
744 sub each_seq_with_id {
745 my $self = shift;
746 my $id = shift;
748 $self->throw("Method each_seq_with_id needs a sequence name argument")
749 unless defined $id;
751 my (@arr, $seq);
753 if (exists($self->{'_start_end_lists'}->{$id})) {
754 @arr = @{$self->{'_start_end_lists'}->{$id}};
756 return @arr;
759 =head2 get_seq_by_pos
761 Title : get_seq_by_pos
762 Usage : $seq = $aln->get_seq_by_pos(3) # third sequence from the alignment
763 Function : Gets a sequence based on its position in the alignment.
764 Numbering starts from 1. Sequence positions larger than
765 no_sequences() will thow an error.
766 Returns : a Bio::LocatableSeq object
767 Args : positive integer for the sequence position
769 =cut
771 sub get_seq_by_pos {
773 my $self = shift;
774 my ($pos) = @_;
776 $self->throw("Sequence position has to be a positive integer, not [$pos]")
777 unless $pos =~ /^\d+$/ and $pos > 0;
778 $self->throw("No sequence at position [$pos]")
779 unless $pos <= $self->no_sequences ;
781 my $nse = $self->{'_order'}->{--$pos};
782 return $self->{'_seq'}->{$nse};
785 =head2 get_seq_by_id
787 Title : get_seq_by_id
788 Usage : $seq = $aln->get_seq_by_id($name) # seq named $name
789 Function : Gets a sequence based on its name.
790 Sequences that do not exist will warn and return undef
791 Returns : a Bio::LocatableSeq object
792 Args : string for sequence name
794 =cut
796 sub get_seq_by_id {
797 my ($self,$name) = @_;
798 unless( defined $name ) {
799 $self->warn("Must provide a sequence name");
800 return undef;
802 for my $seq ( values %{$self->{'_seq'}} ) {
803 if ( $seq->id eq $name) {
804 return $seq;
807 return undef;
810 =head2 seq_with_features
812 Title : seq_with_features
813 Usage : $seq = $aln->seq_with_features(-pos => 1,
814 -consensus => 60
815 -mask =>
816 sub { my $consensus = shift;
818 for my $i (1..5){
819 my $n = 'N' x $i;
820 my $q = '\?' x $i;
821 while($consensus =~ /[^?]$q[^?]/){
822 $consensus =~ s/([^?])$q([^?])/$1$n$2/;
825 return $consensus;
828 Function: produces a Bio::Seq object by first splicing gaps from -pos
829 (by means of a splice_by_seq_pos() call), then creating
830 features using non-? chars (by means of a consensus_string()
831 call with stringency -consensus).
832 Returns : a Bio::Seq object
833 Args : -pos : required. sequence from which to build the Bio::Seq
834 object
835 -consensus : optional, defaults to consensus_string()'s
836 default cutoff value
837 -mask : optional, a coderef to apply to consensus_string()'s
838 output before building features. this may be useful for
839 closing gaps of 1 bp by masking over them with N, for
840 instance
842 =cut
844 sub seq_with_features{
845 my ($self,%arg) = @_;
847 #first do the preparatory splice
848 $self->throw("must provide a -pos argument") unless $arg{-pos};
849 $self->splice_by_seq_pos($arg{-pos});
851 my $consensus_string = $self->consensus_string($arg{-consensus});
852 $consensus_string = $arg{-mask}->($consensus_string)
853 if defined($arg{-mask});
855 my(@bs,@es);
857 push @bs, 1 if $consensus_string =~ /^[^?]/;
859 while($consensus_string =~ /\?[^?]/g){
860 push @bs, pos($consensus_string);
862 while($consensus_string =~ /[^?]\?/g){
863 push @es, pos($consensus_string);
866 push @es, length($consensus_string) if $consensus_string =~ /[^?]$/;
868 my $seq = Bio::Seq->new();
870 # my $rootfeature = Bio::SeqFeature::Generic->new(
871 # -source_tag => 'location',
872 # -start => $self->get_seq_by_pos($arg{-pos})->start,
873 # -end => $self->get_seq_by_pos($arg{-pos})->end,
874 # );
875 # $seq->add_SeqFeature($rootfeature);
877 while(my $b = shift @bs){
878 my $e = shift @es;
879 $seq->add_SeqFeature(
880 Bio::SeqFeature::Generic->new(
881 -start => $b - 1 + $self->get_seq_by_pos($arg{-pos})->start,
882 -end => $e - 1 + $self->get_seq_by_pos($arg{-pos})->start,
883 -source_tag => $self->source || 'MSA',
888 return $seq;
892 =head1 Create new alignments
894 The result of these methods are horizontal or vertical subsets of the
895 current MSA.
897 =head2 select
899 Title : select
900 Usage : $aln2 = $aln->select(1, 3) # three first sequences
901 Function : Creates a new alignment from a continuous subset of
902 sequences. Numbering starts from 1. Sequence positions
903 larger than no_sequences() will thow an error.
904 Returns : a Bio::SimpleAlign object
905 Args : positive integer for the first sequence
906 positive integer for the last sequence to include (optional)
908 =cut
910 sub select {
911 my $self = shift;
912 my ($start, $end) = @_;
914 $self->throw("Select start has to be a positive integer, not [$start]")
915 unless $start =~ /^\d+$/ and $start > 0;
916 $self->throw("Select end has to be a positive integer, not [$end]")
917 unless $end =~ /^\d+$/ and $end > 0;
918 $self->throw("Select $start [$start] has to be smaller than or equal to end [$end]")
919 unless $start <= $end;
921 my $aln = $self->new;
922 foreach my $pos ($start .. $end) {
923 $aln->add_seq($self->get_seq_by_pos($pos));
925 $aln->id($self->id);
926 # fix for meta, sf, ann
927 return $aln;
930 =head2 select_noncont
932 Title : select_noncont
933 Usage : # 1st and 3rd sequences, sorted
934 $aln2 = $aln->select_noncont(1, 3)
936 # 1st and 3rd sequences, sorted (same as first)
937 $aln2 = $aln->select_noncont(3, 1)
939 # 1st and 3rd sequences, unsorted
940 $aln2 = $aln->select_noncont('nosort',3, 1)
942 Function : Creates a new alignment from a subset of sequences. Numbering
943 starts from 1. Sequence positions larger than no_sequences() will
944 throw an error. Sorts the order added to new alignment by default,
945 to prevent sorting pass 'nosort' as the first argument in the list.
946 Returns : a Bio::SimpleAlign object
947 Args : array of integers for the sequences. If the string 'nosort' is
948 passed as the first argument, the sequences will not be sorted
949 in the new alignment but will appear in the order listed.
951 =cut
953 sub select_noncont {
954 my $self = shift;
955 my $nosort = 0;
956 my (@pos) = @_;
957 if ($pos[0] !~ m{^\d+$}) {
958 my $sortcmd = shift @pos;
959 if ($sortcmd eq 'nosort') {
960 $nosort = 1;
961 } else {
962 $self->throw("Command not recognized: $sortcmd. Only 'nosort' implemented at this time.");
966 my $end = $self->no_sequences;
967 foreach ( @pos ) {
968 $self->throw("position must be a positive integer, > 0 and <= $end not [$_]")
969 unless( /^\d+$/ && $_ > 0 && $_ <= $end );
972 @pos = sort {$a <=> $b} @pos unless $nosort;
974 my $aln = $self->new;
975 foreach my $p (@pos) {
976 $aln->add_seq($self->get_seq_by_pos($p));
978 $aln->id($self->id);
979 # fix for meta, sf, ann
980 return $aln;
983 =head2 slice
985 Title : slice
986 Usage : $aln2 = $aln->slice(20,30)
987 Function : Creates a slice from the alignment inclusive of start and
988 end columns, and the first column in the alignment is denoted 1.
989 Sequences with no residues in the slice are excluded from the
990 new alignment and a warning is printed. Slice beyond the length of
991 the sequence does not do padding.
992 Returns : A Bio::SimpleAlign object
993 Args : Positive integer for start column, positive integer for end column,
994 optional boolean which if true will keep gap-only columns in the newly
995 created slice. Example:
997 $aln2 = $aln->slice(20,30,1)
999 =cut
1001 sub slice {
1002 my $self = shift;
1003 my ($start, $end, $keep_gap_only) = @_;
1005 $self->throw("Slice start has to be a positive integer, not [$start]")
1006 unless $start =~ /^\d+$/ and $start > 0;
1007 $self->throw("Slice end has to be a positive integer, not [$end]")
1008 unless $end =~ /^\d+$/ and $end > 0;
1009 $self->throw("Slice start [$start] has to be smaller than or equal to end [$end]")
1010 unless $start <= $end;
1011 $self->throw("This alignment has only ". $self->length . " residues. Slice start " .
1012 "[$start] is too big.") if $start > $self->length;
1013 my $cons_meta = $self->consensus_meta;
1014 my $aln = $self->new;
1015 $aln->id($self->id);
1016 foreach my $seq ( $self->each_seq() ) {
1017 my $new_seq = $seq->isa('Bio::Seq::MetaI') ?
1018 Bio::Seq::Meta->new
1019 (-id => $seq->id,
1020 -alphabet => $seq->alphabet,
1021 -strand => $seq->strand,
1022 -verbose => $self->verbose) :
1023 Bio::LocatableSeq->new
1024 (-id => $seq->id,
1025 -alphabet => $seq->alphabet,
1026 -strand => $seq->strand,
1027 -verbose => $self->verbose);
1029 # seq
1030 my $seq_end = $end;
1031 $seq_end = $seq->length if( $end > $seq->length );
1033 my $slice_seq = $seq->subseq($start, $seq_end);
1034 $new_seq->seq( $slice_seq );
1036 $slice_seq =~ s/\W//g;
1038 if ($start > 1) {
1039 my $pre_start_seq = $seq->subseq(1, $start - 1);
1040 $pre_start_seq =~ s/\W//g;
1041 if (!defined($seq->strand)) {
1042 $new_seq->start( $seq->start + CORE::length($pre_start_seq) );
1043 } elsif ($seq->strand < 0){
1044 $new_seq->start( $seq->end - CORE::length($pre_start_seq) - CORE::length($slice_seq) + 1);
1045 } else {
1046 $new_seq->start( $seq->start + CORE::length($pre_start_seq) );
1048 } else {
1049 $new_seq->start( $seq->start);
1051 if ($new_seq->isa('Bio::Seq::MetaI')) {
1052 for my $meta_name ($seq->meta_names) {
1053 $new_seq->named_meta($meta_name, $seq->named_submeta($meta_name, $start, $end));
1056 $new_seq->end( $new_seq->start + CORE::length($slice_seq) - 1 );
1058 if ($new_seq->start and $new_seq->end >= $new_seq->start) {
1059 $aln->add_seq($new_seq);
1060 } else {
1061 if( $keep_gap_only ) {
1062 $aln->add_seq($new_seq);
1063 } else {
1064 my $nse = $seq->get_nse();
1065 $self->warn("Slice [$start-$end] of sequence [$nse] contains no residues.".
1066 " Sequence excluded from the new alignment.");
1070 if ($cons_meta) {
1071 my $new = Bio::Seq::Meta->new();
1072 for my $meta_name ($cons_meta->meta_names) {
1073 $new->named_meta($meta_name, $cons_meta->named_submeta($meta_name, $start, $end));
1075 $aln->consensus_meta($new);
1077 $aln->annotation($self->annotation);
1078 # fix for meta, sf, ann
1079 return $aln;
1082 =head2 remove_columns
1084 Title : remove_columns
1085 Usage : $aln2 = $aln->remove_columns(['mismatch','weak']) or
1086 $aln2 = $aln->remove_columns([0,0],[6,8])
1087 Function : Creates an aligment with columns removed corresponding to
1088 the specified type or by specifying the columns by number.
1089 Returns : Bio::SimpleAlign object
1090 Args : Array ref of types ('match'|'weak'|'strong'|'mismatch'|'gaps'|
1091 'all_gaps_columns') or array ref where the referenced array
1092 contains a pair of integers that specify a range.
1093 The first column is 0,
1095 =cut
1097 sub remove_columns {
1098 my ($self,@args) = @_;
1099 @args || return $self;
1100 my $aln;
1102 if ($args[0][0] =~ /^[a-z_]+$/i) {
1103 $aln = $self->_remove_columns_by_type($args[0]);
1104 } elsif ($args[0][0] =~ /^\d+$/) {
1105 $aln = $self->_remove_columns_by_num(\@args);
1106 } else {
1107 $self->throw("You must pass array references to remove_columns(), not @args");
1109 # fix for meta, sf, ann
1110 $aln;
1114 =head2 remove_gaps
1116 Title : remove_gaps
1117 Usage : $aln2 = $aln->remove_gaps
1118 Function : Creates an aligment with gaps removed
1119 Returns : a Bio::SimpleAlign object
1120 Args : a gap character(optional) if none specified taken
1121 from $self->gap_char,
1122 [optional] $all_gaps_columns flag (1 or 0, default is 0)
1123 indicates that only all-gaps columns should be deleted
1125 Used from method L<remove_columns> in most cases. Set gap character
1126 using L<gap_char()|gap_char>.
1128 =cut
1130 sub remove_gaps {
1131 my ($self,$gapchar,$all_gaps_columns) = @_;
1132 my $gap_line;
1133 if ($all_gaps_columns) {
1134 $gap_line = $self->all_gap_line($gapchar);
1135 } else {
1136 $gap_line = $self->gap_line($gapchar);
1138 my $aln = $self->new;
1140 my @remove;
1141 my $length = 0;
1142 my $del_char = $gapchar || $self->gap_char;
1143 # Do the matching to get the segments to remove
1144 while ($gap_line =~ m/[$del_char]/g) {
1145 my $start = pos($gap_line)-1;
1146 $gap_line=~/\G[$del_char]+/gc;
1147 my $end = pos($gap_line)-1;
1149 #have to offset the start and end for subsequent removes
1150 $start-=$length;
1151 $end -=$length;
1152 $length += ($end-$start+1);
1153 push @remove, [$start,$end];
1156 #remove the segments
1157 $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1158 # fix for meta, sf, ann
1159 return $aln;
1163 sub _remove_col {
1164 my ($self,$aln,$remove) = @_;
1165 my @new;
1167 my $gap = $self->gap_char;
1169 # splice out the segments and create new seq
1170 foreach my $seq($self->each_seq){
1171 my $new_seq = Bio::LocatableSeq->new(
1172 -id => $seq->id,
1173 -alphabet=> $seq->alphabet,
1174 -strand => $seq->strand,
1175 -verbose => $self->verbose);
1176 my $sequence = $seq->seq;
1177 foreach my $pair(@{$remove}){
1178 my $start = $pair->[0];
1179 my $end = $pair->[1];
1180 $sequence = $seq->seq unless $sequence;
1181 my $orig = $sequence;
1182 my $head = $start > 0 ? substr($sequence, 0, $start) : '';
1183 my $tail = ($end + 1) >= length($sequence) ? '' : substr($sequence, $end + 1);
1184 $sequence = $head.$tail;
1185 # start
1186 unless (defined $new_seq->start) {
1187 if ($start == 0) {
1188 my $start_adjust = () = substr($orig, 0, $end + 1) =~ /$gap/g;
1189 $new_seq->start($seq->start + $end + 1 - $start_adjust);
1191 else {
1192 my $start_adjust = $orig =~ /^$gap+/;
1193 if ($start_adjust) {
1194 $start_adjust = $+[0] == $start;
1196 $new_seq->start($seq->start + $start_adjust);
1199 # end
1200 if (($end + 1) >= length($orig)) {
1201 my $end_adjust = () = substr($orig, $start) =~ /$gap/g;
1202 $new_seq->end($seq->end - (length($orig) - $start) + $end_adjust);
1204 else {
1205 $new_seq->end($seq->end);
1209 if ($new_seq->end < $new_seq->start) {
1210 # we removed all columns except for gaps: set to 0 to indicate no
1211 # sequence
1212 $new_seq->start(0);
1213 $new_seq->end(0);
1216 $new_seq->seq($sequence) if $sequence;
1217 push @new, $new_seq;
1219 # add the new seqs to the alignment
1220 foreach my $new(@new){
1221 $aln->add_seq($new);
1223 # fix for meta, sf, ann
1224 return $aln;
1227 sub _remove_columns_by_type {
1228 my ($self,$type) = @_;
1229 my $aln = $self->new;
1230 my @remove;
1232 my $gap = $self->gap_char if (grep { $_ eq 'gaps'} @{$type});
1233 my $all_gaps_columns = $self->gap_char if (grep /all_gaps_columns/,@{$type});
1234 my %matchchars = ( 'match' => '\*',
1235 'weak' => '\.',
1236 'strong' => ':',
1237 'mismatch' => ' ',
1238 'gaps' => '',
1239 'all_gaps_columns' => ''
1241 # get the characters to delete against
1242 my $del_char;
1243 foreach my $type (@{$type}){
1244 $del_char.= $matchchars{$type};
1247 my $length = 0;
1248 my $match_line = $self->match_line;
1249 # do the matching to get the segments to remove
1250 if($del_char){
1251 while($match_line =~ m/[$del_char]/g ){
1252 my $start = pos($match_line)-1;
1253 $match_line=~/\G[$del_char]+/gc;
1254 my $end = pos($match_line)-1;
1256 #have to offset the start and end for subsequent removes
1257 $start-=$length;
1258 $end -=$length;
1259 $length += ($end-$start+1);
1260 push @remove, [$start,$end];
1264 # remove the segments
1265 $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1266 $aln = $aln->remove_gaps() if $gap;
1267 $aln = $aln->remove_gaps('', 1) if $all_gaps_columns;
1268 # fix for meta, sf, ann
1269 $aln;
1273 sub _remove_columns_by_num {
1274 my ($self,$positions) = @_;
1275 my $aln = $self->new;
1277 # sort the positions
1278 @$positions = sort { $a->[0] <=> $b->[0] } @$positions;
1280 my @remove;
1281 my $length = 0;
1282 foreach my $pos (@{$positions}) {
1283 my ($start, $end) = @{$pos};
1285 #have to offset the start and end for subsequent removes
1286 $start-=$length;
1287 $end -=$length;
1288 $length += ($end-$start+1);
1289 push @remove, [$start,$end];
1292 #remove the segments
1293 $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1294 # fix for meta, sf, ann
1295 $aln;
1299 =head1 Change sequences within the MSA
1301 These methods affect characters in all sequences without changing the
1302 alignment.
1304 =head2 splice_by_seq_pos
1306 Title : splice_by_seq_pos
1307 Usage : $status = splice_by_seq_pos(1);
1308 Function: splices all aligned sequences where the specified sequence
1309 has gaps.
1310 Example :
1311 Returns : 1 on success
1312 Args : position of sequence to splice by
1315 =cut
1317 sub splice_by_seq_pos{
1318 my ($self,$pos) = @_;
1320 my $guide = $self->get_seq_by_pos($pos);
1321 my $guide_seq = $guide->seq;
1323 $guide_seq =~ s/\./\-/g;
1325 my @gaps = ();
1326 $pos = -1;
1327 while(($pos = index($guide_seq, '-', $pos)) > -1 ){
1328 unshift @gaps, $pos;
1329 $pos++;
1332 foreach my $seq ($self->each_seq){
1333 my @bases = split '', $seq->seq;
1335 splice(@bases, $_, 1) foreach @gaps;
1336 $seq->seq(join('', @bases));
1342 =head2 map_chars
1344 Title : map_chars
1345 Usage : $ali->map_chars('\.','-')
1346 Function : Does a s/$arg1/$arg2/ on the sequences. Useful for gap
1347 characters
1349 Notice that the from (arg1) is interpretted as a regex,
1350 so be careful about quoting meta characters (eg
1351 $ali->map_chars('.','-') wont do what you want)
1352 Returns :
1353 Argument : 'from' rexexp
1354 'to' string
1356 =cut
1358 sub map_chars {
1359 my $self = shift;
1360 my $from = shift;
1361 my $to = shift;
1362 my ($seq,$temp);
1364 $self->throw("Need exactly two arguments")
1365 unless defined $from and defined $to;
1367 foreach $seq ( $self->each_seq() ) {
1368 $temp = $seq->seq();
1369 $temp =~ s/$from/$to/g;
1370 $seq->seq($temp);
1372 return 1;
1376 =head2 uppercase
1378 Title : uppercase()
1379 Usage : $ali->uppercase()
1380 Function : Sets all the sequences to uppercase
1381 Returns :
1382 Argument :
1384 =cut
1386 sub uppercase {
1387 my $self = shift;
1388 my $seq;
1389 my $temp;
1391 foreach $seq ( $self->each_seq() ) {
1392 $temp = $seq->seq();
1393 $temp =~ tr/[a-z]/[A-Z]/;
1395 $seq->seq($temp);
1397 return 1;
1400 =head2 cigar_line
1402 Title : cigar_line()
1403 Usage : %cigars = $align->cigar_line()
1404 Function : Generates a "cigar" (Compact Idiosyncratic Gapped Alignment
1405 Report) line for each sequence in the alignment. Examples are
1406 "1,60" or "5,10:12,58", where the numbers refer to conserved
1407 positions within the alignment. The keys of the hash are the
1408 NSEs (name/start/end) assigned to each sequence.
1409 Args : threshold (optional, defaults to 100)
1410 Returns : Hash of strings (cigar lines)
1412 =cut
1414 sub cigar_line {
1415 my $self = shift;
1416 my $thr=shift||100;
1417 my %cigars;
1419 my @consensus = split "",($self->consensus_string($thr));
1420 my $len = $self->length;
1421 my $gapchar = $self->gap_char;
1423 # create a precursor, something like (1,4,5,6,7,33,45),
1424 # where each number corresponds to a conserved position
1425 foreach my $seq ( $self->each_seq ) {
1426 my @seq = split "", uc ($seq->seq);
1427 my $pos = 1;
1428 for (my $x = 0 ; $x < $len ; $x++ ) {
1429 if ($seq[$x] eq $consensus[$x]) {
1430 push @{$cigars{$seq->get_nse}},$pos;
1431 $pos++;
1432 } elsif ($seq[$x] ne $gapchar) {
1433 $pos++;
1437 # duplicate numbers - (1,4,5,6,7,33,45) becomes (1,1,4,5,6,7,33,33,45,45)
1438 for my $name (keys %cigars) {
1439 splice @{$cigars{$name}}, 1, 0, ${$cigars{$name}}[0] if
1440 ( ${$cigars{$name}}[0] + 1 < ${$cigars{$name}}[1] );
1441 push @{$cigars{$name}}, ${$cigars{$name}}[$#{$cigars{$name}}] if
1442 ( ${$cigars{$name}}[($#{$cigars{$name}} - 1)] + 1 <
1443 ${$cigars{$name}}[$#{$cigars{$name}}] );
1444 for ( my $x = 1 ; $x < $#{$cigars{$name}} - 1 ; $x++) {
1445 if (${$cigars{$name}}[$x - 1] + 1 < ${$cigars{$name}}[$x] &&
1446 ${$cigars{$name}}[$x + 1] > ${$cigars{$name}}[$x] + 1) {
1447 splice @{$cigars{$name}}, $x, 0, ${$cigars{$name}}[$x];
1451 # collapse series - (1,1,4,5,6,7,33,33,45,45) becomes (1,1,4,7,33,33,45,45)
1452 for my $name (keys %cigars) {
1453 my @remove;
1454 for ( my $x = 0 ; $x < $#{$cigars{$name}} ; $x++) {
1455 if ( ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x - 1)] + 1 &&
1456 ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x + 1)] - 1 ) {
1457 unshift @remove,$x;
1460 for my $pos (@remove) {
1461 splice @{$cigars{$name}}, $pos, 1;
1464 # join and punctuate
1465 for my $name (keys %cigars) {
1466 my ($start,$end,$str) = "";
1467 while ( ($start,$end) = splice @{$cigars{$name}}, 0, 2 ) {
1468 $str .= ($start . "," . $end . ":");
1470 $str =~ s/:$//;
1471 $cigars{$name} = $str;
1473 %cigars;
1477 =head2 match_line
1479 Title : match_line()
1480 Usage : $line = $align->match_line()
1481 Function : Generates a match line - much like consensus string
1482 except that a line indicating the '*' for a match.
1483 Args : (optional) Match line characters ('*' by default)
1484 (optional) Strong match char (':' by default)
1485 (optional) Weak match char ('.' by default)
1486 Returns : String
1488 =cut
1490 sub match_line {
1491 my ($self,$matchlinechar, $strong, $weak) = @_;
1492 my %matchchars = ('match' => $matchlinechar || '*',
1493 'weak' => $weak || '.',
1494 'strong' => $strong || ':',
1495 'mismatch' => ' ',
1498 my @seqchars;
1499 my $alphabet;
1500 foreach my $seq ( $self->each_seq ) {
1501 push @seqchars, [ split(//, uc ($seq->seq)) ];
1502 $alphabet = $seq->alphabet unless defined $alphabet;
1504 my $refseq = shift @seqchars;
1505 # let's just march down the columns
1506 my $matchline;
1507 POS:
1508 foreach my $pos ( 0..$self->length ) {
1509 my $refchar = $refseq->[$pos];
1510 my $char = $matchchars{'mismatch'};
1511 unless( defined $refchar ) {
1512 last if $pos == $self->length; # short circuit on last residue
1513 # this in place to handle jason's soon-to-be-committed
1514 # intron mapping code
1515 goto bottom;
1517 my %col = ($refchar => 1);
1518 my $dash = ($refchar eq '-' || $refchar eq '.' || $refchar eq ' ');
1519 foreach my $seq ( @seqchars ) {
1520 next if $pos >= scalar @$seq;
1521 $dash = 1 if( $seq->[$pos] eq '-' || $seq->[$pos] eq '.' ||
1522 $seq->[$pos] eq ' ' );
1523 $col{$seq->[$pos]}++ if defined $seq->[$pos];
1525 my @colresidues = sort keys %col;
1527 # if all the values are the same
1528 if( $dash ) { $char = $matchchars{'mismatch'} }
1529 elsif( @colresidues == 1 ) { $char = $matchchars{'match'} }
1530 elsif( $alphabet eq 'protein' ) { # only try to do weak/strong
1531 # matches for protein seqs
1532 TYPE:
1533 foreach my $type ( qw(strong weak) ) {
1534 # iterate through categories
1535 my %groups;
1536 # iterate through each of the aa in the col
1537 # look to see which groups it is in
1538 foreach my $c ( @colresidues ) {
1539 foreach my $f ( grep { index($_,$c) >= 0 } @{$CONSERVATION_GROUPS{$type}} ) {
1540 push @{$groups{$f}},$c;
1543 GRP:
1544 foreach my $cols ( values %groups ) {
1545 @$cols = sort @$cols;
1546 # now we are just testing to see if two arrays
1547 # are identical w/o changing either one
1548 # have to be same len
1549 next if( scalar @$cols != scalar @colresidues );
1550 # walk down the length and check each slot
1551 for($_=0;$_ < (scalar @$cols);$_++ ) {
1552 next GRP if( $cols->[$_] ne $colresidues[$_] );
1554 $char = $matchchars{$type};
1555 last TYPE;
1559 bottom:
1560 $matchline .= $char;
1562 return $matchline;
1566 =head2 gap_line
1568 Title : gap_line()
1569 Usage : $line = $align->gap_line()
1570 Function : Generates a gap line - much like consensus string
1571 except that a line where '-' represents gap
1572 Args : (optional) gap line characters ('-' by default)
1573 Returns : string
1575 =cut
1577 sub gap_line {
1578 my ($self,$gapchar) = @_;
1579 $gapchar = $gapchar || $self->gap_char;
1580 my %gap_hsh; # column gaps vector
1581 foreach my $seq ( $self->each_seq ) {
1582 my $i = 0;
1583 map {$gap_hsh{$_->[0]} = undef} grep {$_->[1] eq $gapchar}
1584 map {[$i++, $_]} split(//, uc ($seq->seq));
1586 my $gap_line;
1587 foreach my $pos ( 0..$self->length-1 ) {
1588 $gap_line .= (exists $gap_hsh{$pos}) ? $gapchar:'.';
1590 return $gap_line;
1593 =head2 all_gap_line
1595 Title : all_gap_line()
1596 Usage : $line = $align->all_gap_line()
1597 Function : Generates a gap line - much like consensus string
1598 except that a line where '-' represents all-gap column
1599 Args : (optional) gap line characters ('-' by default)
1600 Returns : string
1602 =cut
1604 sub all_gap_line {
1605 my ($self,$gapchar) = @_;
1606 $gapchar = $gapchar || $self->gap_char;
1607 my %gap_hsh; # column gaps counter hash
1608 my @seqs = $self->each_seq;
1609 foreach my $seq ( @seqs ) {
1610 my $i = 0;
1611 map {$gap_hsh{$_->[0]}++} grep {$_->[1] eq $gapchar}
1612 map {[$i++, $_]} split(//, uc ($seq->seq));
1614 my $gap_line;
1615 foreach my $pos ( 0..$self->length-1 ) {
1616 if (exists $gap_hsh{$pos} && $gap_hsh{$pos} == scalar @seqs) {
1617 # gaps column
1618 $gap_line .= $gapchar;
1619 } else {
1620 $gap_line .= '.';
1623 return $gap_line;
1626 =head2 gap_col_matrix
1628 Title : gap_col_matrix()
1629 Usage : my $cols = $align->gap_col_matrix()
1630 Function : Generates an array of hashes where
1631 each entry in the array is a hash reference
1632 with keys of all the sequence names and
1633 and value of 1 or 0 if the sequence has a gap at that column
1634 Args : (optional) gap line characters ($aln->gap_char or '-' by default)
1636 =cut
1638 sub gap_col_matrix {
1639 my ($self,$gapchar) = @_;
1640 $gapchar = $gapchar || $self->gap_char;
1641 my %gap_hsh; # column gaps vector
1642 my @cols;
1643 foreach my $seq ( $self->each_seq ) {
1644 my $i = 0;
1645 my $str = $seq->seq;
1646 my $len = $seq->length;
1647 my $ch;
1648 my $id = $seq->display_id;
1649 while( $i < $len ) {
1650 $ch = substr($str, $i, 1);
1651 $cols[$i++]->{$id} = ($ch eq $gapchar);
1654 return \@cols;
1657 =head2 match
1659 Title : match()
1660 Usage : $ali->match()
1661 Function : Goes through all columns and changes residues that are
1662 identical to residue in first sequence to match '.'
1663 character. Sets match_char.
1665 USE WITH CARE: Most MSA formats do not support match
1666 characters in sequences, so this is mostly for output
1667 only. NEXUS format (Bio::AlignIO::nexus) can handle
1669 Returns : 1
1670 Argument : a match character, optional, defaults to '.'
1672 =cut
1674 sub match {
1675 my ($self, $match) = @_;
1677 $match ||= '.';
1678 my ($matching_char) = $match;
1679 $matching_char = "\\$match" if $match =~ /[\^.$|()\[\]]/ ; #';
1680 $self->map_chars($matching_char, '-');
1682 my @seqs = $self->each_seq();
1683 return 1 unless scalar @seqs > 1;
1685 my $refseq = shift @seqs ;
1686 my @refseq = split //, $refseq->seq;
1687 my $gapchar = $self->gap_char;
1689 foreach my $seq ( @seqs ) {
1690 my @varseq = split //, $seq->seq();
1691 for ( my $i=0; $i < scalar @varseq; $i++) {
1692 $varseq[$i] = $match if defined $refseq[$i] &&
1693 ( $refseq[$i] =~ /[A-Za-z\*]/ ||
1694 $refseq[$i] =~ /$gapchar/ )
1695 && $refseq[$i] eq $varseq[$i];
1697 $seq->seq(join '', @varseq);
1699 $self->match_char($match);
1700 return 1;
1704 =head2 unmatch
1706 Title : unmatch()
1707 Usage : $ali->unmatch()
1708 Function : Undoes the effect of method match. Unsets match_char.
1709 Returns : 1
1710 Argument : a match character, optional, defaults to '.'
1712 See L<match> and L<match_char>
1714 =cut
1716 sub unmatch {
1717 my ($self, $match) = @_;
1719 $match ||= '.';
1721 my @seqs = $self->each_seq();
1722 return 1 unless scalar @seqs > 1;
1724 my $refseq = shift @seqs ;
1725 my @refseq = split //, $refseq->seq;
1726 my $gapchar = $self->gap_char;
1727 foreach my $seq ( @seqs ) {
1728 my @varseq = split //, $seq->seq();
1729 for ( my $i=0; $i < scalar @varseq; $i++) {
1730 $varseq[$i] = $refseq[$i] if defined $refseq[$i] &&
1731 ( $refseq[$i] =~ /[A-Za-z\*]/ ||
1732 $refseq[$i] =~ /$gapchar/ ) &&
1733 $varseq[$i] eq $match;
1735 $seq->seq(join '', @varseq);
1737 $self->match_char('');
1738 return 1;
1741 =head1 MSA attributes
1743 Methods for setting and reading the MSA attributes.
1745 Note that the methods defining character semantics depend on the user
1746 to set them sensibly. They are needed only by certain input/output
1747 methods. Unset them by setting to an empty string ('').
1749 =head2 id
1751 Title : id
1752 Usage : $myalign->id("Ig")
1753 Function : Gets/sets the id field of the alignment
1754 Returns : An id string
1755 Argument : An id string (optional)
1757 =cut
1759 sub id {
1760 my ($self, $name) = @_;
1762 if (defined( $name )) {
1763 $self->{'_id'} = $name;
1766 return $self->{'_id'};
1769 =head2 accession
1771 Title : accession
1772 Usage : $myalign->accession("PF00244")
1773 Function : Gets/sets the accession field of the alignment
1774 Returns : An acc string
1775 Argument : An acc string (optional)
1777 =cut
1779 sub accession {
1780 my ($self, $acc) = @_;
1782 if (defined( $acc )) {
1783 $self->{'_accession'} = $acc;
1786 return $self->{'_accession'};
1789 =head2 description
1791 Title : description
1792 Usage : $myalign->description("14-3-3 proteins")
1793 Function : Gets/sets the description field of the alignment
1794 Returns : An description string
1795 Argument : An description string (optional)
1797 =cut
1799 sub description {
1800 my ($self, $name) = @_;
1802 if (defined( $name )) {
1803 $self->{'_description'} = $name;
1806 return $self->{'_description'};
1809 =head2 missing_char
1811 Title : missing_char
1812 Usage : $myalign->missing_char("?")
1813 Function : Gets/sets the missing_char attribute of the alignment
1814 It is generally recommended to set it to 'n' or 'N'
1815 for nucleotides and to 'X' for protein.
1816 Returns : An missing_char string,
1817 Argument : An missing_char string (optional)
1819 =cut
1821 sub missing_char {
1822 my ($self, $char) = @_;
1824 if (defined $char ) {
1825 $self->throw("Single missing character, not [$char]!") if CORE::length($char) > 1;
1826 $self->{'_missing_char'} = $char;
1829 return $self->{'_missing_char'};
1832 =head2 match_char
1834 Title : match_char
1835 Usage : $myalign->match_char('.')
1836 Function : Gets/sets the match_char attribute of the alignment
1837 Returns : An match_char string,
1838 Argument : An match_char string (optional)
1840 =cut
1842 sub match_char {
1843 my ($self, $char) = @_;
1845 if (defined $char ) {
1846 $self->throw("Single match character, not [$char]!") if CORE::length($char) > 1;
1847 $self->{'_match_char'} = $char;
1850 return $self->{'_match_char'};
1853 =head2 gap_char
1855 Title : gap_char
1856 Usage : $myalign->gap_char('-')
1857 Function : Gets/sets the gap_char attribute of the alignment
1858 Returns : An gap_char string, defaults to '-'
1859 Argument : An gap_char string (optional)
1861 =cut
1863 sub gap_char {
1864 my ($self, $char) = @_;
1866 if (defined $char || ! defined $self->{'_gap_char'} ) {
1867 $char= '-' unless defined $char;
1868 $self->throw("Single gap character, not [$char]!") if CORE::length($char) > 1;
1869 $self->{'_gap_char'} = $char;
1871 return $self->{'_gap_char'};
1874 =head2 symbol_chars
1876 Title : symbol_chars
1877 Usage : my @symbolchars = $aln->symbol_chars;
1878 Function: Returns all the seen symbols (other than gaps)
1879 Returns : array of characters that are the seen symbols
1880 Args : boolean to include the gap/missing/match characters
1882 =cut
1884 sub symbol_chars{
1885 my ($self,$includeextra) = @_;
1887 unless ($self->{'_symbols'}) {
1888 foreach my $seq ($self->each_seq) {
1889 map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq);
1892 my %copy = %{$self->{'_symbols'}};
1893 if( ! $includeextra ) {
1894 foreach my $char ( $self->gap_char, $self->match_char,
1895 $self->missing_char) {
1896 delete $copy{$char} if( defined $char );
1899 return keys %copy;
1902 =head1 Alignment descriptors
1904 These read only methods describe the MSA in various ways.
1907 =head2 score
1909 Title : score
1910 Usage : $str = $ali->score()
1911 Function : get/set a score of the alignment
1912 Returns : a score for the alignment
1913 Argument : an optional score to set
1915 =cut
1917 sub score {
1918 my $self = shift;
1919 $self->{score} = shift if @_;
1920 return $self->{score};
1923 =head2 consensus_string
1925 Title : consensus_string
1926 Usage : $str = $ali->consensus_string($threshold_percent)
1927 Function : Makes a strict consensus
1928 Returns : Consensus string
1929 Argument : Optional treshold ranging from 0 to 100.
1930 The consensus residue has to appear at least threshold %
1931 of the sequences at a given location, otherwise a '?'
1932 character will be placed at that location.
1933 (Default value = 0%)
1935 =cut
1937 sub consensus_string {
1938 my $self = shift;
1939 my $threshold = shift;
1941 my $out = "";
1942 my $len = $self->length - 1;
1944 foreach ( 0 .. $len ) {
1945 $out .= $self->_consensus_aa($_,$threshold);
1947 return $out;
1950 sub _consensus_aa {
1951 my $self = shift;
1952 my $point = shift;
1953 my $threshold_percent = shift || -1 ;
1954 my ($seq,%hash,$count,$letter,$key);
1955 my $gapchar = $self->gap_char;
1956 foreach $seq ( $self->each_seq() ) {
1957 $letter = substr($seq->seq,$point,1);
1958 $self->throw("--$point-----------") if $letter eq '';
1959 ($letter eq $gapchar || $letter =~ /\./) && next;
1960 # print "Looking at $letter\n";
1961 $hash{$letter}++;
1963 my $number_of_sequences = $self->no_sequences();
1964 my $threshold = $number_of_sequences * $threshold_percent / 100. ;
1965 $count = -1;
1966 $letter = '?';
1968 foreach $key ( sort keys %hash ) {
1969 # print "Now at $key $hash{$key}\n";
1970 if( $hash{$key} > $count && $hash{$key} >= $threshold) {
1971 $letter = $key;
1972 $count = $hash{$key};
1975 return $letter;
1979 =head2 consensus_iupac
1981 Title : consensus_iupac
1982 Usage : $str = $ali->consensus_iupac()
1983 Function : Makes a consensus using IUPAC ambiguity codes from DNA
1984 and RNA. The output is in upper case except when gaps in
1985 a column force output to be in lower case.
1987 Note that if your alignment sequences contain a lot of
1988 IUPAC ambiquity codes you often have to manually set
1989 alphabet. Bio::PrimarySeq::_guess_type thinks they
1990 indicate a protein sequence.
1991 Returns : consensus string
1992 Argument : none
1993 Throws : on protein sequences
1995 =cut
1997 sub consensus_iupac {
1998 my $self = shift;
1999 my $out = "";
2000 my $len = $self->length-1;
2002 # only DNA and RNA sequences are valid
2003 foreach my $seq ( $self->each_seq() ) {
2004 $self->throw("Seq [". $seq->get_nse. "] is a protein")
2005 if $seq->alphabet eq 'protein';
2007 # loop over the alignment columns
2008 foreach my $count ( 0 .. $len ) {
2009 $out .= $self->_consensus_iupac($count);
2011 return $out;
2014 sub _consensus_iupac {
2015 my ($self, $column) = @_;
2016 my ($string, $char, $rna);
2018 #determine all residues in a column
2019 foreach my $seq ( $self->each_seq() ) {
2020 $string .= substr($seq->seq, $column, 1);
2022 $string = uc $string;
2024 # quick exit if there's an N in the string
2025 if ($string =~ /N/) {
2026 $string =~ /\W/ ? return 'n' : return 'N';
2028 # ... or if there are only gap characters
2029 return '-' if $string =~ /^\W+$/;
2031 # treat RNA as DNA in regexps
2032 if ($string =~ /U/) {
2033 $string =~ s/U/T/;
2034 $rna = 1;
2037 # the following s///'s only need to be done to the _first_ ambiguity code
2038 # as we only need to see the _range_ of characters in $string
2040 if ($string =~ /[VDHB]/) {
2041 $string =~ s/V/AGC/;
2042 $string =~ s/D/AGT/;
2043 $string =~ s/H/ACT/;
2044 $string =~ s/B/CTG/;
2047 if ($string =~ /[SKYRWM]/) {
2048 $string =~ s/S/GC/;
2049 $string =~ s/K/GT/;
2050 $string =~ s/Y/CT/;
2051 $string =~ s/R/AG/;
2052 $string =~ s/W/AT/;
2053 $string =~ s/M/AC/;
2056 # and now the guts of the thing
2058 if ($string =~ /A/) {
2059 $char = 'A'; # A A
2060 if ($string =~ /G/) {
2061 $char = 'R'; # A and G (purines) R
2062 if ($string =~ /C/) {
2063 $char = 'V'; # A and G and C V
2064 if ($string =~ /T/) {
2065 $char = 'N'; # A and G and C and T N
2067 } elsif ($string =~ /T/) {
2068 $char = 'D'; # A and G and T D
2070 } elsif ($string =~ /C/) {
2071 $char = 'M'; # A and C M
2072 if ($string =~ /T/) {
2073 $char = 'H'; # A and C and T H
2075 } elsif ($string =~ /T/) {
2076 $char = 'W'; # A and T W
2078 } elsif ($string =~ /C/) {
2079 $char = 'C'; # C C
2080 if ($string =~ /T/) {
2081 $char = 'Y'; # C and T (pyrimidines) Y
2082 if ($string =~ /G/) {
2083 $char = 'B'; # C and T and G B
2085 } elsif ($string =~ /G/) {
2086 $char = 'S'; # C and G S
2088 } elsif ($string =~ /G/) {
2089 $char = 'G'; # G G
2090 if ($string =~ /C/) {
2091 $char = 'S'; # G and C S
2092 } elsif ($string =~ /T/) {
2093 $char = 'K'; # G and T K
2095 } elsif ($string =~ /T/) {
2096 $char = 'T'; # T T
2099 $char = 'U' if $rna and $char eq 'T';
2100 $char = lc $char if $string =~ /\W/;
2102 return $char;
2106 =head2 consensus_meta
2108 Title : consensus_meta
2109 Usage : $seqmeta = $ali->consensus_meta()
2110 Function : Returns a Bio::Seq::Meta object containing the consensus
2111 strings derived from meta data analysis.
2112 Returns : Bio::Seq::Meta
2113 Argument : Bio::Seq::Meta
2114 Throws : non-MetaI object
2116 =cut
2118 sub consensus_meta {
2119 my ($self, $meta) = @_;
2120 if ($meta && (!ref $meta || !$meta->isa('Bio::Seq::MetaI'))) {
2121 $self->throw('Not a Bio::Seq::MetaI object');
2123 return $self->{'_aln_meta'} = $meta if $meta;
2124 return $self->{'_aln_meta'}
2127 =head2 is_flush
2129 Title : is_flush
2130 Usage : if ( $ali->is_flush() )
2131 Function : Tells you whether the alignment
2132 : is flush, i.e. all of the same length
2133 Returns : 1 or 0
2134 Argument :
2136 =cut
2138 sub is_flush {
2139 my ($self,$report) = @_;
2140 my $seq;
2141 my $length = (-1);
2142 my $temp;
2144 foreach $seq ( $self->each_seq() ) {
2145 if( $length == (-1) ) {
2146 $length = CORE::length($seq->seq());
2147 next;
2150 $temp = CORE::length($seq->seq());
2151 if( $temp != $length ) {
2152 $self->warn("expecting $length not $temp from ".
2153 $seq->display_id) if( $report );
2154 $self->debug("expecting $length not $temp from ".
2155 $seq->display_id);
2156 $self->debug($seq->seq(). "\n");
2157 return 0;
2161 return 1;
2165 =head2 length
2167 Title : length()
2168 Usage : $len = $ali->length()
2169 Function : Returns the maximum length of the alignment.
2170 To be sure the alignment is a block, use is_flush
2171 Returns : Integer
2172 Argument :
2174 =cut
2176 sub length_aln {
2177 my $self = shift;
2178 $self->deprecated("length_aln - deprecated method. Use length() instead.");
2179 $self->length(@_);
2182 sub length {
2183 my $self = shift;
2184 my $seq;
2185 my $length = -1;
2186 my $temp;
2188 foreach $seq ( $self->each_seq() ) {
2189 $temp = $seq->length();
2190 if( $temp > $length ) {
2191 $length = $temp;
2195 return $length;
2199 =head2 maxdisplayname_length
2201 Title : maxdisplayname_length
2202 Usage : $ali->maxdisplayname_length()
2203 Function : Gets the maximum length of the displayname in the
2204 alignment. Used in writing out various MSA formats.
2205 Returns : integer
2206 Argument :
2208 =cut
2210 sub maxname_length {
2211 my $self = shift;
2212 $self->deprecated("maxname_length - deprecated method.".
2213 " Use maxdisplayname_length() instead.");
2214 $self->maxdisplayname_length();
2217 sub maxnse_length {
2218 my $self = shift;
2219 $self->deprecated("maxnse_length - deprecated method.".
2220 " Use maxnse_length() instead.");
2221 $self->maxdisplayname_length();
2224 sub maxdisplayname_length {
2225 my $self = shift;
2226 my $maxname = (-1);
2227 my ($seq,$len);
2229 foreach $seq ( $self->each_seq() ) {
2230 $len = CORE::length $self->displayname($seq->get_nse());
2232 if( $len > $maxname ) {
2233 $maxname = $len;
2237 return $maxname;
2240 =head2 max_metaname_length
2242 Title : max_metaname_length
2243 Usage : $ali->max_metaname_length()
2244 Function : Gets the maximum length of the meta name tags in the
2245 alignment for the sequences and for the alignment.
2246 Used in writing out various MSA formats.
2247 Returns : integer
2248 Argument : None
2250 =cut
2252 sub max_metaname_length {
2253 my $self = shift;
2254 my $maxname = (-1);
2255 my ($seq,$len);
2257 # check seq meta first
2258 for $seq ( $self->each_seq() ) {
2259 next if !$seq->isa('Bio::Seq::MetaI' || !$seq->meta_names);
2260 for my $mtag ($seq->meta_names) {
2261 $len = CORE::length $mtag;
2262 if( $len > $maxname ) {
2263 $maxname = $len;
2268 # alignment meta
2269 for my $meta ($self->consensus_meta) {
2270 next unless $meta;
2271 for my $name ($meta->meta_names) {
2272 $len = CORE::length $name;
2273 if( $len > $maxname ) {
2274 $maxname = $len;
2279 return $maxname;
2282 =head2 no_residues
2284 Title : no_residues
2285 Usage : $no = $ali->no_residues
2286 Function : number of residues in total in the alignment
2287 Returns : integer
2288 Argument :
2290 =cut
2292 sub no_residues {
2293 my $self = shift;
2294 my $count = 0;
2296 foreach my $seq ($self->each_seq) {
2297 my $str = $seq->seq();
2299 $count += ($str =~ s/[A-Za-z]//g);
2302 return $count;
2305 =head2 no_sequences
2307 Title : no_sequences
2308 Usage : $depth = $ali->no_sequences
2309 Function : number of sequence in the sequence alignment
2310 Returns : integer
2311 Argument :
2313 =cut
2315 sub no_sequences {
2316 my $self = shift;
2318 return scalar($self->each_seq);
2322 =head2 average_percentage_identity
2324 Title : average_percentage_identity
2325 Usage : $id = $align->average_percentage_identity
2326 Function: The function uses a fast method to calculate the average
2327 percentage identity of the alignment
2328 Returns : The average percentage identity of the alignment
2329 Args : None
2330 Notes : This method implemented by Kevin Howe calculates a figure that is
2331 designed to be similar to the average pairwise identity of the
2332 alignment (identical in the absence of gaps), without having to
2333 explicitly calculate pairwise identities proposed by Richard Durbin.
2334 Validated by Ewan Birney ad Alex Bateman.
2336 =cut
2338 sub average_percentage_identity{
2339 my ($self,@args) = @_;
2341 my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
2342 'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
2344 my ($len, $total, $subtotal, $divisor, $subdivisor, @seqs, @countHashes);
2346 if (! $self->is_flush()) {
2347 $self->throw("All sequences in the alignment must be the same length");
2350 @seqs = $self->each_seq();
2351 $len = $self->length();
2353 # load the each hash with correct keys for existence checks
2355 for( my $index=0; $index < $len; $index++) {
2356 foreach my $letter (@alphabet) {
2357 $countHashes[$index]->{$letter} = 0;
2360 foreach my $seq (@seqs) {
2361 my @seqChars = split //, $seq->seq();
2362 for( my $column=0; $column < @seqChars; $column++ ) {
2363 my $char = uc($seqChars[$column]);
2364 if (exists $countHashes[$column]->{$char}) {
2365 $countHashes[$column]->{$char}++;
2370 $total = 0;
2371 $divisor = 0;
2372 for(my $column =0; $column < $len; $column++) {
2373 my %hash = %{$countHashes[$column]};
2374 $subdivisor = 0;
2375 foreach my $res (keys %hash) {
2376 $total += $hash{$res}*($hash{$res} - 1);
2377 $subdivisor += $hash{$res};
2379 $divisor += $subdivisor * ($subdivisor - 1);
2381 return $divisor > 0 ? ($total / $divisor )*100.0 : 0;
2384 =head2 percentage_identity
2386 Title : percentage_identity
2387 Usage : $id = $align->percentage_identity
2388 Function: The function calculates the average percentage identity
2389 (aliased to average_percentage_identity)
2390 Returns : The average percentage identity
2391 Args : None
2393 =cut
2395 sub percentage_identity {
2396 my $self = shift;
2397 return $self->average_percentage_identity();
2400 =head2 overall_percentage_identity
2402 Title : overall_percentage_identity
2403 Usage : $id = $align->overall_percentage_identity
2404 $id = $align->overall_percentage_identity('short')
2405 Function: The function calculates the percentage identity of
2406 the conserved columns
2407 Returns : The percentage identity of the conserved columns
2408 Args : length value to use, optional defaults to alignment length
2409 possible values: 'align', 'short', 'long'
2411 The argument values 'short' and 'long' refer to shortest and longest
2412 sequence in the alignment. Method modification code by Hongyu Zhang.
2414 =cut
2416 sub overall_percentage_identity{
2417 my ($self, $length_measure) = @_;
2419 my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
2420 'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
2422 my ($len, $total, @seqs, @countHashes);
2424 my %enum = map {$_ => 1} qw (align short long);
2426 $self->throw("Unknown argument [$length_measure]")
2427 if $length_measure and not $enum{$length_measure};
2428 $length_measure ||= 'align';
2430 if (! $self->is_flush()) {
2431 $self->throw("All sequences in the alignment must be the same length");
2434 @seqs = $self->each_seq();
2435 $len = $self->length();
2437 # load the each hash with correct keys for existence checks
2438 for( my $index=0; $index < $len; $index++) {
2439 foreach my $letter (@alphabet) {
2440 $countHashes[$index]->{$letter} = 0;
2443 foreach my $seq (@seqs) {
2444 my @seqChars = split //, $seq->seq();
2445 for( my $column=0; $column < @seqChars; $column++ ) {
2446 my $char = uc($seqChars[$column]);
2447 if (exists $countHashes[$column]->{$char}) {
2448 $countHashes[$column]->{$char}++;
2453 $total = 0;
2454 for(my $column =0; $column < $len; $column++) {
2455 my %hash = %{$countHashes[$column]};
2456 foreach ( values %hash ) {
2457 next if( $_ == 0 );
2458 $total++ if( $_ == scalar @seqs );
2459 last;
2463 if ($length_measure eq 'short') {
2464 ## find the shortest length
2465 $len = 0;
2466 foreach my $seq ($self->each_seq) {
2467 my $count = $seq->seq =~ tr/[A-Za-z]//;
2468 if ($len) {
2469 $len = $count if $count < $len;
2470 } else {
2471 $len = $count;
2475 elsif ($length_measure eq 'long') {
2476 ## find the longest length
2477 $len = 0;
2478 foreach my $seq ($self->each_seq) {
2479 my $count = $seq->seq =~ tr/[A-Za-z]//;
2480 if ($len) {
2481 $len = $count if $count > $len;
2482 } else {
2483 $len = $count;
2488 return ($total / $len ) * 100.0;
2493 =head1 Alignment positions
2495 Methods to map a sequence position into an alignment column and back.
2496 column_from_residue_number() does the former. The latter is really a
2497 property of the sequence object and can done using
2498 L<Bio::LocatableSeq::location_from_column>:
2500 # select somehow a sequence from the alignment, e.g.
2501 my $seq = $aln->get_seq_by_pos(1);
2502 #$loc is undef or Bio::LocationI object
2503 my $loc = $seq->location_from_column(5);
2505 =head2 column_from_residue_number
2507 Title : column_from_residue_number
2508 Usage : $col = $ali->column_from_residue_number( $seqname, $resnumber)
2509 Function: This function gives the position in the alignment
2510 (i.e. column number) of the given residue number in the
2511 sequence with the given name. For example, for the
2512 alignment
2514 Seq1/91-97 AC..DEF.GH.
2515 Seq2/24-30 ACGG.RTY...
2516 Seq3/43-51 AC.DDEF.GHI
2518 column_from_residue_number( "Seq1", 94 ) returns 6.
2519 column_from_residue_number( "Seq2", 25 ) returns 2.
2520 column_from_residue_number( "Seq3", 50 ) returns 10.
2522 An exception is thrown if the residue number would lie
2523 outside the length of the aligment
2524 (e.g. column_from_residue_number( "Seq2", 22 )
2526 Note: If the the parent sequence is represented by more than
2527 one alignment sequence and the residue number is present in
2528 them, this method finds only the first one.
2530 Returns : A column number for the position in the alignment of the
2531 given residue in the given sequence (1 = first column)
2532 Args : A sequence id/name (not a name/start-end)
2533 A residue number in the whole sequence (not just that
2534 segment of it in the alignment)
2536 =cut
2538 sub column_from_residue_number {
2539 my ($self, $name, $resnumber) = @_;
2541 $self->throw("No sequence with name [$name]") unless $self->{'_start_end_lists'}->{$name};
2542 $self->throw("Second argument residue number missing") unless $resnumber;
2544 foreach my $seq ($self->each_seq_with_id($name)) {
2545 my $col;
2546 eval {
2547 $col = $seq->column_from_residue_number($resnumber);
2549 next if $@;
2550 return $col;
2553 $self->throw("Could not find a sequence segment in $name ".
2554 "containing residue number $resnumber");
2558 =head1 Sequence names
2560 Methods to manipulate the display name. The default name based on the
2561 sequence id and subsequence positions can be overridden in various
2562 ways.
2564 =head2 displayname
2566 Title : displayname
2567 Usage : $myalign->displayname("Ig", "IgA")
2568 Function : Gets/sets the display name of a sequence in the alignment
2569 Returns : A display name string
2570 Argument : name of the sequence
2571 displayname of the sequence (optional)
2573 =cut
2575 sub get_displayname {
2576 my $self = shift;
2577 $self->deprecated("get_displayname - deprecated method. Use displayname() instead.");
2578 $self->displayname(@_);
2581 sub set_displayname {
2582 my $self = shift;
2583 $self->deprecated("set_displayname - deprecated method. Use displayname() instead.");
2584 $self->displayname(@_);
2587 sub displayname {
2588 my ($self, $name, $disname) = @_;
2590 $self->throw("No sequence with name [$name]")
2591 unless defined $self->{'_seq'}->{$name};
2593 if( $disname and $name) {
2594 $self->{'_dis_name'}->{$name} = $disname;
2595 return $disname;
2597 elsif( defined $self->{'_dis_name'}->{$name} ) {
2598 return $self->{'_dis_name'}->{$name};
2599 } else {
2600 return $name;
2604 =head2 set_displayname_count
2606 Title : set_displayname_count
2607 Usage : $ali->set_displayname_count
2608 Function : Sets the names to be name_# where # is the number of
2609 times this name has been used.
2610 Returns : 1, on success
2611 Argument :
2613 =cut
2615 sub set_displayname_count {
2616 my $self= shift;
2617 my (@arr,$name,$seq,$count,$temp,$nse);
2619 foreach $seq ( $self->each_alphabetically() ) {
2620 $nse = $seq->get_nse();
2622 #name will be set when this is the second
2623 #time (or greater) is has been seen
2625 if( defined $name and $name eq ($seq->id()) ) {
2626 $temp = sprintf("%s_%s",$name,$count);
2627 $self->displayname($nse,$temp);
2628 $count++;
2629 } else {
2630 $count = 1;
2631 $name = $seq->id();
2632 $temp = sprintf("%s_%s",$name,$count);
2633 $self->displayname($nse,$temp);
2634 $count++;
2637 return 1;
2640 =head2 set_displayname_flat
2642 Title : set_displayname_flat
2643 Usage : $ali->set_displayname_flat()
2644 Function : Makes all the sequences be displayed as just their name,
2645 not name/start-end
2646 Returns : 1
2647 Argument :
2649 =cut
2651 sub set_displayname_flat {
2652 my $self = shift;
2653 my ($nse,$seq);
2655 foreach $seq ( $self->each_seq() ) {
2656 $nse = $seq->get_nse();
2657 $self->displayname($nse,$seq->id());
2659 return 1;
2662 =head2 set_displayname_normal
2664 Title : set_displayname_normal
2665 Usage : $ali->set_displayname_normal()
2666 Function : Makes all the sequences be displayed as name/start-end
2667 Returns : 1, on success
2668 Argument :
2670 =cut
2672 sub set_displayname_normal {
2673 my $self = shift;
2674 my ($nse,$seq);
2676 foreach $seq ( $self->each_seq() ) {
2677 $nse = $seq->get_nse();
2678 $self->displayname($nse,$nse);
2680 return 1;
2683 =head2 source
2685 Title : source
2686 Usage : $obj->source($newval)
2687 Function: sets the Alignment source program
2688 Example :
2689 Returns : value of source
2690 Args : newvalue (optional)
2693 =cut
2695 sub source{
2696 my ($self,$value) = @_;
2697 if( defined $value) {
2698 $self->{'_source'} = $value;
2700 return $self->{'_source'};
2703 =head2 set_displayname_safe
2705 Title : set_displayname_safe
2706 Usage : ($new_aln, $ref_name)=$ali->set_displayname_safe(4)
2707 Function : Assign machine-generated serial names to sequences in input order.
2708 Designed to protect names during PHYLIP runs. Assign 10-char string
2709 in the form of "S000000001" to "S999999999". Restore the original
2710 names using "restore_displayname".
2711 Returns : 1. a new $aln with system names;
2712 2. a hash ref for restoring names
2713 Argument : Number for id length (default 10)
2715 =cut
2717 sub set_displayname_safe {
2718 my $self = shift;
2719 my $idlength = shift || 10;
2720 my ($seq, %phylip_name);
2721 my $ct=0;
2722 my $new=Bio::SimpleAlign->new();
2723 foreach $seq ( $self->each_seq() ) {
2724 $ct++;
2725 my $pname="S". sprintf "%0" . ($idlength-1) . "s", $ct;
2726 $phylip_name{$pname}=$seq->id();
2727 my $new_seq= Bio::LocatableSeq->new(-id => $pname,
2728 -seq => $seq->seq(),
2729 -alphabet => $seq->alphabet,
2730 -start => $seq->{_start},
2731 -end => $seq->{_end}
2733 $new->add_seq($new_seq);
2736 $self->debug("$ct seq names changed. Restore names by using restore_displayname.");
2737 return ($new, \%phylip_name);
2740 =head2 restore_displayname
2742 Title : restore_displayname
2743 Usage : $aln_name_restored=$ali->restore_displayname($hash_ref)
2744 Function : Restore original sequence names (after running
2745 $ali->set_displayname_safe)
2746 Returns : a new $aln with names restored.
2747 Argument : a hash reference of names from "set_displayname_safe".
2749 =cut
2751 sub restore_displayname {
2752 my $self = shift;
2753 my $ref=shift;
2754 my %name=%$ref;
2755 my $new=Bio::SimpleAlign->new();
2756 foreach my $seq ( $self->each_seq() ) {
2757 $self->throw("No sequence with name") unless defined $name{$seq->id()};
2758 my $new_seq= Bio::LocatableSeq->new(-id => $name{$seq->id()},
2759 -seq => $seq->seq(),
2760 -alphabet => $seq->alphabet,
2761 -start => $seq->{_start},
2762 -end => $seq->{_end}
2764 $new->add_seq($new_seq);
2766 return $new;
2769 =head2 sort_by_start
2771 Title : sort_by_start
2772 Usage : $ali->sort_by_start
2773 Function : Changes the order of the alignment to the start position of each
2774 subalignment
2775 Returns :
2776 Argument :
2778 =cut
2780 sub sort_by_start {
2781 my $self = shift;
2782 my ($seq,$nse,@arr,%hash,$count);
2783 foreach $seq ( $self->each_seq() ) {
2784 $nse = $seq->get_nse;
2785 $hash{$nse} = $seq;
2787 $count = 0;
2788 %{$self->{'_order'}} = (); # reset the hash;
2789 foreach $nse ( sort _startend keys %hash) {
2790 $self->{'_order'}->{$count} = $nse;
2791 $count++;
2796 sub _startend
2798 my ($aname,$arange) = split (/[\/]/,$a);
2799 my ($bname,$brange) = split (/[\/]/,$b);
2800 my ($astart,$aend) = split(/\-/,$arange);
2801 my ($bstart,$bend) = split(/\-/,$brange);
2802 return $astart <=> $bstart;
2805 =head2 bracket_string
2807 Title : bracket_string
2808 Usage : my @params = (-refseq => 'testseq',
2809 -allele1 => 'allele1',
2810 -allele2 => 'allele2',
2811 -delimiters => '{}',
2812 -separator => '/');
2813 $str = $aln->bracket_string(@params)
2815 Function : When supplied with a list of parameters (see below), returns a
2816 string in BIC format. This is used for allelic comparisons.
2817 Briefly, if either allele contains a base change when compared to
2818 the refseq, the base or gap for each allele is represented in
2819 brackets in the order present in the 'alleles' parameter.
2821 For the following data:
2823 >testseq
2824 GGATCCATTGCTACT
2825 >allele1
2826 GGATCCATTCCTACT
2827 >allele2
2828 GGAT--ATTCCTCCT
2830 the returned string with parameters 'refseq => testseq' and
2831 'alleles => [qw(allele1 allele2)]' would be:
2833 GGAT[C/-][C/-]ATT[C/C]CT[A/C]CT
2834 Returns : BIC-formatted string
2835 Argument : Required args
2836 refseq : string (ID) of the reference sequence used
2837 as basis for comparison
2838 allele1 : string (ID) of the first allele
2839 allele2 : string (ID) of the second allele
2840 Optional args
2841 delimiters: two symbol string of left and right delimiters.
2842 Only the first two symbols are used
2843 default = '[]'
2844 separator : string used as a separator. Only the first
2845 symbol is used
2846 default = '/'
2847 Throws : On no refseq/alleles, or invalid refseq/alleles.
2849 =cut
2851 sub bracket_string {
2852 my ($self, @args) = @_;
2853 my ($ref, $a1, $a2, $delim, $sep) =
2854 $self->_rearrange([qw(refseq allele1 allele2 delimiters separator)], @args);
2855 $self->throw('Missing refseq/allele1/allele2') if (!$a1 || !$a2 || !$ref);
2856 my ($ld, $rd);
2857 ($ld, $rd) = split('', $delim, 2) if $delim;
2858 $ld ||= '[';
2859 $rd ||= ']';
2860 $sep ||= '/';
2861 my ($refseq, $allele1, $allele2) =
2862 map {( $self->each_seq_with_id($_) )} ($ref, $a1, $a2);
2863 if (!$refseq || !$allele1 || !$allele2) {
2864 $self->throw("One of your refseq/allele IDs is invalid!");
2866 my $len = $self->length-1;
2867 my $bic = '';
2868 # loop over the alignment columns
2869 for my $column ( 0 .. $len ) {
2870 my $string;
2871 my ($compres, $res1, $res2) =
2872 map{substr($_->seq, $column, 1)} ($refseq, $allele1, $allele2);
2873 # are any of the allele symbols different from the refseq?
2874 $string = ($compres eq $res1 && $compres eq $res2) ? $compres :
2875 $ld.$res1.$sep.$res2.$rd;
2876 $bic .= $string;
2878 return $bic;
2882 =head2 methods for Bio::FeatureHolder
2884 FeatureHolder implementation to support labeled character sets like one
2885 would get from NEXUS represented data.
2887 =head2 get_SeqFeatures
2889 Usage :
2890 Function: Get the feature objects held by this feature holder.
2891 Example :
2892 Returns : an array of Bio::SeqFeatureI implementing objects
2893 Args : none
2895 At some day we may want to expand this method to allow for a feature
2896 filter to be passed in.
2898 =cut
2900 sub get_SeqFeatures {
2901 my $self = shift;
2903 if( !defined $self->{'_as_feat'} ) {
2904 $self->{'_as_feat'} = [];
2906 return @{$self->{'_as_feat'}};
2909 =head2 add_SeqFeature
2911 Usage : $feat->add_SeqFeature($subfeat);
2912 $feat->add_SeqFeature($subfeat,'EXPAND')
2913 Function: adds a SeqFeature into the subSeqFeature array.
2914 with no 'EXPAND' qualifer, subfeat will be tested
2915 as to whether it lies inside the parent, and throw
2916 an exception if not.
2918 If EXPAND is used, the parent''s start/end/strand will
2919 be adjusted so that it grows to accommodate the new
2920 subFeature
2921 Example :
2922 Returns : nothing
2923 Args : a Bio::SeqFeatureI object
2925 =cut
2927 sub add_SeqFeature {
2928 my ($self,@feat) = @_;
2930 $self->{'_as_feat'} = [] unless $self->{'_as_feat'};
2932 foreach my $feat ( @feat ) {
2933 if( !$feat->isa("Bio::SeqFeatureI") ) {
2934 $self->throw("$feat is not a SeqFeatureI and that's what we expect...");
2937 push(@{$self->{'_as_feat'}},$feat);
2939 return 1;
2943 =head2 remove_SeqFeatures
2945 Usage : $obj->remove_SeqFeatures
2946 Function: Removes all sub SeqFeatures. If you want to remove only a subset,
2947 remove that subset from the returned array, and add back the rest.
2948 Returns : The array of Bio::SeqFeatureI implementing sub-features that was
2949 deleted from this feature.
2950 Args : none
2952 =cut
2954 sub remove_SeqFeatures {
2955 my $self = shift;
2957 return () unless $self->{'_as_feat'};
2958 my @feats = @{$self->{'_as_feat'}};
2959 $self->{'_as_feat'} = [];
2960 return @feats;
2963 =head2 feature_count
2965 Title : feature_count
2966 Usage : $obj->feature_count()
2967 Function: Return the number of SeqFeatures attached to a feature holder.
2969 This is before flattening a possible sub-feature tree.
2971 We provide a default implementation here that just counts
2972 the number of objects returned by get_SeqFeatures().
2973 Implementors may want to override this with a more
2974 efficient implementation.
2976 Returns : integer representing the number of SeqFeatures
2977 Args : None
2979 At some day we may want to expand this method to allow for a feature
2980 filter to be passed in.
2982 Our default implementation allows for any number of additional
2983 arguments and will pass them on to get_SeqFeatures(). I.e., in order to
2984 support filter arguments, just support them in get_SeqFeatures().
2986 sub feature_count {
2987 my ($self) = @_;
2989 if (defined($self->{'_as_feat'})) {
2990 return ($#{$self->{'_as_feat'}} + 1);
2991 } else {
2992 return 0;
2996 =head2 get_all_SeqFeatures
2998 Title : get_all_SeqFeatures
2999 Usage :
3000 Function: Get the flattened tree of feature objects held by this
3001 feature holder. The difference to get_SeqFeatures is that
3002 the entire tree of sub-features will be flattened out.
3004 We provide a default implementation here, so implementors
3005 don''t necessarily need to implement this method.
3007 Example :
3008 Returns : an array of Bio::SeqFeatureI implementing objects
3009 Args : none
3011 At some day we may want to expand this method to allow for a feature
3012 filter to be passed in.
3014 Our default implementation allows for any number of additional
3015 arguments and will pass them on to any invocation of
3016 get_SeqFeatures(), wherever a component of the tree implements
3017 FeatureHolderI. I.e., in order to support filter arguments, just
3018 support them in get_SeqFeatures().
3020 =cut
3022 =head2 methods for Bio::AnnotatableI
3024 AnnotatableI implementation to support sequence alignments which
3025 contain annotation (NEXUS, Stockholm).
3027 =head2 annotation
3029 Title : annotation
3030 Usage : $ann = $aln->annotation or
3031 $aln->annotation($ann)
3032 Function: Gets or sets the annotation
3033 Returns : Bio::AnnotationCollectionI object
3034 Args : None or Bio::AnnotationCollectionI object
3036 See L<Bio::AnnotationCollectionI> and L<Bio::Annotation::Collection>
3037 for more information
3039 =cut
3041 sub annotation {
3042 my ($obj,$value) = @_;
3043 if( defined $value ) {
3044 $obj->throw("object of class ".ref($value)." does not implement ".
3045 "Bio::AnnotationCollectionI. Too bad.")
3046 unless $value->isa("Bio::AnnotationCollectionI");
3047 $obj->{'_annotation'} = $value;
3048 } elsif( ! defined $obj->{'_annotation'}) {
3049 $obj->{'_annotation'} = Bio::Annotation::Collection->new();
3051 return $obj->{'_annotation'};