2 # BioPerl module for SimpleAlign
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Heikki Lehvaslaiho <heikki-at-bioperl-dot-org>
8 # Copyright Ewan Birney
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
15 # 11/3/00 Added threshold feature to consensus and consensus_aa - PS
16 # May 2001 major rewrite - Heikki Lehvaslaiho
20 Bio::SimpleAlign - Multiple alignments held as a set of sequences
24 # Use Bio::AlignIO to read in the alignment
25 $str = Bio::AlignIO->new(-file => 't/data/testaln.pfam');
26 $aln = $str->next_aln();
30 print $aln->num_residues;
32 print $aln->num_sequences;
34 print $aln->percentage_identity;
35 print $aln->consensus_string(50);
37 # Find the position in the alignment for a sequence location
38 $pos = $aln->column_from_residue_number('1433_LYCES', 14); # = 6;
40 # Extract sequences and check values for the alignment column $pos
41 foreach $seq ($aln->each_seq) {
42 $res = $seq->subseq($pos, $pos);
45 foreach $res (keys %count) {
46 printf "Res: %s Count: %2d\n", $res, $count{$res};
50 $aln->remove_seq($seq);
51 $mini_aln = $aln->slice(20,30); # get a block of columns
52 $mini_aln = $aln->select_noncont(1,3,5,7,11); # select certain sequences
53 $new_aln = $aln->remove_columns([20,30]); # remove by position
54 $new_aln = $aln->remove_columns(['mismatch']); # remove by property
57 $str = $aln->consensus_string($threshold_percent);
58 $str = $aln->match_line();
59 $str = $aln->cigar_line();
60 $id = $aln->percentage_identity;
62 # See the module documentation for details and more methods.
66 SimpleAlign is an object that handles a multiple sequence alignment
67 (MSA). It is very permissive of types (it does not insist on sequences
68 being all same length, for example). Think of it as a set of sequences
69 with a whole series of built-in manipulations and methods for reading and
72 SimpleAlign uses L<Bio::LocatableSeq>, a subclass of L<Bio::PrimarySeq>,
73 to store its sequences. These are subsequences with a start and end
74 positions in the parent reference sequence. Each sequence in the
75 SimpleAlign object is a Bio::LocatableSeq.
77 SimpleAlign expects the combination of name, start, and end for a
78 given sequence to be unique in the alignment, and this is the key for the
79 internal hashes (name, start, end are abbreviated C<nse> in the code).
80 However, in some cases people do not want the name/start-end to be displayed:
81 either multiple names in an alignment or names specific to the alignment
82 (ROA1_HUMAN_1, ROA1_HUMAN_2 etc). These names are called
83 C<displayname>, and generally is what is used to print out the
84 alignment. They default to name/start-end.
86 The SimpleAlign Module is derived from the Align module by Ewan Birney.
92 User feedback is an integral part of the evolution of this and other
93 Bioperl modules. Send your comments and suggestions preferably to one
94 of the Bioperl mailing lists. Your participation is much appreciated.
96 bioperl-l@bioperl.org - General discussion
97 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
101 Please direct usage questions or support issues to the mailing list:
103 I<bioperl-l@bioperl.org>
105 rather than to the module maintainer directly. Many experienced and
106 reponsive experts will be able look at the problem and quickly
107 address it. Please include a thorough description of the problem
108 with code and data examples if at all possible.
110 =head2 Reporting Bugs
112 Report bugs to the Bioperl bug tracking system to help us keep track
113 the bugs and their resolution. Bug reports can be submitted via the
116 http://bugzilla.open-bio.org/
120 Ewan Birney, birney@ebi.ac.uk
124 Allen Day, allenday-at-ucla.edu,
125 Richard Adams, Richard.Adams-at-ed.ac.uk,
126 David J. Evans, David.Evans-at-vir.gla.ac.uk,
127 Heikki Lehvaslaiho, heikki-at-bioperl-dot-org,
128 Allen Smith, allens-at-cpan.org,
129 Jason Stajich, jason-at-bioperl.org,
130 Anthony Underwood, aunderwood-at-phls.org.uk,
131 Xintao Wei & Giri Narasimhan, giri-at-cs.fiu.edu
132 Brian Osborne, bosborne at alum.mit.edu
133 Weigang Qiu, Weigang at GENECTR-HUNTER-CUNY-EDU
134 Hongyu Zhang, forward at hongyu.org
135 Jay Hannah, jay at jays.net
136 Alexandr Bezginov, albezg at gmail.com
144 The rest of the documentation details each of the object
145 methods. Internal methods are usually preceded with a _
149 # 'Let the code begin...
151 package Bio
::SimpleAlign
;
152 use vars
qw(%CONSERVATION_GROUPS);
155 use Bio::LocatableSeq; # uses Seq's as list
158 use Bio::SeqFeature::Generic;
161 # This data should probably be in a more centralized module...
162 # it is taken from Clustalw documentation.
163 # These are all the positively scoring groups that occur in the
164 # Gonnet Pam250 matrix. The strong and weak groups are
165 # defined as strong score >0.5 and weak score =<0.5 respectively.
167 %CONSERVATION_GROUPS = (
192 use base
qw(Bio::Root::Root Bio::Align::AlignI Bio::AnnotatableI
193 Bio::FeatureHolderI);
198 Usage : my $aln = Bio::SimpleAlign->new();
199 Function : Creates a new simple align object
200 Returns : Bio::SimpleAlign
201 Args : -source => string representing the source program
202 where this alignment came from
203 -annotation => Bio::AnnotationCollectionI
204 -seq_annotation => Bio::AnnotationCollectionI for sequences (requires -annotation also be set)
205 -seqs => array ref containing Bio::LocatableSeq or Bio::Seq::Meta
206 -consensus => consensus string
207 -consensus_meta => Bio::Seq::Meta object containing consensus met information (kludge)
213 my($class,@args) = @_;
215 my $self = $class->SUPER::new
(@args);
217 my ($src, $score, $id, $acc, $desc, $seqs, $feats, $coll, $sa, $con, $cmeta) = $self->_rearrange([qw(
230 $src && $self->source($src);
231 defined $score && $self->score($score);
232 # we need to set up internal hashs first!
234 $self->{'_seq'} = {};
235 $self->{'_order'} = {};
236 $self->{'_start_end_lists'} = {};
237 $self->{'_dis_name'} = {};
238 $self->{'_id'} = 'NoName';
239 # maybe we should automatically read in from args. Hmmm...
240 $id && $self->id($id);
241 $acc && $self->accession($acc);
242 $desc && $self->description($desc);
243 $coll && $self->annotation($coll);
244 # sequence annotation is layered into a provided annotation collection (or dies)
246 $self->throw("Must supply an alignment-based annotation collection (-annotation) ".
247 "with a sequence annotation collection")
249 $coll->add_Annotation('seq_annotation', $sa);
251 if ($feats && ref $feats eq 'ARRAY') {
252 for my $feat (@
$feats) {
253 $self->add_SeqFeature($feat);
256 $con && $self->consensus($con);
257 $cmeta && $self->consensus_meta($cmeta);
258 # assumes these are in correct alignment order
259 if ($seqs && ref($seqs) eq 'ARRAY') {
260 for my $seq (@
$seqs) {
261 $self->add_seq($seq);
265 return $self; # success - we hope!
268 =head1 Modifier methods
270 These methods modify the MSA by adding, removing or shuffling complete
276 Usage : $myalign->add_seq($newseq);
277 $myalign->add_seq(-SEQ=>$newseq, -ORDER=>5);
278 Function : Adds another sequence to the alignment. *Does not* align
279 it - just adds it to the hashes.
280 If -ORDER is specified, the sequence is inserted at the
281 the position spec'd by -ORDER, and existing sequences
282 are pushed down the storage array.
284 Args : A Bio::LocatableSeq object
285 Positive integer for the sequence position (optional)
287 See L<Bio::LocatableSeq> for more information
293 $self->deprecated("addSeq - deprecated method. Use add_seq() instead.");
300 my ($seq, $order) = $self->_rearrange([qw(SEQ ORDER)], @args);
301 my ($name,$id,$start,$end);
304 $self->throw("LocatableSeq argument required");
306 if( ! ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
307 $self->throw("Unable to process non locatable sequences [". ref($seq). "]");
309 !defined($order) and $order = 1 + keys %{$self->{'_seq'}}; # default
310 $order--; # jay's patch (user-specified order is 1-origin)
313 $self->throw("User-specified value for ORDER must be >= 1");
316 $id = $seq->id() ||$seq->display_id || $seq->primary_id;
318 # build the symbol list for this sequence,
319 # will prune out the gap and missing/match chars
320 # when actually asked for the symbol list in the
322 # map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq) if $seq->seq;
324 $name = $seq->get_nse;
326 if( $self->{'_seq'}->{$name} ) {
327 $self->warn("Replacing one sequence [$name]\n") unless $self->verbose < 0;
330 $self->debug( "Assigning $name to $order\n");
332 my $ordh = $self->{'_order'};
333 if ($ordh->{$order}) {
334 # make space to insert
335 # $c->() returns (in reverse order) the first subsequence
336 # of consecutive integers; i.e., $c->(1,2,3,5,6,7) returns
337 # (3,2,1), and $c->(2,4,5) returns (2).
339 $c = sub { return (($_[1]-$_[0] == 1) ?
($c->(@_[1..$#_]),$_[0]) : $_[0]); };
341 $ordh->{$_+1} = $ordh->{$_}
342 } $c->(sort {$a <=> $b} grep {$_ >= $order} keys %{$ordh});
345 $ordh->{$order} = $name;
347 unless( exists( $self->{'_start_end_lists'}->{$id})) {
348 $self->{'_start_end_lists'}->{$id} = [];
350 push @
{$self->{'_start_end_lists'}->{$id}}, $seq;
353 $self->{'_seq'}->{$name} = $seq;
361 Usage : $aln->remove_seq($seq);
362 Function : Removes a single sequence from an alignment
364 Argument : a Bio::LocatableSeq object
370 $self->deprecated("removeSeq - deprecated method. Use remove_seq() instead.");
371 $self->remove_seq(@_);
377 my ($name,$id,$start,$end);
379 $self->throw("Need Bio::Locatable seq argument ")
380 unless ref $seq && $seq->isa( 'Bio::LocatableSeq');
383 $start = $seq->start();
385 $name = sprintf("%s/%d-%d",$id,$start,$end);
387 if( !exists $self->{'_seq'}->{$name} ) {
388 $self->throw("Sequence $name does not exist in the alignment to remove!");
391 delete $self->{'_seq'}->{$name};
393 # we need to remove this seq from the start_end_lists hash
395 if (exists $self->{'_start_end_lists'}->{$id}) {
396 # we need to find the sequence in the array.
399 for ($i=0; $i < @
{$self->{'_start_end_lists'}->{$id}}; $i++) {
400 if (${$self->{'_start_end_lists'}->{$id}}[$i] eq $seq) {
406 splice @
{$self->{'_start_end_lists'}->{$id}}, $i, 1;
409 $self->throw("Could not find the sequence to remoce from the start-end list");
413 $self->throw("There is no seq list for the name $id");
415 # we need to shift order hash
416 my %rev_order = reverse %{$self->{'_order'}};
417 my $no = $rev_order{$name};
418 my $num_sequences = $self->num_sequences;
419 for (; $no < $num_sequences; $no++) {
420 $self->{'_order'}->{$no} = $self->{'_order'}->{$no+1};
422 delete $self->{'_order'}->{$no};
430 Usage : $aln->purge(0.7);
431 Function: Removes sequences above given sequence similarity
432 This function will grind on large alignments. Beware!
434 Returns : An array of the removed sequences
435 Args : float, threshold for similarity
440 my ($self,$perc) = @_;
441 my (%duplicate, @dups);
443 my @seqs = $self->each_seq();
445 for (my $i=0;$i< @seqs - 1;$i++ ) { #for each seq in alignment
448 #skip if already in duplicate hash
449 next if exists $duplicate{$seq->display_id} ;
450 my $one = $seq->seq();
452 my @one = split '', $one; #split to get 1aa per array element
454 for (my $j=$i+1;$j < @seqs;$j++) {
455 my $seq2 = $seqs[$j];
457 #skip if already in duplicate hash
458 next if exists $duplicate{$seq2->display_id} ;
460 my $two = $seq2->seq();
461 my @two = split '', $two;
465 for (my $k=0;$k<@one;$k++) {
466 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
467 $one[$k] eq $two[$k]) {
470 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
471 $two[$k] ne '.' && $two[$k] ne '-' ) {
477 $ratio = $count/$res unless $res == 0;
479 # if above threshold put in duplicate hash and push onto
480 # duplicate array for returning to get_unique
481 if ( $ratio > $perc ) {
482 $self->warn("duplicate: ", $seq2->display_id) if $self->verbose > 0;
483 $duplicate{$seq2->display_id} = 1;
488 foreach my $seq (@dups) {
489 $self->remove_seq($seq);
494 =head2 sort_alphabetically
496 Title : sort_alphabetically
497 Usage : $ali->sort_alphabetically
498 Function : Changes the order of the alignment to alphabetical on name
499 followed by numerical by number.
505 sub sort_alphabetically
{
507 my ($seq,$nse,@arr,%hash,$count);
509 foreach $seq ( $self->each_seq() ) {
510 $nse = $seq->get_nse;
516 %{$self->{'_order'}} = (); # reset the hash;
518 foreach $nse ( sort _alpha_startend
keys %hash) {
519 $self->{'_order'}->{$count} = $nse;
529 Usage : $aln_ordered=$aln->sort_by_list($list_file)
530 Function : Arbitrarily order sequences in an alignment
531 Returns : A new Bio::SimpleAlign object
532 Argument : a file listing sequence names in intended order (one name per line)
537 my ($self, $list) = @_;
538 my (@seq, @ids, %order);
540 foreach my $seq ( $self->each_seq() ) {
542 push @ids, $seq->display_id;
546 open(my $listfh, '<', $list) || $self->throw("can't open file for reading: $list");
550 $self->throw("Not found in alignment: $name") unless &_in_aln
($name, \
@ids);
555 # use the map-sort-map idiom:
556 my @sorted= map { $_->[1] } sort { $a->[0] <=> $b->[0] } map { [$order{$_->id()}, $_] } @seq;
557 my $aln = $self->new;
558 foreach (@sorted) { $aln->add_seq($_) }
562 =head2 set_new_reference
564 Title : set_new_reference
565 Usage : $aln->set_new_reference(3 or 'B31'): Select the 3rd sequence, or
566 the sequence whoes name is "B31" (full, exact, and case-sensitive),
567 as the reference (1st) sequence
568 Function : Change/Set a new reference (i.e., the first) sequence
569 Returns : a new Bio::SimpleAlign object.
570 Throws an exception if designated sequence not found
571 Argument : a positive integer of sequence order, or a sequence name
572 in the original alignment
576 sub set_new_reference
{
577 my ($self, $seqid) = @_;
578 my $aln = $self->new;
579 my (@seq, @ids, @new_seq);
581 foreach my $seq ( $self->each_seq() ) {
583 push @ids, $seq->display_id;
586 if ($seqid =~ /^\d+$/) { # argument is seq position
588 $self->throw("The new reference sequence number has to be a positive integer >1 and <= num_sequences ") if ($seqid <= 1 || $seqid > $self->num_sequences);
589 } else { # argument is a seq name
590 $self->throw("The new reference sequence not in alignment ") unless &_in_aln
($seqid, \
@ids);
593 for (my $i=0; $i<=$#seq; $i++) {
595 if ( ($is_num && $pos == $seqid) || ($seqid eq $seq[$i]->display_id) ) {
596 unshift @new_seq, $seq[$i];
598 push @new_seq, $seq[$i];
601 foreach (@new_seq) { $aln->add_seq($_); }
605 sub _in_aln
{ # check if input name exists in the alignment
606 my ($str, $ref) = @_;
608 return 1 if $str eq $_;
617 Usage : $aln->uniq_seq(): Remove identical sequences in
618 in the alignment. Ambiguous base ("N", "n") and
619 leading and ending gaps ("-") are NOT counted as
621 Function : Make a new alignment of unique sequence types (STs)
622 Returns : 1a. if called in a scalar context,
623 a new Bio::SimpleAlign object (all sequences renamed as "ST")
624 1b. if called in an array context,
625 a new Bio::SimpleAlign object, and a hashref whose keys
626 are sequence types, and whose values are arrayrefs to
627 lists of sequence ids within the corresponding sequence type
628 2. if $aln->verbose > 0, ST of each sequence is sent to
629 STDERR (in a tabular format)
635 my ($self, $seqid) = @_;
636 my $aln = $self->new;
637 my (%member, %order, @seq, @uniq_str, $st);
639 my $len = $self->length();
641 foreach my $seq ( $self->each_seq() ) {
642 my $str = $seq->seq();
644 # it's necessary to ignore "n", "N", leading gaps and ending gaps in
645 # comparing two sequence strings
647 # 1st, convert "n", "N" to "?" (for DNA sequence only):
648 $str =~ s/n/\?/gi if $str =~ /^[atcgn-]+$/i;
649 # 2nd, convert leading and ending gaps to "?":
650 $str = &_convert_leading_ending_gaps
($str, '-', '?');
651 # Note that '?' also can mean unknown residue.
652 # I don't like making global class member changes like this, too
653 # prone to errors... -- cjfields 08-11-18
654 local $Bio::LocatableSeq
::GAP_SYMBOLS
= '-\?';
655 my $new = Bio
::LocatableSeq
->new(
657 -alphabet
=> $seq->alphabet,
659 -start
=> $seq->start,
665 foreach my $seq (@seq) {
666 my $str = $seq->seq();
667 my ($seen, $key) = &_check_uniq
($str, \
@uniq_str, $len);
668 if ($seen) { # seen before
669 my @memb = @
{$member{$key}};
671 $member{$key} = \
@memb;
673 push @uniq_str, $key;
675 $member{$key} = [ ($seq) ];
676 $order{$key} = $order;
680 foreach my $str (sort {$order{$a} <=> $order{$b}} keys %order) { # sort by input order
681 # convert leading/ending "?" back into "-" ("?" throws errors by SimpleAlign):
682 my $str2 = &_convert_leading_ending_gaps
($str, '?', '-');
683 # convert middle "?" back into "N" ("?" throws errors by SimpleAlign):
684 $str2 =~ s/\?/N/g if $str2 =~ /^[atcg\-\?]+$/i;
686 my $end= CORE
::length($str2);
687 $end -= CORE
::length($1) while $str2 =~ m/($gap+)/g;
688 my $new = Bio
::LocatableSeq
->new(-id
=>"ST".$order{$str},
694 foreach (@
{$member{$str}}) {
695 push @
{$$st{$order{$str}}}, $_->id(); # per Tristan's patch/Bug #2805
696 $self->debug($_->id(), "\t", "ST", $order{$str}, "\n");
699 return wantarray ?
($aln, $st) : $aln;
702 sub _check_uniq
{ # check if same seq exists in the alignment
703 my ($str1, $ref, $length) = @_;
704 my @char1=split //, $str1;
707 return (0, $str1) if @array==0; # not seen (1st sequence)
709 foreach my $str2 (@array) {
711 my @char2=split //, $str2;
712 for (my $i=0; $i<=$length-1; $i++) {
713 next if $char1[$i] eq '?';
714 next if $char2[$i] eq '?';
715 $diff++ if $char1[$i] ne $char2[$i];
717 return (1, $str2) if $diff == 0; # seen before
720 return (0, $str1); # not seen
723 sub _convert_leading_ending_gaps
{
727 my @array=split //, $s;
728 # convert leading char:
729 for (my $i=0; $i<=$#array; $i++) {
730 ($array[$i] eq $sym1) ?
($array[$i] = $sym2):(last);
732 # convert ending char:
733 for (my $i = $#array; $i>= 0; $i--) {
734 ($array[$i] eq $sym1) ?
($array[$i] = $sym2):(last);
736 my $s_new=join '', @array;
740 =head1 Sequence selection methods
742 Methods returning one or more sequences objects.
747 Usage : foreach $seq ( $align->each_seq() )
748 Function : Gets a Seq object from the alignment
756 $self->deprecated("eachSeq - deprecated method. Use each_seq() instead.");
764 foreach $order ( sort { $a <=> $b } keys %{$self->{'_order'}} ) {
765 if( exists $self->{'_seq'}->{$self->{'_order'}->{$order}} ) {
766 push(@arr,$self->{'_seq'}->{$self->{'_order'}->{$order}});
773 =head2 each_alphabetically
775 Title : each_alphabetically
776 Usage : foreach $seq ( $ali->each_alphabetically() )
777 Function : Returns a sequence object, but the objects are returned
778 in alphabetically sorted order.
779 Does not change the order of the alignment.
785 sub each_alphabetically
{
787 my ($seq,$nse,@arr,%hash,$count);
789 foreach $seq ( $self->each_seq() ) {
790 $nse = $seq->get_nse;
794 foreach $nse ( sort _alpha_startend
keys %hash) {
795 push(@arr,$hash{$nse});
800 sub _alpha_startend
{
801 my ($aname,$astart,$bname,$bstart);
802 ($aname,$astart) = split (/-/,$a);
803 ($bname,$bstart) = split (/-/,$b);
805 if( $aname eq $bname ) {
806 return $astart <=> $bstart;
809 return $aname cmp $bname;
813 =head2 each_seq_with_id
815 Title : each_seq_with_id
816 Usage : foreach $seq ( $align->each_seq_with_id() )
817 Function : Gets a Seq objects from the alignment, the contents
818 being those sequences with the given name (there may be
821 Argument : a seq name
827 $self->deprecated("eachSeqWithId - deprecated method. Use each_seq_with_id() instead.");
828 $self->each_seq_with_id(@_);
831 sub each_seq_with_id
{
835 $self->throw("Method each_seq_with_id needs a sequence name argument")
840 if (exists($self->{'_start_end_lists'}->{$id})) {
841 @arr = @
{$self->{'_start_end_lists'}->{$id}};
846 =head2 get_seq_by_pos
848 Title : get_seq_by_pos
849 Usage : $seq = $aln->get_seq_by_pos(3) # third sequence from the alignment
850 Function : Gets a sequence based on its position in the alignment.
851 Numbering starts from 1. Sequence positions larger than
852 num_sequences() will thow an error.
853 Returns : a Bio::LocatableSeq object
854 Args : positive integer for the sequence position
863 $self->throw("Sequence position has to be a positive integer, not [$pos]")
864 unless $pos =~ /^\d+$/ and $pos > 0;
865 $self->throw("No sequence at position [$pos]")
866 unless $pos <= $self->num_sequences ;
868 my $nse = $self->{'_order'}->{--$pos};
869 return $self->{'_seq'}->{$nse};
874 Title : get_seq_by_id
875 Usage : $seq = $aln->get_seq_by_id($name) # seq named $name
876 Function : Gets a sequence based on its name.
877 Sequences that do not exist will warn and return undef
878 Returns : a Bio::LocatableSeq object
879 Args : string for sequence name
884 my ($self,$name) = @_;
885 unless( defined $name ) {
886 $self->warn("Must provide a sequence name");
889 for my $seq ( values %{$self->{'_seq'}} ) {
890 if ( $seq->id eq $name) {
897 =head2 seq_with_features
899 Title : seq_with_features
900 Usage : $seq = $aln->seq_with_features(-pos => 1,
903 sub { my $consensus = shift;
908 while($consensus =~ /[^?]$q[^?]/){
909 $consensus =~ s/([^?])$q([^?])/$1$n$2/;
915 Function: produces a Bio::Seq object by first splicing gaps from -pos
916 (by means of a splice_by_seq_pos() call), then creating
917 features using non-? chars (by means of a consensus_string()
918 call with stringency -consensus).
919 Returns : a Bio::Seq object
920 Args : -pos : required. sequence from which to build the Bio::Seq
922 -consensus : optional, defaults to consensus_string()'s
924 -mask : optional, a coderef to apply to consensus_string()'s
925 output before building features. this may be useful for
926 closing gaps of 1 bp by masking over them with N, for
931 sub seq_with_features
{
932 my ($self,%arg) = @_;
934 #first do the preparatory splice
935 $self->throw("must provide a -pos argument") unless $arg{-pos};
936 $self->splice_by_seq_pos($arg{-pos});
938 my $consensus_string = $self->consensus_string($arg{-consensus
});
939 $consensus_string = $arg{-mask
}->($consensus_string)
940 if defined($arg{-mask
});
944 push @bs, 1 if $consensus_string =~ /^[^?]/;
946 while($consensus_string =~ /\?[^?]/g){
947 push @bs, pos($consensus_string);
949 while($consensus_string =~ /[^?]\?/g){
950 push @es, pos($consensus_string);
953 push @es, CORE
::length($consensus_string) if $consensus_string =~ /[^?]$/;
955 my $seq = Bio
::Seq
->new();
957 # my $rootfeature = Bio::SeqFeature::Generic->new(
958 # -source_tag => 'location',
959 # -start => $self->get_seq_by_pos($arg{-pos})->start,
960 # -end => $self->get_seq_by_pos($arg{-pos})->end,
962 # $seq->add_SeqFeature($rootfeature);
964 while(my $b = shift @bs){
966 $seq->add_SeqFeature(
967 Bio
::SeqFeature
::Generic
->new(
968 -start
=> $b - 1 + $self->get_seq_by_pos($arg{-pos})->start,
969 -end
=> $e - 1 + $self->get_seq_by_pos($arg{-pos})->start,
970 -source_tag
=> $self->source || 'MSA',
979 =head1 Create new alignments
981 The result of these methods are horizontal or vertical subsets of the
987 Usage : $aln2 = $aln->select(1, 3) # three first sequences
988 Function : Creates a new alignment from a continuous subset of
989 sequences. Numbering starts from 1. Sequence positions
990 larger than num_sequences() will thow an error.
991 Returns : a Bio::SimpleAlign object
992 Args : positive integer for the first sequence
993 positive integer for the last sequence to include (optional)
999 my ($start, $end) = @_;
1001 $self->throw("Select start has to be a positive integer, not [$start]")
1002 unless $start =~ /^\d+$/ and $start > 0;
1003 $self->throw("Select end has to be a positive integer, not [$end]")
1004 unless $end =~ /^\d+$/ and $end > 0;
1005 $self->throw("Select $start [$start] has to be smaller than or equal to end [$end]")
1006 unless $start <= $end;
1008 my $aln = $self->new;
1009 foreach my $pos ($start .. $end) {
1010 $aln->add_seq($self->get_seq_by_pos($pos));
1012 $aln->id($self->id);
1013 # fix for meta, sf, ann
1017 =head2 select_noncont
1019 Title : select_noncont
1020 Usage : # 1st and 3rd sequences, sorted
1021 $aln2 = $aln->select_noncont(1, 3)
1023 # 1st and 3rd sequences, sorted (same as first)
1024 $aln2 = $aln->select_noncont(3, 1)
1026 # 1st and 3rd sequences, unsorted
1027 $aln2 = $aln->select_noncont('nosort',3, 1)
1029 Function : Creates a new alignment from a subset of sequences. Numbering
1030 starts from 1. Sequence positions larger than num_sequences() will
1031 throw an error. Sorts the order added to new alignment by default,
1032 to prevent sorting pass 'nosort' as the first argument in the list.
1033 Returns : a Bio::SimpleAlign object
1034 Args : array of integers for the sequences. If the string 'nosort' is
1035 passed as the first argument, the sequences will not be sorted
1036 in the new alignment but will appear in the order listed.
1040 sub select_noncont
{
1044 if ($pos[0] !~ m{^\d+$}) {
1045 my $sortcmd = shift @pos;
1046 if ($sortcmd eq 'nosort') {
1049 $self->throw("Command not recognized: $sortcmd. Only 'nosort' implemented at this time.");
1053 my $end = $self->num_sequences;
1055 $self->throw("position must be a positive integer, > 0 and <= $end not [$_]")
1056 unless( /^\d+$/ && $_ > 0 && $_ <= $end );
1059 @pos = sort {$a <=> $b} @pos unless $nosort;
1061 my $aln = $self->new;
1062 foreach my $p (@pos) {
1063 $aln->add_seq($self->get_seq_by_pos($p));
1065 $aln->id($self->id);
1066 # fix for meta, sf, ann
1073 Usage : $aln2 = $aln->slice(20,30)
1074 Function : Creates a slice from the alignment inclusive of start and
1075 end columns, and the first column in the alignment is denoted 1.
1076 Sequences with no residues in the slice are excluded from the
1077 new alignment and a warning is printed. Slice beyond the length of
1078 the sequence does not do padding.
1079 Returns : A Bio::SimpleAlign object
1080 Args : Positive integer for start column, positive integer for end column,
1081 optional boolean which if true will keep gap-only columns in the newly
1082 created slice. Example:
1084 $aln2 = $aln->slice(20,30,1)
1090 my ($start, $end, $keep_gap_only) = @_;
1092 $self->throw("Slice start has to be a positive integer, not [$start]")
1093 unless $start =~ /^\d+$/ and $start > 0;
1094 $self->throw("Slice end has to be a positive integer, not [$end]")
1095 unless $end =~ /^\d+$/ and $end > 0;
1096 $self->throw("Slice start [$start] has to be smaller than or equal to end [$end]")
1097 unless $start <= $end;
1098 $self->throw("This alignment has only ". $self->length . " residues. Slice start " .
1099 "[$start] is too big.") if $start > $self->length;
1100 my $cons_meta = $self->consensus_meta;
1101 my $aln = $self->new;
1102 $aln->id($self->id);
1103 foreach my $seq ( $self->each_seq() ) {
1104 my $new_seq = $seq->isa('Bio::Seq::MetaI') ?
1107 -alphabet
=> $seq->alphabet,
1108 -strand
=> $seq->strand,
1109 -verbose
=> $self->verbose) :
1110 Bio
::LocatableSeq
->new
1112 -alphabet
=> $seq->alphabet,
1113 -strand
=> $seq->strand,
1114 -verbose
=> $self->verbose);
1118 $seq_end = $seq->length if( $end > $seq->length );
1120 my $slice_seq = $seq->subseq($start, $seq_end);
1121 $new_seq->seq( $slice_seq );
1123 $slice_seq =~ s/\W//g;
1126 my $pre_start_seq = $seq->subseq(1, $start - 1);
1127 $pre_start_seq =~ s/\W//g;
1128 if (!defined($seq->strand)) {
1129 $new_seq->start( $seq->start + CORE
::length($pre_start_seq) );
1130 } elsif ($seq->strand < 0){
1131 $new_seq->start( $seq->end - CORE
::length($pre_start_seq) - CORE
::length($slice_seq) + 1);
1133 $new_seq->start( $seq->start + CORE
::length($pre_start_seq) );
1136 if ((defined $seq->strand)&&($seq->strand < 0)){
1137 $new_seq->start( $seq->end - CORE
::length($slice_seq) + 1);
1139 $new_seq->start( $seq->start);
1142 if ($new_seq->isa('Bio::Seq::MetaI')) {
1143 for my $meta_name ($seq->meta_names) {
1144 $new_seq->named_meta($meta_name, $seq->named_submeta($meta_name, $start, $end));
1147 $new_seq->end( $new_seq->start + CORE
::length($slice_seq) - 1 );
1149 if ($new_seq->start and $new_seq->end >= $new_seq->start) {
1150 $aln->add_seq($new_seq);
1152 if( $keep_gap_only ) {
1153 $aln->add_seq($new_seq);
1155 my $nse = $seq->get_nse();
1156 $self->warn("Slice [$start-$end] of sequence [$nse] contains no residues.".
1157 " Sequence excluded from the new alignment.");
1162 my $new = Bio
::Seq
::Meta
->new();
1163 for my $meta_name ($cons_meta->meta_names) {
1164 $new->named_meta($meta_name, $cons_meta->named_submeta($meta_name, $start, $end));
1166 $aln->consensus_meta($new);
1168 $aln->annotation($self->annotation);
1169 # fix for meta, sf, ann
1173 =head2 remove_columns
1175 Title : remove_columns
1176 Usage : $aln2 = $aln->remove_columns(['mismatch','weak']) or
1177 $aln2 = $aln->remove_columns([0,0],[6,8])
1178 Function : Creates an aligment with columns removed corresponding to
1179 the specified type or by specifying the columns by number.
1180 Returns : Bio::SimpleAlign object
1181 Args : Array ref of types ('match'|'weak'|'strong'|'mismatch'|'gaps'|
1182 'all_gaps_columns') or array ref where the referenced array
1183 contains a pair of integers that specify a range.
1184 The first column is 0
1188 sub remove_columns
{
1189 my ($self,@args) = @_;
1190 @args || $self->throw("Must supply column ranges or column types");
1193 if ($args[0][0] =~ /^[a-z_]+$/i) {
1194 $aln = $self->_remove_columns_by_type($args[0]);
1195 } elsif ($args[0][0] =~ /^\d+$/) {
1196 $aln = $self->_remove_columns_by_num(\
@args);
1198 $self->throw("You must pass array references to remove_columns(), not @args");
1200 # fix for meta, sf, ann
1208 Usage : $aln2 = $aln->remove_gaps
1209 Function : Creates an aligment with gaps removed
1210 Returns : a Bio::SimpleAlign object
1211 Args : a gap character(optional) if none specified taken
1212 from $self->gap_char,
1213 [optional] $all_gaps_columns flag (1 or 0, default is 0)
1214 indicates that only all-gaps columns should be deleted
1216 Used from method L<remove_columns> in most cases. Set gap character
1217 using L<gap_char()|gap_char>.
1222 my ($self,$gapchar,$all_gaps_columns) = @_;
1224 if ($all_gaps_columns) {
1225 $gap_line = $self->all_gap_line($gapchar);
1227 $gap_line = $self->gap_line($gapchar);
1229 my $aln = $self->new;
1233 my $del_char = $gapchar || $self->gap_char;
1234 # Do the matching to get the segments to remove
1235 while ($gap_line =~ m/[$del_char]/g) {
1236 my $start = pos($gap_line)-1;
1237 $gap_line=~/\G[$del_char]+/gc;
1238 my $end = pos($gap_line)-1;
1240 #have to offset the start and end for subsequent removes
1243 $length += ($end-$start+1);
1244 push @remove, [$start,$end];
1247 #remove the segments
1248 $aln = $#remove >= 0 ?
$self->_remove_col($aln,\
@remove) : $self;
1249 # fix for meta, sf, ann
1255 my ($self,$aln,$remove) = @_;
1258 my $gap = $self->gap_char;
1260 # splice out the segments and create new seq
1261 foreach my $seq($self->each_seq){
1262 my $new_seq = Bio
::LocatableSeq
->new(
1264 -alphabet
=> $seq->alphabet,
1265 -strand
=> $seq->strand,
1266 -verbose
=> $self->verbose);
1267 my $sequence = $seq->seq;
1268 foreach my $pair(@
{$remove}){
1269 my $start = $pair->[0];
1270 my $end = $pair->[1];
1271 $sequence = $seq->seq unless $sequence;
1272 my $orig = $sequence;
1273 my $head = $start > 0 ?
substr($sequence, 0, $start) : '';
1274 my $tail = ($end + 1) >= CORE
::length($sequence) ?
'' : substr($sequence, $end + 1);
1275 $sequence = $head.$tail;
1277 unless (defined $new_seq->start) {
1279 my $start_adjust = () = substr($orig, 0, $end + 1) =~ /$gap/g;
1280 $new_seq->start($seq->start + $end + 1 - $start_adjust);
1283 my $start_adjust = $orig =~ /^$gap+/;
1284 if ($start_adjust) {
1285 $start_adjust = $+[0] == $start;
1287 $new_seq->start($seq->start + $start_adjust);
1291 if (($end + 1) >= CORE
::length($orig)) {
1292 my $end_adjust = () = substr($orig, $start) =~ /$gap/g;
1293 $new_seq->end($seq->end - (CORE
::length($orig) - $start) + $end_adjust);
1296 $new_seq->end($seq->end);
1300 if ($new_seq->end < $new_seq->start) {
1301 # we removed all columns except for gaps: set to 0 to indicate no
1307 $new_seq->seq($sequence) if $sequence;
1308 push @new, $new_seq;
1310 # add the new seqs to the alignment
1311 foreach my $new(@new){
1312 $aln->add_seq($new);
1314 # fix for meta, sf, ann
1318 sub _remove_columns_by_type
{
1319 my ($self,$type) = @_;
1320 my $aln = $self->new;
1323 my $gap = $self->gap_char if (grep { $_ eq 'gaps'} @
{$type});
1324 my $all_gaps_columns = $self->gap_char if (grep /all_gaps_columns/,@
{$type});
1325 my %matchchars = ( 'match' => '\*',
1330 'all_gaps_columns' => ''
1332 # get the characters to delete against
1334 foreach my $type (@
{$type}){
1335 $del_char.= $matchchars{$type};
1339 my $match_line = $self->match_line;
1340 # do the matching to get the segments to remove
1342 while($match_line =~ m/[$del_char]/g ){
1343 my $start = pos($match_line)-1;
1344 $match_line=~/\G[$del_char]+/gc;
1345 my $end = pos($match_line)-1;
1347 #have to offset the start and end for subsequent removes
1350 $length += ($end-$start+1);
1351 push @remove, [$start,$end];
1355 # remove the segments
1356 $aln = $#remove >= 0 ?
$self->_remove_col($aln,\
@remove) : $self;
1357 $aln = $aln->remove_gaps() if $gap;
1358 $aln = $aln->remove_gaps('', 1) if $all_gaps_columns;
1359 # fix for meta, sf, ann
1364 sub _remove_columns_by_num
{
1365 my ($self,$positions) = @_;
1366 my $aln = $self->new;
1368 # sort the positions
1369 @
$positions = sort { $a->[0] <=> $b->[0] } @
$positions;
1373 foreach my $pos (@
{$positions}) {
1374 my ($start, $end) = @
{$pos};
1376 #have to offset the start and end for subsequent removes
1379 $length += ($end-$start+1);
1380 push @remove, [$start,$end];
1383 #remove the segments
1384 $aln = $#remove >= 0 ?
$self->_remove_col($aln,\
@remove) : $self;
1385 # fix for meta, sf, ann
1390 =head1 Change sequences within the MSA
1392 These methods affect characters in all sequences without changing the
1395 =head2 splice_by_seq_pos
1397 Title : splice_by_seq_pos
1398 Usage : $status = splice_by_seq_pos(1);
1399 Function: splices all aligned sequences where the specified sequence
1402 Returns : 1 on success
1403 Args : position of sequence to splice by
1408 sub splice_by_seq_pos
{
1409 my ($self,$pos) = @_;
1411 my $guide = $self->get_seq_by_pos($pos);
1412 my $guide_seq = $guide->seq;
1414 $guide_seq =~ s/\./\-/g;
1418 while(($pos = index($guide_seq, '-', $pos)) > -1 ){
1419 unshift @gaps, $pos;
1423 foreach my $seq ($self->each_seq){
1424 my @bases = split '', $seq->seq;
1426 splice(@bases, $_, 1) foreach @gaps;
1427 $seq->seq(join('', @bases));
1436 Usage : $ali->map_chars('\.','-')
1437 Function : Does a s/$arg1/$arg2/ on the sequences. Useful for gap
1440 Notice that the from (arg1) is interpretted as a regex,
1441 so be careful about quoting meta characters (eg
1442 $ali->map_chars('.','-') wont do what you want)
1444 Argument : 'from' rexexp
1455 $self->throw("Need exactly two arguments")
1456 unless defined $from and defined $to;
1458 foreach $seq ( $self->each_seq() ) {
1459 $temp = $seq->seq();
1460 $temp =~ s/$from/$to/g;
1470 Usage : $ali->uppercase()
1471 Function : Sets all the sequences to uppercase
1482 foreach $seq ( $self->each_seq() ) {
1483 $temp = $seq->seq();
1484 $temp =~ tr/[a-z]/[A-Z]/;
1493 Title : cigar_line()
1494 Usage : %cigars = $align->cigar_line()
1495 Function : Generates a "cigar" (Compact Idiosyncratic Gapped Alignment
1496 Report) line for each sequence in the alignment. Examples are
1497 "1,60" or "5,10:12,58", where the numbers refer to conserved
1498 positions within the alignment. The keys of the hash are the
1499 NSEs (name/start/end) assigned to each sequence.
1500 Args : threshold (optional, defaults to 100)
1501 Returns : Hash of strings (cigar lines)
1510 my @consensus = split "",($self->consensus_string($thr));
1511 my $len = $self->length;
1512 my $gapchar = $self->gap_char;
1514 # create a precursor, something like (1,4,5,6,7,33,45),
1515 # where each number corresponds to a conserved position
1516 foreach my $seq ( $self->each_seq ) {
1517 my @seq = split "", uc ($seq->seq);
1519 for (my $x = 0 ; $x < $len ; $x++ ) {
1520 if ($seq[$x] eq $consensus[$x]) {
1521 push @
{$cigars{$seq->get_nse}},$pos;
1523 } elsif ($seq[$x] ne $gapchar) {
1528 # duplicate numbers - (1,4,5,6,7,33,45) becomes (1,1,4,5,6,7,33,33,45,45)
1529 for my $name (keys %cigars) {
1530 splice @
{$cigars{$name}}, 1, 0, ${$cigars{$name}}[0] if
1531 ( ${$cigars{$name}}[0] + 1 < ${$cigars{$name}}[1] );
1532 push @
{$cigars{$name}}, ${$cigars{$name}}[$#{$cigars{$name}}] if
1533 ( ${$cigars{$name}}[($#{$cigars{$name}} - 1)] + 1 <
1534 ${$cigars{$name}}[$#{$cigars{$name}}] );
1535 for ( my $x = 1 ; $x < $#{$cigars{$name}} - 1 ; $x++) {
1536 if (${$cigars{$name}}[$x - 1] + 1 < ${$cigars{$name}}[$x] &&
1537 ${$cigars{$name}}[$x + 1] > ${$cigars{$name}}[$x] + 1) {
1538 splice @
{$cigars{$name}}, $x, 0, ${$cigars{$name}}[$x];
1542 # collapse series - (1,1,4,5,6,7,33,33,45,45) becomes (1,1,4,7,33,33,45,45)
1543 for my $name (keys %cigars) {
1545 for ( my $x = 0 ; $x < $#{$cigars{$name}} ; $x++) {
1546 if ( ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x - 1)] + 1 &&
1547 ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x + 1)] - 1 ) {
1551 for my $pos (@remove) {
1552 splice @
{$cigars{$name}}, $pos, 1;
1555 # join and punctuate
1556 for my $name (keys %cigars) {
1557 my ($start,$end,$str) = "";
1558 while ( ($start,$end) = splice @
{$cigars{$name}}, 0, 2 ) {
1559 $str .= ($start . "," . $end . ":");
1562 $cigars{$name} = $str;
1570 Title : match_line()
1571 Usage : $line = $align->match_line()
1572 Function : Generates a match line - much like consensus string
1573 except that a line indicating the '*' for a match.
1574 Args : (optional) Match line characters ('*' by default)
1575 (optional) Strong match char (':' by default)
1576 (optional) Weak match char ('.' by default)
1582 my ($self,$matchlinechar, $strong, $weak) = @_;
1583 my %matchchars = ('match' => $matchlinechar || '*',
1584 'weak' => $weak || '.',
1585 'strong' => $strong || ':',
1591 foreach my $seq ( $self->each_seq ) {
1592 push @seqchars, [ split(//, uc ($seq->seq)) ];
1593 $alphabet = $seq->alphabet unless defined $alphabet;
1595 my $refseq = shift @seqchars;
1596 # let's just march down the columns
1599 foreach my $pos ( 0..$self->length ) {
1600 my $refchar = $refseq->[$pos];
1601 my $char = $matchchars{'mismatch'};
1602 unless( defined $refchar ) {
1603 last if $pos == $self->length; # short circuit on last residue
1604 # this in place to handle jason's soon-to-be-committed
1605 # intron mapping code
1608 my %col = ($refchar => 1);
1609 my $dash = ($refchar eq '-' || $refchar eq '.' || $refchar eq ' ');
1610 foreach my $seq ( @seqchars ) {
1611 next if $pos >= scalar @
$seq;
1612 $dash = 1 if( $seq->[$pos] eq '-' || $seq->[$pos] eq '.' ||
1613 $seq->[$pos] eq ' ' );
1614 $col{$seq->[$pos]}++ if defined $seq->[$pos];
1616 my @colresidues = sort keys %col;
1618 # if all the values are the same
1619 if( $dash ) { $char = $matchchars{'mismatch'} }
1620 elsif( @colresidues == 1 ) { $char = $matchchars{'match'} }
1621 elsif( $alphabet eq 'protein' ) { # only try to do weak/strong
1622 # matches for protein seqs
1624 foreach my $type ( qw(strong weak) ) {
1625 # iterate through categories
1627 # iterate through each of the aa in the col
1628 # look to see which groups it is in
1629 foreach my $c ( @colresidues ) {
1630 foreach my $f ( grep { index($_,$c) >= 0 } @
{$CONSERVATION_GROUPS{$type}} ) {
1631 push @
{$groups{$f}},$c;
1635 foreach my $cols ( values %groups ) {
1636 @
$cols = sort @
$cols;
1637 # now we are just testing to see if two arrays
1638 # are identical w/o changing either one
1639 # have to be same len
1640 next if( scalar @
$cols != scalar @colresidues );
1641 # walk down the length and check each slot
1642 for($_=0;$_ < (scalar @
$cols);$_++ ) {
1643 next GRP
if( $cols->[$_] ne $colresidues[$_] );
1645 $char = $matchchars{$type};
1651 $matchline .= $char;
1660 Usage : $line = $align->gap_line()
1661 Function : Generates a gap line - much like consensus string
1662 except that a line where '-' represents gap
1663 Args : (optional) gap line characters ('-' by default)
1669 my ($self,$gapchar) = @_;
1670 $gapchar = $gapchar || $self->gap_char;
1671 my %gap_hsh; # column gaps vector
1672 foreach my $seq ( $self->each_seq ) {
1674 map {$gap_hsh{$_->[0]} = undef} grep {$_->[1] eq $gapchar}
1675 map {[$i++, $_]} split(//, uc ($seq->seq));
1678 foreach my $pos ( 0..$self->length-1 ) {
1679 $gap_line .= (exists $gap_hsh{$pos}) ?
$gapchar:'.';
1686 Title : all_gap_line()
1687 Usage : $line = $align->all_gap_line()
1688 Function : Generates a gap line - much like consensus string
1689 except that a line where '-' represents all-gap column
1690 Args : (optional) gap line characters ('-' by default)
1696 my ($self,$gapchar) = @_;
1697 $gapchar = $gapchar || $self->gap_char;
1698 my %gap_hsh; # column gaps counter hash
1699 my @seqs = $self->each_seq;
1700 foreach my $seq ( @seqs ) {
1702 map {$gap_hsh{$_->[0]}++} grep {$_->[1] eq $gapchar}
1703 map {[$i++, $_]} split(//, uc ($seq->seq));
1706 foreach my $pos ( 0..$self->length-1 ) {
1707 if (exists $gap_hsh{$pos} && $gap_hsh{$pos} == scalar @seqs) {
1709 $gap_line .= $gapchar;
1717 =head2 gap_col_matrix
1719 Title : gap_col_matrix()
1720 Usage : my $cols = $align->gap_col_matrix()
1721 Function : Generates an array of hashes where
1722 each entry in the array is a hash reference
1723 with keys of all the sequence names and
1724 and value of 1 or 0 if the sequence has a gap at that column
1725 Args : (optional) gap line characters ($aln->gap_char or '-' by default)
1729 sub gap_col_matrix
{
1730 my ($self,$gapchar) = @_;
1731 $gapchar = $gapchar || $self->gap_char;
1732 my %gap_hsh; # column gaps vector
1734 foreach my $seq ( $self->each_seq ) {
1736 my $str = $seq->seq;
1737 my $len = $seq->length;
1739 my $id = $seq->display_id;
1740 while( $i < $len ) {
1741 $ch = substr($str, $i, 1);
1742 $cols[$i++]->{$id} = ($ch eq $gapchar);
1751 Usage : $ali->match()
1752 Function : Goes through all columns and changes residues that are
1753 identical to residue in first sequence to match '.'
1754 character. Sets match_char.
1756 USE WITH CARE: Most MSA formats do not support match
1757 characters in sequences, so this is mostly for output
1758 only. NEXUS format (Bio::AlignIO::nexus) can handle
1761 Argument : a match character, optional, defaults to '.'
1766 my ($self, $match) = @_;
1769 my ($matching_char) = $match;
1770 $matching_char = "\\$match" if $match =~ /[\^.$|()\[\]]/ ; #';
1771 $self->map_chars($matching_char, '-');
1773 my @seqs = $self->each_seq();
1774 return 1 unless scalar @seqs > 1;
1776 my $refseq = shift @seqs ;
1777 my @refseq = split //, $refseq->seq;
1778 my $gapchar = $self->gap_char;
1780 foreach my $seq ( @seqs ) {
1781 my @varseq = split //, $seq->seq();
1782 for ( my $i=0; $i < scalar @varseq; $i++) {
1783 $varseq[$i] = $match if defined $refseq[$i] &&
1784 ( $refseq[$i] =~ /[A-Za-z\*]/ ||
1785 $refseq[$i] =~ /$gapchar/ )
1786 && $refseq[$i] eq $varseq[$i];
1788 $seq->seq(join '', @varseq);
1790 $self->match_char($match);
1798 Usage : $ali->unmatch()
1799 Function : Undoes the effect of method match. Unsets match_char.
1801 Argument : a match character, optional, defaults to '.'
1803 See L<match> and L<match_char>
1808 my ($self, $match) = @_;
1812 my @seqs = $self->each_seq();
1813 return 1 unless scalar @seqs > 1;
1815 my $refseq = shift @seqs ;
1816 my @refseq = split //, $refseq->seq;
1817 my $gapchar = $self->gap_char;
1818 foreach my $seq ( @seqs ) {
1819 my @varseq = split //, $seq->seq();
1820 for ( my $i=0; $i < scalar @varseq; $i++) {
1821 $varseq[$i] = $refseq[$i] if defined $refseq[$i] &&
1822 ( $refseq[$i] =~ /[A-Za-z\*]/ ||
1823 $refseq[$i] =~ /$gapchar/ ) &&
1824 $varseq[$i] eq $match;
1826 $seq->seq(join '', @varseq);
1828 $self->match_char('');
1832 =head1 MSA attributes
1834 Methods for setting and reading the MSA attributes.
1836 Note that the methods defining character semantics depend on the user
1837 to set them sensibly. They are needed only by certain input/output
1838 methods. Unset them by setting to an empty string ('').
1843 Usage : $myalign->id("Ig")
1844 Function : Gets/sets the id field of the alignment
1845 Returns : An id string
1846 Argument : An id string (optional)
1851 my ($self, $name) = @_;
1853 if (defined( $name )) {
1854 $self->{'_id'} = $name;
1857 return $self->{'_id'};
1863 Usage : $myalign->accession("PF00244")
1864 Function : Gets/sets the accession field of the alignment
1865 Returns : An acc string
1866 Argument : An acc string (optional)
1871 my ($self, $acc) = @_;
1873 if (defined( $acc )) {
1874 $self->{'_accession'} = $acc;
1877 return $self->{'_accession'};
1883 Usage : $myalign->description("14-3-3 proteins")
1884 Function : Gets/sets the description field of the alignment
1885 Returns : An description string
1886 Argument : An description string (optional)
1891 my ($self, $name) = @_;
1893 if (defined( $name )) {
1894 $self->{'_description'} = $name;
1897 return $self->{'_description'};
1902 Title : missing_char
1903 Usage : $myalign->missing_char("?")
1904 Function : Gets/sets the missing_char attribute of the alignment
1905 It is generally recommended to set it to 'n' or 'N'
1906 for nucleotides and to 'X' for protein.
1907 Returns : An missing_char string,
1908 Argument : An missing_char string (optional)
1913 my ($self, $char) = @_;
1915 if (defined $char ) {
1916 $self->throw("Single missing character, not [$char]!") if CORE
::length($char) > 1;
1917 $self->{'_missing_char'} = $char;
1920 return $self->{'_missing_char'};
1926 Usage : $myalign->match_char('.')
1927 Function : Gets/sets the match_char attribute of the alignment
1928 Returns : An match_char string,
1929 Argument : An match_char string (optional)
1934 my ($self, $char) = @_;
1936 if (defined $char ) {
1937 $self->throw("Single match character, not [$char]!") if CORE
::length($char) > 1;
1938 $self->{'_match_char'} = $char;
1941 return $self->{'_match_char'};
1947 Usage : $myalign->gap_char('-')
1948 Function : Gets/sets the gap_char attribute of the alignment
1949 Returns : An gap_char string, defaults to '-'
1950 Argument : An gap_char string (optional)
1955 my ($self, $char) = @_;
1957 if (defined $char || ! defined $self->{'_gap_char'} ) {
1958 $char= '-' unless defined $char;
1959 $self->throw("Single gap character, not [$char]!") if CORE
::length($char) > 1;
1960 $self->{'_gap_char'} = $char;
1962 return $self->{'_gap_char'};
1967 Title : symbol_chars
1968 Usage : my @symbolchars = $aln->symbol_chars;
1969 Function: Returns all the seen symbols (other than gaps)
1970 Returns : array of characters that are the seen symbols
1971 Args : boolean to include the gap/missing/match characters
1976 my ($self,$includeextra) = @_;
1978 unless ($self->{'_symbols'}) {
1979 foreach my $seq ($self->each_seq) {
1980 map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq);
1983 my %copy = %{$self->{'_symbols'}};
1984 if( ! $includeextra ) {
1985 foreach my $char ( $self->gap_char, $self->match_char,
1986 $self->missing_char) {
1987 delete $copy{$char} if( defined $char );
1993 =head1 Alignment descriptors
1995 These read only methods describe the MSA in various ways.
2001 Usage : $str = $ali->score()
2002 Function : get/set a score of the alignment
2003 Returns : a score for the alignment
2004 Argument : an optional score to set
2010 $self->{score
} = shift if @_;
2011 return $self->{score
};
2014 =head2 consensus_string
2016 Title : consensus_string
2017 Usage : $str = $ali->consensus_string($threshold_percent)
2018 Function : Makes a strict consensus
2019 Returns : Consensus string
2020 Argument : Optional treshold ranging from 0 to 100.
2021 The consensus residue has to appear at least threshold %
2022 of the sequences at a given location, otherwise a '?'
2023 character will be placed at that location.
2024 (Default value = 0%)
2028 sub consensus_string
{
2030 my $threshold = shift;
2033 my $len = $self->length - 1;
2035 foreach ( 0 .. $len ) {
2036 $out .= $self->_consensus_aa($_,$threshold);
2044 my $threshold_percent = shift || -1 ;
2045 my ($seq,%hash,$count,$letter,$key);
2046 my $gapchar = $self->gap_char;
2047 foreach $seq ( $self->each_seq() ) {
2048 $letter = substr($seq->seq,$point,1);
2049 $self->throw("--$point-----------") if $letter eq '';
2050 ($letter eq $gapchar || $letter =~ /\./) && next;
2051 # print "Looking at $letter\n";
2054 my $number_of_sequences = $self->num_sequences();
2055 my $threshold = $number_of_sequences * $threshold_percent / 100. ;
2059 foreach $key ( sort keys %hash ) {
2060 # print "Now at $key $hash{$key}\n";
2061 if( $hash{$key} > $count && $hash{$key} >= $threshold) {
2063 $count = $hash{$key};
2070 =head2 consensus_iupac
2072 Title : consensus_iupac
2073 Usage : $str = $ali->consensus_iupac()
2074 Function : Makes a consensus using IUPAC ambiguity codes from DNA
2075 and RNA. The output is in upper case except when gaps in
2076 a column force output to be in lower case.
2078 Note that if your alignment sequences contain a lot of
2079 IUPAC ambiquity codes you often have to manually set
2080 alphabet. Bio::PrimarySeq::_guess_type thinks they
2081 indicate a protein sequence.
2082 Returns : consensus string
2084 Throws : on protein sequences
2088 sub consensus_iupac
{
2091 my $len = $self->length-1;
2093 # only DNA and RNA sequences are valid
2094 foreach my $seq ( $self->each_seq() ) {
2095 $self->throw("Seq [". $seq->get_nse. "] is a protein")
2096 if $seq->alphabet eq 'protein';
2098 # loop over the alignment columns
2099 foreach my $count ( 0 .. $len ) {
2100 $out .= $self->_consensus_iupac($count);
2105 sub _consensus_iupac
{
2106 my ($self, $column) = @_;
2107 my ($string, $char, $rna);
2109 #determine all residues in a column
2110 foreach my $seq ( $self->each_seq() ) {
2111 $string .= substr($seq->seq, $column, 1);
2113 $string = uc $string;
2115 # quick exit if there's an N in the string
2116 if ($string =~ /N/) {
2117 $string =~ /\W/ ?
return 'n' : return 'N';
2119 # ... or if there are only gap characters
2120 return '-' if $string =~ /^\W+$/;
2122 # treat RNA as DNA in regexps
2123 if ($string =~ /U/) {
2128 # the following s///'s only need to be done to the _first_ ambiguity code
2129 # as we only need to see the _range_ of characters in $string
2131 if ($string =~ /[VDHB]/) {
2132 $string =~ s/V/AGC/;
2133 $string =~ s/D/AGT/;
2134 $string =~ s/H/ACT/;
2135 $string =~ s/B/CTG/;
2138 if ($string =~ /[SKYRWM]/) {
2147 # and now the guts of the thing
2149 if ($string =~ /A/) {
2151 if ($string =~ /G/) {
2152 $char = 'R'; # A and G (purines) R
2153 if ($string =~ /C/) {
2154 $char = 'V'; # A and G and C V
2155 if ($string =~ /T/) {
2156 $char = 'N'; # A and G and C and T N
2158 } elsif ($string =~ /T/) {
2159 $char = 'D'; # A and G and T D
2161 } elsif ($string =~ /C/) {
2162 $char = 'M'; # A and C M
2163 if ($string =~ /T/) {
2164 $char = 'H'; # A and C and T H
2166 } elsif ($string =~ /T/) {
2167 $char = 'W'; # A and T W
2169 } elsif ($string =~ /C/) {
2171 if ($string =~ /T/) {
2172 $char = 'Y'; # C and T (pyrimidines) Y
2173 if ($string =~ /G/) {
2174 $char = 'B'; # C and T and G B
2176 } elsif ($string =~ /G/) {
2177 $char = 'S'; # C and G S
2179 } elsif ($string =~ /G/) {
2181 if ($string =~ /C/) {
2182 $char = 'S'; # G and C S
2183 } elsif ($string =~ /T/) {
2184 $char = 'K'; # G and T K
2186 } elsif ($string =~ /T/) {
2190 $char = 'U' if $rna and $char eq 'T';
2191 $char = lc $char if $string =~ /\W/;
2197 =head2 consensus_meta
2199 Title : consensus_meta
2200 Usage : $seqmeta = $ali->consensus_meta()
2201 Function : Returns a Bio::Seq::Meta object containing the consensus
2202 strings derived from meta data analysis.
2203 Returns : Bio::Seq::Meta
2204 Argument : Bio::Seq::Meta
2205 Throws : non-MetaI object
2209 sub consensus_meta
{
2210 my ($self, $meta) = @_;
2211 if ($meta && (!ref $meta || !$meta->isa('Bio::Seq::MetaI'))) {
2212 $self->throw('Not a Bio::Seq::MetaI object');
2214 return $self->{'_aln_meta'} = $meta if $meta;
2215 return $self->{'_aln_meta'}
2221 Usage : if ( $ali->is_flush() )
2222 Function : Tells you whether the alignment
2223 : is flush, i.e. all of the same length
2230 my ($self,$report) = @_;
2235 foreach $seq ( $self->each_seq() ) {
2236 if( $length == (-1) ) {
2237 $length = CORE
::length($seq->seq());
2241 $temp = CORE
::length($seq->seq());
2242 if( $temp != $length ) {
2243 $self->warn("expecting $length not $temp from ".
2244 $seq->display_id) if( $report );
2245 $self->debug("expecting $length not $temp from ".
2247 $self->debug($seq->seq(). "\n");
2259 Usage : $len = $ali->length()
2260 Function : Returns the maximum length of the alignment.
2261 To be sure the alignment is a block, use is_flush
2269 $self->deprecated("length_aln - deprecated method. Use length() instead.");
2279 foreach $seq ( $self->each_seq() ) {
2280 $temp = $seq->length();
2281 if( $temp > $length ) {
2290 =head2 maxdisplayname_length
2292 Title : maxdisplayname_length
2293 Usage : $ali->maxdisplayname_length()
2294 Function : Gets the maximum length of the displayname in the
2295 alignment. Used in writing out various MSA formats.
2301 sub maxname_length
{
2303 $self->deprecated("maxname_length - deprecated method.".
2304 " Use maxdisplayname_length() instead.");
2305 $self->maxdisplayname_length();
2310 $self->deprecated("maxnse_length - deprecated method.".
2311 " Use maxnse_length() instead.");
2312 $self->maxdisplayname_length();
2315 sub maxdisplayname_length
{
2320 foreach $seq ( $self->each_seq() ) {
2321 $len = CORE
::length $self->displayname($seq->get_nse());
2323 if( $len > $maxname ) {
2331 =head2 max_metaname_length
2333 Title : max_metaname_length
2334 Usage : $ali->max_metaname_length()
2335 Function : Gets the maximum length of the meta name tags in the
2336 alignment for the sequences and for the alignment.
2337 Used in writing out various MSA formats.
2343 sub max_metaname_length
{
2348 # check seq meta first
2349 for $seq ( $self->each_seq() ) {
2350 next if !$seq->isa('Bio::Seq::MetaI' || !$seq->meta_names);
2351 for my $mtag ($seq->meta_names) {
2352 $len = CORE
::length $mtag;
2353 if( $len > $maxname ) {
2360 for my $meta ($self->consensus_meta) {
2362 for my $name ($meta->meta_names) {
2363 $len = CORE
::length $name;
2364 if( $len > $maxname ) {
2375 Title : num_residues
2376 Usage : $no = $ali->num_residues
2377 Function : number of residues in total in the alignment
2380 Note : replaces no_residues()
2388 foreach my $seq ($self->each_seq) {
2389 my $str = $seq->seq();
2391 $count += ($str =~ s/[A-Za-z]//g);
2397 =head2 num_sequences
2399 Title : num_sequences
2400 Usage : $depth = $ali->num_sequences
2401 Function : number of sequence in the sequence alignment
2404 Note : replaces no_sequences()
2410 return scalar($self->each_seq);
2414 =head2 average_percentage_identity
2416 Title : average_percentage_identity
2417 Usage : $id = $align->average_percentage_identity
2418 Function: The function uses a fast method to calculate the average
2419 percentage identity of the alignment
2420 Returns : The average percentage identity of the alignment
2422 Notes : This method implemented by Kevin Howe calculates a figure that is
2423 designed to be similar to the average pairwise identity of the
2424 alignment (identical in the absence of gaps), without having to
2425 explicitly calculate pairwise identities proposed by Richard Durbin.
2426 Validated by Ewan Birney ad Alex Bateman.
2430 sub average_percentage_identity
{
2431 my ($self,@args) = @_;
2433 my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
2434 'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
2436 my ($len, $total, $subtotal, $divisor, $subdivisor, @seqs, @countHashes);
2438 if (! $self->is_flush()) {
2439 $self->throw("All sequences in the alignment must be the same length");
2442 @seqs = $self->each_seq();
2443 $len = $self->length();
2445 # load the each hash with correct keys for existence checks
2447 for( my $index=0; $index < $len; $index++) {
2448 foreach my $letter (@alphabet) {
2449 $countHashes[$index]->{$letter} = 0;
2452 foreach my $seq (@seqs) {
2453 my @seqChars = split //, $seq->seq();
2454 for( my $column=0; $column < @seqChars; $column++ ) {
2455 my $char = uc($seqChars[$column]);
2456 if (exists $countHashes[$column]->{$char}) {
2457 $countHashes[$column]->{$char}++;
2464 for(my $column =0; $column < $len; $column++) {
2465 my %hash = %{$countHashes[$column]};
2467 foreach my $res (keys %hash) {
2468 $total += $hash{$res}*($hash{$res} - 1);
2469 $subdivisor += $hash{$res};
2471 $divisor += $subdivisor * ($subdivisor - 1);
2473 return $divisor > 0 ?
($total / $divisor )*100.0 : 0;
2476 =head2 percentage_identity
2478 Title : percentage_identity
2479 Usage : $id = $align->percentage_identity
2480 Function: The function calculates the average percentage identity
2481 (aliased to average_percentage_identity)
2482 Returns : The average percentage identity
2487 sub percentage_identity
{
2489 return $self->average_percentage_identity();
2492 =head2 overall_percentage_identity
2494 Title : overall_percentage_identity
2495 Usage : $id = $align->overall_percentage_identity
2496 $id = $align->overall_percentage_identity('short')
2497 Function: The function calculates the percentage identity of
2498 the conserved columns
2499 Returns : The percentage identity of the conserved columns
2500 Args : length value to use, optional defaults to alignment length
2501 possible values: 'align', 'short', 'long'
2503 The argument values 'short' and 'long' refer to shortest and longest
2504 sequence in the alignment. Method modification code by Hongyu Zhang.
2508 sub overall_percentage_identity
{
2509 my ($self, $length_measure) = @_;
2511 my %alphabet = map {$_ => undef} qw
(A C G T U B D E F H I J K L M N O P Q R S V W X Y Z
);
2513 my %enum = map {$_ => undef} qw
(align short long
);
2515 $self->throw("Unknown argument [$length_measure]")
2516 if $length_measure and not exists $enum{$length_measure};
2517 $length_measure ||= 'align';
2519 if (! $self->is_flush()) {
2520 $self->throw("All sequences in the alignment must be the same length");
2523 # Count the residues seen at each position
2525 my $total = 0; # number of positions with identical residues
2527 my @seqs = $self->each_seq;
2528 my $nof_seqs = scalar @seqs;
2529 my $aln_len = $self->length();
2530 for my $seq (@seqs) {
2531 my $seqstr = $seq->seq;
2533 # Count residues for given sequence
2534 for my $column (0 .. $aln_len-1) {
2535 my $char = uc( substr($seqstr, $column, 1) );
2536 if ( exists $alphabet{$char} ) {
2538 # This is a valid char
2539 if ( defined $countHashes[$column]->{$char} ) {
2540 $countHashes[$column]->{$char}++;
2542 $countHashes[$column]->{$char} = 1;
2545 if ( $countHashes[$column]->{$char} == $nof_seqs ) {
2546 # All sequences have this same residue
2554 if ($length_measure eq 'short' || $length_measure eq 'long') {
2555 my $seq_len = $seqstr =~ tr/[A-Za-z]//;
2556 if ($length_measure eq 'short') {
2557 if ( (not defined $len) || ($seq_len < $len) ) {
2560 } elsif ($length_measure eq 'long') {
2561 if ( (not defined $len) || ($seq_len > $len) ) {
2569 if ($length_measure eq 'align') {
2573 return ($total / $len ) * 100.0;
2578 =head1 Alignment positions
2580 Methods to map a sequence position into an alignment column and back.
2581 column_from_residue_number() does the former. The latter is really a
2582 property of the sequence object and can done using
2583 L<Bio::LocatableSeq::location_from_column>:
2585 # select somehow a sequence from the alignment, e.g.
2586 my $seq = $aln->get_seq_by_pos(1);
2587 #$loc is undef or Bio::LocationI object
2588 my $loc = $seq->location_from_column(5);
2590 =head2 column_from_residue_number
2592 Title : column_from_residue_number
2593 Usage : $col = $ali->column_from_residue_number( $seqname, $resnumber)
2594 Function: This function gives the position in the alignment
2595 (i.e. column number) of the given residue number in the
2596 sequence with the given name. For example, for the
2599 Seq1/91-97 AC..DEF.GH.
2600 Seq2/24-30 ACGG.RTY...
2601 Seq3/43-51 AC.DDEF.GHI
2603 column_from_residue_number( "Seq1", 94 ) returns 6.
2604 column_from_residue_number( "Seq2", 25 ) returns 2.
2605 column_from_residue_number( "Seq3", 50 ) returns 10.
2607 An exception is thrown if the residue number would lie
2608 outside the length of the aligment
2609 (e.g. column_from_residue_number( "Seq2", 22 )
2611 Note: If the the parent sequence is represented by more than
2612 one alignment sequence and the residue number is present in
2613 them, this method finds only the first one.
2615 Returns : A column number for the position in the alignment of the
2616 given residue in the given sequence (1 = first column)
2617 Args : A sequence id/name (not a name/start-end)
2618 A residue number in the whole sequence (not just that
2619 segment of it in the alignment)
2623 sub column_from_residue_number
{
2624 my ($self, $name, $resnumber) = @_;
2626 $self->throw("No sequence with name [$name]") unless $self->{'_start_end_lists'}->{$name};
2627 $self->throw("Second argument residue number missing") unless $resnumber;
2629 foreach my $seq ($self->each_seq_with_id($name)) {
2632 $col = $seq->column_from_residue_number($resnumber);
2638 $self->throw("Could not find a sequence segment in $name ".
2639 "containing residue number $resnumber");
2643 =head1 Sequence names
2645 Methods to manipulate the display name. The default name based on the
2646 sequence id and subsequence positions can be overridden in various
2652 Usage : $myalign->displayname("Ig", "IgA")
2653 Function : Gets/sets the display name of a sequence in the alignment
2654 Returns : A display name string
2655 Argument : name of the sequence
2656 displayname of the sequence (optional)
2660 sub get_displayname
{
2662 $self->deprecated("get_displayname - deprecated method. Use displayname() instead.");
2663 $self->displayname(@_);
2666 sub set_displayname
{
2668 $self->deprecated("set_displayname - deprecated method. Use displayname() instead.");
2669 $self->displayname(@_);
2673 my ($self, $name, $disname) = @_;
2675 $self->throw("No sequence with name [$name]")
2676 unless defined $self->{'_seq'}->{$name};
2678 if( $disname and $name) {
2679 $self->{'_dis_name'}->{$name} = $disname;
2682 elsif( defined $self->{'_dis_name'}->{$name} ) {
2683 return $self->{'_dis_name'}->{$name};
2689 =head2 set_displayname_count
2691 Title : set_displayname_count
2692 Usage : $ali->set_displayname_count
2693 Function : Sets the names to be name_# where # is the number of
2694 times this name has been used.
2695 Returns : 1, on success
2700 sub set_displayname_count
{
2702 my (@arr,$name,$seq,$count,$temp,$nse);
2704 foreach $seq ( $self->each_alphabetically() ) {
2705 $nse = $seq->get_nse();
2707 #name will be set when this is the second
2708 #time (or greater) is has been seen
2710 if( defined $name and $name eq ($seq->id()) ) {
2711 $temp = sprintf("%s_%s",$name,$count);
2712 $self->displayname($nse,$temp);
2717 $temp = sprintf("%s_%s",$name,$count);
2718 $self->displayname($nse,$temp);
2725 =head2 set_displayname_flat
2727 Title : set_displayname_flat
2728 Usage : $ali->set_displayname_flat()
2729 Function : Makes all the sequences be displayed as just their name,
2736 sub set_displayname_flat
{
2740 foreach $seq ( $self->each_seq() ) {
2741 $nse = $seq->get_nse();
2742 $self->displayname($nse,$seq->id());
2747 =head2 set_displayname_normal
2749 Title : set_displayname_normal
2750 Usage : $ali->set_displayname_normal()
2751 Function : Makes all the sequences be displayed as name/start-end
2752 Returns : 1, on success
2757 sub set_displayname_normal
{
2761 foreach $seq ( $self->each_seq() ) {
2762 $nse = $seq->get_nse();
2763 $self->displayname($nse,$nse);
2771 Usage : $obj->source($newval)
2772 Function: sets the Alignment source program
2774 Returns : value of source
2775 Args : newvalue (optional)
2781 my ($self,$value) = @_;
2782 if( defined $value) {
2783 $self->{'_source'} = $value;
2785 return $self->{'_source'};
2788 =head2 set_displayname_safe
2790 Title : set_displayname_safe
2791 Usage : ($new_aln, $ref_name)=$ali->set_displayname_safe(4)
2792 Function : Assign machine-generated serial names to sequences in input order.
2793 Designed to protect names during PHYLIP runs. Assign 10-char string
2794 in the form of "S000000001" to "S999999999". Restore the original
2795 names using "restore_displayname".
2796 Returns : 1. a new $aln with system names;
2797 2. a hash ref for restoring names
2798 Argument : Number for id length (default 10)
2802 sub set_displayname_safe
{
2804 my $idlength = shift || 10;
2805 my ($seq, %phylip_name);
2807 my $new=Bio
::SimpleAlign
->new();
2808 foreach $seq ( $self->each_seq() ) {
2810 my $pname="S". sprintf "%0" . ($idlength-1) . "s", $ct;
2811 $phylip_name{$pname}=$seq->id();
2812 my $new_seq= Bio
::LocatableSeq
->new(-id
=> $pname,
2813 -seq
=> $seq->seq(),
2814 -alphabet
=> $seq->alphabet,
2815 -start
=> $seq->{_start
},
2816 -end
=> $seq->{_end
}
2818 $new->add_seq($new_seq);
2821 $self->debug("$ct seq names changed. Restore names by using restore_displayname.");
2822 return ($new, \
%phylip_name);
2825 =head2 restore_displayname
2827 Title : restore_displayname
2828 Usage : $aln_name_restored=$ali->restore_displayname($hash_ref)
2829 Function : Restore original sequence names (after running
2830 $ali->set_displayname_safe)
2831 Returns : a new $aln with names restored.
2832 Argument : a hash reference of names from "set_displayname_safe".
2836 sub restore_displayname
{
2840 my $new=Bio
::SimpleAlign
->new();
2841 foreach my $seq ( $self->each_seq() ) {
2842 $self->throw("No sequence with name") unless defined $name{$seq->id()};
2843 my $new_seq= Bio
::LocatableSeq
->new(-id
=> $name{$seq->id()},
2844 -seq
=> $seq->seq(),
2845 -alphabet
=> $seq->alphabet,
2846 -start
=> $seq->{_start
},
2847 -end
=> $seq->{_end
}
2849 $new->add_seq($new_seq);
2854 =head2 sort_by_start
2856 Title : sort_by_start
2857 Usage : $ali->sort_by_start
2858 Function : Changes the order of the alignment to the start position of each
2867 my ($seq,$nse,@arr,%hash,$count);
2868 foreach $seq ( $self->each_seq() ) {
2869 $nse = $seq->get_nse;
2873 %{$self->{'_order'}} = (); # reset the hash;
2874 foreach $nse ( sort _startend
keys %hash) {
2875 $self->{'_order'}->{$count} = $nse;
2883 my ($aname,$arange) = split (/[\/]/,$a);
2884 my ($bname,$brange) = split (/[\/]/,$b);
2885 my ($astart,$aend) = split(/\-/,$arange);
2886 my ($bstart,$bend) = split(/\-/,$brange);
2887 return $astart <=> $bstart;
2890 =head2 bracket_string
2892 Title : bracket_string
2893 Usage : my @params = (-refseq => 'testseq',
2894 -allele1 => 'allele1',
2895 -allele2 => 'allele2',
2896 -delimiters => '{}',
2898 $str = $aln->bracket_string(@params)
2900 Function : When supplied with a list of parameters (see below), returns a
2901 string in BIC format. This is used for allelic comparisons.
2902 Briefly, if either allele contains a base change when compared to
2903 the refseq, the base or gap for each allele is represented in
2904 brackets in the order present in the 'alleles' parameter.
2906 For the following data:
2915 the returned string with parameters 'refseq => testseq' and
2916 'alleles => [qw(allele1 allele2)]' would be:
2918 GGAT[C/-][C/-]ATT[C/C]CT[A/C]CT
2919 Returns : BIC-formatted string
2920 Argument : Required args
2921 refseq : string (ID) of the reference sequence used
2922 as basis for comparison
2923 allele1 : string (ID) of the first allele
2924 allele2 : string (ID) of the second allele
2926 delimiters: two symbol string of left and right delimiters.
2927 Only the first two symbols are used
2929 separator : string used as a separator. Only the first
2932 Throws : On no refseq/alleles, or invalid refseq/alleles.
2936 sub bracket_string
{
2937 my ($self, @args) = @_;
2938 my ($ref, $a1, $a2, $delim, $sep) =
2939 $self->_rearrange([qw(refseq allele1 allele2 delimiters separator)], @args);
2940 $self->throw('Missing refseq/allele1/allele2') if (!$a1 || !$a2 || !$ref);
2942 ($ld, $rd) = split('', $delim, 2) if $delim;
2946 my ($refseq, $allele1, $allele2) =
2947 map {( $self->each_seq_with_id($_) )} ($ref, $a1, $a2);
2948 if (!$refseq || !$allele1 || !$allele2) {
2949 $self->throw("One of your refseq/allele IDs is invalid!");
2951 my $len = $self->length-1;
2953 # loop over the alignment columns
2954 for my $column ( 0 .. $len ) {
2956 my ($compres, $res1, $res2) =
2957 map{substr($_->seq, $column, 1)} ($refseq, $allele1, $allele2);
2958 # are any of the allele symbols different from the refseq?
2959 $string = ($compres eq $res1 && $compres eq $res2) ?
$compres :
2960 $ld.$res1.$sep.$res2.$rd;
2967 =head2 methods implementing Bio::FeatureHolderI
2969 FeatureHolderI implementation to support labeled character sets like one
2970 would get from NEXUS represented data.
2972 =head2 get_SeqFeatures
2974 Usage : @features = $aln->get_SeqFeatures
2975 Function: Get the feature objects held by this feature holder.
2977 Returns : an array of Bio::SeqFeatureI implementing objects
2978 Args : optional filter coderef, taking a Bio::SeqFeatureI
2979 : as argument, returning TRUE if wanted, FALSE if
2984 sub get_SeqFeatures
{
2986 my $filter_cb = shift;
2987 $self->throw("Arg (filter callback) must be a coderef") unless
2988 !defined($filter_cb) or ref($filter_cb) eq 'CODE';
2989 if( !defined $self->{'_as_feat'} ) {
2990 $self->{'_as_feat'} = [];
2993 return grep { $filter_cb->($_) } @
{$self->{'_as_feat'}};
2995 return @
{$self->{'_as_feat'}};
2998 =head2 add_SeqFeature
3000 Usage : $aln->add_SeqFeature($subfeat);
3001 Function: adds a SeqFeature into the SeqFeature array.
3003 Returns : true on success
3004 Args : a Bio::SeqFeatureI object
3005 Note : This implementation is not compliant
3006 with Bio::FeatureHolderI
3010 sub add_SeqFeature
{
3011 my ($self,@feat) = @_;
3013 $self->{'_as_feat'} = [] unless $self->{'_as_feat'};
3015 foreach my $feat ( @feat ) {
3016 if( !$feat->isa("Bio::SeqFeatureI") ) {
3017 $self->throw("$feat is not a SeqFeatureI and that's what we expect...");
3020 push(@
{$self->{'_as_feat'}},$feat);
3026 =head2 remove_SeqFeatures
3028 Usage : $obj->remove_SeqFeatures
3029 Function: Removes all SeqFeatures. If you want to remove only a subset,
3030 remove that subset from the returned array, and add back the rest.
3031 Returns : The array of Bio::SeqFeatureI features that was
3032 deleted from this alignment.
3037 sub remove_SeqFeatures
{
3040 return () unless $self->{'_as_feat'};
3041 my @feats = @
{$self->{'_as_feat'}};
3042 $self->{'_as_feat'} = [];
3046 =head2 feature_count
3048 Title : feature_count
3049 Usage : $obj->feature_count()
3050 Function: Return the number of SeqFeatures attached to the alignment
3051 Returns : integer representing the number of SeqFeatures
3059 if (defined($self->{'_as_feat'})) {
3060 return ($#{$self->{'_as_feat'}} + 1);
3066 =head2 get_all_SeqFeatures
3068 Title : get_all_SeqFeatures
3070 Function: Get all SeqFeatures.
3072 Returns : an array of Bio::SeqFeatureI implementing objects
3074 Note : Falls through to Bio::FeatureHolderI implementation.
3078 =head2 methods for Bio::AnnotatableI
3080 AnnotatableI implementation to support sequence alignments which
3081 contain annotation (NEXUS, Stockholm).
3086 Usage : $ann = $aln->annotation or
3087 $aln->annotation($ann)
3088 Function: Gets or sets the annotation
3089 Returns : Bio::AnnotationCollectionI object
3090 Args : None or Bio::AnnotationCollectionI object
3092 See L<Bio::AnnotationCollectionI> and L<Bio::Annotation::Collection>
3093 for more information
3098 my ($obj,$value) = @_;
3099 if( defined $value ) {
3100 $obj->throw("object of class ".ref($value)." does not implement ".
3101 "Bio::AnnotationCollectionI. Too bad.")
3102 unless $value->isa("Bio::AnnotationCollectionI");
3103 $obj->{'_annotation'} = $value;
3104 } elsif( ! defined $obj->{'_annotation'}) {
3105 $obj->{'_annotation'} = Bio
::Annotation
::Collection
->new();
3107 return $obj->{'_annotation'};
3110 =head1 Deprecated methods
3117 Usage : $no = $ali->no_residues
3118 Function : number of residues in total in the alignment
3121 Note : deprecated in favor of num_residues()
3127 $self->deprecated(-warn_version
=> 1.0069,
3128 -throw_version
=> 1.0075,
3129 -message
=> 'Use of method no_residues() is deprecated, use num_residues() instead');
3130 $self->num_residues(@_);
3135 Title : no_sequences
3136 Usage : $depth = $ali->no_sequences
3137 Function : number of sequence in the sequence alignment
3140 Note : deprecated in favor of num_sequences()
3146 $self->deprecated(-warn_version
=> 1.0069,
3147 -throw_version
=> 1.0075,
3148 -message
=> 'Use of method no_sequences() is deprecated, use num_sequences() instead');
3149 $self->num_sequences(@_);
3154 Title : mask_columns
3155 Usage : $aln2 = $aln->mask_columns(20,30)
3156 Function : Masks a slice of the alignment inclusive of start and
3157 end columns, and the first column in the alignment is denoted 1.
3158 Mask beyond the length of the sequence does not do padding.
3159 Returns : A Bio::SimpleAlign object
3160 Args : Positive integer for start column, positive integer for end column,
3161 optional string value use for the mask. Example:
3163 $aln2 = $aln->mask_columns(20,30,'?')
3164 Note : Masking must use a character that is not used for gaps or
3165 frameshifts. These can be adjusted using the relevant global
3166 variables, but be aware these may be (uncontrollably) modified
3167 elsewhere within BioPerl (see bug 2715)
3172 #based on slice(), but did not include the Bio::Seq::Meta sections as I was not sure what it is doing
3175 my $nonres = $Bio::LocatableSeq
::GAP_SYMBOLS
.
3176 $Bio::LocatableSeq
::FRAMESHIFT_SYMBOLS
;
3178 # coordinates are alignment-based, not sequence-based
3179 my ($start, $end, $mask_char) = @_;
3180 unless (defined $mask_char) { $mask_char = 'N' }
3182 $self->throw("Mask start has to be a positive integer and less than ".
3183 "alignment length, not [$start]")
3184 unless $start =~ /^\d+$/ && $start > 0 && $start <= $self->length;
3185 $self->throw("Mask end has to be a positive integer and less than ".
3186 "alignment length, not [$end]")
3187 unless $end =~ /^\d+$/ && $end > 0 && $end <= $self->length;
3188 $self->throw("Mask start [$start] has to be smaller than or equal to ".
3189 "end [$end]") unless $start <= $end;
3190 $self->throw("Mask character $mask_char has to be a single character ".
3191 "and not a gap or frameshift symbol")
3192 unless CORE
::length($mask_char) == 1 && $mask_char !~ m{$nonres};
3194 my $aln = $self->new;
3195 $aln->id($self->id);
3196 foreach my $seq ( $self->each_seq() ) {
3197 my $new_seq = Bio
::LocatableSeq
->new(-id
=> $seq->id,
3198 -alphabet
=> $seq->alphabet,
3199 -strand
=> $seq->strand,
3200 -verbose
=> $self->verbose);
3202 # convert from 1-based alignment coords!
3203 my $masked_string = substr($seq->seq, $start - 1, $end - $start + 1);
3204 $masked_string =~ s{[^$nonres]}{$mask_char}g;
3205 my $new_dna_string = substr($seq->seq,0,$start-1) . $masked_string . substr($seq->seq,$end);
3206 $new_seq->seq($new_dna_string);
3207 $aln->add_seq($new_seq);