1 # BioPerl module for SimpleAlign
3 # Please direct questions and support issues to <bioperl-l@bioperl.org>
5 # Cared for by Heikki Lehvaslaiho <heikki-at-bioperl-dot-org>
7 # Copyright Ewan Birney
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
14 # 11/3/00 Added threshold feature to consensus and consensus_aa - PS
15 # May 2001 major rewrite - Heikki Lehvaslaiho
19 Bio::SimpleAlign - Multiple alignments held as a set of sequences
23 # Use Bio::AlignIO to read in the alignment
24 $str = Bio::AlignIO->new(-file => 't/data/testaln.pfam');
25 $aln = $str->next_aln();
29 print $aln->num_residues;
31 print $aln->num_sequences;
33 print $aln->percentage_identity;
34 print $aln->consensus_string(50);
36 # Find the position in the alignment for a sequence location
37 $pos = $aln->column_from_residue_number('1433_LYCES', 14); # = 6;
39 # Extract sequences and check values for the alignment column $pos
40 foreach $seq ($aln->each_seq) {
41 $res = $seq->subseq($pos, $pos);
44 foreach $res (keys %count) {
45 printf "Res: %s Count: %2d\n", $res, $count{$res};
49 $aln->remove_seq($seq);
50 $mini_aln = $aln->slice(20,30); # get a block of columns
51 $mini_aln = $aln->select_noncont(1,3,5,7,11); # select certain sequences
52 $new_aln = $aln->remove_columns([20,30]); # remove by position
53 $new_aln = $aln->remove_columns(['mismatch']); # remove by property
56 $str = $aln->consensus_string($threshold_percent);
57 $str = $aln->match_line();
58 $str = $aln->cigar_line();
59 $id = $aln->percentage_identity;
61 # See the module documentation for details and more methods.
65 SimpleAlign is an object that handles a multiple sequence alignment
66 (MSA). It is very permissive of types (it does not insist on sequences
67 being all same length, for example). Think of it as a set of sequences
68 with a whole series of built-in manipulations and methods for reading and
71 SimpleAlign uses L<Bio::LocatableSeq>, a subclass of L<Bio::PrimarySeq>,
72 to store its sequences. These are subsequences with a start and end
73 positions in the parent reference sequence. Each sequence in the
74 SimpleAlign object is a Bio::LocatableSeq.
76 SimpleAlign expects the combination of name, start, and end for a
77 given sequence to be unique in the alignment, and this is the key for the
78 internal hashes (name, start, end are abbreviated C<nse> in the code).
79 However, in some cases people do not want the name/start-end to be displayed:
80 either multiple names in an alignment or names specific to the alignment
81 (ROA1_HUMAN_1, ROA1_HUMAN_2 etc). These names are called
82 C<displayname>, and generally is what is used to print out the
83 alignment. They default to name/start-end.
85 The SimpleAlign Module is derived from the Align module by Ewan Birney.
91 User feedback is an integral part of the evolution of this and other
92 Bioperl modules. Send your comments and suggestions preferably to one
93 of the Bioperl mailing lists. Your participation is much appreciated.
95 bioperl-l@bioperl.org - General discussion
96 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
100 Please direct usage questions or support issues to the mailing list:
102 I<bioperl-l@bioperl.org>
104 rather than to the module maintainer directly. Many experienced and
105 reponsive experts will be able look at the problem and quickly
106 address it. Please include a thorough description of the problem
107 with code and data examples if at all possible.
109 =head2 Reporting Bugs
111 Report bugs to the Bioperl bug tracking system to help us keep track
112 the bugs and their resolution. Bug reports can be submitted via the
115 https://redmine.open-bio.org/projects/bioperl/
119 Ewan Birney, birney@ebi.ac.uk
123 Allen Day, allenday-at-ucla.edu,
124 Richard Adams, Richard.Adams-at-ed.ac.uk,
125 David J. Evans, David.Evans-at-vir.gla.ac.uk,
126 Heikki Lehvaslaiho, heikki-at-bioperl-dot-org,
127 Allen Smith, allens-at-cpan.org,
128 Jason Stajich, jason-at-bioperl.org,
129 Anthony Underwood, aunderwood-at-phls.org.uk,
130 Xintao Wei & Giri Narasimhan, giri-at-cs.fiu.edu
131 Brian Osborne, bosborne at alum.mit.edu
132 Weigang Qiu, Weigang at GENECTR-HUNTER-CUNY-EDU
133 Hongyu Zhang, forward at hongyu.org
134 Jay Hannah, jay at jays.net
135 Alexandr Bezginov, albezg at gmail.com
143 The rest of the documentation details each of the object
144 methods. Internal methods are usually preceded with a _
148 # 'Let the code begin...
150 package Bio
::SimpleAlign
;
151 use vars
qw(%CONSERVATION_GROUPS);
154 use Bio::LocatableSeq; # uses Seq's as list
157 use Bio::SeqFeature::Generic;
160 # This data should probably be in a more centralized module...
161 # it is taken from Clustalw documentation.
162 # These are all the positively scoring groups that occur in the
163 # Gonnet Pam250 matrix. The strong and weak groups are
164 # defined as strong score >0.5 and weak score =<0.5 respectively.
166 %CONSERVATION_GROUPS = (
191 use base
qw(Bio::Root::Root Bio::Align::AlignI Bio::AnnotatableI
192 Bio::FeatureHolderI);
197 Usage : my $aln = Bio::SimpleAlign->new();
198 Function : Creates a new simple align object
199 Returns : Bio::SimpleAlign
200 Args : -source => string representing the source program
201 where this alignment came from
202 -annotation => Bio::AnnotationCollectionI
203 -seq_annotation => Bio::AnnotationCollectionI for sequences (requires -annotation also be set)
204 -seqs => array ref containing Bio::LocatableSeq or Bio::Seq::Meta
205 -consensus => consensus string
206 -consensus_meta => Bio::Seq::Meta object containing consensus met information (kludge)
212 my($class,@args) = @_;
214 my $self = $class->SUPER::new
(@args);
216 my ($src, $score, $id, $acc, $desc, $seqs, $feats, $coll, $sa, $con, $cmeta) = $self->_rearrange([qw(
229 $src && $self->source($src);
230 defined $score && $self->score($score);
231 # we need to set up internal hashs first!
233 $self->{'_seq'} = {};
234 $self->{'_order'} = {};
235 $self->{'_start_end_lists'} = {};
236 $self->{'_dis_name'} = {};
237 $self->{'_id'} = 'NoName';
238 # maybe we should automatically read in from args. Hmmm...
239 $id && $self->id($id);
240 $acc && $self->accession($acc);
241 $desc && $self->description($desc);
242 $coll && $self->annotation($coll);
243 # sequence annotation is layered into a provided annotation collection (or dies)
245 $self->throw("Must supply an alignment-based annotation collection (-annotation) ".
246 "with a sequence annotation collection")
248 $coll->add_Annotation('seq_annotation', $sa);
250 if ($feats && ref $feats eq 'ARRAY') {
251 for my $feat (@
$feats) {
252 $self->add_SeqFeature($feat);
255 $con && $self->consensus($con);
256 $cmeta && $self->consensus_meta($cmeta);
257 # assumes these are in correct alignment order
258 if ($seqs && ref($seqs) eq 'ARRAY') {
259 for my $seq (@
$seqs) {
260 $self->add_seq($seq);
264 return $self; # success - we hope!
267 =head1 Modifier methods
269 These methods modify the MSA by adding, removing or shuffling complete
275 Usage : $myalign->add_seq($newseq);
276 $myalign->add_seq(-SEQ=>$newseq, -ORDER=>5);
277 Function : Adds another sequence to the alignment. *Does not* align
278 it - just adds it to the hashes.
279 If -ORDER is specified, the sequence is inserted at the
280 the position spec'd by -ORDER, and existing sequences
281 are pushed down the storage array.
283 Args : A Bio::LocatableSeq object
284 Positive integer for the sequence position (optional)
286 See L<Bio::LocatableSeq> for more information
292 $self->deprecated("addSeq - deprecated method. Use add_seq() instead.");
299 my ($seq, $order) = $self->_rearrange([qw(SEQ ORDER)], @args);
300 my ($name,$id,$start,$end);
303 $self->throw("LocatableSeq argument required");
305 if( ! ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
306 $self->throw("Unable to process non locatable sequences [". ref($seq). "]");
308 !defined($order) and $order = 1 + keys %{$self->{'_seq'}}; # default
309 $order--; # jay's patch (user-specified order is 1-origin)
312 $self->throw("User-specified value for ORDER must be >= 1");
315 $id = $seq->id() ||$seq->display_id || $seq->primary_id;
317 # build the symbol list for this sequence,
318 # will prune out the gap and missing/match chars
319 # when actually asked for the symbol list in the
321 # map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq) if $seq->seq;
323 $name = $seq->get_nse;
325 if( $self->{'_seq'}->{$name} ) {
326 $self->warn("Replacing one sequence [$name]\n") unless $self->verbose < 0;
329 $self->debug( "Assigning $name to $order\n");
331 my $ordh = $self->{'_order'};
332 if ($ordh->{$order}) {
333 # make space to insert
334 # $c->() returns (in reverse order) the first subsequence
335 # of consecutive integers; i.e., $c->(1,2,3,5,6,7) returns
336 # (3,2,1), and $c->(2,4,5) returns (2).
338 $c = sub { return (($_[1]-$_[0] == 1) ?
($c->(@_[1..$#_]),$_[0]) : $_[0]); };
340 $ordh->{$_+1} = $ordh->{$_}
341 } $c->(sort {$a <=> $b} grep {$_ >= $order} keys %{$ordh});
344 $ordh->{$order} = $name;
346 unless( exists( $self->{'_start_end_lists'}->{$id})) {
347 $self->{'_start_end_lists'}->{$id} = [];
349 push @
{$self->{'_start_end_lists'}->{$id}}, $seq;
352 $self->{'_seq'}->{$name} = $seq;
360 Usage : $aln->remove_seq($seq);
361 Function : Removes a single sequence from an alignment
363 Argument : a Bio::LocatableSeq object
369 $self->deprecated("removeSeq - deprecated method. Use remove_seq() instead.");
370 $self->remove_seq(@_);
376 my ($name,$id,$start,$end);
378 $self->throw("Need Bio::Locatable seq argument ")
379 unless ref $seq && $seq->isa( 'Bio::LocatableSeq');
382 $start = $seq->start();
384 $name = sprintf("%s/%d-%d",$id,$start,$end);
386 if( !exists $self->{'_seq'}->{$name} ) {
387 $self->throw("Sequence $name does not exist in the alignment to remove!");
390 delete $self->{'_seq'}->{$name};
392 # we need to remove this seq from the start_end_lists hash
394 if (exists $self->{'_start_end_lists'}->{$id}) {
395 # we need to find the sequence in the array.
398 for ($i=0; $i < @
{$self->{'_start_end_lists'}->{$id}}; $i++) {
399 if (${$self->{'_start_end_lists'}->{$id}}[$i] eq $seq) {
405 splice @
{$self->{'_start_end_lists'}->{$id}}, $i, 1;
408 $self->throw("Could not find the sequence to remoce from the start-end list");
412 $self->throw("There is no seq list for the name $id");
414 # we need to shift order hash
415 my %rev_order = reverse %{$self->{'_order'}};
416 my $no = $rev_order{$name};
417 my $num_sequences = $self->num_sequences;
418 for (; $no < $num_sequences; $no++) {
419 $self->{'_order'}->{$no} = $self->{'_order'}->{$no+1};
421 delete $self->{'_order'}->{$no};
429 Usage : $aln->purge(0.7);
430 Function: Removes sequences above given sequence similarity
431 This function will grind on large alignments. Beware!
433 Returns : An array of the removed sequences
434 Args : float, threshold for similarity
439 my ($self,$perc) = @_;
440 my (%duplicate, @dups);
442 my @seqs = $self->each_seq();
444 for (my $i=0;$i< @seqs - 1;$i++ ) { #for each seq in alignment
447 #skip if already in duplicate hash
448 next if exists $duplicate{$seq->display_id} ;
449 my $one = $seq->seq();
451 my @one = split '', $one; #split to get 1aa per array element
453 for (my $j=$i+1;$j < @seqs;$j++) {
454 my $seq2 = $seqs[$j];
456 #skip if already in duplicate hash
457 next if exists $duplicate{$seq2->display_id} ;
459 my $two = $seq2->seq();
460 my @two = split '', $two;
464 for (my $k=0;$k<@one;$k++) {
465 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
466 $one[$k] eq $two[$k]) {
469 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
470 $two[$k] ne '.' && $two[$k] ne '-' ) {
476 $ratio = $count/$res unless $res == 0;
478 # if above threshold put in duplicate hash and push onto
479 # duplicate array for returning to get_unique
480 if ( $ratio > $perc ) {
481 $self->warn("duplicate: ", $seq2->display_id) if $self->verbose > 0;
482 $duplicate{$seq2->display_id} = 1;
487 foreach my $seq (@dups) {
488 $self->remove_seq($seq);
493 =head2 sort_alphabetically
495 Title : sort_alphabetically
496 Usage : $ali->sort_alphabetically
497 Function : Changes the order of the alignment to alphabetical on name
498 followed by numerical by number.
504 sub sort_alphabetically
{
506 my ($seq,$nse,@arr,%hash,$count);
508 foreach $seq ( $self->each_seq() ) {
509 $nse = $seq->get_nse;
515 %{$self->{'_order'}} = (); # reset the hash;
517 foreach $nse ( sort _alpha_startend
keys %hash) {
518 $self->{'_order'}->{$count} = $nse;
528 Usage : $aln_ordered=$aln->sort_by_list($list_file)
529 Function : Arbitrarily order sequences in an alignment
530 Returns : A new Bio::SimpleAlign object
531 Argument : a file listing sequence names in intended order (one name per line)
536 my ($self, $list) = @_;
537 my (@seq, @ids, %order);
539 foreach my $seq ( $self->each_seq() ) {
541 push @ids, $seq->display_id;
545 open(my $listfh, '<', $list) || $self->throw("can't open file for reading: $list");
549 $self->throw("Not found in alignment: $name") unless &_in_aln
($name, \
@ids);
554 # use the map-sort-map idiom:
555 my @sorted= map { $_->[1] } sort { $a->[0] <=> $b->[0] } map { [$order{$_->id()}, $_] } @seq;
556 my $aln = $self->new;
557 foreach (@sorted) { $aln->add_seq($_) }
561 =head2 set_new_reference
563 Title : set_new_reference
564 Usage : $aln->set_new_reference(3 or 'B31'): Select the 3rd sequence, or
565 the sequence whoes name is "B31" (full, exact, and case-sensitive),
566 as the reference (1st) sequence
567 Function : Change/Set a new reference (i.e., the first) sequence
568 Returns : a new Bio::SimpleAlign object.
569 Throws an exception if designated sequence not found
570 Argument : a positive integer of sequence order, or a sequence name
571 in the original alignment
575 sub set_new_reference
{
576 my ($self, $seqid) = @_;
577 my $aln = $self->new;
578 my (@seq, @ids, @new_seq);
580 foreach my $seq ( $self->each_seq() ) {
582 push @ids, $seq->display_id;
585 if ($seqid =~ /^\d+$/) { # argument is seq position
587 $self->throw("The new reference sequence number has to be a positive integer >1 and <= num_sequences ") if ($seqid <= 1 || $seqid > $self->num_sequences);
588 } else { # argument is a seq name
589 $self->throw("The new reference sequence not in alignment ") unless &_in_aln
($seqid, \
@ids);
592 for (my $i=0; $i<=$#seq; $i++) {
594 if ( ($is_num && $pos == $seqid) || ($seqid eq $seq[$i]->display_id) ) {
595 unshift @new_seq, $seq[$i];
597 push @new_seq, $seq[$i];
600 foreach (@new_seq) { $aln->add_seq($_); }
604 sub _in_aln
{ # check if input name exists in the alignment
605 my ($str, $ref) = @_;
607 return 1 if $str eq $_;
616 Usage : $aln->uniq_seq(): Remove identical sequences in
617 in the alignment. Ambiguous base ("N", "n") and
618 leading and ending gaps ("-") are NOT counted as
620 Function : Make a new alignment of unique sequence types (STs)
621 Returns : 1a. if called in a scalar context,
622 a new Bio::SimpleAlign object (all sequences renamed as "ST")
623 1b. if called in an array context,
624 a new Bio::SimpleAlign object, and a hashref whose keys
625 are sequence types, and whose values are arrayrefs to
626 lists of sequence ids within the corresponding sequence type
627 2. if $aln->verbose > 0, ST of each sequence is sent to
628 STDERR (in a tabular format)
634 my ($self, $seqid) = @_;
635 my $aln = $self->new;
636 my (%member, %order, @seq, @uniq_str, $st);
638 my $len = $self->length();
640 foreach my $seq ( $self->each_seq() ) {
641 my $str = $seq->seq();
643 # it's necessary to ignore "n", "N", leading gaps and ending gaps in
644 # comparing two sequence strings
646 # 1st, convert "n", "N" to "?" (for DNA sequence only):
647 $str =~ s/n/\?/gi if $str =~ /^[atcgn-]+$/i;
648 # 2nd, convert leading and ending gaps to "?":
649 $str = &_convert_leading_ending_gaps
($str, '-', '?');
650 # Note that '?' also can mean unknown residue.
651 # I don't like making global class member changes like this, too
652 # prone to errors... -- cjfields 08-11-18
653 local $Bio::LocatableSeq
::GAP_SYMBOLS
= '-\?';
654 my $new = Bio
::LocatableSeq
->new(
656 -alphabet
=> $seq->alphabet,
658 -start
=> $seq->start,
664 foreach my $seq (@seq) {
665 my $str = $seq->seq();
666 my ($seen, $key) = &_check_uniq
($str, \
@uniq_str, $len);
667 if ($seen) { # seen before
668 my @memb = @
{$member{$key}};
670 $member{$key} = \
@memb;
672 push @uniq_str, $key;
674 $member{$key} = [ ($seq) ];
675 $order{$key} = $order;
679 foreach my $str (sort {$order{$a} <=> $order{$b}} keys %order) { # sort by input order
680 # convert leading/ending "?" back into "-" ("?" throws errors by SimpleAlign):
681 my $str2 = &_convert_leading_ending_gaps
($str, '?', '-');
682 # convert middle "?" back into "N" ("?" throws errors by SimpleAlign):
683 $str2 =~ s/\?/N/g if $str2 =~ /^[atcg\-\?]+$/i;
685 my $end= CORE
::length($str2);
686 $end -= CORE
::length($1) while $str2 =~ m/($gap+)/g;
687 my $new = Bio
::LocatableSeq
->new(-id
=>"ST".$order{$str},
693 foreach (@
{$member{$str}}) {
694 push @
{$$st{$order{$str}}}, $_->id(); # per Tristan's patch/Bug #2805
695 $self->debug($_->id(), "\t", "ST", $order{$str}, "\n");
698 return wantarray ?
($aln, $st) : $aln;
701 sub _check_uniq
{ # check if same seq exists in the alignment
702 my ($str1, $ref, $length) = @_;
703 my @char1=split //, $str1;
706 return (0, $str1) if @array==0; # not seen (1st sequence)
708 foreach my $str2 (@array) {
710 my @char2=split //, $str2;
711 for (my $i=0; $i<=$length-1; $i++) {
712 next if $char1[$i] eq '?';
713 next if $char2[$i] eq '?';
714 $diff++ if $char1[$i] ne $char2[$i];
716 return (1, $str2) if $diff == 0; # seen before
719 return (0, $str1); # not seen
722 sub _convert_leading_ending_gaps
{
726 my @array=split //, $s;
727 # convert leading char:
728 for (my $i=0; $i<=$#array; $i++) {
729 ($array[$i] eq $sym1) ?
($array[$i] = $sym2):(last);
731 # convert ending char:
732 for (my $i = $#array; $i>= 0; $i--) {
733 ($array[$i] eq $sym1) ?
($array[$i] = $sym2):(last);
735 my $s_new=join '', @array;
739 =head1 Sequence selection methods
741 Methods returning one or more sequences objects.
746 Usage : foreach $seq ( $align->each_seq() )
747 Function : Gets a Seq object from the alignment
755 $self->deprecated("eachSeq - deprecated method. Use each_seq() instead.");
763 foreach $order ( sort { $a <=> $b } keys %{$self->{'_order'}} ) {
764 if( exists $self->{'_seq'}->{$self->{'_order'}->{$order}} ) {
765 push(@arr,$self->{'_seq'}->{$self->{'_order'}->{$order}});
772 =head2 each_alphabetically
774 Title : each_alphabetically
775 Usage : foreach $seq ( $ali->each_alphabetically() )
776 Function : Returns a sequence object, but the objects are returned
777 in alphabetically sorted order.
778 Does not change the order of the alignment.
784 sub each_alphabetically
{
786 my ($seq,$nse,@arr,%hash,$count);
788 foreach $seq ( $self->each_seq() ) {
789 $nse = $seq->get_nse;
793 foreach $nse ( sort _alpha_startend
keys %hash) {
794 push(@arr,$hash{$nse});
799 sub _alpha_startend
{
800 my ($aname,$astart,$bname,$bstart);
801 ($aname,$astart) = split (/-/,$a);
802 ($bname,$bstart) = split (/-/,$b);
804 if( $aname eq $bname ) {
805 return $astart <=> $bstart;
808 return $aname cmp $bname;
812 =head2 each_seq_with_id
814 Title : each_seq_with_id
815 Usage : foreach $seq ( $align->each_seq_with_id() )
816 Function : Gets a Seq objects from the alignment, the contents
817 being those sequences with the given name (there may be
820 Argument : a seq name
826 $self->deprecated("eachSeqWithId - deprecated method. Use each_seq_with_id() instead.");
827 $self->each_seq_with_id(@_);
830 sub each_seq_with_id
{
834 $self->throw("Method each_seq_with_id needs a sequence name argument")
839 if (exists($self->{'_start_end_lists'}->{$id})) {
840 @arr = @
{$self->{'_start_end_lists'}->{$id}};
845 =head2 get_seq_by_pos
847 Title : get_seq_by_pos
848 Usage : $seq = $aln->get_seq_by_pos(3) # third sequence from the alignment
849 Function : Gets a sequence based on its position in the alignment.
850 Numbering starts from 1. Sequence positions larger than
851 num_sequences() will thow an error.
852 Returns : a Bio::LocatableSeq object
853 Args : positive integer for the sequence position
862 $self->throw("Sequence position has to be a positive integer, not [$pos]")
863 unless $pos =~ /^\d+$/ and $pos > 0;
864 $self->throw("No sequence at position [$pos]")
865 unless $pos <= $self->num_sequences ;
867 my $nse = $self->{'_order'}->{--$pos};
868 return $self->{'_seq'}->{$nse};
873 Title : get_seq_by_id
874 Usage : $seq = $aln->get_seq_by_id($name) # seq named $name
875 Function : Gets a sequence based on its name.
876 Sequences that do not exist will warn and return undef
877 Returns : a Bio::LocatableSeq object
878 Args : string for sequence name
883 my ($self,$name) = @_;
884 unless( defined $name ) {
885 $self->warn("Must provide a sequence name");
888 for my $seq ( values %{$self->{'_seq'}} ) {
889 if ( $seq->id eq $name) {
896 =head2 seq_with_features
898 Title : seq_with_features
899 Usage : $seq = $aln->seq_with_features(-pos => 1,
902 sub { my $consensus = shift;
907 while($consensus =~ /[^?]$q[^?]/){
908 $consensus =~ s/([^?])$q([^?])/$1$n$2/;
914 Function: produces a Bio::Seq object by first splicing gaps from -pos
915 (by means of a splice_by_seq_pos() call), then creating
916 features using non-? chars (by means of a consensus_string()
917 call with stringency -consensus).
918 Returns : a Bio::Seq object
919 Args : -pos : required. sequence from which to build the Bio::Seq
921 -consensus : optional, defaults to consensus_string()'s
923 -mask : optional, a coderef to apply to consensus_string()'s
924 output before building features. this may be useful for
925 closing gaps of 1 bp by masking over them with N, for
930 sub seq_with_features
{
931 my ($self,%arg) = @_;
933 #first do the preparatory splice
934 $self->throw("must provide a -pos argument") unless $arg{-pos};
935 $self->splice_by_seq_pos($arg{-pos});
937 my $consensus_string = $self->consensus_string($arg{-consensus
});
938 $consensus_string = $arg{-mask
}->($consensus_string)
939 if defined($arg{-mask
});
943 push @bs, 1 if $consensus_string =~ /^[^?]/;
945 while($consensus_string =~ /\?[^?]/g){
946 push @bs, pos($consensus_string);
948 while($consensus_string =~ /[^?]\?/g){
949 push @es, pos($consensus_string);
952 push @es, CORE
::length($consensus_string) if $consensus_string =~ /[^?]$/;
954 my $seq = Bio
::Seq
->new();
956 # my $rootfeature = Bio::SeqFeature::Generic->new(
957 # -source_tag => 'location',
958 # -start => $self->get_seq_by_pos($arg{-pos})->start,
959 # -end => $self->get_seq_by_pos($arg{-pos})->end,
961 # $seq->add_SeqFeature($rootfeature);
963 while(my $b = shift @bs){
965 $seq->add_SeqFeature(
966 Bio
::SeqFeature
::Generic
->new(
967 -start
=> $b - 1 + $self->get_seq_by_pos($arg{-pos})->start,
968 -end
=> $e - 1 + $self->get_seq_by_pos($arg{-pos})->start,
969 -source_tag
=> $self->source || 'MSA',
978 =head1 Create new alignments
980 The result of these methods are horizontal or vertical subsets of the
986 Usage : $aln2 = $aln->select(1, 3) # three first sequences
987 Function : Creates a new alignment from a continuous subset of
988 sequences. Numbering starts from 1. Sequence positions
989 larger than num_sequences() will thow an error.
990 Returns : a Bio::SimpleAlign object
991 Args : positive integer for the first sequence
992 positive integer for the last sequence to include (optional)
998 my ($start, $end) = @_;
1000 $self->throw("Select start has to be a positive integer, not [$start]")
1001 unless $start =~ /^\d+$/ and $start > 0;
1002 $self->throw("Select end has to be a positive integer, not [$end]")
1003 unless $end =~ /^\d+$/ and $end > 0;
1004 $self->throw("Select $start [$start] has to be smaller than or equal to end [$end]")
1005 unless $start <= $end;
1007 my $aln = $self->new;
1008 foreach my $pos ($start .. $end) {
1009 $aln->add_seq($self->get_seq_by_pos($pos));
1011 $aln->id($self->id);
1012 # fix for meta, sf, ann
1016 =head2 select_noncont
1018 Title : select_noncont
1019 Usage : # 1st and 3rd sequences, sorted
1020 $aln2 = $aln->select_noncont(1, 3)
1022 # 1st and 3rd sequences, sorted (same as first)
1023 $aln2 = $aln->select_noncont(3, 1)
1025 # 1st and 3rd sequences, unsorted
1026 $aln2 = $aln->select_noncont('nosort',3, 1)
1028 Function : Creates a new alignment from a subset of sequences. Numbering
1029 starts from 1. Sequence positions larger than num_sequences() will
1030 throw an error. Sorts the order added to new alignment by default,
1031 to prevent sorting pass 'nosort' as the first argument in the list.
1032 Returns : a Bio::SimpleAlign object
1033 Args : array of integers for the sequences. If the string 'nosort' is
1034 passed as the first argument, the sequences will not be sorted
1035 in the new alignment but will appear in the order listed.
1039 sub select_noncont
{
1043 if ($pos[0] !~ m{^\d+$}) {
1044 my $sortcmd = shift @pos;
1045 if ($sortcmd eq 'nosort') {
1048 $self->throw("Command not recognized: $sortcmd. Only 'nosort' implemented at this time.");
1052 my $end = $self->num_sequences;
1054 $self->throw("position must be a positive integer, > 0 and <= $end not [$_]")
1055 unless( /^\d+$/ && $_ > 0 && $_ <= $end );
1058 @pos = sort {$a <=> $b} @pos unless $nosort;
1060 my $aln = $self->new;
1061 foreach my $p (@pos) {
1062 $aln->add_seq($self->get_seq_by_pos($p));
1064 $aln->id($self->id);
1065 # fix for meta, sf, ann
1072 Usage : $aln2 = $aln->slice(20,30)
1073 Function : Creates a slice from the alignment inclusive of start and
1074 end columns, and the first column in the alignment is denoted 1.
1075 Sequences with no residues in the slice are excluded from the
1076 new alignment and a warning is printed. Slice beyond the length of
1077 the sequence does not do padding.
1078 Returns : A Bio::SimpleAlign object
1079 Args : Positive integer for start column, positive integer for end column,
1080 optional boolean which if true will keep gap-only columns in the newly
1081 created slice. Example:
1083 $aln2 = $aln->slice(20,30,1)
1089 my ($start, $end, $keep_gap_only) = @_;
1091 $self->throw("Slice start has to be a positive integer, not [$start]")
1092 unless $start =~ /^\d+$/ and $start > 0;
1093 $self->throw("Slice end has to be a positive integer, not [$end]")
1094 unless $end =~ /^\d+$/ and $end > 0;
1095 $self->throw("Slice start [$start] has to be smaller than or equal to end [$end]")
1096 unless $start <= $end;
1097 $self->throw("This alignment has only ". $self->length . " residues. Slice start " .
1098 "[$start] is too big.") if $start > $self->length;
1099 my $cons_meta = $self->consensus_meta;
1100 my $aln = $self->new;
1101 $aln->id($self->id);
1102 foreach my $seq ( $self->each_seq() ) {
1103 my $new_seq = $seq->isa('Bio::Seq::MetaI') ?
1106 -alphabet
=> $seq->alphabet,
1107 -strand
=> $seq->strand,
1108 -verbose
=> $self->verbose) :
1109 Bio
::LocatableSeq
->new
1111 -alphabet
=> $seq->alphabet,
1112 -strand
=> $seq->strand,
1113 -verbose
=> $self->verbose);
1117 $seq_end = $seq->length if( $end > $seq->length );
1119 my $slice_seq = $seq->subseq($start, $seq_end);
1120 $new_seq->seq( $slice_seq );
1122 $slice_seq =~ s/\W//g;
1125 my $pre_start_seq = $seq->subseq(1, $start - 1);
1126 $pre_start_seq =~ s/\W//g;
1127 if (!defined($seq->strand)) {
1128 $new_seq->start( $seq->start + CORE
::length($pre_start_seq) );
1129 } elsif ($seq->strand < 0){
1130 $new_seq->start( $seq->end - CORE
::length($pre_start_seq) - CORE
::length($slice_seq) + 1);
1132 $new_seq->start( $seq->start + CORE
::length($pre_start_seq) );
1135 if ((defined $seq->strand)&&($seq->strand < 0)){
1136 $new_seq->start( $seq->end - CORE
::length($slice_seq) + 1);
1138 $new_seq->start( $seq->start);
1141 if ($new_seq->isa('Bio::Seq::MetaI')) {
1142 for my $meta_name ($seq->meta_names) {
1143 $new_seq->named_meta($meta_name, $seq->named_submeta($meta_name, $start, $end));
1146 $new_seq->end( $new_seq->start + CORE
::length($slice_seq) - 1 );
1148 if ($new_seq->start and $new_seq->end >= $new_seq->start) {
1149 $aln->add_seq($new_seq);
1151 if( $keep_gap_only ) {
1152 $aln->add_seq($new_seq);
1154 my $nse = $seq->get_nse();
1155 $self->warn("Slice [$start-$end] of sequence [$nse] contains no residues.".
1156 " Sequence excluded from the new alignment.");
1161 my $new = Bio
::Seq
::Meta
->new();
1162 for my $meta_name ($cons_meta->meta_names) {
1163 $new->named_meta($meta_name, $cons_meta->named_submeta($meta_name, $start, $end));
1165 $aln->consensus_meta($new);
1167 $aln->annotation($self->annotation);
1168 # fix for meta, sf, ann
1172 =head2 remove_columns
1174 Title : remove_columns
1175 Usage : $aln2 = $aln->remove_columns(['mismatch','weak']) or
1176 $aln2 = $aln->remove_columns([0,0],[6,8])
1177 Function : Creates an aligment with columns removed corresponding to
1178 the specified type or by specifying the columns by number.
1179 Returns : Bio::SimpleAlign object
1180 Args : Array ref of types ('match'|'weak'|'strong'|'mismatch'|'gaps'|
1181 'all_gaps_columns') or array ref where the referenced array
1182 contains a pair of integers that specify a range.
1183 The first column is 0
1187 sub remove_columns
{
1188 my ($self,@args) = @_;
1189 @args || $self->throw("Must supply column ranges or column types");
1192 if ($args[0][0] =~ /^[a-z_]+$/i) {
1193 $aln = $self->_remove_columns_by_type($args[0]);
1194 } elsif ($args[0][0] =~ /^\d+$/) {
1195 $aln = $self->_remove_columns_by_num(\
@args);
1197 $self->throw("You must pass array references to remove_columns(), not @args");
1199 # fix for meta, sf, ann
1207 Usage : $aln2 = $aln->remove_gaps
1208 Function : Creates an aligment with gaps removed
1209 Returns : a Bio::SimpleAlign object
1210 Args : a gap character(optional) if none specified taken
1211 from $self->gap_char,
1212 [optional] $all_gaps_columns flag (1 or 0, default is 0)
1213 indicates that only all-gaps columns should be deleted
1215 Used from method L<remove_columns> in most cases. Set gap character
1216 using L<gap_char()|gap_char>.
1221 my ($self,$gapchar,$all_gaps_columns) = @_;
1223 if ($all_gaps_columns) {
1224 $gap_line = $self->all_gap_line($gapchar);
1226 $gap_line = $self->gap_line($gapchar);
1228 my $aln = $self->new;
1232 my $del_char = $gapchar || $self->gap_char;
1233 # Do the matching to get the segments to remove
1234 while ($gap_line =~ m/[$del_char]/g) {
1235 my $start = pos($gap_line)-1;
1236 $gap_line=~/\G[$del_char]+/gc;
1237 my $end = pos($gap_line)-1;
1239 #have to offset the start and end for subsequent removes
1242 $length += ($end-$start+1);
1243 push @remove, [$start,$end];
1246 #remove the segments
1247 $aln = $#remove >= 0 ?
$self->_remove_col($aln,\
@remove) : $self;
1248 # fix for meta, sf, ann
1254 my ($self,$aln,$remove) = @_;
1257 my $gap = $self->gap_char;
1259 # splice out the segments and create new seq
1260 foreach my $seq($self->each_seq){
1261 my $new_seq = Bio
::LocatableSeq
->new(
1263 -alphabet
=> $seq->alphabet,
1264 -strand
=> $seq->strand,
1265 -verbose
=> $self->verbose);
1266 my $sequence = $seq->seq;
1267 foreach my $pair(@
{$remove}){
1268 my $start = $pair->[0];
1269 my $end = $pair->[1];
1270 $sequence = $seq->seq unless $sequence;
1271 my $orig = $sequence;
1272 my $head = $start > 0 ?
substr($sequence, 0, $start) : '';
1273 my $tail = ($end + 1) >= CORE
::length($sequence) ?
'' : substr($sequence, $end + 1);
1274 $sequence = $head.$tail;
1276 unless (defined $new_seq->start) {
1278 my $start_adjust = () = substr($orig, 0, $end + 1) =~ /$gap/g;
1279 $new_seq->start($seq->start + $end + 1 - $start_adjust);
1282 my $start_adjust = $orig =~ /^$gap+/;
1283 if ($start_adjust) {
1284 $start_adjust = $+[0] == $start;
1286 $new_seq->start($seq->start + $start_adjust);
1290 if (($end + 1) >= CORE
::length($orig)) {
1291 my $end_adjust = () = substr($orig, $start) =~ /$gap/g;
1292 $new_seq->end($seq->end - (CORE
::length($orig) - $start) + $end_adjust);
1295 $new_seq->end($seq->end);
1299 if ($new_seq->end < $new_seq->start) {
1300 # we removed all columns except for gaps: set to 0 to indicate no
1306 $new_seq->seq($sequence) if $sequence;
1307 push @new, $new_seq;
1309 # add the new seqs to the alignment
1310 foreach my $new(@new){
1311 $aln->add_seq($new);
1313 # fix for meta, sf, ann
1317 sub _remove_columns_by_type
{
1318 my ($self,$type) = @_;
1319 my $aln = $self->new;
1322 my $gap = $self->gap_char if (grep { $_ eq 'gaps'} @
{$type});
1323 my $all_gaps_columns = $self->gap_char if (grep /all_gaps_columns/,@
{$type});
1324 my %matchchars = ( 'match' => '\*',
1329 'all_gaps_columns' => ''
1331 # get the characters to delete against
1333 foreach my $type (@
{$type}){
1334 $del_char.= $matchchars{$type};
1338 my $match_line = $self->match_line;
1339 # do the matching to get the segments to remove
1341 while($match_line =~ m/[$del_char]/g ){
1342 my $start = pos($match_line)-1;
1343 $match_line=~/\G[$del_char]+/gc;
1344 my $end = pos($match_line)-1;
1346 #have to offset the start and end for subsequent removes
1349 $length += ($end-$start+1);
1350 push @remove, [$start,$end];
1354 # remove the segments
1355 $aln = $#remove >= 0 ?
$self->_remove_col($aln,\
@remove) : $self;
1356 $aln = $aln->remove_gaps() if $gap;
1357 $aln = $aln->remove_gaps('', 1) if $all_gaps_columns;
1358 # fix for meta, sf, ann
1363 sub _remove_columns_by_num
{
1364 my ($self,$positions) = @_;
1365 my $aln = $self->new;
1367 # sort the positions
1368 @
$positions = sort { $a->[0] <=> $b->[0] } @
$positions;
1372 foreach my $pos (@
{$positions}) {
1373 my ($start, $end) = @
{$pos};
1375 #have to offset the start and end for subsequent removes
1378 $length += ($end-$start+1);
1379 push @remove, [$start,$end];
1382 #remove the segments
1383 $aln = $#remove >= 0 ?
$self->_remove_col($aln,\
@remove) : $self;
1384 # fix for meta, sf, ann
1389 =head1 Change sequences within the MSA
1391 These methods affect characters in all sequences without changing the
1394 =head2 splice_by_seq_pos
1396 Title : splice_by_seq_pos
1397 Usage : $status = splice_by_seq_pos(1);
1398 Function: splices all aligned sequences where the specified sequence
1401 Returns : 1 on success
1402 Args : position of sequence to splice by
1407 sub splice_by_seq_pos
{
1408 my ($self,$pos) = @_;
1410 my $guide = $self->get_seq_by_pos($pos);
1411 my $guide_seq = $guide->seq;
1413 $guide_seq =~ s/\./\-/g;
1417 while(($pos = index($guide_seq, '-', $pos)) > -1 ){
1418 unshift @gaps, $pos;
1422 foreach my $seq ($self->each_seq){
1423 my @bases = split '', $seq->seq;
1425 splice(@bases, $_, 1) foreach @gaps;
1426 $seq->seq(join('', @bases));
1435 Usage : $ali->map_chars('\.','-')
1436 Function : Does a s/$arg1/$arg2/ on the sequences. Useful for gap
1439 Notice that the from (arg1) is interpretted as a regex,
1440 so be careful about quoting meta characters (eg
1441 $ali->map_chars('.','-') wont do what you want)
1443 Argument : 'from' rexexp
1454 $self->throw("Need exactly two arguments")
1455 unless defined $from and defined $to;
1457 foreach $seq ( $self->each_seq() ) {
1458 $temp = $seq->seq();
1459 $temp =~ s/$from/$to/g;
1469 Usage : $ali->uppercase()
1470 Function : Sets all the sequences to uppercase
1481 foreach $seq ( $self->each_seq() ) {
1482 $temp = $seq->seq();
1483 $temp =~ tr/[a-z]/[A-Z]/;
1492 Title : cigar_line()
1493 Usage : %cigars = $align->cigar_line()
1494 Function : Generates a "cigar" (Compact Idiosyncratic Gapped Alignment
1495 Report) line for each sequence in the alignment. Examples are
1496 "1,60" or "5,10:12,58", where the numbers refer to conserved
1497 positions within the alignment. The keys of the hash are the
1498 NSEs (name/start/end) assigned to each sequence.
1499 Args : threshold (optional, defaults to 100)
1500 Returns : Hash of strings (cigar lines)
1509 my @consensus = split "",($self->consensus_string($thr));
1510 my $len = $self->length;
1511 my $gapchar = $self->gap_char;
1513 # create a precursor, something like (1,4,5,6,7,33,45),
1514 # where each number corresponds to a conserved position
1515 foreach my $seq ( $self->each_seq ) {
1516 my @seq = split "", uc ($seq->seq);
1518 for (my $x = 0 ; $x < $len ; $x++ ) {
1519 if ($seq[$x] eq $consensus[$x]) {
1520 push @
{$cigars{$seq->get_nse}},$pos;
1522 } elsif ($seq[$x] ne $gapchar) {
1527 # duplicate numbers - (1,4,5,6,7,33,45) becomes (1,1,4,5,6,7,33,33,45,45)
1528 for my $name (keys %cigars) {
1529 splice @
{$cigars{$name}}, 1, 0, ${$cigars{$name}}[0] if
1530 ( ${$cigars{$name}}[0] + 1 < ${$cigars{$name}}[1] );
1531 push @
{$cigars{$name}}, ${$cigars{$name}}[$#{$cigars{$name}}] if
1532 ( ${$cigars{$name}}[($#{$cigars{$name}} - 1)] + 1 <
1533 ${$cigars{$name}}[$#{$cigars{$name}}] );
1534 for ( my $x = 1 ; $x < $#{$cigars{$name}} - 1 ; $x++) {
1535 if (${$cigars{$name}}[$x - 1] + 1 < ${$cigars{$name}}[$x] &&
1536 ${$cigars{$name}}[$x + 1] > ${$cigars{$name}}[$x] + 1) {
1537 splice @
{$cigars{$name}}, $x, 0, ${$cigars{$name}}[$x];
1541 # collapse series - (1,1,4,5,6,7,33,33,45,45) becomes (1,1,4,7,33,33,45,45)
1542 for my $name (keys %cigars) {
1544 for ( my $x = 0 ; $x < $#{$cigars{$name}} ; $x++) {
1545 if ( ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x - 1)] + 1 &&
1546 ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x + 1)] - 1 ) {
1550 for my $pos (@remove) {
1551 splice @
{$cigars{$name}}, $pos, 1;
1554 # join and punctuate
1555 for my $name (keys %cigars) {
1556 my ($start,$end,$str) = "";
1557 while ( ($start,$end) = splice @
{$cigars{$name}}, 0, 2 ) {
1558 $str .= ($start . "," . $end . ":");
1561 $cigars{$name} = $str;
1569 Title : match_line()
1570 Usage : $line = $align->match_line()
1571 Function : Generates a match line - much like consensus string
1572 except that a line indicating the '*' for a match.
1573 Args : (optional) Match line characters ('*' by default)
1574 (optional) Strong match char (':' by default)
1575 (optional) Weak match char ('.' by default)
1581 my ($self,$matchlinechar, $strong, $weak) = @_;
1582 my %matchchars = ('match' => $matchlinechar || '*',
1583 'weak' => $weak || '.',
1584 'strong' => $strong || ':',
1590 foreach my $seq ( $self->each_seq ) {
1591 push @seqchars, [ split(//, uc ($seq->seq)) ];
1592 $alphabet = $seq->alphabet unless defined $alphabet;
1594 my $refseq = shift @seqchars;
1595 # let's just march down the columns
1598 foreach my $pos ( 0..$self->length ) {
1599 my $refchar = $refseq->[$pos];
1600 my $char = $matchchars{'mismatch'};
1601 unless( defined $refchar ) {
1602 last if $pos == $self->length; # short circuit on last residue
1603 # this in place to handle jason's soon-to-be-committed
1604 # intron mapping code
1607 my %col = ($refchar => 1);
1608 my $dash = ($refchar eq '-' || $refchar eq '.' || $refchar eq ' ');
1609 foreach my $seq ( @seqchars ) {
1610 next if $pos >= scalar @
$seq;
1611 $dash = 1 if( $seq->[$pos] eq '-' || $seq->[$pos] eq '.' ||
1612 $seq->[$pos] eq ' ' );
1613 $col{$seq->[$pos]}++ if defined $seq->[$pos];
1615 my @colresidues = sort keys %col;
1617 # if all the values are the same
1618 if( $dash ) { $char = $matchchars{'mismatch'} }
1619 elsif( @colresidues == 1 ) { $char = $matchchars{'match'} }
1620 elsif( $alphabet eq 'protein' ) { # only try to do weak/strong
1621 # matches for protein seqs
1623 foreach my $type ( qw(strong weak) ) {
1624 # iterate through categories
1626 # iterate through each of the aa in the col
1627 # look to see which groups it is in
1628 foreach my $c ( @colresidues ) {
1629 foreach my $f ( grep { index($_,$c) >= 0 } @
{$CONSERVATION_GROUPS{$type}} ) {
1630 push @
{$groups{$f}},$c;
1634 foreach my $cols ( values %groups ) {
1635 @
$cols = sort @
$cols;
1636 # now we are just testing to see if two arrays
1637 # are identical w/o changing either one
1638 # have to be same len
1639 next if( scalar @
$cols != scalar @colresidues );
1640 # walk down the length and check each slot
1641 for($_=0;$_ < (scalar @
$cols);$_++ ) {
1642 next GRP
if( $cols->[$_] ne $colresidues[$_] );
1644 $char = $matchchars{$type};
1650 $matchline .= $char;
1659 Usage : $line = $align->gap_line()
1660 Function : Generates a gap line - much like consensus string
1661 except that a line where '-' represents gap
1662 Args : (optional) gap line characters ('-' by default)
1668 my ($self,$gapchar) = @_;
1669 $gapchar = $gapchar || $self->gap_char;
1670 my %gap_hsh; # column gaps vector
1671 foreach my $seq ( $self->each_seq ) {
1673 map {$gap_hsh{$_->[0]} = undef} grep {$_->[1] eq $gapchar}
1674 map {[$i++, $_]} split(//, uc ($seq->seq));
1677 foreach my $pos ( 0..$self->length-1 ) {
1678 $gap_line .= (exists $gap_hsh{$pos}) ?
$gapchar:'.';
1685 Title : all_gap_line()
1686 Usage : $line = $align->all_gap_line()
1687 Function : Generates a gap line - much like consensus string
1688 except that a line where '-' represents all-gap column
1689 Args : (optional) gap line characters ('-' by default)
1695 my ($self,$gapchar) = @_;
1696 $gapchar = $gapchar || $self->gap_char;
1697 my %gap_hsh; # column gaps counter hash
1698 my @seqs = $self->each_seq;
1699 foreach my $seq ( @seqs ) {
1701 map {$gap_hsh{$_->[0]}++} grep {$_->[1] eq $gapchar}
1702 map {[$i++, $_]} split(//, uc ($seq->seq));
1705 foreach my $pos ( 0..$self->length-1 ) {
1706 if (exists $gap_hsh{$pos} && $gap_hsh{$pos} == scalar @seqs) {
1708 $gap_line .= $gapchar;
1716 =head2 gap_col_matrix
1718 Title : gap_col_matrix()
1719 Usage : my $cols = $align->gap_col_matrix()
1720 Function : Generates an array of hashes where
1721 each entry in the array is a hash reference
1722 with keys of all the sequence names and
1723 and value of 1 or 0 if the sequence has a gap at that column
1724 Args : (optional) gap line characters ($aln->gap_char or '-' by default)
1728 sub gap_col_matrix
{
1729 my ($self,$gapchar) = @_;
1730 $gapchar = $gapchar || $self->gap_char;
1731 my %gap_hsh; # column gaps vector
1733 foreach my $seq ( $self->each_seq ) {
1735 my $str = $seq->seq;
1736 my $len = $seq->length;
1738 my $id = $seq->display_id;
1739 while( $i < $len ) {
1740 $ch = substr($str, $i, 1);
1741 $cols[$i++]->{$id} = ($ch eq $gapchar);
1750 Usage : $ali->match()
1751 Function : Goes through all columns and changes residues that are
1752 identical to residue in first sequence to match '.'
1753 character. Sets match_char.
1755 USE WITH CARE: Most MSA formats do not support match
1756 characters in sequences, so this is mostly for output
1757 only. NEXUS format (Bio::AlignIO::nexus) can handle
1760 Argument : a match character, optional, defaults to '.'
1765 my ($self, $match) = @_;
1768 my ($matching_char) = $match;
1769 $matching_char = "\\$match" if $match =~ /[\^.$|()\[\]]/ ; #';
1770 $self->map_chars($matching_char, '-');
1772 my @seqs = $self->each_seq();
1773 return 1 unless scalar @seqs > 1;
1775 my $refseq = shift @seqs ;
1776 my @refseq = split //, $refseq->seq;
1777 my $gapchar = $self->gap_char;
1779 foreach my $seq ( @seqs ) {
1780 my @varseq = split //, $seq->seq();
1781 for ( my $i=0; $i < scalar @varseq; $i++) {
1782 $varseq[$i] = $match if defined $refseq[$i] &&
1783 ( $refseq[$i] =~ /[A-Za-z\*]/ ||
1784 $refseq[$i] =~ /$gapchar/ )
1785 && $refseq[$i] eq $varseq[$i];
1787 $seq->seq(join '', @varseq);
1789 $self->match_char($match);
1797 Usage : $ali->unmatch()
1798 Function : Undoes the effect of method match. Unsets match_char.
1800 Argument : a match character, optional, defaults to '.'
1802 See L<match> and L<match_char>
1807 my ($self, $match) = @_;
1811 my @seqs = $self->each_seq();
1812 return 1 unless scalar @seqs > 1;
1814 my $refseq = shift @seqs ;
1815 my @refseq = split //, $refseq->seq;
1816 my $gapchar = $self->gap_char;
1817 foreach my $seq ( @seqs ) {
1818 my @varseq = split //, $seq->seq();
1819 for ( my $i=0; $i < scalar @varseq; $i++) {
1820 $varseq[$i] = $refseq[$i] if defined $refseq[$i] &&
1821 ( $refseq[$i] =~ /[A-Za-z\*]/ ||
1822 $refseq[$i] =~ /$gapchar/ ) &&
1823 $varseq[$i] eq $match;
1825 $seq->seq(join '', @varseq);
1827 $self->match_char('');
1831 =head1 MSA attributes
1833 Methods for setting and reading the MSA attributes.
1835 Note that the methods defining character semantics depend on the user
1836 to set them sensibly. They are needed only by certain input/output
1837 methods. Unset them by setting to an empty string ('').
1842 Usage : $myalign->id("Ig")
1843 Function : Gets/sets the id field of the alignment
1844 Returns : An id string
1845 Argument : An id string (optional)
1850 my ($self, $name) = @_;
1852 if (defined( $name )) {
1853 $self->{'_id'} = $name;
1856 return $self->{'_id'};
1862 Usage : $myalign->accession("PF00244")
1863 Function : Gets/sets the accession field of the alignment
1864 Returns : An acc string
1865 Argument : An acc string (optional)
1870 my ($self, $acc) = @_;
1872 if (defined( $acc )) {
1873 $self->{'_accession'} = $acc;
1876 return $self->{'_accession'};
1882 Usage : $myalign->description("14-3-3 proteins")
1883 Function : Gets/sets the description field of the alignment
1884 Returns : An description string
1885 Argument : An description string (optional)
1890 my ($self, $name) = @_;
1892 if (defined( $name )) {
1893 $self->{'_description'} = $name;
1896 return $self->{'_description'};
1901 Title : missing_char
1902 Usage : $myalign->missing_char("?")
1903 Function : Gets/sets the missing_char attribute of the alignment
1904 It is generally recommended to set it to 'n' or 'N'
1905 for nucleotides and to 'X' for protein.
1906 Returns : An missing_char string,
1907 Argument : An missing_char string (optional)
1912 my ($self, $char) = @_;
1914 if (defined $char ) {
1915 $self->throw("Single missing character, not [$char]!") if CORE
::length($char) > 1;
1916 $self->{'_missing_char'} = $char;
1919 return $self->{'_missing_char'};
1925 Usage : $myalign->match_char('.')
1926 Function : Gets/sets the match_char attribute of the alignment
1927 Returns : An match_char string,
1928 Argument : An match_char string (optional)
1933 my ($self, $char) = @_;
1935 if (defined $char ) {
1936 $self->throw("Single match character, not [$char]!") if CORE
::length($char) > 1;
1937 $self->{'_match_char'} = $char;
1940 return $self->{'_match_char'};
1946 Usage : $myalign->gap_char('-')
1947 Function : Gets/sets the gap_char attribute of the alignment
1948 Returns : An gap_char string, defaults to '-'
1949 Argument : An gap_char string (optional)
1954 my ($self, $char) = @_;
1956 if (defined $char || ! defined $self->{'_gap_char'} ) {
1957 $char= '-' unless defined $char;
1958 $self->throw("Single gap character, not [$char]!") if CORE
::length($char) > 1;
1959 $self->{'_gap_char'} = $char;
1961 return $self->{'_gap_char'};
1966 Title : symbol_chars
1967 Usage : my @symbolchars = $aln->symbol_chars;
1968 Function: Returns all the seen symbols (other than gaps)
1969 Returns : array of characters that are the seen symbols
1970 Args : boolean to include the gap/missing/match characters
1975 my ($self,$includeextra) = @_;
1977 unless ($self->{'_symbols'}) {
1978 foreach my $seq ($self->each_seq) {
1979 map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq);
1982 my %copy = %{$self->{'_symbols'}};
1983 if( ! $includeextra ) {
1984 foreach my $char ( $self->gap_char, $self->match_char,
1985 $self->missing_char) {
1986 delete $copy{$char} if( defined $char );
1992 =head1 Alignment descriptors
1994 These read only methods describe the MSA in various ways.
2000 Usage : $str = $ali->score()
2001 Function : get/set a score of the alignment
2002 Returns : a score for the alignment
2003 Argument : an optional score to set
2009 $self->{score
} = shift if @_;
2010 return $self->{score
};
2013 =head2 consensus_string
2015 Title : consensus_string
2016 Usage : $str = $ali->consensus_string($threshold_percent)
2017 Function : Makes a strict consensus
2018 Returns : Consensus string
2019 Argument : Optional treshold ranging from 0 to 100.
2020 The consensus residue has to appear at least threshold %
2021 of the sequences at a given location, otherwise a '?'
2022 character will be placed at that location.
2023 (Default value = 0%)
2027 sub consensus_string
{
2029 my $threshold = shift;
2032 my $len = $self->length - 1;
2034 foreach ( 0 .. $len ) {
2035 $out .= $self->_consensus_aa($_,$threshold);
2043 my $threshold_percent = shift || -1 ;
2044 my ($seq,%hash,$count,$letter,$key);
2045 my $gapchar = $self->gap_char;
2046 foreach $seq ( $self->each_seq() ) {
2047 $letter = substr($seq->seq,$point,1);
2048 $self->throw("--$point-----------") if $letter eq '';
2049 ($letter eq $gapchar || $letter =~ /\./) && next;
2050 # print "Looking at $letter\n";
2053 my $number_of_sequences = $self->num_sequences();
2054 my $threshold = $number_of_sequences * $threshold_percent / 100. ;
2058 foreach $key ( sort keys %hash ) {
2059 # print "Now at $key $hash{$key}\n";
2060 if( $hash{$key} > $count && $hash{$key} >= $threshold) {
2062 $count = $hash{$key};
2069 =head2 consensus_iupac
2071 Title : consensus_iupac
2072 Usage : $str = $ali->consensus_iupac()
2073 Function : Makes a consensus using IUPAC ambiguity codes from DNA
2074 and RNA. The output is in upper case except when gaps in
2075 a column force output to be in lower case.
2077 Note that if your alignment sequences contain a lot of
2078 IUPAC ambiquity codes you often have to manually set
2079 alphabet. Bio::PrimarySeq::_guess_type thinks they
2080 indicate a protein sequence.
2081 Returns : consensus string
2083 Throws : on protein sequences
2087 sub consensus_iupac
{
2090 my $len = $self->length-1;
2092 # only DNA and RNA sequences are valid
2093 foreach my $seq ( $self->each_seq() ) {
2094 $self->throw("Seq [". $seq->get_nse. "] is a protein")
2095 if $seq->alphabet eq 'protein';
2097 # loop over the alignment columns
2098 foreach my $count ( 0 .. $len ) {
2099 $out .= $self->_consensus_iupac($count);
2104 sub _consensus_iupac
{
2105 my ($self, $column) = @_;
2106 my ($string, $char, $rna);
2108 #determine all residues in a column
2109 foreach my $seq ( $self->each_seq() ) {
2110 $string .= substr($seq->seq, $column, 1);
2112 $string = uc $string;
2114 # quick exit if there's an N in the string
2115 if ($string =~ /N/) {
2116 $string =~ /\W/ ?
return 'n' : return 'N';
2118 # ... or if there are only gap characters
2119 return '-' if $string =~ /^\W+$/;
2121 # treat RNA as DNA in regexps
2122 if ($string =~ /U/) {
2127 # the following s///'s only need to be done to the _first_ ambiguity code
2128 # as we only need to see the _range_ of characters in $string
2130 if ($string =~ /[VDHB]/) {
2131 $string =~ s/V/AGC/;
2132 $string =~ s/D/AGT/;
2133 $string =~ s/H/ACT/;
2134 $string =~ s/B/CTG/;
2137 if ($string =~ /[SKYRWM]/) {
2146 # and now the guts of the thing
2148 if ($string =~ /A/) {
2150 if ($string =~ /G/) {
2151 $char = 'R'; # A and G (purines) R
2152 if ($string =~ /C/) {
2153 $char = 'V'; # A and G and C V
2154 if ($string =~ /T/) {
2155 $char = 'N'; # A and G and C and T N
2157 } elsif ($string =~ /T/) {
2158 $char = 'D'; # A and G and T D
2160 } elsif ($string =~ /C/) {
2161 $char = 'M'; # A and C M
2162 if ($string =~ /T/) {
2163 $char = 'H'; # A and C and T H
2165 } elsif ($string =~ /T/) {
2166 $char = 'W'; # A and T W
2168 } elsif ($string =~ /C/) {
2170 if ($string =~ /T/) {
2171 $char = 'Y'; # C and T (pyrimidines) Y
2172 if ($string =~ /G/) {
2173 $char = 'B'; # C and T and G B
2175 } elsif ($string =~ /G/) {
2176 $char = 'S'; # C and G S
2178 } elsif ($string =~ /G/) {
2180 if ($string =~ /C/) {
2181 $char = 'S'; # G and C S
2182 } elsif ($string =~ /T/) {
2183 $char = 'K'; # G and T K
2185 } elsif ($string =~ /T/) {
2189 $char = 'U' if $rna and $char eq 'T';
2190 $char = lc $char if $string =~ /\W/;
2196 =head2 consensus_meta
2198 Title : consensus_meta
2199 Usage : $seqmeta = $ali->consensus_meta()
2200 Function : Returns a Bio::Seq::Meta object containing the consensus
2201 strings derived from meta data analysis.
2202 Returns : Bio::Seq::Meta
2203 Argument : Bio::Seq::Meta
2204 Throws : non-MetaI object
2208 sub consensus_meta
{
2209 my ($self, $meta) = @_;
2210 if ($meta && (!ref $meta || !$meta->isa('Bio::Seq::MetaI'))) {
2211 $self->throw('Not a Bio::Seq::MetaI object');
2213 return $self->{'_aln_meta'} = $meta if $meta;
2214 return $self->{'_aln_meta'}
2220 Usage : if ( $ali->is_flush() )
2221 Function : Tells you whether the alignment
2222 : is flush, i.e. all of the same length
2229 my ($self,$report) = @_;
2234 foreach $seq ( $self->each_seq() ) {
2235 if( $length == (-1) ) {
2236 $length = CORE
::length($seq->seq());
2240 $temp = CORE
::length($seq->seq());
2241 if( $temp != $length ) {
2242 $self->warn("expecting $length not $temp from ".
2243 $seq->display_id) if( $report );
2244 $self->debug("expecting $length not $temp from ".
2246 $self->debug($seq->seq(). "\n");
2258 Usage : $len = $ali->length()
2259 Function : Returns the maximum length of the alignment.
2260 To be sure the alignment is a block, use is_flush
2268 $self->deprecated("length_aln - deprecated method. Use length() instead.");
2278 foreach $seq ( $self->each_seq() ) {
2279 $temp = $seq->length();
2280 if( $temp > $length ) {
2289 =head2 maxdisplayname_length
2291 Title : maxdisplayname_length
2292 Usage : $ali->maxdisplayname_length()
2293 Function : Gets the maximum length of the displayname in the
2294 alignment. Used in writing out various MSA formats.
2300 sub maxname_length
{
2302 $self->deprecated("maxname_length - deprecated method.".
2303 " Use maxdisplayname_length() instead.");
2304 $self->maxdisplayname_length();
2309 $self->deprecated("maxnse_length - deprecated method.".
2310 " Use maxnse_length() instead.");
2311 $self->maxdisplayname_length();
2314 sub maxdisplayname_length
{
2319 foreach $seq ( $self->each_seq() ) {
2320 $len = CORE
::length $self->displayname($seq->get_nse());
2322 if( $len > $maxname ) {
2330 =head2 max_metaname_length
2332 Title : max_metaname_length
2333 Usage : $ali->max_metaname_length()
2334 Function : Gets the maximum length of the meta name tags in the
2335 alignment for the sequences and for the alignment.
2336 Used in writing out various MSA formats.
2342 sub max_metaname_length
{
2347 # check seq meta first
2348 for $seq ( $self->each_seq() ) {
2349 next if !$seq->isa('Bio::Seq::MetaI' || !$seq->meta_names);
2350 for my $mtag ($seq->meta_names) {
2351 $len = CORE
::length $mtag;
2352 if( $len > $maxname ) {
2359 for my $meta ($self->consensus_meta) {
2361 for my $name ($meta->meta_names) {
2362 $len = CORE
::length $name;
2363 if( $len > $maxname ) {
2374 Title : num_residues
2375 Usage : $no = $ali->num_residues
2376 Function : number of residues in total in the alignment
2379 Note : replaces no_residues()
2387 foreach my $seq ($self->each_seq) {
2388 my $str = $seq->seq();
2390 $count += ($str =~ s/[A-Za-z]//g);
2396 =head2 num_sequences
2398 Title : num_sequences
2399 Usage : $depth = $ali->num_sequences
2400 Function : number of sequence in the sequence alignment
2403 Note : replaces no_sequences()
2409 return scalar($self->each_seq);
2413 =head2 average_percentage_identity
2415 Title : average_percentage_identity
2416 Usage : $id = $align->average_percentage_identity
2417 Function: The function uses a fast method to calculate the average
2418 percentage identity of the alignment
2419 Returns : The average percentage identity of the alignment
2421 Notes : This method implemented by Kevin Howe calculates a figure that is
2422 designed to be similar to the average pairwise identity of the
2423 alignment (identical in the absence of gaps), without having to
2424 explicitly calculate pairwise identities proposed by Richard Durbin.
2425 Validated by Ewan Birney ad Alex Bateman.
2429 sub average_percentage_identity
{
2430 my ($self,@args) = @_;
2432 my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
2433 'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
2435 my ($len, $total, $subtotal, $divisor, $subdivisor, @seqs, @countHashes);
2437 if (! $self->is_flush()) {
2438 $self->throw("All sequences in the alignment must be the same length");
2441 @seqs = $self->each_seq();
2442 $len = $self->length();
2444 # load the each hash with correct keys for existence checks
2446 for( my $index=0; $index < $len; $index++) {
2447 foreach my $letter (@alphabet) {
2448 $countHashes[$index]->{$letter} = 0;
2451 foreach my $seq (@seqs) {
2452 my @seqChars = split //, $seq->seq();
2453 for( my $column=0; $column < @seqChars; $column++ ) {
2454 my $char = uc($seqChars[$column]);
2455 if (exists $countHashes[$column]->{$char}) {
2456 $countHashes[$column]->{$char}++;
2463 for(my $column =0; $column < $len; $column++) {
2464 my %hash = %{$countHashes[$column]};
2466 foreach my $res (keys %hash) {
2467 $total += $hash{$res}*($hash{$res} - 1);
2468 $subdivisor += $hash{$res};
2470 $divisor += $subdivisor * ($subdivisor - 1);
2472 return $divisor > 0 ?
($total / $divisor )*100.0 : 0;
2475 =head2 percentage_identity
2477 Title : percentage_identity
2478 Usage : $id = $align->percentage_identity
2479 Function: The function calculates the average percentage identity
2480 (aliased to average_percentage_identity)
2481 Returns : The average percentage identity
2486 sub percentage_identity
{
2488 return $self->average_percentage_identity();
2491 =head2 overall_percentage_identity
2493 Title : overall_percentage_identity
2494 Usage : $id = $align->overall_percentage_identity
2495 $id = $align->overall_percentage_identity('short')
2496 Function: The function calculates the percentage identity of
2497 the conserved columns
2498 Returns : The percentage identity of the conserved columns
2499 Args : length value to use, optional defaults to alignment length
2500 possible values: 'align', 'short', 'long'
2502 The argument values 'short' and 'long' refer to shortest and longest
2503 sequence in the alignment. Method modification code by Hongyu Zhang.
2507 sub overall_percentage_identity
{
2508 my ($self, $length_measure) = @_;
2510 my %alphabet = map {$_ => undef} qw
(A C G T U B D E F H I J K L M N O P Q R S V W X Y Z
);
2512 my %enum = map {$_ => undef} qw
(align short long
);
2514 $self->throw("Unknown argument [$length_measure]")
2515 if $length_measure and not exists $enum{$length_measure};
2516 $length_measure ||= 'align';
2518 if (! $self->is_flush()) {
2519 $self->throw("All sequences in the alignment must be the same length");
2522 # Count the residues seen at each position
2524 my $total = 0; # number of positions with identical residues
2526 my @seqs = $self->each_seq;
2527 my $nof_seqs = scalar @seqs;
2528 my $aln_len = $self->length();
2529 for my $seq (@seqs) {
2530 my $seqstr = $seq->seq;
2532 # Count residues for given sequence
2533 for my $column (0 .. $aln_len-1) {
2534 my $char = uc( substr($seqstr, $column, 1) );
2535 if ( exists $alphabet{$char} ) {
2537 # This is a valid char
2538 if ( defined $countHashes[$column]->{$char} ) {
2539 $countHashes[$column]->{$char}++;
2541 $countHashes[$column]->{$char} = 1;
2544 if ( $countHashes[$column]->{$char} == $nof_seqs ) {
2545 # All sequences have this same residue
2553 if ($length_measure eq 'short' || $length_measure eq 'long') {
2554 my $seq_len = $seqstr =~ tr/[A-Za-z]//;
2555 if ($length_measure eq 'short') {
2556 if ( (not defined $len) || ($seq_len < $len) ) {
2559 } elsif ($length_measure eq 'long') {
2560 if ( (not defined $len) || ($seq_len > $len) ) {
2568 if ($length_measure eq 'align') {
2572 return ($total / $len ) * 100.0;
2577 =head1 Alignment positions
2579 Methods to map a sequence position into an alignment column and back.
2580 column_from_residue_number() does the former. The latter is really a
2581 property of the sequence object and can done using
2582 L<Bio::LocatableSeq::location_from_column>:
2584 # select somehow a sequence from the alignment, e.g.
2585 my $seq = $aln->get_seq_by_pos(1);
2586 #$loc is undef or Bio::LocationI object
2587 my $loc = $seq->location_from_column(5);
2589 =head2 column_from_residue_number
2591 Title : column_from_residue_number
2592 Usage : $col = $ali->column_from_residue_number( $seqname, $resnumber)
2593 Function: This function gives the position in the alignment
2594 (i.e. column number) of the given residue number in the
2595 sequence with the given name. For example, for the
2598 Seq1/91-97 AC..DEF.GH.
2599 Seq2/24-30 ACGG.RTY...
2600 Seq3/43-51 AC.DDEF.GHI
2602 column_from_residue_number( "Seq1", 94 ) returns 6.
2603 column_from_residue_number( "Seq2", 25 ) returns 2.
2604 column_from_residue_number( "Seq3", 50 ) returns 10.
2606 An exception is thrown if the residue number would lie
2607 outside the length of the aligment
2608 (e.g. column_from_residue_number( "Seq2", 22 )
2610 Note: If the the parent sequence is represented by more than
2611 one alignment sequence and the residue number is present in
2612 them, this method finds only the first one.
2614 Returns : A column number for the position in the alignment of the
2615 given residue in the given sequence (1 = first column)
2616 Args : A sequence id/name (not a name/start-end)
2617 A residue number in the whole sequence (not just that
2618 segment of it in the alignment)
2622 sub column_from_residue_number
{
2623 my ($self, $name, $resnumber) = @_;
2625 $self->throw("No sequence with name [$name]") unless $self->{'_start_end_lists'}->{$name};
2626 $self->throw("Second argument residue number missing") unless $resnumber;
2628 foreach my $seq ($self->each_seq_with_id($name)) {
2631 $col = $seq->column_from_residue_number($resnumber);
2637 $self->throw("Could not find a sequence segment in $name ".
2638 "containing residue number $resnumber");
2642 =head1 Sequence names
2644 Methods to manipulate the display name. The default name based on the
2645 sequence id and subsequence positions can be overridden in various
2651 Usage : $myalign->displayname("Ig", "IgA")
2652 Function : Gets/sets the display name of a sequence in the alignment
2653 Returns : A display name string
2654 Argument : name of the sequence
2655 displayname of the sequence (optional)
2659 sub get_displayname
{
2661 $self->deprecated("get_displayname - deprecated method. Use displayname() instead.");
2662 $self->displayname(@_);
2665 sub set_displayname
{
2667 $self->deprecated("set_displayname - deprecated method. Use displayname() instead.");
2668 $self->displayname(@_);
2672 my ($self, $name, $disname) = @_;
2674 $self->throw("No sequence with name [$name]")
2675 unless defined $self->{'_seq'}->{$name};
2677 if( $disname and $name) {
2678 $self->{'_dis_name'}->{$name} = $disname;
2681 elsif( defined $self->{'_dis_name'}->{$name} ) {
2682 return $self->{'_dis_name'}->{$name};
2688 =head2 set_displayname_count
2690 Title : set_displayname_count
2691 Usage : $ali->set_displayname_count
2692 Function : Sets the names to be name_# where # is the number of
2693 times this name has been used.
2694 Returns : 1, on success
2699 sub set_displayname_count
{
2701 my (@arr,$name,$seq,$count,$temp,$nse);
2703 foreach $seq ( $self->each_alphabetically() ) {
2704 $nse = $seq->get_nse();
2706 #name will be set when this is the second
2707 #time (or greater) is has been seen
2709 if( defined $name and $name eq ($seq->id()) ) {
2710 $temp = sprintf("%s_%s",$name,$count);
2711 $self->displayname($nse,$temp);
2716 $temp = sprintf("%s_%s",$name,$count);
2717 $self->displayname($nse,$temp);
2724 =head2 set_displayname_flat
2726 Title : set_displayname_flat
2727 Usage : $ali->set_displayname_flat()
2728 Function : Makes all the sequences be displayed as just their name,
2735 sub set_displayname_flat
{
2739 foreach $seq ( $self->each_seq() ) {
2740 $nse = $seq->get_nse();
2741 $self->displayname($nse,$seq->id());
2746 =head2 set_displayname_normal
2748 Title : set_displayname_normal
2749 Usage : $ali->set_displayname_normal()
2750 Function : Makes all the sequences be displayed as name/start-end
2751 Returns : 1, on success
2756 sub set_displayname_normal
{
2760 foreach $seq ( $self->each_seq() ) {
2761 $nse = $seq->get_nse();
2762 $self->displayname($nse,$nse);
2770 Usage : $obj->source($newval)
2771 Function: sets the Alignment source program
2773 Returns : value of source
2774 Args : newvalue (optional)
2780 my ($self,$value) = @_;
2781 if( defined $value) {
2782 $self->{'_source'} = $value;
2784 return $self->{'_source'};
2787 =head2 set_displayname_safe
2789 Title : set_displayname_safe
2790 Usage : ($new_aln, $ref_name)=$ali->set_displayname_safe(4)
2791 Function : Assign machine-generated serial names to sequences in input order.
2792 Designed to protect names during PHYLIP runs. Assign 10-char string
2793 in the form of "S000000001" to "S999999999". Restore the original
2794 names using "restore_displayname".
2795 Returns : 1. a new $aln with system names;
2796 2. a hash ref for restoring names
2797 Argument : Number for id length (default 10)
2801 sub set_displayname_safe
{
2803 my $idlength = shift || 10;
2804 my ($seq, %phylip_name);
2806 my $new=Bio
::SimpleAlign
->new();
2807 foreach $seq ( $self->each_seq() ) {
2809 my $pname="S". sprintf "%0" . ($idlength-1) . "s", $ct;
2810 $phylip_name{$pname}=$seq->id();
2811 my $new_seq= Bio
::LocatableSeq
->new(-id
=> $pname,
2812 -seq
=> $seq->seq(),
2813 -alphabet
=> $seq->alphabet,
2814 -start
=> $seq->{_start
},
2815 -end
=> $seq->{_end
}
2817 $new->add_seq($new_seq);
2820 $self->debug("$ct seq names changed. Restore names by using restore_displayname.");
2821 return ($new, \
%phylip_name);
2824 =head2 restore_displayname
2826 Title : restore_displayname
2827 Usage : $aln_name_restored=$ali->restore_displayname($hash_ref)
2828 Function : Restore original sequence names (after running
2829 $ali->set_displayname_safe)
2830 Returns : a new $aln with names restored.
2831 Argument : a hash reference of names from "set_displayname_safe".
2835 sub restore_displayname
{
2839 my $new=Bio
::SimpleAlign
->new();
2840 foreach my $seq ( $self->each_seq() ) {
2841 $self->throw("No sequence with name") unless defined $name{$seq->id()};
2842 my $new_seq= Bio
::LocatableSeq
->new(-id
=> $name{$seq->id()},
2843 -seq
=> $seq->seq(),
2844 -alphabet
=> $seq->alphabet,
2845 -start
=> $seq->{_start
},
2846 -end
=> $seq->{_end
}
2848 $new->add_seq($new_seq);
2853 =head2 sort_by_start
2855 Title : sort_by_start
2856 Usage : $ali->sort_by_start
2857 Function : Changes the order of the alignment to the start position of each
2866 my ($seq,$nse,@arr,%hash,$count);
2867 foreach $seq ( $self->each_seq() ) {
2868 $nse = $seq->get_nse;
2872 %{$self->{'_order'}} = (); # reset the hash;
2873 foreach $nse ( sort _startend
keys %hash) {
2874 $self->{'_order'}->{$count} = $nse;
2882 my ($aname,$arange) = split (/[\/]/,$a);
2883 my ($bname,$brange) = split (/[\/]/,$b);
2884 my ($astart,$aend) = split(/\-/,$arange);
2885 my ($bstart,$bend) = split(/\-/,$brange);
2886 return $astart <=> $bstart;
2889 =head2 bracket_string
2891 Title : bracket_string
2892 Usage : my @params = (-refseq => 'testseq',
2893 -allele1 => 'allele1',
2894 -allele2 => 'allele2',
2895 -delimiters => '{}',
2897 $str = $aln->bracket_string(@params)
2899 Function : When supplied with a list of parameters (see below), returns a
2900 string in BIC format. This is used for allelic comparisons.
2901 Briefly, if either allele contains a base change when compared to
2902 the refseq, the base or gap for each allele is represented in
2903 brackets in the order present in the 'alleles' parameter.
2905 For the following data:
2914 the returned string with parameters 'refseq => testseq' and
2915 'alleles => [qw(allele1 allele2)]' would be:
2917 GGAT[C/-][C/-]ATT[C/C]CT[A/C]CT
2918 Returns : BIC-formatted string
2919 Argument : Required args
2920 refseq : string (ID) of the reference sequence used
2921 as basis for comparison
2922 allele1 : string (ID) of the first allele
2923 allele2 : string (ID) of the second allele
2925 delimiters: two symbol string of left and right delimiters.
2926 Only the first two symbols are used
2928 separator : string used as a separator. Only the first
2931 Throws : On no refseq/alleles, or invalid refseq/alleles.
2935 sub bracket_string
{
2936 my ($self, @args) = @_;
2937 my ($ref, $a1, $a2, $delim, $sep) =
2938 $self->_rearrange([qw(refseq allele1 allele2 delimiters separator)], @args);
2939 $self->throw('Missing refseq/allele1/allele2') if (!$a1 || !$a2 || !$ref);
2941 ($ld, $rd) = split('', $delim, 2) if $delim;
2945 my ($refseq, $allele1, $allele2) =
2946 map {( $self->each_seq_with_id($_) )} ($ref, $a1, $a2);
2947 if (!$refseq || !$allele1 || !$allele2) {
2948 $self->throw("One of your refseq/allele IDs is invalid!");
2950 my $len = $self->length-1;
2952 # loop over the alignment columns
2953 for my $column ( 0 .. $len ) {
2955 my ($compres, $res1, $res2) =
2956 map{substr($_->seq, $column, 1)} ($refseq, $allele1, $allele2);
2957 # are any of the allele symbols different from the refseq?
2958 $string = ($compres eq $res1 && $compres eq $res2) ?
$compres :
2959 $ld.$res1.$sep.$res2.$rd;
2966 =head2 methods implementing Bio::FeatureHolderI
2968 FeatureHolderI implementation to support labeled character sets like one
2969 would get from NEXUS represented data.
2971 =head2 get_SeqFeatures
2973 Usage : @features = $aln->get_SeqFeatures
2974 Function: Get the feature objects held by this feature holder.
2976 Returns : an array of Bio::SeqFeatureI implementing objects
2977 Args : optional filter coderef, taking a Bio::SeqFeatureI
2978 : as argument, returning TRUE if wanted, FALSE if
2983 sub get_SeqFeatures
{
2985 my $filter_cb = shift;
2986 $self->throw("Arg (filter callback) must be a coderef") unless
2987 !defined($filter_cb) or ref($filter_cb) eq 'CODE';
2988 if( !defined $self->{'_as_feat'} ) {
2989 $self->{'_as_feat'} = [];
2992 return grep { $filter_cb->($_) } @
{$self->{'_as_feat'}};
2994 return @
{$self->{'_as_feat'}};
2997 =head2 add_SeqFeature
2999 Usage : $aln->add_SeqFeature($subfeat);
3000 Function: adds a SeqFeature into the SeqFeature array.
3002 Returns : true on success
3003 Args : a Bio::SeqFeatureI object
3004 Note : This implementation is not compliant
3005 with Bio::FeatureHolderI
3009 sub add_SeqFeature
{
3010 my ($self,@feat) = @_;
3012 $self->{'_as_feat'} = [] unless $self->{'_as_feat'};
3014 foreach my $feat ( @feat ) {
3015 if( !$feat->isa("Bio::SeqFeatureI") ) {
3016 $self->throw("$feat is not a SeqFeatureI and that's what we expect...");
3019 push(@
{$self->{'_as_feat'}},$feat);
3025 =head2 remove_SeqFeatures
3027 Usage : $obj->remove_SeqFeatures
3028 Function: Removes all SeqFeatures. If you want to remove only a subset,
3029 remove that subset from the returned array, and add back the rest.
3030 Returns : The array of Bio::SeqFeatureI features that was
3031 deleted from this alignment.
3036 sub remove_SeqFeatures
{
3039 return () unless $self->{'_as_feat'};
3040 my @feats = @
{$self->{'_as_feat'}};
3041 $self->{'_as_feat'} = [];
3045 =head2 feature_count
3047 Title : feature_count
3048 Usage : $obj->feature_count()
3049 Function: Return the number of SeqFeatures attached to the alignment
3050 Returns : integer representing the number of SeqFeatures
3058 if (defined($self->{'_as_feat'})) {
3059 return ($#{$self->{'_as_feat'}} + 1);
3065 =head2 get_all_SeqFeatures
3067 Title : get_all_SeqFeatures
3069 Function: Get all SeqFeatures.
3071 Returns : an array of Bio::SeqFeatureI implementing objects
3073 Note : Falls through to Bio::FeatureHolderI implementation.
3077 =head2 methods for Bio::AnnotatableI
3079 AnnotatableI implementation to support sequence alignments which
3080 contain annotation (NEXUS, Stockholm).
3085 Usage : $ann = $aln->annotation or
3086 $aln->annotation($ann)
3087 Function: Gets or sets the annotation
3088 Returns : Bio::AnnotationCollectionI object
3089 Args : None or Bio::AnnotationCollectionI object
3091 See L<Bio::AnnotationCollectionI> and L<Bio::Annotation::Collection>
3092 for more information
3097 my ($obj,$value) = @_;
3098 if( defined $value ) {
3099 $obj->throw("object of class ".ref($value)." does not implement ".
3100 "Bio::AnnotationCollectionI. Too bad.")
3101 unless $value->isa("Bio::AnnotationCollectionI");
3102 $obj->{'_annotation'} = $value;
3103 } elsif( ! defined $obj->{'_annotation'}) {
3104 $obj->{'_annotation'} = Bio
::Annotation
::Collection
->new();
3106 return $obj->{'_annotation'};
3109 =head1 Deprecated methods
3116 Usage : $no = $ali->no_residues
3117 Function : number of residues in total in the alignment
3120 Note : deprecated in favor of num_residues()
3126 $self->deprecated(-warn_version
=> 1.0069,
3127 -throw_version
=> 1.0075,
3128 -message
=> 'Use of method no_residues() is deprecated, use num_residues() instead');
3129 $self->num_residues(@_);
3134 Title : no_sequences
3135 Usage : $depth = $ali->no_sequences
3136 Function : number of sequence in the sequence alignment
3139 Note : deprecated in favor of num_sequences()
3145 $self->deprecated(-warn_version
=> 1.0069,
3146 -throw_version
=> 1.0075,
3147 -message
=> 'Use of method no_sequences() is deprecated, use num_sequences() instead');
3148 $self->num_sequences(@_);
3153 Title : mask_columns
3154 Usage : $aln2 = $aln->mask_columns(20,30)
3155 Function : Masks a slice of the alignment inclusive of start and
3156 end columns, and the first column in the alignment is denoted 1.
3157 Mask beyond the length of the sequence does not do padding.
3158 Returns : A Bio::SimpleAlign object
3159 Args : Positive integer for start column, positive integer for end column,
3160 optional string value use for the mask. Example:
3162 $aln2 = $aln->mask_columns(20,30,'?')
3163 Note : Masking must use a character that is not used for gaps or
3164 frameshifts. These can be adjusted using the relevant global
3165 variables, but be aware these may be (uncontrollably) modified
3166 elsewhere within BioPerl (see bug 2715)
3171 #based on slice(), but did not include the Bio::Seq::Meta sections as I was not sure what it is doing
3174 my $nonres = $Bio::LocatableSeq
::GAP_SYMBOLS
.
3175 $Bio::LocatableSeq
::FRAMESHIFT_SYMBOLS
;
3177 # coordinates are alignment-based, not sequence-based
3178 my ($start, $end, $mask_char) = @_;
3179 unless (defined $mask_char) { $mask_char = 'N' }
3181 $self->throw("Mask start has to be a positive integer and less than ".
3182 "alignment length, not [$start]")
3183 unless $start =~ /^\d+$/ && $start > 0 && $start <= $self->length;
3184 $self->throw("Mask end has to be a positive integer and less than ".
3185 "alignment length, not [$end]")
3186 unless $end =~ /^\d+$/ && $end > 0 && $end <= $self->length;
3187 $self->throw("Mask start [$start] has to be smaller than or equal to ".
3188 "end [$end]") unless $start <= $end;
3189 $self->throw("Mask character $mask_char has to be a single character ".
3190 "and not a gap or frameshift symbol")
3191 unless CORE
::length($mask_char) == 1 && $mask_char !~ m{$nonres};
3193 my $aln = $self->new;
3194 $aln->id($self->id);
3195 foreach my $seq ( $self->each_seq() ) {
3196 my $new_seq = Bio
::LocatableSeq
->new(-id
=> $seq->id,
3197 -alphabet
=> $seq->alphabet,
3198 -strand
=> $seq->strand,
3199 -verbose
=> $self->verbose);
3201 # convert from 1-based alignment coords!
3202 my $masked_string = substr($seq->seq, $start - 1, $end - $start + 1);
3203 $masked_string =~ s{[^$nonres]}{$mask_char}g;
3204 my $new_dna_string = substr($seq->seq,0,$start-1) . $masked_string . substr($seq->seq,$end);
3205 $new_seq->seq($new_dna_string);
3206 $aln->add_seq($new_seq);