2 # BioPerl module for SimpleAlign
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Heikki Lehvaslaiho <heikki-at-bioperl-dot-org>
8 # Copyright Ewan Birney
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
15 # 11/3/00 Added threshold feature to consensus and consensus_aa - PS
16 # May 2001 major rewrite - Heikki Lehvaslaiho
20 Bio::SimpleAlign - Multiple alignments held as a set of sequences
24 # Use Bio::AlignIO to read in the alignment
25 $str = Bio::AlignIO->new(-file => 't/data/testaln.pfam');
26 $aln = $str->next_aln();
30 print $aln->num_residues;
32 print $aln->num_sequences;
34 print $aln->percentage_identity;
35 print $aln->consensus_string(50);
37 # Find the position in the alignment for a sequence location
38 $pos = $aln->column_from_residue_number('1433_LYCES', 14); # = 6;
40 # Extract sequences and check values for the alignment column $pos
41 foreach $seq ($aln->each_seq) {
42 $res = $seq->subseq($pos, $pos);
45 foreach $res (keys %count) {
46 printf "Res: %s Count: %2d\n", $res, $count{$res};
50 $aln->remove_seq($seq);
51 $mini_aln = $aln->slice(20,30); # get a block of columns
52 $mini_aln = $aln->select_noncont(1,3,5,7,11); # select certain sequences
53 $new_aln = $aln->remove_columns([20,30]); # remove by position
54 $new_aln = $aln->remove_columns(['mismatch']); # remove by property
57 $str = $aln->consensus_string($threshold_percent);
58 $str = $aln->match_line();
59 $str = $aln->cigar_line();
60 $id = $aln->percentage_identity;
62 # See the module documentation for details and more methods.
66 SimpleAlign is an object that handles a multiple sequence alignment
67 (MSA). It is very permissive of types (it does not insist on sequences
68 being all same length, for example). Think of it as a set of sequences
69 with a whole series of built-in manipulations and methods for reading and
72 SimpleAlign uses L<Bio::LocatableSeq>, a subclass of L<Bio::PrimarySeq>,
73 to store its sequences. These are subsequences with a start and end
74 positions in the parent reference sequence. Each sequence in the
75 SimpleAlign object is a Bio::LocatableSeq.
77 SimpleAlign expects the combination of name, start, and end for a
78 given sequence to be unique in the alignment, and this is the key for the
79 internal hashes (name, start, end are abbreviated C<nse> in the code).
80 However, in some cases people do not want the name/start-end to be displayed:
81 either multiple names in an alignment or names specific to the alignment
82 (ROA1_HUMAN_1, ROA1_HUMAN_2 etc). These names are called
83 C<displayname>, and generally is what is used to print out the
84 alignment. They default to name/start-end.
86 The SimpleAlign Module is derived from the Align module by Ewan Birney.
92 User feedback is an integral part of the evolution of this and other
93 Bioperl modules. Send your comments and suggestions preferably to one
94 of the Bioperl mailing lists. Your participation is much appreciated.
96 bioperl-l@bioperl.org - General discussion
97 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
101 Please direct usage questions or support issues to the mailing list:
103 I<bioperl-l@bioperl.org>
105 rather than to the module maintainer directly. Many experienced and
106 reponsive experts will be able look at the problem and quickly
107 address it. Please include a thorough description of the problem
108 with code and data examples if at all possible.
110 =head2 Reporting Bugs
112 Report bugs to the Bioperl bug tracking system to help us keep track
113 the bugs and their resolution. Bug reports can be submitted via the
116 http://bugzilla.open-bio.org/
120 Ewan Birney, birney@ebi.ac.uk
124 Allen Day, allenday-at-ucla.edu,
125 Richard Adams, Richard.Adams-at-ed.ac.uk,
126 David J. Evans, David.Evans-at-vir.gla.ac.uk,
127 Heikki Lehvaslaiho, heikki-at-bioperl-dot-org,
128 Allen Smith, allens-at-cpan.org,
129 Jason Stajich, jason-at-bioperl.org,
130 Anthony Underwood, aunderwood-at-phls.org.uk,
131 Xintao Wei & Giri Narasimhan, giri-at-cs.fiu.edu
132 Brian Osborne, bosborne at alum.mit.edu
133 Weigang Qiu, Weigang at GENECTR-HUNTER-CUNY-EDU
134 Hongyu Zhang, forward at hongyu.org
135 Jay Hannah, jay at jays.net
136 Alexandr Bezginov, albezg at gmail.com
144 The rest of the documentation details each of the object
145 methods. Internal methods are usually preceded with a _
149 # 'Let the code begin...
151 package Bio
::SimpleAlign
;
152 use vars
qw(%CONSERVATION_GROUPS);
155 use Bio::LocatableSeq; # uses Seq's as list
158 use Bio::SeqFeature::Generic;
161 # This data should probably be in a more centralized module...
162 # it is taken from Clustalw documentation.
163 # These are all the positively scoring groups that occur in the
164 # Gonnet Pam250 matrix. The strong and weak groups are
165 # defined as strong score >0.5 and weak score =<0.5 respectively.
167 %CONSERVATION_GROUPS = (
192 use base
qw(Bio::Root::Root Bio::Align::AlignI Bio::AnnotatableI
193 Bio::FeatureHolderI);
198 Usage : my $aln = Bio::SimpleAlign->new();
199 Function : Creates a new simple align object
200 Returns : Bio::SimpleAlign
201 Args : -source => string representing the source program
202 where this alignment came from
203 -annotation => Bio::AnnotationCollectionI
204 -seq_annotation => Bio::AnnotationCollectionI for sequences (requires -annotation also be set)
205 -seqs => array ref containing Bio::LocatableSeq or Bio::Seq::Meta
206 -consensus => consensus string
207 -consensus_meta => Bio::Seq::Meta object containing consensus met information (kludge)
213 my($class,@args) = @_;
215 my $self = $class->SUPER::new
(@args);
217 my ($src, $score, $id, $acc, $desc, $seqs, $feats, $coll, $sa, $con, $cmeta) = $self->_rearrange([qw(
230 $src && $self->source($src);
231 defined $score && $self->score($score);
232 # we need to set up internal hashs first!
234 $self->{'_seq'} = {};
235 $self->{'_order'} = {};
236 $self->{'_start_end_lists'} = {};
237 $self->{'_dis_name'} = {};
238 $self->{'_id'} = 'NoName';
239 # maybe we should automatically read in from args. Hmmm...
240 $id && $self->id($id);
241 $acc && $self->accession($acc);
242 $desc && $self->description($desc);
243 $coll && $self->annotation($coll);
244 # sequence annotation is layered into a provided annotation collection (or dies)
246 $self->throw("Must supply an alignment-based annotation collection (-annotation) ".
247 "with a sequence annotation collection")
249 $coll->add_Annotation('seq_annotation', $sa);
251 if ($feats && ref $feats eq 'ARRAY') {
252 for my $feat (@
$feats) {
253 $self->add_SeqFeature($feat);
256 $con && $self->consensus($con);
257 $cmeta && $self->consensus_meta($cmeta);
258 # assumes these are in correct alignment order
259 if ($seqs && ref($seqs) eq 'ARRAY') {
260 for my $seq (@
$seqs) {
261 $self->add_seq($seq);
265 return $self; # success - we hope!
268 =head1 Modifier methods
270 These methods modify the MSA by adding, removing or shuffling complete
276 Usage : $myalign->add_seq($newseq);
277 $myalign->add_seq(-SEQ=>$newseq, -ORDER=>5);
278 Function : Adds another sequence to the alignment. *Does not* align
279 it - just adds it to the hashes.
280 If -ORDER is specified, the sequence is inserted at the
281 the position spec'd by -ORDER, and existing sequences
282 are pushed down the storage array.
284 Args : A Bio::LocatableSeq object
285 Positive integer for the sequence position (optional)
287 See L<Bio::LocatableSeq> for more information
293 $self->deprecated("addSeq - deprecated method. Use add_seq() instead.");
300 my ($seq, $order) = $self->_rearrange([qw(SEQ ORDER)], @args);
301 my ($name,$id,$start,$end);
304 $self->throw("LocatableSeq argument required");
306 if( ! ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
307 $self->throw("Unable to process non locatable sequences [". ref($seq). "]");
309 !defined($order) and $order = 1 + keys %{$self->{'_seq'}}; # default
310 $order--; # jay's patch (user-specified order is 1-origin)
313 $self->throw("User-specified value for ORDER must be >= 1");
316 $id = $seq->id() ||$seq->display_id || $seq->primary_id;
318 # build the symbol list for this sequence,
319 # will prune out the gap and missing/match chars
320 # when actually asked for the symbol list in the
322 # map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq) if $seq->seq;
324 $name = $seq->get_nse;
326 if( $self->{'_seq'}->{$name} ) {
327 $self->warn("Replacing one sequence [$name]\n") unless $self->verbose < 0;
330 $self->debug( "Assigning $name to $order\n");
332 my $ordh = $self->{'_order'};
333 if ($ordh->{$order}) {
334 # make space to insert
335 # $c->() returns (in reverse order) the first subsequence
336 # of consecutive integers; i.e., $c->(1,2,3,5,6,7) returns
337 # (3,2,1), and $c->(2,4,5) returns (2).
339 $c = sub { return (($_[1]-$_[0] == 1) ?
($c->(@_[1..$#_]),$_[0]) : $_[0]); };
341 $ordh->{$_+1} = $ordh->{$_}
342 } $c->(sort {$a <=> $b} grep {$_ >= $order} keys %{$ordh});
345 $ordh->{$order} = $name;
347 unless( exists( $self->{'_start_end_lists'}->{$id})) {
348 $self->{'_start_end_lists'}->{$id} = [];
350 push @
{$self->{'_start_end_lists'}->{$id}}, $seq;
353 $self->{'_seq'}->{$name} = $seq;
361 Usage : $aln->remove_seq($seq);
362 Function : Removes a single sequence from an alignment
364 Argument : a Bio::LocatableSeq object
370 $self->deprecated("removeSeq - deprecated method. Use remove_seq() instead.");
371 $self->remove_seq(@_);
377 my ($name,$id,$start,$end);
379 $self->throw("Need Bio::Locatable seq argument ")
380 unless ref $seq && $seq->isa( 'Bio::LocatableSeq');
383 $start = $seq->start();
385 $name = sprintf("%s/%d-%d",$id,$start,$end);
387 if( !exists $self->{'_seq'}->{$name} ) {
388 $self->throw("Sequence $name does not exist in the alignment to remove!");
391 delete $self->{'_seq'}->{$name};
393 # we need to remove this seq from the start_end_lists hash
395 if (exists $self->{'_start_end_lists'}->{$id}) {
396 # we need to find the sequence in the array.
399 for ($i=0; $i < @
{$self->{'_start_end_lists'}->{$id}}; $i++) {
400 if (${$self->{'_start_end_lists'}->{$id}}[$i] eq $seq) {
406 splice @
{$self->{'_start_end_lists'}->{$id}}, $i, 1;
409 $self->throw("Could not find the sequence to remoce from the start-end list");
413 $self->throw("There is no seq list for the name $id");
415 # we need to shift order hash
416 my %rev_order = reverse %{$self->{'_order'}};
417 my $no = $rev_order{$name};
418 my $num_sequences = $self->num_sequences;
419 for (; $no < $num_sequences; $no++) {
420 $self->{'_order'}->{$no} = $self->{'_order'}->{$no+1};
422 delete $self->{'_order'}->{$no};
430 Usage : $aln->purge(0.7);
431 Function: Removes sequences above given sequence similarity
432 This function will grind on large alignments. Beware!
434 Returns : An array of the removed sequences
435 Args : float, threshold for similarity
440 my ($self,$perc) = @_;
441 my (%duplicate, @dups);
443 my @seqs = $self->each_seq();
445 for (my $i=0;$i< @seqs - 1;$i++ ) { #for each seq in alignment
448 #skip if already in duplicate hash
449 next if exists $duplicate{$seq->display_id} ;
450 my $one = $seq->seq();
452 my @one = split '', $one; #split to get 1aa per array element
454 for (my $j=$i+1;$j < @seqs;$j++) {
455 my $seq2 = $seqs[$j];
457 #skip if already in duplicate hash
458 next if exists $duplicate{$seq2->display_id} ;
460 my $two = $seq2->seq();
461 my @two = split '', $two;
465 for (my $k=0;$k<@one;$k++) {
466 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
467 $one[$k] eq $two[$k]) {
470 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
471 $two[$k] ne '.' && $two[$k] ne '-' ) {
477 $ratio = $count/$res unless $res == 0;
479 # if above threshold put in duplicate hash and push onto
480 # duplicate array for returning to get_unique
481 if ( $ratio > $perc ) {
482 $self->warn("duplicate: ", $seq2->display_id) if $self->verbose > 0;
483 $duplicate{$seq2->display_id} = 1;
488 foreach my $seq (@dups) {
489 $self->remove_seq($seq);
494 =head2 sort_alphabetically
496 Title : sort_alphabetically
497 Usage : $ali->sort_alphabetically
498 Function : Changes the order of the alignment to alphabetical on name
499 followed by numerical by number.
505 sub sort_alphabetically
{
507 my ($seq,$nse,@arr,%hash,$count);
509 foreach $seq ( $self->each_seq() ) {
510 $nse = $seq->get_nse;
516 %{$self->{'_order'}} = (); # reset the hash;
518 foreach $nse ( sort _alpha_startend
keys %hash) {
519 $self->{'_order'}->{$count} = $nse;
529 Usage : $aln_ordered=$aln->sort_by_list($list_file)
530 Function : Arbitrarily order sequences in an alignment
531 Returns : A new Bio::SimpleAlign object
532 Argument : a file listing sequence names in intended order (one name per line)
537 my ($self, $list) = @_;
538 my (@seq, @ids, %order);
540 foreach my $seq ( $self->each_seq() ) {
542 push @ids, $seq->display_id;
546 open(my $listfh, '<', $list) || $self->throw("can't open file for reading: $list");
550 $self->throw("Not found in alignment: $name") unless &_in_aln
($name, \
@ids);
555 # use the map-sort-map idiom:
556 my @sorted= map { $_->[1] } sort { $a->[0] <=> $b->[0] } map { [$order{$_->id()}, $_] } @seq;
557 my $aln = $self->new;
558 foreach (@sorted) { $aln->add_seq($_) }
562 =head2 set_new_reference
564 Title : set_new_reference
565 Usage : $aln->set_new_reference(3 or 'B31'): Select the 3rd sequence, or
566 the sequence whoes name is "B31" (full, exact, and case-sensitive),
567 as the reference (1st) sequence
568 Function : Change/Set a new reference (i.e., the first) sequence
569 Returns : a new Bio::SimpleAlign object.
570 Throws an exception if designated sequence not found
571 Argument : a positive integer of sequence order, or a sequence name
572 in the original alignment
576 sub set_new_reference
{
577 my ($self, $seqid) = @_;
578 my $aln = $self->new;
579 my (@seq, @ids, @new_seq);
581 foreach my $seq ( $self->each_seq() ) {
583 push @ids, $seq->display_id;
586 if ($seqid =~ /^\d+$/) { # argument is seq position
588 $self->throw("The new reference sequence number has to be a positive integer >1 and <= num_sequences ") if ($seqid <= 1 || $seqid > $self->num_sequences);
589 } else { # argument is a seq name
590 $self->throw("The new reference sequence not in alignment ") unless &_in_aln
($seqid, \
@ids);
593 for (my $i=0; $i<=$#seq; $i++) {
595 if ( ($is_num && $pos == $seqid) || ($seqid eq $seq[$i]->display_id) ) {
596 unshift @new_seq, $seq[$i];
598 push @new_seq, $seq[$i];
601 foreach (@new_seq) { $aln->add_seq($_); }
605 sub _in_aln
{ # check if input name exists in the alignment
606 my ($str, $ref) = @_;
608 return 1 if $str eq $_;
617 Usage : $aln->uniq_seq(): Remove identical sequences in
618 in the alignment. Ambiguous base ("N", "n") and
619 leading and ending gaps ("-") are NOT counted as
621 Function : Make a new alignment of unique sequence types (STs)
622 Returns : 1a. if called in a scalar context,
623 a new Bio::SimpleAlign object (all sequences renamed as "ST")
624 1b. if called in an array context,
625 a new Bio::SimpleAlign object, and a hashref whose keys
626 are sequence types, and whose values are arrayrefs to
627 lists of sequence ids within the corresponding sequence type
628 2. if $aln->verbose > 0, ST of each sequence is sent to
629 STDERR (in a tabular format)
635 my ($self, $seqid) = @_;
636 my $aln = $self->new;
637 my (%member, %order, @seq, @uniq_str, $st);
639 my $len = $self->length();
641 foreach my $seq ( $self->each_seq() ) {
642 my $str = $seq->seq();
644 # it's necessary to ignore "n", "N", leading gaps and ending gaps in
645 # comparing two sequence strings
647 # 1st, convert "n", "N" to "?" (for DNA sequence only):
648 $str =~ s/n/\?/gi if $str =~ /^[atcgn-]+$/i;
649 # 2nd, convert leading and ending gaps to "?":
650 $str = &_convert_leading_ending_gaps
($str, '-', '?');
651 # Note that '?' also can mean unknown residue.
652 # I don't like making global class member changes like this, too
653 # prone to errors... -- cjfields 08-11-18
654 local $Bio::LocatableSeq
::GAP_SYMBOLS
= '-\?';
655 my $new = Bio
::LocatableSeq
->new(
657 -alphabet
=> $seq->alphabet,
659 -start
=> $seq->start,
665 foreach my $seq (@seq) {
666 my $str = $seq->seq();
667 my ($seen, $key) = &_check_uniq
($str, \
@uniq_str, $len);
668 if ($seen) { # seen before
669 my @memb = @
{$member{$key}};
671 $member{$key} = \
@memb;
673 push @uniq_str, $key;
675 $member{$key} = [ ($seq) ];
676 $order{$key} = $order;
680 foreach my $str (sort {$order{$a} <=> $order{$b}} keys %order) { # sort by input order
681 # convert leading/ending "?" back into "-" ("?" throws errors by SimpleAlign):
682 my $str2 = &_convert_leading_ending_gaps
($str, '?', '-');
683 # convert middle "?" back into "N" ("?" throws errors by SimpleAlign):
684 $str2 =~ s/\?/N/g if $str2 =~ /^[atcg\-\?]+$/i;
686 my $end=length($str2);
687 $end -= length($1) while $str2 =~ m/($gap+)/g;
688 my $new = Bio
::LocatableSeq
->new(-id
=>"ST".$order{$str},
694 foreach (@
{$member{$str}}) {
695 push @
{$$st{$order{$str}}}, $_->id(); # per Tristan's patch/Bug #2805
696 $self->debug($_->id(), "\t", "ST", $order{$str}, "\n");
699 return wantarray ?
($aln, $st) : $aln;
702 sub _check_uniq
{ # check if same seq exists in the alignment
703 my ($str1, $ref, $length) = @_;
704 my @char1=split //, $str1;
707 return (0, $str1) if @array==0; # not seen (1st sequence)
709 foreach my $str2 (@array) {
711 my @char2=split //, $str2;
712 for (my $i=0; $i<=$length-1; $i++) {
713 next if $char1[$i] eq '?';
714 next if $char2[$i] eq '?';
715 $diff++ if $char1[$i] ne $char2[$i];
717 return (1, $str2) if $diff == 0; # seen before
720 return (0, $str1); # not seen
723 sub _convert_leading_ending_gaps
{
727 my @array=split //, $s;
728 # convert leading char:
729 for (my $i=0; $i<=$#array; $i++) {
730 ($array[$i] eq $sym1) ?
($array[$i] = $sym2):(last);
732 # convert ending char:
733 for (my $i = $#array; $i>= 0; $i--) {
734 ($array[$i] eq $sym1) ?
($array[$i] = $sym2):(last);
736 my $s_new=join '', @array;
740 =head1 Sequence selection methods
742 Methods returning one or more sequences objects.
747 Usage : foreach $seq ( $align->each_seq() )
748 Function : Gets a Seq object from the alignment
756 $self->deprecated("eachSeq - deprecated method. Use each_seq() instead.");
764 foreach $order ( sort { $a <=> $b } keys %{$self->{'_order'}} ) {
765 if( exists $self->{'_seq'}->{$self->{'_order'}->{$order}} ) {
766 push(@arr,$self->{'_seq'}->{$self->{'_order'}->{$order}});
773 =head2 each_alphabetically
775 Title : each_alphabetically
776 Usage : foreach $seq ( $ali->each_alphabetically() )
777 Function : Returns a sequence object, but the objects are returned
778 in alphabetically sorted order.
779 Does not change the order of the alignment.
785 sub each_alphabetically
{
787 my ($seq,$nse,@arr,%hash,$count);
789 foreach $seq ( $self->each_seq() ) {
790 $nse = $seq->get_nse;
794 foreach $nse ( sort _alpha_startend
keys %hash) {
795 push(@arr,$hash{$nse});
800 sub _alpha_startend
{
801 my ($aname,$astart,$bname,$bstart);
802 ($aname,$astart) = split (/-/,$a);
803 ($bname,$bstart) = split (/-/,$b);
805 if( $aname eq $bname ) {
806 return $astart <=> $bstart;
809 return $aname cmp $bname;
813 =head2 each_seq_with_id
815 Title : each_seq_with_id
816 Usage : foreach $seq ( $align->each_seq_with_id() )
817 Function : Gets a Seq objects from the alignment, the contents
818 being those sequences with the given name (there may be
821 Argument : a seq name
827 $self->deprecated("eachSeqWithId - deprecated method. Use each_seq_with_id() instead.");
828 $self->each_seq_with_id(@_);
831 sub each_seq_with_id
{
835 $self->throw("Method each_seq_with_id needs a sequence name argument")
840 if (exists($self->{'_start_end_lists'}->{$id})) {
841 @arr = @
{$self->{'_start_end_lists'}->{$id}};
846 =head2 get_seq_by_pos
848 Title : get_seq_by_pos
849 Usage : $seq = $aln->get_seq_by_pos(3) # third sequence from the alignment
850 Function : Gets a sequence based on its position in the alignment.
851 Numbering starts from 1. Sequence positions larger than
852 num_sequences() will thow an error.
853 Returns : a Bio::LocatableSeq object
854 Args : positive integer for the sequence position
863 $self->throw("Sequence position has to be a positive integer, not [$pos]")
864 unless $pos =~ /^\d+$/ and $pos > 0;
865 $self->throw("No sequence at position [$pos]")
866 unless $pos <= $self->num_sequences ;
868 my $nse = $self->{'_order'}->{--$pos};
869 return $self->{'_seq'}->{$nse};
874 Title : get_seq_by_id
875 Usage : $seq = $aln->get_seq_by_id($name) # seq named $name
876 Function : Gets a sequence based on its name.
877 Sequences that do not exist will warn and return undef
878 Returns : a Bio::LocatableSeq object
879 Args : string for sequence name
884 my ($self,$name) = @_;
885 unless( defined $name ) {
886 $self->warn("Must provide a sequence name");
889 for my $seq ( values %{$self->{'_seq'}} ) {
890 if ( $seq->id eq $name) {
897 =head2 seq_with_features
899 Title : seq_with_features
900 Usage : $seq = $aln->seq_with_features(-pos => 1,
903 sub { my $consensus = shift;
908 while($consensus =~ /[^?]$q[^?]/){
909 $consensus =~ s/([^?])$q([^?])/$1$n$2/;
915 Function: produces a Bio::Seq object by first splicing gaps from -pos
916 (by means of a splice_by_seq_pos() call), then creating
917 features using non-? chars (by means of a consensus_string()
918 call with stringency -consensus).
919 Returns : a Bio::Seq object
920 Args : -pos : required. sequence from which to build the Bio::Seq
922 -consensus : optional, defaults to consensus_string()'s
924 -mask : optional, a coderef to apply to consensus_string()'s
925 output before building features. this may be useful for
926 closing gaps of 1 bp by masking over them with N, for
931 sub seq_with_features
{
932 my ($self,%arg) = @_;
934 #first do the preparatory splice
935 $self->throw("must provide a -pos argument") unless $arg{-pos};
936 $self->splice_by_seq_pos($arg{-pos});
938 my $consensus_string = $self->consensus_string($arg{-consensus
});
939 $consensus_string = $arg{-mask
}->($consensus_string)
940 if defined($arg{-mask
});
944 push @bs, 1 if $consensus_string =~ /^[^?]/;
946 while($consensus_string =~ /\?[^?]/g){
947 push @bs, pos($consensus_string);
949 while($consensus_string =~ /[^?]\?/g){
950 push @es, pos($consensus_string);
953 push @es, length($consensus_string) if $consensus_string =~ /[^?]$/;
955 my $seq = Bio
::Seq
->new();
957 # my $rootfeature = Bio::SeqFeature::Generic->new(
958 # -source_tag => 'location',
959 # -start => $self->get_seq_by_pos($arg{-pos})->start,
960 # -end => $self->get_seq_by_pos($arg{-pos})->end,
962 # $seq->add_SeqFeature($rootfeature);
964 while(my $b = shift @bs){
966 $seq->add_SeqFeature(
967 Bio
::SeqFeature
::Generic
->new(
968 -start
=> $b - 1 + $self->get_seq_by_pos($arg{-pos})->start,
969 -end
=> $e - 1 + $self->get_seq_by_pos($arg{-pos})->start,
970 -source_tag
=> $self->source || 'MSA',
979 =head1 Create new alignments
981 The result of these methods are horizontal or vertical subsets of the
987 Usage : $aln2 = $aln->select(1, 3) # three first sequences
988 Function : Creates a new alignment from a continuous subset of
989 sequences. Numbering starts from 1. Sequence positions
990 larger than num_sequences() will thow an error.
991 Returns : a Bio::SimpleAlign object
992 Args : positive integer for the first sequence
993 positive integer for the last sequence to include (optional)
999 my ($start, $end) = @_;
1001 $self->throw("Select start has to be a positive integer, not [$start]")
1002 unless $start =~ /^\d+$/ and $start > 0;
1003 $self->throw("Select end has to be a positive integer, not [$end]")
1004 unless $end =~ /^\d+$/ and $end > 0;
1005 $self->throw("Select $start [$start] has to be smaller than or equal to end [$end]")
1006 unless $start <= $end;
1008 my $aln = $self->new;
1009 foreach my $pos ($start .. $end) {
1010 $aln->add_seq($self->get_seq_by_pos($pos));
1012 $aln->id($self->id);
1013 # fix for meta, sf, ann
1017 =head2 select_noncont
1019 Title : select_noncont
1020 Usage : # 1st and 3rd sequences, sorted
1021 $aln2 = $aln->select_noncont(1, 3)
1023 # 1st and 3rd sequences, sorted (same as first)
1024 $aln2 = $aln->select_noncont(3, 1)
1026 # 1st and 3rd sequences, unsorted
1027 $aln2 = $aln->select_noncont('nosort',3, 1)
1029 Function : Creates a new alignment from a subset of sequences. Numbering
1030 starts from 1. Sequence positions larger than num_sequences() will
1031 throw an error. Sorts the order added to new alignment by default,
1032 to prevent sorting pass 'nosort' as the first argument in the list.
1033 Returns : a Bio::SimpleAlign object
1034 Args : array of integers for the sequences. If the string 'nosort' is
1035 passed as the first argument, the sequences will not be sorted
1036 in the new alignment but will appear in the order listed.
1040 sub select_noncont
{
1044 if ($pos[0] !~ m{^\d+$}) {
1045 my $sortcmd = shift @pos;
1046 if ($sortcmd eq 'nosort') {
1049 $self->throw("Command not recognized: $sortcmd. Only 'nosort' implemented at this time.");
1053 my $end = $self->num_sequences;
1055 $self->throw("position must be a positive integer, > 0 and <= $end not [$_]")
1056 unless( /^\d+$/ && $_ > 0 && $_ <= $end );
1059 @pos = sort {$a <=> $b} @pos unless $nosort;
1061 my $aln = $self->new;
1062 foreach my $p (@pos) {
1063 $aln->add_seq($self->get_seq_by_pos($p));
1065 $aln->id($self->id);
1066 # fix for meta, sf, ann
1073 Usage : $aln2 = $aln->slice(20,30)
1074 Function : Creates a slice from the alignment inclusive of start and
1075 end columns, and the first column in the alignment is denoted 1.
1076 Sequences with no residues in the slice are excluded from the
1077 new alignment and a warning is printed. Slice beyond the length of
1078 the sequence does not do padding.
1079 Returns : A Bio::SimpleAlign object
1080 Args : Positive integer for start column, positive integer for end column,
1081 optional boolean which if true will keep gap-only columns in the newly
1082 created slice. Example:
1084 $aln2 = $aln->slice(20,30,1)
1090 my ($start, $end, $keep_gap_only) = @_;
1092 $self->throw("Slice start has to be a positive integer, not [$start]")
1093 unless $start =~ /^\d+$/ and $start > 0;
1094 $self->throw("Slice end has to be a positive integer, not [$end]")
1095 unless $end =~ /^\d+$/ and $end > 0;
1096 $self->throw("Slice start [$start] has to be smaller than or equal to end [$end]")
1097 unless $start <= $end;
1098 $self->throw("This alignment has only ". $self->length . " residues. Slice start " .
1099 "[$start] is too big.") if $start > $self->length;
1100 my $cons_meta = $self->consensus_meta;
1101 my $aln = $self->new;
1102 $aln->id($self->id);
1103 foreach my $seq ( $self->each_seq() ) {
1104 my $new_seq = $seq->isa('Bio::Seq::MetaI') ?
1107 -alphabet
=> $seq->alphabet,
1108 -strand
=> $seq->strand,
1109 -verbose
=> $self->verbose) :
1110 Bio
::LocatableSeq
->new
1112 -alphabet
=> $seq->alphabet,
1113 -strand
=> $seq->strand,
1114 -verbose
=> $self->verbose);
1118 $seq_end = $seq->length if( $end > $seq->length );
1120 my $slice_seq = $seq->subseq($start, $seq_end);
1121 $new_seq->seq( $slice_seq );
1123 $slice_seq =~ s/\W//g;
1126 my $pre_start_seq = $seq->subseq(1, $start - 1);
1127 $pre_start_seq =~ s/\W//g;
1128 if (!defined($seq->strand)) {
1129 $new_seq->start( $seq->start + CORE
::length($pre_start_seq) );
1130 } elsif ($seq->strand < 0){
1131 $new_seq->start( $seq->end - CORE
::length($pre_start_seq) - CORE
::length($slice_seq) + 1);
1133 $new_seq->start( $seq->start + CORE
::length($pre_start_seq) );
1136 $new_seq->start( $seq->start);
1138 if ($new_seq->isa('Bio::Seq::MetaI')) {
1139 for my $meta_name ($seq->meta_names) {
1140 $new_seq->named_meta($meta_name, $seq->named_submeta($meta_name, $start, $end));
1143 $new_seq->end( $new_seq->start + CORE
::length($slice_seq) - 1 );
1145 if ($new_seq->start and $new_seq->end >= $new_seq->start) {
1146 $aln->add_seq($new_seq);
1148 if( $keep_gap_only ) {
1149 $aln->add_seq($new_seq);
1151 my $nse = $seq->get_nse();
1152 $self->warn("Slice [$start-$end] of sequence [$nse] contains no residues.".
1153 " Sequence excluded from the new alignment.");
1158 my $new = Bio
::Seq
::Meta
->new();
1159 for my $meta_name ($cons_meta->meta_names) {
1160 $new->named_meta($meta_name, $cons_meta->named_submeta($meta_name, $start, $end));
1162 $aln->consensus_meta($new);
1164 $aln->annotation($self->annotation);
1165 # fix for meta, sf, ann
1169 =head2 remove_columns
1171 Title : remove_columns
1172 Usage : $aln2 = $aln->remove_columns(['mismatch','weak']) or
1173 $aln2 = $aln->remove_columns([0,0],[6,8])
1174 Function : Creates an aligment with columns removed corresponding to
1175 the specified type or by specifying the columns by number.
1176 Returns : Bio::SimpleAlign object
1177 Args : Array ref of types ('match'|'weak'|'strong'|'mismatch'|'gaps'|
1178 'all_gaps_columns') or array ref where the referenced array
1179 contains a pair of integers that specify a range.
1180 The first column is 0
1184 sub remove_columns
{
1185 my ($self,@args) = @_;
1186 @args || $self->throw("Must supply column ranges or column types");
1189 if ($args[0][0] =~ /^[a-z_]+$/i) {
1190 $aln = $self->_remove_columns_by_type($args[0]);
1191 } elsif ($args[0][0] =~ /^\d+$/) {
1192 $aln = $self->_remove_columns_by_num(\
@args);
1194 $self->throw("You must pass array references to remove_columns(), not @args");
1196 # fix for meta, sf, ann
1204 Usage : $aln2 = $aln->remove_gaps
1205 Function : Creates an aligment with gaps removed
1206 Returns : a Bio::SimpleAlign object
1207 Args : a gap character(optional) if none specified taken
1208 from $self->gap_char,
1209 [optional] $all_gaps_columns flag (1 or 0, default is 0)
1210 indicates that only all-gaps columns should be deleted
1212 Used from method L<remove_columns> in most cases. Set gap character
1213 using L<gap_char()|gap_char>.
1218 my ($self,$gapchar,$all_gaps_columns) = @_;
1220 if ($all_gaps_columns) {
1221 $gap_line = $self->all_gap_line($gapchar);
1223 $gap_line = $self->gap_line($gapchar);
1225 my $aln = $self->new;
1229 my $del_char = $gapchar || $self->gap_char;
1230 # Do the matching to get the segments to remove
1231 while ($gap_line =~ m/[$del_char]/g) {
1232 my $start = pos($gap_line)-1;
1233 $gap_line=~/\G[$del_char]+/gc;
1234 my $end = pos($gap_line)-1;
1236 #have to offset the start and end for subsequent removes
1239 $length += ($end-$start+1);
1240 push @remove, [$start,$end];
1243 #remove the segments
1244 $aln = $#remove >= 0 ?
$self->_remove_col($aln,\
@remove) : $self;
1245 # fix for meta, sf, ann
1251 my ($self,$aln,$remove) = @_;
1254 my $gap = $self->gap_char;
1256 # splice out the segments and create new seq
1257 foreach my $seq($self->each_seq){
1258 my $new_seq = Bio
::LocatableSeq
->new(
1260 -alphabet
=> $seq->alphabet,
1261 -strand
=> $seq->strand,
1262 -verbose
=> $self->verbose);
1263 my $sequence = $seq->seq;
1264 foreach my $pair(@
{$remove}){
1265 my $start = $pair->[0];
1266 my $end = $pair->[1];
1267 $sequence = $seq->seq unless $sequence;
1268 my $orig = $sequence;
1269 my $head = $start > 0 ?
substr($sequence, 0, $start) : '';
1270 my $tail = ($end + 1) >= length($sequence) ?
'' : substr($sequence, $end + 1);
1271 $sequence = $head.$tail;
1273 unless (defined $new_seq->start) {
1275 my $start_adjust = () = substr($orig, 0, $end + 1) =~ /$gap/g;
1276 $new_seq->start($seq->start + $end + 1 - $start_adjust);
1279 my $start_adjust = $orig =~ /^$gap+/;
1280 if ($start_adjust) {
1281 $start_adjust = $+[0] == $start;
1283 $new_seq->start($seq->start + $start_adjust);
1287 if (($end + 1) >= length($orig)) {
1288 my $end_adjust = () = substr($orig, $start) =~ /$gap/g;
1289 $new_seq->end($seq->end - (length($orig) - $start) + $end_adjust);
1292 $new_seq->end($seq->end);
1296 if ($new_seq->end < $new_seq->start) {
1297 # we removed all columns except for gaps: set to 0 to indicate no
1303 $new_seq->seq($sequence) if $sequence;
1304 push @new, $new_seq;
1306 # add the new seqs to the alignment
1307 foreach my $new(@new){
1308 $aln->add_seq($new);
1310 # fix for meta, sf, ann
1314 sub _remove_columns_by_type
{
1315 my ($self,$type) = @_;
1316 my $aln = $self->new;
1319 my $gap = $self->gap_char if (grep { $_ eq 'gaps'} @
{$type});
1320 my $all_gaps_columns = $self->gap_char if (grep /all_gaps_columns/,@
{$type});
1321 my %matchchars = ( 'match' => '\*',
1326 'all_gaps_columns' => ''
1328 # get the characters to delete against
1330 foreach my $type (@
{$type}){
1331 $del_char.= $matchchars{$type};
1335 my $match_line = $self->match_line;
1336 # do the matching to get the segments to remove
1338 while($match_line =~ m/[$del_char]/g ){
1339 my $start = pos($match_line)-1;
1340 $match_line=~/\G[$del_char]+/gc;
1341 my $end = pos($match_line)-1;
1343 #have to offset the start and end for subsequent removes
1346 $length += ($end-$start+1);
1347 push @remove, [$start,$end];
1351 # remove the segments
1352 $aln = $#remove >= 0 ?
$self->_remove_col($aln,\
@remove) : $self;
1353 $aln = $aln->remove_gaps() if $gap;
1354 $aln = $aln->remove_gaps('', 1) if $all_gaps_columns;
1355 # fix for meta, sf, ann
1360 sub _remove_columns_by_num
{
1361 my ($self,$positions) = @_;
1362 my $aln = $self->new;
1364 # sort the positions
1365 @
$positions = sort { $a->[0] <=> $b->[0] } @
$positions;
1369 foreach my $pos (@
{$positions}) {
1370 my ($start, $end) = @
{$pos};
1372 #have to offset the start and end for subsequent removes
1375 $length += ($end-$start+1);
1376 push @remove, [$start,$end];
1379 #remove the segments
1380 $aln = $#remove >= 0 ?
$self->_remove_col($aln,\
@remove) : $self;
1381 # fix for meta, sf, ann
1386 =head1 Change sequences within the MSA
1388 These methods affect characters in all sequences without changing the
1391 =head2 splice_by_seq_pos
1393 Title : splice_by_seq_pos
1394 Usage : $status = splice_by_seq_pos(1);
1395 Function: splices all aligned sequences where the specified sequence
1398 Returns : 1 on success
1399 Args : position of sequence to splice by
1404 sub splice_by_seq_pos
{
1405 my ($self,$pos) = @_;
1407 my $guide = $self->get_seq_by_pos($pos);
1408 my $guide_seq = $guide->seq;
1410 $guide_seq =~ s/\./\-/g;
1414 while(($pos = index($guide_seq, '-', $pos)) > -1 ){
1415 unshift @gaps, $pos;
1419 foreach my $seq ($self->each_seq){
1420 my @bases = split '', $seq->seq;
1422 splice(@bases, $_, 1) foreach @gaps;
1423 $seq->seq(join('', @bases));
1432 Usage : $ali->map_chars('\.','-')
1433 Function : Does a s/$arg1/$arg2/ on the sequences. Useful for gap
1436 Notice that the from (arg1) is interpretted as a regex,
1437 so be careful about quoting meta characters (eg
1438 $ali->map_chars('.','-') wont do what you want)
1440 Argument : 'from' rexexp
1451 $self->throw("Need exactly two arguments")
1452 unless defined $from and defined $to;
1454 foreach $seq ( $self->each_seq() ) {
1455 $temp = $seq->seq();
1456 $temp =~ s/$from/$to/g;
1466 Usage : $ali->uppercase()
1467 Function : Sets all the sequences to uppercase
1478 foreach $seq ( $self->each_seq() ) {
1479 $temp = $seq->seq();
1480 $temp =~ tr/[a-z]/[A-Z]/;
1489 Title : cigar_line()
1490 Usage : %cigars = $align->cigar_line()
1491 Function : Generates a "cigar" (Compact Idiosyncratic Gapped Alignment
1492 Report) line for each sequence in the alignment. Examples are
1493 "1,60" or "5,10:12,58", where the numbers refer to conserved
1494 positions within the alignment. The keys of the hash are the
1495 NSEs (name/start/end) assigned to each sequence.
1496 Args : threshold (optional, defaults to 100)
1497 Returns : Hash of strings (cigar lines)
1506 my @consensus = split "",($self->consensus_string($thr));
1507 my $len = $self->length;
1508 my $gapchar = $self->gap_char;
1510 # create a precursor, something like (1,4,5,6,7,33,45),
1511 # where each number corresponds to a conserved position
1512 foreach my $seq ( $self->each_seq ) {
1513 my @seq = split "", uc ($seq->seq);
1515 for (my $x = 0 ; $x < $len ; $x++ ) {
1516 if ($seq[$x] eq $consensus[$x]) {
1517 push @
{$cigars{$seq->get_nse}},$pos;
1519 } elsif ($seq[$x] ne $gapchar) {
1524 # duplicate numbers - (1,4,5,6,7,33,45) becomes (1,1,4,5,6,7,33,33,45,45)
1525 for my $name (keys %cigars) {
1526 splice @
{$cigars{$name}}, 1, 0, ${$cigars{$name}}[0] if
1527 ( ${$cigars{$name}}[0] + 1 < ${$cigars{$name}}[1] );
1528 push @
{$cigars{$name}}, ${$cigars{$name}}[$#{$cigars{$name}}] if
1529 ( ${$cigars{$name}}[($#{$cigars{$name}} - 1)] + 1 <
1530 ${$cigars{$name}}[$#{$cigars{$name}}] );
1531 for ( my $x = 1 ; $x < $#{$cigars{$name}} - 1 ; $x++) {
1532 if (${$cigars{$name}}[$x - 1] + 1 < ${$cigars{$name}}[$x] &&
1533 ${$cigars{$name}}[$x + 1] > ${$cigars{$name}}[$x] + 1) {
1534 splice @
{$cigars{$name}}, $x, 0, ${$cigars{$name}}[$x];
1538 # collapse series - (1,1,4,5,6,7,33,33,45,45) becomes (1,1,4,7,33,33,45,45)
1539 for my $name (keys %cigars) {
1541 for ( my $x = 0 ; $x < $#{$cigars{$name}} ; $x++) {
1542 if ( ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x - 1)] + 1 &&
1543 ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x + 1)] - 1 ) {
1547 for my $pos (@remove) {
1548 splice @
{$cigars{$name}}, $pos, 1;
1551 # join and punctuate
1552 for my $name (keys %cigars) {
1553 my ($start,$end,$str) = "";
1554 while ( ($start,$end) = splice @
{$cigars{$name}}, 0, 2 ) {
1555 $str .= ($start . "," . $end . ":");
1558 $cigars{$name} = $str;
1566 Title : match_line()
1567 Usage : $line = $align->match_line()
1568 Function : Generates a match line - much like consensus string
1569 except that a line indicating the '*' for a match.
1570 Args : (optional) Match line characters ('*' by default)
1571 (optional) Strong match char (':' by default)
1572 (optional) Weak match char ('.' by default)
1578 my ($self,$matchlinechar, $strong, $weak) = @_;
1579 my %matchchars = ('match' => $matchlinechar || '*',
1580 'weak' => $weak || '.',
1581 'strong' => $strong || ':',
1587 foreach my $seq ( $self->each_seq ) {
1588 push @seqchars, [ split(//, uc ($seq->seq)) ];
1589 $alphabet = $seq->alphabet unless defined $alphabet;
1591 my $refseq = shift @seqchars;
1592 # let's just march down the columns
1595 foreach my $pos ( 0..$self->length ) {
1596 my $refchar = $refseq->[$pos];
1597 my $char = $matchchars{'mismatch'};
1598 unless( defined $refchar ) {
1599 last if $pos == $self->length; # short circuit on last residue
1600 # this in place to handle jason's soon-to-be-committed
1601 # intron mapping code
1604 my %col = ($refchar => 1);
1605 my $dash = ($refchar eq '-' || $refchar eq '.' || $refchar eq ' ');
1606 foreach my $seq ( @seqchars ) {
1607 next if $pos >= scalar @
$seq;
1608 $dash = 1 if( $seq->[$pos] eq '-' || $seq->[$pos] eq '.' ||
1609 $seq->[$pos] eq ' ' );
1610 $col{$seq->[$pos]}++ if defined $seq->[$pos];
1612 my @colresidues = sort keys %col;
1614 # if all the values are the same
1615 if( $dash ) { $char = $matchchars{'mismatch'} }
1616 elsif( @colresidues == 1 ) { $char = $matchchars{'match'} }
1617 elsif( $alphabet eq 'protein' ) { # only try to do weak/strong
1618 # matches for protein seqs
1620 foreach my $type ( qw(strong weak) ) {
1621 # iterate through categories
1623 # iterate through each of the aa in the col
1624 # look to see which groups it is in
1625 foreach my $c ( @colresidues ) {
1626 foreach my $f ( grep { index($_,$c) >= 0 } @
{$CONSERVATION_GROUPS{$type}} ) {
1627 push @
{$groups{$f}},$c;
1631 foreach my $cols ( values %groups ) {
1632 @
$cols = sort @
$cols;
1633 # now we are just testing to see if two arrays
1634 # are identical w/o changing either one
1635 # have to be same len
1636 next if( scalar @
$cols != scalar @colresidues );
1637 # walk down the length and check each slot
1638 for($_=0;$_ < (scalar @
$cols);$_++ ) {
1639 next GRP
if( $cols->[$_] ne $colresidues[$_] );
1641 $char = $matchchars{$type};
1647 $matchline .= $char;
1656 Usage : $line = $align->gap_line()
1657 Function : Generates a gap line - much like consensus string
1658 except that a line where '-' represents gap
1659 Args : (optional) gap line characters ('-' by default)
1665 my ($self,$gapchar) = @_;
1666 $gapchar = $gapchar || $self->gap_char;
1667 my %gap_hsh; # column gaps vector
1668 foreach my $seq ( $self->each_seq ) {
1670 map {$gap_hsh{$_->[0]} = undef} grep {$_->[1] eq $gapchar}
1671 map {[$i++, $_]} split(//, uc ($seq->seq));
1674 foreach my $pos ( 0..$self->length-1 ) {
1675 $gap_line .= (exists $gap_hsh{$pos}) ?
$gapchar:'.';
1682 Title : all_gap_line()
1683 Usage : $line = $align->all_gap_line()
1684 Function : Generates a gap line - much like consensus string
1685 except that a line where '-' represents all-gap column
1686 Args : (optional) gap line characters ('-' by default)
1692 my ($self,$gapchar) = @_;
1693 $gapchar = $gapchar || $self->gap_char;
1694 my %gap_hsh; # column gaps counter hash
1695 my @seqs = $self->each_seq;
1696 foreach my $seq ( @seqs ) {
1698 map {$gap_hsh{$_->[0]}++} grep {$_->[1] eq $gapchar}
1699 map {[$i++, $_]} split(//, uc ($seq->seq));
1702 foreach my $pos ( 0..$self->length-1 ) {
1703 if (exists $gap_hsh{$pos} && $gap_hsh{$pos} == scalar @seqs) {
1705 $gap_line .= $gapchar;
1713 =head2 gap_col_matrix
1715 Title : gap_col_matrix()
1716 Usage : my $cols = $align->gap_col_matrix()
1717 Function : Generates an array of hashes where
1718 each entry in the array is a hash reference
1719 with keys of all the sequence names and
1720 and value of 1 or 0 if the sequence has a gap at that column
1721 Args : (optional) gap line characters ($aln->gap_char or '-' by default)
1725 sub gap_col_matrix
{
1726 my ($self,$gapchar) = @_;
1727 $gapchar = $gapchar || $self->gap_char;
1728 my %gap_hsh; # column gaps vector
1730 foreach my $seq ( $self->each_seq ) {
1732 my $str = $seq->seq;
1733 my $len = $seq->length;
1735 my $id = $seq->display_id;
1736 while( $i < $len ) {
1737 $ch = substr($str, $i, 1);
1738 $cols[$i++]->{$id} = ($ch eq $gapchar);
1747 Usage : $ali->match()
1748 Function : Goes through all columns and changes residues that are
1749 identical to residue in first sequence to match '.'
1750 character. Sets match_char.
1752 USE WITH CARE: Most MSA formats do not support match
1753 characters in sequences, so this is mostly for output
1754 only. NEXUS format (Bio::AlignIO::nexus) can handle
1757 Argument : a match character, optional, defaults to '.'
1762 my ($self, $match) = @_;
1765 my ($matching_char) = $match;
1766 $matching_char = "\\$match" if $match =~ /[\^.$|()\[\]]/ ; #';
1767 $self->map_chars($matching_char, '-');
1769 my @seqs = $self->each_seq();
1770 return 1 unless scalar @seqs > 1;
1772 my $refseq = shift @seqs ;
1773 my @refseq = split //, $refseq->seq;
1774 my $gapchar = $self->gap_char;
1776 foreach my $seq ( @seqs ) {
1777 my @varseq = split //, $seq->seq();
1778 for ( my $i=0; $i < scalar @varseq; $i++) {
1779 $varseq[$i] = $match if defined $refseq[$i] &&
1780 ( $refseq[$i] =~ /[A-Za-z\*]/ ||
1781 $refseq[$i] =~ /$gapchar/ )
1782 && $refseq[$i] eq $varseq[$i];
1784 $seq->seq(join '', @varseq);
1786 $self->match_char($match);
1794 Usage : $ali->unmatch()
1795 Function : Undoes the effect of method match. Unsets match_char.
1797 Argument : a match character, optional, defaults to '.'
1799 See L<match> and L<match_char>
1804 my ($self, $match) = @_;
1808 my @seqs = $self->each_seq();
1809 return 1 unless scalar @seqs > 1;
1811 my $refseq = shift @seqs ;
1812 my @refseq = split //, $refseq->seq;
1813 my $gapchar = $self->gap_char;
1814 foreach my $seq ( @seqs ) {
1815 my @varseq = split //, $seq->seq();
1816 for ( my $i=0; $i < scalar @varseq; $i++) {
1817 $varseq[$i] = $refseq[$i] if defined $refseq[$i] &&
1818 ( $refseq[$i] =~ /[A-Za-z\*]/ ||
1819 $refseq[$i] =~ /$gapchar/ ) &&
1820 $varseq[$i] eq $match;
1822 $seq->seq(join '', @varseq);
1824 $self->match_char('');
1828 =head1 MSA attributes
1830 Methods for setting and reading the MSA attributes.
1832 Note that the methods defining character semantics depend on the user
1833 to set them sensibly. They are needed only by certain input/output
1834 methods. Unset them by setting to an empty string ('').
1839 Usage : $myalign->id("Ig")
1840 Function : Gets/sets the id field of the alignment
1841 Returns : An id string
1842 Argument : An id string (optional)
1847 my ($self, $name) = @_;
1849 if (defined( $name )) {
1850 $self->{'_id'} = $name;
1853 return $self->{'_id'};
1859 Usage : $myalign->accession("PF00244")
1860 Function : Gets/sets the accession field of the alignment
1861 Returns : An acc string
1862 Argument : An acc string (optional)
1867 my ($self, $acc) = @_;
1869 if (defined( $acc )) {
1870 $self->{'_accession'} = $acc;
1873 return $self->{'_accession'};
1879 Usage : $myalign->description("14-3-3 proteins")
1880 Function : Gets/sets the description field of the alignment
1881 Returns : An description string
1882 Argument : An description string (optional)
1887 my ($self, $name) = @_;
1889 if (defined( $name )) {
1890 $self->{'_description'} = $name;
1893 return $self->{'_description'};
1898 Title : missing_char
1899 Usage : $myalign->missing_char("?")
1900 Function : Gets/sets the missing_char attribute of the alignment
1901 It is generally recommended to set it to 'n' or 'N'
1902 for nucleotides and to 'X' for protein.
1903 Returns : An missing_char string,
1904 Argument : An missing_char string (optional)
1909 my ($self, $char) = @_;
1911 if (defined $char ) {
1912 $self->throw("Single missing character, not [$char]!") if CORE
::length($char) > 1;
1913 $self->{'_missing_char'} = $char;
1916 return $self->{'_missing_char'};
1922 Usage : $myalign->match_char('.')
1923 Function : Gets/sets the match_char attribute of the alignment
1924 Returns : An match_char string,
1925 Argument : An match_char string (optional)
1930 my ($self, $char) = @_;
1932 if (defined $char ) {
1933 $self->throw("Single match character, not [$char]!") if CORE
::length($char) > 1;
1934 $self->{'_match_char'} = $char;
1937 return $self->{'_match_char'};
1943 Usage : $myalign->gap_char('-')
1944 Function : Gets/sets the gap_char attribute of the alignment
1945 Returns : An gap_char string, defaults to '-'
1946 Argument : An gap_char string (optional)
1951 my ($self, $char) = @_;
1953 if (defined $char || ! defined $self->{'_gap_char'} ) {
1954 $char= '-' unless defined $char;
1955 $self->throw("Single gap character, not [$char]!") if CORE
::length($char) > 1;
1956 $self->{'_gap_char'} = $char;
1958 return $self->{'_gap_char'};
1963 Title : symbol_chars
1964 Usage : my @symbolchars = $aln->symbol_chars;
1965 Function: Returns all the seen symbols (other than gaps)
1966 Returns : array of characters that are the seen symbols
1967 Args : boolean to include the gap/missing/match characters
1972 my ($self,$includeextra) = @_;
1974 unless ($self->{'_symbols'}) {
1975 foreach my $seq ($self->each_seq) {
1976 map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq);
1979 my %copy = %{$self->{'_symbols'}};
1980 if( ! $includeextra ) {
1981 foreach my $char ( $self->gap_char, $self->match_char,
1982 $self->missing_char) {
1983 delete $copy{$char} if( defined $char );
1989 =head1 Alignment descriptors
1991 These read only methods describe the MSA in various ways.
1997 Usage : $str = $ali->score()
1998 Function : get/set a score of the alignment
1999 Returns : a score for the alignment
2000 Argument : an optional score to set
2006 $self->{score
} = shift if @_;
2007 return $self->{score
};
2010 =head2 consensus_string
2012 Title : consensus_string
2013 Usage : $str = $ali->consensus_string($threshold_percent)
2014 Function : Makes a strict consensus
2015 Returns : Consensus string
2016 Argument : Optional treshold ranging from 0 to 100.
2017 The consensus residue has to appear at least threshold %
2018 of the sequences at a given location, otherwise a '?'
2019 character will be placed at that location.
2020 (Default value = 0%)
2024 sub consensus_string
{
2026 my $threshold = shift;
2029 my $len = $self->length - 1;
2031 foreach ( 0 .. $len ) {
2032 $out .= $self->_consensus_aa($_,$threshold);
2040 my $threshold_percent = shift || -1 ;
2041 my ($seq,%hash,$count,$letter,$key);
2042 my $gapchar = $self->gap_char;
2043 foreach $seq ( $self->each_seq() ) {
2044 $letter = substr($seq->seq,$point,1);
2045 $self->throw("--$point-----------") if $letter eq '';
2046 ($letter eq $gapchar || $letter =~ /\./) && next;
2047 # print "Looking at $letter\n";
2050 my $number_of_sequences = $self->num_sequences();
2051 my $threshold = $number_of_sequences * $threshold_percent / 100. ;
2055 foreach $key ( sort keys %hash ) {
2056 # print "Now at $key $hash{$key}\n";
2057 if( $hash{$key} > $count && $hash{$key} >= $threshold) {
2059 $count = $hash{$key};
2066 =head2 consensus_iupac
2068 Title : consensus_iupac
2069 Usage : $str = $ali->consensus_iupac()
2070 Function : Makes a consensus using IUPAC ambiguity codes from DNA
2071 and RNA. The output is in upper case except when gaps in
2072 a column force output to be in lower case.
2074 Note that if your alignment sequences contain a lot of
2075 IUPAC ambiquity codes you often have to manually set
2076 alphabet. Bio::PrimarySeq::_guess_type thinks they
2077 indicate a protein sequence.
2078 Returns : consensus string
2080 Throws : on protein sequences
2084 sub consensus_iupac
{
2087 my $len = $self->length-1;
2089 # only DNA and RNA sequences are valid
2090 foreach my $seq ( $self->each_seq() ) {
2091 $self->throw("Seq [". $seq->get_nse. "] is a protein")
2092 if $seq->alphabet eq 'protein';
2094 # loop over the alignment columns
2095 foreach my $count ( 0 .. $len ) {
2096 $out .= $self->_consensus_iupac($count);
2101 sub _consensus_iupac
{
2102 my ($self, $column) = @_;
2103 my ($string, $char, $rna);
2105 #determine all residues in a column
2106 foreach my $seq ( $self->each_seq() ) {
2107 $string .= substr($seq->seq, $column, 1);
2109 $string = uc $string;
2111 # quick exit if there's an N in the string
2112 if ($string =~ /N/) {
2113 $string =~ /\W/ ?
return 'n' : return 'N';
2115 # ... or if there are only gap characters
2116 return '-' if $string =~ /^\W+$/;
2118 # treat RNA as DNA in regexps
2119 if ($string =~ /U/) {
2124 # the following s///'s only need to be done to the _first_ ambiguity code
2125 # as we only need to see the _range_ of characters in $string
2127 if ($string =~ /[VDHB]/) {
2128 $string =~ s/V/AGC/;
2129 $string =~ s/D/AGT/;
2130 $string =~ s/H/ACT/;
2131 $string =~ s/B/CTG/;
2134 if ($string =~ /[SKYRWM]/) {
2143 # and now the guts of the thing
2145 if ($string =~ /A/) {
2147 if ($string =~ /G/) {
2148 $char = 'R'; # A and G (purines) R
2149 if ($string =~ /C/) {
2150 $char = 'V'; # A and G and C V
2151 if ($string =~ /T/) {
2152 $char = 'N'; # A and G and C and T N
2154 } elsif ($string =~ /T/) {
2155 $char = 'D'; # A and G and T D
2157 } elsif ($string =~ /C/) {
2158 $char = 'M'; # A and C M
2159 if ($string =~ /T/) {
2160 $char = 'H'; # A and C and T H
2162 } elsif ($string =~ /T/) {
2163 $char = 'W'; # A and T W
2165 } elsif ($string =~ /C/) {
2167 if ($string =~ /T/) {
2168 $char = 'Y'; # C and T (pyrimidines) Y
2169 if ($string =~ /G/) {
2170 $char = 'B'; # C and T and G B
2172 } elsif ($string =~ /G/) {
2173 $char = 'S'; # C and G S
2175 } elsif ($string =~ /G/) {
2177 if ($string =~ /C/) {
2178 $char = 'S'; # G and C S
2179 } elsif ($string =~ /T/) {
2180 $char = 'K'; # G and T K
2182 } elsif ($string =~ /T/) {
2186 $char = 'U' if $rna and $char eq 'T';
2187 $char = lc $char if $string =~ /\W/;
2193 =head2 consensus_meta
2195 Title : consensus_meta
2196 Usage : $seqmeta = $ali->consensus_meta()
2197 Function : Returns a Bio::Seq::Meta object containing the consensus
2198 strings derived from meta data analysis.
2199 Returns : Bio::Seq::Meta
2200 Argument : Bio::Seq::Meta
2201 Throws : non-MetaI object
2205 sub consensus_meta
{
2206 my ($self, $meta) = @_;
2207 if ($meta && (!ref $meta || !$meta->isa('Bio::Seq::MetaI'))) {
2208 $self->throw('Not a Bio::Seq::MetaI object');
2210 return $self->{'_aln_meta'} = $meta if $meta;
2211 return $self->{'_aln_meta'}
2217 Usage : if ( $ali->is_flush() )
2218 Function : Tells you whether the alignment
2219 : is flush, i.e. all of the same length
2226 my ($self,$report) = @_;
2231 foreach $seq ( $self->each_seq() ) {
2232 if( $length == (-1) ) {
2233 $length = CORE
::length($seq->seq());
2237 $temp = CORE
::length($seq->seq());
2238 if( $temp != $length ) {
2239 $self->warn("expecting $length not $temp from ".
2240 $seq->display_id) if( $report );
2241 $self->debug("expecting $length not $temp from ".
2243 $self->debug($seq->seq(). "\n");
2255 Usage : $len = $ali->length()
2256 Function : Returns the maximum length of the alignment.
2257 To be sure the alignment is a block, use is_flush
2265 $self->deprecated("length_aln - deprecated method. Use length() instead.");
2275 foreach $seq ( $self->each_seq() ) {
2276 $temp = $seq->length();
2277 if( $temp > $length ) {
2286 =head2 maxdisplayname_length
2288 Title : maxdisplayname_length
2289 Usage : $ali->maxdisplayname_length()
2290 Function : Gets the maximum length of the displayname in the
2291 alignment. Used in writing out various MSA formats.
2297 sub maxname_length
{
2299 $self->deprecated("maxname_length - deprecated method.".
2300 " Use maxdisplayname_length() instead.");
2301 $self->maxdisplayname_length();
2306 $self->deprecated("maxnse_length - deprecated method.".
2307 " Use maxnse_length() instead.");
2308 $self->maxdisplayname_length();
2311 sub maxdisplayname_length
{
2316 foreach $seq ( $self->each_seq() ) {
2317 $len = CORE
::length $self->displayname($seq->get_nse());
2319 if( $len > $maxname ) {
2327 =head2 max_metaname_length
2329 Title : max_metaname_length
2330 Usage : $ali->max_metaname_length()
2331 Function : Gets the maximum length of the meta name tags in the
2332 alignment for the sequences and for the alignment.
2333 Used in writing out various MSA formats.
2339 sub max_metaname_length
{
2344 # check seq meta first
2345 for $seq ( $self->each_seq() ) {
2346 next if !$seq->isa('Bio::Seq::MetaI' || !$seq->meta_names);
2347 for my $mtag ($seq->meta_names) {
2348 $len = CORE
::length $mtag;
2349 if( $len > $maxname ) {
2356 for my $meta ($self->consensus_meta) {
2358 for my $name ($meta->meta_names) {
2359 $len = CORE
::length $name;
2360 if( $len > $maxname ) {
2371 Title : num_residues
2372 Usage : $no = $ali->num_residues
2373 Function : number of residues in total in the alignment
2376 Note : replaces no_residues()
2384 foreach my $seq ($self->each_seq) {
2385 my $str = $seq->seq();
2387 $count += ($str =~ s/[A-Za-z]//g);
2393 =head2 num_sequences
2395 Title : num_sequences
2396 Usage : $depth = $ali->num_sequences
2397 Function : number of sequence in the sequence alignment
2400 Note : replaces no_sequences()
2406 return scalar($self->each_seq);
2410 =head2 average_percentage_identity
2412 Title : average_percentage_identity
2413 Usage : $id = $align->average_percentage_identity
2414 Function: The function uses a fast method to calculate the average
2415 percentage identity of the alignment
2416 Returns : The average percentage identity of the alignment
2418 Notes : This method implemented by Kevin Howe calculates a figure that is
2419 designed to be similar to the average pairwise identity of the
2420 alignment (identical in the absence of gaps), without having to
2421 explicitly calculate pairwise identities proposed by Richard Durbin.
2422 Validated by Ewan Birney ad Alex Bateman.
2426 sub average_percentage_identity
{
2427 my ($self,@args) = @_;
2429 my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
2430 'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
2432 my ($len, $total, $subtotal, $divisor, $subdivisor, @seqs, @countHashes);
2434 if (! $self->is_flush()) {
2435 $self->throw("All sequences in the alignment must be the same length");
2438 @seqs = $self->each_seq();
2439 $len = $self->length();
2441 # load the each hash with correct keys for existence checks
2443 for( my $index=0; $index < $len; $index++) {
2444 foreach my $letter (@alphabet) {
2445 $countHashes[$index]->{$letter} = 0;
2448 foreach my $seq (@seqs) {
2449 my @seqChars = split //, $seq->seq();
2450 for( my $column=0; $column < @seqChars; $column++ ) {
2451 my $char = uc($seqChars[$column]);
2452 if (exists $countHashes[$column]->{$char}) {
2453 $countHashes[$column]->{$char}++;
2460 for(my $column =0; $column < $len; $column++) {
2461 my %hash = %{$countHashes[$column]};
2463 foreach my $res (keys %hash) {
2464 $total += $hash{$res}*($hash{$res} - 1);
2465 $subdivisor += $hash{$res};
2467 $divisor += $subdivisor * ($subdivisor - 1);
2469 return $divisor > 0 ?
($total / $divisor )*100.0 : 0;
2472 =head2 percentage_identity
2474 Title : percentage_identity
2475 Usage : $id = $align->percentage_identity
2476 Function: The function calculates the average percentage identity
2477 (aliased to average_percentage_identity)
2478 Returns : The average percentage identity
2483 sub percentage_identity
{
2485 return $self->average_percentage_identity();
2488 =head2 overall_percentage_identity
2490 Title : overall_percentage_identity
2491 Usage : $id = $align->overall_percentage_identity
2492 $id = $align->overall_percentage_identity('short')
2493 Function: The function calculates the percentage identity of
2494 the conserved columns
2495 Returns : The percentage identity of the conserved columns
2496 Args : length value to use, optional defaults to alignment length
2497 possible values: 'align', 'short', 'long'
2499 The argument values 'short' and 'long' refer to shortest and longest
2500 sequence in the alignment. Method modification code by Hongyu Zhang.
2504 sub overall_percentage_identity
{
2505 my ($self, $length_measure) = @_;
2507 my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
2508 'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
2510 my ($len, $total, @seqs, @countHashes);
2512 my %enum = map {$_ => 1} qw
(align short long
);
2514 $self->throw("Unknown argument [$length_measure]")
2515 if $length_measure and not $enum{$length_measure};
2516 $length_measure ||= 'align';
2518 if (! $self->is_flush()) {
2519 $self->throw("All sequences in the alignment must be the same length");
2522 @seqs = $self->each_seq();
2523 $len = $self->length();
2525 # load the each hash with correct keys for existence checks
2526 for( my $index=0; $index < $len; $index++) {
2527 foreach my $letter (@alphabet) {
2528 $countHashes[$index]->{$letter} = 0;
2531 foreach my $seq (@seqs) {
2532 my @seqChars = split //, $seq->seq();
2533 for( my $column=0; $column < @seqChars; $column++ ) {
2534 my $char = uc($seqChars[$column]);
2535 if (exists $countHashes[$column]->{$char}) {
2536 $countHashes[$column]->{$char}++;
2542 for(my $column =0; $column < $len; $column++) {
2543 my %hash = %{$countHashes[$column]};
2544 foreach ( values %hash ) {
2546 $total++ if( $_ == scalar @seqs );
2551 if ($length_measure eq 'short') {
2552 ## find the shortest length
2554 foreach my $seq ($self->each_seq) {
2555 my $count = $seq->seq =~ tr/[A-Za-z]//;
2557 $len = $count if $count < $len;
2563 elsif ($length_measure eq 'long') {
2564 ## find the longest length
2566 foreach my $seq ($self->each_seq) {
2567 my $count = $seq->seq =~ tr/[A-Za-z]//;
2569 $len = $count if $count > $len;
2576 return ($total / $len ) * 100.0;
2581 =head1 Alignment positions
2583 Methods to map a sequence position into an alignment column and back.
2584 column_from_residue_number() does the former. The latter is really a
2585 property of the sequence object and can done using
2586 L<Bio::LocatableSeq::location_from_column>:
2588 # select somehow a sequence from the alignment, e.g.
2589 my $seq = $aln->get_seq_by_pos(1);
2590 #$loc is undef or Bio::LocationI object
2591 my $loc = $seq->location_from_column(5);
2593 =head2 column_from_residue_number
2595 Title : column_from_residue_number
2596 Usage : $col = $ali->column_from_residue_number( $seqname, $resnumber)
2597 Function: This function gives the position in the alignment
2598 (i.e. column number) of the given residue number in the
2599 sequence with the given name. For example, for the
2602 Seq1/91-97 AC..DEF.GH.
2603 Seq2/24-30 ACGG.RTY...
2604 Seq3/43-51 AC.DDEF.GHI
2606 column_from_residue_number( "Seq1", 94 ) returns 6.
2607 column_from_residue_number( "Seq2", 25 ) returns 2.
2608 column_from_residue_number( "Seq3", 50 ) returns 10.
2610 An exception is thrown if the residue number would lie
2611 outside the length of the aligment
2612 (e.g. column_from_residue_number( "Seq2", 22 )
2614 Note: If the the parent sequence is represented by more than
2615 one alignment sequence and the residue number is present in
2616 them, this method finds only the first one.
2618 Returns : A column number for the position in the alignment of the
2619 given residue in the given sequence (1 = first column)
2620 Args : A sequence id/name (not a name/start-end)
2621 A residue number in the whole sequence (not just that
2622 segment of it in the alignment)
2626 sub column_from_residue_number
{
2627 my ($self, $name, $resnumber) = @_;
2629 $self->throw("No sequence with name [$name]") unless $self->{'_start_end_lists'}->{$name};
2630 $self->throw("Second argument residue number missing") unless $resnumber;
2632 foreach my $seq ($self->each_seq_with_id($name)) {
2635 $col = $seq->column_from_residue_number($resnumber);
2641 $self->throw("Could not find a sequence segment in $name ".
2642 "containing residue number $resnumber");
2646 =head1 Sequence names
2648 Methods to manipulate the display name. The default name based on the
2649 sequence id and subsequence positions can be overridden in various
2655 Usage : $myalign->displayname("Ig", "IgA")
2656 Function : Gets/sets the display name of a sequence in the alignment
2657 Returns : A display name string
2658 Argument : name of the sequence
2659 displayname of the sequence (optional)
2663 sub get_displayname
{
2665 $self->deprecated("get_displayname - deprecated method. Use displayname() instead.");
2666 $self->displayname(@_);
2669 sub set_displayname
{
2671 $self->deprecated("set_displayname - deprecated method. Use displayname() instead.");
2672 $self->displayname(@_);
2676 my ($self, $name, $disname) = @_;
2678 $self->throw("No sequence with name [$name]")
2679 unless defined $self->{'_seq'}->{$name};
2681 if( $disname and $name) {
2682 $self->{'_dis_name'}->{$name} = $disname;
2685 elsif( defined $self->{'_dis_name'}->{$name} ) {
2686 return $self->{'_dis_name'}->{$name};
2692 =head2 set_displayname_count
2694 Title : set_displayname_count
2695 Usage : $ali->set_displayname_count
2696 Function : Sets the names to be name_# where # is the number of
2697 times this name has been used.
2698 Returns : 1, on success
2703 sub set_displayname_count
{
2705 my (@arr,$name,$seq,$count,$temp,$nse);
2707 foreach $seq ( $self->each_alphabetically() ) {
2708 $nse = $seq->get_nse();
2710 #name will be set when this is the second
2711 #time (or greater) is has been seen
2713 if( defined $name and $name eq ($seq->id()) ) {
2714 $temp = sprintf("%s_%s",$name,$count);
2715 $self->displayname($nse,$temp);
2720 $temp = sprintf("%s_%s",$name,$count);
2721 $self->displayname($nse,$temp);
2728 =head2 set_displayname_flat
2730 Title : set_displayname_flat
2731 Usage : $ali->set_displayname_flat()
2732 Function : Makes all the sequences be displayed as just their name,
2739 sub set_displayname_flat
{
2743 foreach $seq ( $self->each_seq() ) {
2744 $nse = $seq->get_nse();
2745 $self->displayname($nse,$seq->id());
2750 =head2 set_displayname_normal
2752 Title : set_displayname_normal
2753 Usage : $ali->set_displayname_normal()
2754 Function : Makes all the sequences be displayed as name/start-end
2755 Returns : 1, on success
2760 sub set_displayname_normal
{
2764 foreach $seq ( $self->each_seq() ) {
2765 $nse = $seq->get_nse();
2766 $self->displayname($nse,$nse);
2774 Usage : $obj->source($newval)
2775 Function: sets the Alignment source program
2777 Returns : value of source
2778 Args : newvalue (optional)
2784 my ($self,$value) = @_;
2785 if( defined $value) {
2786 $self->{'_source'} = $value;
2788 return $self->{'_source'};
2791 =head2 set_displayname_safe
2793 Title : set_displayname_safe
2794 Usage : ($new_aln, $ref_name)=$ali->set_displayname_safe(4)
2795 Function : Assign machine-generated serial names to sequences in input order.
2796 Designed to protect names during PHYLIP runs. Assign 10-char string
2797 in the form of "S000000001" to "S999999999". Restore the original
2798 names using "restore_displayname".
2799 Returns : 1. a new $aln with system names;
2800 2. a hash ref for restoring names
2801 Argument : Number for id length (default 10)
2805 sub set_displayname_safe
{
2807 my $idlength = shift || 10;
2808 my ($seq, %phylip_name);
2810 my $new=Bio
::SimpleAlign
->new();
2811 foreach $seq ( $self->each_seq() ) {
2813 my $pname="S". sprintf "%0" . ($idlength-1) . "s", $ct;
2814 $phylip_name{$pname}=$seq->id();
2815 my $new_seq= Bio
::LocatableSeq
->new(-id
=> $pname,
2816 -seq
=> $seq->seq(),
2817 -alphabet
=> $seq->alphabet,
2818 -start
=> $seq->{_start
},
2819 -end
=> $seq->{_end
}
2821 $new->add_seq($new_seq);
2824 $self->debug("$ct seq names changed. Restore names by using restore_displayname.");
2825 return ($new, \
%phylip_name);
2828 =head2 restore_displayname
2830 Title : restore_displayname
2831 Usage : $aln_name_restored=$ali->restore_displayname($hash_ref)
2832 Function : Restore original sequence names (after running
2833 $ali->set_displayname_safe)
2834 Returns : a new $aln with names restored.
2835 Argument : a hash reference of names from "set_displayname_safe".
2839 sub restore_displayname
{
2843 my $new=Bio
::SimpleAlign
->new();
2844 foreach my $seq ( $self->each_seq() ) {
2845 $self->throw("No sequence with name") unless defined $name{$seq->id()};
2846 my $new_seq= Bio
::LocatableSeq
->new(-id
=> $name{$seq->id()},
2847 -seq
=> $seq->seq(),
2848 -alphabet
=> $seq->alphabet,
2849 -start
=> $seq->{_start
},
2850 -end
=> $seq->{_end
}
2852 $new->add_seq($new_seq);
2857 =head2 sort_by_start
2859 Title : sort_by_start
2860 Usage : $ali->sort_by_start
2861 Function : Changes the order of the alignment to the start position of each
2870 my ($seq,$nse,@arr,%hash,$count);
2871 foreach $seq ( $self->each_seq() ) {
2872 $nse = $seq->get_nse;
2876 %{$self->{'_order'}} = (); # reset the hash;
2877 foreach $nse ( sort _startend
keys %hash) {
2878 $self->{'_order'}->{$count} = $nse;
2886 my ($aname,$arange) = split (/[\/]/,$a);
2887 my ($bname,$brange) = split (/[\/]/,$b);
2888 my ($astart,$aend) = split(/\-/,$arange);
2889 my ($bstart,$bend) = split(/\-/,$brange);
2890 return $astart <=> $bstart;
2893 =head2 bracket_string
2895 Title : bracket_string
2896 Usage : my @params = (-refseq => 'testseq',
2897 -allele1 => 'allele1',
2898 -allele2 => 'allele2',
2899 -delimiters => '{}',
2901 $str = $aln->bracket_string(@params)
2903 Function : When supplied with a list of parameters (see below), returns a
2904 string in BIC format. This is used for allelic comparisons.
2905 Briefly, if either allele contains a base change when compared to
2906 the refseq, the base or gap for each allele is represented in
2907 brackets in the order present in the 'alleles' parameter.
2909 For the following data:
2918 the returned string with parameters 'refseq => testseq' and
2919 'alleles => [qw(allele1 allele2)]' would be:
2921 GGAT[C/-][C/-]ATT[C/C]CT[A/C]CT
2922 Returns : BIC-formatted string
2923 Argument : Required args
2924 refseq : string (ID) of the reference sequence used
2925 as basis for comparison
2926 allele1 : string (ID) of the first allele
2927 allele2 : string (ID) of the second allele
2929 delimiters: two symbol string of left and right delimiters.
2930 Only the first two symbols are used
2932 separator : string used as a separator. Only the first
2935 Throws : On no refseq/alleles, or invalid refseq/alleles.
2939 sub bracket_string
{
2940 my ($self, @args) = @_;
2941 my ($ref, $a1, $a2, $delim, $sep) =
2942 $self->_rearrange([qw(refseq allele1 allele2 delimiters separator)], @args);
2943 $self->throw('Missing refseq/allele1/allele2') if (!$a1 || !$a2 || !$ref);
2945 ($ld, $rd) = split('', $delim, 2) if $delim;
2949 my ($refseq, $allele1, $allele2) =
2950 map {( $self->each_seq_with_id($_) )} ($ref, $a1, $a2);
2951 if (!$refseq || !$allele1 || !$allele2) {
2952 $self->throw("One of your refseq/allele IDs is invalid!");
2954 my $len = $self->length-1;
2956 # loop over the alignment columns
2957 for my $column ( 0 .. $len ) {
2959 my ($compres, $res1, $res2) =
2960 map{substr($_->seq, $column, 1)} ($refseq, $allele1, $allele2);
2961 # are any of the allele symbols different from the refseq?
2962 $string = ($compres eq $res1 && $compres eq $res2) ?
$compres :
2963 $ld.$res1.$sep.$res2.$rd;
2970 =head2 methods implementing Bio::FeatureHolderI
2972 FeatureHolderI implementation to support labeled character sets like one
2973 would get from NEXUS represented data.
2975 =head2 get_SeqFeatures
2977 Usage : @features = $aln->get_SeqFeatures
2978 Function: Get the feature objects held by this feature holder.
2980 Returns : an array of Bio::SeqFeatureI implementing objects
2981 Args : optional filter coderef, taking a Bio::SeqFeatureI
2982 : as argument, returning TRUE if wanted, FALSE if
2987 sub get_SeqFeatures
{
2989 my $filter_cb = shift;
2990 $self->throw("Arg (filter callback) must be a coderef") unless
2991 !defined($filter_cb) or ref($filter_cb) eq 'CODE';
2992 if( !defined $self->{'_as_feat'} ) {
2993 $self->{'_as_feat'} = [];
2996 return grep { $filter_cb->($_) } @
{$self->{'_as_feat'}};
2998 return @
{$self->{'_as_feat'}};
3001 =head2 add_SeqFeature
3003 Usage : $aln->add_SeqFeature($subfeat);
3004 Function: adds a SeqFeature into the SeqFeature array.
3006 Returns : true on success
3007 Args : a Bio::SeqFeatureI object
3008 Note : This implementation is not compliant
3009 with Bio::FeatureHolderI
3013 sub add_SeqFeature
{
3014 my ($self,@feat) = @_;
3016 $self->{'_as_feat'} = [] unless $self->{'_as_feat'};
3018 foreach my $feat ( @feat ) {
3019 if( !$feat->isa("Bio::SeqFeatureI") ) {
3020 $self->throw("$feat is not a SeqFeatureI and that's what we expect...");
3023 push(@
{$self->{'_as_feat'}},$feat);
3029 =head2 remove_SeqFeatures
3031 Usage : $obj->remove_SeqFeatures
3032 Function: Removes all SeqFeatures. If you want to remove only a subset,
3033 remove that subset from the returned array, and add back the rest.
3034 Returns : The array of Bio::SeqFeatureI features that was
3035 deleted from this alignment.
3040 sub remove_SeqFeatures
{
3043 return () unless $self->{'_as_feat'};
3044 my @feats = @
{$self->{'_as_feat'}};
3045 $self->{'_as_feat'} = [];
3049 =head2 feature_count
3051 Title : feature_count
3052 Usage : $obj->feature_count()
3053 Function: Return the number of SeqFeatures attached to the alignment
3054 Returns : integer representing the number of SeqFeatures
3062 if (defined($self->{'_as_feat'})) {
3063 return ($#{$self->{'_as_feat'}} + 1);
3069 =head2 get_all_SeqFeatures
3071 Title : get_all_SeqFeatures
3073 Function: Get all SeqFeatures.
3075 Returns : an array of Bio::SeqFeatureI implementing objects
3077 Note : Falls through to Bio::FeatureHolderI implementation.
3081 =head2 methods for Bio::AnnotatableI
3083 AnnotatableI implementation to support sequence alignments which
3084 contain annotation (NEXUS, Stockholm).
3089 Usage : $ann = $aln->annotation or
3090 $aln->annotation($ann)
3091 Function: Gets or sets the annotation
3092 Returns : Bio::AnnotationCollectionI object
3093 Args : None or Bio::AnnotationCollectionI object
3095 See L<Bio::AnnotationCollectionI> and L<Bio::Annotation::Collection>
3096 for more information
3101 my ($obj,$value) = @_;
3102 if( defined $value ) {
3103 $obj->throw("object of class ".ref($value)." does not implement ".
3104 "Bio::AnnotationCollectionI. Too bad.")
3105 unless $value->isa("Bio::AnnotationCollectionI");
3106 $obj->{'_annotation'} = $value;
3107 } elsif( ! defined $obj->{'_annotation'}) {
3108 $obj->{'_annotation'} = Bio
::Annotation
::Collection
->new();
3110 return $obj->{'_annotation'};
3113 =head1 Deprecated methods
3120 Usage : $no = $ali->no_residues
3121 Function : number of residues in total in the alignment
3124 Note : deprecated in favor of num_residues()
3130 $self->deprecated(-warn_version
=> 1.0069,
3131 -throw_version
=> 1.0075,
3132 -message
=> 'Use of method no_residues() is deprecated, use num_residues() instead');
3133 $self->num_residues(@_);
3138 Title : no_sequences
3139 Usage : $depth = $ali->no_sequences
3140 Function : number of sequence in the sequence alignment
3143 Note : deprecated in favor of num_sequences()
3149 $self->deprecated(-warn_version
=> 1.0069,
3150 -throw_version
=> 1.0075,
3151 -message
=> 'Use of method no_sequences() is deprecated, use num_sequences() instead');
3152 $self->num_sequences(@_);
3157 Title : mask_columns
3158 Usage : $aln2 = $aln->mask_columns(20,30)
3159 Function : Masks a slice of the alignment inclusive of start and
3160 end columns, and the first column in the alignment is denoted 1.
3161 Mask beyond the length of the sequence does not do padding.
3162 Returns : A Bio::SimpleAlign object
3163 Args : Positive integer for start column, positive integer for end column,
3164 optional string value use for the mask. Example:
3166 $aln2 = $aln->mask_columns(20,30,'?')
3167 Note : Masking must use a character that is not used for gaps or
3168 frameshifts. These can be adjusted using the relevant global
3169 variables, but be aware these may be (uncontrollably) modified
3170 elsewhere within BioPerl (see bug 2715)
3175 #based on slice(), but did not include the Bio::Seq::Meta sections as I was not sure what it is doing
3178 my $nonres = $Bio::LocatableSeq
::GAP_SYMBOLS
.
3179 $Bio::LocatableSeq
::FRAMESHIFT_SYMBOLS
;
3181 # coordinates are alignment-based, not sequence-based
3182 my ($start, $end, $mask_char) = @_;
3183 unless (defined $mask_char) { $mask_char = 'N' }
3185 $self->throw("Mask start has to be a positive integer and less than ".
3186 "alignment length, not [$start]")
3187 unless $start =~ /^\d+$/ && $start > 0 && $start <= $self->length;
3188 $self->throw("Mask end has to be a positive integer and less than ".
3189 "alignment length, not [$end]")
3190 unless $end =~ /^\d+$/ && $end > 0 && $end <= $self->length;
3191 $self->throw("Mask start [$start] has to be smaller than or equal to ".
3192 "end [$end]") unless $start <= $end;
3193 $self->throw("Mask character $mask_char has to be a single character ".
3194 "and not a gap or frameshift symbol")
3195 unless CORE
::length($mask_char) == 1 && $mask_char !~ m{$nonres};
3197 my $aln = $self->new;
3198 $aln->id($self->id);
3199 foreach my $seq ( $self->each_seq() ) {
3200 my $new_seq = Bio
::LocatableSeq
->new(-id
=> $seq->id,
3201 -alphabet
=> $seq->alphabet,
3202 -strand
=> $seq->strand,
3203 -verbose
=> $self->verbose);
3205 # convert from 1-based alignment coords!
3206 my $masked_string = substr($seq->seq, $start - 1, $end - $start + 1);
3207 $masked_string =~ s{[^$nonres]}{$mask_char}g;
3208 my $new_dna_string = substr($seq->seq,0,$start-1) . $masked_string . substr($seq->seq,$end);
3209 $new_seq->seq($new_dna_string);
3210 $aln->add_seq($new_seq);