1 # BioPerl module for SimpleAlign
3 # Please direct questions and support issues to <bioperl-l@bioperl.org>
5 # Cared for by Heikki Lehvaslaiho <heikki-at-bioperl-dot-org>
7 # Copyright Ewan Birney
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
14 # 11/3/00 Added threshold feature to consensus and consensus_aa - PS
15 # May 2001 major rewrite - Heikki Lehvaslaiho
19 Bio::SimpleAlign - Multiple alignments held as a set of sequences
23 # Use Bio::AlignIO to read in the alignment
24 $str = Bio::AlignIO->new(-file => 't/data/testaln.pfam');
25 $aln = $str->next_aln();
29 print $aln->num_residues;
31 print $aln->num_sequences;
33 print $aln->percentage_identity;
34 print $aln->consensus_string(50);
36 # Find the position in the alignment for a sequence location
37 $pos = $aln->column_from_residue_number('1433_LYCES', 14); # = 6;
39 # Extract sequences and check values for the alignment column $pos
40 foreach $seq ($aln->each_seq) {
41 $res = $seq->subseq($pos, $pos);
44 foreach $res (keys %count) {
45 printf "Res: %s Count: %2d\n", $res, $count{$res};
49 $aln->remove_seq($seq);
50 $mini_aln = $aln->slice(20,30); # get a block of columns
51 $mini_aln = $aln->select_noncont(1,3,5,7,11); # select certain sequences
52 $new_aln = $aln->remove_columns([20,30]); # remove by position
53 $new_aln = $aln->remove_columns(['mismatch']); # remove by property
56 $str = $aln->consensus_string($threshold_percent);
57 $str = $aln->match_line();
58 $str = $aln->cigar_line();
59 $id = $aln->percentage_identity;
61 # See the module documentation for details and more methods.
65 SimpleAlign is an object that handles a multiple sequence alignment
66 (MSA). It is very permissive of types (it does not insist on sequences
67 being all same length, for example). Think of it as a set of sequences
68 with a whole series of built-in manipulations and methods for reading and
71 SimpleAlign uses L<Bio::LocatableSeq>, a subclass of L<Bio::PrimarySeq>,
72 to store its sequences. These are subsequences with a start and end
73 positions in the parent reference sequence. Each sequence in the
74 SimpleAlign object is a Bio::LocatableSeq.
76 SimpleAlign expects the combination of name, start, and end for a
77 given sequence to be unique in the alignment, and this is the key for the
78 internal hashes (name, start, end are abbreviated C<nse> in the code).
79 However, in some cases people do not want the name/start-end to be displayed:
80 either multiple names in an alignment or names specific to the alignment
81 (ROA1_HUMAN_1, ROA1_HUMAN_2 etc). These names are called
82 C<displayname>, and generally is what is used to print out the
83 alignment. They default to name/start-end.
85 The SimpleAlign Module is derived from the Align module by Ewan Birney.
91 User feedback is an integral part of the evolution of this and other
92 Bioperl modules. Send your comments and suggestions preferably to one
93 of the Bioperl mailing lists. Your participation is much appreciated.
95 bioperl-l@bioperl.org - General discussion
96 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
100 Please direct usage questions or support issues to the mailing list:
102 I<bioperl-l@bioperl.org>
104 rather than to the module maintainer directly. Many experienced and
105 reponsive experts will be able look at the problem and quickly
106 address it. Please include a thorough description of the problem
107 with code and data examples if at all possible.
109 =head2 Reporting Bugs
111 Report bugs to the Bioperl bug tracking system to help us keep track
112 the bugs and their resolution. Bug reports can be submitted via the
115 https://redmine.open-bio.org/projects/bioperl/
119 Ewan Birney, birney@ebi.ac.uk
123 Allen Day, allenday-at-ucla.edu,
124 Richard Adams, Richard.Adams-at-ed.ac.uk,
125 David J. Evans, David.Evans-at-vir.gla.ac.uk,
126 Heikki Lehvaslaiho, heikki-at-bioperl-dot-org,
127 Allen Smith, allens-at-cpan.org,
128 Jason Stajich, jason-at-bioperl.org,
129 Anthony Underwood, aunderwood-at-phls.org.uk,
130 Xintao Wei & Giri Narasimhan, giri-at-cs.fiu.edu
131 Brian Osborne, bosborne at alum.mit.edu
132 Weigang Qiu, Weigang at GENECTR-HUNTER-CUNY-EDU
133 Hongyu Zhang, forward at hongyu.org
134 Jay Hannah, jay at jays.net
135 Alexandr Bezginov, albezg at gmail.com
143 The rest of the documentation details each of the object
144 methods. Internal methods are usually preceded with a _
148 # 'Let the code begin...
150 package Bio
::SimpleAlign
;
151 use vars
qw(%CONSERVATION_GROUPS);
154 use Bio::LocatableSeq; # uses Seq's as list
157 use Bio::SeqFeature::Generic;
160 # This data should probably be in a more centralized module...
161 # it is taken from Clustalw documentation.
162 # These are all the positively scoring groups that occur in the
163 # Gonnet Pam250 matrix. The strong and weak groups are
164 # defined as strong score >0.5 and weak score =<0.5 respectively.
166 %CONSERVATION_GROUPS = (
191 use base
qw(Bio::Root::Root Bio::Align::AlignI Bio::AnnotatableI
192 Bio::FeatureHolderI);
197 Usage : my $aln = Bio::SimpleAlign->new();
198 Function : Creates a new simple align object
199 Returns : Bio::SimpleAlign
200 Args : -source => string representing the source program
201 where this alignment came from
202 -annotation => Bio::AnnotationCollectionI
203 -seq_annotation => Bio::AnnotationCollectionI for sequences (requires -annotation also be set)
204 -seqs => array ref containing Bio::LocatableSeq or Bio::Seq::Meta
205 -consensus => consensus string
206 -consensus_meta => Bio::Seq::Meta object containing consensus met information (kludge)
212 my($class,@args) = @_;
214 my $self = $class->SUPER::new
(@args);
216 my ($src, $score, $id, $acc, $desc, $seqs, $feats, $coll, $sa, $con, $cmeta) = $self->_rearrange([qw(
229 $src && $self->source($src);
230 defined $score && $self->score($score);
231 # we need to set up internal hashs first!
233 $self->{'_seq'} = {};
234 $self->{'_order'} = {};
235 $self->{'_start_end_lists'} = {};
236 $self->{'_dis_name'} = {};
237 $self->{'_id'} = 'NoName';
238 # maybe we should automatically read in from args. Hmmm...
239 $id && $self->id($id);
240 $acc && $self->accession($acc);
241 $desc && $self->description($desc);
242 $coll && $self->annotation($coll);
243 # sequence annotation is layered into a provided annotation collection (or dies)
245 $self->throw("Must supply an alignment-based annotation collection (-annotation) ".
246 "with a sequence annotation collection")
248 $coll->add_Annotation('seq_annotation', $sa);
250 if ($feats && ref $feats eq 'ARRAY') {
251 for my $feat (@
$feats) {
252 $self->add_SeqFeature($feat);
255 $con && $self->consensus($con);
256 $cmeta && $self->consensus_meta($cmeta);
257 # assumes these are in correct alignment order
258 if ($seqs && ref($seqs) eq 'ARRAY') {
259 for my $seq (@
$seqs) {
260 $self->add_seq($seq);
264 return $self; # success - we hope!
267 =head1 Modifier methods
269 These methods modify the MSA by adding, removing or shuffling complete
275 Usage : $myalign->add_seq($newseq);
276 $myalign->add_seq(-SEQ=>$newseq, -ORDER=>5);
277 Function : Adds another sequence to the alignment. *Does not* align
278 it - just adds it to the hashes.
279 If -ORDER is specified, the sequence is inserted at the
280 the position spec'd by -ORDER, and existing sequences
281 are pushed down the storage array.
283 Args : A Bio::LocatableSeq object
284 Positive integer for the sequence position (optional)
286 See L<Bio::LocatableSeq> for more information
292 $self->deprecated("addSeq - deprecated method. Use add_seq() instead.");
299 my ($seq, $order) = $self->_rearrange([qw(SEQ ORDER)], @args);
300 my ($name,$id,$start,$end);
303 $self->throw("LocatableSeq argument required");
305 if( ! ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
306 $self->throw("Unable to process non locatable sequences [". ref($seq). "]");
308 !defined($order) and $order = 1 + keys %{$self->{'_seq'}}; # default
309 $order--; # jay's patch (user-specified order is 1-origin)
312 $self->throw("User-specified value for ORDER must be >= 1");
315 $id = $seq->id() ||$seq->display_id || $seq->primary_id;
317 # build the symbol list for this sequence,
318 # will prune out the gap and missing/match chars
319 # when actually asked for the symbol list in the
321 # map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq) if $seq->seq;
323 $name = $seq->get_nse;
325 if( $self->{'_seq'}->{$name} ) {
326 $self->warn("Replacing one sequence [$name]\n") unless $self->verbose < 0;
329 $self->debug( "Assigning $name to $order\n");
331 my $ordh = $self->{'_order'};
332 if ($ordh->{$order}) {
333 # make space to insert
334 # $c->() returns (in reverse order) the first subsequence
335 # of consecutive integers; i.e., $c->(1,2,3,5,6,7) returns
336 # (3,2,1), and $c->(2,4,5) returns (2).
338 $c = sub { return (($_[1]-$_[0] == 1) ?
($c->(@_[1..$#_]),$_[0]) : $_[0]); };
340 $ordh->{$_+1} = $ordh->{$_}
341 } $c->(sort {$a <=> $b} grep {$_ >= $order} keys %{$ordh});
344 $ordh->{$order} = $name;
346 unless( exists( $self->{'_start_end_lists'}->{$id})) {
347 $self->{'_start_end_lists'}->{$id} = [];
349 push @
{$self->{'_start_end_lists'}->{$id}}, $seq;
352 $self->{'_seq'}->{$name} = $seq;
360 Usage : $aln->remove_seq($seq);
361 Function : Removes a single sequence from an alignment
363 Argument : a Bio::LocatableSeq object
369 $self->deprecated("removeSeq - deprecated method. Use remove_seq() instead.");
370 $self->remove_seq(@_);
378 $self->throw("Need Bio::Locatable seq argument ")
379 unless ref $seq && $seq->isa( 'Bio::LocatableSeq');
382 $name = $seq->get_nse;
384 if( !exists $self->{'_seq'}->{$name} ) {
385 $self->throw("Sequence $name does not exist in the alignment to remove!");
388 delete $self->{'_seq'}->{$name};
390 # we need to remove this seq from the start_end_lists hash
392 if (exists $self->{'_start_end_lists'}->{$id}) {
393 # we need to find the sequence in the array.
396 for ($i=0; $i < @
{$self->{'_start_end_lists'}->{$id}}; $i++) {
397 if (${$self->{'_start_end_lists'}->{$id}}[$i] eq $seq) {
403 splice @
{$self->{'_start_end_lists'}->{$id}}, $i, 1;
406 $self->throw("Could not find the sequence to remoce from the start-end list");
410 $self->throw("There is no seq list for the name $id");
412 # we need to shift order hash
413 my %rev_order = reverse %{$self->{'_order'}};
414 my $no = $rev_order{$name};
415 my $num_sequences = $self->num_sequences;
416 for (; $no < $num_sequences; $no++) {
417 $self->{'_order'}->{$no} = $self->{'_order'}->{$no+1};
419 delete $self->{'_order'}->{$no};
427 Usage : $aln->purge(0.7);
428 Function: Removes sequences above given sequence similarity
429 This function will grind on large alignments. Beware!
431 Returns : An array of the removed sequences
432 Args : float, threshold for similarity
437 my ($self,$perc) = @_;
438 my (%duplicate, @dups);
440 my @seqs = $self->each_seq();
442 for (my $i=0;$i< @seqs - 1;$i++ ) { #for each seq in alignment
445 #skip if already in duplicate hash
446 next if exists $duplicate{$seq->display_id} ;
447 my $one = $seq->seq();
449 my @one = split '', $one; #split to get 1aa per array element
451 for (my $j=$i+1;$j < @seqs;$j++) {
452 my $seq2 = $seqs[$j];
454 #skip if already in duplicate hash
455 next if exists $duplicate{$seq2->display_id} ;
457 my $two = $seq2->seq();
458 my @two = split '', $two;
462 for (my $k=0;$k<@one;$k++) {
463 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
464 $one[$k] eq $two[$k]) {
467 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
468 $two[$k] ne '.' && $two[$k] ne '-' ) {
474 $ratio = $count/$res unless $res == 0;
476 # if above threshold put in duplicate hash and push onto
477 # duplicate array for returning to get_unique
478 if ( $ratio > $perc ) {
479 $self->warn("duplicate: ", $seq2->display_id) if $self->verbose > 0;
480 $duplicate{$seq2->display_id} = 1;
485 foreach my $seq (@dups) {
486 $self->remove_seq($seq);
491 =head2 sort_alphabetically
493 Title : sort_alphabetically
494 Usage : $ali->sort_alphabetically
495 Function : Changes the order of the alignment to alphabetical on name
496 followed by numerical by number.
502 sub sort_alphabetically
{
504 my ($seq,$nse,@arr,%hash,$count);
506 foreach $seq ( $self->each_seq() ) {
507 $nse = $seq->get_nse;
513 %{$self->{'_order'}} = (); # reset the hash;
515 foreach $nse ( sort _alpha_startend
keys %hash) {
516 $self->{'_order'}->{$count} = $nse;
526 Usage : $aln_ordered=$aln->sort_by_list($list_file)
527 Function : Arbitrarily order sequences in an alignment
528 Returns : A new Bio::SimpleAlign object
529 Argument : a file listing sequence names in intended order (one name per line)
534 my ($self, $list) = @_;
535 my (@seq, @ids, %order);
537 foreach my $seq ( $self->each_seq() ) {
539 push @ids, $seq->display_id;
543 open(my $listfh, '<', $list) || $self->throw("can't open file for reading: $list");
547 $self->throw("Not found in alignment: $name") unless &_in_aln
($name, \
@ids);
552 # use the map-sort-map idiom:
553 my @sorted= map { $_->[1] } sort { $a->[0] <=> $b->[0] } map { [$order{$_->id()}, $_] } @seq;
554 my $aln = $self->new;
555 foreach (@sorted) { $aln->add_seq($_) }
559 =head2 set_new_reference
561 Title : set_new_reference
562 Usage : $aln->set_new_reference(3 or 'B31'): Select the 3rd sequence, or
563 the sequence whoes name is "B31" (full, exact, and case-sensitive),
564 as the reference (1st) sequence
565 Function : Change/Set a new reference (i.e., the first) sequence
566 Returns : a new Bio::SimpleAlign object.
567 Throws an exception if designated sequence not found
568 Argument : a positive integer of sequence order, or a sequence name
569 in the original alignment
573 sub set_new_reference
{
574 my ($self, $seqid) = @_;
575 my $aln = $self->new;
576 my (@seq, @ids, @new_seq);
578 foreach my $seq ( $self->each_seq() ) {
580 push @ids, $seq->display_id;
583 if ($seqid =~ /^\d+$/) { # argument is seq position
585 $self->throw("The new reference sequence number has to be a positive integer >1 and <= num_sequences ") if ($seqid <= 1 || $seqid > $self->num_sequences);
586 } else { # argument is a seq name
587 $self->throw("The new reference sequence not in alignment ") unless &_in_aln
($seqid, \
@ids);
590 for (my $i=0; $i<=$#seq; $i++) {
592 if ( ($is_num && $pos == $seqid) || ($seqid eq $seq[$i]->display_id) ) {
593 unshift @new_seq, $seq[$i];
595 push @new_seq, $seq[$i];
598 foreach (@new_seq) { $aln->add_seq($_); }
602 sub _in_aln
{ # check if input name exists in the alignment
603 my ($str, $ref) = @_;
605 return 1 if $str eq $_;
614 Usage : $aln->uniq_seq(): Remove identical sequences in
615 in the alignment. Ambiguous base ("N", "n") and
616 leading and ending gaps ("-") are NOT counted as
618 Function : Make a new alignment of unique sequence types (STs)
619 Returns : 1a. if called in a scalar context,
620 a new Bio::SimpleAlign object (all sequences renamed as "ST")
621 1b. if called in an array context,
622 a new Bio::SimpleAlign object, and a hashref whose keys
623 are sequence types, and whose values are arrayrefs to
624 lists of sequence ids within the corresponding sequence type
625 2. if $aln->verbose > 0, ST of each sequence is sent to
626 STDERR (in a tabular format)
632 my ($self, $seqid) = @_;
633 my $aln = $self->new;
634 my (%member, %order, @seq, @uniq_str, $st);
636 my $len = $self->length();
638 foreach my $seq ( $self->each_seq() ) {
639 my $str = $seq->seq();
641 # it's necessary to ignore "n", "N", leading gaps and ending gaps in
642 # comparing two sequence strings
644 # 1st, convert "n", "N" to "?" (for DNA sequence only):
645 $str =~ s/n/\?/gi if $str =~ /^[atcgn-]+$/i;
646 # 2nd, convert leading and ending gaps to "?":
647 $str = &_convert_leading_ending_gaps
($str, '-', '?');
648 # Note that '?' also can mean unknown residue.
649 # I don't like making global class member changes like this, too
650 # prone to errors... -- cjfields 08-11-18
651 local $Bio::LocatableSeq
::GAP_SYMBOLS
= '-\?';
652 my $new = Bio
::LocatableSeq
->new(
654 -alphabet
=> $seq->alphabet,
656 -start
=> $seq->start,
662 foreach my $seq (@seq) {
663 my $str = $seq->seq();
664 my ($seen, $key) = &_check_uniq
($str, \
@uniq_str, $len);
665 if ($seen) { # seen before
666 my @memb = @
{$member{$key}};
668 $member{$key} = \
@memb;
670 push @uniq_str, $key;
672 $member{$key} = [ ($seq) ];
673 $order{$key} = $order;
677 foreach my $str (sort {$order{$a} <=> $order{$b}} keys %order) { # sort by input order
678 # convert leading/ending "?" back into "-" ("?" throws errors by SimpleAlign):
679 my $str2 = &_convert_leading_ending_gaps
($str, '?', '-');
680 # convert middle "?" back into "N" ("?" throws errors by SimpleAlign):
681 $str2 =~ s/\?/N/g if $str2 =~ /^[atcg\-\?]+$/i;
683 my $end= CORE
::length($str2);
684 $end -= CORE
::length($1) while $str2 =~ m/($gap+)/g;
685 my $new = Bio
::LocatableSeq
->new(-id
=>"ST".$order{$str},
691 foreach (@
{$member{$str}}) {
692 push @
{$$st{$order{$str}}}, $_->id(); # per Tristan's patch/Bug #2805
693 $self->debug($_->id(), "\t", "ST", $order{$str}, "\n");
696 return wantarray ?
($aln, $st) : $aln;
699 sub _check_uniq
{ # check if same seq exists in the alignment
700 my ($str1, $ref, $length) = @_;
701 my @char1=split //, $str1;
704 return (0, $str1) if @array==0; # not seen (1st sequence)
706 foreach my $str2 (@array) {
708 my @char2=split //, $str2;
709 for (my $i=0; $i<=$length-1; $i++) {
710 next if $char1[$i] eq '?';
711 next if $char2[$i] eq '?';
712 $diff++ if $char1[$i] ne $char2[$i];
714 return (1, $str2) if $diff == 0; # seen before
717 return (0, $str1); # not seen
720 sub _convert_leading_ending_gaps
{
724 my @array=split //, $s;
725 # convert leading char:
726 for (my $i=0; $i<=$#array; $i++) {
727 ($array[$i] eq $sym1) ?
($array[$i] = $sym2):(last);
729 # convert ending char:
730 for (my $i = $#array; $i>= 0; $i--) {
731 ($array[$i] eq $sym1) ?
($array[$i] = $sym2):(last);
733 my $s_new=join '', @array;
737 =head1 Sequence selection methods
739 Methods returning one or more sequences objects.
744 Usage : foreach $seq ( $align->each_seq() )
745 Function : Gets a Seq object from the alignment
753 $self->deprecated("eachSeq - deprecated method. Use each_seq() instead.");
761 foreach $order ( sort { $a <=> $b } keys %{$self->{'_order'}} ) {
762 if( exists $self->{'_seq'}->{$self->{'_order'}->{$order}} ) {
763 push(@arr,$self->{'_seq'}->{$self->{'_order'}->{$order}});
770 =head2 each_alphabetically
772 Title : each_alphabetically
773 Usage : foreach $seq ( $ali->each_alphabetically() )
774 Function : Returns a sequence object, but the objects are returned
775 in alphabetically sorted order.
776 Does not change the order of the alignment.
782 sub each_alphabetically
{
784 my ($seq,$nse,@arr,%hash,$count);
786 foreach $seq ( $self->each_seq() ) {
787 $nse = $seq->get_nse;
791 foreach $nse ( sort _alpha_startend
keys %hash) {
792 push(@arr,$hash{$nse});
797 sub _alpha_startend
{
798 my ($aname,$astart,$bname,$bstart);
799 ($aname,$astart) = split (/-/,$a);
800 ($bname,$bstart) = split (/-/,$b);
802 if( $aname eq $bname ) {
803 return $astart <=> $bstart;
806 return $aname cmp $bname;
810 =head2 each_seq_with_id
812 Title : each_seq_with_id
813 Usage : foreach $seq ( $align->each_seq_with_id() )
814 Function : Gets a Seq objects from the alignment, the contents
815 being those sequences with the given name (there may be
818 Argument : a seq name
824 $self->deprecated("eachSeqWithId - deprecated method. Use each_seq_with_id() instead.");
825 $self->each_seq_with_id(@_);
828 sub each_seq_with_id
{
832 $self->throw("Method each_seq_with_id needs a sequence name argument")
837 if (exists($self->{'_start_end_lists'}->{$id})) {
838 @arr = @
{$self->{'_start_end_lists'}->{$id}};
843 =head2 get_seq_by_pos
845 Title : get_seq_by_pos
846 Usage : $seq = $aln->get_seq_by_pos(3) # third sequence from the alignment
847 Function : Gets a sequence based on its position in the alignment.
848 Numbering starts from 1. Sequence positions larger than
849 num_sequences() will throw an error.
850 Returns : a Bio::LocatableSeq object
851 Args : positive integer for the sequence position
860 $self->throw("Sequence position has to be a positive integer, not [$pos]")
861 unless $pos =~ /^\d+$/ and $pos > 0;
862 $self->throw("No sequence at position [$pos]")
863 unless $pos <= $self->num_sequences ;
865 my $nse = $self->{'_order'}->{--$pos};
866 return $self->{'_seq'}->{$nse};
871 Title : get_seq_by_id
872 Usage : $seq = $aln->get_seq_by_id($name) # seq named $name
873 Function : Gets a sequence based on its name.
874 Sequences that do not exist will warn and return undef
875 Returns : a Bio::LocatableSeq object
876 Args : string for sequence name
881 my ($self,$name) = @_;
882 unless( defined $name ) {
883 $self->warn("Must provide a sequence name");
886 for my $seq ( values %{$self->{'_seq'}} ) {
887 if ( $seq->id eq $name) {
894 =head2 seq_with_features
896 Title : seq_with_features
897 Usage : $seq = $aln->seq_with_features(-pos => 1,
900 sub { my $consensus = shift;
905 while($consensus =~ /[^?]$q[^?]/){
906 $consensus =~ s/([^?])$q([^?])/$1$n$2/;
912 Function: produces a Bio::Seq object by first splicing gaps from -pos
913 (by means of a splice_by_seq_pos() call), then creating
914 features using non-? chars (by means of a consensus_string()
915 call with stringency -consensus).
916 Returns : a Bio::Seq object
917 Args : -pos : required. sequence from which to build the Bio::Seq
919 -consensus : optional, defaults to consensus_string()'s
921 -mask : optional, a coderef to apply to consensus_string()'s
922 output before building features. this may be useful for
923 closing gaps of 1 bp by masking over them with N, for
928 sub seq_with_features
{
929 my ($self,%arg) = @_;
931 #first do the preparatory splice
932 $self->throw("must provide a -pos argument") unless $arg{-pos};
933 $self->splice_by_seq_pos($arg{-pos});
935 my $consensus_string = $self->consensus_string($arg{-consensus
});
936 $consensus_string = $arg{-mask
}->($consensus_string)
937 if defined($arg{-mask
});
941 push @bs, 1 if $consensus_string =~ /^[^?]/;
943 while($consensus_string =~ /\?[^?]/g){
944 push @bs, pos($consensus_string);
946 while($consensus_string =~ /[^?]\?/g){
947 push @es, pos($consensus_string);
950 push @es, CORE
::length($consensus_string) if $consensus_string =~ /[^?]$/;
952 my $seq = Bio
::Seq
->new();
954 # my $rootfeature = Bio::SeqFeature::Generic->new(
955 # -source_tag => 'location',
956 # -start => $self->get_seq_by_pos($arg{-pos})->start,
957 # -end => $self->get_seq_by_pos($arg{-pos})->end,
959 # $seq->add_SeqFeature($rootfeature);
961 while(my $b = shift @bs){
963 $seq->add_SeqFeature(
964 Bio
::SeqFeature
::Generic
->new(
965 -start
=> $b - 1 + $self->get_seq_by_pos($arg{-pos})->start,
966 -end
=> $e - 1 + $self->get_seq_by_pos($arg{-pos})->start,
967 -source_tag
=> $self->source || 'MSA',
976 =head1 Create new alignments
978 The result of these methods are horizontal or vertical subsets of the
984 Usage : $aln2 = $aln->select(1, 3) # three first sequences
985 Function : Creates a new alignment from a continuous subset of
986 sequences. Numbering starts from 1. Sequence positions
987 larger than num_sequences() will throw an error.
988 Returns : a Bio::SimpleAlign object
989 Args : positive integer for the first sequence
990 positive integer for the last sequence to include (optional)
996 my ($start, $end) = @_;
998 $self->throw("Select start has to be a positive integer, not [$start]")
999 unless $start =~ /^\d+$/ and $start > 0;
1000 $self->throw("Select end has to be a positive integer, not [$end]")
1001 unless $end =~ /^\d+$/ and $end > 0;
1002 $self->throw("Select $start [$start] has to be smaller than or equal to end [$end]")
1003 unless $start <= $end;
1005 my $aln = $self->new;
1006 foreach my $pos ($start .. $end) {
1007 $aln->add_seq($self->get_seq_by_pos($pos));
1009 $aln->id($self->id);
1010 # fix for meta, sf, ann
1014 =head2 select_noncont
1016 Title : select_noncont
1017 Usage : # 1st and 3rd sequences, sorted
1018 $aln2 = $aln->select_noncont(1, 3)
1020 # 1st and 3rd sequences, sorted (same as first)
1021 $aln2 = $aln->select_noncont(3, 1)
1023 # 1st and 3rd sequences, unsorted
1024 $aln2 = $aln->select_noncont('nosort',3, 1)
1026 Function : Creates a new alignment from a subset of sequences. Numbering
1027 starts from 1. Sequence positions larger than num_sequences() will
1028 throw an error. Sorts the order added to new alignment by default,
1029 to prevent sorting pass 'nosort' as the first argument in the list.
1030 Returns : a Bio::SimpleAlign object
1031 Args : array of integers for the sequences. If the string 'nosort' is
1032 passed as the first argument, the sequences will not be sorted
1033 in the new alignment but will appear in the order listed.
1037 sub select_noncont
{
1041 if ($pos[0] !~ m{^\d+$}) {
1042 my $sortcmd = shift @pos;
1043 if ($sortcmd eq 'nosort') {
1046 $self->throw("Command not recognized: $sortcmd. Only 'nosort' implemented at this time.");
1050 my $end = $self->num_sequences;
1052 $self->throw("position must be a positive integer, > 0 and <= $end not [$_]")
1053 unless( /^\d+$/ && $_ > 0 && $_ <= $end );
1056 @pos = sort {$a <=> $b} @pos unless $nosort;
1058 my $aln = $self->new;
1059 foreach my $p (@pos) {
1060 $aln->add_seq($self->get_seq_by_pos($p));
1062 $aln->id($self->id);
1063 # fix for meta, sf, ann
1067 =head2 select_noncont_by_name
1069 Title : select_noncont_by_name
1070 Usage : my $aln2 = $aln->select_noncont_by_name('A123', 'B456');
1071 Function : Creates a new alignment from a subset of sequences which are
1072 selected by name (sequence ID).
1073 Returns : a Bio::SimpleAlign object
1074 Args : array of names (i.e., identifiers) for the sequences.
1078 sub select_noncont_by_name
{
1079 my ($self, @names) = @_;
1081 my $aln = $self->new;
1082 foreach my $name (@names) {
1083 $aln->add_seq($self->get_seq_by_id($name));
1085 $aln->id($self->id);
1093 Usage : $aln2 = $aln->slice(20,30)
1094 Function : Creates a slice from the alignment inclusive of start and
1095 end columns, and the first column in the alignment is denoted 1.
1096 Sequences with no residues in the slice are excluded from the
1097 new alignment and a warning is printed. Slice beyond the length of
1098 the sequence does not do padding.
1099 Returns : A Bio::SimpleAlign object
1100 Args : Positive integer for start column, positive integer for end column,
1101 optional boolean which if true will keep gap-only columns in the newly
1102 created slice. Example:
1104 $aln2 = $aln->slice(20,30,1)
1110 my ($start, $end, $keep_gap_only) = @_;
1112 $self->throw("Slice start has to be a positive integer, not [$start]")
1113 unless $start =~ /^\d+$/ and $start > 0;
1114 $self->throw("Slice end has to be a positive integer, not [$end]")
1115 unless $end =~ /^\d+$/ and $end > 0;
1116 $self->throw("Slice start [$start] has to be smaller than or equal to end [$end]")
1117 unless $start <= $end;
1118 $self->throw("This alignment has only ". $self->length . " residues. Slice start " .
1119 "[$start] is too big.") if $start > $self->length;
1120 my $cons_meta = $self->consensus_meta;
1121 my $aln = $self->new;
1122 $aln->id($self->id);
1123 foreach my $seq ( $self->each_seq() ) {
1124 my $new_seq = $seq->isa('Bio::Seq::MetaI') ?
1127 -alphabet
=> $seq->alphabet,
1128 -strand
=> $seq->strand,
1129 -verbose
=> $self->verbose) :
1130 Bio
::LocatableSeq
->new
1132 -alphabet
=> $seq->alphabet,
1133 -strand
=> $seq->strand,
1134 -verbose
=> $self->verbose);
1138 $seq_end = $seq->length if( $end > $seq->length );
1140 my $slice_seq = $seq->subseq($start, $seq_end);
1141 $new_seq->seq( $slice_seq );
1143 $slice_seq =~ s/\W//g;
1146 my $pre_start_seq = $seq->subseq(1, $start - 1);
1147 $pre_start_seq =~ s/\W//g;
1148 if (!defined($seq->strand)) {
1149 $new_seq->start( $seq->start + CORE
::length($pre_start_seq) );
1150 } elsif ($seq->strand < 0){
1151 $new_seq->start( $seq->end - CORE
::length($pre_start_seq) - CORE
::length($slice_seq) + 1);
1153 $new_seq->start( $seq->start + CORE
::length($pre_start_seq) );
1156 if ((defined $seq->strand)&&($seq->strand < 0)){
1157 $new_seq->start( $seq->end - CORE
::length($slice_seq) + 1);
1159 $new_seq->start( $seq->start);
1162 if ($new_seq->isa('Bio::Seq::MetaI')) {
1163 for my $meta_name ($seq->meta_names) {
1164 $new_seq->named_meta($meta_name, $seq->named_submeta($meta_name, $start, $end));
1167 $new_seq->end( $new_seq->start + CORE
::length($slice_seq) - 1 );
1169 if ($new_seq->start and $new_seq->end >= $new_seq->start) {
1170 $aln->add_seq($new_seq);
1172 if( $keep_gap_only ) {
1173 $aln->add_seq($new_seq);
1175 my $nse = $seq->get_nse();
1176 $self->warn("Slice [$start-$end] of sequence [$nse] contains no residues.".
1177 " Sequence excluded from the new alignment.");
1182 my $new = Bio
::Seq
::Meta
->new();
1183 for my $meta_name ($cons_meta->meta_names) {
1184 $new->named_meta($meta_name, $cons_meta->named_submeta($meta_name, $start, $end));
1186 $aln->consensus_meta($new);
1188 $aln->annotation($self->annotation);
1189 # fix for meta, sf, ann
1193 =head2 remove_columns
1195 Title : remove_columns
1196 Usage : $aln2 = $aln->remove_columns(['mismatch','weak']) or
1197 $aln2 = $aln->remove_columns([0,0],[6,8])
1198 Function : Creates an aligment with columns removed corresponding to
1199 the specified type or by specifying the columns by number.
1200 Returns : Bio::SimpleAlign object
1201 Args : Array ref of types ('match'|'weak'|'strong'|'mismatch'|'gaps'|
1202 'all_gaps_columns') or array ref where the referenced array
1203 contains a pair of integers that specify a range.
1204 The first column is 0
1208 sub remove_columns
{
1209 my ($self,@args) = @_;
1210 @args || $self->throw("Must supply column ranges or column types");
1213 if ($args[0][0] =~ /^[a-z_]+$/i) {
1214 $aln = $self->_remove_columns_by_type($args[0]);
1215 } elsif ($args[0][0] =~ /^\d+$/) {
1216 $aln = $self->_remove_columns_by_num(\
@args);
1218 $self->throw("You must pass array references to remove_columns(), not @args");
1220 # fix for meta, sf, ann
1228 Usage : $aln2 = $aln->remove_gaps
1229 Function : Creates an aligment with gaps removed
1230 Returns : a Bio::SimpleAlign object
1231 Args : a gap character(optional) if none specified taken
1232 from $self->gap_char,
1233 [optional] $all_gaps_columns flag (1 or 0, default is 0)
1234 indicates that only all-gaps columns should be deleted
1236 Used from method L<remove_columns> in most cases. Set gap character
1237 using L<gap_char()|gap_char>.
1242 my ($self,$gapchar,$all_gaps_columns) = @_;
1244 if ($all_gaps_columns) {
1245 $gap_line = $self->all_gap_line($gapchar);
1247 $gap_line = $self->gap_line($gapchar);
1249 my $aln = $self->new;
1253 my $del_char = $gapchar || $self->gap_char;
1254 # Do the matching to get the segments to remove
1255 while ($gap_line =~ m/[$del_char]/g) {
1256 my $start = pos($gap_line)-1;
1257 $gap_line =~ m/\G[$del_char]+/gc;
1258 my $end = pos($gap_line)-1;
1260 #have to offset the start and end for subsequent removes
1263 $length += ($end-$start+1);
1264 push @remove, [$start,$end];
1267 #remove the segments
1268 $aln = $#remove >= 0 ?
$self->_remove_col($aln,\
@remove) : $self;
1269 # fix for meta, sf, ann
1275 my ($self,$aln,$remove) = @_;
1278 my $gap = $self->gap_char;
1280 # splice out the segments and create new seq
1281 foreach my $seq($self->each_seq){
1282 my $new_seq = Bio
::LocatableSeq
->new(
1284 -alphabet
=> $seq->alphabet,
1285 -strand
=> $seq->strand,
1286 -verbose
=> $self->verbose);
1287 my $sequence = $seq->seq;
1288 foreach my $pair(@
{$remove}){
1289 my $start = $pair->[0];
1290 my $end = $pair->[1];
1291 $sequence = $seq->seq unless $sequence;
1292 my $orig = $sequence;
1293 my $head = $start > 0 ?
substr($sequence, 0, $start) : '';
1294 my $tail = ($end + 1) >= CORE
::length($sequence) ?
'' : substr($sequence, $end + 1);
1295 $sequence = $head.$tail;
1297 unless (defined $new_seq->start) {
1299 my $start_adjust = () = substr($orig, 0, $end + 1) =~ /$gap/g;
1300 $new_seq->start($seq->start + $end + 1 - $start_adjust);
1303 my $start_adjust = $orig =~ /^$gap+/;
1304 if ($start_adjust) {
1305 $start_adjust = $+[0] == $start;
1307 $new_seq->start($seq->start + $start_adjust);
1311 if (($end + 1) >= CORE
::length($orig)) {
1312 my $end_adjust = () = substr($orig, $start) =~ /$gap/g;
1313 $new_seq->end($seq->end - (CORE
::length($orig) - $start) + $end_adjust);
1316 $new_seq->end($seq->end);
1320 if ($new_seq->end < $new_seq->start) {
1321 # we removed all columns except for gaps: set to 0 to indicate no
1327 $new_seq->seq($sequence) if $sequence;
1328 push @new, $new_seq;
1330 # add the new seqs to the alignment
1331 foreach my $new(@new){
1332 $aln->add_seq($new);
1334 # fix for meta, sf, ann
1338 sub _remove_columns_by_type
{
1339 my ($self,$type) = @_;
1340 my $aln = $self->new;
1343 my $gap = $self->gap_char if (grep { $_ eq 'gaps'} @
{$type});
1344 my $all_gaps_columns = $self->gap_char if (grep /all_gaps_columns/,@
{$type});
1345 my %matchchars = ( 'match' => '\*',
1350 'all_gaps_columns' => ''
1352 # get the characters to delete against
1354 foreach my $type (@
{$type}){
1355 $del_char.= $matchchars{$type};
1359 my $match_line = $self->match_line;
1360 # do the matching to get the segments to remove
1362 while($match_line =~ m/[$del_char]/g ){
1363 my $start = pos($match_line)-1;
1364 $match_line=~/\G[$del_char]+/gc;
1365 my $end = pos($match_line)-1;
1367 #have to offset the start and end for subsequent removes
1370 $length += ($end-$start+1);
1371 push @remove, [$start,$end];
1375 # remove the segments
1376 $aln = $#remove >= 0 ?
$self->_remove_col($aln,\
@remove) : $self;
1377 $aln = $aln->remove_gaps() if $gap;
1378 $aln = $aln->remove_gaps('', 1) if $all_gaps_columns;
1379 # fix for meta, sf, ann
1384 sub _remove_columns_by_num
{
1385 my ($self,$positions) = @_;
1386 my $aln = $self->new;
1388 # sort the positions
1389 @
$positions = sort { $a->[0] <=> $b->[0] } @
$positions;
1393 foreach my $pos (@
{$positions}) {
1394 my ($start, $end) = @
{$pos};
1396 #have to offset the start and end for subsequent removes
1399 $length += ($end-$start+1);
1400 push @remove, [$start,$end];
1403 #remove the segments
1404 $aln = $#remove >= 0 ?
$self->_remove_col($aln,\
@remove) : $self;
1405 # fix for meta, sf, ann
1410 =head1 Change sequences within the MSA
1412 These methods affect characters in all sequences without changing the
1415 =head2 splice_by_seq_pos
1417 Title : splice_by_seq_pos
1418 Usage : $status = splice_by_seq_pos(1);
1419 Function: splices all aligned sequences where the specified sequence
1422 Returns : 1 on success
1423 Args : position of sequence to splice by
1428 sub splice_by_seq_pos
{
1429 my ($self,$pos) = @_;
1431 my $guide = $self->get_seq_by_pos($pos);
1432 my $guide_seq = $guide->seq;
1434 $guide_seq =~ s/\./\-/g;
1438 while(($pos = index($guide_seq, '-', $pos)) > -1 ){
1439 unshift @gaps, $pos;
1443 foreach my $seq ($self->each_seq){
1444 my @bases = split '', $seq->seq;
1446 splice(@bases, $_, 1) foreach @gaps;
1447 $seq->seq(join('', @bases));
1456 Usage : $ali->map_chars('\.','-')
1457 Function : Does a s/$arg1/$arg2/ on the sequences. Useful for gap
1460 Note that the first argument is interpreted as a regexp
1461 so be careful and escape any wild card characters (e.g.
1462 do $ali->map_chars('\.','-') to replace periods with dashes.
1463 Returns : 1 on success
1464 Argument : A regexp and a string
1474 $self->throw("Need two arguments: a regexp and a string")
1475 unless defined $from and defined $to;
1477 foreach $seq ( $self->each_seq() ) {
1478 $temp = $seq->seq();
1479 $temp =~ s/$from/$to/g;
1489 Usage : $ali->uppercase()
1490 Function : Sets all the sequences to uppercase
1491 Returns : 1 on success
1501 foreach $seq ( $self->each_seq() ) {
1502 $temp = $seq->seq();
1503 $temp =~ tr/[a-z]/[A-Z]/;
1512 Title : cigar_line()
1513 Usage : %cigars = $align->cigar_line()
1514 Function : Generates a "cigar" (Compact Idiosyncratic Gapped Alignment
1515 Report) line for each sequence in the alignment. Examples are
1516 "1,60" or "5,10:12,58", where the numbers refer to conserved
1517 positions within the alignment. The keys of the hash are the
1518 NSEs (name/start/end) assigned to each sequence.
1519 Args : threshold (optional, defaults to 100)
1520 Returns : Hash of strings (cigar lines)
1529 my @consensus = split "",($self->consensus_string($thr));
1530 my $len = $self->length;
1531 my $gapchar = $self->gap_char;
1533 # create a precursor, something like (1,4,5,6,7,33,45),
1534 # where each number corresponds to a conserved position
1535 foreach my $seq ( $self->each_seq ) {
1536 my @seq = split "", uc ($seq->seq);
1538 for (my $x = 0 ; $x < $len ; $x++ ) {
1539 if ($seq[$x] eq $consensus[$x]) {
1540 push @
{$cigars{$seq->get_nse}},$pos;
1542 } elsif ($seq[$x] ne $gapchar) {
1547 # duplicate numbers - (1,4,5,6,7,33,45) becomes (1,1,4,5,6,7,33,33,45,45)
1548 for my $name (keys %cigars) {
1549 splice @
{$cigars{$name}}, 1, 0, ${$cigars{$name}}[0] if
1550 ( ${$cigars{$name}}[0] + 1 < ${$cigars{$name}}[1] );
1551 push @
{$cigars{$name}}, ${$cigars{$name}}[$#{$cigars{$name}}] if
1552 ( ${$cigars{$name}}[($#{$cigars{$name}} - 1)] + 1 <
1553 ${$cigars{$name}}[$#{$cigars{$name}}] );
1554 for ( my $x = 1 ; $x < $#{$cigars{$name}} - 1 ; $x++) {
1555 if (${$cigars{$name}}[$x - 1] + 1 < ${$cigars{$name}}[$x] &&
1556 ${$cigars{$name}}[$x + 1] > ${$cigars{$name}}[$x] + 1) {
1557 splice @
{$cigars{$name}}, $x, 0, ${$cigars{$name}}[$x];
1561 # collapse series - (1,1,4,5,6,7,33,33,45,45) becomes (1,1,4,7,33,33,45,45)
1562 for my $name (keys %cigars) {
1564 for ( my $x = 0 ; $x < $#{$cigars{$name}} ; $x++) {
1565 if ( ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x - 1)] + 1 &&
1566 ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x + 1)] - 1 ) {
1570 for my $pos (@remove) {
1571 splice @
{$cigars{$name}}, $pos, 1;
1574 # join and punctuate
1575 for my $name (keys %cigars) {
1576 my ($start,$end,$str) = "";
1577 while ( ($start,$end) = splice @
{$cigars{$name}}, 0, 2 ) {
1578 $str .= ($start . "," . $end . ":");
1581 $cigars{$name} = $str;
1589 Title : match_line()
1590 Usage : $line = $align->match_line()
1591 Function : Generates a match line - much like consensus string
1592 except that a line indicating the '*' for a match.
1593 Args : (optional) Match line characters ('*' by default)
1594 (optional) Strong match char (':' by default)
1595 (optional) Weak match char ('.' by default)
1601 my ($self,$matchlinechar, $strong, $weak) = @_;
1602 my %matchchars = ('match' => $matchlinechar || '*',
1603 'weak' => $weak || '.',
1604 'strong' => $strong || ':',
1610 foreach my $seq ( $self->each_seq ) {
1611 push @seqchars, [ split(//, uc ($seq->seq)) ];
1612 $alphabet = $seq->alphabet unless defined $alphabet;
1614 my $refseq = shift @seqchars;
1615 # let's just march down the columns
1618 foreach my $pos ( 0..$self->length ) {
1619 my $refchar = $refseq->[$pos];
1620 my $char = $matchchars{'mismatch'};
1621 unless( defined $refchar ) {
1622 last if $pos == $self->length; # short circuit on last residue
1623 # this in place to handle jason's soon-to-be-committed
1624 # intron mapping code
1627 my %col = ($refchar => 1);
1628 my $dash = ($refchar eq '-' || $refchar eq '.' || $refchar eq ' ');
1629 foreach my $seq ( @seqchars ) {
1630 next if $pos >= scalar @
$seq;
1631 $dash = 1 if( $seq->[$pos] eq '-' || $seq->[$pos] eq '.' ||
1632 $seq->[$pos] eq ' ' );
1633 $col{$seq->[$pos]}++ if defined $seq->[$pos];
1635 my @colresidues = sort keys %col;
1637 # if all the values are the same
1638 if( $dash ) { $char = $matchchars{'mismatch'} }
1639 elsif( @colresidues == 1 ) { $char = $matchchars{'match'} }
1640 elsif( $alphabet eq 'protein' ) { # only try to do weak/strong
1641 # matches for protein seqs
1643 foreach my $type ( qw(strong weak) ) {
1644 # iterate through categories
1646 # iterate through each of the aa in the col
1647 # look to see which groups it is in
1648 foreach my $c ( @colresidues ) {
1649 foreach my $f ( grep { index($_,$c) >= 0 } @
{$CONSERVATION_GROUPS{$type}} ) {
1650 push @
{$groups{$f}},$c;
1654 foreach my $cols ( values %groups ) {
1655 @
$cols = sort @
$cols;
1656 # now we are just testing to see if two arrays
1657 # are identical w/o changing either one
1658 # have to be same len
1659 next if( scalar @
$cols != scalar @colresidues );
1660 # walk down the length and check each slot
1661 for($_=0;$_ < (scalar @
$cols);$_++ ) {
1662 next GRP
if( $cols->[$_] ne $colresidues[$_] );
1664 $char = $matchchars{$type};
1670 $matchline .= $char;
1679 Usage : $line = $align->gap_line()
1680 Function : Generates a gap line - much like consensus string
1681 except that a line where '-' represents gap
1682 Args : (optional) gap line characters ('-' by default)
1688 my ($self,$gapchar) = @_;
1689 $gapchar = $gapchar || $self->gap_char;
1690 my %gap_hsh; # column gaps vector
1691 foreach my $seq ( $self->each_seq ) {
1693 map {$gap_hsh{$_->[0]} = undef} grep {$_->[1] =~ m/[$gapchar]/}
1694 map {[$i++, $_]} split(//, uc ($seq->seq));
1697 foreach my $pos ( 0..$self->length-1 ) {
1698 $gap_line .= (exists $gap_hsh{$pos}) ?
$self->gap_char:'.';
1705 Title : all_gap_line()
1706 Usage : $line = $align->all_gap_line()
1707 Function : Generates a gap line - much like consensus string
1708 except that a line where '-' represents all-gap column
1709 Args : (optional) gap line characters ('-' by default)
1715 my ($self,$gapchar) = @_;
1716 $gapchar = $gapchar || $self->gap_char;
1717 my %gap_hsh; # column gaps counter hash
1718 my @seqs = $self->each_seq;
1719 foreach my $seq ( @seqs ) {
1721 map {$gap_hsh{$_->[0]}++} grep {$_->[1] =~ m/[$gapchar]/}
1722 map {[$i++, $_]} split(//, uc ($seq->seq));
1725 foreach my $pos ( 0..$self->length-1 ) {
1726 if (exists $gap_hsh{$pos} && $gap_hsh{$pos} == scalar @seqs) {
1728 $gap_line .= $self->gap_char;
1736 =head2 gap_col_matrix
1738 Title : gap_col_matrix()
1739 Usage : my $cols = $align->gap_col_matrix()
1740 Function : Generates an array where each element in the array is a
1741 hash reference with a key of the sequence name and a
1742 value of 1 if the sequence has a gap at that column
1743 Returns : Reference to an array
1744 Args : Optional: gap line character ($aln->gap_char or '-' by default)
1748 sub gap_col_matrix
{
1749 my ( $self, $gapchar ) = @_;
1750 $gapchar = $gapchar || $self->gap_char;
1751 my %gap_hsh; # column gaps vector
1753 foreach my $seq ( $self->each_seq ) {
1755 my $str = $seq->seq;
1756 my $len = $seq->length;
1758 my $id = $seq->display_id;
1759 while ( $i < $len ) {
1760 $ch = substr( $str, $i, 1 );
1761 $cols[ $i++ ]->{$id} = ( $ch =~ m/[$gapchar]/ );
1770 Usage : $ali->match()
1771 Function : Goes through all columns and changes residues that are
1772 identical to residue in first sequence to match '.'
1773 character. Sets match_char.
1775 USE WITH CARE: Most MSA formats do not support match
1776 characters in sequences, so this is mostly for output
1777 only. NEXUS format (Bio::AlignIO::nexus) can handle
1779 Returns : 1 on success
1780 Argument : a match character, optional, defaults to '.'
1785 my ( $self, $match ) = @_;
1788 my ($matching_char) = $match;
1789 $matching_char = "\\$match" if $match =~ /[\^.$|()\[\]]/; #';
1790 $self->map_chars( $matching_char, '-' );
1792 my @seqs = $self->each_seq();
1793 return 1 unless scalar @seqs > 1;
1795 my $refseq = shift @seqs;
1796 my @refseq = split //, $refseq->seq;
1797 my $gapchar = $self->gap_char;
1799 foreach my $seq (@seqs) {
1800 my @varseq = split //, $seq->seq();
1801 for ( my $i = 0; $i < scalar @varseq; $i++ ) {
1802 $varseq[$i] = $match
1803 if defined $refseq[$i]
1804 && ( $refseq[$i] =~ /[A-Za-z\*]/
1805 || $refseq[$i] =~ /$gapchar/ )
1806 && $refseq[$i] eq $varseq[$i];
1808 $seq->seq( join '', @varseq );
1810 $self->match_char($match);
1819 Usage : $ali->unmatch()
1820 Function : Undoes the effect of method match. Unsets match_char.
1821 Returns : 1 on success
1822 Argument : a match character, optional, defaults to '.'
1824 See L<match> and L<match_char>
1829 my ( $self, $match ) = @_;
1833 my @seqs = $self->each_seq();
1834 return 1 unless scalar @seqs > 1;
1836 my $refseq = shift @seqs;
1837 my @refseq = split //, $refseq->seq;
1838 my $gapchar = $self->gap_char;
1839 foreach my $seq (@seqs) {
1840 my @varseq = split //, $seq->seq();
1841 for ( my $i = 0; $i < scalar @varseq; $i++ ) {
1842 $varseq[$i] = $refseq[$i]
1843 if defined $refseq[$i]
1844 && ( $refseq[$i] =~ /[A-Za-z\*]/
1845 || $refseq[$i] =~ /$gapchar/ )
1846 && $varseq[$i] eq $match;
1848 $seq->seq( join '', @varseq );
1850 $self->match_char('');
1855 =head1 MSA attributes
1857 Methods for setting and reading the MSA attributes.
1859 Note that the methods defining character semantics depend on the user
1860 to set them sensibly. They are needed only by certain input/output
1861 methods. Unset them by setting to an empty string ('').
1866 Usage : $myalign->id("Ig")
1867 Function : Gets/sets the id field of the alignment
1868 Returns : An id string
1869 Argument : An id string (optional)
1874 my ( $self, $name ) = @_;
1876 if ( defined($name) ) {
1877 $self->{'_id'} = $name;
1880 return $self->{'_id'};
1886 Usage : $myalign->accession("PF00244")
1887 Function : Gets/sets the accession field of the alignment
1888 Returns : An acc string
1889 Argument : An acc string (optional)
1894 my ( $self, $acc ) = @_;
1896 if ( defined($acc) ) {
1897 $self->{'_accession'} = $acc;
1900 return $self->{'_accession'};
1906 Usage : $myalign->description("14-3-3 proteins")
1907 Function : Gets/sets the description field of the alignment
1908 Returns : An description string
1909 Argument : An description string (optional)
1914 my ( $self, $name ) = @_;
1916 if ( defined($name) ) {
1917 $self->{'_description'} = $name;
1920 return $self->{'_description'};
1925 Title : missing_char
1926 Usage : $myalign->missing_char("?")
1927 Function : Gets/sets the missing_char attribute of the alignment
1928 It is generally recommended to set it to 'n' or 'N'
1929 for nucleotides and to 'X' for protein.
1930 Returns : An missing_char string,
1931 Argument : An missing_char string (optional)
1936 my ( $self, $char ) = @_;
1938 if ( defined $char ) {
1939 $self->throw("Single missing character, not [$char]!")
1940 if CORE
::length($char) > 1;
1941 $self->{'_missing_char'} = $char;
1944 return $self->{'_missing_char'};
1951 Usage : $myalign->match_char('.')
1952 Function : Gets/sets the match_char attribute of the alignment
1953 Returns : An match_char string,
1954 Argument : An match_char string (optional)
1959 my ( $self, $char ) = @_;
1961 if ( defined $char ) {
1962 $self->throw("Single match character, not [$char]!")
1963 if CORE
::length($char) > 1;
1964 $self->{'_match_char'} = $char;
1967 return $self->{'_match_char'};
1973 Usage : $myalign->gap_char('-')
1974 Function : Gets/sets the gap_char attribute of the alignment
1975 Returns : An gap_char string, defaults to '-'
1976 Argument : An gap_char string (optional)
1981 my ( $self, $char ) = @_;
1983 if ( defined $char || !defined $self->{'_gap_char'} ) {
1984 $char = '-' unless defined $char;
1985 $self->throw("Single gap character, not [$char]!")
1986 if CORE
::length($char) > 1;
1987 $self->{'_gap_char'} = $char;
1989 return $self->{'_gap_char'};
1995 Title : symbol_chars
1996 Usage : my @symbolchars = $aln->symbol_chars;
1997 Function: Returns all the seen symbols (other than gaps)
1998 Returns : array of characters that are the seen symbols
1999 Args : boolean to include the gap/missing/match characters
2004 my ($self,$includeextra) = @_;
2006 unless ($self->{'_symbols'}) {
2007 foreach my $seq ($self->each_seq) {
2008 map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq);
2011 my %copy = %{$self->{'_symbols'}};
2012 if( ! $includeextra ) {
2013 foreach my $char ( $self->gap_char, $self->match_char,
2014 $self->missing_char) {
2015 delete $copy{$char} if( defined $char );
2021 =head1 Alignment descriptors
2023 These read only methods describe the MSA in various ways.
2029 Usage : $str = $ali->score()
2030 Function : get/set a score of the alignment
2031 Returns : a score for the alignment
2032 Argument : an optional score to set
2038 $self->{score
} = shift if @_;
2039 return $self->{score
};
2042 =head2 consensus_string
2044 Title : consensus_string
2045 Usage : $str = $ali->consensus_string($threshold_percent)
2046 Function : Makes a strict consensus
2047 Returns : Consensus string
2048 Argument : Optional threshold ranging from 0 to 100.
2049 The consensus residue has to appear at least threshold %
2050 of the sequences at a given location, otherwise a '?'
2051 character will be placed at that location.
2052 (Default value = 0%)
2056 sub consensus_string
{
2058 my $threshold = shift;
2061 my $len = $self->length - 1;
2063 foreach ( 0 .. $len ) {
2064 $out .= $self->_consensus_aa( $_, $threshold );
2070 =head2 consensus_conservation
2072 Title : consensus_conservation
2073 Usage : @conservation = $ali->consensus_conservation();
2074 Function : Conservation (as a percent) of each position of alignment
2075 Returns : Array of percentages [0-100]. Gap columns are 0% conserved.
2080 sub consensus_conservation
{
2083 my $num_sequences = $self->num_sequences;
2084 foreach my $point (0..$self->length-1) {
2085 my %hash = $self->_consensus_counts($point);
2086 # max frequency of a non-gap letter
2087 my $max = (sort {$b<=>$a} values %hash )[0];
2088 push @cons, 100 * $max / $num_sequences;
2096 my $threshold_percent = shift || -1 ;
2097 my ($seq,%hash,$count,$letter,$key);
2098 my $gapchar = $self->gap_char;
2099 %hash = $self->_consensus_counts($point);
2100 my $number_of_sequences = $self->num_sequences();
2101 my $threshold = $number_of_sequences * $threshold_percent / 100. ;
2105 foreach $key ( sort keys %hash ) {
2106 # print "Now at $key $hash{$key}\n";
2107 if( $hash{$key} > $count && $hash{$key} >= $threshold) {
2109 $count = $hash{$key};
2115 # Frequency of each letter in one column
2116 sub _consensus_counts
{
2120 my $gapchar = $self->gap_char;
2121 foreach my $seq ( $self->each_seq() ) {
2122 my $letter = substr($seq->seq,$point,1);
2123 $self->throw("--$point-----------") if $letter eq '';
2124 ($letter eq $gapchar || $letter =~ /\./) && next;
2131 =head2 consensus_iupac
2133 Title : consensus_iupac
2134 Usage : $str = $ali->consensus_iupac()
2135 Function : Makes a consensus using IUPAC ambiguity codes from DNA
2136 and RNA. The output is in upper case except when gaps in
2137 a column force output to be in lower case.
2139 Note that if your alignment sequences contain a lot of
2140 IUPAC ambiquity codes you often have to manually set
2141 alphabet. Bio::PrimarySeq::_guess_type thinks they
2142 indicate a protein sequence.
2143 Returns : consensus string
2145 Throws : on protein sequences
2149 sub consensus_iupac
{
2152 my $len = $self->length - 1;
2154 # only DNA and RNA sequences are valid
2155 foreach my $seq ( $self->each_seq() ) {
2156 $self->throw( "Seq [" . $seq->get_nse . "] is a protein" )
2157 if $seq->alphabet eq 'protein';
2160 # loop over the alignment columns
2161 foreach my $count ( 0 .. $len ) {
2162 $out .= $self->_consensus_iupac($count);
2167 sub _consensus_iupac
{
2168 my ($self, $column) = @_;
2169 my ($string, $char, $rna);
2171 #determine all residues in a column
2172 foreach my $seq ( $self->each_seq() ) {
2173 $string .= substr($seq->seq, $column, 1);
2175 $string = uc $string;
2177 # quick exit if there's an N in the string
2178 if ($string =~ /N/) {
2179 $string =~ /\W/ ?
return 'n' : return 'N';
2181 # ... or if there are only gap characters
2182 return '-' if $string =~ /^\W+$/;
2184 # treat RNA as DNA in regexps
2185 if ($string =~ /U/) {
2190 # the following s///'s only need to be done to the _first_ ambiguity code
2191 # as we only need to see the _range_ of characters in $string
2193 if ($string =~ /[VDHB]/) {
2194 $string =~ s/V/AGC/;
2195 $string =~ s/D/AGT/;
2196 $string =~ s/H/ACT/;
2197 $string =~ s/B/CTG/;
2200 if ($string =~ /[SKYRWM]/) {
2209 # and now the guts of the thing
2211 if ($string =~ /A/) {
2213 if ($string =~ /G/) {
2214 $char = 'R'; # A and G (purines) R
2215 if ($string =~ /C/) {
2216 $char = 'V'; # A and G and C V
2217 if ($string =~ /T/) {
2218 $char = 'N'; # A and G and C and T N
2220 } elsif ($string =~ /T/) {
2221 $char = 'D'; # A and G and T D
2223 } elsif ($string =~ /C/) {
2224 $char = 'M'; # A and C M
2225 if ($string =~ /T/) {
2226 $char = 'H'; # A and C and T H
2228 } elsif ($string =~ /T/) {
2229 $char = 'W'; # A and T W
2231 } elsif ($string =~ /C/) {
2233 if ($string =~ /T/) {
2234 $char = 'Y'; # C and T (pyrimidines) Y
2235 if ($string =~ /G/) {
2236 $char = 'B'; # C and T and G B
2238 } elsif ($string =~ /G/) {
2239 $char = 'S'; # C and G S
2241 } elsif ($string =~ /G/) {
2243 if ($string =~ /C/) {
2244 $char = 'S'; # G and C S
2245 } elsif ($string =~ /T/) {
2246 $char = 'K'; # G and T K
2248 } elsif ($string =~ /T/) {
2252 $char = 'U' if $rna and $char eq 'T';
2253 $char = lc $char if $string =~ /\W/;
2259 =head2 consensus_meta
2261 Title : consensus_meta
2262 Usage : $seqmeta = $ali->consensus_meta()
2263 Function : Returns a Bio::Seq::Meta object containing the consensus
2264 strings derived from meta data analysis.
2265 Returns : Bio::Seq::Meta
2266 Argument : Bio::Seq::Meta
2267 Throws : non-MetaI object
2271 sub consensus_meta
{
2272 my ($self, $meta) = @_;
2273 if ($meta && (!ref $meta || !$meta->isa('Bio::Seq::MetaI'))) {
2274 $self->throw('Not a Bio::Seq::MetaI object');
2276 return $self->{'_aln_meta'} = $meta if $meta;
2277 return $self->{'_aln_meta'}
2283 Usage : if ( $ali->is_flush() )
2284 Function : Tells you whether the alignment
2285 : is flush, i.e. all of the same length
2292 my ( $self, $report ) = @_;
2297 foreach $seq ( $self->each_seq() ) {
2298 if ( $length == (-1) ) {
2299 $length = CORE
::length( $seq->seq() );
2303 $temp = CORE
::length( $seq->seq() );
2304 if ( $temp != $length ) {
2306 "expecting $length not $temp from " . $seq->display_id )
2309 "expecting $length not $temp from " . $seq->display_id );
2310 $self->debug( $seq->seq() . "\n" );
2322 Usage : $len = $ali->length()
2323 Function : Returns the maximum length of the alignment.
2324 To be sure the alignment is a block, use is_flush
2332 $self->deprecated("length_aln - deprecated method. Use length() instead.");
2342 foreach $seq ( $self->each_seq() ) {
2343 $temp = $seq->length();
2344 if( $temp > $length ) {
2353 =head2 maxdisplayname_length
2355 Title : maxdisplayname_length
2356 Usage : $ali->maxdisplayname_length()
2357 Function : Gets the maximum length of the displayname in the
2358 alignment. Used in writing out various MSA formats.
2364 sub maxname_length
{
2366 $self->deprecated("maxname_length - deprecated method.".
2367 " Use maxdisplayname_length() instead.");
2368 $self->maxdisplayname_length();
2373 $self->deprecated("maxnse_length - deprecated method.".
2374 " Use maxnse_length() instead.");
2375 $self->maxdisplayname_length();
2378 sub maxdisplayname_length
{
2383 foreach $seq ( $self->each_seq() ) {
2384 $len = CORE
::length $self->displayname( $seq->get_nse() );
2386 if ( $len > $maxname ) {
2394 =head2 max_metaname_length
2396 Title : max_metaname_length
2397 Usage : $ali->max_metaname_length()
2398 Function : Gets the maximum length of the meta name tags in the
2399 alignment for the sequences and for the alignment.
2400 Used in writing out various MSA formats.
2406 sub max_metaname_length
{
2411 # check seq meta first
2412 for $seq ( $self->each_seq() ) {
2413 next if !$seq->isa('Bio::Seq::MetaI' || !$seq->meta_names);
2414 for my $mtag ($seq->meta_names) {
2415 $len = CORE
::length $mtag;
2416 if( $len > $maxname ) {
2423 for my $meta ($self->consensus_meta) {
2425 for my $name ($meta->meta_names) {
2426 $len = CORE
::length $name;
2427 if( $len > $maxname ) {
2438 Title : num_residues
2439 Usage : $no = $ali->num_residues
2440 Function : number of residues in total in the alignment
2443 Note : replaces no_residues()
2451 foreach my $seq ( $self->each_seq ) {
2452 my $str = $seq->seq();
2454 $count += ( $str =~ s/[A-Za-z]//g );
2460 =head2 num_sequences
2462 Title : num_sequences
2463 Usage : $depth = $ali->num_sequences
2464 Function : number of sequence in the sequence alignment
2467 Note : replaces no_sequences()
2473 return scalar($self->each_seq);
2476 =head2 average_percentage_identity
2478 Title : average_percentage_identity
2479 Usage : $id = $align->average_percentage_identity
2480 Function: The function uses a fast method to calculate the average
2481 percentage identity of the alignment
2482 Returns : The average percentage identity of the alignment
2484 Notes : This method implemented by Kevin Howe calculates a figure that is
2485 designed to be similar to the average pairwise identity of the
2486 alignment (identical in the absence of gaps), without having to
2487 explicitly calculate pairwise identities proposed by Richard Durbin.
2488 Validated by Ewan Birney ad Alex Bateman.
2492 sub average_percentage_identity
{
2493 my ($self,@args) = @_;
2495 my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
2496 'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
2498 my ($len, $total, $subtotal, $divisor, $subdivisor, @seqs, @countHashes);
2500 if (! $self->is_flush()) {
2501 $self->throw("All sequences in the alignment must be the same length");
2504 @seqs = $self->each_seq();
2505 $len = $self->length();
2507 # load the each hash with correct keys for existence checks
2509 for( my $index=0; $index < $len; $index++) {
2510 foreach my $letter (@alphabet) {
2511 $countHashes[$index]->{$letter} = 0;
2514 foreach my $seq (@seqs) {
2515 my @seqChars = split //, $seq->seq();
2516 for( my $column=0; $column < @seqChars; $column++ ) {
2517 my $char = uc($seqChars[$column]);
2518 if (exists $countHashes[$column]->{$char}) {
2519 $countHashes[$column]->{$char}++;
2526 for(my $column =0; $column < $len; $column++) {
2527 my %hash = %{$countHashes[$column]};
2529 foreach my $res (keys %hash) {
2530 $total += $hash{$res}*($hash{$res} - 1);
2531 $subdivisor += $hash{$res};
2533 $divisor += $subdivisor * ($subdivisor - 1);
2535 return $divisor > 0 ?
($total / $divisor )*100.0 : 0;
2538 =head2 percentage_identity
2540 Title : percentage_identity
2541 Usage : $id = $align->percentage_identity
2542 Function: The function calculates the average percentage identity
2543 (aliased to average_percentage_identity)
2544 Returns : The average percentage identity
2549 sub percentage_identity
{
2551 return $self->average_percentage_identity();
2554 =head2 overall_percentage_identity
2556 Title : overall_percentage_identity
2557 Usage : $id = $align->overall_percentage_identity
2558 $id = $align->overall_percentage_identity('short')
2559 Function: The function calculates the percentage identity of
2560 the conserved columns
2561 Returns : The percentage identity of the conserved columns
2562 Args : length value to use, optional defaults to alignment length
2563 possible values: 'align', 'short', 'long'
2565 The argument values 'short' and 'long' refer to shortest and longest
2566 sequence in the alignment. Method modification code by Hongyu Zhang.
2570 sub overall_percentage_identity
{
2571 my ($self, $length_measure) = @_;
2573 my %alphabet = map {$_ => undef} qw
(A C G T U B D E F H I J K L M N O P Q R S V W X Y Z
);
2575 my %enum = map {$_ => undef} qw
(align short long
);
2577 $self->throw("Unknown argument [$length_measure]")
2578 if $length_measure and not exists $enum{$length_measure};
2579 $length_measure ||= 'align';
2581 if (! $self->is_flush()) {
2582 $self->throw("All sequences in the alignment must be the same length");
2585 # Count the residues seen at each position
2587 my $total = 0; # number of positions with identical residues
2589 my @seqs = $self->each_seq;
2590 my $nof_seqs = scalar @seqs;
2591 my $aln_len = $self->length();
2592 for my $seq (@seqs) {
2593 my $seqstr = $seq->seq;
2595 # Count residues for given sequence
2596 for my $column (0 .. $aln_len-1) {
2597 my $char = uc( substr($seqstr, $column, 1) );
2598 if ( exists $alphabet{$char} ) {
2600 # This is a valid char
2601 if ( defined $countHashes[$column]->{$char} ) {
2602 $countHashes[$column]->{$char}++;
2604 $countHashes[$column]->{$char} = 1;
2607 if ( $countHashes[$column]->{$char} == $nof_seqs ) {
2608 # All sequences have this same residue
2616 if ($length_measure eq 'short' || $length_measure eq 'long') {
2617 my $seq_len = $seqstr =~ tr/[A-Za-z]//;
2618 if ($length_measure eq 'short') {
2619 if ( (not defined $len) || ($seq_len < $len) ) {
2622 } elsif ($length_measure eq 'long') {
2623 if ( (not defined $len) || ($seq_len > $len) ) {
2631 if ($length_measure eq 'align') {
2635 return ($total / $len ) * 100.0;
2640 =head1 Alignment positions
2642 Methods to map a sequence position into an alignment column and back.
2643 column_from_residue_number() does the former. The latter is really a
2644 property of the sequence object and can done using
2645 L<Bio::LocatableSeq::location_from_column>:
2647 # select somehow a sequence from the alignment, e.g.
2648 my $seq = $aln->get_seq_by_pos(1);
2649 #$loc is undef or Bio::LocationI object
2650 my $loc = $seq->location_from_column(5);
2652 =head2 column_from_residue_number
2654 Title : column_from_residue_number
2655 Usage : $col = $ali->column_from_residue_number( $seqname, $resnumber)
2656 Function: This function gives the position in the alignment
2657 (i.e. column number) of the given residue number in the
2658 sequence with the given name. For example, for the
2661 Seq1/91-97 AC..DEF.GH.
2662 Seq2/24-30 ACGG.RTY...
2663 Seq3/43-51 AC.DDEF.GHI
2665 column_from_residue_number( "Seq1", 94 ) returns 6.
2666 column_from_residue_number( "Seq2", 25 ) returns 2.
2667 column_from_residue_number( "Seq3", 50 ) returns 10.
2669 An exception is thrown if the residue number would lie
2670 outside the length of the aligment
2671 (e.g. column_from_residue_number( "Seq2", 22 )
2673 Note: If the the parent sequence is represented by more than
2674 one alignment sequence and the residue number is present in
2675 them, this method finds only the first one.
2677 Returns : A column number for the position in the alignment of the
2678 given residue in the given sequence (1 = first column)
2679 Args : A sequence id/name (not a name/start-end)
2680 A residue number in the whole sequence (not just that
2681 segment of it in the alignment)
2685 sub column_from_residue_number
{
2686 my ( $self, $name, $resnumber ) = @_;
2688 $self->throw("No sequence with name [$name]")
2689 unless $self->{'_start_end_lists'}->{$name};
2690 $self->throw("Second argument residue number missing") unless $resnumber;
2692 foreach my $seq ( $self->each_seq_with_id($name) ) {
2694 eval { $col = $seq->column_from_residue_number($resnumber); };
2699 $self->throw( "Could not find a sequence segment in $name "
2700 . "containing residue number $resnumber" );
2704 =head1 Sequence names
2706 Methods to manipulate the display name. The default name based on the
2707 sequence id and subsequence positions can be overridden in various
2713 Usage : $myalign->displayname("Ig", "IgA")
2714 Function : Gets/sets the display name of a sequence in the alignment
2715 Returns : A display name string
2716 Argument : name of the sequence
2717 displayname of the sequence (optional)
2722 my ( $self, $name, $disname ) = @_;
2724 $self->throw("No sequence with name [$name]")
2725 unless defined $self->{'_seq'}->{$name};
2727 if ( $disname and $name ) {
2728 $self->{'_dis_name'}->{$name} = $disname;
2731 elsif ( defined $self->{'_dis_name'}->{$name} ) {
2732 return $self->{'_dis_name'}->{$name};
2739 sub get_displayname
{
2741 $self->deprecated("get_displayname - deprecated method. Use displayname() instead.");
2742 $self->displayname(@_);
2745 sub set_displayname
{
2747 $self->deprecated("set_displayname - deprecated method. Use displayname() instead.");
2748 $self->displayname(@_);
2752 =head2 set_displayname_count
2754 Title : set_displayname_count
2755 Usage : $ali->set_displayname_count
2756 Function : Sets the names to be name_# where # is the number of
2757 times this name has been used.
2758 Returns : 1, on success
2763 sub set_displayname_count
{
2765 my (@arr,$name,$seq,$count,$temp,$nse);
2767 foreach $seq ( $self->each_alphabetically() ) {
2768 $nse = $seq->get_nse();
2770 #name will be set when this is the second
2771 #time (or greater) is has been seen
2773 if( defined $name and $name eq ($seq->id()) ) {
2774 $temp = sprintf("%s_%s",$name,$count);
2775 $self->displayname($nse,$temp);
2780 $temp = sprintf("%s_%s",$name,$count);
2781 $self->displayname($nse,$temp);
2788 =head2 set_displayname_flat
2790 Title : set_displayname_flat
2791 Usage : $ali->set_displayname_flat()
2792 Function : Makes all the sequences be displayed as just their name,
2793 not name/start-end (NSE)
2799 sub set_displayname_flat
{
2803 foreach $seq ( $self->each_seq() ) {
2804 $nse = $seq->get_nse();
2805 $self->displayname( $nse, $seq->id() );
2811 =head2 set_displayname_normal
2813 Title : set_displayname_normal
2814 Usage : $ali->set_displayname_normal()
2815 Function : Makes all the sequences be displayed as name/start-end (NSE)
2816 Returns : 1, on success
2821 sub set_displayname_normal
{
2825 foreach $seq ( $self->each_seq() ) {
2826 $nse = $seq->get_nse();
2827 $self->displayname( $nse, $nse );
2835 Usage : $obj->source($newval)
2836 Function: sets the Alignment source program
2838 Returns : value of source
2839 Args : newvalue (optional)
2844 my ( $self, $value ) = @_;
2845 if ( defined $value ) {
2846 $self->{'_source'} = $value;
2848 return $self->{'_source'};
2852 =head2 set_displayname_safe
2854 Title : set_displayname_safe
2855 Usage : ($new_aln, $ref_name)=$ali->set_displayname_safe(4)
2856 Function : Assign machine-generated serial names to sequences in input order.
2857 Designed to protect names during PHYLIP runs. Assign 10-char string
2858 in the form of "S000000001" to "S999999999". Restore the original
2859 names using "restore_displayname".
2860 Returns : 1. a new $aln with system names;
2861 2. a hash ref for restoring names
2862 Argument : Number for id length (default 10)
2866 sub set_displayname_safe
{
2868 my $idlength = shift || 10;
2869 my ( $seq, %phylip_name );
2871 my $new = Bio
::SimpleAlign
->new();
2872 foreach $seq ( $self->each_seq() ) {
2874 my $pname = "S" . sprintf "%0" . ( $idlength - 1 ) . "s", $ct;
2875 $phylip_name{$pname} = $seq->id();
2876 my $new_seq = Bio
::LocatableSeq
->new(
2878 -seq
=> $seq->seq(),
2879 -alphabet
=> $seq->alphabet,
2880 -start
=> $seq->{_start
},
2881 -end
=> $seq->{_end
}
2883 $new->add_seq($new_seq);
2887 "$ct seq names changed. Restore names by using restore_displayname.");
2888 return ( $new, \
%phylip_name );
2892 =head2 restore_displayname
2894 Title : restore_displayname
2895 Usage : $aln_name_restored=$ali->restore_displayname($hash_ref)
2896 Function : Restore original sequence names (after running
2897 $ali->set_displayname_safe)
2898 Returns : a new $aln with names restored.
2899 Argument : a hash reference of names from "set_displayname_safe".
2903 sub restore_displayname
{
2907 my $new=Bio
::SimpleAlign
->new();
2908 foreach my $seq ( $self->each_seq() ) {
2909 $self->throw("No sequence with name") unless defined $name{$seq->id()};
2910 my $new_seq= Bio
::LocatableSeq
->new(-id
=> $name{$seq->id()},
2911 -seq
=> $seq->seq(),
2912 -alphabet
=> $seq->alphabet,
2913 -start
=> $seq->{_start
},
2914 -end
=> $seq->{_end
}
2916 $new->add_seq($new_seq);
2921 =head2 sort_by_start
2923 Title : sort_by_start
2924 Usage : $ali->sort_by_start
2925 Function : Changes the order of the alignment to the start position of each
2927 Returns : 1 on success
2934 my ($seq,$nse,@arr,%hash,$count);
2935 foreach $seq ( $self->each_seq() ) {
2936 $nse = $seq->get_nse;
2940 %{$self->{'_order'}} = (); # reset the hash;
2941 foreach $nse ( sort _startend
keys %hash) {
2942 $self->{'_order'}->{$count} = $nse;
2949 my ($aname,$arange) = split (/[\/]/,$a);
2950 my ($bname,$brange) = split (/[\/]/,$b);
2951 my ($astart,$aend) = split(/\-/,$arange);
2952 my ($bstart,$bend) = split(/\-/,$brange);
2953 return $astart <=> $bstart;
2956 =head2 bracket_string
2958 Title : bracket_string
2959 Usage : my @params = (-refseq => 'testseq',
2960 -allele1 => 'allele1',
2961 -allele2 => 'allele2',
2962 -delimiters => '{}',
2964 $str = $aln->bracket_string(@params)
2966 Function : When supplied with a list of parameters (see below), returns a
2967 string in BIC format. This is used for allelic comparisons.
2968 Briefly, if either allele contains a base change when compared to
2969 the refseq, the base or gap for each allele is represented in
2970 brackets in the order present in the 'alleles' parameter.
2972 For the following data:
2981 the returned string with parameters 'refseq => testseq' and
2982 'alleles => [qw(allele1 allele2)]' would be:
2984 GGAT[C/-][C/-]ATT[C/C]CT[A/C]CT
2985 Returns : BIC-formatted string
2986 Argument : Required args
2987 refseq : string (ID) of the reference sequence used
2988 as basis for comparison
2989 allele1 : string (ID) of the first allele
2990 allele2 : string (ID) of the second allele
2992 delimiters: two symbol string of left and right delimiters.
2993 Only the first two symbols are used
2995 separator : string used as a separator. Only the first
2998 Throws : On no refseq/alleles, or invalid refseq/alleles.
3002 sub bracket_string
{
3003 my ($self, @args) = @_;
3004 my ($ref, $a1, $a2, $delim, $sep) =
3005 $self->_rearrange([qw(refseq allele1 allele2 delimiters separator)], @args);
3006 $self->throw('Missing refseq/allele1/allele2') if (!$a1 || !$a2 || !$ref);
3008 ($ld, $rd) = split('', $delim, 2) if $delim;
3012 my ($refseq, $allele1, $allele2) =
3013 map {( $self->each_seq_with_id($_) )} ($ref, $a1, $a2);
3014 if (!$refseq || !$allele1 || !$allele2) {
3015 $self->throw("One of your refseq/allele IDs is invalid!");
3017 my $len = $self->length-1;
3019 # loop over the alignment columns
3020 for my $column ( 0 .. $len ) {
3022 my ($compres, $res1, $res2) =
3023 map{substr($_->seq, $column, 1)} ($refseq, $allele1, $allele2);
3024 # are any of the allele symbols different from the refseq?
3025 $string = ($compres eq $res1 && $compres eq $res2) ?
$compres :
3026 $ld.$res1.$sep.$res2.$rd;
3033 =head2 methods implementing Bio::FeatureHolderI
3035 FeatureHolderI implementation to support labeled character sets like one
3036 would get from NEXUS represented data.
3038 =head2 get_SeqFeatures
3040 Usage : @features = $aln->get_SeqFeatures
3041 Function: Get the feature objects held by this feature holder.
3043 Returns : an array of Bio::SeqFeatureI implementing objects
3044 Args : optional filter coderef, taking a Bio::SeqFeatureI
3045 : as argument, returning TRUE if wanted, FALSE if
3050 sub get_SeqFeatures
{
3052 my $filter_cb = shift;
3053 $self->throw("Arg (filter callback) must be a coderef")
3054 unless !defined($filter_cb)
3055 or ref($filter_cb) eq 'CODE';
3056 if ( !defined $self->{'_as_feat'} ) {
3057 $self->{'_as_feat'} = [];
3060 return grep { $filter_cb->($_) } @
{ $self->{'_as_feat'} };
3062 return @
{ $self->{'_as_feat'} };
3066 =head2 add_SeqFeature
3068 Usage : $aln->add_SeqFeature($subfeat);
3069 Function: Adds a SeqFeature into the SeqFeature array. The 'EXPAND' qualifier
3070 (see L<Bio::FeatureHolderI>) is supported, but has no effect.
3072 Returns : 1 on success
3073 Args : a Bio::SeqFeatureI object
3077 sub add_SeqFeature
{
3078 my ($self, @feat) = @_;
3080 $self->{'_as_feat'} = [] unless $self->{'_as_feat'};
3082 if (scalar @feat > 1) {
3084 -message
=> 'Providing an array of features to Bio::SimpleAlign add_SeqFeature()'.
3085 ' is deprecated and will be removed in a future version. '.
3086 'Add a single feature at a time instead.',
3087 -warn_version
=> 1.007,
3088 -throw_version
=> 1.009,
3092 for my $feat ( @feat ) {
3094 next if $feat eq 'EXPAND'; # Need to support it for FeatureHolderI compliance
3096 if( !$feat->isa("Bio::SeqFeatureI") ) {
3097 $self->throw("Expected a Bio::SeqFeatureI object, but got a $feat.");
3100 push @
{$self->{'_as_feat'}}, $feat;
3106 =head2 remove_SeqFeatures
3108 Usage : $obj->remove_SeqFeatures
3109 Function: Removes all SeqFeatures. If you want to remove only a subset,
3110 remove that subset from the returned array, and add back the rest.
3111 Returns : The array of Bio::SeqFeatureI features that was
3112 deleted from this alignment.
3117 sub remove_SeqFeatures
{
3120 return () unless $self->{'_as_feat'};
3121 my @feats = @
{$self->{'_as_feat'}};
3122 $self->{'_as_feat'} = [];
3126 =head2 feature_count
3128 Title : feature_count
3129 Usage : $obj->feature_count()
3130 Function: Return the number of SeqFeatures attached to the alignment
3131 Returns : integer representing the number of SeqFeatures
3139 if (defined($self->{'_as_feat'})) {
3140 return ($#{$self->{'_as_feat'}} + 1);
3146 =head2 get_all_SeqFeatures
3148 Title : get_all_SeqFeatures
3150 Function: Get all SeqFeatures.
3152 Returns : an array of Bio::SeqFeatureI implementing objects
3154 Note : Falls through to Bio::FeatureHolderI implementation.
3158 =head2 methods for Bio::AnnotatableI
3160 AnnotatableI implementation to support sequence alignments which
3161 contain annotation (NEXUS, Stockholm).
3166 Usage : $ann = $aln->annotation or
3167 $aln->annotation($ann)
3168 Function: Gets or sets the annotation
3169 Returns : Bio::AnnotationCollectionI object
3170 Args : None or Bio::AnnotationCollectionI object
3172 See L<Bio::AnnotationCollectionI> and L<Bio::Annotation::Collection>
3173 for more information
3178 my ($obj,$value) = @_;
3179 if( defined $value ) {
3180 $obj->throw("object of class ".ref($value)." does not implement ".
3181 "Bio::AnnotationCollectionI. Too bad.")
3182 unless $value->isa("Bio::AnnotationCollectionI");
3183 $obj->{'_annotation'} = $value;
3184 } elsif( ! defined $obj->{'_annotation'}) {
3185 $obj->{'_annotation'} = Bio
::Annotation
::Collection
->new();
3187 return $obj->{'_annotation'};
3190 =head1 Deprecated methods
3197 Usage : $no = $ali->no_residues
3198 Function : number of residues in total in the alignment
3201 Note : deprecated in favor of num_residues()
3207 $self->deprecated(-warn_version
=> 1.0069,
3208 -throw_version
=> 1.0075,
3209 -message
=> 'Use of method no_residues() is deprecated, use num_residues() instead');
3210 $self->num_residues(@_);
3215 Title : no_sequences
3216 Usage : $depth = $ali->no_sequences
3217 Function : number of sequence in the sequence alignment
3220 Note : deprecated in favor of num_sequences()
3226 $self->deprecated(-warn_version
=> 1.0069,
3227 -throw_version
=> 1.0075,
3228 -message
=> 'Use of method no_sequences() is deprecated, use num_sequences() instead');
3229 $self->num_sequences(@_);
3234 Title : mask_columns
3235 Usage : $aln2 = $aln->mask_columns(20,30)
3236 Function : Masks a slice of the alignment inclusive of start and
3237 end columns, and the first column in the alignment is denoted 1.
3238 Mask beyond the length of the sequence does not do padding.
3239 Returns : A Bio::SimpleAlign object
3240 Args : Positive integer for start column, positive integer for end column,
3241 optional string value use for the mask. Example:
3243 $aln2 = $aln->mask_columns(20,30,'?')
3244 Note : Masking must use a character that is not used for gaps or
3245 frameshifts. These can be adjusted using the relevant global
3246 variables, but be aware these may be (uncontrollably) modified
3247 elsewhere within BioPerl (see bug 2715)
3252 #based on slice(), but did not include the Bio::Seq::Meta sections as I was not sure what it is doing
3255 my $nonres = $Bio::LocatableSeq
::GAP_SYMBOLS
.
3256 $Bio::LocatableSeq
::FRAMESHIFT_SYMBOLS
;
3258 # coordinates are alignment-based, not sequence-based
3259 my ($start, $end, $mask_char) = @_;
3260 unless (defined $mask_char) { $mask_char = 'N' }
3262 $self->throw("Mask start has to be a positive integer and less than ".
3263 "alignment length, not [$start]")
3264 unless $start =~ /^\d+$/ && $start > 0 && $start <= $self->length;
3265 $self->throw("Mask end has to be a positive integer and less than ".
3266 "alignment length, not [$end]")
3267 unless $end =~ /^\d+$/ && $end > 0 && $end <= $self->length;
3268 $self->throw("Mask start [$start] has to be smaller than or equal to ".
3269 "end [$end]") unless $start <= $end;
3270 $self->throw("Mask character $mask_char has to be a single character ".
3271 "and not a gap or frameshift symbol")
3272 unless CORE
::length($mask_char) == 1 && $mask_char !~ m{$nonres};
3274 my $aln = $self->new;
3275 $aln->id($self->id);
3276 foreach my $seq ( $self->each_seq() ) {
3277 my $new_seq = Bio
::LocatableSeq
->new(-id
=> $seq->id,
3278 -alphabet
=> $seq->alphabet,
3279 -strand
=> $seq->strand,
3280 -verbose
=> $self->verbose);
3282 # convert from 1-based alignment coords!
3283 my $masked_string = substr($seq->seq, $start - 1, $end - $start + 1);
3284 $masked_string =~ s{[^$nonres]}{$mask_char}g;
3285 my $new_dna_string = substr($seq->seq,0,$start-1) . $masked_string . substr($seq->seq,$end);
3286 $new_seq->seq($new_dna_string);
3287 $aln->add_seq($new_seq);