2 # BioPerl module for SimpleAlign
4 # Cared for by Heikki Lehvaslaiho <heikki-at-bioperl-dot-org>
6 # Copyright Ewan Birney
8 # You may distribute this module under the same terms as perl itself
10 # POD documentation - main docs before the code
13 # 11/3/00 Added threshold feature to consensus and consensus_aa - PS
14 # May 2001 major rewrite - Heikki Lehvaslaiho
18 Bio::SimpleAlign - Multiple alignments held as a set of sequences
22 # Use Bio::AlignIO to read in the alignment
23 $str = Bio::AlignIO->new(-file => 't/data/testaln.pfam');
24 $aln = $str->next_aln();
28 print $aln->no_residues;
30 print $aln->no_sequences;
32 print $aln->percentage_identity;
33 print $aln->consensus_string(50);
35 # Find the position in the alignment for a sequence location
36 $pos = $aln->column_from_residue_number('1433_LYCES', 14); # = 6;
38 # Extract sequences and check values for the alignment column $pos
39 foreach $seq ($aln->each_seq) {
40 $res = $seq->subseq($pos, $pos);
43 foreach $res (keys %count) {
44 printf "Res: %s Count: %2d\n", $res, $count{$res};
48 $aln->remove_seq($seq);
49 $mini_aln = $aln->slice(20,30); # get a block of columns
50 $mini_aln = $aln->select_noncont(1,3,5,7,11); # get single columns
51 $new_aln = $aln->remove_columns([20,30]); # remove by position
52 $new_aln = $aln->remove_columns(['mismatch']); # remove by property
55 $str = $aln->consensus_string($threshold_percent);
56 $str = $aln->match_line();
57 $str = $aln->cigar_line()
58 $id = $aln->percentage_identity;
60 See the module documentation for details and more methods.
64 SimpleAlign is an object that handles a multiple sequence alignment
65 (MSA). It is very permissive of types (it does not insist on sequences
66 being all same length, for example). Think of it as a set of sequences
67 with a whole series of built-in manipulations and methods for reading and
70 SimpleAlign uses L<Bio::LocatableSeq>, a subclass of L<Bio::PrimarySeq>,
71 to store its sequences. These are subsequences with a start and end
72 positions in the parent reference sequence. Each sequence in the
73 SimpleAlign object is a Bio::LocatableSeq.
75 SimpleAlign expects the combination of name, start, and end for a
76 given sequence to be unique in the alignment, and this is the key for the
77 internal hashes (name, start, end are abbreviated C<nse> in the code).
78 However, in some cases people do not want the name/start-end to be displayed:
79 either multiple names in an alignment or names specific to the alignment
80 (ROA1_HUMAN_1, ROA1_HUMAN_2 etc). These names are called
81 C<displayname>, and generally is what is used to print out the
82 alignment. They default to name/start-end.
84 The SimpleAlign Module is derived from the Align module by Ewan Birney.
90 User feedback is an integral part of the evolution of this and other
91 Bioperl modules. Send your comments and suggestions preferably to one
92 of the Bioperl mailing lists. Your participation is much appreciated.
94 bioperl-l@bioperl.org - General discussion
95 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
99 Report bugs to the Bioperl bug tracking system to help us keep track
100 the bugs and their resolution. Bug reports can be submitted via the
103 http://bugzilla.open-bio.org/
107 Ewan Birney, birney@ebi.ac.uk
111 Allen Day, allenday-at-ucla.edu,
112 Richard Adams, Richard.Adams-at-ed.ac.uk,
113 David J. Evans, David.Evans-at-vir.gla.ac.uk,
114 Heikki Lehvaslaiho, heikki-at-bioperl-dot-org,
115 Allen Smith, allens-at-cpan.org,
116 Jason Stajich, jason-at-bioperl.org,
117 Anthony Underwood, aunderwood-at-phls.org.uk,
118 Xintao Wei & Giri Narasimhan, giri-at-cs.fiu.edu
119 Brian Osborne, bosborne at alum.mit.edu
120 Weigang Qiu, Weigang at GENECTR-HUNTER-CUNY-EDU
121 Hongyu Zhang, forward at hongyu.org
129 The rest of the documentation details each of the object
130 methods. Internal methods are usually preceded with a _
134 # 'Let the code begin...
136 package Bio
::SimpleAlign
;
137 use vars
qw(%CONSERVATION_GROUPS);
140 use Bio::LocatableSeq; # uses Seq's as list
143 use Bio::SeqFeature::Generic;
146 # This data should probably be in a more centralized module...
147 # it is taken from Clustalw documentation.
148 # These are all the positively scoring groups that occur in the
149 # Gonnet Pam250 matrix. The strong and weak groups are
150 # defined as strong score >0.5 and weak score =<0.5 respectively.
152 %CONSERVATION_GROUPS = (
177 use base
qw(Bio::Root::Root Bio::Align::AlignI Bio::AnnotatableI
178 Bio::FeatureHolderI);
183 Usage : my $aln = Bio::SimpleAlign->new();
184 Function : Creates a new simple align object
185 Returns : Bio::SimpleAlign
186 Args : -source => string representing the source program
187 where this alignment came from
193 my($class,@args) = @_;
195 my $self = $class->SUPER::new
(@args);
197 my ($src,$score,$id) = $self->_rearrange([qw(SOURCE SCORE ID)], @args);
198 $src && $self->source($src);
199 defined $score && $self->score($score);
200 # we need to set up internal hashs first!
202 $self->{'_seq'} = {};
203 $self->{'_order'} = {};
204 $self->{'_start_end_lists'} = {};
205 $self->{'_dis_name'} = {};
206 $self->{'_id'} = 'NoName';
207 # maybe we should automatically read in from args. Hmmm...
208 $id && $self->id($id);
210 return $self; # success - we hope!
213 =head1 Modifier methods
215 These methods modify the MSA by adding, removing or shuffling complete
221 Usage : $myalign->add_seq($newseq);
222 Function : Adds another sequence to the alignment. *Does not* align
223 it - just adds it to the hashes.
225 Args : a Bio::LocatableSeq object
228 See L<Bio::LocatableSeq> for more information
234 $self->deprecated("addSeq - deprecated method. Use add_seq() instead.");
242 my ($name,$id,$start,$end);
244 if( ! ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
245 $self->throw("Unable to process non locatable sequences [". ref($seq). "]");
248 $id = $seq->id() ||$seq->display_id || $seq->primary_id;
249 $start = $seq->start();
252 # build the symbol list for this sequence,
253 # will prune out the gap and missing/match chars
254 # when actually asked for the symbol list in the
256 # map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq) if $seq->seq;
258 if( !defined $order ) {
259 $order = keys %{$self->{'_seq'}};
261 $name = sprintf("%s/%d-%d",$id,$start,$end);
263 if( $self->{'_seq'}->{$name} ) {
264 $self->warn("Replacing one sequence [$name]\n") unless $self->verbose < 0;
267 $self->debug( "Assigning $name to $order\n");
269 $self->{'_order'}->{$order} = $name;
271 unless( exists( $self->{'_start_end_lists'}->{$id})) {
272 $self->{'_start_end_lists'}->{$id} = [];
274 push @
{$self->{'_start_end_lists'}->{$id}}, $seq;
277 $self->{'_seq'}->{$name} = $seq;
285 Usage : $aln->remove_seq($seq);
286 Function : Removes a single sequence from an alignment
288 Argument : a Bio::LocatableSeq object
294 $self->deprecated("removeSeq - deprecated method. Use remove_seq() instead.");
295 $self->remove_seq(@_);
301 my ($name,$id,$start,$end);
303 $self->throw("Need Bio::Locatable seq argument ")
304 unless ref $seq && $seq->isa( 'Bio::LocatableSeq');
307 $start = $seq->start();
309 $name = sprintf("%s/%d-%d",$id,$start,$end);
311 if( !exists $self->{'_seq'}->{$name} ) {
312 $self->throw("Sequence $name does not exist in the alignment to remove!");
315 delete $self->{'_seq'}->{$name};
317 # we need to remove this seq from the start_end_lists hash
319 if (exists $self->{'_start_end_lists'}->{$id}) {
320 # we need to find the sequence in the array.
323 for ($i=0; $i < @
{$self->{'_start_end_lists'}->{$id}}; $i++) {
324 if (${$self->{'_start_end_lists'}->{$id}}[$i] eq $seq) {
330 splice @
{$self->{'_start_end_lists'}->{$id}}, $i, 1;
333 $self->throw("Could not find the sequence to remoce from the start-end list");
337 $self->throw("There is no seq list for the name $id");
339 # we need to shift order hash
340 my %rev_order = reverse %{$self->{'_order'}};
341 my $no = $rev_order{$name};
342 my $no_sequences = $self->no_sequences;
343 for (; $no < $no_sequences; $no++) {
344 $self->{'_order'}->{$no} = $self->{'_order'}->{$no+1};
346 delete $self->{'_order'}->{$no};
354 Usage : $aln->purge(0.7);
355 Function: Removes sequences above given sequence similarity
356 This function will grind on large alignments. Beware!
358 Returns : An array of the removed sequences
359 Args : float, threshold for similarity
364 my ($self,$perc) = @_;
365 my (%duplicate, @dups);
367 my @seqs = $self->each_seq();
369 for (my $i=0;$i< @seqs - 1;$i++ ) { #for each seq in alignment
372 #skip if already in duplicate hash
373 next if exists $duplicate{$seq->display_id} ;
374 my $one = $seq->seq();
376 my @one = split '', $one; #split to get 1aa per array element
378 for (my $j=$i+1;$j < @seqs;$j++) {
379 my $seq2 = $seqs[$j];
381 #skip if already in duplicate hash
382 next if exists $duplicate{$seq2->display_id} ;
384 my $two = $seq2->seq();
385 my @two = split '', $two;
389 for (my $k=0;$k<@one;$k++) {
390 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
391 $one[$k] eq $two[$k]) {
394 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
395 $two[$k] ne '.' && $two[$k] ne '-' ) {
401 $ratio = $count/$res unless $res == 0;
403 # if above threshold put in duplicate hash and push onto
404 # duplicate array for returning to get_unique
405 if ( $ratio > $perc ) {
406 $self->warn("duplicate: ", $seq2->display_id) if $self->verbose > 0;
407 $duplicate{$seq2->display_id} = 1;
412 foreach my $seq (@dups) {
413 $self->remove_seq($seq);
418 =head2 sort_alphabetically
420 Title : sort_alphabetically
421 Usage : $ali->sort_alphabetically
422 Function : Changes the order of the alignment to alphabetical on name
423 followed by numerical by number.
429 sub sort_alphabetically
{
431 my ($seq,$nse,@arr,%hash,$count);
433 foreach $seq ( $self->each_seq() ) {
434 $nse = $seq->get_nse;
440 %{$self->{'_order'}} = (); # reset the hash;
442 foreach $nse ( sort _alpha_startend
keys %hash) {
443 $self->{'_order'}->{$count} = $nse;
453 Usage : $aln_ordered=$aln->sort_by_list($list_file)
454 Function : Arbitrarily order sequences in an alignment
455 Returns : A new Bio::SimpleAlign object
456 Argument : a file listing sequence names in intended order (one name per line)
461 my ($self, $list) = @_;
462 my (@seq, @ids, %order);
464 foreach my $seq ( $self->each_seq() ) {
466 push @ids, $seq->display_id;
470 open(my $listfh, '<', $list) || $self->throw("can't open file for reading: $list");
474 $self->throw("Not found in alignment: $name") unless &_in_aln
($name, \
@ids);
479 # use the map-sort-map idiom:
480 my @sorted= map { $_->[1] } sort { $a->[0] <=> $b->[0] } map { [$order{$_->id()}, $_] } @seq;
481 my $aln = $self->new;
482 foreach (@sorted) { $aln->add_seq($_) }
486 =head2 set_new_reference
488 Title : set_new_reference
489 Usage : $aln->set_new_reference(3 or 'B31'): Select the 3rd sequence, or
490 the sequence whoes name is "B31" (full, exact, and case-sensitive),
491 as the reference (1st) sequence
492 Function : Change/Set a new reference (i.e., the first) sequence
493 Returns : a new Bio::SimpleAlign object.
494 Throws an exception if designated sequence not found
495 Argument : a positive integer of sequence order, or a sequence name
496 in the original alignment
500 sub set_new_reference
{
501 my ($self, $seqid) = @_;
502 my $aln = $self->new;
503 my (@seq, @ids, @new_seq);
505 foreach my $seq ( $self->each_seq() ) {
507 push @ids, $seq->display_id;
510 if ($seqid =~ /^\d+$/) { # argument is seq position
512 $self->throw("The new reference sequence number has to be a positive integer >1 and <= no_sequences ") if ($seqid <= 1 || $seqid > $self->no_sequences);
513 } else { # argument is a seq name
514 $self->throw("The new reference sequence not in alignment ") unless &_in_aln
($seqid, \
@ids);
517 for (my $i=0; $i<=$#seq; $i++) {
519 if ( ($is_num && $pos == $seqid) || ($seqid eq $seq[$i]->display_id) ) {
520 unshift @new_seq, $seq[$i];
522 push @new_seq, $seq[$i];
525 foreach (@new_seq) { $aln->add_seq($_); }
529 sub _in_aln
{ # check if input name exists in the alignment
530 my ($str, $ref) = @_;
532 return 1 if $str eq $_;
541 Usage : $aln->uniq_seq(): Remove identical sequences in
542 in the alignment. Ambiguous base ("N", "n") and
543 leading and ending gaps ("-") are NOT counted as
545 Function : Make a new alignment of unique sequence types (STs)
546 Returns : 1. a new Bio::SimpleAlign object (all sequences renamed as "ST")
547 2. ST of each sequence in STDERR
553 my ($self, $seqid) = @_;
554 my $aln = $self->new;
555 my (%member, %order, @seq, @uniq_str);
557 my $len = $self->length();
558 foreach my $seq ( $self->each_seq() ) {
559 my $str = $seq->seq();
561 # it's necessary to ignore "n", "N", leading gaps and ending gaps in
562 # comparing two sequence strings
564 # 1st, convert "n", "N" to "?" (for DNA sequence only):
565 $str =~ s/n/\?/gi if $str =~ /^[atcgn-]+$/i;
566 # 2nd, convert leading and ending gaps to "?":
567 $str = &_convert_leading_ending_gaps
($str, '-', '?');
568 my $new = Bio
::LocatableSeq
->new(-id
=> $seq->id(),
569 -alphabet
=> $seq->alphabet,
571 -start
=> $seq->start,
577 foreach my $seq (@seq) {
578 my $str = $seq->seq();
579 my ($seen, $key) = &_check_uniq
($str, \
@uniq_str, $len);
580 if ($seen) { # seen before
581 my @memb = @
{$member{$key}};
583 $member{$key} = \
@memb;
585 push @uniq_str, $key;
587 $member{$key} = [ ($seq) ];
588 $order{$key} = $order;
592 foreach my $str (sort {$order{$a} <=> $order{$b}} keys %order) { # sort by input order
593 # convert leading/ending "?" back into "-" ("?" throws errors by SimpleAlign):
594 my $str2 = &_convert_leading_ending_gaps
($str, '?', '-');
595 # convert middle "?" back into "N" ("?" throws errors by SimpleAlign):
596 $str2 =~ s/\?/N/g if $str2 =~ /^[atcg\-\?]+$/i;
598 my $end=length($str2);
599 $end -= length($1) while $str2 =~ m/($gap+)/g;
600 my $new = Bio
::LocatableSeq
->new(-id
=>"ST".$order{$str},
606 # print STDERR "ST".$order{$str}, "\t=>";
607 foreach (@
{$member{$str}}) {
608 $self->debug($_->id(), "\t", "ST", $order{$str}, "\n");
615 sub _check_uniq
{ # check if same seq exists in the alignment
616 my ($str1, $ref, $length) = @_;
617 my @char1=split //, $str1;
620 return (0, $str1) if @array==0; # not seen (1st sequence)
622 foreach my $str2 (@array) {
624 my @char2=split //, $str2;
625 for (my $i=0; $i<=$length-1; $i++) {
626 next if $char1[$i] eq '?';
627 next if $char2[$i] eq '?';
628 $diff++ if $char1[$i] ne $char2[$i];
630 return (1, $str2) if $diff == 0; # seen before
633 return (0, $str1); # not seen
636 sub _convert_leading_ending_gaps
{
640 my @array=split //, $s;
641 # convert leading char:
642 for (my $i=0; $i<=$#array; $i++) {
643 ($array[$i] eq $sym1) ?
($array[$i] = $sym2):(last);
645 # convert ending char:
646 for (my $i = $#array; $i>= 0; $i--) {
647 ($array[$i] eq $sym1) ?
($array[$i] = $sym2):(last);
649 my $s_new=join '', @array;
653 =head1 Sequence selection methods
655 Methods returning one or more sequences objects.
660 Usage : foreach $seq ( $align->each_seq() )
661 Function : Gets a Seq object from the alignment
669 $self->deprecated("eachSeq - deprecated method. Use each_seq() instead.");
677 foreach $order ( sort { $a <=> $b } keys %{$self->{'_order'}} ) {
678 if( exists $self->{'_seq'}->{$self->{'_order'}->{$order}} ) {
679 push(@arr,$self->{'_seq'}->{$self->{'_order'}->{$order}});
686 =head2 each_alphabetically
688 Title : each_alphabetically
689 Usage : foreach $seq ( $ali->each_alphabetically() )
690 Function : Returns a sequence object, but the objects are returned
691 in alphabetically sorted order.
692 Does not change the order of the alignment.
698 sub each_alphabetically
{
700 my ($seq,$nse,@arr,%hash,$count);
702 foreach $seq ( $self->each_seq() ) {
703 $nse = $seq->get_nse;
707 foreach $nse ( sort _alpha_startend
keys %hash) {
708 push(@arr,$hash{$nse});
713 sub _alpha_startend
{
714 my ($aname,$astart,$bname,$bstart);
715 ($aname,$astart) = split (/-/,$a);
716 ($bname,$bstart) = split (/-/,$b);
718 if( $aname eq $bname ) {
719 return $astart <=> $bstart;
722 return $aname cmp $bname;
726 =head2 each_seq_with_id
728 Title : each_seq_with_id
729 Usage : foreach $seq ( $align->each_seq_with_id() )
730 Function : Gets a Seq objects from the alignment, the contents
731 being those sequences with the given name (there may be
734 Argument : a seq name
740 $self->deprecated("eachSeqWithId - deprecated method. Use each_seq_with_id() instead.");
741 $self->each_seq_with_id(@_);
744 sub each_seq_with_id
{
748 $self->throw("Method each_seq_with_id needs a sequence name argument")
753 if (exists($self->{'_start_end_lists'}->{$id})) {
754 @arr = @
{$self->{'_start_end_lists'}->{$id}};
759 =head2 get_seq_by_pos
761 Title : get_seq_by_pos
762 Usage : $seq = $aln->get_seq_by_pos(3) # third sequence from the alignment
763 Function : Gets a sequence based on its position in the alignment.
764 Numbering starts from 1. Sequence positions larger than
765 no_sequences() will thow an error.
766 Returns : a Bio::LocatableSeq object
767 Args : positive integer for the sequence position
776 $self->throw("Sequence position has to be a positive integer, not [$pos]")
777 unless $pos =~ /^\d+$/ and $pos > 0;
778 $self->throw("No sequence at position [$pos]")
779 unless $pos <= $self->no_sequences ;
781 my $nse = $self->{'_order'}->{--$pos};
782 return $self->{'_seq'}->{$nse};
787 Title : get_seq_by_id
788 Usage : $seq = $aln->get_seq_by_id($name) # seq named $name
789 Function : Gets a sequence based on its name.
790 Sequences that do not exist will warn and return undef
791 Returns : a Bio::LocatableSeq object
792 Args : string for sequence name
797 my ($self,$name) = @_;
798 unless( defined $name ) {
799 $self->warn("Must provide a sequence name");
802 for my $seq ( values %{$self->{'_seq'}} ) {
803 if ( $seq->id eq $name) {
810 =head2 seq_with_features
812 Title : seq_with_features
813 Usage : $seq = $aln->seq_with_features(-pos => 1,
816 sub { my $consensus = shift;
821 while($consensus =~ /[^?]$q[^?]/){
822 $consensus =~ s/([^?])$q([^?])/$1$n$2/;
828 Function: produces a Bio::Seq object by first splicing gaps from -pos
829 (by means of a splice_by_seq_pos() call), then creating
830 features using non-? chars (by means of a consensus_string()
831 call with stringency -consensus).
832 Returns : a Bio::Seq object
833 Args : -pos : required. sequence from which to build the Bio::Seq
835 -consensus : optional, defaults to consensus_string()'s
837 -mask : optional, a coderef to apply to consensus_string()'s
838 output before building features. this may be useful for
839 closing gaps of 1 bp by masking over them with N, for
844 sub seq_with_features
{
845 my ($self,%arg) = @_;
847 #first do the preparatory splice
848 $self->throw("must provide a -pos argument") unless $arg{-pos};
849 $self->splice_by_seq_pos($arg{-pos});
851 my $consensus_string = $self->consensus_string($arg{-consensus
});
852 $consensus_string = $arg{-mask
}->($consensus_string)
853 if defined($arg{-mask
});
857 push @bs, 1 if $consensus_string =~ /^[^?]/;
859 while($consensus_string =~ /\?[^?]/g){
860 push @bs, pos($consensus_string);
862 while($consensus_string =~ /[^?]\?/g){
863 push @es, pos($consensus_string);
866 push @es, length($consensus_string) if $consensus_string =~ /[^?]$/;
868 my $seq = Bio
::Seq
->new();
870 # my $rootfeature = Bio::SeqFeature::Generic->new(
871 # -source_tag => 'location',
872 # -start => $self->get_seq_by_pos($arg{-pos})->start,
873 # -end => $self->get_seq_by_pos($arg{-pos})->end,
875 # $seq->add_SeqFeature($rootfeature);
877 while(my $b = shift @bs){
879 $seq->add_SeqFeature(
880 Bio
::SeqFeature
::Generic
->new(
881 -start
=> $b - 1 + $self->get_seq_by_pos($arg{-pos})->start,
882 -end
=> $e - 1 + $self->get_seq_by_pos($arg{-pos})->start,
883 -source_tag
=> $self->source || 'MSA',
892 =head1 Create new alignments
894 The result of these methods are horizontal or vertical subsets of the
900 Usage : $aln2 = $aln->select(1, 3) # three first sequences
901 Function : Creates a new alignment from a continuous subset of
902 sequences. Numbering starts from 1. Sequence positions
903 larger than no_sequences() will thow an error.
904 Returns : a Bio::SimpleAlign object
905 Args : positive integer for the first sequence
906 positive integer for the last sequence to include (optional)
912 my ($start, $end) = @_;
914 $self->throw("Select start has to be a positive integer, not [$start]")
915 unless $start =~ /^\d+$/ and $start > 0;
916 $self->throw("Select end has to be a positive integer, not [$end]")
917 unless $end =~ /^\d+$/ and $end > 0;
918 $self->throw("Select $start [$start] has to be smaller than or equal to end [$end]")
919 unless $start <= $end;
921 my $aln = $self->new;
922 foreach my $pos ($start .. $end) {
923 $aln->add_seq($self->get_seq_by_pos($pos));
926 # fix for meta, sf, ann
930 =head2 select_noncont
932 Title : select_noncont
933 Usage : # 1st and 3rd sequences, sorted
934 $aln2 = $aln->select_noncont(1, 3)
936 # 1st and 3rd sequences, sorted (same as first)
937 $aln2 = $aln->select_noncont(3, 1)
939 # 1st and 3rd sequences, unsorted
940 $aln2 = $aln->select_noncont('nosort',3, 1)
942 Function : Creates a new alignment from a subset of sequences. Numbering
943 starts from 1. Sequence positions larger than no_sequences() will
944 throw an error. Sorts the order added to new alignment by default,
945 to prevent sorting pass 'nosort' as the first argument in the list.
946 Returns : a Bio::SimpleAlign object
947 Args : array of integers for the sequences. If the string 'nosort' is
948 passed as the first argument, the sequences will not be sorted
949 in the new alignment but will appear in the order listed.
957 if ($pos[0] !~ m{^\d+$}) {
958 my $sortcmd = shift @pos;
959 if ($sortcmd eq 'nosort') {
962 $self->throw("Command not recognized: $sortcmd. Only 'nosort' implemented at this time.");
966 my $end = $self->no_sequences;
968 $self->throw("position must be a positive integer, > 0 and <= $end not [$_]")
969 unless( /^\d+$/ && $_ > 0 && $_ <= $end );
972 @pos = sort {$a <=> $b} @pos unless $nosort;
974 my $aln = $self->new;
975 foreach my $p (@pos) {
976 $aln->add_seq($self->get_seq_by_pos($p));
979 # fix for meta, sf, ann
986 Usage : $aln2 = $aln->slice(20,30)
987 Function : Creates a slice from the alignment inclusive of start and
988 end columns, and the first column in the alignment is denoted 1.
989 Sequences with no residues in the slice are excluded from the
990 new alignment and a warning is printed. Slice beyond the length of
991 the sequence does not do padding.
992 Returns : A Bio::SimpleAlign object
993 Args : Positive integer for start column, positive integer for end column,
994 optional boolean which if true will keep gap-only columns in the newly
995 created slice. Example:
997 $aln2 = $aln->slice(20,30,1)
1003 my ($start, $end, $keep_gap_only) = @_;
1005 $self->throw("Slice start has to be a positive integer, not [$start]")
1006 unless $start =~ /^\d+$/ and $start > 0;
1007 $self->throw("Slice end has to be a positive integer, not [$end]")
1008 unless $end =~ /^\d+$/ and $end > 0;
1009 $self->throw("Slice start [$start] has to be smaller than or equal to end [$end]")
1010 unless $start <= $end;
1011 $self->throw("This alignment has only ". $self->length . " residues. Slice start " .
1012 "[$start] is too big.") if $start > $self->length;
1013 my $cons_meta = $self->consensus_meta;
1014 my $aln = $self->new;
1015 $aln->id($self->id);
1016 foreach my $seq ( $self->each_seq() ) {
1017 my $new_seq = $seq->isa('Bio::Seq::MetaI') ?
1020 -alphabet
=> $seq->alphabet,
1021 -strand
=> $seq->strand,
1022 -verbose
=> $self->verbose) :
1023 Bio
::LocatableSeq
->new
1025 -alphabet
=> $seq->alphabet,
1026 -strand
=> $seq->strand,
1027 -verbose
=> $self->verbose);
1031 $seq_end = $seq->length if( $end > $seq->length );
1033 my $slice_seq = $seq->subseq($start, $seq_end);
1034 $new_seq->seq( $slice_seq );
1036 $slice_seq =~ s/\W//g;
1039 my $pre_start_seq = $seq->subseq(1, $start - 1);
1040 $pre_start_seq =~ s/\W//g;
1041 if (!defined($seq->strand)) {
1042 $new_seq->start( $seq->start + CORE
::length($pre_start_seq) );
1043 } elsif ($seq->strand < 0){
1044 $new_seq->start( $seq->end - CORE
::length($pre_start_seq) - CORE
::length($slice_seq) + 1);
1046 $new_seq->start( $seq->start + CORE
::length($pre_start_seq) );
1049 $new_seq->start( $seq->start);
1051 if ($new_seq->isa('Bio::Seq::MetaI')) {
1052 for my $meta_name ($seq->meta_names) {
1053 $new_seq->named_meta($meta_name, $seq->named_submeta($meta_name, $start, $end));
1056 $new_seq->end( $new_seq->start + CORE
::length($slice_seq) - 1 );
1058 if ($new_seq->start and $new_seq->end >= $new_seq->start) {
1059 $aln->add_seq($new_seq);
1061 if( $keep_gap_only ) {
1062 $aln->add_seq($new_seq);
1064 my $nse = $seq->get_nse();
1065 $self->warn("Slice [$start-$end] of sequence [$nse] contains no residues.".
1066 " Sequence excluded from the new alignment.");
1071 my $new = Bio
::Seq
::Meta
->new();
1072 for my $meta_name ($cons_meta->meta_names) {
1073 $new->named_meta($meta_name, $cons_meta->named_submeta($meta_name, $start, $end));
1075 $aln->consensus_meta($new);
1077 $aln->annotation($self->annotation);
1078 # fix for meta, sf, ann
1082 =head2 remove_columns
1084 Title : remove_columns
1085 Usage : $aln2 = $aln->remove_columns(['mismatch','weak']) or
1086 $aln2 = $aln->remove_columns([0,0],[6,8])
1087 Function : Creates an aligment with columns removed corresponding to
1088 the specified type or by specifying the columns by number.
1089 Returns : Bio::SimpleAlign object
1090 Args : Array ref of types ('match'|'weak'|'strong'|'mismatch'|'gaps'|
1091 'all_gaps_columns') or array ref where the referenced array
1092 contains a pair of integers that specify a range.
1093 The first column is 0,
1097 sub remove_columns
{
1098 my ($self,@args) = @_;
1099 @args || return $self;
1102 if ($args[0][0] =~ /^[a-z_]+$/i) {
1103 $aln = $self->_remove_columns_by_type($args[0]);
1104 } elsif ($args[0][0] =~ /^\d+$/) {
1105 $aln = $self->_remove_columns_by_num(\
@args);
1107 $self->throw("You must pass array references to remove_columns(), not @args");
1109 # fix for meta, sf, ann
1117 Usage : $aln2 = $aln->remove_gaps
1118 Function : Creates an aligment with gaps removed
1119 Returns : a Bio::SimpleAlign object
1120 Args : a gap character(optional) if none specified taken
1121 from $self->gap_char,
1122 [optional] $all_gaps_columns flag (1 or 0, default is 0)
1123 indicates that only all-gaps columns should be deleted
1125 Used from method L<remove_columns> in most cases. Set gap character
1126 using L<gap_char()|gap_char>.
1131 my ($self,$gapchar,$all_gaps_columns) = @_;
1133 if ($all_gaps_columns) {
1134 $gap_line = $self->all_gap_line($gapchar);
1136 $gap_line = $self->gap_line($gapchar);
1138 my $aln = $self->new;
1142 my $del_char = $gapchar || $self->gap_char;
1143 # Do the matching to get the segments to remove
1144 while ($gap_line =~ m/[$del_char]/g) {
1145 my $start = pos($gap_line)-1;
1146 $gap_line=~/\G[$del_char]+/gc;
1147 my $end = pos($gap_line)-1;
1149 #have to offset the start and end for subsequent removes
1152 $length += ($end-$start+1);
1153 push @remove, [$start,$end];
1156 #remove the segments
1157 $aln = $#remove >= 0 ?
$self->_remove_col($aln,\
@remove) : $self;
1158 # fix for meta, sf, ann
1164 my ($self,$aln,$remove) = @_;
1167 my $gap = $self->gap_char;
1169 # splice out the segments and create new seq
1170 foreach my $seq($self->each_seq){
1171 my $new_seq = Bio
::LocatableSeq
->new(
1173 -alphabet
=> $seq->alphabet,
1174 -strand
=> $seq->strand,
1175 -verbose
=> $self->verbose);
1176 my $sequence = $seq->seq;
1177 foreach my $pair(@
{$remove}){
1178 my $start = $pair->[0];
1179 my $end = $pair->[1];
1180 $sequence = $seq->seq unless $sequence;
1181 my $orig = $sequence;
1182 my $head = $start > 0 ?
substr($sequence, 0, $start) : '';
1183 my $tail = ($end + 1) >= length($sequence) ?
'' : substr($sequence, $end + 1);
1184 $sequence = $head.$tail;
1186 unless (defined $new_seq->start) {
1188 my $start_adjust = () = substr($orig, 0, $end + 1) =~ /$gap/g;
1189 $new_seq->start($seq->start + $end + 1 - $start_adjust);
1192 my $start_adjust = $orig =~ /^$gap+/;
1193 if ($start_adjust) {
1194 $start_adjust = $+[0] == $start;
1196 $new_seq->start($seq->start + $start_adjust);
1200 if (($end + 1) >= length($orig)) {
1201 my $end_adjust = () = substr($orig, $start) =~ /$gap/g;
1202 $new_seq->end($seq->end - (length($orig) - $start) + $end_adjust);
1205 $new_seq->end($seq->end);
1209 if ($new_seq->end < $new_seq->start) {
1210 # we removed all columns except for gaps: set to 0 to indicate no
1216 $new_seq->seq($sequence) if $sequence;
1217 push @new, $new_seq;
1219 # add the new seqs to the alignment
1220 foreach my $new(@new){
1221 $aln->add_seq($new);
1223 # fix for meta, sf, ann
1227 sub _remove_columns_by_type
{
1228 my ($self,$type) = @_;
1229 my $aln = $self->new;
1232 my $gap = $self->gap_char if (grep { $_ eq 'gaps'} @
{$type});
1233 my $all_gaps_columns = $self->gap_char if (grep /all_gaps_columns/,@
{$type});
1234 my %matchchars = ( 'match' => '\*',
1239 'all_gaps_columns' => ''
1241 # get the characters to delete against
1243 foreach my $type (@
{$type}){
1244 $del_char.= $matchchars{$type};
1248 my $match_line = $self->match_line;
1249 # do the matching to get the segments to remove
1251 while($match_line =~ m/[$del_char]/g ){
1252 my $start = pos($match_line)-1;
1253 $match_line=~/\G[$del_char]+/gc;
1254 my $end = pos($match_line)-1;
1256 #have to offset the start and end for subsequent removes
1259 $length += ($end-$start+1);
1260 push @remove, [$start,$end];
1264 # remove the segments
1265 $aln = $#remove >= 0 ?
$self->_remove_col($aln,\
@remove) : $self;
1266 $aln = $aln->remove_gaps() if $gap;
1267 $aln = $aln->remove_gaps('', 1) if $all_gaps_columns;
1268 # fix for meta, sf, ann
1273 sub _remove_columns_by_num
{
1274 my ($self,$positions) = @_;
1275 my $aln = $self->new;
1277 # sort the positions
1278 @
$positions = sort { $a->[0] <=> $b->[0] } @
$positions;
1282 foreach my $pos (@
{$positions}) {
1283 my ($start, $end) = @
{$pos};
1285 #have to offset the start and end for subsequent removes
1288 $length += ($end-$start+1);
1289 push @remove, [$start,$end];
1292 #remove the segments
1293 $aln = $#remove >= 0 ?
$self->_remove_col($aln,\
@remove) : $self;
1294 # fix for meta, sf, ann
1299 =head1 Change sequences within the MSA
1301 These methods affect characters in all sequences without changing the
1304 =head2 splice_by_seq_pos
1306 Title : splice_by_seq_pos
1307 Usage : $status = splice_by_seq_pos(1);
1308 Function: splices all aligned sequences where the specified sequence
1311 Returns : 1 on success
1312 Args : position of sequence to splice by
1317 sub splice_by_seq_pos
{
1318 my ($self,$pos) = @_;
1320 my $guide = $self->get_seq_by_pos($pos);
1321 my $guide_seq = $guide->seq;
1323 $guide_seq =~ s/\./\-/g;
1327 while(($pos = index($guide_seq, '-', $pos)) > -1 ){
1328 unshift @gaps, $pos;
1332 foreach my $seq ($self->each_seq){
1333 my @bases = split '', $seq->seq;
1335 splice(@bases, $_, 1) foreach @gaps;
1336 $seq->seq(join('', @bases));
1345 Usage : $ali->map_chars('\.','-')
1346 Function : Does a s/$arg1/$arg2/ on the sequences. Useful for gap
1349 Notice that the from (arg1) is interpretted as a regex,
1350 so be careful about quoting meta characters (eg
1351 $ali->map_chars('.','-') wont do what you want)
1353 Argument : 'from' rexexp
1364 $self->throw("Need exactly two arguments")
1365 unless defined $from and defined $to;
1367 foreach $seq ( $self->each_seq() ) {
1368 $temp = $seq->seq();
1369 $temp =~ s/$from/$to/g;
1379 Usage : $ali->uppercase()
1380 Function : Sets all the sequences to uppercase
1391 foreach $seq ( $self->each_seq() ) {
1392 $temp = $seq->seq();
1393 $temp =~ tr/[a-z]/[A-Z]/;
1402 Title : cigar_line()
1403 Usage : %cigars = $align->cigar_line()
1404 Function : Generates a "cigar" (Compact Idiosyncratic Gapped Alignment
1405 Report) line for each sequence in the alignment. Examples are
1406 "1,60" or "5,10:12,58", where the numbers refer to conserved
1407 positions within the alignment. The keys of the hash are the
1408 NSEs (name/start/end) assigned to each sequence.
1409 Args : threshold (optional, defaults to 100)
1410 Returns : Hash of strings (cigar lines)
1419 my @consensus = split "",($self->consensus_string($thr));
1420 my $len = $self->length;
1421 my $gapchar = $self->gap_char;
1423 # create a precursor, something like (1,4,5,6,7,33,45),
1424 # where each number corresponds to a conserved position
1425 foreach my $seq ( $self->each_seq ) {
1426 my @seq = split "", uc ($seq->seq);
1428 for (my $x = 0 ; $x < $len ; $x++ ) {
1429 if ($seq[$x] eq $consensus[$x]) {
1430 push @
{$cigars{$seq->get_nse}},$pos;
1432 } elsif ($seq[$x] ne $gapchar) {
1437 # duplicate numbers - (1,4,5,6,7,33,45) becomes (1,1,4,5,6,7,33,33,45,45)
1438 for my $name (keys %cigars) {
1439 splice @
{$cigars{$name}}, 1, 0, ${$cigars{$name}}[0] if
1440 ( ${$cigars{$name}}[0] + 1 < ${$cigars{$name}}[1] );
1441 push @
{$cigars{$name}}, ${$cigars{$name}}[$#{$cigars{$name}}] if
1442 ( ${$cigars{$name}}[($#{$cigars{$name}} - 1)] + 1 <
1443 ${$cigars{$name}}[$#{$cigars{$name}}] );
1444 for ( my $x = 1 ; $x < $#{$cigars{$name}} - 1 ; $x++) {
1445 if (${$cigars{$name}}[$x - 1] + 1 < ${$cigars{$name}}[$x] &&
1446 ${$cigars{$name}}[$x + 1] > ${$cigars{$name}}[$x] + 1) {
1447 splice @
{$cigars{$name}}, $x, 0, ${$cigars{$name}}[$x];
1451 # collapse series - (1,1,4,5,6,7,33,33,45,45) becomes (1,1,4,7,33,33,45,45)
1452 for my $name (keys %cigars) {
1454 for ( my $x = 0 ; $x < $#{$cigars{$name}} ; $x++) {
1455 if ( ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x - 1)] + 1 &&
1456 ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x + 1)] - 1 ) {
1460 for my $pos (@remove) {
1461 splice @
{$cigars{$name}}, $pos, 1;
1464 # join and punctuate
1465 for my $name (keys %cigars) {
1466 my ($start,$end,$str) = "";
1467 while ( ($start,$end) = splice @
{$cigars{$name}}, 0, 2 ) {
1468 $str .= ($start . "," . $end . ":");
1471 $cigars{$name} = $str;
1479 Title : match_line()
1480 Usage : $line = $align->match_line()
1481 Function : Generates a match line - much like consensus string
1482 except that a line indicating the '*' for a match.
1483 Args : (optional) Match line characters ('*' by default)
1484 (optional) Strong match char (':' by default)
1485 (optional) Weak match char ('.' by default)
1491 my ($self,$matchlinechar, $strong, $weak) = @_;
1492 my %matchchars = ('match' => $matchlinechar || '*',
1493 'weak' => $weak || '.',
1494 'strong' => $strong || ':',
1500 foreach my $seq ( $self->each_seq ) {
1501 push @seqchars, [ split(//, uc ($seq->seq)) ];
1502 $alphabet = $seq->alphabet unless defined $alphabet;
1504 my $refseq = shift @seqchars;
1505 # let's just march down the columns
1508 foreach my $pos ( 0..$self->length ) {
1509 my $refchar = $refseq->[$pos];
1510 my $char = $matchchars{'mismatch'};
1511 unless( defined $refchar ) {
1512 last if $pos == $self->length; # short circuit on last residue
1513 # this in place to handle jason's soon-to-be-committed
1514 # intron mapping code
1517 my %col = ($refchar => 1);
1518 my $dash = ($refchar eq '-' || $refchar eq '.' || $refchar eq ' ');
1519 foreach my $seq ( @seqchars ) {
1520 next if $pos >= scalar @
$seq;
1521 $dash = 1 if( $seq->[$pos] eq '-' || $seq->[$pos] eq '.' ||
1522 $seq->[$pos] eq ' ' );
1523 $col{$seq->[$pos]}++ if defined $seq->[$pos];
1525 my @colresidues = sort keys %col;
1527 # if all the values are the same
1528 if( $dash ) { $char = $matchchars{'mismatch'} }
1529 elsif( @colresidues == 1 ) { $char = $matchchars{'match'} }
1530 elsif( $alphabet eq 'protein' ) { # only try to do weak/strong
1531 # matches for protein seqs
1533 foreach my $type ( qw(strong weak) ) {
1534 # iterate through categories
1536 # iterate through each of the aa in the col
1537 # look to see which groups it is in
1538 foreach my $c ( @colresidues ) {
1539 foreach my $f ( grep { index($_,$c) >= 0 } @
{$CONSERVATION_GROUPS{$type}} ) {
1540 push @
{$groups{$f}},$c;
1544 foreach my $cols ( values %groups ) {
1545 @
$cols = sort @
$cols;
1546 # now we are just testing to see if two arrays
1547 # are identical w/o changing either one
1548 # have to be same len
1549 next if( scalar @
$cols != scalar @colresidues );
1550 # walk down the length and check each slot
1551 for($_=0;$_ < (scalar @
$cols);$_++ ) {
1552 next GRP
if( $cols->[$_] ne $colresidues[$_] );
1554 $char = $matchchars{$type};
1560 $matchline .= $char;
1569 Usage : $line = $align->gap_line()
1570 Function : Generates a gap line - much like consensus string
1571 except that a line where '-' represents gap
1572 Args : (optional) gap line characters ('-' by default)
1578 my ($self,$gapchar) = @_;
1579 $gapchar = $gapchar || $self->gap_char;
1580 my %gap_hsh; # column gaps vector
1581 foreach my $seq ( $self->each_seq ) {
1583 map {$gap_hsh{$_->[0]} = undef} grep {$_->[1] eq $gapchar}
1584 map {[$i++, $_]} split(//, uc ($seq->seq));
1587 foreach my $pos ( 0..$self->length-1 ) {
1588 $gap_line .= (exists $gap_hsh{$pos}) ?
$gapchar:'.';
1595 Title : all_gap_line()
1596 Usage : $line = $align->all_gap_line()
1597 Function : Generates a gap line - much like consensus string
1598 except that a line where '-' represents all-gap column
1599 Args : (optional) gap line characters ('-' by default)
1605 my ($self,$gapchar) = @_;
1606 $gapchar = $gapchar || $self->gap_char;
1607 my %gap_hsh; # column gaps counter hash
1608 my @seqs = $self->each_seq;
1609 foreach my $seq ( @seqs ) {
1611 map {$gap_hsh{$_->[0]}++} grep {$_->[1] eq $gapchar}
1612 map {[$i++, $_]} split(//, uc ($seq->seq));
1615 foreach my $pos ( 0..$self->length-1 ) {
1616 if (exists $gap_hsh{$pos} && $gap_hsh{$pos} == scalar @seqs) {
1618 $gap_line .= $gapchar;
1626 =head2 gap_col_matrix
1628 Title : gap_col_matrix()
1629 Usage : my $cols = $align->gap_col_matrix()
1630 Function : Generates an array of hashes where
1631 each entry in the array is a hash reference
1632 with keys of all the sequence names and
1633 and value of 1 or 0 if the sequence has a gap at that column
1634 Args : (optional) gap line characters ($aln->gap_char or '-' by default)
1638 sub gap_col_matrix
{
1639 my ($self,$gapchar) = @_;
1640 $gapchar = $gapchar || $self->gap_char;
1641 my %gap_hsh; # column gaps vector
1643 foreach my $seq ( $self->each_seq ) {
1645 my $str = $seq->seq;
1646 my $len = $seq->length;
1648 my $id = $seq->display_id;
1649 while( $i < $len ) {
1650 $ch = substr($str, $i, 1);
1651 $cols[$i++]->{$id} = ($ch eq $gapchar);
1660 Usage : $ali->match()
1661 Function : Goes through all columns and changes residues that are
1662 identical to residue in first sequence to match '.'
1663 character. Sets match_char.
1665 USE WITH CARE: Most MSA formats do not support match
1666 characters in sequences, so this is mostly for output
1667 only. NEXUS format (Bio::AlignIO::nexus) can handle
1670 Argument : a match character, optional, defaults to '.'
1675 my ($self, $match) = @_;
1678 my ($matching_char) = $match;
1679 $matching_char = "\\$match" if $match =~ /[\^.$|()\[\]]/ ; #';
1680 $self->map_chars($matching_char, '-');
1682 my @seqs = $self->each_seq();
1683 return 1 unless scalar @seqs > 1;
1685 my $refseq = shift @seqs ;
1686 my @refseq = split //, $refseq->seq;
1687 my $gapchar = $self->gap_char;
1689 foreach my $seq ( @seqs ) {
1690 my @varseq = split //, $seq->seq();
1691 for ( my $i=0; $i < scalar @varseq; $i++) {
1692 $varseq[$i] = $match if defined $refseq[$i] &&
1693 ( $refseq[$i] =~ /[A-Za-z\*]/ ||
1694 $refseq[$i] =~ /$gapchar/ )
1695 && $refseq[$i] eq $varseq[$i];
1697 $seq->seq(join '', @varseq);
1699 $self->match_char($match);
1707 Usage : $ali->unmatch()
1708 Function : Undoes the effect of method match. Unsets match_char.
1710 Argument : a match character, optional, defaults to '.'
1712 See L<match> and L<match_char>
1717 my ($self, $match) = @_;
1721 my @seqs = $self->each_seq();
1722 return 1 unless scalar @seqs > 1;
1724 my $refseq = shift @seqs ;
1725 my @refseq = split //, $refseq->seq;
1726 my $gapchar = $self->gap_char;
1727 foreach my $seq ( @seqs ) {
1728 my @varseq = split //, $seq->seq();
1729 for ( my $i=0; $i < scalar @varseq; $i++) {
1730 $varseq[$i] = $refseq[$i] if defined $refseq[$i] &&
1731 ( $refseq[$i] =~ /[A-Za-z\*]/ ||
1732 $refseq[$i] =~ /$gapchar/ ) &&
1733 $varseq[$i] eq $match;
1735 $seq->seq(join '', @varseq);
1737 $self->match_char('');
1741 =head1 MSA attributes
1743 Methods for setting and reading the MSA attributes.
1745 Note that the methods defining character semantics depend on the user
1746 to set them sensibly. They are needed only by certain input/output
1747 methods. Unset them by setting to an empty string ('').
1752 Usage : $myalign->id("Ig")
1753 Function : Gets/sets the id field of the alignment
1754 Returns : An id string
1755 Argument : An id string (optional)
1760 my ($self, $name) = @_;
1762 if (defined( $name )) {
1763 $self->{'_id'} = $name;
1766 return $self->{'_id'};
1772 Usage : $myalign->accession("PF00244")
1773 Function : Gets/sets the accession field of the alignment
1774 Returns : An acc string
1775 Argument : An acc string (optional)
1780 my ($self, $acc) = @_;
1782 if (defined( $acc )) {
1783 $self->{'_accession'} = $acc;
1786 return $self->{'_accession'};
1792 Usage : $myalign->description("14-3-3 proteins")
1793 Function : Gets/sets the description field of the alignment
1794 Returns : An description string
1795 Argument : An description string (optional)
1800 my ($self, $name) = @_;
1802 if (defined( $name )) {
1803 $self->{'_description'} = $name;
1806 return $self->{'_description'};
1811 Title : missing_char
1812 Usage : $myalign->missing_char("?")
1813 Function : Gets/sets the missing_char attribute of the alignment
1814 It is generally recommended to set it to 'n' or 'N'
1815 for nucleotides and to 'X' for protein.
1816 Returns : An missing_char string,
1817 Argument : An missing_char string (optional)
1822 my ($self, $char) = @_;
1824 if (defined $char ) {
1825 $self->throw("Single missing character, not [$char]!") if CORE
::length($char) > 1;
1826 $self->{'_missing_char'} = $char;
1829 return $self->{'_missing_char'};
1835 Usage : $myalign->match_char('.')
1836 Function : Gets/sets the match_char attribute of the alignment
1837 Returns : An match_char string,
1838 Argument : An match_char string (optional)
1843 my ($self, $char) = @_;
1845 if (defined $char ) {
1846 $self->throw("Single match character, not [$char]!") if CORE
::length($char) > 1;
1847 $self->{'_match_char'} = $char;
1850 return $self->{'_match_char'};
1856 Usage : $myalign->gap_char('-')
1857 Function : Gets/sets the gap_char attribute of the alignment
1858 Returns : An gap_char string, defaults to '-'
1859 Argument : An gap_char string (optional)
1864 my ($self, $char) = @_;
1866 if (defined $char || ! defined $self->{'_gap_char'} ) {
1867 $char= '-' unless defined $char;
1868 $self->throw("Single gap character, not [$char]!") if CORE
::length($char) > 1;
1869 $self->{'_gap_char'} = $char;
1871 return $self->{'_gap_char'};
1876 Title : symbol_chars
1877 Usage : my @symbolchars = $aln->symbol_chars;
1878 Function: Returns all the seen symbols (other than gaps)
1879 Returns : array of characters that are the seen symbols
1880 Args : boolean to include the gap/missing/match characters
1885 my ($self,$includeextra) = @_;
1887 unless ($self->{'_symbols'}) {
1888 foreach my $seq ($self->each_seq) {
1889 map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq);
1892 my %copy = %{$self->{'_symbols'}};
1893 if( ! $includeextra ) {
1894 foreach my $char ( $self->gap_char, $self->match_char,
1895 $self->missing_char) {
1896 delete $copy{$char} if( defined $char );
1902 =head1 Alignment descriptors
1904 These read only methods describe the MSA in various ways.
1910 Usage : $str = $ali->score()
1911 Function : get/set a score of the alignment
1912 Returns : a score for the alignment
1913 Argument : an optional score to set
1919 $self->{score
} = shift if @_;
1920 return $self->{score
};
1923 =head2 consensus_string
1925 Title : consensus_string
1926 Usage : $str = $ali->consensus_string($threshold_percent)
1927 Function : Makes a strict consensus
1928 Returns : Consensus string
1929 Argument : Optional treshold ranging from 0 to 100.
1930 The consensus residue has to appear at least threshold %
1931 of the sequences at a given location, otherwise a '?'
1932 character will be placed at that location.
1933 (Default value = 0%)
1937 sub consensus_string
{
1939 my $threshold = shift;
1942 my $len = $self->length - 1;
1944 foreach ( 0 .. $len ) {
1945 $out .= $self->_consensus_aa($_,$threshold);
1953 my $threshold_percent = shift || -1 ;
1954 my ($seq,%hash,$count,$letter,$key);
1955 my $gapchar = $self->gap_char;
1956 foreach $seq ( $self->each_seq() ) {
1957 $letter = substr($seq->seq,$point,1);
1958 $self->throw("--$point-----------") if $letter eq '';
1959 ($letter eq $gapchar || $letter =~ /\./) && next;
1960 # print "Looking at $letter\n";
1963 my $number_of_sequences = $self->no_sequences();
1964 my $threshold = $number_of_sequences * $threshold_percent / 100. ;
1968 foreach $key ( sort keys %hash ) {
1969 # print "Now at $key $hash{$key}\n";
1970 if( $hash{$key} > $count && $hash{$key} >= $threshold) {
1972 $count = $hash{$key};
1979 =head2 consensus_iupac
1981 Title : consensus_iupac
1982 Usage : $str = $ali->consensus_iupac()
1983 Function : Makes a consensus using IUPAC ambiguity codes from DNA
1984 and RNA. The output is in upper case except when gaps in
1985 a column force output to be in lower case.
1987 Note that if your alignment sequences contain a lot of
1988 IUPAC ambiquity codes you often have to manually set
1989 alphabet. Bio::PrimarySeq::_guess_type thinks they
1990 indicate a protein sequence.
1991 Returns : consensus string
1993 Throws : on protein sequences
1997 sub consensus_iupac
{
2000 my $len = $self->length-1;
2002 # only DNA and RNA sequences are valid
2003 foreach my $seq ( $self->each_seq() ) {
2004 $self->throw("Seq [". $seq->get_nse. "] is a protein")
2005 if $seq->alphabet eq 'protein';
2007 # loop over the alignment columns
2008 foreach my $count ( 0 .. $len ) {
2009 $out .= $self->_consensus_iupac($count);
2014 sub _consensus_iupac
{
2015 my ($self, $column) = @_;
2016 my ($string, $char, $rna);
2018 #determine all residues in a column
2019 foreach my $seq ( $self->each_seq() ) {
2020 $string .= substr($seq->seq, $column, 1);
2022 $string = uc $string;
2024 # quick exit if there's an N in the string
2025 if ($string =~ /N/) {
2026 $string =~ /\W/ ?
return 'n' : return 'N';
2028 # ... or if there are only gap characters
2029 return '-' if $string =~ /^\W+$/;
2031 # treat RNA as DNA in regexps
2032 if ($string =~ /U/) {
2037 # the following s///'s only need to be done to the _first_ ambiguity code
2038 # as we only need to see the _range_ of characters in $string
2040 if ($string =~ /[VDHB]/) {
2041 $string =~ s/V/AGC/;
2042 $string =~ s/D/AGT/;
2043 $string =~ s/H/ACT/;
2044 $string =~ s/B/CTG/;
2047 if ($string =~ /[SKYRWM]/) {
2056 # and now the guts of the thing
2058 if ($string =~ /A/) {
2060 if ($string =~ /G/) {
2061 $char = 'R'; # A and G (purines) R
2062 if ($string =~ /C/) {
2063 $char = 'V'; # A and G and C V
2064 if ($string =~ /T/) {
2065 $char = 'N'; # A and G and C and T N
2067 } elsif ($string =~ /T/) {
2068 $char = 'D'; # A and G and T D
2070 } elsif ($string =~ /C/) {
2071 $char = 'M'; # A and C M
2072 if ($string =~ /T/) {
2073 $char = 'H'; # A and C and T H
2075 } elsif ($string =~ /T/) {
2076 $char = 'W'; # A and T W
2078 } elsif ($string =~ /C/) {
2080 if ($string =~ /T/) {
2081 $char = 'Y'; # C and T (pyrimidines) Y
2082 if ($string =~ /G/) {
2083 $char = 'B'; # C and T and G B
2085 } elsif ($string =~ /G/) {
2086 $char = 'S'; # C and G S
2088 } elsif ($string =~ /G/) {
2090 if ($string =~ /C/) {
2091 $char = 'S'; # G and C S
2092 } elsif ($string =~ /T/) {
2093 $char = 'K'; # G and T K
2095 } elsif ($string =~ /T/) {
2099 $char = 'U' if $rna and $char eq 'T';
2100 $char = lc $char if $string =~ /\W/;
2106 =head2 consensus_meta
2108 Title : consensus_meta
2109 Usage : $seqmeta = $ali->consensus_meta()
2110 Function : Returns a Bio::Seq::Meta object containing the consensus
2111 strings derived from meta data analysis.
2112 Returns : Bio::Seq::Meta
2113 Argument : Bio::Seq::Meta
2114 Throws : non-MetaI object
2118 sub consensus_meta
{
2119 my ($self, $meta) = @_;
2120 if ($meta && (!ref $meta || !$meta->isa('Bio::Seq::MetaI'))) {
2121 $self->throw('Not a Bio::Seq::MetaI object');
2123 return $self->{'_aln_meta'} = $meta if $meta;
2124 return $self->{'_aln_meta'}
2130 Usage : if ( $ali->is_flush() )
2131 Function : Tells you whether the alignment
2132 : is flush, i.e. all of the same length
2139 my ($self,$report) = @_;
2144 foreach $seq ( $self->each_seq() ) {
2145 if( $length == (-1) ) {
2146 $length = CORE
::length($seq->seq());
2150 $temp = CORE
::length($seq->seq());
2151 if( $temp != $length ) {
2152 $self->warn("expecting $length not $temp from ".
2153 $seq->display_id) if( $report );
2154 $self->debug("expecting $length not $temp from ".
2156 $self->debug($seq->seq(). "\n");
2168 Usage : $len = $ali->length()
2169 Function : Returns the maximum length of the alignment.
2170 To be sure the alignment is a block, use is_flush
2178 $self->deprecated("length_aln - deprecated method. Use length() instead.");
2188 foreach $seq ( $self->each_seq() ) {
2189 $temp = $seq->length();
2190 if( $temp > $length ) {
2199 =head2 maxdisplayname_length
2201 Title : maxdisplayname_length
2202 Usage : $ali->maxdisplayname_length()
2203 Function : Gets the maximum length of the displayname in the
2204 alignment. Used in writing out various MSA formats.
2210 sub maxname_length
{
2212 $self->deprecated("maxname_length - deprecated method.".
2213 " Use maxdisplayname_length() instead.");
2214 $self->maxdisplayname_length();
2219 $self->deprecated("maxnse_length - deprecated method.".
2220 " Use maxnse_length() instead.");
2221 $self->maxdisplayname_length();
2224 sub maxdisplayname_length
{
2229 foreach $seq ( $self->each_seq() ) {
2230 $len = CORE
::length $self->displayname($seq->get_nse());
2232 if( $len > $maxname ) {
2240 =head2 max_metaname_length
2242 Title : max_metaname_length
2243 Usage : $ali->max_metaname_length()
2244 Function : Gets the maximum length of the meta name tags in the
2245 alignment for the sequences and for the alignment.
2246 Used in writing out various MSA formats.
2252 sub max_metaname_length
{
2257 # check seq meta first
2258 for $seq ( $self->each_seq() ) {
2259 next if !$seq->isa('Bio::Seq::MetaI' || !$seq->meta_names);
2260 for my $mtag ($seq->meta_names) {
2261 $len = CORE
::length $mtag;
2262 if( $len > $maxname ) {
2269 for my $meta ($self->consensus_meta) {
2271 for my $name ($meta->meta_names) {
2272 $len = CORE
::length $name;
2273 if( $len > $maxname ) {
2285 Usage : $no = $ali->no_residues
2286 Function : number of residues in total in the alignment
2296 foreach my $seq ($self->each_seq) {
2297 my $str = $seq->seq();
2299 $count += ($str =~ s/[A-Za-z]//g);
2307 Title : no_sequences
2308 Usage : $depth = $ali->no_sequences
2309 Function : number of sequence in the sequence alignment
2318 return scalar($self->each_seq);
2322 =head2 average_percentage_identity
2324 Title : average_percentage_identity
2325 Usage : $id = $align->average_percentage_identity
2326 Function: The function uses a fast method to calculate the average
2327 percentage identity of the alignment
2328 Returns : The average percentage identity of the alignment
2330 Notes : This method implemented by Kevin Howe calculates a figure that is
2331 designed to be similar to the average pairwise identity of the
2332 alignment (identical in the absence of gaps), without having to
2333 explicitly calculate pairwise identities proposed by Richard Durbin.
2334 Validated by Ewan Birney ad Alex Bateman.
2338 sub average_percentage_identity
{
2339 my ($self,@args) = @_;
2341 my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
2342 'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
2344 my ($len, $total, $subtotal, $divisor, $subdivisor, @seqs, @countHashes);
2346 if (! $self->is_flush()) {
2347 $self->throw("All sequences in the alignment must be the same length");
2350 @seqs = $self->each_seq();
2351 $len = $self->length();
2353 # load the each hash with correct keys for existence checks
2355 for( my $index=0; $index < $len; $index++) {
2356 foreach my $letter (@alphabet) {
2357 $countHashes[$index]->{$letter} = 0;
2360 foreach my $seq (@seqs) {
2361 my @seqChars = split //, $seq->seq();
2362 for( my $column=0; $column < @seqChars; $column++ ) {
2363 my $char = uc($seqChars[$column]);
2364 if (exists $countHashes[$column]->{$char}) {
2365 $countHashes[$column]->{$char}++;
2372 for(my $column =0; $column < $len; $column++) {
2373 my %hash = %{$countHashes[$column]};
2375 foreach my $res (keys %hash) {
2376 $total += $hash{$res}*($hash{$res} - 1);
2377 $subdivisor += $hash{$res};
2379 $divisor += $subdivisor * ($subdivisor - 1);
2381 return $divisor > 0 ?
($total / $divisor )*100.0 : 0;
2384 =head2 percentage_identity
2386 Title : percentage_identity
2387 Usage : $id = $align->percentage_identity
2388 Function: The function calculates the average percentage identity
2389 (aliased to average_percentage_identity)
2390 Returns : The average percentage identity
2395 sub percentage_identity
{
2397 return $self->average_percentage_identity();
2400 =head2 overall_percentage_identity
2402 Title : overall_percentage_identity
2403 Usage : $id = $align->overall_percentage_identity
2404 $id = $align->overall_percentage_identity('short')
2405 Function: The function calculates the percentage identity of
2406 the conserved columns
2407 Returns : The percentage identity of the conserved columns
2408 Args : length value to use, optional defaults to alignment length
2409 possible values: 'align', 'short', 'long'
2411 The argument values 'short' and 'long' refer to shortest and longest
2412 sequence in the alignment. Method modification code by Hongyu Zhang.
2416 sub overall_percentage_identity
{
2417 my ($self, $length_measure) = @_;
2419 my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
2420 'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
2422 my ($len, $total, @seqs, @countHashes);
2424 my %enum = map {$_ => 1} qw
(align short long
);
2426 $self->throw("Unknown argument [$length_measure]")
2427 if $length_measure and not $enum{$length_measure};
2428 $length_measure ||= 'align';
2430 if (! $self->is_flush()) {
2431 $self->throw("All sequences in the alignment must be the same length");
2434 @seqs = $self->each_seq();
2435 $len = $self->length();
2437 # load the each hash with correct keys for existence checks
2438 for( my $index=0; $index < $len; $index++) {
2439 foreach my $letter (@alphabet) {
2440 $countHashes[$index]->{$letter} = 0;
2443 foreach my $seq (@seqs) {
2444 my @seqChars = split //, $seq->seq();
2445 for( my $column=0; $column < @seqChars; $column++ ) {
2446 my $char = uc($seqChars[$column]);
2447 if (exists $countHashes[$column]->{$char}) {
2448 $countHashes[$column]->{$char}++;
2454 for(my $column =0; $column < $len; $column++) {
2455 my %hash = %{$countHashes[$column]};
2456 foreach ( values %hash ) {
2458 $total++ if( $_ == scalar @seqs );
2463 if ($length_measure eq 'short') {
2464 ## find the shortest length
2466 foreach my $seq ($self->each_seq) {
2467 my $count = $seq->seq =~ tr/[A-Za-z]//;
2469 $len = $count if $count < $len;
2475 elsif ($length_measure eq 'long') {
2476 ## find the longest length
2478 foreach my $seq ($self->each_seq) {
2479 my $count = $seq->seq =~ tr/[A-Za-z]//;
2481 $len = $count if $count > $len;
2488 return ($total / $len ) * 100.0;
2493 =head1 Alignment positions
2495 Methods to map a sequence position into an alignment column and back.
2496 column_from_residue_number() does the former. The latter is really a
2497 property of the sequence object and can done using
2498 L<Bio::LocatableSeq::location_from_column>:
2500 # select somehow a sequence from the alignment, e.g.
2501 my $seq = $aln->get_seq_by_pos(1);
2502 #$loc is undef or Bio::LocationI object
2503 my $loc = $seq->location_from_column(5);
2505 =head2 column_from_residue_number
2507 Title : column_from_residue_number
2508 Usage : $col = $ali->column_from_residue_number( $seqname, $resnumber)
2509 Function: This function gives the position in the alignment
2510 (i.e. column number) of the given residue number in the
2511 sequence with the given name. For example, for the
2514 Seq1/91-97 AC..DEF.GH.
2515 Seq2/24-30 ACGG.RTY...
2516 Seq3/43-51 AC.DDEF.GHI
2518 column_from_residue_number( "Seq1", 94 ) returns 6.
2519 column_from_residue_number( "Seq2", 25 ) returns 2.
2520 column_from_residue_number( "Seq3", 50 ) returns 10.
2522 An exception is thrown if the residue number would lie
2523 outside the length of the aligment
2524 (e.g. column_from_residue_number( "Seq2", 22 )
2526 Note: If the the parent sequence is represented by more than
2527 one alignment sequence and the residue number is present in
2528 them, this method finds only the first one.
2530 Returns : A column number for the position in the alignment of the
2531 given residue in the given sequence (1 = first column)
2532 Args : A sequence id/name (not a name/start-end)
2533 A residue number in the whole sequence (not just that
2534 segment of it in the alignment)
2538 sub column_from_residue_number
{
2539 my ($self, $name, $resnumber) = @_;
2541 $self->throw("No sequence with name [$name]") unless $self->{'_start_end_lists'}->{$name};
2542 $self->throw("Second argument residue number missing") unless $resnumber;
2544 foreach my $seq ($self->each_seq_with_id($name)) {
2547 $col = $seq->column_from_residue_number($resnumber);
2553 $self->throw("Could not find a sequence segment in $name ".
2554 "containing residue number $resnumber");
2558 =head1 Sequence names
2560 Methods to manipulate the display name. The default name based on the
2561 sequence id and subsequence positions can be overridden in various
2567 Usage : $myalign->displayname("Ig", "IgA")
2568 Function : Gets/sets the display name of a sequence in the alignment
2569 Returns : A display name string
2570 Argument : name of the sequence
2571 displayname of the sequence (optional)
2575 sub get_displayname
{
2577 $self->deprecated("get_displayname - deprecated method. Use displayname() instead.");
2578 $self->displayname(@_);
2581 sub set_displayname
{
2583 $self->deprecated("set_displayname - deprecated method. Use displayname() instead.");
2584 $self->displayname(@_);
2588 my ($self, $name, $disname) = @_;
2590 $self->throw("No sequence with name [$name]")
2591 unless defined $self->{'_seq'}->{$name};
2593 if( $disname and $name) {
2594 $self->{'_dis_name'}->{$name} = $disname;
2597 elsif( defined $self->{'_dis_name'}->{$name} ) {
2598 return $self->{'_dis_name'}->{$name};
2604 =head2 set_displayname_count
2606 Title : set_displayname_count
2607 Usage : $ali->set_displayname_count
2608 Function : Sets the names to be name_# where # is the number of
2609 times this name has been used.
2610 Returns : 1, on success
2615 sub set_displayname_count
{
2617 my (@arr,$name,$seq,$count,$temp,$nse);
2619 foreach $seq ( $self->each_alphabetically() ) {
2620 $nse = $seq->get_nse();
2622 #name will be set when this is the second
2623 #time (or greater) is has been seen
2625 if( defined $name and $name eq ($seq->id()) ) {
2626 $temp = sprintf("%s_%s",$name,$count);
2627 $self->displayname($nse,$temp);
2632 $temp = sprintf("%s_%s",$name,$count);
2633 $self->displayname($nse,$temp);
2640 =head2 set_displayname_flat
2642 Title : set_displayname_flat
2643 Usage : $ali->set_displayname_flat()
2644 Function : Makes all the sequences be displayed as just their name,
2651 sub set_displayname_flat
{
2655 foreach $seq ( $self->each_seq() ) {
2656 $nse = $seq->get_nse();
2657 $self->displayname($nse,$seq->id());
2662 =head2 set_displayname_normal
2664 Title : set_displayname_normal
2665 Usage : $ali->set_displayname_normal()
2666 Function : Makes all the sequences be displayed as name/start-end
2667 Returns : 1, on success
2672 sub set_displayname_normal
{
2676 foreach $seq ( $self->each_seq() ) {
2677 $nse = $seq->get_nse();
2678 $self->displayname($nse,$nse);
2686 Usage : $obj->source($newval)
2687 Function: sets the Alignment source program
2689 Returns : value of source
2690 Args : newvalue (optional)
2696 my ($self,$value) = @_;
2697 if( defined $value) {
2698 $self->{'_source'} = $value;
2700 return $self->{'_source'};
2703 =head2 set_displayname_safe
2705 Title : set_displayname_safe
2706 Usage : ($new_aln, $ref_name)=$ali->set_displayname_safe(4)
2707 Function : Assign machine-generated serial names to sequences in input order.
2708 Designed to protect names during PHYLIP runs. Assign 10-char string
2709 in the form of "S000000001" to "S999999999". Restore the original
2710 names using "restore_displayname".
2711 Returns : 1. a new $aln with system names;
2712 2. a hash ref for restoring names
2713 Argument : Number for id length (default 10)
2717 sub set_displayname_safe
{
2719 my $idlength = shift || 10;
2720 my ($seq, %phylip_name);
2722 my $new=Bio
::SimpleAlign
->new();
2723 foreach $seq ( $self->each_seq() ) {
2725 my $pname="S". sprintf "%0" . ($idlength-1) . "s", $ct;
2726 $phylip_name{$pname}=$seq->id();
2727 my $new_seq= Bio
::LocatableSeq
->new(-id
=> $pname,
2728 -seq
=> $seq->seq(),
2729 -alphabet
=> $seq->alphabet,
2730 -start
=> $seq->{_start
},
2731 -end
=> $seq->{_end
}
2733 $new->add_seq($new_seq);
2736 $self->debug("$ct seq names changed. Restore names by using restore_displayname.");
2737 return ($new, \
%phylip_name);
2740 =head2 restore_displayname
2742 Title : restore_displayname
2743 Usage : $aln_name_restored=$ali->restore_displayname($hash_ref)
2744 Function : Restore original sequence names (after running
2745 $ali->set_displayname_safe)
2746 Returns : a new $aln with names restored.
2747 Argument : a hash reference of names from "set_displayname_safe".
2751 sub restore_displayname
{
2755 my $new=Bio
::SimpleAlign
->new();
2756 foreach my $seq ( $self->each_seq() ) {
2757 $self->throw("No sequence with name") unless defined $name{$seq->id()};
2758 my $new_seq= Bio
::LocatableSeq
->new(-id
=> $name{$seq->id()},
2759 -seq
=> $seq->seq(),
2760 -alphabet
=> $seq->alphabet,
2761 -start
=> $seq->{_start
},
2762 -end
=> $seq->{_end
}
2764 $new->add_seq($new_seq);
2769 =head2 sort_by_start
2771 Title : sort_by_start
2772 Usage : $ali->sort_by_start
2773 Function : Changes the order of the alignment to the start position of each
2782 my ($seq,$nse,@arr,%hash,$count);
2783 foreach $seq ( $self->each_seq() ) {
2784 $nse = $seq->get_nse;
2788 %{$self->{'_order'}} = (); # reset the hash;
2789 foreach $nse ( sort _startend
keys %hash) {
2790 $self->{'_order'}->{$count} = $nse;
2798 my ($aname,$arange) = split (/[\/]/,$a);
2799 my ($bname,$brange) = split (/[\/]/,$b);
2800 my ($astart,$aend) = split(/\-/,$arange);
2801 my ($bstart,$bend) = split(/\-/,$brange);
2802 return $astart <=> $bstart;
2805 =head2 bracket_string
2807 Title : bracket_string
2808 Usage : my @params = (-refseq => 'testseq',
2809 -allele1 => 'allele1',
2810 -allele2 => 'allele2',
2811 -delimiters => '{}',
2813 $str = $aln->bracket_string(@params)
2815 Function : When supplied with a list of parameters (see below), returns a
2816 string in BIC format. This is used for allelic comparisons.
2817 Briefly, if either allele contains a base change when compared to
2818 the refseq, the base or gap for each allele is represented in
2819 brackets in the order present in the 'alleles' parameter.
2821 For the following data:
2830 the returned string with parameters 'refseq => testseq' and
2831 'alleles => [qw(allele1 allele2)]' would be:
2833 GGAT[C/-][C/-]ATT[C/C]CT[A/C]CT
2834 Returns : BIC-formatted string
2835 Argument : Required args
2836 refseq : string (ID) of the reference sequence used
2837 as basis for comparison
2838 allele1 : string (ID) of the first allele
2839 allele2 : string (ID) of the second allele
2841 delimiters: two symbol string of left and right delimiters.
2842 Only the first two symbols are used
2844 separator : string used as a separator. Only the first
2847 Throws : On no refseq/alleles, or invalid refseq/alleles.
2851 sub bracket_string
{
2852 my ($self, @args) = @_;
2853 my ($ref, $a1, $a2, $delim, $sep) =
2854 $self->_rearrange([qw(refseq allele1 allele2 delimiters separator)], @args);
2855 $self->throw('Missing refseq/allele1/allele2') if (!$a1 || !$a2 || !$ref);
2857 ($ld, $rd) = split('', $delim, 2) if $delim;
2861 my ($refseq, $allele1, $allele2) =
2862 map {( $self->each_seq_with_id($_) )} ($ref, $a1, $a2);
2863 if (!$refseq || !$allele1 || !$allele2) {
2864 $self->throw("One of your refseq/allele IDs is invalid!");
2866 my $len = $self->length-1;
2868 # loop over the alignment columns
2869 for my $column ( 0 .. $len ) {
2871 my ($compres, $res1, $res2) =
2872 map{substr($_->seq, $column, 1)} ($refseq, $allele1, $allele2);
2873 # are any of the allele symbols different from the refseq?
2874 $string = ($compres eq $res1 && $compres eq $res2) ?
$compres :
2875 $ld.$res1.$sep.$res2.$rd;
2882 =head2 methods for Bio::FeatureHolder
2884 FeatureHolder implementation to support labeled character sets like one
2885 would get from NEXUS represented data.
2887 =head2 get_SeqFeatures
2890 Function: Get the feature objects held by this feature holder.
2892 Returns : an array of Bio::SeqFeatureI implementing objects
2895 At some day we may want to expand this method to allow for a feature
2896 filter to be passed in.
2900 sub get_SeqFeatures
{
2903 if( !defined $self->{'_as_feat'} ) {
2904 $self->{'_as_feat'} = [];
2906 return @
{$self->{'_as_feat'}};
2909 =head2 add_SeqFeature
2911 Usage : $feat->add_SeqFeature($subfeat);
2912 $feat->add_SeqFeature($subfeat,'EXPAND')
2913 Function: adds a SeqFeature into the subSeqFeature array.
2914 with no 'EXPAND' qualifer, subfeat will be tested
2915 as to whether it lies inside the parent, and throw
2916 an exception if not.
2918 If EXPAND is used, the parent''s start/end/strand will
2919 be adjusted so that it grows to accommodate the new
2923 Args : a Bio::SeqFeatureI object
2927 sub add_SeqFeature
{
2928 my ($self,@feat) = @_;
2930 $self->{'_as_feat'} = [] unless $self->{'_as_feat'};
2932 foreach my $feat ( @feat ) {
2933 if( !$feat->isa("Bio::SeqFeatureI") ) {
2934 $self->throw("$feat is not a SeqFeatureI and that's what we expect...");
2937 push(@
{$self->{'_as_feat'}},$feat);
2943 =head2 remove_SeqFeatures
2945 Usage : $obj->remove_SeqFeatures
2946 Function: Removes all sub SeqFeatures. If you want to remove only a subset,
2947 remove that subset from the returned array, and add back the rest.
2948 Returns : The array of Bio::SeqFeatureI implementing sub-features that was
2949 deleted from this feature.
2954 sub remove_SeqFeatures
{
2957 return () unless $self->{'_as_feat'};
2958 my @feats = @
{$self->{'_as_feat'}};
2959 $self->{'_as_feat'} = [];
2963 =head2 feature_count
2965 Title : feature_count
2966 Usage : $obj->feature_count()
2967 Function: Return the number of SeqFeatures attached to a feature holder.
2969 This is before flattening a possible sub-feature tree.
2971 We provide a default implementation here that just counts
2972 the number of objects returned by get_SeqFeatures().
2973 Implementors may want to override this with a more
2974 efficient implementation.
2976 Returns : integer representing the number of SeqFeatures
2979 At some day we may want to expand this method to allow for a feature
2980 filter to be passed in.
2982 Our default implementation allows for any number of additional
2983 arguments and will pass them on to get_SeqFeatures(). I.e., in order to
2984 support filter arguments, just support them in get_SeqFeatures().
2989 if (defined($self->{'_as_feat'})) {
2990 return ($#{$self->{'_as_feat'}} + 1);
2996 =head2 get_all_SeqFeatures
2998 Title : get_all_SeqFeatures
3000 Function: Get the flattened tree of feature objects held by this
3001 feature holder. The difference to get_SeqFeatures is that
3002 the entire tree of sub-features will be flattened out.
3004 We provide a default implementation here, so implementors
3005 don''t necessarily need to implement this method.
3008 Returns : an array of Bio::SeqFeatureI implementing objects
3011 At some day we may want to expand this method to allow for a feature
3012 filter to be passed in.
3014 Our default implementation allows for any number of additional
3015 arguments and will pass them on to any invocation of
3016 get_SeqFeatures(), wherever a component of the tree implements
3017 FeatureHolderI. I.e., in order to support filter arguments, just
3018 support them in get_SeqFeatures().
3022 =head2 methods for Bio::AnnotatableI
3024 AnnotatableI implementation to support sequence alignments which
3025 contain annotation (NEXUS, Stockholm).
3030 Usage : $ann = $aln->annotation or
3031 $aln->annotation($ann)
3032 Function: Gets or sets the annotation
3033 Returns : Bio::AnnotationCollectionI object
3034 Args : None or Bio::AnnotationCollectionI object
3036 See L<Bio::AnnotationCollectionI> and L<Bio::Annotation::Collection>
3037 for more information
3042 my ($obj,$value) = @_;
3043 if( defined $value ) {
3044 $obj->throw("object of class ".ref($value)." does not implement ".
3045 "Bio::AnnotationCollectionI. Too bad.")
3046 unless $value->isa("Bio::AnnotationCollectionI");
3047 $obj->{'_annotation'} = $value;
3048 } elsif( ! defined $obj->{'_annotation'}) {
3049 $obj->{'_annotation'} = Bio
::Annotation
::Collection
->new();
3051 return $obj->{'_annotation'};