2 # BioPerl module for SimpleAlign
4 # Cared for by Heikki Lehvaslaiho <heikki-at-bioperl-dot-org>
6 # Copyright Ewan Birney
8 # You may distribute this module under the same terms as perl itself
10 # POD documentation - main docs before the code
13 # 11/3/00 Added threshold feature to consensus and consensus_aa - PS
14 # May 2001 major rewrite - Heikki Lehvaslaiho
18 Bio::SimpleAlign - Multiple alignments held as a set of sequences
22 # Use Bio::AlignIO to read in the alignment
23 $str = Bio::AlignIO->new(-file => 't/data/testaln.pfam');
24 $aln = $str->next_aln();
28 print $aln->no_residues;
30 print $aln->no_sequences;
32 print $aln->percentage_identity;
33 print $aln->consensus_string(50);
35 # Find the position in the alignment for a sequence location
36 $pos = $aln->column_from_residue_number('1433_LYCES', 14); # = 6;
38 # Extract sequences and check values for the alignment column $pos
39 foreach $seq ($aln->each_seq) {
40 $res = $seq->subseq($pos, $pos);
43 foreach $res (keys %count) {
44 printf "Res: %s Count: %2d\n", $res, $count{$res};
48 $aln->remove_seq($seq);
49 $mini_aln = $aln->slice(20,30); # get a block of columns
50 $mini_aln = $aln->select_noncont(1,3,5,7,11); # get single columns
51 $new_aln = $aln->remove_columns([20,30]); # remove by position
52 $new_aln = $aln->remove_columns(['mismatch']); # remove by property
55 $str = $aln->consensus_string($threshold_percent);
56 $str = $aln->match_line();
57 $str = $aln->cigar_line();
58 $id = $aln->percentage_identity;
60 # See the module documentation for details and more methods.
64 SimpleAlign is an object that handles a multiple sequence alignment
65 (MSA). It is very permissive of types (it does not insist on sequences
66 being all same length, for example). Think of it as a set of sequences
67 with a whole series of built-in manipulations and methods for reading and
70 SimpleAlign uses L<Bio::LocatableSeq>, a subclass of L<Bio::PrimarySeq>,
71 to store its sequences. These are subsequences with a start and end
72 positions in the parent reference sequence. Each sequence in the
73 SimpleAlign object is a Bio::LocatableSeq.
75 SimpleAlign expects the combination of name, start, and end for a
76 given sequence to be unique in the alignment, and this is the key for the
77 internal hashes (name, start, end are abbreviated C<nse> in the code).
78 However, in some cases people do not want the name/start-end to be displayed:
79 either multiple names in an alignment or names specific to the alignment
80 (ROA1_HUMAN_1, ROA1_HUMAN_2 etc). These names are called
81 C<displayname>, and generally is what is used to print out the
82 alignment. They default to name/start-end.
84 The SimpleAlign Module is derived from the Align module by Ewan Birney.
90 User feedback is an integral part of the evolution of this and other
91 Bioperl modules. Send your comments and suggestions preferably to one
92 of the Bioperl mailing lists. Your participation is much appreciated.
94 bioperl-l@bioperl.org - General discussion
95 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
99 Report bugs to the Bioperl bug tracking system to help us keep track
100 the bugs and their resolution. Bug reports can be submitted via the
103 http://bugzilla.open-bio.org/
107 Ewan Birney, birney@ebi.ac.uk
111 Allen Day, allenday-at-ucla.edu,
112 Richard Adams, Richard.Adams-at-ed.ac.uk,
113 David J. Evans, David.Evans-at-vir.gla.ac.uk,
114 Heikki Lehvaslaiho, heikki-at-bioperl-dot-org,
115 Allen Smith, allens-at-cpan.org,
116 Jason Stajich, jason-at-bioperl.org,
117 Anthony Underwood, aunderwood-at-phls.org.uk,
118 Xintao Wei & Giri Narasimhan, giri-at-cs.fiu.edu
119 Brian Osborne, bosborne at alum.mit.edu
120 Weigang Qiu, Weigang at GENECTR-HUNTER-CUNY-EDU
121 Hongyu Zhang, forward at hongyu.org
129 The rest of the documentation details each of the object
130 methods. Internal methods are usually preceded with a _
134 # 'Let the code begin...
136 package Bio
::SimpleAlign
;
137 use vars
qw(%CONSERVATION_GROUPS);
140 use Bio::LocatableSeq; # uses Seq's as list
143 use Bio::SeqFeature::Generic;
146 # This data should probably be in a more centralized module...
147 # it is taken from Clustalw documentation.
148 # These are all the positively scoring groups that occur in the
149 # Gonnet Pam250 matrix. The strong and weak groups are
150 # defined as strong score >0.5 and weak score =<0.5 respectively.
152 %CONSERVATION_GROUPS = (
177 use base
qw(Bio::Root::Root Bio::Align::AlignI Bio::AnnotatableI
178 Bio::FeatureHolderI);
183 Usage : my $aln = Bio::SimpleAlign->new();
184 Function : Creates a new simple align object
185 Returns : Bio::SimpleAlign
186 Args : -source => string representing the source program
187 where this alignment came from
188 -annotation => Bio::AnnotationCollectionI
189 -seq_annotation => Bio::AnnotationCollectionI for sequences (requires -annotation also be set)
190 -seqs => array ref containing Bio::LocatableSeq or Bio::Seq::Meta
191 -consensus => consensus string
192 -consensus_meta => Bio::Seq::Meta object containing consensus met information (kludge)
198 my($class,@args) = @_;
200 my $self = $class->SUPER::new
(@args);
202 my ($src, $score, $id, $acc, $desc, $seqs, $feats, $coll, $sa, $con, $cmeta) = $self->_rearrange([qw(
215 $src && $self->source($src);
216 defined $score && $self->score($score);
217 # we need to set up internal hashs first!
219 $self->{'_seq'} = {};
220 $self->{'_order'} = {};
221 $self->{'_start_end_lists'} = {};
222 $self->{'_dis_name'} = {};
223 $self->{'_id'} = 'NoName';
224 # maybe we should automatically read in from args. Hmmm...
225 $id && $self->id($id);
226 $acc && $self->accession($acc);
227 $desc && $self->description($desc);
228 $coll && $self->annotation($coll);
229 # sequence annotation is layered into a provided annotation collection (or dies)
231 $self->throw("Must supply an alignment-based annotation collection (-annotation) ".
232 "with a sequence annotation collection")
234 $coll->add_Annotation('seq_annotation', $sa);
236 if ($feats && ref $feats eq 'ARRAY') {
237 for my $feat (@
$feats) {
238 $self->add_SeqFeature($feat);
241 $con && $self->consensus($con);
242 $cmeta && $self->consensus_meta($cmeta);
243 # assumes these are in correct alignment order
244 if ($seqs && ref($seqs) eq 'ARRAY') {
245 for my $seq (@
$seqs) {
246 $self->add_seq($seq);
250 return $self; # success - we hope!
253 =head1 Modifier methods
255 These methods modify the MSA by adding, removing or shuffling complete
261 Usage : $myalign->add_seq($newseq);
262 Function : Adds another sequence to the alignment. *Does not* align
263 it - just adds it to the hashes.
265 Args : a Bio::LocatableSeq object
268 See L<Bio::LocatableSeq> for more information
274 $self->deprecated("addSeq - deprecated method. Use add_seq() instead.");
282 my ($name,$id,$start,$end);
284 if( ! ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
285 $self->throw("Unable to process non locatable sequences [". ref($seq). "]");
288 $id = $seq->id() ||$seq->display_id || $seq->primary_id;
290 # build the symbol list for this sequence,
291 # will prune out the gap and missing/match chars
292 # when actually asked for the symbol list in the
294 # map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq) if $seq->seq;
296 if( !defined $order ) {
297 $order = keys %{$self->{'_seq'}};
299 $name = $seq->get_nse;
301 if( $self->{'_seq'}->{$name} ) {
302 $self->warn("Replacing one sequence [$name]\n") unless $self->verbose < 0;
305 $self->debug( "Assigning $name to $order\n");
307 $self->{'_order'}->{$order} = $name;
309 unless( exists( $self->{'_start_end_lists'}->{$id})) {
310 $self->{'_start_end_lists'}->{$id} = [];
312 push @
{$self->{'_start_end_lists'}->{$id}}, $seq;
315 $self->{'_seq'}->{$name} = $seq;
323 Usage : $aln->remove_seq($seq);
324 Function : Removes a single sequence from an alignment
326 Argument : a Bio::LocatableSeq object
332 $self->deprecated("removeSeq - deprecated method. Use remove_seq() instead.");
333 $self->remove_seq(@_);
339 my ($name,$id,$start,$end);
341 $self->throw("Need Bio::Locatable seq argument ")
342 unless ref $seq && $seq->isa( 'Bio::LocatableSeq');
345 $start = $seq->start();
347 $name = sprintf("%s/%d-%d",$id,$start,$end);
349 if( !exists $self->{'_seq'}->{$name} ) {
350 $self->throw("Sequence $name does not exist in the alignment to remove!");
353 delete $self->{'_seq'}->{$name};
355 # we need to remove this seq from the start_end_lists hash
357 if (exists $self->{'_start_end_lists'}->{$id}) {
358 # we need to find the sequence in the array.
361 for ($i=0; $i < @
{$self->{'_start_end_lists'}->{$id}}; $i++) {
362 if (${$self->{'_start_end_lists'}->{$id}}[$i] eq $seq) {
368 splice @
{$self->{'_start_end_lists'}->{$id}}, $i, 1;
371 $self->throw("Could not find the sequence to remoce from the start-end list");
375 $self->throw("There is no seq list for the name $id");
377 # we need to shift order hash
378 my %rev_order = reverse %{$self->{'_order'}};
379 my $no = $rev_order{$name};
380 my $no_sequences = $self->no_sequences;
381 for (; $no < $no_sequences; $no++) {
382 $self->{'_order'}->{$no} = $self->{'_order'}->{$no+1};
384 delete $self->{'_order'}->{$no};
392 Usage : $aln->purge(0.7);
393 Function: Removes sequences above given sequence similarity
394 This function will grind on large alignments. Beware!
396 Returns : An array of the removed sequences
397 Args : float, threshold for similarity
402 my ($self,$perc) = @_;
403 my (%duplicate, @dups);
405 my @seqs = $self->each_seq();
407 for (my $i=0;$i< @seqs - 1;$i++ ) { #for each seq in alignment
410 #skip if already in duplicate hash
411 next if exists $duplicate{$seq->display_id} ;
412 my $one = $seq->seq();
414 my @one = split '', $one; #split to get 1aa per array element
416 for (my $j=$i+1;$j < @seqs;$j++) {
417 my $seq2 = $seqs[$j];
419 #skip if already in duplicate hash
420 next if exists $duplicate{$seq2->display_id} ;
422 my $two = $seq2->seq();
423 my @two = split '', $two;
427 for (my $k=0;$k<@one;$k++) {
428 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
429 $one[$k] eq $two[$k]) {
432 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
433 $two[$k] ne '.' && $two[$k] ne '-' ) {
439 $ratio = $count/$res unless $res == 0;
441 # if above threshold put in duplicate hash and push onto
442 # duplicate array for returning to get_unique
443 if ( $ratio > $perc ) {
444 $self->warn("duplicate: ", $seq2->display_id) if $self->verbose > 0;
445 $duplicate{$seq2->display_id} = 1;
450 foreach my $seq (@dups) {
451 $self->remove_seq($seq);
456 =head2 sort_alphabetically
458 Title : sort_alphabetically
459 Usage : $ali->sort_alphabetically
460 Function : Changes the order of the alignment to alphabetical on name
461 followed by numerical by number.
467 sub sort_alphabetically
{
469 my ($seq,$nse,@arr,%hash,$count);
471 foreach $seq ( $self->each_seq() ) {
472 $nse = $seq->get_nse;
478 %{$self->{'_order'}} = (); # reset the hash;
480 foreach $nse ( sort _alpha_startend
keys %hash) {
481 $self->{'_order'}->{$count} = $nse;
491 Usage : $aln_ordered=$aln->sort_by_list($list_file)
492 Function : Arbitrarily order sequences in an alignment
493 Returns : A new Bio::SimpleAlign object
494 Argument : a file listing sequence names in intended order (one name per line)
499 my ($self, $list) = @_;
500 my (@seq, @ids, %order);
502 foreach my $seq ( $self->each_seq() ) {
504 push @ids, $seq->display_id;
508 open(my $listfh, '<', $list) || $self->throw("can't open file for reading: $list");
512 $self->throw("Not found in alignment: $name") unless &_in_aln
($name, \
@ids);
517 # use the map-sort-map idiom:
518 my @sorted= map { $_->[1] } sort { $a->[0] <=> $b->[0] } map { [$order{$_->id()}, $_] } @seq;
519 my $aln = $self->new;
520 foreach (@sorted) { $aln->add_seq($_) }
524 =head2 set_new_reference
526 Title : set_new_reference
527 Usage : $aln->set_new_reference(3 or 'B31'): Select the 3rd sequence, or
528 the sequence whoes name is "B31" (full, exact, and case-sensitive),
529 as the reference (1st) sequence
530 Function : Change/Set a new reference (i.e., the first) sequence
531 Returns : a new Bio::SimpleAlign object.
532 Throws an exception if designated sequence not found
533 Argument : a positive integer of sequence order, or a sequence name
534 in the original alignment
538 sub set_new_reference
{
539 my ($self, $seqid) = @_;
540 my $aln = $self->new;
541 my (@seq, @ids, @new_seq);
543 foreach my $seq ( $self->each_seq() ) {
545 push @ids, $seq->display_id;
548 if ($seqid =~ /^\d+$/) { # argument is seq position
550 $self->throw("The new reference sequence number has to be a positive integer >1 and <= no_sequences ") if ($seqid <= 1 || $seqid > $self->no_sequences);
551 } else { # argument is a seq name
552 $self->throw("The new reference sequence not in alignment ") unless &_in_aln
($seqid, \
@ids);
555 for (my $i=0; $i<=$#seq; $i++) {
557 if ( ($is_num && $pos == $seqid) || ($seqid eq $seq[$i]->display_id) ) {
558 unshift @new_seq, $seq[$i];
560 push @new_seq, $seq[$i];
563 foreach (@new_seq) { $aln->add_seq($_); }
567 sub _in_aln
{ # check if input name exists in the alignment
568 my ($str, $ref) = @_;
570 return 1 if $str eq $_;
579 Usage : $aln->uniq_seq(): Remove identical sequences in
580 in the alignment. Ambiguous base ("N", "n") and
581 leading and ending gaps ("-") are NOT counted as
583 Function : Make a new alignment of unique sequence types (STs)
584 Returns : 1. a new Bio::SimpleAlign object (all sequences renamed as "ST")
585 2. ST of each sequence in STDERR
591 my ($self, $seqid) = @_;
592 my $aln = $self->new;
593 my (%member, %order, @seq, @uniq_str);
595 my $len = $self->length();
596 foreach my $seq ( $self->each_seq() ) {
597 my $str = $seq->seq();
599 # it's necessary to ignore "n", "N", leading gaps and ending gaps in
600 # comparing two sequence strings
602 # 1st, convert "n", "N" to "?" (for DNA sequence only):
603 $str =~ s/n/\?/gi if $str =~ /^[atcgn-]+$/i;
604 # 2nd, convert leading and ending gaps to "?":
605 $str = &_convert_leading_ending_gaps
($str, '-', '?');
606 # Note that '?' also can mean unknown residue.
607 # I don't like making global class member changes like this, too
608 # prone to errors... -- cjfields 08-11-18
609 local $Bio::LocatableSeq
::GAP_SYMBOLS
= '-\?';
610 my $new = Bio
::LocatableSeq
->new(
612 -alphabet
=> $seq->alphabet,
614 -start
=> $seq->start,
620 foreach my $seq (@seq) {
621 my $str = $seq->seq();
622 my ($seen, $key) = &_check_uniq
($str, \
@uniq_str, $len);
623 if ($seen) { # seen before
624 my @memb = @
{$member{$key}};
626 $member{$key} = \
@memb;
628 push @uniq_str, $key;
630 $member{$key} = [ ($seq) ];
631 $order{$key} = $order;
635 foreach my $str (sort {$order{$a} <=> $order{$b}} keys %order) { # sort by input order
636 # convert leading/ending "?" back into "-" ("?" throws errors by SimpleAlign):
637 my $str2 = &_convert_leading_ending_gaps
($str, '?', '-');
638 # convert middle "?" back into "N" ("?" throws errors by SimpleAlign):
639 $str2 =~ s/\?/N/g if $str2 =~ /^[atcg\-\?]+$/i;
641 my $end=length($str2);
642 $end -= length($1) while $str2 =~ m/($gap+)/g;
643 my $new = Bio
::LocatableSeq
->new(-id
=>"ST".$order{$str},
649 # print STDERR "ST".$order{$str}, "\t=>";
650 foreach (@
{$member{$str}}) {
651 $self->debug($_->id(), "\t", "ST", $order{$str}, "\n");
658 sub _check_uniq
{ # check if same seq exists in the alignment
659 my ($str1, $ref, $length) = @_;
660 my @char1=split //, $str1;
663 return (0, $str1) if @array==0; # not seen (1st sequence)
665 foreach my $str2 (@array) {
667 my @char2=split //, $str2;
668 for (my $i=0; $i<=$length-1; $i++) {
669 next if $char1[$i] eq '?';
670 next if $char2[$i] eq '?';
671 $diff++ if $char1[$i] ne $char2[$i];
673 return (1, $str2) if $diff == 0; # seen before
676 return (0, $str1); # not seen
679 sub _convert_leading_ending_gaps
{
683 my @array=split //, $s;
684 # convert leading char:
685 for (my $i=0; $i<=$#array; $i++) {
686 ($array[$i] eq $sym1) ?
($array[$i] = $sym2):(last);
688 # convert ending char:
689 for (my $i = $#array; $i>= 0; $i--) {
690 ($array[$i] eq $sym1) ?
($array[$i] = $sym2):(last);
692 my $s_new=join '', @array;
696 =head1 Sequence selection methods
698 Methods returning one or more sequences objects.
703 Usage : foreach $seq ( $align->each_seq() )
704 Function : Gets a Seq object from the alignment
712 $self->deprecated("eachSeq - deprecated method. Use each_seq() instead.");
720 foreach $order ( sort { $a <=> $b } keys %{$self->{'_order'}} ) {
721 if( exists $self->{'_seq'}->{$self->{'_order'}->{$order}} ) {
722 push(@arr,$self->{'_seq'}->{$self->{'_order'}->{$order}});
729 =head2 each_alphabetically
731 Title : each_alphabetically
732 Usage : foreach $seq ( $ali->each_alphabetically() )
733 Function : Returns a sequence object, but the objects are returned
734 in alphabetically sorted order.
735 Does not change the order of the alignment.
741 sub each_alphabetically
{
743 my ($seq,$nse,@arr,%hash,$count);
745 foreach $seq ( $self->each_seq() ) {
746 $nse = $seq->get_nse;
750 foreach $nse ( sort _alpha_startend
keys %hash) {
751 push(@arr,$hash{$nse});
756 sub _alpha_startend
{
757 my ($aname,$astart,$bname,$bstart);
758 ($aname,$astart) = split (/-/,$a);
759 ($bname,$bstart) = split (/-/,$b);
761 if( $aname eq $bname ) {
762 return $astart <=> $bstart;
765 return $aname cmp $bname;
769 =head2 each_seq_with_id
771 Title : each_seq_with_id
772 Usage : foreach $seq ( $align->each_seq_with_id() )
773 Function : Gets a Seq objects from the alignment, the contents
774 being those sequences with the given name (there may be
777 Argument : a seq name
783 $self->deprecated("eachSeqWithId - deprecated method. Use each_seq_with_id() instead.");
784 $self->each_seq_with_id(@_);
787 sub each_seq_with_id
{
791 $self->throw("Method each_seq_with_id needs a sequence name argument")
796 if (exists($self->{'_start_end_lists'}->{$id})) {
797 @arr = @
{$self->{'_start_end_lists'}->{$id}};
802 =head2 get_seq_by_pos
804 Title : get_seq_by_pos
805 Usage : $seq = $aln->get_seq_by_pos(3) # third sequence from the alignment
806 Function : Gets a sequence based on its position in the alignment.
807 Numbering starts from 1. Sequence positions larger than
808 no_sequences() will thow an error.
809 Returns : a Bio::LocatableSeq object
810 Args : positive integer for the sequence position
819 $self->throw("Sequence position has to be a positive integer, not [$pos]")
820 unless $pos =~ /^\d+$/ and $pos > 0;
821 $self->throw("No sequence at position [$pos]")
822 unless $pos <= $self->no_sequences ;
824 my $nse = $self->{'_order'}->{--$pos};
825 return $self->{'_seq'}->{$nse};
830 Title : get_seq_by_id
831 Usage : $seq = $aln->get_seq_by_id($name) # seq named $name
832 Function : Gets a sequence based on its name.
833 Sequences that do not exist will warn and return undef
834 Returns : a Bio::LocatableSeq object
835 Args : string for sequence name
840 my ($self,$name) = @_;
841 unless( defined $name ) {
842 $self->warn("Must provide a sequence name");
845 for my $seq ( values %{$self->{'_seq'}} ) {
846 if ( $seq->id eq $name) {
853 =head2 seq_with_features
855 Title : seq_with_features
856 Usage : $seq = $aln->seq_with_features(-pos => 1,
859 sub { my $consensus = shift;
864 while($consensus =~ /[^?]$q[^?]/){
865 $consensus =~ s/([^?])$q([^?])/$1$n$2/;
871 Function: produces a Bio::Seq object by first splicing gaps from -pos
872 (by means of a splice_by_seq_pos() call), then creating
873 features using non-? chars (by means of a consensus_string()
874 call with stringency -consensus).
875 Returns : a Bio::Seq object
876 Args : -pos : required. sequence from which to build the Bio::Seq
878 -consensus : optional, defaults to consensus_string()'s
880 -mask : optional, a coderef to apply to consensus_string()'s
881 output before building features. this may be useful for
882 closing gaps of 1 bp by masking over them with N, for
887 sub seq_with_features
{
888 my ($self,%arg) = @_;
890 #first do the preparatory splice
891 $self->throw("must provide a -pos argument") unless $arg{-pos};
892 $self->splice_by_seq_pos($arg{-pos});
894 my $consensus_string = $self->consensus_string($arg{-consensus
});
895 $consensus_string = $arg{-mask
}->($consensus_string)
896 if defined($arg{-mask
});
900 push @bs, 1 if $consensus_string =~ /^[^?]/;
902 while($consensus_string =~ /\?[^?]/g){
903 push @bs, pos($consensus_string);
905 while($consensus_string =~ /[^?]\?/g){
906 push @es, pos($consensus_string);
909 push @es, length($consensus_string) if $consensus_string =~ /[^?]$/;
911 my $seq = Bio
::Seq
->new();
913 # my $rootfeature = Bio::SeqFeature::Generic->new(
914 # -source_tag => 'location',
915 # -start => $self->get_seq_by_pos($arg{-pos})->start,
916 # -end => $self->get_seq_by_pos($arg{-pos})->end,
918 # $seq->add_SeqFeature($rootfeature);
920 while(my $b = shift @bs){
922 $seq->add_SeqFeature(
923 Bio
::SeqFeature
::Generic
->new(
924 -start
=> $b - 1 + $self->get_seq_by_pos($arg{-pos})->start,
925 -end
=> $e - 1 + $self->get_seq_by_pos($arg{-pos})->start,
926 -source_tag
=> $self->source || 'MSA',
935 =head1 Create new alignments
937 The result of these methods are horizontal or vertical subsets of the
943 Usage : $aln2 = $aln->select(1, 3) # three first sequences
944 Function : Creates a new alignment from a continuous subset of
945 sequences. Numbering starts from 1. Sequence positions
946 larger than no_sequences() will thow an error.
947 Returns : a Bio::SimpleAlign object
948 Args : positive integer for the first sequence
949 positive integer for the last sequence to include (optional)
955 my ($start, $end) = @_;
957 $self->throw("Select start has to be a positive integer, not [$start]")
958 unless $start =~ /^\d+$/ and $start > 0;
959 $self->throw("Select end has to be a positive integer, not [$end]")
960 unless $end =~ /^\d+$/ and $end > 0;
961 $self->throw("Select $start [$start] has to be smaller than or equal to end [$end]")
962 unless $start <= $end;
964 my $aln = $self->new;
965 foreach my $pos ($start .. $end) {
966 $aln->add_seq($self->get_seq_by_pos($pos));
969 # fix for meta, sf, ann
973 =head2 select_noncont
975 Title : select_noncont
976 Usage : # 1st and 3rd sequences, sorted
977 $aln2 = $aln->select_noncont(1, 3)
979 # 1st and 3rd sequences, sorted (same as first)
980 $aln2 = $aln->select_noncont(3, 1)
982 # 1st and 3rd sequences, unsorted
983 $aln2 = $aln->select_noncont('nosort',3, 1)
985 Function : Creates a new alignment from a subset of sequences. Numbering
986 starts from 1. Sequence positions larger than no_sequences() will
987 throw an error. Sorts the order added to new alignment by default,
988 to prevent sorting pass 'nosort' as the first argument in the list.
989 Returns : a Bio::SimpleAlign object
990 Args : array of integers for the sequences. If the string 'nosort' is
991 passed as the first argument, the sequences will not be sorted
992 in the new alignment but will appear in the order listed.
1000 if ($pos[0] !~ m{^\d+$}) {
1001 my $sortcmd = shift @pos;
1002 if ($sortcmd eq 'nosort') {
1005 $self->throw("Command not recognized: $sortcmd. Only 'nosort' implemented at this time.");
1009 my $end = $self->no_sequences;
1011 $self->throw("position must be a positive integer, > 0 and <= $end not [$_]")
1012 unless( /^\d+$/ && $_ > 0 && $_ <= $end );
1015 @pos = sort {$a <=> $b} @pos unless $nosort;
1017 my $aln = $self->new;
1018 foreach my $p (@pos) {
1019 $aln->add_seq($self->get_seq_by_pos($p));
1021 $aln->id($self->id);
1022 # fix for meta, sf, ann
1029 Usage : $aln2 = $aln->slice(20,30)
1030 Function : Creates a slice from the alignment inclusive of start and
1031 end columns, and the first column in the alignment is denoted 1.
1032 Sequences with no residues in the slice are excluded from the
1033 new alignment and a warning is printed. Slice beyond the length of
1034 the sequence does not do padding.
1035 Returns : A Bio::SimpleAlign object
1036 Args : Positive integer for start column, positive integer for end column,
1037 optional boolean which if true will keep gap-only columns in the newly
1038 created slice. Example:
1040 $aln2 = $aln->slice(20,30,1)
1046 my ($start, $end, $keep_gap_only) = @_;
1048 $self->throw("Slice start has to be a positive integer, not [$start]")
1049 unless $start =~ /^\d+$/ and $start > 0;
1050 $self->throw("Slice end has to be a positive integer, not [$end]")
1051 unless $end =~ /^\d+$/ and $end > 0;
1052 $self->throw("Slice start [$start] has to be smaller than or equal to end [$end]")
1053 unless $start <= $end;
1054 $self->throw("This alignment has only ". $self->length . " residues. Slice start " .
1055 "[$start] is too big.") if $start > $self->length;
1056 my $cons_meta = $self->consensus_meta;
1057 my $aln = $self->new;
1058 $aln->id($self->id);
1059 foreach my $seq ( $self->each_seq() ) {
1060 my $new_seq = $seq->isa('Bio::Seq::MetaI') ?
1063 -alphabet
=> $seq->alphabet,
1064 -strand
=> $seq->strand,
1065 -verbose
=> $self->verbose) :
1066 Bio
::LocatableSeq
->new
1068 -alphabet
=> $seq->alphabet,
1069 -strand
=> $seq->strand,
1070 -verbose
=> $self->verbose);
1074 $seq_end = $seq->length if( $end > $seq->length );
1076 my $slice_seq = $seq->subseq($start, $seq_end);
1077 $new_seq->seq( $slice_seq );
1079 $slice_seq =~ s/\W//g;
1082 my $pre_start_seq = $seq->subseq(1, $start - 1);
1083 $pre_start_seq =~ s/\W//g;
1084 if (!defined($seq->strand)) {
1085 $new_seq->start( $seq->start + CORE
::length($pre_start_seq) );
1086 } elsif ($seq->strand < 0){
1087 $new_seq->start( $seq->end - CORE
::length($pre_start_seq) - CORE
::length($slice_seq) + 1);
1089 $new_seq->start( $seq->start + CORE
::length($pre_start_seq) );
1092 $new_seq->start( $seq->start);
1094 if ($new_seq->isa('Bio::Seq::MetaI')) {
1095 for my $meta_name ($seq->meta_names) {
1096 $new_seq->named_meta($meta_name, $seq->named_submeta($meta_name, $start, $end));
1099 $new_seq->end( $new_seq->start + CORE
::length($slice_seq) - 1 );
1101 if ($new_seq->start and $new_seq->end >= $new_seq->start) {
1102 $aln->add_seq($new_seq);
1104 if( $keep_gap_only ) {
1105 $aln->add_seq($new_seq);
1107 my $nse = $seq->get_nse();
1108 $self->warn("Slice [$start-$end] of sequence [$nse] contains no residues.".
1109 " Sequence excluded from the new alignment.");
1114 my $new = Bio
::Seq
::Meta
->new();
1115 for my $meta_name ($cons_meta->meta_names) {
1116 $new->named_meta($meta_name, $cons_meta->named_submeta($meta_name, $start, $end));
1118 $aln->consensus_meta($new);
1120 $aln->annotation($self->annotation);
1121 # fix for meta, sf, ann
1125 =head2 remove_columns
1127 Title : remove_columns
1128 Usage : $aln2 = $aln->remove_columns(['mismatch','weak']) or
1129 $aln2 = $aln->remove_columns([0,0],[6,8])
1130 Function : Creates an aligment with columns removed corresponding to
1131 the specified type or by specifying the columns by number.
1132 Returns : Bio::SimpleAlign object
1133 Args : Array ref of types ('match'|'weak'|'strong'|'mismatch'|'gaps'|
1134 'all_gaps_columns') or array ref where the referenced array
1135 contains a pair of integers that specify a range.
1136 The first column is 0,
1140 sub remove_columns
{
1141 my ($self,@args) = @_;
1142 @args || return $self;
1145 if ($args[0][0] =~ /^[a-z_]+$/i) {
1146 $aln = $self->_remove_columns_by_type($args[0]);
1147 } elsif ($args[0][0] =~ /^\d+$/) {
1148 $aln = $self->_remove_columns_by_num(\
@args);
1150 $self->throw("You must pass array references to remove_columns(), not @args");
1152 # fix for meta, sf, ann
1160 Usage : $aln2 = $aln->remove_gaps
1161 Function : Creates an aligment with gaps removed
1162 Returns : a Bio::SimpleAlign object
1163 Args : a gap character(optional) if none specified taken
1164 from $self->gap_char,
1165 [optional] $all_gaps_columns flag (1 or 0, default is 0)
1166 indicates that only all-gaps columns should be deleted
1168 Used from method L<remove_columns> in most cases. Set gap character
1169 using L<gap_char()|gap_char>.
1174 my ($self,$gapchar,$all_gaps_columns) = @_;
1176 if ($all_gaps_columns) {
1177 $gap_line = $self->all_gap_line($gapchar);
1179 $gap_line = $self->gap_line($gapchar);
1181 my $aln = $self->new;
1185 my $del_char = $gapchar || $self->gap_char;
1186 # Do the matching to get the segments to remove
1187 while ($gap_line =~ m/[$del_char]/g) {
1188 my $start = pos($gap_line)-1;
1189 $gap_line=~/\G[$del_char]+/gc;
1190 my $end = pos($gap_line)-1;
1192 #have to offset the start and end for subsequent removes
1195 $length += ($end-$start+1);
1196 push @remove, [$start,$end];
1199 #remove the segments
1200 $aln = $#remove >= 0 ?
$self->_remove_col($aln,\
@remove) : $self;
1201 # fix for meta, sf, ann
1207 my ($self,$aln,$remove) = @_;
1210 my $gap = $self->gap_char;
1212 # splice out the segments and create new seq
1213 foreach my $seq($self->each_seq){
1214 my $new_seq = Bio
::LocatableSeq
->new(
1216 -alphabet
=> $seq->alphabet,
1217 -strand
=> $seq->strand,
1218 -verbose
=> $self->verbose);
1219 my $sequence = $seq->seq;
1220 foreach my $pair(@
{$remove}){
1221 my $start = $pair->[0];
1222 my $end = $pair->[1];
1223 $sequence = $seq->seq unless $sequence;
1224 my $orig = $sequence;
1225 my $head = $start > 0 ?
substr($sequence, 0, $start) : '';
1226 my $tail = ($end + 1) >= length($sequence) ?
'' : substr($sequence, $end + 1);
1227 $sequence = $head.$tail;
1229 unless (defined $new_seq->start) {
1231 my $start_adjust = () = substr($orig, 0, $end + 1) =~ /$gap/g;
1232 $new_seq->start($seq->start + $end + 1 - $start_adjust);
1235 my $start_adjust = $orig =~ /^$gap+/;
1236 if ($start_adjust) {
1237 $start_adjust = $+[0] == $start;
1239 $new_seq->start($seq->start + $start_adjust);
1243 if (($end + 1) >= length($orig)) {
1244 my $end_adjust = () = substr($orig, $start) =~ /$gap/g;
1245 $new_seq->end($seq->end - (length($orig) - $start) + $end_adjust);
1248 $new_seq->end($seq->end);
1252 if ($new_seq->end < $new_seq->start) {
1253 # we removed all columns except for gaps: set to 0 to indicate no
1259 $new_seq->seq($sequence) if $sequence;
1260 push @new, $new_seq;
1262 # add the new seqs to the alignment
1263 foreach my $new(@new){
1264 $aln->add_seq($new);
1266 # fix for meta, sf, ann
1270 sub _remove_columns_by_type
{
1271 my ($self,$type) = @_;
1272 my $aln = $self->new;
1275 my $gap = $self->gap_char if (grep { $_ eq 'gaps'} @
{$type});
1276 my $all_gaps_columns = $self->gap_char if (grep /all_gaps_columns/,@
{$type});
1277 my %matchchars = ( 'match' => '\*',
1282 'all_gaps_columns' => ''
1284 # get the characters to delete against
1286 foreach my $type (@
{$type}){
1287 $del_char.= $matchchars{$type};
1291 my $match_line = $self->match_line;
1292 # do the matching to get the segments to remove
1294 while($match_line =~ m/[$del_char]/g ){
1295 my $start = pos($match_line)-1;
1296 $match_line=~/\G[$del_char]+/gc;
1297 my $end = pos($match_line)-1;
1299 #have to offset the start and end for subsequent removes
1302 $length += ($end-$start+1);
1303 push @remove, [$start,$end];
1307 # remove the segments
1308 $aln = $#remove >= 0 ?
$self->_remove_col($aln,\
@remove) : $self;
1309 $aln = $aln->remove_gaps() if $gap;
1310 $aln = $aln->remove_gaps('', 1) if $all_gaps_columns;
1311 # fix for meta, sf, ann
1316 sub _remove_columns_by_num
{
1317 my ($self,$positions) = @_;
1318 my $aln = $self->new;
1320 # sort the positions
1321 @
$positions = sort { $a->[0] <=> $b->[0] } @
$positions;
1325 foreach my $pos (@
{$positions}) {
1326 my ($start, $end) = @
{$pos};
1328 #have to offset the start and end for subsequent removes
1331 $length += ($end-$start+1);
1332 push @remove, [$start,$end];
1335 #remove the segments
1336 $aln = $#remove >= 0 ?
$self->_remove_col($aln,\
@remove) : $self;
1337 # fix for meta, sf, ann
1342 =head1 Change sequences within the MSA
1344 These methods affect characters in all sequences without changing the
1347 =head2 splice_by_seq_pos
1349 Title : splice_by_seq_pos
1350 Usage : $status = splice_by_seq_pos(1);
1351 Function: splices all aligned sequences where the specified sequence
1354 Returns : 1 on success
1355 Args : position of sequence to splice by
1360 sub splice_by_seq_pos
{
1361 my ($self,$pos) = @_;
1363 my $guide = $self->get_seq_by_pos($pos);
1364 my $guide_seq = $guide->seq;
1366 $guide_seq =~ s/\./\-/g;
1370 while(($pos = index($guide_seq, '-', $pos)) > -1 ){
1371 unshift @gaps, $pos;
1375 foreach my $seq ($self->each_seq){
1376 my @bases = split '', $seq->seq;
1378 splice(@bases, $_, 1) foreach @gaps;
1379 $seq->seq(join('', @bases));
1388 Usage : $ali->map_chars('\.','-')
1389 Function : Does a s/$arg1/$arg2/ on the sequences. Useful for gap
1392 Notice that the from (arg1) is interpretted as a regex,
1393 so be careful about quoting meta characters (eg
1394 $ali->map_chars('.','-') wont do what you want)
1396 Argument : 'from' rexexp
1407 $self->throw("Need exactly two arguments")
1408 unless defined $from and defined $to;
1410 foreach $seq ( $self->each_seq() ) {
1411 $temp = $seq->seq();
1412 $temp =~ s/$from/$to/g;
1422 Usage : $ali->uppercase()
1423 Function : Sets all the sequences to uppercase
1434 foreach $seq ( $self->each_seq() ) {
1435 $temp = $seq->seq();
1436 $temp =~ tr/[a-z]/[A-Z]/;
1445 Title : cigar_line()
1446 Usage : %cigars = $align->cigar_line()
1447 Function : Generates a "cigar" (Compact Idiosyncratic Gapped Alignment
1448 Report) line for each sequence in the alignment. Examples are
1449 "1,60" or "5,10:12,58", where the numbers refer to conserved
1450 positions within the alignment. The keys of the hash are the
1451 NSEs (name/start/end) assigned to each sequence.
1452 Args : threshold (optional, defaults to 100)
1453 Returns : Hash of strings (cigar lines)
1462 my @consensus = split "",($self->consensus_string($thr));
1463 my $len = $self->length;
1464 my $gapchar = $self->gap_char;
1466 # create a precursor, something like (1,4,5,6,7,33,45),
1467 # where each number corresponds to a conserved position
1468 foreach my $seq ( $self->each_seq ) {
1469 my @seq = split "", uc ($seq->seq);
1471 for (my $x = 0 ; $x < $len ; $x++ ) {
1472 if ($seq[$x] eq $consensus[$x]) {
1473 push @
{$cigars{$seq->get_nse}},$pos;
1475 } elsif ($seq[$x] ne $gapchar) {
1480 # duplicate numbers - (1,4,5,6,7,33,45) becomes (1,1,4,5,6,7,33,33,45,45)
1481 for my $name (keys %cigars) {
1482 splice @
{$cigars{$name}}, 1, 0, ${$cigars{$name}}[0] if
1483 ( ${$cigars{$name}}[0] + 1 < ${$cigars{$name}}[1] );
1484 push @
{$cigars{$name}}, ${$cigars{$name}}[$#{$cigars{$name}}] if
1485 ( ${$cigars{$name}}[($#{$cigars{$name}} - 1)] + 1 <
1486 ${$cigars{$name}}[$#{$cigars{$name}}] );
1487 for ( my $x = 1 ; $x < $#{$cigars{$name}} - 1 ; $x++) {
1488 if (${$cigars{$name}}[$x - 1] + 1 < ${$cigars{$name}}[$x] &&
1489 ${$cigars{$name}}[$x + 1] > ${$cigars{$name}}[$x] + 1) {
1490 splice @
{$cigars{$name}}, $x, 0, ${$cigars{$name}}[$x];
1494 # collapse series - (1,1,4,5,6,7,33,33,45,45) becomes (1,1,4,7,33,33,45,45)
1495 for my $name (keys %cigars) {
1497 for ( my $x = 0 ; $x < $#{$cigars{$name}} ; $x++) {
1498 if ( ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x - 1)] + 1 &&
1499 ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x + 1)] - 1 ) {
1503 for my $pos (@remove) {
1504 splice @
{$cigars{$name}}, $pos, 1;
1507 # join and punctuate
1508 for my $name (keys %cigars) {
1509 my ($start,$end,$str) = "";
1510 while ( ($start,$end) = splice @
{$cigars{$name}}, 0, 2 ) {
1511 $str .= ($start . "," . $end . ":");
1514 $cigars{$name} = $str;
1522 Title : match_line()
1523 Usage : $line = $align->match_line()
1524 Function : Generates a match line - much like consensus string
1525 except that a line indicating the '*' for a match.
1526 Args : (optional) Match line characters ('*' by default)
1527 (optional) Strong match char (':' by default)
1528 (optional) Weak match char ('.' by default)
1534 my ($self,$matchlinechar, $strong, $weak) = @_;
1535 my %matchchars = ('match' => $matchlinechar || '*',
1536 'weak' => $weak || '.',
1537 'strong' => $strong || ':',
1543 foreach my $seq ( $self->each_seq ) {
1544 push @seqchars, [ split(//, uc ($seq->seq)) ];
1545 $alphabet = $seq->alphabet unless defined $alphabet;
1547 my $refseq = shift @seqchars;
1548 # let's just march down the columns
1551 foreach my $pos ( 0..$self->length ) {
1552 my $refchar = $refseq->[$pos];
1553 my $char = $matchchars{'mismatch'};
1554 unless( defined $refchar ) {
1555 last if $pos == $self->length; # short circuit on last residue
1556 # this in place to handle jason's soon-to-be-committed
1557 # intron mapping code
1560 my %col = ($refchar => 1);
1561 my $dash = ($refchar eq '-' || $refchar eq '.' || $refchar eq ' ');
1562 foreach my $seq ( @seqchars ) {
1563 next if $pos >= scalar @
$seq;
1564 $dash = 1 if( $seq->[$pos] eq '-' || $seq->[$pos] eq '.' ||
1565 $seq->[$pos] eq ' ' );
1566 $col{$seq->[$pos]}++ if defined $seq->[$pos];
1568 my @colresidues = sort keys %col;
1570 # if all the values are the same
1571 if( $dash ) { $char = $matchchars{'mismatch'} }
1572 elsif( @colresidues == 1 ) { $char = $matchchars{'match'} }
1573 elsif( $alphabet eq 'protein' ) { # only try to do weak/strong
1574 # matches for protein seqs
1576 foreach my $type ( qw(strong weak) ) {
1577 # iterate through categories
1579 # iterate through each of the aa in the col
1580 # look to see which groups it is in
1581 foreach my $c ( @colresidues ) {
1582 foreach my $f ( grep { index($_,$c) >= 0 } @
{$CONSERVATION_GROUPS{$type}} ) {
1583 push @
{$groups{$f}},$c;
1587 foreach my $cols ( values %groups ) {
1588 @
$cols = sort @
$cols;
1589 # now we are just testing to see if two arrays
1590 # are identical w/o changing either one
1591 # have to be same len
1592 next if( scalar @
$cols != scalar @colresidues );
1593 # walk down the length and check each slot
1594 for($_=0;$_ < (scalar @
$cols);$_++ ) {
1595 next GRP
if( $cols->[$_] ne $colresidues[$_] );
1597 $char = $matchchars{$type};
1603 $matchline .= $char;
1612 Usage : $line = $align->gap_line()
1613 Function : Generates a gap line - much like consensus string
1614 except that a line where '-' represents gap
1615 Args : (optional) gap line characters ('-' by default)
1621 my ($self,$gapchar) = @_;
1622 $gapchar = $gapchar || $self->gap_char;
1623 my %gap_hsh; # column gaps vector
1624 foreach my $seq ( $self->each_seq ) {
1626 map {$gap_hsh{$_->[0]} = undef} grep {$_->[1] eq $gapchar}
1627 map {[$i++, $_]} split(//, uc ($seq->seq));
1630 foreach my $pos ( 0..$self->length-1 ) {
1631 $gap_line .= (exists $gap_hsh{$pos}) ?
$gapchar:'.';
1638 Title : all_gap_line()
1639 Usage : $line = $align->all_gap_line()
1640 Function : Generates a gap line - much like consensus string
1641 except that a line where '-' represents all-gap column
1642 Args : (optional) gap line characters ('-' by default)
1648 my ($self,$gapchar) = @_;
1649 $gapchar = $gapchar || $self->gap_char;
1650 my %gap_hsh; # column gaps counter hash
1651 my @seqs = $self->each_seq;
1652 foreach my $seq ( @seqs ) {
1654 map {$gap_hsh{$_->[0]}++} grep {$_->[1] eq $gapchar}
1655 map {[$i++, $_]} split(//, uc ($seq->seq));
1658 foreach my $pos ( 0..$self->length-1 ) {
1659 if (exists $gap_hsh{$pos} && $gap_hsh{$pos} == scalar @seqs) {
1661 $gap_line .= $gapchar;
1669 =head2 gap_col_matrix
1671 Title : gap_col_matrix()
1672 Usage : my $cols = $align->gap_col_matrix()
1673 Function : Generates an array of hashes where
1674 each entry in the array is a hash reference
1675 with keys of all the sequence names and
1676 and value of 1 or 0 if the sequence has a gap at that column
1677 Args : (optional) gap line characters ($aln->gap_char or '-' by default)
1681 sub gap_col_matrix
{
1682 my ($self,$gapchar) = @_;
1683 $gapchar = $gapchar || $self->gap_char;
1684 my %gap_hsh; # column gaps vector
1686 foreach my $seq ( $self->each_seq ) {
1688 my $str = $seq->seq;
1689 my $len = $seq->length;
1691 my $id = $seq->display_id;
1692 while( $i < $len ) {
1693 $ch = substr($str, $i, 1);
1694 $cols[$i++]->{$id} = ($ch eq $gapchar);
1703 Usage : $ali->match()
1704 Function : Goes through all columns and changes residues that are
1705 identical to residue in first sequence to match '.'
1706 character. Sets match_char.
1708 USE WITH CARE: Most MSA formats do not support match
1709 characters in sequences, so this is mostly for output
1710 only. NEXUS format (Bio::AlignIO::nexus) can handle
1713 Argument : a match character, optional, defaults to '.'
1718 my ($self, $match) = @_;
1721 my ($matching_char) = $match;
1722 $matching_char = "\\$match" if $match =~ /[\^.$|()\[\]]/ ; #';
1723 $self->map_chars($matching_char, '-');
1725 my @seqs = $self->each_seq();
1726 return 1 unless scalar @seqs > 1;
1728 my $refseq = shift @seqs ;
1729 my @refseq = split //, $refseq->seq;
1730 my $gapchar = $self->gap_char;
1732 foreach my $seq ( @seqs ) {
1733 my @varseq = split //, $seq->seq();
1734 for ( my $i=0; $i < scalar @varseq; $i++) {
1735 $varseq[$i] = $match if defined $refseq[$i] &&
1736 ( $refseq[$i] =~ /[A-Za-z\*]/ ||
1737 $refseq[$i] =~ /$gapchar/ )
1738 && $refseq[$i] eq $varseq[$i];
1740 $seq->seq(join '', @varseq);
1742 $self->match_char($match);
1750 Usage : $ali->unmatch()
1751 Function : Undoes the effect of method match. Unsets match_char.
1753 Argument : a match character, optional, defaults to '.'
1755 See L<match> and L<match_char>
1760 my ($self, $match) = @_;
1764 my @seqs = $self->each_seq();
1765 return 1 unless scalar @seqs > 1;
1767 my $refseq = shift @seqs ;
1768 my @refseq = split //, $refseq->seq;
1769 my $gapchar = $self->gap_char;
1770 foreach my $seq ( @seqs ) {
1771 my @varseq = split //, $seq->seq();
1772 for ( my $i=0; $i < scalar @varseq; $i++) {
1773 $varseq[$i] = $refseq[$i] if defined $refseq[$i] &&
1774 ( $refseq[$i] =~ /[A-Za-z\*]/ ||
1775 $refseq[$i] =~ /$gapchar/ ) &&
1776 $varseq[$i] eq $match;
1778 $seq->seq(join '', @varseq);
1780 $self->match_char('');
1784 =head1 MSA attributes
1786 Methods for setting and reading the MSA attributes.
1788 Note that the methods defining character semantics depend on the user
1789 to set them sensibly. They are needed only by certain input/output
1790 methods. Unset them by setting to an empty string ('').
1795 Usage : $myalign->id("Ig")
1796 Function : Gets/sets the id field of the alignment
1797 Returns : An id string
1798 Argument : An id string (optional)
1803 my ($self, $name) = @_;
1805 if (defined( $name )) {
1806 $self->{'_id'} = $name;
1809 return $self->{'_id'};
1815 Usage : $myalign->accession("PF00244")
1816 Function : Gets/sets the accession field of the alignment
1817 Returns : An acc string
1818 Argument : An acc string (optional)
1823 my ($self, $acc) = @_;
1825 if (defined( $acc )) {
1826 $self->{'_accession'} = $acc;
1829 return $self->{'_accession'};
1835 Usage : $myalign->description("14-3-3 proteins")
1836 Function : Gets/sets the description field of the alignment
1837 Returns : An description string
1838 Argument : An description string (optional)
1843 my ($self, $name) = @_;
1845 if (defined( $name )) {
1846 $self->{'_description'} = $name;
1849 return $self->{'_description'};
1854 Title : missing_char
1855 Usage : $myalign->missing_char("?")
1856 Function : Gets/sets the missing_char attribute of the alignment
1857 It is generally recommended to set it to 'n' or 'N'
1858 for nucleotides and to 'X' for protein.
1859 Returns : An missing_char string,
1860 Argument : An missing_char string (optional)
1865 my ($self, $char) = @_;
1867 if (defined $char ) {
1868 $self->throw("Single missing character, not [$char]!") if CORE
::length($char) > 1;
1869 $self->{'_missing_char'} = $char;
1872 return $self->{'_missing_char'};
1878 Usage : $myalign->match_char('.')
1879 Function : Gets/sets the match_char attribute of the alignment
1880 Returns : An match_char string,
1881 Argument : An match_char string (optional)
1886 my ($self, $char) = @_;
1888 if (defined $char ) {
1889 $self->throw("Single match character, not [$char]!") if CORE
::length($char) > 1;
1890 $self->{'_match_char'} = $char;
1893 return $self->{'_match_char'};
1899 Usage : $myalign->gap_char('-')
1900 Function : Gets/sets the gap_char attribute of the alignment
1901 Returns : An gap_char string, defaults to '-'
1902 Argument : An gap_char string (optional)
1907 my ($self, $char) = @_;
1909 if (defined $char || ! defined $self->{'_gap_char'} ) {
1910 $char= '-' unless defined $char;
1911 $self->throw("Single gap character, not [$char]!") if CORE
::length($char) > 1;
1912 $self->{'_gap_char'} = $char;
1914 return $self->{'_gap_char'};
1919 Title : symbol_chars
1920 Usage : my @symbolchars = $aln->symbol_chars;
1921 Function: Returns all the seen symbols (other than gaps)
1922 Returns : array of characters that are the seen symbols
1923 Args : boolean to include the gap/missing/match characters
1928 my ($self,$includeextra) = @_;
1930 unless ($self->{'_symbols'}) {
1931 foreach my $seq ($self->each_seq) {
1932 map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq);
1935 my %copy = %{$self->{'_symbols'}};
1936 if( ! $includeextra ) {
1937 foreach my $char ( $self->gap_char, $self->match_char,
1938 $self->missing_char) {
1939 delete $copy{$char} if( defined $char );
1945 =head1 Alignment descriptors
1947 These read only methods describe the MSA in various ways.
1953 Usage : $str = $ali->score()
1954 Function : get/set a score of the alignment
1955 Returns : a score for the alignment
1956 Argument : an optional score to set
1962 $self->{score
} = shift if @_;
1963 return $self->{score
};
1966 =head2 consensus_string
1968 Title : consensus_string
1969 Usage : $str = $ali->consensus_string($threshold_percent)
1970 Function : Makes a strict consensus
1971 Returns : Consensus string
1972 Argument : Optional treshold ranging from 0 to 100.
1973 The consensus residue has to appear at least threshold %
1974 of the sequences at a given location, otherwise a '?'
1975 character will be placed at that location.
1976 (Default value = 0%)
1980 sub consensus_string
{
1982 my $threshold = shift;
1985 my $len = $self->length - 1;
1987 foreach ( 0 .. $len ) {
1988 $out .= $self->_consensus_aa($_,$threshold);
1996 my $threshold_percent = shift || -1 ;
1997 my ($seq,%hash,$count,$letter,$key);
1998 my $gapchar = $self->gap_char;
1999 foreach $seq ( $self->each_seq() ) {
2000 $letter = substr($seq->seq,$point,1);
2001 $self->throw("--$point-----------") if $letter eq '';
2002 ($letter eq $gapchar || $letter =~ /\./) && next;
2003 # print "Looking at $letter\n";
2006 my $number_of_sequences = $self->no_sequences();
2007 my $threshold = $number_of_sequences * $threshold_percent / 100. ;
2011 foreach $key ( sort keys %hash ) {
2012 # print "Now at $key $hash{$key}\n";
2013 if( $hash{$key} > $count && $hash{$key} >= $threshold) {
2015 $count = $hash{$key};
2022 =head2 consensus_iupac
2024 Title : consensus_iupac
2025 Usage : $str = $ali->consensus_iupac()
2026 Function : Makes a consensus using IUPAC ambiguity codes from DNA
2027 and RNA. The output is in upper case except when gaps in
2028 a column force output to be in lower case.
2030 Note that if your alignment sequences contain a lot of
2031 IUPAC ambiquity codes you often have to manually set
2032 alphabet. Bio::PrimarySeq::_guess_type thinks they
2033 indicate a protein sequence.
2034 Returns : consensus string
2036 Throws : on protein sequences
2040 sub consensus_iupac
{
2043 my $len = $self->length-1;
2045 # only DNA and RNA sequences are valid
2046 foreach my $seq ( $self->each_seq() ) {
2047 $self->throw("Seq [". $seq->get_nse. "] is a protein")
2048 if $seq->alphabet eq 'protein';
2050 # loop over the alignment columns
2051 foreach my $count ( 0 .. $len ) {
2052 $out .= $self->_consensus_iupac($count);
2057 sub _consensus_iupac
{
2058 my ($self, $column) = @_;
2059 my ($string, $char, $rna);
2061 #determine all residues in a column
2062 foreach my $seq ( $self->each_seq() ) {
2063 $string .= substr($seq->seq, $column, 1);
2065 $string = uc $string;
2067 # quick exit if there's an N in the string
2068 if ($string =~ /N/) {
2069 $string =~ /\W/ ?
return 'n' : return 'N';
2071 # ... or if there are only gap characters
2072 return '-' if $string =~ /^\W+$/;
2074 # treat RNA as DNA in regexps
2075 if ($string =~ /U/) {
2080 # the following s///'s only need to be done to the _first_ ambiguity code
2081 # as we only need to see the _range_ of characters in $string
2083 if ($string =~ /[VDHB]/) {
2084 $string =~ s/V/AGC/;
2085 $string =~ s/D/AGT/;
2086 $string =~ s/H/ACT/;
2087 $string =~ s/B/CTG/;
2090 if ($string =~ /[SKYRWM]/) {
2099 # and now the guts of the thing
2101 if ($string =~ /A/) {
2103 if ($string =~ /G/) {
2104 $char = 'R'; # A and G (purines) R
2105 if ($string =~ /C/) {
2106 $char = 'V'; # A and G and C V
2107 if ($string =~ /T/) {
2108 $char = 'N'; # A and G and C and T N
2110 } elsif ($string =~ /T/) {
2111 $char = 'D'; # A and G and T D
2113 } elsif ($string =~ /C/) {
2114 $char = 'M'; # A and C M
2115 if ($string =~ /T/) {
2116 $char = 'H'; # A and C and T H
2118 } elsif ($string =~ /T/) {
2119 $char = 'W'; # A and T W
2121 } elsif ($string =~ /C/) {
2123 if ($string =~ /T/) {
2124 $char = 'Y'; # C and T (pyrimidines) Y
2125 if ($string =~ /G/) {
2126 $char = 'B'; # C and T and G B
2128 } elsif ($string =~ /G/) {
2129 $char = 'S'; # C and G S
2131 } elsif ($string =~ /G/) {
2133 if ($string =~ /C/) {
2134 $char = 'S'; # G and C S
2135 } elsif ($string =~ /T/) {
2136 $char = 'K'; # G and T K
2138 } elsif ($string =~ /T/) {
2142 $char = 'U' if $rna and $char eq 'T';
2143 $char = lc $char if $string =~ /\W/;
2149 =head2 consensus_meta
2151 Title : consensus_meta
2152 Usage : $seqmeta = $ali->consensus_meta()
2153 Function : Returns a Bio::Seq::Meta object containing the consensus
2154 strings derived from meta data analysis.
2155 Returns : Bio::Seq::Meta
2156 Argument : Bio::Seq::Meta
2157 Throws : non-MetaI object
2161 sub consensus_meta
{
2162 my ($self, $meta) = @_;
2163 if ($meta && (!ref $meta || !$meta->isa('Bio::Seq::MetaI'))) {
2164 $self->throw('Not a Bio::Seq::MetaI object');
2166 return $self->{'_aln_meta'} = $meta if $meta;
2167 return $self->{'_aln_meta'}
2173 Usage : if ( $ali->is_flush() )
2174 Function : Tells you whether the alignment
2175 : is flush, i.e. all of the same length
2182 my ($self,$report) = @_;
2187 foreach $seq ( $self->each_seq() ) {
2188 if( $length == (-1) ) {
2189 $length = CORE
::length($seq->seq());
2193 $temp = CORE
::length($seq->seq());
2194 if( $temp != $length ) {
2195 $self->warn("expecting $length not $temp from ".
2196 $seq->display_id) if( $report );
2197 $self->debug("expecting $length not $temp from ".
2199 $self->debug($seq->seq(). "\n");
2211 Usage : $len = $ali->length()
2212 Function : Returns the maximum length of the alignment.
2213 To be sure the alignment is a block, use is_flush
2221 $self->deprecated("length_aln - deprecated method. Use length() instead.");
2231 foreach $seq ( $self->each_seq() ) {
2232 $temp = $seq->length();
2233 if( $temp > $length ) {
2242 =head2 maxdisplayname_length
2244 Title : maxdisplayname_length
2245 Usage : $ali->maxdisplayname_length()
2246 Function : Gets the maximum length of the displayname in the
2247 alignment. Used in writing out various MSA formats.
2253 sub maxname_length
{
2255 $self->deprecated("maxname_length - deprecated method.".
2256 " Use maxdisplayname_length() instead.");
2257 $self->maxdisplayname_length();
2262 $self->deprecated("maxnse_length - deprecated method.".
2263 " Use maxnse_length() instead.");
2264 $self->maxdisplayname_length();
2267 sub maxdisplayname_length
{
2272 foreach $seq ( $self->each_seq() ) {
2273 $len = CORE
::length $self->displayname($seq->get_nse());
2275 if( $len > $maxname ) {
2283 =head2 max_metaname_length
2285 Title : max_metaname_length
2286 Usage : $ali->max_metaname_length()
2287 Function : Gets the maximum length of the meta name tags in the
2288 alignment for the sequences and for the alignment.
2289 Used in writing out various MSA formats.
2295 sub max_metaname_length
{
2300 # check seq meta first
2301 for $seq ( $self->each_seq() ) {
2302 next if !$seq->isa('Bio::Seq::MetaI' || !$seq->meta_names);
2303 for my $mtag ($seq->meta_names) {
2304 $len = CORE
::length $mtag;
2305 if( $len > $maxname ) {
2312 for my $meta ($self->consensus_meta) {
2314 for my $name ($meta->meta_names) {
2315 $len = CORE
::length $name;
2316 if( $len > $maxname ) {
2328 Usage : $no = $ali->no_residues
2329 Function : number of residues in total in the alignment
2339 foreach my $seq ($self->each_seq) {
2340 my $str = $seq->seq();
2342 $count += ($str =~ s/[A-Za-z]//g);
2350 Title : no_sequences
2351 Usage : $depth = $ali->no_sequences
2352 Function : number of sequence in the sequence alignment
2361 return scalar($self->each_seq);
2365 =head2 average_percentage_identity
2367 Title : average_percentage_identity
2368 Usage : $id = $align->average_percentage_identity
2369 Function: The function uses a fast method to calculate the average
2370 percentage identity of the alignment
2371 Returns : The average percentage identity of the alignment
2373 Notes : This method implemented by Kevin Howe calculates a figure that is
2374 designed to be similar to the average pairwise identity of the
2375 alignment (identical in the absence of gaps), without having to
2376 explicitly calculate pairwise identities proposed by Richard Durbin.
2377 Validated by Ewan Birney ad Alex Bateman.
2381 sub average_percentage_identity
{
2382 my ($self,@args) = @_;
2384 my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
2385 'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
2387 my ($len, $total, $subtotal, $divisor, $subdivisor, @seqs, @countHashes);
2389 if (! $self->is_flush()) {
2390 $self->throw("All sequences in the alignment must be the same length");
2393 @seqs = $self->each_seq();
2394 $len = $self->length();
2396 # load the each hash with correct keys for existence checks
2398 for( my $index=0; $index < $len; $index++) {
2399 foreach my $letter (@alphabet) {
2400 $countHashes[$index]->{$letter} = 0;
2403 foreach my $seq (@seqs) {
2404 my @seqChars = split //, $seq->seq();
2405 for( my $column=0; $column < @seqChars; $column++ ) {
2406 my $char = uc($seqChars[$column]);
2407 if (exists $countHashes[$column]->{$char}) {
2408 $countHashes[$column]->{$char}++;
2415 for(my $column =0; $column < $len; $column++) {
2416 my %hash = %{$countHashes[$column]};
2418 foreach my $res (keys %hash) {
2419 $total += $hash{$res}*($hash{$res} - 1);
2420 $subdivisor += $hash{$res};
2422 $divisor += $subdivisor * ($subdivisor - 1);
2424 return $divisor > 0 ?
($total / $divisor )*100.0 : 0;
2427 =head2 percentage_identity
2429 Title : percentage_identity
2430 Usage : $id = $align->percentage_identity
2431 Function: The function calculates the average percentage identity
2432 (aliased to average_percentage_identity)
2433 Returns : The average percentage identity
2438 sub percentage_identity
{
2440 return $self->average_percentage_identity();
2443 =head2 overall_percentage_identity
2445 Title : overall_percentage_identity
2446 Usage : $id = $align->overall_percentage_identity
2447 $id = $align->overall_percentage_identity('short')
2448 Function: The function calculates the percentage identity of
2449 the conserved columns
2450 Returns : The percentage identity of the conserved columns
2451 Args : length value to use, optional defaults to alignment length
2452 possible values: 'align', 'short', 'long'
2454 The argument values 'short' and 'long' refer to shortest and longest
2455 sequence in the alignment. Method modification code by Hongyu Zhang.
2459 sub overall_percentage_identity
{
2460 my ($self, $length_measure) = @_;
2462 my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
2463 'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
2465 my ($len, $total, @seqs, @countHashes);
2467 my %enum = map {$_ => 1} qw
(align short long
);
2469 $self->throw("Unknown argument [$length_measure]")
2470 if $length_measure and not $enum{$length_measure};
2471 $length_measure ||= 'align';
2473 if (! $self->is_flush()) {
2474 $self->throw("All sequences in the alignment must be the same length");
2477 @seqs = $self->each_seq();
2478 $len = $self->length();
2480 # load the each hash with correct keys for existence checks
2481 for( my $index=0; $index < $len; $index++) {
2482 foreach my $letter (@alphabet) {
2483 $countHashes[$index]->{$letter} = 0;
2486 foreach my $seq (@seqs) {
2487 my @seqChars = split //, $seq->seq();
2488 for( my $column=0; $column < @seqChars; $column++ ) {
2489 my $char = uc($seqChars[$column]);
2490 if (exists $countHashes[$column]->{$char}) {
2491 $countHashes[$column]->{$char}++;
2497 for(my $column =0; $column < $len; $column++) {
2498 my %hash = %{$countHashes[$column]};
2499 foreach ( values %hash ) {
2501 $total++ if( $_ == scalar @seqs );
2506 if ($length_measure eq 'short') {
2507 ## find the shortest length
2509 foreach my $seq ($self->each_seq) {
2510 my $count = $seq->seq =~ tr/[A-Za-z]//;
2512 $len = $count if $count < $len;
2518 elsif ($length_measure eq 'long') {
2519 ## find the longest length
2521 foreach my $seq ($self->each_seq) {
2522 my $count = $seq->seq =~ tr/[A-Za-z]//;
2524 $len = $count if $count > $len;
2531 return ($total / $len ) * 100.0;
2536 =head1 Alignment positions
2538 Methods to map a sequence position into an alignment column and back.
2539 column_from_residue_number() does the former. The latter is really a
2540 property of the sequence object and can done using
2541 L<Bio::LocatableSeq::location_from_column>:
2543 # select somehow a sequence from the alignment, e.g.
2544 my $seq = $aln->get_seq_by_pos(1);
2545 #$loc is undef or Bio::LocationI object
2546 my $loc = $seq->location_from_column(5);
2548 =head2 column_from_residue_number
2550 Title : column_from_residue_number
2551 Usage : $col = $ali->column_from_residue_number( $seqname, $resnumber)
2552 Function: This function gives the position in the alignment
2553 (i.e. column number) of the given residue number in the
2554 sequence with the given name. For example, for the
2557 Seq1/91-97 AC..DEF.GH.
2558 Seq2/24-30 ACGG.RTY...
2559 Seq3/43-51 AC.DDEF.GHI
2561 column_from_residue_number( "Seq1", 94 ) returns 6.
2562 column_from_residue_number( "Seq2", 25 ) returns 2.
2563 column_from_residue_number( "Seq3", 50 ) returns 10.
2565 An exception is thrown if the residue number would lie
2566 outside the length of the aligment
2567 (e.g. column_from_residue_number( "Seq2", 22 )
2569 Note: If the the parent sequence is represented by more than
2570 one alignment sequence and the residue number is present in
2571 them, this method finds only the first one.
2573 Returns : A column number for the position in the alignment of the
2574 given residue in the given sequence (1 = first column)
2575 Args : A sequence id/name (not a name/start-end)
2576 A residue number in the whole sequence (not just that
2577 segment of it in the alignment)
2581 sub column_from_residue_number
{
2582 my ($self, $name, $resnumber) = @_;
2584 $self->throw("No sequence with name [$name]") unless $self->{'_start_end_lists'}->{$name};
2585 $self->throw("Second argument residue number missing") unless $resnumber;
2587 foreach my $seq ($self->each_seq_with_id($name)) {
2590 $col = $seq->column_from_residue_number($resnumber);
2596 $self->throw("Could not find a sequence segment in $name ".
2597 "containing residue number $resnumber");
2601 =head1 Sequence names
2603 Methods to manipulate the display name. The default name based on the
2604 sequence id and subsequence positions can be overridden in various
2610 Usage : $myalign->displayname("Ig", "IgA")
2611 Function : Gets/sets the display name of a sequence in the alignment
2612 Returns : A display name string
2613 Argument : name of the sequence
2614 displayname of the sequence (optional)
2618 sub get_displayname
{
2620 $self->deprecated("get_displayname - deprecated method. Use displayname() instead.");
2621 $self->displayname(@_);
2624 sub set_displayname
{
2626 $self->deprecated("set_displayname - deprecated method. Use displayname() instead.");
2627 $self->displayname(@_);
2631 my ($self, $name, $disname) = @_;
2633 $self->throw("No sequence with name [$name]")
2634 unless defined $self->{'_seq'}->{$name};
2636 if( $disname and $name) {
2637 $self->{'_dis_name'}->{$name} = $disname;
2640 elsif( defined $self->{'_dis_name'}->{$name} ) {
2641 return $self->{'_dis_name'}->{$name};
2647 =head2 set_displayname_count
2649 Title : set_displayname_count
2650 Usage : $ali->set_displayname_count
2651 Function : Sets the names to be name_# where # is the number of
2652 times this name has been used.
2653 Returns : 1, on success
2658 sub set_displayname_count
{
2660 my (@arr,$name,$seq,$count,$temp,$nse);
2662 foreach $seq ( $self->each_alphabetically() ) {
2663 $nse = $seq->get_nse();
2665 #name will be set when this is the second
2666 #time (or greater) is has been seen
2668 if( defined $name and $name eq ($seq->id()) ) {
2669 $temp = sprintf("%s_%s",$name,$count);
2670 $self->displayname($nse,$temp);
2675 $temp = sprintf("%s_%s",$name,$count);
2676 $self->displayname($nse,$temp);
2683 =head2 set_displayname_flat
2685 Title : set_displayname_flat
2686 Usage : $ali->set_displayname_flat()
2687 Function : Makes all the sequences be displayed as just their name,
2694 sub set_displayname_flat
{
2698 foreach $seq ( $self->each_seq() ) {
2699 $nse = $seq->get_nse();
2700 $self->displayname($nse,$seq->id());
2705 =head2 set_displayname_normal
2707 Title : set_displayname_normal
2708 Usage : $ali->set_displayname_normal()
2709 Function : Makes all the sequences be displayed as name/start-end
2710 Returns : 1, on success
2715 sub set_displayname_normal
{
2719 foreach $seq ( $self->each_seq() ) {
2720 $nse = $seq->get_nse();
2721 $self->displayname($nse,$nse);
2729 Usage : $obj->source($newval)
2730 Function: sets the Alignment source program
2732 Returns : value of source
2733 Args : newvalue (optional)
2739 my ($self,$value) = @_;
2740 if( defined $value) {
2741 $self->{'_source'} = $value;
2743 return $self->{'_source'};
2746 =head2 set_displayname_safe
2748 Title : set_displayname_safe
2749 Usage : ($new_aln, $ref_name)=$ali->set_displayname_safe(4)
2750 Function : Assign machine-generated serial names to sequences in input order.
2751 Designed to protect names during PHYLIP runs. Assign 10-char string
2752 in the form of "S000000001" to "S999999999". Restore the original
2753 names using "restore_displayname".
2754 Returns : 1. a new $aln with system names;
2755 2. a hash ref for restoring names
2756 Argument : Number for id length (default 10)
2760 sub set_displayname_safe
{
2762 my $idlength = shift || 10;
2763 my ($seq, %phylip_name);
2765 my $new=Bio
::SimpleAlign
->new();
2766 foreach $seq ( $self->each_seq() ) {
2768 my $pname="S". sprintf "%0" . ($idlength-1) . "s", $ct;
2769 $phylip_name{$pname}=$seq->id();
2770 my $new_seq= Bio
::LocatableSeq
->new(-id
=> $pname,
2771 -seq
=> $seq->seq(),
2772 -alphabet
=> $seq->alphabet,
2773 -start
=> $seq->{_start
},
2774 -end
=> $seq->{_end
}
2776 $new->add_seq($new_seq);
2779 $self->debug("$ct seq names changed. Restore names by using restore_displayname.");
2780 return ($new, \
%phylip_name);
2783 =head2 restore_displayname
2785 Title : restore_displayname
2786 Usage : $aln_name_restored=$ali->restore_displayname($hash_ref)
2787 Function : Restore original sequence names (after running
2788 $ali->set_displayname_safe)
2789 Returns : a new $aln with names restored.
2790 Argument : a hash reference of names from "set_displayname_safe".
2794 sub restore_displayname
{
2798 my $new=Bio
::SimpleAlign
->new();
2799 foreach my $seq ( $self->each_seq() ) {
2800 $self->throw("No sequence with name") unless defined $name{$seq->id()};
2801 my $new_seq= Bio
::LocatableSeq
->new(-id
=> $name{$seq->id()},
2802 -seq
=> $seq->seq(),
2803 -alphabet
=> $seq->alphabet,
2804 -start
=> $seq->{_start
},
2805 -end
=> $seq->{_end
}
2807 $new->add_seq($new_seq);
2812 =head2 sort_by_start
2814 Title : sort_by_start
2815 Usage : $ali->sort_by_start
2816 Function : Changes the order of the alignment to the start position of each
2825 my ($seq,$nse,@arr,%hash,$count);
2826 foreach $seq ( $self->each_seq() ) {
2827 $nse = $seq->get_nse;
2831 %{$self->{'_order'}} = (); # reset the hash;
2832 foreach $nse ( sort _startend
keys %hash) {
2833 $self->{'_order'}->{$count} = $nse;
2841 my ($aname,$arange) = split (/[\/]/,$a);
2842 my ($bname,$brange) = split (/[\/]/,$b);
2843 my ($astart,$aend) = split(/\-/,$arange);
2844 my ($bstart,$bend) = split(/\-/,$brange);
2845 return $astart <=> $bstart;
2848 =head2 bracket_string
2850 Title : bracket_string
2851 Usage : my @params = (-refseq => 'testseq',
2852 -allele1 => 'allele1',
2853 -allele2 => 'allele2',
2854 -delimiters => '{}',
2856 $str = $aln->bracket_string(@params)
2858 Function : When supplied with a list of parameters (see below), returns a
2859 string in BIC format. This is used for allelic comparisons.
2860 Briefly, if either allele contains a base change when compared to
2861 the refseq, the base or gap for each allele is represented in
2862 brackets in the order present in the 'alleles' parameter.
2864 For the following data:
2873 the returned string with parameters 'refseq => testseq' and
2874 'alleles => [qw(allele1 allele2)]' would be:
2876 GGAT[C/-][C/-]ATT[C/C]CT[A/C]CT
2877 Returns : BIC-formatted string
2878 Argument : Required args
2879 refseq : string (ID) of the reference sequence used
2880 as basis for comparison
2881 allele1 : string (ID) of the first allele
2882 allele2 : string (ID) of the second allele
2884 delimiters: two symbol string of left and right delimiters.
2885 Only the first two symbols are used
2887 separator : string used as a separator. Only the first
2890 Throws : On no refseq/alleles, or invalid refseq/alleles.
2894 sub bracket_string
{
2895 my ($self, @args) = @_;
2896 my ($ref, $a1, $a2, $delim, $sep) =
2897 $self->_rearrange([qw(refseq allele1 allele2 delimiters separator)], @args);
2898 $self->throw('Missing refseq/allele1/allele2') if (!$a1 || !$a2 || !$ref);
2900 ($ld, $rd) = split('', $delim, 2) if $delim;
2904 my ($refseq, $allele1, $allele2) =
2905 map {( $self->each_seq_with_id($_) )} ($ref, $a1, $a2);
2906 if (!$refseq || !$allele1 || !$allele2) {
2907 $self->throw("One of your refseq/allele IDs is invalid!");
2909 my $len = $self->length-1;
2911 # loop over the alignment columns
2912 for my $column ( 0 .. $len ) {
2914 my ($compres, $res1, $res2) =
2915 map{substr($_->seq, $column, 1)} ($refseq, $allele1, $allele2);
2916 # are any of the allele symbols different from the refseq?
2917 $string = ($compres eq $res1 && $compres eq $res2) ?
$compres :
2918 $ld.$res1.$sep.$res2.$rd;
2925 =head2 methods for Bio::FeatureHolder
2927 FeatureHolder implementation to support labeled character sets like one
2928 would get from NEXUS represented data.
2930 =head2 get_SeqFeatures
2933 Function: Get the feature objects held by this feature holder.
2935 Returns : an array of Bio::SeqFeatureI implementing objects
2938 At some day we may want to expand this method to allow for a feature
2939 filter to be passed in.
2943 sub get_SeqFeatures
{
2946 if( !defined $self->{'_as_feat'} ) {
2947 $self->{'_as_feat'} = [];
2949 return @
{$self->{'_as_feat'}};
2952 =head2 add_SeqFeature
2954 Usage : $feat->add_SeqFeature($subfeat);
2955 $feat->add_SeqFeature($subfeat,'EXPAND')
2956 Function: adds a SeqFeature into the subSeqFeature array.
2957 with no 'EXPAND' qualifer, subfeat will be tested
2958 as to whether it lies inside the parent, and throw
2959 an exception if not.
2961 If EXPAND is used, the parent''s start/end/strand will
2962 be adjusted so that it grows to accommodate the new
2966 Args : a Bio::SeqFeatureI object
2970 sub add_SeqFeature
{
2971 my ($self,@feat) = @_;
2973 $self->{'_as_feat'} = [] unless $self->{'_as_feat'};
2975 foreach my $feat ( @feat ) {
2976 if( !$feat->isa("Bio::SeqFeatureI") ) {
2977 $self->throw("$feat is not a SeqFeatureI and that's what we expect...");
2980 push(@
{$self->{'_as_feat'}},$feat);
2986 =head2 remove_SeqFeatures
2988 Usage : $obj->remove_SeqFeatures
2989 Function: Removes all sub SeqFeatures. If you want to remove only a subset,
2990 remove that subset from the returned array, and add back the rest.
2991 Returns : The array of Bio::SeqFeatureI implementing sub-features that was
2992 deleted from this feature.
2997 sub remove_SeqFeatures
{
3000 return () unless $self->{'_as_feat'};
3001 my @feats = @
{$self->{'_as_feat'}};
3002 $self->{'_as_feat'} = [];
3006 =head2 feature_count
3008 Title : feature_count
3009 Usage : $obj->feature_count()
3010 Function: Return the number of SeqFeatures attached to a feature holder.
3012 This is before flattening a possible sub-feature tree.
3014 We provide a default implementation here that just counts
3015 the number of objects returned by get_SeqFeatures().
3016 Implementors may want to override this with a more
3017 efficient implementation.
3019 Returns : integer representing the number of SeqFeatures
3022 At some day we may want to expand this method to allow for a feature
3023 filter to be passed in.
3025 Our default implementation allows for any number of additional
3026 arguments and will pass them on to get_SeqFeatures(). I.e., in order to
3027 support filter arguments, just support them in get_SeqFeatures().
3034 if (defined($self->{'_as_feat'})) {
3035 return ($#{$self->{'_as_feat'}} + 1);
3041 =head2 get_all_SeqFeatures
3043 Title : get_all_SeqFeatures
3045 Function: Get the flattened tree of feature objects held by this
3046 feature holder. The difference to get_SeqFeatures is that
3047 the entire tree of sub-features will be flattened out.
3049 We provide a default implementation here, so implementors
3050 don''t necessarily need to implement this method.
3053 Returns : an array of Bio::SeqFeatureI implementing objects
3056 At some day we may want to expand this method to allow for a feature
3057 filter to be passed in.
3059 Our default implementation allows for any number of additional
3060 arguments and will pass them on to any invocation of
3061 get_SeqFeatures(), wherever a component of the tree implements
3062 FeatureHolderI. I.e., in order to support filter arguments, just
3063 support them in get_SeqFeatures().
3067 =head2 methods for Bio::AnnotatableI
3069 AnnotatableI implementation to support sequence alignments which
3070 contain annotation (NEXUS, Stockholm).
3075 Usage : $ann = $aln->annotation or
3076 $aln->annotation($ann)
3077 Function: Gets or sets the annotation
3078 Returns : Bio::AnnotationCollectionI object
3079 Args : None or Bio::AnnotationCollectionI object
3081 See L<Bio::AnnotationCollectionI> and L<Bio::Annotation::Collection>
3082 for more information
3087 my ($obj,$value) = @_;
3088 if( defined $value ) {
3089 $obj->throw("object of class ".ref($value)." does not implement ".
3090 "Bio::AnnotationCollectionI. Too bad.")
3091 unless $value->isa("Bio::AnnotationCollectionI");
3092 $obj->{'_annotation'} = $value;
3093 } elsif( ! defined $obj->{'_annotation'}) {
3094 $obj->{'_annotation'} = Bio
::Annotation
::Collection
->new();
3096 return $obj->{'_annotation'};