[bug 3148] switch default to "expasy" until we can work out REST service interface
[bioperl-live.git] / Bio / SimpleAlign.pm
blob45a9658dac462356f7043f3b4b348ab4bad3255c
1 # BioPerl module for SimpleAlign
3 # Please direct questions and support issues to <bioperl-l@bioperl.org>
5 # Cared for by Heikki Lehvaslaiho <heikki-at-bioperl-dot-org>
7 # Copyright Ewan Birney
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 # History:
14 # 11/3/00 Added threshold feature to consensus and consensus_aa - PS
15 # May 2001 major rewrite - Heikki Lehvaslaiho
17 =head1 NAME
19 Bio::SimpleAlign - Multiple alignments held as a set of sequences
21 =head1 SYNOPSIS
23 # Use Bio::AlignIO to read in the alignment
24 $str = Bio::AlignIO->new(-file => 't/data/testaln.pfam');
25 $aln = $str->next_aln();
27 # Describe
28 print $aln->length;
29 print $aln->num_residues;
30 print $aln->is_flush;
31 print $aln->num_sequences;
32 print $aln->score;
33 print $aln->percentage_identity;
34 print $aln->consensus_string(50);
36 # Find the position in the alignment for a sequence location
37 $pos = $aln->column_from_residue_number('1433_LYCES', 14); # = 6;
39 # Extract sequences and check values for the alignment column $pos
40 foreach $seq ($aln->each_seq) {
41 $res = $seq->subseq($pos, $pos);
42 $count{$res}++;
44 foreach $res (keys %count) {
45 printf "Res: %s Count: %2d\n", $res, $count{$res};
48 # Manipulate
49 $aln->remove_seq($seq);
50 $mini_aln = $aln->slice(20,30); # get a block of columns
51 $mini_aln = $aln->select_noncont(1,3,5,7,11); # select certain sequences
52 $new_aln = $aln->remove_columns([20,30]); # remove by position
53 $new_aln = $aln->remove_columns(['mismatch']); # remove by property
55 # Analyze
56 $str = $aln->consensus_string($threshold_percent);
57 $str = $aln->match_line();
58 $str = $aln->cigar_line();
59 $id = $aln->percentage_identity;
61 # See the module documentation for details and more methods.
63 =head1 DESCRIPTION
65 SimpleAlign is an object that handles a multiple sequence alignment
66 (MSA). It is very permissive of types (it does not insist on sequences
67 being all same length, for example). Think of it as a set of sequences
68 with a whole series of built-in manipulations and methods for reading and
69 writing alignments.
71 SimpleAlign uses L<Bio::LocatableSeq>, a subclass of L<Bio::PrimarySeq>,
72 to store its sequences. These are subsequences with a start and end
73 positions in the parent reference sequence. Each sequence in the
74 SimpleAlign object is a Bio::LocatableSeq.
76 SimpleAlign expects the combination of name, start, and end for a
77 given sequence to be unique in the alignment, and this is the key for the
78 internal hashes (name, start, end are abbreviated C<nse> in the code).
79 However, in some cases people do not want the name/start-end to be displayed:
80 either multiple names in an alignment or names specific to the alignment
81 (ROA1_HUMAN_1, ROA1_HUMAN_2 etc). These names are called
82 C<displayname>, and generally is what is used to print out the
83 alignment. They default to name/start-end.
85 The SimpleAlign Module is derived from the Align module by Ewan Birney.
87 =head1 FEEDBACK
89 =head2 Mailing Lists
91 User feedback is an integral part of the evolution of this and other
92 Bioperl modules. Send your comments and suggestions preferably to one
93 of the Bioperl mailing lists. Your participation is much appreciated.
95 bioperl-l@bioperl.org - General discussion
96 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
98 =head2 Support
100 Please direct usage questions or support issues to the mailing list:
102 I<bioperl-l@bioperl.org>
104 rather than to the module maintainer directly. Many experienced and
105 reponsive experts will be able look at the problem and quickly
106 address it. Please include a thorough description of the problem
107 with code and data examples if at all possible.
109 =head2 Reporting Bugs
111 Report bugs to the Bioperl bug tracking system to help us keep track
112 the bugs and their resolution. Bug reports can be submitted via the
113 web:
115 http://bugzilla.open-bio.org/
117 =head1 AUTHOR
119 Ewan Birney, birney@ebi.ac.uk
121 =head1 CONTRIBUTORS
123 Allen Day, allenday-at-ucla.edu,
124 Richard Adams, Richard.Adams-at-ed.ac.uk,
125 David J. Evans, David.Evans-at-vir.gla.ac.uk,
126 Heikki Lehvaslaiho, heikki-at-bioperl-dot-org,
127 Allen Smith, allens-at-cpan.org,
128 Jason Stajich, jason-at-bioperl.org,
129 Anthony Underwood, aunderwood-at-phls.org.uk,
130 Xintao Wei & Giri Narasimhan, giri-at-cs.fiu.edu
131 Brian Osborne, bosborne at alum.mit.edu
132 Weigang Qiu, Weigang at GENECTR-HUNTER-CUNY-EDU
133 Hongyu Zhang, forward at hongyu.org
134 Jay Hannah, jay at jays.net
135 Alexandr Bezginov, albezg at gmail.com
137 =head1 SEE ALSO
139 L<Bio::LocatableSeq>
141 =head1 APPENDIX
143 The rest of the documentation details each of the object
144 methods. Internal methods are usually preceded with a _
146 =cut
148 # 'Let the code begin...
150 package Bio::SimpleAlign;
151 use vars qw(%CONSERVATION_GROUPS);
152 use strict;
154 use Bio::LocatableSeq; # uses Seq's as list
156 use Bio::Seq;
157 use Bio::SeqFeature::Generic;
159 BEGIN {
160 # This data should probably be in a more centralized module...
161 # it is taken from Clustalw documentation.
162 # These are all the positively scoring groups that occur in the
163 # Gonnet Pam250 matrix. The strong and weak groups are
164 # defined as strong score >0.5 and weak score =<0.5 respectively.
166 %CONSERVATION_GROUPS = (
167 'strong' => [ qw(
169 NEQK
170 NHQK
171 NDEQ
172 QHRK
173 MILV
174 MILF
176 FYW )],
177 'weak' => [ qw(
181 STNK
182 STPA
183 SGND
184 SNDEQK
185 NDEQHK
186 NEQHRK
187 FVLIM
188 HFY )],);
191 use base qw(Bio::Root::Root Bio::Align::AlignI Bio::AnnotatableI
192 Bio::FeatureHolderI);
194 =head2 new
196 Title : new
197 Usage : my $aln = Bio::SimpleAlign->new();
198 Function : Creates a new simple align object
199 Returns : Bio::SimpleAlign
200 Args : -source => string representing the source program
201 where this alignment came from
202 -annotation => Bio::AnnotationCollectionI
203 -seq_annotation => Bio::AnnotationCollectionI for sequences (requires -annotation also be set)
204 -seqs => array ref containing Bio::LocatableSeq or Bio::Seq::Meta
205 -consensus => consensus string
206 -consensus_meta => Bio::Seq::Meta object containing consensus met information (kludge)
208 =cut
211 sub new {
212 my($class,@args) = @_;
214 my $self = $class->SUPER::new(@args);
216 my ($src, $score, $id, $acc, $desc, $seqs, $feats, $coll, $sa, $con, $cmeta) = $self->_rearrange([qw(
217 SOURCE
218 SCORE
220 ACCESSION
221 DESCRIPTION
222 SEQS
223 FEATURES
224 ANNOTATION
225 SEQ_ANNOTATION
226 CONSENSUS
227 CONSENSUS_META
228 )], @args);
229 $src && $self->source($src);
230 defined $score && $self->score($score);
231 # we need to set up internal hashs first!
233 $self->{'_seq'} = {};
234 $self->{'_order'} = {};
235 $self->{'_start_end_lists'} = {};
236 $self->{'_dis_name'} = {};
237 $self->{'_id'} = 'NoName';
238 # maybe we should automatically read in from args. Hmmm...
239 $id && $self->id($id);
240 $acc && $self->accession($acc);
241 $desc && $self->description($desc);
242 $coll && $self->annotation($coll);
243 # sequence annotation is layered into a provided annotation collection (or dies)
244 if ($sa) {
245 $self->throw("Must supply an alignment-based annotation collection (-annotation) ".
246 "with a sequence annotation collection")
247 if !$coll;
248 $coll->add_Annotation('seq_annotation', $sa);
250 if ($feats && ref $feats eq 'ARRAY') {
251 for my $feat (@$feats) {
252 $self->add_SeqFeature($feat);
255 $con && $self->consensus($con);
256 $cmeta && $self->consensus_meta($cmeta);
257 # assumes these are in correct alignment order
258 if ($seqs && ref($seqs) eq 'ARRAY') {
259 for my $seq (@$seqs) {
260 $self->add_seq($seq);
264 return $self; # success - we hope!
267 =head1 Modifier methods
269 These methods modify the MSA by adding, removing or shuffling complete
270 sequences.
272 =head2 add_seq
274 Title : add_seq
275 Usage : $myalign->add_seq($newseq);
276 $myalign->add_seq(-SEQ=>$newseq, -ORDER=>5);
277 Function : Adds another sequence to the alignment. *Does not* align
278 it - just adds it to the hashes.
279 If -ORDER is specified, the sequence is inserted at the
280 the position spec'd by -ORDER, and existing sequences
281 are pushed down the storage array.
282 Returns : nothing
283 Args : A Bio::LocatableSeq object
284 Positive integer for the sequence position (optional)
286 See L<Bio::LocatableSeq> for more information
288 =cut
290 sub addSeq {
291 my $self = shift;
292 $self->deprecated("addSeq - deprecated method. Use add_seq() instead.");
293 $self->add_seq(@_);
296 sub add_seq {
297 my $self = shift;
298 my @args = @_;
299 my ($seq, $order) = $self->_rearrange([qw(SEQ ORDER)], @args);
300 my ($name,$id,$start,$end);
302 unless ($seq) {
303 $self->throw("LocatableSeq argument required");
305 if( ! ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
306 $self->throw("Unable to process non locatable sequences [". ref($seq). "]");
308 !defined($order) and $order = 1 + keys %{$self->{'_seq'}}; # default
309 $order--; # jay's patch (user-specified order is 1-origin)
311 if ($order < 0) {
312 $self->throw("User-specified value for ORDER must be >= 1");
315 $id = $seq->id() ||$seq->display_id || $seq->primary_id;
317 # build the symbol list for this sequence,
318 # will prune out the gap and missing/match chars
319 # when actually asked for the symbol list in the
320 # symbol_chars
321 # map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq) if $seq->seq;
323 $name = $seq->get_nse;
325 if( $self->{'_seq'}->{$name} ) {
326 $self->warn("Replacing one sequence [$name]\n") unless $self->verbose < 0;
328 else {
329 $self->debug( "Assigning $name to $order\n");
331 my $ordh = $self->{'_order'};
332 if ($ordh->{$order}) {
333 # make space to insert
334 # $c->() returns (in reverse order) the first subsequence
335 # of consecutive integers; i.e., $c->(1,2,3,5,6,7) returns
336 # (3,2,1), and $c->(2,4,5) returns (2).
337 my $c;
338 $c = sub { return (($_[1]-$_[0] == 1) ? ($c->(@_[1..$#_]),$_[0]) : $_[0]); };
339 map {
340 $ordh->{$_+1} = $ordh->{$_}
341 } $c->(sort {$a <=> $b} grep {$_ >= $order} keys %{$ordh});
344 $ordh->{$order} = $name;
346 unless( exists( $self->{'_start_end_lists'}->{$id})) {
347 $self->{'_start_end_lists'}->{$id} = [];
349 push @{$self->{'_start_end_lists'}->{$id}}, $seq;
352 $self->{'_seq'}->{$name} = $seq;
357 =head2 remove_seq
359 Title : remove_seq
360 Usage : $aln->remove_seq($seq);
361 Function : Removes a single sequence from an alignment
362 Returns :
363 Argument : a Bio::LocatableSeq object
365 =cut
367 sub removeSeq {
368 my $self = shift;
369 $self->deprecated("removeSeq - deprecated method. Use remove_seq() instead.");
370 $self->remove_seq(@_);
373 sub remove_seq {
374 my $self = shift;
375 my $seq = shift;
376 my ($name,$id,$start,$end);
378 $self->throw("Need Bio::Locatable seq argument ")
379 unless ref $seq && $seq->isa( 'Bio::LocatableSeq');
381 $id = $seq->id();
382 $start = $seq->start();
383 $end = $seq->end();
384 $name = sprintf("%s/%d-%d",$id,$start,$end);
386 if( !exists $self->{'_seq'}->{$name} ) {
387 $self->throw("Sequence $name does not exist in the alignment to remove!");
390 delete $self->{'_seq'}->{$name};
392 # we need to remove this seq from the start_end_lists hash
394 if (exists $self->{'_start_end_lists'}->{$id}) {
395 # we need to find the sequence in the array.
397 my ($i, $found);;
398 for ($i=0; $i < @{$self->{'_start_end_lists'}->{$id}}; $i++) {
399 if (${$self->{'_start_end_lists'}->{$id}}[$i] eq $seq) {
400 $found = 1;
401 last;
404 if ($found) {
405 splice @{$self->{'_start_end_lists'}->{$id}}, $i, 1;
407 else {
408 $self->throw("Could not find the sequence to remoce from the start-end list");
411 else {
412 $self->throw("There is no seq list for the name $id");
414 # we need to shift order hash
415 my %rev_order = reverse %{$self->{'_order'}};
416 my $no = $rev_order{$name};
417 my $num_sequences = $self->num_sequences;
418 for (; $no < $num_sequences; $no++) {
419 $self->{'_order'}->{$no} = $self->{'_order'}->{$no+1};
421 delete $self->{'_order'}->{$no};
422 return 1;
426 =head2 purge
428 Title : purge
429 Usage : $aln->purge(0.7);
430 Function: Removes sequences above given sequence similarity
431 This function will grind on large alignments. Beware!
432 Example :
433 Returns : An array of the removed sequences
434 Args : float, threshold for similarity
436 =cut
438 sub purge {
439 my ($self,$perc) = @_;
440 my (%duplicate, @dups);
442 my @seqs = $self->each_seq();
444 for (my $i=0;$i< @seqs - 1;$i++ ) { #for each seq in alignment
445 my $seq = $seqs[$i];
447 #skip if already in duplicate hash
448 next if exists $duplicate{$seq->display_id} ;
449 my $one = $seq->seq();
451 my @one = split '', $one; #split to get 1aa per array element
453 for (my $j=$i+1;$j < @seqs;$j++) {
454 my $seq2 = $seqs[$j];
456 #skip if already in duplicate hash
457 next if exists $duplicate{$seq2->display_id} ;
459 my $two = $seq2->seq();
460 my @two = split '', $two;
462 my $count = 0;
463 my $res = 0;
464 for (my $k=0;$k<@one;$k++) {
465 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
466 $one[$k] eq $two[$k]) {
467 $count++;
469 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
470 $two[$k] ne '.' && $two[$k] ne '-' ) {
471 $res++;
475 my $ratio = 0;
476 $ratio = $count/$res unless $res == 0;
478 # if above threshold put in duplicate hash and push onto
479 # duplicate array for returning to get_unique
480 if ( $ratio > $perc ) {
481 $self->warn("duplicate: ", $seq2->display_id) if $self->verbose > 0;
482 $duplicate{$seq2->display_id} = 1;
483 push @dups, $seq2;
487 foreach my $seq (@dups) {
488 $self->remove_seq($seq);
490 return @dups;
493 =head2 sort_alphabetically
495 Title : sort_alphabetically
496 Usage : $ali->sort_alphabetically
497 Function : Changes the order of the alignment to alphabetical on name
498 followed by numerical by number.
499 Returns :
500 Argument :
502 =cut
504 sub sort_alphabetically {
505 my $self = shift;
506 my ($seq,$nse,@arr,%hash,$count);
508 foreach $seq ( $self->each_seq() ) {
509 $nse = $seq->get_nse;
510 $hash{$nse} = $seq;
513 $count = 0;
515 %{$self->{'_order'}} = (); # reset the hash;
517 foreach $nse ( sort _alpha_startend keys %hash) {
518 $self->{'_order'}->{$count} = $nse;
520 $count++;
525 =head2 sort_by_list
527 Title : sort_by_list
528 Usage : $aln_ordered=$aln->sort_by_list($list_file)
529 Function : Arbitrarily order sequences in an alignment
530 Returns : A new Bio::SimpleAlign object
531 Argument : a file listing sequence names in intended order (one name per line)
533 =cut
535 sub sort_by_list {
536 my ($self, $list) = @_;
537 my (@seq, @ids, %order);
539 foreach my $seq ( $self->each_seq() ) {
540 push @seq, $seq;
541 push @ids, $seq->display_id;
544 my $ct=1;
545 open(my $listfh, '<', $list) || $self->throw("can't open file for reading: $list");
546 while (<$listfh>) {
547 chomp;
548 my $name=$_;
549 $self->throw("Not found in alignment: $name") unless &_in_aln($name, \@ids);
550 $order{$name}=$ct++;
552 close($listfh);
554 # use the map-sort-map idiom:
555 my @sorted= map { $_->[1] } sort { $a->[0] <=> $b->[0] } map { [$order{$_->id()}, $_] } @seq;
556 my $aln = $self->new;
557 foreach (@sorted) { $aln->add_seq($_) }
558 return $aln;
561 =head2 set_new_reference
563 Title : set_new_reference
564 Usage : $aln->set_new_reference(3 or 'B31'): Select the 3rd sequence, or
565 the sequence whoes name is "B31" (full, exact, and case-sensitive),
566 as the reference (1st) sequence
567 Function : Change/Set a new reference (i.e., the first) sequence
568 Returns : a new Bio::SimpleAlign object.
569 Throws an exception if designated sequence not found
570 Argument : a positive integer of sequence order, or a sequence name
571 in the original alignment
573 =cut
575 sub set_new_reference {
576 my ($self, $seqid) = @_;
577 my $aln = $self->new;
578 my (@seq, @ids, @new_seq);
579 my $is_num=0;
580 foreach my $seq ( $self->each_seq() ) {
581 push @seq, $seq;
582 push @ids, $seq->display_id;
585 if ($seqid =~ /^\d+$/) { # argument is seq position
586 $is_num=1;
587 $self->throw("The new reference sequence number has to be a positive integer >1 and <= num_sequences ") if ($seqid <= 1 || $seqid > $self->num_sequences);
588 } else { # argument is a seq name
589 $self->throw("The new reference sequence not in alignment ") unless &_in_aln($seqid, \@ids);
592 for (my $i=0; $i<=$#seq; $i++) {
593 my $pos=$i+1;
594 if ( ($is_num && $pos == $seqid) || ($seqid eq $seq[$i]->display_id) ) {
595 unshift @new_seq, $seq[$i];
596 } else {
597 push @new_seq, $seq[$i];
600 foreach (@new_seq) { $aln->add_seq($_); }
601 return $aln;
604 sub _in_aln { # check if input name exists in the alignment
605 my ($str, $ref) = @_;
606 foreach (@$ref) {
607 return 1 if $str eq $_;
609 return 0;
613 =head2 uniq_seq
615 Title : uniq_seq
616 Usage : $aln->uniq_seq(): Remove identical sequences in
617 in the alignment. Ambiguous base ("N", "n") and
618 leading and ending gaps ("-") are NOT counted as
619 differences.
620 Function : Make a new alignment of unique sequence types (STs)
621 Returns : 1a. if called in a scalar context,
622 a new Bio::SimpleAlign object (all sequences renamed as "ST")
623 1b. if called in an array context,
624 a new Bio::SimpleAlign object, and a hashref whose keys
625 are sequence types, and whose values are arrayrefs to
626 lists of sequence ids within the corresponding sequence type
627 2. if $aln->verbose > 0, ST of each sequence is sent to
628 STDERR (in a tabular format)
629 Argument : None
631 =cut
633 sub uniq_seq {
634 my ($self, $seqid) = @_;
635 my $aln = $self->new;
636 my (%member, %order, @seq, @uniq_str, $st);
637 my $order=0;
638 my $len = $self->length();
639 $st = {};
640 foreach my $seq ( $self->each_seq() ) {
641 my $str = $seq->seq();
643 # it's necessary to ignore "n", "N", leading gaps and ending gaps in
644 # comparing two sequence strings
646 # 1st, convert "n", "N" to "?" (for DNA sequence only):
647 $str =~ s/n/\?/gi if $str =~ /^[atcgn-]+$/i;
648 # 2nd, convert leading and ending gaps to "?":
649 $str = &_convert_leading_ending_gaps($str, '-', '?');
650 # Note that '?' also can mean unknown residue.
651 # I don't like making global class member changes like this, too
652 # prone to errors... -- cjfields 08-11-18
653 local $Bio::LocatableSeq::GAP_SYMBOLS = '-\?';
654 my $new = Bio::LocatableSeq->new(
655 -id => $seq->id(),
656 -alphabet=> $seq->alphabet,
657 -seq => $str,
658 -start => $seq->start,
659 -end => $seq->end
661 push @seq, $new;
664 foreach my $seq (@seq) {
665 my $str = $seq->seq();
666 my ($seen, $key) = &_check_uniq($str, \@uniq_str, $len);
667 if ($seen) { # seen before
668 my @memb = @{$member{$key}};
669 push @memb, $seq;
670 $member{$key} = \@memb;
671 } else { # not seen
672 push @uniq_str, $key;
673 $order++;
674 $member{$key} = [ ($seq) ];
675 $order{$key} = $order;
679 foreach my $str (sort {$order{$a} <=> $order{$b}} keys %order) { # sort by input order
680 # convert leading/ending "?" back into "-" ("?" throws errors by SimpleAlign):
681 my $str2 = &_convert_leading_ending_gaps($str, '?', '-');
682 # convert middle "?" back into "N" ("?" throws errors by SimpleAlign):
683 $str2 =~ s/\?/N/g if $str2 =~ /^[atcg\-\?]+$/i;
684 my $gap='-';
685 my $end= CORE::length($str2);
686 $end -= CORE::length($1) while $str2 =~ m/($gap+)/g;
687 my $new = Bio::LocatableSeq->new(-id =>"ST".$order{$str},
688 -seq =>$str2,
689 -start=>1,
690 -end =>$end
692 $aln->add_seq($new);
693 foreach (@{$member{$str}}) {
694 push @{$$st{$order{$str}}}, $_->id(); # per Tristan's patch/Bug #2805
695 $self->debug($_->id(), "\t", "ST", $order{$str}, "\n");
698 return wantarray ? ($aln, $st) : $aln;
701 sub _check_uniq { # check if same seq exists in the alignment
702 my ($str1, $ref, $length) = @_;
703 my @char1=split //, $str1;
704 my @array=@$ref;
706 return (0, $str1) if @array==0; # not seen (1st sequence)
708 foreach my $str2 (@array) {
709 my $diff=0;
710 my @char2=split //, $str2;
711 for (my $i=0; $i<=$length-1; $i++) {
712 next if $char1[$i] eq '?';
713 next if $char2[$i] eq '?';
714 $diff++ if $char1[$i] ne $char2[$i];
716 return (1, $str2) if $diff == 0; # seen before
719 return (0, $str1); # not seen
722 sub _convert_leading_ending_gaps {
723 my $s=shift;
724 my $sym1=shift;
725 my $sym2=shift;
726 my @array=split //, $s;
727 # convert leading char:
728 for (my $i=0; $i<=$#array; $i++) {
729 ($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
731 # convert ending char:
732 for (my $i = $#array; $i>= 0; $i--) {
733 ($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
735 my $s_new=join '', @array;
736 return $s_new;
739 =head1 Sequence selection methods
741 Methods returning one or more sequences objects.
743 =head2 each_seq
745 Title : each_seq
746 Usage : foreach $seq ( $align->each_seq() )
747 Function : Gets a Seq object from the alignment
748 Returns : Seq object
749 Argument :
751 =cut
753 sub eachSeq {
754 my $self = shift;
755 $self->deprecated("eachSeq - deprecated method. Use each_seq() instead.");
756 $self->each_seq();
759 sub each_seq {
760 my $self = shift;
761 my (@arr,$order);
763 foreach $order ( sort { $a <=> $b } keys %{$self->{'_order'}} ) {
764 if( exists $self->{'_seq'}->{$self->{'_order'}->{$order}} ) {
765 push(@arr,$self->{'_seq'}->{$self->{'_order'}->{$order}});
768 return @arr;
772 =head2 each_alphabetically
774 Title : each_alphabetically
775 Usage : foreach $seq ( $ali->each_alphabetically() )
776 Function : Returns a sequence object, but the objects are returned
777 in alphabetically sorted order.
778 Does not change the order of the alignment.
779 Returns : Seq object
780 Argument :
782 =cut
784 sub each_alphabetically {
785 my $self = shift;
786 my ($seq,$nse,@arr,%hash,$count);
788 foreach $seq ( $self->each_seq() ) {
789 $nse = $seq->get_nse;
790 $hash{$nse} = $seq;
793 foreach $nse ( sort _alpha_startend keys %hash) {
794 push(@arr,$hash{$nse});
796 return @arr;
799 sub _alpha_startend {
800 my ($aname,$astart,$bname,$bstart);
801 ($aname,$astart) = split (/-/,$a);
802 ($bname,$bstart) = split (/-/,$b);
804 if( $aname eq $bname ) {
805 return $astart <=> $bstart;
807 else {
808 return $aname cmp $bname;
812 =head2 each_seq_with_id
814 Title : each_seq_with_id
815 Usage : foreach $seq ( $align->each_seq_with_id() )
816 Function : Gets a Seq objects from the alignment, the contents
817 being those sequences with the given name (there may be
818 more than one)
819 Returns : Seq object
820 Argument : a seq name
822 =cut
824 sub eachSeqWithId {
825 my $self = shift;
826 $self->deprecated("eachSeqWithId - deprecated method. Use each_seq_with_id() instead.");
827 $self->each_seq_with_id(@_);
830 sub each_seq_with_id {
831 my $self = shift;
832 my $id = shift;
834 $self->throw("Method each_seq_with_id needs a sequence name argument")
835 unless defined $id;
837 my (@arr, $seq);
839 if (exists($self->{'_start_end_lists'}->{$id})) {
840 @arr = @{$self->{'_start_end_lists'}->{$id}};
842 return @arr;
845 =head2 get_seq_by_pos
847 Title : get_seq_by_pos
848 Usage : $seq = $aln->get_seq_by_pos(3) # third sequence from the alignment
849 Function : Gets a sequence based on its position in the alignment.
850 Numbering starts from 1. Sequence positions larger than
851 num_sequences() will thow an error.
852 Returns : a Bio::LocatableSeq object
853 Args : positive integer for the sequence position
855 =cut
857 sub get_seq_by_pos {
859 my $self = shift;
860 my ($pos) = @_;
862 $self->throw("Sequence position has to be a positive integer, not [$pos]")
863 unless $pos =~ /^\d+$/ and $pos > 0;
864 $self->throw("No sequence at position [$pos]")
865 unless $pos <= $self->num_sequences ;
867 my $nse = $self->{'_order'}->{--$pos};
868 return $self->{'_seq'}->{$nse};
871 =head2 get_seq_by_id
873 Title : get_seq_by_id
874 Usage : $seq = $aln->get_seq_by_id($name) # seq named $name
875 Function : Gets a sequence based on its name.
876 Sequences that do not exist will warn and return undef
877 Returns : a Bio::LocatableSeq object
878 Args : string for sequence name
880 =cut
882 sub get_seq_by_id {
883 my ($self,$name) = @_;
884 unless( defined $name ) {
885 $self->warn("Must provide a sequence name");
886 return;
888 for my $seq ( values %{$self->{'_seq'}} ) {
889 if ( $seq->id eq $name) {
890 return $seq;
893 return;
896 =head2 seq_with_features
898 Title : seq_with_features
899 Usage : $seq = $aln->seq_with_features(-pos => 1,
900 -consensus => 60
901 -mask =>
902 sub { my $consensus = shift;
904 for my $i (1..5){
905 my $n = 'N' x $i;
906 my $q = '\?' x $i;
907 while($consensus =~ /[^?]$q[^?]/){
908 $consensus =~ s/([^?])$q([^?])/$1$n$2/;
911 return $consensus;
914 Function: produces a Bio::Seq object by first splicing gaps from -pos
915 (by means of a splice_by_seq_pos() call), then creating
916 features using non-? chars (by means of a consensus_string()
917 call with stringency -consensus).
918 Returns : a Bio::Seq object
919 Args : -pos : required. sequence from which to build the Bio::Seq
920 object
921 -consensus : optional, defaults to consensus_string()'s
922 default cutoff value
923 -mask : optional, a coderef to apply to consensus_string()'s
924 output before building features. this may be useful for
925 closing gaps of 1 bp by masking over them with N, for
926 instance
928 =cut
930 sub seq_with_features{
931 my ($self,%arg) = @_;
933 #first do the preparatory splice
934 $self->throw("must provide a -pos argument") unless $arg{-pos};
935 $self->splice_by_seq_pos($arg{-pos});
937 my $consensus_string = $self->consensus_string($arg{-consensus});
938 $consensus_string = $arg{-mask}->($consensus_string)
939 if defined($arg{-mask});
941 my(@bs,@es);
943 push @bs, 1 if $consensus_string =~ /^[^?]/;
945 while($consensus_string =~ /\?[^?]/g){
946 push @bs, pos($consensus_string);
948 while($consensus_string =~ /[^?]\?/g){
949 push @es, pos($consensus_string);
952 push @es, CORE::length($consensus_string) if $consensus_string =~ /[^?]$/;
954 my $seq = Bio::Seq->new();
956 # my $rootfeature = Bio::SeqFeature::Generic->new(
957 # -source_tag => 'location',
958 # -start => $self->get_seq_by_pos($arg{-pos})->start,
959 # -end => $self->get_seq_by_pos($arg{-pos})->end,
960 # );
961 # $seq->add_SeqFeature($rootfeature);
963 while(my $b = shift @bs){
964 my $e = shift @es;
965 $seq->add_SeqFeature(
966 Bio::SeqFeature::Generic->new(
967 -start => $b - 1 + $self->get_seq_by_pos($arg{-pos})->start,
968 -end => $e - 1 + $self->get_seq_by_pos($arg{-pos})->start,
969 -source_tag => $self->source || 'MSA',
974 return $seq;
978 =head1 Create new alignments
980 The result of these methods are horizontal or vertical subsets of the
981 current MSA.
983 =head2 select
985 Title : select
986 Usage : $aln2 = $aln->select(1, 3) # three first sequences
987 Function : Creates a new alignment from a continuous subset of
988 sequences. Numbering starts from 1. Sequence positions
989 larger than num_sequences() will thow an error.
990 Returns : a Bio::SimpleAlign object
991 Args : positive integer for the first sequence
992 positive integer for the last sequence to include (optional)
994 =cut
996 sub select {
997 my $self = shift;
998 my ($start, $end) = @_;
1000 $self->throw("Select start has to be a positive integer, not [$start]")
1001 unless $start =~ /^\d+$/ and $start > 0;
1002 $self->throw("Select end has to be a positive integer, not [$end]")
1003 unless $end =~ /^\d+$/ and $end > 0;
1004 $self->throw("Select $start [$start] has to be smaller than or equal to end [$end]")
1005 unless $start <= $end;
1007 my $aln = $self->new;
1008 foreach my $pos ($start .. $end) {
1009 $aln->add_seq($self->get_seq_by_pos($pos));
1011 $aln->id($self->id);
1012 # fix for meta, sf, ann
1013 return $aln;
1016 =head2 select_noncont
1018 Title : select_noncont
1019 Usage : # 1st and 3rd sequences, sorted
1020 $aln2 = $aln->select_noncont(1, 3)
1022 # 1st and 3rd sequences, sorted (same as first)
1023 $aln2 = $aln->select_noncont(3, 1)
1025 # 1st and 3rd sequences, unsorted
1026 $aln2 = $aln->select_noncont('nosort',3, 1)
1028 Function : Creates a new alignment from a subset of sequences. Numbering
1029 starts from 1. Sequence positions larger than num_sequences() will
1030 throw an error. Sorts the order added to new alignment by default,
1031 to prevent sorting pass 'nosort' as the first argument in the list.
1032 Returns : a Bio::SimpleAlign object
1033 Args : array of integers for the sequences. If the string 'nosort' is
1034 passed as the first argument, the sequences will not be sorted
1035 in the new alignment but will appear in the order listed.
1037 =cut
1039 sub select_noncont {
1040 my $self = shift;
1041 my $nosort = 0;
1042 my (@pos) = @_;
1043 if ($pos[0] !~ m{^\d+$}) {
1044 my $sortcmd = shift @pos;
1045 if ($sortcmd eq 'nosort') {
1046 $nosort = 1;
1047 } else {
1048 $self->throw("Command not recognized: $sortcmd. Only 'nosort' implemented at this time.");
1052 my $end = $self->num_sequences;
1053 foreach ( @pos ) {
1054 $self->throw("position must be a positive integer, > 0 and <= $end not [$_]")
1055 unless( /^\d+$/ && $_ > 0 && $_ <= $end );
1058 @pos = sort {$a <=> $b} @pos unless $nosort;
1060 my $aln = $self->new;
1061 foreach my $p (@pos) {
1062 $aln->add_seq($self->get_seq_by_pos($p));
1064 $aln->id($self->id);
1065 # fix for meta, sf, ann
1066 return $aln;
1069 =head2 slice
1071 Title : slice
1072 Usage : $aln2 = $aln->slice(20,30)
1073 Function : Creates a slice from the alignment inclusive of start and
1074 end columns, and the first column in the alignment is denoted 1.
1075 Sequences with no residues in the slice are excluded from the
1076 new alignment and a warning is printed. Slice beyond the length of
1077 the sequence does not do padding.
1078 Returns : A Bio::SimpleAlign object
1079 Args : Positive integer for start column, positive integer for end column,
1080 optional boolean which if true will keep gap-only columns in the newly
1081 created slice. Example:
1083 $aln2 = $aln->slice(20,30,1)
1085 =cut
1087 sub slice {
1088 my $self = shift;
1089 my ($start, $end, $keep_gap_only) = @_;
1091 $self->throw("Slice start has to be a positive integer, not [$start]")
1092 unless $start =~ /^\d+$/ and $start > 0;
1093 $self->throw("Slice end has to be a positive integer, not [$end]")
1094 unless $end =~ /^\d+$/ and $end > 0;
1095 $self->throw("Slice start [$start] has to be smaller than or equal to end [$end]")
1096 unless $start <= $end;
1097 $self->throw("This alignment has only ". $self->length . " residues. Slice start " .
1098 "[$start] is too big.") if $start > $self->length;
1099 my $cons_meta = $self->consensus_meta;
1100 my $aln = $self->new;
1101 $aln->id($self->id);
1102 foreach my $seq ( $self->each_seq() ) {
1103 my $new_seq = $seq->isa('Bio::Seq::MetaI') ?
1104 Bio::Seq::Meta->new
1105 (-id => $seq->id,
1106 -alphabet => $seq->alphabet,
1107 -strand => $seq->strand,
1108 -verbose => $self->verbose) :
1109 Bio::LocatableSeq->new
1110 (-id => $seq->id,
1111 -alphabet => $seq->alphabet,
1112 -strand => $seq->strand,
1113 -verbose => $self->verbose);
1115 # seq
1116 my $seq_end = $end;
1117 $seq_end = $seq->length if( $end > $seq->length );
1119 my $slice_seq = $seq->subseq($start, $seq_end);
1120 $new_seq->seq( $slice_seq );
1122 $slice_seq =~ s/\W//g;
1124 if ($start > 1) {
1125 my $pre_start_seq = $seq->subseq(1, $start - 1);
1126 $pre_start_seq =~ s/\W//g;
1127 if (!defined($seq->strand)) {
1128 $new_seq->start( $seq->start + CORE::length($pre_start_seq) );
1129 } elsif ($seq->strand < 0){
1130 $new_seq->start( $seq->end - CORE::length($pre_start_seq) - CORE::length($slice_seq) + 1);
1131 } else {
1132 $new_seq->start( $seq->start + CORE::length($pre_start_seq) );
1134 } else {
1135 if ((defined $seq->strand)&&($seq->strand < 0)){
1136 $new_seq->start( $seq->end - CORE::length($slice_seq) + 1);
1137 } else {
1138 $new_seq->start( $seq->start);
1141 if ($new_seq->isa('Bio::Seq::MetaI')) {
1142 for my $meta_name ($seq->meta_names) {
1143 $new_seq->named_meta($meta_name, $seq->named_submeta($meta_name, $start, $end));
1146 $new_seq->end( $new_seq->start + CORE::length($slice_seq) - 1 );
1148 if ($new_seq->start and $new_seq->end >= $new_seq->start) {
1149 $aln->add_seq($new_seq);
1150 } else {
1151 if( $keep_gap_only ) {
1152 $aln->add_seq($new_seq);
1153 } else {
1154 my $nse = $seq->get_nse();
1155 $self->warn("Slice [$start-$end] of sequence [$nse] contains no residues.".
1156 " Sequence excluded from the new alignment.");
1160 if ($cons_meta) {
1161 my $new = Bio::Seq::Meta->new();
1162 for my $meta_name ($cons_meta->meta_names) {
1163 $new->named_meta($meta_name, $cons_meta->named_submeta($meta_name, $start, $end));
1165 $aln->consensus_meta($new);
1167 $aln->annotation($self->annotation);
1168 # fix for meta, sf, ann
1169 return $aln;
1172 =head2 remove_columns
1174 Title : remove_columns
1175 Usage : $aln2 = $aln->remove_columns(['mismatch','weak']) or
1176 $aln2 = $aln->remove_columns([0,0],[6,8])
1177 Function : Creates an aligment with columns removed corresponding to
1178 the specified type or by specifying the columns by number.
1179 Returns : Bio::SimpleAlign object
1180 Args : Array ref of types ('match'|'weak'|'strong'|'mismatch'|'gaps'|
1181 'all_gaps_columns') or array ref where the referenced array
1182 contains a pair of integers that specify a range.
1183 The first column is 0
1185 =cut
1187 sub remove_columns {
1188 my ($self,@args) = @_;
1189 @args || $self->throw("Must supply column ranges or column types");
1190 my $aln;
1192 if ($args[0][0] =~ /^[a-z_]+$/i) {
1193 $aln = $self->_remove_columns_by_type($args[0]);
1194 } elsif ($args[0][0] =~ /^\d+$/) {
1195 $aln = $self->_remove_columns_by_num(\@args);
1196 } else {
1197 $self->throw("You must pass array references to remove_columns(), not @args");
1199 # fix for meta, sf, ann
1200 $aln;
1204 =head2 remove_gaps
1206 Title : remove_gaps
1207 Usage : $aln2 = $aln->remove_gaps
1208 Function : Creates an aligment with gaps removed
1209 Returns : a Bio::SimpleAlign object
1210 Args : a gap character(optional) if none specified taken
1211 from $self->gap_char,
1212 [optional] $all_gaps_columns flag (1 or 0, default is 0)
1213 indicates that only all-gaps columns should be deleted
1215 Used from method L<remove_columns> in most cases. Set gap character
1216 using L<gap_char()|gap_char>.
1218 =cut
1220 sub remove_gaps {
1221 my ($self,$gapchar,$all_gaps_columns) = @_;
1222 my $gap_line;
1223 if ($all_gaps_columns) {
1224 $gap_line = $self->all_gap_line($gapchar);
1225 } else {
1226 $gap_line = $self->gap_line($gapchar);
1228 my $aln = $self->new;
1230 my @remove;
1231 my $length = 0;
1232 my $del_char = $gapchar || $self->gap_char;
1233 # Do the matching to get the segments to remove
1234 while ($gap_line =~ m/[$del_char]/g) {
1235 my $start = pos($gap_line)-1;
1236 $gap_line=~/\G[$del_char]+/gc;
1237 my $end = pos($gap_line)-1;
1239 #have to offset the start and end for subsequent removes
1240 $start-=$length;
1241 $end -=$length;
1242 $length += ($end-$start+1);
1243 push @remove, [$start,$end];
1246 #remove the segments
1247 $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1248 # fix for meta, sf, ann
1249 return $aln;
1253 sub _remove_col {
1254 my ($self,$aln,$remove) = @_;
1255 my @new;
1257 my $gap = $self->gap_char;
1259 # splice out the segments and create new seq
1260 foreach my $seq($self->each_seq){
1261 my $new_seq = Bio::LocatableSeq->new(
1262 -id => $seq->id,
1263 -alphabet=> $seq->alphabet,
1264 -strand => $seq->strand,
1265 -verbose => $self->verbose);
1266 my $sequence = $seq->seq;
1267 foreach my $pair(@{$remove}){
1268 my $start = $pair->[0];
1269 my $end = $pair->[1];
1270 $sequence = $seq->seq unless $sequence;
1271 my $orig = $sequence;
1272 my $head = $start > 0 ? substr($sequence, 0, $start) : '';
1273 my $tail = ($end + 1) >= CORE::length($sequence) ? '' : substr($sequence, $end + 1);
1274 $sequence = $head.$tail;
1275 # start
1276 unless (defined $new_seq->start) {
1277 if ($start == 0) {
1278 my $start_adjust = () = substr($orig, 0, $end + 1) =~ /$gap/g;
1279 $new_seq->start($seq->start + $end + 1 - $start_adjust);
1281 else {
1282 my $start_adjust = $orig =~ /^$gap+/;
1283 if ($start_adjust) {
1284 $start_adjust = $+[0] == $start;
1286 $new_seq->start($seq->start + $start_adjust);
1289 # end
1290 if (($end + 1) >= CORE::length($orig)) {
1291 my $end_adjust = () = substr($orig, $start) =~ /$gap/g;
1292 $new_seq->end($seq->end - (CORE::length($orig) - $start) + $end_adjust);
1294 else {
1295 $new_seq->end($seq->end);
1299 if ($new_seq->end < $new_seq->start) {
1300 # we removed all columns except for gaps: set to 0 to indicate no
1301 # sequence
1302 $new_seq->start(0);
1303 $new_seq->end(0);
1306 $new_seq->seq($sequence) if $sequence;
1307 push @new, $new_seq;
1309 # add the new seqs to the alignment
1310 foreach my $new(@new){
1311 $aln->add_seq($new);
1313 # fix for meta, sf, ann
1314 return $aln;
1317 sub _remove_columns_by_type {
1318 my ($self,$type) = @_;
1319 my $aln = $self->new;
1320 my @remove;
1322 my $gap = $self->gap_char if (grep { $_ eq 'gaps'} @{$type});
1323 my $all_gaps_columns = $self->gap_char if (grep /all_gaps_columns/,@{$type});
1324 my %matchchars = ( 'match' => '\*',
1325 'weak' => '\.',
1326 'strong' => ':',
1327 'mismatch' => ' ',
1328 'gaps' => '',
1329 'all_gaps_columns' => ''
1331 # get the characters to delete against
1332 my $del_char;
1333 foreach my $type (@{$type}){
1334 $del_char.= $matchchars{$type};
1337 my $length = 0;
1338 my $match_line = $self->match_line;
1339 # do the matching to get the segments to remove
1340 if($del_char){
1341 while($match_line =~ m/[$del_char]/g ){
1342 my $start = pos($match_line)-1;
1343 $match_line=~/\G[$del_char]+/gc;
1344 my $end = pos($match_line)-1;
1346 #have to offset the start and end for subsequent removes
1347 $start-=$length;
1348 $end -=$length;
1349 $length += ($end-$start+1);
1350 push @remove, [$start,$end];
1354 # remove the segments
1355 $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1356 $aln = $aln->remove_gaps() if $gap;
1357 $aln = $aln->remove_gaps('', 1) if $all_gaps_columns;
1358 # fix for meta, sf, ann
1359 $aln;
1363 sub _remove_columns_by_num {
1364 my ($self,$positions) = @_;
1365 my $aln = $self->new;
1367 # sort the positions
1368 @$positions = sort { $a->[0] <=> $b->[0] } @$positions;
1370 my @remove;
1371 my $length = 0;
1372 foreach my $pos (@{$positions}) {
1373 my ($start, $end) = @{$pos};
1375 #have to offset the start and end for subsequent removes
1376 $start-=$length;
1377 $end -=$length;
1378 $length += ($end-$start+1);
1379 push @remove, [$start,$end];
1382 #remove the segments
1383 $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1384 # fix for meta, sf, ann
1385 $aln;
1389 =head1 Change sequences within the MSA
1391 These methods affect characters in all sequences without changing the
1392 alignment.
1394 =head2 splice_by_seq_pos
1396 Title : splice_by_seq_pos
1397 Usage : $status = splice_by_seq_pos(1);
1398 Function: splices all aligned sequences where the specified sequence
1399 has gaps.
1400 Example :
1401 Returns : 1 on success
1402 Args : position of sequence to splice by
1405 =cut
1407 sub splice_by_seq_pos{
1408 my ($self,$pos) = @_;
1410 my $guide = $self->get_seq_by_pos($pos);
1411 my $guide_seq = $guide->seq;
1413 $guide_seq =~ s/\./\-/g;
1415 my @gaps = ();
1416 $pos = -1;
1417 while(($pos = index($guide_seq, '-', $pos)) > -1 ){
1418 unshift @gaps, $pos;
1419 $pos++;
1422 foreach my $seq ($self->each_seq){
1423 my @bases = split '', $seq->seq;
1425 splice(@bases, $_, 1) foreach @gaps;
1426 $seq->seq(join('', @bases));
1432 =head2 map_chars
1434 Title : map_chars
1435 Usage : $ali->map_chars('\.','-')
1436 Function : Does a s/$arg1/$arg2/ on the sequences. Useful for gap
1437 characters
1439 Notice that the from (arg1) is interpretted as a regex,
1440 so be careful about quoting meta characters (eg
1441 $ali->map_chars('.','-') wont do what you want)
1442 Returns :
1443 Argument : 'from' rexexp
1444 'to' string
1446 =cut
1448 sub map_chars {
1449 my $self = shift;
1450 my $from = shift;
1451 my $to = shift;
1452 my ($seq,$temp);
1454 $self->throw("Need exactly two arguments")
1455 unless defined $from and defined $to;
1457 foreach $seq ( $self->each_seq() ) {
1458 $temp = $seq->seq();
1459 $temp =~ s/$from/$to/g;
1460 $seq->seq($temp);
1462 return 1;
1466 =head2 uppercase
1468 Title : uppercase()
1469 Usage : $ali->uppercase()
1470 Function : Sets all the sequences to uppercase
1471 Returns :
1472 Argument :
1474 =cut
1476 sub uppercase {
1477 my $self = shift;
1478 my $seq;
1479 my $temp;
1481 foreach $seq ( $self->each_seq() ) {
1482 $temp = $seq->seq();
1483 $temp =~ tr/[a-z]/[A-Z]/;
1485 $seq->seq($temp);
1487 return 1;
1490 =head2 cigar_line
1492 Title : cigar_line()
1493 Usage : %cigars = $align->cigar_line()
1494 Function : Generates a "cigar" (Compact Idiosyncratic Gapped Alignment
1495 Report) line for each sequence in the alignment. Examples are
1496 "1,60" or "5,10:12,58", where the numbers refer to conserved
1497 positions within the alignment. The keys of the hash are the
1498 NSEs (name/start/end) assigned to each sequence.
1499 Args : threshold (optional, defaults to 100)
1500 Returns : Hash of strings (cigar lines)
1502 =cut
1504 sub cigar_line {
1505 my $self = shift;
1506 my $thr=shift||100;
1507 my %cigars;
1509 my @consensus = split "",($self->consensus_string($thr));
1510 my $len = $self->length;
1511 my $gapchar = $self->gap_char;
1513 # create a precursor, something like (1,4,5,6,7,33,45),
1514 # where each number corresponds to a conserved position
1515 foreach my $seq ( $self->each_seq ) {
1516 my @seq = split "", uc ($seq->seq);
1517 my $pos = 1;
1518 for (my $x = 0 ; $x < $len ; $x++ ) {
1519 if ($seq[$x] eq $consensus[$x]) {
1520 push @{$cigars{$seq->get_nse}},$pos;
1521 $pos++;
1522 } elsif ($seq[$x] ne $gapchar) {
1523 $pos++;
1527 # duplicate numbers - (1,4,5,6,7,33,45) becomes (1,1,4,5,6,7,33,33,45,45)
1528 for my $name (keys %cigars) {
1529 splice @{$cigars{$name}}, 1, 0, ${$cigars{$name}}[0] if
1530 ( ${$cigars{$name}}[0] + 1 < ${$cigars{$name}}[1] );
1531 push @{$cigars{$name}}, ${$cigars{$name}}[$#{$cigars{$name}}] if
1532 ( ${$cigars{$name}}[($#{$cigars{$name}} - 1)] + 1 <
1533 ${$cigars{$name}}[$#{$cigars{$name}}] );
1534 for ( my $x = 1 ; $x < $#{$cigars{$name}} - 1 ; $x++) {
1535 if (${$cigars{$name}}[$x - 1] + 1 < ${$cigars{$name}}[$x] &&
1536 ${$cigars{$name}}[$x + 1] > ${$cigars{$name}}[$x] + 1) {
1537 splice @{$cigars{$name}}, $x, 0, ${$cigars{$name}}[$x];
1541 # collapse series - (1,1,4,5,6,7,33,33,45,45) becomes (1,1,4,7,33,33,45,45)
1542 for my $name (keys %cigars) {
1543 my @remove;
1544 for ( my $x = 0 ; $x < $#{$cigars{$name}} ; $x++) {
1545 if ( ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x - 1)] + 1 &&
1546 ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x + 1)] - 1 ) {
1547 unshift @remove,$x;
1550 for my $pos (@remove) {
1551 splice @{$cigars{$name}}, $pos, 1;
1554 # join and punctuate
1555 for my $name (keys %cigars) {
1556 my ($start,$end,$str) = "";
1557 while ( ($start,$end) = splice @{$cigars{$name}}, 0, 2 ) {
1558 $str .= ($start . "," . $end . ":");
1560 $str =~ s/:$//;
1561 $cigars{$name} = $str;
1563 %cigars;
1567 =head2 match_line
1569 Title : match_line()
1570 Usage : $line = $align->match_line()
1571 Function : Generates a match line - much like consensus string
1572 except that a line indicating the '*' for a match.
1573 Args : (optional) Match line characters ('*' by default)
1574 (optional) Strong match char (':' by default)
1575 (optional) Weak match char ('.' by default)
1576 Returns : String
1578 =cut
1580 sub match_line {
1581 my ($self,$matchlinechar, $strong, $weak) = @_;
1582 my %matchchars = ('match' => $matchlinechar || '*',
1583 'weak' => $weak || '.',
1584 'strong' => $strong || ':',
1585 'mismatch' => ' ',
1588 my @seqchars;
1589 my $alphabet;
1590 foreach my $seq ( $self->each_seq ) {
1591 push @seqchars, [ split(//, uc ($seq->seq)) ];
1592 $alphabet = $seq->alphabet unless defined $alphabet;
1594 my $refseq = shift @seqchars;
1595 # let's just march down the columns
1596 my $matchline;
1597 POS:
1598 foreach my $pos ( 0..$self->length ) {
1599 my $refchar = $refseq->[$pos];
1600 my $char = $matchchars{'mismatch'};
1601 unless( defined $refchar ) {
1602 last if $pos == $self->length; # short circuit on last residue
1603 # this in place to handle jason's soon-to-be-committed
1604 # intron mapping code
1605 goto bottom;
1607 my %col = ($refchar => 1);
1608 my $dash = ($refchar eq '-' || $refchar eq '.' || $refchar eq ' ');
1609 foreach my $seq ( @seqchars ) {
1610 next if $pos >= scalar @$seq;
1611 $dash = 1 if( $seq->[$pos] eq '-' || $seq->[$pos] eq '.' ||
1612 $seq->[$pos] eq ' ' );
1613 $col{$seq->[$pos]}++ if defined $seq->[$pos];
1615 my @colresidues = sort keys %col;
1617 # if all the values are the same
1618 if( $dash ) { $char = $matchchars{'mismatch'} }
1619 elsif( @colresidues == 1 ) { $char = $matchchars{'match'} }
1620 elsif( $alphabet eq 'protein' ) { # only try to do weak/strong
1621 # matches for protein seqs
1622 TYPE:
1623 foreach my $type ( qw(strong weak) ) {
1624 # iterate through categories
1625 my %groups;
1626 # iterate through each of the aa in the col
1627 # look to see which groups it is in
1628 foreach my $c ( @colresidues ) {
1629 foreach my $f ( grep { index($_,$c) >= 0 } @{$CONSERVATION_GROUPS{$type}} ) {
1630 push @{$groups{$f}},$c;
1633 GRP:
1634 foreach my $cols ( values %groups ) {
1635 @$cols = sort @$cols;
1636 # now we are just testing to see if two arrays
1637 # are identical w/o changing either one
1638 # have to be same len
1639 next if( scalar @$cols != scalar @colresidues );
1640 # walk down the length and check each slot
1641 for($_=0;$_ < (scalar @$cols);$_++ ) {
1642 next GRP if( $cols->[$_] ne $colresidues[$_] );
1644 $char = $matchchars{$type};
1645 last TYPE;
1649 bottom:
1650 $matchline .= $char;
1652 return $matchline;
1656 =head2 gap_line
1658 Title : gap_line()
1659 Usage : $line = $align->gap_line()
1660 Function : Generates a gap line - much like consensus string
1661 except that a line where '-' represents gap
1662 Args : (optional) gap line characters ('-' by default)
1663 Returns : string
1665 =cut
1667 sub gap_line {
1668 my ($self,$gapchar) = @_;
1669 $gapchar = $gapchar || $self->gap_char;
1670 my %gap_hsh; # column gaps vector
1671 foreach my $seq ( $self->each_seq ) {
1672 my $i = 0;
1673 map {$gap_hsh{$_->[0]} = undef} grep {$_->[1] eq $gapchar}
1674 map {[$i++, $_]} split(//, uc ($seq->seq));
1676 my $gap_line;
1677 foreach my $pos ( 0..$self->length-1 ) {
1678 $gap_line .= (exists $gap_hsh{$pos}) ? $gapchar:'.';
1680 return $gap_line;
1683 =head2 all_gap_line
1685 Title : all_gap_line()
1686 Usage : $line = $align->all_gap_line()
1687 Function : Generates a gap line - much like consensus string
1688 except that a line where '-' represents all-gap column
1689 Args : (optional) gap line characters ('-' by default)
1690 Returns : string
1692 =cut
1694 sub all_gap_line {
1695 my ($self,$gapchar) = @_;
1696 $gapchar = $gapchar || $self->gap_char;
1697 my %gap_hsh; # column gaps counter hash
1698 my @seqs = $self->each_seq;
1699 foreach my $seq ( @seqs ) {
1700 my $i = 0;
1701 map {$gap_hsh{$_->[0]}++} grep {$_->[1] eq $gapchar}
1702 map {[$i++, $_]} split(//, uc ($seq->seq));
1704 my $gap_line;
1705 foreach my $pos ( 0..$self->length-1 ) {
1706 if (exists $gap_hsh{$pos} && $gap_hsh{$pos} == scalar @seqs) {
1707 # gaps column
1708 $gap_line .= $gapchar;
1709 } else {
1710 $gap_line .= '.';
1713 return $gap_line;
1716 =head2 gap_col_matrix
1718 Title : gap_col_matrix()
1719 Usage : my $cols = $align->gap_col_matrix()
1720 Function : Generates an array of hashes where
1721 each entry in the array is a hash reference
1722 with keys of all the sequence names and
1723 and value of 1 or 0 if the sequence has a gap at that column
1724 Args : (optional) gap line characters ($aln->gap_char or '-' by default)
1726 =cut
1728 sub gap_col_matrix {
1729 my ($self,$gapchar) = @_;
1730 $gapchar = $gapchar || $self->gap_char;
1731 my %gap_hsh; # column gaps vector
1732 my @cols;
1733 foreach my $seq ( $self->each_seq ) {
1734 my $i = 0;
1735 my $str = $seq->seq;
1736 my $len = $seq->length;
1737 my $ch;
1738 my $id = $seq->display_id;
1739 while( $i < $len ) {
1740 $ch = substr($str, $i, 1);
1741 $cols[$i++]->{$id} = ($ch eq $gapchar);
1744 return \@cols;
1747 =head2 match
1749 Title : match()
1750 Usage : $ali->match()
1751 Function : Goes through all columns and changes residues that are
1752 identical to residue in first sequence to match '.'
1753 character. Sets match_char.
1755 USE WITH CARE: Most MSA formats do not support match
1756 characters in sequences, so this is mostly for output
1757 only. NEXUS format (Bio::AlignIO::nexus) can handle
1759 Returns : 1
1760 Argument : a match character, optional, defaults to '.'
1762 =cut
1764 sub match {
1765 my ($self, $match) = @_;
1767 $match ||= '.';
1768 my ($matching_char) = $match;
1769 $matching_char = "\\$match" if $match =~ /[\^.$|()\[\]]/ ; #';
1770 $self->map_chars($matching_char, '-');
1772 my @seqs = $self->each_seq();
1773 return 1 unless scalar @seqs > 1;
1775 my $refseq = shift @seqs ;
1776 my @refseq = split //, $refseq->seq;
1777 my $gapchar = $self->gap_char;
1779 foreach my $seq ( @seqs ) {
1780 my @varseq = split //, $seq->seq();
1781 for ( my $i=0; $i < scalar @varseq; $i++) {
1782 $varseq[$i] = $match if defined $refseq[$i] &&
1783 ( $refseq[$i] =~ /[A-Za-z\*]/ ||
1784 $refseq[$i] =~ /$gapchar/ )
1785 && $refseq[$i] eq $varseq[$i];
1787 $seq->seq(join '', @varseq);
1789 $self->match_char($match);
1790 return 1;
1794 =head2 unmatch
1796 Title : unmatch()
1797 Usage : $ali->unmatch()
1798 Function : Undoes the effect of method match. Unsets match_char.
1799 Returns : 1
1800 Argument : a match character, optional, defaults to '.'
1802 See L<match> and L<match_char>
1804 =cut
1806 sub unmatch {
1807 my ($self, $match) = @_;
1809 $match ||= '.';
1811 my @seqs = $self->each_seq();
1812 return 1 unless scalar @seqs > 1;
1814 my $refseq = shift @seqs ;
1815 my @refseq = split //, $refseq->seq;
1816 my $gapchar = $self->gap_char;
1817 foreach my $seq ( @seqs ) {
1818 my @varseq = split //, $seq->seq();
1819 for ( my $i=0; $i < scalar @varseq; $i++) {
1820 $varseq[$i] = $refseq[$i] if defined $refseq[$i] &&
1821 ( $refseq[$i] =~ /[A-Za-z\*]/ ||
1822 $refseq[$i] =~ /$gapchar/ ) &&
1823 $varseq[$i] eq $match;
1825 $seq->seq(join '', @varseq);
1827 $self->match_char('');
1828 return 1;
1831 =head1 MSA attributes
1833 Methods for setting and reading the MSA attributes.
1835 Note that the methods defining character semantics depend on the user
1836 to set them sensibly. They are needed only by certain input/output
1837 methods. Unset them by setting to an empty string ('').
1839 =head2 id
1841 Title : id
1842 Usage : $myalign->id("Ig")
1843 Function : Gets/sets the id field of the alignment
1844 Returns : An id string
1845 Argument : An id string (optional)
1847 =cut
1849 sub id {
1850 my ($self, $name) = @_;
1852 if (defined( $name )) {
1853 $self->{'_id'} = $name;
1856 return $self->{'_id'};
1859 =head2 accession
1861 Title : accession
1862 Usage : $myalign->accession("PF00244")
1863 Function : Gets/sets the accession field of the alignment
1864 Returns : An acc string
1865 Argument : An acc string (optional)
1867 =cut
1869 sub accession {
1870 my ($self, $acc) = @_;
1872 if (defined( $acc )) {
1873 $self->{'_accession'} = $acc;
1876 return $self->{'_accession'};
1879 =head2 description
1881 Title : description
1882 Usage : $myalign->description("14-3-3 proteins")
1883 Function : Gets/sets the description field of the alignment
1884 Returns : An description string
1885 Argument : An description string (optional)
1887 =cut
1889 sub description {
1890 my ($self, $name) = @_;
1892 if (defined( $name )) {
1893 $self->{'_description'} = $name;
1896 return $self->{'_description'};
1899 =head2 missing_char
1901 Title : missing_char
1902 Usage : $myalign->missing_char("?")
1903 Function : Gets/sets the missing_char attribute of the alignment
1904 It is generally recommended to set it to 'n' or 'N'
1905 for nucleotides and to 'X' for protein.
1906 Returns : An missing_char string,
1907 Argument : An missing_char string (optional)
1909 =cut
1911 sub missing_char {
1912 my ($self, $char) = @_;
1914 if (defined $char ) {
1915 $self->throw("Single missing character, not [$char]!") if CORE::length($char) > 1;
1916 $self->{'_missing_char'} = $char;
1919 return $self->{'_missing_char'};
1922 =head2 match_char
1924 Title : match_char
1925 Usage : $myalign->match_char('.')
1926 Function : Gets/sets the match_char attribute of the alignment
1927 Returns : An match_char string,
1928 Argument : An match_char string (optional)
1930 =cut
1932 sub match_char {
1933 my ($self, $char) = @_;
1935 if (defined $char ) {
1936 $self->throw("Single match character, not [$char]!") if CORE::length($char) > 1;
1937 $self->{'_match_char'} = $char;
1940 return $self->{'_match_char'};
1943 =head2 gap_char
1945 Title : gap_char
1946 Usage : $myalign->gap_char('-')
1947 Function : Gets/sets the gap_char attribute of the alignment
1948 Returns : An gap_char string, defaults to '-'
1949 Argument : An gap_char string (optional)
1951 =cut
1953 sub gap_char {
1954 my ($self, $char) = @_;
1956 if (defined $char || ! defined $self->{'_gap_char'} ) {
1957 $char= '-' unless defined $char;
1958 $self->throw("Single gap character, not [$char]!") if CORE::length($char) > 1;
1959 $self->{'_gap_char'} = $char;
1961 return $self->{'_gap_char'};
1964 =head2 symbol_chars
1966 Title : symbol_chars
1967 Usage : my @symbolchars = $aln->symbol_chars;
1968 Function: Returns all the seen symbols (other than gaps)
1969 Returns : array of characters that are the seen symbols
1970 Args : boolean to include the gap/missing/match characters
1972 =cut
1974 sub symbol_chars{
1975 my ($self,$includeextra) = @_;
1977 unless ($self->{'_symbols'}) {
1978 foreach my $seq ($self->each_seq) {
1979 map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq);
1982 my %copy = %{$self->{'_symbols'}};
1983 if( ! $includeextra ) {
1984 foreach my $char ( $self->gap_char, $self->match_char,
1985 $self->missing_char) {
1986 delete $copy{$char} if( defined $char );
1989 return keys %copy;
1992 =head1 Alignment descriptors
1994 These read only methods describe the MSA in various ways.
1997 =head2 score
1999 Title : score
2000 Usage : $str = $ali->score()
2001 Function : get/set a score of the alignment
2002 Returns : a score for the alignment
2003 Argument : an optional score to set
2005 =cut
2007 sub score {
2008 my $self = shift;
2009 $self->{score} = shift if @_;
2010 return $self->{score};
2013 =head2 consensus_string
2015 Title : consensus_string
2016 Usage : $str = $ali->consensus_string($threshold_percent)
2017 Function : Makes a strict consensus
2018 Returns : Consensus string
2019 Argument : Optional treshold ranging from 0 to 100.
2020 The consensus residue has to appear at least threshold %
2021 of the sequences at a given location, otherwise a '?'
2022 character will be placed at that location.
2023 (Default value = 0%)
2025 =cut
2027 sub consensus_string {
2028 my $self = shift;
2029 my $threshold = shift;
2031 my $out = "";
2032 my $len = $self->length - 1;
2034 foreach ( 0 .. $len ) {
2035 $out .= $self->_consensus_aa($_,$threshold);
2037 return $out;
2040 sub _consensus_aa {
2041 my $self = shift;
2042 my $point = shift;
2043 my $threshold_percent = shift || -1 ;
2044 my ($seq,%hash,$count,$letter,$key);
2045 my $gapchar = $self->gap_char;
2046 foreach $seq ( $self->each_seq() ) {
2047 $letter = substr($seq->seq,$point,1);
2048 $self->throw("--$point-----------") if $letter eq '';
2049 ($letter eq $gapchar || $letter =~ /\./) && next;
2050 # print "Looking at $letter\n";
2051 $hash{$letter}++;
2053 my $number_of_sequences = $self->num_sequences();
2054 my $threshold = $number_of_sequences * $threshold_percent / 100. ;
2055 $count = -1;
2056 $letter = '?';
2058 foreach $key ( sort keys %hash ) {
2059 # print "Now at $key $hash{$key}\n";
2060 if( $hash{$key} > $count && $hash{$key} >= $threshold) {
2061 $letter = $key;
2062 $count = $hash{$key};
2065 return $letter;
2069 =head2 consensus_iupac
2071 Title : consensus_iupac
2072 Usage : $str = $ali->consensus_iupac()
2073 Function : Makes a consensus using IUPAC ambiguity codes from DNA
2074 and RNA. The output is in upper case except when gaps in
2075 a column force output to be in lower case.
2077 Note that if your alignment sequences contain a lot of
2078 IUPAC ambiquity codes you often have to manually set
2079 alphabet. Bio::PrimarySeq::_guess_type thinks they
2080 indicate a protein sequence.
2081 Returns : consensus string
2082 Argument : none
2083 Throws : on protein sequences
2085 =cut
2087 sub consensus_iupac {
2088 my $self = shift;
2089 my $out = "";
2090 my $len = $self->length-1;
2092 # only DNA and RNA sequences are valid
2093 foreach my $seq ( $self->each_seq() ) {
2094 $self->throw("Seq [". $seq->get_nse. "] is a protein")
2095 if $seq->alphabet eq 'protein';
2097 # loop over the alignment columns
2098 foreach my $count ( 0 .. $len ) {
2099 $out .= $self->_consensus_iupac($count);
2101 return $out;
2104 sub _consensus_iupac {
2105 my ($self, $column) = @_;
2106 my ($string, $char, $rna);
2108 #determine all residues in a column
2109 foreach my $seq ( $self->each_seq() ) {
2110 $string .= substr($seq->seq, $column, 1);
2112 $string = uc $string;
2114 # quick exit if there's an N in the string
2115 if ($string =~ /N/) {
2116 $string =~ /\W/ ? return 'n' : return 'N';
2118 # ... or if there are only gap characters
2119 return '-' if $string =~ /^\W+$/;
2121 # treat RNA as DNA in regexps
2122 if ($string =~ /U/) {
2123 $string =~ s/U/T/;
2124 $rna = 1;
2127 # the following s///'s only need to be done to the _first_ ambiguity code
2128 # as we only need to see the _range_ of characters in $string
2130 if ($string =~ /[VDHB]/) {
2131 $string =~ s/V/AGC/;
2132 $string =~ s/D/AGT/;
2133 $string =~ s/H/ACT/;
2134 $string =~ s/B/CTG/;
2137 if ($string =~ /[SKYRWM]/) {
2138 $string =~ s/S/GC/;
2139 $string =~ s/K/GT/;
2140 $string =~ s/Y/CT/;
2141 $string =~ s/R/AG/;
2142 $string =~ s/W/AT/;
2143 $string =~ s/M/AC/;
2146 # and now the guts of the thing
2148 if ($string =~ /A/) {
2149 $char = 'A'; # A A
2150 if ($string =~ /G/) {
2151 $char = 'R'; # A and G (purines) R
2152 if ($string =~ /C/) {
2153 $char = 'V'; # A and G and C V
2154 if ($string =~ /T/) {
2155 $char = 'N'; # A and G and C and T N
2157 } elsif ($string =~ /T/) {
2158 $char = 'D'; # A and G and T D
2160 } elsif ($string =~ /C/) {
2161 $char = 'M'; # A and C M
2162 if ($string =~ /T/) {
2163 $char = 'H'; # A and C and T H
2165 } elsif ($string =~ /T/) {
2166 $char = 'W'; # A and T W
2168 } elsif ($string =~ /C/) {
2169 $char = 'C'; # C C
2170 if ($string =~ /T/) {
2171 $char = 'Y'; # C and T (pyrimidines) Y
2172 if ($string =~ /G/) {
2173 $char = 'B'; # C and T and G B
2175 } elsif ($string =~ /G/) {
2176 $char = 'S'; # C and G S
2178 } elsif ($string =~ /G/) {
2179 $char = 'G'; # G G
2180 if ($string =~ /C/) {
2181 $char = 'S'; # G and C S
2182 } elsif ($string =~ /T/) {
2183 $char = 'K'; # G and T K
2185 } elsif ($string =~ /T/) {
2186 $char = 'T'; # T T
2189 $char = 'U' if $rna and $char eq 'T';
2190 $char = lc $char if $string =~ /\W/;
2192 return $char;
2196 =head2 consensus_meta
2198 Title : consensus_meta
2199 Usage : $seqmeta = $ali->consensus_meta()
2200 Function : Returns a Bio::Seq::Meta object containing the consensus
2201 strings derived from meta data analysis.
2202 Returns : Bio::Seq::Meta
2203 Argument : Bio::Seq::Meta
2204 Throws : non-MetaI object
2206 =cut
2208 sub consensus_meta {
2209 my ($self, $meta) = @_;
2210 if ($meta && (!ref $meta || !$meta->isa('Bio::Seq::MetaI'))) {
2211 $self->throw('Not a Bio::Seq::MetaI object');
2213 return $self->{'_aln_meta'} = $meta if $meta;
2214 return $self->{'_aln_meta'}
2217 =head2 is_flush
2219 Title : is_flush
2220 Usage : if ( $ali->is_flush() )
2221 Function : Tells you whether the alignment
2222 : is flush, i.e. all of the same length
2223 Returns : 1 or 0
2224 Argument :
2226 =cut
2228 sub is_flush {
2229 my ($self,$report) = @_;
2230 my $seq;
2231 my $length = (-1);
2232 my $temp;
2234 foreach $seq ( $self->each_seq() ) {
2235 if( $length == (-1) ) {
2236 $length = CORE::length($seq->seq());
2237 next;
2240 $temp = CORE::length($seq->seq());
2241 if( $temp != $length ) {
2242 $self->warn("expecting $length not $temp from ".
2243 $seq->display_id) if( $report );
2244 $self->debug("expecting $length not $temp from ".
2245 $seq->display_id);
2246 $self->debug($seq->seq(). "\n");
2247 return 0;
2251 return 1;
2255 =head2 length
2257 Title : length()
2258 Usage : $len = $ali->length()
2259 Function : Returns the maximum length of the alignment.
2260 To be sure the alignment is a block, use is_flush
2261 Returns : Integer
2262 Argument :
2264 =cut
2266 sub length_aln {
2267 my $self = shift;
2268 $self->deprecated("length_aln - deprecated method. Use length() instead.");
2269 $self->length(@_);
2272 sub length {
2273 my $self = shift;
2274 my $seq;
2275 my $length = -1;
2276 my $temp;
2278 foreach $seq ( $self->each_seq() ) {
2279 $temp = $seq->length();
2280 if( $temp > $length ) {
2281 $length = $temp;
2285 return $length;
2289 =head2 maxdisplayname_length
2291 Title : maxdisplayname_length
2292 Usage : $ali->maxdisplayname_length()
2293 Function : Gets the maximum length of the displayname in the
2294 alignment. Used in writing out various MSA formats.
2295 Returns : integer
2296 Argument :
2298 =cut
2300 sub maxname_length {
2301 my $self = shift;
2302 $self->deprecated("maxname_length - deprecated method.".
2303 " Use maxdisplayname_length() instead.");
2304 $self->maxdisplayname_length();
2307 sub maxnse_length {
2308 my $self = shift;
2309 $self->deprecated("maxnse_length - deprecated method.".
2310 " Use maxnse_length() instead.");
2311 $self->maxdisplayname_length();
2314 sub maxdisplayname_length {
2315 my $self = shift;
2316 my $maxname = (-1);
2317 my ($seq,$len);
2319 foreach $seq ( $self->each_seq() ) {
2320 $len = CORE::length $self->displayname($seq->get_nse());
2322 if( $len > $maxname ) {
2323 $maxname = $len;
2327 return $maxname;
2330 =head2 max_metaname_length
2332 Title : max_metaname_length
2333 Usage : $ali->max_metaname_length()
2334 Function : Gets the maximum length of the meta name tags in the
2335 alignment for the sequences and for the alignment.
2336 Used in writing out various MSA formats.
2337 Returns : integer
2338 Argument : None
2340 =cut
2342 sub max_metaname_length {
2343 my $self = shift;
2344 my $maxname = (-1);
2345 my ($seq,$len);
2347 # check seq meta first
2348 for $seq ( $self->each_seq() ) {
2349 next if !$seq->isa('Bio::Seq::MetaI' || !$seq->meta_names);
2350 for my $mtag ($seq->meta_names) {
2351 $len = CORE::length $mtag;
2352 if( $len > $maxname ) {
2353 $maxname = $len;
2358 # alignment meta
2359 for my $meta ($self->consensus_meta) {
2360 next unless $meta;
2361 for my $name ($meta->meta_names) {
2362 $len = CORE::length $name;
2363 if( $len > $maxname ) {
2364 $maxname = $len;
2369 return $maxname;
2372 =head2 num_residues
2374 Title : num_residues
2375 Usage : $no = $ali->num_residues
2376 Function : number of residues in total in the alignment
2377 Returns : integer
2378 Argument :
2379 Note : replaces no_residues()
2381 =cut
2383 sub num_residues {
2384 my $self = shift;
2385 my $count = 0;
2387 foreach my $seq ($self->each_seq) {
2388 my $str = $seq->seq();
2390 $count += ($str =~ s/[A-Za-z]//g);
2393 return $count;
2396 =head2 num_sequences
2398 Title : num_sequences
2399 Usage : $depth = $ali->num_sequences
2400 Function : number of sequence in the sequence alignment
2401 Returns : integer
2402 Argument : none
2403 Note : replaces no_sequences()
2405 =cut
2407 sub num_sequences {
2408 my $self = shift;
2409 return scalar($self->each_seq);
2413 =head2 average_percentage_identity
2415 Title : average_percentage_identity
2416 Usage : $id = $align->average_percentage_identity
2417 Function: The function uses a fast method to calculate the average
2418 percentage identity of the alignment
2419 Returns : The average percentage identity of the alignment
2420 Args : None
2421 Notes : This method implemented by Kevin Howe calculates a figure that is
2422 designed to be similar to the average pairwise identity of the
2423 alignment (identical in the absence of gaps), without having to
2424 explicitly calculate pairwise identities proposed by Richard Durbin.
2425 Validated by Ewan Birney ad Alex Bateman.
2427 =cut
2429 sub average_percentage_identity{
2430 my ($self,@args) = @_;
2432 my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
2433 'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
2435 my ($len, $total, $subtotal, $divisor, $subdivisor, @seqs, @countHashes);
2437 if (! $self->is_flush()) {
2438 $self->throw("All sequences in the alignment must be the same length");
2441 @seqs = $self->each_seq();
2442 $len = $self->length();
2444 # load the each hash with correct keys for existence checks
2446 for( my $index=0; $index < $len; $index++) {
2447 foreach my $letter (@alphabet) {
2448 $countHashes[$index]->{$letter} = 0;
2451 foreach my $seq (@seqs) {
2452 my @seqChars = split //, $seq->seq();
2453 for( my $column=0; $column < @seqChars; $column++ ) {
2454 my $char = uc($seqChars[$column]);
2455 if (exists $countHashes[$column]->{$char}) {
2456 $countHashes[$column]->{$char}++;
2461 $total = 0;
2462 $divisor = 0;
2463 for(my $column =0; $column < $len; $column++) {
2464 my %hash = %{$countHashes[$column]};
2465 $subdivisor = 0;
2466 foreach my $res (keys %hash) {
2467 $total += $hash{$res}*($hash{$res} - 1);
2468 $subdivisor += $hash{$res};
2470 $divisor += $subdivisor * ($subdivisor - 1);
2472 return $divisor > 0 ? ($total / $divisor )*100.0 : 0;
2475 =head2 percentage_identity
2477 Title : percentage_identity
2478 Usage : $id = $align->percentage_identity
2479 Function: The function calculates the average percentage identity
2480 (aliased to average_percentage_identity)
2481 Returns : The average percentage identity
2482 Args : None
2484 =cut
2486 sub percentage_identity {
2487 my $self = shift;
2488 return $self->average_percentage_identity();
2491 =head2 overall_percentage_identity
2493 Title : overall_percentage_identity
2494 Usage : $id = $align->overall_percentage_identity
2495 $id = $align->overall_percentage_identity('short')
2496 Function: The function calculates the percentage identity of
2497 the conserved columns
2498 Returns : The percentage identity of the conserved columns
2499 Args : length value to use, optional defaults to alignment length
2500 possible values: 'align', 'short', 'long'
2502 The argument values 'short' and 'long' refer to shortest and longest
2503 sequence in the alignment. Method modification code by Hongyu Zhang.
2505 =cut
2507 sub overall_percentage_identity{
2508 my ($self, $length_measure) = @_;
2510 my %alphabet = map {$_ => undef} qw (A C G T U B D E F H I J K L M N O P Q R S V W X Y Z);
2512 my %enum = map {$_ => undef} qw (align short long);
2514 $self->throw("Unknown argument [$length_measure]")
2515 if $length_measure and not exists $enum{$length_measure};
2516 $length_measure ||= 'align';
2518 if (! $self->is_flush()) {
2519 $self->throw("All sequences in the alignment must be the same length");
2522 # Count the residues seen at each position
2523 my $len;
2524 my $total = 0; # number of positions with identical residues
2525 my @countHashes;
2526 my @seqs = $self->each_seq;
2527 my $nof_seqs = scalar @seqs;
2528 my $aln_len = $self->length();
2529 for my $seq (@seqs) {
2530 my $seqstr = $seq->seq;
2532 # Count residues for given sequence
2533 for my $column (0 .. $aln_len-1) {
2534 my $char = uc( substr($seqstr, $column, 1) );
2535 if ( exists $alphabet{$char} ) {
2537 # This is a valid char
2538 if ( defined $countHashes[$column]->{$char} ) {
2539 $countHashes[$column]->{$char}++;
2540 } else {
2541 $countHashes[$column]->{$char} = 1;
2544 if ( $countHashes[$column]->{$char} == $nof_seqs ) {
2545 # All sequences have this same residue
2546 $total++;
2552 # Sequence length
2553 if ($length_measure eq 'short' || $length_measure eq 'long') {
2554 my $seq_len = $seqstr =~ tr/[A-Za-z]//;
2555 if ($length_measure eq 'short') {
2556 if ( (not defined $len) || ($seq_len < $len) ) {
2557 $len = $seq_len;
2559 } elsif ($length_measure eq 'long') {
2560 if ( (not defined $len) || ($seq_len > $len) ) {
2561 $len = $seq_len;
2568 if ($length_measure eq 'align') {
2569 $len = $aln_len;
2572 return ($total / $len ) * 100.0;
2577 =head1 Alignment positions
2579 Methods to map a sequence position into an alignment column and back.
2580 column_from_residue_number() does the former. The latter is really a
2581 property of the sequence object and can done using
2582 L<Bio::LocatableSeq::location_from_column>:
2584 # select somehow a sequence from the alignment, e.g.
2585 my $seq = $aln->get_seq_by_pos(1);
2586 #$loc is undef or Bio::LocationI object
2587 my $loc = $seq->location_from_column(5);
2589 =head2 column_from_residue_number
2591 Title : column_from_residue_number
2592 Usage : $col = $ali->column_from_residue_number( $seqname, $resnumber)
2593 Function: This function gives the position in the alignment
2594 (i.e. column number) of the given residue number in the
2595 sequence with the given name. For example, for the
2596 alignment
2598 Seq1/91-97 AC..DEF.GH.
2599 Seq2/24-30 ACGG.RTY...
2600 Seq3/43-51 AC.DDEF.GHI
2602 column_from_residue_number( "Seq1", 94 ) returns 6.
2603 column_from_residue_number( "Seq2", 25 ) returns 2.
2604 column_from_residue_number( "Seq3", 50 ) returns 10.
2606 An exception is thrown if the residue number would lie
2607 outside the length of the aligment
2608 (e.g. column_from_residue_number( "Seq2", 22 )
2610 Note: If the the parent sequence is represented by more than
2611 one alignment sequence and the residue number is present in
2612 them, this method finds only the first one.
2614 Returns : A column number for the position in the alignment of the
2615 given residue in the given sequence (1 = first column)
2616 Args : A sequence id/name (not a name/start-end)
2617 A residue number in the whole sequence (not just that
2618 segment of it in the alignment)
2620 =cut
2622 sub column_from_residue_number {
2623 my ($self, $name, $resnumber) = @_;
2625 $self->throw("No sequence with name [$name]") unless $self->{'_start_end_lists'}->{$name};
2626 $self->throw("Second argument residue number missing") unless $resnumber;
2628 foreach my $seq ($self->each_seq_with_id($name)) {
2629 my $col;
2630 eval {
2631 $col = $seq->column_from_residue_number($resnumber);
2633 next if $@;
2634 return $col;
2637 $self->throw("Could not find a sequence segment in $name ".
2638 "containing residue number $resnumber");
2642 =head1 Sequence names
2644 Methods to manipulate the display name. The default name based on the
2645 sequence id and subsequence positions can be overridden in various
2646 ways.
2648 =head2 displayname
2650 Title : displayname
2651 Usage : $myalign->displayname("Ig", "IgA")
2652 Function : Gets/sets the display name of a sequence in the alignment
2653 Returns : A display name string
2654 Argument : name of the sequence
2655 displayname of the sequence (optional)
2657 =cut
2659 sub get_displayname {
2660 my $self = shift;
2661 $self->deprecated("get_displayname - deprecated method. Use displayname() instead.");
2662 $self->displayname(@_);
2665 sub set_displayname {
2666 my $self = shift;
2667 $self->deprecated("set_displayname - deprecated method. Use displayname() instead.");
2668 $self->displayname(@_);
2671 sub displayname {
2672 my ($self, $name, $disname) = @_;
2674 $self->throw("No sequence with name [$name]")
2675 unless defined $self->{'_seq'}->{$name};
2677 if( $disname and $name) {
2678 $self->{'_dis_name'}->{$name} = $disname;
2679 return $disname;
2681 elsif( defined $self->{'_dis_name'}->{$name} ) {
2682 return $self->{'_dis_name'}->{$name};
2683 } else {
2684 return $name;
2688 =head2 set_displayname_count
2690 Title : set_displayname_count
2691 Usage : $ali->set_displayname_count
2692 Function : Sets the names to be name_# where # is the number of
2693 times this name has been used.
2694 Returns : 1, on success
2695 Argument :
2697 =cut
2699 sub set_displayname_count {
2700 my $self= shift;
2701 my (@arr,$name,$seq,$count,$temp,$nse);
2703 foreach $seq ( $self->each_alphabetically() ) {
2704 $nse = $seq->get_nse();
2706 #name will be set when this is the second
2707 #time (or greater) is has been seen
2709 if( defined $name and $name eq ($seq->id()) ) {
2710 $temp = sprintf("%s_%s",$name,$count);
2711 $self->displayname($nse,$temp);
2712 $count++;
2713 } else {
2714 $count = 1;
2715 $name = $seq->id();
2716 $temp = sprintf("%s_%s",$name,$count);
2717 $self->displayname($nse,$temp);
2718 $count++;
2721 return 1;
2724 =head2 set_displayname_flat
2726 Title : set_displayname_flat
2727 Usage : $ali->set_displayname_flat()
2728 Function : Makes all the sequences be displayed as just their name,
2729 not name/start-end
2730 Returns : 1
2731 Argument :
2733 =cut
2735 sub set_displayname_flat {
2736 my $self = shift;
2737 my ($nse,$seq);
2739 foreach $seq ( $self->each_seq() ) {
2740 $nse = $seq->get_nse();
2741 $self->displayname($nse,$seq->id());
2743 return 1;
2746 =head2 set_displayname_normal
2748 Title : set_displayname_normal
2749 Usage : $ali->set_displayname_normal()
2750 Function : Makes all the sequences be displayed as name/start-end
2751 Returns : 1, on success
2752 Argument :
2754 =cut
2756 sub set_displayname_normal {
2757 my $self = shift;
2758 my ($nse,$seq);
2760 foreach $seq ( $self->each_seq() ) {
2761 $nse = $seq->get_nse();
2762 $self->displayname($nse,$nse);
2764 return 1;
2767 =head2 source
2769 Title : source
2770 Usage : $obj->source($newval)
2771 Function: sets the Alignment source program
2772 Example :
2773 Returns : value of source
2774 Args : newvalue (optional)
2777 =cut
2779 sub source{
2780 my ($self,$value) = @_;
2781 if( defined $value) {
2782 $self->{'_source'} = $value;
2784 return $self->{'_source'};
2787 =head2 set_displayname_safe
2789 Title : set_displayname_safe
2790 Usage : ($new_aln, $ref_name)=$ali->set_displayname_safe(4)
2791 Function : Assign machine-generated serial names to sequences in input order.
2792 Designed to protect names during PHYLIP runs. Assign 10-char string
2793 in the form of "S000000001" to "S999999999". Restore the original
2794 names using "restore_displayname".
2795 Returns : 1. a new $aln with system names;
2796 2. a hash ref for restoring names
2797 Argument : Number for id length (default 10)
2799 =cut
2801 sub set_displayname_safe {
2802 my $self = shift;
2803 my $idlength = shift || 10;
2804 my ($seq, %phylip_name);
2805 my $ct=0;
2806 my $new=Bio::SimpleAlign->new();
2807 foreach $seq ( $self->each_seq() ) {
2808 $ct++;
2809 my $pname="S". sprintf "%0" . ($idlength-1) . "s", $ct;
2810 $phylip_name{$pname}=$seq->id();
2811 my $new_seq= Bio::LocatableSeq->new(-id => $pname,
2812 -seq => $seq->seq(),
2813 -alphabet => $seq->alphabet,
2814 -start => $seq->{_start},
2815 -end => $seq->{_end}
2817 $new->add_seq($new_seq);
2820 $self->debug("$ct seq names changed. Restore names by using restore_displayname.");
2821 return ($new, \%phylip_name);
2824 =head2 restore_displayname
2826 Title : restore_displayname
2827 Usage : $aln_name_restored=$ali->restore_displayname($hash_ref)
2828 Function : Restore original sequence names (after running
2829 $ali->set_displayname_safe)
2830 Returns : a new $aln with names restored.
2831 Argument : a hash reference of names from "set_displayname_safe".
2833 =cut
2835 sub restore_displayname {
2836 my $self = shift;
2837 my $ref=shift;
2838 my %name=%$ref;
2839 my $new=Bio::SimpleAlign->new();
2840 foreach my $seq ( $self->each_seq() ) {
2841 $self->throw("No sequence with name") unless defined $name{$seq->id()};
2842 my $new_seq= Bio::LocatableSeq->new(-id => $name{$seq->id()},
2843 -seq => $seq->seq(),
2844 -alphabet => $seq->alphabet,
2845 -start => $seq->{_start},
2846 -end => $seq->{_end}
2848 $new->add_seq($new_seq);
2850 return $new;
2853 =head2 sort_by_start
2855 Title : sort_by_start
2856 Usage : $ali->sort_by_start
2857 Function : Changes the order of the alignment to the start position of each
2858 subalignment
2859 Returns :
2860 Argument :
2862 =cut
2864 sub sort_by_start {
2865 my $self = shift;
2866 my ($seq,$nse,@arr,%hash,$count);
2867 foreach $seq ( $self->each_seq() ) {
2868 $nse = $seq->get_nse;
2869 $hash{$nse} = $seq;
2871 $count = 0;
2872 %{$self->{'_order'}} = (); # reset the hash;
2873 foreach $nse ( sort _startend keys %hash) {
2874 $self->{'_order'}->{$count} = $nse;
2875 $count++;
2880 sub _startend
2882 my ($aname,$arange) = split (/[\/]/,$a);
2883 my ($bname,$brange) = split (/[\/]/,$b);
2884 my ($astart,$aend) = split(/\-/,$arange);
2885 my ($bstart,$bend) = split(/\-/,$brange);
2886 return $astart <=> $bstart;
2889 =head2 bracket_string
2891 Title : bracket_string
2892 Usage : my @params = (-refseq => 'testseq',
2893 -allele1 => 'allele1',
2894 -allele2 => 'allele2',
2895 -delimiters => '{}',
2896 -separator => '/');
2897 $str = $aln->bracket_string(@params)
2899 Function : When supplied with a list of parameters (see below), returns a
2900 string in BIC format. This is used for allelic comparisons.
2901 Briefly, if either allele contains a base change when compared to
2902 the refseq, the base or gap for each allele is represented in
2903 brackets in the order present in the 'alleles' parameter.
2905 For the following data:
2907 >testseq
2908 GGATCCATTGCTACT
2909 >allele1
2910 GGATCCATTCCTACT
2911 >allele2
2912 GGAT--ATTCCTCCT
2914 the returned string with parameters 'refseq => testseq' and
2915 'alleles => [qw(allele1 allele2)]' would be:
2917 GGAT[C/-][C/-]ATT[C/C]CT[A/C]CT
2918 Returns : BIC-formatted string
2919 Argument : Required args
2920 refseq : string (ID) of the reference sequence used
2921 as basis for comparison
2922 allele1 : string (ID) of the first allele
2923 allele2 : string (ID) of the second allele
2924 Optional args
2925 delimiters: two symbol string of left and right delimiters.
2926 Only the first two symbols are used
2927 default = '[]'
2928 separator : string used as a separator. Only the first
2929 symbol is used
2930 default = '/'
2931 Throws : On no refseq/alleles, or invalid refseq/alleles.
2933 =cut
2935 sub bracket_string {
2936 my ($self, @args) = @_;
2937 my ($ref, $a1, $a2, $delim, $sep) =
2938 $self->_rearrange([qw(refseq allele1 allele2 delimiters separator)], @args);
2939 $self->throw('Missing refseq/allele1/allele2') if (!$a1 || !$a2 || !$ref);
2940 my ($ld, $rd);
2941 ($ld, $rd) = split('', $delim, 2) if $delim;
2942 $ld ||= '[';
2943 $rd ||= ']';
2944 $sep ||= '/';
2945 my ($refseq, $allele1, $allele2) =
2946 map {( $self->each_seq_with_id($_) )} ($ref, $a1, $a2);
2947 if (!$refseq || !$allele1 || !$allele2) {
2948 $self->throw("One of your refseq/allele IDs is invalid!");
2950 my $len = $self->length-1;
2951 my $bic = '';
2952 # loop over the alignment columns
2953 for my $column ( 0 .. $len ) {
2954 my $string;
2955 my ($compres, $res1, $res2) =
2956 map{substr($_->seq, $column, 1)} ($refseq, $allele1, $allele2);
2957 # are any of the allele symbols different from the refseq?
2958 $string = ($compres eq $res1 && $compres eq $res2) ? $compres :
2959 $ld.$res1.$sep.$res2.$rd;
2960 $bic .= $string;
2962 return $bic;
2966 =head2 methods implementing Bio::FeatureHolderI
2968 FeatureHolderI implementation to support labeled character sets like one
2969 would get from NEXUS represented data.
2971 =head2 get_SeqFeatures
2973 Usage : @features = $aln->get_SeqFeatures
2974 Function: Get the feature objects held by this feature holder.
2975 Example :
2976 Returns : an array of Bio::SeqFeatureI implementing objects
2977 Args : optional filter coderef, taking a Bio::SeqFeatureI
2978 : as argument, returning TRUE if wanted, FALSE if
2979 : unwanted
2981 =cut
2983 sub get_SeqFeatures {
2984 my $self = shift;
2985 my $filter_cb = shift;
2986 $self->throw("Arg (filter callback) must be a coderef") unless
2987 !defined($filter_cb) or ref($filter_cb) eq 'CODE';
2988 if( !defined $self->{'_as_feat'} ) {
2989 $self->{'_as_feat'} = [];
2991 if ($filter_cb) {
2992 return grep { $filter_cb->($_) } @{$self->{'_as_feat'}};
2994 return @{$self->{'_as_feat'}};
2997 =head2 add_SeqFeature
2999 Usage : $aln->add_SeqFeature($subfeat);
3000 Function: adds a SeqFeature into the SeqFeature array.
3001 Example :
3002 Returns : true on success
3003 Args : a Bio::SeqFeatureI object
3004 Note : This implementation is not compliant
3005 with Bio::FeatureHolderI
3007 =cut
3009 sub add_SeqFeature {
3010 my ($self,@feat) = @_;
3012 $self->{'_as_feat'} = [] unless $self->{'_as_feat'};
3014 foreach my $feat ( @feat ) {
3015 if( !$feat->isa("Bio::SeqFeatureI") ) {
3016 $self->throw("$feat is not a SeqFeatureI and that's what we expect...");
3019 push(@{$self->{'_as_feat'}},$feat);
3021 return 1;
3025 =head2 remove_SeqFeatures
3027 Usage : $obj->remove_SeqFeatures
3028 Function: Removes all SeqFeatures. If you want to remove only a subset,
3029 remove that subset from the returned array, and add back the rest.
3030 Returns : The array of Bio::SeqFeatureI features that was
3031 deleted from this alignment.
3032 Args : none
3034 =cut
3036 sub remove_SeqFeatures {
3037 my $self = shift;
3039 return () unless $self->{'_as_feat'};
3040 my @feats = @{$self->{'_as_feat'}};
3041 $self->{'_as_feat'} = [];
3042 return @feats;
3045 =head2 feature_count
3047 Title : feature_count
3048 Usage : $obj->feature_count()
3049 Function: Return the number of SeqFeatures attached to the alignment
3050 Returns : integer representing the number of SeqFeatures
3051 Args : None
3053 =cut
3055 sub feature_count {
3056 my ($self) = @_;
3058 if (defined($self->{'_as_feat'})) {
3059 return ($#{$self->{'_as_feat'}} + 1);
3060 } else {
3061 return 0;
3065 =head2 get_all_SeqFeatures
3067 Title : get_all_SeqFeatures
3068 Usage :
3069 Function: Get all SeqFeatures.
3070 Example :
3071 Returns : an array of Bio::SeqFeatureI implementing objects
3072 Args : none
3073 Note : Falls through to Bio::FeatureHolderI implementation.
3075 =cut
3077 =head2 methods for Bio::AnnotatableI
3079 AnnotatableI implementation to support sequence alignments which
3080 contain annotation (NEXUS, Stockholm).
3082 =head2 annotation
3084 Title : annotation
3085 Usage : $ann = $aln->annotation or
3086 $aln->annotation($ann)
3087 Function: Gets or sets the annotation
3088 Returns : Bio::AnnotationCollectionI object
3089 Args : None or Bio::AnnotationCollectionI object
3091 See L<Bio::AnnotationCollectionI> and L<Bio::Annotation::Collection>
3092 for more information
3094 =cut
3096 sub annotation {
3097 my ($obj,$value) = @_;
3098 if( defined $value ) {
3099 $obj->throw("object of class ".ref($value)." does not implement ".
3100 "Bio::AnnotationCollectionI. Too bad.")
3101 unless $value->isa("Bio::AnnotationCollectionI");
3102 $obj->{'_annotation'} = $value;
3103 } elsif( ! defined $obj->{'_annotation'}) {
3104 $obj->{'_annotation'} = Bio::Annotation::Collection->new();
3106 return $obj->{'_annotation'};
3109 =head1 Deprecated methods
3111 =cut
3113 =head2 no_residues
3115 Title : no_residues
3116 Usage : $no = $ali->no_residues
3117 Function : number of residues in total in the alignment
3118 Returns : integer
3119 Argument :
3120 Note : deprecated in favor of num_residues()
3122 =cut
3124 sub no_residues {
3125 my $self = shift;
3126 $self->deprecated(-warn_version => 1.0069,
3127 -throw_version => 1.0075,
3128 -message => 'Use of method no_residues() is deprecated, use num_residues() instead');
3129 $self->num_residues(@_);
3132 =head2 no_sequences
3134 Title : no_sequences
3135 Usage : $depth = $ali->no_sequences
3136 Function : number of sequence in the sequence alignment
3137 Returns : integer
3138 Argument :
3139 Note : deprecated in favor of num_sequences()
3141 =cut
3143 sub no_sequences {
3144 my $self = shift;
3145 $self->deprecated(-warn_version => 1.0069,
3146 -throw_version => 1.0075,
3147 -message => 'Use of method no_sequences() is deprecated, use num_sequences() instead');
3148 $self->num_sequences(@_);
3151 =head2 mask_columns
3153 Title : mask_columns
3154 Usage : $aln2 = $aln->mask_columns(20,30)
3155 Function : Masks a slice of the alignment inclusive of start and
3156 end columns, and the first column in the alignment is denoted 1.
3157 Mask beyond the length of the sequence does not do padding.
3158 Returns : A Bio::SimpleAlign object
3159 Args : Positive integer for start column, positive integer for end column,
3160 optional string value use for the mask. Example:
3162 $aln2 = $aln->mask_columns(20,30,'?')
3163 Note : Masking must use a character that is not used for gaps or
3164 frameshifts. These can be adjusted using the relevant global
3165 variables, but be aware these may be (uncontrollably) modified
3166 elsewhere within BioPerl (see bug 2715)
3168 =cut
3170 sub mask_columns {
3171 #based on slice(), but did not include the Bio::Seq::Meta sections as I was not sure what it is doing
3172 my $self = shift;
3174 my $nonres = $Bio::LocatableSeq::GAP_SYMBOLS.
3175 $Bio::LocatableSeq::FRAMESHIFT_SYMBOLS;
3177 # coordinates are alignment-based, not sequence-based
3178 my ($start, $end, $mask_char) = @_;
3179 unless (defined $mask_char) { $mask_char = 'N' }
3181 $self->throw("Mask start has to be a positive integer and less than ".
3182 "alignment length, not [$start]")
3183 unless $start =~ /^\d+$/ && $start > 0 && $start <= $self->length;
3184 $self->throw("Mask end has to be a positive integer and less than ".
3185 "alignment length, not [$end]")
3186 unless $end =~ /^\d+$/ && $end > 0 && $end <= $self->length;
3187 $self->throw("Mask start [$start] has to be smaller than or equal to ".
3188 "end [$end]") unless $start <= $end;
3189 $self->throw("Mask character $mask_char has to be a single character ".
3190 "and not a gap or frameshift symbol")
3191 unless CORE::length($mask_char) == 1 && $mask_char !~ m{$nonres};
3193 my $aln = $self->new;
3194 $aln->id($self->id);
3195 foreach my $seq ( $self->each_seq() ) {
3196 my $new_seq = Bio::LocatableSeq->new(-id => $seq->id,
3197 -alphabet => $seq->alphabet,
3198 -strand => $seq->strand,
3199 -verbose => $self->verbose);
3201 # convert from 1-based alignment coords!
3202 my $masked_string = substr($seq->seq, $start - 1, $end - $start + 1);
3203 $masked_string =~ s{[^$nonres]}{$mask_char}g;
3204 my $new_dna_string = substr($seq->seq,0,$start-1) . $masked_string . substr($seq->seq,$end);
3205 $new_seq->seq($new_dna_string);
3206 $aln->add_seq($new_seq);
3208 return $aln;