tag fourth (and hopefully last) alpha
[bioperl-live.git] / branch-1-6 / Bio / SimpleAlign.pm
blob14b5083382f9309019e2bf5d12b8ad9bebed2124
1 # $Id$
2 # BioPerl module for SimpleAlign
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Heikki Lehvaslaiho <heikki-at-bioperl-dot-org>
8 # Copyright Ewan Birney
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 # History:
15 # 11/3/00 Added threshold feature to consensus and consensus_aa - PS
16 # May 2001 major rewrite - Heikki Lehvaslaiho
18 =head1 NAME
20 Bio::SimpleAlign - Multiple alignments held as a set of sequences
22 =head1 SYNOPSIS
24 # Use Bio::AlignIO to read in the alignment
25 $str = Bio::AlignIO->new(-file => 't/data/testaln.pfam');
26 $aln = $str->next_aln();
28 # Describe
29 print $aln->length;
30 print $aln->num_residues;
31 print $aln->is_flush;
32 print $aln->num_sequences;
33 print $aln->score;
34 print $aln->percentage_identity;
35 print $aln->consensus_string(50);
37 # Find the position in the alignment for a sequence location
38 $pos = $aln->column_from_residue_number('1433_LYCES', 14); # = 6;
40 # Extract sequences and check values for the alignment column $pos
41 foreach $seq ($aln->each_seq) {
42 $res = $seq->subseq($pos, $pos);
43 $count{$res}++;
45 foreach $res (keys %count) {
46 printf "Res: %s Count: %2d\n", $res, $count{$res};
49 # Manipulate
50 $aln->remove_seq($seq);
51 $mini_aln = $aln->slice(20,30); # get a block of columns
52 $mini_aln = $aln->select_noncont(1,3,5,7,11); # select certain sequences
53 $new_aln = $aln->remove_columns([20,30]); # remove by position
54 $new_aln = $aln->remove_columns(['mismatch']); # remove by property
56 # Analyze
57 $str = $aln->consensus_string($threshold_percent);
58 $str = $aln->match_line();
59 $str = $aln->cigar_line();
60 $id = $aln->percentage_identity;
62 # See the module documentation for details and more methods.
64 =head1 DESCRIPTION
66 SimpleAlign is an object that handles a multiple sequence alignment
67 (MSA). It is very permissive of types (it does not insist on sequences
68 being all same length, for example). Think of it as a set of sequences
69 with a whole series of built-in manipulations and methods for reading and
70 writing alignments.
72 SimpleAlign uses L<Bio::LocatableSeq>, a subclass of L<Bio::PrimarySeq>,
73 to store its sequences. These are subsequences with a start and end
74 positions in the parent reference sequence. Each sequence in the
75 SimpleAlign object is a Bio::LocatableSeq.
77 SimpleAlign expects the combination of name, start, and end for a
78 given sequence to be unique in the alignment, and this is the key for the
79 internal hashes (name, start, end are abbreviated C<nse> in the code).
80 However, in some cases people do not want the name/start-end to be displayed:
81 either multiple names in an alignment or names specific to the alignment
82 (ROA1_HUMAN_1, ROA1_HUMAN_2 etc). These names are called
83 C<displayname>, and generally is what is used to print out the
84 alignment. They default to name/start-end.
86 The SimpleAlign Module is derived from the Align module by Ewan Birney.
88 =head1 FEEDBACK
90 =head2 Mailing Lists
92 User feedback is an integral part of the evolution of this and other
93 Bioperl modules. Send your comments and suggestions preferably to one
94 of the Bioperl mailing lists. Your participation is much appreciated.
96 bioperl-l@bioperl.org - General discussion
97 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
99 =head2 Support
101 Please direct usage questions or support issues to the mailing list:
103 I<bioperl-l@bioperl.org>
105 rather than to the module maintainer directly. Many experienced and
106 reponsive experts will be able look at the problem and quickly
107 address it. Please include a thorough description of the problem
108 with code and data examples if at all possible.
110 =head2 Reporting Bugs
112 Report bugs to the Bioperl bug tracking system to help us keep track
113 the bugs and their resolution. Bug reports can be submitted via the
114 web:
116 http://bugzilla.open-bio.org/
118 =head1 AUTHOR
120 Ewan Birney, birney@ebi.ac.uk
122 =head1 CONTRIBUTORS
124 Allen Day, allenday-at-ucla.edu,
125 Richard Adams, Richard.Adams-at-ed.ac.uk,
126 David J. Evans, David.Evans-at-vir.gla.ac.uk,
127 Heikki Lehvaslaiho, heikki-at-bioperl-dot-org,
128 Allen Smith, allens-at-cpan.org,
129 Jason Stajich, jason-at-bioperl.org,
130 Anthony Underwood, aunderwood-at-phls.org.uk,
131 Xintao Wei & Giri Narasimhan, giri-at-cs.fiu.edu
132 Brian Osborne, bosborne at alum.mit.edu
133 Weigang Qiu, Weigang at GENECTR-HUNTER-CUNY-EDU
134 Hongyu Zhang, forward at hongyu.org
135 Jay Hannah, jay at jays.net
136 Alexandr Bezginov, albezg at gmail.com
138 =head1 SEE ALSO
140 L<Bio::LocatableSeq>
142 =head1 APPENDIX
144 The rest of the documentation details each of the object
145 methods. Internal methods are usually preceded with a _
147 =cut
149 # 'Let the code begin...
151 package Bio::SimpleAlign;
152 use vars qw(%CONSERVATION_GROUPS);
153 use strict;
155 use Bio::LocatableSeq; # uses Seq's as list
157 use Bio::Seq;
158 use Bio::SeqFeature::Generic;
160 BEGIN {
161 # This data should probably be in a more centralized module...
162 # it is taken from Clustalw documentation.
163 # These are all the positively scoring groups that occur in the
164 # Gonnet Pam250 matrix. The strong and weak groups are
165 # defined as strong score >0.5 and weak score =<0.5 respectively.
167 %CONSERVATION_GROUPS = (
168 'strong' => [ qw(
170 NEQK
171 NHQK
172 NDEQ
173 QHRK
174 MILV
175 MILF
177 FYW )],
178 'weak' => [ qw(
182 STNK
183 STPA
184 SGND
185 SNDEQK
186 NDEQHK
187 NEQHRK
188 FVLIM
189 HFY )],);
192 use base qw(Bio::Root::Root Bio::Align::AlignI Bio::AnnotatableI
193 Bio::FeatureHolderI);
195 =head2 new
197 Title : new
198 Usage : my $aln = Bio::SimpleAlign->new();
199 Function : Creates a new simple align object
200 Returns : Bio::SimpleAlign
201 Args : -source => string representing the source program
202 where this alignment came from
203 -annotation => Bio::AnnotationCollectionI
204 -seq_annotation => Bio::AnnotationCollectionI for sequences (requires -annotation also be set)
205 -seqs => array ref containing Bio::LocatableSeq or Bio::Seq::Meta
206 -consensus => consensus string
207 -consensus_meta => Bio::Seq::Meta object containing consensus met information (kludge)
209 =cut
212 sub new {
213 my($class,@args) = @_;
215 my $self = $class->SUPER::new(@args);
217 my ($src, $score, $id, $acc, $desc, $seqs, $feats, $coll, $sa, $con, $cmeta) = $self->_rearrange([qw(
218 SOURCE
219 SCORE
221 ACCESSION
222 DESCRIPTION
223 SEQS
224 FEATURES
225 ANNOTATION
226 SEQ_ANNOTATION
227 CONSENSUS
228 CONSENSUS_META
229 )], @args);
230 $src && $self->source($src);
231 defined $score && $self->score($score);
232 # we need to set up internal hashs first!
234 $self->{'_seq'} = {};
235 $self->{'_order'} = {};
236 $self->{'_start_end_lists'} = {};
237 $self->{'_dis_name'} = {};
238 $self->{'_id'} = 'NoName';
239 # maybe we should automatically read in from args. Hmmm...
240 $id && $self->id($id);
241 $acc && $self->accession($acc);
242 $desc && $self->description($desc);
243 $coll && $self->annotation($coll);
244 # sequence annotation is layered into a provided annotation collection (or dies)
245 if ($sa) {
246 $self->throw("Must supply an alignment-based annotation collection (-annotation) ".
247 "with a sequence annotation collection")
248 if !$coll;
249 $coll->add_Annotation('seq_annotation', $sa);
251 if ($feats && ref $feats eq 'ARRAY') {
252 for my $feat (@$feats) {
253 $self->add_SeqFeature($feat);
256 $con && $self->consensus($con);
257 $cmeta && $self->consensus_meta($cmeta);
258 # assumes these are in correct alignment order
259 if ($seqs && ref($seqs) eq 'ARRAY') {
260 for my $seq (@$seqs) {
261 $self->add_seq($seq);
265 return $self; # success - we hope!
268 =head1 Modifier methods
270 These methods modify the MSA by adding, removing or shuffling complete
271 sequences.
273 =head2 add_seq
275 Title : add_seq
276 Usage : $myalign->add_seq($newseq);
277 $myalign->add_seq(-SEQ=>$newseq, -ORDER=>5);
278 Function : Adds another sequence to the alignment. *Does not* align
279 it - just adds it to the hashes.
280 If -ORDER is specified, the sequence is inserted at the
281 the position spec'd by -ORDER, and existing sequences
282 are pushed down the storage array.
283 Returns : nothing
284 Args : A Bio::LocatableSeq object
285 Positive integer for the sequence position (optional)
287 See L<Bio::LocatableSeq> for more information
289 =cut
291 sub addSeq {
292 my $self = shift;
293 $self->deprecated("addSeq - deprecated method. Use add_seq() instead.");
294 $self->add_seq(@_);
297 sub add_seq {
298 my $self = shift;
299 my @args = @_;
300 my ($seq, $order) = $self->_rearrange([qw(SEQ ORDER)], @args);
301 my ($name,$id,$start,$end);
303 unless ($seq) {
304 $self->throw("LocatableSeq argument required");
306 if( ! ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
307 $self->throw("Unable to process non locatable sequences [". ref($seq). "]");
309 !defined($order) and $order = 1 + keys %{$self->{'_seq'}}; # default
310 $order--; # jay's patch (user-specified order is 1-origin)
312 if ($order < 0) {
313 $self->throw("User-specified value for ORDER must be >= 1");
316 $id = $seq->id() ||$seq->display_id || $seq->primary_id;
318 # build the symbol list for this sequence,
319 # will prune out the gap and missing/match chars
320 # when actually asked for the symbol list in the
321 # symbol_chars
322 # map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq) if $seq->seq;
324 $name = $seq->get_nse;
326 if( $self->{'_seq'}->{$name} ) {
327 $self->warn("Replacing one sequence [$name]\n") unless $self->verbose < 0;
329 else {
330 $self->debug( "Assigning $name to $order\n");
332 my $ordh = $self->{'_order'};
333 if ($ordh->{$order}) {
334 # make space to insert
335 # $c->() returns (in reverse order) the first subsequence
336 # of consecutive integers; i.e., $c->(1,2,3,5,6,7) returns
337 # (3,2,1), and $c->(2,4,5) returns (2).
338 my $c;
339 $c = sub { return (($_[1]-$_[0] == 1) ? ($c->(@_[1..$#_]),$_[0]) : $_[0]); };
340 map {
341 $ordh->{$_+1} = $ordh->{$_}
342 } $c->(sort {$a <=> $b} grep {$_ >= $order} keys %{$ordh});
345 $ordh->{$order} = $name;
347 unless( exists( $self->{'_start_end_lists'}->{$id})) {
348 $self->{'_start_end_lists'}->{$id} = [];
350 push @{$self->{'_start_end_lists'}->{$id}}, $seq;
353 $self->{'_seq'}->{$name} = $seq;
358 =head2 remove_seq
360 Title : remove_seq
361 Usage : $aln->remove_seq($seq);
362 Function : Removes a single sequence from an alignment
363 Returns :
364 Argument : a Bio::LocatableSeq object
366 =cut
368 sub removeSeq {
369 my $self = shift;
370 $self->deprecated("removeSeq - deprecated method. Use remove_seq() instead.");
371 $self->remove_seq(@_);
374 sub remove_seq {
375 my $self = shift;
376 my $seq = shift;
377 my ($name,$id,$start,$end);
379 $self->throw("Need Bio::Locatable seq argument ")
380 unless ref $seq && $seq->isa( 'Bio::LocatableSeq');
382 $id = $seq->id();
383 $start = $seq->start();
384 $end = $seq->end();
385 $name = sprintf("%s/%d-%d",$id,$start,$end);
387 if( !exists $self->{'_seq'}->{$name} ) {
388 $self->throw("Sequence $name does not exist in the alignment to remove!");
391 delete $self->{'_seq'}->{$name};
393 # we need to remove this seq from the start_end_lists hash
395 if (exists $self->{'_start_end_lists'}->{$id}) {
396 # we need to find the sequence in the array.
398 my ($i, $found);;
399 for ($i=0; $i < @{$self->{'_start_end_lists'}->{$id}}; $i++) {
400 if (${$self->{'_start_end_lists'}->{$id}}[$i] eq $seq) {
401 $found = 1;
402 last;
405 if ($found) {
406 splice @{$self->{'_start_end_lists'}->{$id}}, $i, 1;
408 else {
409 $self->throw("Could not find the sequence to remoce from the start-end list");
412 else {
413 $self->throw("There is no seq list for the name $id");
415 # we need to shift order hash
416 my %rev_order = reverse %{$self->{'_order'}};
417 my $no = $rev_order{$name};
418 my $num_sequences = $self->num_sequences;
419 for (; $no < $num_sequences; $no++) {
420 $self->{'_order'}->{$no} = $self->{'_order'}->{$no+1};
422 delete $self->{'_order'}->{$no};
423 return 1;
427 =head2 purge
429 Title : purge
430 Usage : $aln->purge(0.7);
431 Function: Removes sequences above given sequence similarity
432 This function will grind on large alignments. Beware!
433 Example :
434 Returns : An array of the removed sequences
435 Args : float, threshold for similarity
437 =cut
439 sub purge {
440 my ($self,$perc) = @_;
441 my (%duplicate, @dups);
443 my @seqs = $self->each_seq();
445 for (my $i=0;$i< @seqs - 1;$i++ ) { #for each seq in alignment
446 my $seq = $seqs[$i];
448 #skip if already in duplicate hash
449 next if exists $duplicate{$seq->display_id} ;
450 my $one = $seq->seq();
452 my @one = split '', $one; #split to get 1aa per array element
454 for (my $j=$i+1;$j < @seqs;$j++) {
455 my $seq2 = $seqs[$j];
457 #skip if already in duplicate hash
458 next if exists $duplicate{$seq2->display_id} ;
460 my $two = $seq2->seq();
461 my @two = split '', $two;
463 my $count = 0;
464 my $res = 0;
465 for (my $k=0;$k<@one;$k++) {
466 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
467 $one[$k] eq $two[$k]) {
468 $count++;
470 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
471 $two[$k] ne '.' && $two[$k] ne '-' ) {
472 $res++;
476 my $ratio = 0;
477 $ratio = $count/$res unless $res == 0;
479 # if above threshold put in duplicate hash and push onto
480 # duplicate array for returning to get_unique
481 if ( $ratio > $perc ) {
482 $self->warn("duplicate: ", $seq2->display_id) if $self->verbose > 0;
483 $duplicate{$seq2->display_id} = 1;
484 push @dups, $seq2;
488 foreach my $seq (@dups) {
489 $self->remove_seq($seq);
491 return @dups;
494 =head2 sort_alphabetically
496 Title : sort_alphabetically
497 Usage : $ali->sort_alphabetically
498 Function : Changes the order of the alignment to alphabetical on name
499 followed by numerical by number.
500 Returns :
501 Argument :
503 =cut
505 sub sort_alphabetically {
506 my $self = shift;
507 my ($seq,$nse,@arr,%hash,$count);
509 foreach $seq ( $self->each_seq() ) {
510 $nse = $seq->get_nse;
511 $hash{$nse} = $seq;
514 $count = 0;
516 %{$self->{'_order'}} = (); # reset the hash;
518 foreach $nse ( sort _alpha_startend keys %hash) {
519 $self->{'_order'}->{$count} = $nse;
521 $count++;
526 =head2 sort_by_list
528 Title : sort_by_list
529 Usage : $aln_ordered=$aln->sort_by_list($list_file)
530 Function : Arbitrarily order sequences in an alignment
531 Returns : A new Bio::SimpleAlign object
532 Argument : a file listing sequence names in intended order (one name per line)
534 =cut
536 sub sort_by_list {
537 my ($self, $list) = @_;
538 my (@seq, @ids, %order);
540 foreach my $seq ( $self->each_seq() ) {
541 push @seq, $seq;
542 push @ids, $seq->display_id;
545 my $ct=1;
546 open(my $listfh, '<', $list) || $self->throw("can't open file for reading: $list");
547 while (<$listfh>) {
548 chomp;
549 my $name=$_;
550 $self->throw("Not found in alignment: $name") unless &_in_aln($name, \@ids);
551 $order{$name}=$ct++;
553 close($listfh);
555 # use the map-sort-map idiom:
556 my @sorted= map { $_->[1] } sort { $a->[0] <=> $b->[0] } map { [$order{$_->id()}, $_] } @seq;
557 my $aln = $self->new;
558 foreach (@sorted) { $aln->add_seq($_) }
559 return $aln;
562 =head2 set_new_reference
564 Title : set_new_reference
565 Usage : $aln->set_new_reference(3 or 'B31'): Select the 3rd sequence, or
566 the sequence whoes name is "B31" (full, exact, and case-sensitive),
567 as the reference (1st) sequence
568 Function : Change/Set a new reference (i.e., the first) sequence
569 Returns : a new Bio::SimpleAlign object.
570 Throws an exception if designated sequence not found
571 Argument : a positive integer of sequence order, or a sequence name
572 in the original alignment
574 =cut
576 sub set_new_reference {
577 my ($self, $seqid) = @_;
578 my $aln = $self->new;
579 my (@seq, @ids, @new_seq);
580 my $is_num=0;
581 foreach my $seq ( $self->each_seq() ) {
582 push @seq, $seq;
583 push @ids, $seq->display_id;
586 if ($seqid =~ /^\d+$/) { # argument is seq position
587 $is_num=1;
588 $self->throw("The new reference sequence number has to be a positive integer >1 and <= num_sequences ") if ($seqid <= 1 || $seqid > $self->num_sequences);
589 } else { # argument is a seq name
590 $self->throw("The new reference sequence not in alignment ") unless &_in_aln($seqid, \@ids);
593 for (my $i=0; $i<=$#seq; $i++) {
594 my $pos=$i+1;
595 if ( ($is_num && $pos == $seqid) || ($seqid eq $seq[$i]->display_id) ) {
596 unshift @new_seq, $seq[$i];
597 } else {
598 push @new_seq, $seq[$i];
601 foreach (@new_seq) { $aln->add_seq($_); }
602 return $aln;
605 sub _in_aln { # check if input name exists in the alignment
606 my ($str, $ref) = @_;
607 foreach (@$ref) {
608 return 1 if $str eq $_;
610 return 0;
614 =head2 uniq_seq
616 Title : uniq_seq
617 Usage : $aln->uniq_seq(): Remove identical sequences in
618 in the alignment. Ambiguous base ("N", "n") and
619 leading and ending gaps ("-") are NOT counted as
620 differences.
621 Function : Make a new alignment of unique sequence types (STs)
622 Returns : 1a. if called in a scalar context,
623 a new Bio::SimpleAlign object (all sequences renamed as "ST")
624 1b. if called in an array context,
625 a new Bio::SimpleAlign object, and a hashref whose keys
626 are sequence types, and whose values are arrayrefs to
627 lists of sequence ids within the corresponding sequence type
628 2. if $aln->verbose > 0, ST of each sequence is sent to
629 STDERR (in a tabular format)
630 Argument : None
632 =cut
634 sub uniq_seq {
635 my ($self, $seqid) = @_;
636 my $aln = $self->new;
637 my (%member, %order, @seq, @uniq_str, $st);
638 my $order=0;
639 my $len = $self->length();
640 $st = {};
641 foreach my $seq ( $self->each_seq() ) {
642 my $str = $seq->seq();
644 # it's necessary to ignore "n", "N", leading gaps and ending gaps in
645 # comparing two sequence strings
647 # 1st, convert "n", "N" to "?" (for DNA sequence only):
648 $str =~ s/n/\?/gi if $str =~ /^[atcgn-]+$/i;
649 # 2nd, convert leading and ending gaps to "?":
650 $str = &_convert_leading_ending_gaps($str, '-', '?');
651 # Note that '?' also can mean unknown residue.
652 # I don't like making global class member changes like this, too
653 # prone to errors... -- cjfields 08-11-18
654 local $Bio::LocatableSeq::GAP_SYMBOLS = '-\?';
655 my $new = Bio::LocatableSeq->new(
656 -id => $seq->id(),
657 -alphabet=> $seq->alphabet,
658 -seq => $str,
659 -start => $seq->start,
660 -end => $seq->end
662 push @seq, $new;
665 foreach my $seq (@seq) {
666 my $str = $seq->seq();
667 my ($seen, $key) = &_check_uniq($str, \@uniq_str, $len);
668 if ($seen) { # seen before
669 my @memb = @{$member{$key}};
670 push @memb, $seq;
671 $member{$key} = \@memb;
672 } else { # not seen
673 push @uniq_str, $key;
674 $order++;
675 $member{$key} = [ ($seq) ];
676 $order{$key} = $order;
680 foreach my $str (sort {$order{$a} <=> $order{$b}} keys %order) { # sort by input order
681 # convert leading/ending "?" back into "-" ("?" throws errors by SimpleAlign):
682 my $str2 = &_convert_leading_ending_gaps($str, '?', '-');
683 # convert middle "?" back into "N" ("?" throws errors by SimpleAlign):
684 $str2 =~ s/\?/N/g if $str2 =~ /^[atcg\-\?]+$/i;
685 my $gap='-';
686 my $end=length($str2);
687 $end -= length($1) while $str2 =~ m/($gap+)/g;
688 my $new = Bio::LocatableSeq->new(-id =>"ST".$order{$str},
689 -seq =>$str2,
690 -start=>1,
691 -end =>$end
693 $aln->add_seq($new);
694 foreach (@{$member{$str}}) {
695 push @{$$st{$order{$str}}}, $_->id(); # per Tristan's patch/Bug #2805
696 $self->debug($_->id(), "\t", "ST", $order{$str}, "\n");
699 return wantarray ? ($aln, $st) : $aln;
702 sub _check_uniq { # check if same seq exists in the alignment
703 my ($str1, $ref, $length) = @_;
704 my @char1=split //, $str1;
705 my @array=@$ref;
707 return (0, $str1) if @array==0; # not seen (1st sequence)
709 foreach my $str2 (@array) {
710 my $diff=0;
711 my @char2=split //, $str2;
712 for (my $i=0; $i<=$length-1; $i++) {
713 next if $char1[$i] eq '?';
714 next if $char2[$i] eq '?';
715 $diff++ if $char1[$i] ne $char2[$i];
717 return (1, $str2) if $diff == 0; # seen before
720 return (0, $str1); # not seen
723 sub _convert_leading_ending_gaps {
724 my $s=shift;
725 my $sym1=shift;
726 my $sym2=shift;
727 my @array=split //, $s;
728 # convert leading char:
729 for (my $i=0; $i<=$#array; $i++) {
730 ($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
732 # convert ending char:
733 for (my $i = $#array; $i>= 0; $i--) {
734 ($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
736 my $s_new=join '', @array;
737 return $s_new;
740 =head1 Sequence selection methods
742 Methods returning one or more sequences objects.
744 =head2 each_seq
746 Title : each_seq
747 Usage : foreach $seq ( $align->each_seq() )
748 Function : Gets a Seq object from the alignment
749 Returns : Seq object
750 Argument :
752 =cut
754 sub eachSeq {
755 my $self = shift;
756 $self->deprecated("eachSeq - deprecated method. Use each_seq() instead.");
757 $self->each_seq();
760 sub each_seq {
761 my $self = shift;
762 my (@arr,$order);
764 foreach $order ( sort { $a <=> $b } keys %{$self->{'_order'}} ) {
765 if( exists $self->{'_seq'}->{$self->{'_order'}->{$order}} ) {
766 push(@arr,$self->{'_seq'}->{$self->{'_order'}->{$order}});
769 return @arr;
773 =head2 each_alphabetically
775 Title : each_alphabetically
776 Usage : foreach $seq ( $ali->each_alphabetically() )
777 Function : Returns a sequence object, but the objects are returned
778 in alphabetically sorted order.
779 Does not change the order of the alignment.
780 Returns : Seq object
781 Argument :
783 =cut
785 sub each_alphabetically {
786 my $self = shift;
787 my ($seq,$nse,@arr,%hash,$count);
789 foreach $seq ( $self->each_seq() ) {
790 $nse = $seq->get_nse;
791 $hash{$nse} = $seq;
794 foreach $nse ( sort _alpha_startend keys %hash) {
795 push(@arr,$hash{$nse});
797 return @arr;
800 sub _alpha_startend {
801 my ($aname,$astart,$bname,$bstart);
802 ($aname,$astart) = split (/-/,$a);
803 ($bname,$bstart) = split (/-/,$b);
805 if( $aname eq $bname ) {
806 return $astart <=> $bstart;
808 else {
809 return $aname cmp $bname;
813 =head2 each_seq_with_id
815 Title : each_seq_with_id
816 Usage : foreach $seq ( $align->each_seq_with_id() )
817 Function : Gets a Seq objects from the alignment, the contents
818 being those sequences with the given name (there may be
819 more than one)
820 Returns : Seq object
821 Argument : a seq name
823 =cut
825 sub eachSeqWithId {
826 my $self = shift;
827 $self->deprecated("eachSeqWithId - deprecated method. Use each_seq_with_id() instead.");
828 $self->each_seq_with_id(@_);
831 sub each_seq_with_id {
832 my $self = shift;
833 my $id = shift;
835 $self->throw("Method each_seq_with_id needs a sequence name argument")
836 unless defined $id;
838 my (@arr, $seq);
840 if (exists($self->{'_start_end_lists'}->{$id})) {
841 @arr = @{$self->{'_start_end_lists'}->{$id}};
843 return @arr;
846 =head2 get_seq_by_pos
848 Title : get_seq_by_pos
849 Usage : $seq = $aln->get_seq_by_pos(3) # third sequence from the alignment
850 Function : Gets a sequence based on its position in the alignment.
851 Numbering starts from 1. Sequence positions larger than
852 num_sequences() will thow an error.
853 Returns : a Bio::LocatableSeq object
854 Args : positive integer for the sequence position
856 =cut
858 sub get_seq_by_pos {
860 my $self = shift;
861 my ($pos) = @_;
863 $self->throw("Sequence position has to be a positive integer, not [$pos]")
864 unless $pos =~ /^\d+$/ and $pos > 0;
865 $self->throw("No sequence at position [$pos]")
866 unless $pos <= $self->num_sequences ;
868 my $nse = $self->{'_order'}->{--$pos};
869 return $self->{'_seq'}->{$nse};
872 =head2 get_seq_by_id
874 Title : get_seq_by_id
875 Usage : $seq = $aln->get_seq_by_id($name) # seq named $name
876 Function : Gets a sequence based on its name.
877 Sequences that do not exist will warn and return undef
878 Returns : a Bio::LocatableSeq object
879 Args : string for sequence name
881 =cut
883 sub get_seq_by_id {
884 my ($self,$name) = @_;
885 unless( defined $name ) {
886 $self->warn("Must provide a sequence name");
887 return;
889 for my $seq ( values %{$self->{'_seq'}} ) {
890 if ( $seq->id eq $name) {
891 return $seq;
894 return;
897 =head2 seq_with_features
899 Title : seq_with_features
900 Usage : $seq = $aln->seq_with_features(-pos => 1,
901 -consensus => 60
902 -mask =>
903 sub { my $consensus = shift;
905 for my $i (1..5){
906 my $n = 'N' x $i;
907 my $q = '\?' x $i;
908 while($consensus =~ /[^?]$q[^?]/){
909 $consensus =~ s/([^?])$q([^?])/$1$n$2/;
912 return $consensus;
915 Function: produces a Bio::Seq object by first splicing gaps from -pos
916 (by means of a splice_by_seq_pos() call), then creating
917 features using non-? chars (by means of a consensus_string()
918 call with stringency -consensus).
919 Returns : a Bio::Seq object
920 Args : -pos : required. sequence from which to build the Bio::Seq
921 object
922 -consensus : optional, defaults to consensus_string()'s
923 default cutoff value
924 -mask : optional, a coderef to apply to consensus_string()'s
925 output before building features. this may be useful for
926 closing gaps of 1 bp by masking over them with N, for
927 instance
929 =cut
931 sub seq_with_features{
932 my ($self,%arg) = @_;
934 #first do the preparatory splice
935 $self->throw("must provide a -pos argument") unless $arg{-pos};
936 $self->splice_by_seq_pos($arg{-pos});
938 my $consensus_string = $self->consensus_string($arg{-consensus});
939 $consensus_string = $arg{-mask}->($consensus_string)
940 if defined($arg{-mask});
942 my(@bs,@es);
944 push @bs, 1 if $consensus_string =~ /^[^?]/;
946 while($consensus_string =~ /\?[^?]/g){
947 push @bs, pos($consensus_string);
949 while($consensus_string =~ /[^?]\?/g){
950 push @es, pos($consensus_string);
953 push @es, length($consensus_string) if $consensus_string =~ /[^?]$/;
955 my $seq = Bio::Seq->new();
957 # my $rootfeature = Bio::SeqFeature::Generic->new(
958 # -source_tag => 'location',
959 # -start => $self->get_seq_by_pos($arg{-pos})->start,
960 # -end => $self->get_seq_by_pos($arg{-pos})->end,
961 # );
962 # $seq->add_SeqFeature($rootfeature);
964 while(my $b = shift @bs){
965 my $e = shift @es;
966 $seq->add_SeqFeature(
967 Bio::SeqFeature::Generic->new(
968 -start => $b - 1 + $self->get_seq_by_pos($arg{-pos})->start,
969 -end => $e - 1 + $self->get_seq_by_pos($arg{-pos})->start,
970 -source_tag => $self->source || 'MSA',
975 return $seq;
979 =head1 Create new alignments
981 The result of these methods are horizontal or vertical subsets of the
982 current MSA.
984 =head2 select
986 Title : select
987 Usage : $aln2 = $aln->select(1, 3) # three first sequences
988 Function : Creates a new alignment from a continuous subset of
989 sequences. Numbering starts from 1. Sequence positions
990 larger than num_sequences() will thow an error.
991 Returns : a Bio::SimpleAlign object
992 Args : positive integer for the first sequence
993 positive integer for the last sequence to include (optional)
995 =cut
997 sub select {
998 my $self = shift;
999 my ($start, $end) = @_;
1001 $self->throw("Select start has to be a positive integer, not [$start]")
1002 unless $start =~ /^\d+$/ and $start > 0;
1003 $self->throw("Select end has to be a positive integer, not [$end]")
1004 unless $end =~ /^\d+$/ and $end > 0;
1005 $self->throw("Select $start [$start] has to be smaller than or equal to end [$end]")
1006 unless $start <= $end;
1008 my $aln = $self->new;
1009 foreach my $pos ($start .. $end) {
1010 $aln->add_seq($self->get_seq_by_pos($pos));
1012 $aln->id($self->id);
1013 # fix for meta, sf, ann
1014 return $aln;
1017 =head2 select_noncont
1019 Title : select_noncont
1020 Usage : # 1st and 3rd sequences, sorted
1021 $aln2 = $aln->select_noncont(1, 3)
1023 # 1st and 3rd sequences, sorted (same as first)
1024 $aln2 = $aln->select_noncont(3, 1)
1026 # 1st and 3rd sequences, unsorted
1027 $aln2 = $aln->select_noncont('nosort',3, 1)
1029 Function : Creates a new alignment from a subset of sequences. Numbering
1030 starts from 1. Sequence positions larger than num_sequences() will
1031 throw an error. Sorts the order added to new alignment by default,
1032 to prevent sorting pass 'nosort' as the first argument in the list.
1033 Returns : a Bio::SimpleAlign object
1034 Args : array of integers for the sequences. If the string 'nosort' is
1035 passed as the first argument, the sequences will not be sorted
1036 in the new alignment but will appear in the order listed.
1038 =cut
1040 sub select_noncont {
1041 my $self = shift;
1042 my $nosort = 0;
1043 my (@pos) = @_;
1044 if ($pos[0] !~ m{^\d+$}) {
1045 my $sortcmd = shift @pos;
1046 if ($sortcmd eq 'nosort') {
1047 $nosort = 1;
1048 } else {
1049 $self->throw("Command not recognized: $sortcmd. Only 'nosort' implemented at this time.");
1053 my $end = $self->num_sequences;
1054 foreach ( @pos ) {
1055 $self->throw("position must be a positive integer, > 0 and <= $end not [$_]")
1056 unless( /^\d+$/ && $_ > 0 && $_ <= $end );
1059 @pos = sort {$a <=> $b} @pos unless $nosort;
1061 my $aln = $self->new;
1062 foreach my $p (@pos) {
1063 $aln->add_seq($self->get_seq_by_pos($p));
1065 $aln->id($self->id);
1066 # fix for meta, sf, ann
1067 return $aln;
1070 =head2 slice
1072 Title : slice
1073 Usage : $aln2 = $aln->slice(20,30)
1074 Function : Creates a slice from the alignment inclusive of start and
1075 end columns, and the first column in the alignment is denoted 1.
1076 Sequences with no residues in the slice are excluded from the
1077 new alignment and a warning is printed. Slice beyond the length of
1078 the sequence does not do padding.
1079 Returns : A Bio::SimpleAlign object
1080 Args : Positive integer for start column, positive integer for end column,
1081 optional boolean which if true will keep gap-only columns in the newly
1082 created slice. Example:
1084 $aln2 = $aln->slice(20,30,1)
1086 =cut
1088 sub slice {
1089 my $self = shift;
1090 my ($start, $end, $keep_gap_only) = @_;
1092 $self->throw("Slice start has to be a positive integer, not [$start]")
1093 unless $start =~ /^\d+$/ and $start > 0;
1094 $self->throw("Slice end has to be a positive integer, not [$end]")
1095 unless $end =~ /^\d+$/ and $end > 0;
1096 $self->throw("Slice start [$start] has to be smaller than or equal to end [$end]")
1097 unless $start <= $end;
1098 $self->throw("This alignment has only ". $self->length . " residues. Slice start " .
1099 "[$start] is too big.") if $start > $self->length;
1100 my $cons_meta = $self->consensus_meta;
1101 my $aln = $self->new;
1102 $aln->id($self->id);
1103 foreach my $seq ( $self->each_seq() ) {
1104 my $new_seq = $seq->isa('Bio::Seq::MetaI') ?
1105 Bio::Seq::Meta->new
1106 (-id => $seq->id,
1107 -alphabet => $seq->alphabet,
1108 -strand => $seq->strand,
1109 -verbose => $self->verbose) :
1110 Bio::LocatableSeq->new
1111 (-id => $seq->id,
1112 -alphabet => $seq->alphabet,
1113 -strand => $seq->strand,
1114 -verbose => $self->verbose);
1116 # seq
1117 my $seq_end = $end;
1118 $seq_end = $seq->length if( $end > $seq->length );
1120 my $slice_seq = $seq->subseq($start, $seq_end);
1121 $new_seq->seq( $slice_seq );
1123 $slice_seq =~ s/\W//g;
1125 if ($start > 1) {
1126 my $pre_start_seq = $seq->subseq(1, $start - 1);
1127 $pre_start_seq =~ s/\W//g;
1128 if (!defined($seq->strand)) {
1129 $new_seq->start( $seq->start + CORE::length($pre_start_seq) );
1130 } elsif ($seq->strand < 0){
1131 $new_seq->start( $seq->end - CORE::length($pre_start_seq) - CORE::length($slice_seq) + 1);
1132 } else {
1133 $new_seq->start( $seq->start + CORE::length($pre_start_seq) );
1135 } else {
1136 $new_seq->start( $seq->start);
1138 if ($new_seq->isa('Bio::Seq::MetaI')) {
1139 for my $meta_name ($seq->meta_names) {
1140 $new_seq->named_meta($meta_name, $seq->named_submeta($meta_name, $start, $end));
1143 $new_seq->end( $new_seq->start + CORE::length($slice_seq) - 1 );
1145 if ($new_seq->start and $new_seq->end >= $new_seq->start) {
1146 $aln->add_seq($new_seq);
1147 } else {
1148 if( $keep_gap_only ) {
1149 $aln->add_seq($new_seq);
1150 } else {
1151 my $nse = $seq->get_nse();
1152 $self->warn("Slice [$start-$end] of sequence [$nse] contains no residues.".
1153 " Sequence excluded from the new alignment.");
1157 if ($cons_meta) {
1158 my $new = Bio::Seq::Meta->new();
1159 for my $meta_name ($cons_meta->meta_names) {
1160 $new->named_meta($meta_name, $cons_meta->named_submeta($meta_name, $start, $end));
1162 $aln->consensus_meta($new);
1164 $aln->annotation($self->annotation);
1165 # fix for meta, sf, ann
1166 return $aln;
1169 =head2 remove_columns
1171 Title : remove_columns
1172 Usage : $aln2 = $aln->remove_columns(['mismatch','weak']) or
1173 $aln2 = $aln->remove_columns([0,0],[6,8])
1174 Function : Creates an aligment with columns removed corresponding to
1175 the specified type or by specifying the columns by number.
1176 Returns : Bio::SimpleAlign object
1177 Args : Array ref of types ('match'|'weak'|'strong'|'mismatch'|'gaps'|
1178 'all_gaps_columns') or array ref where the referenced array
1179 contains a pair of integers that specify a range.
1180 The first column is 0
1182 =cut
1184 sub remove_columns {
1185 my ($self,@args) = @_;
1186 @args || $self->throw("Must supply column ranges or column types");
1187 my $aln;
1189 if ($args[0][0] =~ /^[a-z_]+$/i) {
1190 $aln = $self->_remove_columns_by_type($args[0]);
1191 } elsif ($args[0][0] =~ /^\d+$/) {
1192 $aln = $self->_remove_columns_by_num(\@args);
1193 } else {
1194 $self->throw("You must pass array references to remove_columns(), not @args");
1196 # fix for meta, sf, ann
1197 $aln;
1201 =head2 remove_gaps
1203 Title : remove_gaps
1204 Usage : $aln2 = $aln->remove_gaps
1205 Function : Creates an aligment with gaps removed
1206 Returns : a Bio::SimpleAlign object
1207 Args : a gap character(optional) if none specified taken
1208 from $self->gap_char,
1209 [optional] $all_gaps_columns flag (1 or 0, default is 0)
1210 indicates that only all-gaps columns should be deleted
1212 Used from method L<remove_columns> in most cases. Set gap character
1213 using L<gap_char()|gap_char>.
1215 =cut
1217 sub remove_gaps {
1218 my ($self,$gapchar,$all_gaps_columns) = @_;
1219 my $gap_line;
1220 if ($all_gaps_columns) {
1221 $gap_line = $self->all_gap_line($gapchar);
1222 } else {
1223 $gap_line = $self->gap_line($gapchar);
1225 my $aln = $self->new;
1227 my @remove;
1228 my $length = 0;
1229 my $del_char = $gapchar || $self->gap_char;
1230 # Do the matching to get the segments to remove
1231 while ($gap_line =~ m/[$del_char]/g) {
1232 my $start = pos($gap_line)-1;
1233 $gap_line=~/\G[$del_char]+/gc;
1234 my $end = pos($gap_line)-1;
1236 #have to offset the start and end for subsequent removes
1237 $start-=$length;
1238 $end -=$length;
1239 $length += ($end-$start+1);
1240 push @remove, [$start,$end];
1243 #remove the segments
1244 $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1245 # fix for meta, sf, ann
1246 return $aln;
1250 sub _remove_col {
1251 my ($self,$aln,$remove) = @_;
1252 my @new;
1254 my $gap = $self->gap_char;
1256 # splice out the segments and create new seq
1257 foreach my $seq($self->each_seq){
1258 my $new_seq = Bio::LocatableSeq->new(
1259 -id => $seq->id,
1260 -alphabet=> $seq->alphabet,
1261 -strand => $seq->strand,
1262 -verbose => $self->verbose);
1263 my $sequence = $seq->seq;
1264 foreach my $pair(@{$remove}){
1265 my $start = $pair->[0];
1266 my $end = $pair->[1];
1267 $sequence = $seq->seq unless $sequence;
1268 my $orig = $sequence;
1269 my $head = $start > 0 ? substr($sequence, 0, $start) : '';
1270 my $tail = ($end + 1) >= length($sequence) ? '' : substr($sequence, $end + 1);
1271 $sequence = $head.$tail;
1272 # start
1273 unless (defined $new_seq->start) {
1274 if ($start == 0) {
1275 my $start_adjust = () = substr($orig, 0, $end + 1) =~ /$gap/g;
1276 $new_seq->start($seq->start + $end + 1 - $start_adjust);
1278 else {
1279 my $start_adjust = $orig =~ /^$gap+/;
1280 if ($start_adjust) {
1281 $start_adjust = $+[0] == $start;
1283 $new_seq->start($seq->start + $start_adjust);
1286 # end
1287 if (($end + 1) >= length($orig)) {
1288 my $end_adjust = () = substr($orig, $start) =~ /$gap/g;
1289 $new_seq->end($seq->end - (length($orig) - $start) + $end_adjust);
1291 else {
1292 $new_seq->end($seq->end);
1296 if ($new_seq->end < $new_seq->start) {
1297 # we removed all columns except for gaps: set to 0 to indicate no
1298 # sequence
1299 $new_seq->start(0);
1300 $new_seq->end(0);
1303 $new_seq->seq($sequence) if $sequence;
1304 push @new, $new_seq;
1306 # add the new seqs to the alignment
1307 foreach my $new(@new){
1308 $aln->add_seq($new);
1310 # fix for meta, sf, ann
1311 return $aln;
1314 sub _remove_columns_by_type {
1315 my ($self,$type) = @_;
1316 my $aln = $self->new;
1317 my @remove;
1319 my $gap = $self->gap_char if (grep { $_ eq 'gaps'} @{$type});
1320 my $all_gaps_columns = $self->gap_char if (grep /all_gaps_columns/,@{$type});
1321 my %matchchars = ( 'match' => '\*',
1322 'weak' => '\.',
1323 'strong' => ':',
1324 'mismatch' => ' ',
1325 'gaps' => '',
1326 'all_gaps_columns' => ''
1328 # get the characters to delete against
1329 my $del_char;
1330 foreach my $type (@{$type}){
1331 $del_char.= $matchchars{$type};
1334 my $length = 0;
1335 my $match_line = $self->match_line;
1336 # do the matching to get the segments to remove
1337 if($del_char){
1338 while($match_line =~ m/[$del_char]/g ){
1339 my $start = pos($match_line)-1;
1340 $match_line=~/\G[$del_char]+/gc;
1341 my $end = pos($match_line)-1;
1343 #have to offset the start and end for subsequent removes
1344 $start-=$length;
1345 $end -=$length;
1346 $length += ($end-$start+1);
1347 push @remove, [$start,$end];
1351 # remove the segments
1352 $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1353 $aln = $aln->remove_gaps() if $gap;
1354 $aln = $aln->remove_gaps('', 1) if $all_gaps_columns;
1355 # fix for meta, sf, ann
1356 $aln;
1360 sub _remove_columns_by_num {
1361 my ($self,$positions) = @_;
1362 my $aln = $self->new;
1364 # sort the positions
1365 @$positions = sort { $a->[0] <=> $b->[0] } @$positions;
1367 my @remove;
1368 my $length = 0;
1369 foreach my $pos (@{$positions}) {
1370 my ($start, $end) = @{$pos};
1372 #have to offset the start and end for subsequent removes
1373 $start-=$length;
1374 $end -=$length;
1375 $length += ($end-$start+1);
1376 push @remove, [$start,$end];
1379 #remove the segments
1380 $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1381 # fix for meta, sf, ann
1382 $aln;
1386 =head1 Change sequences within the MSA
1388 These methods affect characters in all sequences without changing the
1389 alignment.
1391 =head2 splice_by_seq_pos
1393 Title : splice_by_seq_pos
1394 Usage : $status = splice_by_seq_pos(1);
1395 Function: splices all aligned sequences where the specified sequence
1396 has gaps.
1397 Example :
1398 Returns : 1 on success
1399 Args : position of sequence to splice by
1402 =cut
1404 sub splice_by_seq_pos{
1405 my ($self,$pos) = @_;
1407 my $guide = $self->get_seq_by_pos($pos);
1408 my $guide_seq = $guide->seq;
1410 $guide_seq =~ s/\./\-/g;
1412 my @gaps = ();
1413 $pos = -1;
1414 while(($pos = index($guide_seq, '-', $pos)) > -1 ){
1415 unshift @gaps, $pos;
1416 $pos++;
1419 foreach my $seq ($self->each_seq){
1420 my @bases = split '', $seq->seq;
1422 splice(@bases, $_, 1) foreach @gaps;
1423 $seq->seq(join('', @bases));
1429 =head2 map_chars
1431 Title : map_chars
1432 Usage : $ali->map_chars('\.','-')
1433 Function : Does a s/$arg1/$arg2/ on the sequences. Useful for gap
1434 characters
1436 Notice that the from (arg1) is interpretted as a regex,
1437 so be careful about quoting meta characters (eg
1438 $ali->map_chars('.','-') wont do what you want)
1439 Returns :
1440 Argument : 'from' rexexp
1441 'to' string
1443 =cut
1445 sub map_chars {
1446 my $self = shift;
1447 my $from = shift;
1448 my $to = shift;
1449 my ($seq,$temp);
1451 $self->throw("Need exactly two arguments")
1452 unless defined $from and defined $to;
1454 foreach $seq ( $self->each_seq() ) {
1455 $temp = $seq->seq();
1456 $temp =~ s/$from/$to/g;
1457 $seq->seq($temp);
1459 return 1;
1463 =head2 uppercase
1465 Title : uppercase()
1466 Usage : $ali->uppercase()
1467 Function : Sets all the sequences to uppercase
1468 Returns :
1469 Argument :
1471 =cut
1473 sub uppercase {
1474 my $self = shift;
1475 my $seq;
1476 my $temp;
1478 foreach $seq ( $self->each_seq() ) {
1479 $temp = $seq->seq();
1480 $temp =~ tr/[a-z]/[A-Z]/;
1482 $seq->seq($temp);
1484 return 1;
1487 =head2 cigar_line
1489 Title : cigar_line()
1490 Usage : %cigars = $align->cigar_line()
1491 Function : Generates a "cigar" (Compact Idiosyncratic Gapped Alignment
1492 Report) line for each sequence in the alignment. Examples are
1493 "1,60" or "5,10:12,58", where the numbers refer to conserved
1494 positions within the alignment. The keys of the hash are the
1495 NSEs (name/start/end) assigned to each sequence.
1496 Args : threshold (optional, defaults to 100)
1497 Returns : Hash of strings (cigar lines)
1499 =cut
1501 sub cigar_line {
1502 my $self = shift;
1503 my $thr=shift||100;
1504 my %cigars;
1506 my @consensus = split "",($self->consensus_string($thr));
1507 my $len = $self->length;
1508 my $gapchar = $self->gap_char;
1510 # create a precursor, something like (1,4,5,6,7,33,45),
1511 # where each number corresponds to a conserved position
1512 foreach my $seq ( $self->each_seq ) {
1513 my @seq = split "", uc ($seq->seq);
1514 my $pos = 1;
1515 for (my $x = 0 ; $x < $len ; $x++ ) {
1516 if ($seq[$x] eq $consensus[$x]) {
1517 push @{$cigars{$seq->get_nse}},$pos;
1518 $pos++;
1519 } elsif ($seq[$x] ne $gapchar) {
1520 $pos++;
1524 # duplicate numbers - (1,4,5,6,7,33,45) becomes (1,1,4,5,6,7,33,33,45,45)
1525 for my $name (keys %cigars) {
1526 splice @{$cigars{$name}}, 1, 0, ${$cigars{$name}}[0] if
1527 ( ${$cigars{$name}}[0] + 1 < ${$cigars{$name}}[1] );
1528 push @{$cigars{$name}}, ${$cigars{$name}}[$#{$cigars{$name}}] if
1529 ( ${$cigars{$name}}[($#{$cigars{$name}} - 1)] + 1 <
1530 ${$cigars{$name}}[$#{$cigars{$name}}] );
1531 for ( my $x = 1 ; $x < $#{$cigars{$name}} - 1 ; $x++) {
1532 if (${$cigars{$name}}[$x - 1] + 1 < ${$cigars{$name}}[$x] &&
1533 ${$cigars{$name}}[$x + 1] > ${$cigars{$name}}[$x] + 1) {
1534 splice @{$cigars{$name}}, $x, 0, ${$cigars{$name}}[$x];
1538 # collapse series - (1,1,4,5,6,7,33,33,45,45) becomes (1,1,4,7,33,33,45,45)
1539 for my $name (keys %cigars) {
1540 my @remove;
1541 for ( my $x = 0 ; $x < $#{$cigars{$name}} ; $x++) {
1542 if ( ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x - 1)] + 1 &&
1543 ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x + 1)] - 1 ) {
1544 unshift @remove,$x;
1547 for my $pos (@remove) {
1548 splice @{$cigars{$name}}, $pos, 1;
1551 # join and punctuate
1552 for my $name (keys %cigars) {
1553 my ($start,$end,$str) = "";
1554 while ( ($start,$end) = splice @{$cigars{$name}}, 0, 2 ) {
1555 $str .= ($start . "," . $end . ":");
1557 $str =~ s/:$//;
1558 $cigars{$name} = $str;
1560 %cigars;
1564 =head2 match_line
1566 Title : match_line()
1567 Usage : $line = $align->match_line()
1568 Function : Generates a match line - much like consensus string
1569 except that a line indicating the '*' for a match.
1570 Args : (optional) Match line characters ('*' by default)
1571 (optional) Strong match char (':' by default)
1572 (optional) Weak match char ('.' by default)
1573 Returns : String
1575 =cut
1577 sub match_line {
1578 my ($self,$matchlinechar, $strong, $weak) = @_;
1579 my %matchchars = ('match' => $matchlinechar || '*',
1580 'weak' => $weak || '.',
1581 'strong' => $strong || ':',
1582 'mismatch' => ' ',
1585 my @seqchars;
1586 my $alphabet;
1587 foreach my $seq ( $self->each_seq ) {
1588 push @seqchars, [ split(//, uc ($seq->seq)) ];
1589 $alphabet = $seq->alphabet unless defined $alphabet;
1591 my $refseq = shift @seqchars;
1592 # let's just march down the columns
1593 my $matchline;
1594 POS:
1595 foreach my $pos ( 0..$self->length ) {
1596 my $refchar = $refseq->[$pos];
1597 my $char = $matchchars{'mismatch'};
1598 unless( defined $refchar ) {
1599 last if $pos == $self->length; # short circuit on last residue
1600 # this in place to handle jason's soon-to-be-committed
1601 # intron mapping code
1602 goto bottom;
1604 my %col = ($refchar => 1);
1605 my $dash = ($refchar eq '-' || $refchar eq '.' || $refchar eq ' ');
1606 foreach my $seq ( @seqchars ) {
1607 next if $pos >= scalar @$seq;
1608 $dash = 1 if( $seq->[$pos] eq '-' || $seq->[$pos] eq '.' ||
1609 $seq->[$pos] eq ' ' );
1610 $col{$seq->[$pos]}++ if defined $seq->[$pos];
1612 my @colresidues = sort keys %col;
1614 # if all the values are the same
1615 if( $dash ) { $char = $matchchars{'mismatch'} }
1616 elsif( @colresidues == 1 ) { $char = $matchchars{'match'} }
1617 elsif( $alphabet eq 'protein' ) { # only try to do weak/strong
1618 # matches for protein seqs
1619 TYPE:
1620 foreach my $type ( qw(strong weak) ) {
1621 # iterate through categories
1622 my %groups;
1623 # iterate through each of the aa in the col
1624 # look to see which groups it is in
1625 foreach my $c ( @colresidues ) {
1626 foreach my $f ( grep { index($_,$c) >= 0 } @{$CONSERVATION_GROUPS{$type}} ) {
1627 push @{$groups{$f}},$c;
1630 GRP:
1631 foreach my $cols ( values %groups ) {
1632 @$cols = sort @$cols;
1633 # now we are just testing to see if two arrays
1634 # are identical w/o changing either one
1635 # have to be same len
1636 next if( scalar @$cols != scalar @colresidues );
1637 # walk down the length and check each slot
1638 for($_=0;$_ < (scalar @$cols);$_++ ) {
1639 next GRP if( $cols->[$_] ne $colresidues[$_] );
1641 $char = $matchchars{$type};
1642 last TYPE;
1646 bottom:
1647 $matchline .= $char;
1649 return $matchline;
1653 =head2 gap_line
1655 Title : gap_line()
1656 Usage : $line = $align->gap_line()
1657 Function : Generates a gap line - much like consensus string
1658 except that a line where '-' represents gap
1659 Args : (optional) gap line characters ('-' by default)
1660 Returns : string
1662 =cut
1664 sub gap_line {
1665 my ($self,$gapchar) = @_;
1666 $gapchar = $gapchar || $self->gap_char;
1667 my %gap_hsh; # column gaps vector
1668 foreach my $seq ( $self->each_seq ) {
1669 my $i = 0;
1670 map {$gap_hsh{$_->[0]} = undef} grep {$_->[1] eq $gapchar}
1671 map {[$i++, $_]} split(//, uc ($seq->seq));
1673 my $gap_line;
1674 foreach my $pos ( 0..$self->length-1 ) {
1675 $gap_line .= (exists $gap_hsh{$pos}) ? $gapchar:'.';
1677 return $gap_line;
1680 =head2 all_gap_line
1682 Title : all_gap_line()
1683 Usage : $line = $align->all_gap_line()
1684 Function : Generates a gap line - much like consensus string
1685 except that a line where '-' represents all-gap column
1686 Args : (optional) gap line characters ('-' by default)
1687 Returns : string
1689 =cut
1691 sub all_gap_line {
1692 my ($self,$gapchar) = @_;
1693 $gapchar = $gapchar || $self->gap_char;
1694 my %gap_hsh; # column gaps counter hash
1695 my @seqs = $self->each_seq;
1696 foreach my $seq ( @seqs ) {
1697 my $i = 0;
1698 map {$gap_hsh{$_->[0]}++} grep {$_->[1] eq $gapchar}
1699 map {[$i++, $_]} split(//, uc ($seq->seq));
1701 my $gap_line;
1702 foreach my $pos ( 0..$self->length-1 ) {
1703 if (exists $gap_hsh{$pos} && $gap_hsh{$pos} == scalar @seqs) {
1704 # gaps column
1705 $gap_line .= $gapchar;
1706 } else {
1707 $gap_line .= '.';
1710 return $gap_line;
1713 =head2 gap_col_matrix
1715 Title : gap_col_matrix()
1716 Usage : my $cols = $align->gap_col_matrix()
1717 Function : Generates an array of hashes where
1718 each entry in the array is a hash reference
1719 with keys of all the sequence names and
1720 and value of 1 or 0 if the sequence has a gap at that column
1721 Args : (optional) gap line characters ($aln->gap_char or '-' by default)
1723 =cut
1725 sub gap_col_matrix {
1726 my ($self,$gapchar) = @_;
1727 $gapchar = $gapchar || $self->gap_char;
1728 my %gap_hsh; # column gaps vector
1729 my @cols;
1730 foreach my $seq ( $self->each_seq ) {
1731 my $i = 0;
1732 my $str = $seq->seq;
1733 my $len = $seq->length;
1734 my $ch;
1735 my $id = $seq->display_id;
1736 while( $i < $len ) {
1737 $ch = substr($str, $i, 1);
1738 $cols[$i++]->{$id} = ($ch eq $gapchar);
1741 return \@cols;
1744 =head2 match
1746 Title : match()
1747 Usage : $ali->match()
1748 Function : Goes through all columns and changes residues that are
1749 identical to residue in first sequence to match '.'
1750 character. Sets match_char.
1752 USE WITH CARE: Most MSA formats do not support match
1753 characters in sequences, so this is mostly for output
1754 only. NEXUS format (Bio::AlignIO::nexus) can handle
1756 Returns : 1
1757 Argument : a match character, optional, defaults to '.'
1759 =cut
1761 sub match {
1762 my ($self, $match) = @_;
1764 $match ||= '.';
1765 my ($matching_char) = $match;
1766 $matching_char = "\\$match" if $match =~ /[\^.$|()\[\]]/ ; #';
1767 $self->map_chars($matching_char, '-');
1769 my @seqs = $self->each_seq();
1770 return 1 unless scalar @seqs > 1;
1772 my $refseq = shift @seqs ;
1773 my @refseq = split //, $refseq->seq;
1774 my $gapchar = $self->gap_char;
1776 foreach my $seq ( @seqs ) {
1777 my @varseq = split //, $seq->seq();
1778 for ( my $i=0; $i < scalar @varseq; $i++) {
1779 $varseq[$i] = $match if defined $refseq[$i] &&
1780 ( $refseq[$i] =~ /[A-Za-z\*]/ ||
1781 $refseq[$i] =~ /$gapchar/ )
1782 && $refseq[$i] eq $varseq[$i];
1784 $seq->seq(join '', @varseq);
1786 $self->match_char($match);
1787 return 1;
1791 =head2 unmatch
1793 Title : unmatch()
1794 Usage : $ali->unmatch()
1795 Function : Undoes the effect of method match. Unsets match_char.
1796 Returns : 1
1797 Argument : a match character, optional, defaults to '.'
1799 See L<match> and L<match_char>
1801 =cut
1803 sub unmatch {
1804 my ($self, $match) = @_;
1806 $match ||= '.';
1808 my @seqs = $self->each_seq();
1809 return 1 unless scalar @seqs > 1;
1811 my $refseq = shift @seqs ;
1812 my @refseq = split //, $refseq->seq;
1813 my $gapchar = $self->gap_char;
1814 foreach my $seq ( @seqs ) {
1815 my @varseq = split //, $seq->seq();
1816 for ( my $i=0; $i < scalar @varseq; $i++) {
1817 $varseq[$i] = $refseq[$i] if defined $refseq[$i] &&
1818 ( $refseq[$i] =~ /[A-Za-z\*]/ ||
1819 $refseq[$i] =~ /$gapchar/ ) &&
1820 $varseq[$i] eq $match;
1822 $seq->seq(join '', @varseq);
1824 $self->match_char('');
1825 return 1;
1828 =head1 MSA attributes
1830 Methods for setting and reading the MSA attributes.
1832 Note that the methods defining character semantics depend on the user
1833 to set them sensibly. They are needed only by certain input/output
1834 methods. Unset them by setting to an empty string ('').
1836 =head2 id
1838 Title : id
1839 Usage : $myalign->id("Ig")
1840 Function : Gets/sets the id field of the alignment
1841 Returns : An id string
1842 Argument : An id string (optional)
1844 =cut
1846 sub id {
1847 my ($self, $name) = @_;
1849 if (defined( $name )) {
1850 $self->{'_id'} = $name;
1853 return $self->{'_id'};
1856 =head2 accession
1858 Title : accession
1859 Usage : $myalign->accession("PF00244")
1860 Function : Gets/sets the accession field of the alignment
1861 Returns : An acc string
1862 Argument : An acc string (optional)
1864 =cut
1866 sub accession {
1867 my ($self, $acc) = @_;
1869 if (defined( $acc )) {
1870 $self->{'_accession'} = $acc;
1873 return $self->{'_accession'};
1876 =head2 description
1878 Title : description
1879 Usage : $myalign->description("14-3-3 proteins")
1880 Function : Gets/sets the description field of the alignment
1881 Returns : An description string
1882 Argument : An description string (optional)
1884 =cut
1886 sub description {
1887 my ($self, $name) = @_;
1889 if (defined( $name )) {
1890 $self->{'_description'} = $name;
1893 return $self->{'_description'};
1896 =head2 missing_char
1898 Title : missing_char
1899 Usage : $myalign->missing_char("?")
1900 Function : Gets/sets the missing_char attribute of the alignment
1901 It is generally recommended to set it to 'n' or 'N'
1902 for nucleotides and to 'X' for protein.
1903 Returns : An missing_char string,
1904 Argument : An missing_char string (optional)
1906 =cut
1908 sub missing_char {
1909 my ($self, $char) = @_;
1911 if (defined $char ) {
1912 $self->throw("Single missing character, not [$char]!") if CORE::length($char) > 1;
1913 $self->{'_missing_char'} = $char;
1916 return $self->{'_missing_char'};
1919 =head2 match_char
1921 Title : match_char
1922 Usage : $myalign->match_char('.')
1923 Function : Gets/sets the match_char attribute of the alignment
1924 Returns : An match_char string,
1925 Argument : An match_char string (optional)
1927 =cut
1929 sub match_char {
1930 my ($self, $char) = @_;
1932 if (defined $char ) {
1933 $self->throw("Single match character, not [$char]!") if CORE::length($char) > 1;
1934 $self->{'_match_char'} = $char;
1937 return $self->{'_match_char'};
1940 =head2 gap_char
1942 Title : gap_char
1943 Usage : $myalign->gap_char('-')
1944 Function : Gets/sets the gap_char attribute of the alignment
1945 Returns : An gap_char string, defaults to '-'
1946 Argument : An gap_char string (optional)
1948 =cut
1950 sub gap_char {
1951 my ($self, $char) = @_;
1953 if (defined $char || ! defined $self->{'_gap_char'} ) {
1954 $char= '-' unless defined $char;
1955 $self->throw("Single gap character, not [$char]!") if CORE::length($char) > 1;
1956 $self->{'_gap_char'} = $char;
1958 return $self->{'_gap_char'};
1961 =head2 symbol_chars
1963 Title : symbol_chars
1964 Usage : my @symbolchars = $aln->symbol_chars;
1965 Function: Returns all the seen symbols (other than gaps)
1966 Returns : array of characters that are the seen symbols
1967 Args : boolean to include the gap/missing/match characters
1969 =cut
1971 sub symbol_chars{
1972 my ($self,$includeextra) = @_;
1974 unless ($self->{'_symbols'}) {
1975 foreach my $seq ($self->each_seq) {
1976 map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq);
1979 my %copy = %{$self->{'_symbols'}};
1980 if( ! $includeextra ) {
1981 foreach my $char ( $self->gap_char, $self->match_char,
1982 $self->missing_char) {
1983 delete $copy{$char} if( defined $char );
1986 return keys %copy;
1989 =head1 Alignment descriptors
1991 These read only methods describe the MSA in various ways.
1994 =head2 score
1996 Title : score
1997 Usage : $str = $ali->score()
1998 Function : get/set a score of the alignment
1999 Returns : a score for the alignment
2000 Argument : an optional score to set
2002 =cut
2004 sub score {
2005 my $self = shift;
2006 $self->{score} = shift if @_;
2007 return $self->{score};
2010 =head2 consensus_string
2012 Title : consensus_string
2013 Usage : $str = $ali->consensus_string($threshold_percent)
2014 Function : Makes a strict consensus
2015 Returns : Consensus string
2016 Argument : Optional treshold ranging from 0 to 100.
2017 The consensus residue has to appear at least threshold %
2018 of the sequences at a given location, otherwise a '?'
2019 character will be placed at that location.
2020 (Default value = 0%)
2022 =cut
2024 sub consensus_string {
2025 my $self = shift;
2026 my $threshold = shift;
2028 my $out = "";
2029 my $len = $self->length - 1;
2031 foreach ( 0 .. $len ) {
2032 $out .= $self->_consensus_aa($_,$threshold);
2034 return $out;
2037 sub _consensus_aa {
2038 my $self = shift;
2039 my $point = shift;
2040 my $threshold_percent = shift || -1 ;
2041 my ($seq,%hash,$count,$letter,$key);
2042 my $gapchar = $self->gap_char;
2043 foreach $seq ( $self->each_seq() ) {
2044 $letter = substr($seq->seq,$point,1);
2045 $self->throw("--$point-----------") if $letter eq '';
2046 ($letter eq $gapchar || $letter =~ /\./) && next;
2047 # print "Looking at $letter\n";
2048 $hash{$letter}++;
2050 my $number_of_sequences = $self->num_sequences();
2051 my $threshold = $number_of_sequences * $threshold_percent / 100. ;
2052 $count = -1;
2053 $letter = '?';
2055 foreach $key ( sort keys %hash ) {
2056 # print "Now at $key $hash{$key}\n";
2057 if( $hash{$key} > $count && $hash{$key} >= $threshold) {
2058 $letter = $key;
2059 $count = $hash{$key};
2062 return $letter;
2066 =head2 consensus_iupac
2068 Title : consensus_iupac
2069 Usage : $str = $ali->consensus_iupac()
2070 Function : Makes a consensus using IUPAC ambiguity codes from DNA
2071 and RNA. The output is in upper case except when gaps in
2072 a column force output to be in lower case.
2074 Note that if your alignment sequences contain a lot of
2075 IUPAC ambiquity codes you often have to manually set
2076 alphabet. Bio::PrimarySeq::_guess_type thinks they
2077 indicate a protein sequence.
2078 Returns : consensus string
2079 Argument : none
2080 Throws : on protein sequences
2082 =cut
2084 sub consensus_iupac {
2085 my $self = shift;
2086 my $out = "";
2087 my $len = $self->length-1;
2089 # only DNA and RNA sequences are valid
2090 foreach my $seq ( $self->each_seq() ) {
2091 $self->throw("Seq [". $seq->get_nse. "] is a protein")
2092 if $seq->alphabet eq 'protein';
2094 # loop over the alignment columns
2095 foreach my $count ( 0 .. $len ) {
2096 $out .= $self->_consensus_iupac($count);
2098 return $out;
2101 sub _consensus_iupac {
2102 my ($self, $column) = @_;
2103 my ($string, $char, $rna);
2105 #determine all residues in a column
2106 foreach my $seq ( $self->each_seq() ) {
2107 $string .= substr($seq->seq, $column, 1);
2109 $string = uc $string;
2111 # quick exit if there's an N in the string
2112 if ($string =~ /N/) {
2113 $string =~ /\W/ ? return 'n' : return 'N';
2115 # ... or if there are only gap characters
2116 return '-' if $string =~ /^\W+$/;
2118 # treat RNA as DNA in regexps
2119 if ($string =~ /U/) {
2120 $string =~ s/U/T/;
2121 $rna = 1;
2124 # the following s///'s only need to be done to the _first_ ambiguity code
2125 # as we only need to see the _range_ of characters in $string
2127 if ($string =~ /[VDHB]/) {
2128 $string =~ s/V/AGC/;
2129 $string =~ s/D/AGT/;
2130 $string =~ s/H/ACT/;
2131 $string =~ s/B/CTG/;
2134 if ($string =~ /[SKYRWM]/) {
2135 $string =~ s/S/GC/;
2136 $string =~ s/K/GT/;
2137 $string =~ s/Y/CT/;
2138 $string =~ s/R/AG/;
2139 $string =~ s/W/AT/;
2140 $string =~ s/M/AC/;
2143 # and now the guts of the thing
2145 if ($string =~ /A/) {
2146 $char = 'A'; # A A
2147 if ($string =~ /G/) {
2148 $char = 'R'; # A and G (purines) R
2149 if ($string =~ /C/) {
2150 $char = 'V'; # A and G and C V
2151 if ($string =~ /T/) {
2152 $char = 'N'; # A and G and C and T N
2154 } elsif ($string =~ /T/) {
2155 $char = 'D'; # A and G and T D
2157 } elsif ($string =~ /C/) {
2158 $char = 'M'; # A and C M
2159 if ($string =~ /T/) {
2160 $char = 'H'; # A and C and T H
2162 } elsif ($string =~ /T/) {
2163 $char = 'W'; # A and T W
2165 } elsif ($string =~ /C/) {
2166 $char = 'C'; # C C
2167 if ($string =~ /T/) {
2168 $char = 'Y'; # C and T (pyrimidines) Y
2169 if ($string =~ /G/) {
2170 $char = 'B'; # C and T and G B
2172 } elsif ($string =~ /G/) {
2173 $char = 'S'; # C and G S
2175 } elsif ($string =~ /G/) {
2176 $char = 'G'; # G G
2177 if ($string =~ /C/) {
2178 $char = 'S'; # G and C S
2179 } elsif ($string =~ /T/) {
2180 $char = 'K'; # G and T K
2182 } elsif ($string =~ /T/) {
2183 $char = 'T'; # T T
2186 $char = 'U' if $rna and $char eq 'T';
2187 $char = lc $char if $string =~ /\W/;
2189 return $char;
2193 =head2 consensus_meta
2195 Title : consensus_meta
2196 Usage : $seqmeta = $ali->consensus_meta()
2197 Function : Returns a Bio::Seq::Meta object containing the consensus
2198 strings derived from meta data analysis.
2199 Returns : Bio::Seq::Meta
2200 Argument : Bio::Seq::Meta
2201 Throws : non-MetaI object
2203 =cut
2205 sub consensus_meta {
2206 my ($self, $meta) = @_;
2207 if ($meta && (!ref $meta || !$meta->isa('Bio::Seq::MetaI'))) {
2208 $self->throw('Not a Bio::Seq::MetaI object');
2210 return $self->{'_aln_meta'} = $meta if $meta;
2211 return $self->{'_aln_meta'}
2214 =head2 is_flush
2216 Title : is_flush
2217 Usage : if ( $ali->is_flush() )
2218 Function : Tells you whether the alignment
2219 : is flush, i.e. all of the same length
2220 Returns : 1 or 0
2221 Argument :
2223 =cut
2225 sub is_flush {
2226 my ($self,$report) = @_;
2227 my $seq;
2228 my $length = (-1);
2229 my $temp;
2231 foreach $seq ( $self->each_seq() ) {
2232 if( $length == (-1) ) {
2233 $length = CORE::length($seq->seq());
2234 next;
2237 $temp = CORE::length($seq->seq());
2238 if( $temp != $length ) {
2239 $self->warn("expecting $length not $temp from ".
2240 $seq->display_id) if( $report );
2241 $self->debug("expecting $length not $temp from ".
2242 $seq->display_id);
2243 $self->debug($seq->seq(). "\n");
2244 return 0;
2248 return 1;
2252 =head2 length
2254 Title : length()
2255 Usage : $len = $ali->length()
2256 Function : Returns the maximum length of the alignment.
2257 To be sure the alignment is a block, use is_flush
2258 Returns : Integer
2259 Argument :
2261 =cut
2263 sub length_aln {
2264 my $self = shift;
2265 $self->deprecated("length_aln - deprecated method. Use length() instead.");
2266 $self->length(@_);
2269 sub length {
2270 my $self = shift;
2271 my $seq;
2272 my $length = -1;
2273 my $temp;
2275 foreach $seq ( $self->each_seq() ) {
2276 $temp = $seq->length();
2277 if( $temp > $length ) {
2278 $length = $temp;
2282 return $length;
2286 =head2 maxdisplayname_length
2288 Title : maxdisplayname_length
2289 Usage : $ali->maxdisplayname_length()
2290 Function : Gets the maximum length of the displayname in the
2291 alignment. Used in writing out various MSA formats.
2292 Returns : integer
2293 Argument :
2295 =cut
2297 sub maxname_length {
2298 my $self = shift;
2299 $self->deprecated("maxname_length - deprecated method.".
2300 " Use maxdisplayname_length() instead.");
2301 $self->maxdisplayname_length();
2304 sub maxnse_length {
2305 my $self = shift;
2306 $self->deprecated("maxnse_length - deprecated method.".
2307 " Use maxnse_length() instead.");
2308 $self->maxdisplayname_length();
2311 sub maxdisplayname_length {
2312 my $self = shift;
2313 my $maxname = (-1);
2314 my ($seq,$len);
2316 foreach $seq ( $self->each_seq() ) {
2317 $len = CORE::length $self->displayname($seq->get_nse());
2319 if( $len > $maxname ) {
2320 $maxname = $len;
2324 return $maxname;
2327 =head2 max_metaname_length
2329 Title : max_metaname_length
2330 Usage : $ali->max_metaname_length()
2331 Function : Gets the maximum length of the meta name tags in the
2332 alignment for the sequences and for the alignment.
2333 Used in writing out various MSA formats.
2334 Returns : integer
2335 Argument : None
2337 =cut
2339 sub max_metaname_length {
2340 my $self = shift;
2341 my $maxname = (-1);
2342 my ($seq,$len);
2344 # check seq meta first
2345 for $seq ( $self->each_seq() ) {
2346 next if !$seq->isa('Bio::Seq::MetaI' || !$seq->meta_names);
2347 for my $mtag ($seq->meta_names) {
2348 $len = CORE::length $mtag;
2349 if( $len > $maxname ) {
2350 $maxname = $len;
2355 # alignment meta
2356 for my $meta ($self->consensus_meta) {
2357 next unless $meta;
2358 for my $name ($meta->meta_names) {
2359 $len = CORE::length $name;
2360 if( $len > $maxname ) {
2361 $maxname = $len;
2366 return $maxname;
2369 =head2 num_residues
2371 Title : num_residues
2372 Usage : $no = $ali->num_residues
2373 Function : number of residues in total in the alignment
2374 Returns : integer
2375 Argument :
2376 Note : replaces no_residues()
2378 =cut
2380 sub num_residues {
2381 my $self = shift;
2382 my $count = 0;
2384 foreach my $seq ($self->each_seq) {
2385 my $str = $seq->seq();
2387 $count += ($str =~ s/[A-Za-z]//g);
2390 return $count;
2393 =head2 num_sequences
2395 Title : num_sequences
2396 Usage : $depth = $ali->num_sequences
2397 Function : number of sequence in the sequence alignment
2398 Returns : integer
2399 Argument : none
2400 Note : replaces no_sequences()
2402 =cut
2404 sub num_sequences {
2405 my $self = shift;
2406 return scalar($self->each_seq);
2410 =head2 average_percentage_identity
2412 Title : average_percentage_identity
2413 Usage : $id = $align->average_percentage_identity
2414 Function: The function uses a fast method to calculate the average
2415 percentage identity of the alignment
2416 Returns : The average percentage identity of the alignment
2417 Args : None
2418 Notes : This method implemented by Kevin Howe calculates a figure that is
2419 designed to be similar to the average pairwise identity of the
2420 alignment (identical in the absence of gaps), without having to
2421 explicitly calculate pairwise identities proposed by Richard Durbin.
2422 Validated by Ewan Birney ad Alex Bateman.
2424 =cut
2426 sub average_percentage_identity{
2427 my ($self,@args) = @_;
2429 my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
2430 'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
2432 my ($len, $total, $subtotal, $divisor, $subdivisor, @seqs, @countHashes);
2434 if (! $self->is_flush()) {
2435 $self->throw("All sequences in the alignment must be the same length");
2438 @seqs = $self->each_seq();
2439 $len = $self->length();
2441 # load the each hash with correct keys for existence checks
2443 for( my $index=0; $index < $len; $index++) {
2444 foreach my $letter (@alphabet) {
2445 $countHashes[$index]->{$letter} = 0;
2448 foreach my $seq (@seqs) {
2449 my @seqChars = split //, $seq->seq();
2450 for( my $column=0; $column < @seqChars; $column++ ) {
2451 my $char = uc($seqChars[$column]);
2452 if (exists $countHashes[$column]->{$char}) {
2453 $countHashes[$column]->{$char}++;
2458 $total = 0;
2459 $divisor = 0;
2460 for(my $column =0; $column < $len; $column++) {
2461 my %hash = %{$countHashes[$column]};
2462 $subdivisor = 0;
2463 foreach my $res (keys %hash) {
2464 $total += $hash{$res}*($hash{$res} - 1);
2465 $subdivisor += $hash{$res};
2467 $divisor += $subdivisor * ($subdivisor - 1);
2469 return $divisor > 0 ? ($total / $divisor )*100.0 : 0;
2472 =head2 percentage_identity
2474 Title : percentage_identity
2475 Usage : $id = $align->percentage_identity
2476 Function: The function calculates the average percentage identity
2477 (aliased to average_percentage_identity)
2478 Returns : The average percentage identity
2479 Args : None
2481 =cut
2483 sub percentage_identity {
2484 my $self = shift;
2485 return $self->average_percentage_identity();
2488 =head2 overall_percentage_identity
2490 Title : overall_percentage_identity
2491 Usage : $id = $align->overall_percentage_identity
2492 $id = $align->overall_percentage_identity('short')
2493 Function: The function calculates the percentage identity of
2494 the conserved columns
2495 Returns : The percentage identity of the conserved columns
2496 Args : length value to use, optional defaults to alignment length
2497 possible values: 'align', 'short', 'long'
2499 The argument values 'short' and 'long' refer to shortest and longest
2500 sequence in the alignment. Method modification code by Hongyu Zhang.
2502 =cut
2504 sub overall_percentage_identity{
2505 my ($self, $length_measure) = @_;
2507 my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
2508 'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
2510 my ($len, $total, @seqs, @countHashes);
2512 my %enum = map {$_ => 1} qw (align short long);
2514 $self->throw("Unknown argument [$length_measure]")
2515 if $length_measure and not $enum{$length_measure};
2516 $length_measure ||= 'align';
2518 if (! $self->is_flush()) {
2519 $self->throw("All sequences in the alignment must be the same length");
2522 @seqs = $self->each_seq();
2523 $len = $self->length();
2525 # load the each hash with correct keys for existence checks
2526 for( my $index=0; $index < $len; $index++) {
2527 foreach my $letter (@alphabet) {
2528 $countHashes[$index]->{$letter} = 0;
2531 foreach my $seq (@seqs) {
2532 my @seqChars = split //, $seq->seq();
2533 for( my $column=0; $column < @seqChars; $column++ ) {
2534 my $char = uc($seqChars[$column]);
2535 if (exists $countHashes[$column]->{$char}) {
2536 $countHashes[$column]->{$char}++;
2541 $total = 0;
2542 for(my $column =0; $column < $len; $column++) {
2543 my %hash = %{$countHashes[$column]};
2544 foreach ( values %hash ) {
2545 next if( $_ == 0 );
2546 $total++ if( $_ == scalar @seqs );
2547 last;
2551 if ($length_measure eq 'short') {
2552 ## find the shortest length
2553 $len = 0;
2554 foreach my $seq ($self->each_seq) {
2555 my $count = $seq->seq =~ tr/[A-Za-z]//;
2556 if ($len) {
2557 $len = $count if $count < $len;
2558 } else {
2559 $len = $count;
2563 elsif ($length_measure eq 'long') {
2564 ## find the longest length
2565 $len = 0;
2566 foreach my $seq ($self->each_seq) {
2567 my $count = $seq->seq =~ tr/[A-Za-z]//;
2568 if ($len) {
2569 $len = $count if $count > $len;
2570 } else {
2571 $len = $count;
2576 return ($total / $len ) * 100.0;
2581 =head1 Alignment positions
2583 Methods to map a sequence position into an alignment column and back.
2584 column_from_residue_number() does the former. The latter is really a
2585 property of the sequence object and can done using
2586 L<Bio::LocatableSeq::location_from_column>:
2588 # select somehow a sequence from the alignment, e.g.
2589 my $seq = $aln->get_seq_by_pos(1);
2590 #$loc is undef or Bio::LocationI object
2591 my $loc = $seq->location_from_column(5);
2593 =head2 column_from_residue_number
2595 Title : column_from_residue_number
2596 Usage : $col = $ali->column_from_residue_number( $seqname, $resnumber)
2597 Function: This function gives the position in the alignment
2598 (i.e. column number) of the given residue number in the
2599 sequence with the given name. For example, for the
2600 alignment
2602 Seq1/91-97 AC..DEF.GH.
2603 Seq2/24-30 ACGG.RTY...
2604 Seq3/43-51 AC.DDEF.GHI
2606 column_from_residue_number( "Seq1", 94 ) returns 6.
2607 column_from_residue_number( "Seq2", 25 ) returns 2.
2608 column_from_residue_number( "Seq3", 50 ) returns 10.
2610 An exception is thrown if the residue number would lie
2611 outside the length of the aligment
2612 (e.g. column_from_residue_number( "Seq2", 22 )
2614 Note: If the the parent sequence is represented by more than
2615 one alignment sequence and the residue number is present in
2616 them, this method finds only the first one.
2618 Returns : A column number for the position in the alignment of the
2619 given residue in the given sequence (1 = first column)
2620 Args : A sequence id/name (not a name/start-end)
2621 A residue number in the whole sequence (not just that
2622 segment of it in the alignment)
2624 =cut
2626 sub column_from_residue_number {
2627 my ($self, $name, $resnumber) = @_;
2629 $self->throw("No sequence with name [$name]") unless $self->{'_start_end_lists'}->{$name};
2630 $self->throw("Second argument residue number missing") unless $resnumber;
2632 foreach my $seq ($self->each_seq_with_id($name)) {
2633 my $col;
2634 eval {
2635 $col = $seq->column_from_residue_number($resnumber);
2637 next if $@;
2638 return $col;
2641 $self->throw("Could not find a sequence segment in $name ".
2642 "containing residue number $resnumber");
2646 =head1 Sequence names
2648 Methods to manipulate the display name. The default name based on the
2649 sequence id and subsequence positions can be overridden in various
2650 ways.
2652 =head2 displayname
2654 Title : displayname
2655 Usage : $myalign->displayname("Ig", "IgA")
2656 Function : Gets/sets the display name of a sequence in the alignment
2657 Returns : A display name string
2658 Argument : name of the sequence
2659 displayname of the sequence (optional)
2661 =cut
2663 sub get_displayname {
2664 my $self = shift;
2665 $self->deprecated("get_displayname - deprecated method. Use displayname() instead.");
2666 $self->displayname(@_);
2669 sub set_displayname {
2670 my $self = shift;
2671 $self->deprecated("set_displayname - deprecated method. Use displayname() instead.");
2672 $self->displayname(@_);
2675 sub displayname {
2676 my ($self, $name, $disname) = @_;
2678 $self->throw("No sequence with name [$name]")
2679 unless defined $self->{'_seq'}->{$name};
2681 if( $disname and $name) {
2682 $self->{'_dis_name'}->{$name} = $disname;
2683 return $disname;
2685 elsif( defined $self->{'_dis_name'}->{$name} ) {
2686 return $self->{'_dis_name'}->{$name};
2687 } else {
2688 return $name;
2692 =head2 set_displayname_count
2694 Title : set_displayname_count
2695 Usage : $ali->set_displayname_count
2696 Function : Sets the names to be name_# where # is the number of
2697 times this name has been used.
2698 Returns : 1, on success
2699 Argument :
2701 =cut
2703 sub set_displayname_count {
2704 my $self= shift;
2705 my (@arr,$name,$seq,$count,$temp,$nse);
2707 foreach $seq ( $self->each_alphabetically() ) {
2708 $nse = $seq->get_nse();
2710 #name will be set when this is the second
2711 #time (or greater) is has been seen
2713 if( defined $name and $name eq ($seq->id()) ) {
2714 $temp = sprintf("%s_%s",$name,$count);
2715 $self->displayname($nse,$temp);
2716 $count++;
2717 } else {
2718 $count = 1;
2719 $name = $seq->id();
2720 $temp = sprintf("%s_%s",$name,$count);
2721 $self->displayname($nse,$temp);
2722 $count++;
2725 return 1;
2728 =head2 set_displayname_flat
2730 Title : set_displayname_flat
2731 Usage : $ali->set_displayname_flat()
2732 Function : Makes all the sequences be displayed as just their name,
2733 not name/start-end
2734 Returns : 1
2735 Argument :
2737 =cut
2739 sub set_displayname_flat {
2740 my $self = shift;
2741 my ($nse,$seq);
2743 foreach $seq ( $self->each_seq() ) {
2744 $nse = $seq->get_nse();
2745 $self->displayname($nse,$seq->id());
2747 return 1;
2750 =head2 set_displayname_normal
2752 Title : set_displayname_normal
2753 Usage : $ali->set_displayname_normal()
2754 Function : Makes all the sequences be displayed as name/start-end
2755 Returns : 1, on success
2756 Argument :
2758 =cut
2760 sub set_displayname_normal {
2761 my $self = shift;
2762 my ($nse,$seq);
2764 foreach $seq ( $self->each_seq() ) {
2765 $nse = $seq->get_nse();
2766 $self->displayname($nse,$nse);
2768 return 1;
2771 =head2 source
2773 Title : source
2774 Usage : $obj->source($newval)
2775 Function: sets the Alignment source program
2776 Example :
2777 Returns : value of source
2778 Args : newvalue (optional)
2781 =cut
2783 sub source{
2784 my ($self,$value) = @_;
2785 if( defined $value) {
2786 $self->{'_source'} = $value;
2788 return $self->{'_source'};
2791 =head2 set_displayname_safe
2793 Title : set_displayname_safe
2794 Usage : ($new_aln, $ref_name)=$ali->set_displayname_safe(4)
2795 Function : Assign machine-generated serial names to sequences in input order.
2796 Designed to protect names during PHYLIP runs. Assign 10-char string
2797 in the form of "S000000001" to "S999999999". Restore the original
2798 names using "restore_displayname".
2799 Returns : 1. a new $aln with system names;
2800 2. a hash ref for restoring names
2801 Argument : Number for id length (default 10)
2803 =cut
2805 sub set_displayname_safe {
2806 my $self = shift;
2807 my $idlength = shift || 10;
2808 my ($seq, %phylip_name);
2809 my $ct=0;
2810 my $new=Bio::SimpleAlign->new();
2811 foreach $seq ( $self->each_seq() ) {
2812 $ct++;
2813 my $pname="S". sprintf "%0" . ($idlength-1) . "s", $ct;
2814 $phylip_name{$pname}=$seq->id();
2815 my $new_seq= Bio::LocatableSeq->new(-id => $pname,
2816 -seq => $seq->seq(),
2817 -alphabet => $seq->alphabet,
2818 -start => $seq->{_start},
2819 -end => $seq->{_end}
2821 $new->add_seq($new_seq);
2824 $self->debug("$ct seq names changed. Restore names by using restore_displayname.");
2825 return ($new, \%phylip_name);
2828 =head2 restore_displayname
2830 Title : restore_displayname
2831 Usage : $aln_name_restored=$ali->restore_displayname($hash_ref)
2832 Function : Restore original sequence names (after running
2833 $ali->set_displayname_safe)
2834 Returns : a new $aln with names restored.
2835 Argument : a hash reference of names from "set_displayname_safe".
2837 =cut
2839 sub restore_displayname {
2840 my $self = shift;
2841 my $ref=shift;
2842 my %name=%$ref;
2843 my $new=Bio::SimpleAlign->new();
2844 foreach my $seq ( $self->each_seq() ) {
2845 $self->throw("No sequence with name") unless defined $name{$seq->id()};
2846 my $new_seq= Bio::LocatableSeq->new(-id => $name{$seq->id()},
2847 -seq => $seq->seq(),
2848 -alphabet => $seq->alphabet,
2849 -start => $seq->{_start},
2850 -end => $seq->{_end}
2852 $new->add_seq($new_seq);
2854 return $new;
2857 =head2 sort_by_start
2859 Title : sort_by_start
2860 Usage : $ali->sort_by_start
2861 Function : Changes the order of the alignment to the start position of each
2862 subalignment
2863 Returns :
2864 Argument :
2866 =cut
2868 sub sort_by_start {
2869 my $self = shift;
2870 my ($seq,$nse,@arr,%hash,$count);
2871 foreach $seq ( $self->each_seq() ) {
2872 $nse = $seq->get_nse;
2873 $hash{$nse} = $seq;
2875 $count = 0;
2876 %{$self->{'_order'}} = (); # reset the hash;
2877 foreach $nse ( sort _startend keys %hash) {
2878 $self->{'_order'}->{$count} = $nse;
2879 $count++;
2884 sub _startend
2886 my ($aname,$arange) = split (/[\/]/,$a);
2887 my ($bname,$brange) = split (/[\/]/,$b);
2888 my ($astart,$aend) = split(/\-/,$arange);
2889 my ($bstart,$bend) = split(/\-/,$brange);
2890 return $astart <=> $bstart;
2893 =head2 bracket_string
2895 Title : bracket_string
2896 Usage : my @params = (-refseq => 'testseq',
2897 -allele1 => 'allele1',
2898 -allele2 => 'allele2',
2899 -delimiters => '{}',
2900 -separator => '/');
2901 $str = $aln->bracket_string(@params)
2903 Function : When supplied with a list of parameters (see below), returns a
2904 string in BIC format. This is used for allelic comparisons.
2905 Briefly, if either allele contains a base change when compared to
2906 the refseq, the base or gap for each allele is represented in
2907 brackets in the order present in the 'alleles' parameter.
2909 For the following data:
2911 >testseq
2912 GGATCCATTGCTACT
2913 >allele1
2914 GGATCCATTCCTACT
2915 >allele2
2916 GGAT--ATTCCTCCT
2918 the returned string with parameters 'refseq => testseq' and
2919 'alleles => [qw(allele1 allele2)]' would be:
2921 GGAT[C/-][C/-]ATT[C/C]CT[A/C]CT
2922 Returns : BIC-formatted string
2923 Argument : Required args
2924 refseq : string (ID) of the reference sequence used
2925 as basis for comparison
2926 allele1 : string (ID) of the first allele
2927 allele2 : string (ID) of the second allele
2928 Optional args
2929 delimiters: two symbol string of left and right delimiters.
2930 Only the first two symbols are used
2931 default = '[]'
2932 separator : string used as a separator. Only the first
2933 symbol is used
2934 default = '/'
2935 Throws : On no refseq/alleles, or invalid refseq/alleles.
2937 =cut
2939 sub bracket_string {
2940 my ($self, @args) = @_;
2941 my ($ref, $a1, $a2, $delim, $sep) =
2942 $self->_rearrange([qw(refseq allele1 allele2 delimiters separator)], @args);
2943 $self->throw('Missing refseq/allele1/allele2') if (!$a1 || !$a2 || !$ref);
2944 my ($ld, $rd);
2945 ($ld, $rd) = split('', $delim, 2) if $delim;
2946 $ld ||= '[';
2947 $rd ||= ']';
2948 $sep ||= '/';
2949 my ($refseq, $allele1, $allele2) =
2950 map {( $self->each_seq_with_id($_) )} ($ref, $a1, $a2);
2951 if (!$refseq || !$allele1 || !$allele2) {
2952 $self->throw("One of your refseq/allele IDs is invalid!");
2954 my $len = $self->length-1;
2955 my $bic = '';
2956 # loop over the alignment columns
2957 for my $column ( 0 .. $len ) {
2958 my $string;
2959 my ($compres, $res1, $res2) =
2960 map{substr($_->seq, $column, 1)} ($refseq, $allele1, $allele2);
2961 # are any of the allele symbols different from the refseq?
2962 $string = ($compres eq $res1 && $compres eq $res2) ? $compres :
2963 $ld.$res1.$sep.$res2.$rd;
2964 $bic .= $string;
2966 return $bic;
2970 =head2 methods implementing Bio::FeatureHolderI
2972 FeatureHolderI implementation to support labeled character sets like one
2973 would get from NEXUS represented data.
2975 =head2 get_SeqFeatures
2977 Usage : @features = $aln->get_SeqFeatures
2978 Function: Get the feature objects held by this feature holder.
2979 Example :
2980 Returns : an array of Bio::SeqFeatureI implementing objects
2981 Args : optional filter coderef, taking a Bio::SeqFeatureI
2982 : as argument, returning TRUE if wanted, FALSE if
2983 : unwanted
2985 =cut
2987 sub get_SeqFeatures {
2988 my $self = shift;
2989 my $filter_cb = shift;
2990 $self->throw("Arg (filter callback) must be a coderef") unless
2991 !defined($filter_cb) or ref($filter_cb) eq 'CODE';
2992 if( !defined $self->{'_as_feat'} ) {
2993 $self->{'_as_feat'} = [];
2995 if ($filter_cb) {
2996 return grep { $filter_cb->($_) } @{$self->{'_as_feat'}};
2998 return @{$self->{'_as_feat'}};
3001 =head2 add_SeqFeature
3003 Usage : $aln->add_SeqFeature($subfeat);
3004 Function: adds a SeqFeature into the SeqFeature array.
3005 Example :
3006 Returns : true on success
3007 Args : a Bio::SeqFeatureI object
3008 Note : This implementation is not compliant
3009 with Bio::FeatureHolderI
3011 =cut
3013 sub add_SeqFeature {
3014 my ($self,@feat) = @_;
3016 $self->{'_as_feat'} = [] unless $self->{'_as_feat'};
3018 foreach my $feat ( @feat ) {
3019 if( !$feat->isa("Bio::SeqFeatureI") ) {
3020 $self->throw("$feat is not a SeqFeatureI and that's what we expect...");
3023 push(@{$self->{'_as_feat'}},$feat);
3025 return 1;
3029 =head2 remove_SeqFeatures
3031 Usage : $obj->remove_SeqFeatures
3032 Function: Removes all SeqFeatures. If you want to remove only a subset,
3033 remove that subset from the returned array, and add back the rest.
3034 Returns : The array of Bio::SeqFeatureI features that was
3035 deleted from this alignment.
3036 Args : none
3038 =cut
3040 sub remove_SeqFeatures {
3041 my $self = shift;
3043 return () unless $self->{'_as_feat'};
3044 my @feats = @{$self->{'_as_feat'}};
3045 $self->{'_as_feat'} = [];
3046 return @feats;
3049 =head2 feature_count
3051 Title : feature_count
3052 Usage : $obj->feature_count()
3053 Function: Return the number of SeqFeatures attached to the alignment
3054 Returns : integer representing the number of SeqFeatures
3055 Args : None
3057 =cut
3059 sub feature_count {
3060 my ($self) = @_;
3062 if (defined($self->{'_as_feat'})) {
3063 return ($#{$self->{'_as_feat'}} + 1);
3064 } else {
3065 return 0;
3069 =head2 get_all_SeqFeatures
3071 Title : get_all_SeqFeatures
3072 Usage :
3073 Function: Get all SeqFeatures.
3074 Example :
3075 Returns : an array of Bio::SeqFeatureI implementing objects
3076 Args : none
3077 Note : Falls through to Bio::FeatureHolderI implementation.
3079 =cut
3081 =head2 methods for Bio::AnnotatableI
3083 AnnotatableI implementation to support sequence alignments which
3084 contain annotation (NEXUS, Stockholm).
3086 =head2 annotation
3088 Title : annotation
3089 Usage : $ann = $aln->annotation or
3090 $aln->annotation($ann)
3091 Function: Gets or sets the annotation
3092 Returns : Bio::AnnotationCollectionI object
3093 Args : None or Bio::AnnotationCollectionI object
3095 See L<Bio::AnnotationCollectionI> and L<Bio::Annotation::Collection>
3096 for more information
3098 =cut
3100 sub annotation {
3101 my ($obj,$value) = @_;
3102 if( defined $value ) {
3103 $obj->throw("object of class ".ref($value)." does not implement ".
3104 "Bio::AnnotationCollectionI. Too bad.")
3105 unless $value->isa("Bio::AnnotationCollectionI");
3106 $obj->{'_annotation'} = $value;
3107 } elsif( ! defined $obj->{'_annotation'}) {
3108 $obj->{'_annotation'} = Bio::Annotation::Collection->new();
3110 return $obj->{'_annotation'};
3113 =head1 Deprecated methods
3115 =cut
3117 =head2 no_residues
3119 Title : no_residues
3120 Usage : $no = $ali->no_residues
3121 Function : number of residues in total in the alignment
3122 Returns : integer
3123 Argument :
3124 Note : deprecated in favor of num_residues()
3126 =cut
3128 sub no_residues {
3129 my $self = shift;
3130 $self->deprecated(-warn_version => 1.0069,
3131 -throw_version => 1.0075,
3132 -message => 'Use of method no_residues() is deprecated, use num_residues() instead');
3133 $self->num_residues(@_);
3136 =head2 no_sequences
3138 Title : no_sequences
3139 Usage : $depth = $ali->no_sequences
3140 Function : number of sequence in the sequence alignment
3141 Returns : integer
3142 Argument :
3143 Note : deprecated in favor of num_sequences()
3145 =cut
3147 sub no_sequences {
3148 my $self = shift;
3149 $self->deprecated(-warn_version => 1.0069,
3150 -throw_version => 1.0075,
3151 -message => 'Use of method no_sequences() is deprecated, use num_sequences() instead');
3152 $self->num_sequences(@_);
3155 =head2 mask_columns
3157 Title : mask_columns
3158 Usage : $aln2 = $aln->mask_columns(20,30)
3159 Function : Masks a slice of the alignment inclusive of start and
3160 end columns, and the first column in the alignment is denoted 1.
3161 Mask beyond the length of the sequence does not do padding.
3162 Returns : A Bio::SimpleAlign object
3163 Args : Positive integer for start column, positive integer for end column,
3164 optional string value use for the mask. Example:
3166 $aln2 = $aln->mask_columns(20,30,'?')
3167 Note : Masking must use a character that is not used for gaps or
3168 frameshifts. These can be adjusted using the relevant global
3169 variables, but be aware these may be (uncontrollably) modified
3170 elsewhere within BioPerl (see bug 2715)
3172 =cut
3174 sub mask_columns {
3175 #based on slice(), but did not include the Bio::Seq::Meta sections as I was not sure what it is doing
3176 my $self = shift;
3178 my $nonres = $Bio::LocatableSeq::GAP_SYMBOLS.
3179 $Bio::LocatableSeq::FRAMESHIFT_SYMBOLS;
3181 # coordinates are alignment-based, not sequence-based
3182 my ($start, $end, $mask_char) = @_;
3183 unless (defined $mask_char) { $mask_char = 'N' }
3185 $self->throw("Mask start has to be a positive integer and less than ".
3186 "alignment length, not [$start]")
3187 unless $start =~ /^\d+$/ && $start > 0 && $start <= $self->length;
3188 $self->throw("Mask end has to be a positive integer and less than ".
3189 "alignment length, not [$end]")
3190 unless $end =~ /^\d+$/ && $end > 0 && $end <= $self->length;
3191 $self->throw("Mask start [$start] has to be smaller than or equal to ".
3192 "end [$end]") unless $start <= $end;
3193 $self->throw("Mask character $mask_char has to be a single character ".
3194 "and not a gap or frameshift symbol")
3195 unless CORE::length($mask_char) == 1 && $mask_char !~ m{$nonres};
3197 my $aln = $self->new;
3198 $aln->id($self->id);
3199 foreach my $seq ( $self->each_seq() ) {
3200 my $new_seq = Bio::LocatableSeq->new(-id => $seq->id,
3201 -alphabet => $seq->alphabet,
3202 -strand => $seq->strand,
3203 -verbose => $self->verbose);
3205 # convert from 1-based alignment coords!
3206 my $masked_string = substr($seq->seq, $start - 1, $end - $start + 1);
3207 $masked_string =~ s{[^$nonres]}{$mask_char}g;
3208 my $new_dna_string = substr($seq->seq,0,$start-1) . $masked_string . substr($seq->seq,$end);
3209 $new_seq->seq($new_dna_string);
3210 $aln->add_seq($new_seq);
3212 return $aln;