sync with main trunk
[bioperl-live.git] / Bio / SimpleAlign.pm
blob5c012e875c2657643e5dfd5bccaf4cffe53bf341
1 # $Id$
2 # BioPerl module for SimpleAlign
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Heikki Lehvaslaiho <heikki-at-bioperl-dot-org>
8 # Copyright Ewan Birney
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 # History:
15 # 11/3/00 Added threshold feature to consensus and consensus_aa - PS
16 # May 2001 major rewrite - Heikki Lehvaslaiho
18 =head1 NAME
20 Bio::SimpleAlign - Multiple alignments held as a set of sequences
22 =head1 SYNOPSIS
24 # Use Bio::AlignIO to read in the alignment
25 $str = Bio::AlignIO->new(-file => 't/data/testaln.pfam');
26 $aln = $str->next_aln();
28 # Describe
29 print $aln->length;
30 print $aln->num_residues;
31 print $aln->is_flush;
32 print $aln->num_sequences;
33 print $aln->score;
34 print $aln->percentage_identity;
35 print $aln->consensus_string(50);
37 # Find the position in the alignment for a sequence location
38 $pos = $aln->column_from_residue_number('1433_LYCES', 14); # = 6;
40 # Extract sequences and check values for the alignment column $pos
41 foreach $seq ($aln->each_seq) {
42 $res = $seq->subseq($pos, $pos);
43 $count{$res}++;
45 foreach $res (keys %count) {
46 printf "Res: %s Count: %2d\n", $res, $count{$res};
49 # Manipulate
50 $aln->remove_seq($seq);
51 $mini_aln = $aln->slice(20,30); # get a block of columns
52 $mini_aln = $aln->select_noncont(1,3,5,7,11); # select certain sequences
53 $new_aln = $aln->remove_columns([20,30]); # remove by position
54 $new_aln = $aln->remove_columns(['mismatch']); # remove by property
56 # Analyze
57 $str = $aln->consensus_string($threshold_percent);
58 $str = $aln->match_line();
59 $str = $aln->cigar_line();
60 $id = $aln->percentage_identity;
62 # See the module documentation for details and more methods.
64 =head1 DESCRIPTION
66 SimpleAlign is an object that handles a multiple sequence alignment
67 (MSA). It is very permissive of types (it does not insist on sequences
68 being all same length, for example). Think of it as a set of sequences
69 with a whole series of built-in manipulations and methods for reading and
70 writing alignments.
72 SimpleAlign uses L<Bio::LocatableSeq>, a subclass of L<Bio::PrimarySeq>,
73 to store its sequences. These are subsequences with a start and end
74 positions in the parent reference sequence. Each sequence in the
75 SimpleAlign object is a Bio::LocatableSeq.
77 SimpleAlign expects the combination of name, start, and end for a
78 given sequence to be unique in the alignment, and this is the key for the
79 internal hashes (name, start, end are abbreviated C<nse> in the code).
80 However, in some cases people do not want the name/start-end to be displayed:
81 either multiple names in an alignment or names specific to the alignment
82 (ROA1_HUMAN_1, ROA1_HUMAN_2 etc). These names are called
83 C<displayname>, and generally is what is used to print out the
84 alignment. They default to name/start-end.
86 The SimpleAlign Module is derived from the Align module by Ewan Birney.
88 =head1 FEEDBACK
90 =head2 Mailing Lists
92 User feedback is an integral part of the evolution of this and other
93 Bioperl modules. Send your comments and suggestions preferably to one
94 of the Bioperl mailing lists. Your participation is much appreciated.
96 bioperl-l@bioperl.org - General discussion
97 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
99 =head2 Support
101 Please direct usage questions or support issues to the mailing list:
103 L<bioperl-l@bioperl.org>
105 rather than to the module maintainer directly. Many experienced and
106 reponsive experts will be able look at the problem and quickly
107 address it. Please include a thorough description of the problem
108 with code and data examples if at all possible.
110 =head2 Reporting Bugs
112 Report bugs to the Bioperl bug tracking system to help us keep track
113 the bugs and their resolution. Bug reports can be submitted via the
114 web:
116 http://bugzilla.open-bio.org/
118 =head1 AUTHOR
120 Ewan Birney, birney@ebi.ac.uk
122 =head1 CONTRIBUTORS
124 Allen Day, allenday-at-ucla.edu,
125 Richard Adams, Richard.Adams-at-ed.ac.uk,
126 David J. Evans, David.Evans-at-vir.gla.ac.uk,
127 Heikki Lehvaslaiho, heikki-at-bioperl-dot-org,
128 Allen Smith, allens-at-cpan.org,
129 Jason Stajich, jason-at-bioperl.org,
130 Anthony Underwood, aunderwood-at-phls.org.uk,
131 Xintao Wei & Giri Narasimhan, giri-at-cs.fiu.edu
132 Brian Osborne, bosborne at alum.mit.edu
133 Weigang Qiu, Weigang at GENECTR-HUNTER-CUNY-EDU
134 Hongyu Zhang, forward at hongyu.org
135 Jay Hannah, jay at jays.net
136 Alexandr Bezginov, albezg at gmail.com
138 =head1 SEE ALSO
140 L<Bio::LocatableSeq>
142 =head1 APPENDIX
144 The rest of the documentation details each of the object
145 methods. Internal methods are usually preceded with a _
147 =cut
149 # 'Let the code begin...
151 package Bio::SimpleAlign;
152 use vars qw(%CONSERVATION_GROUPS);
153 use strict;
155 use Bio::LocatableSeq; # uses Seq's as list
157 use Bio::Seq;
158 use Bio::SeqFeature::Generic;
160 BEGIN {
161 # This data should probably be in a more centralized module...
162 # it is taken from Clustalw documentation.
163 # These are all the positively scoring groups that occur in the
164 # Gonnet Pam250 matrix. The strong and weak groups are
165 # defined as strong score >0.5 and weak score =<0.5 respectively.
167 %CONSERVATION_GROUPS = (
168 'strong' => [ qw(
170 NEQK
171 NHQK
172 NDEQ
173 QHRK
174 MILV
175 MILF
177 FYW )],
178 'weak' => [ qw(
182 STNK
183 STPA
184 SGND
185 SNDEQK
186 NDEQHK
187 NEQHRK
188 FVLIM
189 HFY )],);
192 use base qw(Bio::Root::Root Bio::Align::AlignI Bio::AnnotatableI
193 Bio::FeatureHolderI);
195 =head2 new
197 Title : new
198 Usage : my $aln = Bio::SimpleAlign->new();
199 Function : Creates a new simple align object
200 Returns : Bio::SimpleAlign
201 Args : -source => string representing the source program
202 where this alignment came from
203 -annotation => Bio::AnnotationCollectionI
204 -seq_annotation => Bio::AnnotationCollectionI for sequences (requires -annotation also be set)
205 -seqs => array ref containing Bio::LocatableSeq or Bio::Seq::Meta
206 -consensus => consensus string
207 -consensus_meta => Bio::Seq::Meta object containing consensus met information (kludge)
209 =cut
212 sub new {
213 my($class,@args) = @_;
215 my $self = $class->SUPER::new(@args);
217 my ($src, $score, $id, $acc, $desc, $seqs, $feats, $coll, $sa, $con, $cmeta) = $self->_rearrange([qw(
218 SOURCE
219 SCORE
221 ACCESSION
222 DESCRIPTION
223 SEQS
224 FEATURES
225 ANNOTATION
226 SEQ_ANNOTATION
227 CONSENSUS
228 CONSENSUS_META
229 )], @args);
230 $src && $self->source($src);
231 defined $score && $self->score($score);
232 # we need to set up internal hashs first!
234 $self->{'_seq'} = {};
235 $self->{'_order'} = {};
236 $self->{'_start_end_lists'} = {};
237 $self->{'_dis_name'} = {};
238 $self->{'_id'} = 'NoName';
239 # maybe we should automatically read in from args. Hmmm...
240 $id && $self->id($id);
241 $acc && $self->accession($acc);
242 $desc && $self->description($desc);
243 $coll && $self->annotation($coll);
244 # sequence annotation is layered into a provided annotation collection (or dies)
245 if ($sa) {
246 $self->throw("Must supply an alignment-based annotation collection (-annotation) ".
247 "with a sequence annotation collection")
248 if !$coll;
249 $coll->add_Annotation('seq_annotation', $sa);
251 if ($feats && ref $feats eq 'ARRAY') {
252 for my $feat (@$feats) {
253 $self->add_SeqFeature($feat);
256 $con && $self->consensus($con);
257 $cmeta && $self->consensus_meta($cmeta);
258 # assumes these are in correct alignment order
259 if ($seqs && ref($seqs) eq 'ARRAY') {
260 for my $seq (@$seqs) {
261 $self->add_seq($seq);
265 return $self; # success - we hope!
268 =head1 Modifier methods
270 These methods modify the MSA by adding, removing or shuffling complete
271 sequences.
273 =head2 add_seq
275 Title : add_seq
276 Usage : $myalign->add_seq($newseq);
277 $myalign->add_seq(-SEQ=>$newseq, -ORDER=>5);
278 Function : Adds another sequence to the alignment. *Does not* align
279 it - just adds it to the hashes.
280 If -ORDER is specified, the sequence is inserted at the
281 the position spec'd by -ORDER, and existing sequences
282 are pushed down the storage array.
283 Returns : nothing
284 Args : A Bio::LocatableSeq object
285 Positive integer for the sequence position (optional)
287 See L<Bio::LocatableSeq> for more information
289 =cut
291 sub addSeq {
292 my $self = shift;
293 $self->deprecated("addSeq - deprecated method. Use add_seq() instead.");
294 $self->add_seq(@_);
297 sub add_seq {
298 my $self = shift;
299 my @args = @_;
300 my ($seq, $order) = $self->_rearrange([qw(SEQ ORDER)], @args);
301 my ($name,$id,$start,$end);
303 unless ($seq) {
304 $self->throw("LocatableSeq argument required");
306 if( ! ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
307 $self->throw("Unable to process non locatable sequences [". ref($seq). "]");
309 !defined($order) and $order = 1 + keys %{$self->{'_seq'}}; # default
310 $order--; # jay's patch (user-specified order is 1-origin)
312 if ($order < 0) {
313 $self->throw("User-specified value for ORDER must be >= 1");
316 $id = $seq->id() ||$seq->display_id || $seq->primary_id;
318 # build the symbol list for this sequence,
319 # will prune out the gap and missing/match chars
320 # when actually asked for the symbol list in the
321 # symbol_chars
322 # map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq) if $seq->seq;
324 $name = $seq->get_nse;
326 if( $self->{'_seq'}->{$name} ) {
327 $self->warn("Replacing one sequence [$name]\n") unless $self->verbose < 0;
329 else {
330 $self->debug( "Assigning $name to $order\n");
332 my $ordh = $self->{'_order'};
333 if ($ordh->{$order}) {
334 # make space to insert
335 # $c->() returns (in reverse order) the first subsequence
336 # of consecutive integers; i.e., $c->(1,2,3,5,6,7) returns
337 # (3,2,1), and $c->(2,4,5) returns (2).
338 my $c;
339 $c = sub { return (($_[1]-$_[0] == 1) ? ($c->(@_[1..$#_]),$_[0]) : $_[0]); };
340 map {
341 $ordh->{$_+1} = $ordh->{$_}
342 } $c->(sort {$a <=> $b} grep {$_ >= $order} keys %{$ordh});
345 $ordh->{$order} = $name;
347 unless( exists( $self->{'_start_end_lists'}->{$id})) {
348 $self->{'_start_end_lists'}->{$id} = [];
350 push @{$self->{'_start_end_lists'}->{$id}}, $seq;
353 $self->{'_seq'}->{$name} = $seq;
358 =head2 remove_seq
360 Title : remove_seq
361 Usage : $aln->remove_seq($seq);
362 Function : Removes a single sequence from an alignment
363 Returns :
364 Argument : a Bio::LocatableSeq object
366 =cut
368 sub removeSeq {
369 my $self = shift;
370 $self->deprecated("removeSeq - deprecated method. Use remove_seq() instead.");
371 $self->remove_seq(@_);
374 sub remove_seq {
375 my $self = shift;
376 my $seq = shift;
377 my ($name,$id,$start,$end);
379 $self->throw("Need Bio::Locatable seq argument ")
380 unless ref $seq && $seq->isa( 'Bio::LocatableSeq');
382 $id = $seq->id();
383 $start = $seq->start();
384 $end = $seq->end();
385 $name = sprintf("%s/%d-%d",$id,$start,$end);
387 if( !exists $self->{'_seq'}->{$name} ) {
388 $self->throw("Sequence $name does not exist in the alignment to remove!");
391 delete $self->{'_seq'}->{$name};
393 # we need to remove this seq from the start_end_lists hash
395 if (exists $self->{'_start_end_lists'}->{$id}) {
396 # we need to find the sequence in the array.
398 my ($i, $found);;
399 for ($i=0; $i < @{$self->{'_start_end_lists'}->{$id}}; $i++) {
400 if (${$self->{'_start_end_lists'}->{$id}}[$i] eq $seq) {
401 $found = 1;
402 last;
405 if ($found) {
406 splice @{$self->{'_start_end_lists'}->{$id}}, $i, 1;
408 else {
409 $self->throw("Could not find the sequence to remoce from the start-end list");
412 else {
413 $self->throw("There is no seq list for the name $id");
415 # we need to shift order hash
416 my %rev_order = reverse %{$self->{'_order'}};
417 my $no = $rev_order{$name};
418 my $num_sequences = $self->num_sequences;
419 for (; $no < $num_sequences; $no++) {
420 $self->{'_order'}->{$no} = $self->{'_order'}->{$no+1};
422 delete $self->{'_order'}->{$no};
423 return 1;
427 =head2 purge
429 Title : purge
430 Usage : $aln->purge(0.7);
431 Function: Removes sequences above given sequence similarity
432 This function will grind on large alignments. Beware!
433 Example :
434 Returns : An array of the removed sequences
435 Args : float, threshold for similarity
437 =cut
439 sub purge {
440 my ($self,$perc) = @_;
441 my (%duplicate, @dups);
443 my @seqs = $self->each_seq();
445 for (my $i=0;$i< @seqs - 1;$i++ ) { #for each seq in alignment
446 my $seq = $seqs[$i];
448 #skip if already in duplicate hash
449 next if exists $duplicate{$seq->display_id} ;
450 my $one = $seq->seq();
452 my @one = split '', $one; #split to get 1aa per array element
454 for (my $j=$i+1;$j < @seqs;$j++) {
455 my $seq2 = $seqs[$j];
457 #skip if already in duplicate hash
458 next if exists $duplicate{$seq2->display_id} ;
460 my $two = $seq2->seq();
461 my @two = split '', $two;
463 my $count = 0;
464 my $res = 0;
465 for (my $k=0;$k<@one;$k++) {
466 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
467 $one[$k] eq $two[$k]) {
468 $count++;
470 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
471 $two[$k] ne '.' && $two[$k] ne '-' ) {
472 $res++;
476 my $ratio = 0;
477 $ratio = $count/$res unless $res == 0;
479 # if above threshold put in duplicate hash and push onto
480 # duplicate array for returning to get_unique
481 if ( $ratio > $perc ) {
482 $self->warn("duplicate: ", $seq2->display_id) if $self->verbose > 0;
483 $duplicate{$seq2->display_id} = 1;
484 push @dups, $seq2;
488 foreach my $seq (@dups) {
489 $self->remove_seq($seq);
491 return @dups;
494 =head2 sort_alphabetically
496 Title : sort_alphabetically
497 Usage : $ali->sort_alphabetically
498 Function : Changes the order of the alignment to alphabetical on name
499 followed by numerical by number.
500 Returns :
501 Argument :
503 =cut
505 sub sort_alphabetically {
506 my $self = shift;
507 my ($seq,$nse,@arr,%hash,$count);
509 foreach $seq ( $self->each_seq() ) {
510 $nse = $seq->get_nse;
511 $hash{$nse} = $seq;
514 $count = 0;
516 %{$self->{'_order'}} = (); # reset the hash;
518 foreach $nse ( sort _alpha_startend keys %hash) {
519 $self->{'_order'}->{$count} = $nse;
521 $count++;
526 =head2 sort_by_list
528 Title : sort_by_list
529 Usage : $aln_ordered=$aln->sort_by_list($list_file)
530 Function : Arbitrarily order sequences in an alignment
531 Returns : A new Bio::SimpleAlign object
532 Argument : a file listing sequence names in intended order (one name per line)
534 =cut
536 sub sort_by_list {
537 my ($self, $list) = @_;
538 my (@seq, @ids, %order);
540 foreach my $seq ( $self->each_seq() ) {
541 push @seq, $seq;
542 push @ids, $seq->display_id;
545 my $ct=1;
546 open(my $listfh, '<', $list) || $self->throw("can't open file for reading: $list");
547 while (<$listfh>) {
548 chomp;
549 my $name=$_;
550 $self->throw("Not found in alignment: $name") unless &_in_aln($name, \@ids);
551 $order{$name}=$ct++;
553 close($listfh);
555 # use the map-sort-map idiom:
556 my @sorted= map { $_->[1] } sort { $a->[0] <=> $b->[0] } map { [$order{$_->id()}, $_] } @seq;
557 my $aln = $self->new;
558 foreach (@sorted) { $aln->add_seq($_) }
559 return $aln;
562 =head2 set_new_reference
564 Title : set_new_reference
565 Usage : $aln->set_new_reference(3 or 'B31'): Select the 3rd sequence, or
566 the sequence whoes name is "B31" (full, exact, and case-sensitive),
567 as the reference (1st) sequence
568 Function : Change/Set a new reference (i.e., the first) sequence
569 Returns : a new Bio::SimpleAlign object.
570 Throws an exception if designated sequence not found
571 Argument : a positive integer of sequence order, or a sequence name
572 in the original alignment
574 =cut
576 sub set_new_reference {
577 my ($self, $seqid) = @_;
578 my $aln = $self->new;
579 my (@seq, @ids, @new_seq);
580 my $is_num=0;
581 foreach my $seq ( $self->each_seq() ) {
582 push @seq, $seq;
583 push @ids, $seq->display_id;
586 if ($seqid =~ /^\d+$/) { # argument is seq position
587 $is_num=1;
588 $self->throw("The new reference sequence number has to be a positive integer >1 and <= num_sequences ") if ($seqid <= 1 || $seqid > $self->num_sequences);
589 } else { # argument is a seq name
590 $self->throw("The new reference sequence not in alignment ") unless &_in_aln($seqid, \@ids);
593 for (my $i=0; $i<=$#seq; $i++) {
594 my $pos=$i+1;
595 if ( ($is_num && $pos == $seqid) || ($seqid eq $seq[$i]->display_id) ) {
596 unshift @new_seq, $seq[$i];
597 } else {
598 push @new_seq, $seq[$i];
601 foreach (@new_seq) { $aln->add_seq($_); }
602 return $aln;
605 sub _in_aln { # check if input name exists in the alignment
606 my ($str, $ref) = @_;
607 foreach (@$ref) {
608 return 1 if $str eq $_;
610 return 0;
614 =head2 uniq_seq
616 Title : uniq_seq
617 Usage : $aln->uniq_seq(): Remove identical sequences in
618 in the alignment. Ambiguous base ("N", "n") and
619 leading and ending gaps ("-") are NOT counted as
620 differences.
621 Function : Make a new alignment of unique sequence types (STs)
622 Returns : 1a. if called in a scalar context,
623 a new Bio::SimpleAlign object (all sequences renamed as "ST")
624 1b. if called in an array context,
625 a new Bio::SimpleAlign object, and a hashref whose keys
626 are sequence types, and whose values are arrayrefs to
627 lists of sequence ids within the corresponding sequence type
628 2. if $aln->verbose > 0, ST of each sequence is sent to
629 STDERR (in a tabular format)
630 Argument : None
632 =cut
634 sub uniq_seq {
635 my ($self, $seqid) = @_;
636 my $aln = $self->new;
637 my (%member, %order, @seq, @uniq_str, $st);
638 my $order=0;
639 my $len = $self->length();
640 $st = {};
641 foreach my $seq ( $self->each_seq() ) {
642 my $str = $seq->seq();
644 # it's necessary to ignore "n", "N", leading gaps and ending gaps in
645 # comparing two sequence strings
647 # 1st, convert "n", "N" to "?" (for DNA sequence only):
648 $str =~ s/n/\?/gi if $str =~ /^[atcgn-]+$/i;
649 # 2nd, convert leading and ending gaps to "?":
650 $str = &_convert_leading_ending_gaps($str, '-', '?');
651 # Note that '?' also can mean unknown residue.
652 # I don't like making global class member changes like this, too
653 # prone to errors... -- cjfields 08-11-18
654 local $Bio::LocatableSeq::GAP_SYMBOLS = '-\?';
655 my $new = Bio::LocatableSeq->new(
656 -id => $seq->id(),
657 -alphabet=> $seq->alphabet,
658 -seq => $str,
659 -start => $seq->start,
660 -end => $seq->end
662 push @seq, $new;
665 foreach my $seq (@seq) {
666 my $str = $seq->seq();
667 my ($seen, $key) = &_check_uniq($str, \@uniq_str, $len);
668 if ($seen) { # seen before
669 my @memb = @{$member{$key}};
670 push @memb, $seq;
671 $member{$key} = \@memb;
672 } else { # not seen
673 push @uniq_str, $key;
674 $order++;
675 $member{$key} = [ ($seq) ];
676 $order{$key} = $order;
680 foreach my $str (sort {$order{$a} <=> $order{$b}} keys %order) { # sort by input order
681 # convert leading/ending "?" back into "-" ("?" throws errors by SimpleAlign):
682 my $str2 = &_convert_leading_ending_gaps($str, '?', '-');
683 # convert middle "?" back into "N" ("?" throws errors by SimpleAlign):
684 $str2 =~ s/\?/N/g if $str2 =~ /^[atcg\-\?]+$/i;
685 my $gap='-';
686 my $end=length($str2);
687 $end -= length($1) while $str2 =~ m/($gap+)/g;
688 my $new = Bio::LocatableSeq->new(-id =>"ST".$order{$str},
689 -seq =>$str2,
690 -start=>1,
691 -end =>$end
693 $aln->add_seq($new);
694 foreach (@{$member{$str}}) {
695 push @{$$st{$order{$str}}}, $_->id(); # per Tristan's patch/Bug #2805
696 $self->debug($_->id(), "\t", "ST", $order{$str}, "\n");
699 return wantarray ? ($aln, $st) : $aln;
702 sub _check_uniq { # check if same seq exists in the alignment
703 my ($str1, $ref, $length) = @_;
704 my @char1=split //, $str1;
705 my @array=@$ref;
707 return (0, $str1) if @array==0; # not seen (1st sequence)
709 foreach my $str2 (@array) {
710 my $diff=0;
711 my @char2=split //, $str2;
712 for (my $i=0; $i<=$length-1; $i++) {
713 next if $char1[$i] eq '?';
714 next if $char2[$i] eq '?';
715 $diff++ if $char1[$i] ne $char2[$i];
717 return (1, $str2) if $diff == 0; # seen before
720 return (0, $str1); # not seen
723 sub _convert_leading_ending_gaps {
724 my $s=shift;
725 my $sym1=shift;
726 my $sym2=shift;
727 my @array=split //, $s;
728 # convert leading char:
729 for (my $i=0; $i<=$#array; $i++) {
730 ($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
732 # convert ending char:
733 for (my $i = $#array; $i>= 0; $i--) {
734 ($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
736 my $s_new=join '', @array;
737 return $s_new;
740 =head1 Sequence selection methods
742 Methods returning one or more sequences objects.
744 =head2 each_seq
746 Title : each_seq
747 Usage : foreach $seq ( $align->each_seq() )
748 Function : Gets a Seq object from the alignment
749 Returns : Seq object
750 Argument :
752 =cut
754 sub eachSeq {
755 my $self = shift;
756 $self->deprecated("eachSeq - deprecated method. Use each_seq() instead.");
757 $self->each_seq();
760 sub each_seq {
761 my $self = shift;
762 my (@arr,$order);
764 foreach $order ( sort { $a <=> $b } keys %{$self->{'_order'}} ) {
765 if( exists $self->{'_seq'}->{$self->{'_order'}->{$order}} ) {
766 push(@arr,$self->{'_seq'}->{$self->{'_order'}->{$order}});
769 return @arr;
773 =head2 each_alphabetically
775 Title : each_alphabetically
776 Usage : foreach $seq ( $ali->each_alphabetically() )
777 Function : Returns a sequence object, but the objects are returned
778 in alphabetically sorted order.
779 Does not change the order of the alignment.
780 Returns : Seq object
781 Argument :
783 =cut
785 sub each_alphabetically {
786 my $self = shift;
787 my ($seq,$nse,@arr,%hash,$count);
789 foreach $seq ( $self->each_seq() ) {
790 $nse = $seq->get_nse;
791 $hash{$nse} = $seq;
794 foreach $nse ( sort _alpha_startend keys %hash) {
795 push(@arr,$hash{$nse});
797 return @arr;
800 sub _alpha_startend {
801 my ($aname,$astart,$bname,$bstart);
802 ($aname,$astart) = split (/-/,$a);
803 ($bname,$bstart) = split (/-/,$b);
805 if( $aname eq $bname ) {
806 return $astart <=> $bstart;
808 else {
809 return $aname cmp $bname;
813 =head2 each_seq_with_id
815 Title : each_seq_with_id
816 Usage : foreach $seq ( $align->each_seq_with_id() )
817 Function : Gets a Seq objects from the alignment, the contents
818 being those sequences with the given name (there may be
819 more than one)
820 Returns : Seq object
821 Argument : a seq name
823 =cut
825 sub eachSeqWithId {
826 my $self = shift;
827 $self->deprecated("eachSeqWithId - deprecated method. Use each_seq_with_id() instead.");
828 $self->each_seq_with_id(@_);
831 sub each_seq_with_id {
832 my $self = shift;
833 my $id = shift;
835 $self->throw("Method each_seq_with_id needs a sequence name argument")
836 unless defined $id;
838 my (@arr, $seq);
840 if (exists($self->{'_start_end_lists'}->{$id})) {
841 @arr = @{$self->{'_start_end_lists'}->{$id}};
843 return @arr;
846 =head2 get_seq_by_pos
848 Title : get_seq_by_pos
849 Usage : $seq = $aln->get_seq_by_pos(3) # third sequence from the alignment
850 Function : Gets a sequence based on its position in the alignment.
851 Numbering starts from 1. Sequence positions larger than
852 num_sequences() will thow an error.
853 Returns : a Bio::LocatableSeq object
854 Args : positive integer for the sequence position
856 =cut
858 sub get_seq_by_pos {
860 my $self = shift;
861 my ($pos) = @_;
863 $self->throw("Sequence position has to be a positive integer, not [$pos]")
864 unless $pos =~ /^\d+$/ and $pos > 0;
865 $self->throw("No sequence at position [$pos]")
866 unless $pos <= $self->num_sequences ;
868 my $nse = $self->{'_order'}->{--$pos};
869 return $self->{'_seq'}->{$nse};
872 =head2 get_seq_by_id
874 Title : get_seq_by_id
875 Usage : $seq = $aln->get_seq_by_id($name) # seq named $name
876 Function : Gets a sequence based on its name.
877 Sequences that do not exist will warn and return undef
878 Returns : a Bio::LocatableSeq object
879 Args : string for sequence name
881 =cut
883 sub get_seq_by_id {
884 my ($self,$name) = @_;
885 unless( defined $name ) {
886 $self->warn("Must provide a sequence name");
887 return;
889 for my $seq ( values %{$self->{'_seq'}} ) {
890 if ( $seq->id eq $name) {
891 return $seq;
894 return;
897 =head2 seq_with_features
899 Title : seq_with_features
900 Usage : $seq = $aln->seq_with_features(-pos => 1,
901 -consensus => 60
902 -mask =>
903 sub { my $consensus = shift;
905 for my $i (1..5){
906 my $n = 'N' x $i;
907 my $q = '\?' x $i;
908 while($consensus =~ /[^?]$q[^?]/){
909 $consensus =~ s/([^?])$q([^?])/$1$n$2/;
912 return $consensus;
915 Function: produces a Bio::Seq object by first splicing gaps from -pos
916 (by means of a splice_by_seq_pos() call), then creating
917 features using non-? chars (by means of a consensus_string()
918 call with stringency -consensus).
919 Returns : a Bio::Seq object
920 Args : -pos : required. sequence from which to build the Bio::Seq
921 object
922 -consensus : optional, defaults to consensus_string()'s
923 default cutoff value
924 -mask : optional, a coderef to apply to consensus_string()'s
925 output before building features. this may be useful for
926 closing gaps of 1 bp by masking over them with N, for
927 instance
929 =cut
931 sub seq_with_features{
932 my ($self,%arg) = @_;
934 #first do the preparatory splice
935 $self->throw("must provide a -pos argument") unless $arg{-pos};
936 $self->splice_by_seq_pos($arg{-pos});
938 my $consensus_string = $self->consensus_string($arg{-consensus});
939 $consensus_string = $arg{-mask}->($consensus_string)
940 if defined($arg{-mask});
942 my(@bs,@es);
944 push @bs, 1 if $consensus_string =~ /^[^?]/;
946 while($consensus_string =~ /\?[^?]/g){
947 push @bs, pos($consensus_string);
949 while($consensus_string =~ /[^?]\?/g){
950 push @es, pos($consensus_string);
953 push @es, length($consensus_string) if $consensus_string =~ /[^?]$/;
955 my $seq = Bio::Seq->new();
957 # my $rootfeature = Bio::SeqFeature::Generic->new(
958 # -source_tag => 'location',
959 # -start => $self->get_seq_by_pos($arg{-pos})->start,
960 # -end => $self->get_seq_by_pos($arg{-pos})->end,
961 # );
962 # $seq->add_SeqFeature($rootfeature);
964 while(my $b = shift @bs){
965 my $e = shift @es;
966 $seq->add_SeqFeature(
967 Bio::SeqFeature::Generic->new(
968 -start => $b - 1 + $self->get_seq_by_pos($arg{-pos})->start,
969 -end => $e - 1 + $self->get_seq_by_pos($arg{-pos})->start,
970 -source_tag => $self->source || 'MSA',
975 return $seq;
979 =head1 Create new alignments
981 The result of these methods are horizontal or vertical subsets of the
982 current MSA.
984 =head2 select
986 Title : select
987 Usage : $aln2 = $aln->select(1, 3) # three first sequences
988 Function : Creates a new alignment from a continuous subset of
989 sequences. Numbering starts from 1. Sequence positions
990 larger than num_sequences() will thow an error.
991 Returns : a Bio::SimpleAlign object
992 Args : positive integer for the first sequence
993 positive integer for the last sequence to include (optional)
995 =cut
997 sub select {
998 my $self = shift;
999 my ($start, $end) = @_;
1001 $self->throw("Select start has to be a positive integer, not [$start]")
1002 unless $start =~ /^\d+$/ and $start > 0;
1003 $self->throw("Select end has to be a positive integer, not [$end]")
1004 unless $end =~ /^\d+$/ and $end > 0;
1005 $self->throw("Select $start [$start] has to be smaller than or equal to end [$end]")
1006 unless $start <= $end;
1008 my $aln = $self->new;
1009 foreach my $pos ($start .. $end) {
1010 $aln->add_seq($self->get_seq_by_pos($pos));
1012 $aln->id($self->id);
1013 # fix for meta, sf, ann
1014 return $aln;
1017 =head2 select_noncont
1019 Title : select_noncont
1020 Usage : # 1st and 3rd sequences, sorted
1021 $aln2 = $aln->select_noncont(1, 3)
1023 # 1st and 3rd sequences, sorted (same as first)
1024 $aln2 = $aln->select_noncont(3, 1)
1026 # 1st and 3rd sequences, unsorted
1027 $aln2 = $aln->select_noncont('nosort',3, 1)
1029 Function : Creates a new alignment from a subset of sequences. Numbering
1030 starts from 1. Sequence positions larger than num_sequences() will
1031 throw an error. Sorts the order added to new alignment by default,
1032 to prevent sorting pass 'nosort' as the first argument in the list.
1033 Returns : a Bio::SimpleAlign object
1034 Args : array of integers for the sequences. If the string 'nosort' is
1035 passed as the first argument, the sequences will not be sorted
1036 in the new alignment but will appear in the order listed.
1038 =cut
1040 sub select_noncont {
1041 my $self = shift;
1042 my $nosort = 0;
1043 my (@pos) = @_;
1044 if ($pos[0] !~ m{^\d+$}) {
1045 my $sortcmd = shift @pos;
1046 if ($sortcmd eq 'nosort') {
1047 $nosort = 1;
1048 } else {
1049 $self->throw("Command not recognized: $sortcmd. Only 'nosort' implemented at this time.");
1053 my $end = $self->num_sequences;
1054 foreach ( @pos ) {
1055 $self->throw("position must be a positive integer, > 0 and <= $end not [$_]")
1056 unless( /^\d+$/ && $_ > 0 && $_ <= $end );
1059 @pos = sort {$a <=> $b} @pos unless $nosort;
1061 my $aln = $self->new;
1062 foreach my $p (@pos) {
1063 $aln->add_seq($self->get_seq_by_pos($p));
1065 $aln->id($self->id);
1066 # fix for meta, sf, ann
1067 return $aln;
1070 =head2 slice
1072 Title : slice
1073 Usage : $aln2 = $aln->slice(20,30)
1074 Function : Creates a slice from the alignment inclusive of start and
1075 end columns, and the first column in the alignment is denoted 1.
1076 Sequences with no residues in the slice are excluded from the
1077 new alignment and a warning is printed. Slice beyond the length of
1078 the sequence does not do padding.
1079 Returns : A Bio::SimpleAlign object
1080 Args : Positive integer for start column, positive integer for end column,
1081 optional boolean which if true will keep gap-only columns in the newly
1082 created slice. Example:
1084 $aln2 = $aln->slice(20,30,1)
1086 =cut
1088 sub slice {
1089 my $self = shift;
1090 my ($start, $end, $keep_gap_only) = @_;
1092 $self->throw("Slice start has to be a positive integer, not [$start]")
1093 unless $start =~ /^\d+$/ and $start > 0;
1094 $self->throw("Slice end has to be a positive integer, not [$end]")
1095 unless $end =~ /^\d+$/ and $end > 0;
1096 $self->throw("Slice start [$start] has to be smaller than or equal to end [$end]")
1097 unless $start <= $end;
1098 $self->throw("This alignment has only ". $self->length . " residues. Slice start " .
1099 "[$start] is too big.") if $start > $self->length;
1100 my $cons_meta = $self->consensus_meta;
1101 my $aln = $self->new;
1102 $aln->id($self->id);
1103 foreach my $seq ( $self->each_seq() ) {
1104 my $new_seq = $seq->isa('Bio::Seq::MetaI') ?
1105 Bio::Seq::Meta->new
1106 (-id => $seq->id,
1107 -alphabet => $seq->alphabet,
1108 -strand => $seq->strand,
1109 -verbose => $self->verbose) :
1110 Bio::LocatableSeq->new
1111 (-id => $seq->id,
1112 -alphabet => $seq->alphabet,
1113 -strand => $seq->strand,
1114 -verbose => $self->verbose);
1116 # seq
1117 my $seq_end = $end;
1118 $seq_end = $seq->length if( $end > $seq->length );
1120 my $slice_seq = $seq->subseq($start, $seq_end);
1121 $new_seq->seq( $slice_seq );
1123 $slice_seq =~ s/\W//g;
1125 if ($start > 1) {
1126 my $pre_start_seq = $seq->subseq(1, $start - 1);
1127 $pre_start_seq =~ s/\W//g;
1128 if (!defined($seq->strand)) {
1129 $new_seq->start( $seq->start + CORE::length($pre_start_seq) );
1130 } elsif ($seq->strand < 0){
1131 $new_seq->start( $seq->end - CORE::length($pre_start_seq) - CORE::length($slice_seq) + 1);
1132 } else {
1133 $new_seq->start( $seq->start + CORE::length($pre_start_seq) );
1135 } else {
1136 $new_seq->start( $seq->start);
1138 if ($new_seq->isa('Bio::Seq::MetaI')) {
1139 for my $meta_name ($seq->meta_names) {
1140 $new_seq->named_meta($meta_name, $seq->named_submeta($meta_name, $start, $end));
1143 $new_seq->end( $new_seq->start + CORE::length($slice_seq) - 1 );
1145 if ($new_seq->start and $new_seq->end >= $new_seq->start) {
1146 $aln->add_seq($new_seq);
1147 } else {
1148 if( $keep_gap_only ) {
1149 $aln->add_seq($new_seq);
1150 } else {
1151 my $nse = $seq->get_nse();
1152 $self->warn("Slice [$start-$end] of sequence [$nse] contains no residues.".
1153 " Sequence excluded from the new alignment.");
1157 if ($cons_meta) {
1158 my $new = Bio::Seq::Meta->new();
1159 for my $meta_name ($cons_meta->meta_names) {
1160 $new->named_meta($meta_name, $cons_meta->named_submeta($meta_name, $start, $end));
1162 $aln->consensus_meta($new);
1164 $aln->annotation($self->annotation);
1165 # fix for meta, sf, ann
1166 return $aln;
1169 =head2 remove_columns
1171 Title : remove_columns
1172 Usage : $aln2 = $aln->remove_columns(['mismatch','weak']) or
1173 $aln2 = $aln->remove_columns([0,0],[6,8])
1174 Function : Creates an aligment with columns removed corresponding to
1175 the specified type or by specifying the columns by number.
1176 Returns : Bio::SimpleAlign object
1177 Args : Array ref of types ('match'|'weak'|'strong'|'mismatch'|'gaps'|
1178 'all_gaps_columns') or array ref where the referenced array
1179 contains a pair of integers that specify a range.
1180 The first column is 0
1181 =cut
1183 sub remove_columns {
1184 my ($self,@args) = @_;
1185 @args || $self->throw("Must supply column ranges or column types");
1186 my $aln;
1188 if ($args[0][0] =~ /^[a-z_]+$/i) {
1189 $aln = $self->_remove_columns_by_type($args[0]);
1190 } elsif ($args[0][0] =~ /^\d+$/) {
1191 $aln = $self->_remove_columns_by_num(\@args);
1192 } else {
1193 $self->throw("You must pass array references to remove_columns(), not @args");
1195 # fix for meta, sf, ann
1196 $aln;
1200 =head2 remove_gaps
1202 Title : remove_gaps
1203 Usage : $aln2 = $aln->remove_gaps
1204 Function : Creates an aligment with gaps removed
1205 Returns : a Bio::SimpleAlign object
1206 Args : a gap character(optional) if none specified taken
1207 from $self->gap_char,
1208 [optional] $all_gaps_columns flag (1 or 0, default is 0)
1209 indicates that only all-gaps columns should be deleted
1211 Used from method L<remove_columns> in most cases. Set gap character
1212 using L<gap_char()|gap_char>.
1214 =cut
1216 sub remove_gaps {
1217 my ($self,$gapchar,$all_gaps_columns) = @_;
1218 my $gap_line;
1219 if ($all_gaps_columns) {
1220 $gap_line = $self->all_gap_line($gapchar);
1221 } else {
1222 $gap_line = $self->gap_line($gapchar);
1224 my $aln = $self->new;
1226 my @remove;
1227 my $length = 0;
1228 my $del_char = $gapchar || $self->gap_char;
1229 # Do the matching to get the segments to remove
1230 while ($gap_line =~ m/[$del_char]/g) {
1231 my $start = pos($gap_line)-1;
1232 $gap_line=~/\G[$del_char]+/gc;
1233 my $end = pos($gap_line)-1;
1235 #have to offset the start and end for subsequent removes
1236 $start-=$length;
1237 $end -=$length;
1238 $length += ($end-$start+1);
1239 push @remove, [$start,$end];
1242 #remove the segments
1243 $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1244 # fix for meta, sf, ann
1245 return $aln;
1249 sub _remove_col {
1250 my ($self,$aln,$remove) = @_;
1251 my @new;
1253 my $gap = $self->gap_char;
1255 # splice out the segments and create new seq
1256 foreach my $seq($self->each_seq){
1257 my $new_seq = Bio::LocatableSeq->new(
1258 -id => $seq->id,
1259 -alphabet=> $seq->alphabet,
1260 -strand => $seq->strand,
1261 -verbose => $self->verbose);
1262 my $sequence = $seq->seq;
1263 foreach my $pair(@{$remove}){
1264 my $start = $pair->[0];
1265 my $end = $pair->[1];
1266 $sequence = $seq->seq unless $sequence;
1267 my $orig = $sequence;
1268 my $head = $start > 0 ? substr($sequence, 0, $start) : '';
1269 my $tail = ($end + 1) >= length($sequence) ? '' : substr($sequence, $end + 1);
1270 $sequence = $head.$tail;
1271 # start
1272 unless (defined $new_seq->start) {
1273 if ($start == 0) {
1274 my $start_adjust = () = substr($orig, 0, $end + 1) =~ /$gap/g;
1275 $new_seq->start($seq->start + $end + 1 - $start_adjust);
1277 else {
1278 my $start_adjust = $orig =~ /^$gap+/;
1279 if ($start_adjust) {
1280 $start_adjust = $+[0] == $start;
1282 $new_seq->start($seq->start + $start_adjust);
1285 # end
1286 if (($end + 1) >= length($orig)) {
1287 my $end_adjust = () = substr($orig, $start) =~ /$gap/g;
1288 $new_seq->end($seq->end - (length($orig) - $start) + $end_adjust);
1290 else {
1291 $new_seq->end($seq->end);
1295 if ($new_seq->end < $new_seq->start) {
1296 # we removed all columns except for gaps: set to 0 to indicate no
1297 # sequence
1298 $new_seq->start(0);
1299 $new_seq->end(0);
1302 $new_seq->seq($sequence) if $sequence;
1303 push @new, $new_seq;
1305 # add the new seqs to the alignment
1306 foreach my $new(@new){
1307 $aln->add_seq($new);
1309 # fix for meta, sf, ann
1310 return $aln;
1313 sub _remove_columns_by_type {
1314 my ($self,$type) = @_;
1315 my $aln = $self->new;
1316 my @remove;
1318 my $gap = $self->gap_char if (grep { $_ eq 'gaps'} @{$type});
1319 my $all_gaps_columns = $self->gap_char if (grep /all_gaps_columns/,@{$type});
1320 my %matchchars = ( 'match' => '\*',
1321 'weak' => '\.',
1322 'strong' => ':',
1323 'mismatch' => ' ',
1324 'gaps' => '',
1325 'all_gaps_columns' => ''
1327 # get the characters to delete against
1328 my $del_char;
1329 foreach my $type (@{$type}){
1330 $del_char.= $matchchars{$type};
1333 my $length = 0;
1334 my $match_line = $self->match_line;
1335 # do the matching to get the segments to remove
1336 if($del_char){
1337 while($match_line =~ m/[$del_char]/g ){
1338 my $start = pos($match_line)-1;
1339 $match_line=~/\G[$del_char]+/gc;
1340 my $end = pos($match_line)-1;
1342 #have to offset the start and end for subsequent removes
1343 $start-=$length;
1344 $end -=$length;
1345 $length += ($end-$start+1);
1346 push @remove, [$start,$end];
1350 # remove the segments
1351 $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1352 $aln = $aln->remove_gaps() if $gap;
1353 $aln = $aln->remove_gaps('', 1) if $all_gaps_columns;
1354 # fix for meta, sf, ann
1355 $aln;
1359 sub _remove_columns_by_num {
1360 my ($self,$positions) = @_;
1361 my $aln = $self->new;
1363 # sort the positions
1364 @$positions = sort { $a->[0] <=> $b->[0] } @$positions;
1366 my @remove;
1367 my $length = 0;
1368 foreach my $pos (@{$positions}) {
1369 my ($start, $end) = @{$pos};
1371 #have to offset the start and end for subsequent removes
1372 $start-=$length;
1373 $end -=$length;
1374 $length += ($end-$start+1);
1375 push @remove, [$start,$end];
1378 #remove the segments
1379 $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1380 # fix for meta, sf, ann
1381 $aln;
1385 =head1 Change sequences within the MSA
1387 These methods affect characters in all sequences without changing the
1388 alignment.
1390 =head2 splice_by_seq_pos
1392 Title : splice_by_seq_pos
1393 Usage : $status = splice_by_seq_pos(1);
1394 Function: splices all aligned sequences where the specified sequence
1395 has gaps.
1396 Example :
1397 Returns : 1 on success
1398 Args : position of sequence to splice by
1401 =cut
1403 sub splice_by_seq_pos{
1404 my ($self,$pos) = @_;
1406 my $guide = $self->get_seq_by_pos($pos);
1407 my $guide_seq = $guide->seq;
1409 $guide_seq =~ s/\./\-/g;
1411 my @gaps = ();
1412 $pos = -1;
1413 while(($pos = index($guide_seq, '-', $pos)) > -1 ){
1414 unshift @gaps, $pos;
1415 $pos++;
1418 foreach my $seq ($self->each_seq){
1419 my @bases = split '', $seq->seq;
1421 splice(@bases, $_, 1) foreach @gaps;
1422 $seq->seq(join('', @bases));
1428 =head2 map_chars
1430 Title : map_chars
1431 Usage : $ali->map_chars('\.','-')
1432 Function : Does a s/$arg1/$arg2/ on the sequences. Useful for gap
1433 characters
1435 Notice that the from (arg1) is interpretted as a regex,
1436 so be careful about quoting meta characters (eg
1437 $ali->map_chars('.','-') wont do what you want)
1438 Returns :
1439 Argument : 'from' rexexp
1440 'to' string
1442 =cut
1444 sub map_chars {
1445 my $self = shift;
1446 my $from = shift;
1447 my $to = shift;
1448 my ($seq,$temp);
1450 $self->throw("Need exactly two arguments")
1451 unless defined $from and defined $to;
1453 foreach $seq ( $self->each_seq() ) {
1454 $temp = $seq->seq();
1455 $temp =~ s/$from/$to/g;
1456 $seq->seq($temp);
1458 return 1;
1462 =head2 uppercase
1464 Title : uppercase()
1465 Usage : $ali->uppercase()
1466 Function : Sets all the sequences to uppercase
1467 Returns :
1468 Argument :
1470 =cut
1472 sub uppercase {
1473 my $self = shift;
1474 my $seq;
1475 my $temp;
1477 foreach $seq ( $self->each_seq() ) {
1478 $temp = $seq->seq();
1479 $temp =~ tr/[a-z]/[A-Z]/;
1481 $seq->seq($temp);
1483 return 1;
1486 =head2 cigar_line
1488 Title : cigar_line()
1489 Usage : %cigars = $align->cigar_line()
1490 Function : Generates a "cigar" (Compact Idiosyncratic Gapped Alignment
1491 Report) line for each sequence in the alignment. Examples are
1492 "1,60" or "5,10:12,58", where the numbers refer to conserved
1493 positions within the alignment. The keys of the hash are the
1494 NSEs (name/start/end) assigned to each sequence.
1495 Args : threshold (optional, defaults to 100)
1496 Returns : Hash of strings (cigar lines)
1498 =cut
1500 sub cigar_line {
1501 my $self = shift;
1502 my $thr=shift||100;
1503 my %cigars;
1505 my @consensus = split "",($self->consensus_string($thr));
1506 my $len = $self->length;
1507 my $gapchar = $self->gap_char;
1509 # create a precursor, something like (1,4,5,6,7,33,45),
1510 # where each number corresponds to a conserved position
1511 foreach my $seq ( $self->each_seq ) {
1512 my @seq = split "", uc ($seq->seq);
1513 my $pos = 1;
1514 for (my $x = 0 ; $x < $len ; $x++ ) {
1515 if ($seq[$x] eq $consensus[$x]) {
1516 push @{$cigars{$seq->get_nse}},$pos;
1517 $pos++;
1518 } elsif ($seq[$x] ne $gapchar) {
1519 $pos++;
1523 # duplicate numbers - (1,4,5,6,7,33,45) becomes (1,1,4,5,6,7,33,33,45,45)
1524 for my $name (keys %cigars) {
1525 splice @{$cigars{$name}}, 1, 0, ${$cigars{$name}}[0] if
1526 ( ${$cigars{$name}}[0] + 1 < ${$cigars{$name}}[1] );
1527 push @{$cigars{$name}}, ${$cigars{$name}}[$#{$cigars{$name}}] if
1528 ( ${$cigars{$name}}[($#{$cigars{$name}} - 1)] + 1 <
1529 ${$cigars{$name}}[$#{$cigars{$name}}] );
1530 for ( my $x = 1 ; $x < $#{$cigars{$name}} - 1 ; $x++) {
1531 if (${$cigars{$name}}[$x - 1] + 1 < ${$cigars{$name}}[$x] &&
1532 ${$cigars{$name}}[$x + 1] > ${$cigars{$name}}[$x] + 1) {
1533 splice @{$cigars{$name}}, $x, 0, ${$cigars{$name}}[$x];
1537 # collapse series - (1,1,4,5,6,7,33,33,45,45) becomes (1,1,4,7,33,33,45,45)
1538 for my $name (keys %cigars) {
1539 my @remove;
1540 for ( my $x = 0 ; $x < $#{$cigars{$name}} ; $x++) {
1541 if ( ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x - 1)] + 1 &&
1542 ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x + 1)] - 1 ) {
1543 unshift @remove,$x;
1546 for my $pos (@remove) {
1547 splice @{$cigars{$name}}, $pos, 1;
1550 # join and punctuate
1551 for my $name (keys %cigars) {
1552 my ($start,$end,$str) = "";
1553 while ( ($start,$end) = splice @{$cigars{$name}}, 0, 2 ) {
1554 $str .= ($start . "," . $end . ":");
1556 $str =~ s/:$//;
1557 $cigars{$name} = $str;
1559 %cigars;
1563 =head2 match_line
1565 Title : match_line()
1566 Usage : $line = $align->match_line()
1567 Function : Generates a match line - much like consensus string
1568 except that a line indicating the '*' for a match.
1569 Args : (optional) Match line characters ('*' by default)
1570 (optional) Strong match char (':' by default)
1571 (optional) Weak match char ('.' by default)
1572 Returns : String
1574 =cut
1576 sub match_line {
1577 my ($self,$matchlinechar, $strong, $weak) = @_;
1578 my %matchchars = ('match' => $matchlinechar || '*',
1579 'weak' => $weak || '.',
1580 'strong' => $strong || ':',
1581 'mismatch' => ' ',
1584 my @seqchars;
1585 my $alphabet;
1586 foreach my $seq ( $self->each_seq ) {
1587 push @seqchars, [ split(//, uc ($seq->seq)) ];
1588 $alphabet = $seq->alphabet unless defined $alphabet;
1590 my $refseq = shift @seqchars;
1591 # let's just march down the columns
1592 my $matchline;
1593 POS:
1594 foreach my $pos ( 0..$self->length ) {
1595 my $refchar = $refseq->[$pos];
1596 my $char = $matchchars{'mismatch'};
1597 unless( defined $refchar ) {
1598 last if $pos == $self->length; # short circuit on last residue
1599 # this in place to handle jason's soon-to-be-committed
1600 # intron mapping code
1601 goto bottom;
1603 my %col = ($refchar => 1);
1604 my $dash = ($refchar eq '-' || $refchar eq '.' || $refchar eq ' ');
1605 foreach my $seq ( @seqchars ) {
1606 next if $pos >= scalar @$seq;
1607 $dash = 1 if( $seq->[$pos] eq '-' || $seq->[$pos] eq '.' ||
1608 $seq->[$pos] eq ' ' );
1609 $col{$seq->[$pos]}++ if defined $seq->[$pos];
1611 my @colresidues = sort keys %col;
1613 # if all the values are the same
1614 if( $dash ) { $char = $matchchars{'mismatch'} }
1615 elsif( @colresidues == 1 ) { $char = $matchchars{'match'} }
1616 elsif( $alphabet eq 'protein' ) { # only try to do weak/strong
1617 # matches for protein seqs
1618 TYPE:
1619 foreach my $type ( qw(strong weak) ) {
1620 # iterate through categories
1621 my %groups;
1622 # iterate through each of the aa in the col
1623 # look to see which groups it is in
1624 foreach my $c ( @colresidues ) {
1625 foreach my $f ( grep { index($_,$c) >= 0 } @{$CONSERVATION_GROUPS{$type}} ) {
1626 push @{$groups{$f}},$c;
1629 GRP:
1630 foreach my $cols ( values %groups ) {
1631 @$cols = sort @$cols;
1632 # now we are just testing to see if two arrays
1633 # are identical w/o changing either one
1634 # have to be same len
1635 next if( scalar @$cols != scalar @colresidues );
1636 # walk down the length and check each slot
1637 for($_=0;$_ < (scalar @$cols);$_++ ) {
1638 next GRP if( $cols->[$_] ne $colresidues[$_] );
1640 $char = $matchchars{$type};
1641 last TYPE;
1645 bottom:
1646 $matchline .= $char;
1648 return $matchline;
1652 =head2 gap_line
1654 Title : gap_line()
1655 Usage : $line = $align->gap_line()
1656 Function : Generates a gap line - much like consensus string
1657 except that a line where '-' represents gap
1658 Args : (optional) gap line characters ('-' by default)
1659 Returns : string
1661 =cut
1663 sub gap_line {
1664 my ($self,$gapchar) = @_;
1665 $gapchar = $gapchar || $self->gap_char;
1666 my %gap_hsh; # column gaps vector
1667 foreach my $seq ( $self->each_seq ) {
1668 my $i = 0;
1669 map {$gap_hsh{$_->[0]} = undef} grep {$_->[1] eq $gapchar}
1670 map {[$i++, $_]} split(//, uc ($seq->seq));
1672 my $gap_line;
1673 foreach my $pos ( 0..$self->length-1 ) {
1674 $gap_line .= (exists $gap_hsh{$pos}) ? $gapchar:'.';
1676 return $gap_line;
1679 =head2 all_gap_line
1681 Title : all_gap_line()
1682 Usage : $line = $align->all_gap_line()
1683 Function : Generates a gap line - much like consensus string
1684 except that a line where '-' represents all-gap column
1685 Args : (optional) gap line characters ('-' by default)
1686 Returns : string
1688 =cut
1690 sub all_gap_line {
1691 my ($self,$gapchar) = @_;
1692 $gapchar = $gapchar || $self->gap_char;
1693 my %gap_hsh; # column gaps counter hash
1694 my @seqs = $self->each_seq;
1695 foreach my $seq ( @seqs ) {
1696 my $i = 0;
1697 map {$gap_hsh{$_->[0]}++} grep {$_->[1] eq $gapchar}
1698 map {[$i++, $_]} split(//, uc ($seq->seq));
1700 my $gap_line;
1701 foreach my $pos ( 0..$self->length-1 ) {
1702 if (exists $gap_hsh{$pos} && $gap_hsh{$pos} == scalar @seqs) {
1703 # gaps column
1704 $gap_line .= $gapchar;
1705 } else {
1706 $gap_line .= '.';
1709 return $gap_line;
1712 =head2 gap_col_matrix
1714 Title : gap_col_matrix()
1715 Usage : my $cols = $align->gap_col_matrix()
1716 Function : Generates an array of hashes where
1717 each entry in the array is a hash reference
1718 with keys of all the sequence names and
1719 and value of 1 or 0 if the sequence has a gap at that column
1720 Args : (optional) gap line characters ($aln->gap_char or '-' by default)
1722 =cut
1724 sub gap_col_matrix {
1725 my ($self,$gapchar) = @_;
1726 $gapchar = $gapchar || $self->gap_char;
1727 my %gap_hsh; # column gaps vector
1728 my @cols;
1729 foreach my $seq ( $self->each_seq ) {
1730 my $i = 0;
1731 my $str = $seq->seq;
1732 my $len = $seq->length;
1733 my $ch;
1734 my $id = $seq->display_id;
1735 while( $i < $len ) {
1736 $ch = substr($str, $i, 1);
1737 $cols[$i++]->{$id} = ($ch eq $gapchar);
1740 return \@cols;
1743 =head2 match
1745 Title : match()
1746 Usage : $ali->match()
1747 Function : Goes through all columns and changes residues that are
1748 identical to residue in first sequence to match '.'
1749 character. Sets match_char.
1751 USE WITH CARE: Most MSA formats do not support match
1752 characters in sequences, so this is mostly for output
1753 only. NEXUS format (Bio::AlignIO::nexus) can handle
1755 Returns : 1
1756 Argument : a match character, optional, defaults to '.'
1758 =cut
1760 sub match {
1761 my ($self, $match) = @_;
1763 $match ||= '.';
1764 my ($matching_char) = $match;
1765 $matching_char = "\\$match" if $match =~ /[\^.$|()\[\]]/ ; #';
1766 $self->map_chars($matching_char, '-');
1768 my @seqs = $self->each_seq();
1769 return 1 unless scalar @seqs > 1;
1771 my $refseq = shift @seqs ;
1772 my @refseq = split //, $refseq->seq;
1773 my $gapchar = $self->gap_char;
1775 foreach my $seq ( @seqs ) {
1776 my @varseq = split //, $seq->seq();
1777 for ( my $i=0; $i < scalar @varseq; $i++) {
1778 $varseq[$i] = $match if defined $refseq[$i] &&
1779 ( $refseq[$i] =~ /[A-Za-z\*]/ ||
1780 $refseq[$i] =~ /$gapchar/ )
1781 && $refseq[$i] eq $varseq[$i];
1783 $seq->seq(join '', @varseq);
1785 $self->match_char($match);
1786 return 1;
1790 =head2 unmatch
1792 Title : unmatch()
1793 Usage : $ali->unmatch()
1794 Function : Undoes the effect of method match. Unsets match_char.
1795 Returns : 1
1796 Argument : a match character, optional, defaults to '.'
1798 See L<match> and L<match_char>
1800 =cut
1802 sub unmatch {
1803 my ($self, $match) = @_;
1805 $match ||= '.';
1807 my @seqs = $self->each_seq();
1808 return 1 unless scalar @seqs > 1;
1810 my $refseq = shift @seqs ;
1811 my @refseq = split //, $refseq->seq;
1812 my $gapchar = $self->gap_char;
1813 foreach my $seq ( @seqs ) {
1814 my @varseq = split //, $seq->seq();
1815 for ( my $i=0; $i < scalar @varseq; $i++) {
1816 $varseq[$i] = $refseq[$i] if defined $refseq[$i] &&
1817 ( $refseq[$i] =~ /[A-Za-z\*]/ ||
1818 $refseq[$i] =~ /$gapchar/ ) &&
1819 $varseq[$i] eq $match;
1821 $seq->seq(join '', @varseq);
1823 $self->match_char('');
1824 return 1;
1827 =head1 MSA attributes
1829 Methods for setting and reading the MSA attributes.
1831 Note that the methods defining character semantics depend on the user
1832 to set them sensibly. They are needed only by certain input/output
1833 methods. Unset them by setting to an empty string ('').
1835 =head2 id
1837 Title : id
1838 Usage : $myalign->id("Ig")
1839 Function : Gets/sets the id field of the alignment
1840 Returns : An id string
1841 Argument : An id string (optional)
1843 =cut
1845 sub id {
1846 my ($self, $name) = @_;
1848 if (defined( $name )) {
1849 $self->{'_id'} = $name;
1852 return $self->{'_id'};
1855 =head2 accession
1857 Title : accession
1858 Usage : $myalign->accession("PF00244")
1859 Function : Gets/sets the accession field of the alignment
1860 Returns : An acc string
1861 Argument : An acc string (optional)
1863 =cut
1865 sub accession {
1866 my ($self, $acc) = @_;
1868 if (defined( $acc )) {
1869 $self->{'_accession'} = $acc;
1872 return $self->{'_accession'};
1875 =head2 description
1877 Title : description
1878 Usage : $myalign->description("14-3-3 proteins")
1879 Function : Gets/sets the description field of the alignment
1880 Returns : An description string
1881 Argument : An description string (optional)
1883 =cut
1885 sub description {
1886 my ($self, $name) = @_;
1888 if (defined( $name )) {
1889 $self->{'_description'} = $name;
1892 return $self->{'_description'};
1895 =head2 missing_char
1897 Title : missing_char
1898 Usage : $myalign->missing_char("?")
1899 Function : Gets/sets the missing_char attribute of the alignment
1900 It is generally recommended to set it to 'n' or 'N'
1901 for nucleotides and to 'X' for protein.
1902 Returns : An missing_char string,
1903 Argument : An missing_char string (optional)
1905 =cut
1907 sub missing_char {
1908 my ($self, $char) = @_;
1910 if (defined $char ) {
1911 $self->throw("Single missing character, not [$char]!") if CORE::length($char) > 1;
1912 $self->{'_missing_char'} = $char;
1915 return $self->{'_missing_char'};
1918 =head2 match_char
1920 Title : match_char
1921 Usage : $myalign->match_char('.')
1922 Function : Gets/sets the match_char attribute of the alignment
1923 Returns : An match_char string,
1924 Argument : An match_char string (optional)
1926 =cut
1928 sub match_char {
1929 my ($self, $char) = @_;
1931 if (defined $char ) {
1932 $self->throw("Single match character, not [$char]!") if CORE::length($char) > 1;
1933 $self->{'_match_char'} = $char;
1936 return $self->{'_match_char'};
1939 =head2 gap_char
1941 Title : gap_char
1942 Usage : $myalign->gap_char('-')
1943 Function : Gets/sets the gap_char attribute of the alignment
1944 Returns : An gap_char string, defaults to '-'
1945 Argument : An gap_char string (optional)
1947 =cut
1949 sub gap_char {
1950 my ($self, $char) = @_;
1952 if (defined $char || ! defined $self->{'_gap_char'} ) {
1953 $char= '-' unless defined $char;
1954 $self->throw("Single gap character, not [$char]!") if CORE::length($char) > 1;
1955 $self->{'_gap_char'} = $char;
1957 return $self->{'_gap_char'};
1960 =head2 symbol_chars
1962 Title : symbol_chars
1963 Usage : my @symbolchars = $aln->symbol_chars;
1964 Function: Returns all the seen symbols (other than gaps)
1965 Returns : array of characters that are the seen symbols
1966 Args : boolean to include the gap/missing/match characters
1968 =cut
1970 sub symbol_chars{
1971 my ($self,$includeextra) = @_;
1973 unless ($self->{'_symbols'}) {
1974 foreach my $seq ($self->each_seq) {
1975 map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq);
1978 my %copy = %{$self->{'_symbols'}};
1979 if( ! $includeextra ) {
1980 foreach my $char ( $self->gap_char, $self->match_char,
1981 $self->missing_char) {
1982 delete $copy{$char} if( defined $char );
1985 return keys %copy;
1988 =head1 Alignment descriptors
1990 These read only methods describe the MSA in various ways.
1993 =head2 score
1995 Title : score
1996 Usage : $str = $ali->score()
1997 Function : get/set a score of the alignment
1998 Returns : a score for the alignment
1999 Argument : an optional score to set
2001 =cut
2003 sub score {
2004 my $self = shift;
2005 $self->{score} = shift if @_;
2006 return $self->{score};
2009 =head2 consensus_string
2011 Title : consensus_string
2012 Usage : $str = $ali->consensus_string($threshold_percent)
2013 Function : Makes a strict consensus
2014 Returns : Consensus string
2015 Argument : Optional treshold ranging from 0 to 100.
2016 The consensus residue has to appear at least threshold %
2017 of the sequences at a given location, otherwise a '?'
2018 character will be placed at that location.
2019 (Default value = 0%)
2021 =cut
2023 sub consensus_string {
2024 my $self = shift;
2025 my $threshold = shift;
2027 my $out = "";
2028 my $len = $self->length - 1;
2030 foreach ( 0 .. $len ) {
2031 $out .= $self->_consensus_aa($_,$threshold);
2033 return $out;
2036 sub _consensus_aa {
2037 my $self = shift;
2038 my $point = shift;
2039 my $threshold_percent = shift || -1 ;
2040 my ($seq,%hash,$count,$letter,$key);
2041 my $gapchar = $self->gap_char;
2042 foreach $seq ( $self->each_seq() ) {
2043 $letter = substr($seq->seq,$point,1);
2044 $self->throw("--$point-----------") if $letter eq '';
2045 ($letter eq $gapchar || $letter =~ /\./) && next;
2046 # print "Looking at $letter\n";
2047 $hash{$letter}++;
2049 my $number_of_sequences = $self->num_sequences();
2050 my $threshold = $number_of_sequences * $threshold_percent / 100. ;
2051 $count = -1;
2052 $letter = '?';
2054 foreach $key ( sort keys %hash ) {
2055 # print "Now at $key $hash{$key}\n";
2056 if( $hash{$key} > $count && $hash{$key} >= $threshold) {
2057 $letter = $key;
2058 $count = $hash{$key};
2061 return $letter;
2065 =head2 consensus_iupac
2067 Title : consensus_iupac
2068 Usage : $str = $ali->consensus_iupac()
2069 Function : Makes a consensus using IUPAC ambiguity codes from DNA
2070 and RNA. The output is in upper case except when gaps in
2071 a column force output to be in lower case.
2073 Note that if your alignment sequences contain a lot of
2074 IUPAC ambiquity codes you often have to manually set
2075 alphabet. Bio::PrimarySeq::_guess_type thinks they
2076 indicate a protein sequence.
2077 Returns : consensus string
2078 Argument : none
2079 Throws : on protein sequences
2081 =cut
2083 sub consensus_iupac {
2084 my $self = shift;
2085 my $out = "";
2086 my $len = $self->length-1;
2088 # only DNA and RNA sequences are valid
2089 foreach my $seq ( $self->each_seq() ) {
2090 $self->throw("Seq [". $seq->get_nse. "] is a protein")
2091 if $seq->alphabet eq 'protein';
2093 # loop over the alignment columns
2094 foreach my $count ( 0 .. $len ) {
2095 $out .= $self->_consensus_iupac($count);
2097 return $out;
2100 sub _consensus_iupac {
2101 my ($self, $column) = @_;
2102 my ($string, $char, $rna);
2104 #determine all residues in a column
2105 foreach my $seq ( $self->each_seq() ) {
2106 $string .= substr($seq->seq, $column, 1);
2108 $string = uc $string;
2110 # quick exit if there's an N in the string
2111 if ($string =~ /N/) {
2112 $string =~ /\W/ ? return 'n' : return 'N';
2114 # ... or if there are only gap characters
2115 return '-' if $string =~ /^\W+$/;
2117 # treat RNA as DNA in regexps
2118 if ($string =~ /U/) {
2119 $string =~ s/U/T/;
2120 $rna = 1;
2123 # the following s///'s only need to be done to the _first_ ambiguity code
2124 # as we only need to see the _range_ of characters in $string
2126 if ($string =~ /[VDHB]/) {
2127 $string =~ s/V/AGC/;
2128 $string =~ s/D/AGT/;
2129 $string =~ s/H/ACT/;
2130 $string =~ s/B/CTG/;
2133 if ($string =~ /[SKYRWM]/) {
2134 $string =~ s/S/GC/;
2135 $string =~ s/K/GT/;
2136 $string =~ s/Y/CT/;
2137 $string =~ s/R/AG/;
2138 $string =~ s/W/AT/;
2139 $string =~ s/M/AC/;
2142 # and now the guts of the thing
2144 if ($string =~ /A/) {
2145 $char = 'A'; # A A
2146 if ($string =~ /G/) {
2147 $char = 'R'; # A and G (purines) R
2148 if ($string =~ /C/) {
2149 $char = 'V'; # A and G and C V
2150 if ($string =~ /T/) {
2151 $char = 'N'; # A and G and C and T N
2153 } elsif ($string =~ /T/) {
2154 $char = 'D'; # A and G and T D
2156 } elsif ($string =~ /C/) {
2157 $char = 'M'; # A and C M
2158 if ($string =~ /T/) {
2159 $char = 'H'; # A and C and T H
2161 } elsif ($string =~ /T/) {
2162 $char = 'W'; # A and T W
2164 } elsif ($string =~ /C/) {
2165 $char = 'C'; # C C
2166 if ($string =~ /T/) {
2167 $char = 'Y'; # C and T (pyrimidines) Y
2168 if ($string =~ /G/) {
2169 $char = 'B'; # C and T and G B
2171 } elsif ($string =~ /G/) {
2172 $char = 'S'; # C and G S
2174 } elsif ($string =~ /G/) {
2175 $char = 'G'; # G G
2176 if ($string =~ /C/) {
2177 $char = 'S'; # G and C S
2178 } elsif ($string =~ /T/) {
2179 $char = 'K'; # G and T K
2181 } elsif ($string =~ /T/) {
2182 $char = 'T'; # T T
2185 $char = 'U' if $rna and $char eq 'T';
2186 $char = lc $char if $string =~ /\W/;
2188 return $char;
2192 =head2 consensus_meta
2194 Title : consensus_meta
2195 Usage : $seqmeta = $ali->consensus_meta()
2196 Function : Returns a Bio::Seq::Meta object containing the consensus
2197 strings derived from meta data analysis.
2198 Returns : Bio::Seq::Meta
2199 Argument : Bio::Seq::Meta
2200 Throws : non-MetaI object
2202 =cut
2204 sub consensus_meta {
2205 my ($self, $meta) = @_;
2206 if ($meta && (!ref $meta || !$meta->isa('Bio::Seq::MetaI'))) {
2207 $self->throw('Not a Bio::Seq::MetaI object');
2209 return $self->{'_aln_meta'} = $meta if $meta;
2210 return $self->{'_aln_meta'}
2213 =head2 is_flush
2215 Title : is_flush
2216 Usage : if ( $ali->is_flush() )
2217 Function : Tells you whether the alignment
2218 : is flush, i.e. all of the same length
2219 Returns : 1 or 0
2220 Argument :
2222 =cut
2224 sub is_flush {
2225 my ($self,$report) = @_;
2226 my $seq;
2227 my $length = (-1);
2228 my $temp;
2230 foreach $seq ( $self->each_seq() ) {
2231 if( $length == (-1) ) {
2232 $length = CORE::length($seq->seq());
2233 next;
2236 $temp = CORE::length($seq->seq());
2237 if( $temp != $length ) {
2238 $self->warn("expecting $length not $temp from ".
2239 $seq->display_id) if( $report );
2240 $self->debug("expecting $length not $temp from ".
2241 $seq->display_id);
2242 $self->debug($seq->seq(). "\n");
2243 return 0;
2247 return 1;
2251 =head2 length
2253 Title : length()
2254 Usage : $len = $ali->length()
2255 Function : Returns the maximum length of the alignment.
2256 To be sure the alignment is a block, use is_flush
2257 Returns : Integer
2258 Argument :
2260 =cut
2262 sub length_aln {
2263 my $self = shift;
2264 $self->deprecated("length_aln - deprecated method. Use length() instead.");
2265 $self->length(@_);
2268 sub length {
2269 my $self = shift;
2270 my $seq;
2271 my $length = -1;
2272 my $temp;
2274 foreach $seq ( $self->each_seq() ) {
2275 $temp = $seq->length();
2276 if( $temp > $length ) {
2277 $length = $temp;
2281 return $length;
2285 =head2 maxdisplayname_length
2287 Title : maxdisplayname_length
2288 Usage : $ali->maxdisplayname_length()
2289 Function : Gets the maximum length of the displayname in the
2290 alignment. Used in writing out various MSA formats.
2291 Returns : integer
2292 Argument :
2294 =cut
2296 sub maxname_length {
2297 my $self = shift;
2298 $self->deprecated("maxname_length - deprecated method.".
2299 " Use maxdisplayname_length() instead.");
2300 $self->maxdisplayname_length();
2303 sub maxnse_length {
2304 my $self = shift;
2305 $self->deprecated("maxnse_length - deprecated method.".
2306 " Use maxnse_length() instead.");
2307 $self->maxdisplayname_length();
2310 sub maxdisplayname_length {
2311 my $self = shift;
2312 my $maxname = (-1);
2313 my ($seq,$len);
2315 foreach $seq ( $self->each_seq() ) {
2316 $len = CORE::length $self->displayname($seq->get_nse());
2318 if( $len > $maxname ) {
2319 $maxname = $len;
2323 return $maxname;
2326 =head2 max_metaname_length
2328 Title : max_metaname_length
2329 Usage : $ali->max_metaname_length()
2330 Function : Gets the maximum length of the meta name tags in the
2331 alignment for the sequences and for the alignment.
2332 Used in writing out various MSA formats.
2333 Returns : integer
2334 Argument : None
2336 =cut
2338 sub max_metaname_length {
2339 my $self = shift;
2340 my $maxname = (-1);
2341 my ($seq,$len);
2343 # check seq meta first
2344 for $seq ( $self->each_seq() ) {
2345 next if !$seq->isa('Bio::Seq::MetaI' || !$seq->meta_names);
2346 for my $mtag ($seq->meta_names) {
2347 $len = CORE::length $mtag;
2348 if( $len > $maxname ) {
2349 $maxname = $len;
2354 # alignment meta
2355 for my $meta ($self->consensus_meta) {
2356 next unless $meta;
2357 for my $name ($meta->meta_names) {
2358 $len = CORE::length $name;
2359 if( $len > $maxname ) {
2360 $maxname = $len;
2365 return $maxname;
2368 =head2 num_residues
2370 Title : num_residues
2371 Usage : $no = $ali->num_residues
2372 Function : number of residues in total in the alignment
2373 Returns : integer
2374 Argument :
2375 Note : replaces no_residues()
2377 =cut
2379 sub num_residues {
2380 my $self = shift;
2381 my $count = 0;
2383 foreach my $seq ($self->each_seq) {
2384 my $str = $seq->seq();
2386 $count += ($str =~ s/[A-Za-z]//g);
2389 return $count;
2392 =head2 num_sequences
2394 Title : num_sequences
2395 Usage : $depth = $ali->num_sequences
2396 Function : number of sequence in the sequence alignment
2397 Returns : integer
2398 Argument : none
2399 Note : replaces no_sequences()
2401 =cut
2403 sub num_sequences {
2404 my $self = shift;
2405 return scalar($self->each_seq);
2409 =head2 average_percentage_identity
2411 Title : average_percentage_identity
2412 Usage : $id = $align->average_percentage_identity
2413 Function: The function uses a fast method to calculate the average
2414 percentage identity of the alignment
2415 Returns : The average percentage identity of the alignment
2416 Args : None
2417 Notes : This method implemented by Kevin Howe calculates a figure that is
2418 designed to be similar to the average pairwise identity of the
2419 alignment (identical in the absence of gaps), without having to
2420 explicitly calculate pairwise identities proposed by Richard Durbin.
2421 Validated by Ewan Birney ad Alex Bateman.
2423 =cut
2425 sub average_percentage_identity{
2426 my ($self,@args) = @_;
2428 my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
2429 'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
2431 my ($len, $total, $subtotal, $divisor, $subdivisor, @seqs, @countHashes);
2433 if (! $self->is_flush()) {
2434 $self->throw("All sequences in the alignment must be the same length");
2437 @seqs = $self->each_seq();
2438 $len = $self->length();
2440 # load the each hash with correct keys for existence checks
2442 for( my $index=0; $index < $len; $index++) {
2443 foreach my $letter (@alphabet) {
2444 $countHashes[$index]->{$letter} = 0;
2447 foreach my $seq (@seqs) {
2448 my @seqChars = split //, $seq->seq();
2449 for( my $column=0; $column < @seqChars; $column++ ) {
2450 my $char = uc($seqChars[$column]);
2451 if (exists $countHashes[$column]->{$char}) {
2452 $countHashes[$column]->{$char}++;
2457 $total = 0;
2458 $divisor = 0;
2459 for(my $column =0; $column < $len; $column++) {
2460 my %hash = %{$countHashes[$column]};
2461 $subdivisor = 0;
2462 foreach my $res (keys %hash) {
2463 $total += $hash{$res}*($hash{$res} - 1);
2464 $subdivisor += $hash{$res};
2466 $divisor += $subdivisor * ($subdivisor - 1);
2468 return $divisor > 0 ? ($total / $divisor )*100.0 : 0;
2471 =head2 percentage_identity
2473 Title : percentage_identity
2474 Usage : $id = $align->percentage_identity
2475 Function: The function calculates the average percentage identity
2476 (aliased to average_percentage_identity)
2477 Returns : The average percentage identity
2478 Args : None
2480 =cut
2482 sub percentage_identity {
2483 my $self = shift;
2484 return $self->average_percentage_identity();
2487 =head2 overall_percentage_identity
2489 Title : overall_percentage_identity
2490 Usage : $id = $align->overall_percentage_identity
2491 $id = $align->overall_percentage_identity('short')
2492 Function: The function calculates the percentage identity of
2493 the conserved columns
2494 Returns : The percentage identity of the conserved columns
2495 Args : length value to use, optional defaults to alignment length
2496 possible values: 'align', 'short', 'long'
2498 The argument values 'short' and 'long' refer to shortest and longest
2499 sequence in the alignment. Method modification code by Hongyu Zhang.
2501 =cut
2503 sub overall_percentage_identity{
2504 my ($self, $length_measure) = @_;
2506 my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
2507 'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
2509 my ($len, $total, @seqs, @countHashes);
2511 my %enum = map {$_ => 1} qw (align short long);
2513 $self->throw("Unknown argument [$length_measure]")
2514 if $length_measure and not $enum{$length_measure};
2515 $length_measure ||= 'align';
2517 if (! $self->is_flush()) {
2518 $self->throw("All sequences in the alignment must be the same length");
2521 @seqs = $self->each_seq();
2522 $len = $self->length();
2524 # load the each hash with correct keys for existence checks
2525 for( my $index=0; $index < $len; $index++) {
2526 foreach my $letter (@alphabet) {
2527 $countHashes[$index]->{$letter} = 0;
2530 foreach my $seq (@seqs) {
2531 my @seqChars = split //, $seq->seq();
2532 for( my $column=0; $column < @seqChars; $column++ ) {
2533 my $char = uc($seqChars[$column]);
2534 if (exists $countHashes[$column]->{$char}) {
2535 $countHashes[$column]->{$char}++;
2540 $total = 0;
2541 for(my $column =0; $column < $len; $column++) {
2542 my %hash = %{$countHashes[$column]};
2543 foreach ( values %hash ) {
2544 next if( $_ == 0 );
2545 $total++ if( $_ == scalar @seqs );
2546 last;
2550 if ($length_measure eq 'short') {
2551 ## find the shortest length
2552 $len = 0;
2553 foreach my $seq ($self->each_seq) {
2554 my $count = $seq->seq =~ tr/[A-Za-z]//;
2555 if ($len) {
2556 $len = $count if $count < $len;
2557 } else {
2558 $len = $count;
2562 elsif ($length_measure eq 'long') {
2563 ## find the longest length
2564 $len = 0;
2565 foreach my $seq ($self->each_seq) {
2566 my $count = $seq->seq =~ tr/[A-Za-z]//;
2567 if ($len) {
2568 $len = $count if $count > $len;
2569 } else {
2570 $len = $count;
2575 return ($total / $len ) * 100.0;
2580 =head1 Alignment positions
2582 Methods to map a sequence position into an alignment column and back.
2583 column_from_residue_number() does the former. The latter is really a
2584 property of the sequence object and can done using
2585 L<Bio::LocatableSeq::location_from_column>:
2587 # select somehow a sequence from the alignment, e.g.
2588 my $seq = $aln->get_seq_by_pos(1);
2589 #$loc is undef or Bio::LocationI object
2590 my $loc = $seq->location_from_column(5);
2592 =head2 column_from_residue_number
2594 Title : column_from_residue_number
2595 Usage : $col = $ali->column_from_residue_number( $seqname, $resnumber)
2596 Function: This function gives the position in the alignment
2597 (i.e. column number) of the given residue number in the
2598 sequence with the given name. For example, for the
2599 alignment
2601 Seq1/91-97 AC..DEF.GH.
2602 Seq2/24-30 ACGG.RTY...
2603 Seq3/43-51 AC.DDEF.GHI
2605 column_from_residue_number( "Seq1", 94 ) returns 6.
2606 column_from_residue_number( "Seq2", 25 ) returns 2.
2607 column_from_residue_number( "Seq3", 50 ) returns 10.
2609 An exception is thrown if the residue number would lie
2610 outside the length of the aligment
2611 (e.g. column_from_residue_number( "Seq2", 22 )
2613 Note: If the the parent sequence is represented by more than
2614 one alignment sequence and the residue number is present in
2615 them, this method finds only the first one.
2617 Returns : A column number for the position in the alignment of the
2618 given residue in the given sequence (1 = first column)
2619 Args : A sequence id/name (not a name/start-end)
2620 A residue number in the whole sequence (not just that
2621 segment of it in the alignment)
2623 =cut
2625 sub column_from_residue_number {
2626 my ($self, $name, $resnumber) = @_;
2628 $self->throw("No sequence with name [$name]") unless $self->{'_start_end_lists'}->{$name};
2629 $self->throw("Second argument residue number missing") unless $resnumber;
2631 foreach my $seq ($self->each_seq_with_id($name)) {
2632 my $col;
2633 eval {
2634 $col = $seq->column_from_residue_number($resnumber);
2636 next if $@;
2637 return $col;
2640 $self->throw("Could not find a sequence segment in $name ".
2641 "containing residue number $resnumber");
2645 =head1 Sequence names
2647 Methods to manipulate the display name. The default name based on the
2648 sequence id and subsequence positions can be overridden in various
2649 ways.
2651 =head2 displayname
2653 Title : displayname
2654 Usage : $myalign->displayname("Ig", "IgA")
2655 Function : Gets/sets the display name of a sequence in the alignment
2656 Returns : A display name string
2657 Argument : name of the sequence
2658 displayname of the sequence (optional)
2660 =cut
2662 sub get_displayname {
2663 my $self = shift;
2664 $self->deprecated("get_displayname - deprecated method. Use displayname() instead.");
2665 $self->displayname(@_);
2668 sub set_displayname {
2669 my $self = shift;
2670 $self->deprecated("set_displayname - deprecated method. Use displayname() instead.");
2671 $self->displayname(@_);
2674 sub displayname {
2675 my ($self, $name, $disname) = @_;
2677 $self->throw("No sequence with name [$name]")
2678 unless defined $self->{'_seq'}->{$name};
2680 if( $disname and $name) {
2681 $self->{'_dis_name'}->{$name} = $disname;
2682 return $disname;
2684 elsif( defined $self->{'_dis_name'}->{$name} ) {
2685 return $self->{'_dis_name'}->{$name};
2686 } else {
2687 return $name;
2691 =head2 set_displayname_count
2693 Title : set_displayname_count
2694 Usage : $ali->set_displayname_count
2695 Function : Sets the names to be name_# where # is the number of
2696 times this name has been used.
2697 Returns : 1, on success
2698 Argument :
2700 =cut
2702 sub set_displayname_count {
2703 my $self= shift;
2704 my (@arr,$name,$seq,$count,$temp,$nse);
2706 foreach $seq ( $self->each_alphabetically() ) {
2707 $nse = $seq->get_nse();
2709 #name will be set when this is the second
2710 #time (or greater) is has been seen
2712 if( defined $name and $name eq ($seq->id()) ) {
2713 $temp = sprintf("%s_%s",$name,$count);
2714 $self->displayname($nse,$temp);
2715 $count++;
2716 } else {
2717 $count = 1;
2718 $name = $seq->id();
2719 $temp = sprintf("%s_%s",$name,$count);
2720 $self->displayname($nse,$temp);
2721 $count++;
2724 return 1;
2727 =head2 set_displayname_flat
2729 Title : set_displayname_flat
2730 Usage : $ali->set_displayname_flat()
2731 Function : Makes all the sequences be displayed as just their name,
2732 not name/start-end
2733 Returns : 1
2734 Argument :
2736 =cut
2738 sub set_displayname_flat {
2739 my $self = shift;
2740 my ($nse,$seq);
2742 foreach $seq ( $self->each_seq() ) {
2743 $nse = $seq->get_nse();
2744 $self->displayname($nse,$seq->id());
2746 return 1;
2749 =head2 set_displayname_normal
2751 Title : set_displayname_normal
2752 Usage : $ali->set_displayname_normal()
2753 Function : Makes all the sequences be displayed as name/start-end
2754 Returns : 1, on success
2755 Argument :
2757 =cut
2759 sub set_displayname_normal {
2760 my $self = shift;
2761 my ($nse,$seq);
2763 foreach $seq ( $self->each_seq() ) {
2764 $nse = $seq->get_nse();
2765 $self->displayname($nse,$nse);
2767 return 1;
2770 =head2 source
2772 Title : source
2773 Usage : $obj->source($newval)
2774 Function: sets the Alignment source program
2775 Example :
2776 Returns : value of source
2777 Args : newvalue (optional)
2780 =cut
2782 sub source{
2783 my ($self,$value) = @_;
2784 if( defined $value) {
2785 $self->{'_source'} = $value;
2787 return $self->{'_source'};
2790 =head2 set_displayname_safe
2792 Title : set_displayname_safe
2793 Usage : ($new_aln, $ref_name)=$ali->set_displayname_safe(4)
2794 Function : Assign machine-generated serial names to sequences in input order.
2795 Designed to protect names during PHYLIP runs. Assign 10-char string
2796 in the form of "S000000001" to "S999999999". Restore the original
2797 names using "restore_displayname".
2798 Returns : 1. a new $aln with system names;
2799 2. a hash ref for restoring names
2800 Argument : Number for id length (default 10)
2802 =cut
2804 sub set_displayname_safe {
2805 my $self = shift;
2806 my $idlength = shift || 10;
2807 my ($seq, %phylip_name);
2808 my $ct=0;
2809 my $new=Bio::SimpleAlign->new();
2810 foreach $seq ( $self->each_seq() ) {
2811 $ct++;
2812 my $pname="S". sprintf "%0" . ($idlength-1) . "s", $ct;
2813 $phylip_name{$pname}=$seq->id();
2814 my $new_seq= Bio::LocatableSeq->new(-id => $pname,
2815 -seq => $seq->seq(),
2816 -alphabet => $seq->alphabet,
2817 -start => $seq->{_start},
2818 -end => $seq->{_end}
2820 $new->add_seq($new_seq);
2823 $self->debug("$ct seq names changed. Restore names by using restore_displayname.");
2824 return ($new, \%phylip_name);
2827 =head2 restore_displayname
2829 Title : restore_displayname
2830 Usage : $aln_name_restored=$ali->restore_displayname($hash_ref)
2831 Function : Restore original sequence names (after running
2832 $ali->set_displayname_safe)
2833 Returns : a new $aln with names restored.
2834 Argument : a hash reference of names from "set_displayname_safe".
2836 =cut
2838 sub restore_displayname {
2839 my $self = shift;
2840 my $ref=shift;
2841 my %name=%$ref;
2842 my $new=Bio::SimpleAlign->new();
2843 foreach my $seq ( $self->each_seq() ) {
2844 $self->throw("No sequence with name") unless defined $name{$seq->id()};
2845 my $new_seq= Bio::LocatableSeq->new(-id => $name{$seq->id()},
2846 -seq => $seq->seq(),
2847 -alphabet => $seq->alphabet,
2848 -start => $seq->{_start},
2849 -end => $seq->{_end}
2851 $new->add_seq($new_seq);
2853 return $new;
2856 =head2 sort_by_start
2858 Title : sort_by_start
2859 Usage : $ali->sort_by_start
2860 Function : Changes the order of the alignment to the start position of each
2861 subalignment
2862 Returns :
2863 Argument :
2865 =cut
2867 sub sort_by_start {
2868 my $self = shift;
2869 my ($seq,$nse,@arr,%hash,$count);
2870 foreach $seq ( $self->each_seq() ) {
2871 $nse = $seq->get_nse;
2872 $hash{$nse} = $seq;
2874 $count = 0;
2875 %{$self->{'_order'}} = (); # reset the hash;
2876 foreach $nse ( sort _startend keys %hash) {
2877 $self->{'_order'}->{$count} = $nse;
2878 $count++;
2883 sub _startend
2885 my ($aname,$arange) = split (/[\/]/,$a);
2886 my ($bname,$brange) = split (/[\/]/,$b);
2887 my ($astart,$aend) = split(/\-/,$arange);
2888 my ($bstart,$bend) = split(/\-/,$brange);
2889 return $astart <=> $bstart;
2892 =head2 bracket_string
2894 Title : bracket_string
2895 Usage : my @params = (-refseq => 'testseq',
2896 -allele1 => 'allele1',
2897 -allele2 => 'allele2',
2898 -delimiters => '{}',
2899 -separator => '/');
2900 $str = $aln->bracket_string(@params)
2902 Function : When supplied with a list of parameters (see below), returns a
2903 string in BIC format. This is used for allelic comparisons.
2904 Briefly, if either allele contains a base change when compared to
2905 the refseq, the base or gap for each allele is represented in
2906 brackets in the order present in the 'alleles' parameter.
2908 For the following data:
2910 >testseq
2911 GGATCCATTGCTACT
2912 >allele1
2913 GGATCCATTCCTACT
2914 >allele2
2915 GGAT--ATTCCTCCT
2917 the returned string with parameters 'refseq => testseq' and
2918 'alleles => [qw(allele1 allele2)]' would be:
2920 GGAT[C/-][C/-]ATT[C/C]CT[A/C]CT
2921 Returns : BIC-formatted string
2922 Argument : Required args
2923 refseq : string (ID) of the reference sequence used
2924 as basis for comparison
2925 allele1 : string (ID) of the first allele
2926 allele2 : string (ID) of the second allele
2927 Optional args
2928 delimiters: two symbol string of left and right delimiters.
2929 Only the first two symbols are used
2930 default = '[]'
2931 separator : string used as a separator. Only the first
2932 symbol is used
2933 default = '/'
2934 Throws : On no refseq/alleles, or invalid refseq/alleles.
2936 =cut
2938 sub bracket_string {
2939 my ($self, @args) = @_;
2940 my ($ref, $a1, $a2, $delim, $sep) =
2941 $self->_rearrange([qw(refseq allele1 allele2 delimiters separator)], @args);
2942 $self->throw('Missing refseq/allele1/allele2') if (!$a1 || !$a2 || !$ref);
2943 my ($ld, $rd);
2944 ($ld, $rd) = split('', $delim, 2) if $delim;
2945 $ld ||= '[';
2946 $rd ||= ']';
2947 $sep ||= '/';
2948 my ($refseq, $allele1, $allele2) =
2949 map {( $self->each_seq_with_id($_) )} ($ref, $a1, $a2);
2950 if (!$refseq || !$allele1 || !$allele2) {
2951 $self->throw("One of your refseq/allele IDs is invalid!");
2953 my $len = $self->length-1;
2954 my $bic = '';
2955 # loop over the alignment columns
2956 for my $column ( 0 .. $len ) {
2957 my $string;
2958 my ($compres, $res1, $res2) =
2959 map{substr($_->seq, $column, 1)} ($refseq, $allele1, $allele2);
2960 # are any of the allele symbols different from the refseq?
2961 $string = ($compres eq $res1 && $compres eq $res2) ? $compres :
2962 $ld.$res1.$sep.$res2.$rd;
2963 $bic .= $string;
2965 return $bic;
2969 =head2 methods implementing Bio::FeatureHolderI
2971 FeatureHolderI implementation to support labeled character sets like one
2972 would get from NEXUS represented data.
2974 =head2 get_SeqFeatures
2976 Usage : @features = $aln->get_SeqFeatures
2977 Function: Get the feature objects held by this feature holder.
2978 Example :
2979 Returns : an array of Bio::SeqFeatureI implementing objects
2980 Args : optional filter coderef, taking a Bio::SeqFeatureI
2981 : as argument, returning TRUE if wanted, FALSE if
2982 : unwanted
2984 =cut
2986 sub get_SeqFeatures {
2987 my $self = shift;
2988 my $filter_cb = shift;
2989 $self->throw("Arg (filter callback) must be a coderef") unless
2990 !defined($filter_cb) or ref($filter_cb) eq 'CODE';
2991 if( !defined $self->{'_as_feat'} ) {
2992 $self->{'_as_feat'} = [];
2994 if ($filter_cb) {
2995 return grep { $filter_cb->($_) } @{$self->{'_as_feat'}};
2997 return @{$self->{'_as_feat'}};
3000 =head2 add_SeqFeature
3002 Usage : $aln->add_SeqFeature($subfeat);
3003 Function: adds a SeqFeature into the SeqFeature array.
3004 Example :
3005 Returns : true on success
3006 Args : a Bio::SeqFeatureI object
3007 Note : This implementation is not compliant
3008 with Bio::FeatureHolderI
3010 =cut
3012 sub add_SeqFeature {
3013 my ($self,@feat) = @_;
3015 $self->{'_as_feat'} = [] unless $self->{'_as_feat'};
3017 foreach my $feat ( @feat ) {
3018 if( !$feat->isa("Bio::SeqFeatureI") ) {
3019 $self->throw("$feat is not a SeqFeatureI and that's what we expect...");
3022 push(@{$self->{'_as_feat'}},$feat);
3024 return 1;
3028 =head2 remove_SeqFeatures
3030 Usage : $obj->remove_SeqFeatures
3031 Function: Removes all SeqFeatures. If you want to remove only a subset,
3032 remove that subset from the returned array, and add back the rest.
3033 Returns : The array of Bio::SeqFeatureI features that was
3034 deleted from this alignment.
3035 Args : none
3037 =cut
3039 sub remove_SeqFeatures {
3040 my $self = shift;
3042 return () unless $self->{'_as_feat'};
3043 my @feats = @{$self->{'_as_feat'}};
3044 $self->{'_as_feat'} = [];
3045 return @feats;
3048 =head2 feature_count
3050 Title : feature_count
3051 Usage : $obj->feature_count()
3052 Function: Return the number of SeqFeatures attached to the alignment
3053 Returns : integer representing the number of SeqFeatures
3054 Args : None
3056 =cut
3058 sub feature_count {
3059 my ($self) = @_;
3061 if (defined($self->{'_as_feat'})) {
3062 return ($#{$self->{'_as_feat'}} + 1);
3063 } else {
3064 return 0;
3068 =head2 get_all_SeqFeatures
3070 Title : get_all_SeqFeatures
3071 Usage :
3072 Function: Get all SeqFeatures.
3073 Example :
3074 Returns : an array of Bio::SeqFeatureI implementing objects
3075 Args : none
3076 Note : Falls through to Bio::FeatureHolderI implementation.
3078 =cut
3080 =head2 methods for Bio::AnnotatableI
3082 AnnotatableI implementation to support sequence alignments which
3083 contain annotation (NEXUS, Stockholm).
3085 =head2 annotation
3087 Title : annotation
3088 Usage : $ann = $aln->annotation or
3089 $aln->annotation($ann)
3090 Function: Gets or sets the annotation
3091 Returns : Bio::AnnotationCollectionI object
3092 Args : None or Bio::AnnotationCollectionI object
3094 See L<Bio::AnnotationCollectionI> and L<Bio::Annotation::Collection>
3095 for more information
3097 =cut
3099 sub annotation {
3100 my ($obj,$value) = @_;
3101 if( defined $value ) {
3102 $obj->throw("object of class ".ref($value)." does not implement ".
3103 "Bio::AnnotationCollectionI. Too bad.")
3104 unless $value->isa("Bio::AnnotationCollectionI");
3105 $obj->{'_annotation'} = $value;
3106 } elsif( ! defined $obj->{'_annotation'}) {
3107 $obj->{'_annotation'} = Bio::Annotation::Collection->new();
3109 return $obj->{'_annotation'};
3112 =head1 Deprecated methods
3114 =cut
3116 =head2 no_residues
3118 Title : no_residues
3119 Usage : $no = $ali->no_residues
3120 Function : number of residues in total in the alignment
3121 Returns : integer
3122 Argument :
3123 Note : deprecated in favor of num_residues()
3125 =cut
3127 sub no_residues {
3128 my $self = shift;
3129 $self->deprecated(-warn_version => 1.0069,
3130 -throw_version => 1.0075,
3131 -message => 'Use of method no_residues() is deprecated, use num_residues() instead');
3132 $self->num_residues(@_);
3135 =head2 no_sequences
3137 Title : no_sequences
3138 Usage : $depth = $ali->no_sequences
3139 Function : number of sequence in the sequence alignment
3140 Returns : integer
3141 Argument :
3142 Note : deprecated in favor of num_sequences()
3144 =cut
3146 sub no_sequences {
3147 my $self = shift;
3148 $self->deprecated(-warn_version => 1.0069,
3149 -throw_version => 1.0075,
3150 -message => 'Use of method no_sequences() is deprecated, use num_sequences() instead');
3151 $self->num_sequences(@_);
3154 =head2 mask_columns
3156 Title : mask_columns
3157 Usage : $aln2 = $aln->mask_columns(20,30)
3158 Function : Masks a slice of the alignment inclusive of start and
3159 end columns, and the first column in the alignment is denoted 1.
3160 Mask beyond the length of the sequence does not do padding.
3161 Returns : A Bio::SimpleAlign object
3162 Args : Positive integer for start column, positive integer for end column,
3163 optional string value use for the mask. Example:
3165 $aln2 = $aln->mask_columns(20,30,'?')
3166 Note : Masking must use a character that is not used for gaps or
3167 frameshifts. These can be adjusted using the relevant global
3168 variables, but be aware these may be (uncontrollably) modified
3169 elsewhere within BioPerl (see bug 2715)
3171 =cut
3173 sub mask_columns {
3174 #based on slice(), but did not include the Bio::Seq::Meta sections as I was not sure what it is doing
3175 my $self = shift;
3177 my $nonres = $Bio::LocatableSeq::GAP_SYMBOLS.
3178 $Bio::LocatableSeq::FRAMESHIFT_SYMBOLS;
3180 # coordinates are alignment-based, not sequence-based
3181 my ($start, $end, $mask_char) = @_;
3182 unless (defined $mask_char) { $mask_char = 'N' }
3184 $self->throw("Mask start has to be a positive integer and less than ".
3185 "alignment length, not [$start]")
3186 unless $start =~ /^\d+$/ && $start > 0 && $start <= $self->length;
3187 $self->throw("Mask end has to be a positive integer and less than ".
3188 "alignment length, not [$end]")
3189 unless $end =~ /^\d+$/ && $end > 0 && $end <= $self->length;
3190 $self->throw("Mask start [$start] has to be smaller than or equal to ".
3191 "end [$end]") unless $start <= $end;
3192 $self->throw("Mask character $mask_char has to be a single character ".
3193 "and not a gap or frameshift symbol")
3194 unless CORE::length($mask_char) == 1 && $mask_char !~ m{$nonres};
3196 my $aln = $self->new;
3197 $aln->id($self->id);
3198 foreach my $seq ( $self->each_seq() ) {
3199 my $new_seq = Bio::LocatableSeq->new(-id => $seq->id,
3200 -alphabet => $seq->alphabet,
3201 -strand => $seq->strand,
3202 -verbose => $self->verbose);
3204 # convert from 1-based alignment coords!
3205 my $masked_string = substr($seq->seq, $start - 1, $end - $start + 1);
3206 $masked_string =~ s{[^$nonres]}{$mask_char}g;
3207 my $new_dna_string = substr($seq->seq,0,$start-1) . $masked_string . substr($seq->seq,$end);
3208 $new_seq->seq($new_dna_string);
3209 $aln->add_seq($new_seq);
3211 return $aln;