add some significant milestones
[bioperl-live.git] / Bio / SimpleAlign.pm
blob3e3b3467ca6d2dc8515b34d6f2fda77e579759e2
1 # $Id$
2 # BioPerl module for SimpleAlign
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Heikki Lehvaslaiho <heikki-at-bioperl-dot-org>
8 # Copyright Ewan Birney
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 # History:
15 # 11/3/00 Added threshold feature to consensus and consensus_aa - PS
16 # May 2001 major rewrite - Heikki Lehvaslaiho
18 =head1 NAME
20 Bio::SimpleAlign - Multiple alignments held as a set of sequences
22 =head1 SYNOPSIS
24 # Use Bio::AlignIO to read in the alignment
25 $str = Bio::AlignIO->new(-file => 't/data/testaln.pfam');
26 $aln = $str->next_aln();
28 # Describe
29 print $aln->length;
30 print $aln->num_residues;
31 print $aln->is_flush;
32 print $aln->num_sequences;
33 print $aln->score;
34 print $aln->percentage_identity;
35 print $aln->consensus_string(50);
37 # Find the position in the alignment for a sequence location
38 $pos = $aln->column_from_residue_number('1433_LYCES', 14); # = 6;
40 # Extract sequences and check values for the alignment column $pos
41 foreach $seq ($aln->each_seq) {
42 $res = $seq->subseq($pos, $pos);
43 $count{$res}++;
45 foreach $res (keys %count) {
46 printf "Res: %s Count: %2d\n", $res, $count{$res};
49 # Manipulate
50 $aln->remove_seq($seq);
51 $mini_aln = $aln->slice(20,30); # get a block of columns
52 $mini_aln = $aln->select_noncont(1,3,5,7,11); # select certain sequences
53 $new_aln = $aln->remove_columns([20,30]); # remove by position
54 $new_aln = $aln->remove_columns(['mismatch']); # remove by property
56 # Analyze
57 $str = $aln->consensus_string($threshold_percent);
58 $str = $aln->match_line();
59 $str = $aln->cigar_line();
60 $id = $aln->percentage_identity;
62 # See the module documentation for details and more methods.
64 =head1 DESCRIPTION
66 SimpleAlign is an object that handles a multiple sequence alignment
67 (MSA). It is very permissive of types (it does not insist on sequences
68 being all same length, for example). Think of it as a set of sequences
69 with a whole series of built-in manipulations and methods for reading and
70 writing alignments.
72 SimpleAlign uses L<Bio::LocatableSeq>, a subclass of L<Bio::PrimarySeq>,
73 to store its sequences. These are subsequences with a start and end
74 positions in the parent reference sequence. Each sequence in the
75 SimpleAlign object is a Bio::LocatableSeq.
77 SimpleAlign expects the combination of name, start, and end for a
78 given sequence to be unique in the alignment, and this is the key for the
79 internal hashes (name, start, end are abbreviated C<nse> in the code).
80 However, in some cases people do not want the name/start-end to be displayed:
81 either multiple names in an alignment or names specific to the alignment
82 (ROA1_HUMAN_1, ROA1_HUMAN_2 etc). These names are called
83 C<displayname>, and generally is what is used to print out the
84 alignment. They default to name/start-end.
86 The SimpleAlign Module is derived from the Align module by Ewan Birney.
88 =head1 FEEDBACK
90 =head2 Mailing Lists
92 User feedback is an integral part of the evolution of this and other
93 Bioperl modules. Send your comments and suggestions preferably to one
94 of the Bioperl mailing lists. Your participation is much appreciated.
96 bioperl-l@bioperl.org - General discussion
97 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
99 =head2 Support
101 Please direct usage questions or support issues to the mailing list:
103 I<bioperl-l@bioperl.org>
105 rather than to the module maintainer directly. Many experienced and
106 reponsive experts will be able look at the problem and quickly
107 address it. Please include a thorough description of the problem
108 with code and data examples if at all possible.
110 =head2 Reporting Bugs
112 Report bugs to the Bioperl bug tracking system to help us keep track
113 the bugs and their resolution. Bug reports can be submitted via the
114 web:
116 http://bugzilla.open-bio.org/
118 =head1 AUTHOR
120 Ewan Birney, birney@ebi.ac.uk
122 =head1 CONTRIBUTORS
124 Allen Day, allenday-at-ucla.edu,
125 Richard Adams, Richard.Adams-at-ed.ac.uk,
126 David J. Evans, David.Evans-at-vir.gla.ac.uk,
127 Heikki Lehvaslaiho, heikki-at-bioperl-dot-org,
128 Allen Smith, allens-at-cpan.org,
129 Jason Stajich, jason-at-bioperl.org,
130 Anthony Underwood, aunderwood-at-phls.org.uk,
131 Xintao Wei & Giri Narasimhan, giri-at-cs.fiu.edu
132 Brian Osborne, bosborne at alum.mit.edu
133 Weigang Qiu, Weigang at GENECTR-HUNTER-CUNY-EDU
134 Hongyu Zhang, forward at hongyu.org
135 Jay Hannah, jay at jays.net
136 Alexandr Bezginov, albezg at gmail.com
138 =head1 SEE ALSO
140 L<Bio::LocatableSeq>
142 =head1 APPENDIX
144 The rest of the documentation details each of the object
145 methods. Internal methods are usually preceded with a _
147 =cut
149 # 'Let the code begin...
151 package Bio::SimpleAlign;
152 use vars qw(%CONSERVATION_GROUPS);
153 use strict;
155 use Bio::LocatableSeq; # uses Seq's as list
157 use Bio::Seq;
158 use Bio::SeqFeature::Generic;
160 BEGIN {
161 # This data should probably be in a more centralized module...
162 # it is taken from Clustalw documentation.
163 # These are all the positively scoring groups that occur in the
164 # Gonnet Pam250 matrix. The strong and weak groups are
165 # defined as strong score >0.5 and weak score =<0.5 respectively.
167 %CONSERVATION_GROUPS = (
168 'strong' => [ qw(
170 NEQK
171 NHQK
172 NDEQ
173 QHRK
174 MILV
175 MILF
177 FYW )],
178 'weak' => [ qw(
182 STNK
183 STPA
184 SGND
185 SNDEQK
186 NDEQHK
187 NEQHRK
188 FVLIM
189 HFY )],);
192 use base qw(Bio::Root::Root Bio::Align::AlignI Bio::AnnotatableI
193 Bio::FeatureHolderI);
195 =head2 new
197 Title : new
198 Usage : my $aln = Bio::SimpleAlign->new();
199 Function : Creates a new simple align object
200 Returns : Bio::SimpleAlign
201 Args : -source => string representing the source program
202 where this alignment came from
203 -annotation => Bio::AnnotationCollectionI
204 -seq_annotation => Bio::AnnotationCollectionI for sequences (requires -annotation also be set)
205 -seqs => array ref containing Bio::LocatableSeq or Bio::Seq::Meta
206 -consensus => consensus string
207 -consensus_meta => Bio::Seq::Meta object containing consensus met information (kludge)
209 =cut
212 sub new {
213 my($class,@args) = @_;
215 my $self = $class->SUPER::new(@args);
217 my ($src, $score, $id, $acc, $desc, $seqs, $feats, $coll, $sa, $con, $cmeta) = $self->_rearrange([qw(
218 SOURCE
219 SCORE
221 ACCESSION
222 DESCRIPTION
223 SEQS
224 FEATURES
225 ANNOTATION
226 SEQ_ANNOTATION
227 CONSENSUS
228 CONSENSUS_META
229 )], @args);
230 $src && $self->source($src);
231 defined $score && $self->score($score);
232 # we need to set up internal hashs first!
234 $self->{'_seq'} = {};
235 $self->{'_order'} = {};
236 $self->{'_start_end_lists'} = {};
237 $self->{'_dis_name'} = {};
238 $self->{'_id'} = 'NoName';
239 # maybe we should automatically read in from args. Hmmm...
240 $id && $self->id($id);
241 $acc && $self->accession($acc);
242 $desc && $self->description($desc);
243 $coll && $self->annotation($coll);
244 # sequence annotation is layered into a provided annotation collection (or dies)
245 if ($sa) {
246 $self->throw("Must supply an alignment-based annotation collection (-annotation) ".
247 "with a sequence annotation collection")
248 if !$coll;
249 $coll->add_Annotation('seq_annotation', $sa);
251 if ($feats && ref $feats eq 'ARRAY') {
252 for my $feat (@$feats) {
253 $self->add_SeqFeature($feat);
256 $con && $self->consensus($con);
257 $cmeta && $self->consensus_meta($cmeta);
258 # assumes these are in correct alignment order
259 if ($seqs && ref($seqs) eq 'ARRAY') {
260 for my $seq (@$seqs) {
261 $self->add_seq($seq);
265 return $self; # success - we hope!
268 =head1 Modifier methods
270 These methods modify the MSA by adding, removing or shuffling complete
271 sequences.
273 =head2 add_seq
275 Title : add_seq
276 Usage : $myalign->add_seq($newseq);
277 $myalign->add_seq(-SEQ=>$newseq, -ORDER=>5);
278 Function : Adds another sequence to the alignment. *Does not* align
279 it - just adds it to the hashes.
280 If -ORDER is specified, the sequence is inserted at the
281 the position spec'd by -ORDER, and existing sequences
282 are pushed down the storage array.
283 Returns : nothing
284 Args : A Bio::LocatableSeq object
285 Positive integer for the sequence position (optional)
287 See L<Bio::LocatableSeq> for more information
289 =cut
291 sub addSeq {
292 my $self = shift;
293 $self->deprecated("addSeq - deprecated method. Use add_seq() instead.");
294 $self->add_seq(@_);
297 sub add_seq {
298 my $self = shift;
299 my @args = @_;
300 my ($seq, $order) = $self->_rearrange([qw(SEQ ORDER)], @args);
301 my ($name,$id,$start,$end);
303 unless ($seq) {
304 $self->throw("LocatableSeq argument required");
306 if( ! ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
307 $self->throw("Unable to process non locatable sequences [". ref($seq). "]");
309 !defined($order) and $order = 1 + keys %{$self->{'_seq'}}; # default
310 $order--; # jay's patch (user-specified order is 1-origin)
312 if ($order < 0) {
313 $self->throw("User-specified value for ORDER must be >= 1");
316 $id = $seq->id() ||$seq->display_id || $seq->primary_id;
318 # build the symbol list for this sequence,
319 # will prune out the gap and missing/match chars
320 # when actually asked for the symbol list in the
321 # symbol_chars
322 # map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq) if $seq->seq;
324 $name = $seq->get_nse;
326 if( $self->{'_seq'}->{$name} ) {
327 $self->warn("Replacing one sequence [$name]\n") unless $self->verbose < 0;
329 else {
330 $self->debug( "Assigning $name to $order\n");
332 my $ordh = $self->{'_order'};
333 if ($ordh->{$order}) {
334 # make space to insert
335 # $c->() returns (in reverse order) the first subsequence
336 # of consecutive integers; i.e., $c->(1,2,3,5,6,7) returns
337 # (3,2,1), and $c->(2,4,5) returns (2).
338 my $c;
339 $c = sub { return (($_[1]-$_[0] == 1) ? ($c->(@_[1..$#_]),$_[0]) : $_[0]); };
340 map {
341 $ordh->{$_+1} = $ordh->{$_}
342 } $c->(sort {$a <=> $b} grep {$_ >= $order} keys %{$ordh});
345 $ordh->{$order} = $name;
347 unless( exists( $self->{'_start_end_lists'}->{$id})) {
348 $self->{'_start_end_lists'}->{$id} = [];
350 push @{$self->{'_start_end_lists'}->{$id}}, $seq;
353 $self->{'_seq'}->{$name} = $seq;
358 =head2 remove_seq
360 Title : remove_seq
361 Usage : $aln->remove_seq($seq);
362 Function : Removes a single sequence from an alignment
363 Returns :
364 Argument : a Bio::LocatableSeq object
366 =cut
368 sub removeSeq {
369 my $self = shift;
370 $self->deprecated("removeSeq - deprecated method. Use remove_seq() instead.");
371 $self->remove_seq(@_);
374 sub remove_seq {
375 my $self = shift;
376 my $seq = shift;
377 my ($name,$id,$start,$end);
379 $self->throw("Need Bio::Locatable seq argument ")
380 unless ref $seq && $seq->isa( 'Bio::LocatableSeq');
382 $id = $seq->id();
383 $start = $seq->start();
384 $end = $seq->end();
385 $name = sprintf("%s/%d-%d",$id,$start,$end);
387 if( !exists $self->{'_seq'}->{$name} ) {
388 $self->throw("Sequence $name does not exist in the alignment to remove!");
391 delete $self->{'_seq'}->{$name};
393 # we need to remove this seq from the start_end_lists hash
395 if (exists $self->{'_start_end_lists'}->{$id}) {
396 # we need to find the sequence in the array.
398 my ($i, $found);;
399 for ($i=0; $i < @{$self->{'_start_end_lists'}->{$id}}; $i++) {
400 if (${$self->{'_start_end_lists'}->{$id}}[$i] eq $seq) {
401 $found = 1;
402 last;
405 if ($found) {
406 splice @{$self->{'_start_end_lists'}->{$id}}, $i, 1;
408 else {
409 $self->throw("Could not find the sequence to remoce from the start-end list");
412 else {
413 $self->throw("There is no seq list for the name $id");
415 # we need to shift order hash
416 my %rev_order = reverse %{$self->{'_order'}};
417 my $no = $rev_order{$name};
418 my $num_sequences = $self->num_sequences;
419 for (; $no < $num_sequences; $no++) {
420 $self->{'_order'}->{$no} = $self->{'_order'}->{$no+1};
422 delete $self->{'_order'}->{$no};
423 return 1;
427 =head2 purge
429 Title : purge
430 Usage : $aln->purge(0.7);
431 Function: Removes sequences above given sequence similarity
432 This function will grind on large alignments. Beware!
433 Example :
434 Returns : An array of the removed sequences
435 Args : float, threshold for similarity
437 =cut
439 sub purge {
440 my ($self,$perc) = @_;
441 my (%duplicate, @dups);
443 my @seqs = $self->each_seq();
445 for (my $i=0;$i< @seqs - 1;$i++ ) { #for each seq in alignment
446 my $seq = $seqs[$i];
448 #skip if already in duplicate hash
449 next if exists $duplicate{$seq->display_id} ;
450 my $one = $seq->seq();
452 my @one = split '', $one; #split to get 1aa per array element
454 for (my $j=$i+1;$j < @seqs;$j++) {
455 my $seq2 = $seqs[$j];
457 #skip if already in duplicate hash
458 next if exists $duplicate{$seq2->display_id} ;
460 my $two = $seq2->seq();
461 my @two = split '', $two;
463 my $count = 0;
464 my $res = 0;
465 for (my $k=0;$k<@one;$k++) {
466 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
467 $one[$k] eq $two[$k]) {
468 $count++;
470 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
471 $two[$k] ne '.' && $two[$k] ne '-' ) {
472 $res++;
476 my $ratio = 0;
477 $ratio = $count/$res unless $res == 0;
479 # if above threshold put in duplicate hash and push onto
480 # duplicate array for returning to get_unique
481 if ( $ratio > $perc ) {
482 $self->warn("duplicate: ", $seq2->display_id) if $self->verbose > 0;
483 $duplicate{$seq2->display_id} = 1;
484 push @dups, $seq2;
488 foreach my $seq (@dups) {
489 $self->remove_seq($seq);
491 return @dups;
494 =head2 sort_alphabetically
496 Title : sort_alphabetically
497 Usage : $ali->sort_alphabetically
498 Function : Changes the order of the alignment to alphabetical on name
499 followed by numerical by number.
500 Returns :
501 Argument :
503 =cut
505 sub sort_alphabetically {
506 my $self = shift;
507 my ($seq,$nse,@arr,%hash,$count);
509 foreach $seq ( $self->each_seq() ) {
510 $nse = $seq->get_nse;
511 $hash{$nse} = $seq;
514 $count = 0;
516 %{$self->{'_order'}} = (); # reset the hash;
518 foreach $nse ( sort _alpha_startend keys %hash) {
519 $self->{'_order'}->{$count} = $nse;
521 $count++;
526 =head2 sort_by_list
528 Title : sort_by_list
529 Usage : $aln_ordered=$aln->sort_by_list($list_file)
530 Function : Arbitrarily order sequences in an alignment
531 Returns : A new Bio::SimpleAlign object
532 Argument : a file listing sequence names in intended order (one name per line)
534 =cut
536 sub sort_by_list {
537 my ($self, $list) = @_;
538 my (@seq, @ids, %order);
540 foreach my $seq ( $self->each_seq() ) {
541 push @seq, $seq;
542 push @ids, $seq->display_id;
545 my $ct=1;
546 open(my $listfh, '<', $list) || $self->throw("can't open file for reading: $list");
547 while (<$listfh>) {
548 chomp;
549 my $name=$_;
550 $self->throw("Not found in alignment: $name") unless &_in_aln($name, \@ids);
551 $order{$name}=$ct++;
553 close($listfh);
555 # use the map-sort-map idiom:
556 my @sorted= map { $_->[1] } sort { $a->[0] <=> $b->[0] } map { [$order{$_->id()}, $_] } @seq;
557 my $aln = $self->new;
558 foreach (@sorted) { $aln->add_seq($_) }
559 return $aln;
562 =head2 set_new_reference
564 Title : set_new_reference
565 Usage : $aln->set_new_reference(3 or 'B31'): Select the 3rd sequence, or
566 the sequence whoes name is "B31" (full, exact, and case-sensitive),
567 as the reference (1st) sequence
568 Function : Change/Set a new reference (i.e., the first) sequence
569 Returns : a new Bio::SimpleAlign object.
570 Throws an exception if designated sequence not found
571 Argument : a positive integer of sequence order, or a sequence name
572 in the original alignment
574 =cut
576 sub set_new_reference {
577 my ($self, $seqid) = @_;
578 my $aln = $self->new;
579 my (@seq, @ids, @new_seq);
580 my $is_num=0;
581 foreach my $seq ( $self->each_seq() ) {
582 push @seq, $seq;
583 push @ids, $seq->display_id;
586 if ($seqid =~ /^\d+$/) { # argument is seq position
587 $is_num=1;
588 $self->throw("The new reference sequence number has to be a positive integer >1 and <= num_sequences ") if ($seqid <= 1 || $seqid > $self->num_sequences);
589 } else { # argument is a seq name
590 $self->throw("The new reference sequence not in alignment ") unless &_in_aln($seqid, \@ids);
593 for (my $i=0; $i<=$#seq; $i++) {
594 my $pos=$i+1;
595 if ( ($is_num && $pos == $seqid) || ($seqid eq $seq[$i]->display_id) ) {
596 unshift @new_seq, $seq[$i];
597 } else {
598 push @new_seq, $seq[$i];
601 foreach (@new_seq) { $aln->add_seq($_); }
602 return $aln;
605 sub _in_aln { # check if input name exists in the alignment
606 my ($str, $ref) = @_;
607 foreach (@$ref) {
608 return 1 if $str eq $_;
610 return 0;
614 =head2 uniq_seq
616 Title : uniq_seq
617 Usage : $aln->uniq_seq(): Remove identical sequences in
618 in the alignment. Ambiguous base ("N", "n") and
619 leading and ending gaps ("-") are NOT counted as
620 differences.
621 Function : Make a new alignment of unique sequence types (STs)
622 Returns : 1a. if called in a scalar context,
623 a new Bio::SimpleAlign object (all sequences renamed as "ST")
624 1b. if called in an array context,
625 a new Bio::SimpleAlign object, and a hashref whose keys
626 are sequence types, and whose values are arrayrefs to
627 lists of sequence ids within the corresponding sequence type
628 2. if $aln->verbose > 0, ST of each sequence is sent to
629 STDERR (in a tabular format)
630 Argument : None
632 =cut
634 sub uniq_seq {
635 my ($self, $seqid) = @_;
636 my $aln = $self->new;
637 my (%member, %order, @seq, @uniq_str, $st);
638 my $order=0;
639 my $len = $self->length();
640 $st = {};
641 foreach my $seq ( $self->each_seq() ) {
642 my $str = $seq->seq();
644 # it's necessary to ignore "n", "N", leading gaps and ending gaps in
645 # comparing two sequence strings
647 # 1st, convert "n", "N" to "?" (for DNA sequence only):
648 $str =~ s/n/\?/gi if $str =~ /^[atcgn-]+$/i;
649 # 2nd, convert leading and ending gaps to "?":
650 $str = &_convert_leading_ending_gaps($str, '-', '?');
651 # Note that '?' also can mean unknown residue.
652 # I don't like making global class member changes like this, too
653 # prone to errors... -- cjfields 08-11-18
654 local $Bio::LocatableSeq::GAP_SYMBOLS = '-\?';
655 my $new = Bio::LocatableSeq->new(
656 -id => $seq->id(),
657 -alphabet=> $seq->alphabet,
658 -seq => $str,
659 -start => $seq->start,
660 -end => $seq->end
662 push @seq, $new;
665 foreach my $seq (@seq) {
666 my $str = $seq->seq();
667 my ($seen, $key) = &_check_uniq($str, \@uniq_str, $len);
668 if ($seen) { # seen before
669 my @memb = @{$member{$key}};
670 push @memb, $seq;
671 $member{$key} = \@memb;
672 } else { # not seen
673 push @uniq_str, $key;
674 $order++;
675 $member{$key} = [ ($seq) ];
676 $order{$key} = $order;
680 foreach my $str (sort {$order{$a} <=> $order{$b}} keys %order) { # sort by input order
681 # convert leading/ending "?" back into "-" ("?" throws errors by SimpleAlign):
682 my $str2 = &_convert_leading_ending_gaps($str, '?', '-');
683 # convert middle "?" back into "N" ("?" throws errors by SimpleAlign):
684 $str2 =~ s/\?/N/g if $str2 =~ /^[atcg\-\?]+$/i;
685 my $gap='-';
686 my $end= CORE::length($str2);
687 $end -= CORE::length($1) while $str2 =~ m/($gap+)/g;
688 my $new = Bio::LocatableSeq->new(-id =>"ST".$order{$str},
689 -seq =>$str2,
690 -start=>1,
691 -end =>$end
693 $aln->add_seq($new);
694 foreach (@{$member{$str}}) {
695 push @{$$st{$order{$str}}}, $_->id(); # per Tristan's patch/Bug #2805
696 $self->debug($_->id(), "\t", "ST", $order{$str}, "\n");
699 return wantarray ? ($aln, $st) : $aln;
702 sub _check_uniq { # check if same seq exists in the alignment
703 my ($str1, $ref, $length) = @_;
704 my @char1=split //, $str1;
705 my @array=@$ref;
707 return (0, $str1) if @array==0; # not seen (1st sequence)
709 foreach my $str2 (@array) {
710 my $diff=0;
711 my @char2=split //, $str2;
712 for (my $i=0; $i<=$length-1; $i++) {
713 next if $char1[$i] eq '?';
714 next if $char2[$i] eq '?';
715 $diff++ if $char1[$i] ne $char2[$i];
717 return (1, $str2) if $diff == 0; # seen before
720 return (0, $str1); # not seen
723 sub _convert_leading_ending_gaps {
724 my $s=shift;
725 my $sym1=shift;
726 my $sym2=shift;
727 my @array=split //, $s;
728 # convert leading char:
729 for (my $i=0; $i<=$#array; $i++) {
730 ($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
732 # convert ending char:
733 for (my $i = $#array; $i>= 0; $i--) {
734 ($array[$i] eq $sym1) ? ($array[$i] = $sym2):(last);
736 my $s_new=join '', @array;
737 return $s_new;
740 =head1 Sequence selection methods
742 Methods returning one or more sequences objects.
744 =head2 each_seq
746 Title : each_seq
747 Usage : foreach $seq ( $align->each_seq() )
748 Function : Gets a Seq object from the alignment
749 Returns : Seq object
750 Argument :
752 =cut
754 sub eachSeq {
755 my $self = shift;
756 $self->deprecated("eachSeq - deprecated method. Use each_seq() instead.");
757 $self->each_seq();
760 sub each_seq {
761 my $self = shift;
762 my (@arr,$order);
764 foreach $order ( sort { $a <=> $b } keys %{$self->{'_order'}} ) {
765 if( exists $self->{'_seq'}->{$self->{'_order'}->{$order}} ) {
766 push(@arr,$self->{'_seq'}->{$self->{'_order'}->{$order}});
769 return @arr;
773 =head2 each_alphabetically
775 Title : each_alphabetically
776 Usage : foreach $seq ( $ali->each_alphabetically() )
777 Function : Returns a sequence object, but the objects are returned
778 in alphabetically sorted order.
779 Does not change the order of the alignment.
780 Returns : Seq object
781 Argument :
783 =cut
785 sub each_alphabetically {
786 my $self = shift;
787 my ($seq,$nse,@arr,%hash,$count);
789 foreach $seq ( $self->each_seq() ) {
790 $nse = $seq->get_nse;
791 $hash{$nse} = $seq;
794 foreach $nse ( sort _alpha_startend keys %hash) {
795 push(@arr,$hash{$nse});
797 return @arr;
800 sub _alpha_startend {
801 my ($aname,$astart,$bname,$bstart);
802 ($aname,$astart) = split (/-/,$a);
803 ($bname,$bstart) = split (/-/,$b);
805 if( $aname eq $bname ) {
806 return $astart <=> $bstart;
808 else {
809 return $aname cmp $bname;
813 =head2 each_seq_with_id
815 Title : each_seq_with_id
816 Usage : foreach $seq ( $align->each_seq_with_id() )
817 Function : Gets a Seq objects from the alignment, the contents
818 being those sequences with the given name (there may be
819 more than one)
820 Returns : Seq object
821 Argument : a seq name
823 =cut
825 sub eachSeqWithId {
826 my $self = shift;
827 $self->deprecated("eachSeqWithId - deprecated method. Use each_seq_with_id() instead.");
828 $self->each_seq_with_id(@_);
831 sub each_seq_with_id {
832 my $self = shift;
833 my $id = shift;
835 $self->throw("Method each_seq_with_id needs a sequence name argument")
836 unless defined $id;
838 my (@arr, $seq);
840 if (exists($self->{'_start_end_lists'}->{$id})) {
841 @arr = @{$self->{'_start_end_lists'}->{$id}};
843 return @arr;
846 =head2 get_seq_by_pos
848 Title : get_seq_by_pos
849 Usage : $seq = $aln->get_seq_by_pos(3) # third sequence from the alignment
850 Function : Gets a sequence based on its position in the alignment.
851 Numbering starts from 1. Sequence positions larger than
852 num_sequences() will thow an error.
853 Returns : a Bio::LocatableSeq object
854 Args : positive integer for the sequence position
856 =cut
858 sub get_seq_by_pos {
860 my $self = shift;
861 my ($pos) = @_;
863 $self->throw("Sequence position has to be a positive integer, not [$pos]")
864 unless $pos =~ /^\d+$/ and $pos > 0;
865 $self->throw("No sequence at position [$pos]")
866 unless $pos <= $self->num_sequences ;
868 my $nse = $self->{'_order'}->{--$pos};
869 return $self->{'_seq'}->{$nse};
872 =head2 get_seq_by_id
874 Title : get_seq_by_id
875 Usage : $seq = $aln->get_seq_by_id($name) # seq named $name
876 Function : Gets a sequence based on its name.
877 Sequences that do not exist will warn and return undef
878 Returns : a Bio::LocatableSeq object
879 Args : string for sequence name
881 =cut
883 sub get_seq_by_id {
884 my ($self,$name) = @_;
885 unless( defined $name ) {
886 $self->warn("Must provide a sequence name");
887 return;
889 for my $seq ( values %{$self->{'_seq'}} ) {
890 if ( $seq->id eq $name) {
891 return $seq;
894 return;
897 =head2 seq_with_features
899 Title : seq_with_features
900 Usage : $seq = $aln->seq_with_features(-pos => 1,
901 -consensus => 60
902 -mask =>
903 sub { my $consensus = shift;
905 for my $i (1..5){
906 my $n = 'N' x $i;
907 my $q = '\?' x $i;
908 while($consensus =~ /[^?]$q[^?]/){
909 $consensus =~ s/([^?])$q([^?])/$1$n$2/;
912 return $consensus;
915 Function: produces a Bio::Seq object by first splicing gaps from -pos
916 (by means of a splice_by_seq_pos() call), then creating
917 features using non-? chars (by means of a consensus_string()
918 call with stringency -consensus).
919 Returns : a Bio::Seq object
920 Args : -pos : required. sequence from which to build the Bio::Seq
921 object
922 -consensus : optional, defaults to consensus_string()'s
923 default cutoff value
924 -mask : optional, a coderef to apply to consensus_string()'s
925 output before building features. this may be useful for
926 closing gaps of 1 bp by masking over them with N, for
927 instance
929 =cut
931 sub seq_with_features{
932 my ($self,%arg) = @_;
934 #first do the preparatory splice
935 $self->throw("must provide a -pos argument") unless $arg{-pos};
936 $self->splice_by_seq_pos($arg{-pos});
938 my $consensus_string = $self->consensus_string($arg{-consensus});
939 $consensus_string = $arg{-mask}->($consensus_string)
940 if defined($arg{-mask});
942 my(@bs,@es);
944 push @bs, 1 if $consensus_string =~ /^[^?]/;
946 while($consensus_string =~ /\?[^?]/g){
947 push @bs, pos($consensus_string);
949 while($consensus_string =~ /[^?]\?/g){
950 push @es, pos($consensus_string);
953 push @es, CORE::length($consensus_string) if $consensus_string =~ /[^?]$/;
955 my $seq = Bio::Seq->new();
957 # my $rootfeature = Bio::SeqFeature::Generic->new(
958 # -source_tag => 'location',
959 # -start => $self->get_seq_by_pos($arg{-pos})->start,
960 # -end => $self->get_seq_by_pos($arg{-pos})->end,
961 # );
962 # $seq->add_SeqFeature($rootfeature);
964 while(my $b = shift @bs){
965 my $e = shift @es;
966 $seq->add_SeqFeature(
967 Bio::SeqFeature::Generic->new(
968 -start => $b - 1 + $self->get_seq_by_pos($arg{-pos})->start,
969 -end => $e - 1 + $self->get_seq_by_pos($arg{-pos})->start,
970 -source_tag => $self->source || 'MSA',
975 return $seq;
979 =head1 Create new alignments
981 The result of these methods are horizontal or vertical subsets of the
982 current MSA.
984 =head2 select
986 Title : select
987 Usage : $aln2 = $aln->select(1, 3) # three first sequences
988 Function : Creates a new alignment from a continuous subset of
989 sequences. Numbering starts from 1. Sequence positions
990 larger than num_sequences() will thow an error.
991 Returns : a Bio::SimpleAlign object
992 Args : positive integer for the first sequence
993 positive integer for the last sequence to include (optional)
995 =cut
997 sub select {
998 my $self = shift;
999 my ($start, $end) = @_;
1001 $self->throw("Select start has to be a positive integer, not [$start]")
1002 unless $start =~ /^\d+$/ and $start > 0;
1003 $self->throw("Select end has to be a positive integer, not [$end]")
1004 unless $end =~ /^\d+$/ and $end > 0;
1005 $self->throw("Select $start [$start] has to be smaller than or equal to end [$end]")
1006 unless $start <= $end;
1008 my $aln = $self->new;
1009 foreach my $pos ($start .. $end) {
1010 $aln->add_seq($self->get_seq_by_pos($pos));
1012 $aln->id($self->id);
1013 # fix for meta, sf, ann
1014 return $aln;
1017 =head2 select_noncont
1019 Title : select_noncont
1020 Usage : # 1st and 3rd sequences, sorted
1021 $aln2 = $aln->select_noncont(1, 3)
1023 # 1st and 3rd sequences, sorted (same as first)
1024 $aln2 = $aln->select_noncont(3, 1)
1026 # 1st and 3rd sequences, unsorted
1027 $aln2 = $aln->select_noncont('nosort',3, 1)
1029 Function : Creates a new alignment from a subset of sequences. Numbering
1030 starts from 1. Sequence positions larger than num_sequences() will
1031 throw an error. Sorts the order added to new alignment by default,
1032 to prevent sorting pass 'nosort' as the first argument in the list.
1033 Returns : a Bio::SimpleAlign object
1034 Args : array of integers for the sequences. If the string 'nosort' is
1035 passed as the first argument, the sequences will not be sorted
1036 in the new alignment but will appear in the order listed.
1038 =cut
1040 sub select_noncont {
1041 my $self = shift;
1042 my $nosort = 0;
1043 my (@pos) = @_;
1044 if ($pos[0] !~ m{^\d+$}) {
1045 my $sortcmd = shift @pos;
1046 if ($sortcmd eq 'nosort') {
1047 $nosort = 1;
1048 } else {
1049 $self->throw("Command not recognized: $sortcmd. Only 'nosort' implemented at this time.");
1053 my $end = $self->num_sequences;
1054 foreach ( @pos ) {
1055 $self->throw("position must be a positive integer, > 0 and <= $end not [$_]")
1056 unless( /^\d+$/ && $_ > 0 && $_ <= $end );
1059 @pos = sort {$a <=> $b} @pos unless $nosort;
1061 my $aln = $self->new;
1062 foreach my $p (@pos) {
1063 $aln->add_seq($self->get_seq_by_pos($p));
1065 $aln->id($self->id);
1066 # fix for meta, sf, ann
1067 return $aln;
1070 =head2 slice
1072 Title : slice
1073 Usage : $aln2 = $aln->slice(20,30)
1074 Function : Creates a slice from the alignment inclusive of start and
1075 end columns, and the first column in the alignment is denoted 1.
1076 Sequences with no residues in the slice are excluded from the
1077 new alignment and a warning is printed. Slice beyond the length of
1078 the sequence does not do padding.
1079 Returns : A Bio::SimpleAlign object
1080 Args : Positive integer for start column, positive integer for end column,
1081 optional boolean which if true will keep gap-only columns in the newly
1082 created slice. Example:
1084 $aln2 = $aln->slice(20,30,1)
1086 =cut
1088 sub slice {
1089 my $self = shift;
1090 my ($start, $end, $keep_gap_only) = @_;
1092 $self->throw("Slice start has to be a positive integer, not [$start]")
1093 unless $start =~ /^\d+$/ and $start > 0;
1094 $self->throw("Slice end has to be a positive integer, not [$end]")
1095 unless $end =~ /^\d+$/ and $end > 0;
1096 $self->throw("Slice start [$start] has to be smaller than or equal to end [$end]")
1097 unless $start <= $end;
1098 $self->throw("This alignment has only ". $self->length . " residues. Slice start " .
1099 "[$start] is too big.") if $start > $self->length;
1100 my $cons_meta = $self->consensus_meta;
1101 my $aln = $self->new;
1102 $aln->id($self->id);
1103 foreach my $seq ( $self->each_seq() ) {
1104 my $new_seq = $seq->isa('Bio::Seq::MetaI') ?
1105 Bio::Seq::Meta->new
1106 (-id => $seq->id,
1107 -alphabet => $seq->alphabet,
1108 -strand => $seq->strand,
1109 -verbose => $self->verbose) :
1110 Bio::LocatableSeq->new
1111 (-id => $seq->id,
1112 -alphabet => $seq->alphabet,
1113 -strand => $seq->strand,
1114 -verbose => $self->verbose);
1116 # seq
1117 my $seq_end = $end;
1118 $seq_end = $seq->length if( $end > $seq->length );
1120 my $slice_seq = $seq->subseq($start, $seq_end);
1121 $new_seq->seq( $slice_seq );
1123 $slice_seq =~ s/\W//g;
1125 if ($start > 1) {
1126 my $pre_start_seq = $seq->subseq(1, $start - 1);
1127 $pre_start_seq =~ s/\W//g;
1128 if (!defined($seq->strand)) {
1129 $new_seq->start( $seq->start + CORE::length($pre_start_seq) );
1130 } elsif ($seq->strand < 0){
1131 $new_seq->start( $seq->end - CORE::length($pre_start_seq) - CORE::length($slice_seq) + 1);
1132 } else {
1133 $new_seq->start( $seq->start + CORE::length($pre_start_seq) );
1135 } else {
1136 if ((defined $seq->strand)&&($seq->strand < 0)){
1137 $new_seq->start( $seq->end - CORE::length($slice_seq) + 1);
1138 } else {
1139 $new_seq->start( $seq->start);
1142 if ($new_seq->isa('Bio::Seq::MetaI')) {
1143 for my $meta_name ($seq->meta_names) {
1144 $new_seq->named_meta($meta_name, $seq->named_submeta($meta_name, $start, $end));
1147 $new_seq->end( $new_seq->start + CORE::length($slice_seq) - 1 );
1149 if ($new_seq->start and $new_seq->end >= $new_seq->start) {
1150 $aln->add_seq($new_seq);
1151 } else {
1152 if( $keep_gap_only ) {
1153 $aln->add_seq($new_seq);
1154 } else {
1155 my $nse = $seq->get_nse();
1156 $self->warn("Slice [$start-$end] of sequence [$nse] contains no residues.".
1157 " Sequence excluded from the new alignment.");
1161 if ($cons_meta) {
1162 my $new = Bio::Seq::Meta->new();
1163 for my $meta_name ($cons_meta->meta_names) {
1164 $new->named_meta($meta_name, $cons_meta->named_submeta($meta_name, $start, $end));
1166 $aln->consensus_meta($new);
1168 $aln->annotation($self->annotation);
1169 # fix for meta, sf, ann
1170 return $aln;
1173 =head2 remove_columns
1175 Title : remove_columns
1176 Usage : $aln2 = $aln->remove_columns(['mismatch','weak']) or
1177 $aln2 = $aln->remove_columns([0,0],[6,8])
1178 Function : Creates an aligment with columns removed corresponding to
1179 the specified type or by specifying the columns by number.
1180 Returns : Bio::SimpleAlign object
1181 Args : Array ref of types ('match'|'weak'|'strong'|'mismatch'|'gaps'|
1182 'all_gaps_columns') or array ref where the referenced array
1183 contains a pair of integers that specify a range.
1184 The first column is 0
1186 =cut
1188 sub remove_columns {
1189 my ($self,@args) = @_;
1190 @args || $self->throw("Must supply column ranges or column types");
1191 my $aln;
1193 if ($args[0][0] =~ /^[a-z_]+$/i) {
1194 $aln = $self->_remove_columns_by_type($args[0]);
1195 } elsif ($args[0][0] =~ /^\d+$/) {
1196 $aln = $self->_remove_columns_by_num(\@args);
1197 } else {
1198 $self->throw("You must pass array references to remove_columns(), not @args");
1200 # fix for meta, sf, ann
1201 $aln;
1205 =head2 remove_gaps
1207 Title : remove_gaps
1208 Usage : $aln2 = $aln->remove_gaps
1209 Function : Creates an aligment with gaps removed
1210 Returns : a Bio::SimpleAlign object
1211 Args : a gap character(optional) if none specified taken
1212 from $self->gap_char,
1213 [optional] $all_gaps_columns flag (1 or 0, default is 0)
1214 indicates that only all-gaps columns should be deleted
1216 Used from method L<remove_columns> in most cases. Set gap character
1217 using L<gap_char()|gap_char>.
1219 =cut
1221 sub remove_gaps {
1222 my ($self,$gapchar,$all_gaps_columns) = @_;
1223 my $gap_line;
1224 if ($all_gaps_columns) {
1225 $gap_line = $self->all_gap_line($gapchar);
1226 } else {
1227 $gap_line = $self->gap_line($gapchar);
1229 my $aln = $self->new;
1231 my @remove;
1232 my $length = 0;
1233 my $del_char = $gapchar || $self->gap_char;
1234 # Do the matching to get the segments to remove
1235 while ($gap_line =~ m/[$del_char]/g) {
1236 my $start = pos($gap_line)-1;
1237 $gap_line=~/\G[$del_char]+/gc;
1238 my $end = pos($gap_line)-1;
1240 #have to offset the start and end for subsequent removes
1241 $start-=$length;
1242 $end -=$length;
1243 $length += ($end-$start+1);
1244 push @remove, [$start,$end];
1247 #remove the segments
1248 $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1249 # fix for meta, sf, ann
1250 return $aln;
1254 sub _remove_col {
1255 my ($self,$aln,$remove) = @_;
1256 my @new;
1258 my $gap = $self->gap_char;
1260 # splice out the segments and create new seq
1261 foreach my $seq($self->each_seq){
1262 my $new_seq = Bio::LocatableSeq->new(
1263 -id => $seq->id,
1264 -alphabet=> $seq->alphabet,
1265 -strand => $seq->strand,
1266 -verbose => $self->verbose);
1267 my $sequence = $seq->seq;
1268 foreach my $pair(@{$remove}){
1269 my $start = $pair->[0];
1270 my $end = $pair->[1];
1271 $sequence = $seq->seq unless $sequence;
1272 my $orig = $sequence;
1273 my $head = $start > 0 ? substr($sequence, 0, $start) : '';
1274 my $tail = ($end + 1) >= CORE::length($sequence) ? '' : substr($sequence, $end + 1);
1275 $sequence = $head.$tail;
1276 # start
1277 unless (defined $new_seq->start) {
1278 if ($start == 0) {
1279 my $start_adjust = () = substr($orig, 0, $end + 1) =~ /$gap/g;
1280 $new_seq->start($seq->start + $end + 1 - $start_adjust);
1282 else {
1283 my $start_adjust = $orig =~ /^$gap+/;
1284 if ($start_adjust) {
1285 $start_adjust = $+[0] == $start;
1287 $new_seq->start($seq->start + $start_adjust);
1290 # end
1291 if (($end + 1) >= CORE::length($orig)) {
1292 my $end_adjust = () = substr($orig, $start) =~ /$gap/g;
1293 $new_seq->end($seq->end - (CORE::length($orig) - $start) + $end_adjust);
1295 else {
1296 $new_seq->end($seq->end);
1300 if ($new_seq->end < $new_seq->start) {
1301 # we removed all columns except for gaps: set to 0 to indicate no
1302 # sequence
1303 $new_seq->start(0);
1304 $new_seq->end(0);
1307 $new_seq->seq($sequence) if $sequence;
1308 push @new, $new_seq;
1310 # add the new seqs to the alignment
1311 foreach my $new(@new){
1312 $aln->add_seq($new);
1314 # fix for meta, sf, ann
1315 return $aln;
1318 sub _remove_columns_by_type {
1319 my ($self,$type) = @_;
1320 my $aln = $self->new;
1321 my @remove;
1323 my $gap = $self->gap_char if (grep { $_ eq 'gaps'} @{$type});
1324 my $all_gaps_columns = $self->gap_char if (grep /all_gaps_columns/,@{$type});
1325 my %matchchars = ( 'match' => '\*',
1326 'weak' => '\.',
1327 'strong' => ':',
1328 'mismatch' => ' ',
1329 'gaps' => '',
1330 'all_gaps_columns' => ''
1332 # get the characters to delete against
1333 my $del_char;
1334 foreach my $type (@{$type}){
1335 $del_char.= $matchchars{$type};
1338 my $length = 0;
1339 my $match_line = $self->match_line;
1340 # do the matching to get the segments to remove
1341 if($del_char){
1342 while($match_line =~ m/[$del_char]/g ){
1343 my $start = pos($match_line)-1;
1344 $match_line=~/\G[$del_char]+/gc;
1345 my $end = pos($match_line)-1;
1347 #have to offset the start and end for subsequent removes
1348 $start-=$length;
1349 $end -=$length;
1350 $length += ($end-$start+1);
1351 push @remove, [$start,$end];
1355 # remove the segments
1356 $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1357 $aln = $aln->remove_gaps() if $gap;
1358 $aln = $aln->remove_gaps('', 1) if $all_gaps_columns;
1359 # fix for meta, sf, ann
1360 $aln;
1364 sub _remove_columns_by_num {
1365 my ($self,$positions) = @_;
1366 my $aln = $self->new;
1368 # sort the positions
1369 @$positions = sort { $a->[0] <=> $b->[0] } @$positions;
1371 my @remove;
1372 my $length = 0;
1373 foreach my $pos (@{$positions}) {
1374 my ($start, $end) = @{$pos};
1376 #have to offset the start and end for subsequent removes
1377 $start-=$length;
1378 $end -=$length;
1379 $length += ($end-$start+1);
1380 push @remove, [$start,$end];
1383 #remove the segments
1384 $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;
1385 # fix for meta, sf, ann
1386 $aln;
1390 =head1 Change sequences within the MSA
1392 These methods affect characters in all sequences without changing the
1393 alignment.
1395 =head2 splice_by_seq_pos
1397 Title : splice_by_seq_pos
1398 Usage : $status = splice_by_seq_pos(1);
1399 Function: splices all aligned sequences where the specified sequence
1400 has gaps.
1401 Example :
1402 Returns : 1 on success
1403 Args : position of sequence to splice by
1406 =cut
1408 sub splice_by_seq_pos{
1409 my ($self,$pos) = @_;
1411 my $guide = $self->get_seq_by_pos($pos);
1412 my $guide_seq = $guide->seq;
1414 $guide_seq =~ s/\./\-/g;
1416 my @gaps = ();
1417 $pos = -1;
1418 while(($pos = index($guide_seq, '-', $pos)) > -1 ){
1419 unshift @gaps, $pos;
1420 $pos++;
1423 foreach my $seq ($self->each_seq){
1424 my @bases = split '', $seq->seq;
1426 splice(@bases, $_, 1) foreach @gaps;
1427 $seq->seq(join('', @bases));
1433 =head2 map_chars
1435 Title : map_chars
1436 Usage : $ali->map_chars('\.','-')
1437 Function : Does a s/$arg1/$arg2/ on the sequences. Useful for gap
1438 characters
1440 Notice that the from (arg1) is interpretted as a regex,
1441 so be careful about quoting meta characters (eg
1442 $ali->map_chars('.','-') wont do what you want)
1443 Returns :
1444 Argument : 'from' rexexp
1445 'to' string
1447 =cut
1449 sub map_chars {
1450 my $self = shift;
1451 my $from = shift;
1452 my $to = shift;
1453 my ($seq,$temp);
1455 $self->throw("Need exactly two arguments")
1456 unless defined $from and defined $to;
1458 foreach $seq ( $self->each_seq() ) {
1459 $temp = $seq->seq();
1460 $temp =~ s/$from/$to/g;
1461 $seq->seq($temp);
1463 return 1;
1467 =head2 uppercase
1469 Title : uppercase()
1470 Usage : $ali->uppercase()
1471 Function : Sets all the sequences to uppercase
1472 Returns :
1473 Argument :
1475 =cut
1477 sub uppercase {
1478 my $self = shift;
1479 my $seq;
1480 my $temp;
1482 foreach $seq ( $self->each_seq() ) {
1483 $temp = $seq->seq();
1484 $temp =~ tr/[a-z]/[A-Z]/;
1486 $seq->seq($temp);
1488 return 1;
1491 =head2 cigar_line
1493 Title : cigar_line()
1494 Usage : %cigars = $align->cigar_line()
1495 Function : Generates a "cigar" (Compact Idiosyncratic Gapped Alignment
1496 Report) line for each sequence in the alignment. Examples are
1497 "1,60" or "5,10:12,58", where the numbers refer to conserved
1498 positions within the alignment. The keys of the hash are the
1499 NSEs (name/start/end) assigned to each sequence.
1500 Args : threshold (optional, defaults to 100)
1501 Returns : Hash of strings (cigar lines)
1503 =cut
1505 sub cigar_line {
1506 my $self = shift;
1507 my $thr=shift||100;
1508 my %cigars;
1510 my @consensus = split "",($self->consensus_string($thr));
1511 my $len = $self->length;
1512 my $gapchar = $self->gap_char;
1514 # create a precursor, something like (1,4,5,6,7,33,45),
1515 # where each number corresponds to a conserved position
1516 foreach my $seq ( $self->each_seq ) {
1517 my @seq = split "", uc ($seq->seq);
1518 my $pos = 1;
1519 for (my $x = 0 ; $x < $len ; $x++ ) {
1520 if ($seq[$x] eq $consensus[$x]) {
1521 push @{$cigars{$seq->get_nse}},$pos;
1522 $pos++;
1523 } elsif ($seq[$x] ne $gapchar) {
1524 $pos++;
1528 # duplicate numbers - (1,4,5,6,7,33,45) becomes (1,1,4,5,6,7,33,33,45,45)
1529 for my $name (keys %cigars) {
1530 splice @{$cigars{$name}}, 1, 0, ${$cigars{$name}}[0] if
1531 ( ${$cigars{$name}}[0] + 1 < ${$cigars{$name}}[1] );
1532 push @{$cigars{$name}}, ${$cigars{$name}}[$#{$cigars{$name}}] if
1533 ( ${$cigars{$name}}[($#{$cigars{$name}} - 1)] + 1 <
1534 ${$cigars{$name}}[$#{$cigars{$name}}] );
1535 for ( my $x = 1 ; $x < $#{$cigars{$name}} - 1 ; $x++) {
1536 if (${$cigars{$name}}[$x - 1] + 1 < ${$cigars{$name}}[$x] &&
1537 ${$cigars{$name}}[$x + 1] > ${$cigars{$name}}[$x] + 1) {
1538 splice @{$cigars{$name}}, $x, 0, ${$cigars{$name}}[$x];
1542 # collapse series - (1,1,4,5,6,7,33,33,45,45) becomes (1,1,4,7,33,33,45,45)
1543 for my $name (keys %cigars) {
1544 my @remove;
1545 for ( my $x = 0 ; $x < $#{$cigars{$name}} ; $x++) {
1546 if ( ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x - 1)] + 1 &&
1547 ${$cigars{$name}}[$x] == ${$cigars{$name}}[($x + 1)] - 1 ) {
1548 unshift @remove,$x;
1551 for my $pos (@remove) {
1552 splice @{$cigars{$name}}, $pos, 1;
1555 # join and punctuate
1556 for my $name (keys %cigars) {
1557 my ($start,$end,$str) = "";
1558 while ( ($start,$end) = splice @{$cigars{$name}}, 0, 2 ) {
1559 $str .= ($start . "," . $end . ":");
1561 $str =~ s/:$//;
1562 $cigars{$name} = $str;
1564 %cigars;
1568 =head2 match_line
1570 Title : match_line()
1571 Usage : $line = $align->match_line()
1572 Function : Generates a match line - much like consensus string
1573 except that a line indicating the '*' for a match.
1574 Args : (optional) Match line characters ('*' by default)
1575 (optional) Strong match char (':' by default)
1576 (optional) Weak match char ('.' by default)
1577 Returns : String
1579 =cut
1581 sub match_line {
1582 my ($self,$matchlinechar, $strong, $weak) = @_;
1583 my %matchchars = ('match' => $matchlinechar || '*',
1584 'weak' => $weak || '.',
1585 'strong' => $strong || ':',
1586 'mismatch' => ' ',
1589 my @seqchars;
1590 my $alphabet;
1591 foreach my $seq ( $self->each_seq ) {
1592 push @seqchars, [ split(//, uc ($seq->seq)) ];
1593 $alphabet = $seq->alphabet unless defined $alphabet;
1595 my $refseq = shift @seqchars;
1596 # let's just march down the columns
1597 my $matchline;
1598 POS:
1599 foreach my $pos ( 0..$self->length ) {
1600 my $refchar = $refseq->[$pos];
1601 my $char = $matchchars{'mismatch'};
1602 unless( defined $refchar ) {
1603 last if $pos == $self->length; # short circuit on last residue
1604 # this in place to handle jason's soon-to-be-committed
1605 # intron mapping code
1606 goto bottom;
1608 my %col = ($refchar => 1);
1609 my $dash = ($refchar eq '-' || $refchar eq '.' || $refchar eq ' ');
1610 foreach my $seq ( @seqchars ) {
1611 next if $pos >= scalar @$seq;
1612 $dash = 1 if( $seq->[$pos] eq '-' || $seq->[$pos] eq '.' ||
1613 $seq->[$pos] eq ' ' );
1614 $col{$seq->[$pos]}++ if defined $seq->[$pos];
1616 my @colresidues = sort keys %col;
1618 # if all the values are the same
1619 if( $dash ) { $char = $matchchars{'mismatch'} }
1620 elsif( @colresidues == 1 ) { $char = $matchchars{'match'} }
1621 elsif( $alphabet eq 'protein' ) { # only try to do weak/strong
1622 # matches for protein seqs
1623 TYPE:
1624 foreach my $type ( qw(strong weak) ) {
1625 # iterate through categories
1626 my %groups;
1627 # iterate through each of the aa in the col
1628 # look to see which groups it is in
1629 foreach my $c ( @colresidues ) {
1630 foreach my $f ( grep { index($_,$c) >= 0 } @{$CONSERVATION_GROUPS{$type}} ) {
1631 push @{$groups{$f}},$c;
1634 GRP:
1635 foreach my $cols ( values %groups ) {
1636 @$cols = sort @$cols;
1637 # now we are just testing to see if two arrays
1638 # are identical w/o changing either one
1639 # have to be same len
1640 next if( scalar @$cols != scalar @colresidues );
1641 # walk down the length and check each slot
1642 for($_=0;$_ < (scalar @$cols);$_++ ) {
1643 next GRP if( $cols->[$_] ne $colresidues[$_] );
1645 $char = $matchchars{$type};
1646 last TYPE;
1650 bottom:
1651 $matchline .= $char;
1653 return $matchline;
1657 =head2 gap_line
1659 Title : gap_line()
1660 Usage : $line = $align->gap_line()
1661 Function : Generates a gap line - much like consensus string
1662 except that a line where '-' represents gap
1663 Args : (optional) gap line characters ('-' by default)
1664 Returns : string
1666 =cut
1668 sub gap_line {
1669 my ($self,$gapchar) = @_;
1670 $gapchar = $gapchar || $self->gap_char;
1671 my %gap_hsh; # column gaps vector
1672 foreach my $seq ( $self->each_seq ) {
1673 my $i = 0;
1674 map {$gap_hsh{$_->[0]} = undef} grep {$_->[1] eq $gapchar}
1675 map {[$i++, $_]} split(//, uc ($seq->seq));
1677 my $gap_line;
1678 foreach my $pos ( 0..$self->length-1 ) {
1679 $gap_line .= (exists $gap_hsh{$pos}) ? $gapchar:'.';
1681 return $gap_line;
1684 =head2 all_gap_line
1686 Title : all_gap_line()
1687 Usage : $line = $align->all_gap_line()
1688 Function : Generates a gap line - much like consensus string
1689 except that a line where '-' represents all-gap column
1690 Args : (optional) gap line characters ('-' by default)
1691 Returns : string
1693 =cut
1695 sub all_gap_line {
1696 my ($self,$gapchar) = @_;
1697 $gapchar = $gapchar || $self->gap_char;
1698 my %gap_hsh; # column gaps counter hash
1699 my @seqs = $self->each_seq;
1700 foreach my $seq ( @seqs ) {
1701 my $i = 0;
1702 map {$gap_hsh{$_->[0]}++} grep {$_->[1] eq $gapchar}
1703 map {[$i++, $_]} split(//, uc ($seq->seq));
1705 my $gap_line;
1706 foreach my $pos ( 0..$self->length-1 ) {
1707 if (exists $gap_hsh{$pos} && $gap_hsh{$pos} == scalar @seqs) {
1708 # gaps column
1709 $gap_line .= $gapchar;
1710 } else {
1711 $gap_line .= '.';
1714 return $gap_line;
1717 =head2 gap_col_matrix
1719 Title : gap_col_matrix()
1720 Usage : my $cols = $align->gap_col_matrix()
1721 Function : Generates an array of hashes where
1722 each entry in the array is a hash reference
1723 with keys of all the sequence names and
1724 and value of 1 or 0 if the sequence has a gap at that column
1725 Args : (optional) gap line characters ($aln->gap_char or '-' by default)
1727 =cut
1729 sub gap_col_matrix {
1730 my ($self,$gapchar) = @_;
1731 $gapchar = $gapchar || $self->gap_char;
1732 my %gap_hsh; # column gaps vector
1733 my @cols;
1734 foreach my $seq ( $self->each_seq ) {
1735 my $i = 0;
1736 my $str = $seq->seq;
1737 my $len = $seq->length;
1738 my $ch;
1739 my $id = $seq->display_id;
1740 while( $i < $len ) {
1741 $ch = substr($str, $i, 1);
1742 $cols[$i++]->{$id} = ($ch eq $gapchar);
1745 return \@cols;
1748 =head2 match
1750 Title : match()
1751 Usage : $ali->match()
1752 Function : Goes through all columns and changes residues that are
1753 identical to residue in first sequence to match '.'
1754 character. Sets match_char.
1756 USE WITH CARE: Most MSA formats do not support match
1757 characters in sequences, so this is mostly for output
1758 only. NEXUS format (Bio::AlignIO::nexus) can handle
1760 Returns : 1
1761 Argument : a match character, optional, defaults to '.'
1763 =cut
1765 sub match {
1766 my ($self, $match) = @_;
1768 $match ||= '.';
1769 my ($matching_char) = $match;
1770 $matching_char = "\\$match" if $match =~ /[\^.$|()\[\]]/ ; #';
1771 $self->map_chars($matching_char, '-');
1773 my @seqs = $self->each_seq();
1774 return 1 unless scalar @seqs > 1;
1776 my $refseq = shift @seqs ;
1777 my @refseq = split //, $refseq->seq;
1778 my $gapchar = $self->gap_char;
1780 foreach my $seq ( @seqs ) {
1781 my @varseq = split //, $seq->seq();
1782 for ( my $i=0; $i < scalar @varseq; $i++) {
1783 $varseq[$i] = $match if defined $refseq[$i] &&
1784 ( $refseq[$i] =~ /[A-Za-z\*]/ ||
1785 $refseq[$i] =~ /$gapchar/ )
1786 && $refseq[$i] eq $varseq[$i];
1788 $seq->seq(join '', @varseq);
1790 $self->match_char($match);
1791 return 1;
1795 =head2 unmatch
1797 Title : unmatch()
1798 Usage : $ali->unmatch()
1799 Function : Undoes the effect of method match. Unsets match_char.
1800 Returns : 1
1801 Argument : a match character, optional, defaults to '.'
1803 See L<match> and L<match_char>
1805 =cut
1807 sub unmatch {
1808 my ($self, $match) = @_;
1810 $match ||= '.';
1812 my @seqs = $self->each_seq();
1813 return 1 unless scalar @seqs > 1;
1815 my $refseq = shift @seqs ;
1816 my @refseq = split //, $refseq->seq;
1817 my $gapchar = $self->gap_char;
1818 foreach my $seq ( @seqs ) {
1819 my @varseq = split //, $seq->seq();
1820 for ( my $i=0; $i < scalar @varseq; $i++) {
1821 $varseq[$i] = $refseq[$i] if defined $refseq[$i] &&
1822 ( $refseq[$i] =~ /[A-Za-z\*]/ ||
1823 $refseq[$i] =~ /$gapchar/ ) &&
1824 $varseq[$i] eq $match;
1826 $seq->seq(join '', @varseq);
1828 $self->match_char('');
1829 return 1;
1832 =head1 MSA attributes
1834 Methods for setting and reading the MSA attributes.
1836 Note that the methods defining character semantics depend on the user
1837 to set them sensibly. They are needed only by certain input/output
1838 methods. Unset them by setting to an empty string ('').
1840 =head2 id
1842 Title : id
1843 Usage : $myalign->id("Ig")
1844 Function : Gets/sets the id field of the alignment
1845 Returns : An id string
1846 Argument : An id string (optional)
1848 =cut
1850 sub id {
1851 my ($self, $name) = @_;
1853 if (defined( $name )) {
1854 $self->{'_id'} = $name;
1857 return $self->{'_id'};
1860 =head2 accession
1862 Title : accession
1863 Usage : $myalign->accession("PF00244")
1864 Function : Gets/sets the accession field of the alignment
1865 Returns : An acc string
1866 Argument : An acc string (optional)
1868 =cut
1870 sub accession {
1871 my ($self, $acc) = @_;
1873 if (defined( $acc )) {
1874 $self->{'_accession'} = $acc;
1877 return $self->{'_accession'};
1880 =head2 description
1882 Title : description
1883 Usage : $myalign->description("14-3-3 proteins")
1884 Function : Gets/sets the description field of the alignment
1885 Returns : An description string
1886 Argument : An description string (optional)
1888 =cut
1890 sub description {
1891 my ($self, $name) = @_;
1893 if (defined( $name )) {
1894 $self->{'_description'} = $name;
1897 return $self->{'_description'};
1900 =head2 missing_char
1902 Title : missing_char
1903 Usage : $myalign->missing_char("?")
1904 Function : Gets/sets the missing_char attribute of the alignment
1905 It is generally recommended to set it to 'n' or 'N'
1906 for nucleotides and to 'X' for protein.
1907 Returns : An missing_char string,
1908 Argument : An missing_char string (optional)
1910 =cut
1912 sub missing_char {
1913 my ($self, $char) = @_;
1915 if (defined $char ) {
1916 $self->throw("Single missing character, not [$char]!") if CORE::length($char) > 1;
1917 $self->{'_missing_char'} = $char;
1920 return $self->{'_missing_char'};
1923 =head2 match_char
1925 Title : match_char
1926 Usage : $myalign->match_char('.')
1927 Function : Gets/sets the match_char attribute of the alignment
1928 Returns : An match_char string,
1929 Argument : An match_char string (optional)
1931 =cut
1933 sub match_char {
1934 my ($self, $char) = @_;
1936 if (defined $char ) {
1937 $self->throw("Single match character, not [$char]!") if CORE::length($char) > 1;
1938 $self->{'_match_char'} = $char;
1941 return $self->{'_match_char'};
1944 =head2 gap_char
1946 Title : gap_char
1947 Usage : $myalign->gap_char('-')
1948 Function : Gets/sets the gap_char attribute of the alignment
1949 Returns : An gap_char string, defaults to '-'
1950 Argument : An gap_char string (optional)
1952 =cut
1954 sub gap_char {
1955 my ($self, $char) = @_;
1957 if (defined $char || ! defined $self->{'_gap_char'} ) {
1958 $char= '-' unless defined $char;
1959 $self->throw("Single gap character, not [$char]!") if CORE::length($char) > 1;
1960 $self->{'_gap_char'} = $char;
1962 return $self->{'_gap_char'};
1965 =head2 symbol_chars
1967 Title : symbol_chars
1968 Usage : my @symbolchars = $aln->symbol_chars;
1969 Function: Returns all the seen symbols (other than gaps)
1970 Returns : array of characters that are the seen symbols
1971 Args : boolean to include the gap/missing/match characters
1973 =cut
1975 sub symbol_chars{
1976 my ($self,$includeextra) = @_;
1978 unless ($self->{'_symbols'}) {
1979 foreach my $seq ($self->each_seq) {
1980 map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq);
1983 my %copy = %{$self->{'_symbols'}};
1984 if( ! $includeextra ) {
1985 foreach my $char ( $self->gap_char, $self->match_char,
1986 $self->missing_char) {
1987 delete $copy{$char} if( defined $char );
1990 return keys %copy;
1993 =head1 Alignment descriptors
1995 These read only methods describe the MSA in various ways.
1998 =head2 score
2000 Title : score
2001 Usage : $str = $ali->score()
2002 Function : get/set a score of the alignment
2003 Returns : a score for the alignment
2004 Argument : an optional score to set
2006 =cut
2008 sub score {
2009 my $self = shift;
2010 $self->{score} = shift if @_;
2011 return $self->{score};
2014 =head2 consensus_string
2016 Title : consensus_string
2017 Usage : $str = $ali->consensus_string($threshold_percent)
2018 Function : Makes a strict consensus
2019 Returns : Consensus string
2020 Argument : Optional treshold ranging from 0 to 100.
2021 The consensus residue has to appear at least threshold %
2022 of the sequences at a given location, otherwise a '?'
2023 character will be placed at that location.
2024 (Default value = 0%)
2026 =cut
2028 sub consensus_string {
2029 my $self = shift;
2030 my $threshold = shift;
2032 my $out = "";
2033 my $len = $self->length - 1;
2035 foreach ( 0 .. $len ) {
2036 $out .= $self->_consensus_aa($_,$threshold);
2038 return $out;
2041 sub _consensus_aa {
2042 my $self = shift;
2043 my $point = shift;
2044 my $threshold_percent = shift || -1 ;
2045 my ($seq,%hash,$count,$letter,$key);
2046 my $gapchar = $self->gap_char;
2047 foreach $seq ( $self->each_seq() ) {
2048 $letter = substr($seq->seq,$point,1);
2049 $self->throw("--$point-----------") if $letter eq '';
2050 ($letter eq $gapchar || $letter =~ /\./) && next;
2051 # print "Looking at $letter\n";
2052 $hash{$letter}++;
2054 my $number_of_sequences = $self->num_sequences();
2055 my $threshold = $number_of_sequences * $threshold_percent / 100. ;
2056 $count = -1;
2057 $letter = '?';
2059 foreach $key ( sort keys %hash ) {
2060 # print "Now at $key $hash{$key}\n";
2061 if( $hash{$key} > $count && $hash{$key} >= $threshold) {
2062 $letter = $key;
2063 $count = $hash{$key};
2066 return $letter;
2070 =head2 consensus_iupac
2072 Title : consensus_iupac
2073 Usage : $str = $ali->consensus_iupac()
2074 Function : Makes a consensus using IUPAC ambiguity codes from DNA
2075 and RNA. The output is in upper case except when gaps in
2076 a column force output to be in lower case.
2078 Note that if your alignment sequences contain a lot of
2079 IUPAC ambiquity codes you often have to manually set
2080 alphabet. Bio::PrimarySeq::_guess_type thinks they
2081 indicate a protein sequence.
2082 Returns : consensus string
2083 Argument : none
2084 Throws : on protein sequences
2086 =cut
2088 sub consensus_iupac {
2089 my $self = shift;
2090 my $out = "";
2091 my $len = $self->length-1;
2093 # only DNA and RNA sequences are valid
2094 foreach my $seq ( $self->each_seq() ) {
2095 $self->throw("Seq [". $seq->get_nse. "] is a protein")
2096 if $seq->alphabet eq 'protein';
2098 # loop over the alignment columns
2099 foreach my $count ( 0 .. $len ) {
2100 $out .= $self->_consensus_iupac($count);
2102 return $out;
2105 sub _consensus_iupac {
2106 my ($self, $column) = @_;
2107 my ($string, $char, $rna);
2109 #determine all residues in a column
2110 foreach my $seq ( $self->each_seq() ) {
2111 $string .= substr($seq->seq, $column, 1);
2113 $string = uc $string;
2115 # quick exit if there's an N in the string
2116 if ($string =~ /N/) {
2117 $string =~ /\W/ ? return 'n' : return 'N';
2119 # ... or if there are only gap characters
2120 return '-' if $string =~ /^\W+$/;
2122 # treat RNA as DNA in regexps
2123 if ($string =~ /U/) {
2124 $string =~ s/U/T/;
2125 $rna = 1;
2128 # the following s///'s only need to be done to the _first_ ambiguity code
2129 # as we only need to see the _range_ of characters in $string
2131 if ($string =~ /[VDHB]/) {
2132 $string =~ s/V/AGC/;
2133 $string =~ s/D/AGT/;
2134 $string =~ s/H/ACT/;
2135 $string =~ s/B/CTG/;
2138 if ($string =~ /[SKYRWM]/) {
2139 $string =~ s/S/GC/;
2140 $string =~ s/K/GT/;
2141 $string =~ s/Y/CT/;
2142 $string =~ s/R/AG/;
2143 $string =~ s/W/AT/;
2144 $string =~ s/M/AC/;
2147 # and now the guts of the thing
2149 if ($string =~ /A/) {
2150 $char = 'A'; # A A
2151 if ($string =~ /G/) {
2152 $char = 'R'; # A and G (purines) R
2153 if ($string =~ /C/) {
2154 $char = 'V'; # A and G and C V
2155 if ($string =~ /T/) {
2156 $char = 'N'; # A and G and C and T N
2158 } elsif ($string =~ /T/) {
2159 $char = 'D'; # A and G and T D
2161 } elsif ($string =~ /C/) {
2162 $char = 'M'; # A and C M
2163 if ($string =~ /T/) {
2164 $char = 'H'; # A and C and T H
2166 } elsif ($string =~ /T/) {
2167 $char = 'W'; # A and T W
2169 } elsif ($string =~ /C/) {
2170 $char = 'C'; # C C
2171 if ($string =~ /T/) {
2172 $char = 'Y'; # C and T (pyrimidines) Y
2173 if ($string =~ /G/) {
2174 $char = 'B'; # C and T and G B
2176 } elsif ($string =~ /G/) {
2177 $char = 'S'; # C and G S
2179 } elsif ($string =~ /G/) {
2180 $char = 'G'; # G G
2181 if ($string =~ /C/) {
2182 $char = 'S'; # G and C S
2183 } elsif ($string =~ /T/) {
2184 $char = 'K'; # G and T K
2186 } elsif ($string =~ /T/) {
2187 $char = 'T'; # T T
2190 $char = 'U' if $rna and $char eq 'T';
2191 $char = lc $char if $string =~ /\W/;
2193 return $char;
2197 =head2 consensus_meta
2199 Title : consensus_meta
2200 Usage : $seqmeta = $ali->consensus_meta()
2201 Function : Returns a Bio::Seq::Meta object containing the consensus
2202 strings derived from meta data analysis.
2203 Returns : Bio::Seq::Meta
2204 Argument : Bio::Seq::Meta
2205 Throws : non-MetaI object
2207 =cut
2209 sub consensus_meta {
2210 my ($self, $meta) = @_;
2211 if ($meta && (!ref $meta || !$meta->isa('Bio::Seq::MetaI'))) {
2212 $self->throw('Not a Bio::Seq::MetaI object');
2214 return $self->{'_aln_meta'} = $meta if $meta;
2215 return $self->{'_aln_meta'}
2218 =head2 is_flush
2220 Title : is_flush
2221 Usage : if ( $ali->is_flush() )
2222 Function : Tells you whether the alignment
2223 : is flush, i.e. all of the same length
2224 Returns : 1 or 0
2225 Argument :
2227 =cut
2229 sub is_flush {
2230 my ($self,$report) = @_;
2231 my $seq;
2232 my $length = (-1);
2233 my $temp;
2235 foreach $seq ( $self->each_seq() ) {
2236 if( $length == (-1) ) {
2237 $length = CORE::length($seq->seq());
2238 next;
2241 $temp = CORE::length($seq->seq());
2242 if( $temp != $length ) {
2243 $self->warn("expecting $length not $temp from ".
2244 $seq->display_id) if( $report );
2245 $self->debug("expecting $length not $temp from ".
2246 $seq->display_id);
2247 $self->debug($seq->seq(). "\n");
2248 return 0;
2252 return 1;
2256 =head2 length
2258 Title : length()
2259 Usage : $len = $ali->length()
2260 Function : Returns the maximum length of the alignment.
2261 To be sure the alignment is a block, use is_flush
2262 Returns : Integer
2263 Argument :
2265 =cut
2267 sub length_aln {
2268 my $self = shift;
2269 $self->deprecated("length_aln - deprecated method. Use length() instead.");
2270 $self->length(@_);
2273 sub length {
2274 my $self = shift;
2275 my $seq;
2276 my $length = -1;
2277 my $temp;
2279 foreach $seq ( $self->each_seq() ) {
2280 $temp = $seq->length();
2281 if( $temp > $length ) {
2282 $length = $temp;
2286 return $length;
2290 =head2 maxdisplayname_length
2292 Title : maxdisplayname_length
2293 Usage : $ali->maxdisplayname_length()
2294 Function : Gets the maximum length of the displayname in the
2295 alignment. Used in writing out various MSA formats.
2296 Returns : integer
2297 Argument :
2299 =cut
2301 sub maxname_length {
2302 my $self = shift;
2303 $self->deprecated("maxname_length - deprecated method.".
2304 " Use maxdisplayname_length() instead.");
2305 $self->maxdisplayname_length();
2308 sub maxnse_length {
2309 my $self = shift;
2310 $self->deprecated("maxnse_length - deprecated method.".
2311 " Use maxnse_length() instead.");
2312 $self->maxdisplayname_length();
2315 sub maxdisplayname_length {
2316 my $self = shift;
2317 my $maxname = (-1);
2318 my ($seq,$len);
2320 foreach $seq ( $self->each_seq() ) {
2321 $len = CORE::length $self->displayname($seq->get_nse());
2323 if( $len > $maxname ) {
2324 $maxname = $len;
2328 return $maxname;
2331 =head2 max_metaname_length
2333 Title : max_metaname_length
2334 Usage : $ali->max_metaname_length()
2335 Function : Gets the maximum length of the meta name tags in the
2336 alignment for the sequences and for the alignment.
2337 Used in writing out various MSA formats.
2338 Returns : integer
2339 Argument : None
2341 =cut
2343 sub max_metaname_length {
2344 my $self = shift;
2345 my $maxname = (-1);
2346 my ($seq,$len);
2348 # check seq meta first
2349 for $seq ( $self->each_seq() ) {
2350 next if !$seq->isa('Bio::Seq::MetaI' || !$seq->meta_names);
2351 for my $mtag ($seq->meta_names) {
2352 $len = CORE::length $mtag;
2353 if( $len > $maxname ) {
2354 $maxname = $len;
2359 # alignment meta
2360 for my $meta ($self->consensus_meta) {
2361 next unless $meta;
2362 for my $name ($meta->meta_names) {
2363 $len = CORE::length $name;
2364 if( $len > $maxname ) {
2365 $maxname = $len;
2370 return $maxname;
2373 =head2 num_residues
2375 Title : num_residues
2376 Usage : $no = $ali->num_residues
2377 Function : number of residues in total in the alignment
2378 Returns : integer
2379 Argument :
2380 Note : replaces no_residues()
2382 =cut
2384 sub num_residues {
2385 my $self = shift;
2386 my $count = 0;
2388 foreach my $seq ($self->each_seq) {
2389 my $str = $seq->seq();
2391 $count += ($str =~ s/[A-Za-z]//g);
2394 return $count;
2397 =head2 num_sequences
2399 Title : num_sequences
2400 Usage : $depth = $ali->num_sequences
2401 Function : number of sequence in the sequence alignment
2402 Returns : integer
2403 Argument : none
2404 Note : replaces no_sequences()
2406 =cut
2408 sub num_sequences {
2409 my $self = shift;
2410 return scalar($self->each_seq);
2414 =head2 average_percentage_identity
2416 Title : average_percentage_identity
2417 Usage : $id = $align->average_percentage_identity
2418 Function: The function uses a fast method to calculate the average
2419 percentage identity of the alignment
2420 Returns : The average percentage identity of the alignment
2421 Args : None
2422 Notes : This method implemented by Kevin Howe calculates a figure that is
2423 designed to be similar to the average pairwise identity of the
2424 alignment (identical in the absence of gaps), without having to
2425 explicitly calculate pairwise identities proposed by Richard Durbin.
2426 Validated by Ewan Birney ad Alex Bateman.
2428 =cut
2430 sub average_percentage_identity{
2431 my ($self,@args) = @_;
2433 my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
2434 'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
2436 my ($len, $total, $subtotal, $divisor, $subdivisor, @seqs, @countHashes);
2438 if (! $self->is_flush()) {
2439 $self->throw("All sequences in the alignment must be the same length");
2442 @seqs = $self->each_seq();
2443 $len = $self->length();
2445 # load the each hash with correct keys for existence checks
2447 for( my $index=0; $index < $len; $index++) {
2448 foreach my $letter (@alphabet) {
2449 $countHashes[$index]->{$letter} = 0;
2452 foreach my $seq (@seqs) {
2453 my @seqChars = split //, $seq->seq();
2454 for( my $column=0; $column < @seqChars; $column++ ) {
2455 my $char = uc($seqChars[$column]);
2456 if (exists $countHashes[$column]->{$char}) {
2457 $countHashes[$column]->{$char}++;
2462 $total = 0;
2463 $divisor = 0;
2464 for(my $column =0; $column < $len; $column++) {
2465 my %hash = %{$countHashes[$column]};
2466 $subdivisor = 0;
2467 foreach my $res (keys %hash) {
2468 $total += $hash{$res}*($hash{$res} - 1);
2469 $subdivisor += $hash{$res};
2471 $divisor += $subdivisor * ($subdivisor - 1);
2473 return $divisor > 0 ? ($total / $divisor )*100.0 : 0;
2476 =head2 percentage_identity
2478 Title : percentage_identity
2479 Usage : $id = $align->percentage_identity
2480 Function: The function calculates the average percentage identity
2481 (aliased to average_percentage_identity)
2482 Returns : The average percentage identity
2483 Args : None
2485 =cut
2487 sub percentage_identity {
2488 my $self = shift;
2489 return $self->average_percentage_identity();
2492 =head2 overall_percentage_identity
2494 Title : overall_percentage_identity
2495 Usage : $id = $align->overall_percentage_identity
2496 $id = $align->overall_percentage_identity('short')
2497 Function: The function calculates the percentage identity of
2498 the conserved columns
2499 Returns : The percentage identity of the conserved columns
2500 Args : length value to use, optional defaults to alignment length
2501 possible values: 'align', 'short', 'long'
2503 The argument values 'short' and 'long' refer to shortest and longest
2504 sequence in the alignment. Method modification code by Hongyu Zhang.
2506 =cut
2508 sub overall_percentage_identity{
2509 my ($self, $length_measure) = @_;
2511 my %alphabet = map {$_ => undef} qw (A C G T U B D E F H I J K L M N O P Q R S V W X Y Z);
2513 my %enum = map {$_ => undef} qw (align short long);
2515 $self->throw("Unknown argument [$length_measure]")
2516 if $length_measure and not exists $enum{$length_measure};
2517 $length_measure ||= 'align';
2519 if (! $self->is_flush()) {
2520 $self->throw("All sequences in the alignment must be the same length");
2523 # Count the residues seen at each position
2524 my $len;
2525 my $total = 0; # number of positions with identical residues
2526 my @countHashes;
2527 my @seqs = $self->each_seq;
2528 my $nof_seqs = scalar @seqs;
2529 my $aln_len = $self->length();
2530 for my $seq (@seqs) {
2531 my $seqstr = $seq->seq;
2533 # Count residues for given sequence
2534 for my $column (0 .. $aln_len-1) {
2535 my $char = uc( substr($seqstr, $column, 1) );
2536 if ( exists $alphabet{$char} ) {
2538 # This is a valid char
2539 if ( defined $countHashes[$column]->{$char} ) {
2540 $countHashes[$column]->{$char}++;
2541 } else {
2542 $countHashes[$column]->{$char} = 1;
2545 if ( $countHashes[$column]->{$char} == $nof_seqs ) {
2546 # All sequences have this same residue
2547 $total++;
2553 # Sequence length
2554 if ($length_measure eq 'short' || $length_measure eq 'long') {
2555 my $seq_len = $seqstr =~ tr/[A-Za-z]//;
2556 if ($length_measure eq 'short') {
2557 if ( (not defined $len) || ($seq_len < $len) ) {
2558 $len = $seq_len;
2560 } elsif ($length_measure eq 'long') {
2561 if ( (not defined $len) || ($seq_len > $len) ) {
2562 $len = $seq_len;
2569 if ($length_measure eq 'align') {
2570 $len = $aln_len;
2573 return ($total / $len ) * 100.0;
2578 =head1 Alignment positions
2580 Methods to map a sequence position into an alignment column and back.
2581 column_from_residue_number() does the former. The latter is really a
2582 property of the sequence object and can done using
2583 L<Bio::LocatableSeq::location_from_column>:
2585 # select somehow a sequence from the alignment, e.g.
2586 my $seq = $aln->get_seq_by_pos(1);
2587 #$loc is undef or Bio::LocationI object
2588 my $loc = $seq->location_from_column(5);
2590 =head2 column_from_residue_number
2592 Title : column_from_residue_number
2593 Usage : $col = $ali->column_from_residue_number( $seqname, $resnumber)
2594 Function: This function gives the position in the alignment
2595 (i.e. column number) of the given residue number in the
2596 sequence with the given name. For example, for the
2597 alignment
2599 Seq1/91-97 AC..DEF.GH.
2600 Seq2/24-30 ACGG.RTY...
2601 Seq3/43-51 AC.DDEF.GHI
2603 column_from_residue_number( "Seq1", 94 ) returns 6.
2604 column_from_residue_number( "Seq2", 25 ) returns 2.
2605 column_from_residue_number( "Seq3", 50 ) returns 10.
2607 An exception is thrown if the residue number would lie
2608 outside the length of the aligment
2609 (e.g. column_from_residue_number( "Seq2", 22 )
2611 Note: If the the parent sequence is represented by more than
2612 one alignment sequence and the residue number is present in
2613 them, this method finds only the first one.
2615 Returns : A column number for the position in the alignment of the
2616 given residue in the given sequence (1 = first column)
2617 Args : A sequence id/name (not a name/start-end)
2618 A residue number in the whole sequence (not just that
2619 segment of it in the alignment)
2621 =cut
2623 sub column_from_residue_number {
2624 my ($self, $name, $resnumber) = @_;
2626 $self->throw("No sequence with name [$name]") unless $self->{'_start_end_lists'}->{$name};
2627 $self->throw("Second argument residue number missing") unless $resnumber;
2629 foreach my $seq ($self->each_seq_with_id($name)) {
2630 my $col;
2631 eval {
2632 $col = $seq->column_from_residue_number($resnumber);
2634 next if $@;
2635 return $col;
2638 $self->throw("Could not find a sequence segment in $name ".
2639 "containing residue number $resnumber");
2643 =head1 Sequence names
2645 Methods to manipulate the display name. The default name based on the
2646 sequence id and subsequence positions can be overridden in various
2647 ways.
2649 =head2 displayname
2651 Title : displayname
2652 Usage : $myalign->displayname("Ig", "IgA")
2653 Function : Gets/sets the display name of a sequence in the alignment
2654 Returns : A display name string
2655 Argument : name of the sequence
2656 displayname of the sequence (optional)
2658 =cut
2660 sub get_displayname {
2661 my $self = shift;
2662 $self->deprecated("get_displayname - deprecated method. Use displayname() instead.");
2663 $self->displayname(@_);
2666 sub set_displayname {
2667 my $self = shift;
2668 $self->deprecated("set_displayname - deprecated method. Use displayname() instead.");
2669 $self->displayname(@_);
2672 sub displayname {
2673 my ($self, $name, $disname) = @_;
2675 $self->throw("No sequence with name [$name]")
2676 unless defined $self->{'_seq'}->{$name};
2678 if( $disname and $name) {
2679 $self->{'_dis_name'}->{$name} = $disname;
2680 return $disname;
2682 elsif( defined $self->{'_dis_name'}->{$name} ) {
2683 return $self->{'_dis_name'}->{$name};
2684 } else {
2685 return $name;
2689 =head2 set_displayname_count
2691 Title : set_displayname_count
2692 Usage : $ali->set_displayname_count
2693 Function : Sets the names to be name_# where # is the number of
2694 times this name has been used.
2695 Returns : 1, on success
2696 Argument :
2698 =cut
2700 sub set_displayname_count {
2701 my $self= shift;
2702 my (@arr,$name,$seq,$count,$temp,$nse);
2704 foreach $seq ( $self->each_alphabetically() ) {
2705 $nse = $seq->get_nse();
2707 #name will be set when this is the second
2708 #time (or greater) is has been seen
2710 if( defined $name and $name eq ($seq->id()) ) {
2711 $temp = sprintf("%s_%s",$name,$count);
2712 $self->displayname($nse,$temp);
2713 $count++;
2714 } else {
2715 $count = 1;
2716 $name = $seq->id();
2717 $temp = sprintf("%s_%s",$name,$count);
2718 $self->displayname($nse,$temp);
2719 $count++;
2722 return 1;
2725 =head2 set_displayname_flat
2727 Title : set_displayname_flat
2728 Usage : $ali->set_displayname_flat()
2729 Function : Makes all the sequences be displayed as just their name,
2730 not name/start-end
2731 Returns : 1
2732 Argument :
2734 =cut
2736 sub set_displayname_flat {
2737 my $self = shift;
2738 my ($nse,$seq);
2740 foreach $seq ( $self->each_seq() ) {
2741 $nse = $seq->get_nse();
2742 $self->displayname($nse,$seq->id());
2744 return 1;
2747 =head2 set_displayname_normal
2749 Title : set_displayname_normal
2750 Usage : $ali->set_displayname_normal()
2751 Function : Makes all the sequences be displayed as name/start-end
2752 Returns : 1, on success
2753 Argument :
2755 =cut
2757 sub set_displayname_normal {
2758 my $self = shift;
2759 my ($nse,$seq);
2761 foreach $seq ( $self->each_seq() ) {
2762 $nse = $seq->get_nse();
2763 $self->displayname($nse,$nse);
2765 return 1;
2768 =head2 source
2770 Title : source
2771 Usage : $obj->source($newval)
2772 Function: sets the Alignment source program
2773 Example :
2774 Returns : value of source
2775 Args : newvalue (optional)
2778 =cut
2780 sub source{
2781 my ($self,$value) = @_;
2782 if( defined $value) {
2783 $self->{'_source'} = $value;
2785 return $self->{'_source'};
2788 =head2 set_displayname_safe
2790 Title : set_displayname_safe
2791 Usage : ($new_aln, $ref_name)=$ali->set_displayname_safe(4)
2792 Function : Assign machine-generated serial names to sequences in input order.
2793 Designed to protect names during PHYLIP runs. Assign 10-char string
2794 in the form of "S000000001" to "S999999999". Restore the original
2795 names using "restore_displayname".
2796 Returns : 1. a new $aln with system names;
2797 2. a hash ref for restoring names
2798 Argument : Number for id length (default 10)
2800 =cut
2802 sub set_displayname_safe {
2803 my $self = shift;
2804 my $idlength = shift || 10;
2805 my ($seq, %phylip_name);
2806 my $ct=0;
2807 my $new=Bio::SimpleAlign->new();
2808 foreach $seq ( $self->each_seq() ) {
2809 $ct++;
2810 my $pname="S". sprintf "%0" . ($idlength-1) . "s", $ct;
2811 $phylip_name{$pname}=$seq->id();
2812 my $new_seq= Bio::LocatableSeq->new(-id => $pname,
2813 -seq => $seq->seq(),
2814 -alphabet => $seq->alphabet,
2815 -start => $seq->{_start},
2816 -end => $seq->{_end}
2818 $new->add_seq($new_seq);
2821 $self->debug("$ct seq names changed. Restore names by using restore_displayname.");
2822 return ($new, \%phylip_name);
2825 =head2 restore_displayname
2827 Title : restore_displayname
2828 Usage : $aln_name_restored=$ali->restore_displayname($hash_ref)
2829 Function : Restore original sequence names (after running
2830 $ali->set_displayname_safe)
2831 Returns : a new $aln with names restored.
2832 Argument : a hash reference of names from "set_displayname_safe".
2834 =cut
2836 sub restore_displayname {
2837 my $self = shift;
2838 my $ref=shift;
2839 my %name=%$ref;
2840 my $new=Bio::SimpleAlign->new();
2841 foreach my $seq ( $self->each_seq() ) {
2842 $self->throw("No sequence with name") unless defined $name{$seq->id()};
2843 my $new_seq= Bio::LocatableSeq->new(-id => $name{$seq->id()},
2844 -seq => $seq->seq(),
2845 -alphabet => $seq->alphabet,
2846 -start => $seq->{_start},
2847 -end => $seq->{_end}
2849 $new->add_seq($new_seq);
2851 return $new;
2854 =head2 sort_by_start
2856 Title : sort_by_start
2857 Usage : $ali->sort_by_start
2858 Function : Changes the order of the alignment to the start position of each
2859 subalignment
2860 Returns :
2861 Argument :
2863 =cut
2865 sub sort_by_start {
2866 my $self = shift;
2867 my ($seq,$nse,@arr,%hash,$count);
2868 foreach $seq ( $self->each_seq() ) {
2869 $nse = $seq->get_nse;
2870 $hash{$nse} = $seq;
2872 $count = 0;
2873 %{$self->{'_order'}} = (); # reset the hash;
2874 foreach $nse ( sort _startend keys %hash) {
2875 $self->{'_order'}->{$count} = $nse;
2876 $count++;
2881 sub _startend
2883 my ($aname,$arange) = split (/[\/]/,$a);
2884 my ($bname,$brange) = split (/[\/]/,$b);
2885 my ($astart,$aend) = split(/\-/,$arange);
2886 my ($bstart,$bend) = split(/\-/,$brange);
2887 return $astart <=> $bstart;
2890 =head2 bracket_string
2892 Title : bracket_string
2893 Usage : my @params = (-refseq => 'testseq',
2894 -allele1 => 'allele1',
2895 -allele2 => 'allele2',
2896 -delimiters => '{}',
2897 -separator => '/');
2898 $str = $aln->bracket_string(@params)
2900 Function : When supplied with a list of parameters (see below), returns a
2901 string in BIC format. This is used for allelic comparisons.
2902 Briefly, if either allele contains a base change when compared to
2903 the refseq, the base or gap for each allele is represented in
2904 brackets in the order present in the 'alleles' parameter.
2906 For the following data:
2908 >testseq
2909 GGATCCATTGCTACT
2910 >allele1
2911 GGATCCATTCCTACT
2912 >allele2
2913 GGAT--ATTCCTCCT
2915 the returned string with parameters 'refseq => testseq' and
2916 'alleles => [qw(allele1 allele2)]' would be:
2918 GGAT[C/-][C/-]ATT[C/C]CT[A/C]CT
2919 Returns : BIC-formatted string
2920 Argument : Required args
2921 refseq : string (ID) of the reference sequence used
2922 as basis for comparison
2923 allele1 : string (ID) of the first allele
2924 allele2 : string (ID) of the second allele
2925 Optional args
2926 delimiters: two symbol string of left and right delimiters.
2927 Only the first two symbols are used
2928 default = '[]'
2929 separator : string used as a separator. Only the first
2930 symbol is used
2931 default = '/'
2932 Throws : On no refseq/alleles, or invalid refseq/alleles.
2934 =cut
2936 sub bracket_string {
2937 my ($self, @args) = @_;
2938 my ($ref, $a1, $a2, $delim, $sep) =
2939 $self->_rearrange([qw(refseq allele1 allele2 delimiters separator)], @args);
2940 $self->throw('Missing refseq/allele1/allele2') if (!$a1 || !$a2 || !$ref);
2941 my ($ld, $rd);
2942 ($ld, $rd) = split('', $delim, 2) if $delim;
2943 $ld ||= '[';
2944 $rd ||= ']';
2945 $sep ||= '/';
2946 my ($refseq, $allele1, $allele2) =
2947 map {( $self->each_seq_with_id($_) )} ($ref, $a1, $a2);
2948 if (!$refseq || !$allele1 || !$allele2) {
2949 $self->throw("One of your refseq/allele IDs is invalid!");
2951 my $len = $self->length-1;
2952 my $bic = '';
2953 # loop over the alignment columns
2954 for my $column ( 0 .. $len ) {
2955 my $string;
2956 my ($compres, $res1, $res2) =
2957 map{substr($_->seq, $column, 1)} ($refseq, $allele1, $allele2);
2958 # are any of the allele symbols different from the refseq?
2959 $string = ($compres eq $res1 && $compres eq $res2) ? $compres :
2960 $ld.$res1.$sep.$res2.$rd;
2961 $bic .= $string;
2963 return $bic;
2967 =head2 methods implementing Bio::FeatureHolderI
2969 FeatureHolderI implementation to support labeled character sets like one
2970 would get from NEXUS represented data.
2972 =head2 get_SeqFeatures
2974 Usage : @features = $aln->get_SeqFeatures
2975 Function: Get the feature objects held by this feature holder.
2976 Example :
2977 Returns : an array of Bio::SeqFeatureI implementing objects
2978 Args : optional filter coderef, taking a Bio::SeqFeatureI
2979 : as argument, returning TRUE if wanted, FALSE if
2980 : unwanted
2982 =cut
2984 sub get_SeqFeatures {
2985 my $self = shift;
2986 my $filter_cb = shift;
2987 $self->throw("Arg (filter callback) must be a coderef") unless
2988 !defined($filter_cb) or ref($filter_cb) eq 'CODE';
2989 if( !defined $self->{'_as_feat'} ) {
2990 $self->{'_as_feat'} = [];
2992 if ($filter_cb) {
2993 return grep { $filter_cb->($_) } @{$self->{'_as_feat'}};
2995 return @{$self->{'_as_feat'}};
2998 =head2 add_SeqFeature
3000 Usage : $aln->add_SeqFeature($subfeat);
3001 Function: adds a SeqFeature into the SeqFeature array.
3002 Example :
3003 Returns : true on success
3004 Args : a Bio::SeqFeatureI object
3005 Note : This implementation is not compliant
3006 with Bio::FeatureHolderI
3008 =cut
3010 sub add_SeqFeature {
3011 my ($self,@feat) = @_;
3013 $self->{'_as_feat'} = [] unless $self->{'_as_feat'};
3015 foreach my $feat ( @feat ) {
3016 if( !$feat->isa("Bio::SeqFeatureI") ) {
3017 $self->throw("$feat is not a SeqFeatureI and that's what we expect...");
3020 push(@{$self->{'_as_feat'}},$feat);
3022 return 1;
3026 =head2 remove_SeqFeatures
3028 Usage : $obj->remove_SeqFeatures
3029 Function: Removes all SeqFeatures. If you want to remove only a subset,
3030 remove that subset from the returned array, and add back the rest.
3031 Returns : The array of Bio::SeqFeatureI features that was
3032 deleted from this alignment.
3033 Args : none
3035 =cut
3037 sub remove_SeqFeatures {
3038 my $self = shift;
3040 return () unless $self->{'_as_feat'};
3041 my @feats = @{$self->{'_as_feat'}};
3042 $self->{'_as_feat'} = [];
3043 return @feats;
3046 =head2 feature_count
3048 Title : feature_count
3049 Usage : $obj->feature_count()
3050 Function: Return the number of SeqFeatures attached to the alignment
3051 Returns : integer representing the number of SeqFeatures
3052 Args : None
3054 =cut
3056 sub feature_count {
3057 my ($self) = @_;
3059 if (defined($self->{'_as_feat'})) {
3060 return ($#{$self->{'_as_feat'}} + 1);
3061 } else {
3062 return 0;
3066 =head2 get_all_SeqFeatures
3068 Title : get_all_SeqFeatures
3069 Usage :
3070 Function: Get all SeqFeatures.
3071 Example :
3072 Returns : an array of Bio::SeqFeatureI implementing objects
3073 Args : none
3074 Note : Falls through to Bio::FeatureHolderI implementation.
3076 =cut
3078 =head2 methods for Bio::AnnotatableI
3080 AnnotatableI implementation to support sequence alignments which
3081 contain annotation (NEXUS, Stockholm).
3083 =head2 annotation
3085 Title : annotation
3086 Usage : $ann = $aln->annotation or
3087 $aln->annotation($ann)
3088 Function: Gets or sets the annotation
3089 Returns : Bio::AnnotationCollectionI object
3090 Args : None or Bio::AnnotationCollectionI object
3092 See L<Bio::AnnotationCollectionI> and L<Bio::Annotation::Collection>
3093 for more information
3095 =cut
3097 sub annotation {
3098 my ($obj,$value) = @_;
3099 if( defined $value ) {
3100 $obj->throw("object of class ".ref($value)." does not implement ".
3101 "Bio::AnnotationCollectionI. Too bad.")
3102 unless $value->isa("Bio::AnnotationCollectionI");
3103 $obj->{'_annotation'} = $value;
3104 } elsif( ! defined $obj->{'_annotation'}) {
3105 $obj->{'_annotation'} = Bio::Annotation::Collection->new();
3107 return $obj->{'_annotation'};
3110 =head1 Deprecated methods
3112 =cut
3114 =head2 no_residues
3116 Title : no_residues
3117 Usage : $no = $ali->no_residues
3118 Function : number of residues in total in the alignment
3119 Returns : integer
3120 Argument :
3121 Note : deprecated in favor of num_residues()
3123 =cut
3125 sub no_residues {
3126 my $self = shift;
3127 $self->deprecated(-warn_version => 1.0069,
3128 -throw_version => 1.0075,
3129 -message => 'Use of method no_residues() is deprecated, use num_residues() instead');
3130 $self->num_residues(@_);
3133 =head2 no_sequences
3135 Title : no_sequences
3136 Usage : $depth = $ali->no_sequences
3137 Function : number of sequence in the sequence alignment
3138 Returns : integer
3139 Argument :
3140 Note : deprecated in favor of num_sequences()
3142 =cut
3144 sub no_sequences {
3145 my $self = shift;
3146 $self->deprecated(-warn_version => 1.0069,
3147 -throw_version => 1.0075,
3148 -message => 'Use of method no_sequences() is deprecated, use num_sequences() instead');
3149 $self->num_sequences(@_);
3152 =head2 mask_columns
3154 Title : mask_columns
3155 Usage : $aln2 = $aln->mask_columns(20,30)
3156 Function : Masks a slice of the alignment inclusive of start and
3157 end columns, and the first column in the alignment is denoted 1.
3158 Mask beyond the length of the sequence does not do padding.
3159 Returns : A Bio::SimpleAlign object
3160 Args : Positive integer for start column, positive integer for end column,
3161 optional string value use for the mask. Example:
3163 $aln2 = $aln->mask_columns(20,30,'?')
3164 Note : Masking must use a character that is not used for gaps or
3165 frameshifts. These can be adjusted using the relevant global
3166 variables, but be aware these may be (uncontrollably) modified
3167 elsewhere within BioPerl (see bug 2715)
3169 =cut
3171 sub mask_columns {
3172 #based on slice(), but did not include the Bio::Seq::Meta sections as I was not sure what it is doing
3173 my $self = shift;
3175 my $nonres = $Bio::LocatableSeq::GAP_SYMBOLS.
3176 $Bio::LocatableSeq::FRAMESHIFT_SYMBOLS;
3178 # coordinates are alignment-based, not sequence-based
3179 my ($start, $end, $mask_char) = @_;
3180 unless (defined $mask_char) { $mask_char = 'N' }
3182 $self->throw("Mask start has to be a positive integer and less than ".
3183 "alignment length, not [$start]")
3184 unless $start =~ /^\d+$/ && $start > 0 && $start <= $self->length;
3185 $self->throw("Mask end has to be a positive integer and less than ".
3186 "alignment length, not [$end]")
3187 unless $end =~ /^\d+$/ && $end > 0 && $end <= $self->length;
3188 $self->throw("Mask start [$start] has to be smaller than or equal to ".
3189 "end [$end]") unless $start <= $end;
3190 $self->throw("Mask character $mask_char has to be a single character ".
3191 "and not a gap or frameshift symbol")
3192 unless CORE::length($mask_char) == 1 && $mask_char !~ m{$nonres};
3194 my $aln = $self->new;
3195 $aln->id($self->id);
3196 foreach my $seq ( $self->each_seq() ) {
3197 my $new_seq = Bio::LocatableSeq->new(-id => $seq->id,
3198 -alphabet => $seq->alphabet,
3199 -strand => $seq->strand,
3200 -verbose => $self->verbose);
3202 # convert from 1-based alignment coords!
3203 my $masked_string = substr($seq->seq, $start - 1, $end - $start + 1);
3204 $masked_string =~ s{[^$nonres]}{$mask_char}g;
3205 my $new_dna_string = substr($seq->seq,0,$start-1) . $masked_string . substr($seq->seq,$end);
3206 $new_seq->seq($new_dna_string);
3207 $aln->add_seq($new_seq);
3209 return $aln;